From 7efa84b5cdd6d473c7e80912638fca9d7167f202 Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan@kernel.org>
Date: Fri, 4 Apr 2025 15:10:02 -0700
Subject: compiler-gcc.h: Introduce __diag_GCC_all

It is not possible to disable a diagnostic for all versions of GCC
without hard coding the minimum supported version at the site, as the
GCC specific macros require a minimum version to disable the warning
for:

    __diag_ignore(GCC, 5, ...);

__diag_ignore_all() does not solve this issue because it disables a
diagnostic for all versions of both GCC and clang, not just one or the
other.

Introduce __diag_GCC_all so that developers can write

    __diag_ignore(GCC, all, ...);

to disable a particular diagnostic for all versions of GCC, while not
affecting clang.

Closes: https://lore.kernel.org/r/CAHk-=wgfX9nBGE0Ap9GjhOy7Mn=RSy=rx0MvqfYFFDx31KJXqQ@mail.gmail.com
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Tested-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Link: https://patch.msgid.link/20250404-vsprintf-convert-pragmas-to-__diag-v1-1-5d6c5c55b2bd@kernel.org
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 include/linux/compiler-gcc.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index c9b58188ec61..c75a222880f9 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -127,6 +127,8 @@
 #define __diag_GCC_8(s)
 #endif
 
+#define __diag_GCC_all(s)	__diag(s)
+
 #define __diag_ignore_all(option, comment) \
 	__diag(__diag_GCC_ignore option)
 
-- 
cgit v1.2.3


From 4e591b890afa0cbc3479f3b88fa7dc1d28972761 Mon Sep 17 00:00:00 2001
From: Thierry Bultel <thierry.bultel.yh@bp.renesas.com>
Date: Thu, 15 May 2025 16:18:17 +0200
Subject: dt-bindings: clock: renesas,cpg-mssr: Document RZ/T2H support

Document RZ/T2H (a.k.a. r9a09g077) cpg-mssr (Clock Pulse Generator)
binding.

Reviewed-by: "Rob Herring (Arm)" <robh@kernel.org>
Signed-off-by: Thierry Bultel <thierry.bultel.yh@bp.renesas.com>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Link: https://lore.kernel.org/20250515141828.43444-3-thierry.bultel.yh@bp.renesas.com
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
---
 .../bindings/clock/renesas,cpg-mssr.yaml           | 46 ++++++++++++++++------
 .../dt-bindings/clock/renesas,r9a09g077-cpg-mssr.h | 27 +++++++++++++
 2 files changed, 62 insertions(+), 11 deletions(-)
 create mode 100644 include/dt-bindings/clock/renesas,r9a09g077-cpg-mssr.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/renesas,cpg-mssr.yaml b/Documentation/devicetree/bindings/clock/renesas,cpg-mssr.yaml
index 77ce3615c65a..708ab6bd7d44 100644
--- a/Documentation/devicetree/bindings/clock/renesas,cpg-mssr.yaml
+++ b/Documentation/devicetree/bindings/clock/renesas,cpg-mssr.yaml
@@ -52,9 +52,15 @@ properties:
       - renesas,r8a779f0-cpg-mssr # R-Car S4-8
       - renesas,r8a779g0-cpg-mssr # R-Car V4H
       - renesas,r8a779h0-cpg-mssr # R-Car V4M
+      - renesas,r9a09g077-cpg-mssr # RZ/T2H
 
   reg:
-    maxItems: 1
+    minItems: 1
+    items:
+      - description: base address of register block 0
+      - description: base address of register block 1
+    description: base addresses of clock controller. Some controllers
+      (like r9a09g077) use two blocks instead of a single one.
 
   clocks:
     minItems: 1
@@ -92,16 +98,6 @@ properties:
       the datasheet.
     const: 1
 
-if:
-  not:
-    properties:
-      compatible:
-        items:
-          enum:
-            - renesas,r7s9210-cpg-mssr
-then:
-  required:
-    - '#reset-cells'
 
 required:
   - compatible
@@ -111,6 +107,34 @@ required:
   - '#clock-cells'
   - '#power-domain-cells'
 
+allOf:
+  - if:
+      properties:
+        compatible:
+          contains:
+            const: renesas,r9a09g077-cpg-mssr
+    then:
+      properties:
+        reg:
+          minItems: 2
+        clock-names:
+          items:
+            - const: extal
+    else:
+      properties:
+        reg:
+          maxItems: 1
+  - if:
+      not:
+        properties:
+          compatible:
+            items:
+              enum:
+                - renesas,r7s9210-cpg-mssr
+    then:
+      required:
+        - '#reset-cells'
+
 additionalProperties: false
 
 examples:
diff --git a/include/dt-bindings/clock/renesas,r9a09g077-cpg-mssr.h b/include/dt-bindings/clock/renesas,r9a09g077-cpg-mssr.h
new file mode 100644
index 000000000000..1b22fe88dec7
--- /dev/null
+++ b/include/dt-bindings/clock/renesas,r9a09g077-cpg-mssr.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+ *
+ * Copyright (C) 2025 Renesas Electronics Corp.
+ */
+
+#ifndef __DT_BINDINGS_CLOCK_RENESAS_R9A09G077_CPG_H__
+#define __DT_BINDINGS_CLOCK_RENESAS_R9A09G077_CPG_H__
+
+#include <dt-bindings/clock/renesas-cpg-mssr.h>
+
+/* R9A09G077 CPG Core Clocks */
+#define R9A09G077_CLK_CA55C0		0
+#define R9A09G077_CLK_CA55C1		1
+#define R9A09G077_CLK_CA55C2		2
+#define R9A09G077_CLK_CA55C3		3
+#define R9A09G077_CLK_CA55S		4
+#define R9A09G077_CLK_CR52_CPU0		5
+#define R9A09G077_CLK_CR52_CPU1		6
+#define R9A09G077_CLK_CKIO		7
+#define R9A09G077_CLK_PCLKAH		8
+#define R9A09G077_CLK_PCLKAM		9
+#define R9A09G077_CLK_PCLKAL		10
+#define R9A09G077_CLK_PCLKGPTL		11
+#define R9A09G077_CLK_PCLKH		12
+#define R9A09G077_CLK_PCLKM		13
+
+#endif /* __DT_BINDINGS_CLOCK_RENESAS_R9A09G077_CPG_H__ */
-- 
cgit v1.2.3


From edfc4c8a1edffa6849e19ffade1be8dd824989d0 Mon Sep 17 00:00:00 2001
From: Michal Koutný <mkoutny@suse.com>
Date: Tue, 3 Jun 2025 17:45:27 +0200
Subject: cgroup: Drop sock_cgroup_classid() dummy implementation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The semantics of returning 0 are unclear when !CONFIG_CGROUP_NET_CLASSID.
Since there are no longer any callers of sock_cgroup_classid() with that
config, we can drop the helper altogether and force all (future)
callers to handle the !CONFIG_CGROUP_NET_CLASSID case.

Signed-off-by: Michal Koutný <mkoutny@suse.com>
Link: https://lore.kernel.org/r/Z_52r_v9-3JUzDT7@calendula/
Acked-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup-defs.h | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index e61687d5e496..cd7f093e34cd 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -898,14 +898,12 @@ static inline u16 sock_cgroup_prioidx(const struct sock_cgroup_data *skcd)
 #endif
 }
 
+#ifdef CONFIG_CGROUP_NET_CLASSID
 static inline u32 sock_cgroup_classid(const struct sock_cgroup_data *skcd)
 {
-#ifdef CONFIG_CGROUP_NET_CLASSID
 	return READ_ONCE(skcd->classid);
-#else
-	return 0;
-#endif
 }
+#endif
 
 static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd,
 					   u16 prioidx)
@@ -915,13 +913,13 @@ static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd,
 #endif
 }
 
+#ifdef CONFIG_CGROUP_NET_CLASSID
 static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd,
 					   u32 classid)
 {
-#ifdef CONFIG_CGROUP_NET_CLASSID
 	WRITE_ONCE(skcd->classid, classid);
-#endif
 }
+#endif
 
 #else	/* CONFIG_SOCK_CGROUP_DATA */
 
-- 
cgit v1.2.3


From 49b393af3130c7712c7e8f215f4126c9a8060fa6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 4 Jun 2025 10:21:38 +0200
Subject: perf: Add comment to enum perf_event_state

Better describe the event states.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Leo Yan <leo.yan@arm.com>
Link: https://lkml.kernel.org/r/20250604135801.GK38114@noisy.programming.kicks-ass.net
---
 include/linux/perf_event.h | 42 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 40 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 52dc7cfab0e0..ec9d96025683 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -635,8 +635,46 @@ struct perf_addr_filter_range {
 	unsigned long			size;
 };
 
-/**
- * enum perf_event_state - the states of an event:
+/*
+ * The normal states are:
+ *
+ *            ACTIVE    --.
+ *               ^        |
+ *               |        |
+ *       sched_{in,out}() |
+ *               |        |
+ *               v        |
+ *      ,---> INACTIVE  --+ <-.
+ *      |                 |   |
+ *      |                {dis,en}able()
+ *   sched_in()           |   |
+ *      |       OFF    <--' --+
+ *      |                     |
+ *      `--->  ERROR    ------'
+ *
+ * That is:
+ *
+ * sched_in:       INACTIVE          -> {ACTIVE,ERROR}
+ * sched_out:      ACTIVE            -> INACTIVE
+ * disable:        {ACTIVE,INACTIVE} -> OFF
+ * enable:         {OFF,ERROR}       -> INACTIVE
+ *
+ * Where {OFF,ERROR} are disabled states.
+ *
+ * Then we have the {EXIT,REVOKED,DEAD} states which are various shades of
+ * defunct events:
+ *
+ *  - EXIT means the task that the event was assigned to died, but child events
+ *    still live, and further children can still be created. But the event
+ *    itself will never be active again. It can only transition to
+ *    {REVOKED,DEAD};
+ *
+ *  - REVOKED means the PMU the event was associated with is gone; all
+ *    functionality is stopped but the event is still alive. Can only
+ *    transition to DEAD;
+ *
+ *  - DEAD event really is DYING tearing down state and freeing bits.
+ *
  */
 enum perf_event_state {
 	PERF_EVENT_STATE_DEAD		= -5,
-- 
cgit v1.2.3


From 2fe1c59347369fa856ae259e2fac3c8c8dd9d335 Mon Sep 17 00:00:00 2001
From: Tao Chen <chen.dylane@linux.dev>
Date: Tue, 3 Jun 2025 23:43:07 +0800
Subject: bpf: Add cookie to raw_tp bpf_link_info

After commit 68ca5d4eebb8 ("bpf: support BPF cookie in raw tracepoint
(raw_tp, tp_btf) programs"), we can show the cookie in bpf_link_info
like kprobe etc.

Signed-off-by: Tao Chen <chen.dylane@linux.dev>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/bpf/20250603154309.3063644-1-chen.dylane@linux.dev
---
 include/uapi/linux/bpf.h       | 2 ++
 kernel/bpf/syscall.c           | 1 +
 tools/include/uapi/linux/bpf.h | 2 ++
 3 files changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 85180e4aaa5a..f1160ebbf526 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6651,6 +6651,8 @@ struct bpf_link_info {
 		struct {
 			__aligned_u64 tp_name; /* in/out: tp_name buffer ptr */
 			__u32 tp_name_len;     /* in/out: tp_name buffer len */
+			__u32 :32;
+			__u64 cookie;
 		} raw_tracepoint;
 		struct {
 			__u32 attach_type;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index dd5304c6ac3c..89d027cd7ca0 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3688,6 +3688,7 @@ static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
 		return -EINVAL;
 
 	info->raw_tracepoint.tp_name_len = tp_len + 1;
+	info->raw_tracepoint.cookie = raw_tp_link->cookie;
 
 	if (!ubuf)
 		return 0;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 85180e4aaa5a..f1160ebbf526 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -6651,6 +6651,8 @@ struct bpf_link_info {
 		struct {
 			__aligned_u64 tp_name; /* in/out: tp_name buffer ptr */
 			__u32 tp_name_len;     /* in/out: tp_name buffer len */
+			__u32 :32;
+			__u64 cookie;
 		} raw_tracepoint;
 		struct {
 			__u32 attach_type;
-- 
cgit v1.2.3


From 267be32b0a7b70cc777f8a46f0904c92c0521d89 Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Date: Wed, 4 Jun 2025 02:07:22 +0000
Subject: ASoC: remove component->id

No one is using component->id.
One idea is we can re-use it as serial number for component.
But we have no usage, so far. Let's just remove it for now.

Signed-off-by: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Link: https://patch.msgid.link/877c1suuna.wl-kuninori.morimoto.gx@renesas.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc-component.h | 1 -
 sound/soc/soc-core.c          | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/sound/soc-component.h b/include/sound/soc-component.h
index 61534ac0edd1..2caa807c6249 100644
--- a/include/sound/soc-component.h
+++ b/include/sound/soc-component.h
@@ -206,7 +206,6 @@ struct snd_soc_component_driver {
 
 struct snd_soc_component {
 	const char *name;
-	int id;
 	const char *name_prefix;
 	struct device *dev;
 	struct snd_soc_card *card;
diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c
index ecea2dddbe9a..cfafdabcdc88 100644
--- a/sound/soc/soc-core.c
+++ b/sound/soc/soc-core.c
@@ -2835,7 +2835,7 @@ int snd_soc_component_initialize(struct snd_soc_component *component,
 	mutex_init(&component->io_mutex);
 
 	if (!component->name) {
-		component->name = fmt_single_name(dev, &component->id);
+		component->name = fmt_single_name(dev, NULL);
 		if (!component->name) {
 			dev_err(dev, "ASoC: Failed to allocate name\n");
 			return -ENOMEM;
-- 
cgit v1.2.3


From d7181a2d43cffb19f1e5c19f6d2328f190c87d70 Mon Sep 17 00:00:00 2001
From: Martijn de Gouw <martijn.de.gouw@prodrive-technologies.com>
Date: Sun, 25 May 2025 09:18:20 +0200
Subject: dt-bindings: regulator: add pca9450: Add regulator-allowed-modes

Make the PWM mode on the buck controllers configurable from devicetree.
Some boards require forced PWM mode to keep the supply ripple within
acceptable limits under light load conditions.

Signed-off-by: Martijn de Gouw <martijn.de.gouw@prodrive-technologies.com>
Acked-by: Conor Dooley <conor.dooley@microchip.com>
Link: https://patch.msgid.link/20250525071823.819342-1-martijn.de.gouw@prodrive-technologies.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 .../bindings/regulator/nxp,pca9450-regulator.yaml      | 14 ++++++++++++++
 include/dt-bindings/regulator/nxp,pca9450-regulator.h  | 18 ++++++++++++++++++
 2 files changed, 32 insertions(+)
 create mode 100644 include/dt-bindings/regulator/nxp,pca9450-regulator.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/regulator/nxp,pca9450-regulator.yaml b/Documentation/devicetree/bindings/regulator/nxp,pca9450-regulator.yaml
index 4ffe5c3faea0..a5486c36830f 100644
--- a/Documentation/devicetree/bindings/regulator/nxp,pca9450-regulator.yaml
+++ b/Documentation/devicetree/bindings/regulator/nxp,pca9450-regulator.yaml
@@ -100,6 +100,15 @@ properties:
               PMIC default "STANDBY" state voltage in uV. Only Buck1~3 have such
               dvs(dynamic voltage scaling) property.
 
+          regulator-allowed-modes:
+            description: |
+              Buck regulator operating modes allowed. Valid values below.
+              Users should use the macros from dt-bindings/regulator/nxp,pca9450-regulator.h
+                0 (PCA9450_BUCK_MODE_AUTO): Auto PFM/PWM mode
+                1 (PCA9450_BUCK_MODE_FORCE_PWM): Forced PWM mode
+            items:
+              enum: [ 0, 1 ]
+
         unevaluatedProperties: false
 
     additionalProperties: false
@@ -143,6 +152,7 @@ allOf:
 examples:
   - |
     #include <dt-bindings/interrupt-controller/irq.h>
+    #include <dt-bindings/regulator/nxp,pca9450-regulator.h>
 
     i2c {
         #address-cells = <1>;
@@ -179,6 +189,8 @@ examples:
                     regulator-max-microvolt = <3400000>;
                     regulator-boot-on;
                     regulator-always-on;
+                    regulator-initial-mode = <PCA9450_BUCK_MODE_FORCE_PWM>;
+                    regulator-allowed-modes = <PCA9450_BUCK_MODE_FORCE_PWM>;
                 };
                 buck5: BUCK5 {
                     regulator-name = "BUCK5";
@@ -186,6 +198,8 @@ examples:
                     regulator-max-microvolt = <3400000>;
                     regulator-boot-on;
                     regulator-always-on;
+                    regulator-allowed-modes = <PCA9450_BUCK_MODE_AUTO
+                                               PCA9450_BUCK_MODE_FORCE_PWM>;
                 };
                 buck6: BUCK6 {
                     regulator-name = "BUCK6";
diff --git a/include/dt-bindings/regulator/nxp,pca9450-regulator.h b/include/dt-bindings/regulator/nxp,pca9450-regulator.h
new file mode 100644
index 000000000000..08434caef429
--- /dev/null
+++ b/include/dt-bindings/regulator/nxp,pca9450-regulator.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Device Tree binding constants for the NXP PCA9450A/B/C PMIC regulators
+ */
+
+#ifndef _DT_BINDINGS_REGULATORS_NXP_PCA9450_H
+#define _DT_BINDINGS_REGULATORS_NXP_PCA9450_H
+
+/*
+ * Buck mode constants which may be used in devicetree properties (eg.
+ * regulator-initial-mode, regulator-allowed-modes).
+ * See the manufacturer's datasheet for more information on these modes.
+ */
+
+#define PCA9450_BUCK_MODE_AUTO		0
+#define PCA9450_BUCK_MODE_FORCE_PWM	1
+
+#endif
-- 
cgit v1.2.3


From dc38441890ec0c54d032395ea9c365a4307185fa Mon Sep 17 00:00:00 2001
From: Antoniu Miclaus <antoniu.miclaus@analog.com>
Date: Fri, 16 May 2025 11:26:21 +0300
Subject: iio: backend: add support for filter config
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add backend support for digital filter type selection.

This setting can be adjusted within the IP cores interfacing devices.

The IP core can be configured based on the state of the actual
digital filter configuration of the part.

Reviewed-by: Nuno Sá <nuno.sa@analog.com>
Signed-off-by: Antoniu Miclaus <antoniu.miclaus@analog.com>
Link: https://patch.msgid.link/20250516082630.8236-2-antoniu.miclaus@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/industrialio-backend.c | 18 ++++++++++++++++++
 include/linux/iio/backend.h        | 13 +++++++++++++
 2 files changed, 31 insertions(+)

(limited to 'include')

diff --git a/drivers/iio/industrialio-backend.c b/drivers/iio/industrialio-backend.c
index c1eb9ef9db08..e2d46c12c15f 100644
--- a/drivers/iio/industrialio-backend.c
+++ b/drivers/iio/industrialio-backend.c
@@ -808,6 +808,24 @@ static int __devm_iio_backend_get(struct device *dev, struct iio_backend *back)
 	return 0;
 }
 
+/**
+ * iio_backend_filter_type_set - Set filter type
+ * @back: Backend device
+ * @type: Filter type.
+ *
+ * RETURNS:
+ * 0 on success, negative error number on failure.
+ */
+int iio_backend_filter_type_set(struct iio_backend *back,
+				enum iio_backend_filter_type type)
+{
+	if (type >= IIO_BACKEND_FILTER_TYPE_MAX)
+		return -EINVAL;
+
+	return iio_backend_op_call(back, filter_type_set, type);
+}
+EXPORT_SYMBOL_NS_GPL(iio_backend_filter_type_set, "IIO_BACKEND");
+
 /**
  * iio_backend_ddr_enable - Enable interface DDR (Double Data Rate) mode
  * @back: Backend device
diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h
index e59d909cb659..8a1690f21318 100644
--- a/include/linux/iio/backend.h
+++ b/include/linux/iio/backend.h
@@ -76,6 +76,14 @@ enum iio_backend_interface_type {
 	IIO_BACKEND_INTERFACE_MAX
 };
 
+enum iio_backend_filter_type {
+	IIO_BACKEND_FILTER_TYPE_DISABLED,
+	IIO_BACKEND_FILTER_TYPE_SINC1,
+	IIO_BACKEND_FILTER_TYPE_SINC5,
+	IIO_BACKEND_FILTER_TYPE_SINC5_PLUS_COMP,
+	IIO_BACKEND_FILTER_TYPE_MAX
+};
+
 /**
  * struct iio_backend_ops - operations structure for an iio_backend
  * @enable: Enable backend.
@@ -101,6 +109,7 @@ enum iio_backend_interface_type {
  * @read_raw: Read a channel attribute from a backend device
  * @debugfs_print_chan_status: Print channel status into a buffer.
  * @debugfs_reg_access: Read or write register value of backend.
+ * @filter_type_set: Set filter type.
  * @ddr_enable: Enable interface DDR (Double Data Rate) mode.
  * @ddr_disable: Disable interface DDR (Double Data Rate) mode.
  * @data_stream_enable: Enable data stream.
@@ -153,6 +162,8 @@ struct iio_backend_ops {
 					 size_t len);
 	int (*debugfs_reg_access)(struct iio_backend *back, unsigned int reg,
 				  unsigned int writeval, unsigned int *readval);
+	int (*filter_type_set)(struct iio_backend *back,
+			       enum iio_backend_filter_type type);
 	int (*ddr_enable)(struct iio_backend *back);
 	int (*ddr_disable)(struct iio_backend *back);
 	int (*data_stream_enable)(struct iio_backend *back);
@@ -195,6 +206,8 @@ int iio_backend_data_sample_trigger(struct iio_backend *back,
 int devm_iio_backend_request_buffer(struct device *dev,
 				    struct iio_backend *back,
 				    struct iio_dev *indio_dev);
+int iio_backend_filter_type_set(struct iio_backend *back,
+				enum iio_backend_filter_type type);
 int iio_backend_ddr_enable(struct iio_backend *back);
 int iio_backend_ddr_disable(struct iio_backend *back);
 int iio_backend_data_stream_enable(struct iio_backend *back);
-- 
cgit v1.2.3


From 995fd6e002b0d1ac435faed68005585906467e92 Mon Sep 17 00:00:00 2001
From: Antoniu Miclaus <antoniu.miclaus@analog.com>
Date: Fri, 16 May 2025 11:26:22 +0300
Subject: iio: backend: add support for data alignment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add backend support for starting the capture synchronization.
When activated, it initiates a process that aligns the sample's most
significant bit (MSB) based solely on the captured data, without
considering any other external signals.

Reviewed-by: Nuno Sá <nuno.sa@analog.com>
Signed-off-by: Antoniu Miclaus <antoniu.miclaus@analog.com>
Link: https://patch.msgid.link/20250516082630.8236-3-antoniu.miclaus@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/industrialio-backend.c | 23 +++++++++++++++++++++++
 include/linux/iio/backend.h        |  3 +++
 2 files changed, 26 insertions(+)

(limited to 'include')

diff --git a/drivers/iio/industrialio-backend.c b/drivers/iio/industrialio-backend.c
index e2d46c12c15f..1fd71b520691 100644
--- a/drivers/iio/industrialio-backend.c
+++ b/drivers/iio/industrialio-backend.c
@@ -826,6 +826,29 @@ int iio_backend_filter_type_set(struct iio_backend *back,
 }
 EXPORT_SYMBOL_NS_GPL(iio_backend_filter_type_set, "IIO_BACKEND");
 
+/**
+ * iio_backend_interface_data_align - Perform the data alignment process.
+ * @back: Backend device
+ * @timeout_us: Timeout value in us.
+ *
+ * When activated, it initiates a process that aligns the sample's most
+ * significant bit (MSB) based solely on the captured data, without
+ * considering any other external signals.
+ *
+ * The timeout_us value must be greater than 0.
+ *
+ * RETURNS:
+ * 0 on success, negative error number on failure.
+ */
+int iio_backend_interface_data_align(struct iio_backend *back, u32 timeout_us)
+{
+	if (!timeout_us)
+		return -EINVAL;
+
+	return iio_backend_op_call(back, interface_data_align, timeout_us);
+}
+EXPORT_SYMBOL_NS_GPL(iio_backend_interface_data_align, "IIO_BACKEND");
+
 /**
  * iio_backend_ddr_enable - Enable interface DDR (Double Data Rate) mode
  * @back: Backend device
diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h
index 8a1690f21318..c579eb523466 100644
--- a/include/linux/iio/backend.h
+++ b/include/linux/iio/backend.h
@@ -110,6 +110,7 @@ enum iio_backend_filter_type {
  * @debugfs_print_chan_status: Print channel status into a buffer.
  * @debugfs_reg_access: Read or write register value of backend.
  * @filter_type_set: Set filter type.
+ * @interface_data_align: Perform the data alignment process.
  * @ddr_enable: Enable interface DDR (Double Data Rate) mode.
  * @ddr_disable: Disable interface DDR (Double Data Rate) mode.
  * @data_stream_enable: Enable data stream.
@@ -164,6 +165,7 @@ struct iio_backend_ops {
 				  unsigned int writeval, unsigned int *readval);
 	int (*filter_type_set)(struct iio_backend *back,
 			       enum iio_backend_filter_type type);
+	int (*interface_data_align)(struct iio_backend *back, u32 timeout_us);
 	int (*ddr_enable)(struct iio_backend *back);
 	int (*ddr_disable)(struct iio_backend *back);
 	int (*data_stream_enable)(struct iio_backend *back);
@@ -208,6 +210,7 @@ int devm_iio_backend_request_buffer(struct device *dev,
 				    struct iio_dev *indio_dev);
 int iio_backend_filter_type_set(struct iio_backend *back,
 				enum iio_backend_filter_type type);
+int iio_backend_interface_data_align(struct iio_backend *back, u32 timeout_us);
 int iio_backend_ddr_enable(struct iio_backend *back);
 int iio_backend_ddr_disable(struct iio_backend *back);
 int iio_backend_data_stream_enable(struct iio_backend *back);
-- 
cgit v1.2.3


From 5ef4cc6d2414d115cf09a6a33c3155f12e58a27d Mon Sep 17 00:00:00 2001
From: Antoniu Miclaus <antoniu.miclaus@analog.com>
Date: Fri, 16 May 2025 11:26:23 +0300
Subject: iio: backend: add support for number of lanes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add iio backend support for number of lanes to be enabled.

Reviewed-by: Nuno Sá <nuno.sa@analog.com>
Signed-off-by: Antoniu Miclaus <antoniu.miclaus@analog.com>
Link: https://patch.msgid.link/20250516082630.8236-4-antoniu.miclaus@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/industrialio-backend.c | 17 +++++++++++++++++
 include/linux/iio/backend.h        |  3 +++
 2 files changed, 20 insertions(+)

(limited to 'include')

diff --git a/drivers/iio/industrialio-backend.c b/drivers/iio/industrialio-backend.c
index 1fd71b520691..6b2d3dac52b3 100644
--- a/drivers/iio/industrialio-backend.c
+++ b/drivers/iio/industrialio-backend.c
@@ -849,6 +849,23 @@ int iio_backend_interface_data_align(struct iio_backend *back, u32 timeout_us)
 }
 EXPORT_SYMBOL_NS_GPL(iio_backend_interface_data_align, "IIO_BACKEND");
 
+/**
+ * iio_backend_num_lanes_set - Set the number of lanes enabled.
+ * @back: Backend device
+ * @num_lanes: Number of lanes.
+ *
+ * RETURNS:
+ * 0 on success, negative error number on failure.
+ */
+int iio_backend_num_lanes_set(struct iio_backend *back, unsigned int num_lanes)
+{
+	if (!num_lanes)
+		return -EINVAL;
+
+	return iio_backend_op_call(back, num_lanes_set, num_lanes);
+}
+EXPORT_SYMBOL_NS_GPL(iio_backend_num_lanes_set, "IIO_BACKEND");
+
 /**
  * iio_backend_ddr_enable - Enable interface DDR (Double Data Rate) mode
  * @back: Backend device
diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h
index c579eb523466..1f528fbd9d11 100644
--- a/include/linux/iio/backend.h
+++ b/include/linux/iio/backend.h
@@ -111,6 +111,7 @@ enum iio_backend_filter_type {
  * @debugfs_reg_access: Read or write register value of backend.
  * @filter_type_set: Set filter type.
  * @interface_data_align: Perform the data alignment process.
+ * @num_lanes_set: Set the number of lanes enabled.
  * @ddr_enable: Enable interface DDR (Double Data Rate) mode.
  * @ddr_disable: Disable interface DDR (Double Data Rate) mode.
  * @data_stream_enable: Enable data stream.
@@ -166,6 +167,7 @@ struct iio_backend_ops {
 	int (*filter_type_set)(struct iio_backend *back,
 			       enum iio_backend_filter_type type);
 	int (*interface_data_align)(struct iio_backend *back, u32 timeout_us);
+	int (*num_lanes_set)(struct iio_backend *back, unsigned int num_lanes);
 	int (*ddr_enable)(struct iio_backend *back);
 	int (*ddr_disable)(struct iio_backend *back);
 	int (*data_stream_enable)(struct iio_backend *back);
@@ -211,6 +213,7 @@ int devm_iio_backend_request_buffer(struct device *dev,
 int iio_backend_filter_type_set(struct iio_backend *back,
 				enum iio_backend_filter_type type);
 int iio_backend_interface_data_align(struct iio_backend *back, u32 timeout_us);
+int iio_backend_num_lanes_set(struct iio_backend *back, unsigned int num_lanes);
 int iio_backend_ddr_enable(struct iio_backend *back);
 int iio_backend_ddr_disable(struct iio_backend *back);
 int iio_backend_data_stream_enable(struct iio_backend *back);
-- 
cgit v1.2.3


From 342c52dde2f031add61ddeaced9c100f88e04d09 Mon Sep 17 00:00:00 2001
From: Angelo Dureghello <adureghello@baylibre.com>
Date: Fri, 6 Jun 2025 16:19:17 +0200
Subject: iio: core: add ADC delay calibration definition

ADCs such as the ad7606 implement phase calibration as a delay. Add the
corresponding definition, needed for ad7606.

Signed-off-by: Angelo Dureghello <adureghello@baylibre.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://patch.msgid.link/20250606-wip-bl-ad7606-calibration-v9-2-6e014a1f92a2@baylibre.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/industrialio-core.c | 1 +
 include/linux/iio/types.h       | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include')

diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index 178e99b111de..f13c3aa470d7 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -188,6 +188,7 @@ static const char * const iio_chan_info_postfix[] = {
 	[IIO_CHAN_INFO_CALIBAMBIENT] = "calibambient",
 	[IIO_CHAN_INFO_ZEROPOINT] = "zeropoint",
 	[IIO_CHAN_INFO_TROUGH] = "trough_raw",
+	[IIO_CHAN_INFO_CONVDELAY] = "convdelay",
 };
 /**
  * iio_device_id() - query the unique ID for the device
diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h
index d89982c98368..ad2761efcc83 100644
--- a/include/linux/iio/types.h
+++ b/include/linux/iio/types.h
@@ -69,6 +69,7 @@ enum iio_chan_info_enum {
 	IIO_CHAN_INFO_CALIBAMBIENT,
 	IIO_CHAN_INFO_ZEROPOINT,
 	IIO_CHAN_INFO_TROUGH,
+	IIO_CHAN_INFO_CONVDELAY,
 };
 
 #endif /* _IIO_TYPES_H_ */
-- 
cgit v1.2.3


From 4c46a471be12216347ba707f8eadadbf5d68e698 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 3 Jun 2025 16:38:53 +0530
Subject: firmware: arm_ffa: Fix the missing entry in struct
 ffa_indirect_msg_hdr

As per the spec, one 32 bit reserved entry is missing here, add it.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Fixes: 910cc1acc9b4 ("firmware: arm_ffa: Add support for passing UUID in FFA_MSG_SEND2")
Reviewed-by: Bertrand Marquis <bertrand.marquis@arm.com>
Message-Id: <28a624fbf416975de4fbe08cfbf7c2db89cb630e.1748948911.git.viresh.kumar@linaro.org>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 include/linux/arm_ffa.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h
index 5bded24dc24f..e1634897e159 100644
--- a/include/linux/arm_ffa.h
+++ b/include/linux/arm_ffa.h
@@ -283,6 +283,7 @@ struct ffa_indirect_msg_hdr {
 	u32 offset;
 	u32 send_recv_id;
 	u32 size;
+	u32 res1;
 	uuid_t uuid;
 };
 
-- 
cgit v1.2.3


From de1c831a7898f164c1c2703c6b2b9e4fb4bebefc Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@kernel.org>
Date: Tue, 15 Apr 2025 10:02:33 -0700
Subject: slab: Decouple slab_debug and no_hash_pointers

Some system owners use slab_debug=FPZ (or similar) as a hardening option,
but do not want to be forced into having kernel addresses exposed due
to the implicit "no_hash_pointers" boot param setting.[1]

Introduce the "hash_pointers" boot param, which defaults to "auto"
(the current behavior), but also includes "always" (forcing on hashing
even when "slab_debug=..." is defined), and "never". The existing
"no_hash_pointers" boot param becomes an alias for "hash_pointers=never".

This makes it possible to boot with "slab_debug=FPZ hash_pointers=always".

Link: https://github.com/KSPP/linux/issues/368 [1]
Fixes: 792702911f58 ("slub: force on no_hash_pointers when slub_debug is enabled")
Co-developed-by: Sergio Perez Gonzalez <sperezglz@gmail.com>
Signed-off-by: Sergio Perez Gonzalez <sperezglz@gmail.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: David Rientjes <rientjes@google.com>
Reviewed-by: Bagas Sanjaya <bagasdotme@gmail.com>
Signed-off-by: Kees Cook <kees@kernel.org>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Acked-by: Rafael Aquini <raquini@redhat.com>
Tested-by: Petr Mladek <pmladek@suse.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Link: https://patch.msgid.link/20250415170232.it.467-kees@kernel.org
[kees@kernel.org: Add note about hash_pointers into slab_debug kernel parameter documentation.]
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 Documentation/admin-guide/kernel-parameters.txt | 38 ++++++++++-----
 include/linux/sprintf.h                         |  2 +-
 lib/vsprintf.c                                  | 61 +++++++++++++++++++++++--
 mm/slub.c                                       |  5 +-
 4 files changed, 86 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 3f35d5b8c296..0dd5cd17e87e 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1798,6 +1798,27 @@
 			backtraces on all cpus.
 			Format: 0 | 1
 
+	hash_pointers=
+			[KNL,EARLY]
+			By default, when pointers are printed to the console
+			or buffers via the %p format string, that pointer is
+			"hashed", i.e. obscured by hashing the pointer value.
+			This is a security feature that hides actual kernel
+			addresses from unprivileged users, but it also makes
+			debugging the kernel more difficult since unequal
+			pointers can no longer be compared. The choices are:
+			Format: { auto | always | never }
+			Default: auto
+
+			auto   - Hash pointers unless slab_debug is enabled.
+			always - Always hash pointers (even if slab_debug is
+				 enabled).
+			never  - Never hash pointers. This option should only
+				 be specified when debugging the kernel. Do
+				 not use on production kernels. The boot
+				 param "no_hash_pointers" is an alias for
+				 this mode.
+
 	hashdist=	[KNL,NUMA] Large hashes allocated during boot
 			are distributed across NUMA nodes.  Defaults on
 			for 64-bit NUMA, off otherwise.
@@ -4120,18 +4141,7 @@
 
 	no_hash_pointers
 			[KNL,EARLY]
-			Force pointers printed to the console or buffers to be
-			unhashed.  By default, when a pointer is printed via %p
-			format string, that pointer is "hashed", i.e. obscured
-			by hashing the pointer value.  This is a security feature
-			that hides actual kernel addresses from unprivileged
-			users, but it also makes debugging the kernel more
-			difficult since unequal pointers can no longer be
-			compared.  However, if this command-line option is
-			specified, then all normal pointers will have their true
-			value printed. This option should only be specified when
-			debugging the kernel.  Please do not use on production
-			kernels.
+			Alias for "hash_pointers=never".
 
 	nohibernate	[HIBERNATION] Disable hibernation and resume.
 
@@ -6481,6 +6491,10 @@
 			Documentation/mm/slub.rst.
 			(slub_debug legacy name also accepted for now)
 
+			Using this option implies the "no_hash_pointers"
+			option which can be undone by adding the
+			"hash_pointers=always" option.
+
 	slab_max_order= [MM]
 			Determines the maximum allowed order for slabs.
 			A high setting may cause OOMs due to memory
diff --git a/include/linux/sprintf.h b/include/linux/sprintf.h
index 51cab2def9ec..521bb2cd2648 100644
--- a/include/linux/sprintf.h
+++ b/include/linux/sprintf.h
@@ -22,7 +22,7 @@ __scanf(2, 0) int vsscanf(const char *, const char *, va_list);
 
 /* These are for specific cases, do not use without real need */
 extern bool no_hash_pointers;
-int no_hash_pointers_enable(char *str);
+void hash_pointers_finalize(bool slub_debug);
 
 /* Used for Rust formatting ('%pA') */
 char *rust_fmt_argument(char *buf, char *end, const void *ptr);
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 01699852f30c..22cbd75266ef 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -60,6 +60,20 @@
 bool no_hash_pointers __ro_after_init;
 EXPORT_SYMBOL_GPL(no_hash_pointers);
 
+/*
+ * Hashed pointers policy selected by "hash_pointers=..." boot param
+ *
+ * `auto`   - Hashed pointers enabled unless disabled by slub_debug_enabled=true
+ * `always` - Hashed pointers enabled unconditionally
+ * `never`  - Hashed pointers disabled unconditionally
+ */
+enum hash_pointers_policy {
+	HASH_PTR_AUTO = 0,
+	HASH_PTR_ALWAYS,
+	HASH_PTR_NEVER
+};
+static enum hash_pointers_policy hash_pointers_mode __initdata;
+
 noinline
 static unsigned long long simple_strntoull(const char *startp, char **endp, unsigned int base, size_t max_chars)
 {
@@ -2271,12 +2285,23 @@ char *resource_or_range(const char *fmt, char *buf, char *end, void *ptr,
 	return resource_string(buf, end, ptr, spec, fmt);
 }
 
-int __init no_hash_pointers_enable(char *str)
+void __init hash_pointers_finalize(bool slub_debug)
 {
-	if (no_hash_pointers)
-		return 0;
+	switch (hash_pointers_mode) {
+	case HASH_PTR_ALWAYS:
+		no_hash_pointers = false;
+		break;
+	case HASH_PTR_NEVER:
+		no_hash_pointers = true;
+		break;
+	case HASH_PTR_AUTO:
+	default:
+		no_hash_pointers = slub_debug;
+		break;
+	}
 
-	no_hash_pointers = true;
+	if (!no_hash_pointers)
+		return;
 
 	pr_warn("**********************************************************\n");
 	pr_warn("**   NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE   **\n");
@@ -2289,11 +2314,39 @@ int __init no_hash_pointers_enable(char *str)
 	pr_warn("** the kernel, report this immediately to your system   **\n");
 	pr_warn("** administrator!                                       **\n");
 	pr_warn("**                                                      **\n");
+	pr_warn("** Use hash_pointers=always to force this mode off      **\n");
+	pr_warn("**                                                      **\n");
 	pr_warn("**   NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE   **\n");
 	pr_warn("**********************************************************\n");
+}
+
+static int __init hash_pointers_mode_parse(char *str)
+{
+	if (!str) {
+		pr_warn("Hash pointers mode empty; falling back to auto.\n");
+		hash_pointers_mode = HASH_PTR_AUTO;
+	} else if (strncmp(str, "auto", 4) == 0)   {
+		pr_info("Hash pointers mode set to auto.\n");
+		hash_pointers_mode = HASH_PTR_AUTO;
+	} else if (strncmp(str, "never", 5) == 0) {
+		pr_info("Hash pointers mode set to never.\n");
+		hash_pointers_mode = HASH_PTR_NEVER;
+	} else if (strncmp(str, "always", 6) == 0) {
+		pr_info("Hash pointers mode set to always.\n");
+		hash_pointers_mode = HASH_PTR_ALWAYS;
+	} else {
+		pr_warn("Unknown hash_pointers mode '%s' specified; assuming auto.\n", str);
+		hash_pointers_mode = HASH_PTR_AUTO;
+	}
 
 	return 0;
 }
+early_param("hash_pointers", hash_pointers_mode_parse);
+
+static int __init no_hash_pointers_enable(char *str)
+{
+	return hash_pointers_mode_parse("never");
+}
 early_param("no_hash_pointers", no_hash_pointers_enable);
 
 /*
diff --git a/mm/slub.c b/mm/slub.c
index b46f87662e71..f3d61b330a76 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -6314,9 +6314,8 @@ void __init kmem_cache_init(void)
 	if (debug_guardpage_minorder())
 		slub_max_order = 0;
 
-	/* Print slub debugging pointers without hashing */
-	if (__slub_debug_enabled())
-		no_hash_pointers_enable(NULL);
+	/* Inform pointer hashing choice about slub debugging state. */
+	hash_pointers_finalize(__slub_debug_enabled());
 
 	kmem_cache_node = &boot_kmem_cache_node;
 	kmem_cache = &boot_kmem_cache;
-- 
cgit v1.2.3


From 78b2d9908b42ea70e42f00af5db08ad514727a45 Mon Sep 17 00:00:00 2001
From: Jacob Keller <jacob.e.keller@intel.com>
Date: Mon, 5 May 2025 13:14:22 -0700
Subject: net: intel: rename 'hena' to 'hashcfg' for clarity

i40e, ice, and iAVF all use 'hena' as a shorthand for the "hash enable"
configuration. This comes originally from the X710 datasheet 'xxQF_HENA'
registers. In the context of the registers the meaning is fairly clear.

However, on its own, hena is a weird name that can be more difficult to
understand. This is especially true in ice. The E810 hardware doesn't even
have registers with HENA in the name.

Replace the shorthand 'hena' with 'hashcfg'. This makes it clear the
variables deal with the Hash configuration, not just a single boolean
on/off for all hashing.

Do not update the register names. These come directly from the datasheet
for X710 and X722, and it is more important that the names can be searched.

Suggested-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Rafal Romanowski <rafal.romanowski@intel.com>
Tested-by: Rinitha S <sx.rinitha@intel.com> (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c        |  2 +-
 drivers/net/ethernet/intel/i40e/i40e_txrx.h        |  8 ++--
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 46 +++++++++++-----------
 drivers/net/ethernet/intel/iavf/iavf.h             | 10 ++---
 drivers/net/ethernet/intel/iavf/iavf_main.c        | 17 ++++----
 drivers/net/ethernet/intel/iavf/iavf_txrx.h        |  4 +-
 drivers/net/ethernet/intel/iavf/iavf_virtchnl.c    | 33 ++++++++--------
 drivers/net/ethernet/intel/ice/ice_flow.h          |  4 +-
 drivers/net/ethernet/intel/ice/ice_lib.c           |  2 +-
 drivers/net/ethernet/intel/ice/ice_virtchnl.c      | 44 ++++++++++-----------
 drivers/net/ethernet/intel/ice/ice_virtchnl.h      |  4 +-
 .../ethernet/intel/ice/ice_virtchnl_allowlist.c    |  2 +-
 include/linux/avf/virtchnl.h                       | 22 +++++------
 13 files changed, 101 insertions(+), 97 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 120d68654e3f..516e07b58161 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -12507,7 +12507,7 @@ static int i40e_pf_config_rss(struct i40e_pf *pf)
 	/* By default we enable TCP/UDP with IPv4/IPv6 ptypes */
 	hena = (u64)i40e_read_rx_ctl(hw, I40E_PFQF_HENA(0)) |
 		((u64)i40e_read_rx_ctl(hw, I40E_PFQF_HENA(1)) << 32);
-	hena |= i40e_pf_get_default_rss_hena(pf);
+	hena |= i40e_pf_get_default_rss_hashcfg(pf);
 
 	i40e_write_rx_ctl(hw, I40E_PFQF_HENA(0), (u32)hena);
 	i40e_write_rx_ctl(hw, I40E_PFQF_HENA(1), (u32)(hena >> 32));
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
index 7c26c9a2bf65..b007a84268a7 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
@@ -71,7 +71,7 @@ enum i40e_dyn_idx {
 #define I40E_SW_ITR    I40E_IDX_ITR2
 
 /* Supported RSS offloads */
-#define I40E_DEFAULT_RSS_HENA ( \
+#define I40E_DEFAULT_RSS_HASHCFG ( \
 	BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV4_UDP) | \
 	BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV4_SCTP) | \
 	BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV4_TCP) | \
@@ -84,7 +84,7 @@ enum i40e_dyn_idx {
 	BIT_ULL(I40E_FILTER_PCTYPE_FRAG_IPV6) | \
 	BIT_ULL(I40E_FILTER_PCTYPE_L2_PAYLOAD))
 
-#define I40E_DEFAULT_RSS_HENA_EXPANDED (I40E_DEFAULT_RSS_HENA | \
+#define I40E_DEFAULT_RSS_HASHCFG_EXPANDED (I40E_DEFAULT_RSS_HASHCFG | \
 	BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV4_TCP_SYN_NO_ACK) | \
 	BIT_ULL(I40E_FILTER_PCTYPE_NONF_UNICAST_IPV4_UDP) | \
 	BIT_ULL(I40E_FILTER_PCTYPE_NONF_MULTICAST_IPV4_UDP) | \
@@ -92,9 +92,9 @@ enum i40e_dyn_idx {
 	BIT_ULL(I40E_FILTER_PCTYPE_NONF_UNICAST_IPV6_UDP) | \
 	BIT_ULL(I40E_FILTER_PCTYPE_NONF_MULTICAST_IPV6_UDP))
 
-#define i40e_pf_get_default_rss_hena(pf) \
+#define i40e_pf_get_default_rss_hashcfg(pf) \
 	(test_bit(I40E_HW_CAP_MULTI_TCP_UDP_RSS_PCTYPE, (pf)->hw.caps) ? \
-	 I40E_DEFAULT_RSS_HENA_EXPANDED : I40E_DEFAULT_RSS_HENA)
+	 I40E_DEFAULT_RSS_HASHCFG_EXPANDED : I40E_DEFAULT_RSS_HASHCFG)
 
 /* Supported Rx Buffer Sizes (a multiple of 128) */
 #define I40E_RXBUFFER_256   256
diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
index 1120f8e4bb67..2d9b7e51bbe1 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
@@ -812,7 +812,7 @@ static int i40e_alloc_vsi_res(struct i40e_vf *vf, u8 idx)
 	}
 
 	if (!idx) {
-		u64 hena = i40e_pf_get_default_rss_hena(pf);
+		u64 hashcfg = i40e_pf_get_default_rss_hashcfg(pf);
 		u8 broadcast[ETH_ALEN];
 
 		vf->lan_vsi_idx = vsi->idx;
@@ -841,8 +841,9 @@ static int i40e_alloc_vsi_res(struct i40e_vf *vf, u8 idx)
 			dev_info(&pf->pdev->dev,
 				 "Could not allocate VF broadcast filter\n");
 		spin_unlock_bh(&vsi->mac_filter_hash_lock);
-		wr32(&pf->hw, I40E_VFQF_HENA1(0, vf->vf_id), (u32)hena);
-		wr32(&pf->hw, I40E_VFQF_HENA1(1, vf->vf_id), (u32)(hena >> 32));
+		wr32(&pf->hw, I40E_VFQF_HENA1(0, vf->vf_id), (u32)hashcfg);
+		wr32(&pf->hw, I40E_VFQF_HENA1(1, vf->vf_id),
+		     (u32)(hashcfg >> 32));
 		/* program mac filter only for VF VSI */
 		ret = i40e_sync_vsi_filters(vsi);
 		if (ret)
@@ -3447,15 +3448,15 @@ err:
 }
 
 /**
- * i40e_vc_get_rss_hena
+ * i40e_vc_get_rss_hashcfg
  * @vf: pointer to the VF info
  * @msg: pointer to the msg buffer
  *
- * Return the RSS HENA bits allowed by the hardware
+ * Return the RSS Hash configuration bits allowed by the hardware
  **/
-static int i40e_vc_get_rss_hena(struct i40e_vf *vf, u8 *msg)
+static int i40e_vc_get_rss_hashcfg(struct i40e_vf *vf, u8 *msg)
 {
-	struct virtchnl_rss_hena *vrh = NULL;
+	struct virtchnl_rss_hashcfg *vrh = NULL;
 	struct i40e_pf *pf = vf->pf;
 	int aq_ret = 0;
 	int len = 0;
@@ -3464,7 +3465,7 @@ static int i40e_vc_get_rss_hena(struct i40e_vf *vf, u8 *msg)
 		aq_ret = -EINVAL;
 		goto err;
 	}
-	len = sizeof(struct virtchnl_rss_hena);
+	len = sizeof(struct virtchnl_rss_hashcfg);
 
 	vrh = kzalloc(len, GFP_KERNEL);
 	if (!vrh) {
@@ -3472,26 +3473,26 @@ static int i40e_vc_get_rss_hena(struct i40e_vf *vf, u8 *msg)
 		len = 0;
 		goto err;
 	}
-	vrh->hena = i40e_pf_get_default_rss_hena(pf);
+	vrh->hashcfg = i40e_pf_get_default_rss_hashcfg(pf);
 err:
 	/* send the response back to the VF */
-	aq_ret = i40e_vc_send_msg_to_vf(vf, VIRTCHNL_OP_GET_RSS_HENA_CAPS,
+	aq_ret = i40e_vc_send_msg_to_vf(vf, VIRTCHNL_OP_GET_RSS_HASHCFG_CAPS,
 					aq_ret, (u8 *)vrh, len);
 	kfree(vrh);
 	return aq_ret;
 }
 
 /**
- * i40e_vc_set_rss_hena
+ * i40e_vc_set_rss_hashcfg
  * @vf: pointer to the VF info
  * @msg: pointer to the msg buffer
  *
- * Set the RSS HENA bits for the VF
+ * Set the RSS Hash configuration bits for the VF
  **/
-static int i40e_vc_set_rss_hena(struct i40e_vf *vf, u8 *msg)
+static int i40e_vc_set_rss_hashcfg(struct i40e_vf *vf, u8 *msg)
 {
-	struct virtchnl_rss_hena *vrh =
-		(struct virtchnl_rss_hena *)msg;
+	struct virtchnl_rss_hashcfg *vrh =
+		(struct virtchnl_rss_hashcfg *)msg;
 	struct i40e_pf *pf = vf->pf;
 	struct i40e_hw *hw = &pf->hw;
 	int aq_ret = 0;
@@ -3500,13 +3501,14 @@ static int i40e_vc_set_rss_hena(struct i40e_vf *vf, u8 *msg)
 		aq_ret = -EINVAL;
 		goto err;
 	}
-	i40e_write_rx_ctl(hw, I40E_VFQF_HENA1(0, vf->vf_id), (u32)vrh->hena);
+	i40e_write_rx_ctl(hw, I40E_VFQF_HENA1(0, vf->vf_id),
+			  (u32)vrh->hashcfg);
 	i40e_write_rx_ctl(hw, I40E_VFQF_HENA1(1, vf->vf_id),
-			  (u32)(vrh->hena >> 32));
+			  (u32)(vrh->hashcfg >> 32));
 
 	/* send the response to the VF */
 err:
-	return i40e_vc_send_resp_to_vf(vf, VIRTCHNL_OP_SET_RSS_HENA, aq_ret);
+	return i40e_vc_send_resp_to_vf(vf, VIRTCHNL_OP_SET_RSS_HASHCFG, aq_ret);
 }
 
 /**
@@ -4253,11 +4255,11 @@ int i40e_vc_process_vf_msg(struct i40e_pf *pf, s16 vf_id, u32 v_opcode,
 	case VIRTCHNL_OP_CONFIG_RSS_LUT:
 		ret = i40e_vc_config_rss_lut(vf, msg);
 		break;
-	case VIRTCHNL_OP_GET_RSS_HENA_CAPS:
-		ret = i40e_vc_get_rss_hena(vf, msg);
+	case VIRTCHNL_OP_GET_RSS_HASHCFG_CAPS:
+		ret = i40e_vc_get_rss_hashcfg(vf, msg);
 		break;
-	case VIRTCHNL_OP_SET_RSS_HENA:
-		ret = i40e_vc_set_rss_hena(vf, msg);
+	case VIRTCHNL_OP_SET_RSS_HASHCFG:
+		ret = i40e_vc_set_rss_hashcfg(vf, msg);
 		break;
 	case VIRTCHNL_OP_ENABLE_VLAN_STRIPPING:
 		ret = i40e_vc_enable_vlan_stripping(vf, msg);
diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h
index f7a98ff43a57..eb86cca38be2 100644
--- a/drivers/net/ethernet/intel/iavf/iavf.h
+++ b/drivers/net/ethernet/intel/iavf/iavf.h
@@ -315,8 +315,8 @@ struct iavf_adapter {
 #define IAVF_FLAG_AQ_CONFIGURE_RSS		BIT_ULL(9) /* direct AQ config */
 #define IAVF_FLAG_AQ_GET_CONFIG			BIT_ULL(10)
 /* Newer style, RSS done by the PF so we can ignore hardware vagaries. */
-#define IAVF_FLAG_AQ_GET_HENA			BIT_ULL(11)
-#define IAVF_FLAG_AQ_SET_HENA			BIT_ULL(12)
+#define IAVF_FLAG_AQ_GET_RSS_HASHCFG		BIT_ULL(11)
+#define IAVF_FLAG_AQ_SET_RSS_HASHCFG		BIT_ULL(12)
 #define IAVF_FLAG_AQ_SET_RSS_KEY		BIT_ULL(13)
 #define IAVF_FLAG_AQ_SET_RSS_LUT		BIT_ULL(14)
 #define IAVF_FLAG_AQ_SET_RSS_HFUNC		BIT_ULL(15)
@@ -456,7 +456,7 @@ struct iavf_adapter {
 	u32 aq_wait_count;
 	/* RSS stuff */
 	enum virtchnl_rss_algorithm hfunc;
-	u64 hena;
+	u64 rss_hashcfg;
 	u16 rss_key_size;
 	u16 rss_lut_size;
 	u8 *rss_key;
@@ -600,8 +600,8 @@ void iavf_set_promiscuous(struct iavf_adapter *adapter);
 bool iavf_promiscuous_mode_changed(struct iavf_adapter *adapter);
 void iavf_request_stats(struct iavf_adapter *adapter);
 int iavf_request_reset(struct iavf_adapter *adapter);
-void iavf_get_hena(struct iavf_adapter *adapter);
-void iavf_set_hena(struct iavf_adapter *adapter);
+void iavf_get_rss_hashcfg(struct iavf_adapter *adapter);
+void iavf_set_rss_hashcfg(struct iavf_adapter *adapter);
 void iavf_set_rss_key(struct iavf_adapter *adapter);
 void iavf_set_rss_lut(struct iavf_adapter *adapter);
 void iavf_set_rss_hfunc(struct iavf_adapter *adapter);
diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c
index 2c0bb41809a4..01e11ac5055b 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_main.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_main.c
@@ -1823,12 +1823,13 @@ static int iavf_init_rss(struct iavf_adapter *adapter)
 		/* Enable PCTYPES for RSS, TCP/UDP with IPv4/IPv6 */
 		if (adapter->vf_res->vf_cap_flags &
 		    VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2)
-			adapter->hena = IAVF_DEFAULT_RSS_HENA_EXPANDED;
+			adapter->rss_hashcfg =
+				IAVF_DEFAULT_RSS_HASHCFG_EXPANDED;
 		else
-			adapter->hena = IAVF_DEFAULT_RSS_HENA;
+			adapter->rss_hashcfg = IAVF_DEFAULT_RSS_HASHCFG;
 
-		wr32(hw, IAVF_VFQF_HENA(0), (u32)adapter->hena);
-		wr32(hw, IAVF_VFQF_HENA(1), (u32)(adapter->hena >> 32));
+		wr32(hw, IAVF_VFQF_HENA(0), (u32)adapter->rss_hashcfg);
+		wr32(hw, IAVF_VFQF_HENA(1), (u32)(adapter->rss_hashcfg >> 32));
 	}
 
 	iavf_fill_rss_lut(adapter);
@@ -2195,12 +2196,12 @@ static int iavf_process_aq_command(struct iavf_adapter *adapter)
 		adapter->aq_required &= ~IAVF_FLAG_AQ_CONFIGURE_RSS;
 		return 0;
 	}
-	if (adapter->aq_required & IAVF_FLAG_AQ_GET_HENA) {
-		iavf_get_hena(adapter);
+	if (adapter->aq_required & IAVF_FLAG_AQ_GET_RSS_HASHCFG) {
+		iavf_get_rss_hashcfg(adapter);
 		return 0;
 	}
-	if (adapter->aq_required & IAVF_FLAG_AQ_SET_HENA) {
-		iavf_set_hena(adapter);
+	if (adapter->aq_required & IAVF_FLAG_AQ_SET_RSS_HASHCFG) {
+		iavf_set_rss_hashcfg(adapter);
 		return 0;
 	}
 	if (adapter->aq_required & IAVF_FLAG_AQ_SET_RSS_KEY) {
diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.h b/drivers/net/ethernet/intel/iavf/iavf_txrx.h
index 79ad554f2d53..94b324f212bd 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_txrx.h
+++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.h
@@ -59,7 +59,7 @@ enum iavf_dyn_idx_t {
 #define IAVF_PE_ITR    IAVF_IDX_ITR2
 
 /* Supported RSS offloads */
-#define IAVF_DEFAULT_RSS_HENA ( \
+#define IAVF_DEFAULT_RSS_HASHCFG ( \
 	BIT_ULL(IAVF_FILTER_PCTYPE_NONF_IPV4_UDP) | \
 	BIT_ULL(IAVF_FILTER_PCTYPE_NONF_IPV4_SCTP) | \
 	BIT_ULL(IAVF_FILTER_PCTYPE_NONF_IPV4_TCP) | \
@@ -72,7 +72,7 @@ enum iavf_dyn_idx_t {
 	BIT_ULL(IAVF_FILTER_PCTYPE_FRAG_IPV6) | \
 	BIT_ULL(IAVF_FILTER_PCTYPE_L2_PAYLOAD))
 
-#define IAVF_DEFAULT_RSS_HENA_EXPANDED (IAVF_DEFAULT_RSS_HENA | \
+#define IAVF_DEFAULT_RSS_HASHCFG_EXPANDED (IAVF_DEFAULT_RSS_HASHCFG | \
 	BIT_ULL(IAVF_FILTER_PCTYPE_NONF_IPV4_TCP_SYN_NO_ACK) | \
 	BIT_ULL(IAVF_FILTER_PCTYPE_NONF_UNICAST_IPV4_UDP) | \
 	BIT_ULL(IAVF_FILTER_PCTYPE_NONF_MULTICAST_IPV4_UDP) | \
diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c
index a6f0e5990be2..1815cf3e28f4 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c
@@ -1128,12 +1128,12 @@ void iavf_request_stats(struct iavf_adapter *adapter)
 }
 
 /**
- * iavf_get_hena
+ * iavf_get_rss_hashcfg
  * @adapter: adapter structure
  *
- * Request hash enable capabilities from PF
+ * Request RSS Hash enable bits from PF
  **/
-void iavf_get_hena(struct iavf_adapter *adapter)
+void iavf_get_rss_hashcfg(struct iavf_adapter *adapter)
 {
 	if (adapter->current_op != VIRTCHNL_OP_UNKNOWN) {
 		/* bail because we already have a command pending */
@@ -1141,20 +1141,20 @@ void iavf_get_hena(struct iavf_adapter *adapter)
 			adapter->current_op);
 		return;
 	}
-	adapter->current_op = VIRTCHNL_OP_GET_RSS_HENA_CAPS;
-	adapter->aq_required &= ~IAVF_FLAG_AQ_GET_HENA;
-	iavf_send_pf_msg(adapter, VIRTCHNL_OP_GET_RSS_HENA_CAPS, NULL, 0);
+	adapter->current_op = VIRTCHNL_OP_GET_RSS_HASHCFG_CAPS;
+	adapter->aq_required &= ~IAVF_FLAG_AQ_GET_RSS_HASHCFG;
+	iavf_send_pf_msg(adapter, VIRTCHNL_OP_GET_RSS_HASHCFG_CAPS, NULL, 0);
 }
 
 /**
- * iavf_set_hena
+ * iavf_set_rss_hashcfg
  * @adapter: adapter structure
  *
  * Request the PF to set our RSS hash capabilities
  **/
-void iavf_set_hena(struct iavf_adapter *adapter)
+void iavf_set_rss_hashcfg(struct iavf_adapter *adapter)
 {
-	struct virtchnl_rss_hena vrh;
+	struct virtchnl_rss_hashcfg vrh;
 
 	if (adapter->current_op != VIRTCHNL_OP_UNKNOWN) {
 		/* bail because we already have a command pending */
@@ -1162,10 +1162,10 @@ void iavf_set_hena(struct iavf_adapter *adapter)
 			adapter->current_op);
 		return;
 	}
-	vrh.hena = adapter->hena;
-	adapter->current_op = VIRTCHNL_OP_SET_RSS_HENA;
-	adapter->aq_required &= ~IAVF_FLAG_AQ_SET_HENA;
-	iavf_send_pf_msg(adapter, VIRTCHNL_OP_SET_RSS_HENA, (u8 *)&vrh,
+	vrh.hashcfg = adapter->rss_hashcfg;
+	adapter->current_op = VIRTCHNL_OP_SET_RSS_HASHCFG;
+	adapter->aq_required &= ~IAVF_FLAG_AQ_SET_RSS_HASHCFG;
+	iavf_send_pf_msg(adapter, VIRTCHNL_OP_SET_RSS_HASHCFG, (u8 *)&vrh,
 			 sizeof(vrh));
 }
 
@@ -2735,11 +2735,12 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter,
 		if (v_opcode != adapter->current_op)
 			return;
 		break;
-	case VIRTCHNL_OP_GET_RSS_HENA_CAPS: {
-		struct virtchnl_rss_hena *vrh = (struct virtchnl_rss_hena *)msg;
+	case VIRTCHNL_OP_GET_RSS_HASHCFG_CAPS: {
+		struct virtchnl_rss_hashcfg *vrh =
+			(struct virtchnl_rss_hashcfg *)msg;
 
 		if (msglen == sizeof(*vrh))
-			adapter->hena = vrh->hena;
+			adapter->rss_hashcfg = vrh->hashcfg;
 		else
 			dev_warn(&adapter->pdev->dev,
 				 "Invalid message %d from PF\n", v_opcode);
diff --git a/drivers/net/ethernet/intel/ice/ice_flow.h b/drivers/net/ethernet/intel/ice/ice_flow.h
index 6cb7bb879c98..b1313fb61677 100644
--- a/drivers/net/ethernet/intel/ice/ice_flow.h
+++ b/drivers/net/ethernet/intel/ice/ice_flow.h
@@ -295,10 +295,10 @@ enum ice_flow_avf_hdr_field {
 };
 
 /* Supported RSS offloads  This macro is defined to support
- * VIRTCHNL_OP_GET_RSS_HENA_CAPS ops. PF driver sends the RSS hardware
+ * VIRTCHNL_OP_GET_RSS_HASHCFG_CAPS ops. PF driver sends the RSS hardware
  * capabilities to the caller of this ops.
  */
-#define ICE_DEFAULT_RSS_HENA ( \
+#define ICE_DEFAULT_RSS_HASHCFG ( \
 	BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_UDP) | \
 	BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_SCTP) | \
 	BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_TCP) | \
diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c
index 03bb16191237..2cc050db509f 100644
--- a/drivers/net/ethernet/intel/ice/ice_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_lib.c
@@ -1579,7 +1579,7 @@ static void ice_vsi_set_vf_rss_flow_fld(struct ice_vsi *vsi)
 		return;
 	}
 
-	status = ice_add_avf_rss_cfg(&pf->hw, vsi, ICE_DEFAULT_RSS_HENA);
+	status = ice_add_avf_rss_cfg(&pf->hw, vsi, ICE_DEFAULT_RSS_HASHCFG);
 	if (status)
 		dev_dbg(dev, "ice_add_avf_rss_cfg failed for vsi = %d, error = %d\n",
 			vsi->vsi_num, status);
diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl.c b/drivers/net/ethernet/intel/ice/ice_virtchnl.c
index eeeb9968e477..24426dcd8aa2 100644
--- a/drivers/net/ethernet/intel/ice/ice_virtchnl.c
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl.c
@@ -2999,13 +2999,13 @@ error_param:
 }
 
 /**
- * ice_vc_get_rss_hena - return the RSS HENA bits allowed by the hardware
+ * ice_vc_get_rss_hashcfg - return the RSS Hash configuration
  * @vf: pointer to the VF info
  */
-static int ice_vc_get_rss_hena(struct ice_vf *vf)
+static int ice_vc_get_rss_hashcfg(struct ice_vf *vf)
 {
 	enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS;
-	struct virtchnl_rss_hena *vrh = NULL;
+	struct virtchnl_rss_hashcfg *vrh = NULL;
 	int len = 0, ret;
 
 	if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) {
@@ -3019,7 +3019,7 @@ static int ice_vc_get_rss_hena(struct ice_vf *vf)
 		goto err;
 	}
 
-	len = sizeof(struct virtchnl_rss_hena);
+	len = sizeof(struct virtchnl_rss_hashcfg);
 	vrh = kzalloc(len, GFP_KERNEL);
 	if (!vrh) {
 		v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY;
@@ -3027,23 +3027,23 @@ static int ice_vc_get_rss_hena(struct ice_vf *vf)
 		goto err;
 	}
 
-	vrh->hena = ICE_DEFAULT_RSS_HENA;
+	vrh->hashcfg = ICE_DEFAULT_RSS_HASHCFG;
 err:
 	/* send the response back to the VF */
-	ret = ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_GET_RSS_HENA_CAPS, v_ret,
+	ret = ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_GET_RSS_HASHCFG_CAPS, v_ret,
 				    (u8 *)vrh, len);
 	kfree(vrh);
 	return ret;
 }
 
 /**
- * ice_vc_set_rss_hena - set RSS HENA bits for the VF
+ * ice_vc_set_rss_hashcfg - set RSS Hash configuration bits for the VF
  * @vf: pointer to the VF info
  * @msg: pointer to the msg buffer
  */
-static int ice_vc_set_rss_hena(struct ice_vf *vf, u8 *msg)
+static int ice_vc_set_rss_hashcfg(struct ice_vf *vf, u8 *msg)
 {
-	struct virtchnl_rss_hena *vrh = (struct virtchnl_rss_hena *)msg;
+	struct virtchnl_rss_hashcfg *vrh = (struct virtchnl_rss_hashcfg *)msg;
 	enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS;
 	struct ice_pf *pf = vf->pf;
 	struct ice_vsi *vsi;
@@ -3074,9 +3074,9 @@ static int ice_vc_set_rss_hena(struct ice_vf *vf, u8 *msg)
 	 * disable RSS
 	 */
 	status = ice_rem_vsi_rss_cfg(&pf->hw, vsi->idx);
-	if (status && !vrh->hena) {
+	if (status && !vrh->hashcfg) {
 		/* only report failure to clear the current RSS configuration if
-		 * that was clearly the VF's intention (i.e. vrh->hena = 0)
+		 * that was clearly the VF's intention (i.e. vrh->hashcfg = 0)
 		 */
 		v_ret = ice_err_to_virt_err(status);
 		goto err;
@@ -3089,14 +3089,14 @@ static int ice_vc_set_rss_hena(struct ice_vf *vf, u8 *msg)
 			 vf->vf_id);
 	}
 
-	if (vrh->hena) {
-		status = ice_add_avf_rss_cfg(&pf->hw, vsi, vrh->hena);
+	if (vrh->hashcfg) {
+		status = ice_add_avf_rss_cfg(&pf->hw, vsi, vrh->hashcfg);
 		v_ret = ice_err_to_virt_err(status);
 	}
 
 	/* send the response to the VF */
 err:
-	return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_SET_RSS_HENA, v_ret,
+	return ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_SET_RSS_HASHCFG, v_ret,
 				     NULL, 0);
 }
 
@@ -4243,8 +4243,8 @@ static const struct ice_virtchnl_ops ice_virtchnl_dflt_ops = {
 	.add_vlan_msg = ice_vc_add_vlan_msg,
 	.remove_vlan_msg = ice_vc_remove_vlan_msg,
 	.query_rxdid = ice_vc_query_rxdid,
-	.get_rss_hena = ice_vc_get_rss_hena,
-	.set_rss_hena_msg = ice_vc_set_rss_hena,
+	.get_rss_hashcfg = ice_vc_get_rss_hashcfg,
+	.set_rss_hashcfg = ice_vc_set_rss_hashcfg,
 	.ena_vlan_stripping = ice_vc_ena_vlan_stripping,
 	.dis_vlan_stripping = ice_vc_dis_vlan_stripping,
 	.handle_rss_cfg_msg = ice_vc_handle_rss_cfg,
@@ -4380,8 +4380,8 @@ static const struct ice_virtchnl_ops ice_virtchnl_repr_ops = {
 	.add_vlan_msg = ice_vc_add_vlan_msg,
 	.remove_vlan_msg = ice_vc_remove_vlan_msg,
 	.query_rxdid = ice_vc_query_rxdid,
-	.get_rss_hena = ice_vc_get_rss_hena,
-	.set_rss_hena_msg = ice_vc_set_rss_hena,
+	.get_rss_hashcfg = ice_vc_get_rss_hashcfg,
+	.set_rss_hashcfg = ice_vc_set_rss_hashcfg,
 	.ena_vlan_stripping = ice_vc_ena_vlan_stripping,
 	.dis_vlan_stripping = ice_vc_dis_vlan_stripping,
 	.handle_rss_cfg_msg = ice_vc_handle_rss_cfg,
@@ -4582,11 +4582,11 @@ error_handler:
 	case VIRTCHNL_OP_GET_SUPPORTED_RXDIDS:
 		err = ops->query_rxdid(vf);
 		break;
-	case VIRTCHNL_OP_GET_RSS_HENA_CAPS:
-		err = ops->get_rss_hena(vf);
+	case VIRTCHNL_OP_GET_RSS_HASHCFG_CAPS:
+		err = ops->get_rss_hashcfg(vf);
 		break;
-	case VIRTCHNL_OP_SET_RSS_HENA:
-		err = ops->set_rss_hena_msg(vf, msg);
+	case VIRTCHNL_OP_SET_RSS_HASHCFG:
+		err = ops->set_rss_hashcfg(vf, msg);
 		break;
 	case VIRTCHNL_OP_ENABLE_VLAN_STRIPPING:
 		err = ops->ena_vlan_stripping(vf);
diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl.h b/drivers/net/ethernet/intel/ice/ice_virtchnl.h
index 222990f229d5..b3eece8c6780 100644
--- a/drivers/net/ethernet/intel/ice/ice_virtchnl.h
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl.h
@@ -57,8 +57,8 @@ struct ice_virtchnl_ops {
 	int (*add_vlan_msg)(struct ice_vf *vf, u8 *msg);
 	int (*remove_vlan_msg)(struct ice_vf *vf, u8 *msg);
 	int (*query_rxdid)(struct ice_vf *vf);
-	int (*get_rss_hena)(struct ice_vf *vf);
-	int (*set_rss_hena_msg)(struct ice_vf *vf, u8 *msg);
+	int (*get_rss_hashcfg)(struct ice_vf *vf);
+	int (*set_rss_hashcfg)(struct ice_vf *vf, u8 *msg);
 	int (*ena_vlan_stripping)(struct ice_vf *vf);
 	int (*dis_vlan_stripping)(struct ice_vf *vf);
 	int (*handle_rss_cfg_msg)(struct ice_vf *vf, u8 *msg, bool add);
diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.c
index a3d1579a619a..4c2ec2337b38 100644
--- a/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.c
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.c
@@ -65,7 +65,7 @@ static const u32 vlan_v2_allowlist_opcodes[] = {
 /* VIRTCHNL_VF_OFFLOAD_RSS_PF */
 static const u32 rss_pf_allowlist_opcodes[] = {
 	VIRTCHNL_OP_CONFIG_RSS_KEY, VIRTCHNL_OP_CONFIG_RSS_LUT,
-	VIRTCHNL_OP_GET_RSS_HENA_CAPS, VIRTCHNL_OP_SET_RSS_HENA,
+	VIRTCHNL_OP_GET_RSS_HASHCFG_CAPS, VIRTCHNL_OP_SET_RSS_HASHCFG,
 	VIRTCHNL_OP_CONFIG_RSS_HFUNC,
 };
 
diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index cf0afa60e4a7..362d1cdc8cd8 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -132,8 +132,8 @@ enum virtchnl_ops {
 	VIRTCHNL_OP_RELEASE_RDMA_IRQ_MAP = VIRTCHNL_OP_RELEASE_IWARP_IRQ_MAP,
 	VIRTCHNL_OP_CONFIG_RSS_KEY = 23,
 	VIRTCHNL_OP_CONFIG_RSS_LUT = 24,
-	VIRTCHNL_OP_GET_RSS_HENA_CAPS = 25,
-	VIRTCHNL_OP_SET_RSS_HENA = 26,
+	VIRTCHNL_OP_GET_RSS_HASHCFG_CAPS = 25,
+	VIRTCHNL_OP_SET_RSS_HASHCFG = 26,
 	VIRTCHNL_OP_ENABLE_VLAN_STRIPPING = 27,
 	VIRTCHNL_OP_DISABLE_VLAN_STRIPPING = 28,
 	VIRTCHNL_OP_REQUEST_QUEUES = 29,
@@ -974,18 +974,18 @@ struct virtchnl_rss_lut {
 VIRTCHNL_CHECK_STRUCT_LEN(4, virtchnl_rss_lut);
 #define virtchnl_rss_lut_LEGACY_SIZEOF	6
 
-/* VIRTCHNL_OP_GET_RSS_HENA_CAPS
- * VIRTCHNL_OP_SET_RSS_HENA
- * VF sends these messages to get and set the hash filter enable bits for RSS.
+/* VIRTCHNL_OP_GET_RSS_HASHCFG_CAPS
+ * VIRTCHNL_OP_SET_RSS_HASHCFG
+ * VF sends these messages to get and set the hash filter configuration for RSS.
  * By default, the PF sets these to all possible traffic types that the
  * hardware supports. The VF can query this value if it wants to change the
  * traffic types that are hashed by the hardware.
  */
-struct virtchnl_rss_hena {
-	u64 hena;
+struct virtchnl_rss_hashcfg {
+	u64 hashcfg;
 };
 
-VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_rss_hena);
+VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_rss_hashcfg);
 
 /* Type of RSS algorithm */
 enum virtchnl_rss_algorithm {
@@ -1779,10 +1779,10 @@ virtchnl_vc_validate_vf_msg(struct virtchnl_version_info *ver, u32 v_opcode,
 	case VIRTCHNL_OP_CONFIG_RSS_HFUNC:
 		valid_len = sizeof(struct virtchnl_rss_hfunc);
 		break;
-	case VIRTCHNL_OP_GET_RSS_HENA_CAPS:
+	case VIRTCHNL_OP_GET_RSS_HASHCFG_CAPS:
 		break;
-	case VIRTCHNL_OP_SET_RSS_HENA:
-		valid_len = sizeof(struct virtchnl_rss_hena);
+	case VIRTCHNL_OP_SET_RSS_HASHCFG:
+		valid_len = sizeof(struct virtchnl_rss_hashcfg);
 		break;
 	case VIRTCHNL_OP_ENABLE_VLAN_STRIPPING:
 	case VIRTCHNL_OP_DISABLE_VLAN_STRIPPING:
-- 
cgit v1.2.3


From 141d0c9037ca57dac2d2c4e5d3c21521aa70ff12 Mon Sep 17 00:00:00 2001
From: Jacob Keller <jacob.e.keller@intel.com>
Date: Mon, 5 May 2025 13:14:23 -0700
Subject: net: intel: move RSS packet classifier types to libie

The Intel i40e, iavf, and ice drivers all include a definition of the
packet classifier filter types used to program RSS hash enable bits. For
i40e, these bits are used for both the PF and VF to configure the PFQF_HENA
and VFQF_HENA registers.

For ice and iAVF, these bits are used to communicate the desired hash
enable filter over virtchnl via its struct virtchnl_rss_hashcfg. The
virtchnl.h header makes no mention of where the bit definitions reside.

Maintaining a separate copy of these bits across three drivers is
cumbersome. Move the definition to libie as a new pctype.h header file.
Each driver can include this, and drop its own definition.

The ice implementation also defined an ICE_AVF_FLOW_FIELD_INVALID, intending
to use this to indicate when there were no hash enable bits set. This is
confusing, since the enumeration is using bit positions. A value of 0
*should* indicate the first bit. Instead, rewrite the code that uses
ICE_AVF_FLOW_FIELD_INVALID to just check if the avf_hash is zero. From
context it should be clear that we're checking if none of the bits are
set.

The values are kept as bit positions instead of encoding the BIT_ULL
directly into their value. While most users will simply use BIT_ULL
immediately, i40e uses the macros both with BIT_ULL and test_bit/set_bit
calls.

Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Rafal Romanowski <rafal.romanowski@intel.com>
Tested-by: Rinitha S <sx.rinitha@intel.com> (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 81 +++++++++++++-------------
 drivers/net/ethernet/intel/i40e/i40e_main.c    | 23 ++++----
 drivers/net/ethernet/intel/i40e/i40e_txrx.c    | 25 ++++----
 drivers/net/ethernet/intel/i40e/i40e_txrx.h    | 35 +++++------
 drivers/net/ethernet/intel/i40e/i40e_type.h    | 32 ----------
 drivers/net/ethernet/intel/iavf/iavf_txrx.h    | 36 ++++++------
 drivers/net/ethernet/intel/iavf/iavf_type.h    | 32 ----------
 drivers/net/ethernet/intel/ice/ice_flow.c      | 45 +++++++-------
 drivers/net/ethernet/intel/ice/ice_flow.h      | 64 ++++++--------------
 include/linux/avf/virtchnl.h                   |  1 +
 include/linux/net/intel/libie/pctype.h         | 41 +++++++++++++
 11 files changed, 185 insertions(+), 230 deletions(-)
 create mode 100644 include/linux/net/intel/libie/pctype.h

(limited to 'include')

diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 8a7a83f83ee5..814e20325feb 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -3,6 +3,7 @@
 
 /* ethtool support for i40e */
 
+#include <linux/net/intel/libie/pctype.h>
 #include "i40e_devids.h"
 #include "i40e_diag.h"
 #include "i40e_txrx_common.h"
@@ -3146,16 +3147,16 @@ static int i40e_get_rss_hash_opts(struct i40e_pf *pf, struct ethtool_rxnfc *cmd)
 
 	switch (cmd->flow_type) {
 	case TCP_V4_FLOW:
-		flow_pctype = I40E_FILTER_PCTYPE_NONF_IPV4_TCP;
+		flow_pctype = LIBIE_FILTER_PCTYPE_NONF_IPV4_TCP;
 		break;
 	case UDP_V4_FLOW:
-		flow_pctype = I40E_FILTER_PCTYPE_NONF_IPV4_UDP;
+		flow_pctype = LIBIE_FILTER_PCTYPE_NONF_IPV4_UDP;
 		break;
 	case TCP_V6_FLOW:
-		flow_pctype = I40E_FILTER_PCTYPE_NONF_IPV6_TCP;
+		flow_pctype = LIBIE_FILTER_PCTYPE_NONF_IPV6_TCP;
 		break;
 	case UDP_V6_FLOW:
-		flow_pctype = I40E_FILTER_PCTYPE_NONF_IPV6_UDP;
+		flow_pctype = LIBIE_FILTER_PCTYPE_NONF_IPV6_UDP;
 		break;
 	case SCTP_V4_FLOW:
 	case AH_ESP_V4_FLOW:
@@ -3412,28 +3413,28 @@ static int i40e_get_ethtool_fdir_entry(struct i40e_pf *pf,
 
 	switch (rule->flow_type) {
 	case SCTP_V4_FLOW:
-		index = I40E_FILTER_PCTYPE_NONF_IPV4_SCTP;
+		index = LIBIE_FILTER_PCTYPE_NONF_IPV4_SCTP;
 		break;
 	case TCP_V4_FLOW:
-		index = I40E_FILTER_PCTYPE_NONF_IPV4_TCP;
+		index = LIBIE_FILTER_PCTYPE_NONF_IPV4_TCP;
 		break;
 	case UDP_V4_FLOW:
-		index = I40E_FILTER_PCTYPE_NONF_IPV4_UDP;
+		index = LIBIE_FILTER_PCTYPE_NONF_IPV4_UDP;
 		break;
 	case SCTP_V6_FLOW:
-		index = I40E_FILTER_PCTYPE_NONF_IPV6_SCTP;
+		index = LIBIE_FILTER_PCTYPE_NONF_IPV6_SCTP;
 		break;
 	case TCP_V6_FLOW:
-		index = I40E_FILTER_PCTYPE_NONF_IPV6_TCP;
+		index = LIBIE_FILTER_PCTYPE_NONF_IPV6_TCP;
 		break;
 	case UDP_V6_FLOW:
-		index = I40E_FILTER_PCTYPE_NONF_IPV6_UDP;
+		index = LIBIE_FILTER_PCTYPE_NONF_IPV6_UDP;
 		break;
 	case IP_USER_FLOW:
-		index = I40E_FILTER_PCTYPE_NONF_IPV4_OTHER;
+		index = LIBIE_FILTER_PCTYPE_NONF_IPV4_OTHER;
 		break;
 	case IPV6_USER_FLOW:
-		index = I40E_FILTER_PCTYPE_NONF_IPV6_OTHER;
+		index = LIBIE_FILTER_PCTYPE_NONF_IPV6_OTHER;
 		break;
 	default:
 		/* If we have stored a filter with a flow type not listed here
@@ -3643,40 +3644,40 @@ static int i40e_set_rss_hash_opt(struct i40e_pf *pf, struct ethtool_rxnfc *nfc)
 
 	switch (nfc->flow_type) {
 	case TCP_V4_FLOW:
-		set_bit(I40E_FILTER_PCTYPE_NONF_IPV4_TCP, flow_pctypes);
+		set_bit(LIBIE_FILTER_PCTYPE_NONF_IPV4_TCP, flow_pctypes);
 		if (test_bit(I40E_HW_CAP_MULTI_TCP_UDP_RSS_PCTYPE,
 			     pf->hw.caps))
-			set_bit(I40E_FILTER_PCTYPE_NONF_IPV4_TCP_SYN_NO_ACK,
+			set_bit(LIBIE_FILTER_PCTYPE_NONF_IPV4_TCP_SYN_NO_ACK,
 				flow_pctypes);
 		break;
 	case TCP_V6_FLOW:
-		set_bit(I40E_FILTER_PCTYPE_NONF_IPV6_TCP, flow_pctypes);
+		set_bit(LIBIE_FILTER_PCTYPE_NONF_IPV6_TCP, flow_pctypes);
 		if (test_bit(I40E_HW_CAP_MULTI_TCP_UDP_RSS_PCTYPE,
 			     pf->hw.caps))
-			set_bit(I40E_FILTER_PCTYPE_NONF_IPV6_TCP_SYN_NO_ACK,
+			set_bit(LIBIE_FILTER_PCTYPE_NONF_IPV6_TCP_SYN_NO_ACK,
 				flow_pctypes);
 		break;
 	case UDP_V4_FLOW:
-		set_bit(I40E_FILTER_PCTYPE_NONF_IPV4_UDP, flow_pctypes);
+		set_bit(LIBIE_FILTER_PCTYPE_NONF_IPV4_UDP, flow_pctypes);
 		if (test_bit(I40E_HW_CAP_MULTI_TCP_UDP_RSS_PCTYPE,
 			     pf->hw.caps)) {
-			set_bit(I40E_FILTER_PCTYPE_NONF_UNICAST_IPV4_UDP,
+			set_bit(LIBIE_FILTER_PCTYPE_NONF_UNICAST_IPV4_UDP,
 				flow_pctypes);
-			set_bit(I40E_FILTER_PCTYPE_NONF_MULTICAST_IPV4_UDP,
+			set_bit(LIBIE_FILTER_PCTYPE_NONF_MULTICAST_IPV4_UDP,
 				flow_pctypes);
 		}
-		hena |= BIT_ULL(I40E_FILTER_PCTYPE_FRAG_IPV4);
+		hena |= BIT_ULL(LIBIE_FILTER_PCTYPE_FRAG_IPV4);
 		break;
 	case UDP_V6_FLOW:
-		set_bit(I40E_FILTER_PCTYPE_NONF_IPV6_UDP, flow_pctypes);
+		set_bit(LIBIE_FILTER_PCTYPE_NONF_IPV6_UDP, flow_pctypes);
 		if (test_bit(I40E_HW_CAP_MULTI_TCP_UDP_RSS_PCTYPE,
 			     pf->hw.caps)) {
-			set_bit(I40E_FILTER_PCTYPE_NONF_UNICAST_IPV6_UDP,
+			set_bit(LIBIE_FILTER_PCTYPE_NONF_UNICAST_IPV6_UDP,
 				flow_pctypes);
-			set_bit(I40E_FILTER_PCTYPE_NONF_MULTICAST_IPV6_UDP,
+			set_bit(LIBIE_FILTER_PCTYPE_NONF_MULTICAST_IPV6_UDP,
 				flow_pctypes);
 		}
-		hena |= BIT_ULL(I40E_FILTER_PCTYPE_FRAG_IPV6);
+		hena |= BIT_ULL(LIBIE_FILTER_PCTYPE_FRAG_IPV6);
 		break;
 	case AH_ESP_V4_FLOW:
 	case AH_V4_FLOW:
@@ -3685,7 +3686,7 @@ static int i40e_set_rss_hash_opt(struct i40e_pf *pf, struct ethtool_rxnfc *nfc)
 		if ((nfc->data & RXH_L4_B_0_1) ||
 		    (nfc->data & RXH_L4_B_2_3))
 			return -EINVAL;
-		hena |= BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV4_OTHER);
+		hena |= BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV4_OTHER);
 		break;
 	case AH_ESP_V6_FLOW:
 	case AH_V6_FLOW:
@@ -3694,15 +3695,15 @@ static int i40e_set_rss_hash_opt(struct i40e_pf *pf, struct ethtool_rxnfc *nfc)
 		if ((nfc->data & RXH_L4_B_0_1) ||
 		    (nfc->data & RXH_L4_B_2_3))
 			return -EINVAL;
-		hena |= BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV6_OTHER);
+		hena |= BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV6_OTHER);
 		break;
 	case IPV4_FLOW:
-		hena |= BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV4_OTHER) |
-			BIT_ULL(I40E_FILTER_PCTYPE_FRAG_IPV4);
+		hena |= BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV4_OTHER) |
+			BIT_ULL(LIBIE_FILTER_PCTYPE_FRAG_IPV4);
 		break;
 	case IPV6_FLOW:
-		hena |= BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV6_OTHER) |
-			BIT_ULL(I40E_FILTER_PCTYPE_FRAG_IPV6);
+		hena |= BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV6_OTHER) |
+			BIT_ULL(LIBIE_FILTER_PCTYPE_FRAG_IPV6);
 		break;
 	default:
 		return -EINVAL;
@@ -4312,36 +4313,36 @@ static int i40e_check_fdir_input_set(struct i40e_vsi *vsi,
 
 	switch (fsp->flow_type & ~FLOW_EXT) {
 	case SCTP_V4_FLOW:
-		index = I40E_FILTER_PCTYPE_NONF_IPV4_SCTP;
+		index = LIBIE_FILTER_PCTYPE_NONF_IPV4_SCTP;
 		fdir_filter_count = &pf->fd_sctp4_filter_cnt;
 		break;
 	case TCP_V4_FLOW:
-		index = I40E_FILTER_PCTYPE_NONF_IPV4_TCP;
+		index = LIBIE_FILTER_PCTYPE_NONF_IPV4_TCP;
 		fdir_filter_count = &pf->fd_tcp4_filter_cnt;
 		break;
 	case UDP_V4_FLOW:
-		index = I40E_FILTER_PCTYPE_NONF_IPV4_UDP;
+		index = LIBIE_FILTER_PCTYPE_NONF_IPV4_UDP;
 		fdir_filter_count = &pf->fd_udp4_filter_cnt;
 		break;
 	case SCTP_V6_FLOW:
-		index = I40E_FILTER_PCTYPE_NONF_IPV6_SCTP;
+		index = LIBIE_FILTER_PCTYPE_NONF_IPV6_SCTP;
 		fdir_filter_count = &pf->fd_sctp6_filter_cnt;
 		break;
 	case TCP_V6_FLOW:
-		index = I40E_FILTER_PCTYPE_NONF_IPV6_TCP;
+		index = LIBIE_FILTER_PCTYPE_NONF_IPV6_TCP;
 		fdir_filter_count = &pf->fd_tcp6_filter_cnt;
 		break;
 	case UDP_V6_FLOW:
-		index = I40E_FILTER_PCTYPE_NONF_IPV6_UDP;
+		index = LIBIE_FILTER_PCTYPE_NONF_IPV6_UDP;
 		fdir_filter_count = &pf->fd_udp6_filter_cnt;
 		break;
 	case IP_USER_FLOW:
-		index = I40E_FILTER_PCTYPE_NONF_IPV4_OTHER;
+		index = LIBIE_FILTER_PCTYPE_NONF_IPV4_OTHER;
 		fdir_filter_count = &pf->fd_ip4_filter_cnt;
 		flex_l3 = true;
 		break;
 	case IPV6_USER_FLOW:
-		index = I40E_FILTER_PCTYPE_NONF_IPV6_OTHER;
+		index = LIBIE_FILTER_PCTYPE_NONF_IPV6_OTHER;
 		fdir_filter_count = &pf->fd_ip6_filter_cnt;
 		flex_l3 = true;
 		break;
@@ -4677,8 +4678,8 @@ static int i40e_check_fdir_input_set(struct i40e_vsi *vsi,
 	 * separate support, we'll always assume and enforce that the two flow
 	 * types must have matching input sets.
 	 */
-	if (index == I40E_FILTER_PCTYPE_NONF_IPV4_OTHER)
-		i40e_write_fd_input_set(pf, I40E_FILTER_PCTYPE_FRAG_IPV4,
+	if (index == LIBIE_FILTER_PCTYPE_NONF_IPV4_OTHER)
+		i40e_write_fd_input_set(pf, LIBIE_FILTER_PCTYPE_FRAG_IPV4,
 					new_mask);
 
 	/* Add the new offset and update table, if necessary */
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 516e07b58161..67faf5a8dcbf 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -3,6 +3,7 @@
 
 #include <generated/utsrelease.h>
 #include <linux/crash_dump.h>
+#include <linux/net/intel/libie/pctype.h>
 #include <linux/if_bridge.h>
 #include <linux/if_macvlan.h>
 #include <linux/module.h>
@@ -9188,47 +9189,47 @@ static void i40e_fdir_filter_exit(struct i40e_pf *pf)
 	i40e_reset_fdir_filter_cnt(pf);
 
 	/* Reprogram the default input set for TCP/IPv4 */
-	i40e_write_fd_input_set(pf, I40E_FILTER_PCTYPE_NONF_IPV4_TCP,
+	i40e_write_fd_input_set(pf, LIBIE_FILTER_PCTYPE_NONF_IPV4_TCP,
 				I40E_L3_SRC_MASK | I40E_L3_DST_MASK |
 				I40E_L4_SRC_MASK | I40E_L4_DST_MASK);
 
 	/* Reprogram the default input set for TCP/IPv6 */
-	i40e_write_fd_input_set(pf, I40E_FILTER_PCTYPE_NONF_IPV6_TCP,
+	i40e_write_fd_input_set(pf, LIBIE_FILTER_PCTYPE_NONF_IPV6_TCP,
 				I40E_L3_V6_SRC_MASK | I40E_L3_V6_DST_MASK |
 				I40E_L4_SRC_MASK | I40E_L4_DST_MASK);
 
 	/* Reprogram the default input set for UDP/IPv4 */
-	i40e_write_fd_input_set(pf, I40E_FILTER_PCTYPE_NONF_IPV4_UDP,
+	i40e_write_fd_input_set(pf, LIBIE_FILTER_PCTYPE_NONF_IPV4_UDP,
 				I40E_L3_SRC_MASK | I40E_L3_DST_MASK |
 				I40E_L4_SRC_MASK | I40E_L4_DST_MASK);
 
 	/* Reprogram the default input set for UDP/IPv6 */
-	i40e_write_fd_input_set(pf, I40E_FILTER_PCTYPE_NONF_IPV6_UDP,
+	i40e_write_fd_input_set(pf, LIBIE_FILTER_PCTYPE_NONF_IPV6_UDP,
 				I40E_L3_V6_SRC_MASK | I40E_L3_V6_DST_MASK |
 				I40E_L4_SRC_MASK | I40E_L4_DST_MASK);
 
 	/* Reprogram the default input set for SCTP/IPv4 */
-	i40e_write_fd_input_set(pf, I40E_FILTER_PCTYPE_NONF_IPV4_SCTP,
+	i40e_write_fd_input_set(pf, LIBIE_FILTER_PCTYPE_NONF_IPV4_SCTP,
 				I40E_L3_SRC_MASK | I40E_L3_DST_MASK |
 				I40E_L4_SRC_MASK | I40E_L4_DST_MASK);
 
 	/* Reprogram the default input set for SCTP/IPv6 */
-	i40e_write_fd_input_set(pf, I40E_FILTER_PCTYPE_NONF_IPV6_SCTP,
+	i40e_write_fd_input_set(pf, LIBIE_FILTER_PCTYPE_NONF_IPV6_SCTP,
 				I40E_L3_V6_SRC_MASK | I40E_L3_V6_DST_MASK |
 				I40E_L4_SRC_MASK | I40E_L4_DST_MASK);
 
 	/* Reprogram the default input set for Other/IPv4 */
-	i40e_write_fd_input_set(pf, I40E_FILTER_PCTYPE_NONF_IPV4_OTHER,
+	i40e_write_fd_input_set(pf, LIBIE_FILTER_PCTYPE_NONF_IPV4_OTHER,
 				I40E_L3_SRC_MASK | I40E_L3_DST_MASK);
 
-	i40e_write_fd_input_set(pf, I40E_FILTER_PCTYPE_FRAG_IPV4,
+	i40e_write_fd_input_set(pf, LIBIE_FILTER_PCTYPE_FRAG_IPV4,
 				I40E_L3_SRC_MASK | I40E_L3_DST_MASK);
 
 	/* Reprogram the default input set for Other/IPv6 */
-	i40e_write_fd_input_set(pf, I40E_FILTER_PCTYPE_NONF_IPV6_OTHER,
+	i40e_write_fd_input_set(pf, LIBIE_FILTER_PCTYPE_NONF_IPV6_OTHER,
 				I40E_L3_SRC_MASK | I40E_L3_DST_MASK);
 
-	i40e_write_fd_input_set(pf, I40E_FILTER_PCTYPE_FRAG_IPV6,
+	i40e_write_fd_input_set(pf, LIBIE_FILTER_PCTYPE_FRAG_IPV6,
 				I40E_L3_SRC_MASK | I40E_L3_DST_MASK);
 }
 
@@ -9656,7 +9657,7 @@ static void i40e_reenable_fdir_atr(struct i40e_pf *pf)
 		 * settings. It is safe to restore the default input set
 		 * because there are no active TCPv4 filter rules.
 		 */
-		i40e_write_fd_input_set(pf, I40E_FILTER_PCTYPE_NONF_IPV4_TCP,
+		i40e_write_fd_input_set(pf, LIBIE_FILTER_PCTYPE_NONF_IPV4_TCP,
 					I40E_L3_SRC_MASK | I40E_L3_DST_MASK |
 					I40E_L4_SRC_MASK | I40E_L4_DST_MASK);
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index c006f716a3bd..048c33039130 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2,6 +2,7 @@
 /* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #include <linux/bpf_trace.h>
+#include <linux/net/intel/libie/pctype.h>
 #include <linux/net/intel/libie/rx.h>
 #include <linux/prefetch.h>
 #include <linux/sctp.h>
@@ -397,12 +398,12 @@ static int i40e_add_del_fdir_udp(struct i40e_vsi *vsi,
 		ret = i40e_prepare_fdir_filter
 			(pf, fd_data, add, raw_packet,
 			 I40E_UDPIP_DUMMY_PACKET_LEN,
-			 I40E_FILTER_PCTYPE_NONF_IPV4_UDP);
+			 LIBIE_FILTER_PCTYPE_NONF_IPV4_UDP);
 	else
 		ret = i40e_prepare_fdir_filter
 			(pf, fd_data, add, raw_packet,
 			 I40E_UDPIP6_DUMMY_PACKET_LEN,
-			 I40E_FILTER_PCTYPE_NONF_IPV6_UDP);
+			 LIBIE_FILTER_PCTYPE_NONF_IPV6_UDP);
 
 	if (ret) {
 		kfree(raw_packet);
@@ -444,12 +445,12 @@ static int i40e_add_del_fdir_tcp(struct i40e_vsi *vsi,
 		ret = i40e_prepare_fdir_filter
 			(pf, fd_data, add, raw_packet,
 			 I40E_TCPIP_DUMMY_PACKET_LEN,
-			 I40E_FILTER_PCTYPE_NONF_IPV4_TCP);
+			 LIBIE_FILTER_PCTYPE_NONF_IPV4_TCP);
 	else
 		ret = i40e_prepare_fdir_filter
 			(pf, fd_data, add, raw_packet,
 			 I40E_TCPIP6_DUMMY_PACKET_LEN,
-			 I40E_FILTER_PCTYPE_NONF_IPV6_TCP);
+			 LIBIE_FILTER_PCTYPE_NONF_IPV6_TCP);
 
 	if (ret) {
 		kfree(raw_packet);
@@ -499,12 +500,12 @@ static int i40e_add_del_fdir_sctp(struct i40e_vsi *vsi,
 		ret = i40e_prepare_fdir_filter
 			(pf, fd_data, add, raw_packet,
 			 I40E_SCTPIP_DUMMY_PACKET_LEN,
-			 I40E_FILTER_PCTYPE_NONF_IPV4_SCTP);
+			 LIBIE_FILTER_PCTYPE_NONF_IPV4_SCTP);
 	else
 		ret = i40e_prepare_fdir_filter
 			(pf, fd_data, add, raw_packet,
 			 I40E_SCTPIP6_DUMMY_PACKET_LEN,
-			 I40E_FILTER_PCTYPE_NONF_IPV6_SCTP);
+			 LIBIE_FILTER_PCTYPE_NONF_IPV6_SCTP);
 
 	if (ret) {
 		kfree(raw_packet);
@@ -543,11 +544,11 @@ static int i40e_add_del_fdir_ip(struct i40e_vsi *vsi,
 	int i;
 
 	if (ipv4) {
-		iter_start = I40E_FILTER_PCTYPE_NONF_IPV4_OTHER;
-		iter_end = I40E_FILTER_PCTYPE_FRAG_IPV4;
+		iter_start = LIBIE_FILTER_PCTYPE_NONF_IPV4_OTHER;
+		iter_end = LIBIE_FILTER_PCTYPE_FRAG_IPV4;
 	} else {
-		iter_start = I40E_FILTER_PCTYPE_NONF_IPV6_OTHER;
-		iter_end = I40E_FILTER_PCTYPE_FRAG_IPV6;
+		iter_start = LIBIE_FILTER_PCTYPE_NONF_IPV6_OTHER;
+		iter_end = LIBIE_FILTER_PCTYPE_FRAG_IPV6;
 	}
 
 	for (i = iter_start; i <= iter_end; i++) {
@@ -2948,9 +2949,9 @@ static void i40e_atr(struct i40e_ring *tx_ring, struct sk_buff *skb,
 	flex_ptype = FIELD_PREP(I40E_TXD_FLTR_QW0_QINDEX_MASK,
 				tx_ring->queue_index);
 	flex_ptype |= (tx_flags & I40E_TX_FLAGS_IPV4) ?
-		      (I40E_FILTER_PCTYPE_NONF_IPV4_TCP <<
+		      (LIBIE_FILTER_PCTYPE_NONF_IPV4_TCP <<
 		       I40E_TXD_FLTR_QW0_PCTYPE_SHIFT) :
-		      (I40E_FILTER_PCTYPE_NONF_IPV6_TCP <<
+		      (LIBIE_FILTER_PCTYPE_NONF_IPV6_TCP <<
 		       I40E_TXD_FLTR_QW0_PCTYPE_SHIFT);
 
 	flex_ptype |= tx_ring->vsi->id << I40E_TXD_FLTR_QW0_DEST_VSI_SHIFT;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
index b007a84268a7..1e5fd63d47f4 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
@@ -4,6 +4,7 @@
 #ifndef _I40E_TXRX_H_
 #define _I40E_TXRX_H_
 
+#include <linux/net/intel/libie/pctype.h>
 #include <net/xdp.h>
 #include "i40e_type.h"
 
@@ -72,25 +73,25 @@ enum i40e_dyn_idx {
 
 /* Supported RSS offloads */
 #define I40E_DEFAULT_RSS_HASHCFG ( \
-	BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV4_UDP) | \
-	BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV4_SCTP) | \
-	BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV4_TCP) | \
-	BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV4_OTHER) | \
-	BIT_ULL(I40E_FILTER_PCTYPE_FRAG_IPV4) | \
-	BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV6_UDP) | \
-	BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV6_TCP) | \
-	BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV6_SCTP) | \
-	BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV6_OTHER) | \
-	BIT_ULL(I40E_FILTER_PCTYPE_FRAG_IPV6) | \
-	BIT_ULL(I40E_FILTER_PCTYPE_L2_PAYLOAD))
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV4_UDP) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV4_SCTP) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV4_TCP) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV4_OTHER) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_FRAG_IPV4) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV6_UDP) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV6_TCP) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV6_SCTP) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV6_OTHER) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_FRAG_IPV6) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_L2_PAYLOAD))
 
 #define I40E_DEFAULT_RSS_HASHCFG_EXPANDED (I40E_DEFAULT_RSS_HASHCFG | \
-	BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV4_TCP_SYN_NO_ACK) | \
-	BIT_ULL(I40E_FILTER_PCTYPE_NONF_UNICAST_IPV4_UDP) | \
-	BIT_ULL(I40E_FILTER_PCTYPE_NONF_MULTICAST_IPV4_UDP) | \
-	BIT_ULL(I40E_FILTER_PCTYPE_NONF_IPV6_TCP_SYN_NO_ACK) | \
-	BIT_ULL(I40E_FILTER_PCTYPE_NONF_UNICAST_IPV6_UDP) | \
-	BIT_ULL(I40E_FILTER_PCTYPE_NONF_MULTICAST_IPV6_UDP))
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV4_TCP_SYN_NO_ACK) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_UNICAST_IPV4_UDP) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_MULTICAST_IPV4_UDP) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV6_TCP_SYN_NO_ACK) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_UNICAST_IPV6_UDP) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_MULTICAST_IPV6_UDP))
 
 #define i40e_pf_get_default_rss_hashcfg(pf) \
 	(test_bit(I40E_HW_CAP_MULTI_TCP_UDP_RSS_PCTYPE, (pf)->hw.caps) ? \
diff --git a/drivers/net/ethernet/intel/i40e/i40e_type.h b/drivers/net/ethernet/intel/i40e/i40e_type.h
index 28568e126850..a09ed83835ff 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_type.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_type.h
@@ -929,38 +929,6 @@ struct i40e_filter_program_desc {
 #define I40E_TXD_FLTR_QW0_PCTYPE_MASK	(0x3FUL << \
 					 I40E_TXD_FLTR_QW0_PCTYPE_SHIFT)
 
-/* Packet Classifier Types for filters */
-enum i40e_filter_pctype {
-	/* Note: Values 0-28 are reserved for future use.
-	 * Value 29, 30, 32 are not supported on XL710 and X710.
-	 */
-	I40E_FILTER_PCTYPE_NONF_UNICAST_IPV4_UDP	= 29,
-	I40E_FILTER_PCTYPE_NONF_MULTICAST_IPV4_UDP	= 30,
-	I40E_FILTER_PCTYPE_NONF_IPV4_UDP		= 31,
-	I40E_FILTER_PCTYPE_NONF_IPV4_TCP_SYN_NO_ACK	= 32,
-	I40E_FILTER_PCTYPE_NONF_IPV4_TCP		= 33,
-	I40E_FILTER_PCTYPE_NONF_IPV4_SCTP		= 34,
-	I40E_FILTER_PCTYPE_NONF_IPV4_OTHER		= 35,
-	I40E_FILTER_PCTYPE_FRAG_IPV4			= 36,
-	/* Note: Values 37-38 are reserved for future use.
-	 * Value 39, 40, 42 are not supported on XL710 and X710.
-	 */
-	I40E_FILTER_PCTYPE_NONF_UNICAST_IPV6_UDP	= 39,
-	I40E_FILTER_PCTYPE_NONF_MULTICAST_IPV6_UDP	= 40,
-	I40E_FILTER_PCTYPE_NONF_IPV6_UDP		= 41,
-	I40E_FILTER_PCTYPE_NONF_IPV6_TCP_SYN_NO_ACK	= 42,
-	I40E_FILTER_PCTYPE_NONF_IPV6_TCP		= 43,
-	I40E_FILTER_PCTYPE_NONF_IPV6_SCTP		= 44,
-	I40E_FILTER_PCTYPE_NONF_IPV6_OTHER		= 45,
-	I40E_FILTER_PCTYPE_FRAG_IPV6			= 46,
-	/* Note: Value 47 is reserved for future use */
-	I40E_FILTER_PCTYPE_FCOE_OX			= 48,
-	I40E_FILTER_PCTYPE_FCOE_RX			= 49,
-	I40E_FILTER_PCTYPE_FCOE_OTHER			= 50,
-	/* Note: Values 51-62 are reserved for future use */
-	I40E_FILTER_PCTYPE_L2_PAYLOAD			= 63,
-};
-
 enum i40e_filter_program_desc_dest {
 	I40E_FILTER_PROGRAM_DESC_DEST_DROP_PACKET		= 0x0,
 	I40E_FILTER_PROGRAM_DESC_DEST_DIRECT_PACKET_QINDEX	= 0x1,
diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.h b/drivers/net/ethernet/intel/iavf/iavf_txrx.h
index 94b324f212bd..df49b0b1d54a 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_txrx.h
+++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.h
@@ -4,6 +4,8 @@
 #ifndef _IAVF_TXRX_H_
 #define _IAVF_TXRX_H_
 
+#include <linux/net/intel/libie/pctype.h>
+
 /* Interrupt Throttling and Rate Limiting Goodies */
 #define IAVF_DEFAULT_IRQ_WORK      256
 
@@ -60,25 +62,25 @@ enum iavf_dyn_idx_t {
 
 /* Supported RSS offloads */
 #define IAVF_DEFAULT_RSS_HASHCFG ( \
-	BIT_ULL(IAVF_FILTER_PCTYPE_NONF_IPV4_UDP) | \
-	BIT_ULL(IAVF_FILTER_PCTYPE_NONF_IPV4_SCTP) | \
-	BIT_ULL(IAVF_FILTER_PCTYPE_NONF_IPV4_TCP) | \
-	BIT_ULL(IAVF_FILTER_PCTYPE_NONF_IPV4_OTHER) | \
-	BIT_ULL(IAVF_FILTER_PCTYPE_FRAG_IPV4) | \
-	BIT_ULL(IAVF_FILTER_PCTYPE_NONF_IPV6_UDP) | \
-	BIT_ULL(IAVF_FILTER_PCTYPE_NONF_IPV6_TCP) | \
-	BIT_ULL(IAVF_FILTER_PCTYPE_NONF_IPV6_SCTP) | \
-	BIT_ULL(IAVF_FILTER_PCTYPE_NONF_IPV6_OTHER) | \
-	BIT_ULL(IAVF_FILTER_PCTYPE_FRAG_IPV6) | \
-	BIT_ULL(IAVF_FILTER_PCTYPE_L2_PAYLOAD))
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV4_UDP) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV4_SCTP) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV4_TCP) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV4_OTHER) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_FRAG_IPV4) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV6_UDP) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV6_TCP) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV6_SCTP) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV6_OTHER) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_FRAG_IPV6) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_L2_PAYLOAD))
 
 #define IAVF_DEFAULT_RSS_HASHCFG_EXPANDED (IAVF_DEFAULT_RSS_HASHCFG | \
-	BIT_ULL(IAVF_FILTER_PCTYPE_NONF_IPV4_TCP_SYN_NO_ACK) | \
-	BIT_ULL(IAVF_FILTER_PCTYPE_NONF_UNICAST_IPV4_UDP) | \
-	BIT_ULL(IAVF_FILTER_PCTYPE_NONF_MULTICAST_IPV4_UDP) | \
-	BIT_ULL(IAVF_FILTER_PCTYPE_NONF_IPV6_TCP_SYN_NO_ACK) | \
-	BIT_ULL(IAVF_FILTER_PCTYPE_NONF_UNICAST_IPV6_UDP) | \
-	BIT_ULL(IAVF_FILTER_PCTYPE_NONF_MULTICAST_IPV6_UDP))
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV4_TCP_SYN_NO_ACK) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_UNICAST_IPV4_UDP) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_MULTICAST_IPV4_UDP) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV6_TCP_SYN_NO_ACK) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_UNICAST_IPV6_UDP) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_MULTICAST_IPV6_UDP))
 
 /* How many Rx Buffers do we bundle into one write to the hardware ? */
 #define IAVF_RX_INCREMENT(r, i) \
diff --git a/drivers/net/ethernet/intel/iavf/iavf_type.h b/drivers/net/ethernet/intel/iavf/iavf_type.h
index f9e1319620f4..cb12e86ba4a6 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_type.h
+++ b/drivers/net/ethernet/intel/iavf/iavf_type.h
@@ -463,38 +463,6 @@ enum iavf_tx_ctx_desc_cmd_bits {
 	IAVF_TX_CTX_DESC_SWPE		= 0x40
 };
 
-/* Packet Classifier Types for filters */
-enum iavf_filter_pctype {
-	/* Note: Values 0-28 are reserved for future use.
-	 * Value 29, 30, 32 are not supported on XL710 and X710.
-	 */
-	IAVF_FILTER_PCTYPE_NONF_UNICAST_IPV4_UDP	= 29,
-	IAVF_FILTER_PCTYPE_NONF_MULTICAST_IPV4_UDP	= 30,
-	IAVF_FILTER_PCTYPE_NONF_IPV4_UDP		= 31,
-	IAVF_FILTER_PCTYPE_NONF_IPV4_TCP_SYN_NO_ACK	= 32,
-	IAVF_FILTER_PCTYPE_NONF_IPV4_TCP		= 33,
-	IAVF_FILTER_PCTYPE_NONF_IPV4_SCTP		= 34,
-	IAVF_FILTER_PCTYPE_NONF_IPV4_OTHER		= 35,
-	IAVF_FILTER_PCTYPE_FRAG_IPV4			= 36,
-	/* Note: Values 37-38 are reserved for future use.
-	 * Value 39, 40, 42 are not supported on XL710 and X710.
-	 */
-	IAVF_FILTER_PCTYPE_NONF_UNICAST_IPV6_UDP	= 39,
-	IAVF_FILTER_PCTYPE_NONF_MULTICAST_IPV6_UDP	= 40,
-	IAVF_FILTER_PCTYPE_NONF_IPV6_UDP		= 41,
-	IAVF_FILTER_PCTYPE_NONF_IPV6_TCP_SYN_NO_ACK	= 42,
-	IAVF_FILTER_PCTYPE_NONF_IPV6_TCP		= 43,
-	IAVF_FILTER_PCTYPE_NONF_IPV6_SCTP		= 44,
-	IAVF_FILTER_PCTYPE_NONF_IPV6_OTHER		= 45,
-	IAVF_FILTER_PCTYPE_FRAG_IPV6			= 46,
-	/* Note: Value 47 is reserved for future use */
-	IAVF_FILTER_PCTYPE_FCOE_OX			= 48,
-	IAVF_FILTER_PCTYPE_FCOE_RX			= 49,
-	IAVF_FILTER_PCTYPE_FCOE_OTHER			= 50,
-	/* Note: Values 51-62 are reserved for future use */
-	IAVF_FILTER_PCTYPE_L2_PAYLOAD			= 63,
-};
-
 #define IAVF_TXD_CTX_QW1_TSO_LEN_SHIFT	30
 #define IAVF_TXD_CTX_QW1_TSO_LEN_MASK	(0x3FFFFULL << \
 					 IAVF_TXD_CTX_QW1_TSO_LEN_SHIFT)
diff --git a/drivers/net/ethernet/intel/ice/ice_flow.c b/drivers/net/ethernet/intel/ice/ice_flow.c
index d97b751052f2..278e57686274 100644
--- a/drivers/net/ethernet/intel/ice/ice_flow.c
+++ b/drivers/net/ethernet/intel/ice/ice_flow.c
@@ -2573,38 +2573,38 @@ ice_rem_rss_cfg(struct ice_hw *hw, u16 vsi_handle,
  * convert its values to their appropriate flow L3, L4 values.
  */
 #define ICE_FLOW_AVF_RSS_IPV4_MASKS \
-	(BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_OTHER) | \
-	 BIT_ULL(ICE_AVF_FLOW_FIELD_FRAG_IPV4))
+	(BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV4_OTHER) | \
+	 BIT_ULL(LIBIE_FILTER_PCTYPE_FRAG_IPV4))
 #define ICE_FLOW_AVF_RSS_TCP_IPV4_MASKS \
-	(BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_TCP_SYN_NO_ACK) | \
-	 BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_TCP))
+	(BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV4_TCP_SYN_NO_ACK) | \
+	 BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV4_TCP))
 #define ICE_FLOW_AVF_RSS_UDP_IPV4_MASKS \
-	(BIT_ULL(ICE_AVF_FLOW_FIELD_UNICAST_IPV4_UDP) | \
-	 BIT_ULL(ICE_AVF_FLOW_FIELD_MULTICAST_IPV4_UDP) | \
-	 BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_UDP))
+	(BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_UNICAST_IPV4_UDP) | \
+	 BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_MULTICAST_IPV4_UDP) | \
+	 BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV4_UDP))
 #define ICE_FLOW_AVF_RSS_ALL_IPV4_MASKS \
 	(ICE_FLOW_AVF_RSS_TCP_IPV4_MASKS | ICE_FLOW_AVF_RSS_UDP_IPV4_MASKS | \
-	 ICE_FLOW_AVF_RSS_IPV4_MASKS | BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_SCTP))
+	 ICE_FLOW_AVF_RSS_IPV4_MASKS | BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV4_SCTP))
 
 #define ICE_FLOW_AVF_RSS_IPV6_MASKS \
-	(BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_OTHER) | \
-	 BIT_ULL(ICE_AVF_FLOW_FIELD_FRAG_IPV6))
+	(BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV6_OTHER) | \
+	 BIT_ULL(LIBIE_FILTER_PCTYPE_FRAG_IPV6))
 #define ICE_FLOW_AVF_RSS_UDP_IPV6_MASKS \
-	(BIT_ULL(ICE_AVF_FLOW_FIELD_UNICAST_IPV6_UDP) | \
-	 BIT_ULL(ICE_AVF_FLOW_FIELD_MULTICAST_IPV6_UDP) | \
-	 BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_UDP))
+	(BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_UNICAST_IPV6_UDP) | \
+	 BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_MULTICAST_IPV6_UDP) | \
+	 BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV6_UDP))
 #define ICE_FLOW_AVF_RSS_TCP_IPV6_MASKS \
-	(BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_TCP_SYN_NO_ACK) | \
-	 BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_TCP))
+	(BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV6_TCP_SYN_NO_ACK) | \
+	 BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV6_TCP))
 #define ICE_FLOW_AVF_RSS_ALL_IPV6_MASKS \
 	(ICE_FLOW_AVF_RSS_TCP_IPV6_MASKS | ICE_FLOW_AVF_RSS_UDP_IPV6_MASKS | \
-	 ICE_FLOW_AVF_RSS_IPV6_MASKS | BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_SCTP))
+	 ICE_FLOW_AVF_RSS_IPV6_MASKS | BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV6_SCTP))
 
 /**
  * ice_add_avf_rss_cfg - add an RSS configuration for AVF driver
  * @hw: pointer to the hardware structure
  * @vsi: VF's VSI
- * @avf_hash: hash bit fields (ICE_AVF_FLOW_FIELD_*) to configure
+ * @avf_hash: hash bit fields (LIBIE_FILTER_PCTYPE_*) to configure
  *
  * This function will take the hash bitmap provided by the AVF driver via a
  * message, convert it to ICE-compatible values, and configure RSS flow
@@ -2621,8 +2621,7 @@ int ice_add_avf_rss_cfg(struct ice_hw *hw, struct ice_vsi *vsi, u64 avf_hash)
 		return -EINVAL;
 
 	vsi_handle = vsi->idx;
-	if (avf_hash == ICE_AVF_FLOW_FIELD_INVALID ||
-	    !ice_is_vsi_valid(hw, vsi_handle))
+	if (!avf_hash || !ice_is_vsi_valid(hw, vsi_handle))
 		return -EINVAL;
 
 	/* Make sure no unsupported bits are specified */
@@ -2658,11 +2657,11 @@ int ice_add_avf_rss_cfg(struct ice_hw *hw, struct ice_vsi *vsi, u64 avf_hash)
 					ICE_FLOW_HASH_UDP_PORT;
 				hash_flds &= ~ICE_FLOW_AVF_RSS_UDP_IPV4_MASKS;
 			} else if (hash_flds &
-				   BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_SCTP)) {
+				   BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV4_SCTP)) {
 				rss_hash = ICE_FLOW_HASH_IPV4 |
 					ICE_FLOW_HASH_SCTP_PORT;
 				hash_flds &=
-					~BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_SCTP);
+					~BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV4_SCTP);
 			}
 		} else if (hash_flds & ICE_FLOW_AVF_RSS_ALL_IPV6_MASKS) {
 			if (hash_flds & ICE_FLOW_AVF_RSS_IPV6_MASKS) {
@@ -2679,11 +2678,11 @@ int ice_add_avf_rss_cfg(struct ice_hw *hw, struct ice_vsi *vsi, u64 avf_hash)
 					ICE_FLOW_HASH_UDP_PORT;
 				hash_flds &= ~ICE_FLOW_AVF_RSS_UDP_IPV6_MASKS;
 			} else if (hash_flds &
-				   BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_SCTP)) {
+				   BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV6_SCTP)) {
 				rss_hash = ICE_FLOW_HASH_IPV6 |
 					ICE_FLOW_HASH_SCTP_PORT;
 				hash_flds &=
-					~BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_SCTP);
+					~BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV6_SCTP);
 			}
 		}
 
diff --git a/drivers/net/ethernet/intel/ice/ice_flow.h b/drivers/net/ethernet/intel/ice/ice_flow.h
index b1313fb61677..52f906d89eca 100644
--- a/drivers/net/ethernet/intel/ice/ice_flow.h
+++ b/drivers/net/ethernet/intel/ice/ice_flow.h
@@ -4,6 +4,8 @@
 #ifndef _ICE_FLOW_H_
 #define _ICE_FLOW_H_
 
+#include <linux/net/intel/libie/pctype.h>
+
 #include "ice_flex_type.h"
 #include "ice_parser.h"
 
@@ -264,57 +266,27 @@ enum ice_flow_field {
 #define ICE_FLOW_HASH_FLD_GTPU_DWN_TEID \
 	BIT_ULL(ICE_FLOW_FIELD_IDX_GTPU_DWN_TEID)
 
-/* Flow headers and fields for AVF support */
-enum ice_flow_avf_hdr_field {
-	/* Values 0 - 28 are reserved for future use */
-	ICE_AVF_FLOW_FIELD_INVALID		= 0,
-	ICE_AVF_FLOW_FIELD_UNICAST_IPV4_UDP	= 29,
-	ICE_AVF_FLOW_FIELD_MULTICAST_IPV4_UDP,
-	ICE_AVF_FLOW_FIELD_IPV4_UDP,
-	ICE_AVF_FLOW_FIELD_IPV4_TCP_SYN_NO_ACK,
-	ICE_AVF_FLOW_FIELD_IPV4_TCP,
-	ICE_AVF_FLOW_FIELD_IPV4_SCTP,
-	ICE_AVF_FLOW_FIELD_IPV4_OTHER,
-	ICE_AVF_FLOW_FIELD_FRAG_IPV4,
-	/* Values 37-38 are reserved */
-	ICE_AVF_FLOW_FIELD_UNICAST_IPV6_UDP	= 39,
-	ICE_AVF_FLOW_FIELD_MULTICAST_IPV6_UDP,
-	ICE_AVF_FLOW_FIELD_IPV6_UDP,
-	ICE_AVF_FLOW_FIELD_IPV6_TCP_SYN_NO_ACK,
-	ICE_AVF_FLOW_FIELD_IPV6_TCP,
-	ICE_AVF_FLOW_FIELD_IPV6_SCTP,
-	ICE_AVF_FLOW_FIELD_IPV6_OTHER,
-	ICE_AVF_FLOW_FIELD_FRAG_IPV6,
-	ICE_AVF_FLOW_FIELD_RSVD47,
-	ICE_AVF_FLOW_FIELD_FCOE_OX,
-	ICE_AVF_FLOW_FIELD_FCOE_RX,
-	ICE_AVF_FLOW_FIELD_FCOE_OTHER,
-	/* Values 51-62 are reserved */
-	ICE_AVF_FLOW_FIELD_L2_PAYLOAD		= 63,
-	ICE_AVF_FLOW_FIELD_MAX
-};
-
 /* Supported RSS offloads  This macro is defined to support
  * VIRTCHNL_OP_GET_RSS_HASHCFG_CAPS ops. PF driver sends the RSS hardware
  * capabilities to the caller of this ops.
  */
 #define ICE_DEFAULT_RSS_HASHCFG ( \
-	BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_UDP) | \
-	BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_SCTP) | \
-	BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_TCP) | \
-	BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_OTHER) | \
-	BIT_ULL(ICE_AVF_FLOW_FIELD_FRAG_IPV4) | \
-	BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_UDP) | \
-	BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_TCP) | \
-	BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_SCTP) | \
-	BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_OTHER) | \
-	BIT_ULL(ICE_AVF_FLOW_FIELD_FRAG_IPV6) | \
-	BIT_ULL(ICE_AVF_FLOW_FIELD_IPV4_TCP_SYN_NO_ACK) | \
-	BIT_ULL(ICE_AVF_FLOW_FIELD_UNICAST_IPV4_UDP) | \
-	BIT_ULL(ICE_AVF_FLOW_FIELD_MULTICAST_IPV4_UDP) | \
-	BIT_ULL(ICE_AVF_FLOW_FIELD_IPV6_TCP_SYN_NO_ACK) | \
-	BIT_ULL(ICE_AVF_FLOW_FIELD_UNICAST_IPV6_UDP) | \
-	BIT_ULL(ICE_AVF_FLOW_FIELD_MULTICAST_IPV6_UDP))
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV4_UDP) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV4_SCTP) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV4_TCP) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV4_OTHER) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_FRAG_IPV4) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV6_UDP) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV6_TCP) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV6_SCTP) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV6_OTHER) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_FRAG_IPV6) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV4_TCP_SYN_NO_ACK) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_UNICAST_IPV4_UDP) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_MULTICAST_IPV4_UDP) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_IPV6_TCP_SYN_NO_ACK) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_UNICAST_IPV6_UDP) | \
+	BIT_ULL(LIBIE_FILTER_PCTYPE_NONF_MULTICAST_IPV6_UDP))
 
 enum ice_rss_cfg_hdr_type {
 	ICE_RSS_OUTER_HEADERS, /* take outer headers as inputset. */
diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index 362d1cdc8cd8..5be1881abbb6 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -982,6 +982,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(4, virtchnl_rss_lut);
  * traffic types that are hashed by the hardware.
  */
 struct virtchnl_rss_hashcfg {
+	/* Bits defined by enum libie_filter_pctype */
 	u64 hashcfg;
 };
 
diff --git a/include/linux/net/intel/libie/pctype.h b/include/linux/net/intel/libie/pctype.h
new file mode 100644
index 000000000000..d783417fbf36
--- /dev/null
+++ b/include/linux/net/intel/libie/pctype.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (C) 2025 Intel Corporation */
+
+#ifndef __LIBIE_PCTYPE_H
+#define __LIBIE_PCTYPE_H
+
+/* Packet Classifier Type indexes, used to set the xxQF_HENA registers. Also
+ * communicated over the virtchnl API as part of struct virtchnl_rss_hashcfg.
+ */
+enum libie_filter_pctype {
+	/* Note: Values 0-28 are reserved for future use.
+	 * Values 29, 30 and 32 are not supported on XL710 and X710.
+	 */
+	LIBIE_FILTER_PCTYPE_NONF_UNICAST_IPV4_UDP	= 29,
+	LIBIE_FILTER_PCTYPE_NONF_MULTICAST_IPV4_UDP	= 30,
+	LIBIE_FILTER_PCTYPE_NONF_IPV4_UDP		= 31,
+	LIBIE_FILTER_PCTYPE_NONF_IPV4_TCP_SYN_NO_ACK	= 32,
+	LIBIE_FILTER_PCTYPE_NONF_IPV4_TCP		= 33,
+	LIBIE_FILTER_PCTYPE_NONF_IPV4_SCTP		= 34,
+	LIBIE_FILTER_PCTYPE_NONF_IPV4_OTHER		= 35,
+	LIBIE_FILTER_PCTYPE_FRAG_IPV4			= 36,
+	/* Note: Values 37-38 are reserved for future use.
+	 * Values 39, 40 and 42 are not supported on XL710 and X710.
+	 */
+	LIBIE_FILTER_PCTYPE_NONF_UNICAST_IPV6_UDP	= 39,
+	LIBIE_FILTER_PCTYPE_NONF_MULTICAST_IPV6_UDP	= 40,
+	LIBIE_FILTER_PCTYPE_NONF_IPV6_UDP		= 41,
+	LIBIE_FILTER_PCTYPE_NONF_IPV6_TCP_SYN_NO_ACK	= 42,
+	LIBIE_FILTER_PCTYPE_NONF_IPV6_TCP		= 43,
+	LIBIE_FILTER_PCTYPE_NONF_IPV6_SCTP		= 44,
+	LIBIE_FILTER_PCTYPE_NONF_IPV6_OTHER		= 45,
+	LIBIE_FILTER_PCTYPE_FRAG_IPV6			= 46,
+	/* Note: Value 47 is reserved for future use */
+	LIBIE_FILTER_PCTYPE_FCOE_OX			= 48,
+	LIBIE_FILTER_PCTYPE_FCOE_RX			= 49,
+	LIBIE_FILTER_PCTYPE_FCOE_OTHER			= 50,
+	/* Note: Values 51-62 are reserved for future use */
+	LIBIE_FILTER_PCTYPE_L2_PAYLOAD			= 63
+};
+
+#endif /* __LIBIE_PCTYPE_H */
-- 
cgit v1.2.3


From 7b746d584ab97c66a0aa8ef15da1e2aa8152e3fa Mon Sep 17 00:00:00 2001
From: Andrea della Porta <andrea.porta@suse.com>
Date: Thu, 29 May 2025 15:50:38 +0200
Subject: dt-bindings: clock: Add RaspberryPi RP1 clock bindings

Add device tree bindings for the clock generator found in the RP1
multi-function device, and the related entries in the MAINTAINERS file.

Signed-off-by: Andrea della Porta <andrea.porta@suse.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Link: https://lore.kernel.org/r/20250529135052.28398-1-andrea.porta@suse.com
Signed-off-by: Florian Fainelli <florian.fainelli@broadcom.com>
---
 .../bindings/clock/raspberrypi,rp1-clocks.yaml     | 58 ++++++++++++++++++++
 include/dt-bindings/clock/raspberrypi,rp1-clocks.h | 61 ++++++++++++++++++++++
 2 files changed, 119 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/clock/raspberrypi,rp1-clocks.yaml
 create mode 100644 include/dt-bindings/clock/raspberrypi,rp1-clocks.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/raspberrypi,rp1-clocks.yaml b/Documentation/devicetree/bindings/clock/raspberrypi,rp1-clocks.yaml
new file mode 100644
index 000000000000..cc4491f7ee5f
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/raspberrypi,rp1-clocks.yaml
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/raspberrypi,rp1-clocks.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: RaspberryPi RP1 clock generator
+
+maintainers:
+  - A. della Porta <andrea.porta@suse.com>
+
+description: |
+  The RP1 contains a clock generator designed as three PLLs (CORE, AUDIO,
+  VIDEO), and each PLL output can be programmed through dividers to generate
+  the clocks to drive the sub-peripherals embedded inside the chipset.
+
+  Link to datasheet:
+  https://datasheets.raspberrypi.com/rp1/rp1-peripherals.pdf
+
+properties:
+  compatible:
+    const: raspberrypi,rp1-clocks
+
+  reg:
+    maxItems: 1
+
+  '#clock-cells':
+    const: 1
+    description:
+      The available clocks are defined in
+      include/dt-bindings/clock/raspberrypi,rp1-clocks.h.
+
+  clocks:
+    maxItems: 1
+
+required:
+  - compatible
+  - reg
+  - '#clock-cells'
+  - clocks
+
+additionalProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/clock/raspberrypi,rp1-clocks.h>
+
+    rp1 {
+        #address-cells = <2>;
+        #size-cells = <2>;
+
+        clocks@c040018000 {
+            compatible = "raspberrypi,rp1-clocks";
+            reg = <0xc0 0x40018000 0x0 0x10038>;
+            #clock-cells = <1>;
+            clocks = <&clk_rp1_xosc>;
+        };
+    };
diff --git a/include/dt-bindings/clock/raspberrypi,rp1-clocks.h b/include/dt-bindings/clock/raspberrypi,rp1-clocks.h
new file mode 100644
index 000000000000..248efb895f35
--- /dev/null
+++ b/include/dt-bindings/clock/raspberrypi,rp1-clocks.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (C) 2021 Raspberry Pi Ltd.
+ */
+
+#ifndef __DT_BINDINGS_CLOCK_RASPBERRYPI_RP1
+#define __DT_BINDINGS_CLOCK_RASPBERRYPI_RP1
+
+#define RP1_PLL_SYS_CORE		0
+#define RP1_PLL_AUDIO_CORE		1
+#define RP1_PLL_VIDEO_CORE		2
+
+#define RP1_PLL_SYS			3
+#define RP1_PLL_AUDIO			4
+#define RP1_PLL_VIDEO			5
+
+#define RP1_PLL_SYS_PRI_PH		6
+#define RP1_PLL_SYS_SEC_PH		7
+#define RP1_PLL_AUDIO_PRI_PH		8
+
+#define RP1_PLL_SYS_SEC			9
+#define RP1_PLL_AUDIO_SEC		10
+#define RP1_PLL_VIDEO_SEC		11
+
+#define RP1_CLK_SYS			12
+#define RP1_CLK_SLOW_SYS		13
+#define RP1_CLK_DMA			14
+#define RP1_CLK_UART			15
+#define RP1_CLK_ETH			16
+#define RP1_CLK_PWM0			17
+#define RP1_CLK_PWM1			18
+#define RP1_CLK_AUDIO_IN		19
+#define RP1_CLK_AUDIO_OUT		20
+#define RP1_CLK_I2S			21
+#define RP1_CLK_MIPI0_CFG		22
+#define RP1_CLK_MIPI1_CFG		23
+#define RP1_CLK_PCIE_AUX		24
+#define RP1_CLK_USBH0_MICROFRAME	25
+#define RP1_CLK_USBH1_MICROFRAME	26
+#define RP1_CLK_USBH0_SUSPEND		27
+#define RP1_CLK_USBH1_SUSPEND		28
+#define RP1_CLK_ETH_TSU			29
+#define RP1_CLK_ADC			30
+#define RP1_CLK_SDIO_TIMER		31
+#define RP1_CLK_SDIO_ALT_SRC		32
+#define RP1_CLK_GP0			33
+#define RP1_CLK_GP1			34
+#define RP1_CLK_GP2			35
+#define RP1_CLK_GP3			36
+#define RP1_CLK_GP4			37
+#define RP1_CLK_GP5			38
+#define RP1_CLK_VEC			39
+#define RP1_CLK_DPI			40
+#define RP1_CLK_MIPI0_DPI		41
+#define RP1_CLK_MIPI1_DPI		42
+
+/* Extra PLL output channels - RP1B0 only */
+#define RP1_PLL_VIDEO_PRI_PH		43
+#define RP1_PLL_AUDIO_TERN		44
+
+#endif
-- 
cgit v1.2.3


From 9b8367b604c739947ec308874f087fe0eb80f412 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Fri, 6 Jun 2025 09:31:36 -0700
Subject: cgroup: Add bpf prog revisions to struct cgroup_bpf

One of the key items in the mprog API is the revision of a prog list. The
revision number is increased whenever the prog list changes, e.g., on
attach, detach or replace.

Add 'revisions' field to struct cgroup_bpf, representing revisions for
all cgroup related attachment types. The initial revision value is
set to 1, the same as kernel mprog implementations.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250606163136.2428732-1-yonghong.song@linux.dev
---
 include/linux/bpf-cgroup-defs.h | 1 +
 kernel/cgroup/cgroup.c          | 5 +++++
 2 files changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/linux/bpf-cgroup-defs.h b/include/linux/bpf-cgroup-defs.h
index 0985221d5478..c9e6b26abab6 100644
--- a/include/linux/bpf-cgroup-defs.h
+++ b/include/linux/bpf-cgroup-defs.h
@@ -63,6 +63,7 @@ struct cgroup_bpf {
 	 */
 	struct hlist_head progs[MAX_CGROUP_BPF_ATTACH_TYPE];
 	u8 flags[MAX_CGROUP_BPF_ATTACH_TYPE];
+	u64 revisions[MAX_CGROUP_BPF_ATTACH_TYPE];
 
 	/* list of cgroup shared storages */
 	struct list_head storages;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index a723b7dc6e4e..312c6a8b55bb 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2074,6 +2074,11 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	for_each_subsys(ss, ssid)
 		INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
 
+#ifdef CONFIG_CGROUP_BPF
+	for (int i = 0; i < ARRAY_SIZE(cgrp->bpf.revisions); i++)
+		cgrp->bpf.revisions[i] = 1;
+#endif
+
 	init_waitqueue_head(&cgrp->offline_waitq);
 	INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
 }
-- 
cgit v1.2.3


From 1209339844601ec1766f4ff430673fbcfe42bb51 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Fri, 6 Jun 2025 09:31:41 -0700
Subject: bpf: Implement mprog API on top of existing cgroup progs

Currently, cgroup progs are ordered by appending at attachment time.
This is not ideal. In some cases, users want a specific ordering at a
particular cgroup level. To address this, the existing mprog API seems
an ideal solution, as it supports the BPF_F_BEFORE and BPF_F_AFTER flags.

But there are a few obstacles to directly use kernel mprog interface.
Currently cgroup bpf progs already support prog attach/detach/replace
and link-based attach/detach/replace. For example, in struct
bpf_prog_array_item, the cgroup_storage field needs to be together
with bpf prog. But the mprog API struct bpf_mprog_fp only has bpf_prog
as the member, which makes it difficult to use kernel mprog interface.

In another case, the current cgroup prog detach tries to use the
same flag as in attach. This is different from mprog kernel interface
which uses flags passed from user space.

So to avoid modifying existing behavior, I made the following changes to
support mprog API for cgroup progs:
 - The support is for prog list at cgroup level. Cross-level prog list
   (a.k.a. effective prog list) is not supported.
 - Previously, BPF_F_PREORDER is supported only for prog attach, now
   BPF_F_PREORDER is also supported by link-based attach.
 - For attach, BPF_F_BEFORE/BPF_F_AFTER/BPF_F_ID/BPF_F_LINK is supported
   similar to kernel mprog but with different implementation.
 - For detach and replace, use the existing implementation.
 - For attach, detach and replace, the revision for a particular prog
   list, associated with a particular attach type, will be updated
   by increasing count by 1.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250606163141.2428937-1-yonghong.song@linux.dev
---
 include/uapi/linux/bpf.h       |   7 ++
 kernel/bpf/cgroup.c            | 182 ++++++++++++++++++++++++++++++++++++-----
 kernel/bpf/syscall.c           |  46 +++++++----
 tools/include/uapi/linux/bpf.h |   7 ++
 4 files changed, 205 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f1160ebbf526..25e9cf92ffaf 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1794,6 +1794,13 @@ union bpf_attr {
 				};
 				__u64		expected_revision;
 			} netkit;
+			struct {
+				union {
+					__u32	relative_fd;
+					__u32	relative_id;
+				};
+				__u64		expected_revision;
+			} cgroup;
 		};
 	} link_create;
 
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 9122c39870bf..ffbafbef5010 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -658,6 +658,116 @@ static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
 	return NULL;
 }
 
+static struct bpf_link *bpf_get_anchor_link(u32 flags, u32 id_or_fd)
+{
+	struct bpf_link *link = ERR_PTR(-EINVAL);
+
+	if (flags & BPF_F_ID)
+		link = bpf_link_by_id(id_or_fd);
+	else if (id_or_fd)
+		link = bpf_link_get_from_fd(id_or_fd);
+	return link;
+}
+
+static struct bpf_prog *bpf_get_anchor_prog(u32 flags, u32 id_or_fd)
+{
+	struct bpf_prog *prog = ERR_PTR(-EINVAL);
+
+	if (flags & BPF_F_ID)
+		prog = bpf_prog_by_id(id_or_fd);
+	else if (id_or_fd)
+		prog = bpf_prog_get(id_or_fd);
+	return prog;
+}
+
+static struct bpf_prog_list *get_prog_list(struct hlist_head *progs, struct bpf_prog *prog,
+					   struct bpf_cgroup_link *link, u32 flags, u32 id_or_fd)
+{
+	bool is_link = flags & BPF_F_LINK, is_id = flags & BPF_F_ID;
+	struct bpf_prog_list *pltmp, *pl = ERR_PTR(-EINVAL);
+	bool preorder = flags & BPF_F_PREORDER;
+	struct bpf_link *anchor_link = NULL;
+	struct bpf_prog *anchor_prog = NULL;
+	bool is_before, is_after;
+
+	is_before = flags & BPF_F_BEFORE;
+	is_after = flags & BPF_F_AFTER;
+	if (is_link || is_id || id_or_fd) {
+		/* flags must have either BPF_F_BEFORE or BPF_F_AFTER */
+		if (is_before == is_after)
+			return ERR_PTR(-EINVAL);
+		if ((is_link && !link) || (!is_link && !prog))
+			return ERR_PTR(-EINVAL);
+	} else if (!hlist_empty(progs)) {
+		/* flags cannot have both BPF_F_BEFORE and BPF_F_AFTER */
+		if (is_before && is_after)
+			return ERR_PTR(-EINVAL);
+	}
+
+	if (is_link) {
+		anchor_link = bpf_get_anchor_link(flags, id_or_fd);
+		if (IS_ERR(anchor_link))
+			return ERR_PTR(PTR_ERR(anchor_link));
+	} else if (is_id || id_or_fd) {
+		anchor_prog = bpf_get_anchor_prog(flags, id_or_fd);
+		if (IS_ERR(anchor_prog))
+			return ERR_PTR(PTR_ERR(anchor_prog));
+	}
+
+	if (!anchor_prog && !anchor_link) {
+		/* if there is no anchor_prog/anchor_link, then BPF_F_PREORDER
+		 * doesn't matter since either prepend or append to a combined
+		 * list of progs will end up with correct result.
+		 */
+		hlist_for_each_entry(pltmp, progs, node) {
+			if (is_before)
+				return pltmp;
+			if (pltmp->node.next)
+				continue;
+			return pltmp;
+		}
+		return NULL;
+	}
+
+	hlist_for_each_entry(pltmp, progs, node) {
+		if ((anchor_prog && anchor_prog == pltmp->prog) ||
+		    (anchor_link && anchor_link == &pltmp->link->link)) {
+			if (!!(pltmp->flags & BPF_F_PREORDER) != preorder)
+				goto out;
+			pl = pltmp;
+			goto out;
+		}
+	}
+
+	pl = ERR_PTR(-ENOENT);
+out:
+	if (anchor_link)
+		bpf_link_put(anchor_link);
+	else
+		bpf_prog_put(anchor_prog);
+	return pl;
+}
+
+static int insert_pl_to_hlist(struct bpf_prog_list *pl, struct hlist_head *progs,
+			      struct bpf_prog *prog, struct bpf_cgroup_link *link,
+			      u32 flags, u32 id_or_fd)
+{
+	struct bpf_prog_list *pltmp;
+
+	pltmp = get_prog_list(progs, prog, link, flags, id_or_fd);
+	if (IS_ERR(pltmp))
+		return PTR_ERR(pltmp);
+
+	if (!pltmp)
+		hlist_add_head(&pl->node, progs);
+	else if (flags & BPF_F_BEFORE)
+		hlist_add_before(&pl->node, &pltmp->node);
+	else
+		hlist_add_behind(&pl->node, &pltmp->node);
+
+	return 0;
+}
+
 /**
  * __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and
  *                         propagate the change to descendants
@@ -667,6 +777,8 @@ static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
  * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set
  * @type: Type of attach operation
  * @flags: Option flags
+ * @id_or_fd: Relative prog id or fd
+ * @revision: bpf_prog_list revision
  *
  * Exactly one of @prog or @link can be non-null.
  * Must be called with cgroup_mutex held.
@@ -674,7 +786,8 @@ static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
 static int __cgroup_bpf_attach(struct cgroup *cgrp,
 			       struct bpf_prog *prog, struct bpf_prog *replace_prog,
 			       struct bpf_cgroup_link *link,
-			       enum bpf_attach_type type, u32 flags)
+			       enum bpf_attach_type type, u32 flags, u32 id_or_fd,
+			       u64 revision)
 {
 	u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
 	struct bpf_prog *old_prog = NULL;
@@ -690,6 +803,9 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
 	    ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI)))
 		/* invalid combination */
 		return -EINVAL;
+	if ((flags & BPF_F_REPLACE) && (flags & (BPF_F_BEFORE | BPF_F_AFTER)))
+		/* only either replace or insertion with before/after */
+		return -EINVAL;
 	if (link && (prog || replace_prog))
 		/* only either link or prog/replace_prog can be specified */
 		return -EINVAL;
@@ -700,6 +816,8 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
 	atype = bpf_cgroup_atype_find(type, new_prog->aux->attach_btf_id);
 	if (atype < 0)
 		return -EINVAL;
+	if (revision && revision != cgrp->bpf.revisions[atype])
+		return -ESTALE;
 
 	progs = &cgrp->bpf.progs[atype];
 
@@ -728,22 +846,18 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
 	if (pl) {
 		old_prog = pl->prog;
 	} else {
-		struct hlist_node *last = NULL;
-
 		pl = kmalloc(sizeof(*pl), GFP_KERNEL);
 		if (!pl) {
 			bpf_cgroup_storages_free(new_storage);
 			return -ENOMEM;
 		}
-		if (hlist_empty(progs))
-			hlist_add_head(&pl->node, progs);
-		else
-			hlist_for_each(last, progs) {
-				if (last->next)
-					continue;
-				hlist_add_behind(&pl->node, last);
-				break;
-			}
+
+		err = insert_pl_to_hlist(pl, progs, prog, link, flags, id_or_fd);
+		if (err) {
+			kfree(pl);
+			bpf_cgroup_storages_free(new_storage);
+			return err;
+		}
 	}
 
 	pl->prog = prog;
@@ -762,6 +876,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
 	if (err)
 		goto cleanup_trampoline;
 
+	cgrp->bpf.revisions[atype] += 1;
 	if (old_prog) {
 		if (type == BPF_LSM_CGROUP)
 			bpf_trampoline_unlink_cgroup_shim(old_prog);
@@ -793,12 +908,13 @@ static int cgroup_bpf_attach(struct cgroup *cgrp,
 			     struct bpf_prog *prog, struct bpf_prog *replace_prog,
 			     struct bpf_cgroup_link *link,
 			     enum bpf_attach_type type,
-			     u32 flags)
+			     u32 flags, u32 id_or_fd, u64 revision)
 {
 	int ret;
 
 	cgroup_lock();
-	ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
+	ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags,
+				  id_or_fd, revision);
 	cgroup_unlock();
 	return ret;
 }
@@ -886,6 +1002,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp,
 	if (!found)
 		return -ENOENT;
 
+	cgrp->bpf.revisions[atype] += 1;
 	old_prog = xchg(&link->link.prog, new_prog);
 	replace_effective_prog(cgrp, atype, link);
 	bpf_prog_put(old_prog);
@@ -1011,12 +1128,14 @@ found:
  * @prog: A program to detach or NULL
  * @link: A link to detach or NULL
  * @type: Type of detach operation
+ * @revision: bpf_prog_list revision
  *
  * At most one of @prog or @link can be non-NULL.
  * Must be called with cgroup_mutex held.
  */
 static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
-			       struct bpf_cgroup_link *link, enum bpf_attach_type type)
+			       struct bpf_cgroup_link *link, enum bpf_attach_type type,
+			       u64 revision)
 {
 	enum cgroup_bpf_attach_type atype;
 	struct bpf_prog *old_prog;
@@ -1034,6 +1153,9 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 	if (atype < 0)
 		return -EINVAL;
 
+	if (revision && revision != cgrp->bpf.revisions[atype])
+		return -ESTALE;
+
 	progs = &cgrp->bpf.progs[atype];
 	flags = cgrp->bpf.flags[atype];
 
@@ -1059,6 +1181,7 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 
 	/* now can actually delete it from this cgroup list */
 	hlist_del(&pl->node);
+	cgrp->bpf.revisions[atype] += 1;
 
 	kfree(pl);
 	if (hlist_empty(progs))
@@ -1074,12 +1197,12 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 }
 
 static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
-			     enum bpf_attach_type type)
+			     enum bpf_attach_type type, u64 revision)
 {
 	int ret;
 
 	cgroup_lock();
-	ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
+	ret = __cgroup_bpf_detach(cgrp, prog, NULL, type, revision);
 	cgroup_unlock();
 	return ret;
 }
@@ -1097,6 +1220,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 	struct bpf_prog_array *effective;
 	int cnt, ret = 0, i;
 	int total_cnt = 0;
+	u64 revision = 0;
 	u32 flags;
 
 	if (effective_query && prog_attach_flags)
@@ -1134,6 +1258,10 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 		return -EFAULT;
 	if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt)))
 		return -EFAULT;
+	if (!effective_query && from_atype == to_atype)
+		revision = cgrp->bpf.revisions[from_atype];
+	if (copy_to_user(&uattr->query.revision, &revision, sizeof(revision)))
+		return -EFAULT;
 	if (attr->query.prog_cnt == 0 || !prog_ids || !total_cnt)
 		/* return early if user requested only program count + flags */
 		return 0;
@@ -1216,7 +1344,8 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr,
 	}
 
 	ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL,
-				attr->attach_type, attr->attach_flags);
+				attr->attach_type, attr->attach_flags,
+				attr->relative_fd, attr->expected_revision);
 
 	if (replace_prog)
 		bpf_prog_put(replace_prog);
@@ -1238,7 +1367,7 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
 	if (IS_ERR(prog))
 		prog = NULL;
 
-	ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type);
+	ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, attr->expected_revision);
 	if (prog)
 		bpf_prog_put(prog);
 
@@ -1267,7 +1396,7 @@ static void bpf_cgroup_link_release(struct bpf_link *link)
 	}
 
 	WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link,
-				    cg_link->type));
+				    cg_link->type, 0));
 	if (cg_link->type == BPF_LSM_CGROUP)
 		bpf_trampoline_unlink_cgroup_shim(cg_link->link.prog);
 
@@ -1339,6 +1468,13 @@ static const struct bpf_link_ops bpf_cgroup_link_lops = {
 	.fill_link_info = bpf_cgroup_link_fill_link_info,
 };
 
+#define BPF_F_LINK_ATTACH_MASK	\
+	(BPF_F_ID |		\
+	 BPF_F_BEFORE |		\
+	 BPF_F_AFTER |		\
+	 BPF_F_PREORDER |	\
+	 BPF_F_LINK)
+
 int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 {
 	struct bpf_link_primer link_primer;
@@ -1346,7 +1482,7 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 	struct cgroup *cgrp;
 	int err;
 
-	if (attr->link_create.flags)
+	if (attr->link_create.flags & (~BPF_F_LINK_ATTACH_MASK))
 		return -EINVAL;
 
 	cgrp = cgroup_get_from_fd(attr->link_create.target_fd);
@@ -1370,7 +1506,9 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 	}
 
 	err = cgroup_bpf_attach(cgrp, NULL, NULL, link,
-				link->type, BPF_F_ALLOW_MULTI);
+				link->type, BPF_F_ALLOW_MULTI | attr->link_create.flags,
+				attr->link_create.cgroup.relative_fd,
+				attr->link_create.cgroup.expected_revision);
 	if (err) {
 		bpf_link_cleanup(&link_primer);
 		goto out_put_cgroup;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 89d027cd7ca0..97ad57ffc404 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -4186,6 +4186,25 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
 	}
 }
 
+static bool is_cgroup_prog_type(enum bpf_prog_type ptype, enum bpf_attach_type atype,
+				bool check_atype)
+{
+	switch (ptype) {
+	case BPF_PROG_TYPE_CGROUP_DEVICE:
+	case BPF_PROG_TYPE_CGROUP_SKB:
+	case BPF_PROG_TYPE_CGROUP_SOCK:
+	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+	case BPF_PROG_TYPE_CGROUP_SYSCTL:
+	case BPF_PROG_TYPE_SOCK_OPS:
+		return true;
+	case BPF_PROG_TYPE_LSM:
+		return check_atype ? atype == BPF_LSM_CGROUP : true;
+	default:
+		return false;
+	}
+}
+
 #define BPF_PROG_ATTACH_LAST_FIELD expected_revision
 
 #define BPF_F_ATTACH_MASK_BASE	\
@@ -4216,6 +4235,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	if (bpf_mprog_supported(ptype)) {
 		if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG)
 			return -EINVAL;
+	} else if (is_cgroup_prog_type(ptype, 0, false)) {
+		if (attr->attach_flags & ~(BPF_F_ATTACH_MASK_BASE | BPF_F_ATTACH_MASK_MPROG))
+			return -EINVAL;
 	} else {
 		if (attr->attach_flags & ~BPF_F_ATTACH_MASK_BASE)
 			return -EINVAL;
@@ -4233,6 +4255,11 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 		return -EINVAL;
 	}
 
+	if (is_cgroup_prog_type(ptype, prog->expected_attach_type, true)) {
+		ret = cgroup_bpf_prog_attach(attr, ptype, prog);
+		goto out;
+	}
+
 	switch (ptype) {
 	case BPF_PROG_TYPE_SK_SKB:
 	case BPF_PROG_TYPE_SK_MSG:
@@ -4244,20 +4271,6 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_PROG_TYPE_FLOW_DISSECTOR:
 		ret = netns_bpf_prog_attach(attr, prog);
 		break;
-	case BPF_PROG_TYPE_CGROUP_DEVICE:
-	case BPF_PROG_TYPE_CGROUP_SKB:
-	case BPF_PROG_TYPE_CGROUP_SOCK:
-	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
-	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
-	case BPF_PROG_TYPE_CGROUP_SYSCTL:
-	case BPF_PROG_TYPE_SOCK_OPS:
-	case BPF_PROG_TYPE_LSM:
-		if (ptype == BPF_PROG_TYPE_LSM &&
-		    prog->expected_attach_type != BPF_LSM_CGROUP)
-			ret = -EINVAL;
-		else
-			ret = cgroup_bpf_prog_attach(attr, ptype, prog);
-		break;
 	case BPF_PROG_TYPE_SCHED_CLS:
 		if (attr->attach_type == BPF_TCX_INGRESS ||
 		    attr->attach_type == BPF_TCX_EGRESS)
@@ -4268,7 +4281,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	default:
 		ret = -EINVAL;
 	}
-
+out:
 	if (ret)
 		bpf_prog_put(prog);
 	return ret;
@@ -4296,6 +4309,9 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 			if (IS_ERR(prog))
 				return PTR_ERR(prog);
 		}
+	} else if (is_cgroup_prog_type(ptype, 0, false)) {
+		if (attr->attach_flags || attr->relative_fd)
+			return -EINVAL;
 	} else if (attr->attach_flags ||
 		   attr->relative_fd ||
 		   attr->expected_revision) {
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index f1160ebbf526..25e9cf92ffaf 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1794,6 +1794,13 @@ union bpf_attr {
 				};
 				__u64		expected_revision;
 			} netkit;
+			struct {
+				union {
+					__u32	relative_fd;
+					__u32	relative_id;
+				};
+				__u64		expected_revision;
+			} cgroup;
 		};
 	} link_create;
 
-- 
cgit v1.2.3


From c7beb48344d2ea0f3f1869b078309dbeb2ed4c96 Mon Sep 17 00:00:00 2001
From: Tao Chen <chen.dylane@linux.dev>
Date: Sat, 7 Jun 2025 00:58:14 +0800
Subject: bpf: Add cookie to tracing bpf_link_info

bpf_tramp_link includes cookie info, we can add it in bpf_link_info.

Signed-off-by: Tao Chen <chen.dylane@linux.dev>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20250606165818.3394397-1-chen.dylane@linux.dev
---
 include/uapi/linux/bpf.h       | 2 ++
 kernel/bpf/syscall.c           | 1 +
 tools/include/uapi/linux/bpf.h | 2 ++
 3 files changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 25e9cf92ffaf..194ed9891b40 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6665,6 +6665,8 @@ struct bpf_link_info {
 			__u32 attach_type;
 			__u32 target_obj_id; /* prog_id for PROG_EXT, otherwise btf object id */
 			__u32 target_btf_id; /* BTF type id inside the object */
+			__u32 :32;
+			__u64 cookie;
 		} tracing;
 		struct {
 			__u64 cgroup_id;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0c267f37775b..85e080c3333d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3416,6 +3416,7 @@ static int bpf_tracing_link_fill_link_info(const struct bpf_link *link,
 		container_of(link, struct bpf_tracing_link, link.link);
 
 	info->tracing.attach_type = tr_link->attach_type;
+	info->tracing.cookie = tr_link->link.cookie;
 	bpf_trampoline_unpack_key(tr_link->trampoline->key,
 				  &info->tracing.target_obj_id,
 				  &info->tracing.target_btf_id);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 25e9cf92ffaf..194ed9891b40 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -6665,6 +6665,8 @@ struct bpf_link_info {
 			__u32 attach_type;
 			__u32 target_obj_id; /* prog_id for PROG_EXT, otherwise btf object id */
 			__u32 target_btf_id; /* BTF type id inside the object */
+			__u32 :32;
+			__u64 cookie;
 		} tracing;
 		struct {
 			__u64 cgroup_id;
-- 
cgit v1.2.3


From ae7795a8c2582b5fb7971132753810a3f158e7b2 Mon Sep 17 00:00:00 2001
From: Huan Tang <tanghuan@vivo.com>
Date: Fri, 23 May 2025 14:46:04 +0800
Subject: scsi: ufs: core: Add HID support

Following JESD220G, support HID (Host Initiated Defragmentation) through
sysfs. The relevant sysfs nodes are as follows:

	1. analysis_trigger
	2. defrag_trigger
	3. fragmented_size
	4. defrag_size
	5. progress_ratio
	6. state

The detailed definition of the six nodes can be found in the sysfs
documentation.

HID's execution policy is given to user-space.

Signed-off-by: Huan Tang <tanghuan@vivo.com>
Signed-off-by: Wenxing Cheng <wenxing.cheng@vivo.com>
Link: https://lore.kernel.org/r/20250523064604.800-1-tanghuan@vivo.com
Suggested-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Peter Wang <peter.wang@mediatek.com>
Reviewed-by: Bean Huo <huobean@gmail.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Yangtao Li <frank.li@vivo.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 Documentation/ABI/testing/sysfs-driver-ufs |  83 +++++++++++++
 drivers/ufs/core/ufs-sysfs.c               | 190 +++++++++++++++++++++++++++++
 drivers/ufs/core/ufshcd.c                  |   4 +
 include/ufs/ufs.h                          |  26 ++++
 4 files changed, 303 insertions(+)

(limited to 'include')

diff --git a/Documentation/ABI/testing/sysfs-driver-ufs b/Documentation/ABI/testing/sysfs-driver-ufs
index d4140dc6c5ba..f3de8c521bbd 100644
--- a/Documentation/ABI/testing/sysfs-driver-ufs
+++ b/Documentation/ABI/testing/sysfs-driver-ufs
@@ -1685,3 +1685,86 @@ Description:
 		================  ========================================
 
 		The file is read only.
+
+What:		/sys/bus/platform/drivers/ufshcd/*/hid/analysis_trigger
+What:		/sys/bus/platform/devices/*.ufs/hid/analysis_trigger
+Date:		May 2025
+Contact:	Huan Tang <tanghuan@vivo.com>
+Description:
+		The host can enable or disable HID analysis operation.
+
+		=======  =========================================
+		disable   disable HID analysis operation
+		enable    enable HID analysis operation
+		=======  =========================================
+
+		The file is write only.
+
+What:		/sys/bus/platform/drivers/ufshcd/*/hid/defrag_trigger
+What:		/sys/bus/platform/devices/*.ufs/hid/defrag_trigger
+Date:		May 2025
+Contact:	Huan Tang <tanghuan@vivo.com>
+Description:
+		The host can enable or disable HID defragmentation operation.
+
+		=======  =========================================
+		disable   disable HID defragmentation operation
+		enable    enable HID defragmentation operation
+		=======  =========================================
+
+		The attribute is write only.
+
+What:		/sys/bus/platform/drivers/ufshcd/*/hid/fragmented_size
+What:		/sys/bus/platform/devices/*.ufs/hid/fragmented_size
+Date:		May 2025
+Contact:	Huan Tang <tanghuan@vivo.com>
+Description:
+		The total fragmented size in the device is reported through
+		this attribute.
+
+		The attribute is read only.
+
+What:		/sys/bus/platform/drivers/ufshcd/*/hid/defrag_size
+What:		/sys/bus/platform/devices/*.ufs/hid/defrag_size
+Date:		May 2025
+Contact:	Huan Tang <tanghuan@vivo.com>
+Description:
+		The host sets the size to be defragmented by an HID
+		defragmentation operation.
+
+		The attribute is read/write.
+
+What:		/sys/bus/platform/drivers/ufshcd/*/hid/progress_ratio
+What:		/sys/bus/platform/devices/*.ufs/hid/progress_ratio
+Date:		May 2025
+Contact:	Huan Tang <tanghuan@vivo.com>
+Description:
+		Defragmentation progress is reported by this attribute,
+		which indicates the ratio of the completed defragmentation
+		size over the requested defragmentation size.
+
+		====  ============================================
+		1     1%
+		...
+		100   100%
+		====  ============================================
+
+		The attribute is read only.
+
+What:		/sys/bus/platform/drivers/ufshcd/*/hid/state
+What:		/sys/bus/platform/devices/*.ufs/hid/state
+Date:		May 2025
+Contact:	Huan Tang <tanghuan@vivo.com>
+Description:
+		The HID state is reported by this attribute.
+
+		====================   ===========================
+		idle			Idle (analysis required)
+		analysis_in_progress    Analysis in progress
+		defrag_required      	Defrag required
+		defrag_in_progress      Defrag in progress
+		defrag_completed      	Defrag completed
+		defrag_not_required     Defrag is not required
+		====================   ===========================
+
+		The attribute is read only.
diff --git a/drivers/ufs/core/ufs-sysfs.c b/drivers/ufs/core/ufs-sysfs.c
index de8b6acd4058..10006ae5ee35 100644
--- a/drivers/ufs/core/ufs-sysfs.c
+++ b/drivers/ufs/core/ufs-sysfs.c
@@ -87,6 +87,23 @@ static const char *ufs_wb_resize_status_to_string(enum wb_resize_status status)
 	}
 }
 
+static const char * const ufs_hid_states[] = {
+	[HID_IDLE]		= "idle",
+	[ANALYSIS_IN_PROGRESS]	= "analysis_in_progress",
+	[DEFRAG_REQUIRED]	= "defrag_required",
+	[DEFRAG_IN_PROGRESS]	= "defrag_in_progress",
+	[DEFRAG_COMPLETED]	= "defrag_completed",
+	[DEFRAG_NOT_REQUIRED]	= "defrag_not_required",
+};
+
+static const char *ufs_hid_state_to_string(enum ufs_hid_state state)
+{
+	if (state < NUM_UFS_HID_STATES)
+		return ufs_hid_states[state];
+
+	return "unknown";
+}
+
 static const char *ufshcd_uic_link_state_to_string(
 			enum uic_link_state state)
 {
@@ -1763,6 +1780,178 @@ static const struct attribute_group ufs_sysfs_attributes_group = {
 	.attrs = ufs_sysfs_attributes,
 };
 
+static int hid_query_attr(struct ufs_hba *hba, enum query_opcode opcode,
+			enum attr_idn idn, u32 *attr_val)
+{
+	int ret;
+
+	down(&hba->host_sem);
+	if (!ufshcd_is_user_access_allowed(hba)) {
+		up(&hba->host_sem);
+		return -EBUSY;
+	}
+
+	ufshcd_rpm_get_sync(hba);
+	ret = ufshcd_query_attr(hba, opcode, idn, 0, 0, attr_val);
+	ufshcd_rpm_put_sync(hba);
+
+	up(&hba->host_sem);
+	return ret;
+}
+
+static ssize_t analysis_trigger_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct ufs_hba *hba = dev_get_drvdata(dev);
+	int mode;
+	int ret;
+
+	if (sysfs_streq(buf, "enable"))
+		mode = HID_ANALYSIS_ENABLE;
+	else if (sysfs_streq(buf, "disable"))
+		mode = HID_ANALYSIS_AND_DEFRAG_DISABLE;
+	else
+		return -EINVAL;
+
+	ret = hid_query_attr(hba, UPIU_QUERY_OPCODE_WRITE_ATTR,
+			QUERY_ATTR_IDN_HID_DEFRAG_OPERATION, &mode);
+
+	return ret < 0 ? ret : count;
+}
+
+static DEVICE_ATTR_WO(analysis_trigger);
+
+static ssize_t defrag_trigger_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct ufs_hba *hba = dev_get_drvdata(dev);
+	int mode;
+	int ret;
+
+	if (sysfs_streq(buf, "enable"))
+		mode = HID_ANALYSIS_AND_DEFRAG_ENABLE;
+	else if (sysfs_streq(buf, "disable"))
+		mode = HID_ANALYSIS_AND_DEFRAG_DISABLE;
+	else
+		return -EINVAL;
+
+	ret = hid_query_attr(hba, UPIU_QUERY_OPCODE_WRITE_ATTR,
+			QUERY_ATTR_IDN_HID_DEFRAG_OPERATION, &mode);
+
+	return ret < 0 ? ret : count;
+}
+
+static DEVICE_ATTR_WO(defrag_trigger);
+
+static ssize_t fragmented_size_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct ufs_hba *hba = dev_get_drvdata(dev);
+	u32 value;
+	int ret;
+
+	ret = hid_query_attr(hba, UPIU_QUERY_OPCODE_READ_ATTR,
+			QUERY_ATTR_IDN_HID_AVAILABLE_SIZE, &value);
+	if (ret)
+		return ret;
+
+	return sysfs_emit(buf, "%u\n", value);
+}
+
+static DEVICE_ATTR_RO(fragmented_size);
+
+static ssize_t defrag_size_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct ufs_hba *hba = dev_get_drvdata(dev);
+	u32 value;
+	int ret;
+
+	ret = hid_query_attr(hba, UPIU_QUERY_OPCODE_READ_ATTR,
+			QUERY_ATTR_IDN_HID_SIZE, &value);
+	if (ret)
+		return ret;
+
+	return sysfs_emit(buf, "%u\n", value);
+}
+
+static ssize_t defrag_size_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct ufs_hba *hba = dev_get_drvdata(dev);
+	u32 value;
+	int ret;
+
+	if (kstrtou32(buf, 0, &value))
+		return -EINVAL;
+
+	ret = hid_query_attr(hba, UPIU_QUERY_OPCODE_WRITE_ATTR,
+			QUERY_ATTR_IDN_HID_SIZE, &value);
+
+	return ret < 0 ? ret : count;
+}
+
+static DEVICE_ATTR_RW(defrag_size);
+
+static ssize_t progress_ratio_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct ufs_hba *hba = dev_get_drvdata(dev);
+	u32 value;
+	int ret;
+
+	ret = hid_query_attr(hba, UPIU_QUERY_OPCODE_READ_ATTR,
+			QUERY_ATTR_IDN_HID_PROGRESS_RATIO, &value);
+	if (ret)
+		return ret;
+
+	return sysfs_emit(buf, "%u\n", value);
+}
+
+static DEVICE_ATTR_RO(progress_ratio);
+
+static ssize_t state_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct ufs_hba *hba = dev_get_drvdata(dev);
+	u32 value;
+	int ret;
+
+	ret = hid_query_attr(hba, UPIU_QUERY_OPCODE_READ_ATTR,
+			QUERY_ATTR_IDN_HID_STATE, &value);
+	if (ret)
+		return ret;
+
+	return sysfs_emit(buf, "%s\n", ufs_hid_state_to_string(value));
+}
+
+static DEVICE_ATTR_RO(state);
+
+static struct attribute *ufs_sysfs_hid[] = {
+	&dev_attr_analysis_trigger.attr,
+	&dev_attr_defrag_trigger.attr,
+	&dev_attr_fragmented_size.attr,
+	&dev_attr_defrag_size.attr,
+	&dev_attr_progress_ratio.attr,
+	&dev_attr_state.attr,
+	NULL,
+};
+
+static umode_t ufs_sysfs_hid_is_visible(struct kobject *kobj,
+		struct attribute *attr, int n)
+{
+	struct device *dev = container_of(kobj, struct device, kobj);
+	struct ufs_hba *hba = dev_get_drvdata(dev);
+
+	return	hba->dev_info.hid_sup ? attr->mode : 0;
+}
+
+static const struct attribute_group ufs_sysfs_hid_group = {
+	.name = "hid",
+	.attrs = ufs_sysfs_hid,
+	.is_visible = ufs_sysfs_hid_is_visible,
+};
+
 static const struct attribute_group *ufs_sysfs_groups[] = {
 	&ufs_sysfs_default_group,
 	&ufs_sysfs_capabilities_group,
@@ -1777,6 +1966,7 @@ static const struct attribute_group *ufs_sysfs_groups[] = {
 	&ufs_sysfs_string_descriptors_group,
 	&ufs_sysfs_flags_group,
 	&ufs_sysfs_attributes_group,
+	&ufs_sysfs_hid_group,
 	NULL,
 };
 
diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index 4410e7d93b7d..0a702356a715 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -8414,6 +8414,10 @@ static int ufs_get_device_desc(struct ufs_hba *hba)
 
 	dev_info->rtt_cap = desc_buf[DEVICE_DESC_PARAM_RTT_CAP];
 
+	dev_info->hid_sup = get_unaligned_be32(desc_buf +
+				DEVICE_DESC_PARAM_EXT_UFS_FEATURE_SUP) &
+				UFS_DEV_HID_SUPPORT;
+
 	model_index = desc_buf[DEVICE_DESC_PARAM_PRDCT_NAME];
 
 	err = ufshcd_read_string_desc(hba, model_index,
diff --git a/include/ufs/ufs.h b/include/ufs/ufs.h
index c0c59a8f7256..72fd385037a6 100644
--- a/include/ufs/ufs.h
+++ b/include/ufs/ufs.h
@@ -182,6 +182,11 @@ enum attr_idn {
 	QUERY_ATTR_IDN_CURR_WB_BUFF_SIZE        = 0x1F,
 	QUERY_ATTR_IDN_TIMESTAMP		= 0x30,
 	QUERY_ATTR_IDN_DEV_LVL_EXCEPTION_ID     = 0x34,
+	QUERY_ATTR_IDN_HID_DEFRAG_OPERATION	= 0x35,
+	QUERY_ATTR_IDN_HID_AVAILABLE_SIZE	= 0x36,
+	QUERY_ATTR_IDN_HID_SIZE			= 0x37,
+	QUERY_ATTR_IDN_HID_PROGRESS_RATIO	= 0x38,
+	QUERY_ATTR_IDN_HID_STATE		= 0x39,
 	QUERY_ATTR_IDN_WB_BUF_RESIZE_HINT	= 0x3C,
 	QUERY_ATTR_IDN_WB_BUF_RESIZE_EN		= 0x3D,
 	QUERY_ATTR_IDN_WB_BUF_RESIZE_STATUS	= 0x3E,
@@ -401,6 +406,7 @@ enum {
 	UFS_DEV_HPB_SUPPORT		= BIT(7),
 	UFS_DEV_WRITE_BOOSTER_SUP	= BIT(8),
 	UFS_DEV_LVL_EXCEPTION_SUP       = BIT(12),
+	UFS_DEV_HID_SUPPORT		= BIT(13),
 };
 #define UFS_DEV_HPB_SUPPORT_VERSION		0x310
 
@@ -466,6 +472,24 @@ enum ufs_ref_clk_freq {
 	REF_CLK_FREQ_INVAL	= -1,
 };
 
+/* bDefragOperation attribute values */
+enum ufs_hid_defrag_operation {
+	HID_ANALYSIS_AND_DEFRAG_DISABLE	= 0,
+	HID_ANALYSIS_ENABLE		= 1,
+	HID_ANALYSIS_AND_DEFRAG_ENABLE	= 2,
+};
+
+/* bHIDState attribute values */
+enum ufs_hid_state {
+	HID_IDLE		= 0,
+	ANALYSIS_IN_PROGRESS	= 1,
+	DEFRAG_REQUIRED		= 2,
+	DEFRAG_IN_PROGRESS	= 3,
+	DEFRAG_COMPLETED	= 4,
+	DEFRAG_NOT_REQUIRED	= 5,
+	NUM_UFS_HID_STATES	= 6,
+};
+
 /* bWriteBoosterBufferResizeEn attribute */
 enum wb_resize_en {
 	WB_RESIZE_EN_IDLE	= 0,
@@ -625,6 +649,8 @@ struct ufs_dev_info {
 	u32 rtc_update_period;
 
 	u8 rtt_cap; /* bDeviceRTTCap */
+
+	bool hid_sup;
 };
 
 /*
-- 
cgit v1.2.3


From 9d2c232d575a8c8dfa66276ed7edccfac482a4df Mon Sep 17 00:00:00 2001
From: Kassey Li <quic_yingangl@quicinc.com>
Date: Wed, 21 May 2025 09:17:11 +0800
Subject: scsi: trace: Show rtn in string for scsi_dispatch_cmd_error()

By default the scsi_dispatch_cmd_error() return value is displayed in
decimal:

  kworker/3:1H-183 [003] ....  51.035474: scsi_dispatch_cmd_error: host_no=0 channel=0 id=0 lun=4 data_sgl=1  prot_sgl=0 prot_op=SCSI_PROT_NORMAL cmnd=(READ_10 lba=3907214  txlen=1 protect=0 raw=28 00 00 3b 9e 8e 00 00 01 00) rtn=4181

However, these numbers are not particularly helpful when debugging
errors, especially since the kernel code consistently uses the following
defines in hexadecimal:

  SCSI_MLQUEUE_HOST_BUSY   0x1055
  SCSI_MLQUEUE_DEVICE_BUSY 0x1056
  SCSI_MLQUEUE_EH_RETRY    0x1057
  SCSI_MLQUEUE_TARGET_BUSY 0x1058

Switch to using the string form of these values in the trace output:

  dd-1059    [007] .....    31.689529: scsi_dispatch_cmd_error: host_no=0 channel=0 id=0 lun=4 data_sgl=65 prot_sgl=0 prot_op=SCSI_PROT_NORMAL driver_tag=23 scheduler_tag=117 cmnd=(READ_10 lba=0 txlen=128 protect=0 raw=28 00 00 00 00 00 00 00 80 00) rtn=SCSI_MLQUEUE_DEVICE_BUSY

Signed-off-by: Kassey Li <quic_yingangl@quicinc.com>
Link: https://lore.kernel.org/r/20250521011711.1983625-1-quic_yingangl@quicinc.com
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/trace/events/scsi.h | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/scsi.h b/include/trace/events/scsi.h
index bf6cc98d9122..c36c72ab7f2b 100644
--- a/include/trace/events/scsi.h
+++ b/include/trace/events/scsi.h
@@ -200,6 +200,14 @@ TRACE_EVENT(scsi_dispatch_cmd_start,
 		  __print_hex(__get_dynamic_array(cmnd), __entry->cmd_len))
 );
 
+#define scsi_rtn_name(result)	{ result, #result }
+#define show_rtn_name(val)					\
+	__print_symbolic(val,					\
+		scsi_rtn_name(SCSI_MLQUEUE_HOST_BUSY),		\
+		scsi_rtn_name(SCSI_MLQUEUE_DEVICE_BUSY),	\
+		scsi_rtn_name(SCSI_MLQUEUE_EH_RETRY),		\
+		scsi_rtn_name(SCSI_MLQUEUE_TARGET_BUSY))
+
 TRACE_EVENT(scsi_dispatch_cmd_error,
 
 	TP_PROTO(struct scsi_cmnd *cmd, int rtn),
@@ -240,14 +248,15 @@ TRACE_EVENT(scsi_dispatch_cmd_error,
 
 	TP_printk("host_no=%u channel=%u id=%u lun=%u data_sgl=%u prot_sgl=%u" \
 		  " prot_op=%s driver_tag=%d scheduler_tag=%d cmnd=(%s %s raw=%s)" \
-		  " rtn=%d",
+		  " rtn=%s",
 		  __entry->host_no, __entry->channel, __entry->id,
 		  __entry->lun, __entry->data_sglen, __entry->prot_sglen,
 		  show_prot_op_name(__entry->prot_op), __entry->driver_tag,
 		  __entry->scheduler_tag, show_opcode_name(__entry->opcode),
 		  __parse_cdb(__get_dynamic_array(cmnd), __entry->cmd_len),
 		  __print_hex(__get_dynamic_array(cmnd), __entry->cmd_len),
-		  __entry->rtn)
+		  show_rtn_name(__entry->rtn)
+	  )
 );
 
 DECLARE_EVENT_CLASS(scsi_cmd_done_timeout_template,
-- 
cgit v1.2.3


From 03c68a0f8c68936a0bb915b030693923784724cb Mon Sep 17 00:00:00 2001
From: Luis Gerhorst <luis.gerhorst@fau.de>
Date: Tue, 3 Jun 2025 23:13:18 +0200
Subject: bpf, arm64, powerpc: Add bpf_jit_bypass_spec_v1/v4()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

JITs can set bpf_jit_bypass_spec_v1/v4() if they want the verifier to
skip analysis/patching for the respective vulnerability. For v4, this
will reduce the number of barriers the verifier inserts. For v1, it
allows more programs to be accepted.

The primary motivation for this is to not regress unpriv BPF's
performance on ARM64 in a future commit where BPF_NOSPEC is also used
against Spectre v1.

This has the user-visible change that v1-induced rejections on
non-vulnerable PowerPC CPUs are avoided.

For now, this does not change the semantics of BPF_NOSPEC. It is still a
v4-only barrier and must not be implemented if bypass_spec_v4 is always
true for the arch. Changing it to a v1 AND v4-barrier is done in a
future commit.

As an alternative to bypass_spec_v1/v4, one could introduce NOSPEC_V1
AND NOSPEC_V4 instructions and allow backends to skip their lowering as
suggested by commit f5e81d111750 ("bpf: Introduce BPF nospec instruction
for mitigating Spectre v4"). Adding bpf_jit_bypass_spec_v1/v4() was
found to be preferable for the following reason:

* bypass_spec_v1/v4 benefits non-vulnerable CPUs: Always performing the
  same analysis (not taking into account whether the current CPU is
  vulnerable), needlessly restricts users of CPUs that are not
  vulnerable. The only use case for this would be portability-testing,
  but this can later be added easily when needed by allowing users to
  force bypass_spec_v1/v4 to false.

* Portability is still acceptable: Directly disabling the analysis
  instead of skipping the lowering of BPF_NOSPEC(_V1/V4) might allow
  programs on non-vulnerable CPUs to be accepted while the program will
  be rejected on vulnerable CPUs. With the fallback to speculation
  barriers for Spectre v1 implemented in a future commit, this will only
  affect programs that do variable stack-accesses or are very complex.

For PowerPC, the SEC_FTR checking in bpf_jit_bypass_spec_v4() is based
on the check that was previously located in the BPF_NOSPEC case.

For LoongArch, it would likely be safe to set both
bpf_jit_bypass_spec_v1() and _v4() according to
commit a6f6a95f2580 ("LoongArch, bpf: Fix jit to skip speculation
barrier opcode"). This is omitted here as I am unable to do any testing
for LoongArch.

Hari's ack concerns the PowerPC part only.

Signed-off-by: Luis Gerhorst <luis.gerhorst@fau.de>
Acked-by: Hari Bathini <hbathini@linux.ibm.com>
Cc: Henriette Herzog <henriette.herzog@rub.de>
Cc: Maximilian Ott <ott@cs.fau.de>
Cc: Milan Stephan <milan.stephan@fau.de>
Link: https://lore.kernel.org/r/20250603211318.337474-1-luis.gerhorst@fau.de
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/arm64/net/bpf_jit_comp.c     | 21 ++++++++++++---------
 arch/powerpc/net/bpf_jit_comp64.c | 21 +++++++++++++++++----
 include/linux/bpf.h               | 11 +++++++++--
 kernel/bpf/core.c                 | 15 +++++++++++++++
 4 files changed, 53 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index da8b89dd2910..2cab9063f563 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -1632,15 +1632,7 @@ emit_cond_jmp:
 
 	/* speculation barrier */
 	case BPF_ST | BPF_NOSPEC:
-		/*
-		 * Nothing required here.
-		 *
-		 * In case of arm64, we rely on the firmware mitigation of
-		 * Speculative Store Bypass as controlled via the ssbd kernel
-		 * parameter. Whenever the mitigation is enabled, it works
-		 * for all of the kernel code with no need to provide any
-		 * additional instructions.
-		 */
+		/* See bpf_jit_bypass_spec_v4() */
 		break;
 
 	/* ST: *(size *)(dst + off) = imm */
@@ -2911,6 +2903,17 @@ bool bpf_jit_supports_percpu_insn(void)
 	return true;
 }
 
+bool bpf_jit_bypass_spec_v4(void)
+{
+	/* In case of arm64, we rely on the firmware mitigation of Speculative
+	 * Store Bypass as controlled via the ssbd kernel parameter. Whenever
+	 * the mitigation is enabled, it works for all of the kernel code with
+	 * no need to provide any additional instructions. Therefore, skip
+	 * inserting nospec insns against Spectre v4.
+	 */
+	return true;
+}
+
 bool bpf_jit_inlines_helper_call(s32 imm)
 {
 	switch (imm) {
diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index 5daa77aee7f7..a4335761b7f9 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -370,6 +370,23 @@ static int bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 o
 	return 0;
 }
 
+bool bpf_jit_bypass_spec_v1(void)
+{
+#if defined(CONFIG_PPC_E500) || defined(CONFIG_PPC_BOOK3S_64)
+	return !(security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) &&
+		 security_ftr_enabled(SEC_FTR_BNDS_CHK_SPEC_BAR));
+#else
+	return true;
+#endif
+}
+
+bool bpf_jit_bypass_spec_v4(void)
+{
+	return !(security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) &&
+		 security_ftr_enabled(SEC_FTR_STF_BARRIER) &&
+		 stf_barrier_type_get() != STF_BARRIER_NONE);
+}
+
 /*
  * We spill into the redzone always, even if the bpf program has its own stackframe.
  * Offsets hardcoded based on BPF_PPC_STACK_SAVE -- see bpf_jit_stack_local()
@@ -791,10 +808,6 @@ emit_clear:
 		 * BPF_ST NOSPEC (speculation barrier)
 		 */
 		case BPF_ST | BPF_NOSPEC:
-			if (!security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) ||
-					!security_ftr_enabled(SEC_FTR_STF_BARRIER))
-				break;
-
 			switch (stf_barrier) {
 			case STF_BARRIER_EIEIO:
 				EMIT(PPC_RAW_EIEIO() | 0x02000000);
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5b25d278409b..5dd556e89cce 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2288,6 +2288,9 @@ bpf_prog_run_array_uprobe(const struct bpf_prog_array *array,
 	return ret;
 }
 
+bool bpf_jit_bypass_spec_v1(void);
+bool bpf_jit_bypass_spec_v4(void);
+
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);
 extern struct mutex bpf_stats_enabled_mutex;
@@ -2475,12 +2478,16 @@ static inline bool bpf_allow_uninit_stack(const struct bpf_token *token)
 
 static inline bool bpf_bypass_spec_v1(const struct bpf_token *token)
 {
-	return cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON);
+	return bpf_jit_bypass_spec_v1() ||
+		cpu_mitigations_off() ||
+		bpf_token_capable(token, CAP_PERFMON);
 }
 
 static inline bool bpf_bypass_spec_v4(const struct bpf_token *token)
 {
-	return cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON);
+	return bpf_jit_bypass_spec_v4() ||
+		cpu_mitigations_off() ||
+		bpf_token_capable(token, CAP_PERFMON);
 }
 
 int bpf_map_new_fd(struct bpf_map *map, int flags);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index c20babbf998f..f9bd9625438b 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -3034,6 +3034,21 @@ bool __weak bpf_jit_needs_zext(void)
 	return false;
 }
 
+/* By default, enable the verifier's mitigations against Spectre v1 and v4 for
+ * all archs. The value returned must not change at runtime as there is
+ * currently no support for reloading programs that were loaded without
+ * mitigations.
+ */
+bool __weak bpf_jit_bypass_spec_v1(void)
+{
+	return false;
+}
+
+bool __weak bpf_jit_bypass_spec_v4(void)
+{
+	return false;
+}
+
 /* Return true if the JIT inlines the call to the helper corresponding to
  * the imm.
  *
-- 
cgit v1.2.3


From dff883d9e93a7f2f2fa4e38a9444b2c79d6da91a Mon Sep 17 00:00:00 2001
From: Luis Gerhorst <luis.gerhorst@fau.de>
Date: Tue, 3 Jun 2025 23:17:03 +0200
Subject: bpf, arm64, powerpc: Change nospec to include v1 barrier
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This changes the semantics of BPF_NOSPEC (previously a v4-only barrier)
to always emit a speculation barrier that works against both Spectre v1
AND v4. If mitigation is not needed on an architecture, the backend
should set bpf_jit_bypass_spec_v4/v1().

As of now, this commit only has the user-visible implication that unpriv
BPF's performance on PowerPC is reduced. This is the case because we
have to emit additional v1 barrier instructions for BPF_NOSPEC now.

This commit is required for a future commit to allow us to rely on
BPF_NOSPEC for Spectre v1 mitigation. As of this commit, the feature
that nospec acts as a v1 barrier is unused.

Commit f5e81d111750 ("bpf: Introduce BPF nospec instruction for
mitigating Spectre v4") noted that mitigation instructions for v1 and v4
might be different on some archs. While this would potentially offer
improved performance on PowerPC, it was dismissed after the following
considerations:

* Only having one barrier simplifies the verifier and allows us to
  easily rely on v4-induced barriers for reducing the complexity of
  v1-induced speculative path verification.

* For the architectures that implemented BPF_NOSPEC, only PowerPC has
  distinct instructions for v1 and v4. Even there, some insns may be
  shared between the barriers for v1 and v4 (e.g., 'ori 31,31,0' and
  'sync'). If this is still found to impact performance in an
  unacceptable way, BPF_NOSPEC can be split into BPF_NOSPEC_V1 and
  BPF_NOSPEC_V4 later. As an optimization, we can already skip v1/v4
  insns from being emitted for PowerPC with this setup if
  bypass_spec_v1/v4 is set.

Vulnerability-status for BPF_NOSPEC-based Spectre mitigations (v4 as of
this commit, v1 in the future) is therefore:

* x86 (32-bit and 64-bit), ARM64, and PowerPC (64-bit): Mitigated - This
  patch implements BPF_NOSPEC for these architectures. The previous
  v4-only version was supported since commit f5e81d111750 ("bpf:
  Introduce BPF nospec instruction for mitigating Spectre v4") and
  commit b7540d625094 ("powerpc/bpf: Emit stf barrier instruction
  sequences for BPF_NOSPEC").

* LoongArch: Not Vulnerable - Commit a6f6a95f2580 ("LoongArch, bpf: Fix
  jit to skip speculation barrier opcode") is the only other past commit
  related to BPF_NOSPEC and indicates that the insn is not required
  there.

* MIPS: Vulnerable (if unprivileged BPF is enabled) -
  Commit a6f6a95f2580 ("LoongArch, bpf: Fix jit to skip speculation
  barrier opcode") indicates that it is not vulnerable, but this
  contradicts the kernel and Debian documentation. Therefore, I assume
  that there exist vulnerable MIPS CPUs (but maybe not from Loongson?).
  In the future, BPF_NOSPEC could be implemented for MIPS based on the
  GCC speculation_barrier [1]. For now, we rely on unprivileged BPF
  being disabled by default.

* Other: Unknown - To the best of my knowledge there is no definitive
  information available that indicates that any other arch is
  vulnerable. They are therefore left untouched (BPF_NOSPEC is not
  implemented, but bypass_spec_v1/v4 is also not set).

I did the following testing to ensure the insn encoding is correct:

* ARM64:
  * 'dsb nsh; isb' was successfully tested with the BPF CI in [2]
  * 'sb' locally using QEMU v7.2.15 -cpu max (emitted sb insn is
    executed for example with './test_progs -t verifier_array_access')

* PowerPC: The following configs were tested locally with ppc64le QEMU
  v8.2 '-machine pseries -cpu POWER9':
  * STF_BARRIER_EIEIO + CONFIG_PPC_BOOK3S_64
  * STF_BARRIER_SYNC_ORI (forced on) + CONFIG_PPC_BOOK3S_64
  * STF_BARRIER_FALLBACK (forced on) + CONFIG_PPC_BOOK3S_64
  * CONFIG_PPC_E500 (forced on) + STF_BARRIER_EIEIO
  * CONFIG_PPC_E500 (forced on) + STF_BARRIER_SYNC_ORI (forced on)
  * CONFIG_PPC_E500 (forced on) + STF_BARRIER_FALLBACK (forced on)
  * CONFIG_PPC_E500 (forced on) + STF_BARRIER_NONE (forced on)
  Most of those combinations should not occur in practice, but I was not
  able to get a PPC e6500 rootfs (for testing PPC_E500 without forcing
  it on). In any case, this should ensure that there are no unexpected
  conflicts between the insns when combined like this. Individual v1/v4
  barriers were already emitted elsewhere.

Hari's ack is for the PowerPC changes only.

[1] https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=29b74545531f6afbee9fc38c267524326dbfbedf
    ("MIPS: Add speculation_barrier support")
[2] https://github.com/kernel-patches/bpf/pull/8576

Signed-off-by: Luis Gerhorst <luis.gerhorst@fau.de>
Acked-by: Hari Bathini <hbathini@linux.ibm.com>
Cc: Henriette Herzog <henriette.herzog@rub.de>
Cc: Maximilian Ott <ott@cs.fau.de>
Cc: Milan Stephan <milan.stephan@fau.de>
Link: https://lore.kernel.org/r/20250603211703.337860-1-luis.gerhorst@fau.de
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/arm64/net/bpf_jit.h          |  5 ++++
 arch/arm64/net/bpf_jit_comp.c     |  9 ++++--
 arch/powerpc/net/bpf_jit_comp64.c | 59 ++++++++++++++++++++++++++++-----------
 include/linux/filter.h            |  2 +-
 kernel/bpf/core.c                 | 17 +++++------
 5 files changed, 65 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/net/bpf_jit.h b/arch/arm64/net/bpf_jit.h
index a3b0e693a125..bbea4f36f9f2 100644
--- a/arch/arm64/net/bpf_jit.h
+++ b/arch/arm64/net/bpf_jit.h
@@ -325,4 +325,9 @@
 #define A64_MRS_SP_EL0(Rt) \
 	aarch64_insn_gen_mrs(Rt, AARCH64_INSN_SYSREG_SP_EL0)
 
+/* Barriers */
+#define A64_SB aarch64_insn_get_sb_value()
+#define A64_DSB_NSH (aarch64_insn_get_dsb_base_value() | 0x7 << 8)
+#define A64_ISB aarch64_insn_get_isb_value()
+
 #endif /* _BPF_JIT_H */
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 2cab9063f563..b6c42b5c9668 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -1630,9 +1630,14 @@ emit_cond_jmp:
 			return ret;
 		break;
 
-	/* speculation barrier */
+	/* speculation barrier against v1 and v4 */
 	case BPF_ST | BPF_NOSPEC:
-		/* See bpf_jit_bypass_spec_v4() */
+		if (alternative_has_cap_likely(ARM64_HAS_SB)) {
+			emit(A64_SB, ctx);
+		} else {
+			emit(A64_DSB_NSH, ctx);
+			emit(A64_ISB, ctx);
+		}
 		break;
 
 	/* ST: *(size *)(dst + off) = imm */
diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index a4335761b7f9..3665ff8bb4bc 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -414,6 +414,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code
 		       u32 *addrs, int pass, bool extra_pass)
 {
 	enum stf_barrier_type stf_barrier = stf_barrier_type_get();
+	bool sync_emitted, ori31_emitted;
 	const struct bpf_insn *insn = fp->insnsi;
 	int flen = fp->len;
 	int i, ret;
@@ -806,26 +807,52 @@ emit_clear:
 
 		/*
 		 * BPF_ST NOSPEC (speculation barrier)
+		 *
+		 * The following must act as a barrier against both Spectre v1
+		 * and v4 if we requested both mitigations. Therefore, also emit
+		 * 'isync; sync' on E500 or 'ori31' on BOOK3S_64 in addition to
+		 * the insns needed for a Spectre v4 barrier.
+		 *
+		 * If we requested only !bypass_spec_v1 OR only !bypass_spec_v4,
+		 * we can skip the respective other barrier type as an
+		 * optimization.
 		 */
 		case BPF_ST | BPF_NOSPEC:
-			switch (stf_barrier) {
-			case STF_BARRIER_EIEIO:
-				EMIT(PPC_RAW_EIEIO() | 0x02000000);
-				break;
-			case STF_BARRIER_SYNC_ORI:
+			sync_emitted = false;
+			ori31_emitted = false;
+#ifdef CONFIG_PPC_E500
+			if (!bpf_jit_bypass_spec_v1()) {
+				EMIT(PPC_RAW_ISYNC());
 				EMIT(PPC_RAW_SYNC());
-				EMIT(PPC_RAW_LD(tmp1_reg, _R13, 0));
-				EMIT(PPC_RAW_ORI(_R31, _R31, 0));
-				break;
-			case STF_BARRIER_FALLBACK:
-				ctx->seen |= SEEN_FUNC;
-				PPC_LI64(_R12, dereference_kernel_function_descriptor(bpf_stf_barrier));
-				EMIT(PPC_RAW_MTCTR(_R12));
-				EMIT(PPC_RAW_BCTRL());
-				break;
-			case STF_BARRIER_NONE:
-				break;
+				sync_emitted = true;
+			}
+#endif
+			if (!bpf_jit_bypass_spec_v4()) {
+				switch (stf_barrier) {
+				case STF_BARRIER_EIEIO:
+					EMIT(PPC_RAW_EIEIO() | 0x02000000);
+					break;
+				case STF_BARRIER_SYNC_ORI:
+					if (!sync_emitted)
+						EMIT(PPC_RAW_SYNC());
+					EMIT(PPC_RAW_LD(tmp1_reg, _R13, 0));
+					EMIT(PPC_RAW_ORI(_R31, _R31, 0));
+					ori31_emitted = true;
+					break;
+				case STF_BARRIER_FALLBACK:
+					ctx->seen |= SEEN_FUNC;
+					PPC_LI64(_R12, dereference_kernel_function_descriptor(bpf_stf_barrier));
+					EMIT(PPC_RAW_MTCTR(_R12));
+					EMIT(PPC_RAW_BCTRL());
+					break;
+				case STF_BARRIER_NONE:
+					break;
+				}
 			}
+#ifdef CONFIG_PPC_BOOK3S_64
+			if (!bpf_jit_bypass_spec_v1() && !ori31_emitted)
+				EMIT(PPC_RAW_ORI(_R31, _R31, 0));
+#endif
 			break;
 
 		/*
diff --git a/include/linux/filter.h b/include/linux/filter.h
index f5cf4d35d83e..eca229752cbe 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -82,7 +82,7 @@ struct ctl_table_header;
 #define BPF_CALL_ARGS	0xe0
 
 /* unused opcode to mark speculation barrier for mitigating
- * Speculative Store Bypass
+ * Spectre v1 and v4
  */
 #define BPF_NOSPEC	0xc0
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index f9bd9625438b..e536a34a32c8 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2102,14 +2102,15 @@ out:
 #undef COND_JMP
 	/* ST, STX and LDX*/
 	ST_NOSPEC:
-		/* Speculation barrier for mitigating Speculative Store Bypass.
-		 * In case of arm64, we rely on the firmware mitigation as
-		 * controlled via the ssbd kernel parameter. Whenever the
-		 * mitigation is enabled, it works for all of the kernel code
-		 * with no need to provide any additional instructions here.
-		 * In case of x86, we use 'lfence' insn for mitigation. We
-		 * reuse preexisting logic from Spectre v1 mitigation that
-		 * happens to produce the required code on x86 for v4 as well.
+		/* Speculation barrier for mitigating Speculative Store Bypass,
+		 * Bounds-Check Bypass and Type Confusion. In case of arm64, we
+		 * rely on the firmware mitigation as controlled via the ssbd
+		 * kernel parameter. Whenever the mitigation is enabled, it
+		 * works for all of the kernel code with no need to provide any
+		 * additional instructions here. In case of x86, we use 'lfence'
+		 * insn for mitigation. We reuse preexisting logic from Spectre
+		 * v1 mitigation that happens to produce the required code on
+		 * x86 for v4 as well.
 		 */
 		barrier_nospec();
 		CONT;
-- 
cgit v1.2.3


From 9124a4508007f146206a279f0c5e81dde314bda1 Mon Sep 17 00:00:00 2001
From: Luis Gerhorst <luis.gerhorst@fau.de>
Date: Tue, 3 Jun 2025 23:20:24 +0200
Subject: bpf: Rename sanitize_stack_spill to nospec_result

This rename clarifies that the flag causes a nospec to be added after
this insn and can therefore be relied upon to reduce speculative path
analysis.

Signed-off-by: Luis Gerhorst <luis.gerhorst@fau.de>
Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Cc: Henriette Herzog <henriette.herzog@rub.de>
Cc: Maximilian Ott <ott@cs.fau.de>
Cc: Milan Stephan <milan.stephan@fau.de>
Link: https://lore.kernel.org/r/20250603212024.338154-1-luis.gerhorst@fau.de
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h | 2 +-
 kernel/bpf/verifier.c        | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 256274acb1d8..2b0954202226 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -580,7 +580,7 @@ struct bpf_insn_aux_data {
 	u64 map_key_state; /* constant (32 bit) key tracking for maps */
 	int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
 	u32 seen; /* this insn was processed by the verifier at env->pass_cnt */
-	bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */
+	bool nospec_result; /* result is unsafe under speculation, nospec must follow */
 	bool zext_dst; /* this insn zero extends dst reg */
 	bool needs_zext; /* alu op needs to clear upper bits */
 	bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 04465e317f10..79ae0ee395b0 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5027,7 +5027,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 		}
 
 		if (sanitize)
-			env->insn_aux_data[insn_idx].sanitize_stack_spill = true;
+			env->insn_aux_data[insn_idx].nospec_result = true;
 	}
 
 	err = destroy_if_dynptr_stack_slot(env, state, spi);
@@ -20930,7 +20930,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		}
 
 		if (type == BPF_WRITE &&
-		    env->insn_aux_data[i + delta].sanitize_stack_spill) {
+		    env->insn_aux_data[i + delta].nospec_result) {
 			struct bpf_insn patch[] = {
 				*insn,
 				BPF_ST_NOSPEC(),
-- 
cgit v1.2.3


From d6f1c85f22534d2d9fea9b32645da19c91ebe7d2 Mon Sep 17 00:00:00 2001
From: Luis Gerhorst <luis.gerhorst@fau.de>
Date: Tue, 3 Jun 2025 23:24:28 +0200
Subject: bpf: Fall back to nospec for Spectre v1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This implements the core of the series and causes the verifier to fall
back to mitigating Spectre v1 using speculation barriers. The approach
was presented at LPC'24 [1] and RAID'24 [2].

If we find any forbidden behavior on a speculative path, we insert a
nospec (e.g., lfence speculation barrier on x86) before the instruction
and stop verifying the path. While verifying a speculative path, we can
furthermore stop verification of that path whenever we encounter a
nospec instruction.

A minimal example program would look as follows:

	A = true
	B = true
	if A goto e
	f()
	if B goto e
	unsafe()
e:	exit

There are the following speculative and non-speculative paths
(`cur->speculative` and `speculative` referring to the value of the
push_stack() parameters):

- A = true
- B = true
- if A goto e
  - A && !cur->speculative && !speculative
    - exit
  - !A && !cur->speculative && speculative
    - f()
    - if B goto e
      - B && cur->speculative && !speculative
        - exit
      - !B && cur->speculative && speculative
        - unsafe()

If f() contains any unsafe behavior under Spectre v1 and the unsafe
behavior matches `state->speculative &&
error_recoverable_with_nospec(err)`, do_check() will now add a nospec
before f() instead of rejecting the program:

	A = true
	B = true
	if A goto e
	nospec
	f()
	if B goto e
	unsafe()
e:	exit

Alternatively, the algorithm also takes advantage of nospec instructions
inserted for other reasons (e.g., Spectre v4). Taking the program above
as an example, speculative path exploration can stop before f() if a
nospec was inserted there because of Spectre v4 sanitization.

In this example, all instructions after the nospec are dead code (and
with the nospec they are also dead code speculatively).

For this, it relies on the fact that speculation barriers generally
prevent all later instructions from executing if the speculation was not
correct:

* On Intel x86_64, lfence acts as full speculation barrier, not only as
  a load fence [3]:

    An LFENCE instruction or a serializing instruction will ensure that
    no later instructions execute, even speculatively, until all prior
    instructions complete locally. [...] Inserting an LFENCE instruction
    after a bounds check prevents later operations from executing before
    the bound check completes.

  This was experimentally confirmed in [4].

* On AMD x86_64, lfence is dispatch-serializing [5] (requires MSR
  C001_1029[1] to be set if the MSR is supported, this happens in
  init_amd()). AMD further specifies "A dispatch serializing instruction
  forces the processor to retire the serializing instruction and all
  previous instructions before the next instruction is executed" [8]. As
  dispatch is not specific to memory loads or branches, lfence therefore
  also affects all instructions there. Also, if retiring a branch means
  its PC change becomes architectural (should be), this means any
  "wrong" speculation is aborted as required for this series.

* ARM's SB speculation barrier instruction also affects "any instruction
  that appears later in the program order than the barrier" [6].

* PowerPC's barrier also affects all subsequent instructions [7]:

    [...] executing an ori R31,R31,0 instruction ensures that all
    instructions preceding the ori R31,R31,0 instruction have completed
    before the ori R31,R31,0 instruction completes, and that no
    subsequent instructions are initiated, even out-of-order, until
    after the ori R31,R31,0 instruction completes. The ori R31,R31,0
    instruction may complete before storage accesses associated with
    instructions preceding the ori R31,R31,0 instruction have been
    performed

Regarding the example, this implies that `if B goto e` will not execute
before `if A goto e` completes. Once `if A goto e` completes, the CPU
should find that the speculation was wrong and continue with `exit`.

If there is any other path that leads to `if B goto e` (and therefore
`unsafe()`) without going through `if A goto e`, then a nospec will
still be needed there. However, this patch assumes this other path will
be explored separately and therefore be discovered by the verifier even
if the exploration discussed here stops at the nospec.

This patch furthermore has the unfortunate consequence that Spectre v1
mitigations now only support architectures which implement BPF_NOSPEC.
Before this commit, Spectre v1 mitigations prevented exploits by
rejecting the programs on all architectures. Because some JITs do not
implement BPF_NOSPEC, this patch therefore may regress unpriv BPF's
security to a limited extent:

* The regression is limited to systems that are vulnerable to Spectre
  v1, have unprivileged BPF enabled, and do NOT emit insns for
  BPF_NOSPEC. The latter is not the case for x86 64- and 32-bit, arm64,
  and powerpc 64-bit and they are therefore not affected by the
  regression.
  According to commit a6f6a95f2580 ("LoongArch, bpf: Fix jit to skip
  speculation barrier opcode"), LoongArch is not vulnerable to Spectre
  v1 and therefore also not affected by the regression.

* To the best of my knowledge this regression may therefore only affect
  MIPS. This is deemed acceptable because unpriv BPF is still disabled
  there by default. As stated in a previous commit, BPF_NOSPEC could be
  implemented for MIPS based on GCC's speculation_barrier
  implementation.

* It is unclear which other architectures (besides x86 64- and 32-bit,
  ARM64, PowerPC 64-bit, LoongArch, and MIPS) supported by the kernel
  are vulnerable to Spectre v1. Also, it is not clear if barriers are
  available on these architectures. Implementing BPF_NOSPEC on these
  architectures therefore is non-trivial. Searching GCC and the kernel
  for speculation barrier implementations for these architectures
  yielded no result.

* If any of those regressed systems is also vulnerable to Spectre v4,
  the system was already vulnerable to Spectre v4 attacks based on
  unpriv BPF before this patch and the impact is therefore further
  limited.

As an alternative to regressing security, one could still reject
programs if the architecture does not emit BPF_NOSPEC (e.g., by removing
the empty BPF_NOSPEC-case from all JITs except for LoongArch where it
appears justified). However, this will cause rejections on these archs
that are likely unfounded in the vast majority of cases.

In the tests, some are now successful where we previously had a
false-positive (i.e., rejection). Change them to reflect where the
nospec should be inserted (using __xlated_unpriv) and modify the error
message if the nospec is able to mitigate a problem that previously
shadowed another problem (in that case __xlated_unpriv does not work,
therefore just add a comment).

Define SPEC_V1 once here to avoid duplicating this ifdef whenever we
check for nospec insns using __xlated_unpriv. This also improves
readability. PowerPC can probably also be added here. However,
omit it for now because the BPF CI currently does not include a test.

Limit it to EPERM, EACCES, and EINVAL (and not everything except for
EFAULT and ENOMEM) as it already has the desired effect for most
real-world programs. Briefly went through all the occurrences of EPERM,
EINVAL, and EACCES in verifier.c to validate that catching them like
this makes sense.

Thanks to Dustin for their help in checking the vendor documentation.

[1] https://lpc.events/event/18/contributions/1954/ ("Mitigating
    Spectre-PHT using Speculation Barriers in Linux eBPF")
[2] https://arxiv.org/pdf/2405.00078 ("VeriFence: Lightweight and
    Precise Spectre Defenses for Untrusted Linux Kernel Extensions")
[3] https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/runtime-speculative-side-channel-mitigations.html
    ("Managed Runtime Speculative Execution Side Channel Mitigations")
[4] https://dl.acm.org/doi/pdf/10.1145/3359789.3359837 ("Speculator: a
    tool to analyze speculative execution attacks and mitigations" -
    Section 4.6 "Stopping Speculative Execution")
[5] https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/programmer-references/software-techniques-for-managing-speculation.pdf
    ("White Paper - SOFTWARE TECHNIQUES FOR MANAGING SPECULATION ON AMD
    PROCESSORS - REVISION 5.09.23")
[6] https://developer.arm.com/documentation/ddi0597/2020-12/Base-Instructions/SB--Speculation-Barrier-
    ("SB - Speculation Barrier - Arm Armv8-A A32/T32 Instruction Set
    Architecture (2020-12)")
[7] https://wiki.raptorcs.com/w/images/5/5f/OPF_PowerISA_v3.1C.pdf
    ("Power ISA™ - Version 3.1C - May 26, 2024 - Section 9.2.1 of Book
    III")
[8] https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/programmer-references/40332.pdf
    ("AMD64 Architecture Programmer’s Manual Volumes 1–5 - Revision 4.08
    - April 2024 - 7.6.4 Serializing Instructions")

Signed-off-by: Luis Gerhorst <luis.gerhorst@fau.de>
Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Acked-by: Henriette Herzog <henriette.herzog@rub.de>
Cc: Dustin Nguyen <nguyen@cs.fau.de>
Cc: Maximilian Ott <ott@cs.fau.de>
Cc: Milan Stephan <milan.stephan@fau.de>
Link: https://lore.kernel.org/r/20250603212428.338473-1-luis.gerhorst@fau.de
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h                       |  1 +
 kernel/bpf/verifier.c                              | 78 ++++++++++++++++++++--
 tools/testing/selftests/bpf/progs/bpf_misc.h       |  4 ++
 tools/testing/selftests/bpf/progs/verifier_and.c   |  8 ++-
 .../testing/selftests/bpf/progs/verifier_bounds.c  | 61 +++++++++++++----
 tools/testing/selftests/bpf/progs/verifier_movsx.c | 16 ++++-
 .../testing/selftests/bpf/progs/verifier_unpriv.c  |  8 ++-
 .../selftests/bpf/progs/verifier_value_ptr_arith.c | 16 +++--
 tools/testing/selftests/bpf/verifier/dead_code.c   |  3 +-
 tools/testing/selftests/bpf/verifier/jmp32.c       | 33 +++------
 tools/testing/selftests/bpf/verifier/jset.c        | 10 ++-
 11 files changed, 184 insertions(+), 54 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 2b0954202226..e6c26393c029 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -580,6 +580,7 @@ struct bpf_insn_aux_data {
 	u64 map_key_state; /* constant (32 bit) key tracking for maps */
 	int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
 	u32 seen; /* this insn was processed by the verifier at env->pass_cnt */
+	bool nospec; /* do not execute this instruction speculatively */
 	bool nospec_result; /* result is unsafe under speculation, nospec must follow */
 	bool zext_dst; /* this insn zero extends dst reg */
 	bool needs_zext; /* alu op needs to clear upper bits */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 79ae0ee395b0..b1f797616f20 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2013,6 +2013,18 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
 	return 0;
 }
 
+static bool error_recoverable_with_nospec(int err)
+{
+	/* Should only return true for non-fatal errors that are allowed to
+	 * occur during speculative verification. For these we can insert a
+	 * nospec and the program might still be accepted. Do not include
+	 * something like ENOMEM because it is likely to re-occur for the next
+	 * architectural path once it has been recovered-from in all speculative
+	 * paths.
+	 */
+	return err == -EPERM || err == -EACCES || err == -EINVAL;
+}
+
 static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
 					     int insn_idx, int prev_insn_idx,
 					     bool speculative)
@@ -11147,7 +11159,7 @@ static int check_get_func_ip(struct bpf_verifier_env *env)
 	return -ENOTSUPP;
 }
 
-static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env)
+static struct bpf_insn_aux_data *cur_aux(const struct bpf_verifier_env *env)
 {
 	return &env->insn_aux_data[env->insn_idx];
 }
@@ -14015,7 +14027,9 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
 static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env,
 				    const struct bpf_insn *insn)
 {
-	return env->bypass_spec_v1 || BPF_SRC(insn->code) == BPF_K;
+	return env->bypass_spec_v1 ||
+		BPF_SRC(insn->code) == BPF_K ||
+		cur_aux(env)->nospec;
 }
 
 static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux,
@@ -19732,10 +19746,41 @@ static int do_check(struct bpf_verifier_env *env)
 		sanitize_mark_insn_seen(env);
 		prev_insn_idx = env->insn_idx;
 
+		/* Reduce verification complexity by stopping speculative path
+		 * verification when a nospec is encountered.
+		 */
+		if (state->speculative && cur_aux(env)->nospec)
+			goto process_bpf_exit;
+
 		err = do_check_insn(env, &do_print_state);
-		if (err < 0) {
+		if (state->speculative && error_recoverable_with_nospec(err)) {
+			/* Prevent this speculative path from ever reaching the
+			 * insn that would have been unsafe to execute.
+			 */
+			cur_aux(env)->nospec = true;
+			/* If it was an ADD/SUB insn, potentially remove any
+			 * markings for alu sanitization.
+			 */
+			cur_aux(env)->alu_state = 0;
+			goto process_bpf_exit;
+		} else if (err < 0) {
 			return err;
 		} else if (err == PROCESS_BPF_EXIT) {
+			goto process_bpf_exit;
+		}
+		WARN_ON_ONCE(err);
+
+		if (state->speculative && cur_aux(env)->nospec_result) {
+			/* If we are on a path that performed a jump-op, this
+			 * may skip a nospec patched-in after the jump. This can
+			 * currently never happen because nospec_result is only
+			 * used for the write-ops
+			 * `*(size*)(dst_reg+off)=src_reg|imm32` which must
+			 * never skip the following insn. Still, add a warning
+			 * to document this in case nospec_result is used
+			 * elsewhere in the future.
+			 */
+			WARN_ON_ONCE(env->insn_idx != prev_insn_idx + 1);
 process_bpf_exit:
 			mark_verifier_state_scratched(env);
 			update_branch_counts(env, env->cur_state);
@@ -19753,7 +19798,6 @@ process_bpf_exit:
 				continue;
 			}
 		}
-		WARN_ON_ONCE(err);
 	}
 
 	return 0;
@@ -20881,6 +20925,29 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		bpf_convert_ctx_access_t convert_ctx_access;
 		u8 mode;
 
+		if (env->insn_aux_data[i + delta].nospec) {
+			WARN_ON_ONCE(env->insn_aux_data[i + delta].alu_state);
+			struct bpf_insn patch[] = {
+				BPF_ST_NOSPEC(),
+				*insn,
+			};
+
+			cnt = ARRAY_SIZE(patch);
+			new_prog = bpf_patch_insn_data(env, i + delta, patch, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta    += cnt - 1;
+			env->prog = new_prog;
+			insn      = new_prog->insnsi + i + delta;
+			/* This can not be easily merged with the
+			 * nospec_result-case, because an insn may require a
+			 * nospec before and after itself. Therefore also do not
+			 * 'continue' here but potentially apply further
+			 * patching to insn. *insn should equal patch[1] now.
+			 */
+		}
+
 		if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) ||
 		    insn->code == (BPF_LDX | BPF_MEM | BPF_H) ||
 		    insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
@@ -20931,6 +20998,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 
 		if (type == BPF_WRITE &&
 		    env->insn_aux_data[i + delta].nospec_result) {
+			/* nospec_result is only used to mitigate Spectre v4 and
+			 * to limit verification-time for Spectre v1.
+			 */
 			struct bpf_insn patch[] = {
 				*insn,
 				BPF_ST_NOSPEC(),
diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h
index 6e208e24ba3b..a678463e972c 100644
--- a/tools/testing/selftests/bpf/progs/bpf_misc.h
+++ b/tools/testing/selftests/bpf/progs/bpf_misc.h
@@ -231,4 +231,8 @@
 #define CAN_USE_LOAD_ACQ_STORE_REL
 #endif
 
+#if defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86)
+#define SPEC_V1
+#endif
+
 #endif
diff --git a/tools/testing/selftests/bpf/progs/verifier_and.c b/tools/testing/selftests/bpf/progs/verifier_and.c
index e97e518516b6..2b4fdca162be 100644
--- a/tools/testing/selftests/bpf/progs/verifier_and.c
+++ b/tools/testing/selftests/bpf/progs/verifier_and.c
@@ -85,8 +85,14 @@ l0_%=:	r0 = r0;					\
 
 SEC("socket")
 __description("check known subreg with unknown reg")
-__success __failure_unpriv __msg_unpriv("R1 !read_ok")
+__success __success_unpriv
 __retval(0)
+#ifdef SPEC_V1
+__xlated_unpriv("if w0 < 0x1 goto pc+2")
+__xlated_unpriv("nospec") /* inserted to prevent `R1 !read_ok` */
+__xlated_unpriv("goto pc-1") /* `r1 = *(u32*)(r1 + 512)`, sanitized dead code */
+__xlated_unpriv("r0 = 0")
+#endif
 __naked void known_subreg_with_unknown_reg(void)
 {
 	asm volatile ("					\
diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c
index 0eb33bb801b5..30e16153fdf1 100644
--- a/tools/testing/selftests/bpf/progs/verifier_bounds.c
+++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c
@@ -620,8 +620,14 @@ l1_%=:	exit;						\
 
 SEC("socket")
 __description("bounds check mixed 32bit and 64bit arithmetic. test1")
-__success __failure_unpriv __msg_unpriv("R0 invalid mem access 'scalar'")
+__success __success_unpriv
 __retval(0)
+#ifdef SPEC_V1
+__xlated_unpriv("goto pc+2")
+__xlated_unpriv("nospec") /* inserted to prevent `R0 invalid mem access 'scalar'` */
+__xlated_unpriv("goto pc-1") /* sanitized dead code */
+__xlated_unpriv("exit")
+#endif
 __naked void _32bit_and_64bit_arithmetic_test1(void)
 {
 	asm volatile ("					\
@@ -643,8 +649,14 @@ l1_%=:	exit;						\
 
 SEC("socket")
 __description("bounds check mixed 32bit and 64bit arithmetic. test2")
-__success __failure_unpriv __msg_unpriv("R0 invalid mem access 'scalar'")
+__success __success_unpriv
 __retval(0)
+#ifdef SPEC_V1
+__xlated_unpriv("goto pc+2")
+__xlated_unpriv("nospec") /* inserted to prevent `R0 invalid mem access 'scalar'` */
+__xlated_unpriv("goto pc-1") /* sanitized dead code */
+__xlated_unpriv("exit")
+#endif
 __naked void _32bit_and_64bit_arithmetic_test2(void)
 {
 	asm volatile ("					\
@@ -691,9 +703,14 @@ l0_%=:	r0 = 0;						\
 
 SEC("socket")
 __description("bounds check for reg = 0, reg xor 1")
-__success __failure_unpriv
-__msg_unpriv("R0 min value is outside of the allowed memory range")
+__success __success_unpriv
 __retval(0)
+#ifdef SPEC_V1
+__xlated_unpriv("if r1 != 0x0 goto pc+2")
+__xlated_unpriv("nospec") /* inserted to prevent `R0 min value is outside of the allowed memory range` */
+__xlated_unpriv("goto pc-1") /* sanitized dead code */
+__xlated_unpriv("r0 = 0")
+#endif
 __naked void reg_0_reg_xor_1(void)
 {
 	asm volatile ("					\
@@ -719,9 +736,14 @@ l1_%=:	r0 = 0;						\
 
 SEC("socket")
 __description("bounds check for reg32 = 0, reg32 xor 1")
-__success __failure_unpriv
-__msg_unpriv("R0 min value is outside of the allowed memory range")
+__success __success_unpriv
 __retval(0)
+#ifdef SPEC_V1
+__xlated_unpriv("if w1 != 0x0 goto pc+2")
+__xlated_unpriv("nospec") /* inserted to prevent `R0 min value is outside of the allowed memory range` */
+__xlated_unpriv("goto pc-1") /* sanitized dead code */
+__xlated_unpriv("r0 = 0")
+#endif
 __naked void reg32_0_reg32_xor_1(void)
 {
 	asm volatile ("					\
@@ -747,9 +769,14 @@ l1_%=:	r0 = 0;						\
 
 SEC("socket")
 __description("bounds check for reg = 2, reg xor 3")
-__success __failure_unpriv
-__msg_unpriv("R0 min value is outside of the allowed memory range")
+__success __success_unpriv
 __retval(0)
+#ifdef SPEC_V1
+__xlated_unpriv("if r1 > 0x0 goto pc+2")
+__xlated_unpriv("nospec") /* inserted to prevent `R0 min value is outside of the allowed memory range` */
+__xlated_unpriv("goto pc-1") /* sanitized dead code */
+__xlated_unpriv("r0 = 0")
+#endif
 __naked void reg_2_reg_xor_3(void)
 {
 	asm volatile ("					\
@@ -829,9 +856,14 @@ l1_%=:	r0 = 0;						\
 
 SEC("socket")
 __description("bounds check for reg > 0, reg xor 3")
-__success __failure_unpriv
-__msg_unpriv("R0 min value is outside of the allowed memory range")
+__success __success_unpriv
 __retval(0)
+#ifdef SPEC_V1
+__xlated_unpriv("if r1 >= 0x0 goto pc+2")
+__xlated_unpriv("nospec") /* inserted to prevent `R0 min value is outside of the allowed memory range` */
+__xlated_unpriv("goto pc-1") /* sanitized dead code */
+__xlated_unpriv("r0 = 0")
+#endif
 __naked void reg_0_reg_xor_3(void)
 {
 	asm volatile ("					\
@@ -858,9 +890,14 @@ l1_%=:	r0 = 0;						\
 
 SEC("socket")
 __description("bounds check for reg32 > 0, reg32 xor 3")
-__success __failure_unpriv
-__msg_unpriv("R0 min value is outside of the allowed memory range")
+__success __success_unpriv
 __retval(0)
+#ifdef SPEC_V1
+__xlated_unpriv("if w1 >= 0x0 goto pc+2")
+__xlated_unpriv("nospec") /* inserted to prevent `R0 min value is outside of the allowed memory range` */
+__xlated_unpriv("goto pc-1") /* sanitized dead code */
+__xlated_unpriv("r0 = 0")
+#endif
 __naked void reg32_0_reg32_xor_3(void)
 {
 	asm volatile ("					\
diff --git a/tools/testing/selftests/bpf/progs/verifier_movsx.c b/tools/testing/selftests/bpf/progs/verifier_movsx.c
index 994bbc346d25..a4d8814eb5ed 100644
--- a/tools/testing/selftests/bpf/progs/verifier_movsx.c
+++ b/tools/testing/selftests/bpf/progs/verifier_movsx.c
@@ -245,7 +245,13 @@ l0_%=:							\
 SEC("socket")
 __description("MOV32SX, S8, var_off not u32_max, positive after s8 extension")
 __success __retval(0)
-__failure_unpriv __msg_unpriv("frame pointer is read only")
+__success_unpriv
+#ifdef SPEC_V1
+__xlated_unpriv("w0 = 0")
+__xlated_unpriv("exit")
+__xlated_unpriv("nospec") /* inserted to prevent `frame pointer is read only` */
+__xlated_unpriv("goto pc-1")
+#endif
 __naked void mov64sx_s32_varoff_2(void)
 {
 	asm volatile ("					\
@@ -267,7 +273,13 @@ l0_%=:							\
 SEC("socket")
 __description("MOV32SX, S8, var_off not u32_max, negative after s8 extension")
 __success __retval(0)
-__failure_unpriv __msg_unpriv("frame pointer is read only")
+__success_unpriv
+#ifdef SPEC_V1
+__xlated_unpriv("w0 = 0")
+__xlated_unpriv("exit")
+__xlated_unpriv("nospec") /* inserted to prevent `frame pointer is read only` */
+__xlated_unpriv("goto pc-1")
+#endif
 __naked void mov64sx_s32_varoff_3(void)
 {
 	asm volatile ("					\
diff --git a/tools/testing/selftests/bpf/progs/verifier_unpriv.c b/tools/testing/selftests/bpf/progs/verifier_unpriv.c
index db52ba66e880..9bd4aef72140 100644
--- a/tools/testing/selftests/bpf/progs/verifier_unpriv.c
+++ b/tools/testing/selftests/bpf/progs/verifier_unpriv.c
@@ -572,8 +572,14 @@ l0_%=:	exit;						\
 
 SEC("socket")
 __description("alu32: mov u32 const")
-__success __failure_unpriv __msg_unpriv("R7 invalid mem access 'scalar'")
+__success __success_unpriv
 __retval(0)
+#ifdef SPEC_V1
+__xlated_unpriv("if r0 == 0x0 goto pc+2")
+__xlated_unpriv("nospec") /* inserted to prevent `R7 invalid mem access 'scalar'` */
+__xlated_unpriv("goto pc-1") /* sanitized dead code */
+__xlated_unpriv("exit")
+#endif
 __naked void alu32_mov_u32_const(void)
 {
 	asm volatile ("					\
diff --git a/tools/testing/selftests/bpf/progs/verifier_value_ptr_arith.c b/tools/testing/selftests/bpf/progs/verifier_value_ptr_arith.c
index 5ba6e53571c8..fcea9819e359 100644
--- a/tools/testing/selftests/bpf/progs/verifier_value_ptr_arith.c
+++ b/tools/testing/selftests/bpf/progs/verifier_value_ptr_arith.c
@@ -398,7 +398,8 @@ l2_%=:	r0 = 1;						\
 
 SEC("socket")
 __description("map access: mixing value pointer and scalar, 1")
-__success __failure_unpriv __msg_unpriv("R2 pointer comparison prohibited")
+__success __failure_unpriv
+__msg_unpriv("R2 tried to add from different maps, paths or scalars, pointer arithmetic with it prohibited for !root")
 __retval(0)
 __naked void value_pointer_and_scalar_1(void)
 {
@@ -433,6 +434,7 @@ l2_%=:	/* common instruction */			\
 l3_%=:	/* branch B */					\
 	r0 = 0x13371337;				\
 	/* verifier follows fall-through */		\
+	/* unpriv: nospec (inserted to prevent `R2 pointer comparison prohibited`) */\
 	if r2 != 0x100000 goto l4_%=;			\
 	r0 = 0;						\
 	exit;						\
@@ -450,7 +452,8 @@ l4_%=:	/* fake-dead code; targeted from branch A to	\
 
 SEC("socket")
 __description("map access: mixing value pointer and scalar, 2")
-__success __failure_unpriv __msg_unpriv("R0 invalid mem access 'scalar'")
+__success __failure_unpriv
+__msg_unpriv("R2 tried to add from different maps, paths or scalars, pointer arithmetic with it prohibited for !root")
 __retval(0)
 __naked void value_pointer_and_scalar_2(void)
 {
@@ -492,6 +495,7 @@ l4_%=:	/* fake-dead code; targeted from branch A to	\
 	 * prevent dead code sanitization, rejected	\
 	 * via branch B however				\
 	 */						\
+	/* unpriv: nospec (inserted to prevent `R0 invalid mem access 'scalar'`) */\
 	r0 = *(u8*)(r0 + 0);				\
 	r0 = 0;						\
 	exit;						\
@@ -1296,9 +1300,13 @@ l0_%=:	r0 = 1;						\
 
 SEC("socket")
 __description("map access: value_ptr -= unknown scalar, 2")
-__success __failure_unpriv
-__msg_unpriv("R0 pointer arithmetic of map value goes out of range")
+__success __success_unpriv
 __retval(1)
+#ifdef SPEC_V1
+__xlated_unpriv("r1 &= 7")
+__xlated_unpriv("nospec") /* inserted to prevent `R0 pointer arithmetic of map value goes out of range` */
+__xlated_unpriv("r0 -= r1")
+#endif
 __naked void value_ptr_unknown_scalar_2_2(void)
 {
 	asm volatile ("					\
diff --git a/tools/testing/selftests/bpf/verifier/dead_code.c b/tools/testing/selftests/bpf/verifier/dead_code.c
index ee454327e5c6..77207b498c6f 100644
--- a/tools/testing/selftests/bpf/verifier/dead_code.c
+++ b/tools/testing/selftests/bpf/verifier/dead_code.c
@@ -2,14 +2,13 @@
 	"dead code: start",
 	.insns = {
 	BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+	/* unpriv: nospec (inserted to prevent "R9 !read_ok") */
 	BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0),
 	BPF_JMP_IMM(BPF_JA, 0, 0, 2),
 	BPF_MOV64_IMM(BPF_REG_0, 7),
 	BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 10, -4),
 	BPF_EXIT_INSN(),
 	},
-	.errstr_unpriv = "R9 !read_ok",
-	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 7,
 },
diff --git a/tools/testing/selftests/bpf/verifier/jmp32.c b/tools/testing/selftests/bpf/verifier/jmp32.c
index 43776f6f92f4..91d83e9cb148 100644
--- a/tools/testing/selftests/bpf/verifier/jmp32.c
+++ b/tools/testing/selftests/bpf/verifier/jmp32.c
@@ -84,11 +84,10 @@
 	BPF_JMP32_IMM(BPF_JSET, BPF_REG_7, 0x10, 1),
 	BPF_EXIT_INSN(),
 	BPF_JMP32_IMM(BPF_JGE, BPF_REG_7, 0x10, 1),
+	/* unpriv: nospec (inserted to prevent "R9 !read_ok") */
 	BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0),
 	BPF_EXIT_INSN(),
 	},
-	.errstr_unpriv = "R9 !read_ok",
-	.result_unpriv = REJECT,
 	.result = ACCEPT,
 },
 {
@@ -149,11 +148,10 @@
 	BPF_JMP32_IMM(BPF_JEQ, BPF_REG_7, 0x10, 1),
 	BPF_EXIT_INSN(),
 	BPF_JMP32_IMM(BPF_JSGE, BPF_REG_7, 0xf, 1),
+	/* unpriv: nospec (inserted to prevent "R9 !read_ok") */
 	BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0),
 	BPF_EXIT_INSN(),
 	},
-	.errstr_unpriv = "R9 !read_ok",
-	.result_unpriv = REJECT,
 	.result = ACCEPT,
 },
 {
@@ -214,11 +212,10 @@
 	BPF_JMP32_IMM(BPF_JNE, BPF_REG_7, 0x10, 1),
 	BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x10, 1),
 	BPF_EXIT_INSN(),
+	/* unpriv: nospec (inserted to prevent "R9 !read_ok") */
 	BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0),
 	BPF_EXIT_INSN(),
 	},
-	.errstr_unpriv = "R9 !read_ok",
-	.result_unpriv = REJECT,
 	.result = ACCEPT,
 },
 {
@@ -283,11 +280,10 @@
 	BPF_JMP32_REG(BPF_JGE, BPF_REG_7, BPF_REG_8, 1),
 	BPF_EXIT_INSN(),
 	BPF_JMP32_IMM(BPF_JGE, BPF_REG_7, 0x7ffffff0, 1),
+	/* unpriv: nospec (inserted to prevent "R0 invalid mem access 'scalar'") */
 	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
-	.errstr_unpriv = "R0 invalid mem access 'scalar'",
-	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 2,
 	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
@@ -354,11 +350,10 @@
 	BPF_JMP32_REG(BPF_JGT, BPF_REG_7, BPF_REG_8, 1),
 	BPF_EXIT_INSN(),
 	BPF_JMP_IMM(BPF_JGT, BPF_REG_7, 0x7ffffff0, 1),
+	/* unpriv: nospec (inserted to prevent "R0 invalid mem access 'scalar'") */
 	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
-	.errstr_unpriv = "R0 invalid mem access 'scalar'",
-	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 2,
 	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
@@ -425,11 +420,10 @@
 	BPF_JMP32_REG(BPF_JLE, BPF_REG_7, BPF_REG_8, 1),
 	BPF_EXIT_INSN(),
 	BPF_JMP32_IMM(BPF_JLE, BPF_REG_7, 0x7ffffff0, 1),
+	/* unpriv: nospec (inserted to prevent "R0 invalid mem access 'scalar'") */
 	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
-	.errstr_unpriv = "R0 invalid mem access 'scalar'",
-	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 2,
 	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
@@ -496,11 +490,10 @@
 	BPF_JMP32_REG(BPF_JLT, BPF_REG_7, BPF_REG_8, 1),
 	BPF_EXIT_INSN(),
 	BPF_JMP_IMM(BPF_JSLT, BPF_REG_7, 0x7ffffff0, 1),
+	/* unpriv: nospec (inserted to prevent "R0 invalid mem access 'scalar'") */
 	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
-	.errstr_unpriv = "R0 invalid mem access 'scalar'",
-	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 2,
 	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
@@ -567,11 +560,10 @@
 	BPF_JMP32_REG(BPF_JSGE, BPF_REG_7, BPF_REG_8, 1),
 	BPF_EXIT_INSN(),
 	BPF_JMP_IMM(BPF_JSGE, BPF_REG_7, 0x7ffffff0, 1),
+	/* unpriv: nospec (inserted to prevent "R0 invalid mem access 'scalar'") */
 	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
-	.errstr_unpriv = "R0 invalid mem access 'scalar'",
-	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 2,
 	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
@@ -638,11 +630,10 @@
 	BPF_JMP32_REG(BPF_JSGT, BPF_REG_7, BPF_REG_8, 1),
 	BPF_EXIT_INSN(),
 	BPF_JMP_IMM(BPF_JSGT, BPF_REG_7, -2, 1),
+	/* unpriv: nospec (inserted to prevent "R0 invalid mem access 'scalar'") */
 	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
-	.errstr_unpriv = "R0 invalid mem access 'scalar'",
-	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 2,
 	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
@@ -709,11 +700,10 @@
 	BPF_JMP32_REG(BPF_JSLE, BPF_REG_7, BPF_REG_8, 1),
 	BPF_EXIT_INSN(),
 	BPF_JMP_IMM(BPF_JSLE, BPF_REG_7, 0x7ffffff0, 1),
+	/* unpriv: nospec (inserted to prevent "R0 invalid mem access 'scalar'") */
 	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
-	.errstr_unpriv = "R0 invalid mem access 'scalar'",
-	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 2,
 	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
@@ -780,11 +770,10 @@
 	BPF_JMP32_REG(BPF_JSLT, BPF_REG_7, BPF_REG_8, 1),
 	BPF_EXIT_INSN(),
 	BPF_JMP32_IMM(BPF_JSLT, BPF_REG_7, -1, 1),
+	/* unpriv: nospec (inserted to prevent "R0 invalid mem access 'scalar'") */
 	BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
-	.errstr_unpriv = "R0 invalid mem access 'scalar'",
-	.result_unpriv = REJECT,
 	.result = ACCEPT,
 	.retval = 2,
 	.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
diff --git a/tools/testing/selftests/bpf/verifier/jset.c b/tools/testing/selftests/bpf/verifier/jset.c
index 11fc68da735e..e901eefd774a 100644
--- a/tools/testing/selftests/bpf/verifier/jset.c
+++ b/tools/testing/selftests/bpf/verifier/jset.c
@@ -78,12 +78,11 @@
 	.insns = {
 	BPF_MOV64_IMM(BPF_REG_0, 1),
 	BPF_JMP_IMM(BPF_JSET, BPF_REG_0, 1, 1),
+	/* unpriv: nospec (inserted to prevent "R9 !read_ok") */
 	BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0),
 	BPF_EXIT_INSN(),
 	},
 	.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
-	.errstr_unpriv = "R9 !read_ok",
-	.result_unpriv = REJECT,
 	.retval = 1,
 	.result = ACCEPT,
 },
@@ -136,13 +135,12 @@
 	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
 	BPF_ALU64_IMM(BPF_OR, BPF_REG_0, 2),
 	BPF_JMP_IMM(BPF_JSET, BPF_REG_0, 3, 1),
+	/* unpriv: nospec (inserted to prevent "R9 !read_ok") */
 	BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0),
 	BPF_MOV64_IMM(BPF_REG_0, 0),
 	BPF_EXIT_INSN(),
 	},
 	.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
-	.errstr_unpriv = "R9 !read_ok",
-	.result_unpriv = REJECT,
 	.result = ACCEPT,
 },
 {
@@ -154,16 +152,16 @@
 	BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0xff),
 	BPF_JMP_IMM(BPF_JSET, BPF_REG_1, 0xf0, 3),
 	BPF_JMP_IMM(BPF_JLT, BPF_REG_1, 0x10, 1),
+	/* unpriv: nospec (inserted to prevent "R9 !read_ok") */
 	BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0),
 	BPF_EXIT_INSN(),
 	BPF_JMP_IMM(BPF_JSET, BPF_REG_1, 0x10, 1),
 	BPF_EXIT_INSN(),
 	BPF_JMP_IMM(BPF_JGE, BPF_REG_1, 0x10, 1),
+	/* unpriv: nospec (inserted to prevent "R9 !read_ok") */
 	BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0),
 	BPF_EXIT_INSN(),
 	},
 	.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
-	.errstr_unpriv = "R9 !read_ok",
-	.result_unpriv = REJECT,
 	.result = ACCEPT,
 },
-- 
cgit v1.2.3


From bee7e3322a2859a80a67077591128323bbc4052f Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Date: Wed, 13 Nov 2024 01:50:18 +0900
Subject: can: bittiming: rename CAN_CTRLMODE_TDC_MASK into
 CAN_CTRLMODE_FD_TDC_MASK

With the introduction of CAN XL, a new CAN_CTRLMODE_XL_TDC_MASK will
be introduced later on. Because CAN_CTRLMODE_TDC_MASK is not part of
the uapi, rename it to CAN_CTRLMODE_FD_TDC_MASK to make it more
explicit that this mask is meant for CAN FD.

Signed-off-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Link: https://patch.msgid.link/20241112165118.586613-10-mailhol.vincent@wanadoo.fr
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev/calc_bittiming.c |  2 +-
 drivers/net/can/dev/netlink.c        | 12 ++++++------
 include/linux/can/bittiming.h        |  2 +-
 include/linux/can/dev.h              |  2 +-
 4 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/net/can/dev/calc_bittiming.c b/drivers/net/can/dev/calc_bittiming.c
index 3809c148fb88..a94bd67c670c 100644
--- a/drivers/net/can/dev/calc_bittiming.c
+++ b/drivers/net/can/dev/calc_bittiming.c
@@ -179,7 +179,7 @@ void can_calc_tdco(struct can_tdc *tdc, const struct can_tdc_const *tdc_const,
 	if (!tdc_const || !(ctrlmode_supported & CAN_CTRLMODE_TDC_AUTO))
 		return;
 
-	*ctrlmode &= ~CAN_CTRLMODE_TDC_MASK;
+	*ctrlmode &= ~CAN_CTRLMODE_FD_TDC_MASK;
 
 	/* As specified in ISO 11898-1 section 11.3.3 "Transmitter
 	 * delay compensation" (TDC) is only applicable if data BRP is
diff --git a/drivers/net/can/dev/netlink.c b/drivers/net/can/dev/netlink.c
index 4ebd5181aea9..08261cfcf6b2 100644
--- a/drivers/net/can/dev/netlink.c
+++ b/drivers/net/can/dev/netlink.c
@@ -67,12 +67,12 @@ static int can_validate(struct nlattr *tb[], struct nlattr *data[],
 
 	if (data[IFLA_CAN_CTRLMODE]) {
 		struct can_ctrlmode *cm = nla_data(data[IFLA_CAN_CTRLMODE]);
-		u32 tdc_flags = cm->flags & CAN_CTRLMODE_TDC_MASK;
+		u32 tdc_flags = cm->flags & CAN_CTRLMODE_FD_TDC_MASK;
 
 		is_can_fd = cm->flags & cm->mask & CAN_CTRLMODE_FD;
 
 		/* CAN_CTRLMODE_TDC_{AUTO,MANUAL} are mutually exclusive */
-		if (tdc_flags == CAN_CTRLMODE_TDC_MASK)
+		if (tdc_flags == CAN_CTRLMODE_FD_TDC_MASK)
 			return -EOPNOTSUPP;
 		/* If one of the CAN_CTRLMODE_TDC_* flag is set then
 		 * TDC must be set and vice-versa
@@ -230,16 +230,16 @@ static int can_changelink(struct net_device *dev, struct nlattr *tb[],
 			dev->mtu = CAN_MTU;
 			memset(&priv->fd.data_bittiming, 0,
 			       sizeof(priv->fd.data_bittiming));
-			priv->ctrlmode &= ~CAN_CTRLMODE_TDC_MASK;
+			priv->ctrlmode &= ~CAN_CTRLMODE_FD_TDC_MASK;
 			memset(&priv->fd.tdc, 0, sizeof(priv->fd.tdc));
 		}
 
-		tdc_mask = cm->mask & CAN_CTRLMODE_TDC_MASK;
+		tdc_mask = cm->mask & CAN_CTRLMODE_FD_TDC_MASK;
 		/* CAN_CTRLMODE_TDC_{AUTO,MANUAL} are mutually
 		 * exclusive: make sure to turn the other one off
 		 */
 		if (tdc_mask)
-			priv->ctrlmode &= cm->flags | ~CAN_CTRLMODE_TDC_MASK;
+			priv->ctrlmode &= cm->flags | ~CAN_CTRLMODE_FD_TDC_MASK;
 	}
 
 	if (data[IFLA_CAN_BITTIMING]) {
@@ -339,7 +339,7 @@ static int can_changelink(struct net_device *dev, struct nlattr *tb[],
 			err = can_tdc_changelink(priv, data[IFLA_CAN_TDC],
 						 extack);
 			if (err) {
-				priv->ctrlmode &= ~CAN_CTRLMODE_TDC_MASK;
+				priv->ctrlmode &= ~CAN_CTRLMODE_FD_TDC_MASK;
 				return err;
 			}
 		} else if (!tdc_mask) {
diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h
index 9b8a9c39614b..5dfdbb63b1d5 100644
--- a/include/linux/can/bittiming.h
+++ b/include/linux/can/bittiming.h
@@ -14,7 +14,7 @@
 #define CAN_BITRATE_UNSET 0
 #define CAN_BITRATE_UNKNOWN (-1U)
 
-#define CAN_CTRLMODE_TDC_MASK					\
+#define CAN_CTRLMODE_FD_TDC_MASK				\
 	(CAN_CTRLMODE_TDC_AUTO | CAN_CTRLMODE_TDC_MANUAL)
 
 /*
diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index 492d23bec7be..e492dfa8a472 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -93,7 +93,7 @@ struct can_priv {
 
 static inline bool can_tdc_is_enabled(const struct can_priv *priv)
 {
-	return !!(priv->ctrlmode & CAN_CTRLMODE_TDC_MASK);
+	return !!(priv->ctrlmode & CAN_CTRLMODE_FD_TDC_MASK);
 }
 
 /*
-- 
cgit v1.2.3


From 23c0dc95bfa86503eed9fa99423fa0bb39a3bcb0 Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Date: Wed, 13 Nov 2024 01:50:19 +0900
Subject: can: bittiming: rename can_tdc_is_enabled() into
 can_fd_tdc_is_enabled()

With the introduction of CAN XL, a new can_xl_tdc_is_enabled() helper
function will be introduced later on. Rename can_tdc_is_enabled() into
can_fd_tdc_is_enabled() to make it more explicit that this helper is
meant for CAN FD.

Signed-off-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Link: https://patch.msgid.link/20241112165118.586613-11-mailhol.vincent@wanadoo.fr
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev/netlink.c             | 6 +++---
 drivers/net/can/usb/etas_es58x/es58x_fd.c | 2 +-
 drivers/net/can/xilinx_can.c              | 2 +-
 include/linux/can/dev.h                   | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/net/can/dev/netlink.c b/drivers/net/can/dev/netlink.c
index 08261cfcf6b2..16b0f326c143 100644
--- a/drivers/net/can/dev/netlink.c
+++ b/drivers/net/can/dev/netlink.c
@@ -144,7 +144,7 @@ static int can_tdc_changelink(struct can_priv *priv, const struct nlattr *nla,
 	const struct can_tdc_const *tdc_const = priv->fd.tdc_const;
 	int err;
 
-	if (!tdc_const || !can_tdc_is_enabled(priv))
+	if (!tdc_const || !can_fd_tdc_is_enabled(priv))
 		return -EOPNOTSUPP;
 
 	err = nla_parse_nested(tb_tdc, IFLA_CAN_TDC_MAX, nla,
@@ -409,7 +409,7 @@ static size_t can_tdc_get_size(const struct net_device *dev)
 		size += nla_total_size(sizeof(u32));	/* IFLA_CAN_TDCF_MAX */
 	}
 
-	if (can_tdc_is_enabled(priv)) {
+	if (can_fd_tdc_is_enabled(priv)) {
 		if (priv->ctrlmode & CAN_CTRLMODE_TDC_MANUAL ||
 		    priv->fd.do_get_auto_tdcv)
 			size += nla_total_size(sizeof(u32));	/* IFLA_CAN_TDCV */
@@ -490,7 +490,7 @@ static int can_tdc_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	     nla_put_u32(skb, IFLA_CAN_TDC_TDCF_MAX, tdc_const->tdcf_max)))
 		goto err_cancel;
 
-	if (can_tdc_is_enabled(priv)) {
+	if (can_fd_tdc_is_enabled(priv)) {
 		u32 tdcv;
 		int err = -EINVAL;
 
diff --git a/drivers/net/can/usb/etas_es58x/es58x_fd.c b/drivers/net/can/usb/etas_es58x/es58x_fd.c
index d924b053677b..6476add1c105 100644
--- a/drivers/net/can/usb/etas_es58x/es58x_fd.c
+++ b/drivers/net/can/usb/etas_es58x/es58x_fd.c
@@ -429,7 +429,7 @@ static int es58x_fd_enable_channel(struct es58x_priv *priv)
 		es58x_fd_convert_bittiming(&tx_conf_msg.data_bittiming,
 					   &priv->can.fd.data_bittiming);
 
-		if (can_tdc_is_enabled(&priv->can)) {
+		if (can_fd_tdc_is_enabled(&priv->can)) {
 			tx_conf_msg.tdc_enabled = 1;
 			tx_conf_msg.tdco = cpu_to_le16(priv->can.fd.tdc.tdco);
 			tx_conf_msg.tdcf = cpu_to_le16(priv->can.fd.tdc.tdcf);
diff --git a/drivers/net/can/xilinx_can.c b/drivers/net/can/xilinx_can.c
index 3f2e378199ab..81baec8eb1e5 100644
--- a/drivers/net/can/xilinx_can.c
+++ b/drivers/net/can/xilinx_can.c
@@ -515,7 +515,7 @@ static int xcan_set_bittiming(struct net_device *ndev)
 	    priv->devtype.cantype == XAXI_CANFD_2_0) {
 		/* Setting Baud Rate prescaler value in F_BRPR Register */
 		btr0 = dbt->brp - 1;
-		if (can_tdc_is_enabled(&priv->can)) {
+		if (can_fd_tdc_is_enabled(&priv->can)) {
 			if (priv->devtype.cantype == XAXI_CANFD)
 				btr0 |= FIELD_PREP(XCAN_BRPR_TDCO_MASK, priv->can.fd.tdc.tdco) |
 					XCAN_BRPR_TDC_ENABLE;
diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h
index e492dfa8a472..9a92cbe5b2cb 100644
--- a/include/linux/can/dev.h
+++ b/include/linux/can/dev.h
@@ -91,7 +91,7 @@ struct can_priv {
 				   struct can_berr_counter *bec);
 };
 
-static inline bool can_tdc_is_enabled(const struct can_priv *priv)
+static inline bool can_fd_tdc_is_enabled(const struct can_priv *priv)
 {
 	return !!(priv->ctrlmode & CAN_CTRLMODE_FD_TDC_MASK);
 }
-- 
cgit v1.2.3


From a5589313383074c48a1b3751d592a6e084ae0573 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Sun, 1 Jun 2025 00:21:39 +0300
Subject: gpiolib: Remove unused devm_gpio_request()

Remove devm_gpio_request() due to lack of users.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20250531212331.3635269-3-andriy.shevchenko@linux.intel.com
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 Documentation/driver-api/driver-model/devres.rst |  1 -
 drivers/gpio/gpiolib-legacy.c                    | 38 ------------------------
 include/linux/gpio.h                             |  8 -----
 3 files changed, 47 deletions(-)

(limited to 'include')

diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst
index 3d56f94ac2ee..2b36ebde9cec 100644
--- a/Documentation/driver-api/driver-model/devres.rst
+++ b/Documentation/driver-api/driver-model/devres.rst
@@ -275,7 +275,6 @@ GPIO
   devm_gpiod_put()
   devm_gpiod_unhinge()
   devm_gpiochip_add_data()
-  devm_gpio_request()
   devm_gpio_request_one()
 
 I2C
diff --git a/drivers/gpio/gpiolib-legacy.c b/drivers/gpio/gpiolib-legacy.c
index aeae6df8bec9..3bc93ccadb5b 100644
--- a/drivers/gpio/gpiolib-legacy.c
+++ b/drivers/gpio/gpiolib-legacy.c
@@ -85,44 +85,6 @@ static void devm_gpio_release(struct device *dev, void *res)
 	gpio_free(*gpio);
 }
 
-/**
- * devm_gpio_request - request a GPIO for a managed device
- * @dev: device to request the GPIO for
- * @gpio: GPIO to allocate
- * @label: the name of the requested GPIO
- *
- * Except for the extra @dev argument, this function takes the
- * same arguments and performs the same function as gpio_request().
- * GPIOs requested with this function will be automatically freed
- * on driver detach.
- *
- * **DEPRECATED** This function is deprecated and must not be used in new code.
- *
- * Returns:
- * 0 on success, or negative errno on failure.
- */
-int devm_gpio_request(struct device *dev, unsigned gpio, const char *label)
-{
-	unsigned *dr;
-	int rc;
-
-	dr = devres_alloc(devm_gpio_release, sizeof(unsigned), GFP_KERNEL);
-	if (!dr)
-		return -ENOMEM;
-
-	rc = gpio_request(gpio, label);
-	if (rc) {
-		devres_free(dr);
-		return rc;
-	}
-
-	*dr = gpio;
-	devres_add(dev, dr);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(devm_gpio_request);
-
 /**
  * devm_gpio_request_one - request a single GPIO with initial setup
  * @dev: device to request for
diff --git a/include/linux/gpio.h b/include/linux/gpio.h
index c1ec62c11ed3..61e9f43c082a 100644
--- a/include/linux/gpio.h
+++ b/include/linux/gpio.h
@@ -110,7 +110,6 @@ static inline int gpio_to_irq(unsigned gpio)
 
 int gpio_request_one(unsigned gpio, unsigned long flags, const char *label);
 
-int devm_gpio_request(struct device *dev, unsigned gpio, const char *label);
 int devm_gpio_request_one(struct device *dev, unsigned gpio,
 			  unsigned long flags, const char *label);
 
@@ -188,13 +187,6 @@ static inline int gpio_to_irq(unsigned gpio)
 	return -EINVAL;
 }
 
-static inline int devm_gpio_request(struct device *dev, unsigned gpio,
-				    const char *label)
-{
-	WARN_ON(1);
-	return -EINVAL;
-}
-
 static inline int devm_gpio_request_one(struct device *dev, unsigned gpio,
 					unsigned long flags, const char *label)
 {
-- 
cgit v1.2.3


From 9b4d4c952e28f97c5e653c8b9453690f7e63cc5a Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Sat, 31 May 2025 22:55:43 +0300
Subject: gpio: Remove unused 'struct gpio' definition

There is no user for the legacy 'struct gpio', remove it for good.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20250531195801.3632110-2-andriy.shevchenko@linux.intel.com
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 include/linux/gpio.h | 12 ------------
 1 file changed, 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/gpio.h b/include/linux/gpio.h
index 61e9f43c082a..8697ab817898 100644
--- a/include/linux/gpio.h
+++ b/include/linux/gpio.h
@@ -21,18 +21,6 @@ struct device;
 #define GPIOF_OUT_INIT_LOW	((0 << 0) | (0 << 1))
 #define GPIOF_OUT_INIT_HIGH	((0 << 0) | (1 << 1))
 
-/**
- * struct gpio - a structure describing a GPIO with configuration
- * @gpio:	the GPIO number
- * @flags:	GPIO configuration as specified by GPIOF_*
- * @label:	a literal description string of this GPIO
- */
-struct gpio {
-	unsigned	gpio;
-	unsigned long	flags;
-	const char	*label;
-};
-
 #ifdef CONFIG_GPIOLIB
 
 #include <linux/gpio/consumer.h>
-- 
cgit v1.2.3


From 6595ea2761df191c2ec500d5f54b57592b969f5c Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Sat, 31 May 2025 22:55:44 +0300
Subject: gpiolib: Move GPIO_DYNAMIC_* constants to its only user

There is no need to export the GPIO_DYNAMIC_* constants, especially via a
legacy header which is subject to removal. Move the mentioned constants
to their only user, i.e. gpiolib.c.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20250531195801.3632110-3-andriy.shevchenko@linux.intel.com
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 drivers/gpio/gpiolib.c | 13 +++++++++++++
 include/linux/gpio.h   | 13 -------------
 2 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index fdafa0df1b43..5b0b4fc97543 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -74,6 +74,19 @@ static const struct bus_type gpio_bus_type = {
 	.match = gpio_bus_match,
 };
 
+/*
+ * At the end we want all GPIOs to be dynamically allocated from 0.
+ * However, some legacy drivers still perform fixed allocation.
+ * Until they are all fixed, leave 0-512 space for them.
+ */
+#define GPIO_DYNAMIC_BASE	512
+/*
+ * Define the maximum of the possible GPIO in the global numberspace.
+ * While the GPIO base and numbers are positive, we limit it with signed
+ * maximum as a lot of code is using negative values for special cases.
+ */
+#define GPIO_DYNAMIC_MAX	INT_MAX
+
 /*
  * Number of GPIOs to use for the fast path in set array
  */
diff --git a/include/linux/gpio.h b/include/linux/gpio.h
index 8697ab817898..ff99ed76fdc3 100644
--- a/include/linux/gpio.h
+++ b/include/linux/gpio.h
@@ -45,19 +45,6 @@ static inline bool gpio_is_valid(int number)
  * extra memory (for code and for per-GPIO table entries).
  */
 
-/*
- * At the end we want all GPIOs to be dynamically allocated from 0.
- * However, some legacy drivers still perform fixed allocation.
- * Until they are all fixed, leave 0-512 space for them.
- */
-#define GPIO_DYNAMIC_BASE	512
-/*
- * Define the maximum of the possible GPIO in the global numberspace.
- * While the GPIO base and numbers are positive, we limit it with signed
- * maximum as a lot of code is using negative values for special cases.
- */
-#define GPIO_DYNAMIC_MAX	INT_MAX
-
 /* Always use the library code for GPIO management calls,
  * or when sleeping may be involved.
  */
-- 
cgit v1.2.3


From 127c49624a0980ee7b8a5ba9094d6942332a48da Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Wed, 4 Jun 2025 18:06:04 +0200
Subject: can: add drop reasons in the receive path of AF_CAN

Besides the existing pr_warn_once(), use skb drop reasons in case the AF_CAN
layer drops non-conformant CAN{,FD,XL} frames, or conformant frames
received by "wrong" devices, so that it's possible to debug (and count)
such events using existing tracepoints:

| # perf record -e skb:kfree_skb -aR -- ./drv/canfdtest -v -g -l 1 vcan0
| # perf script
| [...]
| canfdtest  1123 [000]  3893.271264: skb:kfree_skb: skbaddr=0xffff975703c9f700 rx_sk=(nil) protocol=12 location=can_rcv+0x4b  reason: CAN_RX_INVALID_FRAME

Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Link: https://patch.msgid.link/20250604160605.1005704-2-dcaratti@redhat.com
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 include/net/dropreason-core.h | 18 ++++++++++++++++++
 net/can/af_can.c              |  6 +++---
 2 files changed, 21 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
index bcf9d7467e1a..b9e78290269e 100644
--- a/include/net/dropreason-core.h
+++ b/include/net/dropreason-core.h
@@ -121,6 +121,9 @@
 	FN(ARP_PVLAN_DISABLE)		\
 	FN(MAC_IEEE_MAC_CONTROL)	\
 	FN(BRIDGE_INGRESS_STP_STATE)	\
+	FN(CAN_RX_INVALID_FRAME)	\
+	FN(CANFD_RX_INVALID_FRAME)	\
+	FN(CANXL_RX_INVALID_FRAME)	\
 	FNe(MAX)
 
 /**
@@ -573,6 +576,21 @@ enum skb_drop_reason {
 	 * ingress bridge port does not allow frames to be forwarded.
 	 */
 	SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE,
+	/**
+	 * @SKB_DROP_REASON_CAN_RX_INVALID_FRAME: received
+	 * non conform CAN frame (or device is unable to receive CAN frames)
+	 */
+	SKB_DROP_REASON_CAN_RX_INVALID_FRAME,
+	/**
+	 * @SKB_DROP_REASON_CANFD_RX_INVALID_FRAME: received
+	 * non conform CAN-FD frame (or device is unable to receive CAN frames)
+	 */
+	SKB_DROP_REASON_CANFD_RX_INVALID_FRAME,
+	/**
+	 * @SKB_DROP_REASON_CANXL_RX_INVALID_FRAME: received
+	 * non conform CAN-XL frame (or device is unable to receive CAN frames)
+	 */
+	SKB_DROP_REASON_CANXL_RX_INVALID_FRAME,
 	/**
 	 * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which
 	 * shouldn't be used as a real 'reason' - only for tracing code gen
diff --git a/net/can/af_can.c b/net/can/af_can.c
index 4aab7033c933..b2387a46794a 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -683,7 +683,7 @@ static int can_rcv(struct sk_buff *skb, struct net_device *dev,
 		pr_warn_once("PF_CAN: dropped non conform CAN skbuff: dev type %d, len %d\n",
 			     dev->type, skb->len);
 
-		kfree_skb(skb);
+		kfree_skb_reason(skb, SKB_DROP_REASON_CAN_RX_INVALID_FRAME);
 		return NET_RX_DROP;
 	}
 
@@ -698,7 +698,7 @@ static int canfd_rcv(struct sk_buff *skb, struct net_device *dev,
 		pr_warn_once("PF_CAN: dropped non conform CAN FD skbuff: dev type %d, len %d\n",
 			     dev->type, skb->len);
 
-		kfree_skb(skb);
+		kfree_skb_reason(skb, SKB_DROP_REASON_CANFD_RX_INVALID_FRAME);
 		return NET_RX_DROP;
 	}
 
@@ -713,7 +713,7 @@ static int canxl_rcv(struct sk_buff *skb, struct net_device *dev,
 		pr_warn_once("PF_CAN: dropped non conform CAN XL skbuff: dev type %d, len %d\n",
 			     dev->type, skb->len);
 
-		kfree_skb(skb);
+		kfree_skb_reason(skb, SKB_DROP_REASON_CANXL_RX_INVALID_FRAME);
 		return NET_RX_DROP;
 	}
 
-- 
cgit v1.2.3


From 4a59e02a5aa178f80d065ad0368d29f25620c334 Mon Sep 17 00:00:00 2001
From: Claudiu Beznea <claudiu.beznea.uj@bp.renesas.com>
Date: Tue, 27 May 2025 14:24:02 +0300
Subject: dt-bindings: clock: rzg2l: Drop power domain IDs

Since the configuration order between the individual MSTOP and CLKON
bits cannot be preserved with the power domain abstraction, drop the
power domain IDs.  The corresponding code has also been removed.
Currently, there are no device tree users for these IDs.

Acked-by: "Rob Herring (Arm)" <robh@kernel.org>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Claudiu Beznea <claudiu.beznea.uj@bp.renesas.com>
Link: https://lore.kernel.org/20250527112403.1254122-8-claudiu.beznea.uj@bp.renesas.com
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
---
 include/dt-bindings/clock/r9a07g043-cpg.h | 53 -----------------------
 include/dt-bindings/clock/r9a07g044-cpg.h | 58 -------------------------
 include/dt-bindings/clock/r9a07g054-cpg.h | 58 -------------------------
 include/dt-bindings/clock/r9a08g045-cpg.h | 71 -------------------------------
 4 files changed, 240 deletions(-)

(limited to 'include')

diff --git a/include/dt-bindings/clock/r9a07g043-cpg.h b/include/dt-bindings/clock/r9a07g043-cpg.h
index 131993343777..e1f65f1928cf 100644
--- a/include/dt-bindings/clock/r9a07g043-cpg.h
+++ b/include/dt-bindings/clock/r9a07g043-cpg.h
@@ -200,57 +200,4 @@
 #define R9A07G043_AX45MP_CORE0_RESETN	78	/* RZ/Five Only */
 #define R9A07G043_IAX45_RESETN		79	/* RZ/Five Only */
 
-/* Power domain IDs. */
-#define R9A07G043_PD_ALWAYS_ON		0
-#define R9A07G043_PD_GIC		1	/* RZ/G2UL Only */
-#define R9A07G043_PD_IA55		2	/* RZ/G2UL Only */
-#define R9A07G043_PD_MHU		3	/* RZ/G2UL Only */
-#define R9A07G043_PD_CORESIGHT		4	/* RZ/G2UL Only */
-#define R9A07G043_PD_SYC		5	/* RZ/G2UL Only */
-#define R9A07G043_PD_DMAC		6
-#define R9A07G043_PD_GTM0		7
-#define R9A07G043_PD_GTM1		8
-#define R9A07G043_PD_GTM2		9
-#define R9A07G043_PD_MTU		10
-#define R9A07G043_PD_POE3		11
-#define R9A07G043_PD_WDT0		12
-#define R9A07G043_PD_SPI		13
-#define R9A07G043_PD_SDHI0		14
-#define R9A07G043_PD_SDHI1		15
-#define R9A07G043_PD_ISU		16	/* RZ/G2UL Only */
-#define R9A07G043_PD_CRU		17	/* RZ/G2UL Only */
-#define R9A07G043_PD_LCDC		18	/* RZ/G2UL Only */
-#define R9A07G043_PD_SSI0		19
-#define R9A07G043_PD_SSI1		20
-#define R9A07G043_PD_SSI2		21
-#define R9A07G043_PD_SSI3		22
-#define R9A07G043_PD_SRC		23
-#define R9A07G043_PD_USB0		24
-#define R9A07G043_PD_USB1		25
-#define R9A07G043_PD_USB_PHY		26
-#define R9A07G043_PD_ETHER0		27
-#define R9A07G043_PD_ETHER1		28
-#define R9A07G043_PD_I2C0		29
-#define R9A07G043_PD_I2C1		30
-#define R9A07G043_PD_I2C2		31
-#define R9A07G043_PD_I2C3		32
-#define R9A07G043_PD_SCIF0		33
-#define R9A07G043_PD_SCIF1		34
-#define R9A07G043_PD_SCIF2		35
-#define R9A07G043_PD_SCIF3		36
-#define R9A07G043_PD_SCIF4		37
-#define R9A07G043_PD_SCI0		38
-#define R9A07G043_PD_SCI1		39
-#define R9A07G043_PD_IRDA		40
-#define R9A07G043_PD_RSPI0		41
-#define R9A07G043_PD_RSPI1		42
-#define R9A07G043_PD_RSPI2		43
-#define R9A07G043_PD_CANFD		44
-#define R9A07G043_PD_ADC		45
-#define R9A07G043_PD_TSU		46
-#define R9A07G043_PD_PLIC		47	/* RZ/Five Only */
-#define R9A07G043_PD_IAX45		48	/* RZ/Five Only */
-#define R9A07G043_PD_NCEPLDM		49	/* RZ/Five Only */
-#define R9A07G043_PD_NCEPLMT		50	/* RZ/Five Only */
-
 #endif /* __DT_BINDINGS_CLOCK_R9A07G043_CPG_H__ */
diff --git a/include/dt-bindings/clock/r9a07g044-cpg.h b/include/dt-bindings/clock/r9a07g044-cpg.h
index e209f96f92b7..0bb17ff1a01a 100644
--- a/include/dt-bindings/clock/r9a07g044-cpg.h
+++ b/include/dt-bindings/clock/r9a07g044-cpg.h
@@ -217,62 +217,4 @@
 #define R9A07G044_ADC_ADRST_N		82
 #define R9A07G044_TSU_PRESETN		83
 
-/* Power domain IDs. */
-#define R9A07G044_PD_ALWAYS_ON		0
-#define R9A07G044_PD_GIC		1
-#define R9A07G044_PD_IA55		2
-#define R9A07G044_PD_MHU		3
-#define R9A07G044_PD_CORESIGHT		4
-#define R9A07G044_PD_SYC		5
-#define R9A07G044_PD_DMAC		6
-#define R9A07G044_PD_GTM0		7
-#define R9A07G044_PD_GTM1		8
-#define R9A07G044_PD_GTM2		9
-#define R9A07G044_PD_MTU		10
-#define R9A07G044_PD_POE3		11
-#define R9A07G044_PD_GPT		12
-#define R9A07G044_PD_POEGA		13
-#define R9A07G044_PD_POEGB		14
-#define R9A07G044_PD_POEGC		15
-#define R9A07G044_PD_POEGD		16
-#define R9A07G044_PD_WDT0		17
-#define R9A07G044_PD_WDT1		18
-#define R9A07G044_PD_SPI		19
-#define R9A07G044_PD_SDHI0		20
-#define R9A07G044_PD_SDHI1		21
-#define R9A07G044_PD_3DGE		22
-#define R9A07G044_PD_ISU		23
-#define R9A07G044_PD_VCPL4		24
-#define R9A07G044_PD_CRU		25
-#define R9A07G044_PD_MIPI_DSI		26
-#define R9A07G044_PD_LCDC		27
-#define R9A07G044_PD_SSI0		28
-#define R9A07G044_PD_SSI1		29
-#define R9A07G044_PD_SSI2		30
-#define R9A07G044_PD_SSI3		31
-#define R9A07G044_PD_SRC		32
-#define R9A07G044_PD_USB0		33
-#define R9A07G044_PD_USB1		34
-#define R9A07G044_PD_USB_PHY		35
-#define R9A07G044_PD_ETHER0		36
-#define R9A07G044_PD_ETHER1		37
-#define R9A07G044_PD_I2C0		38
-#define R9A07G044_PD_I2C1		39
-#define R9A07G044_PD_I2C2		40
-#define R9A07G044_PD_I2C3		41
-#define R9A07G044_PD_SCIF0		42
-#define R9A07G044_PD_SCIF1		43
-#define R9A07G044_PD_SCIF2		44
-#define R9A07G044_PD_SCIF3		45
-#define R9A07G044_PD_SCIF4		46
-#define R9A07G044_PD_SCI0		47
-#define R9A07G044_PD_SCI1		48
-#define R9A07G044_PD_IRDA		49
-#define R9A07G044_PD_RSPI0		50
-#define R9A07G044_PD_RSPI1		51
-#define R9A07G044_PD_RSPI2		52
-#define R9A07G044_PD_CANFD		53
-#define R9A07G044_PD_ADC		54
-#define R9A07G044_PD_TSU		55
-
 #endif /* __DT_BINDINGS_CLOCK_R9A07G044_CPG_H__ */
diff --git a/include/dt-bindings/clock/r9a07g054-cpg.h b/include/dt-bindings/clock/r9a07g054-cpg.h
index 2c99f89397c4..43f4dbda872c 100644
--- a/include/dt-bindings/clock/r9a07g054-cpg.h
+++ b/include/dt-bindings/clock/r9a07g054-cpg.h
@@ -226,62 +226,4 @@
 #define R9A07G054_TSU_PRESETN		83
 #define R9A07G054_STPAI_ARESETN		84
 
-/* Power domain IDs. */
-#define R9A07G054_PD_ALWAYS_ON		0
-#define R9A07G054_PD_GIC		1
-#define R9A07G054_PD_IA55		2
-#define R9A07G054_PD_MHU		3
-#define R9A07G054_PD_CORESIGHT		4
-#define R9A07G054_PD_SYC		5
-#define R9A07G054_PD_DMAC		6
-#define R9A07G054_PD_GTM0		7
-#define R9A07G054_PD_GTM1		8
-#define R9A07G054_PD_GTM2		9
-#define R9A07G054_PD_MTU		10
-#define R9A07G054_PD_POE3		11
-#define R9A07G054_PD_GPT		12
-#define R9A07G054_PD_POEGA		13
-#define R9A07G054_PD_POEGB		14
-#define R9A07G054_PD_POEGC		15
-#define R9A07G054_PD_POEGD		16
-#define R9A07G054_PD_WDT0		17
-#define R9A07G054_PD_WDT1		18
-#define R9A07G054_PD_SPI		19
-#define R9A07G054_PD_SDHI0		20
-#define R9A07G054_PD_SDHI1		21
-#define R9A07G054_PD_3DGE		22
-#define R9A07G054_PD_ISU		23
-#define R9A07G054_PD_VCPL4		24
-#define R9A07G054_PD_CRU		25
-#define R9A07G054_PD_MIPI_DSI		26
-#define R9A07G054_PD_LCDC		27
-#define R9A07G054_PD_SSI0		28
-#define R9A07G054_PD_SSI1		29
-#define R9A07G054_PD_SSI2		30
-#define R9A07G054_PD_SSI3		31
-#define R9A07G054_PD_SRC		32
-#define R9A07G054_PD_USB0		33
-#define R9A07G054_PD_USB1		34
-#define R9A07G054_PD_USB_PHY		35
-#define R9A07G054_PD_ETHER0		36
-#define R9A07G054_PD_ETHER1		37
-#define R9A07G054_PD_I2C0		38
-#define R9A07G054_PD_I2C1		39
-#define R9A07G054_PD_I2C2		40
-#define R9A07G054_PD_I2C3		41
-#define R9A07G054_PD_SCIF0		42
-#define R9A07G054_PD_SCIF1		43
-#define R9A07G054_PD_SCIF2		44
-#define R9A07G054_PD_SCIF3		45
-#define R9A07G054_PD_SCIF4		46
-#define R9A07G054_PD_SCI0		47
-#define R9A07G054_PD_SCI1		48
-#define R9A07G054_PD_IRDA		49
-#define R9A07G054_PD_RSPI0		50
-#define R9A07G054_PD_RSPI1		51
-#define R9A07G054_PD_RSPI2		52
-#define R9A07G054_PD_CANFD		53
-#define R9A07G054_PD_ADC		54
-#define R9A07G054_PD_TSU		55
-
 #endif /* __DT_BINDINGS_CLOCK_R9A07G054_CPG_H__ */
diff --git a/include/dt-bindings/clock/r9a08g045-cpg.h b/include/dt-bindings/clock/r9a08g045-cpg.h
index 311521fe4b59..410725b778a8 100644
--- a/include/dt-bindings/clock/r9a08g045-cpg.h
+++ b/include/dt-bindings/clock/r9a08g045-cpg.h
@@ -239,75 +239,4 @@
 #define R9A08G045_I3C_PRESETN		92
 #define R9A08G045_VBAT_BRESETN		93
 
-/* Power domain IDs. */
-#define R9A08G045_PD_ALWAYS_ON		0
-#define R9A08G045_PD_GIC		1
-#define R9A08G045_PD_IA55		2
-#define R9A08G045_PD_MHU		3
-#define R9A08G045_PD_CORESIGHT		4
-#define R9A08G045_PD_SYC		5
-#define R9A08G045_PD_DMAC		6
-#define R9A08G045_PD_GTM0		7
-#define R9A08G045_PD_GTM1		8
-#define R9A08G045_PD_GTM2		9
-#define R9A08G045_PD_GTM3		10
-#define R9A08G045_PD_GTM4		11
-#define R9A08G045_PD_GTM5		12
-#define R9A08G045_PD_GTM6		13
-#define R9A08G045_PD_GTM7		14
-#define R9A08G045_PD_MTU		15
-#define R9A08G045_PD_POE3		16
-#define R9A08G045_PD_GPT		17
-#define R9A08G045_PD_POEGA		18
-#define R9A08G045_PD_POEGB		19
-#define R9A08G045_PD_POEGC		20
-#define R9A08G045_PD_POEGD		21
-#define R9A08G045_PD_WDT0		22
-#define R9A08G045_PD_XSPI		23
-#define R9A08G045_PD_SDHI0		24
-#define R9A08G045_PD_SDHI1		25
-#define R9A08G045_PD_SDHI2		26
-#define R9A08G045_PD_SSI0		27
-#define R9A08G045_PD_SSI1		28
-#define R9A08G045_PD_SSI2		29
-#define R9A08G045_PD_SSI3		30
-#define R9A08G045_PD_SRC		31
-#define R9A08G045_PD_USB0		32
-#define R9A08G045_PD_USB1		33
-#define R9A08G045_PD_USB_PHY		34
-#define R9A08G045_PD_ETHER0		35
-#define R9A08G045_PD_ETHER1		36
-#define R9A08G045_PD_I2C0		37
-#define R9A08G045_PD_I2C1		38
-#define R9A08G045_PD_I2C2		39
-#define R9A08G045_PD_I2C3		40
-#define R9A08G045_PD_SCIF0		41
-#define R9A08G045_PD_SCIF1		42
-#define R9A08G045_PD_SCIF2		43
-#define R9A08G045_PD_SCIF3		44
-#define R9A08G045_PD_SCIF4		45
-#define R9A08G045_PD_SCIF5		46
-#define R9A08G045_PD_SCI0		47
-#define R9A08G045_PD_SCI1		48
-#define R9A08G045_PD_IRDA		49
-#define R9A08G045_PD_RSPI0		50
-#define R9A08G045_PD_RSPI1		51
-#define R9A08G045_PD_RSPI2		52
-#define R9A08G045_PD_RSPI3		53
-#define R9A08G045_PD_RSPI4		54
-#define R9A08G045_PD_CANFD		55
-#define R9A08G045_PD_ADC		56
-#define R9A08G045_PD_TSU		57
-#define R9A08G045_PD_OCTA		58
-#define R9A08G045_PD_PDM		59
-#define R9A08G045_PD_PCI		60
-#define R9A08G045_PD_SPDIF		61
-#define R9A08G045_PD_I3C		62
-#define R9A08G045_PD_VBAT		63
-
-#define R9A08G045_PD_DDR		64
-#define R9A08G045_PD_TZCDDR		65
-#define R9A08G045_PD_OTFDE_DDR		66
-#define R9A08G045_PD_RTC		67
-
 #endif /* __DT_BINDINGS_CLOCK_R9A08G045_CPG_H__ */
-- 
cgit v1.2.3


From d209f6e122950d9b6f329f3538b785dd709001e5 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Mon, 2 Jun 2025 07:58:54 -0400
Subject: filelock: add new locks_wake_up_waiter() helper

Currently the function that does this takes a struct file_lock, but
__locks_wake_up_blocks() deals with both locks and leases. This happens
to work because both file_lock and file_lease have the file_lock_core
at the beginning of the struct, but it's fragile to rely on that.

Add a new locks_wake_up_waiter() function and call that from
__locks_wake_up_blocks().

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://lore.kernel.org/20250602-filelock-6-16-v1-1-7da5b2c930fd@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/locks.c               | 2 +-
 include/linux/filelock.h | 7 ++++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/fs/locks.c b/fs/locks.c
index 1619cddfa7a4..f96024feab17 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -712,7 +712,7 @@ static void __locks_wake_up_blocks(struct file_lock_core *blocker)
 		    fl->fl_lmops && fl->fl_lmops->lm_notify)
 			fl->fl_lmops->lm_notify(fl);
 		else
-			locks_wake_up(fl);
+			locks_wake_up_waiter(waiter);
 
 		/*
 		 * The setting of flc_blocker to NULL marks the "done"
diff --git a/include/linux/filelock.h b/include/linux/filelock.h
index c412ded9171e..c2ce8ba05d06 100644
--- a/include/linux/filelock.h
+++ b/include/linux/filelock.h
@@ -175,9 +175,14 @@ static inline bool lock_is_write(struct file_lock *fl)
 	return fl->c.flc_type == F_WRLCK;
 }
 
+static inline void locks_wake_up_waiter(struct file_lock_core *flc)
+{
+	wake_up(&flc->flc_wait);
+}
+
 static inline void locks_wake_up(struct file_lock *fl)
 {
-	wake_up(&fl->c.flc_wait);
+	locks_wake_up_waiter(&fl->c);
 }
 
 static inline bool locks_can_async_lock(const struct file_operations *fops)
-- 
cgit v1.2.3


From 33877220b8641b4cde474a4229ea92c0e3637883 Mon Sep 17 00:00:00 2001
From: Tasos Sahanidis <tasos@tasossah.com>
Date: Mon, 19 May 2025 11:56:55 +0300
Subject: ata: libata-acpi: Do not assume 40 wire cable if no devices are
 enabled

On at least an ASRock 990FX Extreme 4 with a VIA VT6330, the devices
have not yet been enabled by the first time ata_acpi_cbl_80wire() is
called. This means that the ata_for_each_dev loop is never entered,
and a 40 wire cable is assumed.

The VIA controller on this board does not report the cable in the PCI
config space, thus having to fall back to ACPI even though no SATA
bridge is present.

The _GTM values are correctly reported by the firmware through ACPI,
which has already set up faster transfer modes, but due to the above
the controller is forced down to a maximum of UDMA/33.

Resolve this by modifying ata_acpi_cbl_80wire() to directly return the
cable type. First, an unknown cable is assumed which preserves the mode
set by the firmware, and then on subsequent calls when the devices have
been enabled, an 80 wire cable is correctly detected.

Since the function now directly returns the cable type, it is renamed
to ata_acpi_cbl_pata_type().

Signed-off-by: Tasos Sahanidis <tasos@tasossah.com>
Link: https://lore.kernel.org/r/20250519085945.1399466-1-tasos@tasossah.com
Signed-off-by: Niklas Cassel <cassel@kernel.org>
---
 drivers/ata/libata-acpi.c | 24 ++++++++++++++++--------
 drivers/ata/pata_via.c    |  6 ++----
 include/linux/libata.h    |  7 +++----
 3 files changed, 21 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/drivers/ata/libata-acpi.c b/drivers/ata/libata-acpi.c
index b7f0bf795521..f2140fc06ba0 100644
--- a/drivers/ata/libata-acpi.c
+++ b/drivers/ata/libata-acpi.c
@@ -514,15 +514,19 @@ unsigned int ata_acpi_gtm_xfermask(struct ata_device *dev,
 EXPORT_SYMBOL_GPL(ata_acpi_gtm_xfermask);
 
 /**
- * ata_acpi_cbl_80wire		-	Check for 80 wire cable
+ * ata_acpi_cbl_pata_type - Return PATA cable type
  * @ap: Port to check
- * @gtm: GTM data to use
  *
- * Return 1 if the @gtm indicates the BIOS selected an 80wire mode.
+ * Return ATA_CBL_PATA* according to the transfer mode selected by BIOS
  */
-int ata_acpi_cbl_80wire(struct ata_port *ap, const struct ata_acpi_gtm *gtm)
+int ata_acpi_cbl_pata_type(struct ata_port *ap)
 {
 	struct ata_device *dev;
+	int ret = ATA_CBL_PATA_UNK;
+	const struct ata_acpi_gtm *gtm = ata_acpi_init_gtm(ap);
+
+	if (!gtm)
+		return ATA_CBL_PATA40;
 
 	ata_for_each_dev(dev, &ap->link, ENABLED) {
 		unsigned int xfer_mask, udma_mask;
@@ -530,13 +534,17 @@ int ata_acpi_cbl_80wire(struct ata_port *ap, const struct ata_acpi_gtm *gtm)
 		xfer_mask = ata_acpi_gtm_xfermask(dev, gtm);
 		ata_unpack_xfermask(xfer_mask, NULL, NULL, &udma_mask);
 
-		if (udma_mask & ~ATA_UDMA_MASK_40C)
-			return 1;
+		ret = ATA_CBL_PATA40;
+
+		if (udma_mask & ~ATA_UDMA_MASK_40C) {
+			ret = ATA_CBL_PATA80;
+			break;
+		}
 	}
 
-	return 0;
+	return ret;
 }
-EXPORT_SYMBOL_GPL(ata_acpi_cbl_80wire);
+EXPORT_SYMBOL_GPL(ata_acpi_cbl_pata_type);
 
 static void ata_acpi_gtf_to_tf(struct ata_device *dev,
 			       const struct ata_acpi_gtf *gtf,
diff --git a/drivers/ata/pata_via.c b/drivers/ata/pata_via.c
index d82728a01832..bb80e7800dcb 100644
--- a/drivers/ata/pata_via.c
+++ b/drivers/ata/pata_via.c
@@ -201,11 +201,9 @@ static int via_cable_detect(struct ata_port *ap) {
 	   two drives */
 	if (ata66 & (0x10100000 >> (16 * ap->port_no)))
 		return ATA_CBL_PATA80;
+
 	/* Check with ACPI so we can spot BIOS reported SATA bridges */
-	if (ata_acpi_init_gtm(ap) &&
-	    ata_acpi_cbl_80wire(ap, ata_acpi_init_gtm(ap)))
-		return ATA_CBL_PATA80;
-	return ATA_CBL_PATA40;
+	return ata_acpi_cbl_pata_type(ap);
 }
 
 static int via_pre_reset(struct ata_link *link, unsigned long deadline)
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 31be45fd47a6..1e5aec839041 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1352,7 +1352,7 @@ int ata_acpi_stm(struct ata_port *ap, const struct ata_acpi_gtm *stm);
 int ata_acpi_gtm(struct ata_port *ap, struct ata_acpi_gtm *stm);
 unsigned int ata_acpi_gtm_xfermask(struct ata_device *dev,
 				   const struct ata_acpi_gtm *gtm);
-int ata_acpi_cbl_80wire(struct ata_port *ap, const struct ata_acpi_gtm *gtm);
+int ata_acpi_cbl_pata_type(struct ata_port *ap);
 #else
 static inline const struct ata_acpi_gtm *ata_acpi_init_gtm(struct ata_port *ap)
 {
@@ -1377,10 +1377,9 @@ static inline unsigned int ata_acpi_gtm_xfermask(struct ata_device *dev,
 	return 0;
 }
 
-static inline int ata_acpi_cbl_80wire(struct ata_port *ap,
-				      const struct ata_acpi_gtm *gtm)
+static inline int ata_acpi_cbl_pata_type(struct ata_port *ap)
 {
-	return 0;
+	return ATA_CBL_PATA40;
 }
 #endif
 
-- 
cgit v1.2.3


From 2d72dd14d77f31a7caa619fe0b889304844e612e Mon Sep 17 00:00:00 2001
From: Tobias Klauser <tklauser@distanz.ch>
Date: Tue, 10 Jun 2025 16:07:56 +0200
Subject: bpf: adjust path to trace_output sample eBPF program

The sample file was renamed from trace_output_kern.c to
trace_output.bpf.c in commit d4fffba4d04b ("samples/bpf: Change _kern
suffix to .bpf with syscall tracing program"). Adjust the path in the
documentation comment for bpf_perf_event_output.

Signed-off-by: Tobias Klauser <tklauser@distanz.ch>
Link: https://lore.kernel.org/r/20250610140756.16332-1-tklauser@distanz.ch
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h       | 2 +-
 tools/include/uapi/linux/bpf.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 194ed9891b40..39e7818cca80 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2409,7 +2409,7 @@ union bpf_attr {
  * 		into it. An example is available in file
  * 		*samples/bpf/trace_output_user.c* in the Linux kernel source
  * 		tree (the eBPF program counterpart is in
- * 		*samples/bpf/trace_output_kern.c*).
+ *		*samples/bpf/trace_output.bpf.c*).
  *
  * 		**bpf_perf_event_output**\ () achieves better performance
  * 		than **bpf_trace_printk**\ () for sharing data with user
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 194ed9891b40..39e7818cca80 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2409,7 +2409,7 @@ union bpf_attr {
  * 		into it. An example is available in file
  * 		*samples/bpf/trace_output_user.c* in the Linux kernel source
  * 		tree (the eBPF program counterpart is in
- * 		*samples/bpf/trace_output_kern.c*).
+ *		*samples/bpf/trace_output.bpf.c*).
  *
  * 		**bpf_perf_event_output**\ () achieves better performance
  * 		than **bpf_trace_printk**\ () for sharing data with user
-- 
cgit v1.2.3


From c09ef59e17c6921c577d54bc8da4331b955d01a7 Mon Sep 17 00:00:00 2001
From: Dipayaan Roy <dipayanroy@linux.microsoft.com>
Date: Mon, 9 Jun 2025 03:01:03 -0700
Subject: net: mana: Expose additional hardware counters for drop and TC via
 ethtool.

Add support for reporting additional hardware counters for drop and
TC using the ethtool -S interface.

These counters include:

- Aggregate Rx/Tx drop counters
- Per-TC Rx/Tx packet counters
- Per-TC Rx/Tx byte counters
- Per-TC Rx/Tx pause frame counters

The counters are exposed using ethtool_ops->get_ethtool_stats and
ethtool_ops->get_strings. These counters are not available on all
versions of the hardware.

Signed-off-by: Dipayaan Roy <dipayanroy@linux.microsoft.com>
Reviewed-by: Subbaraya Sundeep <sbhatta@marvell.com>
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Link: https://patch.msgid.link/20250609100103.GA7102@linuxonhyperv3.guj3yctzbm1etfxqx2vob5hsef.xx.internal.cloudapp.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/microsoft/mana/hw_channel.c   |   6 +-
 drivers/net/ethernet/microsoft/mana/mana_en.c      |  87 +++++++++++++-
 drivers/net/ethernet/microsoft/mana/mana_ethtool.c |  76 +++++++++++-
 include/net/mana/mana.h                            | 131 +++++++++++++++++++++
 4 files changed, 292 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/microsoft/mana/hw_channel.c b/drivers/net/ethernet/microsoft/mana/hw_channel.c
index a8c4d8db75a5..3d3677c0d014 100644
--- a/drivers/net/ethernet/microsoft/mana/hw_channel.c
+++ b/drivers/net/ethernet/microsoft/mana/hw_channel.c
@@ -2,6 +2,7 @@
 /* Copyright (c) 2021, Microsoft Corporation. */
 
 #include <net/mana/gdma.h>
+#include <net/mana/mana.h>
 #include <net/mana/hw_channel.h>
 #include <linux/vmalloc.h>
 
@@ -890,8 +891,9 @@ int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len,
 	}
 
 	if (ctx->status_code && ctx->status_code != GDMA_STATUS_MORE_ENTRIES) {
-		dev_err(hwc->dev, "HWC: Failed hw_channel req: 0x%x\n",
-			ctx->status_code);
+		if (req_msg->req.msg_type != MANA_QUERY_PHY_STAT)
+			dev_err(hwc->dev, "HWC: Failed hw_channel req: 0x%x\n",
+				ctx->status_code);
 		err = -EPROTO;
 		goto out;
 	}
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index ccd2885c939e..e68b8190bb7a 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -774,8 +774,9 @@ static int mana_send_request(struct mana_context *ac, void *in_buf,
 	err = mana_gd_send_request(gc, in_len, in_buf, out_len,
 				   out_buf);
 	if (err || resp->status) {
-		dev_err(dev, "Failed to send mana message: %d, 0x%x\n",
-			err, resp->status);
+		if (req->req.msg_type != MANA_QUERY_PHY_STAT)
+			dev_err(dev, "Failed to send mana message: %d, 0x%x\n",
+				err, resp->status);
 		return err ? err : -EPROTO;
 	}
 
@@ -2611,6 +2612,88 @@ void mana_query_gf_stats(struct mana_port_context *apc)
 	apc->eth_stats.hc_tx_err_gdma = resp.tx_err_gdma;
 }
 
+void mana_query_phy_stats(struct mana_port_context *apc)
+{
+	struct mana_query_phy_stat_resp resp = {};
+	struct mana_query_phy_stat_req req = {};
+	struct net_device *ndev = apc->ndev;
+	int err;
+
+	mana_gd_init_req_hdr(&req.hdr, MANA_QUERY_PHY_STAT,
+			     sizeof(req), sizeof(resp));
+	err = mana_send_request(apc->ac, &req, sizeof(req), &resp,
+				sizeof(resp));
+	if (err)
+		return;
+
+	err = mana_verify_resp_hdr(&resp.hdr, MANA_QUERY_PHY_STAT,
+				   sizeof(resp));
+	if (err || resp.hdr.status) {
+		netdev_err(ndev,
+			   "Failed to query PHY stats: %d, resp:0x%x\n",
+				err, resp.hdr.status);
+		return;
+	}
+
+	/* Aggregate drop counters */
+	apc->phy_stats.rx_pkt_drop_phy = resp.rx_pkt_drop_phy;
+	apc->phy_stats.tx_pkt_drop_phy = resp.tx_pkt_drop_phy;
+
+	/* Per TC traffic Counters */
+	apc->phy_stats.rx_pkt_tc0_phy = resp.rx_pkt_tc0_phy;
+	apc->phy_stats.tx_pkt_tc0_phy = resp.tx_pkt_tc0_phy;
+	apc->phy_stats.rx_pkt_tc1_phy = resp.rx_pkt_tc1_phy;
+	apc->phy_stats.tx_pkt_tc1_phy = resp.tx_pkt_tc1_phy;
+	apc->phy_stats.rx_pkt_tc2_phy = resp.rx_pkt_tc2_phy;
+	apc->phy_stats.tx_pkt_tc2_phy = resp.tx_pkt_tc2_phy;
+	apc->phy_stats.rx_pkt_tc3_phy = resp.rx_pkt_tc3_phy;
+	apc->phy_stats.tx_pkt_tc3_phy = resp.tx_pkt_tc3_phy;
+	apc->phy_stats.rx_pkt_tc4_phy = resp.rx_pkt_tc4_phy;
+	apc->phy_stats.tx_pkt_tc4_phy = resp.tx_pkt_tc4_phy;
+	apc->phy_stats.rx_pkt_tc5_phy = resp.rx_pkt_tc5_phy;
+	apc->phy_stats.tx_pkt_tc5_phy = resp.tx_pkt_tc5_phy;
+	apc->phy_stats.rx_pkt_tc6_phy = resp.rx_pkt_tc6_phy;
+	apc->phy_stats.tx_pkt_tc6_phy = resp.tx_pkt_tc6_phy;
+	apc->phy_stats.rx_pkt_tc7_phy = resp.rx_pkt_tc7_phy;
+	apc->phy_stats.tx_pkt_tc7_phy = resp.tx_pkt_tc7_phy;
+
+	/* Per TC byte Counters */
+	apc->phy_stats.rx_byte_tc0_phy = resp.rx_byte_tc0_phy;
+	apc->phy_stats.tx_byte_tc0_phy = resp.tx_byte_tc0_phy;
+	apc->phy_stats.rx_byte_tc1_phy = resp.rx_byte_tc1_phy;
+	apc->phy_stats.tx_byte_tc1_phy = resp.tx_byte_tc1_phy;
+	apc->phy_stats.rx_byte_tc2_phy = resp.rx_byte_tc2_phy;
+	apc->phy_stats.tx_byte_tc2_phy = resp.tx_byte_tc2_phy;
+	apc->phy_stats.rx_byte_tc3_phy = resp.rx_byte_tc3_phy;
+	apc->phy_stats.tx_byte_tc3_phy = resp.tx_byte_tc3_phy;
+	apc->phy_stats.rx_byte_tc4_phy = resp.rx_byte_tc4_phy;
+	apc->phy_stats.tx_byte_tc4_phy = resp.tx_byte_tc4_phy;
+	apc->phy_stats.rx_byte_tc5_phy = resp.rx_byte_tc5_phy;
+	apc->phy_stats.tx_byte_tc5_phy = resp.tx_byte_tc5_phy;
+	apc->phy_stats.rx_byte_tc6_phy = resp.rx_byte_tc6_phy;
+	apc->phy_stats.tx_byte_tc6_phy = resp.tx_byte_tc6_phy;
+	apc->phy_stats.rx_byte_tc7_phy = resp.rx_byte_tc7_phy;
+	apc->phy_stats.tx_byte_tc7_phy = resp.tx_byte_tc7_phy;
+
+	/* Per TC pause Counters */
+	apc->phy_stats.rx_pause_tc0_phy = resp.rx_pause_tc0_phy;
+	apc->phy_stats.tx_pause_tc0_phy = resp.tx_pause_tc0_phy;
+	apc->phy_stats.rx_pause_tc1_phy = resp.rx_pause_tc1_phy;
+	apc->phy_stats.tx_pause_tc1_phy = resp.tx_pause_tc1_phy;
+	apc->phy_stats.rx_pause_tc2_phy = resp.rx_pause_tc2_phy;
+	apc->phy_stats.tx_pause_tc2_phy = resp.tx_pause_tc2_phy;
+	apc->phy_stats.rx_pause_tc3_phy = resp.rx_pause_tc3_phy;
+	apc->phy_stats.tx_pause_tc3_phy = resp.tx_pause_tc3_phy;
+	apc->phy_stats.rx_pause_tc4_phy = resp.rx_pause_tc4_phy;
+	apc->phy_stats.tx_pause_tc4_phy = resp.tx_pause_tc4_phy;
+	apc->phy_stats.rx_pause_tc5_phy = resp.rx_pause_tc5_phy;
+	apc->phy_stats.tx_pause_tc5_phy = resp.tx_pause_tc5_phy;
+	apc->phy_stats.rx_pause_tc6_phy = resp.rx_pause_tc6_phy;
+	apc->phy_stats.tx_pause_tc6_phy = resp.tx_pause_tc6_phy;
+	apc->phy_stats.rx_pause_tc7_phy = resp.rx_pause_tc7_phy;
+	apc->phy_stats.tx_pause_tc7_phy = resp.tx_pause_tc7_phy;
+}
+
 static int mana_init_port(struct net_device *ndev)
 {
 	struct mana_port_context *apc = netdev_priv(ndev);
diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
index c419626073f5..4fb3a04994a2 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
@@ -7,10 +7,12 @@
 
 #include <net/mana/mana.h>
 
-static const struct {
+struct mana_stats_desc {
 	char name[ETH_GSTRING_LEN];
 	u16 offset;
-} mana_eth_stats[] = {
+};
+
+static const struct mana_stats_desc mana_eth_stats[] = {
 	{"stop_queue", offsetof(struct mana_ethtool_stats, stop_queue)},
 	{"wake_queue", offsetof(struct mana_ethtool_stats, wake_queue)},
 	{"hc_rx_discards_no_wqe", offsetof(struct mana_ethtool_stats,
@@ -75,6 +77,59 @@ static const struct {
 					rx_cqe_unknown_type)},
 };
 
+static const struct mana_stats_desc mana_phy_stats[] = {
+	{ "hc_rx_pkt_drop_phy", offsetof(struct mana_ethtool_phy_stats, rx_pkt_drop_phy) },
+	{ "hc_tx_pkt_drop_phy", offsetof(struct mana_ethtool_phy_stats, tx_pkt_drop_phy) },
+	{ "hc_tc0_rx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, rx_pkt_tc0_phy) },
+	{ "hc_tc0_rx_byte_phy", offsetof(struct mana_ethtool_phy_stats, rx_byte_tc0_phy) },
+	{ "hc_tc0_tx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, tx_pkt_tc0_phy) },
+	{ "hc_tc0_tx_byte_phy", offsetof(struct mana_ethtool_phy_stats, tx_byte_tc0_phy) },
+	{ "hc_tc1_rx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, rx_pkt_tc1_phy) },
+	{ "hc_tc1_rx_byte_phy", offsetof(struct mana_ethtool_phy_stats, rx_byte_tc1_phy) },
+	{ "hc_tc1_tx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, tx_pkt_tc1_phy) },
+	{ "hc_tc1_tx_byte_phy", offsetof(struct mana_ethtool_phy_stats, tx_byte_tc1_phy) },
+	{ "hc_tc2_rx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, rx_pkt_tc2_phy) },
+	{ "hc_tc2_rx_byte_phy", offsetof(struct mana_ethtool_phy_stats, rx_byte_tc2_phy) },
+	{ "hc_tc2_tx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, tx_pkt_tc2_phy) },
+	{ "hc_tc2_tx_byte_phy", offsetof(struct mana_ethtool_phy_stats, tx_byte_tc2_phy) },
+	{ "hc_tc3_rx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, rx_pkt_tc3_phy) },
+	{ "hc_tc3_rx_byte_phy", offsetof(struct mana_ethtool_phy_stats, rx_byte_tc3_phy) },
+	{ "hc_tc3_tx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, tx_pkt_tc3_phy) },
+	{ "hc_tc3_tx_byte_phy", offsetof(struct mana_ethtool_phy_stats, tx_byte_tc3_phy) },
+	{ "hc_tc4_rx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, rx_pkt_tc4_phy) },
+	{ "hc_tc4_rx_byte_phy", offsetof(struct mana_ethtool_phy_stats, rx_byte_tc4_phy) },
+	{ "hc_tc4_tx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, tx_pkt_tc4_phy) },
+	{ "hc_tc4_tx_byte_phy", offsetof(struct mana_ethtool_phy_stats, tx_byte_tc4_phy) },
+	{ "hc_tc5_rx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, rx_pkt_tc5_phy) },
+	{ "hc_tc5_rx_byte_phy", offsetof(struct mana_ethtool_phy_stats, rx_byte_tc5_phy) },
+	{ "hc_tc5_tx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, tx_pkt_tc5_phy) },
+	{ "hc_tc5_tx_byte_phy", offsetof(struct mana_ethtool_phy_stats, tx_byte_tc5_phy) },
+	{ "hc_tc6_rx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, rx_pkt_tc6_phy) },
+	{ "hc_tc6_rx_byte_phy", offsetof(struct mana_ethtool_phy_stats, rx_byte_tc6_phy) },
+	{ "hc_tc6_tx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, tx_pkt_tc6_phy) },
+	{ "hc_tc6_tx_byte_phy", offsetof(struct mana_ethtool_phy_stats, tx_byte_tc6_phy) },
+	{ "hc_tc7_rx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, rx_pkt_tc7_phy) },
+	{ "hc_tc7_rx_byte_phy", offsetof(struct mana_ethtool_phy_stats, rx_byte_tc7_phy) },
+	{ "hc_tc7_tx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, tx_pkt_tc7_phy) },
+	{ "hc_tc7_tx_byte_phy", offsetof(struct mana_ethtool_phy_stats, tx_byte_tc7_phy) },
+	{ "hc_tc0_rx_pause_phy", offsetof(struct mana_ethtool_phy_stats, rx_pause_tc0_phy) },
+	{ "hc_tc0_tx_pause_phy", offsetof(struct mana_ethtool_phy_stats, tx_pause_tc0_phy) },
+	{ "hc_tc1_rx_pause_phy", offsetof(struct mana_ethtool_phy_stats, rx_pause_tc1_phy) },
+	{ "hc_tc1_tx_pause_phy", offsetof(struct mana_ethtool_phy_stats, tx_pause_tc1_phy) },
+	{ "hc_tc2_rx_pause_phy", offsetof(struct mana_ethtool_phy_stats, rx_pause_tc2_phy) },
+	{ "hc_tc2_tx_pause_phy", offsetof(struct mana_ethtool_phy_stats, tx_pause_tc2_phy) },
+	{ "hc_tc3_rx_pause_phy", offsetof(struct mana_ethtool_phy_stats, rx_pause_tc3_phy) },
+	{ "hc_tc3_tx_pause_phy", offsetof(struct mana_ethtool_phy_stats, tx_pause_tc3_phy) },
+	{ "hc_tc4_rx_pause_phy", offsetof(struct mana_ethtool_phy_stats, rx_pause_tc4_phy) },
+	{ "hc_tc4_tx_pause_phy", offsetof(struct mana_ethtool_phy_stats, tx_pause_tc4_phy) },
+	{ "hc_tc5_rx_pause_phy", offsetof(struct mana_ethtool_phy_stats, rx_pause_tc5_phy) },
+	{ "hc_tc5_tx_pause_phy", offsetof(struct mana_ethtool_phy_stats, tx_pause_tc5_phy) },
+	{ "hc_tc6_rx_pause_phy", offsetof(struct mana_ethtool_phy_stats, rx_pause_tc6_phy) },
+	{ "hc_tc6_tx_pause_phy", offsetof(struct mana_ethtool_phy_stats, tx_pause_tc6_phy) },
+	{ "hc_tc7_rx_pause_phy", offsetof(struct mana_ethtool_phy_stats, rx_pause_tc7_phy) },
+	{ "hc_tc7_tx_pause_phy", offsetof(struct mana_ethtool_phy_stats, tx_pause_tc7_phy) },
+};
+
 static int mana_get_sset_count(struct net_device *ndev, int stringset)
 {
 	struct mana_port_context *apc = netdev_priv(ndev);
@@ -83,8 +138,8 @@ static int mana_get_sset_count(struct net_device *ndev, int stringset)
 	if (stringset != ETH_SS_STATS)
 		return -EINVAL;
 
-	return ARRAY_SIZE(mana_eth_stats) + num_queues *
-				(MANA_STATS_RX_COUNT + MANA_STATS_TX_COUNT);
+	return ARRAY_SIZE(mana_eth_stats) + ARRAY_SIZE(mana_phy_stats) +
+			num_queues * (MANA_STATS_RX_COUNT + MANA_STATS_TX_COUNT);
 }
 
 static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
@@ -99,6 +154,9 @@ static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
 	for (i = 0; i < ARRAY_SIZE(mana_eth_stats); i++)
 		ethtool_puts(&data, mana_eth_stats[i].name);
 
+	for (i = 0; i < ARRAY_SIZE(mana_phy_stats); i++)
+		ethtool_puts(&data, mana_phy_stats[i].name);
+
 	for (i = 0; i < num_queues; i++) {
 		ethtool_sprintf(&data, "rx_%d_packets", i);
 		ethtool_sprintf(&data, "rx_%d_bytes", i);
@@ -128,6 +186,7 @@ static void mana_get_ethtool_stats(struct net_device *ndev,
 	struct mana_port_context *apc = netdev_priv(ndev);
 	unsigned int num_queues = apc->num_queues;
 	void *eth_stats = &apc->eth_stats;
+	void *phy_stats = &apc->phy_stats;
 	struct mana_stats_rx *rx_stats;
 	struct mana_stats_tx *tx_stats;
 	unsigned int start;
@@ -151,9 +210,18 @@ static void mana_get_ethtool_stats(struct net_device *ndev,
 	/* we call mana function to update stats from GDMA */
 	mana_query_gf_stats(apc);
 
+	/* Query the PHY stats from GDMA. These include the aggregate tx/rx
+	 * drop counters and the per-TC (Traffic Class) tx/rx and pause
+	 * counters.
+	 */
+	mana_query_phy_stats(apc);
+
 	for (q = 0; q < ARRAY_SIZE(mana_eth_stats); q++)
 		data[i++] = *(u64 *)(eth_stats + mana_eth_stats[q].offset);
 
+	for (q = 0; q < ARRAY_SIZE(mana_phy_stats); q++)
+		data[i++] = *(u64 *)(phy_stats + mana_phy_stats[q].offset);
+
 	for (q = 0; q < num_queues; q++) {
 		rx_stats = &apc->rxqs[q]->stats;
 
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index 9abb66461211..4176edf1be71 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -404,6 +404,65 @@ struct mana_ethtool_stats {
 	u64 rx_cqe_unknown_type;
 };
 
+struct mana_ethtool_phy_stats {
+	/* Drop Counters */
+	u64 rx_pkt_drop_phy;
+	u64 tx_pkt_drop_phy;
+
+	/* Per TC traffic Counters */
+	u64 rx_pkt_tc0_phy;
+	u64 tx_pkt_tc0_phy;
+	u64 rx_pkt_tc1_phy;
+	u64 tx_pkt_tc1_phy;
+	u64 rx_pkt_tc2_phy;
+	u64 tx_pkt_tc2_phy;
+	u64 rx_pkt_tc3_phy;
+	u64 tx_pkt_tc3_phy;
+	u64 rx_pkt_tc4_phy;
+	u64 tx_pkt_tc4_phy;
+	u64 rx_pkt_tc5_phy;
+	u64 tx_pkt_tc5_phy;
+	u64 rx_pkt_tc6_phy;
+	u64 tx_pkt_tc6_phy;
+	u64 rx_pkt_tc7_phy;
+	u64 tx_pkt_tc7_phy;
+
+	u64 rx_byte_tc0_phy;
+	u64 tx_byte_tc0_phy;
+	u64 rx_byte_tc1_phy;
+	u64 tx_byte_tc1_phy;
+	u64 rx_byte_tc2_phy;
+	u64 tx_byte_tc2_phy;
+	u64 rx_byte_tc3_phy;
+	u64 tx_byte_tc3_phy;
+	u64 rx_byte_tc4_phy;
+	u64 tx_byte_tc4_phy;
+	u64 rx_byte_tc5_phy;
+	u64 tx_byte_tc5_phy;
+	u64 rx_byte_tc6_phy;
+	u64 tx_byte_tc6_phy;
+	u64 rx_byte_tc7_phy;
+	u64 tx_byte_tc7_phy;
+
+	/* Per TC pause Counters */
+	u64 rx_pause_tc0_phy;
+	u64 tx_pause_tc0_phy;
+	u64 rx_pause_tc1_phy;
+	u64 tx_pause_tc1_phy;
+	u64 rx_pause_tc2_phy;
+	u64 tx_pause_tc2_phy;
+	u64 rx_pause_tc3_phy;
+	u64 tx_pause_tc3_phy;
+	u64 rx_pause_tc4_phy;
+	u64 tx_pause_tc4_phy;
+	u64 rx_pause_tc5_phy;
+	u64 tx_pause_tc5_phy;
+	u64 rx_pause_tc6_phy;
+	u64 tx_pause_tc6_phy;
+	u64 rx_pause_tc7_phy;
+	u64 tx_pause_tc7_phy;
+};
+
 struct mana_context {
 	struct gdma_dev *gdma_dev;
 
@@ -474,6 +533,8 @@ struct mana_port_context {
 
 	struct mana_ethtool_stats eth_stats;
 
+	struct mana_ethtool_phy_stats phy_stats;
+
 	/* Debugfs */
 	struct dentry *mana_port_debugfs;
 };
@@ -501,6 +562,7 @@ struct bpf_prog *mana_xdp_get(struct mana_port_context *apc);
 void mana_chn_setxdp(struct mana_port_context *apc, struct bpf_prog *prog);
 int mana_bpf(struct net_device *ndev, struct netdev_bpf *bpf);
 void mana_query_gf_stats(struct mana_port_context *apc);
+void mana_query_phy_stats(struct mana_port_context *apc);
 int mana_pre_alloc_rxbufs(struct mana_port_context *apc, int mtu, int num_queues);
 void mana_pre_dealloc_rxbufs(struct mana_port_context *apc);
 
@@ -527,6 +589,7 @@ enum mana_command_code {
 	MANA_FENCE_RQ		= 0x20006,
 	MANA_CONFIG_VPORT_RX	= 0x20007,
 	MANA_QUERY_VPORT_CONFIG	= 0x20008,
+	MANA_QUERY_PHY_STAT     = 0x2000c,
 
 	/* Privileged commands for the PF mode */
 	MANA_REGISTER_FILTER	= 0x28000,
@@ -689,6 +752,74 @@ struct mana_query_gf_stat_resp {
 	u64 tx_err_gdma;
 }; /* HW DATA */
 
+/* Query phy stats */
+struct mana_query_phy_stat_req {
+	struct gdma_req_hdr hdr;
+	u64 req_stats;
+}; /* HW DATA */
+
+struct mana_query_phy_stat_resp {
+	struct gdma_resp_hdr hdr;
+	u64 reported_stats;
+
+	/* Aggregate Drop Counters */
+	u64 rx_pkt_drop_phy;
+	u64 tx_pkt_drop_phy;
+
+	/* Per TC(Traffic class) traffic Counters */
+	u64 rx_pkt_tc0_phy;
+	u64 tx_pkt_tc0_phy;
+	u64 rx_pkt_tc1_phy;
+	u64 tx_pkt_tc1_phy;
+	u64 rx_pkt_tc2_phy;
+	u64 tx_pkt_tc2_phy;
+	u64 rx_pkt_tc3_phy;
+	u64 tx_pkt_tc3_phy;
+	u64 rx_pkt_tc4_phy;
+	u64 tx_pkt_tc4_phy;
+	u64 rx_pkt_tc5_phy;
+	u64 tx_pkt_tc5_phy;
+	u64 rx_pkt_tc6_phy;
+	u64 tx_pkt_tc6_phy;
+	u64 rx_pkt_tc7_phy;
+	u64 tx_pkt_tc7_phy;
+
+	u64 rx_byte_tc0_phy;
+	u64 tx_byte_tc0_phy;
+	u64 rx_byte_tc1_phy;
+	u64 tx_byte_tc1_phy;
+	u64 rx_byte_tc2_phy;
+	u64 tx_byte_tc2_phy;
+	u64 rx_byte_tc3_phy;
+	u64 tx_byte_tc3_phy;
+	u64 rx_byte_tc4_phy;
+	u64 tx_byte_tc4_phy;
+	u64 rx_byte_tc5_phy;
+	u64 tx_byte_tc5_phy;
+	u64 rx_byte_tc6_phy;
+	u64 tx_byte_tc6_phy;
+	u64 rx_byte_tc7_phy;
+	u64 tx_byte_tc7_phy;
+
+	/* Per TC(Traffic Class) pause Counters */
+	u64 rx_pause_tc0_phy;
+	u64 tx_pause_tc0_phy;
+	u64 rx_pause_tc1_phy;
+	u64 tx_pause_tc1_phy;
+	u64 rx_pause_tc2_phy;
+	u64 tx_pause_tc2_phy;
+	u64 rx_pause_tc3_phy;
+	u64 tx_pause_tc3_phy;
+	u64 rx_pause_tc4_phy;
+	u64 tx_pause_tc4_phy;
+	u64 rx_pause_tc5_phy;
+	u64 tx_pause_tc5_phy;
+	u64 rx_pause_tc6_phy;
+	u64 tx_pause_tc6_phy;
+	u64 rx_pause_tc7_phy;
+	u64 tx_pause_tc7_phy;
+}; /* HW DATA */
+
 /* Configure vPort Rx Steering */
 struct mana_cfg_rx_steer_req_v2 {
 	struct gdma_req_hdr hdr;
-- 
cgit v1.2.3


From 31557b3487b349464daf42bc4366153743c1e727 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 9 Jun 2025 07:39:33 -0700
Subject: uapi: in6: restore visibility of most IPv6 socket options
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A decade ago commit 6d08acd2d32e ("in6: fix conflict with glibc")
hid the definitions of IPV6 options, because GCC was complaining
about duplicates. The commit did not list the warnings seen, but
trying to recreate them now I think they are (building iproute2):

In file included from ./include/uapi/rdma/rdma_user_cm.h:39,
                 from rdma.h:16,
                 from res.h:9,
                 from res-ctx.c:7:
../include/uapi/linux/in6.h:171:9: warning: ‘IPV6_ADD_MEMBERSHIP’ redefined
  171 | #define IPV6_ADD_MEMBERSHIP     20
      |         ^~~~~~~~~~~~~~~~~~~
In file included from /usr/include/netinet/in.h:37,
                 from rdma.h:13:
/usr/include/bits/in.h:233:10: note: this is the location of the previous definition
  233 | # define IPV6_ADD_MEMBERSHIP    IPV6_JOIN_GROUP
      |          ^~~~~~~~~~~~~~~~~~~
../include/uapi/linux/in6.h:172:9: warning: ‘IPV6_DROP_MEMBERSHIP’ redefined
  172 | #define IPV6_DROP_MEMBERSHIP    21
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/include/bits/in.h:234:10: note: this is the location of the previous definition
  234 | # define IPV6_DROP_MEMBERSHIP   IPV6_LEAVE_GROUP
      |          ^~~~~~~~~~~~~~~~~~~~

Compilers don't complain about redefinition if the defines
are identical, but here we have the kernel using the literal
value, and glibc using an indirection (defining to a name
of another define, with the same numerical value).

Problem is, the commit in question hid all the IPV6 socket
options, and glibc has a pretty sparse list. For instance
it lacks Flow Label related options. Willem called this out
in commit 3fb321fde22d ("selftests/net: ipv6 flowlabel"):

  /* uapi/glibc weirdness may leave this undefined */
  #ifndef IPV6_FLOWINFO
  #define IPV6_FLOWINFO 11
  #endif

More interestingly some applications (socat) use
a #ifdef IPV6_FLOWINFO to gate compilation of their
rudimentary flow label support. (For added confusion
socat misspells it as IPV4_FLOWINFO in some places.)

Hide only the two defines we know glibc has a problem
with. If we discover more warnings we can hide more
but we should avoid covering the entire block of
defines for "IPV6 socket options".

Link: https://patch.msgid.link/20250609143933.1654417-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/in6.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h
index ff8d21f9e95b..5a47339ef7d7 100644
--- a/include/uapi/linux/in6.h
+++ b/include/uapi/linux/in6.h
@@ -152,7 +152,6 @@ struct in6_flowlabel_req {
 /*
  *	IPV6 socket options
  */
-#if __UAPI_DEF_IPV6_OPTIONS
 #define IPV6_ADDRFORM		1
 #define IPV6_2292PKTINFO	2
 #define IPV6_2292HOPOPTS	3
@@ -169,8 +168,10 @@ struct in6_flowlabel_req {
 #define IPV6_MULTICAST_IF	17
 #define IPV6_MULTICAST_HOPS	18
 #define IPV6_MULTICAST_LOOP	19
+#if __UAPI_DEF_IPV6_OPTIONS
 #define IPV6_ADD_MEMBERSHIP	20
 #define IPV6_DROP_MEMBERSHIP	21
+#endif
 #define IPV6_ROUTER_ALERT	22
 #define IPV6_MTU_DISCOVER	23
 #define IPV6_MTU		24
@@ -203,7 +204,6 @@ struct in6_flowlabel_req {
 #define IPV6_IPSEC_POLICY	34
 #define IPV6_XFRM_POLICY	35
 #define IPV6_HDRINCL		36
-#endif
 
 /*
  * Multicast:
-- 
cgit v1.2.3


From 561939ed44932da639ba703ffcd4d4d5ff2c7569 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Mon, 9 Jun 2025 11:32:35 -0400
Subject: net: remove unused sock_enable_timestamps

This function was introduced in commit 783da70e8396 ("net: add
sock_enable_timestamps"), with one caller in rxrpc.

That only caller was removed in commit 7903d4438b3f ("rxrpc: Don't use
received skbuff timestamps").

Signed-off-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Link: https://patch.msgid.link/20250609153254.3504909-1-willemdebruijn.kernel@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sock.h | 1 -
 net/core/sock.c    | 8 --------
 2 files changed, 9 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 92e7c1aae3cc..85e17da5c9db 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2982,7 +2982,6 @@ void sock_set_timestamp(struct sock *sk, int optname, bool valbool);
 int sock_set_timestamping(struct sock *sk, int optname,
 			  struct so_timestamping timestamping);
 
-void sock_enable_timestamps(struct sock *sk);
 #if defined(CONFIG_CGROUP_BPF)
 void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op);
 #else
diff --git a/net/core/sock.c b/net/core/sock.c
index 3b409bc8ef6d..502042a0d3b5 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -837,14 +837,6 @@ static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
 	}
 }
 
-void sock_enable_timestamps(struct sock *sk)
-{
-	lock_sock(sk);
-	__sock_set_timestamps(sk, true, false, true);
-	release_sock(sk);
-}
-EXPORT_SYMBOL(sock_enable_timestamps);
-
 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
 {
 	switch (optname) {
-- 
cgit v1.2.3


From 2bc64b89c4c4073ee8f9543373c64da9b6bbe5e0 Mon Sep 17 00:00:00 2001
From: Gur Stavi <gur.stavi@huawei.com>
Date: Mon, 9 Jun 2025 18:07:52 +0300
Subject: queue_api: add subqueue variant netif_subqueue_sent

Add a new function, netif_subqueue_sent, which is a wrapper for
netdev_tx_sent_queue.

Drivers that use the subqueue variant macros, netif_subqueue_xxx,
identify a queue by its index and are not required to obtain a
struct netdev_queue explicitly.

Such drivers still need to call netdev_tx_sent_queue, which takes a
struct netdev_queue and is the counterpart of
netif_subqueue_completed_wake. Allowing drivers to use a subqueue
variant for this purpose improves their code consistency by always
referring to a queue by its index.

Signed-off-by: Gur Stavi <gur.stavi@huawei.com>
Link: https://patch.msgid.link/909a5c92db49cad39f0954d6cb86775e6480ef4c.1749038081.git.gur.stavi@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/netdev_queues.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h
index ba2eaf39089b..6e835972abd1 100644
--- a/include/net/netdev_queues.h
+++ b/include/net/netdev_queues.h
@@ -294,6 +294,15 @@ netdev_txq_completed_mb(struct netdev_queue *dev_queue,
 		netif_txq_try_stop(_txq, get_desc, start_thrs);		\
 	})
 
+static inline void netif_subqueue_sent(const struct net_device *dev,
+				       unsigned int idx, unsigned int bytes)
+{
+	struct netdev_queue *txq;
+
+	txq = netdev_get_tx_queue(dev, idx);
+	netdev_tx_sent_queue(txq, bytes);
+}
+
 #define netif_subqueue_maybe_stop(dev, idx, get_desc, stop_thrs, start_thrs) \
 	({								\
 		struct netdev_queue *_txq;				\
-- 
cgit v1.2.3


From 5943c611c47c9444e834555c867dec744158b7ad Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 22 Feb 2025 16:04:47 -0500
Subject: procfs: kill ->proc_dops

It has two possible values - one for "forced lookup" entries, another
for the normal ones.  We'd be better off with that as an explicit
flag anyway and in addition to that it opens some fun possibilities
with ->d_op and ->d_flags handling.

[moved PROC_ENTRY_FORCE_LOOKUP to include/linux/proc_fs.h, switched it
to an unused bit - there was a conflict]

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/generic.c       | 8 +++++---
 fs/proc/internal.h      | 3 +--
 include/linux/proc_fs.h | 2 ++
 3 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index a3e22803cddf..38ce45ce0eb6 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -254,7 +254,10 @@ struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry,
 		inode = proc_get_inode(dir->i_sb, de);
 		if (!inode)
 			return ERR_PTR(-ENOMEM);
-		d_set_d_op(dentry, de->proc_dops);
+		if (de->flags & PROC_ENTRY_FORCE_LOOKUP)
+			d_set_d_op(dentry, &proc_net_dentry_ops);
+		else
+			d_set_d_op(dentry, &proc_misc_dentry_ops);
 		return d_splice_alias(inode, dentry);
 	}
 	read_unlock(&proc_subdir_lock);
@@ -448,9 +451,8 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
 	INIT_LIST_HEAD(&ent->pde_openers);
 	proc_set_user(ent, (*parent)->uid, (*parent)->gid);
 
-	ent->proc_dops = &proc_misc_dentry_ops;
 	/* Revalidate everything under /proc/${pid}/net */
-	if ((*parent)->proc_dops == &proc_net_dentry_ops)
+	if ((*parent)->flags & PROC_ENTRY_FORCE_LOOKUP)
 		pde_force_lookup(ent);
 
 out:
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 96122e91c645..a4054916f6da 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -44,7 +44,6 @@ struct proc_dir_entry {
 		const struct proc_ops *proc_ops;
 		const struct file_operations *proc_dir_ops;
 	};
-	const struct dentry_operations *proc_dops;
 	union {
 		const struct seq_operations *seq_ops;
 		int (*single_show)(struct seq_file *, void *);
@@ -403,7 +402,7 @@ extern const struct dentry_operations proc_net_dentry_ops;
 static inline void pde_force_lookup(struct proc_dir_entry *pde)
 {
 	/* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */
-	pde->proc_dops = &proc_net_dentry_ops;
+	pde->flags |= PROC_ENTRY_FORCE_LOOKUP;
 }
 
 /*
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index ea62201c74c4..de1d24f19f76 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -27,6 +27,8 @@ enum {
 
 	PROC_ENTRY_proc_read_iter	= 1U << 1,
 	PROC_ENTRY_proc_compat_ioctl	= 1U << 2,
+
+	PROC_ENTRY_FORCE_LOOKUP		= 1U << 7,
 };
 
 struct proc_ops {
-- 
cgit v1.2.3


From 790fa81b8c43cda9fe25c1b564d0afe3ddeeb370 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 24 Feb 2025 12:46:49 -0500
Subject: new helper: d_splice_alias_ops()

Uses of d_set_d_op() on live dentry can be very dangerous; it is going
to be withdrawn and replaced with saner things.

The best way for a filesystem is to have the default dentry_operations
set at mount time and be done with that - __d_alloc() will use that.

Currently there are two cases when d_set_d_op() is used on a live dentry -
one is procfs, which has several genuinely different dentry_operations
instances (different ->d_revalidate(), etc.) and another is
simple_lookup(), where we would be better off without overriding ->d_op.

For procfs we have d_set_d_op() calls followed by d_splice_alias();
provide a new helper (d_splice_alias_ops(inode, dentry, d_ops)) that would
combine those two, and do the d_set_d_op() part while under ->d_lock.
That eliminates one of the places where ->d_flags had been modified
without holding ->d_lock; current behaviour is not racy, but the reasons
for that are far too brittle.  Better move to uniform locking rules and
simpler proof of correctness...

The next commit will convert procfs to use that helper; it is not
exported and won't be until somebody comes up with a convincing modular
user for it.

Again, the best approach is to have default ->d_op and let __d_alloc()
do the right thing; filesystem _may_ need non-uniform ->d_op (procfs
does), but there'd better be good reasons for that.

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c            | 63 ++++++++++++++++++++++++++++----------------------
 include/linux/dcache.h |  3 +++
 2 files changed, 39 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/fs/dcache.c b/fs/dcache.c
index 3c3cfb345233..bf550d438e40 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2667,7 +2667,8 @@ EXPORT_SYMBOL(__d_lookup_unhash_wake);
 
 /* inode->i_lock held if inode is non-NULL */
 
-static inline void __d_add(struct dentry *dentry, struct inode *inode)
+static inline void __d_add(struct dentry *dentry, struct inode *inode,
+			   const struct dentry_operations *ops)
 {
 	wait_queue_head_t *d_wait;
 	struct inode *dir = NULL;
@@ -2678,6 +2679,8 @@ static inline void __d_add(struct dentry *dentry, struct inode *inode)
 		n = start_dir_add(dir);
 		d_wait = __d_lookup_unhash(dentry);
 	}
+	if (unlikely(ops))
+		d_set_d_op(dentry, ops);
 	if (inode) {
 		unsigned add_flags = d_flags_for_inode(inode);
 		hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
@@ -2709,7 +2712,7 @@ void d_add(struct dentry *entry, struct inode *inode)
 		security_d_instantiate(entry, inode);
 		spin_lock(&inode->i_lock);
 	}
-	__d_add(entry, inode);
+	__d_add(entry, inode, NULL);
 }
 EXPORT_SYMBOL(d_add);
 
@@ -2961,30 +2964,8 @@ out_err:
 	return ret;
 }
 
-/**
- * d_splice_alias - splice a disconnected dentry into the tree if one exists
- * @inode:  the inode which may have a disconnected dentry
- * @dentry: a negative dentry which we want to point to the inode.
- *
- * If inode is a directory and has an IS_ROOT alias, then d_move that in
- * place of the given dentry and return it, else simply d_add the inode
- * to the dentry and return NULL.
- *
- * If a non-IS_ROOT directory is found, the filesystem is corrupt, and
- * we should error out: directories can't have multiple aliases.
- *
- * This is needed in the lookup routine of any filesystem that is exportable
- * (via knfsd) so that we can build dcache paths to directories effectively.
- *
- * If a dentry was found and moved, then it is returned.  Otherwise NULL
- * is returned.  This matches the expected return value of ->lookup.
- *
- * Cluster filesystems may call this function with a negative, hashed dentry.
- * In that case, we know that the inode will be a regular file, and also this
- * will only occur during atomic_open. So we need to check for the dentry
- * being already hashed only in the final case.
- */
-struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
+struct dentry *d_splice_alias_ops(struct inode *inode, struct dentry *dentry,
+				  const struct dentry_operations *ops)
 {
 	if (IS_ERR(inode))
 		return ERR_CAST(inode);
@@ -3030,9 +3011,37 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
 		}
 	}
 out:
-	__d_add(dentry, inode);
+	__d_add(dentry, inode, ops);
 	return NULL;
 }
+
+/**
+ * d_splice_alias - splice a disconnected dentry into the tree if one exists
+ * @inode:  the inode which may have a disconnected dentry
+ * @dentry: a negative dentry which we want to point to the inode.
+ *
+ * If inode is a directory and has an IS_ROOT alias, then d_move that in
+ * place of the given dentry and return it, else simply d_add the inode
+ * to the dentry and return NULL.
+ *
+ * If a non-IS_ROOT directory is found, the filesystem is corrupt, and
+ * we should error out: directories can't have multiple aliases.
+ *
+ * This is needed in the lookup routine of any filesystem that is exportable
+ * (via knfsd) so that we can build dcache paths to directories effectively.
+ *
+ * If a dentry was found and moved, then it is returned.  Otherwise NULL
+ * is returned.  This matches the expected return value of ->lookup.
+ *
+ * Cluster filesystems may call this function with a negative, hashed dentry.
+ * In that case, we know that the inode will be a regular file, and also this
+ * will only occur during atomic_open. So we need to check for the dentry
+ * being already hashed only in the final case.
+ */
+struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
+{
+	return d_splice_alias_ops(inode, dentry, NULL);
+}
 EXPORT_SYMBOL(d_splice_alias);
 
 /*
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index e29823c701ac..1993e6704552 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -245,6 +245,9 @@ extern struct dentry * d_alloc_anon(struct super_block *);
 extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *,
 					wait_queue_head_t *);
 extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
+/* weird procfs mess; *NOT* exported */
+extern struct dentry * d_splice_alias_ops(struct inode *, struct dentry *,
+					  const struct dentry_operations *);
 extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
 extern bool d_same_name(const struct dentry *dentry, const struct dentry *parent,
 			const struct qstr *name);
-- 
cgit v1.2.3


From 05fb0e666495cda068c068a681ecbbf8e57324d0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 23 Feb 2025 19:39:47 -0500
Subject: new helper: set_default_d_op()

... to be used instead of manually assigning to ->s_d_op.
All in-tree filesystems are converted (and the field itself is renamed
to ->__s_d_op, so any out-of-tree ones in need of conversion will be
caught by the compiler).

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/porting.rst |  7 +++++++
 fs/9p/vfs_super.c                     |  4 ++--
 fs/adfs/super.c                       |  2 +-
 fs/affs/super.c                       |  4 ++--
 fs/afs/super.c                        |  4 ++--
 fs/autofs/inode.c                     |  2 +-
 fs/btrfs/super.c                      |  2 +-
 fs/ceph/super.c                       |  2 +-
 fs/coda/inode.c                       |  2 +-
 fs/configfs/mount.c                   |  2 +-
 fs/dcache.c                           | 10 ++++++++--
 fs/debugfs/inode.c                    |  2 +-
 fs/devpts/inode.c                     |  2 +-
 fs/ecryptfs/main.c                    |  2 +-
 fs/efivarfs/super.c                   |  2 +-
 fs/exfat/super.c                      |  4 ++--
 fs/fat/namei_msdos.c                  |  2 +-
 fs/fat/namei_vfat.c                   |  4 ++--
 fs/fuse/inode.c                       |  4 ++--
 fs/gfs2/ops_fstype.c                  |  2 +-
 fs/hfs/super.c                        |  2 +-
 fs/hfsplus/super.c                    |  2 +-
 fs/hostfs/hostfs_kern.c               |  2 +-
 fs/hpfs/super.c                       |  2 +-
 fs/isofs/inode.c                      |  2 +-
 fs/jfs/super.c                        |  2 +-
 fs/kernfs/mount.c                     |  2 +-
 fs/libfs.c                            | 16 ++++++++--------
 fs/nfs/super.c                        |  2 +-
 fs/ntfs3/super.c                      |  3 ++-
 fs/ocfs2/super.c                      |  2 +-
 fs/orangefs/super.c                   |  2 +-
 fs/overlayfs/super.c                  |  2 +-
 fs/smb/client/cifsfs.c                |  4 ++--
 fs/tracefs/inode.c                    |  2 +-
 fs/vboxsf/super.c                     |  2 +-
 include/linux/dcache.h                |  2 ++
 include/linux/fs.h                    |  2 +-
 mm/shmem.c                            |  2 +-
 net/sunrpc/rpc_pipe.c                 |  2 +-
 40 files changed, 69 insertions(+), 53 deletions(-)

(limited to 'include')

diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index 3616d7161dab..b16139e91942 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -1249,3 +1249,10 @@ Using try_lookup_noperm() will require linux/namei.h to be included.
 
 Calling conventions for ->d_automount() have changed; we should *not* grab
 an extra reference to new mount - it should be returned with refcount 1.
+
+---
+
+**mandatory**
+
+If your filesystem sets the default dentry_operations, use set_default_d_op()
+rather than manually setting sb->s_d_op.
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 489db161abc9..5c3dc3efb909 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -135,9 +135,9 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
 		goto release_sb;
 
 	if (v9ses->cache & (CACHE_META|CACHE_LOOSE))
-		sb->s_d_op = &v9fs_cached_dentry_operations;
+		set_default_d_op(sb, &v9fs_cached_dentry_operations);
 	else
-		sb->s_d_op = &v9fs_dentry_operations;
+		set_default_d_op(sb, &v9fs_dentry_operations);
 
 	inode = v9fs_get_new_inode_from_fid(v9ses, fid, sb);
 	if (IS_ERR(inode)) {
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 017c48a80203..fdccdbbfc213 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -397,7 +397,7 @@ static int adfs_fill_super(struct super_block *sb, struct fs_context *fc)
 	if (asb->s_ftsuffix)
 		asb->s_namelen += 4;
 
-	sb->s_d_op = &adfs_dentry_operations;
+	set_default_d_op(sb, &adfs_dentry_operations);
 	root = adfs_iget(sb, &root_obj);
 	sb->s_root = d_make_root(root);
 	if (!sb->s_root) {
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 2fa40337776d..44f8aa883100 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -500,9 +500,9 @@ got_root:
 		return PTR_ERR(root_inode);
 
 	if (affs_test_opt(AFFS_SB(sb)->s_flags, SF_INTL))
-		sb->s_d_op = &affs_intl_dentry_operations;
+		set_default_d_op(sb, &affs_intl_dentry_operations);
 	else
-		sb->s_d_op = &affs_dentry_operations;
+		set_default_d_op(sb, &affs_dentry_operations);
 
 	sb->s_root = d_make_root(root_inode);
 	if (!sb->s_root) {
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 25b306db6992..da407f2d6f0d 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -483,9 +483,9 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx)
 		goto error;
 
 	if (as->dyn_root) {
-		sb->s_d_op = &afs_dynroot_dentry_operations;
+		set_default_d_op(sb, &afs_dynroot_dentry_operations);
 	} else {
-		sb->s_d_op = &afs_fs_dentry_operations;
+		set_default_d_op(sb, &afs_fs_dentry_operations);
 		rcu_assign_pointer(as->volume->sb, sb);
 	}
 
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index ee2edccaef70..f5c16ffba013 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -311,7 +311,7 @@ static int autofs_fill_super(struct super_block *s, struct fs_context *fc)
 	s->s_blocksize_bits = 10;
 	s->s_magic = AUTOFS_SUPER_MAGIC;
 	s->s_op = &autofs_sops;
-	s->s_d_op = &autofs_dentry_operations;
+	set_default_d_op(s, &autofs_dentry_operations);
 	s->s_time_gran = 1;
 
 	/*
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index a0c65adce1ab..ad75d9f8f404 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -950,7 +950,7 @@ static int btrfs_fill_super(struct super_block *sb,
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 	sb->s_magic = BTRFS_SUPER_MAGIC;
 	sb->s_op = &btrfs_super_ops;
-	sb->s_d_op = &btrfs_dentry_operations;
+	set_default_d_op(sb, &btrfs_dentry_operations);
 	sb->s_export_op = &btrfs_export_ops;
 #ifdef CONFIG_FS_VERITY
 	sb->s_vop = &btrfs_verityops;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 2b8438d8a324..c3eb651862c5 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1219,7 +1219,7 @@ static int ceph_set_super(struct super_block *s, struct fs_context *fc)
 	fsc->max_file_size = 1ULL << 40; /* temp value until we get mdsmap */
 
 	s->s_op = &ceph_super_ops;
-	s->s_d_op = &ceph_dentry_ops;
+	set_default_d_op(s, &ceph_dentry_ops);
 	s->s_export_op = &ceph_export_ops;
 
 	s->s_time_gran = 1;
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 6896fce122e1..08450d006016 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -230,7 +230,7 @@ static int coda_fill_super(struct super_block *sb, struct fs_context *fc)
 	sb->s_blocksize_bits = 12;
 	sb->s_magic = CODA_SUPER_MAGIC;
 	sb->s_op = &coda_super_operations;
-	sb->s_d_op = &coda_dentry_operations;
+	set_default_d_op(sb, &coda_dentry_operations);
 	sb->s_time_gran = 1;
 	sb->s_time_min = S64_MIN;
 	sb->s_time_max = S64_MAX;
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index c2d820063ec4..20412eaca972 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -92,7 +92,7 @@ static int configfs_fill_super(struct super_block *sb, struct fs_context *fc)
 	configfs_root_group.cg_item.ci_dentry = root;
 	root->d_fsdata = &configfs_root;
 	sb->s_root = root;
-	sb->s_d_op = &configfs_dentry_ops; /* the rest get that */
+	set_default_d_op(sb, &configfs_dentry_ops); /* the rest get that */
 	return 0;
 }
 
diff --git a/fs/dcache.c b/fs/dcache.c
index bf550d438e40..2ed875558ccc 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1738,7 +1738,7 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
 	INIT_HLIST_HEAD(&dentry->d_children);
 	INIT_HLIST_NODE(&dentry->d_u.d_alias);
 	INIT_HLIST_NODE(&dentry->d_sib);
-	d_set_d_op(dentry, dentry->d_sb->s_d_op);
+	d_set_d_op(dentry, dentry->d_sb->__s_d_op);
 
 	if (dentry->d_op && dentry->d_op->d_init) {
 		err = dentry->d_op->d_init(dentry);
@@ -1821,7 +1821,7 @@ struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name)
 	struct dentry *dentry = __d_alloc(sb, name);
 	if (likely(dentry)) {
 		dentry->d_flags |= DCACHE_NORCU;
-		if (!sb->s_d_op)
+		if (!dentry->d_op)
 			d_set_d_op(dentry, &anon_ops);
 	}
 	return dentry;
@@ -1867,6 +1867,12 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
 }
 EXPORT_SYMBOL(d_set_d_op);
 
+void set_default_d_op(struct super_block *s, const struct dentry_operations *ops)
+{
+	s->__s_d_op = ops;
+}
+EXPORT_SYMBOL(set_default_d_op);
+
 static unsigned d_flags_for_inode(struct inode *inode)
 {
 	unsigned add_flags = DCACHE_REGULAR_TYPE;
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 30c4944e1862..29c5ec382342 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -273,7 +273,7 @@ static int debugfs_fill_super(struct super_block *sb, struct fs_context *fc)
 		return err;
 
 	sb->s_op = &debugfs_super_operations;
-	sb->s_d_op = &debugfs_dops;
+	set_default_d_op(sb, &debugfs_dops);
 
 	debugfs_apply_options(sb);
 
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 9c20d78e41f6..fd17992ee298 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -381,7 +381,7 @@ static int devpts_fill_super(struct super_block *s, struct fs_context *fc)
 	s->s_blocksize_bits = 10;
 	s->s_magic = DEVPTS_SUPER_MAGIC;
 	s->s_op = &devpts_sops;
-	s->s_d_op = &simple_dentry_operations;
+	set_default_d_op(s, &simple_dentry_operations);
 	s->s_time_gran = 1;
 	fsi->sb = s;
 
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 8dd1d7189c3b..45f9ca4465da 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -471,7 +471,7 @@ static int ecryptfs_get_tree(struct fs_context *fc)
 	sbi = NULL;
 	s->s_op = &ecryptfs_sops;
 	s->s_xattr = ecryptfs_xattr_handlers;
-	s->s_d_op = &ecryptfs_dops;
+	set_default_d_op(s, &ecryptfs_dops);
 
 	err = "Reading sb failed";
 	rc = kern_path(fc->source, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path);
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index c900d98bf494..f76d8dfa646b 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -350,7 +350,7 @@ static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc)
 	sb->s_blocksize_bits    = PAGE_SHIFT;
 	sb->s_magic             = EFIVARFS_MAGIC;
 	sb->s_op                = &efivarfs_ops;
-	sb->s_d_op		= &efivarfs_d_ops;
+	set_default_d_op(sb, &efivarfs_d_ops);
 	sb->s_time_gran         = 1;
 
 	if (!efivar_supports_writes())
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index 7ed858937d45..ea5c1334a214 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -667,9 +667,9 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
 	}
 
 	if (sbi->options.utf8)
-		sb->s_d_op = &exfat_utf8_dentry_ops;
+		set_default_d_op(sb, &exfat_utf8_dentry_ops);
 	else
-		sb->s_d_op = &exfat_dentry_ops;
+		set_default_d_op(sb, &exfat_dentry_ops);
 
 	root_inode = new_inode(sb);
 	if (!root_inode) {
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 23e9b9371ec3..0b920ee40a7f 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -646,7 +646,7 @@ static const struct inode_operations msdos_dir_inode_operations = {
 static void setup(struct super_block *sb)
 {
 	MSDOS_SB(sb)->dir_ops = &msdos_dir_inode_operations;
-	sb->s_d_op = &msdos_dentry_operations;
+	set_default_d_op(sb, &msdos_dentry_operations);
 	sb->s_flags |= SB_NOATIME;
 }
 
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index dd910edd2404..5dbc4cbb8fce 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -1187,9 +1187,9 @@ static void setup(struct super_block *sb)
 {
 	MSDOS_SB(sb)->dir_ops = &vfat_dir_inode_operations;
 	if (MSDOS_SB(sb)->options.name_check != 's')
-		sb->s_d_op = &vfat_ci_dentry_ops;
+		set_default_d_op(sb, &vfat_ci_dentry_ops);
 	else
-		sb->s_d_op = &vfat_dentry_ops;
+		set_default_d_op(sb, &vfat_dentry_ops);
 }
 
 static int vfat_fill_super(struct super_block *sb, struct fs_context *fc)
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index eb6177508598..0dd65c0e9e29 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1715,7 +1715,7 @@ static int fuse_fill_super_submount(struct super_block *sb,
 	fi = get_fuse_inode(root);
 	fi->nlookup--;
 
-	sb->s_d_op = &fuse_dentry_operations;
+	set_default_d_op(sb, &fuse_dentry_operations);
 	sb->s_root = d_make_root(root);
 	if (!sb->s_root)
 		return -ENOMEM;
@@ -1850,7 +1850,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
 
 	err = -ENOMEM;
 	root = fuse_get_root_inode(sb, ctx->rootmode);
-	sb->s_d_op = &fuse_dentry_operations;
+	set_default_d_op(sb, &fuse_dentry_operations);
 	root_dentry = d_make_root(root);
 	if (!root_dentry)
 		goto err_dev_free;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 85c491fcf1a3..b568767dba46 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1145,7 +1145,7 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc)
 	sb->s_magic = GFS2_MAGIC;
 	sb->s_op = &gfs2_super_ops;
 
-	sb->s_d_op = &gfs2_dops;
+	set_default_d_op(sb, &gfs2_dops);
 	sb->s_export_op = &gfs2_export_ops;
 	sb->s_qcop = &gfs2_quotactl_ops;
 	sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index fe09c2093a93..388a318297ec 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -365,7 +365,7 @@ static int hfs_fill_super(struct super_block *sb, struct fs_context *fc)
 	if (!root_inode)
 		goto bail_no_root;
 
-	sb->s_d_op = &hfs_dentry_operations;
+	set_default_d_op(sb, &hfs_dentry_operations);
 	res = -ENOMEM;
 	sb->s_root = d_make_root(root_inode);
 	if (!sb->s_root)
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 948b8aaee33e..0caf7aa1c249 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -508,7 +508,7 @@ static int hfsplus_fill_super(struct super_block *sb, struct fs_context *fc)
 		goto out_put_alloc_file;
 	}
 
-	sb->s_d_op = &hfsplus_dentry_operations;
+	set_default_d_op(sb, &hfsplus_dentry_operations);
 	sb->s_root = d_make_root(root);
 	if (!sb->s_root) {
 		err = -ENOMEM;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 702c41317589..1c0f5038e19c 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -933,7 +933,7 @@ static int hostfs_fill_super(struct super_block *sb, struct fs_context *fc)
 	sb->s_blocksize_bits = 10;
 	sb->s_magic = HOSTFS_SUPER_MAGIC;
 	sb->s_op = &hostfs_sbops;
-	sb->s_d_op = &simple_dentry_operations;
+	set_default_d_op(sb, &simple_dentry_operations);
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 	err = super_setup_bdi(sb);
 	if (err)
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 27567920abe4..42b779b4d87f 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -554,7 +554,7 @@ static int hpfs_fill_super(struct super_block *s, struct fs_context *fc)
 	/* Fill superblock stuff */
 	s->s_magic = HPFS_SUPER_MAGIC;
 	s->s_op = &hpfs_sops;
-	s->s_d_op = &hpfs_dentry_operations;
+	set_default_d_op(s, &hpfs_dentry_operations);
 	s->s_time_min =  local_to_gmt(s, 0);
 	s->s_time_max =  local_to_gmt(s, U32_MAX);
 
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index d5da9817df9b..8624393c0d8c 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -939,7 +939,7 @@ root_found:
 	sbi->s_check = opt->check;
 
 	if (table)
-		s->s_d_op = &isofs_dentry_ops[table - 1];
+		set_default_d_op(s, &isofs_dentry_ops[table - 1]);
 
 	/* get the root dentry */
 	s->s_root = d_make_root(inode);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 10368c188c5e..3cfb86c5a36e 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -542,7 +542,7 @@ static int jfs_fill_super(struct super_block *sb, struct fs_context *fc)
 	sb->s_magic = JFS_SUPER_MAGIC;
 
 	if (sbi->mntflag & JFS_OS2)
-		sb->s_d_op = &jfs_ci_dentry_operations;
+		set_default_d_op(sb, &jfs_ci_dentry_operations);
 
 	inode = jfs_iget(sb, ROOT_I);
 	if (IS_ERR(inode)) {
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index c1719b5778a1..e384a69fbece 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -318,7 +318,7 @@ static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *k
 		return -ENOMEM;
 	}
 	sb->s_root = root;
-	sb->s_d_op = &kernfs_dops;
+	set_default_d_op(sb, &kernfs_dops);
 	return 0;
 }
 
diff --git a/fs/libfs.c b/fs/libfs.c
index 9ea0ecc325a8..ab82de070310 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -75,7 +75,7 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned
 {
 	if (dentry->d_name.len > NAME_MAX)
 		return ERR_PTR(-ENAMETOOLONG);
-	if (!dentry->d_sb->s_d_op)
+	if (!dentry->d_op)
 		d_set_d_op(dentry, &simple_dentry_operations);
 
 	if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
@@ -684,7 +684,7 @@ static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc)
 	s->s_root = d_make_root(root);
 	if (!s->s_root)
 		return -ENOMEM;
-	s->s_d_op = ctx->dops;
+	set_default_d_op(s, ctx->dops);
 	return 0;
 }
 
@@ -1950,22 +1950,22 @@ static const struct dentry_operations generic_encrypted_dentry_ops = {
  * @sb: superblock to be configured
  *
  * Filesystems supporting casefolding and/or fscrypt can call this
- * helper at mount-time to configure sb->s_d_op to best set of dentry
- * operations required for the enabled features. The helper must be
- * called after these have been configured, but before the root dentry
- * is created.
+ * helper at mount-time to configure default dentry_operations to the
+ * best set of dentry operations required for the enabled features.
+ * The helper must be called after these have been configured, but
+ * before the root dentry is created.
  */
 void generic_set_sb_d_ops(struct super_block *sb)
 {
 #if IS_ENABLED(CONFIG_UNICODE)
 	if (sb->s_encoding) {
-		sb->s_d_op = &generic_ci_dentry_ops;
+		set_default_d_op(sb, &generic_ci_dentry_ops);
 		return;
 	}
 #endif
 #ifdef CONFIG_FS_ENCRYPTION
 	if (sb->s_cop) {
-		sb->s_d_op = &generic_encrypted_dentry_ops;
+		set_default_d_op(sb, &generic_encrypted_dentry_ops);
 		return;
 	}
 #endif
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 91b5503b6f74..72dee6f3050e 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1183,7 +1183,7 @@ static int nfs_set_super(struct super_block *s, struct fs_context *fc)
 	struct nfs_server *server = fc->s_fs_info;
 	int ret;
 
-	s->s_d_op = server->nfs_client->rpc_ops->dentry_ops;
+	set_default_d_op(s, server->nfs_client->rpc_ops->dentry_ops);
 	ret = set_anon_super(s, server);
 	if (ret == 0)
 		server->s_dev = s->s_dev;
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index 920a1ab47b63..ddff94c091b8 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -1223,7 +1223,8 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
 	sb->s_export_op = &ntfs_export_ops;
 	sb->s_time_gran = NTFS_TIME_GRAN; // 100 nsec
 	sb->s_xattr = ntfs_xattr_handlers;
-	sb->s_d_op = options->nocase ? &ntfs_dentry_ops : NULL;
+	if (options->nocase)
+		set_default_d_op(sb, &ntfs_dentry_ops);
 
 	options->nls = ntfs_load_nls(options->nls_name);
 	if (IS_ERR(options->nls)) {
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 3d2533950bae..53daa4482406 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1962,7 +1962,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
 
 	sb->s_fs_info = osb;
 	sb->s_op = &ocfs2_sops;
-	sb->s_d_op = &ocfs2_dentry_ops;
+	set_default_d_op(sb, &ocfs2_dentry_ops);
 	sb->s_export_op = &ocfs2_export_ops;
 	sb->s_qcop = &dquot_quotactl_sysfile_ops;
 	sb->dq_op = &ocfs2_quota_operations;
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index 64ca9498f550..f3da840758e7 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -416,7 +416,7 @@ static int orangefs_fill_sb(struct super_block *sb,
 	sb->s_xattr = orangefs_xattr_handlers;
 	sb->s_magic = ORANGEFS_SUPER_MAGIC;
 	sb->s_op = &orangefs_s_ops;
-	sb->s_d_op = &orangefs_dentry_operations;
+	set_default_d_op(sb, &orangefs_dentry_operations);
 
 	sb->s_blocksize = PAGE_SIZE;
 	sb->s_blocksize_bits = PAGE_SHIFT;
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index e19940d649ca..efbf0b291551 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1322,7 +1322,7 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
 	if (WARN_ON(fc->user_ns != current_user_ns()))
 		goto out_err;
 
-	sb->s_d_op = &ovl_dentry_operations;
+	set_default_d_op(sb, &ovl_dentry_operations);
 
 	err = -ENOMEM;
 	if (!ofs->creator_cred)
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index 0a5266ecfd15..d4ec73359922 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -260,9 +260,9 @@ cifs_read_super(struct super_block *sb)
 	}
 
 	if (tcon->nocase)
-		sb->s_d_op = &cifs_ci_dentry_ops;
+		set_default_d_op(sb, &cifs_ci_dentry_ops);
 	else
-		sb->s_d_op = &cifs_dentry_ops;
+		set_default_d_op(sb, &cifs_dentry_ops);
 
 	sb->s_root = d_make_root(inode);
 	if (!sb->s_root) {
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index a3fd3cc591bd..c8ca61777323 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -480,7 +480,7 @@ static int tracefs_fill_super(struct super_block *sb, struct fs_context *fc)
 		return err;
 
 	sb->s_op = &tracefs_super_operations;
-	sb->s_d_op = &tracefs_dentry_operations;
+	set_default_d_op(sb, &tracefs_dentry_operations);
 
 	return 0;
 }
diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c
index 0bc96ab6580b..241647b060ee 100644
--- a/fs/vboxsf/super.c
+++ b/fs/vboxsf/super.c
@@ -189,7 +189,7 @@ static int vboxsf_fill_super(struct super_block *sb, struct fs_context *fc)
 	sb->s_blocksize = 1024;
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 	sb->s_op = &vboxsf_super_ops;
-	sb->s_d_op = &vboxsf_dentry_ops;
+	set_default_d_op(sb, &vboxsf_dentry_ops);
 
 	iroot = iget_locked(sb, 0);
 	if (!iroot) {
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 1993e6704552..be7ae058fa90 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -607,4 +607,6 @@ static inline struct dentry *d_next_sibling(const struct dentry *dentry)
 	return hlist_entry_safe(dentry->d_sib.next, struct dentry, d_sib);
 }
 
+void set_default_d_op(struct super_block *, const struct dentry_operations *);
+
 #endif	/* __LINUX_DCACHE_H */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 96c7925a6551..7cd8eaab4d4e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1413,7 +1413,7 @@ struct super_block {
 	 */
 	const char *s_subtype;
 
-	const struct dentry_operations *s_d_op; /* default d_op for dentries */
+	const struct dentry_operations *__s_d_op; /* default d_op for dentries */
 
 	struct shrinker *s_shrink;	/* per-sb shrinker handle */
 
diff --git a/mm/shmem.c b/mm/shmem.c
index 0c5fb4ffa03a..3583508800fc 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -5028,7 +5028,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
 
 	if (ctx->encoding) {
 		sb->s_encoding = ctx->encoding;
-		sb->s_d_op = &shmem_ci_dentry_ops;
+		set_default_d_op(sb, &shmem_ci_dentry_ops);
 		if (ctx->strict_encoding)
 			sb->s_encoding_flags = SB_ENC_STRICT_MODE_FL;
 	}
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 98f78cd55905..f4e880383f67 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -1363,7 +1363,7 @@ rpc_fill_super(struct super_block *sb, struct fs_context *fc)
 	sb->s_blocksize_bits = PAGE_SHIFT;
 	sb->s_magic = RPCAUTH_GSSMAGIC;
 	sb->s_op = &s_ops;
-	sb->s_d_op = &simple_dentry_operations;
+	set_default_d_op(sb, &simple_dentry_operations);
 	sb->s_time_gran = 1;
 
 	inode = rpc_get_inode(sb, S_IFDIR | 0555);
-- 
cgit v1.2.3


From 4ccd065a69df163cd9fe0dd8e0f609f1eeb4723d Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Wed, 4 Jun 2025 17:54:41 +0800
Subject: crypto: ahash - Add support for drivers with no fallback

Some drivers cannot have a fallback, e.g., because the key is held
in hardware.  Allow these to be used with ahash by adding the bit
CRYPTO_ALG_NO_FALLBACK.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Tested-by: Harald Freudenberger <freude@linux.ibm.com>
---
 crypto/ahash.c         | 10 +++++++++-
 include/linux/crypto.h |  3 +++
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/crypto/ahash.c b/crypto/ahash.c
index e10bc2659ae4..bd9e49950201 100644
--- a/crypto/ahash.c
+++ b/crypto/ahash.c
@@ -347,6 +347,9 @@ static int ahash_do_req_chain(struct ahash_request *req,
 	if (crypto_ahash_statesize(tfm) > HASH_MAX_STATESIZE)
 		return -ENOSYS;
 
+	if (!crypto_ahash_need_fallback(tfm))
+		return -ENOSYS;
+
 	{
 		u8 state[HASH_MAX_STATESIZE];
 
@@ -952,6 +955,10 @@ static int ahash_prepare_alg(struct ahash_alg *alg)
 	    base->cra_reqsize > MAX_SYNC_HASH_REQSIZE)
 		return -EINVAL;
 
+	if (base->cra_flags & CRYPTO_ALG_NEED_FALLBACK &&
+	    base->cra_flags & CRYPTO_ALG_NO_FALLBACK)
+		return -EINVAL;
+
 	err = hash_prepare_alg(&alg->halg);
 	if (err)
 		return err;
@@ -960,7 +967,8 @@ static int ahash_prepare_alg(struct ahash_alg *alg)
 	base->cra_flags |= CRYPTO_ALG_TYPE_AHASH;
 
 	if ((base->cra_flags ^ CRYPTO_ALG_REQ_VIRT) &
-	    (CRYPTO_ALG_ASYNC | CRYPTO_ALG_REQ_VIRT))
+	    (CRYPTO_ALG_ASYNC | CRYPTO_ALG_REQ_VIRT) &&
+	    !(base->cra_flags & CRYPTO_ALG_NO_FALLBACK))
 		base->cra_flags |= CRYPTO_ALG_NEED_FALLBACK;
 
 	if (!alg->setkey)
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index b50f1954d1bb..a2137e19be7d 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -136,6 +136,9 @@
 /* Set if the algorithm supports virtual addresses. */
 #define CRYPTO_ALG_REQ_VIRT		0x00040000
 
+/* Set if the algorithm cannot have a fallback (e.g., phmac). */
+#define CRYPTO_ALG_NO_FALLBACK		0x00080000
+
 /* The high bits 0xff000000 are reserved for type-specific flags. */
 
 /*
-- 
cgit v1.2.3


From 19ac3579af14e17c56c5b8a10979c6ca4aee6e38 Mon Sep 17 00:00:00 2001
From: Satya Priya Kakitapalli <quic_skakitap@quicinc.com>
Date: Mon, 12 May 2025 10:34:36 +0530
Subject: dt-bindings: clock: qcom: Add missing bindings on gcc-sc8180x

The multi-media AHB clocks are needed to create HW dependency in
the multimedia CC dt blocks and avoid any issues. They were not
defined in the initial bindings. Add all the missing clock bindings
for gcc-sc8180x.

Signed-off-by: Satya Priya Kakitapalli <quic_skakitap@quicinc.com>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20250512-sc8180x-camcc-support-v4-1-8fb1d3265f52@quicinc.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 include/dt-bindings/clock/qcom,gcc-sc8180x.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/qcom,gcc-sc8180x.h b/include/dt-bindings/clock/qcom,gcc-sc8180x.h
index e364006aa6ea..b9d8438a15ff 100644
--- a/include/dt-bindings/clock/qcom,gcc-sc8180x.h
+++ b/include/dt-bindings/clock/qcom,gcc-sc8180x.h
@@ -249,6 +249,16 @@
 #define GCC_UFS_MEM_CLKREF_EN					239
 #define GCC_UFS_CARD_CLKREF_EN					240
 #define GPLL9							241
+#define GCC_CAMERA_AHB_CLK					242
+#define GCC_CAMERA_XO_CLK					243
+#define GCC_CPUSS_DVM_BUS_CLK					244
+#define GCC_CPUSS_GNOC_CLK					245
+#define GCC_DISP_AHB_CLK					246
+#define GCC_DISP_XO_CLK						247
+#define GCC_GPU_CFG_AHB_CLK					248
+#define GCC_NPU_CFG_AHB_CLK					249
+#define GCC_VIDEO_AHB_CLK					250
+#define GCC_VIDEO_XO_CLK					251
 
 #define GCC_EMAC_BCR						0
 #define GCC_GPU_BCR						1
-- 
cgit v1.2.3


From b5975ce4615fc075b4a135b867988f654370a268 Mon Sep 17 00:00:00 2001
From: Satya Priya Kakitapalli <quic_skakitap@quicinc.com>
Date: Mon, 12 May 2025 10:34:37 +0530
Subject: dt-bindings: clock: Add Qualcomm SC8180X Camera clock controller

Add device tree bindings for the camera clock controller on
Qualcomm SC8180X platform.

Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Satya Priya Kakitapalli <quic_skakitap@quicinc.com>
Link: https://lore.kernel.org/r/20250512-sc8180x-camcc-support-v4-2-8fb1d3265f52@quicinc.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 .../bindings/clock/qcom,sc8180x-camcc.yaml         |  67 ++++++++
 include/dt-bindings/clock/qcom,sc8180x-camcc.h     | 181 +++++++++++++++++++++
 2 files changed, 248 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/clock/qcom,sc8180x-camcc.yaml
 create mode 100644 include/dt-bindings/clock/qcom,sc8180x-camcc.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/qcom,sc8180x-camcc.yaml b/Documentation/devicetree/bindings/clock/qcom,sc8180x-camcc.yaml
new file mode 100644
index 000000000000..477ee687520e
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,sc8180x-camcc.yaml
@@ -0,0 +1,67 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,sc8180x-camcc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Camera Clock & Reset Controller on SC8180X
+
+maintainers:
+  - Satya Priya Kakitapalli <quic_skakitap@quicinc.com>
+
+description: |
+  Qualcomm camera clock control module provides the clocks, resets and
+  power domains on SC8180X.
+
+  See also: include/dt-bindings/clock/qcom,sc8180x-camcc.h
+
+properties:
+  compatible:
+    const: qcom,sc8180x-camcc
+
+  clocks:
+    items:
+      - description: Camera AHB clock from GCC
+      - description: Board XO source
+      - description: Sleep clock source
+
+  power-domains:
+    maxItems: 1
+    description:
+      A phandle and PM domain specifier for the MMCX power domain.
+
+  required-opps:
+    maxItems: 1
+    description:
+      A phandle to an OPP node describing required MMCX performance point.
+
+required:
+  - compatible
+  - clocks
+  - power-domains
+  - required-opps
+
+allOf:
+  - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/clock/qcom,gcc-sc8180x.h>
+    #include <dt-bindings/clock/qcom,rpmh.h>
+    #include <dt-bindings/power/qcom-rpmpd.h>
+    clock-controller@ad00000 {
+      compatible = "qcom,sc8180x-camcc";
+      reg = <0x0ad00000 0x20000>;
+      clocks = <&gcc GCC_CAMERA_AHB_CLK>,
+               <&rpmhcc RPMH_CXO_CLK>,
+               <&sleep_clk>;
+      power-domains = <&rpmhpd SC8180X_MMCX>;
+      required-opps = <&rpmhpd_opp_low_svs>;
+
+      #clock-cells = <1>;
+      #reset-cells = <1>;
+      #power-domain-cells = <1>;
+    };
+...
diff --git a/include/dt-bindings/clock/qcom,sc8180x-camcc.h b/include/dt-bindings/clock/qcom,sc8180x-camcc.h
new file mode 100644
index 000000000000..3e57b80f65e8
--- /dev/null
+++ b/include/dt-bindings/clock/qcom,sc8180x-camcc.h
@@ -0,0 +1,181 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (c) 2025, Qualcomm Innovation Center, Inc. All rights reserved.
+ */
+
+#ifndef _DT_BINDINGS_CLK_QCOM_CAM_CC_SC8180X_H
+#define _DT_BINDINGS_CLK_QCOM_CAM_CC_SC8180X_H
+
+/* CAM_CC clocks */
+#define CAM_CC_BPS_AHB_CLK					0
+#define CAM_CC_BPS_AREG_CLK					1
+#define CAM_CC_BPS_AXI_CLK					2
+#define CAM_CC_BPS_CLK						3
+#define CAM_CC_BPS_CLK_SRC					4
+#define CAM_CC_CAMNOC_AXI_CLK					5
+#define CAM_CC_CAMNOC_AXI_CLK_SRC				6
+#define CAM_CC_CAMNOC_DCD_XO_CLK				7
+#define CAM_CC_CCI_0_CLK					8
+#define CAM_CC_CCI_0_CLK_SRC					9
+#define CAM_CC_CCI_1_CLK					10
+#define CAM_CC_CCI_1_CLK_SRC					11
+#define CAM_CC_CCI_2_CLK					12
+#define CAM_CC_CCI_2_CLK_SRC					13
+#define CAM_CC_CCI_3_CLK					14
+#define CAM_CC_CCI_3_CLK_SRC					15
+#define CAM_CC_CORE_AHB_CLK					16
+#define CAM_CC_CPAS_AHB_CLK					17
+#define CAM_CC_CPHY_RX_CLK_SRC					18
+#define CAM_CC_CSI0PHYTIMER_CLK					19
+#define CAM_CC_CSI0PHYTIMER_CLK_SRC				20
+#define CAM_CC_CSI1PHYTIMER_CLK					21
+#define CAM_CC_CSI1PHYTIMER_CLK_SRC				22
+#define CAM_CC_CSI2PHYTIMER_CLK					23
+#define CAM_CC_CSI2PHYTIMER_CLK_SRC				24
+#define CAM_CC_CSI3PHYTIMER_CLK					25
+#define CAM_CC_CSI3PHYTIMER_CLK_SRC				26
+#define CAM_CC_CSIPHY0_CLK					27
+#define CAM_CC_CSIPHY1_CLK					28
+#define CAM_CC_CSIPHY2_CLK					29
+#define CAM_CC_CSIPHY3_CLK					30
+#define CAM_CC_FAST_AHB_CLK_SRC					31
+#define CAM_CC_FD_CORE_CLK					32
+#define CAM_CC_FD_CORE_CLK_SRC					33
+#define CAM_CC_FD_CORE_UAR_CLK					34
+#define CAM_CC_ICP_AHB_CLK					35
+#define CAM_CC_ICP_CLK						36
+#define CAM_CC_ICP_CLK_SRC					37
+#define CAM_CC_IFE_0_AXI_CLK					38
+#define CAM_CC_IFE_0_CLK					39
+#define CAM_CC_IFE_0_CLK_SRC					40
+#define CAM_CC_IFE_0_CPHY_RX_CLK				41
+#define CAM_CC_IFE_0_CSID_CLK					42
+#define CAM_CC_IFE_0_CSID_CLK_SRC				43
+#define CAM_CC_IFE_0_DSP_CLK					44
+#define CAM_CC_IFE_1_AXI_CLK					45
+#define CAM_CC_IFE_1_CLK					46
+#define CAM_CC_IFE_1_CLK_SRC					47
+#define CAM_CC_IFE_1_CPHY_RX_CLK				48
+#define CAM_CC_IFE_1_CSID_CLK					49
+#define CAM_CC_IFE_1_CSID_CLK_SRC				50
+#define CAM_CC_IFE_1_DSP_CLK					51
+#define CAM_CC_IFE_2_AXI_CLK					52
+#define CAM_CC_IFE_2_CLK					53
+#define CAM_CC_IFE_2_CLK_SRC					54
+#define CAM_CC_IFE_2_CPHY_RX_CLK				55
+#define CAM_CC_IFE_2_CSID_CLK					56
+#define CAM_CC_IFE_2_CSID_CLK_SRC				57
+#define CAM_CC_IFE_2_DSP_CLK					58
+#define CAM_CC_IFE_3_AXI_CLK					59
+#define CAM_CC_IFE_3_CLK					60
+#define CAM_CC_IFE_3_CLK_SRC					61
+#define CAM_CC_IFE_3_CPHY_RX_CLK				62
+#define CAM_CC_IFE_3_CSID_CLK					63
+#define CAM_CC_IFE_3_CSID_CLK_SRC				64
+#define CAM_CC_IFE_3_DSP_CLK					65
+#define CAM_CC_IFE_LITE_0_CLK					66
+#define CAM_CC_IFE_LITE_0_CLK_SRC				67
+#define CAM_CC_IFE_LITE_0_CPHY_RX_CLK				68
+#define CAM_CC_IFE_LITE_0_CSID_CLK				69
+#define CAM_CC_IFE_LITE_0_CSID_CLK_SRC				70
+#define CAM_CC_IFE_LITE_1_CLK					71
+#define CAM_CC_IFE_LITE_1_CLK_SRC				72
+#define CAM_CC_IFE_LITE_1_CPHY_RX_CLK				73
+#define CAM_CC_IFE_LITE_1_CSID_CLK				74
+#define CAM_CC_IFE_LITE_1_CSID_CLK_SRC				75
+#define CAM_CC_IFE_LITE_2_CLK					76
+#define CAM_CC_IFE_LITE_2_CLK_SRC				77
+#define CAM_CC_IFE_LITE_2_CPHY_RX_CLK				78
+#define CAM_CC_IFE_LITE_2_CSID_CLK				79
+#define CAM_CC_IFE_LITE_2_CSID_CLK_SRC				80
+#define CAM_CC_IFE_LITE_3_CLK					81
+#define CAM_CC_IFE_LITE_3_CLK_SRC				82
+#define CAM_CC_IFE_LITE_3_CPHY_RX_CLK				83
+#define CAM_CC_IFE_LITE_3_CSID_CLK				84
+#define CAM_CC_IFE_LITE_3_CSID_CLK_SRC				85
+#define CAM_CC_IPE_0_AHB_CLK					86
+#define CAM_CC_IPE_0_AREG_CLK					87
+#define CAM_CC_IPE_0_AXI_CLK					88
+#define CAM_CC_IPE_0_CLK					89
+#define CAM_CC_IPE_0_CLK_SRC					90
+#define CAM_CC_IPE_1_AHB_CLK					91
+#define CAM_CC_IPE_1_AREG_CLK					92
+#define CAM_CC_IPE_1_AXI_CLK					93
+#define CAM_CC_IPE_1_CLK					94
+#define CAM_CC_JPEG_CLK						95
+#define CAM_CC_JPEG_CLK_SRC					96
+#define CAM_CC_LRME_CLK						97
+#define CAM_CC_LRME_CLK_SRC					98
+#define CAM_CC_MCLK0_CLK					99
+#define CAM_CC_MCLK0_CLK_SRC					100
+#define CAM_CC_MCLK1_CLK					101
+#define CAM_CC_MCLK1_CLK_SRC					102
+#define CAM_CC_MCLK2_CLK					103
+#define CAM_CC_MCLK2_CLK_SRC					104
+#define CAM_CC_MCLK3_CLK					105
+#define CAM_CC_MCLK3_CLK_SRC					106
+#define CAM_CC_MCLK4_CLK					107
+#define CAM_CC_MCLK4_CLK_SRC					108
+#define CAM_CC_MCLK5_CLK					109
+#define CAM_CC_MCLK5_CLK_SRC					110
+#define CAM_CC_MCLK6_CLK					111
+#define CAM_CC_MCLK6_CLK_SRC					112
+#define CAM_CC_MCLK7_CLK					113
+#define CAM_CC_MCLK7_CLK_SRC					114
+#define CAM_CC_PLL0						115
+#define CAM_CC_PLL0_OUT_EVEN					116
+#define CAM_CC_PLL0_OUT_ODD					117
+#define CAM_CC_PLL1						118
+#define CAM_CC_PLL2						119
+#define CAM_CC_PLL2_OUT_MAIN					120
+#define CAM_CC_PLL3						121
+#define CAM_CC_PLL4						122
+#define CAM_CC_PLL5						123
+#define CAM_CC_PLL6						124
+#define CAM_CC_SLOW_AHB_CLK_SRC					125
+#define CAM_CC_XO_CLK_SRC					126
+
+
+/* CAM_CC power domains */
+#define BPS_GDSC						0
+#define IFE_0_GDSC						1
+#define IFE_1_GDSC						2
+#define IFE_2_GDSC						3
+#define IFE_3_GDSC						4
+#define IPE_0_GDSC						5
+#define IPE_1_GDSC						6
+#define TITAN_TOP_GDSC						7
+
+/* CAM_CC resets */
+#define CAM_CC_BPS_BCR						0
+#define CAM_CC_CAMNOC_BCR					1
+#define CAM_CC_CCI_BCR						2
+#define CAM_CC_CPAS_BCR						3
+#define CAM_CC_CSI0PHY_BCR					4
+#define CAM_CC_CSI1PHY_BCR					5
+#define CAM_CC_CSI2PHY_BCR					6
+#define CAM_CC_CSI3PHY_BCR					7
+#define CAM_CC_FD_BCR						8
+#define CAM_CC_ICP_BCR						9
+#define CAM_CC_IFE_0_BCR					10
+#define CAM_CC_IFE_1_BCR					11
+#define CAM_CC_IFE_2_BCR					12
+#define CAM_CC_IFE_3_BCR					13
+#define CAM_CC_IFE_LITE_0_BCR					14
+#define CAM_CC_IFE_LITE_1_BCR					15
+#define CAM_CC_IFE_LITE_2_BCR					16
+#define CAM_CC_IFE_LITE_3_BCR					17
+#define CAM_CC_IPE_0_BCR					18
+#define CAM_CC_IPE_1_BCR					19
+#define CAM_CC_JPEG_BCR						20
+#define CAM_CC_LRME_BCR						21
+#define CAM_CC_MCLK0_BCR					22
+#define CAM_CC_MCLK1_BCR					23
+#define CAM_CC_MCLK2_BCR					24
+#define CAM_CC_MCLK3_BCR					25
+#define CAM_CC_MCLK4_BCR					26
+#define CAM_CC_MCLK5_BCR					27
+#define CAM_CC_MCLK6_BCR					28
+#define CAM_CC_MCLK7_BCR					29
+
+#endif
-- 
cgit v1.2.3


From 0b0cae7119a0ec9449d7261b5e672a5fed765068 Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Date: Tue, 3 Jun 2025 14:14:43 +0300
Subject: x86/its: move its_pages array to struct mod_arch_specific

The pages with ITS thunks allocated for modules are tracked by an
array in 'struct module'.

Since this is a very architecture-specific data structure, move it to
'struct mod_arch_specific'.

No functional changes.

Fixes: 872df34d7c51 ("x86/its: Use dynamic thunks for indirect branches")
Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20250603111446.2609381-4-rppt@kernel.org
---
 arch/x86/include/asm/module.h |  8 ++++++++
 arch/x86/kernel/alternative.c | 19 ++++++++++---------
 include/linux/module.h        |  5 -----
 3 files changed, 18 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h
index e988bac0a4a1..3c2de4ce3b10 100644
--- a/arch/x86/include/asm/module.h
+++ b/arch/x86/include/asm/module.h
@@ -5,12 +5,20 @@
 #include <asm-generic/module.h>
 #include <asm/orc_types.h>
 
+struct its_array {
+#ifdef CONFIG_MITIGATION_ITS
+	void **pages;
+	int num;
+#endif
+};
+
 struct mod_arch_specific {
 #ifdef CONFIG_UNWINDER_ORC
 	unsigned int num_orcs;
 	int *orc_unwind_ip;
 	struct orc_entry *orc_unwind;
 #endif
+	struct its_array its_pages;
 };
 
 #endif /* _ASM_X86_MODULE_H */
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index ecfe7b497cad..b50fe6ce4655 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -173,8 +173,8 @@ void its_fini_mod(struct module *mod)
 	its_page = NULL;
 	mutex_unlock(&text_mutex);
 
-	for (int i = 0; i < mod->its_num_pages; i++) {
-		void *page = mod->its_page_array[i];
+	for (int i = 0; i < mod->arch.its_pages.num; i++) {
+		void *page = mod->arch.its_pages.pages[i];
 		execmem_restore_rox(page, PAGE_SIZE);
 	}
 }
@@ -184,11 +184,11 @@ void its_free_mod(struct module *mod)
 	if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
 		return;
 
-	for (int i = 0; i < mod->its_num_pages; i++) {
-		void *page = mod->its_page_array[i];
+	for (int i = 0; i < mod->arch.its_pages.num; i++) {
+		void *page = mod->arch.its_pages.pages[i];
 		execmem_free(page);
 	}
-	kfree(mod->its_page_array);
+	kfree(mod->arch.its_pages.pages);
 }
 #endif /* CONFIG_MODULES */
 
@@ -201,14 +201,15 @@ static void *its_alloc(void)
 
 #ifdef CONFIG_MODULES
 	if (its_mod) {
-		void *tmp = krealloc(its_mod->its_page_array,
-				     (its_mod->its_num_pages+1) * sizeof(void *),
+		struct its_array *pages = &its_mod->arch.its_pages;
+		void *tmp = krealloc(pages->pages,
+				     (pages->num+1) * sizeof(void *),
 				     GFP_KERNEL);
 		if (!tmp)
 			return NULL;
 
-		its_mod->its_page_array = tmp;
-		its_mod->its_page_array[its_mod->its_num_pages++] = page;
+		pages->pages = tmp;
+		pages->pages[pages->num++] = page;
 
 		execmem_make_temp_rw(page, PAGE_SIZE);
 	}
diff --git a/include/linux/module.h b/include/linux/module.h
index 92e1420fccdf..5faa1fb1f4b4 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -586,11 +586,6 @@ struct module {
 	atomic_t refcnt;
 #endif
 
-#ifdef CONFIG_MITIGATION_ITS
-	int its_num_pages;
-	void **its_page_array;
-#endif
-
 #ifdef CONFIG_CONSTRUCTORS
 	/* Constructor functions. */
 	ctor_fn_t *ctors;
-- 
cgit v1.2.3


From 7cd9a11dd0c3d1dd225795ed1b5b53132888e7b5 Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Date: Tue, 3 Jun 2025 14:14:45 +0300
Subject: Revert "mm/execmem: Unify early execmem_cache behaviour"

Commit d6d1e3e6580c ("mm/execmem: Unify early execmem_cache behaviour")
changed the early behaviour of the execmem ROX cache to allow its usage
in early x86 code that allocates text pages when CONFIG_MITIGATION_ITS
is enabled.

The permission management of the pages allocated from execmem for ITS
mitigation is now completely contained in arch/x86/kernel/alternative.c
and therefore there is no need to special case early allocations in
execmem.

This reverts commit d6d1e3e6580ca35071ad474381f053cbf1fb6414.

Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20250603111446.2609381-6-rppt@kernel.org
---
 arch/x86/mm/init_32.c   |  3 ---
 arch/x86/mm/init_64.c   |  3 ---
 include/linux/execmem.h |  8 +-------
 mm/execmem.c            | 40 +++-------------------------------------
 4 files changed, 4 insertions(+), 50 deletions(-)

(limited to 'include')

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 607d6a2e66e2..8a34fff6ab2b 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -30,7 +30,6 @@
 #include <linux/initrd.h>
 #include <linux/cpumask.h>
 #include <linux/gfp.h>
-#include <linux/execmem.h>
 
 #include <asm/asm.h>
 #include <asm/bios_ebda.h>
@@ -749,8 +748,6 @@ void mark_rodata_ro(void)
 	pr_info("Write protecting kernel text and read-only data: %luk\n",
 		size >> 10);
 
-	execmem_cache_make_ro();
-
 	kernel_set_to_readonly = 1;
 
 #ifdef CONFIG_CPA_DEBUG
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ee66fae9ebcc..fdb6cab524f0 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -34,7 +34,6 @@
 #include <linux/gfp.h>
 #include <linux/kcore.h>
 #include <linux/bootmem_info.h>
-#include <linux/execmem.h>
 
 #include <asm/processor.h>
 #include <asm/bios_ebda.h>
@@ -1392,8 +1391,6 @@ void mark_rodata_ro(void)
 	       (end - start) >> 10);
 	set_memory_ro(start, (end - start) >> PAGE_SHIFT);
 
-	execmem_cache_make_ro();
-
 	kernel_set_to_readonly = 1;
 
 	/*
diff --git a/include/linux/execmem.h b/include/linux/execmem.h
index ca42d5e46ccc..3be35680a54f 100644
--- a/include/linux/execmem.h
+++ b/include/linux/execmem.h
@@ -54,7 +54,7 @@ enum execmem_range_flags {
 	EXECMEM_ROX_CACHE	= (1 << 1),
 };
 
-#if defined(CONFIG_ARCH_HAS_EXECMEM_ROX) && defined(CONFIG_EXECMEM)
+#ifdef CONFIG_ARCH_HAS_EXECMEM_ROX
 /**
  * execmem_fill_trapping_insns - set memory to contain instructions that
  *				 will trap
@@ -94,15 +94,9 @@ int execmem_make_temp_rw(void *ptr, size_t size);
  * Return: 0 on success or negative error code on failure.
  */
 int execmem_restore_rox(void *ptr, size_t size);
-
-/*
- * Called from mark_readonly(), where the system transitions to ROX.
- */
-void execmem_cache_make_ro(void);
 #else
 static inline int execmem_make_temp_rw(void *ptr, size_t size) { return 0; }
 static inline int execmem_restore_rox(void *ptr, size_t size) { return 0; }
-static inline void execmem_cache_make_ro(void) { }
 #endif
 
 /**
diff --git a/mm/execmem.c b/mm/execmem.c
index 9720ac2dfa41..2b683e7d864d 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -254,34 +254,6 @@ out_unlock:
 	return ptr;
 }
 
-static bool execmem_cache_rox = false;
-
-void execmem_cache_make_ro(void)
-{
-	struct maple_tree *free_areas = &execmem_cache.free_areas;
-	struct maple_tree *busy_areas = &execmem_cache.busy_areas;
-	MA_STATE(mas_free, free_areas, 0, ULONG_MAX);
-	MA_STATE(mas_busy, busy_areas, 0, ULONG_MAX);
-	struct mutex *mutex = &execmem_cache.mutex;
-	void *area;
-
-	execmem_cache_rox = true;
-
-	mutex_lock(mutex);
-
-	mas_for_each(&mas_free, area, ULONG_MAX) {
-		unsigned long pages = mas_range_len(&mas_free) >> PAGE_SHIFT;
-		set_memory_ro(mas_free.index, pages);
-	}
-
-	mas_for_each(&mas_busy, area, ULONG_MAX) {
-		unsigned long pages = mas_range_len(&mas_busy) >> PAGE_SHIFT;
-		set_memory_ro(mas_busy.index, pages);
-	}
-
-	mutex_unlock(mutex);
-}
-
 static int execmem_cache_populate(struct execmem_range *range, size_t size)
 {
 	unsigned long vm_flags = VM_ALLOW_HUGE_VMAP;
@@ -302,15 +274,9 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size)
 	/* fill memory with instructions that will trap */
 	execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true);
 
-	if (execmem_cache_rox) {
-		err = set_memory_rox((unsigned long)p, vm->nr_pages);
-		if (err)
-			goto err_free_mem;
-	} else {
-		err = set_memory_x((unsigned long)p, vm->nr_pages);
-		if (err)
-			goto err_free_mem;
-	}
+	err = set_memory_rox((unsigned long)p, vm->nr_pages);
+	if (err)
+		goto err_free_mem;
 
 	err = execmem_cache_add(p, alloc_size);
 	if (err)
-- 
cgit v1.2.3


From 6a9e2fb1bab53b54d02714a2ee3c6612d19629ce Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 6 Jun 2025 11:45:07 +0200
Subject: nsfs: move root inode number to uapi

Userspace relies on the root inode numbers to identify the initial
namespaces. That's already a hard dependency. So we cannot change that
anymore. Move the initial inode numbers to a public header.

Link: https://github.com/systemd/systemd/commit/d293fade24b34ccc2f5716b0ff5513e9533cf0c4
Link: https://lore.kernel.org/20250606-work-nsfs-v1-1-b8749c9a8844@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/proc_ns.h   | 13 +++++++------
 include/uapi/linux/nsfs.h |  9 +++++++++
 2 files changed, 16 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index 5ea470eb4d76..e77a37b23ca7 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -6,6 +6,7 @@
 #define _LINUX_PROC_NS_H
 
 #include <linux/ns_common.h>
+#include <uapi/linux/nsfs.h>
 
 struct pid_namespace;
 struct nsset;
@@ -40,12 +41,12 @@ extern const struct proc_ns_operations timens_for_children_operations;
  */
 enum {
 	PROC_ROOT_INO		= 1,
-	PROC_IPC_INIT_INO	= 0xEFFFFFFFU,
-	PROC_UTS_INIT_INO	= 0xEFFFFFFEU,
-	PROC_USER_INIT_INO	= 0xEFFFFFFDU,
-	PROC_PID_INIT_INO	= 0xEFFFFFFCU,
-	PROC_CGROUP_INIT_INO	= 0xEFFFFFFBU,
-	PROC_TIME_INIT_INO	= 0xEFFFFFFAU,
+	PROC_IPC_INIT_INO	= IPC_NS_INIT_INO,
+	PROC_UTS_INIT_INO	= UTS_NS_INIT_INO,
+	PROC_USER_INIT_INO	= USER_NS_INIT_INO,
+	PROC_PID_INIT_INO	= PID_NS_INIT_INO,
+	PROC_CGROUP_INIT_INO	= CGROUP_NS_INIT_INO,
+	PROC_TIME_INIT_INO	= TIME_NS_INIT_INO,
 };
 
 #ifdef CONFIG_PROC_FS
diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h
index 34127653fd00..6683e7ca3996 100644
--- a/include/uapi/linux/nsfs.h
+++ b/include/uapi/linux/nsfs.h
@@ -42,4 +42,13 @@ struct mnt_ns_info {
 /* Get previous namespace. */
 #define NS_MNT_GET_PREV		_IOR(NSIO, 12, struct mnt_ns_info)
 
+enum init_ns_ino {
+	IPC_NS_INIT_INO		= 0xEFFFFFFFU,
+	UTS_NS_INIT_INO		= 0xEFFFFFFEU,
+	USER_NS_INIT_INO	= 0xEFFFFFFDU,
+	PID_NS_INIT_INO		= 0xEFFFFFFCU,
+	CGROUP_NS_INIT_INO	= 0xEFFFFFFBU,
+	TIME_NS_INIT_INO	= 0xEFFFFFFAU,
+};
+
 #endif /* __LINUX_NSFS_H */
-- 
cgit v1.2.3


From 9b0240b3ccc325c7a96cf362877180bc9e10d546 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 6 Jun 2025 11:45:08 +0200
Subject: netns: use stable inode number for initial net ns

Apart from the network and mount namespaces, all other namespaces expose
a stable inode number and userspace has been relying on that for a very
long time now. It's a very heavily used API. Align the network
namespace and use a stable inode number from the reserved procfs inode
number space so this is consistent across all namespaces.

Link: https://lore.kernel.org/20250606-work-nsfs-v1-2-b8749c9a8844@kernel.org
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/proc_ns.h   | 1 +
 include/uapi/linux/nsfs.h | 1 +
 net/core/net_namespace.c  | 8 ++++++++
 3 files changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index e77a37b23ca7..3ff0bd381704 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -47,6 +47,7 @@ enum {
 	PROC_PID_INIT_INO	= PID_NS_INIT_INO,
 	PROC_CGROUP_INIT_INO	= CGROUP_NS_INIT_INO,
 	PROC_TIME_INIT_INO	= TIME_NS_INIT_INO,
+	PROC_NET_INIT_INO	= NET_NS_INIT_INO,
 };
 
 #ifdef CONFIG_PROC_FS
diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h
index 6683e7ca3996..393778489d85 100644
--- a/include/uapi/linux/nsfs.h
+++ b/include/uapi/linux/nsfs.h
@@ -49,6 +49,7 @@ enum init_ns_ino {
 	PID_NS_INIT_INO		= 0xEFFFFFFCU,
 	CGROUP_NS_INIT_INO	= 0xEFFFFFFBU,
 	TIME_NS_INIT_INO	= 0xEFFFFFFAU,
+	NET_NS_INIT_INO		= 0xEFFFFFF9U,
 };
 
 #endif /* __LINUX_NSFS_H */
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index ae54f26709ca..03cf87d3b380 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -796,11 +796,19 @@ static __net_init int net_ns_net_init(struct net *net)
 #ifdef CONFIG_NET_NS
 	net->ns.ops = &netns_operations;
 #endif
+	if (net == &init_net) {
+		net->ns.inum = PROC_NET_INIT_INO;
+		return 0;
+	}
 	return ns_alloc_inum(&net->ns);
 }
 
 static __net_exit void net_ns_net_exit(struct net *net)
 {
+	/*
+	 * Initial network namespace doesn't exit so we don't need any
+	 * special checks here.
+	 */
 	ns_free_inum(&net->ns);
 }
 
-- 
cgit v1.2.3


From 7f4f229195b73606ded77e56943f463b78adf635 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 6 Jun 2025 11:45:09 +0200
Subject: mntns: use stable inode number for initial mount ns

Apart from the network and mount namespaces, all other namespaces expose
a stable inode number and userspace has been relying on that for a very
long time now. It's a very heavily used API. Align the mount
namespace and use a stable inode number from the reserved procfs inode
number space so this is consistent across all namespaces.

Link: https://lore.kernel.org/20250606-work-nsfs-v1-3-b8749c9a8844@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c            | 4 +++-
 include/linux/proc_ns.h   | 1 +
 include/uapi/linux/nsfs.h | 1 +
 3 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/fs/namespace.c b/fs/namespace.c
index e13d9ab4f564..7ca4612c7ae9 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -6203,9 +6203,11 @@ static void __init init_mount_tree(void)
 	if (IS_ERR(mnt))
 		panic("Can't create rootfs");
 
-	ns = alloc_mnt_ns(&init_user_ns, false);
+	ns = alloc_mnt_ns(&init_user_ns, true);
 	if (IS_ERR(ns))
 		panic("Can't allocate initial namespace");
+	ns->seq = atomic64_inc_return(&mnt_ns_seq);
+	ns->ns.inum = PROC_MNT_INIT_INO;
 	m = real_mount(mnt);
 	ns->root = m;
 	ns->nr_mounts = 1;
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index 3ff0bd381704..6258455e49a4 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -48,6 +48,7 @@ enum {
 	PROC_CGROUP_INIT_INO	= CGROUP_NS_INIT_INO,
 	PROC_TIME_INIT_INO	= TIME_NS_INIT_INO,
 	PROC_NET_INIT_INO	= NET_NS_INIT_INO,
+	PROC_MNT_INIT_INO	= MNT_NS_INIT_INO,
 };
 
 #ifdef CONFIG_PROC_FS
diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h
index 393778489d85..97d8d80d139f 100644
--- a/include/uapi/linux/nsfs.h
+++ b/include/uapi/linux/nsfs.h
@@ -50,6 +50,7 @@ enum init_ns_ino {
 	CGROUP_NS_INIT_INO	= 0xEFFFFFFBU,
 	TIME_NS_INIT_INO	= 0xEFFFFFFAU,
 	NET_NS_INIT_INO		= 0xEFFFFFF9U,
+	MNT_NS_INIT_INO		= 0xEFFFFFF8U,
 };
 
 #endif /* __LINUX_NSFS_H */
-- 
cgit v1.2.3


From 6bdd3a01fe4627ad7a562ba38eb759eba715b671 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 10 Jun 2025 16:00:20 +0200
Subject: fs: add missing values to TRACE_IOCB_STRINGS

Make sure all values are covered.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/20250610140020.2227932-1-hch@lst.de
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 96c7925a6551..d27c402f1162 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -399,7 +399,9 @@ struct readahead_control;
 	{ IOCB_WAITQ,		"WAITQ" }, \
 	{ IOCB_NOIO,		"NOIO" }, \
 	{ IOCB_ALLOC_CACHE,	"ALLOC_CACHE" }, \
-	{ IOCB_DIO_CALLER_COMP,	"CALLER_COMP" }
+	{ IOCB_DIO_CALLER_COMP,	"CALLER_COMP" }, \
+	{ IOCB_AIO_RW,		"AIO_RW" }, \
+	{ IOCB_HAS_METADATA,	"AIO_HAS_METADATA" }
 
 struct kiocb {
 	struct file		*ki_filp;
-- 
cgit v1.2.3


From 2fa8bf42c50582c7302918474aae8c52b59e7910 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 23 Feb 2025 19:53:00 -0500
Subject: set_default_d_op(): calculate the matching value for ->d_flags

... and store it in ->s_d_flags, to be used by __d_alloc()

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c        | 6 ++++--
 include/linux/fs.h | 1 +
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/fs/dcache.c b/fs/dcache.c
index 27e6d2f36973..7519c5f66f79 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1731,14 +1731,14 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
 	dentry->d_inode = NULL;
 	dentry->d_parent = dentry;
 	dentry->d_sb = sb;
-	dentry->d_op = NULL;
+	dentry->d_op = sb->__s_d_op;
+	dentry->d_flags = sb->s_d_flags;
 	dentry->d_fsdata = NULL;
 	INIT_HLIST_BL_NODE(&dentry->d_hash);
 	INIT_LIST_HEAD(&dentry->d_lru);
 	INIT_HLIST_HEAD(&dentry->d_children);
 	INIT_HLIST_NODE(&dentry->d_u.d_alias);
 	INIT_HLIST_NODE(&dentry->d_sib);
-	d_set_d_op(dentry, dentry->d_sb->__s_d_op);
 
 	if (dentry->d_op && dentry->d_op->d_init) {
 		err = dentry->d_op->d_init(dentry);
@@ -1877,7 +1877,9 @@ EXPORT_SYMBOL(d_set_d_op);
 
 void set_default_d_op(struct super_block *s, const struct dentry_operations *ops)
 {
+	unsigned int flags = d_op_flags(ops);
 	s->__s_d_op = ops;
+	s->s_d_flags = (s->s_d_flags & ~DCACHE_OP_FLAGS) | flags;
 }
 EXPORT_SYMBOL(set_default_d_op);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7cd8eaab4d4e..65548e70e596 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1400,6 +1400,7 @@ struct super_block {
 	char			s_sysfs_name[UUID_STRING_LEN + 1];
 
 	unsigned int		s_max_links;
+	unsigned int		s_d_flags;	/* default d_flags for dentries */
 
 	/*
 	 * The next field is for VFS *only*. No filesystems have any business
-- 
cgit v1.2.3


From 691fb82ca6ccdcdb9e60e754b55659271d5280e7 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 23 Feb 2025 20:18:15 -0500
Subject: make d_set_d_op() static

Convert the last user (d_alloc_pseudo()) and be done with that.
Any out-of-tree filesystem using it should switch to d_splice_alias_ops()
or, better yet, check whether it really needs to have ->d_op vary among
its dentries.

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/porting.rst | 11 +++++++++++
 fs/dcache.c                           |  6 +++---
 include/linux/dcache.h                |  1 -
 3 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index b16139e91942..579f17df46cf 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -1256,3 +1256,14 @@ an extra reference to new mount - it should be returned with refcount 1.
 
 If your filesystem sets the default dentry_operations, use set_default_d_op()
 rather than manually setting sb->s_d_op.
+
+---
+
+**mandatory**
+
+d_set_d_op() is no longer exported (or public, for that matter); _if_
+your filesystem really needed that, make use of d_splice_alias_ops()
+to have them set.  Better yet, think hard whether you need different
+->d_op for different dentries - if not, just use set_default_d_op()
+at mount time and be done with that.  Currently procfs is the only
+thing that really needs ->d_op varying between dentries.
diff --git a/fs/dcache.c b/fs/dcache.c
index 7519c5f66f79..4e6ab27471a4 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1821,8 +1821,9 @@ struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name)
 	struct dentry *dentry = __d_alloc(sb, name);
 	if (likely(dentry)) {
 		dentry->d_flags |= DCACHE_NORCU;
+		/* d_op_flags(&anon_ops) is 0 */
 		if (!dentry->d_op)
-			d_set_d_op(dentry, &anon_ops);
+			dentry->d_op = &anon_ops;
 	}
 	return dentry;
 }
@@ -1864,7 +1865,7 @@ static unsigned int d_op_flags(const struct dentry_operations *op)
 	return flags;
 }
 
-void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
+static void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
 {
 	unsigned int flags = d_op_flags(op);
 	WARN_ON_ONCE(dentry->d_op);
@@ -1873,7 +1874,6 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
 	if (flags)
 		dentry->d_flags |= flags;
 }
-EXPORT_SYMBOL(d_set_d_op);
 
 void set_default_d_op(struct super_block *s, const struct dentry_operations *ops)
 {
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index be7ae058fa90..cc3e1c1a3454 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -237,7 +237,6 @@ extern void d_instantiate_new(struct dentry *, struct inode *);
 extern void __d_drop(struct dentry *dentry);
 extern void d_drop(struct dentry *dentry);
 extern void d_delete(struct dentry *);
-extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op);
 
 /* allocate/de-allocate */
 extern struct dentry * d_alloc(struct dentry *, const struct qstr *);
-- 
cgit v1.2.3


From 0b136e7d18fa8bb1251ab06f4f30e883da780245 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 23 Feb 2025 20:59:01 -0500
Subject: kill simple_dentry_operations

No users left and anything that wants it would be better off just
setting DCACHE_DONTCACHE in their ->s_d_flags.

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/libfs.c         | 5 -----
 include/linux/fs.h | 1 -
 2 files changed, 6 deletions(-)

(limited to 'include')

diff --git a/fs/libfs.c b/fs/libfs.c
index 19cc12651708..3051211998b6 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -62,11 +62,6 @@ int always_delete_dentry(const struct dentry *dentry)
 }
 EXPORT_SYMBOL(always_delete_dentry);
 
-const struct dentry_operations simple_dentry_operations = {
-	.d_delete = always_delete_dentry,
-};
-EXPORT_SYMBOL(simple_dentry_operations);
-
 /*
  * Lookup the data. This is trivial - if the dentry didn't already
  * exist, we know it is negative.  Set d_op to delete negative dentries.
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 65548e70e596..d58bbb8262e8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3606,7 +3606,6 @@ extern const struct address_space_operations ram_aops;
 extern int always_delete_dentry(const struct dentry *);
 extern struct inode *alloc_anon_inode(struct super_block *);
 extern int simple_nosetlease(struct file *, int, struct file_lease **, void **);
-extern const struct dentry_operations simple_dentry_operations;
 
 extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned int flags);
 extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *);
-- 
cgit v1.2.3


From 29d673b1508fcaa14be32e92679874f10a099bc8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 13 May 2024 23:36:53 -0600
Subject: make securityfs_remove() remove the entire subtree

... and fix the mount leak when anything's mounted there.
securityfs_recursive_remove becomes an alias for securityfs_remove -
we'll probably need to remove it in a cycle or two.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/security.h |  3 ++-
 security/inode.c         | 47 ++++++++++-------------------------------------
 2 files changed, 12 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/include/linux/security.h b/include/linux/security.h
index dba349629229..386463b5e848 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -2211,7 +2211,6 @@ struct dentry *securityfs_create_symlink(const char *name,
 					 const char *target,
 					 const struct inode_operations *iops);
 extern void securityfs_remove(struct dentry *dentry);
-extern void securityfs_recursive_remove(struct dentry *dentry);
 
 #else /* CONFIG_SECURITYFS */
 
@@ -2243,6 +2242,8 @@ static inline void securityfs_remove(struct dentry *dentry)
 
 #endif
 
+#define securityfs_recursive_remove securityfs_remove
+
 #ifdef CONFIG_BPF_SYSCALL
 union bpf_attr;
 struct bpf_map;
diff --git a/security/inode.c b/security/inode.c
index 1ecb8859c272..43382ef8896e 100644
--- a/security/inode.c
+++ b/security/inode.c
@@ -281,6 +281,12 @@ struct dentry *securityfs_create_symlink(const char *name,
 }
 EXPORT_SYMBOL_GPL(securityfs_create_symlink);
 
+static void remove_one(struct dentry *victim)
+{
+	if (victim->d_parent == victim->d_sb->s_root)
+		simple_release_fs(&mount, &mount_count);
+}
+
 /**
  * securityfs_remove - removes a file or directory from the securityfs filesystem
  *
@@ -293,44 +299,11 @@ EXPORT_SYMBOL_GPL(securityfs_create_symlink);
  * This function is required to be called in order for the file to be
  * removed. No automatic cleanup of files will happen when a module is
  * removed; you are responsible here.
- */
-void securityfs_remove(struct dentry *dentry)
-{
-	struct inode *dir;
-
-	if (IS_ERR_OR_NULL(dentry))
-		return;
-
-	dir = d_inode(dentry->d_parent);
-	inode_lock(dir);
-	if (simple_positive(dentry)) {
-		if (d_is_dir(dentry))
-			simple_rmdir(dir, dentry);
-		else
-			simple_unlink(dir, dentry);
-	}
-	inode_unlock(dir);
-	if (dir == dir->i_sb->s_root->d_inode)
-		simple_release_fs(&mount, &mount_count);
-}
-EXPORT_SYMBOL_GPL(securityfs_remove);
-
-static void remove_one(struct dentry *victim)
-{
-	if (victim->d_parent == victim->d_sb->s_root)
-		simple_release_fs(&mount, &mount_count);
-}
-
-/**
- * securityfs_recursive_remove - recursively removes a file or directory
- *
- * @dentry: a pointer to a the dentry of the file or directory to be removed.
  *
- * This function recursively removes a file or directory in securityfs that was
- * previously created with a call to another securityfs function (like
- * securityfs_create_file() or variants thereof.)
+ * AV: when applied to directory it will take all children out; no need to call
+ * it for descendents if ancestor is getting killed.
  */
-void securityfs_recursive_remove(struct dentry *dentry)
+void securityfs_remove(struct dentry *dentry)
 {
 	if (IS_ERR_OR_NULL(dentry))
 		return;
@@ -339,7 +312,7 @@ void securityfs_recursive_remove(struct dentry *dentry)
 	simple_recursive_removal(dentry, remove_one);
 	simple_release_fs(&mount, &mount_count);
 }
-EXPORT_SYMBOL_GPL(securityfs_recursive_remove);
+EXPORT_SYMBOL_GPL(securityfs_remove);
 
 #ifdef CONFIG_SECURITY
 static struct dentry *lsm_dentry;
-- 
cgit v1.2.3


From e1f4b1f167581a4932dd4f017c80a6e46d28761a Mon Sep 17 00:00:00 2001
From: Mohsin Bashir <mohsin.bashr@gmail.com>
Date: Tue, 10 Jun 2025 10:11:08 -0700
Subject: eth: Update rmon hist range

The fbnic driver reports up to 11 ranges, resulting in the drop of the
last range. This patch increments the value of ETHTOOL_RMON_HIST_MAX to
address this limitation.

Signed-off-by: Mohsin Bashir <mohsin.bashr@gmail.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20250610171109.1481229-2-mohsin.bashr@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/ethtool.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 5e0dd333ad1f..90da1aee6e56 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -536,7 +536,7 @@ struct ethtool_rmon_hist_range {
 	u16 high;
 };
 
-#define ETHTOOL_RMON_HIST_MAX	10
+#define ETHTOOL_RMON_HIST_MAX	11
 
 /**
  * struct ethtool_rmon_stats - selected RMON (RFC 2819) statistics
-- 
cgit v1.2.3


From c0f21029f123d1b15f8eddc8e3976bf0c8781c43 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Sun, 8 Jun 2025 10:42:53 +0300
Subject: xfrm: always initialize offload path

Offload path is used for GRO with SW IPsec, and not just for HW
offload. So initialize it anyway.

Fixes: 585b64f5a620 ("xfrm: delay initialization of offload path till its actually requested")
Reported-by: Sabrina Dubroca <sd@queasysnail.net>
Closes: https://lore.kernel.org/all/aEGW_5HfPqU1rFjl@krikkit
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h     | 2 +-
 net/xfrm/xfrm_device.c | 1 -
 net/xfrm/xfrm_state.c  | 6 ++----
 net/xfrm/xfrm_user.c   | 1 +
 4 files changed, 4 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index a21e276dbe44..e45a275fca26 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -474,7 +474,7 @@ struct xfrm_type_offload {
 
 int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned short family);
 void xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family);
-void xfrm_set_type_offload(struct xfrm_state *x);
+void xfrm_set_type_offload(struct xfrm_state *x, bool try_load);
 static inline void xfrm_unset_type_offload(struct xfrm_state *x)
 {
 	if (!x->type_offload)
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 81fd486b5e56..d2819baea414 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -305,7 +305,6 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
 		return -EINVAL;
 	}
 
-	xfrm_set_type_offload(x);
 	if (!x->type_offload) {
 		NL_SET_ERR_MSG(extack, "Type doesn't support offload");
 		dev_put(dev);
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 7e34fc94f668..c7e6472c623d 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -424,11 +424,10 @@ void xfrm_unregister_type_offload(const struct xfrm_type_offload *type,
 }
 EXPORT_SYMBOL(xfrm_unregister_type_offload);
 
-void xfrm_set_type_offload(struct xfrm_state *x)
+void xfrm_set_type_offload(struct xfrm_state *x, bool try_load)
 {
 	const struct xfrm_type_offload *type = NULL;
 	struct xfrm_state_afinfo *afinfo;
-	bool try_load = true;
 
 retry:
 	afinfo = xfrm_state_get_afinfo(x->props.family);
@@ -607,6 +606,7 @@ static void ___xfrm_state_destroy(struct xfrm_state *x)
 	kfree(x->coaddr);
 	kfree(x->replay_esn);
 	kfree(x->preplay_esn);
+	xfrm_unset_type_offload(x);
 	if (x->type) {
 		x->type->destructor(x);
 		xfrm_put_type(x->type);
@@ -780,8 +780,6 @@ void xfrm_dev_state_free(struct xfrm_state *x)
 	struct xfrm_dev_offload *xso = &x->xso;
 	struct net_device *dev = READ_ONCE(xso->dev);
 
-	xfrm_unset_type_offload(x);
-
 	if (dev && dev->xfrmdev_ops) {
 		spin_lock_bh(&xfrm_state_dev_gc_lock);
 		if (!hlist_unhashed(&x->dev_gclist))
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 59f258daf830..1db18f470f42 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -977,6 +977,7 @@ static struct xfrm_state *xfrm_state_construct(struct net *net,
 	/* override default values from above */
 	xfrm_update_ae_params(x, attrs, 0);
 
+	xfrm_set_type_offload(x, attrs[XFRMA_OFFLOAD_DEV]);
 	/* configure the hardware if offload is requested */
 	if (attrs[XFRMA_OFFLOAD_DEV]) {
 		err = xfrm_dev_state_add(net, x,
-- 
cgit v1.2.3


From 12b5b138d111db0588492002fdd8089af61b80e5 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 3 Jun 2025 15:31:55 +0200
Subject: coredump: allow for flexible coredump handling

Extend the coredump socket to allow the coredump server to tell the
kernel how to process individual coredumps.

When the crashing task connects to the coredump socket the kernel will
send a struct coredump_req to the coredump server. The kernel will set
the size member of struct coredump_req, telling the coredump server how
much data can be read.

The coredump server uses MSG_PEEK to peek the size of struct
coredump_req. If the kernel uses a newer struct coredump_req the
coredump server just reads the size it knows and discards any remaining
bytes in the buffer. If the kernel uses an older struct coredump_req
the coredump server just reads the size the kernel knows.

The returned struct coredump_req will inform the coredump server what
features the kernel supports. The coredump_req->mask member is set to
the currently known features.

The coredump server may only use features whose bits were raised by the
kernel in coredump_req->mask.

In response to a coredump_req from the kernel the coredump server sends
a struct coredump_ack to the kernel. The kernel informs the coredump
server what version of struct coredump_ack it supports by setting struct
coredump_req->size_ack to the size it knows about. The coredump server
may only send as many bytes as coredump_req->size_ack indicates (a
smaller size is fine of course). The coredump server must set
coredump_ack->size accordingly.

The coredump server sets the features it wants to use in struct
coredump_ack->mask. Only bits returned in struct coredump_req->mask may
be used.

In case an invalid struct coredump_ack is sent to the kernel a non-zero
u32 integer is sent indicating the reason for the failure. If it was
successful a zero u32 integer is sent.

In the initial version the following features are supported in
coredump_{req,ack}->mask:

* COREDUMP_KERNEL
  The kernel will write the coredump data to the socket.

* COREDUMP_USERSPACE
  The kernel will not write coredump data but will indicate to the
  parent that a coredump has been generated. This is used when userspace
  generates its own coredumps.

* COREDUMP_REJECT
  The kernel will skip generating a coredump for this task.

* COREDUMP_WAIT
  The kernel will prevent the task from exiting until the coredump
  server has shut down the socket connection.

The flexible coredump socket can be enabled by using the "@@" prefix
instead of the single "@" prefix for the regular coredump socket:

  @@/run/systemd/coredump.socket

will enable flexible coredump handling. Current kernels already enforce
that "@" must be followed by "/" and will reject anything else. So
extending this is backward and forward compatible.

Link: https://lore.kernel.org/20250603-work-coredump-socket-protocol-v2-1-05a5f0c18ecc@kernel.org
Acked-by: Lennart Poettering <lennart@poettering.net>
Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/coredump.c                 | 195 ++++++++++++++++++++++++++++++++++++------
 include/uapi/linux/coredump.h | 104 ++++++++++++++++++++++
 2 files changed, 272 insertions(+), 27 deletions(-)
 create mode 100644 include/uapi/linux/coredump.h

(limited to 'include')

diff --git a/fs/coredump.c b/fs/coredump.c
index f217ebf2b3b6..b3eaa8c27ced 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -51,6 +51,7 @@
 #include <net/sock.h>
 #include <uapi/linux/pidfd.h>
 #include <uapi/linux/un.h>
+#include <uapi/linux/coredump.h>
 
 #include <linux/uaccess.h>
 #include <asm/mmu_context.h>
@@ -83,15 +84,17 @@ static int core_name_size = CORENAME_MAX_SIZE;
 unsigned int core_file_note_size_limit = CORE_FILE_NOTE_SIZE_DEFAULT;
 
 enum coredump_type_t {
-	COREDUMP_FILE = 1,
-	COREDUMP_PIPE = 2,
-	COREDUMP_SOCK = 3,
+	COREDUMP_FILE		= 1,
+	COREDUMP_PIPE		= 2,
+	COREDUMP_SOCK		= 3,
+	COREDUMP_SOCK_REQ	= 4,
 };
 
 struct core_name {
 	char *corename;
 	int used, size;
 	enum coredump_type_t core_type;
+	u64 mask;
 };
 
 static int expand_corename(struct core_name *cn, int size)
@@ -235,6 +238,9 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm,
 	int pid_in_pattern = 0;
 	int err = 0;
 
+	cn->mask = COREDUMP_KERNEL;
+	if (core_pipe_limit)
+		cn->mask |= COREDUMP_WAIT;
 	cn->used = 0;
 	cn->corename = NULL;
 	if (*pat_ptr == '|')
@@ -264,6 +270,13 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm,
 		pat_ptr++;
 		if (!(*pat_ptr))
 			return -ENOMEM;
+		if (*pat_ptr == '@') {
+			pat_ptr++;
+			if (!(*pat_ptr))
+				return -ENOMEM;
+
+			cn->core_type = COREDUMP_SOCK_REQ;
+		}
 
 		err = cn_printf(cn, "%s", pat_ptr);
 		if (err)
@@ -632,6 +645,135 @@ static int umh_coredump_setup(struct subprocess_info *info, struct cred *new)
 	return 0;
 }
 
+#ifdef CONFIG_UNIX
+static inline bool coredump_sock_recv(struct file *file, struct coredump_ack *ack, size_t size, int flags)
+{
+	struct msghdr msg = {};
+	struct kvec iov = { .iov_base = ack, .iov_len = size };
+	ssize_t ret;
+
+	memset(ack, 0, size);
+	ret = kernel_recvmsg(sock_from_file(file), &msg, &iov, 1, size, flags);
+	return ret == size;
+}
+
+static inline bool coredump_sock_send(struct file *file, struct coredump_req *req)
+{
+	struct msghdr msg = { .msg_flags = MSG_NOSIGNAL };
+	struct kvec iov = { .iov_base = req, .iov_len = sizeof(*req) };
+	ssize_t ret;
+
+	ret = kernel_sendmsg(sock_from_file(file), &msg, &iov, 1, sizeof(*req));
+	return ret == sizeof(*req);
+}
+
+static_assert(sizeof(enum coredump_mark) == sizeof(__u32));
+
+static inline bool coredump_sock_mark(struct file *file, enum coredump_mark mark)
+{
+	struct msghdr msg = { .msg_flags = MSG_NOSIGNAL };
+	struct kvec iov = { .iov_base = &mark, .iov_len = sizeof(mark) };
+	ssize_t ret;
+
+	ret = kernel_sendmsg(sock_from_file(file), &msg, &iov, 1, sizeof(mark));
+	return ret == sizeof(mark);
+}
+
+static inline void coredump_sock_wait(struct file *file)
+{
+	ssize_t n;
+
+	/*
+	 * We use a simple read to wait for the coredump processing to
+	 * finish. Either the socket is closed or we get sent unexpected
+	 * data. In both cases, we're done.
+	 */
+	n = __kernel_read(file, &(char){ 0 }, 1, NULL);
+	if (n > 0)
+		coredump_report_failure("Coredump socket had unexpected data");
+	else if (n < 0)
+		coredump_report_failure("Coredump socket failed");
+}
+
+static inline void coredump_sock_shutdown(struct file *file)
+{
+	struct socket *socket;
+
+	socket = sock_from_file(file);
+	if (!socket)
+		return;
+
+	/* Let userspace know we're done processing the coredump. */
+	kernel_sock_shutdown(socket, SHUT_WR);
+}
+
+static bool coredump_request(struct core_name *cn, struct coredump_params *cprm)
+{
+	struct coredump_req req = {
+		.size		= sizeof(struct coredump_req),
+		.mask		= COREDUMP_KERNEL | COREDUMP_USERSPACE |
+				  COREDUMP_REJECT | COREDUMP_WAIT,
+		.size_ack	= sizeof(struct coredump_ack),
+	};
+	struct coredump_ack ack = {};
+	ssize_t usize;
+
+	if (cn->core_type != COREDUMP_SOCK_REQ)
+		return true;
+
+	/* Let userspace know what we support. */
+	if (!coredump_sock_send(cprm->file, &req))
+		return false;
+
+	/* Peek the size of the coredump_ack. */
+	if (!coredump_sock_recv(cprm->file, &ack, sizeof(ack.size),
+				MSG_PEEK | MSG_WAITALL))
+		return false;
+
+	/* Refuse unknown coredump_ack sizes. */
+	usize = ack.size;
+	if (usize < COREDUMP_ACK_SIZE_VER0) {
+		coredump_sock_mark(cprm->file, COREDUMP_MARK_MINSIZE);
+		return false;
+	}
+
+	if (usize > sizeof(ack)) {
+		coredump_sock_mark(cprm->file, COREDUMP_MARK_MAXSIZE);
+		return false;
+	}
+
+	/* Now retrieve the coredump_ack. */
+	if (!coredump_sock_recv(cprm->file, &ack, usize, MSG_WAITALL))
+		return false;
+	if (ack.size != usize)
+		return false;
+
+	/* Refuse unknown coredump_ack flags. */
+	if (ack.mask & ~req.mask) {
+		coredump_sock_mark(cprm->file, COREDUMP_MARK_UNSUPPORTED);
+		return false;
+	}
+
+	/* Refuse mutually exclusive options. */
+	if (hweight64(ack.mask & (COREDUMP_USERSPACE | COREDUMP_KERNEL |
+				  COREDUMP_REJECT)) != 1) {
+		coredump_sock_mark(cprm->file, COREDUMP_MARK_CONFLICTING);
+		return false;
+	}
+
+	if (ack.spare) {
+		coredump_sock_mark(cprm->file, COREDUMP_MARK_UNSUPPORTED);
+		return false;
+	}
+
+	cn->mask = ack.mask;
+	return coredump_sock_mark(cprm->file, COREDUMP_MARK_REQACK);
+}
+#else
+static inline void coredump_sock_wait(struct file *file) { }
+static inline void coredump_sock_shutdown(struct file *file) { }
+#endif
+
 void do_coredump(const kernel_siginfo_t *siginfo)
 {
 	struct core_state core_state;
@@ -850,6 +992,8 @@ void do_coredump(const kernel_siginfo_t *siginfo)
 		}
 		break;
 	}
+	case COREDUMP_SOCK_REQ:
+		fallthrough;
 	case COREDUMP_SOCK: {
 #ifdef CONFIG_UNIX
 		struct file *file __free(fput) = NULL;
@@ -918,6 +1062,9 @@ void do_coredump(const kernel_siginfo_t *siginfo)
 
 		cprm.limit = RLIM_INFINITY;
 		cprm.file = no_free_ptr(file);
+
+		if (!coredump_request(&cn, &cprm))
+			goto close_fail;
 #else
 		coredump_report_failure("Core dump socket support %s disabled", cn.corename);
 		goto close_fail;
@@ -929,12 +1076,17 @@ void do_coredump(const kernel_siginfo_t *siginfo)
 		goto close_fail;
 	}
 
+	/* Don't even generate the coredump. */
+	if (cn.mask & COREDUMP_REJECT)
+		goto close_fail;
+
 	/* get us an unshared descriptor table; almost always a no-op */
 	/* The cell spufs coredump code reads the file descriptor tables */
 	retval = unshare_files();
 	if (retval)
 		goto close_fail;
-	if (!dump_interrupted()) {
+
+	if ((cn.mask & COREDUMP_KERNEL) && !dump_interrupted()) {
 		/*
 		 * umh disabled with CONFIG_STATIC_USERMODEHELPER_PATH="" would
 		 * have this set to NULL.
@@ -962,38 +1114,27 @@ void do_coredump(const kernel_siginfo_t *siginfo)
 		free_vma_snapshot(&cprm);
 	}
 
-#ifdef CONFIG_UNIX
-	/* Let userspace know we're done processing the coredump. */
-	if (sock_from_file(cprm.file))
-		kernel_sock_shutdown(sock_from_file(cprm.file), SHUT_WR);
-#endif
+	coredump_sock_shutdown(cprm.file);
+
+	/* Let the parent know that a coredump was generated. */
+	if (cn.mask & COREDUMP_USERSPACE)
+		core_dumped = true;
 
 	/*
 	 * When core_pipe_limit is set we wait for the coredump server
 	 * or usermodehelper to finish before exiting so it can e.g.,
 	 * inspect /proc/<pid>.
 	 */
-	if (core_pipe_limit) {
+	if (cn.mask & COREDUMP_WAIT) {
 		switch (cn.core_type) {
 		case COREDUMP_PIPE:
 			wait_for_dump_helpers(cprm.file);
 			break;
-#ifdef CONFIG_UNIX
-		case COREDUMP_SOCK: {
-			ssize_t n;
-
-			/*
-			 * We use a simple read to wait for the coredump
-			 * processing to finish. Either the socket is
-			 * closed or we get sent unexpected data. In
-			 * both cases, we're done.
-			 */
-			n = __kernel_read(cprm.file, &(char){ 0 }, 1, NULL);
-			if (n != 0)
-				coredump_report_failure("Unexpected data on coredump socket");
+		case COREDUMP_SOCK_REQ:
+			fallthrough;
+		case COREDUMP_SOCK:
+			coredump_sock_wait(cprm.file);
 			break;
-		}
-#endif
 		default:
 			break;
 		}
@@ -1249,8 +1390,8 @@ static inline bool check_coredump_socket(void)
 	if (current->nsproxy->mnt_ns != init_task.nsproxy->mnt_ns)
 		return false;
 
-	/* Must be an absolute path. */
-	if (*(core_pattern + 1) != '/')
+	/* Must be an absolute path or the socket request. */
+	if (*(core_pattern + 1) != '/' && *(core_pattern + 1) != '@')
 		return false;
 
 	return true;
diff --git a/include/uapi/linux/coredump.h b/include/uapi/linux/coredump.h
new file mode 100644
index 000000000000..dc3789b78af0
--- /dev/null
+++ b/include/uapi/linux/coredump.h
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+
+#ifndef _UAPI_LINUX_COREDUMP_H
+#define _UAPI_LINUX_COREDUMP_H
+
+#include <linux/types.h>
+
+/**
+ * coredump_{req,ack} flags
+ * @COREDUMP_KERNEL: kernel writes coredump
+ * @COREDUMP_USERSPACE: userspace writes coredump
+ * @COREDUMP_REJECT: don't generate coredump
+ * @COREDUMP_WAIT: wait for coredump server
+ */
+enum {
+	COREDUMP_KERNEL		= (1ULL << 0),
+	COREDUMP_USERSPACE	= (1ULL << 1),
+	COREDUMP_REJECT		= (1ULL << 2),
+	COREDUMP_WAIT		= (1ULL << 3),
+};
+
+/**
+ * struct coredump_req - message kernel sends to userspace
+ * @size: size of struct coredump_req
+ * @size_ack: known size of struct coredump_ack on this kernel
+ * @mask: supported features
+ *
+ * When a coredump happens the kernel will connect to the coredump
+ * socket and send a coredump request to the coredump server. The @size
+ * member is set to the size of struct coredump_req and provides a hint
+ * to userspace how much data can be read. Userspace may use MSG_PEEK to
+ * peek the size of struct coredump_req and then choose to consume it in
+ * one go. Userspace may also simply read a COREDUMP_REQ_SIZE_VER0
+ * request. If the size the kernel sends is larger userspace simply
+ * discards any remaining data.
+ *
+ * The coredump_req->mask member is set to the currently known features.
+ * Userspace may only set coredump_ack->mask to the bits raised by the
+ * kernel in coredump_req->mask.
+ *
+ * The coredump_req->size_ack member is set by the kernel to the size of
+ * struct coredump_ack the kernel knows. Userspace may only send up to
+ * coredump_req->size_ack bytes to the kernel and must set
+ * coredump_ack->size accordingly.
+ */
+struct coredump_req {
+	__u32 size;
+	__u32 size_ack;
+	__u64 mask;
+};
+
+enum {
+	COREDUMP_REQ_SIZE_VER0 = 16U, /* size of first published struct */
+};
+
+/**
+ * struct coredump_ack - message userspace sends to kernel
+ * @size: size of the struct
+ * @spare: unused
+ * @mask: features kernel is supposed to use
+ *
+ * The @size member must be set to the size of struct coredump_ack. It
+ * may never exceed what the kernel returned in coredump_req->size_ack
+ * but it may of course be smaller (>= COREDUMP_ACK_SIZE_VER0 and <=
+ * coredump_req->size_ack).
+ *
+ * The @mask member must be set to the features the coredump server
+ * wants the kernel to use. Only bits the kernel returned in
+ * coredump_req->mask may be set.
+ */
+struct coredump_ack {
+	__u32 size;
+	__u32 spare;
+	__u64 mask;
+};
+
+enum {
+	COREDUMP_ACK_SIZE_VER0 = 16U, /* size of first published struct */
+};
+
+/**
+ * enum coredump_mark - Markers for the coredump socket
+ *
+ * The kernel will place a single u32 marker on the coredump socket. The
+ * markers notify userspace whether the coredump ack succeeded or
+ * failed.
+ *
+ * @COREDUMP_MARK_MINSIZE: the provided coredump_ack size was too small
+ * @COREDUMP_MARK_MAXSIZE: the provided coredump_ack size was too big
+ * @COREDUMP_MARK_UNSUPPORTED: the provided coredump_ack mask was invalid
+ * @COREDUMP_MARK_CONFLICTING: the provided coredump_ack mask has conflicting options
+ * @COREDUMP_MARK_REQACK: the coredump request and ack was successful
+ * @__COREDUMP_MARK_MAX: the maximum coredump mark value
+ */
+enum coredump_mark {
+	COREDUMP_MARK_REQACK		= 0U,
+	COREDUMP_MARK_MINSIZE		= 1U,
+	COREDUMP_MARK_MAXSIZE		= 2U,
+	COREDUMP_MARK_UNSUPPORTED	= 3U,
+	COREDUMP_MARK_CONFLICTING	= 4U,
+	__COREDUMP_MARK_MAX		= (1U << 31),
+};
+
+#endif /* _UAPI_LINUX_COREDUMP_H */
-- 
cgit v1.2.3


From 8a5a5cecb79058b608e5562d8998123a3adb313c Mon Sep 17 00:00:00 2001
From: Shenghao Ding <shenghao-ding@ti.com>
Date: Thu, 12 Jun 2025 12:42:52 +0800
Subject: ASoC: tas2781: Move the "include linux/debugfs.h" into tas2781.h

Move the include linux/debugfs.h into tas2781.h for code cleanup.

Signed-off-by: Shenghao Ding <shenghao-ding@ti.com>
Link: https://patch.msgid.link/20250612044252.1025-1-shenghao-ding@ti.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/tas2781.h        | 4 ++++
 sound/soc/codecs/tas2781-i2c.c | 3 ---
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/sound/tas2781.h b/include/sound/tas2781.h
index 40cd3bd079b5..3875e92f1ec5 100644
--- a/include/sound/tas2781.h
+++ b/include/sound/tas2781.h
@@ -17,6 +17,10 @@
 #ifndef __TAS2781_H__
 #define __TAS2781_H__
 
+#ifdef CONFIG_SND_SOC_TAS2781_ACOUST_I2C
+#include <linux/debugfs.h>
+#endif
+
 #include "tas2781-dsp.h"
 
 /* version number */
diff --git a/sound/soc/codecs/tas2781-i2c.c b/sound/soc/codecs/tas2781-i2c.c
index c40d8f754d89..9f4d965a1335 100644
--- a/sound/soc/codecs/tas2781-i2c.c
+++ b/sound/soc/codecs/tas2781-i2c.c
@@ -14,9 +14,6 @@
 //
 
 #include <linux/crc8.h>
-#ifdef CONFIG_SND_SOC_TAS2781_ACOUST_I2C
-#include <linux/debugfs.h>
-#endif
 #include <linux/firmware.h>
 #include <linux/gpio/consumer.h>
 #include <linux/i2c.h>
-- 
cgit v1.2.3


From da5cb65d25f747236a003b82525eb6de5d49a2e6 Mon Sep 17 00:00:00 2001
From: Raghav Sharma <raghav.s@samsung.com>
Date: Thu, 29 May 2025 16:56:38 +0530
Subject: dt-bindings: clock: exynosautov920: add hsi2 clock definitions

Add device tree clock binding definitions for CMU_HSI2

Signed-off-by: Raghav Sharma <raghav.s@samsung.com>
Reviewed-by: Alim Akhtar <alim.akhtar@samsung.com>
Acked-by: Rob Herring (Arm) <robh@kernel.org>
Link: https://lore.kernel.org/r/20250529112640.1646740-3-raghav.s@samsung.com
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
---
 .../clock/samsung,exynosautov920-clock.yaml        | 29 ++++++++++++++++++++--
 include/dt-bindings/clock/samsung,exynosautov920.h |  9 +++++++
 2 files changed, 36 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/samsung,exynosautov920-clock.yaml b/Documentation/devicetree/bindings/clock/samsung,exynosautov920-clock.yaml
index 77117b887e19..72f59db73f76 100644
--- a/Documentation/devicetree/bindings/clock/samsung,exynosautov920-clock.yaml
+++ b/Documentation/devicetree/bindings/clock/samsung,exynosautov920-clock.yaml
@@ -37,6 +37,7 @@ properties:
       - samsung,exynosautov920-cmu-cpucl2
       - samsung,exynosautov920-cmu-hsi0
       - samsung,exynosautov920-cmu-hsi1
+      - samsung,exynosautov920-cmu-hsi2
       - samsung,exynosautov920-cmu-misc
       - samsung,exynosautov920-cmu-peric0
       - samsung,exynosautov920-cmu-peric1
@@ -44,11 +45,11 @@ properties:
 
   clocks:
     minItems: 1
-    maxItems: 4
+    maxItems: 5
 
   clock-names:
     minItems: 1
-    maxItems: 4
+    maxItems: 5
 
   "#clock-cells":
     const: 1
@@ -201,6 +202,30 @@ allOf:
             - const: usbdrd
             - const: mmc_card
 
+  - if:
+      properties:
+        compatible:
+          contains:
+            const: samsung,exynosautov920-cmu-hsi2
+
+    then:
+      properties:
+        clocks:
+          items:
+            - description: External reference clock (38.4 MHz)
+            - description: CMU_HSI2 NOC clock (from CMU_TOP)
+            - description: CMU_HSI2 NOC UFS clock (from CMU_TOP)
+            - description: CMU_HSI2 UFS EMBD clock (from CMU_TOP)
+            - description: CMU_HSI2 ETHERNET clock (from CMU_TOP)
+
+        clock-names:
+          items:
+            - const: oscclk
+            - const: noc
+            - const: ufs
+            - const: embd
+            - const: ethernet
+
 required:
   - compatible
   - "#clock-cells"
diff --git a/include/dt-bindings/clock/samsung,exynosautov920.h b/include/dt-bindings/clock/samsung,exynosautov920.h
index 5e6896e9627f..93e6233d1358 100644
--- a/include/dt-bindings/clock/samsung,exynosautov920.h
+++ b/include/dt-bindings/clock/samsung,exynosautov920.h
@@ -286,4 +286,13 @@
 #define CLK_MOUT_HSI1_USBDRD_USER	3
 #define CLK_MOUT_HSI1_USBDRD		4
 
+/* CMU_HSI2 */
+#define FOUT_PLL_ETH                    1
+#define CLK_MOUT_HSI2_NOC_UFS_USER      2
+#define CLK_MOUT_HSI2_UFS_EMBD_USER     3
+#define CLK_MOUT_HSI2_ETHERNET          4
+#define CLK_MOUT_HSI2_ETHERNET_USER     5
+#define CLK_DOUT_HSI2_ETHERNET          6
+#define CLK_DOUT_HSI2_ETHERNET_PTP      7
+
 #endif /* _DT_BINDINGS_CLOCK_EXYNOSAUTOV920_H */
-- 
cgit v1.2.3


From baaebe0928bf321a1cd980d569e308dec66be94c Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Wed, 11 Jun 2025 13:08:26 -0700
Subject: Revert "bpf: use common instruction history across all states"

This reverts commit 96a30e469ca1d2b8cc7811b40911f8614b558241.
Next patches in the series modify propagate_precision() to allow
arbitrary starting state. Precision propagation requires access to
jump history, and arbitrary states represent history not belonging to
`env->cur_state`.

Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20250611200836.4135542-1-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  19 ++++----
 kernel/bpf/verifier.c        | 109 ++++++++++++++++++++++---------------------
 2 files changed, 64 insertions(+), 64 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index e6c26393c029..3e77befdbc4b 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -344,7 +344,7 @@ struct bpf_func_state {
 
 #define MAX_CALL_FRAMES 8
 
-/* instruction history flags, used in bpf_insn_hist_entry.flags field */
+/* instruction history flags, used in bpf_jmp_history_entry.flags field */
 enum {
 	/* instruction references stack slot through PTR_TO_STACK register;
 	 * we also store stack's frame number in lower 3 bits (MAX_CALL_FRAMES is 8)
@@ -366,7 +366,7 @@ enum {
 static_assert(INSN_F_FRAMENO_MASK + 1 >= MAX_CALL_FRAMES);
 static_assert(INSN_F_SPI_MASK + 1 >= MAX_BPF_STACK / 8);
 
-struct bpf_insn_hist_entry {
+struct bpf_jmp_history_entry {
 	u32 idx;
 	/* insn idx can't be bigger than 1 million */
 	u32 prev_idx : 20;
@@ -459,14 +459,13 @@ struct bpf_verifier_state {
 	 * See get_loop_entry() for more information.
 	 */
 	struct bpf_verifier_state *loop_entry;
-	/* Sub-range of env->insn_hist[] corresponding to this state's
-	 * instruction history.
-	 * Backtracking is using it to go from last to first.
-	 * For most states instruction history is short, 0-3 instructions.
+	/* jmp history recorded from first to last.
+	 * backtracking is using it to go from last to first.
+	 * For most states jmp_history_cnt is [0-3].
 	 * For loops can go up to ~40.
 	 */
-	u32 insn_hist_start;
-	u32 insn_hist_end;
+	struct bpf_jmp_history_entry *jmp_history;
+	u32 jmp_history_cnt;
 	u32 dfs_depth;
 	u32 callback_unroll_depth;
 	u32 may_goto_depth;
@@ -776,9 +775,7 @@ struct bpf_verifier_env {
 		int cur_postorder;
 	} cfg;
 	struct backtrack_state bt;
-	struct bpf_insn_hist_entry *insn_hist;
-	struct bpf_insn_hist_entry *cur_hist_ent;
-	u32 insn_hist_cap;
+	struct bpf_jmp_history_entry *cur_hist_ent;
 	u32 pass_cnt; /* number of times do_check() was called */
 	u32 subprog_cnt;
 	/* number of instructions analyzed by the verifier */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b1f797616f20..92f2dad5f453 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1660,6 +1660,13 @@ static void free_func_state(struct bpf_func_state *state)
 	kfree(state);
 }
 
+static void clear_jmp_history(struct bpf_verifier_state *state)
+{
+	kfree(state->jmp_history);
+	state->jmp_history = NULL;
+	state->jmp_history_cnt = 0;
+}
+
 static void free_verifier_state(struct bpf_verifier_state *state,
 				bool free_self)
 {
@@ -1670,6 +1677,7 @@ static void free_verifier_state(struct bpf_verifier_state *state,
 		state->frame[i] = NULL;
 	}
 	kfree(state->refs);
+	clear_jmp_history(state);
 	if (free_self)
 		kfree(state);
 }
@@ -1734,6 +1742,13 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	struct bpf_func_state *dst;
 	int i, err;
 
+	dst_state->jmp_history = copy_array(dst_state->jmp_history, src->jmp_history,
+					  src->jmp_history_cnt, sizeof(*dst_state->jmp_history),
+					  GFP_USER);
+	if (!dst_state->jmp_history)
+		return -ENOMEM;
+	dst_state->jmp_history_cnt = src->jmp_history_cnt;
+
 	/* if dst has more stack frames then src frame, free them, this is also
 	 * necessary in case of exceptional exits using bpf_throw.
 	 */
@@ -1751,8 +1766,6 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	dst_state->parent = src->parent;
 	dst_state->first_insn_idx = src->first_insn_idx;
 	dst_state->last_insn_idx = src->last_insn_idx;
-	dst_state->insn_hist_start = src->insn_hist_start;
-	dst_state->insn_hist_end = src->insn_hist_end;
 	dst_state->dfs_depth = src->dfs_depth;
 	dst_state->callback_unroll_depth = src->callback_unroll_depth;
 	dst_state->used_as_loop_entry = src->used_as_loop_entry;
@@ -2820,14 +2833,9 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
 	 * The caller state doesn't matter.
 	 * This is async callback. It starts in a fresh stack.
 	 * Initialize it similar to do_check_common().
-	 * But we do need to make sure to not clobber insn_hist, so we keep
-	 * chaining insn_hist_start/insn_hist_end indices as for a normal
-	 * child state.
 	 */
 	elem->st.branches = 1;
 	elem->st.in_sleepable = is_sleepable;
-	elem->st.insn_hist_start = env->cur_state->insn_hist_end;
-	elem->st.insn_hist_end = elem->st.insn_hist_start;
 	frame = kzalloc(sizeof(*frame), GFP_KERNEL);
 	if (!frame)
 		goto err;
@@ -3856,10 +3864,11 @@ static void linked_regs_unpack(u64 val, struct linked_regs *s)
 }
 
 /* for any branch, call, exit record the history of jmps in the given state */
-static int push_insn_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur,
-			     int insn_flags, u64 linked_regs)
+static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur,
+			    int insn_flags, u64 linked_regs)
 {
-	struct bpf_insn_hist_entry *p;
+	u32 cnt = cur->jmp_history_cnt;
+	struct bpf_jmp_history_entry *p;
 	size_t alloc_size;
 
 	/* combine instruction flags if we already recorded this instruction */
@@ -3879,32 +3888,29 @@ static int push_insn_history(struct bpf_verifier_env *env, struct bpf_verifier_s
 		return 0;
 	}
 
-	if (cur->insn_hist_end + 1 > env->insn_hist_cap) {
-		alloc_size = size_mul(cur->insn_hist_end + 1, sizeof(*p));
-		p = kvrealloc(env->insn_hist, alloc_size, GFP_USER);
-		if (!p)
-			return -ENOMEM;
-		env->insn_hist = p;
-		env->insn_hist_cap = alloc_size / sizeof(*p);
-	}
+	cnt++;
+	alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p)));
+	p = krealloc(cur->jmp_history, alloc_size, GFP_USER);
+	if (!p)
+		return -ENOMEM;
+	cur->jmp_history = p;
 
-	p = &env->insn_hist[cur->insn_hist_end];
+	p = &cur->jmp_history[cnt - 1];
 	p->idx = env->insn_idx;
 	p->prev_idx = env->prev_insn_idx;
 	p->flags = insn_flags;
 	p->linked_regs = linked_regs;
-
-	cur->insn_hist_end++;
+	cur->jmp_history_cnt = cnt;
 	env->cur_hist_ent = p;
 
 	return 0;
 }
 
-static struct bpf_insn_hist_entry *get_insn_hist_entry(struct bpf_verifier_env *env,
-						       u32 hist_start, u32 hist_end, int insn_idx)
+static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_state *st,
+						        u32 hist_end, int insn_idx)
 {
-	if (hist_end > hist_start && env->insn_hist[hist_end - 1].idx == insn_idx)
-		return &env->insn_hist[hist_end - 1];
+	if (hist_end > 0 && st->jmp_history[hist_end - 1].idx == insn_idx)
+		return &st->jmp_history[hist_end - 1];
 	return NULL;
 }
 
@@ -3921,26 +3927,25 @@ static struct bpf_insn_hist_entry *get_insn_hist_entry(struct bpf_verifier_env *
  * history entry recording a jump from last instruction of parent state and
  * first instruction of given state.
  */
-static int get_prev_insn_idx(const struct bpf_verifier_env *env,
-			     struct bpf_verifier_state *st,
-			     int insn_idx, u32 hist_start, u32 *hist_endp)
+static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
+			     u32 *history)
 {
-	u32 hist_end = *hist_endp;
-	u32 cnt = hist_end - hist_start;
+	u32 cnt = *history;
 
-	if (insn_idx == st->first_insn_idx) {
+	if (i == st->first_insn_idx) {
 		if (cnt == 0)
 			return -ENOENT;
-		if (cnt == 1 && env->insn_hist[hist_start].idx == insn_idx)
+		if (cnt == 1 && st->jmp_history[0].idx == i)
 			return -ENOENT;
 	}
 
-	if (cnt && env->insn_hist[hist_end - 1].idx == insn_idx) {
-		(*hist_endp)--;
-		return env->insn_hist[hist_end - 1].prev_idx;
+	if (cnt && st->jmp_history[cnt - 1].idx == i) {
+		i = st->jmp_history[cnt - 1].prev_idx;
+		(*history)--;
 	} else {
-		return insn_idx - 1;
+		i--;
 	}
+	return i;
 }
 
 static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn)
@@ -4121,7 +4126,7 @@ static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask)
 /* If any register R in hist->linked_regs is marked as precise in bt,
  * do bt_set_frame_{reg,slot}(bt, R) for all registers in hist->linked_regs.
  */
-static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_insn_hist_entry *hist)
+static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_history_entry *hist)
 {
 	struct linked_regs linked_regs;
 	bool some_precise = false;
@@ -4166,7 +4171,7 @@ static bool calls_callback(struct bpf_verifier_env *env, int insn_idx);
  *   - *was* processed previously during backtracking.
  */
 static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
-			  struct bpf_insn_hist_entry *hist, struct backtrack_state *bt)
+			  struct bpf_jmp_history_entry *hist, struct backtrack_state *bt)
 {
 	struct bpf_insn *insn = env->prog->insnsi + idx;
 	u8 class = BPF_CLASS(insn->code);
@@ -4584,7 +4589,7 @@ static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_
  * SCALARS, as well as any other registers and slots that contribute to
  * a tracked state of given registers/stack slots, depending on specific BPF
  * assembly instructions (see backtrack_insns() for exact instruction handling
- * logic). This backtracking relies on recorded insn_hist and is able to
+ * logic). This backtracking relies on recorded jmp_history and is able to
  * traverse entire chain of parent states. This process ends only when all the
  * necessary registers/slots and their transitive dependencies are marked as
  * precise.
@@ -4701,9 +4706,8 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
 
 	for (;;) {
 		DECLARE_BITMAP(mask, 64);
-		u32 hist_start = st->insn_hist_start;
-		u32 hist_end = st->insn_hist_end;
-		struct bpf_insn_hist_entry *hist;
+		u32 history = st->jmp_history_cnt;
+		struct bpf_jmp_history_entry *hist;
 
 		if (env->log.level & BPF_LOG_LEVEL2) {
 			verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n",
@@ -4741,7 +4745,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
 				err = 0;
 				skip_first = false;
 			} else {
-				hist = get_insn_hist_entry(env, hist_start, hist_end, i);
+				hist = get_jmp_hist_entry(st, history, i);
 				err = backtrack_insn(env, i, subseq_idx, hist, bt);
 			}
 			if (err == -ENOTSUPP) {
@@ -4758,7 +4762,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
 				 */
 				return 0;
 			subseq_idx = i;
-			i = get_prev_insn_idx(env, st, i, hist_start, &hist_end);
+			i = get_prev_insn_idx(st, i, &history);
 			if (i == -ENOENT)
 				break;
 			if (i >= env->prog->len) {
@@ -5122,7 +5126,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 	}
 
 	if (insn_flags)
-		return push_insn_history(env, env->cur_state, insn_flags, 0);
+		return push_jmp_history(env, env->cur_state, insn_flags, 0);
 	return 0;
 }
 
@@ -5429,7 +5433,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
 		insn_flags = 0; /* we are not restoring spilled register */
 	}
 	if (insn_flags)
-		return push_insn_history(env, env->cur_state, insn_flags, 0);
+		return push_jmp_history(env, env->cur_state, insn_flags, 0);
 	return 0;
 }
 
@@ -16496,7 +16500,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 	}
 
 	if (insn_flags) {
-		err = push_insn_history(env, this_branch, insn_flags, 0);
+		err = push_jmp_history(env, this_branch, insn_flags, 0);
 		if (err)
 			return err;
 	}
@@ -16554,7 +16558,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 	if (dst_reg->type == SCALAR_VALUE && dst_reg->id)
 		collect_linked_regs(this_branch, dst_reg->id, &linked_regs);
 	if (linked_regs.cnt > 1) {
-		err = push_insn_history(env, this_branch, 0, linked_regs_pack(&linked_regs));
+		err = push_jmp_history(env, this_branch, 0, linked_regs_pack(&linked_regs));
 		if (err)
 			return err;
 	}
@@ -19052,7 +19056,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 
 	force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) ||
 			  /* Avoid accumulating infinitely long jmp history */
-			  cur->insn_hist_end - cur->insn_hist_start > 40;
+			  cur->jmp_history_cnt > 40;
 
 	/* bpf progs typically have pruning point every 4 instructions
 	 * http://vger.kernel.org/bpfconf2019.html#session-1
@@ -19251,7 +19255,7 @@ hit:
 			 * the current state.
 			 */
 			if (is_jmp_point(env, env->insn_idx))
-				err = err ? : push_insn_history(env, cur, 0, 0);
+				err = err ? : push_jmp_history(env, cur, 0, 0);
 			err = err ? : propagate_precision(env, &sl->state);
 			if (err)
 				return err;
@@ -19333,8 +19337,8 @@ miss:
 
 	cur->parent = new;
 	cur->first_insn_idx = insn_idx;
-	cur->insn_hist_start = cur->insn_hist_end;
 	cur->dfs_depth = new->dfs_depth + 1;
+	clear_jmp_history(cur);
 	list_add(&new_sl->node, head);
 
 	/* connect new state to parentage chain. Current frame needs all
@@ -19704,7 +19708,7 @@ static int do_check(struct bpf_verifier_env *env)
 		}
 
 		if (is_jmp_point(env, env->insn_idx)) {
-			err = push_insn_history(env, state, 0, 0);
+			err = push_jmp_history(env, state, 0, 0);
 			if (err)
 				return err;
 		}
@@ -24291,7 +24295,6 @@ err_unlock:
 	if (!is_priv)
 		mutex_unlock(&bpf_verifier_lock);
 	vfree(env->insn_aux_data);
-	kvfree(env->insn_hist);
 err_free_env:
 	kvfree(env->cfg.insn_postorder);
 	kvfree(env);
-- 
cgit v1.2.3


From 96c6aa4c63af0bb0675c41b3e61a2fc7f6fed998 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Wed, 11 Jun 2025 13:08:27 -0700
Subject: bpf: compute SCCs in program control flow graph

Compute strongly connected components in the program CFG.
Assign an SCC number to each instruction, recorded in
env->insn_aux_data[*].scc. Use Tarjan's algorithm for SCC computation
adapted to run non-recursively.

For debug purposes print out computed SCCs as a part of full program
dump in compute_live_registers() at log level 2, e.g.:

  func#0 @0
  Live regs before insn:
        0: .......... (b4) w6 = 10
    2   1: ......6... (18) r1 = 0xffff88810bbb5565
    2   3: .1....6... (b4) w2 = 2
    2   4: .12...6... (85) call bpf_trace_printk#6
    2   5: ......6... (04) w6 += -1
    2   6: ......6... (56) if w6 != 0x0 goto pc-6
        7: .......... (b4) w6 = 5
    1   8: ......6... (18) r1 = 0xffff88810bbb5567
    1  10: .1....6... (b4) w2 = 2
    1  11: .12...6... (85) call bpf_trace_printk#6
    1  12: ......6... (04) w6 += -1
    1  13: ......6... (56) if w6 != 0x0 goto pc-6
       14: .......... (b4) w0 = 0
       15: 0......... (95) exit
   ^^^
  SCC number for the instruction

Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20250611200836.4135542-2-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |   5 ++
 kernel/bpf/verifier.c        | 182 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 187 insertions(+)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 3e77befdbc4b..95f5211610f4 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -609,6 +609,11 @@ struct bpf_insn_aux_data {
 	 * accepts callback function as a parameter.
 	 */
 	bool calls_callback;
+	/*
+	 * CFG strongly connected component this instruction belongs to,
+	 * zero if it is a singleton SCC.
+	 */
+	u32 scc;
 	/* registers alive before this instruction. */
 	u16 live_regs_before;
 };
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 92f2dad5f453..75e4f6544b2a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -24013,6 +24013,10 @@ static int compute_live_registers(struct bpf_verifier_env *env)
 	if (env->log.level & BPF_LOG_LEVEL2) {
 		verbose(env, "Live regs before insn:\n");
 		for (i = 0; i < insn_cnt; ++i) {
+			if (env->insn_aux_data[i].scc)
+				verbose(env, "%3d ", env->insn_aux_data[i].scc);
+			else
+				verbose(env, "    ");
 			verbose(env, "%3d: ", i);
 			for (j = BPF_REG_0; j < BPF_REG_10; ++j)
 				if (insn_aux[i].live_regs_before & BIT(j))
@@ -24034,6 +24038,180 @@ out:
 	return err;
 }
 
+/*
+ * Compute strongly connected components (SCCs) on the CFG.
+ * Assign an SCC number to each instruction, recorded in env->insn_aux_data[*].scc.
+ * If instruction is a sole member of its SCC and there are no self edges,
+ * assign it SCC number of zero.
+ * Uses a non-recursive adaptation of Tarjan's algorithm for SCC computation.
+ */
+static int compute_scc(struct bpf_verifier_env *env)
+{
+	const u32 NOT_ON_STACK = U32_MAX;
+
+	struct bpf_insn_aux_data *aux = env->insn_aux_data;
+	const u32 insn_cnt = env->prog->len;
+	int stack_sz, dfs_sz, err = 0;
+	u32 *stack, *pre, *low, *dfs;
+	u32 succ_cnt, i, j, t, w;
+	u32 next_preorder_num;
+	u32 next_scc_id;
+	bool assign_scc;
+	u32 succ[2];
+
+	next_preorder_num = 1;
+	next_scc_id = 1;
+	/*
+	 * - 'stack' accumulates vertices in DFS order, see invariant comment below;
+	 * - 'pre[t] == p' => preorder number of vertex 't' is 'p';
+	 * - 'low[t] == n' => smallest preorder number of the vertex reachable from 't' is 'n';
+	 * - 'dfs' DFS traversal stack, used to emulate explicit recursion.
+	 */
+	stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
+	pre = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
+	low = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
+	dfs = kvcalloc(insn_cnt, sizeof(*dfs), GFP_KERNEL);
+	if (!stack || !pre || !low || !dfs) {
+		err = -ENOMEM;
+		goto exit;
+	}
+	/*
+	 * References:
+	 * [1] R. Tarjan "Depth-First Search and Linear Graph Algorithms"
+	 * [2] D. J. Pearce "A Space-Efficient Algorithm for Finding Strongly Connected Components"
+	 *
+	 * The algorithm maintains the following invariant:
+	 * - suppose there is a path 'u' ~> 'v', such that 'pre[v] < pre[u]';
+	 * - then, vertex 'u' remains on stack while vertex 'v' is on stack.
+	 *
+	 * Consequently:
+	 * - If 'low[v] < pre[v]', there is a path from 'v' to some vertex 'u',
+	 *   such that 'pre[u] == low[v]'; vertex 'u' is currently on the stack,
+	 *   and thus there is an SCC (loop) containing both 'u' and 'v'.
+	 * - If 'low[v] == pre[v]', loops containing 'v' have been explored,
+	 *   and 'v' can be considered the root of some SCC.
+	 *
+	 * Here is a pseudo-code for an explicitly recursive version of the algorithm:
+	 *
+	 *    NOT_ON_STACK = insn_cnt + 1
+	 *    pre = [0] * insn_cnt
+	 *    low = [0] * insn_cnt
+	 *    scc = [0] * insn_cnt
+	 *    stack = []
+	 *
+	 *    next_preorder_num = 1
+	 *    next_scc_id = 1
+	 *
+	 *    def recur(w):
+	 *        nonlocal next_preorder_num
+	 *        nonlocal next_scc_id
+	 *
+	 *        pre[w] = next_preorder_num
+	 *        low[w] = next_preorder_num
+	 *        next_preorder_num += 1
+	 *        stack.append(w)
+	 *        for s in successors(w):
+	 *            # Note: for classic algorithm the block below should look as:
+	 *            #
+	 *            # if pre[s] == 0:
+	 *            #     recur(s)
+	 *            #     low[w] = min(low[w], low[s])
+	 *            # elif low[s] != NOT_ON_STACK:
+	 *            #     low[w] = min(low[w], pre[s])
+	 *            #
+	 *            # But replacing both 'min' instructions with 'low[w] = min(low[w], low[s])'
+	 *            # does not break the invariant and makes the iterative version of the algorithm
+	 *            # simpler. See 'Algorithm #3' from [2].
+	 *
+	 *            # 's' not yet visited
+	 *            if pre[s] == 0:
+	 *                recur(s)
+	 *            # if 's' is on stack, pick lowest reachable preorder number from it;
+	 *            # if 's' is not on stack 'low[s] == NOT_ON_STACK > low[w]',
+	 *            # so 'min' would be a noop.
+	 *            low[w] = min(low[w], low[s])
+	 *
+	 *        if low[w] == pre[w]:
+	 *            # 'w' is the root of an SCC, pop all vertices
+	 *            # below 'w' on stack and assign same SCC to them.
+	 *            while True:
+	 *                t = stack.pop()
+	 *                low[t] = NOT_ON_STACK
+	 *                scc[t] = next_scc_id
+	 *                if t == w:
+	 *                    break
+	 *            next_scc_id += 1
+	 *
+	 *    for i in range(0, insn_cnt):
+	 *        if pre[i] == 0:
+	 *            recur(i)
+	 *
+	 * The implementation below replaces explicit recursion with the 'dfs' array.
+	 */
+	for (i = 0; i < insn_cnt; i++) {
+		if (pre[i])
+			continue;
+		stack_sz = 0;
+		dfs_sz = 1;
+		dfs[0] = i;
+dfs_continue:
+		while (dfs_sz) {
+			w = dfs[dfs_sz - 1];
+			if (pre[w] == 0) {
+				low[w] = next_preorder_num;
+				pre[w] = next_preorder_num;
+				next_preorder_num++;
+				stack[stack_sz++] = w;
+			}
+			/* Visit 'w' successors */
+			succ_cnt = insn_successors(env->prog, w, succ);
+			for (j = 0; j < succ_cnt; ++j) {
+				if (pre[succ[j]]) {
+					low[w] = min(low[w], low[succ[j]]);
+				} else {
+					dfs[dfs_sz++] = succ[j];
+					goto dfs_continue;
+				}
+			}
+			/*
+			 * Preserve the invariant: if some vertex above in the stack
+			 * is reachable from 'w', keep 'w' on the stack.
+			 */
+			if (low[w] < pre[w]) {
+				dfs_sz--;
+				goto dfs_continue;
+			}
+			/*
+			 * Assign SCC number only if component has two or more elements,
+			 * or if component has a self reference.
+			 */
+			assign_scc = stack[stack_sz - 1] != w;
+			for (j = 0; j < succ_cnt; ++j) {
+				if (succ[j] == w) {
+					assign_scc = true;
+					break;
+				}
+			}
+			/* Pop component elements from stack */
+			do {
+				t = stack[--stack_sz];
+				low[t] = NOT_ON_STACK;
+				if (assign_scc)
+					aux[t].scc = next_scc_id;
+			} while (t != w);
+			if (assign_scc)
+				next_scc_id++;
+			dfs_sz--;
+		}
+	}
+exit:
+	kvfree(stack);
+	kvfree(pre);
+	kvfree(low);
+	kvfree(dfs);
+	return err;
+}
+
 int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
 {
 	u64 start_time = ktime_get_ns();
@@ -24155,6 +24333,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 	if (ret)
 		goto skip_full_check;
 
+	ret = compute_scc(env);
+	if (ret < 0)
+		goto skip_full_check;
+
 	ret = compute_live_registers(env);
 	if (ret < 0)
 		goto skip_full_check;
-- 
cgit v1.2.3


From c9e31900b54cadf5398dfb838c0a63effa1defec Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Wed, 11 Jun 2025 13:08:33 -0700
Subject: bpf: propagate read/precision marks over state graph backedges

Current loop_entry-based exact states comparison logic does not handle
the following case:

 .-> A --.  Assume the states are visited in the order A, B, C.
 |   |   |  Assume that state B reaches a state equivalent to state A.
 |   v   v  At this point, state C is not processed yet, so state A
 '-- B   C  has not received any read or precision marks from C.
            As a result, these marks won't be propagated to B.

If B has incomplete marks, it is unsafe to use it in states_equal()
checks.

This commit replaces the existing logic with the following:
- Strongly connected components (SCCs) are computed over the program's
  control flow graph (intraprocedurally).
- When a verifier state enters an SCC, that state is recorded as the
  SCC entry point.
- When a verifier state is found equivalent to another (e.g., B to A
  in the example), it is recorded as a states graph backedge.
  Backedges are accumulated per SCC.
- When an SCC entry state reaches `branches == 0`, read and precision
  marks are propagated through the backedges (e.g., from A to B, from
  C to A, and then again from A to B).

To support nested subprogram calls, the entry state and backedge list
are associated not with the SCC itself but with an object called
`bpf_scc_callchain`. A callchain is a tuple `(callsite*, scc_id)`,
where `callsite` is the index of a call instruction for each frame
except the last.

See the comments added in `is_state_visited()` and
`compute_scc_callchain()` for more details.

Fixes: 2a0992829ea3 ("bpf: correct loop detection for iterators convergence")
Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20250611200836.4135542-8-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  38 ++++
 kernel/bpf/verifier.c        | 452 ++++++++++++++++++++++++++++++++++++-------
 2 files changed, 422 insertions(+), 68 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 95f5211610f4..b0273f759589 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -459,6 +459,10 @@ struct bpf_verifier_state {
 	 * See get_loop_entry() for more information.
 	 */
 	struct bpf_verifier_state *loop_entry;
+	/* if this state is a backedge state then equal_state
+	 * records cached state to which this state is equal.
+	 */
+	struct bpf_verifier_state *equal_state;
 	/* jmp history recorded from first to last.
 	 * backtracking is using it to go from last to first.
 	 * For most states jmp_history_cnt is [0-3].
@@ -723,6 +727,37 @@ struct bpf_idset {
 	u32 ids[BPF_ID_MAP_SIZE];
 };
 
+/* see verifier.c:compute_scc_callchain() */
+struct bpf_scc_callchain {
+	/* call sites from bpf_verifier_state->frame[*]->callsite leading to this SCC */
+	u32 callsites[MAX_CALL_FRAMES - 1];
+	/* last frame in a chain is identified by SCC id */
+	u32 scc;
+};
+
+/* verifier state waiting for propagate_backedges() */
+struct bpf_scc_backedge {
+	struct bpf_scc_backedge *next;
+	struct bpf_verifier_state state;
+};
+
+struct bpf_scc_visit {
+	struct bpf_scc_callchain callchain;
+	/* first state in current verification path that entered SCC
+	 * identified by the callchain
+	 */
+	struct bpf_verifier_state *entry_state;
+	struct bpf_scc_backedge *backedges; /* list of backedges */
+};
+
+/* An array of bpf_scc_visit structs sharing the same bpf_scc_callchain->scc
+ * but having different bpf_scc_callchain->callsites.
+ */
+struct bpf_scc_info {
+	u32 num_visits;
+	struct bpf_scc_visit visits[];
+};
+
 /* single container for all structs
  * one verifier_env per bpf_check() call
  */
@@ -819,6 +854,9 @@ struct bpf_verifier_env {
 	char tmp_str_buf[TMP_STR_BUF_LEN];
 	struct bpf_insn insn_buf[INSN_BUF_SIZE];
 	struct bpf_insn epilogue_buf[INSN_BUF_SIZE];
+	/* array of pointers to bpf_scc_info indexed by SCC id */
+	struct bpf_scc_info **scc_info;
+	u32 scc_cnt;
 };
 
 static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 90b3d1a0bd86..aa1bb4be7b8b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1700,6 +1700,9 @@ static struct bpf_verifier_state_list *state_loop_entry_as_list(struct bpf_verif
 	return NULL;
 }
 
+static bool incomplete_read_marks(struct bpf_verifier_env *env,
+				  struct bpf_verifier_state *st);
+
 /* A state can be freed if it is no longer referenced:
  * - is in the env->free_list;
  * - has no children states;
@@ -1710,20 +1713,14 @@ static struct bpf_verifier_state_list *state_loop_entry_as_list(struct bpf_verif
 static void maybe_free_verifier_state(struct bpf_verifier_env *env,
 				      struct bpf_verifier_state_list *sl)
 {
-	struct bpf_verifier_state_list *loop_entry_sl;
-
-	while (sl && sl->in_free_list &&
-		     sl->state.branches == 0 &&
-		     sl->state.used_as_loop_entry == 0) {
-		loop_entry_sl = state_loop_entry_as_list(&sl->state);
-		if (loop_entry_sl)
-			loop_entry_sl->state.used_as_loop_entry--;
-		list_del(&sl->node);
-		free_verifier_state(&sl->state, false);
-		kfree(sl);
-		env->free_list_size--;
-		sl = loop_entry_sl;
-	}
+	if (!sl->in_free_list
+	    || sl->state.branches != 0
+	    || incomplete_read_marks(env, &sl->state))
+		return;
+	list_del(&sl->node);
+	free_verifier_state(&sl->state, false);
+	kfree(sl);
+	env->free_list_size--;
 }
 
 /* copy verifier state from src to dst growing dst stack space
@@ -1771,6 +1768,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	dst_state->used_as_loop_entry = src->used_as_loop_entry;
 	dst_state->may_goto_depth = src->may_goto_depth;
 	dst_state->loop_entry = src->loop_entry;
+	dst_state->equal_state = src->equal_state;
 	for (i = 0; i <= src->curframe; i++) {
 		dst = dst_state->frame[i];
 		if (!dst) {
@@ -1972,22 +1970,218 @@ static u32 frame_insn_idx(struct bpf_verifier_state *st, u32 frame)
 	       : st->frame[frame + 1]->callsite;
 }
 
-static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+/* For state @st look for a topmost frame with frame_insn_idx() in some SCC,
+ * if such frame exists form a corresponding @callchain as an array of
+ * call sites leading to this frame and SCC id.
+ * E.g.:
+ *
+ *    void foo()  { A: loop {... SCC#1 ...}; }
+ *    void bar()  { B: loop { C: foo(); ... SCC#2 ... }
+ *                  D: loop { E: foo(); ... SCC#3 ... } }
+ *    void main() { F: bar(); }
+ *
+ * @callchain at (A) would be either (F,SCC#2) or (F,SCC#3) depending
+ * on @st frame call sites being (F,C,A) or (F,E,A).
+ */
+static bool compute_scc_callchain(struct bpf_verifier_env *env,
+				  struct bpf_verifier_state *st,
+				  struct bpf_scc_callchain *callchain)
+{
+	u32 i, scc, insn_idx;
+
+	memset(callchain, 0, sizeof(*callchain));
+	for (i = 0; i <= st->curframe; i++) {
+		insn_idx = frame_insn_idx(st, i);
+		scc = env->insn_aux_data[insn_idx].scc;
+		if (scc) {
+			callchain->scc = scc;
+			break;
+		} else if (i < st->curframe) {
+			callchain->callsites[i] = insn_idx;
+		} else {
+			return false;
+		}
+	}
+	return true;
+}
+
+/* Return the bpf_scc_visit recorded for @callchain, or NULL if none exists. */
+static struct bpf_scc_visit *scc_visit_lookup(struct bpf_verifier_env *env,
+					      struct bpf_scc_callchain *callchain)
+{
+	struct bpf_scc_info *info = env->scc_info[callchain->scc];
+	struct bpf_scc_visit *visits;
+	u32 i;
+
+	if (!info)	/* slot is filled lazily by scc_visit_alloc() */
+		return NULL;
+	for (i = 0, visits = info->visits; i < info->num_visits; i++)
+		if (memcmp(callchain, &visits[i].callchain, sizeof(*callchain)) == 0)
+			return &visits[i];
+	return NULL;
+}
+
+/* Allocate a new bpf_scc_visit instance corresponding to @callchain.
+ * Allocated instances are alive for a duration of the do_check_common()
+ * call and are freed by free_states().
+ */
+static struct bpf_scc_visit *scc_visit_alloc(struct bpf_verifier_env *env,
+					     struct bpf_scc_callchain *callchain)
+{
+	struct bpf_scc_visit *visit;
+	struct bpf_scc_info *info;
+	u32 scc, num_visits;
+	u64 new_sz;
+
+	scc = callchain->scc;
+	info = env->scc_info[scc];
+	num_visits = info ? info->num_visits : 0;
+	new_sz = sizeof(*info) + sizeof(struct bpf_scc_visit) * (num_visits + 1);
+	info = kvrealloc(env->scc_info[scc], new_sz, GFP_KERNEL);
+	if (!info)
+		return NULL;
+	env->scc_info[scc] = info;
+	info->num_visits = num_visits + 1;
+	visit = &info->visits[num_visits];
+	memset(visit, 0, sizeof(*visit));
+	memcpy(&visit->callchain, callchain, sizeof(*callchain));
+	return visit;
+}
+
+/* Form a string '(callsite#1,callsite#2,...,scc)' in env->tmp_str_buf */
+static char *format_callchain(struct bpf_verifier_env *env, struct bpf_scc_callchain *callchain)
+{
+	char *buf = env->tmp_str_buf;
+	int i, delta = 0;
+
+	delta += scnprintf(buf + delta, TMP_STR_BUF_LEN - delta, "(");
+	for (i = 0; i < ARRAY_SIZE(callchain->callsites); i++) {
+		if (!callchain->callsites[i])
+			break;
+		delta += scnprintf(buf + delta, TMP_STR_BUF_LEN - delta, "%u,",
+				   callchain->callsites[i]);
+	}
+	delta += scnprintf(buf + delta, TMP_STR_BUF_LEN - delta, "%u)", callchain->scc);
+	return env->tmp_str_buf;
+}
+
+/* If callchain for @st exists (@st is in some SCC), ensure that
+ * bpf_scc_visit instance for this callchain exists.
+ * If instance does not exist or is empty, assign visit->entry_state to @st.
+ */
+static int maybe_enter_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+{
+	struct bpf_scc_callchain callchain;
+	struct bpf_scc_visit *visit;
+
+	if (!compute_scc_callchain(env, st, &callchain))
+		return 0;
+	visit = scc_visit_lookup(env, &callchain);
+	visit = visit ?: scc_visit_alloc(env, &callchain);
+	if (!visit)
+		return -ENOMEM;
+	if (!visit->entry_state) {
+		visit->entry_state = st;
+		if (env->log.level & BPF_LOG_LEVEL2)
+			verbose(env, "SCC enter %s\n", format_callchain(env, &callchain));
+	}
+	return 0;
+}
+
+static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visit *visit);
+
+/* If callchain for @st exists (@st is in some SCC), make it empty:
+ * - set visit->entry_state to NULL;
+ * - flush accumulated backedges.
+ */
+static int maybe_exit_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+{
+	struct bpf_scc_callchain callchain;
+	struct bpf_scc_visit *visit;
+
+	if (!compute_scc_callchain(env, st, &callchain))
+		return 0;
+	visit = scc_visit_lookup(env, &callchain);
+	if (!visit) {
+		verifier_bug(env, "scc exit: no visit info for call chain %s",
+			     format_callchain(env, &callchain));
+		return -EFAULT;
+	}
+	if (visit->entry_state != st)
+		return 0;
+	if (env->log.level & BPF_LOG_LEVEL2)
+		verbose(env, "SCC exit %s\n", format_callchain(env, &callchain));
+	visit->entry_state = NULL;
+	return propagate_backedges(env, visit);
+}
+
+/* Look up the bpf_scc_visit instance corresponding to @st callchain
+ * and add @backedge to visit->backedges. @st callchain must exist.
+ */
+static int add_scc_backedge(struct bpf_verifier_env *env,
+			    struct bpf_verifier_state *st,
+			    struct bpf_scc_backedge *backedge)
+{
+	struct bpf_scc_callchain callchain;
+	struct bpf_scc_visit *visit;
+
+	if (!compute_scc_callchain(env, st, &callchain)) {
+		verifier_bug(env, "add backedge: no SCC in verification path, insn_idx %d",
+			     st->insn_idx);
+		return -EFAULT;
+	}
+	visit = scc_visit_lookup(env, &callchain);
+	if (!visit) {
+		verifier_bug(env, "add backedge: no visit info for call chain %s",
+			     format_callchain(env, &callchain));
+		return -EFAULT;
+	}
+	if (env->log.level & BPF_LOG_LEVEL2)
+		verbose(env, "SCC backedge %s\n", format_callchain(env, &callchain));
+	backedge->next = visit->backedges;
+	visit->backedges = backedge;
+	return 0;
+}
+
+/* bpf_reg_state->live marks for registers in a state @st are incomplete,
+ * if state @st is in some SCC and not all execution paths starting at this
+ * SCC are fully explored.
+ */
+static bool incomplete_read_marks(struct bpf_verifier_env *env,
+				  struct bpf_verifier_state *st)
+{
+	struct bpf_scc_callchain callchain;
+	struct bpf_scc_visit *visit;
+
+	if (!compute_scc_callchain(env, st, &callchain))
+		return false;
+	visit = scc_visit_lookup(env, &callchain);
+	if (!visit)
+		return false;
+	return !!visit->backedges;
+}
+
+static void free_backedges(struct bpf_scc_visit *visit)
+{
+	struct bpf_scc_backedge *backedge, *next;
+
+	for (backedge = visit->backedges; backedge; backedge = next) {
+		free_verifier_state(&backedge->state, false);
+		next = backedge->next;
+		kvfree(backedge);
+	}
+	visit->backedges = NULL;
+}
+
+static int update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
 {
 	struct bpf_verifier_state_list *sl = NULL, *parent_sl;
 	struct bpf_verifier_state *parent;
+	int err;
 
 	while (st) {
 		u32 br = --st->branches;
 
-		/* br == 0 signals that DFS exploration for 'st' is finished,
-		 * thus it is necessary to update parent's loop entry if it
-		 * turned out that st is a part of some loop.
-		 * This is a part of 'case A' in get_loop_entry() comment.
-		 */
-		if (br == 0 && st->parent && st->loop_entry)
-			update_loop_entry(env, st->parent, st->loop_entry);
-
 		/* WARN_ON(br > 1) technically makes sense here,
 		 * but see comment in push_stack(), hence:
 		 */
@@ -1996,6 +2190,9 @@ static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifi
 			  br);
 		if (br)
 			break;
+		err = maybe_exit_scc(env, st);
+		if (err)
+			return err;
 		parent = st->parent;
 		parent_sl = state_parent_as_list(st);
 		if (sl)
@@ -2003,6 +2200,7 @@ static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifi
 		st = parent;
 		sl = parent_sl;
 	}
+	return 0;
 }
 
 static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
@@ -18344,7 +18542,6 @@ static void clean_verifier_state(struct bpf_verifier_env *env,
 static void clean_live_states(struct bpf_verifier_env *env, int insn,
 			      struct bpf_verifier_state *cur)
 {
-	struct bpf_verifier_state *loop_entry;
 	struct bpf_verifier_state_list *sl;
 	struct list_head *pos, *head;
 
@@ -18353,15 +18550,14 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn,
 		sl = container_of(pos, struct bpf_verifier_state_list, node);
 		if (sl->state.branches)
 			continue;
-		loop_entry = get_loop_entry(env, &sl->state);
-		if (!IS_ERR_OR_NULL(loop_entry) && loop_entry->branches)
-			continue;
 		if (sl->state.insn_idx != insn ||
 		    !same_callsites(&sl->state, cur))
 			continue;
 		if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE)
 			/* all regs in this state in all frames were already marked */
 			continue;
+		if (incomplete_read_marks(env, &sl->state))
+			continue;
 		clean_verifier_state(env, &sl->state);
 	}
 }
@@ -18963,6 +19159,46 @@ static int propagate_precision(struct bpf_verifier_env *env,
 	return 0;
 }
 
+#define MAX_BACKEDGE_ITERS 64
+
+/* Propagate read and precision marks from visit->backedges[*].state->equal_state
+ * to corresponding parent states of visit->backedges[*].state until fixed point is reached,
+ * then free visit->backedges.
+ * After execution of this function incomplete_read_marks() will return false
+ * for all states corresponding to @visit->callchain.
+ */
+static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visit *visit)
+{
+	struct bpf_scc_backedge *backedge;
+	struct bpf_verifier_state *st;
+	bool changed;
+	int i, err;
+
+	i = 0;
+	do {
+		if (i++ > MAX_BACKEDGE_ITERS) {
+			if (env->log.level & BPF_LOG_LEVEL2)
+				verbose(env, "%s: too many iterations\n", __func__);
+			for (backedge = visit->backedges; backedge; backedge = backedge->next)
+				mark_all_scalars_precise(env, &backedge->state);
+			break;
+		}
+		changed = false;
+		for (backedge = visit->backedges; backedge; backedge = backedge->next) {
+			st = &backedge->state;
+			err = propagate_liveness(env, st->equal_state, st, &changed);
+			if (err)
+				return err;
+			err = propagate_precision(env, st->equal_state, st, &changed);
+			if (err)
+				return err;
+		}
+	} while (changed);
+
+	free_backedges(visit);
+	return 0;
+}
+
 static bool states_maybe_looping(struct bpf_verifier_state *old,
 				 struct bpf_verifier_state *cur)
 {
@@ -19072,9 +19308,9 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 {
 	struct bpf_verifier_state_list *new_sl;
 	struct bpf_verifier_state_list *sl;
-	struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry;
+	struct bpf_verifier_state *cur = env->cur_state, *new;
+	bool force_new_state, add_new_state, loop;
 	int i, j, n, err, states_cnt = 0;
-	bool force_new_state, add_new_state, force_exact;
 	struct list_head *pos, *tmp, *head;
 
 	force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) ||
@@ -19096,6 +19332,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 
 	clean_live_states(env, insn_idx, cur);
 
+	loop = false;
 	head = explored_state(env, insn_idx);
 	list_for_each_safe(pos, tmp, head) {
 		sl = container_of(pos, struct bpf_verifier_state_list, node);
@@ -19175,7 +19412,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 					spi = __get_spi(iter_reg->off + iter_reg->var_off.value);
 					iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr;
 					if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) {
-						update_loop_entry(env, cur, &sl->state);
+						loop = true;
 						goto hit;
 					}
 				}
@@ -19184,7 +19421,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 			if (is_may_goto_insn_at(env, insn_idx)) {
 				if (sl->state.may_goto_depth != cur->may_goto_depth &&
 				    states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
-					update_loop_entry(env, cur, &sl->state);
+					loop = true;
 					goto hit;
 				}
 			}
@@ -19226,38 +19463,9 @@ skip_inf_loop_check:
 				add_new_state = false;
 			goto miss;
 		}
-		/* If sl->state is a part of a loop and this loop's entry is a part of
-		 * current verification path then states have to be compared exactly.
-		 * 'force_exact' is needed to catch the following case:
-		 *
-		 *                initial     Here state 'succ' was processed first,
-		 *                  |         it was eventually tracked to produce a
-		 *                  V         state identical to 'hdr'.
-		 *     .---------> hdr        All branches from 'succ' had been explored
-		 *     |            |         and thus 'succ' has its .branches == 0.
-		 *     |            V
-		 *     |    .------...        Suppose states 'cur' and 'succ' correspond
-		 *     |    |       |         to the same instruction + callsites.
-		 *     |    V       V         In such case it is necessary to check
-		 *     |   ...     ...        if 'succ' and 'cur' are states_equal().
-		 *     |    |       |         If 'succ' and 'cur' are a part of the
-		 *     |    V       V         same loop exact flag has to be set.
-		 *     |   succ <- cur        To check if that is the case, verify
-		 *     |    |                 if loop entry of 'succ' is in current
-		 *     |    V                 DFS path.
-		 *     |   ...
-		 *     |    |
-		 *     '----'
-		 *
-		 * Additional details are in the comment before get_loop_entry().
-		 */
-		loop_entry = get_loop_entry(env, &sl->state);
-		if (IS_ERR(loop_entry))
-			return PTR_ERR(loop_entry);
-		force_exact = loop_entry && loop_entry->branches > 0;
-		if (states_equal(env, &sl->state, cur, force_exact ? RANGE_WITHIN : NOT_EXACT)) {
-			if (force_exact)
-				update_loop_entry(env, cur, loop_entry);
+		/* See comments for mark_all_regs_read_and_precise() */
+		loop = incomplete_read_marks(env, &sl->state);
+		if (states_equal(env, &sl->state, cur, loop ? RANGE_WITHIN : NOT_EXACT)) {
 hit:
 			sl->hit_cnt++;
 			/* reached equivalent register/stack state,
@@ -19282,6 +19490,94 @@ hit:
 			err = err ? : propagate_precision(env, &sl->state, cur, NULL);
 			if (err)
 				return err;
+			/* When processing iterator based loops above propagate_liveness and
+			 * propagate_precision calls are not sufficient to transfer all relevant
+			 * read and precision marks. E.g. consider the following case:
+			 *
+			 *  .-> A --.  Assume the states are visited in the order A, B, C.
+			 *  |   |   |  Assume that state B reaches a state equivalent to state A.
+			 *  |   v   v  At this point, state C is not processed yet, so state A
+			 *  '-- B   C  has not received any read or precision marks from C.
+			 *             Thus, marks propagated from A to B are incomplete.
+			 *
+			 * The verifier mitigates this by performing the following steps:
+			 *
+			 * - Prior to the main verification pass, strongly connected components
+			 *   (SCCs) are computed over the program's control flow graph,
+			 *   intraprocedurally.
+			 *
+			 * - During the main verification pass, `maybe_enter_scc()` checks
+			 *   whether the current verifier state is entering an SCC. If so, an
+			 *   instance of a `bpf_scc_visit` object is created, and the state
+			 *   entering the SCC is recorded as the entry state.
+			 *
+			 * - This instance is associated not with the SCC itself, but with a
+			 *   `bpf_scc_callchain`: a tuple consisting of the call sites leading to
+			 *   the SCC and the SCC id. See `compute_scc_callchain()`.
+			 *
+			 * - When a verification path encounters a `states_equal(...,
+			 *   RANGE_WITHIN)` condition, there exists a call chain describing the
+			 *   current state and a corresponding `bpf_scc_visit` instance. A copy
+			 *   of the current state is created and added to
+			 *   `bpf_scc_visit->backedges`.
+			 *
+			 * - When a verification path terminates, `maybe_exit_scc()` is called
+			 *   from `update_branch_counts()`. For states with `branches == 0`, it
+			 *   checks whether the state is the entry state of any `bpf_scc_visit`
+			 *   instance. If it is, this indicates that all paths originating from
+			 *   this SCC visit have been explored. `propagate_backedges()` is then
+			 *   called, which propagates read and precision marks through the
+			 *   backedges until a fixed point is reached.
+			 *   (In the earlier example, this would propagate marks from A to B,
+			 *    from C to A, and then again from A to B.)
+			 *
+			 * A note on callchains
+			 * --------------------
+			 *
+			 * Consider the following example:
+			 *
+			 *     void foo() { loop { ... SCC#1 ... } }
+			 *     void main() {
+			 *       A: foo();
+			 *       B: ...
+			 *       C: foo();
+			 *     }
+			 *
+			 * Here, there are two distinct callchains leading to SCC#1:
+			 * - (A, SCC#1)
+			 * - (C, SCC#1)
+			 *
+			 * Each callchain identifies a separate `bpf_scc_visit` instance that
+			 * accumulates backedge states. The `propagate_{liveness,precision}()`
+			 * functions traverse the parent state of each backedge state, which
+			 * means these parent states must remain valid (i.e., not freed) while
+			 * the corresponding `bpf_scc_visit` instance exists.
+			 *
+			 * Associating `bpf_scc_visit` instances directly with SCCs instead of
+			 * callchains would break this invariant:
+			 * - States explored during `C: foo()` would contribute backedges to
+			 *   SCC#1, but SCC#1 would only be exited once the exploration of
+			 *   `A: foo()` completes.
+			 * - By that time, the states explored between `A: foo()` and `C: foo()`
+			 *   (i.e., `B: ...`) may have already been freed, causing the parent
+			 *   links for states from `C: foo()` to become invalid.
+			 */
+			if (loop) {
+				struct bpf_scc_backedge *backedge;
+
+				backedge = kzalloc(sizeof(*backedge), GFP_KERNEL);
+				if (!backedge)
+					return -ENOMEM;
+				err = copy_verifier_state(&backedge->state, cur);
+				backedge->state.equal_state = &sl->state;
+				backedge->state.insn_idx = insn_idx;
+				err = err ?: add_scc_backedge(env, &sl->state, backedge);
+				if (err) {
+					free_verifier_state(&backedge->state, false);
+					kvfree(backedge);
+					return err;
+				}
+			}
 			return 1;
 		}
 miss:
@@ -19357,6 +19653,12 @@ miss:
 	new->insn_idx = insn_idx;
 	WARN_ONCE(new->branches != 1,
 		  "BUG is_state_visited:branches_to_explore=%d insn %d\n", new->branches, insn_idx);
+	err = maybe_enter_scc(env, new);
+	if (err) {
+		free_verifier_state(new, false);
+		kvfree(new_sl);
+		return err;
+	}
 
 	cur->parent = new;
 	cur->first_insn_idx = insn_idx;
@@ -19811,7 +20113,9 @@ static int do_check(struct bpf_verifier_env *env)
 			WARN_ON_ONCE(env->insn_idx != prev_insn_idx + 1);
 process_bpf_exit:
 			mark_verifier_state_scratched(env);
-			update_branch_counts(env, env->cur_state);
+			err = update_branch_counts(env, env->cur_state);
+			if (err)
+				return err;
 			err = pop_stack(env, &prev_insn_idx, &env->insn_idx,
 					pop_log);
 			if (err < 0) {
@@ -19819,9 +20123,6 @@ process_bpf_exit:
 					return err;
 				break;
 			} else {
-				if (verifier_bug_if(env->cur_state->loop_entry, env,
-						    "broken loop detection"))
-					return -EFAULT;
 				do_print_state = true;
 				continue;
 			}
@@ -22787,7 +23088,8 @@ static void free_states(struct bpf_verifier_env *env)
 {
 	struct bpf_verifier_state_list *sl;
 	struct list_head *head, *pos, *tmp;
-	int i;
+	struct bpf_scc_info *info;
+	int i, j;
 
 	list_for_each_safe(pos, tmp, &env->free_list) {
 		sl = container_of(pos, struct bpf_verifier_state_list, node);
@@ -22796,6 +23098,14 @@ static void free_states(struct bpf_verifier_env *env)
 	}
 	INIT_LIST_HEAD(&env->free_list);
 
+	for (i = 0; i < env->scc_cnt; ++i) {
+		info = env->scc_info[i];	/* NULL if this SCC was never visited */
+		for (j = 0; info && j < info->num_visits; j++)
+			free_backedges(&info->visits[j]);
+		kvfree(info);
+		env->scc_info[i] = NULL;
+	}
+
 	if (!env->explored_states)
 		return;
 
@@ -24228,6 +24538,11 @@ dfs_continue:
 			dfs_sz--;
 		}
 	}
+	env->scc_info = kvcalloc(next_scc_id, sizeof(*env->scc_info), GFP_KERNEL);
+	if (!env->scc_info) {
+		err = -ENOMEM;
+		goto exit;
+	}
 exit:
 	kvfree(stack);
 	kvfree(pre);
@@ -24503,6 +24818,7 @@ err_unlock:
 	vfree(env->insn_aux_data);
 err_free_env:
 	kvfree(env->cfg.insn_postorder);
+	kvfree(env->scc_info);
 	kvfree(env);
 	return ret;
 }
-- 
cgit v1.2.3


From 0e0da5f901f582b97bfeefbf1f36a27e9d427ff4 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Wed, 11 Jun 2025 13:08:34 -0700
Subject: bpf: remove {update,get}_loop_entry functions

The previous patch switched read and precision tracking for
iterator-based loops from state-graph-based loop tracking to
control-flow-graph-based loop tracking.

This patch removes the now-unused `update_loop_entry()` and
`get_loop_entry()` functions, which were part of the state-graph-based
logic.

Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20250611200836.4135542-9-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  15 ----
 kernel/bpf/verifier.c        | 165 +------------------------------------------
 2 files changed, 1 insertion(+), 179 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index b0273f759589..1ae588679e20 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -449,16 +449,6 @@ struct bpf_verifier_state {
 	/* first and last insn idx of this verifier state */
 	u32 first_insn_idx;
 	u32 last_insn_idx;
-	/* If this state is a part of states loop this field points to some
-	 * parent of this state such that:
-	 * - it is also a member of the same states loop;
-	 * - DFS states traversal starting from initial state visits loop_entry
-	 *   state before this state.
-	 * Used to compute topmost loop entry for state loops.
-	 * State loops might appear because of open coded iterators logic.
-	 * See get_loop_entry() for more information.
-	 */
-	struct bpf_verifier_state *loop_entry;
 	/* if this state is a backedge state then equal_state
 	 * records cached state to which this state is equal.
 	 */
@@ -473,11 +463,6 @@ struct bpf_verifier_state {
 	u32 dfs_depth;
 	u32 callback_unroll_depth;
 	u32 may_goto_depth;
-	/* If this state was ever pointed-to by other state's loop_entry field
-	 * this flag would be set to true. Used to avoid freeing such states
-	 * while they are still in use.
-	 */
-	u32 used_as_loop_entry;
 };
 
 #define bpf_get_spilled_reg(slot, frame, mask)				\
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index aa1bb4be7b8b..48847f8da5b1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1682,7 +1682,7 @@ static void free_verifier_state(struct bpf_verifier_state *state,
 		kfree(state);
 }
 
-/* struct bpf_verifier_state->{parent,loop_entry} refer to states
+/* struct bpf_verifier_state->parent refers to states
  * that are in either of env->{expored_states,free_list}.
  * In both cases the state is contained in struct bpf_verifier_state_list.
  */
@@ -1693,22 +1693,12 @@ static struct bpf_verifier_state_list *state_parent_as_list(struct bpf_verifier_
 	return NULL;
 }
 
-static struct bpf_verifier_state_list *state_loop_entry_as_list(struct bpf_verifier_state *st)
-{
-	if (st->loop_entry)
-		return container_of(st->loop_entry, struct bpf_verifier_state_list, state);
-	return NULL;
-}
-
 static bool incomplete_read_marks(struct bpf_verifier_env *env,
 				  struct bpf_verifier_state *st);
 
 /* A state can be freed if it is no longer referenced:
  * - is in the env->free_list;
  * - has no children states;
- * - is not used as loop_entry.
- *
- * Freeing a state can make it's loop_entry free-able.
  */
 static void maybe_free_verifier_state(struct bpf_verifier_env *env,
 				      struct bpf_verifier_state_list *sl)
@@ -1765,9 +1755,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	dst_state->last_insn_idx = src->last_insn_idx;
 	dst_state->dfs_depth = src->dfs_depth;
 	dst_state->callback_unroll_depth = src->callback_unroll_depth;
-	dst_state->used_as_loop_entry = src->used_as_loop_entry;
 	dst_state->may_goto_depth = src->may_goto_depth;
-	dst_state->loop_entry = src->loop_entry;
 	dst_state->equal_state = src->equal_state;
 	for (i = 0; i <= src->curframe; i++) {
 		dst = dst_state->frame[i];
@@ -1811,157 +1799,6 @@ static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_sta
 	return true;
 }
 
-/* Open coded iterators allow back-edges in the state graph in order to
- * check unbounded loops that iterators.
- *
- * In is_state_visited() it is necessary to know if explored states are
- * part of some loops in order to decide whether non-exact states
- * comparison could be used:
- * - non-exact states comparison establishes sub-state relation and uses
- *   read and precision marks to do so, these marks are propagated from
- *   children states and thus are not guaranteed to be final in a loop;
- * - exact states comparison just checks if current and explored states
- *   are identical (and thus form a back-edge).
- *
- * Paper "A New Algorithm for Identifying Loops in Decompilation"
- * by Tao Wei, Jian Mao, Wei Zou and Yu Chen [1] presents a convenient
- * algorithm for loop structure detection and gives an overview of
- * relevant terminology. It also has helpful illustrations.
- *
- * [1] https://api.semanticscholar.org/CorpusID:15784067
- *
- * We use a similar algorithm but because loop nested structure is
- * irrelevant for verifier ours is significantly simpler and resembles
- * strongly connected components algorithm from Sedgewick's textbook.
- *
- * Define topmost loop entry as a first node of the loop traversed in a
- * depth first search starting from initial state. The goal of the loop
- * tracking algorithm is to associate topmost loop entries with states
- * derived from these entries.
- *
- * For each step in the DFS states traversal algorithm needs to identify
- * the following situations:
- *
- *          initial                     initial                   initial
- *            |                           |                         |
- *            V                           V                         V
- *           ...                         ...           .---------> hdr
- *            |                           |            |            |
- *            V                           V            |            V
- *           cur                     .-> succ          |    .------...
- *            |                      |    |            |    |       |
- *            V                      |    V            |    V       V
- *           succ                    '-- cur           |   ...     ...
- *                                                     |    |       |
- *                                                     |    V       V
- *                                                     |   succ <- cur
- *                                                     |    |
- *                                                     |    V
- *                                                     |   ...
- *                                                     |    |
- *                                                     '----'
- *
- *  (A) successor state of cur   (B) successor state of cur or it's entry
- *      not yet traversed            are in current DFS path, thus cur and succ
- *                                   are members of the same outermost loop
- *
- *                      initial                  initial
- *                        |                        |
- *                        V                        V
- *                       ...                      ...
- *                        |                        |
- *                        V                        V
- *                .------...               .------...
- *                |       |                |       |
- *                V       V                V       V
- *           .-> hdr     ...              ...     ...
- *           |    |       |                |       |
- *           |    V       V                V       V
- *           |   succ <- cur              succ <- cur
- *           |    |                        |
- *           |    V                        V
- *           |   ...                      ...
- *           |    |                        |
- *           '----'                       exit
- *
- * (C) successor state of cur is a part of some loop but this loop
- *     does not include cur or successor state is not in a loop at all.
- *
- * Algorithm could be described as the following python code:
- *
- *     traversed = set()   # Set of traversed nodes
- *     entries = {}        # Mapping from node to loop entry
- *     depths = {}         # Depth level assigned to graph node
- *     path = set()        # Current DFS path
- *
- *     # Find outermost loop entry known for n
- *     def get_loop_entry(n):
- *         h = entries.get(n, None)
- *         while h in entries:
- *             h = entries[h]
- *         return h
- *
- *     # Update n's loop entry if h comes before n in current DFS path.
- *     def update_loop_entry(n, h):
- *         if h in path and depths[entries.get(n, n)] < depths[n]:
- *             entries[n] = h1
- *
- *     def dfs(n, depth):
- *         traversed.add(n)
- *         path.add(n)
- *         depths[n] = depth
- *         for succ in G.successors(n):
- *             if succ not in traversed:
- *                 # Case A: explore succ and update cur's loop entry
- *                 #         only if succ's entry is in current DFS path.
- *                 dfs(succ, depth + 1)
- *                 h = entries.get(succ, None)
- *                 update_loop_entry(n, h)
- *             else:
- *                 # Case B or C depending on `h1 in path` check in update_loop_entry().
- *                 update_loop_entry(n, succ)
- *         path.remove(n)
- *
- * To adapt this algorithm for use with verifier:
- * - use st->branch == 0 as a signal that DFS of succ had been finished
- *   and cur's loop entry has to be updated (case A), handle this in
- *   update_branch_counts();
- * - use st->branch > 0 as a signal that st is in the current DFS path;
- * - handle cases B and C in is_state_visited().
- */
-static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_env *env,
-						 struct bpf_verifier_state *st)
-{
-	struct bpf_verifier_state *topmost = st->loop_entry;
-	u32 steps = 0;
-
-	while (topmost && topmost->loop_entry) {
-		if (verifier_bug_if(steps++ > st->dfs_depth, env, "infinite loop"))
-			return ERR_PTR(-EFAULT);
-		topmost = topmost->loop_entry;
-	}
-	return topmost;
-}
-
-static void update_loop_entry(struct bpf_verifier_env *env,
-			      struct bpf_verifier_state *cur, struct bpf_verifier_state *hdr)
-{
-	/* The hdr->branches check decides between cases B and C in
-	 * comment for get_loop_entry(). If hdr->branches == 0 then
-	 * head's topmost loop entry is not in current DFS path,
-	 * hence 'cur' and 'hdr' are not in the same loop and there is
-	 * no need to update cur->loop_entry.
-	 */
-	if (hdr->branches && hdr->dfs_depth < (cur->loop_entry ?: cur)->dfs_depth) {
-		if (cur->loop_entry) {
-			cur->loop_entry->used_as_loop_entry--;
-			maybe_free_verifier_state(env, state_loop_entry_as_list(cur));
-		}
-		cur->loop_entry = hdr;
-		hdr->used_as_loop_entry++;
-	}
-}
-
 /* Return IP for a given frame in a call stack */
 static u32 frame_insn_idx(struct bpf_verifier_state *st, u32 frame)
 {
-- 
cgit v1.2.3


From 0f54ff54700315caa8ed3bea36fa0ff3ebc53f56 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Wed, 11 Jun 2025 13:08:35 -0700
Subject: bpf: include backedges in peak_states stat

Count states accumulated in bpf_scc_visit->backedges in
env->peak_states.

Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20250611200836.4135542-10-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h | 2 ++
 kernel/bpf/verifier.c        | 8 +++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 1ae588679e20..7e459e839f8b 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -733,6 +733,7 @@ struct bpf_scc_visit {
 	 */
 	struct bpf_verifier_state *entry_state;
 	struct bpf_scc_backedge *backedges; /* list of backedges */
+	u32 num_backedges;
 };
 
 /* An array of bpf_scc_visit structs sharing tht same bpf_scc_callchain->scc
@@ -822,6 +823,7 @@ struct bpf_verifier_env {
 	u32 longest_mark_read_walk;
 	u32 free_list_size;
 	u32 explored_states_size;
+	u32 num_backedges;
 	bpfptr_t fd_array;
 
 	/* bit mask to keep track of whether a register has been accessed
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 48847f8da5b1..1d3277bf935e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1648,7 +1648,7 @@ static void update_peak_states(struct bpf_verifier_env *env)
 {
 	u32 cur_states;
 
-	cur_states = env->explored_states_size + env->free_list_size;
+	cur_states = env->explored_states_size + env->free_list_size + env->num_backedges;
 	env->peak_states = max(env->peak_states, cur_states);
 }
 
@@ -1949,6 +1949,9 @@ static int maybe_exit_scc(struct bpf_verifier_env *env, struct bpf_verifier_stat
 	if (env->log.level & BPF_LOG_LEVEL2)
 		verbose(env, "SCC exit %s\n", format_callchain(env, &callchain));
 	visit->entry_state = NULL;
+	env->num_backedges -= visit->num_backedges;
+	visit->num_backedges = 0;
+	update_peak_states(env);
 	return propagate_backedges(env, visit);
 }
 
@@ -1977,6 +1980,9 @@ static int add_scc_backedge(struct bpf_verifier_env *env,
 		verbose(env, "SCC backedge %s\n", format_callchain(env, &callchain));
 	backedge->next = visit->backedges;
 	visit->backedges = backedge;
+	visit->num_backedges++;
+	env->num_backedges++;
+	update_peak_states(env);
 	return 0;
 }
 
-- 
cgit v1.2.3


From fac4b41741b5cd0826cf0fa5b14e177f70a6b509 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 11 Jun 2025 07:59:43 -0700
Subject: net: ethtool: require drivers to opt into the per-RSS ctx RXFH

RX Flow Hashing supports using different configuration for different
RSS contexts. Only two drivers seem to support it. Make sure we
uniformly error out for drivers which don't.

Reviewed-by: Joe Damato <joe@dama.to>
Link: https://patch.msgid.link/20250611145949.2674086-4-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c | 1 +
 drivers/net/ethernet/sfc/ethtool.c                   | 1 +
 include/linux/ethtool.h                              | 3 +++
 net/ethtool/ioctl.c                                  | 8 ++++++++
 4 files changed, 13 insertions(+)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 3cb8d3bf9044..8b9ee8bac674 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -2616,6 +2616,7 @@ static void mlx5e_get_ts_stats(struct net_device *netdev,
 const struct ethtool_ops mlx5e_ethtool_ops = {
 	.cap_link_lanes_supported = true,
 	.cap_rss_ctx_supported	= true,
+	.rxfh_per_ctx_fields	= true,
 	.rxfh_per_ctx_key	= true,
 	.supported_coalesce_params = ETHTOOL_COALESCE_USECS |
 				     ETHTOOL_COALESCE_MAX_FRAMES |
diff --git a/drivers/net/ethernet/sfc/ethtool.c b/drivers/net/ethernet/sfc/ethtool.c
index 83d715544f7f..afbedca63b29 100644
--- a/drivers/net/ethernet/sfc/ethtool.c
+++ b/drivers/net/ethernet/sfc/ethtool.c
@@ -262,6 +262,7 @@ const struct ethtool_ops efx_ethtool_ops = {
 	.set_rxnfc		= efx_ethtool_set_rxnfc,
 	.get_rxfh_indir_size	= efx_ethtool_get_rxfh_indir_size,
 	.get_rxfh_key_size	= efx_ethtool_get_rxfh_key_size,
+	.rxfh_per_ctx_fields	= true,
 	.rxfh_per_ctx_key	= true,
 	.cap_rss_rxnfc_adds	= true,
 	.rxfh_priv_size		= sizeof(struct efx_rss_context_priv),
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 90da1aee6e56..1a6737721d7f 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -855,6 +855,8 @@ struct kernel_ethtool_ts_info {
  * @cap_rss_ctx_supported: indicates if the driver supports RSS
  *	contexts via legacy API, drivers implementing @create_rxfh_context
  *	do not have to set this bit.
+ * @rxfh_per_ctx_fields: device supports selecting different header fields
+ *	for Rx hash calculation and RSS for each additional context.
  * @rxfh_per_ctx_key: device supports setting different RSS key for each
  *	additional context. Netlink API should report hfunc, key, and input_xfrm
  *	for every context, not just context 0.
@@ -1084,6 +1086,7 @@ struct ethtool_ops {
 	u32     supported_input_xfrm:8;
 	u32     cap_link_lanes_supported:1;
 	u32     cap_rss_ctx_supported:1;
+	u32	rxfh_per_ctx_fields:1;
 	u32	rxfh_per_ctx_key:1;
 	u32	cap_rss_rxnfc_adds:1;
 	u32	rxfh_indir_space;
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index 33892099cdad..1a1705e900b3 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -1075,6 +1075,10 @@ ethtool_set_rxfh_fields(struct net_device *dev, u32 cmd, void __user *useraddr)
 	if (rc)
 		return rc;
 
+	if (info.flow_type & FLOW_RSS && info.rss_context &&
+	    !ops->rxfh_per_ctx_fields)
+		return -EINVAL;
+
 	if (ops->get_rxfh) {
 		struct ethtool_rxfh_param rxfh = {};
 
@@ -1105,6 +1109,10 @@ ethtool_get_rxfh_fields(struct net_device *dev, u32 cmd, void __user *useraddr)
 	if (ret)
 		return ret;
 
+	if (info.flow_type & FLOW_RSS && info.rss_context &&
+	    !ops->rxfh_per_ctx_fields)
+		return -EINVAL;
+
 	ret = ops->get_rxnfc(dev, &info, NULL);
 	if (ret < 0)
 		return ret;
-- 
cgit v1.2.3


From 9bb00786fc61e865e121aa20dd12aa4d1311a990 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 11 Jun 2025 07:59:44 -0700
Subject: net: ethtool: add dedicated callbacks for getting and setting rxfh
 fields

We mux multiple calls to the drivers via the .get_rxnfc and .set_rxnfc
callbacks. This is slightly inconvenient for the drivers as they
have to de-mux them back. It will also be awkward for netlink code
to construct struct ethtool_rxnfc when it wants to get info about
RX Flow Hash from the RSS module.

Add dedicated driver callbacks. Create struct ethtool_rxfh_fields
which contains only data relevant to RXFH. Maintain the names of
the fields to avoid having to heavily modify the drivers.

For now support both sets of callbacks; once all drivers are converted,
ethtool_*et_rxfh_fields() will stop using the rxnfc callbacks.

Link: https://patch.msgid.link/20250611145949.2674086-5-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/ethtool.h | 20 ++++++++++++++++++
 net/ethtool/ioctl.c     | 55 ++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 65 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 1a6737721d7f..59877fd2a1d3 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -825,6 +825,19 @@ struct ethtool_rxfh_param {
 	u8	input_xfrm;
 };
 
+/**
+ * struct ethtool_rxfh_fields - Rx Flow Hashing (RXFH) header field config
+ * @data: which header fields are used for hashing, bitmask of RXH_* defines
+ * @flow_type: L2-L4 network traffic flow type
+ * @rss_context: RSS context, will only be used if rxfh_per_ctx_fields is
+ *	set in struct ethtool_ops
+ */
+struct ethtool_rxfh_fields {
+	u32 data;
+	u32 flow_type;
+	u32 rss_context;
+};
+
 /**
  * struct kernel_ethtool_ts_info - kernel copy of struct ethtool_ts_info
  * @cmd: command number = %ETHTOOL_GET_TS_INFO
@@ -970,6 +983,8 @@ struct kernel_ethtool_ts_info {
  *	will remain unchanged.
  *	Returns a negative error code or zero. An error code must be returned
  *	if at least one unsupported change was requested.
+ * @get_rxfh_fields: Get header fields used for flow hashing.
+ * @set_rxfh_fields: Set header fields used for flow hashing.
  * @create_rxfh_context: Create a new RSS context with the specified RX flow
  *	hash indirection table, hash key, and hash function.
  *	The &struct ethtool_rxfh_context for this context is passed in @ctx;
@@ -1156,6 +1171,11 @@ struct ethtool_ops {
 	int	(*get_rxfh)(struct net_device *, struct ethtool_rxfh_param *);
 	int	(*set_rxfh)(struct net_device *, struct ethtool_rxfh_param *,
 			    struct netlink_ext_ack *extack);
+	int	(*get_rxfh_fields)(struct net_device *,
+				   struct ethtool_rxfh_fields *);
+	int	(*set_rxfh_fields)(struct net_device *,
+				   const struct ethtool_rxfh_fields *,
+				   struct netlink_ext_ack *extack);
 	int	(*create_rxfh_context)(struct net_device *,
 				       struct ethtool_rxfh_context *ctx,
 				       const struct ethtool_rxfh_param *rxfh,
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index 1a1705e900b3..a14cf901c32d 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -1048,9 +1048,20 @@ static int ethtool_check_flow_types(struct net_device *dev, u32 input_xfrm)
 			continue;
 
 		info.flow_type = i;
-		err = ops->get_rxnfc(dev, &info, NULL);
-		if (err)
-			continue;
+
+		if (ops->get_rxfh_fields) {
+			struct ethtool_rxfh_fields fields = {
+				.flow_type	= info.flow_type,
+			};
+
+			if (ops->get_rxfh_fields(dev, &fields))
+				continue;
+
+			info.data = fields.data;
+		} else {
+			if (ops->get_rxnfc(dev, &info, NULL))
+				continue;
+		}
 
 		err = ethtool_check_xfrm_rxfh(input_xfrm, info.data);
 		if (err)
@@ -1064,11 +1075,12 @@ static noinline_for_stack int
 ethtool_set_rxfh_fields(struct net_device *dev, u32 cmd, void __user *useraddr)
 {
 	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct ethtool_rxfh_fields fields = {};
 	struct ethtool_rxnfc info;
 	size_t info_size = sizeof(info);
 	int rc;
 
-	if (!ops->set_rxnfc)
+	if (!ops->set_rxnfc && !ops->set_rxfh_fields)
 		return -EOPNOTSUPP;
 
 	rc = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr);
@@ -1091,7 +1103,15 @@ ethtool_set_rxfh_fields(struct net_device *dev, u32 cmd, void __user *useraddr)
 			return rc;
 	}
 
-	return ops->set_rxnfc(dev, &info);
+	if (!ops->set_rxfh_fields)
+		return ops->set_rxnfc(dev, &info);
+
+	fields.data = info.data;
+	fields.flow_type = info.flow_type & ~FLOW_RSS;
+	if (info.flow_type & FLOW_RSS)
+		fields.rss_context = info.rss_context;
+
+	return ops->set_rxfh_fields(dev, &fields, NULL);
 }
 
 static noinline_for_stack int
@@ -1102,7 +1122,7 @@ ethtool_get_rxfh_fields(struct net_device *dev, u32 cmd, void __user *useraddr)
 	const struct ethtool_ops *ops = dev->ethtool_ops;
 	int ret;
 
-	if (!ops->get_rxnfc)
+	if (!ops->get_rxnfc && !ops->get_rxfh_fields)
 		return -EOPNOTSUPP;
 
 	ret = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr);
@@ -1113,9 +1133,24 @@ ethtool_get_rxfh_fields(struct net_device *dev, u32 cmd, void __user *useraddr)
 	    !ops->rxfh_per_ctx_fields)
 		return -EINVAL;
 
-	ret = ops->get_rxnfc(dev, &info, NULL);
-	if (ret < 0)
-		return ret;
+	if (ops->get_rxfh_fields) {
+		struct ethtool_rxfh_fields fields = {
+			.flow_type	= info.flow_type & ~FLOW_RSS,
+		};
+
+		if (info.flow_type & FLOW_RSS)
+			fields.rss_context = info.rss_context;
+
+		ret = ops->get_rxfh_fields(dev, &fields);
+		if (ret < 0)
+			return ret;
+
+		info.data = fields.data;
+	} else {
+		ret = ops->get_rxnfc(dev, &info, NULL);
+		if (ret < 0)
+			return ret;
+	}
 
 	return ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, NULL);
 }
@@ -1493,7 +1528,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
 	u8 *rss_config;
 	int ret;
 
-	if (!ops->get_rxnfc || !ops->set_rxfh)
+	if ((!ops->get_rxnfc && !ops->get_rxfh_fields) || !ops->set_rxfh)
 		return -EOPNOTSUPP;
 
 	if (ops->get_rxfh_indir_size)
-- 
cgit v1.2.3


From b1b36680107ede3a4ec7fa41d052971606d6b325 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Tue, 10 Jun 2025 08:03:43 +0200
Subject: net: phy: assign default match function for non-PHY MDIO devices

Make mdio_device_bus_match() the default match function for non-PHY
MDIO devices. Benefit is that we don't have to export this function
any longer. As long as mdiodev->modalias isn't set, there's no change
in behavior. mdiobus_create_device() is the only place where
mdiodev->modalias gets set, but this function sets
mdio_device_bus_match() as match function anyway.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://patch.msgid.link/6c94e3d3-bfb0-4ddc-a518-6fddbc64e1d0@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/mdio_bus_provider.c | 1 -
 drivers/net/phy/mdio_device.c       | 5 +++--
 include/linux/mdio.h                | 1 -
 3 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/mdio_bus_provider.c b/drivers/net/phy/mdio_bus_provider.c
index 65850e36284d..48dc4bf85125 100644
--- a/drivers/net/phy/mdio_bus_provider.c
+++ b/drivers/net/phy/mdio_bus_provider.c
@@ -152,7 +152,6 @@ static int mdiobus_create_device(struct mii_bus *bus,
 
 	strscpy(mdiodev->modalias, bi->modalias,
 		sizeof(mdiodev->modalias));
-	mdiodev->bus_match = mdio_device_bus_match;
 	mdiodev->dev.platform_data = (void *)bi->platform_data;
 
 	ret = mdio_device_register(mdiodev);
diff --git a/drivers/net/phy/mdio_device.c b/drivers/net/phy/mdio_device.c
index cce3f405d1a4..f64176e0e197 100644
--- a/drivers/net/phy/mdio_device.c
+++ b/drivers/net/phy/mdio_device.c
@@ -35,7 +35,8 @@ static void mdio_device_release(struct device *dev)
 	kfree(to_mdio_device(dev));
 }
 
-int mdio_device_bus_match(struct device *dev, const struct device_driver *drv)
+static int mdio_device_bus_match(struct device *dev,
+				 const struct device_driver *drv)
 {
 	struct mdio_device *mdiodev = to_mdio_device(dev);
 	const struct mdio_driver *mdiodrv = to_mdio_driver(drv);
@@ -45,7 +46,6 @@ int mdio_device_bus_match(struct device *dev, const struct device_driver *drv)
 
 	return strcmp(mdiodev->modalias, drv->name) == 0;
 }
-EXPORT_SYMBOL_GPL(mdio_device_bus_match);
 
 struct mdio_device *mdio_device_create(struct mii_bus *bus, int addr)
 {
@@ -59,6 +59,7 @@ struct mdio_device *mdio_device_create(struct mii_bus *bus, int addr)
 	mdiodev->dev.release = mdio_device_release;
 	mdiodev->dev.parent = &bus->dev;
 	mdiodev->dev.bus = &mdio_bus_type;
+	mdiodev->bus_match = mdio_device_bus_match;
 	mdiodev->device_free = mdio_device_free;
 	mdiodev->device_remove = mdio_device_remove;
 	mdiodev->bus = bus;
diff --git a/include/linux/mdio.h b/include/linux/mdio.h
index e43ff9f980a4..c640ba44dd6e 100644
--- a/include/linux/mdio.h
+++ b/include/linux/mdio.h
@@ -95,7 +95,6 @@ void mdio_device_remove(struct mdio_device *mdiodev);
 void mdio_device_reset(struct mdio_device *mdiodev, int value);
 int mdio_driver_register(struct mdio_driver *drv);
 void mdio_driver_unregister(struct mdio_driver *drv);
-int mdio_device_bus_match(struct device *dev, const struct device_driver *drv);
 
 static inline void mdio_device_get(struct mdio_device *mdiodev)
 {
-- 
cgit v1.2.3


From 00ee2537255e25a14360288dbd94ff62c0db497d Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Tue, 10 Jun 2025 23:34:53 +0200
Subject: net: phy: move definition of genphy_c45_driver to phy_device.c

genphy_c45_read_status() is exported, so we can move definition of
genphy_c45_driver to phy_device.c and make it static. This helps
to clean up phy.h a little.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/ead3ab17-22d0-4cd3-901c-3d493ab851e6@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/phy-c45.c    | 7 -------
 drivers/net/phy/phy_device.c | 7 +++++++
 include/linux/phy.h          | 3 ---
 3 files changed, 7 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c
index bdd70d424491..61670be0f095 100644
--- a/drivers/net/phy/phy-c45.c
+++ b/drivers/net/phy/phy-c45.c
@@ -1573,10 +1573,3 @@ int genphy_c45_ethtool_set_eee(struct phy_device *phydev,
 	return ret;
 }
 EXPORT_SYMBOL(genphy_c45_ethtool_set_eee);
-
-struct phy_driver genphy_c45_driver = {
-	.phy_id         = 0xffffffff,
-	.phy_id_mask    = 0xffffffff,
-	.name           = "Generic Clause 45 PHY",
-	.read_status    = genphy_c45_read_status,
-};
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 73f9cb2e2844..2902193e12f2 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -59,6 +59,13 @@ struct phy_fixup {
 	int (*run)(struct phy_device *phydev);
 };
 
+static struct phy_driver genphy_c45_driver = {
+	.phy_id         = 0xffffffff,
+	.phy_id_mask    = 0xffffffff,
+	.name           = "Generic Clause 45 PHY",
+	.read_status    = genphy_c45_read_status,
+};
+
 __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_basic_features) __ro_after_init;
 EXPORT_SYMBOL_GPL(phy_basic_features);
 
diff --git a/include/linux/phy.h b/include/linux/phy.h
index e194dad1623d..c021b351ab0d 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1941,9 +1941,6 @@ int genphy_c45_ethtool_set_eee(struct phy_device *phydev,
 			       struct ethtool_keee *data);
 int genphy_c45_an_config_eee_aneg(struct phy_device *phydev);
 
-/* Generic C45 PHY driver */
-extern struct phy_driver genphy_c45_driver;
-
 /* The gen10g_* functions are the old Clause 45 stub */
 int gen10g_config_aneg(struct phy_device *phydev);
 
-- 
cgit v1.2.3


From c4688ff47fd719e2371b984d59759f9fa09dd6a2 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Wed, 11 Jun 2025 14:56:19 +0100
Subject: net: phy: simplify phy_get_internal_delay()

Simplify the arguments passed to phy_get_internal_delay() - the "dev"
argument is always &phydev->mdio.dev, and as the phydev is passed in,
there's no need to also pass in the struct device, especially when this
function is the only reason for the caller to have a local "dev"
variable.

Remove the redundant "dev" argument, and update the callers.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Link: https://patch.msgid.link/E1uPLwB-003VzR-4C@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/dp83822.c        | 7 ++-----
 drivers/net/phy/dp83869.c        | 7 +++----
 drivers/net/phy/intel-xway.c     | 7 ++-----
 drivers/net/phy/mscc/mscc_main.c | 5 ++---
 drivers/net/phy/phy_device.c     | 6 +++---
 include/linux/phy.h              | 4 ++--
 6 files changed, 14 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/dp83822.c b/drivers/net/phy/dp83822.c
index 01255dada600..33db21251f2e 100644
--- a/drivers/net/phy/dp83822.c
+++ b/drivers/net/phy/dp83822.c
@@ -516,7 +516,6 @@ static int dp83822_config_init_leds(struct phy_device *phydev)
 static int dp83822_config_init(struct phy_device *phydev)
 {
 	struct dp83822_private *dp83822 = phydev->priv;
-	struct device *dev = &phydev->mdio.dev;
 	int rgmii_delay = 0;
 	s32 rx_int_delay;
 	s32 tx_int_delay;
@@ -549,15 +548,13 @@ static int dp83822_config_init(struct phy_device *phydev)
 		return err;
 
 	if (phy_interface_is_rgmii(phydev)) {
-		rx_int_delay = phy_get_internal_delay(phydev, dev, NULL, 0,
-						      true);
+		rx_int_delay = phy_get_internal_delay(phydev, NULL, 0, true);
 
 		/* Set DP83822_RX_CLK_SHIFT to enable rx clk internal delay */
 		if (rx_int_delay > 0)
 			rgmii_delay |= DP83822_RX_CLK_SHIFT;
 
-		tx_int_delay = phy_get_internal_delay(phydev, dev, NULL, 0,
-						      false);
+		tx_int_delay = phy_get_internal_delay(phydev, NULL, 0, false);
 
 		/* Set DP83822_TX_CLK_SHIFT to disable tx clk internal delay */
 		if (tx_int_delay <= 0)
diff --git a/drivers/net/phy/dp83869.c b/drivers/net/phy/dp83869.c
index a62cd838a9ea..a2cd1cc35cde 100644
--- a/drivers/net/phy/dp83869.c
+++ b/drivers/net/phy/dp83869.c
@@ -540,9 +540,8 @@ static const int dp83869_internal_delay[] = {250, 500, 750, 1000, 1250, 1500,
 
 static int dp83869_of_init(struct phy_device *phydev)
 {
+	struct device_node *of_node = phydev->mdio.dev.of_node;
 	struct dp83869_private *dp83869 = phydev->priv;
-	struct device *dev = &phydev->mdio.dev;
-	struct device_node *of_node = dev->of_node;
 	int delay_size = ARRAY_SIZE(dp83869_internal_delay);
 	int ret;
 
@@ -597,13 +596,13 @@ static int dp83869_of_init(struct phy_device *phydev)
 				 &dp83869->tx_fifo_depth))
 		dp83869->tx_fifo_depth = DP83869_PHYCR_FIFO_DEPTH_4_B_NIB;
 
-	dp83869->rx_int_delay = phy_get_internal_delay(phydev, dev,
+	dp83869->rx_int_delay = phy_get_internal_delay(phydev,
 						       &dp83869_internal_delay[0],
 						       delay_size, true);
 	if (dp83869->rx_int_delay < 0)
 		dp83869->rx_int_delay = DP83869_CLK_DELAY_DEF;
 
-	dp83869->tx_int_delay = phy_get_internal_delay(phydev, dev,
+	dp83869->tx_int_delay = phy_get_internal_delay(phydev,
 						       &dp83869_internal_delay[0],
 						       delay_size, false);
 	if (dp83869->tx_int_delay < 0)
diff --git a/drivers/net/phy/intel-xway.c b/drivers/net/phy/intel-xway.c
index a44771e8acdc..9766dd99afaa 100644
--- a/drivers/net/phy/intel-xway.c
+++ b/drivers/net/phy/intel-xway.c
@@ -174,7 +174,6 @@ static const int xway_internal_delay[] = {0, 500, 1000, 1500, 2000, 2500,
 
 static int xway_gphy_rgmii_init(struct phy_device *phydev)
 {
-	struct device *dev = &phydev->mdio.dev;
 	unsigned int delay_size = ARRAY_SIZE(xway_internal_delay);
 	s32 int_delay;
 	int val = 0;
@@ -207,8 +206,7 @@ static int xway_gphy_rgmii_init(struct phy_device *phydev)
 
 	if (phydev->interface == PHY_INTERFACE_MODE_RGMII_ID ||
 	    phydev->interface == PHY_INTERFACE_MODE_RGMII_RXID) {
-		int_delay = phy_get_internal_delay(phydev, dev,
-						   xway_internal_delay,
+		int_delay = phy_get_internal_delay(phydev, xway_internal_delay,
 						   delay_size, true);
 
 		/* if rx-internal-delay-ps is missing, use default of 2.0 ns */
@@ -220,8 +218,7 @@ static int xway_gphy_rgmii_init(struct phy_device *phydev)
 
 	if (phydev->interface == PHY_INTERFACE_MODE_RGMII_ID ||
 	    phydev->interface == PHY_INTERFACE_MODE_RGMII_TXID) {
-		int_delay = phy_get_internal_delay(phydev, dev,
-						   xway_internal_delay,
+		int_delay = phy_get_internal_delay(phydev, xway_internal_delay,
 						   delay_size, false);
 
 		/* if tx-internal-delay-ps is missing, use default of 2.0 ns */
diff --git a/drivers/net/phy/mscc/mscc_main.c b/drivers/net/phy/mscc/mscc_main.c
index 7ff975efd8e7..7ed6522fb0ef 100644
--- a/drivers/net/phy/mscc/mscc_main.c
+++ b/drivers/net/phy/mscc/mscc_main.c
@@ -530,7 +530,6 @@ static int vsc85xx_update_rgmii_cntl(struct phy_device *phydev, u32 rgmii_cntl,
 	u16 rgmii_rx_delay_pos = ffs(rgmii_rx_delay_mask) - 1;
 	u16 rgmii_tx_delay_pos = ffs(rgmii_tx_delay_mask) - 1;
 	int delay_size = ARRAY_SIZE(vsc85xx_internal_delay);
-	struct device *dev = &phydev->mdio.dev;
 	u16 reg_val = 0;
 	u16 mask = 0;
 	s32 rx_delay;
@@ -549,7 +548,7 @@ static int vsc85xx_update_rgmii_cntl(struct phy_device *phydev, u32 rgmii_cntl,
 	if (phy_interface_is_rgmii(phydev))
 		mask |= rgmii_rx_delay_mask | rgmii_tx_delay_mask;
 
-	rx_delay = phy_get_internal_delay(phydev, dev, vsc85xx_internal_delay,
+	rx_delay = phy_get_internal_delay(phydev, vsc85xx_internal_delay,
 					  delay_size, true);
 	if (rx_delay < 0) {
 		if (phydev->interface == PHY_INTERFACE_MODE_RGMII_RXID ||
@@ -559,7 +558,7 @@ static int vsc85xx_update_rgmii_cntl(struct phy_device *phydev, u32 rgmii_cntl,
 			rx_delay = RGMII_CLK_DELAY_0_2_NS;
 	}
 
-	tx_delay = phy_get_internal_delay(phydev, dev, vsc85xx_internal_delay,
+	tx_delay = phy_get_internal_delay(phydev, vsc85xx_internal_delay,
 					  delay_size, false);
 	if (tx_delay < 0) {
 		if (phydev->interface == PHY_INTERFACE_MODE_RGMII_TXID ||
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 2902193e12f2..509078344020 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -2906,7 +2906,6 @@ static int phy_get_u32_property(struct device *dev, const char *name, u32 *val)
 /**
  * phy_get_internal_delay - returns the index of the internal delay
  * @phydev: phy_device struct
- * @dev: pointer to the devices device struct
  * @delay_values: array of delays the PHY supports
  * @size: the size of the delay array
  * @is_rx: boolean to indicate to get the rx internal delay
@@ -2919,9 +2918,10 @@ static int phy_get_u32_property(struct device *dev, const char *name, u32 *val)
  * array then size = 0 and the value of the delay property is returned.
  * Return -EINVAL if the delay is invalid or cannot be found.
  */
-s32 phy_get_internal_delay(struct phy_device *phydev, struct device *dev,
-			   const int *delay_values, int size, bool is_rx)
+s32 phy_get_internal_delay(struct phy_device *phydev, const int *delay_values,
+			   int size, bool is_rx)
 {
+	struct device *dev = &phydev->mdio.dev;
 	int i, ret;
 	u32 delay;
 
diff --git a/include/linux/phy.h b/include/linux/phy.h
index c021b351ab0d..c4d8f7c82627 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1994,8 +1994,8 @@ bool phy_validate_pause(struct phy_device *phydev,
 			struct ethtool_pauseparam *pp);
 void phy_get_pause(struct phy_device *phydev, bool *tx_pause, bool *rx_pause);
 
-s32 phy_get_internal_delay(struct phy_device *phydev, struct device *dev,
-			   const int *delay_values, int size, bool is_rx);
+s32 phy_get_internal_delay(struct phy_device *phydev, const int *delay_values,
+			   int size, bool is_rx);
 
 int phy_get_tx_amplitude_gain(struct phy_device *phydev, struct device *dev,
 			      enum ethtool_link_mode_bit_indices linkmode,
-- 
cgit v1.2.3


From a9a5f41b04dd137a353d4d1d6fc7d6e80aaad193 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 11 Jun 2025 15:56:15 -0400
Subject: xdp: Remove unused events xdp_redirect_map and xdp_redirect_map_err
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Each TRACE_EVENT() defined can take up around 5K of text and meta data
regardless if they are used or not. New code is being developed that will
warn when a tracepoint is defined but not used.

The trace events xdp_redirect_map and xdp_redirect_map_err are defined but
not used, but there's also a comment that states these are kept around for
backward compatibility. That is interesting: because they are not used,
any old BPF program that expects them to exist will get incorrect data
(no data) when it uses them. That is worse than not working; it is
silently failing.

Remove them as they will soon cause warnings, or if they really need to
stick around, then code needs to be added to use them.

Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Reviewed-by: Toke Høiland-Jørgensen <toke@kernel.org>
Acked-by: Jesper Dangaard Brouer <hawk@kernel.org>
Link: https://lore.kernel.org/r/20250611155615.0c2cf61c@batman.local.home
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/trace/events/xdp.h | 19 -------------------
 1 file changed, 19 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index d3ef86c97ae3..0fe0893c2567 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -168,25 +168,6 @@ DEFINE_EVENT(xdp_redirect_template, xdp_redirect_err,
 #define _trace_xdp_redirect_map_err(dev, xdp, to, map_type, map_id, index, err) \
 	 trace_xdp_redirect_err(dev, xdp, to, err, map_type, map_id, index)
 
-/* not used anymore, but kept around so as not to break old programs */
-DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map,
-	TP_PROTO(const struct net_device *dev,
-		 const struct bpf_prog *xdp,
-		 const void *tgt, int err,
-		 enum bpf_map_type map_type,
-		 u32 map_id, u32 index),
-	TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index)
-);
-
-DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map_err,
-	TP_PROTO(const struct net_device *dev,
-		 const struct bpf_prog *xdp,
-		 const void *tgt, int err,
-		 enum bpf_map_type map_type,
-		 u32 map_id, u32 index),
-	TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index)
-);
-
 TRACE_EVENT(xdp_cpumap_kthread,
 
 	TP_PROTO(int map_id, unsigned int processed,  unsigned int drops,
-- 
cgit v1.2.3


From 16f3c7ad887c1f8fd698ab568b5851cadb65b5a8 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 12 Jun 2025 18:20:23 -0400
Subject: xdp: tracing: Hide some xdp events under CONFIG_BPF_SYSCALL

The events xdp_cpumap_kthread, xdp_cpumap_enqueue and xdp_devmap_xmit are
only called when CONFIG_BPF_SYSCALL is defined.  As each event can take up
to 5K regardless of whether it is used, it's best not to define them
when they are not used. Add #ifdef CONFIG_BPF_SYSCALL around these events
so that they are only defined when they can be used.

Acked-by: Jesper Dangaard Brouer <hawk@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://lore.kernel.org/r/20250612182023.78397b76@batman.local.home
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/trace/events/xdp.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index 0fe0893c2567..18c0ac514fcb 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -168,6 +168,7 @@ DEFINE_EVENT(xdp_redirect_template, xdp_redirect_err,
 #define _trace_xdp_redirect_map_err(dev, xdp, to, map_type, map_id, index, err) \
 	 trace_xdp_redirect_err(dev, xdp, to, err, map_type, map_id, index)
 
+#ifdef CONFIG_BPF_SYSCALL
 TRACE_EVENT(xdp_cpumap_kthread,
 
 	TP_PROTO(int map_id, unsigned int processed,  unsigned int drops,
@@ -281,6 +282,7 @@ TRACE_EVENT(xdp_devmap_xmit,
 		  __entry->sent, __entry->drops,
 		  __entry->err)
 );
+#endif /* CONFIG_BPF_SYSCALL */
 
 /* Expect users already include <net/xdp.h>, but not xdp_priv.h */
 #include <net/xdp_priv.h>
-- 
cgit v1.2.3


From 5202c25dd17c54cd4c21f266d9a51b644d7cd682 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 28 May 2025 10:09:00 +0200
Subject: sched/smp: Always define sched_domains_mutex_lock()/unlock(),
 def_root_domain and sched_domains_mutex

Simplify the scheduler by making CONFIG_SMP=y primitives and data
structures unconditional.

Unconditionally build kernel/sched/topology.c and the main sched-domains
locking primitives.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Shrikanth Hegde <sshegde@linux.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lore.kernel.org/r/20250528080924.2273858-20-mingo@kernel.org
---
 include/linux/sched.h        | 5 -----
 kernel/sched/build_utility.c | 3 ++-
 kernel/sched/topology.c      | 4 ++++
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4f78a64beb52..aa54d75034ea 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -395,15 +395,10 @@ enum uclamp_id {
 	UCLAMP_CNT
 };
 
-#ifdef CONFIG_SMP
 extern struct root_domain def_root_domain;
 extern struct mutex sched_domains_mutex;
 extern void sched_domains_mutex_lock(void);
 extern void sched_domains_mutex_unlock(void);
-#else
-static inline void sched_domains_mutex_lock(void) { }
-static inline void sched_domains_mutex_unlock(void) { }
-#endif
 
 struct sched_param {
 	int sched_priority;
diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c
index bf9d8db94b70..5c485b2dfb95 100644
--- a/kernel/sched/build_utility.c
+++ b/kernel/sched/build_utility.c
@@ -83,9 +83,10 @@
 #ifdef CONFIG_SMP
 # include "cpupri.c"
 # include "stop_task.c"
-# include "topology.c"
 #endif
 
+#include "topology.c"
+
 #ifdef CONFIG_SCHED_CORE
 # include "core_sched.c"
 #endif
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 2352caf4f942..ee347d9c5df4 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -17,6 +17,8 @@ void sched_domains_mutex_unlock(void)
 	mutex_unlock(&sched_domains_mutex);
 }
 
+#ifdef CONFIG_SMP
+
 /* Protected by sched_domains_mutex: */
 static cpumask_var_t sched_domains_tmpmask;
 static cpumask_var_t sched_domains_tmpmask2;
@@ -2842,3 +2844,5 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 	partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
 	sched_domains_mutex_unlock();
 }
+
+#endif /* CONFIG_SMP */
-- 
cgit v1.2.3


From cac5cefbade90ff0bb0b393d301fa3b5234cf056 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 28 May 2025 10:09:01 +0200
Subject: sched/smp: Make SMP unconditional

Simplify the scheduler by making CONFIG_SMP=y primitives and data
structures unconditional.

Introduce transitory wrappers for functionality not yet converted to SMP.

Note that this patch is pretty large because there's no clear separation
between various aspects of the SMP scheduler: it's basically a huge block
of #ifdef CONFIG_SMP. A fair amount of it has to be switched on for it to
boot and work on UP systems.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Shrikanth Hegde <sshegde@linux.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lore.kernel.org/r/20250528080924.2273858-21-mingo@kernel.org
---
 include/linux/preempt.h        |   9 ---
 include/linux/sched.h          |  42 --------------
 include/linux/sched/deadline.h |   4 --
 include/linux/sched/idle.h     |   4 --
 include/linux/sched/nohz.h     |   4 +-
 include/linux/sched/topology.h |  32 -----------
 kernel/sched/build_policy.c    |   6 +-
 kernel/sched/build_utility.c   |   6 +-
 kernel/sched/core.c            | 106 +++++-------------------------------
 kernel/sched/cpudeadline.h     |   2 -
 kernel/sched/cpupri.h          |   2 -
 kernel/sched/deadline.c        |  95 --------------------------------
 kernel/sched/debug.c           |  12 ----
 kernel/sched/fair.c            | 115 ---------------------------------------
 kernel/sched/pelt.h            |  52 ------------------
 kernel/sched/rt.c              |   6 +-
 kernel/sched/sched.h           | 121 +----------------------------------------
 kernel/sched/syscalls.c        |   2 -
 kernel/sched/topology.c        |  10 +---
 19 files changed, 31 insertions(+), 599 deletions(-)

(limited to 'include')

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index b0af8d4ef6e6..1fad1c8a4c76 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -369,8 +369,6 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier,
 
 #endif
 
-#ifdef CONFIG_SMP
-
 /*
  * Migrate-Disable and why it is undesired.
  *
@@ -429,13 +427,6 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier,
 extern void migrate_disable(void);
 extern void migrate_enable(void);
 
-#else
-
-static inline void migrate_disable(void) { }
-static inline void migrate_enable(void) { }
-
-#endif /* CONFIG_SMP */
-
 /**
  * preempt_disable_nested - Disable preemption inside a normally preempt disabled section
  *
diff --git a/include/linux/sched.h b/include/linux/sched.h
index aa54d75034ea..376befdec4b0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -599,7 +599,6 @@ struct sched_entity {
 	unsigned long			runnable_weight;
 #endif
 
-#ifdef CONFIG_SMP
 	/*
 	 * Per entity load average tracking.
 	 *
@@ -607,7 +606,6 @@ struct sched_entity {
 	 * collide with read-mostly values above.
 	 */
 	struct sched_avg		avg;
-#endif
 };
 
 struct sched_rt_entity {
@@ -837,7 +835,6 @@ struct task_struct {
 	struct alloc_tag		*alloc_tag;
 #endif
 
-#ifdef CONFIG_SMP
 	int				on_cpu;
 	struct __call_single_node	wake_entry;
 	unsigned int			wakee_flips;
@@ -853,7 +850,6 @@ struct task_struct {
 	 */
 	int				recent_used_cpu;
 	int				wake_cpu;
-#endif
 	int				on_rq;
 
 	int				prio;
@@ -912,9 +908,7 @@ struct task_struct {
 	cpumask_t			*user_cpus_ptr;
 	cpumask_t			cpus_mask;
 	void				*migration_pending;
-#ifdef CONFIG_SMP
 	unsigned short			migration_disabled;
-#endif
 	unsigned short			migration_flags;
 
 #ifdef CONFIG_PREEMPT_RCU
@@ -946,10 +940,8 @@ struct task_struct {
 	struct sched_info		sched_info;
 
 	struct list_head		tasks;
-#ifdef CONFIG_SMP
 	struct plist_node		pushable_tasks;
 	struct rb_node			pushable_dl_tasks;
-#endif
 
 	struct mm_struct		*mm;
 	struct mm_struct		*active_mm;
@@ -1843,7 +1835,6 @@ extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpu
 extern int task_can_attach(struct task_struct *p);
 extern int dl_bw_alloc(int cpu, u64 dl_bw);
 extern void dl_bw_free(int cpu, u64 dl_bw);
-#ifdef CONFIG_SMP
 
 /* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */
 extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);
@@ -1861,33 +1852,6 @@ extern void release_user_cpus_ptr(struct task_struct *p);
 extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask);
 extern void force_compatible_cpus_allowed_ptr(struct task_struct *p);
 extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p);
-#else
-static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
-{
-}
-static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
-{
-	/* Opencoded cpumask_test_cpu(0, new_mask) to avoid dependency on cpumask.h */
-	if ((*cpumask_bits(new_mask) & 1) == 0)
-		return -EINVAL;
-	return 0;
-}
-static inline int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node)
-{
-	if (src->user_cpus_ptr)
-		return -EINVAL;
-	return 0;
-}
-static inline void release_user_cpus_ptr(struct task_struct *p)
-{
-	WARN_ON(p->user_cpus_ptr);
-}
-
-static inline int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
-{
-	return 0;
-}
-#endif
 
 extern int yield_to(struct task_struct *p, bool preempt);
 extern void set_user_nice(struct task_struct *p, long nice);
@@ -1976,11 +1940,7 @@ extern int wake_up_state(struct task_struct *tsk, unsigned int state);
 extern int wake_up_process(struct task_struct *tsk);
 extern void wake_up_new_task(struct task_struct *tsk);
 
-#ifdef CONFIG_SMP
 extern void kick_process(struct task_struct *tsk);
-#else
-static inline void kick_process(struct task_struct *tsk) { }
-#endif
 
 extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec);
 #define set_task_comm(tsk, from) ({			\
@@ -2225,7 +2185,6 @@ extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
 #define TASK_SIZE_OF(tsk)	TASK_SIZE
 #endif
 
-#ifdef CONFIG_SMP
 static inline bool owner_on_cpu(struct task_struct *owner)
 {
 	/*
@@ -2237,7 +2196,6 @@ static inline bool owner_on_cpu(struct task_struct *owner)
 
 /* Returns effective CPU energy utilization, as seen by the scheduler */
 unsigned long sched_cpu_util(int cpu);
-#endif /* CONFIG_SMP */
 
 #ifdef CONFIG_SCHED_CORE
 extern void sched_core_free(struct task_struct *tsk);
diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
index f9aabbc9d22e..c40115d4e34d 100644
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -29,15 +29,11 @@ static inline bool dl_time_before(u64 a, u64 b)
 	return (s64)(a - b) < 0;
 }
 
-#ifdef CONFIG_SMP
-
 struct root_domain;
 extern void dl_add_task_root_domain(struct task_struct *p);
 extern void dl_clear_root_domain(struct root_domain *rd);
 extern void dl_clear_root_domain_cpu(int cpu);
 
-#endif /* CONFIG_SMP */
-
 extern u64 dl_cookie;
 extern bool dl_bw_visited(int cpu, u64 cookie);
 
diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h
index 439f6029d3b9..8465ff1f20d1 100644
--- a/include/linux/sched/idle.h
+++ b/include/linux/sched/idle.h
@@ -11,11 +11,7 @@ enum cpu_idle_type {
 	CPU_MAX_IDLE_TYPES
 };
 
-#ifdef CONFIG_SMP
 extern void wake_up_if_idle(int cpu);
-#else
-static inline void wake_up_if_idle(int cpu) { }
-#endif
 
 /*
  * Idle thread specific functions to determine the need_resched
diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h
index 6d67e9a5af6b..0db7f67935fe 100644
--- a/include/linux/sched/nohz.h
+++ b/include/linux/sched/nohz.h
@@ -6,7 +6,7 @@
  * This is the interface between the scheduler and nohz/dynticks:
  */
 
-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+#ifdef CONFIG_NO_HZ_COMMON
 extern void nohz_balance_enter_idle(int cpu);
 extern int get_nohz_timer_target(void);
 #else
@@ -23,7 +23,7 @@ static inline void calc_load_nohz_remote(struct rq *rq) { }
 static inline void calc_load_nohz_stop(void) { }
 #endif /* CONFIG_NO_HZ_COMMON */
 
-#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP)
+#ifdef CONFIG_NO_HZ_COMMON
 extern void wake_up_nohz_cpu(int cpu);
 #else
 static inline void wake_up_nohz_cpu(int cpu) { }
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 198bb5cc1774..e54e7fa76ba6 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -9,7 +9,6 @@
 /*
  * sched-domains (multiprocessor balancing) declarations:
  */
-#ifdef CONFIG_SMP
 
 /* Generate SD flag indexes */
 #define SD_FLAG(name, mflags) __##name,
@@ -200,37 +199,6 @@ extern void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio);
 
 # define SD_INIT_NAME(type)		.name = #type
 
-#else /* CONFIG_SMP */
-
-struct sched_domain_attr;
-
-static inline void
-partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
-			struct sched_domain_attr *dattr_new)
-{
-}
-
-static inline bool cpus_equal_capacity(int this_cpu, int that_cpu)
-{
-	return true;
-}
-
-static inline bool cpus_share_cache(int this_cpu, int that_cpu)
-{
-	return true;
-}
-
-static inline bool cpus_share_resources(int this_cpu, int that_cpu)
-{
-	return true;
-}
-
-static inline void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio)
-{
-}
-
-#endif	/* !CONFIG_SMP */
-
 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
 extern void rebuild_sched_domains_energy(void);
 #else
diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
index 72d97aa8b726..c4a488e67aa7 100644
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@@ -50,11 +50,9 @@
 #include "idle.c"
 
 #include "rt.c"
+#include "cpudeadline.c"
 
-#ifdef CONFIG_SMP
-# include "cpudeadline.c"
-# include "pelt.c"
-#endif
+#include "pelt.c"
 
 #include "cputime.c"
 #include "deadline.c"
diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c
index 5c485b2dfb95..e2cf3b08d4e9 100644
--- a/kernel/sched/build_utility.c
+++ b/kernel/sched/build_utility.c
@@ -80,10 +80,8 @@
 #include "wait_bit.c"
 #include "wait.c"
 
-#ifdef CONFIG_SMP
-# include "cpupri.c"
-# include "stop_task.c"
-#endif
+#include "cpupri.c"
+#include "stop_task.c"
 
 #include "topology.c"
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f1ef6d29792c..9fc44f4b779a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -650,7 +650,6 @@ void raw_spin_rq_unlock(struct rq *rq)
 	raw_spin_unlock(rq_lockp(rq));
 }
 
-#ifdef CONFIG_SMP
 /*
  * double_rq_lock - safely lock two runqueues
  */
@@ -667,7 +666,6 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2)
 
 	double_rq_clock_clear_update(rq1, rq2);
 }
-#endif /* CONFIG_SMP */
 
 /*
  * __task_rq_lock - lock the rq @p resides on.
@@ -949,7 +947,7 @@ static inline void hrtick_rq_init(struct rq *rq)
 	_val;								\
 })
 
-#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
+#ifdef TIF_POLLING_NRFLAG
 /*
  * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
  * this avoids any races wrt polling state changes and thereby avoids
@@ -988,13 +986,11 @@ static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif)
 	return true;
 }
 
-#ifdef CONFIG_SMP
 static inline bool set_nr_if_polling(struct task_struct *p)
 {
 	return false;
 }
 #endif
-#endif
 
 static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
 {
@@ -1167,7 +1163,6 @@ void resched_cpu(int cpu)
 	raw_spin_rq_unlock_irqrestore(rq, flags);
 }
 
-#ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ_COMMON
 /*
  * In the semi idle case, use the nearest busy CPU for migrating timers
@@ -1374,10 +1369,8 @@ bool sched_can_stop_tick(struct rq *rq)
 	return true;
 }
 #endif /* CONFIG_NO_HZ_FULL */
-#endif /* CONFIG_SMP */
 
-#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
-			(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
+#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_FAIR_GROUP_SCHED)
 /*
  * Iterate task_group tree rooted at *from, calling @down when first entering a
  * node and @up when leaving it for the final time.
@@ -2353,8 +2346,6 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
 	return ncsw;
 }
 
-#ifdef CONFIG_SMP
-
 static void
 __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx);
 
@@ -3305,6 +3296,8 @@ void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
 	WARN_ON_ONCE(ret);
 }
 
+#ifdef CONFIG_SMP
+
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 {
 	unsigned int state = READ_ONCE(p->__state);
@@ -3358,6 +3351,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
 	__set_task_cpu(p, new_cpu);
 }
+#endif /* CONFIG_SMP */
 
 #ifdef CONFIG_NUMA_BALANCING
 static void __migrate_swap_task(struct task_struct *p, int cpu)
@@ -3661,17 +3655,6 @@ void sched_set_stop_task(int cpu, struct task_struct *stop)
 	}
 }
 
-#else /* !CONFIG_SMP: */
-
-static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
-
-static inline bool rq_has_pinned_tasks(struct rq *rq)
-{
-	return false;
-}
-
-#endif /* !CONFIG_SMP */
-
 static void
 ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 {
@@ -3682,7 +3665,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 
 	rq = this_rq();
 
-#ifdef CONFIG_SMP
 	if (cpu == rq->cpu) {
 		__schedstat_inc(rq->ttwu_local);
 		__schedstat_inc(p->stats.nr_wakeups_local);
@@ -3702,7 +3684,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 
 	if (wake_flags & WF_MIGRATED)
 		__schedstat_inc(p->stats.nr_wakeups_migrate);
-#endif /* CONFIG_SMP */
 
 	__schedstat_inc(rq->ttwu_count);
 	__schedstat_inc(p->stats.nr_wakeups);
@@ -3731,13 +3712,11 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
 	if (p->sched_contributes_to_load)
 		rq->nr_uninterruptible--;
 
-#ifdef CONFIG_SMP
 	if (wake_flags & WF_RQ_SELECTED)
 		en_flags |= ENQUEUE_RQ_SELECTED;
 	if (wake_flags & WF_MIGRATED)
 		en_flags |= ENQUEUE_MIGRATED;
 	else
-#endif
 	if (p->in_iowait) {
 		delayacct_blkio_end(p);
 		atomic_dec(&task_rq(p)->nr_iowait);
@@ -3748,7 +3727,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
 
 	ttwu_do_wakeup(p);
 
-#ifdef CONFIG_SMP
 	if (p->sched_class->task_woken) {
 		/*
 		 * Our task @p is fully woken up and running; so it's safe to
@@ -3770,7 +3748,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
 
 		rq->idle_stamp = 0;
 	}
-#endif /* CONFIG_SMP */
 }
 
 /*
@@ -3824,7 +3801,6 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags)
 	return ret;
 }
 
-#ifdef CONFIG_SMP
 void sched_ttwu_pending(void *arg)
 {
 	struct llist_node *llist = arg;
@@ -3891,7 +3867,9 @@ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags
 	p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
 
 	WRITE_ONCE(rq->ttwu_pending, 1);
+#ifdef CONFIG_SMP
 	__smp_call_single_queue(cpu, &p->wake_entry.llist);
+#endif
 }
 
 void wake_up_if_idle(int cpu)
@@ -3992,15 +3970,6 @@ static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
 	return false;
 }
 
-#else /* !CONFIG_SMP: */
-
-static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
-{
-	return false;
-}
-
-#endif /* !CONFIG_SMP */
-
 static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -4533,10 +4502,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->capture_control = NULL;
 #endif
 	init_numa_balancing(clone_flags, p);
-#ifdef CONFIG_SMP
 	p->wake_entry.u_flags = CSD_TYPE_TTWU;
 	p->migration_pending = NULL;
-#endif
 	init_sched_mm_cid(p);
 }
 
@@ -4787,14 +4754,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
 	if (likely(sched_info_on()))
 		memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
-#ifdef CONFIG_SMP
 	p->on_cpu = 0;
-#endif
 	init_task_preempt_count(p);
-#ifdef CONFIG_SMP
 	plist_node_init(&p->pushable_tasks, MAX_PRIO);
 	RB_CLEAR_NODE(&p->pushable_dl_tasks);
-#endif
+
 	return 0;
 }
 
@@ -4871,7 +4835,6 @@ void wake_up_new_task(struct task_struct *p)
 
 	raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
 	WRITE_ONCE(p->__state, TASK_RUNNING);
-#ifdef CONFIG_SMP
 	/*
 	 * Fork balancing, do it here and not earlier because:
 	 *  - cpus_ptr can change in the fork path
@@ -4883,7 +4846,6 @@ void wake_up_new_task(struct task_struct *p)
 	p->recent_used_cpu = task_cpu(p);
 	rseq_migrate(p);
 	__set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags));
-#endif
 	rq = __task_rq_lock(p, &rf);
 	update_rq_clock(rq);
 	post_init_entity_util_avg(p);
@@ -4994,7 +4956,6 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
 
 static inline void prepare_task(struct task_struct *next)
 {
-#ifdef CONFIG_SMP
 	/*
 	 * Claim the task as running, we do this before switching to it
 	 * such that any running task will have this set.
@@ -5003,12 +4964,10 @@ static inline void prepare_task(struct task_struct *next)
 	 * its ordering comment.
 	 */
 	WRITE_ONCE(next->on_cpu, 1);
-#endif
 }
 
 static inline void finish_task(struct task_struct *prev)
 {
-#ifdef CONFIG_SMP
 	/*
 	 * This must be the very last reference to @prev from this CPU. After
 	 * p->on_cpu is cleared, the task can be moved to a different CPU. We
@@ -5021,11 +4980,8 @@ static inline void finish_task(struct task_struct *prev)
 	 * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
 	 */
 	smp_store_release(&prev->on_cpu, 0);
-#endif
 }
 
-#ifdef CONFIG_SMP
-
 static void do_balance_callbacks(struct rq *rq, struct balance_callback *head)
 {
 	void (*func)(struct rq *rq);
@@ -5107,14 +5063,6 @@ void balance_callbacks(struct rq *rq, struct balance_callback *head)
 	}
 }
 
-#else /* !CONFIG_SMP: */
-
-static inline void __balance_callbacks(struct rq *rq)
-{
-}
-
-#endif /* !CONFIG_SMP */
-
 static inline void
 prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
 {
@@ -5563,7 +5511,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 	struct rq *rq;
 	u64 ns;
 
-#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
+#ifdef CONFIG_64BIT
 	/*
 	 * 64-bit doesn't need locks to atomically read a 64-bit value.
 	 * So we have a optimization chance when the task's delta_exec is 0.
@@ -5690,12 +5638,10 @@ void sched_tick(void)
 	if (donor->flags & PF_WQ_WORKER)
 		wq_worker_tick(donor);
 
-#ifdef CONFIG_SMP
 	if (!scx_switched_all()) {
 		rq->idle_balance = idle_cpu(cpu);
 		sched_balance_trigger(rq);
 	}
-#endif
 }
 
 #ifdef CONFIG_NO_HZ_FULL
@@ -7819,12 +7765,10 @@ void show_state_filter(unsigned int state_filter)
  */
 void __init init_idle(struct task_struct *idle, int cpu)
 {
-#ifdef CONFIG_SMP
 	struct affinity_context ac = (struct affinity_context) {
 		.new_mask  = cpumask_of(cpu),
 		.flags     = 0,
 	};
-#endif
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;
 
@@ -7840,13 +7784,11 @@ void __init init_idle(struct task_struct *idle, int cpu)
 	idle->flags |= PF_KTHREAD | PF_NO_SETAFFINITY;
 	kthread_set_per_cpu(idle, cpu);
 
-#ifdef CONFIG_SMP
 	/*
 	 * No validation and serialization required at boot time and for
 	 * setting up the idle tasks of not yet online CPUs.
 	 */
 	set_cpus_allowed_common(idle, &ac);
-#endif
 	/*
 	 * We're having a chicken and egg problem, even though we are
 	 * holding rq->lock, the CPU isn't yet set to this CPU so the
@@ -7865,9 +7807,7 @@ void __init init_idle(struct task_struct *idle, int cpu)
 	rq_set_donor(rq, idle);
 	rcu_assign_pointer(rq->curr, idle);
 	idle->on_rq = TASK_ON_RQ_QUEUED;
-#ifdef CONFIG_SMP
 	idle->on_cpu = 1;
-#endif
 	raw_spin_rq_unlock(rq);
 	raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
 
@@ -7880,13 +7820,9 @@ void __init init_idle(struct task_struct *idle, int cpu)
 	idle->sched_class = &idle_sched_class;
 	ftrace_graph_init_idle_task(idle, cpu);
 	vtime_init_idle(idle, cpu);
-#ifdef CONFIG_SMP
 	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
-#endif
 }
 
-#ifdef CONFIG_SMP
-
 int cpuset_cpumask_can_shrink(const struct cpumask *cur,
 			      const struct cpumask *trial)
 {
@@ -8480,13 +8416,6 @@ static int __init migration_init(void)
 }
 early_initcall(migration_init);
 
-#else /* !CONFIG_SMP: */
-void __init sched_init_smp(void)
-{
-	sched_init_granularity();
-}
-#endif /* !CONFIG_SMP */
-
 int in_sched_functions(unsigned long addr)
 {
 	return in_lock_functions(addr) ||
@@ -8512,9 +8441,7 @@ void __init sched_init(void)
 	int i;
 
 	/* Make sure the linker didn't screw up */
-#ifdef CONFIG_SMP
 	BUG_ON(!sched_class_above(&stop_sched_class, &dl_sched_class));
-#endif
 	BUG_ON(!sched_class_above(&dl_sched_class, &rt_sched_class));
 	BUG_ON(!sched_class_above(&rt_sched_class, &fair_sched_class));
 	BUG_ON(!sched_class_above(&fair_sched_class, &idle_sched_class));
@@ -8557,9 +8484,7 @@ void __init sched_init(void)
 #endif /* CONFIG_RT_GROUP_SCHED */
 	}
 
-#ifdef CONFIG_SMP
 	init_defrootdomain();
-#endif
 
 #ifdef CONFIG_RT_GROUP_SCHED
 	init_rt_bandwidth(&root_task_group.rt_bandwidth,
@@ -8620,7 +8545,6 @@ void __init sched_init(void)
 		rq->rt.rt_runtime = global_rt_runtime();
 		init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
 #endif
-#ifdef CONFIG_SMP
 		rq->sd = NULL;
 		rq->rd = NULL;
 		rq->cpu_capacity = SCHED_CAPACITY_SCALE;
@@ -8637,16 +8561,15 @@ void __init sched_init(void)
 		INIT_LIST_HEAD(&rq->cfs_tasks);
 
 		rq_attach_root(rq, &def_root_domain);
-# ifdef CONFIG_NO_HZ_COMMON
+#ifdef CONFIG_NO_HZ_COMMON
 		rq->last_blocked_load_update_tick = jiffies;
 		atomic_set(&rq->nohz_flags, 0);
 
 		INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq);
-# endif
-# ifdef CONFIG_HOTPLUG_CPU
+#endif
+#ifdef CONFIG_HOTPLUG_CPU
 		rcuwait_init(&rq->hotplug_wait);
-# endif
-#endif /* CONFIG_SMP */
+#endif
 		hrtick_rq_init(rq);
 		atomic_set(&rq->nr_iowait, 0);
 		fair_server_init(rq);
@@ -8696,8 +8619,9 @@ void __init sched_init(void)
 
 #ifdef CONFIG_SMP
 	idle_thread_set_boot_cpu();
-	balance_push_set(smp_processor_id(), false);
 #endif
+
+	balance_push_set(smp_processor_id(), false);
 	init_sched_fair_class();
 	init_sched_ext_class();
 
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index 3f7c73d1d189..11c0f1faa7e1 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -17,7 +17,6 @@ struct cpudl {
 	struct cpudl_item	*elements;
 };
 
-#ifdef CONFIG_SMP
 int  cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask);
 void cpudl_set(struct cpudl *cp, int cpu, u64 dl);
 void cpudl_clear(struct cpudl *cp, int cpu);
@@ -25,4 +24,3 @@ int  cpudl_init(struct cpudl *cp);
 void cpudl_set_freecpu(struct cpudl *cp, int cpu);
 void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
 void cpudl_cleanup(struct cpudl *cp);
-#endif /* CONFIG_SMP */
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index 24add19625ff..6f562088c056 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -20,7 +20,6 @@ struct cpupri {
 	int			*cpu_to_pri;
 };
 
-#ifdef CONFIG_SMP
 int  cpupri_find(struct cpupri *cp, struct task_struct *p,
 		 struct cpumask *lowest_mask);
 int  cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
@@ -29,4 +28,3 @@ int  cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
 void cpupri_set(struct cpupri *cp, int cpu, int pri);
 int  cpupri_init(struct cpupri *cp);
 void cpupri_cleanup(struct cpupri *cp);
-#endif /* CONFIG_SMP */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index a2cea68b2198..bf9b70a3ff95 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -115,7 +115,6 @@ static inline bool is_dl_boosted(struct sched_dl_entity *dl_se)
 }
 #endif /* !CONFIG_RT_MUTEXES */
 
-#ifdef CONFIG_SMP
 static inline struct dl_bw *dl_bw_of(int i)
 {
 	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
@@ -195,35 +194,6 @@ void __dl_update(struct dl_bw *dl_b, s64 bw)
 		rq->dl.extra_bw += bw;
 	}
 }
-#else /* !CONFIG_SMP: */
-static inline struct dl_bw *dl_bw_of(int i)
-{
-	return &cpu_rq(i)->dl.dl_bw;
-}
-
-static inline int dl_bw_cpus(int i)
-{
-	return 1;
-}
-
-static inline unsigned long dl_bw_capacity(int i)
-{
-	return SCHED_CAPACITY_SCALE;
-}
-
-bool dl_bw_visited(int cpu, u64 cookie)
-{
-	return false;
-}
-
-static inline
-void __dl_update(struct dl_bw *dl_b, s64 bw)
-{
-	struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw);
-
-	dl->extra_bw += bw;
-}
-#endif /* !CONFIG_SMP */
 
 static inline
 void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
@@ -556,23 +526,17 @@ void init_dl_rq(struct dl_rq *dl_rq)
 {
 	dl_rq->root = RB_ROOT_CACHED;
 
-#ifdef CONFIG_SMP
 	/* zero means no -deadline tasks */
 	dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0;
 
 	dl_rq->overloaded = 0;
 	dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED;
-#else
-	init_dl_bw(&dl_rq->dl_bw);
-#endif
 
 	dl_rq->running_bw = 0;
 	dl_rq->this_bw = 0;
 	init_dl_rq_bw_ratio(dl_rq);
 }
 
-#ifdef CONFIG_SMP
-
 static inline int dl_overloaded(struct rq *rq)
 {
 	return atomic_read(&rq->rd->dlo_count);
@@ -757,37 +721,6 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
 	return later_rq;
 }
 
-#else /* !CONFIG_SMP: */
-
-static inline
-void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
-{
-}
-
-static inline
-void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
-{
-}
-
-static inline
-void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
-{
-}
-
-static inline
-void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
-{
-}
-
-static inline void deadline_queue_push_tasks(struct rq *rq)
-{
-}
-
-static inline void deadline_queue_pull_task(struct rq *rq)
-{
-}
-#endif /* !CONFIG_SMP */
-
 static void
 enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags);
 static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
@@ -1199,7 +1132,6 @@ static int start_dl_timer(struct sched_dl_entity *dl_se)
 
 static void __push_dl_task(struct rq *rq, struct rq_flags *rf)
 {
-#ifdef CONFIG_SMP
 	/*
 	 * Queueing this task back might have overloaded rq, check if we need
 	 * to kick someone away.
@@ -1213,7 +1145,6 @@ static void __push_dl_task(struct rq *rq, struct rq_flags *rf)
 		push_dl_task(rq);
 		rq_repin_lock(rq, rf);
 	}
-#endif /* CONFIG_SMP */
 }
 
 /* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */
@@ -1343,7 +1274,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 		goto unlock;
 	}
 
-#ifdef CONFIG_SMP
 	if (unlikely(!rq->online)) {
 		/*
 		 * If the runqueue is no longer available, migrate the
@@ -1360,7 +1290,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 		 * there.
 		 */
 	}
-#endif /* CONFIG_SMP */
 
 	enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
 	if (dl_task(rq->donor))
@@ -1848,8 +1777,6 @@ static void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se)
 #define __node_2_dle(node) \
 	rb_entry((node), struct sched_dl_entity, rb_node)
 
-#ifdef CONFIG_SMP
-
 static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
 {
 	struct rq *rq = rq_of_dl_rq(dl_rq);
@@ -1885,13 +1812,6 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
 	}
 }
 
-#else /* !CONFIG_SMP: */
-
-static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
-static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
-
-#endif /* !CONFIG_SMP */
-
 static inline
 void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 {
@@ -2218,8 +2138,6 @@ static void yield_task_dl(struct rq *rq)
 	rq_clock_skip_update(rq);
 }
 
-#ifdef CONFIG_SMP
-
 static inline bool dl_task_is_earliest_deadline(struct task_struct *p,
 						 struct rq *rq)
 {
@@ -2349,7 +2267,6 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
 
 	return sched_stop_runnable(rq) || sched_dl_runnable(rq);
 }
-#endif /* CONFIG_SMP */
 
 /*
  * Only called when both the current and waking task are -deadline
@@ -2363,7 +2280,6 @@ static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p,
 		return;
 	}
 
-#ifdef CONFIG_SMP
 	/*
 	 * In the unlikely case current and p have the same deadline
 	 * let us try to decide what's the best thing to do...
@@ -2371,7 +2287,6 @@ static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p,
 	if ((p->dl.deadline == rq->donor->dl.deadline) &&
 	    !test_tsk_need_resched(rq->curr))
 		check_preempt_equal_dl(rq, p);
-#endif /* CONFIG_SMP */
 }
 
 #ifdef CONFIG_SCHED_HRTICK
@@ -2504,8 +2419,6 @@ static void task_fork_dl(struct task_struct *p)
 	 */
 }
 
-#ifdef CONFIG_SMP
-
 /* Only try algorithms three times */
 #define DL_MAX_TRIES 3
 
@@ -2999,8 +2912,6 @@ void dl_clear_root_domain_cpu(int cpu)
 	dl_clear_root_domain(cpu_rq(cpu)->rd);
 }
 
-#endif /* CONFIG_SMP */
-
 static void switched_from_dl(struct rq *rq, struct task_struct *p)
 {
 	/*
@@ -3073,10 +2984,8 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
 	}
 
 	if (rq->donor != p) {
-#ifdef CONFIG_SMP
 		if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
 			deadline_queue_push_tasks(rq);
-#endif
 		if (dl_task(rq->donor))
 			wakeup_preempt_dl(rq, p, 0);
 		else
@@ -3153,7 +3062,6 @@ DEFINE_SCHED_CLASS(dl) = {
 	.put_prev_task		= put_prev_task_dl,
 	.set_next_task		= set_next_task_dl,
 
-#ifdef CONFIG_SMP
 	.balance		= balance_dl,
 	.select_task_rq		= select_task_rq_dl,
 	.migrate_task_rq	= migrate_task_rq_dl,
@@ -3162,7 +3070,6 @@ DEFINE_SCHED_CLASS(dl) = {
 	.rq_offline             = rq_offline_dl,
 	.task_woken		= task_woken_dl,
 	.find_lock_rq		= find_lock_later_rq,
-#endif /* CONFIG_SMP */
 
 	.task_tick		= task_tick_dl,
 	.task_fork              = task_fork_dl,
@@ -3462,7 +3369,6 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
 	return false;
 }
 
-#ifdef CONFIG_SMP
 int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
 				 const struct cpumask *trial)
 {
@@ -3574,7 +3480,6 @@ void dl_bw_free(int cpu, u64 dl_bw)
 {
 	dl_bw_manage(dl_bw_req_free, cpu, dl_bw);
 }
-#endif /* CONFIG_SMP */
 
 void print_dl_stats(struct seq_file *m, int cpu)
 {
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 748709c03214..04c0354f05e4 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -286,7 +286,6 @@ static const struct file_operations sched_dynamic_fops = {
 
 __read_mostly bool sched_debug_verbose;
 
-#ifdef CONFIG_SMP
 static struct dentry           *sd_dentry;
 
 
@@ -314,9 +313,6 @@ static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf,
 
 	return result;
 }
-#else /* !CONFIG_SMP: */
-# define sched_verbose_write debugfs_write_file_bool
-#endif /* !CONFIG_SMP */
 
 static const struct file_operations sched_verbose_fops = {
 	.read =         debugfs_read_file_bool,
@@ -543,8 +539,6 @@ static __init int sched_init_debug(void)
 }
 late_initcall(sched_init_debug);
 
-#ifdef CONFIG_SMP
-
 static cpumask_var_t		sd_sysctl_cpus;
 
 static int sd_flags_show(struct seq_file *m, void *v)
@@ -655,8 +649,6 @@ void dirty_sched_domain_sysctl(int cpu)
 		__cpumask_set_cpu(cpu, sd_sysctl_cpus);
 }
 
-#endif /* CONFIG_SMP */
-
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
 {
@@ -932,11 +924,7 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
 	SEQ_printf(m, "  .%-30s: %lu\n", #x, (unsigned long)(dl_rq->x))
 
 	PU(dl_nr_running);
-#ifdef CONFIG_SMP
 	dl_bw = &cpu_rq(cpu)->rd->dl_bw;
-#else
-	dl_bw = &dl_rq->dl_bw;
-#endif
 	SEQ_printf(m, "  .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw);
 	SEQ_printf(m, "  .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw);
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1fabbe01bf93..6b17d3da034a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -88,7 +88,6 @@ static int __init setup_sched_thermal_decay_shift(char *str)
 }
 __setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
 
-#ifdef CONFIG_SMP
 /*
  * For asym packing, by default the lower numbered CPU has higher priority.
  */
@@ -111,7 +110,6 @@ int __weak arch_asym_cpu_priority(int cpu)
  * (default: ~5%)
  */
 #define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)
-#endif /* CONFIG_SMP */
 
 #ifdef CONFIG_CFS_BANDWIDTH
 /*
@@ -996,7 +994,6 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 /**************************************************************
  * Scheduling class statistics methods:
  */
-#ifdef CONFIG_SMP
 int sched_update_scaling(void)
 {
 	unsigned int factor = get_update_sysctl_factor();
@@ -1008,7 +1005,6 @@ int sched_update_scaling(void)
 
 	return 0;
 }
-#endif /* CONFIG_SMP */
 
 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
 
@@ -1042,8 +1038,6 @@ static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 #include "pelt.h"
 
-#ifdef CONFIG_SMP
-
 static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
 static unsigned long task_h_load(struct task_struct *p);
 static unsigned long capacity_of(int cpu);
@@ -1132,18 +1126,6 @@ void post_init_entity_util_avg(struct task_struct *p)
 	sa->runnable_avg = sa->util_avg;
 }
 
-#else /* !CONFIG_SMP: */
-void init_entity_runnable_average(struct sched_entity *se)
-{
-}
-void post_init_entity_util_avg(struct task_struct *p)
-{
-}
-static void update_tg_load_avg(struct cfs_rq *cfs_rq)
-{
-}
-#endif /* !CONFIG_SMP */
-
 static s64 update_curr_se(struct rq *rq, struct sched_entity *curr)
 {
 	u64 now = rq_clock_task(rq);
@@ -3698,14 +3680,12 @@ static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_add(&cfs_rq->load, se->load.weight);
-#ifdef CONFIG_SMP
 	if (entity_is_task(se)) {
 		struct rq *rq = rq_of(cfs_rq);
 
 		account_numa_enqueue(rq, task_of(se));
 		list_add(&se->group_node, &rq->cfs_tasks);
 	}
-#endif
 	cfs_rq->nr_queued++;
 }
 
@@ -3713,12 +3693,10 @@ static void
 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_sub(&cfs_rq->load, se->load.weight);
-#ifdef CONFIG_SMP
 	if (entity_is_task(se)) {
 		account_numa_dequeue(rq_of(cfs_rq), task_of(se));
 		list_del_init(&se->group_node);
 	}
-#endif
 	cfs_rq->nr_queued--;
 }
 
@@ -3770,7 +3748,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	*ptr -= min_t(typeof(*ptr), *ptr, _val);		\
 } while (0)
 
-#ifdef CONFIG_SMP
 static inline void
 enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
@@ -3787,12 +3764,6 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
 					  cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
 }
-#else /* !CONFIG_SMP: */
-static inline void
-enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
-static inline void
-dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
-#endif /* !CONFIG_SMP */
 
 static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags);
 
@@ -3824,13 +3795,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 
 	update_load_set(&se->load, weight);
 
-#ifdef CONFIG_SMP
 	do {
 		u32 divider = get_pelt_divider(&se->avg);
 
 		se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
 	} while (0);
-#endif
 
 	enqueue_load_avg(cfs_rq, se);
 	if (se->on_rq) {
@@ -3865,7 +3834,6 @@ static void reweight_task_fair(struct rq *rq, struct task_struct *p,
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-#ifdef CONFIG_SMP
 /*
  * All this does is approximate the hierarchical proportion which includes that
  * global sum we all love to hate.
@@ -3972,7 +3940,6 @@ static long calc_group_shares(struct cfs_rq *cfs_rq)
 	 */
 	return clamp_t(long, shares, MIN_SHARES, tg_shares);
 }
-#endif /* CONFIG_SMP */
 
 /*
  * Recomputes the group entity based on the current state of its group
@@ -3993,11 +3960,7 @@ static void update_cfs_group(struct sched_entity *se)
 	if (throttled_hierarchy(gcfs_rq))
 		return;
 
-#ifndef CONFIG_SMP
-	shares = READ_ONCE(gcfs_rq->tg->shares);
-#else
 	shares = calc_group_shares(gcfs_rq);
-#endif
 	if (unlikely(se->load.weight != shares))
 		reweight_entity(cfs_rq_of(se), se, shares);
 }
@@ -4031,7 +3994,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
 	}
 }
 
-#ifdef CONFIG_SMP
 static inline bool load_avg_is_decayed(struct sched_avg *sa)
 {
 	if (sa->load_sum)
@@ -5146,48 +5108,6 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
 	rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
 }
 
-#else /* !CONFIG_SMP: */
-
-static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
-{
-	return !cfs_rq->nr_queued;
-}
-
-#define UPDATE_TG	0x0
-#define SKIP_AGE_LOAD	0x0
-#define DO_ATTACH	0x0
-#define DO_DETACH	0x0
-
-static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
-{
-	cfs_rq_util_change(cfs_rq, 0);
-}
-
-static inline void remove_entity_load_avg(struct sched_entity *se) {}
-
-static inline void
-attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
-static inline void
-detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
-
-static inline int sched_balance_newidle(struct rq *rq, struct rq_flags *rf)
-{
-	return 0;
-}
-
-static inline void
-util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
-
-static inline void
-util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
-
-static inline void
-util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p,
-		bool task_sleep) {}
-static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
-
-#endif /* !CONFIG_SMP */
-
 void __setparam_fair(struct task_struct *p, const struct sched_attr *attr)
 {
 	struct sched_entity *se = &p->se;
@@ -6090,7 +6010,6 @@ unthrottle_throttle:
 		resched_curr(rq);
 }
 
-#ifdef CONFIG_SMP
 static void __cfsb_csd_unthrottle(void *arg)
 {
 	struct cfs_rq *cursor, *tmp;
@@ -6149,12 +6068,6 @@ static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
 	if (first)
 		smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
 }
-#else /* !CONFIG_SMP: */
-static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
-{
-	unthrottle_cfs_rq(cfs_rq);
-}
-#endif /* !CONFIG_SMP */
 
 static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
 {
@@ -6610,7 +6523,6 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 	 * guaranteed at this point that no additional cfs_rq of this group can
 	 * join a CSD list.
 	 */
-#ifdef CONFIG_SMP
 	for_each_possible_cpu(i) {
 		struct rq *rq = cpu_rq(i);
 		unsigned long flags;
@@ -6622,7 +6534,6 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 		__cfsb_csd_unthrottle(rq);
 		local_irq_restore(flags);
 	}
-#endif
 }
 
 /*
@@ -6835,7 +6746,6 @@ static inline void hrtick_update(struct rq *rq)
 }
 #endif /* !CONFIG_SCHED_HRTICK */
 
-#ifdef CONFIG_SMP
 static inline bool cpu_overutilized(int cpu)
 {
 	unsigned long  rq_util_min, rq_util_max;
@@ -6877,9 +6787,6 @@ static inline void check_update_overutilized_status(struct rq *rq)
 	if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu))
 		set_rd_overutilized(rq->rd, 1);
 }
-#else /* !CONFIG_SMP: */
-static inline void check_update_overutilized_status(struct rq *rq) { }
-#endif /* !CONFIG_SMP */
 
 /* Runqueue only has SCHED_IDLE tasks enqueued */
 static int sched_idle_rq(struct rq *rq)
@@ -6888,12 +6795,10 @@ static int sched_idle_rq(struct rq *rq)
 			rq->nr_running);
 }
 
-#ifdef CONFIG_SMP
 static int sched_idle_cpu(int cpu)
 {
 	return sched_idle_rq(cpu_rq(cpu));
 }
-#endif
 
 static void
 requeue_delayed_entity(struct sched_entity *se)
@@ -7208,8 +7113,6 @@ static inline unsigned int cfs_h_nr_delayed(struct rq *rq)
 	return (rq->cfs.h_nr_queued - rq->cfs.h_nr_runnable);
 }
 
-#ifdef CONFIG_SMP
-
 /* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). */
 static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
 static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
@@ -8745,9 +8648,6 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 
 	return sched_balance_newidle(rq, rf) != 0;
 }
-#else /* !CONFIG_SMP: */
-static inline void set_task_max_allowed_capacity(struct task_struct *p) {}
-#endif /* !CONFIG_SMP */
 
 static void set_next_buddy(struct sched_entity *se)
 {
@@ -9057,7 +8957,6 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
 	return true;
 }
 
-#ifdef CONFIG_SMP
 /**************************************************
  * Fair scheduling class load-balancing methods.
  *
@@ -12980,8 +12879,6 @@ static void rq_offline_fair(struct rq *rq)
 	clear_tg_offline_cfs_rqs(rq);
 }
 
-#endif /* CONFIG_SMP */
-
 #ifdef CONFIG_SCHED_CORE
 static inline bool
 __entity_slice_used(struct sched_entity *se, int min_nr_tasks)
@@ -13209,7 +13106,6 @@ static void detach_entity_cfs_rq(struct sched_entity *se)
 {
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
-#ifdef CONFIG_SMP
 	/*
 	 * In case the task sched_avg hasn't been attached:
 	 * - A forked task which hasn't been woken up by wake_up_new_task().
@@ -13218,7 +13114,6 @@ static void detach_entity_cfs_rq(struct sched_entity *se)
 	 */
 	if (!se->avg.last_update_time)
 		return;
-#endif
 
 	/* Catch up with the cfs_rq and remove our load when we leave */
 	update_load_avg(cfs_rq, se, 0);
@@ -13282,7 +13177,6 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs
 {
 	struct sched_entity *se = &p->se;
 
-#ifdef CONFIG_SMP
 	if (task_on_rq_queued(p)) {
 		/*
 		 * Move the next running task to the front of the list, so our
@@ -13290,7 +13184,6 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs
 		 */
 		list_move(&se->group_node, &rq->cfs_tasks);
 	}
-#endif
 	if (!first)
 		return;
 
@@ -13328,9 +13221,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	cfs_rq->tasks_timeline = RB_ROOT_CACHED;
 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
-#ifdef CONFIG_SMP
 	raw_spin_lock_init(&cfs_rq->removed.lock);
-#endif
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -13345,10 +13236,8 @@ static void task_change_group_fair(struct task_struct *p)
 
 	detach_task_cfs_rq(p);
 
-#ifdef CONFIG_SMP
 	/* Tell se's cfs_rq has been changed -- migrated */
 	p->se.avg.last_update_time = 0;
-#endif
 	set_task_rq(p, task_cpu(p));
 	attach_task_cfs_rq(p);
 }
@@ -13644,7 +13533,6 @@ DEFINE_SCHED_CLASS(fair) = {
 	.put_prev_task		= put_prev_task_fair,
 	.set_next_task          = set_next_task_fair,
 
-#ifdef CONFIG_SMP
 	.balance		= balance_fair,
 	.select_task_rq		= select_task_rq_fair,
 	.migrate_task_rq	= migrate_task_rq_fair,
@@ -13654,7 +13542,6 @@ DEFINE_SCHED_CLASS(fair) = {
 
 	.task_dead		= task_dead_fair,
 	.set_cpus_allowed	= set_cpus_allowed_fair,
-#endif
 
 	.task_tick		= task_tick_fair,
 	.task_fork		= task_fork_fair,
@@ -13717,7 +13604,6 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m)
 
 __init void init_sched_fair_class(void)
 {
-#ifdef CONFIG_SMP
 	int i;
 
 	for_each_possible_cpu(i) {
@@ -13739,5 +13625,4 @@ __init void init_sched_fair_class(void)
 	nohz.next_blocked = jiffies;
 	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
 #endif
-#endif /* CONFIG_SMP */
 }
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index a5d4933e6b70..62c3fa543c0f 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -3,7 +3,6 @@
 #define _KERNEL_SCHED_PELT_H
 #include "sched.h"
 
-#ifdef CONFIG_SMP
 #include "sched-pelt.h"
 
 int __update_load_avg_blocked_se(u64 now, struct sched_entity *se);
@@ -187,55 +186,4 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
 }
 #endif /* !CONFIG_CFS_BANDWIDTH */
 
-#else /* !CONFIG_SMP: */
-
-static inline int
-update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
-{
-	return 0;
-}
-
-static inline int
-update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
-{
-	return 0;
-}
-
-static inline int
-update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
-{
-	return 0;
-}
-
-static inline int
-update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
-{
-	return 0;
-}
-
-static inline u64 hw_load_avg(struct rq *rq)
-{
-	return 0;
-}
-
-static inline int
-update_irq_load_avg(struct rq *rq, u64 running)
-{
-	return 0;
-}
-
-static inline u64 rq_clock_pelt(struct rq *rq)
-{
-	return rq_clock_task(rq);
-}
-
-static inline void
-update_rq_clock_pelt(struct rq *rq, s64 delta) { }
-
-static inline void
-update_idle_rq_clock_pelt(struct rq *rq) { }
-
-static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { }
-#endif /* !CONFIG_SMP */
-
 #endif /* _KERNEL_SCHED_PELT_H */
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 7e8ed05d302a..ab211706b160 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2454,7 +2454,11 @@ void __init init_sched_rt_class(void)
 					GFP_KERNEL, cpu_to_node(i));
 	}
 }
-#endif /* CONFIG_SMP */
+#else /* !CONFIG_SMP: */
+void __init init_sched_rt_class(void)
+{
+}
+#endif /* !CONFIG_SMP */
 
 /*
  * When switching a task to RT, we may overload the runqueue
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2bf804b8c89b..7a7ebc2a3675 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -443,14 +443,12 @@ struct task_group {
 	/* runqueue "owned" by this group on each CPU */
 	struct cfs_rq		**cfs_rq;
 	unsigned long		shares;
-#ifdef CONFIG_SMP
 	/*
 	 * load_avg can be heavily contended at clock tick time, so put
 	 * it in its own cache-line separated from the fields above which
 	 * will also be accessed at each tick.
 	 */
 	atomic_long_t		load_avg ____cacheline_aligned;
-#endif /* CONFIG_SMP */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -574,13 +572,8 @@ extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
 
 extern int sched_group_set_idle(struct task_group *tg, long idle);
 
-#ifdef CONFIG_SMP
 extern void set_task_rq_fair(struct sched_entity *se,
 			     struct cfs_rq *prev, struct cfs_rq *next);
-#else /* !CONFIG_SMP: */
-static inline void set_task_rq_fair(struct sched_entity *se,
-			     struct cfs_rq *prev, struct cfs_rq *next) { }
-#endif /* !CONFIG_SMP */
 #else /* !CONFIG_FAIR_GROUP_SCHED: */
 static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) { return 0; }
 static inline int sched_group_set_idle(struct task_group *tg, long idle) { return 0; }
@@ -668,7 +661,6 @@ struct cfs_rq {
 	struct sched_entity	*curr;
 	struct sched_entity	*next;
 
-#ifdef CONFIG_SMP
 	/*
 	 * CFS load tracking
 	 */
@@ -700,7 +692,6 @@ struct cfs_rq {
 	u64			last_h_load_update;
 	struct sched_entity	*h_load_next;
 #endif /* CONFIG_FAIR_GROUP_SCHED */
-#endif /* CONFIG_SMP */
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	struct rq		*rq;	/* CPU runqueue to which this cfs_rq is attached */
@@ -797,14 +788,10 @@ struct rt_rq {
 	struct rt_prio_array	active;
 	unsigned int		rt_nr_running;
 	unsigned int		rr_nr_running;
-#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 	struct {
 		int		curr; /* highest queued rt task prio */
-#ifdef CONFIG_SMP
 		int		next; /* next highest */
-#endif
 	} highest_prio;
-#endif
 #ifdef CONFIG_SMP
 	bool			overloaded;
 	struct plist_head	pushable_tasks;
@@ -840,7 +827,6 @@ struct dl_rq {
 
 	unsigned int		dl_nr_running;
 
-#ifdef CONFIG_SMP
 	/*
 	 * Deadline values of the currently executing and the
 	 * earliest ready task on this rq. Caching these facilitates
@@ -860,9 +846,7 @@ struct dl_rq {
 	 * of the leftmost (earliest deadline) element.
 	 */
 	struct rb_root_cached	pushable_dl_tasks_root;
-#else /* !CONFIG_SMP: */
-	struct dl_bw		dl_bw;
-#endif /* !CONFIG_SMP */
+
 	/*
 	 * "Active utilization" for this runqueue: increased when a
 	 * task wakes up (becomes TASK_RUNNING) and decreased when a
@@ -933,7 +917,6 @@ static inline long se_runnable(struct sched_entity *se)
 
 #endif /* !CONFIG_FAIR_GROUP_SCHED */
 
-#ifdef CONFIG_SMP
 /*
  * XXX we want to get rid of these helpers and use the full load resolution.
  */
@@ -1044,7 +1027,6 @@ static inline void set_rd_overloaded(struct root_domain *rd, int status)
 #ifdef HAVE_RT_PUSH_IPI
 extern void rto_push_irq_work_func(struct irq_work *work);
 #endif
-#endif /* CONFIG_SMP */
 
 #ifdef CONFIG_UCLAMP_TASK
 /*
@@ -1108,18 +1090,14 @@ struct rq {
 	unsigned int		numa_migrate_on;
 #endif
 #ifdef CONFIG_NO_HZ_COMMON
-#ifdef CONFIG_SMP
 	unsigned long		last_blocked_load_update_tick;
 	unsigned int		has_blocked_load;
 	call_single_data_t	nohz_csd;
-#endif /* CONFIG_SMP */
 	unsigned int		nohz_tick_stopped;
 	atomic_t		nohz_flags;
 #endif /* CONFIG_NO_HZ_COMMON */
 
-#ifdef CONFIG_SMP
 	unsigned int		ttwu_pending;
-#endif
 	u64			nr_switches;
 
 #ifdef CONFIG_UCLAMP_TASK
@@ -1184,7 +1162,6 @@ struct rq {
 	int membarrier_state;
 #endif
 
-#ifdef CONFIG_SMP
 	struct root_domain		*rd;
 	struct sched_domain __rcu	*sd;
 
@@ -1225,7 +1202,6 @@ struct rq {
 #ifdef CONFIG_HOTPLUG_CPU
 	struct rcuwait		hotplug_wait;
 #endif
-#endif /* CONFIG_SMP */
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 	u64			prev_irq_time;
@@ -1272,9 +1248,7 @@ struct rq {
 	struct cpuidle_state	*idle_state;
 #endif
 
-#ifdef CONFIG_SMP
 	unsigned int		nr_pinned;
-#endif
 	unsigned int		push_busy;
 	struct cpu_stop_work	push_work;
 
@@ -1300,7 +1274,7 @@ struct rq {
 	/* Scratch cpumask to be temporarily used under rq_lock */
 	cpumask_var_t		scratch_mask;
 
-#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_SMP)
+#ifdef CONFIG_CFS_BANDWIDTH
 	call_single_data_t	cfsb_csd;
 	struct list_head	cfsb_csd_list;
 #endif
@@ -1963,8 +1937,6 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
 
 #endif /* !CONFIG_NUMA_BALANCING */
 
-#ifdef CONFIG_SMP
-
 static inline void
 queue_balance_callback(struct rq *rq,
 		       struct balance_callback *head,
@@ -2130,8 +2102,6 @@ static inline const struct cpumask *task_user_cpus(struct task_struct *p)
 	return p->user_cpus_ptr;
 }
 
-#endif /* CONFIG_SMP */
-
 #ifdef CONFIG_CGROUP_SCHED
 
 /*
@@ -2418,7 +2388,6 @@ struct sched_class {
 	void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next);
 	void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first);
 
-#ifdef CONFIG_SMP
 	int  (*select_task_rq)(struct task_struct *p, int task_cpu, int flags);
 
 	void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
@@ -2431,7 +2400,6 @@ struct sched_class {
 	void (*rq_offline)(struct rq *rq);
 
 	struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq);
-#endif /* CONFIG_SMP */
 
 	void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
 	void (*task_fork)(struct task_struct *p);
@@ -2583,8 +2551,6 @@ extern struct task_struct *pick_task_idle(struct rq *rq);
 #define SCA_MIGRATE_ENABLE	0x04
 #define SCA_USER		0x08
 
-#ifdef CONFIG_SMP
-
 extern void update_group_capacity(struct sched_domain *sd, int cpu);
 
 extern void sched_balance_trigger(struct rq *rq);
@@ -2636,26 +2602,6 @@ static inline struct task_struct *get_push_task(struct rq *rq)
 
 extern int push_cpu_stop(void *arg);
 
-#else /* !CONFIG_SMP: */
-
-static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu)
-{
-	return true;
-}
-
-static inline int __set_cpus_allowed_ptr(struct task_struct *p,
-					 struct affinity_context *ctx)
-{
-	return set_cpus_allowed_ptr(p, ctx->new_mask);
-}
-
-static inline cpumask_t *alloc_user_cpus_ptr(int node)
-{
-	return NULL;
-}
-
-#endif /* !CONFIG_SMP */
-
 #ifdef CONFIG_CPU_IDLE
 
 static inline void idle_set_state(struct rq *rq,
@@ -2932,8 +2878,6 @@ static inline class_##name##_t class_##name##_constructor(type *lock, type *lock
 { class_##name##_t _t = { .lock = lock, .lock2 = lock2 }, *_T = &_t;			\
   _lock; return _t; }
 
-#ifdef CONFIG_SMP
-
 static inline bool rq_order_less(struct rq *rq1, struct rq *rq2)
 {
 #ifdef CONFIG_SCHED_CORE
@@ -3093,42 +3037,6 @@ extern void set_rq_offline(struct rq *rq);
 
 extern bool sched_smp_initialized;
 
-#else /* !CONFIG_SMP: */
-
-/*
- * double_rq_lock - safely lock two runqueues
- *
- * Note this does not disable interrupts like task_rq_lock,
- * you need to do so manually before calling.
- */
-static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
-	__acquires(rq1->lock)
-	__acquires(rq2->lock)
-{
-	WARN_ON_ONCE(!irqs_disabled());
-	WARN_ON_ONCE(rq1 != rq2);
-	raw_spin_rq_lock(rq1);
-	__acquire(rq2->lock);	/* Fake it out ;) */
-	double_rq_clock_clear_update(rq1, rq2);
-}
-
-/*
- * double_rq_unlock - safely unlock two runqueues
- *
- * Note this does not restore interrupts like task_rq_unlock,
- * you need to do so manually after calling.
- */
-static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
-	__releases(rq1->lock)
-	__releases(rq2->lock)
-{
-	WARN_ON_ONCE(rq1 != rq2);
-	raw_spin_rq_unlock(rq1);
-	__release(rq2->lock);
-}
-
-#endif /* !CONFIG_SMP */
-
 DEFINE_LOCK_GUARD_2(double_rq_lock, struct rq,
 		    double_rq_lock(_T->lock, _T->lock2),
 		    double_rq_unlock(_T->lock, _T->lock2))
@@ -3187,7 +3095,7 @@ extern void nohz_balance_exit_idle(struct rq *rq);
 static inline void nohz_balance_exit_idle(struct rq *rq) { }
 #endif /* !CONFIG_NO_HZ_COMMON */
 
-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+#ifdef CONFIG_NO_HZ_COMMON
 extern void nohz_run_idle_balance(int cpu);
 #else
 static inline void nohz_run_idle_balance(int cpu) { }
@@ -3313,8 +3221,6 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { }
 # define arch_scale_freq_invariant()	false
 #endif
 
-#ifdef CONFIG_SMP
-
 unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
 				 unsigned long *min,
 				 unsigned long *max);
@@ -3358,10 +3264,6 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
 	return READ_ONCE(rq->avg_rt.util_avg);
 }
 
-#else /* !CONFIG_SMP: */
-static inline bool update_other_load_avgs(struct rq *rq) { return false; }
-#endif /* !CONFIG_SMP */
-
 #ifdef CONFIG_UCLAMP_TASK
 
 unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
@@ -3580,7 +3482,6 @@ static inline void membarrier_switch_mm(struct rq *rq,
 
 #endif /* !CONFIG_MEMBARRIER */
 
-#ifdef CONFIG_SMP
 static inline bool is_per_cpu_kthread(struct task_struct *p)
 {
 	if (!(p->flags & PF_KTHREAD))
@@ -3591,7 +3492,6 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
 
 	return true;
 }
-#endif /* CONFIG_SMP */
 
 extern void swake_up_all_locked(struct swait_queue_head *q);
 extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
@@ -3890,7 +3790,6 @@ static inline void init_sched_mm_cid(struct task_struct *t) { }
 
 extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
 extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
-#ifdef CONFIG_SMP
 static inline
 void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_struct *task)
 {
@@ -3911,7 +3810,6 @@ bool task_is_pushable(struct rq *rq, struct task_struct *p, int cpu)
 
 	return false;
 }
-#endif /* CONFIG_SMP */
 
 #ifdef CONFIG_RT_MUTEXES
 
@@ -3952,21 +3850,8 @@ extern void check_class_changed(struct rq *rq, struct task_struct *p,
 				const struct sched_class *prev_class,
 				int oldprio);
 
-#ifdef CONFIG_SMP
 extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
 extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
-#else /* !CONFIG_SMP: */
-
-static inline struct balance_callback *splice_balance_callbacks(struct rq *rq)
-{
-	return NULL;
-}
-
-static inline void balance_callbacks(struct rq *rq, struct balance_callback *head)
-{
-}
-
-#endif /* !CONFIG_SMP */
 
 #ifdef CONFIG_SCHED_CLASS_EXT
 /*
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 5cb5e9487f0d..d7fccf871c7d 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -1119,7 +1119,6 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
 	return copy_struct_to_user(uattr, usize, &kattr, sizeof(kattr), NULL);
 }
 
-#ifdef CONFIG_SMP
 int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
 {
 	/*
@@ -1148,7 +1147,6 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
 
 	return 0;
 }
-#endif /* CONFIG_SMP */
 
 int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)
 {
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index ee347d9c5df4..f2c10167f2bc 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -17,8 +17,6 @@ void sched_domains_mutex_unlock(void)
 	mutex_unlock(&sched_domains_mutex);
 }
 
-#ifdef CONFIG_SMP
-
 /* Protected by sched_domains_mutex: */
 static cpumask_var_t sched_domains_tmpmask;
 static cpumask_var_t sched_domains_tmpmask2;
@@ -1322,11 +1320,10 @@ next:
 	update_group_capacity(sd, cpu);
 }
 
-#ifdef CONFIG_SMP
-
 /* Update the "asym_prefer_cpu" when arch_asym_cpu_priority() changes. */
 void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio)
 {
+#ifdef CONFIG_SMP
 	int asym_prefer_cpu = cpu;
 	struct sched_domain *sd;
 
@@ -1376,9 +1373,8 @@ void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio)
 
 		WRITE_ONCE(sg->asym_prefer_cpu, asym_prefer_cpu);
 	}
-}
-
 #endif /* CONFIG_SMP */
+}
 
 /*
  * Set of available CPUs grouped by their corresponding capacities
@@ -2844,5 +2840,3 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 	partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
 	sched_domains_mutex_unlock();
 }
-
-#endif /* CONFIG_SMP */
-- 
cgit v1.2.3


From 06ddd17521bf11a3e7f59dafdf5c148f29467d2c Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 28 May 2025 10:09:02 +0200
Subject: sched/smp: Always define is_percpu_thread() and scheduler_ipi()

Simplify the scheduler by making the CONFIG_SMP=y primitives
of is_percpu_thread() and scheduler_ipi() unconditional.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Shrikanth Hegde <sshegde@linux.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lore.kernel.org/r/20250528080924.2273858-22-mingo@kernel.org
---
 include/linux/sched.h | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 376befdec4b0..eec6b225e9d1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1765,12 +1765,8 @@ extern struct pid *cad_pid;
 
 static __always_inline bool is_percpu_thread(void)
 {
-#ifdef CONFIG_SMP
 	return (current->flags & PF_NO_SETAFFINITY) &&
 		(current->nr_cpus_allowed  == 1);
-#else
-	return true;
-#endif
 }
 
 /* Per-process atomic flags. */
@@ -1967,7 +1963,6 @@ extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec
 	buf;						\
 })
 
-#ifdef CONFIG_SMP
 static __always_inline void scheduler_ipi(void)
 {
 	/*
@@ -1977,9 +1972,6 @@ static __always_inline void scheduler_ipi(void)
 	 */
 	preempt_fold_need_resched();
 }
-#else
-static inline void scheduler_ipi(void) { }
-#endif
 
 extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state);
 
-- 
cgit v1.2.3


From 1f25730e5a780b33f78e3ea23e64d3f75e0b2042 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 28 May 2025 10:09:07 +0200
Subject: sched/smp: Use the SMP version of sched_exec()

Simplify the scheduler by making the CONFIG_SMP=y sched_exec()
code unconditional.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Shrikanth Hegde <sshegde@linux.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lore.kernel.org/r/20250528080924.2273858-27-mingo@kernel.org
---
 include/linux/sched/task.h | 4 ----
 kernel/sched/core.c        | 4 ----
 2 files changed, 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index ca1db4b92c32..c517dbc242f7 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -109,11 +109,7 @@ int kernel_wait(pid_t pid, int *stat);
 extern void free_task(struct task_struct *tsk);
 
 /* sched_exec is called by processes performing an exec */
-#ifdef CONFIG_SMP
 extern void sched_exec(void);
-#else
-#define sched_exec()   {}
-#endif
 
 static inline struct task_struct *get_task_struct(struct task_struct *t)
 {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c108b5c2e115..fa89006e05e9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5419,8 +5419,6 @@ unsigned int nr_iowait(void)
 	return sum;
 }
 
-#ifdef CONFIG_SMP
-
 /*
  * sched_exec - execve() is a valuable balancing opportunity, because at
  * this point the task has the smallest effective memory and cache footprint.
@@ -5444,8 +5442,6 @@ void sched_exec(void)
 	stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
 }
 
-#endif /* CONFIG_SMP */
-
 DEFINE_PER_CPU(struct kernel_stat, kstat);
 DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
 
-- 
cgit v1.2.3


From dfce24f0032439113848939816ef78b6e83f4086 Mon Sep 17 00:00:00 2001
From: Stefan Binding <sbinding@opensource.cirrus.com>
Date: Thu, 12 Jun 2025 17:00:22 +0100
Subject: ALSA: hda: cs35l41: Add support for center channel in CS35L41 HDA

Currently only left and right channels are supported for each amp.
Support is needed for a center channel, using both left and right
channel audio.

Signed-off-by: Stefan Binding <sbinding@opensource.cirrus.com>
Link: https://patch.msgid.link/20250612160029.848104-2-sbinding@opensource.cirrus.com
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/cs35l41.h     |  12 ++++++
 sound/pci/hda/cs35l41_hda.c | 103 ++++++++++++++++++++++++++++----------------
 sound/pci/hda/cs35l41_hda.h |   1 +
 3 files changed, 78 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/include/sound/cs35l41.h b/include/sound/cs35l41.h
index 43c6a9ef8d9f..7542cabfa726 100644
--- a/include/sound/cs35l41.h
+++ b/include/sound/cs35l41.h
@@ -609,6 +609,18 @@
 #define CS35L41_DSP_NG_DELAY_MASK	0x0F00
 #define CS35L41_DSP_NG_DELAY_SHIFT	8
 
+#define CS35L41_ASP_RX1_EN_MASK		0x00010000
+#define CS35L41_ASP_RX1_EN_SHIFT	16
+#define CS35L41_ASP_RX2_EN_MASK		0x00020000
+#define CS35L41_ASP_RX2_EN_SHIFT	17
+#define CS35L41_ASP_TX1_EN_MASK		0x00000001
+#define CS35L41_ASP_TX1_EN_SHIFT	0
+#define CS35L41_ASP_TX2_EN_MASK		0x00000002
+#define CS35L41_ASP_TX2_EN_SHIFT	1
+#define CS35L41_ASP_TX3_EN_MASK		0x00000004
+#define CS35L41_ASP_TX3_EN_SHIFT	2
+#define CS35L41_ASP_TX4_EN_MASK		0x00000008
+#define CS35L41_ASP_TX4_EN_SHIFT	3
 #define CS35L41_ASP_FMT_MASK		0x0700
 #define CS35L41_ASP_FMT_SHIFT		8
 #define CS35L41_ASP_DOUT_HIZ_MASK	0x03
diff --git a/sound/pci/hda/cs35l41_hda.c b/sound/pci/hda/cs35l41_hda.c
index d5bc81099d0d..c2cf3813872a 100644
--- a/sound/pci/hda/cs35l41_hda.c
+++ b/sound/pci/hda/cs35l41_hda.c
@@ -93,47 +93,36 @@ module_param(firmware_autostart, bool, 0444);
 MODULE_PARM_DESC(firmware_autostart, "Allow automatic firmware download on boot"
 			     "(0=Disable, 1=Enable) (default=1); ");
 
+static const char channel_name[3] = { 'L', 'R', 'C' };
+
 static const struct reg_sequence cs35l41_hda_config[] = {
 	{ CS35L41_PLL_CLK_CTRL,		0x00000430 }, // 3072000Hz, BCLK Input, PLL_REFCLK_EN = 1
 	{ CS35L41_DSP_CLK_CTRL,		0x00000003 }, // DSP CLK EN
 	{ CS35L41_GLOBAL_CLK_CTRL,	0x00000003 }, // GLOBAL_FS = 48 kHz
-	{ CS35L41_SP_ENABLES,		0x00010000 }, // ASP_RX1_EN = 1
 	{ CS35L41_SP_RATE_CTRL,		0x00000021 }, // ASP_BCLK_FREQ = 3.072 MHz
 	{ CS35L41_SP_FORMAT,		0x20200200 }, // 32 bits RX/TX slots, I2S, clk consumer
-	{ CS35L41_SP_HIZ_CTRL,		0x00000002 }, // Hi-Z unused
 	{ CS35L41_SP_TX_WL,		0x00000018 }, // 24 cycles/slot
 	{ CS35L41_SP_RX_WL,		0x00000018 }, // 24 cycles/slot
-	{ CS35L41_DAC_PCM1_SRC,		0x00000008 }, // DACPCM1_SRC = ASPRX1
 	{ CS35L41_ASP_TX1_SRC,		0x00000018 }, // ASPTX1 SRC = VMON
 	{ CS35L41_ASP_TX2_SRC,		0x00000019 }, // ASPTX2 SRC = IMON
-	{ CS35L41_ASP_TX3_SRC,		0x00000032 }, // ASPTX3 SRC = ERRVOL
-	{ CS35L41_ASP_TX4_SRC,		0x00000033 }, // ASPTX4 SRC = CLASSH_TGT
-	{ CS35L41_DSP1_RX1_SRC,		0x00000008 }, // DSP1RX1 SRC = ASPRX1
-	{ CS35L41_DSP1_RX2_SRC,		0x00000009 }, // DSP1RX2 SRC = ASPRX2
 	{ CS35L41_DSP1_RX3_SRC,         0x00000018 }, // DSP1RX3 SRC = VMON
 	{ CS35L41_DSP1_RX4_SRC,         0x00000019 }, // DSP1RX4 SRC = IMON
+};
+
+static const struct reg_sequence cs35l41_hda_config_no_dsp[] = {
+	{ CS35L41_SP_HIZ_CTRL,		0x00000002 }, // Hi-Z unused
+	{ CS35L41_DAC_PCM1_SRC,		0x00000008 }, // DACPCM1_SRC = ASPRX1
+	{ CS35L41_ASP_TX3_SRC,		0x00000000 }, // ASPTX3 SRC = ZERO FILL
+	{ CS35L41_ASP_TX4_SRC,		0x00000000 }, // ASPTX4 SRC = ZERO FILL
 	{ CS35L41_DSP1_RX5_SRC,         0x00000020 }, // DSP1RX5 SRC = ERRVOL
+	{ CS35L41_DSP1_RX6_SRC,         0x00000021 }, // DSP1RX6 SRC = CLASSH_TGT
 };
 
 static const struct reg_sequence cs35l41_hda_config_dsp[] = {
-	{ CS35L41_PLL_CLK_CTRL,		0x00000430 }, // 3072000Hz, BCLK Input, PLL_REFCLK_EN = 1
-	{ CS35L41_DSP_CLK_CTRL,		0x00000003 }, // DSP CLK EN
-	{ CS35L41_GLOBAL_CLK_CTRL,	0x00000003 }, // GLOBAL_FS = 48 kHz
-	{ CS35L41_SP_ENABLES,		0x00010001 }, // ASP_RX1_EN = 1, ASP_TX1_EN = 1
-	{ CS35L41_SP_RATE_CTRL,		0x00000021 }, // ASP_BCLK_FREQ = 3.072 MHz
-	{ CS35L41_SP_FORMAT,		0x20200200 }, // 32 bits RX/TX slots, I2S, clk consumer
 	{ CS35L41_SP_HIZ_CTRL,		0x00000003 }, // Hi-Z unused/disabled
-	{ CS35L41_SP_TX_WL,		0x00000018 }, // 24 cycles/slot
-	{ CS35L41_SP_RX_WL,		0x00000018 }, // 24 cycles/slot
 	{ CS35L41_DAC_PCM1_SRC,		0x00000032 }, // DACPCM1_SRC = DSP1TX1
-	{ CS35L41_ASP_TX1_SRC,		0x00000018 }, // ASPTX1 SRC = VMON
-	{ CS35L41_ASP_TX2_SRC,		0x00000019 }, // ASPTX2 SRC = IMON
 	{ CS35L41_ASP_TX3_SRC,		0x00000028 }, // ASPTX3 SRC = VPMON
 	{ CS35L41_ASP_TX4_SRC,		0x00000029 }, // ASPTX4 SRC = VBSTMON
-	{ CS35L41_DSP1_RX1_SRC,		0x00000008 }, // DSP1RX1 SRC = ASPRX1
-	{ CS35L41_DSP1_RX2_SRC,		0x00000008 }, // DSP1RX2 SRC = ASPRX1
-	{ CS35L41_DSP1_RX3_SRC,         0x00000018 }, // DSP1RX3 SRC = VMON
-	{ CS35L41_DSP1_RX4_SRC,         0x00000019 }, // DSP1RX4 SRC = IMON
 	{ CS35L41_DSP1_RX6_SRC,         0x00000029 }, // DSP1RX6 SRC = VBSTMON
 };
 
@@ -657,6 +646,41 @@ static void cs35l41_irq_release(struct cs35l41_hda *cs35l41)
 	cs35l41->irq_errors = 0;
 }
 
+static void cs35l41_update_mixer(struct cs35l41_hda *cs35l41)
+{
+	struct regmap *reg = cs35l41->regmap;
+	unsigned int asp_en = 0;
+	unsigned int dsp1rx2_src = 0;
+
+	regmap_multi_reg_write(reg, cs35l41_hda_config, ARRAY_SIZE(cs35l41_hda_config));
+
+	if (cs35l41->cs_dsp.running) {
+		asp_en |= CS35L41_ASP_TX1_EN_MASK; // ASP_TX1_EN = 1
+		regmap_multi_reg_write(reg, cs35l41_hda_config_dsp,
+				       ARRAY_SIZE(cs35l41_hda_config_dsp));
+		if (cs35l41->hw_cfg.bst_type == CS35L41_INT_BOOST)
+			regmap_write(reg, CS35L41_DSP1_RX5_SRC, CS35L41_INPUT_SRC_VPMON);
+		else
+			regmap_write(reg, CS35L41_DSP1_RX5_SRC, CS35L41_INPUT_SRC_VBSTMON);
+	} else {
+		regmap_multi_reg_write(reg, cs35l41_hda_config_no_dsp,
+				       ARRAY_SIZE(cs35l41_hda_config_no_dsp));
+	}
+
+	if (cs35l41->hw_cfg.spk_pos == CS35L41_CENTER) {
+		asp_en |= CS35L41_ASP_RX2_EN_MASK; // ASP_RX2_EN = 1
+		dsp1rx2_src = 0x00000009; // DSP1RX2 SRC = ASPRX2
+	} else {
+		dsp1rx2_src = 0x00000008; // DSP1RX2 SRC = ASPRX1
+	}
+
+	asp_en |= CS35L41_ASP_RX1_EN_MASK; // ASP_RX1_EN = 1
+
+	regmap_write(reg, CS35L41_SP_ENABLES, asp_en);
+	regmap_write(reg, CS35L41_DSP1_RX1_SRC, 0x00000008); // DSP1RX1 SRC = ASPRX1
+	regmap_write(reg, CS35L41_DSP1_RX2_SRC, dsp1rx2_src);
+}
+
 static void cs35l41_hda_play_start(struct device *dev)
 {
 	struct cs35l41_hda *cs35l41 = dev_get_drvdata(dev);
@@ -671,19 +695,13 @@ static void cs35l41_hda_play_start(struct device *dev)
 
 	cs35l41->playback_started = true;
 
+	cs35l41_update_mixer(cs35l41);
+
 	if (cs35l41->cs_dsp.running) {
-		regmap_multi_reg_write(reg, cs35l41_hda_config_dsp,
-				       ARRAY_SIZE(cs35l41_hda_config_dsp));
-		if (cs35l41->hw_cfg.bst_type == CS35L41_INT_BOOST)
-			regmap_write(reg, CS35L41_DSP1_RX5_SRC, CS35L41_INPUT_SRC_VPMON);
-		else
-			regmap_write(reg, CS35L41_DSP1_RX5_SRC, CS35L41_INPUT_SRC_VBSTMON);
 		regmap_update_bits(reg, CS35L41_PWR_CTRL2,
 				   CS35L41_VMON_EN_MASK | CS35L41_IMON_EN_MASK,
 				   1 << CS35L41_VMON_EN_SHIFT | 1 << CS35L41_IMON_EN_SHIFT);
 		cs35l41_set_cspl_mbox_cmd(cs35l41->dev, reg, CSPL_MBOX_CMD_RESUME);
-	} else {
-		regmap_multi_reg_write(reg, cs35l41_hda_config, ARRAY_SIZE(cs35l41_hda_config));
 	}
 	regmap_update_bits(reg, CS35L41_PWR_CTRL2, CS35L41_AMP_EN_MASK, 1 << CS35L41_AMP_EN_SHIFT);
 	if (cs35l41->hw_cfg.bst_type == CS35L41_EXT_BOOST)
@@ -841,22 +859,31 @@ static void cs35l41_hda_post_playback_hook(struct device *dev, int action)
 	}
 }
 
-static int cs35l41_hda_channel_map(struct device *dev, unsigned int tx_num, unsigned int *tx_slot,
-				    unsigned int rx_num, unsigned int *rx_slot)
+static int cs35l41_hda_channel_map(struct cs35l41_hda *cs35l41)
 {
-	struct cs35l41_hda *cs35l41 = dev_get_drvdata(dev);
-	static const char * const channel_name[] = { "L", "R" };
+	unsigned int tx_num = 0;
+	unsigned int *tx_slot = NULL;
+	unsigned int rx_num;
+	unsigned int *rx_slot;
+	unsigned int mono = 0;
 
 	if (!cs35l41->amp_name) {
-		if (*rx_slot >= ARRAY_SIZE(channel_name))
+		if (cs35l41->hw_cfg.spk_pos >= ARRAY_SIZE(channel_name))
 			return -EINVAL;
 
-		cs35l41->amp_name = devm_kasprintf(cs35l41->dev, GFP_KERNEL, "%s%d",
-						   channel_name[*rx_slot], cs35l41->channel_index);
+		cs35l41->amp_name = devm_kasprintf(cs35l41->dev, GFP_KERNEL, "%c%d",
+						   channel_name[cs35l41->hw_cfg.spk_pos],
+						   cs35l41->channel_index);
 		if (!cs35l41->amp_name)
 			return -ENOMEM;
 	}
 
+	rx_num = 1;
+	if (cs35l41->hw_cfg.spk_pos == CS35L41_CENTER)
+		rx_slot = &mono;
+	else
+		rx_slot = &cs35l41->hw_cfg.spk_pos;
+
 	return cs35l41_set_channels(cs35l41->dev, cs35l41->regmap, tx_num, tx_slot, rx_num,
 				    rx_slot);
 }
@@ -1495,7 +1522,7 @@ static int cs35l41_hda_bind(struct device *dev, struct device *master, void *mas
 		 "CS35L41 Bound - SSID: %s, BST: %d, VSPK: %d, CH: %c, FW EN: %d, SPKID: %d\n",
 		 cs35l41->acpi_subsystem_id, cs35l41->hw_cfg.bst_type,
 		 cs35l41->hw_cfg.gpio1.func == CS35l41_VSPK_SWITCH,
-		 cs35l41->hw_cfg.spk_pos ? 'R' : 'L',
+		 channel_name[cs35l41->hw_cfg.spk_pos],
 		 cs35l41->cs_dsp.running, cs35l41->speaker_id);
 
 	return ret;
@@ -1709,7 +1736,7 @@ static int cs35l41_hda_apply_properties(struct cs35l41_hda *cs35l41)
 	if (using_irq)
 		cs35l41_configure_interrupt(cs35l41, irq_pol);
 
-	return cs35l41_hda_channel_map(cs35l41->dev, 0, NULL, 1, &hw_cfg->spk_pos);
+	return cs35l41_hda_channel_map(cs35l41);
 }
 
 int cs35l41_get_speaker_id(struct device *dev, int amp_index, int num_amps, int fixed_gpio_id)
diff --git a/sound/pci/hda/cs35l41_hda.h b/sound/pci/hda/cs35l41_hda.h
index c730b3351589..7d003c598e93 100644
--- a/sound/pci/hda/cs35l41_hda.h
+++ b/sound/pci/hda/cs35l41_hda.h
@@ -42,6 +42,7 @@ struct cs35l41_amp_efi_data {
 enum cs35l41_hda_spk_pos {
 	CS35L41_LEFT,
 	CS35L41_RIGHT,
+	CS35L41_CENTER,
 };
 
 enum cs35l41_hda_gpio_function {
-- 
cgit v1.2.3


From ac90aad0e9bf7c37e706fdc08ce763a553890bdf Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Thu, 12 Jun 2025 10:47:09 -0700
Subject: crypto: testmgr - reinstate kconfig control over full self-tests

Commit 698de822780f ("crypto: testmgr - make it easier to enable the
full set of tests") removed support for building kernels that run only
the "fast" set of crypto self-tests by default.  This assumed that
nearly everyone actually wanted the full set of tests, *if* they had
already chosen to enable the tests at all.

Unfortunately, it turns out that both Debian and Fedora intentionally
have the crypto self-tests enabled in their production kernels.  And for
production kernels we do need to keep the testing time down, which
implies just running the "fast" tests, not the full set of tests.

For Fedora, a reason for enabling the tests in production is that they
are being (mis)used to meet the FIPS 140-3 pre-operational testing
requirement.

However, the other reason for enabling the tests in production, which
applies to both distros, is that they provide some value in protecting
users from buggy drivers.  Unfortunately, the crypto/ subsystem has many
buggy and untested drivers for off-CPU hardware accelerators on rare
platforms.  These broken drivers get shipped to users, and there have
been multiple examples of the tests preventing these buggy drivers from
being used.  So effectively, the tests are being relied on in production
kernels.  I think this is kind of crazy (untested drivers should just
not be enabled at all), but that seems to be how things work currently.

Thus, reintroduce a kconfig option that controls the level of testing.
Call it CRYPTO_SELFTESTS_FULL instead of the original name
CRYPTO_MANAGER_EXTRA_TESTS, which was slightly misleading.

Moreover, given the "production kernel" use case, make CRYPTO_SELFTESTS
depend on EXPERT instead of DEBUG_KERNEL.

I also haven't reinstated all the #ifdefs in crypto/testmgr.c.  Instead,
just rely on the compiler to optimize out unused code.

Fixes: 40b9969796bf ("crypto: testmgr - replace CRYPTO_MANAGER_DISABLE_TESTS with CRYPTO_SELFTESTS")
Fixes: 698de822780f ("crypto: testmgr - make it easier to enable the full set of tests")
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/Kconfig                 | 25 +++++++++++++++++++++----
 crypto/testmgr.c               | 15 ++++++++++++---
 include/crypto/internal/simd.h |  6 ++++--
 lib/crypto/Makefile            |  2 +-
 4 files changed, 38 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/crypto/Kconfig b/crypto/Kconfig
index e9fee7818e27..e1cfd0d4cc8f 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -176,16 +176,33 @@ config CRYPTO_USER
 
 config CRYPTO_SELFTESTS
 	bool "Enable cryptographic self-tests"
-	depends on DEBUG_KERNEL
+	depends on EXPERT
 	help
 	  Enable the cryptographic self-tests.
 
 	  The cryptographic self-tests run at boot time, or at algorithm
 	  registration time if algorithms are dynamically loaded later.
 
-	  This is primarily intended for developer use.  It should not be
-	  enabled in production kernels, unless you are trying to use these
-	  tests to fulfill a FIPS testing requirement.
+	  There are two main use cases for these tests:
+
+	  - Development and pre-release testing.  In this case, also enable
+	    CRYPTO_SELFTESTS_FULL to get the full set of tests.  All crypto code
+	    in the kernel is expected to pass the full set of tests.
+
+	  - Production kernels, to help prevent buggy drivers from being used
+	    and/or meet FIPS 140-3 pre-operational testing requirements.  In
+	    this case, enable CRYPTO_SELFTESTS but not CRYPTO_SELFTESTS_FULL.
+
+config CRYPTO_SELFTESTS_FULL
+	bool "Enable the full set of cryptographic self-tests"
+	depends on CRYPTO_SELFTESTS
+	help
+	  Enable the full set of cryptographic self-tests for each algorithm.
+
+	  The full set of tests should be enabled for development and
+	  pre-release testing, but not in production kernels.
+
+	  All crypto code in the kernel is expected to pass the full tests.
 
 config CRYPTO_NULL
 	tristate "Null algorithms"
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 72005074a5c2..32f753d6c430 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -45,6 +45,7 @@ static bool notests;
 module_param(notests, bool, 0644);
 MODULE_PARM_DESC(notests, "disable all crypto self-tests");
 
+#ifdef CONFIG_CRYPTO_SELFTESTS_FULL
 static bool noslowtests;
 module_param(noslowtests, bool, 0644);
 MODULE_PARM_DESC(noslowtests, "disable slow crypto self-tests");
@@ -52,6 +53,10 @@ MODULE_PARM_DESC(noslowtests, "disable slow crypto self-tests");
 static unsigned int fuzz_iterations = 100;
 module_param(fuzz_iterations, uint, 0644);
 MODULE_PARM_DESC(fuzz_iterations, "number of fuzz test iterations");
+#else
+#define noslowtests 1
+#define fuzz_iterations 0
+#endif
 
 #ifndef CONFIG_CRYPTO_SELFTESTS
 
@@ -319,9 +324,9 @@ struct testvec_config {
 
 /*
  * The following are the lists of testvec_configs to test for each algorithm
- * type when the fast crypto self-tests are enabled.  They aim to provide good
- * test coverage, while keeping the test time much shorter than the full tests
- * so that the fast tests can be used to fulfill FIPS 140 testing requirements.
+ * type when the "fast" crypto self-tests are enabled.  They aim to provide good
+ * test coverage, while keeping the test time much shorter than the "full" tests
+ * so that the "fast" tests can be enabled in a wider range of circumstances.
  */
 
 /* Configs for skciphers and aeads */
@@ -1183,14 +1188,18 @@ static void generate_random_testvec_config(struct rnd_state *rng,
 
 static void crypto_disable_simd_for_test(void)
 {
+#ifdef CONFIG_CRYPTO_SELFTESTS_FULL
 	migrate_disable();
 	__this_cpu_write(crypto_simd_disabled_for_test, true);
+#endif
 }
 
 static void crypto_reenable_simd_for_test(void)
 {
+#ifdef CONFIG_CRYPTO_SELFTESTS_FULL
 	__this_cpu_write(crypto_simd_disabled_for_test, false);
 	migrate_enable();
+#endif
 }
 
 /*
diff --git a/include/crypto/internal/simd.h b/include/crypto/internal/simd.h
index 7e7f1ac3b7fd..9e338e7aafbd 100644
--- a/include/crypto/internal/simd.h
+++ b/include/crypto/internal/simd.h
@@ -44,9 +44,11 @@ void simd_unregister_aeads(struct aead_alg *algs, int count,
  *
  * This delegates to may_use_simd(), except that this also returns false if SIMD
  * in crypto code has been temporarily disabled on this CPU by the crypto
- * self-tests, in order to test the no-SIMD fallback code.
+ * self-tests, in order to test the no-SIMD fallback code.  This override is
+ * currently limited to configurations where the "full" self-tests are enabled,
+ * because it might be a bit too invasive to be part of the "fast" self-tests.
  */
-#ifdef CONFIG_CRYPTO_SELFTESTS
+#ifdef CONFIG_CRYPTO_SELFTESTS_FULL
 DECLARE_PER_CPU(bool, crypto_simd_disabled_for_test);
 #define crypto_simd_usable() \
 	(may_use_simd() && !this_cpu_read(crypto_simd_disabled_for_test))
diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile
index 3e79283b617d..f9e44aac6619 100644
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -62,7 +62,7 @@ libsha256-generic-y				:= sha256-generic.o
 
 obj-$(CONFIG_MPILIB) += mpi/
 
-obj-$(CONFIG_CRYPTO_SELFTESTS)			+= simd.o
+obj-$(CONFIG_CRYPTO_SELFTESTS_FULL)		+= simd.o
 
 obj-$(CONFIG_CRYPTO_LIB_SM3)			+= libsm3.o
 libsm3-y					:= sm3.o
-- 
cgit v1.2.3


From 71203f68c7749609d7fc8ae6ad054bdedeb24f91 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 24 May 2025 20:32:20 +0800
Subject: padata: Fix pd UAF once and for all

There is a race condition/UAF in padata_reorder that goes back
to the initial commit.  A reference count is taken at the start
of the process in padata_do_parallel, and released at the end in
padata_serial_worker.

This reference count is (and only is) required for padata_replace
to function correctly.  If padata_replace is never called then
there is no issue.

In the function padata_reorder, which serves as the core of padata,
as soon as padata is added to squeue->serial.list and the associated
spin lock is released, that padata may be processed and the reference
count on pd would go away.

Fix this by getting the next padata before the squeue->serial lock
is released.

In order to make this possible, simplify padata_reorder by only
calling it once the next padata arrives.

Fixes: 16295bec6398 ("padata: Generic parallelization/serialization interface")
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/padata.h |   3 --
 kernel/padata.c        | 132 ++++++++++++++-----------------------------------
 2 files changed, 37 insertions(+), 98 deletions(-)

(limited to 'include')

diff --git a/include/linux/padata.h b/include/linux/padata.h
index 0146daf34430..b486c7359de2 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -91,7 +91,6 @@ struct padata_cpumask {
  * @cpu: Next CPU to be processed.
  * @cpumask: The cpumasks in use for parallel and serial workers.
  * @reorder_work: work struct for reordering.
- * @lock: Reorder lock.
  */
 struct parallel_data {
 	struct padata_shell		*ps;
@@ -102,8 +101,6 @@ struct parallel_data {
 	unsigned int			processed;
 	int				cpu;
 	struct padata_cpumask		cpumask;
-	struct work_struct		reorder_work;
-	spinlock_t                      ____cacheline_aligned lock;
 };
 
 /**
diff --git a/kernel/padata.c b/kernel/padata.c
index 7eee94166357..25cd3406477a 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -261,20 +261,17 @@ EXPORT_SYMBOL(padata_do_parallel);
  *   be parallel processed by another cpu and is not yet present in
  *   the cpu's reorder queue.
  */
-static struct padata_priv *padata_find_next(struct parallel_data *pd,
-					    bool remove_object)
+static struct padata_priv *padata_find_next(struct parallel_data *pd, int cpu,
+					    unsigned int processed)
 {
 	struct padata_priv *padata;
 	struct padata_list *reorder;
-	int cpu = pd->cpu;
 
 	reorder = per_cpu_ptr(pd->reorder_list, cpu);
 
 	spin_lock(&reorder->lock);
-	if (list_empty(&reorder->list)) {
-		spin_unlock(&reorder->lock);
-		return NULL;
-	}
+	if (list_empty(&reorder->list))
+		goto notfound;
 
 	padata = list_entry(reorder->list.next, struct padata_priv, list);
 
@@ -282,97 +279,52 @@ static struct padata_priv *padata_find_next(struct parallel_data *pd,
 	 * Checks the rare case where two or more parallel jobs have hashed to
 	 * the same CPU and one of the later ones finishes first.
 	 */
-	if (padata->seq_nr != pd->processed) {
-		spin_unlock(&reorder->lock);
-		return NULL;
-	}
-
-	if (remove_object) {
-		list_del_init(&padata->list);
-		++pd->processed;
-		pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu);
-	}
+	if (padata->seq_nr != processed)
+		goto notfound;
 
+	list_del_init(&padata->list);
 	spin_unlock(&reorder->lock);
 	return padata;
+
+notfound:
+	pd->processed = processed;
+	pd->cpu = cpu;
+	spin_unlock(&reorder->lock);
+	return NULL;
 }
 
-static void padata_reorder(struct parallel_data *pd)
+static void padata_reorder(struct padata_priv *padata)
 {
+	struct parallel_data *pd = padata->pd;
 	struct padata_instance *pinst = pd->ps->pinst;
-	int cb_cpu;
-	struct padata_priv *padata;
-	struct padata_serial_queue *squeue;
-	struct padata_list *reorder;
+	unsigned int processed;
+	int cpu;
 
-	/*
-	 * We need to ensure that only one cpu can work on dequeueing of
-	 * the reorder queue the time. Calculating in which percpu reorder
-	 * queue the next object will arrive takes some time. A spinlock
-	 * would be highly contended. Also it is not clear in which order
-	 * the objects arrive to the reorder queues. So a cpu could wait to
-	 * get the lock just to notice that there is nothing to do at the
-	 * moment. Therefore we use a trylock and let the holder of the lock
-	 * care for all the objects enqueued during the holdtime of the lock.
-	 */
-	if (!spin_trylock_bh(&pd->lock))
-		return;
+	processed = pd->processed;
+	cpu = pd->cpu;
 
-	while (1) {
-		padata = padata_find_next(pd, true);
+	do {
+		struct padata_serial_queue *squeue;
+		int cb_cpu;
 
-		/*
-		 * If the next object that needs serialization is parallel
-		 * processed by another cpu and is still on it's way to the
-		 * cpu's reorder queue, nothing to do for now.
-		 */
-		if (!padata)
-			break;
+		cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu);
+		processed++;
 
 		cb_cpu = padata->cb_cpu;
 		squeue = per_cpu_ptr(pd->squeue, cb_cpu);
 
 		spin_lock(&squeue->serial.lock);
 		list_add_tail(&padata->list, &squeue->serial.list);
-		spin_unlock(&squeue->serial.lock);
-
 		queue_work_on(cb_cpu, pinst->serial_wq, &squeue->work);
-	}
 
-	spin_unlock_bh(&pd->lock);
-
-	/*
-	 * The next object that needs serialization might have arrived to
-	 * the reorder queues in the meantime.
-	 *
-	 * Ensure reorder queue is read after pd->lock is dropped so we see
-	 * new objects from another task in padata_do_serial.  Pairs with
-	 * smp_mb in padata_do_serial.
-	 */
-	smp_mb();
-
-	reorder = per_cpu_ptr(pd->reorder_list, pd->cpu);
-	if (!list_empty(&reorder->list) && padata_find_next(pd, false)) {
 		/*
-		 * Other context(eg. the padata_serial_worker) can finish the request.
-		 * To avoid UAF issue, add pd ref here, and put pd ref after reorder_work finish.
+		 * If the next object that needs serialization is parallel
+		 * processed by another cpu and is still on it's way to the
+		 * cpu's reorder queue, end the loop.
 		 */
-		padata_get_pd(pd);
-		if (!queue_work(pinst->serial_wq, &pd->reorder_work))
-			padata_put_pd(pd);
-	}
-}
-
-static void invoke_padata_reorder(struct work_struct *work)
-{
-	struct parallel_data *pd;
-
-	local_bh_disable();
-	pd = container_of(work, struct parallel_data, reorder_work);
-	padata_reorder(pd);
-	local_bh_enable();
-	/* Pairs with putting the reorder_work in the serial_wq */
-	padata_put_pd(pd);
+		padata = padata_find_next(pd, cpu, processed);
+		spin_unlock(&squeue->serial.lock);
+	} while (padata);
 }
 
 static void padata_serial_worker(struct work_struct *serial_work)
@@ -423,6 +375,7 @@ void padata_do_serial(struct padata_priv *padata)
 	struct padata_list *reorder = per_cpu_ptr(pd->reorder_list, hashed_cpu);
 	struct padata_priv *cur;
 	struct list_head *pos;
+	bool gotit = true;
 
 	spin_lock(&reorder->lock);
 	/* Sort in ascending order of sequence number. */
@@ -432,17 +385,14 @@ void padata_do_serial(struct padata_priv *padata)
 		if ((signed int)(cur->seq_nr - padata->seq_nr) < 0)
 			break;
 	}
-	list_add(&padata->list, pos);
+	if (padata->seq_nr != pd->processed) {
+		gotit = false;
+		list_add(&padata->list, pos);
+	}
 	spin_unlock(&reorder->lock);
 
-	/*
-	 * Ensure the addition to the reorder list is ordered correctly
-	 * with the trylock of pd->lock in padata_reorder.  Pairs with smp_mb
-	 * in padata_reorder.
-	 */
-	smp_mb();
-
-	padata_reorder(pd);
+	if (gotit)
+		padata_reorder(padata);
 }
 EXPORT_SYMBOL(padata_do_serial);
 
@@ -632,9 +582,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps)
 	padata_init_squeues(pd);
 	pd->seq_nr = -1;
 	refcount_set(&pd->refcnt, 1);
-	spin_lock_init(&pd->lock);
 	pd->cpu = cpumask_first(pd->cpumask.pcpu);
-	INIT_WORK(&pd->reorder_work, invoke_padata_reorder);
 
 	return pd;
 
@@ -1144,12 +1092,6 @@ void padata_free_shell(struct padata_shell *ps)
 	if (!ps)
 		return;
 
-	/*
-	 * Wait for all _do_serial calls to finish to avoid touching
-	 * freed pd's and ps's.
-	 */
-	synchronize_rcu();
-
 	mutex_lock(&ps->pinst->lock);
 	list_del(&ps->list);
 	pd = rcu_dereference_protected(ps->pd, 1);
-- 
cgit v1.2.3


From 09735f0624b494c0959f3327af009283567af320 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 10 Jun 2025 13:27:13 +0530
Subject: smp: Fix typo in comment for raw_smp_processor_id()

The comment in `smp.h` incorrectly refers to `raw_processor_id()`
instead of the correct function name `raw_smp_processor_id()`.

Suggested-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Boqun Feng <boqun.feng@gmail.com>
Link: https://lore.kernel.org/all/d096779819962c305b85cd12bda41b593e0981aa.1749536622.git.viresh.kumar@linaro.org
---
 include/linux/smp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/smp.h b/include/linux/smp.h
index f1aa0952e8c3..bea8d2826e09 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -234,7 +234,7 @@ static inline int get_boot_cpu_id(void)
 #endif /* !SMP */
 
 /**
- * raw_processor_id() - get the current (unstable) CPU id
+ * raw_smp_processor_id() - get the current (unstable) CPU id
  *
  * For then you know what you are doing and need an unstable
  * CPU id.
-- 
cgit v1.2.3


From a2fc422ed75748eef2985454e97847fb22f873c2 Mon Sep 17 00:00:00 2001
From: Dmitry Vyukov <dvyukov@google.com>
Date: Wed, 21 May 2025 17:04:29 +0200
Subject: syscall_user_dispatch: Add PR_SYS_DISPATCH_INCLUSIVE_ON

There are two possible scenarios for syscall filtering:
 - having a trusted/allowed range of PCs, and intercepting everything else
 - or the opposite: a single untrusted/intercepted range and allowing
   everything else (this is relevant for any kind of sandboxing scenario,
   or monitoring behavior of a single library)

The current API only allows the former use case due to the allowed
range wrap-around check. Add PR_SYS_DISPATCH_INCLUSIVE_ON, which
enables the second use case.

Add PR_SYS_DISPATCH_EXCLUSIVE_ON alias for PR_SYS_DISPATCH_ON
to make it clear how it's different from the new
PR_SYS_DISPATCH_INCLUSIVE_ON.

Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/97947cc8e205ff49675826d7b0327ef2e2c66eea.1747839857.git.dvyukov@google.com
---
 .../admin-guide/syscall-user-dispatch.rst          | 23 ++++++++------
 include/uapi/linux/prctl.h                         |  7 ++++-
 kernel/entry/syscall_user_dispatch.c               | 36 ++++++++++++++--------
 tools/include/uapi/linux/prctl.h                   |  7 ++++-
 4 files changed, 49 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/Documentation/admin-guide/syscall-user-dispatch.rst b/Documentation/admin-guide/syscall-user-dispatch.rst
index e3cfffef5a63..c1768d9e80fa 100644
--- a/Documentation/admin-guide/syscall-user-dispatch.rst
+++ b/Documentation/admin-guide/syscall-user-dispatch.rst
@@ -53,20 +53,25 @@ following prctl:
 
   prctl(PR_SET_SYSCALL_USER_DISPATCH, <op>, <offset>, <length>, [selector])
 
-<op> is either PR_SYS_DISPATCH_ON or PR_SYS_DISPATCH_OFF, to enable and
-disable the mechanism globally for that thread.  When
-PR_SYS_DISPATCH_OFF is used, the other fields must be zero.
-
-[<offset>, <offset>+<length>) delimit a memory region interval
-from which syscalls are always executed directly, regardless of the
-userspace selector.  This provides a fast path for the C library, which
-includes the most common syscall dispatchers in the native code
-applications, and also provides a way for the signal handler to return
+<op> is either PR_SYS_DISPATCH_EXCLUSIVE_ON/PR_SYS_DISPATCH_INCLUSIVE_ON
+or PR_SYS_DISPATCH_OFF, to enable and disable the mechanism globally for
+that thread.  When PR_SYS_DISPATCH_OFF is used, the other fields must be zero.
+
+For PR_SYS_DISPATCH_EXCLUSIVE_ON [<offset>, <offset>+<length>) delimit
+a memory region interval from which syscalls are always executed directly,
+regardless of the userspace selector.  This provides a fast path for the
+C library, which includes the most common syscall dispatchers in the native
+code applications, and also provides a way for the signal handler to return
 without triggering a nested SIGSYS on (rt\_)sigreturn.  Users of this
 interface should make sure that at least the signal trampoline code is
 included in this region. In addition, for syscalls that implement the
 trampoline code on the vDSO, that trampoline is never intercepted.
 
+For PR_SYS_DISPATCH_INCLUSIVE_ON [<offset>, <offset>+<length>) delimit
+a memory region interval from which syscalls are dispatched based on
+the userspace selector. Syscalls from outside of the range are always
+executed directly.
+
 [selector] is a pointer to a char-sized region in the process memory
 region, that provides a quick way to enable disable syscall redirection
 thread-wide, without the need to invoke the kernel directly.  selector
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 43dec6eed559..9785c1d49f05 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -255,7 +255,12 @@ struct prctl_mm_map {
 /* Dispatch syscalls to a userspace handler */
 #define PR_SET_SYSCALL_USER_DISPATCH	59
 # define PR_SYS_DISPATCH_OFF		0
-# define PR_SYS_DISPATCH_ON		1
+/* Enable dispatch except for the specified range */
+# define PR_SYS_DISPATCH_EXCLUSIVE_ON	1
+/* Enable dispatch for the specified range */
+# define PR_SYS_DISPATCH_INCLUSIVE_ON	2
+/* Legacy name for backwards compatibility */
+# define PR_SYS_DISPATCH_ON		PR_SYS_DISPATCH_EXCLUSIVE_ON
 /* The control values for the user space selector when dispatch is enabled */
 # define SYSCALL_DISPATCH_FILTER_ALLOW	0
 # define SYSCALL_DISPATCH_FILTER_BLOCK	1
diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c
index 5340c5aa89e7..a9055eccb27e 100644
--- a/kernel/entry/syscall_user_dispatch.c
+++ b/kernel/entry/syscall_user_dispatch.c
@@ -78,7 +78,7 @@ static int task_set_syscall_user_dispatch(struct task_struct *task, unsigned lon
 		if (offset || len || selector)
 			return -EINVAL;
 		break;
-	case PR_SYS_DISPATCH_ON:
+	case PR_SYS_DISPATCH_EXCLUSIVE_ON:
 		/*
 		 * Validate the direct dispatcher region just for basic
 		 * sanity against overflow and a 0-sized dispatcher
@@ -87,30 +87,40 @@ static int task_set_syscall_user_dispatch(struct task_struct *task, unsigned lon
 		 */
 		if (offset && offset + len <= offset)
 			return -EINVAL;
-
+		break;
+	case PR_SYS_DISPATCH_INCLUSIVE_ON:
+		if (len == 0 || offset + len <= offset)
+			return -EINVAL;
 		/*
-		 * access_ok() will clear memory tags for tagged addresses
-		 * if current has memory tagging enabled.
-
-		 * To enable a tracer to set a tracees selector the
-		 * selector address must be untagged for access_ok(),
-		 * otherwise an untagged tracer will always fail to set a
-		 * tagged tracees selector.
+		 * Invert the range, the check in syscall_user_dispatch()
+		 * supports wrap-around.
 		 */
-		if (selector && !access_ok(untagged_addr(selector), sizeof(*selector)))
-			return -EFAULT;
-
+		offset = offset + len;
+		len = -len;
 		break;
 	default:
 		return -EINVAL;
 	}
 
+	/*
+	 * access_ok() will clear memory tags for tagged addresses
+	 * if current has memory tagging enabled.
+	 *
+	 * To enable a tracer to set a tracees selector the
+	 * selector address must be untagged for access_ok(),
+	 * otherwise an untagged tracer will always fail to set a
+	 * tagged tracees selector.
+	 */
+	if (mode != PR_SYS_DISPATCH_OFF && selector &&
+		!access_ok(untagged_addr(selector), sizeof(*selector)))
+		return -EFAULT;
+
 	task->syscall_dispatch.selector = selector;
 	task->syscall_dispatch.offset = offset;
 	task->syscall_dispatch.len = len;
 	task->syscall_dispatch.on_dispatch = false;
 
-	if (mode == PR_SYS_DISPATCH_ON)
+	if (mode != PR_SYS_DISPATCH_OFF)
 		set_task_syscall_work(task, SYSCALL_USER_DISPATCH);
 	else
 		clear_task_syscall_work(task, SYSCALL_USER_DISPATCH);
diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h
index 43dec6eed559..9785c1d49f05 100644
--- a/tools/include/uapi/linux/prctl.h
+++ b/tools/include/uapi/linux/prctl.h
@@ -255,7 +255,12 @@ struct prctl_mm_map {
 /* Dispatch syscalls to a userspace handler */
 #define PR_SET_SYSCALL_USER_DISPATCH	59
 # define PR_SYS_DISPATCH_OFF		0
-# define PR_SYS_DISPATCH_ON		1
+/* Enable dispatch except for the specified range */
+# define PR_SYS_DISPATCH_EXCLUSIVE_ON	1
+/* Enable dispatch for the specified range */
+# define PR_SYS_DISPATCH_INCLUSIVE_ON	2
+/* Legacy name for backwards compatibility */
+# define PR_SYS_DISPATCH_ON		PR_SYS_DISPATCH_EXCLUSIVE_ON
 /* The control values for the user space selector when dispatch is enabled */
 # define SYSCALL_DISPATCH_FILTER_ALLOW	0
 # define SYSCALL_DISPATCH_FILTER_BLOCK	1
-- 
cgit v1.2.3


From b776999bf25ddca9880bc3c9c30b8f84a748504b Mon Sep 17 00:00:00 2001
From: RubenKelevra <rubenkelevra@gmail.com>
Date: Thu, 12 Jun 2025 16:50:12 +0200
Subject: net: pfcp: fix typo in message_priority field name

The field is spelled "message_priprity" in the big-endian bit-field
definition.  Nothing in-tree currently references the member, so the
typo does not break kernel builds, but it is clearly incorrect.

Signed-off-by: RubenKelevra <rubenkelevra@gmail.com>
Link: https://patch.msgid.link/20250612145012.185321-1-rubenkelevra@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/pfcp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/pfcp.h b/include/net/pfcp.h
index af14f970b80e..639553797d3e 100644
--- a/include/net/pfcp.h
+++ b/include/net/pfcp.h
@@ -45,7 +45,7 @@ struct pfcphdr_session {
 		reserved:4;
 #elif defined(__BIG_ENDIAN_BITFIELD)
 	u8	reserved:4,
-		message_priprity:4;
+		message_priority:4;
 #else
 #error "Please fix <asm/byteorder>"
 #endif
-- 
cgit v1.2.3


From 91695b8592638c85dc78a15d59250c62b9c68891 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Thu, 12 Jun 2025 16:21:04 +0100
Subject: net: phy: improve rgmii_clock() documentation

Improve the rgmii_clock() documentation to indicate that it can also
be used for MII, GMII and RMII modes as well as RGMII as the required
clock rates are identical, but note that it won't error out for 1G
speeds for MII and RMII.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/E1uPjjk-0049pI-MD@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/phy.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index c4d8f7c82627..8e2e4fcd050e 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -269,8 +269,10 @@ static inline const char *phy_modes(phy_interface_t interface)
  * rgmii_clock - map link speed to the clock rate
  * @speed: link speed value
  *
- * Description: maps RGMII supported link speeds
- * into the clock rates.
+ * Description: maps RGMII supported link speeds into the clock rates.
+ * This can also be used for MII, GMII, and RMII interface modes as the
+ * clock rates are identical, but the caller must be aware that errors
+ * for unsupported clock rates will not be signalled.
  *
  * Returns: clock rate or negative errno
  */
-- 
cgit v1.2.3


From c035e736038045b411cb368e63f07bc2f5dbc0e1 Mon Sep 17 00:00:00 2001
From: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Date: Thu, 12 Jun 2025 17:28:33 +0200
Subject: dpll: add phase-offset-monitor feature to netlink spec

Add enum dpll_feature_state for control over features.

Add dpll device level attribute:
DPLL_A_PHASE_OFFSET_MONITOR - to allow control over a phase offset monitor
feature. Attribute is present and shall return current state of a feature
(enum dpll_feature_state), if the device driver provides such capability,
otherwise attribute shall not be present.

Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Reviewed-by: Milena Olech <milena.olech@intel.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Acked-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://patch.msgid.link/20250612152835.1703397-2-arkadiusz.kubalewski@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/driver-api/dpll.rst     | 18 ++++++++++++++++++
 Documentation/netlink/specs/dpll.yaml | 24 ++++++++++++++++++++++++
 drivers/dpll/dpll_nl.c                |  5 +++--
 include/uapi/linux/dpll.h             | 12 ++++++++++++
 4 files changed, 57 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/Documentation/driver-api/dpll.rst b/Documentation/driver-api/dpll.rst
index e6855cd37e85..195e1e5d9a58 100644
--- a/Documentation/driver-api/dpll.rst
+++ b/Documentation/driver-api/dpll.rst
@@ -214,6 +214,24 @@ offset values are fractional with 3-digit decimal places and shell be
 divided with ``DPLL_PIN_PHASE_OFFSET_DIVIDER`` to get integer part and
 modulo divided to get fractional part.
 
+Phase offset monitor
+====================
+
+Phase offset measurement is typically performed against the current active
+source. However, some DPLL (Digital Phase-Locked Loop) devices may offer
+the capability to monitor phase offsets across all available inputs.
+The attribute and current feature state shall be included in the response
+message of the ``DPLL_CMD_DEVICE_GET`` command for supported DPLL devices.
+In such cases, users can also control the feature using the
+``DPLL_CMD_DEVICE_SET`` command by setting the ``enum dpll_feature_state``
+values for the attribute.
+Once enabled the phase offset measurements for the input shall be returned
+in the ``DPLL_A_PIN_PHASE_OFFSET`` attribute.
+
+  =============================== ========================
+  ``DPLL_A_PHASE_OFFSET_MONITOR`` attr state of a feature
+  =============================== ========================
+
 Embedded SYNC
 =============
 
diff --git a/Documentation/netlink/specs/dpll.yaml b/Documentation/netlink/specs/dpll.yaml
index 115d1a8f50bd..3bd6851c1d3c 100644
--- a/Documentation/netlink/specs/dpll.yaml
+++ b/Documentation/netlink/specs/dpll.yaml
@@ -240,6 +240,20 @@ definitions:
       integer part of a measured phase offset value.
       Value of (DPLL_A_PHASE_OFFSET % DPLL_PHASE_OFFSET_DIVIDER) is a
       fractional part of a measured phase offset value.
+  -
+    type: enum
+    name: feature-state
+    doc: |
+      Allow control (enable/disable) and status checking over features.
+    entries:
+      -
+        name: disable
+        doc: |
+          feature shall be disabled
+      -
+        name: enable
+        doc: |
+          feature shall be enabled
 
 attribute-sets:
   -
@@ -293,6 +307,14 @@ attribute-sets:
           be put to message multiple times to indicate possible parallel
           quality levels (e.g. one specified by ITU option 1 and another
           one specified by option 2).
+      -
+        name: phase-offset-monitor
+        type: u32
+        enum: feature-state
+        doc: Receive or request state of phase offset monitor feature.
+          If enabled, dpll device shall monitor and notify all currently
+          available inputs for changes of their phase offset against the
+          dpll device.
   -
     name: pin
     enum-name: dpll_a_pin
@@ -483,6 +505,7 @@ operations:
             - temp
             - clock-id
             - type
+            - phase-offset-monitor
 
       dump:
         reply: *dev-attrs
@@ -499,6 +522,7 @@ operations:
         request:
           attributes:
             - id
+            - phase-offset-monitor
     -
       name: device-create-ntf
       doc: Notification about device appearing
diff --git a/drivers/dpll/dpll_nl.c b/drivers/dpll/dpll_nl.c
index fe9b6893d261..8de90310c3be 100644
--- a/drivers/dpll/dpll_nl.c
+++ b/drivers/dpll/dpll_nl.c
@@ -37,8 +37,9 @@ static const struct nla_policy dpll_device_get_nl_policy[DPLL_A_ID + 1] = {
 };
 
 /* DPLL_CMD_DEVICE_SET - do */
-static const struct nla_policy dpll_device_set_nl_policy[DPLL_A_ID + 1] = {
+static const struct nla_policy dpll_device_set_nl_policy[DPLL_A_PHASE_OFFSET_MONITOR + 1] = {
 	[DPLL_A_ID] = { .type = NLA_U32, },
+	[DPLL_A_PHASE_OFFSET_MONITOR] = NLA_POLICY_MAX(NLA_U32, 1),
 };
 
 /* DPLL_CMD_PIN_ID_GET - do */
@@ -105,7 +106,7 @@ static const struct genl_split_ops dpll_nl_ops[] = {
 		.doit		= dpll_nl_device_set_doit,
 		.post_doit	= dpll_post_doit,
 		.policy		= dpll_device_set_nl_policy,
-		.maxattr	= DPLL_A_ID,
+		.maxattr	= DPLL_A_PHASE_OFFSET_MONITOR,
 		.flags		= GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
 	},
 	{
diff --git a/include/uapi/linux/dpll.h b/include/uapi/linux/dpll.h
index bf97d4b6d51f..349e1b3ca1ae 100644
--- a/include/uapi/linux/dpll.h
+++ b/include/uapi/linux/dpll.h
@@ -192,6 +192,17 @@ enum dpll_pin_capabilities {
 
 #define DPLL_PHASE_OFFSET_DIVIDER	1000
 
+/**
+ * enum dpll_feature_state - Allow control (enable/disable) and status checking
+ *   over features.
+ * @DPLL_FEATURE_STATE_DISABLE: feature shall be disabled
+ * @DPLL_FEATURE_STATE_ENABLE: feature shall be enabled
+ */
+enum dpll_feature_state {
+	DPLL_FEATURE_STATE_DISABLE,
+	DPLL_FEATURE_STATE_ENABLE,
+};
+
 enum dpll_a {
 	DPLL_A_ID = 1,
 	DPLL_A_MODULE_NAME,
@@ -204,6 +215,7 @@ enum dpll_a {
 	DPLL_A_TYPE,
 	DPLL_A_LOCK_STATUS_ERROR,
 	DPLL_A_CLOCK_QUALITY_LEVEL,
+	DPLL_A_PHASE_OFFSET_MONITOR,
 
 	__DPLL_A_MAX,
 	DPLL_A_MAX = (__DPLL_A_MAX - 1)
-- 
cgit v1.2.3


From 2952daf44a84670a6aa9e13edbc105bdab83ccba Mon Sep 17 00:00:00 2001
From: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Date: Thu, 12 Jun 2025 17:28:34 +0200
Subject: dpll: add phase_offset_monitor_get/set callback ops

Add new callback operations for a dpll device:
- phase_offset_monitor_get(..) - to obtain current state of phase offset
  monitor feature from dpll device,
- phase_offset_monitor_set(..) - to allow feature configuration.

Obtain the feature state value using the get callback and provide it to
the user if the device driver implements callbacks.

Execute the set callback upon user requests.

Reviewed-by: Milena Olech <milena.olech@intel.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Acked-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://patch.msgid.link/20250612152835.1703397-3-arkadiusz.kubalewski@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/dpll/dpll_netlink.c | 69 +++++++++++++++++++++++++++++++++++++++++++--
 include/linux/dpll.h        |  8 ++++++
 2 files changed, 75 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c
index c130f87147fa..4619aaa18b9c 100644
--- a/drivers/dpll/dpll_netlink.c
+++ b/drivers/dpll/dpll_netlink.c
@@ -126,6 +126,26 @@ dpll_msg_add_mode_supported(struct sk_buff *msg, struct dpll_device *dpll,
 	return 0;
 }
 
+static int
+dpll_msg_add_phase_offset_monitor(struct sk_buff *msg, struct dpll_device *dpll,
+				  struct netlink_ext_ack *extack)
+{
+	const struct dpll_device_ops *ops = dpll_device_ops(dpll);
+	enum dpll_feature_state state;
+	int ret;
+
+	if (ops->phase_offset_monitor_set && ops->phase_offset_monitor_get) {
+		ret = ops->phase_offset_monitor_get(dpll, dpll_priv(dpll),
+						    &state, extack);
+		if (ret)
+			return ret;
+		if (nla_put_u32(msg, DPLL_A_PHASE_OFFSET_MONITOR, state))
+			return -EMSGSIZE;
+	}
+
+	return 0;
+}
+
 static int
 dpll_msg_add_lock_status(struct sk_buff *msg, struct dpll_device *dpll,
 			 struct netlink_ext_ack *extack)
@@ -591,6 +611,9 @@ dpll_device_get_one(struct dpll_device *dpll, struct sk_buff *msg,
 		return ret;
 	if (nla_put_u32(msg, DPLL_A_TYPE, dpll->type))
 		return -EMSGSIZE;
+	ret = dpll_msg_add_phase_offset_monitor(msg, dpll, extack);
+	if (ret)
+		return ret;
 
 	return 0;
 }
@@ -746,6 +769,31 @@ int dpll_pin_change_ntf(struct dpll_pin *pin)
 }
 EXPORT_SYMBOL_GPL(dpll_pin_change_ntf);
 
+static int
+dpll_phase_offset_monitor_set(struct dpll_device *dpll, struct nlattr *a,
+			      struct netlink_ext_ack *extack)
+{
+	const struct dpll_device_ops *ops = dpll_device_ops(dpll);
+	enum dpll_feature_state state = nla_get_u32(a), old_state;
+	int ret;
+
+	if (!(ops->phase_offset_monitor_set && ops->phase_offset_monitor_get)) {
+		NL_SET_ERR_MSG_ATTR(extack, a, "dpll device not capable of phase offset monitor");
+		return -EOPNOTSUPP;
+	}
+	ret = ops->phase_offset_monitor_get(dpll, dpll_priv(dpll), &old_state,
+					    extack);
+	if (ret) {
+		NL_SET_ERR_MSG(extack, "unable to get current state of phase offset monitor");
+		return ret;
+	}
+	if (state == old_state)
+		return 0;
+
+	return ops->phase_offset_monitor_set(dpll, dpll_priv(dpll), state,
+					     extack);
+}
+
 static int
 dpll_pin_freq_set(struct dpll_pin *pin, struct nlattr *a,
 		  struct netlink_ext_ack *extack)
@@ -1533,12 +1581,29 @@ int dpll_nl_device_get_doit(struct sk_buff *skb, struct genl_info *info)
 	return genlmsg_reply(msg, info);
 }
 
-int dpll_nl_device_set_doit(struct sk_buff *skb, struct genl_info *info)
+static int
+dpll_set_from_nlattr(struct dpll_device *dpll, struct genl_info *info)
 {
-	/* placeholder for set command */
+	int ret;
+
+	if (info->attrs[DPLL_A_PHASE_OFFSET_MONITOR]) {
+		struct nlattr *a = info->attrs[DPLL_A_PHASE_OFFSET_MONITOR];
+
+		ret = dpll_phase_offset_monitor_set(dpll, a, info->extack);
+		if (ret)
+			return ret;
+	}
+
 	return 0;
 }
 
+int dpll_nl_device_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct dpll_device *dpll = info->user_ptr[0];
+
+	return dpll_set_from_nlattr(dpll, info);
+}
+
 int dpll_nl_device_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct dpll_dump_ctx *ctx = dpll_dump_context(cb);
diff --git a/include/linux/dpll.h b/include/linux/dpll.h
index 5e4f9ab1cf75..6ad6c2968a28 100644
--- a/include/linux/dpll.h
+++ b/include/linux/dpll.h
@@ -30,6 +30,14 @@ struct dpll_device_ops {
 				       void *dpll_priv,
 				       unsigned long *qls,
 				       struct netlink_ext_ack *extack);
+	int (*phase_offset_monitor_set)(const struct dpll_device *dpll,
+					void *dpll_priv,
+					enum dpll_feature_state state,
+					struct netlink_ext_ack *extack);
+	int (*phase_offset_monitor_get)(const struct dpll_device *dpll,
+					void *dpll_priv,
+					enum dpll_feature_state *state,
+					struct netlink_ext_ack *extack);
 };
 
 struct dpll_pin_ops {
-- 
cgit v1.2.3


From 385a766bed48c5bcf620061f24e864dafeca671a Mon Sep 17 00:00:00 2001
From: Igor Belwon <igor.belwon@mentallysanemainliners.org>
Date: Thu, 15 May 2025 16:43:02 +0200
Subject: phy: exynos5-usbdrd: Add support for the Exynos990 usbdrd phy

The Exynos990 usbdrd PHY is a combo PHY which supports USB SS, HS and
DisplayPort outputs. This commit adds support only for UTMI+ (USB HS).

Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Igor Belwon <igor.belwon@mentallysanemainliners.org>
Reviewed-by: Neil Armstrong <neil.armstrong@linaro.org>
Link: https://lore.kernel.org/r/20250515-usb-resends-may-15-v3-2-ad33a85b6cee@mentallysanemainliners.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/phy/samsung/phy-exynos5-usbdrd.c    | 32 +++++++++++++++++++++++++++++
 include/linux/soc/samsung/exynos-regs-pmu.h |  3 +++
 2 files changed, 35 insertions(+)

(limited to 'include')

diff --git a/drivers/phy/samsung/phy-exynos5-usbdrd.c b/drivers/phy/samsung/phy-exynos5-usbdrd.c
index 917a76d584f0..dd660ebe8045 100644
--- a/drivers/phy/samsung/phy-exynos5-usbdrd.c
+++ b/drivers/phy/samsung/phy-exynos5-usbdrd.c
@@ -2025,6 +2025,35 @@ static const struct exynos5_usbdrd_phy_drvdata exynos850_usbdrd_phy = {
 	.n_regulators		= ARRAY_SIZE(exynos5_regulator_names),
 };
 
+static const struct exynos5_usbdrd_phy_tuning exynos990_tunes_utmi_postinit[] = {
+	PHY_TUNING_ENTRY_PHY(EXYNOS850_DRD_HSPPARACON,
+			     (HSPPARACON_TXVREF |
+			      HSPPARACON_TXPREEMPAMP | HSPPARACON_SQRX |
+			      HSPPARACON_COMPDIS),
+			     (FIELD_PREP_CONST(HSPPARACON_TXVREF, 7) |
+			      FIELD_PREP_CONST(HSPPARACON_TXPREEMPAMP, 3) |
+			      FIELD_PREP_CONST(HSPPARACON_SQRX, 5) |
+			      FIELD_PREP_CONST(HSPPARACON_COMPDIS, 7))),
+	PHY_TUNING_ENTRY_LAST
+};
+
+static const struct exynos5_usbdrd_phy_tuning *exynos990_tunes[PTS_MAX] = {
+	[PTS_UTMI_POSTINIT] = exynos990_tunes_utmi_postinit,
+};
+
+static const struct exynos5_usbdrd_phy_drvdata exynos990_usbdrd_phy = {
+	.phy_cfg		= phy_cfg_exynos850,
+	.phy_ops		= &exynos850_usbdrd_phy_ops,
+	.phy_tunes		= exynos990_tunes,
+	.pmu_offset_usbdrd0_phy	= EXYNOS990_PHY_CTRL_USB20,
+	.clk_names		= exynos5_clk_names,
+	.n_clks			= ARRAY_SIZE(exynos5_clk_names),
+	.core_clk_names		= exynos5_core_clk_names,
+	.n_core_clks		= ARRAY_SIZE(exynos5_core_clk_names),
+	.regulator_names	= exynos5_regulator_names,
+	.n_regulators		= ARRAY_SIZE(exynos5_regulator_names),
+};
+
 static const struct exynos5_usbdrd_phy_config phy_cfg_gs101[] = {
 	{
 		.id		= EXYNOS5_DRDPHY_UTMI,
@@ -2228,6 +2257,9 @@ static const struct of_device_id exynos5_usbdrd_phy_of_match[] = {
 	}, {
 		.compatible = "samsung,exynos850-usbdrd-phy",
 		.data = &exynos850_usbdrd_phy
+	}, {
+		.compatible = "samsung,exynos990-usbdrd-phy",
+		.data = &exynos990_usbdrd_phy
 	},
 	{ },
 };
diff --git a/include/linux/soc/samsung/exynos-regs-pmu.h b/include/linux/soc/samsung/exynos-regs-pmu.h
index 1a2c0e0838f9..7754697e5810 100644
--- a/include/linux/soc/samsung/exynos-regs-pmu.h
+++ b/include/linux/soc/samsung/exynos-regs-pmu.h
@@ -662,6 +662,9 @@
 #define EXYNOS5433_PAD_RETENTION_UFS_OPTION			(0x3268)
 #define EXYNOS5433_PAD_RETENTION_FSYSGENIO_OPTION		(0x32A8)
 
+/* For Exynos990 */
+#define EXYNOS990_PHY_CTRL_USB20				(0x72C)
+
 /* For Tensor GS101 */
 /* PMU ALIVE */
 #define GS101_SYSIP_DAT0					(0x810)
-- 
cgit v1.2.3


From 0c17270f9b920e4e1777488f1911bbfdaf2af3be Mon Sep 17 00:00:00 2001
From: Yajun Deng <yajun.deng@linux.dev>
Date: Thu, 12 Jun 2025 14:27:07 +0000
Subject: net: sysfs: Implement is_visible for phys_(port_id, port_name,
 switch_id)

phys_port_id_show, phys_port_name_show and phys_switch_id_show would
return -EOPNOTSUPP if the netdev didn't implement the corresponding
method.

There is no point in creating these files if they are unsupported.

Put these attributes in netdev_phys_group and implement the is_visible
method. Make phys_(port_id, port_name, switch_id) invisible if the netdev
doesn't implement the corresponding method.

Signed-off-by: Yajun Deng <yajun.deng@linux.dev>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20250612142707.4644-1-yajun.deng@linux.dev
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h |  2 +-
 net/core/net-sysfs.c      | 59 ++++++++++++++++++++++++++++-------------------
 2 files changed, 36 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index adb14db25798..9cbc4e54b7e4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2388,7 +2388,7 @@ struct net_device {
 	struct dm_hw_stat_delta __rcu *dm_private;
 #endif
 	struct device		dev;
-	const struct attribute_group *sysfs_groups[4];
+	const struct attribute_group *sysfs_groups[5];
 	const struct attribute_group *sysfs_rx_queue_group;
 
 	const struct rtnl_link_ops *rtnl_link_ops;
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 1ace0cd01adc..c9b969386399 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -641,12 +641,6 @@ static ssize_t phys_port_id_show(struct device *dev,
 	struct netdev_phys_item_id ppid;
 	ssize_t ret;
 
-	/* The check is also done in dev_get_phys_port_id; this helps returning
-	 * early without hitting the locking section below.
-	 */
-	if (!netdev->netdev_ops->ndo_get_phys_port_id)
-		return -EOPNOTSUPP;
-
 	ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
 	if (ret)
 		return ret;
@@ -668,13 +662,6 @@ static ssize_t phys_port_name_show(struct device *dev,
 	char name[IFNAMSIZ];
 	ssize_t ret;
 
-	/* The checks are also done in dev_get_phys_port_name; this helps
-	 * returning early without hitting the locking section below.
-	 */
-	if (!netdev->netdev_ops->ndo_get_phys_port_name &&
-	    !netdev->devlink_port)
-		return -EOPNOTSUPP;
-
 	ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
 	if (ret)
 		return ret;
@@ -696,14 +683,6 @@ static ssize_t phys_switch_id_show(struct device *dev,
 	struct netdev_phys_item_id ppid = { };
 	ssize_t ret;
 
-	/* The checks are also done in dev_get_phys_port_name; this helps
-	 * returning early without hitting the locking section below. This works
-	 * because recurse is false when calling dev_get_port_parent_id.
-	 */
-	if (!netdev->netdev_ops->ndo_get_port_parent_id &&
-	    !netdev->devlink_port)
-		return -EOPNOTSUPP;
-
 	ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
 	if (ret)
 		return ret;
@@ -718,6 +697,40 @@ static ssize_t phys_switch_id_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(phys_switch_id);
 
+static struct attribute *netdev_phys_attrs[] __ro_after_init = {
+	&dev_attr_phys_port_id.attr,
+	&dev_attr_phys_port_name.attr,
+	&dev_attr_phys_switch_id.attr,
+	NULL,
+};
+
+static umode_t netdev_phys_is_visible(struct kobject *kobj,
+				      struct attribute *attr, int index)
+{
+	struct device *dev = kobj_to_dev(kobj);
+	struct net_device *netdev = to_net_dev(dev);
+
+	if (attr == &dev_attr_phys_port_id.attr) {
+		if (!netdev->netdev_ops->ndo_get_phys_port_id)
+			return 0;
+	} else if (attr == &dev_attr_phys_port_name.attr) {
+		if (!netdev->netdev_ops->ndo_get_phys_port_name &&
+		    !netdev->devlink_port)
+			return 0;
+	} else if (attr == &dev_attr_phys_switch_id.attr) {
+		if (!netdev->netdev_ops->ndo_get_port_parent_id &&
+		    !netdev->devlink_port)
+			return 0;
+	}
+
+	return attr->mode;
+}
+
+static const struct attribute_group netdev_phys_group = {
+	.attrs = netdev_phys_attrs,
+	.is_visible = netdev_phys_is_visible,
+};
+
 static ssize_t threaded_show(struct device *dev,
 			     struct device_attribute *attr, char *buf)
 {
@@ -783,9 +796,6 @@ static struct attribute *net_class_attrs[] __ro_after_init = {
 	&dev_attr_tx_queue_len.attr,
 	&dev_attr_gro_flush_timeout.attr,
 	&dev_attr_napi_defer_hard_irqs.attr,
-	&dev_attr_phys_port_id.attr,
-	&dev_attr_phys_port_name.attr,
-	&dev_attr_phys_switch_id.attr,
 	&dev_attr_proto_down.attr,
 	&dev_attr_carrier_up_count.attr,
 	&dev_attr_carrier_down_count.attr,
@@ -2328,6 +2338,7 @@ int netdev_register_kobject(struct net_device *ndev)
 		groups++;
 
 	*groups++ = &netstat_group;
+	*groups++ = &netdev_phys_group;
 
 	if (wireless_group_needed(ndev))
 		*groups++ = &wireless_group;
-- 
cgit v1.2.3


From 72bf1441231ab421a380771e37a5c595493db178 Mon Sep 17 00:00:00 2001
From: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Date: Sun, 15 Jun 2025 22:32:51 +0900
Subject: firewire: core: allocate workqueue for AR/AT request/response
 contexts

Some tasklets (softIRQs) are still used as bottom-halves to handle
events for 1394 OHCI AR/AT contexts. However, using softIRQs for IRQ
bottom halves is generally discouraged today.

This commit adds a per-fw_card workqueue to accommodate the behaviour
specified by the 1394 OHCI specification.

According to the 1394 OHCI specification, system memory pages are
reserved for each asynchronous DMA context. This allows concurrent
operation across contexts. In the 1394 OHCI PCI driver implementation,
the hardware generates IRQs either upon receiving asynchronous packets
from other nodes (incoming) or after completing transmission to them
(outgoing). These independent events can occur in the same transmission
cycle, therefore the max_active parameter for the workqueue is set to the
total number of AR/AT contexts (=4). The WQ_UNBOUND flag is used to
allow the work to be scheduled on any available core, since there is
little CPU cache affinity benefit for the data.

Each DMA context uses a circular descriptor list in system memory,
allowing deferred data processing in software as long as buffer overruns
are avoided. Since the overall operation is sleepable except for small
atomic regions, WQ_BH is not used. As the descriptors contain
timestamps, WQ_HIGHPRI is specified to support semi-real-time
processing.

The asynchronous context is also used by the SCSI over IEEE 1394
protocol implementation (sbp2), which can be part of memory reclaim paths.
Therefore, WQ_MEM_RECLAIM is required.

To allow users to adjust CPU affinity according to workload, WQ_SYSFS is
specified so that workqueue attributes are exposed to user space.

Link: https://lore.kernel.org/r/20250615133253.433057-2-o-takashi@sakamocchi.jp
Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
---
 drivers/firewire/core-card.c | 48 +++++++++++++++++++++++++++++++-------------
 include/linux/firewire.h     |  1 +
 2 files changed, 35 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c
index 2b6ad47b6d57..b3e48ca516fe 100644
--- a/drivers/firewire/core-card.c
+++ b/drivers/firewire/core-card.c
@@ -574,7 +574,6 @@ EXPORT_SYMBOL(fw_card_initialize);
 int fw_card_add(struct fw_card *card, u32 max_receive, u32 link_speed, u64 guid,
 		unsigned int supported_isoc_contexts)
 {
-	struct workqueue_struct *isoc_wq;
 	int ret;
 
 	// This workqueue should be:
@@ -589,29 +588,48 @@ int fw_card_add(struct fw_card *card, u32 max_receive, u32 link_speed, u64 guid,
 	//  * == WQ_SYSFS		Parameters are available via sysfs.
 	//  * max_active == n_it + n_ir	A hardIRQ could notify events for multiple isochronous
 	//				contexts if they are scheduled to the same cycle.
-	isoc_wq = alloc_workqueue("firewire-isoc-card%u",
-				  WQ_UNBOUND | WQ_FREEZABLE | WQ_HIGHPRI | WQ_SYSFS,
-				  supported_isoc_contexts, card->index);
-	if (!isoc_wq)
+	card->isoc_wq = alloc_workqueue("firewire-isoc-card%u",
+					WQ_UNBOUND | WQ_FREEZABLE | WQ_HIGHPRI | WQ_SYSFS,
+					supported_isoc_contexts, card->index);
+	if (!card->isoc_wq)
 		return -ENOMEM;
 
+	// This workqueue should be:
+	//  * != WQ_BH			Sleepable.
+	//  * == WQ_UNBOUND		Any core can process data for asynchronous context.
+	//  * == WQ_MEM_RECLAIM		Used for any backend of block device.
+	//  * == WQ_FREEZABLE		The target device would not be available when being freezed.
+	//  * == WQ_HIGHPRI		High priority to process semi-realtime timestamped data.
+	//  * == WQ_SYSFS		Parameters are available via sysfs.
+	//  * max_active == 4		A hardIRQ could notify events for a pair of requests and
+	//				response AR/AT contexts.
+	card->async_wq = alloc_workqueue("firewire-async-card%u",
+					 WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_HIGHPRI | WQ_SYSFS,
+					 4, card->index);
+	if (!card->async_wq) {
+		ret = -ENOMEM;
+		goto err_isoc;
+	}
+
 	card->max_receive = max_receive;
 	card->link_speed = link_speed;
 	card->guid = guid;
 
-	guard(mutex)(&card_mutex);
+	scoped_guard(mutex, &card_mutex) {
+		generate_config_rom(card, tmp_config_rom);
+		ret = card->driver->enable(card, tmp_config_rom, config_rom_length);
+		if (ret < 0)
+			goto err_async;
 
-	generate_config_rom(card, tmp_config_rom);
-	ret = card->driver->enable(card, tmp_config_rom, config_rom_length);
-	if (ret < 0) {
-		destroy_workqueue(isoc_wq);
-		return ret;
+		list_add_tail(&card->link, &card_list);
 	}
 
-	card->isoc_wq = isoc_wq;
-	list_add_tail(&card->link, &card_list);
-
 	return 0;
+err_async:
+	destroy_workqueue(card->async_wq);
+err_isoc:
+	destroy_workqueue(card->isoc_wq);
+	return ret;
 }
 EXPORT_SYMBOL(fw_card_add);
 
@@ -744,6 +762,7 @@ void fw_core_remove_card(struct fw_card *card)
 	dummy_driver.stop_iso		= card->driver->stop_iso;
 	card->driver = &dummy_driver;
 	drain_workqueue(card->isoc_wq);
+	drain_workqueue(card->async_wq);
 
 	scoped_guard(spinlock_irqsave, &card->lock)
 		fw_destroy_nodes(card);
@@ -753,6 +772,7 @@ void fw_core_remove_card(struct fw_card *card)
 	wait_for_completion(&card->done);
 
 	destroy_workqueue(card->isoc_wq);
+	destroy_workqueue(card->async_wq);
 
 	WARN_ON(!list_empty(&card->transaction_list));
 }
diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index b632eec3ab52..c55b8e30e700 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -136,6 +136,7 @@ struct fw_card {
 	__be32 maint_utility_register;
 
 	struct workqueue_struct *isoc_wq;
+	struct workqueue_struct *async_wq;
 };
 
 static inline struct fw_card *fw_card_get(struct fw_card *card)
-- 
cgit v1.2.3


From aef6bcc0f278eba408751f8b3e0beae992e9faec Mon Sep 17 00:00:00 2001
From: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Date: Sun, 15 Jun 2025 22:32:53 +0900
Subject: firewire: ohci: use workqueue to handle events of AT request/response
 contexts

This commit adds a work item to handle events of 1394 OHCI AT
request/response contexts, and queues the item to the specific
workqueue. The call of struct fw_packet.callback() is done in the
workqueue when receiving acknowledgement to the asynchronous packet
transferred to remote node.

Link: https://lore.kernel.org/r/20250615133253.433057-4-o-takashi@sakamocchi.jp
Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
---
 drivers/firewire/net.c   |  4 ++--
 drivers/firewire/ohci.c  | 40 ++++++++++++++++++++++++----------------
 include/linux/firewire.h | 11 +++++++++--
 3 files changed, 35 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/drivers/firewire/net.c b/drivers/firewire/net.c
index 1bf0e15c1540..6d6446713539 100644
--- a/drivers/firewire/net.c
+++ b/drivers/firewire/net.c
@@ -1007,7 +1007,7 @@ static int fwnet_send_packet(struct fwnet_packet_task *ptask)
 
 		spin_lock_irqsave(&dev->lock, flags);
 
-		/* If the AT tasklet already ran, we may be last user. */
+		/* If the AT work item already ran, we may be last user. */
 		free = (ptask->outstanding_pkts == 0 && !ptask->enqueued);
 		if (!free)
 			ptask->enqueued = true;
@@ -1026,7 +1026,7 @@ static int fwnet_send_packet(struct fwnet_packet_task *ptask)
 
 	spin_lock_irqsave(&dev->lock, flags);
 
-	/* If the AT tasklet already ran, we may be last user. */
+	/* If the AT work item already ran, we may be last user. */
 	free = (ptask->outstanding_pkts == 0 && !ptask->enqueued);
 	if (!free)
 		ptask->enqueued = true;
diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c
index 68317b5a64a7..709a714fd5c8 100644
--- a/drivers/firewire/ohci.c
+++ b/drivers/firewire/ohci.c
@@ -158,7 +158,7 @@ struct context {
 
 	descriptor_callback_t callback;
 
-	struct tasklet_struct tasklet;
+	struct work_struct work;
 };
 
 struct iso_context {
@@ -1176,9 +1176,9 @@ static void context_retire_descriptors(struct context *ctx)
 	}
 }
 
-static void context_tasklet(unsigned long data)
+static void ohci_at_context_work(struct work_struct *work)
 {
-	struct context *ctx = (struct context *) data;
+	struct context *ctx = from_work(ctx, work, work);
 
 	context_retire_descriptors(ctx);
 }
@@ -1243,7 +1243,6 @@ static int context_init(struct context *ctx, struct fw_ohci *ohci,
 	ctx->buffer_tail = list_entry(ctx->buffer_list.next,
 			struct descriptor_buffer, list);
 
-	tasklet_init(&ctx->tasklet, context_tasklet, (unsigned long)ctx);
 	ctx->callback = callback;
 
 	/*
@@ -1524,13 +1523,17 @@ static int at_context_queue_packet(struct context *ctx,
 
 static void at_context_flush(struct context *ctx)
 {
-	tasklet_disable(&ctx->tasklet);
+	// Avoid dead lock due to programming mistake.
+	if (WARN_ON_ONCE(current_work() == &ctx->work))
+		return;
 
-	ctx->flushing = true;
-	context_tasklet((unsigned long)ctx);
-	ctx->flushing = false;
+	disable_work_sync(&ctx->work);
 
-	tasklet_enable(&ctx->tasklet);
+	WRITE_ONCE(ctx->flushing, true);
+	ohci_at_context_work(&ctx->work);
+	WRITE_ONCE(ctx->flushing, false);
+
+	enable_work(&ctx->work);
 }
 
 static int handle_at_packet(struct context *context,
@@ -1542,7 +1545,7 @@ static int handle_at_packet(struct context *context,
 	struct fw_ohci *ohci = context->ohci;
 	int evt;
 
-	if (last->transfer_status == 0 && !context->flushing)
+	if (last->transfer_status == 0 && !READ_ONCE(context->flushing))
 		/* This descriptor isn't done yet, stop iteration. */
 		return 0;
 
@@ -1576,7 +1579,7 @@ static int handle_at_packet(struct context *context,
 		break;
 
 	case OHCI1394_evt_missing_ack:
-		if (context->flushing)
+		if (READ_ONCE(context->flushing))
 			packet->ack = RCODE_GENERATION;
 		else {
 			/*
@@ -1598,7 +1601,7 @@ static int handle_at_packet(struct context *context,
 		break;
 
 	case OHCI1394_evt_no_status:
-		if (context->flushing) {
+		if (READ_ONCE(context->flushing)) {
 			packet->ack = RCODE_GENERATION;
 			break;
 		}
@@ -2239,10 +2242,10 @@ static irqreturn_t irq_handler(int irq, void *data)
 		queue_work(ohci->card.async_wq, &ohci->ar_response_ctx.work);
 
 	if (event & OHCI1394_reqTxComplete)
-		tasklet_schedule(&ohci->at_request_ctx.tasklet);
+		queue_work(ohci->card.async_wq, &ohci->at_request_ctx.work);
 
 	if (event & OHCI1394_respTxComplete)
-		tasklet_schedule(&ohci->at_response_ctx.tasklet);
+		queue_work(ohci->card.async_wq, &ohci->at_response_ctx.work);
 
 	if (event & OHCI1394_isochRx) {
 		iso_event = reg_read(ohci, OHCI1394_IsoRecvIntEventClear);
@@ -2684,7 +2687,10 @@ static int ohci_cancel_packet(struct fw_card *card, struct fw_packet *packet)
 	struct driver_data *driver_data = packet->driver_data;
 	int ret = -ENOENT;
 
-	tasklet_disable_in_atomic(&ctx->tasklet);
+	// Avoid deadlock due to programming mistake.
+	if (WARN_ON_ONCE(current_work() == &ctx->work))
+		return 0;
+	disable_work_sync(&ctx->work);
 
 	if (packet->ack != 0)
 		goto out;
@@ -2703,7 +2709,7 @@ static int ohci_cancel_packet(struct fw_card *card, struct fw_packet *packet)
 	packet->callback(packet, &ohci->card, packet->ack);
 	ret = 0;
  out:
-	tasklet_enable(&ctx->tasklet);
+	enable_work(&ctx->work);
 
 	return ret;
 }
@@ -3765,11 +3771,13 @@ static int pci_probe(struct pci_dev *dev,
 			   OHCI1394_AsReqTrContextControlSet, handle_at_packet);
 	if (err < 0)
 		return err;
+	INIT_WORK(&ohci->at_request_ctx.work, ohci_at_context_work);
 
 	err = context_init(&ohci->at_response_ctx, ohci,
 			   OHCI1394_AsRspTrContextControlSet, handle_at_packet);
 	if (err < 0)
 		return err;
+	INIT_WORK(&ohci->at_response_ctx.work, ohci_at_context_work);
 
 	reg_write(ohci, OHCI1394_IsoRecvIntMaskSet, ~0);
 	ohci->ir_context_channels = ~0ULL;
diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index c55b8e30e700..cceb70415ed2 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -308,8 +308,7 @@ struct fw_packet {
 	 * For successful transmission, the status code is the ack received
 	 * from the destination.  Otherwise it is one of the juju-specific
 	 * rcodes:  RCODE_SEND_ERROR, _CANCELLED, _BUSY, _GENERATION, _NO_ACK.
-	 * The callback can be called from tasklet context and thus
-	 * must never block.
+	 * The callback can be called from workqueue and thus must never block.
 	 */
 	fw_packet_callback_t callback;
 	int ack;
@@ -382,6 +381,10 @@ void __fw_send_request(struct fw_card *card, struct fw_transaction *t, int tcode
  *
  * A variation of __fw_send_request() to generate callback for response subaction without time
  * stamp.
+ *
+ * The callback is invoked in the workqueue context in most cases. However, if an error is detected
+ * before queueing or the destination address refers to the local node, it is invoked in the
+ * current context instead.
  */
 static inline void fw_send_request(struct fw_card *card, struct fw_transaction *t, int tcode,
 				   int destination_id, int generation, int speed,
@@ -411,6 +414,10 @@ static inline void fw_send_request(struct fw_card *card, struct fw_transaction *
  * @callback_data:	data to be passed to the transaction completion callback
  *
  * A variation of __fw_send_request() to generate callback for response subaction with time stamp.
+ *
+ * The callback is invoked in the workqueue context in most cases. However, if an error is detected
+ * before queueing or the destination address refers to the local node, it is invoked in the current
+ * context instead.
  */
 static inline void fw_send_request_with_tstamp(struct fw_card *card, struct fw_transaction *t,
 	int tcode, int destination_id, int generation, int speed, unsigned long long offset,
-- 
cgit v1.2.3


From 543f5e314282c4c2e5114f88ddecc9aeaf0985e2 Mon Sep 17 00:00:00 2001
From: Kaustabh Chakraborty <kauschluss@disroot.org>
Date: Thu, 12 Jun 2025 20:39:30 +0530
Subject: phy: exynos-mipi-video: introduce support for exynos7870

Add support for Exynos7870 in the existing MIPI CSIS/DSIM driver. The
SoC has one DSIM phy and three CSIS phys.

Signed-off-by: Kaustabh Chakraborty <kauschluss@disroot.org>
Reviewed-by: Neil Armstrong <neil.armstrong@linaro.org>
Link: https://lore.kernel.org/r/20250612-exynos7870-mipi-phy-v1-2-3fff0b62d9d3@disroot.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/phy/samsung/phy-exynos-mipi-video.c | 52 +++++++++++++++++++++++++++++
 include/linux/soc/samsung/exynos-regs-pmu.h |  5 +++
 2 files changed, 57 insertions(+)

(limited to 'include')

diff --git a/drivers/phy/samsung/phy-exynos-mipi-video.c b/drivers/phy/samsung/phy-exynos-mipi-video.c
index f6756a609a9a..b184923b9b40 100644
--- a/drivers/phy/samsung/phy-exynos-mipi-video.c
+++ b/drivers/phy/samsung/phy-exynos-mipi-video.c
@@ -213,6 +213,55 @@ static const struct mipi_phy_device_desc exynos5433_mipi_phy = {
 	},
 };
 
+static const struct mipi_phy_device_desc exynos7870_mipi_phy = {
+	.num_regmaps = 3,
+	.regmap_names = {
+		"samsung,pmu-syscon",
+		"samsung,disp-sysreg",
+		"samsung,cam-sysreg"
+	},
+	.num_phys = 4,
+	.phys = {
+		{
+			/* EXYNOS_MIPI_PHY_ID_CSIS0 */
+			.coupled_phy_id = EXYNOS_MIPI_PHY_ID_DSIM0,
+			.enable_val = EXYNOS4_PHY_ENABLE,
+			.enable_reg = EXYNOS7870_MIPI_PHY_CONTROL0,
+			.enable_map = EXYNOS_MIPI_REGMAP_PMU,
+			.resetn_val = BIT(0),
+			.resetn_reg = 0,
+			.resetn_map = EXYNOS_MIPI_REGMAP_CAM0,
+		}, {
+			/* EXYNOS_MIPI_PHY_ID_DSIM0 */
+			.coupled_phy_id = EXYNOS_MIPI_PHY_ID_CSIS0,
+			.enable_val = EXYNOS4_PHY_ENABLE,
+			.enable_reg = EXYNOS7870_MIPI_PHY_CONTROL0,
+			.enable_map = EXYNOS_MIPI_REGMAP_PMU,
+			.resetn_val = BIT(0),
+			.resetn_reg = 0,
+			.resetn_map = EXYNOS_MIPI_REGMAP_DISP,
+		}, {
+			/* EXYNOS_MIPI_PHY_ID_CSIS1 */
+			.coupled_phy_id = EXYNOS_MIPI_PHY_ID_NONE,
+			.enable_val = EXYNOS4_PHY_ENABLE,
+			.enable_reg = EXYNOS7870_MIPI_PHY_CONTROL1,
+			.enable_map = EXYNOS_MIPI_REGMAP_PMU,
+			.resetn_val = BIT(1),
+			.resetn_reg = 0,
+			.resetn_map = EXYNOS_MIPI_REGMAP_CAM0,
+		}, {
+			/* EXYNOS_MIPI_PHY_ID_CSIS2 */
+			.coupled_phy_id = EXYNOS_MIPI_PHY_ID_NONE,
+			.enable_val = EXYNOS4_PHY_ENABLE,
+			.enable_reg = EXYNOS7870_MIPI_PHY_CONTROL2,
+			.enable_map = EXYNOS_MIPI_REGMAP_PMU,
+			.resetn_val = BIT(2),
+			.resetn_reg = 0,
+			.resetn_map = EXYNOS_MIPI_REGMAP_CAM0,
+		},
+	},
+};
+
 struct exynos_mipi_video_phy {
 	struct regmap *regmaps[EXYNOS_MIPI_REGMAPS_NUM];
 	int num_phys;
@@ -351,6 +400,9 @@ static const struct of_device_id exynos_mipi_video_phy_of_match[] = {
 	}, {
 		.compatible = "samsung,exynos5433-mipi-video-phy",
 		.data = &exynos5433_mipi_phy,
+	}, {
+		.compatible = "samsung,exynos7870-mipi-video-phy",
+		.data = &exynos7870_mipi_phy,
 	},
 	{ /* sentinel */ },
 };
diff --git a/include/linux/soc/samsung/exynos-regs-pmu.h b/include/linux/soc/samsung/exynos-regs-pmu.h
index 7754697e5810..fa28a8784d65 100644
--- a/include/linux/soc/samsung/exynos-regs-pmu.h
+++ b/include/linux/soc/samsung/exynos-regs-pmu.h
@@ -665,6 +665,11 @@
 /* For Exynos990 */
 #define EXYNOS990_PHY_CTRL_USB20				(0x72C)
 
+/* For Exynos7870 */
+#define EXYNOS7870_MIPI_PHY_CONTROL0				(0x070c)
+#define EXYNOS7870_MIPI_PHY_CONTROL1				(0x0714)
+#define EXYNOS7870_MIPI_PHY_CONTROL2				(0x0734)
+
 /* For Tensor GS101 */
 /* PMU ALIVE */
 #define GS101_SYSIP_DAT0					(0x810)
-- 
cgit v1.2.3


From cf0233491b3a15933234a26efd9ecbc1c0764674 Mon Sep 17 00:00:00 2001
From: Dmitry Baryshkov <dmitry.baryshkov@oss.qualcomm.com>
Date: Thu, 5 Jun 2025 14:25:49 +0300
Subject: phy: use per-PHY lockdep keys

If a PHY driver uses another PHY internally (e.g. in the case of eUSB2,
repeaters are represented as PHYs), it triggers the following lockdep
splat. All PHYs use a single static lockdep key, so lockdep cannot
identify whether there is a dependency or not and reports a false
positive.

Make the PHY subsystem use dynamic lockdep keys, assigning each PHY a
separate key. This way lockdep can correctly identify the dependency
graph between the mutexes.

 ============================================
 WARNING: possible recursive locking detected
 6.15.0-rc7-next-20250522-12896-g3932f283970c #3455 Not tainted
 --------------------------------------------
 kworker/u51:0/78 is trying to acquire lock:
 ffff0008116554f0 (&phy->mutex){+.+.}-{4:4}, at: phy_init+0x4c/0x12c

 but task is already holding lock:
 ffff000813c10cf0 (&phy->mutex){+.+.}-{4:4}, at: phy_init+0x4c/0x12c

 other info that might help us debug this:
  Possible unsafe locking scenario:

        CPU0
        ----
   lock(&phy->mutex);
   lock(&phy->mutex);

  *** DEADLOCK ***

  May be due to missing lock nesting notation

 4 locks held by kworker/u51:0/78:
  #0: ffff000800010948 ((wq_completion)events_unbound){+.+.}-{0:0}, at: process_one_work+0x18c/0x5ec
  #1: ffff80008036bdb0 (deferred_probe_work){+.+.}-{0:0}, at: process_one_work+0x1b4/0x5ec
  #2: ffff0008094ac8f8 (&dev->mutex){....}-{4:4}, at: __device_attach+0x38/0x188
  #3: ffff000813c10cf0 (&phy->mutex){+.+.}-{4:4}, at: phy_init+0x4c/0x12c

 stack backtrace:
 CPU: 0 UID: 0 PID: 78 Comm: kworker/u51:0 Not tainted 6.15.0-rc7-next-20250522-12896-g3932f283970c #3455 PREEMPT
 Hardware name: Qualcomm CRD, BIOS 6.0.240904.BOOT.MXF.2.4-00528.1-HAMOA-1 09/ 4/2024
 Workqueue: events_unbound deferred_probe_work_func
 Call trace:
  show_stack+0x18/0x24 (C)
  dump_stack_lvl+0x90/0xd0
  dump_stack+0x18/0x24
  print_deadlock_bug+0x258/0x348
  __lock_acquire+0x10fc/0x1f84
  lock_acquire+0x1c8/0x338
  __mutex_lock+0xb8/0x59c
  mutex_lock_nested+0x24/0x30
  phy_init+0x4c/0x12c
  snps_eusb2_hsphy_init+0x54/0x1a0
  phy_init+0xe0/0x12c
  dwc3_core_init+0x450/0x10b4
  dwc3_core_probe+0xce4/0x15fc
  dwc3_probe+0x64/0xb0
  platform_probe+0x68/0xc4
  really_probe+0xbc/0x298
  __driver_probe_device+0x78/0x12c
  driver_probe_device+0x3c/0x160
  __device_attach_driver+0xb8/0x138
  bus_for_each_drv+0x84/0xe0
  __device_attach+0x9c/0x188
  device_initial_probe+0x14/0x20
  bus_probe_device+0xac/0xb0
  deferred_probe_work_func+0x8c/0xc8
  process_one_work+0x208/0x5ec
  worker_thread+0x1c0/0x368
  kthread+0x14c/0x20c
  ret_from_fork+0x10/0x20

Fixes: 3584f6392f09 ("phy: qcom: phy-qcom-snps-eusb2: Add support for eUSB2 repeater")
Fixes: e2463559ff1d ("phy: amlogic: Add Amlogic AXG PCIE PHY Driver")
Reviewed-by: Neil Armstrong <neil.armstrong@linaro.org>
Reviewed-by: Abel Vesa <abel.vesa@linaro.org>
Reported-by: Johan Hovold <johan+linaro@kernel.org>
Link: https://lore.kernel.org/lkml/ZnpoAVGJMG4Zu-Jw@hovoldconsulting.com/
Reviewed-by: Johan Hovold <johan+linaro@kernel.org>
Tested-by: Johan Hovold <johan+linaro@kernel.org>
Signed-off-by: Dmitry Baryshkov <dmitry.baryshkov@oss.qualcomm.com>
Link: https://lore.kernel.org/r/20250605-phy-subinit-v3-1-1e1e849e10cd@oss.qualcomm.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/phy/phy-core.c  | 5 ++++-
 include/linux/phy/phy.h | 2 ++
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/phy/phy-core.c b/drivers/phy/phy-core.c
index 8e2daea81666..04a5a34e7a95 100644
--- a/drivers/phy/phy-core.c
+++ b/drivers/phy/phy-core.c
@@ -994,7 +994,8 @@ struct phy *phy_create(struct device *dev, struct device_node *node,
 	}
 
 	device_initialize(&phy->dev);
-	mutex_init(&phy->mutex);
+	lockdep_register_key(&phy->lockdep_key);
+	mutex_init_with_key(&phy->mutex, &phy->lockdep_key);
 
 	phy->dev.class = &phy_class;
 	phy->dev.parent = dev;
@@ -1259,6 +1260,8 @@ static void phy_release(struct device *dev)
 	dev_vdbg(dev, "releasing '%s'\n", dev_name(dev));
 	debugfs_remove_recursive(phy->debugfs);
 	regulator_put(phy->pwr);
+	mutex_destroy(&phy->mutex);
+	lockdep_unregister_key(&phy->lockdep_key);
 	ida_free(&phy_ida, phy->id);
 	kfree(phy);
 }
diff --git a/include/linux/phy/phy.h b/include/linux/phy/phy.h
index 437769e061b7..13add0c2c407 100644
--- a/include/linux/phy/phy.h
+++ b/include/linux/phy/phy.h
@@ -154,6 +154,7 @@ struct phy_attrs {
  * @id: id of the phy device
  * @ops: function pointers for performing phy operations
  * @mutex: mutex to protect phy_ops
+ * @lockdep_key: lockdep information for this mutex
  * @init_count: used to protect when the PHY is used by multiple consumers
  * @power_count: used to protect when the PHY is used by multiple consumers
  * @attrs: used to specify PHY specific attributes
@@ -165,6 +166,7 @@ struct phy {
 	int			id;
 	const struct phy_ops	*ops;
 	struct mutex		mutex;
+	struct lock_class_key	lockdep_key;
 	int			init_count;
 	int			power_count;
 	struct phy_attrs	attrs;
-- 
cgit v1.2.3


From e7af416aebb36e6681b9c6950d0f6352aee7c084 Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.cirrus.com>
Date: Mon, 16 Jun 2025 11:30:52 +0100
Subject: firmware: cs_dsp: Remove unused struct list_head from
 cs_dsp_coeff_ctl

Remove two unused pointers from struct cs_dsp_coeff_ctl by taking the
struct list_head out of struct cs_dsp_alg_region. On an x86_64 build
this saves 16 bytes per control.

Each cs_dsp_coeff_ctl instance needs to keep information about the
algorithm region it refers to. This is done by embedding an instance
of struct cs_dsp_alg_region. But cs_dsp_alg_region was also used to
store entries in a list of algorithm regions, and so had a struct
list_head object for that purpose. This list_head object is not used
by the instance embedded in struct cs_dsp_coeff_ctl, so it was just
wasted bytes.

A new struct cs_dsp_alg_region_list_item has been defined for creating
the list of algorithm regions. It contains a struct cs_dsp_alg_region
and a struct list_head.

Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Link: https://patch.msgid.link/20250616103052.66537-1-rf@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/firmware/cirrus/cs_dsp.c       | 45 +++++++++++++++++++---------------
 include/linux/firmware/cirrus/cs_dsp.h |  2 --
 2 files changed, 25 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/drivers/firmware/cirrus/cs_dsp.c b/drivers/firmware/cirrus/cs_dsp.c
index 560724ce21aa..f51047d8ea64 100644
--- a/drivers/firmware/cirrus/cs_dsp.c
+++ b/drivers/firmware/cirrus/cs_dsp.c
@@ -311,6 +311,11 @@ static const struct cs_dsp_ops cs_dsp_adsp2_ops[];
 static const struct cs_dsp_ops cs_dsp_halo_ops;
 static const struct cs_dsp_ops cs_dsp_halo_ao_ops;
 
+struct cs_dsp_alg_region_list_item {
+	struct list_head list;
+	struct cs_dsp_alg_region alg_region;
+};
+
 struct cs_dsp_buf {
 	struct list_head list;
 	void *buf;
@@ -1752,13 +1757,13 @@ static void *cs_dsp_read_algs(struct cs_dsp *dsp, size_t n_algs,
 struct cs_dsp_alg_region *cs_dsp_find_alg_region(struct cs_dsp *dsp,
 						 int type, unsigned int id)
 {
-	struct cs_dsp_alg_region *alg_region;
+	struct cs_dsp_alg_region_list_item *item;
 
 	lockdep_assert_held(&dsp->pwr_lock);
 
-	list_for_each_entry(alg_region, &dsp->alg_regions, list) {
-		if (id == alg_region->alg && type == alg_region->type)
-			return alg_region;
+	list_for_each_entry(item, &dsp->alg_regions, list) {
+		if (id == item->alg_region.alg && type == item->alg_region.type)
+			return &item->alg_region;
 	}
 
 	return NULL;
@@ -1769,35 +1774,35 @@ static struct cs_dsp_alg_region *cs_dsp_create_region(struct cs_dsp *dsp,
 						      int type, __be32 id,
 						      __be32 ver, __be32 base)
 {
-	struct cs_dsp_alg_region *alg_region;
+	struct cs_dsp_alg_region_list_item *item;
 
-	alg_region = kzalloc(sizeof(*alg_region), GFP_KERNEL);
-	if (!alg_region)
+	item = kzalloc(sizeof(*item), GFP_KERNEL);
+	if (!item)
 		return ERR_PTR(-ENOMEM);
 
-	alg_region->type = type;
-	alg_region->alg = be32_to_cpu(id);
-	alg_region->ver = be32_to_cpu(ver);
-	alg_region->base = be32_to_cpu(base);
+	item->alg_region.type = type;
+	item->alg_region.alg = be32_to_cpu(id);
+	item->alg_region.ver = be32_to_cpu(ver);
+	item->alg_region.base = be32_to_cpu(base);
 
-	list_add_tail(&alg_region->list, &dsp->alg_regions);
+	list_add_tail(&item->list, &dsp->alg_regions);
 
 	if (dsp->wmfw_ver > 0)
-		cs_dsp_ctl_fixup_base(dsp, alg_region);
+		cs_dsp_ctl_fixup_base(dsp, &item->alg_region);
 
-	return alg_region;
+	return &item->alg_region;
 }
 
 static void cs_dsp_free_alg_regions(struct cs_dsp *dsp)
 {
-	struct cs_dsp_alg_region *alg_region;
+	struct cs_dsp_alg_region_list_item *item;
 
 	while (!list_empty(&dsp->alg_regions)) {
-		alg_region = list_first_entry(&dsp->alg_regions,
-					      struct cs_dsp_alg_region,
-					      list);
-		list_del(&alg_region->list);
-		kfree(alg_region);
+		item = list_first_entry(&dsp->alg_regions,
+					struct cs_dsp_alg_region_list_item,
+					list);
+		list_del(&item->list);
+		kfree(item);
 	}
 }
 
diff --git a/include/linux/firmware/cirrus/cs_dsp.h b/include/linux/firmware/cirrus/cs_dsp.h
index 7cae703b3137..a66eb7624730 100644
--- a/include/linux/firmware/cirrus/cs_dsp.h
+++ b/include/linux/firmware/cirrus/cs_dsp.h
@@ -64,14 +64,12 @@ struct cs_dsp_region {
 
 /**
  * struct cs_dsp_alg_region - Describes a logical algorithm region in DSP address space
- * @list:	List node for internal use
  * @alg:	Algorithm id
  * @ver:	Expected algorithm version
  * @type:	Memory region type
  * @base:	Address of region
  */
 struct cs_dsp_alg_region {
-	struct list_head list;
 	unsigned int alg;
 	unsigned int ver;
 	int type;
-- 
cgit v1.2.3


From bc9241367aac08de44633fd957b2452a6da8e6d4 Mon Sep 17 00:00:00 2001
From: NeilBrown <neil@brown.name>
Date: Fri, 13 Jun 2025 09:28:10 +1000
Subject: VFS: change old_dir and new_dir in struct renamedata to dentrys

All users of 'struct renamedata' have the dentry for the old and new
directories, and often have no use for the inode except to store it in
the renamedata.

This patch changes struct renamedata to hold the dentry, rather than
the inode, for the old and new directories, and changes callers to
match.  The names are also changed from a _dir suffix to _parent.  This
is consistent with other usage in namei.c and elsewhere.

This results in the removal of several local variables and several
dereferences of ->d_inode at the cost of adding ->d_inode dereferences
to vfs_rename().

Acked-by: Miklos Szeredi <miklos@szeredi.hu>
Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Namjae Jeon <linkinjeon@kernel.org>
Signed-off-by: NeilBrown <neil@brown.name>
Link: https://lore.kernel.org/174977089072.608730.4244531834577097454@noble.neil.brown.name
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/cachefiles/namei.c    |  4 ++--
 fs/ecryptfs/inode.c      |  4 ++--
 fs/namei.c               |  7 ++++---
 fs/nfsd/vfs.c            |  7 ++-----
 fs/overlayfs/copy_up.c   |  6 +++---
 fs/overlayfs/dir.c       | 16 ++++++++--------
 fs/overlayfs/overlayfs.h | 16 ++++++++--------
 fs/overlayfs/readdir.c   |  2 +-
 fs/overlayfs/super.c     |  2 +-
 fs/overlayfs/util.c      |  2 +-
 fs/smb/server/vfs.c      |  4 ++--
 include/linux/fs.h       |  8 ++++----
 12 files changed, 38 insertions(+), 40 deletions(-)

(limited to 'include')

diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index aecfc5c37b49..91dfd0231877 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -388,10 +388,10 @@ try_again:
 	} else {
 		struct renamedata rd = {
 			.old_mnt_idmap	= &nop_mnt_idmap,
-			.old_dir	= d_inode(dir),
+			.old_parent	= dir,
 			.old_dentry	= rep,
 			.new_mnt_idmap	= &nop_mnt_idmap,
-			.new_dir	= d_inode(cache->graveyard),
+			.new_parent	= cache->graveyard,
 			.new_dentry	= grave,
 		};
 		trace_cachefiles_rename(object, d_inode(rep)->i_ino, why);
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 493d7f194956..bd317d943d62 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -635,10 +635,10 @@ ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
 	}
 
 	rd.old_mnt_idmap	= &nop_mnt_idmap;
-	rd.old_dir		= d_inode(lower_old_dir_dentry);
+	rd.old_parent		= lower_old_dir_dentry;
 	rd.old_dentry		= lower_old_dentry;
 	rd.new_mnt_idmap	= &nop_mnt_idmap;
-	rd.new_dir		= d_inode(lower_new_dir_dentry);
+	rd.new_parent		= lower_new_dir_dentry;
 	rd.new_dentry		= lower_new_dentry;
 	rc = vfs_rename(&rd);
 	if (rc)
diff --git a/fs/namei.c b/fs/namei.c
index 4bb889fc980b..981da44e1291 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -5007,7 +5007,8 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
 int vfs_rename(struct renamedata *rd)
 {
 	int error;
-	struct inode *old_dir = rd->old_dir, *new_dir = rd->new_dir;
+	struct inode *old_dir = d_inode(rd->old_parent);
+	struct inode *new_dir = d_inode(rd->new_parent);
 	struct dentry *old_dentry = rd->old_dentry;
 	struct dentry *new_dentry = rd->new_dentry;
 	struct inode **delegated_inode = rd->delegated_inode;
@@ -5266,10 +5267,10 @@ retry_deleg:
 	if (error)
 		goto exit5;
 
-	rd.old_dir	   = old_path.dentry->d_inode;
+	rd.old_parent	   = old_path.dentry;
 	rd.old_dentry	   = old_dentry;
 	rd.old_mnt_idmap   = mnt_idmap(old_path.mnt);
-	rd.new_dir	   = new_path.dentry->d_inode;
+	rd.new_parent	   = new_path.dentry;
 	rd.new_dentry	   = new_dentry;
 	rd.new_mnt_idmap   = mnt_idmap(new_path.mnt);
 	rd.delegated_inode = &delegated_inode;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index cd689df2ca5d..7d522e426b2d 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1864,7 +1864,6 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
 			    struct svc_fh *tfhp, char *tname, int tlen)
 {
 	struct dentry	*fdentry, *tdentry, *odentry, *ndentry, *trap;
-	struct inode	*fdir, *tdir;
 	int		type = S_IFDIR;
 	__be32		err;
 	int		host_err;
@@ -1880,10 +1879,8 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
 		goto out;
 
 	fdentry = ffhp->fh_dentry;
-	fdir = d_inode(fdentry);
 
 	tdentry = tfhp->fh_dentry;
-	tdir = d_inode(tdentry);
 
 	err = nfserr_perm;
 	if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen))
@@ -1944,10 +1941,10 @@ retry:
 	} else {
 		struct renamedata rd = {
 			.old_mnt_idmap	= &nop_mnt_idmap,
-			.old_dir	= fdir,
+			.old_parent	= fdentry,
 			.old_dentry	= odentry,
 			.new_mnt_idmap	= &nop_mnt_idmap,
-			.new_dir	= tdir,
+			.new_parent	= tdentry,
 			.new_dentry	= ndentry,
 		};
 		int retries;
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index d7310fcf3888..8a3c0d18ec2e 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -563,7 +563,7 @@ static int ovl_create_index(struct dentry *dentry, const struct ovl_fh *fh,
 	if (IS_ERR(index)) {
 		err = PTR_ERR(index);
 	} else {
-		err = ovl_do_rename(ofs, dir, temp, dir, index, 0);
+		err = ovl_do_rename(ofs, indexdir, temp, indexdir, index, 0);
 		dput(index);
 	}
 out:
@@ -762,7 +762,7 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
 {
 	struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
 	struct inode *inode;
-	struct inode *udir = d_inode(c->destdir), *wdir = d_inode(c->workdir);
+	struct inode *wdir = d_inode(c->workdir);
 	struct path path = { .mnt = ovl_upper_mnt(ofs) };
 	struct dentry *temp, *upper, *trap;
 	struct ovl_cu_creds cc;
@@ -829,7 +829,7 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
 	if (IS_ERR(upper))
 		goto cleanup;
 
-	err = ovl_do_rename(ofs, wdir, temp, udir, upper, 0);
+	err = ovl_do_rename(ofs, c->workdir, temp, c->destdir, upper, 0);
 	dput(upper);
 	if (err)
 		goto cleanup;
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index fe493f3ed6b6..4fc221ea6480 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -107,7 +107,7 @@ out:
 }
 
 /* Caller must hold i_mutex on both workdir and dir */
-int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct inode *dir,
+int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct dentry *dir,
 			     struct dentry *dentry)
 {
 	struct inode *wdir = ofs->workdir->d_inode;
@@ -123,7 +123,7 @@ int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct inode *dir,
 	if (d_is_dir(dentry))
 		flags = RENAME_EXCHANGE;
 
-	err = ovl_do_rename(ofs, wdir, whiteout, dir, dentry, flags);
+	err = ovl_do_rename(ofs, ofs->workdir, whiteout, dir, dentry, flags);
 	if (err)
 		goto kill_whiteout;
 	if (flags)
@@ -384,7 +384,7 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry,
 	if (err)
 		goto out_cleanup;
 
-	err = ovl_do_rename(ofs, wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
+	err = ovl_do_rename(ofs, workdir, opaquedir, upperdir, upper, RENAME_EXCHANGE);
 	if (err)
 		goto out_cleanup;
 
@@ -491,14 +491,14 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
 		if (err)
 			goto out_cleanup;
 
-		err = ovl_do_rename(ofs, wdir, newdentry, udir, upper,
+		err = ovl_do_rename(ofs, workdir, newdentry, upperdir, upper,
 				    RENAME_EXCHANGE);
 		if (err)
 			goto out_cleanup;
 
 		ovl_cleanup(ofs, wdir, upper);
 	} else {
-		err = ovl_do_rename(ofs, wdir, newdentry, udir, upper, 0);
+		err = ovl_do_rename(ofs, workdir, newdentry, upperdir, upper, 0);
 		if (err)
 			goto out_cleanup;
 	}
@@ -774,7 +774,7 @@ static int ovl_remove_and_whiteout(struct dentry *dentry,
 		goto out_dput_upper;
 	}
 
-	err = ovl_cleanup_and_whiteout(ofs, d_inode(upperdir), upper);
+	err = ovl_cleanup_and_whiteout(ofs, upperdir, upper);
 	if (err)
 		goto out_d_drop;
 
@@ -1246,8 +1246,8 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
 	if (err)
 		goto out_dput;
 
-	err = ovl_do_rename(ofs, old_upperdir->d_inode, olddentry,
-			    new_upperdir->d_inode, newdentry, flags);
+	err = ovl_do_rename(ofs, old_upperdir, olddentry,
+			    new_upperdir, newdentry, flags);
 	if (err)
 		goto out_dput;
 
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 8baaba0a3fe5..78deb89e16b5 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -353,19 +353,19 @@ static inline int ovl_do_remove_acl(struct ovl_fs *ofs, struct dentry *dentry,
 	return vfs_remove_acl(ovl_upper_mnt_idmap(ofs), dentry, acl_name);
 }
 
-static inline int ovl_do_rename(struct ovl_fs *ofs, struct inode *olddir,
-				struct dentry *olddentry, struct inode *newdir,
+static inline int ovl_do_rename(struct ovl_fs *ofs, struct dentry *olddir,
+				struct dentry *olddentry, struct dentry *newdir,
 				struct dentry *newdentry, unsigned int flags)
 {
 	int err;
 	struct renamedata rd = {
 		.old_mnt_idmap	= ovl_upper_mnt_idmap(ofs),
-		.old_dir 	= olddir,
-		.old_dentry 	= olddentry,
+		.old_parent	= olddir,
+		.old_dentry	= olddentry,
 		.new_mnt_idmap	= ovl_upper_mnt_idmap(ofs),
-		.new_dir 	= newdir,
-		.new_dentry 	= newdentry,
-		.flags 		= flags,
+		.new_parent	= newdir,
+		.new_dentry	= newdentry,
+		.flags		= flags,
 	};
 
 	pr_debug("rename(%pd2, %pd2, 0x%x)\n", olddentry, newdentry, flags);
@@ -826,7 +826,7 @@ static inline void ovl_copyflags(struct inode *from, struct inode *to)
 
 /* dir.c */
 extern const struct inode_operations ovl_dir_inode_operations;
-int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct inode *dir,
+int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct dentry *dir,
 			     struct dentry *dentry);
 struct ovl_cattr {
 	dev_t rdev;
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 474c80d210d1..68cca52ae2ac 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -1235,7 +1235,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs)
 			 * Whiteout orphan index to block future open by
 			 * handle after overlay nlink dropped to zero.
 			 */
-			err = ovl_cleanup_and_whiteout(ofs, dir, index);
+			err = ovl_cleanup_and_whiteout(ofs, indexdir, index);
 		} else {
 			/* Cleanup orphan index entries */
 			err = ovl_cleanup(ofs, dir, index);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index e19940d649ca..cf99b276fdfb 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -580,7 +580,7 @@ static int ovl_check_rename_whiteout(struct ovl_fs *ofs)
 
 	/* Name is inline and stable - using snapshot as a copy helper */
 	take_dentry_name_snapshot(&name, temp);
-	err = ovl_do_rename(ofs, dir, temp, dir, dest, RENAME_WHITEOUT);
+	err = ovl_do_rename(ofs, workdir, temp, workdir, dest, RENAME_WHITEOUT);
 	if (err) {
 		if (err == -EINVAL)
 			err = 0;
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index dcccb4b4a66c..2b4754c645ee 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -1115,7 +1115,7 @@ static void ovl_cleanup_index(struct dentry *dentry)
 	} else if (ovl_index_all(dentry->d_sb)) {
 		/* Whiteout orphan index to block future open by handle */
 		err = ovl_cleanup_and_whiteout(OVL_FS(dentry->d_sb),
-					       dir, index);
+					       indexdir, index);
 	} else {
 		/* Cleanup orphan index entries */
 		err = ovl_cleanup(ofs, dir, index);
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index ba45e809555a..2f0171896e5d 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -764,10 +764,10 @@ retry:
 	}
 
 	rd.old_mnt_idmap	= mnt_idmap(old_path->mnt),
-	rd.old_dir		= d_inode(old_parent),
+	rd.old_parent		= old_parent,
 	rd.old_dentry		= old_child,
 	rd.new_mnt_idmap	= mnt_idmap(new_path.mnt),
-	rd.new_dir		= new_path.dentry->d_inode,
+	rd.new_parent		= new_path.dentry,
 	rd.new_dentry		= new_dentry,
 	rd.flags		= flags,
 	rd.delegated_inode	= NULL,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 96c7925a6551..1d9586a78041 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2004,20 +2004,20 @@ int vfs_unlink(struct mnt_idmap *, struct inode *, struct dentry *,
 /**
  * struct renamedata - contains all information required for renaming
  * @old_mnt_idmap:     idmap of the old mount the inode was found from
- * @old_dir:           parent of source
+ * @old_parent:        parent of source
  * @old_dentry:                source
  * @new_mnt_idmap:     idmap of the new mount the inode was found from
- * @new_dir:           parent of destination
+ * @new_parent:        parent of destination
  * @new_dentry:                destination
  * @delegated_inode:   returns an inode needing a delegation break
  * @flags:             rename flags
  */
 struct renamedata {
 	struct mnt_idmap *old_mnt_idmap;
-	struct inode *old_dir;
+	struct dentry *old_parent;
 	struct dentry *old_dentry;
 	struct mnt_idmap *new_mnt_idmap;
-	struct inode *new_dir;
+	struct dentry *new_parent;
 	struct dentry *new_dentry;
 	struct inode **delegated_inode;
 	unsigned int flags;
-- 
cgit v1.2.3


From 0da3e3822cfabf062945e449f91ea3ca529eeaa4 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 12 Jun 2025 15:25:19 +0200
Subject: fs: move name_contains_dotdot() to header

Move the helper from the firmware specific code to a header so we can
reuse it for coredump sockets.

Link: https://lore.kernel.org/20250612-work-coredump-massage-v1-5-315c0c34ba94@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 drivers/base/firmware_loader/main.c | 31 +++++++++++--------------------
 include/linux/fs.h                  | 16 ++++++++++++++++
 2 files changed, 27 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/drivers/base/firmware_loader/main.c b/drivers/base/firmware_loader/main.c
index 44486b2c7172..6942c62fa59d 100644
--- a/drivers/base/firmware_loader/main.c
+++ b/drivers/base/firmware_loader/main.c
@@ -822,26 +822,6 @@ static void fw_log_firmware_info(const struct firmware *fw, const char *name,
 {}
 #endif
 
-/*
- * Reject firmware file names with ".." path components.
- * There are drivers that construct firmware file names from device-supplied
- * strings, and we don't want some device to be able to tell us "I would like to
- * be sent my firmware from ../../../etc/shadow, please".
- *
- * Search for ".." surrounded by either '/' or start/end of string.
- *
- * This intentionally only looks at the firmware name, not at the firmware base
- * directory or at symlink contents.
- */
-static bool name_contains_dotdot(const char *name)
-{
-	size_t name_len = strlen(name);
-
-	return strcmp(name, "..") == 0 || strncmp(name, "../", 3) == 0 ||
-	       strstr(name, "/../") != NULL ||
-	       (name_len >= 3 && strcmp(name+name_len-3, "/..") == 0);
-}
-
 /* called from request_firmware() and request_firmware_work_func() */
 static int
 _request_firmware(const struct firmware **firmware_p, const char *name,
@@ -862,6 +842,17 @@ _request_firmware(const struct firmware **firmware_p, const char *name,
 		goto out;
 	}
 
+
+	/*
+	 * Reject firmware file names with ".." path components.
+	 * There are drivers that construct firmware file names from
+	 * device-supplied strings, and we don't want some device to be
+	 * able to tell us "I would like to be sent my firmware from
+	 * ../../../etc/shadow, please".
+	 *
+	 * This intentionally only looks at the firmware name, not at
+	 * the firmware base directory or at symlink contents.
+	 */
 	if (name_contains_dotdot(name)) {
 		dev_warn(device,
 			 "Firmware load for '%s' refused, path contains '..' component\n",
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 96c7925a6551..18fdbd184eea 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3264,6 +3264,22 @@ static inline bool is_dot_dotdot(const char *name, size_t len)
 		(len == 1 || (len == 2 && name[1] == '.'));
 }
 
+/**
+ * name_contains_dotdot - check if a file name contains ".." path components
+ *
+ * Search for ".." surrounded by either '/' or start/end of string.
+ */
+static inline bool name_contains_dotdot(const char *name)
+{
+	size_t name_len;
+
+	name_len = strlen(name);
+	return strcmp(name, "..") == 0 ||
+	       strncmp(name, "../", 3) == 0 ||
+	       strstr(name, "/../") != NULL ||
+	       (name_len >= 3 && strcmp(name + name_len - 3, "/..") == 0);
+}
+
 #include <linux/err.h>
 
 /* needed for stackable file system support */
-- 
cgit v1.2.3


From 70e3ee31282d293c794fb5bbec8efe495c32044b Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 12 Jun 2025 15:25:23 +0200
Subject: coredump: rename do_coredump() to vfs_coredump()

Align the naming with the rest of our helpers exposed
outside of core vfs.

Link: https://lore.kernel.org/20250612-work-coredump-massage-v1-9-315c0c34ba94@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 Documentation/security/credentials.rst                    | 2 +-
 Documentation/translations/zh_CN/security/credentials.rst | 2 +-
 fs/coredump.c                                             | 2 +-
 include/linux/coredump.h                                  | 4 ++--
 kernel/signal.c                                           | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/Documentation/security/credentials.rst b/Documentation/security/credentials.rst
index 2aa0791bcefe..d0191c8b8060 100644
--- a/Documentation/security/credentials.rst
+++ b/Documentation/security/credentials.rst
@@ -555,5 +555,5 @@ the VFS, and that can be done by calling into such as ``vfs_mkdir()`` with a
 different set of credentials.  This is done in the following places:
 
  * ``sys_faccessat()``.
- * ``do_coredump()``.
+ * ``vfs_coredump()``.
  * nfs4recover.c.
diff --git a/Documentation/translations/zh_CN/security/credentials.rst b/Documentation/translations/zh_CN/security/credentials.rst
index 91c353dfb622..88fcd9152ffe 100644
--- a/Documentation/translations/zh_CN/security/credentials.rst
+++ b/Documentation/translations/zh_CN/security/credentials.rst
@@ -475,5 +475,5 @@ const指针上操作，因此不需要进行类型转换，但需要临时放弃
 如 ``vfs_mkdir()`` 来实现。以下是一些进行此操作的位置:
 
  * ``sys_faccessat()``.
- * ``do_coredump()``.
+ * ``vfs_coredump()``.
  * nfs4recover.c.
diff --git a/fs/coredump.c b/fs/coredump.c
index 52efd1b34261..8a401eeee940 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -865,7 +865,7 @@ static inline void coredump_sock_wait(struct file *file) { }
 static inline void coredump_sock_shutdown(struct file *file) { }
 #endif
 
-void do_coredump(const kernel_siginfo_t *siginfo)
+void vfs_coredump(const kernel_siginfo_t *siginfo)
 {
 	struct core_state core_state;
 	struct core_name cn;
diff --git a/include/linux/coredump.h b/include/linux/coredump.h
index 76e41805b92d..96e8a66da133 100644
--- a/include/linux/coredump.h
+++ b/include/linux/coredump.h
@@ -43,7 +43,7 @@ extern int dump_emit(struct coredump_params *cprm, const void *addr, int nr);
 extern int dump_align(struct coredump_params *cprm, int align);
 int dump_user_range(struct coredump_params *cprm, unsigned long start,
 		    unsigned long len);
-extern void do_coredump(const kernel_siginfo_t *siginfo);
+extern void vfs_coredump(const kernel_siginfo_t *siginfo);
 
 /*
  * Logging for the coredump code, ratelimited.
@@ -63,7 +63,7 @@ extern void do_coredump(const kernel_siginfo_t *siginfo);
 #define coredump_report_failure(fmt, ...) __COREDUMP_PRINTK(KERN_WARNING, fmt, ##__VA_ARGS__)
 
 #else
-static inline void do_coredump(const kernel_siginfo_t *siginfo) {}
+static inline void vfs_coredump(const kernel_siginfo_t *siginfo) {}
 
 #define coredump_report(...)
 #define coredump_report_failure(...)
diff --git a/kernel/signal.c b/kernel/signal.c
index 148082db9a55..e2c928de7d2c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3016,7 +3016,7 @@ relock:
 			 * first and our do_group_exit call below will use
 			 * that value and ignore the one we pass it.
 			 */
-			do_coredump(&ksig->info);
+			vfs_coredump(&ksig->info);
 		}
 
 		/*
-- 
cgit v1.2.3


From fda6add9243867486f8cd456d7b05395d2132e0a Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Mon, 16 Jun 2025 13:59:20 -0400
Subject: workqueue: Basic memory allocation profiling support

Hook alloc_workqueue() and alloc_workqueue_attrs() so that their
allocations are accounted to the callsite. Since we're doing allocations
on behalf of another subsystem, this helps when using memory allocation
profiling to check for leaks.

Cc: Tejun Heo <tj@kernel.org>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h | 12 ++++++++----
 kernel/workqueue.c        | 14 +++++++-------
 2 files changed, 15 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 6e30f275da77..e907c9bb840c 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -6,6 +6,7 @@
 #ifndef _LINUX_WORKQUEUE_H
 #define _LINUX_WORKQUEUE_H
 
+#include <linux/alloc_tag.h>
 #include <linux/timer.h>
 #include <linux/linkage.h>
 #include <linux/bitops.h>
@@ -505,7 +506,8 @@ void workqueue_softirq_dead(unsigned int cpu);
  * Pointer to the allocated workqueue on success, %NULL on failure.
  */
 __printf(1, 4) struct workqueue_struct *
-alloc_workqueue(const char *fmt, unsigned int flags, int max_active, ...);
+alloc_workqueue_noprof(const char *fmt, unsigned int flags, int max_active, ...);
+#define alloc_workqueue(...)	alloc_hooks(alloc_workqueue_noprof(__VA_ARGS__))
 
 #ifdef CONFIG_LOCKDEP
 /**
@@ -544,8 +546,8 @@ alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, int max_active,
  * Pointer to the allocated workqueue on success, %NULL on failure.
  */
 #define alloc_ordered_workqueue_lockdep_map(fmt, flags, lockdep_map, args...)	\
-	alloc_workqueue_lockdep_map(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags),	\
-				    1, lockdep_map, ##args)
+	alloc_hooks(alloc_workqueue_lockdep_map(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags),\
+						1, lockdep_map, ##args))
 #endif
 
 /**
@@ -577,7 +579,9 @@ alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, int max_active,
 
 extern void destroy_workqueue(struct workqueue_struct *wq);
 
-struct workqueue_attrs *alloc_workqueue_attrs(void);
+struct workqueue_attrs *alloc_workqueue_attrs_noprof(void);
+#define alloc_workqueue_attrs(...)	alloc_hooks(alloc_workqueue_attrs_noprof(__VA_ARGS__))
+
 void free_workqueue_attrs(struct workqueue_attrs *attrs);
 int apply_workqueue_attrs(struct workqueue_struct *wq,
 			  const struct workqueue_attrs *attrs);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index d9de0f2a2e00..c24844afaa98 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4626,7 +4626,7 @@ void free_workqueue_attrs(struct workqueue_attrs *attrs)
  *
  * Return: The allocated new workqueue_attr on success. %NULL on failure.
  */
-struct workqueue_attrs *alloc_workqueue_attrs(void)
+struct workqueue_attrs *alloc_workqueue_attrs_noprof(void)
 {
 	struct workqueue_attrs *attrs;
 
@@ -5679,12 +5679,12 @@ static struct workqueue_struct *__alloc_workqueue(const char *fmt,
 	else
 		wq_size = sizeof(*wq);
 
-	wq = kzalloc(wq_size, GFP_KERNEL);
+	wq = kzalloc_noprof(wq_size, GFP_KERNEL);
 	if (!wq)
 		return NULL;
 
 	if (flags & WQ_UNBOUND) {
-		wq->unbound_attrs = alloc_workqueue_attrs();
+		wq->unbound_attrs = alloc_workqueue_attrs_noprof();
 		if (!wq->unbound_attrs)
 			goto err_free_wq;
 	}
@@ -5774,9 +5774,9 @@ err_destroy:
 }
 
 __printf(1, 4)
-struct workqueue_struct *alloc_workqueue(const char *fmt,
-					 unsigned int flags,
-					 int max_active, ...)
+struct workqueue_struct *alloc_workqueue_noprof(const char *fmt,
+						unsigned int flags,
+						int max_active, ...)
 {
 	struct workqueue_struct *wq;
 	va_list args;
@@ -5791,7 +5791,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
 
 	return wq;
 }
-EXPORT_SYMBOL_GPL(alloc_workqueue);
+EXPORT_SYMBOL_GPL(alloc_workqueue_noprof);
 
 #ifdef CONFIG_LOCKDEP
 __printf(1, 5)
-- 
cgit v1.2.3


From b1ba03c49a711c30e24735733dfd68f2422fa150 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Wed, 11 Jun 2025 18:34:21 +0900
Subject: scsi: core: Remember if a device is an ATA device

scsi_add_lun() tests the device vendor string of SCSI devices to detect
if a SCSI device is in fact an ATA device, in order to correctly handle
SATL power management. The function scsi_cdl_enable() also requires
knowing if a SCSI device is an ATA device to control the state of the
device CDL feature but this function does that by testing for the
presence of the VPD page 89h (ATA INFORMATION page).
sd_read_write_same() also has a similar test.

Simplify these different methods by adding the is_ata field to struct
scsi_device to remember that a SCSI device is in fact an ATA one based
on the device vendor name test. This field can also allow low level
SCSI host adapter drivers to take special actions for ATA devices
(e.g. to better handle ATA NCQ errors).

With this, simplify scsi_cdl_enable() and sd_read_write_same().

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Link: https://lore.kernel.org/r/20250611093421.2901633-1-dlemoal@kernel.org
Reviewed-by: Igor Pylypiv <ipylypiv@google.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/scsi.c        |  7 +------
 drivers/scsi/scsi_scan.c   |  3 ++-
 drivers/scsi/sd.c          | 13 ++++---------
 include/scsi/scsi_device.h |  5 +++++
 4 files changed, 12 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index 518a252eb6aa..534310224e8f 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -708,20 +708,15 @@ void scsi_cdl_check(struct scsi_device *sdev)
 int scsi_cdl_enable(struct scsi_device *sdev, bool enable)
 {
 	char buf[64];
-	bool is_ata;
 	int ret;
 
 	if (!sdev->cdl_supported)
 		return -EOPNOTSUPP;
 
-	rcu_read_lock();
-	is_ata = rcu_dereference(sdev->vpd_pg89);
-	rcu_read_unlock();
-
 	/*
 	 * For ATA devices, CDL needs to be enabled with a SET FEATURES command.
 	 */
-	if (is_ata) {
+	if (sdev->is_ata) {
 		struct scsi_mode_data data;
 		struct scsi_sense_hdr sshdr;
 		char *buf_data;
diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
index 4833b8fe251b..160c2f74c7e7 100644
--- a/drivers/scsi/scsi_scan.c
+++ b/drivers/scsi/scsi_scan.c
@@ -909,7 +909,8 @@ static int scsi_add_lun(struct scsi_device *sdev, unsigned char *inq_result,
 	sdev->model = (char *) (sdev->inquiry + 16);
 	sdev->rev = (char *) (sdev->inquiry + 32);
 
-	if (strncmp(sdev->vendor, "ATA     ", 8) == 0) {
+	sdev->is_ata = strncmp(sdev->vendor, "ATA     ", 8) == 0;
+	if (sdev->is_ata) {
 		/*
 		 * sata emulation layer device.  This is a hack to work around
 		 * the SATL power management specifications which state that
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 3f6e87705b62..daddef2e9e87 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -3459,19 +3459,14 @@ static void sd_read_write_same(struct scsi_disk *sdkp, unsigned char *buffer)
 	}
 
 	if (scsi_report_opcode(sdev, buffer, SD_BUF_SIZE, INQUIRY, 0) < 0) {
-		struct scsi_vpd *vpd;
-
 		sdev->no_report_opcodes = 1;
 
-		/* Disable WRITE SAME if REPORT SUPPORTED OPERATION
-		 * CODES is unsupported and the device has an ATA
-		 * Information VPD page (SAT).
+		/*
+		 * Disable WRITE SAME if REPORT SUPPORTED OPERATION CODES is
+		 * unsupported and this is an ATA device.
 		 */
-		rcu_read_lock();
-		vpd = rcu_dereference(sdev->vpd_pg89);
-		if (vpd)
+		if (sdev->is_ata)
 			sdev->no_write_same = 1;
-		rcu_read_unlock();
 	}
 
 	if (scsi_report_opcode(sdev, buffer, SD_BUF_SIZE, WRITE_SAME_16, 0) == 1)
diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
index 68dd49947d04..6d6500148c4b 100644
--- a/include/scsi/scsi_device.h
+++ b/include/scsi/scsi_device.h
@@ -184,6 +184,11 @@ struct scsi_device {
 	 */
 	unsigned force_runtime_start_on_system_start:1;
 
+	/*
+	 * Set if the device is an ATA device.
+	 */
+	unsigned is_ata:1;
+
 	unsigned removable:1;
 	unsigned changed:1;	/* Data invalid due to media change */
 	unsigned busy:1;	/* Used to prevent races */
-- 
cgit v1.2.3


From 6ad5ff6e7282d1252364cc08af88260ef0ec4cda Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Thu, 12 Jun 2025 18:02:19 +0200
Subject: libeth: convert to netmem

Back when the libeth Rx core was initially written, devmem was a draft
and netmem_ref didn't exist in the mainline. Now that it's here, make
libeth MP-agnostic before introducing any new code or any new library
users.
When it's known that the created PP/FQ is for header buffers, use the
faster "unsafe" underscored netmem <--> virt accessors:
netmem_is_net_iov() is always false in that case, yet checking it still
costs some cycles (bit test + true branch).

Reviewed-by: Mina Almasry <almasrymina@google.com>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/iavf/iavf_txrx.c        | 14 +++++----
 .../net/ethernet/intel/idpf/idpf_singleq_txrx.c    |  2 +-
 drivers/net/ethernet/intel/idpf/idpf_txrx.c        | 36 +++++++++++++---------
 drivers/net/ethernet/intel/libeth/rx.c             |  8 ++---
 include/net/libeth/rx.h                            | 22 +++++++------
 5 files changed, 46 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c
index 23e786b9793d..aaf70c625655 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c
@@ -723,7 +723,7 @@ static void iavf_clean_rx_ring(struct iavf_ring *rx_ring)
 	for (u32 i = rx_ring->next_to_clean; i != rx_ring->next_to_use; ) {
 		const struct libeth_fqe *rx_fqes = &rx_ring->rx_fqes[i];
 
-		page_pool_put_full_page(rx_ring->pp, rx_fqes->page, false);
+		libeth_rx_recycle_slow(rx_fqes->netmem);
 
 		if (unlikely(++i == rx_ring->count))
 			i = 0;
@@ -1197,10 +1197,11 @@ static void iavf_add_rx_frag(struct sk_buff *skb,
 			     const struct libeth_fqe *rx_buffer,
 			     unsigned int size)
 {
-	u32 hr = rx_buffer->page->pp->p.offset;
+	u32 hr = netmem_get_pp(rx_buffer->netmem)->p.offset;
 
-	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_buffer->page,
-			rx_buffer->offset + hr, size, rx_buffer->truesize);
+	skb_add_rx_frag_netmem(skb, skb_shinfo(skb)->nr_frags,
+			       rx_buffer->netmem, rx_buffer->offset + hr,
+			       size, rx_buffer->truesize);
 }
 
 /**
@@ -1214,12 +1215,13 @@ static void iavf_add_rx_frag(struct sk_buff *skb,
 static struct sk_buff *iavf_build_skb(const struct libeth_fqe *rx_buffer,
 				      unsigned int size)
 {
-	u32 hr = rx_buffer->page->pp->p.offset;
+	struct page *buf_page = __netmem_to_page(rx_buffer->netmem);
+	u32 hr = buf_page->pp->p.offset;
 	struct sk_buff *skb;
 	void *va;
 
 	/* prefetch first cache line of first page */
-	va = page_address(rx_buffer->page) + rx_buffer->offset;
+	va = page_address(buf_page) + rx_buffer->offset;
 	net_prefetch(va + hr);
 
 	/* build an skb around the page buffer */
diff --git a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c
index 993c354aa27a..555879b1248d 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c
@@ -1006,7 +1006,7 @@ static int idpf_rx_singleq_clean(struct idpf_rx_queue *rx_q, int budget)
 			break;
 
 skip_data:
-		rx_buf->page = NULL;
+		rx_buf->netmem = 0;
 
 		IDPF_SINGLEQ_BUMP_RING_IDX(rx_q, ntc);
 		cleaned_count++;
diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c
index 5cf440e09d0a..cef9dfb877e8 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c
@@ -383,12 +383,12 @@ err_out:
  */
 static void idpf_rx_page_rel(struct libeth_fqe *rx_buf)
 {
-	if (unlikely(!rx_buf->page))
+	if (unlikely(!rx_buf->netmem))
 		return;
 
-	page_pool_put_full_page(rx_buf->page->pp, rx_buf->page, false);
+	libeth_rx_recycle_slow(rx_buf->netmem);
 
-	rx_buf->page = NULL;
+	rx_buf->netmem = 0;
 	rx_buf->offset = 0;
 }
 
@@ -3240,10 +3240,10 @@ idpf_rx_process_skb_fields(struct idpf_rx_queue *rxq, struct sk_buff *skb,
 void idpf_rx_add_frag(struct idpf_rx_buf *rx_buf, struct sk_buff *skb,
 		      unsigned int size)
 {
-	u32 hr = rx_buf->page->pp->p.offset;
+	u32 hr = netmem_get_pp(rx_buf->netmem)->p.offset;
 
-	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_buf->page,
-			rx_buf->offset + hr, size, rx_buf->truesize);
+	skb_add_rx_frag_netmem(skb, skb_shinfo(skb)->nr_frags, rx_buf->netmem,
+			       rx_buf->offset + hr, size, rx_buf->truesize);
 }
 
 /**
@@ -3266,16 +3266,20 @@ static u32 idpf_rx_hsplit_wa(const struct libeth_fqe *hdr,
 			     struct libeth_fqe *buf, u32 data_len)
 {
 	u32 copy = data_len <= L1_CACHE_BYTES ? data_len : ETH_HLEN;
+	struct page *hdr_page, *buf_page;
 	const void *src;
 	void *dst;
 
-	if (!libeth_rx_sync_for_cpu(buf, copy))
+	if (unlikely(netmem_is_net_iov(buf->netmem)) ||
+	    !libeth_rx_sync_for_cpu(buf, copy))
 		return 0;
 
-	dst = page_address(hdr->page) + hdr->offset + hdr->page->pp->p.offset;
-	src = page_address(buf->page) + buf->offset + buf->page->pp->p.offset;
-	memcpy(dst, src, LARGEST_ALIGN(copy));
+	hdr_page = __netmem_to_page(hdr->netmem);
+	buf_page = __netmem_to_page(buf->netmem);
+	dst = page_address(hdr_page) + hdr->offset + hdr_page->pp->p.offset;
+	src = page_address(buf_page) + buf->offset + buf_page->pp->p.offset;
 
+	memcpy(dst, src, LARGEST_ALIGN(copy));
 	buf->offset += copy;
 
 	return copy;
@@ -3291,11 +3295,12 @@ static u32 idpf_rx_hsplit_wa(const struct libeth_fqe *hdr,
  */
 struct sk_buff *idpf_rx_build_skb(const struct libeth_fqe *buf, u32 size)
 {
-	u32 hr = buf->page->pp->p.offset;
+	struct page *buf_page = __netmem_to_page(buf->netmem);
+	u32 hr = buf_page->pp->p.offset;
 	struct sk_buff *skb;
 	void *va;
 
-	va = page_address(buf->page) + buf->offset;
+	va = page_address(buf_page) + buf->offset;
 	prefetch(va + hr);
 
 	skb = napi_build_skb(va, buf->truesize);
@@ -3429,7 +3434,8 @@ static int idpf_rx_splitq_clean(struct idpf_rx_queue *rxq, int budget)
 
 		if (unlikely(!hdr_len && !skb)) {
 			hdr_len = idpf_rx_hsplit_wa(hdr, rx_buf, pkt_len);
-			pkt_len -= hdr_len;
+			/* If failed, drop both buffers by setting len to 0 */
+			pkt_len -= hdr_len ? : pkt_len;
 
 			u64_stats_update_begin(&rxq->stats_sync);
 			u64_stats_inc(&rxq->q_stats.hsplit_buf_ovf);
@@ -3446,7 +3452,7 @@ static int idpf_rx_splitq_clean(struct idpf_rx_queue *rxq, int budget)
 			u64_stats_update_end(&rxq->stats_sync);
 		}
 
-		hdr->page = NULL;
+		hdr->netmem = 0;
 
 payload:
 		if (!libeth_rx_sync_for_cpu(rx_buf, pkt_len))
@@ -3462,7 +3468,7 @@ payload:
 			break;
 
 skip_data:
-		rx_buf->page = NULL;
+		rx_buf->netmem = 0;
 
 		idpf_rx_post_buf_refill(refillq, buf_id);
 		IDPF_RX_BUMP_NTC(rxq, ntc);
diff --git a/drivers/net/ethernet/intel/libeth/rx.c b/drivers/net/ethernet/intel/libeth/rx.c
index c2c53552c440..2afa6e33f160 100644
--- a/drivers/net/ethernet/intel/libeth/rx.c
+++ b/drivers/net/ethernet/intel/libeth/rx.c
@@ -204,14 +204,14 @@ void libeth_rx_fq_destroy(struct libeth_fq *fq)
 EXPORT_SYMBOL_GPL(libeth_rx_fq_destroy);
 
 /**
- * libeth_rx_recycle_slow - recycle a libeth page from the NAPI context
- * @page: page to recycle
+ * libeth_rx_recycle_slow - recycle libeth netmem
+ * @netmem: network memory to recycle
  *
  * To be used on exceptions or rare cases not requiring fast inline recycling.
  */
-void libeth_rx_recycle_slow(struct page *page)
+void __cold libeth_rx_recycle_slow(netmem_ref netmem)
 {
-	page_pool_recycle_direct(page->pp, page);
+	page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false);
 }
 EXPORT_SYMBOL_GPL(libeth_rx_recycle_slow);
 
diff --git a/include/net/libeth/rx.h b/include/net/libeth/rx.h
index ab05024be518..7d5dc58984b1 100644
--- a/include/net/libeth/rx.h
+++ b/include/net/libeth/rx.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* Copyright (C) 2024 Intel Corporation */
+/* Copyright (C) 2024-2025 Intel Corporation */
 
 #ifndef __LIBETH_RX_H
 #define __LIBETH_RX_H
@@ -31,7 +31,7 @@
 
 /**
  * struct libeth_fqe - structure representing an Rx buffer (fill queue element)
- * @page: page holding the buffer
+ * @netmem: network memory reference holding the buffer
  * @offset: offset from the page start (to the headroom)
  * @truesize: total space occupied by the buffer (w/ headroom and tailroom)
  *
@@ -40,7 +40,7 @@
  * former, @offset is always 0 and @truesize is always ```PAGE_SIZE```.
  */
 struct libeth_fqe {
-	struct page		*page;
+	netmem_ref		netmem;
 	u32			offset;
 	u32			truesize;
 } __aligned_largest;
@@ -102,15 +102,16 @@ static inline dma_addr_t libeth_rx_alloc(const struct libeth_fq_fp *fq, u32 i)
 	struct libeth_fqe *buf = &fq->fqes[i];
 
 	buf->truesize = fq->truesize;
-	buf->page = page_pool_dev_alloc(fq->pp, &buf->offset, &buf->truesize);
-	if (unlikely(!buf->page))
+	buf->netmem = page_pool_dev_alloc_netmem(fq->pp, &buf->offset,
+						 &buf->truesize);
+	if (unlikely(!buf->netmem))
 		return DMA_MAPPING_ERROR;
 
-	return page_pool_get_dma_addr(buf->page) + buf->offset +
+	return page_pool_get_dma_addr_netmem(buf->netmem) + buf->offset +
 	       fq->pp->p.offset;
 }
 
-void libeth_rx_recycle_slow(struct page *page);
+void libeth_rx_recycle_slow(netmem_ref netmem);
 
 /**
  * libeth_rx_sync_for_cpu - synchronize or recycle buffer post DMA
@@ -126,18 +127,19 @@ void libeth_rx_recycle_slow(struct page *page);
 static inline bool libeth_rx_sync_for_cpu(const struct libeth_fqe *fqe,
 					  u32 len)
 {
-	struct page *page = fqe->page;
+	netmem_ref netmem = fqe->netmem;
 
 	/* Very rare, but possible case. The most common reason:
 	 * the last fragment contained FCS only, which was then
 	 * stripped by the HW.
 	 */
 	if (unlikely(!len)) {
-		libeth_rx_recycle_slow(page);
+		libeth_rx_recycle_slow(netmem);
 		return false;
 	}
 
-	page_pool_dma_sync_for_cpu(page->pp, page, fqe->offset, len);
+	page_pool_dma_sync_netmem_for_cpu(netmem_get_pp(netmem), netmem,
+					  fqe->offset, len);
 
 	return true;
 }
-- 
cgit v1.2.3


From 35c64b6500ef7308155bf0dc556c646e4d7b0fd3 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Thu, 12 Jun 2025 18:02:20 +0200
Subject: libeth: support native XDP and register memory model

Expand libeth's Page Pool functionality by adding native XDP support.
This means picking the appropriate headroom and DMA direction.
Also, register all the created &page_pools as XDP memory models.
A driver then can call xdp_rxq_info_attach_page_pool() when registering
its RxQ info.

Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/libeth/rx.c | 20 +++++++++++++++-----
 include/net/libeth/rx.h                |  6 +++++-
 2 files changed, 20 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/intel/libeth/rx.c b/drivers/net/ethernet/intel/libeth/rx.c
index 2afa6e33f160..62521a1f4ec9 100644
--- a/drivers/net/ethernet/intel/libeth/rx.c
+++ b/drivers/net/ethernet/intel/libeth/rx.c
@@ -72,7 +72,7 @@ static u32 libeth_rx_hw_len_truesize(const struct page_pool_params *pp,
 static bool libeth_rx_page_pool_params(struct libeth_fq *fq,
 				       struct page_pool_params *pp)
 {
-	pp->offset = LIBETH_SKB_HEADROOM;
+	pp->offset = fq->xdp ? LIBETH_XDP_HEADROOM : LIBETH_SKB_HEADROOM;
 	/* HW-writeable / syncable length per one page */
 	pp->max_len = LIBETH_RX_PAGE_LEN(pp->offset);
 
@@ -159,11 +159,12 @@ int libeth_rx_fq_create(struct libeth_fq *fq, struct napi_struct *napi)
 		.dev		= napi->dev->dev.parent,
 		.netdev		= napi->dev,
 		.napi		= napi,
-		.dma_dir	= DMA_FROM_DEVICE,
 	};
 	struct libeth_fqe *fqes;
 	struct page_pool *pool;
-	bool ret;
+	int ret;
+
+	pp.dma_dir = fq->xdp ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
 
 	if (!fq->hsplit)
 		ret = libeth_rx_page_pool_params(fq, &pp);
@@ -177,18 +178,26 @@ int libeth_rx_fq_create(struct libeth_fq *fq, struct napi_struct *napi)
 		return PTR_ERR(pool);
 
 	fqes = kvcalloc_node(fq->count, sizeof(*fqes), GFP_KERNEL, fq->nid);
-	if (!fqes)
+	if (!fqes) {
+		ret = -ENOMEM;
 		goto err_buf;
+	}
+
+	ret = xdp_reg_page_pool(pool);
+	if (ret)
+		goto err_mem;
 
 	fq->fqes = fqes;
 	fq->pp = pool;
 
 	return 0;
 
+err_mem:
+	kvfree(fqes);
 err_buf:
 	page_pool_destroy(pool);
 
-	return -ENOMEM;
+	return ret;
 }
 EXPORT_SYMBOL_GPL(libeth_rx_fq_create);
 
@@ -198,6 +207,7 @@ EXPORT_SYMBOL_GPL(libeth_rx_fq_create);
  */
 void libeth_rx_fq_destroy(struct libeth_fq *fq)
 {
+	xdp_unreg_page_pool(fq->pp);
 	kvfree(fq->fqes);
 	page_pool_destroy(fq->pp);
 }
diff --git a/include/net/libeth/rx.h b/include/net/libeth/rx.h
index 7d5dc58984b1..5d991404845e 100644
--- a/include/net/libeth/rx.h
+++ b/include/net/libeth/rx.h
@@ -13,8 +13,10 @@
 
 /* Space reserved in front of each frame */
 #define LIBETH_SKB_HEADROOM	(NET_SKB_PAD + NET_IP_ALIGN)
+#define LIBETH_XDP_HEADROOM	(ALIGN(XDP_PACKET_HEADROOM, NET_SKB_PAD) + \
+				 NET_IP_ALIGN)
 /* Maximum headroom for worst-case calculations */
-#define LIBETH_MAX_HEADROOM	LIBETH_SKB_HEADROOM
+#define LIBETH_MAX_HEADROOM	LIBETH_XDP_HEADROOM
 /* Link layer / L2 overhead: Ethernet, 2 VLAN tags (C + S), FCS */
 #define LIBETH_RX_LL_LEN	(ETH_HLEN + 2 * VLAN_HLEN + ETH_FCS_LEN)
 /* Maximum supported L2-L4 header length */
@@ -66,6 +68,7 @@ enum libeth_fqe_type {
  * @count: number of descriptors/buffers the queue has
  * @type: type of the buffers this queue has
  * @hsplit: flag whether header split is enabled
+ * @xdp: flag indicating whether XDP is enabled
  * @buf_len: HW-writeable length per each buffer
  * @nid: ID of the closest NUMA node with memory
  */
@@ -81,6 +84,7 @@ struct libeth_fq {
 	/* Cold fields */
 	enum libeth_fqe_type	type:2;
 	bool			hsplit:1;
+	bool			xdp:1;
 
 	u32			buf_len;
 	int			nid;
-- 
cgit v1.2.3


From 8591c3afe8882a00d9070daf78c384b003b596f3 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Thu, 12 Jun 2025 18:02:21 +0200
Subject: libeth: xdp: add XDP_TX buffers sending

Start adding XDP-specific code to libeth, namely handling XDP_TX buffers
(only sending).
The idea is that we accumulate up to 16 buffers on the stack, then,
if either the limit is reached or the polling is finished, flush them
at once with only one XDPSQ cleaning (if needed). The main sending
function will be aware of the sending budget and already have all the
info to send the buffers, so it can't fail.
Drivers need to provide 2 inline callbacks to the main sending function:
for cleaning an XDPSQ and for filling descriptors; the library code
takes care of the rest.
Note that unlike the generic code, multi-buffer support is not wrapped
here with unlikely() to not hurt header split setups.

&libeth_xdp_buff is a simple extension over &xdp_buff which has a direct
pointer to the corresponding Rx descriptor (and, luckily, precisely 1 CL
size and 16-byte alignment on x86_64).

Suggested-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com> # xmit logic
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/libeth/Kconfig  |  10 +-
 drivers/net/ethernet/intel/libeth/Makefile |   6 +-
 drivers/net/ethernet/intel/libeth/xdp.c    |  89 +++++
 include/net/libeth/tx.h                    |  11 +-
 include/net/libeth/xdp.h                   | 541 +++++++++++++++++++++++++++++
 5 files changed, 652 insertions(+), 5 deletions(-)
 create mode 100644 drivers/net/ethernet/intel/libeth/xdp.c
 create mode 100644 include/net/libeth/xdp.h

(limited to 'include')

diff --git a/drivers/net/ethernet/intel/libeth/Kconfig b/drivers/net/ethernet/intel/libeth/Kconfig
index 480293b71dbc..d8c4926574fb 100644
--- a/drivers/net/ethernet/intel/libeth/Kconfig
+++ b/drivers/net/ethernet/intel/libeth/Kconfig
@@ -1,9 +1,15 @@
 # SPDX-License-Identifier: GPL-2.0-only
-# Copyright (C) 2024 Intel Corporation
+# Copyright (C) 2024-2025 Intel Corporation
 
 config LIBETH
-	tristate
+	tristate "Common Ethernet library (libeth)" if COMPILE_TEST
 	select PAGE_POOL
 	help
 	  libeth is a common library containing routines shared between several
 	  drivers, but not yet promoted to the generic kernel API.
+
+config LIBETH_XDP
+	tristate "Common XDP library (libeth_xdp)" if COMPILE_TEST
+	select LIBETH
+	help
+	  XDP helpers based on libeth hotpath management.
diff --git a/drivers/net/ethernet/intel/libeth/Makefile b/drivers/net/ethernet/intel/libeth/Makefile
index 52492b081132..9ba78f463f2e 100644
--- a/drivers/net/ethernet/intel/libeth/Makefile
+++ b/drivers/net/ethernet/intel/libeth/Makefile
@@ -1,6 +1,10 @@
 # SPDX-License-Identifier: GPL-2.0-only
-# Copyright (C) 2024 Intel Corporation
+# Copyright (C) 2024-2025 Intel Corporation
 
 obj-$(CONFIG_LIBETH)		+= libeth.o
 
 libeth-y			:= rx.o
+
+obj-$(CONFIG_LIBETH_XDP)	+= libeth_xdp.o
+
+libeth_xdp-y			+= xdp.o
diff --git a/drivers/net/ethernet/intel/libeth/xdp.c b/drivers/net/ethernet/intel/libeth/xdp.c
new file mode 100644
index 000000000000..444449c72221
--- /dev/null
+++ b/drivers/net/ethernet/intel/libeth/xdp.c
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2025 Intel Corporation */
+
+#define DEFAULT_SYMBOL_NAMESPACE	"LIBETH_XDP"
+
+#include <linux/export.h>
+
+#include <net/libeth/xdp.h>
+
+/* ``XDP_TX`` bulking */
+
+static void __cold
+libeth_xdp_tx_return_one(const struct libeth_xdp_tx_frame *frm)
+{
+	if (frm->len_fl & LIBETH_XDP_TX_MULTI)
+		libeth_xdp_return_frags(frm->data + frm->soff, true);
+
+	libeth_xdp_return_va(frm->data, true);
+}
+
+static void __cold
+libeth_xdp_tx_return_bulk(const struct libeth_xdp_tx_frame *bq, u32 count)
+{
+	for (u32 i = 0; i < count; i++) {
+		const struct libeth_xdp_tx_frame *frm = &bq[i];
+
+		if (!(frm->len_fl & LIBETH_XDP_TX_FIRST))
+			continue;
+
+		libeth_xdp_tx_return_one(frm);
+	}
+}
+
+static void __cold libeth_trace_xdp_exception(const struct net_device *dev,
+					      const struct bpf_prog *prog,
+					      u32 act)
+{
+	trace_xdp_exception(dev, prog, act);
+}
+
+/**
+ * libeth_xdp_tx_exception - handle Tx exceptions of XDP frames
+ * @bq: XDP Tx frame bulk
+ * @sent: number of frames sent successfully (from this bulk)
+ * @flags: internal libeth_xdp flags
+ *
+ * Cold helper used by __libeth_xdp_tx_flush_bulk(), do not call directly.
+ * Reports XDP Tx exceptions, frees the frames that won't be sent or adjust
+ * the Tx bulk to try again later.
+ */
+void __cold libeth_xdp_tx_exception(struct libeth_xdp_tx_bulk *bq, u32 sent,
+				    u32 flags)
+{
+	const struct libeth_xdp_tx_frame *pos = &bq->bulk[sent];
+	u32 left = bq->count - sent;
+
+	libeth_trace_xdp_exception(bq->dev, bq->prog, XDP_TX);
+
+	if (!(flags & LIBETH_XDP_TX_DROP)) {
+		memmove(bq->bulk, pos, left * sizeof(*bq->bulk));
+		bq->count = left;
+
+		return;
+	}
+
+	libeth_xdp_tx_return_bulk(pos, left);
+
+	bq->count = 0;
+}
+EXPORT_SYMBOL_GPL(libeth_xdp_tx_exception);
+
+/* Rx polling path */
+
+/**
+ * libeth_xdp_return_buff_slow - free &libeth_xdp_buff
+ * @xdp: buffer to free/return
+ *
+ * Slowpath version of libeth_xdp_return_buff() to be called on exceptions,
+ * queue clean-ups etc., without unwanted inlining.
+ */
+void __cold libeth_xdp_return_buff_slow(struct libeth_xdp_buff *xdp)
+{
+	__libeth_xdp_return_buff(xdp, false);
+}
+EXPORT_SYMBOL_GPL(libeth_xdp_return_buff_slow);
+
+MODULE_DESCRIPTION("Common Ethernet library - XDP infra");
+MODULE_IMPORT_NS("LIBETH");
+MODULE_LICENSE("GPL");
diff --git a/include/net/libeth/tx.h b/include/net/libeth/tx.h
index 35614f9523f6..3e68d11914f7 100644
--- a/include/net/libeth/tx.h
+++ b/include/net/libeth/tx.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* Copyright (C) 2024 Intel Corporation */
+/* Copyright (C) 2024-2025 Intel Corporation */
 
 #ifndef __LIBETH_TX_H
 #define __LIBETH_TX_H
@@ -12,11 +12,13 @@
 
 /**
  * enum libeth_sqe_type - type of &libeth_sqe to act on Tx completion
- * @LIBETH_SQE_EMPTY: unused/empty, no action required
+ * @LIBETH_SQE_EMPTY: unused/empty OR XDP_TX frag, no action required
  * @LIBETH_SQE_CTX: context descriptor with empty SQE, no action required
  * @LIBETH_SQE_SLAB: kmalloc-allocated buffer, unmap and kfree()
  * @LIBETH_SQE_FRAG: mapped skb frag, only unmap DMA
  * @LIBETH_SQE_SKB: &sk_buff, unmap and napi_consume_skb(), update stats
+ * @__LIBETH_SQE_XDP_START: separator between skb and XDP types
+ * @LIBETH_SQE_XDP_TX: &skb_shared_info, libeth_xdp_return_buff_bulk(), stats
  */
 enum libeth_sqe_type {
 	LIBETH_SQE_EMPTY		= 0U,
@@ -24,6 +26,9 @@ enum libeth_sqe_type {
 	LIBETH_SQE_SLAB,
 	LIBETH_SQE_FRAG,
 	LIBETH_SQE_SKB,
+
+	__LIBETH_SQE_XDP_START,
+	LIBETH_SQE_XDP_TX		= __LIBETH_SQE_XDP_START,
 };
 
 /**
@@ -32,6 +37,7 @@ enum libeth_sqe_type {
  * @rs_idx: index of the last buffer from the batch this one was sent in
  * @raw: slab buffer to free via kfree()
  * @skb: &sk_buff to consume
+ * @sinfo: skb shared info of an XDP_TX frame
  * @dma: DMA address to unmap
  * @len: length of the mapped region to unmap
  * @nr_frags: number of frags in the frame this buffer belongs to
@@ -46,6 +52,7 @@ struct libeth_sqe {
 	union {
 		void				*raw;
 		struct sk_buff			*skb;
+		struct skb_shared_info		*sinfo;
 	};
 
 	DEFINE_DMA_UNMAP_ADDR(dma);
diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h
new file mode 100644
index 000000000000..4988453a3d70
--- /dev/null
+++ b/include/net/libeth/xdp.h
@@ -0,0 +1,541 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (C) 2025 Intel Corporation */
+
+#ifndef __LIBETH_XDP_H
+#define __LIBETH_XDP_H
+
+#include <linux/bpf_trace.h>
+#include <linux/unroll.h>
+
+#include <net/libeth/rx.h>
+#include <net/libeth/tx.h>
+#include <net/xsk_buff_pool.h>
+
+/*
+ * &xdp_buff_xsk is the largest structure &libeth_xdp_buff gets casted to,
+ * pick maximum pointer-compatible alignment.
+ */
+#define __LIBETH_XDP_BUFF_ALIGN						      \
+	(IS_ALIGNED(sizeof(struct xdp_buff_xsk), 16) ? 16 :		      \
+	 IS_ALIGNED(sizeof(struct xdp_buff_xsk), 8) ? 8 :		      \
+	 sizeof(long))
+
+/**
+ * struct libeth_xdp_buff - libeth extension over &xdp_buff
+ * @base: main &xdp_buff
+ * @data: shortcut for @base.data
+ * @desc: RQ descriptor containing metadata for this buffer
+ * @priv: driver-private scratchspace
+ *
+ * The main reason for this is to have a pointer to the descriptor to be able
+ * to quickly get frame metadata from xdpmo and driver buff-to-xdp callbacks
+ * (as well as bigger alignment).
+ * Pointer/layout-compatible with &xdp_buff and &xdp_buff_xsk.
+ */
+struct libeth_xdp_buff {
+	union {
+		struct xdp_buff		base;
+		void			*data;
+	};
+
+	const void			*desc;
+	unsigned long			priv[]
+					__aligned(__LIBETH_XDP_BUFF_ALIGN);
+} __aligned(__LIBETH_XDP_BUFF_ALIGN);
+static_assert(offsetof(struct libeth_xdp_buff, data) ==
+	      offsetof(struct xdp_buff_xsk, xdp.data));
+static_assert(offsetof(struct libeth_xdp_buff, desc) ==
+	      offsetof(struct xdp_buff_xsk, cb));
+static_assert(IS_ALIGNED(sizeof(struct xdp_buff_xsk),
+			 __alignof(struct libeth_xdp_buff)));
+
+/* Common Tx bits */
+
+/**
+ * enum - libeth_xdp internal Tx flags
+ * @LIBETH_XDP_TX_BULK: one bulk size at which it will be flushed to the queue
+ * @LIBETH_XDP_TX_BATCH: batch size for which the queue fill loop is unrolled
+ * @LIBETH_XDP_TX_DROP: indicates the send function must drop frames not sent
+ */
+enum {
+	LIBETH_XDP_TX_BULK		= DEV_MAP_BULK_SIZE,
+	LIBETH_XDP_TX_BATCH		= 8,
+
+	LIBETH_XDP_TX_DROP		= BIT(0),
+};
+
+/**
+ * enum - &libeth_xdp_tx_frame and &libeth_xdp_tx_desc flags
+ * @LIBETH_XDP_TX_LEN: only for ``XDP_TX``, [15:0] of ::len_fl is actual length
+ * @LIBETH_XDP_TX_FIRST: indicates the frag is the first one of the frame
+ * @LIBETH_XDP_TX_LAST: whether the frag is the last one of the frame
+ * @LIBETH_XDP_TX_MULTI: whether the frame contains several frags
+ * @LIBETH_XDP_TX_FLAGS: only for ``XDP_TX``, [31:16] of ::len_fl is flags
+ */
+enum {
+	LIBETH_XDP_TX_LEN		= GENMASK(15, 0),
+
+	LIBETH_XDP_TX_FIRST		= BIT(16),
+	LIBETH_XDP_TX_LAST		= BIT(17),
+	LIBETH_XDP_TX_MULTI		= BIT(18),
+
+	LIBETH_XDP_TX_FLAGS		= GENMASK(31, 16),
+};
+
+/**
+ * struct libeth_xdp_tx_frame - represents one XDP Tx element
+ * @data: frame start pointer for ``XDP_TX``
+ * @len_fl: ``XDP_TX``, combined flags [31:16] and len [15:0] field for speed
+ * @soff: ``XDP_TX``, offset from @data to the start of &skb_shared_info
+ * @frag: one (non-head) frag for ``XDP_TX``
+ */
+struct libeth_xdp_tx_frame {
+	union {
+		/* ``XDP_TX`` */
+		struct {
+			void				*data;
+			u32				len_fl;
+			u32				soff;
+		};
+
+		/* ``XDP_TX`` frag */
+		skb_frag_t			frag;
+	};
+} __aligned_largest;
+static_assert(offsetof(struct libeth_xdp_tx_frame, frag.len) ==
+	      offsetof(struct libeth_xdp_tx_frame, len_fl));
+
+/**
+ * struct libeth_xdp_tx_bulk - XDP Tx frame bulk for bulk sending
+ * @prog: corresponding active XDP program
+ * @dev: &net_device which the frames are transmitted on
+ * @xdpsq: shortcut to the corresponding driver-specific XDPSQ structure
+ * @count: current number of frames in @bulk
+ * @bulk: array of queued frames for bulk Tx
+ *
+ * All XDP Tx operations queue each frame to the bulk first and flush it
+ * when @count reaches the array end. Bulk is always placed on the stack
+ * for performance. One bulk element contains all the data necessary
+ * for sending a frame and then freeing it on completion.
+ */
+struct libeth_xdp_tx_bulk {
+	const struct bpf_prog		*prog;
+	struct net_device		*dev;
+	void				*xdpsq;
+
+	u32				count;
+	struct libeth_xdp_tx_frame	bulk[LIBETH_XDP_TX_BULK];
+} __aligned(sizeof(struct libeth_xdp_tx_frame));
+
+/**
+ * LIBETH_XDP_ONSTACK_BULK - declare &libeth_xdp_tx_bulk on the stack
+ * @bq: name of the variable to declare
+ *
+ * Helper to declare a bulk on the stack with a compiler hint that it should
+ * not be initialized automatically (with `CONFIG_INIT_STACK_ALL_*`) for
+ * performance reasons.
+ */
+#define LIBETH_XDP_ONSTACK_BULK(bq)					      \
+	struct libeth_xdp_tx_bulk bq __uninitialized
+
+/**
+ * struct libeth_xdpsq - abstraction for an XDPSQ
+ * @sqes: array of Tx buffers from the actual queue struct
+ * @descs: opaque pointer to the HW descriptor array
+ * @ntu: pointer to the next free descriptor index
+ * @count: number of descriptors on that queue
+ * @pending: pointer to the number of sent-not-completed descs on that queue
+ * @xdp_tx: pointer to the above
+ *
+ * Abstraction for driver-independent implementation of Tx. Placed on the stack
+ * and filled by the driver before the transmission, so that the generic
+ * functions can access and modify driver-specific resources.
+ */
+struct libeth_xdpsq {
+	struct libeth_sqe		*sqes;
+	void				*descs;
+
+	u32				*ntu;
+	u32				count;
+
+	u32				*pending;
+	u32				*xdp_tx;
+};
+
+/**
+ * struct libeth_xdp_tx_desc - abstraction for an XDP Tx descriptor
+ * @addr: DMA address of the frame
+ * @len: length of the frame
+ * @flags: XDP Tx flags
+ * @opts: combined @len + @flags for speed
+ *
+ * Filled by the generic functions and then passed to driver-specific functions
+ * to fill a HW Tx descriptor, always placed on the [function] stack.
+ */
+struct libeth_xdp_tx_desc {
+	dma_addr_t			addr;
+	union {
+		struct {
+			u32				len;
+			u32				flags;
+		};
+		aligned_u64			opts;
+	};
+} __aligned_largest;
+
+/**
+ * libeth_xdp_tx_xmit_bulk - main XDP Tx function
+ * @bulk: array of frames to send
+ * @xdpsq: pointer to the driver-specific XDPSQ struct
+ * @n: number of frames to send
+ * @unroll: whether to unroll the queue filling loop for speed
+ * @priv: driver-specific private data
+ * @prep: callback for cleaning the queue and filling abstract &libeth_xdpsq
+ * @fill: internal callback for filling &libeth_sqe and &libeth_xdp_tx_desc
+ * @xmit: callback for filling a HW descriptor with the frame info
+ *
+ * Internal abstraction for placing @n XDP Tx frames on the HW XDPSQ. Used for
+ * all types of frames.
+ * @unroll greatly increases the object code size, but also greatly increases
+ * performance.
+ * The compilers inline all those onstack abstractions to direct data accesses.
+ *
+ * Return: number of frames actually placed on the queue, <= @n. The function
+ * can't fail, but can send less frames if there's no enough free descriptors
+ * available. The actual free space is returned by @prep from the driver.
+ */
+static __always_inline u32
+libeth_xdp_tx_xmit_bulk(const struct libeth_xdp_tx_frame *bulk, void *xdpsq,
+			u32 n, bool unroll, u64 priv,
+			u32 (*prep)(void *xdpsq, struct libeth_xdpsq *sq),
+			struct libeth_xdp_tx_desc
+			(*fill)(struct libeth_xdp_tx_frame frm, u32 i,
+				const struct libeth_xdpsq *sq, u64 priv),
+			void (*xmit)(struct libeth_xdp_tx_desc desc, u32 i,
+				     const struct libeth_xdpsq *sq, u64 priv))
+{
+	struct libeth_xdpsq sq __uninitialized;
+	u32 this, batched, off = 0;
+	u32 ntu, i = 0;
+
+	n = min(n, prep(xdpsq, &sq));
+	if (unlikely(!n))
+		return 0;
+
+	ntu = *sq.ntu;
+
+	this = sq.count - ntu;
+	if (likely(this > n))
+		this = n;
+
+again:
+	if (!unroll)
+		goto linear;
+
+	batched = ALIGN_DOWN(this, LIBETH_XDP_TX_BATCH);
+
+	for ( ; i < off + batched; i += LIBETH_XDP_TX_BATCH) {
+		u32 base = ntu + i - off;
+
+		unrolled_count(LIBETH_XDP_TX_BATCH)
+		for (u32 j = 0; j < LIBETH_XDP_TX_BATCH; j++)
+			xmit(fill(bulk[i + j], base + j, &sq, priv),
+			     base + j, &sq, priv);
+	}
+
+	if (batched < this) {
+linear:
+		for ( ; i < off + this; i++)
+			xmit(fill(bulk[i], ntu + i - off, &sq, priv),
+			     ntu + i - off, &sq, priv);
+	}
+
+	ntu += this;
+	if (likely(ntu < sq.count))
+		goto out;
+
+	ntu = 0;
+
+	if (i < n) {
+		this = n - i;
+		off = i;
+
+		goto again;
+	}
+
+out:
+	*sq.ntu = ntu;
+	*sq.pending += n;
+	if (sq.xdp_tx)
+		*sq.xdp_tx += n;
+
+	return n;
+}
+
+/* ``XDP_TX`` bulking */
+
+void libeth_xdp_return_buff_slow(struct libeth_xdp_buff *xdp);
+
+/**
+ * libeth_xdp_tx_queue_head - internal helper for queueing one ``XDP_TX`` head
+ * @bq: XDP Tx bulk to queue the head frag to
+ * @xdp: XDP buffer with the head to queue
+ *
+ * Return: false if it's the only frag of the frame, true if it's an S/G frame.
+ */
+static inline bool libeth_xdp_tx_queue_head(struct libeth_xdp_tx_bulk *bq,
+					    const struct libeth_xdp_buff *xdp)
+{
+	const struct xdp_buff *base = &xdp->base;
+
+	bq->bulk[bq->count++] = (typeof(*bq->bulk)){
+		.data	= xdp->data,
+		.len_fl	= (base->data_end - xdp->data) | LIBETH_XDP_TX_FIRST,
+		.soff	= xdp_data_hard_end(base) - xdp->data,
+	};
+
+	if (!xdp_buff_has_frags(base))
+		return false;
+
+	bq->bulk[bq->count - 1].len_fl |= LIBETH_XDP_TX_MULTI;
+
+	return true;
+}
+
+/**
+ * libeth_xdp_tx_queue_frag - internal helper for queueing one ``XDP_TX`` frag
+ * @bq: XDP Tx bulk to queue the frag to
+ * @frag: frag to queue
+ */
+static inline void libeth_xdp_tx_queue_frag(struct libeth_xdp_tx_bulk *bq,
+					    const skb_frag_t *frag)
+{
+	bq->bulk[bq->count++].frag = *frag;
+}
+
+/**
+ * libeth_xdp_tx_queue_bulk - internal helper for queueing one ``XDP_TX`` frame
+ * @bq: XDP Tx bulk to queue the frame to
+ * @xdp: XDP buffer to queue
+ * @flush_bulk: driver callback to flush the bulk to the HW queue
+ *
+ * Return: true on success, false on flush error.
+ */
+static __always_inline bool
+libeth_xdp_tx_queue_bulk(struct libeth_xdp_tx_bulk *bq,
+			 struct libeth_xdp_buff *xdp,
+			 bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq,
+					    u32 flags))
+{
+	const struct skb_shared_info *sinfo;
+	bool ret = true;
+	u32 nr_frags;
+
+	if (unlikely(bq->count == LIBETH_XDP_TX_BULK) &&
+	    unlikely(!flush_bulk(bq, 0))) {
+		libeth_xdp_return_buff_slow(xdp);
+		return false;
+	}
+
+	if (!libeth_xdp_tx_queue_head(bq, xdp))
+		goto out;
+
+	sinfo = xdp_get_shared_info_from_buff(&xdp->base);
+	nr_frags = sinfo->nr_frags;
+
+	for (u32 i = 0; i < nr_frags; i++) {
+		if (unlikely(bq->count == LIBETH_XDP_TX_BULK) &&
+		    unlikely(!flush_bulk(bq, 0))) {
+			ret = false;
+			break;
+		}
+
+		libeth_xdp_tx_queue_frag(bq, &sinfo->frags[i]);
+	}
+
+out:
+	bq->bulk[bq->count - 1].len_fl |= LIBETH_XDP_TX_LAST;
+	xdp->data = NULL;
+
+	return ret;
+}
+
+/**
+ * libeth_xdp_tx_fill_stats - fill &libeth_sqe with ``XDP_TX`` frame stats
+ * @sqe: SQ element to fill
+ * @desc: libeth_xdp Tx descriptor
+ * @sinfo: &skb_shared_info for this frame
+ *
+ * Internal helper for filling an SQE with the frame stats, do not use in
+ * drivers. Fills the number of frags and bytes for this frame.
+ */
+#define libeth_xdp_tx_fill_stats(sqe, desc, sinfo)			      \
+	__libeth_xdp_tx_fill_stats(sqe, desc, sinfo, __UNIQUE_ID(sqe_),	      \
+				   __UNIQUE_ID(desc_), __UNIQUE_ID(sinfo_))
+
+#define __libeth_xdp_tx_fill_stats(sqe, desc, sinfo, ue, ud, us) do {	      \
+	const struct libeth_xdp_tx_desc *ud = (desc);			      \
+	const struct skb_shared_info *us;				      \
+	struct libeth_sqe *ue = (sqe);					      \
+									      \
+	ue->nr_frags = 1;						      \
+	ue->bytes = ud->len;						      \
+									      \
+	if (ud->flags & LIBETH_XDP_TX_MULTI) {				      \
+		us = (sinfo);						      \
+		ue->nr_frags += us->nr_frags;				      \
+		ue->bytes += us->xdp_frags_size;			      \
+	}								      \
+} while (0)
+
+/**
+ * libeth_xdp_tx_fill_buf - internal helper to fill one ``XDP_TX`` &libeth_sqe
+ * @frm: XDP Tx frame from the bulk
+ * @i: index on the HW queue
+ * @sq: XDPSQ abstraction for the queue
+ * @priv: private data
+ *
+ * Return: XDP Tx descriptor with the synced DMA and other info to pass to
+ * the driver callback.
+ */
+static inline struct libeth_xdp_tx_desc
+libeth_xdp_tx_fill_buf(struct libeth_xdp_tx_frame frm, u32 i,
+		       const struct libeth_xdpsq *sq, u64 priv)
+{
+	struct libeth_xdp_tx_desc desc;
+	struct skb_shared_info *sinfo;
+	skb_frag_t *frag = &frm.frag;
+	struct libeth_sqe *sqe;
+	netmem_ref netmem;
+
+	if (frm.len_fl & LIBETH_XDP_TX_FIRST) {
+		sinfo = frm.data + frm.soff;
+		skb_frag_fill_netmem_desc(frag, virt_to_netmem(frm.data),
+					  offset_in_page(frm.data),
+					  frm.len_fl);
+	} else {
+		sinfo = NULL;
+	}
+
+	netmem = skb_frag_netmem(frag);
+	desc = (typeof(desc)){
+		.addr	= page_pool_get_dma_addr_netmem(netmem) +
+			  skb_frag_off(frag),
+		.len	= skb_frag_size(frag) & LIBETH_XDP_TX_LEN,
+		.flags	= skb_frag_size(frag) & LIBETH_XDP_TX_FLAGS,
+	};
+
+	dma_sync_single_for_device(__netmem_get_pp(netmem)->p.dev, desc.addr,
+				   desc.len, DMA_BIDIRECTIONAL);
+
+	if (!sinfo)
+		return desc;
+
+	sqe = &sq->sqes[i];
+	sqe->type = LIBETH_SQE_XDP_TX;
+	sqe->sinfo = sinfo;
+	libeth_xdp_tx_fill_stats(sqe, &desc, sinfo);
+
+	return desc;
+}
+
+void libeth_xdp_tx_exception(struct libeth_xdp_tx_bulk *bq, u32 sent,
+			     u32 flags);
+
+/**
+ * __libeth_xdp_tx_flush_bulk - internal helper to flush one XDP Tx bulk
+ * @bq: bulk to flush
+ * @flags: XDP TX flags
+ * @prep: driver-specific callback to prepare the queue for sending
+ * @fill: libeth_xdp callback to fill &libeth_sqe and &libeth_xdp_tx_desc
+ * @xmit: driver callback to fill a HW descriptor
+ *
+ * Internal abstraction to create bulk flush functions for drivers.
+ *
+ * Return: true if anything was sent, false otherwise.
+ */
+static __always_inline bool
+__libeth_xdp_tx_flush_bulk(struct libeth_xdp_tx_bulk *bq, u32 flags,
+			   u32 (*prep)(void *xdpsq, struct libeth_xdpsq *sq),
+			   struct libeth_xdp_tx_desc
+			   (*fill)(struct libeth_xdp_tx_frame frm, u32 i,
+				   const struct libeth_xdpsq *sq, u64 priv),
+			   void (*xmit)(struct libeth_xdp_tx_desc desc, u32 i,
+					const struct libeth_xdpsq *sq,
+					u64 priv))
+{
+	u32 sent, drops;
+	int err = 0;
+
+	sent = libeth_xdp_tx_xmit_bulk(bq->bulk, bq->xdpsq,
+				       min(bq->count, LIBETH_XDP_TX_BULK),
+				       false, 0, prep, fill, xmit);
+	drops = bq->count - sent;
+
+	if (unlikely(drops)) {
+		libeth_xdp_tx_exception(bq, sent, flags);
+		err = -ENXIO;
+	} else {
+		bq->count = 0;
+	}
+
+	trace_xdp_bulk_tx(bq->dev, sent, drops, err);
+
+	return likely(sent);
+}
+
+/**
+ * libeth_xdp_tx_flush_bulk - wrapper to define flush of one ``XDP_TX`` bulk
+ * @bq: bulk to flush
+ * @flags: Tx flags, see above
+ * @prep: driver callback to prepare the queue
+ * @xmit: driver callback to fill a HW descriptor
+ */
+#define libeth_xdp_tx_flush_bulk(bq, flags, prep, xmit)			      \
+	__libeth_xdp_tx_flush_bulk(bq, flags, prep, libeth_xdp_tx_fill_buf,   \
+				   xmit)
+
+/* Rx polling path */
+
+static inline void libeth_xdp_return_va(const void *data, bool napi)
+{
+	netmem_ref netmem = virt_to_netmem(data);
+
+	page_pool_put_full_netmem(__netmem_get_pp(netmem), netmem, napi);
+}
+
+static inline void libeth_xdp_return_frags(const struct skb_shared_info *sinfo,
+					   bool napi)
+{
+	for (u32 i = 0; i < sinfo->nr_frags; i++) {
+		netmem_ref netmem = skb_frag_netmem(&sinfo->frags[i]);
+
+		page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, napi);
+	}
+}
+
+/**
+ * libeth_xdp_return_buff - free/recycle &libeth_xdp_buff
+ * @xdp: buffer to free
+ *
+ * Hotpath helper to free &libeth_xdp_buff. Comparing to xdp_return_buff(),
+ * it's faster as it gets inlined and always assumes order-0 pages and safe
+ * direct recycling. Zeroes @xdp->data to avoid UAFs.
+ */
+#define libeth_xdp_return_buff(xdp)	__libeth_xdp_return_buff(xdp, true)
+
+static inline void __libeth_xdp_return_buff(struct libeth_xdp_buff *xdp,
+					    bool napi)
+{
+	if (!xdp_buff_has_frags(&xdp->base))
+		goto out;
+
+	libeth_xdp_return_frags(xdp_get_shared_info_from_buff(&xdp->base),
+				napi);
+
+out:
+	libeth_xdp_return_va(xdp->data, napi);
+	xdp->data = NULL;
+}
+
+#endif /* __LIBETH_XDP_H */
-- 
cgit v1.2.3


From 084ceda7decdbeff2bafbe2d28f57aed50b3bc46 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Thu, 12 Jun 2025 18:02:22 +0200
Subject: libeth: xdp: add .ndo_xdp_xmit() helpers

Add helpers for implementing .ndo_xdp_xmit().
Same as for XDP_TX, accumulate up to 16 DMA-mapped frames on the stack,
then flush. If DMA mapping fails for some reason, don't try mapping
further frames, but still flush what was already prepared.
DMA address of a head frame is stored in its headroom, assuming it
has enough of it for an 8 (or 4) byte value.
In addition to @prep and @xmit driver callbacks in XDP_TX, xmit also
needs @finalize to kick the XDPSQ after filling.

Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/libeth/xdp.c |  37 +++-
 include/net/libeth/tx.h                 |   6 +
 include/net/libeth/xdp.h                | 290 +++++++++++++++++++++++++++++++-
 3 files changed, 328 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/intel/libeth/xdp.c b/drivers/net/ethernet/intel/libeth/xdp.c
index 444449c72221..c65ea5d2746a 100644
--- a/drivers/net/ethernet/intel/libeth/xdp.c
+++ b/drivers/net/ethernet/intel/libeth/xdp.c
@@ -42,7 +42,7 @@ static void __cold libeth_trace_xdp_exception(const struct net_device *dev,
  * libeth_xdp_tx_exception - handle Tx exceptions of XDP frames
  * @bq: XDP Tx frame bulk
  * @sent: number of frames sent successfully (from this bulk)
- * @flags: internal libeth_xdp flags
+ * @flags: internal libeth_xdp flags (.ndo_xdp_xmit etc.)
  *
  * Cold helper used by __libeth_xdp_tx_flush_bulk(), do not call directly.
  * Reports XDP Tx exceptions, frees the frames that won't be sent or adjust
@@ -54,7 +54,8 @@ void __cold libeth_xdp_tx_exception(struct libeth_xdp_tx_bulk *bq, u32 sent,
 	const struct libeth_xdp_tx_frame *pos = &bq->bulk[sent];
 	u32 left = bq->count - sent;
 
-	libeth_trace_xdp_exception(bq->dev, bq->prog, XDP_TX);
+	if (!(flags & LIBETH_XDP_TX_NDO))
+		libeth_trace_xdp_exception(bq->dev, bq->prog, XDP_TX);
 
 	if (!(flags & LIBETH_XDP_TX_DROP)) {
 		memmove(bq->bulk, pos, left * sizeof(*bq->bulk));
@@ -63,12 +64,42 @@ void __cold libeth_xdp_tx_exception(struct libeth_xdp_tx_bulk *bq, u32 sent,
 		return;
 	}
 
-	libeth_xdp_tx_return_bulk(pos, left);
+	if (!(flags & LIBETH_XDP_TX_NDO))
+		libeth_xdp_tx_return_bulk(pos, left);
+	else
+		libeth_xdp_xmit_return_bulk(pos, left, bq->dev);
 
 	bq->count = 0;
 }
 EXPORT_SYMBOL_GPL(libeth_xdp_tx_exception);
 
+/* .ndo_xdp_xmit() implementation */
+
+u32 __cold libeth_xdp_xmit_return_bulk(const struct libeth_xdp_tx_frame *bq,
+				       u32 count, const struct net_device *dev)
+{
+	u32 n = 0;
+
+	for (u32 i = 0; i < count; i++) {
+		const struct libeth_xdp_tx_frame *frm = &bq[i];
+		dma_addr_t dma;
+
+		if (frm->flags & LIBETH_XDP_TX_FIRST)
+			dma = *libeth_xdp_xmit_frame_dma(frm->xdpf);
+		else
+			dma = dma_unmap_addr(frm, dma);
+
+		dma_unmap_page(dev->dev.parent, dma, dma_unmap_len(frm, len),
+			       DMA_TO_DEVICE);
+
+		/* Actual xdp_frames are freed by the core */
+		n += !!(frm->flags & LIBETH_XDP_TX_FIRST);
+	}
+
+	return n;
+}
+EXPORT_SYMBOL_GPL(libeth_xdp_xmit_return_bulk);
+
 /* Rx polling path */
 
 /**
diff --git a/include/net/libeth/tx.h b/include/net/libeth/tx.h
index 3e68d11914f7..e2b62a8b4c57 100644
--- a/include/net/libeth/tx.h
+++ b/include/net/libeth/tx.h
@@ -19,6 +19,8 @@
  * @LIBETH_SQE_SKB: &sk_buff, unmap and napi_consume_skb(), update stats
  * @__LIBETH_SQE_XDP_START: separator between skb and XDP types
  * @LIBETH_SQE_XDP_TX: &skb_shared_info, libeth_xdp_return_buff_bulk(), stats
+ * @LIBETH_SQE_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame_bulk(), stats
+ * @LIBETH_SQE_XDP_XMIT_FRAG: &xdp_frame frag, only unmap DMA
  */
 enum libeth_sqe_type {
 	LIBETH_SQE_EMPTY		= 0U,
@@ -29,6 +31,8 @@ enum libeth_sqe_type {
 
 	__LIBETH_SQE_XDP_START,
 	LIBETH_SQE_XDP_TX		= __LIBETH_SQE_XDP_START,
+	LIBETH_SQE_XDP_XMIT,
+	LIBETH_SQE_XDP_XMIT_FRAG,
 };
 
 /**
@@ -38,6 +42,7 @@ enum libeth_sqe_type {
  * @raw: slab buffer to free via kfree()
  * @skb: &sk_buff to consume
  * @sinfo: skb shared info of an XDP_TX frame
+ * @xdpf: XDP frame from ::ndo_xdp_xmit()
  * @dma: DMA address to unmap
  * @len: length of the mapped region to unmap
  * @nr_frags: number of frags in the frame this buffer belongs to
@@ -53,6 +58,7 @@ struct libeth_sqe {
 		void				*raw;
 		struct sk_buff			*skb;
 		struct skb_shared_info		*sinfo;
+		struct xdp_frame		*xdpf;
 	};
 
 	DEFINE_DMA_UNMAP_ADDR(dma);
diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h
index 4988453a3d70..839001d901b2 100644
--- a/include/net/libeth/xdp.h
+++ b/include/net/libeth/xdp.h
@@ -11,6 +11,17 @@
 #include <net/libeth/tx.h>
 #include <net/xsk_buff_pool.h>
 
+/*
+ * Defined as bits to be able to use them as a mask on Rx.
+ * Also used as internal return values on Tx.
+ */
+enum {
+	LIBETH_XDP_PASS			= 0U,
+	LIBETH_XDP_DROP			= BIT(0),
+	LIBETH_XDP_ABORTED		= BIT(1),
+	LIBETH_XDP_TX			= BIT(2),
+};
+
 /*
  * &xdp_buff_xsk is the largest structure &libeth_xdp_buff gets casted to,
  * pick maximum pointer-compatible alignment.
@@ -56,12 +67,14 @@ static_assert(IS_ALIGNED(sizeof(struct xdp_buff_xsk),
  * @LIBETH_XDP_TX_BULK: one bulk size at which it will be flushed to the queue
  * @LIBETH_XDP_TX_BATCH: batch size for which the queue fill loop is unrolled
  * @LIBETH_XDP_TX_DROP: indicates the send function must drop frames not sent
+ * @LIBETH_XDP_TX_NDO: whether the send function is called from .ndo_xdp_xmit()
  */
 enum {
 	LIBETH_XDP_TX_BULK		= DEV_MAP_BULK_SIZE,
 	LIBETH_XDP_TX_BATCH		= 8,
 
 	LIBETH_XDP_TX_DROP		= BIT(0),
+	LIBETH_XDP_TX_NDO		= BIT(1),
 };
 
 /**
@@ -88,6 +101,11 @@ enum {
  * @len_fl: ``XDP_TX``, combined flags [31:16] and len [15:0] field for speed
  * @soff: ``XDP_TX``, offset from @data to the start of &skb_shared_info
  * @frag: one (non-head) frag for ``XDP_TX``
+ * @xdpf: &xdp_frame for the head frag for .ndo_xdp_xmit()
+ * @dma: DMA address of the non-head frag for .ndo_xdp_xmit()
+ * @len: frag length for .ndo_xdp_xmit()
+ * @flags: Tx flags for the above
+ * @opts: combined @len + @flags for the above for speed
  */
 struct libeth_xdp_tx_frame {
 	union {
@@ -100,6 +118,21 @@ struct libeth_xdp_tx_frame {
 
 		/* ``XDP_TX`` frag */
 		skb_frag_t			frag;
+
+		/* .ndo_xdp_xmit() */
+		struct {
+			union {
+				struct xdp_frame		*xdpf;
+				dma_addr_t			dma;
+			};
+			union {
+				struct {
+					u32				len;
+					u32				flags;
+				};
+				aligned_u64			opts;
+			};
+		};
 	};
 } __aligned_largest;
 static_assert(offsetof(struct libeth_xdp_tx_frame, frag.len) ==
@@ -107,7 +140,7 @@ static_assert(offsetof(struct libeth_xdp_tx_frame, frag.len) ==
 
 /**
  * struct libeth_xdp_tx_bulk - XDP Tx frame bulk for bulk sending
- * @prog: corresponding active XDP program
+ * @prog: corresponding active XDP program, %NULL for .ndo_xdp_xmit()
  * @dev: &net_device which the frames are transmitted on
  * @xdpsq: shortcut to the corresponding driver-specific XDPSQ structure
  * @count: current number of frames in @bulk
@@ -445,7 +478,7 @@ void libeth_xdp_tx_exception(struct libeth_xdp_tx_bulk *bq, u32 sent,
 /**
  * __libeth_xdp_tx_flush_bulk - internal helper to flush one XDP Tx bulk
  * @bq: bulk to flush
- * @flags: XDP TX flags
+ * @flags: XDP TX flags (.ndo_xdp_xmit() etc.)
  * @prep: driver-specific callback to prepare the queue for sending
  * @fill: libeth_xdp callback to fill &libeth_sqe and &libeth_xdp_tx_desc
  * @xmit: driver callback to fill a HW descriptor
@@ -495,6 +528,259 @@ __libeth_xdp_tx_flush_bulk(struct libeth_xdp_tx_bulk *bq, u32 flags,
 	__libeth_xdp_tx_flush_bulk(bq, flags, prep, libeth_xdp_tx_fill_buf,   \
 				   xmit)
 
+/* .ndo_xdp_xmit() implementation */
+
+/**
+ * libeth_xdp_xmit_frame_dma - internal helper to access DMA of an &xdp_frame
+ * @xf: pointer to the XDP frame
+ *
+ * There's no place in &libeth_xdp_tx_frame to store DMA address for an
+ * &xdp_frame head. The headroom is used then, the address is placed right
+ * after the frame struct, naturally aligned.
+ *
+ * Return: pointer to the DMA address to use.
+ */
+#define libeth_xdp_xmit_frame_dma(xf)					      \
+	_Generic((xf),							      \
+		 const struct xdp_frame *:				      \
+			(const dma_addr_t *)__libeth_xdp_xmit_frame_dma(xf),  \
+		 struct xdp_frame *:					      \
+			(dma_addr_t *)__libeth_xdp_xmit_frame_dma(xf)	      \
+	)
+
+static inline void *__libeth_xdp_xmit_frame_dma(const struct xdp_frame *xdpf)
+{
+	void *addr = (void *)(xdpf + 1);
+
+	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
+	    __alignof(*xdpf) < sizeof(dma_addr_t))
+		addr = PTR_ALIGN(addr, sizeof(dma_addr_t));
+
+	return addr;
+}
+
+/**
+ * libeth_xdp_xmit_queue_head - internal helper for queueing one XDP xmit head
+ * @bq: XDP Tx bulk to queue the head frag to
+ * @xdpf: XDP frame with the head to queue
+ * @dev: device to perform DMA mapping
+ *
+ * Return: ``LIBETH_XDP_DROP`` on DMA mapping error,
+ *	   ``LIBETH_XDP_PASS`` if it's the only frag in the frame,
+ *	   ``LIBETH_XDP_TX`` if it's an S/G frame.
+ */
+static inline u32 libeth_xdp_xmit_queue_head(struct libeth_xdp_tx_bulk *bq,
+					     struct xdp_frame *xdpf,
+					     struct device *dev)
+{
+	dma_addr_t dma;
+
+	dma = dma_map_single(dev, xdpf->data, xdpf->len, DMA_TO_DEVICE);
+	if (dma_mapping_error(dev, dma))
+		return LIBETH_XDP_DROP;
+
+	*libeth_xdp_xmit_frame_dma(xdpf) = dma;
+
+	bq->bulk[bq->count++] = (typeof(*bq->bulk)){
+		.xdpf	= xdpf,
+		.len	= xdpf->len,
+		.flags	= LIBETH_XDP_TX_FIRST,
+	};
+
+	if (!xdp_frame_has_frags(xdpf))
+		return LIBETH_XDP_PASS;
+
+	bq->bulk[bq->count - 1].flags |= LIBETH_XDP_TX_MULTI;
+
+	return LIBETH_XDP_TX;
+}
+
+/**
+ * libeth_xdp_xmit_queue_frag - internal helper for queueing one XDP xmit frag
+ * @bq: XDP Tx bulk to queue the frag to
+ * @frag: frag to queue
+ * @dev: device to perform DMA mapping
+ *
+ * Return: true on success, false on DMA mapping error.
+ */
+static inline bool libeth_xdp_xmit_queue_frag(struct libeth_xdp_tx_bulk *bq,
+					      const skb_frag_t *frag,
+					      struct device *dev)
+{
+	dma_addr_t dma;
+
+	dma = skb_frag_dma_map(dev, frag);
+	if (dma_mapping_error(dev, dma))
+		return false;
+
+	bq->bulk[bq->count++] = (typeof(*bq->bulk)){
+		.dma	= dma,
+		.len	= skb_frag_size(frag),
+	};
+
+	return true;
+}
+
+/**
+ * libeth_xdp_xmit_queue_bulk - internal helper for queueing one XDP xmit frame
+ * @bq: XDP Tx bulk to queue the frame to
+ * @xdpf: XDP frame to queue
+ * @flush_bulk: driver callback to flush the bulk to the HW queue
+ *
+ * Return: ``LIBETH_XDP_TX`` on success,
+ *	   ``LIBETH_XDP_DROP`` if the frame should be dropped by the stack,
+ *	   ``LIBETH_XDP_ABORTED`` if the frame will be dropped by libeth_xdp.
+ */
+static __always_inline u32
+libeth_xdp_xmit_queue_bulk(struct libeth_xdp_tx_bulk *bq,
+			   struct xdp_frame *xdpf,
+			   bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq,
+					      u32 flags))
+{
+	u32 head, nr_frags, i, ret = LIBETH_XDP_TX;
+	struct device *dev = bq->dev->dev.parent;
+	const struct skb_shared_info *sinfo;
+
+	if (unlikely(bq->count == LIBETH_XDP_TX_BULK) &&
+	    unlikely(!flush_bulk(bq, LIBETH_XDP_TX_NDO)))
+		return LIBETH_XDP_DROP;
+
+	head = libeth_xdp_xmit_queue_head(bq, xdpf, dev);
+	if (head == LIBETH_XDP_PASS)
+		goto out;
+	else if (head == LIBETH_XDP_DROP)
+		return LIBETH_XDP_DROP;
+
+	sinfo = xdp_get_shared_info_from_frame(xdpf);
+	nr_frags = sinfo->nr_frags;
+
+	for (i = 0; i < nr_frags; i++) {
+		if (unlikely(bq->count == LIBETH_XDP_TX_BULK) &&
+		    unlikely(!flush_bulk(bq, LIBETH_XDP_TX_NDO)))
+			break;
+
+		if (!libeth_xdp_xmit_queue_frag(bq, &sinfo->frags[i], dev))
+			break;
+	}
+
+	if (unlikely(i < nr_frags))
+		ret = LIBETH_XDP_ABORTED;
+
+out:
+	bq->bulk[bq->count - 1].flags |= LIBETH_XDP_TX_LAST;
+
+	return ret;
+}
+
+/**
+ * libeth_xdp_xmit_fill_buf - internal helper to fill one XDP xmit &libeth_sqe
+ * @frm: XDP Tx frame from the bulk
+ * @i: index on the HW queue
+ * @sq: XDPSQ abstraction for the queue
+ * @priv: private data
+ *
+ * Return: XDP Tx descriptor with the mapped DMA and other info to pass to
+ * the driver callback.
+ */
+static inline struct libeth_xdp_tx_desc
+libeth_xdp_xmit_fill_buf(struct libeth_xdp_tx_frame frm, u32 i,
+			 const struct libeth_xdpsq *sq, u64 priv)
+{
+	struct libeth_xdp_tx_desc desc;
+	struct libeth_sqe *sqe;
+	struct xdp_frame *xdpf;
+
+	if (frm.flags & LIBETH_XDP_TX_FIRST) {
+		xdpf = frm.xdpf;
+		desc.addr = *libeth_xdp_xmit_frame_dma(xdpf);
+	} else {
+		xdpf = NULL;
+		desc.addr = frm.dma;
+	}
+	desc.opts = frm.opts;
+
+	sqe = &sq->sqes[i];
+	dma_unmap_addr_set(sqe, dma, desc.addr);
+	dma_unmap_len_set(sqe, len, desc.len);
+
+	if (!xdpf) {
+		sqe->type = LIBETH_SQE_XDP_XMIT_FRAG;
+		return desc;
+	}
+
+	sqe->type = LIBETH_SQE_XDP_XMIT;
+	sqe->xdpf = xdpf;
+	libeth_xdp_tx_fill_stats(sqe, &desc,
+				 xdp_get_shared_info_from_frame(xdpf));
+
+	return desc;
+}
+
+/**
+ * libeth_xdp_xmit_flush_bulk - wrapper to define flush of one XDP xmit bulk
+ * @bq: bulk to flush
+ * @flags: Tx flags, see __libeth_xdp_tx_flush_bulk()
+ * @prep: driver callback to prepare the queue
+ * @xmit: driver callback to fill a HW descriptor
+ */
+#define libeth_xdp_xmit_flush_bulk(bq, flags, prep, xmit)		      \
+	__libeth_xdp_tx_flush_bulk(bq, (flags) | LIBETH_XDP_TX_NDO, prep,     \
+				   libeth_xdp_xmit_fill_buf, xmit)
+
+u32 libeth_xdp_xmit_return_bulk(const struct libeth_xdp_tx_frame *bq,
+				u32 count, const struct net_device *dev);
+
+/**
+ * __libeth_xdp_xmit_do_bulk - internal function to implement .ndo_xdp_xmit()
+ * @bq: XDP Tx bulk to queue frames to
+ * @frames: XDP frames passed by the stack
+ * @n: number of frames
+ * @flags: flags passed by the stack
+ * @flush_bulk: driver callback to flush an XDP xmit bulk
+ * @finalize: driver callback to finalize sending XDP Tx frames on the queue
+ *
+ * Perform common checks, map the frags and queue them to the bulk, then flush
+ * the bulk to the XDPSQ. If requested by the stack, finalize the queue.
+ *
+ * Return: number of frames sent or -errno on error.
+ */
+static __always_inline int
+__libeth_xdp_xmit_do_bulk(struct libeth_xdp_tx_bulk *bq,
+			  struct xdp_frame **frames, u32 n, u32 flags,
+			  bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq,
+					     u32 flags),
+			  void (*finalize)(void *xdpsq, bool sent, bool flush))
+{
+	u32 nxmit = 0;
+
+	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
+		return -EINVAL;
+
+	for (u32 i = 0; likely(i < n); i++) {
+		u32 ret;
+
+		ret = libeth_xdp_xmit_queue_bulk(bq, frames[i], flush_bulk);
+		if (unlikely(ret != LIBETH_XDP_TX)) {
+			nxmit += ret == LIBETH_XDP_ABORTED;
+			break;
+		}
+
+		nxmit++;
+	}
+
+	if (bq->count) {
+		flush_bulk(bq, LIBETH_XDP_TX_NDO);
+		if (unlikely(bq->count))
+			nxmit -= libeth_xdp_xmit_return_bulk(bq->bulk,
+							     bq->count,
+							     bq->dev);
+	}
+
+	finalize(bq->xdpsq, nxmit, flags & XDP_XMIT_FLUSH);
+
+	return nxmit;
+}
+
 /* Rx polling path */
 
 static inline void libeth_xdp_return_va(const void *data, bool napi)
-- 
cgit v1.2.3


From 26ce8eb0bb7d47c5fb36f7c12f34e4a320f14cac Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Thu, 12 Jun 2025 18:02:23 +0200
Subject: libeth: xdp: add XDPSQE completion helpers

Similarly to libeth_tx_complete(), add libeth_xdp_complete_tx() to
handle XDP_TX and xmit buffers. Both use bulk return under the hood.

Also add out of line libeth_tx_complete_any() which handles both
regular and XDP frames (if libeth_xdp is loaded), for example,
to call on queue destroy, where we don't need inlining but
convenience.

Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/libeth/Makefile |  1 +
 drivers/net/ethernet/intel/libeth/priv.h   | 26 ++++++++++++
 drivers/net/ethernet/intel/libeth/tx.c     | 38 +++++++++++++++++
 drivers/net/ethernet/intel/libeth/xdp.c    | 58 ++++++++++++++++++++++++++
 include/net/libeth/tx.h                    | 13 +++++-
 include/net/libeth/types.h                 | 21 +++++++++-
 include/net/libeth/xdp.h                   | 66 ++++++++++++++++++++++++++++++
 7 files changed, 221 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/ethernet/intel/libeth/priv.h
 create mode 100644 drivers/net/ethernet/intel/libeth/tx.c

(limited to 'include')

diff --git a/drivers/net/ethernet/intel/libeth/Makefile b/drivers/net/ethernet/intel/libeth/Makefile
index 9ba78f463f2e..51669840ee06 100644
--- a/drivers/net/ethernet/intel/libeth/Makefile
+++ b/drivers/net/ethernet/intel/libeth/Makefile
@@ -4,6 +4,7 @@
 obj-$(CONFIG_LIBETH)		+= libeth.o
 
 libeth-y			:= rx.o
+libeth-y			+= tx.o
 
 obj-$(CONFIG_LIBETH_XDP)	+= libeth_xdp.o
 
diff --git a/drivers/net/ethernet/intel/libeth/priv.h b/drivers/net/ethernet/intel/libeth/priv.h
new file mode 100644
index 000000000000..1bd6e2d7a3e7
--- /dev/null
+++ b/drivers/net/ethernet/intel/libeth/priv.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (C) 2025 Intel Corporation */
+
+#ifndef __LIBETH_PRIV_H
+#define __LIBETH_PRIV_H
+
+#include <linux/types.h>
+
+/* XDP */
+
+struct skb_shared_info;
+struct xdp_frame_bulk;
+
+struct libeth_xdp_ops {
+	void	(*bulk)(const struct skb_shared_info *sinfo,
+			struct xdp_frame_bulk *bq, bool frags);
+};
+
+void libeth_attach_xdp(const struct libeth_xdp_ops *ops);
+
+static inline void libeth_detach_xdp(void)
+{
+	libeth_attach_xdp(NULL);
+}
+
+#endif /* __LIBETH_PRIV_H */
diff --git a/drivers/net/ethernet/intel/libeth/tx.c b/drivers/net/ethernet/intel/libeth/tx.c
new file mode 100644
index 000000000000..227c841ab16a
--- /dev/null
+++ b/drivers/net/ethernet/intel/libeth/tx.c
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2025 Intel Corporation */
+
+#define DEFAULT_SYMBOL_NAMESPACE	"LIBETH"
+
+#include <net/libeth/xdp.h>
+
+#include "priv.h"
+
+/* Tx buffer completion */
+
+DEFINE_STATIC_CALL_NULL(bulk, libeth_xdp_return_buff_bulk);
+
+/**
+ * libeth_tx_complete_any - perform Tx completion for one SQE of any type
+ * @sqe: Tx buffer to complete
+ * @cp: polling params
+ *
+ * Can be used to complete both regular and XDP SQEs, for example when
+ * destroying queues.
+ * When libeth_xdp is not loaded, XDPSQEs won't be handled.
+ */
+void libeth_tx_complete_any(struct libeth_sqe *sqe, struct libeth_cq_pp *cp)
+{
+	if (sqe->type >= __LIBETH_SQE_XDP_START)
+		__libeth_xdp_complete_tx(sqe, cp, static_call(bulk));
+	else
+		libeth_tx_complete(sqe, cp);
+}
+EXPORT_SYMBOL_GPL(libeth_tx_complete_any);
+
+/* Module */
+
+void libeth_attach_xdp(const struct libeth_xdp_ops *ops)
+{
+	static_call_update(bulk, ops ? ops->bulk : NULL);
+}
+EXPORT_SYMBOL_GPL(libeth_attach_xdp);
diff --git a/drivers/net/ethernet/intel/libeth/xdp.c b/drivers/net/ethernet/intel/libeth/xdp.c
index c65ea5d2746a..c29a1a0dfc57 100644
--- a/drivers/net/ethernet/intel/libeth/xdp.c
+++ b/drivers/net/ethernet/intel/libeth/xdp.c
@@ -7,6 +7,8 @@
 
 #include <net/libeth/xdp.h>
 
+#include "priv.h"
+
 /* ``XDP_TX`` bulking */
 
 static void __cold
@@ -115,6 +117,62 @@ void __cold libeth_xdp_return_buff_slow(struct libeth_xdp_buff *xdp)
 }
 EXPORT_SYMBOL_GPL(libeth_xdp_return_buff_slow);
 
+/* Tx buffer completion */
+
+static void libeth_xdp_put_netmem_bulk(netmem_ref netmem,
+				       struct xdp_frame_bulk *bq)
+{
+	if (unlikely(bq->count == XDP_BULK_QUEUE_SIZE))
+		xdp_flush_frame_bulk(bq);
+
+	bq->q[bq->count++] = netmem;
+}
+
+/**
+ * libeth_xdp_return_buff_bulk - free &xdp_buff as part of a bulk
+ * @sinfo: shared info corresponding to the buffer
+ * @bq: XDP frame bulk to store the buffer
+ * @frags: whether the buffer has frags
+ *
+ * Same as xdp_return_frame_bulk(), but for &libeth_xdp_buff, speeds up Tx
+ * completion of ``XDP_TX`` buffers and allows freeing them in the same bulks
+ * as &xdp_frame buffers.
+ */
+void libeth_xdp_return_buff_bulk(const struct skb_shared_info *sinfo,
+				 struct xdp_frame_bulk *bq, bool frags)
+{
+	if (!frags)
+		goto head;
+
+	for (u32 i = 0; i < sinfo->nr_frags; i++)
+		libeth_xdp_put_netmem_bulk(skb_frag_netmem(&sinfo->frags[i]),
+					   bq);
+
+head:
+	libeth_xdp_put_netmem_bulk(virt_to_netmem(sinfo), bq);
+}
+EXPORT_SYMBOL_GPL(libeth_xdp_return_buff_bulk);
+
+/* Module */
+
+static const struct libeth_xdp_ops xdp_ops __initconst = {
+	.bulk	= libeth_xdp_return_buff_bulk,
+};
+
+static int __init libeth_xdp_module_init(void)
+{
+	libeth_attach_xdp(&xdp_ops);
+
+	return 0;
+}
+module_init(libeth_xdp_module_init);
+
+static void __exit libeth_xdp_module_exit(void)
+{
+	libeth_detach_xdp();
+}
+module_exit(libeth_xdp_module_exit);
+
 MODULE_DESCRIPTION("Common Ethernet library - XDP infra");
 MODULE_IMPORT_NS("LIBETH");
 MODULE_LICENSE("GPL");
diff --git a/include/net/libeth/tx.h b/include/net/libeth/tx.h
index e2b62a8b4c57..33b9bb22f6ac 100644
--- a/include/net/libeth/tx.h
+++ b/include/net/libeth/tx.h
@@ -84,7 +84,10 @@ struct libeth_sqe {
 /**
  * struct libeth_cq_pp - completion queue poll params
  * @dev: &device to perform DMA unmapping
+ * @bq: XDP frame bulk to combine return operations
  * @ss: onstack NAPI stats to fill
+ * @xss: onstack XDPSQ NAPI stats to fill
+ * @xdp_tx: number of XDP frames processed
  * @napi: whether it's called from the NAPI context
  *
  * libeth uses this structure to access objects needed for performing full
@@ -93,7 +96,13 @@ struct libeth_sqe {
  */
 struct libeth_cq_pp {
 	struct device			*dev;
-	struct libeth_sq_napi_stats	*ss;
+	struct xdp_frame_bulk		*bq;
+
+	union {
+		struct libeth_sq_napi_stats	*ss;
+		struct libeth_xdpsq_napi_stats	*xss;
+	};
+	u32				xdp_tx;
 
 	bool				napi;
 };
@@ -139,4 +148,6 @@ static inline void libeth_tx_complete(struct libeth_sqe *sqe,
 	sqe->type = LIBETH_SQE_EMPTY;
 }
 
+void libeth_tx_complete_any(struct libeth_sqe *sqe, struct libeth_cq_pp *cp);
+
 #endif /* __LIBETH_TX_H */
diff --git a/include/net/libeth/types.h b/include/net/libeth/types.h
index 603825e45133..ad7a5c1f119f 100644
--- a/include/net/libeth/types.h
+++ b/include/net/libeth/types.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* Copyright (C) 2024 Intel Corporation */
+/* Copyright (C) 2024-2025 Intel Corporation */
 
 #ifndef __LIBETH_TYPES_H
 #define __LIBETH_TYPES_H
@@ -22,4 +22,23 @@ struct libeth_sq_napi_stats {
 	};
 };
 
+/**
+ * struct libeth_xdpsq_napi_stats - "hot" counters to update in XDP Tx
+ *				    completion loop
+ * @packets: completed frames counter
+ * @bytes: sum of bytes of completed frames above
+ * @fragments: sum of fragments of completed S/G frames
+ * @raw: alias to access all the fields as an array
+ */
+struct libeth_xdpsq_napi_stats {
+	union {
+		struct {
+							u32 packets;
+							u32 bytes;
+							u32 fragments;
+		};
+		DECLARE_FLEX_ARRAY(u32, raw);
+	};
+};
+
 #endif /* __LIBETH_TYPES_H */
diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h
index 839001d901b2..c47ecba56020 100644
--- a/include/net/libeth/xdp.h
+++ b/include/net/libeth/xdp.h
@@ -824,4 +824,70 @@ out:
 	xdp->data = NULL;
 }
 
+/* Tx buffer completion */
+
+void libeth_xdp_return_buff_bulk(const struct skb_shared_info *sinfo,
+				 struct xdp_frame_bulk *bq, bool frags);
+
+/**
+ * __libeth_xdp_complete_tx - complete sent XDPSQE
+ * @sqe: SQ element / Tx buffer to complete
+ * @cp: Tx polling/completion params
+ * @bulk: internal callback to bulk-free ``XDP_TX`` buffers
+ *
+ * Use the non-underscored version in drivers instead. This one is shared
+ * internally with libeth_tx_complete_any().
+ * Complete an XDPSQE of any type of XDP frame. This includes DMA unmapping
+ * when needed, buffer freeing, stats update, and SQE invalidation.
+ */
+static __always_inline void
+__libeth_xdp_complete_tx(struct libeth_sqe *sqe, struct libeth_cq_pp *cp,
+			 typeof(libeth_xdp_return_buff_bulk) bulk)
+{
+	enum libeth_sqe_type type = sqe->type;
+
+	switch (type) {
+	case LIBETH_SQE_EMPTY:
+		return;
+	case LIBETH_SQE_XDP_XMIT:
+	case LIBETH_SQE_XDP_XMIT_FRAG:
+		dma_unmap_page(cp->dev, dma_unmap_addr(sqe, dma),
+			       dma_unmap_len(sqe, len), DMA_TO_DEVICE);
+		break;
+	default:
+		break;
+	}
+
+	switch (type) {
+	case LIBETH_SQE_XDP_TX:
+		bulk(sqe->sinfo, cp->bq, sqe->nr_frags != 1);
+		break;
+	case LIBETH_SQE_XDP_XMIT:
+		xdp_return_frame_bulk(sqe->xdpf, cp->bq);
+		break;
+	default:
+		break;
+	}
+
+	switch (type) {
+	case LIBETH_SQE_XDP_TX:
+	case LIBETH_SQE_XDP_XMIT:
+		cp->xdp_tx -= sqe->nr_frags;
+
+		cp->xss->packets++;
+		cp->xss->bytes += sqe->bytes;
+		break;
+	default:
+		break;
+	}
+
+	sqe->type = LIBETH_SQE_EMPTY;
+}
+
+static inline void libeth_xdp_complete_tx(struct libeth_sqe *sqe,
+					  struct libeth_cq_pp *cp)
+{
+	__libeth_xdp_complete_tx(sqe, cp, libeth_xdp_return_buff_bulk);
+}
+
 #endif /* __LIBETH_XDP_H */
-- 
cgit v1.2.3


From c4ba6a9b9d460c6fd742e118022f2808ec3c4223 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Thu, 12 Jun 2025 18:02:24 +0200
Subject: libeth: xdp: add XDPSQ locking helpers

Unfortunately, it's not always possible to allocate
max(num_rxqs, nr_cpu_ids) XDPSQs even on high-end NICs.
To mitigate this, add simple locking helpers to libeth_xdp.
As long as XDPSQs are not shared, the whole functionality is gated
behind a static lock. Otherwise, each bulk flush locks the queue for
the time of cleaning and filling the descriptors.
As long as this particular queue is not used by more than 1 CPU,
the impact is minimal (runtime check for boolean twice per 16+
descriptors).

Suggested-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com> # static key
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/libeth/xdp.c |  47 ++++++++++++
 include/net/libeth/types.h              |  21 +++++-
 include/net/libeth/xdp.h                | 127 +++++++++++++++++++++++++++++++-
 3 files changed, 192 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/intel/libeth/xdp.c b/drivers/net/ethernet/intel/libeth/xdp.c
index c29a1a0dfc57..0f08dd405190 100644
--- a/drivers/net/ethernet/intel/libeth/xdp.c
+++ b/drivers/net/ethernet/intel/libeth/xdp.c
@@ -9,6 +9,53 @@
 
 #include "priv.h"
 
+/* XDPSQ sharing */
+
+DEFINE_STATIC_KEY_FALSE(libeth_xdpsq_share);
+EXPORT_SYMBOL_GPL(libeth_xdpsq_share);
+
+void __libeth_xdpsq_get(struct libeth_xdpsq_lock *lock,
+			const struct net_device *dev)
+{
+	bool warn;
+
+	spin_lock_init(&lock->lock);
+	lock->share = true;
+
+	warn = !static_key_enabled(&libeth_xdpsq_share);
+	static_branch_inc(&libeth_xdpsq_share);
+
+	if (warn && net_ratelimit())
+		netdev_warn(dev, "XDPSQ sharing enabled, possible XDP Tx slowdown\n");
+}
+EXPORT_SYMBOL_GPL(__libeth_xdpsq_get);
+
+void __libeth_xdpsq_put(struct libeth_xdpsq_lock *lock,
+			const struct net_device *dev)
+{
+	static_branch_dec(&libeth_xdpsq_share);
+
+	if (!static_key_enabled(&libeth_xdpsq_share) && net_ratelimit())
+		netdev_notice(dev, "XDPSQ sharing disabled\n");
+
+	lock->share = false;
+}
+EXPORT_SYMBOL_GPL(__libeth_xdpsq_put);
+
+void __acquires(&lock->lock)
+__libeth_xdpsq_lock(struct libeth_xdpsq_lock *lock)
+{
+	spin_lock(&lock->lock);
+}
+EXPORT_SYMBOL_GPL(__libeth_xdpsq_lock);
+
+void __releases(&lock->lock)
+__libeth_xdpsq_unlock(struct libeth_xdpsq_lock *lock)
+{
+	spin_unlock(&lock->lock);
+}
+EXPORT_SYMBOL_GPL(__libeth_xdpsq_unlock);
+
 /* ``XDP_TX`` bulking */
 
 static void __cold
diff --git a/include/net/libeth/types.h b/include/net/libeth/types.h
index ad7a5c1f119f..abfccae1a346 100644
--- a/include/net/libeth/types.h
+++ b/include/net/libeth/types.h
@@ -4,7 +4,7 @@
 #ifndef __LIBETH_TYPES_H
 #define __LIBETH_TYPES_H
 
-#include <linux/types.h>
+#include <linux/spinlock.h>
 
 /**
  * struct libeth_sq_napi_stats - "hot" counters to update in Tx completion loop
@@ -41,4 +41,23 @@ struct libeth_xdpsq_napi_stats {
 	};
 };
 
+/* XDP */
+
+/*
+ * The following structures should be embedded into driver's queue structure
+ * and passed to the libeth_xdp helpers, never used directly.
+ */
+
+/* XDPSQ sharing */
+
+/**
+ * struct libeth_xdpsq_lock - locking primitive for sharing XDPSQs
+ * @lock: spinlock for locking the queue
+ * @share: whether this particular queue is shared
+ */
+struct libeth_xdpsq_lock {
+	spinlock_t			lock;
+	bool				share;
+};
+
 #endif /* __LIBETH_TYPES_H */
diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h
index c47ecba56020..20977fdfd6c9 100644
--- a/include/net/libeth/xdp.h
+++ b/include/net/libeth/xdp.h
@@ -60,6 +60,123 @@ static_assert(offsetof(struct libeth_xdp_buff, desc) ==
 static_assert(IS_ALIGNED(sizeof(struct xdp_buff_xsk),
 			 __alignof(struct libeth_xdp_buff)));
 
+/* XDPSQ sharing */
+
+DECLARE_STATIC_KEY_FALSE(libeth_xdpsq_share);
+
+/**
+ * libeth_xdpsq_num - calculate optimal number of XDPSQs for this device + sys
+ * @rxq: current number of active Rx queues
+ * @txq: current number of active Tx queues
+ * @max: maximum number of Tx queues
+ *
+ * Each RQ must have its own XDPSQ for XSk pairs, each CPU must have its own
+ * XDPSQ for lockless sending (``XDP_TX``, .ndo_xdp_xmit()). Cap the maximum of
+ * these two with the number of SQs the device can have (minus used ones).
+ *
+ * Return: number of XDP Tx queues the device needs to use.
+ */
+static inline u32 libeth_xdpsq_num(u32 rxq, u32 txq, u32 max)
+{
+	return min(max(nr_cpu_ids, rxq), max - txq);
+}
+
+/**
+ * libeth_xdpsq_shared - whether XDPSQs can be shared between several CPUs
+ * @num: number of active XDPSQs
+ *
+ * Return: true if there's no 1:1 XDPSQ/CPU association, false otherwise.
+ */
+static inline bool libeth_xdpsq_shared(u32 num)
+{
+	return num < nr_cpu_ids;
+}
+
+/**
+ * libeth_xdpsq_id - get XDPSQ index corresponding to this CPU
+ * @num: number of active XDPSQs
+ *
+ * Helper for libeth_xdp routines, do not use in drivers directly.
+ *
+ * Return: XDPSQ index that needs to be used on this CPU.
+ */
+static inline u32 libeth_xdpsq_id(u32 num)
+{
+	u32 ret = raw_smp_processor_id();
+
+	if (static_branch_unlikely(&libeth_xdpsq_share) &&
+	    libeth_xdpsq_shared(num))
+		ret %= num;
+
+	return ret;
+}
+
+void __libeth_xdpsq_get(struct libeth_xdpsq_lock *lock,
+			const struct net_device *dev);
+void __libeth_xdpsq_put(struct libeth_xdpsq_lock *lock,
+			const struct net_device *dev);
+
+/**
+ * libeth_xdpsq_get - initialize &libeth_xdpsq_lock
+ * @lock: lock to initialize
+ * @dev: netdev which this lock belongs to
+ * @share: whether XDPSQs can be shared
+ *
+ * Tracks the current XDPSQ association and enables the static lock
+ * if needed.
+ */
+static inline void libeth_xdpsq_get(struct libeth_xdpsq_lock *lock,
+				    const struct net_device *dev,
+				    bool share)
+{
+	if (unlikely(share))
+		__libeth_xdpsq_get(lock, dev);
+}
+
+/**
+ * libeth_xdpsq_put - deinitialize &libeth_xdpsq_lock
+ * @lock: lock to deinitialize
+ * @dev: netdev which this lock belongs to
+ *
+ * Tracks the current XDPSQ association and disables the static lock
+ * if needed.
+ */
+static inline void libeth_xdpsq_put(struct libeth_xdpsq_lock *lock,
+				    const struct net_device *dev)
+{
+	if (static_branch_unlikely(&libeth_xdpsq_share) && lock->share)
+		__libeth_xdpsq_put(lock, dev);
+}
+
+void __libeth_xdpsq_lock(struct libeth_xdpsq_lock *lock);
+void __libeth_xdpsq_unlock(struct libeth_xdpsq_lock *lock);
+
+/**
+ * libeth_xdpsq_lock - grab &libeth_xdpsq_lock if needed
+ * @lock: lock to take
+ *
+ * Touches the underlying spinlock only if the static key is enabled
+ * and the queue itself is marked as shareable.
+ */
+static inline void libeth_xdpsq_lock(struct libeth_xdpsq_lock *lock)
+{
+	if (static_branch_unlikely(&libeth_xdpsq_share) && lock->share)
+		__libeth_xdpsq_lock(lock);
+}
+
+/**
+ * libeth_xdpsq_unlock - free &libeth_xdpsq_lock if needed
+ * @lock: lock to free
+ *
+ * Touches the underlying spinlock only if the static key is enabled
+ * and the queue itself is marked as shareable.
+ */
+static inline void libeth_xdpsq_unlock(struct libeth_xdpsq_lock *lock)
+{
+	if (static_branch_unlikely(&libeth_xdpsq_share) && lock->share)
+		__libeth_xdpsq_unlock(lock);
+}
+
 /* Common Tx bits */
 
 /**
@@ -179,6 +296,7 @@ struct libeth_xdp_tx_bulk {
  * @count: number of descriptors on that queue
  * @pending: pointer to the number of sent-not-completed descs on that queue
  * @xdp_tx: pointer to the above
+ * @lock: corresponding XDPSQ lock
  *
  * Abstraction for driver-independent implementation of Tx. Placed on the stack
  * and filled by the driver before the transmission, so that the generic
@@ -193,6 +311,7 @@ struct libeth_xdpsq {
 
 	u32				*pending;
 	u32				*xdp_tx;
+	struct libeth_xdpsq_lock	*lock;
 };
 
 /**
@@ -229,7 +348,8 @@ struct libeth_xdp_tx_desc {
  *
  * Internal abstraction for placing @n XDP Tx frames on the HW XDPSQ. Used for
  * all types of frames.
- * @unroll greatly increases the object code size, but also greatly increases
+ * @prep must lock the queue as this function releases it at the end. @unroll
+ * greatly increases the object code size, but also greatly increases
  * performance.
  * The compilers inline all those onstack abstractions to direct data accesses.
  *
@@ -253,7 +373,7 @@ libeth_xdp_tx_xmit_bulk(const struct libeth_xdp_tx_frame *bulk, void *xdpsq,
 
 	n = min(n, prep(xdpsq, &sq));
 	if (unlikely(!n))
-		return 0;
+		goto unlock;
 
 	ntu = *sq.ntu;
 
@@ -302,6 +422,9 @@ out:
 	if (sq.xdp_tx)
 		*sq.xdp_tx += n;
 
+unlock:
+	libeth_xdpsq_unlock(sq.lock);
+
 	return n;
 }
 
-- 
cgit v1.2.3


From 819bbaefeded93df36d71d58d9963d706e6e99e1 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Thu, 12 Jun 2025 18:02:25 +0200
Subject: libeth: xdp: add XDPSQ cleanup timers

When XDP Tx queues are not interrupt-driven but use lazy cleaning,
i.e. only when there are less than `threshold` free descriptors left,
we also need cleanup timers to avoid &xdp_buff and &xdp_frame stalling
for too long, especially with Page Pool (it warns about inflight
pages every 60 seconds).
Let's say we sent 256 frames and don't need to send more, but we clean
only when the number of pending items >= 384. In that case, those 256
will stall until 128 more are sent. For this, add simple helpers to
run a timer which will clean the queue regardless, after 1 second of
the last send.
The timer is triggered when finalizing the queue. As long as there is
regular active traffic, the timer doesn't fire.

Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/libeth/xdp.c | 23 +++++++++++++
 include/net/libeth/types.h              | 21 +++++++++++-
 include/net/libeth/xdp.h                | 57 +++++++++++++++++++++++++++++++++
 3 files changed, 100 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/intel/libeth/xdp.c b/drivers/net/ethernet/intel/libeth/xdp.c
index 0f08dd405190..6f62603cf568 100644
--- a/drivers/net/ethernet/intel/libeth/xdp.c
+++ b/drivers/net/ethernet/intel/libeth/xdp.c
@@ -56,6 +56,29 @@ __libeth_xdpsq_unlock(struct libeth_xdpsq_lock *lock)
 }
 EXPORT_SYMBOL_GPL(__libeth_xdpsq_unlock);
 
+/* XDPSQ clean-up timers */
+
+/**
+ * libeth_xdpsq_init_timer - initialize an XDPSQ clean-up timer
+ * @timer: timer to initialize
+ * @xdpsq: queue this timer belongs to
+ * @lock: corresponding XDPSQ lock
+ * @poll: queue polling/completion function
+ *
+ * XDPSQ clean-up timers must be set up before using at the queue configuration
+ * time. Set the required pointers and the cleaning callback.
+ */
+void libeth_xdpsq_init_timer(struct libeth_xdpsq_timer *timer, void *xdpsq,
+			     struct libeth_xdpsq_lock *lock,
+			     void (*poll)(struct work_struct *work))
+{
+	timer->xdpsq = xdpsq;
+	timer->lock = lock;
+
+	INIT_DELAYED_WORK(&timer->dwork, poll);
+}
+EXPORT_SYMBOL_GPL(libeth_xdpsq_init_timer);
+
 /* ``XDP_TX`` bulking */
 
 static void __cold
diff --git a/include/net/libeth/types.h b/include/net/libeth/types.h
index abfccae1a346..4df703a9eb59 100644
--- a/include/net/libeth/types.h
+++ b/include/net/libeth/types.h
@@ -4,7 +4,7 @@
 #ifndef __LIBETH_TYPES_H
 #define __LIBETH_TYPES_H
 
-#include <linux/spinlock.h>
+#include <linux/workqueue.h>
 
 /**
  * struct libeth_sq_napi_stats - "hot" counters to update in Tx completion loop
@@ -60,4 +60,23 @@ struct libeth_xdpsq_lock {
 	bool				share;
 };
 
+/* XDPSQ clean-up timers */
+
+/**
+ * struct libeth_xdpsq_timer - timer for cleaning up XDPSQs w/o interrupts
+ * @xdpsq: queue this timer belongs to
+ * @lock: lock for the queue
+ * @dwork: work performing cleanups
+ *
+ * XDPSQs not using interrupts but lazy cleaning, i.e. only when there's no
+ * space for sending the current queued frame/bulk, must fire up timers to
+ * make sure there are no stale buffers to free.
+ */
+struct libeth_xdpsq_timer {
+	void				*xdpsq;
+	struct libeth_xdpsq_lock	*lock;
+
+	struct delayed_work		dwork;
+};
+
 #endif /* __LIBETH_TYPES_H */
diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h
index 20977fdfd6c9..22bd038decb6 100644
--- a/include/net/libeth/xdp.h
+++ b/include/net/libeth/xdp.h
@@ -177,6 +177,63 @@ static inline void libeth_xdpsq_unlock(struct libeth_xdpsq_lock *lock)
 		__libeth_xdpsq_unlock(lock);
 }
 
+/* XDPSQ clean-up timers */
+
+void libeth_xdpsq_init_timer(struct libeth_xdpsq_timer *timer, void *xdpsq,
+			     struct libeth_xdpsq_lock *lock,
+			     void (*poll)(struct work_struct *work));
+
+/**
+ * libeth_xdpsq_deinit_timer - deinitialize &libeth_xdpsq_timer
+ * @timer: timer to deinitialize
+ *
+ * Cancel the underlying delayed work and wait until it finishes.
+ */
+static inline void libeth_xdpsq_deinit_timer(struct libeth_xdpsq_timer *timer)
+{
+	cancel_delayed_work_sync(&timer->dwork);
+}
+
+/**
+ * libeth_xdpsq_queue_timer - run &libeth_xdpsq_timer
+ * @timer: timer to queue
+ *
+ * Should be called after the queue was filled and the transmission was run
+ * to complete the pending buffers if no further sending will be done in a
+ * second (-> lazy cleaning won't happen).
+ * If the timer is already pending, its timeout is reset back to one
+ * second.
+ */
+static inline void libeth_xdpsq_queue_timer(struct libeth_xdpsq_timer *timer)
+{
+	mod_delayed_work_on(raw_smp_processor_id(), system_bh_highpri_wq,
+			    &timer->dwork, HZ);
+}
+
+/**
+ * libeth_xdpsq_run_timer - wrapper to run a queue clean-up on a timer event
+ * @work: work item belonging to the corresponding timer
+ * @poll: driver-specific completion queue poll function
+ *
+ * Run the polling function on the locked queue and requeue the timer if
+ * there's more work to do.
+ * Designed to be used via LIBETH_XDP_DEFINE_TIMER() below.
+ */
+static __always_inline void
+libeth_xdpsq_run_timer(struct work_struct *work,
+		       u32 (*poll)(void *xdpsq, u32 budget))
+{
+	struct libeth_xdpsq_timer *timer = container_of(work, typeof(*timer),
+							dwork.work);
+
+	libeth_xdpsq_lock(timer->lock);
+
+	if (poll(timer->xdpsq, U32_MAX))
+		libeth_xdpsq_queue_timer(timer);
+
+	libeth_xdpsq_unlock(timer->lock);
+}
+
 /* Common Tx bits */
 
 /**
-- 
cgit v1.2.3


From 3ef2b0192e8ba133f597919632bd9cf196076f0b Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Thu, 12 Jun 2025 18:02:26 +0200
Subject: libeth: xdp: add helpers for preparing/processing &libeth_xdp_buff

Add convenience helpers to build an &xdp_buff. This means: general
initialization before the NAPI loop, adding head, adding frags etc.
libeth_xdp_process_buff() is the same as what everybody has in their
drivers:

dma_sync_for_cpu();

if (!frag) {
	add_head();
	prefetch();
} else {
	add_frag();
}

Note that I don't use net_prefetch(), sticking to the original
prefetch(). In none of my tests prefetching 128 bytes yielded better
perf than 64 bytes. That might differ if the headers are huge enough,
but then additional tunneling etc. overhead takes place, so you won't
win a lot either way.

&libeth_xdp_stash is for cases when you exit the polling loop without
finishing building the buff. If that happens, you need to store the
buffer in the queue structure until the next loop and then restore it.
It makes no sense to place a whole full &xdp_buff there. Define a
minimal structure, which would store only the fields essential to
restore it.
I was able to pack it into 16 bytes, which is only 8 bytes bigger
than `struct sk_buff *skb` on x64.

Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/libeth/xdp.c |  90 +++++++++++++++++++
 include/net/libeth/types.h              |  23 +++++
 include/net/libeth/xdp.h                | 151 ++++++++++++++++++++++++++++++++
 3 files changed, 264 insertions(+)

(limited to 'include')

diff --git a/drivers/net/ethernet/intel/libeth/xdp.c b/drivers/net/ethernet/intel/libeth/xdp.c
index 6f62603cf568..d0669f1f02f3 100644
--- a/drivers/net/ethernet/intel/libeth/xdp.c
+++ b/drivers/net/ethernet/intel/libeth/xdp.c
@@ -174,6 +174,64 @@ EXPORT_SYMBOL_GPL(libeth_xdp_xmit_return_bulk);
 
 /* Rx polling path */
 
+/**
+ * libeth_xdp_load_stash - recreate an &xdp_buff from libeth_xdp buffer stash
+ * @dst: target &libeth_xdp_buff to initialize
+ * @src: source stash
+ *
+ * External helper used by libeth_xdp_init_buff(), do not call directly.
+ * Recreate an onstack &libeth_xdp_buff using the stash saved earlier.
+ * The only field untouched (rxq) is initialized later in the
+ * abovementioned function.
+ */
+void libeth_xdp_load_stash(struct libeth_xdp_buff *dst,
+			   const struct libeth_xdp_buff_stash *src)
+{
+	dst->data = src->data;
+	dst->base.data_end = src->data + src->len;
+	dst->base.data_meta = src->data;
+	dst->base.data_hard_start = src->data - src->headroom;
+
+	dst->base.frame_sz = src->frame_sz;
+	dst->base.flags = src->flags;
+}
+EXPORT_SYMBOL_GPL(libeth_xdp_load_stash);
+
+/**
+ * libeth_xdp_save_stash - convert &xdp_buff to a libeth_xdp buffer stash
+ * @dst: target &libeth_xdp_buff_stash to initialize
+ * @src: source XDP buffer
+ *
+ * External helper used by libeth_xdp_save_buff(), do not call directly.
+ * Use the fields from the passed XDP buffer to initialize the stash on the
+ * queue, so that a partially received frame can be finished later during
+ * the next NAPI poll.
+ */
+void libeth_xdp_save_stash(struct libeth_xdp_buff_stash *dst,
+			   const struct libeth_xdp_buff *src)
+{
+	dst->data = src->data;
+	dst->headroom = src->data - src->base.data_hard_start;
+	dst->len = src->base.data_end - src->data;
+
+	dst->frame_sz = src->base.frame_sz;
+	dst->flags = src->base.flags;
+
+	WARN_ON_ONCE(dst->flags != src->base.flags);
+}
+EXPORT_SYMBOL_GPL(libeth_xdp_save_stash);
+
+void __libeth_xdp_return_stash(struct libeth_xdp_buff_stash *stash)
+{
+	LIBETH_XDP_ONSTACK_BUFF(xdp);
+
+	libeth_xdp_load_stash(xdp, stash);
+	libeth_xdp_return_buff_slow(xdp);
+
+	stash->data = NULL;
+}
+EXPORT_SYMBOL_GPL(__libeth_xdp_return_stash);
+
 /**
  * libeth_xdp_return_buff_slow - free &libeth_xdp_buff
  * @xdp: buffer to free/return
@@ -187,6 +245,38 @@ void __cold libeth_xdp_return_buff_slow(struct libeth_xdp_buff *xdp)
 }
 EXPORT_SYMBOL_GPL(libeth_xdp_return_buff_slow);
 
+/**
+ * libeth_xdp_buff_add_frag - add frag to XDP buffer
+ * @xdp: head XDP buffer
+ * @fqe: Rx buffer containing the frag
+ * @len: frag length reported by HW
+ *
+ * External helper used by libeth_xdp_process_buff(), do not call directly.
+ * Frees both head and frag buffers on error.
+ *
+ * Return: true on success, false on error (no space for a new frag).
+ */
+bool libeth_xdp_buff_add_frag(struct libeth_xdp_buff *xdp,
+			      const struct libeth_fqe *fqe,
+			      u32 len)
+{
+	netmem_ref netmem = fqe->netmem;
+
+	if (!xdp_buff_add_frag(&xdp->base, netmem,
+			       fqe->offset + netmem_get_pp(netmem)->p.offset,
+			       len, fqe->truesize))
+		goto recycle;
+
+	return true;
+
+recycle:
+	libeth_rx_recycle_slow(netmem);
+	libeth_xdp_return_buff_slow(xdp);
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(libeth_xdp_buff_add_frag);
+
 /* Tx buffer completion */
 
 static void libeth_xdp_put_netmem_bulk(netmem_ref netmem,
diff --git a/include/net/libeth/types.h b/include/net/libeth/types.h
index 4df703a9eb59..7b27c1966d45 100644
--- a/include/net/libeth/types.h
+++ b/include/net/libeth/types.h
@@ -79,4 +79,27 @@ struct libeth_xdpsq_timer {
 	struct delayed_work		dwork;
 };
 
+/* Rx polling path */
+
+/**
+ * struct libeth_xdp_buff_stash - struct for stashing &xdp_buff onto a queue
+ * @data: pointer to the start of the frame, xdp_buff.data
+ * @headroom: frame headroom, xdp_buff.data - xdp_buff.data_hard_start
+ * @len: frame linear space length, xdp_buff.data_end - xdp_buff.data
+ * @frame_sz: truesize occupied by the frame, xdp_buff.frame_sz
+ * @flags: xdp_buff.flags
+ *
+ * &xdp_buff is 56 bytes long on x64, &libeth_xdp_buff is 64 bytes. This
+ * structure carries only necessary fields to save/restore a partially built
+ * frame on the queue structure to finish it during the next NAPI poll.
+ */
+struct libeth_xdp_buff_stash {
+	void				*data;
+	u16				headroom;
+	u16				len;
+
+	u32				frame_sz:24;
+	u32				flags:8;
+} __aligned_largest;
+
 #endif /* __LIBETH_TYPES_H */
diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h
index 22bd038decb6..780447cdabc1 100644
--- a/include/net/libeth/xdp.h
+++ b/include/net/libeth/xdp.h
@@ -60,6 +60,42 @@ static_assert(offsetof(struct libeth_xdp_buff, desc) ==
 static_assert(IS_ALIGNED(sizeof(struct xdp_buff_xsk),
 			 __alignof(struct libeth_xdp_buff)));
 
+/**
+ * __LIBETH_XDP_ONSTACK_BUFF - declare a &libeth_xdp_buff on the stack
+ * @name: name of the variable to declare
+ * @...: sizeof() of the driver-private data
+ */
+#define __LIBETH_XDP_ONSTACK_BUFF(name, ...)				      \
+	___LIBETH_XDP_ONSTACK_BUFF(name, ##__VA_ARGS__)
+/**
+ * LIBETH_XDP_ONSTACK_BUFF - declare a &libeth_xdp_buff on the stack
+ * @name: name of the variable to declare
+ * @...: type or variable name of the driver-private data
+ */
+#define LIBETH_XDP_ONSTACK_BUFF(name, ...)				      \
+	__LIBETH_XDP_ONSTACK_BUFF(name, __libeth_xdp_priv_sz(__VA_ARGS__))
+
+#define ___LIBETH_XDP_ONSTACK_BUFF(name, ...)				      \
+	__DEFINE_FLEX(struct libeth_xdp_buff, name, priv,		      \
+		      LIBETH_XDP_PRIV_SZ(__VA_ARGS__ + 0),		      \
+		      __uninitialized);					      \
+	LIBETH_XDP_ASSERT_PRIV_SZ(__VA_ARGS__ + 0)
+
+#define __libeth_xdp_priv_sz(...)					      \
+	CONCATENATE(__libeth_xdp_psz, COUNT_ARGS(__VA_ARGS__))(__VA_ARGS__)
+
+#define __libeth_xdp_psz0(...)
+#define __libeth_xdp_psz1(...)		sizeof(__VA_ARGS__)
+
+#define LIBETH_XDP_PRIV_SZ(sz)						      \
+	(ALIGN(sz, __alignof(struct libeth_xdp_buff)) / sizeof(long))
+
+/* Performs XSK_CHECK_PRIV_TYPE() */
+#define LIBETH_XDP_ASSERT_PRIV_SZ(sz)					      \
+	static_assert(offsetofend(struct xdp_buff_xsk, cb) >=		      \
+		      struct_size_t(struct libeth_xdp_buff, priv,	      \
+				    LIBETH_XDP_PRIV_SZ(sz)))
+
 /* XDPSQ sharing */
 
 DECLARE_STATIC_KEY_FALSE(libeth_xdpsq_share);
@@ -963,6 +999,65 @@ __libeth_xdp_xmit_do_bulk(struct libeth_xdp_tx_bulk *bq,
 
 /* Rx polling path */
 
+void libeth_xdp_load_stash(struct libeth_xdp_buff *dst,
+			   const struct libeth_xdp_buff_stash *src);
+void libeth_xdp_save_stash(struct libeth_xdp_buff_stash *dst,
+			   const struct libeth_xdp_buff *src);
+void __libeth_xdp_return_stash(struct libeth_xdp_buff_stash *stash);
+
+/**
+ * libeth_xdp_init_buff - initialize a &libeth_xdp_buff for Rx NAPI poll
+ * @dst: onstack buffer to initialize
+ * @src: XDP buffer stash placed on the queue
+ * @rxq: registered &xdp_rxq_info corresponding to this queue
+ *
+ * Should be called before the main NAPI polling loop. Loads the content of
+ * the previously saved stash or initializes the buffer from scratch.
+ */
+static inline void
+libeth_xdp_init_buff(struct libeth_xdp_buff *dst,
+		     const struct libeth_xdp_buff_stash *src,
+		     struct xdp_rxq_info *rxq)
+{
+	if (likely(!src->data))
+		dst->data = NULL;
+	else
+		libeth_xdp_load_stash(dst, src);
+
+	dst->base.rxq = rxq;
+}
+
+/**
+ * libeth_xdp_save_buff - save a partially built buffer on a queue
+ * @dst: XDP buffer stash placed on the queue
+ * @src: onstack buffer to save
+ *
+ * Should be called after the main NAPI polling loop. If the loop exited before
+ * the buffer was finished, saves its content on the queue, so that it can be
+ * completed during the next poll. Otherwise, clears the stash.
+ */
+static inline void libeth_xdp_save_buff(struct libeth_xdp_buff_stash *dst,
+					const struct libeth_xdp_buff *src)
+{
+	if (likely(!src->data))
+		dst->data = NULL;
+	else
+		libeth_xdp_save_stash(dst, src);
+}
+
+/**
+ * libeth_xdp_return_stash - free an XDP buffer stash from a queue
+ * @stash: stash to free
+ *
+ * If the queue is about to be destroyed, but it still has an incomplete
+ * buffer stash, this helper should be called to free it.
+ */
+static inline void libeth_xdp_return_stash(struct libeth_xdp_buff_stash *stash)
+{
+	if (stash->data)
+		__libeth_xdp_return_stash(stash);
+}
+
 static inline void libeth_xdp_return_va(const void *data, bool napi)
 {
 	netmem_ref netmem = virt_to_netmem(data);
@@ -1004,6 +1099,62 @@ out:
 	xdp->data = NULL;
 }
 
+bool libeth_xdp_buff_add_frag(struct libeth_xdp_buff *xdp,
+			      const struct libeth_fqe *fqe,
+			      u32 len);
+
+/**
+ * libeth_xdp_prepare_buff - fill &libeth_xdp_buff with head FQE data
+ * @xdp: XDP buffer to attach the head to
+ * @fqe: FQE containing the head buffer
+ * @len: buffer len passed from HW
+ *
+ * Internal, use libeth_xdp_process_buff() instead. Initializes XDP buffer
+ * head with the Rx buffer data: data pointer, length, headroom, and
+ * truesize/tailroom. Zeroes the flags.
+ */
+static inline void libeth_xdp_prepare_buff(struct libeth_xdp_buff *xdp,
+					   const struct libeth_fqe *fqe,
+					   u32 len)
+{
+	const struct page *page = __netmem_to_page(fqe->netmem);
+
+	xdp_init_buff(&xdp->base, fqe->truesize, xdp->base.rxq);
+	xdp_prepare_buff(&xdp->base, page_address(page) + fqe->offset,
+			 page->pp->p.offset, len, true);
+}
+
+/**
+ * libeth_xdp_process_buff - attach Rx buffer to &libeth_xdp_buff
+ * @xdp: XDP buffer to attach the Rx buffer to
+ * @fqe: Rx buffer to process
+ * @len: received data length from the descriptor
+ *
+ * If the XDP buffer is empty, attaches the Rx buffer as head and initializes
+ * the required fields. Otherwise, attaches the buffer as a frag.
+ * Already performs DMA sync-for-CPU and frame start prefetch
+ * (for head buffers only).
+ *
+ * Return: true on success, false if the descriptor must be skipped (empty or
+ * no space for a new frag).
+ */
+static inline bool libeth_xdp_process_buff(struct libeth_xdp_buff *xdp,
+					   const struct libeth_fqe *fqe,
+					   u32 len)
+{
+	if (!libeth_rx_sync_for_cpu(fqe, len))
+		return false;
+
+	if (xdp->data)
+		return libeth_xdp_buff_add_frag(xdp, fqe, len);
+
+	libeth_xdp_prepare_buff(xdp, fqe, len);
+
+	prefetch(xdp->data);
+
+	return true;
+}
+
 /* Tx buffer completion */
 
 void libeth_xdp_return_buff_bulk(const struct skb_shared_info *sinfo,
-- 
cgit v1.2.3


From 4c805f7ae1ce61a90121378a5ee1f47b3b870c73 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Thu, 12 Jun 2025 18:02:27 +0200
Subject: libeth: xdp: add XDP prog run and verdict result handling

Running a prog and handling the verdicts, up to napi_gro_receive()
is also pretty generic code not really differing between vendors
(except for Tx descriptor filling and Rx descriptor parsing).

Define a couple inlines to do that. The inline callbacks a driver
needs to pass are mentioned above: Tx descriptor filling for XDP_TX,
populating skb with the descriptor data for XDP_PASS, finalizing
XDPSQs after the polling loop for XDP_TX (kicking the HW to start
sending).
The populate callback passes only &libeth_xdp_buff assuming buff::desc
pointer is enough, plus you can always get the corresponding Rx queue
structure via container_of(buff::rxq). If not, a driver can extend
the buff with more fields directly on the stack without touching
libeth_xdp definitions.

Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/libeth/xdp.c |  27 +++
 include/net/libeth/types.h              |  22 +++
 include/net/libeth/xdp.h                | 281 ++++++++++++++++++++++++++++++++
 3 files changed, 330 insertions(+)

(limited to 'include')

diff --git a/drivers/net/ethernet/intel/libeth/xdp.c b/drivers/net/ethernet/intel/libeth/xdp.c
index d0669f1f02f3..1607579d65bb 100644
--- a/drivers/net/ethernet/intel/libeth/xdp.c
+++ b/drivers/net/ethernet/intel/libeth/xdp.c
@@ -277,6 +277,33 @@ recycle:
 }
 EXPORT_SYMBOL_GPL(libeth_xdp_buff_add_frag);
 
+/**
+ * libeth_xdp_prog_exception - handle XDP prog exceptions
+ * @bq: XDP Tx bulk
+ * @xdp: buffer to process
+ * @act: original XDP prog verdict
+ * @ret: error code if redirect failed
+ *
+ * External helper used by __libeth_xdp_run_prog(), do not call directly.
+ * Reports invalid @act, XDP exception trace event and frees the buffer.
+ *
+ * Return: libeth_xdp XDP prog verdict.
+ */
+u32 __cold libeth_xdp_prog_exception(const struct libeth_xdp_tx_bulk *bq,
+				     struct libeth_xdp_buff *xdp,
+				     enum xdp_action act, int ret)
+{
+	if (act > XDP_REDIRECT)
+		bpf_warn_invalid_xdp_action(bq->dev, bq->prog, act);
+
+	libeth_trace_xdp_exception(bq->dev, bq->prog, act);
+
+	libeth_xdp_return_buff_slow(xdp);
+
+	return LIBETH_XDP_DROP;
+}
+EXPORT_SYMBOL_GPL(libeth_xdp_prog_exception);
+
 /* Tx buffer completion */
 
 static void libeth_xdp_put_netmem_bulk(netmem_ref netmem,
diff --git a/include/net/libeth/types.h b/include/net/libeth/types.h
index 7b27c1966d45..cf1d78a9dc38 100644
--- a/include/net/libeth/types.h
+++ b/include/net/libeth/types.h
@@ -6,6 +6,28 @@
 
 #include <linux/workqueue.h>
 
+/* Stats */
+
+/**
+ * struct libeth_rq_napi_stats - "hot" counters to update in Rx polling loop
+ * @packets: received frames counter
+ * @bytes: sum of bytes of received frames above
+ * @fragments: sum of fragments of received S/G frames
+ * @hsplit: number of frames the device performed the header split for
+ * @raw: alias to access all the fields as an array
+ */
+struct libeth_rq_napi_stats {
+	union {
+		struct {
+							u32 packets;
+							u32 bytes;
+							u32 fragments;
+							u32 hsplit;
+		};
+		DECLARE_FLEX_ARRAY(u32, raw);
+	};
+};
+
 /**
  * struct libeth_sq_napi_stats - "hot" counters to update in Tx completion loop
  * @packets: completed frames counter
diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h
index 780447cdabc1..db99bc690eb6 100644
--- a/include/net/libeth/xdp.h
+++ b/include/net/libeth/xdp.h
@@ -20,6 +20,7 @@ enum {
 	LIBETH_XDP_DROP			= BIT(0),
 	LIBETH_XDP_ABORTED		= BIT(1),
 	LIBETH_XDP_TX			= BIT(2),
+	LIBETH_XDP_REDIRECT		= BIT(3),
 };
 
 /*
@@ -353,6 +354,7 @@ static_assert(offsetof(struct libeth_xdp_tx_frame, frag.len) ==
  * @prog: corresponding active XDP program, %NULL for .ndo_xdp_xmit()
  * @dev: &net_device which the frames are transmitted on
  * @xdpsq: shortcut to the corresponding driver-specific XDPSQ structure
+ * @act_mask: Rx only, mask of all the XDP prog verdicts for that NAPI session
  * @count: current number of frames in @bulk
  * @bulk: array of queued frames for bulk Tx
  *
@@ -366,6 +368,7 @@ struct libeth_xdp_tx_bulk {
 	struct net_device		*dev;
 	void				*xdpsq;
 
+	u32				act_mask;
 	u32				count;
 	struct libeth_xdp_tx_frame	bulk[LIBETH_XDP_TX_BULK];
 } __aligned(sizeof(struct libeth_xdp_tx_frame));
@@ -999,6 +1002,40 @@ __libeth_xdp_xmit_do_bulk(struct libeth_xdp_tx_bulk *bq,
 
 /* Rx polling path */
 
+/**
+ * libeth_xdp_tx_init_bulk - initialize an XDP Tx bulk for Rx NAPI poll
+ * @bq: bulk to initialize
+ * @prog: RCU pointer to the XDP program (can be %NULL)
+ * @dev: target &net_device
+ * @xdpsqs: array of driver XDPSQ structs
+ * @num: number of active XDPSQs, the above array length
+ *
+ * Should be called on an onstack XDP Tx bulk before the NAPI polling loop.
+ * Initializes all the needed fields to run libeth_xdp functions. If @num == 0,
+ * assumes XDP is not enabled.
+ */
+#define libeth_xdp_tx_init_bulk(bq, prog, dev, xdpsqs, num)		      \
+	__libeth_xdp_tx_init_bulk(bq, prog, dev, xdpsqs, num, false,	      \
+				  __UNIQUE_ID(bq_), __UNIQUE_ID(nqs_))
+
+#define __libeth_xdp_tx_init_bulk(bq, pr, d, xdpsqs, num, ub, un) do {	      \
+	typeof(bq) ub = (bq);						      \
+	u32 un = (num);							      \
+									      \
+	rcu_read_lock();						      \
+									      \
+	if (un) {							      \
+		ub->prog = rcu_dereference(pr);				      \
+		ub->dev = (d);						      \
+		ub->xdpsq = (xdpsqs)[libeth_xdpsq_id(un)];		      \
+	} else {							      \
+		ub->prog = NULL;					      \
+	}								      \
+									      \
+	ub->act_mask = 0;						      \
+	ub->count = 0;							      \
+} while (0)
+
 void libeth_xdp_load_stash(struct libeth_xdp_buff *dst,
 			   const struct libeth_xdp_buff_stash *src);
 void libeth_xdp_save_stash(struct libeth_xdp_buff_stash *dst,
@@ -1155,6 +1192,250 @@ static inline bool libeth_xdp_process_buff(struct libeth_xdp_buff *xdp,
 	return true;
 }
 
+/**
+ * libeth_xdp_buff_stats_frags - update onstack RQ stats with XDP frags info
+ * @ss: onstack stats to update
+ * @xdp: buffer to account
+ *
+ * Internal helper used by __libeth_xdp_run_pass(), do not call directly.
+ * Adds buffer's frags count and total len to the onstack stats.
+ */
+static inline void
+libeth_xdp_buff_stats_frags(struct libeth_rq_napi_stats *ss,
+			    const struct libeth_xdp_buff *xdp)
+{
+	const struct skb_shared_info *sinfo;
+
+	sinfo = xdp_get_shared_info_from_buff(&xdp->base);
+	ss->bytes += sinfo->xdp_frags_size;
+	ss->fragments += sinfo->nr_frags + 1;
+}
+
+u32 libeth_xdp_prog_exception(const struct libeth_xdp_tx_bulk *bq,
+			      struct libeth_xdp_buff *xdp,
+			      enum xdp_action act, int ret);
+
+/**
+ * __libeth_xdp_run_prog - run XDP program on an XDP buffer
+ * @xdp: XDP buffer to run the prog on
+ * @bq: buffer bulk for ``XDP_TX`` queueing
+ *
+ * Internal inline abstraction to run XDP program. Handles ``XDP_DROP``
+ * and ``XDP_REDIRECT`` only, the rest is processed levels up.
+ * Reports an XDP prog exception on errors.
+ *
+ * Return: libeth_xdp prog verdict depending on the prog's verdict.
+ */
+static __always_inline u32
+__libeth_xdp_run_prog(struct libeth_xdp_buff *xdp,
+		      const struct libeth_xdp_tx_bulk *bq)
+{
+	enum xdp_action act;
+
+	act = bpf_prog_run_xdp(bq->prog, &xdp->base);
+	if (unlikely(act < XDP_DROP || act > XDP_REDIRECT))
+		goto out;
+
+	switch (act) {
+	case XDP_PASS:
+		return LIBETH_XDP_PASS;
+	case XDP_DROP:
+		libeth_xdp_return_buff(xdp);
+
+		return LIBETH_XDP_DROP;
+	case XDP_TX:
+		return LIBETH_XDP_TX;
+	case XDP_REDIRECT:
+		if (unlikely(xdp_do_redirect(bq->dev, &xdp->base, bq->prog)))
+			break;
+
+		xdp->data = NULL;
+
+		return LIBETH_XDP_REDIRECT;
+	default:
+		break;
+	}
+
+out:
+	return libeth_xdp_prog_exception(bq, xdp, act, 0);
+}
+
+/**
+ * __libeth_xdp_run_flush - run XDP program and handle ``XDP_TX`` verdict
+ * @xdp: XDP buffer to run the prog on
+ * @bq: buffer bulk for ``XDP_TX`` queueing
+ * @run: internal callback for running XDP program
+ * @queue: internal callback for queuing ``XDP_TX`` frame
+ * @flush_bulk: driver callback for flushing a bulk
+ *
+ * Internal inline abstraction to run XDP program and additionally handle
+ * ``XDP_TX`` verdict.
+ * Do not use directly.
+ *
+ * Return: libeth_xdp prog verdict depending on the prog's verdict.
+ */
+static __always_inline u32
+__libeth_xdp_run_flush(struct libeth_xdp_buff *xdp,
+		       struct libeth_xdp_tx_bulk *bq,
+		       u32 (*run)(struct libeth_xdp_buff *xdp,
+				  const struct libeth_xdp_tx_bulk *bq),
+		       bool (*queue)(struct libeth_xdp_tx_bulk *bq,
+				     struct libeth_xdp_buff *xdp,
+				     bool (*flush_bulk)
+					  (struct libeth_xdp_tx_bulk *bq,
+					   u32 flags)),
+		       bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq,
+					  u32 flags))
+{
+	u32 act;
+
+	act = run(xdp, bq);
+	if (act == LIBETH_XDP_TX && unlikely(!queue(bq, xdp, flush_bulk)))
+		act = LIBETH_XDP_DROP;
+
+	bq->act_mask |= act;
+
+	return act;
+}
+
+/**
+ * libeth_xdp_run_prog - run XDP program and handle all verdicts
+ * @xdp: XDP buffer to process
+ * @bq: XDP Tx bulk to queue ``XDP_TX`` buffers
+ * @fl: driver ``XDP_TX`` bulk flush callback
+ *
+ * Run the attached XDP program and handle all possible verdicts.
+ *
+ * Return: true if the buffer should be passed up the stack, false if the poll
+ * should go to the next buffer.
+ */
+#define libeth_xdp_run_prog(xdp, bq, fl)				      \
+	(__libeth_xdp_run_flush(xdp, bq, __libeth_xdp_run_prog,		      \
+				libeth_xdp_tx_queue_bulk,		      \
+				fl) == LIBETH_XDP_PASS)
+
+/**
+ * __libeth_xdp_run_pass - helper to run XDP program and handle the result
+ * @xdp: XDP buffer to process
+ * @bq: XDP Tx bulk to queue ``XDP_TX`` frames
+ * @napi: NAPI to build an skb and pass it up the stack
+ * @rs: onstack libeth RQ stats
+ * @md: metadata that should be filled to the XDP buffer
+ * @prep: callback for filling the metadata
+ * @run: driver wrapper to run XDP program
+ * @populate: driver callback to populate an skb with the HW descriptor data
+ *
+ * Inline abstraction that does the following:
+ * 1) adds frame size and frag number (if needed) to the onstack stats;
+ * 2) fills the descriptor metadata to the onstack &libeth_xdp_buff;
+ * 3) runs XDP program if present;
+ * 4) handles all possible verdicts;
+ * 5) on ``XDP_PASS``, builds an skb from the buffer;
+ * 6) populates it with the descriptor metadata;
+ * 7) passes it up the stack.
+ *
+ * In most cases, number 2 means just writing the pointer to the HW descriptor
+ * to the XDP buffer. If so, please use LIBETH_XDP_DEFINE_RUN{,_PASS}()
+ * wrappers to build a driver function.
+ */
+static __always_inline void
+__libeth_xdp_run_pass(struct libeth_xdp_buff *xdp,
+		      struct libeth_xdp_tx_bulk *bq, struct napi_struct *napi,
+		      struct libeth_rq_napi_stats *rs, const void *md,
+		      void (*prep)(struct libeth_xdp_buff *xdp,
+				   const void *md),
+		      bool (*run)(struct libeth_xdp_buff *xdp,
+				  struct libeth_xdp_tx_bulk *bq),
+		      bool (*populate)(struct sk_buff *skb,
+				       const struct libeth_xdp_buff *xdp,
+				       struct libeth_rq_napi_stats *rs))
+{
+	struct sk_buff *skb;
+
+	rs->bytes += xdp->base.data_end - xdp->data;
+	rs->packets++;
+
+	if (xdp_buff_has_frags(&xdp->base))
+		libeth_xdp_buff_stats_frags(rs, xdp);
+
+	if (prep && (!__builtin_constant_p(!!md) || md))
+		prep(xdp, md);
+
+	if (!bq || !run || !bq->prog)
+		goto build;
+
+	if (!run(xdp, bq))
+		return;
+
+build:
+	skb = xdp_build_skb_from_buff(&xdp->base);
+	if (unlikely(!skb)) {
+		libeth_xdp_return_buff_slow(xdp);
+		return;
+	}
+
+	xdp->data = NULL;
+
+	if (unlikely(!populate(skb, xdp, rs))) {
+		napi_consume_skb(skb, true);
+		return;
+	}
+
+	napi_gro_receive(napi, skb);
+}
+
+static inline void libeth_xdp_prep_desc(struct libeth_xdp_buff *xdp,
+					const void *desc)
+{
+	xdp->desc = desc;
+}
+
+/**
+ * libeth_xdp_run_pass - helper to run XDP program and handle the result
+ * @xdp: XDP buffer to process
+ * @bq: XDP Tx bulk to queue ``XDP_TX`` frames
+ * @napi: NAPI to build an skb and pass it up the stack
+ * @ss: onstack libeth RQ stats
+ * @desc: pointer to the HW descriptor for that frame
+ * @run: driver wrapper to run XDP program
+ * @populate: driver callback to populate an skb with the HW descriptor data
+ *
+ * Wrapper around the underscored version when "fill the descriptor metadata"
+ * means just writing the pointer to the HW descriptor as @xdp->desc.
+ */
+#define libeth_xdp_run_pass(xdp, bq, napi, ss, desc, run, populate)	      \
+	__libeth_xdp_run_pass(xdp, bq, napi, ss, desc, libeth_xdp_prep_desc,  \
+			      run, populate)
+
+/**
+ * libeth_xdp_finalize_rx - finalize XDPSQ after a NAPI polling loop
+ * @bq: ``XDP_TX`` frame bulk
+ * @flush: driver callback to flush the bulk
+ * @finalize: driver callback to start sending the frames and run the timer
+ *
+ * Flush the bulk if there are frames left to send, kick the queue and flush
+ * the XDP maps.
+ */
+#define libeth_xdp_finalize_rx(bq, flush, finalize)			      \
+	__libeth_xdp_finalize_rx(bq, 0, flush, finalize)
+
+static __always_inline void
+__libeth_xdp_finalize_rx(struct libeth_xdp_tx_bulk *bq, u32 flags,
+			 bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq,
+					    u32 flags),
+			 void (*finalize)(void *xdpsq, bool sent, bool flush))
+{
+	if (bq->act_mask & LIBETH_XDP_TX) {
+		if (bq->count)
+			flush_bulk(bq, flags | LIBETH_XDP_TX_DROP);
+		finalize(bq->xdpsq, true, true);
+	}
+	if (bq->act_mask & LIBETH_XDP_REDIRECT)
+		xdp_do_flush();
+
+	rcu_read_unlock();
+}
+
 /* Tx buffer completion */
 
 void libeth_xdp_return_buff_bulk(const struct skb_shared_info *sinfo,
-- 
cgit v1.2.3


From 1bb635d3748b7158c6a19e6fca4fb85e6f96fd9a Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Thu, 12 Jun 2025 18:02:28 +0200
Subject: libeth: xdp: add templates for building driver-side callbacks

Defining driver-specific functions to pass to libeth_xdp functions can
induce boilerplates and/or look a bit cryptic with all those layers of
indirection. On the other hand, this indirection is needed to allow
compilers to uninline big functions even when passed to __always_inline
helpers (too much inlining also hurts performance in some cases), plus
to reuse some XDP helpers in XSk code.
Add macros to quickly build them, with the detailed kdoc. They take
names of the actual callbacks for filling a Tx descriptor and other
purely HW-specific things and wrap them appropriately.

LIBETH_XDP_DEFINE_{START,END}() is needed for GCC 8+ unfortunately to
let the drivers control which functions will be static and which global
without hitting `-Wold-style-declaration`.

Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 include/net/libeth/xdp.h | 195 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 195 insertions(+)

(limited to 'include')

diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h
index db99bc690eb6..46a2ec3c3037 100644
--- a/include/net/libeth/xdp.h
+++ b/include/net/libeth/xdp.h
@@ -742,6 +742,9 @@ __libeth_xdp_tx_flush_bulk(struct libeth_xdp_tx_bulk *bq, u32 flags,
  * @flags: Tx flags, see above
  * @prep: driver callback to prepare the queue
  * @xmit: driver callback to fill a HW descriptor
+ *
+ * Use via LIBETH_XDP_DEFINE_FLUSH_TX() to define an ``XDP_TX`` driver
+ * callback.
  */
 #define libeth_xdp_tx_flush_bulk(bq, flags, prep, xmit)			      \
 	__libeth_xdp_tx_flush_bulk(bq, flags, prep, libeth_xdp_tx_fill_buf,   \
@@ -749,6 +752,25 @@ __libeth_xdp_tx_flush_bulk(struct libeth_xdp_tx_bulk *bq, u32 flags,
 
 /* .ndo_xdp_xmit() implementation */
 
+/**
+ * libeth_xdp_xmit_init_bulk - internal helper to initialize bulk for XDP xmit
+ * @bq: bulk to initialize
+ * @dev: target &net_device
+ * @xdpsqs: array of driver-specific XDPSQ structs
+ * @num: number of active XDPSQs (the above array length)
+ */
+#define libeth_xdp_xmit_init_bulk(bq, dev, xdpsqs, num)			      \
+	__libeth_xdp_xmit_init_bulk(bq, dev, (xdpsqs)[libeth_xdpsq_id(num)])
+
+static inline void __libeth_xdp_xmit_init_bulk(struct libeth_xdp_tx_bulk *bq,
+					       struct net_device *dev,
+					       void *xdpsq)
+{
+	bq->dev = dev;
+	bq->xdpsq = xdpsq;
+	bq->count = 0;
+}
+
 /**
  * libeth_xdp_xmit_frame_dma - internal helper to access DMA of an &xdp_frame
  * @xf: pointer to the XDP frame
@@ -941,6 +963,9 @@ libeth_xdp_xmit_fill_buf(struct libeth_xdp_tx_frame frm, u32 i,
  * @flags: Tx flags, see __libeth_xdp_tx_flush_bulk()
  * @prep: driver callback to prepare the queue
  * @xmit: driver callback to fill a HW descriptor
+ *
+ * Use via LIBETH_XDP_DEFINE_FLUSH_XMIT() to define an XDP xmit driver
+ * callback.
  */
 #define libeth_xdp_xmit_flush_bulk(bq, flags, prep, xmit)		      \
 	__libeth_xdp_tx_flush_bulk(bq, (flags) | LIBETH_XDP_TX_NDO, prep,     \
@@ -1000,6 +1025,44 @@ __libeth_xdp_xmit_do_bulk(struct libeth_xdp_tx_bulk *bq,
 	return nxmit;
 }
 
+/**
+ * libeth_xdp_xmit_do_bulk - implement full .ndo_xdp_xmit() in driver
+ * @dev: target &net_device
+ * @n: number of frames to send
+ * @fr: XDP frames to send
+ * @f: flags passed by the stack
+ * @xqs: array of XDPSQs driver structs
+ * @nqs: number of active XDPSQs, the above array length
+ * @fl: driver callback to flush an XDP xmit bulk
+ * @fin: driver callback to finalize the queue
+ *
+ * If the driver has active XDPSQs, perform common checks and send the frames.
+ * Finalize the queue, if requested.
+ *
+ * Return: number of frames sent or -errno on error.
+ */
+#define libeth_xdp_xmit_do_bulk(dev, n, fr, f, xqs, nqs, fl, fin)	      \
+	_libeth_xdp_xmit_do_bulk(dev, n, fr, f, xqs, nqs, fl, fin,	      \
+				 __UNIQUE_ID(bq_), __UNIQUE_ID(ret_),	      \
+				 __UNIQUE_ID(nqs_))
+
+#define _libeth_xdp_xmit_do_bulk(d, n, fr, f, xqs, nqs, fl, fin, ub, ur, un)  \
+({									      \
+	u32 un = (nqs);							      \
+	int ur;								      \
+									      \
+	if (likely(un)) {						      \
+		LIBETH_XDP_ONSTACK_BULK(ub);				      \
+									      \
+		libeth_xdp_xmit_init_bulk(&ub, d, xqs, un);		      \
+		ur = __libeth_xdp_xmit_do_bulk(&ub, fr, n, f, fl, fin);	      \
+	} else {							      \
+		ur = -ENXIO;						      \
+	}								      \
+									      \
+	ur;								      \
+})
+
 /* Rx polling path */
 
 /**
@@ -1305,6 +1368,7 @@ __libeth_xdp_run_flush(struct libeth_xdp_buff *xdp,
  * @fl: driver ``XDP_TX`` bulk flush callback
  *
  * Run the attached XDP program and handle all possible verdicts.
+ * Prefer using it via LIBETH_XDP_DEFINE_RUN{,_PASS,_PROG}().
  *
  * Return: true if the buffer should be passed up the stack, false if the poll
  * should go to the next buffer.
@@ -1436,6 +1500,137 @@ __libeth_xdp_finalize_rx(struct libeth_xdp_tx_bulk *bq, u32 flags,
 	rcu_read_unlock();
 }
 
+/*
+ * Helpers to reduce boilerplate code in drivers.
+ *
+ * Typical driver Rx flow would be (excl. bulk and buff init, frag attach):
+ *
+ * LIBETH_XDP_DEFINE_START();
+ * LIBETH_XDP_DEFINE_FLUSH_TX(static driver_xdp_flush_tx, driver_xdp_tx_prep,
+ *			      driver_xdp_xmit);
+ * LIBETH_XDP_DEFINE_RUN(static driver_xdp_run, driver_xdp_run_prog,
+ *			 driver_xdp_flush_tx, driver_populate_skb);
+ * LIBETH_XDP_DEFINE_FINALIZE(static driver_xdp_finalize_rx,
+ *			      driver_xdp_flush_tx, driver_xdp_finalize_sq);
+ * LIBETH_XDP_DEFINE_END();
+ *
+ * This will build a set of 4 static functions. The compiler is free to decide
+ * whether to inline them.
+ * Then, in the NAPI polling function:
+ *
+ *	while (packets < budget) {
+ *		// ...
+ *		driver_xdp_run(xdp, &bq, napi, &rs, desc);
+ *	}
+ *	driver_xdp_finalize_rx(&bq);
+ */
+
+#define LIBETH_XDP_DEFINE_START()					      \
+	__diag_push();							      \
+	__diag_ignore(GCC, 8, "-Wold-style-declaration",		      \
+		      "Allow specifying \'static\' after the return type")
+
+/**
+ * LIBETH_XDP_DEFINE_TIMER - define a driver XDPSQ cleanup timer callback
+ * @name: name of the function to define
+ * @poll: Tx polling/completion function
+ */
+#define LIBETH_XDP_DEFINE_TIMER(name, poll)				      \
+void name(struct work_struct *work)					      \
+{									      \
+	libeth_xdpsq_run_timer(work, poll);				      \
+}
+
+/**
+ * LIBETH_XDP_DEFINE_FLUSH_TX - define a driver ``XDP_TX`` bulk flush function
+ * @name: name of the function to define
+ * @prep: driver callback to clean an XDPSQ
+ * @xmit: driver callback to write a HW Tx descriptor
+ */
+#define LIBETH_XDP_DEFINE_FLUSH_TX(name, prep, xmit)			      \
+	__LIBETH_XDP_DEFINE_FLUSH_TX(name, prep, xmit, xdp)
+
+#define __LIBETH_XDP_DEFINE_FLUSH_TX(name, prep, xmit, pfx)		      \
+bool name(struct libeth_xdp_tx_bulk *bq, u32 flags)			      \
+{									      \
+	return libeth_##pfx##_tx_flush_bulk(bq, flags, prep, xmit);	      \
+}
+
+/**
+ * LIBETH_XDP_DEFINE_FLUSH_XMIT - define a driver XDP xmit bulk flush function
+ * @name: name of the function to define
+ * @prep: driver callback to clean an XDPSQ
+ * @xmit: driver callback to write a HW Tx descriptor
+ */
+#define LIBETH_XDP_DEFINE_FLUSH_XMIT(name, prep, xmit)			      \
+bool name(struct libeth_xdp_tx_bulk *bq, u32 flags)			      \
+{									      \
+	return libeth_xdp_xmit_flush_bulk(bq, flags, prep, xmit);	      \
+}
+
+/**
+ * LIBETH_XDP_DEFINE_RUN_PROG - define a driver XDP program run function
+ * @name: name of the function to define
+ * @flush: driver callback to flush an ``XDP_TX`` bulk
+ */
+#define LIBETH_XDP_DEFINE_RUN_PROG(name, flush)				      \
+	bool __LIBETH_XDP_DEFINE_RUN_PROG(name, flush, xdp)
+
+#define __LIBETH_XDP_DEFINE_RUN_PROG(name, flush, pfx)			      \
+name(struct libeth_xdp_buff *xdp, struct libeth_xdp_tx_bulk *bq)	      \
+{									      \
+	return libeth_##pfx##_run_prog(xdp, bq, flush);			      \
+}
+
+/**
+ * LIBETH_XDP_DEFINE_RUN_PASS - define a driver buffer process + pass function
+ * @name: name of the function to define
+ * @run: driver callback to run XDP program (above)
+ * @populate: driver callback to fill an skb with HW descriptor info
+ */
+#define LIBETH_XDP_DEFINE_RUN_PASS(name, run, populate)			      \
+	void __LIBETH_XDP_DEFINE_RUN_PASS(name, run, populate, xdp)
+
+#define __LIBETH_XDP_DEFINE_RUN_PASS(name, run, populate, pfx)		      \
+name(struct libeth_xdp_buff *xdp, struct libeth_xdp_tx_bulk *bq,	      \
+     struct napi_struct *napi, struct libeth_rq_napi_stats *ss,		      \
+     const void *desc)							      \
+{									      \
+	return libeth_##pfx##_run_pass(xdp, bq, napi, ss, desc, run,	      \
+				       populate);			      \
+}
+
+/**
+ * LIBETH_XDP_DEFINE_RUN - define a driver buffer process, run + pass function
+ * @name: name of the function to define
+ * @run: name of the XDP prog run function to define
+ * @flush: driver callback to flush an ``XDP_TX`` bulk
+ * @populate: driver callback to fill an skb with HW descriptor info
+ */
+#define LIBETH_XDP_DEFINE_RUN(name, run, flush, populate)		      \
+	__LIBETH_XDP_DEFINE_RUN(name, run, flush, populate, XDP)
+
+#define __LIBETH_XDP_DEFINE_RUN(name, run, flush, populate, pfx)	      \
+	LIBETH_##pfx##_DEFINE_RUN_PROG(static run, flush);		      \
+	LIBETH_##pfx##_DEFINE_RUN_PASS(name, run, populate)
+
+/**
+ * LIBETH_XDP_DEFINE_FINALIZE - define a driver Rx NAPI poll finalize function
+ * @name: name of the function to define
+ * @flush: driver callback to flush an ``XDP_TX`` bulk
+ * @finalize: driver callback to finalize an XDPSQ and run the timer
+ */
+#define LIBETH_XDP_DEFINE_FINALIZE(name, flush, finalize)		      \
+	__LIBETH_XDP_DEFINE_FINALIZE(name, flush, finalize, xdp)
+
+#define __LIBETH_XDP_DEFINE_FINALIZE(name, flush, finalize, pfx)	      \
+void name(struct libeth_xdp_tx_bulk *bq)				      \
+{									      \
+	libeth_##pfx##_finalize_rx(bq, flush, finalize);		      \
+}
+
+#define LIBETH_XDP_DEFINE_END()		__diag_pop()
+
 /* Tx buffer completion */
 
 void libeth_xdp_return_buff_bulk(const struct skb_shared_info *sinfo,
-- 
cgit v1.2.3


From 576cc5c13d9ba53a1a24d9b34af2f939a87b7ce8 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Thu, 12 Jun 2025 18:02:29 +0200
Subject: libeth: xdp: add RSS hash hint and XDP features setup helpers

End the XDP section by adding helpers to setup XDP features, flipping
.ndo_xdp_xmit() support at runtime (in case when it's not always on),
and calculating the queue clean/refill threshold.

Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/libeth/xdp.c | 69 +++++++++++++++++++++++++
 include/net/libeth/xdp.h                | 90 +++++++++++++++++++++++++++++++++
 2 files changed, 159 insertions(+)

(limited to 'include')

diff --git a/drivers/net/ethernet/intel/libeth/xdp.c b/drivers/net/ethernet/intel/libeth/xdp.c
index 1607579d65bb..4eb0f3c6cdab 100644
--- a/drivers/net/ethernet/intel/libeth/xdp.c
+++ b/drivers/net/ethernet/intel/libeth/xdp.c
@@ -340,6 +340,75 @@ head:
 }
 EXPORT_SYMBOL_GPL(libeth_xdp_return_buff_bulk);
 
+/* Misc */
+
+/**
+ * libeth_xdp_queue_threshold - calculate XDP queue clean/refill threshold
+ * @count: number of descriptors in the queue
+ *
+ * The threshold is the limit at which RQs start to refill (when the number of
+ * empty buffers exceeds it) and SQs get cleaned up (when the number of free
+ * descriptors goes below it). To speed up hotpath processing, threshold is
+ * always pow-2, closest to 1/4 of the queue length.
+ * Don't call it on hotpath, calculate and cache the threshold during the
+ * queue initialization.
+ *
+ * Return: the calculated threshold.
+ */
+u32 libeth_xdp_queue_threshold(u32 count)
+{
+	u32 quarter, low, high;
+
+	if (likely(is_power_of_2(count)))
+		return count >> 2;
+
+	quarter = DIV_ROUND_CLOSEST(count, 4);
+	low = rounddown_pow_of_two(quarter);
+	high = roundup_pow_of_two(quarter);
+
+	return high - quarter <= quarter - low ? high : low;
+}
+EXPORT_SYMBOL_GPL(libeth_xdp_queue_threshold);
+
+/**
+ * __libeth_xdp_set_features - set XDP features for netdev
+ * @dev: &net_device to configure
+ * @xmo: XDP metadata ops (Rx hints)
+ *
+ * Set all the features libeth_xdp supports. Only the first argument is
+ * necessary.
+ * Use the non-underscored versions in drivers instead.
+ */
+void __libeth_xdp_set_features(struct net_device *dev,
+			       const struct xdp_metadata_ops *xmo)
+{
+	xdp_set_features_flag(dev,
+			      NETDEV_XDP_ACT_BASIC |
+			      NETDEV_XDP_ACT_REDIRECT |
+			      NETDEV_XDP_ACT_NDO_XMIT |
+			      NETDEV_XDP_ACT_RX_SG |
+			      NETDEV_XDP_ACT_NDO_XMIT_SG);
+	dev->xdp_metadata_ops = xmo;
+}
+EXPORT_SYMBOL_GPL(__libeth_xdp_set_features);
+
+/**
+ * libeth_xdp_set_redirect - toggle the XDP redirect feature
+ * @dev: &net_device to configure
+ * @enable: whether XDP is enabled
+ *
+ * Use this when XDPSQs are not always available to dynamically enable
+ * and disable redirect feature.
+ */
+void libeth_xdp_set_redirect(struct net_device *dev, bool enable)
+{
+	if (enable)
+		xdp_features_set_redirect_target(dev, true);
+	else
+		xdp_features_clear_redirect_target(dev);
+}
+EXPORT_SYMBOL_GPL(libeth_xdp_set_redirect);
+
 /* Module */
 
 static const struct libeth_xdp_ops xdp_ops __initconst = {
diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h
index 46a2ec3c3037..c36b2ca0d04c 100644
--- a/include/net/libeth/xdp.h
+++ b/include/net/libeth/xdp.h
@@ -1631,6 +1631,51 @@ void name(struct libeth_xdp_tx_bulk *bq)				      \
 
 #define LIBETH_XDP_DEFINE_END()		__diag_pop()
 
+/* XMO */
+
+/**
+ * libeth_xdp_buff_to_rq - get RQ pointer from an XDP buffer pointer
+ * @xdp: &libeth_xdp_buff corresponding to the queue
+ * @type: typeof() of the driver Rx queue structure
+ * @member: name of &xdp_rxq_info inside @type
+ *
+ * Often times, pointer to the RQ is needed when reading/filling metadata from
+ * HW descriptors. The helper can be used to quickly jump from an XDP buffer
+ * to the queue corresponding to its &xdp_rxq_info without introducing
+ * additional fields (&libeth_xdp_buff is precisely 1 cacheline long on x64).
+ */
+#define libeth_xdp_buff_to_rq(xdp, type, member)			      \
+	container_of_const((xdp)->base.rxq, type, member)
+
+/**
+ * libeth_xdpmo_rx_hash - convert &libeth_rx_pt to an XDP RSS hash metadata
+ * @hash: pointer to the variable to write the hash to
+ * @rss_type: pointer to the variable to write the hash type to
+ * @val: hash value from the HW descriptor
+ * @pt: libeth parsed packet type
+ *
+ * Handle zeroed/non-available hash and convert libeth parsed packet type to
+ * the corresponding XDP RSS hash type. To be called at the end of
+ * xdp_metadata_ops idpf_xdpmo::xmo_rx_hash() implementation.
+ * Note that if the driver doesn't use a constant packet type lookup table but
+ * generates it at runtime, it must call libeth_rx_pt_gen_hash_type(pt) to
+ * generate XDP RSS hash type for each packet type.
+ *
+ * Return: 0 on success, -ENODATA when the hash is not available.
+ */
+static inline int libeth_xdpmo_rx_hash(u32 *hash,
+				       enum xdp_rss_hash_type *rss_type,
+				       u32 val, struct libeth_rx_pt pt)
+{
+	if (unlikely(!val))
+		return -ENODATA;
+
+	*hash = val;
+	*rss_type = pt.hash_type;
+
+	return 0;
+}
+
 /* Tx buffer completion */
 
 void libeth_xdp_return_buff_bulk(const struct skb_shared_info *sinfo,
@@ -1697,4 +1742,49 @@ static inline void libeth_xdp_complete_tx(struct libeth_sqe *sqe,
 	__libeth_xdp_complete_tx(sqe, cp, libeth_xdp_return_buff_bulk);
 }
 
+/* Misc */
+
+u32 libeth_xdp_queue_threshold(u32 count);
+
+void __libeth_xdp_set_features(struct net_device *dev,
+			       const struct xdp_metadata_ops *xmo);
+void libeth_xdp_set_redirect(struct net_device *dev, bool enable);
+
+/**
+ * libeth_xdp_set_features - set XDP features for netdev
+ * @dev: &net_device to configure
+ * @...: optional params, see __libeth_xdp_set_features()
+ *
+ * Set all the features libeth_xdp supports, including .ndo_xdp_xmit(). That
+ * said, it should be used only when XDPSQs are always available regardless
+ * of whether an XDP prog is attached to @dev.
+ */
+#define libeth_xdp_set_features(dev, ...)				      \
+	CONCATENATE(__libeth_xdp_feat,					      \
+		    COUNT_ARGS(__VA_ARGS__))(dev, ##__VA_ARGS__)
+
+#define __libeth_xdp_feat0(dev)						      \
+	__libeth_xdp_set_features(dev, NULL)
+#define __libeth_xdp_feat1(dev, xmo)					      \
+	__libeth_xdp_set_features(dev, xmo)
+
+/**
+ * libeth_xdp_set_features_noredir - enable all libeth_xdp features w/o redir
+ * @dev: target &net_device
+ * @...: optional params, see __libeth_xdp_set_features()
+ *
+ * Enable everything except the .ndo_xdp_xmit() feature, use when XDPSQs are
+ * not available right after netdev registration.
+ */
+#define libeth_xdp_set_features_noredir(dev, ...)			      \
+	__libeth_xdp_set_features_noredir(dev, __UNIQUE_ID(dev_),	      \
+					  ##__VA_ARGS__)
+
+#define __libeth_xdp_set_features_noredir(dev, ud, ...) do {		      \
+	struct net_device *ud = (dev);					      \
+									      \
+	libeth_xdp_set_features(ud, ##__VA_ARGS__);			      \
+	libeth_xdp_set_redirect(ud, false);				      \
+} while (0)
+
 #endif /* __LIBETH_XDP_H */
-- 
cgit v1.2.3


From b3ad8450b4dc46c4ab0641f665068fd2a4d1adba Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Thu, 12 Jun 2025 18:02:30 +0200
Subject: libeth: xsk: add XSk XDP_TX sending helpers

Add XSk counterparts for XDP_TX buffer sending and completion.
The same base structures and functions are used from the libeth_xdp core,
adjusted for the fact that XSk Rx always operates on &xdp_buff_xsk for
both head and frags. And unlike regular Rx, here unlikely() is used
for frags, as the header split gives no benefits for XSk Rx, at
least for now.

Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/libeth/Kconfig  |   2 +-
 drivers/net/ethernet/intel/libeth/Makefile |   1 +
 drivers/net/ethernet/intel/libeth/priv.h   |   6 ++
 drivers/net/ethernet/intel/libeth/tx.c     |   5 +-
 drivers/net/ethernet/intel/libeth/xdp.c    |   7 +-
 drivers/net/ethernet/intel/libeth/xsk.c    |  34 +++++++
 include/net/libeth/tx.h                    |   6 ++
 include/net/libeth/xdp.h                   |  26 ++++-
 include/net/libeth/xsk.h                   | 148 +++++++++++++++++++++++++++++
 9 files changed, 226 insertions(+), 9 deletions(-)
 create mode 100644 drivers/net/ethernet/intel/libeth/xsk.c
 create mode 100644 include/net/libeth/xsk.h

(limited to 'include')

diff --git a/drivers/net/ethernet/intel/libeth/Kconfig b/drivers/net/ethernet/intel/libeth/Kconfig
index d8c4926574fb..2445b979c499 100644
--- a/drivers/net/ethernet/intel/libeth/Kconfig
+++ b/drivers/net/ethernet/intel/libeth/Kconfig
@@ -12,4 +12,4 @@ config LIBETH_XDP
 	tristate "Common XDP library (libeth_xdp)" if COMPILE_TEST
 	select LIBETH
 	help
-	  XDP helpers based on libeth hotpath management.
+	  XDP and XSk helpers based on libeth hotpath management.
diff --git a/drivers/net/ethernet/intel/libeth/Makefile b/drivers/net/ethernet/intel/libeth/Makefile
index 51669840ee06..350bc0b38bad 100644
--- a/drivers/net/ethernet/intel/libeth/Makefile
+++ b/drivers/net/ethernet/intel/libeth/Makefile
@@ -9,3 +9,4 @@ libeth-y			+= tx.o
 obj-$(CONFIG_LIBETH_XDP)	+= libeth_xdp.o
 
 libeth_xdp-y			+= xdp.o
+libeth_xdp-y			+= xsk.o
diff --git a/drivers/net/ethernet/intel/libeth/priv.h b/drivers/net/ethernet/intel/libeth/priv.h
index 1bd6e2d7a3e7..ebcb26f24401 100644
--- a/drivers/net/ethernet/intel/libeth/priv.h
+++ b/drivers/net/ethernet/intel/libeth/priv.h
@@ -8,12 +8,18 @@
 
 /* XDP */
 
+struct libeth_xdp_buff;
+struct libeth_xdp_tx_frame;
 struct skb_shared_info;
 struct xdp_frame_bulk;
 
+void libeth_xsk_tx_return_bulk(const struct libeth_xdp_tx_frame *bq,
+			       u32 count);
+
 struct libeth_xdp_ops {
 	void	(*bulk)(const struct skb_shared_info *sinfo,
 			struct xdp_frame_bulk *bq, bool frags);
+	void	(*xsk)(struct libeth_xdp_buff *xdp);
 };
 
 void libeth_attach_xdp(const struct libeth_xdp_ops *ops);
diff --git a/drivers/net/ethernet/intel/libeth/tx.c b/drivers/net/ethernet/intel/libeth/tx.c
index 227c841ab16a..e0167f43d2a8 100644
--- a/drivers/net/ethernet/intel/libeth/tx.c
+++ b/drivers/net/ethernet/intel/libeth/tx.c
@@ -10,6 +10,7 @@
 /* Tx buffer completion */
 
 DEFINE_STATIC_CALL_NULL(bulk, libeth_xdp_return_buff_bulk);
+DEFINE_STATIC_CALL_NULL(xsk, libeth_xsk_buff_free_slow);
 
 /**
  * libeth_tx_complete_any - perform Tx completion for one SQE of any type
@@ -23,7 +24,8 @@ DEFINE_STATIC_CALL_NULL(bulk, libeth_xdp_return_buff_bulk);
 void libeth_tx_complete_any(struct libeth_sqe *sqe, struct libeth_cq_pp *cp)
 {
 	if (sqe->type >= __LIBETH_SQE_XDP_START)
-		__libeth_xdp_complete_tx(sqe, cp, static_call(bulk));
+		__libeth_xdp_complete_tx(sqe, cp, static_call(bulk),
+					 static_call(xsk));
 	else
 		libeth_tx_complete(sqe, cp);
 }
@@ -34,5 +36,6 @@ EXPORT_SYMBOL_GPL(libeth_tx_complete_any);
 void libeth_attach_xdp(const struct libeth_xdp_ops *ops)
 {
 	static_call_update(bulk, ops ? ops->bulk : NULL);
+	static_call_update(xsk, ops ? ops->xsk : NULL);
 }
 EXPORT_SYMBOL_GPL(libeth_attach_xdp);
diff --git a/drivers/net/ethernet/intel/libeth/xdp.c b/drivers/net/ethernet/intel/libeth/xdp.c
index 4eb0f3c6cdab..bd334d314a1d 100644
--- a/drivers/net/ethernet/intel/libeth/xdp.c
+++ b/drivers/net/ethernet/intel/libeth/xdp.c
@@ -114,7 +114,7 @@ static void __cold libeth_trace_xdp_exception(const struct net_device *dev,
  * libeth_xdp_tx_exception - handle Tx exceptions of XDP frames
  * @bq: XDP Tx frame bulk
  * @sent: number of frames sent successfully (from this bulk)
- * @flags: internal libeth_xdp flags (.ndo_xdp_xmit etc.)
+ * @flags: internal libeth_xdp flags (XSk, .ndo_xdp_xmit etc.)
  *
  * Cold helper used by __libeth_xdp_tx_flush_bulk(), do not call directly.
  * Reports XDP Tx exceptions, frees the frames that won't be sent or adjust
@@ -136,7 +136,9 @@ void __cold libeth_xdp_tx_exception(struct libeth_xdp_tx_bulk *bq, u32 sent,
 		return;
 	}
 
-	if (!(flags & LIBETH_XDP_TX_NDO))
+	if (flags & LIBETH_XDP_TX_XSK)
+		libeth_xsk_tx_return_bulk(pos, left);
+	else if (!(flags & LIBETH_XDP_TX_NDO))
 		libeth_xdp_tx_return_bulk(pos, left);
 	else
 		libeth_xdp_xmit_return_bulk(pos, left, bq->dev);
@@ -413,6 +415,7 @@ EXPORT_SYMBOL_GPL(libeth_xdp_set_redirect);
 
 static const struct libeth_xdp_ops xdp_ops __initconst = {
 	.bulk	= libeth_xdp_return_buff_bulk,
+	.xsk	= libeth_xsk_buff_free_slow,
 };
 
 static int __init libeth_xdp_module_init(void)
diff --git a/drivers/net/ethernet/intel/libeth/xsk.c b/drivers/net/ethernet/intel/libeth/xsk.c
new file mode 100644
index 000000000000..fba6d7a025b0
--- /dev/null
+++ b/drivers/net/ethernet/intel/libeth/xsk.c
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2025 Intel Corporation */
+
+#define DEFAULT_SYMBOL_NAMESPACE	"LIBETH_XDP"
+
+#include <linux/export.h>
+
+#include <net/libeth/xsk.h>
+
+#include "priv.h"
+
+/* ``XDP_TX`` bulking */
+
+void __cold libeth_xsk_tx_return_bulk(const struct libeth_xdp_tx_frame *bq,
+				      u32 count)
+{
+	for (u32 i = 0; i < count; i++)
+		libeth_xsk_buff_free_slow(bq[i].xsk);
+}
+
+/* Rx polling path */
+
+/**
+ * libeth_xsk_buff_free_slow - free an XSk Rx buffer
+ * @xdp: buffer to free
+ *
+ * Slowpath version of xsk_buff_free() to be used on exceptions, cleanups etc.
+ * to avoid unwanted inlining.
+ */
+void libeth_xsk_buff_free_slow(struct libeth_xdp_buff *xdp)
+{
+	xsk_buff_free(&xdp->base);
+}
+EXPORT_SYMBOL_GPL(libeth_xsk_buff_free_slow);
diff --git a/include/net/libeth/tx.h b/include/net/libeth/tx.h
index 33b9bb22f6ac..44192bec86d7 100644
--- a/include/net/libeth/tx.h
+++ b/include/net/libeth/tx.h
@@ -21,6 +21,8 @@
  * @LIBETH_SQE_XDP_TX: &skb_shared_info, libeth_xdp_return_buff_bulk(), stats
  * @LIBETH_SQE_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame_bulk(), stats
  * @LIBETH_SQE_XDP_XMIT_FRAG: &xdp_frame frag, only unmap DMA
+ * @LIBETH_SQE_XSK_TX: &libeth_xdp_buff on XSk queue, xsk_buff_free(), stats
+ * @LIBETH_SQE_XSK_TX_FRAG: &libeth_xdp_buff frag on XSk queue, xsk_buff_free()
  */
 enum libeth_sqe_type {
 	LIBETH_SQE_EMPTY		= 0U,
@@ -33,6 +35,8 @@ enum libeth_sqe_type {
 	LIBETH_SQE_XDP_TX		= __LIBETH_SQE_XDP_START,
 	LIBETH_SQE_XDP_XMIT,
 	LIBETH_SQE_XDP_XMIT_FRAG,
+	LIBETH_SQE_XSK_TX,
+	LIBETH_SQE_XSK_TX_FRAG,
 };
 
 /**
@@ -43,6 +47,7 @@ enum libeth_sqe_type {
  * @skb: &sk_buff to consume
  * @sinfo: skb shared info of an XDP_TX frame
  * @xdpf: XDP frame from ::ndo_xdp_xmit()
+ * @xsk: XSk Rx frame from XDP_TX action
  * @dma: DMA address to unmap
  * @len: length of the mapped region to unmap
  * @nr_frags: number of frags in the frame this buffer belongs to
@@ -59,6 +64,7 @@ struct libeth_sqe {
 		struct sk_buff			*skb;
 		struct skb_shared_info		*sinfo;
 		struct xdp_frame		*xdpf;
+		struct libeth_xdp_buff		*xsk;
 	};
 
 	DEFINE_DMA_UNMAP_ADDR(dma);
diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h
index c36b2ca0d04c..ab907f36a35b 100644
--- a/include/net/libeth/xdp.h
+++ b/include/net/libeth/xdp.h
@@ -279,6 +279,7 @@ libeth_xdpsq_run_timer(struct work_struct *work,
  * @LIBETH_XDP_TX_BATCH: batch size for which the queue fill loop is unrolled
  * @LIBETH_XDP_TX_DROP: indicates the send function must drop frames not sent
  * @LIBETH_XDP_TX_NDO: whether the send function is called from .ndo_xdp_xmit()
+ * @LIBETH_XDP_TX_XSK: whether the function is called for ``XDP_TX`` for XSk
  */
 enum {
 	LIBETH_XDP_TX_BULK		= DEV_MAP_BULK_SIZE,
@@ -286,6 +287,7 @@ enum {
 
 	LIBETH_XDP_TX_DROP		= BIT(0),
 	LIBETH_XDP_TX_NDO		= BIT(1),
+	LIBETH_XDP_TX_XSK		= BIT(2),
 };
 
 /**
@@ -314,7 +316,8 @@ enum {
  * @frag: one (non-head) frag for ``XDP_TX``
  * @xdpf: &xdp_frame for the head frag for .ndo_xdp_xmit()
  * @dma: DMA address of the non-head frag for .ndo_xdp_xmit()
- * @len: frag length for .ndo_xdp_xmit()
+ * @xsk: ``XDP_TX`` for XSk, XDP buffer for any frag
+ * @len: frag length for XSk ``XDP_TX`` and .ndo_xdp_xmit()
  * @flags: Tx flags for the above
  * @opts: combined @len + @flags for the above for speed
  */
@@ -330,11 +333,13 @@ struct libeth_xdp_tx_frame {
 		/* ``XDP_TX`` frag */
 		skb_frag_t			frag;
 
-		/* .ndo_xdp_xmit() */
+		/* .ndo_xdp_xmit(), XSk ``XDP_TX`` */
 		struct {
 			union {
 				struct xdp_frame		*xdpf;
 				dma_addr_t			dma;
+
+				struct libeth_xdp_buff		*xsk;
 			};
 			union {
 				struct {
@@ -386,6 +391,7 @@ struct libeth_xdp_tx_bulk {
 
 /**
  * struct libeth_xdpsq - abstraction for an XDPSQ
+ * @pool: XSk buffer pool for XSk ``XDP_TX``
  * @sqes: array of Tx buffers from the actual queue struct
  * @descs: opaque pointer to the HW descriptor array
  * @ntu: pointer to the next free descriptor index
@@ -399,6 +405,7 @@ struct libeth_xdp_tx_bulk {
  * functions can access and modify driver-specific resources.
  */
 struct libeth_xdpsq {
+	struct xsk_buff_pool		*pool;
 	struct libeth_sqe		*sqes;
 	void				*descs;
 
@@ -697,7 +704,7 @@ void libeth_xdp_tx_exception(struct libeth_xdp_tx_bulk *bq, u32 sent,
 /**
  * __libeth_xdp_tx_flush_bulk - internal helper to flush one XDP Tx bulk
  * @bq: bulk to flush
- * @flags: XDP TX flags (.ndo_xdp_xmit() etc.)
+ * @flags: XDP TX flags (.ndo_xdp_xmit(), XSk etc.)
  * @prep: driver-specific callback to prepare the queue for sending
  * @fill: libeth_xdp callback to fill &libeth_sqe and &libeth_xdp_tx_desc
  * @xmit: driver callback to fill a HW descriptor
@@ -1680,12 +1687,14 @@ static inline int libeth_xdpmo_rx_hash(u32 *hash,
 
 void libeth_xdp_return_buff_bulk(const struct skb_shared_info *sinfo,
 				 struct xdp_frame_bulk *bq, bool frags);
+void libeth_xsk_buff_free_slow(struct libeth_xdp_buff *xdp);
 
 /**
  * __libeth_xdp_complete_tx - complete sent XDPSQE
  * @sqe: SQ element / Tx buffer to complete
  * @cp: Tx polling/completion params
  * @bulk: internal callback to bulk-free ``XDP_TX`` buffers
+ * @xsk: internal callback to free XSk ``XDP_TX`` buffers
  *
  * Use the non-underscored version in drivers instead. This one is shared
  * internally with libeth_tx_complete_any().
@@ -1694,7 +1703,8 @@ void libeth_xdp_return_buff_bulk(const struct skb_shared_info *sinfo,
  */
 static __always_inline void
 __libeth_xdp_complete_tx(struct libeth_sqe *sqe, struct libeth_cq_pp *cp,
-			 typeof(libeth_xdp_return_buff_bulk) bulk)
+			 typeof(libeth_xdp_return_buff_bulk) bulk,
+			 typeof(libeth_xsk_buff_free_slow) xsk)
 {
 	enum libeth_sqe_type type = sqe->type;
 
@@ -1717,6 +1727,10 @@ __libeth_xdp_complete_tx(struct libeth_sqe *sqe, struct libeth_cq_pp *cp,
 	case LIBETH_SQE_XDP_XMIT:
 		xdp_return_frame_bulk(sqe->xdpf, cp->bq);
 		break;
+	case LIBETH_SQE_XSK_TX:
+	case LIBETH_SQE_XSK_TX_FRAG:
+		xsk(sqe->xsk);
+		break;
 	default:
 		break;
 	}
@@ -1724,6 +1738,7 @@ __libeth_xdp_complete_tx(struct libeth_sqe *sqe, struct libeth_cq_pp *cp,
 	switch (type) {
 	case LIBETH_SQE_XDP_TX:
 	case LIBETH_SQE_XDP_XMIT:
+	case LIBETH_SQE_XSK_TX:
 		cp->xdp_tx -= sqe->nr_frags;
 
 		cp->xss->packets++;
@@ -1739,7 +1754,8 @@ __libeth_xdp_complete_tx(struct libeth_sqe *sqe, struct libeth_cq_pp *cp,
 static inline void libeth_xdp_complete_tx(struct libeth_sqe *sqe,
 					  struct libeth_cq_pp *cp)
 {
-	__libeth_xdp_complete_tx(sqe, cp, libeth_xdp_return_buff_bulk);
+	__libeth_xdp_complete_tx(sqe, cp, libeth_xdp_return_buff_bulk,
+				 libeth_xsk_buff_free_slow);
 }
 
 /* Misc */
diff --git a/include/net/libeth/xsk.h b/include/net/libeth/xsk.h
new file mode 100644
index 000000000000..af69b46fa7e4
--- /dev/null
+++ b/include/net/libeth/xsk.h
@@ -0,0 +1,148 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (C) 2025 Intel Corporation */
+
+#ifndef __LIBETH_XSK_H
+#define __LIBETH_XSK_H
+
+#include <net/libeth/xdp.h>
+#include <net/xdp_sock_drv.h>
+
+/* ``XDP_TX`` bulking */
+
+/**
+ * libeth_xsk_tx_queue_head - internal helper for queueing XSk ``XDP_TX`` head
+ * @bq: XDP Tx bulk to queue the head frag to
+ * @xdp: XSk buffer with the head to queue
+ *
+ * Return: false if it's the only frag of the frame, true if it's an S/G frame.
+ */
+static inline bool libeth_xsk_tx_queue_head(struct libeth_xdp_tx_bulk *bq,
+					    struct libeth_xdp_buff *xdp)
+{
+	bq->bulk[bq->count++] = (typeof(*bq->bulk)){
+		.xsk	= xdp,
+		.len	= xdp->base.data_end - xdp->data,
+		.flags	= LIBETH_XDP_TX_FIRST,
+	};
+
+	if (likely(!xdp_buff_has_frags(&xdp->base)))
+		return false;
+
+	bq->bulk[bq->count - 1].flags |= LIBETH_XDP_TX_MULTI;
+
+	return true;
+}
+
+/**
+ * libeth_xsk_tx_queue_frag - internal helper for queueing XSk ``XDP_TX`` frag
+ * @bq: XDP Tx bulk to queue the frag to
+ * @frag: XSk frag to queue
+ */
+static inline void libeth_xsk_tx_queue_frag(struct libeth_xdp_tx_bulk *bq,
+					    struct libeth_xdp_buff *frag)
+{
+	bq->bulk[bq->count++] = (typeof(*bq->bulk)){
+		.xsk	= frag,
+		.len	= frag->base.data_end - frag->data,
+	};
+}
+
+/**
+ * libeth_xsk_tx_queue_bulk - internal helper for queueing XSk ``XDP_TX`` frame
+ * @bq: XDP Tx bulk to queue the frame to
+ * @xdp: XSk buffer to queue
+ * @flush_bulk: driver callback to flush the bulk to the HW queue
+ *
+ * Return: true on success, false on flush error.
+ */
+static __always_inline bool
+libeth_xsk_tx_queue_bulk(struct libeth_xdp_tx_bulk *bq,
+			 struct libeth_xdp_buff *xdp,
+			 bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq,
+					    u32 flags))
+{
+	bool ret = true;
+
+	if (unlikely(bq->count == LIBETH_XDP_TX_BULK) &&
+	    unlikely(!flush_bulk(bq, LIBETH_XDP_TX_XSK))) {
+		libeth_xsk_buff_free_slow(xdp);
+		return false;
+	}
+
+	if (!libeth_xsk_tx_queue_head(bq, xdp))
+		goto out;
+
+	for (const struct libeth_xdp_buff *head = xdp; ; ) {
+		xdp = container_of(xsk_buff_get_frag(&head->base),
+				   typeof(*xdp), base);
+		if (!xdp)
+			break;
+
+		if (unlikely(bq->count == LIBETH_XDP_TX_BULK) &&
+		    unlikely(!flush_bulk(bq, LIBETH_XDP_TX_XSK))) {
+			ret = false;
+			break;
+		}
+
+		libeth_xsk_tx_queue_frag(bq, xdp);
+	}
+
+out:
+	bq->bulk[bq->count - 1].flags |= LIBETH_XDP_TX_LAST;
+
+	return ret;
+}
+
+/**
+ * libeth_xsk_tx_fill_buf - internal helper to fill XSk ``XDP_TX`` &libeth_sqe
+ * @frm: XDP Tx frame from the bulk
+ * @i: index on the HW queue
+ * @sq: XDPSQ abstraction for the queue
+ * @priv: private data
+ *
+ * Return: XDP Tx descriptor with the synced DMA and other info to pass to
+ * the driver callback.
+ */
+static inline struct libeth_xdp_tx_desc
+libeth_xsk_tx_fill_buf(struct libeth_xdp_tx_frame frm, u32 i,
+		       const struct libeth_xdpsq *sq, u64 priv)
+{
+	struct libeth_xdp_buff *xdp = frm.xsk;
+	struct libeth_xdp_tx_desc desc = {
+		.addr	= xsk_buff_xdp_get_dma(&xdp->base),
+		.opts	= frm.opts,
+	};
+	struct libeth_sqe *sqe;
+
+	xsk_buff_raw_dma_sync_for_device(sq->pool, desc.addr, desc.len);
+
+	sqe = &sq->sqes[i];
+	sqe->xsk = xdp;
+
+	if (!(desc.flags & LIBETH_XDP_TX_FIRST)) {
+		sqe->type = LIBETH_SQE_XSK_TX_FRAG;
+		return desc;
+	}
+
+	sqe->type = LIBETH_SQE_XSK_TX;
+	libeth_xdp_tx_fill_stats(sqe, &desc,
+				 xdp_get_shared_info_from_buff(&xdp->base));
+
+	return desc;
+}
+
+/**
+ * libeth_xsk_tx_flush_bulk - wrapper to define flush of XSk ``XDP_TX`` bulk
+ * @bq: bulk to flush
+ * @flags: Tx flags, see __libeth_xdp_tx_flush_bulk()
+ * @prep: driver callback to prepare the queue
+ * @xmit: driver callback to fill a HW descriptor
+ *
+ * Use via LIBETH_XSK_DEFINE_FLUSH_TX() to define an XSk ``XDP_TX`` driver
+ * callback.
+ */
+#define libeth_xsk_tx_flush_bulk(bq, flags, prep, xmit)			     \
+	__libeth_xdp_tx_flush_bulk(bq, (flags) | LIBETH_XDP_TX_XSK, prep,    \
+				   libeth_xsk_tx_fill_buf, xmit)
+
+#endif /* __LIBETH_XSK_H */
-- 
cgit v1.2.3


From 40e846d122df9b299e700ec86d01ef647fc0b09f Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Thu, 12 Jun 2025 18:02:31 +0200
Subject: libeth: xsk: add XSk xmit functions

Reuse core sending functions to send XSk xmit frames.
Both metadata and no metadata pools/driver are supported. libeth_xdp
also provides generic XSk metadata ops, currently with the checksum
offload only and for cases when HW doesn't require supplying L3/L4
checksum offsets. Drivers are free to pass their own ops.
&libeth_xdp_tx_bulk is not used here as it would be redundant;
pool->tx_descs are accessed directly.
Fake "libeth_xsktmo" is needed to hide implementation details from the
drivers when they want to use the generic ops: the original struct is
defined in the same file where dev->xsk_tx_metadata_ops gets set to
avoid duplication of slowpath; at the same time; XSk xmit functions
use local "fast" copy to inline XMO callbacks.
Tx descriptor filling loop is unrolled by 8.

Suggested-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com> # optimizations
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/libeth/priv.h |   2 +
 drivers/net/ethernet/intel/libeth/xdp.c  |  14 ++-
 drivers/net/ethernet/intel/libeth/xsk.c  |   6 ++
 include/net/libeth/tx.h                  |   4 +-
 include/net/libeth/xdp.h                 |  73 +++++++++++---
 include/net/libeth/xsk.h                 | 166 +++++++++++++++++++++++++++++++
 6 files changed, 248 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/intel/libeth/priv.h b/drivers/net/ethernet/intel/libeth/priv.h
index ebcb26f24401..03e74382b2cb 100644
--- a/drivers/net/ethernet/intel/libeth/priv.h
+++ b/drivers/net/ethernet/intel/libeth/priv.h
@@ -13,6 +13,8 @@ struct libeth_xdp_tx_frame;
 struct skb_shared_info;
 struct xdp_frame_bulk;
 
+extern const struct xsk_tx_metadata_ops libeth_xsktmo_slow;
+
 void libeth_xsk_tx_return_bulk(const struct libeth_xdp_tx_frame *bq,
 			       u32 count);
 
diff --git a/drivers/net/ethernet/intel/libeth/xdp.c b/drivers/net/ethernet/intel/libeth/xdp.c
index bd334d314a1d..b5fb2ce92da8 100644
--- a/drivers/net/ethernet/intel/libeth/xdp.c
+++ b/drivers/net/ethernet/intel/libeth/xdp.c
@@ -376,21 +376,31 @@ EXPORT_SYMBOL_GPL(libeth_xdp_queue_threshold);
  * __libeth_xdp_set_features - set XDP features for netdev
  * @dev: &net_device to configure
  * @xmo: XDP metadata ops (Rx hints)
+ * @zc_segs: maximum number of S/G frags the HW can transmit
+ * @tmo: XSk Tx metadata ops (Tx hints)
  *
  * Set all the features libeth_xdp supports. Only the first argument is
- * necessary.
+ * necessary; without the third one (zero), XSk support won't be advertised.
  * Use the non-underscored versions in drivers instead.
  */
 void __libeth_xdp_set_features(struct net_device *dev,
-			       const struct xdp_metadata_ops *xmo)
+			       const struct xdp_metadata_ops *xmo,
+			       u32 zc_segs,
+			       const struct xsk_tx_metadata_ops *tmo)
 {
 	xdp_set_features_flag(dev,
 			      NETDEV_XDP_ACT_BASIC |
 			      NETDEV_XDP_ACT_REDIRECT |
 			      NETDEV_XDP_ACT_NDO_XMIT |
+			      (zc_segs ? NETDEV_XDP_ACT_XSK_ZEROCOPY : 0) |
 			      NETDEV_XDP_ACT_RX_SG |
 			      NETDEV_XDP_ACT_NDO_XMIT_SG);
 	dev->xdp_metadata_ops = xmo;
+
+	tmo = tmo == libeth_xsktmo ? &libeth_xsktmo_slow : tmo;
+
+	dev->xdp_zc_max_segs = zc_segs ? : 1;
+	dev->xsk_tx_metadata_ops = zc_segs ? tmo : NULL;
 }
 EXPORT_SYMBOL_GPL(__libeth_xdp_set_features);
 
diff --git a/drivers/net/ethernet/intel/libeth/xsk.c b/drivers/net/ethernet/intel/libeth/xsk.c
index fba6d7a025b0..f09e1940183b 100644
--- a/drivers/net/ethernet/intel/libeth/xsk.c
+++ b/drivers/net/ethernet/intel/libeth/xsk.c
@@ -18,6 +18,12 @@ void __cold libeth_xsk_tx_return_bulk(const struct libeth_xdp_tx_frame *bq,
 		libeth_xsk_buff_free_slow(bq[i].xsk);
 }
 
+/* XSk TMO */
+
+const struct xsk_tx_metadata_ops libeth_xsktmo_slow = {
+	.tmo_request_checksum		= libeth_xsktmo_req_csum,
+};
+
 /* Rx polling path */
 
 /**
diff --git a/include/net/libeth/tx.h b/include/net/libeth/tx.h
index 44192bec86d7..c3db5c6f1641 100644
--- a/include/net/libeth/tx.h
+++ b/include/net/libeth/tx.h
@@ -12,7 +12,7 @@
 
 /**
  * enum libeth_sqe_type - type of &libeth_sqe to act on Tx completion
- * @LIBETH_SQE_EMPTY: unused/empty OR XDP_TX frag, no action required
+ * @LIBETH_SQE_EMPTY: unused/empty OR XDP_TX/XSk frame, no action required
  * @LIBETH_SQE_CTX: context descriptor with empty SQE, no action required
  * @LIBETH_SQE_SLAB: kmalloc-allocated buffer, unmap and kfree()
  * @LIBETH_SQE_FRAG: mapped skb frag, only unmap DMA
@@ -93,7 +93,7 @@ struct libeth_sqe {
  * @bq: XDP frame bulk to combine return operations
  * @ss: onstack NAPI stats to fill
  * @xss: onstack XDPSQ NAPI stats to fill
- * @xdp_tx: number of XDP frames processed
+ * @xdp_tx: number of XDP-not-XSk frames processed
  * @napi: whether it's called from the NAPI context
  *
  * libeth uses this structure to access objects needed for performing full
diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h
index ab907f36a35b..c3655458047d 100644
--- a/include/net/libeth/xdp.h
+++ b/include/net/libeth/xdp.h
@@ -293,6 +293,8 @@ enum {
 /**
  * enum - &libeth_xdp_tx_frame and &libeth_xdp_tx_desc flags
  * @LIBETH_XDP_TX_LEN: only for ``XDP_TX``, [15:0] of ::len_fl is actual length
+ * @LIBETH_XDP_TX_CSUM: for XSk xmit, enable checksum offload
+ * @LIBETH_XDP_TX_XSKMD: for XSk xmit, mask of the metadata bits
  * @LIBETH_XDP_TX_FIRST: indicates the frag is the first one of the frame
  * @LIBETH_XDP_TX_LAST: whether the frag is the last one of the frame
  * @LIBETH_XDP_TX_MULTI: whether the frame contains several frags
@@ -301,6 +303,9 @@ enum {
 enum {
 	LIBETH_XDP_TX_LEN		= GENMASK(15, 0),
 
+	LIBETH_XDP_TX_CSUM		= XDP_TXMD_FLAGS_CHECKSUM,
+	LIBETH_XDP_TX_XSKMD		= LIBETH_XDP_TX_LEN,
+
 	LIBETH_XDP_TX_FIRST		= BIT(16),
 	LIBETH_XDP_TX_LAST		= BIT(17),
 	LIBETH_XDP_TX_MULTI		= BIT(18),
@@ -320,6 +325,7 @@ enum {
  * @len: frag length for XSk ``XDP_TX`` and .ndo_xdp_xmit()
  * @flags: Tx flags for the above
  * @opts: combined @len + @flags for the above for speed
+ * @desc: XSk xmit descriptor for direct casting
  */
 struct libeth_xdp_tx_frame {
 	union {
@@ -349,10 +355,14 @@ struct libeth_xdp_tx_frame {
 				aligned_u64			opts;
 			};
 		};
+
+		/* XSk xmit */
+		struct xdp_desc			desc;
 	};
-} __aligned_largest;
+} __aligned(sizeof(struct xdp_desc));
 static_assert(offsetof(struct libeth_xdp_tx_frame, frag.len) ==
 	      offsetof(struct libeth_xdp_tx_frame, len_fl));
+static_assert(sizeof(struct libeth_xdp_tx_frame) == sizeof(struct xdp_desc));
 
 /**
  * struct libeth_xdp_tx_bulk - XDP Tx frame bulk for bulk sending
@@ -363,10 +373,13 @@ static_assert(offsetof(struct libeth_xdp_tx_frame, frag.len) ==
  * @count: current number of frames in @bulk
  * @bulk: array of queued frames for bulk Tx
  *
- * All XDP Tx operations queue each frame to the bulk first and flush it
- * when @count reaches the array end. Bulk is always placed on the stack
- * for performance. One bulk element contains all the data necessary
+ * All XDP Tx operations except XSk xmit queue each frame to the bulk first
+ * and flush it when @count reaches the array end. Bulk is always placed on
+ * the stack for performance. One bulk element contains all the data necessary
  * for sending a frame and then freeing it on completion.
+ * For XSk xmit, Tx descriptor array from &xsk_buff_pool is cast directly
+ * to &libeth_xdp_tx_frame as they are compatible and the bulk structure is
+ * not used.
  */
 struct libeth_xdp_tx_bulk {
 	const struct bpf_prog		*prog;
@@ -391,13 +404,13 @@ struct libeth_xdp_tx_bulk {
 
 /**
  * struct libeth_xdpsq - abstraction for an XDPSQ
- * @pool: XSk buffer pool for XSk ``XDP_TX``
+ * @pool: XSk buffer pool for XSk ``XDP_TX`` and xmit
  * @sqes: array of Tx buffers from the actual queue struct
  * @descs: opaque pointer to the HW descriptor array
  * @ntu: pointer to the next free descriptor index
  * @count: number of descriptors on that queue
  * @pending: pointer to the number of sent-not-completed descs on that queue
- * @xdp_tx: pointer to the above
+ * @xdp_tx: pointer to the above, but only for non-XSk-xmit frames
  * @lock: corresponding XDPSQ lock
  *
  * Abstraction for driver-independent implementation of Tx. Placed on the stack
@@ -438,6 +451,30 @@ struct libeth_xdp_tx_desc {
 	};
 } __aligned_largest;
 
+/**
+ * libeth_xdp_ptr_to_priv - convert pointer to a libeth_xdp u64 priv
+ * @ptr: pointer to convert
+ *
+ * The main sending function passes private data as the largest scalar, u64.
+ * Use this helper when you want to pass a pointer there.
+ */
+#define libeth_xdp_ptr_to_priv(ptr) ({					      \
+	typecheck_pointer(ptr);						      \
+	((u64)(uintptr_t)(ptr));					      \
+})
+/**
+ * libeth_xdp_priv_to_ptr - convert libeth_xdp u64 priv to a pointer
+ * @priv: private data to convert
+ *
+ * The main sending function passes private data as the largest scalar, u64.
+ * Use this helper when your callback takes this u64 and you want to convert
+ * it back to a pointer.
+ */
+#define libeth_xdp_priv_to_ptr(priv) ({					      \
+	static_assert(__same_type(priv, u64));				      \
+	((const void *)(uintptr_t)(priv));				      \
+})
+
 /**
  * libeth_xdp_tx_xmit_bulk - main XDP Tx function
  * @bulk: array of frames to send
@@ -450,10 +487,11 @@ struct libeth_xdp_tx_desc {
  * @xmit: callback for filling a HW descriptor with the frame info
  *
  * Internal abstraction for placing @n XDP Tx frames on the HW XDPSQ. Used for
- * all types of frames.
+ * all types of frames: ``XDP_TX``, .ndo_xdp_xmit(), XSk ``XDP_TX``, and XSk
+ * xmit.
  * @prep must lock the queue as this function releases it at the end. @unroll
- * greatly increases the object code size, but also greatly increases
- * performance.
+ * greatly increases the object code size, but also greatly increases XSk xmit
+ * performance; for other types of frames, it's not enabled.
  * The compilers inline all those onstack abstractions to direct data accesses.
  *
  * Return: number of frames actually placed on the queue, <= @n. The function
@@ -709,7 +747,8 @@ void libeth_xdp_tx_exception(struct libeth_xdp_tx_bulk *bq, u32 sent,
  * @fill: libeth_xdp callback to fill &libeth_sqe and &libeth_xdp_tx_desc
  * @xmit: driver callback to fill a HW descriptor
  *
- * Internal abstraction to create bulk flush functions for drivers.
+ * Internal abstraction to create bulk flush functions for drivers. Used for
+ * everything except XSk xmit.
  *
  * Return: true if anything was sent, false otherwise.
  */
@@ -1763,7 +1802,9 @@ static inline void libeth_xdp_complete_tx(struct libeth_sqe *sqe,
 u32 libeth_xdp_queue_threshold(u32 count);
 
 void __libeth_xdp_set_features(struct net_device *dev,
-			       const struct xdp_metadata_ops *xmo);
+			       const struct xdp_metadata_ops *xmo,
+			       u32 zc_segs,
+			       const struct xsk_tx_metadata_ops *tmo);
 void libeth_xdp_set_redirect(struct net_device *dev, bool enable);
 
 /**
@@ -1780,9 +1821,13 @@ void libeth_xdp_set_redirect(struct net_device *dev, bool enable);
 		    COUNT_ARGS(__VA_ARGS__))(dev, ##__VA_ARGS__)
 
 #define __libeth_xdp_feat0(dev)						      \
-	__libeth_xdp_set_features(dev, NULL)
+	__libeth_xdp_set_features(dev, NULL, 0, NULL)
 #define __libeth_xdp_feat1(dev, xmo)					      \
-	__libeth_xdp_set_features(dev, xmo)
+	__libeth_xdp_set_features(dev, xmo, 0, NULL)
+#define __libeth_xdp_feat2(dev, xmo, zc_segs)				      \
+	__libeth_xdp_set_features(dev, xmo, zc_segs, NULL)
+#define __libeth_xdp_feat3(dev, xmo, zc_segs, tmo)			      \
+	__libeth_xdp_set_features(dev, xmo, zc_segs, tmo)
 
 /**
  * libeth_xdp_set_features_noredir - enable all libeth_xdp features w/o redir
@@ -1803,4 +1848,6 @@ void libeth_xdp_set_redirect(struct net_device *dev, bool enable);
 	libeth_xdp_set_redirect(ud, false);				      \
 } while (0)
 
+#define libeth_xsktmo			((const void *)GOLDEN_RATIO_PRIME)
+
 #endif /* __LIBETH_XDP_H */
diff --git a/include/net/libeth/xsk.h b/include/net/libeth/xsk.h
index af69b46fa7e4..16ca195981fe 100644
--- a/include/net/libeth/xsk.h
+++ b/include/net/libeth/xsk.h
@@ -7,6 +7,11 @@
 #include <net/libeth/xdp.h>
 #include <net/xdp_sock_drv.h>
 
+/* ``XDP_TXMD_FLAGS_VALID`` is defined only under ``CONFIG_XDP_SOCKETS`` */
+#ifdef XDP_TXMD_FLAGS_VALID
+static_assert(XDP_TXMD_FLAGS_VALID <= LIBETH_XDP_TX_XSKMD);
+#endif
+
 /* ``XDP_TX`` bulking */
 
 /**
@@ -145,4 +150,165 @@ libeth_xsk_tx_fill_buf(struct libeth_xdp_tx_frame frm, u32 i,
 	__libeth_xdp_tx_flush_bulk(bq, (flags) | LIBETH_XDP_TX_XSK, prep,    \
 				   libeth_xsk_tx_fill_buf, xmit)
 
+/* XSk TMO */
+
+/**
+ * libeth_xsktmo_req_csum - XSk Tx metadata op to request checksum offload
+ * @csum_start: unused
+ * @csum_offset: unused
+ * @priv: &libeth_xdp_tx_desc from the filling helper
+ *
+ * Generic implementation of ::tmo_request_checksum. Works only when HW doesn't
+ * require filling checksum offsets and other parameters beside the checksum
+ * request bit.
+ * Consider using within @libeth_xsktmo unless the driver requires HW-specific
+ * callbacks.
+ */
+static inline void libeth_xsktmo_req_csum(u16 csum_start, u16 csum_offset,
+					  void *priv)
+{
+	((struct libeth_xdp_tx_desc *)priv)->flags |= LIBETH_XDP_TX_CSUM;
+}
+
+/* Only to inline the callbacks below, use @libeth_xsktmo in drivers instead */
+static const struct xsk_tx_metadata_ops __libeth_xsktmo = {
+	.tmo_request_checksum	= libeth_xsktmo_req_csum,
+};
+
+/**
+ * __libeth_xsk_xmit_fill_buf_md - internal helper to prepare XSk xmit w/meta
+ * @xdesc: &xdp_desc from the XSk buffer pool
+ * @sq: XDPSQ abstraction for the queue
+ * @priv: XSk Tx metadata ops
+ *
+ * Same as __libeth_xsk_xmit_fill_buf(), but requests metadata pointer and
+ * fills additional fields in &libeth_xdp_tx_desc to ask for metadata offload.
+ *
+ * Return: XDP Tx descriptor with the DMA, metadata request bits, and other
+ * info to pass to the driver callback.
+ */
+static __always_inline struct libeth_xdp_tx_desc
+__libeth_xsk_xmit_fill_buf_md(const struct xdp_desc *xdesc,
+			      const struct libeth_xdpsq *sq,
+			      u64 priv)
+{
+	const struct xsk_tx_metadata_ops *tmo = libeth_xdp_priv_to_ptr(priv);
+	struct libeth_xdp_tx_desc desc;
+	struct xdp_desc_ctx ctx;
+
+	ctx = xsk_buff_raw_get_ctx(sq->pool, xdesc->addr);
+	desc = (typeof(desc)){
+		.addr	= ctx.dma,
+		.len	= xdesc->len,
+	};
+
+	BUILD_BUG_ON(!__builtin_constant_p(tmo == libeth_xsktmo));
+	tmo = tmo == libeth_xsktmo ? &__libeth_xsktmo : tmo;
+
+	xsk_tx_metadata_request(ctx.meta, tmo, &desc);
+
+	return desc;
+}
+
+/* XSk xmit implementation */
+
+/**
+ * __libeth_xsk_xmit_fill_buf - internal helper to prepare XSk xmit w/o meta
+ * @xdesc: &xdp_desc from the XSk buffer pool
+ * @sq: XDPSQ abstraction for the queue
+ *
+ * Return: XDP Tx descriptor with the DMA and other info to pass to
+ * the driver callback.
+ */
+static inline struct libeth_xdp_tx_desc
+__libeth_xsk_xmit_fill_buf(const struct xdp_desc *xdesc,
+			   const struct libeth_xdpsq *sq)
+{
+	return (struct libeth_xdp_tx_desc){
+		.addr	= xsk_buff_raw_get_dma(sq->pool, xdesc->addr),
+		.len	= xdesc->len,
+	};
+}
+
+/**
+ * libeth_xsk_xmit_fill_buf - internal helper to prepare an XSk xmit
+ * @frm: &xdp_desc from the XSk buffer pool
+ * @i: index on the HW queue
+ * @sq: XDPSQ abstraction for the queue
+ * @priv: XSk Tx metadata ops
+ *
+ * Depending on the metadata ops presence (determined at compile time), calls
+ * the quickest helper to build a libeth XDP Tx descriptor.
+ *
+ * Return: XDP Tx descriptor with the synced DMA, metadata request bits,
+ * and other info to pass to the driver callback.
+ */
+static __always_inline struct libeth_xdp_tx_desc
+libeth_xsk_xmit_fill_buf(struct libeth_xdp_tx_frame frm, u32 i,
+			 const struct libeth_xdpsq *sq, u64 priv)
+{
+	struct libeth_xdp_tx_desc desc;
+
+	if (priv)
+		desc = __libeth_xsk_xmit_fill_buf_md(&frm.desc, sq, priv);
+	else
+		desc = __libeth_xsk_xmit_fill_buf(&frm.desc, sq);
+
+	desc.flags |= xsk_is_eop_desc(&frm.desc) ? LIBETH_XDP_TX_LAST : 0;
+
+	xsk_buff_raw_dma_sync_for_device(sq->pool, desc.addr, desc.len);
+
+	return desc;
+}
+
+/**
+ * libeth_xsk_xmit_do_bulk - send XSk xmit frames
+ * @pool: XSk buffer pool containing the frames to send
+ * @xdpsq: opaque pointer to driver's XDPSQ struct
+ * @budget: maximum number of frames that can be sent
+ * @tmo: optional XSk Tx metadata ops
+ * @prep: driver callback to build a &libeth_xdpsq
+ * @xmit: driver callback to put frames to a HW queue
+ * @finalize: driver callback to start a transmission
+ *
+ * Implements generic XSk xmit. Always turns on XSk Tx wakeup as it's assumed
+ * lazy cleaning is used and interrupts are disabled for the queue.
+ * HW descriptor filling is unrolled by ``LIBETH_XDP_TX_BATCH`` to optimize
+ * writes.
+ * Note that unlike other XDP Tx ops, the queue must be locked and cleaned
+ * prior to calling this function to already know available @budget.
+ * @prep must only build a &libeth_xdpsq and return ``U32_MAX``.
+ *
+ * Return: false if @budget was exhausted, true otherwise.
+ */
+static __always_inline bool
+libeth_xsk_xmit_do_bulk(struct xsk_buff_pool *pool, void *xdpsq, u32 budget,
+			const struct xsk_tx_metadata_ops *tmo,
+			u32 (*prep)(void *xdpsq, struct libeth_xdpsq *sq),
+			void (*xmit)(struct libeth_xdp_tx_desc desc, u32 i,
+				     const struct libeth_xdpsq *sq, u64 priv),
+			void (*finalize)(void *xdpsq, bool sent, bool flush))
+{
+	const struct libeth_xdp_tx_frame *bulk;
+	bool wake;
+	u32 n;
+
+	wake = xsk_uses_need_wakeup(pool);
+	if (wake)
+		xsk_clear_tx_need_wakeup(pool);
+
+	n = xsk_tx_peek_release_desc_batch(pool, budget);
+	bulk = container_of(&pool->tx_descs[0], typeof(*bulk), desc);
+
+	libeth_xdp_tx_xmit_bulk(bulk, xdpsq, n, true,
+				libeth_xdp_ptr_to_priv(tmo), prep,
+				libeth_xsk_xmit_fill_buf, xmit);
+	finalize(xdpsq, n, true);
+
+	if (wake)
+		xsk_set_tx_need_wakeup(pool);
+
+	return n < budget;
+}
+
 #endif /* __LIBETH_XSK_H */
-- 
cgit v1.2.3


From 5495c58c65aa3d650cccaa19dc59115b9a0069a5 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Thu, 12 Jun 2025 18:02:32 +0200
Subject: libeth: xsk: add XSk Rx processing support

Add XSk counterparts for preparing XSk &libeth_xdp_buff (adding head and
frags), running the program, and handling the verdict, inc. XDP_PASS.
Shortcuts in comparison with regular Rx: frags and all verdicts except
XDP_REDIRECT are under unlikely() and out of line; no checks for XDP
program presence as it's always true for XSk.

Suggested-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com> # optimizations
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/libeth/priv.h |   3 +
 drivers/net/ethernet/intel/libeth/xdp.c  |   6 +-
 drivers/net/ethernet/intel/libeth/xsk.c  | 107 ++++++++++++
 include/net/libeth/xdp.h                 |  17 +-
 include/net/libeth/xsk.h                 | 273 +++++++++++++++++++++++++++++++
 5 files changed, 398 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/intel/libeth/priv.h b/drivers/net/ethernet/intel/libeth/priv.h
index 03e74382b2cb..9b811d31015c 100644
--- a/drivers/net/ethernet/intel/libeth/priv.h
+++ b/drivers/net/ethernet/intel/libeth/priv.h
@@ -8,6 +8,7 @@
 
 /* XDP */
 
+enum xdp_action;
 struct libeth_xdp_buff;
 struct libeth_xdp_tx_frame;
 struct skb_shared_info;
@@ -17,6 +18,8 @@ extern const struct xsk_tx_metadata_ops libeth_xsktmo_slow;
 
 void libeth_xsk_tx_return_bulk(const struct libeth_xdp_tx_frame *bq,
 			       u32 count);
+u32 libeth_xsk_prog_exception(struct libeth_xdp_buff *xdp, enum xdp_action act,
+			      int ret);
 
 struct libeth_xdp_ops {
 	void	(*bulk)(const struct skb_shared_info *sinfo,
diff --git a/drivers/net/ethernet/intel/libeth/xdp.c b/drivers/net/ethernet/intel/libeth/xdp.c
index b5fb2ce92da8..d4ac027d9584 100644
--- a/drivers/net/ethernet/intel/libeth/xdp.c
+++ b/drivers/net/ethernet/intel/libeth/xdp.c
@@ -286,7 +286,8 @@ EXPORT_SYMBOL_GPL(libeth_xdp_buff_add_frag);
  * @act: original XDP prog verdict
  * @ret: error code if redirect failed
  *
- * External helper used by __libeth_xdp_run_prog(), do not call directly.
+ * External helper used by __libeth_xdp_run_prog() and
+ * __libeth_xsk_run_prog_slow(), do not call directly.
  * Reports invalid @act, XDP exception trace event and frees the buffer.
  *
  * Return: libeth_xdp XDP prog verdict.
@@ -300,6 +301,9 @@ u32 __cold libeth_xdp_prog_exception(const struct libeth_xdp_tx_bulk *bq,
 
 	libeth_trace_xdp_exception(bq->dev, bq->prog, act);
 
+	if (xdp->base.rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL)
+		return libeth_xsk_prog_exception(xdp, act, ret);
+
 	libeth_xdp_return_buff_slow(xdp);
 
 	return LIBETH_XDP_DROP;
diff --git a/drivers/net/ethernet/intel/libeth/xsk.c b/drivers/net/ethernet/intel/libeth/xsk.c
index f09e1940183b..f8f4016d1b25 100644
--- a/drivers/net/ethernet/intel/libeth/xsk.c
+++ b/drivers/net/ethernet/intel/libeth/xsk.c
@@ -38,3 +38,110 @@ void libeth_xsk_buff_free_slow(struct libeth_xdp_buff *xdp)
 	xsk_buff_free(&xdp->base);
 }
 EXPORT_SYMBOL_GPL(libeth_xsk_buff_free_slow);
+
+/**
+ * libeth_xsk_buff_add_frag - add frag to XSk Rx buffer
+ * @head: head buffer
+ * @xdp: frag buffer
+ *
+ * External helper used by libeth_xsk_process_buff(), do not call directly.
+ * Frees both main and frag buffers on error.
+ *
+ * Return: main buffer with attached frag on success, %NULL on error (no space
+ * for a new frag).
+ */
+struct libeth_xdp_buff *libeth_xsk_buff_add_frag(struct libeth_xdp_buff *head,
+						 struct libeth_xdp_buff *xdp)
+{
+	if (!xsk_buff_add_frag(&head->base, &xdp->base))
+		goto free;
+
+	return head;
+
+free:
+	libeth_xsk_buff_free_slow(xdp);
+	libeth_xsk_buff_free_slow(head);
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(libeth_xsk_buff_add_frag);
+
+/**
+ * libeth_xsk_buff_stats_frags - update onstack RQ stats with XSk frags info
+ * @rs: onstack stats to update
+ * @xdp: buffer to account
+ *
+ * External helper used by __libeth_xsk_run_pass(), do not call directly.
+ * Adds buffer's frags count and total len to the onstack stats.
+ */
+void libeth_xsk_buff_stats_frags(struct libeth_rq_napi_stats *rs,
+				 const struct libeth_xdp_buff *xdp)
+{
+	libeth_xdp_buff_stats_frags(rs, xdp);
+}
+EXPORT_SYMBOL_GPL(libeth_xsk_buff_stats_frags);
+
+/**
+ * __libeth_xsk_run_prog_slow - process the non-``XDP_REDIRECT`` verdicts
+ * @xdp: buffer to process
+ * @bq: Tx bulk for queueing on ``XDP_TX``
+ * @act: verdict to process
+ * @ret: error code if ``XDP_REDIRECT`` failed
+ *
+ * External helper used by __libeth_xsk_run_prog(), do not call directly.
+ * ``XDP_REDIRECT`` is the most common and hottest verdict on XSk, thus
+ * it is processed inline. The rest goes here for out-of-line processing,
+ * together with redirect errors.
+ *
+ * Return: libeth_xdp XDP prog verdict.
+ */
+u32 __libeth_xsk_run_prog_slow(struct libeth_xdp_buff *xdp,
+			       const struct libeth_xdp_tx_bulk *bq,
+			       enum xdp_action act, int ret)
+{
+	switch (act) {
+	case XDP_DROP:
+		xsk_buff_free(&xdp->base);
+
+		return LIBETH_XDP_DROP;
+	case XDP_TX:
+		return LIBETH_XDP_TX;
+	case XDP_PASS:
+		return LIBETH_XDP_PASS;
+	default:
+		break;
+	}
+
+	return libeth_xdp_prog_exception(bq, xdp, act, ret);
+}
+EXPORT_SYMBOL_GPL(__libeth_xsk_run_prog_slow);
+
+/**
+ * libeth_xsk_prog_exception - handle XDP prog exceptions on XSk
+ * @xdp: buffer to process
+ * @act: verdict returned by the prog
+ * @ret: error code if ``XDP_REDIRECT`` failed
+ *
+ * Internal. Frees the buffer and, if the queue uses XSk wakeups, stops the
+ * current NAPI poll when there are no free buffers left.
+ *
+ * Return: libeth_xdp's XDP prog verdict.
+ */
+u32 __cold libeth_xsk_prog_exception(struct libeth_xdp_buff *xdp,
+				     enum xdp_action act, int ret)
+{
+	const struct xdp_buff_xsk *xsk;
+	u32 __ret = LIBETH_XDP_DROP;
+
+	if (act != XDP_REDIRECT)
+		goto drop;
+
+	xsk = container_of(&xdp->base, typeof(*xsk), xdp);
+	if (xsk_uses_need_wakeup(xsk->pool) && ret == -ENOBUFS)
+		__ret = LIBETH_XDP_ABORTED;
+
+drop:
+	libeth_xsk_buff_free_slow(xdp);
+
+	return __ret;
+}
diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h
index c3655458047d..dba09a9168f1 100644
--- a/include/net/libeth/xdp.h
+++ b/include/net/libeth/xdp.h
@@ -1122,18 +1122,19 @@ __libeth_xdp_xmit_do_bulk(struct libeth_xdp_tx_bulk *bq,
  * Should be called on an onstack XDP Tx bulk before the NAPI polling loop.
  * Initializes all the needed fields to run libeth_xdp functions. If @num == 0,
  * assumes XDP is not enabled.
+ * Do not use for XSk, it has its own optimized helper.
  */
 #define libeth_xdp_tx_init_bulk(bq, prog, dev, xdpsqs, num)		      \
 	__libeth_xdp_tx_init_bulk(bq, prog, dev, xdpsqs, num, false,	      \
 				  __UNIQUE_ID(bq_), __UNIQUE_ID(nqs_))
 
-#define __libeth_xdp_tx_init_bulk(bq, pr, d, xdpsqs, num, ub, un) do {	      \
+#define __libeth_xdp_tx_init_bulk(bq, pr, d, xdpsqs, num, xsk, ub, un) do {   \
 	typeof(bq) ub = (bq);						      \
 	u32 un = (num);							      \
 									      \
 	rcu_read_lock();						      \
 									      \
-	if (un) {							      \
+	if (un || (xsk)) {						      \
 		ub->prog = rcu_dereference(pr);				      \
 		ub->dev = (d);						      \
 		ub->xdpsq = (xdpsqs)[libeth_xdpsq_id(un)];		      \
@@ -1159,6 +1160,7 @@ void __libeth_xdp_return_stash(struct libeth_xdp_buff_stash *stash);
  *
  * Should be called before the main NAPI polling loop. Loads the content of
  * the previously saved stash or initializes the buffer from scratch.
+ * Do not use for XSk.
  */
 static inline void
 libeth_xdp_init_buff(struct libeth_xdp_buff *dst,
@@ -1378,7 +1380,7 @@ out:
  * @flush_bulk: driver callback for flushing a bulk
  *
  * Internal inline abstraction to run XDP program and additionally handle
- * ``XDP_TX`` verdict.
+ * ``XDP_TX`` verdict. Used by both XDP and XSk, hence @run and @queue.
  * Do not use directly.
  *
  * Return: libeth_xdp prog verdict depending on the prog's verdict.
@@ -1408,12 +1410,13 @@ __libeth_xdp_run_flush(struct libeth_xdp_buff *xdp,
 }
 
 /**
- * libeth_xdp_run_prog - run XDP program and handle all verdicts
+ * libeth_xdp_run_prog - run XDP program (non-XSk path) and handle all verdicts
  * @xdp: XDP buffer to process
  * @bq: XDP Tx bulk to queue ``XDP_TX`` buffers
  * @fl: driver ``XDP_TX`` bulk flush callback
  *
- * Run the attached XDP program and handle all possible verdicts.
+ * Run the attached XDP program and handle all possible verdicts. XSk has its
+ * own version.
  * Prefer using it via LIBETH_XDP_DEFINE_RUN{,_PASS,_PROG}().
  *
  * Return: true if the buffer should be passed up the stack, false if the poll
@@ -1435,7 +1438,7 @@ __libeth_xdp_run_flush(struct libeth_xdp_buff *xdp,
  * @run: driver wrapper to run XDP program
  * @populate: driver callback to populate an skb with the HW descriptor data
  *
- * Inline abstraction that does the following:
+ * Inline abstraction that does the following (non-XSk path):
  * 1) adds frame size and frag number (if needed) to the onstack stats;
  * 2) fills the descriptor metadata to the onstack &libeth_xdp_buff
  * 3) runs XDP program if present;
@@ -1518,7 +1521,7 @@ static inline void libeth_xdp_prep_desc(struct libeth_xdp_buff *xdp,
 			      run, populate)
 
 /**
- * libeth_xdp_finalize_rx - finalize XDPSQ after a NAPI polling loop
+ * libeth_xdp_finalize_rx - finalize XDPSQ after a NAPI polling loop (non-XSk)
  * @bq: ``XDP_TX`` frame bulk
  * @flush: driver callback to flush the bulk
  * @finalize: driver callback to start sending the frames and run the timer
diff --git a/include/net/libeth/xsk.h b/include/net/libeth/xsk.h
index 16ca195981fe..f3f338e566fc 100644
--- a/include/net/libeth/xsk.h
+++ b/include/net/libeth/xsk.h
@@ -311,4 +311,277 @@ libeth_xsk_xmit_do_bulk(struct xsk_buff_pool *pool, void *xdpsq, u32 budget,
 	return n < budget;
 }
 
+/* Rx polling path */
+
+/**
+ * libeth_xsk_tx_init_bulk - initialize XDP Tx bulk for an XSk Rx NAPI poll
+ * @bq: bulk to initialize
+ * @prog: RCU pointer to the XDP program (never %NULL)
+ * @dev: target &net_device
+ * @xdpsqs: array of driver XDPSQ structs
+ * @num: number of active XDPSQs, the above array length
+ *
+ * Should be called on an onstack XDP Tx bulk before the XSk NAPI polling loop.
+ * Initializes all the needed fields to run libeth_xdp functions.
+ * Never checks if @prog is %NULL or @num == 0 as XDP must always be enabled
+ * when hitting this path.
+ */
+#define libeth_xsk_tx_init_bulk(bq, prog, dev, xdpsqs, num)		     \
+	__libeth_xdp_tx_init_bulk(bq, prog, dev, xdpsqs, num, true,	     \
+				  __UNIQUE_ID(bq_), __UNIQUE_ID(nqs_))
+
+struct libeth_xdp_buff *libeth_xsk_buff_add_frag(struct libeth_xdp_buff *head,
+						 struct libeth_xdp_buff *xdp);
+
+/**
+ * libeth_xsk_process_buff - attach XSk Rx buffer to &libeth_xdp_buff
+ * @head: head XSk buffer to attach the XSk buffer to (or %NULL)
+ * @xdp: XSk buffer to process
+ * @len: received data length from the descriptor
+ *
+ * If @head == %NULL, treats the XSk buffer as head and initializes
+ * the required fields. Otherwise, attaches the buffer as a frag.
+ * Already performs DMA sync-for-CPU and frame start prefetch
+ * (for head buffers only).
+ *
+ * Return: head XSk buffer on success or if the descriptor must be skipped
+ * (empty), %NULL if there is no space for a new frag.
+ */
+static inline struct libeth_xdp_buff *
+libeth_xsk_process_buff(struct libeth_xdp_buff *head,
+			struct libeth_xdp_buff *xdp, u32 len)
+{
+	if (unlikely(!len)) {
+		libeth_xsk_buff_free_slow(xdp);
+		return head;
+	}
+
+	xsk_buff_set_size(&xdp->base, len);
+	xsk_buff_dma_sync_for_cpu(&xdp->base);
+
+	if (head)
+		return libeth_xsk_buff_add_frag(head, xdp);
+
+	prefetch(xdp->data);
+
+	return xdp;
+}
+
+void libeth_xsk_buff_stats_frags(struct libeth_rq_napi_stats *rs,
+				 const struct libeth_xdp_buff *xdp);
+
+u32 __libeth_xsk_run_prog_slow(struct libeth_xdp_buff *xdp,
+			       const struct libeth_xdp_tx_bulk *bq,
+			       enum xdp_action act, int ret);
+
+/**
+ * __libeth_xsk_run_prog - run XDP program on XSk buffer
+ * @xdp: XSk buffer to run the prog on
+ * @bq: buffer bulk for ``XDP_TX`` queueing
+ *
+ * Internal inline abstraction to run XDP program on XSk Rx path. Handles
+ * only the most common ``XDP_REDIRECT`` inline, the rest is processed
+ * externally.
+ * Reports an XDP prog exception on errors.
+ *
+ * Return: libeth_xdp prog verdict depending on the prog's verdict.
+ */
+static __always_inline u32
+__libeth_xsk_run_prog(struct libeth_xdp_buff *xdp,
+		      const struct libeth_xdp_tx_bulk *bq)
+{
+	enum xdp_action act;
+	int ret = 0;
+
+	act = bpf_prog_run_xdp(bq->prog, &xdp->base);
+	if (unlikely(act != XDP_REDIRECT))
+rest:
+		return __libeth_xsk_run_prog_slow(xdp, bq, act, ret);
+
+	ret = xdp_do_redirect(bq->dev, &xdp->base, bq->prog);
+	if (unlikely(ret))
+		goto rest;
+
+	return LIBETH_XDP_REDIRECT;
+}
+
+/**
+ * libeth_xsk_run_prog - run XDP program on XSk path and handle all verdicts
+ * @xdp: XSk buffer to process
+ * @bq: XDP Tx bulk to queue ``XDP_TX`` buffers
+ * @fl: driver ``XDP_TX`` bulk flush callback
+ *
+ * Run the attached XDP program and handle all possible verdicts.
+ * Prefer using it via LIBETH_XSK_DEFINE_RUN{,_PASS,_PROG}().
+ *
+ * Return: libeth_xdp prog verdict depending on the prog's verdict.
+ */
+#define libeth_xsk_run_prog(xdp, bq, fl)				     \
+	__libeth_xdp_run_flush(xdp, bq, __libeth_xsk_run_prog,		     \
+			       libeth_xsk_tx_queue_bulk, fl)
+
+/**
+ * __libeth_xsk_run_pass - helper to run XDP program and handle the result
+ * @xdp: XSk buffer to process
+ * @bq: XDP Tx bulk to queue ``XDP_TX`` frames
+ * @napi: NAPI to build an skb and pass it up the stack
+ * @rs: onstack libeth RQ stats
+ * @md: metadata that should be filled to the XSk buffer
+ * @prep: callback for filling the metadata
+ * @run: driver wrapper to run XDP program
+ * @populate: driver callback to populate an skb with the HW descriptor data
+ *
+ * Inline abstraction, XSk's counterpart of __libeth_xdp_run_pass(), see its
+ * doc for details.
+ *
+ * Return: false if the polling loop must be exited due to lack of free
+ * buffers, true otherwise.
+ */
+static __always_inline bool
+__libeth_xsk_run_pass(struct libeth_xdp_buff *xdp,
+		      struct libeth_xdp_tx_bulk *bq, struct napi_struct *napi,
+		      struct libeth_rq_napi_stats *rs, const void *md,
+		      void (*prep)(struct libeth_xdp_buff *xdp,
+				   const void *md),
+		      u32 (*run)(struct libeth_xdp_buff *xdp,
+				 struct libeth_xdp_tx_bulk *bq),
+		      bool (*populate)(struct sk_buff *skb,
+				       const struct libeth_xdp_buff *xdp,
+				       struct libeth_rq_napi_stats *rs))
+{
+	struct sk_buff *skb;
+	u32 act;
+
+	rs->bytes += xdp->base.data_end - xdp->data;
+	rs->packets++;
+
+	if (unlikely(xdp_buff_has_frags(&xdp->base)))
+		libeth_xsk_buff_stats_frags(rs, xdp);
+
+	if (prep && (!__builtin_constant_p(!!md) || md))
+		prep(xdp, md);
+
+	act = run(xdp, bq);
+	if (likely(act == LIBETH_XDP_REDIRECT))
+		return true;
+
+	if (act != LIBETH_XDP_PASS)
+		return act != LIBETH_XDP_ABORTED;
+
+	skb = xdp_build_skb_from_zc(&xdp->base);
+	if (unlikely(!skb)) {
+		libeth_xsk_buff_free_slow(xdp);
+		return true;
+	}
+
+	if (unlikely(!populate(skb, xdp, rs))) {
+		napi_consume_skb(skb, true);
+		return true;
+	}
+
+	napi_gro_receive(napi, skb);
+
+	return true;
+}
+
+/**
+ * libeth_xsk_run_pass - helper to run XDP program and handle the result
+ * @xdp: XSk buffer to process
+ * @bq: XDP Tx bulk to queue ``XDP_TX`` frames
+ * @napi: NAPI to build an skb and pass it up the stack
+ * @rs: onstack libeth RQ stats
+ * @desc: pointer to the HW descriptor for that frame
+ * @run: driver wrapper to run XDP program
+ * @populate: driver callback to populate an skb with the HW descriptor data
+ *
+ * Wrapper around the underscored version when "fill the descriptor metadata"
+ * means just writing the pointer to the HW descriptor as @xdp->desc.
+ */
+#define libeth_xsk_run_pass(xdp, bq, napi, rs, desc, run, populate)	     \
+	__libeth_xsk_run_pass(xdp, bq, napi, rs, desc, libeth_xdp_prep_desc, \
+			      run, populate)
+
+/**
+ * libeth_xsk_finalize_rx - finalize XDPSQ after an XSk NAPI polling loop
+ * @bq: ``XDP_TX`` frame bulk
+ * @flush: driver callback to flush the bulk
+ * @finalize: driver callback to start sending the frames and run the timer
+ *
+ * Flush the bulk if there are frames left to send, kick the queue and flush
+ * the XDP maps.
+ */
+#define libeth_xsk_finalize_rx(bq, flush, finalize)			     \
+	__libeth_xdp_finalize_rx(bq, LIBETH_XDP_TX_XSK, flush, finalize)
+
+/*
+ * Helpers to reduce boilerplate code in drivers.
+ *
+ * Typical driver XSk Rx flow would be (excl. bulk and buff init, frag attach):
+ *
+ * LIBETH_XDP_DEFINE_START();
+ * LIBETH_XSK_DEFINE_FLUSH_TX(static driver_xsk_flush_tx, driver_xsk_tx_prep,
+ *			      driver_xdp_xmit);
+ * LIBETH_XSK_DEFINE_RUN(static driver_xsk_run, driver_xsk_run_prog,
+ *			 driver_xsk_flush_tx, driver_populate_skb);
+ * LIBETH_XSK_DEFINE_FINALIZE(static driver_xsk_finalize_rx,
+ *			      driver_xsk_flush_tx, driver_xdp_finalize_sq);
+ * LIBETH_XDP_DEFINE_END();
+ *
+ * This will build a set of 4 static functions. The compiler is free to decide
+ * whether to inline them.
+ * Then, in the NAPI polling function:
+ *
+ *	while (packets < budget) {
+ *		// ...
+ *		if (!driver_xsk_run(xdp, &bq, napi, &rs, desc))
+ *			break;
+ *	}
+ *	driver_xsk_finalize_rx(&bq);
+ */
+
+/**
+ * LIBETH_XSK_DEFINE_FLUSH_TX - define a driver XSk ``XDP_TX`` flush function
+ * @name: name of the function to define
+ * @prep: driver callback to clean an XDPSQ
+ * @xmit: driver callback to write a HW Tx descriptor
+ */
+#define LIBETH_XSK_DEFINE_FLUSH_TX(name, prep, xmit)			     \
+	__LIBETH_XDP_DEFINE_FLUSH_TX(name, prep, xmit, xsk)
+
+/**
+ * LIBETH_XSK_DEFINE_RUN_PROG - define a driver XDP program run function
+ * @name: name of the function to define
+ * @flush: driver callback to flush an XSk ``XDP_TX`` bulk
+ */
+#define LIBETH_XSK_DEFINE_RUN_PROG(name, flush)				     \
+	u32 __LIBETH_XDP_DEFINE_RUN_PROG(name, flush, xsk)
+
+/**
+ * LIBETH_XSK_DEFINE_RUN_PASS - define a driver buffer process + pass function
+ * @name: name of the function to define
+ * @run: driver callback to run XDP program (above)
+ * @populate: driver callback to fill an skb with HW descriptor info
+ */
+#define LIBETH_XSK_DEFINE_RUN_PASS(name, run, populate)			     \
+	bool __LIBETH_XDP_DEFINE_RUN_PASS(name, run, populate, xsk)
+
+/**
+ * LIBETH_XSK_DEFINE_RUN - define a driver buffer process, run + pass function
+ * @name: name of the function to define
+ * @run: name of the XDP prog run function to define
+ * @flush: driver callback to flush an XSk ``XDP_TX`` bulk
+ * @populate: driver callback to fill an skb with HW descriptor info
+ */
+#define LIBETH_XSK_DEFINE_RUN(name, run, flush, populate)		     \
+	__LIBETH_XDP_DEFINE_RUN(name, run, flush, populate, XSK)
+
+/**
+ * LIBETH_XSK_DEFINE_FINALIZE - define a driver XSk NAPI poll finalize function
+ * @name: name of the function to define
+ * @flush: driver callback to flush an XSk ``XDP_TX`` bulk
+ * @finalize: driver callback to finalize an XDPSQ and run the timer
+ */
+#define LIBETH_XSK_DEFINE_FINALIZE(name, flush, finalize)		     \
+	__LIBETH_XDP_DEFINE_FINALIZE(name, flush, finalize, xsk)
+
 #endif /* __LIBETH_XSK_H */
-- 
cgit v1.2.3


From 3ced71a8b39e84f91a4fa9d42e85815515f9b1bc Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Thu, 12 Jun 2025 18:02:33 +0200
Subject: libeth: xsk: add XSkFQ refill and XSk wakeup helpers

XSkFQ refill is pretty generic across the drivers minus FQ descriptor
filling and can easily be unified with one inline callback.
XSk wakeup is usually not, but here, instead of commonly used
"SW interrupts", I picked firing an IPI. In most tests, it showed better
performance; it also provides better control for userspace on which CPU
will handle the xmit, as SW interrupts honor IRQ affinity no matter
which core produces XSk xmit descs (while XDPSQs are associated 1:1
with cores having the same ID).

Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/libeth/xsk.c | 124 ++++++++++++++++++++++++++++++++
 include/net/libeth/xsk.h                |  98 +++++++++++++++++++++++++
 2 files changed, 222 insertions(+)

(limited to 'include')

diff --git a/drivers/net/ethernet/intel/libeth/xsk.c b/drivers/net/ethernet/intel/libeth/xsk.c
index f8f4016d1b25..846e902e31b6 100644
--- a/drivers/net/ethernet/intel/libeth/xsk.c
+++ b/drivers/net/ethernet/intel/libeth/xsk.c
@@ -145,3 +145,127 @@ drop:
 
 	return __ret;
 }
+
+/* Refill */
+
+/**
+ * libeth_xskfq_create - create an XSkFQ
+ * @fq: fill queue to initialize
+ *
+ * Allocates the FQEs and initializes the fields used by libeth_xdp: number
+ * of buffers to refill, refill threshold and buffer len.
+ *
+ * Return: %0 on success, -errno otherwise.
+ */
+int libeth_xskfq_create(struct libeth_xskfq *fq)
+{
+	fq->fqes = kvcalloc_node(fq->count, sizeof(*fq->fqes), GFP_KERNEL,
+				 fq->nid);
+	if (!fq->fqes)
+		return -ENOMEM;
+
+	fq->pending = fq->count;
+	fq->thresh = libeth_xdp_queue_threshold(fq->count);
+	fq->buf_len = xsk_pool_get_rx_frame_size(fq->pool);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(libeth_xskfq_create);
+
+/**
+ * libeth_xskfq_destroy - destroy an XSkFQ
+ * @fq: fill queue to destroy
+ *
+ * Zeroes the used fields and frees the FQEs array.
+ */
+void libeth_xskfq_destroy(struct libeth_xskfq *fq)
+{
+	fq->buf_len = 0;
+	fq->thresh = 0;
+	fq->pending = 0;
+
+	kvfree(fq->fqes);
+}
+EXPORT_SYMBOL_GPL(libeth_xskfq_destroy);
+
+/* .ndo_xsk_wakeup */
+
+static void libeth_xsk_napi_sched(void *info)
+{
+	__napi_schedule_irqoff(info);
+}
+
+/**
+ * libeth_xsk_init_wakeup - initialize libeth XSk wakeup structure
+ * @csd: struct to initialize
+ * @napi: NAPI corresponding to this queue
+ *
+ * libeth_xdp uses inter-processor interrupts to perform XSk wakeups. In order
+ * to do that, the corresponding CSDs must be initialized when creating the
+ * queues.
+ */
+void libeth_xsk_init_wakeup(call_single_data_t *csd, struct napi_struct *napi)
+{
+	INIT_CSD(csd, libeth_xsk_napi_sched, napi);
+}
+EXPORT_SYMBOL_GPL(libeth_xsk_init_wakeup);
+
+/**
+ * libeth_xsk_wakeup - perform an XSk wakeup
+ * @csd: CSD corresponding to the queue
+ * @qid: the stack queue index
+ *
+ * Try to mark the NAPI as missed first, so that it could be rescheduled.
+ * If it's not, schedule it on the corresponding CPU using IPIs (or directly
+ * if already running on it).
+ */
+void libeth_xsk_wakeup(call_single_data_t *csd, u32 qid)
+{
+	struct napi_struct *napi = csd->info;
+
+	if (napi_if_scheduled_mark_missed(napi) ||
+	    unlikely(!napi_schedule_prep(napi)))
+		return;
+
+	if (unlikely(qid >= nr_cpu_ids))
+		qid %= nr_cpu_ids;
+
+	if (qid != raw_smp_processor_id() && cpu_online(qid))
+		smp_call_function_single_async(qid, csd);
+	else
+		__napi_schedule(napi);
+}
+EXPORT_SYMBOL_GPL(libeth_xsk_wakeup);
+
+/* Pool setup */
+
+#define LIBETH_XSK_DMA_ATTR					\
+	(DMA_ATTR_WEAK_ORDERING | DMA_ATTR_SKIP_CPU_SYNC)
+
+/**
+ * libeth_xsk_setup_pool - setup or destroy an XSk pool for a queue
+ * @dev: target &net_device
+ * @qid: stack queue index to configure
+ * @enable: whether to enable or disable the pool
+ *
+ * Check that @qid is valid and then map or unmap the pool.
+ *
+ * Return: %0 on success, -errno otherwise.
+ */
+int libeth_xsk_setup_pool(struct net_device *dev, u32 qid, bool enable)
+{
+	struct xsk_buff_pool *pool;
+
+	pool = xsk_get_pool_from_qid(dev, qid);
+	if (!pool)
+		return -EINVAL;
+
+	if (enable)
+		return xsk_pool_dma_map(pool, dev->dev.parent,
+					LIBETH_XSK_DMA_ATTR);
+	else
+		xsk_pool_dma_unmap(pool, LIBETH_XSK_DMA_ATTR);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(libeth_xsk_setup_pool);
diff --git a/include/net/libeth/xsk.h b/include/net/libeth/xsk.h
index f3f338e566fc..213778a68476 100644
--- a/include/net/libeth/xsk.h
+++ b/include/net/libeth/xsk.h
@@ -584,4 +584,102 @@ __libeth_xsk_run_pass(struct libeth_xdp_buff *xdp,
 #define LIBETH_XSK_DEFINE_FINALIZE(name, flush, finalize)		     \
 	__LIBETH_XDP_DEFINE_FINALIZE(name, flush, finalize, xsk)
 
+/* Refilling */
+
+/**
+ * struct libeth_xskfq - structure representing an XSk buffer (fill) queue
+ * @fp: hotpath part of the structure
+ * @pool: &xsk_buff_pool for buffer management
+ * @fqes: array of XSk buffer pointers
+ * @descs: opaque pointer to the HW descriptor array
+ * @ntu: index of the next buffer to poll
+ * @count: number of descriptors/buffers the queue has
+ * @pending: current number of XSkFQEs to refill
+ * @thresh: threshold below which the queue is refilled
+ * @buf_len: HW-writeable length per each buffer
+ * @nid: ID of the closest NUMA node with memory
+ */
+struct libeth_xskfq {
+	struct_group_tagged(libeth_xskfq_fp, fp,
+		struct xsk_buff_pool	*pool;
+		struct libeth_xdp_buff	**fqes;
+		void			*descs;
+
+		u32			ntu;
+		u32			count;
+	);
+
+	/* Cold fields */
+	u32			pending;
+	u32			thresh;
+
+	u32			buf_len;
+	int			nid;
+};
+
+int libeth_xskfq_create(struct libeth_xskfq *fq);
+void libeth_xskfq_destroy(struct libeth_xskfq *fq);
+
+/**
+ * libeth_xsk_buff_xdp_get_dma - get DMA address of XSk &libeth_xdp_buff
+ * @xdp: buffer to get the DMA addr for
+ */
+#define libeth_xsk_buff_xdp_get_dma(xdp)				     \
+	xsk_buff_xdp_get_dma(&(xdp)->base)
+
+/**
+ * libeth_xskfqe_alloc - allocate @n XSk Rx buffers
+ * @fq: hotpath part of the XSkFQ, usually onstack
+ * @n: number of buffers to allocate
+ * @fill: driver callback to write DMA addresses to HW descriptors
+ *
+ * Note that @fq->ntu gets updated, but ::pending must be recalculated
+ * by the caller.
+ *
+ * Return: number of buffers refilled.
+ */
+static __always_inline u32
+libeth_xskfqe_alloc(struct libeth_xskfq_fp *fq, u32 n,
+		    void (*fill)(const struct libeth_xskfq_fp *fq, u32 i))
+{
+	u32 this, ret, done = 0;
+	struct xdp_buff **xskb;
+
+	this = fq->count - fq->ntu;
+	if (likely(this > n))
+		this = n;
+
+again:
+	xskb = (typeof(xskb))&fq->fqes[fq->ntu];
+	ret = xsk_buff_alloc_batch(fq->pool, xskb, this);
+
+	for (u32 i = 0, ntu = fq->ntu; likely(i < ret); i++)
+		fill(fq, ntu + i);
+
+	done += ret;
+	fq->ntu += ret;
+
+	if (likely(fq->ntu < fq->count) || unlikely(ret < this))
+		goto out;
+
+	fq->ntu = 0;
+
+	if (this < n) {
+		this = n - this;
+		goto again;
+	}
+
+out:
+	return done;
+}
+
+/* .ndo_xsk_wakeup */
+
+void libeth_xsk_init_wakeup(call_single_data_t *csd, struct napi_struct *napi);
+void libeth_xsk_wakeup(call_single_data_t *csd, u32 qid);
+
+/* Pool setup */
+
+int libeth_xsk_setup_pool(struct net_device *dev, u32 qid, bool enable);
+
 #endif /* __LIBETH_XSK_H */
-- 
cgit v1.2.3


From 80bae9df2108cb72a060ee5235614d7c072af1de Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Thu, 12 Jun 2025 18:02:34 +0200
Subject: libeth: xdp, xsk: access adjacent u32s as u64 where applicable

On 64-bit systems, writing/reading one u64 is faster than two u32s even
when they're adjacent in a struct. The compilers won't guarantee
they will combine those; I observed both successful and unsuccessful
attempts with both GCC and Clang, and it's not easy to say what it
depends on.
There are a few places in libeth_xdp winning up to several percent from
combined access (both performance and object code size, especially
when unrolling). Add __LIBETH_WORD_ACCESS and use it there on LE.
Drivers are free to optimize HW-specific callbacks under the same
definition.

Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 include/net/libeth/xdp.h | 29 ++++++++++++++++++++++++++---
 include/net/libeth/xsk.h | 10 +++++-----
 2 files changed, 31 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h
index dba09a9168f1..6ce6aec6884c 100644
--- a/include/net/libeth/xdp.h
+++ b/include/net/libeth/xdp.h
@@ -475,6 +475,21 @@ struct libeth_xdp_tx_desc {
 	((const void *)(uintptr_t)(priv));				      \
 })
 
+/*
+ * On 64-bit systems, assigning one u64 is faster than two u32s. When ::len
+ * occupies lowest 32 bits (LE), whole ::opts can be assigned directly instead.
+ */
+#ifdef __LITTLE_ENDIAN
+#define __LIBETH_WORD_ACCESS		1
+#endif
+#ifdef __LIBETH_WORD_ACCESS
+#define __libeth_xdp_tx_len(flen, ...)					      \
+	.opts = ((flen) | FIELD_PREP(GENMASK_ULL(63, 32), (__VA_ARGS__ + 0)))
+#else
+#define __libeth_xdp_tx_len(flen, ...)					      \
+	.len = (flen), .flags = (__VA_ARGS__ + 0)
+#endif
+
 /**
  * libeth_xdp_tx_xmit_bulk - main XDP Tx function
  * @bulk: array of frames to send
@@ -870,8 +885,7 @@ static inline u32 libeth_xdp_xmit_queue_head(struct libeth_xdp_tx_bulk *bq,
 
 	bq->bulk[bq->count++] = (typeof(*bq->bulk)){
 		.xdpf	= xdpf,
-		.len	= xdpf->len,
-		.flags	= LIBETH_XDP_TX_FIRST,
+		__libeth_xdp_tx_len(xdpf->len, LIBETH_XDP_TX_FIRST),
 	};
 
 	if (!xdp_frame_has_frags(xdpf))
@@ -902,7 +916,7 @@ static inline bool libeth_xdp_xmit_queue_frag(struct libeth_xdp_tx_bulk *bq,
 
 	bq->bulk[bq->count++] = (typeof(*bq->bulk)){
 		.dma	= dma,
-		.len	= skb_frag_size(frag),
+		__libeth_xdp_tx_len(skb_frag_size(frag)),
 	};
 
 	return true;
@@ -1260,6 +1274,7 @@ bool libeth_xdp_buff_add_frag(struct libeth_xdp_buff *xdp,
  * Internal, use libeth_xdp_process_buff() instead. Initializes XDP buffer
  * head with the Rx buffer data: data pointer, length, headroom, and
  * truesize/tailroom. Zeroes the flags.
+ * Uses faster single u64 write instead of per-field access.
  */
 static inline void libeth_xdp_prepare_buff(struct libeth_xdp_buff *xdp,
 					   const struct libeth_fqe *fqe,
@@ -1267,7 +1282,15 @@ static inline void libeth_xdp_prepare_buff(struct libeth_xdp_buff *xdp,
 {
 	const struct page *page = __netmem_to_page(fqe->netmem);
 
+#ifdef __LIBETH_WORD_ACCESS
+	static_assert(offsetofend(typeof(xdp->base), flags) -
+		      offsetof(typeof(xdp->base), frame_sz) ==
+		      sizeof(u64));
+
+	*(u64 *)&xdp->base.frame_sz = fqe->truesize;
+#else
 	xdp_init_buff(&xdp->base, fqe->truesize, xdp->base.rxq);
+#endif
 	xdp_prepare_buff(&xdp->base, page_address(page) + fqe->offset,
 			 page->pp->p.offset, len, true);
 }
diff --git a/include/net/libeth/xsk.h b/include/net/libeth/xsk.h
index 213778a68476..481a7b28e6f2 100644
--- a/include/net/libeth/xsk.h
+++ b/include/net/libeth/xsk.h
@@ -26,8 +26,8 @@ static inline bool libeth_xsk_tx_queue_head(struct libeth_xdp_tx_bulk *bq,
 {
 	bq->bulk[bq->count++] = (typeof(*bq->bulk)){
 		.xsk	= xdp,
-		.len	= xdp->base.data_end - xdp->data,
-		.flags	= LIBETH_XDP_TX_FIRST,
+		__libeth_xdp_tx_len(xdp->base.data_end - xdp->data,
+				    LIBETH_XDP_TX_FIRST),
 	};
 
 	if (likely(!xdp_buff_has_frags(&xdp->base)))
@@ -48,7 +48,7 @@ static inline void libeth_xsk_tx_queue_frag(struct libeth_xdp_tx_bulk *bq,
 {
 	bq->bulk[bq->count++] = (typeof(*bq->bulk)){
 		.xsk	= frag,
-		.len	= frag->base.data_end - frag->data,
+		__libeth_xdp_tx_len(frag->base.data_end - frag->data),
 	};
 }
 
@@ -199,7 +199,7 @@ __libeth_xsk_xmit_fill_buf_md(const struct xdp_desc *xdesc,
 	ctx = xsk_buff_raw_get_ctx(sq->pool, xdesc->addr);
 	desc = (typeof(desc)){
 		.addr	= ctx.dma,
-		.len	= xdesc->len,
+		__libeth_xdp_tx_len(xdesc->len),
 	};
 
 	BUILD_BUG_ON(!__builtin_constant_p(tmo == libeth_xsktmo));
@@ -226,7 +226,7 @@ __libeth_xsk_xmit_fill_buf(const struct xdp_desc *xdesc,
 {
 	return (struct libeth_xdp_tx_desc){
 		.addr	= xsk_buff_raw_get_dma(sq->pool, xdesc->addr),
-		.len	= xdesc->len,
+		__libeth_xdp_tx_len(xdesc->len),
 	};
 }
 
-- 
cgit v1.2.3


From acc379c63ade8e247fb792ccdd4ae9a208530c1a Mon Sep 17 00:00:00 2001
From: Andrea della Porta <andrea.porta@suse.com>
Date: Thu, 29 May 2025 15:50:38 +0200
Subject: dt-bindings: clock: Add RaspberryPi RP1 clock bindings

Add device tree bindings for the clock generator found in RP1 multi
function device, and relative entries in MAINTAINERS file.

Signed-off-by: Andrea della Porta <andrea.porta@suse.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Link: https://lore.kernel.org/r/20250529135052.28398-1-andrea.porta@suse.com
Signed-off-by: Florian Fainelli <florian.fainelli@broadcom.com>
---
 .../bindings/clock/raspberrypi,rp1-clocks.yaml     | 58 ++++++++++++++++++++
 include/dt-bindings/clock/raspberrypi,rp1-clocks.h | 61 ++++++++++++++++++++++
 2 files changed, 119 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/clock/raspberrypi,rp1-clocks.yaml
 create mode 100644 include/dt-bindings/clock/raspberrypi,rp1-clocks.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/raspberrypi,rp1-clocks.yaml b/Documentation/devicetree/bindings/clock/raspberrypi,rp1-clocks.yaml
new file mode 100644
index 000000000000..cc4491f7ee5f
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/raspberrypi,rp1-clocks.yaml
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/raspberrypi,rp1-clocks.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: RaspberryPi RP1 clock generator
+
+maintainers:
+  - A. della Porta <andrea.porta@suse.com>
+
+description: |
+  The RP1 contains a clock generator designed as three PLLs (CORE, AUDIO,
+  VIDEO), and each PLL output can be programmed through dividers to generate
+  the clocks to drive the sub-peripherals embedded inside the chipset.
+
+  Link to datasheet:
+  https://datasheets.raspberrypi.com/rp1/rp1-peripherals.pdf
+
+properties:
+  compatible:
+    const: raspberrypi,rp1-clocks
+
+  reg:
+    maxItems: 1
+
+  '#clock-cells':
+    const: 1
+    description:
+      The available clocks are defined in
+      include/dt-bindings/clock/raspberrypi,rp1-clocks.h.
+
+  clocks:
+    maxItems: 1
+
+required:
+  - compatible
+  - reg
+  - '#clock-cells'
+  - clocks
+
+additionalProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/clock/raspberrypi,rp1-clocks.h>
+
+    rp1 {
+        #address-cells = <2>;
+        #size-cells = <2>;
+
+        clocks@c040018000 {
+            compatible = "raspberrypi,rp1-clocks";
+            reg = <0xc0 0x40018000 0x0 0x10038>;
+            #clock-cells = <1>;
+            clocks = <&clk_rp1_xosc>;
+        };
+    };
diff --git a/include/dt-bindings/clock/raspberrypi,rp1-clocks.h b/include/dt-bindings/clock/raspberrypi,rp1-clocks.h
new file mode 100644
index 000000000000..248efb895f35
--- /dev/null
+++ b/include/dt-bindings/clock/raspberrypi,rp1-clocks.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (C) 2021 Raspberry Pi Ltd.
+ */
+
+#ifndef __DT_BINDINGS_CLOCK_RASPBERRYPI_RP1
+#define __DT_BINDINGS_CLOCK_RASPBERRYPI_RP1
+
+#define RP1_PLL_SYS_CORE		0
+#define RP1_PLL_AUDIO_CORE		1
+#define RP1_PLL_VIDEO_CORE		2
+
+#define RP1_PLL_SYS			3
+#define RP1_PLL_AUDIO			4
+#define RP1_PLL_VIDEO			5
+
+#define RP1_PLL_SYS_PRI_PH		6
+#define RP1_PLL_SYS_SEC_PH		7
+#define RP1_PLL_AUDIO_PRI_PH		8
+
+#define RP1_PLL_SYS_SEC			9
+#define RP1_PLL_AUDIO_SEC		10
+#define RP1_PLL_VIDEO_SEC		11
+
+#define RP1_CLK_SYS			12
+#define RP1_CLK_SLOW_SYS		13
+#define RP1_CLK_DMA			14
+#define RP1_CLK_UART			15
+#define RP1_CLK_ETH			16
+#define RP1_CLK_PWM0			17
+#define RP1_CLK_PWM1			18
+#define RP1_CLK_AUDIO_IN		19
+#define RP1_CLK_AUDIO_OUT		20
+#define RP1_CLK_I2S			21
+#define RP1_CLK_MIPI0_CFG		22
+#define RP1_CLK_MIPI1_CFG		23
+#define RP1_CLK_PCIE_AUX		24
+#define RP1_CLK_USBH0_MICROFRAME	25
+#define RP1_CLK_USBH1_MICROFRAME	26
+#define RP1_CLK_USBH0_SUSPEND		27
+#define RP1_CLK_USBH1_SUSPEND		28
+#define RP1_CLK_ETH_TSU			29
+#define RP1_CLK_ADC			30
+#define RP1_CLK_SDIO_TIMER		31
+#define RP1_CLK_SDIO_ALT_SRC		32
+#define RP1_CLK_GP0			33
+#define RP1_CLK_GP1			34
+#define RP1_CLK_GP2			35
+#define RP1_CLK_GP3			36
+#define RP1_CLK_GP4			37
+#define RP1_CLK_GP5			38
+#define RP1_CLK_VEC			39
+#define RP1_CLK_DPI			40
+#define RP1_CLK_MIPI0_DPI		41
+#define RP1_CLK_MIPI1_DPI		42
+
+/* Extra PLL output channels - RP1B0 only */
+#define RP1_PLL_VIDEO_PRI_PH		43
+#define RP1_PLL_AUDIO_TERN		44
+
+#endif
-- 
cgit v1.2.3


From 49d63971f96349cdcff89d21786e3c804e7cd4c0 Mon Sep 17 00:00:00 2001
From: Andrea della Porta <andrea.porta@suse.com>
Date: Thu, 29 May 2025 15:50:44 +0200
Subject: misc: rp1: RaspberryPi RP1 misc driver

The RaspberryPi RP1 is a PCI multi function device containing
peripherals ranging from Ethernet to USB controller, I2C, SPI
and others.

Implement a bare minimum driver to operate the RP1, leveraging
actual OF based driver implementations for the on-board peripherals
by loading a devicetree overlay during driver probe if the RP1
node is not already present in the DT.

The peripherals are accessed by mapping MMIO registers starting
from PCI BAR1 region.

With the overlay approach we can achieve a more generic and agnostic
approach to managing this chipset, being that it is a PCI endpoint
and could possibly be reused in other hw implementations. The
presented approach is also used by Bootlin's Microchip LAN966x
patchset (see link) as well, for a similar chipset.
In this case, the inclusion tree for the DT overlay is as follows
(the arrow points to the includer):

 rp1-pci.dtso <---- rp1-common.dtsi

On the other hand, to ensure compatibility with downstream, this
driver can also work with a DT already comprising the RP1 node, so
the dynamically loaded overlay will not be used if the DT is already
fully defined.

The reason why this driver is contained in drivers/misc has
been paved by Bootlin's LAN966X driver, which first used the
overlay approach to implement non discoverable peripherals behind a
PCI bus. For RP1, the same arguments apply: it's not used as an SoC
since the driver code is not running on-chip and is not like an MFD
since it does not really need all the MFD infrastructure (shared regs,
etc.). So, for this particular use, misc has been proposed and deemed
as a good choice. For further details about that please check the links.

This driver is heavily based on downstream code from RaspberryPi
Foundation, and the original author is Phil Elwell.

Link: https://datasheets.raspberrypi.com/rp1/rp1-peripherals.pdf
Link: https://lore.kernel.org/all/20240612140208.GC1504919@google.com/
Link: https://lore.kernel.org/all/83f7fa09-d0e6-4f36-a27d-cee08979be2a@app.fastmail.com/
Link: https://lore.kernel.org/all/2024081356-mutable-everyday-6f9d@gregkh/
Link: https://lore.kernel.org/all/20240808154658.247873-1-herve.codina@bootlin.com/

Signed-off-by: Andrea della Porta <andrea.porta@suse.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>   # quirks.c, pci_ids.h
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/20250529135052.28398-7-andrea.porta@suse.com
Signed-off-by: Florian Fainelli <florian.fainelli@broadcom.com>
---
 drivers/misc/Kconfig          |   1 +
 drivers/misc/Makefile         |   1 +
 drivers/misc/rp1/Kconfig      |  20 +++
 drivers/misc/rp1/Makefile     |   3 +
 drivers/misc/rp1/rp1-pci.dtso |  25 ++++
 drivers/misc/rp1/rp1_pci.c    | 333 ++++++++++++++++++++++++++++++++++++++++++
 drivers/pci/quirks.c          |   1 +
 include/linux/pci_ids.h       |   3 +
 8 files changed, 387 insertions(+)
 create mode 100644 drivers/misc/rp1/Kconfig
 create mode 100644 drivers/misc/rp1/Makefile
 create mode 100644 drivers/misc/rp1/rp1-pci.dtso
 create mode 100644 drivers/misc/rp1/rp1_pci.c

(limited to 'include')

diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index b9ca56930003..b9c11f67315f 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -660,4 +660,5 @@ source "drivers/misc/pvpanic/Kconfig"
 source "drivers/misc/mchp_pci1xxxx/Kconfig"
 source "drivers/misc/keba/Kconfig"
 source "drivers/misc/amd-sbi/Kconfig"
+source "drivers/misc/rp1/Kconfig"
 endmenu
diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
index 917b9a7183aa..e2e66f5f4fb8 100644
--- a/drivers/misc/Makefile
+++ b/drivers/misc/Makefile
@@ -75,3 +75,4 @@ lan966x-pci-objs		+= lan966x_pci.dtbo.o
 obj-$(CONFIG_MCHP_LAN966X_PCI)	+= lan966x-pci.o
 obj-y				+= keba/
 obj-y				+= amd-sbi/
+obj-$(CONFIG_MISC_RP1)		+= rp1/
diff --git a/drivers/misc/rp1/Kconfig b/drivers/misc/rp1/Kconfig
new file mode 100644
index 000000000000..5232e70d3079
--- /dev/null
+++ b/drivers/misc/rp1/Kconfig
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# RaspberryPi RP1 misc device
+#
+
+config MISC_RP1
+	tristate "RaspberryPi RP1 misc device"
+	depends on OF_IRQ && OF_OVERLAY && PCI_MSI && PCI_QUIRKS
+	select PCI_DYNAMIC_OF_NODES
+	help
+	  Support the RP1 peripheral chip found on Raspberry Pi 5 board.
+
+	  This device supports several sub-devices including e.g. Ethernet
+	  controller, USB controller, I2C, SPI and UART.
+
+	  The driver is responsible for enabling the DT node once the PCIe
+	  endpoint has been configured, and handling interrupts.
+
+	  This driver uses an overlay to load other drivers to support the
+	  RP1 internal sub-devices.
diff --git a/drivers/misc/rp1/Makefile b/drivers/misc/rp1/Makefile
new file mode 100644
index 000000000000..508b4cb05627
--- /dev/null
+++ b/drivers/misc/rp1/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_MISC_RP1)		+= rp1-pci.o
+rp1-pci-objs			:= rp1_pci.o rp1-pci.dtbo.o
diff --git a/drivers/misc/rp1/rp1-pci.dtso b/drivers/misc/rp1/rp1-pci.dtso
new file mode 100644
index 000000000000..eea826b36e02
--- /dev/null
+++ b/drivers/misc/rp1/rp1-pci.dtso
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: (GPL-2.0 OR MIT)
+
+/*
+ * The dts overlay is included from the dts directory so
+ * that it can be checked with CHECK_DTBS while also
+ * being compiled from the driver source directory.
+ */
+
+/dts-v1/;
+/plugin/;
+
+/ {
+	fragment@0 {
+		target-path="";
+		__overlay__ {
+			compatible = "pci1de4,1";
+			#address-cells = <3>;
+			#size-cells = <2>;
+			interrupt-controller;
+			#interrupt-cells = <2>;
+
+			#include "arm64/broadcom/rp1-common.dtsi"
+		};
+	};
+};
diff --git a/drivers/misc/rp1/rp1_pci.c b/drivers/misc/rp1/rp1_pci.c
new file mode 100644
index 000000000000..803832006ec8
--- /dev/null
+++ b/drivers/misc/rp1/rp1_pci.c
@@ -0,0 +1,333 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2018-2025 Raspberry Pi Ltd.
+ *
+ * All rights reserved.
+ */
+
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/irqchip/chained_irq.h>
+#include <linux/irqdomain.h>
+#include <linux/module.h>
+#include <linux/msi.h>
+#include <linux/of_platform.h>
+#include <linux/pci.h>
+#include <linux/platform_device.h>
+
+#define RP1_HW_IRQ_MASK		GENMASK(5, 0)
+
+#define REG_SET			0x800
+#define REG_CLR			0xc00
+
+/* MSI-X CFG registers start at 0x8 */
+#define MSIX_CFG(x) (0x8 + (4 * (x)))
+
+#define MSIX_CFG_IACK_EN        BIT(3)
+#define MSIX_CFG_IACK           BIT(2)
+#define MSIX_CFG_ENABLE         BIT(0)
+
+/* Address map */
+#define RP1_PCIE_APBS_BASE	0x108000
+
+/* Interrupts */
+#define RP1_INT_END		61
+
+/* Embedded dtbo symbols created by cmd_wrap_S_dtb in scripts/Makefile.lib */
+extern char __dtbo_rp1_pci_begin[];
+extern char __dtbo_rp1_pci_end[];
+
+struct rp1_dev {
+	struct pci_dev *pdev;
+	struct irq_domain *domain;
+	struct irq_data *pcie_irqds[64];
+	void __iomem *bar1;
+	int ovcs_id;	/* overlay changeset id */
+	bool level_triggered_irq[RP1_INT_END];
+};
+
+static void msix_cfg_set(struct rp1_dev *rp1, unsigned int hwirq, u32 value)
+{
+	iowrite32(value, rp1->bar1 + RP1_PCIE_APBS_BASE + REG_SET + MSIX_CFG(hwirq));
+}
+
+static void msix_cfg_clr(struct rp1_dev *rp1, unsigned int hwirq, u32 value)
+{
+	iowrite32(value, rp1->bar1 + RP1_PCIE_APBS_BASE + REG_CLR + MSIX_CFG(hwirq));
+}
+
+static void rp1_mask_irq(struct irq_data *irqd)
+{
+	struct rp1_dev *rp1 = irqd->domain->host_data;
+	struct irq_data *pcie_irqd = rp1->pcie_irqds[irqd->hwirq];
+
+	pci_msi_mask_irq(pcie_irqd);
+}
+
+static void rp1_unmask_irq(struct irq_data *irqd)
+{
+	struct rp1_dev *rp1 = irqd->domain->host_data;
+	struct irq_data *pcie_irqd = rp1->pcie_irqds[irqd->hwirq];
+
+	pci_msi_unmask_irq(pcie_irqd);
+}
+
+static int rp1_irq_set_type(struct irq_data *irqd, unsigned int type)
+{
+	struct rp1_dev *rp1 = irqd->domain->host_data;
+	unsigned int hwirq = (unsigned int)irqd->hwirq;
+
+	switch (type) {
+	case IRQ_TYPE_LEVEL_HIGH:
+		dev_dbg(&rp1->pdev->dev, "MSIX IACK EN for IRQ %u\n", hwirq);
+		msix_cfg_set(rp1, hwirq, MSIX_CFG_IACK_EN);
+		rp1->level_triggered_irq[hwirq] = true;	/* IACKed by chained handler */
+		break;
+	case IRQ_TYPE_EDGE_RISING:
+		msix_cfg_clr(rp1, hwirq, MSIX_CFG_IACK_EN);
+		rp1->level_triggered_irq[hwirq] = false;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static struct irq_chip rp1_irq_chip = {
+	.name		= "rp1_irq_chip",
+	.irq_mask	= rp1_mask_irq,
+	.irq_unmask	= rp1_unmask_irq,
+	.irq_set_type	= rp1_irq_set_type,
+};
+
+static void rp1_chained_handle_irq(struct irq_desc *desc)
+{
+	unsigned int hwirq = desc->irq_data.hwirq & RP1_HW_IRQ_MASK;
+	struct rp1_dev *rp1 = irq_desc_get_handler_data(desc);
+	struct irq_chip *chip = irq_desc_get_chip(desc);
+	unsigned int virq;
+
+	chained_irq_enter(chip, desc);
+
+	virq = irq_find_mapping(rp1->domain, hwirq);
+	generic_handle_irq(virq);
+	if (rp1->level_triggered_irq[hwirq])
+		msix_cfg_set(rp1, hwirq, MSIX_CFG_IACK);
+
+	chained_irq_exit(chip, desc);
+}
+
+static int rp1_irq_xlate(struct irq_domain *d, struct device_node *node,
+			 const u32 *intspec, unsigned int intsize,
+			 unsigned long *out_hwirq, unsigned int *out_type)
+{
+	struct rp1_dev *rp1 = d->host_data;
+	struct irq_data *pcie_irqd;
+	unsigned long hwirq;
+	int pcie_irq;
+	int ret;
+
+	ret = irq_domain_xlate_twocell(d, node, intspec, intsize,
+				       &hwirq, out_type);
+	if (ret)
+		return ret;
+
+	pcie_irq = pci_irq_vector(rp1->pdev, hwirq);
+	pcie_irqd = irq_get_irq_data(pcie_irq);
+	rp1->pcie_irqds[hwirq] = pcie_irqd;
+	*out_hwirq = hwirq;
+
+	return 0;
+}
+
+static int rp1_irq_activate(struct irq_domain *d, struct irq_data *irqd,
+			    bool reserve)
+{
+	struct rp1_dev *rp1 = d->host_data;
+
+	msix_cfg_set(rp1, (unsigned int)irqd->hwirq, MSIX_CFG_ENABLE);
+
+	return 0;
+}
+
+static void rp1_irq_deactivate(struct irq_domain *d, struct irq_data *irqd)
+{
+	struct rp1_dev *rp1 = d->host_data;
+
+	msix_cfg_clr(rp1, (unsigned int)irqd->hwirq, MSIX_CFG_ENABLE);
+}
+
+static const struct irq_domain_ops rp1_domain_ops = {
+	.xlate      = rp1_irq_xlate,
+	.activate   = rp1_irq_activate,
+	.deactivate = rp1_irq_deactivate,
+};
+
+static void rp1_unregister_interrupts(struct pci_dev *pdev)
+{
+	struct rp1_dev *rp1 = pci_get_drvdata(pdev);
+	int irq, i;
+
+	if (rp1->domain) {
+		for (i = 0; i < RP1_INT_END; i++) {
+			irq = irq_find_mapping(rp1->domain, i);
+			irq_dispose_mapping(irq);
+		}
+
+		irq_domain_remove(rp1->domain);
+	}
+
+	pci_free_irq_vectors(pdev);
+}
+
+static int rp1_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	u32 dtbo_size = __dtbo_rp1_pci_end - __dtbo_rp1_pci_begin;
+	void *dtbo_start = __dtbo_rp1_pci_begin;
+	struct device *dev = &pdev->dev;
+	struct device_node *rp1_node;
+	bool skip_ovl = true;
+	struct rp1_dev *rp1;
+	int err = 0;
+	int i;
+
+	/*
+	 * Either use rp1_nexus node if already present in DT, or
+	 * set a flag to load it from overlay at runtime
+	 */
+	rp1_node = of_find_node_by_name(NULL, "rp1_nexus");
+	if (!rp1_node) {
+		rp1_node = dev_of_node(dev);
+		skip_ovl = false;
+	}
+
+	if (!rp1_node) {
+		dev_err(dev, "Missing of_node for device\n");
+		err = -EINVAL;
+		goto err_put_node;
+	}
+
+	rp1 = devm_kzalloc(&pdev->dev, sizeof(*rp1), GFP_KERNEL);
+	if (!rp1) {
+		err = -ENOMEM;
+		goto err_put_node;
+	}
+
+	rp1->pdev = pdev;
+
+	if (pci_resource_len(pdev, 1) <= 0x10000) {
+		dev_err(&pdev->dev,
+			"Not initialized - is the firmware running?\n");
+		err = -EINVAL;
+		goto err_put_node;
+	}
+
+	err = pcim_enable_device(pdev);
+	if (err < 0) {
+		err = dev_err_probe(&pdev->dev, err,
+				    "Enabling PCI device has failed");
+		goto err_put_node;
+	}
+
+	rp1->bar1 = pcim_iomap(pdev, 1, 0);
+	if (!rp1->bar1) {
+		dev_err(&pdev->dev, "Cannot map PCI BAR\n");
+		err = -EIO;
+		goto err_put_node;
+	}
+
+	pci_set_master(pdev);
+
+	err = pci_alloc_irq_vectors(pdev, RP1_INT_END, RP1_INT_END,
+				    PCI_IRQ_MSIX);
+	if (err < 0) {
+		err = dev_err_probe(&pdev->dev, err,
+				    "Failed to allocate MSI-X vectors\n");
+		goto err_put_node;
+	} else if (err != RP1_INT_END) {
+		dev_err(&pdev->dev, "Cannot allocate enough interrupts\n");
+		err = -EINVAL;
+		goto err_put_node;
+	}
+
+	pci_set_drvdata(pdev, rp1);
+	rp1->domain = irq_domain_add_linear(rp1_node, RP1_INT_END,
+					    &rp1_domain_ops, rp1);
+	if (!rp1->domain) {
+		dev_err(&pdev->dev, "Error creating IRQ domain\n");
+		err = -ENOMEM;
+		goto err_unregister_interrupts;
+	}
+
+	for (i = 0; i < RP1_INT_END; i++) {
+		unsigned int irq = irq_create_mapping(rp1->domain, i);
+
+		if (!irq) {
+			dev_err(&pdev->dev, "Failed to create IRQ mapping\n");
+			err = -EINVAL;
+			goto err_unregister_interrupts;
+		}
+
+		irq_set_chip_and_handler(irq, &rp1_irq_chip, handle_level_irq);
+		irq_set_probe(irq);
+		irq_set_chained_handler_and_data(pci_irq_vector(pdev, i),
+						 rp1_chained_handle_irq, rp1);
+	}
+
+	if (!skip_ovl) {
+		err = of_overlay_fdt_apply(dtbo_start, dtbo_size, &rp1->ovcs_id,
+					   rp1_node);
+		if (err)
+			goto err_unregister_interrupts;
+	}
+
+	err = of_platform_default_populate(rp1_node, NULL, dev);
+	if (err) {
+		dev_err_probe(&pdev->dev, err, "Error populating devicetree\n");
+		goto err_unload_overlay;
+	}
+
+	return 0;
+
+err_unload_overlay:
+	of_overlay_remove(&rp1->ovcs_id);
+err_unregister_interrupts:
+	rp1_unregister_interrupts(pdev);
+err_put_node:
+	if (skip_ovl)
+		of_node_put(rp1_node);
+
+	return err;
+}
+
+static void rp1_remove(struct pci_dev *pdev)
+{
+	struct rp1_dev *rp1 = pci_get_drvdata(pdev);
+	struct device *dev = &pdev->dev;
+
+	of_platform_depopulate(dev);
+	of_overlay_remove(&rp1->ovcs_id);
+	rp1_unregister_interrupts(pdev);
+}
+
+static const struct pci_device_id dev_id_table[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_RPI, PCI_DEVICE_ID_RPI_RP1_C0), },
+	{ }
+};
+MODULE_DEVICE_TABLE(pci, dev_id_table);
+
+static struct pci_driver rp1_driver = {
+	.name		= KBUILD_MODNAME,
+	.id_table	= dev_id_table,
+	.probe		= rp1_probe,
+	.remove		= rp1_remove,
+};
+
+module_pci_driver(rp1_driver);
+
+MODULE_AUTHOR("Phil Elwell <phil@raspberrypi.com>");
+MODULE_AUTHOR("Andrea della Porta <andrea.porta@suse.com>");
+MODULE_DESCRIPTION("RaspberryPi RP1 misc device");
+MODULE_LICENSE("GPL");
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index d7f4ee634263..cf483d82572c 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -6303,6 +6303,7 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_XILINX, 0x5020, of_pci_make_dev_node);
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_XILINX, 0x5021, of_pci_make_dev_node);
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_REDHAT, 0x0005, of_pci_make_dev_node);
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_EFAR, 0x9660, of_pci_make_dev_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_RPI, PCI_DEVICE_ID_RPI_RP1_C0, of_pci_make_dev_node);
 
 /*
  * Devices known to require a longer delay before first config space access
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index e2d71b6fdd84..92ffc4373f6d 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2624,6 +2624,9 @@
 #define PCI_VENDOR_ID_TEKRAM		0x1de1
 #define PCI_DEVICE_ID_TEKRAM_DC290	0xdc29
 
+#define PCI_VENDOR_ID_RPI		0x1de4
+#define PCI_DEVICE_ID_RPI_RP1_C0	0x0001
+
 #define PCI_VENDOR_ID_ALIBABA		0x1ded
 
 #define PCI_VENDOR_ID_CXL		0x1e98
-- 
cgit v1.2.3


From 594902c986e269660302f09df9ec4bf1cf017b77 Mon Sep 17 00:00:00 2001
From: Qinyun Tan <qinyuntan@linux.alibaba.com>
Date: Sat, 31 May 2025 02:20:53 +0800
Subject: x86,fs/resctrl: Remove inappropriate references to cacheinfo in the
 resctrl subsystem

In the resctrl subsystem's Sub-NUMA Cluster (SNC) mode, the rdt_mon_domain
structure representing a NUMA node relies on the cacheinfo interface
(rdt_mon_domain::ci) to store L3 cache information (e.g., shared_cpu_map)
for monitoring. The L3 cache information of a SNC NUMA node determines
which domains are summed for the "top level" L3-scoped events.

rdt_mon_domain::ci is initialized using the first online CPU of a NUMA
node. When this CPU goes offline, its shared_cpu_map is cleared to contain
only the offline CPU itself. Subsequently, attempting to read counters
via smp_call_on_cpu(offline_cpu) fails (and the error is ignored),
returning zero values for "top-level events" without any error indication.

Replace the cacheinfo references in struct rdt_mon_domain and struct
rmid_read with the cacheinfo ID (a unique identifier for the L3 cache).

rdt_domain_hdr::cpu_mask contains the online CPUs associated with that
domain. When reading "top-level events", select a CPU from
rdt_domain_hdr::cpu_mask and utilize its L3 shared_cpu_map to determine
valid CPUs for reading RMID counter via the MSR interface.

Considering all CPUs associated with the L3 cache improves the chances
of picking a housekeeping CPU on which the counter reading work can be
queued, avoiding an unnecessary IPI.

Fixes: 328ea68874642 ("x86/resctrl: Prepare for new Sub-NUMA Cluster (SNC) monitor files")
Signed-off-by: Qinyun Tan <qinyuntan@linux.alibaba.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Tested-by: Tony Luck <tony.luck@intel.com>
Link: https://lore.kernel.org/20250530182053.37502-2-qinyuntan@linux.alibaba.com
---
 arch/x86/kernel/cpu/resctrl/core.c |  6 ++++--
 fs/resctrl/ctrlmondata.c           | 13 +++++++++----
 fs/resctrl/internal.h              |  4 ++--
 fs/resctrl/monitor.c               |  6 ++++--
 fs/resctrl/rdtgroup.c              |  6 +++---
 include/linux/resctrl.h            |  4 ++--
 6 files changed, 24 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 7109cbfcad4f..187d527ef73b 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -498,6 +498,7 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
 	struct rdt_hw_mon_domain *hw_dom;
 	struct rdt_domain_hdr *hdr;
 	struct rdt_mon_domain *d;
+	struct cacheinfo *ci;
 	int err;
 
 	lockdep_assert_held(&domain_list_lock);
@@ -525,12 +526,13 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
 	d = &hw_dom->d_resctrl;
 	d->hdr.id = id;
 	d->hdr.type = RESCTRL_MON_DOMAIN;
-	d->ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
-	if (!d->ci) {
+	ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
+	if (!ci) {
 		pr_warn_once("Can't find L3 cache for CPU:%d resource %s\n", cpu, r->name);
 		mon_domain_free(hw_dom);
 		return;
 	}
+	d->ci_id = ci->id;
 	cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
 
 	arch_mon_domain_online(r, d);
diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c
index 6ed2dfd4dbbd..d98e0d2de09f 100644
--- a/fs/resctrl/ctrlmondata.c
+++ b/fs/resctrl/ctrlmondata.c
@@ -594,9 +594,10 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
 	struct rmid_read rr = {0};
 	struct rdt_mon_domain *d;
 	struct rdtgroup *rdtgrp;
+	int domid, cpu, ret = 0;
 	struct rdt_resource *r;
+	struct cacheinfo *ci;
 	struct mon_data *md;
-	int domid, ret = 0;
 
 	rdtgrp = rdtgroup_kn_lock_live(of->kn);
 	if (!rdtgrp) {
@@ -623,10 +624,14 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
 		 * one that matches this cache id.
 		 */
 		list_for_each_entry(d, &r->mon_domains, hdr.list) {
-			if (d->ci->id == domid) {
-				rr.ci = d->ci;
+			if (d->ci_id == domid) {
+				rr.ci_id = d->ci_id;
+				cpu = cpumask_any(&d->hdr.cpu_mask);
+				ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
+				if (!ci)
+					continue;
 				mon_event_read(&rr, r, NULL, rdtgrp,
-					       &d->ci->shared_cpu_map, evtid, false);
+					       &ci->shared_cpu_map, evtid, false);
 				goto checkresult;
 			}
 		}
diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h
index 9a8cf6f11151..0a1eedba2b03 100644
--- a/fs/resctrl/internal.h
+++ b/fs/resctrl/internal.h
@@ -98,7 +98,7 @@ struct mon_data {
  *	   domains in @r sharing L3 @ci.id
  * @evtid: Which monitor event to read.
  * @first: Initialize MBM counter when true.
- * @ci:    Cacheinfo for L3. Only set when @d is NULL. Used when summing domains.
+ * @ci_id: Cacheinfo id for L3. Only set when @d is NULL. Used when summing domains.
  * @err:   Error encountered when reading counter.
  * @val:   Returned value of event counter. If @rgrp is a parent resource group,
  *	   @val includes the sum of event counts from its child resource groups.
@@ -112,7 +112,7 @@ struct rmid_read {
 	struct rdt_mon_domain	*d;
 	enum resctrl_event_id	evtid;
 	bool			first;
-	struct cacheinfo	*ci;
+	unsigned int		ci_id;
 	int			err;
 	u64			val;
 	void			*arch_mon_ctx;
diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c
index bde2801289d3..f5637855c3ac 100644
--- a/fs/resctrl/monitor.c
+++ b/fs/resctrl/monitor.c
@@ -361,6 +361,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
 {
 	int cpu = smp_processor_id();
 	struct rdt_mon_domain *d;
+	struct cacheinfo *ci;
 	struct mbm_state *m;
 	int err, ret;
 	u64 tval = 0;
@@ -388,7 +389,8 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
 	}
 
 	/* Summing domains that share a cache, must be on a CPU for that cache. */
-	if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map))
+	ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
+	if (!ci || ci->id != rr->ci_id)
 		return -EINVAL;
 
 	/*
@@ -400,7 +402,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
 	 */
 	ret = -EINVAL;
 	list_for_each_entry(d, &rr->r->mon_domains, hdr.list) {
-		if (d->ci->id != rr->ci->id)
+		if (d->ci_id != rr->ci_id)
 			continue;
 		err = resctrl_arch_rmid_read(rr->r, d, closid, rmid,
 					     rr->evtid, &tval, rr->arch_mon_ctx);
diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c
index 1beb124e25f6..77d08229d855 100644
--- a/fs/resctrl/rdtgroup.c
+++ b/fs/resctrl/rdtgroup.c
@@ -3036,7 +3036,7 @@ static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
 	char name[32];
 
 	snc_mode = r->mon_scope == RESCTRL_L3_NODE;
-	sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id);
+	sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci_id : d->hdr.id);
 	if (snc_mode)
 		sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id);
 
@@ -3061,7 +3061,7 @@ static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d,
 		return -EPERM;
 
 	list_for_each_entry(mevt, &r->evt_list, list) {
-		domid = do_sum ? d->ci->id : d->hdr.id;
+		domid = do_sum ? d->ci_id : d->hdr.id;
 		priv = mon_get_kn_priv(r->rid, domid, mevt, do_sum);
 		if (WARN_ON_ONCE(!priv))
 			return -EINVAL;
@@ -3089,7 +3089,7 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
 	lockdep_assert_held(&rdtgroup_mutex);
 
 	snc_mode = r->mon_scope == RESCTRL_L3_NODE;
-	sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id);
+	sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci_id : d->hdr.id);
 	kn = kernfs_find_and_get(parent_kn, name);
 	if (kn) {
 		/*
diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
index 9ba771f2ddea..6fb4894b8cfd 100644
--- a/include/linux/resctrl.h
+++ b/include/linux/resctrl.h
@@ -159,7 +159,7 @@ struct rdt_ctrl_domain {
 /**
  * struct rdt_mon_domain - group of CPUs sharing a resctrl monitor resource
  * @hdr:		common header for different domain types
- * @ci:			cache info for this domain
+ * @ci_id:		cache info id for this domain
  * @rmid_busy_llc:	bitmap of which limbo RMIDs are above threshold
  * @mbm_total:		saved state for MBM total bandwidth
  * @mbm_local:		saved state for MBM local bandwidth
@@ -170,7 +170,7 @@ struct rdt_ctrl_domain {
  */
 struct rdt_mon_domain {
 	struct rdt_domain_hdr		hdr;
-	struct cacheinfo		*ci;
+	unsigned int			ci_id;
 	unsigned long			*rmid_busy_llc;
 	struct mbm_state		*mbm_total;
 	struct mbm_state		*mbm_local;
-- 
cgit v1.2.3


From ae1ae11fb277f1335d6bcd4935ba0ea985af3c32 Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Fri, 13 Jun 2025 15:58:00 -0400
Subject: audit,module: restore audit logging in load failure case

Moving the module sanity check earlier skipped the audit logging call
in the failure case, and moved the check to a place where the previously
used context is unavailable.

Add an audit logging call for the module loading failure case and get
the module name when possible.

Link: https://issues.redhat.com/browse/RHEL-52839
Fixes: 02da2cbab452 ("module: move check_modinfo() early to early_mod_check()")
Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
Reviewed-by: Petr Pavlu <petr.pavlu@suse.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/audit.h | 9 ++++-----
 kernel/audit.h        | 2 +-
 kernel/auditsc.c      | 2 +-
 kernel/module/main.c  | 6 ++++--
 4 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 0050ef288ab3..a394614ccd0b 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -417,7 +417,7 @@ extern int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
 extern void __audit_log_capset(const struct cred *new, const struct cred *old);
 extern void __audit_mmap_fd(int fd, int flags);
 extern void __audit_openat2_how(struct open_how *how);
-extern void __audit_log_kern_module(char *name);
+extern void __audit_log_kern_module(const char *name);
 extern void __audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar);
 extern void __audit_tk_injoffset(struct timespec64 offset);
 extern void __audit_ntp_log(const struct audit_ntp_data *ad);
@@ -519,7 +519,7 @@ static inline void audit_openat2_how(struct open_how *how)
 		__audit_openat2_how(how);
 }
 
-static inline void audit_log_kern_module(char *name)
+static inline void audit_log_kern_module(const char *name)
 {
 	if (!audit_dummy_context())
 		__audit_log_kern_module(name);
@@ -677,9 +677,8 @@ static inline void audit_mmap_fd(int fd, int flags)
 static inline void audit_openat2_how(struct open_how *how)
 { }
 
-static inline void audit_log_kern_module(char *name)
-{
-}
+static inline void audit_log_kern_module(const char *name)
+{ }
 
 static inline void audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar)
 { }
diff --git a/kernel/audit.h b/kernel/audit.h
index 0211cb307d30..2a24d01c5fb0 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -200,7 +200,7 @@ struct audit_context {
 			int			argc;
 		} execve;
 		struct {
-			char			*name;
+			const char		*name;
 		} module;
 		struct {
 			struct audit_ntp_data	ntp_data;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 78fd876a5473..eb98cd6fe91f 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2864,7 +2864,7 @@ void __audit_openat2_how(struct open_how *how)
 	context->type = AUDIT_OPENAT2;
 }
 
-void __audit_log_kern_module(char *name)
+void __audit_log_kern_module(const char *name)
 {
 	struct audit_context *context = audit_context();
 
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 413ac6ea3702..1ac487d2f7c4 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -3368,7 +3368,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
 
 	module_allocated = true;
 
-	audit_log_kern_module(mod->name);
+	audit_log_kern_module(info->name);
 
 	/* Reserve our place in the list. */
 	err = add_unformed_module(mod);
@@ -3532,8 +3532,10 @@ static int load_module(struct load_info *info, const char __user *uargs,
 	 * failures once the proper module was allocated and
 	 * before that.
 	 */
-	if (!module_allocated)
+	if (!module_allocated) {
+		audit_log_kern_module(info->name ? info->name : "?");
 		mod_stat_bump_becoming(info, flags);
+	}
 	free_copy(info, flags);
 	return err;
 }
-- 
cgit v1.2.3


From 3cfbde048b1c0606d0e02ecb0319c8748421bc7c Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 12 Jun 2025 09:46:16 -0400
Subject: net/tcp_ao: tracing: Hide tcp_ao events under CONFIG_TCP_AO

Several of the tcp_ao events are only called when CONFIG_TCP_AO is
defined. As each event can take up to 5K regardless of whether it is
used, it's best not to define them when they are not used. Add #ifdef
CONFIG_TCP_AO around these events so they are only defined when used.

Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20250612094616.4222daf0@batman.local.home
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/trace/events/tcp.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 95f59c1a6f57..54e60c6009e3 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -692,6 +692,7 @@ DEFINE_EVENT(tcp_ao_event, tcp_ao_handshake_failure,
 	TP_ARGS(sk, skb, keyid, rnext, maclen)
 );
 
+#ifdef CONFIG_TCP_AO
 DEFINE_EVENT(tcp_ao_event, tcp_ao_wrong_maclen,
 	TP_PROTO(const struct sock *sk, const struct sk_buff *skb,
 		 const __u8 keyid, const __u8 rnext, const __u8 maclen),
@@ -830,6 +831,7 @@ DEFINE_EVENT(tcp_ao_event_sne, tcp_ao_rcv_sne_update,
 	TP_PROTO(const struct sock *sk, __u32 new_sne),
 	TP_ARGS(sk, new_sne)
 );
+#endif /* CONFIG_TCP_AO */
 
 #endif /* _TRACE_TCP_H */
 
-- 
cgit v1.2.3


From 260948993a9f99428f801dcb40654205e74aaa47 Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Fri, 13 Jun 2025 04:31:30 -0700
Subject: netpoll: remove __netpoll_cleanup from exported API

Since commit 97714695ef90 ("net: netconsole: Defer netpoll cleanup to
avoid lock release during list traversal"), netconsole no longer uses
__netpoll_cleanup(). With no remaining users, remove this function
from the exported netpoll API.

The function remains available internally within netpoll for use by
netpoll_cleanup().

Signed-off-by: Breno Leitao <leitao@debian.org>
Link: https://patch.msgid.link/20250613-rework-v3-1-0752bf2e6912@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netpoll.h | 1 -
 net/core/netpoll.c      | 3 +--
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index 0477208ed9ff..a637e5115254 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -69,7 +69,6 @@ void netpoll_print_options(struct netpoll *np);
 int netpoll_parse_options(struct netpoll *np, char *opt);
 int __netpoll_setup(struct netpoll *np, struct net_device *ndev);
 int netpoll_setup(struct netpoll *np);
-void __netpoll_cleanup(struct netpoll *np);
 void __netpoll_free(struct netpoll *np);
 void netpoll_cleanup(struct netpoll *np);
 void do_netpoll_cleanup(struct netpoll *np);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 4ddb7490df4b..a69c2773841a 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -863,7 +863,7 @@ static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head)
 	kfree(npinfo);
 }
 
-void __netpoll_cleanup(struct netpoll *np)
+static void __netpoll_cleanup(struct netpoll *np)
 {
 	struct netpoll_info *npinfo;
 
@@ -885,7 +885,6 @@ void __netpoll_cleanup(struct netpoll *np)
 
 	skb_pool_flush(np);
 }
-EXPORT_SYMBOL_GPL(__netpoll_cleanup);
 
 void __netpoll_free(struct netpoll *np)
 {
-- 
cgit v1.2.3


From afb023329c07af7a9144901a1dad3a80d9e177b1 Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Fri, 13 Jun 2025 04:31:31 -0700
Subject: netpoll: expose netpoll logging macros in public header

Move np_info(), np_err(), and np_notice() macros from internal
implementation to the public netpoll header file to make them
available for use by netpoll consumers.

These logging macros provide consistent formatting for netpoll-related
messages by automatically prefixing log output with the netpoll instance
name.

The goal is to use the exact same format that is being displayed today,
instead of creating something netconsole-specific.

Signed-off-by: Breno Leitao <leitao@debian.org>
Link: https://patch.msgid.link/20250613-rework-v3-2-0752bf2e6912@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netpoll.h | 7 +++++++
 net/core/netpoll.c      | 7 -------
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index a637e5115254..72086b8a3dec 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -42,6 +42,13 @@ struct netpoll {
 	struct work_struct refill_wq;
 };
 
+#define np_info(np, fmt, ...)				\
+	pr_info("%s: " fmt, np->name, ##__VA_ARGS__)
+#define np_err(np, fmt, ...)				\
+	pr_err("%s: " fmt, np->name, ##__VA_ARGS__)
+#define np_notice(np, fmt, ...)				\
+	pr_notice("%s: " fmt, np->name, ##__VA_ARGS__)
+
 struct netpoll_info {
 	refcount_t refcnt;
 
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index a69c2773841a..9e86026225a3 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -58,13 +58,6 @@ static void zap_completion_queue(void);
 static unsigned int carrier_timeout = 4;
 module_param(carrier_timeout, uint, 0644);
 
-#define np_info(np, fmt, ...)				\
-	pr_info("%s: " fmt, np->name, ##__VA_ARGS__)
-#define np_err(np, fmt, ...)				\
-	pr_err("%s: " fmt, np->name, ##__VA_ARGS__)
-#define np_notice(np, fmt, ...)				\
-	pr_notice("%s: " fmt, np->name, ##__VA_ARGS__)
-
 static netdev_tx_t netpoll_start_xmit(struct sk_buff *skb,
 				      struct net_device *dev,
 				      struct netdev_queue *txq)
-- 
cgit v1.2.3


From 5a34c9a8536511b6bd43d85bb0211077226c6fdb Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Fri, 13 Jun 2025 04:31:32 -0700
Subject: netpoll: relocate netconsole-specific functions to netconsole module

Move netpoll_parse_ip_addr() and netpoll_parse_options() from the generic
netpoll module to the netconsole module where they are actually used.

These functions were originally placed in netpoll but are only consumed by
netconsole. This refactoring improves code organization by:

 - Removing unnecessary exported symbols from netpoll
 - Making netpoll_parse_options() static (no longer needs global visibility)
 - Reducing coupling between netpoll and netconsole modules

The functions remain functionally identical - this is purely a code
reorganization to better reflect their actual usage patterns. Here are
the changes:

 1) Move both functions from netpoll to netconsole
 2) Add static to netpoll_parse_options()
 3) Removed the EXPORT_SYMBOL()

PS: This diff does not change the function format, so it is easy to
review, but checkpatch will not be happy. A follow-up patch will
address the current issues reported by checkpatch.

Signed-off-by: Breno Leitao <leitao@debian.org>
Link: https://patch.msgid.link/20250613-rework-v3-3-0752bf2e6912@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/netconsole.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/netpoll.h  |   1 -
 net/core/netpoll.c       | 109 -----------------------------------------------
 3 files changed, 108 insertions(+), 110 deletions(-)

(limited to 'include')

diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c
index 1eb678e07dd0..bc145e4cf6e7 100644
--- a/drivers/net/netconsole.c
+++ b/drivers/net/netconsole.c
@@ -1659,6 +1659,114 @@ static void write_msg(struct console *con, const char *msg, unsigned int len)
 	spin_unlock_irqrestore(&target_list_lock, flags);
 }
 
+static int netpoll_parse_ip_addr(const char *str, union inet_addr *addr)
+{
+	const char *end;
+
+	if (!strchr(str, ':') &&
+	    in4_pton(str, -1, (void *)addr, -1, &end) > 0) {
+		if (!*end)
+			return 0;
+	}
+	if (in6_pton(str, -1, addr->in6.s6_addr, -1, &end) > 0) {
+#if IS_ENABLED(CONFIG_IPV6)
+		if (!*end)
+			return 1;
+#else
+		return -1;
+#endif
+	}
+	return -1;
+}
+
+static int netpoll_parse_options(struct netpoll *np, char *opt)
+{
+	char *cur=opt, *delim;
+	int ipv6;
+	bool ipversion_set = false;
+
+	if (*cur != '@') {
+		if ((delim = strchr(cur, '@')) == NULL)
+			goto parse_failed;
+		*delim = 0;
+		if (kstrtou16(cur, 10, &np->local_port))
+			goto parse_failed;
+		cur = delim;
+	}
+	cur++;
+
+	if (*cur != '/') {
+		ipversion_set = true;
+		if ((delim = strchr(cur, '/')) == NULL)
+			goto parse_failed;
+		*delim = 0;
+		ipv6 = netpoll_parse_ip_addr(cur, &np->local_ip);
+		if (ipv6 < 0)
+			goto parse_failed;
+		else
+			np->ipv6 = (bool)ipv6;
+		cur = delim;
+	}
+	cur++;
+
+	if (*cur != ',') {
+		/* parse out dev_name or dev_mac */
+		if ((delim = strchr(cur, ',')) == NULL)
+			goto parse_failed;
+		*delim = 0;
+
+		np->dev_name[0] = '\0';
+		eth_broadcast_addr(np->dev_mac);
+		if (!strchr(cur, ':'))
+			strscpy(np->dev_name, cur, sizeof(np->dev_name));
+		else if (!mac_pton(cur, np->dev_mac))
+			goto parse_failed;
+
+		cur = delim;
+	}
+	cur++;
+
+	if (*cur != '@') {
+		/* dst port */
+		if ((delim = strchr(cur, '@')) == NULL)
+			goto parse_failed;
+		*delim = 0;
+		if (*cur == ' ' || *cur == '\t')
+			np_info(np, "warning: whitespace is not allowed\n");
+		if (kstrtou16(cur, 10, &np->remote_port))
+			goto parse_failed;
+		cur = delim;
+	}
+	cur++;
+
+	/* dst ip */
+	if ((delim = strchr(cur, '/')) == NULL)
+		goto parse_failed;
+	*delim = 0;
+	ipv6 = netpoll_parse_ip_addr(cur, &np->remote_ip);
+	if (ipv6 < 0)
+		goto parse_failed;
+	else if (ipversion_set && np->ipv6 != (bool)ipv6)
+		goto parse_failed;
+	else
+		np->ipv6 = (bool)ipv6;
+	cur = delim + 1;
+
+	if (*cur != 0) {
+		/* MAC address */
+		if (!mac_pton(cur, np->remote_mac))
+			goto parse_failed;
+	}
+
+	netpoll_print_options(np);
+
+	return 0;
+
+ parse_failed:
+	np_info(np, "couldn't parse config at '%s'!\n", cur);
+	return -1;
+}
+
 /* Allocate new target (from boot/module param) and setup netpoll for it */
 static struct netconsole_target *alloc_param_target(char *target_config,
 						    int cmdline_count)
diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index 72086b8a3dec..1b8000954e52 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -73,7 +73,6 @@ static inline void netpoll_poll_enable(struct net_device *dev) { return; }
 
 int netpoll_send_udp(struct netpoll *np, const char *msg, int len);
 void netpoll_print_options(struct netpoll *np);
-int netpoll_parse_options(struct netpoll *np, char *opt);
 int __netpoll_setup(struct netpoll *np, struct net_device *ndev);
 int netpoll_setup(struct netpoll *np);
 void __netpoll_free(struct netpoll *np);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 9e86026225a3..d2965c916130 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -510,26 +510,6 @@ void netpoll_print_options(struct netpoll *np)
 }
 EXPORT_SYMBOL(netpoll_print_options);
 
-static int netpoll_parse_ip_addr(const char *str, union inet_addr *addr)
-{
-	const char *end;
-
-	if (!strchr(str, ':') &&
-	    in4_pton(str, -1, (void *)addr, -1, &end) > 0) {
-		if (!*end)
-			return 0;
-	}
-	if (in6_pton(str, -1, addr->in6.s6_addr, -1, &end) > 0) {
-#if IS_ENABLED(CONFIG_IPV6)
-		if (!*end)
-			return 1;
-#else
-		return -1;
-#endif
-	}
-	return -1;
-}
-
 static void skb_pool_flush(struct netpoll *np)
 {
 	struct sk_buff_head *skb_pool;
@@ -539,95 +519,6 @@ static void skb_pool_flush(struct netpoll *np)
 	skb_queue_purge_reason(skb_pool, SKB_CONSUMED);
 }
 
-int netpoll_parse_options(struct netpoll *np, char *opt)
-{
-	char *cur=opt, *delim;
-	int ipv6;
-	bool ipversion_set = false;
-
-	if (*cur != '@') {
-		if ((delim = strchr(cur, '@')) == NULL)
-			goto parse_failed;
-		*delim = 0;
-		if (kstrtou16(cur, 10, &np->local_port))
-			goto parse_failed;
-		cur = delim;
-	}
-	cur++;
-
-	if (*cur != '/') {
-		ipversion_set = true;
-		if ((delim = strchr(cur, '/')) == NULL)
-			goto parse_failed;
-		*delim = 0;
-		ipv6 = netpoll_parse_ip_addr(cur, &np->local_ip);
-		if (ipv6 < 0)
-			goto parse_failed;
-		else
-			np->ipv6 = (bool)ipv6;
-		cur = delim;
-	}
-	cur++;
-
-	if (*cur != ',') {
-		/* parse out dev_name or dev_mac */
-		if ((delim = strchr(cur, ',')) == NULL)
-			goto parse_failed;
-		*delim = 0;
-
-		np->dev_name[0] = '\0';
-		eth_broadcast_addr(np->dev_mac);
-		if (!strchr(cur, ':'))
-			strscpy(np->dev_name, cur, sizeof(np->dev_name));
-		else if (!mac_pton(cur, np->dev_mac))
-			goto parse_failed;
-
-		cur = delim;
-	}
-	cur++;
-
-	if (*cur != '@') {
-		/* dst port */
-		if ((delim = strchr(cur, '@')) == NULL)
-			goto parse_failed;
-		*delim = 0;
-		if (*cur == ' ' || *cur == '\t')
-			np_info(np, "warning: whitespace is not allowed\n");
-		if (kstrtou16(cur, 10, &np->remote_port))
-			goto parse_failed;
-		cur = delim;
-	}
-	cur++;
-
-	/* dst ip */
-	if ((delim = strchr(cur, '/')) == NULL)
-		goto parse_failed;
-	*delim = 0;
-	ipv6 = netpoll_parse_ip_addr(cur, &np->remote_ip);
-	if (ipv6 < 0)
-		goto parse_failed;
-	else if (ipversion_set && np->ipv6 != (bool)ipv6)
-		goto parse_failed;
-	else
-		np->ipv6 = (bool)ipv6;
-	cur = delim + 1;
-
-	if (*cur != 0) {
-		/* MAC address */
-		if (!mac_pton(cur, np->remote_mac))
-			goto parse_failed;
-	}
-
-	netpoll_print_options(np);
-
-	return 0;
-
- parse_failed:
-	np_info(np, "couldn't parse config at '%s'!\n", cur);
-	return -1;
-}
-EXPORT_SYMBOL(netpoll_parse_options);
-
 static void refill_skbs_work_handler(struct work_struct *work)
 {
 	struct netpoll *np =
-- 
cgit v1.2.3


From ccc7edf0ada83d395b634506eff9616360a99b5a Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Fri, 13 Jun 2025 04:31:33 -0700
Subject: netpoll: move netpoll_print_options to netconsole

Move netpoll_print_options() from net/core/netpoll.c to
drivers/net/netconsole.c and make it static. This function is only used
by netconsole, so there's no need to export it or keep it in the public
netpoll API.

This reduces the netpoll API surface and improves code locality
by keeping netconsole-specific functionality within the netconsole
driver.

Signed-off-by: Breno Leitao <leitao@debian.org>
Link: https://patch.msgid.link/20250613-rework-v3-4-0752bf2e6912@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/netconsole.c | 17 +++++++++++++++++
 include/linux/netpoll.h  |  1 -
 net/core/netpoll.c       | 17 -----------------
 3 files changed, 17 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c
index bc145e4cf6e7..71522fb0eeee 100644
--- a/drivers/net/netconsole.c
+++ b/drivers/net/netconsole.c
@@ -278,6 +278,23 @@ static void netconsole_process_cleanups_core(void)
 	mutex_unlock(&target_cleanup_list_lock);
 }
 
+static void netpoll_print_options(struct netpoll *np)
+{
+	np_info(np, "local port %d\n", np->local_port);
+	if (np->ipv6)
+		np_info(np, "local IPv6 address %pI6c\n", &np->local_ip.in6);
+	else
+		np_info(np, "local IPv4 address %pI4\n", &np->local_ip.ip);
+	np_info(np, "interface name '%s'\n", np->dev_name);
+	np_info(np, "local ethernet address '%pM'\n", np->dev_mac);
+	np_info(np, "remote port %d\n", np->remote_port);
+	if (np->ipv6)
+		np_info(np, "remote IPv6 address %pI6c\n", &np->remote_ip.in6);
+	else
+		np_info(np, "remote IPv4 address %pI4\n", &np->remote_ip.ip);
+	np_info(np, "remote ethernet address %pM\n", np->remote_mac);
+}
+
 #ifdef	CONFIG_NETCONSOLE_DYNAMIC
 
 /*
diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index 1b8000954e52..735e65c3cc11 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -72,7 +72,6 @@ static inline void netpoll_poll_enable(struct net_device *dev) { return; }
 #endif
 
 int netpoll_send_udp(struct netpoll *np, const char *msg, int len);
-void netpoll_print_options(struct netpoll *np);
 int __netpoll_setup(struct netpoll *np, struct net_device *ndev);
 int netpoll_setup(struct netpoll *np);
 void __netpoll_free(struct netpoll *np);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index d2965c916130..07c453864a7d 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -492,23 +492,6 @@ int netpoll_send_udp(struct netpoll *np, const char *msg, int len)
 }
 EXPORT_SYMBOL(netpoll_send_udp);
 
-void netpoll_print_options(struct netpoll *np)
-{
-	np_info(np, "local port %d\n", np->local_port);
-	if (np->ipv6)
-		np_info(np, "local IPv6 address %pI6c\n", &np->local_ip.in6);
-	else
-		np_info(np, "local IPv4 address %pI4\n", &np->local_ip.ip);
-	np_info(np, "interface name '%s'\n", np->dev_name);
-	np_info(np, "local ethernet address '%pM'\n", np->dev_mac);
-	np_info(np, "remote port %d\n", np->remote_port);
-	if (np->ipv6)
-		np_info(np, "remote IPv6 address %pI6c\n", &np->remote_ip.in6);
-	else
-		np_info(np, "remote IPv4 address %pI4\n", &np->remote_ip.ip);
-	np_info(np, "remote ethernet address %pM\n", np->remote_mac);
-}
-EXPORT_SYMBOL(netpoll_print_options);
 
 static void skb_pool_flush(struct netpoll *np)
 {
-- 
cgit v1.2.3


From 7768c5f417336fa58dbfef9bb7ecd7eeec6d8886 Mon Sep 17 00:00:00 2001
From: Haiyang Zhang <haiyangz@microsoft.com>
Date: Fri, 13 Jun 2025 10:00:34 -0700
Subject: net: mana: Add handler for hardware servicing events

To collaborate with hardware servicing events, upon receiving the special
EQE notification from the HW channel, remove the devices on this bus.
Then, after a waiting period based on the device specs, rescan the parent
bus to recover the devices.

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Reviewed-by: Shradha Gupta <shradhagupta@linux.microsoft.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/1749834034-18498-1-git-send-email-haiyangz@linux.microsoft.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/microsoft/mana/gdma_main.c | 75 +++++++++++++++++++++++++
 include/net/mana/gdma.h                         | 10 +++-
 2 files changed, 83 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 3504507477c6..069b7a871b78 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -352,11 +352,59 @@ void mana_gd_ring_cq(struct gdma_queue *cq, u8 arm_bit)
 }
 EXPORT_SYMBOL_NS(mana_gd_ring_cq, "NET_MANA");
 
+#define MANA_SERVICE_PERIOD 10
+
+struct mana_serv_work {
+	struct work_struct serv_work;
+	struct pci_dev *pdev;
+};
+
+static void mana_serv_func(struct work_struct *w)
+{
+	struct mana_serv_work *mns_wk;
+	struct pci_bus *bus, *parent;
+	struct pci_dev *pdev;
+
+	mns_wk = container_of(w, struct mana_serv_work, serv_work);
+	pdev = mns_wk->pdev;
+
+	pci_lock_rescan_remove();
+
+	if (!pdev)
+		goto out;
+
+	bus = pdev->bus;
+	if (!bus) {
+		dev_err(&pdev->dev, "MANA service: no bus\n");
+		goto out;
+	}
+
+	parent = bus->parent;
+	if (!parent) {
+		dev_err(&pdev->dev, "MANA service: no parent bus\n");
+		goto out;
+	}
+
+	pci_stop_and_remove_bus_device(bus->self);
+
+	msleep(MANA_SERVICE_PERIOD * 1000);
+
+	pci_rescan_bus(parent);
+
+out:
+	pci_unlock_rescan_remove();
+
+	pci_dev_put(pdev);
+	kfree(mns_wk);
+	module_put(THIS_MODULE);
+}
+
 static void mana_gd_process_eqe(struct gdma_queue *eq)
 {
 	u32 head = eq->head % (eq->queue_size / GDMA_EQE_SIZE);
 	struct gdma_context *gc = eq->gdma_dev->gdma_context;
 	struct gdma_eqe *eq_eqe_ptr = eq->queue_mem_ptr;
+	struct mana_serv_work *mns_wk;
 	union gdma_eqe_info eqe_info;
 	enum gdma_eqe_type type;
 	struct gdma_event event;
@@ -401,6 +449,33 @@ static void mana_gd_process_eqe(struct gdma_queue *eq)
 		eq->eq.callback(eq->eq.context, eq, &event);
 		break;
 
+	case GDMA_EQE_HWC_FPGA_RECONFIG:
+		dev_info(gc->dev, "Recv MANA service type:%d\n", type);
+
+		if (gc->in_service) {
+			dev_info(gc->dev, "Already in service\n");
+			break;
+		}
+
+		if (!try_module_get(THIS_MODULE)) {
+			dev_info(gc->dev, "Module is unloading\n");
+			break;
+		}
+
+		mns_wk = kzalloc(sizeof(*mns_wk), GFP_ATOMIC);
+		if (!mns_wk) {
+			module_put(THIS_MODULE);
+			break;
+		}
+
+		dev_info(gc->dev, "Start MANA service type:%d\n", type);
+		gc->in_service = true;
+		mns_wk->pdev = to_pci_dev(gc->dev);
+		pci_dev_get(mns_wk->pdev);
+		INIT_WORK(&mns_wk->serv_work, mana_serv_func);
+		schedule_work(&mns_wk->serv_work);
+		break;
+
 	default:
 		break;
 	}
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index 3ce56a816425..bfae59202669 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -58,7 +58,7 @@ enum gdma_eqe_type {
 	GDMA_EQE_HWC_INIT_EQ_ID_DB	= 129,
 	GDMA_EQE_HWC_INIT_DATA		= 130,
 	GDMA_EQE_HWC_INIT_DONE		= 131,
-	GDMA_EQE_HWC_SOC_RECONFIG	= 132,
+	GDMA_EQE_HWC_FPGA_RECONFIG	= 132,
 	GDMA_EQE_HWC_SOC_RECONFIG_DATA	= 133,
 	GDMA_EQE_HWC_SOC_SERVICE	= 134,
 	GDMA_EQE_RNIC_QP_FATAL		= 176,
@@ -403,6 +403,8 @@ struct gdma_context {
 	u32			test_event_eq_id;
 
 	bool			is_pf;
+	bool			in_service;
+
 	phys_addr_t		bar0_pa;
 	void __iomem		*bar0_va;
 	void __iomem		*shm_base;
@@ -578,12 +580,16 @@ enum {
 /* Driver can handle holes (zeros) in the device list */
 #define GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP BIT(11)
 
+/* Driver can self reset on FPGA Reconfig EQE notification */
+#define GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE BIT(17)
+
 #define GDMA_DRV_CAP_FLAGS1 \
 	(GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT | \
 	 GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX | \
 	 GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG | \
 	 GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT | \
-	 GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP)
+	 GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP | \
+	 GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE)
 
 #define GDMA_DRV_CAP_FLAGS2 0
 
-- 
cgit v1.2.3


From 25d51ebf0f54f9c2424f28bb29125cf24f120df0 Mon Sep 17 00:00:00 2001
From: Subbaraya Sundeep <sbhatta@marvell.com>
Date: Wed, 11 Jun 2025 16:31:51 +0530
Subject: octeontx2: Set appropriate PF, VF masks and shifts based on silicon

The number of RVU PFs on CN20K silicon has increased to 96 from the
maximum of 32 supported on earlier silicons. Every RVU PF and VF is
identified by HW using a 16-bit PF_FUNC value. Due to the change in the
maximum number of PFs on CN20K, the bit encoding of PF_FUNC has changed.

This patch handles the change by adding helper functions that use a
silicon check to pick the PF, VF masks and shifts for both the new CN20K
silicon and the OcteonTx series. These helper functions are used in
different modules.

Also moved the NIX AF register offset macros to other files which
will be posted in coming patches.

Signed-off-by: Subbaraya Sundeep <sbhatta@marvell.com>
Signed-off-by: Sai Krishna <saikrishnag@marvell.com>
Signed-off-by: Sunil Kovvuri Goutham <sgoutham@marvell.com>
Link: https://patch.msgid.link/1749639716-13868-2-git-send-email-sbhatta@marvell.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/crypto/marvell/octeontx2/otx2_cpt_common.h |  5 +-
 drivers/crypto/marvell/octeontx2/otx2_cptpf_mbox.c | 13 ++---
 .../crypto/marvell/octeontx2/otx2_cptpf_ucode.c    |  4 +-
 drivers/crypto/marvell/octeontx2/otx2_cptvf_mbox.c |  6 +-
 .../net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c |  6 +-
 drivers/net/ethernet/marvell/octeontx2/af/rvu.c    | 30 ++++------
 drivers/net/ethernet/marvell/octeontx2/af/rvu.h    | 52 ++++++++++++++---
 .../net/ethernet/marvell/octeontx2/af/rvu_cgx.c    | 68 +++++++++++-----------
 .../net/ethernet/marvell/octeontx2/af/rvu_cn10k.c  |  4 +-
 .../net/ethernet/marvell/octeontx2/af/rvu_cpt.c    |  4 +-
 .../ethernet/marvell/octeontx2/af/rvu_debugfs.c    | 22 +++----
 .../net/ethernet/marvell/octeontx2/af/rvu_nix.c    | 54 +++++++++--------
 .../net/ethernet/marvell/octeontx2/af/rvu_npc.c    |  8 ++-
 .../ethernet/marvell/octeontx2/af/rvu_npc_hash.c   | 16 ++---
 .../ethernet/marvell/octeontx2/af/rvu_npc_hash.h   |  4 +-
 .../net/ethernet/marvell/octeontx2/af/rvu_rep.c    | 13 ++---
 .../net/ethernet/marvell/octeontx2/af/rvu_sdp.c    | 10 ++--
 .../net/ethernet/marvell/octeontx2/af/rvu_switch.c |  8 +--
 .../ethernet/marvell/octeontx2/nic/cn10k_ipsec.c   |  2 +-
 .../ethernet/marvell/octeontx2/nic/cn10k_ipsec.h   |  2 +-
 .../ethernet/marvell/octeontx2/nic/otx2_common.h   | 11 +---
 .../net/ethernet/marvell/octeontx2/nic/otx2_pf.c   | 21 ++++---
 .../net/ethernet/marvell/octeontx2/nic/otx2_reg.h  | 30 ----------
 .../net/ethernet/marvell/octeontx2/nic/otx2_tc.c   |  3 +-
 drivers/net/ethernet/marvell/octeontx2/nic/rep.c   |  7 ++-
 include/linux/soc/marvell/silicons.h               | 25 ++++++++
 26 files changed, 225 insertions(+), 203 deletions(-)
 create mode 100644 include/linux/soc/marvell/silicons.h

(limited to 'include')

diff --git a/drivers/crypto/marvell/octeontx2/otx2_cpt_common.h b/drivers/crypto/marvell/octeontx2/otx2_cpt_common.h
index d529bcb03775..062def303dce 100644
--- a/drivers/crypto/marvell/octeontx2/otx2_cpt_common.h
+++ b/drivers/crypto/marvell/octeontx2/otx2_cpt_common.h
@@ -18,9 +18,8 @@
 #define OTX2_CPT_MAX_VFS_NUM 128
 #define OTX2_CPT_RVU_FUNC_ADDR_S(blk, slot, offs) \
 		(((blk) << 20) | ((slot) << 12) | (offs))
-#define OTX2_CPT_RVU_PFFUNC(pf, func)	\
-		((((pf) & RVU_PFVF_PF_MASK) << RVU_PFVF_PF_SHIFT) | \
-		(((func) & RVU_PFVF_FUNC_MASK) << RVU_PFVF_FUNC_SHIFT))
+
+#define OTX2_CPT_RVU_PFFUNC(pdev, pf, func) rvu_make_pcifunc(pdev, pf, func)
 
 #define OTX2_CPT_INVALID_CRYPTO_ENG_GRP 0xFF
 #define OTX2_CPT_NAME_LENGTH 64
diff --git a/drivers/crypto/marvell/octeontx2/otx2_cptpf_mbox.c b/drivers/crypto/marvell/octeontx2/otx2_cptpf_mbox.c
index 12c0e966fa65..b4b2d3d1cbc2 100644
--- a/drivers/crypto/marvell/octeontx2/otx2_cptpf_mbox.c
+++ b/drivers/crypto/marvell/octeontx2/otx2_cptpf_mbox.c
@@ -142,7 +142,7 @@ static int send_inline_ipsec_inbound_msg(struct otx2_cptpf_dev *cptpf,
 	memset(req, 0, sizeof(*req));
 	req->hdr.id = MBOX_MSG_CPT_INLINE_IPSEC_CFG;
 	req->hdr.sig = OTX2_MBOX_REQ_SIG;
-	req->hdr.pcifunc = OTX2_CPT_RVU_PFFUNC(cptpf->pf_id, 0);
+	req->hdr.pcifunc = OTX2_CPT_RVU_PFFUNC(cptpf->pdev, cptpf->pf_id, 0);
 	req->dir = CPT_INLINE_INBOUND;
 	req->slot = slot;
 	req->sso_pf_func_ovrd = cptpf->sso_pf_func_ovrd;
@@ -184,7 +184,8 @@ static int rx_inline_ipsec_lf_cfg(struct otx2_cptpf_dev *cptpf, u8 egrp,
 		nix_req->gen_cfg.opcode = cpt_inline_rx_opcode(pdev);
 	nix_req->gen_cfg.param1 = req->param1;
 	nix_req->gen_cfg.param2 = req->param2;
-	nix_req->inst_qsel.cpt_pf_func = OTX2_CPT_RVU_PFFUNC(cptpf->pf_id, 0);
+	nix_req->inst_qsel.cpt_pf_func =
+		OTX2_CPT_RVU_PFFUNC(cptpf->pdev, cptpf->pf_id, 0);
 	nix_req->inst_qsel.cpt_slot = 0;
 	ret = otx2_cpt_send_mbox_msg(&cptpf->afpf_mbox, pdev);
 	if (ret)
@@ -392,9 +393,8 @@ void otx2_cptpf_vfpf_mbox_handler(struct work_struct *work)
 		msg = (struct mbox_msghdr *)(mdev->mbase + offset);
 
 		/* Set which VF sent this message based on mbox IRQ */
-		msg->pcifunc = ((u16)cptpf->pf_id << RVU_PFVF_PF_SHIFT) |
-				((vf->vf_id + 1) & RVU_PFVF_FUNC_MASK);
-
+		msg->pcifunc = rvu_make_pcifunc(cptpf->pdev, cptpf->pf_id,
+						(vf->vf_id + 1));
 		err = cptpf_handle_vf_req(cptpf, vf, msg,
 					  msg->next_msgoff - offset);
 		/*
@@ -469,8 +469,7 @@ static void process_afpf_mbox_msg(struct otx2_cptpf_dev *cptpf,
 
 	switch (msg->id) {
 	case MBOX_MSG_READY:
-		cptpf->pf_id = (msg->pcifunc >> RVU_PFVF_PF_SHIFT) &
-				RVU_PFVF_PF_MASK;
+		cptpf->pf_id = rvu_get_pf(cptpf->pdev, msg->pcifunc);
 		break;
 	case MBOX_MSG_MSIX_OFFSET:
 		rsp_msix = (struct msix_offset_rsp *) msg;
diff --git a/drivers/crypto/marvell/octeontx2/otx2_cptpf_ucode.c b/drivers/crypto/marvell/octeontx2/otx2_cptpf_ucode.c
index 78367849c3d5..7180944ece50 100644
--- a/drivers/crypto/marvell/octeontx2/otx2_cptpf_ucode.c
+++ b/drivers/crypto/marvell/octeontx2/otx2_cptpf_ucode.c
@@ -176,7 +176,9 @@ static int cptx_set_ucode_base(struct otx2_cpt_eng_grp_info *eng_grp,
 	/* Set PF number for microcode fetches */
 	ret = otx2_cpt_write_af_reg(&cptpf->afpf_mbox, cptpf->pdev,
 				    CPT_AF_PF_FUNC,
-				    cptpf->pf_id << RVU_PFVF_PF_SHIFT, blkaddr);
+				    rvu_make_pcifunc(cptpf->pdev,
+						     cptpf->pf_id, 0),
+				    blkaddr);
 	if (ret)
 		return ret;
 
diff --git a/drivers/crypto/marvell/octeontx2/otx2_cptvf_mbox.c b/drivers/crypto/marvell/octeontx2/otx2_cptvf_mbox.c
index 931b72580fd9..92e49babd79a 100644
--- a/drivers/crypto/marvell/octeontx2/otx2_cptvf_mbox.c
+++ b/drivers/crypto/marvell/octeontx2/otx2_cptvf_mbox.c
@@ -189,7 +189,7 @@ int otx2_cptvf_send_eng_grp_num_msg(struct otx2_cptvf_dev *cptvf, int eng_type)
 	}
 	req->hdr.id = MBOX_MSG_GET_ENG_GRP_NUM;
 	req->hdr.sig = OTX2_MBOX_REQ_SIG;
-	req->hdr.pcifunc = OTX2_CPT_RVU_PFFUNC(cptvf->vf_id, 0);
+	req->hdr.pcifunc = OTX2_CPT_RVU_PFFUNC(cptvf->pdev, cptvf->vf_id, 0);
 	req->eng_type = eng_type;
 
 	return otx2_cpt_send_mbox_msg(mbox, pdev);
@@ -210,7 +210,7 @@ int otx2_cptvf_send_kvf_limits_msg(struct otx2_cptvf_dev *cptvf)
 	}
 	req->id = MBOX_MSG_GET_KVF_LIMITS;
 	req->sig = OTX2_MBOX_REQ_SIG;
-	req->pcifunc = OTX2_CPT_RVU_PFFUNC(cptvf->vf_id, 0);
+	req->pcifunc = OTX2_CPT_RVU_PFFUNC(cptvf->pdev, cptvf->vf_id, 0);
 
 	return otx2_cpt_send_mbox_msg(mbox, pdev);
 }
@@ -230,7 +230,7 @@ int otx2_cptvf_send_caps_msg(struct otx2_cptvf_dev *cptvf)
 	}
 	req->id = MBOX_MSG_GET_CAPS;
 	req->sig = OTX2_MBOX_REQ_SIG;
-	req->pcifunc = OTX2_CPT_RVU_PFFUNC(cptvf->vf_id, 0);
+	req->pcifunc = OTX2_CPT_RVU_PFFUNC(cptvf->pdev, cptvf->vf_id, 0);
 
 	return otx2_cpt_send_mbox_msg(mbox, pdev);
 }
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c b/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c
index 0277d226293e..d7030dfa5dad 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c
@@ -97,7 +97,7 @@ int mcs_add_intr_wq_entry(struct mcs *mcs, struct mcs_intr_event *event)
 	if (pcifunc & RVU_PFVF_FUNC_MASK)
 		pfvf = &mcs->vf[rvu_get_hwvf(rvu, pcifunc)];
 	else
-		pfvf = &mcs->pf[rvu_get_pf(pcifunc)];
+		pfvf = &mcs->pf[rvu_get_pf(rvu->pdev, pcifunc)];
 
 	event->intr_mask &= pfvf->intr_mask;
 
@@ -123,7 +123,7 @@ static int mcs_notify_pfvf(struct mcs_intr_event *event, struct rvu *rvu)
 	struct mcs_intr_info *req;
 	int pf;
 
-	pf = rvu_get_pf(event->pcifunc);
+	pf = rvu_get_pf(rvu->pdev, event->pcifunc);
 
 	mutex_lock(&rvu->mbox_lock);
 
@@ -193,7 +193,7 @@ int rvu_mbox_handler_mcs_intr_cfg(struct rvu *rvu,
 	if (pcifunc & RVU_PFVF_FUNC_MASK)
 		pfvf = &mcs->vf[rvu_get_hwvf(rvu, pcifunc)];
 	else
-		pfvf = &mcs->pf[rvu_get_pf(pcifunc)];
+		pfvf = &mcs->pf[rvu_get_pf(rvu->pdev, pcifunc)];
 
 	mcs->pf_map[0] = pcifunc;
 	pfvf->intr_mask = req->intr_mask;
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
index 43eea74bf541..61d80a2277f0 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
@@ -294,7 +294,7 @@ int rvu_get_blkaddr(struct rvu *rvu, int blktype, u16 pcifunc)
 		devnum = rvu_get_hwvf(rvu, pcifunc);
 	} else {
 		is_pf = true;
-		devnum = rvu_get_pf(pcifunc);
+		devnum = rvu_get_pf(rvu->pdev, pcifunc);
 	}
 
 	/* Check if the 'pcifunc' has a NIX LF from 'BLKADDR_NIX0' or
@@ -359,7 +359,7 @@ static void rvu_update_rsrc_map(struct rvu *rvu, struct rvu_pfvf *pfvf,
 		devnum = rvu_get_hwvf(rvu, pcifunc);
 	} else {
 		is_pf = true;
-		devnum = rvu_get_pf(pcifunc);
+		devnum = rvu_get_pf(rvu->pdev, pcifunc);
 	}
 
 	block->fn_map[lf] = attach ? pcifunc : 0;
@@ -400,11 +400,6 @@ static void rvu_update_rsrc_map(struct rvu *rvu, struct rvu_pfvf *pfvf,
 	rvu_write64(rvu, BLKADDR_RVUM, reg | (devnum << 16), num_lfs);
 }
 
-inline int rvu_get_pf(u16 pcifunc)
-{
-	return (pcifunc >> RVU_PFVF_PF_SHIFT) & RVU_PFVF_PF_MASK;
-}
-
 void rvu_get_pf_numvfs(struct rvu *rvu, int pf, int *numvfs, int *hwvf)
 {
 	u64 cfg;
@@ -422,7 +417,7 @@ int rvu_get_hwvf(struct rvu *rvu, int pcifunc)
 	int pf, func;
 	u64 cfg;
 
-	pf = rvu_get_pf(pcifunc);
+	pf = rvu_get_pf(rvu->pdev, pcifunc);
 	func = pcifunc & RVU_PFVF_FUNC_MASK;
 
 	/* Get first HWVF attached to this PF */
@@ -437,7 +432,7 @@ struct rvu_pfvf *rvu_get_pfvf(struct rvu *rvu, int pcifunc)
 	if (pcifunc & RVU_PFVF_FUNC_MASK)
 		return &rvu->hwvf[rvu_get_hwvf(rvu, pcifunc)];
 	else
-		return &rvu->pf[rvu_get_pf(pcifunc)];
+		return &rvu->pf[rvu_get_pf(rvu->pdev, pcifunc)];
 }
 
 static bool is_pf_func_valid(struct rvu *rvu, u16 pcifunc)
@@ -445,7 +440,7 @@ static bool is_pf_func_valid(struct rvu *rvu, u16 pcifunc)
 	int pf, vf, nvfs;
 	u64 cfg;
 
-	pf = rvu_get_pf(pcifunc);
+	pf = rvu_get_pf(rvu->pdev, pcifunc);
 	if (pf >= rvu->hw->total_pfs)
 		return false;
 
@@ -1487,7 +1482,7 @@ int rvu_get_nix_blkaddr(struct rvu *rvu, u16 pcifunc)
 	pf = rvu_get_pfvf(rvu, pcifunc & ~RVU_PFVF_FUNC_MASK);
 
 	/* All CGX mapped PFs are set with assigned NIX block during init */
-	if (is_pf_cgxmapped(rvu, rvu_get_pf(pcifunc))) {
+	if (is_pf_cgxmapped(rvu, rvu_get_pf(rvu->pdev, pcifunc))) {
 		blkaddr = pf->nix_blkaddr;
 	} else if (is_lbk_vf(rvu, pcifunc)) {
 		vf = pcifunc - 1;
@@ -1501,7 +1496,7 @@ int rvu_get_nix_blkaddr(struct rvu *rvu, u16 pcifunc)
 	}
 
 	/* if SDP1 then the blkaddr is NIX1 */
-	if (is_sdp_pfvf(pcifunc) && pf->sdp_info->node_id == 1)
+	if (is_sdp_pfvf(rvu, pcifunc) && pf->sdp_info->node_id == 1)
 		blkaddr = BLKADDR_NIX1;
 
 	switch (blkaddr) {
@@ -2006,7 +2001,7 @@ int rvu_mbox_handler_vf_flr(struct rvu *rvu, struct msg_req *req,
 
 	vf = pcifunc & RVU_PFVF_FUNC_MASK;
 	cfg = rvu_read64(rvu, BLKADDR_RVUM,
-			 RVU_PRIV_PFX_CFG(rvu_get_pf(pcifunc)));
+			 RVU_PRIV_PFX_CFG(rvu_get_pf(rvu->pdev, pcifunc)));
 	numvfs = (cfg >> 12) & 0xFF;
 
 	if (vf && vf <= numvfs)
@@ -2229,9 +2224,8 @@ static void __rvu_mbox_handler(struct rvu_work *mwork, int type, bool poll)
 		/* Set which PF/VF sent this message based on mbox IRQ */
 		switch (type) {
 		case TYPE_AFPF:
-			msg->pcifunc &=
-				~(RVU_PFVF_PF_MASK << RVU_PFVF_PF_SHIFT);
-			msg->pcifunc |= (devid << RVU_PFVF_PF_SHIFT);
+			msg->pcifunc &= rvu_pcifunc_pf_mask(rvu->pdev);
+			msg->pcifunc |= rvu_make_pcifunc(rvu->pdev, devid, 0);
 			break;
 		case TYPE_AFVF:
 			msg->pcifunc &=
@@ -2249,7 +2243,7 @@ static void __rvu_mbox_handler(struct rvu_work *mwork, int type, bool poll)
 		if (msg->pcifunc & RVU_PFVF_FUNC_MASK)
 			dev_warn(rvu->dev, "Error %d when processing message %s (0x%x) from PF%d:VF%d\n",
 				 err, otx2_mbox_id2name(msg->id),
-				 msg->id, rvu_get_pf(msg->pcifunc),
+				 msg->id, rvu_get_pf(rvu->pdev, msg->pcifunc),
 				 (msg->pcifunc & RVU_PFVF_FUNC_MASK) - 1);
 		else
 			dev_warn(rvu->dev, "Error %d when processing message %s (0x%x) from PF%d\n",
@@ -2773,7 +2767,7 @@ static void rvu_flr_handler(struct work_struct *work)
 
 	cfg = rvu_read64(rvu, BLKADDR_RVUM, RVU_PRIV_PFX_CFG(pf));
 	numvfs = (cfg >> 12) & 0xFF;
-	pcifunc  = pf << RVU_PFVF_PF_SHIFT;
+	pcifunc  = rvu_make_pcifunc(rvu->pdev, pf, 0);
 
 	for (vf = 0; vf < numvfs; vf++)
 		__rvu_flr_handler(rvu, (pcifunc | (vf + 1)));
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
index 48f66292ad5c..5c179df1f167 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
@@ -10,6 +10,7 @@
 
 #include <linux/pci.h>
 #include <net/devlink.h>
+#include <linux/soc/marvell/silicons.h>
 
 #include "rvu_struct.h"
 #include "rvu_devlink.h"
@@ -43,10 +44,34 @@
 #define MAX_CPT_BLKS				2
 
 /* PF_FUNC */
-#define RVU_PFVF_PF_SHIFT	10
-#define RVU_PFVF_PF_MASK	0x3F
-#define RVU_PFVF_FUNC_SHIFT	0
-#define RVU_PFVF_FUNC_MASK	0x3FF
+#define RVU_OTX2_PFVF_PF_SHIFT			10
+#define RVU_OTX2_PFVF_PF_MASK			0x3F
+#define RVU_PFVF_FUNC_SHIFT			0
+#define RVU_PFVF_FUNC_MASK			0x3FF
+#define RVU_CN20K_PFVF_PF_SHIFT			9
+#define RVU_CN20K_PFVF_PF_MASK			0x7F
+
+static inline u16 rvu_make_pcifunc(struct pci_dev *pdev, int pf, int func)
+{
+	if (is_cn20k(pdev))
+		return ((pf & RVU_CN20K_PFVF_PF_MASK) <<
+			RVU_CN20K_PFVF_PF_SHIFT) |
+			((func & RVU_PFVF_FUNC_MASK) <<
+			RVU_PFVF_FUNC_SHIFT);
+	else
+		return ((pf & RVU_OTX2_PFVF_PF_MASK) <<
+			RVU_OTX2_PFVF_PF_SHIFT) |
+			((func & RVU_PFVF_FUNC_MASK) <<
+			RVU_PFVF_FUNC_SHIFT);
+}
+
+static inline int rvu_pcifunc_pf_mask(struct pci_dev *pdev)
+{
+	if (is_cn20k(pdev))
+		return ~(RVU_CN20K_PFVF_PF_MASK << RVU_CN20K_PFVF_PF_SHIFT);
+	else
+		return ~(RVU_OTX2_PFVF_PF_MASK << RVU_OTX2_PFVF_PF_SHIFT);
+}
 
 #ifdef CONFIG_DEBUG_FS
 struct dump_ctx {
@@ -836,7 +861,6 @@ int rvu_alloc_rsrc_contig(struct rsrc_bmap *rsrc, int nrsrc);
 void rvu_free_rsrc_contig(struct rsrc_bmap *rsrc, int nrsrc, int start);
 bool rvu_rsrc_check_contig(struct rsrc_bmap *rsrc, int nrsrc);
 u16 rvu_get_rsrc_mapcount(struct rvu_pfvf *pfvf, int blkaddr);
-int rvu_get_pf(u16 pcifunc);
 struct rvu_pfvf *rvu_get_pfvf(struct rvu *rvu, int pcifunc);
 void rvu_get_pf_numvfs(struct rvu *rvu, int pf, int *numvfs, int *hwvf);
 bool is_block_implemented(struct rvu_hwinfo *hw, int blkaddr);
@@ -865,8 +889,8 @@ void rvu_aq_free(struct rvu *rvu, struct admin_queue *aq);
 
 /* SDP APIs */
 int rvu_sdp_init(struct rvu *rvu);
-bool is_sdp_pfvf(u16 pcifunc);
-bool is_sdp_pf(u16 pcifunc);
+bool is_sdp_pfvf(struct rvu *rvu, u16 pcifunc);
+bool is_sdp_pf(struct rvu *rvu, u16 pcifunc);
 bool is_sdp_vf(struct rvu *rvu, u16 pcifunc);
 
 static inline bool is_rep_dev(struct rvu *rvu, u16 pcifunc)
@@ -877,11 +901,21 @@ static inline bool is_rep_dev(struct rvu *rvu, u16 pcifunc)
 	return false;
 }
 
+static inline int rvu_get_pf(struct pci_dev *pdev, u16 pcifunc)
+{
+	if (is_cn20k(pdev))
+		return (pcifunc >> RVU_CN20K_PFVF_PF_SHIFT) &
+			RVU_CN20K_PFVF_PF_MASK;
+	else
+		return (pcifunc >> RVU_OTX2_PFVF_PF_SHIFT) &
+			RVU_OTX2_PFVF_PF_MASK;
+}
+
 /* CGX APIs */
 static inline bool is_pf_cgxmapped(struct rvu *rvu, u8 pf)
 {
 	return (pf >= PF_CGXMAP_BASE && pf <= rvu->cgx_mapped_pfs) &&
-		!is_sdp_pf(pf << RVU_PFVF_PF_SHIFT);
+		!is_sdp_pf(rvu, rvu_make_pcifunc(rvu->pdev, pf, 0));
 }
 
 static inline void rvu_get_cgx_lmac_id(u8 map, u8 *cgx_id, u8 *lmac_id)
@@ -893,7 +927,7 @@ static inline void rvu_get_cgx_lmac_id(u8 map, u8 *cgx_id, u8 *lmac_id)
 static inline bool is_cgx_vf(struct rvu *rvu, u16 pcifunc)
 {
 	return ((pcifunc & RVU_PFVF_FUNC_MASK) &&
-		is_pf_cgxmapped(rvu, rvu_get_pf(pcifunc)));
+		is_pf_cgxmapped(rvu, rvu_get_pf(rvu->pdev, pcifunc)));
 }
 
 #define M(_name, _id, fn_name, req, rsp)				\
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c
index d0331b0e0bfd..b79db887ab9b 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c
@@ -457,7 +457,7 @@ int rvu_cgx_exit(struct rvu *rvu)
 inline bool is_cgx_config_permitted(struct rvu *rvu, u16 pcifunc)
 {
 	if ((pcifunc & RVU_PFVF_FUNC_MASK) ||
-	    !is_pf_cgxmapped(rvu, rvu_get_pf(pcifunc)))
+	    !is_pf_cgxmapped(rvu, rvu_get_pf(rvu->pdev, pcifunc)))
 		return false;
 	return true;
 }
@@ -484,7 +484,7 @@ void rvu_cgx_enadis_rx_bp(struct rvu *rvu, int pf, bool enable)
 
 int rvu_cgx_config_rxtx(struct rvu *rvu, u16 pcifunc, bool start)
 {
-	int pf = rvu_get_pf(pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, pcifunc);
 	struct mac_ops *mac_ops;
 	u8 cgx_id, lmac_id;
 	void *cgxd;
@@ -501,7 +501,7 @@ int rvu_cgx_config_rxtx(struct rvu *rvu, u16 pcifunc, bool start)
 
 int rvu_cgx_tx_enable(struct rvu *rvu, u16 pcifunc, bool enable)
 {
-	int pf = rvu_get_pf(pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, pcifunc);
 	struct mac_ops *mac_ops;
 	u8 cgx_id, lmac_id;
 	void *cgxd;
@@ -526,7 +526,7 @@ int rvu_cgx_config_tx(void *cgxd, int lmac_id, bool enable)
 
 void rvu_cgx_disable_dmac_entries(struct rvu *rvu, u16 pcifunc)
 {
-	int pf = rvu_get_pf(pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, pcifunc);
 	int i = 0, lmac_count = 0;
 	struct mac_ops *mac_ops;
 	u8 max_dmac_filters;
@@ -577,7 +577,7 @@ int rvu_mbox_handler_cgx_stop_rxtx(struct rvu *rvu, struct msg_req *req,
 static int rvu_lmac_get_stats(struct rvu *rvu, struct msg_req *req,
 			      void *rsp)
 {
-	int pf = rvu_get_pf(req->hdr.pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, req->hdr.pcifunc);
 	struct mac_ops *mac_ops;
 	int stat = 0, err = 0;
 	u64 tx_stat, rx_stat;
@@ -633,7 +633,7 @@ int rvu_mbox_handler_rpm_stats(struct rvu *rvu, struct msg_req *req,
 int rvu_mbox_handler_cgx_stats_rst(struct rvu *rvu, struct msg_req *req,
 				   struct msg_rsp *rsp)
 {
-	int pf = rvu_get_pf(req->hdr.pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, req->hdr.pcifunc);
 	struct rvu_pfvf	*parent_pf;
 	struct mac_ops *mac_ops;
 	u8 cgx_idx, lmac;
@@ -663,7 +663,7 @@ int rvu_mbox_handler_cgx_fec_stats(struct rvu *rvu,
 				   struct msg_req *req,
 				   struct cgx_fec_stats_rsp *rsp)
 {
-	int pf = rvu_get_pf(req->hdr.pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, req->hdr.pcifunc);
 	struct mac_ops *mac_ops;
 	u8 cgx_idx, lmac;
 	void *cgxd;
@@ -681,7 +681,7 @@ int rvu_mbox_handler_cgx_mac_addr_set(struct rvu *rvu,
 				      struct cgx_mac_addr_set_or_get *req,
 				      struct cgx_mac_addr_set_or_get *rsp)
 {
-	int pf = rvu_get_pf(req->hdr.pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, req->hdr.pcifunc);
 	u8 cgx_id, lmac_id;
 
 	if (!is_cgx_config_permitted(rvu, req->hdr.pcifunc))
@@ -701,7 +701,7 @@ int rvu_mbox_handler_cgx_mac_addr_add(struct rvu *rvu,
 				      struct cgx_mac_addr_add_req *req,
 				      struct cgx_mac_addr_add_rsp *rsp)
 {
-	int pf = rvu_get_pf(req->hdr.pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, req->hdr.pcifunc);
 	u8 cgx_id, lmac_id;
 	int rc = 0;
 
@@ -725,7 +725,7 @@ int rvu_mbox_handler_cgx_mac_addr_del(struct rvu *rvu,
 				      struct cgx_mac_addr_del_req *req,
 				      struct msg_rsp *rsp)
 {
-	int pf = rvu_get_pf(req->hdr.pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, req->hdr.pcifunc);
 	u8 cgx_id, lmac_id;
 
 	if (!is_cgx_config_permitted(rvu, req->hdr.pcifunc))
@@ -743,7 +743,7 @@ int rvu_mbox_handler_cgx_mac_max_entries_get(struct rvu *rvu,
 					     struct cgx_max_dmac_entries_get_rsp
 					     *rsp)
 {
-	int pf = rvu_get_pf(req->hdr.pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, req->hdr.pcifunc);
 	u8 cgx_id, lmac_id;
 
 	/* If msg is received from PFs(which are not mapped to CGX LMACs)
@@ -769,7 +769,7 @@ int rvu_mbox_handler_cgx_mac_addr_get(struct rvu *rvu,
 				      struct cgx_mac_addr_set_or_get *req,
 				      struct cgx_mac_addr_set_or_get *rsp)
 {
-	int pf = rvu_get_pf(req->hdr.pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, req->hdr.pcifunc);
 	u8 cgx_id, lmac_id;
 	int rc = 0;
 	u64 cfg;
@@ -790,7 +790,7 @@ int rvu_mbox_handler_cgx_promisc_enable(struct rvu *rvu, struct msg_req *req,
 					struct msg_rsp *rsp)
 {
 	u16 pcifunc = req->hdr.pcifunc;
-	int pf = rvu_get_pf(pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, pcifunc);
 	u8 cgx_id, lmac_id;
 
 	if (!is_cgx_config_permitted(rvu, req->hdr.pcifunc))
@@ -809,7 +809,7 @@ int rvu_mbox_handler_cgx_promisc_enable(struct rvu *rvu, struct msg_req *req,
 int rvu_mbox_handler_cgx_promisc_disable(struct rvu *rvu, struct msg_req *req,
 					 struct msg_rsp *rsp)
 {
-	int pf = rvu_get_pf(req->hdr.pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, req->hdr.pcifunc);
 	u8 cgx_id, lmac_id;
 
 	if (!is_cgx_config_permitted(rvu, req->hdr.pcifunc))
@@ -828,7 +828,7 @@ int rvu_mbox_handler_cgx_promisc_disable(struct rvu *rvu, struct msg_req *req,
 static int rvu_cgx_ptp_rx_cfg(struct rvu *rvu, u16 pcifunc, bool enable)
 {
 	struct rvu_pfvf *pfvf = rvu_get_pfvf(rvu, pcifunc);
-	int pf = rvu_get_pf(pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, pcifunc);
 	struct mac_ops *mac_ops;
 	u8 cgx_id, lmac_id;
 	void *cgxd;
@@ -864,7 +864,7 @@ static int rvu_cgx_ptp_rx_cfg(struct rvu *rvu, u16 pcifunc, bool enable)
 int rvu_mbox_handler_cgx_ptp_rx_enable(struct rvu *rvu, struct msg_req *req,
 				       struct msg_rsp *rsp)
 {
-	if (!is_pf_cgxmapped(rvu, rvu_get_pf(req->hdr.pcifunc)))
+	if (!is_pf_cgxmapped(rvu, rvu_get_pf(rvu->pdev, req->hdr.pcifunc)))
 		return -EPERM;
 
 	return rvu_cgx_ptp_rx_cfg(rvu, req->hdr.pcifunc, true);
@@ -878,7 +878,7 @@ int rvu_mbox_handler_cgx_ptp_rx_disable(struct rvu *rvu, struct msg_req *req,
 
 static int rvu_cgx_config_linkevents(struct rvu *rvu, u16 pcifunc, bool en)
 {
-	int pf = rvu_get_pf(pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, pcifunc);
 	u8 cgx_id, lmac_id;
 
 	if (!is_cgx_config_permitted(rvu, pcifunc))
@@ -917,7 +917,7 @@ int rvu_mbox_handler_cgx_get_linkinfo(struct rvu *rvu, struct msg_req *req,
 	u8 cgx_id, lmac_id;
 	int pf, err;
 
-	pf = rvu_get_pf(req->hdr.pcifunc);
+	pf = rvu_get_pf(rvu->pdev, req->hdr.pcifunc);
 
 	if (!is_pf_cgxmapped(rvu, pf))
 		return -ENODEV;
@@ -933,7 +933,7 @@ int rvu_mbox_handler_cgx_features_get(struct rvu *rvu,
 				      struct msg_req *req,
 				      struct cgx_features_info_msg *rsp)
 {
-	int pf = rvu_get_pf(req->hdr.pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, req->hdr.pcifunc);
 	u8 cgx_idx, lmac;
 	void *cgxd;
 
@@ -975,7 +975,7 @@ u32 rvu_cgx_get_lmac_fifolen(struct rvu *rvu, int cgx, int lmac)
 
 static int rvu_cgx_config_intlbk(struct rvu *rvu, u16 pcifunc, bool en)
 {
-	int pf = rvu_get_pf(pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, pcifunc);
 	struct mac_ops *mac_ops;
 	u8 cgx_id, lmac_id;
 
@@ -1005,7 +1005,7 @@ int rvu_mbox_handler_cgx_intlbk_disable(struct rvu *rvu, struct msg_req *req,
 
 int rvu_cgx_cfg_pause_frm(struct rvu *rvu, u16 pcifunc, u8 tx_pause, u8 rx_pause)
 {
-	int pf = rvu_get_pf(pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, pcifunc);
 	u8 rx_pfc = 0, tx_pfc = 0;
 	struct mac_ops *mac_ops;
 	u8 cgx_id, lmac_id;
@@ -1046,7 +1046,7 @@ int rvu_mbox_handler_cgx_cfg_pause_frm(struct rvu *rvu,
 				       struct cgx_pause_frm_cfg *req,
 				       struct cgx_pause_frm_cfg *rsp)
 {
-	int pf = rvu_get_pf(req->hdr.pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, req->hdr.pcifunc);
 	struct mac_ops *mac_ops;
 	u8 cgx_id, lmac_id;
 	int err = 0;
@@ -1073,7 +1073,7 @@ int rvu_mbox_handler_cgx_cfg_pause_frm(struct rvu *rvu,
 int rvu_mbox_handler_cgx_get_phy_fec_stats(struct rvu *rvu, struct msg_req *req,
 					   struct msg_rsp *rsp)
 {
-	int pf = rvu_get_pf(req->hdr.pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, req->hdr.pcifunc);
 	u8 cgx_id, lmac_id;
 
 	if (!is_pf_cgxmapped(rvu, pf))
@@ -1106,7 +1106,7 @@ int rvu_cgx_nix_cuml_stats(struct rvu *rvu, void *cgxd, int lmac_id,
 	/* Assumes LF of a PF and all of its VF belongs to the same
 	 * NIX block
 	 */
-	pcifunc = pf << RVU_PFVF_PF_SHIFT;
+	pcifunc = rvu_make_pcifunc(rvu->pdev, pf, 0);
 	blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NIX, pcifunc);
 	if (blkaddr < 0)
 		return 0;
@@ -1133,10 +1133,10 @@ int rvu_cgx_start_stop_io(struct rvu *rvu, u16 pcifunc, bool start)
 	struct rvu_pfvf *parent_pf, *pfvf;
 	int cgx_users, err = 0;
 
-	if (!is_pf_cgxmapped(rvu, rvu_get_pf(pcifunc)))
+	if (!is_pf_cgxmapped(rvu, rvu_get_pf(rvu->pdev, pcifunc)))
 		return 0;
 
-	parent_pf = &rvu->pf[rvu_get_pf(pcifunc)];
+	parent_pf = &rvu->pf[rvu_get_pf(rvu->pdev, pcifunc)];
 	pfvf = rvu_get_pfvf(rvu, pcifunc);
 
 	mutex_lock(&rvu->cgx_cfg_lock);
@@ -1179,7 +1179,7 @@ int rvu_mbox_handler_cgx_set_fec_param(struct rvu *rvu,
 				       struct fec_mode *req,
 				       struct fec_mode *rsp)
 {
-	int pf = rvu_get_pf(req->hdr.pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, req->hdr.pcifunc);
 	u8 cgx_id, lmac_id;
 
 	if (!is_pf_cgxmapped(rvu, pf))
@@ -1195,7 +1195,7 @@ int rvu_mbox_handler_cgx_set_fec_param(struct rvu *rvu,
 int rvu_mbox_handler_cgx_get_aux_link_info(struct rvu *rvu, struct msg_req *req,
 					   struct cgx_fw_data *rsp)
 {
-	int pf = rvu_get_pf(req->hdr.pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, req->hdr.pcifunc);
 	u8 cgx_id, lmac_id;
 
 	if (!rvu->fwdata)
@@ -1222,7 +1222,7 @@ int rvu_mbox_handler_cgx_set_link_mode(struct rvu *rvu,
 				       struct cgx_set_link_mode_req *req,
 				       struct cgx_set_link_mode_rsp *rsp)
 {
-	int pf = rvu_get_pf(req->hdr.pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, req->hdr.pcifunc);
 	u8 cgx_idx, lmac;
 	void *cgxd;
 
@@ -1238,7 +1238,7 @@ int rvu_mbox_handler_cgx_set_link_mode(struct rvu *rvu,
 int rvu_mbox_handler_cgx_mac_addr_reset(struct rvu *rvu, struct cgx_mac_addr_reset_req *req,
 					struct msg_rsp *rsp)
 {
-	int pf = rvu_get_pf(req->hdr.pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, req->hdr.pcifunc);
 	u8 cgx_id, lmac_id;
 
 	if (!is_cgx_config_permitted(rvu, req->hdr.pcifunc))
@@ -1256,7 +1256,7 @@ int rvu_mbox_handler_cgx_mac_addr_update(struct rvu *rvu,
 					 struct cgx_mac_addr_update_req *req,
 					 struct cgx_mac_addr_update_rsp *rsp)
 {
-	int pf = rvu_get_pf(req->hdr.pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, req->hdr.pcifunc);
 	u8 cgx_id, lmac_id;
 
 	if (!is_cgx_config_permitted(rvu, req->hdr.pcifunc))
@@ -1272,7 +1272,7 @@ int rvu_mbox_handler_cgx_mac_addr_update(struct rvu *rvu,
 int rvu_cgx_prio_flow_ctrl_cfg(struct rvu *rvu, u16 pcifunc, u8 tx_pause,
 			       u8 rx_pause, u16 pfc_en)
 {
-	int pf = rvu_get_pf(pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, pcifunc);
 	u8 rx_8023 = 0, tx_8023 = 0;
 	struct mac_ops *mac_ops;
 	u8 cgx_id, lmac_id;
@@ -1310,7 +1310,7 @@ int rvu_mbox_handler_cgx_prio_flow_ctrl_cfg(struct rvu *rvu,
 					    struct cgx_pfc_cfg *req,
 					    struct cgx_pfc_rsp *rsp)
 {
-	int pf = rvu_get_pf(req->hdr.pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, req->hdr.pcifunc);
 	struct mac_ops *mac_ops;
 	u8 cgx_id, lmac_id;
 	void *cgxd;
@@ -1335,7 +1335,7 @@ int rvu_mbox_handler_cgx_prio_flow_ctrl_cfg(struct rvu *rvu,
 
 void rvu_mac_reset(struct rvu *rvu, u16 pcifunc)
 {
-	int pf = rvu_get_pf(pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, pcifunc);
 	struct mac_ops *mac_ops;
 	struct cgx *cgxd;
 	u8 cgx, lmac;
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cn10k.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cn10k.c
index 4a3370a40dd8..05adc54535eb 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cn10k.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cn10k.c
@@ -66,7 +66,7 @@ static int lmtst_map_table_ops(struct rvu *rvu, u32 index, u64 *val,
 #define LMT_MAP_TBL_W1_OFF  8
 static u32 rvu_get_lmtst_tbl_index(struct rvu *rvu, u16 pcifunc)
 {
-	return ((rvu_get_pf(pcifunc) * LMT_MAX_VFS) +
+	return ((rvu_get_pf(rvu->pdev, pcifunc) * LMT_MAX_VFS) +
 		(pcifunc & RVU_PFVF_FUNC_MASK)) * LMT_MAPTBL_ENTRY_SIZE;
 }
 
@@ -83,7 +83,7 @@ static int rvu_get_lmtaddr(struct rvu *rvu, u16 pcifunc,
 
 	mutex_lock(&rvu->rsrc_lock);
 	rvu_write64(rvu, BLKADDR_RVUM, RVU_AF_SMMU_ADDR_REQ, iova);
-	pf = rvu_get_pf(pcifunc) & RVU_PFVF_PF_MASK;
+	pf = rvu_get_pf(rvu->pdev, pcifunc) & RVU_OTX2_PFVF_PF_MASK;
 	val = BIT_ULL(63) | BIT_ULL(14) | BIT_ULL(13) | pf << 8 |
 	      ((pcifunc & RVU_PFVF_FUNC_MASK) & 0xFF);
 	rvu_write64(rvu, BLKADDR_RVUM, RVU_AF_SMMU_TXN_REQ, val);
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cpt.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cpt.c
index 3c5bbaf12e59..f404117bf6c8 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cpt.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cpt.c
@@ -410,7 +410,7 @@ static bool is_cpt_pf(struct rvu *rvu, u16 pcifunc)
 {
 	int cpt_pf_num = rvu->cpt_pf_num;
 
-	if (rvu_get_pf(pcifunc) != cpt_pf_num)
+	if (rvu_get_pf(rvu->pdev, pcifunc) != cpt_pf_num)
 		return false;
 	if (pcifunc & RVU_PFVF_FUNC_MASK)
 		return false;
@@ -422,7 +422,7 @@ static bool is_cpt_vf(struct rvu *rvu, u16 pcifunc)
 {
 	int cpt_pf_num = rvu->cpt_pf_num;
 
-	if (rvu_get_pf(pcifunc) != cpt_pf_num)
+	if (rvu_get_pf(rvu->pdev, pcifunc) != cpt_pf_num)
 		return false;
 	if (!(pcifunc & RVU_PFVF_FUNC_MASK))
 		return false;
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c
index c827da626471..0c20642f81b9 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c
@@ -688,7 +688,7 @@ static int get_max_column_width(struct rvu *rvu)
 
 	for (pf = 0; pf < rvu->hw->total_pfs; pf++) {
 		for (vf = 0; vf <= rvu->hw->total_vfs; vf++) {
-			pcifunc = pf << 10 | vf;
+			pcifunc = rvu_make_pcifunc(rvu->pdev, pf, vf);
 			if (!pcifunc)
 				continue;
 
@@ -759,7 +759,7 @@ static ssize_t rvu_dbg_rsrc_attach_status(struct file *filp,
 		for (vf = 0; vf <= rvu->hw->total_vfs; vf++) {
 			off = 0;
 			flag = 0;
-			pcifunc = pf << 10 | vf;
+			pcifunc = rvu_make_pcifunc(rvu->pdev, pf, vf);
 			if (!pcifunc)
 				continue;
 
@@ -842,7 +842,7 @@ static int rvu_dbg_rvu_pf_cgx_map_display(struct seq_file *filp, void *unused)
 
 		cgx[0] = 0;
 		lmac[0] = 0;
-		pcifunc = pf << 10;
+		pcifunc = rvu_make_pcifunc(rvu->pdev, pf, 0);
 		pfvf = rvu_get_pfvf(rvu, pcifunc);
 
 		if (pfvf->nix_blkaddr == BLKADDR_NIX0)
@@ -2623,10 +2623,10 @@ static int rvu_dbg_nix_band_prof_ctx_display(struct seq_file *m, void *unused)
 			pcifunc = ipolicer->pfvf_map[idx];
 			if (!(pcifunc & RVU_PFVF_FUNC_MASK))
 				seq_printf(m, "Allocated to :: PF %d\n",
-					   rvu_get_pf(pcifunc));
+					   rvu_get_pf(rvu->pdev, pcifunc));
 			else
 				seq_printf(m, "Allocated to :: PF %d VF %d\n",
-					   rvu_get_pf(pcifunc),
+					   rvu_get_pf(rvu->pdev, pcifunc),
 					   (pcifunc & RVU_PFVF_FUNC_MASK) - 1);
 			print_band_prof_ctx(m, &aq_rsp.prof);
 		}
@@ -2983,10 +2983,10 @@ static void rvu_print_npc_mcam_info(struct seq_file *s,
 
 	if (!(pcifunc & RVU_PFVF_FUNC_MASK))
 		seq_printf(s, "\n\t\t Device \t\t: PF%d\n",
-			   rvu_get_pf(pcifunc));
+			   rvu_get_pf(rvu->pdev, pcifunc));
 	else
 		seq_printf(s, "\n\t\t Device \t\t: PF%d VF%d\n",
-			   rvu_get_pf(pcifunc),
+			   rvu_get_pf(rvu->pdev, pcifunc),
 			   (pcifunc & RVU_PFVF_FUNC_MASK) - 1);
 
 	if (entry_acnt) {
@@ -3049,13 +3049,13 @@ static int rvu_dbg_npc_mcam_info_display(struct seq_file *filp, void *unsued)
 	seq_puts(filp, "\n\t\t Current allocation\n");
 	seq_puts(filp, "\t\t====================\n");
 	for (pf = 0; pf < rvu->hw->total_pfs; pf++) {
-		pcifunc = (pf << RVU_PFVF_PF_SHIFT);
+		pcifunc = rvu_make_pcifunc(rvu->pdev, pf, 0);
 		rvu_print_npc_mcam_info(filp, pcifunc, blkaddr);
 
 		cfg = rvu_read64(rvu, BLKADDR_RVUM, RVU_PRIV_PFX_CFG(pf));
 		numvfs = (cfg >> 12) & 0xFF;
 		for (vf = 0; vf < numvfs; vf++) {
-			pcifunc = (pf << RVU_PFVF_PF_SHIFT) | (vf + 1);
+			pcifunc = rvu_make_pcifunc(rvu->pdev, pf, (vf + 1));
 			rvu_print_npc_mcam_info(filp, pcifunc, blkaddr);
 		}
 	}
@@ -3326,7 +3326,7 @@ static int rvu_dbg_npc_mcam_show_rules(struct seq_file *s, void *unused)
 
 	mutex_lock(&mcam->lock);
 	list_for_each_entry(iter, &mcam->mcam_rules, list) {
-		pf = (iter->owner >> RVU_PFVF_PF_SHIFT) & RVU_PFVF_PF_MASK;
+		pf = rvu_get_pf(rvu->pdev, iter->owner);
 		seq_printf(s, "\n\tInstalled by: PF%d ", pf);
 
 		if (iter->owner & RVU_PFVF_FUNC_MASK) {
@@ -3344,7 +3344,7 @@ static int rvu_dbg_npc_mcam_show_rules(struct seq_file *s, void *unused)
 		rvu_dbg_npc_mcam_show_flows(s, iter);
 		if (is_npc_intf_rx(iter->intf)) {
 			target = iter->rx_action.pf_func;
-			pf = (target >> RVU_PFVF_PF_SHIFT) & RVU_PFVF_PF_MASK;
+			pf = rvu_get_pf(rvu->pdev, target);
 			seq_printf(s, "\tForward to: PF%d ", pf);
 
 			if (target & RVU_PFVF_FUNC_MASK) {
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
index 613655fcd34f..bdf4d852c15d 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
@@ -315,7 +315,8 @@ static bool is_valid_txschq(struct rvu *rvu, int blkaddr,
 	if (lvl >= hw->cap.nix_tx_aggr_lvl) {
 		if ((nix_get_tx_link(rvu, map_func) !=
 		     nix_get_tx_link(rvu, pcifunc)) &&
-		     (rvu_get_pf(map_func) != rvu_get_pf(pcifunc)))
+		     (rvu_get_pf(rvu->pdev, map_func) !=
+				rvu_get_pf(rvu->pdev, pcifunc)))
 			return false;
 		else
 			return true;
@@ -339,7 +340,7 @@ static int nix_interface_init(struct rvu *rvu, u16 pcifunc, int type, int nixlf,
 	bool from_vf;
 	int err;
 
-	pf = rvu_get_pf(pcifunc);
+	pf = rvu_get_pf(rvu->pdev, pcifunc);
 	if (!is_pf_cgxmapped(rvu, pf) && type != NIX_INTF_TYPE_LBK &&
 	    type != NIX_INTF_TYPE_SDP)
 		return 0;
@@ -416,7 +417,7 @@ static int nix_interface_init(struct rvu *rvu, u16 pcifunc, int type, int nixlf,
 		break;
 	case NIX_INTF_TYPE_SDP:
 		from_vf = !!(pcifunc & RVU_PFVF_FUNC_MASK);
-		parent_pf = &rvu->pf[rvu_get_pf(pcifunc)];
+		parent_pf = &rvu->pf[rvu_get_pf(rvu->pdev, pcifunc)];
 		sdp_info = parent_pf->sdp_info;
 		if (!sdp_info) {
 			dev_err(rvu->dev, "Invalid sdp_info pointer\n");
@@ -590,12 +591,12 @@ static int nix_bp_disable(struct rvu *rvu,
 	u16 chan_v;
 	u64 cfg;
 
-	pf = rvu_get_pf(pcifunc);
+	pf = rvu_get_pf(rvu->pdev, pcifunc);
 	type = is_lbk_vf(rvu, pcifunc) ? NIX_INTF_TYPE_LBK : NIX_INTF_TYPE_CGX;
 	if (!is_pf_cgxmapped(rvu, pf) && type != NIX_INTF_TYPE_LBK)
 		return 0;
 
-	if (is_sdp_pfvf(pcifunc))
+	if (is_sdp_pfvf(rvu, pcifunc))
 		type = NIX_INTF_TYPE_SDP;
 
 	if (cpt_link && !rvu->hw->cpt_links)
@@ -736,9 +737,9 @@ static int nix_bp_enable(struct rvu *rvu,
 	u16 chan_v;
 	u64 cfg;
 
-	pf = rvu_get_pf(pcifunc);
+	pf = rvu_get_pf(rvu->pdev, pcifunc);
 	type = is_lbk_vf(rvu, pcifunc) ? NIX_INTF_TYPE_LBK : NIX_INTF_TYPE_CGX;
-	if (is_sdp_pfvf(pcifunc))
+	if (is_sdp_pfvf(rvu, pcifunc))
 		type = NIX_INTF_TYPE_SDP;
 
 	/* Enable backpressure only for CGX mapped PFs and LBK/SDP interface */
@@ -1674,7 +1675,7 @@ int rvu_mbox_handler_nix_lf_alloc(struct rvu *rvu,
 	}
 
 	intf = is_lbk_vf(rvu, pcifunc) ? NIX_INTF_TYPE_LBK : NIX_INTF_TYPE_CGX;
-	if (is_sdp_pfvf(pcifunc))
+	if (is_sdp_pfvf(rvu, pcifunc))
 		intf = NIX_INTF_TYPE_SDP;
 
 	err = nix_interface_init(rvu, pcifunc, intf, nixlf, rsp,
@@ -1798,7 +1799,8 @@ int rvu_mbox_handler_nix_mark_format_cfg(struct rvu *rvu,
 	rc = rvu_nix_reserve_mark_format(rvu, nix_hw, blkaddr, cfg);
 	if (rc < 0) {
 		dev_err(rvu->dev, "No mark_format_ctl for (pf:%d, vf:%d)",
-			rvu_get_pf(pcifunc), pcifunc & RVU_PFVF_FUNC_MASK);
+			rvu_get_pf(rvu->pdev,  pcifunc),
+				   pcifunc & RVU_PFVF_FUNC_MASK);
 		return NIX_AF_ERR_MARK_CFG_FAIL;
 	}
 
@@ -2050,7 +2052,7 @@ static void nix_clear_tx_xoff(struct rvu *rvu, int blkaddr,
 static int nix_get_tx_link(struct rvu *rvu, u16 pcifunc)
 {
 	struct rvu_hwinfo *hw = rvu->hw;
-	int pf = rvu_get_pf(pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, pcifunc);
 	u8 cgx_id = 0, lmac_id = 0;
 
 	if (is_lbk_vf(rvu, pcifunc)) {/* LBK links */
@@ -2068,7 +2070,7 @@ static void nix_get_txschq_range(struct rvu *rvu, u16 pcifunc,
 				 int link, int *start, int *end)
 {
 	struct rvu_hwinfo *hw = rvu->hw;
-	int pf = rvu_get_pf(pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, pcifunc);
 
 	/* LBK links */
 	if (is_lbk_vf(rvu, pcifunc) || is_rep_dev(rvu, pcifunc)) {
@@ -2426,7 +2428,7 @@ static int nix_smq_flush(struct rvu *rvu, int blkaddr,
 {
 	struct nix_smq_flush_ctx *smq_flush_ctx;
 	int err, restore_tx_en = 0, i;
-	int pf = rvu_get_pf(pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, pcifunc);
 	u8 cgx_id = 0, lmac_id = 0;
 	u16 tl2_tl3_link_schq;
 	u8 link, link_level;
@@ -2820,7 +2822,7 @@ void rvu_nix_tx_tl2_cfg(struct rvu *rvu, int blkaddr, u16 pcifunc,
 {
 	struct rvu_hwinfo *hw = rvu->hw;
 	int lbk_link_start, lbk_links;
-	u8 pf = rvu_get_pf(pcifunc);
+	u8 pf = rvu_get_pf(rvu->pdev, pcifunc);
 	int schq;
 	u64 cfg;
 
@@ -3190,7 +3192,8 @@ static int nix_blk_setup_mce(struct rvu *rvu, struct nix_hw *nix_hw,
 	err = rvu_nix_blk_aq_enq_inst(rvu, nix_hw, &aq_req, NULL);
 	if (err) {
 		dev_err(rvu->dev, "Failed to setup Bcast MCE for PF%d:VF%d\n",
-			rvu_get_pf(pcifunc), pcifunc & RVU_PFVF_FUNC_MASK);
+			rvu_get_pf(rvu->pdev, pcifunc),
+				pcifunc & RVU_PFVF_FUNC_MASK);
 		return err;
 	}
 	return 0;
@@ -3458,7 +3461,7 @@ int nix_update_mce_list(struct rvu *rvu, u16 pcifunc,
 		dev_err(rvu->dev,
 			"%s: Idx %d > max MCE idx %d, for PF%d bcast list\n",
 			__func__, idx, mce_list->max,
-			pcifunc >> RVU_PFVF_PF_SHIFT);
+			rvu_get_pf(rvu->pdev, pcifunc));
 		return -EINVAL;
 	}
 
@@ -3510,7 +3513,8 @@ void nix_get_mce_list(struct rvu *rvu, u16 pcifunc, int type,
 	struct rvu_pfvf *pfvf;
 
 	if (!hw->cap.nix_rx_multicast ||
-	    !is_pf_cgxmapped(rvu, rvu_get_pf(pcifunc & ~RVU_PFVF_FUNC_MASK))) {
+	    !is_pf_cgxmapped(rvu, rvu_get_pf(rvu->pdev,
+			     pcifunc & ~RVU_PFVF_FUNC_MASK))) {
 		*mce_list = NULL;
 		*mce_idx = 0;
 		return;
@@ -3544,13 +3548,13 @@ static int nix_update_mce_rule(struct rvu *rvu, u16 pcifunc,
 	int pf;
 
 	/* skip multicast pkt replication for AF's VFs & SDP links */
-	if (is_lbk_vf(rvu, pcifunc) || is_sdp_pfvf(pcifunc))
+	if (is_lbk_vf(rvu, pcifunc) || is_sdp_pfvf(rvu, pcifunc))
 		return 0;
 
 	if (!hw->cap.nix_rx_multicast)
 		return 0;
 
-	pf = rvu_get_pf(pcifunc);
+	pf = rvu_get_pf(rvu->pdev, pcifunc);
 	if (!is_pf_cgxmapped(rvu, pf))
 		return 0;
 
@@ -3619,7 +3623,7 @@ static int nix_setup_mce_tables(struct rvu *rvu, struct nix_hw *nix_hw)
 
 		for (idx = 0; idx < (numvfs + 1); idx++) {
 			/* idx-0 is for PF, followed by VFs */
-			pcifunc = (pf << RVU_PFVF_PF_SHIFT);
+			pcifunc = rvu_make_pcifunc(rvu->pdev, pf, 0);
 			pcifunc |= idx;
 			/* Add dummy entries now, so that we don't have to check
 			 * for whether AQ_OP should be INIT/WRITE later on.
@@ -4554,7 +4558,7 @@ int rvu_mbox_handler_nix_set_rx_mode(struct rvu *rvu, struct nix_rx_mode *req,
 static void nix_find_link_frs(struct rvu *rvu,
 			      struct nix_frs_cfg *req, u16 pcifunc)
 {
-	int pf = rvu_get_pf(pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, pcifunc);
 	struct rvu_pfvf *pfvf;
 	int maxlen, minlen;
 	int numvfs, hwvf;
@@ -4601,7 +4605,7 @@ int rvu_mbox_handler_nix_set_hw_frs(struct rvu *rvu, struct nix_frs_cfg *req,
 {
 	struct rvu_hwinfo *hw = rvu->hw;
 	u16 pcifunc = req->hdr.pcifunc;
-	int pf = rvu_get_pf(pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, pcifunc);
 	int blkaddr, link = -1;
 	struct nix_hw *nix_hw;
 	struct rvu_pfvf *pfvf;
@@ -5251,7 +5255,7 @@ int rvu_mbox_handler_nix_lf_start_rx(struct rvu *rvu, struct msg_req *req,
 
 	rvu_switch_update_rules(rvu, pcifunc, true);
 
-	pf = rvu_get_pf(pcifunc);
+	pf = rvu_get_pf(rvu->pdev, pcifunc);
 	if (is_pf_cgxmapped(rvu, pf) && rvu->rep_mode)
 		rvu_rep_notify_pfvf_state(rvu, pcifunc, true);
 
@@ -5284,7 +5288,7 @@ int rvu_mbox_handler_nix_lf_stop_rx(struct rvu *rvu, struct msg_req *req,
 	rvu_switch_update_rules(rvu, pcifunc, false);
 	rvu_cgx_tx_enable(rvu, pcifunc, true);
 
-	pf = rvu_get_pf(pcifunc);
+	pf = rvu_get_pf(rvu->pdev, pcifunc);
 	if (is_pf_cgxmapped(rvu, pf) && rvu->rep_mode)
 		rvu_rep_notify_pfvf_state(rvu, pcifunc, false);
 	return 0;
@@ -5296,7 +5300,7 @@ void rvu_nix_lf_teardown(struct rvu *rvu, u16 pcifunc, int blkaddr, int nixlf)
 {
 	struct rvu_pfvf *pfvf = rvu_get_pfvf(rvu, pcifunc);
 	struct hwctx_disable_req ctx_req;
-	int pf = rvu_get_pf(pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, pcifunc);
 	struct mac_ops *mac_ops;
 	u8 cgx_id, lmac_id;
 	u64 sa_base;
@@ -5385,7 +5389,7 @@ static int rvu_nix_lf_ptp_tx_cfg(struct rvu *rvu, u16 pcifunc, bool enable)
 	int nixlf;
 	u64 cfg;
 
-	pf = rvu_get_pf(pcifunc);
+	pf = rvu_get_pf(rvu->pdev, pcifunc);
 	if (!is_mac_feature_supported(rvu, pf, RVU_LMAC_FEAT_PTP))
 		return 0;
 
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c
index da15bb451178..c7c70429eb6c 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c
@@ -147,7 +147,9 @@ static int npc_get_ucast_mcam_index(struct npc_mcam *mcam, u16 pcifunc,
 int npc_get_nixlf_mcam_index(struct npc_mcam *mcam,
 			     u16 pcifunc, int nixlf, int type)
 {
-	int pf = rvu_get_pf(pcifunc);
+	struct rvu_hwinfo *hw = container_of(mcam, struct rvu_hwinfo, mcam);
+	struct rvu *rvu = hw->rvu;
+	int pf = rvu_get_pf(rvu->pdev, pcifunc);
 	int index;
 
 	/* Check if this is for a PF */
@@ -698,7 +700,7 @@ void rvu_npc_install_promisc_entry(struct rvu *rvu, u16 pcifunc,
 
 	/* RX_ACTION set to MCAST for CGX PF's */
 	if (hw->cap.nix_rx_multicast && pfvf->use_mce_list &&
-	    is_pf_cgxmapped(rvu, rvu_get_pf(pcifunc))) {
+	    is_pf_cgxmapped(rvu, rvu_get_pf(rvu->pdev, pcifunc))) {
 		*(u64 *)&action = 0;
 		action.op = NIX_RX_ACTIONOP_MCAST;
 		pfvf = rvu_get_pfvf(rvu, pcifunc & ~RVU_PFVF_FUNC_MASK);
@@ -3434,7 +3436,7 @@ int rvu_npc_set_parse_mode(struct rvu *rvu, u16 pcifunc, u64 mode, u8 dir,
 {
 	struct rvu_pfvf *pfvf = rvu_get_pfvf(rvu, pcifunc);
 	int blkaddr, nixlf, rc, intf_mode;
-	int pf = rvu_get_pf(pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, pcifunc);
 	u64 rxpkind, txpkind;
 	u8 cgx_id, lmac_id;
 
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_hash.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_hash.c
index d2661e7fabdb..999f6d93c7fe 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_hash.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_hash.c
@@ -1465,7 +1465,7 @@ static int rvu_npc_exact_update_table_entry(struct rvu *rvu, u8 cgx_id, u8 lmac_
 int rvu_npc_exact_promisc_disable(struct rvu *rvu, u16 pcifunc)
 {
 	struct npc_exact_table *table;
-	int pf = rvu_get_pf(pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, pcifunc);
 	u8 cgx_id, lmac_id;
 	u32 drop_mcam_idx;
 	bool *promisc;
@@ -1512,7 +1512,7 @@ int rvu_npc_exact_promisc_disable(struct rvu *rvu, u16 pcifunc)
 int rvu_npc_exact_promisc_enable(struct rvu *rvu, u16 pcifunc)
 {
 	struct npc_exact_table *table;
-	int pf = rvu_get_pf(pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, pcifunc);
 	u8 cgx_id, lmac_id;
 	u32 drop_mcam_idx;
 	bool *promisc;
@@ -1560,7 +1560,7 @@ int rvu_npc_exact_promisc_enable(struct rvu *rvu, u16 pcifunc)
 int rvu_npc_exact_mac_addr_reset(struct rvu *rvu, struct cgx_mac_addr_reset_req *req,
 				 struct msg_rsp *rsp)
 {
-	int pf = rvu_get_pf(req->hdr.pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, req->hdr.pcifunc);
 	u32 seq_id = req->index;
 	struct rvu_pfvf *pfvf;
 	u8 cgx_id, lmac_id;
@@ -1593,7 +1593,7 @@ int rvu_npc_exact_mac_addr_update(struct rvu *rvu,
 				  struct cgx_mac_addr_update_req *req,
 				  struct cgx_mac_addr_update_rsp *rsp)
 {
-	int pf = rvu_get_pf(req->hdr.pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, req->hdr.pcifunc);
 	struct npc_exact_table_entry *entry;
 	struct npc_exact_table *table;
 	struct rvu_pfvf *pfvf;
@@ -1675,7 +1675,7 @@ int rvu_npc_exact_mac_addr_add(struct rvu *rvu,
 			       struct cgx_mac_addr_add_req *req,
 			       struct cgx_mac_addr_add_rsp *rsp)
 {
-	int pf = rvu_get_pf(req->hdr.pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, req->hdr.pcifunc);
 	struct rvu_pfvf *pfvf;
 	u8 cgx_id, lmac_id;
 	int rc = 0;
@@ -1711,7 +1711,7 @@ int rvu_npc_exact_mac_addr_del(struct rvu *rvu,
 			       struct cgx_mac_addr_del_req *req,
 			       struct msg_rsp *rsp)
 {
-	int pf = rvu_get_pf(req->hdr.pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, req->hdr.pcifunc);
 	int rc;
 
 	rc = rvu_npc_exact_del_table_entry_by_id(rvu, req->index);
@@ -1736,7 +1736,7 @@ int rvu_npc_exact_mac_addr_del(struct rvu *rvu,
 int rvu_npc_exact_mac_addr_set(struct rvu *rvu, struct cgx_mac_addr_set_or_get *req,
 			       struct cgx_mac_addr_set_or_get *rsp)
 {
-	int pf = rvu_get_pf(req->hdr.pcifunc);
+	int pf = rvu_get_pf(rvu->pdev, req->hdr.pcifunc);
 	u32 seq_id = req->index;
 	struct rvu_pfvf *pfvf;
 	u8 cgx_id, lmac_id;
@@ -2001,7 +2001,7 @@ int rvu_npc_exact_init(struct rvu *rvu)
 		}
 
 		/* Filter rules are only for PF */
-		pcifunc = RVU_PFFUNC(i, 0);
+		pcifunc = RVU_PFFUNC(rvu->pdev, i, 0);
 
 		dev_dbg(rvu->dev,
 			"%s:Drop rule cgx=%d lmac=%d chan(val=0x%llx, mask=0x%llx\n",
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_hash.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_hash.h
index 57a09328d46b..cb25cf478f1f 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_hash.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_hash.h
@@ -139,9 +139,7 @@ static struct npc_mcam_kex_hash npc_mkex_hash_default __maybe_unused = {
 #define NPC_MCAM_DROP_RULE_MAX 30
 #define NPC_MCAM_SDP_DROP_RULE_IDX 0
 
-#define RVU_PFFUNC(pf, func)	\
-	((((pf) & RVU_PFVF_PF_MASK) << RVU_PFVF_PF_SHIFT) | \
-	(((func) & RVU_PFVF_FUNC_MASK) << RVU_PFVF_FUNC_SHIFT))
+#define RVU_PFFUNC(pdev, pf, func) rvu_make_pcifunc(pdev, pf, func)
 
 enum npc_exact_opc_type {
 	NPC_EXACT_OPC_MEM,
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_rep.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_rep.c
index 32953cca108c..03099bc570bd 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_rep.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_rep.c
@@ -39,7 +39,7 @@ static int rvu_rep_up_notify(struct rvu *rvu, struct rep_event *event)
 	struct rep_event *msg;
 	int pf;
 
-	pf = rvu_get_pf(event->pcifunc);
+	pf = rvu_get_pf(rvu->pdev, event->pcifunc);
 
 	if (event->event & RVU_EVENT_MAC_ADDR_CHANGE)
 		ether_addr_copy(pfvf->mac_addr, event->evt_data.mac);
@@ -114,10 +114,10 @@ int rvu_rep_notify_pfvf_state(struct rvu *rvu, u16 pcifunc, bool enable)
 	struct rep_event *req;
 	int pf;
 
-	if (!is_pf_cgxmapped(rvu, rvu_get_pf(pcifunc)))
+	if (!is_pf_cgxmapped(rvu, rvu_get_pf(rvu->pdev, pcifunc)))
 		return 0;
 
-	pf = rvu_get_pf(rvu->rep_pcifunc);
+	pf = rvu_get_pf(rvu->pdev, rvu->rep_pcifunc);
 
 	mutex_lock(&rvu->mbox_lock);
 	req = otx2_mbox_alloc_msg_rep_event_up_notify(rvu, pf);
@@ -325,7 +325,7 @@ int rvu_rep_install_mcam_rules(struct rvu *rvu)
 		if (!is_pf_cgxmapped(rvu, pf))
 			continue;
 
-		pcifunc = pf << RVU_PFVF_PF_SHIFT;
+		pcifunc = rvu_make_pcifunc(rvu->pdev, pf, 0);
 		rvu_get_nix_blkaddr(rvu, pcifunc);
 		rep = true;
 		for (i = 0; i < 2; i++) {
@@ -345,8 +345,7 @@ int rvu_rep_install_mcam_rules(struct rvu *rvu)
 
 		rvu_get_pf_numvfs(rvu, pf, &numvfs, NULL);
 		for (vf = 0; vf < numvfs; vf++) {
-			pcifunc = pf << RVU_PFVF_PF_SHIFT |
-				  ((vf + 1) & RVU_PFVF_FUNC_MASK);
+			pcifunc = rvu_make_pcifunc(rvu->pdev, pf, vf + 1);
 			rvu_get_nix_blkaddr(rvu, pcifunc);
 
 			/* Skip installimg rules if nixlf is not attached */
@@ -454,7 +453,7 @@ int rvu_mbox_handler_get_rep_cnt(struct rvu *rvu, struct msg_req *req,
 	for (pf = 0; pf < rvu->hw->total_pfs; pf++) {
 		if (!is_pf_cgxmapped(rvu, pf))
 			continue;
-		pcifunc = pf << RVU_PFVF_PF_SHIFT;
+		pcifunc = rvu_make_pcifunc(rvu->pdev, pf, 0);
 		rvu->rep2pfvf_map[rep] = pcifunc;
 		rsp->rep_pf_map[rep] = pcifunc;
 		rep++;
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_sdp.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_sdp.c
index 38cfe148f4b7..e4a5f9fa6fd4 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_sdp.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_sdp.c
@@ -17,9 +17,9 @@
 /* SDP PF number */
 static int sdp_pf_num[MAX_SDP] = {-1, -1};
 
-bool is_sdp_pfvf(u16 pcifunc)
+bool is_sdp_pfvf(struct rvu *rvu, u16 pcifunc)
 {
-	u16 pf = rvu_get_pf(pcifunc);
+	u16 pf = rvu_get_pf(rvu->pdev, pcifunc);
 	u32 found = 0, i = 0;
 
 	while (i < MAX_SDP) {
@@ -34,9 +34,9 @@ bool is_sdp_pfvf(u16 pcifunc)
 	return true;
 }
 
-bool is_sdp_pf(u16 pcifunc)
+bool is_sdp_pf(struct rvu *rvu, u16 pcifunc)
 {
-	return (is_sdp_pfvf(pcifunc) &&
+	return (is_sdp_pfvf(rvu, pcifunc) &&
 		!(pcifunc & RVU_PFVF_FUNC_MASK));
 }
 
@@ -46,7 +46,7 @@ bool is_sdp_vf(struct rvu *rvu, u16 pcifunc)
 	if (!(pcifunc & ~RVU_PFVF_FUNC_MASK))
 		return (rvu->vf_devid == RVU_SDP_VF_DEVID);
 
-	return (is_sdp_pfvf(pcifunc) &&
+	return (is_sdp_pfvf(rvu, pcifunc) &&
 		!!(pcifunc & RVU_PFVF_FUNC_MASK));
 }
 
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_switch.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_switch.c
index 268efb7c1c15..49ce38685a7e 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_switch.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_switch.c
@@ -93,7 +93,7 @@ static int rvu_switch_install_rules(struct rvu *rvu)
 		if (!is_pf_cgxmapped(rvu, pf))
 			continue;
 
-		pcifunc = pf << 10;
+		pcifunc = rvu_make_pcifunc(rvu->pdev, pf, 0);
 		/* rvu_get_nix_blkaddr sets up the corresponding NIX block
 		 * address and NIX RX and TX interfaces for a pcifunc.
 		 * Generally it is called during attach call of a pcifunc but it
@@ -126,7 +126,7 @@ static int rvu_switch_install_rules(struct rvu *rvu)
 
 		rvu_get_pf_numvfs(rvu, pf, &numvfs, NULL);
 		for (vf = 0; vf < numvfs; vf++) {
-			pcifunc = pf << 10 | ((vf + 1) & 0x3FF);
+			pcifunc = rvu_make_pcifunc(rvu->pdev, pf, (vf + 1));
 			rvu_get_nix_blkaddr(rvu, pcifunc);
 
 			err = rvu_switch_install_rx_rule(rvu, pcifunc, 0x0);
@@ -236,7 +236,7 @@ void rvu_switch_disable(struct rvu *rvu)
 		if (!is_pf_cgxmapped(rvu, pf))
 			continue;
 
-		pcifunc = pf << 10;
+		pcifunc = rvu_make_pcifunc(rvu->pdev, pf, 0);
 		err = rvu_switch_install_rx_rule(rvu, pcifunc, 0xFFF);
 		if (err)
 			dev_err(rvu->dev,
@@ -248,7 +248,7 @@ void rvu_switch_disable(struct rvu *rvu)
 
 		rvu_get_pf_numvfs(rvu, pf, &numvfs, NULL);
 		for (vf = 0; vf < numvfs; vf++) {
-			pcifunc = pf << 10 | ((vf + 1) & 0x3FF);
+			pcifunc = rvu_make_pcifunc(rvu->pdev, pf, (vf + 1));
 			err = rvu_switch_install_rx_rule(rvu, pcifunc, 0xFFF);
 			if (err)
 				dev_err(rvu->dev,
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c
index a6500e3673f2..c691f0722154 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c
@@ -481,7 +481,7 @@ static int cn10k_outb_write_sa(struct otx2_nic *pf, struct qmem *sa_info)
 		goto set_available;
 
 	/* Trigger CTX flush to write dirty data back to DRAM */
-	reg_val = FIELD_PREP(CPT_LF_CTX_FLUSH, sa_iova >> 7);
+	reg_val = FIELD_PREP(CPT_LF_CTX_FLUSH_CPTR, sa_iova >> 7);
 	otx2_write64(pf, CN10K_CPT_LF_CTX_FLUSH, reg_val);
 
 set_available:
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.h b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.h
index 9965df0faa3e..43fbce0d6039 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.h
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.h
@@ -220,7 +220,7 @@ struct cpt_sg_s {
 #define CPT_LF_Q_SIZE_DIV40 GENMASK_ULL(14, 0)
 
 /* CPT LF CTX Flush Register */
-#define CPT_LF_CTX_FLUSH GENMASK_ULL(45, 0)
+#define CPT_LF_CTX_FLUSH_CPTR GENMASK_ULL(45, 0)
 
 #ifdef CONFIG_XFRM_OFFLOAD
 int cn10k_ipsec_init(struct net_device *netdev);
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
index a2a7fc99695d..8ada34a868e9 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
@@ -28,6 +28,7 @@
 #include "otx2_reg.h"
 #include "otx2_txrx.h"
 #include "otx2_devlink.h"
+#include <rvu.h>
 #include <rvu_trace.h>
 #include "qos.h"
 #include "rep.h"
@@ -904,21 +905,11 @@ MBOX_UP_MCS_MESSAGES
 /* Time to wait before watchdog kicks off */
 #define OTX2_TX_TIMEOUT		(100 * HZ)
 
-#define	RVU_PFVF_PF_SHIFT	10
-#define	RVU_PFVF_PF_MASK	0x3F
-#define	RVU_PFVF_FUNC_SHIFT	0
-#define	RVU_PFVF_FUNC_MASK	0x3FF
-
 static inline bool is_otx2_vf(u16 pcifunc)
 {
 	return !!(pcifunc & RVU_PFVF_FUNC_MASK);
 }
 
-static inline int rvu_get_pf(u16 pcifunc)
-{
-	return (pcifunc >> RVU_PFVF_PF_SHIFT) & RVU_PFVF_PF_MASK;
-}
-
 static inline dma_addr_t otx2_dma_map_page(struct otx2_nic *pfvf,
 					   struct page *page,
 					   size_t offset, size_t size,
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
index 07da4d6dbbc9..1dc3e057f52d 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
@@ -206,7 +206,8 @@ static int otx2_register_flr_me_intr(struct otx2_nic *pf, int numvfs)
 
 	/* Register ME interrupt handler*/
 	irq_name = &hw->irq_name[RVU_PF_INT_VEC_VFME0 * NAME_SIZE];
-	snprintf(irq_name, NAME_SIZE, "RVUPF%d_ME0", rvu_get_pf(pf->pcifunc));
+	snprintf(irq_name, NAME_SIZE, "RVUPF%d_ME0",
+		 rvu_get_pf(pf->pdev, pf->pcifunc));
 	ret = request_irq(pci_irq_vector(pf->pdev, RVU_PF_INT_VEC_VFME0),
 			  otx2_pf_me_intr_handler, 0, irq_name, pf);
 	if (ret) {
@@ -216,7 +217,8 @@ static int otx2_register_flr_me_intr(struct otx2_nic *pf, int numvfs)
 
 	/* Register FLR interrupt handler */
 	irq_name = &hw->irq_name[RVU_PF_INT_VEC_VFFLR0 * NAME_SIZE];
-	snprintf(irq_name, NAME_SIZE, "RVUPF%d_FLR0", rvu_get_pf(pf->pcifunc));
+	snprintf(irq_name, NAME_SIZE, "RVUPF%d_FLR0",
+		 rvu_get_pf(pf->pdev, pf->pcifunc));
 	ret = request_irq(pci_irq_vector(pf->pdev, RVU_PF_INT_VEC_VFFLR0),
 			  otx2_pf_flr_intr_handler, 0, irq_name, pf);
 	if (ret) {
@@ -228,7 +230,7 @@ static int otx2_register_flr_me_intr(struct otx2_nic *pf, int numvfs)
 	if (numvfs > 64) {
 		irq_name = &hw->irq_name[RVU_PF_INT_VEC_VFME1 * NAME_SIZE];
 		snprintf(irq_name, NAME_SIZE, "RVUPF%d_ME1",
-			 rvu_get_pf(pf->pcifunc));
+			 rvu_get_pf(pf->pdev, pf->pcifunc));
 		ret = request_irq(pci_irq_vector
 				  (pf->pdev, RVU_PF_INT_VEC_VFME1),
 				  otx2_pf_me_intr_handler, 0, irq_name, pf);
@@ -238,7 +240,7 @@ static int otx2_register_flr_me_intr(struct otx2_nic *pf, int numvfs)
 		}
 		irq_name = &hw->irq_name[RVU_PF_INT_VEC_VFFLR1 * NAME_SIZE];
 		snprintf(irq_name, NAME_SIZE, "RVUPF%d_FLR1",
-			 rvu_get_pf(pf->pcifunc));
+			 rvu_get_pf(pf->pdev, pf->pcifunc));
 		ret = request_irq(pci_irq_vector
 				  (pf->pdev, RVU_PF_INT_VEC_VFFLR1),
 				  otx2_pf_flr_intr_handler, 0, irq_name, pf);
@@ -700,7 +702,7 @@ static int otx2_register_pfvf_mbox_intr(struct otx2_nic *pf, int numvfs)
 	irq_name = &hw->irq_name[RVU_PF_INT_VEC_VFPF_MBOX0 * NAME_SIZE];
 	if (pf->pcifunc)
 		snprintf(irq_name, NAME_SIZE,
-			 "RVUPF%d_VF Mbox0", rvu_get_pf(pf->pcifunc));
+			 "RVUPF%d_VF Mbox0", rvu_get_pf(pf->pdev, pf->pcifunc));
 	else
 		snprintf(irq_name, NAME_SIZE, "RVUPF_VF Mbox0");
 	err = request_irq(pci_irq_vector(pf->pdev, RVU_PF_INT_VEC_VFPF_MBOX0),
@@ -716,7 +718,8 @@ static int otx2_register_pfvf_mbox_intr(struct otx2_nic *pf, int numvfs)
 		irq_name = &hw->irq_name[RVU_PF_INT_VEC_VFPF_MBOX1 * NAME_SIZE];
 		if (pf->pcifunc)
 			snprintf(irq_name, NAME_SIZE,
-				 "RVUPF%d_VF Mbox1", rvu_get_pf(pf->pcifunc));
+				 "RVUPF%d_VF Mbox1",
+				 rvu_get_pf(pf->pdev, pf->pcifunc));
 		else
 			snprintf(irq_name, NAME_SIZE, "RVUPF_VF Mbox1");
 		err = request_irq(pci_irq_vector(pf->pdev,
@@ -1971,7 +1974,7 @@ int otx2_open(struct net_device *netdev)
 	if (err) {
 		dev_err(pf->dev,
 			"RVUPF%d: IRQ registration failed for QERR\n",
-			rvu_get_pf(pf->pcifunc));
+			rvu_get_pf(pf->pdev, pf->pcifunc));
 		goto err_disable_napi;
 	}
 
@@ -1989,7 +1992,7 @@ int otx2_open(struct net_device *netdev)
 		if (name_len >= NAME_SIZE) {
 			dev_err(pf->dev,
 				"RVUPF%d: IRQ registration failed for CQ%d, irq name is too long\n",
-				rvu_get_pf(pf->pcifunc), qidx);
+				rvu_get_pf(pf->pdev, pf->pcifunc), qidx);
 			err = -EINVAL;
 			goto err_free_cints;
 		}
@@ -2000,7 +2003,7 @@ int otx2_open(struct net_device *netdev)
 		if (err) {
 			dev_err(pf->dev,
 				"RVUPF%d: IRQ registration failed for CQ%d\n",
-				rvu_get_pf(pf->pcifunc), qidx);
+				rvu_get_pf(pf->pdev, pf->pcifunc), qidx);
 			goto err_free_cints;
 		}
 		vec++;
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_reg.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_reg.h
index e3aee6e36215..858f084b9d47 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_reg.h
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_reg.h
@@ -138,36 +138,6 @@
 #define	NIX_LF_CINTX_ENA_W1S(a)		(NIX_LFBASE | 0xD40 | (a) << 12)
 #define	NIX_LF_CINTX_ENA_W1C(a)		(NIX_LFBASE | 0xD50 | (a) << 12)
 
-/* NIX AF transmit scheduler registers */
-#define NIX_AF_SMQX_CFG(a)		(0x700 | (u64)(a) << 16)
-#define NIX_AF_TL4X_SDP_LINK_CFG(a)	(0xB10 | (u64)(a) << 16)
-#define NIX_AF_TL1X_SCHEDULE(a)		(0xC00 | (u64)(a) << 16)
-#define NIX_AF_TL1X_CIR(a)		(0xC20 | (u64)(a) << 16)
-#define NIX_AF_TL1X_TOPOLOGY(a)		(0xC80 | (u64)(a) << 16)
-#define NIX_AF_TL2X_PARENT(a)		(0xE88 | (u64)(a) << 16)
-#define NIX_AF_TL2X_SCHEDULE(a)		(0xE00 | (u64)(a) << 16)
-#define NIX_AF_TL2X_TOPOLOGY(a)		(0xE80 | (u64)(a) << 16)
-#define NIX_AF_TL2X_CIR(a)		(0xE20 | (u64)(a) << 16)
-#define NIX_AF_TL2X_PIR(a)		(0xE30 | (u64)(a) << 16)
-#define NIX_AF_TL3X_PARENT(a)		(0x1088 | (u64)(a) << 16)
-#define NIX_AF_TL3X_SCHEDULE(a)		(0x1000 | (u64)(a) << 16)
-#define NIX_AF_TL3X_SHAPE(a)		(0x1010 | (u64)(a) << 16)
-#define NIX_AF_TL3X_CIR(a)		(0x1020 | (u64)(a) << 16)
-#define NIX_AF_TL3X_PIR(a)		(0x1030 | (u64)(a) << 16)
-#define NIX_AF_TL3X_TOPOLOGY(a)		(0x1080 | (u64)(a) << 16)
-#define NIX_AF_TL4X_PARENT(a)		(0x1288 | (u64)(a) << 16)
-#define NIX_AF_TL4X_SCHEDULE(a)		(0x1200 | (u64)(a) << 16)
-#define NIX_AF_TL4X_SHAPE(a)		(0x1210 | (u64)(a) << 16)
-#define NIX_AF_TL4X_CIR(a)		(0x1220 | (u64)(a) << 16)
-#define NIX_AF_TL4X_PIR(a)		(0x1230 | (u64)(a) << 16)
-#define NIX_AF_TL4X_TOPOLOGY(a)		(0x1280 | (u64)(a) << 16)
-#define NIX_AF_MDQX_SCHEDULE(a)		(0x1400 | (u64)(a) << 16)
-#define NIX_AF_MDQX_SHAPE(a)		(0x1410 | (u64)(a) << 16)
-#define NIX_AF_MDQX_CIR(a)		(0x1420 | (u64)(a) << 16)
-#define NIX_AF_MDQX_PIR(a)		(0x1430 | (u64)(a) << 16)
-#define NIX_AF_MDQX_PARENT(a)		(0x1480 | (u64)(a) << 16)
-#define NIX_AF_TL3_TL2X_LINKX_CFG(a, b)	(0x1700 | (u64)(a) << 16 | (b) << 3)
-
 /* LMT LF registers */
 #define LMT_LFBASE			BIT_ULL(RVU_FUNC_BLKADDR_SHIFT)
 #define LMT_LF_LMTLINEX(a)		(LMT_LFBASE | 0x000 | (a) << 12)
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c
index 9a226ca74425..5f80b23c5335 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c
@@ -467,7 +467,8 @@ static int otx2_tc_parse_actions(struct otx2_nic *nic,
 			target = act->dev;
 			if (target->dev.parent) {
 				priv = netdev_priv(target);
-				if (rvu_get_pf(nic->pcifunc) != rvu_get_pf(priv->pcifunc)) {
+				if (rvu_get_pf(nic->pdev, nic->pcifunc) !=
+					rvu_get_pf(nic->pdev, priv->pcifunc)) {
 					NL_SET_ERR_MSG_MOD(extack,
 							   "can't redirect to other pf/vf");
 					return -EOPNOTSUPP;
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/rep.c b/drivers/net/ethernet/marvell/octeontx2/nic/rep.c
index 2cd3da3b6843..25af98034e2e 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/rep.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/rep.c
@@ -244,10 +244,10 @@ static int rvu_rep_devlink_port_register(struct rep_dev *rep)
 
 	if (!(rep->pcifunc & RVU_PFVF_FUNC_MASK)) {
 		attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL;
-		attrs.phys.port_number = rvu_get_pf(rep->pcifunc);
+		attrs.phys.port_number = rvu_get_pf(priv->pdev, rep->pcifunc);
 	} else {
 		attrs.flavour = DEVLINK_PORT_FLAVOUR_PCI_VF;
-		attrs.pci_vf.pf = rvu_get_pf(rep->pcifunc);
+		attrs.pci_vf.pf = rvu_get_pf(priv->pdev, rep->pcifunc);
 		attrs.pci_vf.vf = rep->pcifunc & RVU_PFVF_FUNC_MASK;
 	}
 
@@ -672,7 +672,8 @@ int rvu_rep_create(struct otx2_nic *priv, struct netlink_ext_ack *extack)
 		rep->pcifunc = pcifunc;
 
 		snprintf(ndev->name, sizeof(ndev->name), "Rpf%dvf%d",
-			 rvu_get_pf(pcifunc), (pcifunc & RVU_PFVF_FUNC_MASK));
+			 rvu_get_pf(priv->pdev, pcifunc),
+			 (pcifunc & RVU_PFVF_FUNC_MASK));
 
 		ndev->hw_features = (NETIF_F_RXCSUM | NETIF_F_IP_CSUM |
 			       NETIF_F_IPV6_CSUM | NETIF_F_RXHASH |
diff --git a/include/linux/soc/marvell/silicons.h b/include/linux/soc/marvell/silicons.h
new file mode 100644
index 000000000000..66bb9bfaf17d
--- /dev/null
+++ b/include/linux/soc/marvell/silicons.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0-only
+ * Copyright (C) 2024 Marvell.
+ */
+
+#ifndef __SOC_SILICON_H
+#define __SOC_SILICON_H
+
+#include <linux/types.h>
+#include <linux/pci.h>
+
+#if defined(CONFIG_ARM64)
+
+#define CN20K_CHIPID	0x20
+/*
+ * Silicon check for CN20K family
+ */
+static inline bool is_cn20k(struct pci_dev *pdev)
+{
+	return (pdev->subsystem_device & 0xFF) == CN20K_CHIPID;
+}
+#else
+#define is_cn20k(pdev)		((void)(pdev), 0)
+#endif
+
+#endif /* __SOC_SILICON_H */
-- 
cgit v1.2.3


From 2796ff1e3dcae7a3568f8e428ec9d32a8ee2fb36 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sat, 14 Jun 2025 22:30:43 +0200
Subject: net: phy: add flag is_genphy_driven to struct phy_device

In order to get rid of phy_driver_is_genphy() and
phy_driver_is_genphy_10g(), as a first step add and use a flag
phydev->is_genphy_driven.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/3f3ad6dc-402e-4915-8d5a-2306b6d5562b@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/phy_device.c | 13 +++++++------
 include/linux/phy.h          |  2 ++
 2 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 509078344020..2aae97b2ffd8 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1522,7 +1522,6 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
 	struct mii_bus *bus = phydev->mdio.bus;
 	struct device *d = &phydev->mdio.dev;
 	struct module *ndev_owner = NULL;
-	bool using_genphy = false;
 	int err;
 
 	/* For Ethernet device drivers that register their own MDIO bus, we
@@ -1548,7 +1547,7 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
 		else
 			d->driver = &genphy_driver.mdiodrv.driver;
 
-		using_genphy = true;
+		phydev->is_genphy_driven = 1;
 	}
 
 	if (!try_module_get(d->driver->owner)) {
@@ -1557,7 +1556,7 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
 		goto error_put_device;
 	}
 
-	if (using_genphy) {
+	if (phydev->is_genphy_driven) {
 		err = d->driver->probe(d);
 		if (err >= 0)
 			err = device_bind_driver(d);
@@ -1627,7 +1626,7 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
 	 * the generic PHY driver we can't figure it out, thus set the old
 	 * legacy PORT_MII value.
 	 */
-	if (using_genphy)
+	if (phydev->is_genphy_driven)
 		phydev->port = PORT_MII;
 
 	/* Initial carrier state is off as the phy is about to be
@@ -1666,6 +1665,7 @@ error:
 
 error_module_put:
 	module_put(d->driver->owner);
+	phydev->is_genphy_driven = 0;
 	d->driver = NULL;
 error_put_device:
 	put_device(d);
@@ -1799,9 +1799,10 @@ void phy_detach(struct phy_device *phydev)
 	 * from the generic driver so that there's a chance a
 	 * real driver could be loaded
 	 */
-	if (phy_driver_is_genphy(phydev) ||
-	    phy_driver_is_genphy_10g(phydev))
+	if (phydev->is_genphy_driven) {
 		device_release_driver(&phydev->mdio.dev);
+		phydev->is_genphy_driven = 0;
+	}
 
 	/* Assert the reset signal */
 	phy_device_reset(phydev, 1);
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 8e2e4fcd050e..32ed27f10639 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -528,6 +528,7 @@ struct macsec_ops;
  * @mac_managed_pm: Set true if MAC driver takes of suspending/resuming PHY
  * @wol_enabled: Set to true if the PHY or the attached MAC have Wake-on-LAN
  * 		 enabled.
+ * @is_genphy_driven: PHY is driven by one of the generic PHY drivers
  * @state: State of the PHY for management purposes
  * @dev_flags: Device-specific flags used by the PHY driver.
  *
@@ -631,6 +632,7 @@ struct phy_device {
 	unsigned is_on_sfp_module:1;
 	unsigned mac_managed_pm:1;
 	unsigned wol_enabled:1;
+	unsigned is_genphy_driven:1;
 
 	unsigned autoneg:1;
 	/* The most recently read link state */
-- 
cgit v1.2.3


From 59e74c92e67e2951d829f9b0d78c5dc1df7c4c88 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sat, 14 Jun 2025 22:31:57 +0200
Subject: net: phy: improve phy_driver_is_genphy

Use new flag phydev->is_genphy_driven to simplify this function.
Note that this includes a minor functional change:
Now this function returns true if ANY of the genphy drivers
is bound to the PHY device.

We have only one user in DSA driver mt7530, and there the
functional change doesn't matter.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/c9ac3a7d-262a-425d-9153-97fe3ca6280a@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/phy_device.c |  7 -------
 include/linux/phy.h          | 12 +++++++++++-
 2 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 2aae97b2ffd8..fa0890ebf2ea 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1729,13 +1729,6 @@ static bool phy_driver_is_genphy_kind(struct phy_device *phydev,
 	return ret;
 }
 
-bool phy_driver_is_genphy(struct phy_device *phydev)
-{
-	return phy_driver_is_genphy_kind(phydev,
-					 &genphy_driver.mdiodrv.driver);
-}
-EXPORT_SYMBOL_GPL(phy_driver_is_genphy);
-
 bool phy_driver_is_genphy_10g(struct phy_device *phydev)
 {
 	return phy_driver_is_genphy_kind(phydev,
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 32ed27f10639..97a09e5743ef 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1295,6 +1295,17 @@ static inline bool phy_is_started(struct phy_device *phydev)
 	return phydev->state >= PHY_UP;
 }
 
+/**
+ * phy_driver_is_genphy - Convenience function to check whether PHY is driven
+ *                        by one of the generic PHY drivers
+ * @phydev: The phy_device struct
+ * Return: true if PHY is driven by one of the genphy drivers
+ */
+static inline bool phy_driver_is_genphy(struct phy_device *phydev)
+{
+	return phydev->is_genphy_driven;
+}
+
 /**
  * phy_disable_eee_mode - Don't advertise an EEE mode.
  * @phydev: The phy_device struct
@@ -2097,7 +2108,6 @@ module_exit(phy_module_exit)
 #define module_phy_driver(__phy_drivers)				\
 	phy_module_driver(__phy_drivers, ARRAY_SIZE(__phy_drivers))
 
-bool phy_driver_is_genphy(struct phy_device *phydev);
 bool phy_driver_is_genphy_10g(struct phy_device *phydev);
 
 #endif /* __PHY_H */
-- 
cgit v1.2.3


From 42ed7f7e94da01391d3519ffb5747698d2be0a67 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sat, 14 Jun 2025 22:32:47 +0200
Subject: net: phy: remove phy_driver_is_genphy_10g

Remove now unused function phy_driver_is_genphy_10g().

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/49b0589a-9604-4ee9-add5-28fbbbe2c2f3@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/phy_device.c | 23 -----------------------
 include/linux/phy.h          |  2 --
 2 files changed, 25 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index fa0890ebf2ea..1c3a27b73d7b 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1713,29 +1713,6 @@ struct phy_device *phy_attach(struct net_device *dev, const char *bus_id,
 }
 EXPORT_SYMBOL(phy_attach);
 
-static bool phy_driver_is_genphy_kind(struct phy_device *phydev,
-				      struct device_driver *driver)
-{
-	struct device *d = &phydev->mdio.dev;
-	bool ret = false;
-
-	if (!phydev->drv)
-		return ret;
-
-	get_device(d);
-	ret = d->driver == driver;
-	put_device(d);
-
-	return ret;
-}
-
-bool phy_driver_is_genphy_10g(struct phy_device *phydev)
-{
-	return phy_driver_is_genphy_kind(phydev,
-					 &genphy_c45_driver.mdiodrv.driver);
-}
-EXPORT_SYMBOL_GPL(phy_driver_is_genphy_10g);
-
 /**
  * phy_detach - detach a PHY device from its network device
  * @phydev: target phy_device struct
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 97a09e5743ef..b037aab7b71d 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -2108,6 +2108,4 @@ module_exit(phy_module_exit)
 #define module_phy_driver(__phy_drivers)				\
 	phy_module_driver(__phy_drivers, ARRAY_SIZE(__phy_drivers))
 
-bool phy_driver_is_genphy_10g(struct phy_device *phydev);
-
 #endif /* __PHY_H */
-- 
cgit v1.2.3


From 5da8a8b8090b5f79a816ba016af3a70a9d7287bf Mon Sep 17 00:00:00 2001
From: Shradha Gupta <shradhagupta@linux.microsoft.com>
Date: Wed, 11 Jun 2025 07:10:01 -0700
Subject: PCI/MSI: Export pci_msix_prepare_desc() for dynamic MSI-X allocations

For PCI controllers to support dynamic MSI-X vector allocation, enabling
the flag MSI_FLAG_PCI_MSIX_ALLOC_DYN is not enough; msix_prepare_msi_desc()
is also needed to prepare the MSI descriptor.

Export pci_msix_prepare_desc() to allow PCI controllers to support dynamic
MSI-X vector allocation.

Signed-off-by: Shradha Gupta <shradhagupta@linux.microsoft.com>
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Saurabh Sengar <ssengar@linux.microsoft.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/msi/irqdomain.c | 5 +++--
 include/linux/msi.h         | 2 ++
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/pci/msi/irqdomain.c b/drivers/pci/msi/irqdomain.c
index c05152733993..765312c92d9b 100644
--- a/drivers/pci/msi/irqdomain.c
+++ b/drivers/pci/msi/irqdomain.c
@@ -222,13 +222,14 @@ static void pci_irq_unmask_msix(struct irq_data *data)
 	pci_msix_unmask(irq_data_get_msi_desc(data));
 }
 
-static void pci_msix_prepare_desc(struct irq_domain *domain, msi_alloc_info_t *arg,
-				  struct msi_desc *desc)
+void pci_msix_prepare_desc(struct irq_domain *domain, msi_alloc_info_t *arg,
+			   struct msi_desc *desc)
 {
 	/* Don't fiddle with preallocated MSI descriptors */
 	if (!desc->pci.mask_base)
 		msix_prepare_msi_desc(to_pci_dev(desc->dev), desc);
 }
+EXPORT_SYMBOL_GPL(pci_msix_prepare_desc);
 
 static const struct msi_domain_template pci_msix_template = {
 	.chip = {
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 6863540f4b71..7f254bde5426 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -706,6 +706,8 @@ struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode,
 					     struct irq_domain *parent);
 u32 pci_msi_domain_get_msi_rid(struct irq_domain *domain, struct pci_dev *pdev);
 struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev);
+void pci_msix_prepare_desc(struct irq_domain *domain, msi_alloc_info_t *arg,
+			   struct msi_desc *desc);
 #else /* CONFIG_PCI_MSI */
 static inline struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev)
 {
-- 
cgit v1.2.3


From 755391121038c06cb653241aa94dcabd87179f62 Mon Sep 17 00:00:00 2001
From: Shradha Gupta <shradhagupta@linux.microsoft.com>
Date: Wed, 11 Jun 2025 07:11:13 -0700
Subject: net: mana: Allocate MSI-X vectors dynamically

Currently, the MANA driver allocates MSI-X vectors statically based on
the MANA_MAX_NUM_QUEUES and num_online_cpus() values, and in some cases
ends up allocating more vectors than it needs. This is because, at that
point, we do not yet have a HW channel and so do not know how many IRQs
should be allocated.

To avoid this, we allocate 1 MSI-X vector during the creation of HWC and
after getting the value supported by hardware, dynamically add the
remaining MSI-X vectors.

Signed-off-by: Shradha Gupta <shradhagupta@linux.microsoft.com>
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
---
 drivers/net/ethernet/microsoft/mana/gdma_main.c | 311 +++++++++++++++++-------
 include/net/mana/gdma.h                         |   8 +-
 2 files changed, 235 insertions(+), 84 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 6e468c0f2c40..d0040c12b8a2 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -6,6 +6,8 @@
 #include <linux/pci.h>
 #include <linux/utsname.h>
 #include <linux/version.h>
+#include <linux/msi.h>
+#include <linux/irqdomain.h>
 
 #include <net/mana/mana.h>
 
@@ -80,8 +82,15 @@ static int mana_gd_query_max_resources(struct pci_dev *pdev)
 		return err ? err : -EPROTO;
 	}
 
-	if (gc->num_msix_usable > resp.max_msix)
-		gc->num_msix_usable = resp.max_msix;
+	if (!pci_msix_can_alloc_dyn(pdev)) {
+		if (gc->num_msix_usable > resp.max_msix)
+			gc->num_msix_usable = resp.max_msix;
+	} else {
+		/* If dynamic allocation is enabled we have already allocated
+		 * hwc msi
+		 */
+		gc->num_msix_usable = min(resp.max_msix, num_online_cpus() + 1);
+	}
 
 	if (gc->num_msix_usable <= 1)
 		return -ENOSPC;
@@ -483,7 +492,9 @@ static int mana_gd_register_irq(struct gdma_queue *queue,
 	}
 
 	queue->eq.msix_index = msi_index;
-	gic = &gc->irq_contexts[msi_index];
+	gic = xa_load(&gc->irq_contexts, msi_index);
+	if (WARN_ON(!gic))
+		return -EINVAL;
 
 	spin_lock_irqsave(&gic->lock, flags);
 	list_add_rcu(&queue->entry, &gic->eq_list);
@@ -508,7 +519,10 @@ static void mana_gd_deregiser_irq(struct gdma_queue *queue)
 	if (WARN_ON(msix_index >= gc->num_msix_usable))
 		return;
 
-	gic = &gc->irq_contexts[msix_index];
+	gic = xa_load(&gc->irq_contexts, msix_index);
+	if (WARN_ON(!gic))
+		return;
+
 	spin_lock_irqsave(&gic->lock, flags);
 	list_for_each_entry_rcu(eq, &gic->eq_list, entry) {
 		if (queue == eq) {
@@ -1366,47 +1380,108 @@ done:
 	return 0;
 }
 
-static int mana_gd_setup_irqs(struct pci_dev *pdev)
+static int mana_gd_setup_dyn_irqs(struct pci_dev *pdev, int nvec)
 {
 	struct gdma_context *gc = pci_get_drvdata(pdev);
-	unsigned int max_queues_per_port;
 	struct gdma_irq_context *gic;
-	unsigned int max_irqs, cpu;
-	int start_irq_index = 1;
-	int nvec, *irqs, irq;
-	int err, i = 0, j;
+	bool skip_first_cpu = false;
+	int *irqs, irq, err, i;
 
-	cpus_read_lock();
-	max_queues_per_port = num_online_cpus();
-	if (max_queues_per_port > MANA_MAX_NUM_QUEUES)
-		max_queues_per_port = MANA_MAX_NUM_QUEUES;
+	irqs = kmalloc_array(nvec, sizeof(int), GFP_KERNEL);
+	if (!irqs)
+		return -ENOMEM;
+
+	/*
+	 * While processing the next pci irq vector, we start with index 1,
+	 * as IRQ vector at index 0 is already processed for HWC.
+	 * However, the population of irqs array starts with index 0, to be
+	 * further used in irq_setup()
+	 */
+	for (i = 1; i <= nvec; i++) {
+		gic = kzalloc(sizeof(*gic), GFP_KERNEL);
+		if (!gic) {
+			err = -ENOMEM;
+			goto free_irq;
+		}
+		gic->handler = mana_gd_process_eq_events;
+		INIT_LIST_HEAD(&gic->eq_list);
+		spin_lock_init(&gic->lock);
 
-	/* Need 1 interrupt for the Hardware communication Channel (HWC) */
-	max_irqs = max_queues_per_port + 1;
+		snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_q%d@pci:%s",
+			 i - 1, pci_name(pdev));
 
-	nvec = pci_alloc_irq_vectors(pdev, 2, max_irqs, PCI_IRQ_MSIX);
-	if (nvec < 0) {
-		cpus_read_unlock();
-		return nvec;
+		/* one pci vector is already allocated for HWC */
+		irqs[i - 1] = pci_irq_vector(pdev, i);
+		if (irqs[i - 1] < 0) {
+			err = irqs[i - 1];
+			goto free_current_gic;
+		}
+
+		err = request_irq(irqs[i - 1], mana_gd_intr, 0, gic->name, gic);
+		if (err)
+			goto free_current_gic;
+
+		xa_store(&gc->irq_contexts, i, gic, GFP_KERNEL);
 	}
-	if (nvec <= num_online_cpus())
-		start_irq_index = 0;
 
-	irqs = kmalloc_array((nvec - start_irq_index), sizeof(int), GFP_KERNEL);
-	if (!irqs) {
-		err = -ENOMEM;
-		goto free_irq_vector;
+	/*
+	 * When calling irq_setup() for dynamically added IRQs, if number of
+	 * CPUs is more than or equal to allocated MSI-X, we need to skip the
+	 * first CPU sibling group since they are already affinitized to HWC IRQ
+	 */
+	cpus_read_lock();
+	if (gc->num_msix_usable <= num_online_cpus())
+		skip_first_cpu = true;
+
+	err = irq_setup(irqs, nvec, gc->numa_node, skip_first_cpu);
+	if (err) {
+		cpus_read_unlock();
+		goto free_irq;
 	}
 
-	gc->irq_contexts = kcalloc(nvec, sizeof(struct gdma_irq_context),
-				   GFP_KERNEL);
-	if (!gc->irq_contexts) {
-		err = -ENOMEM;
-		goto free_irq_array;
+	cpus_read_unlock();
+	kfree(irqs);
+	return 0;
+
+free_current_gic:
+	kfree(gic);
+free_irq:
+	for (i -= 1; i > 0; i--) {
+		irq = pci_irq_vector(pdev, i);
+		gic = xa_load(&gc->irq_contexts, i);
+		if (WARN_ON(!gic))
+			continue;
+
+		irq_update_affinity_hint(irq, NULL);
+		free_irq(irq, gic);
+		xa_erase(&gc->irq_contexts, i);
+		kfree(gic);
 	}
+	kfree(irqs);
+	return err;
+}
+
+static int mana_gd_setup_irqs(struct pci_dev *pdev, int nvec)
+{
+	struct gdma_context *gc = pci_get_drvdata(pdev);
+	struct gdma_irq_context *gic;
+	int *irqs, *start_irqs, irq;
+	unsigned int cpu;
+	int err, i;
+
+	irqs = kmalloc_array(nvec, sizeof(int), GFP_KERNEL);
+	if (!irqs)
+		return -ENOMEM;
+
+	start_irqs = irqs;
 
 	for (i = 0; i < nvec; i++) {
-		gic = &gc->irq_contexts[i];
+		gic = kzalloc(sizeof(*gic), GFP_KERNEL);
+		if (!gic) {
+			err = -ENOMEM;
+			goto free_irq;
+		}
+
 		gic->handler = mana_gd_process_eq_events;
 		INIT_LIST_HEAD(&gic->eq_list);
 		spin_lock_init(&gic->lock);
@@ -1418,69 +1493,128 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev)
 			snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_q%d@pci:%s",
 				 i - 1, pci_name(pdev));
 
-		irq = pci_irq_vector(pdev, i);
-		if (irq < 0) {
-			err = irq;
-			goto free_irq;
+		irqs[i] = pci_irq_vector(pdev, i);
+		if (irqs[i] < 0) {
+			err = irqs[i];
+			goto free_current_gic;
 		}
 
-		if (!i) {
-			err = request_irq(irq, mana_gd_intr, 0, gic->name, gic);
-			if (err)
-				goto free_irq;
-
-			/* If number of IRQ is one extra than number of online CPUs,
-			 * then we need to assign IRQ0 (hwc irq) and IRQ1 to
-			 * same CPU.
-			 * Else we will use different CPUs for IRQ0 and IRQ1.
-			 * Also we are using cpumask_local_spread instead of
-			 * cpumask_first for the node, because the node can be
-			 * mem only.
-			 */
-			if (start_irq_index) {
-				cpu = cpumask_local_spread(i, gc->numa_node);
-				irq_set_affinity_and_hint(irq, cpumask_of(cpu));
-			} else {
-				irqs[start_irq_index] = irq;
-			}
-		} else {
-			irqs[i - start_irq_index] = irq;
-			err = request_irq(irqs[i - start_irq_index], mana_gd_intr, 0,
-					  gic->name, gic);
-			if (err)
-				goto free_irq;
-		}
+		err = request_irq(irqs[i], mana_gd_intr, 0, gic->name, gic);
+		if (err)
+			goto free_current_gic;
+
+		xa_store(&gc->irq_contexts, i, gic, GFP_KERNEL);
 	}
 
-	err = irq_setup(irqs, nvec - start_irq_index, gc->numa_node, false);
-	if (err)
+	/* If number of IRQ is one extra than number of online CPUs,
+	 * then we need to assign IRQ0 (hwc irq) and IRQ1 to
+	 * same CPU.
+	 * Else we will use different CPUs for IRQ0 and IRQ1.
+	 * Also we are using cpumask_local_spread instead of
+	 * cpumask_first for the node, because the node can be
+	 * mem only.
+	 */
+	cpus_read_lock();
+	if (nvec > num_online_cpus()) {
+		cpu = cpumask_local_spread(0, gc->numa_node);
+		irq_set_affinity_and_hint(irqs[0], cpumask_of(cpu));
+		irqs++;
+		nvec -= 1;
+	}
+
+	err = irq_setup(irqs, nvec, gc->numa_node, false);
+	if (err) {
+		cpus_read_unlock();
 		goto free_irq;
+	}
 
-	gc->max_num_msix = nvec;
-	gc->num_msix_usable = nvec;
 	cpus_read_unlock();
-	kfree(irqs);
+	kfree(start_irqs);
 	return 0;
 
+free_current_gic:
+	kfree(gic);
 free_irq:
-	for (j = i - 1; j >= 0; j--) {
-		irq = pci_irq_vector(pdev, j);
-		gic = &gc->irq_contexts[j];
+	for (i -= 1; i >= 0; i--) {
+		irq = pci_irq_vector(pdev, i);
+		gic = xa_load(&gc->irq_contexts, i);
+		if (WARN_ON(!gic))
+			continue;
 
 		irq_update_affinity_hint(irq, NULL);
 		free_irq(irq, gic);
+		xa_erase(&gc->irq_contexts, i);
+		kfree(gic);
 	}
 
-	kfree(gc->irq_contexts);
-	gc->irq_contexts = NULL;
-free_irq_array:
-	kfree(irqs);
-free_irq_vector:
-	cpus_read_unlock();
-	pci_free_irq_vectors(pdev);
+	kfree(start_irqs);
 	return err;
 }
 
+static int mana_gd_setup_hwc_irqs(struct pci_dev *pdev)
+{
+	struct gdma_context *gc = pci_get_drvdata(pdev);
+	unsigned int max_irqs, min_irqs;
+	int nvec, err;
+
+	if (pci_msix_can_alloc_dyn(pdev)) {
+		max_irqs = 1;
+		min_irqs = 1;
+	} else {
+		/* Need 1 interrupt for HWC */
+		max_irqs = min(num_online_cpus(), MANA_MAX_NUM_QUEUES) + 1;
+		min_irqs = 2;
+	}
+
+	nvec = pci_alloc_irq_vectors(pdev, min_irqs, max_irqs, PCI_IRQ_MSIX);
+	if (nvec < 0)
+		return nvec;
+
+	err = mana_gd_setup_irqs(pdev, nvec);
+	if (err) {
+		pci_free_irq_vectors(pdev);
+		return err;
+	}
+
+	gc->num_msix_usable = nvec;
+	gc->max_num_msix = nvec;
+
+	return 0;
+}
+
+static int mana_gd_setup_remaining_irqs(struct pci_dev *pdev)
+{
+	struct gdma_context *gc = pci_get_drvdata(pdev);
+	struct msi_map irq_map;
+	int max_irqs, i, err;
+
+	if (!pci_msix_can_alloc_dyn(pdev))
+		/* remain irqs are already allocated with HWC IRQ */
+		return 0;
+
+	/* allocate only remaining IRQs*/
+	max_irqs = gc->num_msix_usable - 1;
+
+	for (i = 1; i <= max_irqs; i++) {
+		irq_map = pci_msix_alloc_irq_at(pdev, i, NULL);
+		if (!irq_map.virq) {
+			err = irq_map.index;
+			/* caller will handle cleaning up all allocated
+			 * irqs, after HWC is destroyed
+			 */
+			return err;
+		}
+	}
+
+	err = mana_gd_setup_dyn_irqs(pdev, max_irqs);
+	if (err)
+		return err;
+
+	gc->max_num_msix = gc->max_num_msix + max_irqs;
+
+	return 0;
+}
+
 static void mana_gd_remove_irqs(struct pci_dev *pdev)
 {
 	struct gdma_context *gc = pci_get_drvdata(pdev);
@@ -1495,19 +1629,21 @@ static void mana_gd_remove_irqs(struct pci_dev *pdev)
 		if (irq < 0)
 			continue;
 
-		gic = &gc->irq_contexts[i];
+		gic = xa_load(&gc->irq_contexts, i);
+		if (WARN_ON(!gic))
+			continue;
 
 		/* Need to clear the hint before free_irq */
 		irq_update_affinity_hint(irq, NULL);
 		free_irq(irq, gic);
+		xa_erase(&gc->irq_contexts, i);
+		kfree(gic);
 	}
 
 	pci_free_irq_vectors(pdev);
 
 	gc->max_num_msix = 0;
 	gc->num_msix_usable = 0;
-	kfree(gc->irq_contexts);
-	gc->irq_contexts = NULL;
 }
 
 static int mana_gd_setup(struct pci_dev *pdev)
@@ -1522,9 +1658,10 @@ static int mana_gd_setup(struct pci_dev *pdev)
 	if (!gc->service_wq)
 		return -ENOMEM;
 
-	err = mana_gd_setup_irqs(pdev);
+	err = mana_gd_setup_hwc_irqs(pdev);
 	if (err) {
-		dev_err(gc->dev, "Failed to setup IRQs: %d\n", err);
+		dev_err(gc->dev, "Failed to setup IRQs for HWC creation: %d\n",
+			err);
 		goto free_workqueue;
 	}
 
@@ -1540,6 +1677,12 @@ static int mana_gd_setup(struct pci_dev *pdev)
 	if (err)
 		goto destroy_hwc;
 
+	err = mana_gd_setup_remaining_irqs(pdev);
+	if (err) {
+		dev_err(gc->dev, "Failed to setup remaining IRQs: %d", err);
+		goto destroy_hwc;
+	}
+
 	err = mana_gd_detect_devices(pdev);
 	if (err)
 		goto destroy_hwc;
@@ -1620,6 +1763,7 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	gc->is_pf = mana_is_pf(pdev->device);
 	gc->bar0_va = bar0_va;
 	gc->dev = &pdev->dev;
+	xa_init(&gc->irq_contexts);
 
 	if (gc->is_pf)
 		gc->mana_pci_debugfs = debugfs_create_dir("0", mana_debugfs_root);
@@ -1654,6 +1798,7 @@ unmap_bar:
 	 */
 	debugfs_remove_recursive(gc->mana_pci_debugfs);
 	gc->mana_pci_debugfs = NULL;
+	xa_destroy(&gc->irq_contexts);
 	pci_iounmap(pdev, bar0_va);
 free_gc:
 	pci_set_drvdata(pdev, NULL);
@@ -1679,6 +1824,8 @@ static void mana_gd_remove(struct pci_dev *pdev)
 
 	gc->mana_pci_debugfs = NULL;
 
+	xa_destroy(&gc->irq_contexts);
+
 	pci_iounmap(pdev, gc->bar0_va);
 
 	vfree(gc);
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index 3ce56a816425..87162ba96d91 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -388,7 +388,7 @@ struct gdma_context {
 	unsigned int		max_num_queues;
 	unsigned int		max_num_msix;
 	unsigned int		num_msix_usable;
-	struct gdma_irq_context	*irq_contexts;
+	struct xarray		irq_contexts;
 
 	/* L2 MTU */
 	u16 adapter_mtu;
@@ -578,12 +578,16 @@ enum {
 /* Driver can handle holes (zeros) in the device list */
 #define GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP BIT(11)
 
+/* Driver supports dynamic MSI-X vector allocation */
+#define GDMA_DRV_CAP_FLAG_1_DYNAMIC_IRQ_ALLOC_SUPPORT BIT(13)
+
 #define GDMA_DRV_CAP_FLAGS1 \
 	(GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT | \
 	 GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX | \
 	 GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG | \
 	 GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT | \
-	 GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP)
+	 GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP | \
+	 GDMA_DRV_CAP_FLAG_1_DYNAMIC_IRQ_ALLOC_SUPPORT)
 
 #define GDMA_DRV_CAP_FLAGS2 0
 
-- 
cgit v1.2.3


From c3021d6a80ff05034dfee494115ec71f1954e311 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Mon, 16 Jun 2025 01:23:59 +0100
Subject: media: v4l2-jpeg: Remove unused v4l2_jpeg_parse_* wrappers

The functions:
  v4l2_jpeg_parse_huffman_tables()
  v4l2_jpeg_parse_quantization_tables()
  v4l2_jpeg_parse_scan_header()
and
  v4l2_jpeg_parse_frame_header()

were added in 2020 by
commit 50733b5b9102 ("media: add v4l2 JPEG helpers")

but have remained unused.

Remove them.

They're all just wrappers around an underlying set of helpers,
which are all still called via v4l2_jpeg_parse_header().

Signed-off-by: Dr. David Alan Gilbert <linux@treblig.org>
Signed-off-by: Hans Verkuil <hverkuil@xs4all.nl>
---
 drivers/media/v4l2-core/v4l2-jpeg.c | 80 -------------------------------------
 include/media/v4l2-jpeg.h           |  9 -----
 2 files changed, 89 deletions(-)

(limited to 'include')

diff --git a/drivers/media/v4l2-core/v4l2-jpeg.c b/drivers/media/v4l2-core/v4l2-jpeg.c
index 6e2647323522..36a0f1a1b0d9 100644
--- a/drivers/media/v4l2-core/v4l2-jpeg.c
+++ b/drivers/media/v4l2-core/v4l2-jpeg.c
@@ -711,83 +711,3 @@ int v4l2_jpeg_parse_header(void *buf, size_t len, struct v4l2_jpeg_header *out)
 	return marker;
 }
 EXPORT_SYMBOL_GPL(v4l2_jpeg_parse_header);
-
-/**
- * v4l2_jpeg_parse_frame_header - parse frame header
- * @buf: address of the frame header, after the SOF0 marker
- * @len: length of the frame header
- * @frame_header: returns the parsed frame header
- *
- * Returns 0 or negative error if parsing failed.
- */
-int v4l2_jpeg_parse_frame_header(void *buf, size_t len,
-				 struct v4l2_jpeg_frame_header *frame_header)
-{
-	struct jpeg_stream stream;
-
-	stream.curr = buf;
-	stream.end = stream.curr + len;
-	return jpeg_parse_frame_header(&stream, SOF0, frame_header);
-}
-EXPORT_SYMBOL_GPL(v4l2_jpeg_parse_frame_header);
-
-/**
- * v4l2_jpeg_parse_scan_header - parse scan header
- * @buf: address of the scan header, after the SOS marker
- * @len: length of the scan header
- * @scan_header: returns the parsed scan header
- *
- * Returns 0 or negative error if parsing failed.
- */
-int v4l2_jpeg_parse_scan_header(void *buf, size_t len,
-				struct v4l2_jpeg_scan_header *scan_header)
-{
-	struct jpeg_stream stream;
-
-	stream.curr = buf;
-	stream.end = stream.curr + len;
-	return jpeg_parse_scan_header(&stream, scan_header);
-}
-EXPORT_SYMBOL_GPL(v4l2_jpeg_parse_scan_header);
-
-/**
- * v4l2_jpeg_parse_quantization_tables - parse quantization tables segment
- * @buf: address of the quantization table segment, after the DQT marker
- * @len: length of the quantization table segment
- * @precision: sample precision (P) in bits per component
- * @q_tables: returns four references into the buffer for the
- *            four possible quantization table destinations
- *
- * Returns 0 or negative error if parsing failed.
- */
-int v4l2_jpeg_parse_quantization_tables(void *buf, size_t len, u8 precision,
-					struct v4l2_jpeg_reference *q_tables)
-{
-	struct jpeg_stream stream;
-
-	stream.curr = buf;
-	stream.end = stream.curr + len;
-	return jpeg_parse_quantization_tables(&stream, precision, q_tables);
-}
-EXPORT_SYMBOL_GPL(v4l2_jpeg_parse_quantization_tables);
-
-/**
- * v4l2_jpeg_parse_huffman_tables - parse huffman tables segment
- * @buf: address of the Huffman table segment, after the DHT marker
- * @len: length of the Huffman table segment
- * @huffman_tables: returns four references into the buffer for the
- *                  four possible Huffman table destinations, in
- *                  the order DC0, DC1, AC0, AC1
- *
- * Returns 0 or negative error if parsing failed.
- */
-int v4l2_jpeg_parse_huffman_tables(void *buf, size_t len,
-				   struct v4l2_jpeg_reference *huffman_tables)
-{
-	struct jpeg_stream stream;
-
-	stream.curr = buf;
-	stream.end = stream.curr + len;
-	return jpeg_parse_huffman_tables(&stream, huffman_tables);
-}
-EXPORT_SYMBOL_GPL(v4l2_jpeg_parse_huffman_tables);
diff --git a/include/media/v4l2-jpeg.h b/include/media/v4l2-jpeg.h
index b65658a02e3c..62dda1560275 100644
--- a/include/media/v4l2-jpeg.h
+++ b/include/media/v4l2-jpeg.h
@@ -169,15 +169,6 @@ struct v4l2_jpeg_header {
 
 int v4l2_jpeg_parse_header(void *buf, size_t len, struct v4l2_jpeg_header *out);
 
-int v4l2_jpeg_parse_frame_header(void *buf, size_t len,
-				 struct v4l2_jpeg_frame_header *frame_header);
-int v4l2_jpeg_parse_scan_header(void *buf, size_t len,
-				struct v4l2_jpeg_scan_header *scan_header);
-int v4l2_jpeg_parse_quantization_tables(void *buf, size_t len, u8 precision,
-					struct v4l2_jpeg_reference *q_tables);
-int v4l2_jpeg_parse_huffman_tables(void *buf, size_t len,
-				   struct v4l2_jpeg_reference *huffman_tables);
-
 extern const u8 v4l2_jpeg_zigzag_scan_index[V4L2_JPEG_PIXELS_IN_BLOCK];
 extern const u8 v4l2_jpeg_ref_table_luma_qt[V4L2_JPEG_PIXELS_IN_BLOCK];
 extern const u8 v4l2_jpeg_ref_table_chroma_qt[V4L2_JPEG_PIXELS_IN_BLOCK];
-- 
cgit v1.2.3


From b29929b819f35503024c6a7e6ad442f6e36c68a0 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 16 Jun 2025 20:16:21 +0200
Subject: driver core: Add device_link_test() for testing device link flags

To avoid coding mistakes like the one fixed by commit 3860cbe23963 ("PM:
sleep: Fix bit masking operation"), introduce device_link_test() for
testing device link flags and use it where applicable.

No intentional functional impact.

Signed-off-by: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Link: https://lore.kernel.org/r/2793309.mvXUDI8C0e@rjwysocki.net
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/core.c          | 73 ++++++++++++++++++++++----------------------
 drivers/base/power/main.c    |  2 +-
 drivers/base/power/runtime.c |  6 ++--
 include/linux/device.h       |  5 +++
 4 files changed, 45 insertions(+), 41 deletions(-)

(limited to 'include')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index cbc0099d8ef2..3809baed42f3 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -460,9 +460,9 @@ static ssize_t auto_remove_on_show(struct device *dev,
 	struct device_link *link = to_devlink(dev);
 	const char *output;
 
-	if (link->flags & DL_FLAG_AUTOREMOVE_SUPPLIER)
+	if (device_link_test(link, DL_FLAG_AUTOREMOVE_SUPPLIER))
 		output = "supplier unbind";
-	else if (link->flags & DL_FLAG_AUTOREMOVE_CONSUMER)
+	else if (device_link_test(link, DL_FLAG_AUTOREMOVE_CONSUMER))
 		output = "consumer unbind";
 	else
 		output = "never";
@@ -476,7 +476,7 @@ static ssize_t runtime_pm_show(struct device *dev,
 {
 	struct device_link *link = to_devlink(dev);
 
-	return sysfs_emit(buf, "%d\n", !!(link->flags & DL_FLAG_PM_RUNTIME));
+	return sysfs_emit(buf, "%d\n", device_link_test(link, DL_FLAG_PM_RUNTIME));
 }
 static DEVICE_ATTR_RO(runtime_pm);
 
@@ -485,8 +485,7 @@ static ssize_t sync_state_only_show(struct device *dev,
 {
 	struct device_link *link = to_devlink(dev);
 
-	return sysfs_emit(buf, "%d\n",
-			  !!(link->flags & DL_FLAG_SYNC_STATE_ONLY));
+	return sysfs_emit(buf, "%d\n", device_link_test(link, DL_FLAG_SYNC_STATE_ONLY));
 }
 static DEVICE_ATTR_RO(sync_state_only);
 
@@ -792,12 +791,12 @@ struct device_link *device_link_add(struct device *consumer,
 		if (link->consumer != consumer)
 			continue;
 
-		if (link->flags & DL_FLAG_INFERRED &&
+		if (device_link_test(link, DL_FLAG_INFERRED) &&
 		    !(flags & DL_FLAG_INFERRED))
 			link->flags &= ~DL_FLAG_INFERRED;
 
 		if (flags & DL_FLAG_PM_RUNTIME) {
-			if (!(link->flags & DL_FLAG_PM_RUNTIME)) {
+			if (!device_link_test(link, DL_FLAG_PM_RUNTIME)) {
 				pm_runtime_new_link(consumer);
 				link->flags |= DL_FLAG_PM_RUNTIME;
 			}
@@ -807,8 +806,8 @@ struct device_link *device_link_add(struct device *consumer,
 
 		if (flags & DL_FLAG_STATELESS) {
 			kref_get(&link->kref);
-			if (link->flags & DL_FLAG_SYNC_STATE_ONLY &&
-			    !(link->flags & DL_FLAG_STATELESS)) {
+			if (device_link_test(link, DL_FLAG_SYNC_STATE_ONLY) &&
+			    !device_link_test(link, DL_FLAG_STATELESS)) {
 				link->flags |= DL_FLAG_STATELESS;
 				goto reorder;
 			} else {
@@ -823,7 +822,7 @@ struct device_link *device_link_add(struct device *consumer,
 		 * update the existing link to stay around longer.
 		 */
 		if (flags & DL_FLAG_AUTOREMOVE_SUPPLIER) {
-			if (link->flags & DL_FLAG_AUTOREMOVE_CONSUMER) {
+			if (device_link_test(link, DL_FLAG_AUTOREMOVE_CONSUMER)) {
 				link->flags &= ~DL_FLAG_AUTOREMOVE_CONSUMER;
 				link->flags |= DL_FLAG_AUTOREMOVE_SUPPLIER;
 			}
@@ -831,12 +830,12 @@ struct device_link *device_link_add(struct device *consumer,
 			link->flags &= ~(DL_FLAG_AUTOREMOVE_CONSUMER |
 					 DL_FLAG_AUTOREMOVE_SUPPLIER);
 		}
-		if (!(link->flags & DL_FLAG_MANAGED)) {
+		if (!device_link_test(link, DL_FLAG_MANAGED)) {
 			kref_get(&link->kref);
 			link->flags |= DL_FLAG_MANAGED;
 			device_link_init_status(link, consumer, supplier);
 		}
-		if (link->flags & DL_FLAG_SYNC_STATE_ONLY &&
+		if (device_link_test(link, DL_FLAG_SYNC_STATE_ONLY) &&
 		    !(flags & DL_FLAG_SYNC_STATE_ONLY)) {
 			link->flags &= ~DL_FLAG_SYNC_STATE_ONLY;
 			goto reorder;
@@ -940,7 +939,7 @@ static void __device_link_del(struct kref *kref)
 
 static void device_link_put_kref(struct device_link *link)
 {
-	if (link->flags & DL_FLAG_STATELESS)
+	if (device_link_test(link, DL_FLAG_STATELESS))
 		kref_put(&link->kref, __device_link_del);
 	else if (!device_is_registered(link->consumer))
 		__device_link_del(&link->kref);
@@ -1004,7 +1003,7 @@ static void device_links_missing_supplier(struct device *dev)
 		if (link->supplier->links.status == DL_DEV_DRIVER_BOUND) {
 			WRITE_ONCE(link->status, DL_STATE_AVAILABLE);
 		} else {
-			WARN_ON(!(link->flags & DL_FLAG_SYNC_STATE_ONLY));
+			WARN_ON(!device_link_test(link, DL_FLAG_SYNC_STATE_ONLY));
 			WRITE_ONCE(link->status, DL_STATE_DORMANT);
 		}
 	}
@@ -1072,14 +1071,14 @@ int device_links_check_suppliers(struct device *dev)
 	device_links_write_lock();
 
 	list_for_each_entry(link, &dev->links.suppliers, c_node) {
-		if (!(link->flags & DL_FLAG_MANAGED))
+		if (!device_link_test(link, DL_FLAG_MANAGED))
 			continue;
 
 		if (link->status != DL_STATE_AVAILABLE &&
-		    !(link->flags & DL_FLAG_SYNC_STATE_ONLY)) {
+		    !device_link_test(link, DL_FLAG_SYNC_STATE_ONLY)) {
 
 			if (dev_is_best_effort(dev) &&
-			    link->flags & DL_FLAG_INFERRED &&
+			    device_link_test(link, DL_FLAG_INFERRED) &&
 			    !link->supplier->can_match) {
 				ret = -EAGAIN;
 				continue;
@@ -1128,7 +1127,7 @@ static void __device_links_queue_sync_state(struct device *dev,
 		return;
 
 	list_for_each_entry(link, &dev->links.consumers, s_node) {
-		if (!(link->flags & DL_FLAG_MANAGED))
+		if (!device_link_test(link, DL_FLAG_MANAGED))
 			continue;
 		if (link->status != DL_STATE_ACTIVE)
 			return;
@@ -1268,7 +1267,7 @@ void device_links_force_bind(struct device *dev)
 	device_links_write_lock();
 
 	list_for_each_entry_safe(link, ln, &dev->links.suppliers, c_node) {
-		if (!(link->flags & DL_FLAG_MANAGED))
+		if (!device_link_test(link, DL_FLAG_MANAGED))
 			continue;
 
 		if (link->status != DL_STATE_AVAILABLE) {
@@ -1329,7 +1328,7 @@ void device_links_driver_bound(struct device *dev)
 	device_links_write_lock();
 
 	list_for_each_entry(link, &dev->links.consumers, s_node) {
-		if (!(link->flags & DL_FLAG_MANAGED))
+		if (!device_link_test(link, DL_FLAG_MANAGED))
 			continue;
 
 		/*
@@ -1345,7 +1344,7 @@ void device_links_driver_bound(struct device *dev)
 		WARN_ON(link->status != DL_STATE_DORMANT);
 		WRITE_ONCE(link->status, DL_STATE_AVAILABLE);
 
-		if (link->flags & DL_FLAG_AUTOPROBE_CONSUMER)
+		if (device_link_test(link, DL_FLAG_AUTOPROBE_CONSUMER))
 			driver_deferred_probe_add(link->consumer);
 	}
 
@@ -1357,11 +1356,11 @@ void device_links_driver_bound(struct device *dev)
 	list_for_each_entry_safe(link, ln, &dev->links.suppliers, c_node) {
 		struct device *supplier;
 
-		if (!(link->flags & DL_FLAG_MANAGED))
+		if (!device_link_test(link, DL_FLAG_MANAGED))
 			continue;
 
 		supplier = link->supplier;
-		if (link->flags & DL_FLAG_SYNC_STATE_ONLY) {
+		if (device_link_test(link, DL_FLAG_SYNC_STATE_ONLY)) {
 			/*
 			 * When DL_FLAG_SYNC_STATE_ONLY is set, it means no
 			 * other DL_MANAGED_LINK_FLAGS have been set. So, it's
@@ -1369,7 +1368,7 @@ void device_links_driver_bound(struct device *dev)
 			 */
 			device_link_drop_managed(link);
 		} else if (dev_is_best_effort(dev) &&
-			   link->flags & DL_FLAG_INFERRED &&
+			   device_link_test(link, DL_FLAG_INFERRED) &&
 			   link->status != DL_STATE_CONSUMER_PROBE &&
 			   !link->supplier->can_match) {
 			/*
@@ -1421,10 +1420,10 @@ static void __device_links_no_driver(struct device *dev)
 	struct device_link *link, *ln;
 
 	list_for_each_entry_safe_reverse(link, ln, &dev->links.suppliers, c_node) {
-		if (!(link->flags & DL_FLAG_MANAGED))
+		if (!device_link_test(link, DL_FLAG_MANAGED))
 			continue;
 
-		if (link->flags & DL_FLAG_AUTOREMOVE_CONSUMER) {
+		if (device_link_test(link, DL_FLAG_AUTOREMOVE_CONSUMER)) {
 			device_link_drop_managed(link);
 			continue;
 		}
@@ -1436,7 +1435,7 @@ static void __device_links_no_driver(struct device *dev)
 		if (link->supplier->links.status == DL_DEV_DRIVER_BOUND) {
 			WRITE_ONCE(link->status, DL_STATE_AVAILABLE);
 		} else {
-			WARN_ON(!(link->flags & DL_FLAG_SYNC_STATE_ONLY));
+			WARN_ON(!device_link_test(link, DL_FLAG_SYNC_STATE_ONLY));
 			WRITE_ONCE(link->status, DL_STATE_DORMANT);
 		}
 	}
@@ -1461,7 +1460,7 @@ void device_links_no_driver(struct device *dev)
 	device_links_write_lock();
 
 	list_for_each_entry(link, &dev->links.consumers, s_node) {
-		if (!(link->flags & DL_FLAG_MANAGED))
+		if (!device_link_test(link, DL_FLAG_MANAGED))
 			continue;
 
 		/*
@@ -1498,10 +1497,10 @@ void device_links_driver_cleanup(struct device *dev)
 	device_links_write_lock();
 
 	list_for_each_entry_safe(link, ln, &dev->links.consumers, s_node) {
-		if (!(link->flags & DL_FLAG_MANAGED))
+		if (!device_link_test(link, DL_FLAG_MANAGED))
 			continue;
 
-		WARN_ON(link->flags & DL_FLAG_AUTOREMOVE_CONSUMER);
+		WARN_ON(device_link_test(link, DL_FLAG_AUTOREMOVE_CONSUMER));
 		WARN_ON(link->status != DL_STATE_SUPPLIER_UNBIND);
 
 		/*
@@ -1510,7 +1509,7 @@ void device_links_driver_cleanup(struct device *dev)
 		 * has moved to DL_STATE_SUPPLIER_UNBIND.
 		 */
 		if (link->status == DL_STATE_SUPPLIER_UNBIND &&
-		    link->flags & DL_FLAG_AUTOREMOVE_SUPPLIER)
+		    device_link_test(link, DL_FLAG_AUTOREMOVE_SUPPLIER))
 			device_link_drop_managed(link);
 
 		WRITE_ONCE(link->status, DL_STATE_DORMANT);
@@ -1544,7 +1543,7 @@ bool device_links_busy(struct device *dev)
 	device_links_write_lock();
 
 	list_for_each_entry(link, &dev->links.consumers, s_node) {
-		if (!(link->flags & DL_FLAG_MANAGED))
+		if (!device_link_test(link, DL_FLAG_MANAGED))
 			continue;
 
 		if (link->status == DL_STATE_CONSUMER_PROBE
@@ -1586,8 +1585,8 @@ void device_links_unbind_consumers(struct device *dev)
 	list_for_each_entry(link, &dev->links.consumers, s_node) {
 		enum device_link_state status;
 
-		if (!(link->flags & DL_FLAG_MANAGED) ||
-		    link->flags & DL_FLAG_SYNC_STATE_ONLY)
+		if (!device_link_test(link, DL_FLAG_MANAGED) ||
+		    device_link_test(link, DL_FLAG_SYNC_STATE_ONLY))
 			continue;
 
 		status = link->status;
@@ -1743,7 +1742,7 @@ static void fw_devlink_parse_fwtree(struct fwnode_handle *fwnode)
 
 static void fw_devlink_relax_link(struct device_link *link)
 {
-	if (!(link->flags & DL_FLAG_INFERRED))
+	if (!device_link_test(link, DL_FLAG_INFERRED))
 		return;
 
 	if (device_link_flag_is_sync_state_only(link->flags))
@@ -1779,7 +1778,7 @@ static int fw_devlink_dev_sync_state(struct device *dev, void *data)
 	struct device_link *link = to_devlink(dev);
 	struct device *sup = link->supplier;
 
-	if (!(link->flags & DL_FLAG_MANAGED) ||
+	if (!device_link_test(link, DL_FLAG_MANAGED) ||
 	    link->status == DL_STATE_ACTIVE || sup->state_synced ||
 	    !dev_has_sync_state(sup))
 		return 0;
@@ -2063,7 +2062,7 @@ static bool __fw_devlink_relax_cycles(struct fwnode_handle *con_handle,
 		 * such due to a cycle.
 		 */
 		if (device_link_flag_is_sync_state_only(dev_link->flags) &&
-		    !(dev_link->flags & DL_FLAG_CYCLE))
+		    !device_link_test(dev_link, DL_FLAG_CYCLE))
 			continue;
 
 		if (__fw_devlink_relax_cycles(con_handle,
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index eebe699fdf4f..6c6f8ded6877 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -1998,7 +1998,7 @@ static bool device_prepare_smart_suspend(struct device *dev)
 	idx = device_links_read_lock();
 
 	list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node) {
-		if (!(link->flags & DL_FLAG_PM_RUNTIME))
+		if (!device_link_test(link, DL_FLAG_PM_RUNTIME))
 			continue;
 
 		if (!dev_pm_smart_suspend(link->supplier) &&
diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index c55a7c70bc1a..c67e7a6b8ed0 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -290,7 +290,7 @@ static int rpm_get_suppliers(struct device *dev)
 				device_links_read_lock_held()) {
 		int retval;
 
-		if (!(link->flags & DL_FLAG_PM_RUNTIME))
+		if (!device_link_test(link, DL_FLAG_PM_RUNTIME))
 			continue;
 
 		retval = pm_runtime_get_sync(link->supplier);
@@ -1879,7 +1879,7 @@ void pm_runtime_get_suppliers(struct device *dev)
 
 	list_for_each_entry_rcu(link, &dev->links.suppliers, c_node,
 				device_links_read_lock_held())
-		if (link->flags & DL_FLAG_PM_RUNTIME) {
+		if (device_link_test(link, DL_FLAG_PM_RUNTIME)) {
 			link->supplier_preactivated = true;
 			pm_runtime_get_sync(link->supplier);
 		}
@@ -1933,7 +1933,7 @@ static void pm_runtime_drop_link_count(struct device *dev)
  */
 void pm_runtime_drop_link(struct device_link *link)
 {
-	if (!(link->flags & DL_FLAG_PM_RUNTIME))
+	if (!device_link_test(link, DL_FLAG_PM_RUNTIME))
 		return;
 
 	pm_runtime_drop_link_count(link->consumer);
diff --git a/include/linux/device.h b/include/linux/device.h
index 4940db137fff..afdd4f7c0d94 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1162,6 +1162,11 @@ void device_links_supplier_sync_state_pause(void);
 void device_links_supplier_sync_state_resume(void);
 void device_link_wait_removal(void);
 
+static inline bool device_link_test(const struct device_link *link, u32 flags)
+{
+	return !!(link->flags & flags);
+}
+
 /* Create alias, so I can be autoloaded. */
 #define MODULE_ALIAS_CHARDEV(major,minor) \
 	MODULE_ALIAS("char-major-" __stringify(major) "-" __stringify(minor))
-- 
cgit v1.2.3


From 5660ee54e7982f9097ddc684e90f15bdcc7fef4b Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Mon, 2 Jun 2025 13:02:13 +0200
Subject: mm, slab: use frozen pages for large kmalloc

Since slab pages are now frozen, it makes sense to have large kmalloc()
objects behave the same as small kmalloc() objects, as the choice between
the two is an implementation detail depending on allocation size.

Notably, increasing the refcount on a slab page containing a kmalloc()
object is not possible anymore, so it should be consistent for large
kmalloc pages.

Therefore, change large kmalloc to use the frozen pages API.

Because of some unexpected fallout in the slab pages case (see commit
b9c0e49abfca ("mm: decline to manipulate the refcount on a slab page")),
implement the same kind of checks and warnings as part of this change.

Notably, networking code using sendpage_ok() to determine whether the
page refcount can be manipulated in the network stack should continue
behaving correctly. Before this change, the function returns true for
large kmalloc pages and page refcount can be manipulated. After this
change, the function will return false.

Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Acked-by: Harry Yoo <harry.yoo@oracle.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/mm.h | 4 +++-
 mm/slub.c          | 6 +++---
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0ef2ba0c667a..a35d5958603f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1325,6 +1325,8 @@ static inline void get_page(struct page *page)
 	struct folio *folio = page_folio(page);
 	if (WARN_ON_ONCE(folio_test_slab(folio)))
 		return;
+	if (WARN_ON_ONCE(folio_test_large_kmalloc(folio)))
+		return;
 	folio_get(folio);
 }
 
@@ -1419,7 +1421,7 @@ static inline void put_page(struct page *page)
 {
 	struct folio *folio = page_folio(page);
 
-	if (folio_test_slab(folio))
+	if (folio_test_slab(folio) || folio_test_large_kmalloc(folio))
 		return;
 
 	folio_put(folio);
diff --git a/mm/slub.c b/mm/slub.c
index 06d64a5fb1bf..823042efbfc9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4271,9 +4271,9 @@ static void *___kmalloc_large_node(size_t size, gfp_t flags, int node)
 	flags |= __GFP_COMP;
 
 	if (node == NUMA_NO_NODE)
-		folio = (struct folio *)alloc_pages_noprof(flags, order);
+		folio = (struct folio *)alloc_frozen_pages_noprof(flags, order);
 	else
-		folio = (struct folio *)__alloc_pages_noprof(flags, order, node, NULL);
+		folio = (struct folio *)__alloc_frozen_pages_noprof(flags, order, node, NULL);
 
 	if (folio) {
 		ptr = folio_address(folio);
@@ -4770,7 +4770,7 @@ static void free_large_kmalloc(struct folio *folio, void *object)
 	lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B,
 			      -(PAGE_SIZE << order));
 	__folio_clear_large_kmalloc(folio);
-	folio_put(folio);
+	free_frozen_pages(&folio->page, order);
 }
 
 /*
-- 
cgit v1.2.3


From 20ca475d9860e14cf389f5a7d5ba9c6437d74613 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Mon, 16 Jun 2025 20:33:20 +0100
Subject: mm: rename call_mmap/mmap_prepare to vfs_mmap/mmap_prepare

The call_mmap() function violates the existing convention in
include/linux/fs.h whereby invocations of virtual file system hooks are
performed by functions prefixed with vfs_xxx().

Correct this by renaming call_mmap() to vfs_mmap(). This also avoids
confusion as to the fact that f_op->mmap_prepare may be invoked here.

Also rename the __call_mmap_prepare() function to vfs_mmap_prepare() and
adjust it to accept a file parameter; this is useful later for nested file
systems.

Finally, fix up the VMA userland tests and ensure the mmap_prepare -> mmap
shim is implemented there.

Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Link: https://lore.kernel.org/8d389f4994fa736aa8f9172bef8533c10a9e9011.1750099179.git.lorenzo.stoakes@oracle.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c |  2 +-
 fs/backing-file.c                          |  2 +-
 fs/coda/file.c                             |  4 ++--
 include/linux/fs.h                         |  5 ++---
 ipc/shm.c                                  |  2 +-
 mm/internal.h                              |  2 +-
 mm/vma.c                                   |  2 +-
 tools/testing/vma/vma_internal.h           | 32 ++++++++++++++++++++++++------
 8 files changed, 35 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c
index 05e440643aa2..f4f1c979d1b9 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c
@@ -105,7 +105,7 @@ static int i915_gem_dmabuf_mmap(struct dma_buf *dma_buf, struct vm_area_struct *
 	if (!obj->base.filp)
 		return -ENODEV;
 
-	ret = call_mmap(obj->base.filp, vma);
+	ret = vfs_mmap(obj->base.filp, vma);
 	if (ret)
 		return ret;
 
diff --git a/fs/backing-file.c b/fs/backing-file.c
index 763fbe9b72b2..04018679bf69 100644
--- a/fs/backing-file.c
+++ b/fs/backing-file.c
@@ -339,7 +339,7 @@ int backing_file_mmap(struct file *file, struct vm_area_struct *vma,
 	vma_set_file(vma, file);
 
 	old_cred = override_creds(ctx->cred);
-	ret = call_mmap(vma->vm_file, vma);
+	ret = vfs_mmap(vma->vm_file, vma);
 	revert_creds(old_cred);
 
 	if (ctx->accessed)
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 148856a582a9..2e6ea9319b35 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -199,10 +199,10 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
 	spin_unlock(&cii->c_lock);
 
 	vma->vm_file = get_file(host_file);
-	ret = call_mmap(vma->vm_file, vma);
+	ret = vfs_mmap(vma->vm_file, vma);
 
 	if (ret) {
-		/* if call_mmap fails, our caller will put host_file so we
+		/* if vfs_mmap fails, our caller will put host_file so we
 		 * should drop the reference to the coda_file that we got.
 		 */
 		fput(coda_file);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4ec77da65f14..c66f235f9e4d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2276,7 +2276,7 @@ static inline bool file_has_valid_mmap_hooks(struct file *file)
 
 int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma);
 
-static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
+static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	if (file->f_op->mmap_prepare)
 		return compat_vma_mmap_prepare(file, vma);
@@ -2284,8 +2284,7 @@ static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
 	return file->f_op->mmap(file, vma);
 }
 
-static inline int __call_mmap_prepare(struct file *file,
-		struct vm_area_desc *desc)
+static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc)
 {
 	return file->f_op->mmap_prepare(desc);
 }
diff --git a/ipc/shm.c b/ipc/shm.c
index 492fcc699985..a9310b6dbbc3 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -602,7 +602,7 @@ static int shm_mmap(struct file *file, struct vm_area_struct *vma)
 	if (ret)
 		return ret;
 
-	ret = call_mmap(sfd->file, vma);
+	ret = vfs_mmap(sfd->file, vma);
 	if (ret) {
 		__shm_close(sfd);
 		return ret;
diff --git a/mm/internal.h b/mm/internal.h
index 6b8ed2017743..0f73ff13c212 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -164,7 +164,7 @@ static inline void *folio_raw_mapping(const struct folio *folio)
  */
 static inline int mmap_file(struct file *file, struct vm_area_struct *vma)
 {
-	int err = call_mmap(file, vma);
+	int err = vfs_mmap(file, vma);
 
 	if (likely(!err))
 		return 0;
diff --git a/mm/vma.c b/mm/vma.c
index fef67a66a095..535b138e26c1 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -2569,7 +2569,7 @@ static int call_mmap_prepare(struct mmap_state *map)
 	};
 
 	/* Invoke the hook. */
-	err = __call_mmap_prepare(map->file, &desc);
+	err = vfs_mmap_prepare(map->file, &desc);
 	if (err)
 		return err;
 
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 14718ca23a05..7ab04700470f 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -1442,6 +1442,27 @@ static inline void free_anon_vma_name(struct vm_area_struct *vma)
 	(void)vma;
 }
 
+/* Declared in vma.h. */
+static inline void set_vma_from_desc(struct vm_area_struct *vma,
+		struct vm_area_desc *desc);
+
+static inline struct vm_area_desc *vma_to_desc(struct vm_area_struct *vma,
+		struct vm_area_desc *desc);
+
+static int compat_vma_mmap_prepare(struct file *file,
+		struct vm_area_struct *vma)
+{
+	struct vm_area_desc desc;
+	int err;
+
+	err = file->f_op->mmap_prepare(vma_to_desc(vma, &desc));
+	if (err)
+		return err;
+	set_vma_from_desc(vma, &desc);
+
+	return 0;
+}
+
 /* Did the driver provide valid mmap hook configuration? */
 static inline bool file_has_valid_mmap_hooks(struct file *file)
 {
@@ -1451,22 +1472,21 @@ static inline bool file_has_valid_mmap_hooks(struct file *file)
 	/* Hooks are mutually exclusive. */
 	if (WARN_ON_ONCE(has_mmap && has_mmap_prepare))
 		return false;
-	if (WARN_ON_ONCE(!has_mmap && !has_mmap_prepare))
+	if (!has_mmap && !has_mmap_prepare)
 		return false;
 
 	return true;
 }
 
-static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
+static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma)
 {
-	if (WARN_ON_ONCE(file->f_op->mmap_prepare))
-		return -EINVAL;
+	if (file->f_op->mmap_prepare)
+		return compat_vma_mmap_prepare(file, vma);
 
 	return file->f_op->mmap(file, vma);
 }
 
-static inline int __call_mmap_prepare(struct file *file,
-		struct vm_area_desc *desc)
+static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc)
 {
 	return file->f_op->mmap_prepare(desc);
 }
-- 
cgit v1.2.3


From 2b5eac0f8c6e79bc152c8804f9f88d16717013ab Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Wed, 11 Jun 2025 12:02:47 +0200
Subject: tty: introduce and use tty_port_tty_vhangup() helper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This code (tty_get -> vhangup -> tty_put) is repeated in a few places.
Introduce a helper similar to tty_port_tty_hangup() (asynchronous) to
also handle vhangup (synchronous).

And use it in those places.

In fact, reuse the tty_port_tty_hangup()'s code and call tty_vhangup()
depending on a new bool parameter.

Signed-off-by: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Cc: Karsten Keil <isdn@linux-pingi.de>
Cc: David Lin <dtwlin@gmail.com>
Cc: Johan Hovold <johan@kernel.org>
Cc: Alex Elder <elder@kernel.org>
Cc: Oliver Neukum <oneukum@suse.com>
Cc: Marcel Holtmann <marcel@holtmann.org>
Cc: Johan Hedberg <johan.hedberg@gmail.com>
Cc: Luiz Augusto von Dentz <luiz.dentz@gmail.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20250611100319.186924-2-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/isdn/capi/capi.c         |  8 +-------
 drivers/staging/greybus/uart.c   |  7 +------
 drivers/tty/serial/serial_core.c |  7 +------
 drivers/tty/tty_port.c           | 12 ++++++++----
 drivers/usb/class/cdc-acm.c      |  7 +------
 drivers/usb/serial/usb-serial.c  |  7 +------
 include/linux/tty_port.h         | 12 +++++++++++-
 net/bluetooth/rfcomm/tty.c       |  7 +------
 8 files changed, 25 insertions(+), 42 deletions(-)

(limited to 'include')

diff --git a/drivers/isdn/capi/capi.c b/drivers/isdn/capi/capi.c
index 70dee9ad4bae..78e6e7748fb9 100644
--- a/drivers/isdn/capi/capi.c
+++ b/drivers/isdn/capi/capi.c
@@ -306,15 +306,9 @@ static void capincci_alloc_minor(struct capidev *cdev, struct capincci *np)
 static void capincci_free_minor(struct capincci *np)
 {
 	struct capiminor *mp = np->minorp;
-	struct tty_struct *tty;
 
 	if (mp) {
-		tty = tty_port_tty_get(&mp->port);
-		if (tty) {
-			tty_vhangup(tty);
-			tty_kref_put(tty);
-		}
-
+		tty_port_tty_vhangup(&mp->port);
 		capiminor_free(mp);
 	}
 }
diff --git a/drivers/staging/greybus/uart.c b/drivers/staging/greybus/uart.c
index 308ed1ca9947..10df5c37c83e 100644
--- a/drivers/staging/greybus/uart.c
+++ b/drivers/staging/greybus/uart.c
@@ -916,7 +916,6 @@ static void gb_uart_remove(struct gbphy_device *gbphy_dev)
 {
 	struct gb_tty *gb_tty = gb_gbphy_get_data(gbphy_dev);
 	struct gb_connection *connection = gb_tty->connection;
-	struct tty_struct *tty;
 	int ret;
 
 	ret = gbphy_runtime_get_sync(gbphy_dev);
@@ -929,11 +928,7 @@ static void gb_uart_remove(struct gbphy_device *gbphy_dev)
 	wake_up_all(&gb_tty->wioctl);
 	mutex_unlock(&gb_tty->mutex);
 
-	tty = tty_port_tty_get(&gb_tty->port);
-	if (tty) {
-		tty_vhangup(tty);
-		tty_kref_put(tty);
-	}
+	tty_port_tty_vhangup(&gb_tty->port);
 
 	gb_connection_disable_rx(connection);
 	tty_unregister_device(gb_tty_driver, gb_tty->minor);
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index 1f7708a91fc6..d6485714eb0f 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -3209,7 +3209,6 @@ static void serial_core_remove_one_port(struct uart_driver *drv,
 	struct uart_state *state = drv->state + uport->line;
 	struct tty_port *port = &state->port;
 	struct uart_port *uart_port;
-	struct tty_struct *tty;
 
 	mutex_lock(&port->mutex);
 	uart_port = uart_port_check(state);
@@ -3228,11 +3227,7 @@ static void serial_core_remove_one_port(struct uart_driver *drv,
 	 */
 	tty_port_unregister_device(port, drv->tty_driver, uport->line);
 
-	tty = tty_port_tty_get(port);
-	if (tty) {
-		tty_vhangup(port->tty);
-		tty_kref_put(tty);
-	}
+	tty_port_tty_vhangup(port);
 
 	/*
 	 * If the port is used as a console, unregister it
diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c
index 4af1fbf73f51..903eebdbe12d 100644
--- a/drivers/tty/tty_port.c
+++ b/drivers/tty/tty_port.c
@@ -396,15 +396,19 @@ EXPORT_SYMBOL(tty_port_hangup);
  * @port: tty port
  * @check_clocal: hang only ttys with %CLOCAL unset?
  */
-void tty_port_tty_hangup(struct tty_port *port, bool check_clocal)
+void __tty_port_tty_hangup(struct tty_port *port, bool check_clocal, bool async)
 {
 	struct tty_struct *tty = tty_port_tty_get(port);
 
-	if (tty && (!check_clocal || !C_CLOCAL(tty)))
-		tty_hangup(tty);
+	if (tty && (!check_clocal || !C_CLOCAL(tty))) {
+		if (async)
+			tty_hangup(tty);
+		else
+			tty_vhangup(tty);
+	}
 	tty_kref_put(tty);
 }
-EXPORT_SYMBOL_GPL(tty_port_tty_hangup);
+EXPORT_SYMBOL_GPL(__tty_port_tty_hangup);
 
 /**
  * tty_port_tty_wakeup - helper to wake up a tty
diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c
index c2ecfa3c8349..f9171fbedf5c 100644
--- a/drivers/usb/class/cdc-acm.c
+++ b/drivers/usb/class/cdc-acm.c
@@ -1571,7 +1571,6 @@ err_put_port:
 static void acm_disconnect(struct usb_interface *intf)
 {
 	struct acm *acm = usb_get_intfdata(intf);
-	struct tty_struct *tty;
 	int i;
 
 	/* sibling interface is already cleaning up */
@@ -1598,11 +1597,7 @@ static void acm_disconnect(struct usb_interface *intf)
 	usb_set_intfdata(acm->data, NULL);
 	mutex_unlock(&acm->mutex);
 
-	tty = tty_port_tty_get(&acm->port);
-	if (tty) {
-		tty_vhangup(tty);
-		tty_kref_put(tty);
-	}
+	tty_port_tty_vhangup(&acm->port);
 
 	cancel_delayed_work_sync(&acm->dwork);
 
diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c
index 7266558d823a..c78ff40b1e5f 100644
--- a/drivers/usb/serial/usb-serial.c
+++ b/drivers/usb/serial/usb-serial.c
@@ -1176,7 +1176,6 @@ static void usb_serial_disconnect(struct usb_interface *interface)
 	struct usb_serial *serial = usb_get_intfdata(interface);
 	struct device *dev = &interface->dev;
 	struct usb_serial_port *port;
-	struct tty_struct *tty;
 
 	/* sibling interface is cleaning up */
 	if (!serial)
@@ -1191,11 +1190,7 @@ static void usb_serial_disconnect(struct usb_interface *interface)
 
 	for (i = 0; i < serial->num_ports; ++i) {
 		port = serial->port[i];
-		tty = tty_port_tty_get(&port->port);
-		if (tty) {
-			tty_vhangup(tty);
-			tty_kref_put(tty);
-		}
+		tty_port_tty_vhangup(&port->port);
 		usb_serial_port_poison_urbs(port);
 		wake_up_interruptible(&port->port.delta_msr_wait);
 		cancel_work_sync(&port->work);
diff --git a/include/linux/tty_port.h b/include/linux/tty_port.h
index 08f89a598366..021f9a8415c0 100644
--- a/include/linux/tty_port.h
+++ b/include/linux/tty_port.h
@@ -232,7 +232,7 @@ bool tty_port_carrier_raised(struct tty_port *port);
 void tty_port_raise_dtr_rts(struct tty_port *port);
 void tty_port_lower_dtr_rts(struct tty_port *port);
 void tty_port_hangup(struct tty_port *port);
-void tty_port_tty_hangup(struct tty_port *port, bool check_clocal);
+void __tty_port_tty_hangup(struct tty_port *port, bool check_clocal, bool async);
 void tty_port_tty_wakeup(struct tty_port *port);
 int tty_port_block_til_ready(struct tty_port *port, struct tty_struct *tty,
 		struct file *filp);
@@ -251,4 +251,14 @@ static inline int tty_port_users(struct tty_port *port)
 	return port->count + port->blocked_open;
 }
 
+static inline void tty_port_tty_hangup(struct tty_port *port, bool check_clocal)
+{
+	__tty_port_tty_hangup(port, check_clocal, true);
+}
+
+static inline void tty_port_tty_vhangup(struct tty_port *port)
+{
+	__tty_port_tty_hangup(port, false, false);
+}
+
 #endif
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index 21a5b5535ebc..827dfbe66085 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -438,7 +438,6 @@ static int __rfcomm_release_dev(void __user *arg)
 {
 	struct rfcomm_dev_req req;
 	struct rfcomm_dev *dev;
-	struct tty_struct *tty;
 
 	if (copy_from_user(&req, arg, sizeof(req)))
 		return -EFAULT;
@@ -464,11 +463,7 @@ static int __rfcomm_release_dev(void __user *arg)
 		rfcomm_dlc_close(dev->dlc, 0);
 
 	/* Shut down TTY synchronously before freeing rfcomm_dev */
-	tty = tty_port_tty_get(&dev->port);
-	if (tty) {
-		tty_vhangup(tty);
-		tty_kref_put(tty);
-	}
+	tty_port_tty_vhangup(&dev->port);
 
 	if (!test_bit(RFCOMM_TTY_OWNED, &dev->status))
 		tty_port_put(&dev->port);
-- 
cgit v1.2.3


From 866380bcf10c810c1c7097641170d53bbe5239ce Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Wed, 11 Jun 2025 12:02:52 +0200
Subject: tty: vt: use sane types for userspace API

As discussed earlier (see the Link below), use the preferred ioctl types
in vt.h (__u8, __u16, ...).

These types are already used for the new VT_GETCONSIZECSRPOS.
Therefore, the necessary includes are already present. From now on, the
types are used for every structure defined in the header.

Note the kernel is built with -funsigned-char, therefore 'char' becomes
'__u8' in here.

Signed-off-by: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Cc: Nicolas Pitre <nico@fluxnic.net>
Link: https://lore.kernel.org/all/p7p83sq1-4ro2-o924-s9o2-30spr74n076o@syhkavp.arg/
Reviewed-by: Nicolas Pitre <npitre@baylibre.com>
Link: https://lore.kernel.org/r/20250611100319.186924-7-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/uapi/linux/vt.h | 44 ++++++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/vt.h b/include/uapi/linux/vt.h
index e5b0c492aa18..714483d68c69 100644
--- a/include/uapi/linux/vt.h
+++ b/include/uapi/linux/vt.h
@@ -19,11 +19,11 @@
 #define VT_OPENQRY	0x5600	/* find available vt */
 
 struct vt_mode {
-	char mode;		/* vt mode */
-	char waitv;		/* if set, hang on writes if not active */
-	short relsig;		/* signal to raise on release req */
-	short acqsig;		/* signal to raise on acquisition */
-	short frsig;		/* unused (set to 0) */
+	__u8 mode;		/* vt mode */
+	__u8 waitv;		/* if set, hang on writes if not active */
+	__s16 relsig;		/* signal to raise on release req */
+	__s16 acqsig;		/* signal to raise on acquisition */
+	__s16 frsig;		/* unused (set to 0) */
 };
 #define VT_GETMODE	0x5601	/* get mode of active vt */
 #define VT_SETMODE	0x5602	/* set mode of active vt */
@@ -32,9 +32,9 @@ struct vt_mode {
 #define		VT_ACKACQ	0x02	/* acknowledge switch */
 
 struct vt_stat {
-	unsigned short v_active;	/* active vt */
-	unsigned short v_signal;	/* signal to send */
-	unsigned short v_state;		/* vt bitmask */
+	__u16 v_active;	/* active vt */
+	__u16 v_signal;	/* signal to send */
+	__u16 v_state;		/* vt bitmask */
 };
 #define VT_GETSTATE	0x5603	/* get global vt state info */
 #define VT_SENDSIG	0x5604	/* signal to send to bitmask of vts */
@@ -46,19 +46,19 @@ struct vt_stat {
 #define VT_DISALLOCATE	0x5608  /* free memory associated to vt */
 
 struct vt_sizes {
-	unsigned short v_rows;		/* number of rows */
-	unsigned short v_cols;		/* number of columns */
-	unsigned short v_scrollsize;	/* number of lines of scrollback */
+	__u16 v_rows;		/* number of rows */
+	__u16 v_cols;		/* number of columns */
+	__u16 v_scrollsize;	/* number of lines of scrollback */
 };
 #define VT_RESIZE	0x5609	/* set kernel's idea of screensize */
 
 struct vt_consize {
-	unsigned short v_rows;	/* number of rows */
-	unsigned short v_cols;	/* number of columns */
-	unsigned short v_vlin;	/* number of pixel rows on screen */
-	unsigned short v_clin;	/* number of pixel rows per character */
-	unsigned short v_vcol;	/* number of pixel columns on screen */
-	unsigned short v_ccol;	/* number of pixel columns per character */
+	__u16 v_rows;	/* number of rows */
+	__u16 v_cols;	/* number of columns */
+	__u16 v_vlin;	/* number of pixel rows on screen */
+	__u16 v_clin;	/* number of pixel rows per character */
+	__u16 v_vcol;	/* number of pixel columns on screen */
+	__u16 v_ccol;	/* number of pixel columns per character */
 };
 #define VT_RESIZEX      0x560A  /* set kernel's idea of screensize + more */
 #define VT_LOCKSWITCH   0x560B  /* disallow vt switching */
@@ -66,21 +66,21 @@ struct vt_consize {
 #define VT_GETHIFONTMASK 0x560D  /* return hi font mask */
 
 struct vt_event {
-	unsigned int event;
+	__u32 event;
 #define VT_EVENT_SWITCH		0x0001	/* Console switch */
 #define VT_EVENT_BLANK		0x0002	/* Screen blank */
 #define VT_EVENT_UNBLANK	0x0004	/* Screen unblank */
 #define VT_EVENT_RESIZE		0x0008	/* Resize display */
 #define VT_MAX_EVENT		0x000F
-	unsigned int oldev;		/* Old console */
-	unsigned int newev;		/* New console (if changing) */
-	unsigned int pad[4];		/* Padding for expansion */
+	__u32 oldev;		/* Old console */
+	__u32 newev;		/* New console (if changing) */
+	__u32 pad[4];		/* Padding for expansion */
 };
 
 #define VT_WAITEVENT	0x560E	/* Wait for an event */
 
 struct vt_setactivate {
-	unsigned int console;
+	__u32 console;
 	struct vt_mode mode;
 };
 
-- 
cgit v1.2.3


From f1180ca37abe3d117e4a19be12142fe722612a7c Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Wed, 11 Jun 2025 12:02:53 +0200
Subject: tty: vt: use _IO() to define ioctl numbers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

_IO*() is the proper way of defining ioctl numbers. All these vt numbers
were synthetically built up the same way the _IO() macro does.

So instead of implicit hex numbers, use _IO() properly.

To avoid changing the pre-existing numbers, use only _IO() (and not
_IOR() or _IOW()). The latter two would change the numbers, as they
additionally encode the transfer direction and the argument size.

Objdump of vt_ioctl.o reveals no difference with this patch.

Again, VT_GETCONSIZECSRPOS already uses _IOR(), so everything is paved
for this patch.

Signed-off-by: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Cc: Nicolas Pitre <nico@fluxnic.net>
Reviewed-by: Nicolas Pitre <npitre@baylibre.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20250611100319.186924-8-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/uapi/linux/vt.h | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/vt.h b/include/uapi/linux/vt.h
index 714483d68c69..b60fcdfb2746 100644
--- a/include/uapi/linux/vt.h
+++ b/include/uapi/linux/vt.h
@@ -14,9 +14,9 @@
 		/* Note: the ioctl VT_GETSTATE does not work for
 		   consoles 16 and higher (since it returns a short) */
 
-/* 0x56 is 'V', to avoid collision with termios and kd */
+/* 'V' to avoid collision with termios and kd */
 
-#define VT_OPENQRY	0x5600	/* find available vt */
+#define VT_OPENQRY		_IO('V', 0x00)	/* find available vt */
 
 struct vt_mode {
 	__u8 mode;		/* vt mode */
@@ -25,8 +25,8 @@ struct vt_mode {
 	__s16 acqsig;		/* signal to raise on acquisition */
 	__s16 frsig;		/* unused (set to 0) */
 };
-#define VT_GETMODE	0x5601	/* get mode of active vt */
-#define VT_SETMODE	0x5602	/* set mode of active vt */
+#define VT_GETMODE		_IO('V', 0x01)	/* get mode of active vt */
+#define VT_SETMODE		_IO('V', 0x02)	/* set mode of active vt */
 #define		VT_AUTO		0x00	/* auto vt switching */
 #define		VT_PROCESS	0x01	/* process controls switching */
 #define		VT_ACKACQ	0x02	/* acknowledge switch */
@@ -36,21 +36,21 @@ struct vt_stat {
 	__u16 v_signal;	/* signal to send */
 	__u16 v_state;		/* vt bitmask */
 };
-#define VT_GETSTATE	0x5603	/* get global vt state info */
-#define VT_SENDSIG	0x5604	/* signal to send to bitmask of vts */
+#define VT_GETSTATE		_IO('V', 0x03)	/* get global vt state info */
+#define VT_SENDSIG		_IO('V', 0x04)	/* signal to send to bitmask of vts */
 
-#define VT_RELDISP	0x5605	/* release display */
+#define VT_RELDISP		_IO('V', 0x05)	/* release display */
 
-#define VT_ACTIVATE	0x5606	/* make vt active */
-#define VT_WAITACTIVE	0x5607	/* wait for vt active */
-#define VT_DISALLOCATE	0x5608  /* free memory associated to vt */
+#define VT_ACTIVATE		_IO('V', 0x06)	/* make vt active */
+#define VT_WAITACTIVE		_IO('V', 0x07)	/* wait for vt active */
+#define VT_DISALLOCATE		_IO('V', 0x08)  /* free memory associated to vt */
 
 struct vt_sizes {
 	__u16 v_rows;		/* number of rows */
 	__u16 v_cols;		/* number of columns */
 	__u16 v_scrollsize;	/* number of lines of scrollback */
 };
-#define VT_RESIZE	0x5609	/* set kernel's idea of screensize */
+#define VT_RESIZE		_IO('V', 0x09)	/* set kernel's idea of screensize */
 
 struct vt_consize {
 	__u16 v_rows;	/* number of rows */
@@ -60,10 +60,10 @@ struct vt_consize {
 	__u16 v_vcol;	/* number of pixel columns on screen */
 	__u16 v_ccol;	/* number of pixel columns per character */
 };
-#define VT_RESIZEX      0x560A  /* set kernel's idea of screensize + more */
-#define VT_LOCKSWITCH   0x560B  /* disallow vt switching */
-#define VT_UNLOCKSWITCH 0x560C  /* allow vt switching */
-#define VT_GETHIFONTMASK 0x560D  /* return hi font mask */
+#define VT_RESIZEX		_IO('V', 0x0A)  /* set kernel's idea of screensize + more */
+#define VT_LOCKSWITCH		_IO('V', 0x0B)  /* disallow vt switching */
+#define VT_UNLOCKSWITCH		_IO('V', 0x0C)  /* allow vt switching */
+#define VT_GETHIFONTMASK	_IO('V', 0x0D)  /* return hi font mask */
 
 struct vt_event {
 	__u32 event;
@@ -77,14 +77,14 @@ struct vt_event {
 	__u32 pad[4];		/* Padding for expansion */
 };
 
-#define VT_WAITEVENT	0x560E	/* Wait for an event */
+#define VT_WAITEVENT		_IO('V', 0x0E)	/* Wait for an event */
 
 struct vt_setactivate {
 	__u32 console;
 	struct vt_mode mode;
 };
 
-#define VT_SETACTIVATE	0x560F	/* Activate and set the mode of a console */
+#define VT_SETACTIVATE		_IO('V', 0x0F)	/* Activate and set the mode of a console */
 
 /* get console size and cursor position */
 struct vt_consizecsrpos {
-- 
cgit v1.2.3


From fc9ceb501e38cc21066c1638993500b30eda8bdb Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Wed, 11 Jun 2025 12:02:54 +0200
Subject: serial: 8250: sanitize uart_port::serial_{in,out}() types
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

uart_port::{serial_in,serial_out} (and plat_serial8250_port::* likewise)
historically use:
* 'unsigned int' for 32-bit register values in reads and writes, and
* 'int' for offsets.

Make them sane such that:
* 'u32' is used for register values, and
* 'unsigned int' is used for offsets.

While at it, name hooks' parameters, so it is clear what is what.

Signed-off-by: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Paul Cercueil <paul@crapouillou.net>
Cc: Vladimir Zapolskiy <vz@mleia.com>
Cc: Kunihiko Hayashi <hayashi.kunihiko@socionext.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20250611100319.186924-9-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/powerpc/kernel/legacy_serial.c     |  7 ++++---
 drivers/tty/serial/8250/8250_dw.c       | 34 ++++++++++++++++-----------------
 drivers/tty/serial/8250/8250_em.c       |  4 ++--
 drivers/tty/serial/8250/8250_ingenic.c  |  8 ++++----
 drivers/tty/serial/8250/8250_ioc3.c     |  4 ++--
 drivers/tty/serial/8250/8250_lpc18xx.c  |  2 +-
 drivers/tty/serial/8250/8250_pci.c      |  6 +++---
 drivers/tty/serial/8250/8250_port.c     | 30 ++++++++++++++---------------
 drivers/tty/serial/8250/8250_rt288x.c   |  4 ++--
 drivers/tty/serial/8250/8250_uniphier.c |  4 ++--
 include/linux/serial_8250.h             |  4 ++--
 include/linux/serial_core.h             |  4 ++--
 12 files changed, 56 insertions(+), 55 deletions(-)

(limited to 'include')

diff --git a/arch/powerpc/kernel/legacy_serial.c b/arch/powerpc/kernel/legacy_serial.c
index a874eb8e000b..ae1906bfe8a5 100644
--- a/arch/powerpc/kernel/legacy_serial.c
+++ b/arch/powerpc/kernel/legacy_serial.c
@@ -54,9 +54,10 @@ static int legacy_serial_console = -1;
 static const upf_t legacy_port_flags = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST |
 	UPF_SHARE_IRQ | UPF_FIXED_PORT;
 
-static unsigned int tsi_serial_in(struct uart_port *p, int offset)
+static u32 tsi_serial_in(struct uart_port *p, unsigned int offset)
 {
-	unsigned int tmp;
+	u32 tmp;
+
 	offset = offset << p->regshift;
 	if (offset == UART_IIR) {
 		tmp = readl(p->membase + (UART_IIR & ~3));
@@ -65,7 +66,7 @@ static unsigned int tsi_serial_in(struct uart_port *p, int offset)
 		return readb(p->membase + offset);
 }
 
-static void tsi_serial_out(struct uart_port *p, int offset, int value)
+static void tsi_serial_out(struct uart_port *p, unsigned int offset, u32 value)
 {
 	offset = offset << p->regshift;
 	if (!((offset == UART_IER) && (value & UART_IER_UUE)))
diff --git a/drivers/tty/serial/8250/8250_dw.c b/drivers/tty/serial/8250/8250_dw.c
index 1902f29444a1..0a22f0cb8896 100644
--- a/drivers/tty/serial/8250/8250_dw.c
+++ b/drivers/tty/serial/8250/8250_dw.c
@@ -67,8 +67,8 @@ struct dw8250_data {
 	struct dw8250_port_data	data;
 	const struct dw8250_platform_data *pdata;
 
-	int			msr_mask_on;
-	int			msr_mask_off;
+	u32			msr_mask_on;
+	u32			msr_mask_off;
 	struct clk		*clk;
 	struct clk		*pclk;
 	struct notifier_block	clk_notifier;
@@ -94,7 +94,7 @@ static inline struct dw8250_data *work_to_dw8250_data(struct work_struct *work)
 	return container_of(work, struct dw8250_data, clk_work);
 }
 
-static inline int dw8250_modify_msr(struct uart_port *p, int offset, int value)
+static inline u32 dw8250_modify_msr(struct uart_port *p, unsigned int offset, u32 value)
 {
 	struct dw8250_data *d = to_dw8250_data(p->private_data);
 
@@ -145,7 +145,7 @@ static void dw8250_force_idle(struct uart_port *p)
  * routine. Hence, it must not call serial_port_out() or serial_out()
  * against the modified registers here, i.e. LCR.
  */
-static void dw8250_check_lcr(struct uart_port *p, int offset, int value)
+static void dw8250_check_lcr(struct uart_port *p, unsigned int offset, u32 value)
 {
 	struct dw8250_data *d = to_dw8250_data(p->private_data);
 	void __iomem *addr = p->membase + (offset << p->regshift);
@@ -156,7 +156,7 @@ static void dw8250_check_lcr(struct uart_port *p, int offset, int value)
 
 	/* Make sure LCR write wasn't ignored */
 	while (tries--) {
-		unsigned int lcr = serial_port_in(p, offset);
+		u32 lcr = serial_port_in(p, offset);
 
 		if ((value & ~UART_LCR_SPAR) == (lcr & ~UART_LCR_SPAR))
 			return;
@@ -205,13 +205,13 @@ static void dw8250_tx_wait_empty(struct uart_port *p)
 	}
 }
 
-static void dw8250_serial_out(struct uart_port *p, int offset, int value)
+static void dw8250_serial_out(struct uart_port *p, unsigned int offset, u32 value)
 {
 	writeb(value, p->membase + (offset << p->regshift));
 	dw8250_check_lcr(p, offset, value);
 }
 
-static void dw8250_serial_out38x(struct uart_port *p, int offset, int value)
+static void dw8250_serial_out38x(struct uart_port *p, unsigned int offset, u32 value)
 {
 	/* Allow the TX to drain before we reconfigure */
 	if (offset == UART_LCR)
@@ -220,22 +220,22 @@ static void dw8250_serial_out38x(struct uart_port *p, int offset, int value)
 	dw8250_serial_out(p, offset, value);
 }
 
-static unsigned int dw8250_serial_in(struct uart_port *p, int offset)
+static u32 dw8250_serial_in(struct uart_port *p, unsigned int offset)
 {
-	unsigned int value = readb(p->membase + (offset << p->regshift));
+	u32 value = readb(p->membase + (offset << p->regshift));
 
 	return dw8250_modify_msr(p, offset, value);
 }
 
 #ifdef CONFIG_64BIT
-static unsigned int dw8250_serial_inq(struct uart_port *p, int offset)
+static u32 dw8250_serial_inq(struct uart_port *p, unsigned int offset)
 {
 	u8 value = __raw_readq(p->membase + (offset << p->regshift));
 
 	return dw8250_modify_msr(p, offset, value);
 }
 
-static void dw8250_serial_outq(struct uart_port *p, int offset, int value)
+static void dw8250_serial_outq(struct uart_port *p, unsigned int offset, u32 value)
 {
 	value &= 0xff;
 	__raw_writeq(value, p->membase + (offset << p->regshift));
@@ -246,28 +246,28 @@ static void dw8250_serial_outq(struct uart_port *p, int offset, int value)
 }
 #endif /* CONFIG_64BIT */
 
-static void dw8250_serial_out32(struct uart_port *p, int offset, int value)
+static void dw8250_serial_out32(struct uart_port *p, unsigned int offset, u32 value)
 {
 	writel(value, p->membase + (offset << p->regshift));
 	dw8250_check_lcr(p, offset, value);
 }
 
-static unsigned int dw8250_serial_in32(struct uart_port *p, int offset)
+static u32 dw8250_serial_in32(struct uart_port *p, unsigned int offset)
 {
-	unsigned int value = readl(p->membase + (offset << p->regshift));
+	u32 value = readl(p->membase + (offset << p->regshift));
 
 	return dw8250_modify_msr(p, offset, value);
 }
 
-static void dw8250_serial_out32be(struct uart_port *p, int offset, int value)
+static void dw8250_serial_out32be(struct uart_port *p, unsigned int offset, u32 value)
 {
 	iowrite32be(value, p->membase + (offset << p->regshift));
 	dw8250_check_lcr(p, offset, value);
 }
 
-static unsigned int dw8250_serial_in32be(struct uart_port *p, int offset)
+static u32 dw8250_serial_in32be(struct uart_port *p, unsigned int offset)
 {
-       unsigned int value = ioread32be(p->membase + (offset << p->regshift));
+       u32 value = ioread32be(p->membase + (offset << p->regshift));
 
        return dw8250_modify_msr(p, offset, value);
 }
diff --git a/drivers/tty/serial/8250/8250_em.c b/drivers/tty/serial/8250/8250_em.c
index 35094f884492..e90c71494944 100644
--- a/drivers/tty/serial/8250/8250_em.c
+++ b/drivers/tty/serial/8250/8250_em.c
@@ -59,7 +59,7 @@ static void serial8250_em_serial_out_helper(struct uart_port *p, int offset,
 	}
 }
 
-static unsigned int serial8250_em_serial_in(struct uart_port *p, int offset)
+static u32 serial8250_em_serial_in(struct uart_port *p, unsigned int offset)
 {
 	switch (offset) {
 	case UART_RX: /* RX @ 0x00 */
@@ -119,7 +119,7 @@ static void serial8250_em_reg_update(struct uart_port *p, int off, int value)
 	serial8250_em_serial_out_helper(p, UART_HCR0_EM, hcr0);
 }
 
-static void serial8250_em_serial_out(struct uart_port *p, int offset, int value)
+static void serial8250_em_serial_out(struct uart_port *p, unsigned int offset, u32 value)
 {
 	switch (offset) {
 	case UART_TX:
diff --git a/drivers/tty/serial/8250/8250_ingenic.c b/drivers/tty/serial/8250/8250_ingenic.c
index a73dd3773640..94542fc143c2 100644
--- a/drivers/tty/serial/8250/8250_ingenic.c
+++ b/drivers/tty/serial/8250/8250_ingenic.c
@@ -168,9 +168,9 @@ OF_EARLYCON_DECLARE(jz4780_uart, "ingenic,jz4780-uart",
 OF_EARLYCON_DECLARE(x1000_uart, "ingenic,x1000-uart",
 		    ingenic_early_console_setup);
 
-static void ingenic_uart_serial_out(struct uart_port *p, int offset, int value)
+static void ingenic_uart_serial_out(struct uart_port *p, unsigned int offset, u32 value)
 {
-	int ier;
+	u32 ier;
 
 	switch (offset) {
 	case UART_FCR:
@@ -206,9 +206,9 @@ static void ingenic_uart_serial_out(struct uart_port *p, int offset, int value)
 	writeb(value, p->membase + (offset << p->regshift));
 }
 
-static unsigned int ingenic_uart_serial_in(struct uart_port *p, int offset)
+static u32 ingenic_uart_serial_in(struct uart_port *p, unsigned int offset)
 {
-	unsigned int value;
+	u8 value;
 
 	value = readb(p->membase + (offset << p->regshift));
 
diff --git a/drivers/tty/serial/8250/8250_ioc3.c b/drivers/tty/serial/8250/8250_ioc3.c
index 499e80aa4cf9..3ebda9a5d07d 100644
--- a/drivers/tty/serial/8250/8250_ioc3.c
+++ b/drivers/tty/serial/8250/8250_ioc3.c
@@ -21,12 +21,12 @@ struct ioc3_8250_data {
 	int line;
 };
 
-static unsigned int ioc3_serial_in(struct uart_port *p, int offset)
+static u32 ioc3_serial_in(struct uart_port *p, unsigned int offset)
 {
 	return readb(p->membase + (offset ^ 3));
 }
 
-static void ioc3_serial_out(struct uart_port *p, int offset, int value)
+static void ioc3_serial_out(struct uart_port *p, unsigned int offset, u32 value)
 {
 	writeb(value, p->membase + (offset ^ 3));
 }
diff --git a/drivers/tty/serial/8250/8250_lpc18xx.c b/drivers/tty/serial/8250/8250_lpc18xx.c
index d52445948da0..6c0489c9c253 100644
--- a/drivers/tty/serial/8250/8250_lpc18xx.c
+++ b/drivers/tty/serial/8250/8250_lpc18xx.c
@@ -67,7 +67,7 @@ static int lpc18xx_rs485_config(struct uart_port *port, struct ktermios *termios
 	return 0;
 }
 
-static void lpc18xx_uart_serial_out(struct uart_port *p, int offset, int value)
+static void lpc18xx_uart_serial_out(struct uart_port *p, unsigned int offset, u32 value)
 {
 	/*
 	 * For DMA mode one must ensure that the UART_FCR_DMA_SELECT
diff --git a/drivers/tty/serial/8250/8250_pci.c b/drivers/tty/serial/8250/8250_pci.c
index 73c200127b08..152f914c599d 100644
--- a/drivers/tty/serial/8250/8250_pci.c
+++ b/drivers/tty/serial/8250/8250_pci.c
@@ -1751,7 +1751,7 @@ static int pci_fintek_init(struct pci_dev *dev)
 	return max_port;
 }
 
-static void f815xxa_mem_serial_out(struct uart_port *p, int offset, int value)
+static void f815xxa_mem_serial_out(struct uart_port *p, unsigned int offset, u32 value)
 {
 	struct f815xxa_data *data = p->private_data;
 	unsigned long flags;
@@ -1846,10 +1846,10 @@ static void kt_handle_break(struct uart_port *p)
 	serial8250_clear_and_reinit_fifos(up);
 }
 
-static unsigned int kt_serial_in(struct uart_port *p, int offset)
+static u32 kt_serial_in(struct uart_port *p, unsigned int offset)
 {
 	struct uart_8250_port *up = up_to_u8250p(p);
-	unsigned int val;
+	u32 val;
 
 	/*
 	 * When the Intel ME (management engine) gets reset its serial
diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
index 6d7b8c4667c9..f5407832e8a7 100644
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -339,14 +339,14 @@ static void default_serial_dl_write(struct uart_8250_port *up, u32 value)
 }
 
 #ifdef CONFIG_HAS_IOPORT
-static unsigned int hub6_serial_in(struct uart_port *p, int offset)
+static u32 hub6_serial_in(struct uart_port *p, unsigned int offset)
 {
 	offset = offset << p->regshift;
 	outb(p->hub6 - 1 + offset, p->iobase);
 	return inb(p->iobase + 1);
 }
 
-static void hub6_serial_out(struct uart_port *p, int offset, int value)
+static void hub6_serial_out(struct uart_port *p, unsigned int offset, u32 value)
 {
 	offset = offset << p->regshift;
 	outb(p->hub6 - 1 + offset, p->iobase);
@@ -354,73 +354,73 @@ static void hub6_serial_out(struct uart_port *p, int offset, int value)
 }
 #endif /* CONFIG_HAS_IOPORT */
 
-static unsigned int mem_serial_in(struct uart_port *p, int offset)
+static u32 mem_serial_in(struct uart_port *p, unsigned int offset)
 {
 	offset = offset << p->regshift;
 	return readb(p->membase + offset);
 }
 
-static void mem_serial_out(struct uart_port *p, int offset, int value)
+static void mem_serial_out(struct uart_port *p, unsigned int offset, u32 value)
 {
 	offset = offset << p->regshift;
 	writeb(value, p->membase + offset);
 }
 
-static void mem16_serial_out(struct uart_port *p, int offset, int value)
+static void mem16_serial_out(struct uart_port *p, unsigned int offset, u32 value)
 {
 	offset = offset << p->regshift;
 	writew(value, p->membase + offset);
 }
 
-static unsigned int mem16_serial_in(struct uart_port *p, int offset)
+static u32 mem16_serial_in(struct uart_port *p, unsigned int offset)
 {
 	offset = offset << p->regshift;
 	return readw(p->membase + offset);
 }
 
-static void mem32_serial_out(struct uart_port *p, int offset, int value)
+static void mem32_serial_out(struct uart_port *p, unsigned int offset, u32 value)
 {
 	offset = offset << p->regshift;
 	writel(value, p->membase + offset);
 }
 
-static unsigned int mem32_serial_in(struct uart_port *p, int offset)
+static u32 mem32_serial_in(struct uart_port *p, unsigned int offset)
 {
 	offset = offset << p->regshift;
 	return readl(p->membase + offset);
 }
 
-static void mem32be_serial_out(struct uart_port *p, int offset, int value)
+static void mem32be_serial_out(struct uart_port *p, unsigned int offset, u32 value)
 {
 	offset = offset << p->regshift;
 	iowrite32be(value, p->membase + offset);
 }
 
-static unsigned int mem32be_serial_in(struct uart_port *p, int offset)
+static u32 mem32be_serial_in(struct uart_port *p, unsigned int offset)
 {
 	offset = offset << p->regshift;
 	return ioread32be(p->membase + offset);
 }
 
 #ifdef CONFIG_HAS_IOPORT
-static unsigned int io_serial_in(struct uart_port *p, int offset)
+static u32 io_serial_in(struct uart_port *p, unsigned int offset)
 {
 	offset = offset << p->regshift;
 	return inb(p->iobase + offset);
 }
 
-static void io_serial_out(struct uart_port *p, int offset, int value)
+static void io_serial_out(struct uart_port *p, unsigned int offset, u32 value)
 {
 	offset = offset << p->regshift;
 	outb(value, p->iobase + offset);
 }
 #endif
-static unsigned int no_serial_in(struct uart_port *p, int offset)
+static u32 no_serial_in(struct uart_port *p, unsigned int offset)
 {
-	return (unsigned int)-1;
+	return ~0U;
 }
 
-static void no_serial_out(struct uart_port *p, int offset, int value)
+static void no_serial_out(struct uart_port *p, unsigned int offset, u32 value)
 {
 }
 
diff --git a/drivers/tty/serial/8250/8250_rt288x.c b/drivers/tty/serial/8250/8250_rt288x.c
index 6415ca8d3adf..bf28b8a9a710 100644
--- a/drivers/tty/serial/8250/8250_rt288x.c
+++ b/drivers/tty/serial/8250/8250_rt288x.c
@@ -33,7 +33,7 @@ static const u8 au_io_out_map[5] = {
 	[UART_MCR]	= 6,
 };
 
-static unsigned int au_serial_in(struct uart_port *p, int offset)
+static u32 au_serial_in(struct uart_port *p, unsigned int offset)
 {
 	if (offset >= ARRAY_SIZE(au_io_in_map))
 		return UINT_MAX;
@@ -42,7 +42,7 @@ static unsigned int au_serial_in(struct uart_port *p, int offset)
 	return __raw_readl(p->membase + (offset << p->regshift));
 }
 
-static void au_serial_out(struct uart_port *p, int offset, int value)
+static void au_serial_out(struct uart_port *p, unsigned int offset, u32 value)
 {
 	if (offset >= ARRAY_SIZE(au_io_out_map))
 		return;
diff --git a/drivers/tty/serial/8250/8250_uniphier.c b/drivers/tty/serial/8250/8250_uniphier.c
index 4874a9632db3..e3db60bf50c9 100644
--- a/drivers/tty/serial/8250/8250_uniphier.c
+++ b/drivers/tty/serial/8250/8250_uniphier.c
@@ -63,7 +63,7 @@ OF_EARLYCON_DECLARE(uniphier, "socionext,uniphier-uart",
  * The register map is slightly different from that of 8250.
  * IO callbacks must be overridden for correct access to FCR, LCR, MCR and SCR.
  */
-static unsigned int uniphier_serial_in(struct uart_port *p, int offset)
+static u32 uniphier_serial_in(struct uart_port *p, unsigned int offset)
 {
 	unsigned int valshift = 0;
 
@@ -92,7 +92,7 @@ static unsigned int uniphier_serial_in(struct uart_port *p, int offset)
 	return (readl(p->membase + offset) >> valshift) & 0xff;
 }
 
-static void uniphier_serial_out(struct uart_port *p, int offset, int value)
+static void uniphier_serial_out(struct uart_port *p, unsigned int offset, u32 value)
 {
 	unsigned int valshift = 0;
 	bool normal = false;
diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h
index 144de7a7948d..01efdce0fda0 100644
--- a/include/linux/serial_8250.h
+++ b/include/linux/serial_8250.h
@@ -46,8 +46,8 @@ struct plat_serial8250_port {
 	unsigned int	type;		/* If UPF_FIXED_TYPE */
 	upf_t		flags;		/* UPF_* flags */
 	u16		bugs;		/* port bugs */
-	unsigned int	(*serial_in)(struct uart_port *, int);
-	void		(*serial_out)(struct uart_port *, int, int);
+	u32		(*serial_in)(struct uart_port *, unsigned int offset);
+	void		(*serial_out)(struct uart_port *, unsigned int offset, u32 val);
 	u32		(*dl_read)(struct uart_8250_port *up);
 	void		(*dl_write)(struct uart_8250_port *up, u32 value);
 	void		(*set_termios)(struct uart_port *,
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 914b5e97e056..d65b15449cfe 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -443,8 +443,8 @@ struct uart_port {
 	spinlock_t		lock;			/* port lock */
 	unsigned long		iobase;			/* in/out[bwl] */
 	unsigned char __iomem	*membase;		/* read/write[bwl] */
-	unsigned int		(*serial_in)(struct uart_port *, int);
-	void			(*serial_out)(struct uart_port *, int, int);
+	u32			(*serial_in)(struct uart_port *, unsigned int offset);
+	void			(*serial_out)(struct uart_port *, unsigned int offset, u32 val);
 	void			(*set_termios)(struct uart_port *,
 				               struct ktermios *new,
 				               const struct ktermios *old);
-- 
cgit v1.2.3


From b013ed403197f3f8c30ddb3ce66fe05a632b3493 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Mon, 16 Jun 2025 20:33:22 +0100
Subject: fs: consistently use can_mmap_file() helper

Since commit c84bf6dd2b83 ("mm: introduce new .mmap_prepare() file
callback"), the f_op->mmap() hook has been deprecated in favour of
f_op->mmap_prepare().

Additionally, commit bb666b7c2707 ("mm: add mmap_prepare() compatibility
layer for nested file systems") permits the use of the .mmap_prepare() hook
even in nested filesystems like overlayfs.

There are a number of places where we check only for f_op->mmap - this is
incorrect now that mmap_prepare exists, so update all of these to use the
general helper can_mmap_file().

Most notably, this updates the elf logic to allow for the ability to
execute binaries on filesystems which have the .mmap_prepare hook, but
additionally we update nested filesystems.

Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Link: https://lore.kernel.org/b68145b609532e62bab603dd9686faa6562046ec.1750099179.git.lorenzo.stoakes@oracle.com
Acked-by: Kees Cook <kees@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/backing-file.c                | 2 +-
 fs/binfmt_elf.c                  | 4 ++--
 fs/binfmt_elf_fdpic.c            | 2 +-
 fs/coda/file.c                   | 2 +-
 fs/ecryptfs/file.c               | 2 +-
 include/linux/fs.h               | 2 +-
 mm/mmap.c                        | 2 +-
 mm/nommu.c                       | 2 +-
 tools/testing/vma/vma_internal.h | 2 +-
 9 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/fs/backing-file.c b/fs/backing-file.c
index 04018679bf69..29748953a851 100644
--- a/fs/backing-file.c
+++ b/fs/backing-file.c
@@ -333,7 +333,7 @@ int backing_file_mmap(struct file *file, struct vm_area_struct *vma,
 	if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)))
 		return -EIO;
 
-	if (!file->f_op->mmap)
+	if (!can_mmap_file(file))
 		return -ENODEV;
 
 	vma_set_file(vma, file);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index a43363d593e5..e3b56b603192 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -646,7 +646,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 	if (!elf_check_arch(interp_elf_ex) ||
 	    elf_check_fdpic(interp_elf_ex))
 		goto out;
-	if (!interpreter->f_op->mmap)
+	if (!can_mmap_file(interpreter))
 		goto out;
 
 	total_size = total_mapping_size(interp_elf_phdata,
@@ -848,7 +848,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
 		goto out;
 	if (elf_check_fdpic(elf_ex))
 		goto out;
-	if (!bprm->file->f_op->mmap)
+	if (!can_mmap_file(bprm->file))
 		goto out;
 
 	elf_phdata = load_elf_phdrs(elf_ex, bprm->file);
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 9133f3827f90..59b138062352 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -109,7 +109,7 @@ static int is_elf(struct elfhdr *hdr, struct file *file)
 		return 0;
 	if (!elf_check_arch(hdr))
 		return 0;
-	if (!file->f_op->mmap)
+	if (!can_mmap_file(file))
 		return 0;
 	return 1;
 }
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 2e6ea9319b35..a390b5d21196 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -160,7 +160,7 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
 	size_t count;
 	int ret;
 
-	if (!host_file->f_op->mmap)
+	if (!can_mmap_file(host_file))
 		return -ENODEV;
 
 	if (WARN_ON(coda_file != vma->vm_file))
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index ce0a3c5ed0ca..5f8f96da09fe 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -193,7 +193,7 @@ static int ecryptfs_mmap(struct file *file, struct vm_area_struct *vma)
 	 * natively.  If FILESYSTEM_MAX_STACK_DEPTH > 2 or ecryptfs
 	 * allows recursive mounting, this will need to be extended.
 	 */
-	if (!lower_file->f_op->mmap)
+	if (!can_mmap_file(lower_file))
 		return -ENODEV;
 	return generic_file_mmap(file, vma);
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c66f235f9e4d..d4fa1cb0755a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2260,7 +2260,7 @@ struct inode_operations {
 } ____cacheline_aligned;
 
 /* Did the driver provide valid mmap hook configuration? */
-static inline bool file_has_valid_mmap_hooks(struct file *file)
+static inline bool can_mmap_file(struct file *file)
 {
 	bool has_mmap = file->f_op->mmap;
 	bool has_mmap_prepare = file->f_op->mmap_prepare;
diff --git a/mm/mmap.c b/mm/mmap.c
index 09c563c95112..12c1d060f104 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -475,7 +475,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 				vm_flags &= ~VM_MAYEXEC;
 			}
 
-			if (!file_has_valid_mmap_hooks(file))
+			if (!can_mmap_file(file))
 				return -ENODEV;
 			if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
 				return -EINVAL;
diff --git a/mm/nommu.c b/mm/nommu.c
index 38c22ea0a95c..56a53de10166 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -719,7 +719,7 @@ static int validate_mmap_request(struct file *file,
 
 	if (file) {
 		/* files must support mmap */
-		if (!file_has_valid_mmap_hooks(file))
+		if (!can_mmap_file(file))
 			return -ENODEV;
 
 		/* work out if what we've got could possibly be shared
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 7ab04700470f..816e7e057585 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -1464,7 +1464,7 @@ static int compat_vma_mmap_prepare(struct file *file,
 }
 
 /* Did the driver provide valid mmap hook configuration? */
-static inline bool file_has_valid_mmap_hooks(struct file *file)
+static inline bool can_mmap_file(struct file *file)
 {
 	bool has_mmap = file->f_op->mmap;
 	bool has_mmap_prepare = file->f_op->mmap_prepare;
-- 
cgit v1.2.3


From 0335f6afd3488d1101f3b15014095fa51b978253 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Mon, 16 Jun 2025 20:33:23 +0100
Subject: fs/dax: make it possible to check dev dax support without a VMA

This is a prerequisite for adapting the filesystems which invoke this
check to use the .mmap_prepare() hook for mmap()'ing, as this hook does
not have access to a VMA pointer.

To effect this, change the signature of daxdev_mapping_supported() and
update its callers (ext4 and xfs mmap()'ing hook code).

Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Link: https://lore.kernel.org/b09de1e8544384074165d92d048e80058d971286.1750099179.git.lorenzo.stoakes@oracle.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/ext4/file.c      |  2 +-
 fs/xfs/xfs_file.c   |  3 ++-
 include/linux/dax.h | 16 +++++++++-------
 3 files changed, 12 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 21df81347147..5b8b95936a4c 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -821,7 +821,7 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
 	 * We don't support synchronous mappings for non-DAX files and
 	 * for DAX files if underneath dax_device is not synchronous.
 	 */
-	if (!daxdev_mapping_supported(vma, dax_dev))
+	if (!daxdev_mapping_supported(vma->vm_flags, file_inode(vma->vm_file), dax_dev))
 		return -EOPNOTSUPP;
 
 	file_accessed(file);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 48254a72071b..1160eba8727f 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1924,7 +1924,8 @@ xfs_file_mmap(
 	 * We don't support synchronous mappings for non-DAX files and
 	 * for DAX files if underneath dax_device is not synchronous.
 	 */
-	if (!daxdev_mapping_supported(vma, target->bt_daxdev))
+	if (!daxdev_mapping_supported(vma->vm_flags, file_inode(vma->vm_file),
+				      target->bt_daxdev))
 		return -EOPNOTSUPP;
 
 	file_accessed(file);
diff --git a/include/linux/dax.h b/include/linux/dax.h
index dcc9fcdf14e4..78891518291d 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -65,12 +65,13 @@ size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
 /*
  * Check if given mapping is supported by the file / underlying device.
  */
-static inline bool daxdev_mapping_supported(struct vm_area_struct *vma,
-					     struct dax_device *dax_dev)
+static inline bool daxdev_mapping_supported(vm_flags_t vm_flags,
+					    const struct inode *inode,
+					    struct dax_device *dax_dev)
 {
-	if (!(vma->vm_flags & VM_SYNC))
+	if (!(vm_flags & VM_SYNC))
 		return true;
-	if (!IS_DAX(file_inode(vma->vm_file)))
+	if (!IS_DAX(inode))
 		return false;
 	return dax_synchronous(dax_dev);
 }
@@ -110,10 +111,11 @@ static inline void set_dax_nomc(struct dax_device *dax_dev)
 static inline void set_dax_synchronous(struct dax_device *dax_dev)
 {
 }
-static inline bool daxdev_mapping_supported(struct vm_area_struct *vma,
-				struct dax_device *dax_dev)
+static inline bool daxdev_mapping_supported(vm_flags_t vm_flags,
+					    const struct inode *inode,
+					    struct dax_device *dax_dev)
 {
-	return !(vma->vm_flags & VM_SYNC);
+	return !(vm_flags & VM_SYNC);
 }
 static inline size_t dax_recovery_write(struct dax_device *dax_dev,
 		pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i)
-- 
cgit v1.2.3


From 5b44297bcfa49ee197cdb8ca6164bef120c4e73c Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Mon, 16 Jun 2025 20:33:26 +0100
Subject: mm/filemap: introduce generic_file_*_mmap_prepare() helpers

Since commit c84bf6dd2b83 ("mm: introduce new .mmap_prepare() file
callback"), the f_op->mmap() hook has been deprecated in favour of
f_op->mmap_prepare().

The generic mmap handlers are very simple, so we can very easily convert
these in advance of converting file systems which use them.

This patch does so.

Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Link: https://lore.kernel.org/30622c1f0b98c66840bc8c02668bda276a810b70.1750099179.git.lorenzo.stoakes@oracle.com
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h |  6 ++++--
 mm/filemap.c       | 29 +++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index d4fa1cb0755a..fd5e7409489d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3393,8 +3393,10 @@ extern void inode_add_lru(struct inode *inode);
 extern int sb_set_blocksize(struct super_block *, int);
 extern int sb_min_blocksize(struct super_block *, int);
 
-extern int generic_file_mmap(struct file *, struct vm_area_struct *);
-extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
+int generic_file_mmap(struct file *, struct vm_area_struct *);
+int generic_file_mmap_prepare(struct vm_area_desc *desc);
+int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
+int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc);
 extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
 int generic_write_checks_count(struct kiocb *iocb, loff_t *count);
 extern int generic_write_check_limits(struct file *file, loff_t pos,
diff --git a/mm/filemap.c b/mm/filemap.c
index bada249b9fb7..77e1bac30490 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3814,6 +3814,18 @@ int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
 	return 0;
 }
 
+int generic_file_mmap_prepare(struct vm_area_desc *desc)
+{
+	struct file *file = desc->file;
+	struct address_space *mapping = file->f_mapping;
+
+	if (!mapping->a_ops->read_folio)
+		return -ENOEXEC;
+	file_accessed(file);
+	desc->vm_ops = &generic_file_vm_ops;
+	return 0;
+}
+
 /*
  * This is for filesystems which do not implement ->writepage.
  */
@@ -3823,6 +3835,13 @@ int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
 		return -EINVAL;
 	return generic_file_mmap(file, vma);
 }
+
+int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc)
+{
+	if (is_shared_maywrite(desc->vm_flags))
+		return -EINVAL;
+	return generic_file_mmap_prepare(desc);
+}
 #else
 vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
 {
@@ -3832,15 +3851,25 @@ int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	return -ENOSYS;
 }
+int generic_file_mmap_prepare(struct vm_area_desc *desc)
+{
+	return -ENOSYS;
+}
 int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	return -ENOSYS;
 }
+int generic_file_readonly_mmap_prepare(struct vm_area_desc *desc)
+{
+	return -ENOSYS;
+}
 #endif /* CONFIG_MMU */
 
 EXPORT_SYMBOL(filemap_page_mkwrite);
 EXPORT_SYMBOL(generic_file_mmap);
+EXPORT_SYMBOL(generic_file_mmap_prepare);
 EXPORT_SYMBOL(generic_file_readonly_mmap);
+EXPORT_SYMBOL(generic_file_readonly_mmap_prepare);
 
 static struct folio *do_read_cache_folio(struct address_space *mapping,
 		pgoff_t index, filler_t filler, struct file *file, gfp_t gfp)
-- 
cgit v1.2.3


From a85b8544d46390469b6ca72d6bfd3ecb7be985ff Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Sat, 14 Jun 2025 00:30:37 +0200
Subject: wifi: remove zero-length arrays

All of these are really meant to be variable-length, and
in the case of s1g_beacon it's actually accessed. Make that
one in particular, and a couple of others (that aren't used
as arrays now), actually variable.

Reported-by: syzbot+fd222bb38e916df26fa4@syzkaller.appspotmail.com
Fixes: 1e1f706fc2ce ("wifi: cfg80211/mac80211: correctly parse S1G beacon optional elements")
Link: https://patch.msgid.link/20250614003037.a3e82e882251.I2e8b58e56ff2a9f8b06c66f036578b7c1d4e4685@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index ce377f7fb912..22f39e5e2ff1 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1278,7 +1278,7 @@ struct ieee80211_ext {
 			u8 sa[ETH_ALEN];
 			__le32 timestamp;
 			u8 change_seq;
-			u8 variable[0];
+			u8 variable[];
 		} __packed s1g_beacon;
 	} u;
 } __packed __aligned(2);
@@ -1536,7 +1536,7 @@ struct ieee80211_mgmt {
 					u8 action_code;
 					u8 dialog_token;
 					__le16 capability;
-					u8 variable[0];
+					u8 variable[];
 				} __packed tdls_discover_resp;
 				struct {
 					u8 action_code;
@@ -1721,35 +1721,35 @@ struct ieee80211_tdls_data {
 		struct {
 			u8 dialog_token;
 			__le16 capability;
-			u8 variable[0];
+			u8 variable[];
 		} __packed setup_req;
 		struct {
 			__le16 status_code;
 			u8 dialog_token;
 			__le16 capability;
-			u8 variable[0];
+			u8 variable[];
 		} __packed setup_resp;
 		struct {
 			__le16 status_code;
 			u8 dialog_token;
-			u8 variable[0];
+			u8 variable[];
 		} __packed setup_cfm;
 		struct {
 			__le16 reason_code;
-			u8 variable[0];
+			u8 variable[];
 		} __packed teardown;
 		struct {
 			u8 dialog_token;
-			u8 variable[0];
+			u8 variable[];
 		} __packed discover_req;
 		struct {
 			u8 target_channel;
 			u8 oper_class;
-			u8 variable[0];
+			u8 variable[];
 		} __packed chan_switch_req;
 		struct {
 			__le16 status_code;
-			u8 variable[0];
+			u8 variable[];
 		} __packed chan_switch_resp;
 	} u;
 } __packed;
-- 
cgit v1.2.3


From d8010d4ba43e9f790925375a7de100604a5e2dba Mon Sep 17 00:00:00 2001
From: "Borislav Petkov (AMD)" <bp@alien8.de>
Date: Wed, 11 Sep 2024 10:53:08 +0200
Subject: x86/bugs: Add a Transient Scheduler Attacks mitigation

Add the required features detection glue to bugs.c et al. in order to
support the TSA mitigation.

Co-developed-by: Kim Phillips <kim.phillips@amd.com>
Signed-off-by: Kim Phillips <kim.phillips@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
---
 Documentation/ABI/testing/sysfs-devices-system-cpu |   1 +
 Documentation/admin-guide/kernel-parameters.txt    |  13 +++
 arch/x86/Kconfig                                   |   9 ++
 arch/x86/include/asm/cpufeatures.h                 |   6 +-
 arch/x86/include/asm/mwait.h                       |   2 +-
 arch/x86/include/asm/nospec-branch.h               |  14 ++-
 arch/x86/kernel/cpu/amd.c                          |  44 ++++++++
 arch/x86/kernel/cpu/bugs.c                         | 124 +++++++++++++++++++++
 arch/x86/kernel/cpu/common.c                       |  14 ++-
 arch/x86/kernel/cpu/scattered.c                    |   2 +
 arch/x86/kvm/svm/vmenter.S                         |   6 +
 drivers/base/cpu.c                                 |   3 +
 include/linux/cpu.h                                |   1 +
 13 files changed, 232 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index bf85f4de6862..ab8cd337f43a 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -584,6 +584,7 @@ What:		/sys/devices/system/cpu/vulnerabilities
 		/sys/devices/system/cpu/vulnerabilities/spectre_v1
 		/sys/devices/system/cpu/vulnerabilities/spectre_v2
 		/sys/devices/system/cpu/vulnerabilities/srbds
+		/sys/devices/system/cpu/vulnerabilities/tsa
 		/sys/devices/system/cpu/vulnerabilities/tsx_async_abort
 Date:		January 2018
 Contact:	Linux kernel mailing list <linux-kernel@vger.kernel.org>
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index f1f2c0874da9..07e22ba5bfe3 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -7488,6 +7488,19 @@
 			having this key zero'ed is acceptable. E.g. in testing
 			scenarios.
 
+	tsa=		[X86] Control mitigation for Transient Scheduler
+			Attacks on AMD CPUs. Search the following in your
+			favourite search engine for more details:
+
+			"Technical guidance for mitigating transient scheduler
+			attacks".
+
+			off		- disable the mitigation
+			on		- enable the mitigation (default)
+			user		- mitigate only user/kernel transitions
+			vm		- mitigate only guest/host transitions
+
+
 	tsc=		Disable clocksource stability checks for TSC.
 			Format: <string>
 			[x86] reliable: mark tsc clocksource as reliable, this
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 340e5468980e..71dfe7d7c786 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2695,6 +2695,15 @@ config MITIGATION_ITS
 	  disabled, mitigation cannot be enabled via cmdline.
 	  See <file:Documentation/admin-guide/hw-vuln/indirect-target-selection.rst>
 
+config MITIGATION_TSA
+	bool "Mitigate Transient Scheduler Attacks"
+	depends on CPU_SUP_AMD
+	default y
+	help
+	  Enable mitigation for Transient Scheduler Attacks. TSA is a hardware
+	  security vulnerability on AMD CPUs which can lead to forwarding of
+	  invalid info to subsequent instructions and thus can affect their
+	  timing and thereby cause a leakage.
 endif
 
 config ARCH_HAS_ADD_PAGES
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index ee176236c2be..286d509f9363 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -456,6 +456,7 @@
 #define X86_FEATURE_NO_NESTED_DATA_BP	(20*32+ 0) /* No Nested Data Breakpoints */
 #define X86_FEATURE_WRMSR_XX_BASE_NS	(20*32+ 1) /* WRMSR to {FS,GS,KERNEL_GS}_BASE is non-serializing */
 #define X86_FEATURE_LFENCE_RDTSC	(20*32+ 2) /* LFENCE always serializing / synchronizes RDTSC */
+#define X86_FEATURE_VERW_CLEAR		(20*32+ 5) /* The memory form of VERW mitigates TSA */
 #define X86_FEATURE_NULL_SEL_CLR_BASE	(20*32+ 6) /* Null Selector Clears Base */
 #define X86_FEATURE_AUTOIBRS		(20*32+ 8) /* Automatic IBRS */
 #define X86_FEATURE_NO_SMM_CTL_MSR	(20*32+ 9) /* SMM_CTL MSR is not present */
@@ -487,6 +488,9 @@
 #define X86_FEATURE_PREFER_YMM		(21*32+ 8) /* Avoid ZMM registers due to downclocking */
 #define X86_FEATURE_APX			(21*32+ 9) /* Advanced Performance Extensions */
 #define X86_FEATURE_INDIRECT_THUNK_ITS	(21*32+10) /* Use thunk for indirect branches in lower half of cacheline */
+#define X86_FEATURE_TSA_SQ_NO		(21*32+11) /* AMD CPU not vulnerable to TSA-SQ */
+#define X86_FEATURE_TSA_L1_NO		(21*32+12) /* AMD CPU not vulnerable to TSA-L1 */
+#define X86_FEATURE_CLEAR_CPU_BUF_VM	(21*32+13) /* Clear CPU buffers using VERW before VMRUN */
 
 /*
  * BUG word(s)
@@ -542,5 +546,5 @@
 #define X86_BUG_OLD_MICROCODE		X86_BUG( 1*32+ 6) /* "old_microcode" CPU has old microcode, it is surely vulnerable to something */
 #define X86_BUG_ITS			X86_BUG( 1*32+ 7) /* "its" CPU is affected by Indirect Target Selection */
 #define X86_BUG_ITS_NATIVE_ONLY		X86_BUG( 1*32+ 8) /* "its_native_only" CPU is affected by ITS, VMX is not affected */
-
+#define X86_BUG_TSA			X86_BUG( 1*32+ 9) /* "tsa" CPU is affected by Transient Scheduler Attacks */
 #endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index cc34c3fd197b..82bd9eb73b3c 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -80,7 +80,7 @@ static __always_inline void __mwait(u32 eax, u32 ecx)
  */
 static __always_inline void __mwaitx(u32 eax, u32 ebx, u32 ecx)
 {
-	/* No MDS buffer clear as this is AMD/HYGON only */
+	/* No need for TSA buffer clearing on AMD */
 
 	/* "mwaitx %eax, %ebx, %ecx" */
 	asm volatile(".byte 0x0f, 0x01, 0xfb"
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 5dcd75bb5e0d..10f261678749 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -308,19 +308,25 @@
  * CFLAGS.ZF.
  * Note: Only the memory operand variant of VERW clears the CPU buffers.
  */
-.macro CLEAR_CPU_BUFFERS
+.macro __CLEAR_CPU_BUFFERS feature
 #ifdef CONFIG_X86_64
-	ALTERNATIVE "", "verw x86_verw_sel(%rip)", X86_FEATURE_CLEAR_CPU_BUF
+	ALTERNATIVE "", "verw x86_verw_sel(%rip)", \feature
 #else
 	/*
 	 * In 32bit mode, the memory operand must be a %cs reference. The data
 	 * segments may not be usable (vm86 mode), and the stack segment may not
 	 * be flat (ESPFIX32).
 	 */
-	ALTERNATIVE "", "verw %cs:x86_verw_sel", X86_FEATURE_CLEAR_CPU_BUF
+	ALTERNATIVE "", "verw %cs:x86_verw_sel", \feature
 #endif
 .endm
 
+#define CLEAR_CPU_BUFFERS \
+	__CLEAR_CPU_BUFFERS X86_FEATURE_CLEAR_CPU_BUF
+
+#define VM_CLEAR_CPU_BUFFERS \
+	__CLEAR_CPU_BUFFERS X86_FEATURE_CLEAR_CPU_BUF_VM
+
 #ifdef CONFIG_X86_64
 .macro CLEAR_BRANCH_HISTORY
 	ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_LOOP
@@ -602,7 +608,7 @@ static __always_inline void x86_clear_cpu_buffers(void)
 
 /**
  * x86_idle_clear_cpu_buffers - Buffer clearing support in idle for the MDS
- * vulnerability
+ * and TSA vulnerabilities.
  *
  * Clear CPU buffers if the corresponding static key is enabled
  */
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 93da466dfe2c..23c535871a7e 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -377,6 +377,47 @@ static void bsp_determine_snp(struct cpuinfo_x86 *c)
 #endif
 }
 
+#define ZEN_MODEL_STEP_UCODE(fam, model, step, ucode) \
+	X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, fam, model), \
+			    step, step, ucode)
+
+static const struct x86_cpu_id amd_tsa_microcode[] = {
+	ZEN_MODEL_STEP_UCODE(0x19, 0x01, 0x1, 0x0a0011d7),
+	ZEN_MODEL_STEP_UCODE(0x19, 0x01, 0x2, 0x0a00123b),
+	ZEN_MODEL_STEP_UCODE(0x19, 0x08, 0x2, 0x0a00820d),
+	ZEN_MODEL_STEP_UCODE(0x19, 0x11, 0x1, 0x0a10114c),
+	ZEN_MODEL_STEP_UCODE(0x19, 0x11, 0x2, 0x0a10124c),
+	ZEN_MODEL_STEP_UCODE(0x19, 0x18, 0x1, 0x0a108109),
+	ZEN_MODEL_STEP_UCODE(0x19, 0x21, 0x0, 0x0a20102e),
+	ZEN_MODEL_STEP_UCODE(0x19, 0x21, 0x2, 0x0a201211),
+	ZEN_MODEL_STEP_UCODE(0x19, 0x44, 0x1, 0x0a404108),
+	ZEN_MODEL_STEP_UCODE(0x19, 0x50, 0x0, 0x0a500012),
+	ZEN_MODEL_STEP_UCODE(0x19, 0x61, 0x2, 0x0a60120a),
+	ZEN_MODEL_STEP_UCODE(0x19, 0x74, 0x1, 0x0a704108),
+	ZEN_MODEL_STEP_UCODE(0x19, 0x75, 0x2, 0x0a705208),
+	ZEN_MODEL_STEP_UCODE(0x19, 0x78, 0x0, 0x0a708008),
+	ZEN_MODEL_STEP_UCODE(0x19, 0x7c, 0x0, 0x0a70c008),
+	ZEN_MODEL_STEP_UCODE(0x19, 0xa0, 0x2, 0x0aa00216),
+	{},
+};
+
+static void tsa_init(struct cpuinfo_x86 *c)
+{
+	if (cpu_has(c, X86_FEATURE_HYPERVISOR))
+		return;
+
+	if (cpu_has(c, X86_FEATURE_ZEN3) ||
+	    cpu_has(c, X86_FEATURE_ZEN4)) {
+		if (x86_match_min_microcode_rev(amd_tsa_microcode))
+			setup_force_cpu_cap(X86_FEATURE_VERW_CLEAR);
+		else
+			pr_debug("%s: current revision: 0x%x\n", __func__, c->microcode);
+	} else {
+		setup_force_cpu_cap(X86_FEATURE_TSA_SQ_NO);
+		setup_force_cpu_cap(X86_FEATURE_TSA_L1_NO);
+	}
+}
+
 static void bsp_init_amd(struct cpuinfo_x86 *c)
 {
 	if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
@@ -489,6 +530,9 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
 	}
 
 	bsp_determine_snp(c);
+
+	tsa_init(c);
+
 	return;
 
 warn:
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 258ed3d2b6a9..f4d3abb12317 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -94,6 +94,8 @@ static void __init bhi_apply_mitigation(void);
 static void __init its_select_mitigation(void);
 static void __init its_update_mitigation(void);
 static void __init its_apply_mitigation(void);
+static void __init tsa_select_mitigation(void);
+static void __init tsa_apply_mitigation(void);
 
 /* The base value of the SPEC_CTRL MSR without task-specific bits set */
 u64 x86_spec_ctrl_base;
@@ -225,6 +227,7 @@ void __init cpu_select_mitigations(void)
 	gds_select_mitigation();
 	its_select_mitigation();
 	bhi_select_mitigation();
+	tsa_select_mitigation();
 
 	/*
 	 * After mitigations are selected, some may need to update their
@@ -272,6 +275,7 @@ void __init cpu_select_mitigations(void)
 	gds_apply_mitigation();
 	its_apply_mitigation();
 	bhi_apply_mitigation();
+	tsa_apply_mitigation();
 }
 
 /*
@@ -1487,6 +1491,94 @@ static void __init its_apply_mitigation(void)
 	set_return_thunk(its_return_thunk);
 }
 
+#undef pr_fmt
+#define pr_fmt(fmt)	"Transient Scheduler Attacks: " fmt
+
+enum tsa_mitigations {
+	TSA_MITIGATION_NONE,
+	TSA_MITIGATION_AUTO,
+	TSA_MITIGATION_UCODE_NEEDED,
+	TSA_MITIGATION_USER_KERNEL,
+	TSA_MITIGATION_VM,
+	TSA_MITIGATION_FULL,
+};
+
+static const char * const tsa_strings[] = {
+	[TSA_MITIGATION_NONE]		= "Vulnerable",
+	[TSA_MITIGATION_UCODE_NEEDED]	= "Vulnerable: No microcode",
+	[TSA_MITIGATION_USER_KERNEL]	= "Mitigation: Clear CPU buffers: user/kernel boundary",
+	[TSA_MITIGATION_VM]		= "Mitigation: Clear CPU buffers: VM",
+	[TSA_MITIGATION_FULL]		= "Mitigation: Clear CPU buffers",
+};
+
+static enum tsa_mitigations tsa_mitigation __ro_after_init =
+	IS_ENABLED(CONFIG_MITIGATION_TSA) ? TSA_MITIGATION_AUTO : TSA_MITIGATION_NONE;
+
+static int __init tsa_parse_cmdline(char *str)
+{
+	if (!str)
+		return -EINVAL;
+
+	if (!strcmp(str, "off"))
+		tsa_mitigation = TSA_MITIGATION_NONE;
+	else if (!strcmp(str, "on"))
+		tsa_mitigation = TSA_MITIGATION_FULL;
+	else if (!strcmp(str, "user"))
+		tsa_mitigation = TSA_MITIGATION_USER_KERNEL;
+	else if (!strcmp(str, "vm"))
+		tsa_mitigation = TSA_MITIGATION_VM;
+	else
+		pr_err("Ignoring unknown tsa=%s option.\n", str);
+
+	return 0;
+}
+early_param("tsa", tsa_parse_cmdline);
+
+static void __init tsa_select_mitigation(void)
+{
+	if (cpu_mitigations_off() || !boot_cpu_has_bug(X86_BUG_TSA)) {
+		tsa_mitigation = TSA_MITIGATION_NONE;
+		return;
+	}
+
+	if (tsa_mitigation == TSA_MITIGATION_NONE)
+		return;
+
+	if (!boot_cpu_has(X86_FEATURE_VERW_CLEAR)) {
+		tsa_mitigation = TSA_MITIGATION_UCODE_NEEDED;
+		goto out;
+	}
+
+	if (tsa_mitigation == TSA_MITIGATION_AUTO)
+		tsa_mitigation = TSA_MITIGATION_FULL;
+
+	/*
+	 * No need to set verw_clear_cpu_buf_mitigation_selected - it
+	 * doesn't fit all cases here and it is not needed because this
+	 * is the only VERW-based mitigation on AMD.
+	 */
+out:
+	pr_info("%s\n", tsa_strings[tsa_mitigation]);
+}
+
+static void __init tsa_apply_mitigation(void)
+{
+	switch (tsa_mitigation) {
+	case TSA_MITIGATION_USER_KERNEL:
+		setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+		break;
+	case TSA_MITIGATION_VM:
+		setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM);
+		break;
+	case TSA_MITIGATION_FULL:
+		setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+		setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM);
+		break;
+	default:
+		break;
+	}
+}
+
 #undef pr_fmt
 #define pr_fmt(fmt)     "Spectre V2 : " fmt
 
@@ -2316,6 +2408,25 @@ void cpu_bugs_smt_update(void)
 		break;
 	}
 
+	switch (tsa_mitigation) {
+	case TSA_MITIGATION_USER_KERNEL:
+	case TSA_MITIGATION_VM:
+	case TSA_MITIGATION_AUTO:
+	case TSA_MITIGATION_FULL:
+		/*
+		 * TSA-SQ can potentially lead to info leakage between
+		 * SMT threads.
+		 */
+		if (sched_smt_active())
+			static_branch_enable(&cpu_buf_idle_clear);
+		else
+			static_branch_disable(&cpu_buf_idle_clear);
+		break;
+	case TSA_MITIGATION_NONE:
+	case TSA_MITIGATION_UCODE_NEEDED:
+		break;
+	}
+
 	mutex_unlock(&spec_ctrl_mutex);
 }
 
@@ -3265,6 +3376,11 @@ static ssize_t gds_show_state(char *buf)
 	return sysfs_emit(buf, "%s\n", gds_strings[gds_mitigation]);
 }
 
+static ssize_t tsa_show_state(char *buf)
+{
+	return sysfs_emit(buf, "%s\n", tsa_strings[tsa_mitigation]);
+}
+
 static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
 			       char *buf, unsigned int bug)
 {
@@ -3328,6 +3444,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
 	case X86_BUG_ITS:
 		return its_show_state(buf);
 
+	case X86_BUG_TSA:
+		return tsa_show_state(buf);
+
 	default:
 		break;
 	}
@@ -3414,6 +3533,11 @@ ssize_t cpu_show_indirect_target_selection(struct device *dev, struct device_att
 {
 	return cpu_show_common(dev, attr, buf, X86_BUG_ITS);
 }
+
+ssize_t cpu_show_tsa(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return cpu_show_common(dev, attr, buf, X86_BUG_TSA);
+}
 #endif
 
 void __warn_thunk(void)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 8feb8fd2957a..f7b9fca82bda 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1233,6 +1233,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
 #define ITS		BIT(8)
 /* CPU is affected by Indirect Target Selection, but guest-host isolation is not affected */
 #define ITS_NATIVE_ONLY	BIT(9)
+/* CPU is affected by Transient Scheduler Attacks */
+#define TSA		BIT(10)
 
 static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
 	VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE,	     X86_STEP_MAX,	SRBDS),
@@ -1280,7 +1282,7 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
 	VULNBL_AMD(0x16, RETBLEED),
 	VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO),
 	VULNBL_HYGON(0x18, RETBLEED | SMT_RSB | SRSO),
-	VULNBL_AMD(0x19, SRSO),
+	VULNBL_AMD(0x19, SRSO | TSA),
 	VULNBL_AMD(0x1a, SRSO),
 	{}
 };
@@ -1530,6 +1532,16 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
 			setup_force_cpu_bug(X86_BUG_ITS_NATIVE_ONLY);
 	}
 
+	if (c->x86_vendor == X86_VENDOR_AMD) {
+		if (!cpu_has(c, X86_FEATURE_TSA_SQ_NO) ||
+		    !cpu_has(c, X86_FEATURE_TSA_L1_NO)) {
+			if (cpu_matches(cpu_vuln_blacklist, TSA) ||
+			    /* Enable bug on Zen guests to allow for live migration. */
+			    (cpu_has(c, X86_FEATURE_HYPERVISOR) && cpu_has(c, X86_FEATURE_ZEN)))
+				setup_force_cpu_bug(X86_BUG_TSA);
+		}
+	}
+
 	if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN))
 		return;
 
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index dbf6d71bdf18..b4a1f6732a3a 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -50,6 +50,8 @@ static const struct cpuid_bit cpuid_bits[] = {
 	{ X86_FEATURE_MBA,			CPUID_EBX,  6, 0x80000008, 0 },
 	{ X86_FEATURE_SMBA,			CPUID_EBX,  2, 0x80000020, 0 },
 	{ X86_FEATURE_BMEC,			CPUID_EBX,  3, 0x80000020, 0 },
+	{ X86_FEATURE_TSA_SQ_NO,		CPUID_ECX,  1, 0x80000021, 0 },
+	{ X86_FEATURE_TSA_L1_NO,		CPUID_ECX,  2, 0x80000021, 0 },
 	{ X86_FEATURE_AMD_WORKLOAD_CLASS,	CPUID_EAX, 22, 0x80000021, 0 },
 	{ X86_FEATURE_PERFMON_V2,		CPUID_EAX,  0, 0x80000022, 0 },
 	{ X86_FEATURE_AMD_LBR_V2,		CPUID_EAX,  1, 0x80000022, 0 },
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index 0c61153b275f..235c4af6b692 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -169,6 +169,9 @@ SYM_FUNC_START(__svm_vcpu_run)
 #endif
 	mov VCPU_RDI(%_ASM_DI), %_ASM_DI
 
+	/* Clobbers EFLAGS.ZF */
+	VM_CLEAR_CPU_BUFFERS
+
 	/* Enter guest mode */
 3:	vmrun %_ASM_AX
 4:
@@ -335,6 +338,9 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
 	mov SVM_current_vmcb(%rdi), %rax
 	mov KVM_VMCB_pa(%rax), %rax
 
+	/* Clobbers EFLAGS.ZF */
+	VM_CLEAR_CPU_BUFFERS
+
 	/* Enter guest mode */
 1:	vmrun %rax
 2:
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 7779ab0ca7ce..efc575a00edd 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -602,6 +602,7 @@ CPU_SHOW_VULN_FALLBACK(reg_file_data_sampling);
 CPU_SHOW_VULN_FALLBACK(ghostwrite);
 CPU_SHOW_VULN_FALLBACK(old_microcode);
 CPU_SHOW_VULN_FALLBACK(indirect_target_selection);
+CPU_SHOW_VULN_FALLBACK(tsa);
 
 static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL);
 static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL);
@@ -620,6 +621,7 @@ static DEVICE_ATTR(reg_file_data_sampling, 0444, cpu_show_reg_file_data_sampling
 static DEVICE_ATTR(ghostwrite, 0444, cpu_show_ghostwrite, NULL);
 static DEVICE_ATTR(old_microcode, 0444, cpu_show_old_microcode, NULL);
 static DEVICE_ATTR(indirect_target_selection, 0444, cpu_show_indirect_target_selection, NULL);
+static DEVICE_ATTR(tsa, 0444, cpu_show_tsa, NULL);
 
 static struct attribute *cpu_root_vulnerabilities_attrs[] = {
 	&dev_attr_meltdown.attr,
@@ -639,6 +641,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = {
 	&dev_attr_ghostwrite.attr,
 	&dev_attr_old_microcode.attr,
 	&dev_attr_indirect_target_selection.attr,
+	&dev_attr_tsa.attr,
 	NULL
 };
 
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 96a3a0d6a60e..6378370a952f 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -82,6 +82,7 @@ extern ssize_t cpu_show_old_microcode(struct device *dev,
 				      struct device_attribute *attr, char *buf);
 extern ssize_t cpu_show_indirect_target_selection(struct device *dev,
 						  struct device_attribute *attr, char *buf);
+extern ssize_t cpu_show_tsa(struct device *dev, struct device_attribute *attr, char *buf);
 
 extern __printf(4, 5)
 struct device *cpu_device_create(struct device *parent, void *drvdata,
-- 
cgit v1.2.3


From 128ea9f6ccfb6960293ae4212f4f97165e42222d Mon Sep 17 00:00:00 2001
From: Marco Crivellari <marco.crivellari@suse.com>
Date: Sat, 14 Jun 2025 15:35:29 +0200
Subject: workqueue: Add system_percpu_wq and system_dfl_wq

Currently, if a user enqueues a work item using schedule_delayed_work() the
used wq is "system_wq" (per-cpu wq) while queue_delayed_work() uses
WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to
schedule_work(), which uses system_wq, and queue_work(), which again makes
use of WORK_CPU_UNBOUND.

This lack of consistency cannot be addressed without refactoring the API.

system_wq is a per-CPU workqueue, yet nothing in its name tells about that
CPU affinity constraint, which is very often not required by users. Make it
clear by adding a system_percpu_wq.

system_unbound_wq should be the default workqueue so as not to enforce
locality constraints for random work whenever it's not required.

Adding system_dfl_wq to encourage its use when unbound work should be used.

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Marco Crivellari <marco.crivellari@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h |  8 +++++---
 kernel/workqueue.c        | 13 +++++++++----
 2 files changed, 14 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index b0dc957c3e56..74b0042709cd 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -427,7 +427,7 @@ enum wq_consts {
 /*
  * System-wide workqueues which are always present.
  *
- * system_wq is the one used by schedule[_delayed]_work[_on]().
+ * system_percpu_wq is the one used by schedule[_delayed]_work[_on]().
  * Multi-CPU multi-threaded.  There are users which expect relatively
  * short queue flush time.  Don't queue works which can run for too
  * long.
@@ -438,7 +438,7 @@ enum wq_consts {
  * system_long_wq is similar to system_wq but may host long running
  * works.  Queue flushing might take relatively long.
  *
- * system_unbound_wq is unbound workqueue.  Workers are not bound to
+ * system_dfl_wq is unbound workqueue.  Workers are not bound to
  * any specific CPU, not concurrency managed, and all queued works are
  * executed immediately as long as max_active limit is not reached and
  * resources are available.
@@ -455,10 +455,12 @@ enum wq_consts {
  * system_bh[_highpri]_wq are convenience interface to softirq. BH work items
  * are executed in the queueing CPU's BH context in the queueing order.
  */
-extern struct workqueue_struct *system_wq;
+extern struct workqueue_struct *system_wq; /* use system_percpu_wq, this will be removed */
+extern struct workqueue_struct *system_percpu_wq;
 extern struct workqueue_struct *system_highpri_wq;
 extern struct workqueue_struct *system_long_wq;
 extern struct workqueue_struct *system_unbound_wq;
+extern struct workqueue_struct *system_dfl_wq;
 extern struct workqueue_struct *system_freezable_wq;
 extern struct workqueue_struct *system_power_efficient_wq;
 extern struct workqueue_struct *system_freezable_power_efficient_wq;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index cf6203282737..71a4dd59977c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -505,12 +505,16 @@ static struct kthread_worker *pwq_release_worker __ro_after_init;
 
 struct workqueue_struct *system_wq __ro_after_init;
 EXPORT_SYMBOL(system_wq);
+struct workqueue_struct *system_percpu_wq __ro_after_init;
+EXPORT_SYMBOL(system_percpu_wq);
 struct workqueue_struct *system_highpri_wq __ro_after_init;
 EXPORT_SYMBOL_GPL(system_highpri_wq);
 struct workqueue_struct *system_long_wq __ro_after_init;
 EXPORT_SYMBOL_GPL(system_long_wq);
 struct workqueue_struct *system_unbound_wq __ro_after_init;
 EXPORT_SYMBOL_GPL(system_unbound_wq);
+struct workqueue_struct *system_dfl_wq __ro_after_init;
+EXPORT_SYMBOL_GPL(system_dfl_wq);
 struct workqueue_struct *system_freezable_wq __ro_after_init;
 EXPORT_SYMBOL_GPL(system_freezable_wq);
 struct workqueue_struct *system_power_efficient_wq __ro_after_init;
@@ -7816,10 +7820,11 @@ void __init workqueue_init_early(void)
 	}
 
 	system_wq = alloc_workqueue("events", 0, 0);
+	system_percpu_wq = alloc_workqueue("events", 0, 0);
 	system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
 	system_long_wq = alloc_workqueue("events_long", 0, 0);
-	system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
-					    WQ_MAX_ACTIVE);
+	system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE);
+	system_dfl_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE);
 	system_freezable_wq = alloc_workqueue("events_freezable",
 					      WQ_FREEZABLE, 0);
 	system_power_efficient_wq = alloc_workqueue("events_power_efficient",
@@ -7830,8 +7835,8 @@ void __init workqueue_init_early(void)
 	system_bh_wq = alloc_workqueue("events_bh", WQ_BH, 0);
 	system_bh_highpri_wq = alloc_workqueue("events_bh_highpri",
 					       WQ_BH | WQ_HIGHPRI, 0);
-	BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
-	       !system_unbound_wq || !system_freezable_wq ||
+	BUG_ON(!system_wq || !system_percpu_wq|| !system_highpri_wq || !system_long_wq ||
+	       !system_unbound_wq || !system_freezable_wq || !system_dfl_wq ||
 	       !system_power_efficient_wq ||
 	       !system_freezable_power_efficient_wq ||
 	       !system_bh_wq || !system_bh_highpri_wq);
-- 
cgit v1.2.3


From 930c2ea566aff59e962c50b2421d5fcc3b98b8be Mon Sep 17 00:00:00 2001
From: Marco Crivellari <marco.crivellari@suse.com>
Date: Sat, 14 Jun 2025 15:35:30 +0200
Subject: workqueue: Add new WQ_PERCPU flag

Currently if a user enqueues a work item using schedule_delayed_work() the
used wq is "system_wq" (per-cpu wq) while queue_delayed_work() uses
WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to
schedule_work() that is using system_wq and queue_work(), that makes use
again of WORK_CPU_UNBOUND.
This lack of consistency cannot be addressed without refactoring the API.

This patch adds a new WQ_PERCPU flag to explicitly request the use of
the per-CPU behavior. Both flags coexist for one release cycle to allow
callers to transition their calls.

Once migration is complete, WQ_UNBOUND can be removed and unbound will
become the implicit default.

tj: Merged doc patch.

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Marco Crivellari <marco.crivellari@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 Documentation/core-api/workqueue.rst | 6 ++++++
 include/linux/workqueue.h            | 1 +
 2 files changed, 7 insertions(+)

(limited to 'include')

diff --git a/Documentation/core-api/workqueue.rst b/Documentation/core-api/workqueue.rst
index e295835fc116..165ca73e8351 100644
--- a/Documentation/core-api/workqueue.rst
+++ b/Documentation/core-api/workqueue.rst
@@ -183,6 +183,12 @@ resources, scheduled and executed.
   BH work items cannot sleep. All other features such as delayed queueing,
   flushing and canceling are supported.
 
+``WQ_PERCPU``
+  Work items queued to a per-cpu wq are bound to a specific CPU.
+  This flag is the right choice when cpu locality is important.
+
+  This flag is the complement of ``WQ_UNBOUND``.
+
 ``WQ_UNBOUND``
   Work items queued to an unbound wq are served by the special
   worker-pools which host workers which are not bound to any
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 74b0042709cd..f19072605faa 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -401,6 +401,7 @@ enum wq_flags {
 	 * http://thread.gmane.org/gmane.linux.kernel/1480396
 	 */
 	WQ_POWER_EFFICIENT	= 1 << 7,
+	WQ_PERCPU		= 1 << 8, /* bound to a specific cpu */
 
 	__WQ_DESTROYING		= 1 << 15, /* internal: workqueue is destroying */
 	__WQ_DRAINING		= 1 << 16, /* internal: workqueue is draining */
-- 
cgit v1.2.3


From fc92099902fbf21000554678a47654b029c15a4d Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 12 Jun 2025 12:36:06 -0300
Subject: tools headers: Synchronize linux/bits.h with the kernel sources

To pick up the changes in this cset:

  1e7933a575ed8af4 ("uapi: Revert "bitops: avoid integer overflow in GENMASK(_ULL)"")
  5b572e8a9f3dcd6e ("bits: introduce fixed-type BIT_U*()")
  19408200c094858d ("bits: introduce fixed-type GENMASK_U*()")
  31299a5e02112411 ("bits: add comments and newlines to #if, #else and #endif directives")

This addresses these perf build warnings:

  Warning: Kernel ABI header differences:
    diff -u tools/include/linux/bits.h include/linux/bits.h

Please see tools/include/uapi/README for further details.

Acked-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Cc: I Hsin Cheng <richard120310@gmail.com>
Cc: Yury Norov <yury.norov@gmail.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Lucas De Marchi <lucas.demarchi@intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Yury Norov <yury.norov@gmail.com>
Link: https://lore.kernel.org/r/aEr0ZJ60EbshEy6p@x1
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 include/uapi/linux/bits.h  |  4 ++--
 tools/include/linux/bits.h | 57 ++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 57 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bits.h b/include/uapi/linux/bits.h
index a04afef9efca..682b406e1067 100644
--- a/include/uapi/linux/bits.h
+++ b/include/uapi/linux/bits.h
@@ -4,9 +4,9 @@
 #ifndef _UAPI_LINUX_BITS_H
 #define _UAPI_LINUX_BITS_H
 
-#define __GENMASK(h, l) (((~_UL(0)) << (l)) & (~_UL(0) >> (__BITS_PER_LONG - 1 - (h))))
+#define __GENMASK(h, l) (((~_UL(0)) << (l)) & (~_UL(0) >> (BITS_PER_LONG - 1 - (h))))
 
-#define __GENMASK_ULL(h, l) (((~_ULL(0)) << (l)) & (~_ULL(0) >> (__BITS_PER_LONG_LONG - 1 - (h))))
+#define __GENMASK_ULL(h, l) (((~_ULL(0)) << (l)) & (~_ULL(0) >> (BITS_PER_LONG_LONG - 1 - (h))))
 
 #define __GENMASK_U128(h, l) \
 	((_BIT128((h)) << 1) - (_BIT128(l)))
diff --git a/tools/include/linux/bits.h b/tools/include/linux/bits.h
index 14fd0ca9a6cd..7ad056219115 100644
--- a/tools/include/linux/bits.h
+++ b/tools/include/linux/bits.h
@@ -12,6 +12,7 @@
 #define BIT_ULL_MASK(nr)	(ULL(1) << ((nr) % BITS_PER_LONG_LONG))
 #define BIT_ULL_WORD(nr)	((nr) / BITS_PER_LONG_LONG)
 #define BITS_PER_BYTE		8
+#define BITS_PER_TYPE(type)	(sizeof(type) * BITS_PER_BYTE)
 
 /*
  * Create a contiguous bitmask starting at bit position @l and ending at
@@ -19,16 +20,68 @@
  * GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000.
  */
 #if !defined(__ASSEMBLY__)
+
+/*
+ * Missing asm support
+ *
+ * GENMASK_U*() and BIT_U*() depend on BITS_PER_TYPE() which relies on sizeof(),
+ * something not available in asm. Nevertheless, fixed width integers is a C
+ * concept. Assembly code can rely on the long and long long versions instead.
+ */
+
 #include <linux/build_bug.h>
 #include <linux/compiler.h>
+#include <linux/overflow.h>
+
 #define GENMASK_INPUT_CHECK(h, l) BUILD_BUG_ON_ZERO(const_true((l) > (h)))
-#else
+
+/*
+ * Generate a mask for the specified type @t. Additional checks are made to
+ * guarantee the value returned fits in that type, relying on
+ * -Wshift-count-overflow compiler check to detect incompatible arguments.
+ * For example, all these create build errors or warnings:
+ *
+ * - GENMASK(15, 20): wrong argument order
+ * - GENMASK(72, 15): doesn't fit unsigned long
+ * - GENMASK_U32(33, 15): doesn't fit in a u32
+ */
+#define GENMASK_TYPE(t, h, l)					\
+	((t)(GENMASK_INPUT_CHECK(h, l) +			\
+	     (type_max(t) << (l) &				\
+	      type_max(t) >> (BITS_PER_TYPE(t) - 1 - (h)))))
+
+#define GENMASK_U8(h, l)	GENMASK_TYPE(u8, h, l)
+#define GENMASK_U16(h, l)	GENMASK_TYPE(u16, h, l)
+#define GENMASK_U32(h, l)	GENMASK_TYPE(u32, h, l)
+#define GENMASK_U64(h, l)	GENMASK_TYPE(u64, h, l)
+
+/*
+ * Fixed-type variants of BIT(), with additional checks like GENMASK_TYPE(). The
+ * following examples generate compiler warnings due to -Wshift-count-overflow:
+ *
+ * - BIT_U8(8)
+ * - BIT_U32(-1)
+ * - BIT_U32(40)
+ */
+#define BIT_INPUT_CHECK(type, nr) \
+	BUILD_BUG_ON_ZERO(const_true((nr) >= BITS_PER_TYPE(type)))
+
+#define BIT_TYPE(type, nr) ((type)(BIT_INPUT_CHECK(type, nr) + BIT_ULL(nr)))
+
+#define BIT_U8(nr)	BIT_TYPE(u8, nr)
+#define BIT_U16(nr)	BIT_TYPE(u16, nr)
+#define BIT_U32(nr)	BIT_TYPE(u32, nr)
+#define BIT_U64(nr)	BIT_TYPE(u64, nr)
+
+#else /* defined(__ASSEMBLY__) */
+
 /*
  * BUILD_BUG_ON_ZERO is not available in h files included from asm files,
  * disable the input check if that is the case.
  */
 #define GENMASK_INPUT_CHECK(h, l) 0
-#endif
+
+#endif /* !defined(__ASSEMBLY__) */
 
 #define GENMASK(h, l) \
 	(GENMASK_INPUT_CHECK(h, l) + __GENMASK(h, l))
-- 
cgit v1.2.3


From 1257b8786ac689a2ce5fe3e1741c65038035adc6 Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeel.butt@linux.dev>
Date: Tue, 17 Jun 2025 12:57:22 -0700
Subject: cgroup: support to enable nmi-safe css_rstat_updated

Add necessary infrastructure to enable the nmi-safe execution of
css_rstat_updated(). Currently css_rstat_updated() takes a per-cpu
per-css raw spinlock to add the given css in the per-cpu per-css update
tree. However the kernel can not spin in nmi context, so we need to
remove the spinning on the raw spinlock in css_rstat_updated().

To support lockless css_rstat_updated(), let's add necessary data
structures in the css and ss structures.

Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Tested-by: JP Kobryn <inwardvessel@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup-defs.h |  4 ++++
 kernel/cgroup/rstat.c       | 23 +++++++++++++++++++++--
 2 files changed, 25 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index cd7f093e34cd..04191d99228c 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -384,6 +384,9 @@ struct css_rstat_cpu {
 	 */
 	struct cgroup_subsys_state *updated_children;
 	struct cgroup_subsys_state *updated_next;	/* NULL if not on the list */
+
+	struct llist_node lnode;		/* lockless list for update */
+	struct cgroup_subsys_state *owner;	/* back pointer */
 };
 
 /*
@@ -822,6 +825,7 @@ struct cgroup_subsys {
 
 	spinlock_t rstat_ss_lock;
 	raw_spinlock_t __percpu *rstat_ss_cpu_lock;
+	struct llist_head __percpu *lhead; /* lockless update list head */
 };
 
 extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index ce4752ab9e09..bfa6366d2325 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -11,6 +11,7 @@
 
 static DEFINE_SPINLOCK(rstat_base_lock);
 static DEFINE_PER_CPU(raw_spinlock_t, rstat_base_cpu_lock);
+static DEFINE_PER_CPU(struct llist_head, rstat_backlog_list);
 
 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
 
@@ -45,6 +46,13 @@ static spinlock_t *ss_rstat_lock(struct cgroup_subsys *ss)
 	return &rstat_base_lock;
 }
 
+static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu)
+{
+	if (ss)
+		return per_cpu_ptr(ss->lhead, cpu);
+	return per_cpu_ptr(&rstat_backlog_list, cpu);
+}
+
 static raw_spinlock_t *ss_rstat_cpu_lock(struct cgroup_subsys *ss, int cpu)
 {
 	if (ss)
@@ -456,7 +464,8 @@ int css_rstat_init(struct cgroup_subsys_state *css)
 	for_each_possible_cpu(cpu) {
 		struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
 
-		rstatc->updated_children = css;
+		rstatc->owner = rstatc->updated_children = css;
+		init_llist_node(&rstatc->lnode);
 
 		if (is_self) {
 			struct cgroup_rstat_base_cpu *rstatbc;
@@ -525,9 +534,19 @@ int __init ss_rstat_init(struct cgroup_subsys *ss)
 	}
 #endif
 
+	if (ss) {
+		ss->lhead = alloc_percpu(struct llist_head);
+		if (!ss->lhead) {
+			free_percpu(ss->rstat_ss_cpu_lock);
+			return -ENOMEM;
+		}
+	}
+
 	spin_lock_init(ss_rstat_lock(ss));
-	for_each_possible_cpu(cpu)
+	for_each_possible_cpu(cpu) {
 		raw_spin_lock_init(ss_rstat_cpu_lock(ss, cpu));
+		init_llist_head(ss_lhead_cpu(ss, cpu));
+	}
 
 	return 0;
 }
-- 
cgit v1.2.3


From 6af89c6ca71742e9227e6f8172a86ce1ee16aa85 Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeel.butt@linux.dev>
Date: Tue, 17 Jun 2025 12:57:24 -0700
Subject: cgroup: remove per-cpu per-subsystem locks

The rstat update side used to insert the cgroup whose stats are updated
in the update tree and the read side flushes the update tree to get the
latest up-to-date stats. The per-cpu per-subsystem locks were used to
synchronize the update and flush side. However now the update side does
not access the update tree but uses per-cpu lockless lists. So there is no
need for locks to synchronize update and flush side. Let's remove them.

Suggested-by: JP Kobryn <inwardvessel@gmail.com>
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Tested-by: JP Kobryn <inwardvessel@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup-defs.h   |   7 ---
 include/trace/events/cgroup.h |  47 --------------------
 kernel/cgroup/rstat.c         | 100 ++----------------------------------------
 3 files changed, 4 insertions(+), 150 deletions(-)

(limited to 'include')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 04191d99228c..6b93a64115fe 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -375,12 +375,6 @@ struct css_rstat_cpu {
 	 * Child cgroups with stat updates on this cpu since the last read
 	 * are linked on the parent's ->updated_children through
 	 * ->updated_next. updated_children is terminated by its container css.
-	 *
-	 * In addition to being more compact, singly-linked list pointing to
-	 * the css makes it unnecessary for each per-cpu struct to point back
-	 * to the associated css.
-	 *
-	 * Protected by per-cpu css->ss->rstat_ss_cpu_lock.
 	 */
 	struct cgroup_subsys_state *updated_children;
 	struct cgroup_subsys_state *updated_next;	/* NULL if not on the list */
@@ -824,7 +818,6 @@ struct cgroup_subsys {
 	unsigned int depends_on;
 
 	spinlock_t rstat_ss_lock;
-	raw_spinlock_t __percpu *rstat_ss_cpu_lock;
 	struct llist_head __percpu *lhead; /* lockless update list head */
 };
 
diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h
index 7d332387be6c..ba9229af9a34 100644
--- a/include/trace/events/cgroup.h
+++ b/include/trace/events/cgroup.h
@@ -257,53 +257,6 @@ DEFINE_EVENT(cgroup_rstat, cgroup_rstat_unlock,
 	TP_ARGS(cgrp, cpu, contended)
 );
 
-/*
- * Related to per CPU locks:
- * global rstat_base_cpu_lock for base stats
- * cgroup_subsys::rstat_ss_cpu_lock for subsystem stats
- */
-DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended,
-
-	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
-
-	TP_ARGS(cgrp, cpu, contended)
-);
-
-DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended_fastpath,
-
-	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
-
-	TP_ARGS(cgrp, cpu, contended)
-);
-
-DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked,
-
-	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
-
-	TP_ARGS(cgrp, cpu, contended)
-);
-
-DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked_fastpath,
-
-	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
-
-	TP_ARGS(cgrp, cpu, contended)
-);
-
-DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock,
-
-	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
-
-	TP_ARGS(cgrp, cpu, contended)
-);
-
-DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock_fastpath,
-
-	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
-
-	TP_ARGS(cgrp, cpu, contended)
-);
-
 #endif /* _TRACE_CGROUP_H */
 
 /* This part must be outside protection */
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 823a4c7c3fea..c8a48cf83878 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -10,7 +10,6 @@
 #include <trace/events/cgroup.h>
 
 static DEFINE_SPINLOCK(rstat_base_lock);
-static DEFINE_PER_CPU(raw_spinlock_t, rstat_base_cpu_lock);
 static DEFINE_PER_CPU(struct llist_head, rstat_backlog_list);
 
 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
@@ -53,74 +52,6 @@ static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu)
 	return per_cpu_ptr(&rstat_backlog_list, cpu);
 }
 
-static raw_spinlock_t *ss_rstat_cpu_lock(struct cgroup_subsys *ss, int cpu)
-{
-	if (ss)
-		return per_cpu_ptr(ss->rstat_ss_cpu_lock, cpu);
-
-	return per_cpu_ptr(&rstat_base_cpu_lock, cpu);
-}
-
-/*
- * Helper functions for rstat per CPU locks.
- *
- * This makes it easier to diagnose locking issues and contention in
- * production environments. The parameter @fast_path determine the
- * tracepoints being added, allowing us to diagnose "flush" related
- * operations without handling high-frequency fast-path "update" events.
- */
-static __always_inline
-unsigned long _css_rstat_cpu_lock(struct cgroup_subsys_state *css, int cpu,
-		const bool fast_path)
-{
-	struct cgroup *cgrp = css->cgroup;
-	raw_spinlock_t *cpu_lock;
-	unsigned long flags;
-	bool contended;
-
-	/*
-	 * The _irqsave() is needed because the locks used for flushing are
-	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring this lock
-	 * with the _irq() suffix only disables interrupts on a non-PREEMPT_RT
-	 * kernel. The raw_spinlock_t below disables interrupts on both
-	 * configurations. The _irqsave() ensures that interrupts are always
-	 * disabled and later restored.
-	 */
-	cpu_lock = ss_rstat_cpu_lock(css->ss, cpu);
-	contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
-	if (contended) {
-		if (fast_path)
-			trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended);
-		else
-			trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended);
-
-		raw_spin_lock_irqsave(cpu_lock, flags);
-	}
-
-	if (fast_path)
-		trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended);
-	else
-		trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended);
-
-	return flags;
-}
-
-static __always_inline
-void _css_rstat_cpu_unlock(struct cgroup_subsys_state *css, int cpu,
-		unsigned long flags, const bool fast_path)
-{
-	struct cgroup *cgrp = css->cgroup;
-	raw_spinlock_t *cpu_lock;
-
-	if (fast_path)
-		trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false);
-	else
-		trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false);
-
-	cpu_lock = ss_rstat_cpu_lock(css->ss, cpu);
-	raw_spin_unlock_irqrestore(cpu_lock, flags);
-}
-
 /**
  * css_rstat_updated - keep track of updated rstat_cpu
  * @css: target cgroup subsystem state
@@ -323,15 +254,12 @@ static struct cgroup_subsys_state *css_rstat_updated_list(
 {
 	struct css_rstat_cpu *rstatc = css_rstat_cpu(root, cpu);
 	struct cgroup_subsys_state *head = NULL, *parent, *child;
-	unsigned long flags;
-
-	flags = _css_rstat_cpu_lock(root, cpu, false);
 
 	css_process_update_tree(root->ss, cpu);
 
 	/* Return NULL if this subtree is not on-list */
 	if (!rstatc->updated_next)
-		goto unlock_ret;
+		return NULL;
 
 	/*
 	 * Unlink @root from its parent. As the updated_children list is
@@ -363,8 +291,7 @@ static struct cgroup_subsys_state *css_rstat_updated_list(
 	rstatc->updated_children = root;
 	if (child != root)
 		head = css_rstat_push_children(head, child, cpu);
-unlock_ret:
-	_css_rstat_cpu_unlock(root, cpu, flags, false);
+
 	return head;
 }
 
@@ -560,34 +487,15 @@ int __init ss_rstat_init(struct cgroup_subsys *ss)
 {
 	int cpu;
 
-#ifdef CONFIG_SMP
-	/*
-	 * On uniprocessor machines, arch_spinlock_t is defined as an empty
-	 * struct. Avoid allocating a size of zero by having this block
-	 * excluded in this case. It's acceptable to leave the subsystem locks
-	 * unitialized since the associated lock functions are no-ops in the
-	 * non-smp case.
-	 */
-	if (ss) {
-		ss->rstat_ss_cpu_lock = alloc_percpu(raw_spinlock_t);
-		if (!ss->rstat_ss_cpu_lock)
-			return -ENOMEM;
-	}
-#endif
-
 	if (ss) {
 		ss->lhead = alloc_percpu(struct llist_head);
-		if (!ss->lhead) {
-			free_percpu(ss->rstat_ss_cpu_lock);
+		if (!ss->lhead)
 			return -ENOMEM;
-		}
 	}
 
 	spin_lock_init(ss_rstat_lock(ss));
-	for_each_possible_cpu(cpu) {
-		raw_spin_lock_init(ss_rstat_cpu_lock(ss, cpu));
+	for_each_possible_cpu(cpu)
 		init_llist_head(ss_lhead_cpu(ss, cpu));
-	}
 
 	return 0;
 }
-- 
cgit v1.2.3


From 07a4688833b237331e5045f90fc546c085b28c86 Mon Sep 17 00:00:00 2001
From: Alexander Wilhelm <alexander.wilhelm@westermo.com>
Date: Thu, 22 May 2025 16:35:30 +0200
Subject: soc: qcom: fix endianness for QMI header

The members of QMI header have to be swapped on big endian platforms. Use
__le16 types instead of u16 ones.

Signed-off-by: Alexander Wilhelm <alexander.wilhelm@westermo.com>
Fixes: 9b8a11e82615 ("soc: qcom: Introduce QMI encoder/decoder")
Fixes: 3830d0771ef6 ("soc: qcom: Introduce QMI helpers")
Reviewed-by: Dmitry Baryshkov <dmitry.baryshkov@oss.qualcomm.com>
Link: https://lore.kernel.org/r/20250522143530.3623809-3-alexander.wilhelm@westermo.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 drivers/soc/qcom/qmi_encdec.c    | 6 +++---
 drivers/soc/qcom/qmi_interface.c | 6 +++---
 include/linux/soc/qcom/qmi.h     | 6 +++---
 3 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/soc/qcom/qmi_encdec.c b/drivers/soc/qcom/qmi_encdec.c
index dafe0a4c202e..7660a960fb45 100644
--- a/drivers/soc/qcom/qmi_encdec.c
+++ b/drivers/soc/qcom/qmi_encdec.c
@@ -776,9 +776,9 @@ void *qmi_encode_message(int type, unsigned int msg_id, size_t *len,
 
 	hdr = msg;
 	hdr->type = type;
-	hdr->txn_id = txn_id;
-	hdr->msg_id = msg_id;
-	hdr->msg_len = msglen;
+	hdr->txn_id = cpu_to_le16(txn_id);
+	hdr->msg_id = cpu_to_le16(msg_id);
+	hdr->msg_len = cpu_to_le16(msglen);
 
 	*len = sizeof(*hdr) + msglen;
 
diff --git a/drivers/soc/qcom/qmi_interface.c b/drivers/soc/qcom/qmi_interface.c
index bc6d6379d8b1..6500f863aae5 100644
--- a/drivers/soc/qcom/qmi_interface.c
+++ b/drivers/soc/qcom/qmi_interface.c
@@ -400,7 +400,7 @@ static void qmi_invoke_handler(struct qmi_handle *qmi, struct sockaddr_qrtr *sq,
 
 	for (handler = qmi->handlers; handler->fn; handler++) {
 		if (handler->type == hdr->type &&
-		    handler->msg_id == hdr->msg_id)
+		    handler->msg_id == le16_to_cpu(hdr->msg_id))
 			break;
 	}
 
@@ -488,7 +488,7 @@ static void qmi_handle_message(struct qmi_handle *qmi,
 	/* If this is a response, find the matching transaction handle */
 	if (hdr->type == QMI_RESPONSE) {
 		mutex_lock(&qmi->txn_lock);
-		txn = idr_find(&qmi->txns, hdr->txn_id);
+		txn = idr_find(&qmi->txns, le16_to_cpu(hdr->txn_id));
 
 		/* Ignore unexpected responses */
 		if (!txn) {
@@ -514,7 +514,7 @@ static void qmi_handle_message(struct qmi_handle *qmi,
 	} else {
 		/* Create a txn based on the txn_id of the incoming message */
 		memset(&tmp_txn, 0, sizeof(tmp_txn));
-		tmp_txn.id = hdr->txn_id;
+		tmp_txn.id = le16_to_cpu(hdr->txn_id);
 
 		qmi_invoke_handler(qmi, sq, &tmp_txn, buf, len);
 	}
diff --git a/include/linux/soc/qcom/qmi.h b/include/linux/soc/qcom/qmi.h
index 469e02d2aa0d..291cdc7ef49c 100644
--- a/include/linux/soc/qcom/qmi.h
+++ b/include/linux/soc/qcom/qmi.h
@@ -24,9 +24,9 @@ struct socket;
  */
 struct qmi_header {
 	u8 type;
-	u16 txn_id;
-	u16 msg_id;
-	u16 msg_len;
+	__le16 txn_id;
+	__le16 msg_id;
+	__le16 msg_len;
 } __packed;
 
 #define QMI_REQUEST	0
-- 
cgit v1.2.3


From f42b8d78dee77107245ec5beee3eb01915bcae7f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 11 Jun 2025 19:40:04 -0400
Subject: tpm: don't bother with removal of files in directory we'll be
 removing

FWIW, there is a reliable indication of removal - ->i_nlink going to 0 ;-)

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/char/tpm/eventlog/common.c | 46 ++++++++++----------------------------
 include/linux/tpm.h                |  2 +-
 2 files changed, 13 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/drivers/char/tpm/eventlog/common.c b/drivers/char/tpm/eventlog/common.c
index 4c0bbba64ee5..691813d2a5a2 100644
--- a/drivers/char/tpm/eventlog/common.c
+++ b/drivers/char/tpm/eventlog/common.c
@@ -32,7 +32,7 @@ static int tpm_bios_measurements_open(struct inode *inode,
 	struct tpm_chip *chip;
 
 	inode_lock(inode);
-	if (!inode->i_private) {
+	if (!inode->i_nlink) {
 		inode_unlock(inode);
 		return -ENODEV;
 	}
@@ -105,7 +105,7 @@ static int tpm_read_log(struct tpm_chip *chip)
 void tpm_bios_log_setup(struct tpm_chip *chip)
 {
 	const char *name = dev_name(&chip->dev);
-	unsigned int cnt;
+	struct dentry *dentry;
 	int log_version;
 	int rc = 0;
 
@@ -117,14 +117,12 @@ void tpm_bios_log_setup(struct tpm_chip *chip)
 		return;
 	log_version = rc;
 
-	cnt = 0;
-	chip->bios_dir[cnt] = securityfs_create_dir(name, NULL);
+	chip->bios_dir = securityfs_create_dir(name, NULL);
 	/* NOTE: securityfs_create_dir can return ENODEV if securityfs is
 	 * compiled out. The caller should ignore the ENODEV return code.
 	 */
-	if (IS_ERR(chip->bios_dir[cnt]))
-		goto err;
-	cnt++;
+	if (IS_ERR(chip->bios_dir))
+		return;
 
 	chip->bin_log_seqops.chip = chip;
 	if (log_version == EFI_TCG2_EVENT_LOG_FORMAT_TCG_2)
@@ -135,14 +133,13 @@ void tpm_bios_log_setup(struct tpm_chip *chip)
 			&tpm1_binary_b_measurements_seqops;
 
 
-	chip->bios_dir[cnt] =
+	dentry =
 	    securityfs_create_file("binary_bios_measurements",
-				   0440, chip->bios_dir[0],
+				   0440, chip->bios_dir,
 				   (void *)&chip->bin_log_seqops,
 				   &tpm_bios_measurements_ops);
-	if (IS_ERR(chip->bios_dir[cnt]))
+	if (IS_ERR(dentry))
 		goto err;
-	cnt++;
 
 	if (!(chip->flags & TPM_CHIP_FLAG_TPM2)) {
 
@@ -150,42 +147,23 @@ void tpm_bios_log_setup(struct tpm_chip *chip)
 		chip->ascii_log_seqops.seqops =
 			&tpm1_ascii_b_measurements_seqops;
 
-		chip->bios_dir[cnt] =
+		dentry =
 			securityfs_create_file("ascii_bios_measurements",
-					       0440, chip->bios_dir[0],
+					       0440, chip->bios_dir,
 					       (void *)&chip->ascii_log_seqops,
 					       &tpm_bios_measurements_ops);
-		if (IS_ERR(chip->bios_dir[cnt]))
+		if (IS_ERR(dentry))
 			goto err;
-		cnt++;
 	}
 
 	return;
 
 err:
-	chip->bios_dir[cnt] = NULL;
 	tpm_bios_log_teardown(chip);
 	return;
 }
 
 void tpm_bios_log_teardown(struct tpm_chip *chip)
 {
-	int i;
-	struct inode *inode;
-
-	/* securityfs_remove currently doesn't take care of handling sync
-	 * between removal and opening of pseudo files. To handle this, a
-	 * workaround is added by making i_private = NULL here during removal
-	 * and to check it during open(), both within inode_lock()/unlock().
-	 * This design ensures that open() either safely gets kref or fails.
-	 */
-	for (i = (TPM_NUM_EVENT_LOG_FILES - 1); i >= 0; i--) {
-		if (chip->bios_dir[i]) {
-			inode = d_inode(chip->bios_dir[i]);
-			inode_lock(inode);
-			inode->i_private = NULL;
-			inode_unlock(inode);
-			securityfs_remove(chip->bios_dir[i]);
-		}
-	}
+	securityfs_remove(chip->bios_dir);
 }
diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index a3d8305e88a5..9894c104dc93 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -182,7 +182,7 @@ struct tpm_chip {
 	unsigned long duration[TPM_NUM_DURATIONS]; /* jiffies */
 	bool duration_adjusted;
 
-	struct dentry *bios_dir[TPM_NUM_EVENT_LOG_FILES];
+	struct dentry *bios_dir;
 
 	const struct attribute_group *groups[3 + TPM_MAX_HASHES];
 	unsigned int groups_cnt;
-- 
cgit v1.2.3


From 2410251cde0bac9f660f276307d6c967466eef0c Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 16 Jun 2025 10:46:25 +0100
Subject: net: timestamp: add helper returning skb's tx tstamp

Add a helper function skb_get_tx_timestamp() that returns a tx timestamp
associated with an error queue skb.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/702357dd8936ef4c0d3864441e853bfe3224a677.1750065793.git.asml.silence@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sock.h |  4 ++++
 net/socket.c       | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 92e7c1aae3cc..f5f5a9ad290b 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2677,6 +2677,10 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
 void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk,
 			     struct sk_buff *skb);
 
+bool skb_has_tx_timestamp(struct sk_buff *skb, const struct sock *sk);
+int skb_get_tx_timestamp(struct sk_buff *skb, struct sock *sk,
+			 struct timespec64 *ts);
+
 static inline void
 sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
 {
diff --git a/net/socket.c b/net/socket.c
index 9a0e720f0859..2cab805943c0 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -843,6 +843,52 @@ static void put_ts_pktinfo(struct msghdr *msg, struct sk_buff *skb,
 		 sizeof(ts_pktinfo), &ts_pktinfo);
 }
 
+bool skb_has_tx_timestamp(struct sk_buff *skb, const struct sock *sk)
+{
+	const struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
+	u32 tsflags = READ_ONCE(sk->sk_tsflags);
+
+	if (serr->ee.ee_errno != ENOMSG ||
+	   serr->ee.ee_origin != SO_EE_ORIGIN_TIMESTAMPING)
+		return false;
+
+	/* software time stamp available and wanted */
+	if ((tsflags & SOF_TIMESTAMPING_SOFTWARE) && skb->tstamp)
+		return true;
+	/* hardware time stamps available and wanted */
+	return (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
+		skb_hwtstamps(skb)->hwtstamp;
+}
+
+int skb_get_tx_timestamp(struct sk_buff *skb, struct sock *sk,
+			  struct timespec64 *ts)
+{
+	u32 tsflags = READ_ONCE(sk->sk_tsflags);
+	ktime_t hwtstamp;
+	int if_index = 0;
+
+	if ((tsflags & SOF_TIMESTAMPING_SOFTWARE) &&
+	    ktime_to_timespec64_cond(skb->tstamp, ts))
+		return SOF_TIMESTAMPING_TX_SOFTWARE;
+
+	if (!(tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) ||
+	    skb_is_swtx_tstamp(skb, false))
+		return -ENOENT;
+
+	if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NETDEV)
+		hwtstamp = get_timestamp(sk, skb, &if_index);
+	else
+		hwtstamp = skb_hwtstamps(skb)->hwtstamp;
+
+	if (tsflags & SOF_TIMESTAMPING_BIND_PHC)
+		hwtstamp = ptp_convert_timestamp(&hwtstamp,
+						READ_ONCE(sk->sk_bind_phc));
+	if (!ktime_to_timespec64_cond(hwtstamp, ts))
+		return -ENOENT;
+
+	return SOF_TIMESTAMPING_TX_HARDWARE;
+}
+
 /*
  * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP)
  */
-- 
cgit v1.2.3


From ba4618885b23372c45bb1566ed8e3f1c191ff22d Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Sat, 14 Jun 2025 20:14:34 -0400
Subject: tcp: remove RFC3517/RFC6675 hint state: lost_skb_hint, lost_cnt_hint

Now that obsolete RFC3517/RFC6675 TCP loss detection has been removed,
we can remove the somewhat complex and intrusive code to maintain its
hint state: lost_skb_hint and lost_cnt_hint.

This commit makes tcp_clear_retrans_hints_partial() empty. We will
remove tcp_clear_retrans_hints_partial() and its call sites in the
next commit.

Suggested-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Yuchung Cheng <ycheng@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250615001435.2390793-3-ncardwell.sw@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/net_cachelines/tcp_sock.rst |  2 --
 include/linux/tcp.h                                  |  3 ---
 include/net/tcp.h                                    |  1 -
 net/ipv4/tcp.c                                       |  3 +--
 net/ipv4/tcp_input.c                                 | 19 -------------------
 net/ipv4/tcp_output.c                                |  5 -----
 6 files changed, 1 insertion(+), 32 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst
index bc9b2131bf7a..7bbda5944ee2 100644
--- a/Documentation/networking/net_cachelines/tcp_sock.rst
+++ b/Documentation/networking/net_cachelines/tcp_sock.rst
@@ -115,7 +115,6 @@ u32                           lost_out                read_mostly         read_m
 u32                           sacked_out              read_mostly         read_mostly         tcp_left_out(tx);tcp_packets_in_flight(tx/rx);tcp_clean_rtx_queue(rx)
 struct hrtimer                pacing_timer
 struct hrtimer                compressed_ack_timer
-struct sk_buff*               lost_skb_hint           read_mostly                             tcp_clean_rtx_queue
 struct sk_buff*               retransmit_skb_hint     read_mostly                             tcp_clean_rtx_queue
 struct rb_root                out_of_order_queue                          read_mostly         tcp_data_queue,tcp_fast_path_check
 struct sk_buff*               ooo_last_skb
@@ -123,7 +122,6 @@ struct tcp_sack_block[1]      duplicate_sack
 struct tcp_sack_block[4]      selective_acks
 struct tcp_sack_block[4]      recv_sack_cache
 struct sk_buff*               highest_sack            read_write                              tcp_event_new_data_sent
-int                           lost_cnt_hint
 u32                           prior_ssthresh
 u32                           high_seq
 u32                           retrans_stamp
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 29f59d50dc73..1a5737b3753d 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -208,7 +208,6 @@ struct tcp_sock {
 	u32	notsent_lowat;	/* TCP_NOTSENT_LOWAT */
 	u16	gso_segs;	/* Max number of segs per GSO packet	*/
 	/* from STCP, retrans queue hinting */
-	struct sk_buff *lost_skb_hint;
 	struct sk_buff *retransmit_skb_hint;
 	__cacheline_group_end(tcp_sock_read_tx);
 
@@ -419,8 +418,6 @@ struct tcp_sock {
 
 	struct tcp_sack_block recv_sack_cache[4];
 
-	int     lost_cnt_hint;
-
 	u32	prior_ssthresh; /* ssthresh saved at recovery start	*/
 	u32	high_seq;	/* snd_nxt at onset of congestion	*/
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 5078ad868fee..f57d12183794 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1813,7 +1813,6 @@ static inline void tcp_mib_init(struct net *net)
 /* from STCP */
 static inline void tcp_clear_retrans_hints_partial(struct tcp_sock *tp)
 {
-	tp->lost_skb_hint = NULL;
 }
 
 static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f64f8276a73c..27d3ef83ce7b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -5053,9 +5053,8 @@ static void __init tcp_struct_check(void)
 	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, reordering);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, notsent_lowat);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, gso_segs);
-	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, lost_skb_hint);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, retransmit_skb_hint);
-	CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_tx, 40);
+	CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_tx, 32);
 
 	/* TXRX read-mostly hotpath cache lines */
 	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, tsoffset);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index dc234d3854aa..e8e130e946f1 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1451,11 +1451,6 @@ static u8 tcp_sacktag_one(struct sock *sk,
 		tp->sacked_out += pcount;
 		/* Out-of-order packets delivered */
 		state->sack_delivered += pcount;
-
-		/* Lost marker hint past SACKed? Tweak RFC3517 cnt */
-		if (tp->lost_skb_hint &&
-		    before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
-			tp->lost_cnt_hint += pcount;
 	}
 
 	/* D-SACK. We can detect redundant retransmission in S|R and plain R
@@ -1496,9 +1491,6 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
 			tcp_skb_timestamp_us(skb));
 	tcp_rate_skb_delivered(sk, skb, state->rate);
 
-	if (skb == tp->lost_skb_hint)
-		tp->lost_cnt_hint += pcount;
-
 	TCP_SKB_CB(prev)->end_seq += shifted;
 	TCP_SKB_CB(skb)->seq += shifted;
 
@@ -1531,10 +1523,6 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
 
 	if (skb == tp->retransmit_skb_hint)
 		tp->retransmit_skb_hint = prev;
-	if (skb == tp->lost_skb_hint) {
-		tp->lost_skb_hint = prev;
-		tp->lost_cnt_hint -= tcp_skb_pcount(prev);
-	}
 
 	TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
 	TCP_SKB_CB(prev)->eor = TCP_SKB_CB(skb)->eor;
@@ -3318,8 +3306,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb,
 		next = skb_rb_next(skb);
 		if (unlikely(skb == tp->retransmit_skb_hint))
 			tp->retransmit_skb_hint = NULL;
-		if (unlikely(skb == tp->lost_skb_hint))
-			tp->lost_skb_hint = NULL;
 		tcp_highest_sack_replace(sk, skb, next);
 		tcp_rtx_queue_unlink_and_free(skb, sk);
 	}
@@ -3377,14 +3363,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb,
 			if (flag & FLAG_RETRANS_DATA_ACKED)
 				flag &= ~FLAG_ORIG_SACK_ACKED;
 		} else {
-			int delta;
-
 			/* Non-retransmitted hole got filled? That's reordering */
 			if (before(reord, prior_fack))
 				tcp_check_sack_reordering(sk, reord, 0);
-
-			delta = prior_sacked - tp->sacked_out;
-			tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
 		}
 	} else if (skb && rtt_update && sack_rtt_us >= 0 &&
 		   sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp,
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 3ac8d2d17e1f..b0ffefe604b4 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1554,11 +1554,6 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de
 	if (tcp_is_reno(tp) && decr > 0)
 		tp->sacked_out -= min_t(u32, tp->sacked_out, decr);
 
-	if (tp->lost_skb_hint &&
-	    before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
-	    (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
-		tp->lost_cnt_hint -= decr;
-
 	tcp_verify_left_out(tp);
 }
 
-- 
cgit v1.2.3


From db16319efcc717a31dcb9c8f038acb6e4111c12e Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Sat, 14 Jun 2025 20:14:35 -0400
Subject: tcp: remove RFC3517/RFC6675 tcp_clear_retrans_hints_partial()

Now that we have removed the RFC3517/RFC6675 hints,
tcp_clear_retrans_hints_partial() is empty, and can be removed.

Suggested-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Yuchung Cheng <ycheng@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250615001435.2390793-4-ncardwell.sw@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tcp.h     | 5 -----
 net/ipv4/tcp_input.c  | 2 --
 net/ipv4/tcp_output.c | 1 -
 3 files changed, 8 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index f57d12183794..9f852f5f8b95 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1811,13 +1811,8 @@ static inline void tcp_mib_init(struct net *net)
 }
 
 /* from STCP */
-static inline void tcp_clear_retrans_hints_partial(struct tcp_sock *tp)
-{
-}
-
 static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp)
 {
-	tcp_clear_retrans_hints_partial(tp);
 	tp->retransmit_skb_hint = NULL;
 }
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e8e130e946f1..05b9571c9c92 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2769,8 +2769,6 @@ void tcp_simple_retransmit(struct sock *sk)
 			tcp_mark_skb_lost(sk, skb);
 	}
 
-	tcp_clear_retrans_hints_partial(tp);
-
 	if (!tp->lost_out)
 		return;
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b0ffefe604b4..eb50746dc482 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3247,7 +3247,6 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
 	TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor;
 
 	/* changed transmit queue under us so clear hints */
-	tcp_clear_retrans_hints_partial(tp);
 	if (next_skb == tp->retransmit_skb_hint)
 		tp->retransmit_skb_hint = skb;
 
-- 
cgit v1.2.3


From ef07df397a621707903ef0d294a7df11f80cf206 Mon Sep 17 00:00:00 2001
From: Álvaro Fernández Rojas <noltari@gmail.com>
Date: Sat, 14 Jun 2025 09:59:48 +0200
Subject: net: dsa: tag_brcm: add support for legacy FCS tags
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add support for legacy Broadcom FCS tags, which are similar to
DSA_TAG_PROTO_BRCM_LEGACY.
BCM5325 and BCM5365 switches require including the original FCS value and
length, as opposed to BCM63xx switches.
Adding the original FCS value and length to DSA_TAG_PROTO_BRCM_LEGACY would
impact performance of BCM63xx switches, so it's better to create a new tag.

Signed-off-by: Álvaro Fernández Rojas <noltari@gmail.com>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Link: https://patch.msgid.link/20250614080000.1884236-3-noltari@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dsa.h  |  2 ++
 net/dsa/Kconfig    | 16 ++++++++++--
 net/dsa/tag_brcm.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 88 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 55e2d97f247e..d73ea0880066 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -54,11 +54,13 @@ struct tc_action;
 #define DSA_TAG_PROTO_RZN1_A5PSW_VALUE		26
 #define DSA_TAG_PROTO_LAN937X_VALUE		27
 #define DSA_TAG_PROTO_VSC73XX_8021Q_VALUE	28
+#define DSA_TAG_PROTO_BRCM_LEGACY_FCS_VALUE	29
 
 enum dsa_tag_protocol {
 	DSA_TAG_PROTO_NONE		= DSA_TAG_PROTO_NONE_VALUE,
 	DSA_TAG_PROTO_BRCM		= DSA_TAG_PROTO_BRCM_VALUE,
 	DSA_TAG_PROTO_BRCM_LEGACY	= DSA_TAG_PROTO_BRCM_LEGACY_VALUE,
+	DSA_TAG_PROTO_BRCM_LEGACY_FCS	= DSA_TAG_PROTO_BRCM_LEGACY_FCS_VALUE,
 	DSA_TAG_PROTO_BRCM_PREPEND	= DSA_TAG_PROTO_BRCM_PREPEND_VALUE,
 	DSA_TAG_PROTO_DSA		= DSA_TAG_PROTO_DSA_VALUE,
 	DSA_TAG_PROTO_EDSA		= DSA_TAG_PROTO_EDSA_VALUE,
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 2dfe9063613f..869cbe57162f 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -42,12 +42,24 @@ config NET_DSA_TAG_BRCM
 	  Broadcom switches which place the tag after the MAC source address.
 
 config NET_DSA_TAG_BRCM_LEGACY
-	tristate "Tag driver for Broadcom legacy switches using in-frame headers"
+	tristate "Tag driver for BCM63xx legacy switches using in-frame headers"
 	select NET_DSA_TAG_BRCM_COMMON
 	help
 	  Say Y if you want to enable support for tagging frames for the
-	  Broadcom legacy switches which place the tag after the MAC source
+	  BCM63xx legacy switches which place the tag after the MAC source
 	  address.
+	  This tag is used in BCM63xx legacy switches which work without the
+	  original FCS and length before the tag insertion.
+
+config NET_DSA_TAG_BRCM_LEGACY_FCS
+	tristate "Tag driver for BCM53xx legacy switches using in-frame headers"
+	select NET_DSA_TAG_BRCM_COMMON
+	help
+	  Say Y if you want to enable support for tagging frames for the
+	  BCM53xx legacy switches which place the tag after the MAC source
+	  address.
+	  This tag is used in BCM53xx legacy switches which expect original
+	  FCS and length before the tag insertion to be present.
 
 config NET_DSA_TAG_BRCM_PREPEND
 	tristate "Tag driver for Broadcom switches using prepended headers"
diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c
index 9f4b0bcd95cd..26bb657ceac3 100644
--- a/net/dsa/tag_brcm.c
+++ b/net/dsa/tag_brcm.c
@@ -15,6 +15,7 @@
 
 #define BRCM_NAME		"brcm"
 #define BRCM_LEGACY_NAME	"brcm-legacy"
+#define BRCM_LEGACY_FCS_NAME	"brcm-legacy-fcs"
 #define BRCM_PREPEND_NAME	"brcm-prepend"
 
 /* Legacy Broadcom tag (6 bytes) */
@@ -32,6 +33,10 @@
 #define BRCM_LEG_MULTICAST	(1 << 5)
 #define BRCM_LEG_EGRESS		(2 << 5)
 #define BRCM_LEG_INGRESS	(3 << 5)
+#define BRCM_LEG_LEN_HI(x)	(((x) >> 8) & 0x7)
+
+/* 4th byte in the tag */
+#define BRCM_LEG_LEN_LO(x)	((x) & 0xff)
 
 /* 6th byte in the tag */
 #define BRCM_LEG_PORT_ID	(0xf)
@@ -212,7 +217,8 @@ DSA_TAG_DRIVER(brcm_netdev_ops);
 MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM, BRCM_NAME);
 #endif
 
-#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY)
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY) || \
+	IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY_FCS)
 static struct sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb,
 					struct net_device *dev)
 {
@@ -244,7 +250,9 @@ static struct sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb,
 
 	return skb;
 }
+#endif /* CONFIG_NET_DSA_TAG_BRCM_LEGACY || CONFIG_NET_DSA_TAG_BRCM_LEGACY_FCS */
 
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY)
 static struct sk_buff *brcm_leg_tag_xmit(struct sk_buff *skb,
 					 struct net_device *dev)
 {
@@ -294,6 +302,66 @@ DSA_TAG_DRIVER(brcm_legacy_netdev_ops);
 MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM_LEGACY, BRCM_LEGACY_NAME);
 #endif /* CONFIG_NET_DSA_TAG_BRCM_LEGACY */
 
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY_FCS)
+static struct sk_buff *brcm_leg_fcs_tag_xmit(struct sk_buff *skb,
+					     struct net_device *dev)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	unsigned int fcs_len;
+	__le32 fcs_val;
+	u8 *brcm_tag;
+
+	/* The Ethernet switch we are interfaced with needs packets to be at
+	 * least 64 bytes (including FCS) otherwise they will be discarded when
+	 * they enter the switch port logic. When Broadcom tags are enabled, we
+	 * need to make sure that packets are at least 70 bytes (including FCS
+	 * and tag) because the length verification is done after the Broadcom
+	 * tag is stripped off the ingress packet.
+	 *
+	 * Let dsa_user_xmit() free the SKB.
+	 */
+	if (__skb_put_padto(skb, ETH_ZLEN + BRCM_LEG_TAG_LEN, false))
+		return NULL;
+
+	fcs_len = skb->len;
+	fcs_val = cpu_to_le32(crc32_le(~0, skb->data, fcs_len) ^ ~0);
+
+	skb_push(skb, BRCM_LEG_TAG_LEN);
+
+	dsa_alloc_etype_header(skb, BRCM_LEG_TAG_LEN);
+
+	brcm_tag = skb->data + 2 * ETH_ALEN;
+
+	/* Broadcom tag type */
+	brcm_tag[0] = BRCM_LEG_TYPE_HI;
+	brcm_tag[1] = BRCM_LEG_TYPE_LO;
+
+	/* Broadcom tag value */
+	brcm_tag[2] = BRCM_LEG_EGRESS | BRCM_LEG_LEN_HI(fcs_len);
+	brcm_tag[3] = BRCM_LEG_LEN_LO(fcs_len);
+	brcm_tag[4] = 0;
+	brcm_tag[5] = dp->index & BRCM_LEG_PORT_ID;
+
+	/* Original FCS value */
+	if (__skb_pad(skb, ETH_FCS_LEN, false))
+		return NULL;
+	skb_put_data(skb, &fcs_val, ETH_FCS_LEN);
+
+	return skb;
+}
+
+static const struct dsa_device_ops brcm_legacy_fcs_netdev_ops = {
+	.name = BRCM_LEGACY_FCS_NAME,
+	.proto = DSA_TAG_PROTO_BRCM_LEGACY_FCS,
+	.xmit = brcm_leg_fcs_tag_xmit,
+	.rcv = brcm_leg_tag_rcv,
+	.needed_headroom = BRCM_LEG_TAG_LEN,
+};
+
+DSA_TAG_DRIVER(brcm_legacy_fcs_netdev_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM_LEGACY_FCS, BRCM_LEGACY_FCS_NAME);
+#endif /* CONFIG_NET_DSA_TAG_BRCM_LEGACY_FCS */
+
 #if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_PREPEND)
 static struct sk_buff *brcm_tag_xmit_prepend(struct sk_buff *skb,
 					     struct net_device *dev)
@@ -328,6 +396,9 @@ static struct dsa_tag_driver *dsa_tag_driver_array[] =	{
 #if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY)
 	&DSA_TAG_DRIVER_NAME(brcm_legacy_netdev_ops),
 #endif
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY_FCS)
+	&DSA_TAG_DRIVER_NAME(brcm_legacy_fcs_netdev_ops),
+#endif
 #if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_PREPEND)
 	&DSA_TAG_DRIVER_NAME(brcm_prepend_netdev_ops),
 #endif
-- 
cgit v1.2.3


From 0f66b616b87cb4a57d22f6f0e0e1698a70d8ad21 Mon Sep 17 00:00:00 2001
From: Mina Almasry <almasrymina@google.com>
Date: Sun, 15 Jun 2025 20:35:09 +0000
Subject: netmem: fix netmem comments

Trivial fix to a couple of outdated netmem comments. No code changes,
just more accurately describing current code.

Signed-off-by: Mina Almasry <almasrymina@google.com>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20250615203511.591438-1-almasrymina@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/netmem.h | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/netmem.h b/include/net/netmem.h
index 386164fb9c18..850869b45b45 100644
--- a/include/net/netmem.h
+++ b/include/net/netmem.h
@@ -89,8 +89,7 @@ static inline unsigned int net_iov_idx(const struct net_iov *niov)
  * typedef netmem_ref - a nonexistent type marking a reference to generic
  * network memory.
  *
- * A netmem_ref currently is always a reference to a struct page. This
- * abstraction is introduced so support for new memory types can be added.
+ * A netmem_ref can be a struct page* or a struct net_iov* underneath.
  *
  * Use the supplied helpers to obtain the underlying memory pointer and fields.
  */
@@ -117,9 +116,6 @@ static inline struct page *__netmem_to_page(netmem_ref netmem)
 	return (__force struct page *)netmem;
 }
 
-/* This conversion fails (returns NULL) if the netmem_ref is not struct page
- * backed.
- */
 static inline struct page *netmem_to_page(netmem_ref netmem)
 {
 	if (WARN_ON_ONCE(netmem_is_net_iov(netmem)))
@@ -178,6 +174,21 @@ static inline unsigned long netmem_pfn_trace(netmem_ref netmem)
 	return page_to_pfn(netmem_to_page(netmem));
 }
 
+/* __netmem_clear_lsb - convert netmem_ref to struct net_iov * for access to
+ * common fields.
+ * @netmem: netmem reference to extract as net_iov.
+ *
+ * All the sub types of netmem_ref (page, net_iov) have the same pp, pp_magic,
+ * dma_addr, and pp_ref_count fields at the same offsets. Thus, we can access
+ * these fields without a type check to make sure that the underlying mem is
+ * net_iov or page.
+ *
+ * The resulting value of this function can only be used to access the fields
+ * that are NET_IOV_ASSERT_OFFSET'd. Accessing any other fields will result in
+ * undefined behavior.
+ *
+ * Return: the netmem_ref cast to net_iov* regardless of its underlying type.
+ */
 static inline struct net_iov *__netmem_clear_lsb(netmem_ref netmem)
 {
 	return (struct net_iov *)((__force unsigned long)netmem & ~NET_IOV);
-- 
cgit v1.2.3


From 2de1ba0887e5d3bf02d7c212f380039b34e10aa3 Mon Sep 17 00:00:00 2001
From: Gal Pressman <gal@nvidia.com>
Date: Mon, 16 Jun 2025 16:26:24 +0300
Subject: net: vlan: Make is_vlan_dev() a stub when VLAN is not configured

Add a stub implementation of is_vlan_dev() that returns false when
VLAN support is not compiled in (CONFIG_VLAN_8021Q=n).

This allows us to compile-out VLAN-dependent dead code when it is not
needed.

This also resolves the following compilation error when:
* CONFIG_VLAN_8021Q=n
* CONFIG_OBJTOOL=y
* CONFIG_OBJTOOL_WERROR=y

drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.o: error: objtool: parse_mirred.isra.0+0x370: mlx5e_tc_act_vlan_add_push_action() missing __noreturn in .c/.h or NORETURN() in noreturns.h

The error occurs because objtool cannot determine that the BUG() calls
(which don't return) in VLAN code paths are actually dead code when VLAN
support is disabled.

Signed-off-by: Gal Pressman <gal@nvidia.com>
Link: https://patch.msgid.link/20250616132626.1749331-2-gal@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/if_vlan.h | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 38456b42cdb5..618a973ff8ee 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -79,11 +79,6 @@ static inline struct vlan_ethhdr *skb_vlan_eth_hdr(const struct sk_buff *skb)
 /* found in socket.c */
 extern void vlan_ioctl_set(int (*hook)(struct net *, void __user *));
 
-static inline bool is_vlan_dev(const struct net_device *dev)
-{
-        return dev->priv_flags & IFF_802_1Q_VLAN;
-}
-
 #define skb_vlan_tag_present(__skb)	(!!(__skb)->vlan_all)
 #define skb_vlan_tag_get(__skb)		((__skb)->vlan_tci)
 #define skb_vlan_tag_get_id(__skb)	((__skb)->vlan_tci & VLAN_VID_MASK)
@@ -200,6 +195,11 @@ struct vlan_dev_priv {
 #endif
 };
 
+static inline bool is_vlan_dev(const struct net_device *dev)
+{
+	return dev->priv_flags & IFF_802_1Q_VLAN;
+}
+
 static inline struct vlan_dev_priv *vlan_dev_priv(const struct net_device *dev)
 {
 	return netdev_priv(dev);
@@ -237,6 +237,11 @@ extern void vlan_vids_del_by_dev(struct net_device *dev,
 extern bool vlan_uses_dev(const struct net_device *dev);
 
 #else
+static inline bool is_vlan_dev(const struct net_device *dev)
+{
+	return false;
+}
+
 static inline struct net_device *
 __vlan_find_dev_deep_rcu(struct net_device *real_dev,
 		     __be16 vlan_proto, u16 vlan_id)
-- 
cgit v1.2.3


From 60a8b1a5d0824afda869f18dc0ecfe72f8dfda42 Mon Sep 17 00:00:00 2001
From: Gal Pressman <gal@nvidia.com>
Date: Mon, 16 Jun 2025 16:26:25 +0300
Subject: net: vlan: Replace BUG() with WARN_ON_ONCE() in vlan_dev_* stubs

When CONFIG_VLAN_8021Q=n, a set of stub helpers is used; three of these
helpers call BUG() unconditionally.

This code should not be reached, as callers of these functions should
always check is_vlan_dev() first. Even so, the use of BUG() is not
recommended; replace it with WARN_ON_ONCE() instead.

Reviewed-by: Alex Lazar <alazar@nvidia.com>
Reviewed-by: Dragos Tatulea <dtatulea@nvidia.com>
Signed-off-by: Gal Pressman <gal@nvidia.com>
Link: https://patch.msgid.link/20250616132626.1749331-3-gal@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/if_vlan.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 618a973ff8ee..b9f699799cf6 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -259,19 +259,19 @@ vlan_for_each(struct net_device *dev,
 
 static inline struct net_device *vlan_dev_real_dev(const struct net_device *dev)
 {
-	BUG();
+	WARN_ON_ONCE(1);
 	return NULL;
 }
 
 static inline u16 vlan_dev_vlan_id(const struct net_device *dev)
 {
-	BUG();
+	WARN_ON_ONCE(1);
 	return 0;
 }
 
 static inline __be16 vlan_dev_vlan_proto(const struct net_device *dev)
 {
-	BUG();
+	WARN_ON_ONCE(1);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 9c5f5a5bf0da5cee2044b93907ac6d8d9af0492b Mon Sep 17 00:00:00 2001
From: Gal Pressman <gal@nvidia.com>
Date: Mon, 16 Jun 2025 16:26:26 +0300
Subject: net: vlan: Use IS_ENABLED() helper for CONFIG_VLAN_8021Q guard
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The header currently tests the VLAN core with an explicit pair of 'if
defined' checks:
    #if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)

Instead, use IS_ENABLED(), which is the kernel way to test whether an
option is configured as built-in or as a module.

This is purely cosmetic – no functional changes.

Reviewed-by: Alex Lazar <alazar@nvidia.com>
Reviewed-by: Dragos Tatulea <dtatulea@nvidia.com>
Signed-off-by: Gal Pressman <gal@nvidia.com>
Link: https://patch.msgid.link/20250616132626.1749331-4-gal@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/if_vlan.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index b9f699799cf6..15e01935d3fa 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -131,7 +131,7 @@ struct vlan_pcpu_stats {
 	u32			tx_dropped;
 };
 
-#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
+#if IS_ENABLED(CONFIG_VLAN_8021Q)
 
 extern struct net_device *__vlan_find_dev_deep_rcu(struct net_device *real_dev,
 					       __be16 vlan_proto, u16 vlan_id);
-- 
cgit v1.2.3


From e3411e326fa48c9be09ba449330352ba698db698 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Tue, 17 Jun 2025 00:44:09 +0200
Subject: net: ipv4: Add a flags argument to iptunnel_xmit(),
 udp_tunnel_xmit_skb()

iptunnel_xmit() erases the contents of the SKB control block. In order to
be able to set particular IPCB flags on the SKB, add a corresponding
parameter, and propagate it to udp_tunnel_xmit_skb() as well.

In one of the following patches, VXLAN driver will use this facility to
mark packets as subject to IP multicast routing.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Acked-by: Antonio Quartulli <antonio@openvpn.net>
Link: https://patch.msgid.link/89c9daf9f2dc088b6b92ccebcc929f51742de91f.1750113335.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/amt.c              |  9 ++++++---
 drivers/net/bareudp.c          |  4 ++--
 drivers/net/geneve.c           |  4 ++--
 drivers/net/gtp.c              | 10 ++++++----
 drivers/net/ovpn/udp.c         |  2 +-
 drivers/net/vxlan/vxlan_core.c |  2 +-
 drivers/net/wireguard/socket.c |  2 +-
 include/net/ip_tunnels.h       |  2 +-
 include/net/udp_tunnel.h       |  2 +-
 net/ipv4/ip_tunnel.c           |  4 ++--
 net/ipv4/ip_tunnel_core.c      |  4 +++-
 net/ipv4/udp_tunnel_core.c     |  5 +++--
 net/ipv6/sit.c                 |  2 +-
 net/sctp/protocol.c            |  3 ++-
 net/tipc/udp_media.c           |  2 +-
 15 files changed, 33 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/drivers/net/amt.c b/drivers/net/amt.c
index fb130fde68c0..ed86537b2f61 100644
--- a/drivers/net/amt.c
+++ b/drivers/net/amt.c
@@ -1046,7 +1046,8 @@ static bool amt_send_membership_update(struct amt_dev *amt,
 			    amt->gw_port,
 			    amt->relay_port,
 			    false,
-			    false);
+			    false,
+			    0);
 	amt_update_gw_status(amt, AMT_STATUS_SENT_UPDATE, true);
 	return false;
 }
@@ -1103,7 +1104,8 @@ static void amt_send_multicast_data(struct amt_dev *amt,
 			    amt->relay_port,
 			    tunnel->source_port,
 			    false,
-			    false);
+			    false,
+			    0);
 }
 
 static bool amt_send_membership_query(struct amt_dev *amt,
@@ -1161,7 +1163,8 @@ static bool amt_send_membership_query(struct amt_dev *amt,
 			    amt->relay_port,
 			    tunnel->source_port,
 			    false,
-			    false);
+			    false,
+			    0);
 	amt_update_relay_status(tunnel, AMT_STATUS_SENT_QUERY, true);
 	return false;
 }
diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c
index a9dffdcac805..5e613080d3f8 100644
--- a/drivers/net/bareudp.c
+++ b/drivers/net/bareudp.c
@@ -362,8 +362,8 @@ static int bareudp_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 	udp_tunnel_xmit_skb(rt, sock->sk, skb, saddr, info->key.u.ipv4.dst,
 			    tos, ttl, df, sport, bareudp->port,
 			    !net_eq(bareudp->net, dev_net(bareudp->dev)),
-			    !test_bit(IP_TUNNEL_CSUM_BIT,
-				      info->key.tun_flags));
+			    !test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags),
+			    0);
 	return 0;
 
 free_dst:
diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index ffc15a432689..c668e8b00ed2 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -921,8 +921,8 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 	udp_tunnel_xmit_skb(rt, gs4->sock->sk, skb, saddr, info->key.u.ipv4.dst,
 			    tos, ttl, df, sport, geneve->cfg.info.key.tp_dst,
 			    !net_eq(geneve->net, dev_net(geneve->dev)),
-			    !test_bit(IP_TUNNEL_CSUM_BIT,
-				      info->key.tun_flags));
+			    !test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags),
+			    0);
 	return 0;
 }
 
diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c
index d4dec741c7f4..14584793fe4e 100644
--- a/drivers/net/gtp.c
+++ b/drivers/net/gtp.c
@@ -446,7 +446,8 @@ static int gtp0_send_echo_resp_ip(struct gtp_dev *gtp, struct sk_buff *skb)
 			    htons(GTP0_PORT), htons(GTP0_PORT),
 			    !net_eq(sock_net(gtp->sk1u),
 				    dev_net(gtp->dev)),
-			    false);
+			    false,
+			    0);
 
 	return 0;
 }
@@ -704,7 +705,8 @@ static int gtp1u_send_echo_resp(struct gtp_dev *gtp, struct sk_buff *skb)
 			    htons(GTP1U_PORT), htons(GTP1U_PORT),
 			    !net_eq(sock_net(gtp->sk1u),
 				    dev_net(gtp->dev)),
-			    false);
+			    false,
+			    0);
 	return 0;
 }
 
@@ -1304,7 +1306,7 @@ static netdev_tx_t gtp_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 				    pktinfo.gtph_port, pktinfo.gtph_port,
 				    !net_eq(sock_net(pktinfo.pctx->sk),
 					    dev_net(dev)),
-				    false);
+				    false, 0);
 		break;
 	case AF_INET6:
 #if IS_ENABLED(CONFIG_IPV6)
@@ -2405,7 +2407,7 @@ static int gtp_genl_send_echo_req(struct sk_buff *skb, struct genl_info *info)
 			    port, port,
 			    !net_eq(sock_net(sk),
 				    dev_net(gtp->dev)),
-			    false);
+			    false, 0);
 	return 0;
 }
 
diff --git a/drivers/net/ovpn/udp.c b/drivers/net/ovpn/udp.c
index bff00946eae2..d866e6bfda70 100644
--- a/drivers/net/ovpn/udp.c
+++ b/drivers/net/ovpn/udp.c
@@ -199,7 +199,7 @@ static int ovpn_udp4_output(struct ovpn_peer *peer, struct ovpn_bind *bind,
 transmit:
 	udp_tunnel_xmit_skb(rt, sk, skb, fl.saddr, fl.daddr, 0,
 			    ip4_dst_hoplimit(&rt->dst), 0, fl.fl4_sport,
-			    fl.fl4_dport, false, sk->sk_no_check_tx);
+			    fl.fl4_dport, false, sk->sk_no_check_tx, 0);
 	ret = 0;
 err:
 	local_bh_enable();
diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c
index 97792de896b7..1cc18acd242d 100644
--- a/drivers/net/vxlan/vxlan_core.c
+++ b/drivers/net/vxlan/vxlan_core.c
@@ -2522,7 +2522,7 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 
 		udp_tunnel_xmit_skb(rt, sock4->sock->sk, skb, saddr,
 				    pkey->u.ipv4.dst, tos, ttl, df,
-				    src_port, dst_port, xnet, !udp_sum);
+				    src_port, dst_port, xnet, !udp_sum, 0);
 #if IS_ENABLED(CONFIG_IPV6)
 	} else {
 		struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
diff --git a/drivers/net/wireguard/socket.c b/drivers/net/wireguard/socket.c
index 0414d7a6ce74..88e685667bc0 100644
--- a/drivers/net/wireguard/socket.c
+++ b/drivers/net/wireguard/socket.c
@@ -84,7 +84,7 @@ static int send4(struct wg_device *wg, struct sk_buff *skb,
 	skb->ignore_df = 1;
 	udp_tunnel_xmit_skb(rt, sock, skb, fl.saddr, fl.daddr, ds,
 			    ip4_dst_hoplimit(&rt->dst), 0, fl.fl4_sport,
-			    fl.fl4_dport, false, false);
+			    fl.fl4_dport, false, false, 0);
 	goto out;
 
 err:
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 0c3d571a04a1..8cf1380f3656 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -603,7 +603,7 @@ static inline int iptunnel_pull_header(struct sk_buff *skb, int hdr_len,
 
 void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
 		   __be32 src, __be32 dst, u8 proto,
-		   u8 tos, u8 ttl, __be16 df, bool xnet);
+		   u8 tos, u8 ttl, __be16 df, bool xnet, u16 ipcb_flags);
 struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
 					     gfp_t flags);
 int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst,
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index 2df3b8344eb5..28102c8fd8a8 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -150,7 +150,7 @@ static inline void udp_tunnel_drop_rx_info(struct net_device *dev)
 void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
 			 __be32 src, __be32 dst, __u8 tos, __u8 ttl,
 			 __be16 df, __be16 src_port, __be16 dst_port,
-			 bool xnet, bool nocheck);
+			 bool xnet, bool nocheck, u16 ipcb_flags);
 
 int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
 			 struct sk_buff *skb,
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 678b8f96e3e9..aaeb5d16f0c9 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -668,7 +668,7 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 	ip_tunnel_adj_headroom(dev, headroom);
 
 	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
-		      df, !net_eq(tunnel->net, dev_net(dev)));
+		      df, !net_eq(tunnel->net, dev_net(dev)), 0);
 	return;
 tx_error:
 	DEV_STATS_INC(dev, tx_errors);
@@ -857,7 +857,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 	ip_tunnel_adj_headroom(dev, max_headroom);
 
 	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
-		      df, !net_eq(tunnel->net, dev_net(dev)));
+		      df, !net_eq(tunnel->net, dev_net(dev)), 0);
 	return;
 
 #if IS_ENABLED(CONFIG_IPV6)
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index f65d2f727381..cc9915543637 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -49,7 +49,8 @@ EXPORT_SYMBOL(ip6tun_encaps);
 
 void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
 		   __be32 src, __be32 dst, __u8 proto,
-		   __u8 tos, __u8 ttl, __be16 df, bool xnet)
+		   __u8 tos, __u8 ttl, __be16 df, bool xnet,
+		   u16 ipcb_flags)
 {
 	int pkt_len = skb->len - skb_inner_network_offset(skb);
 	struct net *net = dev_net(rt->dst.dev);
@@ -62,6 +63,7 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
 	skb_clear_hash_if_not_l4(skb);
 	skb_dst_set(skb, &rt->dst);
 	memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+	IPCB(skb)->flags = ipcb_flags;
 
 	/* Push down and install the IP header. */
 	skb_push(skb, sizeof(struct iphdr));
diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c
index 2326548997d3..9efd62505916 100644
--- a/net/ipv4/udp_tunnel_core.c
+++ b/net/ipv4/udp_tunnel_core.c
@@ -169,7 +169,7 @@ EXPORT_SYMBOL_GPL(udp_tunnel_notify_del_rx_port);
 void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
 			 __be32 src, __be32 dst, __u8 tos, __u8 ttl,
 			 __be16 df, __be16 src_port, __be16 dst_port,
-			 bool xnet, bool nocheck)
+			 bool xnet, bool nocheck, u16 ipcb_flags)
 {
 	struct udphdr *uh;
 
@@ -185,7 +185,8 @@ void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb
 
 	udp_set_csum(nocheck, skb, src, dst, skb->len);
 
-	iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet);
+	iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet,
+		      ipcb_flags);
 }
 EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb);
 
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index a72dbca9e8fc..12496ba1b7d4 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -1035,7 +1035,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
 	skb_set_inner_ipproto(skb, IPPROTO_IPV6);
 
 	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
-		      df, !net_eq(tunnel->net, dev_net(dev)));
+		      df, !net_eq(tunnel->net, dev_net(dev)), 0);
 	return NETDEV_TX_OK;
 
 tx_error_icmp:
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index f402f90eb6b6..a5ccada55f2b 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1103,7 +1103,8 @@ static inline int sctp_v4_xmit(struct sk_buff *skb, struct sctp_transport *t)
 	skb_set_inner_ipproto(skb, IPPROTO_SCTP);
 	udp_tunnel_xmit_skb(dst_rtable(dst), sk, skb, fl4->saddr,
 			    fl4->daddr, dscp, ip4_dst_hoplimit(dst), df,
-			    sctp_sk(sk)->udp_port, t->encap_port, false, false);
+			    sctp_sk(sk)->udp_port, t->encap_port, false, false,
+			    0);
 	return 0;
 }
 
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index 108a4cc2e001..87e8c1e6d550 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -197,7 +197,7 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb,
 		ttl = ip4_dst_hoplimit(&rt->dst);
 		udp_tunnel_xmit_skb(rt, ub->ubsock->sk, skb, src->ipv4.s_addr,
 				    dst->ipv4.s_addr, 0, ttl, 0, src->port,
-				    dst->port, false, true);
+				    dst->port, false, true, 0);
 #if IS_ENABLED(CONFIG_IPV6)
 	} else {
 		if (!ndst) {
-- 
cgit v1.2.3


From 35bec72a24ace52a7f57642ff2813f22733b08fd Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Tue, 17 Jun 2025 00:44:12 +0200
Subject: net: ipv4: Add ip_mr_output()

Multicast routing is today handled in the input path. Locally generated MC
packets don't hit the IPMR code today. Thus if a VXLAN remote address is
multicast, the driver needs to set an OIF during route lookup. Thus MC
routing configuration needs to be kept in sync with the VXLAN FDB and MDB.
Ideally, the VXLAN packets would be routed by the MC routing code instead.

To that end, this patch adds support to route locally generated multicast
packets. The newly-added routines do largely what ip_mr_input() and
ip_mr_forward() do: make an MR cache lookup to find where to send the
packets, and use ip_mc_output() to send each of them. When no cache entry
is found, the packet is punted to the daemon for resolution.

However, an installation that uses a VXLAN underlay netdevice for which it
also has matching MC routes would get different routing with this patch.
Previously, the MC packets would be delivered directly to the underlay
port, whereas now they would be MC-routed. In order to avoid this change in
behavior, introduce an IPCB flag. Only if the flag is set will
ip_mr_output() actually engage, otherwise it reverts to ip_mc_output().

This code is based on work by Roopa Prabhu and Nikolay Aleksandrov.

Signed-off-by: Roopa Prabhu <roopa@nvidia.com>
Signed-off-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: Benjamin Poirier <bpoirier@nvidia.com>
Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Link: https://patch.msgid.link/0aadbd49330471c0f758d54afb05eb3b6e3a6b65.1750113335.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/ip.h |   2 +
 net/ipv4/ipmr.c  | 117 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/route.c |   2 +-
 3 files changed, 120 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/ip.h b/include/net/ip.h
index 47ed6d23853d..375304bb99f6 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -59,6 +59,7 @@ struct inet_skb_parm {
 #define IPSKB_L3SLAVE		BIT(7)
 #define IPSKB_NOPOLICY		BIT(8)
 #define IPSKB_MULTIPATH		BIT(9)
+#define IPSKB_MCROUTE		BIT(10)
 
 	u16			frag_max_size;
 };
@@ -167,6 +168,7 @@ void ip_list_rcv(struct list_head *head, struct packet_type *pt,
 int ip_local_deliver(struct sk_buff *skb);
 void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int proto);
 int ip_mr_input(struct sk_buff *skb);
+int ip_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb);
 int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb);
 int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb);
 int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 74d45fd5d11e..f78c4e53dc8c 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1965,6 +1965,19 @@ out_free:
 	kfree_skb(skb);
 }
 
+static void ipmr_queue_output_xmit(struct net *net, struct mr_table *mrt,
+				   struct sk_buff *skb, int vifi)
+{
+	if (ipmr_prepare_xmit(net, mrt, skb, vifi))
+		goto out_free;
+
+	ip_mc_output(net, NULL, skb);
+	return;
+
+out_free:
+	kfree_skb(skb);
+}
+
 /* Called with mrt_lock or rcu_read_lock() */
 static int ipmr_find_vif(const struct mr_table *mrt, struct net_device *dev)
 {
@@ -2224,6 +2237,110 @@ dont_forward:
 	return 0;
 }
 
+static void ip_mr_output_finish(struct net *net, struct mr_table *mrt,
+				struct net_device *dev, struct sk_buff *skb,
+				struct mfc_cache *c)
+{
+	int psend = -1;
+	int ct;
+
+	atomic_long_inc(&c->_c.mfc_un.res.pkt);
+	atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes);
+	WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies);
+
+	/* Forward the frame */
+	if (c->mfc_origin == htonl(INADDR_ANY) &&
+	    c->mfc_mcastgrp == htonl(INADDR_ANY)) {
+		if (ip_hdr(skb)->ttl >
+		    c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) {
+			/* It's an (*,*) entry and the packet is not coming from
+			 * the upstream: forward the packet to the upstream
+			 * only.
+			 */
+			psend = c->_c.mfc_parent;
+			goto last_xmit;
+		}
+		goto dont_xmit;
+	}
+
+	for (ct = c->_c.mfc_un.res.maxvif - 1;
+	     ct >= c->_c.mfc_un.res.minvif; ct--) {
+		if (ip_hdr(skb)->ttl > c->_c.mfc_un.res.ttls[ct]) {
+			if (psend != -1) {
+				struct sk_buff *skb2;
+
+				skb2 = skb_clone(skb, GFP_ATOMIC);
+				if (skb2)
+					ipmr_queue_output_xmit(net, mrt,
+							       skb2, psend);
+			}
+			psend = ct;
+		}
+	}
+
+last_xmit:
+	if (psend != -1) {
+		ipmr_queue_output_xmit(net, mrt, skb, psend);
+		return;
+	}
+
+dont_xmit:
+	kfree_skb(skb);
+}
+
+/* Multicast packets for forwarding arrive here
+ * Called with rcu_read_lock();
+ */
+int ip_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+	struct rtable *rt = skb_rtable(skb);
+	struct mfc_cache *cache;
+	struct net_device *dev;
+	struct mr_table *mrt;
+	int vif;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+	dev = rt->dst.dev;
+
+	if (IPCB(skb)->flags & IPSKB_FORWARDED)
+		goto mc_output;
+	if (!(IPCB(skb)->flags & IPSKB_MCROUTE))
+		goto mc_output;
+
+	skb->dev = dev;
+
+	mrt = ipmr_rt_fib_lookup(net, skb);
+	if (IS_ERR(mrt))
+		goto mc_output;
+
+	/* already under rcu_read_lock() */
+	cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
+	if (!cache) {
+		vif = ipmr_find_vif(mrt, dev);
+		if (vif >= 0)
+			cache = ipmr_cache_find_any(mrt, ip_hdr(skb)->daddr,
+						    vif);
+	}
+
+	/* No usable cache entry */
+	if (!cache) {
+		vif = ipmr_find_vif(mrt, dev);
+		if (vif >= 0)
+			return ipmr_cache_unresolved(mrt, vif, skb, dev);
+		goto mc_output;
+	}
+
+	vif = cache->_c.mfc_parent;
+	if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev)
+		goto mc_output;
+
+	ip_mr_output_finish(net, mrt, dev, skb, cache);
+	return 0;
+
+mc_output:
+	return ip_mc_output(net, sk, skb);
+}
+
 #ifdef CONFIG_IP_PIMSM_V1
 /* Handle IGMP messages of PIMv1 */
 int pim_rcv_v1(struct sk_buff *skb)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index fccb05fb3a79..3ddf6bf40357 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2660,7 +2660,7 @@ add:
 			if (IN_DEV_MFORWARD(in_dev) &&
 			    !ipv4_is_local_multicast(fl4->daddr)) {
 				rth->dst.input = ip_mr_input;
-				rth->dst.output = ip_mc_output;
+				rth->dst.output = ip_mr_output;
 			}
 		}
 #endif
-- 
cgit v1.2.3


From 6a7d88ca15f73c5c570c372238f71d63da1fda55 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Tue, 17 Jun 2025 00:44:13 +0200
Subject: net: ipv6: Make udp_tunnel6_xmit_skb() void

The function always returns zero, thus the return value does not carry any
signal. Just make it void.

Most callers already ignore the return value. However:

- Refold arguments of the call from sctp_v6_xmit() so that they fit into
  the 80-column limit.

- tipc_udp_xmit() initializes err from the return value, but that should
  already be always zero at that point. So there's no practical change, but
  elision of the assignment prompts a couple more tweaks to clean up the
  function.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://patch.msgid.link/7facacf9d8ca3ca9391a4aee88160913671b868d.1750113335.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/udp_tunnel.h  | 14 +++++++-------
 net/ipv6/ip6_udp_tunnel.c | 15 +++++++--------
 net/sctp/ipv6.c           |  7 ++++---
 net/tipc/udp_media.c      | 10 +++++-----
 4 files changed, 23 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index 28102c8fd8a8..0b01f6ade20d 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -152,13 +152,13 @@ void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb
 			 __be16 df, __be16 src_port, __be16 dst_port,
 			 bool xnet, bool nocheck, u16 ipcb_flags);
 
-int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
-			 struct sk_buff *skb,
-			 struct net_device *dev,
-			 const struct in6_addr *saddr,
-			 const struct in6_addr *daddr,
-			 __u8 prio, __u8 ttl, __be32 label,
-			 __be16 src_port, __be16 dst_port, bool nocheck);
+void udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
+			  struct sk_buff *skb,
+			  struct net_device *dev,
+			  const struct in6_addr *saddr,
+			  const struct in6_addr *daddr,
+			  __u8 prio, __u8 ttl, __be32 label,
+			  __be16 src_port, __be16 dst_port, bool nocheck);
 
 void udp_tunnel_sock_release(struct socket *sock);
 
diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c
index c99053189ea8..21681718b7bb 100644
--- a/net/ipv6/ip6_udp_tunnel.c
+++ b/net/ipv6/ip6_udp_tunnel.c
@@ -74,13 +74,13 @@ error:
 }
 EXPORT_SYMBOL_GPL(udp_sock_create6);
 
-int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
-			 struct sk_buff *skb,
-			 struct net_device *dev,
-			 const struct in6_addr *saddr,
-			 const struct in6_addr *daddr,
-			 __u8 prio, __u8 ttl, __be32 label,
-			 __be16 src_port, __be16 dst_port, bool nocheck)
+void udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
+			  struct sk_buff *skb,
+			  struct net_device *dev,
+			  const struct in6_addr *saddr,
+			  const struct in6_addr *daddr,
+			  __u8 prio, __u8 ttl, __be32 label,
+			  __be16 src_port, __be16 dst_port, bool nocheck)
 {
 	struct udphdr *uh;
 	struct ipv6hdr *ip6h;
@@ -109,7 +109,6 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
 	ip6h->saddr	  = *saddr;
 
 	ip6tunnel_xmit(sk, skb, dev);
-	return 0;
 }
 EXPORT_SYMBOL_GPL(udp_tunnel6_xmit_skb);
 
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index a9ed2ccab1bd..d1ecf7454827 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -261,9 +261,10 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *t)
 	skb_set_inner_ipproto(skb, IPPROTO_SCTP);
 	label = ip6_make_flowlabel(sock_net(sk), skb, fl6->flowlabel, true, fl6);
 
-	return udp_tunnel6_xmit_skb(dst, sk, skb, NULL, &fl6->saddr,
-				    &fl6->daddr, tclass, ip6_dst_hoplimit(dst),
-				    label, sctp_sk(sk)->udp_port, t->encap_port, false);
+	udp_tunnel6_xmit_skb(dst, sk, skb, NULL, &fl6->saddr, &fl6->daddr,
+			     tclass, ip6_dst_hoplimit(dst), label,
+			     sctp_sk(sk)->udp_port, t->encap_port, false);
+	return 0;
 }
 
 /* Returns the dst cache entry for the given source and destination ip
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index 87e8c1e6d550..414713fcd8c5 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -172,7 +172,7 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb,
 			 struct udp_media_addr *dst, struct dst_cache *cache)
 {
 	struct dst_entry *ndst;
-	int ttl, err = 0;
+	int ttl, err;
 
 	local_bh_disable();
 	ndst = dst_cache_get(cache);
@@ -217,13 +217,13 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb,
 			dst_cache_set_ip6(cache, ndst, &fl6.saddr);
 		}
 		ttl = ip6_dst_hoplimit(ndst);
-		err = udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb, NULL,
-					   &src->ipv6, &dst->ipv6, 0, ttl, 0,
-					   src->port, dst->port, false);
+		udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb, NULL,
+				     &src->ipv6, &dst->ipv6, 0, ttl, 0,
+				     src->port, dst->port, false);
 #endif
 	}
 	local_bh_enable();
-	return err;
+	return 0;
 
 tx_error:
 	local_bh_enable();
-- 
cgit v1.2.3


From f78c75d84fe83898f0a00658f593d4f17b38cbc6 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Tue, 17 Jun 2025 00:44:14 +0200
Subject: net: ipv6: Add a flags argument to ip6tunnel_xmit(),
 udp_tunnel6_xmit_skb()

ip6tunnel_xmit() erases the contents of the SKB control block. In order to
be able to set particular IP6CB flags on the SKB, add a corresponding
parameter, and propagate it to udp_tunnel6_xmit_skb() as well.

In one of the following patches, VXLAN driver will use this facility to
mark packets as subject to IPv6 multicast routing.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://patch.msgid.link/acb4f9f3e40c3a931236c3af08a720b017fbfbfb.1750113335.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/bareudp.c          | 3 ++-
 drivers/net/geneve.c           | 3 ++-
 drivers/net/gtp.c              | 2 +-
 drivers/net/ovpn/udp.c         | 2 +-
 drivers/net/vxlan/vxlan_core.c | 3 ++-
 drivers/net/wireguard/socket.c | 2 +-
 include/net/ip6_tunnel.h       | 3 ++-
 include/net/udp_tunnel.h       | 3 ++-
 net/ipv6/ip6_tunnel.c          | 2 +-
 net/ipv6/ip6_udp_tunnel.c      | 5 +++--
 net/sctp/ipv6.c                | 2 +-
 net/tipc/udp_media.c           | 2 +-
 12 files changed, 19 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c
index 5e613080d3f8..0df3208783ad 100644
--- a/drivers/net/bareudp.c
+++ b/drivers/net/bareudp.c
@@ -431,7 +431,8 @@ static int bareudp6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 			     &saddr, &daddr, prio, ttl,
 			     info->key.label, sport, bareudp->port,
 			     !test_bit(IP_TUNNEL_CSUM_BIT,
-				       info->key.tun_flags));
+				       info->key.tun_flags),
+			     0);
 	return 0;
 
 free_dst:
diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index c668e8b00ed2..f6bd155aae7f 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -1014,7 +1014,8 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 			     &saddr, &key->u.ipv6.dst, prio, ttl,
 			     info->key.label, sport, geneve->cfg.info.key.tp_dst,
 			     !test_bit(IP_TUNNEL_CSUM_BIT,
-				       info->key.tun_flags));
+				       info->key.tun_flags),
+			     0);
 	return 0;
 }
 #endif
diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c
index 14584793fe4e..4b668ebaa0f7 100644
--- a/drivers/net/gtp.c
+++ b/drivers/net/gtp.c
@@ -1316,7 +1316,7 @@ static netdev_tx_t gtp_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 				     ip6_dst_hoplimit(&pktinfo.rt->dst),
 				     0,
 				     pktinfo.gtph_port, pktinfo.gtph_port,
-				     false);
+				     false, 0);
 #else
 		goto tx_err;
 #endif
diff --git a/drivers/net/ovpn/udp.c b/drivers/net/ovpn/udp.c
index d866e6bfda70..254cc94c4617 100644
--- a/drivers/net/ovpn/udp.c
+++ b/drivers/net/ovpn/udp.c
@@ -274,7 +274,7 @@ transmit:
 	skb->ignore_df = 1;
 	udp_tunnel6_xmit_skb(dst, sk, skb, skb->dev, &fl.saddr, &fl.daddr, 0,
 			     ip6_dst_hoplimit(dst), 0, fl.fl6_sport,
-			     fl.fl6_dport, udp_get_no_check6_tx(sk));
+			     fl.fl6_dport, udp_get_no_check6_tx(sk), 0);
 	ret = 0;
 err:
 	local_bh_enable();
diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c
index 1cc18acd242d..b22f9866be8e 100644
--- a/drivers/net/vxlan/vxlan_core.c
+++ b/drivers/net/vxlan/vxlan_core.c
@@ -2586,7 +2586,8 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 
 		udp_tunnel6_xmit_skb(ndst, sock6->sock->sk, skb, dev,
 				     &saddr, &pkey->u.ipv6.dst, tos, ttl,
-				     pkey->label, src_port, dst_port, !udp_sum);
+				     pkey->label, src_port, dst_port, !udp_sum,
+				     0);
 #endif
 	}
 	vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX, pkt_len);
diff --git a/drivers/net/wireguard/socket.c b/drivers/net/wireguard/socket.c
index 88e685667bc0..253488f8c00f 100644
--- a/drivers/net/wireguard/socket.c
+++ b/drivers/net/wireguard/socket.c
@@ -151,7 +151,7 @@ static int send6(struct wg_device *wg, struct sk_buff *skb,
 	skb->ignore_df = 1;
 	udp_tunnel6_xmit_skb(dst, sock, skb, skb->dev, &fl.saddr, &fl.daddr, ds,
 			     ip6_dst_hoplimit(dst), 0, fl.fl6_sport,
-			     fl.fl6_dport, false);
+			     fl.fl6_dport, false, 0);
 	goto out;
 
 err:
diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h
index 399592405c72..dd163495f353 100644
--- a/include/net/ip6_tunnel.h
+++ b/include/net/ip6_tunnel.h
@@ -152,11 +152,12 @@ int ip6_tnl_get_iflink(const struct net_device *dev);
 int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu);
 
 static inline void ip6tunnel_xmit(struct sock *sk, struct sk_buff *skb,
-				  struct net_device *dev)
+				  struct net_device *dev, u16 ip6cb_flags)
 {
 	int pkt_len, err;
 
 	memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
+	IP6CB(skb)->flags = ip6cb_flags;
 	pkt_len = skb->len - skb_inner_network_offset(skb);
 	err = ip6_local_out(dev_net(skb_dst(skb)->dev), sk, skb);
 
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index 0b01f6ade20d..e3c70b579095 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -158,7 +158,8 @@ void udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
 			  const struct in6_addr *saddr,
 			  const struct in6_addr *daddr,
 			  __u8 prio, __u8 ttl, __be32 label,
-			  __be16 src_port, __be16 dst_port, bool nocheck);
+			  __be16 src_port, __be16 dst_port, bool nocheck,
+			  u16 ip6cb_flags);
 
 void udp_tunnel_sock_release(struct socket *sock);
 
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 894d3158a6f0..a885bb5c98ea 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1278,7 +1278,7 @@ route_lookup:
 	ipv6h->nexthdr = proto;
 	ipv6h->saddr = fl6->saddr;
 	ipv6h->daddr = fl6->daddr;
-	ip6tunnel_xmit(NULL, skb, dev);
+	ip6tunnel_xmit(NULL, skb, dev, 0);
 	return 0;
 tx_err_link_failure:
 	DEV_STATS_INC(dev, tx_carrier_errors);
diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c
index 21681718b7bb..8ebe17a6058a 100644
--- a/net/ipv6/ip6_udp_tunnel.c
+++ b/net/ipv6/ip6_udp_tunnel.c
@@ -80,7 +80,8 @@ void udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
 			  const struct in6_addr *saddr,
 			  const struct in6_addr *daddr,
 			  __u8 prio, __u8 ttl, __be32 label,
-			  __be16 src_port, __be16 dst_port, bool nocheck)
+			  __be16 src_port, __be16 dst_port, bool nocheck,
+			  u16 ip6cb_flags)
 {
 	struct udphdr *uh;
 	struct ipv6hdr *ip6h;
@@ -108,7 +109,7 @@ void udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
 	ip6h->daddr	  = *daddr;
 	ip6h->saddr	  = *saddr;
 
-	ip6tunnel_xmit(sk, skb, dev);
+	ip6tunnel_xmit(sk, skb, dev, ip6cb_flags);
 }
 EXPORT_SYMBOL_GPL(udp_tunnel6_xmit_skb);
 
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index d1ecf7454827..3336dcfb4515 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -263,7 +263,7 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *t)
 
 	udp_tunnel6_xmit_skb(dst, sk, skb, NULL, &fl6->saddr, &fl6->daddr,
 			     tclass, ip6_dst_hoplimit(dst), label,
-			     sctp_sk(sk)->udp_port, t->encap_port, false);
+			     sctp_sk(sk)->udp_port, t->encap_port, false, 0);
 	return 0;
 }
 
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index 414713fcd8c5..a024fcc8c0cb 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -219,7 +219,7 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb,
 		ttl = ip6_dst_hoplimit(ndst);
 		udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb, NULL,
 				     &src->ipv6, &dst->ipv6, 0, ttl, 0,
-				     src->port, dst->port, false);
+				     src->port, dst->port, false, 0);
 #endif
 	}
 	local_bh_enable();
-- 
cgit v1.2.3


From 96e8f5a9fe2d91b9f9eb8b45cc13ce1ca6a8af82 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Tue, 17 Jun 2025 00:44:18 +0200
Subject: net: ipv6: Add ip6_mr_output()

Multicast routing is today handled in the input path. Locally generated MC
packets don't hit the IPMR code today. Thus if a VXLAN remote address is
multicast, the driver needs to set an OIF during route lookup. Thus MC
routing configuration needs to be kept in sync with the VXLAN FDB and MDB.
Ideally, the VXLAN packets would be routed by the MC routing code instead.

To that end, this patch adds support to route locally generated multicast
packets. The newly-added routines do largely what ip6_mr_input() and
ip6_mr_forward() do: make an MR cache lookup to find where to send the
packets, and use ip6_output() to send each of them. When no cache entry is
found, the packet is punted to the daemon for resolution.

Similarly to the IPv4 case in a previous patch, the new logic is contingent
on a newly-added IP6CB flag being set.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Link: https://patch.msgid.link/3bcc034a3ab4d3c291072fff38f78d7fbbeef4e6.1750113335.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/ipv6.h    |   1 +
 include/linux/mroute6.h |   7 +++
 net/ipv6/ip6mr.c        | 118 ++++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv6/route.c        |   1 +
 4 files changed, 127 insertions(+)

(limited to 'include')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 5aeeed22f35b..db0eb0d86b64 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -156,6 +156,7 @@ struct inet6_skb_parm {
 #define IP6SKB_SEG6	      256
 #define IP6SKB_FAKEJUMBO      512
 #define IP6SKB_MULTIPATH      1024
+#define IP6SKB_MCROUTE        2048
 };
 
 #if defined(CONFIG_NET_L3_MASTER_DEV)
diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h
index 63ef5191cc57..fddafdc168f7 100644
--- a/include/linux/mroute6.h
+++ b/include/linux/mroute6.h
@@ -31,6 +31,7 @@ extern int ip6_mroute_getsockopt(struct sock *, int, sockptr_t, sockptr_t);
 extern int ip6_mr_input(struct sk_buff *skb);
 extern int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg);
 extern int ip6_mr_init(void);
+extern int ip6_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb);
 extern void ip6_mr_cleanup(void);
 int ip6mr_ioctl(struct sock *sk, int cmd, void *arg);
 #else
@@ -58,6 +59,12 @@ static inline int ip6_mr_init(void)
 	return 0;
 }
 
+static inline int
+ip6_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+	return ip6_output(net, sk, skb);
+}
+
 static inline void ip6_mr_cleanup(void)
 {
 	return;
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index bd964564160d..a35f4f1c6589 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -2119,6 +2119,19 @@ out_free:
 	kfree_skb(skb);
 }
 
+static void ip6mr_output2(struct net *net, struct mr_table *mrt,
+			  struct sk_buff *skb, int vifi)
+{
+	if (ip6mr_prepare_xmit(net, mrt, skb, vifi))
+		goto out_free;
+
+	ip6_output(net, NULL, skb);
+	return;
+
+out_free:
+	kfree_skb(skb);
+}
+
 /* Called with rcu_read_lock() */
 static int ip6mr_find_vif(struct mr_table *mrt, struct net_device *dev)
 {
@@ -2231,6 +2244,56 @@ dont_forward:
 	kfree_skb(skb);
 }
 
+/* Called under rcu_read_lock() */
+static void ip6_mr_output_finish(struct net *net, struct mr_table *mrt,
+				 struct net_device *dev, struct sk_buff *skb,
+				 struct mfc6_cache *c)
+{
+	int psend = -1;
+	int ct;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	atomic_long_inc(&c->_c.mfc_un.res.pkt);
+	atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes);
+	WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies);
+
+	/* Forward the frame */
+	if (ipv6_addr_any(&c->mf6c_origin) &&
+	    ipv6_addr_any(&c->mf6c_mcastgrp)) {
+		if (ipv6_hdr(skb)->hop_limit >
+		    c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) {
+			/* It's an (*,*) entry and the packet is not coming from
+			 * the upstream: forward the packet to the upstream
+			 * only.
+			 */
+			psend = c->_c.mfc_parent;
+			goto last_forward;
+		}
+		goto dont_forward;
+	}
+	for (ct = c->_c.mfc_un.res.maxvif - 1;
+	     ct >= c->_c.mfc_un.res.minvif; ct--) {
+		if (ipv6_hdr(skb)->hop_limit > c->_c.mfc_un.res.ttls[ct]) {
+			if (psend != -1) {
+				struct sk_buff *skb2;
+
+				skb2 = skb_clone(skb, GFP_ATOMIC);
+				if (skb2)
+					ip6mr_output2(net, mrt, skb2, psend);
+			}
+			psend = ct;
+		}
+	}
+last_forward:
+	if (psend != -1) {
+		ip6mr_output2(net, mrt, skb, psend);
+		return;
+	}
+
+dont_forward:
+	kfree_skb(skb);
+}
 
 /*
  *	Multicast packets for forwarding arrive here
@@ -2298,6 +2361,61 @@ int ip6_mr_input(struct sk_buff *skb)
 	return 0;
 }
 
+int ip6_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+	struct net_device *dev = skb_dst(skb)->dev;
+	struct flowi6 fl6 = (struct flowi6) {
+		.flowi6_iif = LOOPBACK_IFINDEX,
+		.flowi6_mark = skb->mark,
+	};
+	struct mfc6_cache *cache;
+	struct mr_table *mrt;
+	int err;
+	int vif;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	if (IP6CB(skb)->flags & IP6SKB_FORWARDED)
+		goto ip6_output;
+	if (!(IP6CB(skb)->flags & IP6SKB_MCROUTE))
+		goto ip6_output;
+
+	err = ip6mr_fib_lookup(net, &fl6, &mrt);
+	if (err < 0) {
+		kfree_skb(skb);
+		return err;
+	}
+
+	cache = ip6mr_cache_find(mrt,
+				 &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr);
+	if (!cache) {
+		vif = ip6mr_find_vif(mrt, dev);
+		if (vif >= 0)
+			cache = ip6mr_cache_find_any(mrt,
+						     &ipv6_hdr(skb)->daddr,
+						     vif);
+	}
+
+	/* No usable cache entry */
+	if (!cache) {
+		vif = ip6mr_find_vif(mrt, dev);
+		if (vif >= 0)
+			return ip6mr_cache_unresolved(mrt, vif, skb, dev);
+		goto ip6_output;
+	}
+
+	/* Wrong interface */
+	vif = cache->_c.mfc_parent;
+	if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev)
+		goto ip6_output;
+
+	ip6_mr_output_finish(net, mrt, dev, skb, cache);
+	return 0;
+
+ip6_output:
+	return ip6_output(net, sk, skb);
+}
+
 int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
 		    u32 portid)
 {
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 79c8f1acf8a3..df0caffefb38 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1145,6 +1145,7 @@ static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
 		rt->dst.input = ip6_input;
 	} else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
 		rt->dst.input = ip6_mc_input;
+		rt->dst.output = ip6_mr_output;
 	} else {
 		rt->dst.input = ip6_forward;
 	}
-- 
cgit v1.2.3


From f8337efa4ff5a27e6c1d4e384166413eecd21a65 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Tue, 17 Jun 2025 00:44:19 +0200
Subject: vxlan: Support MC routing in the underlay

Locally-generated MC packets have so far not been subject to MC routing.
Instead an MC-enabled installation would maintain the MC routing tables,
and separately from that the list of interfaces to send packets to as part
of the VXLAN FDB and MDB.

In a previous patch, the ip_mr_output() and ip6_mr_output() routines were
added for IPv4 and IPv6. All locally generated MC traffic is now passed
through these functions. For reasons of backward compatibility, an SKB
(IPCB / IP6CB) flag guards the actual MC routing.

This patch adds logic to set the flag, and the UAPI to enable the behavior.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://patch.msgid.link/d899655bb7e9b2521ee8c793e67056b9fd02ba12.1750113335.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/vxlan/vxlan_core.c | 22 ++++++++++++++++++++--
 include/net/vxlan.h            |  5 ++++-
 include/uapi/linux/if_link.h   |  1 +
 3 files changed, 25 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c
index b22f9866be8e..a6cc1de4d8b8 100644
--- a/drivers/net/vxlan/vxlan_core.c
+++ b/drivers/net/vxlan/vxlan_core.c
@@ -2451,6 +2451,7 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 	rcu_read_lock();
 	if (addr_family == AF_INET) {
 		struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock);
+		u16 ipcb_flags = 0;
 		struct rtable *rt;
 		__be16 df = 0;
 		__be32 saddr;
@@ -2467,6 +2468,9 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 			goto tx_error;
 		}
 
+		if (flags & VXLAN_F_MC_ROUTE)
+			ipcb_flags |= IPSKB_MCROUTE;
+
 		if (!info) {
 			/* Bypass encapsulation if the destination is local */
 			err = encap_bypass_if_local(skb, dev, vxlan, AF_INET,
@@ -2522,11 +2526,13 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 
 		udp_tunnel_xmit_skb(rt, sock4->sock->sk, skb, saddr,
 				    pkey->u.ipv4.dst, tos, ttl, df,
-				    src_port, dst_port, xnet, !udp_sum, 0);
+				    src_port, dst_port, xnet, !udp_sum,
+				    ipcb_flags);
 #if IS_ENABLED(CONFIG_IPV6)
 	} else {
 		struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
 		struct in6_addr saddr;
+		u16 ip6cb_flags = 0;
 
 		if (!ifindex)
 			ifindex = sock6->sock->sk->sk_bound_dev_if;
@@ -2542,6 +2548,9 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 			goto tx_error;
 		}
 
+		if (flags & VXLAN_F_MC_ROUTE)
+			ip6cb_flags |= IP6SKB_MCROUTE;
+
 		if (!info) {
 			u32 rt6i_flags = dst_rt6_info(ndst)->rt6i_flags;
 
@@ -2587,7 +2596,7 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		udp_tunnel6_xmit_skb(ndst, sock6->sock->sk, skb, dev,
 				     &saddr, &pkey->u.ipv6.dst, tos, ttl,
 				     pkey->label, src_port, dst_port, !udp_sum,
-				     0);
+				     ip6cb_flags);
 #endif
 	}
 	vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX, pkt_len);
@@ -3402,6 +3411,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
 	[IFLA_VXLAN_LOCALBYPASS]	= NLA_POLICY_MAX(NLA_U8, 1),
 	[IFLA_VXLAN_LABEL_POLICY]       = NLA_POLICY_MAX(NLA_U32, VXLAN_LABEL_MAX),
 	[IFLA_VXLAN_RESERVED_BITS] = NLA_POLICY_EXACT_LEN(sizeof(struct vxlanhdr)),
+	[IFLA_VXLAN_MC_ROUTE]		= NLA_POLICY_MAX(NLA_U8, 1),
 };
 
 static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
@@ -4315,6 +4325,14 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
 			return err;
 	}
 
+	if (data[IFLA_VXLAN_MC_ROUTE]) {
+		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_MC_ROUTE,
+				    VXLAN_F_MC_ROUTE, changelink,
+				    true, extack);
+		if (err)
+			return err;
+	}
+
 	if (tb[IFLA_MTU]) {
 		if (changelink) {
 			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU],
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index e2f7ca045d3e..0ee50785f4f1 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -332,6 +332,7 @@ struct vxlan_dev {
 #define VXLAN_F_VNIFILTER               0x20000
 #define VXLAN_F_MDB			0x40000
 #define VXLAN_F_LOCALBYPASS		0x80000
+#define VXLAN_F_MC_ROUTE		0x100000
 
 /* Flags that are used in the receive path. These flags must match in
  * order for a socket to be shareable
@@ -353,7 +354,9 @@ struct vxlan_dev {
 					 VXLAN_F_UDP_ZERO_CSUM6_RX |	\
 					 VXLAN_F_COLLECT_METADATA  |	\
 					 VXLAN_F_VNIFILTER         |    \
-					 VXLAN_F_LOCALBYPASS)
+					 VXLAN_F_LOCALBYPASS       |	\
+					 VXLAN_F_MC_ROUTE          |	\
+					 0)
 
 struct net_device *vxlan_dev_create(struct net *net, const char *name,
 				    u8 name_assign_type, struct vxlan_config *conf);
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 3ad2d5d98034..873c285996fe 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -1398,6 +1398,7 @@ enum {
 	IFLA_VXLAN_LOCALBYPASS,
 	IFLA_VXLAN_LABEL_POLICY, /* IPv6 flow label policy; ifla_vxlan_label_policy */
 	IFLA_VXLAN_RESERVED_BITS,
+	IFLA_VXLAN_MC_ROUTE,
 	__IFLA_VXLAN_MAX
 };
 #define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
-- 
cgit v1.2.3


From fd0406e5ca53b804353d4b1b60a980c13cbfbea3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 16 Jun 2025 08:10:47 -1000
Subject: net: tcp: tsq: Convert from tasklet to BH workqueue

The only generic interface to execute asynchronously in the BH context is
tasklet; however, it's marked deprecated and has some design flaws. To
replace tasklets, BH workqueue support was recently added. A BH workqueue
behaves similarly to regular workqueues except that the queued work items
are executed in the BH context.

This patch converts TCP Small Queues implementation from tasklet to BH
workqueue.

Semantically, this is an equivalent conversion and there shouldn't be any
user-visible behavior changes. While workqueue's queueing and execution
paths are a bit heavier than tasklet's, unless the work item is being queued
for every packet, the difference hopefully shouldn't matter.

My experience with the networking stack is very limited and this patch
definitely needs attention from someone who actually understands networking.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Cc: David Ahern <dsahern@kernel.org>
Link: https://patch.msgid.link/aFBeJ38AS1ZF3Dq5@slm.duckdns.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tcp.h     |  2 +-
 net/ipv4/tcp.c        |  2 +-
 net/ipv4/tcp_output.c | 36 ++++++++++++++++++------------------
 3 files changed, 20 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 9f852f5f8b95..761c4a0ad386 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -321,7 +321,7 @@ extern struct proto tcp_prot;
 #define TCP_DEC_STATS(net, field)	SNMP_DEC_STATS((net)->mib.tcp_statistics, field)
 #define TCP_ADD_STATS(net, field, val)	SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val)
 
-void tcp_tasklet_init(void);
+void tcp_tsq_work_init(void);
 
 int tcp_v4_err(struct sk_buff *skb, u32);
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 27d3ef83ce7b..8a3c99246d2e 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -5242,6 +5242,6 @@ void __init tcp_init(void)
 	tcp_v4_init();
 	tcp_metrics_init();
 	BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
-	tcp_tasklet_init();
+	tcp_tsq_work_init();
 	mptcp_init();
 }
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index eb50746dc482..28f840724fe8 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1066,15 +1066,15 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
  * needs to be reallocated in a driver.
  * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc
  *
- * Since transmit from skb destructor is forbidden, we use a tasklet
+ * Since transmit from skb destructor is forbidden, we use a BH work item
  * to process all sockets that eventually need to send more skbs.
- * We use one tasklet per cpu, with its own queue of sockets.
+ * We use one work item per cpu, with its own queue of sockets.
  */
-struct tsq_tasklet {
-	struct tasklet_struct	tasklet;
+struct tsq_work {
+	struct work_struct	work;
 	struct list_head	head; /* queue of tcp sockets */
 };
-static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
+static DEFINE_PER_CPU(struct tsq_work, tsq_work);
 
 static void tcp_tsq_write(struct sock *sk)
 {
@@ -1104,14 +1104,14 @@ static void tcp_tsq_handler(struct sock *sk)
 	bh_unlock_sock(sk);
 }
 /*
- * One tasklet per cpu tries to send more skbs.
- * We run in tasklet context but need to disable irqs when
+ * One work item per cpu tries to send more skbs.
+ * We run in BH context but need to disable irqs when
  * transferring tsq->head because tcp_wfree() might
  * interrupt us (non NAPI drivers)
  */
-static void tcp_tasklet_func(struct tasklet_struct *t)
+static void tcp_tsq_workfn(struct work_struct *work)
 {
-	struct tsq_tasklet *tsq = from_tasklet(tsq,  t, tasklet);
+	struct tsq_work *tsq = container_of(work, struct tsq_work, work);
 	LIST_HEAD(list);
 	unsigned long flags;
 	struct list_head *q, *n;
@@ -1181,15 +1181,15 @@ void tcp_release_cb(struct sock *sk)
 }
 EXPORT_IPV6_MOD(tcp_release_cb);
 
-void __init tcp_tasklet_init(void)
+void __init tcp_tsq_work_init(void)
 {
 	int i;
 
 	for_each_possible_cpu(i) {
-		struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
+		struct tsq_work *tsq = &per_cpu(tsq_work, i);
 
 		INIT_LIST_HEAD(&tsq->head);
-		tasklet_setup(&tsq->tasklet, tcp_tasklet_func);
+		INIT_WORK(&tsq->work, tcp_tsq_workfn);
 	}
 }
 
@@ -1203,11 +1203,11 @@ void tcp_wfree(struct sk_buff *skb)
 	struct sock *sk = skb->sk;
 	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned long flags, nval, oval;
-	struct tsq_tasklet *tsq;
+	struct tsq_work *tsq;
 	bool empty;
 
 	/* Keep one reference on sk_wmem_alloc.
-	 * Will be released by sk_free() from here or tcp_tasklet_func()
+	 * Will be released by sk_free() from here or tcp_tsq_workfn()
 	 */
 	WARN_ON(refcount_sub_and_test(skb->truesize - 1, &sk->sk_wmem_alloc));
 
@@ -1229,13 +1229,13 @@ void tcp_wfree(struct sk_buff *skb)
 		nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED;
 	} while (!try_cmpxchg(&sk->sk_tsq_flags, &oval, nval));
 
-	/* queue this socket to tasklet queue */
+	/* queue this socket to BH workqueue */
 	local_irq_save(flags);
-	tsq = this_cpu_ptr(&tsq_tasklet);
+	tsq = this_cpu_ptr(&tsq_work);
 	empty = list_empty(&tsq->head);
 	list_add(&tp->tsq_node, &tsq->head);
 	if (empty)
-		tasklet_schedule(&tsq->tasklet);
+		queue_work(system_bh_wq, &tsq->work);
 	local_irq_restore(flags);
 	return;
 out:
@@ -2634,7 +2634,7 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
 	if (refcount_read(&sk->sk_wmem_alloc) > limit) {
 		/* Always send skb if rtx queue is empty or has one skb.
 		 * No need to wait for TX completion to call us back,
-		 * after softirq/tasklet schedule.
+		 * after softirq schedule.
 		 * This helps when TX completions are delayed too much.
 		 */
 		if (tcp_rtx_queue_empty_or_single_skb(sk))
-- 
cgit v1.2.3


From c9e1225352d48b184991a4edc77b897cac66991e Mon Sep 17 00:00:00 2001
From: Dragos Tatulea <dtatulea@nvidia.com>
Date: Mon, 16 Jun 2025 17:14:30 +0300
Subject: net: Allow const args for page_to_netmem()

This allows calling page_to_netmem() with a const page * argument.

Signed-off-by: Dragos Tatulea <dtatulea@nvidia.com>
Reviewed-by: Mina Almasry <almasrymina@google.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Link: https://patch.msgid.link/20250616141441.1243044-2-mbloch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/netmem.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/netmem.h b/include/net/netmem.h
index 850869b45b45..7a1dafa3f080 100644
--- a/include/net/netmem.h
+++ b/include/net/netmem.h
@@ -139,7 +139,7 @@ static inline netmem_ref net_iov_to_netmem(struct net_iov *niov)
 	return (__force netmem_ref)((unsigned long)niov | NET_IOV);
 }
 
-static inline netmem_ref page_to_netmem(struct page *page)
+static inline netmem_ref page_to_netmem(const struct page *page)
 {
 	return (__force netmem_ref)page;
 }
-- 
cgit v1.2.3


From 1cbb49f85b4095af798f1a421db2e08894d0606c Mon Sep 17 00:00:00 2001
From: Dragos Tatulea <dtatulea@nvidia.com>
Date: Mon, 16 Jun 2025 17:14:31 +0300
Subject: net: Add skb_can_coalesce for netmem

Allow drivers that have moved over to netmem to do fragment coalescing.

Signed-off-by: Dragos Tatulea <dtatulea@nvidia.com>
Signed-off-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Mina Almasry <almasrymina@google.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Link: https://patch.msgid.link/20250616141441.1243044-3-mbloch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 5520524c93bf..9508968cb300 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3873,20 +3873,26 @@ static inline int __must_check skb_put_padto(struct sk_buff *skb, unsigned int l
 bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i)
 	__must_check;
 
-static inline bool skb_can_coalesce(struct sk_buff *skb, int i,
-				    const struct page *page, int off)
+static inline bool skb_can_coalesce_netmem(struct sk_buff *skb, int i,
+					   netmem_ref netmem, int off)
 {
 	if (skb_zcopy(skb))
 		return false;
 	if (i) {
 		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
 
-		return page == skb_frag_page(frag) &&
+		return netmem == skb_frag_netmem(frag) &&
 		       off == skb_frag_off(frag) + skb_frag_size(frag);
 	}
 	return false;
 }
 
+static inline bool skb_can_coalesce(struct sk_buff *skb, int i,
+				    const struct page *page, int off)
+{
+	return skb_can_coalesce_netmem(skb, i, page_to_netmem(page), off);
+}
+
 static inline int __skb_linearize(struct sk_buff *skb)
 {
 	return __pskb_pull_tail(skb, skb->data_len) ? 0 : -ENOMEM;
-- 
cgit v1.2.3


From a202f24b08587021a39eade5aa5444d5714689fb Mon Sep 17 00:00:00 2001
From: Dragos Tatulea <dtatulea@nvidia.com>
Date: Mon, 16 Jun 2025 17:14:32 +0300
Subject: page_pool: Add page_pool_dev_alloc_netmems helper

This is the netmem counterpart of page_pool_dev_alloc_pages() which
uses the default GFP flags for RX.

Signed-off-by: Dragos Tatulea <dtatulea@nvidia.com>
Reviewed-by: Mina Almasry <almasrymina@google.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Link: https://patch.msgid.link/20250616141441.1243044-4-mbloch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/page_pool/helpers.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h
index 93f2c31baf9b..773fc65780b5 100644
--- a/include/net/page_pool/helpers.h
+++ b/include/net/page_pool/helpers.h
@@ -153,6 +153,13 @@ static inline netmem_ref page_pool_dev_alloc_netmem(struct page_pool *pool,
 	return page_pool_alloc_netmem(pool, offset, size, gfp);
 }
 
+static inline netmem_ref page_pool_dev_alloc_netmems(struct page_pool *pool)
+{
+	gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
+
+	return page_pool_alloc_netmems(pool, gfp);
+}
+
 static inline struct page *page_pool_alloc(struct page_pool *pool,
 					   unsigned int *offset,
 					   unsigned int *size, gfp_t gfp)
-- 
cgit v1.2.3


From 7851263998d4269125fd6cb3fdbfc7c6db853859 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@google.com>
Date: Mon, 16 Jun 2025 11:21:15 -0700
Subject: atm: Revert atm_account_tx() if copy_from_iter_full() fails.

In vcc_sendmsg(), we account skb->truesize to sk->sk_wmem_alloc by
atm_account_tx().

It is expected to be reverted by atm_pop_raw() later called by
vcc->dev->ops->send(vcc, skb).

However, vcc_sendmsg() misses the same revert when copy_from_iter_full()
fails, and then we will leak a socket.

Let's factorise the revert part as atm_return_tx() and call it in
the failure path.

Note that the corresponding sk_wmem_alloc operation can be found in
alloc_tx() as of the blamed commit.

  $ git blame -L:alloc_tx net/atm/common.c c55fa3cccbc2c~

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Reported-by: Simon Horman <horms@kernel.org>
Closes: https://lore.kernel.org/netdev/20250614161959.GR414686@horms.kernel.org/
Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20250616182147.963333-3-kuni1840@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/atmdev.h | 6 ++++++
 net/atm/common.c       | 1 +
 net/atm/raw.c          | 2 +-
 3 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/atmdev.h b/include/linux/atmdev.h
index 9b02961d65ee..45f2f278b50a 100644
--- a/include/linux/atmdev.h
+++ b/include/linux/atmdev.h
@@ -249,6 +249,12 @@ static inline void atm_account_tx(struct atm_vcc *vcc, struct sk_buff *skb)
 	ATM_SKB(skb)->atm_options = vcc->atm_options;
 }
 
+static inline void atm_return_tx(struct atm_vcc *vcc, struct sk_buff *skb)
+{
+	WARN_ON_ONCE(refcount_sub_and_test(ATM_SKB(skb)->acct_truesize,
+					   &sk_atm(vcc)->sk_wmem_alloc));
+}
+
 static inline void atm_force_charge(struct atm_vcc *vcc,int truesize)
 {
 	atomic_add(truesize, &sk_atm(vcc)->sk_rmem_alloc);
diff --git a/net/atm/common.c b/net/atm/common.c
index 9b75699992ff..d7f7976ea13a 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -635,6 +635,7 @@ int vcc_sendmsg(struct socket *sock, struct msghdr *m, size_t size)
 
 	skb->dev = NULL; /* for paths shared with net_device interfaces */
 	if (!copy_from_iter_full(skb_put(skb, size), size, &m->msg_iter)) {
+		atm_return_tx(vcc, skb);
 		kfree_skb(skb);
 		error = -EFAULT;
 		goto out;
diff --git a/net/atm/raw.c b/net/atm/raw.c
index 2b5f78a7ec3e..1e6511ec842c 100644
--- a/net/atm/raw.c
+++ b/net/atm/raw.c
@@ -36,7 +36,7 @@ static void atm_pop_raw(struct atm_vcc *vcc, struct sk_buff *skb)
 
 	pr_debug("(%d) %d -= %d\n",
 		 vcc->vci, sk_wmem_alloc_get(sk), ATM_SKB(skb)->acct_truesize);
-	WARN_ON(refcount_sub_and_test(ATM_SKB(skb)->acct_truesize, &sk->sk_wmem_alloc));
+	atm_return_tx(vcc, skb);
 	dev_kfree_skb_any(skb);
 	sk->sk_write_space(sk);
 }
-- 
cgit v1.2.3


From 30b58444807c93bffeaba7d776110f2a909d2f9a Mon Sep 17 00:00:00 2001
From: Gao Xiang <hsiangkao@linux.alibaba.com>
Date: Tue, 17 Jun 2025 13:40:56 +0800
Subject: erofs: remove unused trace event erofs_destroy_inode

The trace event `erofs_destroy_inode` was added but remains unused. This
unused event contributes approximately 5KB to the kernel module size.

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Closes: https://lore.kernel.org/r/20250612224906.15000244@batman.local.home
Fixes: 13f06f48f7bf ("staging: erofs: support tracepoint")
Cc: stable@vger.kernel.org
Reviewed-by: Hongbo Li <lihongbo22@huawei.com>
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
Link: https://lore.kernel.org/r/20250617054056.3232365-1-hsiangkao@linux.alibaba.com
---
 include/trace/events/erofs.h | 18 ------------------
 1 file changed, 18 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/erofs.h b/include/trace/events/erofs.h
index a5f4b9234f46..dad7360f42f9 100644
--- a/include/trace/events/erofs.h
+++ b/include/trace/events/erofs.h
@@ -211,24 +211,6 @@ TRACE_EVENT(erofs_map_blocks_exit,
 		  show_mflags(__entry->mflags), __entry->ret)
 );
 
-TRACE_EVENT(erofs_destroy_inode,
-	TP_PROTO(struct inode *inode),
-
-	TP_ARGS(inode),
-
-	TP_STRUCT__entry(
-		__field(	dev_t,		dev		)
-		__field(	erofs_nid_t,	nid		)
-	),
-
-	TP_fast_assign(
-		__entry->dev	= inode->i_sb->s_dev;
-		__entry->nid	= EROFS_I(inode)->nid;
-	),
-
-	TP_printk("dev = (%d,%d), nid = %llu", show_dev_nid(__entry))
-);
-
 #endif /* _TRACE_EROFS_H */
 
  /* This part must be outside protection */
-- 
cgit v1.2.3


From 6c1dedf805ecd304236a83a2057ed803fb6b32f6 Mon Sep 17 00:00:00 2001
From: Jacopo Mondi <jacopo.mondi+renesas@ideasonboard.com>
Date: Mon, 16 Jun 2025 14:14:24 +0200
Subject: media: rcar-fcp: Add rcar_fcp_soft_reset()

Add a function to perform soft reset of the FCP.

It is intended to support the correct stop procedure of the VSPX-FCPVX
and VSPD-FCPD pairs according to section "62.3.7.3 Reset Operation" of
the R-Car Hardware Manual at revision 1.20.

Signed-off-by: Jacopo Mondi <jacopo.mondi+renesas@ideasonboard.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Link: https://lore.kernel.org/r/20250616-vspx-reset-v2-1-6cc12ed7e9bb@ideasonboard.com
Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil@xs4all.nl>
---
 drivers/media/platform/renesas/rcar-fcp.c | 36 +++++++++++++++++++++++++++++++
 include/media/rcar-fcp.h                  |  5 +++++
 2 files changed, 41 insertions(+)

(limited to 'include')

diff --git a/drivers/media/platform/renesas/rcar-fcp.c b/drivers/media/platform/renesas/rcar-fcp.c
index cee9bbce4e3a..f90c86bbce6e 100644
--- a/drivers/media/platform/renesas/rcar-fcp.c
+++ b/drivers/media/platform/renesas/rcar-fcp.c
@@ -9,6 +9,8 @@
 
 #include <linux/device.h>
 #include <linux/dma-mapping.h>
+#include <linux/io.h>
+#include <linux/iopoll.h>
 #include <linux/list.h>
 #include <linux/module.h>
 #include <linux/mod_devicetable.h>
@@ -19,14 +21,25 @@
 
 #include <media/rcar-fcp.h>
 
+#define RCAR_FCP_REG_RST		0x0010
+#define RCAR_FCP_REG_RST_SOFTRST	BIT(0)
+#define RCAR_FCP_REG_STA		0x0018
+#define RCAR_FCP_REG_STA_ACT		BIT(0)
+
 struct rcar_fcp_device {
 	struct list_head list;
 	struct device *dev;
+	void __iomem *base;
 };
 
 static LIST_HEAD(fcp_devices);
 static DEFINE_MUTEX(fcp_lock);
 
+static inline void rcar_fcp_write(struct rcar_fcp_device *fcp, u32 reg, u32 val)
+{
+	iowrite32(val, fcp->base + reg);
+}
+
 /* -----------------------------------------------------------------------------
  * Public API
  */
@@ -117,6 +130,25 @@ void rcar_fcp_disable(struct rcar_fcp_device *fcp)
 }
 EXPORT_SYMBOL_GPL(rcar_fcp_disable);
 
+int rcar_fcp_soft_reset(struct rcar_fcp_device *fcp)
+{
+	u32 value;
+	int ret;
+
+	if (!fcp)
+		return 0;
+
+	rcar_fcp_write(fcp, RCAR_FCP_REG_RST, RCAR_FCP_REG_RST_SOFTRST);
+	ret = readl_poll_timeout(fcp->base + RCAR_FCP_REG_STA,
+				 value, !(value & RCAR_FCP_REG_STA_ACT),
+				 1, 100);
+	if (ret)
+		dev_err(fcp->dev, "Failed to soft-reset\n");
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(rcar_fcp_soft_reset);
+
 /* -----------------------------------------------------------------------------
  * Platform Driver
  */
@@ -131,6 +163,10 @@ static int rcar_fcp_probe(struct platform_device *pdev)
 
 	fcp->dev = &pdev->dev;
 
+	fcp->base = devm_platform_ioremap_resource(pdev, 0);
+	if (IS_ERR(fcp->base))
+		return PTR_ERR(fcp->base);
+
 	dma_set_max_seg_size(fcp->dev, UINT_MAX);
 
 	pm_runtime_enable(&pdev->dev);
diff --git a/include/media/rcar-fcp.h b/include/media/rcar-fcp.h
index 179240fb163b..6ac9be9f675e 100644
--- a/include/media/rcar-fcp.h
+++ b/include/media/rcar-fcp.h
@@ -18,6 +18,7 @@ void rcar_fcp_put(struct rcar_fcp_device *fcp);
 struct device *rcar_fcp_get_device(struct rcar_fcp_device *fcp);
 int rcar_fcp_enable(struct rcar_fcp_device *fcp);
 void rcar_fcp_disable(struct rcar_fcp_device *fcp);
+int rcar_fcp_soft_reset(struct rcar_fcp_device *fcp);
 #else
 static inline struct rcar_fcp_device *rcar_fcp_get(const struct device_node *np)
 {
@@ -33,6 +34,10 @@ static inline int rcar_fcp_enable(struct rcar_fcp_device *fcp)
 	return 0;
 }
 static inline void rcar_fcp_disable(struct rcar_fcp_device *fcp) { }
+static inline int rcar_fcp_soft_reset(struct rcar_fcp_device *fcp)
+{
+	return 0;
+}
 #endif
 
 #endif /* __MEDIA_RCAR_FCP_H__ */
-- 
cgit v1.2.3


From d06c1a9f348d22478c6bc5684f9c990e15ada1e9 Mon Sep 17 00:00:00 2001
From: Jacopo Mondi <jacopo.mondi+renesas@ideasonboard.com>
Date: Tue, 17 Jun 2025 09:23:28 +0200
Subject: media: vsp1: Add VSPX support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add support for VSPX, a specialized version of the VSP2 that
transfers data to the ISP. The VSPX is composed of two RPF units
to read data from external memory and an IIF instance that performs
transfer towards the ISP.

The VSPX is supported through a newly introduced vsp1_vspx.c file that
exposes two interfaces: vsp1_vspx interface, declared in vsp1_vspx.h
for the vsp1 core to initialize and cleanup the VSPX, and a vsp1_isp
interface, declared in include/media/vsp1.h for the ISP driver to
control the VSPX operations.

Tested-by: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Reviewed-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Signed-off-by: Jacopo Mondi <jacopo.mondi+renesas@ideasonboard.com>
Link: https://lore.kernel.org/r/20250617-b4-vspx-v13-1-9f4054c1c9af@ideasonboard.com
Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil@xs4all.nl>
---
 drivers/media/platform/renesas/vsp1/Makefile    |   1 +
 drivers/media/platform/renesas/vsp1/vsp1.h      |   1 +
 drivers/media/platform/renesas/vsp1/vsp1_drv.c  |  13 +-
 drivers/media/platform/renesas/vsp1/vsp1_regs.h |   1 +
 drivers/media/platform/renesas/vsp1/vsp1_vspx.c | 632 ++++++++++++++++++++++++
 drivers/media/platform/renesas/vsp1/vsp1_vspx.h |  16 +
 include/media/vsp1.h                            |  89 ++++
 7 files changed, 752 insertions(+), 1 deletion(-)
 create mode 100644 drivers/media/platform/renesas/vsp1/vsp1_vspx.c
 create mode 100644 drivers/media/platform/renesas/vsp1/vsp1_vspx.h

(limited to 'include')

diff --git a/drivers/media/platform/renesas/vsp1/Makefile b/drivers/media/platform/renesas/vsp1/Makefile
index de8c802e1d1a..2057c8f7be47 100644
--- a/drivers/media/platform/renesas/vsp1/Makefile
+++ b/drivers/media/platform/renesas/vsp1/Makefile
@@ -6,5 +6,6 @@ vsp1-y					+= vsp1_clu.o vsp1_hsit.o vsp1_lut.o
 vsp1-y					+= vsp1_brx.o vsp1_sru.o vsp1_uds.o
 vsp1-y					+= vsp1_hgo.o vsp1_hgt.o vsp1_histo.o
 vsp1-y					+= vsp1_iif.o vsp1_lif.o vsp1_uif.o
+vsp1-y					+= vsp1_vspx.o
 
 obj-$(CONFIG_VIDEO_RENESAS_VSP1)	+= vsp1.o
diff --git a/drivers/media/platform/renesas/vsp1/vsp1.h b/drivers/media/platform/renesas/vsp1/vsp1.h
index f97a1a31bfab..94de2e85792e 100644
--- a/drivers/media/platform/renesas/vsp1/vsp1.h
+++ b/drivers/media/platform/renesas/vsp1/vsp1.h
@@ -111,6 +111,7 @@ struct vsp1_device {
 	struct media_entity_operations media_ops;
 
 	struct vsp1_drm *drm;
+	struct vsp1_vspx *vspx;
 };
 
 int vsp1_device_get(struct vsp1_device *vsp1);
diff --git a/drivers/media/platform/renesas/vsp1/vsp1_drv.c b/drivers/media/platform/renesas/vsp1/vsp1_drv.c
index 49ac3104291a..b8d06e88c475 100644
--- a/drivers/media/platform/renesas/vsp1/vsp1_drv.c
+++ b/drivers/media/platform/renesas/vsp1/vsp1_drv.c
@@ -39,6 +39,7 @@
 #include "vsp1_uds.h"
 #include "vsp1_uif.h"
 #include "vsp1_video.h"
+#include "vsp1_vspx.h"
 
 /* -----------------------------------------------------------------------------
  * Interrupt Handling
@@ -491,7 +492,10 @@ static int vsp1_create_entities(struct vsp1_device *vsp1)
 
 		ret = media_device_register(mdev);
 	} else {
-		ret = vsp1_drm_init(vsp1);
+		if (vsp1->info->version == VI6_IP_VERSION_MODEL_VSPX_GEN4)
+			ret = vsp1_vspx_init(vsp1);
+		else
+			ret = vsp1_drm_init(vsp1);
 	}
 
 done:
@@ -858,6 +862,13 @@ static const struct vsp1_device_info vsp1_device_infos[] = {
 		.uif_count = 2,
 		.wpf_count = 1,
 		.num_bru_inputs = 5,
+	}, {
+		.version = VI6_IP_VERSION_MODEL_VSPX_GEN4,
+		.model = "VSP2-X",
+		.gen = 4,
+		.features = VSP1_HAS_IIF,
+		.rpf_count = 2,
+		.wpf_count = 1,
 	},
 };
 
diff --git a/drivers/media/platform/renesas/vsp1/vsp1_regs.h b/drivers/media/platform/renesas/vsp1/vsp1_regs.h
index 86e47c2d991f..10cfbcd1b6e0 100644
--- a/drivers/media/platform/renesas/vsp1/vsp1_regs.h
+++ b/drivers/media/platform/renesas/vsp1/vsp1_regs.h
@@ -799,6 +799,7 @@
 #define VI6_IP_VERSION_MODEL_VSPDL_GEN3	(0x19 << 8)
 #define VI6_IP_VERSION_MODEL_VSPBS_GEN3	(0x1a << 8)
 #define VI6_IP_VERSION_MODEL_VSPD_GEN4	(0x1c << 8)
+#define VI6_IP_VERSION_MODEL_VSPX_GEN4	(0x1d << 8)
 /* RZ/G2L SoCs have no version register, So use 0x80 as the model version */
 #define VI6_IP_VERSION_MODEL_VSPD_RZG2L	(0x80 << 8)
 
diff --git a/drivers/media/platform/renesas/vsp1/vsp1_vspx.c b/drivers/media/platform/renesas/vsp1/vsp1_vspx.c
new file mode 100644
index 000000000000..6551d63ba387
--- /dev/null
+++ b/drivers/media/platform/renesas/vsp1/vsp1_vspx.c
@@ -0,0 +1,632 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * vsp1_vspx.c  --  R-Car Gen 4 VSPX
+ *
+ * Copyright (C) 2025 Ideas On Board Oy
+ * Copyright (C) 2025 Renesas Electronics Corporation
+ */
+
+#include "vsp1_vspx.h"
+
+#include <linux/cleanup.h>
+#include <linux/container_of.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#include <media/media-entity.h>
+#include <media/v4l2-subdev.h>
+#include <media/vsp1.h>
+
+#include "vsp1_dl.h"
+#include "vsp1_iif.h"
+#include "vsp1_pipe.h"
+#include "vsp1_rwpf.h"
+
+/*
+ * struct vsp1_vspx_pipeline - VSPX pipeline
+ * @pipe: the embedded VSP1 pipeline, see to_vsp1_vspx_pipeline()
+ * @partition: the single partition backing the pipeline's partition table
+ * @mutex: protects the streaming start/stop sequences
+ * @lock: protects access to the enabled flag
+ * @enabled: the enable flag, true between streaming start and stop
+ * @vspx_frame_end: frame end callback
+ * @frame_end_data: data passed to the frame end callback
+ */
+struct vsp1_vspx_pipeline {
+	struct vsp1_pipeline pipe;
+	struct vsp1_partition partition;
+
+	/*
+	 * Protects the streaming start/stop sequences.
+	 *
+	 * The start/stop sequences cannot be locked with the 'lock' spinlock
+	 * as they acquire mutexes when handling the pm runtime and the vsp1
+	 * pipe start/stop operations. Provide a dedicated mutex for this
+	 * reason.
+	 */
+	struct mutex mutex;
+
+	/*
+	 * Protects the enabled flag.
+	 *
+	 * The enabled flag is contended between the start/stop streaming
+	 * routines and the job_run one, which cannot take a mutex as it is
+	 * called from the ISP irq context.
+	 */
+	spinlock_t lock;
+	bool enabled;
+
+	void (*vspx_frame_end)(void *frame_end_data);
+	void *frame_end_data;
+};
+
+static inline struct vsp1_vspx_pipeline *
+to_vsp1_vspx_pipeline(struct vsp1_pipeline *pipe)
+{
+	return container_of(pipe, struct vsp1_vspx_pipeline, pipe);
+}
+
+/*
+ * struct vsp1_vspx - VSPX device
+ * @vsp1: back-pointer to the VSP1 device this VSPX belongs to
+ * @pipe: the VSPX pipeline
+ */
+struct vsp1_vspx {
+	struct vsp1_device *vsp1;
+	struct vsp1_vspx_pipeline pipe;
+};
+
+/* Map the ISP fourcc to a VSPX format and apply it to the RWPF sink pad. */
+static int vsp1_vspx_rwpf_set_subdev_fmt(struct vsp1_device *vsp1,
+					 struct vsp1_rwpf *rwpf,
+					 u32 isp_fourcc,
+					 unsigned int width,
+					 unsigned int height)
+{
+	struct v4l2_subdev *subdev = &rwpf->entity.subdev;
+	struct v4l2_subdev_format sd_fmt = {
+		.which = V4L2_SUBDEV_FORMAT_ACTIVE,
+		.pad = RWPF_PAD_SINK,
+		.format.width = width,
+		.format.height = height,
+		.format.field = V4L2_FIELD_NONE,
+	};
+	u32 vspx_fourcc;
+
+	switch (isp_fourcc) {
+	case V4L2_PIX_FMT_GREY:
+		/* 8 bit RAW Bayer image, transferred as RGB332. */
+		vspx_fourcc = V4L2_PIX_FMT_RGB332;
+		break;
+	case V4L2_PIX_FMT_Y10:
+	case V4L2_PIX_FMT_Y12:
+	case V4L2_PIX_FMT_Y16:
+		/* 10, 12 and 16 bit RAW Bayer image, transferred as RGB565. */
+		vspx_fourcc = V4L2_PIX_FMT_RGB565;
+		break;
+	case V4L2_META_FMT_GENERIC_8:
+		/* ConfigDMA parameters buffer, transferred as XBGR32. */
+		vspx_fourcc = V4L2_PIX_FMT_XBGR32;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	rwpf->fmtinfo = vsp1_get_format_info(vsp1, vspx_fourcc);
+	sd_fmt.format.code = rwpf->fmtinfo->mbus;
+
+	return v4l2_subdev_call(subdev, pad, set_fmt, NULL, &sd_fmt);
+}
+
+/* Set up RPF0 and the WPF format for a ConfigDMA or RAW image transfer. */
+static int vsp1_vspx_pipeline_configure(struct vsp1_device *vsp1,
+					dma_addr_t addr, u32 isp_fourcc,
+					unsigned int width, unsigned int height,
+					unsigned int stride,
+					unsigned int iif_sink_pad,
+					struct vsp1_dl_list *dl,
+					struct vsp1_dl_body *dlb)
+{
+	struct vsp1_vspx_pipeline *vspx_pipe = &vsp1->vspx->pipe;
+	struct vsp1_pipeline *pipe = &vspx_pipe->pipe;
+	struct vsp1_rwpf *rpf0 = pipe->inputs[0];
+	int ret;
+
+	ret = vsp1_vspx_rwpf_set_subdev_fmt(vsp1, rpf0, isp_fourcc, width,
+					    height);
+	if (ret)
+		return ret;
+
+	ret = vsp1_vspx_rwpf_set_subdev_fmt(vsp1, pipe->output, isp_fourcc,
+					    width, height);
+	if (ret)
+		return ret;
+
+	vsp1_pipeline_calculate_partition(pipe, &pipe->part_table[0], width, 0);
+	rpf0->format.plane_fmt[0].bytesperline = stride;
+	rpf0->format.num_planes = 1;
+	rpf0->mem.addr[0] = addr;
+
+	/*
+	 * Route RPF0 to the IIF sink pad selected by the caller, for either
+	 * the config or the image path.
+	 */
+	rpf0->entity.sink_pad = iif_sink_pad;
+
+	vsp1_entity_route_setup(&rpf0->entity, pipe, dlb);
+	vsp1_entity_configure_stream(&rpf0->entity, rpf0->entity.state, pipe,
+				     dl, dlb);
+	vsp1_entity_configure_partition(&rpf0->entity, pipe,
+					&pipe->part_table[0], dl, dlb);
+
+	return 0;
+}
+
+/* -----------------------------------------------------------------------------
+ * Interrupt handling
+ */
+
+static void vsp1_vspx_pipeline_frame_end(struct vsp1_pipeline *pipe,
+					 unsigned int completion)
+{
+	struct vsp1_vspx_pipeline *vspx_pipe = to_vsp1_vspx_pipeline(pipe);
+
+	scoped_guard(spinlock_irqsave, &pipe->irqlock) {
+		/*
+		 * The vsp1_pipe is operated in singleshot mode, so the
+		 * pipeline state has to be set back to stopped manually
+		 * every time a transfer completes.
+		 */
+		pipe->state = VSP1_PIPELINE_STOPPED;
+	}
+
+	if (vspx_pipe->vspx_frame_end)
+		vspx_pipe->vspx_frame_end(vspx_pipe->frame_end_data);
+}
+
+/* -----------------------------------------------------------------------------
+ * ISP Driver API (include/media/vsp1.h)
+ */
+
+/**
+ * vsp1_isp_init() - Initialize the VSPX
+ * @dev: The VSP1 struct device
+ *
+ * Only verifies that driver data has been attached to @dev.
+ *
+ * Return: %0 on success, -EPROBE_DEFER if @dev has no driver data
+ */
+int vsp1_isp_init(struct device *dev)
+{
+	const struct vsp1_device *vsp1 = dev_get_drvdata(dev);
+
+	/* No driver data attached to the device yet: defer the caller. */
+	return vsp1 ? 0 : -EPROBE_DEFER;
+}
+EXPORT_SYMBOL_GPL(vsp1_isp_init);
+
+/**
+ * vsp1_isp_get_bus_master - Get VSPX bus master
+ * @dev: The VSP1 struct device
+ *
+ * The VSPX accesses memory through an FCPX instance. When allocating memory
+ * buffers that will have to be accessed by the VSPX the 'struct device' of
+ * the FCPX should be used. Use this function to get a pointer to it.
+ *
+ * Return: the bus master's device or ERR_PTR(-ENODEV) without driver data
+ */
+struct device *vsp1_isp_get_bus_master(struct device *dev)
+{
+	const struct vsp1_device *vsp1 = dev_get_drvdata(dev);
+
+	if (vsp1)
+		return vsp1->bus_master;
+
+	return ERR_PTR(-ENODEV);
+}
+EXPORT_SYMBOL_GPL(vsp1_isp_get_bus_master);
+
+/**
+ * vsp1_isp_alloc_buffer - Allocate a buffer in the VSPX address space
+ * @dev: The VSP1 struct device
+ * @size: The size in bytes of the buffer to allocate
+ * @buffer_desc: The buffer descriptor. Filled with the CPU-mapped address,
+ *		 the bus address and the size of the allocated buffer
+ *
+ * Allocate coherent DMA memory on behalf of the VSPX bus master, for buffers
+ * that will later be accessed by the VSPX. Buffers allocated with this
+ * function shall be released with a call to vsp1_isp_free_buffer(). This
+ * function is used by the ISP driver to allocate memory for the ConfigDMA
+ * parameters buffer.
+ *
+ * Return: %0 on success, -ENODEV without a bus master or -ENOMEM
+ */
+int vsp1_isp_alloc_buffer(struct device *dev, size_t size,
+			  struct vsp1_isp_buffer_desc *buffer_desc)
+{
+	struct device *master = vsp1_isp_get_bus_master(dev);
+	void *cpu_addr;
+
+	if (IS_ERR_OR_NULL(master))
+		return -ENODEV;
+
+	cpu_addr = dma_alloc_coherent(master, size, &buffer_desc->dma_addr,
+				      GFP_KERNEL);
+	buffer_desc->cpu_addr = cpu_addr;
+	if (!cpu_addr)
+		return -ENOMEM;
+
+	buffer_desc->size = size;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(vsp1_isp_alloc_buffer);
+
+/**
+ * vsp1_isp_free_buffer - Release a buffer allocated by vsp1_isp_alloc_buffer()
+ * @dev: The VSP1 struct device
+ * @buffer_desc: The descriptor of the buffer to release as returned by
+ *		 vsp1_isp_alloc_buffer()
+ *
+ * Release the VSPX address space memory described by @buffer_desc.
+ */
+void vsp1_isp_free_buffer(struct device *dev,
+			  struct vsp1_isp_buffer_desc *buffer_desc)
+{
+	struct device *bus_master = vsp1_isp_get_bus_master(dev);
+
+	if (IS_ERR_OR_NULL(bus_master))
+		return;
+
+	dma_free_coherent(bus_master, buffer_desc->size, buffer_desc->cpu_addr,
+			  buffer_desc->dma_addr);
+}
+EXPORT_SYMBOL_GPL(vsp1_isp_free_buffer);
+
+/**
+ * vsp1_isp_start_streaming - Start processing VSPX jobs
+ * @dev: The VSP1 struct device
+ * @frame_end: The frame end callback description
+ *
+ * Start the VSPX and prepare for accepting buffer transfer job requests.
+ * The caller is responsible for tracking the started state of the VSPX.
+ * Attempting to start an already started VSPX instance is an error.
+ *
+ * Return: %0 on success or a negative error code, -EBUSY if already started
+ */
+int vsp1_isp_start_streaming(struct device *dev,
+			     struct vsp1_vspx_frame_end *frame_end)
+{
+	struct vsp1_device *vsp1 = dev_get_drvdata(dev);
+	struct vsp1_vspx_pipeline *vspx_pipe = &vsp1->vspx->pipe;
+	struct vsp1_pipeline *pipe = &vspx_pipe->pipe;
+	u32 value;
+	int ret;
+
+	if (!frame_end)
+		return -EINVAL;
+
+	guard(mutex)(&vspx_pipe->mutex);
+
+	scoped_guard(spinlock_irq, &vspx_pipe->lock) {
+		if (vspx_pipe->enabled)
+			return -EBUSY;
+	}
+
+	vspx_pipe->vspx_frame_end = frame_end->vspx_frame_end;
+	vspx_pipe->frame_end_data = frame_end->frame_end_data;
+
+	/* Enable the VSP1 and prepare for streaming. */
+	vsp1_pipeline_dump(pipe, "VSPX job");
+
+	ret = vsp1_device_get(vsp1);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Make sure the VSPX is not active. This should never happen in
+	 * normal usage.
+	 */
+	value = vsp1_read(vsp1, VI6_CMD(0));
+	if (value & VI6_CMD_STRCMD) {
+		dev_err(vsp1->dev,
+			"%s: Starting of WPF0 already reserved\n", __func__);
+		ret = -EBUSY;
+		goto error_put;
+	}
+
+	value = vsp1_read(vsp1, VI6_STATUS);
+	if (value & VI6_STATUS_SYS_ACT(0)) {
+		dev_err(vsp1->dev,
+			"%s: WPF0 has not entered idle state\n", __func__);
+		ret = -EBUSY;
+		goto error_put;
+	}
+
+	scoped_guard(spinlock_irq, &vspx_pipe->lock) {
+		vspx_pipe->enabled = true;
+	}
+
+	return 0;
+
+error_put:
+	vsp1_device_put(vsp1);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(vsp1_isp_start_streaming);
+
+/**
+ * vsp1_isp_stop_streaming - Stop the VSPX
+ * @dev: The VSP1 struct device
+ *
+ * Stop the VSPX operation by stopping the vsp1 pipeline and waiting for the
+ * last frame in transfer, if any, to complete. The frame end callback is then
+ * cleared, the display list manager reset and the VSP1 device released.
+ * The caller is responsible for tracking the stopped state of the VSPX.
+ * Attempting to stop an already stopped VSPX instance is a nop.
+ */
+void vsp1_isp_stop_streaming(struct device *dev)
+{
+	struct vsp1_device *vsp1 = dev_get_drvdata(dev);
+	struct vsp1_vspx_pipeline *vspx_pipe = &vsp1->vspx->pipe;
+	struct vsp1_pipeline *pipe = &vspx_pipe->pipe;
+
+	guard(mutex)(&vspx_pipe->mutex);
+
+	scoped_guard(spinlock_irq, &vspx_pipe->lock) {
+		if (!vspx_pipe->enabled)
+			return;
+
+		vspx_pipe->enabled = false;
+	}
+
+	WARN_ON_ONCE(vsp1_pipeline_stop(pipe));
+
+	vspx_pipe->vspx_frame_end = NULL;
+	vsp1_dlm_reset(pipe->output->dlm);
+	vsp1_device_put(vsp1);
+}
+EXPORT_SYMBOL_GPL(vsp1_isp_stop_streaming);
+
+/**
+ * vsp1_isp_job_prepare - Prepare a new buffer transfer job
+ * @dev: The VSP1 struct device
+ * @job: The job description
+ *
+ * Prepare a new buffer transfer job by populating a display list that will be
+ * later executed by a call to vsp1_isp_job_run(). All pending jobs must be
+ * released after stopping the streaming operations with a call to
+ * vsp1_isp_job_release().
+ *
+ * The VSPX must have been started with vsp1_isp_start_streaming() before jobs
+ * can be prepared.
+ *
+ * Return: %0 on success or a negative error code, with @job->dl set to NULL
+ */
+int vsp1_isp_job_prepare(struct device *dev, struct vsp1_isp_job_desc *job)
+{
+	struct vsp1_device *vsp1 = dev_get_drvdata(dev);
+	struct vsp1_vspx_pipeline *vspx_pipe = &vsp1->vspx->pipe;
+	struct vsp1_pipeline *pipe = &vspx_pipe->pipe;
+	const struct v4l2_pix_format_mplane *pix_mp;
+	struct vsp1_dl_list *second_dl = NULL;
+	struct vsp1_dl_body *dlb;
+	struct vsp1_dl_list *dl;
+	int ret;
+
+	/*
+	 * Transfer the buffers described in the job: an optional ConfigDMA
+	 * parameters buffer and a RAW image.
+	 */
+
+	job->dl = vsp1_dl_list_get(pipe->output->dlm);
+	if (!job->dl)
+		return -ENOMEM;
+
+	dl = job->dl;
+	dlb = vsp1_dl_list_get_body0(dl);
+
+	/* Configure IIF routing and enable IIF function. */
+	vsp1_entity_route_setup(pipe->iif, pipe, dlb);
+	vsp1_entity_configure_stream(pipe->iif, pipe->iif->state, pipe,
+				     dl, dlb);
+
+	/* Configure WPF0 to enable RPF0 as source. */
+	vsp1_entity_route_setup(&pipe->output->entity, pipe, dlb);
+	vsp1_entity_configure_stream(&pipe->output->entity,
+				     pipe->output->entity.state, pipe,
+				     dl, dlb);
+
+	if (job->config.pairs) {
+		/*
+		 * Writing fewer than 17 pairs corrupts the output images (< 16
+		 * pairs) or freezes the VSPX operations (= 16 pairs). Only
+		 * allow more than 16 pairs to be written.
+		 */
+		if (job->config.pairs <= 16) {
+			ret = -EINVAL;
+			goto error_put_dl;
+		}
+
+		/*
+		 * Configure RPF0 for ConfigDMA data. Transfer the number of
+		 * configuration pairs plus 2 words for the header.
+		 */
+		ret = vsp1_vspx_pipeline_configure(vsp1, job->config.mem,
+						   V4L2_META_FMT_GENERIC_8,
+						   job->config.pairs * 2 + 2, 1,
+						   job->config.pairs * 2 + 2,
+						   VSPX_IIF_SINK_PAD_CONFIG,
+						   dl, dlb);
+		if (ret)
+			goto error_put_dl;
+
+		second_dl = vsp1_dl_list_get(pipe->output->dlm);
+		if (!second_dl) {
+			ret = -ENOMEM;
+			goto error_put_dl;
+		}
+
+		dl = second_dl;
+		dlb = vsp1_dl_list_get_body0(dl);
+	}
+
+	/* Configure RPF0 for RAW image transfer. */
+	pix_mp = &job->img.fmt;
+	ret = vsp1_vspx_pipeline_configure(vsp1, job->img.mem,
+					   pix_mp->pixelformat,
+					   pix_mp->width, pix_mp->height,
+					   pix_mp->plane_fmt[0].bytesperline,
+					   VSPX_IIF_SINK_PAD_IMG, dl, dlb);
+	if (ret)
+		goto error_put_dl;
+
+	if (second_dl)
+		vsp1_dl_list_add_chain(job->dl, second_dl);
+
+	return 0;
+
+error_put_dl:
+	if (second_dl)
+		vsp1_dl_list_put(second_dl);
+	vsp1_dl_list_put(job->dl);
+	job->dl = NULL;
+	return ret;
+}
+EXPORT_SYMBOL_GPL(vsp1_isp_job_prepare);
+
+/**
+ * vsp1_isp_job_run - Run a buffer transfer job
+ * @dev: The VSP1 struct device
+ * @job: The job to be run
+ *
+ * Run the display list contained in the job description provided by the caller.
+ * The job must have been prepared with a call to vsp1_isp_job_prepare() and
+ * the job's display list shall be valid.
+ *
+ * Jobs can be run only on VSPX instances which have been started. Requests
+ * to run a job after the VSPX has been stopped return -EINVAL and the job
+ * resources shall be released by the caller with vsp1_isp_job_release().
+ * When a job is run successfully all the resources acquired by
+ * vsp1_isp_job_prepare() are released by this function and no further action
+ * is required from the caller.
+ *
+ * Return: %0 on success, -EBUSY if the VSPX is busy, -EINVAL if it is stopped
+ */
+int vsp1_isp_job_run(struct device *dev, struct vsp1_isp_job_desc *job)
+{
+	struct vsp1_device *vsp1 = dev_get_drvdata(dev);
+	struct vsp1_vspx_pipeline *vspx_pipe = &vsp1->vspx->pipe;
+	struct vsp1_pipeline *pipe = &vspx_pipe->pipe;
+	u32 value;
+
+	/* Make sure VSPX is not busy processing a frame. */
+	value = vsp1_read(vsp1, VI6_CMD(0));
+	if (value) {
+		dev_err(vsp1->dev,
+			"%s: Starting of WPF0 already reserved\n", __func__);
+		return -EBUSY;
+	}
+
+	scoped_guard(spinlock_irqsave, &vspx_pipe->lock) {
+		/*
+		 * If a new job is scheduled when the VSPX is stopped, do not
+		 * run it.
+		 */
+		if (!vspx_pipe->enabled)
+			return -EINVAL;
+
+		vsp1_dl_list_commit(job->dl, 0);
+
+		/*
+		 * The display list is now under control of the display list
+		 * manager and will be released automatically when the job
+		 * completes.
+		 */
+		job->dl = NULL;
+	}
+
+	scoped_guard(spinlock_irqsave, &pipe->irqlock) {
+		vsp1_pipeline_run(pipe);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(vsp1_isp_job_run);
+
+/**
+ * vsp1_isp_job_release - Release a non processed transfer job
+ * @dev: The VSP1 struct device
+ * @job: The job to release
+ *
+ * Release a job prepared by vsp1_isp_job_prepare() and not yet run, and clear
+ * @job->dl. All pending jobs shall be released after streaming is stopped.
+ */
+void vsp1_isp_job_release(struct device *dev, struct vsp1_isp_job_desc *job)
+{
+	vsp1_dl_list_put(job->dl);
+	job->dl = NULL;
+}
+EXPORT_SYMBOL_GPL(vsp1_isp_job_release);
+
+/* -----------------------------------------------------------------------------
+ * Initialization and cleanup
+ */
+
+int vsp1_vspx_init(struct vsp1_device *vsp1)
+{
+	struct vsp1_vspx_pipeline *vspx_pipe;
+	struct vsp1_pipeline *pipe;
+
+	vsp1->vspx = devm_kzalloc(vsp1->dev, sizeof(*vsp1->vspx), GFP_KERNEL);
+	if (!vsp1->vspx)
+		return -ENOMEM;
+
+	vsp1->vspx->vsp1 = vsp1;
+
+	vspx_pipe = &vsp1->vspx->pipe;
+	vspx_pipe->enabled = false;
+
+	pipe = &vspx_pipe->pipe;
+
+	vsp1_pipeline_init(pipe);
+
+	pipe->partitions = 1;
+	pipe->part_table = &vspx_pipe->partition;
+	pipe->interlaced = false;
+	pipe->frame_end = vsp1_vspx_pipeline_frame_end;
+
+	mutex_init(&vspx_pipe->mutex);
+	spin_lock_init(&vspx_pipe->lock);
+
+	/*
+	 * Build the fixed RPF0 -> IIF -> WPF0 pipeline. RPF0 is the only
+	 * input and is used unconditionally for now.
+	 */
+	pipe->inputs[0] = vsp1->rpf[0];
+	pipe->inputs[0]->entity.pipe = pipe;
+	pipe->inputs[0]->entity.sink = &vsp1->iif->entity;
+	list_add_tail(&pipe->inputs[0]->entity.list_pipe, &pipe->entities);
+
+	pipe->iif = &vsp1->iif->entity;
+	pipe->iif->pipe = pipe;
+	pipe->iif->sink = &vsp1->wpf[0]->entity;
+	pipe->iif->sink_pad = RWPF_PAD_SINK;
+	list_add_tail(&pipe->iif->list_pipe, &pipe->entities);
+
+	pipe->output = vsp1->wpf[0];
+	pipe->output->entity.pipe = pipe;
+	list_add_tail(&pipe->output->entity.list_pipe, &pipe->entities);
+
+	return 0;
+}
+
+void vsp1_vspx_cleanup(struct vsp1_device *vsp1)
+{
+	struct vsp1_vspx *vspx = vsp1->vspx;
+
+	mutex_destroy(&vspx->pipe.mutex);	/* vspx itself is devm-managed */
+}
diff --git a/drivers/media/platform/renesas/vsp1/vsp1_vspx.h b/drivers/media/platform/renesas/vsp1/vsp1_vspx.h
new file mode 100644
index 000000000000..f871bf9e7dec
--- /dev/null
+++ b/drivers/media/platform/renesas/vsp1/vsp1_vspx.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * vsp1_vspx.h  --  R-Car Gen 4 VSPX
+ *
+ * Copyright (C) 2025 Ideas On Board Oy
+ * Copyright (C) 2025 Renesas Electronics Corporation
+ */
+#ifndef __VSP1_VSPX_H__
+#define __VSP1_VSPX_H__
+
+#include "vsp1.h"
+
+int vsp1_vspx_init(struct vsp1_device *vsp1);
+void vsp1_vspx_cleanup(struct vsp1_device *vsp1);
+
+#endif /* __VSP1_VSPX_H__ */
diff --git a/include/media/vsp1.h b/include/media/vsp1.h
index 4ea6352fd63f..d9b91ff02761 100644
--- a/include/media/vsp1.h
+++ b/include/media/vsp1.h
@@ -14,6 +14,11 @@
 #include <linux/videodev2.h>
 
 struct device;
+struct vsp1_dl_list;
+
+/* -----------------------------------------------------------------------------
+ * VSP1 DU interface
+ */
 
 int vsp1_du_init(struct device *dev);
 
@@ -121,4 +126,88 @@ void vsp1_du_atomic_flush(struct device *dev, unsigned int pipe_index,
 int vsp1_du_map_sg(struct device *dev, struct sg_table *sgt);
 void vsp1_du_unmap_sg(struct device *dev, struct sg_table *sgt);
 
+/* -----------------------------------------------------------------------------
+ * VSP1 ISP interface
+ */
+
+/**
+ * struct vsp1_isp_buffer_desc - Describe a buffer allocated by VSPX
+ * @size: Size in bytes of the buffer, as passed to vsp1_isp_alloc_buffer()
+ * @cpu_addr: CPU-mapped address of the buffer allocated by VSPX
+ * @dma_addr: Bus address of the buffer allocated by VSPX
+ */
+struct vsp1_isp_buffer_desc {
+	size_t size;
+	void *cpu_addr;
+	dma_addr_t dma_addr;
+};
+
+/**
+ * struct vsp1_isp_job_desc - Describe a VSPX buffer transfer request
+ * @config: ConfigDMA buffer descriptor
+ * @config.pairs: number of reg-value pairs in the ConfigDMA buffer
+ * @config.mem: bus address of the ConfigDMA buffer
+ * @img: RAW image buffer descriptor
+ * @img.fmt: RAW image format
+ * @img.mem: bus address of the RAW image buffer
+ * @dl: pointer to the display list populated by the VSPX driver in the
+ *      vsp1_isp_job_prepare() function
+ *
+ * Describe a transfer request for the VSPX to perform on behalf of the ISP.
+ * The job descriptor contains an optional ConfigDMA buffer and one RAW image
+ * buffer. Set config.pairs to 0 if no ConfigDMA buffer should be transferred.
+ * The minimum number of config.pairs that can be written using ConfigDMA is 17.
+ * A number of pairs < 16 corrupts the output image. A number of pairs == 16
+ * freezes the VSPX operation. If the ISP driver has to write fewer than 17
+ * pairs it shall pad the buffer with writes directed to registers that have no
+ * effect or avoid using ConfigDMA at all for such small write sequences.
+ *
+ * The ISP driver shall pass an instance of this type to vsp1_isp_job_prepare()
+ * which populates the display list pointer @dl using the @config and @img
+ * descriptors. When the job has to be run on the VSPX, the descriptor shall be
+ * passed to vsp1_isp_job_run() which consumes the display list.
+ *
+ * Job descriptors not yet run shall be released with a call to
+ * vsp1_isp_job_release() when stopping the streaming in order to properly
+ * release the resources acquired by vsp1_isp_job_prepare().
+ */
+struct vsp1_isp_job_desc {
+	struct {
+		unsigned int pairs;
+		dma_addr_t mem;
+	} config;
+	struct {
+		struct v4l2_pix_format_mplane fmt;
+		dma_addr_t mem;
+	} img;
+	struct vsp1_dl_list *dl;
+};
+
+/**
+ * struct vsp1_vspx_frame_end - VSPX frame end callback data
+ * @vspx_frame_end: Frame end callback, may be NULL. Called after a transfer
+ *		    job has been completed. If the job includes both a
+ *		    ConfigDMA and a RAW image, the callback is called after
+ *		    both have been transferred
+ * @frame_end_data: Frame end callback data, passed to @vspx_frame_end
+ */
+struct vsp1_vspx_frame_end {
+	void (*vspx_frame_end)(void *data);
+	void *frame_end_data;
+};
+
+int vsp1_isp_init(struct device *dev);
+struct device *vsp1_isp_get_bus_master(struct device *dev);
+int vsp1_isp_alloc_buffer(struct device *dev, size_t size,
+			  struct vsp1_isp_buffer_desc *buffer_desc);
+void vsp1_isp_free_buffer(struct device *dev,
+			  struct vsp1_isp_buffer_desc *buffer_desc);
+int vsp1_isp_start_streaming(struct device *dev,
+			     struct vsp1_vspx_frame_end *frame_end);
+void vsp1_isp_stop_streaming(struct device *dev);
+int vsp1_isp_job_prepare(struct device *dev,
+			 struct vsp1_isp_job_desc *job);
+int vsp1_isp_job_run(struct device *dev, struct vsp1_isp_job_desc *job);
+void vsp1_isp_job_release(struct device *dev,  struct vsp1_isp_job_desc *job);
+
 #endif /* __MEDIA_VSP1_H__ */
-- 
cgit v1.2.3


From 7c8c957ef12c41968adb66d785ce1dd5fb2f96e7 Mon Sep 17 00:00:00 2001
From: Stefan Klug <stefan.klug@ideasonboard.com>
Date: Fri, 23 May 2025 17:14:31 +0200
Subject: media: rkisp1: Add RKISP1_CID_SUPPORTED_PARAMS_BLOCKS control

Add a RKISP1_CID_SUPPORTED_PARAMS_BLOCKS V4L2 control to be able to
query the parameters blocks supported by the current kernel on the
current hardware from user space.

Signed-off-by: Stefan Klug <stefan.klug@ideasonboard.com>
Reviewed-by: Paul Elder <paul.elder@ideasonboard.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Link: https://lore.kernel.org/r/20250523-supported-params-and-wdr-v3-2-7283b8536694@ideasonboard.com
Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil@xs4all.nl>
---
 .../media/platform/rockchip/rkisp1/rkisp1-common.h |  2 +
 .../media/platform/rockchip/rkisp1/rkisp1-params.c | 49 +++++++++++++++++++++-
 include/uapi/linux/rkisp1-config.h                 | 11 +++++
 include/uapi/linux/v4l2-controls.h                 |  6 +++
 4 files changed, 67 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-common.h b/drivers/media/platform/rockchip/rkisp1/rkisp1-common.h
index ca952fd0829b..5f187f9efc7b 100644
--- a/drivers/media/platform/rockchip/rkisp1/rkisp1-common.h
+++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-common.h
@@ -415,6 +415,8 @@ struct rkisp1_params {
 	spinlock_t config_lock; /* locks the buffers list 'params' */
 	struct list_head params;
 
+	struct v4l2_ctrl_handler ctrls;
+
 	const struct v4l2_meta_format *metafmt;
 
 	enum v4l2_quantization quantization;
diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c b/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c
index 24a8de697f2b..4db0ca8d86db 100644
--- a/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c
+++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c
@@ -2736,6 +2736,44 @@ static int rkisp1_params_init_vb2_queue(struct vb2_queue *q,
 	return vb2_queue_init(q);
 }
 
+/* Create the read-only control reporting the supported params blocks. */
+static int rkisp1_params_ctrl_init(struct rkisp1_params *params)
+{
+	struct v4l2_ctrl_config ctrl_config = {
+		.id = RKISP1_CID_SUPPORTED_PARAMS_BLOCKS,
+		.name = "Supported Params Blocks",
+		.type = V4L2_CTRL_TYPE_BITMASK,
+		.flags = V4L2_CTRL_FLAG_READ_ONLY,
+	};
+	int ret;
+
+	v4l2_ctrl_handler_init(&params->ctrls, 1);
+
+	/* max: blocks known to the driver, def: blocks usable on this device. */
+	for (unsigned int i = 0; i < ARRAY_SIZE(rkisp1_ext_params_handlers); i++) {
+		const struct rkisp1_ext_params_handler *block_handler;
+
+		block_handler = &rkisp1_ext_params_handlers[i];
+		ctrl_config.max |= BIT(i);
+
+		if ((params->rkisp1->info->features & block_handler->features) ==
+		    block_handler->features)
+			ctrl_config.def |= BIT(i);
+	}
+
+	v4l2_ctrl_new_custom(&params->ctrls, &ctrl_config, NULL);
+	if (params->ctrls.error) {
+		ret = params->ctrls.error;
+		v4l2_ctrl_handler_free(&params->ctrls);
+		return ret;
+	}
+
+	/* Attach the handler to the video node only once it is known valid. */
+	params->vnode.vdev.ctrl_handler = &params->ctrls;
+
+	return 0;
+}
+
 int rkisp1_params_register(struct rkisp1_device *rkisp1)
 {
 	struct rkisp1_params *params = &rkisp1->params;
@@ -2781,15 +2819,23 @@ int rkisp1_params_register(struct rkisp1_device *rkisp1)
 	if (ret)
 		goto err_media;
 
+	ret = rkisp1_params_ctrl_init(params);
+	if (ret) {
+		dev_err(rkisp1->dev, "Control initialization error %d\n", ret);
+		goto err_media;
+	}
+
 	ret = video_register_device(vdev, VFL_TYPE_VIDEO, -1);
 	if (ret) {
 		dev_err(rkisp1->dev,
 			"failed to register %s, ret=%d\n", vdev->name, ret);
-		goto err_media;
+		goto err_ctrl;
 	}
 
 	return 0;
 
+err_ctrl:
+	v4l2_ctrl_handler_free(&params->ctrls);
 err_media:
 	media_entity_cleanup(&vdev->entity);
 	mutex_destroy(&node->vlock);
@@ -2806,6 +2852,7 @@ void rkisp1_params_unregister(struct rkisp1_device *rkisp1)
 		return;
 
 	vb2_video_unregister_device(vdev);
+	v4l2_ctrl_handler_free(&params->ctrls);
 	media_entity_cleanup(&vdev->entity);
 	mutex_destroy(&node->vlock);
 }
diff --git a/include/uapi/linux/rkisp1-config.h b/include/uapi/linux/rkisp1-config.h
index 2d995f3c1ca3..5ca4d5961c5b 100644
--- a/include/uapi/linux/rkisp1-config.h
+++ b/include/uapi/linux/rkisp1-config.h
@@ -1086,6 +1086,9 @@ enum rkisp1_ext_params_block_type {
 #define RKISP1_EXT_PARAMS_FL_BLOCK_DISABLE	(1U << 0)
 #define RKISP1_EXT_PARAMS_FL_BLOCK_ENABLE	(1U << 1)
 
+/* A bitmask of parameters blocks supported on the current hardware. */
+#define RKISP1_CID_SUPPORTED_PARAMS_BLOCKS	(V4L2_CID_USER_RKISP1_BASE + 0x01)
+
 /**
  * struct rkisp1_ext_params_block_header - RkISP1 extensible parameters block
  *					   header
@@ -1520,6 +1523,14 @@ enum rksip1_ext_param_buffer_version {
  * V4L2 control. If such control is not available, userspace should assume only
  * RKISP1_EXT_PARAM_BUFFER_V1 is supported by the driver.
  *
+ * The read-only V4L2 control ``RKISP1_CID_SUPPORTED_PARAMS_BLOCKS`` can be used
+ * to query the blocks supported by the device. It contains a bitmask where each
+ * bit represents the availability of the corresponding entry from the
+ * :c:type:`rkisp1_ext_params_block_type` enum. The current and default values
+ * of the control represent the blocks supported by the device instance, while
+ * the maximum value represents the blocks supported by the kernel driver,
+ * independently of the device instance.
+ *
  * For each ISP block that userspace wants to configure, a block-specific
  * structure is appended to the @data buffer, one after the other without gaps
  * in between nor overlaps. Userspace shall populate the @data_size field with
diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h
index 72e32814ea83..f836512e9deb 100644
--- a/include/uapi/linux/v4l2-controls.h
+++ b/include/uapi/linux/v4l2-controls.h
@@ -222,6 +222,12 @@ enum v4l2_colorfx {
  */
 #define V4L2_CID_USER_UVC_BASE			(V4L2_CID_USER_BASE + 0x11e0)
 
+/*
+ * The base for Rockchip ISP1 driver controls.
+ * We reserve 16 controls for this driver.
+ */
+#define V4L2_CID_USER_RKISP1_BASE		(V4L2_CID_USER_BASE + 0x1220)
+
 /* MPEG-class control IDs */
 /* The MPEG controls are applicable to all codec controls
  * and the 'MPEG' part of the define is historical */
-- 
cgit v1.2.3


From cd403e8aed6caad87967d2c135b57d92ba8e5544 Mon Sep 17 00:00:00 2001
From: Jai Luthra <jai.luthra@ideasonboard.com>
Date: Tue, 10 Jun 2025 17:55:27 +0530
Subject: media: rockchip: rkisp1: Add support for Wide Dynamic Range

RKISP supports a basic Wide Dynamic Range (WDR) module since the first
iteration (v1.0) of the ISP. Add support for enabling and configuring it
using extensible parameters.

Also, to ease programming, switch to using macro variables for defining
the tonemapping curve register addresses.

Reviewed-by: Stefan Klug <stefan.klug@ideasonboard.com>
Tested-by: Stefan Klug <stefan.klug@ideasonboard.com>
Reviewed-by: Paul Elder <paul.elder@ideasonboard.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Jai Luthra <jai.luthra@ideasonboard.com>
Link: https://lore.kernel.org/r/20250610-wdr-latest-v4-1-b69d0ac17ce9@ideasonboard.com
Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil@xs4all.nl>
---
 .../media/platform/rockchip/rkisp1/rkisp1-params.c | 93 ++++++++++++++++++++
 .../media/platform/rockchip/rkisp1/rkisp1-regs.h   | 99 ++++++----------------
 include/uapi/linux/rkisp1-config.h                 | 95 ++++++++++++++++++++-
 3 files changed, 212 insertions(+), 75 deletions(-)

(limited to 'include')

diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c b/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c
index 4db0ca8d86db..f1585f8fa0f4 100644
--- a/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c
+++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-params.c
@@ -5,6 +5,7 @@
  * Copyright (C) 2017 Rockchip Electronics Co., Ltd.
  */
 
+#include <linux/bitfield.h>
 #include <linux/math.h>
 #include <linux/string.h>
 
@@ -60,6 +61,7 @@ union rkisp1_ext_params_config {
 	struct rkisp1_ext_params_afc_config afc;
 	struct rkisp1_ext_params_compand_bls_config compand_bls;
 	struct rkisp1_ext_params_compand_curve_config compand_curve;
+	struct rkisp1_ext_params_wdr_config wdr;
 };
 
 enum rkisp1_params_formats {
@@ -1348,6 +1350,73 @@ rkisp1_compand_compress_config(struct rkisp1_params *params,
 				       arg->x);
 }
 
+static void rkisp1_wdr_config(struct rkisp1_params *params,
+			      const struct rkisp1_cif_isp_wdr_config *arg)
+{
+	unsigned int i;
+	u32 value;
+
+	value = rkisp1_read(params->rkisp1, RKISP1_CIF_ISP_WDR_CTRL)
+	      & ~(RKISP1_CIF_ISP_WDR_USE_IREF |
+		  RKISP1_CIF_ISP_WDR_COLOR_SPACE_SELECT |
+		  RKISP1_CIF_ISP_WDR_CR_MAPPING_DISABLE |
+		  RKISP1_CIF_ISP_WDR_USE_Y9_8 |
+		  RKISP1_CIF_ISP_WDR_USE_RGB7_8 |
+		  RKISP1_CIF_ISP_WDR_DISABLE_TRANSIENT |
+		  RKISP1_CIF_ISP_WDR_RGB_FACTOR_MASK);
+
+	/* Colorspace and chrominance mapping */
+	if (arg->use_rgb_colorspace)
+		value |= RKISP1_CIF_ISP_WDR_COLOR_SPACE_SELECT;
+
+	if (!arg->use_rgb_colorspace && arg->bypass_chroma_mapping)
+		value |= RKISP1_CIF_ISP_WDR_CR_MAPPING_DISABLE;
+
+	/* Illumination reference */
+	if (arg->use_iref) {
+		value |= RKISP1_CIF_ISP_WDR_USE_IREF;
+
+		if (arg->iref_config.use_y9_8)
+			value |= RKISP1_CIF_ISP_WDR_USE_Y9_8;
+
+		if (arg->iref_config.use_rgb7_8)
+			value |= RKISP1_CIF_ISP_WDR_USE_RGB7_8;
+
+		if (arg->iref_config.disable_transient)
+			value |= RKISP1_CIF_ISP_WDR_DISABLE_TRANSIENT;
+
+		value |= FIELD_PREP(RKISP1_CIF_ISP_WDR_RGB_FACTOR_MASK,
+				    min(arg->iref_config.rgb_factor,
+					RKISP1_CIF_ISP_WDR_RGB_FACTOR_MAX));
+	}
+
+	rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_WDR_CTRL, value);
+
+	/* RGB and Luminance offsets */
+	value = FIELD_PREP(RKISP1_CIF_ISP_WDR_RGB_OFFSET_MASK,
+			   arg->rgb_offset)
+	      | FIELD_PREP(RKISP1_CIF_ISP_WDR_LUM_OFFSET_MASK,
+			   arg->luma_offset);
+	rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_WDR_OFFSET, value);
+
+	/* DeltaMin */
+	value = FIELD_PREP(RKISP1_CIF_ISP_WDR_DMIN_THRESH_MASK,
+			   arg->dmin_thresh)
+	      | FIELD_PREP(RKISP1_CIF_ISP_WDR_DMIN_STRENGTH_MASK,
+			   min(arg->dmin_strength,
+			       RKISP1_CIF_ISP_WDR_DMIN_STRENGTH_MAX));
+	rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_WDR_DELTAMIN, value);
+
+	/* Tone curve */
+	for (i = 0; i < RKISP1_CIF_ISP_WDR_CURVE_NUM_DY_REGS; i++)
+		rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_WDR_TONECURVE(i),
+			     arg->tone_curve.dY[i]);
+	for (i = 0; i < RKISP1_CIF_ISP_WDR_CURVE_NUM_COEFF; i++)
+		rkisp1_write(params->rkisp1, RKISP1_CIF_ISP_WDR_TONECURVE_YM(i),
+			     arg->tone_curve.ym[i] &
+				     RKISP1_CIF_ISP_WDR_TONE_CURVE_YM_MASK);
+}
+
 static void
 rkisp1_isp_isr_other_config(struct rkisp1_params *params,
 			    const struct rkisp1_params_cfg *new_params)
@@ -2005,6 +2074,25 @@ static void rkisp1_ext_params_compand_compress(struct rkisp1_params *params,
 				      RKISP1_CIF_ISP_COMPAND_CTRL_COMPRESS_ENABLE);
 }
 
+static void rkisp1_ext_params_wdr(struct rkisp1_params *params,
+				  const union rkisp1_ext_params_config *block)
+{
+	const struct rkisp1_ext_params_wdr_config *wdr = &block->wdr;
+
+	if (wdr->header.flags & RKISP1_EXT_PARAMS_FL_BLOCK_DISABLE) {
+		rkisp1_param_clear_bits(params, RKISP1_CIF_ISP_WDR_CTRL,
+					RKISP1_CIF_ISP_WDR_CTRL_ENABLE);
+		return;
+	}
+
+	rkisp1_wdr_config(params, &wdr->config);
+
+	if ((wdr->header.flags & RKISP1_EXT_PARAMS_FL_BLOCK_ENABLE) &&
+	    !(params->enabled_blocks & BIT(wdr->header.type)))
+		rkisp1_param_set_bits(params, RKISP1_CIF_ISP_WDR_CTRL,
+				      RKISP1_CIF_ISP_WDR_CTRL_ENABLE);
+}
+
 typedef void (*rkisp1_block_handler)(struct rkisp1_params *params,
 			     const union rkisp1_ext_params_config *config);
 
@@ -2118,6 +2206,11 @@ static const struct rkisp1_ext_params_handler {
 		.group		= RKISP1_EXT_PARAMS_BLOCK_GROUP_OTHERS,
 		.features	= RKISP1_FEATURE_COMPAND,
 	},
+	[RKISP1_EXT_PARAMS_BLOCK_TYPE_WDR] = {
+		.size		= sizeof(struct rkisp1_ext_params_wdr_config),
+		.handler	= rkisp1_ext_params_wdr,
+		.group		= RKISP1_EXT_PARAMS_BLOCK_GROUP_OTHERS,
+	},
 };
 
 static void rkisp1_ext_params_config(struct rkisp1_params *params,
diff --git a/drivers/media/platform/rockchip/rkisp1/rkisp1-regs.h b/drivers/media/platform/rockchip/rkisp1/rkisp1-regs.h
index 139177db9c6d..fbeb186cde0d 100644
--- a/drivers/media/platform/rockchip/rkisp1/rkisp1-regs.h
+++ b/drivers/media/platform/rockchip/rkisp1/rkisp1-regs.h
@@ -703,6 +703,27 @@
 #define RKISP1_CIF_ISP_COMPAND_CTRL_SOFT_RESET_FLAG	BIT(2)
 #define RKISP1_CIF_ISP_COMPAND_CTRL_BLS_ENABLE		BIT(3)
 
+/* WDR */
+/* ISP_WDR_CTRL */
+#define RKISP1_CIF_ISP_WDR_CTRL_ENABLE			BIT(0)
+#define RKISP1_CIF_ISP_WDR_COLOR_SPACE_SELECT		BIT(1)
+#define RKISP1_CIF_ISP_WDR_CR_MAPPING_DISABLE		BIT(2)
+#define RKISP1_CIF_ISP_WDR_USE_IREF			BIT(3)
+#define RKISP1_CIF_ISP_WDR_USE_Y9_8			BIT(4)
+#define RKISP1_CIF_ISP_WDR_USE_RGB7_8			BIT(5)
+#define RKISP1_CIF_ISP_WDR_DISABLE_TRANSIENT		BIT(6)
+#define RKISP1_CIF_ISP_WDR_RGB_FACTOR_MASK		GENMASK(11, 8)
+#define RKISP1_CIF_ISP_WDR_RGB_FACTOR_MAX		8U
+/* ISP_WDR_TONE_CURVE_YM */
+#define RKISP1_CIF_ISP_WDR_TONE_CURVE_YM_MASK		GENMASK(12, 0)
+/* ISP_WDR_OFFSET */
+#define RKISP1_CIF_ISP_WDR_RGB_OFFSET_MASK		GENMASK(11, 0)
+#define RKISP1_CIF_ISP_WDR_LUM_OFFSET_MASK		GENMASK(27, 16)
+/* ISP_WDR_DELTAMIN */
+#define RKISP1_CIF_ISP_WDR_DMIN_THRESH_MASK		GENMASK(11, 0)
+#define RKISP1_CIF_ISP_WDR_DMIN_STRENGTH_MASK		GENMASK(20, 16)
+#define RKISP1_CIF_ISP_WDR_DMIN_STRENGTH_MAX		16U
+
 /* =================================================================== */
 /*                            CIF Registers                            */
 /* =================================================================== */
@@ -1295,82 +1316,12 @@
 
 #define RKISP1_CIF_ISP_WDR_BASE			0x00002a00
 #define RKISP1_CIF_ISP_WDR_CTRL			(RKISP1_CIF_ISP_WDR_BASE + 0x00000000)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_1		(RKISP1_CIF_ISP_WDR_BASE + 0x00000004)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_2		(RKISP1_CIF_ISP_WDR_BASE + 0x00000008)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_3		(RKISP1_CIF_ISP_WDR_BASE + 0x0000000c)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_4		(RKISP1_CIF_ISP_WDR_BASE + 0x00000010)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_0	(RKISP1_CIF_ISP_WDR_BASE + 0x00000014)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_1	(RKISP1_CIF_ISP_WDR_BASE + 0x00000018)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_2	(RKISP1_CIF_ISP_WDR_BASE + 0x0000001c)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_3	(RKISP1_CIF_ISP_WDR_BASE + 0x00000020)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_4	(RKISP1_CIF_ISP_WDR_BASE + 0x00000024)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_5	(RKISP1_CIF_ISP_WDR_BASE + 0x00000028)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_6	(RKISP1_CIF_ISP_WDR_BASE + 0x0000002c)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_7	(RKISP1_CIF_ISP_WDR_BASE + 0x00000030)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_8	(RKISP1_CIF_ISP_WDR_BASE + 0x00000034)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_9	(RKISP1_CIF_ISP_WDR_BASE + 0x00000038)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_10	(RKISP1_CIF_ISP_WDR_BASE + 0x0000003c)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_11	(RKISP1_CIF_ISP_WDR_BASE + 0x00000040)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_12	(RKISP1_CIF_ISP_WDR_BASE + 0x00000044)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_13	(RKISP1_CIF_ISP_WDR_BASE + 0x00000048)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_14	(RKISP1_CIF_ISP_WDR_BASE + 0x0000004c)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_15	(RKISP1_CIF_ISP_WDR_BASE + 0x00000050)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_16	(RKISP1_CIF_ISP_WDR_BASE + 0x00000054)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_17	(RKISP1_CIF_ISP_WDR_BASE + 0x00000058)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_18	(RKISP1_CIF_ISP_WDR_BASE + 0x0000005c)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_19	(RKISP1_CIF_ISP_WDR_BASE + 0x00000060)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_20	(RKISP1_CIF_ISP_WDR_BASE + 0x00000064)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_21	(RKISP1_CIF_ISP_WDR_BASE + 0x00000068)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_22	(RKISP1_CIF_ISP_WDR_BASE + 0x0000006c)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_23	(RKISP1_CIF_ISP_WDR_BASE + 0x00000070)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_24	(RKISP1_CIF_ISP_WDR_BASE + 0x00000074)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_25	(RKISP1_CIF_ISP_WDR_BASE + 0x00000078)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_26	(RKISP1_CIF_ISP_WDR_BASE + 0x0000007c)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_27	(RKISP1_CIF_ISP_WDR_BASE + 0x00000080)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_28	(RKISP1_CIF_ISP_WDR_BASE + 0x00000084)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_29	(RKISP1_CIF_ISP_WDR_BASE + 0x00000088)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_30	(RKISP1_CIF_ISP_WDR_BASE + 0x0000008c)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_31	(RKISP1_CIF_ISP_WDR_BASE + 0x00000090)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_32	(RKISP1_CIF_ISP_WDR_BASE + 0x00000094)
+#define RKISP1_CIF_ISP_WDR_TONECURVE(n)		(RKISP1_CIF_ISP_WDR_BASE + 0x00000004 + (n) * 4)
+#define RKISP1_CIF_ISP_WDR_TONECURVE_YM(n)	(RKISP1_CIF_ISP_WDR_BASE + 0x00000014 + (n) * 4)
 #define RKISP1_CIF_ISP_WDR_OFFSET		(RKISP1_CIF_ISP_WDR_BASE + 0x00000098)
 #define RKISP1_CIF_ISP_WDR_DELTAMIN		(RKISP1_CIF_ISP_WDR_BASE + 0x0000009c)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_1_SHD	(RKISP1_CIF_ISP_WDR_BASE + 0x000000a0)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_2_SHD	(RKISP1_CIF_ISP_WDR_BASE + 0x000000a4)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_3_SHD	(RKISP1_CIF_ISP_WDR_BASE + 0x000000a8)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_4_SHD	(RKISP1_CIF_ISP_WDR_BASE + 0x000000ac)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_0_SHD	(RKISP1_CIF_ISP_WDR_BASE + 0x000000b0)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_1_SHD	(RKISP1_CIF_ISP_WDR_BASE + 0x000000b4)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_2_SHD	(RKISP1_CIF_ISP_WDR_BASE + 0x000000b8)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_3_SHD	(RKISP1_CIF_ISP_WDR_BASE + 0x000000bc)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_4_SHD	(RKISP1_CIF_ISP_WDR_BASE + 0x000000c0)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_5_SHD	(RKISP1_CIF_ISP_WDR_BASE + 0x000000c4)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_6_SHD	(RKISP1_CIF_ISP_WDR_BASE + 0x000000c8)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_7_SHD	(RKISP1_CIF_ISP_WDR_BASE + 0x000000cc)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_8_SHD	(RKISP1_CIF_ISP_WDR_BASE + 0x000000d0)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_9_SHD	(RKISP1_CIF_ISP_WDR_BASE + 0x000000d4)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_10_SHD	(RKISP1_CIF_ISP_WDR_BASE + 0x000000d8)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_11_SHD	(RKISP1_CIF_ISP_WDR_BASE + 0x000000dc)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_12_SHD	(RKISP1_CIF_ISP_WDR_BASE + 0x000000e0)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_13_SHD	(RKISP1_CIF_ISP_WDR_BASE + 0x000000e4)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_14_SHD	(RKISP1_CIF_ISP_WDR_BASE + 0x000000e8)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_15_SHD	(RKISP1_CIF_ISP_WDR_BASE + 0x000000ec)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_16_SHD	(RKISP1_CIF_ISP_WDR_BASE + 0x000000f0)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_17_SHD	(RKISP1_CIF_ISP_WDR_BASE + 0x000000f4)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_18_SHD	(RKISP1_CIF_ISP_WDR_BASE + 0x000000f8)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_19_SHD	(RKISP1_CIF_ISP_WDR_BASE + 0x000000fc)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_20_SHD	(RKISP1_CIF_ISP_WDR_BASE + 0x00000100)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_21_SHD	(RKISP1_CIF_ISP_WDR_BASE + 0x00000104)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_22_SHD	(RKISP1_CIF_ISP_WDR_BASE + 0x00000108)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_23_SHD	(RKISP1_CIF_ISP_WDR_BASE + 0x0000010c)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_24_SHD	(RKISP1_CIF_ISP_WDR_BASE + 0x00000110)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_25_SHD	(RKISP1_CIF_ISP_WDR_BASE + 0x00000114)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_26_SHD	(RKISP1_CIF_ISP_WDR_BASE + 0x00000118)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_27_SHD	(RKISP1_CIF_ISP_WDR_BASE + 0x0000011c)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_28_SHD	(RKISP1_CIF_ISP_WDR_BASE + 0x00000120)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_29_SHD	(RKISP1_CIF_ISP_WDR_BASE + 0x00000124)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_30_SHD	(RKISP1_CIF_ISP_WDR_BASE + 0x00000128)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_31_SHD	(RKISP1_CIF_ISP_WDR_BASE + 0x0000012c)
-#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_32_SHD	(RKISP1_CIF_ISP_WDR_BASE + 0x00000130)
+#define RKISP1_CIF_ISP_WDR_TONECURVE_SHD(n)	(RKISP1_CIF_ISP_WDR_BASE + 0x000000a0 + (n) * 4)
+#define RKISP1_CIF_ISP_WDR_TONECURVE_YM_SHD(n)	(RKISP1_CIF_ISP_WDR_BASE + 0x000000b0 + (n) * 4)
 
 #define RKISP1_CIF_ISP_HIST_BASE_V12		0x00002c00
 #define RKISP1_CIF_ISP_HIST_CTRL_V12		(RKISP1_CIF_ISP_HIST_BASE_V12 + 0x00000000)
diff --git a/include/uapi/linux/rkisp1-config.h b/include/uapi/linux/rkisp1-config.h
index 5ca4d5961c5b..3b060ea6eed7 100644
--- a/include/uapi/linux/rkisp1-config.h
+++ b/include/uapi/linux/rkisp1-config.h
@@ -169,6 +169,13 @@
  */
 #define RKISP1_CIF_ISP_COMPAND_NUM_POINTS	64
 
+/*
+ * Wide Dynamic Range
+ */
+#define RKISP1_CIF_ISP_WDR_CURVE_NUM_INTERV	32
+#define RKISP1_CIF_ISP_WDR_CURVE_NUM_COEFF	(RKISP1_CIF_ISP_WDR_CURVE_NUM_INTERV + 1)
+#define RKISP1_CIF_ISP_WDR_CURVE_NUM_DY_REGS	4
+
 /*
  * Measurement types
  */
@@ -889,6 +896,72 @@ struct rkisp1_cif_isp_compand_curve_config {
 	__u32 y[RKISP1_CIF_ISP_COMPAND_NUM_POINTS];
 };
 
+/**
+ * struct rkisp1_cif_isp_wdr_tone_curve - Tone mapping curve definition for WDR.
+ *
+ * @dY: the dYn increments for horizontal (input) axis of the tone curve.
+ *      each 3-bit dY value represents an increment of 2**(value+3).
+ *      dY[0] bits 0:2 is increment dY1, bit 3 unused
+ *      dY[0] bits 4:6 is increment dY2, bit 7 unused
+ *      ...
+ *      dY[0] bits 28:30 is increment dY8, bit 31 unused
+ *      ... and so on till dY[3] bits 28:30 is increment dY32, bit 31 unused.
+ * @ym: the Ym values for the vertical (output) axis of the tone curve.
+ *      each value is 13 bit.
+ */
+struct rkisp1_cif_isp_wdr_tone_curve {
+	__u32 dY[RKISP1_CIF_ISP_WDR_CURVE_NUM_DY_REGS];
+	__u16 ym[RKISP1_CIF_ISP_WDR_CURVE_NUM_COEFF];
+};
+
+/**
+ * struct rkisp1_cif_isp_wdr_iref_config - Illumination reference config for WDR.
+ *
+ * Use illumination reference value as described below, instead of only the
+ * luminance (Y) value for tone mapping and gain calculations:
+ * IRef = (rgb_factor * RGBMax_tr + (8 - rgb_factor) * Y)/8
+ *
+ * @rgb_factor: defines how much influence the RGBmax approach has in
+ *              comparison to Y (valid values are 0..8).
+ * @use_y9_8: use Y*9/8 for maximum value calculation along with the
+ *            default of R, G, B for noise reduction.
+ * @use_rgb7_8: decrease RGBMax by 7/8 for noise reduction.
+ * @disable_transient: disable transient calculation between Y and RGBY_max.
+ */
+struct rkisp1_cif_isp_wdr_iref_config {
+	__u8 rgb_factor;
+	__u8 use_y9_8;
+	__u8 use_rgb7_8;
+	__u8 disable_transient;
+};
+
+/**
+ * struct rkisp1_cif_isp_wdr_config - Configuration for wide dynamic range.
+ *
+ * @tone_curve: tone mapping curve.
+ * @iref_config: illumination reference configuration. (when use_iref is true)
+ * @rgb_offset: RGB offset value for RGB operation mode. (12 bits)
+ * @luma_offset: luminance offset value for RGB operation mode. (12 bits)
+ * @dmin_thresh: lower threshold for deltaMin value. (12 bits)
+ * @dmin_strength: strength factor for deltaMin. (valid range is 0x00..0x10)
+ * @use_rgb_colorspace: use RGB instead of luminance/chrominance colorspace.
+ * @bypass_chroma_mapping: disable chrominance mapping (only valid if
+ *                         use_rgb_colorspace = 0)
+ * @use_iref: use illumination reference instead of Y for tone mapping
+ *            and gain calculations.
+ */
+struct rkisp1_cif_isp_wdr_config {
+	struct rkisp1_cif_isp_wdr_tone_curve tone_curve;
+	struct rkisp1_cif_isp_wdr_iref_config iref_config;
+	__u16 rgb_offset;
+	__u16 luma_offset;
+	__u16 dmin_thresh;
+	__u8 dmin_strength;
+	__u8 use_rgb_colorspace;
+	__u8 bypass_chroma_mapping;
+	__u8 use_iref;
+};
+
 /*---------- PART2: Measurement Statistics ------------*/
 
 /**
@@ -1059,6 +1132,7 @@ struct rkisp1_stat_buffer {
  * @RKISP1_EXT_PARAMS_BLOCK_TYPE_COMPAND_BLS: BLS in the compand block
  * @RKISP1_EXT_PARAMS_BLOCK_TYPE_COMPAND_EXPAND: Companding expand curve
  * @RKISP1_EXT_PARAMS_BLOCK_TYPE_COMPAND_COMPRESS: Companding compress curve
+ * @RKISP1_EXT_PARAMS_BLOCK_TYPE_WDR: Wide dynamic range
  */
 enum rkisp1_ext_params_block_type {
 	RKISP1_EXT_PARAMS_BLOCK_TYPE_BLS,
@@ -1081,6 +1155,7 @@ enum rkisp1_ext_params_block_type {
 	RKISP1_EXT_PARAMS_BLOCK_TYPE_COMPAND_BLS,
 	RKISP1_EXT_PARAMS_BLOCK_TYPE_COMPAND_EXPAND,
 	RKISP1_EXT_PARAMS_BLOCK_TYPE_COMPAND_COMPRESS,
+	RKISP1_EXT_PARAMS_BLOCK_TYPE_WDR,
 };
 
 #define RKISP1_EXT_PARAMS_FL_BLOCK_DISABLE	(1U << 0)
@@ -1463,6 +1538,23 @@ struct rkisp1_ext_params_compand_curve_config {
 	struct rkisp1_cif_isp_compand_curve_config config;
 } __attribute__((aligned(8)));
 
+/**
+ * struct rkisp1_ext_params_wdr_config - RkISP1 extensible params
+ *                                       Wide dynamic range config
+ *
+ * RkISP1 extensible parameters WDR block.
+ * Identified by :c:type:`RKISP1_EXT_PARAMS_BLOCK_TYPE_WDR`
+ *
+ * @header: The RkISP1 extensible parameters header, see
+ *	    :c:type:`rkisp1_ext_params_block_header`
+ * @config: WDR configuration, see
+ *	    :c:type:`rkisp1_cif_isp_wdr_config`
+ */
+struct rkisp1_ext_params_wdr_config {
+	struct rkisp1_ext_params_block_header header;
+	struct rkisp1_cif_isp_wdr_config config;
+} __attribute__((aligned(8)));
+
 /*
  * The rkisp1_ext_params_compand_curve_config structure is counted twice as it
  * is used for both the COMPAND_EXPAND and COMPAND_COMPRESS block types.
@@ -1487,7 +1579,8 @@ struct rkisp1_ext_params_compand_curve_config {
 	sizeof(struct rkisp1_ext_params_afc_config)			+\
 	sizeof(struct rkisp1_ext_params_compand_bls_config)		+\
 	sizeof(struct rkisp1_ext_params_compand_curve_config)		+\
-	sizeof(struct rkisp1_ext_params_compand_curve_config))
+	sizeof(struct rkisp1_ext_params_compand_curve_config)		+\
+	sizeof(struct rkisp1_ext_params_wdr_config))
 
 /**
  * enum rksip1_ext_param_buffer_version - RkISP1 extensible parameters version
-- 
cgit v1.2.3


From 635e118317ffa773f6d25ec6a71b7927d7e8886a Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Wed, 18 Jun 2025 10:54:44 +0200
Subject: Revert "mtd: core: always create master device"

The idea behind this patch was to always keep a "master" mtd device
available to anchor runtime PM. Historically, there was no mtd device
representing the whole storage as soon as partitions came into play.
The introduction of CONFIG_MTD_PARTITIONED_MASTER made it possible to
keep this "master" device, but the option was not enabled by default to
avoid breaking existing users (otherwise the mtd device numbering would
be totally messed up, with an off-by-one at the very least).

The approach of adding an mtd_master class on top of partitioned mtd
devices breaks the mtd core in many creative ways, so it is better to
rethink this approach and revert the faulty changes for now.

This reverts commit 0aa7b390fc40a871267a2328bbbefca8b37ad307.

Fixes: 0aa7b390fc40 ("mtd: core: always create master device")
Tested-by: Guenter Roeck <linux@roeck-us.net>
Acked-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/mtd/mtdchar.c          |   2 +-
 drivers/mtd/mtdcore.c          | 152 +++++++++++------------------------------
 drivers/mtd/mtdcore.h          |   2 +-
 drivers/mtd/mtdpart.c          |  16 ++---
 include/linux/mtd/partitions.h |   2 +-
 5 files changed, 51 insertions(+), 123 deletions(-)

(limited to 'include')

diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c
index 391d81ad960c..8dc4f5c493fc 100644
--- a/drivers/mtd/mtdchar.c
+++ b/drivers/mtd/mtdchar.c
@@ -559,7 +559,7 @@ static int mtdchar_blkpg_ioctl(struct mtd_info *mtd,
 		/* Sanitize user input */
 		p.devname[BLKPG_DEVNAMELTH - 1] = '\0';
 
-		return mtd_add_partition(mtd, p.devname, p.start, p.length, NULL);
+		return mtd_add_partition(mtd, p.devname, p.start, p.length);
 
 	case BLKPG_DEL_PARTITION:
 
diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index 429d8c16baf0..5ba9a741f5ac 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -68,13 +68,7 @@ static struct class mtd_class = {
 	.pm = MTD_CLS_PM_OPS,
 };
 
-static struct class mtd_master_class = {
-	.name = "mtd_master",
-	.pm = MTD_CLS_PM_OPS,
-};
-
 static DEFINE_IDR(mtd_idr);
-static DEFINE_IDR(mtd_master_idr);
 
 /* These are exported solely for the purpose of mtd_blkdevs.c. You
    should not use them for _anything_ else */
@@ -89,9 +83,8 @@ EXPORT_SYMBOL_GPL(__mtd_next_device);
 
 static LIST_HEAD(mtd_notifiers);
 
-#define MTD_MASTER_DEVS 255
+
 #define MTD_DEVT(index) MKDEV(MTD_CHAR_MAJOR, (index)*2)
-static dev_t mtd_master_devt;
 
 /* REVISIT once MTD uses the driver model better, whoever allocates
  * the mtd_info will probably want to use the release() hook...
@@ -111,17 +104,6 @@ static void mtd_release(struct device *dev)
 	device_destroy(&mtd_class, index + 1);
 }
 
-static void mtd_master_release(struct device *dev)
-{
-	struct mtd_info *mtd = dev_get_drvdata(dev);
-
-	idr_remove(&mtd_master_idr, mtd->index);
-	of_node_put(mtd_get_of_node(mtd));
-
-	if (mtd_is_partition(mtd))
-		release_mtd_partition(mtd);
-}
-
 static void mtd_device_release(struct kref *kref)
 {
 	struct mtd_info *mtd = container_of(kref, struct mtd_info, refcnt);
@@ -385,11 +367,6 @@ static const struct device_type mtd_devtype = {
 	.release	= mtd_release,
 };
 
-static const struct device_type mtd_master_devtype = {
-	.name		= "mtd_master",
-	.release	= mtd_master_release,
-};
-
 static bool mtd_expert_analysis_mode;
 
 #ifdef CONFIG_DEBUG_FS
@@ -657,13 +634,13 @@ exit_parent:
 /**
  *	add_mtd_device - register an MTD device
  *	@mtd: pointer to new MTD device info structure
- *	@partitioned: create partitioned device
  *
  *	Add a device to the list of MTD devices present in the system, and
  *	notify each currently active MTD 'user' of its arrival. Returns
  *	zero on success or non-zero on failure.
  */
-int add_mtd_device(struct mtd_info *mtd, bool partitioned)
+
+int add_mtd_device(struct mtd_info *mtd)
 {
 	struct device_node *np = mtd_get_of_node(mtd);
 	struct mtd_info *master = mtd_get_master(mtd);
@@ -710,17 +687,10 @@ int add_mtd_device(struct mtd_info *mtd, bool partitioned)
 	ofidx = -1;
 	if (np)
 		ofidx = of_alias_get_id(np, "mtd");
-	if (partitioned) {
-		if (ofidx >= 0)
-			i = idr_alloc(&mtd_idr, mtd, ofidx, ofidx + 1, GFP_KERNEL);
-		else
-			i = idr_alloc(&mtd_idr, mtd, 0, 0, GFP_KERNEL);
-	} else {
-		if (ofidx >= 0)
-			i = idr_alloc(&mtd_master_idr, mtd, ofidx, ofidx + 1, GFP_KERNEL);
-		else
-			i = idr_alloc(&mtd_master_idr, mtd, 0, 0, GFP_KERNEL);
-	}
+	if (ofidx >= 0)
+		i = idr_alloc(&mtd_idr, mtd, ofidx, ofidx + 1, GFP_KERNEL);
+	else
+		i = idr_alloc(&mtd_idr, mtd, 0, 0, GFP_KERNEL);
 	if (i < 0) {
 		error = i;
 		goto fail_locked;
@@ -768,18 +738,10 @@ int add_mtd_device(struct mtd_info *mtd, bool partitioned)
 	/* Caller should have set dev.parent to match the
 	 * physical device, if appropriate.
 	 */
-	if (partitioned) {
-		mtd->dev.type = &mtd_devtype;
-		mtd->dev.class = &mtd_class;
-		mtd->dev.devt = MTD_DEVT(i);
-		dev_set_name(&mtd->dev, "mtd%d", i);
-		error = dev_set_name(&mtd->dev, "mtd%d", i);
-	} else {
-		mtd->dev.type = &mtd_master_devtype;
-		mtd->dev.class = &mtd_master_class;
-		mtd->dev.devt = MKDEV(MAJOR(mtd_master_devt), i);
-		error = dev_set_name(&mtd->dev, "mtd_master%d", i);
-	}
+	mtd->dev.type = &mtd_devtype;
+	mtd->dev.class = &mtd_class;
+	mtd->dev.devt = MTD_DEVT(i);
+	error = dev_set_name(&mtd->dev, "mtd%d", i);
 	if (error)
 		goto fail_devname;
 	dev_set_drvdata(&mtd->dev, mtd);
@@ -787,7 +749,6 @@ int add_mtd_device(struct mtd_info *mtd, bool partitioned)
 	of_node_get(mtd_get_of_node(mtd));
 	error = device_register(&mtd->dev);
 	if (error) {
-		pr_err("mtd: %s device_register fail %d\n", mtd->name, error);
 		put_device(&mtd->dev);
 		goto fail_added;
 	}
@@ -799,13 +760,10 @@ int add_mtd_device(struct mtd_info *mtd, bool partitioned)
 
 	mtd_debugfs_populate(mtd);
 
-	if (partitioned) {
-		device_create(&mtd_class, mtd->dev.parent, MTD_DEVT(i) + 1, NULL,
-			      "mtd%dro", i);
-	}
+	device_create(&mtd_class, mtd->dev.parent, MTD_DEVT(i) + 1, NULL,
+		      "mtd%dro", i);
 
-	pr_debug("mtd: Giving out %spartitioned device %d to %s\n",
-		 partitioned ? "" : "un-", i, mtd->name);
+	pr_debug("mtd: Giving out device %d to %s\n", i, mtd->name);
 	/* No need to get a refcount on the module containing
 	   the notifier, since we hold the mtd_table_mutex */
 	list_for_each_entry(not, &mtd_notifiers, list)
@@ -813,16 +771,13 @@ int add_mtd_device(struct mtd_info *mtd, bool partitioned)
 
 	mutex_unlock(&mtd_table_mutex);
 
-	if (partitioned) {
-		if (of_property_read_bool(mtd_get_of_node(mtd), "linux,rootfs")) {
-			if (IS_BUILTIN(CONFIG_MTD)) {
-				pr_info("mtd: setting mtd%d (%s) as root device\n",
-					mtd->index, mtd->name);
-				ROOT_DEV = MKDEV(MTD_BLOCK_MAJOR, mtd->index);
-			} else {
-				pr_warn("mtd: can't set mtd%d (%s) as root device - mtd must be builtin\n",
-					mtd->index, mtd->name);
-			}
+	if (of_property_read_bool(mtd_get_of_node(mtd), "linux,rootfs")) {
+		if (IS_BUILTIN(CONFIG_MTD)) {
+			pr_info("mtd: setting mtd%d (%s) as root device\n", mtd->index, mtd->name);
+			ROOT_DEV = MKDEV(MTD_BLOCK_MAJOR, mtd->index);
+		} else {
+			pr_warn("mtd: can't set mtd%d (%s) as root device - mtd must be builtin\n",
+				mtd->index, mtd->name);
 		}
 	}
 
@@ -838,10 +793,7 @@ fail_nvmem_add:
 fail_added:
 	of_node_put(mtd_get_of_node(mtd));
 fail_devname:
-	if (partitioned)
-		idr_remove(&mtd_idr, i);
-	else
-		idr_remove(&mtd_master_idr, i);
+	idr_remove(&mtd_idr, i);
 fail_locked:
 	mutex_unlock(&mtd_table_mutex);
 	return error;
@@ -859,14 +811,12 @@ fail_locked:
 
 int del_mtd_device(struct mtd_info *mtd)
 {
-	struct mtd_notifier *not;
-	struct idr *idr;
 	int ret;
+	struct mtd_notifier *not;
 
 	mutex_lock(&mtd_table_mutex);
 
-	idr = mtd->dev.class == &mtd_class ? &mtd_idr : &mtd_master_idr;
-	if (idr_find(idr, mtd->index) != mtd) {
+	if (idr_find(&mtd_idr, mtd->index) != mtd) {
 		ret = -ENODEV;
 		goto out_error;
 	}
@@ -1106,7 +1056,6 @@ int mtd_device_parse_register(struct mtd_info *mtd, const char * const *types,
 			      const struct mtd_partition *parts,
 			      int nr_parts)
 {
-	struct mtd_info *parent;
 	int ret, err;
 
 	mtd_set_dev_defaults(mtd);
@@ -1115,30 +1064,25 @@ int mtd_device_parse_register(struct mtd_info *mtd, const char * const *types,
 	if (ret)
 		goto out;
 
-	ret = add_mtd_device(mtd, false);
-	if (ret)
-		goto out;
-
 	if (IS_ENABLED(CONFIG_MTD_PARTITIONED_MASTER)) {
-		ret = mtd_add_partition(mtd, mtd->name, 0, MTDPART_SIZ_FULL, &parent);
+		ret = add_mtd_device(mtd);
 		if (ret)
 			goto out;
-
-	} else {
-		parent = mtd;
 	}
 
 	/* Prefer parsed partitions over driver-provided fallback */
-	ret = parse_mtd_partitions(parent, types, parser_data);
+	ret = parse_mtd_partitions(mtd, types, parser_data);
 	if (ret == -EPROBE_DEFER)
 		goto out;
 
 	if (ret > 0)
 		ret = 0;
 	else if (nr_parts)
-		ret = add_mtd_partitions(parent, parts, nr_parts);
-	else if (!IS_ENABLED(CONFIG_MTD_PARTITIONED_MASTER))
-		ret = mtd_add_partition(parent, mtd->name, 0, MTDPART_SIZ_FULL, NULL);
+		ret = add_mtd_partitions(mtd, parts, nr_parts);
+	else if (!device_is_registered(&mtd->dev))
+		ret = add_mtd_device(mtd);
+	else
+		ret = 0;
 
 	if (ret)
 		goto out;
@@ -1158,14 +1102,13 @@ int mtd_device_parse_register(struct mtd_info *mtd, const char * const *types,
 		register_reboot_notifier(&mtd->reboot_notifier);
 	}
 
-	return 0;
 out:
-	nvmem_unregister(mtd->otp_user_nvmem);
-	nvmem_unregister(mtd->otp_factory_nvmem);
-
-	del_mtd_partitions(mtd);
+	if (ret) {
+		nvmem_unregister(mtd->otp_user_nvmem);
+		nvmem_unregister(mtd->otp_factory_nvmem);
+	}
 
-	if (device_is_registered(&mtd->dev)) {
+	if (ret && device_is_registered(&mtd->dev)) {
 		err = del_mtd_device(mtd);
 		if (err)
 			pr_err("Error when deleting MTD device (%d)\n", err);
@@ -1324,7 +1267,8 @@ int __get_mtd_device(struct mtd_info *mtd)
 		mtd = mtd->parent;
 	}
 
-	kref_get(&master->refcnt);
+	if (IS_ENABLED(CONFIG_MTD_PARTITIONED_MASTER))
+		kref_get(&master->refcnt);
 
 	return 0;
 }
@@ -1418,7 +1362,8 @@ void __put_mtd_device(struct mtd_info *mtd)
 		mtd = parent;
 	}
 
-	kref_put(&master->refcnt, mtd_device_release);
+	if (IS_ENABLED(CONFIG_MTD_PARTITIONED_MASTER))
+		kref_put(&master->refcnt, mtd_device_release);
 
 	module_put(master->owner);
 
@@ -2585,16 +2530,6 @@ static int __init init_mtd(void)
 	if (ret)
 		goto err_reg;
 
-	ret = class_register(&mtd_master_class);
-	if (ret)
-		goto err_reg2;
-
-	ret = alloc_chrdev_region(&mtd_master_devt, 0, MTD_MASTER_DEVS, "mtd_master");
-	if (ret < 0) {
-		pr_err("unable to allocate char dev region\n");
-		goto err_chrdev;
-	}
-
 	mtd_bdi = mtd_bdi_init("mtd");
 	if (IS_ERR(mtd_bdi)) {
 		ret = PTR_ERR(mtd_bdi);
@@ -2619,10 +2554,6 @@ out_procfs:
 	bdi_unregister(mtd_bdi);
 	bdi_put(mtd_bdi);
 err_bdi:
-	unregister_chrdev_region(mtd_master_devt, MTD_MASTER_DEVS);
-err_chrdev:
-	class_unregister(&mtd_master_class);
-err_reg2:
 	class_unregister(&mtd_class);
 err_reg:
 	pr_err("Error registering mtd class or bdi: %d\n", ret);
@@ -2636,12 +2567,9 @@ static void __exit cleanup_mtd(void)
 	if (proc_mtd)
 		remove_proc_entry("mtd", NULL);
 	class_unregister(&mtd_class);
-	class_unregister(&mtd_master_class);
-	unregister_chrdev_region(mtd_master_devt, MTD_MASTER_DEVS);
 	bdi_unregister(mtd_bdi);
 	bdi_put(mtd_bdi);
 	idr_destroy(&mtd_idr);
-	idr_destroy(&mtd_master_idr);
 }
 
 module_init(init_mtd);
diff --git a/drivers/mtd/mtdcore.h b/drivers/mtd/mtdcore.h
index 2258d31c5aa6..b014861a06a6 100644
--- a/drivers/mtd/mtdcore.h
+++ b/drivers/mtd/mtdcore.h
@@ -8,7 +8,7 @@ extern struct mutex mtd_table_mutex;
 extern struct backing_dev_info *mtd_bdi;
 
 struct mtd_info *__mtd_next_device(int i);
-int __must_check add_mtd_device(struct mtd_info *mtd, bool partitioned);
+int __must_check add_mtd_device(struct mtd_info *mtd);
 int del_mtd_device(struct mtd_info *mtd);
 int add_mtd_partitions(struct mtd_info *, const struct mtd_partition *, int);
 int del_mtd_partitions(struct mtd_info *);
diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c
index 5a3db36d734e..994e8c51e674 100644
--- a/drivers/mtd/mtdpart.c
+++ b/drivers/mtd/mtdpart.c
@@ -86,7 +86,8 @@ static struct mtd_info *allocate_partition(struct mtd_info *parent,
 	 * parent conditional on that option. Note, this is a way to
 	 * distinguish between the parent and its partitions in sysfs.
 	 */
-	child->dev.parent = &parent->dev;
+	child->dev.parent = IS_ENABLED(CONFIG_MTD_PARTITIONED_MASTER) || mtd_is_partition(parent) ?
+			    &parent->dev : parent->dev.parent;
 	child->dev.of_node = part->of_node;
 	child->parent = parent;
 	child->part.offset = part->offset;
@@ -242,7 +243,7 @@ static int mtd_add_partition_attrs(struct mtd_info *new)
 }
 
 int mtd_add_partition(struct mtd_info *parent, const char *name,
-		      long long offset, long long length, struct mtd_info **out)
+		      long long offset, long long length)
 {
 	struct mtd_info *master = mtd_get_master(parent);
 	u64 parent_size = mtd_is_partition(parent) ?
@@ -275,15 +276,12 @@ int mtd_add_partition(struct mtd_info *parent, const char *name,
 	list_add_tail(&child->part.node, &parent->partitions);
 	mutex_unlock(&master->master.partitions_lock);
 
-	ret = add_mtd_device(child, true);
+	ret = add_mtd_device(child);
 	if (ret)
 		goto err_remove_part;
 
 	mtd_add_partition_attrs(child);
 
-	if (out)
-		*out = child;
-
 	return 0;
 
 err_remove_part:
@@ -415,7 +413,7 @@ int add_mtd_partitions(struct mtd_info *parent,
 		list_add_tail(&child->part.node, &parent->partitions);
 		mutex_unlock(&master->master.partitions_lock);
 
-		ret = add_mtd_device(child, true);
+		ret = add_mtd_device(child);
 		if (ret) {
 			mutex_lock(&master->master.partitions_lock);
 			list_del(&child->part.node);
@@ -592,6 +590,9 @@ static int mtd_part_of_parse(struct mtd_info *master,
 	int ret, err = 0;
 
 	dev = &master->dev;
+	/* Use parent device (controller) if the top level MTD is not registered */
+	if (!IS_ENABLED(CONFIG_MTD_PARTITIONED_MASTER) && !mtd_is_partition(master))
+		dev = master->dev.parent;
 
 	np = mtd_get_of_node(master);
 	if (mtd_is_partition(master))
@@ -710,7 +711,6 @@ int parse_mtd_partitions(struct mtd_info *master, const char *const *types,
 		if (ret < 0 && !err)
 			err = ret;
 	}
-
 	return err;
 }
 
diff --git a/include/linux/mtd/partitions.h b/include/linux/mtd/partitions.h
index 5daf80df9e89..b74a539ec581 100644
--- a/include/linux/mtd/partitions.h
+++ b/include/linux/mtd/partitions.h
@@ -108,7 +108,7 @@ extern void deregister_mtd_parser(struct mtd_part_parser *parser);
 		      deregister_mtd_parser)
 
 int mtd_add_partition(struct mtd_info *master, const char *name,
-		      long long offset, long long length, struct mtd_info **part);
+		      long long offset, long long length);
 int mtd_del_partition(struct mtd_info *master, int partno);
 uint64_t mtd_get_device_size(const struct mtd_info *mtd);
 
-- 
cgit v1.2.3


From dba90f5a79c13936de4273a19e67908a0c296afe Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Wed, 18 Jun 2025 10:48:00 +0200
Subject: mtd: spinand: winbond: Prevent unsupported frequencies on dual/quad
 I/O variants

Dual and quad capable chips natively support dual and quad I/O variants
at up to 104MHz (1-2-2 and 1-4-4 operations). Reaching the maximum speed
of 166MHz is theoretically possible (while still unsupported in the
field) by adding a few more dummy cycles. Let's be accurate and clearly
state this limit.

Setting a maximum frequency implies adding the frequency parameter to
the macro, which is done using a variadic argument to avoid impacting
all the other drivers which already make use of this macro.

Fixes: 1ea808b4d15b ("mtd: spinand: winbond: Update the *JW chip definitions")
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/mtd/nand/spi/winbond.c |  4 ++--
 include/linux/mtd/spinand.h    | 10 ++++++----
 2 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/mtd/nand/spi/winbond.c b/drivers/mtd/nand/spi/winbond.c
index 0756966b4e3c..b7a28f001a38 100644
--- a/drivers/mtd/nand/spi/winbond.c
+++ b/drivers/mtd/nand/spi/winbond.c
@@ -42,11 +42,11 @@ static SPINAND_OP_VARIANTS(update_cache_octal_variants,
 static SPINAND_OP_VARIANTS(read_cache_dual_quad_dtr_variants,
 		SPINAND_PAGE_READ_FROM_CACHE_1S_4D_4D_OP(0, 8, NULL, 0, 80 * HZ_PER_MHZ),
 		SPINAND_PAGE_READ_FROM_CACHE_1S_1D_4D_OP(0, 2, NULL, 0, 80 * HZ_PER_MHZ),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(0, 2, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(0, 2, NULL, 0, 104 * HZ_PER_MHZ),
 		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0),
 		SPINAND_PAGE_READ_FROM_CACHE_1S_2D_2D_OP(0, 4, NULL, 0, 80 * HZ_PER_MHZ),
 		SPINAND_PAGE_READ_FROM_CACHE_1S_1D_2D_OP(0, 2, NULL, 0, 80 * HZ_PER_MHZ),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(0, 1, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(0, 1, NULL, 0, 104 * HZ_PER_MHZ),
 		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(0, 1, NULL, 0),
 		SPINAND_PAGE_READ_FROM_CACHE_1S_1D_1D_OP(0, 2, NULL, 0, 80 * HZ_PER_MHZ),
 		SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0),
diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h
index 811a0f356315..15eaa09da998 100644
--- a/include/linux/mtd/spinand.h
+++ b/include/linux/mtd/spinand.h
@@ -113,11 +113,12 @@
 		   SPI_MEM_DTR_OP_DATA_IN(len, buf, 2),			\
 		   SPI_MEM_OP_MAX_FREQ(freq))
 
-#define SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(addr, ndummy, buf, len) \
+#define SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(addr, ndummy, buf, len, ...) \
 	SPI_MEM_OP(SPI_MEM_OP_CMD(0xbb, 1),				\
 		   SPI_MEM_OP_ADDR(2, addr, 2),				\
 		   SPI_MEM_OP_DUMMY(ndummy, 2),				\
-		   SPI_MEM_OP_DATA_IN(len, buf, 2))
+		   SPI_MEM_OP_DATA_IN(len, buf, 2),			\
+		   SPI_MEM_OP_MAX_FREQ(__VA_ARGS__ + 0))
 
 #define SPINAND_PAGE_READ_FROM_CACHE_3A_1S_2S_2S_OP(addr, ndummy, buf, len) \
 	SPI_MEM_OP(SPI_MEM_OP_CMD(0xbb, 1),				\
@@ -151,11 +152,12 @@
 		   SPI_MEM_DTR_OP_DATA_IN(len, buf, 4),			\
 		   SPI_MEM_OP_MAX_FREQ(freq))
 
-#define SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(addr, ndummy, buf, len) \
+#define SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(addr, ndummy, buf, len, ...) \
 	SPI_MEM_OP(SPI_MEM_OP_CMD(0xeb, 1),				\
 		   SPI_MEM_OP_ADDR(2, addr, 4),				\
 		   SPI_MEM_OP_DUMMY(ndummy, 4),				\
-		   SPI_MEM_OP_DATA_IN(len, buf, 4))
+		   SPI_MEM_OP_DATA_IN(len, buf, 4),			\
+		   SPI_MEM_OP_MAX_FREQ(__VA_ARGS__ + 0))
 
 #define SPINAND_PAGE_READ_FROM_CACHE_3A_1S_4S_4S_OP(addr, ndummy, buf, len) \
 	SPI_MEM_OP(SPI_MEM_OP_CMD(0xeb, 1),				\
-- 
cgit v1.2.3


From ffaf1bf3737f706e4e9be876de4bc3c8fc578091 Mon Sep 17 00:00:00 2001
From: RubenKelevra <rubenkelevra@gmail.com>
Date: Wed, 18 Jun 2025 01:09:27 +0200
Subject: fs_context: fix parameter name in infofc() macro
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The macro takes a parameter called "p" but references "fc" internally.
This happens to compile as long as callers pass a variable named fc,
but breaks otherwise. Rename the first parameter to "fc" to match the
usage and to be consistent with warnfc() / errorfc().

Fixes: a3ff937b33d9 ("prefix-handling analogues of errorf() and friends")
Signed-off-by: RubenKelevra <rubenkelevra@gmail.com>
Link: https://lore.kernel.org/20250617230927.1790401-1-rubenkelevra@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs_context.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index a19e4bd32e4d..7773eb870039 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -200,7 +200,7 @@ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt,
  */
 #define infof(fc, fmt, ...) __logfc(fc, 'i', fmt, ## __VA_ARGS__)
 #define info_plog(p, fmt, ...) __plog(p, 'i', fmt, ## __VA_ARGS__)
-#define infofc(p, fmt, ...) __plog((&(fc)->log), 'i', fmt, ## __VA_ARGS__)
+#define infofc(fc, fmt, ...) __plog((&(fc)->log), 'i', fmt, ## __VA_ARGS__)
 
 /**
  * warnf - Store supplementary warning message
-- 
cgit v1.2.3


From 4cdf874f67adfdec4f0a288c76f9aba05f9babe2 Mon Sep 17 00:00:00 2001
From: Fabien Dessenne <fabien.dessenne@foss.st.com>
Date: Tue, 10 Jun 2025 17:23:06 +0200
Subject: dt-bindings: pinctrl: stm32: Add RSVD mux function

Document the RSVD (Reserved) mux function, used to reserve pins
for a coprocessor not running Linux.

Signed-off-by: Fabien Dessenne <fabien.dessenne@foss.st.com>
Signed-off-by: Antonio Borneo <antonio.borneo@foss.st.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/20250610152309.299438-3-antonio.borneo@foss.st.com
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml | 8 ++++++++
 include/dt-bindings/pinctrl/stm32-pinfunc.h                     | 1 +
 2 files changed, 9 insertions(+)

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml
index a28d77748095..5d17d6487ae9 100644
--- a/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml
+++ b/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml
@@ -160,9 +160,13 @@ patternProperties:
               * ...
               * 16 : Alternate Function 15
               * 17 : Analog
+              * 18 : Reserved
               To simplify the usage, macro is available to generate "pinmux" field.
               This macro is available here:
                 - include/dt-bindings/pinctrl/stm32-pinfunc.h
+              Setting the pinmux's function to the Reserved (RSVD) value is used to inform
+              the driver that it shall not apply the mux setting. This can be used to
+              reserve some pins, for example to a co-processor not running Linux.
               Some examples of using macro:
                /* GPIO A9 set as alternate function 2 */
                ... {
@@ -176,6 +180,10 @@ patternProperties:
                ... {
                           pinmux = <STM32_PINMUX('A', 9, ANALOG)>;
                };
+               /* GPIO A9 reserved for co-processor */
+               ... {
+                          pinmux = <STM32_PINMUX('A', 9, RSVD)>;
+               };
 
           bias-disable:
             type: boolean
diff --git a/include/dt-bindings/pinctrl/stm32-pinfunc.h b/include/dt-bindings/pinctrl/stm32-pinfunc.h
index 28ad0235086a..af3fd388329a 100644
--- a/include/dt-bindings/pinctrl/stm32-pinfunc.h
+++ b/include/dt-bindings/pinctrl/stm32-pinfunc.h
@@ -26,6 +26,7 @@
 #define AF14	0xf
 #define AF15	0x10
 #define ANALOG	0x11
+#define RSVD	0x12
 
 /* define Pins number*/
 #define PIN_NO(port, line)	(((port) - 'A') * 0x10 + (line))
-- 
cgit v1.2.3


From 82a0302e7167d0b7c6cde56613db3748f8dd806d Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 16 Jun 2025 16:38:49 +0800
Subject: padata: Remove comment for reorder_work

Remove comment for reorder_work which no longer exists.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Fixes: 71203f68c774 ("padata: Fix pd UAF once and for all")
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/padata.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/padata.h b/include/linux/padata.h
index b486c7359de2..765f2778e264 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -90,7 +90,6 @@ struct padata_cpumask {
  * @processed: Number of already processed objects.
  * @cpu: Next CPU to be processed.
  * @cpumask: The cpumasks in use for parallel and serial workers.
- * @reorder_work: work struct for reordering.
  */
 struct parallel_data {
 	struct padata_shell		*ps;
-- 
cgit v1.2.3


From 9724e6f1953644cc9a5d102605d624bc79609038 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Date: Wed, 11 Jun 2025 08:13:36 +0200
Subject: pinctrl: Constify pointers to 'pinctrl_desc'

Pin controller core code only stores the pointer to
'struct pinctrl_desc' and does not modify it anywhere.  The pointer can
be changed to pointer to const which makes the code safer, explicit and
later allows constifying 'pinctrl_desc' allocations in individual
drivers.

Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/20250611-pinctrl-const-desc-v2-4-b11c1d650384@linaro.org
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/pinctrl/core.c          | 13 +++++++------
 drivers/pinctrl/core.h          |  2 +-
 include/linux/pinctrl/pinctrl.h |  8 ++++----
 3 files changed, 12 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c
index 9046292d1360..73b78d6eac67 100644
--- a/drivers/pinctrl/core.c
+++ b/drivers/pinctrl/core.c
@@ -2062,7 +2062,7 @@ static int pinctrl_check_ops(struct pinctrl_dev *pctldev)
  * @driver_data: private pin controller data for this pin controller
  */
 static struct pinctrl_dev *
-pinctrl_init_controller(struct pinctrl_desc *pctldesc, struct device *dev,
+pinctrl_init_controller(const struct pinctrl_desc *pctldesc, struct device *dev,
 			void *driver_data)
 {
 	struct pinctrl_dev *pctldev;
@@ -2132,7 +2132,8 @@ out_err:
 	return ERR_PTR(ret);
 }
 
-static void pinctrl_uninit_controller(struct pinctrl_dev *pctldev, struct pinctrl_desc *pctldesc)
+static void pinctrl_uninit_controller(struct pinctrl_dev *pctldev,
+				      const struct pinctrl_desc *pctldesc)
 {
 	pinctrl_free_pindescs(pctldev, pctldesc->pins,
 			      pctldesc->npins);
@@ -2209,7 +2210,7 @@ EXPORT_SYMBOL_GPL(pinctrl_enable);
  * struct pinctrl_dev handle. To avoid issues later on, please use the
  * new pinctrl_register_and_init() below instead.
  */
-struct pinctrl_dev *pinctrl_register(struct pinctrl_desc *pctldesc,
+struct pinctrl_dev *pinctrl_register(const struct pinctrl_desc *pctldesc,
 				    struct device *dev, void *driver_data)
 {
 	struct pinctrl_dev *pctldev;
@@ -2239,7 +2240,7 @@ EXPORT_SYMBOL_GPL(pinctrl_register);
  * Note that pinctrl_enable() still needs to be manually called after
  * this once the driver is ready.
  */
-int pinctrl_register_and_init(struct pinctrl_desc *pctldesc,
+int pinctrl_register_and_init(const struct pinctrl_desc *pctldesc,
 			      struct device *dev, void *driver_data,
 			      struct pinctrl_dev **pctldev)
 {
@@ -2330,7 +2331,7 @@ static int devm_pinctrl_dev_match(struct device *dev, void *res, void *data)
  * The pinctrl device will be automatically released when the device is unbound.
  */
 struct pinctrl_dev *devm_pinctrl_register(struct device *dev,
-					  struct pinctrl_desc *pctldesc,
+					  const struct pinctrl_desc *pctldesc,
 					  void *driver_data)
 {
 	struct pinctrl_dev **ptr, *pctldev;
@@ -2364,7 +2365,7 @@ EXPORT_SYMBOL_GPL(devm_pinctrl_register);
  * The pinctrl device will be automatically released when the device is unbound.
  */
 int devm_pinctrl_register_and_init(struct device *dev,
-				   struct pinctrl_desc *pctldesc,
+				   const struct pinctrl_desc *pctldesc,
 				   void *driver_data,
 				   struct pinctrl_dev **pctldev)
 {
diff --git a/drivers/pinctrl/core.h b/drivers/pinctrl/core.h
index d6c24978e708..fc513a9cdd4f 100644
--- a/drivers/pinctrl/core.h
+++ b/drivers/pinctrl/core.h
@@ -51,7 +51,7 @@ struct pinctrl_state;
  */
 struct pinctrl_dev {
 	struct list_head node;
-	struct pinctrl_desc *desc;
+	const struct pinctrl_desc *desc;
 	struct radix_tree_root pin_desc_tree;
 #ifdef CONFIG_GENERIC_PINCTRL_GROUPS
 	struct radix_tree_root pin_group_tree;
diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h
index 9a8189ffd0f2..d138e1815645 100644
--- a/include/linux/pinctrl/pinctrl.h
+++ b/include/linux/pinctrl/pinctrl.h
@@ -165,25 +165,25 @@ struct pinctrl_desc {
 
 /* External interface to pin controller */
 
-extern int pinctrl_register_and_init(struct pinctrl_desc *pctldesc,
+extern int pinctrl_register_and_init(const struct pinctrl_desc *pctldesc,
 				     struct device *dev, void *driver_data,
 				     struct pinctrl_dev **pctldev);
 extern int pinctrl_enable(struct pinctrl_dev *pctldev);
 
 /* Please use pinctrl_register_and_init() and pinctrl_enable() instead */
-extern struct pinctrl_dev *pinctrl_register(struct pinctrl_desc *pctldesc,
+extern struct pinctrl_dev *pinctrl_register(const struct pinctrl_desc *pctldesc,
 				struct device *dev, void *driver_data);
 
 extern void pinctrl_unregister(struct pinctrl_dev *pctldev);
 
 extern int devm_pinctrl_register_and_init(struct device *dev,
-				struct pinctrl_desc *pctldesc,
+				const struct pinctrl_desc *pctldesc,
 				void *driver_data,
 				struct pinctrl_dev **pctldev);
 
 /* Please use devm_pinctrl_register_and_init() instead */
 extern struct pinctrl_dev *devm_pinctrl_register(struct device *dev,
-				struct pinctrl_desc *pctldesc,
+				const struct pinctrl_desc *pctldesc,
 				void *driver_data);
 
 extern void devm_pinctrl_unregister(struct device *dev,
-- 
cgit v1.2.3


From 8358102806c619d8d6c814010173617fb374b77e Mon Sep 17 00:00:00 2001
From: Jonas Karlman <jonas@kwiboo.se>
Date: Sun, 18 May 2025 22:06:48 +0000
Subject: dt-bindings: power: rockchip: Add support for RK3528

Add the compatible string and power domains for RK3528 SoC.

Signed-off-by: Jonas Karlman <jonas@kwiboo.se>
Acked-by: Conor Dooley <conor.dooley@microchip.com>
Link: https://lore.kernel.org/r/20250518220707.669515-2-jonas@kwiboo.se
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 .../bindings/power/rockchip,power-controller.yaml     |  1 +
 include/dt-bindings/power/rockchip,rk3528-power.h     | 19 +++++++++++++++++++
 2 files changed, 20 insertions(+)
 create mode 100644 include/dt-bindings/power/rockchip,rk3528-power.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/power/rockchip,power-controller.yaml b/Documentation/devicetree/bindings/power/rockchip,power-controller.yaml
index f494b7710c09..a884e49c995f 100644
--- a/Documentation/devicetree/bindings/power/rockchip,power-controller.yaml
+++ b/Documentation/devicetree/bindings/power/rockchip,power-controller.yaml
@@ -40,6 +40,7 @@ properties:
       - rockchip,rk3366-power-controller
       - rockchip,rk3368-power-controller
       - rockchip,rk3399-power-controller
+      - rockchip,rk3528-power-controller
       - rockchip,rk3562-power-controller
       - rockchip,rk3568-power-controller
       - rockchip,rk3576-power-controller
diff --git a/include/dt-bindings/power/rockchip,rk3528-power.h b/include/dt-bindings/power/rockchip,rk3528-power.h
new file mode 100644
index 000000000000..318923cdaaf6
--- /dev/null
+++ b/include/dt-bindings/power/rockchip,rk3528-power.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: (GPL-2.0+ OR MIT) */
+#ifndef __DT_BINDINGS_POWER_RK3528_POWER_H__
+#define __DT_BINDINGS_POWER_RK3528_POWER_H__
+
+#define RK3528_PD_PMU		0
+#define RK3528_PD_BUS		1
+#define RK3528_PD_DDR		2
+#define RK3528_PD_MSCH		3
+
+/* VD_GPU */
+#define RK3528_PD_GPU		4
+
+/* VD_LOGIC */
+#define RK3528_PD_RKVDEC	5
+#define RK3528_PD_RKVENC	6
+#define RK3528_PD_VO		7
+#define RK3528_PD_VPU		8
+
+#endif
-- 
cgit v1.2.3


From d80a75624051b817043431f847470fb4680f2582 Mon Sep 17 00:00:00 2001
From: Lifeng Zheng <zhenglifeng1@huawei.com>
Date: Mon, 26 May 2025 19:30:55 +0800
Subject: cpufreq: CPPC: Remove cpu_data_list

After commit a28b2bfc099c ("cppc_cpufreq: replace per-cpu data array with a
list"), cpu_data can be obtained from policy->driver_data, so cpu_data_list
is no longer needed and can be removed.

Signed-off-by: Lifeng Zheng <zhenglifeng1@huawei.com>
Link: https://patch.msgid.link/20250526113057.3086513-2-zhenglifeng1@huawei.com
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cppc_cpufreq.c | 25 -------------------------
 include/acpi/cppc_acpi.h       |  1 -
 2 files changed, 26 deletions(-)

(limited to 'include')

diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c
index b7c688a5659c..f3b5ea9fcbf5 100644
--- a/drivers/cpufreq/cppc_cpufreq.c
+++ b/drivers/cpufreq/cppc_cpufreq.c
@@ -26,14 +26,6 @@
 
 #include <acpi/cppc_acpi.h>
 
-/*
- * This list contains information parsed from per CPU ACPI _CPC and _PSD
- * structures: e.g. the highest and lowest supported performance, capabilities,
- * desired performance, level requested etc. Depending on the share_type, not
- * all CPUs will have an entry in the list.
- */
-static LIST_HEAD(cpu_data_list);
-
 static struct cpufreq_driver cppc_cpufreq_driver;
 
 #ifdef CONFIG_ACPI_CPPC_CPUFREQ_FIE
@@ -567,8 +559,6 @@ static struct cppc_cpudata *cppc_cpufreq_get_cpu_data(unsigned int cpu)
 		goto free_mask;
 	}
 
-	list_add(&cpu_data->node, &cpu_data_list);
-
 	return cpu_data;
 
 free_mask:
@@ -583,7 +573,6 @@ static void cppc_cpufreq_put_cpu_data(struct cpufreq_policy *policy)
 {
 	struct cppc_cpudata *cpu_data = policy->driver_data;
 
-	list_del(&cpu_data->node);
 	free_cpumask_var(cpu_data->shared_cpu_map);
 	kfree(cpu_data);
 	policy->driver_data = NULL;
@@ -954,24 +943,10 @@ static int __init cppc_cpufreq_init(void)
 	return ret;
 }
 
-static inline void free_cpu_data(void)
-{
-	struct cppc_cpudata *iter, *tmp;
-
-	list_for_each_entry_safe(iter, tmp, &cpu_data_list, node) {
-		free_cpumask_var(iter->shared_cpu_map);
-		list_del(&iter->node);
-		kfree(iter);
-	}
-
-}
-
 static void __exit cppc_cpufreq_exit(void)
 {
 	cpufreq_unregister_driver(&cppc_cpufreq_driver);
 	cppc_freq_invariance_exit();
-
-	free_cpu_data();
 }
 
 module_exit(cppc_cpufreq_exit);
diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h
index 325e9543e08f..20f3d62e7a16 100644
--- a/include/acpi/cppc_acpi.h
+++ b/include/acpi/cppc_acpi.h
@@ -139,7 +139,6 @@ struct cppc_perf_fb_ctrs {
 
 /* Per CPU container for runtime CPPC management. */
 struct cppc_cpudata {
-	struct list_head node;
 	struct cppc_perf_caps perf_caps;
 	struct cppc_perf_ctrls perf_ctrls;
 	struct cppc_perf_fb_ctrs perf_fb_ctrs;
-- 
cgit v1.2.3


From 271ff96d6066347cd267ac3bcd6021bd4d38913d Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Mon, 16 Jun 2025 09:12:07 +0300
Subject: PM: runtime: Document return values of suspend-related API functions

Document return values for device suspend and idle related API
functions.

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Link: https://patch.msgid.link/20250616061212.2286741-2-sakari.ailus@linux.intel.com
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/pm_runtime.h | 147 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 138 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index e7cb70fcc0af..9dd2e4031a27 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -337,6 +337,20 @@ static inline void pm_runtime_release_supplier(struct device_link *link) {}
  * Invoke the "idle check" callback of @dev and, depending on its return value,
  * set up autosuspend of @dev or suspend it (depending on whether or not
  * autosuspend has been enabled for it).
+ *
+ * Return:
+ * * 0: Success.
+ * * -EINVAL: Runtime PM error.
+ * * -EACCES: Runtime PM disabled.
+ * * -EAGAIN: Runtime PM usage_count non-zero, Runtime PM status change ongoing
+ *            or device not in %RPM_ACTIVE state.
+ * * -EBUSY: Runtime PM child_count non-zero.
+ * * -EPERM: Device PM QoS resume latency 0.
+ * * -EINPROGRESS: Suspend already in progress.
+ * * -ENOSYS: CONFIG_PM not enabled.
+ * * 1: Device already suspended.
+ * Other values and conditions for the above values are possible as returned by
+ * Runtime PM idle and suspend callbacks.
  */
 static inline int pm_runtime_idle(struct device *dev)
 {
@@ -346,6 +360,18 @@ static inline int pm_runtime_idle(struct device *dev)
 /**
  * pm_runtime_suspend - Suspend a device synchronously.
  * @dev: Target device.
+ *
+ * Return:
+ * * 0: Success.
+ * * -EINVAL: Runtime PM error.
+ * * -EACCES: Runtime PM disabled.
+ * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing.
+ * * -EBUSY: Runtime PM child_count non-zero.
+ * * -EPERM: Device PM QoS resume latency 0.
+ * * -ENOSYS: CONFIG_PM not enabled.
+ * * 1: Device already suspended.
+ * Other values and conditions for the above values are possible as returned by
+ * Runtime PM suspend callbacks.
  */
 static inline int pm_runtime_suspend(struct device *dev)
 {
@@ -358,6 +384,18 @@ static inline int pm_runtime_suspend(struct device *dev)
  *
  * Set up autosuspend of @dev or suspend it (depending on whether or not
  * autosuspend is enabled for it) without engaging its "idle check" callback.
+ *
+ * Return:
+ * * 0: Success.
+ * * -EINVAL: Runtime PM error.
+ * * -EACCES: Runtime PM disabled.
+ * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing.
+ * * -EBUSY: Runtime PM child_count non-zero.
+ * * -EPERM: Device PM QoS resume latency 0.
+ * * -ENOSYS: CONFIG_PM not enabled.
+ * * 1: Device already suspended.
+ * Other values and conditions for the above values are possible as returned by
+ * Runtime PM suspend callbacks.
  */
 static inline int pm_runtime_autosuspend(struct device *dev)
 {
@@ -379,6 +417,18 @@ static inline int pm_runtime_resume(struct device *dev)
  *
  * Queue up a work item to run an equivalent of pm_runtime_idle() for @dev
  * asynchronously.
+ *
+ * Return:
+ * * 0: Success.
+ * * -EINVAL: Runtime PM error.
+ * * -EACCES: Runtime PM disabled.
+ * * -EAGAIN: Runtime PM usage_count non-zero, Runtime PM status change ongoing
+ *            or device not in %RPM_ACTIVE state.
+ * * -EBUSY: Runtime PM child_count non-zero.
+ * * -EPERM: Device PM QoS resume latency 0.
+ * * -EINPROGRESS: Suspend already in progress.
+ * * -ENOSYS: CONFIG_PM not enabled.
+ * * 1: Device already suspended.
  */
 static inline int pm_request_idle(struct device *dev)
 {
@@ -400,6 +450,17 @@ static inline int pm_request_resume(struct device *dev)
  *
  * Queue up a work item to run an equivalent pm_runtime_autosuspend() for @dev
  * asynchronously.
+ *
+ * Return:
+ * * 0: Success.
+ * * -EINVAL: Runtime PM error.
+ * * -EACCES: Runtime PM disabled.
+ * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing.
+ * * -EBUSY: Runtime PM child_count non-zero.
+ * * -EPERM: Device PM QoS resume latency 0.
+ * * -EINPROGRESS: Suspend already in progress.
+ * * -ENOSYS: CONFIG_PM not enabled.
+ * * 1: Device already suspended.
  */
 static inline int pm_request_autosuspend(struct device *dev)
 {
@@ -464,6 +525,17 @@ static inline int pm_runtime_resume_and_get(struct device *dev)
  *
  * Decrement the runtime PM usage counter of @dev and if it turns out to be
  * equal to 0, queue up a work item for @dev like in pm_request_idle().
+ *
+ * Return:
+ * * 0: Success.
+ * * -EINVAL: Runtime PM error.
+ * * -EACCES: Runtime PM disabled.
+ * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing.
+ * * -EBUSY: Runtime PM child_count non-zero.
+ * * -EPERM: Device PM QoS resume latency 0.
+ * * -EINPROGRESS: Suspend already in progress.
+ * * -ENOSYS: CONFIG_PM not enabled.
+ * * 1: Device already suspended.
  */
 static inline int pm_runtime_put(struct device *dev)
 {
@@ -478,6 +550,17 @@ DEFINE_FREE(pm_runtime_put, struct device *, if (_T) pm_runtime_put(_T))
  *
  * Decrement the runtime PM usage counter of @dev and if it turns out to be
  * equal to 0, queue up a work item for @dev like in pm_request_autosuspend().
+ *
+ * Return:
+ * * 0: Success.
+ * * -EINVAL: Runtime PM error.
+ * * -EACCES: Runtime PM disabled.
+ * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing.
+ * * -EBUSY: Runtime PM child_count non-zero.
+ * * -EPERM: Device PM QoS resume latency 0.
+ * * -EINPROGRESS: Suspend already in progress.
+ * * -ENOSYS: CONFIG_PM not enabled.
+ * * 1: Device already suspended.
  */
 static inline int __pm_runtime_put_autosuspend(struct device *dev)
 {
@@ -490,6 +573,17 @@ static inline int __pm_runtime_put_autosuspend(struct device *dev)
  *
  * Decrement the runtime PM usage counter of @dev and if it turns out to be
  * equal to 0, queue up a work item for @dev like in pm_request_autosuspend().
+ *
+ * Return:
+ * * 0: Success.
+ * * -EINVAL: Runtime PM error.
+ * * -EACCES: Runtime PM disabled.
+ * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing.
+ * * -EBUSY: Runtime PM child_count non-zero.
+ * * -EPERM: Device PM QoS resume latency 0.
+ * * -EINPROGRESS: Suspend already in progress.
+ * * -ENOSYS: CONFIG_PM not enabled.
+ * * 1: Device already suspended.
  */
 static inline int pm_runtime_put_autosuspend(struct device *dev)
 {
@@ -506,9 +600,20 @@ static inline int pm_runtime_put_autosuspend(struct device *dev)
  * return value, set up autosuspend of @dev or suspend it (depending on whether
  * or not autosuspend has been enabled for it).
  *
- * The possible return values of this function are the same as for
- * pm_runtime_idle() and the runtime PM usage counter of @dev remains
- * decremented in all cases, even if it returns an error code.
+ * The runtime PM usage counter of @dev remains decremented in all cases, even
+ * if it returns an error code.
+ *
+ * Return:
+ * * 0: Success.
+ * * -EINVAL: Runtime PM error.
+ * * -EACCES: Runtime PM disabled.
+ * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing.
+ * * -EBUSY: Runtime PM child_count non-zero.
+ * * -EPERM: Device PM QoS resume latency 0.
+ * * -ENOSYS: CONFIG_PM not enabled.
+ * * 1: Device already suspended.
+ * Other values and conditions for the above values are possible as returned by
+ * Runtime PM suspend callbacks.
  */
 static inline int pm_runtime_put_sync(struct device *dev)
 {
@@ -522,9 +627,21 @@ static inline int pm_runtime_put_sync(struct device *dev)
  * Decrement the runtime PM usage counter of @dev and if it turns out to be
  * equal to 0, carry out runtime-suspend of @dev synchronously.
  *
- * The possible return values of this function are the same as for
- * pm_runtime_suspend() and the runtime PM usage counter of @dev remains
- * decremented in all cases, even if it returns an error code.
+ * The runtime PM usage counter of @dev remains decremented in all cases, even
+ * if it returns an error code.
+ *
+ * Return:
+ * * 0: Success.
+ * * -EINVAL: Runtime PM error.
+ * * -EACCES: Runtime PM disabled.
+ * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing.
+ * * -EAGAIN: usage_count non-zero or Runtime PM status change ongoing.
+ * * -EBUSY: Runtime PM child_count non-zero.
+ * * -EPERM: Device PM QoS resume latency 0.
+ * * -ENOSYS: CONFIG_PM not enabled.
+ * * 1: Device already suspended.
+ * Other values and conditions for the above values are possible as returned by
+ * Runtime PM suspend callbacks.
  */
 static inline int pm_runtime_put_sync_suspend(struct device *dev)
 {
@@ -539,9 +656,21 @@ static inline int pm_runtime_put_sync_suspend(struct device *dev)
  * equal to 0, set up autosuspend of @dev or suspend it synchronously (depending
  * on whether or not autosuspend has been enabled for it).
  *
- * The possible return values of this function are the same as for
- * pm_runtime_autosuspend() and the runtime PM usage counter of @dev remains
- * decremented in all cases, even if it returns an error code.
+ * The runtime PM usage counter of @dev remains decremented in all cases, even
+ * if it returns an error code.
+ *
+ * Return:
+ * * 0: Success.
+ * * -EINVAL: Runtime PM error.
+ * * -EACCES: Runtime PM disabled.
+ * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing.
+ * * -EBUSY: Runtime PM child_count non-zero.
+ * * -EPERM: Device PM QoS resume latency 0.
+ * * -EINPROGRESS: Suspend already in progress.
+ * * -ENOSYS: CONFIG_PM not enabled.
+ * * 1: Device already suspended.
+ * Other values and conditions for the above values are possible as returned by
+ * Runtime PM suspend callbacks.
  */
 static inline int pm_runtime_put_sync_autosuspend(struct device *dev)
 {
-- 
cgit v1.2.3


From b3db492e8335417dfd66c1fa2ea08e1d2f7b6736 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Mon, 16 Jun 2025 09:12:08 +0300
Subject: PM: runtime: Mark last busy stamp in pm_runtime_put_autosuspend()

Set the device's last busy timestamp to the current time in
pm_runtime_put_autosuspend(). Callers that do not want this need to use
__pm_runtime_put_autosuspend() instead.

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Link: https://patch.msgid.link/20250616061212.2286741-3-sakari.ailus@linux.intel.com
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/power/runtime_pm.rst | 23 ++++++++++-------------
 include/linux/pm_runtime.h         | 12 +++++++-----
 2 files changed, 17 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/Documentation/power/runtime_pm.rst b/Documentation/power/runtime_pm.rst
index 63344bea8393..e7bbdc66d64c 100644
--- a/Documentation/power/runtime_pm.rst
+++ b/Documentation/power/runtime_pm.rst
@@ -411,8 +411,9 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h:
       pm_request_idle(dev) and return its result
 
   `int pm_runtime_put_autosuspend(struct device *dev);`
-    - does the same as __pm_runtime_put_autosuspend() for now, but in the
-      future, will also call pm_runtime_mark_last_busy() as well, DO NOT USE!
+    - set the power.last_busy field to the current time and decrement the
+      device's usage counter; if the result is 0 then run
+      pm_request_autosuspend(dev) and return its result
 
   `int __pm_runtime_put_autosuspend(struct device *dev);`
     - decrement the device's usage counter; if the result is 0 then run
@@ -870,11 +871,9 @@ device is automatically suspended (the subsystem or driver still has to call
 the appropriate PM routines); rather it means that runtime suspends will
 automatically be delayed until the desired period of inactivity has elapsed.
 
-Inactivity is determined based on the power.last_busy field.  Drivers should
-call pm_runtime_mark_last_busy() to update this field after carrying out I/O,
-typically just before calling __pm_runtime_put_autosuspend().  The desired
-length of the inactivity period is a matter of policy.  Subsystems can set this
-length initially by calling pm_runtime_set_autosuspend_delay(), but after device
+Inactivity is determined based on the power.last_busy field. The desired length
+of the inactivity period is a matter of policy.  Subsystems can set this length
+initially by calling pm_runtime_set_autosuspend_delay(), but after device
 registration the length should be controlled by user space, using the
 /sys/devices/.../power/autosuspend_delay_ms attribute.
 
@@ -885,7 +884,7 @@ instead of the non-autosuspend counterparts::
 
 	Instead of: pm_runtime_suspend    use: pm_runtime_autosuspend;
 	Instead of: pm_schedule_suspend   use: pm_request_autosuspend;
-	Instead of: pm_runtime_put        use: __pm_runtime_put_autosuspend;
+	Instead of: pm_runtime_put        use: pm_runtime_put_autosuspend;
 	Instead of: pm_runtime_put_sync   use: pm_runtime_put_sync_autosuspend.
 
 Drivers may also continue to use the non-autosuspend helper functions; they
@@ -922,12 +921,10 @@ Here is a schematic pseudo-code example::
 	foo_io_completion(struct foo_priv *foo, void *req)
 	{
 		lock(&foo->private_lock);
-		if (--foo->num_pending_requests == 0) {
-			pm_runtime_mark_last_busy(&foo->dev);
-			__pm_runtime_put_autosuspend(&foo->dev);
-		} else {
+		if (--foo->num_pending_requests == 0)
+			pm_runtime_put_autosuspend(&foo->dev);
+		else
 			foo_process_next_request(foo);
-		}
 		unlock(&foo->private_lock);
 		/* Send req result back to the user ... */
 	}
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index 9dd2e4031a27..14ca7be96686 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -568,11 +568,13 @@ static inline int __pm_runtime_put_autosuspend(struct device *dev)
 }
 
 /**
- * pm_runtime_put_autosuspend - Drop device usage counter and queue autosuspend if 0.
+ * pm_runtime_put_autosuspend - Update the last access time of a device, drop
+ * its usage counter and queue autosuspend if the usage counter becomes 0.
  * @dev: Target device.
  *
- * Decrement the runtime PM usage counter of @dev and if it turns out to be
- * equal to 0, queue up a work item for @dev like in pm_request_autosuspend().
+ * Update the last access time of @dev, decrement runtime PM usage counter of
+ * @dev and if it turns out to be equal to 0, queue up a work item for @dev like
+ * in pm_request_autosuspend().
  *
  * Return:
  * * 0: Success.
@@ -587,8 +589,8 @@ static inline int __pm_runtime_put_autosuspend(struct device *dev)
  */
 static inline int pm_runtime_put_autosuspend(struct device *dev)
 {
-	return __pm_runtime_suspend(dev,
-	    RPM_GET_PUT | RPM_ASYNC | RPM_AUTO);
+	pm_runtime_mark_last_busy(dev);
+	return __pm_runtime_put_autosuspend(dev);
 }
 
 /**
-- 
cgit v1.2.3


From e24e0630b5ba13e83f65905becde9945518efa0b Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Mon, 16 Jun 2025 09:12:09 +0300
Subject: PM: runtime: Mark last busy stamp in
 pm_runtime_put_sync_autosuspend()

Set device's last busy timestamp to current time in
pm_runtime_put_sync_autosuspend().

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Link: https://patch.msgid.link/20250616061212.2286741-4-sakari.ailus@linux.intel.com
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/power/runtime_pm.rst |  3 ++-
 include/linux/pm_runtime.h         | 11 +++++++----
 2 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/Documentation/power/runtime_pm.rst b/Documentation/power/runtime_pm.rst
index e7bbdc66d64c..9c21c913f9cf 100644
--- a/Documentation/power/runtime_pm.rst
+++ b/Documentation/power/runtime_pm.rst
@@ -428,7 +428,8 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h:
       pm_runtime_suspend(dev) and return its result
 
   `int pm_runtime_put_sync_autosuspend(struct device *dev);`
-    - decrement the device's usage counter; if the result is 0 then run
+    - set the power.last_busy field to the current time and decrement the
+      device's usage counter; if the result is 0 then run
       pm_runtime_autosuspend(dev) and return its result
 
   `void pm_runtime_enable(struct device *dev);`
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index 14ca7be96686..3a0d5f0ea471 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -651,12 +651,14 @@ static inline int pm_runtime_put_sync_suspend(struct device *dev)
 }
 
 /**
- * pm_runtime_put_sync_autosuspend - Drop device usage counter and autosuspend if 0.
+ * pm_runtime_put_sync_autosuspend - Update the last access time of a device,
+ * drop device usage counter and autosuspend if 0.
  * @dev: Target device.
  *
- * Decrement the runtime PM usage counter of @dev and if it turns out to be
- * equal to 0, set up autosuspend of @dev or suspend it synchronously (depending
- * on whether or not autosuspend has been enabled for it).
+ * Update the last access time of @dev, decrement the runtime PM usage counter
+ * of @dev and if it turns out to be equal to 0, set up autosuspend of @dev or
+ * suspend it synchronously (depending on whether or not autosuspend has been
+ * enabled for it).
  *
  * The runtime PM usage counter of @dev remains decremented in all cases, even
  * if it returns an error code.
@@ -676,6 +678,7 @@ static inline int pm_runtime_put_sync_suspend(struct device *dev)
  */
 static inline int pm_runtime_put_sync_autosuspend(struct device *dev)
 {
+	pm_runtime_mark_last_busy(dev);
 	return __pm_runtime_suspend(dev, RPM_GET_PUT | RPM_AUTO);
 }
 
-- 
cgit v1.2.3


From 08071e64cb642ae19ebd6ffeb13b4f3d130b5860 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Mon, 16 Jun 2025 09:12:10 +0300
Subject: PM: runtime: Mark last busy stamp in pm_runtime_autosuspend()

Set device's last busy timestamp to current time in
pm_runtime_autosuspend().

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Link: https://patch.msgid.link/20250616061212.2286741-5-sakari.ailus@linux.intel.com
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/power/runtime_pm.rst | 15 ++++++---------
 include/linux/pm_runtime.h         |  9 ++++++---
 2 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/Documentation/power/runtime_pm.rst b/Documentation/power/runtime_pm.rst
index 9c21c913f9cf..39a0b62f6648 100644
--- a/Documentation/power/runtime_pm.rst
+++ b/Documentation/power/runtime_pm.rst
@@ -154,11 +154,9 @@ suspending the device are satisfied) and to queue up a suspend request for the
 device in that case.  If there is no idle callback, or if the callback returns
 0, then the PM core will attempt to carry out a runtime suspend of the device,
 also respecting devices configured for autosuspend.  In essence this means a
-call to pm_runtime_autosuspend() (do note that drivers needs to update the
-device last busy mark, pm_runtime_mark_last_busy(), to control the delay under
-this circumstance).  To prevent this (for example, if the callback routine has
-started a delayed suspend), the routine must return a non-zero value.  Negative
-error return codes are ignored by the PM core.
+call to pm_runtime_autosuspend(). To prevent this (for example, if the callback
+routine has started a delayed suspend), the routine must return a non-zero
+value.  Negative error return codes are ignored by the PM core.
 
 The helper functions provided by the PM core, described in Section 4, guarantee
 that the following constraints are met with respect to runtime PM callbacks for
@@ -330,10 +328,9 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h:
       'power.disable_depth' is different from 0
 
   `int pm_runtime_autosuspend(struct device *dev);`
-    - same as pm_runtime_suspend() except that the autosuspend delay is taken
-      `into account;` if pm_runtime_autosuspend_expiration() says the delay has
-      not yet expired then an autosuspend is scheduled for the appropriate time
-      and 0 is returned
+    - same as pm_runtime_suspend() except that a call to
+      pm_runtime_mark_last_busy() is made and an autosuspend is scheduled for
+      the appropriate time and 0 is returned
 
   `int pm_runtime_resume(struct device *dev);`
     - execute the subsystem-level resume callback for the device; returns 0 on
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index 3a0d5f0ea471..566a07b60f63 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -379,11 +379,13 @@ static inline int pm_runtime_suspend(struct device *dev)
 }
 
 /**
- * pm_runtime_autosuspend - Set up autosuspend of a device or suspend it.
+ * pm_runtime_autosuspend - Update the last access time and set up autosuspend
+ * of a device.
  * @dev: Target device.
  *
- * Set up autosuspend of @dev or suspend it (depending on whether or not
- * autosuspend is enabled for it) without engaging its "idle check" callback.
+ * First update the last access time, then set up autosuspend of @dev or suspend
+ * it (depending on whether or not autosuspend is enabled for it) without
+ * engaging its "idle check" callback.
  *
  * Return:
  * * 0: Success.
@@ -399,6 +401,7 @@ static inline int pm_runtime_suspend(struct device *dev)
  */
 static inline int pm_runtime_autosuspend(struct device *dev)
 {
+	pm_runtime_mark_last_busy(dev);
 	return __pm_runtime_suspend(dev, RPM_AUTO);
 }
 
-- 
cgit v1.2.3


From 18c1fe53d186867243f4cf17f4eef60737a16c4c Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Mon, 16 Jun 2025 09:12:11 +0300
Subject: PM: runtime: Mark last busy stamp in pm_request_autosuspend()

Set device's last busy timestamp to current time in
pm_request_autosuspend().

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Link: https://patch.msgid.link/20250616061212.2286741-6-sakari.ailus@linux.intel.com
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/power/runtime_pm.rst | 6 +++---
 include/linux/pm_runtime.h         | 8 +++++---
 2 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/Documentation/power/runtime_pm.rst b/Documentation/power/runtime_pm.rst
index 39a0b62f6648..91bc93422262 100644
--- a/Documentation/power/runtime_pm.rst
+++ b/Documentation/power/runtime_pm.rst
@@ -354,9 +354,9 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h:
       success or error code if the request has not been queued up
 
   `int pm_request_autosuspend(struct device *dev);`
-    - schedule the execution of the subsystem-level suspend callback for the
-      device when the autosuspend delay has expired; if the delay has already
-      expired then the work item is queued up immediately
+    - Call pm_runtime_mark_last_busy() and schedule the execution of the
+      subsystem-level suspend callback for the device when the autosuspend delay
+      expires
 
   `int pm_schedule_suspend(struct device *dev, unsigned int delay);`
     - schedule the execution of the subsystem-level suspend callback for the
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index 566a07b60f63..778d5988f35e 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -448,11 +448,12 @@ static inline int pm_request_resume(struct device *dev)
 }
 
 /**
- * pm_request_autosuspend - Queue up autosuspend of a device.
+ * pm_request_autosuspend - Update the last access time and queue up autosuspend
+ * of a device.
  * @dev: Target device.
  *
- * Queue up a work item to run an equivalent pm_runtime_autosuspend() for @dev
- * asynchronously.
+ * Update the last access time of a device and queue up a work item to run an
+ * equivalent pm_runtime_autosuspend() for @dev asynchronously.
  *
  * Return:
  * * 0: Success.
@@ -467,6 +468,7 @@ static inline int pm_request_resume(struct device *dev)
  */
 static inline int pm_request_autosuspend(struct device *dev)
 {
+	pm_runtime_mark_last_busy(dev);
 	return __pm_runtime_suspend(dev, RPM_ASYNC | RPM_AUTO);
 }
 
-- 
cgit v1.2.3


From c6d732c38f93c4aebd204a5656583142289c3a2e Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 17 Jun 2025 13:22:40 -0700
Subject: net: ethtool: remove duplicate defines for family info

The commit under Fixes switched to uAPI generation from the YAML
spec. A number of custom defines were left behind, mostly
for commands that are very hard to express in the YAML spec.

Among what was left behind were the name and version of
the generic netlink family. The problem is that the codegen
always outputs those values, so we ended up with a duplicated,
differently named set of defines.

Provide naming info in YAML and remove the incorrect defines.

Fixes: 8d0580c6ebdd ("ethtool: regenerate uapi header from the spec")
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20250617202240.811179-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/ethtool.yaml       | 3 +++
 include/uapi/linux/ethtool_netlink.h           | 4 ----
 include/uapi/linux/ethtool_netlink_generated.h | 4 ++--
 3 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml
index 9f98715a6512..72a076b0e1b5 100644
--- a/Documentation/netlink/specs/ethtool.yaml
+++ b/Documentation/netlink/specs/ethtool.yaml
@@ -7,6 +7,9 @@ protocol: genetlink-legacy
 doc: Partial family for Ethtool Netlink.
 uapi-header: linux/ethtool_netlink_generated.h
 
+c-family-name: ethtool-genl-name
+c-version-name: ethtool-genl-version
+
 definitions:
   -
     name: udp-tunnel-type
diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h
index 9ff72cfb2e98..09a75bdb6560 100644
--- a/include/uapi/linux/ethtool_netlink.h
+++ b/include/uapi/linux/ethtool_netlink.h
@@ -208,10 +208,6 @@ enum {
 	ETHTOOL_A_STATS_PHY_MAX = (__ETHTOOL_A_STATS_PHY_CNT - 1)
 };
 
-/* generic netlink info */
-#define ETHTOOL_GENL_NAME "ethtool"
-#define ETHTOOL_GENL_VERSION 1
-
 #define ETHTOOL_MCGRP_MONITOR_NAME "monitor"
 
 #endif /* _UAPI_LINUX_ETHTOOL_NETLINK_H_ */
diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h
index 9a02f579de22..aa8ab5227c1e 100644
--- a/include/uapi/linux/ethtool_netlink_generated.h
+++ b/include/uapi/linux/ethtool_netlink_generated.h
@@ -6,8 +6,8 @@
 #ifndef _UAPI_LINUX_ETHTOOL_NETLINK_GENERATED_H
 #define _UAPI_LINUX_ETHTOOL_NETLINK_GENERATED_H
 
-#define ETHTOOL_FAMILY_NAME	"ethtool"
-#define ETHTOOL_FAMILY_VERSION	1
+#define ETHTOOL_GENL_NAME	"ethtool"
+#define ETHTOOL_GENL_VERSION	1
 
 enum {
 	ETHTOOL_UDP_TUNNEL_TYPE_VXLAN,
-- 
cgit v1.2.3


From 4bfbc2691de8c869339090e851703209b17ba378 Mon Sep 17 00:00:00 2001
From: Thorsten Blum <thorsten.blum@linux.dev>
Date: Tue, 10 Jun 2025 12:40:59 +0200
Subject: mux: Convert mux_control_ops to a flex array member in mux_chip

Convert the mux member (the array of struct mux_control) to a flexible
array member at the end of the mux_chip struct and add the
__counted_by() compiler attribute to improve access bounds-checking via
CONFIG_UBSAN_BOUNDS and CONFIG_FORTIFY_SOURCE.

Use struct_size() to calculate the number of bytes to allocate for a new
mux chip and to remove the following Coccinelle/coccicheck warning:

  WARNING: Use struct_size

Use size_add() to safely add any extra bytes.

No functional changes intended.

Link: https://github.com/KSPP/linux/issues/83
Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
Link: https://lore.kernel.org/r/20250610104106.1948-2-thorsten.blum@linux.dev
Signed-off-by: Kees Cook <kees@kernel.org>
---
 drivers/mux/core.c         | 7 +++----
 include/linux/mux/driver.h | 4 ++--
 2 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/mux/core.c b/drivers/mux/core.c
index 02be4ba37257..a3840fe0995f 100644
--- a/drivers/mux/core.c
+++ b/drivers/mux/core.c
@@ -98,13 +98,12 @@ struct mux_chip *mux_chip_alloc(struct device *dev,
 	if (WARN_ON(!dev || !controllers))
 		return ERR_PTR(-EINVAL);
 
-	mux_chip = kzalloc(sizeof(*mux_chip) +
-			   controllers * sizeof(*mux_chip->mux) +
-			   sizeof_priv, GFP_KERNEL);
+	mux_chip = kzalloc(size_add(struct_size(mux_chip, mux, controllers),
+				    sizeof_priv),
+			   GFP_KERNEL);
 	if (!mux_chip)
 		return ERR_PTR(-ENOMEM);
 
-	mux_chip->mux = (struct mux_control *)(mux_chip + 1);
 	mux_chip->dev.class = &mux_class;
 	mux_chip->dev.type = &mux_type;
 	mux_chip->dev.parent = dev;
diff --git a/include/linux/mux/driver.h b/include/linux/mux/driver.h
index 18824064f8c0..e58e59354e23 100644
--- a/include/linux/mux/driver.h
+++ b/include/linux/mux/driver.h
@@ -56,18 +56,18 @@ struct mux_control {
 /**
  * struct mux_chip -	Represents a chip holding mux controllers.
  * @controllers:	Number of mux controllers handled by the chip.
- * @mux:		Array of mux controllers that are handled.
  * @dev:		Device structure.
  * @id:			Used to identify the device internally.
  * @ops:		Mux controller operations.
+ * @mux:		Array of mux controllers that are handled.
  */
 struct mux_chip {
 	unsigned int controllers;
-	struct mux_control *mux;
 	struct device dev;
 	int id;
 
 	const struct mux_control_ops *ops;
+	struct mux_control mux[] __counted_by(controllers);
 };
 
 #define to_mux_chip(x) container_of((x), struct mux_chip, dev)
-- 
cgit v1.2.3


From 29bb79e9dbf1ba100125e39deb7147acd490903f Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Tue, 17 Jun 2025 13:05:36 -0600
Subject: stddef: Introduce TRAILING_OVERLAP() helper macro

Add new TRAILING_OVERLAP() helper macro to create a union between
a flexible-array member (FAM) and a set of members that would
otherwise follow it. This overlays the trailing members onto the
FAM while preserving the original memory layout.

Co-developed-by: Kees Cook <kees@kernel.org>
Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Link: https://lore.kernel.org/r/aFG8gEwKXAWWIvX0@kspp
Signed-off-by: Kees Cook <kees@kernel.org>
---
 include/linux/stddef.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'include')

diff --git a/include/linux/stddef.h b/include/linux/stddef.h
index 929d67710cc5..dab49e2ec8c0 100644
--- a/include/linux/stddef.h
+++ b/include/linux/stddef.h
@@ -93,4 +93,24 @@ enum {
 #define DECLARE_FLEX_ARRAY(TYPE, NAME) \
 	__DECLARE_FLEX_ARRAY(TYPE, NAME)
 
+/**
+ * TRAILING_OVERLAP() - Overlap a flexible-array member with trailing members.
+ *
+ * Creates a union between a flexible-array member (FAM) in a struct and a set
+ * of additional members that would otherwise follow it.
+ *
+ * @TYPE: Flexible structure type name, including "struct" keyword.
+ * @NAME: Name for a variable to define.
+ * @FAM: The flexible-array member within @TYPE
+ * @MEMBERS: Trailing overlapping members.
+ */
+#define TRAILING_OVERLAP(TYPE, NAME, FAM, MEMBERS)				\
+	union {									\
+		TYPE NAME;							\
+		struct {							\
+			unsigned char __offset_to_##FAM[offsetof(TYPE, FAM)];	\
+			MEMBERS							\
+		};								\
+	}
+
 #endif
-- 
cgit v1.2.3


From 0c25ae62f5dc6a438b563536b5fe7fb6da3612b8 Mon Sep 17 00:00:00 2001
From: Luo Jie <quic_luoj@quicinc.com>
Date: Tue, 10 Jun 2025 18:35:18 +0800
Subject: dt-bindings: clock: qcom: Add CMN PLL support for IPQ5424 SoC

The CMN PLL block in the IPQ5424 SoC takes 48 MHz as the reference
input clock. The output clocks are the same as IPQ9574 SoC, except
for the clock rate of output clocks to PPE and NSS.

Also, add the new header file to export the CMN PLL output clock
specifiers for IPQ5424 SoC.

Acked-by: Rob Herring (Arm) <robh@kernel.org>
Signed-off-by: Luo Jie <quic_luoj@quicinc.com>
Link: https://lore.kernel.org/r/20250610-qcom_ipq5424_cmnpll-v3-1-ceada8165645@quicinc.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 .../bindings/clock/qcom,ipq9574-cmn-pll.yaml       |  1 +
 include/dt-bindings/clock/qcom,ipq5424-cmn-pll.h   | 22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+)
 create mode 100644 include/dt-bindings/clock/qcom,ipq5424-cmn-pll.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/qcom,ipq9574-cmn-pll.yaml b/Documentation/devicetree/bindings/clock/qcom,ipq9574-cmn-pll.yaml
index f869b3739be8..cb6e09f4247f 100644
--- a/Documentation/devicetree/bindings/clock/qcom,ipq9574-cmn-pll.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,ipq9574-cmn-pll.yaml
@@ -24,6 +24,7 @@ description:
 properties:
   compatible:
     enum:
+      - qcom,ipq5424-cmn-pll
       - qcom,ipq9574-cmn-pll
 
   reg:
diff --git a/include/dt-bindings/clock/qcom,ipq5424-cmn-pll.h b/include/dt-bindings/clock/qcom,ipq5424-cmn-pll.h
new file mode 100644
index 000000000000..f643c2668c04
--- /dev/null
+++ b/include/dt-bindings/clock/qcom,ipq5424-cmn-pll.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
+ */
+
+#ifndef _DT_BINDINGS_CLK_QCOM_IPQ5424_CMN_PLL_H
+#define _DT_BINDINGS_CLK_QCOM_IPQ5424_CMN_PLL_H
+
+/* CMN PLL core clock. */
+#define IPQ5424_CMN_PLL_CLK			0
+
+/* The output clocks from CMN PLL of IPQ5424. */
+#define IPQ5424_XO_24MHZ_CLK			1
+#define IPQ5424_SLEEP_32KHZ_CLK			2
+#define IPQ5424_PCS_31P25MHZ_CLK		3
+#define IPQ5424_NSS_300MHZ_CLK			4
+#define IPQ5424_PPE_375MHZ_CLK			5
+#define IPQ5424_ETH0_50MHZ_CLK			6
+#define IPQ5424_ETH1_50MHZ_CLK			7
+#define IPQ5424_ETH2_50MHZ_CLK			8
+#define IPQ5424_ETH_25MHZ_CLK			9
+#endif
-- 
cgit v1.2.3


From 3c0f211bc8fc0d0a0b4c29c471b57ffff48eec60 Mon Sep 17 00:00:00 2001
From: Matt Roper <matthew.d.roper@intel.com>
Date: Sat, 14 Jun 2025 01:01:43 +0530
Subject: drm/xe: Add Wildcat Lake device IDs to PTL list

Introduce Wildcat Lake device IDs.
Wildcat Lake uses slightly different graphics and media IP versions
than Panther Lake, but can still be treated as PTL for general driver
flows.

Bspec: 73951
Signed-off-by: Matt Roper <matthew.d.roper@intel.com>
Signed-off-by: Dnyaneshwar Bhadane <dnyaneshwar.bhadane@intel.com>
Reviewed-by: Tejas Upadhyay <tejas.upadhyay@intel.com>
Link: https://lore.kernel.org/r/20250613193146.3549862-7-dnyaneshwar.bhadane@intel.com
---
 include/drm/intel/pciids.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/drm/intel/pciids.h b/include/drm/intel/pciids.h
index 129c3172aa5f..a0180d10e260 100644
--- a/include/drm/intel/pciids.h
+++ b/include/drm/intel/pciids.h
@@ -871,6 +871,8 @@
 	MACRO__(0xB08F, ## __VA_ARGS__), \
 	MACRO__(0xB090, ## __VA_ARGS__), \
 	MACRO__(0xB0A0, ## __VA_ARGS__), \
-	MACRO__(0xB0B0, ## __VA_ARGS__)
+	MACRO__(0xB0B0, ## __VA_ARGS__), \
+	MACRO__(0xFD80, ## __VA_ARGS__), \
+	MACRO__(0xFD81, ## __VA_ARGS__)
 
 #endif /* __PCIIDS_H__ */
-- 
cgit v1.2.3


From a33556940b5727191613104bced53c93f4a7a3aa Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Tue, 17 Jun 2025 21:06:13 +0800
Subject: tcp: Remove inet_hashinfo2_free_mod()

DCCP was removed, so inet_hashinfo2_free_mod() is now unused.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20250617130613.498659-1-yuehaibing@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/inet_hashtables.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 4564b5d348b1..ae09e91398a5 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -202,12 +202,6 @@ static inline spinlock_t *inet_ehash_lockp(
 
 int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo);
 
-static inline void inet_hashinfo2_free_mod(struct inet_hashinfo *h)
-{
-	kfree(h->lhash2);
-	h->lhash2 = NULL;
-}
-
 static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo)
 {
 	kvfree(hashinfo->ehash_locks);
-- 
cgit v1.2.3


From 1ead7501094c6a61461c0c98dde9ec5660fa1e24 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <stfomichev@gmail.com>
Date: Mon, 16 Jun 2025 09:21:14 -0700
Subject: udp_tunnel: remove rtnl_lock dependency

Drivers that are using ops lock and don't depend on RTNL lock
still need to manage it because of udp_tunnel's RTNL dependency.
Introduce new udp_tunnel_nic_lock and use it instead of
rtnl_lock. Drop non-UDP_TUNNEL_NIC_INFO_MAY_SLEEP mode from
udp_tunnel infra (udp_tunnel_nic_device_sync_work needs to
grab udp_tunnel_nic_lock mutex and might sleep).

Cover more places in v4:

- netlink
  - udp_tunnel_notify_add_rx_port (ndo_open)
    - triggers udp_tunnel_nic_device_sync_work
  - udp_tunnel_notify_del_rx_port (ndo_stop)
    - triggers udp_tunnel_nic_device_sync_work
  - udp_tunnel_get_rx_info (__netdev_update_features)
    - triggers NETDEV_UDP_TUNNEL_PUSH_INFO
  - udp_tunnel_drop_rx_info (__netdev_update_features)
    - triggers NETDEV_UDP_TUNNEL_DROP_INFO
  - udp_tunnel_nic_reset_ntf (ndo_open)

- notifiers
  - udp_tunnel_nic_netdevice_event, depending on the event:
    - triggers NETDEV_UDP_TUNNEL_PUSH_INFO
    - triggers NETDEV_UDP_TUNNEL_DROP_INFO

- ethnl_tunnel_info_reply_size
- udp_tunnel_nic_set_port_priv (two intel drivers)

Cc: Michael Chan <michael.chan@broadcom.com>
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Stanislav Fomichev <stfomichev@gmail.com>
Link: https://patch.msgid.link/20250616162117.287806-4-stfomichev@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c   |  3 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt.c          |  6 +-
 drivers/net/ethernet/emulex/benet/be_main.c        |  3 +-
 drivers/net/ethernet/intel/i40e/i40e_main.c        |  1 -
 drivers/net/ethernet/intel/ice/ice_main.c          |  1 -
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c     |  3 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  3 +-
 .../net/ethernet/netronome/nfp/nfp_net_common.c    |  3 +-
 drivers/net/ethernet/qlogic/qede/qede_filter.c     |  3 -
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c   |  1 -
 drivers/net/ethernet/sfc/ef10.c                    |  1 -
 drivers/net/netdevsim/udp_tunnels.c                |  4 -
 include/net/udp_tunnel.h                           | 87 ++++++++++++++++------
 net/core/dev.c                                     |  2 +
 net/ipv4/udp_tunnel_core.c                         | 16 ++--
 net/ipv4/udp_tunnel_nic.c                          | 78 ++++++++++++++-----
 16 files changed, 142 insertions(+), 73 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index c9a1a1d504c0..3ee4b848ef53 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -10219,8 +10219,7 @@ static int bnx2x_udp_tunnel_sync(struct net_device *netdev, unsigned int table)
 
 static const struct udp_tunnel_nic_info bnx2x_udp_tunnels = {
 	.sync_table	= bnx2x_udp_tunnel_sync,
-	.flags		= UDP_TUNNEL_NIC_INFO_MAY_SLEEP |
-			  UDP_TUNNEL_NIC_INFO_OPEN_ONLY,
+	.flags		= UDP_TUNNEL_NIC_INFO_OPEN_ONLY,
 	.tables		= {
 		{ .n_entries = 1, .tunnel_types = UDP_TUNNEL_TYPE_VXLAN,  },
 		{ .n_entries = 1, .tunnel_types = UDP_TUNNEL_TYPE_GENEVE, },
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 00a60b2b90c4..ededd292b9d3 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -15573,8 +15573,7 @@ static int bnxt_udp_tunnel_unset_port(struct net_device *netdev, unsigned int ta
 static const struct udp_tunnel_nic_info bnxt_udp_tunnels = {
 	.set_port	= bnxt_udp_tunnel_set_port,
 	.unset_port	= bnxt_udp_tunnel_unset_port,
-	.flags		= UDP_TUNNEL_NIC_INFO_MAY_SLEEP |
-			  UDP_TUNNEL_NIC_INFO_OPEN_ONLY,
+	.flags		= UDP_TUNNEL_NIC_INFO_OPEN_ONLY,
 	.tables		= {
 		{ .n_entries = 1, .tunnel_types = UDP_TUNNEL_TYPE_VXLAN,  },
 		{ .n_entries = 1, .tunnel_types = UDP_TUNNEL_TYPE_GENEVE, },
@@ -15582,8 +15581,7 @@ static const struct udp_tunnel_nic_info bnxt_udp_tunnels = {
 }, bnxt_udp_tunnels_p7 = {
 	.set_port	= bnxt_udp_tunnel_set_port,
 	.unset_port	= bnxt_udp_tunnel_unset_port,
-	.flags		= UDP_TUNNEL_NIC_INFO_MAY_SLEEP |
-			  UDP_TUNNEL_NIC_INFO_OPEN_ONLY,
+	.flags		= UDP_TUNNEL_NIC_INFO_OPEN_ONLY,
 	.tables		= {
 		{ .n_entries = 1, .tunnel_types = UDP_TUNNEL_TYPE_VXLAN,  },
 		{ .n_entries = 1, .tunnel_types = UDP_TUNNEL_TYPE_GENEVE, },
diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
index 3d2e21592119..f49400ba9729 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -4031,8 +4031,7 @@ static int be_vxlan_unset_port(struct net_device *netdev, unsigned int table,
 static const struct udp_tunnel_nic_info be_udp_tunnels = {
 	.set_port	= be_vxlan_set_port,
 	.unset_port	= be_vxlan_unset_port,
-	.flags		= UDP_TUNNEL_NIC_INFO_MAY_SLEEP |
-			  UDP_TUNNEL_NIC_INFO_OPEN_ONLY,
+	.flags		= UDP_TUNNEL_NIC_INFO_OPEN_ONLY,
 	.tables		= {
 		{ .n_entries = 1, .tunnel_types = UDP_TUNNEL_TYPE_VXLAN, },
 	},
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 1156a5b3055c..3b4f59d978a5 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -15895,7 +15895,6 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	pf->udp_tunnel_nic.set_port = i40e_udp_tunnel_set_port;
 	pf->udp_tunnel_nic.unset_port = i40e_udp_tunnel_unset_port;
-	pf->udp_tunnel_nic.flags = UDP_TUNNEL_NIC_INFO_MAY_SLEEP;
 	pf->udp_tunnel_nic.shared = &pf->udp_tunnel_shared;
 	pf->udp_tunnel_nic.tables[0].n_entries = I40E_MAX_PF_UDP_OFFLOAD_PORTS;
 	pf->udp_tunnel_nic.tables[0].tunnel_types = UDP_TUNNEL_TYPE_VXLAN |
diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index 7959a65c0903..f8ef80069e3d 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -4767,7 +4767,6 @@ int ice_init_dev(struct ice_pf *pf)
 
 	pf->hw.udp_tunnel_nic.set_port = ice_udp_tunnel_set_port;
 	pf->hw.udp_tunnel_nic.unset_port = ice_udp_tunnel_unset_port;
-	pf->hw.udp_tunnel_nic.flags = UDP_TUNNEL_NIC_INFO_MAY_SLEEP;
 	pf->hw.udp_tunnel_nic.shared = &pf->hw.udp_tunnel_shared;
 	if (pf->hw.tnl.valid_count[TNL_VXLAN]) {
 		pf->hw.udp_tunnel_nic.tables[0].n_entries =
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 281b34af0bb4..d2071aff7b8f 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -2670,8 +2670,7 @@ static int mlx4_udp_tunnel_sync(struct net_device *dev, unsigned int table)
 
 static const struct udp_tunnel_nic_info mlx4_udp_tunnels = {
 	.sync_table	= mlx4_udp_tunnel_sync,
-	.flags		= UDP_TUNNEL_NIC_INFO_MAY_SLEEP |
-			  UDP_TUNNEL_NIC_INFO_IPV4_ONLY,
+	.flags		= UDP_TUNNEL_NIC_INFO_IPV4_ONLY,
 	.tables		= {
 		{ .n_entries = 1, .tunnel_types = UDP_TUNNEL_TYPE_VXLAN, },
 	},
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 24559cbcbfc2..dca5ca51a470 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -5351,8 +5351,7 @@ void mlx5e_vxlan_set_netdev_info(struct mlx5e_priv *priv)
 
 	priv->nic_info.set_port = mlx5e_vxlan_set_port;
 	priv->nic_info.unset_port = mlx5e_vxlan_unset_port;
-	priv->nic_info.flags = UDP_TUNNEL_NIC_INFO_MAY_SLEEP |
-				UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN;
+	priv->nic_info.flags = UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN;
 	priv->nic_info.tables[0].tunnel_types = UDP_TUNNEL_TYPE_VXLAN;
 	/* Don't count the space hard-coded to the IANA port */
 	priv->nic_info.tables[0].n_entries =
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 932f59d70f41..132626a3f9f7 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -2394,8 +2394,7 @@ static int nfp_udp_tunnel_sync(struct net_device *netdev, unsigned int table)
 
 static const struct udp_tunnel_nic_info nfp_udp_tunnels = {
 	.sync_table     = nfp_udp_tunnel_sync,
-	.flags          = UDP_TUNNEL_NIC_INFO_MAY_SLEEP |
-			  UDP_TUNNEL_NIC_INFO_OPEN_ONLY,
+	.flags          = UDP_TUNNEL_NIC_INFO_OPEN_ONLY,
 	.tables         = {
 		{
 			.n_entries      = NFP_NET_N_VXLAN_PORTS,
diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c
index 985026dd816f..7e341e026489 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_filter.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c
@@ -987,20 +987,17 @@ static int qede_udp_tunnel_sync(struct net_device *dev, unsigned int table)
 
 static const struct udp_tunnel_nic_info qede_udp_tunnels_both = {
 	.sync_table	= qede_udp_tunnel_sync,
-	.flags		= UDP_TUNNEL_NIC_INFO_MAY_SLEEP,
 	.tables		= {
 		{ .n_entries = 1, .tunnel_types = UDP_TUNNEL_TYPE_VXLAN,  },
 		{ .n_entries = 1, .tunnel_types = UDP_TUNNEL_TYPE_GENEVE, },
 	},
 }, qede_udp_tunnels_vxlan = {
 	.sync_table	= qede_udp_tunnel_sync,
-	.flags		= UDP_TUNNEL_NIC_INFO_MAY_SLEEP,
 	.tables		= {
 		{ .n_entries = 1, .tunnel_types = UDP_TUNNEL_TYPE_VXLAN,  },
 	},
 }, qede_udp_tunnels_geneve = {
 	.sync_table	= qede_udp_tunnel_sync,
-	.flags		= UDP_TUNNEL_NIC_INFO_MAY_SLEEP,
 	.tables		= {
 		{ .n_entries = 1, .tunnel_types = UDP_TUNNEL_TYPE_GENEVE, },
 	},
diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
index eb69121df726..53cdd36c4123 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c
@@ -486,7 +486,6 @@ static int qlcnic_udp_tunnel_sync(struct net_device *dev, unsigned int table)
 
 static const struct udp_tunnel_nic_info qlcnic_udp_tunnels = {
 	.sync_table	= qlcnic_udp_tunnel_sync,
-	.flags		= UDP_TUNNEL_NIC_INFO_MAY_SLEEP,
 	.tables		= {
 		{ .n_entries = 1, .tunnel_types = UDP_TUNNEL_TYPE_VXLAN, },
 	},
diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c
index 47349c148c0c..fcec81f862ec 100644
--- a/drivers/net/ethernet/sfc/ef10.c
+++ b/drivers/net/ethernet/sfc/ef10.c
@@ -3985,7 +3985,6 @@ static int efx_ef10_udp_tnl_unset_port(struct net_device *dev,
 static const struct udp_tunnel_nic_info efx_ef10_udp_tunnels = {
 	.set_port	= efx_ef10_udp_tnl_set_port,
 	.unset_port	= efx_ef10_udp_tnl_unset_port,
-	.flags          = UDP_TUNNEL_NIC_INFO_MAY_SLEEP,
 	.tables         = {
 		{
 			.n_entries = 16,
diff --git a/drivers/net/netdevsim/udp_tunnels.c b/drivers/net/netdevsim/udp_tunnels.c
index 640b4983a9a0..10cbbf1c584b 100644
--- a/drivers/net/netdevsim/udp_tunnels.c
+++ b/drivers/net/netdevsim/udp_tunnels.c
@@ -112,12 +112,10 @@ nsim_udp_tunnels_info_reset_write(struct file *file, const char __user *data,
 	struct net_device *dev = file->private_data;
 	struct netdevsim *ns = netdev_priv(dev);
 
-	rtnl_lock();
 	if (dev->reg_state == NETREG_REGISTERED) {
 		memset(ns->udp_ports.ports, 0, sizeof(ns->udp_ports.__ports));
 		udp_tunnel_nic_reset_ntf(dev);
 	}
-	rtnl_unlock();
 
 	return count;
 }
@@ -181,8 +179,6 @@ int nsim_udp_tunnels_info_create(struct nsim_dev *nsim_dev,
 		info->sync_table = NULL;
 	}
 
-	if (ns->udp_ports.sleep)
-		info->flags |= UDP_TUNNEL_NIC_INFO_MAY_SLEEP;
 	if (nsim_dev->udp_ports.open_only)
 		info->flags |= UDP_TUNNEL_NIC_INFO_OPEN_ONLY;
 	if (nsim_dev->udp_ports.ipv4_only)
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index e3c70b579095..cbd3a43074bd 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -130,22 +130,6 @@ void udp_tunnel_drop_rx_port(struct net_device *dev, struct socket *sock,
 void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type);
 void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type);
 
-static inline void udp_tunnel_get_rx_info(struct net_device *dev)
-{
-	ASSERT_RTNL();
-	if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT))
-		return;
-	call_netdevice_notifiers(NETDEV_UDP_TUNNEL_PUSH_INFO, dev);
-}
-
-static inline void udp_tunnel_drop_rx_info(struct net_device *dev)
-{
-	ASSERT_RTNL();
-	if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT))
-		return;
-	call_netdevice_notifiers(NETDEV_UDP_TUNNEL_DROP_INFO, dev);
-}
-
 /* Transmit the skb using UDP encapsulation. */
 void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
 			 __be32 src, __be32 dst, __u8 tos, __u8 ttl,
@@ -222,19 +206,17 @@ static inline void udp_tunnel_encap_enable(struct sock *sk)
 #define UDP_TUNNEL_NIC_MAX_TABLES	4
 
 enum udp_tunnel_nic_info_flags {
-	/* Device callbacks may sleep */
-	UDP_TUNNEL_NIC_INFO_MAY_SLEEP	= BIT(0),
 	/* Device only supports offloads when it's open, all ports
 	 * will be removed before close and re-added after open.
 	 */
-	UDP_TUNNEL_NIC_INFO_OPEN_ONLY	= BIT(1),
+	UDP_TUNNEL_NIC_INFO_OPEN_ONLY	= BIT(0),
 	/* Device supports only IPv4 tunnels */
-	UDP_TUNNEL_NIC_INFO_IPV4_ONLY	= BIT(2),
+	UDP_TUNNEL_NIC_INFO_IPV4_ONLY	= BIT(1),
 	/* Device has hard-coded the IANA VXLAN port (4789) as VXLAN.
 	 * This port must not be counted towards n_entries of any table.
 	 * Driver will not receive any callback associated with port 4789.
 	 */
-	UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN	= BIT(3),
+	UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN	= BIT(2),
 };
 
 struct udp_tunnel_nic;
@@ -325,6 +307,9 @@ struct udp_tunnel_nic_ops {
 	size_t (*dump_size)(struct net_device *dev, unsigned int table);
 	int (*dump_write)(struct net_device *dev, unsigned int table,
 			  struct sk_buff *skb);
+	void (*assert_locked)(struct net_device *dev);
+	void (*lock)(struct net_device *dev);
+	void (*unlock)(struct net_device *dev);
 };
 
 #ifdef CONFIG_INET
@@ -353,8 +338,29 @@ static inline void
 udp_tunnel_nic_set_port_priv(struct net_device *dev, unsigned int table,
 			     unsigned int idx, u8 priv)
 {
-	if (udp_tunnel_nic_ops)
+	if (udp_tunnel_nic_ops) {
+		udp_tunnel_nic_ops->lock(dev);
 		udp_tunnel_nic_ops->set_port_priv(dev, table, idx, priv);
+		udp_tunnel_nic_ops->unlock(dev);
+	}
+}
+
+static inline void udp_tunnel_nic_assert_locked(struct net_device *dev)
+{
+	if (udp_tunnel_nic_ops)
+		udp_tunnel_nic_ops->assert_locked(dev);
+}
+
+static inline void udp_tunnel_nic_lock(struct net_device *dev)
+{
+	if (udp_tunnel_nic_ops)
+		udp_tunnel_nic_ops->lock(dev);
+}
+
+static inline void udp_tunnel_nic_unlock(struct net_device *dev)
+{
+	if (udp_tunnel_nic_ops)
+		udp_tunnel_nic_ops->unlock(dev);
 }
 
 static inline void
@@ -396,17 +402,50 @@ static inline void udp_tunnel_nic_reset_ntf(struct net_device *dev)
 static inline size_t
 udp_tunnel_nic_dump_size(struct net_device *dev, unsigned int table)
 {
+	size_t ret;
+
 	if (!udp_tunnel_nic_ops)
 		return 0;
-	return udp_tunnel_nic_ops->dump_size(dev, table);
+
+	udp_tunnel_nic_ops->lock(dev);
+	ret = udp_tunnel_nic_ops->dump_size(dev, table);
+	udp_tunnel_nic_ops->unlock(dev);
+
+	return ret;
 }
 
 static inline int
 udp_tunnel_nic_dump_write(struct net_device *dev, unsigned int table,
 			  struct sk_buff *skb)
 {
+	int ret;
+
 	if (!udp_tunnel_nic_ops)
 		return 0;
-	return udp_tunnel_nic_ops->dump_write(dev, table, skb);
+
+	udp_tunnel_nic_ops->lock(dev);
+	ret = udp_tunnel_nic_ops->dump_write(dev, table, skb);
+	udp_tunnel_nic_ops->unlock(dev);
+
+	return ret;
+}
+
+static inline void udp_tunnel_get_rx_info(struct net_device *dev)
+{
+	ASSERT_RTNL();
+	if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT))
+		return;
+	udp_tunnel_nic_assert_locked(dev);
+	call_netdevice_notifiers(NETDEV_UDP_TUNNEL_PUSH_INFO, dev);
 }
+
+static inline void udp_tunnel_drop_rx_info(struct net_device *dev)
+{
+	ASSERT_RTNL();
+	if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT))
+		return;
+	udp_tunnel_nic_assert_locked(dev);
+	call_netdevice_notifiers(NETDEV_UDP_TUNNEL_DROP_INFO, dev);
+}
+
 #endif
diff --git a/net/core/dev.c b/net/core/dev.c
index 5baa4691074f..43f56b44f351 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -10771,12 +10771,14 @@ sync_lower:
 			 * *before* calling udp_tunnel_get_rx_info,
 			 * but *after* calling udp_tunnel_drop_rx_info.
 			 */
+			udp_tunnel_nic_lock(dev);
 			if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
 				dev->features = features;
 				udp_tunnel_get_rx_info(dev);
 			} else {
 				udp_tunnel_drop_rx_info(dev);
 			}
+			udp_tunnel_nic_unlock(dev);
 		}
 
 		if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c
index 9efd62505916..fce945f23069 100644
--- a/net/ipv4/udp_tunnel_core.c
+++ b/net/ipv4/udp_tunnel_core.c
@@ -134,15 +134,17 @@ void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type)
 	struct udp_tunnel_info ti;
 	struct net_device *dev;
 
+	ASSERT_RTNL();
+
 	ti.type = type;
 	ti.sa_family = sk->sk_family;
 	ti.port = inet_sk(sk)->inet_sport;
 
-	rcu_read_lock();
-	for_each_netdev_rcu(net, dev) {
+	for_each_netdev(net, dev) {
+		udp_tunnel_nic_lock(dev);
 		udp_tunnel_nic_add_port(dev, &ti);
+		udp_tunnel_nic_unlock(dev);
 	}
-	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(udp_tunnel_notify_add_rx_port);
 
@@ -154,15 +156,17 @@ void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type)
 	struct udp_tunnel_info ti;
 	struct net_device *dev;
 
+	ASSERT_RTNL();
+
 	ti.type = type;
 	ti.sa_family = sk->sk_family;
 	ti.port = inet_sk(sk)->inet_sport;
 
-	rcu_read_lock();
-	for_each_netdev_rcu(net, dev) {
+	for_each_netdev(net, dev) {
+		udp_tunnel_nic_lock(dev);
 		udp_tunnel_nic_del_port(dev, &ti);
+		udp_tunnel_nic_unlock(dev);
 	}
-	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(udp_tunnel_notify_del_rx_port);
 
diff --git a/net/ipv4/udp_tunnel_nic.c b/net/ipv4/udp_tunnel_nic.c
index b6d2d16189c0..ff66db48453c 100644
--- a/net/ipv4/udp_tunnel_nic.c
+++ b/net/ipv4/udp_tunnel_nic.c
@@ -29,6 +29,7 @@ struct udp_tunnel_nic_table_entry {
  * struct udp_tunnel_nic - UDP tunnel port offload state
  * @work:	async work for talking to hardware from process context
  * @dev:	netdev pointer
+ * @lock:	protects all fields
  * @need_sync:	at least one port start changed
  * @need_replay: space was freed, we need a replay of all ports
  * @work_pending: @work is currently scheduled
@@ -41,6 +42,8 @@ struct udp_tunnel_nic {
 
 	struct net_device *dev;
 
+	struct mutex lock;
+
 	u8 need_sync:1;
 	u8 need_replay:1;
 	u8 work_pending:1;
@@ -298,22 +301,11 @@ __udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn)
 static void
 udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn)
 {
-	const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
-	bool may_sleep;
-
 	if (!utn->need_sync)
 		return;
 
-	/* Drivers which sleep in the callback need to update from
-	 * the workqueue, if we come from the tunnel driver's notification.
-	 */
-	may_sleep = info->flags & UDP_TUNNEL_NIC_INFO_MAY_SLEEP;
-	if (!may_sleep)
-		__udp_tunnel_nic_device_sync(dev, utn);
-	if (may_sleep || utn->need_replay) {
-		queue_work(udp_tunnel_nic_workqueue, &utn->work);
-		utn->work_pending = 1;
-	}
+	queue_work(udp_tunnel_nic_workqueue, &utn->work);
+	utn->work_pending = 1;
 }
 
 static bool
@@ -554,12 +546,12 @@ static void __udp_tunnel_nic_reset_ntf(struct net_device *dev)
 	struct udp_tunnel_nic *utn;
 	unsigned int i, j;
 
-	ASSERT_RTNL();
-
 	utn = dev->udp_tunnel_nic;
 	if (!utn)
 		return;
 
+	mutex_lock(&utn->lock);
+
 	utn->need_sync = false;
 	for (i = 0; i < utn->n_tables; i++)
 		for (j = 0; j < info->tables[i].n_entries; j++) {
@@ -569,7 +561,7 @@ static void __udp_tunnel_nic_reset_ntf(struct net_device *dev)
 
 			entry->flags &= ~(UDP_TUNNEL_NIC_ENTRY_DEL |
 					  UDP_TUNNEL_NIC_ENTRY_OP_FAIL);
-			/* We don't release rtnl across ops */
+			/* We don't release utn lock across ops */
 			WARN_ON(entry->flags & UDP_TUNNEL_NIC_ENTRY_FROZEN);
 			if (!entry->use_cnt)
 				continue;
@@ -579,6 +571,8 @@ static void __udp_tunnel_nic_reset_ntf(struct net_device *dev)
 		}
 
 	__udp_tunnel_nic_device_sync(dev, utn);
+
+	mutex_unlock(&utn->lock);
 }
 
 static size_t
@@ -643,6 +637,33 @@ err_cancel:
 	return -EMSGSIZE;
 }
 
+static void __udp_tunnel_nic_assert_locked(struct net_device *dev)
+{
+	struct udp_tunnel_nic *utn;
+
+	utn = dev->udp_tunnel_nic;
+	if (utn)
+		lockdep_assert_held(&utn->lock);
+}
+
+static void __udp_tunnel_nic_lock(struct net_device *dev)
+{
+	struct udp_tunnel_nic *utn;
+
+	utn = dev->udp_tunnel_nic;
+	if (utn)
+		mutex_lock(&utn->lock);
+}
+
+static void __udp_tunnel_nic_unlock(struct net_device *dev)
+{
+	struct udp_tunnel_nic *utn;
+
+	utn = dev->udp_tunnel_nic;
+	if (utn)
+		mutex_unlock(&utn->lock);
+}
+
 static const struct udp_tunnel_nic_ops __udp_tunnel_nic_ops = {
 	.get_port	= __udp_tunnel_nic_get_port,
 	.set_port_priv	= __udp_tunnel_nic_set_port_priv,
@@ -651,6 +672,9 @@ static const struct udp_tunnel_nic_ops __udp_tunnel_nic_ops = {
 	.reset_ntf	= __udp_tunnel_nic_reset_ntf,
 	.dump_size	= __udp_tunnel_nic_dump_size,
 	.dump_write	= __udp_tunnel_nic_dump_write,
+	.assert_locked	= __udp_tunnel_nic_assert_locked,
+	.lock		= __udp_tunnel_nic_lock,
+	.unlock		= __udp_tunnel_nic_unlock,
 };
 
 static void
@@ -710,11 +734,15 @@ static void udp_tunnel_nic_device_sync_work(struct work_struct *work)
 		container_of(work, struct udp_tunnel_nic, work);
 
 	rtnl_lock();
+	mutex_lock(&utn->lock);
+
 	utn->work_pending = 0;
 	__udp_tunnel_nic_device_sync(utn->dev, utn);
 
 	if (utn->need_replay)
 		udp_tunnel_nic_replay(utn->dev, utn);
+
+	mutex_unlock(&utn->lock);
 	rtnl_unlock();
 }
 
@@ -730,6 +758,7 @@ udp_tunnel_nic_alloc(const struct udp_tunnel_nic_info *info,
 		return NULL;
 	utn->n_tables = n_tables;
 	INIT_WORK(&utn->work, udp_tunnel_nic_device_sync_work);
+	mutex_init(&utn->lock);
 
 	for (i = 0; i < n_tables; i++) {
 		utn->entries[i] = kcalloc(info->tables[i].n_entries,
@@ -821,8 +850,11 @@ static int udp_tunnel_nic_register(struct net_device *dev)
 	dev_hold(dev);
 	dev->udp_tunnel_nic = utn;
 
-	if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY))
+	if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) {
+		udp_tunnel_nic_lock(dev);
 		udp_tunnel_get_rx_info(dev);
+		udp_tunnel_nic_unlock(dev);
+	}
 
 	return 0;
 }
@@ -832,6 +864,8 @@ udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn)
 {
 	const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
 
+	udp_tunnel_nic_lock(dev);
+
 	/* For a shared table remove this dev from the list of sharing devices
 	 * and if there are other devices just detach.
 	 */
@@ -841,8 +875,10 @@ udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn)
 		list_for_each_entry(node, &info->shared->devices, list)
 			if (node->dev == dev)
 				break;
-		if (list_entry_is_head(node, &info->shared->devices, list))
+		if (list_entry_is_head(node, &info->shared->devices, list)) {
+			udp_tunnel_nic_unlock(dev);
 			return;
+		}
 
 		list_del(&node->list);
 		kfree(node);
@@ -852,6 +888,7 @@ udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn)
 		if (first) {
 			udp_tunnel_drop_rx_info(dev);
 			utn->dev = first->dev;
+			udp_tunnel_nic_unlock(dev);
 			goto release_dev;
 		}
 
@@ -862,6 +899,7 @@ udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn)
 	 * from the work which we will boot immediately.
 	 */
 	udp_tunnel_nic_flush(dev, utn);
+	udp_tunnel_nic_unlock(dev);
 
 	/* Wait for the work to be done using the state, netdev core will
 	 * retry unregister until we give up our reference on this device.
@@ -910,12 +948,16 @@ udp_tunnel_nic_netdevice_event(struct notifier_block *unused,
 		return NOTIFY_DONE;
 
 	if (event == NETDEV_UP) {
+		udp_tunnel_nic_lock(dev);
 		WARN_ON(!udp_tunnel_nic_is_empty(dev, utn));
 		udp_tunnel_get_rx_info(dev);
+		udp_tunnel_nic_unlock(dev);
 		return NOTIFY_OK;
 	}
 	if (event == NETDEV_GOING_DOWN) {
+		udp_tunnel_nic_lock(dev);
 		udp_tunnel_nic_flush(dev, utn);
+		udp_tunnel_nic_unlock(dev);
 		return NOTIFY_OK;
 	}
 
-- 
cgit v1.2.3


From cea465a96a294e7bc2537f27a737cfa7c6234b3d Mon Sep 17 00:00:00 2001
From: David Arinzon <darinzon@amazon.com>
Date: Tue, 17 Jun 2025 14:05:41 +0300
Subject: devlink: Add new "enable_phc" generic device param

Add a new device generic parameter to enable/disable the
PHC (PTP Hardware Clock) functionality in the device associated
with the devlink instance.

Signed-off-by: David Arinzon <darinzon@amazon.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://patch.msgid.link/20250617110545.5659-6-darinzon@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/devlink/devlink-params.rst | 3 +++
 include/net/devlink.h                               | 4 ++++
 net/devlink/param.c                                 | 5 +++++
 3 files changed, 12 insertions(+)

(limited to 'include')

diff --git a/Documentation/networking/devlink/devlink-params.rst b/Documentation/networking/devlink/devlink-params.rst
index 4e01dc32bc08..3da8f4ef2417 100644
--- a/Documentation/networking/devlink/devlink-params.rst
+++ b/Documentation/networking/devlink/devlink-params.rst
@@ -137,3 +137,6 @@ own name.
    * - ``event_eq_size``
      - u32
      - Control the size of asynchronous control events EQ.
+   * - ``enable_phc``
+     - Boolean
+     - Enable PHC (PTP Hardware Clock) functionality in the device.
diff --git a/include/net/devlink.h b/include/net/devlink.h
index 0091f23a40f7..63517646a497 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -520,6 +520,7 @@ enum devlink_param_generic_id {
 	DEVLINK_PARAM_GENERIC_ID_ENABLE_IWARP,
 	DEVLINK_PARAM_GENERIC_ID_IO_EQ_SIZE,
 	DEVLINK_PARAM_GENERIC_ID_EVENT_EQ_SIZE,
+	DEVLINK_PARAM_GENERIC_ID_ENABLE_PHC,
 
 	/* add new param generic ids above here*/
 	__DEVLINK_PARAM_GENERIC_ID_MAX,
@@ -578,6 +579,9 @@ enum devlink_param_generic_id {
 #define DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_NAME "event_eq_size"
 #define DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_TYPE DEVLINK_PARAM_TYPE_U32
 
+#define DEVLINK_PARAM_GENERIC_ENABLE_PHC_NAME "enable_phc"
+#define DEVLINK_PARAM_GENERIC_ENABLE_PHC_TYPE DEVLINK_PARAM_TYPE_BOOL
+
 #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate)	\
 {									\
 	.id = DEVLINK_PARAM_GENERIC_ID_##_id,				\
diff --git a/net/devlink/param.c b/net/devlink/param.c
index b29abf8d3ed4..396b8a7f6013 100644
--- a/net/devlink/param.c
+++ b/net/devlink/param.c
@@ -92,6 +92,11 @@ static const struct devlink_param devlink_param_generic[] = {
 		.name = DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_NAME,
 		.type = DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_TYPE,
 	},
+	{
+		.id = DEVLINK_PARAM_GENERIC_ID_ENABLE_PHC,
+		.name = DEVLINK_PARAM_GENERIC_ENABLE_PHC_NAME,
+		.type = DEVLINK_PARAM_GENERIC_ENABLE_PHC_TYPE,
+	},
 };
 
 static int devlink_param_generic_verify(const struct devlink_param *param)
-- 
cgit v1.2.3


From fa2f0454174c2f33005f5a6e6f70c7160a15b2a1 Mon Sep 17 00:00:00 2001
From: "Kory Maincent (Dent Project)" <kory.maincent@bootlin.com>
Date: Tue, 17 Jun 2025 14:12:00 +0200
Subject: net: pse-pd: Introduce attached_phydev to pse control

In preparation for reporting PSE events via ethtool notifications,
introduce an attached_phydev field in the pse_control structure.
This field stores the phy_device associated with the PSE PI,
ensuring that notifications are sent to the correct network
interface.

The attached_phydev pointer is directly tied to the PHY lifecycle. It
is set when the PHY is registered and cleared when the PHY is removed.
There is no need to use a refcount, as doing so could interfere with
the PHY removal process.

Signed-off-by: Kory Maincent (Dent Project) <kory.maincent@bootlin.com>
Reviewed-by: Oleksij Rempel <o.rempel@pengutronix.de>
Link: https://patch.msgid.link/20250617-feature_poe_port_prio-v14-1-78a1a645e2ee@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/mdio/fwnode_mdio.c | 26 ++++++++++++++------------
 drivers/net/pse-pd/pse_core.c  | 11 ++++++++---
 include/linux/pse-pd/pse.h     |  6 ++++--
 3 files changed, 26 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/drivers/net/mdio/fwnode_mdio.c b/drivers/net/mdio/fwnode_mdio.c
index aea0f0357568..9b41d4697a40 100644
--- a/drivers/net/mdio/fwnode_mdio.c
+++ b/drivers/net/mdio/fwnode_mdio.c
@@ -18,7 +18,8 @@ MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("FWNODE MDIO bus (Ethernet PHY) accessors");
 
 static struct pse_control *
-fwnode_find_pse_control(struct fwnode_handle *fwnode)
+fwnode_find_pse_control(struct fwnode_handle *fwnode,
+			struct phy_device *phydev)
 {
 	struct pse_control *psec;
 	struct device_node *np;
@@ -30,7 +31,7 @@ fwnode_find_pse_control(struct fwnode_handle *fwnode)
 	if (!np)
 		return NULL;
 
-	psec = of_pse_control_get(np);
+	psec = of_pse_control_get(np, phydev);
 	if (PTR_ERR(psec) == -ENOENT)
 		return NULL;
 
@@ -128,15 +129,9 @@ int fwnode_mdiobus_register_phy(struct mii_bus *bus,
 	u32 phy_id;
 	int rc;
 
-	psec = fwnode_find_pse_control(child);
-	if (IS_ERR(psec))
-		return PTR_ERR(psec);
-
 	mii_ts = fwnode_find_mii_timestamper(child);
-	if (IS_ERR(mii_ts)) {
-		rc = PTR_ERR(mii_ts);
-		goto clean_pse;
-	}
+	if (IS_ERR(mii_ts))
+		return PTR_ERR(mii_ts);
 
 	is_c45 = fwnode_device_is_compatible(child, "ethernet-phy-ieee802.3-c45");
 	if (is_c45 || fwnode_get_phy_id(child, &phy_id))
@@ -169,6 +164,12 @@ int fwnode_mdiobus_register_phy(struct mii_bus *bus,
 			goto clean_phy;
 	}
 
+	psec = fwnode_find_pse_control(child, phy);
+	if (IS_ERR(psec)) {
+		rc = PTR_ERR(psec);
+		goto unregister_phy;
+	}
+
 	phy->psec = psec;
 
 	/* phy->mii_ts may already be defined by the PHY driver. A
@@ -180,12 +181,13 @@ int fwnode_mdiobus_register_phy(struct mii_bus *bus,
 
 	return 0;
 
+unregister_phy:
+	if (is_acpi_node(child) || is_of_node(child))
+		phy_device_remove(phy);
 clean_phy:
 	phy_device_free(phy);
 clean_mii_ts:
 	unregister_mii_timestamper(mii_ts);
-clean_pse:
-	pse_control_put(psec);
 
 	return rc;
 }
diff --git a/drivers/net/pse-pd/pse_core.c b/drivers/net/pse-pd/pse_core.c
index 4602e26eb8c8..4610c1f0ddd6 100644
--- a/drivers/net/pse-pd/pse_core.c
+++ b/drivers/net/pse-pd/pse_core.c
@@ -23,6 +23,7 @@ static LIST_HEAD(pse_controller_list);
  * @list: list entry for the pcdev's PSE controller list
  * @id: ID of the PSE line in the PSE controller device
  * @refcnt: Number of gets of this pse_control
+ * @attached_phydev: PHY device pointer attached by the PSE control
  */
 struct pse_control {
 	struct pse_controller_dev *pcdev;
@@ -30,6 +31,7 @@ struct pse_control {
 	struct list_head list;
 	unsigned int id;
 	struct kref refcnt;
+	struct phy_device *attached_phydev;
 };
 
 static int of_load_single_pse_pi_pairset(struct device_node *node,
@@ -599,7 +601,8 @@ void pse_control_put(struct pse_control *psec)
 EXPORT_SYMBOL_GPL(pse_control_put);
 
 static struct pse_control *
-pse_control_get_internal(struct pse_controller_dev *pcdev, unsigned int index)
+pse_control_get_internal(struct pse_controller_dev *pcdev, unsigned int index,
+			 struct phy_device *phydev)
 {
 	struct pse_control *psec;
 	int ret;
@@ -638,6 +641,7 @@ pse_control_get_internal(struct pse_controller_dev *pcdev, unsigned int index)
 	psec->pcdev = pcdev;
 	list_add(&psec->list, &pcdev->pse_control_head);
 	psec->id = index;
+	psec->attached_phydev = phydev;
 	kref_init(&psec->refcnt);
 
 	return psec;
@@ -693,7 +697,8 @@ static int psec_id_xlate(struct pse_controller_dev *pcdev,
 	return pse_spec->args[0];
 }
 
-struct pse_control *of_pse_control_get(struct device_node *node)
+struct pse_control *of_pse_control_get(struct device_node *node,
+				       struct phy_device *phydev)
 {
 	struct pse_controller_dev *r, *pcdev;
 	struct of_phandle_args args;
@@ -743,7 +748,7 @@ struct pse_control *of_pse_control_get(struct device_node *node)
 	}
 
 	/* pse_list_mutex also protects the pcdev's pse_control list */
-	psec = pse_control_get_internal(pcdev, psec_id);
+	psec = pse_control_get_internal(pcdev, psec_id, phydev);
 
 out:
 	mutex_unlock(&pse_list_mutex);
diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h
index c773eeb92d04..8b0866fad2ad 100644
--- a/include/linux/pse-pd/pse.h
+++ b/include/linux/pse-pd/pse.h
@@ -250,7 +250,8 @@ struct device;
 int devm_pse_controller_register(struct device *dev,
 				 struct pse_controller_dev *pcdev);
 
-struct pse_control *of_pse_control_get(struct device_node *node);
+struct pse_control *of_pse_control_get(struct device_node *node,
+				       struct phy_device *phydev);
 void pse_control_put(struct pse_control *psec);
 
 int pse_ethtool_get_status(struct pse_control *psec,
@@ -268,7 +269,8 @@ bool pse_has_c33(struct pse_control *psec);
 
 #else
 
-static inline struct pse_control *of_pse_control_get(struct device_node *node)
+static inline struct pse_control *of_pse_control_get(struct device_node *node,
+						     struct phy_device *phydev)
 {
 	return ERR_PTR(-ENOENT);
 }
-- 
cgit v1.2.3


From fc0e6db30941a66e284b8516b82356f97f31061d Mon Sep 17 00:00:00 2001
From: "Kory Maincent (Dent Project)" <kory.maincent@bootlin.com>
Date: Tue, 17 Jun 2025 14:12:01 +0200
Subject: net: pse-pd: Add support for reporting events

Add support for devm_pse_irq_helper() to register PSE interrupts and report
events such as over-current or over-temperature conditions. This follows a
similar approach to the regulator API but also sends notifications using a
dedicated PSE ethtool netlink socket.

Signed-off-by: Kory Maincent (Dent Project) <kory.maincent@bootlin.com>
Link: https://patch.msgid.link/20250617-feature_poe_port_prio-v14-2-78a1a645e2ee@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/ethtool.yaml       |  34 +++++
 Documentation/networking/ethtool-netlink.rst   |  19 +++
 drivers/net/pse-pd/pse_core.c                  | 179 +++++++++++++++++++++++++
 include/linux/ethtool_netlink.h                |   7 +
 include/linux/pse-pd/pse.h                     |  20 +++
 include/uapi/linux/ethtool_netlink_generated.h |  19 +++
 net/ethtool/pse-pd.c                           |  39 ++++++
 7 files changed, 317 insertions(+)

(limited to 'include')

diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml
index ed9bcdec01cc..92b34a19f308 100644
--- a/Documentation/netlink/specs/ethtool.yaml
+++ b/Documentation/netlink/specs/ethtool.yaml
@@ -118,6 +118,17 @@ definitions:
         doc: |
           Hardware timestamp comes from one PHY device
           of the network topology
+  -
+    name: pse-event
+    doc: PSE event list for the PSE controller
+    type: flags
+    entries:
+      -
+        name: over-current
+        doc: PSE output current is too high
+      -
+        name: over-temp
+        doc: PSE in over temperature state
 
 attribute-sets:
   -
@@ -1555,6 +1566,19 @@ attribute-sets:
         name: hwtstamp-flags
         type: nest
         nested-attributes: bitset
+  -
+    name: pse-ntf
+    attr-cnt-name: --ethtool-a-pse-ntf-cnt
+    attributes:
+      -
+        name: header
+        type: nest
+        nested-attributes: header
+      -
+        name: events
+        type: uint
+        enum: pse-event
+        doc: List of events reported by the PSE controller
 
 operations:
   enum-model: directional
@@ -2413,3 +2437,13 @@ operations:
           attributes: *tsconfig
         reply:
           attributes: *tsconfig
+    -
+      name: pse-ntf
+      doc: Notification for PSE events.
+
+      attribute-set: pse-ntf
+
+      event:
+        attributes:
+          - header
+          - events
diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst
index b6e9af4d0f1b..433737865bc2 100644
--- a/Documentation/networking/ethtool-netlink.rst
+++ b/Documentation/networking/ethtool-netlink.rst
@@ -290,6 +290,7 @@ Kernel to userspace:
   ``ETHTOOL_MSG_PHY_NTF``                  Ethernet PHY information change
   ``ETHTOOL_MSG_TSCONFIG_GET_REPLY``       hw timestamping configuration
   ``ETHTOOL_MSG_TSCONFIG_SET_REPLY``       new hw timestamping configuration
+  ``ETHTOOL_MSG_PSE_NTF``                  PSE events notification
   ======================================== =================================
 
 ``GET`` requests are sent by userspace applications to retrieve device
@@ -1896,6 +1897,24 @@ various existing products that document power consumption in watts rather than
 classes. If power limit configuration based on classes is needed, the
 conversion can be done in user space, for example by ethtool.
 
+PSE_NTF
+=======
+
+Notify PSE events.
+
+Notification contents:
+
+  ===============================  ======  ========================
+  ``ETHTOOL_A_PSE_HEADER``         nested  request header
+  ``ETHTOOL_A_PSE_EVENTS``         uint    PSE events
+  ===============================  ======  ========================
+
+When set, the optional ``ETHTOOL_A_PSE_EVENTS`` attribute identifies the
+PSE events.
+
+.. kernel-doc:: include/uapi/linux/ethtool_netlink_generated.h
+    :identifiers: ethtool_pse_event
+
 RSS_GET
 =======
 
diff --git a/drivers/net/pse-pd/pse_core.c b/drivers/net/pse-pd/pse_core.c
index 4610c1f0ddd6..16cc1dc07246 100644
--- a/drivers/net/pse-pd/pse_core.c
+++ b/drivers/net/pse-pd/pse_core.c
@@ -7,10 +7,14 @@
 
 #include <linux/device.h>
 #include <linux/ethtool.h>
+#include <linux/ethtool_netlink.h>
 #include <linux/of.h>
+#include <linux/phy.h>
 #include <linux/pse-pd/pse.h>
 #include <linux/regulator/driver.h>
 #include <linux/regulator/machine.h>
+#include <linux/rtnetlink.h>
+#include <net/net_trackers.h>
 
 static DEFINE_MUTEX(pse_list_mutex);
 static LIST_HEAD(pse_controller_list);
@@ -210,6 +214,48 @@ out:
 	return ret;
 }
 
+/**
+ * pse_control_find_by_id - Find the pse_control matching a PSE control id
+ * @pcdev: a pointer to the PSE
+ * @id: index of the PSE control
+ *
+ * Return: pse_control pointer or NULL. The device returned has had a
+ *	   reference added and the pointer is safe until the user calls
+ *	   pse_control_put() to indicate they have finished with it.
+ */
+static struct pse_control *
+pse_control_find_by_id(struct pse_controller_dev *pcdev, int id)
+{
+	struct pse_control *psec;
+
+	mutex_lock(&pse_list_mutex);
+	list_for_each_entry(psec, &pcdev->pse_control_head, list) {
+		if (psec->id == id) {
+			kref_get(&psec->refcnt);
+			mutex_unlock(&pse_list_mutex);
+			return psec;
+		}
+	}
+	mutex_unlock(&pse_list_mutex);
+	return NULL;
+}
+
+/**
+ * pse_control_get_netdev - Return netdev associated to a PSE control
+ * @psec: PSE control pointer
+ *
+ * Return: netdev pointer or NULL
+ */
+static struct net_device *pse_control_get_netdev(struct pse_control *psec)
+{
+	ASSERT_RTNL();
+
+	if (!psec || !psec->attached_phydev)
+		return NULL;
+
+	return psec->attached_phydev->attached_dev;
+}
+
 static int pse_pi_is_enabled(struct regulator_dev *rdev)
 {
 	struct pse_controller_dev *pcdev = rdev_get_drvdata(rdev);
@@ -559,6 +605,139 @@ int devm_pse_controller_register(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(devm_pse_controller_register);
 
+struct pse_irq {
+	struct pse_controller_dev *pcdev;
+	struct pse_irq_desc desc;
+	unsigned long *notifs;
+};
+
+/**
+ * pse_to_regulator_notifs - Convert PSE notifications to Regulator
+ *			     notifications
+ * @notifs: PSE notifications
+ *
+ * Return: Regulator notifications
+ */
+static unsigned long pse_to_regulator_notifs(unsigned long notifs)
+{
+	unsigned long rnotifs = 0;
+
+	if (notifs & ETHTOOL_PSE_EVENT_OVER_CURRENT)
+		rnotifs |= REGULATOR_EVENT_OVER_CURRENT;
+	if (notifs & ETHTOOL_PSE_EVENT_OVER_TEMP)
+		rnotifs |= REGULATOR_EVENT_OVER_TEMP;
+
+	return rnotifs;
+}
+
+/**
+ * pse_isr - IRQ handler for PSE
+ * @irq: irq number
+ * @data: pointer to user interrupt structure
+ *
+ * Return: irqreturn_t - status of IRQ
+ */
+static irqreturn_t pse_isr(int irq, void *data)
+{
+	struct pse_controller_dev *pcdev;
+	unsigned long notifs_mask = 0;
+	struct pse_irq_desc *desc;
+	struct pse_irq *h = data;
+	int ret, i;
+
+	desc = &h->desc;
+	pcdev = h->pcdev;
+
+	/* Clear notifs mask */
+	memset(h->notifs, 0, pcdev->nr_lines * sizeof(*h->notifs));
+	mutex_lock(&pcdev->lock);
+	ret = desc->map_event(irq, pcdev, h->notifs, &notifs_mask);
+	mutex_unlock(&pcdev->lock);
+	if (ret || !notifs_mask)
+		return IRQ_NONE;
+
+	for_each_set_bit(i, &notifs_mask, pcdev->nr_lines) {
+		unsigned long notifs, rnotifs;
+		struct net_device *netdev;
+		struct pse_control *psec;
+
+		/* Do nothing PI not described */
+		if (!pcdev->pi[i].rdev)
+			continue;
+
+		notifs = h->notifs[i];
+		dev_dbg(h->pcdev->dev,
+			"Sending PSE notification EVT 0x%lx\n", notifs);
+
+		psec = pse_control_find_by_id(pcdev, i);
+		rtnl_lock();
+		netdev = pse_control_get_netdev(psec);
+		if (netdev)
+			ethnl_pse_send_ntf(netdev, notifs);
+		rtnl_unlock();
+		pse_control_put(psec);
+
+		rnotifs = pse_to_regulator_notifs(notifs);
+		regulator_notifier_call_chain(pcdev->pi[i].rdev, rnotifs,
+					      NULL);
+	}
+
+	return IRQ_HANDLED;
+}
+
+/**
+ * devm_pse_irq_helper - Register IRQ based PSE event notifier
+ * @pcdev: a pointer to the PSE
+ * @irq: the irq value to be passed to request_irq
+ * @irq_flags: the flags to be passed to request_irq
+ * @d: PSE interrupt description
+ *
+ * Return: 0 on success and errno on failure
+ */
+int devm_pse_irq_helper(struct pse_controller_dev *pcdev, int irq,
+			int irq_flags, const struct pse_irq_desc *d)
+{
+	struct device *dev = pcdev->dev;
+	size_t irq_name_len;
+	struct pse_irq *h;
+	char *irq_name;
+	int ret;
+
+	if (!d || !d->map_event || !d->name)
+		return -EINVAL;
+
+	h = devm_kzalloc(dev, sizeof(*h), GFP_KERNEL);
+	if (!h)
+		return -ENOMEM;
+
+	h->pcdev = pcdev;
+	h->desc = *d;
+
+	/* IRQ name len is pcdev dev name + 5 char + irq desc name + 1 */
+	irq_name_len = strlen(dev_name(pcdev->dev)) + 5 + strlen(d->name) + 1;
+	irq_name = devm_kzalloc(dev, irq_name_len, GFP_KERNEL);
+	if (!irq_name)
+		return -ENOMEM;
+
+	snprintf(irq_name, irq_name_len, "pse-%s:%s", dev_name(pcdev->dev),
+		 d->name);
+
+	h->notifs = devm_kcalloc(dev, pcdev->nr_lines,
+				 sizeof(*h->notifs), GFP_KERNEL);
+	if (!h->notifs)
+		return -ENOMEM;
+
+	ret = devm_request_threaded_irq(dev, irq, NULL, pse_isr,
+					IRQF_ONESHOT | irq_flags,
+					irq_name, h);
+	if (ret)
+		dev_err(pcdev->dev, "Failed to request IRQ %d\n", irq);
+
+	pcdev->irq = irq;
+	return ret;
+}
+EXPORT_SYMBOL_GPL(devm_pse_irq_helper);
+
 /* PSE control section */
 
 static void __pse_control_release(struct kref *kref)
diff --git a/include/linux/ethtool_netlink.h b/include/linux/ethtool_netlink.h
index aba91335273a..1dcc4059b5ab 100644
--- a/include/linux/ethtool_netlink.h
+++ b/include/linux/ethtool_netlink.h
@@ -43,6 +43,8 @@ void ethtool_aggregate_rmon_stats(struct net_device *dev,
 				  struct ethtool_rmon_stats *rmon_stats);
 bool ethtool_dev_mm_supported(struct net_device *dev);
 
+void ethnl_pse_send_ntf(struct net_device *netdev, unsigned long notif);
+
 #else
 static inline int ethnl_cable_test_alloc(struct phy_device *phydev, u8 cmd)
 {
@@ -120,6 +122,11 @@ static inline bool ethtool_dev_mm_supported(struct net_device *dev)
 	return false;
 }
 
+static inline void ethnl_pse_send_ntf(struct phy_device *phydev,
+				      unsigned long notif)
+{
+}
+
 #endif /* IS_ENABLED(CONFIG_ETHTOOL_NETLINK) */
 
 static inline int ethnl_cable_test_result(struct phy_device *phydev, u8 pair,
diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h
index 8b0866fad2ad..6eb064722aa8 100644
--- a/include/linux/pse-pd/pse.h
+++ b/include/linux/pse-pd/pse.h
@@ -7,12 +7,15 @@
 
 #include <linux/list.h>
 #include <uapi/linux/ethtool.h>
+#include <uapi/linux/ethtool_netlink_generated.h>
+#include <linux/regulator/driver.h>
 
 /* Maximum current in uA according to IEEE 802.3-2022 Table 145-1 */
 #define MAX_PI_CURRENT 1920000
 /* Maximum power in mW according to IEEE 802.3-2022 Table 145-16 */
 #define MAX_PI_PW 99900
 
+struct net_device;
 struct phy_device;
 struct pse_controller_dev;
 struct netlink_ext_ack;
@@ -37,6 +40,19 @@ struct ethtool_c33_pse_pw_limit_range {
 	u32 max;
 };
 
+/**
+ * struct pse_irq_desc - notification sender description for IRQ based events.
+ *
+ * @name: the visible name for the IRQ
+ * @map_event: driver callback to map IRQ status into PSE devices with events.
+ */
+struct pse_irq_desc {
+	const char *name;
+	int (*map_event)(int irq, struct pse_controller_dev *pcdev,
+			 unsigned long *notifs,
+			 unsigned long *notifs_mask);
+};
+
 /**
  * struct pse_control_config - PSE control/channel configuration.
  *
@@ -228,6 +244,7 @@ struct pse_pi {
  * @types: types of the PSE controller
  * @pi: table of PSE PIs described in this controller device
  * @no_of_pse_pi: flag set if the pse_pis devicetree node is not used
+ * @irq: PSE interrupt
  */
 struct pse_controller_dev {
 	const struct pse_controller_ops *ops;
@@ -241,6 +258,7 @@ struct pse_controller_dev {
 	enum ethtool_pse_types types;
 	struct pse_pi *pi;
 	bool no_of_pse_pi;
+	int irq;
 };
 
 #if IS_ENABLED(CONFIG_PSE_CONTROLLER)
@@ -249,6 +267,8 @@ void pse_controller_unregister(struct pse_controller_dev *pcdev);
 struct device;
 int devm_pse_controller_register(struct device *dev,
 				 struct pse_controller_dev *pcdev);
+int devm_pse_irq_helper(struct pse_controller_dev *pcdev, int irq,
+			int irq_flags, const struct pse_irq_desc *d);
 
 struct pse_control *of_pse_control_get(struct device_node *node,
 				       struct phy_device *phydev);
diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h
index 9a02f579de22..3864aa0de8c7 100644
--- a/include/uapi/linux/ethtool_netlink_generated.h
+++ b/include/uapi/linux/ethtool_netlink_generated.h
@@ -49,6 +49,16 @@ enum hwtstamp_source {
 	HWTSTAMP_SOURCE_PHYLIB,
 };
 
+/**
+ * enum ethtool_pse_event - PSE event list for the PSE controller
+ * @ETHTOOL_PSE_EVENT_OVER_CURRENT: PSE output current is too high
+ * @ETHTOOL_PSE_EVENT_OVER_TEMP: PSE in over temperature state
+ */
+enum ethtool_pse_event {
+	ETHTOOL_PSE_EVENT_OVER_CURRENT = 1,
+	ETHTOOL_PSE_EVENT_OVER_TEMP = 2,
+};
+
 enum {
 	ETHTOOL_A_HEADER_UNSPEC,
 	ETHTOOL_A_HEADER_DEV_INDEX,
@@ -718,6 +728,14 @@ enum {
 	ETHTOOL_A_TSCONFIG_MAX = (__ETHTOOL_A_TSCONFIG_CNT - 1)
 };
 
+enum {
+	ETHTOOL_A_PSE_NTF_HEADER = 1,
+	ETHTOOL_A_PSE_NTF_EVENTS,
+
+	__ETHTOOL_A_PSE_NTF_CNT,
+	ETHTOOL_A_PSE_NTF_MAX = (__ETHTOOL_A_PSE_NTF_CNT - 1)
+};
+
 enum {
 	ETHTOOL_MSG_USER_NONE = 0,
 	ETHTOOL_MSG_STRSET_GET = 1,
@@ -822,6 +840,7 @@ enum {
 	ETHTOOL_MSG_PHY_NTF,
 	ETHTOOL_MSG_TSCONFIG_GET_REPLY,
 	ETHTOOL_MSG_TSCONFIG_SET_REPLY,
+	ETHTOOL_MSG_PSE_NTF,
 
 	__ETHTOOL_MSG_KERNEL_CNT,
 	ETHTOOL_MSG_KERNEL_MAX = (__ETHTOOL_MSG_KERNEL_CNT - 1)
diff --git a/net/ethtool/pse-pd.c b/net/ethtool/pse-pd.c
index 4f6b99eab2a6..5443b4e0065a 100644
--- a/net/ethtool/pse-pd.c
+++ b/net/ethtool/pse-pd.c
@@ -315,3 +315,42 @@ const struct ethnl_request_ops ethnl_pse_request_ops = {
 	.set			= ethnl_set_pse,
 	/* PSE has no notification */
 };
+
+void ethnl_pse_send_ntf(struct net_device *netdev, unsigned long notifs)
+{
+	void *reply_payload;
+	struct sk_buff *skb;
+	int reply_len;
+	int ret;
+
+	ASSERT_RTNL();
+
+	if (!netdev || !notifs)
+		return;
+
+	reply_len = ethnl_reply_header_size() +
+		    nla_total_size(sizeof(u32)); /* _PSE_NTF_EVENTS */
+
+	skb = genlmsg_new(reply_len, GFP_KERNEL);
+	if (!skb)
+		return;
+
+	reply_payload = ethnl_bcastmsg_put(skb, ETHTOOL_MSG_PSE_NTF);
+	if (!reply_payload)
+		goto err_skb;
+
+	ret = ethnl_fill_reply_header(skb, netdev, ETHTOOL_A_PSE_NTF_HEADER);
+	if (ret < 0)
+		goto err_skb;
+
+	if (nla_put_uint(skb, ETHTOOL_A_PSE_NTF_EVENTS, notifs))
+		goto err_skb;
+
+	genlmsg_end(skb, reply_payload);
+	ethnl_multicast(skb, netdev);
+	return;
+
+err_skb:
+	nlmsg_free(skb);
+}
+EXPORT_SYMBOL_GPL(ethnl_pse_send_ntf);
-- 
cgit v1.2.3


From 50f8b341d26826aa5fdccb8f497fbff2500934b3 Mon Sep 17 00:00:00 2001
From: "Kory Maincent (Dent Project)" <kory.maincent@bootlin.com>
Date: Tue, 17 Jun 2025 14:12:03 +0200
Subject: net: pse-pd: Add support for PSE power domains

Introduce PSE power domain support as groundwork for upcoming port
priority features. Multiple PSE PIs can now be grouped under a single
PSE power domain, enabling future enhancements like defining available
power budgets, port priority modes, and disconnection policies. This
setup will allow the system to assess whether activating a port would
exceed the available power budget, preventing over-budget states
proactively.

Signed-off-by: Kory Maincent (Dent Project) <kory.maincent@bootlin.com>
Reviewed-by: Oleksij Rempel <o.rempel@pengutronix.de>
Link: https://patch.msgid.link/20250617-feature_poe_port_prio-v14-4-78a1a645e2ee@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/pse-pd/pse_core.c | 140 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/pse-pd/pse.h    |   2 +
 2 files changed, 142 insertions(+)

(limited to 'include')

diff --git a/drivers/net/pse-pd/pse_core.c b/drivers/net/pse-pd/pse_core.c
index 16cc1dc07246..f2fb7ccbc4c2 100644
--- a/drivers/net/pse-pd/pse_core.c
+++ b/drivers/net/pse-pd/pse_core.c
@@ -16,8 +16,12 @@
 #include <linux/rtnetlink.h>
 #include <net/net_trackers.h>
 
+#define PSE_PW_D_LIMIT INT_MAX
+
 static DEFINE_MUTEX(pse_list_mutex);
 static LIST_HEAD(pse_controller_list);
+static DEFINE_XARRAY_ALLOC(pse_pw_d_map);
+static DEFINE_MUTEX(pse_pw_d_mutex);
 
 /**
  * struct pse_control - a PSE control
@@ -38,6 +42,18 @@ struct pse_control {
 	struct phy_device *attached_phydev;
 };
 
+/**
+ * struct pse_power_domain - a PSE power domain
+ * @id: ID of the power domain
+ * @supply: Power supply the Power Domain
+ * @refcnt: Number of gets of this pse_power_domain
+ */
+struct pse_power_domain {
+	int id;
+	struct regulator *supply;
+	struct kref refcnt;
+};
+
 static int of_load_single_pse_pi_pairset(struct device_node *node,
 					 struct pse_pi *pi,
 					 int pairset_num)
@@ -485,6 +501,125 @@ devm_pse_pi_regulator_register(struct pse_controller_dev *pcdev,
 	return 0;
 }
 
+static void __pse_pw_d_release(struct kref *kref)
+{
+	struct pse_power_domain *pw_d = container_of(kref,
+						     struct pse_power_domain,
+						     refcnt);
+
+	regulator_put(pw_d->supply);
+	xa_erase(&pse_pw_d_map, pw_d->id);
+	mutex_unlock(&pse_pw_d_mutex);
+}
+
+/**
+ * pse_flush_pw_ds - flush all PSE power domains of a PSE
+ * @pcdev: a pointer to the initialized PSE controller device
+ */
+static void pse_flush_pw_ds(struct pse_controller_dev *pcdev)
+{
+	struct pse_power_domain *pw_d;
+	int i;
+
+	for (i = 0; i < pcdev->nr_lines; i++) {
+		if (!pcdev->pi[i].pw_d)
+			continue;
+
+		pw_d = xa_load(&pse_pw_d_map, pcdev->pi[i].pw_d->id);
+		if (!pw_d)
+			continue;
+
+		kref_put_mutex(&pw_d->refcnt, __pse_pw_d_release,
+			       &pse_pw_d_mutex);
+	}
+}
+
+/**
+ * devm_pse_alloc_pw_d - allocate a new PSE power domain for a device
+ * @dev: device that is registering this PSE power domain
+ *
+ * Return: Pointer to the newly allocated PSE power domain or error pointers
+ */
+static struct pse_power_domain *devm_pse_alloc_pw_d(struct device *dev)
+{
+	struct pse_power_domain *pw_d;
+	int index, ret;
+
+	pw_d = devm_kzalloc(dev, sizeof(*pw_d), GFP_KERNEL);
+	if (!pw_d)
+		return ERR_PTR(-ENOMEM);
+
+	ret = xa_alloc(&pse_pw_d_map, &index, pw_d, XA_LIMIT(1, PSE_PW_D_LIMIT),
+		       GFP_KERNEL);
+	if (ret)
+		return ERR_PTR(ret);
+
+	kref_init(&pw_d->refcnt);
+	pw_d->id = index;
+	return pw_d;
+}
+
+/**
+ * pse_register_pw_ds - register the PSE power domains for a PSE
+ * @pcdev: a pointer to the PSE controller device
+ *
+ * Return: 0 on success and failure value on error
+ */
+static int pse_register_pw_ds(struct pse_controller_dev *pcdev)
+{
+	int i, ret = 0;
+
+	mutex_lock(&pse_pw_d_mutex);
+	for (i = 0; i < pcdev->nr_lines; i++) {
+		struct regulator_dev *rdev = pcdev->pi[i].rdev;
+		struct pse_power_domain *pw_d;
+		struct regulator *supply;
+		bool present = false;
+		unsigned long index;
+
+		/* No regulator or regulator parent supply registered.
+		 * We need a regulator parent to register a PSE power domain
+		 */
+		if (!rdev || !rdev->supply)
+			continue;
+
+		xa_for_each(&pse_pw_d_map, index, pw_d) {
+			/* Power supply already registered as a PSE power
+			 * domain.
+			 */
+			if (regulator_is_equal(pw_d->supply, rdev->supply)) {
+				present = true;
+				pcdev->pi[i].pw_d = pw_d;
+				break;
+			}
+		}
+		if (present) {
+			kref_get(&pw_d->refcnt);
+			continue;
+		}
+
+		pw_d = devm_pse_alloc_pw_d(pcdev->dev);
+		if (IS_ERR(pw_d)) {
+			ret = PTR_ERR(pw_d);
+			goto out;
+		}
+
+		supply = regulator_get(&rdev->dev, rdev->supply_name);
+		if (IS_ERR(supply)) {
+			xa_erase(&pse_pw_d_map, pw_d->id);
+			ret = PTR_ERR(supply);
+			goto out;
+		}
+
+		pw_d->supply = supply;
+		pcdev->pi[i].pw_d = pw_d;
+	}
+
+out:
+	mutex_unlock(&pse_pw_d_mutex);
+	return ret;
+}
+
 /**
  * pse_controller_register - register a PSE controller device
  * @pcdev: a pointer to the initialized PSE controller device
@@ -544,6 +679,10 @@ int pse_controller_register(struct pse_controller_dev *pcdev)
 			return ret;
 	}
 
+	ret = pse_register_pw_ds(pcdev);
+	if (ret)
+		return ret;
+
 	mutex_lock(&pse_list_mutex);
 	list_add(&pcdev->list, &pse_controller_list);
 	mutex_unlock(&pse_list_mutex);
@@ -558,6 +697,7 @@ EXPORT_SYMBOL_GPL(pse_controller_register);
  */
 void pse_controller_unregister(struct pse_controller_dev *pcdev)
 {
+	pse_flush_pw_ds(pcdev);
 	pse_release_pis(pcdev);
 	mutex_lock(&pse_list_mutex);
 	list_del(&pcdev->list);
diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h
index 6eb064722aa8..f736b1677ea5 100644
--- a/include/linux/pse-pd/pse.h
+++ b/include/linux/pse-pd/pse.h
@@ -222,12 +222,14 @@ struct pse_pi_pairset {
  * @np: device node pointer of the PSE PI node
  * @rdev: regulator represented by the PSE PI
  * @admin_state_enabled: PI enabled state
+ * @pw_d: Power domain of the PSE PI
  */
 struct pse_pi {
 	struct pse_pi_pairset pairset[2];
 	struct device_node *np;
 	struct regulator_dev *rdev;
 	bool admin_state_enabled;
+	struct pse_power_domain *pw_d;
 };
 
 /**
-- 
cgit v1.2.3


From 1176978ed851952652ddea3685e2f71a0e5d61ff Mon Sep 17 00:00:00 2001
From: "Kory Maincent (Dent Project)" <kory.maincent@bootlin.com>
Date: Tue, 17 Jun 2025 14:12:04 +0200
Subject: net: ethtool: Add support for new power domains index description

Report the index of the newly introduced PSE power domain to the user,
enabling improved management of the power budget for PSE devices.

Signed-off-by: Kory Maincent (Dent Project) <kory.maincent@bootlin.com>
Reviewed-by: Oleksij Rempel <o.rempel@pengutronix.de>
Link: https://patch.msgid.link/20250617-feature_poe_port_prio-v14-5-78a1a645e2ee@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/ethtool.yaml       | 5 +++++
 Documentation/networking/ethtool-netlink.rst   | 4 ++++
 drivers/net/pse-pd/pse_core.c                  | 3 +++
 include/linux/pse-pd/pse.h                     | 2 ++
 include/uapi/linux/ethtool_netlink_generated.h | 1 +
 net/ethtool/pse-pd.c                           | 7 +++++++
 6 files changed, 22 insertions(+)

(limited to 'include')

diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml
index 92b34a19f308..dfd9b842a4e7 100644
--- a/Documentation/netlink/specs/ethtool.yaml
+++ b/Documentation/netlink/specs/ethtool.yaml
@@ -1406,6 +1406,10 @@ attribute-sets:
         type: nest
         multi-attr: true
         nested-attributes: c33-pse-pw-limit
+      -
+        name: pse-pw-d-id
+        type: u32
+        name-prefix: ethtool-a-
   -
     name: rss
     attr-cnt-name: __ethtool-a-rss-cnt
@@ -2229,6 +2233,7 @@ operations:
             - c33-pse-ext-substate
             - c33-pse-avail-pw-limit
             - c33-pse-pw-limit-ranges
+            - pse-pw-d-id
       dump: *pse-get-op
     -
       name: pse-set
diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst
index 433737865bc2..e9af8e58564c 100644
--- a/Documentation/networking/ethtool-netlink.rst
+++ b/Documentation/networking/ethtool-netlink.rst
@@ -1789,6 +1789,7 @@ Kernel response contents:
                                                       limit of the PoE PSE.
   ``ETHTOOL_A_C33_PSE_PW_LIMIT_RANGES``       nested  Supported power limit
                                                       configuration ranges.
+  ``ETHTOOL_A_PSE_PW_D_ID``                      u32  Index of the PSE power domain
   ==========================================  ======  =============================
 
 When set, the optional ``ETHTOOL_A_PODL_PSE_ADMIN_STATE`` attribute identifies
@@ -1862,6 +1863,9 @@ identifies the C33 PSE power limit ranges through
 If the controller works with fixed classes, the min and max values will be
 equal.
 
+The ``ETHTOOL_A_PSE_PW_D_ID`` attribute identifies the index of PSE power
+domain.
+
 PSE_SET
 =======
 
diff --git a/drivers/net/pse-pd/pse_core.c b/drivers/net/pse-pd/pse_core.c
index f2fb7ccbc4c2..7d424c22225e 100644
--- a/drivers/net/pse-pd/pse_core.c
+++ b/drivers/net/pse-pd/pse_core.c
@@ -1098,6 +1098,9 @@ int pse_ethtool_get_status(struct pse_control *psec,
 	pcdev = psec->pcdev;
 	ops = pcdev->ops;
 	mutex_lock(&pcdev->lock);
+	if (pcdev->pi[psec->id].pw_d)
+		status->pw_d_id = pcdev->pi[psec->id].pw_d->id;
+
 	ret = ops->pi_get_admin_state(pcdev, psec->id, &admin_state);
 	if (ret)
 		goto out;
diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h
index f736b1677ea5..2f8ecfd87d43 100644
--- a/include/linux/pse-pd/pse.h
+++ b/include/linux/pse-pd/pse.h
@@ -114,6 +114,7 @@ struct pse_pw_limit_ranges {
 /**
  * struct ethtool_pse_control_status - PSE control/channel status.
  *
+ * @pw_d_id: PSE power domain index.
  * @podl_admin_state: operational state of the PoDL PSE
  *	functions. IEEE 802.3-2018 30.15.1.1.2 aPoDLPSEAdminState
  * @podl_pw_status: power detection status of the PoDL PSE.
@@ -135,6 +136,7 @@ struct pse_pw_limit_ranges {
  *	ranges
  */
 struct ethtool_pse_control_status {
+	u32 pw_d_id;
 	enum ethtool_podl_pse_admin_state podl_admin_state;
 	enum ethtool_podl_pse_pw_d_status podl_pw_status;
 	enum ethtool_c33_pse_admin_state c33_admin_state;
diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h
index 3864aa0de8c7..ed344c8533eb 100644
--- a/include/uapi/linux/ethtool_netlink_generated.h
+++ b/include/uapi/linux/ethtool_netlink_generated.h
@@ -652,6 +652,7 @@ enum {
 	ETHTOOL_A_C33_PSE_EXT_SUBSTATE,
 	ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT,
 	ETHTOOL_A_C33_PSE_PW_LIMIT_RANGES,
+	ETHTOOL_A_PSE_PW_D_ID,
 
 	__ETHTOOL_A_PSE_CNT,
 	ETHTOOL_A_PSE_MAX = (__ETHTOOL_A_PSE_CNT - 1)
diff --git a/net/ethtool/pse-pd.c b/net/ethtool/pse-pd.c
index 5443b4e0065a..6a978a55959e 100644
--- a/net/ethtool/pse-pd.c
+++ b/net/ethtool/pse-pd.c
@@ -83,6 +83,8 @@ static int pse_reply_size(const struct ethnl_req_info *req_base,
 	const struct ethtool_pse_control_status *st = &data->status;
 	int len = 0;
 
+	if (st->pw_d_id)
+		len += nla_total_size(sizeof(u32)); /* _PSE_PW_D_ID */
 	if (st->podl_admin_state > 0)
 		len += nla_total_size(sizeof(u32)); /* _PODL_PSE_ADMIN_STATE */
 	if (st->podl_pw_status > 0)
@@ -148,6 +150,11 @@ static int pse_fill_reply(struct sk_buff *skb,
 	const struct pse_reply_data *data = PSE_REPDATA(reply_base);
 	const struct ethtool_pse_control_status *st = &data->status;
 
+	if (st->pw_d_id &&
+	    nla_put_u32(skb, ETHTOOL_A_PSE_PW_D_ID,
+			st->pw_d_id))
+		return -EMSGSIZE;
+
 	if (st->podl_admin_state > 0 &&
 	    nla_put_u32(skb, ETHTOOL_A_PODL_PSE_ADMIN_STATE,
 			st->podl_admin_state))
-- 
cgit v1.2.3


From ffef61d6d27374542f1bce4452200d9bdd2e1edd Mon Sep 17 00:00:00 2001
From: "Kory Maincent (Dent Project)" <kory.maincent@bootlin.com>
Date: Tue, 17 Jun 2025 14:12:06 +0200
Subject: net: pse-pd: Add support for budget evaluation strategies
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch introduces the ability to configure the PSE PI budget evaluation
strategies. Budget evaluation strategies are utilized by PSE controllers to
determine which ports to turn off first in scenarios such as power budget
exceedance.

The pis_prio_max value is used to define the maximum priority level
supported by the controller. Both the current priority and the maximum
priority are exposed to the user through the pse_ethtool_get_status call.

This patch adds support for two budget evaluation strategies:
1. Static Method:

   This method involves distributing power based on PD classification.
   It’s straightforward and stable: the PSE core keeps track of the
   budget and subtracts the power requested by each PD’s class.

   Advantages: Every PD gets its promised power at any time, which
   guarantees reliability.

   Disadvantages: PD classification steps are large, meaning devices
   request much more power than they actually need. As a result, the power
   supply may only operate at, say, 50% capacity, which is inefficient and
   wastes money.

   The maximum priority value matches the number of PSE PIs within the PSE.

2. Dynamic Method:

   To address the inefficiencies of the static method, vendors like
   Microchip have introduced dynamic power budgeting, as seen in the
   PD692x0 firmware. This method monitors the current consumption per port
   and subtracts it from the available power budget. When the budget is
   exceeded, lower-priority ports are shut down.

   Advantages: This method optimizes resource utilization, saving costs.

   Disadvantages: Low-priority devices may experience instability.

   The maximum priority value is set by the PSE controller driver.

For now, budget evaluation methods are not configurable and cannot be
mixed. They are hardcoded in the PSE driver itself, as no current PSE
controller supports both methods.

Signed-off-by: Kory Maincent (Dent Project) <kory.maincent@bootlin.com>
Acked-by: Oleksij Rempel <o.rempel@pengutronix.de>
Link: https://patch.msgid.link/20250617-feature_poe_port_prio-v14-7-78a1a645e2ee@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/ethtool.yaml       |  30 +-
 drivers/net/pse-pd/pse_core.c                  | 731 +++++++++++++++++++++++--
 include/linux/pse-pd/pse.h                     |  76 +++
 include/uapi/linux/ethtool_netlink_generated.h |  18 +
 4 files changed, 815 insertions(+), 40 deletions(-)

(limited to 'include')

diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml
index dfd9b842a4e7..7a9a857370e2 100644
--- a/Documentation/netlink/specs/ethtool.yaml
+++ b/Documentation/netlink/specs/ethtool.yaml
@@ -122,13 +122,39 @@ definitions:
     name: pse-event
     doc: PSE event list for the PSE controller
     type: flags
+    name-prefix: ethtool-
     entries:
       -
-        name: over-current
+        name: pse-event-over-current
         doc: PSE output current is too high
       -
-        name: over-temp
+        name: pse-event-over-temp
         doc: PSE in over temperature state
+      -
+        name: c33-pse-event-detection
+        doc: |
+          detection process occur on the PSE. IEEE 802.3-2022 33.2.5 and
+          145.2.6 PSE detection of PDs. IEEE 802.3-202 30.9.1.1.5
+          aPSEPowerDetectionStatus
+      -
+        name: c33-pse-event-classification
+        doc: |
+          classification process occur on the PSE. IEEE 802.3-2022 33.2.6
+          and 145.2.8 classification of PDs mutual identification.
+          IEEE 802.3-2022 30.9.1.1.8 aPSEPowerClassification.
+      -
+        name: c33-pse-event-disconnection
+        doc: |
+          PD has been disconnected on the PSE. IEEE 802.3-2022 33.3.8
+          and 145.3.9 PD Maintain Power Signature. IEEE 802.3-2022
+          33.5.1.2.9 MPS Absent. IEEE 802.3-2022 30.9.1.1.20
+          aPSEMPSAbsentCounter.
+      -
+        name: pse-event-over-budget
+        doc: PSE turned off due to over budget situation
+      -
+        name: pse-event-sw-pw-control-error
+        doc: PSE faced an error managing the power control from software
 
 attribute-sets:
   -
diff --git a/drivers/net/pse-pd/pse_core.c b/drivers/net/pse-pd/pse_core.c
index 495d72f98029..23eb3c9d0bcd 100644
--- a/drivers/net/pse-pd/pse_core.c
+++ b/drivers/net/pse-pd/pse_core.c
@@ -47,11 +47,14 @@ struct pse_control {
  * @id: ID of the power domain
  * @supply: Power supply the Power Domain
  * @refcnt: Number of gets of this pse_power_domain
+ * @budget_eval_strategy: Current power budget evaluation strategy of the
+ *			  power domain
  */
 struct pse_power_domain {
 	int id;
 	struct regulator *supply;
 	struct kref refcnt;
+	u32 budget_eval_strategy;
 };
 
 static int of_load_single_pse_pi_pairset(struct device_node *node,
@@ -297,6 +300,115 @@ static int pse_pi_is_hw_enabled(struct pse_controller_dev *pcdev, int id)
 	return 0;
 }
 
+/**
+ * pse_pi_is_admin_enable_pending - Check if PI is in admin enable pending state
+ *				    which mean the power is not yet being
+ *				    delivered
+ * @pcdev: a pointer to the PSE controller device
+ * @id: Index of the PI
+ *
+ * Detects if a PI is enabled in software with a PD detected, but the hardware
+ * admin state hasn't been applied yet.
+ *
+ * This function is used in the power delivery and retry mechanisms to determine
+ * which PIs need to have power delivery attempted again.
+ *
+ * Return: true if the PI has admin enable flag set in software but not yet
+ *	   reflected in the hardware admin state, false otherwise.
+ */
+static bool
+pse_pi_is_admin_enable_pending(struct pse_controller_dev *pcdev, int id)
+{
+	int ret;
+
+	/* PI not enabled or nothing is plugged */
+	if (!pcdev->pi[id].admin_state_enabled ||
+	    !pcdev->pi[id].isr_pd_detected)
+		return false;
+
+	ret = pse_pi_is_hw_enabled(pcdev, id);
+	/* PSE PI is already enabled at hardware level */
+	if (ret == 1)
+		return false;
+
+	return true;
+}
+
+static int _pse_pi_delivery_power_sw_pw_ctrl(struct pse_controller_dev *pcdev,
+					     int id,
+					     struct netlink_ext_ack *extack);
+
+/**
+ * pse_pw_d_retry_power_delivery - Retry power delivery for pending ports in a
+ *				   PSE power domain
+ * @pcdev: a pointer to the PSE controller device
+ * @pw_d: a pointer to the PSE power domain
+ *
+ * Scans all ports in the specified power domain and attempts to enable power
+ * delivery to any ports that have admin enable state set but don't yet have
+ * hardware power enabled. Used when there are changes in connection status,
+ * admin state, or priority that might allow previously unpowered ports to
+ * receive power, especially in over-budget conditions.
+ */
+static void pse_pw_d_retry_power_delivery(struct pse_controller_dev *pcdev,
+					  struct pse_power_domain *pw_d)
+{
+	int i, ret = 0;
+
+	for (i = 0; i < pcdev->nr_lines; i++) {
+		int prio_max = pcdev->nr_lines;
+		struct netlink_ext_ack extack;
+
+		if (pcdev->pi[i].pw_d != pw_d)
+			continue;
+
+		if (!pse_pi_is_admin_enable_pending(pcdev, i))
+			continue;
+
+		/* Do not try to enable PI with a lower prio (higher value)
+		 * than one which already can't be enabled.
+		 */
+		if (pcdev->pi[i].prio > prio_max)
+			continue;
+
+		ret = _pse_pi_delivery_power_sw_pw_ctrl(pcdev, i, &extack);
+		if (ret == -ERANGE)
+			prio_max = pcdev->pi[i].prio;
+	}
+}
+
+/**
+ * pse_pw_d_is_sw_pw_control - Determine if power control is software managed
+ * @pcdev: a pointer to the PSE controller device
+ * @pw_d: a pointer to the PSE power domain
+ *
+ * This function determines whether the power control for a specific power
+ * domain is managed by software in the interrupt handler rather than directly
+ * by hardware.
+ *
+ * Software power control is active in the following cases:
+ * - When the budget evaluation strategy is set to static
+ * - When the budget evaluation strategy is disabled but the PSE controller
+ *   has an interrupt handler that can report if a Powered Device is connected
+ *
+ * Return: true if the power control of the power domain is managed by software,
+ *         false otherwise
+ */
+static bool pse_pw_d_is_sw_pw_control(struct pse_controller_dev *pcdev,
+				      struct pse_power_domain *pw_d)
+{
+	if (!pw_d)
+		return false;
+
+	if (pw_d->budget_eval_strategy == PSE_BUDGET_EVAL_STRAT_STATIC)
+		return true;
+	if (pw_d->budget_eval_strategy == PSE_BUDGET_EVAL_STRAT_DISABLED &&
+	    pcdev->ops->pi_enable && pcdev->irq)
+		return true;
+
+	return false;
+}
+
 static int pse_pi_is_enabled(struct regulator_dev *rdev)
 {
 	struct pse_controller_dev *pcdev = rdev_get_drvdata(rdev);
@@ -309,17 +421,252 @@ static int pse_pi_is_enabled(struct regulator_dev *rdev)
 
 	id = rdev_get_id(rdev);
 	mutex_lock(&pcdev->lock);
+	if (pse_pw_d_is_sw_pw_control(pcdev, pcdev->pi[id].pw_d)) {
+		ret = pcdev->pi[id].admin_state_enabled;
+		goto out;
+	}
+
 	ret = pse_pi_is_hw_enabled(pcdev, id);
+
+out:
 	mutex_unlock(&pcdev->lock);
 
 	return ret;
 }
 
+/**
+ * pse_pi_deallocate_pw_budget - Deallocate power budget of the PI
+ * @pi: a pointer to the PSE PI
+ */
+static void pse_pi_deallocate_pw_budget(struct pse_pi *pi)
+{
+	if (!pi->pw_d || !pi->pw_allocated_mW)
+		return;
+
+	regulator_free_power_budget(pi->pw_d->supply, pi->pw_allocated_mW);
+	pi->pw_allocated_mW = 0;
+}
+
+/**
+ * _pse_pi_disable - Call disable operation. Assumes the PSE lock has been
+ *		     acquired.
+ * @pcdev: a pointer to the PSE
+ * @id: index of the PSE control
+ *
+ * Return: 0 on success and failure value on error
+ */
+static int _pse_pi_disable(struct pse_controller_dev *pcdev, int id)
+{
+	const struct pse_controller_ops *ops = pcdev->ops;
+	int ret;
+
+	if (!ops->pi_disable)
+		return -EOPNOTSUPP;
+
+	ret = ops->pi_disable(pcdev, id);
+	if (ret)
+		return ret;
+
+	pse_pi_deallocate_pw_budget(&pcdev->pi[id]);
+
+	if (pse_pw_d_is_sw_pw_control(pcdev, pcdev->pi[id].pw_d))
+		pse_pw_d_retry_power_delivery(pcdev, pcdev->pi[id].pw_d);
+
+	return 0;
+}
+
+/**
+ * pse_disable_pi_pol - Disable a PI on a power budget policy
+ * @pcdev: a pointer to the PSE
+ * @id: index of the PSE PI
+ *
+ * Return: 0 on success and failure value on error
+ */
+static int pse_disable_pi_pol(struct pse_controller_dev *pcdev, int id)
+{
+	unsigned long notifs = ETHTOOL_PSE_EVENT_OVER_BUDGET;
+	struct pse_ntf ntf = {};
+	int ret;
+
+	dev_dbg(pcdev->dev, "Disabling PI %d to free power budget\n", id);
+
+	ret = _pse_pi_disable(pcdev, id);
+	if (ret)
+		notifs |= ETHTOOL_PSE_EVENT_SW_PW_CONTROL_ERROR;
+
+	ntf.notifs = notifs;
+	ntf.id = id;
+	kfifo_in_spinlocked(&pcdev->ntf_fifo, &ntf, 1, &pcdev->ntf_fifo_lock);
+	schedule_work(&pcdev->ntf_work);
+
+	return ret;
+}
+
+/**
+ * pse_disable_pi_prio - Disable all PIs of a given priority inside a PSE
+ *			 power domain
+ * @pcdev: a pointer to the PSE
+ * @pw_d: a pointer to the PSE power domain
+ * @prio: priority
+ *
+ * Return: 0 on success and failure value on error
+ */
+static int pse_disable_pi_prio(struct pse_controller_dev *pcdev,
+			       struct pse_power_domain *pw_d,
+			       int prio)
+{
+	int i;
+
+	for (i = 0; i < pcdev->nr_lines; i++) {
+		int ret;
+
+		if (pcdev->pi[i].prio != prio ||
+		    pcdev->pi[i].pw_d != pw_d ||
+		    pse_pi_is_hw_enabled(pcdev, i) <= 0)
+			continue;
+
+		ret = pse_disable_pi_pol(pcdev, i);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/**
+ * pse_pi_allocate_pw_budget_static_prio - Allocate power budget for the PI
+ *					   when the budget eval strategy is
+ *					   static
+ * @pcdev: a pointer to the PSE
+ * @id: index of the PSE control
+ * @pw_req: power requested in mW
+ * @extack: extack for error reporting
+ *
+ * Allocates power using static budget evaluation strategy, where allocation
+ * is based on PD classification. When insufficient budget is available,
+ * lower-priority ports (higher priority numbers) are turned off first.
+ *
+ * Return: 0 on success and failure value on error
+ */
+static int
+pse_pi_allocate_pw_budget_static_prio(struct pse_controller_dev *pcdev, int id,
+				      int pw_req, struct netlink_ext_ack *extack)
+{
+	struct pse_pi *pi = &pcdev->pi[id];
+	int ret, _prio;
+
+	_prio = pcdev->nr_lines;
+	while (regulator_request_power_budget(pi->pw_d->supply, pw_req) == -ERANGE) {
+		if (_prio <= pi->prio) {
+			NL_SET_ERR_MSG_FMT(extack,
+					   "PI %d: not enough power budget available",
+					   id);
+			return -ERANGE;
+		}
+
+		ret = pse_disable_pi_prio(pcdev, pi->pw_d, _prio);
+		if (ret < 0)
+			return ret;
+
+		_prio--;
+	}
+
+	pi->pw_allocated_mW = pw_req;
+	return 0;
+}
+
+/**
+ * pse_pi_allocate_pw_budget - Allocate power budget for the PI
+ * @pcdev: a pointer to the PSE
+ * @id: index of the PSE control
+ * @pw_req: power requested in mW
+ * @extack: extack for error reporting
+ *
+ * Return: 0 on success and failure value on error
+ */
+static int pse_pi_allocate_pw_budget(struct pse_controller_dev *pcdev, int id,
+				     int pw_req, struct netlink_ext_ack *extack)
+{
+	struct pse_pi *pi = &pcdev->pi[id];
+
+	if (!pi->pw_d)
+		return 0;
+
+	/* PSE_BUDGET_EVAL_STRAT_STATIC */
+	if (pi->pw_d->budget_eval_strategy == PSE_BUDGET_EVAL_STRAT_STATIC)
+		return pse_pi_allocate_pw_budget_static_prio(pcdev, id, pw_req,
+							     extack);
+
+	return 0;
+}
+
+/**
+ * _pse_pi_delivery_power_sw_pw_ctrl - Enable PSE PI in case of software power
+ *				       control. Assumes the PSE lock has been
+ *				       acquired.
+ * @pcdev: a pointer to the PSE
+ * @id: index of the PSE control
+ * @extack: extack for error reporting
+ *
+ * Return: 0 on success and failure value on error
+ */
+static int _pse_pi_delivery_power_sw_pw_ctrl(struct pse_controller_dev *pcdev,
+					     int id,
+					     struct netlink_ext_ack *extack)
+{
+	const struct pse_controller_ops *ops = pcdev->ops;
+	struct pse_pi *pi = &pcdev->pi[id];
+	int ret, pw_req;
+
+	if (!ops->pi_get_pw_req) {
+		/* No power allocation management */
+		ret = ops->pi_enable(pcdev, id);
+		if (ret)
+			NL_SET_ERR_MSG_FMT(extack,
+					   "PI %d: enable error %d",
+					   id, ret);
+		return ret;
+	}
+
+	ret = ops->pi_get_pw_req(pcdev, id);
+	if (ret < 0)
+		return ret;
+
+	pw_req = ret;
+
+	/* Compare requested power with port power limit and use the lowest
+	 * one.
+	 */
+	if (ops->pi_get_pw_limit) {
+		ret = ops->pi_get_pw_limit(pcdev, id);
+		if (ret < 0)
+			return ret;
+
+		if (ret < pw_req)
+			pw_req = ret;
+	}
+
+	ret = pse_pi_allocate_pw_budget(pcdev, id, pw_req, extack);
+	if (ret)
+		return ret;
+
+	ret = ops->pi_enable(pcdev, id);
+	if (ret) {
+		pse_pi_deallocate_pw_budget(pi);
+		NL_SET_ERR_MSG_FMT(extack,
+				   "PI %d: enable error %d",
+				   id, ret);
+		return ret;
+	}
+
+	return 0;
+}
+
 static int pse_pi_enable(struct regulator_dev *rdev)
 {
 	struct pse_controller_dev *pcdev = rdev_get_drvdata(rdev);
 	const struct pse_controller_ops *ops;
-	int id, ret;
+	int id, ret = 0;
 
 	ops = pcdev->ops;
 	if (!ops->pi_enable)
@@ -327,6 +674,23 @@ static int pse_pi_enable(struct regulator_dev *rdev)
 
 	id = rdev_get_id(rdev);
 	mutex_lock(&pcdev->lock);
+	if (pse_pw_d_is_sw_pw_control(pcdev, pcdev->pi[id].pw_d)) {
+		/* Manage enabled status by software.
+		 * Real enable process will happen if a port is connected.
+		 */
+		if (pcdev->pi[id].isr_pd_detected) {
+			struct netlink_ext_ack extack;
+
+			ret = _pse_pi_delivery_power_sw_pw_ctrl(pcdev, id, &extack);
+		}
+		if (!ret || ret == -ERANGE) {
+			pcdev->pi[id].admin_state_enabled = 1;
+			ret = 0;
+		}
+		mutex_unlock(&pcdev->lock);
+		return ret;
+	}
+
 	ret = ops->pi_enable(pcdev, id);
 	if (!ret)
 		pcdev->pi[id].admin_state_enabled = 1;
@@ -338,21 +702,18 @@ static int pse_pi_enable(struct regulator_dev *rdev)
 static int pse_pi_disable(struct regulator_dev *rdev)
 {
 	struct pse_controller_dev *pcdev = rdev_get_drvdata(rdev);
-	const struct pse_controller_ops *ops;
+	struct pse_pi *pi;
 	int id, ret;
 
-	ops = pcdev->ops;
-	if (!ops->pi_disable)
-		return -EOPNOTSUPP;
-
 	id = rdev_get_id(rdev);
+	pi = &pcdev->pi[id];
 	mutex_lock(&pcdev->lock);
-	ret = ops->pi_disable(pcdev, id);
+	ret = _pse_pi_disable(pcdev, id);
 	if (!ret)
-		pcdev->pi[id].admin_state_enabled = 0;
-	mutex_unlock(&pcdev->lock);
+		pi->admin_state_enabled = 0;
 
-	return ret;
+	mutex_unlock(&pcdev->lock);
+	return 0;
 }
 
 static int _pse_pi_get_voltage(struct regulator_dev *rdev)
@@ -628,6 +989,11 @@ static int pse_register_pw_ds(struct pse_controller_dev *pcdev)
 		}
 
 		pw_d->supply = supply;
+		if (pcdev->supp_budget_eval_strategies)
+			pw_d->budget_eval_strategy = pcdev->supp_budget_eval_strategies;
+		else
+			pw_d->budget_eval_strategy = PSE_BUDGET_EVAL_STRAT_DISABLED;
+		kref_init(&pw_d->refcnt);
 		pcdev->pi[i].pw_d = pw_d;
 	}
 
@@ -636,6 +1002,34 @@ out:
 	return ret;
 }
 
+/**
+ * pse_send_ntf_worker - Worker to send PSE notifications
+ * @work: work object
+ *
+ * Manage and send PSE netlink notifications using a workqueue to avoid
+ * deadlock between pcdev_lock and pse_list_mutex.
+ */
+static void pse_send_ntf_worker(struct work_struct *work)
+{
+	struct pse_controller_dev *pcdev;
+	struct pse_ntf ntf;
+
+	pcdev = container_of(work, struct pse_controller_dev, ntf_work);
+
+	while (kfifo_out(&pcdev->ntf_fifo, &ntf, 1)) {
+		struct net_device *netdev;
+		struct pse_control *psec;
+
+		psec = pse_control_find_by_id(pcdev, ntf.id);
+		rtnl_lock();
+		netdev = pse_control_get_netdev(psec);
+		if (netdev)
+			ethnl_pse_send_ntf(netdev, ntf.notifs);
+		rtnl_unlock();
+		pse_control_put(psec);
+	}
+}
+
 /**
  * pse_controller_register - register a PSE controller device
  * @pcdev: a pointer to the initialized PSE controller device
@@ -649,6 +1043,13 @@ int pse_controller_register(struct pse_controller_dev *pcdev)
 
 	mutex_init(&pcdev->lock);
 	INIT_LIST_HEAD(&pcdev->pse_control_head);
+	spin_lock_init(&pcdev->ntf_fifo_lock);
+	ret = kfifo_alloc(&pcdev->ntf_fifo, pcdev->nr_lines, GFP_KERNEL);
+	if (ret) {
+		dev_err(pcdev->dev, "failed to allocate kfifo notifications\n");
+		return ret;
+	}
+	INIT_WORK(&pcdev->ntf_work, pse_send_ntf_worker);
 
 	if (!pcdev->nr_lines)
 		pcdev->nr_lines = 1;
@@ -715,6 +1116,10 @@ void pse_controller_unregister(struct pse_controller_dev *pcdev)
 {
 	pse_flush_pw_ds(pcdev);
 	pse_release_pis(pcdev);
+	if (pcdev->irq)
+		disable_irq(pcdev->irq);
+	cancel_work_sync(&pcdev->ntf_work);
+	kfifo_free(&pcdev->ntf_fifo);
 	mutex_lock(&pse_list_mutex);
 	list_del(&pcdev->list);
 	mutex_unlock(&pse_list_mutex);
@@ -786,6 +1191,52 @@ static unsigned long pse_to_regulator_notifs(unsigned long notifs)
 	return rnotifs;
 }
 
+/**
+ * pse_set_config_isr - Set PSE control config according to the PSE
+ *			notifications
+ * @pcdev: a pointer to the PSE
+ * @id: index of the PSE control
+ * @notifs: PSE event notifications
+ *
+ * Return: 0 on success and failure value on error
+ */
+static int pse_set_config_isr(struct pse_controller_dev *pcdev, int id,
+			      unsigned long notifs)
+{
+	struct pse_pi *pi = &pcdev->pi[id];
+	int ret = 0;
+
+	/* Dynamic strategy is managed by the PSE controller, not by software */
+	if (pi->pw_d && pi->pw_d->budget_eval_strategy == PSE_BUDGET_EVAL_STRAT_DYNAMIC)
+		return 0;
+
+	if ((notifs & ETHTOOL_C33_PSE_EVENT_DISCONNECTION) &&
+	    ((notifs & ETHTOOL_C33_PSE_EVENT_DETECTION) ||
+	     (notifs & ETHTOOL_C33_PSE_EVENT_CLASSIFICATION))) {
+		dev_dbg(pcdev->dev,
+			"PI %d: error, connection and disconnection reported simultaneously",
+			id);
+		return -EINVAL;
+	}
+
+	if (notifs & ETHTOOL_C33_PSE_EVENT_CLASSIFICATION) {
+		struct netlink_ext_ack extack;
+
+		pi->isr_pd_detected = true;
+		if (pi->admin_state_enabled) {
+			ret = _pse_pi_delivery_power_sw_pw_ctrl(pcdev, id, &extack);
+			if (ret == -ERANGE)
+				ret = 0;
+		}
+	} else if (notifs & ETHTOOL_C33_PSE_EVENT_DISCONNECTION) {
+		if (pi->admin_state_enabled && pi->isr_pd_detected)
+			ret = _pse_pi_disable(pcdev, id);
+		pi->isr_pd_detected = false;
+	}
+
+	return ret;
+}
+
 /**
  * pse_isr - IRQ handler for PSE
  * @irq: irq number
@@ -808,36 +1259,42 @@ static irqreturn_t pse_isr(int irq, void *data)
 	memset(h->notifs, 0, pcdev->nr_lines * sizeof(*h->notifs));
 	mutex_lock(&pcdev->lock);
 	ret = desc->map_event(irq, pcdev, h->notifs, &notifs_mask);
-	mutex_unlock(&pcdev->lock);
-	if (ret || !notifs_mask)
+	if (ret || !notifs_mask) {
+		mutex_unlock(&pcdev->lock);
 		return IRQ_NONE;
+	}
 
 	for_each_set_bit(i, &notifs_mask, pcdev->nr_lines) {
 		unsigned long notifs, rnotifs;
-		struct net_device *netdev;
-		struct pse_control *psec;
+		struct pse_ntf ntf = {};
 
 		/* Do nothing PI not described */
 		if (!pcdev->pi[i].rdev)
 			continue;
 
 		notifs = h->notifs[i];
+		if (pse_pw_d_is_sw_pw_control(pcdev, pcdev->pi[i].pw_d)) {
+			ret = pse_set_config_isr(pcdev, i, notifs);
+			if (ret)
+				notifs |= ETHTOOL_PSE_EVENT_SW_PW_CONTROL_ERROR;
+		}
+
 		dev_dbg(h->pcdev->dev,
 			"Sending PSE notification EVT 0x%lx\n", notifs);
 
-		psec = pse_control_find_by_id(pcdev, i);
-		rtnl_lock();
-		netdev = pse_control_get_netdev(psec);
-		if (netdev)
-			ethnl_pse_send_ntf(netdev, notifs);
-		rtnl_unlock();
-		pse_control_put(psec);
+		ntf.notifs = notifs;
+		ntf.id = i;
+		kfifo_in_spinlocked(&pcdev->ntf_fifo, &ntf, 1,
+				    &pcdev->ntf_fifo_lock);
+		schedule_work(&pcdev->ntf_work);
 
 		rnotifs = pse_to_regulator_notifs(notifs);
 		regulator_notifier_call_chain(pcdev->pi[i].rdev, rnotifs,
 					      NULL);
 	}
 
+	mutex_unlock(&pcdev->lock);
+
 	return IRQ_HANDLED;
 }
 
@@ -960,6 +1417,20 @@ pse_control_get_internal(struct pse_controller_dev *pcdev, unsigned int index,
 		goto free_psec;
 	}
 
+	if (!pcdev->ops->pi_get_admin_state) {
+		ret = -EOPNOTSUPP;
+		goto free_psec;
+	}
+
+	/* Initialize admin_state_enabled before the regulator_get. This
+	 * aims to have the right value reported in the first is_enabled
+	 * call in case of control managed by software.
+	 */
+	ret = pse_pi_is_hw_enabled(pcdev, index);
+	if (ret < 0)
+		goto free_psec;
+
+	pcdev->pi[index].admin_state_enabled = ret;
 	psec->ps = devm_regulator_get_exclusive(pcdev->dev,
 						rdev_get_name(pcdev->pi[index].rdev));
 	if (IS_ERR(psec->ps)) {
@@ -967,12 +1438,6 @@ pse_control_get_internal(struct pse_controller_dev *pcdev, unsigned int index,
 		goto put_module;
 	}
 
-	ret = regulator_is_enabled(psec->ps);
-	if (ret < 0)
-		goto regulator_put;
-
-	pcdev->pi[index].admin_state_enabled = ret;
-
 	psec->pcdev = pcdev;
 	list_add(&psec->list, &pcdev->pse_control_head);
 	psec->id = index;
@@ -981,8 +1446,6 @@ pse_control_get_internal(struct pse_controller_dev *pcdev, unsigned int index,
 
 	return psec;
 
-regulator_put:
-	devm_regulator_put(psec->ps);
 put_module:
 	module_put(pcdev->owner);
 free_psec:
@@ -1093,6 +1556,35 @@ out:
 }
 EXPORT_SYMBOL_GPL(of_pse_control_get);
 
+/**
+ * pse_get_sw_admin_state - Convert the software admin state to c33 or podl
+ *			    admin state value used in the standard
+ * @psec: PSE control pointer
+ * @admin_state: a pointer to the admin_state structure
+ */
+static void pse_get_sw_admin_state(struct pse_control *psec,
+				   struct pse_admin_state *admin_state)
+{
+	struct pse_pi *pi = &psec->pcdev->pi[psec->id];
+
+	if (pse_has_podl(psec)) {
+		if (pi->admin_state_enabled)
+			admin_state->podl_admin_state =
+				ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED;
+		else
+			admin_state->podl_admin_state =
+				ETHTOOL_PODL_PSE_ADMIN_STATE_DISABLED;
+	}
+	if (pse_has_c33(psec)) {
+		if (pi->admin_state_enabled)
+			admin_state->c33_admin_state =
+				ETHTOOL_C33_PSE_ADMIN_STATE_ENABLED;
+		else
+			admin_state->c33_admin_state =
+				ETHTOOL_C33_PSE_ADMIN_STATE_DISABLED;
+	}
+}
+
 /**
  * pse_ethtool_get_status - get status of PSE control
  * @psec: PSE control pointer
@@ -1109,19 +1601,46 @@ int pse_ethtool_get_status(struct pse_control *psec,
 	struct pse_pw_status pw_status = {0};
 	const struct pse_controller_ops *ops;
 	struct pse_controller_dev *pcdev;
+	struct pse_pi *pi;
 	int ret;
 
 	pcdev = psec->pcdev;
 	ops = pcdev->ops;
+
+	pi = &pcdev->pi[psec->id];
 	mutex_lock(&pcdev->lock);
-	if (pcdev->pi[psec->id].pw_d)
-		status->pw_d_id = pcdev->pi[psec->id].pw_d->id;
+	if (pi->pw_d) {
+		status->pw_d_id = pi->pw_d->id;
+		if (pse_pw_d_is_sw_pw_control(pcdev, pi->pw_d)) {
+			pse_get_sw_admin_state(psec, &admin_state);
+		} else {
+			ret = ops->pi_get_admin_state(pcdev, psec->id,
+						      &admin_state);
+			if (ret)
+				goto out;
+		}
+		status->podl_admin_state = admin_state.podl_admin_state;
+		status->c33_admin_state = admin_state.c33_admin_state;
 
-	ret = ops->pi_get_admin_state(pcdev, psec->id, &admin_state);
-	if (ret)
-		goto out;
-	status->podl_admin_state = admin_state.podl_admin_state;
-	status->c33_admin_state = admin_state.c33_admin_state;
+		switch (pi->pw_d->budget_eval_strategy) {
+		case PSE_BUDGET_EVAL_STRAT_STATIC:
+			status->prio_max = pcdev->nr_lines - 1;
+			status->prio = pi->prio;
+			break;
+		case PSE_BUDGET_EVAL_STRAT_DYNAMIC:
+			status->prio_max = pcdev->pis_prio_max;
+			if (ops->pi_get_prio) {
+				ret = ops->pi_get_prio(pcdev, psec->id);
+				if (ret < 0)
+					goto out;
+
+				status->prio = ret;
+			}
+			break;
+		default:
+			break;
+		}
+	}
 
 	ret = ops->pi_get_pw_status(pcdev, psec->id, &pw_status);
 	if (ret)
@@ -1270,6 +1789,52 @@ int pse_ethtool_set_config(struct pse_control *psec,
 }
 EXPORT_SYMBOL_GPL(pse_ethtool_set_config);
 
+/**
+ * pse_pi_update_pw_budget - Update PSE power budget allocated with new
+ *			     power in mW
+ * @pcdev: a pointer to the PSE controller device
+ * @id: index of the PSE PI
+ * @pw_req: power requested
+ * @extack: extack for reporting useful error messages
+ *
+ * Return: Previous power allocated on success and failure value on error
+ */
+static int pse_pi_update_pw_budget(struct pse_controller_dev *pcdev, int id,
+				   const unsigned int pw_req,
+				   struct netlink_ext_ack *extack)
+{
+	struct pse_pi *pi = &pcdev->pi[id];
+	int previous_pw_allocated;
+	int pw_diff, ret = 0;
+
+	/* We don't want pw_allocated_mW value change in the middle of an
+	 * power budget update
+	 */
+	mutex_lock(&pcdev->lock);
+	previous_pw_allocated = pi->pw_allocated_mW;
+	pw_diff = pw_req - previous_pw_allocated;
+	if (!pw_diff) {
+		goto out;
+	} else if (pw_diff > 0) {
+		ret = regulator_request_power_budget(pi->pw_d->supply, pw_diff);
+		if (ret) {
+			NL_SET_ERR_MSG_FMT(extack,
+					   "PI %d: not enough power budget available",
+					   id);
+			goto out;
+		}
+
+	} else {
+		regulator_free_power_budget(pi->pw_d->supply, -pw_diff);
+	}
+	pi->pw_allocated_mW = pw_req;
+	ret = previous_pw_allocated;
+
+out:
+	mutex_unlock(&pcdev->lock);
+	return ret;
+}
+
 /**
  * pse_ethtool_set_pw_limit - set PSE control power limit
  * @psec: PSE control pointer
@@ -1282,7 +1847,7 @@ int pse_ethtool_set_pw_limit(struct pse_control *psec,
 			     struct netlink_ext_ack *extack,
 			     const unsigned int pw_limit)
 {
-	int uV, uA, ret;
+	int uV, uA, ret, previous_pw_allocated = 0;
 	s64 tmp_64;
 
 	if (pw_limit > MAX_PI_PW)
@@ -1306,10 +1871,100 @@ int pse_ethtool_set_pw_limit(struct pse_control *psec,
 	/* uA = mW * 1000000000 / uV */
 	uA = DIV_ROUND_CLOSEST_ULL(tmp_64, uV);
 
-	return regulator_set_current_limit(psec->ps, 0, uA);
+	/* Update power budget only in software power control case and
+	 * if a Power Device is powered.
+	 */
+	if (pse_pw_d_is_sw_pw_control(psec->pcdev,
+				      psec->pcdev->pi[psec->id].pw_d) &&
+	    psec->pcdev->pi[psec->id].admin_state_enabled &&
+	    psec->pcdev->pi[psec->id].isr_pd_detected) {
+		ret = pse_pi_update_pw_budget(psec->pcdev, psec->id,
+					      pw_limit, extack);
+		if (ret < 0)
+			return ret;
+		previous_pw_allocated = ret;
+	}
+
+	ret = regulator_set_current_limit(psec->ps, 0, uA);
+	if (ret < 0 && previous_pw_allocated) {
+		pse_pi_update_pw_budget(psec->pcdev, psec->id,
+					previous_pw_allocated, extack);
+	}
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(pse_ethtool_set_pw_limit);
 
+/**
+ * pse_ethtool_set_prio - Set PSE PI priority according to the budget
+ *			  evaluation strategy
+ * @psec: PSE control pointer
+ * @extack: extack for reporting useful error messages
+ * @prio: priority value
+ *
+ * Return: 0 on success and failure value on error
+ */
+int pse_ethtool_set_prio(struct pse_control *psec,
+			 struct netlink_ext_ack *extack,
+			 unsigned int prio)
+{
+	struct pse_controller_dev *pcdev = psec->pcdev;
+	const struct pse_controller_ops *ops;
+	int ret = 0;
+
+	if (!pcdev->pi[psec->id].pw_d) {
+		NL_SET_ERR_MSG(extack, "no power domain attached");
+		return -EOPNOTSUPP;
+	}
+
+	/* We don't want priority change in the middle of an
+	 * enable/disable call or a priority mode change
+	 */
+	mutex_lock(&pcdev->lock);
+	switch (pcdev->pi[psec->id].pw_d->budget_eval_strategy) {
+	case PSE_BUDGET_EVAL_STRAT_STATIC:
+		if (prio >= pcdev->nr_lines) {
+			NL_SET_ERR_MSG_FMT(extack,
+					   "priority %d exceed priority max %d",
+					   prio, pcdev->nr_lines - 1);
+			ret = -ERANGE;
+			goto out;
+		}
+
+		pcdev->pi[psec->id].prio = prio;
+		pse_pw_d_retry_power_delivery(pcdev, pcdev->pi[psec->id].pw_d);
+		break;
+
+	case PSE_BUDGET_EVAL_STRAT_DYNAMIC:
+		ops = psec->pcdev->ops;
+		if (!ops->pi_set_prio) {
+			NL_SET_ERR_MSG(extack,
+				       "pse driver does not support setting port priority");
+			ret = -EOPNOTSUPP;
+			goto out;
+		}
+
+		if (prio > pcdev->pis_prio_max) {
+			NL_SET_ERR_MSG_FMT(extack,
+					   "priority %d exceed priority max %d",
+					   prio, pcdev->pis_prio_max);
+			ret = -ERANGE;
+			goto out;
+		}
+
+		ret = ops->pi_set_prio(pcdev, psec->id, prio);
+		break;
+
+	default:
+		ret = -EOPNOTSUPP;
+	}
+
+out:
+	mutex_unlock(&pcdev->lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(pse_ethtool_set_prio);
+
 bool pse_has_podl(struct pse_control *psec)
 {
 	return psec->pcdev->types & ETHTOOL_PSE_PODL;
diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h
index 2f8ecfd87d43..e5f305cef82e 100644
--- a/include/linux/pse-pd/pse.h
+++ b/include/linux/pse-pd/pse.h
@@ -6,6 +6,8 @@
 #define _LINUX_PSE_CONTROLLER_H
 
 #include <linux/list.h>
+#include <linux/netlink.h>
+#include <linux/kfifo.h>
 #include <uapi/linux/ethtool.h>
 #include <uapi/linux/ethtool_netlink_generated.h>
 #include <linux/regulator/driver.h>
@@ -134,6 +136,9 @@ struct pse_pw_limit_ranges {
  *	is in charge of the memory allocation
  * @c33_pw_limit_nb_ranges: number of supported power limit configuration
  *	ranges
+ * @prio_max: max priority allowed for the @prio variable value.
+ * @prio: priority of the PSE. Managed by PSE core in case of static budget
+ *	evaluation strategy.
  */
 struct ethtool_pse_control_status {
 	u32 pw_d_id;
@@ -147,6 +152,8 @@ struct ethtool_pse_control_status {
 	u32 c33_avail_pw_limit;
 	struct ethtool_c33_pse_pw_limit_range *c33_pw_limit_ranges;
 	u32 c33_pw_limit_nb_ranges;
+	u32 prio_max;
+	u32 prio;
 };
 
 /**
@@ -170,6 +177,11 @@ struct ethtool_pse_control_status {
  *			    range. The driver is in charge of the memory
  *			    allocation and should return the number of
  *			    ranges.
+ * @pi_get_prio: Get the PSE PI priority.
+ * @pi_set_prio: Configure the PSE PI priority.
+ * @pi_get_pw_req: Get the power requested by a PD before enabling the PSE PI.
+ *		   This is only relevant when an interrupt is registered using
+ *		   devm_pse_irq_helper helper.
  */
 struct pse_controller_ops {
 	int (*setup_pi_matrix)(struct pse_controller_dev *pcdev);
@@ -190,6 +202,10 @@ struct pse_controller_ops {
 			       int id, int max_mW);
 	int (*pi_get_pw_limit_ranges)(struct pse_controller_dev *pcdev, int id,
 				      struct pse_pw_limit_ranges *pw_limit_ranges);
+	int (*pi_get_prio)(struct pse_controller_dev *pcdev, int id);
+	int (*pi_set_prio)(struct pse_controller_dev *pcdev, int id,
+			   unsigned int prio);
+	int (*pi_get_pw_req)(struct pse_controller_dev *pcdev, int id);
 };
 
 struct module;
@@ -225,6 +241,13 @@ struct pse_pi_pairset {
  * @rdev: regulator represented by the PSE PI
  * @admin_state_enabled: PI enabled state
  * @pw_d: Power domain of the PSE PI
+ * @prio: Priority of the PSE PI. Used in static budget evaluation strategy
+ * @isr_pd_detected: PSE PI detection status managed by the interruption
+ *		     handler. This variable is relevant when the power enabled
+ *		     management is managed in software like the static
+ *		     budget evaluation strategy.
+ * @pw_allocated_mW: Power allocated to a PSE PI to manage power budget in
+ *		     static budget evaluation strategy.
  */
 struct pse_pi {
 	struct pse_pi_pairset pairset[2];
@@ -232,6 +255,20 @@ struct pse_pi {
 	struct regulator_dev *rdev;
 	bool admin_state_enabled;
 	struct pse_power_domain *pw_d;
+	int prio;
+	bool isr_pd_detected;
+	int pw_allocated_mW;
+};
+
+/**
+ * struct pse_ntf - PSE notification element
+ *
+ * @id: ID of the PSE control
+ * @notifs: PSE notifications to be reported
+ */
+struct pse_ntf {
+	int id;
+	unsigned long notifs;
 };
 
 /**
@@ -249,6 +286,12 @@ struct pse_pi {
  * @pi: table of PSE PIs described in this controller device
  * @no_of_pse_pi: flag set if the pse_pis devicetree node is not used
  * @irq: PSE interrupt
+ * @pis_prio_max: Maximum value allowed for the PSE PIs priority
+ * @supp_budget_eval_strategies: budget evaluation strategies supported
+ *				 by the PSE
+ * @ntf_work: workqueue for PSE notification management
+ * @ntf_fifo: PSE notifications FIFO
+ * @ntf_fifo_lock: protect @ntf_fifo writer
  */
 struct pse_controller_dev {
 	const struct pse_controller_ops *ops;
@@ -263,6 +306,29 @@ struct pse_controller_dev {
 	struct pse_pi *pi;
 	bool no_of_pse_pi;
 	int irq;
+	unsigned int pis_prio_max;
+	u32 supp_budget_eval_strategies;
+	struct work_struct ntf_work;
+	DECLARE_KFIFO_PTR(ntf_fifo, struct pse_ntf);
+	spinlock_t ntf_fifo_lock; /* Protect @ntf_fifo writer */
+};
+
+/**
+ * enum pse_budget_eval_strategies - PSE budget evaluation strategies.
+ * @PSE_BUDGET_EVAL_STRAT_DISABLED: Budget evaluation strategy disabled.
+ * @PSE_BUDGET_EVAL_STRAT_STATIC: PSE static budget evaluation strategy.
+ *	Budget evaluation strategy based on the power requested during PD
+ *	classification. This strategy is managed by the PSE core.
+ * @PSE_BUDGET_EVAL_STRAT_DYNAMIC: PSE dynamic budget evaluation
+ *	strategy. Budget evaluation strategy based on the current consumption
+ *	per port compared to the total power budget. This mode is managed by
+ *	the PSE controller.
+ */
+
+enum pse_budget_eval_strategies {
+	PSE_BUDGET_EVAL_STRAT_DISABLED	= 1 << 0,
+	PSE_BUDGET_EVAL_STRAT_STATIC	= 1 << 1,
+	PSE_BUDGET_EVAL_STRAT_DYNAMIC	= 1 << 2,
 };
 
 #if IS_ENABLED(CONFIG_PSE_CONTROLLER)
@@ -287,6 +353,9 @@ int pse_ethtool_set_config(struct pse_control *psec,
 int pse_ethtool_set_pw_limit(struct pse_control *psec,
 			     struct netlink_ext_ack *extack,
 			     const unsigned int pw_limit);
+int pse_ethtool_set_prio(struct pse_control *psec,
+			 struct netlink_ext_ack *extack,
+			 unsigned int prio);
 
 bool pse_has_podl(struct pse_control *psec);
 bool pse_has_c33(struct pse_control *psec);
@@ -324,6 +393,13 @@ static inline int pse_ethtool_set_pw_limit(struct pse_control *psec,
 	return -EOPNOTSUPP;
 }
 
+static inline int pse_ethtool_set_prio(struct pse_control *psec,
+				       struct netlink_ext_ack *extack,
+				       unsigned int prio)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline bool pse_has_podl(struct pse_control *psec)
 {
 	return false;
diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h
index ed344c8533eb..c6a95224be25 100644
--- a/include/uapi/linux/ethtool_netlink_generated.h
+++ b/include/uapi/linux/ethtool_netlink_generated.h
@@ -53,10 +53,28 @@ enum hwtstamp_source {
  * enum ethtool_pse_event - PSE event list for the PSE controller
  * @ETHTOOL_PSE_EVENT_OVER_CURRENT: PSE output current is too high
  * @ETHTOOL_PSE_EVENT_OVER_TEMP: PSE in over temperature state
+ * @ETHTOOL_C33_PSE_EVENT_DETECTION: detection process occurs on the PSE. IEEE
+ *   802.3-2022 33.2.5 and 145.2.6 PSE detection of PDs. IEEE 802.3-2022
+ *   30.9.1.1.5 aPSEPowerDetectionStatus
+ * @ETHTOOL_C33_PSE_EVENT_CLASSIFICATION: classification process occurs on the
+ *   PSE. IEEE 802.3-2022 33.2.6 and 145.2.8 classification of PDs mutual
+ *   identification. IEEE 802.3-2022 30.9.1.1.8 aPSEPowerClassification.
+ * @ETHTOOL_C33_PSE_EVENT_DISCONNECTION: PD has been disconnected on the PSE.
+ *   IEEE 802.3-2022 33.3.8 and 145.3.9 PD Maintain Power Signature. IEEE
+ *   802.3-2022 33.5.1.2.9 MPS Absent. IEEE 802.3-2022 30.9.1.1.20
+ *   aPSEMPSAbsentCounter.
+ * @ETHTOOL_PSE_EVENT_OVER_BUDGET: PSE turned off due to over budget situation
+ * @ETHTOOL_PSE_EVENT_SW_PW_CONTROL_ERROR: PSE faced an error managing the
+ *   power control from software
  */
 enum ethtool_pse_event {
 	ETHTOOL_PSE_EVENT_OVER_CURRENT = 1,
 	ETHTOOL_PSE_EVENT_OVER_TEMP = 2,
+	ETHTOOL_C33_PSE_EVENT_DETECTION = 4,
+	ETHTOOL_C33_PSE_EVENT_CLASSIFICATION = 8,
+	ETHTOOL_C33_PSE_EVENT_DISCONNECTION = 16,
+	ETHTOOL_PSE_EVENT_OVER_BUDGET = 32,
+	ETHTOOL_PSE_EVENT_SW_PW_CONTROL_ERROR = 64,
 };
 
 enum {
-- 
cgit v1.2.3


From eeb0c8f72f49a21984981188404cfd3700edbaff Mon Sep 17 00:00:00 2001
From: "Kory Maincent (Dent Project)" <kory.maincent@bootlin.com>
Date: Tue, 17 Jun 2025 14:12:07 +0200
Subject: net: ethtool: Add PSE port priority support feature

This patch expands the status information provided by ethtool for PSE c33
with current port priority and max port priority. It also adds a call to
pse_ethtool_set_prio() to configure the PSE port priority.

Signed-off-by: Kory Maincent (Dent Project) <kory.maincent@bootlin.com>
Reviewed-by: Oleksij Rempel <o.rempel@pengutronix.de>
Link: https://patch.msgid.link/20250617-feature_poe_port_prio-v14-8-78a1a645e2ee@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/ethtool.yaml       | 11 +++++++++++
 Documentation/networking/ethtool-netlink.rst   | 26 ++++++++++++++++++++++++++
 include/uapi/linux/ethtool_netlink_generated.h |  2 ++
 net/ethtool/pse-pd.c                           | 18 ++++++++++++++++++
 4 files changed, 57 insertions(+)

(limited to 'include')

diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml
index 7a9a857370e2..e6a77e8053a0 100644
--- a/Documentation/netlink/specs/ethtool.yaml
+++ b/Documentation/netlink/specs/ethtool.yaml
@@ -1436,6 +1436,14 @@ attribute-sets:
         name: pse-pw-d-id
         type: u32
         name-prefix: ethtool-a-
+      -
+        name: pse-prio-max
+        type: u32
+        name-prefix: ethtool-a-
+      -
+        name: pse-prio
+        type: u32
+        name-prefix: ethtool-a-
   -
     name: rss
     attr-cnt-name: __ethtool-a-rss-cnt
@@ -2260,6 +2268,8 @@ operations:
             - c33-pse-avail-pw-limit
             - c33-pse-pw-limit-ranges
             - pse-pw-d-id
+            - pse-prio-max
+            - pse-prio
       dump: *pse-get-op
     -
       name: pse-set
@@ -2274,6 +2284,7 @@ operations:
             - podl-pse-admin-control
             - c33-pse-admin-control
             - c33-pse-avail-pw-limit
+            - pse-prio
     -
       name: rss-get
       doc: Get RSS params.
diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst
index e9af8e58564c..e45bb555e909 100644
--- a/Documentation/networking/ethtool-netlink.rst
+++ b/Documentation/networking/ethtool-netlink.rst
@@ -1790,6 +1790,10 @@ Kernel response contents:
   ``ETHTOOL_A_C33_PSE_PW_LIMIT_RANGES``       nested  Supported power limit
                                                       configuration ranges.
   ``ETHTOOL_A_PSE_PW_D_ID``                      u32  Index of the PSE power domain
+  ``ETHTOOL_A_PSE_PRIO_MAX``                     u32  Priority maximum configurable
+                                                      on the PoE PSE
+  ``ETHTOOL_A_PSE_PRIO``                         u32  Priority of the PoE PSE
+                                                      currently configured
   ==========================================  ======  =============================
 
 When set, the optional ``ETHTOOL_A_PODL_PSE_ADMIN_STATE`` attribute identifies
@@ -1866,6 +1870,12 @@ equal.
 The ``ETHTOOL_A_PSE_PW_D_ID`` attribute identifies the index of PSE power
 domain.
 
+When set, the optional ``ETHTOOL_A_PSE_PRIO_MAX`` attribute identifies
+the PSE maximum priority value.
+When set, the optional ``ETHTOOL_A_PSE_PRIO`` attribute identifies
+the currently configured PSE priority.
+For a description of PSE priority attributes, see ``PSE_SET``.
+
 PSE_SET
 =======
 
@@ -1879,6 +1889,8 @@ Request contents:
   ``ETHTOOL_A_C33_PSE_ADMIN_CONTROL``        u32  Control PSE Admin state
   ``ETHTOOL_A_C33_PSE_AVAIL_PWR_LIMIT``      u32  Control PoE PSE available
                                                   power limit
+  ``ETHTOOL_A_PSE_PRIO``                     u32  Control priority of the
+                                                  PoE PSE
   ======================================  ======  =============================
 
 When set, the optional ``ETHTOOL_A_PODL_PSE_ADMIN_CONTROL`` attribute is used
@@ -1901,6 +1913,20 @@ various existing products that document power consumption in watts rather than
 classes. If power limit configuration based on classes is needed, the
 conversion can be done in user space, for example by ethtool.
 
+When set, the optional ``ETHTOOL_A_PSE_PRIO`` attribute is used to
+control the PSE priority. Allowed priority values are between zero and
+the value of the ``ETHTOOL_A_PSE_PRIO_MAX`` attribute.
+
+A lower value indicates a higher priority, meaning that a priority value
+of 0 corresponds to the highest port priority.
+Port priority serves two functions:
+
+ - Power-up Order: After a reset, ports are powered up in order of their
+   priority from highest to lowest. Ports with higher priority
+   (lower values) power up first.
+ - Shutdown Order: When the power budget is exceeded, ports with lower
+   priority (higher values) are turned off first.
+
 PSE_NTF
 =======
 
diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h
index c6a95224be25..8e5d067e7ddf 100644
--- a/include/uapi/linux/ethtool_netlink_generated.h
+++ b/include/uapi/linux/ethtool_netlink_generated.h
@@ -671,6 +671,8 @@ enum {
 	ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT,
 	ETHTOOL_A_C33_PSE_PW_LIMIT_RANGES,
 	ETHTOOL_A_PSE_PW_D_ID,
+	ETHTOOL_A_PSE_PRIO_MAX,
+	ETHTOOL_A_PSE_PRIO,
 
 	__ETHTOOL_A_PSE_CNT,
 	ETHTOOL_A_PSE_MAX = (__ETHTOOL_A_PSE_CNT - 1)
diff --git a/net/ethtool/pse-pd.c b/net/ethtool/pse-pd.c
index 6a978a55959e..6c536dfe52da 100644
--- a/net/ethtool/pse-pd.c
+++ b/net/ethtool/pse-pd.c
@@ -111,6 +111,9 @@ static int pse_reply_size(const struct ethnl_req_info *req_base,
 		len += st->c33_pw_limit_nb_ranges *
 		       (nla_total_size(0) +
 			nla_total_size(sizeof(u32)) * 2);
+	if (st->prio_max)
+		/* _PSE_PRIO_MAX + _PSE_PRIO */
+		len += nla_total_size(sizeof(u32)) * 2;
 
 	return len;
 }
@@ -205,6 +208,11 @@ static int pse_fill_reply(struct sk_buff *skb,
 	    pse_put_pw_limit_ranges(skb, st))
 		return -EMSGSIZE;
 
+	if (st->prio_max &&
+	    (nla_put_u32(skb, ETHTOOL_A_PSE_PRIO_MAX, st->prio_max) ||
+	     nla_put_u32(skb, ETHTOOL_A_PSE_PRIO, st->prio)))
+		return -EMSGSIZE;
+
 	return 0;
 }
 
@@ -226,6 +234,7 @@ const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1] = {
 		NLA_POLICY_RANGE(NLA_U32, ETHTOOL_C33_PSE_ADMIN_STATE_DISABLED,
 				 ETHTOOL_C33_PSE_ADMIN_STATE_ENABLED),
 	[ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT] = { .type = NLA_U32 },
+	[ETHTOOL_A_PSE_PRIO] = { .type = NLA_U32 },
 };
 
 static int
@@ -274,6 +283,15 @@ ethnl_set_pse(struct ethnl_req_info *req_info, struct genl_info *info)
 	if (ret)
 		return ret;
 
+	if (tb[ETHTOOL_A_PSE_PRIO]) {
+		unsigned int prio;
+
+		prio = nla_get_u32(tb[ETHTOOL_A_PSE_PRIO]);
+		ret = pse_ethtool_set_prio(phydev->psec, info->extack, prio);
+		if (ret)
+			return ret;
+	}
+
 	if (tb[ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT]) {
 		unsigned int pw_limit;
 
-- 
cgit v1.2.3


From deefc7083414de81aad102b60f0390f600d7eb79 Mon Sep 17 00:00:00 2001
From: Clément Le Goffic <clement.legoffic@foss.st.com>
Date: Fri, 13 Jun 2025 12:14:12 +0200
Subject: gpio: mmio: add BGPIOF_NO_INPUT flag for GPO gpiochip
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When using bgpio_init with a gpiochip acting as a GPO (output only), the
gpiochip ops `direction_input` was set to `bgpio_simple_dir_in` by
default but we have no input ability.

Adding this flag allows a valid op to be set for `direction_output`
without also setting a valid op for `direction_input` by default.

Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Clément Le Goffic <clement.legoffic@foss.st.com>
Link: https://lore.kernel.org/r/20250613-hdp-upstream-v5-1-6fd6f0dc527c@foss.st.com
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 drivers/gpio/gpio-mmio.c    | 11 ++++++++++-
 include/linux/gpio/driver.h |  1 +
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/gpio/gpio-mmio.c b/drivers/gpio/gpio-mmio.c
index 4841e4ebe7a6..09b9e1275e7e 100644
--- a/drivers/gpio/gpio-mmio.c
+++ b/drivers/gpio/gpio-mmio.c
@@ -335,6 +335,11 @@ static int bgpio_dir_return(struct gpio_chip *gc, unsigned int gpio, bool dir_ou
 		return pinctrl_gpio_direction_input(gc, gpio);
 }
 
+static int bgpio_dir_in_err(struct gpio_chip *gc, unsigned int gpio)
+{
+	return -EINVAL;
+}
+
 static int bgpio_simple_dir_in(struct gpio_chip *gc, unsigned int gpio)
 {
 	return bgpio_dir_return(gc, gpio, false);
@@ -566,7 +571,11 @@ static int bgpio_setup_direction(struct gpio_chip *gc,
 			gc->direction_output = bgpio_dir_out_err;
 		else
 			gc->direction_output = bgpio_simple_dir_out;
-		gc->direction_input = bgpio_simple_dir_in;
+
+		if (flags & BGPIOF_NO_INPUT)
+			gc->direction_input = bgpio_dir_in_err;
+		else
+			gc->direction_input = bgpio_simple_dir_in;
 	}
 
 	return 0;
diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index b53233051bee..97cc75623261 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -750,6 +750,7 @@ int bgpio_init(struct gpio_chip *gc, struct device *dev,
 #define BGPIOF_NO_OUTPUT		BIT(5) /* only input */
 #define BGPIOF_NO_SET_ON_INPUT		BIT(6)
 #define BGPIOF_PINCTRL_BACKEND		BIT(7) /* Call pinctrl direction setters */
+#define BGPIOF_NO_INPUT			BIT(8) /* only output */
 
 #ifdef CONFIG_GPIOLIB_IRQCHIP
 int gpiochip_irqchip_add_domain(struct gpio_chip *gc,
-- 
cgit v1.2.3


From 6012ce6b30567aa8ec8dc5b648b7841f9f74ca7c Mon Sep 17 00:00:00 2001
From: Richard Leitner <richard.leitner@linux.dev>
Date: Tue, 17 Jun 2025 09:31:37 +0200
Subject: leds: led-class-flash: Fix flash_timeout comment

The comment for the flash_timeout setter mentioned it is the "flash
duration". Fix this by changing it to "flash timeout".

Signed-off-by: Richard Leitner <richard.leitner@linux.dev>
Link: https://lore.kernel.org/r/20250617-ov9282-flash-strobe-v5-3-9762da74d065@linux.dev
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/led-class-flash.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/led-class-flash.h b/include/linux/led-class-flash.h
index 21ec856c36bc..775a96217518 100644
--- a/include/linux/led-class-flash.h
+++ b/include/linux/led-class-flash.h
@@ -197,7 +197,7 @@ int led_update_flash_brightness(struct led_classdev_flash *fled_cdev);
  * @fled_cdev: the flash LED to set
  * @timeout: the flash timeout to set it to
  *
- * Set the flash strobe duration.
+ * Set the flash strobe timeout.
  *
  * Returns: 0 on success or negative error value on failure
  */
-- 
cgit v1.2.3


From bc9a0c68f2583e42ace81a9c229c158b7cdcb45b Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Tue, 3 Jun 2025 21:39:05 +0100
Subject: usb: phy: tegra: Remove unused functions

tegra_ehci_phy_restore_start() and tegra_ehci_phy_restore_end()
last use was removed in 2013 by
commit a4faa54e3aa2 ("USB: EHCI: tegra: remove all power management")

tegra_usb_phy_preresume() and tegra_usb_phy_postresume() last
use was removed in 2020 by
commit c3590c7656fb ("usb: host: ehci-tegra: Remove the driver")
(Although that one makes me wonder how much of the rest of the file
is actually used)

Remove both sets.

Signed-off-by: "Dr. David Alan Gilbert" <linux@treblig.org>
Link: https://lore.kernel.org/r/20250603203905.279307-1-linux@treblig.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/phy/phy-tegra-usb.c   | 89 ---------------------------------------
 include/linux/usb/tegra_usb_phy.h |  9 ----
 2 files changed, 98 deletions(-)

(limited to 'include')

diff --git a/drivers/usb/phy/phy-tegra-usb.c b/drivers/usb/phy/phy-tegra-usb.c
index bee222967f6b..fb9031628d39 100644
--- a/drivers/usb/phy/phy-tegra-usb.c
+++ b/drivers/usb/phy/phy-tegra-usb.c
@@ -711,58 +711,6 @@ static int utmi_phy_power_off(struct tegra_usb_phy *phy)
 	return utmip_pad_power_off(phy);
 }
 
-static void utmi_phy_preresume(struct tegra_usb_phy *phy)
-{
-	void __iomem *base = phy->regs;
-	u32 val;
-
-	val = readl_relaxed(base + UTMIP_TX_CFG0);
-	val |= UTMIP_HS_DISCON_DISABLE;
-	writel_relaxed(val, base + UTMIP_TX_CFG0);
-}
-
-static void utmi_phy_postresume(struct tegra_usb_phy *phy)
-{
-	void __iomem *base = phy->regs;
-	u32 val;
-
-	val = readl_relaxed(base + UTMIP_TX_CFG0);
-	val &= ~UTMIP_HS_DISCON_DISABLE;
-	writel_relaxed(val, base + UTMIP_TX_CFG0);
-}
-
-static void utmi_phy_restore_start(struct tegra_usb_phy *phy,
-				   enum tegra_usb_phy_port_speed port_speed)
-{
-	void __iomem *base = phy->regs;
-	u32 val;
-
-	val = readl_relaxed(base + UTMIP_MISC_CFG0);
-	val &= ~UTMIP_DPDM_OBSERVE_SEL(~0);
-	if (port_speed == TEGRA_USB_PHY_PORT_SPEED_LOW)
-		val |= UTMIP_DPDM_OBSERVE_SEL_FS_K;
-	else
-		val |= UTMIP_DPDM_OBSERVE_SEL_FS_J;
-	writel_relaxed(val, base + UTMIP_MISC_CFG0);
-	usleep_range(1, 10);
-
-	val = readl_relaxed(base + UTMIP_MISC_CFG0);
-	val |= UTMIP_DPDM_OBSERVE;
-	writel_relaxed(val, base + UTMIP_MISC_CFG0);
-	usleep_range(10, 100);
-}
-
-static void utmi_phy_restore_end(struct tegra_usb_phy *phy)
-{
-	void __iomem *base = phy->regs;
-	u32 val;
-
-	val = readl_relaxed(base + UTMIP_MISC_CFG0);
-	val &= ~UTMIP_DPDM_OBSERVE;
-	writel_relaxed(val, base + UTMIP_MISC_CFG0);
-	usleep_range(10, 100);
-}
-
 static int ulpi_phy_power_on(struct tegra_usb_phy *phy)
 {
 	void __iomem *base = phy->regs;
@@ -1123,43 +1071,6 @@ disable_clk:
 	return err;
 }
 
-void tegra_usb_phy_preresume(struct usb_phy *u_phy)
-{
-	struct tegra_usb_phy *phy = to_tegra_usb_phy(u_phy);
-
-	if (!phy->is_ulpi_phy)
-		utmi_phy_preresume(phy);
-}
-EXPORT_SYMBOL_GPL(tegra_usb_phy_preresume);
-
-void tegra_usb_phy_postresume(struct usb_phy *u_phy)
-{
-	struct tegra_usb_phy *phy = to_tegra_usb_phy(u_phy);
-
-	if (!phy->is_ulpi_phy)
-		utmi_phy_postresume(phy);
-}
-EXPORT_SYMBOL_GPL(tegra_usb_phy_postresume);
-
-void tegra_ehci_phy_restore_start(struct usb_phy *u_phy,
-				  enum tegra_usb_phy_port_speed port_speed)
-{
-	struct tegra_usb_phy *phy = to_tegra_usb_phy(u_phy);
-
-	if (!phy->is_ulpi_phy)
-		utmi_phy_restore_start(phy, port_speed);
-}
-EXPORT_SYMBOL_GPL(tegra_ehci_phy_restore_start);
-
-void tegra_ehci_phy_restore_end(struct usb_phy *u_phy)
-{
-	struct tegra_usb_phy *phy = to_tegra_usb_phy(u_phy);
-
-	if (!phy->is_ulpi_phy)
-		utmi_phy_restore_end(phy);
-}
-EXPORT_SYMBOL_GPL(tegra_ehci_phy_restore_end);
-
 static int read_utmi_param(struct platform_device *pdev, const char *param,
 			   u8 *dest)
 {
diff --git a/include/linux/usb/tegra_usb_phy.h b/include/linux/usb/tegra_usb_phy.h
index e6c14f2b1f9b..40afcee8b4f5 100644
--- a/include/linux/usb/tegra_usb_phy.h
+++ b/include/linux/usb/tegra_usb_phy.h
@@ -80,13 +80,4 @@ struct tegra_usb_phy {
 	bool powered_on;
 };
 
-void tegra_usb_phy_preresume(struct usb_phy *phy);
-
-void tegra_usb_phy_postresume(struct usb_phy *phy);
-
-void tegra_ehci_phy_restore_start(struct usb_phy *phy,
-				 enum tegra_usb_phy_port_speed port_speed);
-
-void tegra_ehci_phy_restore_end(struct usb_phy *phy);
-
 #endif /* __TEGRA_USB_PHY_H */
-- 
cgit v1.2.3


From efec475e5e20c594e10d42f73cf6803805ae14c9 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Mon, 9 Jun 2025 00:33:37 +0100
Subject: usb: gadget: config: Remove unused usb_gadget_config_buf

usb_gadget_config_buf() has been unused since 2012's
commit fa06920a3ece ("usb: gadget: Remove File-backed Storage Gadget
(g_file_storage).")

Remove it.

Signed-off-by: "Dr. David Alan Gilbert" <linux@treblig.org>
Link: https://lore.kernel.org/r/20250608233338.179894-2-linux@treblig.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/gadget/config.c | 53 ---------------------------------------------
 include/linux/usb/gadget.h  |  4 ----
 2 files changed, 57 deletions(-)

(limited to 'include')

diff --git a/drivers/usb/gadget/config.c b/drivers/usb/gadget/config.c
index 95f144a54ed9..256364d4b941 100644
--- a/drivers/usb/gadget/config.c
+++ b/drivers/usb/gadget/config.c
@@ -53,59 +53,6 @@ usb_descriptor_fillbuf(void *buf, unsigned buflen,
 }
 EXPORT_SYMBOL_GPL(usb_descriptor_fillbuf);
 
-/**
- * usb_gadget_config_buf - builts a complete configuration descriptor
- * @config: Header for the descriptor, including characteristics such
- *	as power requirements and number of interfaces.
- * @buf: Buffer for the resulting configuration descriptor.
- * @length: Length of buffer.  If this is not big enough to hold the
- *	entire configuration descriptor, an error code will be returned.
- * @desc: Null-terminated vector of pointers to the descriptors (interface,
- *	endpoint, etc) defining all functions in this device configuration.
- *
- * This copies descriptors into the response buffer, building a descriptor
- * for that configuration.  It returns the buffer length or a negative
- * status code.  The config.wTotalLength field is set to match the length
- * of the result, but other descriptor fields (including power usage and
- * interface count) must be set by the caller.
- *
- * Gadget drivers could use this when constructing a config descriptor
- * in response to USB_REQ_GET_DESCRIPTOR.  They will need to patch the
- * resulting bDescriptorType value if USB_DT_OTHER_SPEED_CONFIG is needed.
- */
-int usb_gadget_config_buf(
-	const struct usb_config_descriptor	*config,
-	void					*buf,
-	unsigned				length,
-	const struct usb_descriptor_header	**desc
-)
-{
-	struct usb_config_descriptor		*cp = buf;
-	int					len;
-
-	/* config descriptor first */
-	if (length < USB_DT_CONFIG_SIZE || !desc)
-		return -EINVAL;
-	*cp = *config;
-
-	/* then interface/endpoint/class/vendor/... */
-	len = usb_descriptor_fillbuf(USB_DT_CONFIG_SIZE + (u8 *)buf,
-			length - USB_DT_CONFIG_SIZE, desc);
-	if (len < 0)
-		return len;
-	len += USB_DT_CONFIG_SIZE;
-	if (len > 0xffff)
-		return -EINVAL;
-
-	/* patch up the config descriptor */
-	cp->bLength = USB_DT_CONFIG_SIZE;
-	cp->bDescriptorType = USB_DT_CONFIG;
-	cp->wTotalLength = cpu_to_le16(len);
-	cp->bmAttributes |= USB_CONFIG_ATT_ONE;
-	return len;
-}
-EXPORT_SYMBOL_GPL(usb_gadget_config_buf);
-
 /**
  * usb_copy_descriptors - copy a vector of USB descriptors
  * @src: null-terminated vector to copy
diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h
index df33333650a0..0f28c5512fcb 100644
--- a/include/linux/usb/gadget.h
+++ b/include/linux/usb/gadget.h
@@ -860,10 +860,6 @@ container_of(str_item, struct gadget_string, item)
 int usb_descriptor_fillbuf(void *, unsigned,
 		const struct usb_descriptor_header **);
 
-/* build config descriptor from single descriptor vector */
-int usb_gadget_config_buf(const struct usb_config_descriptor *config,
-	void *buf, unsigned buflen, const struct usb_descriptor_header **desc);
-
 /* copy a NULL-terminated vector of descriptors */
 struct usb_descriptor_header **usb_copy_descriptors(
 		struct usb_descriptor_header **);
-- 
cgit v1.2.3


From 227280ad66ac50287a3ea8b8fd43b7c7a6eb09ac Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Mon, 9 Jun 2025 00:33:38 +0100
Subject: usb: gadget: Remove unused usb_remove_config

usb_remove_config() was added in 2012's
commit 51cce6fc155c ("usb: gadget: composite: Add
usb_remove_config")
but has remained unused.

I see there was a use in drivers/staging/ccg that
was removed by
commit 515e6dd20b3f ("Staging: ccg: delete it from the tree")
but it had its own copy of usb_remove_config().

Remove it.

Signed-off-by: "Dr. David Alan Gilbert" <linux@treblig.org>
Link: https://lore.kernel.org/r/20250608233338.179894-3-linux@treblig.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/gadget/composite.c | 24 ------------------------
 include/linux/usb/composite.h  |  3 ---
 2 files changed, 27 deletions(-)

(limited to 'include')

diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c
index 12634f0b45a8..0f649c747584 100644
--- a/drivers/usb/gadget/composite.c
+++ b/drivers/usb/gadget/composite.c
@@ -1194,30 +1194,6 @@ static void remove_config(struct usb_composite_dev *cdev,
 	}
 }
 
-/**
- * usb_remove_config() - remove a configuration from a device.
- * @cdev: wraps the USB gadget
- * @config: the configuration
- *
- * Drivers must call usb_gadget_disconnect before calling this function
- * to disconnect the device from the host and make sure the host will not
- * try to enumerate the device while we are changing the config list.
- */
-void usb_remove_config(struct usb_composite_dev *cdev,
-		      struct usb_configuration *config)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&cdev->lock, flags);
-
-	if (cdev->config == config)
-		reset_config(cdev);
-
-	spin_unlock_irqrestore(&cdev->lock, flags);
-
-	remove_config(cdev, config);
-}
-
 /*-------------------------------------------------------------------------*/
 
 /* We support strings in multiple languages ... string descriptor zero
diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h
index d8c4e9f73839..c18041fafa52 100644
--- a/include/linux/usb/composite.h
+++ b/include/linux/usb/composite.h
@@ -339,9 +339,6 @@ int usb_add_config(struct usb_composite_dev *,
 		struct usb_configuration *,
 		int (*)(struct usb_configuration *));
 
-void usb_remove_config(struct usb_composite_dev *,
-		struct usb_configuration *);
-
 /* predefined index for usb_composite_driver */
 enum {
 	USB_GADGET_MANUFACTURER_IDX	= 0,
-- 
cgit v1.2.3


From 47c83f95f3e293a9e924a1ba41d782bb059bb954 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Mon, 9 Jun 2025 00:56:17 +0100
Subject: usb: core: Remove unused usb_unlink_anchored_urbs

usb_unlink_anchored_urbs() has been unused since its last use was
removed in 2009 by
commit 9b9c5aaeedfd ("ar9170: xmit code revamp")

Remove it.

Signed-off-by: "Dr. David Alan Gilbert" <linux@treblig.org>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://lore.kernel.org/r/20250608235617.200731-1-linux@treblig.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/driver-api/usb/anchors.rst | 11 -----------
 drivers/usb/core/urb.c                   | 29 +++--------------------------
 include/linux/usb.h                      |  1 -
 3 files changed, 3 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/Documentation/driver-api/usb/anchors.rst b/Documentation/driver-api/usb/anchors.rst
index 4b248e691bd6..5a93d171e76c 100644
--- a/Documentation/driver-api/usb/anchors.rst
+++ b/Documentation/driver-api/usb/anchors.rst
@@ -45,17 +45,6 @@ This function kills all URBs associated with an anchor. The URBs
 are called in the reverse temporal order they were submitted.
 This way no data can be reordered.
 
-:c:func:`usb_unlink_anchored_urbs`
-----------------------------------
-
-
-This function unlinks all URBs associated with an anchor. The URBs
-are processed in the reverse temporal order they were submitted.
-This is similar to :c:func:`usb_kill_anchored_urbs`, but it will not sleep.
-Therefore no guarantee is made that the URBs have been unlinked when
-the call returns. They may be unlinked later but will be unlinked in
-finite time.
-
 :c:func:`usb_scuttle_anchored_urbs`
 -----------------------------------
 
diff --git a/drivers/usb/core/urb.c b/drivers/usb/core/urb.c
index 5e52a35486af..0e58a8531d6e 100644
--- a/drivers/usb/core/urb.c
+++ b/drivers/usb/core/urb.c
@@ -597,10 +597,9 @@ EXPORT_SYMBOL_GPL(usb_submit_urb);
  * code).
  *
  * Drivers should not call this routine or related routines, such as
- * usb_kill_urb() or usb_unlink_anchored_urbs(), after their disconnect
- * method has returned.  The disconnect function should synchronize with
- * a driver's I/O routines to insure that all URB-related activity has
- * completed before it returns.
+ * usb_kill_urb(), after their disconnect method has returned. The
+ * disconnect function should synchronize with a driver's I/O routines
+ * to insure that all URB-related activity has completed before it returns.
  *
  * This request is asynchronous, however the HCD might call the ->complete()
  * callback during unlink. Therefore when drivers call usb_unlink_urb(), they
@@ -890,28 +889,6 @@ void usb_unpoison_anchored_urbs(struct usb_anchor *anchor)
 	spin_unlock_irqrestore(&anchor->lock, flags);
 }
 EXPORT_SYMBOL_GPL(usb_unpoison_anchored_urbs);
-/**
- * usb_unlink_anchored_urbs - asynchronously cancel transfer requests en masse
- * @anchor: anchor the requests are bound to
- *
- * this allows all outstanding URBs to be unlinked starting
- * from the back of the queue. This function is asynchronous.
- * The unlinking is just triggered. It may happen after this
- * function has returned.
- *
- * This routine should not be called by a driver after its disconnect
- * method has returned.
- */
-void usb_unlink_anchored_urbs(struct usb_anchor *anchor)
-{
-	struct urb *victim;
-
-	while ((victim = usb_get_from_anchor(anchor)) != NULL) {
-		usb_unlink_urb(victim);
-		usb_put_urb(victim);
-	}
-}
-EXPORT_SYMBOL_GPL(usb_unlink_anchored_urbs);
 
 /**
  * usb_anchor_suspend_wakeups
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 1b2545b4363b..e8662843e68c 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -1780,7 +1780,6 @@ extern void usb_block_urb(struct urb *urb);
 extern void usb_kill_anchored_urbs(struct usb_anchor *anchor);
 extern void usb_poison_anchored_urbs(struct usb_anchor *anchor);
 extern void usb_unpoison_anchored_urbs(struct usb_anchor *anchor);
-extern void usb_unlink_anchored_urbs(struct usb_anchor *anchor);
 extern void usb_anchor_suspend_wakeups(struct usb_anchor *anchor);
 extern void usb_anchor_resume_wakeups(struct usb_anchor *anchor);
 extern void usb_anchor_urb(struct urb *urb, struct usb_anchor *anchor);
-- 
cgit v1.2.3


From 78c76554c6b94dfa5e101b870f0c57b6c230503e Mon Sep 17 00:00:00 2001
From: Xu Yang <xu.yang_2@nxp.com>
Date: Sat, 14 Jun 2025 20:56:43 +0800
Subject: usb: chipidea: udc: add CI_HDRC_CONTROLLER_PULLUP_EVENT event

The device controller will send CI_HDRC_CONTROLLER_PULLUP_EVENT event
when it's going to pullup or pulldown data line.

Signed-off-by: Xu Yang <xu.yang_2@nxp.com>
Acked-by: Peter Chen <peter.chen@kernel.org>
Link: https://lore.kernel.org/r/20250614125645.207732-2-xu.yang_2@nxp.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/chipidea/udc.c   | 5 +++++
 include/linux/usb/chipidea.h | 1 +
 2 files changed, 6 insertions(+)

(limited to 'include')

diff --git a/drivers/usb/chipidea/udc.c b/drivers/usb/chipidea/udc.c
index 8a9b31fd5c89..8ec9817d3311 100644
--- a/drivers/usb/chipidea/udc.c
+++ b/drivers/usb/chipidea/udc.c
@@ -1970,6 +1970,11 @@ static int ci_udc_pullup(struct usb_gadget *_gadget, int is_on)
 		hw_write(ci, OP_USBCMD, USBCMD_RS, USBCMD_RS);
 	else
 		hw_write(ci, OP_USBCMD, USBCMD_RS, 0);
+
+	if (ci->platdata->notify_event) {
+		_gadget->connected = is_on;
+		ci->platdata->notify_event(ci, CI_HDRC_CONTROLLER_PULLUP_EVENT);
+	}
 	pm_runtime_put_sync(ci->dev);
 
 	return 0;
diff --git a/include/linux/usb/chipidea.h b/include/linux/usb/chipidea.h
index ebdfef124b2b..e17ebeee24e3 100644
--- a/include/linux/usb/chipidea.h
+++ b/include/linux/usb/chipidea.h
@@ -72,6 +72,7 @@ struct ci_hdrc_platform_data {
 #define CI_HDRC_IMX_HSIC_ACTIVE_EVENT		2
 #define CI_HDRC_IMX_HSIC_SUSPEND_EVENT		3
 #define CI_HDRC_CONTROLLER_VBUS_EVENT		4
+#define CI_HDRC_CONTROLLER_PULLUP_EVENT		5
 	int	(*notify_event) (struct ci_hdrc *ci, unsigned event);
 	struct regulator	*reg_vbus;
 	struct usb_otg_caps	ci_otg_caps;
-- 
cgit v1.2.3


From 48ea23115887c12e53335ca2eddc0d0e3d99e5d9 Mon Sep 17 00:00:00 2001
From: RubenKelevra <rubenkelevra@gmail.com>
Date: Wed, 18 Jun 2025 18:47:43 +0200
Subject: include: fsl_devices.h: drop unused, misspelled
 FLS_USB2_WORKAROUND_ENGCM09152

The macro was introduced in commit 69cb1ec4ce4d ("mxc_udc: add
workaround for ENGcm09152 for i.MX35") on 2010-10-15, but its prefix was
misspelled as **FLS_** instead of the usual **FSL_**.

Its last in-tree user disappeared with commit a390bef7db1f ("usb:
gadget: fsl_mxc_udc: Remove the driver") on 2020-12-10, so the macro has
been completely unused since then.

Remove the dead and wrongly named definition.

Signed-off-by: RubenKelevra <rubenkelevra@gmail.com>
Link: https://lore.kernel.org/r/20250618164743.1916838-1-rubenkelevra@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/fsl_devices.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/fsl_devices.h b/include/linux/fsl_devices.h
index 5d231ce8709b..49f20c2f99bf 100644
--- a/include/linux/fsl_devices.h
+++ b/include/linux/fsl_devices.h
@@ -118,7 +118,6 @@ struct fsl_usb2_platform_data {
 #define FSL_USB2_PORT0_ENABLED	0x00000001
 #define FSL_USB2_PORT1_ENABLED	0x00000002
 
-#define FLS_USB2_WORKAROUND_ENGCM09152	(1 << 0)
 
 struct spi_device;
 
-- 
cgit v1.2.3


From 3b18405763c1ebb1efc15feef5563c9cdb2cc3a7 Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Wed, 11 Jun 2025 14:14:15 +0300
Subject: usb: acpi: fix device link removal

The device link to the USB4 host interface has to be removed
manually since it's no longer auto removed.

Fixes: 623dae3e7084 ("usb: acpi: fix boot hang due to early incorrect 'tunneled' USB3 device links")
Cc: stable <stable@kernel.org>
Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Link: https://lore.kernel.org/r/20250611111415.2707865-1-heikki.krogerus@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/hub.c      | 3 +++
 drivers/usb/core/usb-acpi.c | 4 +++-
 include/linux/usb.h         | 2 ++
 3 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index 5c12dfdef569..6bb6e92cb0a4 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -2368,6 +2368,9 @@ void usb_disconnect(struct usb_device **pdev)
 	usb_remove_ep_devs(&udev->ep0);
 	usb_unlock_device(udev);
 
+	if (udev->usb4_link)
+		device_link_del(udev->usb4_link);
+
 	/* Unregister the device.  The device driver is responsible
 	 * for de-configuring the device and invoking the remove-device
 	 * notifier chain (used by usbfs and possibly others).
diff --git a/drivers/usb/core/usb-acpi.c b/drivers/usb/core/usb-acpi.c
index ea1ce8beb0cb..489dbdc96f94 100644
--- a/drivers/usb/core/usb-acpi.c
+++ b/drivers/usb/core/usb-acpi.c
@@ -157,7 +157,7 @@ EXPORT_SYMBOL_GPL(usb_acpi_set_power_state);
  */
 static int usb_acpi_add_usb4_devlink(struct usb_device *udev)
 {
-	const struct device_link *link;
+	struct device_link *link;
 	struct usb_port *port_dev;
 	struct usb_hub *hub;
 
@@ -188,6 +188,8 @@ static int usb_acpi_add_usb4_devlink(struct usb_device *udev)
 	dev_dbg(&port_dev->dev, "Created device link from %s to %s\n",
 		dev_name(&port_dev->child->dev), dev_name(nhi_fwnode->dev));
 
+	udev->usb4_link = link;
+
 	return 0;
 }
 
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 1b2545b4363b..92c752f5446f 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -614,6 +614,7 @@ struct usb3_lpm_parameters {
  *	FIXME -- complete doc
  * @authenticated: Crypto authentication passed
  * @tunnel_mode: Connection native or tunneled over USB4
+ * @usb4_link: device link to the USB4 host interface
  * @lpm_capable: device supports LPM
  * @lpm_devinit_allow: Allow USB3 device initiated LPM, exit latency is in range
  * @usb2_hw_lpm_capable: device can perform USB2 hardware LPM
@@ -724,6 +725,7 @@ struct usb_device {
 	unsigned reset_resume:1;
 	unsigned port_is_suspended:1;
 	enum usb_link_tunnel_mode tunnel_mode;
+	struct device_link *usb4_link;
 
 	int slot_id;
 	struct usb2_lpm_parameters l1_params;
-- 
cgit v1.2.3


From d574c5dc8cfe1fd1ddda6edb435f3b3f39155c52 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Sun, 8 Jun 2025 16:46:54 +0100
Subject: serial: Remove unused uart_get_console

uart_get_console() has been unused since 2019's
commit bd0d9d159988 ("serial: remove ks8695 driver")

Remove it, and its associated docs.

Signed-off-by: "Dr. David Alan Gilbert" <linux@treblig.org>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Reviewed-by: Jiri Slaby <jirislaby@kernel.org>
Link: https://lore.kernel.org/r/20250608154654.73994-1-linux@treblig.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/driver-api/serial/driver.rst |  7 +++----
 drivers/tty/serial/serial_core.c           | 27 ---------------------------
 include/linux/serial_core.h                |  2 --
 3 files changed, 3 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/Documentation/driver-api/serial/driver.rst b/Documentation/driver-api/serial/driver.rst
index fa1ebfcd4472..c1db6a1a67c4 100644
--- a/Documentation/driver-api/serial/driver.rst
+++ b/Documentation/driver-api/serial/driver.rst
@@ -24,9 +24,8 @@ console support.
 Console Support
 ---------------
 
-The serial core provides a few helper functions.  This includes identifying
-the correct port structure (via uart_get_console()) and decoding command line
-arguments (uart_parse_options()).
+The serial core provides a few helper functions.  This includes
+decoding command line arguments (uart_parse_options()).
 
 There is also a helper function (uart_console_write()) which performs a
 character by character write, translating newlines to CRLF sequences.
@@ -76,7 +75,7 @@ Other functions
            uart_add_one_port uart_remove_one_port uart_console_write
            uart_parse_earlycon uart_parse_options uart_set_options
            uart_get_lsr_info uart_handle_dcd_change uart_handle_cts_change
-           uart_try_toggle_sysrq uart_get_console
+           uart_try_toggle_sysrq
 
 .. kernel-doc:: include/linux/serial_core.h
    :identifiers: uart_port_tx_limited uart_port_tx
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index d6485714eb0f..de23e45f55e5 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -2131,33 +2131,6 @@ void uart_console_write(struct uart_port *port, const char *s,
 }
 EXPORT_SYMBOL_GPL(uart_console_write);
 
-/**
- * uart_get_console - get uart port for console
- * @ports: ports to search in
- * @nr: number of @ports
- * @co: console to search for
- * Returns: uart_port for the console @co
- *
- * Check whether an invalid uart number has been specified (as @co->index), and
- * if so, search for the first available port that does have console support.
- */
-struct uart_port * __init
-uart_get_console(struct uart_port *ports, int nr, struct console *co)
-{
-	int idx = co->index;
-
-	if (idx < 0 || idx >= nr || (ports[idx].iobase == 0 &&
-				     ports[idx].membase == NULL))
-		for (idx = 0; idx < nr; idx++)
-			if (ports[idx].iobase != 0 ||
-			    ports[idx].membase != NULL)
-				break;
-
-	co->index = idx;
-
-	return ports + idx;
-}
-
 /**
  * uart_parse_earlycon - Parse earlycon options
  * @p:	     ptr to 2nd field (ie., just beyond '<name>,')
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index d65b15449cfe..84b4648ead7e 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -1101,8 +1101,6 @@ static inline bool uart_console_registered(struct uart_port *port)
 	return uart_console(port) && console_is_registered(port->cons);
 }
 
-struct uart_port *uart_get_console(struct uart_port *ports, int nr,
-				   struct console *c);
 int uart_parse_earlycon(char *p, enum uart_iotype *iotype,
 			resource_size_t *addr, char **options);
 void uart_parse_options(const char *options, int *baud, int *parity, int *bits,
-- 
cgit v1.2.3


From ace9b3daf2b4778358573d3698e34cb1c0fa7e14 Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Date: Thu, 19 Jun 2025 13:56:23 +0300
Subject: ASoC: SOF: ipc4/Intel: Add support for library restore firmware
 functionality

The firmware will be able to only save and restore the context related to
library management.
This means that even without a full context save, the libraries do not
need to be re-loaded to the firmware after second or consecutive boots.

This is reported via the FW_READY notification, where BIT(15) indicates:
0 - the library restore is not done
1 - library restore is done

This bit is only valid if full context save is not enabled; full context
save by definition saves and restores the library related book-keeping
as well.

Add a new flag to tell the platform code if the libraries have been
restored, no need to reload them after boot.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Guennadi Liakhovetski <guennadi.liakhovetski@linux.intel.com>
Reviewed-by: Liam Girdwood <liam.r.girdwood@intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Reviewed-by: Kai Vehmanen <kai.vehmanen@linux.intel.com>
Link: https://patch.msgid.link/20250619105623.4546-3-peter.ujfalusi@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sof/ipc4/header.h  |  2 ++
 sound/soc/sof/intel/hda-loader.c |  7 +++++--
 sound/soc/sof/ipc4-loader.c      |  6 ++++++
 sound/soc/sof/ipc4-priv.h        |  2 ++
 sound/soc/sof/ipc4.c             | 14 ++++++++++++--
 5 files changed, 27 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/sound/sof/ipc4/header.h b/include/sound/sof/ipc4/header.h
index f71d04736d17..e85c7afd85a4 100644
--- a/include/sound/sof/ipc4/header.h
+++ b/include/sound/sof/ipc4/header.h
@@ -498,6 +498,8 @@ struct sof_ipc4_intel_mic_privacy_cap {
 #define SOF_IPC4_LOG_CORE_GET(x)		(((x) & SOF_IPC4_LOG_CORE_MASK) >> \
 						 SOF_IPC4_LOG_CORE_SHIFT)
 
+#define SOF_IPC4_FW_READY_LIB_RESTORED		BIT(15)
+
 /* Value of notification type field - must fit into 8 bits */
 enum sof_ipc4_notification_type {
 	/* Phrase detected (notification from WoV module) */
diff --git a/sound/soc/sof/intel/hda-loader.c b/sound/soc/sof/intel/hda-loader.c
index 49085ca7b46b..2cc11d8b0f70 100644
--- a/sound/soc/sof/intel/hda-loader.c
+++ b/sound/soc/sof/intel/hda-loader.c
@@ -579,8 +579,11 @@ int hda_dsp_ipc4_load_library(struct snd_sof_dev *sdev,
 	struct sof_ipc4_msg msg = {};
 	int ret, ret1;
 
-	/* if IMR booting is enabled and fw context is saved for D3 state, skip the loading */
-	if (reload && hda->booted_from_imr && ipc4_data->fw_context_save)
+	/*
+	 * if IMR booting is enabled and libraries have been restored during fw
+	 * boot, skip the loading
+	 */
+	if (reload && hda->booted_from_imr && ipc4_data->libraries_restored)
 		return 0;
 
 	/* the fw_lib has been verified during loading, we can trust the validity here */
diff --git a/sound/soc/sof/ipc4-loader.c b/sound/soc/sof/ipc4-loader.c
index d2f534d65edf..ee61394e73d7 100644
--- a/sound/soc/sof/ipc4-loader.c
+++ b/sound/soc/sof/ipc4-loader.c
@@ -494,6 +494,12 @@ int sof_ipc4_query_fw_configuration(struct snd_sof_dev *sdev)
 			break;
 		case SOF_IPC4_FW_CONTEXT_SAVE:
 			ipc4_data->fw_context_save = *tuple->value;
+			/*
+			 * Set the default libraries_restored value - if full
+			 * context save is supported then it means that
+			 * libraries are restored
+			 */
+			ipc4_data->libraries_restored = ipc4_data->fw_context_save;
 			break;
 		default:
 			break;
diff --git a/sound/soc/sof/ipc4-priv.h b/sound/soc/sof/ipc4-priv.h
index 76dc54a2f07d..45e9b78432f7 100644
--- a/sound/soc/sof/ipc4-priv.h
+++ b/sound/soc/sof/ipc4-priv.h
@@ -73,6 +73,7 @@ struct sof_ipc4_fw_library {
  * @max_libs_count: Maximum number of libraries support by the FW including the
  *		    base firmware
  * @fw_context_save: Firmware supports full context save and restore
+ * @libraries_restored: The libraries have been retained during firmware boot
  *
  * @load_library: Callback function for platform dependent library loading
  * @pipeline_state_mutex: Mutex to protect pipeline triggers, ref counts, states and deletion
@@ -88,6 +89,7 @@ struct sof_ipc4_fw_data {
 	int max_num_pipelines;
 	u32 max_libs_count;
 	bool fw_context_save;
+	bool libraries_restored;
 
 	int (*load_library)(struct snd_sof_dev *sdev,
 			    struct sof_ipc4_fw_library *fw_lib, bool reload);
diff --git a/sound/soc/sof/ipc4.c b/sound/soc/sof/ipc4.c
index 37e837b22ac8..0ba0e8e615ae 100644
--- a/sound/soc/sof/ipc4.c
+++ b/sound/soc/sof/ipc4.c
@@ -576,9 +576,19 @@ EXPORT_SYMBOL(sof_ipc4_find_debug_slot_offset_by_type);
 
 static int ipc4_fw_ready(struct snd_sof_dev *sdev, struct sof_ipc4_msg *ipc4_msg)
 {
-	/* no need to re-check version/ABI for subsequent boots */
-	if (!sdev->first_boot)
+	if (!sdev->first_boot) {
+		struct sof_ipc4_fw_data *ipc4_data = sdev->private;
+
+		/*
+		 * After the initial boot only check if the libraries have been
+		 * restored when full context save is not enabled
+		 */
+		if (!ipc4_data->fw_context_save)
+			ipc4_data->libraries_restored = !!(ipc4_msg->primary &
+							   SOF_IPC4_FW_READY_LIB_RESTORED);
+
 		return 0;
+	}
 
 	sof_ipc4_create_exception_debugfs_node(sdev);
 
-- 
cgit v1.2.3


From f12b45862c4dcb9c2937b83ed730e473b9a76cbf Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen <anna-maria@linutronix.de>
Date: Mon, 19 May 2025 10:33:19 +0200
Subject: timekeeping: Introduce timekeeper ID

As long as there is only a single timekeeper, there is no need to clarify
which timekeeper is used. But with the upcoming reusage of the timekeeper
infrastructure for auxiliary clock timekeepers, an ID is required to
differentiate.

Introduce an enum for timekeeper IDs, introduce a field in struct timekeeper
to store this timekeeper ID and also add its initialization. The id struct
field is added at the end of the second cacheline, as there is a 4 byte hole
anyway.

Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <jstultz@google.com>
Link: https://lore.kernel.org/all/20250519083025.842476378@linutronix.de
---
 include/linux/timekeeper_internal.h | 14 +++++++++++++-
 kernel/time/timekeeping.c           |  5 +++--
 2 files changed, 16 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index 785048a3b3e6..bfcecad0e279 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -11,6 +11,16 @@
 #include <linux/jiffies.h>
 #include <linux/time.h>
 
+/**
+ * timekeeper_ids - IDs for various time keepers in the kernel
+ * @TIMEKEEPER_CORE:	The central core timekeeper managing system time
+ * @TIMEKEEPERS_MAX:	The maximum number of timekeepers managed
+ */
+enum timekeeper_ids {
+	TIMEKEEPER_CORE,
+	TIMEKEEPERS_MAX,
+};
+
 /**
  * struct tk_read_base - base structure for timekeeping readout
  * @clock:	Current clocksource used for timekeeping.
@@ -52,6 +62,7 @@ struct tk_read_base {
  * @offs_boot:			Offset clock monotonic -> clock boottime
  * @offs_tai:			Offset clock monotonic -> clock tai
  * @coarse_nsec:		The nanoseconds part for coarse time getters
+ * @id:				The timekeeper ID
  * @tkr_raw:			The readout base structure for CLOCK_MONOTONIC_RAW
  * @raw_sec:			CLOCK_MONOTONIC_RAW  time in seconds
  * @clock_was_set_seq:		The sequence number of clock was set events
@@ -101,7 +112,7 @@ struct tk_read_base {
  * which results in the following cacheline layout:
  *
  * 0:	seqcount, tkr_mono
- * 1:	xtime_sec ... coarse_nsec
+ * 1:	xtime_sec ... id
  * 2:	tkr_raw, raw_sec
  * 3,4: Internal variables
  *
@@ -123,6 +134,7 @@ struct timekeeper {
 	ktime_t			offs_boot;
 	ktime_t			offs_tai;
 	u32			coarse_nsec;
+	enum timekeeper_ids	id;
 
 	/* Cacheline 2: */
 	struct tk_read_base	tkr_raw;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index fb1da87a92f1..f4692fc2ea6b 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1663,10 +1663,11 @@ read_persistent_wall_and_boot_offset(struct timespec64 *wall_time,
 	*boot_offset = ns_to_timespec64(local_clock());
 }
 
-static __init void tkd_basic_setup(struct tk_data *tkd)
+static __init void tkd_basic_setup(struct tk_data *tkd, enum timekeeper_ids tk_id)
 {
 	raw_spin_lock_init(&tkd->lock);
 	seqcount_raw_spinlock_init(&tkd->seq, &tkd->lock);
+	tkd->timekeeper.id = tkd->shadow_timekeeper.id = tk_id;
 }
 
 /*
@@ -1696,7 +1697,7 @@ void __init timekeeping_init(void)
 	struct timekeeper *tks = &tk_core.shadow_timekeeper;
 	struct clocksource *clock;
 
-	tkd_basic_setup(&tk_core);
+	tkd_basic_setup(&tk_core, TIMEKEEPER_CORE);
 
 	read_persistent_wall_and_boot_offset(&wall_time, &boot_offset);
 	if (timespec64_valid_settod(&wall_time) &&
-- 
cgit v1.2.3


From 9094c72c3d81bf2416b7c79d12c8494ab8fbac20 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen <anna-maria@linutronix.de>
Date: Mon, 19 May 2025 10:33:20 +0200
Subject: time: Introduce auxiliary POSIX clocks

To support auxiliary timekeeping and the related user space interfaces,
it's required to define a clock ID range for them.

Reserve 8 auxiliary clock IDs after the regular timekeeping clock ID space.

This is the maximum number of auxiliary clocks the kernel can support. The actual
number of supported clocks obviously depends on the presence of related devices
and might be constrained by the available VDSO space.

Add the corresponding timekeeper IDs as well.

Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <jstultz@google.com>
Link: https://lore.kernel.org/all/20250519083025.905800695@linutronix.de
---
 include/linux/timekeeper_internal.h | 10 ++++++++--
 include/uapi/linux/time.h           | 11 +++++++++++
 kernel/time/Kconfig                 | 15 +++++++++++++--
 3 files changed, 32 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index bfcecad0e279..4201ae818f57 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -13,11 +13,17 @@
 
 /**
  * timekeeper_ids - IDs for various time keepers in the kernel
- * @TIMEKEEPER_CORE:	The central core timekeeper managing system time
- * @TIMEKEEPERS_MAX:	The maximum number of timekeepers managed
+ * @TIMEKEEPER_CORE:		The central core timekeeper managing system time
+ * @TIMEKEEPER_AUX_FIRST:	The first AUX timekeeper
+ * @TIMEKEEPER_AUX_LAST:	The last AUX timekeeper
+ * @TIMEKEEPERS_MAX:		The maximum number of timekeepers managed
  */
 enum timekeeper_ids {
 	TIMEKEEPER_CORE,
+#ifdef CONFIG_POSIX_AUX_CLOCKS
+	TIMEKEEPER_AUX_FIRST,
+	TIMEKEEPER_AUX_LAST = TIMEKEEPER_AUX_FIRST + MAX_AUX_CLOCKS - 1,
+#endif
 	TIMEKEEPERS_MAX,
 };
 
diff --git a/include/uapi/linux/time.h b/include/uapi/linux/time.h
index 4f4b6e48e01c..16ca1ac206fd 100644
--- a/include/uapi/linux/time.h
+++ b/include/uapi/linux/time.h
@@ -64,6 +64,17 @@ struct timezone {
 #define CLOCK_TAI			11
 
 #define MAX_CLOCKS			16
+
+/*
+ * AUX clock support. AUXiliary clocks are dynamically configured by
+ * enabling a clock ID. These clock can be steered independently of the
+ * core timekeeper. The kernel can support up to 8 auxiliary clocks, but
+ * the actual limit depends on eventual architecture constraints vs. VDSO.
+ */
+#define CLOCK_AUX			MAX_CLOCKS
+#define MAX_AUX_CLOCKS			8
+#define CLOCK_AUX_LAST			(CLOCK_AUX + MAX_AUX_CLOCKS - 1)
+
 #define CLOCKS_MASK			(CLOCK_REALTIME | CLOCK_MONOTONIC)
 #define CLOCKS_MONO			CLOCK_MONOTONIC
 
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index b0b97a60aaa6..7c6a52f7836c 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -82,9 +82,9 @@ config CONTEXT_TRACKING_IDLE
 	help
 	  Tracks idle state on behalf of RCU.
 
-if GENERIC_CLOCKEVENTS
 menu "Timers subsystem"
 
+if GENERIC_CLOCKEVENTS
 # Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is
 # only related to the tick functionality. Oneshot clockevent devices
 # are supported independent of this.
@@ -208,6 +208,17 @@ config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
 	  interval and NTP's maximum frequency drift of 500 parts
 	  per million.	If the clocksource is good enough for NTP,
 	  it is good enough for the clocksource watchdog!
+endif
+
+config POSIX_AUX_CLOCKS
+	bool "Enable auxiliary POSIX clocks"
+	depends on POSIX_TIMERS
+	help
+	  Auxiliary POSIX clocks are clocks which can be steered
+	  independently of the core timekeeper, which controls the
+	  MONOTONIC, REALTIME, BOOTTIME and TAI clocks.  They are useful to
+	  provide e.g. lockless time accessors to independent PTP clocks
+	  and other clock domains, which are not correlated to the TAI/NTP
+	  notion of time.
 
 endmenu
-endif
-- 
cgit v1.2.3


From 6168024604236cb2bb1004ea8459c8ece2c4ef5f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 19 May 2025 10:33:27 +0200
Subject: timekeeping: Add clock_valid flag to timekeeper

In preparation for supporting independent auxiliary timekeepers, add a
clock valid field and set it to true for the system timekeeper.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <jstultz@google.com>
Link: https://lore.kernel.org/all/20250519083026.287145536@linutronix.de
---
 include/linux/timekeeper_internal.h | 2 ++
 kernel/time/timekeeping.c           | 5 +++--
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index 4201ae818f57..1690eda1c7c3 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -73,6 +73,7 @@ struct tk_read_base {
  * @raw_sec:			CLOCK_MONOTONIC_RAW  time in seconds
  * @clock_was_set_seq:		The sequence number of clock was set events
  * @cs_was_changed_seq:		The sequence number of clocksource change events
+ * @clock_valid:		Indicator for valid clock
  * @monotonic_to_boot:		CLOCK_MONOTONIC to CLOCK_BOOTTIME offset
  * @cycle_interval:		Number of clock cycles in one NTP interval
  * @xtime_interval:		Number of clock shifted nano seconds in one NTP
@@ -149,6 +150,7 @@ struct timekeeper {
 	/* Cachline 3 and 4 (timekeeping internal variables): */
 	unsigned int		clock_was_set_seq;
 	u8			cs_was_changed_seq;
+	u8			clock_valid;
 
 	struct timespec64	monotonic_to_boot;
 
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e3c1a1c1d8c5..bf59bacc97db 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1665,11 +1665,12 @@ read_persistent_wall_and_boot_offset(struct timespec64 *wall_time,
 	*boot_offset = ns_to_timespec64(local_clock());
 }
 
-static __init void tkd_basic_setup(struct tk_data *tkd, enum timekeeper_ids tk_id)
+static __init void tkd_basic_setup(struct tk_data *tkd, enum timekeeper_ids tk_id, bool valid)
 {
 	raw_spin_lock_init(&tkd->lock);
 	seqcount_raw_spinlock_init(&tkd->seq, &tkd->lock);
 	tkd->timekeeper.id = tkd->shadow_timekeeper.id = tk_id;
+	tkd->timekeeper.clock_valid = tkd->shadow_timekeeper.clock_valid = valid;
 }
 
 /*
@@ -1699,7 +1700,7 @@ void __init timekeeping_init(void)
 	struct timekeeper *tks = &tk_core.shadow_timekeeper;
 	struct clocksource *clock;
 
-	tkd_basic_setup(&tk_core, TIMEKEEPER_CORE);
+	tkd_basic_setup(&tk_core, TIMEKEEPER_CORE, true);
 
 	read_persistent_wall_and_boot_offset(&wall_time, &boot_offset);
 	if (timespec64_valid_settod(&wall_time) &&
-- 
cgit v1.2.3


From 75215c972581d3934e76a57690cf838d7ceab399 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 18 Jun 2025 22:53:38 +0200
Subject: pidfs: move to anonymous struct

Move the pidfs entries to an anonymous struct.

Link: https://lore.kernel.org/20250618-work-pidfs-persistent-v2-4-98f3456fd552@kernel.org
Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/pid.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 453ae6d8a68d..00646a692dd4 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -52,14 +52,15 @@ struct upid {
 	struct pid_namespace *ns;
 };
 
-struct pid
-{
+struct pid {
 	refcount_t count;
 	unsigned int level;
 	spinlock_t lock;
-	struct dentry *stashed;
-	u64 ino;
-	struct rb_node pidfs_node;
+	struct {
+		u64 ino;
+		struct rb_node pidfs_node;
+		struct dentry *stashed;
+	};
 	/* lists of tasks that use this pid */
 	struct hlist_head tasks[PIDTYPE_MAX];
 	struct hlist_head inodes;
-- 
cgit v1.2.3


From 8ec7c826d97b390879df2a03dfb035c70af86779 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 18 Jun 2025 22:53:39 +0200
Subject: pidfs: persist information

Persist exit and coredump information independent of whether anyone
currently holds a pidfd for the struct pid.

The current scheme allocated pidfs dentries on-demand repeatedly.
This scheme is reaching its limits as it makes it impossible to pin
information that needs to be available after the task has exited or
coredumped and that should not be lost simply because the pidfd got
closed temporarily. The next opener should still see the stashed
information.

This is also a prerequisite for supporting extended attributes on
pidfds to allow attaching meta information to them.

If someone opens a pidfd for a struct pid a pidfs dentry is allocated
and stashed in pid->stashed. Once the last pidfd for the struct pid is
closed the pidfs dentry is released and removed from pid->stashed.

So if 10 callers create a pidfs dentry for the same struct pid
sequentially, i.e., each closing the pidfd before the other creates a
new one then a new pidfs dentry is allocated every time.

Because multiple tasks acquiring and releasing a pidfd for the same
struct pid can race with one another, a task may still find a valid
pidfs entry from the previous task in pid->stashed and reuse it. Or it
might find a dead dentry in there and fail to reuse it and so stashes a
new pidfs dentry. Multiple tasks may race to stash a new pidfs dentry
but only one will succeed, the other ones will put their dentry.

The current scheme aims to ensure that a pidfs dentry for a struct pid
can only be created if the task is still alive or if a pidfs dentry
already existed before the task was reaped and so exit information has
been stashed in the pidfs inode.

That's great except that it's buggy. If a pidfs dentry is stashed in
pid->stashed after pidfs_exit() but before __unhash_process() is called
we will return a pidfd for a reaped task without exit information being
available.

The pidfs_pid_valid() check does not guard against this race as it
doesn't sync at all with pidfs_exit(). The pid_has_task() check might be
successful simply because we're before __unhash_process() but after
pidfs_exit().

Introduce a new scheme where the lifetime of information associated with
a pidfs entry (coredump and exit information) isn't bound to the
lifetime of the pidfs inode but the struct pid itself.

The first time a pidfs dentry is allocated for a struct pid a struct
pidfs_attr will be allocated which will be used to store exit and
coredump information.

If all pidfds for the pidfs dentry are closed the dentry and inode can be
cleaned up but the struct pidfs_attr will stick until the struct pid
itself is freed. This will ensure minimal memory usage while persisting
relevant information.

The new scheme has various advantages. First, it allows to close the
race where we end up handing out a pidfd for a reaped task for which no
exit information is available. Second, it minimizes memory usage.
Third, it allows to remove complex lifetime tracking via dentries when
registering a struct pid with pidfs. There's no need to get or put a
reference. Instead, the lifetime of exit and coredump information
associated with a struct pid is bound to the lifetime of struct pid
itself.

Link: https://lore.kernel.org/20250618-work-pidfs-persistent-v2-5-98f3456fd552@kernel.org
Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/pidfs.c            | 212 ++++++++++++++++++++++++++++++++++----------------
 include/linux/pid.h   |   3 +
 include/linux/pidfs.h |   1 +
 kernel/pid.c          |   2 +-
 4 files changed, 151 insertions(+), 67 deletions(-)

(limited to 'include')

diff --git a/fs/pidfs.c b/fs/pidfs.c
index ff2560b34ed1..6a907457b1fe 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -25,7 +25,10 @@
 #include "internal.h"
 #include "mount.h"
 
+#define PIDFS_PID_DEAD ERR_PTR(-ESRCH)
+
 static struct kmem_cache *pidfs_cachep __ro_after_init;
+static struct kmem_cache *pidfs_attr_cachep __ro_after_init;
 
 /*
  * Stashes information that userspace needs to access even after the
@@ -37,6 +40,11 @@ struct pidfs_exit_info {
 	__u32 coredump_mask;
 };
 
+struct pidfs_attr {
+	struct pidfs_exit_info __pei;
+	struct pidfs_exit_info *exit_info;
+};
+
 struct pidfs_inode {
 	struct pidfs_exit_info __pei;
 	struct pidfs_exit_info *exit_info;
@@ -125,6 +133,7 @@ void pidfs_add_pid(struct pid *pid)
 
 	pid->ino = pidfs_ino_nr;
 	pid->stashed = NULL;
+	pid->attr = NULL;
 	pidfs_ino_nr++;
 
 	write_seqcount_begin(&pidmap_lock_seq);
@@ -139,6 +148,18 @@ void pidfs_remove_pid(struct pid *pid)
 	write_seqcount_end(&pidmap_lock_seq);
 }
 
+void pidfs_free_pid(struct pid *pid)
+{
+	/*
+	 * Any dentry must've been wiped from the pid by now.
+	 * Otherwise there's a reference count bug.
+	 */
+	VFS_WARN_ON_ONCE(pid->stashed);
+
+	if (!IS_ERR(pid->attr))
+		kfree(pid->attr);
+}
+
 #ifdef CONFIG_PROC_FS
 /**
  * pidfd_show_fdinfo - print information about a pidfd
@@ -261,13 +282,13 @@ static __u32 pidfs_coredump_mask(unsigned long mm_flags)
 static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
-	struct inode *inode = file_inode(file);
 	struct pid *pid = pidfd_pid(file);
 	size_t usize = _IOC_SIZE(cmd);
 	struct pidfd_info kinfo = {};
 	struct pidfs_exit_info *exit_info;
 	struct user_namespace *user_ns;
 	struct task_struct *task;
+	struct pidfs_attr *attr;
 	const struct cred *c;
 	__u64 mask;
 
@@ -286,8 +307,9 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
 	if (!pid_in_current_pidns(pid))
 		return -ESRCH;
 
+	attr = READ_ONCE(pid->attr);
 	if (mask & PIDFD_INFO_EXIT) {
-		exit_info = READ_ONCE(pidfs_i(inode)->exit_info);
+		exit_info = READ_ONCE(attr->exit_info);
 		if (exit_info) {
 			kinfo.mask |= PIDFD_INFO_EXIT;
 #ifdef CONFIG_CGROUPS
@@ -300,7 +322,7 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
 
 	if (mask & PIDFD_INFO_COREDUMP) {
 		kinfo.mask |= PIDFD_INFO_COREDUMP;
-		kinfo.coredump_mask = READ_ONCE(pidfs_i(inode)->__pei.coredump_mask);
+		kinfo.coredump_mask = READ_ONCE(attr->__pei.coredump_mask);
 	}
 
 	task = get_pid_task(pid, PIDTYPE_PID);
@@ -552,41 +574,61 @@ struct pid *pidfd_pid(const struct file *file)
  * task has been reaped which cannot happen until we're out of
  * release_task().
  *
- * If this struct pid is referred to by a pidfd then
- * stashed_dentry_get() will return the dentry and inode for that struct
- * pid. Since we've taken a reference on it there's now an additional
- * reference from the exit path on it. Which is fine. We're going to put
- * it again in a second and we know that the pid is kept alive anyway.
+ * If this struct pid has at least once been referred to by a pidfd then
+ * pid->attr will be allocated. If not we mark the struct pid as dead so
+ * anyone who is trying to register it with pidfs will fail to do so.
+ * Otherwise we would hand out pidfs for reaped tasks without having
+ * exit information available.
  *
- * Worst case is that we've filled in the info and immediately free the
- * dentry and inode afterwards since the pidfd has been closed. Since
+ * Worst case is that we've filled in the info and the pid gets freed
+ * right away in free_pid() when no one holds a pidfd anymore. Since
  * pidfs_exit() currently is placed after exit_task_work() we know that
- * it cannot be us aka the exiting task holding a pidfd to ourselves.
+ * it cannot be us aka the exiting task holding a pidfd to itself.
  */
 void pidfs_exit(struct task_struct *tsk)
 {
-	struct dentry *dentry;
+	struct pid *pid = task_pid(tsk);
+	struct pidfs_attr *attr;
+	struct pidfs_exit_info *exit_info;
+#ifdef CONFIG_CGROUPS
+	struct cgroup *cgrp;
+#endif
 
 	might_sleep();
 
-	dentry = stashed_dentry_get(&task_pid(tsk)->stashed);
-	if (dentry) {
-		struct inode *inode = d_inode(dentry);
-		struct pidfs_exit_info *exit_info = &pidfs_i(inode)->__pei;
-#ifdef CONFIG_CGROUPS
-		struct cgroup *cgrp;
+	guard(spinlock_irq)(&pid->wait_pidfd.lock);
+	attr = pid->attr;
+	if (!attr) {
+		/*
+		 * No one ever held a pidfd for this struct pid.
+		 * Mark it as dead so no one can add a pidfs
+		 * entry anymore. We're about to be reaped and
+		 * so no exit information would be available.
+		 */
+		pid->attr = PIDFS_PID_DEAD;
+		return;
+	}
 
-		rcu_read_lock();
-		cgrp = task_dfl_cgroup(tsk);
-		exit_info->cgroupid = cgroup_id(cgrp);
-		rcu_read_unlock();
+	/*
+	 * If @pid->attr is set someone might still legitimately hold a
+	 * pidfd to @pid or someone might concurrently still be getting
+	 * a reference to an already stashed dentry from @pid->stashed.
+	 * So defer cleaning @pid->attr until the last reference to @pid
+	 * is put
+	 */
+
+	exit_info = &attr->__pei;
+
+#ifdef CONFIG_CGROUPS
+	rcu_read_lock();
+	cgrp = task_dfl_cgroup(tsk);
+	exit_info->cgroupid = cgroup_id(cgrp);
+	rcu_read_unlock();
 #endif
-		exit_info->exit_code = tsk->exit_code;
+	exit_info->exit_code = tsk->exit_code;
 
-		/* Ensure that PIDFD_GET_INFO sees either all or nothing. */
-		smp_store_release(&pidfs_i(inode)->exit_info, &pidfs_i(inode)->__pei);
-		dput(dentry);
-	}
+	/* Ensure that PIDFD_GET_INFO sees either all or nothing. */
+	smp_store_release(&attr->exit_info, &attr->__pei);
 }
 
 #ifdef CONFIG_COREDUMP
@@ -594,16 +636,15 @@ void pidfs_coredump(const struct coredump_params *cprm)
 {
 	struct pid *pid = cprm->pid;
 	struct pidfs_exit_info *exit_info;
-	struct dentry *dentry;
-	struct inode *inode;
+	struct pidfs_attr *attr;
 	__u32 coredump_mask = 0;
 
-	dentry = pid->stashed;
-	if (WARN_ON_ONCE(!dentry))
-		return;
+	attr = READ_ONCE(pid->attr);
 
-	inode = d_inode(dentry);
-	exit_info = &pidfs_i(inode)->__pei;
+	VFS_WARN_ON_ONCE(!attr);
+	VFS_WARN_ON_ONCE(attr == PIDFS_PID_DEAD);
+
+	exit_info = &attr->__pei;
 	/* Note how we were coredumped. */
 	coredump_mask = pidfs_coredump_mask(cprm->mm_flags);
 	/* Note that we actually did coredump. */
@@ -663,7 +704,7 @@ static struct inode *pidfs_alloc_inode(struct super_block *sb)
 
 static void pidfs_free_inode(struct inode *inode)
 {
-	kmem_cache_free(pidfs_cachep, pidfs_i(inode));
+	kfree(pidfs_i(inode));
 }
 
 static const struct super_operations pidfs_sops = {
@@ -831,8 +872,13 @@ static inline bool pidfs_pid_valid(struct pid *pid, const struct path *path,
 	 * recorded and published can be handled correctly.
 	 */
 	if (unlikely(!pid_has_task(pid, type))) {
-		struct inode *inode = d_inode(path->dentry);
-		return !!READ_ONCE(pidfs_i(inode)->exit_info);
+		struct pidfs_attr *attr;
+
+		attr = READ_ONCE(pid->attr);
+		if (!attr)
+			return false;
+		if (!READ_ONCE(attr->exit_info))
+			return false;
 	}
 
 	return true;
@@ -878,9 +924,67 @@ static void pidfs_put_data(void *data)
 	put_pid(pid);
 }
 
+/**
+ * pidfs_register_pid - register a struct pid in pidfs
+ * @pid: pid to pin
+ *
+ * Register a struct pid in pidfs. Needs to be paired with
+ * pidfs_put_pid() to not risk leaking the pidfs dentry and inode.
+ *
+ * Return: On success zero, on error a negative error code is returned.
+ */
+int pidfs_register_pid(struct pid *pid)
+{
+	struct pidfs_attr *new_attr __free(kfree) = NULL;
+	struct pidfs_attr *attr;
+
+	might_sleep();
+
+	if (!pid)
+		return 0;
+
+	attr = READ_ONCE(pid->attr);
+	if (unlikely(attr == PIDFS_PID_DEAD))
+		return PTR_ERR(PIDFS_PID_DEAD);
+	if (attr)
+		return 0;
+
+	new_attr = kmem_cache_zalloc(pidfs_attr_cachep, GFP_KERNEL);
+	if (!new_attr)
+		return -ENOMEM;
+
+	/* Synchronize with pidfs_exit(). */
+	guard(spinlock_irq)(&pid->wait_pidfd.lock);
+
+	attr = pid->attr;
+	if (unlikely(attr == PIDFS_PID_DEAD))
+		return PTR_ERR(PIDFS_PID_DEAD);
+	if (unlikely(attr))
+		return 0;
+
+	pid->attr = no_free_ptr(new_attr);
+	return 0;
+}
+
+static struct dentry *pidfs_stash_dentry(struct dentry **stashed,
+					 struct dentry *dentry)
+{
+	int ret;
+	struct pid *pid = d_inode(dentry)->i_private;
+
+	VFS_WARN_ON_ONCE(stashed != &pid->stashed);
+
+	ret = pidfs_register_pid(pid);
+	if (ret)
+		return ERR_PTR(ret);
+
+	return stash_dentry(stashed, dentry);
+}
+
 static const struct stashed_operations pidfs_stashed_ops = {
-	.init_inode = pidfs_init_inode,
-	.put_data = pidfs_put_data,
+	.stash_dentry	= pidfs_stash_dentry,
+	.init_inode	= pidfs_init_inode,
+	.put_data	= pidfs_put_data,
 };
 
 static int pidfs_init_fs_context(struct fs_context *fc)
@@ -936,33 +1040,6 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
 	return pidfd_file;
 }
 
-/**
- * pidfs_register_pid - register a struct pid in pidfs
- * @pid: pid to pin
- *
- * Register a struct pid in pidfs. Needs to be paired with
- * pidfs_put_pid() to not risk leaking the pidfs dentry and inode.
- *
- * Return: On success zero, on error a negative error code is returned.
- */
-int pidfs_register_pid(struct pid *pid)
-{
-	struct path path __free(path_put) = {};
-	int ret;
-
-	might_sleep();
-
-	if (!pid)
-		return 0;
-
-	ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
-	if (unlikely(ret))
-		return ret;
-	/* Keep the dentry and only put the reference to the mount. */
-	path.dentry = NULL;
-	return 0;
-}
-
 /**
  * pidfs_get_pid - pin a struct pid through pidfs
  * @pid: pid to pin
@@ -1008,6 +1085,9 @@ void __init pidfs_init(void)
 					 (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
 					  SLAB_ACCOUNT | SLAB_PANIC),
 					 pidfs_inode_init_once);
+	pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), 0,
+					 (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
+					  SLAB_ACCOUNT | SLAB_PANIC), NULL);
 	pidfs_mnt = kern_mount(&pidfs_type);
 	if (IS_ERR(pidfs_mnt))
 		panic("Failed to mount pidfs pseudo filesystem");
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 00646a692dd4..003a1027d219 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -47,6 +47,8 @@
 
 #define RESERVED_PIDS 300
 
+struct pidfs_attr;
+
 struct upid {
 	int nr;
 	struct pid_namespace *ns;
@@ -60,6 +62,7 @@ struct pid {
 		u64 ino;
 		struct rb_node pidfs_node;
 		struct dentry *stashed;
+		struct pidfs_attr *attr;
 	};
 	/* lists of tasks that use this pid */
 	struct hlist_head tasks[PIDTYPE_MAX];
diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h
index 77e7db194914..8f6ed59bb3fb 100644
--- a/include/linux/pidfs.h
+++ b/include/linux/pidfs.h
@@ -16,5 +16,6 @@ extern const struct dentry_operations pidfs_dentry_operations;
 int pidfs_register_pid(struct pid *pid);
 void pidfs_get_pid(struct pid *pid);
 void pidfs_put_pid(struct pid *pid);
+void pidfs_free_pid(struct pid *pid);
 
 #endif /* _LINUX_PID_FS_H */
diff --git a/kernel/pid.c b/kernel/pid.c
index 8317bcbc7cf7..07db7d8d066c 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -100,7 +100,7 @@ void put_pid(struct pid *pid)
 
 	ns = pid->numbers[pid->level].ns;
 	if (refcount_dec_and_test(&pid->count)) {
-		WARN_ON_ONCE(pid->stashed);
+		pidfs_free_pid(pid);
 		kmem_cache_free(ns->pid_cachep, pid);
 		put_pid_ns(ns);
 	}
-- 
cgit v1.2.3


From 804d6794497e6f3992d156e07d01e22b037ce09e Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 18 Jun 2025 22:53:42 +0200
Subject: pidfs: remove pidfs_{get,put}_pid()

Now that we stash persistent information in struct pid there's no need
to play volatile games with pinning struct pid via dentries in pidfs.

Link: https://lore.kernel.org/20250618-work-pidfs-persistent-v2-8-98f3456fd552@kernel.org
Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/coredump.c         |  6 ------
 fs/pidfs.c            | 35 +----------------------------------
 include/linux/pidfs.h |  2 --
 net/unix/af_unix.c    |  5 -----
 4 files changed, 1 insertion(+), 47 deletions(-)

(limited to 'include')

diff --git a/fs/coredump.c b/fs/coredump.c
index f217ebf2b3b6..55d6a713a0fb 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -898,12 +898,6 @@ void do_coredump(const kernel_siginfo_t *siginfo)
 		retval = kernel_connect(socket, (struct sockaddr *)(&addr),
 					addr_len, O_NONBLOCK | SOCK_COREDUMP);
 
-		/*
-		 * ... Make sure to only put our reference after connect() took
-		 * its own reference keeping the pidfs entry alive ...
-		 */
-		pidfs_put_pid(cprm.pid);
-
 		if (retval) {
 			if (retval == -EAGAIN)
 				coredump_report_failure("Coredump socket %s receive queue full", addr.sun_path);
diff --git a/fs/pidfs.c b/fs/pidfs.c
index c49c53d6ae51..bc2342cf4492 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -895,8 +895,7 @@ static void pidfs_put_data(void *data)
  * pidfs_register_pid - register a struct pid in pidfs
  * @pid: pid to pin
  *
- * Register a struct pid in pidfs. Needs to be paired with
- * pidfs_put_pid() to not risk leaking the pidfs dentry and inode.
+ * Register a struct pid in pidfs.
  *
  * Return: On success zero, on error a negative error code is returned.
  */
@@ -1007,38 +1006,6 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
 	return pidfd_file;
 }
 
-/**
- * pidfs_get_pid - pin a struct pid through pidfs
- * @pid: pid to pin
- *
- * Similar to pidfs_register_pid() but only valid if the caller knows
- * there's a reference to the @pid through a dentry already that can't
- * go away.
- */
-void pidfs_get_pid(struct pid *pid)
-{
-	if (!pid)
-		return;
-	WARN_ON_ONCE(!stashed_dentry_get(&pid->stashed));
-}
-
-/**
- * pidfs_put_pid - drop a pidfs reference
- * @pid: pid to drop
- *
- * Drop a reference to @pid via pidfs. This is only safe if the
- * reference has been taken via pidfs_get_pid().
- */
-void pidfs_put_pid(struct pid *pid)
-{
-	might_sleep();
-
-	if (!pid)
-		return;
-	VFS_WARN_ON_ONCE(!pid->stashed);
-	dput(pid->stashed);
-}
-
 void __init pidfs_init(void)
 {
 	pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), 0,
diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h
index 8f6ed59bb3fb..3e08c33da2df 100644
--- a/include/linux/pidfs.h
+++ b/include/linux/pidfs.h
@@ -14,8 +14,6 @@ void pidfs_coredump(const struct coredump_params *cprm);
 #endif
 extern const struct dentry_operations pidfs_dentry_operations;
 int pidfs_register_pid(struct pid *pid);
-void pidfs_get_pid(struct pid *pid);
-void pidfs_put_pid(struct pid *pid);
 void pidfs_free_pid(struct pid *pid);
 
 #endif /* _LINUX_PID_FS_H */
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 2e2e9997a68e..129388c309b0 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -646,9 +646,6 @@ static void unix_sock_destructor(struct sock *sk)
 		return;
 	}
 
-	if (sk->sk_peer_pid)
-		pidfs_put_pid(sk->sk_peer_pid);
-
 	if (u->addr)
 		unix_release_addr(u->addr);
 
@@ -769,7 +766,6 @@ static void drop_peercred(struct unix_peercred *peercred)
 	swap(peercred->peer_pid, pid);
 	swap(peercred->peer_cred, cred);
 
-	pidfs_put_pid(pid);
 	put_pid(pid);
 	put_cred(cred);
 }
@@ -802,7 +798,6 @@ static void copy_peercred(struct sock *sk, struct sock *peersk)
 
 	spin_lock(&sk->sk_peer_lock);
 	sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
-	pidfs_get_pid(sk->sk_peer_pid);
 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
 	spin_unlock(&sk->sk_peer_lock);
 }
-- 
cgit v1.2.3


From 180d8b4ce91fe0cf7a9cb236bb01f14587ba4bf0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 19 May 2025 10:33:32 +0200
Subject: timekeeping: Add AUX offset to struct timekeeper

This offset will be used in the time getters of auxiliary clocks. It is
added to the "monotonic" clock readout.

As auxiliary clocks do not utilize the offset fields of the core time
keeper, this is just an alias for offs_tai, so that the cache line layout
stays the same.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <jstultz@google.com>
Link: https://lore.kernel.org/all/20250519083026.533486349@linutronix.de
---
 include/linux/timekeeper_internal.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index 1690eda1c7c3..ca79938b62f3 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -67,6 +67,7 @@ struct tk_read_base {
  * @offs_real:			Offset clock monotonic -> clock realtime
  * @offs_boot:			Offset clock monotonic -> clock boottime
  * @offs_tai:			Offset clock monotonic -> clock tai
+ * @offs_aux:			Offset clock monotonic -> clock AUX
  * @coarse_nsec:		The nanoseconds part for coarse time getters
  * @id:				The timekeeper ID
  * @tkr_raw:			The readout base structure for CLOCK_MONOTONIC_RAW
@@ -113,6 +114,9 @@ struct tk_read_base {
  * @monotonic_to_boottime is a timespec64 representation of @offs_boot to
  * accelerate the VDSO update for CLOCK_BOOTTIME.
  *
+ * @offs_aux is used by the auxiliary timekeepers which do not utilize any
+ * of the regular timekeeper offset fields.
+ *
  * The cacheline ordering of the structure is optimized for in kernel usage of
  * the ktime_get() and ktime_get_ts64() family of time accessors. Struct
  * timekeeper is prepended in the core timekeeping code with a sequence count,
@@ -139,7 +143,10 @@ struct timekeeper {
 	struct timespec64	wall_to_monotonic;
 	ktime_t			offs_real;
 	ktime_t			offs_boot;
-	ktime_t			offs_tai;
+	union {
+		ktime_t		offs_tai;
+		ktime_t		offs_aux;
+	};
 	u32			coarse_nsec;
 	enum timekeeper_ids	id;
 
-- 
cgit v1.2.3


From 75cabb46935b6de8e2bdfde563e460ac41cfff12 Mon Sep 17 00:00:00 2001
From: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
Date: Tue, 17 Jun 2025 00:17:34 -0700
Subject: net: mana: Add support for net_shaper_ops

Introduce support for net_shaper_ops in the MANA driver,
enabling configuration of rate limiting on the MANA NIC.

To apply rate limiting, the driver issues a HWC command via
mana_set_bw_clamp() and updates the corresponding shaper object
in the net_shaper cache. If an error occurs during this process,
the driver restores the previous speed by querying the current link
configuration using mana_query_link_cfg().

The minimum supported bandwidth is 100 Mbps, and only values that are
exact multiples of 100 Mbps are allowed. Any other values are rejected.

To remove a shaper, the driver resets the bandwidth to the maximum
supported by the SKU using mana_set_bw_clamp() and clears the
associated cache entry. If an error occurs during this process,
the shaper details are retained.

On hardware that does not support these APIs, the net-shaper
calls to set the speed fail.

Set the speed:
./tools/net/ynl/pyynl/cli.py \
 --spec Documentation/netlink/specs/net_shaper.yaml \
 --do set --json '{"ifindex":'$IFINDEX',
		   "handle":{"scope": "netdev", "id":'$ID' },
		   "bw-max": 200000000 }'

Get the shaper details:
./tools/net/ynl/pyynl/cli.py \
 --spec Documentation/netlink/specs/net_shaper.yaml \
 --do get --json '{"ifindex":'$IFINDEX',
		      "handle":{"scope": "netdev", "id":'$ID' }}'

> {'bw-max': 200000000,
> 'handle': {'scope': 'netdev'},
> 'ifindex': $IFINDEX,
> 'metric': 'bps'}

Delete the shaper object:
./tools/net/ynl/pyynl/cli.py \
 --spec Documentation/netlink/specs/net_shaper.yaml \
 --do delete --json '{"ifindex":'$IFINDEX',
		      "handle":{"scope": "netdev","id":'$ID' }}'

Signed-off-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Reviewed-by: Shradha Gupta <shradhagupta@linux.microsoft.com>
Reviewed-by: Saurabh Singh Sengar <ssengar@linux.microsoft.com>
Reviewed-by: Long Li <longli@microsoft.com>
Link: https://patch.msgid.link/1750144656-2021-3-git-send-email-ernis@linux.microsoft.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/ethernet/microsoft/mana/mana_en.c | 155 ++++++++++++++++++++++++++
 include/net/mana/mana.h                       |  40 +++++++
 2 files changed, 195 insertions(+)

(limited to 'include')

diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index bcc33ea7aca3..547dff450b6d 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -719,6 +719,78 @@ out:
 	return err;
 }
 
+static int mana_shaper_set(struct net_shaper_binding *binding,
+			   const struct net_shaper *shaper,
+			   struct netlink_ext_ack *extack)
+{
+	struct mana_port_context *apc = netdev_priv(binding->netdev);
+	u32 old_speed, rate;
+	int err;
+
+	if (shaper->handle.scope != NET_SHAPER_SCOPE_NETDEV) {
+		NL_SET_ERR_MSG_MOD(extack, "net shaper scope should be netdev");
+		return -EINVAL;
+	}
+
+	if (apc->handle.id && shaper->handle.id != apc->handle.id) {
+		NL_SET_ERR_MSG_MOD(extack, "Cannot create multiple shapers");
+		return -EOPNOTSUPP;
+	}
+
+	if (!shaper->bw_max || (shaper->bw_max % 100000000)) {
+		NL_SET_ERR_MSG_MOD(extack, "Please use multiples of 100Mbps for bandwidth");
+		return -EINVAL;
+	}
+
+	rate = div_u64(shaper->bw_max, 1000); /* Convert bps to Kbps */
+	rate = div_u64(rate, 1000);	      /* Convert Kbps to Mbps */
+
+	/* Get current speed */
+	err = mana_query_link_cfg(apc);
+	old_speed = (err) ? SPEED_UNKNOWN : apc->speed;
+
+	if (!err) {
+		err = mana_set_bw_clamp(apc, rate, TRI_STATE_TRUE);
+		apc->speed = (err) ? old_speed : rate;
+		apc->handle = (err) ? apc->handle : shaper->handle;
+	}
+
+	return err;
+}
+
+static int mana_shaper_del(struct net_shaper_binding *binding,
+			   const struct net_shaper_handle *handle,
+			   struct netlink_ext_ack *extack)
+{
+	struct mana_port_context *apc = netdev_priv(binding->netdev);
+	int err;
+
+	err = mana_set_bw_clamp(apc, 0, TRI_STATE_FALSE);
+
+	if (!err) {
+		/* Reset mana port context parameters */
+		apc->handle.id = 0;
+		apc->handle.scope = NET_SHAPER_SCOPE_UNSPEC;
+		apc->speed = 0;
+	}
+
+	return err;
+}
+
+static void mana_shaper_cap(struct net_shaper_binding *binding,
+			    enum net_shaper_scope scope,
+			    unsigned long *flags)
+{
+	*flags = BIT(NET_SHAPER_A_CAPS_SUPPORT_BW_MAX) |
+		 BIT(NET_SHAPER_A_CAPS_SUPPORT_METRIC_BPS);
+}
+
+static const struct net_shaper_ops mana_shaper_ops = {
+	.set = mana_shaper_set,
+	.delete = mana_shaper_del,
+	.capabilities = mana_shaper_cap,
+};
+
 static const struct net_device_ops mana_devops = {
 	.ndo_open		= mana_open,
 	.ndo_stop		= mana_close,
@@ -729,6 +801,7 @@ static const struct net_device_ops mana_devops = {
 	.ndo_bpf		= mana_bpf,
 	.ndo_xdp_xmit		= mana_xdp_xmit,
 	.ndo_change_mtu		= mana_change_mtu,
+	.net_shaper_ops         = &mana_shaper_ops,
 };
 
 static void mana_cleanup_port_context(struct mana_port_context *apc)
@@ -1162,6 +1235,86 @@ out:
 	return err;
 }
 
+int mana_query_link_cfg(struct mana_port_context *apc)
+{
+	struct net_device *ndev = apc->ndev;
+	struct mana_query_link_config_resp resp = {};
+	struct mana_query_link_config_req req = {};
+	int err;
+
+	mana_gd_init_req_hdr(&req.hdr, MANA_QUERY_LINK_CONFIG,
+			     sizeof(req), sizeof(resp));
+
+	req.vport = apc->port_handle;
+	req.hdr.resp.msg_version = GDMA_MESSAGE_V2;
+
+	err = mana_send_request(apc->ac, &req, sizeof(req), &resp,
+				sizeof(resp));
+
+	if (err) {
+		netdev_err(ndev, "Failed to query link config: %d\n", err);
+		return err;
+	}
+
+	err = mana_verify_resp_hdr(&resp.hdr, MANA_QUERY_LINK_CONFIG,
+				   sizeof(resp));
+
+	if (err || resp.hdr.status) {
+		netdev_err(ndev, "Failed to query link config: %d, 0x%x\n", err,
+			   resp.hdr.status);
+		if (!err)
+			err = -EOPNOTSUPP;
+		return err;
+	}
+
+	if (resp.qos_unconfigured) {
+		err = -EINVAL;
+		return err;
+	}
+	apc->speed = resp.link_speed_mbps;
+	return 0;
+}
+
+int mana_set_bw_clamp(struct mana_port_context *apc, u32 speed,
+		      int enable_clamping)
+{
+	struct mana_set_bw_clamp_resp resp = {};
+	struct mana_set_bw_clamp_req req = {};
+	struct net_device *ndev = apc->ndev;
+	int err;
+
+	mana_gd_init_req_hdr(&req.hdr, MANA_SET_BW_CLAMP,
+			     sizeof(req), sizeof(resp));
+	req.vport = apc->port_handle;
+	req.link_speed_mbps = speed;
+	req.enable_clamping = enable_clamping;
+
+	err = mana_send_request(apc->ac, &req, sizeof(req), &resp,
+				sizeof(resp));
+
+	if (err) {
+		netdev_err(ndev, "Failed to set bandwidth clamp for speed %u, err = %d",
+			   speed, err);
+		return err;
+	}
+
+	err = mana_verify_resp_hdr(&resp.hdr, MANA_SET_BW_CLAMP,
+				   sizeof(resp));
+
+	if (err || resp.hdr.status) {
+		netdev_err(ndev, "Failed to set bandwidth clamp: %d, 0x%x\n", err,
+			   resp.hdr.status);
+		if (!err)
+			err = -EOPNOTSUPP;
+		return err;
+	}
+
+	if (resp.qos_unconfigured)
+		netdev_info(ndev, "QoS is unconfigured\n");
+
+	return 0;
+}
+
 int mana_create_wq_obj(struct mana_port_context *apc,
 		       mana_handle_t vport,
 		       u32 wq_type, struct mana_obj_spec *wq_spec,
@@ -3011,6 +3164,8 @@ static int mana_probe_port(struct mana_context *ac, int port_idx,
 		goto free_indir;
 	}
 
+	debugfs_create_u32("current_speed", 0400, apc->mana_port_debugfs, &apc->speed);
+
 	return 0;
 
 free_indir:
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index 4176edf1be71..038b18340e51 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -5,6 +5,7 @@
 #define _MANA_H
 
 #include <net/xdp.h>
+#include <net/net_shaper.h>
 
 #include "gdma.h"
 #include "hw_channel.h"
@@ -526,7 +527,12 @@ struct mana_port_context {
 	struct mutex vport_mutex;
 	int vport_use_count;
 
+	/* Net shaper handle*/
+	struct net_shaper_handle handle;
+
 	u16 port_idx;
+	/* Currently configured speed (mbps) */
+	u32 speed;
 
 	bool port_is_up;
 	bool port_st_save; /* Saved port state */
@@ -562,6 +568,9 @@ struct bpf_prog *mana_xdp_get(struct mana_port_context *apc);
 void mana_chn_setxdp(struct mana_port_context *apc, struct bpf_prog *prog);
 int mana_bpf(struct net_device *ndev, struct netdev_bpf *bpf);
 void mana_query_gf_stats(struct mana_port_context *apc);
+int mana_query_link_cfg(struct mana_port_context *apc);
+int mana_set_bw_clamp(struct mana_port_context *apc, u32 speed,
+		      int enable_clamping);
 void mana_query_phy_stats(struct mana_port_context *apc);
 int mana_pre_alloc_rxbufs(struct mana_port_context *apc, int mtu, int num_queues);
 void mana_pre_dealloc_rxbufs(struct mana_port_context *apc);
@@ -589,6 +598,8 @@ enum mana_command_code {
 	MANA_FENCE_RQ		= 0x20006,
 	MANA_CONFIG_VPORT_RX	= 0x20007,
 	MANA_QUERY_VPORT_CONFIG	= 0x20008,
+	MANA_QUERY_LINK_CONFIG	= 0x2000A,
+	MANA_SET_BW_CLAMP	= 0x2000B,
 	MANA_QUERY_PHY_STAT     = 0x2000c,
 
 	/* Privileged commands for the PF mode */
@@ -598,6 +609,35 @@ enum mana_command_code {
 	MANA_DEREGISTER_HW_PORT	= 0x28004,
 };
 
+/* Query Link Configuration*/
+struct mana_query_link_config_req {
+	struct gdma_req_hdr hdr;
+	mana_handle_t vport;
+}; /* HW DATA */
+
+struct mana_query_link_config_resp {
+	struct gdma_resp_hdr hdr;
+	u32 qos_speed_mbps;
+	u8 qos_unconfigured;
+	u8 reserved1[3];
+	u32 link_speed_mbps;
+	u8 reserved2[4];
+}; /* HW DATA */
+
+/* Set Bandwidth Clamp*/
+struct mana_set_bw_clamp_req {
+	struct gdma_req_hdr hdr;
+	mana_handle_t vport;
+	enum TRI_STATE enable_clamping;
+	u32 link_speed_mbps;
+}; /* HW DATA */
+
+struct mana_set_bw_clamp_resp {
+	struct gdma_resp_hdr hdr;
+	u8 qos_unconfigured;
+	u8 reserved[7];
+}; /* HW DATA */
+
 /* Query Device Configuration */
 struct mana_query_device_cfg_req {
 	struct gdma_req_hdr hdr;
-- 
cgit v1.2.3


From a6d5edf11e0cf5a4650f1d353d20ec29de093813 Mon Sep 17 00:00:00 2001
From: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
Date: Tue, 17 Jun 2025 00:17:35 -0700
Subject: net: mana: Add speed support in mana_get_link_ksettings

Allow mana ethtool get_link_ksettings operation to report
the maximum speed supported by the SKU in mbps.

The driver retrieves this information by issuing a
HWC command to the hardware via mana_query_link_cfg(),
which retrieves the SKU's maximum supported speed.

When these APIs are invoked on older hardware that does not
support them, the speed is reported as UNKNOWN.

Before:
$ethtool enP30832s1
> Settings for enP30832s1:
        Supported ports: [  ]
        Supported link modes:   Not reported
        Supported pause frame use: No
        Supports auto-negotiation: No
        Supported FEC modes: Not reported
        Advertised link modes:  Not reported
        Advertised pause frame use: No
        Advertised auto-negotiation: No
        Advertised FEC modes: Not reported
        Speed: Unknown!
        Duplex: Full
        Auto-negotiation: off
        Port: Other
        PHYAD: 0
        Transceiver: internal
        Link detected: yes

After:
$ethtool enP30832s1
> Settings for enP30832s1:
        Supported ports: [  ]
        Supported link modes:   Not reported
        Supported pause frame use: No
        Supports auto-negotiation: No
        Supported FEC modes: Not reported
        Advertised link modes:  Not reported
        Advertised pause frame use: No
        Advertised auto-negotiation: No
        Advertised FEC modes: Not reported
        Speed: 16000Mb/s
        Duplex: Full
        Auto-negotiation: off
        Port: Other
        PHYAD: 0
        Transceiver: internal
        Link detected: yes

Signed-off-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Reviewed-by: Shradha Gupta <shradhagupta@linux.microsoft.com>
Reviewed-by: Saurabh Singh Sengar <ssengar@linux.microsoft.com>
Reviewed-by: Long Li <longli@microsoft.com>
Link: https://patch.msgid.link/1750144656-2021-4-git-send-email-ernis@linux.microsoft.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/ethernet/microsoft/mana/mana_en.c      | 1 +
 drivers/net/ethernet/microsoft/mana/mana_ethtool.c | 6 ++++++
 include/net/mana/mana.h                            | 2 ++
 3 files changed, 9 insertions(+)

(limited to 'include')

diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 547dff450b6d..d7079e05dfb8 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -1272,6 +1272,7 @@ int mana_query_link_cfg(struct mana_port_context *apc)
 		return err;
 	}
 	apc->speed = resp.link_speed_mbps;
+	apc->max_speed = resp.qos_speed_mbps;
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
index 4fb3a04994a2..a1afa75a9463 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
@@ -495,6 +495,12 @@ out:
 static int mana_get_link_ksettings(struct net_device *ndev,
 				   struct ethtool_link_ksettings *cmd)
 {
+	struct mana_port_context *apc = netdev_priv(ndev);
+	int err;
+
+	err = mana_query_link_cfg(apc);
+	cmd->base.speed = (err) ? SPEED_UNKNOWN : apc->max_speed;
+
 	cmd->base.duplex = DUPLEX_FULL;
 	cmd->base.port = PORT_OTHER;
 
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index 038b18340e51..e1030a7d2daa 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -533,6 +533,8 @@ struct mana_port_context {
 	u16 port_idx;
 	/* Currently configured speed (mbps) */
 	u32 speed;
+	/* Maximum speed supported by the SKU (mbps) */
+	u32 max_speed;
 
 	bool port_is_up;
 	bool port_st_save; /* Saved port state */
-- 
cgit v1.2.3


From ca8ac489ca33c986ff02ee14c3e1c10b86355428 Mon Sep 17 00:00:00 2001
From: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
Date: Tue, 17 Jun 2025 00:17:36 -0700
Subject: net: mana: Handle unsupported HWC commands

If any of the HWC commands are not recognized by the
underlying hardware, the hardware returns the response
header status of -1. Log the information using
netdev_info_once to avoid multiple error logs in dmesg.

Signed-off-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Reviewed-by: Shradha Gupta <shradhagupta@linux.microsoft.com>
Reviewed-by: Saurabh Singh Sengar <ssengar@linux.microsoft.com>
Reviewed-by: Dipayaan Roy <dipayanroy@linux.microsoft.com>
Link: https://patch.msgid.link/1750144656-2021-5-git-send-email-ernis@linux.microsoft.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/ethernet/microsoft/mana/hw_channel.c |  4 ++++
 drivers/net/ethernet/microsoft/mana/mana_en.c    | 11 +++++++++++
 include/net/mana/gdma.h                          |  1 +
 3 files changed, 16 insertions(+)

(limited to 'include')

diff --git a/drivers/net/ethernet/microsoft/mana/hw_channel.c b/drivers/net/ethernet/microsoft/mana/hw_channel.c
index 3d3677c0d014..650d22654d49 100644
--- a/drivers/net/ethernet/microsoft/mana/hw_channel.c
+++ b/drivers/net/ethernet/microsoft/mana/hw_channel.c
@@ -891,6 +891,10 @@ int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len,
 	}
 
 	if (ctx->status_code && ctx->status_code != GDMA_STATUS_MORE_ENTRIES) {
+		if (ctx->status_code == GDMA_STATUS_CMD_UNSUPPORTED) {
+			err = -EOPNOTSUPP;
+			goto out;
+		}
 		if (req_msg->req.msg_type != MANA_QUERY_PHY_STAT)
 			dev_err(hwc->dev, "HWC: Failed hw_channel req: 0x%x\n",
 				ctx->status_code);
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index d7079e05dfb8..5aee7bda1504 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -847,6 +847,9 @@ static int mana_send_request(struct mana_context *ac, void *in_buf,
 	err = mana_gd_send_request(gc, in_len, in_buf, out_len,
 				   out_buf);
 	if (err || resp->status) {
+		if (err == -EOPNOTSUPP)
+			return err;
+
 		if (req->req.msg_type != MANA_QUERY_PHY_STAT)
 			dev_err(dev, "Failed to send mana message: %d, 0x%x\n",
 				err, resp->status);
@@ -1252,6 +1255,10 @@ int mana_query_link_cfg(struct mana_port_context *apc)
 				sizeof(resp));
 
 	if (err) {
+		if (err == -EOPNOTSUPP) {
+			netdev_info_once(ndev, "MANA_QUERY_LINK_CONFIG not supported\n");
+			return err;
+		}
 		netdev_err(ndev, "Failed to query link config: %d\n", err);
 		return err;
 	}
@@ -1294,6 +1301,10 @@ int mana_set_bw_clamp(struct mana_port_context *apc, u32 speed,
 				sizeof(resp));
 
 	if (err) {
+		if (err == -EOPNOTSUPP) {
+			netdev_info_once(ndev, "MANA_SET_BW_CLAMP not supported\n");
+			return err;
+		}
 		netdev_err(ndev, "Failed to set bandwidth clamp for speed %u, err = %d",
 			   speed, err);
 		return err;
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index 6fe6cbcd512d..92ab85061df0 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -10,6 +10,7 @@
 #include "shm_channel.h"
 
 #define GDMA_STATUS_MORE_ENTRIES	0x00000105
+#define GDMA_STATUS_CMD_UNSUPPORTED	0xffffffff
 
 /* Structures labeled with "HW DATA" are exchanged with the hardware. All of
  * them are naturally aligned and hence don't need __packed.
-- 
cgit v1.2.3


From 62ab7ac5be90392a9ac0955febab778ebf51bc0a Mon Sep 17 00:00:00 2001
From: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Date: Tue, 17 Jun 2025 16:57:55 +0100
Subject: dt-bindings: clock: renesas,r9a09g077: Add PCLKL core clock ID

Add the Peripheral Module Clock L (PCLKL) core clock ID for the RZ/T2H
(R9A09G077) SoC.  This clock is used by peripherals such as IIC, WDT,
and others.

Signed-off-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Link: https://lore.kernel.org/20250617155757.149597-3-prabhakar.mahadev-lad.rj@bp.renesas.com
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
---
 include/dt-bindings/clock/renesas,r9a09g077-cpg-mssr.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/renesas,r9a09g077-cpg-mssr.h b/include/dt-bindings/clock/renesas,r9a09g077-cpg-mssr.h
index 1b22fe88dec7..f6e5f62b07c4 100644
--- a/include/dt-bindings/clock/renesas,r9a09g077-cpg-mssr.h
+++ b/include/dt-bindings/clock/renesas,r9a09g077-cpg-mssr.h
@@ -23,5 +23,6 @@
 #define R9A09G077_CLK_PCLKGPTL		11
 #define R9A09G077_CLK_PCLKH		12
 #define R9A09G077_CLK_PCLKM		13
+#define R9A09G077_CLK_PCLKL		14
 
 #endif /* __DT_BINDINGS_CLOCK_RENESAS_R9A09G077_CPG_H__ */
-- 
cgit v1.2.3


From 292bf6c5b8100ba4e16cd194bdc89785f6fb6f7a Mon Sep 17 00:00:00 2001
From: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Date: Mon, 9 Jun 2025 21:36:51 +0100
Subject: dt-bindings: clock: renesas,cpg-mssr: Document RZ/N2H support

Document support for Module Standby and Software Reset found on the
Renesas RZ/N2H (R9A09G087) SoC.  The Module Standby and Software Reset
IP is similar to that found on the RZ/T2H SoC.

Signed-off-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Acked-by: Conor Dooley <conor.dooley@microchip.com>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Link: https://lore.kernel.org/20250609203656.333138-4-prabhakar.mahadev-lad.rj@bp.renesas.com
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
---
 .../bindings/clock/renesas,cpg-mssr.yaml           |  5 +++-
 .../dt-bindings/clock/renesas,r9a09g087-cpg-mssr.h | 28 ++++++++++++++++++++++
 2 files changed, 32 insertions(+), 1 deletion(-)
 create mode 100644 include/dt-bindings/clock/renesas,r9a09g087-cpg-mssr.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/renesas,cpg-mssr.yaml b/Documentation/devicetree/bindings/clock/renesas,cpg-mssr.yaml
index 708ab6bd7d44..bc2fd3761328 100644
--- a/Documentation/devicetree/bindings/clock/renesas,cpg-mssr.yaml
+++ b/Documentation/devicetree/bindings/clock/renesas,cpg-mssr.yaml
@@ -53,6 +53,7 @@ properties:
       - renesas,r8a779g0-cpg-mssr # R-Car V4H
       - renesas,r8a779h0-cpg-mssr # R-Car V4M
       - renesas,r9a09g077-cpg-mssr # RZ/T2H
+      - renesas,r9a09g087-cpg-mssr # RZ/N2H
 
   reg:
     minItems: 1
@@ -112,7 +113,9 @@ allOf:
       properties:
         compatible:
           contains:
-            const: renesas,r9a09g077-cpg-mssr
+            enum:
+              - renesas,r9a09g077-cpg-mssr
+              - renesas,r9a09g087-cpg-mssr
     then:
       properties:
         reg:
diff --git a/include/dt-bindings/clock/renesas,r9a09g087-cpg-mssr.h b/include/dt-bindings/clock/renesas,r9a09g087-cpg-mssr.h
new file mode 100644
index 000000000000..f28166d6015f
--- /dev/null
+++ b/include/dt-bindings/clock/renesas,r9a09g087-cpg-mssr.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+ *
+ * Copyright (C) 2025 Renesas Electronics Corp.
+ */
+
+#ifndef __DT_BINDINGS_CLOCK_RENESAS_R9A09G087_CPG_H__
+#define __DT_BINDINGS_CLOCK_RENESAS_R9A09G087_CPG_H__
+
+#include <dt-bindings/clock/renesas-cpg-mssr.h>
+
+/* R9A09G087 CPG Core Clocks */
+#define R9A09G087_CLK_CA55C0		0
+#define R9A09G087_CLK_CA55C1		1
+#define R9A09G087_CLK_CA55C2		2
+#define R9A09G087_CLK_CA55C3		3
+#define R9A09G087_CLK_CA55S		4
+#define R9A09G087_CLK_CR52_CPU0		5
+#define R9A09G087_CLK_CR52_CPU1		6
+#define R9A09G087_CLK_CKIO		7
+#define R9A09G087_CLK_PCLKAH		8
+#define R9A09G087_CLK_PCLKAM		9
+#define R9A09G087_CLK_PCLKAL		10
+#define R9A09G087_CLK_PCLKGPTL		11
+#define R9A09G087_CLK_PCLKH		12
+#define R9A09G087_CLK_PCLKM		13
+#define R9A09G087_CLK_PCLKL		14
+
+#endif /* __DT_BINDINGS_CLOCK_RENESAS_R9A09G087_CPG_H__ */
-- 
cgit v1.2.3


From ea92128fe7f6eef6ee5fcaaed521b1b2b5ab7c9a Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Fri, 13 Jun 2025 23:35:13 -0700
Subject: iommufd: Apply obvious cosmetic fixes

Run clang-format but exclude the less obvious changes, which leaves us:
 - Align indentations
 - Add missing spaces
 - Remove unnecessary spaces
 - Remove unnecessary line wrappings

Link: https://patch.msgid.link/r/9132e1ab45690ab1959c66bbb51ac5536a635388.1749882255.git.nicolinc@nvidia.com
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/iommufd/device.c          |  3 +--
 drivers/iommu/iommufd/hw_pagetable.c    |  6 ++----
 drivers/iommu/iommufd/io_pagetable.c    |  3 +--
 drivers/iommu/iommufd/io_pagetable.h    |  2 +-
 drivers/iommu/iommufd/iommufd_private.h |  6 ++----
 drivers/iommu/iommufd/iova_bitmap.c     |  1 -
 drivers/iommu/iommufd/main.c            |  6 ++----
 drivers/iommu/iommufd/pages.c           |  9 ++++-----
 drivers/iommu/iommufd/selftest.c        | 24 +++++++++++-------------
 include/linux/iommufd.h                 |  5 +++--
 10 files changed, 27 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index 86244403b532..ed0dc539d490 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -485,8 +485,7 @@ iommufd_device_get_attach_handle(struct iommufd_device *idev, ioasid_t pasid)
 
 	lockdep_assert_held(&idev->igroup->lock);
 
-	handle =
-		iommu_attach_handle_get(idev->igroup->group, pasid, 0);
+	handle = iommu_attach_handle_get(idev->igroup->group, pasid, 0);
 	if (IS_ERR(handle))
 		return NULL;
 	return to_iommufd_handle(handle);
diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
index 487779470261..8565a6f596b2 100644
--- a/drivers/iommu/iommufd/hw_pagetable.c
+++ b/drivers/iommu/iommufd/hw_pagetable.c
@@ -309,10 +309,8 @@ iommufd_viommu_alloc_hwpt_nested(struct iommufd_viommu *viommu, u32 flags,
 	refcount_inc(&viommu->obj.users);
 	hwpt_nested->parent = viommu->hwpt;
 
-	hwpt->domain =
-		viommu->ops->alloc_domain_nested(viommu,
-				flags & ~IOMMU_HWPT_FAULT_ID_VALID,
-				user_data);
+	hwpt->domain = viommu->ops->alloc_domain_nested(
+		viommu, flags & ~IOMMU_HWPT_FAULT_ID_VALID, user_data);
 	if (IS_ERR(hwpt->domain)) {
 		rc = PTR_ERR(hwpt->domain);
 		hwpt->domain = NULL;
diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
index 8a790e597e12..13d010f19ed1 100644
--- a/drivers/iommu/iommufd/io_pagetable.c
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -1410,8 +1410,7 @@ out_unlock:
 }
 
 void iopt_remove_access(struct io_pagetable *iopt,
-			struct iommufd_access *access,
-			u32 iopt_access_list_id)
+			struct iommufd_access *access, u32 iopt_access_list_id)
 {
 	down_write(&iopt->domains_rwsem);
 	down_write(&iopt->iova_rwsem);
diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h
index 10c928a9a463..c115a51d9384 100644
--- a/drivers/iommu/iommufd/io_pagetable.h
+++ b/drivers/iommu/iommufd/io_pagetable.h
@@ -240,7 +240,7 @@ int iopt_area_add_access(struct iopt_area *area, unsigned long start,
 			 unsigned long last, struct page **out_pages,
 			 unsigned int flags);
 void iopt_area_remove_access(struct iopt_area *area, unsigned long start,
-			    unsigned long last);
+			     unsigned long last);
 int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte,
 			 void *data, unsigned long length, unsigned int flags);
 
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 9ccc83341f32..190ceb5dada3 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -266,8 +266,7 @@ struct iommufd_ioas {
 static inline struct iommufd_ioas *iommufd_get_ioas(struct iommufd_ctx *ictx,
 						    u32 id)
 {
-	return container_of(iommufd_get_object(ictx, id,
-					       IOMMUFD_OBJ_IOAS),
+	return container_of(iommufd_get_object(ictx, id, IOMMUFD_OBJ_IOAS),
 			    struct iommufd_ioas, obj);
 }
 
@@ -452,8 +451,7 @@ struct iommufd_access {
 
 int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access);
 void iopt_remove_access(struct io_pagetable *iopt,
-			struct iommufd_access *access,
-			u32 iopt_access_list_id);
+			struct iommufd_access *access, u32 iopt_access_list_id);
 void iommufd_access_destroy_object(struct iommufd_object *obj);
 
 struct iommufd_eventq {
diff --git a/drivers/iommu/iommufd/iova_bitmap.c b/drivers/iommu/iommufd/iova_bitmap.c
index 39a86a4a1d3a..4514575818fc 100644
--- a/drivers/iommu/iommufd/iova_bitmap.c
+++ b/drivers/iommu/iommufd/iova_bitmap.c
@@ -407,7 +407,6 @@ void iova_bitmap_set(struct iova_bitmap *bitmap,
 
 update_indexes:
 	if (unlikely(!iova_bitmap_mapped_range(mapped, iova, length))) {
-
 		/*
 		 * The attempt to advance the base index to @iova
 		 * may fail if it's out of bounds, or pinning the pages
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 3df468f64e7d..347c56ef44d8 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -102,9 +102,8 @@ static int iommufd_object_dec_wait_shortterm(struct iommufd_ctx *ictx,
 		return 0;
 
 	if (wait_event_timeout(ictx->destroy_wait,
-				refcount_read(&to_destroy->shortterm_users) ==
-					0,
-				msecs_to_jiffies(60000)))
+			       refcount_read(&to_destroy->shortterm_users) == 0,
+			       msecs_to_jiffies(60000)))
 		return 0;
 
 	pr_crit("Time out waiting for iommufd object to become free\n");
@@ -539,7 +538,6 @@ static struct miscdevice iommu_misc_dev = {
 	.mode = 0660,
 };
 
-
 static struct miscdevice vfio_misc_dev = {
 	.minor = VFIO_MINOR,
 	.name = "vfio",
diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c
index 3427749bc5ce..cbdde642d2af 100644
--- a/drivers/iommu/iommufd/pages.c
+++ b/drivers/iommu/iommufd/pages.c
@@ -1287,8 +1287,7 @@ static int pfn_reader_first(struct pfn_reader *pfns, struct iopt_pages *pages,
 }
 
 static struct iopt_pages *iopt_alloc_pages(unsigned long start_byte,
-					   unsigned long length,
-					   bool writable)
+					   unsigned long length, bool writable)
 {
 	struct iopt_pages *pages;
 
@@ -1328,7 +1327,7 @@ struct iopt_pages *iopt_alloc_user_pages(void __user *uptr,
 	struct iopt_pages *pages;
 	unsigned long end;
 	void __user *uptr_down =
-		(void __user *) ALIGN_DOWN((uintptr_t)uptr, PAGE_SIZE);
+		(void __user *)ALIGN_DOWN((uintptr_t)uptr, PAGE_SIZE);
 
 	if (check_add_overflow((unsigned long)uptr, length, &end))
 		return ERR_PTR(-EOVERFLOW);
@@ -2111,8 +2110,8 @@ iopt_pages_get_exact_access(struct iopt_pages *pages, unsigned long index,
  * This should be undone through a matching call to iopt_area_remove_access()
  */
 int iopt_area_add_access(struct iopt_area *area, unsigned long start_index,
-			  unsigned long last_index, struct page **out_pages,
-			  unsigned int flags)
+			 unsigned long last_index, struct page **out_pages,
+			 unsigned int flags)
 {
 	struct iopt_pages *pages = area->pages;
 	struct iopt_pages_access *access;
diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c
index 6bd0abf9a641..4d5dca8027b1 100644
--- a/drivers/iommu/iommufd/selftest.c
+++ b/drivers/iommu/iommufd/selftest.c
@@ -1216,9 +1216,8 @@ static int iommufd_test_md_check_refs(struct iommufd_ucmd *ucmd,
 	return 0;
 }
 
-static int iommufd_test_md_check_iotlb(struct iommufd_ucmd *ucmd,
-				       u32 mockpt_id, unsigned int iotlb_id,
-				       u32 iotlb)
+static int iommufd_test_md_check_iotlb(struct iommufd_ucmd *ucmd, u32 mockpt_id,
+				       unsigned int iotlb_id, u32 iotlb)
 {
 	struct mock_iommu_domain_nested *mock_nested;
 	struct iommufd_hw_pagetable *hwpt;
@@ -1491,7 +1490,7 @@ static int iommufd_test_access_pages(struct iommufd_ucmd *ucmd,
 	int rc;
 
 	/* Prevent syzkaller from triggering a WARN_ON in kvzalloc() */
-	if (length > 16*1024*1024)
+	if (length > 16 * 1024 * 1024)
 		return -ENOMEM;
 
 	if (flags & ~(MOCK_FLAGS_ACCESS_WRITE | MOCK_FLAGS_ACCESS_SYZ))
@@ -1508,7 +1507,7 @@ static int iommufd_test_access_pages(struct iommufd_ucmd *ucmd,
 
 	if (flags & MOCK_FLAGS_ACCESS_SYZ)
 		iova = iommufd_test_syz_conv_iova(staccess->access,
-					&cmd->access_pages.iova);
+						  &cmd->access_pages.iova);
 
 	npages = (ALIGN(iova + length, PAGE_SIZE) -
 		  ALIGN_DOWN(iova, PAGE_SIZE)) /
@@ -1584,7 +1583,7 @@ static int iommufd_test_access_rw(struct iommufd_ucmd *ucmd,
 	int rc;
 
 	/* Prevent syzkaller from triggering a WARN_ON in kvzalloc() */
-	if (length > 16*1024*1024)
+	if (length > 16 * 1024 * 1024)
 		return -ENOMEM;
 
 	if (flags & ~(MOCK_ACCESS_RW_WRITE | MOCK_ACCESS_RW_SLOW_PATH |
@@ -1610,7 +1609,7 @@ static int iommufd_test_access_rw(struct iommufd_ucmd *ucmd,
 
 	if (flags & MOCK_FLAGS_ACCESS_SYZ)
 		iova = iommufd_test_syz_conv_iova(staccess->access,
-				&cmd->access_rw.iova);
+						  &cmd->access_rw.iova);
 
 	rc = iommufd_access_rw(staccess->access, iova, tmp, length, flags);
 	if (rc)
@@ -1665,7 +1664,7 @@ static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id,
 		goto out_put;
 	}
 
-	if (copy_from_user(tmp, uptr,DIV_ROUND_UP(max, BITS_PER_BYTE))) {
+	if (copy_from_user(tmp, uptr, DIV_ROUND_UP(max, BITS_PER_BYTE))) {
 		rc = -EFAULT;
 		goto out_free;
 	}
@@ -1701,7 +1700,7 @@ out_put:
 static int iommufd_test_trigger_iopf(struct iommufd_ucmd *ucmd,
 				     struct iommu_test_cmd *cmd)
 {
-	struct iopf_fault event = { };
+	struct iopf_fault event = {};
 	struct iommufd_device *idev;
 
 	idev = iommufd_get_device(ucmd, cmd->trigger_iopf.dev_id);
@@ -1832,8 +1831,7 @@ static int iommufd_test_pasid_attach(struct iommufd_ucmd *ucmd,
 
 	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
 	if (rc)
-		iommufd_device_detach(sobj->idev.idev,
-				      cmd->pasid_attach.pasid);
+		iommufd_device_detach(sobj->idev.idev, cmd->pasid_attach.pasid);
 
 out_sobj:
 	iommufd_put_object(ucmd->ictx, &sobj->obj);
@@ -2004,8 +2002,8 @@ int __init iommufd_test_init(void)
 		goto err_bus;
 
 	rc = iommu_device_register_bus(&mock_iommu.iommu_dev, &mock_ops,
-				  &iommufd_mock_bus_type.bus,
-				  &iommufd_mock_bus_type.nb);
+				       &iommufd_mock_bus_type.bus,
+				       &iommufd_mock_bus_type.nb);
 	if (rc)
 		goto err_sysfs;
 
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index 34b6e6ca4bfa..498c9a768506 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -171,8 +171,9 @@ static inline void iommufd_access_unpin_pages(struct iommufd_access *access,
 {
 }
 
-static inline int iommufd_access_rw(struct iommufd_access *access, unsigned long iova,
-		      void *data, size_t len, unsigned int flags)
+static inline int iommufd_access_rw(struct iommufd_access *access,
+				    unsigned long iova, void *data, size_t len,
+				    unsigned int flags)
 {
 	return -EOPNOTSUPP;
 }
-- 
cgit v1.2.3


From fc9c40e3a4faa09dbd643ae1bdaf8ad006c3bc28 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Fri, 13 Jun 2025 23:35:15 -0700
Subject: iommufd: Use enum iommu_viommu_type for type in struct iommufd_viommu

Replace unsigned int, to make it clear. No functional changes.

The viommu_alloc iommu op will be deprecated, so don't change that.

Link: https://patch.msgid.link/r/6c6ba5c0cd381594f17ae74355872d78d7a022c0.1749882255.git.nicolinc@nvidia.com
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Pranjal Shrivastava <praan@google.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 include/linux/iommufd.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index 498c9a768506..ac98e49e44fe 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -101,7 +101,7 @@ struct iommufd_viommu {
 	struct list_head veventqs;
 	struct rw_semaphore veventqs_rwsem;
 
-	unsigned int type;
+	enum iommu_viommu_type type;
 };
 
 /**
-- 
cgit v1.2.3


From 187f146d5de65d50d90a4f49157d381d8ae32939 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Fri, 13 Jun 2025 23:35:18 -0700
Subject: iommu: Introduce get_viommu_size and viommu_init ops

So far, a vIOMMU object has been allocated by IOMMU driver and initialized
with the driver-level structure, before it returns to the iommufd core for
core-level structure initialization. It has been requiring iommufd core to
expose some core structure/helpers in its driver.c file, which results in a
size increase of this driver module.

Meanwhile, IOMMU drivers are now requiring more vIOMMU-based structures for
some advanced features, such as the existing vDEVICE and a future HW_QUEUE.
Initializing a core-structure later than driver-structure gives for-driver
helpers some trouble, when they are used by IOMMU driver assuming that the
new structure (including core) is fully initialized, for example:

core:	viommu = ops->viommu_alloc();
driver:	// my_viommu is successfully allocated
driver:	my_viommu = iommufd_viommu_alloc(...);
driver:	// This may crash if it reads viommu->ictx
driver:	new = iommufd_new_viommu_helper(my_viommu->core ...);
core:	viommu->ictx = ucmd->ictx;
core:	...

To ease such a condition, allow the IOMMU driver to report the size of its
vIOMMU structure, let the core allocate a vIOMMU object and initialize the
core-level structure first, and then hand it over to the driver to initialize
its driver-level structure.

Thus, this requires two new iommu ops, get_viommu_size and viommu_init, so
iommufd core can communicate with drivers to replace the viommu_alloc op:

core:	viommu = ops->get_viommu_size();
driver:	return VIOMMU_STRUCT_SIZE();
core:	viommu->ictx = ucmd->ictx; // and others
core:	rc = ops->viommu_init();
driver:	// This is safe now as viommu->ictx is inited
driver:	new = iommufd_new_viommu_helper(my_viommu->core ...);
core:	...

This also adds a VIOMMU_STRUCT_SIZE macro, for drivers to use, which would
statically sanitize the driver structure.

Link: https://patch.msgid.link/r/3ab52c5b622dad476c43b1b1f1636c8b902f1692.1749882255.git.nicolinc@nvidia.com
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Pranjal Shrivastava <praan@google.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 include/linux/iommu.h   | 15 +++++++++++++++
 include/linux/iommufd.h |  6 ++++++
 2 files changed, 21 insertions(+)

(limited to 'include')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 156732807994..9be4ff370f1e 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -14,6 +14,7 @@
 #include <linux/err.h>
 #include <linux/of.h>
 #include <linux/iova_bitmap.h>
+#include <uapi/linux/iommufd.h>
 
 #define IOMMU_READ	(1 << 0)
 #define IOMMU_WRITE	(1 << 1)
@@ -596,6 +597,16 @@ iommu_copy_struct_from_full_user_array(void *kdst, size_t kdst_entry_size,
  *		- IOMMU_DOMAIN_DMA: must use a dma domain
  *		- 0: use the default setting
  * @default_domain_ops: the default ops for domains
+ * @get_viommu_size: Get the size of a driver-level vIOMMU structure for a given
+ *                   @dev corresponding to @viommu_type. Driver should return 0
+ *                   if vIOMMU isn't supported accordingly. It is required for
+ *                   driver to use the VIOMMU_STRUCT_SIZE macro to sanitize the
+ *                   driver-level vIOMMU structure related to the core one
+ * @viommu_init: Init the driver-level struct of an iommufd_viommu on a physical
+ *               IOMMU instance @viommu->iommu_dev, as the set of virtualization
+ *               resources shared/passed to user space IOMMU instance. Associate
+ *               it with a nesting @parent_domain. It is required for driver to
+ *               set @viommu->ops pointing to its own viommu_ops
  * @viommu_alloc: Allocate an iommufd_viommu on a physical IOMMU instance behind
  *                the @dev, as the set of virtualization resources shared/passed
  *                to user space IOMMU instance. And associate it with a nesting
@@ -654,6 +665,10 @@ struct iommu_ops {
 
 	int (*def_domain_type)(struct device *dev);
 
+	size_t (*get_viommu_size)(struct device *dev,
+				  enum iommu_viommu_type viommu_type);
+	int (*viommu_init)(struct iommufd_viommu *viommu,
+			   struct iommu_domain *parent_domain);
 	struct iommufd_viommu *(*viommu_alloc)(
 		struct device *dev, struct iommu_domain *parent_domain,
 		struct iommufd_ctx *ictx, unsigned int viommu_type);
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index ac98e49e44fe..423e08963d90 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -229,6 +229,12 @@ static inline int iommufd_viommu_report_event(struct iommufd_viommu *viommu,
 }
 #endif /* CONFIG_IOMMUFD_DRIVER_CORE */
 
+#define VIOMMU_STRUCT_SIZE(drv_struct, member)                                 \
+	(sizeof(drv_struct) +                                                  \
+	 BUILD_BUG_ON_ZERO(offsetof(drv_struct, member)) +                     \
+	 BUILD_BUG_ON_ZERO(!__same_type(struct iommufd_viommu,                 \
+					((drv_struct *)NULL)->member)))
+
 /*
  * Helpers for IOMMU driver to allocate driver structures that will be freed by
  * the iommufd core. The free op will be called prior to freeing the memory.
-- 
cgit v1.2.3


From f842ea208e43066c43e5e91e20fe8ce600df7055 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Fri, 13 Jun 2025 23:35:23 -0700
Subject: iommu: Deprecate viommu_alloc op

To ease the for-driver iommufd APIs, get_viommu_size and viommu_init ops
are introduced. Now, the existing drivers that support vIOMMU implement
these two ops, replacing the viommu_alloc one. So, it has no users left.

Remove it from the headers and the viommu core.

Link: https://patch.msgid.link/r/5b32d4499d7ed02a63e57a293c11b642d226ef8d.1749882255.git.nicolinc@nvidia.com
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Pranjal Shrivastava <praan@google.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/iommufd/viommu.c | 20 +++++---------------
 include/linux/iommu.h          | 11 -----------
 include/linux/iommufd.h        | 18 ------------------
 3 files changed, 5 insertions(+), 44 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
index 27a39f524840..044e3ef06e0f 100644
--- a/drivers/iommu/iommufd/viommu.c
+++ b/drivers/iommu/iommufd/viommu.c
@@ -33,8 +33,6 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd)
 
 	ops = dev_iommu_ops(idev->dev);
 	if (!ops->get_viommu_size || !ops->viommu_init) {
-		if (ops->viommu_alloc)
-			goto get_hwpt_paging;
 		rc = -EOPNOTSUPP;
 		goto out_put_idev;
 	}
@@ -54,7 +52,6 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd)
 		goto out_put_idev;
 	}
 
-get_hwpt_paging:
 	hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id);
 	if (IS_ERR(hwpt_paging)) {
 		rc = PTR_ERR(hwpt_paging);
@@ -66,13 +63,8 @@ get_hwpt_paging:
 		goto out_put_hwpt;
 	}
 
-	if (ops->viommu_alloc)
-		viommu = ops->viommu_alloc(idev->dev,
-					   hwpt_paging->common.domain,
-					   ucmd->ictx, cmd->type);
-	else
-		viommu = (struct iommufd_viommu *)_iommufd_object_alloc(
-			ucmd->ictx, viommu_size, IOMMUFD_OBJ_VIOMMU);
+	viommu = (struct iommufd_viommu *)_iommufd_object_alloc(
+		ucmd->ictx, viommu_size, IOMMUFD_OBJ_VIOMMU);
 	if (IS_ERR(viommu)) {
 		rc = PTR_ERR(viommu);
 		goto out_put_hwpt;
@@ -92,11 +84,9 @@ get_hwpt_paging:
 	 */
 	viommu->iommu_dev = __iommu_get_iommu_dev(idev->dev);
 
-	if (!ops->viommu_alloc) {
-		rc = ops->viommu_init(viommu, hwpt_paging->common.domain);
-		if (rc)
-			goto out_abort;
-	}
+	rc = ops->viommu_init(viommu, hwpt_paging->common.domain);
+	if (rc)
+		goto out_abort;
 
 	/* It is a driver bug that viommu->ops isn't filled */
 	if (WARN_ON_ONCE(!viommu->ops)) {
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 9be4ff370f1e..04548b18df28 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -607,14 +607,6 @@ iommu_copy_struct_from_full_user_array(void *kdst, size_t kdst_entry_size,
  *               resources shared/passed to user space IOMMU instance. Associate
  *               it with a nesting @parent_domain. It is required for driver to
  *               set @viommu->ops pointing to its own viommu_ops
- * @viommu_alloc: Allocate an iommufd_viommu on a physical IOMMU instance behind
- *                the @dev, as the set of virtualization resources shared/passed
- *                to user space IOMMU instance. And associate it with a nesting
- *                @parent_domain. The @viommu_type must be defined in the header
- *                include/uapi/linux/iommufd.h
- *                It is required to call iommufd_viommu_alloc() helper for
- *                a bundled allocation of the core and the driver structures,
- *                using the given @ictx pointer.
  * @pgsize_bitmap: bitmap of all possible supported page sizes
  * @owner: Driver module providing these ops
  * @identity_domain: An always available, always attachable identity
@@ -669,9 +661,6 @@ struct iommu_ops {
 				  enum iommu_viommu_type viommu_type);
 	int (*viommu_init)(struct iommufd_viommu *viommu,
 			   struct iommu_domain *parent_domain);
-	struct iommufd_viommu *(*viommu_alloc)(
-		struct device *dev, struct iommu_domain *parent_domain,
-		struct iommufd_ctx *ictx, unsigned int viommu_type);
 
 	const struct iommu_domain_ops *default_domain_ops;
 	unsigned long pgsize_bitmap;
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index 423e08963d90..bf41b242b9f6 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -234,22 +234,4 @@ static inline int iommufd_viommu_report_event(struct iommufd_viommu *viommu,
 	 BUILD_BUG_ON_ZERO(offsetof(drv_struct, member)) +                     \
 	 BUILD_BUG_ON_ZERO(!__same_type(struct iommufd_viommu,                 \
 					((drv_struct *)NULL)->member)))
-
-/*
- * Helpers for IOMMU driver to allocate driver structures that will be freed by
- * the iommufd core. The free op will be called prior to freeing the memory.
- */
-#define iommufd_viommu_alloc(ictx, drv_struct, member, viommu_ops)             \
-	({                                                                     \
-		drv_struct *ret;                                               \
-									       \
-		static_assert(__same_type(struct iommufd_viommu,               \
-					  ((drv_struct *)NULL)->member));      \
-		static_assert(offsetof(drv_struct, member.obj) == 0);          \
-		ret = (drv_struct *)_iommufd_object_alloc(                     \
-			ictx, sizeof(drv_struct), IOMMUFD_OBJ_VIOMMU);         \
-		if (!IS_ERR(ret))                                              \
-			ret->member.ops = viommu_ops;                          \
-		ret;                                                           \
-	})
 #endif
-- 
cgit v1.2.3


From 17a93473a552fc0ffdfb04e69a26946afd4a046a Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Fri, 13 Jun 2025 23:35:24 -0700
Subject: iommufd: Move _iommufd_object_alloc out of driver.c

Now, all driver structures will be allocated by the core, i.e. drivers no
longer need to call _iommufd_object_alloc. Thus, move it back.

Before:
   text	   data	    bss	    dec	    hex	filename
   3024	    180	      0	   3204	    c84	drivers/iommu/iommufd/driver.o
   9074	    610	     64	   9748	   2614	drivers/iommu/iommufd/main.o
After:
   text	   data	    bss	    dec	    hex	filename
   2665	    164	      0	   2829	    b0d	drivers/iommu/iommufd/driver.o
   9410	    618	     64	  10092	   276c	drivers/iommu/iommufd/main.o

Link: https://patch.msgid.link/r/79e630c7b911930cf36e3c8a775a04e66c528d65.1749882255.git.nicolinc@nvidia.com
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/iommufd/driver.c          | 33 ---------------------------------
 drivers/iommu/iommufd/iommufd_private.h |  4 ++++
 drivers/iommu/iommufd/main.c            | 32 ++++++++++++++++++++++++++++++++
 include/linux/iommufd.h                 | 10 ----------
 4 files changed, 36 insertions(+), 43 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/iommufd/driver.c b/drivers/iommu/iommufd/driver.c
index 922cd1fe7ec2..2fee399a148e 100644
--- a/drivers/iommu/iommufd/driver.c
+++ b/drivers/iommu/iommufd/driver.c
@@ -3,39 +3,6 @@
  */
 #include "iommufd_private.h"
 
-struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
-					     size_t size,
-					     enum iommufd_object_type type)
-{
-	struct iommufd_object *obj;
-	int rc;
-
-	obj = kzalloc(size, GFP_KERNEL_ACCOUNT);
-	if (!obj)
-		return ERR_PTR(-ENOMEM);
-	obj->type = type;
-	/* Starts out bias'd by 1 until it is removed from the xarray */
-	refcount_set(&obj->shortterm_users, 1);
-	refcount_set(&obj->users, 1);
-
-	/*
-	 * Reserve an ID in the xarray but do not publish the pointer yet since
-	 * the caller hasn't initialized it yet. Once the pointer is published
-	 * in the xarray and visible to other threads we can't reliably destroy
-	 * it anymore, so the caller must complete all errorable operations
-	 * before calling iommufd_object_finalize().
-	 */
-	rc = xa_alloc(&ictx->objects, &obj->id, XA_ZERO_ENTRY, xa_limit_31b,
-		      GFP_KERNEL_ACCOUNT);
-	if (rc)
-		goto out_free;
-	return obj;
-out_free:
-	kfree(obj);
-	return ERR_PTR(rc);
-}
-EXPORT_SYMBOL_NS_GPL(_iommufd_object_alloc, "IOMMUFD");
-
 /* Caller should xa_lock(&viommu->vdevs) to protect the return value */
 struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu,
 				       unsigned long vdev_id)
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 32f0631368e1..ec5b499d139c 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -230,6 +230,10 @@ iommufd_object_put_and_try_destroy(struct iommufd_ctx *ictx,
 	iommufd_object_remove(ictx, obj, obj->id, 0);
 }
 
+struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
+					     size_t size,
+					     enum iommufd_object_type type);
+
 #define __iommufd_object_alloc(ictx, ptr, type, obj)                           \
 	container_of(_iommufd_object_alloc(                                    \
 			     ictx,                                             \
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 347c56ef44d8..85ad2853da0b 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -29,6 +29,38 @@ struct iommufd_object_ops {
 static const struct iommufd_object_ops iommufd_object_ops[];
 static struct miscdevice vfio_misc_dev;
 
+struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
+					     size_t size,
+					     enum iommufd_object_type type)
+{
+	struct iommufd_object *obj;
+	int rc;
+
+	obj = kzalloc(size, GFP_KERNEL_ACCOUNT);
+	if (!obj)
+		return ERR_PTR(-ENOMEM);
+	obj->type = type;
+	/* Starts out bias'd by 1 until it is removed from the xarray */
+	refcount_set(&obj->shortterm_users, 1);
+	refcount_set(&obj->users, 1);
+
+	/*
+	 * Reserve an ID in the xarray but do not publish the pointer yet since
+	 * the caller hasn't initialized it yet. Once the pointer is published
+	 * in the xarray and visible to other threads we can't reliably destroy
+	 * it anymore, so the caller must complete all errorable operations
+	 * before calling iommufd_object_finalize().
+	 */
+	rc = xa_alloc(&ictx->objects, &obj->id, XA_ZERO_ENTRY, xa_limit_31b,
+		      GFP_KERNEL_ACCOUNT);
+	if (rc)
+		goto out_free;
+	return obj;
+out_free:
+	kfree(obj);
+	return ERR_PTR(rc);
+}
+
 /*
  * Allow concurrent access to the object.
  *
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index bf41b242b9f6..2d1bf2f97ee3 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -190,9 +190,6 @@ static inline int iommufd_vfio_compat_set_no_iommu(struct iommufd_ctx *ictx)
 #endif /* CONFIG_IOMMUFD */
 
 #if IS_ENABLED(CONFIG_IOMMUFD_DRIVER_CORE)
-struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
-					     size_t size,
-					     enum iommufd_object_type type);
 struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu,
 				       unsigned long vdev_id);
 int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu,
@@ -201,13 +198,6 @@ int iommufd_viommu_report_event(struct iommufd_viommu *viommu,
 				enum iommu_veventq_type type, void *event_data,
 				size_t data_len);
 #else /* !CONFIG_IOMMUFD_DRIVER_CORE */
-static inline struct iommufd_object *
-_iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size,
-		      enum iommufd_object_type type)
-{
-	return ERR_PTR(-EOPNOTSUPP);
-}
-
 static inline struct device *
 iommufd_viommu_find_dev(struct iommufd_viommu *viommu, unsigned long vdev_id)
 {
-- 
cgit v1.2.3


From 2c04d279e857e6c441593c282f978cebc5583fd9 Mon Sep 17 00:00:00 2001
From: Jun Miao <jun.miao@intel.com>
Date: Wed, 18 Jun 2025 13:39:23 -0400
Subject: net: usb: Convert tasklet API to new bottom half workqueue mechanism

Migrate tasklet APIs to the new bottom half workqueue mechanism. It
replaces all occurrences of tasklet usage with the appropriate workqueue
APIs throughout the usbnet driver. This transition ensures compatibility
with the latest design and enhances performance.

Signed-off-by: Jun Miao <jun.miao@intel.com>
Link: https://patch.msgid.link/20250618173923.950510-1-jun.miao@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/usb/usbnet.c   | 36 ++++++++++++++++++------------------
 include/linux/usb/usbnet.h |  2 +-
 2 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
index c04e715a4c2a..9564478a79cc 100644
--- a/drivers/net/usb/usbnet.c
+++ b/drivers/net/usb/usbnet.c
@@ -461,7 +461,7 @@ static enum skb_state defer_bh(struct usbnet *dev, struct sk_buff *skb,
 
 	__skb_queue_tail(&dev->done, skb);
 	if (dev->done.qlen == 1)
-		tasklet_schedule(&dev->bh);
+		queue_work(system_bh_wq, &dev->bh_work);
 	spin_unlock(&dev->done.lock);
 	spin_unlock_irqrestore(&list->lock, flags);
 	return old_state;
@@ -549,7 +549,7 @@ static int rx_submit (struct usbnet *dev, struct urb *urb, gfp_t flags)
 		default:
 			netif_dbg(dev, rx_err, dev->net,
 				  "rx submit, %d\n", retval);
-			tasklet_schedule (&dev->bh);
+			queue_work(system_bh_wq, &dev->bh_work);
 			break;
 		case 0:
 			__usbnet_queue_skb(&dev->rxq, skb, rx_start);
@@ -709,7 +709,7 @@ void usbnet_resume_rx(struct usbnet *dev)
 		num++;
 	}
 
-	tasklet_schedule(&dev->bh);
+	queue_work(system_bh_wq, &dev->bh_work);
 
 	netif_dbg(dev, rx_status, dev->net,
 		  "paused rx queue disabled, %d skbs requeued\n", num);
@@ -778,7 +778,7 @@ void usbnet_unlink_rx_urbs(struct usbnet *dev)
 {
 	if (netif_running(dev->net)) {
 		(void) unlink_urbs (dev, &dev->rxq);
-		tasklet_schedule(&dev->bh);
+		queue_work(system_bh_wq, &dev->bh_work);
 	}
 }
 EXPORT_SYMBOL_GPL(usbnet_unlink_rx_urbs);
@@ -861,14 +861,14 @@ int usbnet_stop (struct net_device *net)
 	/* deferred work (timer, softirq, task) must also stop */
 	dev->flags = 0;
 	timer_delete_sync(&dev->delay);
-	tasklet_kill(&dev->bh);
+	disable_work_sync(&dev->bh_work);
 	cancel_work_sync(&dev->kevent);
 
 	/* We have cyclic dependencies. Those calls are needed
 	 * to break a cycle. We cannot fall into the gaps because
 	 * we have a flag
 	 */
-	tasklet_kill(&dev->bh);
+	disable_work_sync(&dev->bh_work);
 	timer_delete_sync(&dev->delay);
 	cancel_work_sync(&dev->kevent);
 
@@ -955,7 +955,7 @@ int usbnet_open (struct net_device *net)
 	clear_bit(EVENT_RX_KILL, &dev->flags);
 
 	// delay posting reads until we're fully open
-	tasklet_schedule (&dev->bh);
+	queue_work(system_bh_wq, &dev->bh_work);
 	if (info->manage_power) {
 		retval = info->manage_power(dev, 1);
 		if (retval < 0) {
@@ -1123,7 +1123,7 @@ static void __handle_link_change(struct usbnet *dev)
 		 */
 	} else {
 		/* submitting URBs for reading packets */
-		tasklet_schedule(&dev->bh);
+		queue_work(system_bh_wq, &dev->bh_work);
 	}
 
 	/* hard_mtu or rx_urb_size may change during link change */
@@ -1198,11 +1198,11 @@ fail_halt:
 		} else {
 			clear_bit (EVENT_RX_HALT, &dev->flags);
 			if (!usbnet_going_away(dev))
-				tasklet_schedule(&dev->bh);
+				queue_work(system_bh_wq, &dev->bh_work);
 		}
 	}
 
-	/* tasklet could resubmit itself forever if memory is tight */
+	/* work could resubmit itself forever if memory is tight */
 	if (test_bit (EVENT_RX_MEMORY, &dev->flags)) {
 		struct urb	*urb = NULL;
 		int resched = 1;
@@ -1224,7 +1224,7 @@ fail_halt:
 fail_lowmem:
 			if (resched)
 				if (!usbnet_going_away(dev))
-					tasklet_schedule(&dev->bh);
+					queue_work(system_bh_wq, &dev->bh_work);
 		}
 	}
 
@@ -1325,7 +1325,7 @@ void usbnet_tx_timeout (struct net_device *net, unsigned int txqueue)
 	struct usbnet		*dev = netdev_priv(net);
 
 	unlink_urbs (dev, &dev->txq);
-	tasklet_schedule (&dev->bh);
+	queue_work(system_bh_wq, &dev->bh_work);
 	/* this needs to be handled individually because the generic layer
 	 * doesn't know what is sufficient and could not restore private
 	 * information if a remedy of an unconditional reset were used.
@@ -1547,7 +1547,7 @@ static inline void usb_free_skb(struct sk_buff *skb)
 
 /*-------------------------------------------------------------------------*/
 
-// tasklet (work deferred from completions, in_irq) or timer
+// work (work deferred from completions, in_irq) or timer
 
 static void usbnet_bh (struct timer_list *t)
 {
@@ -1601,16 +1601,16 @@ static void usbnet_bh (struct timer_list *t)
 					  "rxqlen %d --> %d\n",
 					  temp, dev->rxq.qlen);
 			if (dev->rxq.qlen < RX_QLEN(dev))
-				tasklet_schedule (&dev->bh);
+				queue_work(system_bh_wq, &dev->bh_work);
 		}
 		if (dev->txq.qlen < TX_QLEN (dev))
 			netif_wake_queue (dev->net);
 	}
 }
 
-static void usbnet_bh_tasklet(struct tasklet_struct *t)
+static void usbnet_bh_work(struct work_struct *work)
 {
-	struct usbnet *dev = from_tasklet(dev, t, bh);
+	struct usbnet *dev = from_work(dev, work, bh_work);
 
 	usbnet_bh(&dev->delay);
 }
@@ -1742,7 +1742,7 @@ usbnet_probe (struct usb_interface *udev, const struct usb_device_id *prod)
 	skb_queue_head_init (&dev->txq);
 	skb_queue_head_init (&dev->done);
 	skb_queue_head_init(&dev->rxq_pause);
-	tasklet_setup(&dev->bh, usbnet_bh_tasklet);
+	INIT_WORK(&dev->bh_work, usbnet_bh_work);
 	INIT_WORK (&dev->kevent, usbnet_deferred_kevent);
 	init_usb_anchor(&dev->deferred);
 	timer_setup(&dev->delay, usbnet_bh, 0);
@@ -1971,7 +1971,7 @@ int usbnet_resume (struct usb_interface *intf)
 
 			if (!(dev->txq.qlen >= TX_QLEN(dev)))
 				netif_tx_wake_all_queues(dev->net);
-			tasklet_schedule (&dev->bh);
+			queue_work(system_bh_wq, &dev->bh_work);
 		}
 	}
 
diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h
index 0b9f1e598e3a..208682f77179 100644
--- a/include/linux/usb/usbnet.h
+++ b/include/linux/usb/usbnet.h
@@ -58,7 +58,7 @@ struct usbnet {
 	unsigned		interrupt_count;
 	struct mutex		interrupt_mutex;
 	struct usb_anchor	deferred;
-	struct tasklet_struct	bh;
+	struct work_struct	bh_work;
 
 	struct work_struct	kevent;
 	unsigned long		flags;
-- 
cgit v1.2.3


From c7d78566bbd30544a0618a6ffbc97bc0ddac7035 Mon Sep 17 00:00:00 2001
From: Nicolas Escande <nico.escande@gmail.com>
Date: Tue, 17 Jun 2025 16:13:34 +0200
Subject: neighbour: add support for NUD_PERMANENT proxy entries

As discussed before in [0], proxy entries (which are more configuration
than runtime data) should stay when the link (carrier) goes down.
This is what happens for regular neighbour entries.

So let's fix this by:
  - storing in proxy entries the fact that it was added as NUD_PERMANENT
  - not removing NUD_PERMANENT proxy entries when the carrier goes down
    (same as how it's done in neigh_flush_dev() for regular neigh entries)

[0]: https://lore.kernel.org/netdev/c584ef7e-6897-01f3-5b80-12b53f7b4bf4@kernel.org/

Signed-off-by: Nicolas Escande <nico.escande@gmail.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20250617141334.3724863-1-nico.escande@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/neighbour.h |  1 +
 net/core/neighbour.c    | 12 +++++++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 9a832cab5b1d..c7ce5ec7be23 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -182,6 +182,7 @@ struct pneigh_entry {
 	netdevice_tracker	dev_tracker;
 	u32			flags;
 	u8			protocol;
+	bool			permanent;
 	u32			key[];
 };
 
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 49dce9a82295..85a5535de8ba 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -54,7 +54,8 @@ static void __neigh_notify(struct neighbour *n, int type, int flags,
 			   u32 pid);
 static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid);
 static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,
-				    struct net_device *dev);
+				    struct net_device *dev,
+				    bool skip_perm);
 
 #ifdef CONFIG_PROC_FS
 static const struct seq_operations neigh_stat_seq_ops;
@@ -423,7 +424,7 @@ static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev,
 {
 	write_lock_bh(&tbl->lock);
 	neigh_flush_dev(tbl, dev, skip_perm);
-	pneigh_ifdown_and_unlock(tbl, dev);
+	pneigh_ifdown_and_unlock(tbl, dev, skip_perm);
 	pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL,
 			   tbl->family);
 	if (skb_queue_empty_lockless(&tbl->proxy_queue))
@@ -803,7 +804,8 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey,
 }
 
 static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,
-				    struct net_device *dev)
+				    struct net_device *dev,
+				    bool skip_perm)
 {
 	struct pneigh_entry *n, **np, *freelist = NULL;
 	u32 h;
@@ -811,12 +813,15 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,
 	for (h = 0; h <= PNEIGH_HASHMASK; h++) {
 		np = &tbl->phash_buckets[h];
 		while ((n = *np) != NULL) {
+			if (skip_perm && n->permanent)
+				goto skip;
 			if (!dev || n->dev == dev) {
 				*np = n->next;
 				n->next = freelist;
 				freelist = n;
 				continue;
 			}
+skip:
 			np = &n->next;
 		}
 	}
@@ -1983,6 +1988,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
 		pn = pneigh_lookup(tbl, net, dst, dev, 1);
 		if (pn) {
 			pn->flags = ndm_flags;
+			pn->permanent = !!(ndm->ndm_state & NUD_PERMANENT);
 			if (protocol)
 				pn->protocol = protocol;
 			err = 0;
-- 
cgit v1.2.3


From 27480a7c8f0274f8f2fc6c40e4522f38e52bd05f Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Wed, 18 Jun 2025 01:32:44 -0700
Subject: net: add dev_dstats_rx_dropped_add() helper

Introduce the dev_dstats_rx_dropped_add() helper to allow incrementing
the rx_drops per-CPU statistic by an arbitrary value, rather than just
one. This is useful for drivers or code paths that need to account for
multiple dropped packets at once, such as when dropping entire queues.

Reviewed-by: Joe Damato <joe@dama.to>
Signed-off-by: Breno Leitao <leitao@debian.org>
Link: https://patch.msgid.link/20250618-netdevsim_stat-v4-3-19fe0d35e28e@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9cbc4e54b7e4..03c26bb0fbbe 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3016,6 +3016,16 @@ static inline void dev_dstats_rx_dropped(struct net_device *dev)
 	u64_stats_update_end(&dstats->syncp);
 }
 
+static inline void dev_dstats_rx_dropped_add(struct net_device *dev,
+					     unsigned int packets)
+{
+	struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats);
+
+	u64_stats_update_begin(&dstats->syncp);
+	u64_stats_add(&dstats->rx_drops, packets);
+	u64_stats_update_end(&dstats->syncp);
+}
+
 static inline void dev_dstats_tx_add(struct net_device *dev,
 				     unsigned int len)
 {
-- 
cgit v1.2.3


From 49c94af071fc6c9f5e1db52b3031dec28daa90c3 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Wed, 18 Jun 2025 10:24:16 -0400
Subject: ref_tracker: have callers pass output function to pr_ostream()

In a later patch, we'll be adding a 3rd mechanism for outputting
ref_tracker info via seq_file. Instead of a conditional, have the caller
set a pointer to an output function in struct ostream. As part of this,
the log prefix must be explicitly passed in, as it's too late for the
pr_fmt macro.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://patch.msgid.link/20250618-reftrack-dbgfs-v15-3-24fc37ead144@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/ref_tracker.h |  2 ++
 lib/ref_tracker.c           | 52 ++++++++++++++++++++++++++++++++-------------
 2 files changed, 39 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/ref_tracker.h b/include/linux/ref_tracker.h
index 8eac4f3d5254..a0a1ee43724f 100644
--- a/include/linux/ref_tracker.h
+++ b/include/linux/ref_tracker.h
@@ -6,6 +6,8 @@
 #include <linux/spinlock.h>
 #include <linux/stackdepot.h>
 
+#define __ostream_printf __printf(2, 3)
+
 struct ref_tracker;
 
 struct ref_tracker_dir {
diff --git a/lib/ref_tracker.c b/lib/ref_tracker.c
index d374e5273e14..42872f406b2a 100644
--- a/lib/ref_tracker.c
+++ b/lib/ref_tracker.c
@@ -63,21 +63,38 @@ ref_tracker_get_stats(struct ref_tracker_dir *dir, unsigned int limit)
 }
 
 struct ostream {
+	void __ostream_printf (*func)(struct ostream *stream, char *fmt, ...);
+	char *prefix;
 	char *buf;
 	int size, used;
 };
 
+static void __ostream_printf pr_ostream_log(struct ostream *stream, char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	vprintk(fmt, args);
+	va_end(args);
+}
+
+static void __ostream_printf pr_ostream_buf(struct ostream *stream, char *fmt, ...)
+{
+	int ret, len = stream->size - stream->used;
+	va_list args;
+
+	va_start(args, fmt);
+	ret = vsnprintf(stream->buf + stream->used, len, fmt, args);
+	va_end(args);
+	if (ret > 0)
+		stream->used += min(ret, len);
+}
+
 #define pr_ostream(stream, fmt, args...) \
 ({ \
 	struct ostream *_s = (stream); \
 \
-	if (!_s->buf) { \
-		pr_err(fmt, ##args); \
-	} else { \
-		int ret, len = _s->size - _s->used; \
-		ret = snprintf(_s->buf + _s->used, len, pr_fmt(fmt), ##args); \
-		_s->used += min(ret, len); \
-	} \
+	_s->func(_s, fmt, ##args); \
 })
 
 static void
@@ -96,8 +113,8 @@ __ref_tracker_dir_pr_ostream(struct ref_tracker_dir *dir,
 
 	stats = ref_tracker_get_stats(dir, display_limit);
 	if (IS_ERR(stats)) {
-		pr_ostream(s, "%s@%p: couldn't get stats, error %pe\n",
-			   dir->name, dir, stats);
+		pr_ostream(s, "%s%s@%p: couldn't get stats, error %pe\n",
+			   s->prefix, dir->name, dir, stats);
 		return;
 	}
 
@@ -107,14 +124,15 @@ __ref_tracker_dir_pr_ostream(struct ref_tracker_dir *dir,
 		stack = stats->stacks[i].stack_handle;
 		if (sbuf && !stack_depot_snprint(stack, sbuf, STACK_BUF_SIZE, 4))
 			sbuf[0] = 0;
-		pr_ostream(s, "%s@%p has %d/%d users at\n%s\n", dir->name, dir,
-			   stats->stacks[i].count, stats->total, sbuf);
+		pr_ostream(s, "%s%s@%p has %d/%d users at\n%s\n", s->prefix,
+			   dir->name, dir, stats->stacks[i].count,
+			   stats->total, sbuf);
 		skipped -= stats->stacks[i].count;
 	}
 
 	if (skipped)
-		pr_ostream(s, "%s@%p skipped reports about %d/%d users.\n",
-			   dir->name, dir, skipped, stats->total);
+		pr_ostream(s, "%s%s@%p skipped reports about %d/%d users.\n",
+			   s->prefix, dir->name, dir, skipped, stats->total);
 
 	kfree(sbuf);
 
@@ -124,7 +142,8 @@ __ref_tracker_dir_pr_ostream(struct ref_tracker_dir *dir,
 void ref_tracker_dir_print_locked(struct ref_tracker_dir *dir,
 				  unsigned int display_limit)
 {
-	struct ostream os = {};
+	struct ostream os = { .func = pr_ostream_log,
+			      .prefix = "ref_tracker: " };
 
 	__ref_tracker_dir_pr_ostream(dir, display_limit, &os);
 }
@@ -143,7 +162,10 @@ EXPORT_SYMBOL(ref_tracker_dir_print);
 
 int ref_tracker_dir_snprint(struct ref_tracker_dir *dir, char *buf, size_t size)
 {
-	struct ostream os = { .buf = buf, .size = size };
+	struct ostream os = { .func = pr_ostream_buf,
+			      .prefix = "ref_tracker: ",
+			      .buf = buf,
+			      .size = size };
 	unsigned long flags;
 
 	spin_lock_irqsave(&dir->lock, flags);
-- 
cgit v1.2.3


From aa7d26c3c3497258b712fb97221e775733a710b7 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Wed, 18 Jun 2025 10:24:17 -0400
Subject: ref_tracker: add a static classname string to each ref_tracker_dir

A later patch in the series will be adding debugfs files for each
ref_tracker that get created in ref_tracker_dir_init(). The format will
be "class@%px". The current "name" string can vary between
ref_tracker_dir objects of the same type, so it's not suitable for this
purpose.

Add a new "class" string to the ref_tracker dir that describes the
type of object (sans any individual info for that object).

Also, in the i915 driver, gate the creation of debugfs files on whether
the dentry pointer is still set to NULL. CI has shown that the
ref_tracker_dir can be initialized more than once.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://patch.msgid.link/20250618-reftrack-dbgfs-v15-4-24fc37ead144@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/gpu/drm/display/drm_dp_tunnel.c | 2 +-
 drivers/gpu/drm/i915/intel_runtime_pm.c | 4 +++-
 drivers/gpu/drm/i915/intel_wakeref.c    | 3 ++-
 include/linux/ref_tracker.h             | 4 ++++
 lib/test_ref_tracker.c                  | 2 +-
 net/core/dev.c                          | 2 +-
 net/core/net_namespace.c                | 4 ++--
 7 files changed, 14 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/display/drm_dp_tunnel.c b/drivers/gpu/drm/display/drm_dp_tunnel.c
index 076edf161048..b9c12b8bf2a3 100644
--- a/drivers/gpu/drm/display/drm_dp_tunnel.c
+++ b/drivers/gpu/drm/display/drm_dp_tunnel.c
@@ -1920,7 +1920,7 @@ drm_dp_tunnel_mgr_create(struct drm_device *dev, int max_group_count)
 	}
 
 #ifdef CONFIG_DRM_DISPLAY_DP_TUNNEL_STATE_DEBUG
-	ref_tracker_dir_init(&mgr->ref_tracker, 16, "dptun");
+	ref_tracker_dir_init(&mgr->ref_tracker, 16, "drm_dptun", "dptun");
 #endif
 
 	for (i = 0; i < max_group_count; i++) {
diff --git a/drivers/gpu/drm/i915/intel_runtime_pm.c b/drivers/gpu/drm/i915/intel_runtime_pm.c
index 8d9f4c410546..90d90145a189 100644
--- a/drivers/gpu/drm/i915/intel_runtime_pm.c
+++ b/drivers/gpu/drm/i915/intel_runtime_pm.c
@@ -59,7 +59,9 @@ static struct drm_i915_private *rpm_to_i915(struct intel_runtime_pm *rpm)
 
 static void init_intel_runtime_pm_wakeref(struct intel_runtime_pm *rpm)
 {
-	ref_tracker_dir_init(&rpm->debug, INTEL_REFTRACK_DEAD_COUNT, dev_name(rpm->kdev));
+	if (!rpm->debug.class)
+		ref_tracker_dir_init(&rpm->debug, INTEL_REFTRACK_DEAD_COUNT,
+				     "intel_runtime_pm", dev_name(rpm->kdev));
 }
 
 static intel_wakeref_t
diff --git a/drivers/gpu/drm/i915/intel_wakeref.c b/drivers/gpu/drm/i915/intel_wakeref.c
index 51561b190b93..7e74c58862c1 100644
--- a/drivers/gpu/drm/i915/intel_wakeref.c
+++ b/drivers/gpu/drm/i915/intel_wakeref.c
@@ -114,7 +114,8 @@ void __intel_wakeref_init(struct intel_wakeref *wf,
 			 "wakeref.work", &key->work, 0);
 
 #if IS_ENABLED(CONFIG_DRM_I915_DEBUG_WAKEREF)
-	ref_tracker_dir_init(&wf->debug, INTEL_REFTRACK_DEAD_COUNT, name);
+	if (!wf->debug.class)
+		ref_tracker_dir_init(&wf->debug, INTEL_REFTRACK_DEAD_COUNT, "intel_wakeref", name);
 #endif
 }
 
diff --git a/include/linux/ref_tracker.h b/include/linux/ref_tracker.h
index a0a1ee43724f..3968f993db81 100644
--- a/include/linux/ref_tracker.h
+++ b/include/linux/ref_tracker.h
@@ -19,6 +19,7 @@ struct ref_tracker_dir {
 	bool			dead;
 	struct list_head	list; /* List of active trackers */
 	struct list_head	quarantine; /* List of dead trackers */
+	const char		*class; /* object classname */
 	char			name[32];
 #endif
 };
@@ -27,6 +28,7 @@ struct ref_tracker_dir {
 
 static inline void ref_tracker_dir_init(struct ref_tracker_dir *dir,
 					unsigned int quarantine_count,
+					const char *class,
 					const char *name)
 {
 	INIT_LIST_HEAD(&dir->list);
@@ -36,6 +38,7 @@ static inline void ref_tracker_dir_init(struct ref_tracker_dir *dir,
 	dir->dead = false;
 	refcount_set(&dir->untracked, 1);
 	refcount_set(&dir->no_tracker, 1);
+	dir->class = class;
 	strscpy(dir->name, name, sizeof(dir->name));
 	stack_depot_init();
 }
@@ -60,6 +63,7 @@ int ref_tracker_free(struct ref_tracker_dir *dir,
 
 static inline void ref_tracker_dir_init(struct ref_tracker_dir *dir,
 					unsigned int quarantine_count,
+					const char *class,
 					const char *name)
 {
 }
diff --git a/lib/test_ref_tracker.c b/lib/test_ref_tracker.c
index b983ceb12afc..d263502a4c1d 100644
--- a/lib/test_ref_tracker.c
+++ b/lib/test_ref_tracker.c
@@ -64,7 +64,7 @@ static int __init test_ref_tracker_init(void)
 {
 	int i;
 
-	ref_tracker_dir_init(&ref_dir, 100, "selftest");
+	ref_tracker_dir_init(&ref_dir, 100, "selftest", "selftest");
 
 	timer_setup(&test_ref_tracker_timer, test_ref_tracker_timer_func, 0);
 	mod_timer(&test_ref_tracker_timer, jiffies + 1);
diff --git a/net/core/dev.c b/net/core/dev.c
index be97c440ecd5..12cf4e5ae9c5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -11715,7 +11715,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 
 	dev->priv_len = sizeof_priv;
 
-	ref_tracker_dir_init(&dev->refcnt_tracker, 128, name);
+	ref_tracker_dir_init(&dev->refcnt_tracker, 128, "netdev", name);
 #ifdef CONFIG_PCPU_DEV_REFCNT
 	dev->pcpu_refcnt = alloc_percpu(int);
 	if (!dev->pcpu_refcnt)
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index ae54f26709ca..aa1e34181ed6 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -403,8 +403,8 @@ static __net_init void preinit_net(struct net *net, struct user_namespace *user_
 {
 	refcount_set(&net->passive, 1);
 	refcount_set(&net->ns.count, 1);
-	ref_tracker_dir_init(&net->refcnt_tracker, 128, "net refcnt");
-	ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net notrefcnt");
+	ref_tracker_dir_init(&net->refcnt_tracker, 128, "net_refcnt", "net_refcnt");
+	ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net_notrefcnt", "net_notrefcnt");
 
 	get_random_bytes(&net->hash_mix, sizeof(u32));
 	net->dev_base_seq = 1;
-- 
cgit v1.2.3


From 65b584f5361163ba539d2c7122ca792c3cc87997 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Wed, 18 Jun 2025 10:24:19 -0400
Subject: ref_tracker: automatically register a file in debugfs for a
 ref_tracker_dir

Currently, there is no convenient way to see the info that the
ref_tracking infrastructure collects. Attempt to create a file in
debugfs when called from ref_tracker_dir_init().

The file is given the name "class@%px", as having the unmodified address
is helpful for debugging. This should be safe since this directory is only
accessible by root.

While ref_tracker_dir_init() is generally called from a context where
sleeping is OK, ref_tracker_dir_exit() can be called from anywhere.
Thus, dentry cleanup must be handled asynchronously.

Add a new global xarray that has entries with the ref_tracker_dir
pointer as the index and the corresponding debugfs dentry pointer as the
value. Instead of removing the debugfs dentry, have
ref_tracker_dir_exit() set a mark on the xarray entry and schedule a
workqueue job. The workqueue job then walks the xarray looking for
marked entries, and removes their xarray entries and the debugfs
dentries.

Because of this, the debugfs dentry can outlive the corresponding
ref_tracker_dir. Have ref_tracker_debugfs_show() take extra care to
ensure that it's safe to dereference the dir pointer before using it.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://patch.msgid.link/20250618-reftrack-dbgfs-v15-6-24fc37ead144@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/ref_tracker.h |  17 +++++
 lib/ref_tracker.c           | 152 ++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 164 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/ref_tracker.h b/include/linux/ref_tracker.h
index 3968f993db81..28bbf436a8f4 100644
--- a/include/linux/ref_tracker.h
+++ b/include/linux/ref_tracker.h
@@ -26,6 +26,18 @@ struct ref_tracker_dir {
 
 #ifdef CONFIG_REF_TRACKER
 
+#ifdef CONFIG_DEBUG_FS
+
+void ref_tracker_dir_debugfs(struct ref_tracker_dir *dir);
+
+#else /* CONFIG_DEBUG_FS */
+
+static inline void ref_tracker_dir_debugfs(struct ref_tracker_dir *dir)
+{
+}
+
+#endif /* CONFIG_DEBUG_FS */
+
 static inline void ref_tracker_dir_init(struct ref_tracker_dir *dir,
 					unsigned int quarantine_count,
 					const char *class,
@@ -40,6 +52,7 @@ static inline void ref_tracker_dir_init(struct ref_tracker_dir *dir,
 	refcount_set(&dir->no_tracker, 1);
 	dir->class = class;
 	strscpy(dir->name, name, sizeof(dir->name));
+	ref_tracker_dir_debugfs(dir);
 	stack_depot_init();
 }
 
@@ -68,6 +81,10 @@ static inline void ref_tracker_dir_init(struct ref_tracker_dir *dir,
 {
 }
 
+static inline void ref_tracker_dir_debugfs(struct ref_tracker_dir *dir)
+{
+}
+
 static inline void ref_tracker_dir_exit(struct ref_tracker_dir *dir)
 {
 }
diff --git a/lib/ref_tracker.c b/lib/ref_tracker.c
index 73b606570cce..c938ef56954b 100644
--- a/lib/ref_tracker.c
+++ b/lib/ref_tracker.c
@@ -29,6 +29,40 @@ struct ref_tracker_dir_stats {
 	} stacks[];
 };
 
+#ifdef CONFIG_DEBUG_FS
+#include <linux/xarray.h>
+
+/*
+ * ref_tracker_dir_init() is usually called in allocation-safe contexts, but
+ * the same is not true of ref_tracker_dir_exit() which can be called from
+ * anywhere an object is freed. Removing debugfs dentries is a blocking
+ * operation, so we defer that work to the debugfs_reap_worker.
+ *
+ * Each dentry is tracked in the appropriate xarray.  When
+ * ref_tracker_dir_exit() is called, its entries in the xarrays are marked and
+ * the workqueue job is scheduled. The worker then runs and deletes any marked
+ * dentries asynchronously.
+ */
+static struct xarray		debugfs_dentries;
+static struct work_struct	debugfs_reap_worker;
+
+#define REF_TRACKER_DIR_DEAD	XA_MARK_0
+static inline void ref_tracker_debugfs_mark(struct ref_tracker_dir *dir)
+{
+	unsigned long flags;
+
+	xa_lock_irqsave(&debugfs_dentries, flags);
+	__xa_set_mark(&debugfs_dentries, (unsigned long)dir, REF_TRACKER_DIR_DEAD);
+	xa_unlock_irqrestore(&debugfs_dentries, flags);
+
+	schedule_work(&debugfs_reap_worker);
+}
+#else
+static inline void ref_tracker_debugfs_mark(struct ref_tracker_dir *dir)
+{
+}
+#endif
+
 static struct ref_tracker_dir_stats *
 ref_tracker_get_stats(struct ref_tracker_dir *dir, unsigned int limit)
 {
@@ -185,6 +219,11 @@ void ref_tracker_dir_exit(struct ref_tracker_dir *dir)
 	bool leak = false;
 
 	dir->dead = true;
+	/*
+	 * The xarray entries must be marked before the dir->lock is taken to
+	 * protect simultaneous debugfs readers.
+	 */
+	ref_tracker_debugfs_mark(dir);
 	spin_lock_irqsave(&dir->lock, flags);
 	list_for_each_entry_safe(tracker, n, &dir->quarantine, head) {
 		list_del(&tracker->head);
@@ -312,23 +351,126 @@ static void __ostream_printf pr_ostream_seq(struct ostream *stream, char *fmt, .
 	va_end(args);
 }
 
-static __maybe_unused int
-ref_tracker_dir_seq_print(struct ref_tracker_dir *dir, struct seq_file *seq)
+static int ref_tracker_dir_seq_print(struct ref_tracker_dir *dir, struct seq_file *seq)
 {
 	struct ostream os = { .func = pr_ostream_seq,
 			      .prefix = "",
 			      .seq = seq };
-	unsigned long flags;
 
-	spin_lock_irqsave(&dir->lock, flags);
 	__ref_tracker_dir_pr_ostream(dir, 16, &os);
-	spin_unlock_irqrestore(&dir->lock, flags);
 
 	return os.used;
 }
 
+static int ref_tracker_debugfs_show(struct seq_file *f, void *v)
+{
+	struct ref_tracker_dir *dir = f->private;
+	unsigned long index = (unsigned long)dir;
+	unsigned long flags;
+	int ret;
+
+	/*
+	 * "dir" may not exist at this point if ref_tracker_dir_exit() has
+	 * already been called. Take care not to dereference it until its
+	 * legitimacy is established.
+	 *
+	 * The xa_lock is necessary to ensure that "dir" doesn't disappear
+	 * before its lock can be taken. If it's in the hash and not marked
+	 * dead, then it's safe to take dir->lock which prevents
+	 * ref_tracker_dir_exit() from completing. Once the dir->lock is
+	 * acquired, the xa_lock can be released. All of this must be IRQ-safe.
+	 */
+	xa_lock_irqsave(&debugfs_dentries, flags);
+	if (!xa_load(&debugfs_dentries, index) ||
+	    xa_get_mark(&debugfs_dentries, index, REF_TRACKER_DIR_DEAD)) {
+		xa_unlock_irqrestore(&debugfs_dentries, flags);
+		return -ENODATA;
+	}
+
+	spin_lock(&dir->lock);
+	xa_unlock(&debugfs_dentries);
+	ret = ref_tracker_dir_seq_print(dir, f);
+	spin_unlock_irqrestore(&dir->lock, flags);
+	return ret;
+}
+
+static int ref_tracker_debugfs_open(struct inode *inode, struct file *filp)
+{
+	struct ref_tracker_dir *dir = inode->i_private;
+
+	return single_open(filp, ref_tracker_debugfs_show, dir);
+}
+
+static const struct file_operations ref_tracker_debugfs_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ref_tracker_debugfs_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+/**
+ * ref_tracker_dir_debugfs - create debugfs file for ref_tracker_dir
+ * @dir: ref_tracker_dir to be associated with debugfs file
+ *
+ * In most cases, a debugfs file will be created automatically for every
+ * ref_tracker_dir. If the object was created before debugfs is brought up
+ * then that may fail. In those cases, it is safe to call this at a later
+ * time to create the file.
+ */
+void ref_tracker_dir_debugfs(struct ref_tracker_dir *dir)
+{
+	char name[NAME_MAX + 1];
+	struct dentry *dentry;
+	int ret;
+
+	/* No-op if already created */
+	dentry = xa_load(&debugfs_dentries, (unsigned long)dir);
+	if (dentry && !xa_is_err(dentry))
+		return;
+
+	ret = snprintf(name, sizeof(name), "%s@%px", dir->class, dir);
+	name[sizeof(name) - 1] = '\0';
+
+	if (ret < sizeof(name)) {
+		dentry = debugfs_create_file(name, S_IFREG | 0400,
+					     ref_tracker_debug_dir, dir,
+					     &ref_tracker_debugfs_fops);
+		if (!IS_ERR(dentry)) {
+			void *old;
+
+			old = xa_store_irq(&debugfs_dentries, (unsigned long)dir,
+					   dentry, GFP_KERNEL);
+
+			if (xa_is_err(old))
+				debugfs_remove(dentry);
+			else
+				WARN_ON_ONCE(old);
+		}
+	}
+}
+EXPORT_SYMBOL(ref_tracker_dir_debugfs);
+
+static void debugfs_reap_work(struct work_struct *work)
+{
+	struct dentry *dentry;
+	unsigned long index;
+	bool reaped;
+
+	do {
+		reaped = false;
+		xa_for_each_marked(&debugfs_dentries, index, dentry, REF_TRACKER_DIR_DEAD) {
+			xa_erase_irq(&debugfs_dentries, index);
+			debugfs_remove(dentry);
+			reaped = true;
+		}
+	} while (reaped);
+}
+
 static int __init ref_tracker_debugfs_init(void)
 {
+	INIT_WORK(&debugfs_reap_worker, debugfs_reap_work);
+	xa_init_flags(&debugfs_dentries, XA_FLAGS_LOCK_IRQ);
 	ref_tracker_debug_dir = debugfs_create_dir("ref_tracker", NULL);
 	return 0;
 }
-- 
cgit v1.2.3


From d04992dc86a6c77b7d39a1ee10013aed7111e855 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Wed, 18 Jun 2025 10:24:20 -0400
Subject: ref_tracker: add a way to create a symlink to the ref_tracker_dir
 debugfs file

Add the ability for a subsystem to add a user-friendly symlink that
points to a ref_tracker_dir's debugfs file. Add a separate
debugfs_symlinks xarray and use that to track symlinks. The reaper
workqueue job will remove symlinks before their corresponding dentries.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://patch.msgid.link/20250618-reftrack-dbgfs-v15-7-24fc37ead144@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/ref_tracker.h | 11 ++++++++++
 lib/ref_tracker.c           | 50 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+)

(limited to 'include')

diff --git a/include/linux/ref_tracker.h b/include/linux/ref_tracker.h
index 28bbf436a8f4..e1323de93bf6 100644
--- a/include/linux/ref_tracker.h
+++ b/include/linux/ref_tracker.h
@@ -29,6 +29,7 @@ struct ref_tracker_dir {
 #ifdef CONFIG_DEBUG_FS
 
 void ref_tracker_dir_debugfs(struct ref_tracker_dir *dir);
+void ref_tracker_dir_symlink(struct ref_tracker_dir *dir, const char *fmt, ...);
 
 #else /* CONFIG_DEBUG_FS */
 
@@ -36,6 +37,11 @@ static inline void ref_tracker_dir_debugfs(struct ref_tracker_dir *dir)
 {
 }
 
+static inline __ostream_printf
+void ref_tracker_dir_symlink(struct ref_tracker_dir *dir, const char *fmt, ...)
+{
+}
+
 #endif /* CONFIG_DEBUG_FS */
 
 static inline void ref_tracker_dir_init(struct ref_tracker_dir *dir,
@@ -85,6 +91,11 @@ static inline void ref_tracker_dir_debugfs(struct ref_tracker_dir *dir)
 {
 }
 
+static inline __ostream_printf
+void ref_tracker_dir_symlink(struct ref_tracker_dir *dir, const char *fmt, ...)
+{
+}
+
 static inline void ref_tracker_dir_exit(struct ref_tracker_dir *dir)
 {
 }
diff --git a/lib/ref_tracker.c b/lib/ref_tracker.c
index c938ef56954b..6608520d6118 100644
--- a/lib/ref_tracker.c
+++ b/lib/ref_tracker.c
@@ -44,6 +44,7 @@ struct ref_tracker_dir_stats {
  * dentries asynchronously.
  */
 static struct xarray		debugfs_dentries;
+static struct xarray		debugfs_symlinks;
 static struct work_struct	debugfs_reap_worker;
 
 #define REF_TRACKER_DIR_DEAD	XA_MARK_0
@@ -55,6 +56,10 @@ static inline void ref_tracker_debugfs_mark(struct ref_tracker_dir *dir)
 	__xa_set_mark(&debugfs_dentries, (unsigned long)dir, REF_TRACKER_DIR_DEAD);
 	xa_unlock_irqrestore(&debugfs_dentries, flags);
 
+	xa_lock_irqsave(&debugfs_symlinks, flags);
+	__xa_set_mark(&debugfs_symlinks, (unsigned long)dir, REF_TRACKER_DIR_DEAD);
+	xa_unlock_irqrestore(&debugfs_symlinks, flags);
+
 	schedule_work(&debugfs_reap_worker);
 }
 #else
@@ -451,6 +456,45 @@ void ref_tracker_dir_debugfs(struct ref_tracker_dir *dir)
 }
 EXPORT_SYMBOL(ref_tracker_dir_debugfs);
 
+void __ostream_printf ref_tracker_dir_symlink(struct ref_tracker_dir *dir, const char *fmt, ...)
+{
+	char name[NAME_MAX + 1];
+	struct dentry *symlink, *dentry;
+	va_list args;
+	int ret;
+
+	symlink = xa_load(&debugfs_symlinks, (unsigned long)dir);
+	dentry = xa_load(&debugfs_dentries, (unsigned long)dir);
+
+	/* Already created?*/
+	if (symlink && !xa_is_err(symlink))
+		return;
+
+	if (!dentry || xa_is_err(dentry))
+		return;
+
+	va_start(args, fmt);
+	ret = vsnprintf(name, sizeof(name), fmt, args);
+	va_end(args);
+	name[sizeof(name) - 1] = '\0';
+
+	if (ret < sizeof(name)) {
+		symlink = debugfs_create_symlink(name, ref_tracker_debug_dir,
+						 dentry->d_name.name);
+		if (!IS_ERR(symlink)) {
+			void *old;
+
+			old = xa_store_irq(&debugfs_symlinks, (unsigned long)dir,
+					   symlink, GFP_KERNEL);
+			if (xa_is_err(old))
+				debugfs_remove(symlink);
+			else
+				WARN_ON_ONCE(old);
+		}
+	}
+}
+EXPORT_SYMBOL(ref_tracker_dir_symlink);
+
 static void debugfs_reap_work(struct work_struct *work)
 {
 	struct dentry *dentry;
@@ -459,6 +503,11 @@ static void debugfs_reap_work(struct work_struct *work)
 
 	do {
 		reaped = false;
+		xa_for_each_marked(&debugfs_symlinks, index, dentry, REF_TRACKER_DIR_DEAD) {
+			xa_erase_irq(&debugfs_symlinks, index);
+			debugfs_remove(dentry);
+			reaped = true;
+		}
 		xa_for_each_marked(&debugfs_dentries, index, dentry, REF_TRACKER_DIR_DEAD) {
 			xa_erase_irq(&debugfs_dentries, index);
 			debugfs_remove(dentry);
@@ -471,6 +520,7 @@ static int __init ref_tracker_debugfs_init(void)
 {
 	INIT_WORK(&debugfs_reap_worker, debugfs_reap_work);
 	xa_init_flags(&debugfs_dentries, XA_FLAGS_LOCK_IRQ);
+	xa_init_flags(&debugfs_symlinks, XA_FLAGS_LOCK_IRQ);
 	ref_tracker_debug_dir = debugfs_create_dir("ref_tracker", NULL);
 	return 0;
 }
-- 
cgit v1.2.3


From 707bd05be75f65749c3f1695f4e362a89b3fcc7b Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Wed, 18 Jun 2025 10:24:22 -0400
Subject: ref_tracker: eliminate the ref_tracker_dir name field

Now that we have dentries and the ability to create meaningful symlinks
to them, don't keep a name string in each tracker. Switch the output
format to print "class@address", and drop the name field.

Also, add a kerneldoc header for ref_tracker_dir_init().

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://patch.msgid.link/20250618-reftrack-dbgfs-v15-9-24fc37ead144@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/gpu/drm/display/drm_dp_tunnel.c |  2 +-
 drivers/gpu/drm/i915/intel_runtime_pm.c |  2 +-
 drivers/gpu/drm/i915/intel_wakeref.c    |  2 +-
 include/linux/ref_tracker.h             | 20 ++++++++++++++------
 lib/ref_tracker.c                       |  6 +++---
 lib/test_ref_tracker.c                  |  2 +-
 net/core/dev.c                          |  2 +-
 net/core/net_namespace.c                |  4 ++--
 8 files changed, 24 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/display/drm_dp_tunnel.c b/drivers/gpu/drm/display/drm_dp_tunnel.c
index b9c12b8bf2a3..1205a4432eb4 100644
--- a/drivers/gpu/drm/display/drm_dp_tunnel.c
+++ b/drivers/gpu/drm/display/drm_dp_tunnel.c
@@ -1920,7 +1920,7 @@ drm_dp_tunnel_mgr_create(struct drm_device *dev, int max_group_count)
 	}
 
 #ifdef CONFIG_DRM_DISPLAY_DP_TUNNEL_STATE_DEBUG
-	ref_tracker_dir_init(&mgr->ref_tracker, 16, "drm_dptun", "dptun");
+	ref_tracker_dir_init(&mgr->ref_tracker, 16, "drm_dptun");
 #endif
 
 	for (i = 0; i < max_group_count; i++) {
diff --git a/drivers/gpu/drm/i915/intel_runtime_pm.c b/drivers/gpu/drm/i915/intel_runtime_pm.c
index 90d90145a189..7ce3e6de0c19 100644
--- a/drivers/gpu/drm/i915/intel_runtime_pm.c
+++ b/drivers/gpu/drm/i915/intel_runtime_pm.c
@@ -61,7 +61,7 @@ static void init_intel_runtime_pm_wakeref(struct intel_runtime_pm *rpm)
 {
 	if (!rpm->debug.class)
 		ref_tracker_dir_init(&rpm->debug, INTEL_REFTRACK_DEAD_COUNT,
-				     "intel_runtime_pm", dev_name(rpm->kdev));
+				     "intel_runtime_pm");
 }
 
 static intel_wakeref_t
diff --git a/drivers/gpu/drm/i915/intel_wakeref.c b/drivers/gpu/drm/i915/intel_wakeref.c
index 7e74c58862c1..7fa194de5d35 100644
--- a/drivers/gpu/drm/i915/intel_wakeref.c
+++ b/drivers/gpu/drm/i915/intel_wakeref.c
@@ -115,7 +115,7 @@ void __intel_wakeref_init(struct intel_wakeref *wf,
 
 #if IS_ENABLED(CONFIG_DRM_I915_DEBUG_WAKEREF)
 	if (!wf->debug.class)
-		ref_tracker_dir_init(&wf->debug, INTEL_REFTRACK_DEAD_COUNT, "intel_wakeref", name);
+		ref_tracker_dir_init(&wf->debug, INTEL_REFTRACK_DEAD_COUNT, "intel_wakeref");
 #endif
 }
 
diff --git a/include/linux/ref_tracker.h b/include/linux/ref_tracker.h
index e1323de93bf6..d10563afd91c 100644
--- a/include/linux/ref_tracker.h
+++ b/include/linux/ref_tracker.h
@@ -20,7 +20,6 @@ struct ref_tracker_dir {
 	struct list_head	list; /* List of active trackers */
 	struct list_head	quarantine; /* List of dead trackers */
 	const char		*class; /* object classname */
-	char			name[32];
 #endif
 };
 
@@ -44,10 +43,21 @@ void ref_tracker_dir_symlink(struct ref_tracker_dir *dir, const char *fmt, ...)
 
 #endif /* CONFIG_DEBUG_FS */
 
+/**
+ * ref_tracker_dir_init - initialize a ref_tracker dir
+ * @dir: ref_tracker_dir to be initialized
+ * @quarantine_count: max number of entries to be tracked
+ * @class: pointer to static string that describes object type
+ *
+ * Initialize a ref_tracker_dir. If debugfs is configured, then a file
+ * will also be created for it under the top-level ref_tracker debugfs
+ * directory.
+ *
+ * Note that @class must point to a static string.
+ */
 static inline void ref_tracker_dir_init(struct ref_tracker_dir *dir,
 					unsigned int quarantine_count,
-					const char *class,
-					const char *name)
+					const char *class)
 {
 	INIT_LIST_HEAD(&dir->list);
 	INIT_LIST_HEAD(&dir->quarantine);
@@ -57,7 +67,6 @@ static inline void ref_tracker_dir_init(struct ref_tracker_dir *dir,
 	refcount_set(&dir->untracked, 1);
 	refcount_set(&dir->no_tracker, 1);
 	dir->class = class;
-	strscpy(dir->name, name, sizeof(dir->name));
 	ref_tracker_dir_debugfs(dir);
 	stack_depot_init();
 }
@@ -82,8 +91,7 @@ int ref_tracker_free(struct ref_tracker_dir *dir,
 
 static inline void ref_tracker_dir_init(struct ref_tracker_dir *dir,
 					unsigned int quarantine_count,
-					const char *class,
-					const char *name)
+					const char *class)
 {
 }
 
diff --git a/lib/ref_tracker.c b/lib/ref_tracker.c
index 6608520d6118..dcf923a1edf5 100644
--- a/lib/ref_tracker.c
+++ b/lib/ref_tracker.c
@@ -155,7 +155,7 @@ __ref_tracker_dir_pr_ostream(struct ref_tracker_dir *dir,
 	stats = ref_tracker_get_stats(dir, display_limit);
 	if (IS_ERR(stats)) {
 		pr_ostream(s, "%s%s@%p: couldn't get stats, error %pe\n",
-			   s->prefix, dir->name, dir, stats);
+			   s->prefix, dir->class, dir, stats);
 		return;
 	}
 
@@ -166,14 +166,14 @@ __ref_tracker_dir_pr_ostream(struct ref_tracker_dir *dir,
 		if (sbuf && !stack_depot_snprint(stack, sbuf, STACK_BUF_SIZE, 4))
 			sbuf[0] = 0;
 		pr_ostream(s, "%s%s@%p has %d/%d users at\n%s\n", s->prefix,
-			   dir->name, dir, stats->stacks[i].count,
+			   dir->class, dir, stats->stacks[i].count,
 			   stats->total, sbuf);
 		skipped -= stats->stacks[i].count;
 	}
 
 	if (skipped)
 		pr_ostream(s, "%s%s@%p skipped reports about %d/%d users.\n",
-			   s->prefix, dir->name, dir, skipped, stats->total);
+			   s->prefix, dir->class, dir, skipped, stats->total);
 
 	kfree(sbuf);
 
diff --git a/lib/test_ref_tracker.c b/lib/test_ref_tracker.c
index d263502a4c1d..b983ceb12afc 100644
--- a/lib/test_ref_tracker.c
+++ b/lib/test_ref_tracker.c
@@ -64,7 +64,7 @@ static int __init test_ref_tracker_init(void)
 {
 	int i;
 
-	ref_tracker_dir_init(&ref_dir, 100, "selftest", "selftest");
+	ref_tracker_dir_init(&ref_dir, 100, "selftest");
 
 	timer_setup(&test_ref_tracker_timer, test_ref_tracker_timer_func, 0);
 	mod_timer(&test_ref_tracker_timer, jiffies + 1);
diff --git a/net/core/dev.c b/net/core/dev.c
index 12cf4e5ae9c5..92a830162dd8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -11715,7 +11715,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 
 	dev->priv_len = sizeof_priv;
 
-	ref_tracker_dir_init(&dev->refcnt_tracker, 128, "netdev", name);
+	ref_tracker_dir_init(&dev->refcnt_tracker, 128, "netdev");
 #ifdef CONFIG_PCPU_DEV_REFCNT
 	dev->pcpu_refcnt = alloc_percpu(int);
 	if (!dev->pcpu_refcnt)
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 45de05d8f087..d0f607507ee8 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -403,8 +403,8 @@ static __net_init void preinit_net(struct net *net, struct user_namespace *user_
 {
 	refcount_set(&net->passive, 1);
 	refcount_set(&net->ns.count, 1);
-	ref_tracker_dir_init(&net->refcnt_tracker, 128, "net_refcnt", "net_refcnt");
-	ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net_notrefcnt", "net_notrefcnt");
+	ref_tracker_dir_init(&net->refcnt_tracker, 128, "net_refcnt");
+	ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net_notrefcnt");
 
 	get_random_bytes(&net->hash_mix, sizeof(u32));
 	net->dev_base_seq = 1;
-- 
cgit v1.2.3


From 5f4081d6fafec6c9dca9e7990e783b70db854a5c Mon Sep 17 00:00:00 2001
From: Jerome Brunet <jbrunet@baylibre.com>
Date: Thu, 17 Apr 2025 15:44:22 +0200
Subject: clk: add a clk_hw helpers to get the clock device or device_node

Add helpers to get the device or device_node associated with clk_hw.

This can be used by clock drivers to access various device related
functionality such as devres, dev_ prints, etc ...

Signed-off-by: Jerome Brunet <jbrunet@baylibre.com>
Link: https://lore.kernel.org/r/20250417-clk-hw-get-helpers-v1-1-7743e509612a@baylibre.com
Reviewed-by: Brian Masney <bmasney@redhat.com>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/clk.c            | 12 ++++++++++++
 include/linux/clk-provider.h | 26 ++++++++++++++++++++++++++
 2 files changed, 38 insertions(+)

(limited to 'include')

diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c
index 0565c87656cf..b821b2cdb155 100644
--- a/drivers/clk/clk.c
+++ b/drivers/clk/clk.c
@@ -365,6 +365,18 @@ const char *clk_hw_get_name(const struct clk_hw *hw)
 }
 EXPORT_SYMBOL_GPL(clk_hw_get_name);
 
+struct device *clk_hw_get_dev(const struct clk_hw *hw)
+{
+	return hw->core->dev;
+}
+EXPORT_SYMBOL_GPL(clk_hw_get_dev);
+
+struct device_node *clk_hw_get_of_node(const struct clk_hw *hw)
+{
+	return hw->core->of_node;
+}
+EXPORT_SYMBOL_GPL(clk_hw_get_of_node);
+
 struct clk_hw *__clk_get_hw(struct clk *clk)
 {
 	return !clk ? NULL : clk->core->hw;
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 2e6e603b7493..630705a47129 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -1360,6 +1360,32 @@ void clk_hw_unregister(struct clk_hw *hw);
 /* helper functions */
 const char *__clk_get_name(const struct clk *clk);
 const char *clk_hw_get_name(const struct clk_hw *hw);
+
+/**
+ * clk_hw_get_dev() - get device from a hardware clock.
+ * @hw: the clk_hw pointer to get the struct device from
+ *
+ * This is a helper to get the struct device associated with a hardware
+ * clock. Some clock controllers, such as those registered with
+ * CLK_OF_DECLARE(), may not have provided a device pointer while
+ * registering the clock.
+ *
+ * Return: the struct device associated with the clock, or NULL if there
+ * is none.
+ */
+struct device *clk_hw_get_dev(const struct clk_hw *hw);
+
+/**
+ * clk_hw_get_of_node() - get device_node from a hardware clock.
+ * @hw: the clk_hw pointer to get the struct device_node from
+ *
+ * This is a helper to get the struct device_node associated with a
+ * hardware clock.
+ *
+ * Return: the struct device_node associated with the clock, or NULL
+ * if there is none.
+ */
+struct device_node *clk_hw_get_of_node(const struct clk_hw *hw);
 #ifdef CONFIG_COMMON_CLK
 struct clk_hw *__clk_get_hw(struct clk *clk);
 #else
-- 
cgit v1.2.3


From aa34ecc42a2138af76642b68b53a5a07cb12fe43 Mon Sep 17 00:00:00 2001
From: Aditya Kumar Singh <aditya.kumar.singh@oss.qualcomm.com>
Date: Wed, 28 May 2025 09:09:47 +0530
Subject: wifi: ieee80211: add Radio Measurement action fields

Drivers that support Tx power insertion could examine the outgoing Radio
measurement packet and depending on the packet type, the driver can
insert specific data fields in it. These action field values will help
drivers classify the action code within the Radio Measurement action
packet.

These action fields are defined in IEEE 802.11-2024 - Table 9-470, Radio
Measurement Action field values.

Signed-off-by: Aditya Kumar Singh <aditya.kumar.singh@oss.qualcomm.com>
Link: https://patch.msgid.link/20250528-add_rrm_action_code-v1-1-6b7c78b5bbaf@oss.qualcomm.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 22f39e5e2ff1..120de474a8bf 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -4007,6 +4007,16 @@ enum ieee80211_s1g_actioncode {
 	WLAN_S1G_TWT_INFORMATION = 11,
 };
 
+/* Radio measurement action codes as defined in IEEE 802.11-2024 - Table 9-470 */
+enum ieee80211_radio_measurement_actioncode {
+	WLAN_RM_ACTION_RADIO_MEASUREMENT_REQUEST = 0,
+	WLAN_RM_ACTION_RADIO_MEASUREMENT_REPORT  = 1,
+	WLAN_RM_ACTION_LINK_MEASUREMENT_REQUEST  = 2,
+	WLAN_RM_ACTION_LINK_MEASUREMENT_REPORT   = 3,
+	WLAN_RM_ACTION_NEIGHBOR_REPORT_REQUEST   = 4,
+	WLAN_RM_ACTION_NEIGHBOR_REPORT_RESPONSE  = 5,
+};
+
 #define IEEE80211_WEP_IV_LEN		4
 #define IEEE80211_WEP_ICV_LEN		4
 #define IEEE80211_CCMP_HDR_LEN		8
-- 
cgit v1.2.3


From df42bfc96e0ad90d243c0ee6b783a33bdb72a184 Mon Sep 17 00:00:00 2001
From: Vasanthakumar Thiagarajan <vasanthakumar.thiagarajan@oss.qualcomm.com>
Date: Tue, 27 May 2025 14:11:43 +0530
Subject: wifi: cfg80211: Add utility API to get radio index from channel

Add utility API cfg80211_get_radio_idx_by_chan() to retrieve the radio
index corresponding to a given channel in a multi-radio wiphy.

This utility function can be used when we want to check the radio-specific
data for a channel in a multi-radio wiphy. For example, it can help
determine the radio index required to handle a scan request. This index
can then be used to decide whether the scan can proceed without
interfering with ongoing DFS operations on another radio.

Signed-off-by: Vasanthakumar Thiagarajan <vasanthakumar.thiagarajan@oss.qualcomm.com>
Co-developed-by: Raj Kumar Bhagat <quic_rajkbhag@quicinc.com>
Signed-off-by: Raj Kumar Bhagat <quic_rajkbhag@quicinc.com>
Link: https://patch.msgid.link/20250527-mlo-dfs-acs-v2-1-92c2f37c81d9@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 11 +++++++++++
 net/wireless/util.c    | 24 ++++++++++++++++++++++++
 2 files changed, 35 insertions(+)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index d1848dc8ec99..7719a90ab4d7 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -9372,6 +9372,17 @@ int cfg80211_iter_combinations(struct wiphy *wiphy,
 			       void (*iter)(const struct ieee80211_iface_combination *c,
 					    void *data),
 			       void *data);
+/**
+ * cfg80211_get_radio_idx_by_chan - get the radio index by the channel
+ *
+ * @wiphy: the wiphy
+ * @chan: channel for which the supported radio index is required
+ *
+ * Return: radio index on success or a negative error code
+ */
+int cfg80211_get_radio_idx_by_chan(struct wiphy *wiphy,
+				   const struct ieee80211_channel *chan);
+
 
 /**
  * cfg80211_stop_iface - trigger interface disconnection
diff --git a/net/wireless/util.c b/net/wireless/util.c
index ed868c0f7ca8..e438f883f085 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -2516,6 +2516,30 @@ int cfg80211_check_combinations(struct wiphy *wiphy,
 }
 EXPORT_SYMBOL(cfg80211_check_combinations);
 
+int cfg80211_get_radio_idx_by_chan(struct wiphy *wiphy,
+				   const struct ieee80211_channel *chan)
+{
+	const struct wiphy_radio *radio;
+	int i, j;
+	u32 freq;
+
+	if (!chan)
+		return -EINVAL;
+
+	freq = ieee80211_channel_to_khz(chan);
+	for (i = 0; i < wiphy->n_radio; i++) {
+		radio = &wiphy->radio[i];
+		for (j = 0; j < radio->n_freq_range; j++) {
+			if (freq >= radio->freq_range[j].start_freq &&
+			    freq < radio->freq_range[j].end_freq)
+				return i;
+		}
+	}
+
+	return -ENOENT;
+}
+EXPORT_SYMBOL(cfg80211_get_radio_idx_by_chan);
+
 int ieee80211_get_ratemask(struct ieee80211_supported_band *sband,
 			   const u8 *rates, unsigned int n_rates,
 			   u32 *mask)
-- 
cgit v1.2.3


From 5ae1fc4069578f50798f3372f36a3c13ee565b66 Mon Sep 17 00:00:00 2001
From: Kavita Kavita <quic_kkavita@quicinc.com>
Date: Wed, 4 Jun 2025 16:27:56 +0530
Subject: wifi: cfg80211: Improve the documentation for
 NL80211_CMD_ASSOC_MLO_RECONF

The existing documentation for the NL80211_CMD_ASSOC_MLO_RECONF
does not clearly explain handling of link reconfiguration request
results from the driver.

Add documentation to explain that the command is used as an event to
notify userspace about added links information, and that the existing
NL80211_CMD_LINKS_REMOVED command is used to notify userspace about
removed links information.

Signed-off-by: Kavita Kavita <quic_kkavita@quicinc.com>
Link: https://patch.msgid.link/20250604105757.2542-2-quic_kkavita@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index e9ccf43fe3c6..e53840d009d1 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1330,7 +1330,11 @@
  *      TID to Link mapping for downlink/uplink traffic.
  *
  * @NL80211_CMD_ASSOC_MLO_RECONF: For a non-AP MLD station, request to
- *      add/remove links to/from the association.
+ *      add/remove links to/from the association. To indicate link
+ *      reconfiguration request results from the driver, this command is also
+ *      used as an event to notify userspace about the added links information.
+ *      For notifying the removed links information, the existing
+ *      %NL80211_CMD_LINKS_REMOVED command is used.
  *
  * @NL80211_CMD_EPCS_CFG: EPCS configuration for a station. Used by userland to
  *	control EPCS configuration. Used to notify userland on the current state
-- 
cgit v1.2.3


From 7c598c653ad465138ecc2fe64492633c541effef Mon Sep 17 00:00:00 2001
From: Kavita Kavita <quic_kkavita@quicinc.com>
Date: Wed, 4 Jun 2025 16:27:57 +0530
Subject: wifi: cfg80211: Add support for link reconfiguration negotiation
 offload to driver

In the case of SME-in-driver, the driver can internally choose to
update the links based on the AP MLD recommendation and do link
reconfiguration negotiation with AP MLD.
(e.g., after processing a BSS Transition Management request frame
received from the AP MLD, whose Neighbor Report contains a Multi-Link
element with recommended links information, the driver chooses to do
link reconfiguration negotiation with the AP MLD).

To support this, extend cfg80211_mlo_reconf_add_done() and
NL80211_CMD_ASSOC_MLO_RECONF to indicate added links information for
driver-initiated link reconfiguration requests. For removed links,
the driver indicates links information using the
NL80211_CMD_LINKS_REMOVED event for driver-initiated cases, the same as
supplicant initiated cases.

For the driver-initiated case, cfg80211 receives the link
reconfiguration result asynchronously from the driver, so the BSSes of
the accepted add links need to be held in the event path. Also, no
unhold call is needed for the BSSes of rejected add links, since no hold
call was made for them previously.

Once the supplicant receives the NL80211_CMD_ASSOC_MLO_RECONF event,
it needs to process the information about newly added links and install
per-link group keys (e.g., GTK/IGTK/BIGTK etc.).

In case of the SME-in-driver, using a vendor interface etc. to notify
the supplicant to initiate a link reconfiguration request and then
supplicant sending command to the cfg80211 can lead to race conditions.
The correct design to avoid this is that the driver indicates the
cfg80211 directly with the results of the link reconfiguration
negotiation.

Signed-off-by: Kavita Kavita <quic_kkavita@quicinc.com>
Link: https://patch.msgid.link/20250604105757.2542-3-quic_kkavita@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  6 ++++++
 include/uapi/linux/nl80211.h |  6 +++++-
 net/wireless/mlme.c          | 10 ++++++++--
 net/wireless/trace.h         | 10 ++++++----
 4 files changed, 25 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 7719a90ab4d7..47b4235eea59 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -9747,6 +9747,11 @@ void cfg80211_links_removed(struct net_device *dev, u16 link_mask);
  * struct cfg80211_mlo_reconf_done_data - MLO reconfiguration data
  * @buf: MLO Reconfiguration Response frame (header + body)
  * @len: length of the frame data
+ * @driver_initiated: Indicates whether the add links request is initiated by
+ *	the driver. This is set to true when the link reconfiguration request
+ *	is initiated by the driver because handling of AP link recommendation
+ *	requests (e.g. a BTM (BSS Transition Management) request) is offloaded
+ *	to the driver.
  * @added_links: BIT mask of links successfully added to the association
  * @links: per-link information indexed by link ID
  * @links.bss: the BSS that MLO reconfiguration was requested for, ownership of
@@ -9759,6 +9764,7 @@ void cfg80211_links_removed(struct net_device *dev, u16 link_mask);
 struct cfg80211_mlo_reconf_done_data {
 	const u8 *buf;
 	size_t len;
+	bool driver_initiated;
 	u16 added_links;
 	struct {
 		struct cfg80211_bss *bss;
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index e53840d009d1..a289014abe37 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1334,7 +1334,11 @@
  *      reconfiguration request results from the driver, this command is also
  *      used as an event to notify userspace about the added links information.
  *      For notifying the removed links information, the existing
- *      %NL80211_CMD_LINKS_REMOVED command is used.
+ *      %NL80211_CMD_LINKS_REMOVED command is used. This command is also used to
+ *      notify userspace about newly added links for the current connection in
+ *      case of AP-initiated link recommendation requests, received via
+ *      a BTM (BSS Transition Management) request or a link reconfig notify
+ *      frame, where the driver handles the link recommendation offload.
  *
  * @NL80211_CMD_EPCS_CFG: EPCS configuration for a station. Used by userland to
  *	control EPCS configuration. Used to notify userland on the current state
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index 05d44a443518..29e1ce8aff42 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -1331,7 +1331,8 @@ void cfg80211_mlo_reconf_add_done(struct net_device *dev,
 	lockdep_assert_wiphy(wiphy);
 
 	trace_cfg80211_mlo_reconf_add_done(dev, data->added_links,
-					   data->buf, data->len);
+					   data->buf, data->len,
+					   data->driver_initiated);
 
 	if (WARN_ON(!wdev->valid_links))
 		return;
@@ -1361,11 +1362,16 @@ void cfg80211_mlo_reconf_add_done(struct net_device *dev,
 			wdev->links[link_id].client.current_bss =
 				bss_from_pub(bss);
 
+			if (data->driver_initiated)
+				cfg80211_hold_bss(bss_from_pub(bss));
+
 			memcpy(wdev->links[link_id].addr,
 			       data->links[link_id].addr,
 			       ETH_ALEN);
 		} else {
-			cfg80211_unhold_bss(bss_from_pub(bss));
+			if (!data->driver_initiated)
+				cfg80211_unhold_bss(bss_from_pub(bss));
+
 			cfg80211_put_bss(wiphy, bss);
 		}
 	}
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 4ed9fada4ec0..61a5eca9c513 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -4126,20 +4126,22 @@ TRACE_EVENT(cfg80211_links_removed,
 
 TRACE_EVENT(cfg80211_mlo_reconf_add_done,
 	TP_PROTO(struct net_device *netdev, u16 link_mask,
-		 const u8 *buf, size_t len),
-	TP_ARGS(netdev, link_mask, buf, len),
+		 const u8 *buf, size_t len, bool driver_initiated),
+	TP_ARGS(netdev, link_mask, buf, len, driver_initiated),
 	TP_STRUCT__entry(
 		NETDEV_ENTRY
 		__field(u16, link_mask)
 		__dynamic_array(u8, buf, len)
+		__field(bool, driver_initiated)
 	),
 	TP_fast_assign(
 		NETDEV_ASSIGN;
 		__entry->link_mask = link_mask;
 		memcpy(__get_dynamic_array(buf), buf, len);
+		__entry->driver_initiated = driver_initiated;
 	),
-	TP_printk(NETDEV_PR_FMT ", link_mask:0x%x",
-		  NETDEV_PR_ARG, __entry->link_mask)
+	TP_printk(NETDEV_PR_FMT ", link_mask:0x%x, driver_initiated:%d",
+		  NETDEV_PR_ARG, __entry->link_mask, __entry->driver_initiated)
 );
 
 TRACE_EVENT(rdev_assoc_ml_reconf,
-- 
cgit v1.2.3


From 13ef21dffe7691a32c83a83d697d119c045536eb Mon Sep 17 00:00:00 2001
From: Shuming Fan <shumingf@realtek.com>
Date: Mon, 16 Jun 2025 19:48:20 +0800
Subject: ASoC: SDCA: add support for HIDE entity properties and HID
 descriptor/report

Add support for parsing the HIDE entity descriptor and HID descriptor/report.

Signed-off-by: Shuming Fan <shumingf@realtek.com>
Reviewed-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://patch.msgid.link/20250616114820.855401-1-shumingf@realtek.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sdca_function.h   | 29 ++++++++++++++
 sound/soc/sdca/sdca_functions.c | 83 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 112 insertions(+)

(limited to 'include')

diff --git a/include/sound/sdca_function.h b/include/sound/sdca_function.h
index eaedb54a8322..856b0f40ce5e 100644
--- a/include/sound/sdca_function.h
+++ b/include/sound/sdca_function.h
@@ -11,6 +11,7 @@
 
 #include <linux/bits.h>
 #include <linux/types.h>
+#include <linux/hid.h>
 
 struct device;
 struct sdca_entity;
@@ -1040,6 +1041,32 @@ struct sdca_entity_ge {
 	int num_modes;
 };
 
+/**
+ * struct sdca_entity_hide - information specific to HIDE Entities
+ * @hid: HID device structure
+ * @hidtx_ids: array of supported HIDTx Report IDs
+ * @num_hidtx_ids: number of entries in @hidtx_ids
+ * @hidrx_ids: array of supported HIDRx Report IDs
+ * @num_hidrx_ids: number of entries in @hidrx_ids
+ * @hide_reside_function_num: number of Audio Function Numbers listed in @af_number_list
+ * @max_delay: the maximum time in microseconds allowed for the Device to change the ownership from Device to Host
+ * @af_number_list: which Audio Function Numbers within this Device are sending/receiving the messages in this HIDE
+ * @hid_desc: HID descriptor for the HIDE Entity
+ * @hid_report_desc: HID Report Descriptor for the HIDE Entity
+ */
+struct sdca_entity_hide {
+	struct hid_device *hid;
+	unsigned int *hidtx_ids;
+	int num_hidtx_ids;
+	unsigned int *hidrx_ids;
+	int num_hidrx_ids;
+	unsigned int hide_reside_function_num;
+	unsigned int max_delay;
+	unsigned int af_number_list[SDCA_MAX_FUNCTION_COUNT];
+	struct hid_descriptor hid_desc;
+	unsigned char *hid_report_desc;
+};
+
 /**
  * struct sdca_entity - information for one SDCA Entity
  * @label: String such as "OT 12".
@@ -1055,6 +1082,7 @@ struct sdca_entity_ge {
  * @cs: Clock Source specific Entity properties.
  * @pde: Power Domain Entity specific Entity properties.
  * @ge: Group Entity specific Entity properties.
+ * @hide: HIDE Entity specific Entity properties.
  */
 struct sdca_entity {
 	const char *label;
@@ -1071,6 +1099,7 @@ struct sdca_entity {
 		struct sdca_entity_cs cs;
 		struct sdca_entity_pde pde;
 		struct sdca_entity_ge ge;
+		struct sdca_entity_hide hide;
 	};
 };
 
diff --git a/sound/soc/sdca/sdca_functions.c b/sound/soc/sdca/sdca_functions.c
index 64ac26443890..4a89067dcf76 100644
--- a/sound/soc/sdca/sdca_functions.c
+++ b/sound/soc/sdca/sdca_functions.c
@@ -1220,6 +1220,86 @@ bad_list:
 	return -EINVAL;
 }
 
+static int
+find_sdca_entity_hide(struct device *dev, struct fwnode_handle *function_node,
+		      struct fwnode_handle *entity_node, struct sdca_entity *entity)
+{
+	struct sdca_entity_hide *hide = &entity->hide;
+	unsigned int delay, *af_list = hide->af_number_list;
+	int nval, ret;
+	unsigned char *report_desc = NULL;
+
+	ret = fwnode_property_read_u32(entity_node,
+				       "mipi-sdca-RxUMP-ownership-transition-maxdelay", &delay);
+	if (!ret)
+		hide->max_delay = delay;
+
+	nval = fwnode_property_count_u32(entity_node, "mipi-sdca-HIDTx-supported-report-ids");
+	if (nval > 0) {
+		hide->num_hidtx_ids = nval;
+		hide->hidtx_ids = devm_kcalloc(dev, hide->num_hidtx_ids,
+					       sizeof(*hide->hidtx_ids), GFP_KERNEL);
+		if (!hide->hidtx_ids)
+			return -ENOMEM;
+
+		ret = fwnode_property_read_u32_array(entity_node,
+						     "mipi-sdca-HIDTx-supported-report-ids",
+						     hide->hidtx_ids,
+						     hide->num_hidtx_ids);
+		if (ret < 0)
+			return ret;
+	}
+
+	nval = fwnode_property_count_u32(entity_node, "mipi-sdca-HIDRx-supported-report-ids");
+	if (nval > 0) {
+		hide->num_hidrx_ids = nval;
+		hide->hidrx_ids = devm_kcalloc(dev, hide->num_hidrx_ids,
+					       sizeof(*hide->hidrx_ids), GFP_KERNEL);
+		if (!hide->hidrx_ids)
+			return -ENOMEM;
+
+		ret = fwnode_property_read_u32_array(entity_node,
+						     "mipi-sdca-HIDRx-supported-report-ids",
+						     hide->hidrx_ids,
+						     hide->num_hidrx_ids);
+		if (ret < 0)
+			return ret;
+	}
+
+	nval = fwnode_property_count_u32(entity_node, "mipi-sdca-hide-related-audio-function-list");
+	if (nval <= 0) {
+		dev_err(dev, "%pfwP: audio function numbers list missing: %d\n",
+			entity_node, nval);
+		return -EINVAL;
+	} else if (nval > SDCA_MAX_FUNCTION_COUNT) {
+		dev_err(dev, "%pfwP: maximum number of audio function exceeded\n", entity_node);
+		return -EINVAL;
+	}
+
+	hide->hide_reside_function_num = nval;
+	fwnode_property_read_u32_array(entity_node,
+				       "mipi-sdca-hide-related-audio-function-list", af_list, nval);
+
+	nval = fwnode_property_count_u8(function_node, "mipi-sdca-hid-descriptor");
+	if (nval)
+		fwnode_property_read_u8_array(function_node, "mipi-sdca-hid-descriptor",
+					      (u8 *)&hide->hid_desc, nval);
+
+	if (hide->hid_desc.bNumDescriptors) {
+		nval = fwnode_property_count_u8(function_node, "mipi-sdca-report-descriptor");
+		if (nval) {
+			report_desc = devm_kzalloc(dev, nval, GFP_KERNEL);
+			if (!report_desc)
+				return -ENOMEM;
+			hide->hid_report_desc = report_desc;
+			fwnode_property_read_u8_array(function_node, "mipi-sdca-report-descriptor",
+						      report_desc, nval);
+		}
+	}
+
+	return 0;
+}
+
 static int find_sdca_entity(struct device *dev,
 			    struct fwnode_handle *function_node,
 			    struct fwnode_handle *entity_node,
@@ -1261,6 +1341,9 @@ static int find_sdca_entity(struct device *dev,
 	case SDCA_ENTITY_TYPE_GE:
 		ret = find_sdca_entity_ge(dev, entity_node, entity);
 		break;
+	case SDCA_ENTITY_TYPE_HIDE:
+		ret = find_sdca_entity_hide(dev, function_node, entity_node, entity);
+		break;
 	default:
 		break;
 	}
-- 
cgit v1.2.3


From 3421d46440ebe0865bec71dbd2330b4e17a425ab Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Mon, 16 Jun 2025 19:49:07 +0800
Subject: HID: core: Add bus define for SoundWire bus

SDCA (SoundWire Device Class for Audio) uses HID to convey
input events from peripheral devices. Add a bus define for the
SoundWire bus to prepare support for this.

Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Signed-off-by: Shuming Fan <shumingf@realtek.com>
Acked-by: Jiri Kosina <jkosina@suse.com>
Link: https://patch.msgid.link/20250616114907.855452-1-shumingf@realtek.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/hid/hid-core.c     | 3 +++
 include/uapi/linux/input.h | 1 +
 2 files changed, 4 insertions(+)

(limited to 'include')

diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c
index b348d0464314..b419e31005b8 100644
--- a/drivers/hid/hid-core.c
+++ b/drivers/hid/hid-core.c
@@ -2294,6 +2294,9 @@ int hid_connect(struct hid_device *hdev, unsigned int connect_mask)
 	case BUS_I2C:
 		bus = "I2C";
 		break;
+	case BUS_SDW:
+		bus = "SOUNDWIRE";
+		break;
 	case BUS_VIRTUAL:
 		bus = "VIRTUAL";
 		break;
diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h
index 2557eb7b0561..127119c287cf 100644
--- a/include/uapi/linux/input.h
+++ b/include/uapi/linux/input.h
@@ -275,6 +275,7 @@ struct input_mask {
 #define BUS_CEC			0x1E
 #define BUS_INTEL_ISHTP		0x1F
 #define BUS_AMD_SFH		0x20
+#define BUS_SDW			0x21
 
 /*
  * MT_TOOL types
-- 
cgit v1.2.3


From 4051ead99888f101be92c7ce90d2de09aac6fd1c Mon Sep 17 00:00:00 2001
From: Li Chen <chenl311@chinatelecom.cn>
Date: Fri, 20 Jun 2025 20:02:31 +0800
Subject: HID: rate-limit hid_warn to prevent log flooding

Syzkaller can create many uhid devices that trigger
repeated warnings like:

  "hid-generic xxxx: unknown main item tag 0x0"

These messages can flood the system log, especially if a crash occurs
(e.g., with a slow UART console, leading to soft lockups). To mitigate
this, convert `hid_warn()` to use `dev_warn_ratelimited()`.

This helps reduce log noise and improves system stability under fuzzing
or faulty device scenarios.

Signed-off-by: Li Chen <chenl311@chinatelecom.cn>
Signed-off-by: Jiri Kosina <jkosina@suse.com>
---
 drivers/hid/hid-core.c | 4 ++--
 include/linux/hid.h    | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c
index 4a00bd4a4224..ef1f79951d9b 100644
--- a/drivers/hid/hid-core.c
+++ b/drivers/hid/hid-core.c
@@ -659,9 +659,9 @@ static int hid_parser_main(struct hid_parser *parser, struct hid_item *item)
 	default:
 		if (item->tag >= HID_MAIN_ITEM_TAG_RESERVED_MIN &&
 			item->tag <= HID_MAIN_ITEM_TAG_RESERVED_MAX)
-			hid_warn(parser->device, "reserved main item tag 0x%x\n", item->tag);
+			hid_warn_ratelimited(parser->device, "reserved main item tag 0x%x\n", item->tag);
 		else
-			hid_warn(parser->device, "unknown main item tag 0x%x\n", item->tag);
+			hid_warn_ratelimited(parser->device, "unknown main item tag 0x%x\n", item->tag);
 		ret = 0;
 	}
 
diff --git a/include/linux/hid.h b/include/linux/hid.h
index 568a9d8c749b..7f260e0e2049 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -1239,6 +1239,8 @@ void hid_quirks_exit(__u16 bus);
 	dev_notice(&(hid)->dev, fmt, ##__VA_ARGS__)
 #define hid_warn(hid, fmt, ...)				\
 	dev_warn(&(hid)->dev, fmt, ##__VA_ARGS__)
+#define hid_warn_ratelimited(hid, fmt, ...)				\
+	dev_warn_ratelimited(&(hid)->dev, fmt, ##__VA_ARGS__)
 #define hid_info(hid, fmt, ...)				\
 	dev_info(&(hid)->dev, fmt, ##__VA_ARGS__)
 #define hid_dbg(hid, fmt, ...)				\
-- 
cgit v1.2.3


From cf207eac06f661fb692f405d5ab8230df884ee52 Mon Sep 17 00:00:00 2001
From: Binbin Wu <binbin.wu@linux.intel.com>
Date: Tue, 10 Jun 2025 10:14:20 +0800
Subject: KVM: TDX: Handle TDG.VP.VMCALL<GetQuote>

Handle TDVMCALL for GetQuote to generate a TD-Quote.

GetQuote is a doorbell-like interface used by TDX guests to request VMM
to generate a TD-Quote signed by a service hosting TD-Quoting Enclave
operating on the host.  A TDX guest passes a TD Report (TDREPORT_STRUCT) in
a shared-memory area as parameter.  Host VMM can access it and queue the
operation for a service hosting TD-Quoting enclave.  When completed, the
Quote is returned via the same shared-memory area.

KVM only checks that the GPA from the TDX guest has the shared-bit set and drops
the shared-bit before exiting to userspace to avoid bleeding the shared-bit
into KVM's exit ABI.  KVM forwards the request to userspace VMM (e.g. QEMU)
and userspace VMM queues the operation asynchronously.  KVM sets the return
code according to the 'ret' field set by userspace to notify the TDX guest
whether the request has been queued successfully or not.  When the request
has been queued successfully, the TDX guest can poll the status field in
the shared-memory area to check whether the Quote generation is completed
or not.  When completed, the generated Quote is returned via the same
buffer.

Add KVM_EXIT_TDX as a new exit reason to userspace. Userspace is
required to handle the KVM exit reason as the initial support for TDX,
by reentering KVM to ensure that the TDVMCALL is complete.  While at it,
add a note that KVM_EXIT_HYPERCALL also requires reentry with KVM_RUN.

Signed-off-by: Binbin Wu <binbin.wu@linux.intel.com>
Tested-by: Mikko Ylinen <mikko.ylinen@linux.intel.com>
Acked-by: Kai Huang <kai.huang@intel.com>
[Adjust userspace API. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/api.rst | 49 +++++++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/vmx/tdx.c         | 32 +++++++++++++++++++++++++++
 include/uapi/linux/kvm.h       | 17 +++++++++++++++
 3 files changed, 97 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 1bd2d42e6424..115ec3c2b641 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6645,7 +6645,8 @@ to the byte array.
 .. note::
 
       For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN,
-      KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
+      KVM_EXIT_EPR, KVM_EXIT_HYPERCALL, KVM_EXIT_TDX,
+      KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
       operations are complete (and guest state is consistent) only after userspace
       has re-entered the kernel with KVM_RUN.  The kernel side will first finish
       incomplete operations and then check for pending signals.
@@ -7174,6 +7175,52 @@ The valid value for 'flags' is:
   - KVM_NOTIFY_CONTEXT_INVALID -- the VM context is corrupted and not valid
     in VMCS. It would run into unknown result if resume the target VM.
 
+::
+
+		/* KVM_EXIT_TDX */
+		struct {
+			__u64 flags;
+			__u64 nr;
+			union {
+				struct {
+					u64 ret;
+					u64 data[5];
+				} unknown;
+				struct {
+					u64 ret;
+					u64 gpa;
+					u64 size;
+				} get_quote;
+			};
+		} tdx;
+
+Process a TDVMCALL from the guest.  KVM forwards select TDVMCALL based
+on the Guest-Hypervisor Communication Interface (GHCI) specification;
+KVM bridges these requests to the userspace VMM with minimal changes,
+placing the inputs in the union and copying them back to the guest
+on re-entry.
+
+Flags are currently always zero, whereas ``nr`` contains the TDVMCALL
+number from register R11.  The remaining field of the union provide the
+inputs and outputs of the TDVMCALL.  Currently the following values of
+``nr`` are defined:
+
+* ``TDVMCALL_GET_QUOTE``: the guest has requested to generate a TD-Quote
+signed by a service hosting TD-Quoting Enclave operating on the host.
+Parameters and return value are in the ``get_quote`` field of the union.
+The ``gpa`` field and ``size`` specify the guest physical address
+(without the shared bit set) and the size of a shared-memory buffer, in
+which the TDX guest passes a TD Report.  The ``ret`` field represents
+the return value of the GetQuote request.  When the request has been
+queued successfully, the TDX guest can poll the status field in the
+shared-memory area to check whether the Quote generation is completed or
+not. When completed, the generated Quote is returned via the same buffer.
+
+KVM may add support for more values in the future that may cause a userspace
+exit, even without calls to ``KVM_ENABLE_CAP`` or similar.  In this case,
+it will enter with output fields already valid; in the common case, the
+``unknown.ret`` field of the union will be ``TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED``.
+Userspace need not do anything if it does not wish to support a TDVMCALL.
 ::
 
 		/* Fix the size of the union. */
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 5d100c240ab3..b619a3478983 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1465,6 +1465,36 @@ static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static int tdx_complete_simple(struct kvm_vcpu *vcpu)
+{
+	tdvmcall_set_return_code(vcpu, vcpu->run->tdx.unknown.ret);
+	return 1;
+}
+
+static int tdx_get_quote(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_tdx *tdx = to_tdx(vcpu);
+	u64 gpa = tdx->vp_enter_args.r12;
+	u64 size = tdx->vp_enter_args.r13;
+
+	/* The gpa of buffer must have shared bit set. */
+	if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
+		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
+		return 1;
+	}
+
+	vcpu->run->exit_reason = KVM_EXIT_TDX;
+	vcpu->run->tdx.flags = 0;
+	vcpu->run->tdx.nr = TDVMCALL_GET_QUOTE;
+	vcpu->run->tdx.get_quote.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
+	vcpu->run->tdx.get_quote.gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
+	vcpu->run->tdx.get_quote.size = size;
+
+	vcpu->arch.complete_userspace_io = tdx_complete_simple;
+
+	return 0;
+}
+
 static int handle_tdvmcall(struct kvm_vcpu *vcpu)
 {
 	switch (tdvmcall_leaf(vcpu)) {
@@ -1474,6 +1504,8 @@ static int handle_tdvmcall(struct kvm_vcpu *vcpu)
 		return tdx_report_fatal_error(vcpu);
 	case TDVMCALL_GET_TD_VM_CALL_INFO:
 		return tdx_get_td_vm_call_info(vcpu);
+	case TDVMCALL_GET_QUOTE:
+		return tdx_get_quote(vcpu);
 	default:
 		break;
 	}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index d00b85cb168c..e23e7286ad1a 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -178,6 +178,7 @@ struct kvm_xen_exit {
 #define KVM_EXIT_NOTIFY           37
 #define KVM_EXIT_LOONGARCH_IOCSR  38
 #define KVM_EXIT_MEMORY_FAULT     39
+#define KVM_EXIT_TDX              40
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 /* Emulate instruction failed. */
@@ -447,6 +448,22 @@ struct kvm_run {
 			__u64 gpa;
 			__u64 size;
 		} memory_fault;
+		/* KVM_EXIT_TDX */
+		struct {
+			__u64 flags;
+			__u64 nr;
+			union {
+				struct {
+					__u64 ret;
+					__u64 data[5];
+				} unknown;
+				struct {
+					__u64 ret;
+					__u64 gpa;
+					__u64 size;
+				} get_quote;
+			};
+		} tdx;
 		/* Fix the size of the union. */
 		char padding[256];
 	};
-- 
cgit v1.2.3


From 25e8b1dd4883e6c251c3db5b347f3c8ae4ade921 Mon Sep 17 00:00:00 2001
From: Binbin Wu <binbin.wu@linux.intel.com>
Date: Tue, 10 Jun 2025 10:14:21 +0800
Subject: KVM: TDX: Exit to userspace for GetTdVmCallInfo

Exit to userspace for TDG.VP.VMCALL<GetTdVmCallInfo> via KVM_EXIT_TDX,
to allow userspace to provide information about the support of
TDVMCALLs when r12 is 1 for the TDVMCALLs beyond the GHCI base API.

GHCI spec defines the GHCI base TDVMCALLs: <GetTdVmCallInfo>, <MapGPA>,
<ReportFatalError>, <Instruction.CPUID>, <#VE.RequestMMIO>,
<Instruction.HLT>, <Instruction.IO>, <Instruction.RDMSR> and
<Instruction.WRMSR>. They must be supported by VMM to support TDX guests.

For GetTdVmCallInfo
- When leaf (r12) to enumerate TDVMCALL functionality is set to 0,
  successful execution indicates all GHCI base TDVMCALLs listed above are
  supported.

  Update the KVM TDX document with the set of the GHCI base APIs.

- When leaf (r12) to enumerate TDVMCALL functionality is set to 1, it
  indicates the TDX guest is querying the supported TDVMCALLs beyond
  the GHCI base TDVMCALLs.
  Exit to userspace to let userspace set the TDVMCALL sub-function bit(s)
  accordingly to the leaf outputs.  KVM could set the TDVMCALL bit(s)
  supported by itself when the TDVMCALLs don't need support from userspace
  after returning from userspace and before entering guest. Currently, no
  such TDVMCALLs are implemented, so KVM just sets the values returned from
  userspace.

Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Binbin Wu <binbin.wu@linux.intel.com>
[Adjust userspace API. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/api.rst | 10 ++++++++++
 arch/x86/kvm/vmx/tdx.c         | 43 ++++++++++++++++++++++++++++++++++++++----
 include/uapi/linux/kvm.h       |  5 +++++
 3 files changed, 54 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 115ec3c2b641..9abf93ee5f65 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -7191,6 +7191,11 @@ The valid value for 'flags' is:
 					u64 gpa;
 					u64 size;
 				} get_quote;
+				struct {
+					u64 ret;
+					u64 leaf;
+					u64 r11, r12, r13, r14;
+				} get_tdvmcall_info;
 			};
 		} tdx;
 
@@ -7216,6 +7221,11 @@ queued successfully, the TDX guest can poll the status field in the
 shared-memory area to check whether the Quote generation is completed or
 not. When completed, the generated Quote is returned via the same buffer.
 
+* ``TDVMCALL_GET_TD_VM_CALL_INFO``: the guest has requested the support
+status of TDVMCALLs.  The output values for the given leaf should be
+placed in fields from ``r11`` to ``r14`` of the ``get_tdvmcall_info``
+field of the union.
+
 KVM may add support for more values in the future that may cause a userspace
 exit, even without calls to ``KVM_ENABLE_CAP`` or similar.  In this case,
 it will enter with output fields already valid; in the common case, the
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index b619a3478983..1ad20c273f3b 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1451,18 +1451,53 @@ error:
 	return 1;
 }
 
+static int tdx_complete_get_td_vm_call_info(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_tdx *tdx = to_tdx(vcpu);
+
+	tdvmcall_set_return_code(vcpu, vcpu->run->tdx.get_tdvmcall_info.ret);
+
+	/*
+	 * For now, there is no TDVMCALL beyond GHCI base API supported by KVM
+	 * directly without the support from userspace, just set the value
+	 * returned from userspace.
+	 */
+	tdx->vp_enter_args.r11 = vcpu->run->tdx.get_tdvmcall_info.r11;
+	tdx->vp_enter_args.r12 = vcpu->run->tdx.get_tdvmcall_info.r12;
+	tdx->vp_enter_args.r13 = vcpu->run->tdx.get_tdvmcall_info.r13;
+	tdx->vp_enter_args.r14 = vcpu->run->tdx.get_tdvmcall_info.r14;
+
+	return 1;
+}
+
 static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_tdx *tdx = to_tdx(vcpu);
 
-	if (tdx->vp_enter_args.r12)
-		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
-	else {
+	switch (tdx->vp_enter_args.r12) {
+	case 0:
 		tdx->vp_enter_args.r11 = 0;
+		tdx->vp_enter_args.r12 = 0;
 		tdx->vp_enter_args.r13 = 0;
 		tdx->vp_enter_args.r14 = 0;
+		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUCCESS);
+		return 1;
+	case 1:
+		vcpu->run->tdx.get_tdvmcall_info.leaf = tdx->vp_enter_args.r12;
+		vcpu->run->exit_reason = KVM_EXIT_TDX;
+		vcpu->run->tdx.flags = 0;
+		vcpu->run->tdx.nr = TDVMCALL_GET_TD_VM_CALL_INFO;
+		vcpu->run->tdx.get_tdvmcall_info.ret = TDVMCALL_STATUS_SUCCESS;
+		vcpu->run->tdx.get_tdvmcall_info.r11 = 0;
+		vcpu->run->tdx.get_tdvmcall_info.r12 = 0;
+		vcpu->run->tdx.get_tdvmcall_info.r13 = 0;
+		vcpu->run->tdx.get_tdvmcall_info.r14 = 0;
+		vcpu->arch.complete_userspace_io = tdx_complete_get_td_vm_call_info;
+		return 0;
+	default:
+		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
+		return 1;
 	}
-	return 1;
 }
 
 static int tdx_complete_simple(struct kvm_vcpu *vcpu)
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index e23e7286ad1a..37891580d05d 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -462,6 +462,11 @@ struct kvm_run {
 					__u64 gpa;
 					__u64 size;
 				} get_quote;
+				struct {
+					__u64 ret;
+					__u64 leaf;
+					__u64 r11, r12, r13, r14;
+				} get_tdvmcall_info;
 			};
 		} tdx;
 		/* Fix the size of the union. */
-- 
cgit v1.2.3


From 4580dbef5ce0f95a4bd8ac2d007bc4fbf1539332 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 20 Jun 2025 13:28:08 -0400
Subject: KVM: TDX: Exit to userspace for SetupEventNotifyInterrupt

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/api.rst    |  7 +++++++
 arch/x86/include/asm/shared/tdx.h |  1 +
 arch/x86/kvm/vmx/tdx.c            | 23 +++++++++++++++++++++++
 include/uapi/linux/kvm.h          |  4 ++++
 4 files changed, 35 insertions(+)

(limited to 'include')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 9abf93ee5f65..f0d961436d0f 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -7196,6 +7196,10 @@ The valid value for 'flags' is:
 					u64 leaf;
 					u64 r11, r12, r13, r14;
 				} get_tdvmcall_info;
+				struct {
+					u64 ret;
+					u64 vector;
+				} setup_event_notify;
 			};
 		} tdx;
 
@@ -7226,6 +7230,9 @@ status of TDVMCALLs.  The output values for the given leaf should be
 placed in fields from ``r11`` to ``r14`` of the ``get_tdvmcall_info``
 field of the union.
 
+* ``TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT``: the guest has requested to
+set up a notification interrupt for vector ``vector``.
+
 KVM may add support for more values in the future that may cause a userspace
 exit, even without calls to ``KVM_ENABLE_CAP`` or similar.  In this case,
 it will enter with output fields already valid; in the common case, the
diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h
index d8525e6ef50a..8bc074c8d7c6 100644
--- a/arch/x86/include/asm/shared/tdx.h
+++ b/arch/x86/include/asm/shared/tdx.h
@@ -72,6 +72,7 @@
 #define TDVMCALL_MAP_GPA		0x10001
 #define TDVMCALL_GET_QUOTE		0x10002
 #define TDVMCALL_REPORT_FATAL_ERROR	0x10003
+#define TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT 0x10004ULL
 
 /*
  * TDG.VP.VMCALL Status Codes (returned in R10)
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 1ad20c273f3b..b4055a746ecd 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1530,6 +1530,27 @@ static int tdx_get_quote(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+static int tdx_setup_event_notify_interrupt(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_tdx *tdx = to_tdx(vcpu);
+	u64 vector = tdx->vp_enter_args.r12;
+
+	if (vector < 32 || vector > 255) {
+		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
+		return 1;
+	}
+
+	vcpu->run->exit_reason = KVM_EXIT_TDX;
+	vcpu->run->tdx.flags = 0;
+	vcpu->run->tdx.nr = TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT;
+	vcpu->run->tdx.setup_event_notify.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
+	vcpu->run->tdx.setup_event_notify.vector = vector;
+
+	vcpu->arch.complete_userspace_io = tdx_complete_simple;
+
+	return 0;
+}
+
 static int handle_tdvmcall(struct kvm_vcpu *vcpu)
 {
 	switch (tdvmcall_leaf(vcpu)) {
@@ -1541,6 +1562,8 @@ static int handle_tdvmcall(struct kvm_vcpu *vcpu)
 		return tdx_get_td_vm_call_info(vcpu);
 	case TDVMCALL_GET_QUOTE:
 		return tdx_get_quote(vcpu);
+	case TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT:
+		return tdx_setup_event_notify_interrupt(vcpu);
 	default:
 		break;
 	}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 37891580d05d..7a4c35ff03fe 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -467,6 +467,10 @@ struct kvm_run {
 					__u64 leaf;
 					__u64 r11, r12, r13, r14;
 				} get_tdvmcall_info;
+				struct {
+					__u64 ret;
+					__u64 vector;
+				} setup_event_notify;
 			};
 		} tdx;
 		/* Fix the size of the union. */
-- 
cgit v1.2.3


From 64f7548aad63d2fbca2eeb6eb33361c218ebd5a5 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 20 Jun 2025 21:19:40 +0200
Subject: lib/crypto: sha256: Mark sha256_choose_blocks as __always_inline

When the compiler chooses to not inline sha256_choose_blocks() in
the purgatory code, it fails to link against the missing CPU
specific version:

x86_64-linux-ld: arch/x86/purgatory/purgatory.ro: in function `sha256_choose_blocks.part.0':
sha256.c:(.text+0x6a6): undefined reference to `irq_fpu_usable'
sha256.c:(.text+0x6c7): undefined reference to `sha256_blocks_arch'
sha256.c:(.text+0x6cc): undefined reference to `sha256_blocks_simd'

Mark this function as __always_inline to prevent this, same as sha256_finup().

Fixes: 5b90a779bc54 ("crypto: lib/sha256 - Add helpers for block-based shash")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/r/20250620191952.1867578-1-arnd@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 include/crypto/internal/sha2.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/crypto/internal/sha2.h b/include/crypto/internal/sha2.h
index b9bccd3ff57f..21a27fd5e198 100644
--- a/include/crypto/internal/sha2.h
+++ b/include/crypto/internal/sha2.h
@@ -25,7 +25,7 @@ void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
 void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS],
 			const u8 *data, size_t nblocks);
 
-static inline void sha256_choose_blocks(
+static __always_inline void sha256_choose_blocks(
 	u32 state[SHA256_STATE_WORDS], const u8 *data, size_t nblocks,
 	bool force_generic, bool force_simd)
 {
-- 
cgit v1.2.3


From 530a8ba71b4c3b7fcee323dd997f4bab1be1a6ba Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 16 May 2025 14:35:35 -0700
Subject: KVM: Bound the number of dirty ring entries in a single reset at
 INT_MAX

Cap the number of ring entries that are reset in a single ioctl to INT_MAX
to ensure userspace isn't confused by a wrap into negative space, and so
that, in a truly pathological scenario, KVM doesn't miss a TLB flush due
to the count wrapping to zero.  While the size of the ring is fixed at
0x10000 entries and KVM (currently) supports at most 4096, userspace is
allowed to harvest entries from the ring while the reset is in-progress,
i.e. it's possible for the ring to always have harvested entries.

Opportunistically return an actual error code from the helper so that a
future fix to handle pending signals can gracefully return -EINTR.  Drop
the function comment now that the return code is a standard 0/-errno (and
because a future commit will add a proper lockdep assertion).

Opportunistically drop a similarly stale comment for kvm_dirty_ring_push().

Cc: Peter Xu <peterx@redhat.com>
Cc: Yan Zhao <yan.y.zhao@intel.com>
Cc: Maxim Levitsky <mlevitsk@redhat.com>
Cc: Binbin Wu <binbin.wu@linux.intel.com>
Fixes: fb04a1eddb1a ("KVM: X86: Implement ring-based dirty memory tracking")
Reviewed-by: James Houghton <jthoughton@google.com>
Reviewed-by: Binbin Wu <binbin.wu@linux.intel.com>
Reviewed-by: Yan Zhao <yan.y.zhao@intel.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Link: https://lore.kernel.org/r/20250516213540.2546077-2-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 include/linux/kvm_dirty_ring.h | 18 +++++-------------
 virt/kvm/dirty_ring.c          | 10 +++++-----
 virt/kvm/kvm_main.c            |  9 ++++++---
 3 files changed, 16 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/linux/kvm_dirty_ring.h b/include/linux/kvm_dirty_ring.h
index da4d9b5f58f1..eb10d87adf7d 100644
--- a/include/linux/kvm_dirty_ring.h
+++ b/include/linux/kvm_dirty_ring.h
@@ -49,9 +49,10 @@ static inline int kvm_dirty_ring_alloc(struct kvm *kvm, struct kvm_dirty_ring *r
 }
 
 static inline int kvm_dirty_ring_reset(struct kvm *kvm,
-				       struct kvm_dirty_ring *ring)
+				       struct kvm_dirty_ring *ring,
+				       int *nr_entries_reset)
 {
-	return 0;
+	return -ENOENT;
 }
 
 static inline void kvm_dirty_ring_push(struct kvm_vcpu *vcpu,
@@ -77,17 +78,8 @@ bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm);
 u32 kvm_dirty_ring_get_rsvd_entries(struct kvm *kvm);
 int kvm_dirty_ring_alloc(struct kvm *kvm, struct kvm_dirty_ring *ring,
 			 int index, u32 size);
-
-/*
- * called with kvm->slots_lock held, returns the number of
- * processed pages.
- */
-int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring);
-
-/*
- * returns =0: successfully pushed
- *         <0: unable to push, need to wait
- */
+int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring,
+			 int *nr_entries_reset);
 void kvm_dirty_ring_push(struct kvm_vcpu *vcpu, u32 slot, u64 offset);
 
 bool kvm_dirty_ring_check_request(struct kvm_vcpu *vcpu);
diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c
index d14ffc7513ee..77986f34eff8 100644
--- a/virt/kvm/dirty_ring.c
+++ b/virt/kvm/dirty_ring.c
@@ -105,19 +105,19 @@ static inline bool kvm_dirty_gfn_harvested(struct kvm_dirty_gfn *gfn)
 	return smp_load_acquire(&gfn->flags) & KVM_DIRTY_GFN_F_RESET;
 }
 
-int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring)
+int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring,
+			 int *nr_entries_reset)
 {
 	u32 cur_slot, next_slot;
 	u64 cur_offset, next_offset;
 	unsigned long mask;
-	int count = 0;
 	struct kvm_dirty_gfn *entry;
 	bool first_round = true;
 
 	/* This is only needed to make compilers happy */
 	cur_slot = cur_offset = mask = 0;
 
-	while (true) {
+	while (likely((*nr_entries_reset) < INT_MAX)) {
 		entry = &ring->dirty_gfns[ring->reset_index & (ring->size - 1)];
 
 		if (!kvm_dirty_gfn_harvested(entry))
@@ -130,7 +130,7 @@ int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring)
 		kvm_dirty_gfn_set_invalid(entry);
 
 		ring->reset_index++;
-		count++;
+		(*nr_entries_reset)++;
 		/*
 		 * Try to coalesce the reset operations when the guest is
 		 * scanning pages in the same slot.
@@ -167,7 +167,7 @@ int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring)
 
 	trace_kvm_dirty_ring_reset(ring);
 
-	return count;
+	return 0;
 }
 
 void kvm_dirty_ring_push(struct kvm_vcpu *vcpu, u32 slot, u64 offset)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index eec82775c5bf..c784b6708708 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4962,15 +4962,18 @@ static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
 {
 	unsigned long i;
 	struct kvm_vcpu *vcpu;
-	int cleared = 0;
+	int cleared = 0, r;
 
 	if (!kvm->dirty_ring_size)
 		return -EINVAL;
 
 	mutex_lock(&kvm->slots_lock);
 
-	kvm_for_each_vcpu(i, vcpu, kvm)
-		cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring);
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		r = kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring, &cleared);
+		if (r)
+			break;
+	}
 
 	mutex_unlock(&kvm->slots_lock);
 
-- 
cgit v1.2.3


From cd4178d19420359554e3da6fd77ecfd0f58067ce Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 12 Jun 2025 16:51:47 -0700
Subject: KVM: arm64: WARN if unmapping a vLPI fails in any path

When unmapping a vLPI, WARN if nullifying vCPU affinity fails, not just if
failure occurs when freeing an ITE.  If undoing vCPU affinity fails, then
odds are very good that vLPI state tracking has gotten out of whack,
i.e. that KVM and the GIC disagree on the state of an IRQ/vLPI.  At best,
inconsistent state means there is a lurking bug/flaw somewhere.  At worst,
the inconsistency could eventually be fatal to the host, e.g. if an ITS
command fails because KVM's view of things doesn't match reality/hardware.

Note, only the call from kvm_arch_irq_bypass_del_producer() by way of
kvm_vgic_v4_unset_forwarding() doesn't already WARN.  Common KVM's
kvm_irq_routing_update() WARNs if kvm_arch_update_irqfd_routing() fails.
For that path, if its_unmap_vlpi() fails in kvm_vgic_v4_unset_forwarding(),
the only possible causes are that the GIC doesn't have a v4 ITS (from
its_irq_set_vcpu_affinity()):

        /* Need a v4 ITS */
        if (!is_v4(its_dev->its))
                return -EINVAL;

        guard(raw_spinlock)(&its_dev->event_map.vlpi_lock);

        /* Unmap request? */
        if (!info)
                return its_vlpi_unmap(d);

or that KVM has gotten out of sync with the GIC/ITS (from its_vlpi_unmap()):

        if (!its_dev->event_map.vm || !irqd_is_forwarded_to_vcpu(d))
                return -EINVAL;

All of the above failure scenarios are warnable offences, as they should
never occur absent a kernel/KVM bug.

Acked-by: Oliver Upton <oliver.upton@linux.dev>
Link: https://lore.kernel.org/all/aFWY2LTVIxz5rfhh@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/arm64/kvm/vgic/vgic-its.c     | 2 +-
 arch/arm64/kvm/vgic/vgic-v4.c      | 4 ++--
 drivers/irqchip/irq-gic-v4.c       | 4 ++--
 include/linux/irqchip/arm-gic-v4.h | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index 534049c7c94b..98630dae910d 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -758,7 +758,7 @@ static void its_free_ite(struct kvm *kvm, struct its_ite *ite)
 	if (irq) {
 		scoped_guard(raw_spinlock_irqsave, &irq->irq_lock) {
 			if (irq->hw)
-				WARN_ON(its_unmap_vlpi(ite->irq->host_irq));
+				its_unmap_vlpi(ite->irq->host_irq);
 
 			irq->hw = false;
 		}
diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
index 193946108192..911170d4a9c8 100644
--- a/arch/arm64/kvm/vgic/vgic-v4.c
+++ b/arch/arm64/kvm/vgic/vgic-v4.c
@@ -545,10 +545,10 @@ int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq)
 	if (irq->hw) {
 		atomic_dec(&irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count);
 		irq->hw = false;
-		ret = its_unmap_vlpi(host_irq);
+		its_unmap_vlpi(host_irq);
 	}
 
 	raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
 	vgic_put_irq(kvm, irq);
-	return ret;
+	return 0;
 }
diff --git a/drivers/irqchip/irq-gic-v4.c b/drivers/irqchip/irq-gic-v4.c
index 58c28895f8c4..8455b4a5fbb0 100644
--- a/drivers/irqchip/irq-gic-v4.c
+++ b/drivers/irqchip/irq-gic-v4.c
@@ -342,10 +342,10 @@ int its_get_vlpi(int irq, struct its_vlpi_map *map)
 	return irq_set_vcpu_affinity(irq, &info);
 }
 
-int its_unmap_vlpi(int irq)
+void its_unmap_vlpi(int irq)
 {
 	irq_clear_status_flags(irq, IRQ_DISABLE_UNLAZY);
-	return irq_set_vcpu_affinity(irq, NULL);
+	WARN_ON_ONCE(irq_set_vcpu_affinity(irq, NULL));
 }
 
 int its_prop_update_vlpi(int irq, u8 config, bool inv)
diff --git a/include/linux/irqchip/arm-gic-v4.h b/include/linux/irqchip/arm-gic-v4.h
index 7f1f11a5e4e4..0b0887099fd7 100644
--- a/include/linux/irqchip/arm-gic-v4.h
+++ b/include/linux/irqchip/arm-gic-v4.h
@@ -146,7 +146,7 @@ int its_commit_vpe(struct its_vpe *vpe);
 int its_invall_vpe(struct its_vpe *vpe);
 int its_map_vlpi(int irq, struct its_vlpi_map *map);
 int its_get_vlpi(int irq, struct its_vlpi_map *map);
-int its_unmap_vlpi(int irq);
+void its_unmap_vlpi(int irq);
 int its_prop_update_vlpi(int irq, u8 config, bool inv);
 int its_prop_update_vsgi(int irq, u8 priority, bool group);
 
-- 
cgit v1.2.3


From 2b521d86ee80a436a92445b8206d38d75aeb39ea Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 16 May 2025 16:07:29 -0700
Subject: irqbypass: Take ownership of producer/consumer token tracking

Move ownership of IRQ bypass token tracking into irqbypass.ko, and
explicitly require callers to pass an eventfd_ctx structure instead of a
completely opaque token.  Relying on producers and consumers to set the
token appropriately is error prone, and hiding the fact that the token must
be an eventfd_ctx pointer (for all intents and purposes) unnecessarily
obfuscates the code and makes it more brittle.

Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Alex Williamson <alex.williamson@redhat.com>
Link: https://lore.kernel.org/r/20250516230734.2564775-4-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/x86.c                |  4 ++--
 drivers/vfio/pci/vfio_pci_intrs.c |  9 +++-----
 drivers/vhost/vdpa.c              |  8 +++----
 include/linux/irqbypass.h         | 35 +++++++++++++++++--------------
 virt/kvm/eventfd.c                |  7 +++----
 virt/lib/irqbypass.c              | 44 +++++++++++++++++++++++++--------------
 6 files changed, 58 insertions(+), 49 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b58a74c1722d..3dc93e2d4777 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -13669,8 +13669,8 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
 	ret = kvm_x86_call(pi_update_irte)(irqfd->kvm,
 					   prod->irq, irqfd->gsi, 0);
 	if (ret)
-		printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
-		       " fails: %d\n", irqfd->consumer.token, ret);
+		printk(KERN_INFO "irq bypass consumer (eventfd %p) unregistration"
+		       " fails: %d\n", irqfd->consumer.eventfd, ret);
 
 	spin_unlock_irq(&kvm->irqfds.lock);
 
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index 565966351dfa..d87fe116762a 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -505,15 +505,12 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev,
 	if (ret)
 		goto out_put_eventfd_ctx;
 
-	ctx->producer.token = trigger;
 	ctx->producer.irq = irq;
-	ret = irq_bypass_register_producer(&ctx->producer);
+	ret = irq_bypass_register_producer(&ctx->producer, trigger);
 	if (unlikely(ret)) {
 		dev_info(&pdev->dev,
-		"irq bypass producer (token %p) registration fails: %d\n",
-		ctx->producer.token, ret);
-
-		ctx->producer.token = NULL;
+		"irq bypass producer (eventfd %p) registration fails: %d\n",
+		trigger, ret);
 	}
 	ctx->trigger = trigger;
 
diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 5a49b5a6d496..7b265ffda697 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -213,10 +213,10 @@ static void vhost_vdpa_setup_vq_irq(struct vhost_vdpa *v, u16 qid)
 		return;
 
 	vq->call_ctx.producer.irq = irq;
-	ret = irq_bypass_register_producer(&vq->call_ctx.producer);
+	ret = irq_bypass_register_producer(&vq->call_ctx.producer, vq->call_ctx.ctx);
 	if (unlikely(ret))
-		dev_info(&v->dev, "vq %u, irq bypass producer (token %p) registration fails, ret =  %d\n",
-			 qid, vq->call_ctx.producer.token, ret);
+		dev_info(&v->dev, "vq %u, irq bypass producer (eventfd %p) registration fails, ret =  %d\n",
+			 qid, vq->call_ctx.ctx, ret);
 }
 
 static void vhost_vdpa_unsetup_vq_irq(struct vhost_vdpa *v, u16 qid)
@@ -712,7 +712,6 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd,
 			if (ops->get_status(vdpa) &
 			    VIRTIO_CONFIG_S_DRIVER_OK)
 				vhost_vdpa_unsetup_vq_irq(v, idx);
-			vq->call_ctx.producer.token = NULL;
 		}
 		break;
 	}
@@ -753,7 +752,6 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd,
 			cb.callback = vhost_vdpa_virtqueue_cb;
 			cb.private = vq;
 			cb.trigger = vq->call_ctx.ctx;
-			vq->call_ctx.producer.token = vq->call_ctx.ctx;
 			if (ops->get_status(vdpa) &
 			    VIRTIO_CONFIG_S_DRIVER_OK)
 				vhost_vdpa_setup_vq_irq(v, idx);
diff --git a/include/linux/irqbypass.h b/include/linux/irqbypass.h
index 9bdb2a781841..1b57d15ac4cf 100644
--- a/include/linux/irqbypass.h
+++ b/include/linux/irqbypass.h
@@ -10,6 +10,7 @@
 
 #include <linux/list.h>
 
+struct eventfd_ctx;
 struct irq_bypass_consumer;
 
 /*
@@ -18,20 +19,20 @@ struct irq_bypass_consumer;
  * The IRQ bypass manager is a simple set of lists and callbacks that allows
  * IRQ producers (ex. physical interrupt sources) to be matched to IRQ
  * consumers (ex. virtualization hardware that allows IRQ bypass or offload)
- * via a shared token (ex. eventfd_ctx).  Producers and consumers register
- * independently.  When a token match is found, the optional @stop callback
- * will be called for each participant.  The pair will then be connected via
- * the @add_* callbacks, and finally the optional @start callback will allow
- * any final coordination.  When either participant is unregistered, the
- * process is repeated using the @del_* callbacks in place of the @add_*
- * callbacks.  Match tokens must be unique per producer/consumer, 1:N pairings
- * are not supported.
+ * via a shared eventfd_ctx.  Producers and consumers register independently.
+ * When a producer and consumer are paired, i.e. an eventfd match is found, the
+ * optional @stop callback will be called for each participant.  The pair will
+ * then be connected via the @add_* callbacks, and finally the optional @start
+ * callback will allow any final coordination.  When either participant is
+ * unregistered, the process is repeated using the @del_* callbacks in place of
+ * the @add_* callbacks.  eventfds must be unique per producer/consumer, 1:N
+ * pairings are not supported.
  */
 
 /**
  * struct irq_bypass_producer - IRQ bypass producer definition
  * @node: IRQ bypass manager private list management
- * @token: opaque token to match between producer and consumer (non-NULL)
+ * @eventfd: eventfd context used to match producers and consumers
  * @irq: Linux IRQ number for the producer device
  * @add_consumer: Connect the IRQ producer to an IRQ consumer (optional)
  * @del_consumer: Disconnect the IRQ producer from an IRQ consumer (optional)
@@ -44,7 +45,7 @@ struct irq_bypass_consumer;
  */
 struct irq_bypass_producer {
 	struct list_head node;
-	void *token;
+	struct eventfd_ctx *eventfd;
 	int irq;
 	int (*add_consumer)(struct irq_bypass_producer *,
 			    struct irq_bypass_consumer *);
@@ -57,7 +58,7 @@ struct irq_bypass_producer {
 /**
  * struct irq_bypass_consumer - IRQ bypass consumer definition
  * @node: IRQ bypass manager private list management
- * @token: opaque token to match between producer and consumer (non-NULL)
+ * @eventfd: eventfd context used to match producers and consumers
  * @add_producer: Connect the IRQ consumer to an IRQ producer
  * @del_producer: Disconnect the IRQ consumer from an IRQ producer
  * @stop: Perform any quiesce operations necessary prior to add/del (optional)
@@ -70,7 +71,7 @@ struct irq_bypass_producer {
  */
 struct irq_bypass_consumer {
 	struct list_head node;
-	void *token;
+	struct eventfd_ctx *eventfd;
 	int (*add_producer)(struct irq_bypass_consumer *,
 			    struct irq_bypass_producer *);
 	void (*del_producer)(struct irq_bypass_consumer *,
@@ -79,9 +80,11 @@ struct irq_bypass_consumer {
 	void (*start)(struct irq_bypass_consumer *);
 };
 
-int irq_bypass_register_producer(struct irq_bypass_producer *);
-void irq_bypass_unregister_producer(struct irq_bypass_producer *);
-int irq_bypass_register_consumer(struct irq_bypass_consumer *);
-void irq_bypass_unregister_consumer(struct irq_bypass_consumer *);
+int irq_bypass_register_producer(struct irq_bypass_producer *producer,
+				 struct eventfd_ctx *eventfd);
+void irq_bypass_unregister_producer(struct irq_bypass_producer *producer);
+int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer,
+				 struct eventfd_ctx *eventfd);
+void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer);
 
 #endif /* IRQBYPASS_H */
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 11e5d1e3f12e..5bc6abe30748 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -426,15 +426,14 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
 
 #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
 	if (kvm_arch_has_irq_bypass()) {
-		irqfd->consumer.token = (void *)irqfd->eventfd;
 		irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer;
 		irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer;
 		irqfd->consumer.stop = kvm_arch_irq_bypass_stop;
 		irqfd->consumer.start = kvm_arch_irq_bypass_start;
-		ret = irq_bypass_register_consumer(&irqfd->consumer);
+		ret = irq_bypass_register_consumer(&irqfd->consumer, irqfd->eventfd);
 		if (ret)
-			pr_info("irq bypass consumer (token %p) registration fails: %d\n",
-				irqfd->consumer.token, ret);
+			pr_info("irq bypass consumer (eventfd %p) registration fails: %d\n",
+				irqfd->eventfd, ret);
 	}
 #endif
 
diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c
index 28a4d933569a..e8d7c420db52 100644
--- a/virt/lib/irqbypass.c
+++ b/virt/lib/irqbypass.c
@@ -77,30 +77,32 @@ static void __disconnect(struct irq_bypass_producer *prod,
 /**
  * irq_bypass_register_producer - register IRQ bypass producer
  * @producer: pointer to producer structure
+ * @eventfd: pointer to the eventfd context associated with the producer
  *
  * Add the provided IRQ producer to the list of producers and connect
- * with any matching token found on the IRQ consumers list.
+ * with any matching eventfd found on the IRQ consumers list.
  */
-int irq_bypass_register_producer(struct irq_bypass_producer *producer)
+int irq_bypass_register_producer(struct irq_bypass_producer *producer,
+				 struct eventfd_ctx *eventfd)
 {
 	struct irq_bypass_producer *tmp;
 	struct irq_bypass_consumer *consumer;
 	int ret;
 
-	if (!producer->token)
+	if (WARN_ON_ONCE(producer->eventfd))
 		return -EINVAL;
 
 	mutex_lock(&lock);
 
 	list_for_each_entry(tmp, &producers, node) {
-		if (tmp->token == producer->token) {
+		if (tmp->eventfd == eventfd) {
 			ret = -EBUSY;
 			goto out_err;
 		}
 	}
 
 	list_for_each_entry(consumer, &consumers, node) {
-		if (consumer->token == producer->token) {
+		if (consumer->eventfd == eventfd) {
 			ret = __connect(producer, consumer);
 			if (ret)
 				goto out_err;
@@ -108,6 +110,7 @@ int irq_bypass_register_producer(struct irq_bypass_producer *producer)
 		}
 	}
 
+	producer->eventfd = eventfd;
 	list_add(&producer->node, &producers);
 
 	mutex_unlock(&lock);
@@ -131,26 +134,28 @@ void irq_bypass_unregister_producer(struct irq_bypass_producer *producer)
 	struct irq_bypass_producer *tmp;
 	struct irq_bypass_consumer *consumer;
 
-	if (!producer->token)
+	if (!producer->eventfd)
 		return;
 
 	mutex_lock(&lock);
 
 	list_for_each_entry(tmp, &producers, node) {
-		if (tmp->token != producer->token)
+		if (tmp->eventfd != producer->eventfd)
 			continue;
 
 		list_for_each_entry(consumer, &consumers, node) {
-			if (consumer->token == producer->token) {
+			if (consumer->eventfd == producer->eventfd) {
 				__disconnect(producer, consumer);
 				break;
 			}
 		}
 
+		producer->eventfd = NULL;
 		list_del(&producer->node);
 		break;
 	}
 
+	WARN_ON_ONCE(producer->eventfd);
 	mutex_unlock(&lock);
 }
 EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer);
@@ -158,31 +163,35 @@ EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer);
 /**
  * irq_bypass_register_consumer - register IRQ bypass consumer
  * @consumer: pointer to consumer structure
+ * @eventfd: pointer to the eventfd context associated with the consumer
  *
  * Add the provided IRQ consumer to the list of consumers and connect
- * with any matching token found on the IRQ producer list.
+ * with any matching eventfd found on the IRQ producer list.
  */
-int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer)
+int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer,
+				 struct eventfd_ctx *eventfd)
 {
 	struct irq_bypass_consumer *tmp;
 	struct irq_bypass_producer *producer;
 	int ret;
 
-	if (!consumer->token ||
-	    !consumer->add_producer || !consumer->del_producer)
+	if (WARN_ON_ONCE(consumer->eventfd))
+		return -EINVAL;
+
+	if (!consumer->add_producer || !consumer->del_producer)
 		return -EINVAL;
 
 	mutex_lock(&lock);
 
 	list_for_each_entry(tmp, &consumers, node) {
-		if (tmp->token == consumer->token || tmp == consumer) {
+		if (tmp->eventfd == eventfd) {
 			ret = -EBUSY;
 			goto out_err;
 		}
 	}
 
 	list_for_each_entry(producer, &producers, node) {
-		if (producer->token == consumer->token) {
+		if (producer->eventfd == eventfd) {
 			ret = __connect(producer, consumer);
 			if (ret)
 				goto out_err;
@@ -190,6 +199,7 @@ int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer)
 		}
 	}
 
+	consumer->eventfd = eventfd;
 	list_add(&consumer->node, &consumers);
 
 	mutex_unlock(&lock);
@@ -213,7 +223,7 @@ void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer)
 	struct irq_bypass_consumer *tmp;
 	struct irq_bypass_producer *producer;
 
-	if (!consumer->token)
+	if (!consumer->eventfd)
 		return;
 
 	mutex_lock(&lock);
@@ -223,16 +233,18 @@ void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer)
 			continue;
 
 		list_for_each_entry(producer, &producers, node) {
-			if (producer->token == consumer->token) {
+			if (producer->eventfd == consumer->eventfd) {
 				__disconnect(producer, consumer);
 				break;
 			}
 		}
 
+		consumer->eventfd = NULL;
 		list_del(&consumer->node);
 		break;
 	}
 
+	WARN_ON_ONCE(consumer->eventfd);
 	mutex_unlock(&lock);
 }
 EXPORT_SYMBOL_GPL(irq_bypass_unregister_consumer);
-- 
cgit v1.2.3


From add57f493e0893ac0fb4acbdc441918d3e800f10 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 16 May 2025 16:07:30 -0700
Subject: irqbypass: Explicitly track producer and consumer bindings

Explicitly track IRQ bypass producer:consumer bindings.  This will allow
making removal an O(1) operation; searching through the list to find
information that is trivially tracked (and useful for debug) is wasteful.

Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Alex Williamson <alex.williamson@redhat.com>
Link: https://lore.kernel.org/r/20250516230734.2564775-5-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 include/linux/irqbypass.h | 7 +++++++
 virt/lib/irqbypass.c      | 9 +++++++++
 2 files changed, 16 insertions(+)

(limited to 'include')

diff --git a/include/linux/irqbypass.h b/include/linux/irqbypass.h
index 1b57d15ac4cf..b28197c87483 100644
--- a/include/linux/irqbypass.h
+++ b/include/linux/irqbypass.h
@@ -29,10 +29,13 @@ struct irq_bypass_consumer;
  * pairings are not supported.
  */
 
+struct irq_bypass_consumer;
+
 /**
  * struct irq_bypass_producer - IRQ bypass producer definition
  * @node: IRQ bypass manager private list management
  * @eventfd: eventfd context used to match producers and consumers
+ * @consumer: The connected consumer (NULL if no connection)
  * @irq: Linux IRQ number for the producer device
  * @add_consumer: Connect the IRQ producer to an IRQ consumer (optional)
  * @del_consumer: Disconnect the IRQ producer from an IRQ consumer (optional)
@@ -46,6 +49,7 @@ struct irq_bypass_consumer;
 struct irq_bypass_producer {
 	struct list_head node;
 	struct eventfd_ctx *eventfd;
+	struct irq_bypass_consumer *consumer;
 	int irq;
 	int (*add_consumer)(struct irq_bypass_producer *,
 			    struct irq_bypass_consumer *);
@@ -59,6 +63,7 @@ struct irq_bypass_producer {
  * struct irq_bypass_consumer - IRQ bypass consumer definition
  * @node: IRQ bypass manager private list management
  * @eventfd: eventfd context used to match producers and consumers
+ * @producer: The connected producer (NULL if no connection)
  * @add_producer: Connect the IRQ consumer to an IRQ producer
  * @del_producer: Disconnect the IRQ consumer from an IRQ producer
  * @stop: Perform any quiesce operations necessary prior to add/del (optional)
@@ -72,6 +77,8 @@ struct irq_bypass_producer {
 struct irq_bypass_consumer {
 	struct list_head node;
 	struct eventfd_ctx *eventfd;
+	struct irq_bypass_producer *producer;
+
 	int (*add_producer)(struct irq_bypass_consumer *,
 			    struct irq_bypass_producer *);
 	void (*del_producer)(struct irq_bypass_consumer *,
diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c
index e8d7c420db52..fdbf7ecc0c21 100644
--- a/virt/lib/irqbypass.c
+++ b/virt/lib/irqbypass.c
@@ -51,6 +51,10 @@ static int __connect(struct irq_bypass_producer *prod,
 	if (prod->start)
 		prod->start(prod);
 
+	if (!ret) {
+		prod->consumer = cons;
+		cons->producer = prod;
+	}
 	return ret;
 }
 
@@ -72,6 +76,9 @@ static void __disconnect(struct irq_bypass_producer *prod,
 		cons->start(cons);
 	if (prod->start)
 		prod->start(prod);
+
+	prod->consumer = NULL;
+	cons->producer = NULL;
 }
 
 /**
@@ -145,6 +152,7 @@ void irq_bypass_unregister_producer(struct irq_bypass_producer *producer)
 
 		list_for_each_entry(consumer, &consumers, node) {
 			if (consumer->eventfd == producer->eventfd) {
+				WARN_ON_ONCE(producer->consumer != consumer);
 				__disconnect(producer, consumer);
 				break;
 			}
@@ -234,6 +242,7 @@ void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer)
 
 		list_for_each_entry(producer, &producers, node) {
 			if (producer->eventfd == consumer->eventfd) {
+				WARN_ON_ONCE(consumer->producer != producer);
 				__disconnect(producer, consumer);
 				break;
 			}
-- 
cgit v1.2.3


From 8394b32faecd9c63b3c436e78e62519e9548e530 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 16 May 2025 16:07:33 -0700
Subject: irqbypass: Use xarray to track producers and consumers

Track IRQ bypass producers and consumers using an xarray to avoid the O(2n)
insertion time associated with walking a list to check for duplicate
entries, and to search for a partner.

At low (tens or few hundreds) total producer/consumer counts, using a list
is faster due to the need to allocate backing storage for xarray.  But as
count creeps into the thousands, xarray wins easily, and can provide
several orders of magnitude better latency at high counts.  E.g. hundreds
of nanoseconds vs. hundreds of milliseconds.

Cc: Oliver Upton <oliver.upton@linux.dev>
Cc: David Matlack <dmatlack@google.com>
Cc: Like Xu <like.xu.linux@gmail.com>
Cc: Binbin Wu <binbin.wu@linux.intel.com>
Reported-by: Yong He <alexyonghe@tencent.com>
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217379
Link: https://lore.kernel.org/all/20230801115646.33990-1-likexu@tencent.com
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Alex Williamson <alex.williamson@redhat.com>
Link: https://lore.kernel.org/r/20250516230734.2564775-8-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 include/linux/irqbypass.h |  4 ---
 virt/lib/irqbypass.c      | 74 ++++++++++++++++++++++++-----------------------
 2 files changed, 38 insertions(+), 40 deletions(-)

(limited to 'include')

diff --git a/include/linux/irqbypass.h b/include/linux/irqbypass.h
index b28197c87483..cd64fcaa88fe 100644
--- a/include/linux/irqbypass.h
+++ b/include/linux/irqbypass.h
@@ -33,7 +33,6 @@ struct irq_bypass_consumer;
 
 /**
  * struct irq_bypass_producer - IRQ bypass producer definition
- * @node: IRQ bypass manager private list management
  * @eventfd: eventfd context used to match producers and consumers
  * @consumer: The connected consumer (NULL if no connection)
  * @irq: Linux IRQ number for the producer device
@@ -47,7 +46,6 @@ struct irq_bypass_consumer;
  * for a physical device assigned to a VM.
  */
 struct irq_bypass_producer {
-	struct list_head node;
 	struct eventfd_ctx *eventfd;
 	struct irq_bypass_consumer *consumer;
 	int irq;
@@ -61,7 +59,6 @@ struct irq_bypass_producer {
 
 /**
  * struct irq_bypass_consumer - IRQ bypass consumer definition
- * @node: IRQ bypass manager private list management
  * @eventfd: eventfd context used to match producers and consumers
  * @producer: The connected producer (NULL if no connection)
  * @add_producer: Connect the IRQ consumer to an IRQ producer
@@ -75,7 +72,6 @@ struct irq_bypass_producer {
  * portions of the interrupt handling to the VM.
  */
 struct irq_bypass_consumer {
-	struct list_head node;
 	struct eventfd_ctx *eventfd;
 	struct irq_bypass_producer *producer;
 
diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c
index 828556c081f5..ea888b9203d2 100644
--- a/virt/lib/irqbypass.c
+++ b/virt/lib/irqbypass.c
@@ -22,8 +22,8 @@
 MODULE_LICENSE("GPL v2");
 MODULE_DESCRIPTION("IRQ bypass manager utility module");
 
-static LIST_HEAD(producers);
-static LIST_HEAD(consumers);
+static DEFINE_XARRAY(producers);
+static DEFINE_XARRAY(consumers);
 static DEFINE_MUTEX(lock);
 
 /* @lock must be held when calling connect */
@@ -86,13 +86,13 @@ static void __disconnect(struct irq_bypass_producer *prod,
  * @producer: pointer to producer structure
  * @eventfd: pointer to the eventfd context associated with the producer
  *
- * Add the provided IRQ producer to the list of producers and connect
- * with any matching eventfd found on the IRQ consumers list.
+ * Add the provided IRQ producer to the set of producers and connect with the
+ * consumer with a matching eventfd, if one exists.
  */
 int irq_bypass_register_producer(struct irq_bypass_producer *producer,
 				 struct eventfd_ctx *eventfd)
 {
-	struct irq_bypass_producer *tmp;
+	unsigned long index = (unsigned long)eventfd;
 	struct irq_bypass_consumer *consumer;
 	int ret;
 
@@ -101,22 +101,20 @@ int irq_bypass_register_producer(struct irq_bypass_producer *producer,
 
 	guard(mutex)(&lock);
 
-	list_for_each_entry(tmp, &producers, node) {
-		if (tmp->eventfd == eventfd)
-			return -EBUSY;
-	}
+	ret = xa_insert(&producers, index, producer, GFP_KERNEL);
+	if (ret)
+		return ret;
 
-	list_for_each_entry(consumer, &consumers, node) {
-		if (consumer->eventfd == eventfd) {
-			ret = __connect(producer, consumer);
-			if (ret)
-				return ret;
-			break;
+	consumer = xa_load(&consumers, index);
+	if (consumer) {
+		ret = __connect(producer, consumer);
+		if (ret) {
+			WARN_ON_ONCE(xa_erase(&producers, index) != producer);
+			return ret;
 		}
 	}
 
 	producer->eventfd = eventfd;
-	list_add(&producer->node, &producers);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(irq_bypass_register_producer);
@@ -125,11 +123,14 @@ EXPORT_SYMBOL_GPL(irq_bypass_register_producer);
  * irq_bypass_unregister_producer - unregister IRQ bypass producer
  * @producer: pointer to producer structure
  *
- * Remove a previously registered IRQ producer from the list of producers
- * and disconnect it from any connected IRQ consumer.
+ * Remove a previously registered IRQ producer (note, it's safe to call this
+ * even if registration was unsuccessful).  Disconnect from the associated
+ * consumer, if one exists.
  */
 void irq_bypass_unregister_producer(struct irq_bypass_producer *producer)
 {
+	unsigned long index = (unsigned long)producer->eventfd;
+
 	if (!producer->eventfd)
 		return;
 
@@ -138,8 +139,8 @@ void irq_bypass_unregister_producer(struct irq_bypass_producer *producer)
 	if (producer->consumer)
 		__disconnect(producer, producer->consumer);
 
+	WARN_ON_ONCE(xa_erase(&producers, index) != producer);
 	producer->eventfd = NULL;
-	list_del(&producer->node);
 }
 EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer);
 
@@ -148,13 +149,13 @@ EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer);
  * @consumer: pointer to consumer structure
  * @eventfd: pointer to the eventfd context associated with the consumer
  *
- * Add the provided IRQ consumer to the list of consumers and connect
- * with any matching eventfd found on the IRQ producer list.
+ * Add the provided IRQ consumer to the set of consumers and connect with the
+ * producer with a matching eventfd, if one exists.
  */
 int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer,
 				 struct eventfd_ctx *eventfd)
 {
-	struct irq_bypass_consumer *tmp;
+	unsigned long index = (unsigned long)eventfd;
 	struct irq_bypass_producer *producer;
 	int ret;
 
@@ -166,22 +167,20 @@ int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer,
 
 	guard(mutex)(&lock);
 
-	list_for_each_entry(tmp, &consumers, node) {
-		if (tmp->eventfd == eventfd)
-			return -EBUSY;
-	}
+	ret = xa_insert(&consumers, index, consumer, GFP_KERNEL);
+	if (ret)
+		return ret;
 
-	list_for_each_entry(producer, &producers, node) {
-		if (producer->eventfd == eventfd) {
-			ret = __connect(producer, consumer);
-			if (ret)
-				return ret;
-			break;
+	producer = xa_load(&producers, index);
+	if (producer) {
+		ret = __connect(producer, consumer);
+		if (ret) {
+			WARN_ON_ONCE(xa_erase(&consumers, index) != consumer);
+			return ret;
 		}
 	}
 
 	consumer->eventfd = eventfd;
-	list_add(&consumer->node, &consumers);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(irq_bypass_register_consumer);
@@ -190,11 +189,14 @@ EXPORT_SYMBOL_GPL(irq_bypass_register_consumer);
  * irq_bypass_unregister_consumer - unregister IRQ bypass consumer
  * @consumer: pointer to consumer structure
  *
- * Remove a previously registered IRQ consumer from the list of consumers
- * and disconnect it from any connected IRQ producer.
+ * Remove a previously registered IRQ consumer (note, it's safe to call this
+ * even if registration was unsuccessful).  Disconnect from the associated
+ * producer, if one exists.
  */
 void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer)
 {
+	unsigned long index = (unsigned long)consumer->eventfd;
+
 	if (!consumer->eventfd)
 		return;
 
@@ -203,7 +205,7 @@ void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer)
 	if (consumer->producer)
 		__disconnect(consumer->producer, consumer);
 
+	WARN_ON_ONCE(xa_erase(&consumers, index) != consumer);
 	consumer->eventfd = NULL;
-	list_del(&consumer->node);
 }
 EXPORT_SYMBOL_GPL(irq_bypass_unregister_consumer);
-- 
cgit v1.2.3


From 23b54381cee2928e8b5622e654ca4516f30d2f1a Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 16 May 2025 16:07:34 -0700
Subject: irqbypass: Require producers to pass in Linux IRQ number during
 registration

Pass in the Linux IRQ associated with an IRQ bypass producer instead of
relying on the caller to set the field prior to registration, as there's
no benefit to relying on callers to do the right thing.

Take care to set producer->irq before __connect(), as KVM expects the IRQ
to be valid as soon as a connection is possible.

Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Alex Williamson <alex.williamson@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Link: https://lore.kernel.org/r/20250516230734.2564775-9-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 drivers/vfio/pci/vfio_pci_intrs.c | 3 +--
 drivers/vhost/vdpa.c              | 4 ++--
 include/linux/irqbypass.h         | 2 +-
 virt/lib/irqbypass.c              | 5 ++++-
 4 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index d87fe116762a..123298a4dc8f 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -505,8 +505,7 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev,
 	if (ret)
 		goto out_put_eventfd_ctx;
 
-	ctx->producer.irq = irq;
-	ret = irq_bypass_register_producer(&ctx->producer, trigger);
+	ret = irq_bypass_register_producer(&ctx->producer, trigger, irq);
 	if (unlikely(ret)) {
 		dev_info(&pdev->dev,
 		"irq bypass producer (eventfd %p) registration fails: %d\n",
diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 7b265ffda697..af1e1fdfd9ed 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -212,8 +212,8 @@ static void vhost_vdpa_setup_vq_irq(struct vhost_vdpa *v, u16 qid)
 	if (!vq->call_ctx.ctx)
 		return;
 
-	vq->call_ctx.producer.irq = irq;
-	ret = irq_bypass_register_producer(&vq->call_ctx.producer, vq->call_ctx.ctx);
+	ret = irq_bypass_register_producer(&vq->call_ctx.producer,
+					   vq->call_ctx.ctx, irq);
 	if (unlikely(ret))
 		dev_info(&v->dev, "vq %u, irq bypass producer (eventfd %p) registration fails, ret =  %d\n",
 			 qid, vq->call_ctx.ctx, ret);
diff --git a/include/linux/irqbypass.h b/include/linux/irqbypass.h
index cd64fcaa88fe..ede1fa938152 100644
--- a/include/linux/irqbypass.h
+++ b/include/linux/irqbypass.h
@@ -84,7 +84,7 @@ struct irq_bypass_consumer {
 };
 
 int irq_bypass_register_producer(struct irq_bypass_producer *producer,
-				 struct eventfd_ctx *eventfd);
+				 struct eventfd_ctx *eventfd, int irq);
 void irq_bypass_unregister_producer(struct irq_bypass_producer *producer);
 int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer,
 				 struct eventfd_ctx *eventfd);
diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c
index ea888b9203d2..62c160200be9 100644
--- a/virt/lib/irqbypass.c
+++ b/virt/lib/irqbypass.c
@@ -85,12 +85,13 @@ static void __disconnect(struct irq_bypass_producer *prod,
  * irq_bypass_register_producer - register IRQ bypass producer
  * @producer: pointer to producer structure
  * @eventfd: pointer to the eventfd context associated with the producer
+ * @irq: Linux IRQ number of the underlying producer device
  *
  * Add the provided IRQ producer to the set of producers and connect with the
  * consumer with a matching eventfd, if one exists.
  */
 int irq_bypass_register_producer(struct irq_bypass_producer *producer,
-				 struct eventfd_ctx *eventfd)
+				 struct eventfd_ctx *eventfd, int irq)
 {
 	unsigned long index = (unsigned long)eventfd;
 	struct irq_bypass_consumer *consumer;
@@ -99,6 +100,8 @@ int irq_bypass_register_producer(struct irq_bypass_producer *producer,
 	if (WARN_ON_ONCE(producer->eventfd))
 		return -EINVAL;
 
+	producer->irq = irq;
+
 	guard(mutex)(&lock);
 
 	ret = xa_insert(&producers, index, producer, GFP_KERNEL);
-- 
cgit v1.2.3


From e295d2e7fbe69ddec772c951c466dfbfc1c96818 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 11 Jun 2025 14:35:40 -0700
Subject: KVM: x86: Trigger I/O APIC route rescan in
 kvm_arch_irq_routing_update()

Trigger the I/O APIC route rescan that's performed for a split IRQ chip
after userspace updates IRQ routes in kvm_arch_irq_routing_update(), i.e.
before dropping kvm->irq_lock.  Calling kvm_make_all_cpus_request() under
a mutex is perfectly safe, and the smp_wmb()+smp_mb__after_atomic() pair
in __kvm_make_request()+kvm_check_request() ensures the new routing is
visible to vCPUs prior to the request being visible to vCPUs.

In all likelihood, commit b053b2aef25d ("KVM: x86: Add EOI exit bitmap
inference") somewhat arbitrarily made the request outside of irq_lock to
avoid holding irq_lock any longer than is strictly necessary.  And then
commit abdb080f7ac8 ("kvm/irqchip: kvm_arch_irq_routing_update renaming
split") took the easy route of adding another arch hook instead of risking
a functional change.

Note, the call to synchronize_srcu_expedited() does NOT provide ordering
guarantees with respect to vCPUs scanning the new routing; as above, the
request infrastructure provides the necessary ordering.  I.e. there's no
need to wait for kvm_scan_ioapic_routes() to complete if it's actively
running, because regardless of whether it grabs the old or new table, the
vCPU will have another KVM_REQ_SCAN_IOAPIC pending, i.e. will rescan again
and see the new mappings.

Acked-by: Kai Huang <kai.huang@intel.com>
Link: https://lore.kernel.org/r/20250611213557.294358-2-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/irq_comm.c  | 10 +++-------
 include/linux/kvm_host.h |  4 ----
 virt/kvm/irqchip.c       |  2 --
 3 files changed, 3 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index d6d792b5d1bd..e2ae62ff9cc2 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -395,13 +395,6 @@ int kvm_setup_default_irq_routing(struct kvm *kvm)
 				   ARRAY_SIZE(default_routing), 0);
 }
 
-void kvm_arch_post_irq_routing_update(struct kvm *kvm)
-{
-	if (!irqchip_split(kvm))
-		return;
-	kvm_make_scan_ioapic_request(kvm);
-}
-
 void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode,
 			 u8 vector, unsigned long *ioapic_handled_vectors)
 {
@@ -466,4 +459,7 @@ void kvm_arch_irq_routing_update(struct kvm *kvm)
 #ifdef CONFIG_KVM_HYPERV
 	kvm_hv_irq_routing_update(kvm);
 #endif
+
+	if (irqchip_split(kvm))
+		kvm_make_scan_ioapic_request(kvm);
 }
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3bde4fb5c6aa..9461517b4e62 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1024,14 +1024,10 @@ void vcpu_put(struct kvm_vcpu *vcpu);
 
 #ifdef __KVM_HAVE_IOAPIC
 void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm);
-void kvm_arch_post_irq_routing_update(struct kvm *kvm);
 #else
 static inline void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm)
 {
 }
-static inline void kvm_arch_post_irq_routing_update(struct kvm *kvm)
-{
-}
 #endif
 
 #ifdef CONFIG_HAVE_KVM_IRQCHIP
diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
index 162d8ed889f2..6ccabfd32287 100644
--- a/virt/kvm/irqchip.c
+++ b/virt/kvm/irqchip.c
@@ -222,8 +222,6 @@ int kvm_set_irq_routing(struct kvm *kvm,
 	kvm_arch_irq_routing_update(kvm);
 	mutex_unlock(&kvm->irq_lock);
 
-	kvm_arch_post_irq_routing_update(kvm);
-
 	synchronize_srcu_expedited(&kvm->irq_srcu);
 
 	new = old;
-- 
cgit v1.2.3


From 77a74b8ff41ae620f5a5d727d596b670b7b9994e Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 11 Jun 2025 14:35:48 -0700
Subject: KVM: x86: Move kvm_{request,free}_irq_source_id() to i8254.c (PIT)

Move kvm_{request,free}_irq_source_id() to i8254.c, i.e. the dedicated PIT
emulation file, in anticipation of removing them entirely in favor of
hardcoding the PIT's "requested" source ID (the source ID can only ever be
'2', and the request can never fail).

No functional change intended.

Acked-by: Kai Huang <kai.huang@intel.com>
Link: https://lore.kernel.org/r/20250611213557.294358-10-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/i8254.c     | 44 ++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/irq_comm.c  | 44 --------------------------------------------
 include/linux/kvm_host.h |  2 --
 3 files changed, 44 insertions(+), 46 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 59f956f35f4c..2bb223bf0dac 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -641,6 +641,50 @@ static void kvm_pit_reset(struct kvm_pit *pit)
 	kvm_pit_reset_reinject(pit);
 }
 
+static int kvm_request_irq_source_id(struct kvm *kvm)
+{
+	unsigned long *bitmap = &kvm->arch.irq_sources_bitmap;
+	int irq_source_id;
+
+	mutex_lock(&kvm->irq_lock);
+	irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG);
+
+	if (irq_source_id >= BITS_PER_LONG) {
+		pr_warn("exhausted allocatable IRQ sources!\n");
+		irq_source_id = -EFAULT;
+		goto unlock;
+	}
+
+	ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
+	ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
+	set_bit(irq_source_id, bitmap);
+unlock:
+	mutex_unlock(&kvm->irq_lock);
+
+	return irq_source_id;
+}
+
+static void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
+{
+	ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
+	ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
+
+	mutex_lock(&kvm->irq_lock);
+	if (irq_source_id < 0 ||
+	    irq_source_id >= BITS_PER_LONG) {
+		pr_err("IRQ source ID out of range!\n");
+		goto unlock;
+	}
+	clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
+	if (!irqchip_full(kvm))
+		goto unlock;
+
+	kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id);
+	kvm_pic_clear_all(kvm->arch.vpic, irq_source_id);
+unlock:
+	mutex_unlock(&kvm->irq_lock);
+}
+
 static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
 {
 	struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier);
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 99c521bd9db5..138c675dc24b 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -165,50 +165,6 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
 	return -EWOULDBLOCK;
 }
 
-int kvm_request_irq_source_id(struct kvm *kvm)
-{
-	unsigned long *bitmap = &kvm->arch.irq_sources_bitmap;
-	int irq_source_id;
-
-	mutex_lock(&kvm->irq_lock);
-	irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG);
-
-	if (irq_source_id >= BITS_PER_LONG) {
-		pr_warn("exhausted allocatable IRQ sources!\n");
-		irq_source_id = -EFAULT;
-		goto unlock;
-	}
-
-	ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
-	ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
-	set_bit(irq_source_id, bitmap);
-unlock:
-	mutex_unlock(&kvm->irq_lock);
-
-	return irq_source_id;
-}
-
-void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
-{
-	ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
-	ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
-
-	mutex_lock(&kvm->irq_lock);
-	if (irq_source_id < 0 ||
-	    irq_source_id >= BITS_PER_LONG) {
-		pr_err("IRQ source ID out of range!\n");
-		goto unlock;
-	}
-	clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
-	if (!irqchip_full(kvm))
-		goto unlock;
-
-	kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id);
-	kvm_pic_clear_all(kvm->arch.vpic, irq_source_id);
-unlock:
-	mutex_unlock(&kvm->irq_lock);
-}
-
 void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
 				    struct kvm_irq_mask_notifier *kimn)
 {
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 9461517b4e62..cba8fc4529e8 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1784,8 +1784,6 @@ void kvm_register_irq_ack_notifier(struct kvm *kvm,
 				   struct kvm_irq_ack_notifier *kian);
 void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
 				   struct kvm_irq_ack_notifier *kian);
-int kvm_request_irq_source_id(struct kvm *kvm);
-void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
 bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args);
 
 /*
-- 
cgit v1.2.3


From 61423c413a746fd5fe5b0d865ea722e11b01105e Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 11 Jun 2025 14:35:49 -0700
Subject: KVM: x86: Hardcode the PIT IRQ source ID to '2'

Hardcode the PIT's source IRQ ID to '2' instead of "finding" that bit 2
is always the first available bit in irq_sources_bitmap.  Bits 0 and 1 are
set/reserved by kvm_arch_init_vm(), i.e. long before kvm_create_pit() can
be invoked, and KVM allows at most one in-kernel PIT instance, i.e. it's
impossible for the PIT to find a different free bit (there are no other
users of kvm_request_irq_source_id()).

Delete the now-defunct irq_sources_bitmap and all its associated code.

Acked-by: Kai Huang <kai.huang@intel.com>
Link: https://lore.kernel.org/r/20250611213557.294358-11-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/include/asm/kvm_host.h |  1 -
 arch/x86/kvm/i8254.c            | 55 ++++++-----------------------------------
 arch/x86/kvm/i8254.h            |  1 -
 arch/x86/kvm/x86.c              |  6 -----
 include/linux/kvm_host.h        |  1 +
 5 files changed, 8 insertions(+), 56 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index bf4459d637cc..4d68680f7051 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1397,7 +1397,6 @@ struct kvm_arch {
 	bool pause_in_guest;
 	bool cstate_in_guest;
 
-	unsigned long irq_sources_bitmap;
 	s64 kvmclock_offset;
 
 	/*
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 2bb223bf0dac..fa8187608cfc 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -248,8 +248,8 @@ static void pit_do_work(struct kthread_work *work)
 	if (atomic_read(&ps->reinject) && !atomic_xchg(&ps->irq_ack, 0))
 		return;
 
-	kvm_set_irq(kvm, pit->irq_source_id, 0, 1, false);
-	kvm_set_irq(kvm, pit->irq_source_id, 0, 0, false);
+	kvm_set_irq(kvm, KVM_PIT_IRQ_SOURCE_ID, 0, 1, false);
+	kvm_set_irq(kvm, KVM_PIT_IRQ_SOURCE_ID, 0, 0, false);
 
 	/*
 	 * Provides NMI watchdog support via Virtual Wire mode.
@@ -641,47 +641,11 @@ static void kvm_pit_reset(struct kvm_pit *pit)
 	kvm_pit_reset_reinject(pit);
 }
 
-static int kvm_request_irq_source_id(struct kvm *kvm)
+static void kvm_pit_clear_all(struct kvm *kvm)
 {
-	unsigned long *bitmap = &kvm->arch.irq_sources_bitmap;
-	int irq_source_id;
-
-	mutex_lock(&kvm->irq_lock);
-	irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG);
-
-	if (irq_source_id >= BITS_PER_LONG) {
-		pr_warn("exhausted allocatable IRQ sources!\n");
-		irq_source_id = -EFAULT;
-		goto unlock;
-	}
-
-	ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
-	ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
-	set_bit(irq_source_id, bitmap);
-unlock:
-	mutex_unlock(&kvm->irq_lock);
-
-	return irq_source_id;
-}
-
-static void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
-{
-	ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
-	ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
-
 	mutex_lock(&kvm->irq_lock);
-	if (irq_source_id < 0 ||
-	    irq_source_id >= BITS_PER_LONG) {
-		pr_err("IRQ source ID out of range!\n");
-		goto unlock;
-	}
-	clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
-	if (!irqchip_full(kvm))
-		goto unlock;
-
-	kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id);
-	kvm_pic_clear_all(kvm->arch.vpic, irq_source_id);
-unlock:
+	kvm_ioapic_clear_all(kvm->arch.vioapic, KVM_PIT_IRQ_SOURCE_ID);
+	kvm_pic_clear_all(kvm->arch.vpic, KVM_PIT_IRQ_SOURCE_ID);
 	mutex_unlock(&kvm->irq_lock);
 }
 
@@ -788,10 +752,6 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 	if (!pit)
 		return NULL;
 
-	pit->irq_source_id = kvm_request_irq_source_id(kvm);
-	if (pit->irq_source_id < 0)
-		goto fail_request;
-
 	mutex_init(&pit->pit_state.lock);
 
 	pid = get_pid(task_tgid(current));
@@ -843,8 +803,7 @@ fail_register_pit:
 	kvm_pit_set_reinject(pit, false);
 	kthread_destroy_worker(pit->worker);
 fail_kthread:
-	kvm_free_irq_source_id(kvm, pit->irq_source_id);
-fail_request:
+	kvm_pit_clear_all(kvm);
 	kfree(pit);
 	return NULL;
 }
@@ -861,7 +820,7 @@ void kvm_free_pit(struct kvm *kvm)
 		kvm_pit_set_reinject(pit, false);
 		hrtimer_cancel(&pit->pit_state.timer);
 		kthread_destroy_worker(pit->worker);
-		kvm_free_irq_source_id(kvm, pit->irq_source_id);
+		kvm_pit_clear_all(kvm);
 		kfree(pit);
 	}
 }
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 338095829ec8..b9c1feb379a7 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -44,7 +44,6 @@ struct kvm_pit {
 	struct kvm_io_device speaker_dev;
 	struct kvm *kvm;
 	struct kvm_kpit_state pit_state;
-	int irq_source_id;
 	struct kvm_irq_mask_notifier mask_notifier;
 	struct kthread_worker *worker;
 	struct kthread_work expired;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 53f2ce2d40de..1d744730985e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -12669,12 +12669,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
 	atomic_set(&kvm->arch.noncoherent_dma_count, 0);
 
-	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
-	set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
-	/* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */
-	set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
-		&kvm->arch.irq_sources_bitmap);
-
 	raw_spin_lock_init(&kvm->arch.tsc_write_lock);
 	mutex_init(&kvm->arch.apic_map_lock);
 	seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index cba8fc4529e8..4ff5ea29e343 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -190,6 +190,7 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req);
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID		0
 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID	1
+#define KVM_PIT_IRQ_SOURCE_ID			2
 
 extern struct mutex kvm_lock;
 extern struct list_head vm_list;
-- 
cgit v1.2.3


From 2c938850d9d18cbd6484a66588fac95d74d951fd Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 11 Jun 2025 14:35:52 -0700
Subject: KVM: Move x86-only tracepoints to x86's trace.h

Move the I/O APIC tracepoints and trace_kvm_msi_set_irq() to x86, as
__KVM_HAVE_IOAPIC is just code for "x86", and trace_kvm_msi_set_irq()
isn't unique to I/O APIC emulation.

Opportunistically clean up the absurdly messy #includes in ioapic.c.

No functional change intended.

Acked-by: Kai Huang <kai.huang@intel.com>
Link: https://lore.kernel.org/r/20250611213557.294358-14-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/ioapic.c      |  2 +-
 arch/x86/kvm/irq_comm.c    | 10 ++----
 arch/x86/kvm/trace.h       | 78 ++++++++++++++++++++++++++++++++++++++++++++++
 include/trace/events/kvm.h | 77 ---------------------------------------------
 4 files changed, 82 insertions(+), 85 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 65626da1407f..fa7481814bc6 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -41,11 +41,11 @@
 #include <asm/processor.h>
 #include <asm/page.h>
 #include <asm/current.h>
-#include <trace/events/kvm.h>
 
 #include "ioapic.h"
 #include "lapic.h"
 #include "irq.h"
+#include "trace.h"
 
 static int ioapic_service(struct kvm_ioapic *vioapic, int irq,
 		bool line_status);
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 138c675dc24b..13d84c25e503 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -15,15 +15,11 @@
 #include <linux/export.h>
 #include <linux/rculist.h>
 
-#include <trace/events/kvm.h>
-
-#include "irq.h"
-
+#include "hyperv.h"
 #include "ioapic.h"
-
+#include "irq.h"
 #include "lapic.h"
-
-#include "hyperv.h"
+#include "trace.h"
 #include "x86.h"
 #include "xen.h"
 
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index ba736cbb0587..4ef17990574d 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -260,6 +260,84 @@ TRACE_EVENT(kvm_cpuid,
 		  __entry->used_max_basic ? ", used max basic" : "")
 );
 
+#define kvm_deliver_mode		\
+	{0x0, "Fixed"},			\
+	{0x1, "LowPrio"},		\
+	{0x2, "SMI"},			\
+	{0x3, "Res3"},			\
+	{0x4, "NMI"},			\
+	{0x5, "INIT"},			\
+	{0x6, "SIPI"},			\
+	{0x7, "ExtINT"}
+
+TRACE_EVENT(kvm_ioapic_set_irq,
+	    TP_PROTO(__u64 e, int pin, bool coalesced),
+	    TP_ARGS(e, pin, coalesced),
+
+	TP_STRUCT__entry(
+		__field(	__u64,		e		)
+		__field(	int,		pin		)
+		__field(	bool,		coalesced	)
+	),
+
+	TP_fast_assign(
+		__entry->e		= e;
+		__entry->pin		= pin;
+		__entry->coalesced	= coalesced;
+	),
+
+	TP_printk("pin %u dst %x vec %u (%s|%s|%s%s)%s",
+		  __entry->pin, (u8)(__entry->e >> 56), (u8)__entry->e,
+		  __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
+		  (__entry->e & (1<<11)) ? "logical" : "physical",
+		  (__entry->e & (1<<15)) ? "level" : "edge",
+		  (__entry->e & (1<<16)) ? "|masked" : "",
+		  __entry->coalesced ? " (coalesced)" : "")
+);
+
+TRACE_EVENT(kvm_ioapic_delayed_eoi_inj,
+	    TP_PROTO(__u64 e),
+	    TP_ARGS(e),
+
+	TP_STRUCT__entry(
+		__field(	__u64,		e		)
+	),
+
+	TP_fast_assign(
+		__entry->e		= e;
+	),
+
+	TP_printk("dst %x vec %u (%s|%s|%s%s)",
+		  (u8)(__entry->e >> 56), (u8)__entry->e,
+		  __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
+		  (__entry->e & (1<<11)) ? "logical" : "physical",
+		  (__entry->e & (1<<15)) ? "level" : "edge",
+		  (__entry->e & (1<<16)) ? "|masked" : "")
+);
+
+TRACE_EVENT(kvm_msi_set_irq,
+	    TP_PROTO(__u64 address, __u64 data),
+	    TP_ARGS(address, data),
+
+	TP_STRUCT__entry(
+		__field(	__u64,		address		)
+		__field(	__u64,		data		)
+	),
+
+	TP_fast_assign(
+		__entry->address	= address;
+		__entry->data		= data;
+	),
+
+	TP_printk("dst %llx vec %u (%s|%s|%s%s)",
+		  (u8)(__entry->address >> 12) | ((__entry->address >> 32) & 0xffffff00),
+		  (u8)__entry->data,
+		  __print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode),
+		  (__entry->address & (1<<2)) ? "logical" : "physical",
+		  (__entry->data & (1<<15)) ? "level" : "edge",
+		  (__entry->address & (1<<3)) ? "|rh" : "")
+);
+
 #define AREG(x) { APIC_##x, "APIC_" #x }
 
 #define kvm_trace_symbol_apic						    \
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index fc7d0f8ff078..96e581900c8e 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -85,83 +85,6 @@ TRACE_EVENT(kvm_set_irq,
 #endif /* defined(CONFIG_HAVE_KVM_IRQCHIP) */
 
 #if defined(__KVM_HAVE_IOAPIC)
-#define kvm_deliver_mode		\
-	{0x0, "Fixed"},			\
-	{0x1, "LowPrio"},		\
-	{0x2, "SMI"},			\
-	{0x3, "Res3"},			\
-	{0x4, "NMI"},			\
-	{0x5, "INIT"},			\
-	{0x6, "SIPI"},			\
-	{0x7, "ExtINT"}
-
-TRACE_EVENT(kvm_ioapic_set_irq,
-	    TP_PROTO(__u64 e, int pin, bool coalesced),
-	    TP_ARGS(e, pin, coalesced),
-
-	TP_STRUCT__entry(
-		__field(	__u64,		e		)
-		__field(	int,		pin		)
-		__field(	bool,		coalesced	)
-	),
-
-	TP_fast_assign(
-		__entry->e		= e;
-		__entry->pin		= pin;
-		__entry->coalesced	= coalesced;
-	),
-
-	TP_printk("pin %u dst %x vec %u (%s|%s|%s%s)%s",
-		  __entry->pin, (u8)(__entry->e >> 56), (u8)__entry->e,
-		  __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
-		  (__entry->e & (1<<11)) ? "logical" : "physical",
-		  (__entry->e & (1<<15)) ? "level" : "edge",
-		  (__entry->e & (1<<16)) ? "|masked" : "",
-		  __entry->coalesced ? " (coalesced)" : "")
-);
-
-TRACE_EVENT(kvm_ioapic_delayed_eoi_inj,
-	    TP_PROTO(__u64 e),
-	    TP_ARGS(e),
-
-	TP_STRUCT__entry(
-		__field(	__u64,		e		)
-	),
-
-	TP_fast_assign(
-		__entry->e		= e;
-	),
-
-	TP_printk("dst %x vec %u (%s|%s|%s%s)",
-		  (u8)(__entry->e >> 56), (u8)__entry->e,
-		  __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
-		  (__entry->e & (1<<11)) ? "logical" : "physical",
-		  (__entry->e & (1<<15)) ? "level" : "edge",
-		  (__entry->e & (1<<16)) ? "|masked" : "")
-);
-
-TRACE_EVENT(kvm_msi_set_irq,
-	    TP_PROTO(__u64 address, __u64 data),
-	    TP_ARGS(address, data),
-
-	TP_STRUCT__entry(
-		__field(	__u64,		address		)
-		__field(	__u64,		data		)
-	),
-
-	TP_fast_assign(
-		__entry->address	= address;
-		__entry->data		= data;
-	),
-
-	TP_printk("dst %llx vec %u (%s|%s|%s%s)",
-		  (u8)(__entry->address >> 12) | ((__entry->address >> 32) & 0xffffff00),
-		  (u8)__entry->data,
-		  __print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode),
-		  (__entry->address & (1<<2)) ? "logical" : "physical",
-		  (__entry->data & (1<<15)) ? "level" : "edge",
-		  (__entry->address & (1<<3)) ? "|rh" : "")
-);
 
 #define kvm_irqchips						\
 	{KVM_IRQCHIP_PIC_MASTER,	"PIC master"},		\
-- 
cgit v1.2.3


From 628a27731e3f36de7ddce226f7e09ee70e40ed66 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 11 Jun 2025 14:35:53 -0700
Subject: KVM: x86: Add CONFIG_KVM_IOAPIC to allow disabling in-kernel I/O APIC

Add a Kconfig to allow building KVM without support for emulating an I/O
APIC, PIC, and PIT, which is desirable for deployments that effectively
don't support a fully in-kernel IRQ chip, i.e. never expect any VMM to
create an in-kernel I/O APIC.  E.g. compiling out support eliminates a few
thousand lines of guest-facing code and gives security folks warm fuzzies.

As a bonus, wrapping relevant paths with CONFIG_KVM_IOAPIC #ifdefs makes
it much easier for readers to understand which bits and pieces exist
specifically for fully in-kernel IRQ chips.

Opportunistically convert both in-kernel uses of __KVM_HAVE_IOAPIC to
CONFIG_KVM_IOAPIC, e.g. rather than add a second #ifdef to generate a stub
for kvm_arch_post_irq_routing_update().

Acked-by: Kai Huang <kai.huang@intel.com>
Link: https://lore.kernel.org/r/20250611213557.294358-15-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/kvm/Kconfig            | 10 ++++++++++
 arch/x86/kvm/Makefile           |  5 +++--
 arch/x86/kvm/i8254.h            |  2 ++
 arch/x86/kvm/irq.c              |  8 ++++++++
 arch/x86/kvm/irq.h              |  9 +++++++++
 arch/x86/kvm/irq_comm.c         |  2 ++
 arch/x86/kvm/lapic.c            |  7 ++++++-
 arch/x86/kvm/trace.h            |  2 ++
 arch/x86/kvm/x86.c              | 22 ++++++++++++++++++----
 include/linux/kvm_host.h        |  2 +-
 include/trace/events/kvm.h      |  4 ++--
 12 files changed, 65 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a4649a234f05..8d511eb03933 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1375,9 +1375,11 @@ struct kvm_arch {
 	atomic_t noncoherent_dma_count;
 #define __KVM_HAVE_ARCH_ASSIGNED_DEVICE
 	atomic_t assigned_device_count;
+#ifdef CONFIG_KVM_IOAPIC
 	struct kvm_pic *vpic;
 	struct kvm_ioapic *vioapic;
 	struct kvm_pit *vpit;
+#endif
 	atomic_t vapics_in_nmi_mode;
 	struct mutex apic_map_lock;
 	struct kvm_apic_map __rcu *apic_map;
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 2eeffcec5382..2c86673155c9 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -166,6 +166,16 @@ config KVM_AMD_SEV
 	  Encrypted State (SEV-ES), and Secure Encrypted Virtualization with
 	  Secure Nested Paging (SEV-SNP) technologies on AMD processors.
 
+config KVM_IOAPIC
+	bool "I/O APIC, PIC, and PIT emulation"
+	default y
+	depends on KVM
+	help
+	  Provides support for KVM to emulate an I/O APIC, PIC, and PIT, i.e.
+	  for full in-kernel APIC emulation.
+
+	  If unsure, say Y.
+
 config KVM_SMM
 	bool "System Management Mode emulation"
 	default y
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index a5d362c7b504..92c737257789 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -5,12 +5,13 @@ ccflags-$(CONFIG_KVM_WERROR) += -Werror
 
 include $(srctree)/virt/kvm/Makefile.kvm
 
-kvm-y			+= x86.o emulate.o i8259.o irq.o lapic.o \
-			   i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
+kvm-y			+= x86.o emulate.o irq.o lapic.o \
+			   irq_comm.o cpuid.o pmu.o mtrr.o \
 			   debugfs.o mmu/mmu.o mmu/page_track.o \
 			   mmu/spte.o
 
 kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o
+kvm-$(CONFIG_KVM_IOAPIC) += i8259.o i8254.o ioapic.o
 kvm-$(CONFIG_KVM_HYPERV) += hyperv.o
 kvm-$(CONFIG_KVM_XEN)	+= xen.o
 kvm-$(CONFIG_KVM_SMM)	+= smm.o
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index b9c1feb379a7..e8bd59ad8a7c 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -8,6 +8,7 @@
 
 #include <uapi/asm/kvm.h>
 
+#ifdef CONFIG_KVM_IOAPIC
 struct kvm_kpit_channel_state {
 	u32 count; /* can be 65536 */
 	u16 latched_count;
@@ -64,5 +65,6 @@ int kvm_vm_ioctl_reinject(struct kvm *kvm, struct kvm_reinject_control *control)
 
 struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags);
 void kvm_free_pit(struct kvm *kvm);
+#endif /* CONFIG_KVM_IOAPIC */
 
 #endif
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index fb3bad0f4965..4c219e9f52b0 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -76,8 +76,10 @@ int kvm_cpu_has_extint(struct kvm_vcpu *v)
 	if (!kvm_apic_accept_pic_intr(v))
 		return 0;
 
+#ifdef CONFIG_KVM_IOAPIC
 	if (pic_in_kernel(v->kvm))
 		return v->kvm->arch.vpic->output;
+#endif
 
 	WARN_ON_ONCE(!irqchip_split(v->kvm));
 	return pending_userspace_extint(v);
@@ -136,8 +138,10 @@ int kvm_cpu_get_extint(struct kvm_vcpu *v)
 		return v->kvm->arch.xen.upcall_vector;
 #endif
 
+#ifdef CONFIG_KVM_IOAPIC
 	if (pic_in_kernel(v->kvm))
 		return kvm_pic_read_irq(v->kvm); /* PIC */
+#endif
 
 	WARN_ON_ONCE(!irqchip_split(v->kvm));
 	return get_userspace_extint(v);
@@ -171,7 +175,9 @@ void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
 void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
 {
 	__kvm_migrate_apic_timer(vcpu);
+#ifdef CONFIG_KVM_IOAPIC
 	__kvm_migrate_pit_timer(vcpu);
+#endif
 	kvm_x86_call(migrate_timers)(vcpu);
 }
 
@@ -187,6 +193,7 @@ bool kvm_arch_irqchip_in_kernel(struct kvm *kvm)
 	return irqchip_in_kernel(kvm);
 }
 
+#ifdef CONFIG_KVM_IOAPIC
 #define IOAPIC_ROUTING_ENTRY(irq) \
 	{ .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP,	\
 	  .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } }
@@ -273,3 +280,4 @@ int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
 	kvm_pic_update_irq(pic);
 	return r;
 }
+#endif
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 7b8b54462f95..5e62c1f79ce6 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -18,6 +18,8 @@
 #include <kvm/iodev.h>
 #include "lapic.h"
 
+#ifdef CONFIG_KVM_IOAPIC
+
 #define PIC_NUM_PINS 16
 #define SELECT_PIC(irq) \
 	((irq) < 8 ? KVM_IRQCHIP_PIC_MASTER : KVM_IRQCHIP_PIC_SLAVE)
@@ -79,12 +81,19 @@ static inline int irqchip_full(struct kvm *kvm)
 	smp_rmb();
 	return mode == KVM_IRQCHIP_KERNEL;
 }
+#else /* CONFIG_KVM_IOAPIC */
+static __always_inline int irqchip_full(struct kvm *kvm)
+{
+	return false;
+}
+#endif
 
 static inline int pic_in_kernel(struct kvm *kvm)
 {
 	return irqchip_full(kvm);
 }
 
+
 static inline int irqchip_split(struct kvm *kvm)
 {
 	int mode = kvm->arch.irqchip_mode;
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 13d84c25e503..14fc8db0206c 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -208,6 +208,7 @@ int kvm_set_routing_entry(struct kvm *kvm,
 	 * check kvm_arch_can_set_irq_routing() before calling this function.
 	 */
 	switch (ue->type) {
+#ifdef CONFIG_KVM_IOAPIC
 	case KVM_IRQ_ROUTING_IRQCHIP:
 		if (irqchip_split(kvm))
 			return -EINVAL;
@@ -231,6 +232,7 @@ int kvm_set_routing_entry(struct kvm *kvm,
 		}
 		e->irqchip.irqchip = ue->u.irqchip.irqchip;
 		break;
+#endif
 	case KVM_IRQ_ROUTING_MSI:
 		e->set = kvm_set_msi;
 		e->msi.address_lo = ue->u.msi.address_lo;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 73418dc0ebb2..4cf8c1f753d3 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1455,7 +1455,7 @@ static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector)
 
 static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
 {
-	int trigger_mode;
+	int __maybe_unused trigger_mode;
 
 	/* Eoi the ioapic only if the ioapic doesn't own the vector. */
 	if (!kvm_ioapic_handles_vector(apic, vector))
@@ -1476,12 +1476,14 @@ static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
 		return;
 	}
 
+#ifdef CONFIG_KVM_IOAPIC
 	if (apic_test_vector(vector, apic->regs + APIC_TMR))
 		trigger_mode = IOAPIC_LEVEL_TRIG;
 	else
 		trigger_mode = IOAPIC_EDGE_TRIG;
 
 	kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode);
+#endif
 }
 
 static int apic_set_eoi(struct kvm_lapic *apic)
@@ -3146,8 +3148,11 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
 		kvm_x86_call(hwapic_isr_update)(vcpu, apic_find_highest_isr(apic));
 	}
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+#ifdef CONFIG_KVM_IOAPIC
 	if (ioapic_in_kernel(vcpu->kvm))
 		kvm_rtc_eoi_tracking_restore_one(vcpu);
+#endif
 
 	vcpu->arch.apic_arb_prio = 0;
 
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 4ef17990574d..ababdba2c186 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -270,6 +270,7 @@ TRACE_EVENT(kvm_cpuid,
 	{0x6, "SIPI"},			\
 	{0x7, "ExtINT"}
 
+#ifdef CONFIG_KVM_IOAPIC
 TRACE_EVENT(kvm_ioapic_set_irq,
 	    TP_PROTO(__u64 e, int pin, bool coalesced),
 	    TP_ARGS(e, pin, coalesced),
@@ -314,6 +315,7 @@ TRACE_EVENT(kvm_ioapic_delayed_eoi_inj,
 		  (__entry->e & (1<<15)) ? "level" : "edge",
 		  (__entry->e & (1<<16)) ? "|masked" : "")
 );
+#endif
 
 TRACE_EVENT(kvm_msi_set_irq,
 	    TP_PROTO(__u64 address, __u64 data),
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1d744730985e..78dfa6c1cb01 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4632,17 +4632,20 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_EXT_CPUID:
 	case KVM_CAP_EXT_EMUL_CPUID:
 	case KVM_CAP_CLOCKSOURCE:
+#ifdef CONFIG_KVM_IOAPIC
 	case KVM_CAP_PIT:
+	case KVM_CAP_PIT2:
+	case KVM_CAP_PIT_STATE2:
+	case KVM_CAP_REINJECT_CONTROL:
+#endif
 	case KVM_CAP_NOP_IO_DELAY:
 	case KVM_CAP_MP_STATE:
 	case KVM_CAP_SYNC_MMU:
 	case KVM_CAP_USER_NMI:
-	case KVM_CAP_REINJECT_CONTROL:
 	case KVM_CAP_IRQ_INJECT_STATUS:
 	case KVM_CAP_IOEVENTFD:
 	case KVM_CAP_IOEVENTFD_NO_LENGTH:
-	case KVM_CAP_PIT2:
-	case KVM_CAP_PIT_STATE2:
+
 	case KVM_CAP_SET_IDENTITY_MAP_ADDR:
 	case KVM_CAP_VCPU_EVENTS:
 #ifdef CONFIG_KVM_HYPERV
@@ -6937,9 +6940,11 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
 	struct kvm *kvm = filp->private_data;
 	void __user *argp = (void __user *)arg;
 	int r = -ENOTTY;
+
+#ifdef CONFIG_KVM_IOAPIC
 	/*
 	 * This union makes it completely explicit to gcc-3.x
-	 * that these two variables' stack usage should be
+	 * that these three variables' stack usage should be
 	 * combined, not added together.
 	 */
 	union {
@@ -6947,6 +6952,7 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
 		struct kvm_pit_state2 ps2;
 		struct kvm_pit_config pit_config;
 	} u;
+#endif
 
 	switch (ioctl) {
 	case KVM_SET_TSS_ADDR:
@@ -6970,6 +6976,7 @@ set_identity_unlock:
 	case KVM_SET_NR_MMU_PAGES:
 		r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
 		break;
+#ifdef CONFIG_KVM_IOAPIC
 	case KVM_CREATE_IRQCHIP: {
 		mutex_lock(&kvm->lock);
 
@@ -7136,6 +7143,7 @@ set_pit2_out:
 		r = kvm_vm_ioctl_reinject(kvm, &control);
 		break;
 	}
+#endif
 	case KVM_SET_BOOT_CPU_ID:
 		r = 0;
 		mutex_lock(&kvm->lock);
@@ -10595,8 +10603,10 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
 
 	if (irqchip_split(vcpu->kvm))
 		kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
+#ifdef CONFIG_KVM_IOAPIC
 	else if (ioapic_in_kernel(vcpu->kvm))
 		kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
+#endif
 
 	if (is_guest_mode(vcpu))
 		vcpu->arch.load_eoi_exitmap_pending = true;
@@ -12799,7 +12809,9 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm)
 	cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
 	cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
 
+#ifdef CONFIG_KVM_IOAPIC
 	kvm_free_pit(kvm);
+#endif
 
 	kvm_mmu_pre_destroy_vm(kvm);
 	static_call_cond(kvm_x86_vm_pre_destroy)(kvm);
@@ -12823,8 +12835,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	}
 	kvm_destroy_vcpus(kvm);
 	kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
+#ifdef CONFIG_KVM_IOAPIC
 	kvm_pic_destroy(kvm);
 	kvm_ioapic_destroy(kvm);
+#endif
 	kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
 	kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
 	kvm_mmu_uninit_vm(kvm);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4ff5ea29e343..3b5575d0b574 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1023,7 +1023,7 @@ void kvm_unlock_all_vcpus(struct kvm *kvm);
 void vcpu_load(struct kvm_vcpu *vcpu);
 void vcpu_put(struct kvm_vcpu *vcpu);
 
-#ifdef __KVM_HAVE_IOAPIC
+#ifdef CONFIG_KVM_IOAPIC
 void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm);
 #else
 static inline void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm)
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index 96e581900c8e..1065a81ca57f 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -84,14 +84,14 @@ TRACE_EVENT(kvm_set_irq,
 );
 #endif /* defined(CONFIG_HAVE_KVM_IRQCHIP) */
 
-#if defined(__KVM_HAVE_IOAPIC)
+#ifdef CONFIG_KVM_IOAPIC
 
 #define kvm_irqchips						\
 	{KVM_IRQCHIP_PIC_MASTER,	"PIC master"},		\
 	{KVM_IRQCHIP_PIC_SLAVE,		"PIC slave"},		\
 	{KVM_IRQCHIP_IOAPIC,		"IOAPIC"}
 
-#endif /* defined(__KVM_HAVE_IOAPIC) */
+#endif /* CONFIG_KVM_IOAPIC */
 
 #if defined(CONFIG_HAVE_KVM_IRQCHIP)
 
-- 
cgit v1.2.3


From 141db6cd79e2d71fce3049347177f98a233d8eb2 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 11 Jun 2025 14:35:54 -0700
Subject: KVM: Squash two CONFIG_HAVE_KVM_IRQCHIP #ifdefs into one

Squash two #ifdef CONFIG_HAVE_KVM_IRQCHIP regions in KVM's trace events, as
the only code outside of the #ifdefs depends on CONFIG_KVM_IOAPIC, and that
Kconfig only exists for x86, which unconditionally selects HAVE_KVM_IRQCHIP.

No functional change intended.

Acked-by: Kai Huang <kai.huang@intel.com>
Link: https://lore.kernel.org/r/20250611213557.294358-16-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 include/trace/events/kvm.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index 1065a81ca57f..0b6b79b1a1bc 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -82,7 +82,6 @@ TRACE_EVENT(kvm_set_irq,
 	TP_printk("gsi %u level %d source %d",
 		  __entry->gsi, __entry->level, __entry->irq_source_id)
 );
-#endif /* defined(CONFIG_HAVE_KVM_IRQCHIP) */
 
 #ifdef CONFIG_KVM_IOAPIC
 
@@ -93,8 +92,6 @@ TRACE_EVENT(kvm_set_irq,
 
 #endif /* CONFIG_KVM_IOAPIC */
 
-#if defined(CONFIG_HAVE_KVM_IRQCHIP)
-
 #ifdef kvm_irqchips
 #define kvm_ack_irq_string "irqchip %s pin %u"
 #define kvm_ack_irq_parm  __print_symbolic(__entry->irqchip, kvm_irqchips), __entry->pin
-- 
cgit v1.2.3


From cb210737675ef4c1ad88721e84558eeb2f199312 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 11 Jun 2025 15:45:06 -0700
Subject: KVM: Pass new routing entries and irqfd when updating IRTEs

When updating IRTEs in response to a GSI routing or IRQ bypass change,
pass the new/current routing information along with the associated irqfd.
This will allow KVM x86 to harden, simplify, and deduplicate its code.

Since adding/removing a bypass producer is now conveniently protected with
irqfds.lock, i.e. can't run concurrently with kvm_irq_routing_update(),
use the routing information cached in the irqfd instead of looking up
the information in the current GSI routing tables.

Opportunistically convert an existing printk() to pr_info() and put its
string onto a single line (old code that strictly adhered to 80 chars).

Link: https://lore.kernel.org/r/20250611224604.313496-5-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/arm64/kvm/arm.c            |  7 ++++---
 arch/x86/include/asm/kvm_host.h |  6 ++++--
 arch/x86/kvm/svm/avic.c         | 18 +++++++-----------
 arch/x86/kvm/svm/svm.h          |  5 +++--
 arch/x86/kvm/vmx/posted_intr.c  | 19 ++++++++-----------
 arch/x86/kvm/vmx/posted_intr.h  |  8 ++++++--
 arch/x86/kvm/x86.c              | 36 ++++++++++++++++++++----------------
 include/linux/kvm_host.h        |  7 +++++--
 virt/kvm/eventfd.c              | 11 +++++------
 9 files changed, 62 insertions(+), 55 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 38a91bb5d4c7..a9a39e0375f7 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -2771,8 +2771,9 @@ bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
 	return memcmp(&old->msi, &new->msi, sizeof(new->msi));
 }
 
-int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
-				  uint32_t guest_irq, bool set)
+int kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
+				  struct kvm_kernel_irq_routing_entry *old,
+				  struct kvm_kernel_irq_routing_entry *new)
 {
 	/*
 	 * Remapping the vLPI requires taking the its_lock mutex to resolve
@@ -2781,7 +2782,7 @@ int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
 	 *
 	 * Unmap the vLPI and fall back to software LPI injection.
 	 */
-	return kvm_vgic_v4_unset_forwarding(kvm, host_irq);
+	return kvm_vgic_v4_unset_forwarding(irqfd->kvm, irqfd->producer->irq);
 }
 
 void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 44fd9ccc0624..2a14564dd08a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -297,6 +297,7 @@ enum x86_intercept_stage;
  */
 #define KVM_APIC_PV_EOI_PENDING	1
 
+struct kvm_kernel_irqfd;
 struct kvm_kernel_irq_routing_entry;
 
 /*
@@ -1845,8 +1846,9 @@ struct kvm_x86_ops {
 	void (*vcpu_blocking)(struct kvm_vcpu *vcpu);
 	void (*vcpu_unblocking)(struct kvm_vcpu *vcpu);
 
-	int (*pi_update_irte)(struct kvm *kvm, unsigned int host_irq,
-			      uint32_t guest_irq, bool set);
+	int (*pi_update_irte)(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+			      unsigned int host_irq, uint32_t guest_irq,
+			      struct kvm_kernel_irq_routing_entry *new);
 	void (*pi_start_assignment)(struct kvm *kvm);
 	void (*apicv_pre_state_restore)(struct kvm_vcpu *vcpu);
 	void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 067f8e3f5a0d..49b73907de92 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -18,6 +18,7 @@
 #include <linux/hashtable.h>
 #include <linux/amd-iommu.h>
 #include <linux/kvm_host.h>
+#include <linux/kvm_irqfd.h>
 
 #include <asm/irq_remapping.h>
 #include <asm/msr.h>
@@ -886,21 +887,14 @@ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
 	return 0;
 }
 
-/*
- * avic_pi_update_irte - set IRTE for Posted-Interrupts
- *
- * @kvm: kvm
- * @host_irq: host irq of the interrupt
- * @guest_irq: gsi of the interrupt
- * @set: set or unset PI
- * returns 0 on success, < 0 on failure
- */
-int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
-			uint32_t guest_irq, bool set)
+int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+			unsigned int host_irq, uint32_t guest_irq,
+			struct kvm_kernel_irq_routing_entry *new)
 {
 	struct kvm_kernel_irq_routing_entry *e;
 	struct kvm_irq_routing_table *irq_rt;
 	bool enable_remapped_mode = true;
+	bool set = !!new;
 	int idx, ret = 0;
 
 	if (!kvm_arch_has_assigned_device(kvm) || !kvm_arch_has_irq_bypass())
@@ -926,6 +920,8 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
 		if (e->type != KVM_IRQ_ROUTING_MSI)
 			continue;
 
+		WARN_ON_ONCE(new && memcmp(e, new, sizeof(*new)));
+
 		/**
 		 * Here, we setup with legacy mode in the following cases:
 		 * 1. When cannot target interrupt to a specific vcpu.
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index e6f3c6a153a0..b35fce30d923 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -736,8 +736,9 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
 void avic_vcpu_put(struct kvm_vcpu *vcpu);
 void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu);
 void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
-int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
-			uint32_t guest_irq, bool set);
+int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+			unsigned int host_irq, uint32_t guest_irq,
+			struct kvm_kernel_irq_routing_entry *new);
 void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
 void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
 void avic_ring_doorbell(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 5c615e5845bf..110fb19848ab 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -2,6 +2,7 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/kvm_host.h>
+#include <linux/kvm_irqfd.h>
 
 #include <asm/irq_remapping.h>
 #include <asm/cpu.h>
@@ -294,17 +295,9 @@ void vmx_pi_start_assignment(struct kvm *kvm)
 	kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK);
 }
 
-/*
- * vmx_pi_update_irte - set IRTE for Posted-Interrupts
- *
- * @kvm: kvm
- * @host_irq: host irq of the interrupt
- * @guest_irq: gsi of the interrupt
- * @set: set or unset PI
- * returns 0 on success, < 0 on failure
- */
-int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
-		       uint32_t guest_irq, bool set)
+int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+		       unsigned int host_irq, uint32_t guest_irq,
+		       struct kvm_kernel_irq_routing_entry *new)
 {
 	struct kvm_kernel_irq_routing_entry *e;
 	struct kvm_irq_routing_table *irq_rt;
@@ -312,6 +305,7 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
 	struct kvm_lapic_irq irq;
 	struct kvm_vcpu *vcpu;
 	struct vcpu_data vcpu_info;
+	bool set = !!new;
 	int idx, ret = 0;
 
 	if (!vmx_can_use_vtd_pi(kvm))
@@ -329,6 +323,9 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
 	hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
 		if (e->type != KVM_IRQ_ROUTING_MSI)
 			continue;
+
+		WARN_ON_ONCE(new && memcmp(e, new, sizeof(*new)));
+
 		/*
 		 * VT-d PI cannot support posting multicast/broadcast
 		 * interrupts to a vCPU, we still use interrupt remapping
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
index 80499ea0e674..a94afcb55f7f 100644
--- a/arch/x86/kvm/vmx/posted_intr.h
+++ b/arch/x86/kvm/vmx/posted_intr.h
@@ -3,6 +3,9 @@
 #define __KVM_X86_VMX_POSTED_INTR_H
 
 #include <linux/bitmap.h>
+#include <linux/find.h>
+#include <linux/kvm_host.h>
+
 #include <asm/posted_intr.h>
 
 void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
@@ -11,8 +14,9 @@ void pi_wakeup_handler(void);
 void __init pi_init_cpu(int cpu);
 void pi_apicv_pre_state_restore(struct kvm_vcpu *vcpu);
 bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
-int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
-		       uint32_t guest_irq, bool set);
+int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
+		       unsigned int host_irq, uint32_t guest_irq,
+		       struct kvm_kernel_irq_routing_entry *new);
 void vmx_pi_start_assignment(struct kvm *kvm);
 
 static inline int pi_find_highest_vector(struct pi_desc *pi_desc)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 02237dbb7f32..e1304b002062 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -13507,31 +13507,31 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
 	struct kvm_kernel_irqfd *irqfd =
 		container_of(cons, struct kvm_kernel_irqfd, consumer);
 	struct kvm *kvm = irqfd->kvm;
-	int ret;
+	int ret = 0;
 
 	kvm_arch_start_assignment(irqfd->kvm);
 
 	spin_lock_irq(&kvm->irqfds.lock);
 	irqfd->producer = prod;
 
-	ret = kvm_x86_call(pi_update_irte)(irqfd->kvm,
-					   prod->irq, irqfd->gsi, 1);
-	if (ret)
-		kvm_arch_end_assignment(irqfd->kvm);
-
+	if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) {
+		ret = kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, prod->irq,
+						   irqfd->gsi, &irqfd->irq_entry);
+		if (ret)
+			kvm_arch_end_assignment(irqfd->kvm);
+	}
 	spin_unlock_irq(&kvm->irqfds.lock);
 
-
 	return ret;
 }
 
 void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
 				      struct irq_bypass_producer *prod)
 {
-	int ret;
 	struct kvm_kernel_irqfd *irqfd =
 		container_of(cons, struct kvm_kernel_irqfd, consumer);
 	struct kvm *kvm = irqfd->kvm;
+	int ret;
 
 	WARN_ON(irqfd->producer != prod);
 
@@ -13544,11 +13544,13 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
 	spin_lock_irq(&kvm->irqfds.lock);
 	irqfd->producer = NULL;
 
-	ret = kvm_x86_call(pi_update_irte)(irqfd->kvm,
-					   prod->irq, irqfd->gsi, 0);
-	if (ret)
-		printk(KERN_INFO "irq bypass consumer (eventfd %p) unregistration"
-		       " fails: %d\n", irqfd->consumer.eventfd, ret);
+	if (irqfd->irq_entry.type == KVM_IRQ_ROUTING_MSI) {
+		ret = kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, prod->irq,
+						   irqfd->gsi, NULL);
+		if (ret)
+			pr_info("irq bypass consumer (eventfd %p) unregistration fails: %d\n",
+				irqfd->consumer.eventfd, ret);
+	}
 
 	spin_unlock_irq(&kvm->irqfds.lock);
 
@@ -13556,10 +13558,12 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
 	kvm_arch_end_assignment(irqfd->kvm);
 }
 
-int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
-				   uint32_t guest_irq, bool set)
+int kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
+				  struct kvm_kernel_irq_routing_entry *old,
+				  struct kvm_kernel_irq_routing_entry *new)
 {
-	return kvm_x86_call(pi_update_irte)(kvm, host_irq, guest_irq, set);
+	return kvm_x86_call(pi_update_irte)(irqfd, irqfd->kvm, irqfd->producer->irq,
+					    irqfd->gsi, new);
 }
 
 bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3b5575d0b574..a4160c1c0c6b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2401,6 +2401,8 @@ struct kvm_vcpu *kvm_get_running_vcpu(void);
 struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void);
 
 #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
+struct kvm_kernel_irqfd;
+
 bool kvm_arch_has_irq_bypass(void);
 int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *,
 			   struct irq_bypass_producer *);
@@ -2408,8 +2410,9 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *,
 			   struct irq_bypass_producer *);
 void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *);
 void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *);
-int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
-				  uint32_t guest_irq, bool set);
+int kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
+				  struct kvm_kernel_irq_routing_entry *old,
+				  struct kvm_kernel_irq_routing_entry *new);
 bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *,
 				  struct kvm_kernel_irq_routing_entry *);
 #endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 5bc6abe30748..bd1766da6895 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -285,9 +285,9 @@ void __attribute__((weak)) kvm_arch_irq_bypass_start(
 {
 }
 
-int  __attribute__((weak)) kvm_arch_update_irqfd_routing(
-				struct kvm *kvm, unsigned int host_irq,
-				uint32_t guest_irq, bool set)
+int __weak kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
+					 struct kvm_kernel_irq_routing_entry *old,
+					 struct kvm_kernel_irq_routing_entry *new)
 {
 	return 0;
 }
@@ -618,9 +618,8 @@ void kvm_irq_routing_update(struct kvm *kvm)
 #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
 		if (irqfd->producer &&
 		    kvm_arch_irqfd_route_changed(&old, &irqfd->irq_entry)) {
-			int ret = kvm_arch_update_irqfd_routing(
-					irqfd->kvm, irqfd->producer->irq,
-					irqfd->gsi, 1);
+			int ret = kvm_arch_update_irqfd_routing(irqfd, &old, &irqfd->irq_entry);
+
 			WARN_ON(ret);
 		}
 #endif
-- 
cgit v1.2.3


From 05c5e23657e1d61c271c2f4a3a21d4d630b18a9b Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 11 Jun 2025 15:45:07 -0700
Subject: KVM: SVM: Track per-vCPU IRTEs using kvm_kernel_irqfd structure

Track the IRTEs that are posting to an SVM vCPU via the associated irqfd
structure and GSI routing instead of dynamically allocating a separate
data structure.  In addition to eliminating an atomic allocation, this
will allow hoisting much of the IRTE update logic to common x86.

Cc: Sairaj Kodilkar <sarunkod@amd.com>
Link: https://lore.kernel.org/r/20250611224604.313496-6-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/svm/avic.c   | 71 ++++++++++++++++++-----------------------------
 arch/x86/kvm/svm/svm.h    | 10 ++++---
 include/linux/kvm_irqfd.h |  3 ++
 3 files changed, 36 insertions(+), 48 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 49b73907de92..accc36958a75 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -76,14 +76,6 @@ static bool next_vm_id_wrapped = 0;
 static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
 bool x2avic_enabled;
 
-/*
- * This is a wrapper of struct amd_iommu_ir_data.
- */
-struct amd_svm_iommu_ir {
-	struct list_head node;	/* Used by SVM for per-vcpu ir_list */
-	void *data;		/* Storing pointer to struct amd_ir_data */
-};
-
 static void avic_activate_vmcb(struct vcpu_svm *svm)
 {
 	struct vmcb *vmcb = svm->vmcb01.ptr;
@@ -747,8 +739,8 @@ static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
 {
 	int ret = 0;
 	unsigned long flags;
-	struct amd_svm_iommu_ir *ir;
 	struct vcpu_svm *svm = to_svm(vcpu);
+	struct kvm_kernel_irqfd *irqfd;
 
 	if (!kvm_arch_has_assigned_device(vcpu->kvm))
 		return 0;
@@ -762,11 +754,11 @@ static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
 	if (list_empty(&svm->ir_list))
 		goto out;
 
-	list_for_each_entry(ir, &svm->ir_list, node) {
+	list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) {
 		if (activate)
-			ret = amd_iommu_activate_guest_mode(ir->data);
+			ret = amd_iommu_activate_guest_mode(irqfd->irq_bypass_data);
 		else
-			ret = amd_iommu_deactivate_guest_mode(ir->data);
+			ret = amd_iommu_deactivate_guest_mode(irqfd->irq_bypass_data);
 		if (ret)
 			break;
 	}
@@ -775,27 +767,30 @@ out:
 	return ret;
 }
 
-static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+static void svm_ir_list_del(struct vcpu_svm *svm,
+			    struct kvm_kernel_irqfd *irqfd,
+			    struct amd_iommu_pi_data *pi)
 {
 	unsigned long flags;
-	struct amd_svm_iommu_ir *cur;
+	struct kvm_kernel_irqfd *cur;
 
 	spin_lock_irqsave(&svm->ir_list_lock, flags);
-	list_for_each_entry(cur, &svm->ir_list, node) {
-		if (cur->data != pi->ir_data)
+	list_for_each_entry(cur, &svm->ir_list, vcpu_list) {
+		if (cur->irq_bypass_data != pi->ir_data)
+			continue;
+		if (WARN_ON_ONCE(cur != irqfd))
 			continue;
-		list_del(&cur->node);
-		kfree(cur);
+		list_del(&irqfd->vcpu_list);
 		break;
 	}
 	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
 }
 
-static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+static int svm_ir_list_add(struct vcpu_svm *svm,
+			   struct kvm_kernel_irqfd *irqfd,
+			   struct amd_iommu_pi_data *pi)
 {
-	int ret = 0;
 	unsigned long flags;
-	struct amd_svm_iommu_ir *ir;
 	u64 entry;
 
 	if (WARN_ON_ONCE(!pi->ir_data))
@@ -812,25 +807,14 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
 		struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
 		struct vcpu_svm *prev_svm;
 
-		if (!prev_vcpu) {
-			ret = -EINVAL;
-			goto out;
-		}
+		if (!prev_vcpu)
+			return -EINVAL;
 
 		prev_svm = to_svm(prev_vcpu);
-		svm_ir_list_del(prev_svm, pi);
+		svm_ir_list_del(prev_svm, irqfd, pi);
 	}
 
-	/**
-	 * Allocating new amd_iommu_pi_data, which will get
-	 * add to the per-vcpu ir_list.
-	 */
-	ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_ATOMIC | __GFP_ACCOUNT);
-	if (!ir) {
-		ret = -ENOMEM;
-		goto out;
-	}
-	ir->data = pi->ir_data;
+	irqfd->irq_bypass_data = pi->ir_data;
 
 	spin_lock_irqsave(&svm->ir_list_lock, flags);
 
@@ -845,10 +829,9 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
 		amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK,
 				    true, pi->ir_data);
 
-	list_add(&ir->node, &svm->ir_list);
+	list_add(&irqfd->vcpu_list, &svm->ir_list);
 	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
-out:
-	return ret;
+	return 0;
 }
 
 /*
@@ -952,7 +935,7 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
 			 * scheduling information in IOMMU irte.
 			 */
 			if (!ret && pi.is_guest_mode)
-				svm_ir_list_add(svm, &pi);
+				svm_ir_list_add(svm, irqfd, &pi);
 		}
 
 		if (!ret && svm) {
@@ -993,7 +976,7 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
 
 			vcpu = kvm_get_vcpu_by_id(kvm, id);
 			if (vcpu)
-				svm_ir_list_del(to_svm(vcpu), &pi);
+				svm_ir_list_del(to_svm(vcpu), irqfd, &pi);
 		}
 	}
 out:
@@ -1005,8 +988,8 @@ static inline int
 avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
 {
 	int ret = 0;
-	struct amd_svm_iommu_ir *ir;
 	struct vcpu_svm *svm = to_svm(vcpu);
+	struct kvm_kernel_irqfd *irqfd;
 
 	lockdep_assert_held(&svm->ir_list_lock);
 
@@ -1020,8 +1003,8 @@ avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
 	if (list_empty(&svm->ir_list))
 		return 0;
 
-	list_for_each_entry(ir, &svm->ir_list, node) {
-		ret = amd_iommu_update_ga(cpu, r, ir->data);
+	list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) {
+		ret = amd_iommu_update_ga(cpu, r, irqfd->irq_bypass_data);
 		if (ret)
 			return ret;
 	}
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index b35fce30d923..cc27877d69ae 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -310,10 +310,12 @@ struct vcpu_svm {
 	u64 *avic_physical_id_cache;
 
 	/*
-	 * Per-vcpu list of struct amd_svm_iommu_ir:
-	 * This is used mainly to store interrupt remapping information used
-	 * when update the vcpu affinity. This avoids the need to scan for
-	 * IRTE and try to match ga_tag in the IOMMU driver.
+	 * Per-vCPU list of irqfds that are eligible to post IRQs directly to
+	 * the vCPU (a.k.a. device posted IRQs, a.k.a. IRQ bypass).  The list
+	 * is used to reconfigure IRTEs when the vCPU is loaded/put (to set the
+	 * target pCPU), when AVIC is toggled on/off (to (de)activate bypass),
+	 * and if the irqfd becomes ineligible for posting (to put the IRTE
+	 * back into remapped mode).
 	 */
 	struct list_head ir_list;
 	spinlock_t ir_list_lock;
diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h
index 8ad43692e3bb..6510a48e62aa 100644
--- a/include/linux/kvm_irqfd.h
+++ b/include/linux/kvm_irqfd.h
@@ -59,6 +59,9 @@ struct kvm_kernel_irqfd {
 	struct work_struct shutdown;
 	struct irq_bypass_consumer consumer;
 	struct irq_bypass_producer *producer;
+
+	struct list_head vcpu_list;
+	void *irq_bypass_data;
 };
 
 #endif /* __LINUX_KVM_IRQFD_H */
-- 
cgit v1.2.3


From 0a917e9d4b7070fafcbf7a8ec32d2aa444b4e757 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 11 Jun 2025 15:45:08 -0700
Subject: KVM: SVM: Delete IRTE link from previous vCPU before setting new IRTE

Delete the previous per-vCPU IRTE link prior to modifying the IRTE.  If
forcing the IRTE back to remapped mode fails, the IRQ is already broken;
keeping stale metadata won't change that, and the IOMMU should be
sufficiently paranoid to sanitize the IRTE when the IRQ is freed and
reallocated.

This will allow hoisting the vCPU tracking to common x86, which in turn
will allow most of the IRTE update code to be deduplicated.

Tested-by: Sairaj Kodilkar <sarunkod@amd.com>
Link: https://lore.kernel.org/r/20250611224604.313496-7-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/svm/avic.c   | 60 ++++++++++-------------------------------------
 include/linux/kvm_irqfd.h |  1 +
 2 files changed, 14 insertions(+), 47 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index accc36958a75..7f7af8ff627d 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -767,23 +767,19 @@ out:
 	return ret;
 }
 
-static void svm_ir_list_del(struct vcpu_svm *svm,
-			    struct kvm_kernel_irqfd *irqfd,
-			    struct amd_iommu_pi_data *pi)
+static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd)
 {
+	struct kvm_vcpu *vcpu = irqfd->irq_bypass_vcpu;
 	unsigned long flags;
-	struct kvm_kernel_irqfd *cur;
 
-	spin_lock_irqsave(&svm->ir_list_lock, flags);
-	list_for_each_entry(cur, &svm->ir_list, vcpu_list) {
-		if (cur->irq_bypass_data != pi->ir_data)
-			continue;
-		if (WARN_ON_ONCE(cur != irqfd))
-			continue;
-		list_del(&irqfd->vcpu_list);
-		break;
-	}
-	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+	if (!vcpu)
+		return;
+
+	spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags);
+	list_del(&irqfd->vcpu_list);
+	spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags);
+
+	irqfd->irq_bypass_vcpu = NULL;
 }
 
 static int svm_ir_list_add(struct vcpu_svm *svm,
@@ -796,24 +792,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm,
 	if (WARN_ON_ONCE(!pi->ir_data))
 		return -EINVAL;
 
-	/**
-	 * In some cases, the existing irte is updated and re-set,
-	 * so we need to check here if it's already been * added
-	 * to the ir_list.
-	 */
-	if (pi->prev_ga_tag) {
-		struct kvm *kvm = svm->vcpu.kvm;
-		u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
-		struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
-		struct vcpu_svm *prev_svm;
-
-		if (!prev_vcpu)
-			return -EINVAL;
-
-		prev_svm = to_svm(prev_vcpu);
-		svm_ir_list_del(prev_svm, irqfd, pi);
-	}
-
+	irqfd->irq_bypass_vcpu = &svm->vcpu;
 	irqfd->irq_bypass_data = pi->ir_data;
 
 	spin_lock_irqsave(&svm->ir_list_lock, flags);
@@ -905,6 +884,8 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
 
 		WARN_ON_ONCE(new && memcmp(e, new, sizeof(*new)));
 
+		svm_ir_list_del(irqfd);
+
 		/**
 		 * Here, we setup with legacy mode in the following cases:
 		 * 1. When cannot target interrupt to a specific vcpu.
@@ -963,21 +944,6 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
 		pi.prev_ga_tag = 0;
 		pi.is_guest_mode = false;
 		ret = irq_set_vcpu_affinity(host_irq, &pi);
-
-		/**
-		 * Check if the posted interrupt was previously
-		 * setup with the guest_mode by checking if the ga_tag
-		 * was cached. If so, we need to clean up the per-vcpu
-		 * ir_list.
-		 */
-		if (!ret && pi.prev_ga_tag) {
-			int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
-			struct kvm_vcpu *vcpu;
-
-			vcpu = kvm_get_vcpu_by_id(kvm, id);
-			if (vcpu)
-				svm_ir_list_del(to_svm(vcpu), irqfd, &pi);
-		}
 	}
 out:
 	srcu_read_unlock(&kvm->irq_srcu, idx);
diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h
index 6510a48e62aa..361c07f4466d 100644
--- a/include/linux/kvm_irqfd.h
+++ b/include/linux/kvm_irqfd.h
@@ -60,6 +60,7 @@ struct kvm_kernel_irqfd {
 	struct irq_bypass_consumer consumer;
 	struct irq_bypass_producer *producer;
 
+	struct kvm_vcpu *irq_bypass_vcpu;
 	struct list_head vcpu_list;
 	void *irq_bypass_data;
 };
-- 
cgit v1.2.3


From 1da19c5ce0533796179d9e1b55e64bf78478c4c1 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 11 Jun 2025 15:45:09 -0700
Subject: iommu/amd: KVM: SVM: Delete now-unused cached/previous GA tag fields

Delete the amd_ir_data.cached_ga_tag and amd_iommu_pi_data.prev_ga_tag
fields now that all usage is superfluous.

Reviewed-by: Vasant Hegde <vasant.hegde@amd.com>
Tested-by: Sairaj Kodilkar <sarunkod@amd.com>
Link: https://lore.kernel.org/r/20250611224604.313496-8-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/svm/avic.c             |  2 --
 drivers/iommu/amd/amd_iommu_types.h |  1 -
 drivers/iommu/amd/iommu.c           | 10 ----------
 include/linux/amd-iommu.h           |  2 +-
 4 files changed, 1 insertion(+), 14 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 7f7af8ff627d..38cdfb052a3a 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -939,9 +939,7 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
 		/**
 		 * Here, pi is used to:
 		 * - Tell IOMMU to use legacy mode for this interrupt.
-		 * - Retrieve ga_tag of prior interrupt remapping data.
 		 */
-		pi.prev_ga_tag = 0;
 		pi.is_guest_mode = false;
 		ret = irq_set_vcpu_affinity(host_irq, &pi);
 	}
diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index ccbab3a4811a..053a0f05768c 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -1054,7 +1054,6 @@ struct irq_2_irte {
 };
 
 struct amd_ir_data {
-	u32 cached_ga_tag;
 	struct amd_iommu *iommu;
 	struct irq_2_irte irq_2_irte;
 	struct msi_msg msi_entry;
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 3117d99cf83d..ed96050d4933 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -3887,23 +3887,13 @@ static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
 	ir_data->cfg = irqd_cfg(data);
 	pi_data->ir_data = ir_data;
 
-	pi_data->prev_ga_tag = ir_data->cached_ga_tag;
 	if (pi_data->is_guest_mode) {
 		ir_data->ga_root_ptr = (pi_data->base >> 12);
 		ir_data->ga_vector = vcpu_pi_info->vector;
 		ir_data->ga_tag = pi_data->ga_tag;
 		ret = amd_iommu_activate_guest_mode(ir_data);
-		if (!ret)
-			ir_data->cached_ga_tag = pi_data->ga_tag;
 	} else {
 		ret = amd_iommu_deactivate_guest_mode(ir_data);
-
-		/*
-		 * This communicates the ga_tag back to the caller
-		 * so that it can do all the necessary clean up.
-		 */
-		if (!ret)
-			ir_data->cached_ga_tag = 0;
 	}
 
 	return ret;
diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h
index 062fbd4c9b77..1f9b13d803c5 100644
--- a/include/linux/amd-iommu.h
+++ b/include/linux/amd-iommu.h
@@ -19,8 +19,8 @@ struct amd_iommu;
  */
 struct amd_iommu_pi_data {
 	u32 ga_tag;
-	u32 prev_ga_tag;
 	u64 base;
+
 	bool is_guest_mode;
 	struct vcpu_data *vcpu_data;
 	void *ir_data;
-- 
cgit v1.2.3


From 741e595f02fe4386914b7ef656a06f7209480ca9 Mon Sep 17 00:00:00 2001
From: Liam Merwick <liam.merwick@oracle.com>
Date: Mon, 9 Jun 2025 09:11:20 +0000
Subject: KVM: Add trace_kvm_vm_set_mem_attributes()

Add a tracing function that, for a guest memory range, displays
the start and end addresses plus the per-page attributes being set.

Signed-off-by: Liam Merwick <liam.merwick@oracle.com>
Reviewed-by: Pankaj Gupta <pankaj.gupta@amd.com>
Link: https://lore.kernel.org/r/20250609091121.2497429-3-liam.merwick@oracle.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 include/trace/events/kvm.h | 27 +++++++++++++++++++++++++++
 virt/kvm/kvm_main.c        |  2 ++
 2 files changed, 29 insertions(+)

(limited to 'include')

diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index fc7d0f8ff078..40b20d682d47 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -473,6 +473,33 @@ TRACE_EVENT(kvm_dirty_ring_exit,
 	TP_printk("vcpu %d", __entry->vcpu_id)
 );
 
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+/*
+ * @start:	Starting address of guest memory range
+ * @end:	End address of guest memory range
+ * @attr:	The value of the attribute being set.
+ */
+TRACE_EVENT(kvm_vm_set_mem_attributes,
+	TP_PROTO(gfn_t start, gfn_t end, unsigned long attr),
+	TP_ARGS(start, end, attr),
+
+	TP_STRUCT__entry(
+		__field(gfn_t,		start)
+		__field(gfn_t,		end)
+		__field(unsigned long,	attr)
+	),
+
+	TP_fast_assign(
+		__entry->start		= start;
+		__entry->end		= end;
+		__entry->attr		= attr;
+	),
+
+	TP_printk("%#016llx -- %#016llx [0x%lx]",
+		  __entry->start, __entry->end, __entry->attr)
+);
+#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
+
 TRACE_EVENT(kvm_unmap_hva_range,
 	TP_PROTO(unsigned long start, unsigned long end),
 	TP_ARGS(start, end),
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index eec82775c5bf..af69b1e24f40 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2558,6 +2558,8 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
 
 	entry = attributes ? xa_mk_value(attributes) : NULL;
 
+	trace_kvm_vm_set_mem_attributes(start, end, attributes);
+
 	mutex_lock(&kvm->slots_lock);
 
 	/* Nothing to do if the entire range as the desired attributes. */
-- 
cgit v1.2.3


From 6e6558a6bc418f1478c5dc8609d03805364e0cb9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 13 Jun 2025 15:33:10 -1000
Subject: sched_ext, sched/core: Factor out struct scx_task_group

More sched_ext fields will be added to struct task_group. In preparation,
factor out sched_ext fields into struct scx_task_group to reduce clutter in
the common header. No functional changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/sched/ext.h |  8 ++++++++
 kernel/sched/ext.c        | 32 ++++++++++++++++----------------
 kernel/sched/sched.h      |  5 +----
 3 files changed, 25 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index f7545430a548..eda89acdb7ab 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -214,4 +214,12 @@ static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {}
 static inline void scx_softlockup(u32 dur_s) {}
 
 #endif	/* CONFIG_SCHED_CLASS_EXT */
+
+struct scx_task_group {
+#ifdef CONFIG_EXT_GROUP_SCHED
+	u32			flags;		/* SCX_TG_* */
+	u32			weight;
+#endif
+};
+
 #endif	/* _LINUX_SCHED_EXT_H */
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 4db51e708f86..6732e50e0679 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4058,7 +4058,7 @@ static bool scx_cgroup_enabled;
 
 void scx_tg_init(struct task_group *tg)
 {
-	tg->scx_weight = CGROUP_WEIGHT_DFL;
+	tg->scx.weight = CGROUP_WEIGHT_DFL;
 }
 
 int scx_tg_online(struct task_group *tg)
@@ -4066,14 +4066,14 @@ int scx_tg_online(struct task_group *tg)
 	struct scx_sched *sch = scx_root;
 	int ret = 0;
 
-	WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED));
+	WARN_ON_ONCE(tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED));
 
 	percpu_down_read(&scx_cgroup_rwsem);
 
 	if (scx_cgroup_enabled) {
 		if (SCX_HAS_OP(sch, cgroup_init)) {
 			struct scx_cgroup_init_args args =
-				{ .weight = tg->scx_weight };
+				{ .weight = tg->scx.weight };
 
 			ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init,
 					      NULL, tg->css.cgroup, &args);
@@ -4081,9 +4081,9 @@ int scx_tg_online(struct task_group *tg)
 				ret = ops_sanitize_err(sch, "cgroup_init", ret);
 		}
 		if (ret == 0)
-			tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED;
+			tg->scx.flags |= SCX_TG_ONLINE | SCX_TG_INITED;
 	} else {
-		tg->scx_flags |= SCX_TG_ONLINE;
+		tg->scx.flags |= SCX_TG_ONLINE;
 	}
 
 	percpu_up_read(&scx_cgroup_rwsem);
@@ -4094,15 +4094,15 @@ void scx_tg_offline(struct task_group *tg)
 {
 	struct scx_sched *sch = scx_root;
 
-	WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE));
+	WARN_ON_ONCE(!(tg->scx.flags & SCX_TG_ONLINE));
 
 	percpu_down_read(&scx_cgroup_rwsem);
 
 	if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) &&
-	    (tg->scx_flags & SCX_TG_INITED))
+	    (tg->scx.flags & SCX_TG_INITED))
 		SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL,
 			    tg->css.cgroup);
-	tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
+	tg->scx.flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
 
 	percpu_up_read(&scx_cgroup_rwsem);
 }
@@ -4211,11 +4211,11 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight)
 	percpu_down_read(&scx_cgroup_rwsem);
 
 	if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) &&
-	    tg->scx_weight != weight)
+	    tg->scx.weight != weight)
 		SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_weight, NULL,
 			    tg_cgrp(tg), weight);
 
-	tg->scx_weight = weight;
+	tg->scx.weight = weight;
 
 	percpu_up_read(&scx_cgroup_rwsem);
 }
@@ -4366,9 +4366,9 @@ static void scx_cgroup_exit(struct scx_sched *sch)
 	css_for_each_descendant_post(css, &root_task_group.css) {
 		struct task_group *tg = css_tg(css);
 
-		if (!(tg->scx_flags & SCX_TG_INITED))
+		if (!(tg->scx.flags & SCX_TG_INITED))
 			continue;
-		tg->scx_flags &= ~SCX_TG_INITED;
+		tg->scx.flags &= ~SCX_TG_INITED;
 
 		if (!sch->ops.cgroup_exit)
 			continue;
@@ -4400,14 +4400,14 @@ static int scx_cgroup_init(struct scx_sched *sch)
 	rcu_read_lock();
 	css_for_each_descendant_pre(css, &root_task_group.css) {
 		struct task_group *tg = css_tg(css);
-		struct scx_cgroup_init_args args = { .weight = tg->scx_weight };
+		struct scx_cgroup_init_args args = { .weight = tg->scx.weight };
 
-		if ((tg->scx_flags &
+		if ((tg->scx.flags &
 		     (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE)
 			continue;
 
 		if (!sch->ops.cgroup_init) {
-			tg->scx_flags |= SCX_TG_INITED;
+			tg->scx.flags |= SCX_TG_INITED;
 			continue;
 		}
 
@@ -4422,7 +4422,7 @@ static int scx_cgroup_init(struct scx_sched *sch)
 			scx_error(sch, "ops.cgroup_init() failed (%d)", ret);
 			return ret;
 		}
-		tg->scx_flags |= SCX_TG_INITED;
+		tg->scx.flags |= SCX_TG_INITED;
 
 		rcu_read_lock();
 		css_put(css);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 105190b18020..fdf5f52b54a3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -471,10 +471,7 @@ struct task_group {
 	struct rt_bandwidth	rt_bandwidth;
 #endif
 
-#ifdef CONFIG_EXT_GROUP_SCHED
-	u32			scx_flags;	/* SCX_TG_* */
-	u32			scx_weight;
-#endif
+	struct scx_task_group	scx;
 
 	struct rcu_head		rcu;
 	struct list_head	list;
-- 
cgit v1.2.3


From ddceadce63d9cb752c2472e220ded05cabaf7971 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 13 Jun 2025 15:34:22 -1000
Subject: sched_ext: Add support for cgroup bandwidth control interface

From 077814f57f8acce13f91dc34bbd2b7e4911fbf25 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 13 Jun 2025 15:06:47 -1000

- Add CONFIG_GROUP_SCHED_BANDWIDTH which is selected by both
  CONFIG_CFS_BANDWIDTH and CONFIG_EXT_GROUP_SCHED.

- Put bandwidth control interface files for both cgroup v1 and v2 under
  CONFIG_GROUP_SCHED_BANDWIDTH.

- Update tg_bandwidth() to fetch configuration parameters from fair if
  CONFIG_CFS_BANDWIDTH is enabled, and from SCX otherwise.

- Update tg_set_bandwidth() to update the parameters for both fair and SCX.

- Add bandwidth control parameters to struct scx_cgroup_init_args.

- Add sched_ext_ops.cgroup_set_bandwidth() which is invoked on bandwidth
  control parameter updates.

- Update scx_qmap and maximal selftest to test the new feature.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/sched/ext.h                       |  3 ++
 init/Kconfig                                    |  5 ++
 kernel/sched/core.c                             | 29 +++++++++--
 kernel/sched/ext.c                              | 66 +++++++++++++++++++++++--
 kernel/sched/ext.h                              |  2 +
 kernel/sched/sched.h                            |  4 +-
 tools/sched_ext/scx_qmap.bpf.c                  | 23 +++++++++
 tools/testing/selftests/sched_ext/maximal.bpf.c |  5 ++
 8 files changed, 127 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index eda89acdb7ab..8b92842776cb 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -219,6 +219,9 @@ struct scx_task_group {
 #ifdef CONFIG_EXT_GROUP_SCHED
 	u32			flags;		/* SCX_TG_* */
 	u32			weight;
+	u64			bw_period_us;
+	u64			bw_quota_us;
+	u64			bw_burst_us;
 #endif
 };
 
diff --git a/init/Kconfig b/init/Kconfig
index af4c2f085455..baf59d2a20a2 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1065,6 +1065,9 @@ if CGROUP_SCHED
 config GROUP_SCHED_WEIGHT
 	def_bool n
 
+config GROUP_SCHED_BANDWIDTH
+        def_bool n
+
 config FAIR_GROUP_SCHED
 	bool "Group scheduling for SCHED_OTHER"
 	depends on CGROUP_SCHED
@@ -1074,6 +1077,7 @@ config FAIR_GROUP_SCHED
 config CFS_BANDWIDTH
 	bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED"
 	depends on FAIR_GROUP_SCHED
+	select GROUP_SCHED_BANDWIDTH
 	default n
 	help
 	  This option allows users to define CPU bandwidth rates (limits) for
@@ -1108,6 +1112,7 @@ config EXT_GROUP_SCHED
 	bool
 	depends on SCHED_CLASS_EXT && CGROUP_SCHED
 	select GROUP_SCHED_WEIGHT
+	select GROUP_SCHED_BANDWIDTH
 	default y
 
 endif #CGROUP_SCHED
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0e3a00e2a2cc..91845d00a1cd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9545,7 +9545,9 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v)
 
 	return 0;
 }
+#endif /* CONFIG_CFS_BANDWIDTH */
 
+#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
 const u64 max_bw_quota_period_us = 1 * USEC_PER_SEC; /* 1s */
 static const u64 min_bw_quota_period_us = 1 * USEC_PER_MSEC; /* 1ms */
 /* More than 203 days if BW_SHIFT equals 20. */
@@ -9554,12 +9556,21 @@ static const u64 max_bw_runtime_us = MAX_BW;
 static void tg_bandwidth(struct task_group *tg,
 			 u64 *period_us_p, u64 *quota_us_p, u64 *burst_us_p)
 {
+#ifdef CONFIG_CFS_BANDWIDTH
 	if (period_us_p)
 		*period_us_p = tg_get_cfs_period(tg);
 	if (quota_us_p)
 		*quota_us_p = tg_get_cfs_quota(tg);
 	if (burst_us_p)
 		*burst_us_p = tg_get_cfs_burst(tg);
+#else /* !CONFIG_CFS_BANDWIDTH */
+	if (period_us_p)
+		*period_us_p = tg->scx.bw_period_us;
+	if (quota_us_p)
+		*quota_us_p = tg->scx.bw_quota_us;
+	if (burst_us_p)
+		*burst_us_p = tg->scx.bw_burst_us;
+#endif /* CONFIG_CFS_BANDWIDTH */
 }
 
 static u64 cpu_period_read_u64(struct cgroup_subsys_state *css,
@@ -9575,6 +9586,7 @@ static int tg_set_bandwidth(struct task_group *tg,
 			    u64 period_us, u64 quota_us, u64 burst_us)
 {
 	const u64 max_usec = U64_MAX / NSEC_PER_USEC;
+	int ret = 0;
 
 	if (tg == &root_task_group)
 		return -EINVAL;
@@ -9612,7 +9624,12 @@ static int tg_set_bandwidth(struct task_group *tg,
 					burst_us + quota_us > max_bw_runtime_us))
 		return -EINVAL;
 
-	return tg_set_cfs_bandwidth(tg, period_us, quota_us, burst_us);
+#ifdef CONFIG_CFS_BANDWIDTH
+	ret = tg_set_cfs_bandwidth(tg, period_us, quota_us, burst_us);
+#endif /* CONFIG_CFS_BANDWIDTH */
+	if (!ret)
+		scx_group_set_bandwidth(tg, period_us, quota_us, burst_us);
+	return ret;
 }
 
 static s64 cpu_quota_read_s64(struct cgroup_subsys_state *css,
@@ -9665,7 +9682,7 @@ static int cpu_burst_write_u64(struct cgroup_subsys_state *css,
 	tg_bandwidth(tg, &period_us, &quota_us, NULL);
 	return tg_set_bandwidth(tg, period_us, quota_us, burst_us);
 }
-#endif /* CONFIG_CFS_BANDWIDTH */
+#endif /* CONFIG_GROUP_SCHED_BANDWIDTH */
 
 #ifdef CONFIG_RT_GROUP_SCHED
 static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
@@ -9725,7 +9742,7 @@ static struct cftype cpu_legacy_files[] = {
 		.write_s64 = cpu_idle_write_s64,
 	},
 #endif
-#ifdef CONFIG_CFS_BANDWIDTH
+#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
 	{
 		.name = "cfs_period_us",
 		.read_u64 = cpu_period_read_u64,
@@ -9741,6 +9758,8 @@ static struct cftype cpu_legacy_files[] = {
 		.read_u64 = cpu_burst_read_u64,
 		.write_u64 = cpu_burst_write_u64,
 	},
+#endif
+#ifdef CONFIG_CFS_BANDWIDTH
 	{
 		.name = "stat",
 		.seq_show = cpu_cfs_stat_show,
@@ -9954,7 +9973,7 @@ static int __maybe_unused cpu_period_quota_parse(char *buf, u64 *period_us_p,
 	return 0;
 }
 
-#ifdef CONFIG_CFS_BANDWIDTH
+#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
 static int cpu_max_show(struct seq_file *sf, void *v)
 {
 	struct task_group *tg = css_tg(seq_css(sf));
@@ -10001,7 +10020,7 @@ static struct cftype cpu_files[] = {
 		.write_s64 = cpu_idle_write_s64,
 	},
 #endif
-#ifdef CONFIG_CFS_BANDWIDTH
+#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
 	{
 		.name = "max",
 		.flags = CFTYPE_NOT_ON_ROOT,
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 6732e50e0679..39cba11688a9 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -203,6 +203,11 @@ struct scx_exit_task_args {
 struct scx_cgroup_init_args {
 	/* the weight of the cgroup [1..10000] */
 	u32			weight;
+
+	/* bandwidth control parameters from cpu.max and cpu.max.burst */
+	u64			bw_period_us;
+	u64			bw_quota_us;
+	u64			bw_burst_us;
 };
 
 enum scx_cpu_preempt_reason {
@@ -664,9 +669,31 @@ struct sched_ext_ops {
 	 * @cgrp: cgroup whose weight is being updated
 	 * @weight: new weight [1..10000]
 	 *
-	 * Update @tg's weight to @weight.
+	 * Update @cgrp's weight to @weight.
 	 */
 	void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight);
+
+	/**
+	 * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed
+	 * @cgrp: cgroup whose bandwidth is being updated
+	 * @period_us: bandwidth control period
+	 * @quota_us: bandwidth control quota
+	 * @burst_us: bandwidth control burst
+	 *
+	 * Update @cgrp's bandwidth control parameters. This is from the cpu.max
+	 * cgroup interface.
+	 *
+	 * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled
+	 * to. For example, if @period_us is 1_000_000 and @quota_us is
+	 * 2_500_000, @cgrp is entitled to 2.5 CPUs. @burst_us can be
+	 * interpreted in the same fashion and specifies how much @cgrp can
+	 * burst temporarily. The specific control mechanism and thus the
+	 * interpretation of @period_us and burstiness is up to the BPF
+	 * scheduler.
+	 */
+	void (*cgroup_set_bandwidth)(struct cgroup *cgrp,
+				     u64 period_us, u64 quota_us, u64 burst_us);
+
 #endif	/* CONFIG_EXT_GROUP_SCHED */
 
 	/*
@@ -4059,6 +4086,8 @@ static bool scx_cgroup_enabled;
 void scx_tg_init(struct task_group *tg)
 {
 	tg->scx.weight = CGROUP_WEIGHT_DFL;
+	tg->scx.bw_period_us = default_bw_period_us();
+	tg->scx.bw_quota_us = RUNTIME_INF;
 }
 
 int scx_tg_online(struct task_group *tg)
@@ -4073,7 +4102,10 @@ int scx_tg_online(struct task_group *tg)
 	if (scx_cgroup_enabled) {
 		if (SCX_HAS_OP(sch, cgroup_init)) {
 			struct scx_cgroup_init_args args =
-				{ .weight = tg->scx.weight };
+				{ .weight = tg->scx.weight,
+				  .bw_period_us = tg->scx.bw_period_us,
+				  .bw_quota_us = tg->scx.bw_quota_us,
+				  .bw_burst_us = tg->scx.bw_burst_us };
 
 			ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init,
 					      NULL, tg->css.cgroup, &args);
@@ -4225,6 +4257,27 @@ void scx_group_set_idle(struct task_group *tg, bool idle)
 	/* TODO: Implement ops->cgroup_set_idle() */
 }
 
+void scx_group_set_bandwidth(struct task_group *tg,
+			     u64 period_us, u64 quota_us, u64 burst_us)
+{
+	struct scx_sched *sch = scx_root;
+
+	percpu_down_read(&scx_cgroup_rwsem);
+
+	if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) &&
+	    (tg->scx.bw_period_us != period_us ||
+	     tg->scx.bw_quota_us != quota_us ||
+	     tg->scx.bw_burst_us != burst_us))
+		SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_bandwidth, NULL,
+			    tg_cgrp(tg), period_us, quota_us, burst_us);
+
+	tg->scx.bw_period_us = period_us;
+	tg->scx.bw_quota_us = quota_us;
+	tg->scx.bw_burst_us = burst_us;
+
+	percpu_up_read(&scx_cgroup_rwsem);
+}
+
 static void scx_cgroup_lock(void)
 {
 	percpu_down_write(&scx_cgroup_rwsem);
@@ -4400,7 +4453,12 @@ static int scx_cgroup_init(struct scx_sched *sch)
 	rcu_read_lock();
 	css_for_each_descendant_pre(css, &root_task_group.css) {
 		struct task_group *tg = css_tg(css);
-		struct scx_cgroup_init_args args = { .weight = tg->scx.weight };
+		struct scx_cgroup_init_args args = {
+			.weight = tg->scx.weight,
+			.bw_period_us = tg->scx.bw_period_us,
+			.bw_quota_us = tg->scx.bw_quota_us,
+			.bw_burst_us = tg->scx.bw_burst_us,
+		};
 
 		if ((tg->scx.flags &
 		     (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE)
@@ -5902,6 +5960,7 @@ static s32 sched_ext_ops__cgroup_prep_move(struct task_struct *p, struct cgroup
 static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
 static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
 static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {}
+static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {}
 #endif
 static void sched_ext_ops__cpu_online(s32 cpu) {}
 static void sched_ext_ops__cpu_offline(s32 cpu) {}
@@ -5939,6 +5998,7 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
 	.cgroup_move		= sched_ext_ops__cgroup_move,
 	.cgroup_cancel_move	= sched_ext_ops__cgroup_cancel_move,
 	.cgroup_set_weight	= sched_ext_ops__cgroup_set_weight,
+	.cgroup_set_bandwidth	= sched_ext_ops__cgroup_set_bandwidth,
 #endif
 	.cpu_online		= sched_ext_ops__cpu_online,
 	.cpu_offline		= sched_ext_ops__cpu_offline,
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index e7bcaa02ea56..292bb41a242e 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -104,6 +104,7 @@ void scx_cgroup_finish_attach(void);
 void scx_cgroup_cancel_attach(struct cgroup_taskset *tset);
 void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight);
 void scx_group_set_idle(struct task_group *tg, bool idle);
+void scx_group_set_bandwidth(struct task_group *tg, u64 period_us, u64 quota_us, u64 burst_us);
 #else	/* CONFIG_EXT_GROUP_SCHED */
 static inline void scx_tg_init(struct task_group *tg) {}
 static inline int scx_tg_online(struct task_group *tg) { return 0; }
@@ -114,5 +115,6 @@ static inline void scx_cgroup_finish_attach(void) {}
 static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {}
 static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {}
 static inline void scx_group_set_idle(struct task_group *tg, bool idle) {}
+static inline void scx_group_set_bandwidth(struct task_group *tg, u64 period_us, u64 quota_us, u64 burst_us) {}
 #endif	/* CONFIG_EXT_GROUP_SCHED */
 #endif	/* CONFIG_CGROUP_SCHED */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fdf5f52b54a3..06767a210717 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -402,7 +402,7 @@ static inline bool dl_server_active(struct sched_dl_entity *dl_se)
 
 extern struct list_head task_groups;
 
-#ifdef CONFIG_CFS_BANDWIDTH
+#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
 extern const u64 max_bw_quota_period_us;
 
 /*
@@ -413,7 +413,7 @@ static inline u64 default_bw_period_us(void)
 {
 	return 100000ULL;
 }
-#endif /* CONFIG_CFS_BANDWIDTH */
+#endif /* CONFIG_GROUP_SCHED_BANDWIDTH */
 
 struct cfs_bandwidth {
 #ifdef CONFIG_CFS_BANDWIDTH
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index c3cd9a17d48e..69d877501cb7 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -615,6 +615,26 @@ void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struc
 		     taskc->force_local, taskc->core_sched_seq);
 }
 
+s32 BPF_STRUCT_OPS(qmap_cgroup_init, struct cgroup *cgrp, struct scx_cgroup_init_args *args)
+{
+	bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu",
+		   cgrp->kn->id, args->weight, args->bw_period_us,
+		   args->bw_quota_us, args->bw_burst_us);
+	return 0;
+}
+
+void BPF_STRUCT_OPS(qmap_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
+{
+	bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight);
+}
+
+void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp,
+		    u64 period_us, u64 quota_us, u64 burst_us)
+{
+	bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu", cgrp->kn->id,
+		   period_us, quota_us, burst_us);
+}
+
 /*
  * Print out the online and possible CPU map using bpf_printk() as a
  * demonstration of using the cpumask kfuncs and ops.cpu_on/offline().
@@ -840,6 +860,9 @@ SCX_OPS_DEFINE(qmap_ops,
 	       .dump			= (void *)qmap_dump,
 	       .dump_cpu		= (void *)qmap_dump_cpu,
 	       .dump_task		= (void *)qmap_dump_task,
+	       .cgroup_init		= (void *)qmap_cgroup_init,
+	       .cgroup_set_weight	= (void *)qmap_cgroup_set_weight,
+	       .cgroup_set_bandwidth	= (void *)qmap_cgroup_set_bandwidth,
 	       .cpu_online		= (void *)qmap_cpu_online,
 	       .cpu_offline		= (void *)qmap_cpu_offline,
 	       .init			= (void *)qmap_init,
diff --git a/tools/testing/selftests/sched_ext/maximal.bpf.c b/tools/testing/selftests/sched_ext/maximal.bpf.c
index 430f5e13bf55..01cf4f3da4e0 100644
--- a/tools/testing/selftests/sched_ext/maximal.bpf.c
+++ b/tools/testing/selftests/sched_ext/maximal.bpf.c
@@ -123,6 +123,10 @@ void BPF_STRUCT_OPS(maximal_cgroup_cancel_move, struct task_struct *p,
 void BPF_STRUCT_OPS(maximal_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
 {}
 
+void BPF_STRUCT_OPS(maximal_cgroup_set_bandwidth, struct cgroup *cgrp,
+		    u64 period_us, u64 quota_us, u64 burst_us)
+{}
+
 s32 BPF_STRUCT_OPS_SLEEPABLE(maximal_init)
 {
 	return scx_bpf_create_dsq(DSQ_ID, -1);
@@ -160,6 +164,7 @@ struct sched_ext_ops maximal_ops = {
 	.cgroup_move		= (void *) maximal_cgroup_move,
 	.cgroup_cancel_move	= (void *) maximal_cgroup_cancel_move,
 	.cgroup_set_weight	= (void *) maximal_cgroup_set_weight,
+	.cgroup_set_bandwidth	= (void *) maximal_cgroup_set_bandwidth,
 	.init			= (void *) maximal_init,
 	.exit			= (void *) maximal_exit,
 	.name			= "maximal",
-- 
cgit v1.2.3


From 4672aec56d2e8edabcb74c3e2320301d106a377e Mon Sep 17 00:00:00 2001
From: Mina Almasry <almasrymina@google.com>
Date: Thu, 19 Jun 2025 17:52:38 +0000
Subject: netmem: fix skb_frag_address_safe with unreadable skbs

skb_frag_address_safe() needs a check that the skb_frag_page exists,
similar to the check in skb_frag_address().

Cc: ap420073@gmail.com

Signed-off-by: Mina Almasry <almasrymina@google.com>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20250619175239.3039329-1-almasrymina@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9508968cb300..4f6dcb37bae8 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3665,7 +3665,13 @@ static inline void *skb_frag_address(const skb_frag_t *frag)
  */
 static inline void *skb_frag_address_safe(const skb_frag_t *frag)
 {
-	void *ptr = page_address(skb_frag_page(frag));
+	struct page *page = skb_frag_page(frag);
+	void *ptr;
+
+	if (!page)
+		return NULL;
+
+	ptr = page_address(page);
 	if (unlikely(!ptr))
 		return NULL;
 
-- 
cgit v1.2.3


From 99aa0bbb082e7c0660751832acca897493c3082c Mon Sep 17 00:00:00 2001
From: Kory Maincent <kory.maincent@bootlin.com>
Date: Fri, 20 Jun 2025 11:16:41 +0200
Subject: net: pse-pd: Fix ethnl_pse_send_ntf() stub parameter type

The ethnl_pse_send_ntf() stub function has an incorrect parameter type when
CONFIG_ETHTOOL_NETLINK is disabled. The function should take a net_device
pointer instead of phy_device pointer to match the actual implementation.

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202506200355.TqFiYUbN-lkp@intel.com/
Fixes: fc0e6db30941 ("net: pse-pd: Add support for reporting events")
Signed-off-by: Kory Maincent <kory.maincent@bootlin.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20250620091641.2098028-1-kory.maincent@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/ethtool_netlink.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/ethtool_netlink.h b/include/linux/ethtool_netlink.h
index 1dcc4059b5ab..39254b2726c0 100644
--- a/include/linux/ethtool_netlink.h
+++ b/include/linux/ethtool_netlink.h
@@ -122,7 +122,7 @@ static inline bool ethtool_dev_mm_supported(struct net_device *dev)
 	return false;
 }
 
-static inline void ethnl_pse_send_ntf(struct phy_device *phydev,
+static inline void ethnl_pse_send_ntf(struct net_device *netdev,
 				      unsigned long notif)
 {
 }
-- 
cgit v1.2.3


From 520c790c83e9e4c915a8e3fc9f2152ece39b6511 Mon Sep 17 00:00:00 2001
From: Sebastian Reichel <sebastian.reichel@collabora.com>
Date: Sun, 8 Jun 2025 22:40:02 +0200
Subject: power: supply: core: remove of_node from power_supply_config

All drivers have been migrated from .of_node to .fwnode,
so let's kill the former.

Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
Link: https://lore.kernel.org/r/20250430-psy-core-convert-to-fwnode-v2-2-f9643b958677@collabora.com
Reviewed-by: Hans de Goede <hansg@kernel.org>
Signed-off-by: Hans de Goede <hansg@kernel.org>
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 drivers/power/supply/power_supply_core.c | 3 +--
 include/linux/power_supply.h             | 1 -
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/power/supply/power_supply_core.c b/drivers/power/supply/power_supply_core.c
index 33a5bfce4604..89947f1fe610 100644
--- a/drivers/power/supply/power_supply_core.c
+++ b/drivers/power/supply/power_supply_core.c
@@ -1529,10 +1529,9 @@ __power_supply_register(struct device *parent,
 	dev_set_drvdata(dev, psy);
 	psy->desc = desc;
 	if (cfg) {
+		device_set_node(dev, cfg->fwnode);
 		dev->groups = cfg->attr_grp;
 		psy->drv_data = cfg->drv_data;
-		dev->of_node =
-			cfg->fwnode ? to_of_node(cfg->fwnode) : cfg->of_node;
 		psy->supplied_to = cfg->supplied_to;
 		psy->num_supplicants = cfg->num_supplicants;
 	}
diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index 7803edaa8ff8..72012141656e 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -232,7 +232,6 @@ struct power_supply;
 
 /* Run-time specific power supply configuration */
 struct power_supply_config {
-	struct device_node *of_node;
 	struct fwnode_handle *fwnode;
 
 	/* Driver private data */
-- 
cgit v1.2.3


From f368f87b22dab8e97c5f447b00a0cae79fefbdcb Mon Sep 17 00:00:00 2001
From: Sebastian Reichel <sebastian.reichel@collabora.com>
Date: Sun, 8 Jun 2025 22:40:06 +0200
Subject: power: supply: core: convert to fwnode

Replace any DT specific code with fwnode in the power-supply
core.

Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Reviewed-by: Hans de Goede <hansg@kernel.org>
Link: https://lore.kernel.org/r/20250430-psy-core-convert-to-fwnode-v2-4-f9643b958677@collabora.com
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 drivers/power/supply/bq2415x_charger.c   |  2 +-
 drivers/power/supply/power_supply_core.c | 65 ++++++++++++++++----------------
 include/linux/power_supply.h             |  2 +-
 3 files changed, 34 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/drivers/power/supply/bq2415x_charger.c b/drivers/power/supply/bq2415x_charger.c
index 9e3b9181ee76..1ecbca510bba 100644
--- a/drivers/power/supply/bq2415x_charger.c
+++ b/drivers/power/supply/bq2415x_charger.c
@@ -1674,7 +1674,7 @@ static int bq2415x_probe(struct i2c_client *client)
 	/* Query for initial reported_mode and set it */
 	if (bq->nb.notifier_call) {
 		if (np) {
-			notify_psy = power_supply_get_by_phandle(np,
+			notify_psy = power_supply_get_by_phandle(of_fwnode_handle(np),
 						"ti,usb-charger-detection");
 			if (IS_ERR(notify_psy))
 				notify_psy = NULL;
diff --git a/drivers/power/supply/power_supply_core.c b/drivers/power/supply/power_supply_core.c
index f2c79f15838d..2d83bb125a48 100644
--- a/drivers/power/supply/power_supply_core.c
+++ b/drivers/power/supply/power_supply_core.c
@@ -18,7 +18,6 @@
 #include <linux/device.h>
 #include <linux/notifier.h>
 #include <linux/err.h>
-#include <linux/of.h>
 #include <linux/power_supply.h>
 #include <linux/property.h>
 #include <linux/thermal.h>
@@ -196,24 +195,24 @@ static int __power_supply_populate_supplied_from(struct power_supply *epsy,
 						 void *data)
 {
 	struct power_supply *psy = data;
-	struct device_node *np;
+	struct fwnode_handle *np;
 	int i = 0;
 
 	do {
-		np = of_parse_phandle(psy->dev.of_node, "power-supplies", i++);
-		if (!np)
+		np = fwnode_find_reference(psy->dev.fwnode, "power-supplies", i++);
+		if (IS_ERR(np))
 			break;
 
-		if (np == epsy->dev.of_node) {
+		if (np == epsy->dev.fwnode) {
 			dev_dbg(&psy->dev, "%s: Found supply : %s\n",
 				psy->desc->name, epsy->desc->name);
 			psy->supplied_from[i-1] = (char *)epsy->desc->name;
 			psy->num_supplies++;
-			of_node_put(np);
+			fwnode_handle_put(np);
 			break;
 		}
-		of_node_put(np);
-	} while (np);
+		fwnode_handle_put(np);
+	} while (!IS_ERR(np));
 
 	return 0;
 }
@@ -232,16 +231,16 @@ static int power_supply_populate_supplied_from(struct power_supply *psy)
 static int  __power_supply_find_supply_from_node(struct power_supply *epsy,
 						 void *data)
 {
-	struct device_node *np = data;
+	struct fwnode_handle *fwnode = data;
 
 	/* returning non-zero breaks out of power_supply_for_each_psy loop */
-	if (epsy->dev.of_node == np)
+	if (epsy->dev.fwnode == fwnode)
 		return 1;
 
 	return 0;
 }
 
-static int power_supply_find_supply_from_node(struct device_node *supply_node)
+static int power_supply_find_supply_from_fwnode(struct fwnode_handle *supply_node)
 {
 	int error;
 
@@ -249,7 +248,7 @@ static int power_supply_find_supply_from_node(struct device_node *supply_node)
 	 * power_supply_for_each_psy() either returns its own errors or values
 	 * returned by __power_supply_find_supply_from_node().
 	 *
-	 * __power_supply_find_supply_from_node() will return 0 (no match)
+	 * __power_supply_find_supply_from_fwnode() will return 0 (no match)
 	 * or 1 (match).
 	 *
 	 * We return 0 if power_supply_for_each_psy() returned 1, -EPROBE_DEFER if
@@ -262,7 +261,7 @@ static int power_supply_find_supply_from_node(struct device_node *supply_node)
 
 static int power_supply_check_supplies(struct power_supply *psy)
 {
-	struct device_node *np;
+	struct fwnode_handle *np;
 	int cnt = 0;
 
 	/* If there is already a list honor it */
@@ -270,24 +269,24 @@ static int power_supply_check_supplies(struct power_supply *psy)
 		return 0;
 
 	/* No device node found, nothing to do */
-	if (!psy->dev.of_node)
+	if (!psy->dev.fwnode)
 		return 0;
 
 	do {
 		int ret;
 
-		np = of_parse_phandle(psy->dev.of_node, "power-supplies", cnt++);
-		if (!np)
+		np = fwnode_find_reference(psy->dev.fwnode, "power-supplies", cnt++);
+		if (IS_ERR(np))
 			break;
 
-		ret = power_supply_find_supply_from_node(np);
-		of_node_put(np);
+		ret = power_supply_find_supply_from_fwnode(np);
+		fwnode_handle_put(np);
 
 		if (ret) {
 			dev_dbg(&psy->dev, "Failed to find supply!\n");
 			return ret;
 		}
-	} while (np);
+	} while (!IS_ERR(np));
 
 	/* Missing valid "power-supplies" entries */
 	if (cnt == 1)
@@ -498,14 +497,14 @@ void power_supply_put(struct power_supply *psy)
 EXPORT_SYMBOL_GPL(power_supply_put);
 
 #ifdef CONFIG_OF
-static int power_supply_match_device_node(struct device *dev, const void *data)
+static int power_supply_match_device_fwnode(struct device *dev, const void *data)
 {
-	return dev->parent && dev->parent->of_node == data;
+	return dev->parent && dev_fwnode(dev->parent) == data;
 }
 
 /**
  * power_supply_get_by_phandle() - Search for a power supply and returns its ref
- * @np: Pointer to device node holding phandle property
+ * @fwnode: Pointer to fwnode holding phandle property
  * @property: Name of property holding a power supply name
  *
  * If power supply was found, it increases reference count for the
@@ -515,21 +514,21 @@ static int power_supply_match_device_node(struct device *dev, const void *data)
  * Return: On success returns a reference to a power supply with
  * matching name equals to value under @property, NULL or ERR_PTR otherwise.
  */
-struct power_supply *power_supply_get_by_phandle(struct device_node *np,
-							const char *property)
+struct power_supply *power_supply_get_by_phandle(struct fwnode_handle *fwnode,
+						 const char *property)
 {
-	struct device_node *power_supply_np;
+	struct fwnode_handle *power_supply_fwnode;
 	struct power_supply *psy = NULL;
 	struct device *dev;
 
-	power_supply_np = of_parse_phandle(np, property, 0);
-	if (!power_supply_np)
-		return ERR_PTR(-ENODEV);
+	power_supply_fwnode = fwnode_find_reference(fwnode, property, 0);
+	if (IS_ERR(power_supply_fwnode))
+		return ERR_CAST(power_supply_fwnode);
 
-	dev = class_find_device(&power_supply_class, NULL, power_supply_np,
-				power_supply_match_device_node);
+	dev = class_find_device(&power_supply_class, NULL, power_supply_fwnode,
+				power_supply_match_device_fwnode);
 
-	of_node_put(power_supply_np);
+	fwnode_handle_put(power_supply_fwnode);
 
 	if (dev) {
 		psy = dev_to_psy(dev);
@@ -561,14 +560,14 @@ struct power_supply *devm_power_supply_get_by_phandle(struct device *dev,
 {
 	struct power_supply **ptr, *psy;
 
-	if (!dev->of_node)
+	if (!dev_fwnode(dev))
 		return ERR_PTR(-ENODEV);
 
 	ptr = devres_alloc(devm_power_supply_put, sizeof(*ptr), GFP_KERNEL);
 	if (!ptr)
 		return ERR_PTR(-ENOMEM);
 
-	psy = power_supply_get_by_phandle(dev->of_node, property);
+	psy = power_supply_get_by_phandle(dev_fwnode(dev), property);
 	if (IS_ERR_OR_NULL(psy)) {
 		devres_free(ptr);
 	} else {
diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index 72012141656e..d90ac7b73755 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -808,7 +808,7 @@ static inline struct power_supply *power_supply_get_by_name(const char *name)
 { return NULL; }
 #endif
 #ifdef CONFIG_OF
-extern struct power_supply *power_supply_get_by_phandle(struct device_node *np,
+extern struct power_supply *power_supply_get_by_phandle(struct fwnode_handle *fwnode,
 							const char *property);
 extern struct power_supply *devm_power_supply_get_by_phandle(
 				    struct device *dev, const char *property);
-- 
cgit v1.2.3


From 370643f45aad93476b6489238ccb45a77b94da3f Mon Sep 17 00:00:00 2001
From: Sebastian Reichel <sebastian.reichel@collabora.com>
Date: Sun, 8 Jun 2025 22:40:07 +0200
Subject: power: supply: core: rename power_supply_get_by_phandle to
 power_supply_get_by_reference

(devm_)power_supply_get_by_phandle now internally use fwnode and are no
longer DT specific. Thus drop the ifdef check for CONFIG_OF and rename
to (devm_)power_supply_get_by_reference to avoid the DT terminology.

Reviewed-by: Hans de Goede <hansg@kernel.org>
Link: https://lore.kernel.org/r/20250430-psy-core-convert-to-fwnode-v2-5-f9643b958677@collabora.com
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 drivers/phy/allwinner/phy-sun4i-usb.c    |  2 +-
 drivers/power/supply/bq2415x_charger.c   |  2 +-
 drivers/power/supply/power_supply_core.c | 22 ++++++++++------------
 include/linux/power_supply.h             | 15 +++------------
 4 files changed, 15 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/drivers/phy/allwinner/phy-sun4i-usb.c b/drivers/phy/allwinner/phy-sun4i-usb.c
index 29b8fd4b9351..8873aed3a52a 100644
--- a/drivers/phy/allwinner/phy-sun4i-usb.c
+++ b/drivers/phy/allwinner/phy-sun4i-usb.c
@@ -754,7 +754,7 @@ static int sun4i_usb_phy_probe(struct platform_device *pdev)
 	}
 
 	if (of_property_present(np, "usb0_vbus_power-supply")) {
-		data->vbus_power_supply = devm_power_supply_get_by_phandle(dev,
+		data->vbus_power_supply = devm_power_supply_get_by_reference(dev,
 						     "usb0_vbus_power-supply");
 		if (IS_ERR(data->vbus_power_supply)) {
 			dev_err(dev, "Couldn't get the VBUS power supply\n");
diff --git a/drivers/power/supply/bq2415x_charger.c b/drivers/power/supply/bq2415x_charger.c
index 1ecbca510bba..917c26ee56bc 100644
--- a/drivers/power/supply/bq2415x_charger.c
+++ b/drivers/power/supply/bq2415x_charger.c
@@ -1674,7 +1674,7 @@ static int bq2415x_probe(struct i2c_client *client)
 	/* Query for initial reported_mode and set it */
 	if (bq->nb.notifier_call) {
 		if (np) {
-			notify_psy = power_supply_get_by_phandle(of_fwnode_handle(np),
+			notify_psy = power_supply_get_by_reference(of_fwnode_handle(np),
 						"ti,usb-charger-detection");
 			if (IS_ERR(notify_psy))
 				notify_psy = NULL;
diff --git a/drivers/power/supply/power_supply_core.c b/drivers/power/supply/power_supply_core.c
index 2d83bb125a48..aedb20c1d276 100644
--- a/drivers/power/supply/power_supply_core.c
+++ b/drivers/power/supply/power_supply_core.c
@@ -496,14 +496,13 @@ void power_supply_put(struct power_supply *psy)
 }
 EXPORT_SYMBOL_GPL(power_supply_put);
 
-#ifdef CONFIG_OF
 static int power_supply_match_device_fwnode(struct device *dev, const void *data)
 {
 	return dev->parent && dev_fwnode(dev->parent) == data;
 }
 
 /**
- * power_supply_get_by_phandle() - Search for a power supply and returns its ref
+ * power_supply_get_by_reference() - Search for a power supply and returns its ref
  * @fwnode: Pointer to fwnode holding phandle property
  * @property: Name of property holding a power supply name
  *
@@ -514,8 +513,8 @@ static int power_supply_match_device_fwnode(struct device *dev, const void *data
  * Return: On success returns a reference to a power supply with
  * matching name equals to value under @property, NULL or ERR_PTR otherwise.
  */
-struct power_supply *power_supply_get_by_phandle(struct fwnode_handle *fwnode,
-						 const char *property)
+struct power_supply *power_supply_get_by_reference(struct fwnode_handle *fwnode,
+						   const char *property)
 {
 	struct fwnode_handle *power_supply_fwnode;
 	struct power_supply *psy = NULL;
@@ -537,7 +536,7 @@ struct power_supply *power_supply_get_by_phandle(struct fwnode_handle *fwnode,
 
 	return psy;
 }
-EXPORT_SYMBOL_GPL(power_supply_get_by_phandle);
+EXPORT_SYMBOL_GPL(power_supply_get_by_reference);
 
 static void devm_power_supply_put(struct device *dev, void *res)
 {
@@ -547,16 +546,16 @@ static void devm_power_supply_put(struct device *dev, void *res)
 }
 
 /**
- * devm_power_supply_get_by_phandle() - Resource managed version of
- *  power_supply_get_by_phandle()
+ * devm_power_supply_get_by_reference() - Resource managed version of
+ *  power_supply_get_by_reference()
  * @dev: Pointer to device holding phandle property
  * @property: Name of property holding a power supply phandle
  *
  * Return: On success returns a reference to a power supply with
  * matching name equals to value under @property, NULL or ERR_PTR otherwise.
  */
-struct power_supply *devm_power_supply_get_by_phandle(struct device *dev,
-						      const char *property)
+struct power_supply *devm_power_supply_get_by_reference(struct device *dev,
+							const char *property)
 {
 	struct power_supply **ptr, *psy;
 
@@ -567,7 +566,7 @@ struct power_supply *devm_power_supply_get_by_phandle(struct device *dev,
 	if (!ptr)
 		return ERR_PTR(-ENOMEM);
 
-	psy = power_supply_get_by_phandle(dev_fwnode(dev), property);
+	psy = power_supply_get_by_reference(dev_fwnode(dev), property);
 	if (IS_ERR_OR_NULL(psy)) {
 		devres_free(ptr);
 	} else {
@@ -576,8 +575,7 @@ struct power_supply *devm_power_supply_get_by_phandle(struct device *dev,
 	}
 	return psy;
 }
-EXPORT_SYMBOL_GPL(devm_power_supply_get_by_phandle);
-#endif /* CONFIG_OF */
+EXPORT_SYMBOL_GPL(devm_power_supply_get_by_reference);
 
 int power_supply_get_battery_info(struct power_supply *psy,
 				  struct power_supply_battery_info **info_out)
diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index d90ac7b73755..45468959dd98 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -807,19 +807,10 @@ static inline void power_supply_put(struct power_supply *psy) {}
 static inline struct power_supply *power_supply_get_by_name(const char *name)
 { return NULL; }
 #endif
-#ifdef CONFIG_OF
-extern struct power_supply *power_supply_get_by_phandle(struct fwnode_handle *fwnode,
-							const char *property);
-extern struct power_supply *devm_power_supply_get_by_phandle(
+extern struct power_supply *power_supply_get_by_reference(struct fwnode_handle *fwnode,
+							  const char *property);
+extern struct power_supply *devm_power_supply_get_by_reference(
 				    struct device *dev, const char *property);
-#else /* !CONFIG_OF */
-static inline struct power_supply *
-power_supply_get_by_phandle(struct device_node *np, const char *property)
-{ return NULL; }
-static inline struct power_supply *
-devm_power_supply_get_by_phandle(struct device *dev, const char *property)
-{ return NULL; }
-#endif /* CONFIG_OF */
 
 extern const enum power_supply_property power_supply_battery_info_properties[];
 extern const size_t power_supply_battery_info_properties_size;
-- 
cgit v1.2.3


From ac558015dfd803626622bd0ba9645d58a3ed16b1 Mon Sep 17 00:00:00 2001
From: Shuming Fan <shumingf@realtek.com>
Date: Mon, 16 Jun 2025 19:49:29 +0800
Subject: ASoC: SDCA: add a HID device for HIDE entity

This patch adds support for creating a HID device for an SDCA HIDE entity.
The codec driver can then call 'hid_input_report' to report events.

Signed-off-by: Shuming Fan <shumingf@realtek.com>
Reviewed-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://patch.msgid.link/20250616114929.855496-1-shumingf@realtek.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sdca_hid.h        |  25 ++++++++
 sound/soc/sdca/Kconfig          |   4 ++
 sound/soc/sdca/Makefile         |   3 +
 sound/soc/sdca/sdca_functions.c |   9 +++
 sound/soc/sdca/sdca_hid.c       | 127 ++++++++++++++++++++++++++++++++++++++++
 5 files changed, 168 insertions(+)
 create mode 100644 include/sound/sdca_hid.h
 create mode 100644 sound/soc/sdca/sdca_hid.c

(limited to 'include')

diff --git a/include/sound/sdca_hid.h b/include/sound/sdca_hid.h
new file mode 100644
index 000000000000..8ab3e498884e
--- /dev/null
+++ b/include/sound/sdca_hid.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause) */
+/*
+ * The MIPI SDCA specification is available for public downloads at
+ * https://www.mipi.org/mipi-sdca-v1-0-download
+ *
+ */
+
+#ifndef __SDCA_HID_H__
+#define __SDCA_HID_H__
+
+#include <linux/types.h>
+#include <linux/hid.h>
+
+#if IS_ENABLED(CONFIG_SND_SOC_SDCA_HID)
+int sdca_add_hid_device(struct device *dev, struct sdca_entity *entity);
+
+#else
+static inline int sdca_add_hid_device(struct device *dev, struct sdca_entity *entity)
+{
+	return 0;
+}
+
+#endif
+
+#endif /* __SDCA_HID_H__ */
diff --git a/sound/soc/sdca/Kconfig b/sound/soc/sdca/Kconfig
index ee20b9914aa1..ec28855fe3b0 100644
--- a/sound/soc/sdca/Kconfig
+++ b/sound/soc/sdca/Kconfig
@@ -9,3 +9,7 @@ config SND_SOC_SDCA
 
 config SND_SOC_SDCA_OPTIONAL
 	def_tristate SND_SOC_SDCA || !SND_SOC_SDCA
+
+config SND_SOC_SDCA_HID
+	tristate "SDCA HID support"
+	depends on SND_SOC_SDCA && HID
diff --git a/sound/soc/sdca/Makefile b/sound/soc/sdca/Makefile
index 53344f108ca6..9af46e7edfd2 100644
--- a/sound/soc/sdca/Makefile
+++ b/sound/soc/sdca/Makefile
@@ -2,4 +2,7 @@
 
 snd-soc-sdca-y	:= sdca_functions.o sdca_device.o sdca_regmap.o sdca_asoc.o
 
+snd-soc-sdca-hid-y := sdca_hid.o
+
+obj-$(CONFIG_SND_SOC_SDCA_HID)	+= snd-soc-sdca-hid.o
 obj-$(CONFIG_SND_SOC_SDCA)	+= snd-soc-sdca.o
diff --git a/sound/soc/sdca/sdca_functions.c b/sound/soc/sdca/sdca_functions.c
index 4a89067dcf76..093c681e9387 100644
--- a/sound/soc/sdca/sdca_functions.c
+++ b/sound/soc/sdca/sdca_functions.c
@@ -19,6 +19,7 @@
 #include <linux/types.h>
 #include <sound/sdca.h>
 #include <sound/sdca_function.h>
+#include <sound/sdca_hid.h>
 
 /*
  * Should be long enough to encompass all the MIPI DisCo properties.
@@ -1294,6 +1295,13 @@ find_sdca_entity_hide(struct device *dev, struct fwnode_handle *function_node,
 			hide->hid_report_desc = report_desc;
 			fwnode_property_read_u8_array(function_node, "mipi-sdca-report-descriptor",
 						      report_desc, nval);
+
+			/* add HID device */
+			ret = sdca_add_hid_device(dev, entity);
+			if (ret) {
+				dev_err(dev, "%pfwP: failed to add HID device: %d\n", entity_node, ret);
+				return ret;
+			}
 		}
 	}
 
@@ -1933,3 +1941,4 @@ EXPORT_SYMBOL_NS(sdca_parse_function, "SND_SOC_SDCA");
 
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_DESCRIPTION("SDCA library");
+MODULE_IMPORT_NS("SND_SOC_SDCA_HID");
diff --git a/sound/soc/sdca/sdca_hid.c b/sound/soc/sdca/sdca_hid.c
new file mode 100644
index 000000000000..b227ad94d08f
--- /dev/null
+++ b/sound/soc/sdca/sdca_hid.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+
+/*
+ * The MIPI SDCA specification is available for public downloads at
+ * https://www.mipi.org/mipi-sdca-v1-0-download
+ */
+
+#include <linux/acpi.h>
+#include <linux/byteorder/generic.h>
+#include <linux/cleanup.h>
+#include <linux/device.h>
+#include <linux/dev_printk.h>
+#include <linux/module.h>
+#include <linux/property.h>
+#include <linux/soundwire/sdw.h>
+#include <linux/types.h>
+#include <sound/sdca.h>
+#include <sound/sdca_function.h>
+#include <sound/sdca_hid.h>
+
+static int sdwhid_parse(struct hid_device *hid)
+{
+	struct sdca_entity *entity = hid->driver_data;
+	unsigned int rsize;
+	int ret;
+
+	rsize = entity->hide.hid_desc.rpt_desc.wDescriptorLength;
+
+	if (!rsize || rsize > HID_MAX_DESCRIPTOR_SIZE) {
+		dev_err(&hid->dev, "invalid size of report descriptor (%u)\n", rsize);
+		return -EINVAL;
+	}
+
+	ret = hid_parse_report(hid, entity->hide.hid_report_desc, rsize);
+
+	if (!ret)
+		return 0;
+
+	dev_err(&hid->dev, "parsing report descriptor failed\n");
+	return ret;
+}
+
+static int sdwhid_start(struct hid_device *hid)
+{
+	return 0;
+}
+
+static void sdwhid_stop(struct hid_device *hid)
+{
+}
+
+static int sdwhid_raw_request(struct hid_device *hid, unsigned char reportnum,
+			      __u8 *buf, size_t len, unsigned char rtype, int reqtype)
+{
+	switch (reqtype) {
+	case HID_REQ_GET_REPORT:
+		/* not implemented yet */
+		return 0;
+	case HID_REQ_SET_REPORT:
+		/* not implemented yet */
+		return 0;
+	default:
+		return -EIO;
+	}
+}
+
+static int sdwhid_open(struct hid_device *hid)
+{
+	return 0;
+}
+
+static void sdwhid_close(struct hid_device *hid)
+{
+}
+
+static const struct hid_ll_driver sdw_hid_driver = {
+	.parse = sdwhid_parse,
+	.start = sdwhid_start,
+	.stop = sdwhid_stop,
+	.open = sdwhid_open,
+	.close = sdwhid_close,
+	.raw_request = sdwhid_raw_request,
+};
+
+int sdca_add_hid_device(struct device *dev, struct sdca_entity *entity)
+{
+	struct sdw_bus *bus;
+	struct hid_device *hid;
+	struct sdw_slave *slave = dev_to_sdw_dev(dev);
+	int ret;
+
+	bus = slave->bus;
+
+	hid = hid_allocate_device();
+	if (IS_ERR(hid))
+		return PTR_ERR(hid);
+
+	hid->ll_driver = &sdw_hid_driver;
+
+	hid->dev.parent = dev;
+	hid->bus = BUS_SDW;
+	hid->version = le16_to_cpu(entity->hide.hid_desc.bcdHID);
+
+	snprintf(hid->name, sizeof(hid->name),
+		 "HID sdw:%01x:%01x:%04x:%04x:%02x",
+		 bus->controller_id, bus->link_id, slave->id.mfg_id,
+		 slave->id.part_id, slave->id.class_id);
+
+	snprintf(hid->phys, sizeof(hid->phys), "%s", dev->bus->name);
+
+	hid->driver_data = entity;
+
+	ret = hid_add_device(hid);
+	if (ret && ret != -ENODEV) {
+		dev_err(dev, "can't add hid device: %d\n", ret);
+		hid_destroy_device(hid);
+		return ret;
+	}
+
+	entity->hide.hid = hid;
+
+	return 0;
+}
+EXPORT_SYMBOL_NS(sdca_add_hid_device, "SND_SOC_SDCA_HID");
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("SDCA HID library");
-- 
cgit v1.2.3


From 1e2b7fcd3f075ff8c5b0e4474fe145d1c685f54f Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Fri, 13 Jun 2025 16:51:38 +0800
Subject: crypto: ahash - Stop legacy tfms from using the set_virt fallback
 path

Ensure that drivers that have not been converted to the ahash API
do not use the ahash_request_set_virt fallback path as they cannot
use the software fallback.

Reported-by: Eric Biggers <ebiggers@kernel.org>
Fixes: 9d7a0ab1c753 ("crypto: ahash - Handle partial blocks in API")
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/ahash.c                 | 3 +++
 include/crypto/internal/hash.h | 6 ++++++
 2 files changed, 9 insertions(+)

(limited to 'include')

diff --git a/crypto/ahash.c b/crypto/ahash.c
index bd9e49950201..7a51e0cf9322 100644
--- a/crypto/ahash.c
+++ b/crypto/ahash.c
@@ -350,6 +350,9 @@ static int ahash_do_req_chain(struct ahash_request *req,
 	if (!crypto_ahash_need_fallback(tfm))
 		return -ENOSYS;
 
+	if (crypto_hash_no_export_core(tfm))
+		return -ENOSYS;
+
 	{
 		u8 state[HASH_MAX_STATESIZE];
 
diff --git a/include/crypto/internal/hash.h b/include/crypto/internal/hash.h
index 0f85c543f80b..f052afa6e7b0 100644
--- a/include/crypto/internal/hash.h
+++ b/include/crypto/internal/hash.h
@@ -91,6 +91,12 @@ static inline bool crypto_hash_alg_needs_key(struct hash_alg_common *alg)
 		!(alg->base.cra_flags & CRYPTO_ALG_OPTIONAL_KEY);
 }
 
+static inline bool crypto_hash_no_export_core(struct crypto_ahash *tfm)
+{
+	return crypto_hash_alg_common(tfm)->base.cra_flags &
+	       CRYPTO_AHASH_ALG_NO_EXPORT_CORE;
+}
+
 int crypto_grab_ahash(struct crypto_ahash_spawn *spawn,
 		      struct crypto_instance *inst,
 		      const char *name, u32 type, u32 mask);
-- 
cgit v1.2.3


From 2566de3e06a35b6517bcab11fd25b65ef90fd180 Mon Sep 17 00:00:00 2001
From: Zenghui Yu <yuzenghui@huawei.com>
Date: Wed, 18 Jun 2025 18:00:26 +0800
Subject: crypto: hisilicon - Use fine grained DMA mapping direction

The following splat was triggered when booting the kernel built with
arm64's defconfig + CRYPTO_SELFTESTS + DMA_API_DEBUG.

 ------------[ cut here ]------------
 DMA-API: hisi_sec2 0000:75:00.0: cacheline tracking EEXIST, overlapping mappings aren't supported
 WARNING: CPU: 24 PID: 1273 at kernel/dma/debug.c:596 add_dma_entry+0x248/0x308

 Call trace:
  add_dma_entry+0x248/0x308 (P)
  debug_dma_map_sg+0x208/0x3e4
  __dma_map_sg_attrs+0xbc/0x118
  dma_map_sg_attrs+0x10/0x24
  hisi_acc_sg_buf_map_to_hw_sgl+0x80/0x218 [hisi_qm]
  sec_cipher_map+0xc4/0x338 [hisi_sec2]
  sec_aead_sgl_map+0x18/0x24 [hisi_sec2]
  sec_process+0xb8/0x36c [hisi_sec2]
  sec_aead_crypto+0xe4/0x264 [hisi_sec2]
  sec_aead_encrypt+0x14/0x20 [hisi_sec2]
  crypto_aead_encrypt+0x24/0x38
  test_aead_vec_cfg+0x480/0x7e4
  test_aead_vec+0x84/0x1b8
  alg_test_aead+0xc0/0x498
  alg_test.part.0+0x518/0x524
  alg_test+0x20/0x64
  cryptomgr_test+0x24/0x44
  kthread+0x130/0x1fc
  ret_from_fork+0x10/0x20
 ---[ end trace 0000000000000000 ]---
 DMA-API: Mapped at:
  debug_dma_map_sg+0x234/0x3e4
  __dma_map_sg_attrs+0xbc/0x118
  dma_map_sg_attrs+0x10/0x24
  hisi_acc_sg_buf_map_to_hw_sgl+0x80/0x218 [hisi_qm]
  sec_cipher_map+0xc4/0x338 [hisi_sec2]

This occurs in selftests where the input and the output scatterlist point
to the same underlying memory (e.g., when tested with INPLACE_TWO_SGLISTS
mode).

The problem is that the hisi_sec2 driver maps these two different
scatterlists using the DMA_BIDIRECTIONAL flag which leads to overlapped
write mappings which are not supported by the DMA layer.

Fix it by using the fine grained and correct DMA mapping directions. While
at it, switch the DMA directions used by the hisi_zip driver too.

Signed-off-by: Zenghui Yu <yuzenghui@huawei.com>
Reviewed-by: Longfang Liu <liulongfang@huawei.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/hisilicon/sec2/sec_crypto.c | 21 +++++++++++++--------
 drivers/crypto/hisilicon/sgl.c             | 15 ++++++++-------
 drivers/crypto/hisilicon/zip/zip_crypto.c  | 13 +++++++------
 include/linux/hisi_acc_qm.h                |  4 ++--
 4 files changed, 30 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/drivers/crypto/hisilicon/sec2/sec_crypto.c b/drivers/crypto/hisilicon/sec2/sec_crypto.c
index 8ea5305bc320..7d04e770a8c2 100644
--- a/drivers/crypto/hisilicon/sec2/sec_crypto.c
+++ b/drivers/crypto/hisilicon/sec2/sec_crypto.c
@@ -965,6 +965,7 @@ static int sec_cipher_map(struct sec_ctx *ctx, struct sec_req *req,
 	struct sec_qp_ctx *qp_ctx = req->qp_ctx;
 	struct sec_alg_res *res = &qp_ctx->res[req->req_id];
 	struct device *dev = ctx->dev;
+	enum dma_data_direction src_direction;
 	int ret;
 
 	if (req->use_pbuf) {
@@ -990,10 +991,11 @@ static int sec_cipher_map(struct sec_ctx *ctx, struct sec_req *req,
 		a_req->out_mac_dma = res->out_mac_dma;
 	}
 
+	src_direction = dst == src ? DMA_BIDIRECTIONAL : DMA_TO_DEVICE;
 	req->in = hisi_acc_sg_buf_map_to_hw_sgl(dev, src,
 						qp_ctx->c_in_pool,
 						req->req_id,
-						&req->in_dma);
+						&req->in_dma, src_direction);
 	if (IS_ERR(req->in)) {
 		dev_err(dev, "fail to dma map input sgl buffers!\n");
 		return PTR_ERR(req->in);
@@ -1003,7 +1005,7 @@ static int sec_cipher_map(struct sec_ctx *ctx, struct sec_req *req,
 		ret = sec_aead_mac_init(a_req);
 		if (unlikely(ret)) {
 			dev_err(dev, "fail to init mac data for ICV!\n");
-			hisi_acc_sg_buf_unmap(dev, src, req->in);
+			hisi_acc_sg_buf_unmap(dev, src, req->in, src_direction);
 			return ret;
 		}
 	}
@@ -1015,11 +1017,12 @@ static int sec_cipher_map(struct sec_ctx *ctx, struct sec_req *req,
 		c_req->c_out = hisi_acc_sg_buf_map_to_hw_sgl(dev, dst,
 							     qp_ctx->c_out_pool,
 							     req->req_id,
-							     &c_req->c_out_dma);
+							     &c_req->c_out_dma,
+							     DMA_FROM_DEVICE);
 
 		if (IS_ERR(c_req->c_out)) {
 			dev_err(dev, "fail to dma map output sgl buffers!\n");
-			hisi_acc_sg_buf_unmap(dev, src, req->in);
+			hisi_acc_sg_buf_unmap(dev, src, req->in, src_direction);
 			return PTR_ERR(c_req->c_out);
 		}
 	}
@@ -1036,10 +1039,12 @@ static void sec_cipher_unmap(struct sec_ctx *ctx, struct sec_req *req,
 	if (req->use_pbuf) {
 		sec_cipher_pbuf_unmap(ctx, req, dst);
 	} else {
-		if (dst != src)
-			hisi_acc_sg_buf_unmap(dev, src, req->in);
-
-		hisi_acc_sg_buf_unmap(dev, dst, c_req->c_out);
+		if (dst != src) {
+			hisi_acc_sg_buf_unmap(dev, dst, c_req->c_out, DMA_FROM_DEVICE);
+			hisi_acc_sg_buf_unmap(dev, src, req->in, DMA_TO_DEVICE);
+		} else {
+			hisi_acc_sg_buf_unmap(dev, src, req->in, DMA_BIDIRECTIONAL);
+		}
 	}
 }
 
diff --git a/drivers/crypto/hisilicon/sgl.c b/drivers/crypto/hisilicon/sgl.c
index c974f95cd126..7a9ef2a9972a 100644
--- a/drivers/crypto/hisilicon/sgl.c
+++ b/drivers/crypto/hisilicon/sgl.c
@@ -210,15 +210,15 @@ static void clear_hw_sgl_sge(struct hisi_acc_hw_sgl *hw_sgl)
  * @pool: Pool which hw sgl memory will be allocated in.
  * @index: Index of hisi_acc_hw_sgl in pool.
  * @hw_sgl_dma: The dma address of allocated hw sgl.
+ * @dir: DMA direction.
  *
  * This function builds hw sgl according input sgl, user can use hw_sgl_dma
  * as src/dst in its BD. Only support single hw sgl currently.
  */
 struct hisi_acc_hw_sgl *
-hisi_acc_sg_buf_map_to_hw_sgl(struct device *dev,
-			      struct scatterlist *sgl,
-			      struct hisi_acc_sgl_pool *pool,
-			      u32 index, dma_addr_t *hw_sgl_dma)
+hisi_acc_sg_buf_map_to_hw_sgl(struct device *dev, struct scatterlist *sgl,
+			      struct hisi_acc_sgl_pool *pool, u32 index,
+			      dma_addr_t *hw_sgl_dma, enum dma_data_direction dir)
 {
 	struct hisi_acc_hw_sgl *curr_hw_sgl;
 	unsigned int i, sg_n_mapped;
@@ -232,7 +232,7 @@ hisi_acc_sg_buf_map_to_hw_sgl(struct device *dev,
 
 	sg_n = sg_nents(sgl);
 
-	sg_n_mapped = dma_map_sg(dev, sgl, sg_n, DMA_BIDIRECTIONAL);
+	sg_n_mapped = dma_map_sg(dev, sgl, sg_n, dir);
 	if (!sg_n_mapped) {
 		dev_err(dev, "DMA mapping for SG error!\n");
 		return ERR_PTR(-EINVAL);
@@ -276,16 +276,17 @@ EXPORT_SYMBOL_GPL(hisi_acc_sg_buf_map_to_hw_sgl);
  * @dev: The device which hw sgl belongs to.
  * @sgl: Related scatterlist.
  * @hw_sgl: Virtual address of hw sgl.
+ * @dir: DMA direction.
  *
  * This function unmaps allocated hw sgl.
  */
 void hisi_acc_sg_buf_unmap(struct device *dev, struct scatterlist *sgl,
-			   struct hisi_acc_hw_sgl *hw_sgl)
+			   struct hisi_acc_hw_sgl *hw_sgl, enum dma_data_direction dir)
 {
 	if (!dev || !sgl || !hw_sgl)
 		return;
 
-	dma_unmap_sg(dev, sgl, sg_nents(sgl), DMA_BIDIRECTIONAL);
+	dma_unmap_sg(dev, sgl, sg_nents(sgl), dir);
 	clear_hw_sgl_sge(hw_sgl);
 	hw_sgl->entry_sum_in_chain = 0;
 	hw_sgl->entry_sum_in_sgl = 0;
diff --git a/drivers/crypto/hisilicon/zip/zip_crypto.c b/drivers/crypto/hisilicon/zip/zip_crypto.c
index 7327f8f29b01..b97513981a3b 100644
--- a/drivers/crypto/hisilicon/zip/zip_crypto.c
+++ b/drivers/crypto/hisilicon/zip/zip_crypto.c
@@ -224,7 +224,8 @@ static int hisi_zip_do_work(struct hisi_zip_qp_ctx *qp_ctx,
 		return -EINVAL;
 
 	req->hw_src = hisi_acc_sg_buf_map_to_hw_sgl(dev, a_req->src, pool,
-						    req->req_id << 1, &req->dma_src);
+						    req->req_id << 1, &req->dma_src,
+						    DMA_TO_DEVICE);
 	if (IS_ERR(req->hw_src)) {
 		dev_err(dev, "failed to map the src buffer to hw sgl (%ld)!\n",
 			PTR_ERR(req->hw_src));
@@ -233,7 +234,7 @@ static int hisi_zip_do_work(struct hisi_zip_qp_ctx *qp_ctx,
 
 	req->hw_dst = hisi_acc_sg_buf_map_to_hw_sgl(dev, a_req->dst, pool,
 						    (req->req_id << 1) + 1,
-						    &req->dma_dst);
+						    &req->dma_dst, DMA_FROM_DEVICE);
 	if (IS_ERR(req->hw_dst)) {
 		ret = PTR_ERR(req->hw_dst);
 		dev_err(dev, "failed to map the dst buffer to hw slg (%d)!\n",
@@ -258,9 +259,9 @@ static int hisi_zip_do_work(struct hisi_zip_qp_ctx *qp_ctx,
 	return -EINPROGRESS;
 
 err_unmap_output:
-	hisi_acc_sg_buf_unmap(dev, a_req->dst, req->hw_dst);
+	hisi_acc_sg_buf_unmap(dev, a_req->dst, req->hw_dst, DMA_FROM_DEVICE);
 err_unmap_input:
-	hisi_acc_sg_buf_unmap(dev, a_req->src, req->hw_src);
+	hisi_acc_sg_buf_unmap(dev, a_req->src, req->hw_src, DMA_TO_DEVICE);
 	return ret;
 }
 
@@ -303,8 +304,8 @@ static void hisi_zip_acomp_cb(struct hisi_qp *qp, void *data)
 		err = -EIO;
 	}
 
-	hisi_acc_sg_buf_unmap(dev, acomp_req->src, req->hw_src);
-	hisi_acc_sg_buf_unmap(dev, acomp_req->dst, req->hw_dst);
+	hisi_acc_sg_buf_unmap(dev, acomp_req->dst, req->hw_dst, DMA_FROM_DEVICE);
+	hisi_acc_sg_buf_unmap(dev, acomp_req->src, req->hw_src, DMA_TO_DEVICE);
 
 	acomp_req->dlen = ops->get_dstlen(sqe);
 
diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h
index 99fcf65d575f..0c4c84b8c3be 100644
--- a/include/linux/hisi_acc_qm.h
+++ b/include/linux/hisi_acc_qm.h
@@ -556,9 +556,9 @@ int hisi_qm_mb(struct hisi_qm *qm, u8 cmd, dma_addr_t dma_addr, u16 queue,
 struct hisi_acc_sgl_pool;
 struct hisi_acc_hw_sgl *hisi_acc_sg_buf_map_to_hw_sgl(struct device *dev,
 	struct scatterlist *sgl, struct hisi_acc_sgl_pool *pool,
-	u32 index, dma_addr_t *hw_sgl_dma);
+	u32 index, dma_addr_t *hw_sgl_dma, enum dma_data_direction dir);
 void hisi_acc_sg_buf_unmap(struct device *dev, struct scatterlist *sgl,
-			   struct hisi_acc_hw_sgl *hw_sgl);
+			   struct hisi_acc_hw_sgl *hw_sgl, enum dma_data_direction dir);
 struct hisi_acc_sgl_pool *hisi_acc_create_sgl_pool(struct device *dev,
 						   u32 count, u32 sge_nr);
 void hisi_acc_free_sgl_pool(struct device *dev,
-- 
cgit v1.2.3


From 2773d282cd56464f62e9b4703c41d2f733a67842 Mon Sep 17 00:00:00 2001
From: Junxuan Liao <ljx@cs.wisc.edu>
Date: Sun, 22 Jun 2025 23:01:32 -0500
Subject: docs/vfs: update references to i_mutex to i_rwsem

VFS has been using i_rwsem for ten years now (9902af79c01a: parallel
lookups actual switch to rwsem), but the VFS documentation and comments
still have references to i_mutex.

Signed-off-by: Junxuan Liao <ljx@cs.wisc.edu>
Link: https://lore.kernel.org/72223729-5471-474a-af3c-f366691fba82@cs.wisc.edu
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 Documentation/filesystems/vfs.rst |  5 +++--
 fs/attr.c                         | 10 +++++-----
 fs/buffer.c                       |  2 +-
 fs/dcache.c                       | 10 +++++-----
 fs/direct-io.c                    |  8 ++++----
 fs/inode.c                        |  9 ++++-----
 fs/libfs.c                        |  5 +++--
 fs/locks.c                        |  2 +-
 fs/namei.c                        | 22 +++++++++++-----------
 fs/namespace.c                    |  2 +-
 fs/stack.c                        |  4 ++--
 fs/xattr.c                        |  2 +-
 include/linux/exportfs.h          |  4 ++--
 include/linux/fs.h                |  6 +++---
 include/linux/fs_stack.h          |  2 +-
 include/linux/quotaops.h          |  2 +-
 16 files changed, 48 insertions(+), 47 deletions(-)

(limited to 'include')

diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
index fd32a9a17bfb..dd9da7e04a99 100644
--- a/Documentation/filesystems/vfs.rst
+++ b/Documentation/filesystems/vfs.rst
@@ -758,8 +758,9 @@ process is more complicated and uses write_begin/write_end or
 dirty_folio to write data into the address_space, and
 writepages to writeback data to storage.
 
-Adding and removing pages to/from an address_space is protected by the
-inode's i_mutex.
+Removing pages from an address_space requires holding the inode's i_rwsem
+exclusively, while adding pages to the address_space requires holding the
+inode's i_mapping->invalidate_lock exclusively.
 
 When data is written to a page, the PG_Dirty flag should be set.  It
 typically remains set until writepages asks for it to be written.  This
diff --git a/fs/attr.c b/fs/attr.c
index 9caf63d20d03..5425c1dbbff9 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -230,7 +230,7 @@ EXPORT_SYMBOL(setattr_prepare);
  * @inode:	the inode to be truncated
  * @offset:	the new size to assign to the inode
  *
- * inode_newsize_ok must be called with i_mutex held.
+ * inode_newsize_ok must be called with i_rwsem held exclusively.
  *
  * inode_newsize_ok will check filesystem limits and ulimits to check that the
  * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ
@@ -318,7 +318,7 @@ static void setattr_copy_mgtime(struct inode *inode, const struct iattr *attr)
  * @inode:	the inode to be updated
  * @attr:	the new attributes
  *
- * setattr_copy must be called with i_mutex held.
+ * setattr_copy must be called with i_rwsem held exclusively.
  *
  * setattr_copy updates the inode's metadata with that specified
  * in attr on idmapped mounts. Necessary permission checks to determine
@@ -403,13 +403,13 @@ EXPORT_SYMBOL(may_setattr);
  * @attr:	new attributes
  * @delegated_inode: returns inode, if the inode is delegated
  *
- * The caller must hold the i_mutex on the affected object.
+ * The caller must hold the i_rwsem exclusively on the affected object.
  *
  * If notify_change discovers a delegation in need of breaking,
  * it will return -EWOULDBLOCK and return a reference to the inode in
  * delegated_inode.  The caller should then break the delegation and
  * retry.  Because breaking a delegation may take a long time, the
- * caller should drop the i_mutex before doing so.
+ * caller should drop the i_rwsem before doing so.
  *
  * Alternatively, a caller may pass NULL for delegated_inode.  This may
  * be appropriate for callers that expect the underlying filesystem not
@@ -456,7 +456,7 @@ int notify_change(struct mnt_idmap *idmap, struct dentry *dentry,
 		if (S_ISLNK(inode->i_mode))
 			return -EOPNOTSUPP;
 
-		/* Flag setting protected by i_mutex */
+		/* Flag setting protected by i_rwsem */
 		if (is_sxid(attr->ia_mode))
 			inode->i_flags &= ~S_NOSEC;
 	}
diff --git a/fs/buffer.c b/fs/buffer.c
index a14d281c6a74..1d34200f69c8 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2609,7 +2609,7 @@ EXPORT_SYMBOL(cont_write_begin);
  * holes and correct delalloc and unwritten extent mapping on filesystems that
  * support these features.
  *
- * We are not allowed to take the i_mutex here so we have to play games to
+ * We are not allowed to take the i_rwsem here so we have to play games to
  * protect against truncate races as the page could now be beyond EOF.  Because
  * truncate writes the inode size before removing pages, once we have the
  * page lock we can determine safely if the page is beyond EOF. If it is not
diff --git a/fs/dcache.c b/fs/dcache.c
index 03d58b2d4fa3..ab8465ae9cad 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2774,10 +2774,10 @@ static void copy_name(struct dentry *dentry, struct dentry *target)
  * @target: new dentry
  * @exchange: exchange the two dentries
  *
- * Update the dcache to reflect the move of a file name. Negative
- * dcache entries should not be moved in this way. Caller must hold
- * rename_lock, the i_mutex of the source and target directories,
- * and the sb->s_vfs_rename_mutex if they differ. See lock_rename().
+ * Update the dcache to reflect the move of a file name. Negative dcache
+ * entries should not be moved in this way. Caller must hold rename_lock, the
+ * i_rwsem of the source and target directories (exclusively), and the sb->
+ * s_vfs_rename_mutex if they differ. See lock_rename().
  */
 static void __d_move(struct dentry *dentry, struct dentry *target,
 		     bool exchange)
@@ -2923,7 +2923,7 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2)
  * This helper attempts to cope with remotely renamed directories
  *
  * It assumes that the caller is already holding
- * dentry->d_parent->d_inode->i_mutex, and rename_lock
+ * dentry->d_parent->d_inode->i_rwsem, and rename_lock
  *
  * Note: If ever the locking in lock_rename() changes, then please
  * remember to update this too...
diff --git a/fs/direct-io.c b/fs/direct-io.c
index bbd05f1a2145..1694ee9a9382 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1083,8 +1083,8 @@ static inline int drop_refcount(struct dio *dio)
  * The locking rules are governed by the flags parameter:
  *  - if the flags value contains DIO_LOCKING we use a fancy locking
  *    scheme for dumb filesystems.
- *    For writes this function is called under i_mutex and returns with
- *    i_mutex held, for reads, i_mutex is not held on entry, but it is
+ *    For writes this function is called under i_rwsem and returns with
+ *    i_rwsem held, for reads, i_rwsem is not held on entry, but it is
  *    taken and dropped again before returning.
  *  - if the flags value does NOT contain DIO_LOCKING we don't use any
  *    internal locking but rather rely on the filesystem to synchronize
@@ -1094,7 +1094,7 @@ static inline int drop_refcount(struct dio *dio)
  * counter before starting direct I/O, and decrement it once we are done.
  * Truncate can wait for it to reach zero to provide exclusion.  It is
  * expected that filesystem provide exclusion between new direct I/O
- * and truncates.  For DIO_LOCKING filesystems this is done by i_mutex,
+ * and truncates.  For DIO_LOCKING filesystems this is done by i_rwsem,
  * but other filesystems need to take care of this on their own.
  *
  * NOTE: if you pass "sdio" to anything by pointer make sure that function
@@ -1279,7 +1279,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 
 	/*
 	 * All block lookups have been performed. For READ requests
-	 * we can let i_mutex go now that its achieved its purpose
+	 * we can let i_rwsem go now that its achieved its purpose
 	 * of protecting us from looking up uninitialized blocks.
 	 */
 	if (iov_iter_rw(iter) == READ && (dio->flags & DIO_LOCKING))
diff --git a/fs/inode.c b/fs/inode.c
index 99318b157a9a..a0150e2ef22a 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1158,9 +1158,8 @@ void lockdep_annotate_inode_mutex_key(struct inode *inode)
 		/* Set new key only if filesystem hasn't already changed it */
 		if (lockdep_match_class(&inode->i_rwsem, &type->i_mutex_key)) {
 			/*
-			 * ensure nobody is actually holding i_mutex
+			 * ensure nobody is actually holding i_rwsem
 			 */
-			// mutex_destroy(&inode->i_mutex);
 			init_rwsem(&inode->i_rwsem);
 			lockdep_set_class(&inode->i_rwsem,
 					  &type->i_mutex_dir_key);
@@ -2615,7 +2614,7 @@ EXPORT_SYMBOL(inode_dio_finished);
  * proceed with a truncate or equivalent operation.
  *
  * Must be called under a lock that serializes taking new references
- * to i_dio_count, usually by inode->i_mutex.
+ * to i_dio_count, usually by inode->i_rwsem.
  */
 void inode_dio_wait(struct inode *inode)
 {
@@ -2633,7 +2632,7 @@ EXPORT_SYMBOL(inode_dio_wait_interruptible);
 /*
  * inode_set_flags - atomically set some inode flags
  *
- * Note: the caller should be holding i_mutex, or else be sure that
+ * Note: the caller should be holding i_rwsem exclusively, or else be sure that
  * they have exclusive access to the inode structure (i.e., while the
  * inode is being instantiated).  The reason for the cmpxchg() loop
  * --- which wouldn't be necessary if all code paths which modify
@@ -2641,7 +2640,7 @@ EXPORT_SYMBOL(inode_dio_wait_interruptible);
  * code path which doesn't today so we use cmpxchg() out of an abundance
  * of caution.
  *
- * In the long run, i_mutex is overkill, and we should probably look
+ * In the long run, i_rwsem is overkill, and we should probably look
  * at using the i_lock spinlock to protect i_flags, and then make sure
  * it is so documented in include/linux/fs.h and that all code follows
  * the locking convention!!
diff --git a/fs/libfs.c b/fs/libfs.c
index 9ea0ecc325a8..4d1862f589e8 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -946,7 +946,8 @@ EXPORT_SYMBOL(simple_write_begin);
  * simple_write_end does the minimum needed for updating a folio after
  * writing is done. It has the same API signature as the .write_end of
  * address_space_operations vector. So it can just be set onto .write_end for
- * FSes that don't need any other processing. i_mutex is assumed to be held.
+ * FSes that don't need any other processing. i_rwsem is assumed to be held
+ * exclusively.
  * Block based filesystems should use generic_write_end().
  * NOTE: Even though i_size might get updated by this function, mark_inode_dirty
  * is not called, so a filesystem that actually does store data in .write_inode
@@ -973,7 +974,7 @@ static int simple_write_end(struct file *file, struct address_space *mapping,
 	}
 	/*
 	 * No need to use i_size_read() here, the i_size
-	 * cannot change under us because we hold the i_mutex.
+	 * cannot change under us because we hold the i_rwsem.
 	 */
 	if (last_pos > inode->i_size)
 		i_size_write(inode, last_pos);
diff --git a/fs/locks.c b/fs/locks.c
index f96024feab17..559f02aa4172 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1794,7 +1794,7 @@ generic_add_lease(struct file *filp, int arg, struct file_lease **flp, void **pr
 
 	/*
 	 * In the delegation case we need mutual exclusion with
-	 * a number of operations that take the i_mutex.  We trylock
+	 * a number of operations that take the i_rwsem.  We trylock
 	 * because delegations are an optional optimization, and if
 	 * there's some chance of a conflict--we'd rather not
 	 * bother, maybe that's a sign this just isn't a good file to
diff --git a/fs/namei.c b/fs/namei.c
index 981da44e1291..f5c157290ce2 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1469,7 +1469,7 @@ static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped,
 	int ret = 0;
 
 	while (flags & DCACHE_MANAGED_DENTRY) {
-		/* Allow the filesystem to manage the transit without i_mutex
+		/* Allow the filesystem to manage the transit without i_rwsem
 		 * being held. */
 		if (flags & DCACHE_MANAGE_TRANSIT) {
 			ret = path->dentry->d_op->d_manage(path, false);
@@ -2945,7 +2945,7 @@ EXPORT_SYMBOL(try_lookup_noperm);
  * Note that this routine is purely a helper for filesystem usage and should
  * not be called by generic code.  It does no permission checking.
  *
- * The caller must hold base->i_mutex.
+ * The caller must hold base->i_rwsem.
  */
 struct dentry *lookup_noperm(struct qstr *name, struct dentry *base)
 {
@@ -2971,7 +2971,7 @@ EXPORT_SYMBOL(lookup_noperm);
  *
  * This can be used for in-kernel filesystem clients such as file servers.
  *
- * The caller must hold base->i_mutex.
+ * The caller must hold base->i_rwsem.
  */
 struct dentry *lookup_one(struct mnt_idmap *idmap, struct qstr *name,
 			  struct dentry *base)
@@ -4542,13 +4542,13 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
  * @dentry:	victim
  * @delegated_inode: returns victim inode, if the inode is delegated.
  *
- * The caller must hold dir->i_mutex.
+ * The caller must hold dir->i_rwsem exclusively.
  *
  * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
  * return a reference to the inode in delegated_inode.  The caller
  * should then break the delegation on that inode and retry.  Because
  * breaking a delegation may take a long time, the caller should drop
- * dir->i_mutex before doing so.
+ * dir->i_rwsem before doing so.
  *
  * Alternatively, a caller may pass NULL for delegated_inode.  This may
  * be appropriate for callers that expect the underlying filesystem not
@@ -4607,7 +4607,7 @@ EXPORT_SYMBOL(vfs_unlink);
 
 /*
  * Make sure that the actual truncation of the file will occur outside its
- * directory's i_mutex.  Truncate can take a long time if there is a lot of
+ * directory's i_rwsem.  Truncate can take a long time if there is a lot of
  * writeout happening, and we don't want to prevent access to the directory
  * while waiting on the I/O.
  */
@@ -4785,13 +4785,13 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn
  * @new_dentry:	where to create the new link
  * @delegated_inode: returns inode needing a delegation break
  *
- * The caller must hold dir->i_mutex
+ * The caller must hold dir->i_rwsem exclusively.
  *
  * If vfs_link discovers a delegation on the to-be-linked file in need
  * of breaking, it will return -EWOULDBLOCK and return a reference to the
  * inode in delegated_inode.  The caller should then break the delegation
  * and retry.  Because breaking a delegation may take a long time, the
- * caller should drop the i_mutex before doing so.
+ * caller should drop the i_rwsem before doing so.
  *
  * Alternatively, a caller may pass NULL for delegated_inode.  This may
  * be appropriate for callers that expect the underlying filesystem not
@@ -4987,7 +4987,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
  *	c) we may have to lock up to _four_ objects - parents and victim (if it exists),
  *	   and source (if it's a non-directory or a subdirectory that moves to
  *	   different parent).
- *	   And that - after we got ->i_mutex on parents (until then we don't know
+ *	   And that - after we got ->i_rwsem on parents (until then we don't know
  *	   whether the target exists).  Solution: try to be smart with locking
  *	   order for inodes.  We rely on the fact that tree topology may change
  *	   only under ->s_vfs_rename_mutex _and_ that parent of the object we
@@ -4999,9 +4999,9 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
  *	   has no more than 1 dentry.  If "hybrid" objects will ever appear,
  *	   we'd better make sure that there's no link(2) for them.
  *	d) conversion from fhandle to dentry may come in the wrong moment - when
- *	   we are removing the target. Solution: we will have to grab ->i_mutex
+ *	   we are removing the target. Solution: we will have to grab ->i_rwsem
  *	   in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
- *	   ->i_mutex on parents, which works but leads to some truly excessive
+ *	   ->i_rwsem on parents, which works but leads to some truly excessive
  *	   locking].
  */
 int vfs_rename(struct renamedata *rd)
diff --git a/fs/namespace.c b/fs/namespace.c
index e13d9ab4f564..8a1bfdf862f8 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2053,7 +2053,7 @@ out:
  * detach_mounts allows lazily unmounting those mounts instead of
  * leaking them.
  *
- * The caller may hold dentry->d_inode->i_mutex.
+ * The caller may hold dentry->d_inode->i_rwsem.
  */
 void __detach_mounts(struct dentry *dentry)
 {
diff --git a/fs/stack.c b/fs/stack.c
index f18920119944..d8c782e064e3 100644
--- a/fs/stack.c
+++ b/fs/stack.c
@@ -3,7 +3,7 @@
 #include <linux/fs.h>
 #include <linux/fs_stack.h>
 
-/* does _NOT_ require i_mutex to be held.
+/* does _NOT_ require i_rwsem to be held.
  *
  * This function cannot be inlined since i_size_{read,write} is rather
  * heavy-weight on 32-bit systems
@@ -41,7 +41,7 @@ void fsstack_copy_inode_size(struct inode *dst, struct inode *src)
 	 * If CONFIG_SMP or CONFIG_PREEMPTION on 32-bit, it's vital for
 	 * fsstack_copy_inode_size() to hold some lock around
 	 * i_size_write(), otherwise i_size_read() may spin forever (see
-	 * include/linux/fs.h).  We don't necessarily hold i_mutex when this
+	 * include/linux/fs.h).  We don't necessarily hold i_rwsem when this
 	 * is called, so take i_lock for that case.
 	 *
 	 * And if on 32-bit, continue our effort to keep the two halves of
diff --git a/fs/xattr.c b/fs/xattr.c
index 8ec5b0204bfd..c32e7d56a5d3 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -215,7 +215,7 @@ EXPORT_SYMBOL(__vfs_setxattr);
  *
  *  returns the result of the internal setxattr or setsecurity operations.
  *
- *  This function requires the caller to lock the inode's i_mutex before it
+ *  This function requires the caller to lock the inode's i_rwsem before it
  *  is executed. It also assumes that the caller will make the appropriate
  *  permission checks.
  */
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 25c4a5afbd44..cfb0dd1ea49c 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -230,7 +230,7 @@ struct handle_to_path_ctx {
  *    directory.  The name should be stored in the @name (with the
  *    understanding that it is already pointing to a %NAME_MAX+1 sized
  *    buffer.   get_name() should return %0 on success, a negative error code
- *    or error.  @get_name will be called without @parent->i_mutex held.
+ *    or error.  @get_name will be called without @parent->i_rwsem held.
  *
  * get_parent:
  *    @get_parent should find the parent directory for the given @child which
@@ -247,7 +247,7 @@ struct handle_to_path_ctx {
  *    @commit_metadata should commit metadata changes to stable storage.
  *
  * Locking rules:
- *    get_parent is called with child->d_inode->i_mutex down
+ *    get_parent is called with child->d_inode->i_rwsem down
  *    get_name is not (which is possibly inconsistent)
  */
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 1d9586a78041..09e3e80b0528 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -837,7 +837,7 @@ static inline void inode_fake_hash(struct inode *inode)
 }
 
 /*
- * inode->i_mutex nesting subclasses for the lock validator:
+ * inode->i_rwsem nesting subclasses for the lock validator:
  *
  * 0: the object of the current VFS operation
  * 1: parent
@@ -989,7 +989,7 @@ static inline loff_t i_size_read(const struct inode *inode)
 
 /*
  * NOTE: unlike i_size_read(), i_size_write() does need locking around it
- * (normally i_mutex), otherwise on 32bit/SMP an update of i_size_seqcount
+ * (normally i_rwsem), otherwise on 32bit/SMP an update of i_size_seqcount
  * can be lost, resulting in subsequent i_size_read() calls spinning forever.
  */
 static inline void i_size_write(struct inode *inode, loff_t i_size)
@@ -1921,7 +1921,7 @@ static inline void sb_end_intwrite(struct super_block *sb)
  * freeze protection should be the outermost lock. In particular, we have:
  *
  * sb_start_write
- *   -> i_mutex			(write path, truncate, directory ops, ...)
+ *   -> i_rwsem			(write path, truncate, directory ops, ...)
  *   -> s_umount		(freeze_super, thaw_super)
  */
 static inline void sb_start_write(struct super_block *sb)
diff --git a/include/linux/fs_stack.h b/include/linux/fs_stack.h
index 2b1f74b24070..0cc2fa283305 100644
--- a/include/linux/fs_stack.h
+++ b/include/linux/fs_stack.h
@@ -3,7 +3,7 @@
 #define _LINUX_FS_STACK_H
 
 /* This file defines generic functions used primarily by stackable
- * filesystems; none of these functions require i_mutex to be held.
+ * filesystems; none of these functions require i_rwsem to be held.
  */
 
 #include <linux/fs.h>
diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
index 06cc8888199e..c334f82ed385 100644
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -19,7 +19,7 @@ static inline struct quota_info *sb_dqopt(struct super_block *sb)
 	return &sb->s_dquot;
 }
 
-/* i_mutex must being held */
+/* i_rwsem must be held */
 static inline bool is_quota_modification(struct mnt_idmap *idmap,
 					 struct inode *inode, struct iattr *ia)
 {
-- 
cgit v1.2.3


From cbe4134ea4bc493239786220bd69cb8a13493190 Mon Sep 17 00:00:00 2001
From: Shivank Garg <shivankg@amd.com>
Date: Fri, 20 Jun 2025 07:03:30 +0000
Subject: fs: export anon_inode_make_secure_inode() and fix secretmem LSM
 bypass

Export anon_inode_make_secure_inode() to allow KVM guest_memfd to create
anonymous inodes with proper security context. This replaces the current
pattern of calling alloc_anon_inode() followed by
inode_init_security_anon() for creating security context manually.

This change also fixes a security regression in secretmem where the
S_PRIVATE flag was not cleared after alloc_anon_inode(), causing
LSM/SELinux checks to be bypassed for secretmem file descriptors.

As guest_memfd currently resides in the KVM module, we need to export this
symbol for use outside the core kernel. In the future, guest_memfd might be
moved to core-mm, at which point the symbols no longer would have to be
exported. When/if that happens is still unclear.

Fixes: 2bfe15c52612 ("mm: create security context for memfd_secret inodes")
Suggested-by: David Hildenbrand <david@redhat.com>
Suggested-by: Mike Rapoport <rppt@kernel.org>
Signed-off-by: Shivank Garg <shivankg@amd.com>
Link: https://lore.kernel.org/20250620070328.803704-3-shivankg@amd.com
Acked-by: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/anon_inodes.c   | 23 ++++++++++++++++++-----
 include/linux/fs.h |  2 ++
 mm/secretmem.c     |  9 +--------
 3 files changed, 21 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index e51e7d88980a..1d847a939f29 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -98,14 +98,25 @@ static struct file_system_type anon_inode_fs_type = {
 	.kill_sb	= kill_anon_super,
 };
 
-static struct inode *anon_inode_make_secure_inode(
-	const char *name,
-	const struct inode *context_inode)
+/**
+ * anon_inode_make_secure_inode - allocate an anonymous inode with security context
+ * @sb:		[in]	Superblock to allocate from
+ * @name:	[in]	Name of the class of the newfile (e.g., "secretmem")
+ * @context_inode:
+ *		[in]	Optional parent inode for security inheritance
+ *
+ * The function ensures proper security initialization through the LSM hook
+ * security_inode_init_security_anon().
+ *
+ * Return:	Pointer to new inode on success, ERR_PTR on failure.
+ */
+struct inode *anon_inode_make_secure_inode(struct super_block *sb, const char *name,
+					   const struct inode *context_inode)
 {
 	struct inode *inode;
 	int error;
 
-	inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
+	inode = alloc_anon_inode(sb);
 	if (IS_ERR(inode))
 		return inode;
 	inode->i_flags &= ~S_PRIVATE;
@@ -118,6 +129,7 @@ static struct inode *anon_inode_make_secure_inode(
 	}
 	return inode;
 }
+EXPORT_SYMBOL_GPL_FOR_MODULES(anon_inode_make_secure_inode, "kvm");
 
 static struct file *__anon_inode_getfile(const char *name,
 					 const struct file_operations *fops,
@@ -132,7 +144,8 @@ static struct file *__anon_inode_getfile(const char *name,
 		return ERR_PTR(-ENOENT);
 
 	if (make_inode) {
-		inode =	anon_inode_make_secure_inode(name, context_inode);
+		inode =	anon_inode_make_secure_inode(anon_inode_mnt->mnt_sb,
+						     name, context_inode);
 		if (IS_ERR(inode)) {
 			file = ERR_CAST(inode);
 			goto err;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4ec77da65f14..3a8e643c4279 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3606,6 +3606,8 @@ extern int simple_write_begin(struct file *file, struct address_space *mapping,
 extern const struct address_space_operations ram_aops;
 extern int always_delete_dentry(const struct dentry *);
 extern struct inode *alloc_anon_inode(struct super_block *);
+struct inode *anon_inode_make_secure_inode(struct super_block *sb, const char *name,
+					   const struct inode *context_inode);
 extern int simple_nosetlease(struct file *, int, struct file_lease **, void **);
 extern const struct dentry_operations simple_dentry_operations;
 
diff --git a/mm/secretmem.c b/mm/secretmem.c
index 589b26c2d553..9a11a38a6770 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -195,18 +195,11 @@ static struct file *secretmem_file_create(unsigned long flags)
 	struct file *file;
 	struct inode *inode;
 	const char *anon_name = "[secretmem]";
-	int err;
 
-	inode = alloc_anon_inode(secretmem_mnt->mnt_sb);
+	inode = anon_inode_make_secure_inode(secretmem_mnt->mnt_sb, anon_name, NULL);
 	if (IS_ERR(inode))
 		return ERR_CAST(inode);
 
-	err = security_inode_init_security_anon(inode, &QSTR(anon_name), NULL);
-	if (err) {
-		file = ERR_PTR(err);
-		goto err_free_inode;
-	}
-
 	file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem",
 				 O_RDWR, &secretmem_fops);
 	if (IS_ERR(file))
-- 
cgit v1.2.3


From 0c40d7cb5ef3af260e8c7f88e0e5d7ae15d6ce57 Mon Sep 17 00:00:00 2001
From: Zhang Yi <yi.zhang@huawei.com>
Date: Thu, 19 Jun 2025 19:17:58 +0800
Subject: block: introduce max_{hw|user}_wzeroes_unmap_sectors to queue limits

Currently, disks primarily implement the write zeroes command (aka
REQ_OP_WRITE_ZEROES) through two mechanisms: the first involves
physically writing zeros to the disk media (e.g., HDDs), while the
second performs an unmap operation on the logical blocks, effectively
putting them into a deallocated state (e.g., SSDs). The first method is
generally slow, while the second method is typically very fast.

For example, on certain NVMe SSDs that support NVME_NS_DEAC, submitting
REQ_OP_WRITE_ZEROES requests with the NVME_WZ_DEAC bit can accelerate
the write zeros operation by placing disk blocks into a deallocated
state, which opportunistically avoids writing zeroes to media while
still guaranteeing that subsequent reads from the specified block range
will return zeroed data. This is a best-effort optimization, not a
mandatory requirement, some devices may partially fall back to writing
physical zeroes due to factors such as misalignment or being asked to
clear a block range smaller than the device's internal allocation unit.
Therefore, the speed of this operation is not guaranteed.

It is difficult to determine whether the storage device supports unmap
write zeroes operation. We cannot determine this by only querying
bdev_limits(bdev)->max_write_zeroes_sectors. Therefore, first, add a new
hardware queue limit parameter, max_hw_wzeroes_unmap_sectors, to
indicate whether a device supports this unmap write zeroes operation.
Then, add two new counterpart software queue limits,
max_wzeroes_unmap_sectors and max_user_wzeroes_unmap_sectors, which
allow users to disable this operation if the speed is very slow on some
special devices.

Finally, for the stacked devices cases, initialize these two parameters
to UINT_MAX. This operation should be enabled by both the stacking
driver and all underlying devices.

Thanks to Martin K. Petersen for optimizing the documentation of the
write_zeroes_unmap sysfs interface.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Link: https://lore.kernel.org/20250619111806.3546162-2-yi.zhang@huaweicloud.com
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Martin K. Petersen" <martin.petersen@oracle.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 Documentation/ABI/stable/sysfs-block | 33 +++++++++++++++++++++++++++++++++
 block/blk-settings.c                 | 20 ++++++++++++++++++--
 block/blk-sysfs.c                    | 26 ++++++++++++++++++++++++++
 include/linux/blkdev.h               | 10 ++++++++++
 4 files changed, 87 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
index 4ba771b56b3b..803f578dc023 100644
--- a/Documentation/ABI/stable/sysfs-block
+++ b/Documentation/ABI/stable/sysfs-block
@@ -778,6 +778,39 @@ Description:
 		0, write zeroes is not supported by the device.
 
 
+What:		/sys/block/<disk>/queue/write_zeroes_unmap_max_hw_bytes
+Date:		January 2025
+Contact:	Zhang Yi <yi.zhang@huawei.com>
+Description:
+		[RO] This file indicates whether a device supports zeroing data
+		in a specified block range without incurring the cost of
+		physically writing zeroes to the media for each individual
+		block. If this parameter is set to write_zeroes_max_bytes, the
+		device implements a zeroing operation which opportunistically
+		avoids writing zeroes to media while still guaranteeing that
+		subsequent reads from the specified block range will return
+		zeroed data. This operation is a best-effort optimization, a
+		device may fall back to physically writing zeroes to the media
+		due to other factors such as misalignment or being asked to
+		clear a block range smaller than the device's internal
+		allocation unit. If this parameter is set to 0, the device may
+		have to write each logical block media during a zeroing
+		operation.
+
+
+What:		/sys/block/<disk>/queue/write_zeroes_unmap_max_bytes
+Date:		January 2025
+Contact:	Zhang Yi <yi.zhang@huawei.com>
+Description:
+		[RW] While write_zeroes_unmap_max_hw_bytes is the hardware limit
+		for the device, this setting is the software limit. Since the
+		unmap write zeroes operation is a best-effort optimization, some
+		devices may still physically write zeroes to media. So the
+		speed of this operation is not guaranteed. Writing a value of
+		'0' to this file disables this operation. Otherwise, this
+		parameter should be equal to write_zeroes_unmap_max_hw_bytes.
+
+
 What:		/sys/block/<disk>/queue/zone_append_max_bytes
 Date:		May 2020
 Contact:	linux-block@vger.kernel.org
diff --git a/block/blk-settings.c b/block/blk-settings.c
index a000daafbfb4..b5c75f0ac3e9 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -50,6 +50,8 @@ void blk_set_stacking_limits(struct queue_limits *lim)
 	lim->max_sectors = UINT_MAX;
 	lim->max_dev_sectors = UINT_MAX;
 	lim->max_write_zeroes_sectors = UINT_MAX;
+	lim->max_hw_wzeroes_unmap_sectors = UINT_MAX;
+	lim->max_user_wzeroes_unmap_sectors = UINT_MAX;
 	lim->max_hw_zone_append_sectors = UINT_MAX;
 	lim->max_user_discard_sectors = UINT_MAX;
 }
@@ -333,6 +335,12 @@ int blk_validate_limits(struct queue_limits *lim)
 	if (!lim->max_segments)
 		lim->max_segments = BLK_MAX_SEGMENTS;
 
+	if (lim->max_hw_wzeroes_unmap_sectors &&
+	    lim->max_hw_wzeroes_unmap_sectors != lim->max_write_zeroes_sectors)
+		return -EINVAL;
+	lim->max_wzeroes_unmap_sectors = min(lim->max_hw_wzeroes_unmap_sectors,
+			lim->max_user_wzeroes_unmap_sectors);
+
 	lim->max_discard_sectors =
 		min(lim->max_hw_discard_sectors, lim->max_user_discard_sectors);
 
@@ -418,10 +426,11 @@ int blk_set_default_limits(struct queue_limits *lim)
 {
 	/*
 	 * Most defaults are set by capping the bounds in blk_validate_limits,
-	 * but max_user_discard_sectors is special and needs an explicit
-	 * initialization to the max value here.
+	 * but these limits are special and need an explicit initialization to
+	 * the max value here.
 	 */
 	lim->max_user_discard_sectors = UINT_MAX;
+	lim->max_user_wzeroes_unmap_sectors = UINT_MAX;
 	return blk_validate_limits(lim);
 }
 
@@ -708,6 +717,13 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 	t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors);
 	t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors,
 					b->max_write_zeroes_sectors);
+	t->max_user_wzeroes_unmap_sectors =
+			min(t->max_user_wzeroes_unmap_sectors,
+			    b->max_user_wzeroes_unmap_sectors);
+	t->max_hw_wzeroes_unmap_sectors =
+			min(t->max_hw_wzeroes_unmap_sectors,
+			    b->max_hw_wzeroes_unmap_sectors);
+
 	t->max_hw_zone_append_sectors = min(t->max_hw_zone_append_sectors,
 					b->max_hw_zone_append_sectors);
 
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index b2b9b89d6967..48c7ecbb531f 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -161,6 +161,8 @@ static ssize_t queue_##_field##_show(struct gendisk *disk, char *page)	\
 QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_discard_sectors)
 QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_hw_discard_sectors)
 QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_write_zeroes_sectors)
+QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_hw_wzeroes_unmap_sectors)
+QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_wzeroes_unmap_sectors)
 QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_max_sectors)
 QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_boundary_sectors)
 QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_zone_append_sectors)
@@ -205,6 +207,24 @@ static int queue_max_discard_sectors_store(struct gendisk *disk,
 	return 0;
 }
 
+static int queue_max_wzeroes_unmap_sectors_store(struct gendisk *disk,
+		const char *page, size_t count, struct queue_limits *lim)
+{
+	unsigned long max_zeroes_bytes, max_hw_zeroes_bytes;
+	ssize_t ret;
+
+	ret = queue_var_store(&max_zeroes_bytes, page, count);
+	if (ret < 0)
+		return ret;
+
+	max_hw_zeroes_bytes = lim->max_hw_wzeroes_unmap_sectors << SECTOR_SHIFT;
+	if (max_zeroes_bytes != 0 && max_zeroes_bytes != max_hw_zeroes_bytes)
+		return -EINVAL;
+
+	lim->max_user_wzeroes_unmap_sectors = max_zeroes_bytes >> SECTOR_SHIFT;
+	return 0;
+}
+
 static int
 queue_max_sectors_store(struct gendisk *disk, const char *page, size_t count,
 		struct queue_limits *lim)
@@ -514,6 +534,10 @@ QUEUE_LIM_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes");
 
 QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes");
 QUEUE_LIM_RO_ENTRY(queue_max_write_zeroes_sectors, "write_zeroes_max_bytes");
+QUEUE_LIM_RO_ENTRY(queue_max_hw_wzeroes_unmap_sectors,
+		"write_zeroes_unmap_max_hw_bytes");
+QUEUE_LIM_RW_ENTRY(queue_max_wzeroes_unmap_sectors,
+		"write_zeroes_unmap_max_bytes");
 QUEUE_LIM_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes");
 QUEUE_LIM_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity");
 
@@ -662,6 +686,8 @@ static struct attribute *queue_attrs[] = {
 	&queue_atomic_write_unit_min_entry.attr,
 	&queue_atomic_write_unit_max_entry.attr,
 	&queue_max_write_zeroes_sectors_entry.attr,
+	&queue_max_hw_wzeroes_unmap_sectors_entry.attr,
+	&queue_max_wzeroes_unmap_sectors_entry.attr,
 	&queue_max_zone_append_sectors_entry.attr,
 	&queue_zone_write_granularity_entry.attr,
 	&queue_rotational_entry.attr,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a59880c809c7..1a5725c1f93d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -383,6 +383,9 @@ struct queue_limits {
 	unsigned int		max_user_discard_sectors;
 	unsigned int		max_secure_erase_sectors;
 	unsigned int		max_write_zeroes_sectors;
+	unsigned int		max_wzeroes_unmap_sectors;
+	unsigned int		max_hw_wzeroes_unmap_sectors;
+	unsigned int		max_user_wzeroes_unmap_sectors;
 	unsigned int		max_hw_zone_append_sectors;
 	unsigned int		max_zone_append_sectors;
 	unsigned int		discard_granularity;
@@ -1042,6 +1045,7 @@ static inline void blk_queue_disable_secure_erase(struct request_queue *q)
 static inline void blk_queue_disable_write_zeroes(struct request_queue *q)
 {
 	q->limits.max_write_zeroes_sectors = 0;
+	q->limits.max_wzeroes_unmap_sectors = 0;
 }
 
 /*
@@ -1378,6 +1382,12 @@ static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev)
 	return bdev_limits(bdev)->max_write_zeroes_sectors;
 }
 
+static inline unsigned int
+bdev_write_zeroes_unmap_sectors(struct block_device *bdev)
+{
+	return bdev_limits(bdev)->max_wzeroes_unmap_sectors;
+}
+
 static inline bool bdev_nonrot(struct block_device *bdev)
 {
 	return blk_queue_nonrot(bdev_get_queue(bdev));
-- 
cgit v1.2.3


From 7bd43cc79cab3850f34da0a31d5b042b701590ef Mon Sep 17 00:00:00 2001
From: Zhang Yi <yi.zhang@huawei.com>
Date: Thu, 19 Jun 2025 19:18:03 +0800
Subject: fs: introduce FALLOC_FL_WRITE_ZEROES to fallocate

With the development of flash-based storage devices, we can quickly
write zeros to SSDs using the WRITE_ZERO command if the devices do not
actually write physical zeroes to the media. Therefore, we can use this
command to quickly preallocate a real all-zero file with written
extents. This approach should be beneficial for subsequent pure
overwriting within this file, as it can save on block allocation and,
consequently, significant metadata changes, which should greatly improve
overwrite performance on certain filesystems.

Therefore, introduce a new operation FALLOC_FL_WRITE_ZEROES to
fallocate. This flag is used to convert a specified range of a file to
zeros by issuing a zeroing operation. Blocks should be allocated for the
regions that span holes in the file, and the entire range is converted
to written extents. If the underlying device supports the actual offload
write zeroes command, the process of zeroing out operation can be
accelerated. If it does not, we currently don't prevent the file system
from writing actual zeros to the device. This provides users with a new
method to quickly generate a zeroed file, users no longer need to write
zero data to create a file with written extents.

Users can determine whether a disk supports the unmap write zeroes
feature through querying this sysfs interface:

    /sys/block/<disk>/queue/write_zeroes_unmap_max_hw_bytes

Users can also enable or disable the unmap write zeroes operation
through this sysfs interface:

    /sys/block/<disk>/queue/write_zeroes_unmap_max_bytes

Finally, this flag cannot be specified in conjunction with the
FALLOC_FL_KEEP_SIZE since allocating written extents beyond file EOF is
not permitted. In addition, filesystems that always require out-of-place
writes should not support this flag since they still need to allocate
new blocks during subsequent overwrites.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Link: https://lore.kernel.org/20250619111806.3546162-7-yi.zhang@huaweicloud.com
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Martin K. Petersen" <martin.petersen@oracle.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/open.c                   |  1 +
 include/linux/falloc.h      |  3 ++-
 include/uapi/linux/falloc.h | 17 +++++++++++++++++
 3 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/fs/open.c b/fs/open.c
index 7828234a7caa..b777e11e5522 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -281,6 +281,7 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 		break;
 	case FALLOC_FL_COLLAPSE_RANGE:
 	case FALLOC_FL_INSERT_RANGE:
+	case FALLOC_FL_WRITE_ZEROES:
 		if (mode & FALLOC_FL_KEEP_SIZE)
 			return -EOPNOTSUPP;
 		break;
diff --git a/include/linux/falloc.h b/include/linux/falloc.h
index 3f49f3df6af5..7c38c6b76b60 100644
--- a/include/linux/falloc.h
+++ b/include/linux/falloc.h
@@ -36,7 +36,8 @@ struct space_resv {
 				 FALLOC_FL_COLLAPSE_RANGE |	\
 				 FALLOC_FL_ZERO_RANGE |		\
 				 FALLOC_FL_INSERT_RANGE |	\
-				 FALLOC_FL_UNSHARE_RANGE)
+				 FALLOC_FL_UNSHARE_RANGE |	\
+				 FALLOC_FL_WRITE_ZEROES)
 
 /* on ia32 l_start is on a 32-bit boundary */
 #if defined(CONFIG_X86_64)
diff --git a/include/uapi/linux/falloc.h b/include/uapi/linux/falloc.h
index 5810371ed72b..1f9ca757d02d 100644
--- a/include/uapi/linux/falloc.h
+++ b/include/uapi/linux/falloc.h
@@ -78,4 +78,21 @@
  */
 #define FALLOC_FL_UNSHARE_RANGE		0x40
 
+/*
+ * FALLOC_FL_WRITE_ZEROES zeroes a specified file range in such a way that
+ * subsequent writes to that range do not require further changes to the file
+ * mapping metadata. This flag is beneficial for subsequent pure overwriting
+ * within this range, as it can save on block allocation and, consequently,
+ * significant metadata changes. Therefore, filesystems that always require
+ * out-of-place writes should not support this flag.
+ *
+ * Different filesystems may implement different limitations on the
+ * granularity of the zeroing operation. Most will preferably be accelerated
+ * by submitting write zeroes command if the backing storage supports, which
+ * may not physically write zeros to the media.
+ *
+ * This flag cannot be specified in conjunction with the FALLOC_FL_KEEP_SIZE.
+ */
+#define FALLOC_FL_WRITE_ZEROES		0x80
+
 #endif /* _UAPI_FALLOC_H_ */
-- 
cgit v1.2.3


From f4265b8d32c49ff95711e6fef7d05245a2905b30 Mon Sep 17 00:00:00 2001
From: Zhang Yi <yi.zhang@huawei.com>
Date: Thu, 19 Jun 2025 19:18:06 +0800
Subject: ext4: add FALLOC_FL_WRITE_ZEROES support

Add support for FALLOC_FL_WRITE_ZEROES if the underlying device enables
the unmap write zeroes operation. This first allocates blocks as
unwritten, then issues a zero command outside of the running journal
handle, and finally converts them to a written state.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Link: https://lore.kernel.org/20250619111806.3546162-10-yi.zhang@huaweicloud.com
Reviewed-by: "Martin K. Petersen" <martin.petersen@oracle.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/ext4/extents.c           | 66 +++++++++++++++++++++++++++++++++++++--------
 include/trace/events/ext4.h |  3 ++-
 2 files changed, 57 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index b543a46fc809..b43aa82c1b39 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4501,6 +4501,8 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
 	struct ext4_map_blocks map;
 	unsigned int credits;
 	loff_t epos, old_size = i_size_read(inode);
+	unsigned int blkbits = inode->i_blkbits;
+	bool alloc_zero = false;
 
 	BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS));
 	map.m_lblk = offset;
@@ -4513,6 +4515,17 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
 	if (len <= EXT_UNWRITTEN_MAX_LEN)
 		flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
 
+	/*
+	 * Doing the actual write zero during a running journal transaction
+	 * costs a lot. First allocate an unwritten extent and then
+	 * convert it to written after zeroing it out.
+	 */
+	if (flags & EXT4_GET_BLOCKS_ZERO) {
+		flags &= ~EXT4_GET_BLOCKS_ZERO;
+		flags |= EXT4_GET_BLOCKS_UNWRIT_EXT;
+		alloc_zero = true;
+	}
+
 	/*
 	 * credits to insert 1 extent into extent tree
 	 */
@@ -4549,9 +4562,7 @@ retry:
 		 * allow a full retry cycle for any remaining allocations
 		 */
 		retries = 0;
-		map.m_lblk += ret;
-		map.m_len = len = len - ret;
-		epos = (loff_t)map.m_lblk << inode->i_blkbits;
+		epos = (loff_t)(map.m_lblk + ret) << blkbits;
 		inode_set_ctime_current(inode);
 		if (new_size) {
 			if (epos > new_size)
@@ -4571,6 +4582,21 @@ retry:
 		ret2 = ret3 ? ret3 : ret2;
 		if (unlikely(ret2))
 			break;
+
+		if (alloc_zero &&
+		    (map.m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN))) {
+			ret2 = ext4_issue_zeroout(inode, map.m_lblk, map.m_pblk,
+						  map.m_len);
+			if (likely(!ret2))
+				ret2 = ext4_convert_unwritten_extents(NULL,
+					inode, (loff_t)map.m_lblk << blkbits,
+					(loff_t)map.m_len << blkbits);
+			if (ret2)
+				break;
+		}
+
+		map.m_lblk += ret;
+		map.m_len = len = len - ret;
 	}
 	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
 		goto retry;
@@ -4636,7 +4662,11 @@ static long ext4_zero_range(struct file *file, loff_t offset,
 	if (end_lblk > start_lblk) {
 		ext4_lblk_t zero_blks = end_lblk - start_lblk;
 
-		flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | EXT4_EX_NOCACHE);
+		if (mode & FALLOC_FL_WRITE_ZEROES)
+			flags = EXT4_GET_BLOCKS_CREATE_ZERO | EXT4_EX_NOCACHE;
+		else
+			flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
+				  EXT4_EX_NOCACHE);
 		ret = ext4_alloc_file_blocks(file, start_lblk, zero_blks,
 					     new_size, flags);
 		if (ret)
@@ -4745,11 +4775,18 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	if (IS_ENCRYPTED(inode) &&
 	    (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
 		return -EOPNOTSUPP;
+	/*
+	 * Don't allow writing zeroes if the underlying device does not
+	 * enable the unmap write zeroes operation.
+	 */
+	if ((mode & FALLOC_FL_WRITE_ZEROES) &&
+	    !bdev_write_zeroes_unmap_sectors(inode->i_sb->s_bdev))
+		return -EOPNOTSUPP;
 
 	/* Return error if mode is not supported */
 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
-		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
-		     FALLOC_FL_INSERT_RANGE))
+		     FALLOC_FL_ZERO_RANGE | FALLOC_FL_COLLAPSE_RANGE |
+		     FALLOC_FL_INSERT_RANGE | FALLOC_FL_WRITE_ZEROES))
 		return -EOPNOTSUPP;
 
 	inode_lock(inode);
@@ -4780,16 +4817,23 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	if (ret)
 		goto out_invalidate_lock;
 
-	if (mode & FALLOC_FL_PUNCH_HOLE)
+	switch (mode & FALLOC_FL_MODE_MASK) {
+	case FALLOC_FL_PUNCH_HOLE:
 		ret = ext4_punch_hole(file, offset, len);
-	else if (mode & FALLOC_FL_COLLAPSE_RANGE)
+		break;
+	case FALLOC_FL_COLLAPSE_RANGE:
 		ret = ext4_collapse_range(file, offset, len);
-	else if (mode & FALLOC_FL_INSERT_RANGE)
+		break;
+	case FALLOC_FL_INSERT_RANGE:
 		ret = ext4_insert_range(file, offset, len);
-	else if (mode & FALLOC_FL_ZERO_RANGE)
+		break;
+	case FALLOC_FL_ZERO_RANGE:
+	case FALLOC_FL_WRITE_ZEROES:
 		ret = ext4_zero_range(file, offset, len, mode);
-	else
+		break;
+	default:
 		ret = -EOPNOTSUPP;
+	}
 
 out_invalidate_lock:
 	filemap_invalidate_unlock(mapping);
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 156908641e68..6f9cf2811733 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -92,7 +92,8 @@ TRACE_DEFINE_ENUM(ES_REFERENCED_B);
 	{ FALLOC_FL_KEEP_SIZE,		"KEEP_SIZE"},		\
 	{ FALLOC_FL_PUNCH_HOLE,		"PUNCH_HOLE"},		\
 	{ FALLOC_FL_COLLAPSE_RANGE,	"COLLAPSE_RANGE"},	\
-	{ FALLOC_FL_ZERO_RANGE,		"ZERO_RANGE"})
+	{ FALLOC_FL_ZERO_RANGE,		"ZERO_RANGE"},		\
+	{ FALLOC_FL_WRITE_ZEROES,	"WRITE_ZEROES"})
 
 TRACE_DEFINE_ENUM(EXT4_FC_REASON_XATTR);
 TRACE_DEFINE_ENUM(EXT4_FC_REASON_CROSS_RENAME);
-- 
cgit v1.2.3


From b872f562c8cef59743993b48eb458c2d87c1651e Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 23 Jun 2025 19:11:50 +0800
Subject: dm-crypt: Extend state buffer size in crypt_iv_lmk_one

Add a macro CRYPTO_MD5_STATESIZE for the Crypto API export state
size of md5 and use that in dm-crypt instead of relying on the
size of struct md5_state (the latter is currently undergoing a
transition and may shrink).

This commit fixes a crash on 32-bit machines:
Oops: Oops: 0000 [#1] SMP
CPU: 1 UID: 0 PID: 12 Comm: kworker/u16:0 Not tainted 6.16.0-rc2+ #993 PREEMPT(full)
Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 11/12/2020
Workqueue: kcryptd-254:0-1 kcryptd_crypt [dm_crypt]
EIP: __crypto_shash_export+0xf/0x90
Code: 4a c1 c7 40 20 a0 b4 4a c1 81 cf 0e 00 04 08 89 78 50 e9 2b ff ff ff 8d 74 26 00 55 89 e5 57 56 53 89 c3 89 d6 8b 00 8b 40 14 <8b> 50 fc f6 40 13 01 74 04 4a 2b 50 14 85 c9 74 10 89 f2 89 d8 ff
EAX: 303a3435 EBX: c3007c90 ECX: 00000000 EDX: c3007c38
ESI: c3007c38 EDI: c3007c90 EBP: c3007bfc ESP: c3007bf0
DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 EFLAGS: 00010216
CR0: 80050033 CR2: 303a3431 CR3: 04fbe000 CR4: 00350e90
Call Trace:
 crypto_shash_export+0x65/0xc0
 crypt_iv_lmk_one+0x106/0x1a0 [dm_crypt]

Fixes: efd62c85525e ("crypto: md5-generic - Use API partial block handling")
Reported-by: Milan Broz <gmazyland@gmail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Tested-by: Milan Broz <gmazyland@gmail.com>
Closes: https://lore.kernel.org/linux-crypto/f1625ddc-e82e-4b77-80c2-dc8e45b54848@gmail.com/T/
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
---
 drivers/md/dm-crypt.c | 11 +++++++----
 include/crypto/hash.h |  2 ++
 include/crypto/md5.h  |  4 ++++
 3 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 9dfdb63220d7..17157c4216a5 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -517,7 +517,10 @@ static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv,
 {
 	struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk;
 	SHASH_DESC_ON_STACK(desc, lmk->hash_tfm);
-	struct md5_state md5state;
+	union {
+		struct md5_state md5state;
+		u8 state[CRYPTO_MD5_STATESIZE];
+	} u;
 	__le32 buf[4];
 	int i, r;
 
@@ -548,13 +551,13 @@ static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv,
 		return r;
 
 	/* No MD5 padding here */
-	r = crypto_shash_export(desc, &md5state);
+	r = crypto_shash_export(desc, &u.md5state);
 	if (r)
 		return r;
 
 	for (i = 0; i < MD5_HASH_WORDS; i++)
-		__cpu_to_le32s(&md5state.hash[i]);
-	memcpy(iv, &md5state.hash, cc->iv_size);
+		__cpu_to_le32s(&u.md5state.hash[i]);
+	memcpy(iv, &u.md5state.hash, cc->iv_size);
 
 	return 0;
 }
diff --git a/include/crypto/hash.h b/include/crypto/hash.h
index 6f6b9de12cd3..db294d452e8c 100644
--- a/include/crypto/hash.h
+++ b/include/crypto/hash.h
@@ -202,6 +202,8 @@ struct shash_desc {
 #define HASH_REQUEST_CLONE(name, gfp) \
 	hash_request_clone(name, sizeof(__##name##_req), gfp)
 
+#define CRYPTO_HASH_STATESIZE(coresize, blocksize) (coresize + blocksize + 1)
+
 /**
  * struct shash_alg - synchronous message digest definition
  * @init: see struct ahash_alg
diff --git a/include/crypto/md5.h b/include/crypto/md5.h
index 198b5d69b92f..28ee533a0507 100644
--- a/include/crypto/md5.h
+++ b/include/crypto/md5.h
@@ -2,6 +2,7 @@
 #ifndef _CRYPTO_MD5_H
 #define _CRYPTO_MD5_H
 
+#include <crypto/hash.h>
 #include <linux/types.h>
 
 #define MD5_DIGEST_SIZE		16
@@ -15,6 +16,9 @@
 #define MD5_H2	0x98badcfeUL
 #define MD5_H3	0x10325476UL
 
+#define CRYPTO_MD5_STATESIZE \
+	CRYPTO_HASH_STATESIZE(MD5_STATE_SIZE, MD5_HMAC_BLOCK_SIZE)
+
 extern const u8 md5_zero_message_hash[MD5_DIGEST_SIZE];
 
 struct md5_state {
-- 
cgit v1.2.3


From a24cc6ce1933eade12aa2b9859de0fcd2dac2c06 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 23 Jun 2025 10:34:08 +0200
Subject: futex: Initialize futex_phash_new during fork().

During a hash resize operation the new private hash is stored in
mm_struct::futex_phash_new if the current hash can not be immediately
replaced.

The new hash must not be copied during fork() into the new task. Doing
so will lead to a double-free of the memory by the two tasks.

Initialize the mm_struct::futex_phash_new during fork().

Closes: https://lore.kernel.org/all/aFBQ8CBKmRzEqIfS@mozart.vkv.me/
Fixes: bd54df5ea7cad ("futex: Allow to resize the private local hash")
Reported-by: Calvin Owens <calvin@wbinvd.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Calvin Owens <calvin@wbinvd.org>
Link: https://lkml.kernel.org/r/20250623083408.jTiJiC6_@linutronix.de
---
 include/linux/futex.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/futex.h b/include/linux/futex.h
index 005b040c4791..b37193653e6b 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -89,6 +89,7 @@ void futex_hash_free(struct mm_struct *mm);
 static inline void futex_mm_init(struct mm_struct *mm)
 {
 	RCU_INIT_POINTER(mm->futex_phash, NULL);
+	mm->futex_phash_new = NULL;
 	mutex_init(&mm->futex_hash_lock);
 }
 
-- 
cgit v1.2.3


From 4d811e395bbe54ba2febb3940d4b6c4741f360a6 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 5 Jun 2025 11:48:33 -0600
Subject: io_uring: add IO_URING_F_INLINE issue flag

Set when the execution of the request is done inline from the system
call itself. Any deferred issue will never have this flag set.

Reviewed-by: Caleb Sander Mateos <csander@purestorage.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  2 ++
 io_uring/io_uring.c            | 12 +++++++-----
 2 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 2922635986f5..054c43c02c96 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -26,6 +26,8 @@ enum io_uring_cmd_flags {
 	IO_URING_F_MULTISHOT		= 4,
 	/* executed by io-wq */
 	IO_URING_F_IOWQ			= 8,
+	/* executed inline from syscall */
+	IO_URING_F_INLINE		= 16,
 	/* int's last bit, sign checks are usually faster than a bit test */
 	IO_URING_F_NONBLOCK		= INT_MIN,
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 5111ec040c53..c60d1c286c87 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -147,7 +147,7 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 					 bool cancel_all,
 					 bool is_sqpoll_thread);
 
-static void io_queue_sqe(struct io_kiocb *req);
+static void io_queue_sqe(struct io_kiocb *req, unsigned int extra_flags);
 static void __io_req_caches_free(struct io_ring_ctx *ctx);
 
 static __read_mostly DEFINE_STATIC_KEY_FALSE(io_key_has_sqarray);
@@ -1377,7 +1377,7 @@ void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw)
 	else if (req->flags & REQ_F_FORCE_ASYNC)
 		io_queue_iowq(req);
 	else
-		io_queue_sqe(req);
+		io_queue_sqe(req, 0);
 }
 
 void io_req_task_queue_fail(struct io_kiocb *req, int ret)
@@ -1960,12 +1960,14 @@ static void io_queue_async(struct io_kiocb *req, int ret)
 	}
 }
 
-static inline void io_queue_sqe(struct io_kiocb *req)
+static inline void io_queue_sqe(struct io_kiocb *req, unsigned int extra_flags)
 	__must_hold(&req->ctx->uring_lock)
 {
+	unsigned int issue_flags = IO_URING_F_NONBLOCK |
+				   IO_URING_F_COMPLETE_DEFER | extra_flags;
 	int ret;
 
-	ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
+	ret = io_issue_sqe(req, issue_flags);
 
 	/*
 	 * We async punt it if the file wasn't marked NOWAIT, or if the file
@@ -2221,7 +2223,7 @@ fallback:
 		return 0;
 	}
 
-	io_queue_sqe(req);
+	io_queue_sqe(req, IO_URING_F_INLINE);
 	return 0;
 }
 
-- 
cgit v1.2.3


From af19388a973877b2349df46c4487a789cd3148ed Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 5 Jun 2025 11:33:52 -0600
Subject: io_uring: add struct io_cold_def->sqe_copy() method

Will be called by the core of io_uring, if inline issue is not going
to be tried for a request. Opcodes can define this handler to defer
copying of SQE data that should remain stable.

Only called if IO_URING_F_INLINE is set. If it isn't set, then there's a
bug in the core handling of this, and -EFAULT will be returned instead
to terminate the request. This will trigger a WARN_ON_ONCE(). Don't
expect this to ever trigger, and down the line this can be removed.

Reviewed-by: Caleb Sander Mateos <csander@purestorage.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  3 +++
 io_uring/io_uring.c            | 27 +++++++++++++++++++++++++--
 io_uring/opdef.h               |  1 +
 3 files changed, 29 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 054c43c02c96..4ab3bdc103f2 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -504,6 +504,7 @@ enum {
 	REQ_F_BUF_NODE_BIT,
 	REQ_F_HAS_METADATA_BIT,
 	REQ_F_IMPORT_BUFFER_BIT,
+	REQ_F_SQE_COPIED_BIT,
 
 	/* not a real bit, just to check we're not overflowing the space */
 	__REQ_F_LAST_BIT,
@@ -593,6 +594,8 @@ enum {
 	 * For SEND_ZC, whether to import buffers (i.e. the first issue).
 	 */
 	REQ_F_IMPORT_BUFFER	= IO_REQ_FLAG(REQ_F_IMPORT_BUFFER_BIT),
+	/* ->sqe_copy() has been called, if necessary */
+	REQ_F_SQE_COPIED	= IO_REQ_FLAG(REQ_F_SQE_COPIED_BIT),
 };
 
 typedef void (*io_req_tw_func_t)(struct io_kiocb *req, io_tw_token_t tw);
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index c60d1c286c87..3a23c8713f1b 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1938,14 +1938,34 @@ struct file *io_file_get_normal(struct io_kiocb *req, int fd)
 	return file;
 }
 
-static void io_queue_async(struct io_kiocb *req, int ret)
+static int io_req_sqe_copy(struct io_kiocb *req, unsigned int issue_flags)
+{
+	const struct io_cold_def *def = &io_cold_defs[req->opcode];
+
+	if (req->flags & REQ_F_SQE_COPIED)
+		return 0;
+	req->flags |= REQ_F_SQE_COPIED;
+	if (!def->sqe_copy)
+		return 0;
+	if (WARN_ON_ONCE(!(issue_flags & IO_URING_F_INLINE)))
+		return -EFAULT;
+	def->sqe_copy(req);
+	return 0;
+}
+
+static void io_queue_async(struct io_kiocb *req, unsigned int issue_flags, int ret)
 	__must_hold(&req->ctx->uring_lock)
 {
 	if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) {
+fail:
 		io_req_defer_failed(req, ret);
 		return;
 	}
 
+	ret = io_req_sqe_copy(req, issue_flags);
+	if (unlikely(ret))
+		goto fail;
+
 	switch (io_arm_poll_handler(req, 0)) {
 	case IO_APOLL_READY:
 		io_kbuf_recycle(req, 0);
@@ -1974,7 +1994,7 @@ static inline void io_queue_sqe(struct io_kiocb *req, unsigned int extra_flags)
 	 * doesn't support non-blocking read/write attempts
 	 */
 	if (unlikely(ret))
-		io_queue_async(req, ret);
+		io_queue_async(req, issue_flags, ret);
 }
 
 static void io_queue_sqe_fallback(struct io_kiocb *req)
@@ -1989,6 +2009,8 @@ static void io_queue_sqe_fallback(struct io_kiocb *req)
 		req->flags |= REQ_F_LINK;
 		io_req_defer_failed(req, req->cqe.res);
 	} else {
+		/* can't fail with IO_URING_F_INLINE */
+		io_req_sqe_copy(req, IO_URING_F_INLINE);
 		if (unlikely(req->ctx->drain_active))
 			io_drain_req(req);
 		else
@@ -2200,6 +2222,7 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	 */
 	if (unlikely(link->head)) {
 		trace_io_uring_link(req, link->last);
+		io_req_sqe_copy(req, IO_URING_F_INLINE);
 		link->last->link = req;
 		link->last = req;
 
diff --git a/io_uring/opdef.h b/io_uring/opdef.h
index 719a52104abe..c2f0907ed78c 100644
--- a/io_uring/opdef.h
+++ b/io_uring/opdef.h
@@ -38,6 +38,7 @@ struct io_issue_def {
 struct io_cold_def {
 	const char		*name;
 
+	void (*sqe_copy)(struct io_kiocb *);
 	void (*cleanup)(struct io_kiocb *);
 	void (*fail)(struct io_kiocb *);
 };
-- 
cgit v1.2.3


From cb9ccfb404e700dc0db59d68242d79fe386bb3f0 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 13 Jun 2025 17:05:19 -0600
Subject: io_uring/nop: add IORING_NOP_TW completion flag

To test and profile the overhead of io_uring task_work and the various
types of it, add IORING_NOP_TW which tells nop to signal completions
through task_work rather than complete them inline.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h | 1 +
 io_uring/nop.c                | 8 +++++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index cfd17e382082..8c3d43caab02 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -449,6 +449,7 @@ enum io_uring_msg_ring_flags {
 #define IORING_NOP_FILE			(1U << 1)
 #define IORING_NOP_FIXED_FILE		(1U << 2)
 #define IORING_NOP_FIXED_BUFFER		(1U << 3)
+#define IORING_NOP_TW			(1U << 4)
 
 /*
  * IO completion data structure (Completion Queue Entry)
diff --git a/io_uring/nop.c b/io_uring/nop.c
index 6ac2de761fd3..20ed0f85b1c2 100644
--- a/io_uring/nop.c
+++ b/io_uring/nop.c
@@ -20,7 +20,8 @@ struct io_nop {
 };
 
 #define NOP_FLAGS	(IORING_NOP_INJECT_RESULT | IORING_NOP_FIXED_FILE | \
-			 IORING_NOP_FIXED_BUFFER | IORING_NOP_FILE)
+			 IORING_NOP_FIXED_BUFFER | IORING_NOP_FILE | \
+			 IORING_NOP_TW)
 
 int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
@@ -68,5 +69,10 @@ done:
 	if (ret < 0)
 		req_set_fail(req);
 	io_req_set_res(req, nop->result, 0);
+	if (nop->flags & IORING_NOP_TW) {
+		req->io_task_work.func = io_req_task_complete;
+		io_req_task_work_add(req);
+		return IOU_ISSUE_SKIP_COMPLETE;
+	}
 	return IOU_COMPLETE;
 }
-- 
cgit v1.2.3


From 1d6123102e9fbedc8d25bf4731da6d513173e49e Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@google.com>
Date: Tue, 17 Jun 2025 09:58:13 -0700
Subject: Bluetooth: hci_core: Fix use-after-free in vhci_flush()

syzbot reported use-after-free in vhci_flush() without repro. [0]

From the splat, a thread close()d a vhci file descriptor while
its device was being used by ioctl() on another thread.

Once the last fd refcnt is released, vhci_release() calls
hci_unregister_dev(), hci_free_dev(), and kfree() for struct
vhci_data, which is set to hci_dev->dev->driver_data.

The problem is that there is no synchronisation after unlinking
hdev from hci_dev_list in hci_unregister_dev().  There might be
another thread still accessing the hdev which was fetched before
the unlink operation.

We can use SRCU for such synchronisation.

Let's run hci_dev_reset() under SRCU and wait for its completion
in hci_unregister_dev().

Another option would be to restore hci_dev->destruct(), which was
removed in commit 587ae086f6e4 ("Bluetooth: Remove unused
hci-destruct cb").  However, this would not be a good solution, as
we should not run hci_unregister_dev() while there are in-flight
ioctl() requests, which could lead to another data-race KCSAN splat.

Note that other drivers seem to have the same problem, for example,
virtbt_remove().

[0]:
BUG: KASAN: slab-use-after-free in skb_queue_empty_lockless include/linux/skbuff.h:1891 [inline]
BUG: KASAN: slab-use-after-free in skb_queue_purge_reason+0x99/0x360 net/core/skbuff.c:3937
Read of size 8 at addr ffff88807cb8d858 by task syz.1.219/6718

CPU: 1 UID: 0 PID: 6718 Comm: syz.1.219 Not tainted 6.16.0-rc1-syzkaller-00196-g08207f42d3ff #0 PREEMPT(full)
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/07/2025
Call Trace:
 <TASK>
 dump_stack_lvl+0x189/0x250 lib/dump_stack.c:120
 print_address_description mm/kasan/report.c:408 [inline]
 print_report+0xd2/0x2b0 mm/kasan/report.c:521
 kasan_report+0x118/0x150 mm/kasan/report.c:634
 skb_queue_empty_lockless include/linux/skbuff.h:1891 [inline]
 skb_queue_purge_reason+0x99/0x360 net/core/skbuff.c:3937
 skb_queue_purge include/linux/skbuff.h:3368 [inline]
 vhci_flush+0x44/0x50 drivers/bluetooth/hci_vhci.c:69
 hci_dev_do_reset net/bluetooth/hci_core.c:552 [inline]
 hci_dev_reset+0x420/0x5c0 net/bluetooth/hci_core.c:592
 sock_do_ioctl+0xd9/0x300 net/socket.c:1190
 sock_ioctl+0x576/0x790 net/socket.c:1311
 vfs_ioctl fs/ioctl.c:51 [inline]
 __do_sys_ioctl fs/ioctl.c:907 [inline]
 __se_sys_ioctl+0xf9/0x170 fs/ioctl.c:893
 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
 do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94
 entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7fcf5b98e929
Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007fcf5c7b9038 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
RAX: ffffffffffffffda RBX: 00007fcf5bbb6160 RCX: 00007fcf5b98e929
RDX: 0000000000000000 RSI: 00000000400448cb RDI: 0000000000000009
RBP: 00007fcf5ba10b39 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 0000000000000000 R14: 00007fcf5bbb6160 R15: 00007ffd6353d528
 </TASK>

Allocated by task 6535:
 kasan_save_stack mm/kasan/common.c:47 [inline]
 kasan_save_track+0x3e/0x80 mm/kasan/common.c:68
 poison_kmalloc_redzone mm/kasan/common.c:377 [inline]
 __kasan_kmalloc+0x93/0xb0 mm/kasan/common.c:394
 kasan_kmalloc include/linux/kasan.h:260 [inline]
 __kmalloc_cache_noprof+0x230/0x3d0 mm/slub.c:4359
 kmalloc_noprof include/linux/slab.h:905 [inline]
 kzalloc_noprof include/linux/slab.h:1039 [inline]
 vhci_open+0x57/0x360 drivers/bluetooth/hci_vhci.c:635
 misc_open+0x2bc/0x330 drivers/char/misc.c:161
 chrdev_open+0x4c9/0x5e0 fs/char_dev.c:414
 do_dentry_open+0xdf0/0x1970 fs/open.c:964
 vfs_open+0x3b/0x340 fs/open.c:1094
 do_open fs/namei.c:3887 [inline]
 path_openat+0x2ee5/0x3830 fs/namei.c:4046
 do_filp_open+0x1fa/0x410 fs/namei.c:4073
 do_sys_openat2+0x121/0x1c0 fs/open.c:1437
 do_sys_open fs/open.c:1452 [inline]
 __do_sys_openat fs/open.c:1468 [inline]
 __se_sys_openat fs/open.c:1463 [inline]
 __x64_sys_openat+0x138/0x170 fs/open.c:1463
 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
 do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94
 entry_SYSCALL_64_after_hwframe+0x77/0x7f

Freed by task 6535:
 kasan_save_stack mm/kasan/common.c:47 [inline]
 kasan_save_track+0x3e/0x80 mm/kasan/common.c:68
 kasan_save_free_info+0x46/0x50 mm/kasan/generic.c:576
 poison_slab_object mm/kasan/common.c:247 [inline]
 __kasan_slab_free+0x62/0x70 mm/kasan/common.c:264
 kasan_slab_free include/linux/kasan.h:233 [inline]
 slab_free_hook mm/slub.c:2381 [inline]
 slab_free mm/slub.c:4643 [inline]
 kfree+0x18e/0x440 mm/slub.c:4842
 vhci_release+0xbc/0xd0 drivers/bluetooth/hci_vhci.c:671
 __fput+0x44c/0xa70 fs/file_table.c:465
 task_work_run+0x1d1/0x260 kernel/task_work.c:227
 exit_task_work include/linux/task_work.h:40 [inline]
 do_exit+0x6ad/0x22e0 kernel/exit.c:955
 do_group_exit+0x21c/0x2d0 kernel/exit.c:1104
 __do_sys_exit_group kernel/exit.c:1115 [inline]
 __se_sys_exit_group kernel/exit.c:1113 [inline]
 __x64_sys_exit_group+0x3f/0x40 kernel/exit.c:1113
 x64_sys_call+0x21ba/0x21c0 arch/x86/include/generated/asm/syscalls_64.h:232
 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
 do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94
 entry_SYSCALL_64_after_hwframe+0x77/0x7f

The buggy address belongs to the object at ffff88807cb8d800
 which belongs to the cache kmalloc-1k of size 1024
The buggy address is located 88 bytes inside of
 freed 1024-byte region [ffff88807cb8d800, ffff88807cb8dc00)

Fixes: bf18c7118cf8 ("Bluetooth: vhci: Free driver_data on file release")
Reported-by: syzbot+2faa4825e556199361f9@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=f62d64848fc4c7c30cd6
Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Acked-by: Paul Menzel <pmenzel@molgen.mpg.de>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h |  2 ++
 net/bluetooth/hci_core.c         | 34 ++++++++++++++++++++++++++++++----
 2 files changed, 32 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index a760f05fa3fb..9fc8f544e20e 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -29,6 +29,7 @@
 #include <linux/idr.h>
 #include <linux/leds.h>
 #include <linux/rculist.h>
+#include <linux/srcu.h>
 
 #include <net/bluetooth/hci.h>
 #include <net/bluetooth/hci_drv.h>
@@ -347,6 +348,7 @@ struct adv_monitor {
 
 struct hci_dev {
 	struct list_head list;
+	struct srcu_struct srcu;
 	struct mutex	lock;
 
 	struct ida	unset_handle_ida;
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 07a8b4281a39..14d7221b8ac0 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -64,7 +64,7 @@ static DEFINE_IDA(hci_index_ida);
 
 /* Get HCI device by index.
  * Device is held on return. */
-struct hci_dev *hci_dev_get(int index)
+static struct hci_dev *__hci_dev_get(int index, int *srcu_index)
 {
 	struct hci_dev *hdev = NULL, *d;
 
@@ -77,6 +77,8 @@ struct hci_dev *hci_dev_get(int index)
 	list_for_each_entry(d, &hci_dev_list, list) {
 		if (d->id == index) {
 			hdev = hci_dev_hold(d);
+			if (srcu_index)
+				*srcu_index = srcu_read_lock(&d->srcu);
 			break;
 		}
 	}
@@ -84,6 +86,22 @@ struct hci_dev *hci_dev_get(int index)
 	return hdev;
 }
 
+struct hci_dev *hci_dev_get(int index)
+{
+	return __hci_dev_get(index, NULL);
+}
+
+static struct hci_dev *hci_dev_get_srcu(int index, int *srcu_index)
+{
+	return __hci_dev_get(index, srcu_index);
+}
+
+static void hci_dev_put_srcu(struct hci_dev *hdev, int srcu_index)
+{
+	srcu_read_unlock(&hdev->srcu, srcu_index);
+	hci_dev_put(hdev);
+}
+
 /* ---- Inquiry support ---- */
 
 bool hci_discovery_active(struct hci_dev *hdev)
@@ -568,9 +586,9 @@ static int hci_dev_do_reset(struct hci_dev *hdev)
 int hci_dev_reset(__u16 dev)
 {
 	struct hci_dev *hdev;
-	int err;
+	int err, srcu_index;
 
-	hdev = hci_dev_get(dev);
+	hdev = hci_dev_get_srcu(dev, &srcu_index);
 	if (!hdev)
 		return -ENODEV;
 
@@ -592,7 +610,7 @@ int hci_dev_reset(__u16 dev)
 	err = hci_dev_do_reset(hdev);
 
 done:
-	hci_dev_put(hdev);
+	hci_dev_put_srcu(hdev, srcu_index);
 	return err;
 }
 
@@ -2433,6 +2451,11 @@ struct hci_dev *hci_alloc_dev_priv(int sizeof_priv)
 	if (!hdev)
 		return NULL;
 
+	if (init_srcu_struct(&hdev->srcu)) {
+		kfree(hdev);
+		return NULL;
+	}
+
 	hdev->pkt_type  = (HCI_DM1 | HCI_DH1 | HCI_HV1);
 	hdev->esco_type = (ESCO_HV1);
 	hdev->link_mode = (HCI_LM_ACCEPT);
@@ -2678,6 +2701,9 @@ void hci_unregister_dev(struct hci_dev *hdev)
 	list_del(&hdev->list);
 	write_unlock(&hci_dev_list_lock);
 
+	synchronize_srcu(&hdev->srcu);
+	cleanup_srcu_struct(&hdev->srcu);
+
 	disable_work_sync(&hdev->rx_work);
 	disable_work_sync(&hdev->cmd_work);
 	disable_work_sync(&hdev->tx_work);
-- 
cgit v1.2.3


From 9e4ed359b8efad0e8ad4510d8ad22bf0b060526a Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 16 Jun 2025 10:46:29 +0100
Subject: io_uring/netcmd: add tx timestamping cmd support

Add a new socket command which returns tx time stamps to the user. It
provides an alternative to the existing error queue recvmsg interface.
The command works in a polled multishot mode, which means io_uring will
poll the socket and keep posting timestamps until the request is
cancelled or fails in any other way (e.g. with no space in the CQ). It
reuses the net infra and grabs timestamps from the socket's error queue.

The command requires IORING_SETUP_CQE32. All non-final CQEs (marked with
IORING_CQE_F_MORE) have cqe->res set to the tskey, and the upper 16 bits
of cqe->flags keep tstype (i.e. offset by IORING_CQE_BUFFER_SHIFT). The
time value is stored in the upper part of the extended CQE. The final
completion won't have IORING_CQE_F_MORE and will have cqe->res storing
0/error.

Suggested-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/92ee66e6b33b8de062a977843d825f58f21ecd37.1750065793.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h | 16 +++++++++
 io_uring/cmd_net.c            | 82 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 98 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 8c3d43caab02..85600ad0ac08 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -969,6 +969,22 @@ enum io_uring_socket_op {
 	SOCKET_URING_OP_SIOCOUTQ,
 	SOCKET_URING_OP_GETSOCKOPT,
 	SOCKET_URING_OP_SETSOCKOPT,
+	SOCKET_URING_OP_TX_TIMESTAMP,
+};
+
+/*
+ * SOCKET_URING_OP_TX_TIMESTAMP definitions
+ */
+
+#define IORING_TIMESTAMP_HW_SHIFT	16
+/* The cqe->flags bit from which the timestamp type is stored */
+#define IORING_TIMESTAMP_TYPE_SHIFT	(IORING_TIMESTAMP_HW_SHIFT + 1)
+/* The cqe->flags flag signifying whether it's a hardware timestamp */
+#define IORING_CQE_F_TSTAMP_HW		((__u32)1 << IORING_TIMESTAMP_HW_SHIFT)
+
+struct io_timespec {
+	__u64		tv_sec;
+	__u64		tv_nsec;
 };
 
 /* Zero copy receive refill queue entry */
diff --git a/io_uring/cmd_net.c b/io_uring/cmd_net.c
index e99170c7d41a..3866fe6ff541 100644
--- a/io_uring/cmd_net.c
+++ b/io_uring/cmd_net.c
@@ -1,5 +1,6 @@
 #include <asm/ioctls.h>
 #include <linux/io_uring/net.h>
+#include <linux/errqueue.h>
 #include <net/sock.h>
 
 #include "uring_cmd.h"
@@ -51,6 +52,85 @@ static inline int io_uring_cmd_setsockopt(struct socket *sock,
 				  optlen);
 }
 
+static bool io_process_timestamp_skb(struct io_uring_cmd *cmd, struct sock *sk,
+				     struct sk_buff *skb, unsigned issue_flags)
+{
+	struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
+	struct io_uring_cqe cqe[2];
+	struct io_timespec *iots;
+	struct timespec64 ts;
+	u32 tstype, tskey;
+	int ret;
+
+	BUILD_BUG_ON(sizeof(struct io_uring_cqe) != sizeof(struct io_timespec));
+
+	ret = skb_get_tx_timestamp(skb, sk, &ts);
+	if (ret < 0)
+		return false;
+
+	tskey = serr->ee.ee_data;
+	tstype = serr->ee.ee_info;
+
+	cqe->user_data = 0;
+	cqe->res = tskey;
+	cqe->flags = IORING_CQE_F_MORE;
+	cqe->flags |= tstype << IORING_TIMESTAMP_TYPE_SHIFT;
+	if (ret == SOF_TIMESTAMPING_TX_HARDWARE)
+		cqe->flags |= IORING_CQE_F_TSTAMP_HW;
+
+	iots = (struct io_timespec *)&cqe[1];
+	iots->tv_sec = ts.tv_sec;
+	iots->tv_nsec = ts.tv_nsec;
+	return io_uring_cmd_post_mshot_cqe32(cmd, issue_flags, cqe);
+}
+
+/* Post queued tx timestamps as CQEs, requeueing any that can't be posted. */
+static int io_uring_cmd_timestamp(struct socket *sock,
+				  struct io_uring_cmd *cmd,
+				  unsigned int issue_flags)
+{
+	struct sock *sk = sock->sk;
+	struct sk_buff_head *q = &sk->sk_error_queue;
+	struct sk_buff *skb, *tmp;
+	struct sk_buff_head list;
+	int ret;
+
+	if (!(issue_flags & IO_URING_F_CQE32))
+		return -EINVAL;
+	ret = io_cmd_poll_multishot(cmd, issue_flags, EPOLLERR);
+	if (unlikely(ret))
+		return ret;
+
+	if (skb_queue_empty_lockless(q))
+		return -EAGAIN;
+	__skb_queue_head_init(&list);
+
+	scoped_guard(spinlock_irq, &q->lock) {
+		skb_queue_walk_safe(q, skb, tmp) {
+			/* don't support skbs with payload */
+			if (!skb_has_tx_timestamp(skb, sk) || skb->len)
+				continue;
+			__skb_unlink(skb, q);
+			__skb_queue_tail(&list, skb);
+		}
+	}
+
+	/* handle skbs in order; stop at the first one that can't be processed */
+	while ((skb = skb_peek(&list)) != NULL) {
+		if (!io_process_timestamp_skb(cmd, sk, skb, issue_flags))
+			break;
+		__skb_dequeue(&list);
+		consume_skb(skb);
+	}
+
+	/* requeue leftovers: skb_queue_splice() takes (list to add, target) */
+	if (unlikely(!skb_queue_empty(&list))) {
+		scoped_guard(spinlock_irqsave, &q->lock)
+			skb_queue_splice(&list, q);
+	}
+	return -EAGAIN;
+}
+
 int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags)
 {
 	struct socket *sock = cmd->file->private_data;
@@ -76,6 +156,8 @@ int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags)
 		return io_uring_cmd_getsockopt(sock, cmd, issue_flags);
 	case SOCKET_URING_OP_SETSOCKOPT:
 		return io_uring_cmd_setsockopt(sock, cmd, issue_flags);
+	case SOCKET_URING_OP_TX_TIMESTAMP:
+		return io_uring_cmd_timestamp(sock, cmd, issue_flags);
 	default:
 		return -EOPNOTSUPP;
 	}
-- 
cgit v1.2.3


From c4cdbaf9d81c8387da8b9b91567d4b0eb1b8a549 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 11 Jun 2025 15:45:24 -0700
Subject: iommu/amd: KVM: SVM: Use pi_desc_addr to derive ga_root_ptr

Use vcpu_data.pi_desc_addr instead of amd_iommu_pi_data.base to get the
GA root pointer.  KVM is the only source of amd_iommu_pi_data.base, and
KVM's one and only path for writing amd_iommu_pi_data.base computes the
exact same value for vcpu_data.pi_desc_addr and amd_iommu_pi_data.base,
and fills amd_iommu_pi_data.base if and only if vcpu_data.pi_desc_addr is
valid, i.e. amd_iommu_pi_data.base is fully redundant.

Cc: Maxim Levitsky <mlevitsk@redhat.com>
Reviewed-by: Joao Martins <joao.m.martins@oracle.com>
Reviewed-by: Vasant Hegde <vasant.hegde@amd.com>
Tested-by: Sairaj Kodilkar <sarunkod@amd.com>
Link: https://lore.kernel.org/r/20250611224604.313496-23-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/svm/avic.c   | 7 +++++--
 drivers/iommu/amd/iommu.c | 2 +-
 include/linux/amd-iommu.h | 2 --
 3 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 375a29022000..d95742faae11 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -894,8 +894,11 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
 
 			enable_remapped_mode = false;
 
-			/* Try to enable guest_mode in IRTE */
-			pi.base = avic_get_backing_page_address(svm);
+			/*
+			 * Try to enable guest_mode in IRTE.  Note, the address
+			 * of the vCPU's AVIC backing page is passed to the
+			 * IOMMU via vcpu_info->pi_desc_addr.
+			 */
 			pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
 						     svm->vcpu.vcpu_id);
 			pi.is_guest_mode = true;
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index ed96050d4933..bdfb57af7111 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -3888,7 +3888,7 @@ static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
 	pi_data->ir_data = ir_data;
 
 	if (pi_data->is_guest_mode) {
-		ir_data->ga_root_ptr = (pi_data->base >> 12);
+		ir_data->ga_root_ptr = (vcpu_pi_info->pi_desc_addr >> 12);
 		ir_data->ga_vector = vcpu_pi_info->vector;
 		ir_data->ga_tag = pi_data->ga_tag;
 		ret = amd_iommu_activate_guest_mode(ir_data);
diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h
index 1f9b13d803c5..deeefc92a5cf 100644
--- a/include/linux/amd-iommu.h
+++ b/include/linux/amd-iommu.h
@@ -19,8 +19,6 @@ struct amd_iommu;
  */
 struct amd_iommu_pi_data {
 	u32 ga_tag;
-	u64 base;
-
 	bool is_guest_mode;
 	struct vcpu_data *vcpu_data;
 	void *ir_data;
-- 
cgit v1.2.3


From 53527ea1b70224d16d29edbd5c850456469f00ec Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 11 Jun 2025 15:45:34 -0700
Subject: iommu: KVM: Split "struct vcpu_data" into separate AMD vs. Intel
 structs

Split the vcpu_data structure that serves as a handoff from KVM to IOMMU
drivers into vendor specific structures.  Overloading a single structure
makes the code hard to read and maintain, is *very* misleading as it
suggests that mixing vendors is actually supported, and bastardizing
Intel's posted interrupt descriptor address when AMD's IOMMU already has
its own structure is quite unnecessary.

Tested-by: Sairaj Kodilkar <sarunkod@amd.com>
Link: https://lore.kernel.org/r/20250611224604.313496-33-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/include/asm/irq_remapping.h | 15 ++++++++++++++-
 arch/x86/kvm/svm/avic.c              | 21 ++++++++-------------
 arch/x86/kvm/vmx/posted_intr.c       |  4 ++--
 drivers/iommu/amd/iommu.c            | 12 ++++--------
 drivers/iommu/intel/irq_remapping.c  | 10 +++++-----
 include/linux/amd-iommu.h            | 12 ------------
 6 files changed, 33 insertions(+), 41 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 5036f13ab69f..2dbc9cb61c2f 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -26,7 +26,20 @@ enum {
 	IRQ_REMAP_X2APIC_MODE,
 };
 
-struct vcpu_data {
+/*
+ * This is mainly used to communicate information back-and-forth
+ * between SVM and IOMMU for setting up and tearing down posted
+ * interrupt
+ */
+struct amd_iommu_pi_data {
+	u64 vapic_addr;		/* Physical address of the vCPU's vAPIC. */
+	u32 ga_tag;
+	u32 vector;		/* Guest vector of the interrupt */
+	bool is_guest_mode;
+	void *ir_data;
+};
+
+struct intel_iommu_pi_data {
 	u64 pi_desc_addr;	/* Physical address of PI Descriptor */
 	u32 vector;		/* Guest vector of the interrupt */
 };
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index eed5c58ac07f..dc1526fef18d 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -823,23 +823,18 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
 	 */
 	if (vcpu && kvm_vcpu_apicv_active(vcpu)) {
 		/*
-		 * Try to enable guest_mode in IRTE.  Note, the address
-		 * of the vCPU's AVIC backing page is passed to the
-		 * IOMMU via vcpu_info->pi_desc_addr.
+		 * Try to enable guest_mode in IRTE.
 		 */
-		struct vcpu_data vcpu_info = {
-			.pi_desc_addr = avic_get_backing_page_address(to_svm(vcpu)),
-			.vector = vector,
-		};
-
-		struct amd_iommu_pi_data pi = {
-			.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id, vcpu->vcpu_id),
+		struct amd_iommu_pi_data pi_data = {
+			.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
+					     vcpu->vcpu_id),
 			.is_guest_mode = true,
-			.vcpu_data = &vcpu_info,
+			.vapic_addr = avic_get_backing_page_address(to_svm(vcpu)),
+			.vector = vector,
 		};
 		int ret;
 
-		ret = irq_set_vcpu_affinity(host_irq, &pi);
+		ret = irq_set_vcpu_affinity(host_irq, &pi_data);
 		if (ret)
 			return ret;
 
@@ -850,7 +845,7 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
 		 * we can reference to them directly when we update vcpu
 		 * scheduling information in IOMMU irte.
 		 */
-		return svm_ir_list_add(to_svm(vcpu), irqfd, &pi);
+		return svm_ir_list_add(to_svm(vcpu), irqfd, &pi_data);
 	}
 	return irq_set_vcpu_affinity(host_irq, NULL);
 }
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 687ffde3b61c..3a23c30f73cb 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -303,12 +303,12 @@ int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
 		       struct kvm_vcpu *vcpu, u32 vector)
 {
 	if (vcpu) {
-		struct vcpu_data vcpu_info = {
+		struct intel_iommu_pi_data pi_data = {
 			.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)),
 			.vector = vector,
 		};
 
-		return irq_set_vcpu_affinity(host_irq, &vcpu_info);
+		return irq_set_vcpu_affinity(host_irq, &pi_data);
 	} else {
 		return irq_set_vcpu_affinity(host_irq, NULL);
 	}
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index e8e259c07265..8366d32252cd 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -3860,10 +3860,10 @@ int amd_iommu_deactivate_guest_mode(void *data)
 }
 EXPORT_SYMBOL(amd_iommu_deactivate_guest_mode);
 
-static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
+static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *info)
 {
 	int ret;
-	struct amd_iommu_pi_data *pi_data = vcpu_info;
+	struct amd_iommu_pi_data *pi_data = info;
 	struct amd_ir_data *ir_data = data->chip_data;
 	struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
 	struct iommu_dev_data *dev_data;
@@ -3886,14 +3886,10 @@ static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
 	ir_data->cfg = irqd_cfg(data);
 
 	if (pi_data) {
-		struct vcpu_data *vcpu_pi_info = pi_data->vcpu_data;
-
 		pi_data->ir_data = ir_data;
 
-		WARN_ON_ONCE(!pi_data->is_guest_mode);
-
-		ir_data->ga_root_ptr = (vcpu_pi_info->pi_desc_addr >> 12);
-		ir_data->ga_vector = vcpu_pi_info->vector;
+		ir_data->ga_root_ptr = (pi_data->vapic_addr >> 12);
+		ir_data->ga_vector = pi_data->vector;
 		ir_data->ga_tag = pi_data->ga_tag;
 		ret = amd_iommu_activate_guest_mode(ir_data);
 	} else {
diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c
index cf7b6882ec75..2fc451253dc3 100644
--- a/drivers/iommu/intel/irq_remapping.c
+++ b/drivers/iommu/intel/irq_remapping.c
@@ -1244,10 +1244,10 @@ static void intel_ir_compose_msi_msg(struct irq_data *irq_data,
 static int intel_ir_set_vcpu_affinity(struct irq_data *data, void *info)
 {
 	struct intel_ir_data *ir_data = data->chip_data;
-	struct vcpu_data *vcpu_pi_info = info;
+	struct intel_iommu_pi_data *pi_data = info;
 
 	/* stop posting interrupts, back to the default mode */
-	if (!vcpu_pi_info) {
+	if (!pi_data) {
 		__intel_ir_reconfigure_irte(data, true);
 	} else {
 		struct irte irte_pi;
@@ -1265,10 +1265,10 @@ static int intel_ir_set_vcpu_affinity(struct irq_data *data, void *info)
 		/* Update the posted mode fields */
 		irte_pi.p_pst = 1;
 		irte_pi.p_urgent = 0;
-		irte_pi.p_vector = vcpu_pi_info->vector;
-		irte_pi.pda_l = (vcpu_pi_info->pi_desc_addr >>
+		irte_pi.p_vector = pi_data->vector;
+		irte_pi.pda_l = (pi_data->pi_desc_addr >>
 				(32 - PDA_LOW_BIT)) & ~(-1UL << PDA_LOW_BIT);
-		irte_pi.pda_h = (vcpu_pi_info->pi_desc_addr >> 32) &
+		irte_pi.pda_h = (pi_data->pi_desc_addr >> 32) &
 				~(-1UL << PDA_HIGH_BIT);
 
 		ir_data->irq_2_iommu.posted_vcpu = true;
diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h
index deeefc92a5cf..99b4fa9a0296 100644
--- a/include/linux/amd-iommu.h
+++ b/include/linux/amd-iommu.h
@@ -12,18 +12,6 @@
 
 struct amd_iommu;
 
-/*
- * This is mainly used to communicate information back-and-forth
- * between SVM and IOMMU for setting up and tearing down posted
- * interrupt
- */
-struct amd_iommu_pi_data {
-	u32 ga_tag;
-	bool is_guest_mode;
-	struct vcpu_data *vcpu_data;
-	void *ir_data;
-};
-
 #ifdef CONFIG_AMD_IOMMU
 
 struct task_struct;
-- 
cgit v1.2.3


From b33252b9d17238a4a9fa5a29af6e6a2922a6c2b0 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 11 Jun 2025 15:45:35 -0700
Subject: KVM: Don't WARN if updating IRQ bypass route fails

Don't bother WARNing if updating an IRTE route fails now that vendor code
provides much more precise WARNs.  The generic WARN doesn't provide enough
information to actually debug the problem, and has obviously done nothing
to surface the myriad bugs in KVM x86's implementation.

Drop all of the associated return code plumbing that existed just so that
common KVM could WARN.

Link: https://lore.kernel.org/r/20250611224604.313496-34-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/arm64/kvm/arm.c          |  6 +++---
 arch/arm64/kvm/vgic/vgic-v4.c |  8 +++-----
 arch/x86/kvm/irq.c            |  8 ++++----
 include/kvm/arm_vgic.h        |  2 +-
 include/linux/kvm_host.h      |  6 +++---
 virt/kvm/eventfd.c            | 15 ++++++---------
 6 files changed, 20 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index a9a39e0375f7..94fb2f096a20 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -2771,9 +2771,9 @@ bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
 	return memcmp(&old->msi, &new->msi, sizeof(new->msi));
 }
 
-int kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
-				  struct kvm_kernel_irq_routing_entry *old,
-				  struct kvm_kernel_irq_routing_entry *new)
+void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
+				   struct kvm_kernel_irq_routing_entry *old,
+				   struct kvm_kernel_irq_routing_entry *new)
 {
 	/*
 	 * Remapping the vLPI requires taking the its_lock mutex to resolve
diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
index 911170d4a9c8..ef3481963122 100644
--- a/arch/arm64/kvm/vgic/vgic-v4.c
+++ b/arch/arm64/kvm/vgic/vgic-v4.c
@@ -527,18 +527,17 @@ static struct vgic_irq *__vgic_host_irq_get_vlpi(struct kvm *kvm, int host_irq)
 	return NULL;
 }
 
-int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq)
+void kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq)
 {
 	struct vgic_irq *irq;
 	unsigned long flags;
-	int ret = 0;
 
 	if (!vgic_supports_direct_msis(kvm))
-		return 0;
+		return;
 
 	irq = __vgic_host_irq_get_vlpi(kvm, host_irq);
 	if (!irq)
-		return 0;
+		return;
 
 	raw_spin_lock_irqsave(&irq->irq_lock, flags);
 	WARN_ON(irq->hw && irq->host_irq != host_irq);
@@ -550,5 +549,4 @@ int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq)
 
 	raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
 	vgic_put_irq(kvm, irq);
-	return 0;
 }
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index cd9b56c6a5c3..b7b0fbc218b8 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -606,11 +606,11 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
 	kvm_arch_end_assignment(irqfd->kvm);
 }
 
-int kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
-				  struct kvm_kernel_irq_routing_entry *old,
-				  struct kvm_kernel_irq_routing_entry *new)
+void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
+				   struct kvm_kernel_irq_routing_entry *old,
+				   struct kvm_kernel_irq_routing_entry *new)
 {
-	return kvm_pi_update_irte(irqfd, new);
+	kvm_pi_update_irte(irqfd, new);
 }
 
 bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 4a34f7f0a864..b2a04481de1a 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -434,7 +434,7 @@ struct kvm_kernel_irq_routing_entry;
 int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int irq,
 			       struct kvm_kernel_irq_routing_entry *irq_entry);
 
-int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq);
+void kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq);
 
 int vgic_v4_load(struct kvm_vcpu *vcpu);
 void vgic_v4_commit(struct kvm_vcpu *vcpu);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index a4160c1c0c6b..8e74ac0f90b1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2410,9 +2410,9 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *,
 			   struct irq_bypass_producer *);
 void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *);
 void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *);
-int kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
-				  struct kvm_kernel_irq_routing_entry *old,
-				  struct kvm_kernel_irq_routing_entry *new);
+void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
+				   struct kvm_kernel_irq_routing_entry *old,
+				   struct kvm_kernel_irq_routing_entry *new);
 bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *,
 				  struct kvm_kernel_irq_routing_entry *);
 #endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index bd1766da6895..719b242fc935 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -285,11 +285,11 @@ void __attribute__((weak)) kvm_arch_irq_bypass_start(
 {
 }
 
-int __weak kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
-					 struct kvm_kernel_irq_routing_entry *old,
-					 struct kvm_kernel_irq_routing_entry *new)
+void __weak kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
+					  struct kvm_kernel_irq_routing_entry *old,
+					  struct kvm_kernel_irq_routing_entry *new)
 {
-	return 0;
+
 }
 
 bool __attribute__((weak)) kvm_arch_irqfd_route_changed(
@@ -617,11 +617,8 @@ void kvm_irq_routing_update(struct kvm *kvm)
 
 #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
 		if (irqfd->producer &&
-		    kvm_arch_irqfd_route_changed(&old, &irqfd->irq_entry)) {
-			int ret = kvm_arch_update_irqfd_routing(irqfd, &old, &irqfd->irq_entry);
-
-			WARN_ON(ret);
-		}
+		    kvm_arch_irqfd_route_changed(&old, &irqfd->irq_entry))
+			kvm_arch_update_irqfd_routing(irqfd, &old, &irqfd->irq_entry);
 #endif
 	}
 
-- 
cgit v1.2.3


From 77bb184ab880171a1cedfbed9ab05977e6ae2258 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 11 Jun 2025 15:45:36 -0700
Subject: KVM: Fold kvm_arch_irqfd_route_changed() into
 kvm_arch_update_irqfd_routing()

Fold kvm_arch_irqfd_route_changed() into kvm_arch_update_irqfd_routing().
Calling arch code to know whether or not to call arch code is absurd.

Reviewed-by: Oliver Upton <oliver.upton@linux.dev>
Link: https://lore.kernel.org/r/20250611224604.313496-35-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/arm64/kvm/arm.c     | 15 +++++----------
 arch/x86/kvm/irq.c       | 15 +++++----------
 include/linux/kvm_host.h |  2 --
 virt/kvm/eventfd.c       | 10 +---------
 4 files changed, 11 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 94fb2f096a20..04f2116927b1 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -2761,20 +2761,15 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
 	kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq);
 }
 
-bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
-				  struct kvm_kernel_irq_routing_entry *new)
-{
-	if (old->type != KVM_IRQ_ROUTING_MSI ||
-	    new->type != KVM_IRQ_ROUTING_MSI)
-		return true;
-
-	return memcmp(&old->msi, &new->msi, sizeof(new->msi));
-}
-
 void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
 				   struct kvm_kernel_irq_routing_entry *old,
 				   struct kvm_kernel_irq_routing_entry *new)
 {
+	if (old->type == KVM_IRQ_ROUTING_MSI &&
+	    new->type == KVM_IRQ_ROUTING_MSI &&
+	    !memcmp(&old->msi, &new->msi, sizeof(new->msi)))
+		return;
+
 	/*
 	 * Remapping the vLPI requires taking the its_lock mutex to resolve
 	 * the new translation. We're in spinlock land at this point, so no
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index b7b0fbc218b8..23e0acc07cb6 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -610,17 +610,12 @@ void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
 				   struct kvm_kernel_irq_routing_entry *old,
 				   struct kvm_kernel_irq_routing_entry *new)
 {
-	kvm_pi_update_irte(irqfd, new);
-}
-
-bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
-				  struct kvm_kernel_irq_routing_entry *new)
-{
-	if (old->type != KVM_IRQ_ROUTING_MSI ||
-	    new->type != KVM_IRQ_ROUTING_MSI)
-		return true;
+	if (old->type == KVM_IRQ_ROUTING_MSI &&
+	    new->type == KVM_IRQ_ROUTING_MSI &&
+	    !memcmp(&old->msi, &new->msi, sizeof(new->msi)))
+		return;
 
-	return !!memcmp(&old->msi, &new->msi, sizeof(new->msi));
+	kvm_pi_update_irte(irqfd, new);
 }
 
 #ifdef CONFIG_KVM_IOAPIC
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 8e74ac0f90b1..fb9ec06aa807 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2413,8 +2413,6 @@ void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *);
 void kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
 				   struct kvm_kernel_irq_routing_entry *old,
 				   struct kvm_kernel_irq_routing_entry *new);
-bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *,
-				  struct kvm_kernel_irq_routing_entry *);
 #endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */
 
 #ifdef CONFIG_HAVE_KVM_INVALID_WAKEUPS
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 719b242fc935..59b1e64697f1 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -291,13 +291,6 @@ void __weak kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
 {
 
 }
-
-bool __attribute__((weak)) kvm_arch_irqfd_route_changed(
-				struct kvm_kernel_irq_routing_entry *old,
-				struct kvm_kernel_irq_routing_entry *new)
-{
-	return true;
-}
 #endif
 
 static int
@@ -616,8 +609,7 @@ void kvm_irq_routing_update(struct kvm *kvm)
 		irqfd_update(kvm, irqfd);
 
 #if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
-		if (irqfd->producer &&
-		    kvm_arch_irqfd_route_changed(&old, &irqfd->irq_entry))
+		if (irqfd->producer)
 			kvm_arch_update_irqfd_routing(irqfd, &old, &irqfd->irq_entry);
 #endif
 	}
-- 
cgit v1.2.3


From 08d9ccdd1a5c75d7aca7ac3af56f723d780dd6ac Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 11 Jun 2025 15:45:43 -0700
Subject: iommu/amd: KVM: SVM: Infer IsRun from validity of pCPU destination

Infer whether or not a vCPU should be marked running from the validity of
the pCPU on which it is running.  amd_iommu_update_ga() already skips the
IRTE update if the pCPU is invalid, i.e. passing %true for is_run with an
invalid pCPU would be a blatant and egregious KVM bug.

Tested-by: Sairaj Kodilkar <sarunkod@amd.com>
Link: https://lore.kernel.org/r/20250611224604.313496-42-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/svm/avic.c   | 11 +++++------
 drivers/iommu/amd/iommu.c | 14 +++++++++-----
 include/linux/amd-iommu.h |  6 ++----
 3 files changed, 16 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 42fd1868c32f..1960bb06c4b9 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -833,7 +833,7 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
 		entry = svm->avic_physical_id_entry;
 		if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
 			amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK,
-					    true, pi_data.ir_data);
+					    pi_data.ir_data);
 
 		irqfd->irq_bypass_data = pi_data.ir_data;
 		list_add(&irqfd->vcpu_list, &svm->ir_list);
@@ -842,8 +842,7 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
 	return irq_set_vcpu_affinity(host_irq, NULL);
 }
 
-static inline int
-avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
+static inline int avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu)
 {
 	int ret = 0;
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -862,7 +861,7 @@ avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
 		return 0;
 
 	list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) {
-		ret = amd_iommu_update_ga(cpu, r, irqfd->irq_bypass_data);
+		ret = amd_iommu_update_ga(cpu, irqfd->irq_bypass_data);
 		if (ret)
 			return ret;
 	}
@@ -924,7 +923,7 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 	WRITE_ONCE(kvm_svm->avic_physical_id_table[vcpu->vcpu_id], entry);
 
-	avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
+	avic_update_iommu_vcpu_affinity(vcpu, h_physical_id);
 
 	spin_unlock_irqrestore(&svm->ir_list_lock, flags);
 }
@@ -964,7 +963,7 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu)
 	 */
 	spin_lock_irqsave(&svm->ir_list_lock, flags);
 
-	avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
+	avic_update_iommu_vcpu_affinity(vcpu, -1);
 
 	entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
 	svm->avic_physical_id_entry = entry;
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 0a0d73483195..3c4c81eb201b 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -3990,15 +3990,17 @@ int amd_iommu_create_irq_domain(struct amd_iommu *iommu)
  * Update the pCPU information for an IRTE that is configured to post IRQs to
  * a vCPU, without issuing an IOMMU invalidation for the IRTE.
  *
- * This API is intended to be used when a vCPU is scheduled in/out (or stops
- * running for any reason), to do a fast update of IsRun and (conditionally)
- * Destination.
+ * If the vCPU is associated with a pCPU (@cpu >= 0), configure the Destination
+ * with the pCPU's APIC ID and set IsRun, else clear IsRun.  I.e. treat vCPUs
+ * that are associated with a pCPU as running.  This API is intended to be used
+ * when a vCPU is scheduled in/out (or stops running for any reason), to do a
+ * fast update of IsRun and (conditionally) Destination.
  *
  * Per the IOMMU spec, the Destination, IsRun, and GATag fields are not cached
  * and thus don't require an invalidation to ensure the IOMMU consumes fresh
  * information.
  */
-int amd_iommu_update_ga(int cpu, bool is_run, void *data)
+int amd_iommu_update_ga(int cpu, void *data)
 {
 	struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
 	struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
@@ -4015,8 +4017,10 @@ int amd_iommu_update_ga(int cpu, bool is_run, void *data)
 					APICID_TO_IRTE_DEST_LO(cpu);
 		entry->hi.fields.destination =
 					APICID_TO_IRTE_DEST_HI(cpu);
+		entry->lo.fields_vapic.is_run = true;
+	} else {
+		entry->lo.fields_vapic.is_run = false;
 	}
-	entry->lo.fields_vapic.is_run = is_run;
 
 	return __modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
 				ir_data->irq_2_irte.index, entry);
diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h
index 99b4fa9a0296..fe0e16ffe0e5 100644
--- a/include/linux/amd-iommu.h
+++ b/include/linux/amd-iommu.h
@@ -30,8 +30,7 @@ static inline void amd_iommu_detect(void) { }
 /* IOMMU AVIC Function */
 extern int amd_iommu_register_ga_log_notifier(int (*notifier)(u32));
 
-extern int
-amd_iommu_update_ga(int cpu, bool is_run, void *data);
+extern int amd_iommu_update_ga(int cpu, void *data);
 
 extern int amd_iommu_activate_guest_mode(void *data);
 extern int amd_iommu_deactivate_guest_mode(void *data);
@@ -44,8 +43,7 @@ amd_iommu_register_ga_log_notifier(int (*notifier)(u32))
 	return 0;
 }
 
-static inline int
-amd_iommu_update_ga(int cpu, bool is_run, void *data)
+static inline int amd_iommu_update_ga(int cpu, void *data)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From f965255dc5033387ac7858787c6c792fd789ac34 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 11 Jun 2025 15:45:45 -0700
Subject: iommu/amd: KVM: SVM: Set pCPU info in IRTE when setting vCPU affinity

Now that setting vCPU affinity is guarded with ir_list_lock, i.e. now that
avic_physical_id_entry can be safely accessed, set the pCPU info
straight-away when setting vCPU affinity.  Putting the IRTE into posted
mode, and then immediately updating the IRTE a second time if the target
vCPU is running is wasteful and confusing.

This also fixes a flaw where a posted IRQ that arrives between putting
the IRTE into guest_mode and setting the correct destination could cause
the IOMMU to ring the doorbell on the wrong pCPU.

Link: https://lore.kernel.org/r/20250611224604.313496-44-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/include/asm/irq_remapping.h |  1 +
 arch/x86/kvm/svm/avic.c              | 26 ++++++++++++++------------
 drivers/iommu/amd/iommu.c            |  6 ++++--
 include/linux/amd-iommu.h            |  4 ++--
 4 files changed, 21 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 2dbc9cb61c2f..4c75a17632f6 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -35,6 +35,7 @@ struct amd_iommu_pi_data {
 	u64 vapic_addr;		/* Physical address of the vCPU's vAPIC. */
 	u32 ga_tag;
 	u32 vector;		/* Guest vector of the interrupt */
+	int cpu;
 	bool is_guest_mode;
 	void *ir_data;
 };
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 1960bb06c4b9..35c5fb831c28 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -727,6 +727,7 @@ void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
 
 static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
 {
+	int apic_id = kvm_cpu_get_apicid(vcpu->cpu);
 	int ret = 0;
 	unsigned long flags;
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -746,7 +747,7 @@ static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
 
 	list_for_each_entry(irqfd, &svm->ir_list, vcpu_list) {
 		if (activate)
-			ret = amd_iommu_activate_guest_mode(irqfd->irq_bypass_data);
+			ret = amd_iommu_activate_guest_mode(irqfd->irq_bypass_data, apic_id);
 		else
 			ret = amd_iommu_deactivate_guest_mode(irqfd->irq_bypass_data);
 		if (ret)
@@ -810,6 +811,18 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
 		 */
 		guard(spinlock_irqsave)(&svm->ir_list_lock);
 
+		/*
+		 * Update the target pCPU for IOMMU doorbells if the vCPU is
+		 * running.  If the vCPU is NOT running, i.e. is blocking or
+		 * scheduled out, KVM will update the pCPU info when the vCPU
+		 * is awakened and/or scheduled in.  See also avic_vcpu_load().
+		 */
+		entry = svm->avic_physical_id_entry;
+		if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
+			pi_data.cpu = entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
+		else
+			pi_data.cpu = -1;
+
 		ret = irq_set_vcpu_affinity(host_irq, &pi_data);
 		if (ret)
 			return ret;
@@ -824,17 +837,6 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
 			return -EIO;
 		}
 
-		/*
-		 * Update the target pCPU for IOMMU doorbells if the vCPU is
-		 * running.  If the vCPU is NOT running, i.e. is blocking or
-		 * scheduled out, KVM will update the pCPU info when the vCPU
-		 * is awakened and/or scheduled in.  See also avic_vcpu_load().
-		 */
-		entry = svm->avic_physical_id_entry;
-		if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
-			amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK,
-					    pi_data.ir_data);
-
 		irqfd->irq_bypass_data = pi_data.ir_data;
 		list_add(&irqfd->vcpu_list, &svm->ir_list);
 		return 0;
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 2eaba64be68a..6352930ad011 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -3850,7 +3850,7 @@ int amd_iommu_update_ga(int cpu, void *data)
 }
 EXPORT_SYMBOL(amd_iommu_update_ga);
 
-int amd_iommu_activate_guest_mode(void *data)
+int amd_iommu_activate_guest_mode(void *data, int cpu)
 {
 	struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
 	struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
@@ -3871,6 +3871,8 @@ int amd_iommu_activate_guest_mode(void *data)
 	entry->hi.fields.vector            = ir_data->ga_vector;
 	entry->lo.fields_vapic.ga_tag      = ir_data->ga_tag;
 
+	__amd_iommu_update_ga(entry, cpu);
+
 	return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
 			      ir_data->irq_2_irte.index, entry);
 }
@@ -3937,7 +3939,7 @@ static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *info)
 		ir_data->ga_root_ptr = (pi_data->vapic_addr >> 12);
 		ir_data->ga_vector = pi_data->vector;
 		ir_data->ga_tag = pi_data->ga_tag;
-		ret = amd_iommu_activate_guest_mode(ir_data);
+		ret = amd_iommu_activate_guest_mode(ir_data, pi_data->cpu);
 	} else {
 		ret = amd_iommu_deactivate_guest_mode(ir_data);
 	}
diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h
index fe0e16ffe0e5..c9f2df0c4596 100644
--- a/include/linux/amd-iommu.h
+++ b/include/linux/amd-iommu.h
@@ -32,7 +32,7 @@ extern int amd_iommu_register_ga_log_notifier(int (*notifier)(u32));
 
 extern int amd_iommu_update_ga(int cpu, void *data);
 
-extern int amd_iommu_activate_guest_mode(void *data);
+extern int amd_iommu_activate_guest_mode(void *data, int cpu);
 extern int amd_iommu_deactivate_guest_mode(void *data);
 
 #else /* defined(CONFIG_AMD_IOMMU) && defined(CONFIG_IRQ_REMAP) */
@@ -48,7 +48,7 @@ static inline int amd_iommu_update_ga(int cpu, void *data)
 	return 0;
 }
 
-static inline int amd_iommu_activate_guest_mode(void *data)
+static inline int amd_iommu_activate_guest_mode(void *data, int cpu)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From b9e53f9ff4a88f01b22524878c9a381a6c5f65ff Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 11 Jun 2025 15:46:03 -0700
Subject: iommu/amd: KVM: SVM: Allow KVM to control need for GA log interrupts

Add plumbing to the AMD IOMMU driver to allow KVM to control whether or
not an IRTE is configured to generate GA log interrupts.  KVM only needs a
notification if the target vCPU is blocking, so the vCPU can be awakened.
If a vCPU is preempted or exits to userspace, KVM clears is_run, but will
set the vCPU back to running when userspace does KVM_RUN and/or the vCPU
task is scheduled back in, i.e. KVM doesn't need a notification.

Unconditionally pass "true" in all KVM paths to isolate the IOMMU changes
from the KVM changes insofar as possible.

Opportunistically swap the ordering of parameters for amd_iommu_update_ga()
so that they match amd_iommu_activate_guest_mode().

Note, as of this writing, the AMD IOMMU manual doesn't list GALogIntr as
a non-cached field, but per AMD hardware architects, it's not cached and
can be safely updated without an invalidation.

Link: https://lore.kernel.org/all/b29b8c22-2fd4-4b5e-b755-9198874157c7@amd.com
Cc: Vasant Hegde <vasant.hegde@amd.com>
Cc: Joao Martins <joao.m.martins@oracle.com>
Link: https://lore.kernel.org/r/20250611224604.313496-62-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/include/asm/irq_remapping.h |  1 +
 arch/x86/kvm/svm/avic.c              | 10 ++++++----
 drivers/iommu/amd/iommu.c            | 28 +++++++++++++++++-----------
 include/linux/amd-iommu.h            |  9 ++++-----
 4 files changed, 28 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 4c75a17632f6..5a0d42464d44 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -36,6 +36,7 @@ struct amd_iommu_pi_data {
 	u32 ga_tag;
 	u32 vector;		/* Guest vector of the interrupt */
 	int cpu;
+	bool ga_log_intr;
 	bool is_guest_mode;
 	void *ir_data;
 };
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 5803f778999f..02f266901bfe 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -785,10 +785,12 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
 		 * is awakened and/or scheduled in.  See also avic_vcpu_load().
 		 */
 		entry = svm->avic_physical_id_entry;
-		if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
+		if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) {
 			pi_data.cpu = entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
-		else
+		} else {
 			pi_data.cpu = -1;
+			pi_data.ga_log_intr = true;
+		}
 
 		ret = irq_set_vcpu_affinity(host_irq, &pi_data);
 		if (ret)
@@ -850,9 +852,9 @@ static void avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu,
 		void *data = irqfd->irq_bypass_data;
 
 		if (!(action & AVIC_TOGGLE_ON_OFF))
-			WARN_ON_ONCE(amd_iommu_update_ga(cpu, data));
+			WARN_ON_ONCE(amd_iommu_update_ga(data, cpu, true));
 		else if (cpu >= 0)
-			WARN_ON_ONCE(amd_iommu_activate_guest_mode(data, cpu));
+			WARN_ON_ONCE(amd_iommu_activate_guest_mode(data, cpu, true));
 		else
 			WARN_ON_ONCE(amd_iommu_deactivate_guest_mode(data));
 	}
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 4b0cc249771f..c50d4a8a51be 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -3804,7 +3804,8 @@ static const struct irq_domain_ops amd_ir_domain_ops = {
 	.deactivate = irq_remapping_deactivate,
 };
 
-static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu)
+static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu,
+				  bool ga_log_intr)
 {
 	if (cpu >= 0) {
 		entry->lo.fields_vapic.destination =
@@ -3812,8 +3813,10 @@ static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu)
 		entry->hi.fields.destination =
 					APICID_TO_IRTE_DEST_HI(cpu);
 		entry->lo.fields_vapic.is_run = true;
+		entry->lo.fields_vapic.ga_log_intr = false;
 	} else {
 		entry->lo.fields_vapic.is_run = false;
+		entry->lo.fields_vapic.ga_log_intr = ga_log_intr;
 	}
 }
 
@@ -3822,16 +3825,19 @@ static void __amd_iommu_update_ga(struct irte_ga *entry, int cpu)
  * a vCPU, without issuing an IOMMU invalidation for the IRTE.
  *
  * If the vCPU is associated with a pCPU (@cpu >= 0), configure the Destination
- * with the pCPU's APIC ID and set IsRun, else clear IsRun.  I.e. treat vCPUs
- * that are associated with a pCPU as running.  This API is intended to be used
- * when a vCPU is scheduled in/out (or stops running for any reason), to do a
- * fast update of IsRun and (conditionally) Destination.
+ * with the pCPU's APIC ID, set IsRun, and clear GALogIntr.  If the vCPU isn't
+ * associated with a pCPU (@cpu < 0), clear IsRun and set/clear GALogIntr based
+ * on input from the caller (e.g. KVM only requests GALogIntr when the vCPU is
+ * blocking and requires a notification wake event).  I.e. treat vCPUs that are
+ * associated with a pCPU as running.  This API is intended to be used when a
+ * vCPU is scheduled in/out (or stops running for any reason), to do a fast
+ * update of IsRun, GALogIntr, and (conditionally) Destination.
  *
  * Per the IOMMU spec, the Destination, IsRun, and GATag fields are not cached
  * and thus don't require an invalidation to ensure the IOMMU consumes fresh
  * information.
  */
-int amd_iommu_update_ga(int cpu, void *data)
+int amd_iommu_update_ga(void *data, int cpu, bool ga_log_intr)
 {
 	struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
 	struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
@@ -3845,14 +3851,14 @@ int amd_iommu_update_ga(int cpu, void *data)
 	if (!ir_data->iommu)
 		return -ENODEV;
 
-	__amd_iommu_update_ga(entry, cpu);
+	__amd_iommu_update_ga(entry, cpu, ga_log_intr);
 
 	return __modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
 				ir_data->irq_2_irte.index, entry);
 }
 EXPORT_SYMBOL(amd_iommu_update_ga);
 
-int amd_iommu_activate_guest_mode(void *data, int cpu)
+int amd_iommu_activate_guest_mode(void *data, int cpu, bool ga_log_intr)
 {
 	struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
 	struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
@@ -3871,12 +3877,11 @@ int amd_iommu_activate_guest_mode(void *data, int cpu)
 
 	entry->lo.fields_vapic.valid       = valid;
 	entry->lo.fields_vapic.guest_mode  = 1;
-	entry->lo.fields_vapic.ga_log_intr = 1;
 	entry->hi.fields.ga_root_ptr       = ir_data->ga_root_ptr;
 	entry->hi.fields.vector            = ir_data->ga_vector;
 	entry->lo.fields_vapic.ga_tag      = ir_data->ga_tag;
 
-	__amd_iommu_update_ga(entry, cpu);
+	__amd_iommu_update_ga(entry, cpu, ga_log_intr);
 
 	return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
 			      ir_data->irq_2_irte.index, entry);
@@ -3947,7 +3952,8 @@ static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *info)
 		ir_data->ga_vector = pi_data->vector;
 		ir_data->ga_tag = pi_data->ga_tag;
 		if (pi_data->is_guest_mode)
-			ret = amd_iommu_activate_guest_mode(ir_data, pi_data->cpu);
+			ret = amd_iommu_activate_guest_mode(ir_data, pi_data->cpu,
+							    pi_data->ga_log_intr);
 		else
 			ret = amd_iommu_deactivate_guest_mode(ir_data);
 	} else {
diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h
index c9f2df0c4596..8cced632ecd0 100644
--- a/include/linux/amd-iommu.h
+++ b/include/linux/amd-iommu.h
@@ -30,9 +30,8 @@ static inline void amd_iommu_detect(void) { }
 /* IOMMU AVIC Function */
 extern int amd_iommu_register_ga_log_notifier(int (*notifier)(u32));
 
-extern int amd_iommu_update_ga(int cpu, void *data);
-
-extern int amd_iommu_activate_guest_mode(void *data, int cpu);
+extern int amd_iommu_update_ga(void *data, int cpu, bool ga_log_intr);
+extern int amd_iommu_activate_guest_mode(void *data, int cpu, bool ga_log_intr);
 extern int amd_iommu_deactivate_guest_mode(void *data);
 
 #else /* defined(CONFIG_AMD_IOMMU) && defined(CONFIG_IRQ_REMAP) */
@@ -43,12 +42,12 @@ amd_iommu_register_ga_log_notifier(int (*notifier)(u32))
 	return 0;
 }
 
-static inline int amd_iommu_update_ga(int cpu, void *data)
+static inline int amd_iommu_update_ga(void *data, int cpu, bool ga_log_intr)
 {
 	return 0;
 }
 
-static inline int amd_iommu_activate_guest_mode(void *data, int cpu)
+static inline int amd_iommu_activate_guest_mode(void *data, int cpu, bool ga_log_intr)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From 283ed5001d6852f85c09ed2522331b2b197ba937 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 22 May 2025 16:52:11 -0700
Subject: KVM: Use a local struct to do the initial vfs_poll() on an irqfd

Use a function-local struct for the poll_table passed to vfs_poll(), as
nothing in the vfs_poll() callchain grabs a long-term reference to the
structure, i.e. its lifetime doesn't need to be tied to the irqfd.  Using
a local structure will also allow propagating failures out of the polling
callback without further polluting kvm_kernel_irqfd.

Opportunistically rename irqfd_ptable_queue_proc() to kvm_irqfd_register()
to capture what it actually does.

Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20250522235223.3178519-2-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 include/linux/kvm_irqfd.h |  1 -
 virt/kvm/eventfd.c        | 26 +++++++++++++++++---------
 2 files changed, 17 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/kvm_irqfd.h b/include/linux/kvm_irqfd.h
index 361c07f4466d..ef8c134ded8a 100644
--- a/include/linux/kvm_irqfd.h
+++ b/include/linux/kvm_irqfd.h
@@ -55,7 +55,6 @@ struct kvm_kernel_irqfd {
 	/* Used for setup/shutdown */
 	struct eventfd_ctx *eventfd;
 	struct list_head list;
-	poll_table pt;
 	struct work_struct shutdown;
 	struct irq_bypass_consumer consumer;
 	struct irq_bypass_producer *producer;
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 59b1e64697f1..0b655376734e 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -245,12 +245,17 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
 	return ret;
 }
 
-static void
-irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
-			poll_table *pt)
+struct kvm_irqfd_pt {
+	struct kvm_kernel_irqfd *irqfd;
+	poll_table pt;
+};
+
+static void kvm_irqfd_register(struct file *file, wait_queue_head_t *wqh,
+			       poll_table *pt)
 {
-	struct kvm_kernel_irqfd *irqfd =
-		container_of(pt, struct kvm_kernel_irqfd, pt);
+	struct kvm_irqfd_pt *p = container_of(pt, struct kvm_irqfd_pt, pt);
+	struct kvm_kernel_irqfd *irqfd = p->irqfd;
+
 	add_wait_queue_priority(wqh, &irqfd->wait);
 }
 
@@ -298,6 +303,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
 {
 	struct kvm_kernel_irqfd *irqfd, *tmp;
 	struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
+	struct kvm_irqfd_pt irqfd_pt;
 	int ret;
 	__poll_t events;
 	int idx;
@@ -387,7 +393,6 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
 	 * a callback whenever someone signals the underlying eventfd
 	 */
 	init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
-	init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);
 
 	spin_lock_irq(&kvm->irqfds.lock);
 
@@ -409,11 +414,14 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
 	spin_unlock_irq(&kvm->irqfds.lock);
 
 	/*
-	 * Check if there was an event already pending on the eventfd
-	 * before we registered, and trigger it as if we didn't miss it.
+	 * Register the irqfd with the eventfd by polling on the eventfd.  If
+	 * there was an event pending on the eventfd prior to registering,
+	 * manually trigger IRQ injection.
 	 */
-	events = vfs_poll(fd_file(f), &irqfd->pt);
+	irqfd_pt.irqfd = irqfd;
+	init_poll_funcptr(&irqfd_pt.pt, kvm_irqfd_register);
 
+	events = vfs_poll(fd_file(f), &irqfd_pt.pt);
 	if (events & EPOLLIN)
 		schedule_work(&irqfd->inject);
 
-- 
cgit v1.2.3


From 0d09582b3a607436fd91d6ce813048a048ecbf10 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 22 May 2025 16:52:18 -0700
Subject: sched/wait: Add a waitqueue helper for fully exclusive priority
 waiters

Add a waitqueue helper to add a priority waiter that requires exclusive
wakeups, i.e. that requires that it be the _only_ priority waiter.  The
API will be used by KVM to ensure that at most one of KVM's irqfds is
bound to a single eventfd (across the entire kernel).

Open code the helper instead of using __add_wait_queue() so that the
common path doesn't need to "handle" impossible failures.

Cc: K Prateek Nayak <kprateek.nayak@amd.com>
Reviewed-by: K Prateek Nayak <kprateek.nayak@amd.com>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20250522235223.3178519-9-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 include/linux/wait.h |  2 ++
 kernel/sched/wait.c  | 18 ++++++++++++++++++
 2 files changed, 20 insertions(+)

(limited to 'include')

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 965a19809c7e..09855d819418 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -164,6 +164,8 @@ static inline bool wq_has_sleeper(struct wait_queue_head *wq_head)
 extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
 extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
 extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
+extern int add_wait_queue_priority_exclusive(struct wait_queue_head *wq_head,
+					     struct wait_queue_entry *wq_entry);
 extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
 
 static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 4ab3ab195277..15632c89c4f2 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -47,6 +47,24 @@ void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_
 }
 EXPORT_SYMBOL_GPL(add_wait_queue_priority);
 
+int add_wait_queue_priority_exclusive(struct wait_queue_head *wq_head,
+				      struct wait_queue_entry *wq_entry)
+{
+	struct list_head *head = &wq_head->head;
+
+	wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY;
+
+	guard(spinlock_irqsave)(&wq_head->lock);
+
+	if (!list_empty(head) &&
+	    (list_first_entry(head, typeof(*wq_entry), entry)->flags & WQ_FLAG_PRIORITY))
+		return -EBUSY;
+
+	list_add(&wq_entry->entry, head);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(add_wait_queue_priority_exclusive);
+
 void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
 {
 	unsigned long flags;
-- 
cgit v1.2.3


From ceb5ab3cb64637952657be23d347e1c79dd02212 Mon Sep 17 00:00:00 2001
From: Alexander Usyskin <alexander.usyskin@intel.com>
Date: Tue, 17 Jun 2025 17:51:51 +0300
Subject: mtd: add driver for intel graphics non-volatile memory device

Add auxiliary driver for intel discrete graphics
non-volatile memory device.

CC: Lucas De Marchi <lucas.demarchi@intel.com>
Reviewed-by: Raag Jadav <raag.jadav@intel.com>
Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Acked-by: Miquel Raynal <miquel.raynal@bootlin.com>
Co-developed-by: Tomas Winkler <tomasw@gmail.com>
Signed-off-by: Tomas Winkler <tomasw@gmail.com>
Signed-off-by: Alexander Usyskin <alexander.usyskin@intel.com>
Link: https://lore.kernel.org/r/20250617145159.3803852-2-alexander.usyskin@intel.com
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 MAINTAINERS                        |   7 ++
 drivers/mtd/devices/Kconfig        |  11 +++
 drivers/mtd/devices/Makefile       |   1 +
 drivers/mtd/devices/mtd_intel_dg.c | 134 +++++++++++++++++++++++++++++++++++++
 include/linux/intel_dg_nvm_aux.h   |  30 +++++++++
 5 files changed, 183 insertions(+)
 create mode 100644 drivers/mtd/devices/mtd_intel_dg.c
 create mode 100644 include/linux/intel_dg_nvm_aux.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 7e7515a412e9..521730530243 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -12086,6 +12086,13 @@ L:	linux-kernel@vger.kernel.org
 S:	Supported
 F:	arch/x86/include/asm/intel-family.h
 
+INTEL DISCRETE GRAPHICS NVM MTD DRIVER
+M:	Alexander Usyskin <alexander.usyskin@intel.com>
+L:	linux-mtd@lists.infradead.org
+S:	Supported
+F:	drivers/mtd/devices/mtd_intel_dg.c
+F:	include/linux/intel_dg_nvm_aux.h
+
 INTEL DRM DISPLAY FOR XE AND I915 DRIVERS
 M:	Jani Nikula <jani.nikula@linux.intel.com>
 M:	Rodrigo Vivi <rodrigo.vivi@intel.com>
diff --git a/drivers/mtd/devices/Kconfig b/drivers/mtd/devices/Kconfig
index aed653ce8fa2..46cebde79f34 100644
--- a/drivers/mtd/devices/Kconfig
+++ b/drivers/mtd/devices/Kconfig
@@ -183,6 +183,17 @@ config MTD_POWERNV_FLASH
 	  platforms from Linux. This device abstracts away the
 	  firmware interface for flash access.
 
+config MTD_INTEL_DG
+	tristate "Intel Discrete Graphics non-volatile memory driver"
+	depends on AUXILIARY_BUS
+	depends on MTD
+	help
+	  This provides an MTD device to access Intel Discrete Graphics
+	  non-volatile memory.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called mtd-intel-dg.
+
 comment "Disk-On-Chip Device Drivers"
 
 config MTD_DOCG3
diff --git a/drivers/mtd/devices/Makefile b/drivers/mtd/devices/Makefile
index d11eb2b8b6f8..9fe4ce9cffde 100644
--- a/drivers/mtd/devices/Makefile
+++ b/drivers/mtd/devices/Makefile
@@ -18,6 +18,7 @@ obj-$(CONFIG_MTD_SST25L)	+= sst25l.o
 obj-$(CONFIG_MTD_BCM47XXSFLASH)	+= bcm47xxsflash.o
 obj-$(CONFIG_MTD_ST_SPI_FSM)    += st_spi_fsm.o
 obj-$(CONFIG_MTD_POWERNV_FLASH)	+= powernv_flash.o
+obj-$(CONFIG_MTD_INTEL_DG)	+= mtd_intel_dg.o
 
 
 CFLAGS_docg3.o			+= -I$(src)
diff --git a/drivers/mtd/devices/mtd_intel_dg.c b/drivers/mtd/devices/mtd_intel_dg.c
new file mode 100644
index 000000000000..c2d0dbfb0378
--- /dev/null
+++ b/drivers/mtd/devices/mtd_intel_dg.c
@@ -0,0 +1,134 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright(c) 2019-2025, Intel Corporation. All rights reserved.
+ */
+
+#include <linux/device.h>
+#include <linux/intel_dg_nvm_aux.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+struct intel_dg_nvm {
+	struct kref refcnt;
+	void __iomem *base;
+	size_t size;
+	unsigned int nregions;
+	struct {
+		const char *name;
+		u8 id;
+		u64 offset;
+		u64 size;
+	} regions[] __counted_by(nregions);
+};
+
+static void intel_dg_nvm_release(struct kref *kref)
+{
+	struct intel_dg_nvm *nvm = container_of(kref, struct intel_dg_nvm, refcnt);
+	int i;
+
+	pr_debug("freeing intel_dg nvm\n");
+	for (i = 0; i < nvm->nregions; i++)
+		kfree(nvm->regions[i].name);
+	kfree(nvm);
+}
+
+static int intel_dg_mtd_probe(struct auxiliary_device *aux_dev,
+			      const struct auxiliary_device_id *aux_dev_id)
+{
+	struct intel_dg_nvm_dev *invm = auxiliary_dev_to_intel_dg_nvm_dev(aux_dev);
+	struct intel_dg_nvm *nvm;
+	struct device *device;
+	unsigned int nregions;
+	unsigned int i, n;
+	int ret;
+
+	device = &aux_dev->dev;
+
+	/* count available regions */
+	for (nregions = 0, i = 0; i < INTEL_DG_NVM_REGIONS; i++) {
+		if (invm->regions[i].name)
+			nregions++;
+	}
+
+	if (!nregions) {
+		dev_err(device, "no regions defined\n");
+		return -ENODEV;
+	}
+
+	nvm = kzalloc(struct_size(nvm, regions, nregions), GFP_KERNEL);
+	if (!nvm)
+		return -ENOMEM;
+
+	kref_init(&nvm->refcnt);
+
+	for (n = 0, i = 0; i < INTEL_DG_NVM_REGIONS; i++) {
+		if (!invm->regions[i].name)
+			continue;
+
+		char *name = kasprintf(GFP_KERNEL, "%s.%s",
+				       dev_name(&aux_dev->dev), invm->regions[i].name);
+		if (!name)
+			continue;
+		nvm->regions[n].name = name;
+		nvm->regions[n].id = i;
+		n++;
+	}
+	nvm->nregions = n; /* in case where kasprintf fail */
+
+	nvm->base = devm_ioremap_resource(device, &invm->bar);
+	if (IS_ERR(nvm->base)) {
+		ret = PTR_ERR(nvm->base);
+		goto err;
+	}
+
+	dev_set_drvdata(&aux_dev->dev, nvm);
+
+	return 0;
+
+err:
+	kref_put(&nvm->refcnt, intel_dg_nvm_release);
+	return ret;
+}
+
+static void intel_dg_mtd_remove(struct auxiliary_device *aux_dev)
+{
+	struct intel_dg_nvm *nvm = dev_get_drvdata(&aux_dev->dev);
+
+	if (!nvm)
+		return;
+
+	dev_set_drvdata(&aux_dev->dev, NULL);
+
+	kref_put(&nvm->refcnt, intel_dg_nvm_release);
+}
+
+static const struct auxiliary_device_id intel_dg_mtd_id_table[] = {
+	{
+		.name = "i915.nvm",
+	},
+	{
+		.name = "xe.nvm",
+	},
+	{
+		/* sentinel */
+	}
+};
+MODULE_DEVICE_TABLE(auxiliary, intel_dg_mtd_id_table);
+
+static struct auxiliary_driver intel_dg_mtd_driver = {
+	.probe  = intel_dg_mtd_probe,
+	.remove = intel_dg_mtd_remove,
+	.driver = {
+		/* auxiliary_driver_register() sets .name to be the modname */
+	},
+	.id_table = intel_dg_mtd_id_table
+};
+module_auxiliary_driver(intel_dg_mtd_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Intel Corporation");
+MODULE_DESCRIPTION("Intel DGFX MTD driver");
diff --git a/include/linux/intel_dg_nvm_aux.h b/include/linux/intel_dg_nvm_aux.h
new file mode 100644
index 000000000000..00b6c1301bd8
--- /dev/null
+++ b/include/linux/intel_dg_nvm_aux.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright(c) 2019-2025, Intel Corporation. All rights reserved.
+ */
+
+#ifndef __INTEL_DG_NVM_AUX_H__
+#define __INTEL_DG_NVM_AUX_H__
+
+#include <linux/auxiliary_bus.h>
+#include <linux/container_of.h>
+#include <linux/ioport.h>
+#include <linux/types.h>
+
+#define INTEL_DG_NVM_REGIONS 13
+
+struct intel_dg_nvm_region {
+	const char *name;
+};
+
+struct intel_dg_nvm_dev {
+	struct auxiliary_device aux_dev;
+	bool writable_override;
+	struct resource bar;
+	const struct intel_dg_nvm_region *regions;
+};
+
+#define auxiliary_dev_to_intel_dg_nvm_dev(auxiliary_dev) \
+	container_of(auxiliary_dev, struct intel_dg_nvm_dev, aux_dev)
+
+#endif /* __INTEL_DG_NVM_AUX_H__ */
-- 
cgit v1.2.3


From a1c940cbf505e2342ebb5ea996f0acf205d6af7b Mon Sep 17 00:00:00 2001
From: Reuven Abliyev <reuven.abliyev@intel.com>
Date: Tue, 17 Jun 2025 17:51:58 +0300
Subject: drm/xe/nvm: add support for non-posted erase

Erase command is slow on discrete graphics storage
and may overshoot the PCI completion timeout.
BMG introduces the ability to have non-posted erase.
Add driver support for non-posted erase with polling
for erase completion.

Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Acked-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Signed-off-by: Reuven Abliyev <reuven.abliyev@intel.com>
Signed-off-by: Alexander Usyskin <alexander.usyskin@intel.com>
Link: https://lore.kernel.org/r/20250617145159.3803852-9-alexander.usyskin@intel.com
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 drivers/gpu/drm/xe/xe_nvm.c        | 25 +++++++++++++++++++++++
 drivers/mtd/devices/mtd_intel_dg.c | 42 ++++++++++++++++++++++++++++++++++++--
 include/linux/intel_dg_nvm_aux.h   |  2 ++
 3 files changed, 67 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/xe/xe_nvm.c b/drivers/gpu/drm/xe/xe_nvm.c
index 20aa3b5d3637..61b0a1531a53 100644
--- a/drivers/gpu/drm/xe/xe_nvm.c
+++ b/drivers/gpu/drm/xe/xe_nvm.c
@@ -14,7 +14,15 @@
 #include "xe_sriov.h"
 
 #define GEN12_GUNIT_NVM_BASE 0x00102040
+#define GEN12_DEBUG_NVM_BASE 0x00101018
+
+#define GEN12_CNTL_PROTECTED_NVM_REG 0x0010100C
+
 #define GEN12_GUNIT_NVM_SIZE 0x80
+#define GEN12_DEBUG_NVM_SIZE 0x4
+
+#define NVM_NON_POSTED_ERASE_CHICKEN_BIT BIT(13)
+
 #define HECI_FW_STATUS_2_NVM_ACCESS_MODE BIT(3)
 
 static const struct intel_dg_nvm_region regions[INTEL_DG_NVM_REGIONS] = {
@@ -29,6 +37,16 @@ static void xe_nvm_release_dev(struct device *dev)
 {
 }
 
+static bool xe_nvm_non_posted_erase(struct xe_device *xe)
+{
+	struct xe_gt *gt = xe_root_mmio_gt(xe);
+
+	if (xe->info.platform != XE_BATTLEMAGE)
+		return false;
+	return !(xe_mmio_read32(&gt->mmio, XE_REG(GEN12_CNTL_PROTECTED_NVM_REG)) &
+		 NVM_NON_POSTED_ERASE_CHICKEN_BIT);
+}
+
 static bool xe_nvm_writable_override(struct xe_device *xe)
 {
 	struct xe_gt *gt = xe_root_mmio_gt(xe);
@@ -86,6 +104,7 @@ int xe_nvm_init(struct xe_device *xe)
 	nvm = xe->nvm;
 
 	nvm->writable_override = xe_nvm_writable_override(xe);
+	nvm->non_posted_erase = xe_nvm_non_posted_erase(xe);
 	nvm->bar.parent = &pdev->resource[0];
 	nvm->bar.start = GEN12_GUNIT_NVM_BASE + pdev->resource[0].start;
 	nvm->bar.end = nvm->bar.start + GEN12_GUNIT_NVM_SIZE - 1;
@@ -93,6 +112,12 @@ int xe_nvm_init(struct xe_device *xe)
 	nvm->bar.desc = IORES_DESC_NONE;
 	nvm->regions = regions;
 
+	nvm->bar2.parent = &pdev->resource[0];
+	nvm->bar2.start = GEN12_DEBUG_NVM_BASE + pdev->resource[0].start;
+	nvm->bar2.end = nvm->bar2.start + GEN12_DEBUG_NVM_SIZE - 1;
+	nvm->bar2.flags = IORESOURCE_MEM;
+	nvm->bar2.desc = IORES_DESC_NONE;
+
 	aux_dev = &nvm->aux_dev;
 
 	aux_dev->name = "nvm";
diff --git a/drivers/mtd/devices/mtd_intel_dg.c b/drivers/mtd/devices/mtd_intel_dg.c
index 97e1dc1ada5d..b438ee5aacc3 100644
--- a/drivers/mtd/devices/mtd_intel_dg.c
+++ b/drivers/mtd/devices/mtd_intel_dg.c
@@ -25,6 +25,9 @@ struct intel_dg_nvm {
 	struct mtd_info mtd;
 	struct mutex lock; /* region access lock */
 	void __iomem *base;
+	void __iomem *base2;
+	bool non_posted_erase;
+
 	size_t size;
 	unsigned int nregions;
 	struct {
@@ -41,6 +44,7 @@ struct intel_dg_nvm {
 #define NVM_VALSIG_REG        0x00000010
 #define NVM_ADDRESS_REG       0x00000040
 #define NVM_REGION_ID_REG     0x00000044
+#define NVM_DEBUG_REG         0x00000000
 /*
  * [15:0]-Erase size = 0x0010 4K 0x0080 32K 0x0100 64K
  * [23:16]-Reserved
@@ -72,6 +76,9 @@ struct intel_dg_nvm {
 #define NVM_FREG_ADDR_SHIFT 12
 #define NVM_FREG_MIN_REGION_SIZE 0xFFF
 
+#define NVM_NON_POSTED_ERASE_DONE BIT(23)
+#define NVM_NON_POSTED_ERASE_DONE_ITER 3000
+
 static inline void idg_nvm_set_region_id(struct intel_dg_nvm *nvm, u8 region)
 {
 	iowrite32((u32)region, nvm->base + NVM_REGION_ID_REG);
@@ -373,13 +380,32 @@ static ssize_t idg_read(struct intel_dg_nvm *nvm, u8 region,
 static ssize_t
 idg_erase(struct intel_dg_nvm *nvm, u8 region, loff_t from, u64 len, u64 *fail_addr)
 {
+	void __iomem *base2 = nvm->base2;
 	void __iomem *base = nvm->base;
 	const u32 block = 0x10;
+	u32 iter = 0;
+	u32 reg;
 	u64 i;
 
 	for (i = 0; i < len; i += SZ_4K) {
 		iowrite32(from + i, base + NVM_ADDRESS_REG);
 		iowrite32(region << 24 | block, base + NVM_ERASE_REG);
+		if (nvm->non_posted_erase) {
+			/* Wait for Erase Done */
+			reg = ioread32(base2 + NVM_DEBUG_REG);
+			while (!(reg & NVM_NON_POSTED_ERASE_DONE) &&
+			       ++iter < NVM_NON_POSTED_ERASE_DONE_ITER) {
+				msleep(10);
+				reg = ioread32(base2 + NVM_DEBUG_REG);
+			}
+			if (reg & NVM_NON_POSTED_ERASE_DONE) {
+				/* Clear Erase Done */
+				iowrite32(reg, base2 + NVM_DEBUG_REG);
+			} else {
+				*fail_addr = from + i;
+				return -ETIME;
+			}
+		}
 		/* Since the writes are via sgunit
 		 * we cannot do back to back erases.
 		 */
@@ -388,7 +414,8 @@ idg_erase(struct intel_dg_nvm *nvm, u8 region, loff_t from, u64 len, u64 *fail_a
 	return len;
 }
 
-static int intel_dg_nvm_init(struct intel_dg_nvm *nvm, struct device *device)
+static int intel_dg_nvm_init(struct intel_dg_nvm *nvm, struct device *device,
+			     bool non_posted_erase)
 {
 	u32 access_map = 0;
 	unsigned int i, n;
@@ -448,7 +475,10 @@ static int intel_dg_nvm_init(struct intel_dg_nvm *nvm, struct device *device)
 			n++;
 	}
 
+	nvm->non_posted_erase = non_posted_erase;
+
 	dev_dbg(device, "Registered %d regions\n", n);
+	dev_dbg(device, "Non posted erase %d\n", nvm->non_posted_erase);
 
 	/* Need to add 1 to the amount of memory
 	 * so it is reported as an even block
@@ -729,7 +759,15 @@ static int intel_dg_mtd_probe(struct auxiliary_device *aux_dev,
 		goto err;
 	}
 
-	ret = intel_dg_nvm_init(nvm, device);
+	if (invm->non_posted_erase) {
+		nvm->base2 = devm_ioremap_resource(device, &invm->bar2);
+		if (IS_ERR(nvm->base2)) {
+			ret = PTR_ERR(nvm->base2);
+			goto err;
+		}
+	}
+
+	ret = intel_dg_nvm_init(nvm, device, invm->non_posted_erase);
 	if (ret < 0) {
 		dev_err(device, "cannot initialize nvm %d\n", ret);
 		goto err;
diff --git a/include/linux/intel_dg_nvm_aux.h b/include/linux/intel_dg_nvm_aux.h
index 00b6c1301bd8..625d46a6b96e 100644
--- a/include/linux/intel_dg_nvm_aux.h
+++ b/include/linux/intel_dg_nvm_aux.h
@@ -20,7 +20,9 @@ struct intel_dg_nvm_region {
 struct intel_dg_nvm_dev {
 	struct auxiliary_device aux_dev;
 	bool writable_override;
+	bool non_posted_erase;
 	struct resource bar;
+	struct resource bar2;
 	const struct intel_dg_nvm_region *regions;
 };
 
-- 
cgit v1.2.3


From 7484e15dbb016d9d40f8c6e0475810212ae181db Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 17 Jun 2025 00:09:51 -0400
Subject: replace collect_mounts()/drop_collected_mounts() with a safer variant

collect_mounts() has several problems - one can't iterate over the results
directly, so it has to be done with callback passed to iterate_mounts();
it has an oopsable race with d_invalidate(); it creates temporary clones
of mounts invisibly for sync umount (IOW, you can have non-lazy umount
succeed leaving filesystem not mounted anywhere and yet still busy).

A saner approach is to give caller an array of struct path that would pin
every mount in a subtree, without cloning any mounts.

        * collect_mounts()/drop_collected_mounts()/iterate_mounts() is gone
        * collect_paths(where, preallocated, size) gives either ERR_PTR(-E...) or
a pointer to array of struct path, one for each chunk of tree visible under
'where' (i.e. the first element is a copy of where, followed by (mount,root)
for everything mounted under it - the same set collect_mounts() would give).
Unlike collect_mounts(), the mounts are *not* cloned - we just get pinning
references to the roots of subtrees in the caller's namespace.
        Array is terminated by {NULL, NULL} struct path.  If it fits into
preallocated array (on-stack, normally), that's where it goes; otherwise
it's allocated by kmalloc_array().  Passing 0 as size means that 'preallocated'
is ignored (and expected to be NULL).
        * drop_collected_paths(paths, preallocated) is given the array returned
by an earlier call of collect_paths() and the preallocated array passed to that
call.  All mount/dentry references are dropped and array is kfree'd if it's not
equal to 'preallocated'.
        * instead of iterate_mounts(), users should just iterate over array
of struct path - nothing exotic is needed for that.  Existing users (all in
audit_tree.c) are converted.

[folded a fix for braino reported by Venkat Rao Bagalkote <venkat88@linux.ibm.com>]

Fixes: 80b5dce8c59b0 ("vfs: Add a function to lazily unmount all mounts from any dentry")
Tested-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/porting.rst |  9 ++++
 fs/namespace.c                        | 97 +++++++++++++++++++++--------------
 fs/pnode.h                            |  2 -
 include/linux/mount.h                 |  6 +--
 kernel/audit_tree.c                   | 63 ++++++++++++-----------
 5 files changed, 104 insertions(+), 73 deletions(-)

(limited to 'include')

diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index 3616d7161dab..a5734bdd1cc7 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -1249,3 +1249,12 @@ Using try_lookup_noperm() will require linux/namei.h to be included.
 
 Calling conventions for ->d_automount() have changed; we should *not* grab
 an extra reference to new mount - it should be returned with refcount 1.
+
+---
+
+collect_mounts()/drop_collected_mounts()/iterate_mounts() are gone now.
+Replacement is collect_paths()/drop_collected_paths(), with no special
+iterator needed.  Instead of a cloned mount tree, the new interface returns
+an array of struct path, one for each mount collect_mounts() would've
+created.  These struct path point to locations in the caller's namespace
+that would be roots of the cloned mounts.
diff --git a/fs/namespace.c b/fs/namespace.c
index e13d9ab4f564..14601ec4c2c5 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2310,21 +2310,62 @@ out:
 	return dst_mnt;
 }
 
-/* Caller should check returned pointer for errors */
+static inline bool extend_array(struct path **res, struct path **to_free,
+				unsigned n, unsigned *count, unsigned new_count)
+{
+	struct path *p;
+
+	if (likely(n < *count))
+		return true;
+	p = kmalloc_array(new_count, sizeof(struct path), GFP_KERNEL);
+	if (p && *count)
+		memcpy(p, *res, *count * sizeof(struct path));
+	*count = new_count;
+	kfree(*to_free);
+	*to_free = *res = p;
+	return p;
+}
 
-struct vfsmount *collect_mounts(const struct path *path)
+struct path *collect_paths(const struct path *path,
+			      struct path *prealloc, unsigned count)
 {
-	struct mount *tree;
-	namespace_lock();
-	if (!check_mnt(real_mount(path->mnt)))
-		tree = ERR_PTR(-EINVAL);
-	else
-		tree = copy_tree(real_mount(path->mnt), path->dentry,
-				 CL_COPY_ALL | CL_PRIVATE);
-	namespace_unlock();
-	if (IS_ERR(tree))
-		return ERR_CAST(tree);
-	return &tree->mnt;
+	struct mount *root = real_mount(path->mnt);
+	struct mount *child;
+	struct path *res = prealloc, *to_free = NULL;
+	unsigned n = 0;
+
+	guard(rwsem_read)(&namespace_sem);
+
+	if (!check_mnt(root))
+		return ERR_PTR(-EINVAL);
+	if (!extend_array(&res, &to_free, 0, &count, 32))
+		return ERR_PTR(-ENOMEM);
+	res[n++] = *path;
+	list_for_each_entry(child, &root->mnt_mounts, mnt_child) {
+		if (!is_subdir(child->mnt_mountpoint, path->dentry))
+			continue;
+		for (struct mount *m = child; m; m = next_mnt(m, child)) {
+			if (!extend_array(&res, &to_free, n, &count, 2 * count))
+				return ERR_PTR(-ENOMEM);
+			res[n].mnt = &m->mnt;
+			res[n].dentry = m->mnt.mnt_root;
+			n++;
+		}
+	}
+	if (!extend_array(&res, &to_free, n, &count, count + 1))
+		return ERR_PTR(-ENOMEM);
+	memset(res + n, 0, (count - n) * sizeof(struct path));
+	for (struct path *p = res; p->mnt; p++)
+		path_get(p);
+	return res;
+}
+
+void drop_collected_paths(struct path *paths, struct path *prealloc)
+{
+	for (struct path *p = paths; p->mnt; p++)
+		path_put(p);
+	if (paths != prealloc)
+		kfree(paths);
 }
 
 static void free_mnt_ns(struct mnt_namespace *);
@@ -2401,15 +2442,6 @@ void dissolve_on_fput(struct vfsmount *mnt)
 	free_mnt_ns(ns);
 }
 
-void drop_collected_mounts(struct vfsmount *mnt)
-{
-	namespace_lock();
-	lock_mount_hash();
-	umount_tree(real_mount(mnt), 0);
-	unlock_mount_hash();
-	namespace_unlock();
-}
-
 static bool __has_locked_children(struct mount *mnt, struct dentry *dentry)
 {
 	struct mount *child;
@@ -2511,21 +2543,6 @@ struct vfsmount *clone_private_mount(const struct path *path)
 }
 EXPORT_SYMBOL_GPL(clone_private_mount);
 
-int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
-		   struct vfsmount *root)
-{
-	struct mount *mnt;
-	int res = f(root, arg);
-	if (res)
-		return res;
-	list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) {
-		res = f(&mnt->mnt, arg);
-		if (res)
-			return res;
-	}
-	return 0;
-}
-
 static void lock_mnt_tree(struct mount *mnt)
 {
 	struct mount *p;
@@ -6262,7 +6279,11 @@ void put_mnt_ns(struct mnt_namespace *ns)
 {
 	if (!refcount_dec_and_test(&ns->ns.count))
 		return;
-	drop_collected_mounts(&ns->root->mnt);
+	namespace_lock();
+	lock_mount_hash();
+	umount_tree(ns->root, 0);
+	unlock_mount_hash();
+	namespace_unlock();
 	free_mnt_ns(ns);
 }
 
diff --git a/fs/pnode.h b/fs/pnode.h
index 34b6247af01d..2d026fb98b18 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -28,8 +28,6 @@
 #define CL_SHARED_TO_SLAVE	0x20
 #define CL_COPY_MNT_NS_FILE	0x40
 
-#define CL_COPY_ALL		(CL_COPY_UNBINDABLE | CL_COPY_MNT_NS_FILE)
-
 static inline void set_mnt_shared(struct mount *mnt)
 {
 	mnt->mnt.mnt_flags &= ~MNT_SHARED_MASK;
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 4880f434c021..1a508beba446 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -116,10 +116,8 @@ extern int may_umount_tree(struct vfsmount *);
 extern int may_umount(struct vfsmount *);
 int do_mount(const char *, const char __user *,
 		     const char *, unsigned long, void *);
-extern struct vfsmount *collect_mounts(const struct path *);
-extern void drop_collected_mounts(struct vfsmount *);
-extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *,
-			  struct vfsmount *);
+extern struct path *collect_paths(const struct path *, struct path *, unsigned);
+extern void drop_collected_paths(struct path *, struct path *);
 extern void kern_unmount_array(struct vfsmount *mnt[], unsigned int num);
 
 extern int cifs_root_data(char **dev, char **opts);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index f2f38903b2fe..b0eae2a3c895 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -668,12 +668,6 @@ int audit_remove_tree_rule(struct audit_krule *rule)
 	return 0;
 }
 
-static int compare_root(struct vfsmount *mnt, void *arg)
-{
-	return inode_to_key(d_backing_inode(mnt->mnt_root)) ==
-	       (unsigned long)arg;
-}
-
 void audit_trim_trees(void)
 {
 	struct list_head cursor;
@@ -683,8 +677,9 @@ void audit_trim_trees(void)
 	while (cursor.next != &tree_list) {
 		struct audit_tree *tree;
 		struct path path;
-		struct vfsmount *root_mnt;
 		struct audit_node *node;
+		struct path *paths;
+		struct path array[16];
 		int err;
 
 		tree = container_of(cursor.next, struct audit_tree, list);
@@ -696,9 +691,9 @@ void audit_trim_trees(void)
 		if (err)
 			goto skip_it;
 
-		root_mnt = collect_mounts(&path);
+		paths = collect_paths(&path, array, 16);
 		path_put(&path);
-		if (IS_ERR(root_mnt))
+		if (IS_ERR(paths))
 			goto skip_it;
 
 		spin_lock(&hash_lock);
@@ -706,14 +701,17 @@ void audit_trim_trees(void)
 			struct audit_chunk *chunk = find_chunk(node);
 			/* this could be NULL if the watch is dying else where... */
 			node->index |= 1U<<31;
-			if (iterate_mounts(compare_root,
-					   (void *)(chunk->key),
-					   root_mnt))
-				node->index &= ~(1U<<31);
+			for (struct path *p = paths; p->dentry; p++) {
+				struct inode *inode = p->dentry->d_inode;
+				if (inode_to_key(inode) == chunk->key) {
+					node->index &= ~(1U<<31);
+					break;
+				}
+			}
 		}
 		spin_unlock(&hash_lock);
 		trim_marked(tree);
-		drop_collected_mounts(root_mnt);
+		drop_collected_paths(paths, array);
 skip_it:
 		put_tree(tree);
 		mutex_lock(&audit_filter_mutex);
@@ -742,9 +740,14 @@ void audit_put_tree(struct audit_tree *tree)
 	put_tree(tree);
 }
 
-static int tag_mount(struct vfsmount *mnt, void *arg)
+static int tag_mounts(struct path *paths, struct audit_tree *tree)
 {
-	return tag_chunk(d_backing_inode(mnt->mnt_root), arg);
+	for (struct path *p = paths; p->dentry; p++) {
+		int err = tag_chunk(p->dentry->d_inode, tree);
+		if (err)
+			return err;
+	}
+	return 0;
 }
 
 /*
@@ -801,7 +804,8 @@ int audit_add_tree_rule(struct audit_krule *rule)
 {
 	struct audit_tree *seed = rule->tree, *tree;
 	struct path path;
-	struct vfsmount *mnt;
+	struct path array[16];
+	struct path *paths;
 	int err;
 
 	rule->tree = NULL;
@@ -828,16 +832,16 @@ int audit_add_tree_rule(struct audit_krule *rule)
 	err = kern_path(tree->pathname, 0, &path);
 	if (err)
 		goto Err;
-	mnt = collect_mounts(&path);
+	paths = collect_paths(&path, array, 16);
 	path_put(&path);
-	if (IS_ERR(mnt)) {
-		err = PTR_ERR(mnt);
+	if (IS_ERR(paths)) {
+		err = PTR_ERR(paths);
 		goto Err;
 	}
 
 	get_tree(tree);
-	err = iterate_mounts(tag_mount, tree, mnt);
-	drop_collected_mounts(mnt);
+	err = tag_mounts(paths, tree);
+	drop_collected_paths(paths, array);
 
 	if (!err) {
 		struct audit_node *node;
@@ -872,20 +876,21 @@ int audit_tag_tree(char *old, char *new)
 	struct list_head cursor, barrier;
 	int failed = 0;
 	struct path path1, path2;
-	struct vfsmount *tagged;
+	struct path array[16];
+	struct path *paths;
 	int err;
 
 	err = kern_path(new, 0, &path2);
 	if (err)
 		return err;
-	tagged = collect_mounts(&path2);
+	paths = collect_paths(&path2, array, 16);
 	path_put(&path2);
-	if (IS_ERR(tagged))
-		return PTR_ERR(tagged);
+	if (IS_ERR(paths))
+		return PTR_ERR(paths);
 
 	err = kern_path(old, 0, &path1);
 	if (err) {
-		drop_collected_mounts(tagged);
+		drop_collected_paths(paths, array);
 		return err;
 	}
 
@@ -914,7 +919,7 @@ int audit_tag_tree(char *old, char *new)
 			continue;
 		}
 
-		failed = iterate_mounts(tag_mount, tree, tagged);
+		failed = tag_mounts(paths, tree);
 		if (failed) {
 			put_tree(tree);
 			mutex_lock(&audit_filter_mutex);
@@ -955,7 +960,7 @@ int audit_tag_tree(char *old, char *new)
 	list_del(&cursor);
 	mutex_unlock(&audit_filter_mutex);
 	path_put(&path1);
-	drop_collected_mounts(tagged);
+	drop_collected_paths(paths, array);
 	return failed;
 }
 
-- 
cgit v1.2.3


From fc2898ea793a48bc4b74b61cde2d8656f20efdf4 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Mon, 23 Jun 2025 01:30:49 +0100
Subject: workqueue: Remove unused work_on_cpu_safe

The last use of the work_on_cpu_safe() macro was removed recently by
commit 9cda46babdfe ("crypto: n2 - remove Niagara2 SPU driver").

Remove it, and the work_on_cpu_safe_key() function it calls.

Signed-off-by: Dr. David Alan Gilbert <linux@treblig.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h | 13 -------------
 kernel/workqueue.c        | 25 -------------------------
 2 files changed, 38 deletions(-)

(limited to 'include')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 83d158bb2791..45d5dd470ff6 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -847,19 +847,6 @@ long work_on_cpu_key(int cpu, long (*fn)(void *),
 	work_on_cpu_key(_cpu, _fn, _arg, &__key);	\
 })
 
-long work_on_cpu_safe_key(int cpu, long (*fn)(void *),
-			  void *arg, struct lock_class_key *key);
-
-/*
- * A new key is defined for each caller to make sure the work
- * associated with the function doesn't share its locking class.
- */
-#define work_on_cpu_safe(_cpu, _fn, _arg)		\
-({							\
-	static struct lock_class_key __key;		\
-							\
-	work_on_cpu_safe_key(_cpu, _fn, _arg, &__key);	\
-})
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_FREEZER
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 086452e339c4..12c3080332bd 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -6771,31 +6771,6 @@ long work_on_cpu_key(int cpu, long (*fn)(void *),
 	return wfc.ret;
 }
 EXPORT_SYMBOL_GPL(work_on_cpu_key);
-
-/**
- * work_on_cpu_safe_key - run a function in thread context on a particular cpu
- * @cpu: the cpu to run on
- * @fn:  the function to run
- * @arg: the function argument
- * @key: The lock class key for lock debugging purposes
- *
- * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold
- * any locks which would prevent @fn from completing.
- *
- * Return: The value @fn returns.
- */
-long work_on_cpu_safe_key(int cpu, long (*fn)(void *),
-			  void *arg, struct lock_class_key *key)
-{
-	long ret = -ENODEV;
-
-	cpus_read_lock();
-	if (cpu_online(cpu))
-		ret = work_on_cpu_key(cpu, fn, arg, key);
-	cpus_read_unlock();
-	return ret;
-}
-EXPORT_SYMBOL_GPL(work_on_cpu_safe_key);
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_FREEZER
-- 
cgit v1.2.3


From 3aa54d162490f14d1f1fdf3b3d1170b2ea50276b Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Wed, 18 Jun 2025 11:11:29 +0200
Subject: PCI/pwrctrl: Fix the kerneldoc tag for private fields

The correct tag for marking private fields in kerneldoc is "private:", not
capitalized "Private:". Fix the pci_pwrctrl struct to silence the following
warnings:

  Warning: include/linux/pci-pwrctrl.h:45 struct member 'nb' not described in 'pci_pwrctrl'
  Warning: include/linux/pci-pwrctrl.h:45 struct member 'link' not described in 'pci_pwrctrl'
  Warning: include/linux/pci-pwrctrl.h:45 struct member 'work' not described in 'pci_pwrctrl'

Fixes: 4565d2652a37 ("PCI/pwrctl: Add PCI power control core code")
Reported-by: Bjorn Helgaas <helgaas@kernel.org>
Closes: https://lore.kernel.org/all/20250617233539.GA1177120@bhelgaas/
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://patch.msgid.link/20250618091129.44810-1-brgl@bgdev.pl
---
 include/linux/pci-pwrctrl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/pci-pwrctrl.h b/include/linux/pci-pwrctrl.h
index 7d439b0675e9..4aefc7901cd1 100644
--- a/include/linux/pci-pwrctrl.h
+++ b/include/linux/pci-pwrctrl.h
@@ -39,7 +39,7 @@ struct device_link;
 struct pci_pwrctrl {
 	struct device *dev;
 
-	/* Private: don't use. */
+	/* private: internal use only */
 	struct notifier_block nb;
 	struct device_link *link;
 	struct work_struct work;
-- 
cgit v1.2.3


From e84a4927a404f369c842c19de93b216627fcc690 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 20 Jun 2025 13:30:00 +0000
Subject: net: annotate races around sk->sk_uid
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

sk->sk_uid can be read while another thread changes its
value in sockfs_setattr().

Add sk_uid(const struct sock *sk) helper to factorize the needed
READ_ONCE() annotations, and add corresponding WRITE_ONCE()
where needed.

Fixes: 86741ec25462 ("net: core: Add a UID field to struct sock.")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Lorenzo Colitti <lorenzo@google.com>
Reviewed-by: Maciej Żenczykowski <maze@google.com>
Link: https://patch.msgid.link/20250620133001.4090592-2-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/route.h              |  4 ++--
 include/net/sock.h               | 12 ++++++++++--
 net/ipv4/inet_connection_sock.c  |  4 ++--
 net/ipv4/ping.c                  |  2 +-
 net/ipv4/raw.c                   |  2 +-
 net/ipv4/route.c                 |  3 ++-
 net/ipv4/syncookies.c            |  3 ++-
 net/ipv4/udp.c                   |  3 ++-
 net/ipv6/af_inet6.c              |  2 +-
 net/ipv6/datagram.c              |  2 +-
 net/ipv6/inet6_connection_sock.c |  4 ++--
 net/ipv6/ping.c                  |  2 +-
 net/ipv6/raw.c                   |  2 +-
 net/ipv6/route.c                 |  4 ++--
 net/ipv6/syncookies.c            |  2 +-
 net/ipv6/tcp_ipv6.c              |  2 +-
 net/ipv6/udp.c                   |  5 +++--
 net/l2tp/l2tp_ip6.c              |  2 +-
 net/mptcp/protocol.c             |  2 +-
 net/socket.c                     |  8 +++++---
 20 files changed, 42 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/include/net/route.h b/include/net/route.h
index 8e39aa822cf9..3d3d6048ffca 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -153,7 +153,7 @@ static inline void inet_sk_init_flowi4(const struct inet_sock *inet,
 			   ip_sock_rt_tos(sk), ip_sock_rt_scope(sk),
 			   sk->sk_protocol, inet_sk_flowi_flags(sk), daddr,
 			   inet->inet_saddr, inet->inet_dport,
-			   inet->inet_sport, sk->sk_uid);
+			   inet->inet_sport, sk_uid(sk));
 	security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4));
 }
 
@@ -331,7 +331,7 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst,
 
 	flowi4_init_output(fl4, oif, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk),
 			   ip_sock_rt_scope(sk), protocol, flow_flags, dst,
-			   src, dport, sport, sk->sk_uid);
+			   src, dport, sport, sk_uid(sk));
 }
 
 static inline struct rtable *ip_route_connect(struct flowi4 *fl4, __be32 dst,
diff --git a/include/net/sock.h b/include/net/sock.h
index ca532227cbfd..fc5e6f66b00a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2076,6 +2076,7 @@ static inline void sock_orphan(struct sock *sk)
 	sock_set_flag(sk, SOCK_DEAD);
 	sk_set_socket(sk, NULL);
 	sk->sk_wq  = NULL;
+	/* Note: sk_uid is unchanged. */
 	write_unlock_bh(&sk->sk_callback_lock);
 }
 
@@ -2086,18 +2087,25 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
 	rcu_assign_pointer(sk->sk_wq, &parent->wq);
 	parent->sk = sk;
 	sk_set_socket(sk, parent);
-	sk->sk_uid = SOCK_INODE(parent)->i_uid;
+	WRITE_ONCE(sk->sk_uid, SOCK_INODE(parent)->i_uid);
 	security_sock_graft(sk, parent);
 	write_unlock_bh(&sk->sk_callback_lock);
 }
 
 kuid_t sock_i_uid(struct sock *sk);
+
+static inline kuid_t sk_uid(const struct sock *sk)
+{
+	/* Paired with WRITE_ONCE() in sockfs_setattr() */
+	return READ_ONCE(sk->sk_uid);
+}
+
 unsigned long __sock_i_ino(struct sock *sk);
 unsigned long sock_i_ino(struct sock *sk);
 
 static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk)
 {
-	return sk ? sk->sk_uid : make_kuid(net->user_ns, 0);
+	return sk ? sk_uid(sk) : make_kuid(net->user_ns, 0);
 }
 
 static inline u32 net_tx_rndhash(void)
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 6906bedad19a..46750c96d08e 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -812,7 +812,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk,
 			   sk->sk_protocol, inet_sk_flowi_flags(sk),
 			   (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
 			   ireq->ir_loc_addr, ireq->ir_rmt_port,
-			   htons(ireq->ir_num), sk->sk_uid);
+			   htons(ireq->ir_num), sk_uid(sk));
 	security_req_classify_flow(req, flowi4_to_flowi_common(fl4));
 	rt = ip_route_output_flow(net, fl4, sk);
 	if (IS_ERR(rt))
@@ -849,7 +849,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
 			   sk->sk_protocol, inet_sk_flowi_flags(sk),
 			   (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
 			   ireq->ir_loc_addr, ireq->ir_rmt_port,
-			   htons(ireq->ir_num), sk->sk_uid);
+			   htons(ireq->ir_num), sk_uid(sk));
 	security_req_classify_flow(req, flowi4_to_flowi_common(fl4));
 	rt = ip_route_output_flow(net, fl4, sk);
 	if (IS_ERR(rt))
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index c14baa6589c7..4eacaf00e2e9 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -781,7 +781,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark,
 			   ipc.tos & INET_DSCP_MASK, scope,
 			   sk->sk_protocol, inet_sk_flowi_flags(sk), faddr,
-			   saddr, 0, 0, sk->sk_uid);
+			   saddr, 0, 0, sk_uid(sk));
 
 	fl4.fl4_icmp_type = user_icmph.type;
 	fl4.fl4_icmp_code = user_icmph.code;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 6aace4d55733..32f942d0f944 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -610,7 +610,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 			   hdrincl ? ipc.protocol : sk->sk_protocol,
 			   inet_sk_flowi_flags(sk) |
 			    (hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
-			   daddr, saddr, 0, 0, sk->sk_uid);
+			   daddr, saddr, 0, 0, sk_uid(sk));
 
 	fl4.fl4_icmp_type = 0;
 	fl4.fl4_icmp_code = 0;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 3ddf6bf40357..3ff2bd56d050 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -556,7 +556,8 @@ static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 			   inet_test_bit(HDRINCL, sk) ?
 				IPPROTO_RAW : sk->sk_protocol,
 			   inet_sk_flowi_flags(sk),
-			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
+			   daddr, inet->inet_saddr, 0, 0,
+			   sk_uid(sk));
 	rcu_read_unlock();
 }
 
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 5459a78b9809..eb0819463fae 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -454,7 +454,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 			   ip_sock_rt_tos(sk), ip_sock_rt_scope(sk),
 			   IPPROTO_TCP, inet_sk_flowi_flags(sk),
 			   opt->srr ? opt->faddr : ireq->ir_rmt_addr,
-			   ireq->ir_loc_addr, th->source, th->dest, sk->sk_uid);
+			   ireq->ir_loc_addr, th->source, th->dest,
+			   sk_uid(sk));
 	security_req_classify_flow(req, flowi4_to_flowi_common(&fl4));
 	rt = ip_route_output_key(net, &fl4);
 	if (IS_ERR(rt)) {
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index dde52b8050b8..f94bb222aa2d 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1445,7 +1445,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark,
 				   ipc.tos & INET_DSCP_MASK, scope,
 				   sk->sk_protocol, flow_flags, faddr, saddr,
-				   dport, inet->inet_sport, sk->sk_uid);
+				   dport, inet->inet_sport,
+				   sk_uid(sk));
 
 		security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4));
 		rt = ip_route_output_flow(net, fl4, sk);
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index acaff1296783..1992621e3f3f 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -842,7 +842,7 @@ int inet6_sk_rebuild_header(struct sock *sk)
 		fl6.flowi6_mark = sk->sk_mark;
 		fl6.fl6_dport = inet->inet_dport;
 		fl6.fl6_sport = inet->inet_sport;
-		fl6.flowi6_uid = sk->sk_uid;
+		fl6.flowi6_uid = sk_uid(sk);
 		security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
 
 		rcu_read_lock();
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index fff78496803d..83f5aa5e133a 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -53,7 +53,7 @@ static void ip6_datagram_flow_key_init(struct flowi6 *fl6,
 	fl6->fl6_dport = inet->inet_dport;
 	fl6->fl6_sport = inet->inet_sport;
 	fl6->flowlabel = ip6_make_flowinfo(np->tclass, np->flow_label);
-	fl6->flowi6_uid = sk->sk_uid;
+	fl6->flowi6_uid = sk_uid(sk);
 
 	if (!oif)
 		oif = np->sticky_pktinfo.ipi6_ifindex;
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 8f500eaf33cf..333e43434dd7 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -45,7 +45,7 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk,
 	fl6->flowi6_mark = ireq->ir_mark;
 	fl6->fl6_dport = ireq->ir_rmt_port;
 	fl6->fl6_sport = htons(ireq->ir_num);
-	fl6->flowi6_uid = sk->sk_uid;
+	fl6->flowi6_uid = sk_uid(sk);
 	security_req_classify_flow(req, flowi6_to_flowi_common(fl6));
 
 	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p);
@@ -79,7 +79,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk,
 	fl6->flowi6_mark = sk->sk_mark;
 	fl6->fl6_sport = inet->inet_sport;
 	fl6->fl6_dport = inet->inet_dport;
-	fl6->flowi6_uid = sk->sk_uid;
+	fl6->flowi6_uid = sk_uid(sk);
 	security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6));
 
 	rcu_read_lock();
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 84d90dd8b3f0..82b0492923d4 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -142,7 +142,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	fl6.saddr = np->saddr;
 	fl6.daddr = *daddr;
 	fl6.flowi6_mark = ipc6.sockc.mark;
-	fl6.flowi6_uid = sk->sk_uid;
+	fl6.flowi6_uid = sk_uid(sk);
 	fl6.fl6_icmp_type = user_icmph.icmp6_type;
 	fl6.fl6_icmp_code = user_icmph.icmp6_code;
 	security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index fda640ebd53f..4c3f8245c40f 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -777,7 +777,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	memset(&fl6, 0, sizeof(fl6));
 
 	fl6.flowi6_mark = ipc6.sockc.mark;
-	fl6.flowi6_uid = sk->sk_uid;
+	fl6.flowi6_uid = sk_uid(sk);
 
 	if (sin6) {
 		if (addr_len < SIN6_LEN_RFC2133)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index df0caffefb38..d7a9b5bf30c8 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3011,7 +3011,7 @@ void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
 		oif = l3mdev_master_ifindex(skb->dev);
 
 	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, READ_ONCE(sk->sk_mark),
-			sk->sk_uid);
+			sk_uid(sk));
 
 	dst = __sk_dst_get(sk);
 	if (!dst || !dst->obsolete ||
@@ -3233,7 +3233,7 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
 {
 	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if,
-		     READ_ONCE(sk->sk_mark), sk->sk_uid);
+		     READ_ONCE(sk->sk_mark), sk_uid(sk));
 }
 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
 
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 9d83eadd308b..f0ee1a909771 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -236,7 +236,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 		fl6.flowi6_mark = ireq->ir_mark;
 		fl6.fl6_dport = ireq->ir_rmt_port;
 		fl6.fl6_sport = inet_sk(sk)->inet_sport;
-		fl6.flowi6_uid = sk->sk_uid;
+		fl6.flowi6_uid = sk_uid(sk);
 		security_req_classify_flow(req, flowi6_to_flowi_common(&fl6));
 
 		dst = ip6_dst_lookup_flow(net, sk, &fl6, final_p);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index e8e68a142649..f61b0396ef6b 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -269,7 +269,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	fl6.fl6_sport = inet->inet_sport;
 	if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !fl6.fl6_sport)
 		fl6.flowi6_flags = FLOWI_FLAG_ANY_SPORT;
-	fl6.flowi6_uid = sk->sk_uid;
+	fl6.flowi6_uid = sk_uid(sk);
 
 	opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk));
 	final_p = fl6_update_dst(&fl6, opt, &final);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 7317f8e053f1..ebb95d8bc681 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -750,7 +750,8 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	if (type == NDISC_REDIRECT) {
 		if (tunnel) {
 			ip6_redirect(skb, sock_net(sk), inet6_iif(skb),
-				     READ_ONCE(sk->sk_mark), sk->sk_uid);
+				     READ_ONCE(sk->sk_mark),
+				     sk_uid(sk));
 		} else {
 			ip6_sk_redirect(skb, sk);
 		}
@@ -1620,7 +1621,7 @@ do_udp_sendmsg:
 	if (!fl6->flowi6_oif)
 		fl6->flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
 
-	fl6->flowi6_uid = sk->sk_uid;
+	fl6->flowi6_uid = sk_uid(sk);
 
 	if (msg->msg_controllen) {
 		opt = &opt_space;
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index b98d13584c81..ea232f338dcb 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -545,7 +545,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	memset(&fl6, 0, sizeof(fl6));
 
 	fl6.flowi6_mark = READ_ONCE(sk->sk_mark);
-	fl6.flowi6_uid = sk->sk_uid;
+	fl6.flowi6_uid = sk_uid(sk);
 
 	ipcm6_init_sk(&ipc6, sk);
 
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index edf14c2c2062..e7972e633236 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -3503,7 +3503,7 @@ void mptcp_sock_graft(struct sock *sk, struct socket *parent)
 	write_lock_bh(&sk->sk_callback_lock);
 	rcu_assign_pointer(sk->sk_wq, &parent->wq);
 	sk_set_socket(sk, parent);
-	sk->sk_uid = SOCK_INODE(parent)->i_uid;
+	WRITE_ONCE(sk->sk_uid, SOCK_INODE(parent)->i_uid);
 	write_unlock_bh(&sk->sk_callback_lock);
 }
 
diff --git a/net/socket.c b/net/socket.c
index 2cab805943c0..682969deaed3 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -592,10 +592,12 @@ static int sockfs_setattr(struct mnt_idmap *idmap,
 	if (!err && (iattr->ia_valid & ATTR_UID)) {
 		struct socket *sock = SOCKET_I(d_inode(dentry));
 
-		if (sock->sk)
-			sock->sk->sk_uid = iattr->ia_uid;
-		else
+		if (sock->sk) {
+			/* Paired with READ_ONCE() in sk_uid() */
+			WRITE_ONCE(sock->sk->sk_uid, iattr->ia_uid);
+		} else {
 			err = -ENOENT;
+		}
 	}
 
 	return err;
-- 
cgit v1.2.3


From c51da3f7a161c6822232be832abdffe47eb55b4c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 20 Jun 2025 13:30:01 +0000
Subject: net: remove sock_i_uid()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Difference between sock_i_uid() and sk_uid() is that
after sock_orphan(), sock_i_uid() returns GLOBAL_ROOT_UID
while sk_uid() returns the last cached sk->sk_uid value.

None of sock_i_uid() callers care about this.

Use sk_uid() which is much faster and inlined.

Note that diag/dump users are calling sock_i_ino() and
can not see the full benefit yet.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Lorenzo Colitti <lorenzo@google.com>
Reviewed-by: Maciej Żenczykowski <maze@google.com>
Link: https://patch.msgid.link/20250620133001.4090592-3-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sock.h              |  2 --
 net/appletalk/atalk_proc.c      |  2 +-
 net/bluetooth/af_bluetooth.c    |  2 +-
 net/core/sock.c                 | 11 -----------
 net/ipv4/inet_connection_sock.c | 27 ++++++++++++---------------
 net/ipv4/inet_diag.c            |  2 +-
 net/ipv4/inet_hashtables.c      |  4 ++--
 net/ipv4/ping.c                 |  2 +-
 net/ipv4/raw.c                  |  2 +-
 net/ipv4/tcp_ipv4.c             |  8 ++++----
 net/ipv4/udp.c                  | 16 ++++++++--------
 net/ipv6/datagram.c             |  2 +-
 net/ipv6/tcp_ipv6.c             |  4 ++--
 net/key/af_key.c                |  2 +-
 net/llc/llc_proc.c              |  2 +-
 net/packet/af_packet.c          |  2 +-
 net/packet/diag.c               |  2 +-
 net/phonet/socket.c             |  4 ++--
 net/sctp/input.c                |  2 +-
 net/sctp/proc.c                 |  4 ++--
 net/sctp/socket.c               |  4 ++--
 net/smc/smc_diag.c              |  2 +-
 net/tipc/socket.c               |  2 +-
 net/unix/af_unix.c              |  2 +-
 net/unix/diag.c                 |  2 +-
 net/xdp/xsk_diag.c              |  2 +-
 26 files changed, 50 insertions(+), 66 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index fc5e6f66b00a..bbd97fbc5935 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2092,8 +2092,6 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
 	write_unlock_bh(&sk->sk_callback_lock);
 }
 
-kuid_t sock_i_uid(struct sock *sk);
-
 static inline kuid_t sk_uid(const struct sock *sk)
 {
 	/* Paired with WRITE_ONCE() in sockfs_setattr() */
diff --git a/net/appletalk/atalk_proc.c b/net/appletalk/atalk_proc.c
index 9c1241292d1d..01787fb6a7bc 100644
--- a/net/appletalk/atalk_proc.c
+++ b/net/appletalk/atalk_proc.c
@@ -181,7 +181,7 @@ static int atalk_seq_socket_show(struct seq_file *seq, void *v)
 		   sk_wmem_alloc_get(s),
 		   sk_rmem_alloc_get(s),
 		   s->sk_state,
-		   from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)));
+		   from_kuid_munged(seq_user_ns(seq), sk_uid(s)));
 out:
 	return 0;
 }
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 6ad2f72f53f4..ee9bf84c88a7 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -815,7 +815,7 @@ static int bt_seq_show(struct seq_file *seq, void *v)
 			   refcount_read(&sk->sk_refcnt),
 			   sk_rmem_alloc_get(sk),
 			   sk_wmem_alloc_get(sk),
-			   from_kuid(seq_user_ns(seq), sock_i_uid(sk)),
+			   from_kuid(seq_user_ns(seq), sk_uid(sk)),
 			   sock_i_ino(sk),
 			   bt->parent ? sock_i_ino(bt->parent) : 0LU);
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 502042a0d3b5..ceb74ceecb6c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2780,17 +2780,6 @@ void sock_pfree(struct sk_buff *skb)
 EXPORT_SYMBOL(sock_pfree);
 #endif /* CONFIG_INET */
 
-kuid_t sock_i_uid(struct sock *sk)
-{
-	kuid_t uid;
-
-	read_lock_bh(&sk->sk_callback_lock);
-	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
-	read_unlock_bh(&sk->sk_callback_lock);
-	return uid;
-}
-EXPORT_SYMBOL(sock_i_uid);
-
 unsigned long __sock_i_ino(struct sock *sk)
 {
 	unsigned long ino;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 46750c96d08e..f4157d26ec9e 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -168,7 +168,7 @@ static bool inet_use_bhash2_on_bind(const struct sock *sk)
 }
 
 static bool inet_bind_conflict(const struct sock *sk, struct sock *sk2,
-			       kuid_t sk_uid, bool relax,
+			       kuid_t uid, bool relax,
 			       bool reuseport_cb_ok, bool reuseport_ok)
 {
 	int bound_dev_if2;
@@ -185,12 +185,12 @@ static bool inet_bind_conflict(const struct sock *sk, struct sock *sk2,
 			if (!relax || (!reuseport_ok && sk->sk_reuseport &&
 				       sk2->sk_reuseport && reuseport_cb_ok &&
 				       (sk2->sk_state == TCP_TIME_WAIT ||
-					uid_eq(sk_uid, sock_i_uid(sk2)))))
+					uid_eq(uid, sk_uid(sk2)))))
 				return true;
 		} else if (!reuseport_ok || !sk->sk_reuseport ||
 			   !sk2->sk_reuseport || !reuseport_cb_ok ||
 			   (sk2->sk_state != TCP_TIME_WAIT &&
-			    !uid_eq(sk_uid, sock_i_uid(sk2)))) {
+			    !uid_eq(uid, sk_uid(sk2)))) {
 			return true;
 		}
 	}
@@ -198,7 +198,7 @@ static bool inet_bind_conflict(const struct sock *sk, struct sock *sk2,
 }
 
 static bool __inet_bhash2_conflict(const struct sock *sk, struct sock *sk2,
-				   kuid_t sk_uid, bool relax,
+				   kuid_t uid, bool relax,
 				   bool reuseport_cb_ok, bool reuseport_ok)
 {
 	if (ipv6_only_sock(sk2)) {
@@ -211,20 +211,20 @@ static bool __inet_bhash2_conflict(const struct sock *sk, struct sock *sk2,
 #endif
 	}
 
-	return inet_bind_conflict(sk, sk2, sk_uid, relax,
+	return inet_bind_conflict(sk, sk2, uid, relax,
 				  reuseport_cb_ok, reuseport_ok);
 }
 
 static bool inet_bhash2_conflict(const struct sock *sk,
 				 const struct inet_bind2_bucket *tb2,
-				 kuid_t sk_uid,
+				 kuid_t uid,
 				 bool relax, bool reuseport_cb_ok,
 				 bool reuseport_ok)
 {
 	struct sock *sk2;
 
 	sk_for_each_bound(sk2, &tb2->owners) {
-		if (__inet_bhash2_conflict(sk, sk2, sk_uid, relax,
+		if (__inet_bhash2_conflict(sk, sk2, uid, relax,
 					   reuseport_cb_ok, reuseport_ok))
 			return true;
 	}
@@ -242,8 +242,8 @@ static int inet_csk_bind_conflict(const struct sock *sk,
 				  const struct inet_bind2_bucket *tb2, /* may be null */
 				  bool relax, bool reuseport_ok)
 {
-	kuid_t uid = sock_i_uid((struct sock *)sk);
 	struct sock_reuseport *reuseport_cb;
+	kuid_t uid = sk_uid(sk);
 	bool reuseport_cb_ok;
 	struct sock *sk2;
 
@@ -287,11 +287,11 @@ static int inet_csk_bind_conflict(const struct sock *sk,
 static bool inet_bhash2_addr_any_conflict(const struct sock *sk, int port, int l3mdev,
 					  bool relax, bool reuseport_ok)
 {
-	kuid_t uid = sock_i_uid((struct sock *)sk);
 	const struct net *net = sock_net(sk);
 	struct sock_reuseport *reuseport_cb;
 	struct inet_bind_hashbucket *head2;
 	struct inet_bind2_bucket *tb2;
+	kuid_t uid = sk_uid(sk);
 	bool conflict = false;
 	bool reuseport_cb_ok;
 
@@ -425,15 +425,13 @@ success:
 static inline int sk_reuseport_match(struct inet_bind_bucket *tb,
 				     struct sock *sk)
 {
-	kuid_t uid = sock_i_uid(sk);
-
 	if (tb->fastreuseport <= 0)
 		return 0;
 	if (!sk->sk_reuseport)
 		return 0;
 	if (rcu_access_pointer(sk->sk_reuseport_cb))
 		return 0;
-	if (!uid_eq(tb->fastuid, uid))
+	if (!uid_eq(tb->fastuid, sk_uid(sk)))
 		return 0;
 	/* We only need to check the rcv_saddr if this tb was once marked
 	 * without fastreuseport and then was reset, as we can only know that
@@ -458,14 +456,13 @@ static inline int sk_reuseport_match(struct inet_bind_bucket *tb,
 void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
 			       struct sock *sk)
 {
-	kuid_t uid = sock_i_uid(sk);
 	bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
 
 	if (hlist_empty(&tb->bhash2)) {
 		tb->fastreuse = reuse;
 		if (sk->sk_reuseport) {
 			tb->fastreuseport = FASTREUSEPORT_ANY;
-			tb->fastuid = uid;
+			tb->fastuid = sk_uid(sk);
 			tb->fast_rcv_saddr = sk->sk_rcv_saddr;
 			tb->fast_ipv6_only = ipv6_only_sock(sk);
 			tb->fast_sk_family = sk->sk_family;
@@ -492,7 +489,7 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
 			 */
 			if (!sk_reuseport_match(tb, sk)) {
 				tb->fastreuseport = FASTREUSEPORT_STRICT;
-				tb->fastuid = uid;
+				tb->fastuid = sk_uid(sk);
 				tb->fast_rcv_saddr = sk->sk_rcv_saddr;
 				tb->fast_ipv6_only = ipv6_only_sock(sk);
 				tb->fast_sk_family = sk->sk_family;
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 1d1d6ad53f4c..2fa53b16fe77 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -181,7 +181,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
 		goto errout;
 #endif
 
-	r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
+	r->idiag_uid = from_kuid_munged(user_ns, sk_uid(sk));
 	r->idiag_inode = sock_i_ino(sk);
 
 	memset(&inet_sockopt, 0, sizeof(inet_sockopt));
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 77a0b52b2eab..ceeeec9b7290 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -721,8 +721,8 @@ static int inet_reuseport_add_sock(struct sock *sk,
 {
 	struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash;
 	const struct hlist_nulls_node *node;
+	kuid_t uid = sk_uid(sk);
 	struct sock *sk2;
-	kuid_t uid = sock_i_uid(sk);
 
 	sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) {
 		if (sk2 != sk &&
@@ -730,7 +730,7 @@ static int inet_reuseport_add_sock(struct sock *sk,
 		    ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
 		    sk2->sk_bound_dev_if == sk->sk_bound_dev_if &&
 		    inet_csk(sk2)->icsk_bind_hash == tb &&
-		    sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
+		    sk2->sk_reuseport && uid_eq(uid, sk_uid(sk2)) &&
 		    inet_rcv_saddr_equal(sk, sk2, false))
 			return reuseport_add_sock(sk, sk2,
 						  inet_rcv_saddr_any(sk));
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 4eacaf00e2e9..031df4c19fcc 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -1116,7 +1116,7 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f,
 		sk_wmem_alloc_get(sp),
 		sk_rmem_alloc_get(sp),
 		0, 0L, 0,
-		from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
+		from_kuid_munged(seq_user_ns(f), sk_uid(sp)),
 		0, sock_i_ino(sp),
 		refcount_read(&sp->sk_refcnt), sp,
 		atomic_read(&sp->sk_drops));
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 32f942d0f944..1d2c89d63cc7 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -1043,7 +1043,7 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)
 		sk_wmem_alloc_get(sp),
 		sk_rmem_alloc_get(sp),
 		0, 0L, 0,
-		from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)),
+		from_kuid_munged(seq_user_ns(seq), sk_uid(sp)),
 		0, sock_i_ino(sp),
 		refcount_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops));
 }
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 6a14f9e6fef6..429fb34b075e 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2896,7 +2896,7 @@ static void get_openreq4(const struct request_sock *req,
 		jiffies_delta_to_clock_t(delta),
 		req->num_timeout,
 		from_kuid_munged(seq_user_ns(f),
-				 sock_i_uid(req->rsk_listener)),
+				 sk_uid(req->rsk_listener)),
 		0,  /* non standard timer */
 		0, /* open_requests have no inode */
 		0,
@@ -2954,7 +2954,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
 		timer_active,
 		jiffies_delta_to_clock_t(timer_expires - jiffies),
 		icsk->icsk_retransmits,
-		from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
+		from_kuid_munged(seq_user_ns(f), sk_uid(sk)),
 		icsk->icsk_probes_out,
 		sock_i_ino(sk),
 		refcount_read(&sk->sk_refcnt), sk,
@@ -3246,9 +3246,9 @@ static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
 		const struct request_sock *req = v;
 
 		uid = from_kuid_munged(seq_user_ns(seq),
-				       sock_i_uid(req->rsk_listener));
+				       sk_uid(req->rsk_listener));
 	} else {
-		uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
+		uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk));
 	}
 
 	meta.seq = seq;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index f94bb222aa2d..19573ee64a0f 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -145,8 +145,8 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
 			       unsigned long *bitmap,
 			       struct sock *sk, unsigned int log)
 {
+	kuid_t uid = sk_uid(sk);
 	struct sock *sk2;
-	kuid_t uid = sock_i_uid(sk);
 
 	sk_for_each(sk2, &hslot->head) {
 		if (net_eq(sock_net(sk2), net) &&
@@ -158,7 +158,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
 		    inet_rcv_saddr_equal(sk, sk2, true)) {
 			if (sk2->sk_reuseport && sk->sk_reuseport &&
 			    !rcu_access_pointer(sk->sk_reuseport_cb) &&
-			    uid_eq(uid, sock_i_uid(sk2))) {
+			    uid_eq(uid, sk_uid(sk2))) {
 				if (!bitmap)
 					return 0;
 			} else {
@@ -180,8 +180,8 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
 				struct udp_hslot *hslot2,
 				struct sock *sk)
 {
+	kuid_t uid = sk_uid(sk);
 	struct sock *sk2;
-	kuid_t uid = sock_i_uid(sk);
 	int res = 0;
 
 	spin_lock(&hslot2->lock);
@@ -195,7 +195,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
 		    inet_rcv_saddr_equal(sk, sk2, true)) {
 			if (sk2->sk_reuseport && sk->sk_reuseport &&
 			    !rcu_access_pointer(sk->sk_reuseport_cb) &&
-			    uid_eq(uid, sock_i_uid(sk2))) {
+			    uid_eq(uid, sk_uid(sk2))) {
 				res = 0;
 			} else {
 				res = 1;
@@ -210,7 +210,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
 static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot)
 {
 	struct net *net = sock_net(sk);
-	kuid_t uid = sock_i_uid(sk);
+	kuid_t uid = sk_uid(sk);
 	struct sock *sk2;
 
 	sk_for_each(sk2, &hslot->head) {
@@ -220,7 +220,7 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot)
 		    ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
 		    (udp_sk(sk2)->udp_port_hash == udp_sk(sk)->udp_port_hash) &&
 		    (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
-		    sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
+		    sk2->sk_reuseport && uid_eq(uid, sk_uid(sk2)) &&
 		    inet_rcv_saddr_equal(sk, sk2, false)) {
 			return reuseport_add_sock(sk, sk2,
 						  inet_rcv_saddr_any(sk));
@@ -3387,7 +3387,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
 		sk_wmem_alloc_get(sp),
 		udp_rqueue_get(sp),
 		0, 0L, 0,
-		from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
+		from_kuid_munged(seq_user_ns(f), sk_uid(sp)),
 		0, sock_i_ino(sp),
 		refcount_read(&sp->sk_refcnt), sp,
 		atomic_read(&sp->sk_drops));
@@ -3630,7 +3630,7 @@ static int bpf_iter_udp_seq_show(struct seq_file *seq, void *v)
 		goto unlock;
 	}
 
-	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
+	uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk));
 	meta.seq = seq;
 	prog = bpf_iter_get_info(&meta, false);
 	ret = udp_prog_seq_show(prog, &meta, v, uid, state->bucket);
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 83f5aa5e133a..281722817a65 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -1064,7 +1064,7 @@ void __ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp,
 		   sk_wmem_alloc_get(sp),
 		   rqueue,
 		   0, 0L, 0,
-		   from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)),
+		   from_kuid_munged(seq_user_ns(seq), sk_uid(sp)),
 		   0,
 		   sock_i_ino(sp),
 		   refcount_read(&sp->sk_refcnt), sp,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index f61b0396ef6b..f0ce62549d90 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2168,7 +2168,7 @@ static void get_openreq6(struct seq_file *seq,
 		   jiffies_to_clock_t(ttd),
 		   req->num_timeout,
 		   from_kuid_munged(seq_user_ns(seq),
-				    sock_i_uid(req->rsk_listener)),
+				    sk_uid(req->rsk_listener)),
 		   0,  /* non standard timer */
 		   0, /* open_requests have no inode */
 		   0, req);
@@ -2234,7 +2234,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
 		   timer_active,
 		   jiffies_delta_to_clock_t(timer_expires - jiffies),
 		   icsk->icsk_retransmits,
-		   from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)),
+		   from_kuid_munged(seq_user_ns(seq), sk_uid(sp)),
 		   icsk->icsk_probes_out,
 		   sock_i_ino(sp),
 		   refcount_read(&sp->sk_refcnt), sp,
diff --git a/net/key/af_key.c b/net/key/af_key.c
index efc2a91f4c48..1f82f69acfde 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -3788,7 +3788,7 @@ static int pfkey_seq_show(struct seq_file *f, void *v)
 			       refcount_read(&s->sk_refcnt),
 			       sk_rmem_alloc_get(s),
 			       sk_wmem_alloc_get(s),
-			       from_kuid_munged(seq_user_ns(f), sock_i_uid(s)),
+			       from_kuid_munged(seq_user_ns(f), sk_uid(s)),
 			       sock_i_ino(s)
 			       );
 	return 0;
diff --git a/net/llc/llc_proc.c b/net/llc/llc_proc.c
index 07e9abb5978a..aa81c67b24a1 100644
--- a/net/llc/llc_proc.c
+++ b/net/llc/llc_proc.c
@@ -151,7 +151,7 @@ static int llc_seq_socket_show(struct seq_file *seq, void *v)
 		   sk_wmem_alloc_get(sk),
 		   sk_rmem_alloc_get(sk) - llc->copied_seq,
 		   sk->sk_state,
-		   from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)),
+		   from_kuid_munged(seq_user_ns(seq), sk_uid(sk)),
 		   llc->link);
 out:
 	return 0;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 3d43f3eae759..f6b1ff883c93 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -4783,7 +4783,7 @@ static int packet_seq_show(struct seq_file *seq, void *v)
 			   READ_ONCE(po->ifindex),
 			   packet_sock_flag(po, PACKET_SOCK_RUNNING),
 			   atomic_read(&s->sk_rmem_alloc),
-			   from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
+			   from_kuid_munged(seq_user_ns(seq), sk_uid(s)),
 			   sock_i_ino(s));
 	}
 
diff --git a/net/packet/diag.c b/net/packet/diag.c
index 47f69f3dbf73..6ce1dcc284d9 100644
--- a/net/packet/diag.c
+++ b/net/packet/diag.c
@@ -153,7 +153,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
 
 	if ((req->pdiag_show & PACKET_SHOW_INFO) &&
 	    nla_put_u32(skb, PACKET_DIAG_UID,
-			from_kuid_munged(user_ns, sock_i_uid(sk))))
+			from_kuid_munged(user_ns, sk_uid(sk))))
 		goto out_nlmsg_trim;
 
 	if ((req->pdiag_show & PACKET_SHOW_MCLIST) &&
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index 5ce0b3ee5def..ea4d5e6533db 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -584,7 +584,7 @@ static int pn_sock_seq_show(struct seq_file *seq, void *v)
 			sk->sk_protocol, pn->sobject, pn->dobject,
 			pn->resource, sk->sk_state,
 			sk_wmem_alloc_get(sk), sk_rmem_alloc_get(sk),
-			from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)),
+			from_kuid_munged(seq_user_ns(seq), sk_uid(sk)),
 			sock_i_ino(sk),
 			refcount_read(&sk->sk_refcnt), sk,
 			atomic_read(&sk->sk_drops));
@@ -755,7 +755,7 @@ static int pn_res_seq_show(struct seq_file *seq, void *v)
 
 		seq_printf(seq, "%02X %5u %lu",
 			   (int) (psk - pnres.sk),
-			   from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)),
+			   from_kuid_munged(seq_user_ns(seq), sk_uid(sk)),
 			   sock_i_ino(sk));
 	}
 	seq_pad(seq, '\n');
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 0c0d2757f6f8..2dc2666988fb 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -756,7 +756,7 @@ static int __sctp_hash_endpoint(struct sctp_endpoint *ep)
 			struct sock *sk2 = ep2->base.sk;
 
 			if (!net_eq(sock_net(sk2), net) || sk2 == sk ||
-			    !uid_eq(sock_i_uid(sk2), sock_i_uid(sk)) ||
+			    !uid_eq(sk_uid(sk2), sk_uid(sk)) ||
 			    !sk2->sk_reuseport)
 				continue;
 
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index ec00ee75d59a..74bff317e205 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -177,7 +177,7 @@ static int sctp_eps_seq_show(struct seq_file *seq, void *v)
 		seq_printf(seq, "%8pK %8pK %-3d %-3d %-4d %-5d %5u %5lu ", ep, sk,
 			   sctp_sk(sk)->type, sk->sk_state, hash,
 			   ep->base.bind_addr.port,
-			   from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)),
+			   from_kuid_munged(seq_user_ns(seq), sk_uid(sk)),
 			   sock_i_ino(sk));
 
 		sctp_seq_dump_local_addrs(seq, &ep->base);
@@ -267,7 +267,7 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v)
 		   assoc->assoc_id,
 		   assoc->sndbuf_used,
 		   atomic_read(&assoc->rmem_alloc),
-		   from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)),
+		   from_kuid_munged(seq_user_ns(seq), sk_uid(sk)),
 		   sock_i_ino(sk),
 		   epb->bind_addr.port,
 		   assoc->peer.port);
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 1e5739858c20..aa6400811018 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -8345,8 +8345,8 @@ static int sctp_get_port_local(struct sock *sk, union sctp_addr *addr)
 	bool reuse = (sk->sk_reuse || sp->reuse);
 	struct sctp_bind_hashbucket *head; /* hash list */
 	struct net *net = sock_net(sk);
-	kuid_t uid = sock_i_uid(sk);
 	struct sctp_bind_bucket *pp;
+	kuid_t uid = sk_uid(sk);
 	unsigned short snum;
 	int ret;
 
@@ -8444,7 +8444,7 @@ pp_found:
 			    (reuse && (sk2->sk_reuse || sp2->reuse) &&
 			     sk2->sk_state != SCTP_SS_LISTENING) ||
 			    (sk->sk_reuseport && sk2->sk_reuseport &&
-			     uid_eq(uid, sock_i_uid(sk2))))
+			     uid_eq(uid, sk_uid(sk2))))
 				continue;
 
 			if ((!sk->sk_bound_dev_if || !bound_dev_if2 ||
diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
index 6fdb2d96777a..8ed2f6689b01 100644
--- a/net/smc/smc_diag.c
+++ b/net/smc/smc_diag.c
@@ -64,7 +64,7 @@ static int smc_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
 	if (nla_put_u8(skb, SMC_DIAG_SHUTDOWN, sk->sk_shutdown))
 		return 1;
 
-	r->diag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
+	r->diag_uid = from_kuid_munged(user_ns, sk_uid(sk));
 	r->diag_inode = sock_i_ino(sk);
 	return 0;
 }
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 7c61d47ea208..e028bf658499 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -3642,7 +3642,7 @@ int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct netlink_callback *cb,
 	    nla_put_u32(skb, TIPC_NLA_SOCK_INO, sock_i_ino(sk)) ||
 	    nla_put_u32(skb, TIPC_NLA_SOCK_UID,
 			from_kuid_munged(sk_user_ns(NETLINK_CB(cb->skb).sk),
-					 sock_i_uid(sk))) ||
+					 sk_uid(sk))) ||
 	    nla_put_u64_64bit(skb, TIPC_NLA_SOCK_COOKIE,
 			      tipc_diag_gen_cookie(sk),
 			      TIPC_NLA_SOCK_PAD))
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 22e170fb5dda..1e320f89168d 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -3682,7 +3682,7 @@ static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
 		goto unlock;
 	}
 
-	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
+	uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk));
 	meta.seq = seq;
 	prog = bpf_iter_get_info(&meta, false);
 	ret = unix_prog_seq_show(prog, &meta, v, uid);
diff --git a/net/unix/diag.c b/net/unix/diag.c
index 79b182d0e62a..ca3473026151 100644
--- a/net/unix/diag.c
+++ b/net/unix/diag.c
@@ -106,7 +106,7 @@ static int sk_diag_show_rqlen(struct sock *sk, struct sk_buff *nlskb)
 static int sk_diag_dump_uid(struct sock *sk, struct sk_buff *nlskb,
 			    struct user_namespace *user_ns)
 {
-	uid_t uid = from_kuid_munged(user_ns, sock_i_uid(sk));
+	uid_t uid = from_kuid_munged(user_ns, sk_uid(sk));
 	return nla_put(nlskb, UNIX_DIAG_UID, sizeof(uid_t), &uid);
 }
 
diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c
index 09dcea0cbbed..0e0bca031c03 100644
--- a/net/xdp/xsk_diag.c
+++ b/net/xdp/xsk_diag.c
@@ -119,7 +119,7 @@ static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb,
 
 	if ((req->xdiag_show & XDP_SHOW_INFO) &&
 	    nla_put_u32(nlskb, XDP_DIAG_UID,
-			from_kuid_munged(user_ns, sock_i_uid(sk))))
+			from_kuid_munged(user_ns, sk_uid(sk))))
 		goto out_nlmsg_trim;
 
 	if ((req->xdiag_show & XDP_SHOW_RING_CFG) &&
-- 
cgit v1.2.3


From 3169e36ae14802b01abe4bfa7ec593b0a1af5cc7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 20 Jun 2025 15:55:35 +0000
Subject: net: make sk->sk_sndtimeo lockless

Follow-up to commit 285975dd6742 ("net: annotate data-races around
sk->sk_{rcv|snd}timeo").

Remove lock_sock()/release_sock() from sock_set_sndtimeo(),
and add READ_ONCE()/WRITE_ONCE() where it is needed.

Also, SO_SNDTIMEO_OLD and SO_SNDTIMEO_NEW can now call sock_set_timeout()
without holding the socket lock.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250620155536.335520-2-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sock.h         |  2 +-
 net/bluetooth/iso.c        |  4 ++--
 net/bluetooth/l2cap_sock.c |  4 ++--
 net/bluetooth/sco.c        |  4 ++--
 net/core/sock.c            | 12 ++++--------
 net/sctp/socket.c          |  2 +-
 net/smc/af_smc.c           |  4 ++--
 7 files changed, 14 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index bbd97fbc5935..b08e36bf9669 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2601,7 +2601,7 @@ static inline long sock_rcvtimeo(const struct sock *sk, bool noblock)
 
 static inline long sock_sndtimeo(const struct sock *sk, bool noblock)
 {
-	return noblock ? 0 : sk->sk_sndtimeo;
+	return noblock ? 0 : READ_ONCE(sk->sk_sndtimeo);
 }
 
 static inline int sock_rcvlowat(const struct sock *sk, int waitall, int len)
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
index 3c2c98eecc62..34e89bb5f384 100644
--- a/net/bluetooth/iso.c
+++ b/net/bluetooth/iso.c
@@ -413,7 +413,7 @@ static int iso_connect_bis(struct sock *sk)
 		sk->sk_state = BT_CONNECT;
 	} else {
 		sk->sk_state = BT_CONNECT;
-		iso_sock_set_timer(sk, sk->sk_sndtimeo);
+		iso_sock_set_timer(sk, READ_ONCE(sk->sk_sndtimeo));
 	}
 
 	release_sock(sk);
@@ -503,7 +503,7 @@ static int iso_connect_cis(struct sock *sk)
 		sk->sk_state = BT_CONNECT;
 	} else {
 		sk->sk_state = BT_CONNECT;
-		iso_sock_set_timer(sk, sk->sk_sndtimeo);
+		iso_sock_set_timer(sk, READ_ONCE(sk->sk_sndtimeo));
 	}
 
 	release_sock(sk);
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index 5aa55fa69594..113656489db5 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -255,7 +255,7 @@ static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr,
 
 	err = l2cap_chan_connect(chan, la.l2_psm, __le16_to_cpu(la.l2_cid),
 				 &la.l2_bdaddr, la.l2_bdaddr_type,
-				 sk->sk_sndtimeo);
+				 READ_ONCE(sk->sk_sndtimeo));
 	if (err)
 		return err;
 
@@ -1725,7 +1725,7 @@ static long l2cap_sock_get_sndtimeo_cb(struct l2cap_chan *chan)
 {
 	struct sock *sk = chan->data;
 
-	return sk->sk_sndtimeo;
+	return READ_ONCE(sk->sk_sndtimeo);
 }
 
 static struct pid *l2cap_sock_get_peer_pid_cb(struct l2cap_chan *chan)
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 2945d27e75dc..d382d980fd9a 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -338,7 +338,7 @@ static int sco_connect(struct sock *sk)
 
 	hcon = hci_connect_sco(hdev, type, &sco_pi(sk)->dst,
 			       sco_pi(sk)->setting, &sco_pi(sk)->codec,
-			       sk->sk_sndtimeo);
+			       READ_ONCE(sk->sk_sndtimeo));
 	if (IS_ERR(hcon)) {
 		err = PTR_ERR(hcon);
 		goto unlock;
@@ -367,7 +367,7 @@ static int sco_connect(struct sock *sk)
 		sk->sk_state = BT_CONNECTED;
 	} else {
 		sk->sk_state = BT_CONNECT;
-		sco_sock_set_timer(sk, sk->sk_sndtimeo);
+		sco_sock_set_timer(sk, READ_ONCE(sk->sk_sndtimeo));
 	}
 
 	release_sock(sk);
diff --git a/net/core/sock.c b/net/core/sock.c
index ceb74ceecb6c..b0b5a0a76045 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -818,12 +818,10 @@ EXPORT_SYMBOL(sock_set_priority);
 
 void sock_set_sndtimeo(struct sock *sk, s64 secs)
 {
-	lock_sock(sk);
 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
 		WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
 	else
 		WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
-	release_sock(sk);
 }
 EXPORT_SYMBOL(sock_set_sndtimeo);
 
@@ -1287,6 +1285,10 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
 	case SO_DEVMEM_DONTNEED:
 		return sock_devmem_dontneed(sk, optval, optlen);
 #endif
+	case SO_SNDTIMEO_OLD:
+	case SO_SNDTIMEO_NEW:
+		return sock_set_timeout(&sk->sk_sndtimeo, optval,
+					optlen, optname == SO_SNDTIMEO_OLD);
 	}
 
 	sockopt_lock_sock(sk);
@@ -1448,12 +1450,6 @@ set_sndbuf:
 				       optlen, optname == SO_RCVTIMEO_OLD);
 		break;
 
-	case SO_SNDTIMEO_OLD:
-	case SO_SNDTIMEO_NEW:
-		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
-				       optlen, optname == SO_SNDTIMEO_OLD);
-		break;
-
 	case SO_ATTACH_FILTER: {
 		struct sock_fprog fprog;
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index aa6400811018..5b690a4d2969 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -9493,7 +9493,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk,
 	newsk->sk_rcvbuf = sk->sk_rcvbuf;
 	newsk->sk_lingertime = sk->sk_lingertime;
 	newsk->sk_rcvtimeo = sk->sk_rcvtimeo;
-	newsk->sk_sndtimeo = sk->sk_sndtimeo;
+	newsk->sk_sndtimeo = READ_ONCE(sk->sk_sndtimeo);
 	newsk->sk_rxhash = sk->sk_rxhash;
 
 	newinet = inet_sk(newsk);
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 3760131f1484..6375a86fe2b5 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -486,7 +486,7 @@ static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
 {
 	/* options we don't get control via setsockopt for */
 	nsk->sk_type = osk->sk_type;
-	nsk->sk_sndtimeo = osk->sk_sndtimeo;
+	nsk->sk_sndtimeo = READ_ONCE(osk->sk_sndtimeo);
 	nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
 	nsk->sk_mark = READ_ONCE(osk->sk_mark);
 	nsk->sk_priority = READ_ONCE(osk->sk_priority);
@@ -1585,7 +1585,7 @@ static void smc_connect_work(struct work_struct *work)
 {
 	struct smc_sock *smc = container_of(work, struct smc_sock,
 					    connect_work);
-	long timeo = smc->sk.sk_sndtimeo;
+	long timeo = READ_ONCE(smc->sk.sk_sndtimeo);
 	int rc = 0;
 
 	if (!timeo)
-- 
cgit v1.2.3


From 935b67675a9f233aa4ac4ae6452b2cc45418d839 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 20 Jun 2025 15:55:36 +0000
Subject: net: make sk->sk_rcvtimeo lockless

Follow-up to commit 285975dd6742 ("net: annotate data-races around
sk->sk_{rcv|snd}timeo").

Remove lock_sock()/release_sock() from ksmbd_tcp_rcv_timeout()
and add READ_ONCE()/WRITE_ONCE() where it is needed.

Also, SO_RCVTIMEO_OLD and SO_RCVTIMEO_NEW can now call sock_set_timeout()
without holding the socket lock.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250620155536.335520-3-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 fs/smb/server/transport_tcp.c |  6 ++----
 include/net/sock.h            |  2 +-
 net/core/sock.c               | 10 ++++------
 net/llc/af_llc.c              |  6 +++---
 net/sctp/socket.c             |  2 +-
 net/smc/af_smc.c              |  2 +-
 net/smc/smc_clc.c             |  6 +++---
 net/strparser/strparser.c     |  2 +-
 net/x25/af_x25.c              |  2 +-
 9 files changed, 17 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c
index 4e9f98db9ff4..f8c772a7cb43 100644
--- a/fs/smb/server/transport_tcp.c
+++ b/fs/smb/server/transport_tcp.c
@@ -58,12 +58,10 @@ static inline void ksmbd_tcp_reuseaddr(struct socket *sock)
 
 static inline void ksmbd_tcp_rcv_timeout(struct socket *sock, s64 secs)
 {
-	lock_sock(sock->sk);
 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
-		sock->sk->sk_rcvtimeo = secs * HZ;
+		WRITE_ONCE(sock->sk->sk_rcvtimeo, secs * HZ);
 	else
-		sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
-	release_sock(sock->sk);
+		WRITE_ONCE(sock->sk->sk_rcvtimeo, MAX_SCHEDULE_TIMEOUT);
 }
 
 static inline void ksmbd_tcp_snd_timeout(struct socket *sock, s64 secs)
diff --git a/include/net/sock.h b/include/net/sock.h
index b08e36bf9669..0f2443d4ec58 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2596,7 +2596,7 @@ static inline gfp_t gfp_memcg_charge(void)
 
 static inline long sock_rcvtimeo(const struct sock *sk, bool noblock)
 {
-	return noblock ? 0 : sk->sk_rcvtimeo;
+	return noblock ? 0 : READ_ONCE(sk->sk_rcvtimeo);
 }
 
 static inline long sock_sndtimeo(const struct sock *sk, bool noblock)
diff --git a/net/core/sock.c b/net/core/sock.c
index b0b5a0a76045..3a71d6c4ccf0 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1289,6 +1289,10 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
 	case SO_SNDTIMEO_NEW:
 		return sock_set_timeout(&sk->sk_sndtimeo, optval,
 					optlen, optname == SO_SNDTIMEO_OLD);
+	case SO_RCVTIMEO_OLD:
+	case SO_RCVTIMEO_NEW:
+		return sock_set_timeout(&sk->sk_rcvtimeo, optval,
+					optlen, optname == SO_RCVTIMEO_OLD);
 	}
 
 	sockopt_lock_sock(sk);
@@ -1444,12 +1448,6 @@ set_sndbuf:
 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
 		break;
 		}
-	case SO_RCVTIMEO_OLD:
-	case SO_RCVTIMEO_NEW:
-		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
-				       optlen, optname == SO_RCVTIMEO_OLD);
-		break;
-
 	case SO_ATTACH_FILTER: {
 		struct sock_fprog fprog;
 
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index cc77ec5769d8..5958a80fe14c 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -210,7 +210,7 @@ static int llc_ui_release(struct socket *sock)
 	dprintk("%s: closing local(%02X) remote(%02X)\n", __func__,
 		llc->laddr.lsap, llc->daddr.lsap);
 	if (!llc_send_disc(sk))
-		llc_ui_wait_for_disc(sk, sk->sk_rcvtimeo);
+		llc_ui_wait_for_disc(sk, READ_ONCE(sk->sk_rcvtimeo));
 	if (!sock_flag(sk, SOCK_ZAPPED)) {
 		struct llc_sap *sap = llc->sap;
 
@@ -455,7 +455,7 @@ static int llc_ui_shutdown(struct socket *sock, int how)
 		goto out;
 	rc = llc_send_disc(sk);
 	if (!rc)
-		rc = llc_ui_wait_for_disc(sk, sk->sk_rcvtimeo);
+		rc = llc_ui_wait_for_disc(sk, READ_ONCE(sk->sk_rcvtimeo));
 	/* Wake up anyone sleeping in poll */
 	sk->sk_state_change(sk);
 out:
@@ -712,7 +712,7 @@ static int llc_ui_accept(struct socket *sock, struct socket *newsock,
 		goto out;
 	/* wait for a connection to arrive. */
 	if (skb_queue_empty(&sk->sk_receive_queue)) {
-		rc = llc_wait_data(sk, sk->sk_rcvtimeo);
+		rc = llc_wait_data(sk, READ_ONCE(sk->sk_rcvtimeo));
 		if (rc)
 			goto out;
 	}
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 5b690a4d2969..4921416434f9 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -9492,7 +9492,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk,
 	newsk->sk_sndbuf = sk->sk_sndbuf;
 	newsk->sk_rcvbuf = sk->sk_rcvbuf;
 	newsk->sk_lingertime = sk->sk_lingertime;
-	newsk->sk_rcvtimeo = sk->sk_rcvtimeo;
+	newsk->sk_rcvtimeo = READ_ONCE(sk->sk_rcvtimeo);
 	newsk->sk_sndtimeo = READ_ONCE(sk->sk_sndtimeo);
 	newsk->sk_rxhash = sk->sk_rxhash;
 
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 6375a86fe2b5..8d56e4db63e0 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -487,7 +487,7 @@ static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
 	/* options we don't get control via setsockopt for */
 	nsk->sk_type = osk->sk_type;
 	nsk->sk_sndtimeo = READ_ONCE(osk->sk_sndtimeo);
-	nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
+	nsk->sk_rcvtimeo = READ_ONCE(osk->sk_rcvtimeo);
 	nsk->sk_mark = READ_ONCE(osk->sk_mark);
 	nsk->sk_priority = READ_ONCE(osk->sk_priority);
 	nsk->sk_rcvlowat = osk->sk_rcvlowat;
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index 521f5df80e10..5a4db151fe95 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -688,7 +688,7 @@ out:
 int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
 		     u8 expected_type, unsigned long timeout)
 {
-	long rcvtimeo = smc->clcsock->sk->sk_rcvtimeo;
+	long rcvtimeo = READ_ONCE(smc->clcsock->sk->sk_rcvtimeo);
 	struct sock *clc_sk = smc->clcsock->sk;
 	struct smc_clc_msg_hdr *clcm = buf;
 	struct msghdr msg = {NULL, 0};
@@ -707,7 +707,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
 	 * sizeof(struct smc_clc_msg_hdr)
 	 */
 	krflags = MSG_PEEK | MSG_WAITALL;
-	clc_sk->sk_rcvtimeo = timeout;
+	WRITE_ONCE(clc_sk->sk_rcvtimeo, timeout);
 	iov_iter_kvec(&msg.msg_iter, ITER_DEST, &vec, 1,
 			sizeof(struct smc_clc_msg_hdr));
 	len = sock_recvmsg(smc->clcsock, &msg, krflags);
@@ -795,7 +795,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
 	}
 
 out:
-	clc_sk->sk_rcvtimeo = rcvtimeo;
+	WRITE_ONCE(clc_sk->sk_rcvtimeo, rcvtimeo);
 	return reason_code;
 }
 
diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index d946bfb424c7..43b1f558b33d 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -333,7 +333,7 @@ static int strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 	struct strparser *strp = (struct strparser *)desc->arg.data;
 
 	return __strp_recv(desc, orig_skb, orig_offset, orig_len,
-			   strp->sk->sk_rcvbuf, strp->sk->sk_rcvtimeo);
+			   strp->sk->sk_rcvbuf, READ_ONCE(strp->sk->sk_rcvtimeo));
 }
 
 static int default_read_sock_done(struct strparser *strp, int err)
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 1f8ae9f4a3f1..655d1e0ae25f 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -891,7 +891,7 @@ static int x25_accept(struct socket *sock, struct socket *newsock,
 	if (sk->sk_state != TCP_LISTEN)
 		goto out2;
 
-	rc = x25_wait_for_data(sk, sk->sk_rcvtimeo);
+	rc = x25_wait_for_data(sk, READ_ONCE(sk->sk_rcvtimeo));
 	if (rc)
 		goto out2;
 	skb = skb_dequeue(&sk->sk_receive_queue);
-- 
cgit v1.2.3


From bbb7d478d91ac4d5c288e226cc8744daf3820798 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexanderduyck@fb.com>
Date: Wed, 18 Jun 2025 15:07:22 -0700
Subject: net: phy: Add interface types for 50G and 100G

Add support for 802.3cd based interface types 50GBASE-R and 100GBASE-P.
This choice in naming is based on section 135 of the 802.3-2022 IEEE
Standard.

In addition, add support for what I am referring to as LAUI, which is
based on annex 135C of the IEEE Standard and shares many similarities
with the 25/50G consortium. The main difference between the two is that
the IEEE spec refers to LAUI as the AUI before the RS(544/514) FEC,
whereas the 25/50G consortium uses this lane and frequency combination
after going through RS(528/514), Base-R or no FEC at all.

Signed-off-by: Alexander Duyck <alexanderduyck@fb.com>
Link: https://patch.msgid.link/175028444205.625704.4191700324472974116.stgit@ahduyck-xeon-server.home.arpa
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/phy/phy-core.c |  3 +++
 drivers/net/phy/phy_caps.c |  9 +++++++++
 drivers/net/phy/phylink.c  | 13 +++++++++++++
 include/linux/phy.h        | 12 ++++++++++++
 4 files changed, 37 insertions(+)

(limited to 'include')

diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c
index 27f1833563ab..c480bb40fa73 100644
--- a/drivers/net/phy/phy-core.c
+++ b/drivers/net/phy/phy-core.c
@@ -142,6 +142,9 @@ int phy_interface_num_ports(phy_interface_t interface)
 	case PHY_INTERFACE_MODE_RXAUI:
 	case PHY_INTERFACE_MODE_XAUI:
 	case PHY_INTERFACE_MODE_1000BASEKX:
+	case PHY_INTERFACE_MODE_50GBASER:
+	case PHY_INTERFACE_MODE_LAUI:
+	case PHY_INTERFACE_MODE_100GBASEP:
 		return 1;
 	case PHY_INTERFACE_MODE_QSGMII:
 	case PHY_INTERFACE_MODE_QUSGMII:
diff --git a/drivers/net/phy/phy_caps.c b/drivers/net/phy/phy_caps.c
index 38417e288611..d11ce1c7e712 100644
--- a/drivers/net/phy/phy_caps.c
+++ b/drivers/net/phy/phy_caps.c
@@ -351,6 +351,15 @@ unsigned long phy_caps_from_interface(phy_interface_t interface)
 		link_caps |= BIT(LINK_CAPA_40000FD);
 		break;
 
+	case PHY_INTERFACE_MODE_50GBASER:
+	case PHY_INTERFACE_MODE_LAUI:
+		link_caps |= BIT(LINK_CAPA_50000FD);
+		break;
+
+	case PHY_INTERFACE_MODE_100GBASEP:
+		link_caps |= BIT(LINK_CAPA_100000FD);
+		break;
+
 	case PHY_INTERFACE_MODE_INTERNAL:
 		link_caps |= LINK_CAPA_ALL;
 		break;
diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 0faa3d97e06b..67218d278ce6 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -127,6 +127,9 @@ do {									\
 #endif
 
 static const phy_interface_t phylink_sfp_interface_preference[] = {
+	PHY_INTERFACE_MODE_100GBASEP,
+	PHY_INTERFACE_MODE_50GBASER,
+	PHY_INTERFACE_MODE_LAUI,
 	PHY_INTERFACE_MODE_25GBASER,
 	PHY_INTERFACE_MODE_USXGMII,
 	PHY_INTERFACE_MODE_10GBASER,
@@ -274,6 +277,13 @@ static int phylink_interface_max_speed(phy_interface_t interface)
 	case PHY_INTERFACE_MODE_XLGMII:
 		return SPEED_40000;
 
+	case PHY_INTERFACE_MODE_50GBASER:
+	case PHY_INTERFACE_MODE_LAUI:
+		return SPEED_50000;
+
+	case PHY_INTERFACE_MODE_100GBASEP:
+		return SPEED_100000;
+
 	case PHY_INTERFACE_MODE_INTERNAL:
 	case PHY_INTERFACE_MODE_NA:
 	case PHY_INTERFACE_MODE_MAX:
@@ -798,6 +808,9 @@ static int phylink_parse_mode(struct phylink *pl,
 		case PHY_INTERFACE_MODE_10GKR:
 		case PHY_INTERFACE_MODE_10GBASER:
 		case PHY_INTERFACE_MODE_XLGMII:
+		case PHY_INTERFACE_MODE_50GBASER:
+		case PHY_INTERFACE_MODE_LAUI:
+		case PHY_INTERFACE_MODE_100GBASEP:
 			caps = ~(MAC_SYM_PAUSE | MAC_ASYM_PAUSE);
 			caps = phylink_get_capabilities(pl->link_config.interface, caps,
 							RATE_MATCH_NONE);
diff --git a/include/linux/phy.h b/include/linux/phy.h
index b037aab7b71d..74c1bcf64b3c 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -103,6 +103,9 @@ extern const int phy_basic_ports_array[3];
  * @PHY_INTERFACE_MODE_QUSGMII: Quad Universal SGMII
  * @PHY_INTERFACE_MODE_1000BASEKX: 1000Base-KX - with Clause 73 AN
  * @PHY_INTERFACE_MODE_10G_QXGMII: 10G-QXGMII - 4 ports over 10G USXGMII
+ * @PHY_INTERFACE_MODE_50GBASER: 50GBase-R - with Clause 134 FEC
+ * @PHY_INTERFACE_MODE_LAUI: 50 Gigabit Attachment Unit Interface
+ * @PHY_INTERFACE_MODE_100GBASEP: 100GBase-P - with Clause 134 FEC
  * @PHY_INTERFACE_MODE_MAX: Book keeping
  *
  * Describes the interface between the MAC and PHY.
@@ -144,6 +147,9 @@ typedef enum {
 	PHY_INTERFACE_MODE_QUSGMII,
 	PHY_INTERFACE_MODE_1000BASEKX,
 	PHY_INTERFACE_MODE_10G_QXGMII,
+	PHY_INTERFACE_MODE_50GBASER,
+	PHY_INTERFACE_MODE_LAUI,
+	PHY_INTERFACE_MODE_100GBASEP,
 	PHY_INTERFACE_MODE_MAX,
 } phy_interface_t;
 
@@ -260,6 +266,12 @@ static inline const char *phy_modes(phy_interface_t interface)
 		return "qusgmii";
 	case PHY_INTERFACE_MODE_10G_QXGMII:
 		return "10g-qxgmii";
+	case PHY_INTERFACE_MODE_50GBASER:
+		return "50gbase-r";
+	case PHY_INTERFACE_MODE_LAUI:
+		return "laui";
+	case PHY_INTERFACE_MODE_100GBASEP:
+		return "100gbase-p";
 	default:
 		return "unknown";
 	}
-- 
cgit v1.2.3


From dad51ea09040f9ac2ea2a0f694e5e3ed5cf167b9 Mon Sep 17 00:00:00 2001
From: Kory Maincent <kory.maincent@bootlin.com>
Date: Fri, 20 Jun 2025 12:02:40 +0200
Subject: net: pse-pd: tps23881: Clarify setup_pi_matrix callback documentation

Improve the setup_pi_matrix callback documentation to clarify its purpose
and usage. The enhanced description explains that PSE PI devicetree nodes
are pre-parsed before this callback is invoked, and drivers should utilize
pcdev->pi[x]->pairset[y].np to map PSE controller hardware ports to their
corresponding Power Interfaces.

This clarification helps driver implementers understand the callback's
role in establishing the hardware-to-PI relationship mapping.

Signed-off-by: Kory Maincent <kory.maincent@bootlin.com>
Link: https://patch.msgid.link/20250620-poe_doc_improve-v1-2-96357bb95d52@bootlin.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/pse-pd/pse.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h
index e5f305cef82e..4e5696cfade7 100644
--- a/include/linux/pse-pd/pse.h
+++ b/include/linux/pse-pd/pse.h
@@ -159,7 +159,13 @@ struct ethtool_pse_control_status {
 /**
  * struct pse_controller_ops - PSE controller driver callbacks
  *
- * @setup_pi_matrix: setup PI matrix of the PSE controller
+ * @setup_pi_matrix: Setup PI matrix of the PSE controller.
+ *		     The PSE PIs devicetree nodes have already been parsed by
+ *		     of_load_pse_pis() and the pcdev->pi[x]->pairset[y].np
+ *		     populated. This callback should establish the
+ *		     relationship between the PSE controller hardware ports
+ *		     and the PSE Power Interfaces, either through software
+ *		     mapping or hardware configuration.
  * @pi_get_admin_state: Get the operational state of the PSE PI. This ops
  *			is mandatory.
  * @pi_get_pw_status: Get the power detection status of the PSE PI. This
-- 
cgit v1.2.3


From 7934a8dd8692b56714ce9b36421e316445d94a77 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Fri, 6 Jun 2025 13:10:23 +0900
Subject: module: remove meaningless 'name' parameter from __MODULE_INFO()

The symbol names in the .modinfo section are never used and already
randomized by the __UNIQUE_ID() macro.

Therefore, the second parameter of __MODULE_INFO() is meaningless
and can be removed to simplify the code.

With this change, the symbol names in the .modinfo section will be
prefixed with __UNIQUE_ID_modinfo, making it clearer that they
originate from MODULE_INFO().

[Before]

  $ objcopy  -j .modinfo vmlinux.o modinfo.o
  $ nm -n modinfo.o | head -n10
  0000000000000000 r __UNIQUE_ID_license560
  0000000000000011 r __UNIQUE_ID_file559
  0000000000000030 r __UNIQUE_ID_description558
  0000000000000074 r __UNIQUE_ID_license580
  000000000000008e r __UNIQUE_ID_file579
  00000000000000bd r __UNIQUE_ID_description578
  00000000000000e6 r __UNIQUE_ID_license581
  00000000000000ff r __UNIQUE_ID_file580
  0000000000000134 r __UNIQUE_ID_description579
  0000000000000179 r __UNIQUE_ID_uncore_no_discover578

[After]

  $ objcopy  -j .modinfo vmlinux.o modinfo.o
  $ nm -n modinfo.o | head -n10
  0000000000000000 r __UNIQUE_ID_modinfo560
  0000000000000011 r __UNIQUE_ID_modinfo559
  0000000000000030 r __UNIQUE_ID_modinfo558
  0000000000000074 r __UNIQUE_ID_modinfo580
  000000000000008e r __UNIQUE_ID_modinfo579
  00000000000000bd r __UNIQUE_ID_modinfo578
  00000000000000e6 r __UNIQUE_ID_modinfo581
  00000000000000ff r __UNIQUE_ID_modinfo580
  0000000000000134 r __UNIQUE_ID_modinfo579
  0000000000000179 r __UNIQUE_ID_modinfo578

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Reviewed-by: Petr Pavlu <petr.pavlu@suse.com>
---
 include/crypto/algapi.h     | 4 ++--
 include/linux/module.h      | 3 ---
 include/linux/moduleparam.h | 9 +++++----
 include/net/tcp.h           | 4 ++--
 4 files changed, 9 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h
index 188eface0a11..fc4574940636 100644
--- a/include/crypto/algapi.h
+++ b/include/crypto/algapi.h
@@ -43,8 +43,8 @@
  * alias.
  */
 #define MODULE_ALIAS_CRYPTO(name)	\
-		__MODULE_INFO(alias, alias_userspace, name);	\
-		__MODULE_INFO(alias, alias_crypto, "crypto-" name)
+		MODULE_INFO(alias, name);	\
+		MODULE_INFO(alias, "crypto-" name)
 
 struct crypto_aead;
 struct crypto_instance;
diff --git a/include/linux/module.h b/include/linux/module.h
index 92e1420fccdf..81b41cc6a19e 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -164,9 +164,6 @@ extern void cleanup_module(void);
 
 struct module_kobject *lookup_or_create_module_kobject(const char *name);
 
-/* Generic info of form tag = "info" */
-#define MODULE_INFO(tag, info) __MODULE_INFO(tag, tag, info)
-
 /* For userspace: you can also call me... */
 #define MODULE_ALIAS(_alias) MODULE_INFO(alias, _alias)
 
diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index bfb85fd13e1f..00166f747e27 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -20,18 +20,19 @@
 /* Chosen so that structs with an unsigned long line up. */
 #define MAX_PARAM_PREFIX_LEN (64 - sizeof(unsigned long))
 
-#define __MODULE_INFO(tag, name, info)					  \
-	static const char __UNIQUE_ID(name)[]				  \
+/* Generic info of form tag = "info" */
+#define MODULE_INFO(tag, info)					  \
+	static const char __UNIQUE_ID(modinfo)[]			  \
 		__used __section(".modinfo") __aligned(1)		  \
 		= __MODULE_INFO_PREFIX __stringify(tag) "=" info
 
 #define __MODULE_PARM_TYPE(name, _type)					  \
-	__MODULE_INFO(parmtype, name##type, #name ":" _type)
+	MODULE_INFO(parmtype, #name ":" _type)
 
 /* One for each parameter, describing how to use it.  Some files do
    multiple of these per line, so can't just use MODULE_INFO. */
 #define MODULE_PARM_DESC(_parm, desc) \
-	__MODULE_INFO(parm, _parm, #_parm ":" desc)
+	MODULE_INFO(parm, #_parm ":" desc)
 
 struct kernel_param;
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 5078ad868fee..9b39ef630c92 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2662,8 +2662,8 @@ void tcp_update_ulp(struct sock *sk, struct proto *p,
 		    void (*write_space)(struct sock *sk));
 
 #define MODULE_ALIAS_TCP_ULP(name)				\
-	__MODULE_INFO(alias, alias_userspace, name);		\
-	__MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name)
+	MODULE_INFO(alias, name);		\
+	MODULE_INFO(alias, "tcp-ulp-" name)
 
 #ifdef CONFIG_NET_SOCK_MSG
 struct sk_msg;
-- 
cgit v1.2.3


From e581b7fe62218d390520287e0095bfd6fe0454f8 Mon Sep 17 00:00:00 2001
From: Sarika Sharma <quic_sarishar@quicinc.com>
Date: Wed, 28 May 2025 11:14:11 +0530
Subject: wifi: mac80211: add support towards MLO handling of station
 statistics

Currently, the supporting APIs that fill the sinfo structure from the
sta structure always take the fields from sta->deflink. However, for a
multi-link (ML) station, the sinfo structure should be filled from the
link that corresponds to the requested link_id.

Therefore, add link_id as an additional argument to the supporting APIs
so that the sinfo structure is filled correctly. link_id is set to -1
for a non-ML station and to the corresponding link ID for an ML station.
The supporting APIs check link_id: if link_id < 0, they fill the sinfo
structure from sta->deflink, otherwise they fill it from
sta->link[link_id].

Currently, the changes are done at the deflink level, i.e., callers
pass -1 as link_id. The actual link_id will be added in subsequent
patches to support station statistics for MLO.

Signed-off-by: Sarika Sharma <quic_sarishar@quicinc.com>
Link: https://patch.msgid.link/20250528054420.3050133-2-quic_sarishar@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/dvm/lib.c |  2 +-
 include/net/mac80211.h                       |  3 +-
 net/mac80211/ibss.c                          |  4 +-
 net/mac80211/sta_info.c                      | 81 +++++++++++++++++++---------
 net/mac80211/sta_info.h                      |  2 +-
 net/mac80211/util.c                          | 14 ++++-
 6 files changed, 74 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/lib.c b/drivers/net/wireless/intel/iwlwifi/dvm/lib.c
index 1dc974e2c511..48711dbcfa5a 100644
--- a/drivers/net/wireless/intel/iwlwifi/dvm/lib.c
+++ b/drivers/net/wireless/intel/iwlwifi/dvm/lib.c
@@ -586,7 +586,7 @@ static bool iwlagn_fill_txpower_mode(struct iwl_priv *priv,
 		return false;
 	}
 
-	ave_rssi = ieee80211_ave_rssi(ctx->vif);
+	ave_rssi = ieee80211_ave_rssi(ctx->vif, -1);
 	if (!ave_rssi) {
 		/* no rssi data, no changes to reduce tx power */
 		IWL_DEBUG_COEX(priv, "no rssi data available\n");
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 82617579d910..a305e7f9c6b2 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -7242,13 +7242,14 @@ void ieee80211_disable_rssi_reports(struct ieee80211_vif *vif);
  * ieee80211_ave_rssi - report the average RSSI for the specified interface
  *
  * @vif: the specified virtual interface
+ * @link_id: the link ID for MLO, or -1 for non-MLO
  *
  * Note: This function assumes that the given vif is valid.
  *
  * Return: The average RSSI value for the requested interface, or 0 if not
  * applicable.
  */
-int ieee80211_ave_rssi(struct ieee80211_vif *vif);
+int ieee80211_ave_rssi(struct ieee80211_vif *vif, int link_id);
 
 /**
  * ieee80211_report_wowlan_wakeup - report WoWLAN wakeup
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index 9ed87d6f5019..6e36b09fe97f 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -635,7 +635,7 @@ static int ieee80211_sta_active_ibss(struct ieee80211_sub_if_data *sdata)
 	rcu_read_lock();
 
 	list_for_each_entry_rcu(sta, &local->sta_list, list) {
-		unsigned long last_active = ieee80211_sta_last_active(sta);
+		unsigned long last_active = ieee80211_sta_last_active(sta, -1);
 
 		if (sta->sdata == sdata &&
 		    time_is_after_jiffies(last_active +
@@ -1228,7 +1228,7 @@ static void ieee80211_ibss_sta_expire(struct ieee80211_sub_if_data *sdata)
 	lockdep_assert_wiphy(local->hw.wiphy);
 
 	list_for_each_entry_safe(sta, tmp, &local->sta_list, list) {
-		unsigned long last_active = ieee80211_sta_last_active(sta);
+		unsigned long last_active = ieee80211_sta_last_active(sta, -1);
 
 		if (sdata != sta->sdata)
 			continue;
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 61583173629e..6acbe1a7314b 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -1651,7 +1651,7 @@ void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata,
 	lockdep_assert_wiphy(local->hw.wiphy);
 
 	list_for_each_entry_safe(sta, tmp, &local->sta_list, list) {
-		unsigned long last_active = ieee80211_sta_last_active(sta);
+		unsigned long last_active = ieee80211_sta_last_active(sta, -1);
 
 		if (sdata != sta->sdata)
 			continue;
@@ -2420,18 +2420,27 @@ void ieee80211_sta_update_pending_airtime(struct ieee80211_local *local,
 }
 
 static struct ieee80211_sta_rx_stats *
-sta_get_last_rx_stats(struct sta_info *sta)
+sta_get_last_rx_stats(struct sta_info *sta, int link_id)
 {
-	struct ieee80211_sta_rx_stats *stats = &sta->deflink.rx_stats;
+	struct ieee80211_sta_rx_stats *stats;
+	struct link_sta_info *link_sta_info;
 	int cpu;
 
-	if (!sta->deflink.pcpu_rx_stats)
+	if (link_id < 0)
+		link_sta_info = &sta->deflink;
+	else
+		link_sta_info = wiphy_dereference(sta->local->hw.wiphy,
+						  sta->link[link_id]);
+
+	stats = &link_sta_info->rx_stats;
+
+	if (!link_sta_info->pcpu_rx_stats)
 		return stats;
 
 	for_each_possible_cpu(cpu) {
 		struct ieee80211_sta_rx_stats *cpustats;
 
-		cpustats = per_cpu_ptr(sta->deflink.pcpu_rx_stats, cpu);
+		cpustats = per_cpu_ptr(link_sta_info->pcpu_rx_stats, cpu);
 
 		if (time_after(cpustats->last_rx, stats->last_rx))
 			stats = cpustats;
@@ -2499,9 +2508,10 @@ static void sta_stats_decode_rate(struct ieee80211_local *local, u32 rate,
 	}
 }
 
-static int sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo)
+static int sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo,
+				int link_id)
 {
-	u32 rate = READ_ONCE(sta_get_last_rx_stats(sta)->last_rate);
+	u32 rate = READ_ONCE(sta_get_last_rx_stats(sta, link_id)->last_rate);
 
 	if (rate == STA_STATS_RATE_INVALID)
 		return -EINVAL;
@@ -2526,20 +2536,28 @@ static inline u64 sta_get_tidstats_msdu(struct ieee80211_sta_rx_stats *rxstats,
 
 static void sta_set_tidstats(struct sta_info *sta,
 			     struct cfg80211_tid_stats *tidstats,
-			     int tid)
+			     int tid, int link_id)
 {
 	struct ieee80211_local *local = sta->local;
+	struct link_sta_info *link_sta_info;
 	int cpu;
 
+	if (link_id < 0)
+		link_sta_info = &sta->deflink;
+	else
+		link_sta_info = wiphy_dereference(sta->local->hw.wiphy,
+						  sta->link[link_id]);
+
 	if (!(tidstats->filled & BIT(NL80211_TID_STATS_RX_MSDU))) {
-		tidstats->rx_msdu += sta_get_tidstats_msdu(&sta->deflink.rx_stats,
-							   tid);
+		tidstats->rx_msdu +=
+			sta_get_tidstats_msdu(&link_sta_info->rx_stats,
+					      tid);
 
-		if (sta->deflink.pcpu_rx_stats) {
+		if (link_sta_info->pcpu_rx_stats) {
 			for_each_possible_cpu(cpu) {
 				struct ieee80211_sta_rx_stats *cpurxs;
 
-				cpurxs = per_cpu_ptr(sta->deflink.pcpu_rx_stats,
+				cpurxs = per_cpu_ptr(link_sta_info->pcpu_rx_stats,
 						     cpu);
 				tidstats->rx_msdu +=
 					sta_get_tidstats_msdu(cpurxs, tid);
@@ -2551,19 +2569,21 @@ static void sta_set_tidstats(struct sta_info *sta,
 
 	if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU))) {
 		tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU);
-		tidstats->tx_msdu = sta->deflink.tx_stats.msdu[tid];
+		tidstats->tx_msdu = link_sta_info->tx_stats.msdu[tid];
 	}
 
 	if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU_RETRIES)) &&
 	    ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) {
 		tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_RETRIES);
-		tidstats->tx_msdu_retries = sta->deflink.status_stats.msdu_retries[tid];
+		tidstats->tx_msdu_retries =
+			link_sta_info->status_stats.msdu_retries[tid];
 	}
 
 	if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU_FAILED)) &&
 	    ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) {
 		tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_FAILED);
-		tidstats->tx_msdu_failed = sta->deflink.status_stats.msdu_failed[tid];
+		tidstats->tx_msdu_failed =
+			link_sta_info->status_stats.msdu_failed[tid];
 	}
 
 	if (tid < IEEE80211_NUM_TIDS) {
@@ -2634,7 +2654,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
 	int i, ac, cpu;
 	struct ieee80211_sta_rx_stats *last_rxstats;
 
-	last_rxstats = sta_get_last_rx_stats(sta);
+	last_rxstats = sta_get_last_rx_stats(sta, -1);
 
 	sinfo->generation = sdata->local->sta_generation;
 
@@ -2662,7 +2682,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
 	sinfo->connected_time = ktime_get_seconds() - sta->last_connected;
 	sinfo->assoc_at = sta->assoc_at;
 	sinfo->inactive_time =
-		jiffies_to_msecs(jiffies - ieee80211_sta_last_active(sta));
+		jiffies_to_msecs(jiffies - ieee80211_sta_last_active(sta, -1));
 
 	if (!(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_TX_BYTES64) |
 			       BIT_ULL(NL80211_STA_INFO_TX_BYTES)))) {
@@ -2751,7 +2771,8 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
 	    !(sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER)) {
 		sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_RX) |
 				 BIT_ULL(NL80211_STA_INFO_BEACON_SIGNAL_AVG);
-		sinfo->rx_beacon_signal_avg = ieee80211_ave_rssi(&sdata->vif);
+		sinfo->rx_beacon_signal_avg =
+			ieee80211_ave_rssi(&sdata->vif, -1);
 	}
 
 	if (ieee80211_hw_check(&sta->local->hw, SIGNAL_DBM) ||
@@ -2800,13 +2821,13 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
 
 	if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE)) &&
 	    !sta->sta.valid_links) {
-		if (sta_set_rate_info_rx(sta, &sinfo->rxrate) == 0)
+		if (sta_set_rate_info_rx(sta, &sinfo->rxrate, -1) == 0)
 			sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BITRATE);
 	}
 
 	if (tidstats && !cfg80211_sinfo_alloc_tid_stats(sinfo, GFP_KERNEL)) {
 		for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++)
-			sta_set_tidstats(sta, &sinfo->pertid[i], i);
+			sta_set_tidstats(sta, &sinfo->pertid[i], i, -1);
 	}
 
 #ifdef CONFIG_MAC80211_MESH
@@ -2889,14 +2910,24 @@ u32 sta_get_expected_throughput(struct sta_info *sta)
 	return thr;
 }
 
-unsigned long ieee80211_sta_last_active(struct sta_info *sta)
+unsigned long ieee80211_sta_last_active(struct sta_info *sta, int link_id)
 {
-	struct ieee80211_sta_rx_stats *stats = sta_get_last_rx_stats(sta);
+	struct ieee80211_sta_rx_stats *stats;
+	struct link_sta_info *link_sta_info;
+
+	stats = sta_get_last_rx_stats(sta, link_id);
 
-	if (!sta->deflink.status_stats.last_ack ||
-	    time_after(stats->last_rx, sta->deflink.status_stats.last_ack))
+	if (link_id < 0)
+		link_sta_info = &sta->deflink;
+	else
+		link_sta_info = wiphy_dereference(sta->local->hw.wiphy,
+						  sta->link[link_id]);
+
+	if (!link_sta_info->status_stats.last_ack ||
+	    time_after(stats->last_rx, link_sta_info->status_stats.last_ack))
 		return stats->last_rx;
-	return sta->deflink.status_stats.last_ack;
+
+	return link_sta_info->status_stats.last_ack;
 }
 
 int ieee80211_sta_allocate_link(struct sta_info *sta, unsigned int link_id)
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 7a95d8d34fca..e5b91e60405b 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -936,7 +936,7 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta);
 void ieee80211_sta_ps_deliver_poll_response(struct sta_info *sta);
 void ieee80211_sta_ps_deliver_uapsd(struct sta_info *sta);
 
-unsigned long ieee80211_sta_last_active(struct sta_info *sta);
+unsigned long ieee80211_sta_last_active(struct sta_info *sta, int link_id);
 
 void ieee80211_sta_set_max_amsdu_subframes(struct sta_info *sta,
 					   const u8 *ext_capab,
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index ea73a38fb866..24c43a1ef2aa 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -3265,14 +3265,24 @@ int ieee80211_put_srates_elem(struct sk_buff *skb,
 	return 0;
 }
 
-int ieee80211_ave_rssi(struct ieee80211_vif *vif)
+int ieee80211_ave_rssi(struct ieee80211_vif *vif, int link_id)
 {
 	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+	struct ieee80211_link_data *link_data;
 
 	if (WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION))
 		return 0;
 
-	return -ewma_beacon_signal_read(&sdata->deflink.u.mgd.ave_beacon_signal);
+	if (link_id < 0)
+		link_data = &sdata->deflink;
+	else
+		link_data = wiphy_dereference(sdata->local->hw.wiphy,
+					      sdata->link[link_id]);
+
+	if (WARN_ON_ONCE(!link_data))
+		return -99;
+
+	return -ewma_beacon_signal_read(&link_data->u.mgd.ave_beacon_signal);
 }
 EXPORT_SYMBOL_GPL(ieee80211_ave_rssi);
 
-- 
cgit v1.2.3


From d2329fff7e527e8b350086be2e7cbf0d190177a3 Mon Sep 17 00:00:00 2001
From: Sarika Sharma <quic_sarishar@quicinc.com>
Date: Wed, 28 May 2025 11:14:12 +0530
Subject: wifi: cfg80211: add link_station_info structure to support MLO
 statistics

The current implementation of NL80211_GET_STATION does not work for
multi-link operation (MLO), since in the MLO case only deflink (or one
of the links) is considered and not all links.

Therefore, to support MLO, add the link_station_info structure
to account for link-level statistics of a station.

Additionally, add valid_links in station_info structure to indicate
bitmap of valid links for MLO. This will be helpful to check the link
related statistics during MLO.

Signed-off-by: Sarika Sharma <quic_sarishar@quicinc.com>
Link: https://patch.msgid.link/20250528054420.3050133-3-quic_sarishar@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 101 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 101 insertions(+)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 47b4235eea59..b008357cac03 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -2017,6 +2017,99 @@ struct cfg80211_tid_stats {
 
 #define IEEE80211_MAX_CHAINS	4
 
+/**
+ * struct link_station_info - link station information
+ *
+ * Link station information filled by driver for get_station() and
+ *	dump_station().
+ * @filled: bit flag of flags using the bits of &enum nl80211_sta_info to
+ *	indicate the relevant values in this struct for them
+ * @connected_time: time(in secs) since a link of station is last connected
+ * @inactive_time: time since last activity for link station(tx/rx)
+ *	in milliseconds
+ * @assoc_at: boottime (ns) of the last association of link of station
+ * @rx_bytes: bytes (size of MPDUs) received from this link of station
+ * @tx_bytes: bytes (size of MPDUs) transmitted to this link of station
+ * @signal: The signal strength, type depends on the wiphy's signal_type.
+ *	For CFG80211_SIGNAL_TYPE_MBM, value is expressed in _dBm_.
+ * @signal_avg: Average signal strength, type depends on the wiphy's
+ *	signal_type. For CFG80211_SIGNAL_TYPE_MBM, value is expressed in _dBm_
+ * @chains: bitmask for filled values in @chain_signal, @chain_signal_avg
+ * @chain_signal: per-chain signal strength of last received packet in dBm
+ * @chain_signal_avg: per-chain signal strength average in dBm
+ * @txrate: current unicast bitrate from this link of station
+ * @rxrate: current unicast bitrate to this link of station
+ * @rx_packets: packets (MSDUs & MMPDUs) received from this link of station
+ * @tx_packets: packets (MSDUs & MMPDUs) transmitted to this link of station
+ * @tx_retries: cumulative retry counts (MPDUs) for this link of station
+ * @tx_failed: number of failed transmissions (MPDUs) (retries exceeded, no ACK)
+ * @rx_dropped_misc:  Dropped for un-specified reason.
+ * @bss_param: current BSS parameters
+ * @beacon_loss_count: Number of times beacon loss event has triggered.
+ * @expected_throughput: expected throughput in kbps (including 802.11 headers)
+ *	towards this station.
+ * @rx_beacon: number of beacons received from this peer
+ * @rx_beacon_signal_avg: signal strength average (in dBm) for beacons received
+ *	from this peer
+ * @rx_duration: aggregate PPDU duration(usecs) for all the frames from a peer
+ * @tx_duration: aggregate PPDU duration(usecs) for all the frames to a peer
+ * @airtime_weight: current airtime scheduling weight
+ * @pertid: per-TID statistics, see &struct cfg80211_tid_stats, using the last
+ *	(IEEE80211_NUM_TIDS) index for MSDUs not encapsulated in QoS-MPDUs.
+ *	Note that this doesn't use the @filled bit, but is used if non-NULL.
+ * @ack_signal: signal strength (in dBm) of the last ACK frame.
+ * @avg_ack_signal: average rssi value of ack packet for the no of msdu's has
+ *	been sent.
+ * @rx_mpdu_count: number of MPDUs received from this station
+ * @fcs_err_count: number of packets (MPDUs) received from this station with
+ *	an FCS error. This counter should be incremented only when TA of the
+ *	received packet with an FCS error matches the peer MAC address.
+ * @addr: For MLO STA connection, filled with address of the link of station.
+ */
+struct link_station_info {
+	u64 filled;
+	u32 connected_time;
+	u32 inactive_time;
+	u64 assoc_at;
+	u64 rx_bytes;
+	u64 tx_bytes;
+	s8 signal;
+	s8 signal_avg;
+
+	u8 chains;
+	s8 chain_signal[IEEE80211_MAX_CHAINS];
+	s8 chain_signal_avg[IEEE80211_MAX_CHAINS];
+
+	struct rate_info txrate;
+	struct rate_info rxrate;
+	u32 rx_packets;
+	u32 tx_packets;
+	u32 tx_retries;
+	u32 tx_failed;
+	u32 rx_dropped_misc;
+	struct sta_bss_parameters bss_param;
+
+	u32 beacon_loss_count;
+
+	u32 expected_throughput;
+
+	u64 tx_duration;
+	u64 rx_duration;
+	u64 rx_beacon;
+	u8 rx_beacon_signal_avg;
+
+	u16 airtime_weight;
+
+	s8 ack_signal;
+	s8 avg_ack_signal;
+	struct cfg80211_tid_stats *pertid;
+
+	u32 rx_mpdu_count;
+	u32 fcs_err_count;
+
+	u8 addr[ETH_ALEN] __aligned(2);
+};
+
 /**
  * struct station_info - station information
  *
@@ -2101,6 +2194,11 @@ struct cfg80211_tid_stats {
  *	dump_station() callbacks. User space needs this information to determine
  *	the accepted and rejected affiliated links of the connected station.
  * @assoc_resp_ies_len: Length of @assoc_resp_ies buffer in octets.
+ * @valid_links: bitmap of valid links, or 0 for non-MLO. Drivers fill this
+ *	information in cfg80211_new_sta(), cfg80211_del_sta_sinfo(),
+ *	get_station() and dump_station() callbacks.
+ * @links: reference to Link sta entries for MLO STA, all link specific
+ *	information is accessed through links[link_id].
  */
 struct station_info {
 	u64 filled;
@@ -2165,6 +2263,9 @@ struct station_info {
 	u8 mld_addr[ETH_ALEN] __aligned(2);
 	const u8 *assoc_resp_ies;
 	size_t assoc_resp_ies_len;
+
+	u16 valid_links;
+	struct link_station_info *links[IEEE80211_MLD_MAX_NUM_LINKS];
 };
 
 /**
-- 
cgit v1.2.3


From 49e47223ecc4af0bd15b5267184d46b3654d520b Mon Sep 17 00:00:00 2001
From: Sarika Sharma <quic_sarishar@quicinc.com>
Date: Wed, 28 May 2025 11:14:15 +0530
Subject: wifi: cfg80211: allocate memory for link_station info structure

Currently, the station_info structure is passed to mac80211/drivers to
be filled with station statistics. After the NL message with the
requested station statistics has been sent to user space, the memory for
the station statistics is freed in cfg80211. Therefore, memory
allocation/free for link station statistics should also happen in
cfg80211 only.

Hence, allocate the memory for link_station structure for all
possible links and free in cfg80211_sinfo_release_content().

Signed-off-by: Sarika Sharma <quic_sarishar@quicinc.com>
Link: https://patch.msgid.link/20250528054420.3050133-6-quic_sarishar@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h |  7 +++++++
 net/wireless/nl80211.c | 27 ++++++++++++++++++++++++---
 2 files changed, 31 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index b008357cac03..7bf0c97d2ab1 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -8577,6 +8577,13 @@ int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp);
 static inline void cfg80211_sinfo_release_content(struct station_info *sinfo)
 {
 	kfree(sinfo->pertid);
+
+	for (int link_id = 0; link_id < ARRAY_SIZE(sinfo->links); link_id++) {
+		if (sinfo->links[link_id]) {
+			kfree(sinfo->links[link_id]->pertid);
+			kfree(sinfo->links[link_id]);
+		}
+	}
 }
 
 /**
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 83f6291eac92..5137824520a1 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -7366,7 +7366,7 @@ static int nl80211_dump_station(struct sk_buff *skb,
 	struct wireless_dev *wdev;
 	u8 mac_addr[ETH_ALEN];
 	int sta_idx = cb->args[2];
-	int err;
+	int err, i;
 
 	err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev, NULL);
 	if (err)
@@ -7386,6 +7386,16 @@ static int nl80211_dump_station(struct sk_buff *skb,
 
 	while (1) {
 		memset(&sinfo, 0, sizeof(sinfo));
+
+		for (i = 0; i < IEEE80211_MLD_MAX_NUM_LINKS; i++) {
+			sinfo.links[i] =
+				kzalloc(sizeof(*sinfo.links[0]), GFP_KERNEL);
+			if (!sinfo.links[i]) {
+				err = -ENOMEM;
+				goto out_err;
+			}
+		}
+
 		err = rdev_dump_station(rdev, wdev->netdev, sta_idx,
 					mac_addr, &sinfo);
 		if (err == -ENOENT)
@@ -7410,6 +7420,7 @@ static int nl80211_dump_station(struct sk_buff *skb,
 	cb->args[2] = sta_idx;
 	err = skb->len;
  out_err:
+	cfg80211_sinfo_release_content(&sinfo);
 	wiphy_unlock(&rdev->wiphy);
 
 	return err;
@@ -7422,7 +7433,7 @@ static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info)
 	struct station_info sinfo;
 	struct sk_buff *msg;
 	u8 *mac_addr = NULL;
-	int err;
+	int err, i;
 
 	memset(&sinfo, 0, sizeof(sinfo));
 
@@ -7434,9 +7445,19 @@ static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info)
 	if (!rdev->ops->get_station)
 		return -EOPNOTSUPP;
 
+	for (i = 0; i < IEEE80211_MLD_MAX_NUM_LINKS; i++) {
+		sinfo.links[i] = kzalloc(sizeof(*sinfo.links[0]), GFP_KERNEL);
+		if (!sinfo.links[i]) {
+			cfg80211_sinfo_release_content(&sinfo);
+			return -ENOMEM;
+		}
+	}
+
 	err = rdev_get_station(rdev, dev, mac_addr, &sinfo);
-	if (err)
+	if (err) {
+		cfg80211_sinfo_release_content(&sinfo);
 		return err;
+	}
 
 	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
 	if (!msg) {
-- 
cgit v1.2.3


From 505991fba9ec112770c79a0fea56b4c49a5ad2fa Mon Sep 17 00:00:00 2001
From: Sarika Sharma <quic_sarishar@quicinc.com>
Date: Wed, 28 May 2025 11:14:18 +0530
Subject: wifi: mac80211: extend support to fill link level sinfo structure

Currently, the sinfo structure supports filling information at the
deflink (or one of the links) level for a station. This has problems
when applied to fetch multi-link (ML) station information.

Hence, if valid_links are present, support filling link_station
structure for each link.

This will be helpful to check the link related statistics during MLO.

Additionally, TXQ stats for pertid are applicable at station level,
not at link level. Therefore, check that link_id is less than 0 before
filling TXQ stats in pertid stats.

Signed-off-by: Sarika Sharma <quic_sarishar@quicinc.com>
Link: https://patch.msgid.link/20250528054420.3050133-9-quic_sarishar@quicinc.com
[fix some indentation]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h  |  11 ++
 net/mac80211/sta_info.c | 260 +++++++++++++++++++++++++++++++++++++++++++++++-
 net/wireless/util.c     |  12 +++
 3 files changed, 281 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 7bf0c97d2ab1..eec066f4738a 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -8566,6 +8566,17 @@ void cfg80211_tx_mgmt_expired(struct wireless_dev *wdev, u64 cookie,
  */
 int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp);
 
+/**
+ * cfg80211_link_sinfo_alloc_tid_stats - allocate per-tid statistics.
+ *
+ * @link_sinfo: the link station information
+ * @gfp: allocation flags
+ *
+ * Return: 0 on success. Non-zero on error.
+ */
+int cfg80211_link_sinfo_alloc_tid_stats(struct link_station_info *link_sinfo,
+					gfp_t gfp);
+
 /**
  * cfg80211_sinfo_release_content - release contents of station info
  * @sinfo: the station information
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index cf80b2fc8898..67af43d2e09b 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -2634,7 +2634,7 @@ static void sta_set_tidstats(struct sta_info *sta,
 			link_sta_info->status_stats.msdu_failed[tid];
 	}
 
-	if (tid < IEEE80211_NUM_TIDS) {
+	if (link_id < 0 && tid < IEEE80211_NUM_TIDS) {
 		spin_lock_bh(&local->fq.lock);
 		rcu_read_lock();
 
@@ -2719,13 +2719,249 @@ void sta_set_accumulated_removed_links_sinfo(struct sta_info *sta,
 	}
 }
 
+static void sta_set_link_sinfo(struct sta_info *sta,
+			       struct link_station_info *link_sinfo,
+			       struct ieee80211_link_data *link,
+			       bool tidstats)
+{
+	struct ieee80211_sub_if_data *sdata = sta->sdata;
+	struct ieee80211_sta_rx_stats *last_rxstats;
+	int i, ac, cpu, link_id = link->link_id;
+	struct link_sta_info *link_sta_info;
+	u32 thr = 0;
+
+	last_rxstats = sta_get_last_rx_stats(sta, link_id);
+
+	link_sta_info = wiphy_dereference(sta->local->hw.wiphy,
+					  sta->link[link_id]);
+
+	/* do before driver, so beacon filtering drivers have a
+	 * chance to e.g. just add the number of filtered beacons
+	 * (or just modify the value entirely, of course)
+	 */
+	if (sdata->vif.type == NL80211_IFTYPE_STATION)
+		link_sinfo->rx_beacon = link->u.mgd.count_beacon_signal;
+
+	ether_addr_copy(link_sinfo->addr, link_sta_info->addr);
+
+	/* TODO: add drv_link_sta_statistics() ops to fill link_station
+	 * statistics of station.
+	 */
+
+	link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_INACTIVE_TIME) |
+			 BIT_ULL(NL80211_STA_INFO_BSS_PARAM) |
+			 BIT_ULL(NL80211_STA_INFO_RX_DROP_MISC);
+
+	if (sdata->vif.type == NL80211_IFTYPE_STATION) {
+		link_sinfo->beacon_loss_count =
+			link->u.mgd.beacon_loss_count;
+		link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_LOSS);
+	}
+
+	link_sinfo->inactive_time =
+		jiffies_to_msecs(jiffies - ieee80211_sta_last_active(sta, link_id));
+
+	if (!(link_sinfo->filled & (BIT_ULL(NL80211_STA_INFO_TX_BYTES64) |
+				    BIT_ULL(NL80211_STA_INFO_TX_BYTES)))) {
+		link_sinfo->tx_bytes = 0;
+		for (ac = 0; ac < IEEE80211_NUM_ACS; ac++)
+			link_sinfo->tx_bytes +=
+				link_sta_info->tx_stats.bytes[ac];
+		link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BYTES64);
+	}
+
+	if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_PACKETS))) {
+		link_sinfo->tx_packets = 0;
+		for (ac = 0; ac < IEEE80211_NUM_ACS; ac++)
+			link_sinfo->tx_packets +=
+				link_sta_info->tx_stats.packets[ac];
+		link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_PACKETS);
+	}
+
+	if (!(link_sinfo->filled & (BIT_ULL(NL80211_STA_INFO_RX_BYTES64) |
+			       BIT_ULL(NL80211_STA_INFO_RX_BYTES)))) {
+		link_sinfo->rx_bytes +=
+			sta_get_stats_bytes(&link_sta_info->rx_stats);
+
+		if (link_sta_info->pcpu_rx_stats) {
+			for_each_possible_cpu(cpu) {
+				struct ieee80211_sta_rx_stats *cpurxs;
+
+				cpurxs = per_cpu_ptr(link_sta_info->pcpu_rx_stats,
+						     cpu);
+				link_sinfo->rx_bytes +=
+					sta_get_stats_bytes(cpurxs);
+			}
+		}
+
+		link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BYTES64);
+	}
+
+	if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_PACKETS))) {
+		link_sinfo->rx_packets = link_sta_info->rx_stats.packets;
+		if (link_sta_info->pcpu_rx_stats) {
+			for_each_possible_cpu(cpu) {
+				struct ieee80211_sta_rx_stats *cpurxs;
+
+				cpurxs = per_cpu_ptr(link_sta_info->pcpu_rx_stats,
+						     cpu);
+				link_sinfo->rx_packets += cpurxs->packets;
+			}
+		}
+		link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_PACKETS);
+	}
+
+	if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_RETRIES))) {
+		link_sinfo->tx_retries =
+			link_sta_info->status_stats.retry_count;
+		link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_RETRIES);
+	}
+
+	if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_FAILED))) {
+		link_sinfo->tx_failed =
+			link_sta_info->status_stats.retry_failed;
+		link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_FAILED);
+	}
+
+	if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_DURATION))) {
+		for (ac = 0; ac < IEEE80211_NUM_ACS; ac++)
+			link_sinfo->rx_duration += sta->airtime[ac].rx_airtime;
+		link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_DURATION);
+	}
+
+	if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_DURATION))) {
+		for (ac = 0; ac < IEEE80211_NUM_ACS; ac++)
+			link_sinfo->tx_duration += sta->airtime[ac].tx_airtime;
+		link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_DURATION);
+	}
+
+	if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_AIRTIME_WEIGHT))) {
+		link_sinfo->airtime_weight = sta->airtime_weight;
+		link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_AIRTIME_WEIGHT);
+	}
+
+	link_sinfo->rx_dropped_misc = link_sta_info->rx_stats.dropped;
+	if (link_sta_info->pcpu_rx_stats) {
+		for_each_possible_cpu(cpu) {
+			struct ieee80211_sta_rx_stats *cpurxs;
+
+			cpurxs = per_cpu_ptr(link_sta_info->pcpu_rx_stats,
+					     cpu);
+			link_sinfo->rx_dropped_misc += cpurxs->dropped;
+		}
+	}
+
+	if (sdata->vif.type == NL80211_IFTYPE_STATION &&
+	    !(sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER)) {
+		link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_RX) |
+				 BIT_ULL(NL80211_STA_INFO_BEACON_SIGNAL_AVG);
+		link_sinfo->rx_beacon_signal_avg =
+			ieee80211_ave_rssi(&sdata->vif, -1);
+	}
+
+	if (ieee80211_hw_check(&sta->local->hw, SIGNAL_DBM) ||
+	    ieee80211_hw_check(&sta->local->hw, SIGNAL_UNSPEC)) {
+		if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_SIGNAL))) {
+			link_sinfo->signal = (s8)last_rxstats->last_signal;
+			link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL);
+		}
+
+		if (!link_sta_info->pcpu_rx_stats &&
+		    !(link_sinfo->filled &
+		       BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG))) {
+			link_sinfo->signal_avg =
+				-ewma_signal_read(&link_sta_info->rx_stats_avg.signal);
+			link_sinfo->filled |=
+				BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG);
+		}
+	}
+
+	/* for the average - if pcpu_rx_stats isn't set - rxstats must point to
+	 * the sta->rx_stats struct, so the check here is fine with and without
+	 * pcpu statistics
+	 */
+	if (last_rxstats->chains &&
+	    !(link_sinfo->filled & (BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL) |
+			       BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)))) {
+		link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL);
+		if (!link_sta_info->pcpu_rx_stats)
+			link_sinfo->filled |=
+				BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG);
+
+		link_sinfo->chains = last_rxstats->chains;
+
+		for (i = 0; i < ARRAY_SIZE(link_sinfo->chain_signal); i++) {
+			link_sinfo->chain_signal[i] =
+				last_rxstats->chain_signal_last[i];
+			link_sinfo->chain_signal_avg[i] =
+				-ewma_signal_read(
+					&link_sta_info->rx_stats_avg.chain_signal[i]);
+		}
+	}
+
+	if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE)) &&
+	    ieee80211_rate_valid(&link_sta_info->tx_stats.last_rate)) {
+		sta_set_rate_info_tx(sta, &link_sta_info->tx_stats.last_rate,
+				     &link_sinfo->txrate);
+		link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BITRATE);
+	}
+
+	if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE))) {
+		if (sta_set_rate_info_rx(sta, &link_sinfo->rxrate,
+					 link_id) == 0)
+			link_sinfo->filled |=
+				BIT_ULL(NL80211_STA_INFO_RX_BITRATE);
+	}
+
+	if (tidstats && !cfg80211_link_sinfo_alloc_tid_stats(link_sinfo,
+							     GFP_KERNEL)) {
+		for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++)
+			sta_set_tidstats(sta, &link_sinfo->pertid[i], i,
+					 link_id);
+	}
+
+	link_sinfo->bss_param.flags = 0;
+	if (sdata->vif.bss_conf.use_cts_prot)
+		link_sinfo->bss_param.flags |= BSS_PARAM_FLAGS_CTS_PROT;
+	if (sdata->vif.bss_conf.use_short_preamble)
+		link_sinfo->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_PREAMBLE;
+	if (sdata->vif.bss_conf.use_short_slot)
+		link_sinfo->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_SLOT_TIME;
+	link_sinfo->bss_param.dtim_period = link->conf->dtim_period;
+	link_sinfo->bss_param.beacon_interval = link->conf->beacon_int;
+
+	thr = sta_get_expected_throughput(sta);
+
+	if (thr != 0) {
+		link_sinfo->filled |=
+			BIT_ULL(NL80211_STA_INFO_EXPECTED_THROUGHPUT);
+		link_sinfo->expected_throughput = thr;
+	}
+
+	if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL)) &&
+	    link_sta_info->status_stats.ack_signal_filled) {
+		link_sinfo->ack_signal =
+			link_sta_info->status_stats.last_ack_signal;
+		link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL);
+	}
+
+	if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG)) &&
+	    link_sta_info->status_stats.ack_signal_filled) {
+		link_sinfo->avg_ack_signal =
+			-(s8)ewma_avg_signal_read(
+				&link_sta_info->status_stats.avg_ack_signal);
+		link_sinfo->filled |=
+			BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG);
+	}
+}
+
 void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
 		   bool tidstats)
 {
 	struct ieee80211_sub_if_data *sdata = sta->sdata;
 	struct ieee80211_local *local = sdata->local;
 	u32 thr = 0;
-	int i, ac, cpu;
+	int i, ac, cpu, link_id;
 	struct ieee80211_sta_rx_stats *last_rxstats;
 
 	last_rxstats = sta_get_last_rx_stats(sta, -1);
@@ -2963,6 +3199,26 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
 		sinfo->filled |=
 			BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG);
 	}
+
+	if (sta->sta.valid_links) {
+		struct ieee80211_link_data *link;
+		struct link_sta_info *link_sta;
+
+		ether_addr_copy(sinfo->mld_addr, sta->addr);
+		for_each_valid_link(sinfo, link_id) {
+			link_sta = wiphy_dereference(sta->local->hw.wiphy,
+						     sta->link[link_id]);
+			link = wiphy_dereference(sdata->local->hw.wiphy,
+						 sdata->link[link_id]);
+
+			if (!link_sta || !sinfo->links[link_id] || !link)
+				continue;
+
+			sinfo->valid_links = sta->sta.valid_links;
+			sta_set_link_sinfo(sta, sinfo->links[link_id],
+					   link, tidstats);
+		}
+	}
 }
 
 u32 sta_get_expected_throughput(struct sta_info *sta)
diff --git a/net/wireless/util.c b/net/wireless/util.c
index e438f883f085..5aff11c35303 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -2650,6 +2650,18 @@ bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range,
 	return false;
 }
 
+int cfg80211_link_sinfo_alloc_tid_stats(struct link_station_info *link_sinfo,
+					gfp_t gfp)
+{
+	link_sinfo->pertid = kcalloc(IEEE80211_NUM_TIDS + 1,
+				     sizeof(*link_sinfo->pertid), gfp);
+	if (!link_sinfo->pertid)
+		return -ENOMEM;
+
+	return 0;
+}
+EXPORT_SYMBOL(cfg80211_link_sinfo_alloc_tid_stats);
+
 int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp)
 {
 	sinfo->pertid = kcalloc(IEEE80211_NUM_TIDS + 1,
-- 
cgit v1.2.3


From 4cb1ce7e254adeeeec7ccbb45125307aec4d0f0b Mon Sep 17 00:00:00 2001
From: Sarika Sharma <quic_sarishar@quicinc.com>
Date: Wed, 28 May 2025 11:14:20 +0530
Subject: wifi: mac80211: add link_sta_statistics ops to fill link station
 statistics

Currently, link station statistics for MLO are filled by mac80211.
But some statistics kept by mac80211 might not be accurate, so let the
driver pre-fill the link statistics. The driver can fill the values
(indicating which fields are filled by setting the filled bitmap in
the link_station structure).
Statistics that the driver doesn't fill are filled by mac80211.

Hence, add link_sta_statistics callback to fill link station statistics
for MLO in sta_set_link_sinfo() by drivers.

Signed-off-by: Sarika Sharma <quic_sarishar@quicinc.com>
Link: https://patch.msgid.link/20250528054420.3050133-11-quic_sarishar@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h    | 13 +++++++++++++
 net/mac80211/driver-ops.h | 19 +++++++++++++++++++
 net/mac80211/sta_info.c   |  6 +++---
 net/mac80211/trace.h      | 27 +++++++++++++++++++++++++++
 4 files changed, 62 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index a305e7f9c6b2..fa2325692abf 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -4133,6 +4133,15 @@ struct ieee80211_prep_tx_info {
  *	Statistics that the driver doesn't fill will be filled by mac80211.
  *	The callback can sleep.
  *
+ * @link_sta_statistics: Get link statistics for this station. For example with
+ *	beacon filtering, the statistics kept by mac80211 might not be
+ *	accurate, so let the driver pre-fill the statistics. The driver can
+ *	fill most of the values (indicating which by setting the filled
+ *	bitmap), but not all of them make sense - see the source for which
+ *	ones are possible.
+ *	Statistics that the driver doesn't fill will be filled by mac80211.
+ *	The callback can sleep.
+ *
  * @conf_tx: Configure TX queue parameters (EDCF (aifs, cw_min, cw_max),
  *	bursting) for a hardware TX queue.
  *	Returns a negative error code on failure.
@@ -4627,6 +4636,10 @@ struct ieee80211_ops {
 			   s64 offset);
 	void (*reset_tsf)(struct ieee80211_hw *hw, struct ieee80211_vif *vif);
 	int (*tx_last_beacon)(struct ieee80211_hw *hw);
+	void (*link_sta_statistics)(struct ieee80211_hw *hw,
+				    struct ieee80211_vif *vif,
+				    struct ieee80211_link_sta *link_sta,
+				    struct link_station_info *link_sinfo);
 
 	/**
 	 * @ampdu_action:
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 307587c8a003..ba017bf3fd15 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -631,6 +631,25 @@ static inline void drv_sta_statistics(struct ieee80211_local *local,
 	trace_drv_return_void(local);
 }
 
+static inline void drv_link_sta_statistics(struct ieee80211_local *local,
+					   struct ieee80211_sub_if_data *sdata,
+					   struct ieee80211_link_sta *link_sta,
+					   struct link_station_info *link_sinfo)
+{
+	might_sleep();
+	lockdep_assert_wiphy(local->hw.wiphy);
+
+	sdata = get_bss_sdata(sdata);
+	if (!check_sdata_in_driver(sdata))
+		return;
+
+	trace_drv_link_sta_statistics(local, sdata, link_sta);
+	if (local->ops->link_sta_statistics)
+		local->ops->link_sta_statistics(&local->hw, &sdata->vif,
+						link_sta, link_sinfo);
+	trace_drv_return_void(local);
+}
+
 int drv_conf_tx(struct ieee80211_local *local,
 		struct ieee80211_link_data *link, u16 ac,
 		const struct ieee80211_tx_queue_params *params);
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 67af43d2e09b..89cf365b07e6 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -2744,9 +2744,9 @@ static void sta_set_link_sinfo(struct sta_info *sta,
 
 	ether_addr_copy(link_sinfo->addr, link_sta_info->addr);
 
-	/* TODO: add drv_link_sta_statistics() ops to fill link_station
-	 * statistics of station.
-	 */
+	drv_link_sta_statistics(sta->local, sdata,
+				link_sta_info->pub,
+				link_sinfo);
 
 	link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_INACTIVE_TIME) |
 			 BIT_ULL(NL80211_STA_INFO_BSS_PARAM) |
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 72fad8ea8bb9..8215ca58ce5e 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -1002,6 +1002,33 @@ DEFINE_EVENT(sta_event, drv_sta_statistics,
 	TP_ARGS(local, sdata, sta)
 );
 
+TRACE_EVENT(drv_link_sta_statistics,
+	TP_PROTO(struct ieee80211_local *local,
+		 struct ieee80211_sub_if_data *sdata,
+		 struct ieee80211_link_sta *link_sta),
+
+	TP_ARGS(local, sdata, link_sta),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		VIF_ENTRY
+		STA_ENTRY
+		__field(u32, link_id)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		VIF_ASSIGN;
+		STA_NAMED_ASSIGN(link_sta->sta);
+		__entry->link_id = link_sta->link_id;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT  VIF_PR_FMT  STA_PR_FMT " (link %d)",
+		LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->link_id
+	)
+);
+
 DEFINE_EVENT(sta_event, drv_sta_add,
 	TP_PROTO(struct ieee80211_local *local,
 		 struct ieee80211_sub_if_data *sdata,
-- 
cgit v1.2.3


From b74947b4f6ff7c122a1bb6eb38bb7ecfbb1d3820 Mon Sep 17 00:00:00 2001
From: Roopni Devanathan <quic_rdevanat@quicinc.com>
Date: Sun, 15 Jun 2025 13:53:09 +0530
Subject: wifi: cfg80211/mac80211: Add support to get radio index

Currently, per-radio attributes are set on a per-phy basis, i.e., all the
radios present in a wiphy take the attribute values sent from userspace.
But each radio in a wiphy can get different values from userspace based
on its requirements.

To extend support to set per-radio attributes, add support to get radio
index from userspace. Add an NL attribute - NL80211_ATTR_WIPHY_RADIO_INDEX,
to get user specified radio index for which attributes should be changed.
Pass this to individual drivers, so that the drivers can use this radio
index to change per-radio attributes when necessary. Currently, per-radio
attributes identified are:
NL80211_ATTR_WIPHY_TX_POWER_LEVEL
NL80211_ATTR_WIPHY_ANTENNA_TX
NL80211_ATTR_WIPHY_ANTENNA_RX
NL80211_ATTR_WIPHY_RETRY_SHORT
NL80211_ATTR_WIPHY_RETRY_LONG
NL80211_ATTR_WIPHY_FRAG_THRESHOLD
NL80211_ATTR_WIPHY_RTS_THRESHOLD
NL80211_ATTR_WIPHY_COVERAGE_CLASS
NL80211_ATTR_TXQ_LIMIT
NL80211_ATTR_TXQ_MEMORY_LIMIT
NL80211_ATTR_TXQ_QUANTUM

By default, the radio index is set to -1. This means the attribute should
be treated as a global configuration. If the user has not specified any
index, then the radio index passed to individual drivers would be -1. This
would indicate that the attribute applies to all radios in that wiphy.

Signed-off-by: Roopni Devanathan <quic_rdevanat@quicinc.com>
Link: https://patch.msgid.link/20250615082312.619639-2-quic_rdevanat@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/admtek/adm8211.c              |  2 +-
 drivers/net/wireless/ath/ar5523/ar5523.c           |  5 +-
 drivers/net/wireless/ath/ath10k/core.c             |  2 +-
 drivers/net/wireless/ath/ath10k/hw.c               |  1 +
 drivers/net/wireless/ath/ath10k/hw.h               |  2 +-
 drivers/net/wireless/ath/ath10k/mac.c              | 19 ++++--
 drivers/net/wireless/ath/ath11k/mac.c              | 14 ++--
 drivers/net/wireless/ath/ath12k/mac.c              | 14 ++--
 drivers/net/wireless/ath/ath5k/mac80211-ops.c      | 12 ++--
 drivers/net/wireless/ath/ath6kl/cfg80211.c         |  7 +-
 drivers/net/wireless/ath/ath9k/htc_drv_main.c      | 10 +--
 drivers/net/wireless/ath/ath9k/main.c              |  9 ++-
 drivers/net/wireless/ath/carl9170/main.c           |  2 +-
 drivers/net/wireless/ath/wcn36xx/main.c            |  5 +-
 drivers/net/wireless/ath/wil6210/cfg80211.c        |  3 +-
 drivers/net/wireless/atmel/at76c50x-usb.c          |  2 +-
 drivers/net/wireless/broadcom/b43/main.c           |  6 +-
 drivers/net/wireless/broadcom/b43legacy/main.c     |  2 +-
 .../broadcom/brcm80211/brcmfmac/cfg80211.c         |  8 ++-
 .../broadcom/brcm80211/brcmsmac/mac80211_if.c      |  3 +-
 drivers/net/wireless/intel/iwlegacy/common.c       |  2 +-
 drivers/net/wireless/intel/iwlegacy/common.h       |  2 +-
 drivers/net/wireless/intel/iwlwifi/dvm/agn.h       |  2 +-
 drivers/net/wireless/intel/iwlwifi/dvm/rxon.c      |  2 +-
 drivers/net/wireless/intel/iwlwifi/mld/mac80211.c  |  6 +-
 drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c  |  9 ++-
 drivers/net/wireless/intel/iwlwifi/mvm/mvm.h       | 12 ++--
 drivers/net/wireless/intersil/p54/main.c           |  3 +-
 drivers/net/wireless/marvell/libertas_tf/main.c    |  2 +-
 drivers/net/wireless/marvell/mwifiex/cfg80211.c    | 11 ++-
 drivers/net/wireless/marvell/mwl8k.c               | 12 ++--
 drivers/net/wireless/mediatek/mt76/mac80211.c      |  3 +-
 drivers/net/wireless/mediatek/mt76/mt76.h          |  3 +-
 drivers/net/wireless/mediatek/mt76/mt7603/main.c   |  5 +-
 drivers/net/wireless/mediatek/mt76/mt7615/main.c   | 11 +--
 drivers/net/wireless/mediatek/mt76/mt76x0/main.c   |  2 +-
 drivers/net/wireless/mediatek/mt76/mt76x0/mt76x0.h |  2 +-
 drivers/net/wireless/mediatek/mt76/mt76x02.h       |  4 +-
 drivers/net/wireless/mediatek/mt76/mt76x02_util.c  |  4 +-
 .../net/wireless/mediatek/mt76/mt76x2/pci_main.c   |  6 +-
 .../net/wireless/mediatek/mt76/mt76x2/usb_main.c   |  2 +-
 drivers/net/wireless/mediatek/mt76/mt7915/main.c   | 13 ++--
 drivers/net/wireless/mediatek/mt76/mt7921/main.c   |  8 ++-
 drivers/net/wireless/mediatek/mt76/mt7925/main.c   |  8 ++-
 drivers/net/wireless/mediatek/mt76/mt792x.h        |  3 +-
 drivers/net/wireless/mediatek/mt76/mt792x_core.c   |  3 +-
 drivers/net/wireless/mediatek/mt76/mt7996/main.c   | 11 +--
 drivers/net/wireless/mediatek/mt7601u/main.c       |  5 +-
 drivers/net/wireless/microchip/wilc1000/cfg80211.c |  7 +-
 drivers/net/wireless/purelifi/plfxlc/mac.c         |  5 +-
 drivers/net/wireless/quantenna/qtnfmac/cfg80211.c  |  8 ++-
 drivers/net/wireless/ralink/rt2x00/rt2800lib.c     |  2 +-
 drivers/net/wireless/ralink/rt2x00/rt2800lib.h     |  3 +-
 drivers/net/wireless/ralink/rt2x00/rt2x00.h        |  8 ++-
 drivers/net/wireless/ralink/rt2x00/rt2x00mac.c     |  8 ++-
 drivers/net/wireless/realtek/rtl818x/rtl8180/dev.c |  2 +-
 drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c |  2 +-
 drivers/net/wireless/realtek/rtl8xxxu/core.c       |  8 ++-
 drivers/net/wireless/realtek/rtlwifi/core.c        |  2 +-
 drivers/net/wireless/realtek/rtw88/mac80211.c      |  9 ++-
 drivers/net/wireless/realtek/rtw88/main.h          |  2 +-
 drivers/net/wireless/realtek/rtw88/rtw8822b.c      |  1 +
 drivers/net/wireless/realtek/rtw88/rtw8822c.c      |  1 +
 drivers/net/wireless/realtek/rtw89/mac80211.c      | 10 +--
 drivers/net/wireless/rsi/rsi_91x_mac80211.c        |  9 ++-
 drivers/net/wireless/silabs/wfx/sta.c              |  4 +-
 drivers/net/wireless/silabs/wfx/sta.h              |  4 +-
 drivers/net/wireless/st/cw1200/sta.c               |  5 +-
 drivers/net/wireless/st/cw1200/sta.h               |  5 +-
 drivers/net/wireless/ti/wl1251/main.c              |  5 +-
 drivers/net/wireless/ti/wlcore/main.c              |  8 ++-
 drivers/net/wireless/virtual/mac80211_hwsim.c      |  6 +-
 drivers/net/wireless/zydas/zd1211rw/zd_mac.c       |  2 +-
 drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c  |  6 +-
 include/net/cfg80211.h                             | 12 ++--
 include/net/mac80211.h                             | 17 +++--
 include/uapi/linux/nl80211.h                       | 10 +++
 net/mac80211/cfg.c                                 | 30 +++++---
 net/mac80211/chan.c                                |  2 +-
 net/mac80211/driver-ops.h                          | 36 +++++-----
 net/mac80211/ieee80211_i.h                         |  5 +-
 net/mac80211/iface.c                               |  6 +-
 net/mac80211/main.c                                |  9 +--
 net/mac80211/mlme.c                                | 12 ++--
 net/mac80211/offchannel.c                          |  2 +-
 net/mac80211/pm.c                                  |  2 +-
 net/mac80211/trace.h                               | 78 ++++++++++++++++-----
 net/mac80211/tx.c                                  |  4 +-
 net/mac80211/util.c                                | 16 ++---
 net/wireless/nl80211.c                             | 26 +++++--
 net/wireless/rdev-ops.h                            | 39 ++++++-----
 net/wireless/trace.h                               | 79 +++++++++++++++-------
 net/wireless/wext-compat.c                         | 10 +--
 93 files changed, 520 insertions(+), 291 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/admtek/adm8211.c b/drivers/net/wireless/admtek/adm8211.c
index a2d87c3ad196..e94a6b180314 100644
--- a/drivers/net/wireless/admtek/adm8211.c
+++ b/drivers/net/wireless/admtek/adm8211.c
@@ -1293,7 +1293,7 @@ static void adm8211_set_bssid(struct ieee80211_hw *dev, const u8 *bssid)
 	ADM8211_CSR_WRITE(ABDA1, reg);
 }
 
-static int adm8211_config(struct ieee80211_hw *dev, u32 changed)
+static int adm8211_config(struct ieee80211_hw *dev, int radio_idx, u32 changed)
 {
 	struct adm8211_priv *priv = dev->priv;
 	struct ieee80211_conf *conf = &dev->conf;
diff --git a/drivers/net/wireless/ath/ar5523/ar5523.c b/drivers/net/wireless/ath/ar5523/ar5523.c
index 343c9de2749c..1230e6278f23 100644
--- a/drivers/net/wireless/ath/ar5523/ar5523.c
+++ b/drivers/net/wireless/ath/ar5523/ar5523.c
@@ -1083,7 +1083,8 @@ static void ar5523_stop(struct ieee80211_hw *hw, bool suspend)
 	mutex_unlock(&ar->mutex);
 }
 
-static int ar5523_set_rts_threshold(struct ieee80211_hw *hw, u32 value)
+static int ar5523_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx,
+				    u32 value)
 {
 	struct ar5523 *ar = hw->priv;
 	int ret;
@@ -1137,7 +1138,7 @@ static void ar5523_remove_interface(struct ieee80211_hw *hw,
 	ar->vif = NULL;
 }
 
-static int ar5523_hwconfig(struct ieee80211_hw *hw, u32 changed)
+static int ar5523_hwconfig(struct ieee80211_hw *hw, int radio_idx, u32 changed)
 {
 	struct ar5523 *ar = hw->priv;
 
diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c
index fe3a8f4a1cc1..52163c2bfe7a 100644
--- a/drivers/net/wireless/ath/ath10k/core.c
+++ b/drivers/net/wireless/ath/ath10k/core.c
@@ -2606,7 +2606,7 @@ static void ath10k_core_set_coverage_class_work(struct work_struct *work)
 					 set_coverage_class_work);
 
 	if (ar->hw_params.hw_ops->set_coverage_class)
-		ar->hw_params.hw_ops->set_coverage_class(ar, -1);
+		ar->hw_params.hw_ops->set_coverage_class(ar, -1, -1);
 }
 
 static int ath10k_core_init_firmware_features(struct ath10k *ar)
diff --git a/drivers/net/wireless/ath/ath10k/hw.c b/drivers/net/wireless/ath/ath10k/hw.c
index 84b35a22fc23..59b6cebfdd8f 100644
--- a/drivers/net/wireless/ath/ath10k/hw.c
+++ b/drivers/net/wireless/ath/ath10k/hw.c
@@ -590,6 +590,7 @@ void ath10k_hw_fill_survey_time(struct ath10k *ar, struct survey_info *survey,
  * function monitors and modifies the corresponding MAC registers.
  */
 static void ath10k_hw_qca988x_set_coverage_class(struct ath10k *ar,
+						 int radio_idx,
 						 s16 value)
 {
 	u32 slottime_reg;
diff --git a/drivers/net/wireless/ath/ath10k/hw.h b/drivers/net/wireless/ath/ath10k/hw.h
index 7ffa1fbe2874..fec56b916497 100644
--- a/drivers/net/wireless/ath/ath10k/hw.h
+++ b/drivers/net/wireless/ath/ath10k/hw.h
@@ -646,7 +646,7 @@ struct htt_rx_ring_rx_desc_offsets;
 
 /* Defines needed for Rx descriptor abstraction */
 struct ath10k_hw_ops {
-	void (*set_coverage_class)(struct ath10k *ar, s16 value);
+	void (*set_coverage_class)(struct ath10k *ar, int radio_idx, s16 value);
 	int (*enable_pll_clk)(struct ath10k *ar);
 	int (*tx_data_rssi_pad_bytes)(struct htt_resp *htt);
 	int (*is_rssi_enable)(struct htt_resp *resp);
diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c
index 07fe05384cdf..590d7a8dd399 100644
--- a/drivers/net/wireless/ath/ath10k/mac.c
+++ b/drivers/net/wireless/ath/ath10k/mac.c
@@ -4820,7 +4820,8 @@ void ath10k_halt(struct ath10k *ar)
 	spin_unlock_bh(&ar->data_lock);
 }
 
-static int ath10k_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant)
+static int ath10k_get_antenna(struct ieee80211_hw *hw, int radio_idx,
+			      u32 *tx_ant, u32 *rx_ant)
 {
 	struct ath10k *ar = hw->priv;
 
@@ -5067,7 +5068,8 @@ static int __ath10k_set_antenna(struct ath10k *ar, u32 tx_ant, u32 rx_ant)
 	return 0;
 }
 
-static int ath10k_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant)
+static int ath10k_set_antenna(struct ieee80211_hw *hw, int radio_idx,
+			      u32 tx_ant, u32 rx_ant)
 {
 	struct ath10k *ar = hw->priv;
 	int ret;
@@ -5437,7 +5439,7 @@ static int ath10k_config_ps(struct ath10k *ar)
 	return ret;
 }
 
-static int ath10k_config(struct ieee80211_hw *hw, u32 changed)
+static int ath10k_config(struct ieee80211_hw *hw, int radio_idx, u32 changed)
 {
 	struct ath10k *ar = hw->priv;
 	struct ieee80211_conf *conf = &hw->conf;
@@ -6336,7 +6338,8 @@ static void ath10k_bss_info_changed(struct ieee80211_hw *hw,
 	mutex_unlock(&ar->conf_mutex);
 }
 
-static void ath10k_mac_op_set_coverage_class(struct ieee80211_hw *hw, s16 value)
+static void ath10k_mac_op_set_coverage_class(struct ieee80211_hw *hw, int radio_idx,
+					     s16 value)
 {
 	struct ath10k *ar = hw->priv;
 
@@ -6347,7 +6350,7 @@ static void ath10k_mac_op_set_coverage_class(struct ieee80211_hw *hw, s16 value)
 		WARN_ON_ONCE(1);
 		return;
 	}
-	ar->hw_params.hw_ops->set_coverage_class(ar, value);
+	ar->hw_params.hw_ops->set_coverage_class(ar, -1, value);
 }
 
 struct ath10k_mac_tdls_iter_data {
@@ -8035,7 +8038,8 @@ static int ath10k_cancel_remain_on_channel(struct ieee80211_hw *hw,
  * in ath10k, but device-specific in mac80211.
  */
 
-static int ath10k_set_rts_threshold(struct ieee80211_hw *hw, u32 value)
+static int ath10k_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx,
+				    u32 value)
 {
 	struct ath10k *ar = hw->priv;
 	struct ath10k_vif *arvif;
@@ -8058,7 +8062,8 @@ static int ath10k_set_rts_threshold(struct ieee80211_hw *hw, u32 value)
 	return ret;
 }
 
-static int ath10k_mac_op_set_frag_threshold(struct ieee80211_hw *hw, u32 value)
+static int ath10k_mac_op_set_frag_threshold(struct ieee80211_hw *hw,
+					    int radio_idx, u32 value)
 {
 	/* Even though there's a WMI enum for fragmentation threshold no known
 	 * firmware actually implements it. Moreover it is not possible to rely
diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c
index 13301ca317a5..758ef6f26432 100644
--- a/drivers/net/wireless/ath/ath11k/mac.c
+++ b/drivers/net/wireless/ath/ath11k/mac.c
@@ -1283,7 +1283,7 @@ static int ath11k_mac_config_ps(struct ath11k *ar)
 	return ret;
 }
 
-static int ath11k_mac_op_config(struct ieee80211_hw *hw, u32 changed)
+static int ath11k_mac_op_config(struct ieee80211_hw *hw, int radio_idx, u32 changed)
 {
 	struct ath11k *ar = hw->priv;
 	struct ieee80211_conf *conf = &hw->conf;
@@ -7044,7 +7044,8 @@ static void ath11k_mac_op_configure_filter(struct ieee80211_hw *hw,
 	mutex_unlock(&ar->conf_mutex);
 }
 
-static int ath11k_mac_op_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant)
+static int ath11k_mac_op_get_antenna(struct ieee80211_hw *hw, int radio_idx,
+				     u32 *tx_ant, u32 *rx_ant)
 {
 	struct ath11k *ar = hw->priv;
 
@@ -7058,7 +7059,8 @@ static int ath11k_mac_op_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *
 	return 0;
 }
 
-static int ath11k_mac_op_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant)
+static int ath11k_mac_op_set_antenna(struct ieee80211_hw *hw, int radio_idx,
+				     u32 tx_ant, u32 rx_ant)
 {
 	struct ath11k *ar = hw->priv;
 	int ret;
@@ -8182,7 +8184,8 @@ ath11k_set_vdev_param_to_all_vifs(struct ath11k *ar, int param, u32 value)
 /* mac80211 stores device specific RTS/Fragmentation threshold value,
  * this is set interface specific to firmware from ath11k driver
  */
-static int ath11k_mac_op_set_rts_threshold(struct ieee80211_hw *hw, u32 value)
+static int ath11k_mac_op_set_rts_threshold(struct ieee80211_hw *hw,
+					   int radio_idx, u32 value)
 {
 	struct ath11k *ar = hw->priv;
 	int param_id = WMI_VDEV_PARAM_RTS_THRESHOLD;
@@ -8190,7 +8193,8 @@ static int ath11k_mac_op_set_rts_threshold(struct ieee80211_hw *hw, u32 value)
 	return ath11k_set_vdev_param_to_all_vifs(ar, param_id, value);
 }
 
-static int ath11k_mac_op_set_frag_threshold(struct ieee80211_hw *hw, u32 value)
+static int ath11k_mac_op_set_frag_threshold(struct ieee80211_hw *hw,
+					    int radio_idx, u32 value)
 {
 	/* Even though there's a WMI vdev param for fragmentation threshold no
 	 * known firmware actually implements it. Moreover it is not possible to
diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c
index 59ec422992d3..81c6b80fa890 100644
--- a/drivers/net/wireless/ath/ath12k/mac.c
+++ b/drivers/net/wireless/ath/ath12k/mac.c
@@ -1392,7 +1392,7 @@ err:
 	return ret;
 }
 
-static int ath12k_mac_op_config(struct ieee80211_hw *hw, u32 changed)
+static int ath12k_mac_op_config(struct ieee80211_hw *hw, int radio_idx, u32 changed)
 {
 	return 0;
 }
@@ -9354,7 +9354,8 @@ static void ath12k_mac_op_configure_filter(struct ieee80211_hw *hw,
 	ar->filter_flags = *total_flags;
 }
 
-static int ath12k_mac_op_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant)
+static int ath12k_mac_op_get_antenna(struct ieee80211_hw *hw, int radio_idx,
+				     u32 *tx_ant, u32 *rx_ant)
 {
 	struct ath12k_hw *ah = ath12k_hw_to_ah(hw);
 	int antennas_rx = 0, antennas_tx = 0;
@@ -9374,7 +9375,8 @@ static int ath12k_mac_op_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *
 	return 0;
 }
 
-static int ath12k_mac_op_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant)
+static int ath12k_mac_op_set_antenna(struct ieee80211_hw *hw, int radio_idx,
+				     u32 tx_ant, u32 rx_ant)
 {
 	struct ath12k_hw *ah = ath12k_hw_to_ah(hw);
 	struct ath12k *ar;
@@ -10735,7 +10737,8 @@ ath12k_set_vdev_param_to_all_vifs(struct ath12k *ar, int param, u32 value)
 /* mac80211 stores device specific RTS/Fragmentation threshold value,
  * this is set interface specific to firmware from ath12k driver
  */
-static int ath12k_mac_op_set_rts_threshold(struct ieee80211_hw *hw, u32 value)
+static int ath12k_mac_op_set_rts_threshold(struct ieee80211_hw *hw,
+					   int radio_idx, u32 value)
 {
 	struct ath12k_hw *ah = ath12k_hw_to_ah(hw);
 	struct ath12k *ar;
@@ -10760,7 +10763,8 @@ static int ath12k_mac_op_set_rts_threshold(struct ieee80211_hw *hw, u32 value)
 	return ret;
 }
 
-static int ath12k_mac_op_set_frag_threshold(struct ieee80211_hw *hw, u32 value)
+static int ath12k_mac_op_set_frag_threshold(struct ieee80211_hw *hw,
+					    int radio_idx, u32 value)
 {
 	/* Even though there's a WMI vdev param for fragmentation threshold no
 	 * known firmware actually implements it. Moreover it is not possible to
diff --git a/drivers/net/wireless/ath/ath5k/mac80211-ops.c b/drivers/net/wireless/ath/ath5k/mac80211-ops.c
index d81b2ad0b095..eca8145d3874 100644
--- a/drivers/net/wireless/ath/ath5k/mac80211-ops.c
+++ b/drivers/net/wireless/ath/ath5k/mac80211-ops.c
@@ -192,7 +192,7 @@ ath5k_remove_interface(struct ieee80211_hw *hw,
  * TODO: Phy disable/diversity etc
  */
 static int
-ath5k_config(struct ieee80211_hw *hw, u32 changed)
+ath5k_config(struct ieee80211_hw *hw, int radio_idx, u32 changed)
 {
 	struct ath5k_hw *ah = hw->priv;
 	struct ieee80211_conf *conf = &hw->conf;
@@ -686,6 +686,7 @@ ath5k_get_survey(struct ieee80211_hw *hw, int idx, struct survey_info *survey)
  * ath5k_set_coverage_class - Set IEEE 802.11 coverage class
  *
  * @hw: struct ieee80211_hw pointer
+ * @radio_idx: Radio index
  * @coverage_class: IEEE 802.11 coverage class number
  *
  * Mac80211 callback. Sets slot time, ACK timeout and CTS timeout for given
@@ -693,7 +694,8 @@ ath5k_get_survey(struct ieee80211_hw *hw, int idx, struct survey_info *survey)
  * reset.
  */
 static void
-ath5k_set_coverage_class(struct ieee80211_hw *hw, s16 coverage_class)
+ath5k_set_coverage_class(struct ieee80211_hw *hw, int radio_idx,
+			 s16 coverage_class)
 {
 	struct ath5k_hw *ah = hw->priv;
 
@@ -704,7 +706,8 @@ ath5k_set_coverage_class(struct ieee80211_hw *hw, s16 coverage_class)
 
 
 static int
-ath5k_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant)
+ath5k_set_antenna(struct ieee80211_hw *hw, int radio_idx, u32 tx_ant,
+		  u32 rx_ant)
 {
 	struct ath5k_hw *ah = hw->priv;
 
@@ -721,7 +724,8 @@ ath5k_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant)
 
 
 static int
-ath5k_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant)
+ath5k_get_antenna(struct ieee80211_hw *hw, int radio_idx,
+		  u32 *tx_ant, u32 *rx_ant)
 {
 	struct ath5k_hw *ah = hw->priv;
 
diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c
index 8c2e8081112e..88f0197fc041 100644
--- a/drivers/net/wireless/ath/ath6kl/cfg80211.c
+++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c
@@ -1376,7 +1376,8 @@ void ath6kl_cfg80211_tkip_micerr_event(struct ath6kl_vif *vif, u8 keyid,
 				     GFP_KERNEL);
 }
 
-static int ath6kl_cfg80211_set_wiphy_params(struct wiphy *wiphy, u32 changed)
+static int ath6kl_cfg80211_set_wiphy_params(struct wiphy *wiphy, int radio_idx,
+					    u32 changed)
 {
 	struct ath6kl *ar = (struct ath6kl *)wiphy_priv(wiphy);
 	struct ath6kl_vif *vif;
@@ -1405,6 +1406,7 @@ static int ath6kl_cfg80211_set_wiphy_params(struct wiphy *wiphy, u32 changed)
 
 static int ath6kl_cfg80211_set_txpower(struct wiphy *wiphy,
 				       struct wireless_dev *wdev,
+				       int radio_idx,
 				       enum nl80211_tx_power_setting type,
 				       int mbm)
 {
@@ -1441,6 +1443,7 @@ static int ath6kl_cfg80211_set_txpower(struct wiphy *wiphy,
 
 static int ath6kl_cfg80211_get_txpower(struct wiphy *wiphy,
 				       struct wireless_dev *wdev,
+				       int radio_idx,
 				       unsigned int link_id,
 				       int *dbm)
 {
@@ -3242,7 +3245,7 @@ static int ath6kl_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
 					wait, buf, len, no_cck);
 }
 
-static int ath6kl_get_antenna(struct wiphy *wiphy,
+static int ath6kl_get_antenna(struct wiphy *wiphy, int radio_idx,
 			      u32 *tx_ant, u32 *rx_ant)
 {
 	struct ath6kl *ar = wiphy_priv(wiphy);
diff --git a/drivers/net/wireless/ath/ath9k/htc_drv_main.c b/drivers/net/wireless/ath/ath9k/htc_drv_main.c
index 19600018e562..0d6272ac0dac 100644
--- a/drivers/net/wireless/ath/ath9k/htc_drv_main.c
+++ b/drivers/net/wireless/ath/ath9k/htc_drv_main.c
@@ -1172,7 +1172,7 @@ static void ath9k_htc_remove_interface(struct ieee80211_hw *hw,
 	mutex_unlock(&priv->mutex);
 }
 
-static int ath9k_htc_config(struct ieee80211_hw *hw, u32 changed)
+static int ath9k_htc_config(struct ieee80211_hw *hw, int radio_idx, u32 changed)
 {
 	struct ath9k_htc_priv *priv = hw->priv;
 	struct ath_common *common = ath9k_hw_common(priv->ah);
@@ -1737,12 +1737,14 @@ static void ath9k_htc_sw_scan_complete(struct ieee80211_hw *hw,
 	mutex_unlock(&priv->mutex);
 }
 
-static int ath9k_htc_set_rts_threshold(struct ieee80211_hw *hw, u32 value)
+static int ath9k_htc_set_rts_threshold(struct ieee80211_hw *hw,
+				       int radio_idx, u32 value)
 {
 	return 0;
 }
 
 static void ath9k_htc_set_coverage_class(struct ieee80211_hw *hw,
+					 int radio_idx,
 					 s16 coverage_class)
 {
 	struct ath9k_htc_priv *priv = hw->priv;
@@ -1841,8 +1843,8 @@ struct base_eep_header *ath9k_htc_get_eeprom_base(struct ath9k_htc_priv *priv)
 }
 
 
-static int ath9k_htc_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant,
-				 u32 *rx_ant)
+static int ath9k_htc_get_antenna(struct ieee80211_hw *hw, int radio_idx,
+				 u32 *tx_ant, u32 *rx_ant)
 {
 	struct ath9k_htc_priv *priv = hw->priv;
 	struct base_eep_header *pBase = ath9k_htc_get_eeprom_base(priv);
diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c
index c56f4f3b8990..740a6fc7b067 100644
--- a/drivers/net/wireless/ath/ath9k/main.c
+++ b/drivers/net/wireless/ath/ath9k/main.c
@@ -1484,7 +1484,7 @@ static void ath9k_disable_ps(struct ath_softc *sc)
 	ath_dbg(common, PS, "PowerSave disabled\n");
 }
 
-static int ath9k_config(struct ieee80211_hw *hw, u32 changed)
+static int ath9k_config(struct ieee80211_hw *hw, int radio_idx, u32 changed)
 {
 	struct ath_softc *sc = hw->priv;
 	struct ath_hw *ah = sc->sc_ah;
@@ -2114,6 +2114,7 @@ static void ath9k_enable_dynack(struct ath_softc *sc)
 }
 
 static void ath9k_set_coverage_class(struct ieee80211_hw *hw,
+				     int radio_idx,
 				     s16 coverage_class)
 {
 	struct ath_softc *sc = hw->priv;
@@ -2338,7 +2339,8 @@ static bool validate_antenna_mask(struct ath_hw *ah, u32 val)
 	}
 }
 
-static int ath9k_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant)
+static int ath9k_set_antenna(struct ieee80211_hw *hw, int radio_idx,
+			     u32 tx_ant, u32 rx_ant)
 {
 	struct ath_softc *sc = hw->priv;
 	struct ath_hw *ah = sc->sc_ah;
@@ -2367,7 +2369,8 @@ static int ath9k_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant)
 	return 0;
 }
 
-static int ath9k_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant)
+static int ath9k_get_antenna(struct ieee80211_hw *hw, int radio_idx,
+			     u32 *tx_ant, u32 *rx_ant)
 {
 	struct ath_softc *sc = hw->priv;
 
diff --git a/drivers/net/wireless/ath/carl9170/main.c b/drivers/net/wireless/ath/carl9170/main.c
index 755c068e4197..a7a9345f3483 100644
--- a/drivers/net/wireless/ath/carl9170/main.c
+++ b/drivers/net/wireless/ath/carl9170/main.c
@@ -890,7 +890,7 @@ static void carl9170_stat_work(struct work_struct *work)
 		round_jiffies(msecs_to_jiffies(CARL9170_STAT_WORK)));
 }
 
-static int carl9170_op_config(struct ieee80211_hw *hw, u32 changed)
+static int carl9170_op_config(struct ieee80211_hw *hw, int radio_idx, u32 changed)
 {
 	struct ar9170 *ar = hw->priv;
 	int err = 0;
diff --git a/drivers/net/wireless/ath/wcn36xx/main.c b/drivers/net/wireless/ath/wcn36xx/main.c
index 94d08d6ae1a3..02a525645bfa 100644
--- a/drivers/net/wireless/ath/wcn36xx/main.c
+++ b/drivers/net/wireless/ath/wcn36xx/main.c
@@ -361,7 +361,7 @@ static void wcn36xx_change_opchannel(struct wcn36xx *wcn, int ch)
 	return;
 }
 
-static int wcn36xx_config(struct ieee80211_hw *hw, u32 changed)
+static int wcn36xx_config(struct ieee80211_hw *hw, int radio_idx, u32 changed)
 {
 	struct wcn36xx *wcn = hw->priv;
 	int ret;
@@ -965,7 +965,8 @@ out:
 }
 
 /* this is required when using IEEE80211_HW_HAS_RATE_CONTROL */
-static int wcn36xx_set_rts_threshold(struct ieee80211_hw *hw, u32 value)
+static int wcn36xx_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx,
+				     u32 value)
 {
 	struct wcn36xx *wcn = hw->priv;
 	wcn36xx_dbg(WCN36XX_DBG_MAC, "mac set RTS threshold %d\n", value);
diff --git a/drivers/net/wireless/ath/wil6210/cfg80211.c b/drivers/net/wireless/ath/wil6210/cfg80211.c
index 5473c01cbe66..7703a0933a14 100644
--- a/drivers/net/wireless/ath/wil6210/cfg80211.c
+++ b/drivers/net/wireless/ath/wil6210/cfg80211.c
@@ -1408,7 +1408,8 @@ static int wil_cfg80211_disconnect(struct wiphy *wiphy,
 	return rc;
 }
 
-static int wil_cfg80211_set_wiphy_params(struct wiphy *wiphy, u32 changed)
+static int wil_cfg80211_set_wiphy_params(struct wiphy *wiphy, int radio_idx,
+					 u32 changed)
 {
 	struct wil6210_priv *wil = wiphy_to_wil(wiphy);
 	int rc;
diff --git a/drivers/net/wireless/atmel/at76c50x-usb.c b/drivers/net/wireless/atmel/at76c50x-usb.c
index 6842c2b02b39..aa683eacaf38 100644
--- a/drivers/net/wireless/atmel/at76c50x-usb.c
+++ b/drivers/net/wireless/atmel/at76c50x-usb.c
@@ -2002,7 +2002,7 @@ exit:
 	return 0;
 }
 
-static int at76_config(struct ieee80211_hw *hw, u32 changed)
+static int at76_config(struct ieee80211_hw *hw, int radio_idx, u32 changed)
 {
 	struct at76_priv *priv = hw->priv;
 
diff --git a/drivers/net/wireless/broadcom/b43/main.c b/drivers/net/wireless/broadcom/b43/main.c
index 7529afd24aed..f1a77c4c445f 100644
--- a/drivers/net/wireless/broadcom/b43/main.c
+++ b/drivers/net/wireless/broadcom/b43/main.c
@@ -3975,7 +3975,7 @@ static void b43_set_retry_limits(struct b43_wldev *dev,
 			long_retry);
 }
 
-static int b43_op_config(struct ieee80211_hw *hw, u32 changed)
+static int b43_op_config(struct ieee80211_hw *hw, int radio_idx, u32 changed)
 {
 	struct b43_wl *wl = hw_to_b43_wl(hw);
 	struct b43_wldev *dev = wl->current_dev;
@@ -5073,7 +5073,7 @@ static int b43_op_start(struct ieee80211_hw *hw)
 	 * may hang the system.
 	 */
 	if (!err)
-		b43_op_config(hw, ~0);
+		b43_op_config(hw, -1, ~0);
 
 	return err;
 }
@@ -5248,7 +5248,7 @@ out:
 	}
 
 	/* reload configuration */
-	b43_op_config(wl->hw, ~0);
+	b43_op_config(wl->hw, -1, ~0);
 	if (wl->vif)
 		b43_op_bss_info_changed(wl->hw, wl->vif, &wl->vif->bss_conf, ~0);
 
diff --git a/drivers/net/wireless/broadcom/b43legacy/main.c b/drivers/net/wireless/broadcom/b43legacy/main.c
index 2370a2e6a2e3..aada342e0b80 100644
--- a/drivers/net/wireless/broadcom/b43legacy/main.c
+++ b/drivers/net/wireless/broadcom/b43legacy/main.c
@@ -2662,7 +2662,7 @@ static void b43legacy_set_retry_limits(struct b43legacy_wldev *dev,
 	b43legacy_shm_write16(dev, B43legacy_SHM_WIRELESS, 0x0007, long_retry);
 }
 
-static int b43legacy_op_dev_config(struct ieee80211_hw *hw,
+static int b43legacy_op_dev_config(struct ieee80211_hw *hw, int radio_idx,
 				   u32 changed)
 {
 	struct b43legacy_wl *wl = hw_to_b43legacy_wl(hw);
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
index 5a0b252dfeaf..40a9a8177de6 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
@@ -1637,7 +1637,8 @@ static s32 brcmf_set_retry(struct net_device *ndev, u32 retry, bool l)
 	return err;
 }
 
-static s32 brcmf_cfg80211_set_wiphy_params(struct wiphy *wiphy, u32 changed)
+static s32 brcmf_cfg80211_set_wiphy_params(struct wiphy *wiphy, int radio_idx,
+					   u32 changed)
 {
 	struct brcmf_cfg80211_info *cfg = wiphy_to_cfg(wiphy);
 	struct net_device *ndev = cfg_to_ndev(cfg);
@@ -2645,7 +2646,8 @@ brcmf_cfg80211_disconnect(struct wiphy *wiphy, struct net_device *ndev,
 
 static s32
 brcmf_cfg80211_set_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev,
-			    enum nl80211_tx_power_setting type, s32 mbm)
+			    int radio_idx, enum nl80211_tx_power_setting type,
+			    s32 mbm)
 {
 	struct brcmf_cfg80211_info *cfg = wiphy_to_cfg(wiphy);
 	struct net_device *ndev = cfg_to_ndev(cfg);
@@ -2696,7 +2698,7 @@ done:
 
 static s32
 brcmf_cfg80211_get_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev,
-			    unsigned int link_id, s32 *dbm)
+			    int radio_idx, unsigned int link_id, s32 *dbm)
 {
 	struct brcmf_cfg80211_info *cfg = wiphy_to_cfg(wiphy);
 	struct brcmf_cfg80211_vif *vif = wdev_to_vif(wdev);
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c
index 1c3d29dca424..8ab452cf48c4 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c
@@ -525,7 +525,8 @@ brcms_ops_remove_interface(struct ieee80211_hw *hw, struct ieee80211_vif *vif)
 	spin_unlock_bh(&wl->lock);
 }
 
-static int brcms_ops_config(struct ieee80211_hw *hw, u32 changed)
+static int brcms_ops_config(struct ieee80211_hw *hw, int radio_idx,
+			    u32 changed)
 {
 	struct ieee80211_conf *conf = &hw->conf;
 	struct brcms_info *wl = hw->priv;
diff --git a/drivers/net/wireless/intel/iwlegacy/common.c b/drivers/net/wireless/intel/iwlegacy/common.c
index 9a86688aea67..b7bd3ec4cc50 100644
--- a/drivers/net/wireless/intel/iwlegacy/common.c
+++ b/drivers/net/wireless/intel/iwlegacy/common.c
@@ -4990,7 +4990,7 @@ il_update_qos(struct il_priv *il)
  * il_mac_config - mac80211 config callback
  */
 int
-il_mac_config(struct ieee80211_hw *hw, u32 changed)
+il_mac_config(struct ieee80211_hw *hw, int radio_idx, u32 changed)
 {
 	struct il_priv *il = hw->priv;
 	const struct il_channel_info *ch_info;
diff --git a/drivers/net/wireless/intel/iwlegacy/common.h b/drivers/net/wireless/intel/iwlegacy/common.h
index 52610f5e57a3..4c9836ab11dd 100644
--- a/drivers/net/wireless/intel/iwlegacy/common.h
+++ b/drivers/net/wireless/intel/iwlegacy/common.h
@@ -1956,7 +1956,7 @@ il_get_hw_mode(struct il_priv *il, enum nl80211_band band)
 }
 
 /* mac80211 handlers */
-int il_mac_config(struct ieee80211_hw *hw, u32 changed);
+int il_mac_config(struct ieee80211_hw *hw, int radio_idx, u32 changed);
 void il_mac_reset_tsf(struct ieee80211_hw *hw, struct ieee80211_vif *vif);
 void il_mac_bss_info_changed(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 			     struct ieee80211_bss_conf *bss_conf, u64 changes);
diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/agn.h b/drivers/net/wireless/intel/iwlwifi/dvm/agn.h
index 1ebc7effcc2a..b39bf401567f 100644
--- a/drivers/net/wireless/intel/iwlwifi/dvm/agn.h
+++ b/drivers/net/wireless/intel/iwlwifi/dvm/agn.h
@@ -88,7 +88,7 @@ void iwl_connection_init_rx_config(struct iwl_priv *priv,
 int iwlagn_set_pan_params(struct iwl_priv *priv);
 int iwlagn_commit_rxon(struct iwl_priv *priv, struct iwl_rxon_context *ctx);
 void iwlagn_set_rxon_chain(struct iwl_priv *priv, struct iwl_rxon_context *ctx);
-int iwlagn_mac_config(struct ieee80211_hw *hw, u32 changed);
+int iwlagn_mac_config(struct ieee80211_hw *hw, int radio_idx, u32 changed);
 void iwlagn_bss_info_changed(struct ieee80211_hw *hw,
 			     struct ieee80211_vif *vif,
 			     struct ieee80211_bss_conf *bss_conf,
diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/rxon.c b/drivers/net/wireless/intel/iwlwifi/dvm/rxon.c
index 2d3c1627f283..e08e44cae434 100644
--- a/drivers/net/wireless/intel/iwlwifi/dvm/rxon.c
+++ b/drivers/net/wireless/intel/iwlwifi/dvm/rxon.c
@@ -1149,7 +1149,7 @@ void iwlagn_config_ht40(struct ieee80211_conf *conf,
 	}
 }
 
-int iwlagn_mac_config(struct ieee80211_hw *hw, u32 changed)
+int iwlagn_mac_config(struct ieee80211_hw *hw, int radio_idx, u32 changed)
 {
 	struct iwl_priv *priv = IWL_MAC80211_GET_DVM(hw);
 	struct iwl_rxon_context *ctx;
diff --git a/drivers/net/wireless/intel/iwlwifi/mld/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mld/mac80211.c
index 4ba050397632..76e7e3fa2d13 100644
--- a/drivers/net/wireless/intel/iwlwifi/mld/mac80211.c
+++ b/drivers/net/wireless/intel/iwlwifi/mld/mac80211.c
@@ -574,7 +574,8 @@ void iwl_mld_mac80211_stop(struct ieee80211_hw *hw, bool suspend)
 }
 
 static
-int iwl_mld_mac80211_config(struct ieee80211_hw *hw, u32 changed)
+int iwl_mld_mac80211_config(struct ieee80211_hw *hw, int radio_idx,
+			    u32 changed)
 {
 	return 0;
 }
@@ -1102,7 +1103,8 @@ void iwl_mld_unassign_vif_chanctx(struct ieee80211_hw *hw,
 }
 
 static
-int iwl_mld_mac80211_set_rts_threshold(struct ieee80211_hw *hw, u32 value)
+int iwl_mld_mac80211_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx,
+				       u32 value)
 {
 	return 0;
 }
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
index 956b491ae5a4..619d822efa5b 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c
@@ -298,7 +298,8 @@ static const struct wiphy_iftype_ext_capab add_iftypes_ext_capa[] = {
 	},
 };
 
-int iwl_mvm_op_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant)
+int iwl_mvm_op_get_antenna(struct ieee80211_hw *hw, int radio_idx,
+			   u32 *tx_ant, u32 *rx_ant)
 {
 	struct iwl_mvm *mvm = IWL_MAC80211_GET_MVM(hw);
 	*tx_ant = iwl_mvm_get_valid_tx_ant(mvm);
@@ -306,7 +307,8 @@ int iwl_mvm_op_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant)
 	return 0;
 }
 
-int iwl_mvm_op_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant)
+int iwl_mvm_op_set_antenna(struct ieee80211_hw *hw, int radio_idx, u32 tx_ant,
+			   u32 rx_ant)
 {
 	struct iwl_mvm *mvm = IWL_MAC80211_GET_MVM(hw);
 
@@ -4249,7 +4251,8 @@ int iwl_mvm_mac_sta_state_common(struct ieee80211_hw *hw,
 	return ret;
 }
 
-int iwl_mvm_mac_set_rts_threshold(struct ieee80211_hw *hw, u32 value)
+int iwl_mvm_mac_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx,
+				  u32 value)
 {
 	struct iwl_mvm *mvm = IWL_MAC80211_GET_MVM(hw);
 
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h
index a4f412e750d0..5c8eaf1eacff 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h
@@ -2866,13 +2866,16 @@ void iwl_mvm_mac_wake_tx_queue(struct ieee80211_hw *hw,
 int iwl_mvm_mac_ampdu_action(struct ieee80211_hw *hw,
 			     struct ieee80211_vif *vif,
 			     struct ieee80211_ampdu_params *params);
-int iwl_mvm_op_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant);
-int iwl_mvm_op_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant);
+int iwl_mvm_op_get_antenna(struct ieee80211_hw *hw, int radio_idx, u32 *tx_ant,
+			   u32 *rx_ant);
+int iwl_mvm_op_set_antenna(struct ieee80211_hw *hw, int radio_idx, u32 tx_ant,
+			   u32 rx_ant);
 int iwl_mvm_mac_start(struct ieee80211_hw *hw);
 void iwl_mvm_mac_reconfig_complete(struct ieee80211_hw *hw,
 				   enum ieee80211_reconfig_type reconfig_type);
 void iwl_mvm_mac_stop(struct ieee80211_hw *hw, bool suspend);
-static inline int iwl_mvm_mac_config(struct ieee80211_hw *hw, u32 changed)
+static inline int iwl_mvm_mac_config(struct ieee80211_hw *hw, int radio_idx,
+				     u32 changed)
 {
 	return 0;
 }
@@ -2905,7 +2908,8 @@ iwl_mvm_mac_release_buffered_frames(struct ieee80211_hw *hw,
 				    int num_frames,
 				    enum ieee80211_frame_release_type reason,
 				    bool more_data);
-int iwl_mvm_mac_set_rts_threshold(struct ieee80211_hw *hw, u32 value);
+int iwl_mvm_mac_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx,
+				  u32 value);
 void iwl_mvm_sta_rc_update(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 			   struct ieee80211_link_sta *link_sta, u32 changed);
 void iwl_mvm_mac_mgd_prepare_tx(struct ieee80211_hw *hw,
diff --git a/drivers/net/wireless/intersil/p54/main.c b/drivers/net/wireless/intersil/p54/main.c
index 42111bb53f58..2ec3655f1a9c 100644
--- a/drivers/net/wireless/intersil/p54/main.c
+++ b/drivers/net/wireless/intersil/p54/main.c
@@ -313,7 +313,7 @@ static void p54_reset_stats(struct p54_common *priv)
 	priv->survey_raw.tx = 0;
 }
 
-static int p54_config(struct ieee80211_hw *dev, u32 changed)
+static int p54_config(struct ieee80211_hw *dev, int radio_idx, u32 changed)
 {
 	int ret = 0;
 	struct p54_common *priv = dev->priv;
@@ -692,6 +692,7 @@ static void p54_flush(struct ieee80211_hw *dev, struct ieee80211_vif *vif,
 }
 
 static void p54_set_coverage_class(struct ieee80211_hw *dev,
+				   int radio_idx,
 				   s16 coverage_class)
 {
 	struct p54_common *priv = dev->priv;
diff --git a/drivers/net/wireless/marvell/libertas_tf/main.c b/drivers/net/wireless/marvell/libertas_tf/main.c
index 50c0f6179e2d..d1067874428f 100644
--- a/drivers/net/wireless/marvell/libertas_tf/main.c
+++ b/drivers/net/wireless/marvell/libertas_tf/main.c
@@ -337,7 +337,7 @@ static void lbtf_op_remove_interface(struct ieee80211_hw *hw,
 	lbtf_deb_leave(LBTF_DEB_MACOPS);
 }
 
-static int lbtf_op_config(struct ieee80211_hw *hw, u32 changed)
+static int lbtf_op_config(struct ieee80211_hw *hw, int radio_idx, u32 changed)
 {
 	struct lbtf_private *priv = hw->priv;
 	struct ieee80211_conf *conf = &hw->conf;
diff --git a/drivers/net/wireless/marvell/mwifiex/cfg80211.c b/drivers/net/wireless/marvell/mwifiex/cfg80211.c
index 60c12328c2f3..286378770e9e 100644
--- a/drivers/net/wireless/marvell/mwifiex/cfg80211.c
+++ b/drivers/net/wireless/marvell/mwifiex/cfg80211.c
@@ -375,6 +375,7 @@ mwifiex_cfg80211_cancel_remain_on_channel(struct wiphy *wiphy,
 static int
 mwifiex_cfg80211_set_tx_power(struct wiphy *wiphy,
 			      struct wireless_dev *wdev,
+			      int radio_idx,
 			      enum nl80211_tx_power_setting type,
 			      int mbm)
 {
@@ -410,6 +411,7 @@ mwifiex_cfg80211_set_tx_power(struct wiphy *wiphy,
 static int
 mwifiex_cfg80211_get_tx_power(struct wiphy *wiphy,
 			      struct wireless_dev *wdev,
+			      int radio_idx,
 			      unsigned int link_id, int *dbm)
 {
 	struct mwifiex_adapter *adapter = mwifiex_cfg80211_get_adapter(wiphy);
@@ -737,7 +739,8 @@ mwifiex_set_rts(struct mwifiex_private *priv, u32 rts_thr)
  * Fragmentation threshold of the driver.
  */
 static int
-mwifiex_cfg80211_set_wiphy_params(struct wiphy *wiphy, u32 changed)
+mwifiex_cfg80211_set_wiphy_params(struct wiphy *wiphy, int radio_idx,
+				  u32 changed)
 {
 	struct mwifiex_adapter *adapter = mwifiex_cfg80211_get_adapter(wiphy);
 	struct mwifiex_private *priv;
@@ -1939,7 +1942,8 @@ mwifiex_cfg80211_del_station(struct wiphy *wiphy, struct net_device *dev,
 }
 
 static int
-mwifiex_cfg80211_set_antenna(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant)
+mwifiex_cfg80211_set_antenna(struct wiphy *wiphy, int radio_idx, u32 tx_ant,
+			     u32 rx_ant)
 {
 	struct mwifiex_adapter *adapter = mwifiex_cfg80211_get_adapter(wiphy);
 	struct mwifiex_private *priv = mwifiex_get_priv(adapter,
@@ -2002,7 +2006,8 @@ mwifiex_cfg80211_set_antenna(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant)
 }
 
 static int
-mwifiex_cfg80211_get_antenna(struct wiphy *wiphy, u32 *tx_ant, u32 *rx_ant)
+mwifiex_cfg80211_get_antenna(struct wiphy *wiphy, int radio_idx, u32 *tx_ant,
+			     u32 *rx_ant)
 {
 	struct mwifiex_adapter *adapter = mwifiex_cfg80211_get_adapter(wiphy);
 	struct mwifiex_private *priv = mwifiex_get_priv(adapter,
diff --git a/drivers/net/wireless/marvell/mwl8k.c b/drivers/net/wireless/marvell/mwl8k.c
index bab9ef37a1ab..bc34a025acd6 100644
--- a/drivers/net/wireless/marvell/mwl8k.c
+++ b/drivers/net/wireless/marvell/mwl8k.c
@@ -3369,7 +3369,8 @@ struct mwl8k_cmd_set_rts_threshold {
 } __packed;
 
 static int
-mwl8k_cmd_set_rts_threshold(struct ieee80211_hw *hw, int rts_thresh)
+mwl8k_cmd_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx,
+			    int rts_thresh)
 {
 	struct mwl8k_cmd_set_rts_threshold *cmd;
 	int rc;
@@ -4955,7 +4956,7 @@ fail:
 	wiphy_err(hw->wiphy, "Firmware restart failed\n");
 }
 
-static int mwl8k_config(struct ieee80211_hw *hw, u32 changed)
+static int mwl8k_config(struct ieee80211_hw *hw, int radio_idx, u32 changed)
 {
 	struct ieee80211_conf *conf = &hw->conf;
 	struct mwl8k_priv *priv = hw->priv;
@@ -5321,9 +5322,10 @@ static void mwl8k_configure_filter(struct ieee80211_hw *hw,
 	mwl8k_fw_unlock(hw);
 }
 
-static int mwl8k_set_rts_threshold(struct ieee80211_hw *hw, u32 value)
+static int mwl8k_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx,
+				   u32 value)
 {
-	return mwl8k_cmd_set_rts_threshold(hw, value);
+	return mwl8k_cmd_set_rts_threshold(hw, radio_idx, value);
 }
 
 static int mwl8k_sta_remove(struct ieee80211_hw *hw,
@@ -6056,7 +6058,7 @@ static int mwl8k_reload_firmware(struct ieee80211_hw *hw, char *fw_image)
 	if (rc)
 		goto fail;
 
-	rc = mwl8k_config(hw, ~0);
+	rc = mwl8k_config(hw, -1, ~0);
 	if (rc)
 		goto fail;
 
diff --git a/drivers/net/wireless/mediatek/mt76/mac80211.c b/drivers/net/wireless/mediatek/mt76/mac80211.c
index 45c8db939d55..3afe4c4cd7bb 100644
--- a/drivers/net/wireless/mediatek/mt76/mac80211.c
+++ b/drivers/net/wireless/mediatek/mt76/mac80211.c
@@ -1892,7 +1892,8 @@ void mt76_sw_scan_complete(struct ieee80211_hw *hw, struct ieee80211_vif *vif)
 }
 EXPORT_SYMBOL_GPL(mt76_sw_scan_complete);
 
-int mt76_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant)
+int mt76_get_antenna(struct ieee80211_hw *hw, int radio_idx, u32 *tx_ant,
+		     u32 *rx_ant)
 {
 	struct mt76_phy *phy = hw->priv;
 	struct mt76_dev *dev = phy->dev;
diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h
index 5f8d81cda6cd..14927a92f9d1 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76.h
+++ b/drivers/net/wireless/mediatek/mt76/mt76.h
@@ -1513,7 +1513,8 @@ int mt76_get_sar_power(struct mt76_phy *phy,
 void mt76_csa_check(struct mt76_dev *dev);
 void mt76_csa_finish(struct mt76_dev *dev);
 
-int mt76_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant);
+int mt76_get_antenna(struct ieee80211_hw *hw, int radio_idx, u32 *tx_ant,
+		     u32 *rx_ant);
 int mt76_set_tim(struct ieee80211_hw *hw, struct ieee80211_sta *sta, bool set);
 void mt76_insert_ccmp_hdr(struct sk_buff *skb, u8 key_id);
 int mt76_get_rate(struct mt76_dev *dev,
diff --git a/drivers/net/wireless/mediatek/mt76/mt7603/main.c b/drivers/net/wireless/mediatek/mt76/mt7603/main.c
index 3e8b1ec76169..0d7c84941cd0 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7603/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7603/main.c
@@ -216,7 +216,7 @@ static int mt7603_set_sar_specs(struct ieee80211_hw *hw,
 }
 
 static int
-mt7603_config(struct ieee80211_hw *hw, u32 changed)
+mt7603_config(struct ieee80211_hw *hw, int radio_idx, u32 changed)
 {
 	struct mt7603_dev *dev = hw->priv;
 	int ret = 0;
@@ -657,7 +657,8 @@ mt7603_sta_rate_tbl_update(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 }
 
 static void
-mt7603_set_coverage_class(struct ieee80211_hw *hw, s16 coverage_class)
+mt7603_set_coverage_class(struct ieee80211_hw *hw, int radio_idx,
+			  s16 coverage_class)
 {
 	struct mt7603_dev *dev = hw->priv;
 
diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/main.c b/drivers/net/wireless/mediatek/mt76/mt7615/main.c
index 8a37fb37f77d..15fe155ac3f3 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7615/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7615/main.c
@@ -420,7 +420,7 @@ static int mt7615_set_sar_specs(struct ieee80211_hw *hw,
 	return mt76_update_channel(phy->mt76);
 }
 
-static int mt7615_config(struct ieee80211_hw *hw, u32 changed)
+static int mt7615_config(struct ieee80211_hw *hw, int radio_idx, u32 changed)
 {
 	struct mt7615_dev *dev = mt7615_hw_dev(hw);
 	struct mt7615_phy *phy = mt7615_hw_phy(hw);
@@ -784,7 +784,8 @@ static void mt7615_tx(struct ieee80211_hw *hw,
 	mt76_connac_pm_queue_skb(hw, &dev->pm, wcid, skb);
 }
 
-static int mt7615_set_rts_threshold(struct ieee80211_hw *hw, u32 val)
+static int mt7615_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx,
+				    u32 val)
 {
 	struct mt7615_dev *dev = mt7615_hw_dev(hw);
 	struct mt7615_phy *phy = mt7615_hw_phy(hw);
@@ -972,7 +973,8 @@ mt7615_offset_tsf(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 }
 
 static void
-mt7615_set_coverage_class(struct ieee80211_hw *hw, s16 coverage_class)
+mt7615_set_coverage_class(struct ieee80211_hw *hw, int radio_idx,
+			  s16 coverage_class)
 {
 	struct mt7615_phy *phy = mt7615_hw_phy(hw);
 	struct mt7615_dev *dev = phy->dev;
@@ -984,7 +986,8 @@ mt7615_set_coverage_class(struct ieee80211_hw *hw, s16 coverage_class)
 }
 
 static int
-mt7615_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant)
+mt7615_set_antenna(struct ieee80211_hw *hw, int radio_idx,
+		   u32 tx_ant, u32 rx_ant)
 {
 	struct mt7615_dev *dev = mt7615_hw_dev(hw);
 	struct mt7615_phy *phy = mt7615_hw_phy(hw);
diff --git a/drivers/net/wireless/mediatek/mt76/mt76x0/main.c b/drivers/net/wireless/mediatek/mt76/mt76x0/main.c
index 4aa2dcedc874..a5c40d350612 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x0/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x0/main.c
@@ -57,7 +57,7 @@ out:
 }
 EXPORT_SYMBOL_GPL(mt76x0_set_sar_specs);
 
-int mt76x0_config(struct ieee80211_hw *hw, u32 changed)
+int mt76x0_config(struct ieee80211_hw *hw, int radio_idx, u32 changed)
 {
 	struct mt76x02_dev *dev = hw->priv;
 
diff --git a/drivers/net/wireless/mediatek/mt76/mt76x0/mt76x0.h b/drivers/net/wireless/mediatek/mt76/mt76x0/mt76x0.h
index 50f755344968..e5bc14d4c712 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x0/mt76x0.h
+++ b/drivers/net/wireless/mediatek/mt76/mt76x0/mt76x0.h
@@ -48,7 +48,7 @@ void mt76x0_chip_onoff(struct mt76x02_dev *dev, bool enable, bool reset);
 
 void mt76x0_mac_stop(struct mt76x02_dev *dev);
 
-int mt76x0_config(struct ieee80211_hw *hw, u32 changed);
+int mt76x0_config(struct ieee80211_hw *hw, int radio_idx, u32 changed);
 int mt76x0_set_channel(struct mt76_phy *mphy);
 int mt76x0_set_sar_specs(struct ieee80211_hw *hw,
 			 const struct cfg80211_sar_specs *sar);
diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02.h b/drivers/net/wireless/mediatek/mt76/mt76x02.h
index 4cd63bacd742..2094c7d2af81 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x02.h
+++ b/drivers/net/wireless/mediatek/mt76/mt76x02.h
@@ -183,8 +183,8 @@ void mt76x02_wdt_work(struct work_struct *work);
 void mt76x02_tx_set_txpwr_auto(struct mt76x02_dev *dev, s8 txpwr);
 void mt76x02_set_tx_ackto(struct mt76x02_dev *dev);
 void mt76x02_set_coverage_class(struct ieee80211_hw *hw,
-				s16 coverage_class);
-int mt76x02_set_rts_threshold(struct ieee80211_hw *hw, u32 val);
+				int radio_idx, s16 coverage_class);
+int mt76x02_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, u32 val);
 void mt76x02_remove_hdr_pad(struct sk_buff *skb, int len);
 bool mt76x02_tx_status_data(struct mt76_dev *mdev, u8 *update);
 void mt76x02_queue_rx_skb(struct mt76_dev *mdev, enum mt76_rxq_id q,
diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02_util.c b/drivers/net/wireless/mediatek/mt76/mt76x02_util.c
index 4fb30589fa7a..7dfcb20c692c 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x02_util.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x02_util.c
@@ -548,7 +548,7 @@ void mt76x02_set_tx_ackto(struct mt76x02_dev *dev)
 EXPORT_SYMBOL_GPL(mt76x02_set_tx_ackto);
 
 void mt76x02_set_coverage_class(struct ieee80211_hw *hw,
-				s16 coverage_class)
+				int radio_idx, s16 coverage_class)
 {
 	struct mt76x02_dev *dev = hw->priv;
 
@@ -559,7 +559,7 @@ void mt76x02_set_coverage_class(struct ieee80211_hw *hw,
 }
 EXPORT_SYMBOL_GPL(mt76x02_set_coverage_class);
 
-int mt76x02_set_rts_threshold(struct ieee80211_hw *hw, u32 val)
+int mt76x02_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, u32 val)
 {
 	struct mt76x02_dev *dev = hw->priv;
 
diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2/pci_main.c b/drivers/net/wireless/mediatek/mt76/mt76x2/pci_main.c
index eb70130d2711..c5dfb06d81e8 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x2/pci_main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x2/pci_main.c
@@ -54,7 +54,7 @@ int mt76x2e_set_channel(struct mt76_phy *phy)
 }
 
 static int
-mt76x2_config(struct ieee80211_hw *hw, u32 changed)
+mt76x2_config(struct ieee80211_hw *hw, int radio_idx, u32 changed)
 {
 	struct mt76x02_dev *dev = hw->priv;
 
@@ -99,8 +99,8 @@ mt76x2_flush(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 {
 }
 
-static int mt76x2_set_antenna(struct ieee80211_hw *hw, u32 tx_ant,
-			      u32 rx_ant)
+static int mt76x2_set_antenna(struct ieee80211_hw *hw, int radio_idx,
+			      u32 tx_ant, u32 rx_ant)
 {
 	struct mt76x02_dev *dev = hw->priv;
 
diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2/usb_main.c b/drivers/net/wireless/mediatek/mt76/mt76x2/usb_main.c
index 83e7061b10e2..6671c53faf9f 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76x2/usb_main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt76x2/usb_main.c
@@ -50,7 +50,7 @@ int mt76x2u_set_channel(struct mt76_phy *mphy)
 }
 
 static int
-mt76x2u_config(struct ieee80211_hw *hw, u32 changed)
+mt76x2u_config(struct ieee80211_hw *hw, int radio_idx, u32 changed)
 {
 	struct mt76x02_dev *dev = hw->priv;
 	int err = 0;
diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/main.c b/drivers/net/wireless/mediatek/mt76/mt7915/main.c
index 3aa31c5cefa6..fe0639c14bf9 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7915/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7915/main.c
@@ -449,7 +449,8 @@ out:
 	return err;
 }
 
-static int mt7915_config(struct ieee80211_hw *hw, u32 changed)
+static int mt7915_config(struct ieee80211_hw *hw, int radio_idx,
+			 u32 changed)
 {
 	struct mt7915_dev *dev = mt7915_hw_dev(hw);
 	struct mt7915_phy *phy = mt7915_hw_phy(hw);
@@ -906,7 +907,8 @@ static void mt7915_tx(struct ieee80211_hw *hw,
 	mt76_tx(mphy, control->sta, wcid, skb);
 }
 
-static int mt7915_set_rts_threshold(struct ieee80211_hw *hw, u32 val)
+static int mt7915_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx,
+				    u32 val)
 {
 	struct mt7915_dev *dev = mt7915_hw_dev(hw);
 	struct mt7915_phy *phy = mt7915_hw_phy(hw);
@@ -1102,7 +1104,8 @@ mt7915_offset_tsf(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 }
 
 static void
-mt7915_set_coverage_class(struct ieee80211_hw *hw, s16 coverage_class)
+mt7915_set_coverage_class(struct ieee80211_hw *hw, int radio_idx,
+			  s16 coverage_class)
 {
 	struct mt7915_phy *phy = mt7915_hw_phy(hw);
 	struct mt7915_dev *dev = phy->dev;
@@ -1114,7 +1117,7 @@ mt7915_set_coverage_class(struct ieee80211_hw *hw, s16 coverage_class)
 }
 
 static int
-mt7915_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant)
+mt7915_set_antenna(struct ieee80211_hw *hw, int radio_idx, u32 tx_ant, u32 rx_ant)
 {
 	struct mt7915_dev *dev = mt7915_hw_dev(hw);
 	struct mt7915_phy *phy = mt7915_hw_phy(hw);
@@ -1655,7 +1658,7 @@ mt7915_twt_teardown_request(struct ieee80211_hw *hw,
 }
 
 static int
-mt7915_set_frag_threshold(struct ieee80211_hw *hw, u32 val)
+mt7915_set_frag_threshold(struct ieee80211_hw *hw, int radio_idx, u32 val)
 {
 	return 0;
 }
diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
index 1fffa43379b2..1678204296d7 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c
@@ -624,7 +624,7 @@ void mt7921_set_runtime_pm(struct mt792x_dev *dev)
 	mt76_connac_mcu_set_deep_sleep(&dev->mt76, pm->ds_enable);
 }
 
-static int mt7921_config(struct ieee80211_hw *hw, u32 changed)
+static int mt7921_config(struct ieee80211_hw *hw, int radio_idx, u32 changed)
 {
 	struct mt792x_dev *dev = mt792x_hw_dev(hw);
 	struct mt792x_phy *phy = mt792x_hw_phy(hw);
@@ -907,7 +907,8 @@ void mt7921_mac_sta_remove(struct mt76_dev *mdev, struct ieee80211_vif *vif,
 }
 EXPORT_SYMBOL_GPL(mt7921_mac_sta_remove);
 
-static int mt7921_set_rts_threshold(struct ieee80211_hw *hw, u32 val)
+static int mt7921_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx,
+				    u32 val)
 {
 	struct mt792x_dev *dev = mt792x_hw_dev(hw);
 
@@ -1088,7 +1089,8 @@ mt7921_stop_sched_scan(struct ieee80211_hw *hw, struct ieee80211_vif *vif)
 }
 
 static int
-mt7921_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant)
+mt7921_set_antenna(struct ieee80211_hw *hw, int radio_idx,
+		   u32 tx_ant, u32 rx_ant)
 {
 	struct mt792x_dev *dev = mt792x_hw_dev(hw);
 	struct mt792x_phy *phy = mt792x_hw_phy(hw);
diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/main.c b/drivers/net/wireless/mediatek/mt76/mt7925/main.c
index 94b0099dcd41..ed7cd75aa6bc 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7925/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7925/main.c
@@ -757,7 +757,7 @@ void mt7925_set_runtime_pm(struct mt792x_dev *dev)
 	mt7925_mcu_set_deep_sleep(dev, pm->ds_enable);
 }
 
-static int mt7925_config(struct ieee80211_hw *hw, u32 changed)
+static int mt7925_config(struct ieee80211_hw *hw, int radio_idx, u32 changed)
 {
 	struct mt792x_dev *dev = mt792x_hw_dev(hw);
 	int ret = 0;
@@ -1265,7 +1265,8 @@ void mt7925_mac_sta_remove(struct mt76_dev *mdev, struct ieee80211_vif *vif,
 }
 EXPORT_SYMBOL_GPL(mt7925_mac_sta_remove);
 
-static int mt7925_set_rts_threshold(struct ieee80211_hw *hw, u32 val)
+static int mt7925_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx,
+				    u32 val)
 {
 	struct mt792x_dev *dev = mt792x_hw_dev(hw);
 
@@ -1507,7 +1508,8 @@ mt7925_stop_sched_scan(struct ieee80211_hw *hw, struct ieee80211_vif *vif)
 }
 
 static int
-mt7925_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant)
+mt7925_set_antenna(struct ieee80211_hw *hw, int radio_idx,
+		   u32 tx_ant, u32 rx_ant)
 {
 	struct mt792x_dev *dev = mt792x_hw_dev(hw);
 	struct mt792x_phy *phy = mt792x_hw_phy(hw);
diff --git a/drivers/net/wireless/mediatek/mt76/mt792x.h b/drivers/net/wireless/mediatek/mt76/mt792x.h
index e0359d431eca..443d397d9961 100644
--- a/drivers/net/wireless/mediatek/mt76/mt792x.h
+++ b/drivers/net/wireless/mediatek/mt76/mt792x.h
@@ -412,7 +412,8 @@ void mt792x_sta_statistics(struct ieee80211_hw *hw,
 			   struct ieee80211_vif *vif,
 			   struct ieee80211_sta *sta,
 			   struct station_info *sinfo);
-void mt792x_set_coverage_class(struct ieee80211_hw *hw, s16 coverage_class);
+void mt792x_set_coverage_class(struct ieee80211_hw *hw, int radio_idx,
+			       s16 coverage_class);
 void mt792x_dma_cleanup(struct mt792x_dev *dev);
 int mt792x_dma_enable(struct mt792x_dev *dev);
 int mt792x_wpdma_reset(struct mt792x_dev *dev, bool force);
diff --git a/drivers/net/wireless/mediatek/mt76/mt792x_core.c b/drivers/net/wireless/mediatek/mt76/mt792x_core.c
index a50c1723ca29..43a7ac0f718e 100644
--- a/drivers/net/wireless/mediatek/mt76/mt792x_core.c
+++ b/drivers/net/wireless/mediatek/mt76/mt792x_core.c
@@ -579,7 +579,8 @@ void mt792x_sta_statistics(struct ieee80211_hw *hw,
 }
 EXPORT_SYMBOL_GPL(mt792x_sta_statistics);
 
-void mt792x_set_coverage_class(struct ieee80211_hw *hw, s16 coverage_class)
+void mt792x_set_coverage_class(struct ieee80211_hw *hw, int radio_idx,
+			       s16 coverage_class)
 {
 	struct mt792x_phy *phy = mt792x_hw_phy(hw);
 	struct mt792x_dev *dev = phy->dev;
diff --git a/drivers/net/wireless/mediatek/mt76/mt7996/main.c b/drivers/net/wireless/mediatek/mt76/mt7996/main.c
index 78ae9f5cb176..5283aee619a9 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7996/main.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7996/main.c
@@ -591,7 +591,7 @@ static int mt7996_set_key(struct ieee80211_hw *hw, enum set_key_cmd cmd,
 	return err;
 }
 
-static int mt7996_config(struct ieee80211_hw *hw, u32 changed)
+static int mt7996_config(struct ieee80211_hw *hw, int radio_idx, u32 changed)
 {
 	return 0;
 }
@@ -1251,7 +1251,8 @@ unlock:
 	rcu_read_unlock();
 }
 
-static int mt7996_set_rts_threshold(struct ieee80211_hw *hw, u32 val)
+static int mt7996_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx,
+				    u32 val)
 {
 	struct mt7996_dev *dev = mt7996_hw_dev(hw);
 	int i, ret = 0;
@@ -1491,7 +1492,8 @@ unlock:
 }
 
 static void
-mt7996_set_coverage_class(struct ieee80211_hw *hw, s16 coverage_class)
+mt7996_set_coverage_class(struct ieee80211_hw *hw, int radio_idx,
+			  s16 coverage_class)
 {
 	struct mt7996_dev *dev = mt7996_hw_dev(hw);
 	struct mt7996_phy *phy;
@@ -1505,7 +1507,8 @@ mt7996_set_coverage_class(struct ieee80211_hw *hw, s16 coverage_class)
 }
 
 static int
-mt7996_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant)
+mt7996_set_antenna(struct ieee80211_hw *hw, int radio_idx,
+		   u32 tx_ant, u32 rx_ant)
 {
 	struct mt7996_dev *dev = mt7996_hw_dev(hw);
 	int i;
diff --git a/drivers/net/wireless/mediatek/mt7601u/main.c b/drivers/net/wireless/mediatek/mt7601u/main.c
index 7570c6ceecea..05ba43e1985c 100644
--- a/drivers/net/wireless/mediatek/mt7601u/main.c
+++ b/drivers/net/wireless/mediatek/mt7601u/main.c
@@ -78,7 +78,7 @@ static void mt7601u_remove_interface(struct ieee80211_hw *hw,
 	dev->wcid_mask[wcid / BITS_PER_LONG] &= ~BIT(wcid % BITS_PER_LONG);
 }
 
-static int mt7601u_config(struct ieee80211_hw *hw, u32 changed)
+static int mt7601u_config(struct ieee80211_hw *hw, int radio_idx, u32 changed)
 {
 	struct mt7601u_dev *dev = hw->priv;
 	int ret = 0;
@@ -334,7 +334,8 @@ mt7601u_set_key(struct ieee80211_hw *hw, enum set_key_cmd cmd,
 	return mt76_mac_wcid_set_key(dev, msta->wcid.idx, key);
 }
 
-static int mt7601u_set_rts_threshold(struct ieee80211_hw *hw, u32 value)
+static int mt7601u_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx,
+				     u32 value)
 {
 	struct mt7601u_dev *dev = hw->priv;
 
diff --git a/drivers/net/wireless/microchip/wilc1000/cfg80211.c b/drivers/net/wireless/microchip/wilc1000/cfg80211.c
index e7aa0f991923..a395829ebadf 100644
--- a/drivers/net/wireless/microchip/wilc1000/cfg80211.c
+++ b/drivers/net/wireless/microchip/wilc1000/cfg80211.c
@@ -800,7 +800,7 @@ static int change_bss(struct wiphy *wiphy, struct net_device *dev,
 	return 0;
 }
 
-static int set_wiphy_params(struct wiphy *wiphy, u32 changed)
+static int set_wiphy_params(struct wiphy *wiphy, int radio_idx, u32 changed)
 {
 	int ret = -EINVAL;
 	struct cfg_param_attr cfg_param_val;
@@ -1637,7 +1637,8 @@ static void wilc_set_wakeup(struct wiphy *wiphy, bool enabled)
 }
 
 static int set_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev,
-			enum nl80211_tx_power_setting type, int mbm)
+			int radio_idx, enum nl80211_tx_power_setting type,
+			int mbm)
 {
 	int ret;
 	int srcu_idx;
@@ -1669,7 +1670,7 @@ static int set_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev,
 }
 
 static int get_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev,
-			unsigned int link_id, int *dbm)
+			int radio_idx, unsigned int link_id, int *dbm)
 {
 	int ret;
 	struct wilc_vif *vif = netdev_priv(wdev->netdev);
diff --git a/drivers/net/wireless/purelifi/plfxlc/mac.c b/drivers/net/wireless/purelifi/plfxlc/mac.c
index 82d1bf7edba2..d375ad60167f 100644
--- a/drivers/net/wireless/purelifi/plfxlc/mac.c
+++ b/drivers/net/wireless/purelifi/plfxlc/mac.c
@@ -531,7 +531,7 @@ static void plfxlc_op_remove_interface(struct ieee80211_hw *hw,
 	mac->vif = NULL;
 }
 
-static int plfxlc_op_config(struct ieee80211_hw *hw, u32 changed)
+static int plfxlc_op_config(struct ieee80211_hw *hw, int radio_idx, u32 changed)
 {
 	return 0;
 }
@@ -677,7 +677,8 @@ static void plfxlc_get_et_stats(struct ieee80211_hw *hw,
 	data[1] = mac->crc_errors;
 }
 
-static int plfxlc_set_rts_threshold(struct ieee80211_hw *hw, u32 value)
+static int plfxlc_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx,
+				    u32 value)
 {
 	return 0;
 }
diff --git a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
index 0b2282528342..f1188368e66b 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
+++ b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
@@ -370,7 +370,8 @@ static int qtnf_stop_ap(struct wiphy *wiphy, struct net_device *dev,
 	return ret;
 }
 
-static int qtnf_set_wiphy_params(struct wiphy *wiphy, u32 changed)
+static int qtnf_set_wiphy_params(struct wiphy *wiphy, int radio_idx,
+				 u32 changed)
 {
 	struct qtnf_wmac *mac = wiphy_priv(wiphy);
 	struct qtnf_vif *vif;
@@ -881,7 +882,7 @@ static int qtnf_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev,
 }
 
 static int qtnf_get_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev,
-			     unsigned int link_id, int *dbm)
+			     int radio_idx, unsigned int link_id, int *dbm)
 {
 	struct qtnf_vif *vif = qtnf_netdev_get_priv(wdev->netdev);
 	int ret;
@@ -894,7 +895,8 @@ static int qtnf_get_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev,
 }
 
 static int qtnf_set_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev,
-			     enum nl80211_tx_power_setting type, int mbm)
+			     int radio_idx, enum nl80211_tx_power_setting type,
+			     int mbm)
 {
 	struct qtnf_vif *vif;
 	int ret;
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2800lib.c b/drivers/net/wireless/ralink/rt2x00/rt2800lib.c
index b7ea606bda08..4b5a7c9b6499 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2800lib.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt2800lib.c
@@ -12100,7 +12100,7 @@ void rt2800_get_key_seq(struct ieee80211_hw *hw,
 }
 EXPORT_SYMBOL_GPL(rt2800_get_key_seq);
 
-int rt2800_set_rts_threshold(struct ieee80211_hw *hw, u32 value)
+int rt2800_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, u32 value)
 {
 	struct rt2x00_dev *rt2x00dev = hw->priv;
 	u32 reg;
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2800lib.h b/drivers/net/wireless/ralink/rt2x00/rt2800lib.h
index 194de676df8f..620a3d9872ce 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2800lib.h
+++ b/drivers/net/wireless/ralink/rt2x00/rt2800lib.h
@@ -253,7 +253,8 @@ int rt2800_probe_hw(struct rt2x00_dev *rt2x00dev);
 void rt2800_get_key_seq(struct ieee80211_hw *hw,
 			struct ieee80211_key_conf *key,
 			struct ieee80211_key_seq *seq);
-int rt2800_set_rts_threshold(struct ieee80211_hw *hw, u32 value);
+int rt2800_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx,
+			     u32 value);
 int rt2800_conf_tx(struct ieee80211_hw *hw,
 		   struct ieee80211_vif *vif,
 		   unsigned int link_id, u16 queue_idx,
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00.h b/drivers/net/wireless/ralink/rt2x00/rt2x00.h
index dfb4bb370f01..09b9d1f9f793 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2x00.h
+++ b/drivers/net/wireless/ralink/rt2x00/rt2x00.h
@@ -1457,7 +1457,7 @@ int rt2x00mac_add_interface(struct ieee80211_hw *hw,
 			    struct ieee80211_vif *vif);
 void rt2x00mac_remove_interface(struct ieee80211_hw *hw,
 				struct ieee80211_vif *vif);
-int rt2x00mac_config(struct ieee80211_hw *hw, u32 changed);
+int rt2x00mac_config(struct ieee80211_hw *hw, int radio_idx, u32 changed);
 void rt2x00mac_configure_filter(struct ieee80211_hw *hw,
 				unsigned int changed_flags,
 				unsigned int *total_flags,
@@ -1489,8 +1489,10 @@ int rt2x00mac_conf_tx(struct ieee80211_hw *hw,
 void rt2x00mac_rfkill_poll(struct ieee80211_hw *hw);
 void rt2x00mac_flush(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 		     u32 queues, bool drop);
-int rt2x00mac_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant);
-int rt2x00mac_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant);
+int rt2x00mac_set_antenna(struct ieee80211_hw *hw, int radio_idx,
+			  u32 tx_ant, u32 rx_ant);
+int rt2x00mac_get_antenna(struct ieee80211_hw *hw, int radio_idx,
+			  u32 *tx_ant, u32 *rx_ant);
 void rt2x00mac_get_ringparam(struct ieee80211_hw *hw,
 			     u32 *tx, u32 *tx_max, u32 *rx, u32 *rx_max);
 bool rt2x00mac_tx_frames_pending(struct ieee80211_hw *hw);
diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00mac.c b/drivers/net/wireless/ralink/rt2x00/rt2x00mac.c
index 451632488805..3bc0c1c906c9 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2x00mac.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt2x00mac.c
@@ -304,7 +304,7 @@ void rt2x00mac_remove_interface(struct ieee80211_hw *hw,
 }
 EXPORT_SYMBOL_GPL(rt2x00mac_remove_interface);
 
-int rt2x00mac_config(struct ieee80211_hw *hw, u32 changed)
+int rt2x00mac_config(struct ieee80211_hw *hw, int radio_idx, u32 changed)
 {
 	struct rt2x00_dev *rt2x00dev = hw->priv;
 	struct ieee80211_conf *conf = &hw->conf;
@@ -740,7 +740,8 @@ void rt2x00mac_flush(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 }
 EXPORT_SYMBOL_GPL(rt2x00mac_flush);
 
-int rt2x00mac_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant)
+int rt2x00mac_set_antenna(struct ieee80211_hw *hw, int radio_idx,
+			  u32 tx_ant, u32 rx_ant)
 {
 	struct rt2x00_dev *rt2x00dev = hw->priv;
 	struct link_ant *ant = &rt2x00dev->link.ant;
@@ -785,7 +786,8 @@ int rt2x00mac_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant)
 }
 EXPORT_SYMBOL_GPL(rt2x00mac_set_antenna);
 
-int rt2x00mac_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant)
+int rt2x00mac_get_antenna(struct ieee80211_hw *hw, int radio_idx,
+			  u32 *tx_ant, u32 *rx_ant)
 {
 	struct rt2x00_dev *rt2x00dev = hw->priv;
 	struct link_ant *ant = &rt2x00dev->link.ant;
diff --git a/drivers/net/wireless/realtek/rtl818x/rtl8180/dev.c b/drivers/net/wireless/realtek/rtl818x/rtl8180/dev.c
index ded8d4d59289..2905baea6239 100644
--- a/drivers/net/wireless/realtek/rtl818x/rtl8180/dev.c
+++ b/drivers/net/wireless/realtek/rtl818x/rtl8180/dev.c
@@ -1370,7 +1370,7 @@ static void rtl8180_remove_interface(struct ieee80211_hw *dev,
 	priv->vif = NULL;
 }
 
-static int rtl8180_config(struct ieee80211_hw *dev, u32 changed)
+static int rtl8180_config(struct ieee80211_hw *dev, int radio_idx, u32 changed)
 {
 	struct rtl8180_priv *priv = dev->priv;
 	struct ieee80211_conf *conf = &dev->conf;
diff --git a/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c b/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c
index 220ac5bdf279..8857bb542c7f 100644
--- a/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c
+++ b/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c
@@ -1151,7 +1151,7 @@ static void rtl8187_remove_interface(struct ieee80211_hw *dev,
 	mutex_unlock(&priv->conf_mutex);
 }
 
-static int rtl8187_config(struct ieee80211_hw *dev, u32 changed)
+static int rtl8187_config(struct ieee80211_hw *dev, int radio_idx, u32 changed)
 {
 	struct rtl8187_priv *priv = dev->priv;
 	struct ieee80211_conf *conf = &dev->conf;
diff --git a/drivers/net/wireless/realtek/rtl8xxxu/core.c b/drivers/net/wireless/realtek/rtl8xxxu/core.c
index 569856ca677f..496836f716aa 100644
--- a/drivers/net/wireless/realtek/rtl8xxxu/core.c
+++ b/drivers/net/wireless/realtek/rtl8xxxu/core.c
@@ -4552,7 +4552,8 @@ static void rtl8xxxu_cam_write(struct rtl8xxxu_priv *priv,
 }
 
 static
-int rtl8xxxu_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant)
+int rtl8xxxu_get_antenna(struct ieee80211_hw *hw, int radio_idx, u32 *tx_ant,
+			 u32 *rx_ant)
 {
 	struct rtl8xxxu_priv *priv = hw->priv;
 
@@ -6839,7 +6840,7 @@ static void rtl8xxxu_remove_interface(struct ieee80211_hw *hw,
 	priv->vifs[rtlvif->port_num] = NULL;
 }
 
-static int rtl8xxxu_config(struct ieee80211_hw *hw, u32 changed)
+static int rtl8xxxu_config(struct ieee80211_hw *hw, int radio_idx, u32 changed)
 {
 	struct rtl8xxxu_priv *priv = hw->priv;
 	struct device *dev = &priv->udev->dev;
@@ -6988,7 +6989,8 @@ static void rtl8xxxu_configure_filter(struct ieee80211_hw *hw,
 			 FIF_PROBE_REQ);
 }
 
-static int rtl8xxxu_set_rts_threshold(struct ieee80211_hw *hw, u32 rts)
+static int rtl8xxxu_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx,
+				      u32 rts)
 {
 	if (rts > 2347 && rts != (u32)-1)
 		return -EINVAL;
diff --git a/drivers/net/wireless/realtek/rtlwifi/core.c b/drivers/net/wireless/realtek/rtlwifi/core.c
index 819cf519e66e..22633c301564 100644
--- a/drivers/net/wireless/realtek/rtlwifi/core.c
+++ b/drivers/net/wireless/realtek/rtlwifi/core.c
@@ -566,7 +566,7 @@ static int rtl_op_resume(struct ieee80211_hw *hw)
 }
 #endif
 
-static int rtl_op_config(struct ieee80211_hw *hw, u32 changed)
+static int rtl_op_config(struct ieee80211_hw *hw, int radio_idx, u32 changed)
 {
 	struct rtl_priv *rtlpriv = rtl_priv(hw);
 	struct rtl_phy *rtlphy = &(rtlpriv->phy);
diff --git a/drivers/net/wireless/realtek/rtw88/mac80211.c b/drivers/net/wireless/realtek/rtw88/mac80211.c
index 77f9fbe1870c..766f22d31079 100644
--- a/drivers/net/wireless/realtek/rtw88/mac80211.c
+++ b/drivers/net/wireless/realtek/rtw88/mac80211.c
@@ -71,7 +71,7 @@ static void rtw_ops_stop(struct ieee80211_hw *hw, bool suspend)
 	mutex_unlock(&rtwdev->mutex);
 }
 
-static int rtw_ops_config(struct ieee80211_hw *hw, u32 changed)
+static int rtw_ops_config(struct ieee80211_hw *hw, int radio_idx, u32 changed)
 {
 	struct rtw_dev *rtwdev = hw->priv;
 	int ret = 0;
@@ -708,7 +708,8 @@ static void rtw_ops_mgd_prepare_tx(struct ieee80211_hw *hw,
 	mutex_unlock(&rtwdev->mutex);
 }
 
-static int rtw_ops_set_rts_threshold(struct ieee80211_hw *hw, u32 value)
+static int rtw_ops_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx,
+				     u32 value)
 {
 	struct rtw_dev *rtwdev = hw->priv;
 
@@ -797,6 +798,7 @@ static int rtw_ops_set_bitrate_mask(struct ieee80211_hw *hw,
 }
 
 static int rtw_ops_set_antenna(struct ieee80211_hw *hw,
+			       int radio_idx,
 			       u32 tx_antenna,
 			       u32 rx_antenna)
 {
@@ -808,13 +810,14 @@ static int rtw_ops_set_antenna(struct ieee80211_hw *hw,
 		return -EOPNOTSUPP;
 
 	mutex_lock(&rtwdev->mutex);
-	ret = chip->ops->set_antenna(rtwdev, tx_antenna, rx_antenna);
+	ret = chip->ops->set_antenna(rtwdev, -1, tx_antenna, rx_antenna);
 	mutex_unlock(&rtwdev->mutex);
 
 	return ret;
 }
 
 static int rtw_ops_get_antenna(struct ieee80211_hw *hw,
+			       int radio_idx,
 			       u32 *tx_antenna,
 			       u32 *rx_antenna)
 {
diff --git a/drivers/net/wireless/realtek/rtw88/main.h b/drivers/net/wireless/realtek/rtw88/main.h
index b0f1fabe9554..7ae67143e909 100644
--- a/drivers/net/wireless/realtek/rtw88/main.h
+++ b/drivers/net/wireless/realtek/rtw88/main.h
@@ -873,7 +873,7 @@ struct rtw_chip_ops {
 	void (*set_tx_power_index)(struct rtw_dev *rtwdev);
 	int (*rsvd_page_dump)(struct rtw_dev *rtwdev, u8 *buf, u32 offset,
 			      u32 size);
-	int (*set_antenna)(struct rtw_dev *rtwdev,
+	int (*set_antenna)(struct rtw_dev *rtwdev, int radio_idx,
 			   u32 antenna_tx,
 			   u32 antenna_rx);
 	void (*cfg_ldo25)(struct rtw_dev *rtwdev, bool enable);
diff --git a/drivers/net/wireless/realtek/rtw88/rtw8822b.c b/drivers/net/wireless/realtek/rtw88/rtw8822b.c
index ab199eaea3c7..710126379e77 100644
--- a/drivers/net/wireless/realtek/rtw88/rtw8822b.c
+++ b/drivers/net/wireless/realtek/rtw88/rtw8822b.c
@@ -983,6 +983,7 @@ static bool rtw8822b_check_rf_path(u8 antenna)
 }
 
 static int rtw8822b_set_antenna(struct rtw_dev *rtwdev,
+				int radio_idx,
 				u32 antenna_tx,
 				u32 antenna_rx)
 {
diff --git a/drivers/net/wireless/realtek/rtw88/rtw8822c.c b/drivers/net/wireless/realtek/rtw88/rtw8822c.c
index 017d959de3ce..0ce6aa10493e 100644
--- a/drivers/net/wireless/realtek/rtw88/rtw8822c.c
+++ b/drivers/net/wireless/realtek/rtw88/rtw8822c.c
@@ -2767,6 +2767,7 @@ static void rtw8822c_set_tx_power_index(struct rtw_dev *rtwdev)
 }
 
 static int rtw8822c_set_antenna(struct rtw_dev *rtwdev,
+				int radio_idx,
 				u32 antenna_tx,
 				u32 antenna_rx)
 {
diff --git a/drivers/net/wireless/realtek/rtw89/mac80211.c b/drivers/net/wireless/realtek/rtw89/mac80211.c
index a47971003bd4..b9e046208424 100644
--- a/drivers/net/wireless/realtek/rtw89/mac80211.c
+++ b/drivers/net/wireless/realtek/rtw89/mac80211.c
@@ -72,7 +72,7 @@ static void rtw89_ops_stop(struct ieee80211_hw *hw, bool suspend)
 	rtw89_core_stop(rtwdev);
 }
 
-static int rtw89_ops_config(struct ieee80211_hw *hw, u32 changed)
+static int rtw89_ops_config(struct ieee80211_hw *hw, int radio_idx, u32 changed)
 {
 	struct rtw89_dev *rtwdev = hw->priv;
 
@@ -1007,7 +1007,8 @@ static int rtw89_ops_ampdu_action(struct ieee80211_hw *hw,
 	return 0;
 }
 
-static int rtw89_ops_set_rts_threshold(struct ieee80211_hw *hw, u32 value)
+static int rtw89_ops_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx,
+				       u32 value)
 {
 	struct rtw89_dev *rtwdev = hw->priv;
 
@@ -1119,7 +1120,7 @@ static int rtw89_ops_set_bitrate_mask(struct ieee80211_hw *hw,
 }
 
 static
-int rtw89_ops_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant)
+int rtw89_ops_set_antenna(struct ieee80211_hw *hw, int radio_idx, u32 tx_ant, u32 rx_ant)
 {
 	struct rtw89_dev *rtwdev = hw->priv;
 	struct rtw89_hal *hal = &rtwdev->hal;
@@ -1142,7 +1143,8 @@ int rtw89_ops_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant)
 }
 
 static
-int rtw89_ops_get_antenna(struct ieee80211_hw *hw,  u32 *tx_ant, u32 *rx_ant)
+int rtw89_ops_get_antenna(struct ieee80211_hw *hw, int radio_idx, u32 *tx_ant,
+			  u32 *rx_ant)
 {
 	struct rtw89_dev *rtwdev = hw->priv;
 	struct rtw89_hal *hal = &rtwdev->hal;
diff --git a/drivers/net/wireless/rsi/rsi_91x_mac80211.c b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
index 0e115b428f96..f3a853edfc11 100644
--- a/drivers/net/wireless/rsi/rsi_91x_mac80211.c
+++ b/drivers/net/wireless/rsi/rsi_91x_mac80211.c
@@ -656,11 +656,13 @@ static int rsi_config_power(struct ieee80211_hw *hw)
  *			   requests. The stack calls this function to
  *			   change hardware configuration, e.g., channel.
  * @hw: Pointer to the ieee80211_hw structure.
+ * @radio_idx: Radio index.
  * @changed: Changed flags set.
  *
  * Return: 0 on success, negative error code on failure.
  */
 static int rsi_mac80211_config(struct ieee80211_hw *hw,
+			       int radio_idx,
 			       u32 changed)
 {
 	struct rsi_hw *adapter = hw->priv;
@@ -1201,12 +1203,13 @@ unlock:
 /**
  * rsi_mac80211_set_rts_threshold() - This function sets rts threshold value.
  * @hw: Pointer to the ieee80211_hw structure.
+ * @radio_idx: Radio index.
  * @value: Rts threshold value.
  *
  * Return: 0 on success.
  */
 static int rsi_mac80211_set_rts_threshold(struct ieee80211_hw *hw,
-					  u32 value)
+					  int radio_idx, u32 value)
 {
 	struct rsi_hw *adapter = hw->priv;
 	struct rsi_common *common = adapter->priv;
@@ -1583,12 +1586,14 @@ static int rsi_mac80211_sta_remove(struct ieee80211_hw *hw,
  * rsi_mac80211_set_antenna() - This function is used to configure
  *				tx and rx antennas.
  * @hw: Pointer to the ieee80211_hw structure.
+ * @radio_idx: Radio index
  * @tx_ant: Bitmap for tx antenna
  * @rx_ant: Bitmap for rx antenna
  *
  * Return: 0 on success, Negative error code on failure.
  */
 static int rsi_mac80211_set_antenna(struct ieee80211_hw *hw,
+				    int radio_idx,
 				    u32 tx_ant, u32 rx_ant)
 {
 	struct rsi_hw *adapter = hw->priv;
@@ -1634,12 +1639,14 @@ fail_set_antenna:
  * 				tx and rx antennas.
  *
  * @hw: Pointer to the ieee80211_hw structure.
+ * @radio_idx: Radio index
  * @tx_ant: Bitmap for tx antenna
  * @rx_ant: Bitmap for rx antenna
  * 
  * Return: 0 on success, negative error codes on failure.
  */
 static int rsi_mac80211_get_antenna(struct ieee80211_hw *hw,
+				    int radio_idx,
 				    u32 *tx_ant, u32 *rx_ant)
 {
 	struct rsi_hw *adapter = hw->priv;
diff --git a/drivers/net/wireless/silabs/wfx/sta.c b/drivers/net/wireless/silabs/wfx/sta.c
index e95b9ded17d9..d12fcc755701 100644
--- a/drivers/net/wireless/silabs/wfx/sta.c
+++ b/drivers/net/wireless/silabs/wfx/sta.c
@@ -220,7 +220,7 @@ int wfx_conf_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 	return 0;
 }
 
-int wfx_set_rts_threshold(struct ieee80211_hw *hw, u32 value)
+int wfx_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, u32 value)
 {
 	struct wfx_dev *wdev = hw->priv;
 	struct wfx_vif *wvif = NULL;
@@ -706,7 +706,7 @@ void wfx_unassign_vif_chanctx(struct ieee80211_hw *hw, struct ieee80211_vif *vif
 	wvif->channel = NULL;
 }
 
-int wfx_config(struct ieee80211_hw *hw, u32 changed)
+int wfx_config(struct ieee80211_hw *hw, int radio_idx, u32 changed)
 {
 	return 0;
 }
diff --git a/drivers/net/wireless/silabs/wfx/sta.h b/drivers/net/wireless/silabs/wfx/sta.h
index 8702eed5267f..b4812b294f3c 100644
--- a/drivers/net/wireless/silabs/wfx/sta.h
+++ b/drivers/net/wireless/silabs/wfx/sta.h
@@ -21,8 +21,8 @@ struct wfx_sta_priv {
 /* mac80211 interface */
 int wfx_start(struct ieee80211_hw *hw);
 void wfx_stop(struct ieee80211_hw *hw, bool suspend);
-int wfx_config(struct ieee80211_hw *hw, u32 changed);
-int wfx_set_rts_threshold(struct ieee80211_hw *hw, u32 value);
+int wfx_config(struct ieee80211_hw *hw, int radio_idx, u32 changed);
+int wfx_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, u32 value);
 void wfx_set_default_unicast_key(struct ieee80211_hw *hw, struct ieee80211_vif *vif, int idx);
 void wfx_configure_filter(struct ieee80211_hw *hw, unsigned int changed_flags,
 			  unsigned int *total_flags, u64 unused);
diff --git a/drivers/net/wireless/st/cw1200/sta.c b/drivers/net/wireless/st/cw1200/sta.c
index 5dd7f6a38900..b1dd76e8aecb 100644
--- a/drivers/net/wireless/st/cw1200/sta.c
+++ b/drivers/net/wireless/st/cw1200/sta.c
@@ -321,7 +321,7 @@ int cw1200_change_interface(struct ieee80211_hw *dev,
 	return ret;
 }
 
-int cw1200_config(struct ieee80211_hw *dev, u32 changed)
+int cw1200_config(struct ieee80211_hw *dev, int radio_idx, u32 changed)
 {
 	int ret = 0;
 	struct cw1200_common *priv = dev->priv;
@@ -857,7 +857,8 @@ void cw1200_wep_key_work(struct work_struct *work)
 	wsm_unlock_tx(priv);
 }
 
-int cw1200_set_rts_threshold(struct ieee80211_hw *hw, u32 value)
+int cw1200_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx,
+			     u32 value)
 {
 	int ret = 0;
 	__le32 val32;
diff --git a/drivers/net/wireless/st/cw1200/sta.h b/drivers/net/wireless/st/cw1200/sta.h
index b955b92cfd73..b4f04371668d 100644
--- a/drivers/net/wireless/st/cw1200/sta.h
+++ b/drivers/net/wireless/st/cw1200/sta.h
@@ -22,7 +22,7 @@ int cw1200_change_interface(struct ieee80211_hw *dev,
 			    struct ieee80211_vif *vif,
 			    enum nl80211_iftype new_type,
 			    bool p2p);
-int cw1200_config(struct ieee80211_hw *dev, u32 changed);
+int cw1200_config(struct ieee80211_hw *dev, int radio_idx, u32 changed);
 void cw1200_configure_filter(struct ieee80211_hw *dev,
 			     unsigned int changed_flags,
 			     unsigned int *total_flags,
@@ -36,7 +36,8 @@ int cw1200_set_key(struct ieee80211_hw *dev, enum set_key_cmd cmd,
 		   struct ieee80211_vif *vif, struct ieee80211_sta *sta,
 		   struct ieee80211_key_conf *key);
 
-int cw1200_set_rts_threshold(struct ieee80211_hw *hw, u32 value);
+int cw1200_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx,
+			     u32 value);
 
 void cw1200_flush(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 		  u32 queues, bool drop);
diff --git a/drivers/net/wireless/ti/wl1251/main.c b/drivers/net/wireless/ti/wl1251/main.c
index bb53d681c11b..69fc51f183ad 100644
--- a/drivers/net/wireless/ti/wl1251/main.c
+++ b/drivers/net/wireless/ti/wl1251/main.c
@@ -589,7 +589,7 @@ static bool wl1251_can_do_pm(struct ieee80211_conf *conf, struct wl1251 *wl)
 	return (conf->flags & IEEE80211_CONF_PS) && !wl->monitor_present;
 }
 
-static int wl1251_op_config(struct ieee80211_hw *hw, u32 changed)
+static int wl1251_op_config(struct ieee80211_hw *hw, int radio_idx, u32 changed)
 {
 	struct wl1251 *wl = hw->priv;
 	struct ieee80211_conf *conf = &hw->conf;
@@ -1051,7 +1051,8 @@ out:
 	return ret;
 }
 
-static int wl1251_op_set_rts_threshold(struct ieee80211_hw *hw, u32 value)
+static int wl1251_op_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx,
+				       u32 value)
 {
 	struct wl1251 *wl = hw->priv;
 	int ret;
diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c
index f93c95edd991..6116a8522d96 100644
--- a/drivers/net/wireless/ti/wlcore/main.c
+++ b/drivers/net/wireless/ti/wlcore/main.c
@@ -3166,7 +3166,7 @@ static int wl12xx_config_vif(struct wl1271 *wl, struct wl12xx_vif *wlvif,
 	return 0;
 }
 
-static int wl1271_op_config(struct ieee80211_hw *hw, u32 changed)
+static int wl1271_op_config(struct ieee80211_hw *hw, int radio_idx, u32 changed)
 {
 	struct wl1271 *wl = hw->priv;
 	struct wl12xx_vif *wlvif;
@@ -3895,7 +3895,8 @@ out:
 	return 0;
 }
 
-static int wl1271_op_set_frag_threshold(struct ieee80211_hw *hw, u32 value)
+static int wl1271_op_set_frag_threshold(struct ieee80211_hw *hw,
+					int radio_idx, u32 value)
 {
 	struct wl1271 *wl = hw->priv;
 	int ret = 0;
@@ -3924,7 +3925,8 @@ out:
 	return ret;
 }
 
-static int wl1271_op_set_rts_threshold(struct ieee80211_hw *hw, u32 value)
+static int wl1271_op_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx,
+				       u32 value)
 {
 	struct wl1271 *wl = hw->priv;
 	struct wl12xx_vif *wlvif;
diff --git a/drivers/net/wireless/virtual/mac80211_hwsim.c b/drivers/net/wireless/virtual/mac80211_hwsim.c
index f6add19d1da1..eefe8da3b14d 100644
--- a/drivers/net/wireless/virtual/mac80211_hwsim.c
+++ b/drivers/net/wireless/virtual/mac80211_hwsim.c
@@ -2381,7 +2381,8 @@ static const char * const hwsim_chanwidths[] = {
 	[NL80211_CHAN_WIDTH_320] = "eht320",
 };
 
-static int mac80211_hwsim_config(struct ieee80211_hw *hw, u32 changed)
+static int mac80211_hwsim_config(struct ieee80211_hw *hw, int radio_idx,
+				 u32 changed)
 {
 	struct mac80211_hwsim_data *data = hw->priv;
 	struct ieee80211_conf *conf = &hw->conf;
@@ -3338,7 +3339,8 @@ static int mac80211_hwsim_tx_last_beacon(struct ieee80211_hw *hw)
 	return 1;
 }
 
-static int mac80211_hwsim_set_rts_threshold(struct ieee80211_hw *hw, u32 value)
+static int mac80211_hwsim_set_rts_threshold(struct ieee80211_hw *hw,
+					    int radio_idx, u32 value)
 {
 	return -EOPNOTSUPP;
 }
diff --git a/drivers/net/wireless/zydas/zd1211rw/zd_mac.c b/drivers/net/wireless/zydas/zd1211rw/zd_mac.c
index 9653dbaac3c0..f7c56174424d 100644
--- a/drivers/net/wireless/zydas/zd1211rw/zd_mac.c
+++ b/drivers/net/wireless/zydas/zd1211rw/zd_mac.c
@@ -1133,7 +1133,7 @@ static void zd_op_remove_interface(struct ieee80211_hw *hw,
 	zd_mac_free_cur_beacon(mac);
 }
 
-static int zd_op_config(struct ieee80211_hw *hw, u32 changed)
+static int zd_op_config(struct ieee80211_hw *hw, int radio_idx, u32 changed)
 {
 	struct zd_mac *mac = zd_hw_mac(hw);
 	struct ieee80211_conf *conf = &hw->conf;
diff --git a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c
index 7fcc46a0bb48..4e29652f8ee7 100644
--- a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c
+++ b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c
@@ -1298,7 +1298,8 @@ exit:
 	return ret;
 }
 
-static int cfg80211_rtw_set_wiphy_params(struct wiphy *wiphy, u32 changed)
+static int cfg80211_rtw_set_wiphy_params(struct wiphy *wiphy, int radio_idx,
+					 u32 changed)
 {
 	return 0;
 }
@@ -1795,7 +1796,7 @@ static int cfg80211_rtw_disconnect(struct wiphy *wiphy, struct net_device *ndev,
 }
 
 static int cfg80211_rtw_set_txpower(struct wiphy *wiphy,
-				    struct wireless_dev *wdev,
+				    struct wireless_dev *wdev, int radio_idx,
 				    enum nl80211_tx_power_setting type, int mbm)
 {
 	return 0;
@@ -1803,6 +1804,7 @@ static int cfg80211_rtw_set_txpower(struct wiphy *wiphy,
 
 static int cfg80211_rtw_get_txpower(struct wiphy *wiphy,
 				    struct wireless_dev *wdev,
+				    int radio_idx,
 				    unsigned int link_id, int *dbm)
 {
 	*dbm = (12);
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index eec066f4738a..ffd9564fc840 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -4853,12 +4853,14 @@ struct cfg80211_ops {
 	int	(*set_mcast_rate)(struct wiphy *wiphy, struct net_device *dev,
 				  int rate[NUM_NL80211_BANDS]);
 
-	int	(*set_wiphy_params)(struct wiphy *wiphy, u32 changed);
+	int	(*set_wiphy_params)(struct wiphy *wiphy, int radio_idx,
+				    u32 changed);
 
 	int	(*set_tx_power)(struct wiphy *wiphy, struct wireless_dev *wdev,
+				int radio_idx,
 				enum nl80211_tx_power_setting type, int mbm);
 	int	(*get_tx_power)(struct wiphy *wiphy, struct wireless_dev *wdev,
-				unsigned int link_id, int *dbm);
+				int radio_idx, unsigned int link_id, int *dbm);
 
 	void	(*rfkill_poll)(struct wiphy *wiphy);
 
@@ -4920,8 +4922,10 @@ struct cfg80211_ops {
 						   struct wireless_dev *wdev,
 						   struct mgmt_frame_regs *upd);
 
-	int	(*set_antenna)(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant);
-	int	(*get_antenna)(struct wiphy *wiphy, u32 *tx_ant, u32 *rx_ant);
+	int	(*set_antenna)(struct wiphy *wiphy, int radio_idx,
+			       u32 tx_ant, u32 rx_ant);
+	int	(*get_antenna)(struct wiphy *wiphy, int radio_idx,
+			       u32 *tx_ant, u32 *rx_ant);
 
 	int	(*sched_scan_start)(struct wiphy *wiphy,
 				struct net_device *dev,
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index fa2325692abf..a0de0da4d79b 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -4517,7 +4517,7 @@ struct ieee80211_ops {
 				enum nl80211_iftype new_type, bool p2p);
 	void (*remove_interface)(struct ieee80211_hw *hw,
 				 struct ieee80211_vif *vif);
-	int (*config)(struct ieee80211_hw *hw, u32 changed);
+	int (*config)(struct ieee80211_hw *hw, int radio_idx, u32 changed);
 	void (*bss_info_changed)(struct ieee80211_hw *hw,
 				 struct ieee80211_vif *vif,
 				 struct ieee80211_bss_conf *info,
@@ -4580,8 +4580,10 @@ struct ieee80211_ops {
 	void (*get_key_seq)(struct ieee80211_hw *hw,
 			    struct ieee80211_key_conf *key,
 			    struct ieee80211_key_seq *seq);
-	int (*set_frag_threshold)(struct ieee80211_hw *hw, u32 value);
-	int (*set_rts_threshold)(struct ieee80211_hw *hw, u32 value);
+	int (*set_frag_threshold)(struct ieee80211_hw *hw, int radio_idx,
+				  u32 value);
+	int (*set_rts_threshold)(struct ieee80211_hw *hw, int radio_idx,
+				 u32 value);
 	int (*sta_add)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 		       struct ieee80211_sta *sta);
 	int (*sta_remove)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
@@ -4678,7 +4680,8 @@ struct ieee80211_ops {
 	int (*get_survey)(struct ieee80211_hw *hw, int idx,
 		struct survey_info *survey);
 	void (*rfkill_poll)(struct ieee80211_hw *hw);
-	void (*set_coverage_class)(struct ieee80211_hw *hw, s16 coverage_class);
+	void (*set_coverage_class)(struct ieee80211_hw *hw, int radio_idx,
+				   s16 coverage_class);
 #ifdef CONFIG_NL80211_TESTMODE
 	int (*testmode_cmd)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 			    void *data, int len);
@@ -4693,8 +4696,10 @@ struct ieee80211_ops {
 	void (*channel_switch)(struct ieee80211_hw *hw,
 			       struct ieee80211_vif *vif,
 			       struct ieee80211_channel_switch *ch_switch);
-	int (*set_antenna)(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant);
-	int (*get_antenna)(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant);
+	int (*set_antenna)(struct ieee80211_hw *hw, int radio_idx,
+			   u32 tx_ant, u32 rx_ant);
+	int (*get_antenna)(struct ieee80211_hw *hw, int radio_idx,
+			   u32 *tx_ant, u32 *rx_ant);
 
 	int (*remain_on_channel)(struct ieee80211_hw *hw,
 				 struct ieee80211_vif *vif,
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index a289014abe37..2a71149c3065 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -2907,6 +2907,14 @@ enum nl80211_commands {
  *	APs Support". Drivers may set additional flags that they support
  *	in the kernel or device.
  *
+ * @NL80211_ATTR_WIPHY_RADIO_INDEX: (int) Integer attribute denoting the index
+ *	of the radio of interest. Internally a value of -1 is used to
+ *	indicate that the radio id is not given in user-space. This means
+ *	that all the attributes are applicable to all the radios. If there is
+ *	a radio index provided in user-space, the attributes will be
+ *	applicable to that specific radio only. If the radio id is greater
+ *	than the number of radios, error denoting invalid value is returned.
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -3464,6 +3472,8 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_ASSOC_MLD_EXT_CAPA_OPS,
 
+	NL80211_ATTR_WIPHY_RADIO_INDEX,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 1a17d66dfa75..72cecc304658 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -3045,7 +3045,8 @@ static int ieee80211_set_mcast_rate(struct wiphy *wiphy, struct net_device *dev,
 	return 0;
 }
 
-static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed)
+static int ieee80211_set_wiphy_params(struct wiphy *wiphy, int radio_idx,
+				      u32 changed)
 {
 	struct ieee80211_local *local = wiphy_priv(wiphy);
 	int err;
@@ -3053,7 +3054,8 @@ static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed)
 	if (changed & WIPHY_PARAM_FRAG_THRESHOLD) {
 		ieee80211_check_fast_xmit_all(local);
 
-		err = drv_set_frag_threshold(local, wiphy->frag_threshold);
+		err = drv_set_frag_threshold(local, radio_idx,
+					     wiphy->frag_threshold);
 
 		if (err) {
 			ieee80211_check_fast_xmit_all(local);
@@ -3067,14 +3069,16 @@ static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed)
 
 		coverage_class = changed & WIPHY_PARAM_COVERAGE_CLASS ?
 					wiphy->coverage_class : -1;
-		err = drv_set_coverage_class(local, coverage_class);
+		err = drv_set_coverage_class(local, radio_idx,
+					     coverage_class);
 
 		if (err)
 			return err;
 	}
 
 	if (changed & WIPHY_PARAM_RTS_THRESHOLD) {
-		err = drv_set_rts_threshold(local, wiphy->rts_threshold);
+		err = drv_set_rts_threshold(local, radio_idx,
+					    wiphy->rts_threshold);
 
 		if (err)
 			return err;
@@ -3092,18 +3096,19 @@ static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed)
 	}
 	if (changed &
 	    (WIPHY_PARAM_RETRY_SHORT | WIPHY_PARAM_RETRY_LONG))
-		ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_RETRY_LIMITS);
+		ieee80211_hw_config(local, radio_idx,
+				    IEEE80211_CONF_CHANGE_RETRY_LIMITS);
 
 	if (changed & (WIPHY_PARAM_TXQ_LIMIT |
 		       WIPHY_PARAM_TXQ_MEMORY_LIMIT |
 		       WIPHY_PARAM_TXQ_QUANTUM))
-		ieee80211_txq_set_params(local);
+		ieee80211_txq_set_params(local, radio_idx);
 
 	return 0;
 }
 
 static int ieee80211_set_tx_power(struct wiphy *wiphy,
-				  struct wireless_dev *wdev,
+				  struct wireless_dev *wdev, int radio_idx,
 				  enum nl80211_tx_power_setting type, int mbm)
 {
 	struct ieee80211_local *local = wiphy_priv(wiphy);
@@ -3231,6 +3236,7 @@ static int ieee80211_set_tx_power(struct wiphy *wiphy,
 
 static int ieee80211_get_tx_power(struct wiphy *wiphy,
 				  struct wireless_dev *wdev,
+				  int radio_idx,
 				  unsigned int link_id,
 				  int *dbm)
 {
@@ -3409,7 +3415,7 @@ static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev,
 	}
 
 	if (ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS))
-		ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS);
+		ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS);
 
 	ieee80211_recalc_ps(local);
 	ieee80211_recalc_ps_vif(sdata);
@@ -4305,7 +4311,8 @@ ieee80211_update_mgmt_frame_registrations(struct wiphy *wiphy,
 		ieee80211_configure_filter(local);
 }
 
-static int ieee80211_set_antenna(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant)
+static int ieee80211_set_antenna(struct wiphy *wiphy, int radio_idx,
+				 u32 tx_ant, u32 rx_ant)
 {
 	struct ieee80211_local *local = wiphy_priv(wiphy);
 	int ret;
@@ -4321,11 +4328,12 @@ static int ieee80211_set_antenna(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant)
 	return 0;
 }
 
-static int ieee80211_get_antenna(struct wiphy *wiphy, u32 *tx_ant, u32 *rx_ant)
+static int ieee80211_get_antenna(struct wiphy *wiphy, int radio_idx,
+				 u32 *tx_ant, u32 *rx_ant)
 {
 	struct ieee80211_local *local = wiphy_priv(wiphy);
 
-	return drv_get_antenna(local, tx_ant, rx_ant);
+	return drv_get_antenna(local, radio_idx, tx_ant, rx_ant);
 }
 
 static int ieee80211_set_rekey_data(struct wiphy *wiphy,
diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
index d62f91656a19..4bcbcf9d98b5 100644
--- a/net/mac80211/chan.c
+++ b/net/mac80211/chan.c
@@ -744,7 +744,7 @@ static int ieee80211_add_chanctx(struct ieee80211_local *local,
 	/* turn idle off *before* setting channel -- some drivers need that */
 	changed = ieee80211_idle_off(local);
 	if (changed)
-		ieee80211_hw_config(local, changed);
+		ieee80211_hw_config(local, -1, changed);
 
 	err = drv_add_chanctx(local, ctx);
 	if (err) {
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index ba017bf3fd15..8baebb5636ec 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -143,15 +143,16 @@ int drv_change_interface(struct ieee80211_local *local,
 void drv_remove_interface(struct ieee80211_local *local,
 			  struct ieee80211_sub_if_data *sdata);
 
-static inline int drv_config(struct ieee80211_local *local, u32 changed)
+static inline int drv_config(struct ieee80211_local *local, int radio_idx,
+			     u32 changed)
 {
 	int ret;
 
 	might_sleep();
 	lockdep_assert_wiphy(local->hw.wiphy);
 
-	trace_drv_config(local, changed);
-	ret = local->ops->config(&local->hw, changed);
+	trace_drv_config(local, radio_idx, changed);
+	ret = local->ops->config(&local->hw, radio_idx, changed);
 	trace_drv_return_int(local, ret);
 	return ret;
 }
@@ -387,45 +388,47 @@ static inline void drv_get_key_seq(struct ieee80211_local *local,
 }
 
 static inline int drv_set_frag_threshold(struct ieee80211_local *local,
-					u32 value)
+					 int radio_idx, u32 value)
 {
 	int ret = 0;
 
 	might_sleep();
 	lockdep_assert_wiphy(local->hw.wiphy);
 
-	trace_drv_set_frag_threshold(local, value);
+	trace_drv_set_frag_threshold(local, radio_idx, value);
 	if (local->ops->set_frag_threshold)
-		ret = local->ops->set_frag_threshold(&local->hw, value);
+		ret = local->ops->set_frag_threshold(&local->hw, radio_idx,
+						     value);
 	trace_drv_return_int(local, ret);
 	return ret;
 }
 
 static inline int drv_set_rts_threshold(struct ieee80211_local *local,
-					u32 value)
+					int radio_idx, u32 value)
 {
 	int ret = 0;
 
 	might_sleep();
 	lockdep_assert_wiphy(local->hw.wiphy);
 
-	trace_drv_set_rts_threshold(local, value);
+	trace_drv_set_rts_threshold(local, radio_idx, value);
 	if (local->ops->set_rts_threshold)
-		ret = local->ops->set_rts_threshold(&local->hw, value);
+		ret = local->ops->set_rts_threshold(&local->hw, radio_idx,
+						    value);
 	trace_drv_return_int(local, ret);
 	return ret;
 }
 
 static inline int drv_set_coverage_class(struct ieee80211_local *local,
-					 s16 value)
+					 int radio_idx, s16 value)
 {
 	int ret = 0;
 	might_sleep();
 	lockdep_assert_wiphy(local->hw.wiphy);
 
-	trace_drv_set_coverage_class(local, value);
+	trace_drv_set_coverage_class(local, radio_idx, value);
 	if (local->ops->set_coverage_class)
-		local->ops->set_coverage_class(&local->hw, value);
+		local->ops->set_coverage_class(&local->hw, radio_idx, value);
 	else
 		ret = -EOPNOTSUPP;
 
@@ -772,20 +775,21 @@ static inline int drv_set_antenna(struct ieee80211_local *local,
 	might_sleep();
 	lockdep_assert_wiphy(local->hw.wiphy);
 	if (local->ops->set_antenna)
-		ret = local->ops->set_antenna(&local->hw, tx_ant, rx_ant);
+		ret = local->ops->set_antenna(&local->hw, -1, tx_ant, rx_ant);
 	trace_drv_set_antenna(local, tx_ant, rx_ant, ret);
 	return ret;
 }
 
-static inline int drv_get_antenna(struct ieee80211_local *local,
+static inline int drv_get_antenna(struct ieee80211_local *local, int radio_idx,
 				  u32 *tx_ant, u32 *rx_ant)
 {
 	int ret = -EOPNOTSUPP;
 	might_sleep();
 	lockdep_assert_wiphy(local->hw.wiphy);
 	if (local->ops->get_antenna)
-		ret = local->ops->get_antenna(&local->hw, tx_ant, rx_ant);
-	trace_drv_get_antenna(local, *tx_ant, *rx_ant, ret);
+		ret = local->ops->get_antenna(&local->hw, radio_idx,
+					      tx_ant, rx_ant);
+	trace_drv_get_antenna(local, radio_idx, *tx_ant, *rx_ant, ret);
 	return ret;
 }
 
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 9b9c7209878b..f59a5b38e6f2 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1872,7 +1872,8 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local,
 				     struct ieee80211_rx_status *status,
 				     unsigned int mpdu_len,
 				     unsigned int mpdu_offset);
-int ieee80211_hw_config(struct ieee80211_local *local, u32 changed);
+int ieee80211_hw_config(struct ieee80211_local *local, int radio_idx,
+			u32 changed);
 int ieee80211_hw_conf_chan(struct ieee80211_local *local);
 void ieee80211_hw_conf_init(struct ieee80211_local *local);
 void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx);
@@ -2542,7 +2543,7 @@ static inline bool ieee80211_can_run_worker(struct ieee80211_local *local)
 }
 
 int ieee80211_txq_setup_flows(struct ieee80211_local *local);
-void ieee80211_txq_set_params(struct ieee80211_local *local);
+void ieee80211_txq_set_params(struct ieee80211_local *local, int radio_idx);
 void ieee80211_txq_teardown_flows(struct ieee80211_local *local);
 void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata,
 			struct sta_info *sta,
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 7c27f3cd841c..7b2baebb8644 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -146,7 +146,7 @@ void ieee80211_recalc_idle(struct ieee80211_local *local)
 {
 	u32 change = __ieee80211_recalc_idle(local, false);
 	if (change)
-		ieee80211_hw_config(local, change);
+		ieee80211_hw_config(local, -1, change);
 }
 
 static int ieee80211_verify_mac(struct ieee80211_sub_if_data *sdata, u8 *addr,
@@ -726,7 +726,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do
 
 	/* do after stop to avoid reconfiguring when we stop anyway */
 	ieee80211_configure_filter(local);
-	ieee80211_hw_config(local, hw_reconf_flags);
+	ieee80211_hw_config(local, -1, hw_reconf_flags);
 
 	if (local->virt_monitors == local->open_count)
 		ieee80211_add_virtual_monitor(local);
@@ -1491,7 +1491,7 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
 	if (local->open_count == 1)
 		ieee80211_hw_conf_init(local);
 	else if (hw_reconf_flags)
-		ieee80211_hw_config(local, hw_reconf_flags);
+		ieee80211_hw_config(local, -1, hw_reconf_flags);
 
 	ieee80211_recalc_ps(local);
 
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 6b6de43d9420..c1c758e76d2e 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -190,7 +190,8 @@ static u32 ieee80211_calc_hw_conf_chan(struct ieee80211_local *local,
 	return changed;
 }
 
-int ieee80211_hw_config(struct ieee80211_local *local, u32 changed)
+int ieee80211_hw_config(struct ieee80211_local *local, int radio_idx,
+			u32 changed)
 {
 	int ret = 0;
 
@@ -201,7 +202,7 @@ int ieee80211_hw_config(struct ieee80211_local *local, u32 changed)
 			   IEEE80211_CONF_CHANGE_SMPS));
 
 	if (changed && local->open_count) {
-		ret = drv_config(local, changed);
+		ret = drv_config(local, radio_idx, changed);
 		/*
 		 * Goal:
 		 * HW reconfiguration should never fail, the driver has told
@@ -235,7 +236,7 @@ static int _ieee80211_hw_conf_chan(struct ieee80211_local *local,
 	if (!changed)
 		return 0;
 
-	return drv_config(local, changed);
+	return drv_config(local, -1, changed);
 }
 
 int ieee80211_hw_conf_chan(struct ieee80211_local *local)
@@ -269,7 +270,7 @@ void ieee80211_hw_conf_init(struct ieee80211_local *local)
 						       ctx ? &ctx->conf : NULL);
 	}
 
-	WARN_ON(drv_config(local, changed));
+	WARN_ON(drv_config(local, -1, changed));
 }
 
 int ieee80211_emulate_add_chanctx(struct ieee80211_hw *hw,
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 2d46d4af60d7..d526f2fe9fe5 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -3181,7 +3181,7 @@ static void ieee80211_enable_ps(struct ieee80211_local *local,
 			return;
 
 		conf->flags |= IEEE80211_CONF_PS;
-		ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS);
+		ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS);
 	}
 }
 
@@ -3193,7 +3193,7 @@ static void ieee80211_change_ps(struct ieee80211_local *local)
 		ieee80211_enable_ps(local, local->ps_sdata);
 	} else if (conf->flags & IEEE80211_CONF_PS) {
 		conf->flags &= ~IEEE80211_CONF_PS;
-		ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS);
+		ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS);
 		timer_delete_sync(&local->dynamic_ps_timer);
 		wiphy_work_cancel(local->hw.wiphy,
 				  &local->dynamic_ps_enable_work);
@@ -3302,7 +3302,7 @@ void ieee80211_dynamic_ps_disable_work(struct wiphy *wiphy,
 
 	if (local->hw.conf.flags & IEEE80211_CONF_PS) {
 		local->hw.conf.flags &= ~IEEE80211_CONF_PS;
-		ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS);
+		ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS);
 	}
 
 	ieee80211_wake_queues_by_reason(&local->hw,
@@ -3377,7 +3377,7 @@ void ieee80211_dynamic_ps_enable_work(struct wiphy *wiphy,
 	    (ifmgd->flags & IEEE80211_STA_NULLFUNC_ACKED)) {
 		ifmgd->flags &= ~IEEE80211_STA_NULLFUNC_ACKED;
 		local->hw.conf.flags |= IEEE80211_CONF_PS;
-		ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS);
+		ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS);
 	}
 }
 
@@ -3986,7 +3986,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 	 */
 	if (local->hw.conf.flags & IEEE80211_CONF_PS) {
 		local->hw.conf.flags &= ~IEEE80211_CONF_PS;
-		ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS);
+		ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS);
 	}
 	local->ps_sdata = NULL;
 
@@ -7340,7 +7340,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link,
 		if (local->hw.conf.dynamic_ps_timeout > 0) {
 			if (local->hw.conf.flags & IEEE80211_CONF_PS) {
 				local->hw.conf.flags &= ~IEEE80211_CONF_PS;
-				ieee80211_hw_config(local,
+				ieee80211_hw_config(local, -1,
 						    IEEE80211_CONF_CHANGE_PS);
 			}
 			ieee80211_send_nullfunc(local, sdata, false);
diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c
index 686d9f6e9b52..13df6321634d 100644
--- a/net/mac80211/offchannel.c
+++ b/net/mac80211/offchannel.c
@@ -39,7 +39,7 @@ static void ieee80211_offchannel_ps_enable(struct ieee80211_sub_if_data *sdata)
 	if (local->hw.conf.flags & IEEE80211_CONF_PS) {
 		offchannel_ps_enabled = true;
 		local->hw.conf.flags &= ~IEEE80211_CONF_PS;
-		ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS);
+		ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS);
 	}
 
 	if (!offchannel_ps_enabled ||
diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c
index a9cc832240a5..5a508d99e84f 100644
--- a/net/mac80211/pm.c
+++ b/net/mac80211/pm.c
@@ -108,7 +108,7 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan)
 			    sdata->u.mgd.powersave &&
 			     !(local->hw.conf.flags & IEEE80211_CONF_PS)) {
 				local->hw.conf.flags |= IEEE80211_CONF_PS;
-				ieee80211_hw_config(local,
+				ieee80211_hw_config(local, -1,
 						    IEEE80211_CONF_CHANGE_PS);
 			}
 		}
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 8215ca58ce5e..0bfbce157486 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -384,12 +384,14 @@ DEFINE_EVENT(local_sdata_addr_evt, drv_remove_interface,
 
 TRACE_EVENT(drv_config,
 	TP_PROTO(struct ieee80211_local *local,
+		 int radio_idx,
 		 u32 changed),
 
-	TP_ARGS(local, changed),
+	TP_ARGS(local, radio_idx, changed),
 
 	TP_STRUCT__entry(
 		LOCAL_ENTRY
+		__field(int, radio_idx)
 		__field(u32, changed)
 		__field(u32, flags)
 		__field(int, power_level)
@@ -403,6 +405,7 @@ TRACE_EVENT(drv_config,
 
 	TP_fast_assign(
 		LOCAL_ASSIGN;
+		__entry->radio_idx = radio_idx;
 		__entry->changed = changed;
 		__entry->flags = local->hw.conf.flags;
 		__entry->power_level = local->hw.conf.power_level;
@@ -417,8 +420,8 @@ TRACE_EVENT(drv_config,
 	),
 
 	TP_printk(
-		LOCAL_PR_FMT " ch:%#x" CHANDEF_PR_FMT,
-		LOCAL_PR_ARG, __entry->changed, CHANDEF_PR_ARG
+		LOCAL_PR_FMT " radio_idx:%d ch:%#x" CHANDEF_PR_FMT,
+		LOCAL_PR_ARG, __entry->radio_idx, __entry->changed, CHANDEF_PR_ARG
 	)
 );
 
@@ -818,34 +821,71 @@ TRACE_EVENT(drv_get_key_seq,
 	)
 );
 
-DEFINE_EVENT(local_u32_evt, drv_set_frag_threshold,
-	TP_PROTO(struct ieee80211_local *local, u32 value),
-	TP_ARGS(local, value)
+TRACE_EVENT(drv_set_frag_threshold,
+	TP_PROTO(struct ieee80211_local *local, int radio_idx, u32 value),
+
+	TP_ARGS(local, radio_idx, value),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		__field(int, radio_idx)
+		__field(u32, value)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		__entry->radio_idx = radio_idx;
+		__entry->value = value;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT " radio_id:%d value:%u",
+		LOCAL_PR_ARG, __entry->radio_idx, __entry->value
+	)
 );
 
-DEFINE_EVENT(local_u32_evt, drv_set_rts_threshold,
-	TP_PROTO(struct ieee80211_local *local, u32 value),
-	TP_ARGS(local, value)
+TRACE_EVENT(drv_set_rts_threshold,
+	TP_PROTO(struct ieee80211_local *local, int radio_idx, u32 value),
+
+	TP_ARGS(local, radio_idx, value),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		__field(int, radio_idx)
+		__field(u32, value)
+	),
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		__entry->radio_idx = radio_idx;
+		__entry->value = value;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT " radio_id:%d value:%u",
+		LOCAL_PR_ARG, __entry->radio_idx, __entry->value
+	)
 );
 
 TRACE_EVENT(drv_set_coverage_class,
-	TP_PROTO(struct ieee80211_local *local, s16 value),
+	TP_PROTO(struct ieee80211_local *local, int radio_idx, s16 value),
 
-	TP_ARGS(local, value),
+	TP_ARGS(local, radio_idx, value),
 
 	TP_STRUCT__entry(
 		LOCAL_ENTRY
+		__field(int, radio_idx)
 		__field(s16, value)
 	),
 
 	TP_fast_assign(
 		LOCAL_ASSIGN;
+		__entry->radio_idx = radio_idx;
 		__entry->value = value;
 	),
 
 	TP_printk(
-		LOCAL_PR_FMT " value:%d",
-		LOCAL_PR_ARG, __entry->value
+		LOCAL_PR_FMT " radio_id:%d value:%d",
+		LOCAL_PR_ARG, __entry->radio_idx, __entry->value
 	)
 );
 
@@ -1318,12 +1358,14 @@ TRACE_EVENT(drv_set_antenna,
 );
 
 TRACE_EVENT(drv_get_antenna,
-	TP_PROTO(struct ieee80211_local *local, u32 tx_ant, u32 rx_ant, int ret),
+	TP_PROTO(struct ieee80211_local *local, int radio_idx, u32 tx_ant,
+		 u32 rx_ant, int ret),
 
-	TP_ARGS(local, tx_ant, rx_ant, ret),
+	TP_ARGS(local, radio_idx, tx_ant, rx_ant, ret),
 
 	TP_STRUCT__entry(
 		LOCAL_ENTRY
+		__field(int, radio_idx)
 		__field(u32, tx_ant)
 		__field(u32, rx_ant)
 		__field(int, ret)
@@ -1331,14 +1373,16 @@ TRACE_EVENT(drv_get_antenna,
 
 	TP_fast_assign(
 		LOCAL_ASSIGN;
+		__entry->radio_idx = radio_idx;
 		__entry->tx_ant = tx_ant;
 		__entry->rx_ant = rx_ant;
 		__entry->ret = ret;
 	),
 
 	TP_printk(
-		LOCAL_PR_FMT " tx_ant:%d rx_ant:%d ret:%d",
-		LOCAL_PR_ARG, __entry->tx_ant, __entry->rx_ant, __entry->ret
+		LOCAL_PR_FMT " radio_idx:%d tx_ant:%d rx_ant:%d ret:%d",
+		LOCAL_PR_ARG, __entry->radio_idx, __entry->tx_ant,
+		__entry->rx_ant, __entry->ret
 	)
 );
 
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index d58b80813bdd..6278d55aeb2e 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1541,7 +1541,7 @@ void ieee80211_txq_purge(struct ieee80211_local *local,
 	spin_unlock_bh(&local->active_txq_lock[txqi->txq.ac]);
 }
 
-void ieee80211_txq_set_params(struct ieee80211_local *local)
+void ieee80211_txq_set_params(struct ieee80211_local *local, int radio_idx)
 {
 	if (local->hw.wiphy->txq_limit)
 		local->fq.limit = local->hw.wiphy->txq_limit;
@@ -1605,7 +1605,7 @@ int ieee80211_txq_setup_flows(struct ieee80211_local *local)
 	for (i = 0; i < fq->flows_cnt; i++)
 		codel_vars_init(&local->cvars[i]);
 
-	ieee80211_txq_set_params(local);
+	ieee80211_txq_set_params(local, -1);
 
 	return 0;
 }
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 24c43a1ef2aa..773c8da0acc9 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -1826,13 +1826,13 @@ int ieee80211_reconfig(struct ieee80211_local *local)
 	}
 
 	/* setup fragmentation threshold */
-	drv_set_frag_threshold(local, hw->wiphy->frag_threshold);
+	drv_set_frag_threshold(local, -1, hw->wiphy->frag_threshold);
 
 	/* setup RTS threshold */
-	drv_set_rts_threshold(local, hw->wiphy->rts_threshold);
+	drv_set_rts_threshold(local, -1, hw->wiphy->rts_threshold);
 
 	/* reset coverage class */
-	drv_set_coverage_class(local, hw->wiphy->coverage_class);
+	drv_set_coverage_class(local, -1, hw->wiphy->coverage_class);
 
 	ieee80211_led_radio(local, true);
 	ieee80211_mod_tpt_led_trig(local,
@@ -1890,11 +1890,11 @@ int ieee80211_reconfig(struct ieee80211_local *local)
 		ieee80211_assign_chanctx(local, sdata, &sdata->deflink);
 
 	/* reconfigure hardware */
-	ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_LISTEN_INTERVAL |
-				   IEEE80211_CONF_CHANGE_MONITOR |
-				   IEEE80211_CONF_CHANGE_PS |
-				   IEEE80211_CONF_CHANGE_RETRY_LIMITS |
-				   IEEE80211_CONF_CHANGE_IDLE);
+	ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_LISTEN_INTERVAL |
+				       IEEE80211_CONF_CHANGE_MONITOR |
+				       IEEE80211_CONF_CHANGE_PS |
+				       IEEE80211_CONF_CHANGE_RETRY_LIMITS |
+				       IEEE80211_CONF_CHANGE_IDLE);
 
 	ieee80211_configure_filter(local);
 
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 9ef618baac9e..b40978549790 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -854,6 +854,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	[NL80211_ATTR_MLO_RECONF_REM_LINKS] = { .type = NLA_U16 },
 	[NL80211_ATTR_EPCS] = { .type = NLA_FLAG },
 	[NL80211_ATTR_ASSOC_MLD_EXT_CAPA_OPS] = { .type = NLA_U16 },
+	[NL80211_ATTR_WIPHY_RADIO_INDEX] = { .type = NLA_U8 },
 };
 
 /* policy for the key attributes */
@@ -2639,7 +2640,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
 			u32 tx_ant = 0, rx_ant = 0;
 			int res;
 
-			res = rdev_get_antenna(rdev, &tx_ant, &rx_ant);
+			res = rdev_get_antenna(rdev, -1, &tx_ant, &rx_ant);
 			if (!res) {
 				if (nla_put_u32(msg,
 						NL80211_ATTR_WIPHY_ANTENNA_TX,
@@ -3620,6 +3621,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 	u32 frag_threshold = 0, rts_threshold = 0;
 	u8 coverage_class = 0;
 	u32 txq_limit = 0, txq_memory_limit = 0, txq_quantum = 0;
+	int radio_idx = -1;
 
 	rtnl_lock();
 	/*
@@ -3670,6 +3672,17 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 	if (result)
 		return result;
 
+	if (info->attrs[NL80211_ATTR_WIPHY_RADIO_INDEX]) {
+		/* Radio idx is not expected for non-multi radio wiphy */
+		if (rdev->wiphy.n_radio <= 0)
+			return -EINVAL;
+
+		radio_idx = nla_get_u8(
+				info->attrs[NL80211_ATTR_WIPHY_RADIO_INDEX]);
+		if (radio_idx >= rdev->wiphy.n_radio)
+			return -EINVAL;
+	}
+
 	if (info->attrs[NL80211_ATTR_WIPHY_TXQ_PARAMS]) {
 		struct ieee80211_txq_params txq_params;
 		struct nlattr *tb[NL80211_TXQ_ATTR_MAX + 1];
@@ -3759,7 +3772,8 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 			mbm = nla_get_u32(info->attrs[idx]);
 		}
 
-		result = rdev_set_tx_power(rdev, txp_wdev, type, mbm);
+		result = rdev_set_tx_power(rdev, txp_wdev, radio_idx, type,
+					   mbm);
 		if (result)
 			return result;
 	}
@@ -3785,7 +3799,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 		tx_ant = tx_ant & rdev->wiphy.available_antennas_tx;
 		rx_ant = rx_ant & rdev->wiphy.available_antennas_rx;
 
-		result = rdev_set_antenna(rdev, tx_ant, rx_ant);
+		result = rdev_set_antenna(rdev, radio_idx, tx_ant, rx_ant);
 		if (result)
 			return result;
 	}
@@ -3911,7 +3925,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 		if (changed & WIPHY_PARAM_TXQ_QUANTUM)
 			rdev->wiphy.txq_quantum = txq_quantum;
 
-		result = rdev_set_wiphy_params(rdev, changed);
+		result = rdev_set_wiphy_params(rdev, radio_idx, changed);
 		if (result) {
 			rdev->wiphy.retry_short = old_retry_short;
 			rdev->wiphy.retry_long = old_retry_long;
@@ -4012,7 +4026,7 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag
 	if (rdev->ops->get_tx_power && !wdev->valid_links) {
 		int dbm, ret;
 
-		ret = rdev_get_tx_power(rdev, wdev, 0, &dbm);
+		ret = rdev_get_tx_power(rdev, wdev, -1, 0, &dbm);
 		if (ret == 0 &&
 		    nla_put_u32(msg, NL80211_ATTR_WIPHY_TX_POWER_LEVEL,
 				DBM_TO_MBM(dbm)))
@@ -4084,7 +4098,7 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag
 			if (rdev->ops->get_tx_power) {
 				int dbm, ret;
 
-				ret = rdev_get_tx_power(rdev, wdev, link_id, &dbm);
+				ret = rdev_get_tx_power(rdev, wdev, -1, link_id, &dbm);
 				if (ret == 0 &&
 				    nla_put_u32(msg, NL80211_ATTR_WIPHY_TX_POWER_LEVEL,
 						DBM_TO_MBM(dbm)))
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index 9f4783c2354c..803b39c26587 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -577,35 +577,40 @@ static inline int rdev_leave_ibss(struct cfg80211_registered_device *rdev,
 }
 
 static inline int
-rdev_set_wiphy_params(struct cfg80211_registered_device *rdev, u32 changed)
+rdev_set_wiphy_params(struct cfg80211_registered_device *rdev, int radio_idx,
+		      u32 changed)
 {
 	int ret = -EOPNOTSUPP;
 
-	trace_rdev_set_wiphy_params(&rdev->wiphy, changed);
+	trace_rdev_set_wiphy_params(&rdev->wiphy, radio_idx, changed);
 	if (rdev->ops->set_wiphy_params)
-		ret = rdev->ops->set_wiphy_params(&rdev->wiphy, changed);
+		ret = rdev->ops->set_wiphy_params(&rdev->wiphy, radio_idx,
+						  changed);
 	trace_rdev_return_int(&rdev->wiphy, ret);
 	return ret;
 }
 
 static inline int rdev_set_tx_power(struct cfg80211_registered_device *rdev,
-				    struct wireless_dev *wdev,
-				    enum nl80211_tx_power_setting type, int mbm)
+				    struct wireless_dev *wdev, int radio_idx,
+				    enum nl80211_tx_power_setting type,
+				    int mbm)
 {
 	int ret;
-	trace_rdev_set_tx_power(&rdev->wiphy, wdev, type, mbm);
-	ret = rdev->ops->set_tx_power(&rdev->wiphy, wdev, type, mbm);
+	trace_rdev_set_tx_power(&rdev->wiphy, wdev, radio_idx, type, mbm);
+	ret = rdev->ops->set_tx_power(&rdev->wiphy, wdev, radio_idx, type,
+				      mbm);
 	trace_rdev_return_int(&rdev->wiphy, ret);
 	return ret;
 }
 
 static inline int rdev_get_tx_power(struct cfg80211_registered_device *rdev,
-				    struct wireless_dev *wdev, unsigned int link_id,
-				    int *dbm)
+				    struct wireless_dev *wdev, int radio_idx,
+				    unsigned int link_id, int *dbm)
 {
 	int ret;
-	trace_rdev_get_tx_power(&rdev->wiphy, wdev, link_id);
-	ret = rdev->ops->get_tx_power(&rdev->wiphy, wdev, link_id, dbm);
+	trace_rdev_get_tx_power(&rdev->wiphy, wdev, radio_idx, link_id);
+	ret = rdev->ops->get_tx_power(&rdev->wiphy, wdev, radio_idx, link_id,
+				      dbm);
 	trace_rdev_return_int_int(&rdev->wiphy, ret, *dbm);
 	return ret;
 }
@@ -857,21 +862,21 @@ rdev_update_mgmt_frame_registrations(struct cfg80211_registered_device *rdev,
 }
 
 static inline int rdev_set_antenna(struct cfg80211_registered_device *rdev,
-				   u32 tx_ant, u32 rx_ant)
+				   int radio_idx, u32 tx_ant, u32 rx_ant)
 {
 	int ret;
-	trace_rdev_set_antenna(&rdev->wiphy, tx_ant, rx_ant);
-	ret = rdev->ops->set_antenna(&rdev->wiphy, tx_ant, rx_ant);
+	trace_rdev_set_antenna(&rdev->wiphy, radio_idx, tx_ant, rx_ant);
+	ret = rdev->ops->set_antenna(&rdev->wiphy, -1, tx_ant, rx_ant);
 	trace_rdev_return_int(&rdev->wiphy, ret);
 	return ret;
 }
 
 static inline int rdev_get_antenna(struct cfg80211_registered_device *rdev,
-				   u32 *tx_ant, u32 *rx_ant)
+				   int radio_idx, u32 *tx_ant, u32 *rx_ant)
 {
 	int ret;
-	trace_rdev_get_antenna(&rdev->wiphy);
-	ret = rdev->ops->get_antenna(&rdev->wiphy, tx_ant, rx_ant);
+	trace_rdev_get_antenna(&rdev->wiphy, radio_idx);
+	ret = rdev->ops->get_antenna(&rdev->wiphy, radio_idx, tx_ant, rx_ant);
 	if (ret)
 		trace_rdev_return_int(&rdev->wiphy, ret);
 	else
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 61a5eca9c513..7e43ab9de923 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -406,9 +406,19 @@ DEFINE_EVENT(wiphy_only_evt, rdev_return_void,
 	TP_ARGS(wiphy)
 );
 
-DEFINE_EVENT(wiphy_only_evt, rdev_get_antenna,
-	TP_PROTO(struct wiphy *wiphy),
-	TP_ARGS(wiphy)
+TRACE_EVENT(rdev_get_antenna,
+	TP_PROTO(struct wiphy *wiphy, int radio_idx),
+	TP_ARGS(wiphy, radio_idx),
+	TP_STRUCT__entry(
+		WIPHY_ENTRY
+		__field(int, radio_idx)
+	),
+	TP_fast_assign(
+		WIPHY_ASSIGN;
+		__entry->radio_idx = radio_idx;
+	),
+	TP_printk(WIPHY_PR_FMT ", radio_idx: %d",
+		  WIPHY_PR_ARG, __entry->radio_idx)
 );
 
 DEFINE_EVENT(wiphy_only_evt, rdev_rfkill_poll,
@@ -1678,18 +1688,20 @@ TRACE_EVENT(rdev_join_ocb,
 );
 
 TRACE_EVENT(rdev_set_wiphy_params,
-	TP_PROTO(struct wiphy *wiphy, u32 changed),
-	TP_ARGS(wiphy, changed),
+	TP_PROTO(struct wiphy *wiphy, int radio_idx, u32 changed),
+	TP_ARGS(wiphy, radio_idx, changed),
 	TP_STRUCT__entry(
 		WIPHY_ENTRY
+		__field(int, radio_idx)
 		__field(u32, changed)
 	),
 	TP_fast_assign(
 		WIPHY_ASSIGN;
+		__entry->radio_idx = radio_idx;
 		__entry->changed = changed;
 	),
-	TP_printk(WIPHY_PR_FMT ", changed: %u",
-		  WIPHY_PR_ARG, __entry->changed)
+	TP_printk(WIPHY_PR_FMT ", radio_idx: %d, changed: %u",
+		  WIPHY_PR_ARG, __entry->radio_idx, __entry->changed)
 );
 
 DECLARE_EVENT_CLASS(wiphy_wdev_link_evt,
@@ -1710,30 +1722,51 @@ DECLARE_EVENT_CLASS(wiphy_wdev_link_evt,
 		  WIPHY_PR_ARG, WDEV_PR_ARG, __entry->link_id)
 );
 
-DEFINE_EVENT(wiphy_wdev_link_evt, rdev_get_tx_power,
+TRACE_EVENT(rdev_get_tx_power,
 	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
-		 unsigned int link_id),
-	TP_ARGS(wiphy, wdev, link_id)
+		 int radio_idx, unsigned int link_id),
+	TP_ARGS(wiphy, wdev, radio_idx, link_id),
+	TP_STRUCT__entry(
+		WIPHY_ENTRY
+		WDEV_ENTRY
+		__field(int, radio_idx)
+		__field(unsigned int, link_id)
+	),
+	TP_fast_assign(
+		WIPHY_ASSIGN;
+		WDEV_ASSIGN;
+		__entry->radio_idx = radio_idx;
+		__entry->link_id = link_id;
+	),
+	TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT
+		  ", radio_idx: %d, link_id: %u",
+		  WIPHY_PR_ARG, WDEV_PR_ARG,
+		  __entry->radio_idx, __entry->link_id)
 );
 
 TRACE_EVENT(rdev_set_tx_power,
 	TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
-		 enum nl80211_tx_power_setting type, int mbm),
-	TP_ARGS(wiphy, wdev, type, mbm),
+		 int radio_idx, enum nl80211_tx_power_setting type,
+		 int mbm),
+	TP_ARGS(wiphy, wdev, radio_idx, type, mbm),
 	TP_STRUCT__entry(
 		WIPHY_ENTRY
 		WDEV_ENTRY
+		__field(int, radio_idx)
 		__field(enum nl80211_tx_power_setting, type)
 		__field(int, mbm)
 	),
 	TP_fast_assign(
 		WIPHY_ASSIGN;
 		WDEV_ASSIGN;
+		__entry->radio_idx = radio_idx;
 		__entry->type = type;
 		__entry->mbm = mbm;
 	),
-	TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", type: %u, mbm: %d",
-		  WIPHY_PR_ARG, WDEV_PR_ARG,__entry->type, __entry->mbm)
+	TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT
+		  ", radio_idx: %d, type: %u, mbm: %d",
+		  WIPHY_PR_ARG, WDEV_PR_ARG,
+		  __entry->radio_idx, __entry->type, __entry->mbm)
 );
 
 TRACE_EVENT(rdev_return_int_int,
@@ -1866,26 +1899,24 @@ TRACE_EVENT(rdev_return_void_tx_rx,
 		  __entry->rx_max)
 );
 
-DECLARE_EVENT_CLASS(tx_rx_evt,
-	TP_PROTO(struct wiphy *wiphy, u32 tx, u32 rx),
-	TP_ARGS(wiphy, tx, rx),
+TRACE_EVENT(rdev_set_antenna,
+	TP_PROTO(struct wiphy *wiphy, int radio_idx, u32 tx, u32 rx),
+	TP_ARGS(wiphy, radio_idx, tx, rx),
 	TP_STRUCT__entry(
 		WIPHY_ENTRY
+		__field(int, radio_idx)
 		__field(u32, tx)
 		__field(u32, rx)
 	),
 	TP_fast_assign(
 		WIPHY_ASSIGN;
+		__entry->radio_idx = radio_idx;
 		__entry->tx = tx;
 		__entry->rx = rx;
 	),
-	TP_printk(WIPHY_PR_FMT ", tx: %u, rx: %u ",
-		  WIPHY_PR_ARG, __entry->tx, __entry->rx)
-);
-
-DEFINE_EVENT(tx_rx_evt, rdev_set_antenna,
-	TP_PROTO(struct wiphy *wiphy, u32 tx, u32 rx),
-	TP_ARGS(wiphy, tx, rx)
+	TP_printk(WIPHY_PR_FMT ", radio_idx: %d, tx: %u, rx: %u ",
+		  WIPHY_PR_ARG, __entry->radio_idx,
+		  __entry->tx, __entry->rx)
 );
 
 DECLARE_EVENT_CLASS(wiphy_netdev_id_evt,
diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c
index a74b1afc594e..1241fda78a68 100644
--- a/net/wireless/wext-compat.c
+++ b/net/wireless/wext-compat.c
@@ -263,7 +263,7 @@ int cfg80211_wext_siwrts(struct net_device *dev,
 	else
 		wdev->wiphy->rts_threshold = rts->value;
 
-	err = rdev_set_wiphy_params(rdev, WIPHY_PARAM_RTS_THRESHOLD);
+	err = rdev_set_wiphy_params(rdev, -1, WIPHY_PARAM_RTS_THRESHOLD);
 	if (err)
 		wdev->wiphy->rts_threshold = orts;
 	return err;
@@ -304,7 +304,7 @@ int cfg80211_wext_siwfrag(struct net_device *dev,
 		wdev->wiphy->frag_threshold = frag->value & ~0x1;
 	}
 
-	err = rdev_set_wiphy_params(rdev, WIPHY_PARAM_FRAG_THRESHOLD);
+	err = rdev_set_wiphy_params(rdev, -1, WIPHY_PARAM_FRAG_THRESHOLD);
 	if (err)
 		wdev->wiphy->frag_threshold = ofrag;
 	return err;
@@ -355,7 +355,7 @@ static int cfg80211_wext_siwretry(struct net_device *dev,
 		changed |= WIPHY_PARAM_RETRY_SHORT;
 	}
 
-	err = rdev_set_wiphy_params(rdev, changed);
+	err = rdev_set_wiphy_params(rdev, -1, changed);
 	if (err) {
 		wdev->wiphy->retry_short = oshort;
 		wdev->wiphy->retry_long = olong;
@@ -890,7 +890,7 @@ static int cfg80211_wext_siwtxpower(struct net_device *dev,
 
 	guard(wiphy)(&rdev->wiphy);
 
-	return rdev_set_tx_power(rdev, wdev, type, DBM_TO_MBM(dbm));
+	return rdev_set_tx_power(rdev, wdev, -1, type, DBM_TO_MBM(dbm));
 }
 
 static int cfg80211_wext_giwtxpower(struct net_device *dev,
@@ -910,7 +910,7 @@ static int cfg80211_wext_giwtxpower(struct net_device *dev,
 		return -EOPNOTSUPP;
 
 	scoped_guard(wiphy, &rdev->wiphy) {
-		err = rdev_get_tx_power(rdev, wdev, 0, &val);
+		err = rdev_get_tx_power(rdev, wdev, -1, 0, &val);
 	}
 	if (err)
 		return err;
-- 
cgit v1.2.3


From 264637941cf45cd3ffe070e25853d7e1a29f2004 Mon Sep 17 00:00:00 2001
From: Roopni Devanathan <quic_rdevanat@quicinc.com>
Date: Sun, 15 Jun 2025 13:53:10 +0530
Subject: wifi: cfg80211: Add Support to Set RTS Threshold for each Radio

Currently, the RTS threshold is set on a per-phy basis, i.e., all the
radios present in a wiphy take the RTS threshold value sent from
userspace. But each radio in a multi-radio wiphy can have different
RTS threshold requirements.

To extend support to set the RTS threshold for each radio, get from the
user the radio for which the RTS threshold needs to be changed. Use the
netlink attribute NL80211_ATTR_WIPHY_RADIO_INDEX to identify the radio of
interest. Create a new structure, wiphy_radio_cfg, and add rts_threshold
to it as a u32 value to store the RTS threshold of each radio in a wiphy,
and allocate memory for it during wiphy registration based on the
wiphy.n_radio value set by drivers. Pass the radio id received from the
user to mac80211 drivers along with its corresponding RTS threshold.

Signed-off-by: Roopni Devanathan <quic_rdevanat@quicinc.com>
Link: https://patch.msgid.link/20250615082312.619639-3-quic_rdevanat@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 18 +++++++++++++++
 net/wireless/core.c    | 19 ++++++++++++++++
 net/wireless/nl80211.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 96 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index ffd9564fc840..0003733b1e77 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -5547,6 +5547,18 @@ struct wiphy_iftype_akm_suites {
 	int n_akm_suites;
 };
 
+/**
+ * struct wiphy_radio_cfg - physical radio config of a wiphy
+ * This structure describes the configurations of a physical radio in a
+ * wiphy. It is used to denote per-radio attributes belonging to a wiphy.
+ *
+ * @rts_threshold: RTS threshold (dot11RTSThreshold);
+ *	-1 (default) = RTS/CTS disabled
+ */
+struct wiphy_radio_cfg {
+	u32 rts_threshold;
+};
+
 /**
  * struct wiphy_radio_freq_range - wiphy frequency range
  * @start_freq:  start range edge frequency (kHz)
@@ -5802,6 +5814,10 @@ struct wiphy_radio {
  *	supports enabling HW timestamping for all peers (i.e. no need to
  *	specify a mac address).
  *
+ * @radio_cfg: configuration of radios belonging to a multi-radio wiphy. This
+ *	struct contains a list of all radio specific attributes and should be
+ *	used only for multi-radio wiphy.
+ *
  * @radio: radios belonging to this wiphy
  * @n_radio: number of radios
  */
@@ -5891,6 +5907,8 @@ struct wiphy {
 	void (*reg_notifier)(struct wiphy *wiphy,
 			     struct regulatory_request *request);
 
+	struct wiphy_radio_cfg *radio_cfg;
+
 	/* fields below are read-only, assigned by cfg80211 */
 
 	const struct ieee80211_regdomain __rcu *regd;
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 5c3c72df0591..f3cd70757ef2 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -995,6 +995,24 @@ int wiphy_register(struct wiphy *wiphy)
 		 wiphy->max_num_akm_suites > CFG80211_MAX_NUM_AKM_SUITES)
 		return -EINVAL;
 
+	/* Allocate radio configuration space for multi-radio wiphy */
+	if (wiphy->n_radio > 0) {
+		int idx;
+
+		wiphy->radio_cfg = kcalloc(wiphy->n_radio,
+					   sizeof(*wiphy->radio_cfg),
+					   GFP_KERNEL);
+		if (!wiphy->radio_cfg)
+			return -ENOMEM;
+		/*
+		 * Initialize wiphy radio parameters to IEEE 802.11
+		 * MIB default values. RTS threshold is disabled by
+		 * default with the special -1 value.
+		 */
+		for (idx = 0; idx < wiphy->n_radio; idx++)
+			wiphy->radio_cfg[idx].rts_threshold = (u32)-1;
+	}
+
 	/* check and set up bitrates */
 	ieee80211_set_bitrate_flags(wiphy);
 
@@ -1222,6 +1240,7 @@ void cfg80211_dev_free(struct cfg80211_registered_device *rdev)
 
 void wiphy_free(struct wiphy *wiphy)
 {
+	kfree(wiphy->radio_cfg);
 	put_device(&wiphy->dev);
 }
 EXPORT_SYMBOL(wiphy_free);
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index b40978549790..b0176090182c 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -3609,6 +3609,33 @@ static int nl80211_set_channel(struct sk_buff *skb, struct genl_info *info)
 	return __nl80211_set_channel(rdev, netdev, info, link_id);
 }
 
+static int nl80211_set_wiphy_radio(struct genl_info *info,
+				   struct cfg80211_registered_device *rdev,
+				   int radio_idx)
+{
+	u32 rts_threshold = 0, old_rts, changed = 0;
+	int result;
+
+	if (!rdev->ops->set_wiphy_params)
+		return -EOPNOTSUPP;
+
+	if (info->attrs[NL80211_ATTR_WIPHY_RTS_THRESHOLD]) {
+		rts_threshold = nla_get_u32(
+				info->attrs[NL80211_ATTR_WIPHY_RTS_THRESHOLD]);
+		changed |= WIPHY_PARAM_RTS_THRESHOLD;
+	}
+
+	old_rts = rdev->wiphy.radio_cfg[radio_idx].rts_threshold;
+
+	rdev->wiphy.radio_cfg[radio_idx].rts_threshold = rts_threshold;
+
+	result = rdev_set_wiphy_params(rdev, radio_idx, changed);
+	if (result)
+		rdev->wiphy.radio_cfg[radio_idx].rts_threshold = old_rts;
+
+	return 0;
+}
+
 static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 {
 	struct cfg80211_registered_device *rdev = NULL;
@@ -3681,6 +3708,8 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 				info->attrs[NL80211_ATTR_WIPHY_RADIO_INDEX]);
 		if (radio_idx >= rdev->wiphy.n_radio)
 			return -EINVAL;
+
+		return nl80211_set_wiphy_radio(info, rdev, radio_idx);
 	}
 
 	if (info->attrs[NL80211_ATTR_WIPHY_TXQ_PARAMS]) {
@@ -3893,16 +3922,30 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 	if (changed) {
 		u8 old_retry_short, old_retry_long;
 		u32 old_frag_threshold, old_rts_threshold;
-		u8 old_coverage_class;
+		u8 old_coverage_class, i;
 		u32 old_txq_limit, old_txq_memory_limit, old_txq_quantum;
+		u32 *old_radio_rts_threshold = NULL;
 
 		if (!rdev->ops->set_wiphy_params)
 			return -EOPNOTSUPP;
 
+		if (rdev->wiphy.n_radio) {
+			old_radio_rts_threshold = kcalloc(rdev->wiphy.n_radio,
+							  sizeof(u32),
+							  GFP_KERNEL);
+			if (!old_radio_rts_threshold)
+				return -ENOMEM;
+		}
+
 		old_retry_short = rdev->wiphy.retry_short;
 		old_retry_long = rdev->wiphy.retry_long;
 		old_frag_threshold = rdev->wiphy.frag_threshold;
 		old_rts_threshold = rdev->wiphy.rts_threshold;
+		if (old_radio_rts_threshold) {
+			for (i = 0 ; i < rdev->wiphy.n_radio; i++)
+				old_radio_rts_threshold[i] =
+					rdev->wiphy.radio_cfg[i].rts_threshold;
+		}
 		old_coverage_class = rdev->wiphy.coverage_class;
 		old_txq_limit = rdev->wiphy.txq_limit;
 		old_txq_memory_limit = rdev->wiphy.txq_memory_limit;
@@ -3914,8 +3957,13 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 			rdev->wiphy.retry_long = retry_long;
 		if (changed & WIPHY_PARAM_FRAG_THRESHOLD)
 			rdev->wiphy.frag_threshold = frag_threshold;
-		if (changed & WIPHY_PARAM_RTS_THRESHOLD)
+		if ((changed & WIPHY_PARAM_RTS_THRESHOLD) &&
+		    old_radio_rts_threshold) {
 			rdev->wiphy.rts_threshold = rts_threshold;
+			for (i = 0 ; i < rdev->wiphy.n_radio; i++)
+				rdev->wiphy.radio_cfg[i].rts_threshold =
+					rdev->wiphy.rts_threshold;
+		}
 		if (changed & WIPHY_PARAM_COVERAGE_CLASS)
 			rdev->wiphy.coverage_class = coverage_class;
 		if (changed & WIPHY_PARAM_TXQ_LIMIT)
@@ -3931,12 +3979,20 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 			rdev->wiphy.retry_long = old_retry_long;
 			rdev->wiphy.frag_threshold = old_frag_threshold;
 			rdev->wiphy.rts_threshold = old_rts_threshold;
+			if (old_radio_rts_threshold) {
+				for (i = 0 ; i < rdev->wiphy.n_radio; i++)
+					rdev->wiphy.radio_cfg[i].rts_threshold =
+						old_radio_rts_threshold[i];
+			}
 			rdev->wiphy.coverage_class = old_coverage_class;
 			rdev->wiphy.txq_limit = old_txq_limit;
 			rdev->wiphy.txq_memory_limit = old_txq_memory_limit;
 			rdev->wiphy.txq_quantum = old_txq_quantum;
-			return result;
 		}
+
+		if (old_rts_threshold)
+			kfree(old_radio_rts_threshold);
+		return result;
 	}
 
 	return 0;
-- 
cgit v1.2.3


From 89595190058c6e9ca4a8ca7d49be3fc8d2395e79 Mon Sep 17 00:00:00 2001
From: Roopni Devanathan <quic_rdevanat@quicinc.com>
Date: Sun, 15 Jun 2025 13:53:11 +0530
Subject: wifi: cfg80211: Report per-radio RTS threshold to userspace

In case of multi-radio wiphys, with per-radio RTS threshold brought
into use, RTS threshold for each radio in a wiphy can be recorded in
wiphy parameter - wiphy_radio_cfg, as an array. Add a new attribute -
NL80211_WIPHY_RADIO_ATTR_RTS_THRESHOLD in nested parameter -
NL80211_ATTR_WIPHY_RADIOS. When a request for getting RTS threshold
for a particular radio is received, parse the radio id and get the
required data. Add this data to the newly added nested attribute
NL80211_WIPHY_RADIO_ATTR_RTS_THRESHOLD. Add support to report this
data to userspace.

Signed-off-by: Roopni Devanathan <quic_rdevanat@quicinc.com>
Link: https://patch.msgid.link/20250615082312.619639-4-quic_rdevanat@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 2 ++
 net/wireless/nl80211.c       | 6 ++++++
 2 files changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 2a71149c3065..39460334dafb 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -8106,6 +8106,7 @@ enum nl80211_ap_settings_flags {
  *	and contains attributes defined in &enum nl80211_if_combination_attrs.
  * @NL80211_WIPHY_RADIO_ATTR_ANTENNA_MASK: bitmask (u32) of antennas
  *	connected to this radio.
+ * @NL80211_WIPHY_RADIO_ATTR_RTS_THRESHOLD: RTS threshold (u32) of this radio.
  *
  * @__NL80211_WIPHY_RADIO_ATTR_LAST: Internal
  * @NL80211_WIPHY_RADIO_ATTR_MAX: Highest attribute
@@ -8117,6 +8118,7 @@ enum nl80211_wiphy_radio_attrs {
 	NL80211_WIPHY_RADIO_ATTR_FREQ_RANGE,
 	NL80211_WIPHY_RADIO_ATTR_INTERFACE_COMBINATION,
 	NL80211_WIPHY_RADIO_ATTR_ANTENNA_MASK,
+	NL80211_WIPHY_RADIO_ATTR_RTS_THRESHOLD,
 
 	/* keep last */
 	__NL80211_WIPHY_RADIO_ATTR_LAST,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index b0176090182c..70bfe2bfdcc7 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -2447,6 +2447,7 @@ fail:
 static int nl80211_put_radio(struct wiphy *wiphy, struct sk_buff *msg, int idx)
 {
 	const struct wiphy_radio *r = &wiphy->radio[idx];
+	const struct wiphy_radio_cfg *rcfg = &wiphy->radio_cfg[idx];
 	struct nlattr *radio, *freq;
 	int i;
 
@@ -2457,6 +2458,11 @@ static int nl80211_put_radio(struct wiphy *wiphy, struct sk_buff *msg, int idx)
 	if (nla_put_u32(msg, NL80211_WIPHY_RADIO_ATTR_INDEX, idx))
 		goto nla_put_failure;
 
+	if (rcfg->rts_threshold &&
+	    nla_put_u32(msg, NL80211_WIPHY_RADIO_ATTR_RTS_THRESHOLD,
+			rcfg->rts_threshold))
+		goto nla_put_failure;
+
 	if (r->antenna_mask &&
 	    nla_put_u32(msg, NL80211_WIPHY_RADIO_ATTR_ANTENNA_MASK,
 			r->antenna_mask))
-- 
cgit v1.2.3


From 5ea255673cdb4a9bf99dd3e4fc9ca1089f5692a3 Mon Sep 17 00:00:00 2001
From: Lachlan Hodges <lachlan.hodges@morsemicro.com>
Date: Tue, 17 Jun 2025 18:06:07 +1000
Subject: wifi: cfg80211: support configuration of S1G station capabilities

Currently there is no support for initialising a peer's S1G capabilities.
This patch adds support for configuring an S1G station.

Signed-off-by: Lachlan Hodges <lachlan.hodges@morsemicro.com>
Link: https://patch.msgid.link/20250617080610.756048-2-lachlan.hodges@morsemicro.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 4 +++-
 net/wireless/nl80211.c | 8 ++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 0003733b1e77..4a092da3a9de 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -560,7 +560,7 @@ struct ieee80211_sta_s1g_cap {
  * @vht_cap: VHT capabilities in this band
  * @s1g_cap: S1G capabilities in this band
  * @edmg_cap: EDMG capabilities in this band
- * @s1g_cap: S1G capabilities in this band (S1B band only, of course)
+ * @s1g_cap: S1G capabilities in this band (S1G band only, of course)
  * @n_iftype_data: number of iftype data entries
  * @iftype_data: interface type data entries.  Note that the bits in
  *	@types_mask inside this structure cannot overlap (i.e. only
@@ -1653,6 +1653,7 @@ struct sta_txpwr {
  * @he_6ghz_capa: HE 6 GHz Band capabilities of station
  * @eht_capa: EHT capabilities of station
  * @eht_capa_len: the length of the EHT capabilities
+ * @s1g_capa: S1G capabilities of station
  */
 struct link_station_parameters {
 	const u8 *mld_mac;
@@ -1671,6 +1672,7 @@ struct link_station_parameters {
 	const struct ieee80211_he_6ghz_capa *he_6ghz_capa;
 	const struct ieee80211_eht_cap_elem *eht_capa;
 	u8 eht_capa_len;
+	const struct ieee80211_s1g_cap *s1g_capa;
 };
 
 /**
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 70bfe2bfdcc7..70ca74a75f22 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -7856,6 +7856,10 @@ static int nl80211_set_station_tdls(struct genl_info *info,
 		}
 	}
 
+	if (info->attrs[NL80211_ATTR_S1G_CAPABILITY])
+		params->link_sta_params.s1g_capa =
+			nla_data(info->attrs[NL80211_ATTR_S1G_CAPABILITY]);
+
 	err = nl80211_parse_sta_channel_info(info, params);
 	if (err)
 		return err;
@@ -8182,6 +8186,10 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
 		params.link_sta_params.he_6ghz_capa =
 			nla_data(info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]);
 
+	if (info->attrs[NL80211_ATTR_S1G_CAPABILITY])
+		params.link_sta_params.s1g_capa =
+			nla_data(info->attrs[NL80211_ATTR_S1G_CAPABILITY]);
+
 	if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) {
 		params.link_sta_params.opmode_notif_used = true;
 		params.link_sta_params.opmode_notif =
-- 
cgit v1.2.3


From 2a8a6b7c4cb03808a707ae19b2f0c5eb9b631e9e Mon Sep 17 00:00:00 2001
From: Lachlan Hodges <lachlan.hodges@morsemicro.com>
Date: Tue, 17 Jun 2025 18:06:08 +1000
Subject: wifi: mac80211: handle station association response with S1G

Add support for updating the station's S1G capabilities when
an S1G association occurs.

Signed-off-by: Lachlan Hodges <lachlan.hodges@morsemicro.com>
Link: https://patch.msgid.link/20250617080610.756048-3-lachlan.hodges@morsemicro.com
[remove unused S1G_CAP3_MAX_MPDU_LEN_3895/_7791]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     |  2 ++
 net/mac80211/ieee80211_i.h |  3 +++
 net/mac80211/mlme.c        |  6 ++++++
 net/mac80211/s1g.c         | 26 ++++++++++++++++++++++++++
 4 files changed, 37 insertions(+)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index a0de0da4d79b..dcd5969bb559 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -2428,6 +2428,7 @@ struct ieee80211_sta_aggregates {
  * @he_cap: HE capabilities of this STA
  * @he_6ghz_capa: on 6 GHz, holds the HE 6 GHz band capabilities
  * @eht_cap: EHT capabilities of this STA
+ * @s1g_cap: S1G capabilities of this STA
  * @agg: per-link data for multi-link aggregation
  * @bandwidth: current bandwidth the station can receive with
  * @rx_nss: in HT/VHT, the maximum number of spatial streams the
@@ -2450,6 +2451,7 @@ struct ieee80211_link_sta {
 	struct ieee80211_sta_he_cap he_cap;
 	struct ieee80211_he_6ghz_capa he_6ghz_capa;
 	struct ieee80211_sta_eht_cap eht_cap;
+	struct ieee80211_sta_s1g_cap s1g_cap;
 
 	struct ieee80211_sta_aggregates agg;
 
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index f59a5b38e6f2..4ef7b3656aca 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -2270,6 +2270,9 @@ void ieee80211_s1g_rx_twt_action(struct ieee80211_sub_if_data *sdata,
 				 struct sk_buff *skb);
 void ieee80211_s1g_status_twt_action(struct ieee80211_sub_if_data *sdata,
 				     struct sk_buff *skb);
+void ieee80211_s1g_cap_to_sta_s1g_cap(struct ieee80211_sub_if_data *sdata,
+				      const struct ieee80211_s1g_cap *s1g_cap_ie,
+				      struct link_sta_info *link_sta);
 
 /* Spectrum management */
 void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata,
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index d526f2fe9fe5..6001c8897d7c 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -5399,6 +5399,12 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link,
 		bss_conf->epcs_support = false;
 	}
 
+	if (elems->s1g_oper &&
+	    link->u.mgd.conn.mode == IEEE80211_CONN_MODE_S1G &&
+	    elems->s1g_capab)
+		ieee80211_s1g_cap_to_sta_s1g_cap(sdata, elems->s1g_capab,
+						 link_sta);
+
 	bss_conf->twt_broadcast =
 		ieee80211_twt_bcast_support(sdata, bss_conf, sband, link_sta);
 
diff --git a/net/mac80211/s1g.c b/net/mac80211/s1g.c
index d4ed0c0a335c..1f68df6e8067 100644
--- a/net/mac80211/s1g.c
+++ b/net/mac80211/s1g.c
@@ -194,3 +194,29 @@ void ieee80211_s1g_status_twt_action(struct ieee80211_sub_if_data *sdata,
 		break;
 	}
 }
+
+void ieee80211_s1g_cap_to_sta_s1g_cap(struct ieee80211_sub_if_data *sdata,
+				      const struct ieee80211_s1g_cap *s1g_cap_ie,
+				      struct link_sta_info *link_sta)
+{
+	struct ieee80211_sta_s1g_cap *s1g_cap = &link_sta->pub->s1g_cap;
+
+	memset(s1g_cap, 0, sizeof(*s1g_cap));
+
+	memcpy(s1g_cap->cap, s1g_cap_ie->capab_info, sizeof(s1g_cap->cap));
+	memcpy(s1g_cap->nss_mcs, s1g_cap_ie->supp_mcs_nss,
+	       sizeof(s1g_cap->nss_mcs));
+
+	s1g_cap->s1g = true;
+
+	/* Maximum MPDU length is 1 bit for S1G */
+	if (s1g_cap->cap[3] & S1G_CAP3_MAX_MPDU_LEN) {
+		link_sta->pub->agg.max_amsdu_len =
+			IEEE80211_MAX_MPDU_LEN_VHT_7991;
+	} else {
+		link_sta->pub->agg.max_amsdu_len =
+			IEEE80211_MAX_MPDU_LEN_VHT_3895;
+	}
+
+	ieee80211_sta_recalc_aggregates(&link_sta->sta->sta);
+}
-- 
cgit v1.2.3


From a4c746f06853f91d3759ae8aca514d135b6aa56d Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 24 Jun 2025 15:48:42 +0200
Subject: uapi/fcntl: mark range as reserved

Mark the range from -10000 to -40000 as a range reserved for special
in-kernel values. Move the PIDFD_SELF_*/PIDFD_THREAD_* sentinels over so
all the special values are in one place.

Link: https://lore.kernel.org/20250624-work-pidfs-fhandle-v2-6-d02a04858fe3@kernel.org
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/uapi/linux/fcntl.h | 16 ++++++++++++++++
 include/uapi/linux/pidfd.h | 15 ---------------
 2 files changed, 16 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index a15ac2fa4b20..ed02d8ae0667 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -90,10 +90,26 @@
 #define DN_ATTRIB	0x00000020	/* File changed attibutes */
 #define DN_MULTISHOT	0x80000000	/* Don't remove notifier */
 
+/* Reserved kernel ranges [-100], [-10000, -40000]. */
 #define AT_FDCWD		-100    /* Special value for dirfd used to
 					   indicate openat should use the
 					   current working directory. */
 
+/*
+ * The concept of process and threads in userland and the kernel is a confusing
+ * one - within the kernel every thread is a 'task' with its own individual PID,
+ * however from userland's point of view threads are grouped by a single PID,
+ * which is that of the 'thread group leader', typically the first thread
+ * spawned.
+ *
+ * To cut the Gideon knot, for internal kernel usage, we refer to
+ * PIDFD_SELF_THREAD to refer to the current thread (or task from a kernel
+ * perspective), and PIDFD_SELF_THREAD_GROUP to refer to the current thread
+ * group leader...
+ */
+#define PIDFD_SELF_THREAD		-10000 /* Current thread. */
+#define PIDFD_SELF_THREAD_GROUP		-20000 /* Current thread group leader. */
+
 
 /* Generic flags for the *at(2) family of syscalls. */
 
diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
index c27a4e238e4b..957db425d459 100644
--- a/include/uapi/linux/pidfd.h
+++ b/include/uapi/linux/pidfd.h
@@ -42,21 +42,6 @@
 #define PIDFD_COREDUMP_USER	(1U << 2) /* coredump was done as the user. */
 #define PIDFD_COREDUMP_ROOT	(1U << 3) /* coredump was done as root. */
 
-/*
- * The concept of process and threads in userland and the kernel is a confusing
- * one - within the kernel every thread is a 'task' with its own individual PID,
- * however from userland's point of view threads are grouped by a single PID,
- * which is that of the 'thread group leader', typically the first thread
- * spawned.
- *
- * To cut the Gideon knot, for internal kernel usage, we refer to
- * PIDFD_SELF_THREAD to refer to the current thread (or task from a kernel
- * perspective), and PIDFD_SELF_THREAD_GROUP to refer to the current thread
- * group leader...
- */
-#define PIDFD_SELF_THREAD		-10000 /* Current thread. */
-#define PIDFD_SELF_THREAD_GROUP		-20000 /* Current thread group leader. */
-
 /*
  * ...and for userland we make life simpler - PIDFD_SELF refers to the current
  * thread, PIDFD_SELF_PROCESS refers to the process thread group leader.
-- 
cgit v1.2.3


From 67fcec2919e4ed31ab845eb456ad7d6f1e85505c Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 24 Jun 2025 15:48:49 +0200
Subject: fcntl/pidfd: redefine PIDFD_SELF_THREAD_GROUP

Don't jump somewhere into the middle of the reserved range. We're still
able to change that value as it won't be that widely used yet. If not, we
can revert.

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/uapi/linux/fcntl.h            | 2 +-
 tools/testing/selftests/pidfd/pidfd.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index ed02d8ae0667..ba4a698d2f33 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -108,7 +108,7 @@
  * group leader...
  */
 #define PIDFD_SELF_THREAD		-10000 /* Current thread. */
-#define PIDFD_SELF_THREAD_GROUP		-20000 /* Current thread group leader. */
+#define PIDFD_SELF_THREAD_GROUP		-10001 /* Current thread group leader. */
 
 
 /* Generic flags for the *at(2) family of syscalls. */
diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h
index efd74063126e..5dfeb1bdf399 100644
--- a/tools/testing/selftests/pidfd/pidfd.h
+++ b/tools/testing/selftests/pidfd/pidfd.h
@@ -56,7 +56,7 @@
 #endif
 
 #ifndef PIDFD_SELF_THREAD_GROUP
-#define PIDFD_SELF_THREAD_GROUP		-20000 /* Current thread group leader. */
+#define PIDFD_SELF_THREAD_GROUP		-10001 /* Current thread group leader. */
 #endif
 
 #ifndef PIDFD_SELF
-- 
cgit v1.2.3


From cd5d2006327b6d8488612cb8c03ad7304417c8f2 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 24 Jun 2025 10:29:10 +0200
Subject: uapi/fcntl: add FD_INVALID

Add a marker for an invalid file descriptor.

Link: https://lore.kernel.org/20250624-work-pidfs-fhandle-v2-7-d02a04858fe3@kernel.org
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/uapi/linux/fcntl.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index ba4a698d2f33..a5bebe7c4400 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -110,6 +110,7 @@
 #define PIDFD_SELF_THREAD		-10000 /* Current thread. */
 #define PIDFD_SELF_THREAD_GROUP		-10001 /* Current thread group leader. */
 
+#define FD_INVALID			-10009 /* Invalid file descriptor: -10000 - EBADF = -10009 */
 
 /* Generic flags for the *at(2) family of syscalls. */
 
-- 
cgit v1.2.3


From b39f7d75dc41b5f5d028192cd5d66cff71179f35 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 24 Jun 2025 14:21:27 +0100
Subject: fs: Remove three arguments from block_write_end()

block_write_end() looks like it can be used as a ->write_end()
implementation.  However, it can't as it does not unlock nor put
the folio.  Since it does not use the 'file', 'mapping' nor 'fsdata'
arguments, remove them.

Signed-off-by: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Link: https://lore.kernel.org/20250624132130.1590285-1-willy@infradead.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 block/fops.c                | 2 +-
 fs/buffer.c                 | 7 +++----
 fs/ext2/dir.c               | 2 +-
 fs/ext4/inode.c             | 5 ++---
 fs/iomap/buffered-io.c      | 3 +--
 fs/minix/dir.c              | 2 +-
 fs/nilfs2/dir.c             | 2 +-
 fs/nilfs2/recovery.c        | 3 +--
 fs/ufs/dir.c                | 2 +-
 include/linux/buffer_head.h | 4 +---
 10 files changed, 13 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/block/fops.c b/block/fops.c
index 1309861d4c2c..35cea0cb304d 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -507,7 +507,7 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping,
 		void *fsdata)
 {
 	int ret;
-	ret = block_write_end(file, mapping, pos, len, copied, folio, fsdata);
+	ret = block_write_end(pos, len, copied, folio);
 
 	folio_unlock(folio);
 	folio_put(folio);
diff --git a/fs/buffer.c b/fs/buffer.c
index 1d34200f69c8..d61073143127 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2270,9 +2270,8 @@ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
 }
 EXPORT_SYMBOL(block_write_begin);
 
-int block_write_end(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len, unsigned copied,
-			struct folio *folio, void *fsdata)
+int block_write_end(loff_t pos, unsigned len, unsigned copied,
+		struct folio *folio)
 {
 	size_t start = pos - folio_pos(folio);
 
@@ -2311,7 +2310,7 @@ int generic_write_end(struct file *file, struct address_space *mapping,
 	loff_t old_size = inode->i_size;
 	bool i_size_changed = false;
 
-	copied = block_write_end(file, mapping, pos, len, copied, folio, fsdata);
+	copied = block_write_end(pos, len, copied, folio);
 
 	/*
 	 * No need to use i_size_read() here, the i_size cannot change under us
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 402fecf90a44..b07b3b369710 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -87,7 +87,7 @@ static void ext2_commit_chunk(struct folio *folio, loff_t pos, unsigned len)
 	struct inode *dir = mapping->host;
 
 	inode_inc_iversion(dir);
-	block_write_end(NULL, mapping, pos, len, len, folio, NULL);
+	block_write_end(pos, len, len, folio);
 
 	if (pos+len > dir->i_size) {
 		i_size_write(dir, pos+len);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index be9a4cba35fd..e6aa7ca6d842 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1424,7 +1424,7 @@ static int ext4_write_end(struct file *file,
 		return ext4_write_inline_data_end(inode, pos, len, copied,
 						  folio);
 
-	copied = block_write_end(file, mapping, pos, len, copied, folio, fsdata);
+	copied = block_write_end(pos, len, copied, folio);
 	/*
 	 * it's important to update i_size while still holding folio lock:
 	 * page writeout could otherwise come in and zero beyond i_size.
@@ -3144,8 +3144,7 @@ static int ext4_da_do_write_end(struct address_space *mapping,
 	 * block_write_end() will mark the inode as dirty with I_DIRTY_PAGES
 	 * flag, which all that's needed to trigger page writeback.
 	 */
-	copied = block_write_end(NULL, mapping, pos, len, copied,
-			folio, NULL);
+	copied = block_write_end(pos, len, copied, folio);
 	new_i_size = pos + copied;
 
 	/*
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 3729391a18f3..775e8f1f0286 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -923,8 +923,7 @@ static bool iomap_write_end(struct iomap_iter *iter, size_t len, size_t copied,
 	if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
 		size_t bh_written;
 
-		bh_written = block_write_end(NULL, iter->inode->i_mapping, pos,
-					len, copied, folio, NULL);
+		bh_written = block_write_end(pos, len, copied, folio);
 		WARN_ON_ONCE(bh_written != copied && bh_written != 0);
 		return bh_written == copied;
 	}
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index dd2a425b41f0..19052fc47e9e 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -45,7 +45,7 @@ static void dir_commit_chunk(struct folio *folio, loff_t pos, unsigned len)
 	struct address_space *mapping = folio->mapping;
 	struct inode *dir = mapping->host;
 
-	block_write_end(NULL, mapping, pos, len, len, folio, NULL);
+	block_write_end(pos, len, len, folio);
 
 	if (pos+len > dir->i_size) {
 		i_size_write(dir, pos+len);
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 9b7f8e9655a2..6ca3d74be1e1 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -96,7 +96,7 @@ static void nilfs_commit_chunk(struct folio *folio,
 	int err;
 
 	nr_dirty = nilfs_page_count_clean_buffers(folio, from, to);
-	copied = block_write_end(NULL, mapping, pos, len, len, folio, NULL);
+	copied = block_write_end(pos, len, len, folio);
 	if (pos + copied > dir->i_size)
 		i_size_write(dir, pos + copied);
 	if (IS_DIRSYNC(dir))
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 22aecf6e2344..a9c61d0492cb 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -560,8 +560,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
 		if (unlikely(err))
 			goto failed_folio;
 
-		block_write_end(NULL, inode->i_mapping, pos, blocksize,
-				blocksize, folio, NULL);
+		block_write_end(pos, blocksize, blocksize, folio);
 
 		folio_unlock(folio);
 		folio_put(folio);
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 88d0062cfdb9..0388a1bae326 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -48,7 +48,7 @@ static void ufs_commit_chunk(struct folio *folio, loff_t pos, unsigned len)
 	struct inode *dir = mapping->host;
 
 	inode_inc_iversion(dir);
-	block_write_end(NULL, mapping, pos, len, len, folio, NULL);
+	block_write_end(pos, len, len, folio);
 	if (pos+len > dir->i_size) {
 		i_size_write(dir, pos+len);
 		mark_inode_dirty(dir);
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 0029ff880e27..178eb90e9cf3 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -262,9 +262,7 @@ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
 		struct folio **foliop, get_block_t *get_block);
 int __block_write_begin(struct folio *folio, loff_t pos, unsigned len,
 		get_block_t *get_block);
-int block_write_end(struct file *, struct address_space *,
-				loff_t, unsigned len, unsigned copied,
-				struct folio *, void *);
+int block_write_end(loff_t pos, unsigned len, unsigned copied, struct folio *);
 int generic_write_end(struct file *, struct address_space *,
 				loff_t, unsigned len, unsigned copied,
 				struct folio *, void *);
-- 
cgit v1.2.3


From 6241b49540a65a6d5274fa938fd3eb4cbfe2e076 Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Tue, 24 Jun 2025 10:06:41 +0200
Subject: tty: fix tty_port_tty_*hangup() kernel-doc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The commit below added a new helper, but omitted to move (and add) the
corresponding kernel-doc. Do it now.

Signed-off-by: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Fixes: 2b5eac0f8c6e ("tty: introduce and use tty_port_tty_vhangup() helper")
Link: https://lore.kernel.org/all/b23d566c-09dc-7374-cc87-0ad4660e8b2e@linux.intel.com/
Reported-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: linux-doc@vger.kernel.org
Link: https://lore.kernel.org/r/20250624080641.509959-6-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/driver-api/tty/tty_port.rst | 5 +++--
 drivers/tty/tty_port.c                    | 5 -----
 include/linux/tty_port.h                  | 9 +++++++++
 3 files changed, 12 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/Documentation/driver-api/tty/tty_port.rst b/Documentation/driver-api/tty/tty_port.rst
index 5cb90e954fcf..504a353f2682 100644
--- a/Documentation/driver-api/tty/tty_port.rst
+++ b/Documentation/driver-api/tty/tty_port.rst
@@ -42,9 +42,10 @@ TTY Refcounting
 TTY Helpers
 -----------
 
+.. kernel-doc::  include/linux/tty_port.h
+   :identifiers: tty_port_tty_hangup tty_port_tty_vhangup
 .. kernel-doc::  drivers/tty/tty_port.c
-   :identifiers: tty_port_tty_hangup tty_port_tty_wakeup
-
+   :identifiers: tty_port_tty_wakeup
 
 Modem Signals
 -------------
diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c
index 903eebdbe12d..5b4d5fb99a59 100644
--- a/drivers/tty/tty_port.c
+++ b/drivers/tty/tty_port.c
@@ -391,11 +391,6 @@ void tty_port_hangup(struct tty_port *port)
 }
 EXPORT_SYMBOL(tty_port_hangup);
 
-/**
- * tty_port_tty_hangup - helper to hang up a tty
- * @port: tty port
- * @check_clocal: hang only ttys with %CLOCAL unset?
- */
 void __tty_port_tty_hangup(struct tty_port *port, bool check_clocal, bool async)
 {
 	struct tty_struct *tty = tty_port_tty_get(port);
diff --git a/include/linux/tty_port.h b/include/linux/tty_port.h
index 021f9a8415c0..332ddb93603e 100644
--- a/include/linux/tty_port.h
+++ b/include/linux/tty_port.h
@@ -251,11 +251,20 @@ static inline int tty_port_users(struct tty_port *port)
 	return port->count + port->blocked_open;
 }
 
+/**
+ * tty_port_tty_hangup - helper to hang up a tty asynchronously
+ * @port: tty port
+ * @check_clocal: hang only ttys with %CLOCAL unset?
+ */
 static inline void tty_port_tty_hangup(struct tty_port *port, bool check_clocal)
 {
 	__tty_port_tty_hangup(port, check_clocal, true);
 }
 
+/**
+ * tty_port_tty_vhangup - helper to hang up a tty synchronously
+ * @port: tty port
+ */
 static inline void tty_port_tty_vhangup(struct tty_port *port)
 {
 	__tty_port_tty_hangup(port, false, false);
-- 
cgit v1.2.3


From af4db5a35a4ef7a68046883bfd12468007db38f1 Mon Sep 17 00:00:00 2001
From: RD Babiera <rdbabiera@google.com>
Date: Wed, 18 Jun 2025 22:49:42 +0000
Subject: usb: typec: altmodes/displayport: do not index invalid
 pin_assignments

A poorly implemented DisplayPort Alt Mode port partner can indicate
that its pin assignment capabilities are greater than the maximum
value, DP_PIN_ASSIGN_F. In this case, calls to pin_assignment_show
will cause a BRK exception due to an out of bounds array access.

Prevent the for loop in pin_assignment_show from accessing
invalid values in pin_assignments by adding a DP_PIN_ASSIGN_MAX
value to typec_dp.h and using i < DP_PIN_ASSIGN_MAX as a loop
condition.

Fixes: 0e3bb7d6894d ("usb: typec: Add driver for DisplayPort alternate mode")
Cc: stable <stable@kernel.org>
Signed-off-by: RD Babiera <rdbabiera@google.com>
Reviewed-by: Badhri Jagan Sridharan <badhri@google.com>
Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Link: https://lore.kernel.org/r/20250618224943.3263103-2-rdbabiera@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/typec/altmodes/displayport.c | 2 +-
 include/linux/usb/typec_dp.h             | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/usb/typec/altmodes/displayport.c b/drivers/usb/typec/altmodes/displayport.c
index b09b58d7311d..773786129dfb 100644
--- a/drivers/usb/typec/altmodes/displayport.c
+++ b/drivers/usb/typec/altmodes/displayport.c
@@ -677,7 +677,7 @@ static ssize_t pin_assignment_show(struct device *dev,
 
 	assignments = get_current_pin_assignments(dp);
 
-	for (i = 0; assignments; assignments >>= 1, i++) {
+	for (i = 0; assignments && i < DP_PIN_ASSIGN_MAX; assignments >>= 1, i++) {
 		if (assignments & 1) {
 			if (i == cur)
 				len += sprintf(buf + len, "[%s] ",
diff --git a/include/linux/usb/typec_dp.h b/include/linux/usb/typec_dp.h
index f2da264d9c14..acb0ad03bdac 100644
--- a/include/linux/usb/typec_dp.h
+++ b/include/linux/usb/typec_dp.h
@@ -57,6 +57,7 @@ enum {
 	DP_PIN_ASSIGN_D,
 	DP_PIN_ASSIGN_E,
 	DP_PIN_ASSIGN_F, /* Not supported after v1.0b */
+	DP_PIN_ASSIGN_MAX,
 };
 
 /* DisplayPort alt mode specific commands */
-- 
cgit v1.2.3


From 3941e37f62fe2c3c8b8675c12183185f20450539 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 24 Jun 2025 16:57:51 +0200
Subject: uapi/fcntl: add FD_PIDFS_ROOT

Add a special file descriptor indicating the root of the pidfs
filesystem.

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/uapi/linux/fcntl.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index a5bebe7c4400..f291ab4f94eb 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -110,6 +110,7 @@
 #define PIDFD_SELF_THREAD		-10000 /* Current thread. */
 #define PIDFD_SELF_THREAD_GROUP		-10001 /* Current thread group leader. */
 
+#define FD_PIDFS_ROOT			-10002 /* Root of the pidfs filesystem */
 #define FD_INVALID			-10009 /* Invalid file descriptor: -10000 - EBADF = -10009 */
 
 /* Generic flags for the *at(2) family of syscalls. */
-- 
cgit v1.2.3


From a02fd05661d73a8507dd70dd820e9b984490c545 Mon Sep 17 00:00:00 2001
From: Huacai Chen <chenhuacai@loongson.cn>
Date: Tue, 24 Jun 2025 14:29:27 +0800
Subject: PCI: Extend isolated function probing to LoongArch

Like s390 and the jailhouse hypervisor, LoongArch's PCI architecture allows
passing isolated PCI functions to a guest OS instance. So it is possible
that there is a multi-function device without function 0 for the host or
guest.

Allow probing such functions by adding an IS_ENABLED(CONFIG_LOONGARCH) case
in the hypervisor_isolated_pci_functions() helper.

This is similar to commit 189c6c33ff42 ("PCI: Extend isolated function
probing to s390").

Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: stable@vger.kernel.org
Link: https://patch.msgid.link/20250624062927.4037734-1-chenhuacai@loongson.cn
---
 include/linux/hypervisor.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/hypervisor.h b/include/linux/hypervisor.h
index 9efbc54e35e5..be5417303ecf 100644
--- a/include/linux/hypervisor.h
+++ b/include/linux/hypervisor.h
@@ -37,6 +37,9 @@ static inline bool hypervisor_isolated_pci_functions(void)
 	if (IS_ENABLED(CONFIG_S390))
 		return true;
 
+	if (IS_ENABLED(CONFIG_LOONGARCH))
+		return true;
+
 	return jailhouse_paravirt();
 }
 
-- 
cgit v1.2.3


From 2ce80bc11ff91913bc6eb31a5876621f090cf5cc Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Sat, 14 Jun 2025 02:03:43 +0100
Subject: misc: vmw_vmci: Remove unused vmci_doorbell_notify

vmci_doorbell_notify() was added in 2013 by
commit 83e2ec765be0 ("VMCI: doorbell implementation.")

but has remained unused.

Remove it.

Signed-off-by: "Dr. David Alan Gilbert" <linux@treblig.org>
Link: https://lore.kernel.org/r/20250614010344.636076-3-linux@treblig.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/vmw_vmci/vmci_doorbell.c | 53 -----------------------------------
 include/linux/vmw_vmci_api.h          |  1 -
 2 files changed, 54 deletions(-)

(limited to 'include')

diff --git a/drivers/misc/vmw_vmci/vmci_doorbell.c b/drivers/misc/vmw_vmci/vmci_doorbell.c
index fa8a7fce4481..53eeb9e6cb56 100644
--- a/drivers/misc/vmw_vmci/vmci_doorbell.c
+++ b/drivers/misc/vmw_vmci/vmci_doorbell.c
@@ -257,23 +257,6 @@ static int dbell_unlink(struct vmci_handle handle)
 	return vmci_send_datagram(&unlink_msg.hdr);
 }
 
-/*
- * Notify another guest or the host.  We send a datagram down to the
- * host via the hypervisor with the notification info.
- */
-static int dbell_notify_as_guest(struct vmci_handle handle, u32 priv_flags)
-{
-	struct vmci_doorbell_notify_msg notify_msg;
-
-	notify_msg.hdr.dst = vmci_make_handle(VMCI_HYPERVISOR_CONTEXT_ID,
-					      VMCI_DOORBELL_NOTIFY);
-	notify_msg.hdr.src = VMCI_ANON_SRC_HANDLE;
-	notify_msg.hdr.payload_size = sizeof(notify_msg) - VMCI_DG_HEADERSIZE;
-	notify_msg.handle = handle;
-
-	return vmci_send_datagram(&notify_msg.hdr);
-}
-
 /*
  * Calls the specified callback in a delayed context.
  */
@@ -566,39 +549,3 @@ int vmci_doorbell_destroy(struct vmci_handle handle)
 	return VMCI_SUCCESS;
 }
 EXPORT_SYMBOL_GPL(vmci_doorbell_destroy);
-
-/*
- * vmci_doorbell_notify() - Ring the doorbell (and hide in the bushes).
- * @dst:        The handlle identifying the doorbell resource
- * @priv_flags: Priviledge flags.
- *
- * Generates a notification on the doorbell identified by the
- * handle. For host side generation of notifications, the caller
- * can specify what the privilege of the calling side is.
- */
-int vmci_doorbell_notify(struct vmci_handle dst, u32 priv_flags)
-{
-	int retval;
-	enum vmci_route route;
-	struct vmci_handle src;
-
-	if (vmci_handle_is_invalid(dst) ||
-	    (priv_flags & ~VMCI_PRIVILEGE_ALL_FLAGS))
-		return VMCI_ERROR_INVALID_ARGS;
-
-	src = VMCI_INVALID_HANDLE;
-	retval = vmci_route(&src, &dst, false, &route);
-	if (retval < VMCI_SUCCESS)
-		return retval;
-
-	if (VMCI_ROUTE_AS_HOST == route)
-		return vmci_ctx_notify_dbell(VMCI_HOST_CONTEXT_ID,
-					     dst, priv_flags);
-
-	if (VMCI_ROUTE_AS_GUEST == route)
-		return dbell_notify_as_guest(dst, priv_flags);
-
-	pr_warn("Unknown route (%d) for doorbell\n", route);
-	return VMCI_ERROR_DST_UNREACHABLE;
-}
-EXPORT_SYMBOL_GPL(vmci_doorbell_notify);
diff --git a/include/linux/vmw_vmci_api.h b/include/linux/vmw_vmci_api.h
index f28907345c80..28a3b6a9e1ca 100644
--- a/include/linux/vmw_vmci_api.h
+++ b/include/linux/vmw_vmci_api.h
@@ -35,7 +35,6 @@ int vmci_doorbell_create(struct vmci_handle *handle, u32 flags,
 			 u32 priv_flags,
 			 vmci_callback notify_cb, void *client_data);
 int vmci_doorbell_destroy(struct vmci_handle handle);
-int vmci_doorbell_notify(struct vmci_handle handle, u32 priv_flags);
 u32 vmci_get_context_id(void);
 bool vmci_is_context_owner(u32 context_id, kuid_t uid);
 int vmci_register_vsock_callback(vmci_vsock_cb callback);
-- 
cgit v1.2.3


From ea6895f021c0716a280eff576d3f64f549187faa Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Sat, 14 Jun 2025 02:03:44 +0100
Subject: misc: vmw_vmci: Remove unused qpair functions

vmci_qpair_dequeue(), vmci_qpair_enqueue() and vmci_qpair_peek()
were added in 2013 by
commit 06164d2b72aa ("VMCI: queue pairs implementation.")

but have remained unused.

Remove them.

(The iov version of those functions is used)

Signed-off-by: "Dr. David Alan Gilbert" <linux@treblig.org>
Link: https://lore.kernel.org/r/20250614010344.636076-4-linux@treblig.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/vmw_vmci/vmci_queue_pair.c | 133 --------------------------------
 include/linux/vmw_vmci_api.h            |   6 --
 2 files changed, 139 deletions(-)

(limited to 'include')

diff --git a/drivers/misc/vmw_vmci/vmci_queue_pair.c b/drivers/misc/vmw_vmci/vmci_queue_pair.c
index 73d71c4ec139..b88ac144ad32 100644
--- a/drivers/misc/vmw_vmci/vmci_queue_pair.c
+++ b/drivers/misc/vmw_vmci/vmci_queue_pair.c
@@ -3022,139 +3022,6 @@ s64 vmci_qpair_consume_buf_ready(const struct vmci_qp *qpair)
 }
 EXPORT_SYMBOL_GPL(vmci_qpair_consume_buf_ready);
 
-/*
- * vmci_qpair_enqueue() - Throw data on the queue.
- * @qpair:      Pointer to the queue pair struct.
- * @buf:        Pointer to buffer containing data
- * @buf_size:   Length of buffer.
- * @buf_type:   Buffer type (Unused).
- *
- * This is the client interface for enqueueing data into the queue.
- * Returns number of bytes enqueued or < 0 on error.
- */
-ssize_t vmci_qpair_enqueue(struct vmci_qp *qpair,
-			   const void *buf,
-			   size_t buf_size,
-			   int buf_type)
-{
-	ssize_t result;
-	struct iov_iter from;
-	struct kvec v = {.iov_base = (void *)buf, .iov_len = buf_size};
-
-	if (!qpair || !buf)
-		return VMCI_ERROR_INVALID_ARGS;
-
-	iov_iter_kvec(&from, ITER_SOURCE, &v, 1, buf_size);
-
-	qp_lock(qpair);
-
-	do {
-		result = qp_enqueue_locked(qpair->produce_q,
-					   qpair->consume_q,
-					   qpair->produce_q_size,
-					   &from);
-
-		if (result == VMCI_ERROR_QUEUEPAIR_NOT_READY &&
-		    !qp_wait_for_ready_queue(qpair))
-			result = VMCI_ERROR_WOULD_BLOCK;
-
-	} while (result == VMCI_ERROR_QUEUEPAIR_NOT_READY);
-
-	qp_unlock(qpair);
-
-	return result;
-}
-EXPORT_SYMBOL_GPL(vmci_qpair_enqueue);
-
-/*
- * vmci_qpair_dequeue() - Get data from the queue.
- * @qpair:      Pointer to the queue pair struct.
- * @buf:        Pointer to buffer for the data
- * @buf_size:   Length of buffer.
- * @buf_type:   Buffer type (Unused).
- *
- * This is the client interface for dequeueing data from the queue.
- * Returns number of bytes dequeued or < 0 on error.
- */
-ssize_t vmci_qpair_dequeue(struct vmci_qp *qpair,
-			   void *buf,
-			   size_t buf_size,
-			   int buf_type)
-{
-	ssize_t result;
-	struct iov_iter to;
-	struct kvec v = {.iov_base = buf, .iov_len = buf_size};
-
-	if (!qpair || !buf)
-		return VMCI_ERROR_INVALID_ARGS;
-
-	iov_iter_kvec(&to, ITER_DEST, &v, 1, buf_size);
-
-	qp_lock(qpair);
-
-	do {
-		result = qp_dequeue_locked(qpair->produce_q,
-					   qpair->consume_q,
-					   qpair->consume_q_size,
-					   &to, true);
-
-		if (result == VMCI_ERROR_QUEUEPAIR_NOT_READY &&
-		    !qp_wait_for_ready_queue(qpair))
-			result = VMCI_ERROR_WOULD_BLOCK;
-
-	} while (result == VMCI_ERROR_QUEUEPAIR_NOT_READY);
-
-	qp_unlock(qpair);
-
-	return result;
-}
-EXPORT_SYMBOL_GPL(vmci_qpair_dequeue);
-
-/*
- * vmci_qpair_peek() - Peek at the data in the queue.
- * @qpair:      Pointer to the queue pair struct.
- * @buf:        Pointer to buffer for the data
- * @buf_size:   Length of buffer.
- * @buf_type:   Buffer type (Unused on Linux).
- *
- * This is the client interface for peeking into a queue.  (I.e.,
- * copy data from the queue without updating the head pointer.)
- * Returns number of bytes dequeued or < 0 on error.
- */
-ssize_t vmci_qpair_peek(struct vmci_qp *qpair,
-			void *buf,
-			size_t buf_size,
-			int buf_type)
-{
-	struct iov_iter to;
-	struct kvec v = {.iov_base = buf, .iov_len = buf_size};
-	ssize_t result;
-
-	if (!qpair || !buf)
-		return VMCI_ERROR_INVALID_ARGS;
-
-	iov_iter_kvec(&to, ITER_DEST, &v, 1, buf_size);
-
-	qp_lock(qpair);
-
-	do {
-		result = qp_dequeue_locked(qpair->produce_q,
-					   qpair->consume_q,
-					   qpair->consume_q_size,
-					   &to, false);
-
-		if (result == VMCI_ERROR_QUEUEPAIR_NOT_READY &&
-		    !qp_wait_for_ready_queue(qpair))
-			result = VMCI_ERROR_WOULD_BLOCK;
-
-	} while (result == VMCI_ERROR_QUEUEPAIR_NOT_READY);
-
-	qp_unlock(qpair);
-
-	return result;
-}
-EXPORT_SYMBOL_GPL(vmci_qpair_peek);
-
 /*
  * vmci_qpair_enquev() - Throw data on the queue using iov.
  * @qpair:      Pointer to the queue pair struct.
diff --git a/include/linux/vmw_vmci_api.h b/include/linux/vmw_vmci_api.h
index 28a3b6a9e1ca..41764a684423 100644
--- a/include/linux/vmw_vmci_api.h
+++ b/include/linux/vmw_vmci_api.h
@@ -60,12 +60,6 @@ s64 vmci_qpair_produce_free_space(const struct vmci_qp *qpair);
 s64 vmci_qpair_produce_buf_ready(const struct vmci_qp *qpair);
 s64 vmci_qpair_consume_free_space(const struct vmci_qp *qpair);
 s64 vmci_qpair_consume_buf_ready(const struct vmci_qp *qpair);
-ssize_t vmci_qpair_enqueue(struct vmci_qp *qpair,
-			   const void *buf, size_t buf_size, int mode);
-ssize_t vmci_qpair_dequeue(struct vmci_qp *qpair,
-			   void *buf, size_t buf_size, int mode);
-ssize_t vmci_qpair_peek(struct vmci_qp *qpair, void *buf, size_t buf_size,
-			int mode);
 ssize_t vmci_qpair_enquev(struct vmci_qp *qpair,
 			  struct msghdr *msg, size_t iov_size, int mode);
 ssize_t vmci_qpair_dequev(struct vmci_qp *qpair,
-- 
cgit v1.2.3


From 7aecbbdf86d3288576a84df360f96b93ceec7ee2 Mon Sep 17 00:00:00 2001
From: Zijun Hu <zijun.hu@oss.qualcomm.com>
Date: Fri, 20 Jun 2025 22:35:18 +0800
Subject: char: misc: Remove redundant forward declarations

Header miscdevice.h includes linux/device.h, which directly or indirectly
provides definitions for the two forward declarations below:

struct device;
struct attribute_group;

Remove these redundant forward declarations from miscdevice.h

Signed-off-by: Zijun Hu <zijun.hu@oss.qualcomm.com>
Link: https://lore.kernel.org/r/20250620-fix_mischar-v1-1-6c2716bbf1fa@oss.qualcomm.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/miscdevice.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h
index 69e110c2b86a..3e6deb00fc85 100644
--- a/include/linux/miscdevice.h
+++ b/include/linux/miscdevice.h
@@ -73,9 +73,6 @@
 #define RFKILL_MINOR		242
 #define MISC_DYNAMIC_MINOR	255
 
-struct device;
-struct attribute_group;
-
 struct miscdevice {
 	int minor;
 	const char *name;
-- 
cgit v1.2.3


From cb444006a625c60e6d4dd3753863c3c74f96aac3 Mon Sep 17 00:00:00 2001
From: David Dai <david.dai@linux.dev>
Date: Tue, 24 Jun 2025 15:49:06 -0700
Subject: sched_ext, rcu: Eject BPF scheduler on RCU CPU stall panic

For systems that use a sched_ext scheduler and have panic_on_rcu_stall
enabled, try kicking out the current scheduler before issuing a panic.

While there are numerous reasons for RCU CPU stalls that are not
directly attributed to the scheduler, deferring the panic gives
sched_ext an opportunity to provide additional debug info when ejecting
the current scheduler. Also, handling the event more gracefully allows
us to potentially recover the system instead of incurring additional
down time.

Suggested-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: David Dai <david.dai@linux.dev>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/sched/ext.h |  2 ++
 kernel/rcu/tree_stall.h   |  7 +++++++
 kernel/sched/ext.c        | 35 +++++++++++++++++++++++++++++++++++
 3 files changed, 44 insertions(+)

(limited to 'include')

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 8b92842776cb..0cf0915572c9 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -206,12 +206,14 @@ struct sched_ext_entity {
 void sched_ext_free(struct task_struct *p);
 void print_scx_info(const char *log_lvl, struct task_struct *p);
 void scx_softlockup(u32 dur_s);
+bool scx_rcu_cpu_stall(void);
 
 #else	/* !CONFIG_SCHED_CLASS_EXT */
 
 static inline void sched_ext_free(struct task_struct *p) {}
 static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {}
 static inline void scx_softlockup(u32 dur_s) {}
+static inline bool scx_rcu_cpu_stall(void) { return false; }
 
 #endif	/* CONFIG_SCHED_CLASS_EXT */
 
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 486c00536207..af61b2d0d311 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -134,6 +134,13 @@ static void panic_on_rcu_stall(void)
 {
 	static int cpu_stall;
 
+	/*
+	 * Attempt to kick out the BPF scheduler if it's installed and defer
+	 * the panic to give the system a chance to recover.
+	 */
+	if (scx_rcu_cpu_stall())
+		return;
+
 	if (++cpu_stall < sysctl_max_rcu_stall_to_panic)
 		return;
 
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index bee98fdcdd01..df5b2c952cf7 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4672,6 +4672,41 @@ bool scx_allow_ttwu_queue(const struct task_struct *p)
 		p->sched_class != &ext_sched_class;
 }
 
+/**
+ * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler
+ *
+ * While there are various reasons why RCU CPU stalls can occur on a system
+ * that may not be caused by the current BPF scheduler, try kicking out the
+ * current scheduler in an attempt to recover the system to a good state before
+ * issuing panics.
+ */
+bool scx_rcu_cpu_stall(void)
+{
+	struct scx_sched *sch;
+
+	rcu_read_lock();
+
+	sch = rcu_dereference(scx_root);
+	if (unlikely(!sch)) {
+		rcu_read_unlock();
+		return false;
+	}
+
+	switch (scx_enable_state()) {
+	case SCX_ENABLING:
+	case SCX_ENABLED:
+		break;
+	default:
+		rcu_read_unlock();
+		return false;
+	}
+
+	scx_error(sch, "RCU CPU stall detected!");
+	rcu_read_unlock();
+
+	return true;
+}
+
 /**
  * scx_softlockup - sched_ext softlockup handler
  * @dur_s: number of seconds of CPU stuck due to soft lockup
-- 
cgit v1.2.3


From c9e78afa688afec528784b79bb02d513cdcd6527 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Mon, 23 Jun 2025 12:53:55 +0200
Subject: udp_tunnel: fix deadlock in udp_tunnel_nic_set_port_priv()

While configuring a vxlan tunnel in a system with an i40e NIC driver, I
observe the following deadlock:

 WARNING: possible recursive locking detected
 6.16.0-rc2.net-next-6.16_92d87230d899+ #13 Tainted: G            E
 --------------------------------------------
 kworker/u256:4/1125 is trying to acquire lock:
 ffff88921ab9c8c8 (&utn->lock){+.+.}-{4:4}, at: i40e_udp_tunnel_set_port (/home/pabeni/net-next/include/net/udp_tunnel.h:343 /home/pabeni/net-next/drivers/net/ethernet/intel/i40e/i40e_main.c:13013) i40e

 but task is already holding lock:
 ffff88921ab9c8c8 (&utn->lock){+.+.}-{4:4}, at: udp_tunnel_nic_device_sync_work (/home/pabeni/net-next/net/ipv4/udp_tunnel_nic.c:739) udp_tunnel

 other info that might help us debug this:
  Possible unsafe locking scenario:

        CPU0
        ----
   lock(&utn->lock);
   lock(&utn->lock);

  *** DEADLOCK ***

  May be due to missing lock nesting notation

 4 locks held by kworker/u256:4/1125:
 #0: ffff8892910ca158 ((wq_completion)udp_tunnel_nic){+.+.}-{0:0}, at: process_one_work (/home/pabeni/net-next/kernel/workqueue.c:3213)
 #1: ffffc900244efd30 ((work_completion)(&utn->work)){+.+.}-{0:0}, at: process_one_work (/home/pabeni/net-next/kernel/workqueue.c:3214)
 #2: ffffffff9a14e290 (rtnl_mutex){+.+.}-{4:4}, at: udp_tunnel_nic_device_sync_work (/home/pabeni/net-next/net/ipv4/udp_tunnel_nic.c:737) udp_tunnel
 #3: ffff88921ab9c8c8 (&utn->lock){+.+.}-{4:4}, at: udp_tunnel_nic_device_sync_work (/home/pabeni/net-next/net/ipv4/udp_tunnel_nic.c:739) udp_tunnel

 stack backtrace:
 Hardware name: Dell Inc. PowerEdge R7525/0YHMCJ, BIOS 2.2.5 04/08/2021
 Call Trace:
  <TASK>
 dump_stack_lvl (/home/pabeni/net-next/lib/dump_stack.c:123)
 print_deadlock_bug (/home/pabeni/net-next/kernel/locking/lockdep.c:3047)
 validate_chain (/home/pabeni/net-next/kernel/locking/lockdep.c:3901)
 __lock_acquire (/home/pabeni/net-next/kernel/locking/lockdep.c:5240)
 lock_acquire.part.0 (/home/pabeni/net-next/kernel/locking/lockdep.c:473 /home/pabeni/net-next/kernel/locking/lockdep.c:5873)
 __mutex_lock (/home/pabeni/net-next/kernel/locking/mutex.c:604 /home/pabeni/net-next/kernel/locking/mutex.c:747)
 i40e_udp_tunnel_set_port (/home/pabeni/net-next/include/net/udp_tunnel.h:343 /home/pabeni/net-next/drivers/net/ethernet/intel/i40e/i40e_main.c:13013) i40e
 udp_tunnel_nic_device_sync_by_port (/home/pabeni/net-next/net/ipv4/udp_tunnel_nic.c:230 /home/pabeni/net-next/net/ipv4/udp_tunnel_nic.c:249) udp_tunnel
 __udp_tunnel_nic_device_sync.part.0 (/home/pabeni/net-next/net/ipv4/udp_tunnel_nic.c:292) udp_tunnel
 udp_tunnel_nic_device_sync_work (/home/pabeni/net-next/net/ipv4/udp_tunnel_nic.c:742) udp_tunnel
 process_one_work (/home/pabeni/net-next/kernel/workqueue.c:3243)
 worker_thread (/home/pabeni/net-next/kernel/workqueue.c:3315 /home/pabeni/net-next/kernel/workqueue.c:3402)
 kthread (/home/pabeni/net-next/kernel/kthread.c:464)

AFAICS all the existing callsites of udp_tunnel_nic_set_port_priv() are
already under the utn lock scope, so avoid (re-)acquiring it in that
function.

Fixes: 1ead7501094c ("udp_tunnel: remove rtnl_lock dependency")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/95a827621ec78c12d1564ec3209e549774f9657d.1750675978.git.pabeni@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/udp_tunnel.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index cbd3a43074bd..9acef2fbd2fd 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -339,9 +339,8 @@ udp_tunnel_nic_set_port_priv(struct net_device *dev, unsigned int table,
 			     unsigned int idx, u8 priv)
 {
 	if (udp_tunnel_nic_ops) {
-		udp_tunnel_nic_ops->lock(dev);
+		udp_tunnel_nic_ops->assert_locked(dev);
 		udp_tunnel_nic_ops->set_port_priv(dev, table, idx, priv);
-		udp_tunnel_nic_ops->unlock(dev);
 	}
 }
 
-- 
cgit v1.2.3


From f3128dd6762d71ec92ae888bf582a5f751c3f2e0 Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Tue, 24 Jun 2025 09:41:08 +0800
Subject: security: Remove unused declaration cap_mmap_file()

Commit 3f4f1f8a1ab7 ("capabilities: remove cap_mmap_file()")
removed the implementation but left the declaration behind.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Serge Hallyn <serge@hallyn.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/security.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/security.h b/include/linux/security.h
index dba349629229..e8d9f6069f0c 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -193,8 +193,6 @@ int cap_inode_getsecurity(struct mnt_idmap *idmap,
 			  struct inode *inode, const char *name, void **buffer,
 			  bool alloc);
 extern int cap_mmap_addr(unsigned long addr);
-extern int cap_mmap_file(struct file *file, unsigned long reqprot,
-			 unsigned long prot, unsigned long flags);
 extern int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags);
 extern int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 			  unsigned long arg4, unsigned long arg5);
-- 
cgit v1.2.3


From 22bbc1dcd0d6785fb390c41f0dd5b5e218d23bdd Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Mon, 23 Jun 2025 12:00:53 +0200
Subject: vsock/uapi: fix linux/vm_sockets.h userspace compilation errors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A userspace application that just includes <linux/vm_sockets.h> will fail
to build with the following errors:

    /usr/include/linux/vm_sockets.h:182:39: error: invalid application of ‘sizeof’ to incomplete type ‘struct sockaddr’
      182 |         unsigned char svm_zero[sizeof(struct sockaddr) -
          |                                       ^~~~~~
    /usr/include/linux/vm_sockets.h:183:39: error: ‘sa_family_t’ undeclared here (not in a function)
      183 |                                sizeof(sa_family_t) -
          |

Include <sys/socket.h> for userspace (guarded by ifndef __KERNEL__)
where `struct sockaddr` and `sa_family_t` are defined.
We already do something similar in <linux/mptcp.h> and <linux/if.h>.

Fixes: d021c344051a ("VSOCK: Introduce VM Sockets")
Reported-by: Daan De Meyer <daan.j.demeyer@gmail.com>
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Link: https://patch.msgid.link/20250623100053.40979-1-sgarzare@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/vm_sockets.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/vm_sockets.h b/include/uapi/linux/vm_sockets.h
index ed07181d4eff..e05280e41522 100644
--- a/include/uapi/linux/vm_sockets.h
+++ b/include/uapi/linux/vm_sockets.h
@@ -17,6 +17,10 @@
 #ifndef _UAPI_VM_SOCKETS_H
 #define _UAPI_VM_SOCKETS_H
 
+#ifndef __KERNEL__
+#include <sys/socket.h>        /* for struct sockaddr and sa_family_t */
+#endif
+
 #include <linux/socket.h>
 #include <linux/types.h>
 
-- 
cgit v1.2.3


From f65bbf05392b44714ccdcc4b5b1bebfd471d2665 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 1 Dec 2024 19:28:34 -0500
Subject: alpha: regularize the situation with asm/param.h

The only reason why alpha can't do what sparc et al. are doing
is that include/asm-generic/param.h relies upon the value of HZ
set for userland header in uapi/asm/param.h being 100.

We need that value to define USER_HZ and we need that definition
to outlive the redefinition of HZ kernel-side.  And alpha needs
it to be 1024, not 100 like everybody else.

So let's add __USER_HZ to uapi/asm-generic/param.h, defaulting to
100 and used to define HZ.  That way include/asm-generic/param.h
can use that thing instead of open-coding it - it won't be affected
by undefining and redefining HZ.

That done, alpha asm/param.h can be removed and uapi/asm/param.h
switched to defining __USER_HZ and EXEC_PAGESIZE and then including
<asm-generic/param.h> - asm/param.h will resolve to uapi/asm/param.h,
which pulls <asm-generic/param.h>, which will do the right thing
both in the kernel and userland contexts.

Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/alpha/include/asm/param.h      | 12 ------------
 arch/alpha/include/uapi/asm/param.h |  9 ++-------
 include/asm-generic/param.h         |  2 +-
 include/uapi/asm-generic/param.h    |  6 +++++-
 4 files changed, 8 insertions(+), 21 deletions(-)
 delete mode 100644 arch/alpha/include/asm/param.h

(limited to 'include')

diff --git a/arch/alpha/include/asm/param.h b/arch/alpha/include/asm/param.h
deleted file mode 100644
index cfe947ce9461..000000000000
--- a/arch/alpha/include/asm/param.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_ALPHA_PARAM_H
-#define _ASM_ALPHA_PARAM_H
-
-#include <uapi/asm/param.h>
-
-# undef HZ
-# define HZ		CONFIG_HZ
-# define USER_HZ	1024
-# define CLOCKS_PER_SEC	USER_HZ	/* frequency at which times() counts */
-
-#endif /* _ASM_ALPHA_PARAM_H */
diff --git a/arch/alpha/include/uapi/asm/param.h b/arch/alpha/include/uapi/asm/param.h
index 49c7119934e2..e4e410f9bf85 100644
--- a/arch/alpha/include/uapi/asm/param.h
+++ b/arch/alpha/include/uapi/asm/param.h
@@ -2,14 +2,9 @@
 #ifndef _UAPI_ASM_ALPHA_PARAM_H
 #define _UAPI_ASM_ALPHA_PARAM_H
 
-#define HZ		1024
-
+#define __USER_HZ	1024
 #define EXEC_PAGESIZE	8192
 
-#ifndef NOGROUP
-#define NOGROUP		(-1)
-#endif
-
-#define MAXHOSTNAMELEN	64	/* max length of hostname */
+#include <asm-generic/param.h>
 
 #endif /* _UAPI_ASM_ALPHA_PARAM_H */
diff --git a/include/asm-generic/param.h b/include/asm-generic/param.h
index 8d3009dd28ff..8348c116aa3b 100644
--- a/include/asm-generic/param.h
+++ b/include/asm-generic/param.h
@@ -6,6 +6,6 @@
 
 # undef HZ
 # define HZ		CONFIG_HZ	/* Internal kernel timer frequency */
-# define USER_HZ	100		/* some user interfaces are */
+# define USER_HZ	__USER_HZ	/* some user interfaces are */
 # define CLOCKS_PER_SEC	(USER_HZ)       /* in "ticks" like times() */
 #endif /* __ASM_GENERIC_PARAM_H */
diff --git a/include/uapi/asm-generic/param.h b/include/uapi/asm-generic/param.h
index baad02ea7f93..3ed505dfea13 100644
--- a/include/uapi/asm-generic/param.h
+++ b/include/uapi/asm-generic/param.h
@@ -2,8 +2,12 @@
 #ifndef _UAPI__ASM_GENERIC_PARAM_H
 #define _UAPI__ASM_GENERIC_PARAM_H
 
+#ifndef __USER_HZ
+#define __USER_HZ	100
+#endif
+
 #ifndef HZ
-#define HZ 100
+#define HZ __USER_HZ
 #endif
 
 #ifndef EXEC_PAGESIZE
-- 
cgit v1.2.3


From 67caa528ae08cd05e485c0ea6aea0baaf6579b06 Mon Sep 17 00:00:00 2001
From: Caleb Sander Mateos <csander@purestorage.com>
Date: Sat, 21 Jun 2025 10:28:41 -0600
Subject: ublk: fix narrowing warnings in UAPI header

When a C++ file compiled with -Wc++11-narrowing includes the UAPI header
linux/ublk_cmd.h, ublk_sqe_addr_to_auto_buf_reg()'s assignments of u64
values to u8, u16, and u32 fields result in compiler warnings. Add
explicit casts to the intended types to avoid these warnings. Drop the
unnecessary bitmasks.

Reported-by: Uday Shankar <ushankar@purestorage.com>
Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Fixes: 99c1e4eb6a3f ("ublk: register buffer to local io_uring with provided buf index via UBLK_F_AUTO_BUF_REG")
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20250621162842.337452-1-csander@purestorage.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/ublk_cmd.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index 77d9d6af46da..c062109cb686 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -450,10 +450,10 @@ static inline struct ublk_auto_buf_reg ublk_sqe_addr_to_auto_buf_reg(
 		__u64 sqe_addr)
 {
 	struct ublk_auto_buf_reg reg = {
-		.index = sqe_addr & 0xffff,
-		.flags = (sqe_addr >> 16) & 0xff,
-		.reserved0 = (sqe_addr >> 24) & 0xff,
-		.reserved1 = sqe_addr >> 32,
+		.index = (__u16)sqe_addr,
+		.flags = (__u8)(sqe_addr >> 16),
+		.reserved0 = (__u8)(sqe_addr >> 24),
+		.reserved1 = (__u32)(sqe_addr >> 32),
 	};
 
 	return reg;
-- 
cgit v1.2.3


From 81b4d1a1d03301dcca8af5c58eded9e535f1f6ed Mon Sep 17 00:00:00 2001
From: Caleb Sander Mateos <csander@purestorage.com>
Date: Sat, 21 Jun 2025 11:10:14 -0600
Subject: ublk: update UBLK_F_SUPPORT_ZERO_COPY comment in UAPI header

UBLK_F_SUPPORT_ZERO_COPY has a very old comment describing the initial
idea for how zero-copy would be implemented. The actual implementation
added in commit 1f6540e2aabb ("ublk: zc register/unregister bvec") uses
io_uring registered buffers rather than shared memory mapping.
Remove the inaccurate remarks about mapping ublk request memory into the
ublk server's address space and requiring 4K block size. Replace them
with a description of the current zero-copy mechanism.

Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20250621171015.354932-1-csander@purestorage.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/ublk_cmd.h | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index c062109cb686..c9751bdfd937 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -135,8 +135,28 @@
 #define UBLKSRV_IO_BUF_TOTAL_SIZE	(1ULL << UBLKSRV_IO_BUF_TOTAL_BITS)
 
 /*
- * zero copy requires 4k block size, and can remap ublk driver's io
- * request into ublksrv's vm space
+ * ublk server can register data buffers for incoming I/O requests with a sparse
+ * io_uring buffer table. The request buffer can then be used as the data buffer
+ * for io_uring operations via the fixed buffer index.
+ * Note that the ublk server can never directly access the request data memory.
+ *
+ * To use this feature, the ublk server must first register a sparse buffer
+ * table on an io_uring instance.
+ * When an incoming ublk request is received, the ublk server submits a
+ * UBLK_U_IO_REGISTER_IO_BUF command to that io_uring instance. The
+ * ublksrv_io_cmd's q_id and tag specify the request whose buffer to register
+ * and addr is the index in the io_uring's buffer table to install the buffer.
+ * SQEs can now be submitted to the io_uring to read/write the request's buffer
+ * by enabling fixed buffers (e.g. using IORING_OP_{READ,WRITE}_FIXED or
+ * IORING_URING_CMD_FIXED) and passing the registered buffer index in buf_index.
+ * Once the last io_uring operation using the request's buffer has completed,
+ * the ublk server submits a UBLK_U_IO_UNREGISTER_IO_BUF command with q_id, tag,
+ * and addr again specifying the request buffer to unregister.
+ * The ublk request is completed when its buffer is unregistered from all
+ * io_uring instances and the ublk server issues UBLK_U_IO_COMMIT_AND_FETCH_REQ.
+ *
+ * Not available for UBLK_F_UNPRIVILEGED_DEV, as a ublk server can leak
+ * uninitialized kernel memory by not reading into the full request buffer.
  */
 #define UBLK_F_SUPPORT_ZERO_COPY	(1ULL << 0)
 
-- 
cgit v1.2.3


From 63d0a9123120a2e10861ac7f6dc474bee653d3b2 Mon Sep 17 00:00:00 2001
From: Ujwal Jain <ujwaljain@google.com>
Date: Sat, 14 Jun 2025 16:47:11 +0800
Subject: kunit: Adjust kunit_test timeout based on test_{suite,case} speed

Currently, the in-kernel kunit test case timeout is 300 seconds. (There
is a separate timeout mechanism for the whole test execution in
kunit.py, but that's unrelated.) However, tests marked 'slow' or 'very
slow' may time out, particularly on slower machines.

Implement a multiplier to the test-case timeout, so that slower tests
have longer to complete:
- DEFAULT -> 1x default timeout
- KUNIT_SPEED_SLOW -> 3x default timeout
- KUNIT_SPEED_VERY_SLOW -> 12x default timeout

A further change is planned to allow user configuration of the
default/base timeout to allow people with faster or slower machines to
adjust these to their use-cases.

Link: https://lore.kernel.org/r/20250614084711.2654593-2-davidgow@google.com
Signed-off-by: Ujwal Jain <ujwaljain@google.com>
Co-developed-by: David Gow <davidgow@google.com>
Signed-off-by: David Gow <davidgow@google.com>
Reviewed-by: Rae Moar <rmoar@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 include/kunit/try-catch.h  |  1 +
 lib/kunit/kunit-test.c     |  9 ++++++---
 lib/kunit/test.c           | 46 ++++++++++++++++++++++++++++++++++++++++++++--
 lib/kunit/try-catch-impl.h |  4 +++-
 lib/kunit/try-catch.c      | 29 ++---------------------------
 5 files changed, 56 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/include/kunit/try-catch.h b/include/kunit/try-catch.h
index 7c966a1adbd3..d4e1a5b98ed6 100644
--- a/include/kunit/try-catch.h
+++ b/include/kunit/try-catch.h
@@ -47,6 +47,7 @@ struct kunit_try_catch {
 	int try_result;
 	kunit_try_catch_func_t try;
 	kunit_try_catch_func_t catch;
+	unsigned long timeout;
 	void *context;
 };
 
diff --git a/lib/kunit/kunit-test.c b/lib/kunit/kunit-test.c
index f8f567196ca9..8c01eabd4eaf 100644
--- a/lib/kunit/kunit-test.c
+++ b/lib/kunit/kunit-test.c
@@ -44,7 +44,8 @@ static void kunit_test_try_catch_successful_try_no_catch(struct kunit *test)
 	kunit_try_catch_init(try_catch,
 			     test,
 			     kunit_test_successful_try,
-			     kunit_test_no_catch);
+			     kunit_test_no_catch,
+			     300 * msecs_to_jiffies(MSEC_PER_SEC));
 	kunit_try_catch_run(try_catch, test);
 
 	KUNIT_EXPECT_TRUE(test, ctx->function_called);
@@ -76,7 +77,8 @@ static void kunit_test_try_catch_unsuccessful_try_does_catch(struct kunit *test)
 	kunit_try_catch_init(try_catch,
 			     test,
 			     kunit_test_unsuccessful_try,
-			     kunit_test_catch);
+			     kunit_test_catch,
+			     300 * msecs_to_jiffies(MSEC_PER_SEC));
 	kunit_try_catch_run(try_catch, test);
 
 	KUNIT_EXPECT_TRUE(test, ctx->function_called);
@@ -130,7 +132,8 @@ static void kunit_test_fault_null_dereference(struct kunit *test)
 	kunit_try_catch_init(try_catch,
 			     test,
 			     kunit_test_null_dereference,
-			     kunit_test_catch);
+			     kunit_test_catch,
+			     300 * msecs_to_jiffies(MSEC_PER_SEC));
 	kunit_try_catch_run(try_catch, test);
 
 	KUNIT_EXPECT_EQ(test, try_catch->try_result, -EINTR);
diff --git a/lib/kunit/test.c b/lib/kunit/test.c
index 146d1b48a096..002121675605 100644
--- a/lib/kunit/test.c
+++ b/lib/kunit/test.c
@@ -373,6 +373,46 @@ static void kunit_run_case_check_speed(struct kunit *test,
 		   duration.tv_sec, duration.tv_nsec);
 }
 
+/* Returns timeout multiplier based on speed.
+ * DEFAULT:		    1
+ * KUNIT_SPEED_SLOW:        3
+ * KUNIT_SPEED_VERY_SLOW:   12
+ */
+static int kunit_timeout_mult(enum kunit_speed speed)
+{
+	switch (speed) {
+	case KUNIT_SPEED_SLOW:
+		return 3;
+	case KUNIT_SPEED_VERY_SLOW:
+		return 12;
+	default:
+		return 1;
+	}
+}
+
+static unsigned long kunit_test_timeout(struct kunit_suite *suite, struct kunit_case *test_case)
+{
+	int mult = 1;
+	/*
+	 * TODO: Make the default (base) timeout configurable, so that users with
+	 * particularly slow or fast machines can successfully run tests, while
+	 * still taking advantage of the relative speed.
+	 */
+	unsigned long default_timeout = 300;
+
+	/*
+	 * The default test timeout is 300 seconds and will be adjusted by mult
+	 * based on the test speed. The test speed will be overridden by the
+	 * innermost test component.
+	 */
+	if (suite->attr.speed != KUNIT_SPEED_UNSET)
+		mult = kunit_timeout_mult(suite->attr.speed);
+	if (test_case->attr.speed != KUNIT_SPEED_UNSET)
+		mult = kunit_timeout_mult(test_case->attr.speed);
+	return mult * default_timeout * msecs_to_jiffies(MSEC_PER_SEC);
+}
+
+
 /*
  * Initializes and runs test case. Does not clean up or do post validations.
  */
@@ -527,7 +567,8 @@ static void kunit_run_case_catch_errors(struct kunit_suite *suite,
 	kunit_try_catch_init(try_catch,
 			     test,
 			     kunit_try_run_case,
-			     kunit_catch_run_case);
+			     kunit_catch_run_case,
+			     kunit_test_timeout(suite, test_case));
 	context.test = test;
 	context.suite = suite;
 	context.test_case = test_case;
@@ -537,7 +578,8 @@ static void kunit_run_case_catch_errors(struct kunit_suite *suite,
 	kunit_try_catch_init(try_catch,
 			     test,
 			     kunit_try_run_case_cleanup,
-			     kunit_catch_run_case_cleanup);
+			     kunit_catch_run_case_cleanup,
+			     kunit_test_timeout(suite, test_case));
 	kunit_try_catch_run(try_catch, &context);
 
 	/* Propagate the parameter result to the test case. */
diff --git a/lib/kunit/try-catch-impl.h b/lib/kunit/try-catch-impl.h
index 203ba6a5e740..6f401b97cd0b 100644
--- a/lib/kunit/try-catch-impl.h
+++ b/lib/kunit/try-catch-impl.h
@@ -17,11 +17,13 @@ struct kunit;
 static inline void kunit_try_catch_init(struct kunit_try_catch *try_catch,
 					struct kunit *test,
 					kunit_try_catch_func_t try,
-					kunit_try_catch_func_t catch)
+					kunit_try_catch_func_t catch,
+					unsigned long timeout)
 {
 	try_catch->test = test;
 	try_catch->try = try;
 	try_catch->catch = catch;
+	try_catch->timeout = timeout;
 }
 
 #endif /* _KUNIT_TRY_CATCH_IMPL_H */
diff --git a/lib/kunit/try-catch.c b/lib/kunit/try-catch.c
index 6bbe0025b079..d84a879f0a78 100644
--- a/lib/kunit/try-catch.c
+++ b/lib/kunit/try-catch.c
@@ -34,31 +34,6 @@ static int kunit_generic_run_threadfn_adapter(void *data)
 	return 0;
 }
 
-static unsigned long kunit_test_timeout(void)
-{
-	/*
-	 * TODO(brendanhiggins@google.com): We should probably have some type of
-	 * variable timeout here. The only question is what that timeout value
-	 * should be.
-	 *
-	 * The intention has always been, at some point, to be able to label
-	 * tests with some type of size bucket (unit/small, integration/medium,
-	 * large/system/end-to-end, etc), where each size bucket would get a
-	 * default timeout value kind of like what Bazel does:
-	 * https://docs.bazel.build/versions/master/be/common-definitions.html#test.size
-	 * There is still some debate to be had on exactly how we do this. (For
-	 * one, we probably want to have some sort of test runner level
-	 * timeout.)
-	 *
-	 * For more background on this topic, see:
-	 * https://mike-bland.com/2011/11/01/small-medium-large.html
-	 *
-	 * If tests timeout due to exceeding sysctl_hung_task_timeout_secs,
-	 * the task will be killed and an oops generated.
-	 */
-	return 300 * msecs_to_jiffies(MSEC_PER_SEC); /* 5 min */
-}
-
 void kunit_try_catch_run(struct kunit_try_catch *try_catch, void *context)
 {
 	struct kunit *test = try_catch->test;
@@ -85,8 +60,8 @@ void kunit_try_catch_run(struct kunit_try_catch *try_catch, void *context)
 	task_done = task_struct->vfork_done;
 	wake_up_process(task_struct);
 
-	time_remaining = wait_for_completion_timeout(task_done,
-						     kunit_test_timeout());
+	time_remaining = wait_for_completion_timeout(
+		task_done, try_catch->timeout);
 	if (time_remaining == 0) {
 		try_catch->try_result = -ETIMEDOUT;
 		kthread_stop(task_struct);
-- 
cgit v1.2.3


From 5a2a5b65d5d67279be9e1f0e4b9baf39ee594cb1 Mon Sep 17 00:00:00 2001
From: Mark Zhang <markzhang@nvidia.com>
Date: Mon, 16 Jun 2025 11:26:20 +0300
Subject: RDMA/core: Add driver APIs pre_destroy_cq() and post_destroy_cq()

Currently, ib_free_cq disables the IRQ or cancels the CQ work before
calling the driver's destroy_cq. This isn't good, as a new IRQ or CQ work
can be submitted immediately after the IRQ is disabled or the CQ work is
canceled, and it may then run concurrently with destroy_cq and cause crashes.
The right flow should be:
 1. Driver disables CQ to make sure no new CQ event will be submitted;
 2. Disables IRQ or Cancels CQ work in core layer, to make sure no CQ
    polling work is running;
 3. Free all resources to destroy the CQ.

This patch adds 2 driver APIs:
- pre_destroy_cq(): Disable a CQ to prevent it from generating any new
  work completions, but not free any kernel resources;
- post_destroy_cq(): Free all kernel resources.

In ib_free_cq, the IRQ is disabled or CQ work is canceled after
pre_destroy_cq, and before post_destroy_cq.

Fixes: 14d3a3b2498e ("IB: add a proper completion queue abstraction")
Signed-off-by: Mark Zhang <markzhang@nvidia.com>
Link: https://patch.msgid.link/b5f7ae3d75f44a3e15ff3f4eb2bbdea13e06b97f.1750062328.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 drivers/infiniband/core/cq.c     | 12 ++++++++++--
 drivers/infiniband/core/device.c |  2 ++
 include/rdma/ib_verbs.h          |  9 +++++++++
 3 files changed, 21 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
index a70876a0a231..584537c71545 100644
--- a/drivers/infiniband/core/cq.c
+++ b/drivers/infiniband/core/cq.c
@@ -317,13 +317,18 @@ EXPORT_SYMBOL(__ib_alloc_cq_any);
  */
 void ib_free_cq(struct ib_cq *cq)
 {
-	int ret;
+	int ret = 0;
 
 	if (WARN_ON_ONCE(atomic_read(&cq->usecnt)))
 		return;
 	if (WARN_ON_ONCE(cq->cqe_used))
 		return;
 
+	if (cq->device->ops.pre_destroy_cq) {
+		ret = cq->device->ops.pre_destroy_cq(cq);
+		WARN_ONCE(ret, "Disable of kernel CQ shouldn't fail");
+	}
+
 	switch (cq->poll_ctx) {
 	case IB_POLL_DIRECT:
 		break;
@@ -340,7 +345,10 @@ void ib_free_cq(struct ib_cq *cq)
 
 	rdma_dim_destroy(cq);
 	trace_cq_free(cq);
-	ret = cq->device->ops.destroy_cq(cq, NULL);
+	if (cq->device->ops.post_destroy_cq)
+		cq->device->ops.post_destroy_cq(cq);
+	else
+		ret = cq->device->ops.destroy_cq(cq, NULL);
 	WARN_ONCE(ret, "Destroy of kernel CQ shouldn't fail");
 	rdma_restrack_del(&cq->res);
 	kfree(cq->wc);
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index d4263385850a..468ed6bd4722 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -2763,8 +2763,10 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
 	SET_DEVICE_OP(dev_ops, modify_srq);
 	SET_DEVICE_OP(dev_ops, modify_wq);
 	SET_DEVICE_OP(dev_ops, peek_cq);
+	SET_DEVICE_OP(dev_ops, pre_destroy_cq);
 	SET_DEVICE_OP(dev_ops, poll_cq);
 	SET_DEVICE_OP(dev_ops, port_groups);
+	SET_DEVICE_OP(dev_ops, post_destroy_cq);
 	SET_DEVICE_OP(dev_ops, post_recv);
 	SET_DEVICE_OP(dev_ops, post_send);
 	SET_DEVICE_OP(dev_ops, post_srq_recv);
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index af43a8d2a74a..38f68d245fa6 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2489,6 +2489,15 @@ struct ib_device_ops {
 	int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period);
 	int (*destroy_cq)(struct ib_cq *cq, struct ib_udata *udata);
 	int (*resize_cq)(struct ib_cq *cq, int cqe, struct ib_udata *udata);
+	/**
+	 * pre_destroy_cq - Prevent a cq from generating any new work
+	 * completions, but not free any kernel resources
+	 */
+	int (*pre_destroy_cq)(struct ib_cq *cq);
+	/**
+	 * post_destroy_cq - Free all kernel resources
+	 */
+	void (*post_destroy_cq)(struct ib_cq *cq);
 	struct ib_mr *(*get_dma_mr)(struct ib_pd *pd, int mr_access_flags);
 	struct ib_mr *(*reg_user_mr)(struct ib_pd *pd, u64 start, u64 length,
 				     u64 virt_addr, int mr_access_flags,
-- 
cgit v1.2.3


From 52931f55159ea5c27ad4fe66fc0cb8ad75ab795b Mon Sep 17 00:00:00 2001
From: Patrisious Haddad <phaddad@nvidia.com>
Date: Tue, 17 Jun 2025 11:19:15 +0300
Subject: net/mlx5: fs, add multiple prios to RDMA TRANSPORT steering domain

RDMA TRANSPORT domains were initially limited to a single priority.
This change allows the domains to have multiple priorities, making
it possible to add several rules and control the order in which
they're evaluated.

Signed-off-by: Patrisious Haddad <phaddad@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Link: https://patch.msgid.link/b299cbb4c8678a33da6e6b6988b5bf6145c54b88.1750148083.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 30 +++++++++++++++++------
 include/linux/mlx5/fs.h                           |  2 +-
 2 files changed, 23 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index a8046200d376..7f5608081ea0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -3245,34 +3245,48 @@ static int
 init_rdma_transport_rx_root_ns_one(struct mlx5_flow_steering *steering,
 				   int vport_idx)
 {
+	struct mlx5_flow_root_namespace *root_ns;
 	struct fs_prio *prio;
+	int i;
 
 	steering->rdma_transport_rx_root_ns[vport_idx] =
 		create_root_ns(steering, FS_FT_RDMA_TRANSPORT_RX);
 	if (!steering->rdma_transport_rx_root_ns[vport_idx])
 		return -ENOMEM;
 
-	/* create 1 prio*/
-	prio = fs_create_prio(&steering->rdma_transport_rx_root_ns[vport_idx]->ns,
-			      MLX5_RDMA_TRANSPORT_BYPASS_PRIO, 1);
-	return PTR_ERR_OR_ZERO(prio);
+	root_ns = steering->rdma_transport_rx_root_ns[vport_idx];
+
+	for (i = 0; i < MLX5_RDMA_TRANSPORT_BYPASS_PRIO; i++) {
+		prio = fs_create_prio(&root_ns->ns, i, 1);
+		if (IS_ERR(prio))
+			return PTR_ERR(prio);
+	}
+	set_prio_attrs(root_ns);
+	return 0;
 }
 
 static int
 init_rdma_transport_tx_root_ns_one(struct mlx5_flow_steering *steering,
 				   int vport_idx)
 {
+	struct mlx5_flow_root_namespace *root_ns;
 	struct fs_prio *prio;
+	int i;
 
 	steering->rdma_transport_tx_root_ns[vport_idx] =
 		create_root_ns(steering, FS_FT_RDMA_TRANSPORT_TX);
 	if (!steering->rdma_transport_tx_root_ns[vport_idx])
 		return -ENOMEM;
 
-	/* create 1 prio*/
-	prio = fs_create_prio(&steering->rdma_transport_tx_root_ns[vport_idx]->ns,
-			      MLX5_RDMA_TRANSPORT_BYPASS_PRIO, 1);
-	return PTR_ERR_OR_ZERO(prio);
+	root_ns = steering->rdma_transport_tx_root_ns[vport_idx];
+
+	for (i = 0; i < MLX5_RDMA_TRANSPORT_BYPASS_PRIO; i++) {
+		prio = fs_create_prio(&root_ns->ns, i, 1);
+		if (IS_ERR(prio))
+			return PTR_ERR(prio);
+	}
+	set_prio_attrs(root_ns);
+	return 0;
 }
 
 static int init_rdma_transport_rx_root_ns(struct mlx5_flow_steering *steering)
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 939e58c2f386..86055d55836d 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -40,7 +40,7 @@
 
 #define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v)
 
-#define MLX5_RDMA_TRANSPORT_BYPASS_PRIO 0
+#define MLX5_RDMA_TRANSPORT_BYPASS_PRIO 16
 #define MLX5_FS_MAX_POOL_SIZE BIT(30)
 
 enum mlx5_flow_destination_type {
-- 
cgit v1.2.3


From ebf8d47121b6ef3f38425a343a72f37c60fd6dbc Mon Sep 17 00:00:00 2001
From: Dragos Tatulea <dtatulea@nvidia.com>
Date: Thu, 19 Jun 2025 14:37:17 +0300
Subject: net/mlx5: Small refactor for general object capabilities

Make enum for capability bits of general object types depend on
the type definitions themselves.

Make sure that capabilities in the [64,127] bit range are
properly calculated (type id - 64).

Signed-off-by: Dragos Tatulea <dtatulea@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Link: https://patch.msgid.link/20250619113721.60201-2-mbloch@nvidia.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/mlx5_ifc.h | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 2c09df4ee574..5c8f75605eac 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -12501,17 +12501,6 @@ struct mlx5_ifc_affiliated_event_header_bits {
 	u8         obj_id[0x20];
 };
 
-enum {
-	MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY = BIT_ULL(0xc),
-	MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_IPSEC = BIT_ULL(0x13),
-	MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_SAMPLER = BIT_ULL(0x20),
-	MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_FLOW_METER_ASO = BIT_ULL(0x24),
-};
-
-enum {
-	MLX5_HCA_CAP_2_GENERAL_OBJECT_TYPES_RDMA_CTRL = BIT_ULL(0x13),
-};
-
 enum {
 	MLX5_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY = 0xc,
 	MLX5_GENERAL_OBJECT_TYPES_IPSEC = 0x13,
@@ -12523,6 +12512,22 @@ enum {
 	MLX5_GENERAL_OBJECT_TYPES_FLOW_TABLE_ALIAS = 0xff15,
 };
 
+enum {
+	MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY =
+		BIT_ULL(MLX5_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY),
+	MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_IPSEC =
+		BIT_ULL(MLX5_GENERAL_OBJECT_TYPES_IPSEC),
+	MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_SAMPLER =
+		BIT_ULL(MLX5_GENERAL_OBJECT_TYPES_SAMPLER),
+	MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_FLOW_METER_ASO =
+		BIT_ULL(MLX5_GENERAL_OBJECT_TYPES_FLOW_METER_ASO),
+};
+
+enum {
+	MLX5_HCA_CAP_2_GENERAL_OBJECT_TYPES_RDMA_CTRL =
+		BIT_ULL(MLX5_GENERAL_OBJECT_TYPES_RDMA_CTRL - 0x40),
+};
+
 enum {
 	MLX5_IPSEC_OBJECT_ICV_LEN_16B,
 };
-- 
cgit v1.2.3


From 1f6da56679d33c733aaee929fd9af962ad66edbd Mon Sep 17 00:00:00 2001
From: Dragos Tatulea <dtatulea@nvidia.com>
Date: Thu, 19 Jun 2025 14:37:18 +0300
Subject: net/mlx5: Add IFC bits for PCIe Congestion Event object

Add definitions for the PCIe Congestion Event object
and the relevant FW command structures.

Signed-off-by: Dragos Tatulea <dtatulea@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Link: https://patch.msgid.link/20250619113721.60201-3-mbloch@nvidia.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/mlx5_ifc.h | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 5c8f75605eac..0e93f342be09 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -12509,6 +12509,7 @@ enum {
 	MLX5_GENERAL_OBJECT_TYPES_MACSEC = 0x27,
 	MLX5_GENERAL_OBJECT_TYPES_INT_KEK = 0x47,
 	MLX5_GENERAL_OBJECT_TYPES_RDMA_CTRL = 0x53,
+	MLX5_GENERAL_OBJECT_TYPES_PCIE_CONG_EVENT = 0x58,
 	MLX5_GENERAL_OBJECT_TYPES_FLOW_TABLE_ALIAS = 0xff15,
 };
 
@@ -12526,6 +12527,8 @@ enum {
 enum {
 	MLX5_HCA_CAP_2_GENERAL_OBJECT_TYPES_RDMA_CTRL =
 		BIT_ULL(MLX5_GENERAL_OBJECT_TYPES_RDMA_CTRL - 0x40),
+	MLX5_HCA_CAP_2_GENERAL_OBJECT_TYPES_PCIE_CONG_EVENT =
+		BIT_ULL(MLX5_GENERAL_OBJECT_TYPES_PCIE_CONG_EVENT - 0x40),
 };
 
 enum {
@@ -13284,4 +13287,41 @@ struct mlx5_ifc_mrtcq_reg_bits {
 	u8         reserved_at_80[0x180];
 };
 
+struct mlx5_ifc_pcie_cong_event_obj_bits {
+	u8         modify_select_field[0x40];
+
+	u8         inbound_event_en[0x1];
+	u8         outbound_event_en[0x1];
+	u8         reserved_at_42[0x1e];
+
+	u8         reserved_at_60[0x1];
+	u8         inbound_cong_state[0x3];
+	u8         reserved_at_64[0x1];
+	u8         outbound_cong_state[0x3];
+	u8         reserved_at_68[0x18];
+
+	u8         inbound_cong_low_threshold[0x10];
+	u8         inbound_cong_high_threshold[0x10];
+
+	u8         outbound_cong_low_threshold[0x10];
+	u8         outbound_cong_high_threshold[0x10];
+
+	u8         reserved_at_e0[0x340];
+};
+
+struct mlx5_ifc_pcie_cong_event_cmd_in_bits {
+	struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr;
+	struct mlx5_ifc_pcie_cong_event_obj_bits cong_obj;
+};
+
+struct mlx5_ifc_pcie_cong_event_cmd_out_bits {
+	struct mlx5_ifc_general_obj_out_cmd_hdr_bits hdr;
+	struct mlx5_ifc_pcie_cong_event_obj_bits cong_obj;
+};
+
+enum mlx5e_pcie_cong_event_mod_field {
+	MLX5_PCIE_CONG_EVENT_MOD_EVENT_EN = BIT(0),
+	MLX5_PCIE_CONG_EVENT_MOD_THRESH   = BIT(2),
+};
+
 #endif /* MLX5_IFC_H */
-- 
cgit v1.2.3


From 61977ccf6568f9d104462727b49412a80c22c519 Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Fri, 20 Jun 2025 01:10:24 +0800
Subject: dt-bindings: reset: sun55i-a523-r-ccu: Add missing PPU0 reset

There is a PPU0 reset control bit in the same register as the PPU1
reset control. This missing reset control is for the PCK-600 unit
in the SoC. Manual tests show that the reset control indeed exists,
and if not configured, the system will hang when the PCK-600 registers
are accessed.

Add a reset entry for it at the end of the existing ones.

Fixes: 52dbf84857f0 ("dt-bindings: clk: sunxi-ng: document two Allwinner A523 CCUs")
Acked-by: Conor Dooley <conor.dooley@microchip.com>
Acked-by: Stephen Boyd <sboyd@kernel.org>
Reviewed-by: Andre Przywara <andre.przywara@arm.com>
Link: https://patch.msgid.link/20250619171025.3359384-2-wens@kernel.org
Signed-off-by: Chen-Yu Tsai <wens@csie.org>
---
 include/dt-bindings/reset/sun55i-a523-r-ccu.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/dt-bindings/reset/sun55i-a523-r-ccu.h b/include/dt-bindings/reset/sun55i-a523-r-ccu.h
index dd6fbb372e19..eb31ae9958d6 100644
--- a/include/dt-bindings/reset/sun55i-a523-r-ccu.h
+++ b/include/dt-bindings/reset/sun55i-a523-r-ccu.h
@@ -21,5 +21,6 @@
 #define RST_BUS_R_IR_RX		12
 #define RST_BUS_R_RTC		13
 #define RST_BUS_R_CPUCFG	14
+#define RST_BUS_R_PPU0		15
 
 #endif /* _DT_BINDINGS_RST_SUN55I_A523_R_CCU_H_ */
-- 
cgit v1.2.3


From f792733e08d5f5d44ef76d22bcca7ca45a82d0de Mon Sep 17 00:00:00 2001
From: Bard Liao <yung-chuan.liao@linux.intel.com>
Date: Wed, 25 Jun 2025 22:04:28 +0800
Subject: ASoC: sdw_utils: add component_name string to dai_info
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently the sdw machine driver uses different ways to get the
component name from the DAI name for different codecs in the rtd_init
callback. It means that we need to rely on the rtd_init callback to get
the component name. Adding an optional component string to the
asoc_sdw_dai_info struct allows the machine driver to get the component
name directly.
This commit adds the component names for the AMP DAIs in preparation
for setting the card->components string for combined speaker configs.

Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Shuming Fan <shumingf@realtek.com>
Link: https://patch.msgid.link/20250625140430.311865-2-yung-chuan.liao@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc_sdw_utils.h       | 1 +
 sound/soc/sdw_utils/soc_sdw_utils.c | 7 +++++++
 2 files changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/sound/soc_sdw_utils.h b/include/sound/soc_sdw_utils.h
index b63021f5afaf..6049a5d0cfcd 100644
--- a/include/sound/soc_sdw_utils.h
+++ b/include/sound/soc_sdw_utils.h
@@ -46,6 +46,7 @@ struct asoc_sdw_codec_info;
 struct asoc_sdw_dai_info {
 	const bool direction[2]; /* playback & capture support */
 	const char *dai_name;
+	const char *component_name;
 	const int dai_type;
 	const int dailink[2]; /* dailink id for each direction */
 	const struct snd_kcontrol_new *controls;
diff --git a/sound/soc/sdw_utils/soc_sdw_utils.c b/sound/soc/sdw_utils/soc_sdw_utils.c
index d75e7292240b..2e8820137c69 100644
--- a/sound/soc/sdw_utils/soc_sdw_utils.c
+++ b/sound/soc/sdw_utils/soc_sdw_utils.c
@@ -135,6 +135,7 @@ struct asoc_sdw_codec_info codec_info_list[] = {
 			{
 				.direction = {true, false},
 				.dai_name = "rt712-sdca-aif2",
+				.component_name = "rt712",
 				.dai_type = SOC_SDW_DAI_TYPE_AMP,
 				.dailink = {SOC_SDW_AMP_OUT_DAI_ID, SOC_SDW_UNUSED_DAI_ID},
 				.init = asoc_sdw_rt_amp_init,
@@ -275,6 +276,7 @@ struct asoc_sdw_codec_info codec_info_list[] = {
 			{
 				.direction = {true, false},
 				.dai_name = "rt1320-aif1",
+				.component_name = "rt1320",
 				.dai_type = SOC_SDW_DAI_TYPE_AMP,
 				.dailink = {SOC_SDW_AMP_OUT_DAI_ID, SOC_SDW_UNUSED_DAI_ID},
 				.init = asoc_sdw_rt_amp_init,
@@ -409,6 +411,7 @@ struct asoc_sdw_codec_info codec_info_list[] = {
 			{
 				.direction = {true, false},
 				.dai_name = "rt722-sdca-aif2",
+				.component_name = "rt722",
 				.dai_type = SOC_SDW_DAI_TYPE_AMP,
 				/* No feedback capability is provided by rt722-sdca codec driver*/
 				.dailink = {SOC_SDW_AMP_OUT_DAI_ID, SOC_SDW_UNUSED_DAI_ID},
@@ -438,6 +441,7 @@ struct asoc_sdw_codec_info codec_info_list[] = {
 			{
 				.direction = {true, true},
 				.dai_name = "max98373-aif1",
+				.component_name = "mx8373",
 				.dai_type = SOC_SDW_DAI_TYPE_AMP,
 				.dailink = {SOC_SDW_AMP_OUT_DAI_ID, SOC_SDW_AMP_IN_DAI_ID},
 				.init = asoc_sdw_maxim_init,
@@ -456,6 +460,7 @@ struct asoc_sdw_codec_info codec_info_list[] = {
 			{
 				.direction = {true, false},
 				.dai_name = "max98363-aif1",
+				.component_name = "mx8363",
 				.dai_type = SOC_SDW_DAI_TYPE_AMP,
 				.dailink = {SOC_SDW_AMP_OUT_DAI_ID, SOC_SDW_UNUSED_DAI_ID},
 				.init = asoc_sdw_maxim_init,
@@ -491,6 +496,7 @@ struct asoc_sdw_codec_info codec_info_list[] = {
 			{
 				.direction = {true, false},
 				.dai_name = "cs35l56-sdw1",
+				.component_name = "cs35l56",
 				.dai_type = SOC_SDW_DAI_TYPE_AMP,
 				.dailink = {SOC_SDW_AMP_OUT_DAI_ID, SOC_SDW_UNUSED_DAI_ID},
 				.init = asoc_sdw_cs_amp_init,
@@ -516,6 +522,7 @@ struct asoc_sdw_codec_info codec_info_list[] = {
 			{
 				.direction = {true, false},
 				.dai_name = "cs35l56-sdw1",
+				.component_name = "cs35l56",
 				.dai_type = SOC_SDW_DAI_TYPE_AMP,
 				.dailink = {SOC_SDW_AMP_OUT_DAI_ID, SOC_SDW_UNUSED_DAI_ID},
 				.init = asoc_sdw_cs_amp_init,
-- 
cgit v1.2.3


From 771f002ef1d6f6c2b9bddf779abd31da6b9ccd25 Mon Sep 17 00:00:00 2001
From: Shuicheng Lin <shuicheng.lin@intel.com>
Date: Sun, 8 Jun 2025 23:01:33 +0000
Subject: drm/xe/uapi: Correct sync type definition in comments

Commit 37d078e51b4c ("drm/xe/uapi: Split xe_sync types from flags") renamed some DRM_XE_SYNC_*
defines but later commits kept using the old names. Correct them with the new definition.

v2: correct fixes tag and update commit message to explain why (Lucas)

Fixes: 9329f0667215 ("drm/xe/uapi: Use LR abbrev for long-running vms")
Fixes: 4b437893a826 ("drm/xe/uapi: More uAPI documentation additions and cosmetic updates")
Reviewed-by: Lucas De Marchi <lucas.demarchi@intel.com>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Cc: Zongyao Bai <zongyao.bai@intel.com>
Signed-off-by: Shuicheng Lin <shuicheng.lin@intel.com>
Link: https://lore.kernel.org/r/20250608230133.1250849-1-shuicheng.lin@intel.com
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 include/uapi/drm/xe_drm.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
index 8e8bbdec8c5c..e2426413488f 100644
--- a/include/uapi/drm/xe_drm.h
+++ b/include/uapi/drm/xe_drm.h
@@ -925,9 +925,9 @@ struct drm_xe_gem_mmap_offset {
  *  - %DRM_XE_VM_CREATE_FLAG_LR_MODE - An LR, or Long Running VM accepts
  *    exec submissions to its exec_queues that don't have an upper time
  *    limit on the job execution time. But exec submissions to these
- *    don't allow any of the flags DRM_XE_SYNC_FLAG_SYNCOBJ,
- *    DRM_XE_SYNC_FLAG_TIMELINE_SYNCOBJ, DRM_XE_SYNC_FLAG_DMA_BUF,
- *    used as out-syncobjs, that is, together with DRM_XE_SYNC_FLAG_SIGNAL.
+ *    don't allow any of the sync types DRM_XE_SYNC_TYPE_SYNCOBJ,
+ *    DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ, used as out-syncobjs, that is,
+ *    together with sync flag DRM_XE_SYNC_FLAG_SIGNAL.
  *    LR VMs can be created in recoverable page-fault mode using
  *    DRM_XE_VM_CREATE_FLAG_FAULT_MODE, if the device supports it.
  *    If that flag is omitted, the UMD can not rely on the slightly
@@ -1394,7 +1394,7 @@ struct drm_xe_sync {
 
 	/**
 	 * @timeline_value: Input for the timeline sync object. Needs to be
-	 * different than 0 when used with %DRM_XE_SYNC_FLAG_TIMELINE_SYNCOBJ.
+	 * different than 0 when used with %DRM_XE_SYNC_TYPE_TIMELINE_SYNCOBJ.
 	 */
 	__u64 timeline_value;
 
-- 
cgit v1.2.3


From 2642f55d44ce563f227dd9c620eda0dec8d882be Mon Sep 17 00:00:00 2001
From: Peter Griffin <peter.griffin@linaro.org>
Date: Thu, 19 Jun 2025 12:18:16 +0100
Subject: pinctrl: samsung: add support for gs101 wakeup mask programming

gs101 differs from other currently supported SoCs in that it has 3 wakeup
mask registers for the 67 external wakeup interrupt pins in alive and
far_alive.

EINT_WAKEUP_MASK  0x3A80 EINT[31:0]
EINT_WAKEUP_MASK2 0x3A84 EINT[63:32]
EINT_WAKEUP_MASK3 0x3A88 EINT[66:64]

Add gs101 specific callbacks and a dedicated gs101_wkup_irq_chip struct to
handle these differences.

The current wakeup mask with upstream is programmed as
WAKEUP_MASK0[0x3A80] value[0xFFFFFFFF]
WAKEUP_MASK1[0x3A84] value[0xF2FFEFFF]
WAKEUP_MASK2[0x3A88] value[0xFFFFFFFF]

Which corresponds to the following wakeup sources:
gpa7-3  vol down
gpa8-1  vol up
gpa10-1 power
gpa8-2  typec-int

Signed-off-by: Peter Griffin <peter.griffin@linaro.org>
Link: https://lore.kernel.org/r/20250619-gs101-eint-mask-v1-2-89438cfd7499@linaro.org
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
---
 drivers/pinctrl/samsung/pinctrl-exynos.c    | 100 ++++++++++++++++++++++++----
 drivers/pinctrl/samsung/pinctrl-samsung.h   |   4 ++
 include/linux/soc/samsung/exynos-regs-pmu.h |   1 +
 3 files changed, 91 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/drivers/pinctrl/samsung/pinctrl-exynos.c b/drivers/pinctrl/samsung/pinctrl-exynos.c
index f3e1c11abe55..5554768d465f 100644
--- a/drivers/pinctrl/samsung/pinctrl-exynos.c
+++ b/drivers/pinctrl/samsung/pinctrl-exynos.c
@@ -32,18 +32,24 @@
 #include "pinctrl-samsung.h"
 #include "pinctrl-exynos.h"
 
+#define MAX_WAKEUP_REG 3
+
 struct exynos_irq_chip {
 	struct irq_chip chip;
 
 	u32 eint_con;
 	u32 eint_mask;
 	u32 eint_pend;
-	u32 *eint_wake_mask_value;
+	u32 eint_num_wakeup_reg;
 	u32 eint_wake_mask_reg;
 	void (*set_eint_wakeup_mask)(struct samsung_pinctrl_drv_data *drvdata,
 				     struct exynos_irq_chip *irq_chip);
 };
 
+static u32 eint_wake_mask_values[MAX_WAKEUP_REG] = { EXYNOS_EINT_WAKEUP_MASK_DISABLED,
+						     EXYNOS_EINT_WAKEUP_MASK_DISABLED,
+						     EXYNOS_EINT_WAKEUP_MASK_DISABLED};
+
 static inline struct exynos_irq_chip *to_exynos_irq_chip(struct irq_chip *chip)
 {
 	return container_of(chip, struct exynos_irq_chip, chip);
@@ -307,7 +313,7 @@ static const struct exynos_irq_chip exynos_gpio_irq_chip __initconst = {
 	.eint_con = EXYNOS_GPIO_ECON_OFFSET,
 	.eint_mask = EXYNOS_GPIO_EMASK_OFFSET,
 	.eint_pend = EXYNOS_GPIO_EPEND_OFFSET,
-	/* eint_wake_mask_value not used */
+	/* eint_wake_mask_values not used */
 };
 
 static int exynos_eint_irq_map(struct irq_domain *h, unsigned int virq,
@@ -467,10 +473,55 @@ err_domains:
 	return ret;
 }
 
+#define BITS_PER_U32 32
+static int gs101_wkup_irq_set_wake(struct irq_data *irqd, unsigned int on)
+{
+	struct samsung_pin_bank *bank = irq_data_get_irq_chip_data(irqd);
+	struct samsung_pinctrl_drv_data *d = bank->drvdata;
+	u32 bit, wakeup_reg, shift;
+
+	bit = bank->eint_num + irqd->hwirq;
+	wakeup_reg = bit / BITS_PER_U32;
+	shift = bit - (wakeup_reg * BITS_PER_U32);
+
+	if (!on)
+		eint_wake_mask_values[wakeup_reg] |= BIT_U32(shift);
+	else
+		eint_wake_mask_values[wakeup_reg] &= ~BIT_U32(shift);
+
+	dev_info(d->dev, "wake %s for irq %d\n", str_enabled_disabled(on),
+		 irqd->irq);
+
+	return 0;
+}
+
+static void
+gs101_pinctrl_set_eint_wakeup_mask(struct samsung_pinctrl_drv_data *drvdata,
+				   struct exynos_irq_chip *irq_chip)
+{
+	struct regmap *pmu_regs;
+
+	if (!drvdata->retention_ctrl || !drvdata->retention_ctrl->priv) {
+		dev_warn(drvdata->dev,
+			 "No PMU syscon available. Wake-up mask will not be set.\n");
+		return;
+	}
+
+	pmu_regs = drvdata->retention_ctrl->priv;
+
+	dev_dbg(drvdata->dev, "Setting external wakeup interrupt mask:\n");
+
+	for (int i = 0; i < irq_chip->eint_num_wakeup_reg; i++) {
+		dev_dbg(drvdata->dev, "\tWAKEUP_MASK%d[0x%X] value[0x%X]\n",
+			i, irq_chip->eint_wake_mask_reg + i * 4,
+			eint_wake_mask_values[i]);
+		regmap_write(pmu_regs, irq_chip->eint_wake_mask_reg + i * 4,
+			     eint_wake_mask_values[i]);
+	}
+}
+
 static int exynos_wkup_irq_set_wake(struct irq_data *irqd, unsigned int on)
 {
-	struct irq_chip *chip = irq_data_get_irq_chip(irqd);
-	struct exynos_irq_chip *our_chip = to_exynos_irq_chip(chip);
 	struct samsung_pin_bank *bank = irq_data_get_irq_chip_data(irqd);
 	unsigned long bit = 1UL << (2 * bank->eint_offset + irqd->hwirq);
 
@@ -478,9 +529,9 @@ static int exynos_wkup_irq_set_wake(struct irq_data *irqd, unsigned int on)
 		irqd->irq, bank->name, irqd->hwirq);
 
 	if (!on)
-		*our_chip->eint_wake_mask_value |= bit;
+		eint_wake_mask_values[0] |= bit;
 	else
-		*our_chip->eint_wake_mask_value &= ~bit;
+		eint_wake_mask_values[0] &= ~bit;
 
 	return 0;
 }
@@ -500,10 +551,10 @@ exynos_pinctrl_set_eint_wakeup_mask(struct samsung_pinctrl_drv_data *drvdata,
 	pmu_regs = drvdata->retention_ctrl->priv;
 	dev_info(drvdata->dev,
 		 "Setting external wakeup interrupt mask: 0x%x\n",
-		 *irq_chip->eint_wake_mask_value);
+		 eint_wake_mask_values[0]);
 
 	regmap_write(pmu_regs, irq_chip->eint_wake_mask_reg,
-		     *irq_chip->eint_wake_mask_value);
+		     eint_wake_mask_values[0]);
 }
 
 static void
@@ -522,11 +573,10 @@ s5pv210_pinctrl_set_eint_wakeup_mask(struct samsung_pinctrl_drv_data *drvdata,
 
 	clk_base = (void __iomem *) drvdata->retention_ctrl->priv;
 
-	__raw_writel(*irq_chip->eint_wake_mask_value,
+	__raw_writel(eint_wake_mask_values[0],
 		     clk_base + irq_chip->eint_wake_mask_reg);
 }
 
-static u32 eint_wake_mask_value = EXYNOS_EINT_WAKEUP_MASK_DISABLED;
 /*
  * irq_chip for wakeup interrupts
  */
@@ -544,7 +594,7 @@ static const struct exynos_irq_chip s5pv210_wkup_irq_chip __initconst = {
 	.eint_con = EXYNOS_WKUP_ECON_OFFSET,
 	.eint_mask = EXYNOS_WKUP_EMASK_OFFSET,
 	.eint_pend = EXYNOS_WKUP_EPEND_OFFSET,
-	.eint_wake_mask_value = &eint_wake_mask_value,
+	.eint_num_wakeup_reg = 1,
 	/* Only differences with exynos4210_wkup_irq_chip: */
 	.eint_wake_mask_reg = S5PV210_EINT_WAKEUP_MASK,
 	.set_eint_wakeup_mask = s5pv210_pinctrl_set_eint_wakeup_mask,
@@ -564,7 +614,7 @@ static const struct exynos_irq_chip exynos4210_wkup_irq_chip __initconst = {
 	.eint_con = EXYNOS_WKUP_ECON_OFFSET,
 	.eint_mask = EXYNOS_WKUP_EMASK_OFFSET,
 	.eint_pend = EXYNOS_WKUP_EPEND_OFFSET,
-	.eint_wake_mask_value = &eint_wake_mask_value,
+	.eint_num_wakeup_reg = 1,
 	.eint_wake_mask_reg = EXYNOS_EINT_WAKEUP_MASK,
 	.set_eint_wakeup_mask = exynos_pinctrl_set_eint_wakeup_mask,
 };
@@ -583,7 +633,7 @@ static const struct exynos_irq_chip exynos7_wkup_irq_chip __initconst = {
 	.eint_con = EXYNOS7_WKUP_ECON_OFFSET,
 	.eint_mask = EXYNOS7_WKUP_EMASK_OFFSET,
 	.eint_pend = EXYNOS7_WKUP_EPEND_OFFSET,
-	.eint_wake_mask_value = &eint_wake_mask_value,
+	.eint_num_wakeup_reg = 1,
 	.eint_wake_mask_reg = EXYNOS5433_EINT_WAKEUP_MASK,
 	.set_eint_wakeup_mask = exynos_pinctrl_set_eint_wakeup_mask,
 };
@@ -599,13 +649,31 @@ static const struct exynos_irq_chip exynosautov920_wkup_irq_chip __initconst = {
 		.irq_request_resources = exynos_irq_request_resources,
 		.irq_release_resources = exynos_irq_release_resources,
 	},
-	.eint_wake_mask_value = &eint_wake_mask_value,
+	.eint_num_wakeup_reg = 1,
 	.eint_wake_mask_reg = EXYNOS5433_EINT_WAKEUP_MASK,
 	.set_eint_wakeup_mask = exynos_pinctrl_set_eint_wakeup_mask,
 };
 
+static const struct exynos_irq_chip gs101_wkup_irq_chip __initconst = {
+	.chip = {
+		.name = "gs101_wkup_irq_chip",
+		.irq_unmask = exynos_irq_unmask,
+		.irq_mask = exynos_irq_mask,
+		.irq_ack = exynos_irq_ack,
+		.irq_set_type = exynos_irq_set_type,
+		.irq_set_wake = gs101_wkup_irq_set_wake,
+		.irq_request_resources = exynos_irq_request_resources,
+		.irq_release_resources = exynos_irq_release_resources,
+	},
+	.eint_num_wakeup_reg = 3,
+	.eint_wake_mask_reg = GS101_EINT_WAKEUP_MASK,
+	.set_eint_wakeup_mask = gs101_pinctrl_set_eint_wakeup_mask,
+};
+
 /* list of external wakeup controllers supported */
 static const struct of_device_id exynos_wkup_irq_ids[] = {
+	{ .compatible = "google,gs101-wakeup-eint",
+			.data = &gs101_wkup_irq_chip },
 	{ .compatible = "samsung,s5pv210-wakeup-eint",
 			.data = &s5pv210_wkup_irq_chip },
 	{ .compatible = "samsung,exynos4210-wakeup-eint",
@@ -688,6 +756,7 @@ out:
 	chained_irq_exit(chip, desc);
 }
 
+static int eint_num;
 /*
  * exynos_eint_wkup_init() - setup handling of external wakeup interrupts.
  * @d: driver data of samsung pinctrl driver.
@@ -736,6 +805,9 @@ __init int exynos_eint_wkup_init(struct samsung_pinctrl_drv_data *d)
 			return -ENXIO;
 		}
 
+		bank->eint_num = eint_num;
+		eint_num = eint_num + bank->nr_pins;
+
 		if (!fwnode_property_present(bank->fwnode, "interrupts")) {
 			bank->eint_type = EINT_TYPE_WKUP_MUX;
 			++muxed_banks;
diff --git a/drivers/pinctrl/samsung/pinctrl-samsung.h b/drivers/pinctrl/samsung/pinctrl-samsung.h
index fcc57c244d16..1cabcbe1401a 100644
--- a/drivers/pinctrl/samsung/pinctrl-samsung.h
+++ b/drivers/pinctrl/samsung/pinctrl-samsung.h
@@ -141,6 +141,7 @@ struct samsung_pin_bank_type {
  * @eint_type: type of the external interrupt supported by the bank.
  * @eint_mask: bit mask of pins which support EINT function.
  * @eint_offset: SoC-specific EINT register or interrupt offset of bank.
+ * @eint_num: total number of eint pins.
  * @eint_con_offset: ExynosAuto SoC-specific EINT control register offset of bank.
  * @eint_mask_offset: ExynosAuto SoC-specific EINT mask register offset of bank.
  * @eint_pend_offset: ExynosAuto SoC-specific EINT pend register offset of bank.
@@ -156,6 +157,7 @@ struct samsung_pin_bank_data {
 	enum eint_type	eint_type;
 	u32		eint_mask;
 	u32		eint_offset;
+	u32		eint_num;
 	u32		eint_con_offset;
 	u32		eint_mask_offset;
 	u32		eint_pend_offset;
@@ -174,6 +176,7 @@ struct samsung_pin_bank_data {
  * @eint_type: type of the external interrupt supported by the bank.
  * @eint_mask: bit mask of pins which support EINT function.
  * @eint_offset: SoC-specific EINT register or interrupt offset of bank.
+ * @eint_num: eint pins counted before this bank (its first wakeup mask bit).
  * @eint_con_offset: ExynosAuto SoC-specific EINT register or interrupt offset of bank.
  * @eint_mask_offset: ExynosAuto SoC-specific EINT mask register offset of bank.
  * @eint_pend_offset: ExynosAuto SoC-specific EINT pend register offset of bank.
@@ -201,6 +204,7 @@ struct samsung_pin_bank {
 	enum eint_type	eint_type;
 	u32		eint_mask;
 	u32		eint_offset;
+	u32		eint_num;
 	u32		eint_con_offset;
 	u32		eint_mask_offset;
 	u32		eint_pend_offset;
diff --git a/include/linux/soc/samsung/exynos-regs-pmu.h b/include/linux/soc/samsung/exynos-regs-pmu.h
index 1a2c0e0838f9..938c6db235fb 100644
--- a/include/linux/soc/samsung/exynos-regs-pmu.h
+++ b/include/linux/soc/samsung/exynos-regs-pmu.h
@@ -669,6 +669,7 @@
 #define GS101_CPU_INFORM(cpu)	\
 			(GS101_CPU0_INFORM + (cpu*4))
 #define GS101_SYSTEM_CONFIGURATION				(0x3A00)
+#define GS101_EINT_WAKEUP_MASK					(0x3A80)
 #define GS101_PHY_CTRL_USB20					(0x3EB0)
 #define GS101_PHY_CTRL_USBDP					(0x3EB4)
 
-- 
cgit v1.2.3


From 51a4273dcab39dd1e850870945ccec664352d383 Mon Sep 17 00:00:00 2001
From: Nikunj A Dadhania <nikunj@amd.com>
Date: Tue, 8 Apr 2025 15:02:11 +0530
Subject: KVM: SVM: Add missing member in SNP_LAUNCH_START command structure

The sev_data_snp_launch_start structure should include a 4-byte
desired_tsc_khz field before the gosvw field, which was missed in the
initial implementation. As a result, the structure is 4 bytes shorter than
expected by the firmware, causing the gosvw field to start 4 bytes early.
Fix this by adding the missing 4-byte member for the desired TSC frequency.

Fixes: 3a45dc2b419e ("crypto: ccp: Define the SEV-SNP commands")
Cc: stable@vger.kernel.org
Suggested-by: Tom Lendacky <thomas.lendacky@amd.com>
Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com>
Tested-by: Vaishali Thakkar <vaishali.thakkar@suse.com>
Signed-off-by: Nikunj A Dadhania <nikunj@amd.com>
Link: https://lore.kernel.org/r/20250408093213.57962-3-nikunj@amd.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 include/linux/psp-sev.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h
index 0b3a36bdaa90..0f5f94137f6d 100644
--- a/include/linux/psp-sev.h
+++ b/include/linux/psp-sev.h
@@ -594,6 +594,7 @@ struct sev_data_snp_addr {
  * @imi_en: launch flow is launching an IMI (Incoming Migration Image) for the
  *          purpose of guest-assisted migration.
  * @rsvd: reserved
+ * @desired_tsc_khz: hypervisor desired mean TSC freq in kHz of the guest
  * @gosvw: guest OS-visible workarounds, as defined by hypervisor
  */
 struct sev_data_snp_launch_start {
@@ -603,6 +604,7 @@ struct sev_data_snp_launch_start {
 	u32 ma_en:1;				/* In */
 	u32 imi_en:1;				/* In */
 	u32 rsvd:30;
+	u32 desired_tsc_khz;			/* In */
 	u8 gosvw[16];				/* In */
 } __packed;
 
-- 
cgit v1.2.3


From bbc13ae593e0ea47357ff6e4740c533c16c2ae1e Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 22 May 2025 18:17:56 -0700
Subject: VFIO: KVM: x86: Drop kvm_arch_{start,end}_assignment()

Drop kvm_arch_{start,end}_assignment() and all associated code now that
KVM x86 no longer consumes assigned_device_count.  Tracking whether or not
a VFIO-assigned device is formally associated with a VM is fundamentally
flawed, as such an association is optional for general usage, i.e. is prone
to false negatives.  E.g. prior to commit 2edd9cb79fb3 ("kvm: detect
assigned device via irqbypass manager"), device passthrough via VFIO would
fail to enable IRQ bypass if userspace omitted the formal VFIO<=>KVM
binding.

And device drivers that *need* the VFIO<=>KVM connection, e.g. KVM-GT,
shouldn't be relying on generic x86 tracking infrastructure.

Cc: Jim Mattson <jmattson@google.com>
Link: https://lore.kernel.org/r/20250523011756.3243624-6-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/include/asm/kvm_host.h |  2 --
 arch/x86/kvm/x86.c              | 18 ------------------
 include/linux/kvm_host.h        | 18 ------------------
 virt/kvm/vfio.c                 |  3 ---
 4 files changed, 41 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7262bb11032e..c8ffb23d8441 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1381,8 +1381,6 @@ struct kvm_arch {
 
 #define __KVM_HAVE_ARCH_NONCOHERENT_DMA
 	atomic_t noncoherent_dma_count;
-#define __KVM_HAVE_ARCH_ASSIGNED_DEVICE
-	atomic_t assigned_device_count;
 	unsigned long nr_possible_bypass_irqs;
 
 #ifdef CONFIG_KVM_IOAPIC
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1508b77622b9..d898b8082e15 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -13438,24 +13438,6 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
 		return kvm_lapic_enabled(vcpu) && apf_pageready_slot_free(vcpu);
 }
 
-void kvm_arch_start_assignment(struct kvm *kvm)
-{
-	atomic_inc(&kvm->arch.assigned_device_count);
-}
-EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);
-
-void kvm_arch_end_assignment(struct kvm *kvm)
-{
-	atomic_dec(&kvm->arch.assigned_device_count);
-}
-EXPORT_SYMBOL_GPL(kvm_arch_end_assignment);
-
-bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm)
-{
-	return raw_atomic_read(&kvm->arch.assigned_device_count);
-}
-EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
-
 static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm)
 {
 	/*
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index fb9ec06aa807..15656b7fba6c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1690,24 +1690,6 @@ static inline bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
 	return false;
 }
 #endif
-#ifdef __KVM_HAVE_ARCH_ASSIGNED_DEVICE
-void kvm_arch_start_assignment(struct kvm *kvm);
-void kvm_arch_end_assignment(struct kvm *kvm);
-bool kvm_arch_has_assigned_device(struct kvm *kvm);
-#else
-static inline void kvm_arch_start_assignment(struct kvm *kvm)
-{
-}
-
-static inline void kvm_arch_end_assignment(struct kvm *kvm)
-{
-}
-
-static __always_inline bool kvm_arch_has_assigned_device(struct kvm *kvm)
-{
-	return false;
-}
-#endif
 
 static inline struct rcuwait *kvm_arch_vcpu_get_wait(struct kvm_vcpu *vcpu)
 {
diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
index 196a102e34fb..be50514bbd11 100644
--- a/virt/kvm/vfio.c
+++ b/virt/kvm/vfio.c
@@ -175,7 +175,6 @@ static int kvm_vfio_file_add(struct kvm_device *dev, unsigned int fd)
 	kvf->file = get_file(filp);
 	list_add_tail(&kvf->node, &kv->file_list);
 
-	kvm_arch_start_assignment(dev->kvm);
 	kvm_vfio_file_set_kvm(kvf->file, dev->kvm);
 	kvm_vfio_update_coherency(dev);
 
@@ -205,7 +204,6 @@ static int kvm_vfio_file_del(struct kvm_device *dev, unsigned int fd)
 			continue;
 
 		list_del(&kvf->node);
-		kvm_arch_end_assignment(dev->kvm);
 #ifdef CONFIG_SPAPR_TCE_IOMMU
 		kvm_spapr_tce_release_vfio_group(dev->kvm, kvf);
 #endif
@@ -336,7 +334,6 @@ static void kvm_vfio_release(struct kvm_device *dev)
 		fput(kvf->file);
 		list_del(&kvf->node);
 		kfree(kvf);
-		kvm_arch_end_assignment(dev->kvm);
 	}
 
 	kvm_vfio_update_coherency(dev);
-- 
cgit v1.2.3


From c8dc579169738a3546f57ecb38e62d3872a3cc04 Mon Sep 17 00:00:00 2001
From: Pratap Nirujogi <pratap.nirujogi@amd.com>
Date: Mon, 9 Jun 2025 11:53:56 -0400
Subject: i2c: amd-isp: Initialize unique adapter name

Initialize a unique name for the amdisp i2c adapter, which is used
in the platform driver to detect the matching adapter for
i2c_client creation.

Add the definition of the amdisp i2c adapter name in a new header file
(include/linux/soc/amd/isp4_misc.h) as it is referenced by different
driver modules.

Tested-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Pratap Nirujogi <pratap.nirujogi@amd.com>
Signed-off-by: Andi Shyti <andi.shyti@kernel.org>
Link: https://lore.kernel.org/r/20250609155601.1477055-3-pratap.nirujogi@amd.com
---
 MAINTAINERS                                |  1 +
 drivers/i2c/busses/i2c-designware-amdisp.c |  2 ++
 include/linux/soc/amd/isp4_misc.h          | 12 ++++++++++++
 3 files changed, 15 insertions(+)
 create mode 100644 include/linux/soc/amd/isp4_misc.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index c3f7fbd0d67a..8719f097aae3 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -24063,6 +24063,7 @@ M:	Bin Du <bin.du@amd.com>
 L:	linux-i2c@vger.kernel.org
 S:	Maintained
 F:	drivers/i2c/busses/i2c-designware-amdisp.c
+F:	include/linux/soc/amd/isp4_misc.h
 
 SYNOPSYS DESIGNWARE MMC/SD/SDIO DRIVER
 M:	Jaehoon Chung <jh80.chung@samsung.com>
diff --git a/drivers/i2c/busses/i2c-designware-amdisp.c b/drivers/i2c/busses/i2c-designware-amdisp.c
index ad6f08338124..450793d5f839 100644
--- a/drivers/i2c/busses/i2c-designware-amdisp.c
+++ b/drivers/i2c/busses/i2c-designware-amdisp.c
@@ -8,6 +8,7 @@
 #include <linux/module.h>
 #include <linux/platform_device.h>
 #include <linux/pm_runtime.h>
+#include <linux/soc/amd/isp4_misc.h>
 
 #include "i2c-designware-core.h"
 
@@ -62,6 +63,7 @@ static int amd_isp_dw_i2c_plat_probe(struct platform_device *pdev)
 
 	adap = &isp_i2c_dev->adapter;
 	adap->owner = THIS_MODULE;
+	scnprintf(adap->name, sizeof(adap->name), AMDISP_I2C_ADAP_NAME);
 	ACPI_COMPANION_SET(&adap->dev, ACPI_COMPANION(&pdev->dev));
 	adap->dev.of_node = pdev->dev.of_node;
 	/* use dynamically allocated adapter id */
diff --git a/include/linux/soc/amd/isp4_misc.h b/include/linux/soc/amd/isp4_misc.h
new file mode 100644
index 000000000000..6738796986a7
--- /dev/null
+++ b/include/linux/soc/amd/isp4_misc.h
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+/*
+ * Copyright (C) 2025 Advanced Micro Devices, Inc.
+ */
+
+#ifndef __SOC_ISP4_MISC_H
+#define __SOC_ISP4_MISC_H
+
+#define AMDISP_I2C_ADAP_NAME "AMDISP DesignWare I2C adapter"
+
+#endif
-- 
cgit v1.2.3


From aced132599b3c8884c050218d4c48eef203678f6 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Wed, 25 Jun 2025 09:40:24 -0700
Subject: bpf: Add range tracking for BPF_NEG

Add range tracking for instruction BPF_NEG. Without this logic, a trivial
program like the following will fail

    volatile bool found_value_b;
    SEC("lsm.s/socket_connect")
    int BPF_PROG(test_socket_connect)
    {
        if (!found_value_b)
                return -1;
        return 0;
    }

with verifier log:

"At program exit the register R0 has smin=0 smax=4294967295 should have
been in [-4095, 0]".

This is because range information is lost in BPF_NEG:

0: R1=ctx() R10=fp0
; if (!found_value_b) @ xxxx.c:24
0: (18) r1 = 0xffa00000011e7048       ; R1_w=map_value(...)
2: (71) r0 = *(u8 *)(r1 +0)           ; R0_w=scalar(smin32=0,smax=255)
3: (a4) w0 ^= 1                       ; R0_w=scalar(smin32=0,smax=255)
4: (84) w0 = -w0                      ; R0_w=scalar(range info lost)

Note that the log above was manually modified to highlight relevant bits.

Fix this by maintaining proper range information with BPF_NEG, so that
the verifier will know:

4: (84) w0 = -w0                      ; R0_w=scalar(smin32=-255,smax=0)

Also update the selftests to match the new expected behavior.

Signed-off-by: Song Liu <song@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20250625164025.3310203-2-song@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/tnum.h                               |  2 ++
 kernel/bpf/tnum.c                                  |  5 +++++
 kernel/bpf/verifier.c                              | 17 ++++++++++++++++-
 .../bpf/progs/verifier_bounds_deduction.c          | 11 +++++++----
 .../selftests/bpf/progs/verifier_value_ptr_arith.c | 22 ++++++++++++++++------
 5 files changed, 46 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/tnum.h b/include/linux/tnum.h
index 3c13240077b8..57ed3035cc30 100644
--- a/include/linux/tnum.h
+++ b/include/linux/tnum.h
@@ -40,6 +40,8 @@ struct tnum tnum_arshift(struct tnum a, u8 min_shift, u8 insn_bitness);
 struct tnum tnum_add(struct tnum a, struct tnum b);
 /* Subtract two tnums, return @a - @b */
 struct tnum tnum_sub(struct tnum a, struct tnum b);
+/* Neg of a tnum, return  0 - @a */
+struct tnum tnum_neg(struct tnum a);
 /* Bitwise-AND, return @a & @b */
 struct tnum tnum_and(struct tnum a, struct tnum b);
 /* Bitwise-OR, return @a | @b */
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index 9dbc31b25e3d..fa353c5d550f 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -83,6 +83,11 @@ struct tnum tnum_sub(struct tnum a, struct tnum b)
 	return TNUM(dv & ~mu, mu);
 }
 
+struct tnum tnum_neg(struct tnum a)
+{
+	return tnum_sub(TNUM(0, 0), a);
+}
+
 struct tnum tnum_and(struct tnum a, struct tnum b)
 {
 	u64 alpha, beta, v;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f403524bd215..2ff22ef42348 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -15182,6 +15182,7 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn,
 	switch (BPF_OP(insn->code)) {
 	case BPF_ADD:
 	case BPF_SUB:
+	case BPF_NEG:
 	case BPF_AND:
 	case BPF_XOR:
 	case BPF_OR:
@@ -15250,6 +15251,13 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 		scalar_min_max_sub(dst_reg, &src_reg);
 		dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off);
 		break;
+	case BPF_NEG:
+		env->fake_reg[0] = *dst_reg;
+		__mark_reg_known(dst_reg, 0);
+		scalar32_min_max_sub(dst_reg, &env->fake_reg[0]);
+		scalar_min_max_sub(dst_reg, &env->fake_reg[0]);
+		dst_reg->var_off = tnum_neg(env->fake_reg[0].var_off);
+		break;
 	case BPF_MUL:
 		dst_reg->var_off = tnum_mul(dst_reg->var_off, src_reg.var_off);
 		scalar32_min_max_mul(dst_reg, &src_reg);
@@ -15473,7 +15481,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 		}
 
 		/* check dest operand */
-		err = check_reg_arg(env, insn->dst_reg, DST_OP);
+		if (opcode == BPF_NEG) {
+			err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
+			err = err ?: adjust_scalar_min_max_vals(env, insn,
+							 &regs[insn->dst_reg],
+							 regs[insn->dst_reg]);
+		} else {
+			err = check_reg_arg(env, insn->dst_reg, DST_OP);
+		}
 		if (err)
 			return err;
 
diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds_deduction.c b/tools/testing/selftests/bpf/progs/verifier_bounds_deduction.c
index c506afbdd936..260a6df264e3 100644
--- a/tools/testing/selftests/bpf/progs/verifier_bounds_deduction.c
+++ b/tools/testing/selftests/bpf/progs/verifier_bounds_deduction.c
@@ -159,13 +159,16 @@ __failure_unpriv
 __naked void deducing_bounds_from_const_10(void)
 {
 	asm volatile ("					\
+	r6 = r1;					\
 	r0 = 0;						\
 	if r0 s<= 0 goto l0_%=;				\
-l0_%=:	/* Marks reg as unknown. */			\
-	r0 = -r0;					\
-	r0 -= r1;					\
+l0_%=: /* Marks r0 as unknown. */			\
+	call %[bpf_get_prandom_u32];			\
+	r0 -= r6;					\
 	exit;						\
-"	::: __clobber_all);
+"	:
+	: __imm(bpf_get_prandom_u32)
+	: __clobber_all);
 }
 
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/verifier_value_ptr_arith.c b/tools/testing/selftests/bpf/progs/verifier_value_ptr_arith.c
index fcea9819e359..af7938ce56cb 100644
--- a/tools/testing/selftests/bpf/progs/verifier_value_ptr_arith.c
+++ b/tools/testing/selftests/bpf/progs/verifier_value_ptr_arith.c
@@ -231,6 +231,10 @@ __retval(1)
 __naked void ptr_unknown_vs_unknown_lt(void)
 {
 	asm volatile ("					\
+	r8 = r1;					\
+	call %[bpf_get_prandom_u32];			\
+	r9 = r0;					\
+	r1 = r8;					\
 	r0 = *(u32*)(r1 + %[__sk_buff_len]);		\
 	r1 = 0;						\
 	*(u64*)(r10 - 8) = r1;				\
@@ -245,11 +249,11 @@ l1_%=:	call %[bpf_map_lookup_elem];			\
 	r4 = *(u8*)(r0 + 0);				\
 	if r4 == 1 goto l3_%=;				\
 	r1 = 6;						\
-	r1 = -r1;					\
+	r1 = r9;					\
 	r1 &= 0x3;					\
 	goto l4_%=;					\
 l3_%=:	r1 = 6;						\
-	r1 = -r1;					\
+	r1 = r9;					\
 	r1 &= 0x7;					\
 l4_%=:	r1 += r0;					\
 	r0 = *(u8*)(r1 + 0);				\
@@ -259,7 +263,8 @@ l2_%=:	r0 = 1;						\
 	: __imm(bpf_map_lookup_elem),
 	  __imm_addr(map_array_48b),
 	  __imm_addr(map_hash_16b),
-	  __imm_const(__sk_buff_len, offsetof(struct __sk_buff, len))
+	  __imm_const(__sk_buff_len, offsetof(struct __sk_buff, len)),
+	  __imm(bpf_get_prandom_u32)
 	: __clobber_all);
 }
 
@@ -271,6 +276,10 @@ __retval(1)
 __naked void ptr_unknown_vs_unknown_gt(void)
 {
 	asm volatile ("					\
+	r8 = r1;					\
+	call %[bpf_get_prandom_u32];			\
+	r9 = r0;					\
+	r1 = r8;					\
 	r0 = *(u32*)(r1 + %[__sk_buff_len]);		\
 	r1 = 0;						\
 	*(u64*)(r10 - 8) = r1;				\
@@ -285,11 +294,11 @@ l1_%=:	call %[bpf_map_lookup_elem];			\
 	r4 = *(u8*)(r0 + 0);				\
 	if r4 == 1 goto l3_%=;				\
 	r1 = 6;						\
-	r1 = -r1;					\
+	r1 = r9;					\
 	r1 &= 0x7;					\
 	goto l4_%=;					\
 l3_%=:	r1 = 6;						\
-	r1 = -r1;					\
+	r1 = r9;					\
 	r1 &= 0x3;					\
 l4_%=:	r1 += r0;					\
 	r0 = *(u8*)(r1 + 0);				\
@@ -299,7 +308,8 @@ l2_%=:	r0 = 1;						\
 	: __imm(bpf_map_lookup_elem),
 	  __imm_addr(map_array_48b),
 	  __imm_addr(map_hash_16b),
-	  __imm_const(__sk_buff_len, offsetof(struct __sk_buff, len))
+	  __imm_const(__sk_buff_len, offsetof(struct __sk_buff, len)),
+	  __imm(bpf_get_prandom_u32)
 	: __clobber_all);
 }
 
-- 
cgit v1.2.3


From d83caf7c8dad96051267c18786b7bc446b537f3c Mon Sep 17 00:00:00 2001
From: Anton Protopopov <a.s.protopopov@gmail.com>
Date: Wed, 25 Jun 2025 15:16:21 +0000
Subject: bpf: add btf_type_is_i{32,64} helpers

There are places in BPF code which check if a BTF type is an integer
of particular size. This code can be made simpler by using helpers.
Add new btf_type_is_i{32,64} helpers, and simplify code in a few
files. (Suggested by Eduard for a patch which copy-pasted such a
check [1].)

  v1 -> v2:
    * export less generic helpers (Eduard)
    * make subject less generic than in [v1] (Eduard)

[1] https://lore.kernel.org/bpf/7edb47e73baa46705119a23c6bf4af26517a640f.camel@gmail.com/
[v1] https://lore.kernel.org/bpf/20250624193655.733050-1-a.s.protopopov@gmail.com/

Suggested-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20250625151621.1000584-1-a.s.protopopov@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/btf.h            |  2 ++
 kernel/bpf/arraymap.c          | 11 +++--------
 kernel/bpf/bpf_local_storage.c |  8 +-------
 kernel/bpf/btf.c               | 41 ++++++++++++++++++++++++++---------------
 kernel/bpf/local_storage.c     |  9 +--------
 5 files changed, 33 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/include/linux/btf.h b/include/linux/btf.h
index b2983706292f..a40beb9cf160 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -221,6 +221,8 @@ bool btf_is_vmlinux(const struct btf *btf);
 struct module *btf_try_get_module(const struct btf *btf);
 u32 btf_nr_types(const struct btf *btf);
 struct btf *btf_base_btf(const struct btf *btf);
+bool btf_type_is_i32(const struct btf_type *t);
+bool btf_type_is_i64(const struct btf_type *t);
 bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
 			   const struct btf_member *m,
 			   u32 expected_offset, u32 expected_size);
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index eb28c0f219ee..3d080916faf9 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -530,8 +530,6 @@ static int array_map_check_btf(const struct bpf_map *map,
 			       const struct btf_type *key_type,
 			       const struct btf_type *value_type)
 {
-	u32 int_data;
-
 	/* One exception for keyless BTF: .bss/.data/.rodata map */
 	if (btf_type_is_void(key_type)) {
 		if (map->map_type != BPF_MAP_TYPE_ARRAY ||
@@ -544,14 +542,11 @@ static int array_map_check_btf(const struct bpf_map *map,
 		return 0;
 	}
 
-	if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
-		return -EINVAL;
-
-	int_data = *(u32 *)(key_type + 1);
-	/* bpf array can only take a u32 key. This check makes sure
+	/*
+	 * Bpf array can only take a u32 key. This check makes sure
 	 * that the btf matches the attr used during map_create.
 	 */
-	if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data))
+	if (!btf_type_is_i32(key_type))
 		return -EINVAL;
 
 	return 0;
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index fa56c30833ff..b931fbceb54d 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -722,13 +722,7 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map,
 				    const struct btf_type *key_type,
 				    const struct btf_type *value_type)
 {
-	u32 int_data;
-
-	if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
-		return -EINVAL;
-
-	int_data = *(u32 *)(key_type + 1);
-	if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data))
+	if (!btf_type_is_i32(key_type))
 		return -EINVAL;
 
 	return 0;
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 682acb1ed234..05fd64a371af 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -858,26 +858,37 @@ const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
 EXPORT_SYMBOL_GPL(btf_type_by_id);
 
 /*
- * Regular int is not a bit field and it must be either
- * u8/u16/u32/u64 or __int128.
+ * Check that the type @t is a regular int. This means that @t is not
+ * a bit field and it has the same size as either of u8/u16/u32/u64
+ * or __int128. If @expected_size is not zero, then size of @t should
+ * be the same. A caller should already have checked that the type @t
+ * is an integer.
  */
+static bool __btf_type_int_is_regular(const struct btf_type *t, size_t expected_size)
+{
+	u32 int_data = btf_type_int(t);
+	u8 nr_bits = BTF_INT_BITS(int_data);
+	u8 nr_bytes = BITS_ROUNDUP_BYTES(nr_bits);
+
+	return BITS_PER_BYTE_MASKED(nr_bits) == 0 &&
+	       BTF_INT_OFFSET(int_data) == 0 &&
+	       (nr_bytes <= 16 && is_power_of_2(nr_bytes)) &&
+	       (expected_size == 0 || nr_bytes == expected_size);
+}
+
 static bool btf_type_int_is_regular(const struct btf_type *t)
 {
-	u8 nr_bits, nr_bytes;
-	u32 int_data;
+	return __btf_type_int_is_regular(t, 0);
+}
 
-	int_data = btf_type_int(t);
-	nr_bits = BTF_INT_BITS(int_data);
-	nr_bytes = BITS_ROUNDUP_BYTES(nr_bits);
-	if (BITS_PER_BYTE_MASKED(nr_bits) ||
-	    BTF_INT_OFFSET(int_data) ||
-	    (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) &&
-	     nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64) &&
-	     nr_bytes != (2 * sizeof(u64)))) {
-		return false;
-	}
+bool btf_type_is_i32(const struct btf_type *t)
+{
+	return btf_type_is_int(t) && __btf_type_int_is_regular(t, 4);
+}
 
-	return true;
+bool btf_type_is_i64(const struct btf_type *t)
+{
+	return btf_type_is_int(t) && __btf_type_int_is_regular(t, 8);
 }
 
 /*
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 3969eb0382af..632d51b05fe9 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -394,17 +394,10 @@ static int cgroup_storage_check_btf(const struct bpf_map *map,
 		if (!btf_member_is_reg_int(btf, key_type, m, offset, size))
 			return -EINVAL;
 	} else {
-		u32 int_data;
-
 		/*
 		 * Key is expected to be u64, which stores the cgroup_inode_id
 		 */
-
-		if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
-			return -EINVAL;
-
-		int_data = *(u32 *)(key_type + 1);
-		if (BTF_INT_BITS(int_data) != 64 || BTF_INT_OFFSET(int_data))
+		if (!btf_type_is_i64(key_type))
 			return -EINVAL;
 	}
 
-- 
cgit v1.2.3


From bfb4fb77f9a8ce33ce357224569eae5564eec573 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@fomichev.me>
Date: Mon, 23 Jun 2025 08:31:47 -0700
Subject: team: replace team lock with rtnl lock

syzbot reports various ordering issues for lower instance locks and
team lock. Switch to using rtnl lock for protecting team device,
similar to bonding. Based on the patch by Tetsuo Handa.

Cc: Jiri Pirko <jiri@resnulli.us>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reported-by: syzbot+705c61d60b091ef42c04@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=705c61d60b091ef42c04
Reported-by: syzbot+71fd22ae4b81631e22fd@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=71fd22ae4b81631e22fd
Fixes: 6b1d3c5f675c ("team: grab team lock during team_change_rx_flags")
Link: https://lkml.kernel.org/r/ZoZ2RH9BcahEB9Sb@nanopsycho.orion
Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20250623153147.3413631-1-sdf@fomichev.me
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/team/team_core.c              | 96 ++++++++++++++-----------------
 drivers/net/team/team_mode_activebackup.c |  3 +-
 drivers/net/team/team_mode_loadbalance.c  | 13 ++---
 include/linux/if_team.h                   |  3 -
 4 files changed, 50 insertions(+), 65 deletions(-)

(limited to 'include')

diff --git a/drivers/net/team/team_core.c b/drivers/net/team/team_core.c
index 8bc56186b2a3..17f07eb0ee52 100644
--- a/drivers/net/team/team_core.c
+++ b/drivers/net/team/team_core.c
@@ -933,7 +933,7 @@ static bool team_port_find(const struct team *team,
  * Enable/disable port by adding to enabled port hashlist and setting
  * port->index (Might be racy so reader could see incorrect ifindex when
  * processing a flying packet, but that is not a problem). Write guarded
- * by team->lock.
+ * by RTNL.
  */
 static void team_port_enable(struct team *team,
 			     struct team_port *port)
@@ -1660,8 +1660,6 @@ static int team_init(struct net_device *dev)
 		goto err_options_register;
 	netif_carrier_off(dev);
 
-	lockdep_register_key(&team->team_lock_key);
-	__mutex_init(&team->lock, "team->team_lock_key", &team->team_lock_key);
 	netdev_lockdep_set_classes(dev);
 
 	return 0;
@@ -1682,7 +1680,8 @@ static void team_uninit(struct net_device *dev)
 	struct team_port *port;
 	struct team_port *tmp;
 
-	mutex_lock(&team->lock);
+	ASSERT_RTNL();
+
 	list_for_each_entry_safe(port, tmp, &team->port_list, list)
 		team_port_del(team, port->dev);
 
@@ -1691,9 +1690,7 @@ static void team_uninit(struct net_device *dev)
 	team_mcast_rejoin_fini(team);
 	team_notify_peers_fini(team);
 	team_queue_override_fini(team);
-	mutex_unlock(&team->lock);
 	netdev_change_features(dev);
-	lockdep_unregister_key(&team->team_lock_key);
 }
 
 static void team_destructor(struct net_device *dev)
@@ -1778,7 +1775,8 @@ static void team_change_rx_flags(struct net_device *dev, int change)
 	struct team_port *port;
 	int inc;
 
-	mutex_lock(&team->lock);
+	ASSERT_RTNL();
+
 	list_for_each_entry(port, &team->port_list, list) {
 		if (change & IFF_PROMISC) {
 			inc = dev->flags & IFF_PROMISC ? 1 : -1;
@@ -1789,7 +1787,6 @@ static void team_change_rx_flags(struct net_device *dev, int change)
 			dev_set_allmulti(port->dev, inc);
 		}
 	}
-	mutex_unlock(&team->lock);
 }
 
 static void team_set_rx_mode(struct net_device *dev)
@@ -1811,14 +1808,14 @@ static int team_set_mac_address(struct net_device *dev, void *p)
 	struct team *team = netdev_priv(dev);
 	struct team_port *port;
 
+	ASSERT_RTNL();
+
 	if (dev->type == ARPHRD_ETHER && !is_valid_ether_addr(addr->sa_data))
 		return -EADDRNOTAVAIL;
 	dev_addr_set(dev, addr->sa_data);
-	mutex_lock(&team->lock);
 	list_for_each_entry(port, &team->port_list, list)
 		if (team->ops.port_change_dev_addr)
 			team->ops.port_change_dev_addr(team, port);
-	mutex_unlock(&team->lock);
 	return 0;
 }
 
@@ -1828,11 +1825,8 @@ static int team_change_mtu(struct net_device *dev, int new_mtu)
 	struct team_port *port;
 	int err;
 
-	/*
-	 * Alhough this is reader, it's guarded by team lock. It's not possible
-	 * to traverse list in reverse under rcu_read_lock
-	 */
-	mutex_lock(&team->lock);
+	ASSERT_RTNL();
+
 	team->port_mtu_change_allowed = true;
 	list_for_each_entry(port, &team->port_list, list) {
 		err = dev_set_mtu(port->dev, new_mtu);
@@ -1843,7 +1837,6 @@ static int team_change_mtu(struct net_device *dev, int new_mtu)
 		}
 	}
 	team->port_mtu_change_allowed = false;
-	mutex_unlock(&team->lock);
 
 	WRITE_ONCE(dev->mtu, new_mtu);
 
@@ -1853,7 +1846,6 @@ unwind:
 	list_for_each_entry_continue_reverse(port, &team->port_list, list)
 		dev_set_mtu(port->dev, dev->mtu);
 	team->port_mtu_change_allowed = false;
-	mutex_unlock(&team->lock);
 
 	return err;
 }
@@ -1903,24 +1895,19 @@ static int team_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid)
 	struct team_port *port;
 	int err;
 
-	/*
-	 * Alhough this is reader, it's guarded by team lock. It's not possible
-	 * to traverse list in reverse under rcu_read_lock
-	 */
-	mutex_lock(&team->lock);
+	ASSERT_RTNL();
+
 	list_for_each_entry(port, &team->port_list, list) {
 		err = vlan_vid_add(port->dev, proto, vid);
 		if (err)
 			goto unwind;
 	}
-	mutex_unlock(&team->lock);
 
 	return 0;
 
 unwind:
 	list_for_each_entry_continue_reverse(port, &team->port_list, list)
 		vlan_vid_del(port->dev, proto, vid);
-	mutex_unlock(&team->lock);
 
 	return err;
 }
@@ -1930,10 +1917,10 @@ static int team_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid)
 	struct team *team = netdev_priv(dev);
 	struct team_port *port;
 
-	mutex_lock(&team->lock);
+	ASSERT_RTNL();
+
 	list_for_each_entry(port, &team->port_list, list)
 		vlan_vid_del(port->dev, proto, vid);
-	mutex_unlock(&team->lock);
 
 	return 0;
 }
@@ -1955,9 +1942,9 @@ static void team_netpoll_cleanup(struct net_device *dev)
 {
 	struct team *team = netdev_priv(dev);
 
-	mutex_lock(&team->lock);
+	ASSERT_RTNL();
+
 	__team_netpoll_cleanup(team);
-	mutex_unlock(&team->lock);
 }
 
 static int team_netpoll_setup(struct net_device *dev)
@@ -1966,7 +1953,8 @@ static int team_netpoll_setup(struct net_device *dev)
 	struct team_port *port;
 	int err = 0;
 
-	mutex_lock(&team->lock);
+	ASSERT_RTNL();
+
 	list_for_each_entry(port, &team->port_list, list) {
 		err = __team_port_enable_netpoll(port);
 		if (err) {
@@ -1974,7 +1962,6 @@ static int team_netpoll_setup(struct net_device *dev)
 			break;
 		}
 	}
-	mutex_unlock(&team->lock);
 	return err;
 }
 #endif
@@ -1985,9 +1972,9 @@ static int team_add_slave(struct net_device *dev, struct net_device *port_dev,
 	struct team *team = netdev_priv(dev);
 	int err;
 
-	mutex_lock(&team->lock);
+	ASSERT_RTNL();
+
 	err = team_port_add(team, port_dev, extack);
-	mutex_unlock(&team->lock);
 
 	if (!err)
 		netdev_change_features(dev);
@@ -2000,18 +1987,13 @@ static int team_del_slave(struct net_device *dev, struct net_device *port_dev)
 	struct team *team = netdev_priv(dev);
 	int err;
 
-	mutex_lock(&team->lock);
+	ASSERT_RTNL();
+
 	err = team_port_del(team, port_dev);
-	mutex_unlock(&team->lock);
 
 	if (err)
 		return err;
 
-	if (netif_is_team_master(port_dev)) {
-		lockdep_unregister_key(&team->team_lock_key);
-		lockdep_register_key(&team->team_lock_key);
-		lockdep_set_class(&team->lock, &team->team_lock_key);
-	}
 	netdev_change_features(dev);
 
 	return err;
@@ -2304,9 +2286,10 @@ err_msg_put:
 static struct team *team_nl_team_get(struct genl_info *info)
 {
 	struct net *net = genl_info_net(info);
-	int ifindex;
 	struct net_device *dev;
-	struct team *team;
+	int ifindex;
+
+	ASSERT_RTNL();
 
 	if (!info->attrs[TEAM_ATTR_TEAM_IFINDEX])
 		return NULL;
@@ -2318,14 +2301,11 @@ static struct team *team_nl_team_get(struct genl_info *info)
 		return NULL;
 	}
 
-	team = netdev_priv(dev);
-	mutex_lock(&team->lock);
-	return team;
+	return netdev_priv(dev);
 }
 
 static void team_nl_team_put(struct team *team)
 {
-	mutex_unlock(&team->lock);
 	dev_put(team->dev);
 }
 
@@ -2515,9 +2495,13 @@ int team_nl_options_get_doit(struct sk_buff *skb, struct genl_info *info)
 	int err;
 	LIST_HEAD(sel_opt_inst_list);
 
+	rtnl_lock();
+
 	team = team_nl_team_get(info);
-	if (!team)
-		return -EINVAL;
+	if (!team) {
+		err = -EINVAL;
+		goto rtnl_unlock;
+	}
 
 	list_for_each_entry(opt_inst, &team->option_inst_list, list)
 		list_add_tail(&opt_inst->tmp_list, &sel_opt_inst_list);
@@ -2527,6 +2511,9 @@ int team_nl_options_get_doit(struct sk_buff *skb, struct genl_info *info)
 
 	team_nl_team_put(team);
 
+rtnl_unlock:
+	rtnl_unlock();
+
 	return err;
 }
 
@@ -2805,15 +2792,22 @@ int team_nl_port_list_get_doit(struct sk_buff *skb,
 	struct team *team;
 	int err;
 
+	rtnl_lock();
+
 	team = team_nl_team_get(info);
-	if (!team)
-		return -EINVAL;
+	if (!team) {
+		err = -EINVAL;
+		goto rtnl_unlock;
+	}
 
 	err = team_nl_send_port_list_get(team, info->snd_portid, info->snd_seq,
 					 NLM_F_ACK, team_nl_send_unicast, NULL);
 
 	team_nl_team_put(team);
 
+rtnl_unlock:
+	rtnl_unlock();
+
 	return err;
 }
 
@@ -2961,11 +2955,9 @@ static void __team_port_change_port_removed(struct team_port *port)
 
 static void team_port_change_check(struct team_port *port, bool linkup)
 {
-	struct team *team = port->team;
+	ASSERT_RTNL();
 
-	mutex_lock(&team->lock);
 	__team_port_change_check(port, linkup);
-	mutex_unlock(&team->lock);
 }
 
 
diff --git a/drivers/net/team/team_mode_activebackup.c b/drivers/net/team/team_mode_activebackup.c
index e0f599e2a51d..1c3336c7a1b2 100644
--- a/drivers/net/team/team_mode_activebackup.c
+++ b/drivers/net/team/team_mode_activebackup.c
@@ -67,8 +67,7 @@ static void ab_active_port_get(struct team *team, struct team_gsetter_ctx *ctx)
 {
 	struct team_port *active_port;
 
-	active_port = rcu_dereference_protected(ab_priv(team)->active_port,
-						lockdep_is_held(&team->lock));
+	active_port = rtnl_dereference(ab_priv(team)->active_port);
 	if (active_port)
 		ctx->data.u32_val = active_port->dev->ifindex;
 	else
diff --git a/drivers/net/team/team_mode_loadbalance.c b/drivers/net/team/team_mode_loadbalance.c
index 00f8989c29c0..b14538bde2f8 100644
--- a/drivers/net/team/team_mode_loadbalance.c
+++ b/drivers/net/team/team_mode_loadbalance.c
@@ -301,8 +301,7 @@ static int lb_bpf_func_set(struct team *team, struct team_gsetter_ctx *ctx)
 	if (lb_priv->ex->orig_fprog) {
 		/* Clear old filter data */
 		__fprog_destroy(lb_priv->ex->orig_fprog);
-		orig_fp = rcu_dereference_protected(lb_priv->fp,
-						lockdep_is_held(&team->lock));
+		orig_fp = rtnl_dereference(lb_priv->fp);
 	}
 
 	rcu_assign_pointer(lb_priv->fp, fp);
@@ -324,8 +323,7 @@ static void lb_bpf_func_free(struct team *team)
 		return;
 
 	__fprog_destroy(lb_priv->ex->orig_fprog);
-	fp = rcu_dereference_protected(lb_priv->fp,
-				       lockdep_is_held(&team->lock));
+	fp = rtnl_dereference(lb_priv->fp);
 	bpf_prog_destroy(fp);
 }
 
@@ -335,8 +333,7 @@ static void lb_tx_method_get(struct team *team, struct team_gsetter_ctx *ctx)
 	lb_select_tx_port_func_t *func;
 	char *name;
 
-	func = rcu_dereference_protected(lb_priv->select_tx_port_func,
-					 lockdep_is_held(&team->lock));
+	func = rtnl_dereference(lb_priv->select_tx_port_func);
 	name = lb_select_tx_port_get_name(func);
 	BUG_ON(!name);
 	ctx->data.str_val = name;
@@ -478,7 +475,7 @@ static void lb_stats_refresh(struct work_struct *work)
 	team = lb_priv_ex->team;
 	lb_priv = get_lb_priv(team);
 
-	if (!mutex_trylock(&team->lock)) {
+	if (!rtnl_trylock()) {
 		schedule_delayed_work(&lb_priv_ex->stats.refresh_dw, 0);
 		return;
 	}
@@ -515,7 +512,7 @@ static void lb_stats_refresh(struct work_struct *work)
 	schedule_delayed_work(&lb_priv_ex->stats.refresh_dw,
 			      (lb_priv_ex->stats.refresh_interval * HZ) / 10);
 
-	mutex_unlock(&team->lock);
+	rtnl_unlock();
 }
 
 static void lb_stats_refresh_interval_get(struct team *team,
diff --git a/include/linux/if_team.h b/include/linux/if_team.h
index cdc684e04a2f..ce97d891cf72 100644
--- a/include/linux/if_team.h
+++ b/include/linux/if_team.h
@@ -191,8 +191,6 @@ struct team {
 
 	const struct header_ops *header_ops_cache;
 
-	struct mutex lock; /* used for overall locking, e.g. port lists write */
-
 	/*
 	 * List of enabled ports and their count
 	 */
@@ -223,7 +221,6 @@ struct team {
 		atomic_t count_pending;
 		struct delayed_work dw;
 	} mcast_rejoin;
-	struct lock_class_key team_lock_key;
 	long mode_priv[TEAM_MODE_PRIV_LONGS];
 };
 
-- 
cgit v1.2.3


From 826334359eacc1b70e9752ebc4954ed775dd40ca Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 23 Jun 2025 16:17:13 -0700
Subject: netlink: specs: add the multicast group name to spec

Add the multicast group's name to the YAML spec.
Without it YNL doesn't know how to subscribe to notifications.

Reviewed-by: Donald Hunter <donald.hunter@gmail.com>
Link: https://patch.msgid.link/20250623231720.3124717-2-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/ethtool.yaml       | 6 ++++++
 include/uapi/linux/ethtool_netlink.h           | 2 --
 include/uapi/linux/ethtool_netlink_generated.h | 2 ++
 3 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml
index c1651e175e8b..cfe84f84ba29 100644
--- a/Documentation/netlink/specs/ethtool.yaml
+++ b/Documentation/netlink/specs/ethtool.yaml
@@ -2492,3 +2492,9 @@ operations:
         attributes:
           - header
           - events
+
+mcast-groups:
+  list:
+    -
+      name: monitor
+      c-define-name: ethtool-mcgrp-monitor-name
diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h
index 09a75bdb6560..fa5d645140a4 100644
--- a/include/uapi/linux/ethtool_netlink.h
+++ b/include/uapi/linux/ethtool_netlink.h
@@ -208,6 +208,4 @@ enum {
 	ETHTOOL_A_STATS_PHY_MAX = (__ETHTOOL_A_STATS_PHY_CNT - 1)
 };
 
-#define ETHTOOL_MCGRP_MONITOR_NAME "monitor"
-
 #endif /* _UAPI_LINUX_ETHTOOL_NETLINK_H_ */
diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h
index 4944badf9fba..859e28c8a91a 100644
--- a/include/uapi/linux/ethtool_netlink_generated.h
+++ b/include/uapi/linux/ethtool_netlink_generated.h
@@ -867,4 +867,6 @@ enum {
 	ETHTOOL_MSG_KERNEL_MAX = (__ETHTOOL_MSG_KERNEL_CNT - 1)
 };
 
+#define ETHTOOL_MCGRP_MONITOR_NAME	"monitor"
+
 #endif /* _UAPI_LINUX_ETHTOOL_NETLINK_GENERATED_H */
-- 
cgit v1.2.3


From f9dc3e52d821dc1f9afeec43fb1c18ac94bd587a Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 23 Jun 2025 16:17:16 -0700
Subject: net: ethtool: remove the data argument from ethtool_notify()

ethtool_notify() takes a const void *data argument, which presumably
was intended to pass information from the call site to the subcommand
handler. This argument currently has no users.

Expecting the data to be subcommand-specific has two complications.

Complication #1 is that it's not plumbed through any of the standardized
callbacks. It gets propagated to ethnl_default_notify() where it
remains unused. Coming from the ethnl_default_set_doit() side we pass
in NULL, because how could we have a command specific attribute in
a generic handler.

Complication #2 is that we expect the ethtool_notify() callers to
know what attribute type to pass in. Again, the data pointer is
untyped.

RSS will need to pass the context ID to the notifications.
I think it's a better design if the "subcommand" exports its own
typed interface and constructs the appropriate argument struct
(which will be req_info). Remove the unused data argument from
ethtool_notify() but retain it in a new internal helper which
subcommands can use to build a typed interface.

Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Tested-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Link: https://patch.msgid.link/20250623231720.3124717-5-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h |  5 ++---
 net/ethtool/ioctl.c       | 24 ++++++++++++------------
 net/ethtool/netlink.c     | 11 ++++++++---
 net/ethtool/netlink.h     |  1 +
 4 files changed, 23 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 03c26bb0fbbe..db5bfd4e7ec8 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -5138,10 +5138,9 @@ void netdev_bonding_info_change(struct net_device *dev,
 				struct netdev_bonding_info *bonding_info);
 
 #if IS_ENABLED(CONFIG_ETHTOOL_NETLINK)
-void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data);
+void ethtool_notify(struct net_device *dev, unsigned int cmd);
 #else
-static inline void ethtool_notify(struct net_device *dev, unsigned int cmd,
-				  const void *data)
+static inline void ethtool_notify(struct net_device *dev, unsigned int cmd)
 {
 }
 #endif
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index 82cde640aa87..96da9d18789b 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -617,8 +617,8 @@ static int ethtool_set_link_ksettings(struct net_device *dev,
 
 	err = dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings);
 	if (err >= 0) {
-		ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF, NULL);
-		ethtool_notify(dev, ETHTOOL_MSG_LINKMODES_NTF, NULL);
+		ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF);
+		ethtool_notify(dev, ETHTOOL_MSG_LINKMODES_NTF);
 	}
 	return err;
 }
@@ -708,8 +708,8 @@ static int ethtool_set_settings(struct net_device *dev, void __user *useraddr)
 		__ETHTOOL_LINK_MODE_MASK_NU32;
 	ret = dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings);
 	if (ret >= 0) {
-		ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF, NULL);
-		ethtool_notify(dev, ETHTOOL_MSG_LINKMODES_NTF, NULL);
+		ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF);
+		ethtool_notify(dev, ETHTOOL_MSG_LINKMODES_NTF);
 	}
 	return ret;
 }
@@ -1868,7 +1868,7 @@ static int ethtool_set_wol(struct net_device *dev, char __user *useraddr)
 		return ret;
 
 	dev->ethtool->wol_enabled = !!wol.wolopts;
-	ethtool_notify(dev, ETHTOOL_MSG_WOL_NTF, NULL);
+	ethtool_notify(dev, ETHTOOL_MSG_WOL_NTF);
 
 	return 0;
 }
@@ -1944,7 +1944,7 @@ static int ethtool_set_eee(struct net_device *dev, char __user *useraddr)
 	eee_to_keee(&keee, &eee);
 	ret = dev->ethtool_ops->set_eee(dev, &keee);
 	if (!ret)
-		ethtool_notify(dev, ETHTOOL_MSG_EEE_NTF, NULL);
+		ethtool_notify(dev, ETHTOOL_MSG_EEE_NTF);
 	return ret;
 }
 
@@ -2184,7 +2184,7 @@ static noinline_for_stack int ethtool_set_coalesce(struct net_device *dev,
 	ret = dev->ethtool_ops->set_coalesce(dev, &coalesce, &kernel_coalesce,
 					     NULL);
 	if (!ret)
-		ethtool_notify(dev, ETHTOOL_MSG_COALESCE_NTF, NULL);
+		ethtool_notify(dev, ETHTOOL_MSG_COALESCE_NTF);
 	return ret;
 }
 
@@ -2228,7 +2228,7 @@ static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr)
 	ret = dev->ethtool_ops->set_ringparam(dev, &ringparam,
 					      &kernel_ringparam, NULL);
 	if (!ret)
-		ethtool_notify(dev, ETHTOOL_MSG_RINGS_NTF, NULL);
+		ethtool_notify(dev, ETHTOOL_MSG_RINGS_NTF);
 	return ret;
 }
 
@@ -2295,7 +2295,7 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
 
 	ret = dev->ethtool_ops->set_channels(dev, &channels);
 	if (!ret)
-		ethtool_notify(dev, ETHTOOL_MSG_CHANNELS_NTF, NULL);
+		ethtool_notify(dev, ETHTOOL_MSG_CHANNELS_NTF);
 	return ret;
 }
 
@@ -2326,7 +2326,7 @@ static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr)
 
 	ret = dev->ethtool_ops->set_pauseparam(dev, &pauseparam);
 	if (!ret)
-		ethtool_notify(dev, ETHTOOL_MSG_PAUSE_NTF, NULL);
+		ethtool_notify(dev, ETHTOOL_MSG_PAUSE_NTF);
 	return ret;
 }
 
@@ -3328,7 +3328,7 @@ __dev_ethtool(struct net *net, struct ifreq *ifr, void __user *useraddr,
 		rc = ethtool_set_value_void(dev, useraddr,
 				       dev->ethtool_ops->set_msglevel);
 		if (!rc)
-			ethtool_notify(dev, ETHTOOL_MSG_DEBUG_NTF, NULL);
+			ethtool_notify(dev, ETHTOOL_MSG_DEBUG_NTF);
 		break;
 	case ETHTOOL_GEEE:
 		rc = ethtool_get_eee(dev, useraddr);
@@ -3392,7 +3392,7 @@ __dev_ethtool(struct net *net, struct ifreq *ifr, void __user *useraddr,
 		rc = ethtool_get_value(dev, useraddr, ethcmd,
 				       dev->ethtool_ops->get_priv_flags);
 		if (!rc)
-			ethtool_notify(dev, ETHTOOL_MSG_PRIVFLAGS_NTF, NULL);
+			ethtool_notify(dev, ETHTOOL_MSG_PRIVFLAGS_NTF);
 		break;
 	case ETHTOOL_SPFLAGS:
 		rc = ethtool_set_value(dev, useraddr,
diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c
index c5ec3c82ab2e..129f9d56ac65 100644
--- a/net/ethtool/netlink.c
+++ b/net/ethtool/netlink.c
@@ -911,7 +911,7 @@ static int ethnl_default_set_doit(struct sk_buff *skb, struct genl_info *info)
 	swap(dev->cfg, dev->cfg_pending);
 	if (!ret)
 		goto out_ops;
-	ethtool_notify(dev, ops->set_ntf_cmd, NULL);
+	ethtool_notify(dev, ops->set_ntf_cmd);
 
 	ret = 0;
 out_ops:
@@ -1049,7 +1049,7 @@ static const ethnl_notify_handler_t ethnl_notify_handlers[] = {
 	[ETHTOOL_MSG_MM_NTF]		= ethnl_default_notify,
 };
 
-void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data)
+void ethnl_notify(struct net_device *dev, unsigned int cmd, const void *data)
 {
 	if (unlikely(!ethnl_ok))
 		return;
@@ -1062,13 +1062,18 @@ void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data)
 		WARN_ONCE(1, "notification %u not implemented (dev=%s)\n",
 			  cmd, netdev_name(dev));
 }
+
+void ethtool_notify(struct net_device *dev, unsigned int cmd)
+{
+	ethnl_notify(dev, cmd, NULL);
+}
 EXPORT_SYMBOL(ethtool_notify);
 
 static void ethnl_notify_features(struct netdev_notifier_info *info)
 {
 	struct net_device *dev = netdev_notifier_info_to_dev(info);
 
-	ethtool_notify(dev, ETHTOOL_MSG_FEATURES_NTF, NULL);
+	ethtool_notify(dev, ETHTOOL_MSG_FEATURES_NTF);
 }
 
 static int ethnl_netdev_event(struct notifier_block *this, unsigned long event,
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
index 91b953924af3..4a061944a3aa 100644
--- a/net/ethtool/netlink.h
+++ b/net/ethtool/netlink.h
@@ -23,6 +23,7 @@ void *ethnl_dump_put(struct sk_buff *skb, struct netlink_callback *cb, u8 cmd);
 void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd);
 void *ethnl_unicast_put(struct sk_buff *skb, u32 portid, u32 seq, u8 cmd);
 int ethnl_multicast(struct sk_buff *skb, struct net_device *dev);
+void ethnl_notify(struct net_device *dev, unsigned int cmd, const void *data);
 
 /**
  * ethnl_strz_size() - calculate attribute length for fixed size string
-- 
cgit v1.2.3


From 46837be5afc6ea70bc827ca4439410e069e2ee37 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 23 Jun 2025 16:17:18 -0700
Subject: net: ethtool: rss: add notifications

In preparation for RSS_SET handling in ethnl introduce Netlink
notifications for RSS. Only cover modifications, not creation
and not removal of a context, because the latter may deserve
a different notification type. We should cross that bridge
when we add the support for context add / remove via Netlink.

Link: https://patch.msgid.link/20250623231720.3124717-7-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/ethtool.yaml       |  7 +++++++
 Documentation/networking/ethtool-netlink.rst   |  1 +
 include/uapi/linux/ethtool_netlink_generated.h |  1 +
 net/ethtool/common.h                           |  8 ++++++++
 net/ethtool/ioctl.c                            |  4 ++++
 net/ethtool/netlink.c                          |  2 ++
 net/ethtool/rss.c                              | 11 +++++++++++
 7 files changed, 34 insertions(+)

(limited to 'include')

diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml
index cfe84f84ba29..19a32229772a 100644
--- a/Documentation/netlink/specs/ethtool.yaml
+++ b/Documentation/netlink/specs/ethtool.yaml
@@ -2492,6 +2492,13 @@ operations:
         attributes:
           - header
           - events
+    -
+      name: rss-ntf
+      doc: |
+        Notification for change in RSS configuration.
+        For additional contexts only modifications are notified, not creation
+        or removal of the contexts.
+      notify: rss-get
 
 mcast-groups:
   list:
diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst
index e45bb555e909..08abca99a6dc 100644
--- a/Documentation/networking/ethtool-netlink.rst
+++ b/Documentation/networking/ethtool-netlink.rst
@@ -281,6 +281,7 @@ Kernel to userspace:
   ``ETHTOOL_MSG_MODULE_GET_REPLY``         transceiver module parameters
   ``ETHTOOL_MSG_PSE_GET_REPLY``            PSE parameters
   ``ETHTOOL_MSG_RSS_GET_REPLY``            RSS settings
+  ``ETHTOOL_MSG_RSS_NTF``                  RSS settings
   ``ETHTOOL_MSG_PLCA_GET_CFG_REPLY``       PLCA RS parameters
   ``ETHTOOL_MSG_PLCA_GET_STATUS_REPLY``    PLCA RS status
   ``ETHTOOL_MSG_PLCA_NTF``                 PLCA RS parameters
diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h
index 859e28c8a91a..8f30ffa1cd14 100644
--- a/include/uapi/linux/ethtool_netlink_generated.h
+++ b/include/uapi/linux/ethtool_netlink_generated.h
@@ -862,6 +862,7 @@ enum {
 	ETHTOOL_MSG_TSCONFIG_GET_REPLY,
 	ETHTOOL_MSG_TSCONFIG_SET_REPLY,
 	ETHTOOL_MSG_PSE_NTF,
+	ETHTOOL_MSG_RSS_NTF,
 
 	__ETHTOOL_MSG_KERNEL_CNT,
 	ETHTOOL_MSG_KERNEL_MAX = (__ETHTOOL_MSG_KERNEL_CNT - 1)
diff --git a/net/ethtool/common.h b/net/ethtool/common.h
index b4683d286a5a..c41db1595621 100644
--- a/net/ethtool/common.h
+++ b/net/ethtool/common.h
@@ -74,4 +74,12 @@ int ethtool_get_module_eeprom_call(struct net_device *dev,
 
 bool __ethtool_dev_mm_supported(struct net_device *dev);
 
+#if IS_ENABLED(CONFIG_ETHTOOL_NETLINK)
+void ethtool_rss_notify(struct net_device *dev, u32 rss_context);
+#else
+static inline void ethtool_rss_notify(struct net_device *dev, u32 rss_context)
+{
+}
+#endif
+
 #endif /* _ETHTOOL_COMMON_H */
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index 96da9d18789b..c34bac7bffd8 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -1502,6 +1502,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
 	struct ethtool_rxfh rxfh;
 	bool locked = false; /* dev->ethtool->rss_lock taken */
 	bool create = false;
+	bool mod = false;
 	u8 *rss_config;
 	int ret;
 
@@ -1688,6 +1689,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
 		}
 		goto out;
 	}
+	mod = !create && !rxfh_dev.rss_delete;
 
 	if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, rss_context),
 			 &rxfh_dev.rss_context, sizeof(rxfh_dev.rss_context)))
@@ -1757,6 +1759,8 @@ out:
 	if (locked)
 		mutex_unlock(&dev->ethtool->rss_lock);
 	kfree(rss_config);
+	if (mod)
+		ethtool_rss_notify(dev, rxfh.rss_context);
 	return ret;
 }
 
diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c
index 60b3c07507d2..09c81cc9a08f 100644
--- a/net/ethtool/netlink.c
+++ b/net/ethtool/netlink.c
@@ -946,6 +946,7 @@ ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = {
 	[ETHTOOL_MSG_MODULE_NTF]	= &ethnl_module_request_ops,
 	[ETHTOOL_MSG_PLCA_NTF]		= &ethnl_plca_cfg_request_ops,
 	[ETHTOOL_MSG_MM_NTF]		= &ethnl_mm_request_ops,
+	[ETHTOOL_MSG_RSS_NTF]		= &ethnl_rss_request_ops,
 };
 
 /* default notification handler */
@@ -1052,6 +1053,7 @@ static const ethnl_notify_handler_t ethnl_notify_handlers[] = {
 	[ETHTOOL_MSG_MODULE_NTF]	= ethnl_default_notify,
 	[ETHTOOL_MSG_PLCA_NTF]		= ethnl_default_notify,
 	[ETHTOOL_MSG_MM_NTF]		= ethnl_default_notify,
+	[ETHTOOL_MSG_RSS_NTF]		= ethnl_default_notify,
 };
 
 void ethnl_notify(struct net_device *dev, unsigned int cmd,
diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c
index 6d9b1769896b..3adddca7e215 100644
--- a/net/ethtool/rss.c
+++ b/net/ethtool/rss.c
@@ -358,6 +358,17 @@ int ethnl_rss_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
 	return ret;
 }
 
+/* RSS_NTF */
+
+void ethtool_rss_notify(struct net_device *dev, u32 rss_context)
+{
+	struct rss_req_info req_info = {
+		.rss_context = rss_context,
+	};
+
+	ethnl_notify(dev, ETHTOOL_MSG_RSS_NTF, &req_info.base);
+}
+
 const struct ethnl_request_ops ethnl_rss_request_ops = {
 	.request_cmd		= ETHTOOL_MSG_RSS_GET,
 	.reply_cmd		= ETHTOOL_MSG_RSS_GET_REPLY,
-- 
cgit v1.2.3


From 4b70e2a069d90cdc447c6bf8437c8b99345852e9 Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Tue, 24 Jun 2025 09:43:27 +0800
Subject: net/sched: Remove unused functions

Since commit c54e1d920f04 ("flow_offload: add ops to tc_action_ops for
flow action setup") these are unused.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Link: https://patch.msgid.link/20250624014327.3686873-1-yuehaibing@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tc_act/tc_csum.h   | 9 ---------
 include/net/tc_act/tc_ct.h     | 9 ---------
 include/net/tc_act/tc_gate.h   | 9 ---------
 include/net/tc_act/tc_mpls.h   | 9 ---------
 include/net/tc_act/tc_police.h | 9 ---------
 include/net/tc_act/tc_sample.h | 9 ---------
 include/net/tc_act/tc_vlan.h   | 9 ---------
 7 files changed, 63 deletions(-)

(limited to 'include')

diff --git a/include/net/tc_act/tc_csum.h b/include/net/tc_act/tc_csum.h
index 68269e4581b7..2515da0142a6 100644
--- a/include/net/tc_act/tc_csum.h
+++ b/include/net/tc_act/tc_csum.h
@@ -18,15 +18,6 @@ struct tcf_csum {
 };
 #define to_tcf_csum(a) ((struct tcf_csum *)a)
 
-static inline bool is_tcf_csum(const struct tc_action *a)
-{
-#ifdef CONFIG_NET_CLS_ACT
-	if (a->ops && a->ops->id == TCA_ID_CSUM)
-		return true;
-#endif
-	return false;
-}
-
 static inline u32 tcf_csum_update_flags(const struct tc_action *a)
 {
 	u32 update_flags;
diff --git a/include/net/tc_act/tc_ct.h b/include/net/tc_act/tc_ct.h
index 77f87c622a2e..e6b45cb27ebf 100644
--- a/include/net/tc_act/tc_ct.h
+++ b/include/net/tc_act/tc_ct.h
@@ -92,13 +92,4 @@ static inline void
 tcf_ct_flow_table_restore_skb(struct sk_buff *skb, unsigned long cookie) { }
 #endif
 
-static inline bool is_tcf_ct(const struct tc_action *a)
-{
-#if defined(CONFIG_NET_CLS_ACT) && IS_ENABLED(CONFIG_NF_CONNTRACK)
-	if (a->ops && a->ops->id == TCA_ID_CT)
-		return true;
-#endif
-	return false;
-}
-
 #endif /* __NET_TC_CT_H */
diff --git a/include/net/tc_act/tc_gate.h b/include/net/tc_act/tc_gate.h
index c8fa11ebb397..c1a67149c6b6 100644
--- a/include/net/tc_act/tc_gate.h
+++ b/include/net/tc_act/tc_gate.h
@@ -51,15 +51,6 @@ struct tcf_gate {
 
 #define to_gate(a) ((struct tcf_gate *)a)
 
-static inline bool is_tcf_gate(const struct tc_action *a)
-{
-#ifdef CONFIG_NET_CLS_ACT
-	if (a->ops && a->ops->id == TCA_ID_GATE)
-		return true;
-#endif
-	return false;
-}
-
 static inline s32 tcf_gate_prio(const struct tc_action *a)
 {
 	s32 tcfg_prio;
diff --git a/include/net/tc_act/tc_mpls.h b/include/net/tc_act/tc_mpls.h
index 721de4f5733a..d452e5e94fd0 100644
--- a/include/net/tc_act/tc_mpls.h
+++ b/include/net/tc_act/tc_mpls.h
@@ -27,15 +27,6 @@ struct tcf_mpls {
 };
 #define to_mpls(a) ((struct tcf_mpls *)a)
 
-static inline bool is_tcf_mpls(const struct tc_action *a)
-{
-#ifdef CONFIG_NET_CLS_ACT
-	if (a->ops && a->ops->id == TCA_ID_MPLS)
-		return true;
-#endif
-	return false;
-}
-
 static inline u32 tcf_mpls_action(const struct tc_action *a)
 {
 	u32 tcfm_action;
diff --git a/include/net/tc_act/tc_police.h b/include/net/tc_act/tc_police.h
index 283bde711a42..490d88cb5233 100644
--- a/include/net/tc_act/tc_police.h
+++ b/include/net/tc_act/tc_police.h
@@ -44,15 +44,6 @@ struct tc_police_compat {
 	struct tc_ratespec	peakrate;
 };
 
-static inline bool is_tcf_police(const struct tc_action *act)
-{
-#ifdef CONFIG_NET_CLS_ACT
-	if (act->ops && act->ops->id == TCA_ID_POLICE)
-		return true;
-#endif
-	return false;
-}
-
 static inline u64 tcf_police_rate_bytes_ps(const struct tc_action *act)
 {
 	struct tcf_police *police = to_police(act);
diff --git a/include/net/tc_act/tc_sample.h b/include/net/tc_act/tc_sample.h
index b5d76305e854..abd163ca1864 100644
--- a/include/net/tc_act/tc_sample.h
+++ b/include/net/tc_act/tc_sample.h
@@ -17,15 +17,6 @@ struct tcf_sample {
 };
 #define to_sample(a) ((struct tcf_sample *)a)
 
-static inline bool is_tcf_sample(const struct tc_action *a)
-{
-#ifdef CONFIG_NET_CLS_ACT
-	return a->ops && a->ops->id == TCA_ID_SAMPLE;
-#else
-	return false;
-#endif
-}
-
 static inline __u32 tcf_sample_rate(const struct tc_action *a)
 {
 	return to_sample(a)->rate;
diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h
index 904eddfc1826..3f5e9242b5e8 100644
--- a/include/net/tc_act/tc_vlan.h
+++ b/include/net/tc_act/tc_vlan.h
@@ -26,15 +26,6 @@ struct tcf_vlan {
 };
 #define to_vlan(a) ((struct tcf_vlan *)a)
 
-static inline bool is_tcf_vlan(const struct tc_action *a)
-{
-#ifdef CONFIG_NET_CLS_ACT
-	if (a->ops && a->ops->id == TCA_ID_VLAN)
-		return true;
-#endif
-	return false;
-}
-
 static inline u32 tcf_vlan_action(const struct tc_action *a)
 {
 	u32 tcfv_action;
-- 
cgit v1.2.3


From 8bd0af3154b2206ce19f8b1410339f7a2a56d0c3 Mon Sep 17 00:00:00 2001
From: Nathan Lynch <nathan.lynch@amd.com>
Date: Tue, 24 Jun 2025 08:50:44 -0500
Subject: lib: packing: Include necessary headers

packing.h uses ARRAY_SIZE(), BUILD_BUG_ON_MSG(), min(), max(), and
sizeof_field() without including the headers where they are defined,
potentially causing build failures.

Fix this in packing.h and sort the result.

Signed-off-by: Nathan Lynch <nathan.lynch@amd.com>
Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
Link: https://patch.msgid.link/20250624-packing-includes-v1-1-c23c81fab508@amd.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/packing.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/packing.h b/include/linux/packing.h
index 0589d70bbe04..20ae4d452c7b 100644
--- a/include/linux/packing.h
+++ b/include/linux/packing.h
@@ -5,8 +5,12 @@
 #ifndef _LINUX_PACKING_H
 #define _LINUX_PACKING_H
 
-#include <linux/types.h>
+#include <linux/array_size.h>
 #include <linux/bitops.h>
+#include <linux/build_bug.h>
+#include <linux/minmax.h>
+#include <linux/stddef.h>
+#include <linux/types.h>
 
 #define GEN_PACKED_FIELD_STRUCT(__type) \
 	struct packed_field_ ## __type { \
-- 
cgit v1.2.3


From 2855e43c6bb154a9b8e27abda8df364aed574b22 Mon Sep 17 00:00:00 2001
From: RubenKelevra <rubenkelevra@gmail.com>
Date: Tue, 24 Jun 2025 18:57:11 +0200
Subject: uapi: net_dropmon: drop unused is_drop_point_hw macro

Commit 4ea7e38696c7 ("dropmon: add ability to detect when hardware
drops rx packets") introduced is_drop_point_hw, but the symbol was
never referenced anywhere in the kernel tree and is currently not used
by dropwatch. I could not find, to the best of my abilities, a current
out-of-tree user of this macro.

The definition also contains a syntax error in its for-loop, so any
project that tried to compile against it would fail. Removing the
macro therefore eliminates dead code without breaking existing
users.

Signed-off-by: RubenKelevra <rubenkelevra@gmail.com>
Link: https://patch.msgid.link/20250624165711.1188691-1-rubenkelevra@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/net_dropmon.h | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/net_dropmon.h b/include/uapi/linux/net_dropmon.h
index 9dd41c2f58a6..87cbef48d4c7 100644
--- a/include/uapi/linux/net_dropmon.h
+++ b/include/uapi/linux/net_dropmon.h
@@ -10,13 +10,6 @@ struct net_dm_drop_point {
 	__u32 count;
 };
 
-#define is_drop_point_hw(x) do {\
-	int ____i, ____j;\
-	for (____i = 0; ____i < 8; i ____i++)\
-		____j |= x[____i];\
-	____j;\
-} while (0)
-
 #define NET_DM_CFG_VERSION  0
 #define NET_DM_CFG_ALERT_COUNT  1
 #define NET_DM_CFG_ALERT_DELAY 2
-- 
cgit v1.2.3


From 9e6dd4c256d0774701637b958ba682eff4991277 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 24 Jun 2025 14:09:59 -0700
Subject: netlink: specs: mptcp: replace underscores with dashes in names

We're trying to add a strict regexp for the name format in the spec.
Underscores will not be allowed, dashes should be used instead.
This makes no difference to C (codegen, if used, replaces special
chars in names) but it gives more uniform naming in Python.

Fixes: bc8aeb2045e2 ("Documentation: netlink: add a YAML spec for mptcp")
Reviewed-by: Davide Caratti <dcaratti@redhat.com>
Reviewed-by: Donald Hunter <donald.hunter@gmail.com>
Reviewed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20250624211002.3475021-8-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/mptcp_pm.yaml | 8 ++++----
 include/uapi/linux/mptcp_pm.h             | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml
index dfd017780d2f..fb57860fe778 100644
--- a/Documentation/netlink/specs/mptcp_pm.yaml
+++ b/Documentation/netlink/specs/mptcp_pm.yaml
@@ -57,21 +57,21 @@ definitions:
       doc: >-
         A new subflow has been established. 'error' should not be set.
         Attributes: token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 |
-        daddr6, sport, dport, backup, if_idx [, error].
+        daddr6, sport, dport, backup, if-idx [, error].
      -
       name: sub-closed
       doc: >-
         A subflow has been closed. An error (copy of sk_err) could be set if an
         error has been detected for this subflow.
         Attributes: token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 |
-        daddr6, sport, dport, backup, if_idx [, error].
+        daddr6, sport, dport, backup, if-idx [, error].
      -
       name: sub-priority
       value: 13
       doc: >-
         The priority of a subflow has changed. 'error' should not be set.
         Attributes: token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 |
-        daddr6, sport, dport, backup, if_idx [, error].
+        daddr6, sport, dport, backup, if-idx [, error].
      -
       name: listener-created
       value: 15
@@ -255,7 +255,7 @@ attribute-sets:
         name: timeout
         type: u32
       -
-        name: if_idx
+        name: if-idx
         type: u32
       -
         name: reset-reason
diff --git a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h
index 84fa8a21dfd0..6ac84b2f636c 100644
--- a/include/uapi/linux/mptcp_pm.h
+++ b/include/uapi/linux/mptcp_pm.h
@@ -27,14 +27,14 @@
  *   token, rem_id.
  * @MPTCP_EVENT_SUB_ESTABLISHED: A new subflow has been established. 'error'
  *   should not be set. Attributes: token, family, loc_id, rem_id, saddr4 |
- *   saddr6, daddr4 | daddr6, sport, dport, backup, if_idx [, error].
+ *   saddr6, daddr4 | daddr6, sport, dport, backup, if-idx [, error].
  * @MPTCP_EVENT_SUB_CLOSED: A subflow has been closed. An error (copy of
  *   sk_err) could be set if an error has been detected for this subflow.
  *   Attributes: token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 |
- *   daddr6, sport, dport, backup, if_idx [, error].
+ *   daddr6, sport, dport, backup, if-idx [, error].
  * @MPTCP_EVENT_SUB_PRIORITY: The priority of a subflow has changed. 'error'
  *   should not be set. Attributes: token, family, loc_id, rem_id, saddr4 |
- *   saddr6, daddr4 | daddr6, sport, dport, backup, if_idx [, error].
+ *   saddr6, daddr4 | daddr6, sport, dport, backup, if-idx [, error].
  * @MPTCP_EVENT_LISTENER_CREATED: A new PM listener is created. Attributes:
  *   family, sport, saddr4 | saddr6.
  * @MPTCP_EVENT_LISTENER_CLOSED: A PM listener is closed. Attributes: family,
-- 
cgit v1.2.3


From f5769359c5b241978e6933672bb78b3adc36aa18 Mon Sep 17 00:00:00 2001
From: Hao Ge <gehao@kylinos.cn>
Date: Fri, 20 Jun 2025 02:31:54 +0800
Subject: mm/alloc_tag: fix the kmemleak false positive issue in the allocation
 of the percpu variable tag->counters

When loading a module, as long as the module has memory allocation
operations, kmemleak produces a false positive report that resembles the
following:

unreferenced object (percpu) 0x7dfd232a1650 (size 16):
  comm "modprobe", pid 1301, jiffies 4294940249
  hex dump (first 16 bytes on cpu 2):
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  backtrace (crc 0):
    kmemleak_alloc_percpu+0xb4/0xd0
    pcpu_alloc_noprof+0x700/0x1098
    load_module+0xd4/0x348
    codetag_module_init+0x20c/0x450
    codetag_load_module+0x70/0xb8
    load_module+0xef8/0x1608
    init_module_from_file+0xec/0x158
    idempotent_init_module+0x354/0x608
    __arm64_sys_finit_module+0xbc/0x150
    invoke_syscall+0xd4/0x258
    el0_svc_common.constprop.0+0xb4/0x240
    do_el0_svc+0x48/0x68
    el0_svc+0x40/0xf8
    el0t_64_sync_handler+0x10c/0x138
    el0t_64_sync+0x1ac/0x1b0

This is because the module can only indirectly reference
alloc_tag_counters through the alloc_tag section, which misleads kmemleak.

However, we don't have a kmemleak ignore interface for percpu allocations
yet.  So let's create one and invoke it for tag->counters.

[gehao@kylinos.cn: fix build error when CONFIG_DEBUG_KMEMLEAK=n, s/igonore/ignore/]
  Link: https://lkml.kernel.org/r/20250620093102.2416767-1-hao.ge@linux.dev
Link: https://lkml.kernel.org/r/20250619183154.2122608-1-hao.ge@linux.dev
Fixes: 12ca42c23775 ("alloc_tag: allocate percpu counters for module tags dynamically")
Signed-off-by: Hao Ge <gehao@kylinos.cn>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Suren Baghdasaryan <surenb@google.com>	[lib/alloc_tag.c]
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kmemleak.h |  4 ++++
 lib/alloc_tag.c          |  8 +++++++-
 mm/kmemleak.c            | 14 ++++++++++++++
 3 files changed, 25 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h
index 93a73c076d16..fbd424b2abb1 100644
--- a/include/linux/kmemleak.h
+++ b/include/linux/kmemleak.h
@@ -28,6 +28,7 @@ extern void kmemleak_update_trace(const void *ptr) __ref;
 extern void kmemleak_not_leak(const void *ptr) __ref;
 extern void kmemleak_transient_leak(const void *ptr) __ref;
 extern void kmemleak_ignore(const void *ptr) __ref;
+extern void kmemleak_ignore_percpu(const void __percpu *ptr) __ref;
 extern void kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) __ref;
 extern void kmemleak_no_scan(const void *ptr) __ref;
 extern void kmemleak_alloc_phys(phys_addr_t phys, size_t size,
@@ -97,6 +98,9 @@ static inline void kmemleak_not_leak(const void *ptr)
 static inline void kmemleak_transient_leak(const void *ptr)
 {
 }
+static inline void kmemleak_ignore_percpu(const void __percpu *ptr)
+{
+}
 static inline void kmemleak_ignore(const void *ptr)
 {
 }
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
index d48b80f3f007..3a74d63a959e 100644
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -10,6 +10,7 @@
 #include <linux/seq_buf.h>
 #include <linux/seq_file.h>
 #include <linux/vmalloc.h>
+#include <linux/kmemleak.h>
 
 #define ALLOCINFO_FILE_NAME		"allocinfo"
 #define MODULE_ALLOC_TAG_VMAP_SIZE	(100000UL * sizeof(struct alloc_tag))
@@ -632,8 +633,13 @@ static int load_module(struct module *mod, struct codetag *start, struct codetag
 			       mod->name);
 			return -ENOMEM;
 		}
-	}
 
+		/*
+		 * Avoid a kmemleak false positive. The pointer to the counters is stored
+		 * in the alloc_tag section of the module and cannot be directly accessed.
+		 */
+		kmemleak_ignore_percpu(tag->counters);
+	}
 	return 0;
 }
 
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index da9cee34ee1b..8d588e685311 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1246,6 +1246,20 @@ void __ref kmemleak_transient_leak(const void *ptr)
 }
 EXPORT_SYMBOL(kmemleak_transient_leak);
 
+/**
+ * kmemleak_ignore_percpu - similar to kmemleak_ignore but taking a percpu
+ *			    address argument
+ * @ptr:	percpu address of the object
+ */
+void __ref kmemleak_ignore_percpu(const void __percpu *ptr)
+{
+	pr_debug("%s(0x%px)\n", __func__, ptr);
+
+	if (kmemleak_enabled && ptr && !IS_ERR_PCPU(ptr))
+		make_black_object((unsigned long)ptr, OBJECT_PERCPU);
+}
+EXPORT_SYMBOL_GPL(kmemleak_ignore_percpu);
+
 /**
  * kmemleak_ignore - ignore an allocated object
  * @ptr:	pointer to beginning of the object
-- 
cgit v1.2.3


From 4ecf83741401c70d4420588ee1f3b1ca04ef58d5 Mon Sep 17 00:00:00 2001
From: Jake Hillion <jake@hillion.co.uk>
Date: Wed, 25 Jun 2025 18:05:46 +0100
Subject: sched_ext: Drop kfuncs marked for removal in 6.15

sched_ext performed a kfunc renaming pass in 6.13 and kept the old names
around for compatibility with old binaries. These were scheduled for
cleanup in 6.15 but were missed. Submitting for cleanup in for-next.

Removed the kfuncs, their flags, and any references I could find to them
in doc comments. Left the entries in include/scx/compat.bpf.h as they're
still useful to make new binaries compatible with old kernels.

Tested by applying to my kernel. It builds and a modern version of
scx_lavd loads fine.

Signed-off-by: Jake Hillion <jake@hillion.co.uk>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/sched/ext.h | 10 +++----
 kernel/sched/ext.c        | 71 ++---------------------------------------------
 2 files changed, 7 insertions(+), 74 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 0cf0915572c9..7047101dbf58 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -164,7 +164,7 @@ struct sched_ext_entity {
 
 	/*
 	 * Runtime budget in nsecs. This is usually set through
-	 * scx_bpf_dispatch() but can also be modified directly by the BPF
+	 * scx_bpf_dsq_insert() but can also be modified directly by the BPF
 	 * scheduler. Automatically decreased by SCX as the task executes. On
 	 * depletion, a scheduling event is triggered.
 	 *
@@ -176,10 +176,10 @@ struct sched_ext_entity {
 
 	/*
 	 * Used to order tasks when dispatching to the vtime-ordered priority
-	 * queue of a dsq. This is usually set through scx_bpf_dispatch_vtime()
-	 * but can also be modified directly by the BPF scheduler. Modifying it
-	 * while a task is queued on a dsq may mangle the ordering and is not
-	 * recommended.
+	 * queue of a dsq. This is usually set through
+	 * scx_bpf_dsq_insert_vtime() but can also be modified directly by the
+	 * BPF scheduler. Modifying it while a task is queued on a dsq may
+	 * mangle the ordering and is not recommended.
 	 */
 	u64			dsq_vtime;
 
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index df5b2c952cf7..512474eabea6 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -6391,7 +6391,8 @@ __bpf_kfunc_start_defs();
  * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id
  * and this function can be called upto ops.dispatch_max_batch times to insert
  * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the
- * remaining slots. scx_bpf_consume() flushes the batch and resets the counter.
+ * remaining slots. scx_bpf_dsq_move_to_local() flushes the batch and resets the
+ * counter.
  *
  * This function doesn't have any locking restrictions and may be called under
  * BPF locks (in the future when BPF introduces more flexible locking).
@@ -6415,14 +6416,6 @@ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice
 	scx_dsq_insert_commit(p, dsq_id, enq_flags);
 }
 
-/* for backward compatibility, will be removed in v6.15 */
-__bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice,
-				  u64 enq_flags)
-{
-	printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()");
-	scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags);
-}
-
 /**
  * scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ
  * @p: task_struct to insert
@@ -6460,21 +6453,11 @@ __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
 	scx_dsq_insert_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
 }
 
-/* for backward compatibility, will be removed in v6.15 */
-__bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id,
-					u64 slice, u64 vtime, u64 enq_flags)
-{
-	printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_vtime() renamed to scx_bpf_dsq_insert_vtime()");
-	scx_bpf_dsq_insert_vtime(p, dsq_id, slice, vtime, enq_flags);
-}
-
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch)
 BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime, KF_RCU)
 BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch)
 
 static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = {
@@ -6647,13 +6630,6 @@ __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id)
 	}
 }
 
-/* for backward compatibility, will be removed in v6.15 */
-__bpf_kfunc bool scx_bpf_consume(u64 dsq_id)
-{
-	printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()");
-	return scx_bpf_dsq_move_to_local(dsq_id);
-}
-
 /**
  * scx_bpf_dsq_move_set_slice - Override slice when moving between DSQs
  * @it__iter: DSQ iterator in progress
@@ -6672,14 +6648,6 @@ __bpf_kfunc void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter,
 	kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE;
 }
 
-/* for backward compatibility, will be removed in v6.15 */
-__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_slice(
-			struct bpf_iter_scx_dsq *it__iter, u64 slice)
-{
-	printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_set_slice() renamed to scx_bpf_dsq_move_set_slice()");
-	scx_bpf_dsq_move_set_slice(it__iter, slice);
-}
-
 /**
  * scx_bpf_dsq_move_set_vtime - Override vtime when moving between DSQs
  * @it__iter: DSQ iterator in progress
@@ -6699,14 +6667,6 @@ __bpf_kfunc void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter,
 	kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME;
 }
 
-/* for backward compatibility, will be removed in v6.15 */
-__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime(
-			struct bpf_iter_scx_dsq *it__iter, u64 vtime)
-{
-	printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_set_vtime() renamed to scx_bpf_dsq_move_set_vtime()");
-	scx_bpf_dsq_move_set_vtime(it__iter, vtime);
-}
-
 /**
  * scx_bpf_dsq_move - Move a task from DSQ iteration to a DSQ
  * @it__iter: DSQ iterator in progress
@@ -6739,15 +6699,6 @@ __bpf_kfunc bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter,
 			    p, dsq_id, enq_flags);
 }
 
-/* for backward compatibility, will be removed in v6.15 */
-__bpf_kfunc bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter,
-					   struct task_struct *p, u64 dsq_id,
-					   u64 enq_flags)
-{
-	printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq() renamed to scx_bpf_dsq_move()");
-	return scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags);
-}
-
 /**
  * scx_bpf_dsq_move_vtime - Move a task from DSQ iteration to a PRIQ DSQ
  * @it__iter: DSQ iterator in progress
@@ -6773,30 +6724,16 @@ __bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter,
 			    p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
 }
 
-/* for backward compatibility, will be removed in v6.15 */
-__bpf_kfunc bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter,
-						 struct task_struct *p, u64 dsq_id,
-						 u64 enq_flags)
-{
-	printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_vtime() renamed to scx_bpf_dsq_move_vtime()");
-	return scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags);
-}
-
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(scx_kfunc_ids_dispatch)
 BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots)
 BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local)
-BTF_ID_FLAGS(func, scx_bpf_consume)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU)
 BTF_KFUNCS_END(scx_kfunc_ids_dispatch)
 
 static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = {
@@ -6927,10 +6864,6 @@ BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU)
 BTF_KFUNCS_END(scx_kfunc_ids_unlocked)
 
 static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = {
-- 
cgit v1.2.3


From 16e2707cf15e09234445d40ddd76f11240be8767 Mon Sep 17 00:00:00 2001
From: "Yury Norov [NVIDIA]" <yury.norov@gmail.com>
Date: Wed, 4 Jun 2025 15:39:37 -0400
Subject: cpumask: add cpumask_clear_cpus()

When user wants to clear a range in cpumask, the only option the API
provides now is a for-loop, like:

	for_each_cpu_from(cpu, mask) {
		if (cpu >= ncpus)
			break;
		__cpumask_clear_cpu(cpu, mask);
	}

In the bitmap API we have bitmap_clear() for that, which is
significantly faster than a for-loop. Propagate it to cpumasks.

Signed-off-by: Yury Norov [NVIDIA] <yury.norov@gmail.com>
Link: https://patch.msgid.link/20250604193947.11834-2-yury.norov@gmail.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/cpumask.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 7ae80a7ca81e..ede95bbe8b80 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -609,6 +609,18 @@ void __cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp)
 	__set_bit(cpumask_check(cpu), cpumask_bits(dstp));
 }
 
+/**
+ * cpumask_clear_cpus - clear cpus in a cpumask
+ * @dstp:  the cpumask pointer
+ * @cpu:   cpu number (< nr_cpu_ids)
+ * @ncpus: number of cpus to clear (< nr_cpu_ids)
+ */
+static __always_inline void cpumask_clear_cpus(struct cpumask *dstp,
+						unsigned int cpu, unsigned int ncpus)
+{
+	cpumask_check(cpu + ncpus - 1);
+	bitmap_clear(cpumask_bits(dstp), cpumask_check(cpu), ncpus);
+}
 
 /**
  * cpumask_clear_cpu - clear a cpu in a cpumask
-- 
cgit v1.2.3


From 2f8839e6c5f8e200629fd730aac5dd874c1d2544 Mon Sep 17 00:00:00 2001
From: Harald Freudenberger <freude@linux.ibm.com>
Date: Tue, 17 Jun 2025 15:44:35 +0200
Subject: crypto: ahash - make hash walk functions from ahash.c public

Make the hash walk functions
  crypto_hash_walk_done()
  crypto_hash_walk_first()
  crypto_hash_walk_last()
public again.
These functions had been removed from the header file
include/crypto/internal/hash.h with commit 7fa481734016
("crypto: ahash - make hash walk functions private to ahash.c")
as there was no crypto algorithm code using them.

With the upcoming crypto implementation for s390 phmac
these functions will be exploited and thus need to be
public within the kernel again.

Signed-off-by: Harald Freudenberger <freude@linux.ibm.com>
Acked-by: Holger Dengler <dengler@linux.ibm.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/ahash.c                 | 26 +++++---------------------
 include/crypto/internal/hash.h | 23 +++++++++++++++++++++++
 2 files changed, 28 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/crypto/ahash.c b/crypto/ahash.c
index 7a51e0cf9322..00a5e36f710b 100644
--- a/crypto/ahash.c
+++ b/crypto/ahash.c
@@ -29,19 +29,6 @@
 
 #define CRYPTO_ALG_TYPE_AHASH_MASK	0x0000000e
 
-struct crypto_hash_walk {
-	const char *data;
-
-	unsigned int offset;
-	unsigned int flags;
-
-	struct page *pg;
-	unsigned int entrylen;
-
-	unsigned int total;
-	struct scatterlist *sg;
-};
-
 static int ahash_def_finup(struct ahash_request *req);
 
 static inline bool crypto_ahash_block_only(struct crypto_ahash *tfm)
@@ -112,8 +99,8 @@ static int hash_walk_new_entry(struct crypto_hash_walk *walk)
 	return hash_walk_next(walk);
 }
 
-static int crypto_hash_walk_first(struct ahash_request *req,
-				  struct crypto_hash_walk *walk)
+int crypto_hash_walk_first(struct ahash_request *req,
+			   struct crypto_hash_walk *walk)
 {
 	walk->total = req->nbytes;
 	walk->entrylen = 0;
@@ -133,8 +120,9 @@ static int crypto_hash_walk_first(struct ahash_request *req,
 
 	return hash_walk_new_entry(walk);
 }
+EXPORT_SYMBOL_GPL(crypto_hash_walk_first);
 
-static int crypto_hash_walk_done(struct crypto_hash_walk *walk, int err)
+int crypto_hash_walk_done(struct crypto_hash_walk *walk, int err)
 {
 	if ((walk->flags & CRYPTO_AHASH_REQ_VIRT))
 		return err;
@@ -160,11 +148,7 @@ static int crypto_hash_walk_done(struct crypto_hash_walk *walk, int err)
 
 	return hash_walk_new_entry(walk);
 }
-
-static inline int crypto_hash_walk_last(struct crypto_hash_walk *walk)
-{
-	return !(walk->entrylen | walk->total);
-}
+EXPORT_SYMBOL_GPL(crypto_hash_walk_done);
 
 /*
  * For an ahash tfm that is using an shash algorithm (instead of an ahash
diff --git a/include/crypto/internal/hash.h b/include/crypto/internal/hash.h
index f052afa6e7b0..2456d6ea73f0 100644
--- a/include/crypto/internal/hash.h
+++ b/include/crypto/internal/hash.h
@@ -30,6 +30,20 @@
                 __##name##_req, (req))
 
 struct ahash_request;
+struct scatterlist;
+
+struct crypto_hash_walk {
+	const char *data;
+
+	unsigned int offset;
+	unsigned int flags;
+
+	struct page *pg;
+	unsigned int entrylen;
+
+	unsigned int total;
+	struct scatterlist *sg;
+};
 
 struct ahash_instance {
 	void (*free)(struct ahash_instance *inst);
@@ -61,6 +75,15 @@ struct crypto_shash_spawn {
 	struct crypto_spawn base;
 };
 
+int crypto_hash_walk_done(struct crypto_hash_walk *walk, int err);
+int crypto_hash_walk_first(struct ahash_request *req,
+			   struct crypto_hash_walk *walk);
+
+static inline int crypto_hash_walk_last(struct crypto_hash_walk *walk)
+{
+	return !(walk->entrylen | walk->total);
+}
+
 int crypto_register_ahash(struct ahash_alg *alg);
 void crypto_unregister_ahash(struct ahash_alg *alg);
 int crypto_register_ahashes(struct ahash_alg *algs, int count);
-- 
cgit v1.2.3


From d0da164ba63dd6ff3696c565b118631a277822c3 Mon Sep 17 00:00:00 2001
From: Harald Freudenberger <freude@linux.ibm.com>
Date: Tue, 17 Jun 2025 15:44:38 +0200
Subject: crypto: ahash - Add crypto_ahash_tested() helper function

Add a little inline helper function
    crypto_ahash_tested()
to the internal/hash.h header file to retrieve the tested
status (that is the CRYPTO_ALG_TESTED bit in the cra_flags).

Signed-off-by: Harald Freudenberger <freude@linux.ibm.com>
Suggested-by: Herbert Xu <herbert@gondor.apana.org.au>
Reviewed-by: Holger Dengler <dengler@linux.ibm.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/crypto/internal/hash.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/crypto/internal/hash.h b/include/crypto/internal/hash.h
index 2456d6ea73f0..6ec5f2f37ccb 100644
--- a/include/crypto/internal/hash.h
+++ b/include/crypto/internal/hash.h
@@ -196,6 +196,13 @@ static inline void crypto_ahash_set_reqsize(struct crypto_ahash *tfm,
 	tfm->reqsize = reqsize;
 }
 
+static inline bool crypto_ahash_tested(struct crypto_ahash *tfm)
+{
+	struct crypto_tfm *tfm_base = crypto_ahash_tfm(tfm);
+
+	return tfm_base->__crt_alg->cra_flags & CRYPTO_ALG_TESTED;
+}
+
 static inline void crypto_ahash_set_reqsize_dma(struct crypto_ahash *ahash,
 						unsigned int reqsize)
 {
-- 
cgit v1.2.3


From 8cffca866ba86cbf0d097e56521b17d830956d4a Mon Sep 17 00:00:00 2001
From: Mark Bloch <mbloch@nvidia.com>
Date: Tue, 17 Jun 2025 11:44:01 +0300
Subject: RDMA/core: Extend RDMA device registration to be net namespace aware

Presently, RDMA devices are always registered within the init network
namespace, even if the associated devlink device's namespace was
changed via a devlink reload. This mismatch leads to discrepancies
between the network namespace of the devlink device and that of the
RDMA device.

Therefore, extend the RDMA device allocation API to optionally take
the net namespace. This isn't limited to devices that support devlink
but allows all users to provide the network namespace if they need to
do so.

If a network namespace is provided during device allocation, it's up
to the caller to make sure the namespace stays valid until
ib_register_device() is called.

Signed-off-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/core/device.c  | 14 ++++++++++++--
 drivers/infiniband/sw/rdmavt/vt.c |  2 +-
 include/rdma/ib_verbs.h           | 11 +++++++++--
 3 files changed, 22 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 468ed6bd4722..c0f8b8cba7c0 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -557,6 +557,8 @@ static void rdma_init_coredev(struct ib_core_device *coredev,
 /**
  * _ib_alloc_device - allocate an IB device struct
  * @size:size of structure to allocate
+ * @net: network namespace device should be located in, namespace
+ *       must stay valid until ib_register_device() is completed.
  *
  * Low-level drivers should use ib_alloc_device() to allocate &struct
  * ib_device.  @size is the size of the structure to be allocated,
@@ -564,7 +566,7 @@ static void rdma_init_coredev(struct ib_core_device *coredev,
  * ib_dealloc_device() must be used to free structures allocated with
  * ib_alloc_device().
  */
-struct ib_device *_ib_alloc_device(size_t size)
+struct ib_device *_ib_alloc_device(size_t size, struct net *net)
 {
 	struct ib_device *device;
 	unsigned int i;
@@ -581,7 +583,15 @@ struct ib_device *_ib_alloc_device(size_t size)
 		return NULL;
 	}
 
-	rdma_init_coredev(&device->coredev, device, &init_net);
+	/* ib_devices_shared_netns can't change while we have active namespaces
+	 * in the system which means either init_net is passed or the user has
+	 * no idea what they are doing.
+	 *
+	 * To avoid breaking backward compatibility, when in shared mode,
+	 * force to init the device in the init_net.
+	 */
+	net = ib_devices_shared_netns ? &init_net : net;
+	rdma_init_coredev(&device->coredev, device, net);
 
 	INIT_LIST_HEAD(&device->event_handler_list);
 	spin_lock_init(&device->qp_open_list_lock);
diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c
index 5499025e8a0a..d22d610c2696 100644
--- a/drivers/infiniband/sw/rdmavt/vt.c
+++ b/drivers/infiniband/sw/rdmavt/vt.c
@@ -49,7 +49,7 @@ struct rvt_dev_info *rvt_alloc_device(size_t size, int nports)
 {
 	struct rvt_dev_info *rdi;
 
-	rdi = container_of(_ib_alloc_device(size), struct rvt_dev_info, ibdev);
+	rdi = container_of(_ib_alloc_device(size, &init_net), struct rvt_dev_info, ibdev);
 	if (!rdi)
 		return rdi;
 
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 38f68d245fa6..b91a81234832 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2914,11 +2914,18 @@ struct ib_block_iter {
 	unsigned int __pg_bit;		/* alignment of current block */
 };
 
-struct ib_device *_ib_alloc_device(size_t size);
+struct ib_device *_ib_alloc_device(size_t size, struct net *net);
 #define ib_alloc_device(drv_struct, member)                                    \
 	container_of(_ib_alloc_device(sizeof(struct drv_struct) +              \
 				      BUILD_BUG_ON_ZERO(offsetof(              \
-					      struct drv_struct, member))),    \
+					      struct drv_struct, member)),     \
+				      &init_net),			       \
+		     struct drv_struct, member)
+
+#define ib_alloc_device_with_net(drv_struct, member, net)		       \
+	container_of(_ib_alloc_device(sizeof(struct drv_struct) +              \
+				      BUILD_BUG_ON_ZERO(offsetof(              \
+					struct drv_struct, member)), net),     \
 		     struct drv_struct, member)
 
 void ib_dealloc_device(struct ib_device *device);
-- 
cgit v1.2.3


From 611d08207d313500d010d8792424346ce70d0cfb Mon Sep 17 00:00:00 2001
From: Mark Bloch <mbloch@nvidia.com>
Date: Tue, 17 Jun 2025 11:44:02 +0300
Subject: RDMA/mlx5: Allocate IB device with net namespace supplied from core
 dev

Use the new ib_alloc_device_with_net() API to allocate the IB device
so that it is properly bound to the network namespace obtained via
mlx5_core_net(). This change ensures correct namespace association
(e.g., for containerized setups).

Additionally, expose mlx5_core_net so that RDMA driver can use it.

Signed-off-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Parav Pandit <parav@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/hw/mlx5/ib_rep.c                | 3 ++-
 drivers/infiniband/hw/mlx5/main.c                  | 6 ++++--
 drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h | 5 -----
 include/linux/mlx5/driver.h                        | 5 +++++
 4 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c
index 49af1cfbe6d1..cc8859d3c2f5 100644
--- a/drivers/infiniband/hw/mlx5/ib_rep.c
+++ b/drivers/infiniband/hw/mlx5/ib_rep.c
@@ -88,7 +88,8 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
 	else
 		return mlx5_ib_set_vport_rep(lag_master, rep, vport_index);
 
-	ibdev = ib_alloc_device(mlx5_ib_dev, ib_dev);
+	ibdev = ib_alloc_device_with_net(mlx5_ib_dev, ib_dev,
+					 mlx5_core_net(lag_master));
 	if (!ibdev)
 		return -ENOMEM;
 
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index efea3ffd9715..c521bce2eeff 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -4793,7 +4793,8 @@ static struct ib_device *mlx5_ib_add_sub_dev(struct ib_device *parent,
 	    !MLX5_CAP_GEN_2(mparent->mdev, multiplane_qp_ud))
 		return ERR_PTR(-EOPNOTSUPP);
 
-	mplane = ib_alloc_device(mlx5_ib_dev, ib_dev);
+	mplane = ib_alloc_device_with_net(mlx5_ib_dev, ib_dev,
+					  mlx5_core_net(mparent->mdev));
 	if (!mplane)
 		return ERR_PTR(-ENOMEM);
 
@@ -4907,7 +4908,8 @@ static int mlx5r_probe(struct auxiliary_device *adev,
 
 	num_ports = max(MLX5_CAP_GEN(mdev, num_ports),
 			MLX5_CAP_GEN(mdev, num_vhca_ports));
-	dev = ib_alloc_device(mlx5_ib_dev, ib_dev);
+	dev = ib_alloc_device_with_net(mlx5_ib_dev, ib_dev,
+				       mlx5_core_net(mdev));
 	if (!dev)
 		return -ENOMEM;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
index 37d5f445598c..b111ccd03b02 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
@@ -45,11 +45,6 @@ int mlx5_crdump_enable(struct mlx5_core_dev *dev);
 void mlx5_crdump_disable(struct mlx5_core_dev *dev);
 int mlx5_crdump_collect(struct mlx5_core_dev *dev, u32 *cr_data);
 
-static inline struct net *mlx5_core_net(struct mlx5_core_dev *dev)
-{
-	return devlink_net(priv_to_devlink(dev));
-}
-
 static inline struct net_device *mlx5_uplink_netdev_get(struct mlx5_core_dev *mdev)
 {
 	return mdev->mlx5e_res.uplink_netdev;
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index e6ba8f4f4bd1..3475d33c75f4 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1349,4 +1349,9 @@ enum {
 };
 
 bool mlx5_wc_support_get(struct mlx5_core_dev *mdev);
+
+static inline struct net *mlx5_core_net(struct mlx5_core_dev *dev)
+{
+	return devlink_net(priv_to_devlink(dev));
+}
 #endif /* MLX5_DRIVER_H */
-- 
cgit v1.2.3


From f1208b05574f63c52e88109d8c75afdf4fc6bf42 Mon Sep 17 00:00:00 2001
From: Mark Bloch <mbloch@nvidia.com>
Date: Tue, 17 Jun 2025 11:44:03 +0300
Subject: RDMA/ipoib: Use parent rdma device net namespace

Use the net namespace of the underlying rdma device.
After honoring the rdma device's namespace, the ipoib
netdev now also runs in the same net namespace of the
rdma device.

Add an API to read the net namespace of the rdma device
so that ULP such as IPoIB can use it to initialize its
netdev.

Signed-off-by: Parav Pandit <parav@nvidia.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/ulp/ipoib/ipoib_main.c | 2 ++
 include/rdma/ib_verbs.h                   | 5 +++++
 2 files changed, 7 insertions(+)

(limited to 'include')

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index f2f5465f2a90..7acafc5c0e09 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -2577,6 +2577,8 @@ static struct net_device *ipoib_add_port(const char *format,
 
 	ndev->rtnl_link_ops = ipoib_get_link_ops();
 
+	dev_net_set(ndev, rdma_dev_net(hca));
+
 	result = register_netdev(ndev);
 	if (result) {
 		pr_warn("%s: couldn't register ipoib port %d; error %d\n",
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index b91a81234832..7da27f01eeb6 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -4871,6 +4871,11 @@ static inline int ibdev_to_node(struct ib_device *ibdev)
 bool rdma_dev_access_netns(const struct ib_device *device,
 			   const struct net *net);
 
+static inline struct net *rdma_dev_net(struct ib_device *device)
+{
+	return read_pnet(&device->coredev.rdma_net);
+}
+
 #define IB_ROCE_UDP_ENCAP_VALID_PORT_MIN (0xC000)
 #define IB_ROCE_UDP_ENCAP_VALID_PORT_MAX (0xFFFF)
 #define IB_GRH_FLOWLABEL_MASK (0x000FFFFF)
-- 
cgit v1.2.3


From df0f030ee7e444c55341f4210124115878284125 Mon Sep 17 00:00:00 2001
From: Vladimir Kondratiev <vladimir.kondratiev@mobileye.com>
Date: Thu, 12 Jun 2025 17:39:08 +0300
Subject: irqchip/thead-c900-aclint-sswi: Generalize aclint-sswi driver and add
 MIPS P8700 support

Refactor the Thead specific implementation of the ACLINT-SSWI irqchip:

 - Rename the source file and related details to reflect the generic nature
   of the driver

 - Factor out the generic code that serves both Thead and MIPS variants.
   This generic part is compliant with the RISC-V draft spec [1]

 - Provide generic and Thead specific initialization functions

Signed-off-by: Vladimir Kondratiev <vladimir.kondratiev@mobileye.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/20250612143911.3224046-5-vladimir.kondratiev@mobileye.com
Link: https://github.com/riscvarchive/riscv-aclint [1]
---
 drivers/irqchip/Kconfig                      |  15 +-
 drivers/irqchip/Makefile                     |   2 +-
 drivers/irqchip/irq-aclint-sswi.c            | 208 +++++++++++++++++++++++++++
 drivers/irqchip/irq-thead-c900-aclint-sswi.c | 176 -----------------------
 include/linux/cpuhotplug.h                   |   2 +-
 5 files changed, 221 insertions(+), 182 deletions(-)
 create mode 100644 drivers/irqchip/irq-aclint-sswi.c
 delete mode 100644 drivers/irqchip/irq-thead-c900-aclint-sswi.c

(limited to 'include')

diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig
index 0d196e447142..39f6f421fc75 100644
--- a/drivers/irqchip/Kconfig
+++ b/drivers/irqchip/Kconfig
@@ -634,18 +634,25 @@ config STARFIVE_JH8100_INTC
 
 	  If you don't know what to do here, say Y.
 
-config THEAD_C900_ACLINT_SSWI
-	bool "THEAD C9XX ACLINT S-mode IPI Interrupt Controller"
+config ACLINT_SSWI
+	bool "RISC-V ACLINT S-mode IPI Interrupt Controller"
 	depends on RISCV
 	depends on SMP
 	select IRQ_DOMAIN_HIERARCHY
 	select GENERIC_IRQ_IPI_MUX
 	help
-	  This enables support for T-HEAD specific ACLINT SSWI device
-	  support.
+	  This enables support for variants of the RISC-V ACLINT-SSWI device.
+	  Supported variants are:
+	  - T-HEAD, with compatible "thead,c900-aclint-sswi"
+	  - MIPS P8700, with compatible "mips,p8700-aclint-sswi"
 
 	  If you don't know what to do here, say Y.
 
+# Backwards compatibility so oldconfig does not drop it.
+config THEAD_C900_ACLINT_SSWI
+	bool
+	select ACLINT_SSWI
+
 config EXYNOS_IRQ_COMBINER
 	bool "Samsung Exynos IRQ combiner support" if COMPILE_TEST
 	depends on (ARCH_EXYNOS && ARM) || COMPILE_TEST
diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile
index 23ca4959e6ce..0458d6c5d161 100644
--- a/drivers/irqchip/Makefile
+++ b/drivers/irqchip/Makefile
@@ -105,7 +105,7 @@ obj-$(CONFIG_RISCV_APLIC_MSI)		+= irq-riscv-aplic-msi.o
 obj-$(CONFIG_RISCV_IMSIC)		+= irq-riscv-imsic-state.o irq-riscv-imsic-early.o irq-riscv-imsic-platform.o
 obj-$(CONFIG_SIFIVE_PLIC)		+= irq-sifive-plic.o
 obj-$(CONFIG_STARFIVE_JH8100_INTC)	+= irq-starfive-jh8100-intc.o
-obj-$(CONFIG_THEAD_C900_ACLINT_SSWI)	+= irq-thead-c900-aclint-sswi.o
+obj-$(CONFIG_ACLINT_SSWI)		+= irq-aclint-sswi.o
 obj-$(CONFIG_IMX_IRQSTEER)		+= irq-imx-irqsteer.o
 obj-$(CONFIG_IMX_INTMUX)		+= irq-imx-intmux.o
 obj-$(CONFIG_IMX_MU_MSI)		+= irq-imx-mu-msi.o
diff --git a/drivers/irqchip/irq-aclint-sswi.c b/drivers/irqchip/irq-aclint-sswi.c
new file mode 100644
index 000000000000..0131194d4847
--- /dev/null
+++ b/drivers/irqchip/irq-aclint-sswi.c
@@ -0,0 +1,208 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2024 Inochi Amaoto <inochiama@gmail.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cpu.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/irq.h>
+#include <linux/irqchip.h>
+#include <linux/irqchip/chained_irq.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/string_choices.h>
+#include <asm/sbi.h>
+#include <asm/vendorid_list.h>
+
+static int sswi_ipi_virq __ro_after_init;
+static DEFINE_PER_CPU(void __iomem *, sswi_cpu_regs);
+
+static void aclint_sswi_ipi_send(unsigned int cpu)
+{
+	writel(0x1, per_cpu(sswi_cpu_regs, cpu));
+}
+
+static void aclint_sswi_ipi_clear(void)
+{
+	writel_relaxed(0x0, this_cpu_read(sswi_cpu_regs));
+}
+
+static void aclint_sswi_ipi_handle(struct irq_desc *desc)
+{
+	struct irq_chip *chip = irq_desc_get_chip(desc);
+
+	chained_irq_enter(chip, desc);
+
+	csr_clear(CSR_IP, IE_SIE);
+	aclint_sswi_ipi_clear();
+
+	ipi_mux_process();
+
+	chained_irq_exit(chip, desc);
+}
+
+static int aclint_sswi_starting_cpu(unsigned int cpu)
+{
+	enable_percpu_irq(sswi_ipi_virq, irq_get_trigger_type(sswi_ipi_virq));
+
+	return 0;
+}
+
+static int aclint_sswi_dying_cpu(unsigned int cpu)
+{
+	aclint_sswi_ipi_clear();
+
+	disable_percpu_irq(sswi_ipi_virq);
+
+	return 0;
+}
+
+static int __init aclint_sswi_parse_irq(struct fwnode_handle *fwnode, void __iomem *reg)
+{
+	struct of_phandle_args parent;
+	unsigned long hartid;
+	u32 contexts, i;
+	int rc, cpu;
+
+	contexts = of_irq_count(to_of_node(fwnode));
+	if (!(contexts)) {
+		pr_err("%pfwP: no ACLINT SSWI context available\n", fwnode);
+		return -EINVAL;
+	}
+
+	for (i = 0; i < contexts; i++) {
+		rc = of_irq_parse_one(to_of_node(fwnode), i, &parent);
+		if (rc)
+			return rc;
+
+		rc = riscv_of_parent_hartid(parent.np, &hartid);
+		if (rc)
+			return rc;
+
+		if (parent.args[0] != RV_IRQ_SOFT)
+			return -ENOTSUPP;
+
+		cpu = riscv_hartid_to_cpuid(hartid);
+		/* Context i is signalled through the i-th 32-bit register of @reg */
+		per_cpu(sswi_cpu_regs, cpu) = reg + i * 4;
+	}
+
+	pr_info("%pfwP: register %u CPU%s\n", fwnode, contexts, str_plural(contexts));
+
+	return 0;
+}
+
+static int __init aclint_sswi_probe(struct fwnode_handle *fwnode)
+{
+	struct irq_domain *domain;
+	void __iomem *reg;
+	int virq, rc;
+
+	if (!is_of_node(fwnode))
+		return -EINVAL;
+
+	reg = of_iomap(to_of_node(fwnode), 0);
+	if (!reg)
+		return -ENOMEM;
+
+	/* Parse SSWI setting */
+	rc = aclint_sswi_parse_irq(fwnode, reg);
+	if (rc < 0)
+		return rc;
+
+	/* If multiple SSWI devices are present, do not register irq again */
+	if (sswi_ipi_virq)
+		return 0;
+
+	/* Find riscv intc domain and create IPI irq mapping */
+	domain = irq_find_matching_fwnode(riscv_get_intc_hwnode(), DOMAIN_BUS_ANY);
+	if (!domain) {
+		pr_err("%pfwP: Failed to find INTC domain\n", fwnode);
+		return -ENOENT;
+	}
+
+	sswi_ipi_virq = irq_create_mapping(domain, RV_IRQ_SOFT);
+	if (!sswi_ipi_virq) {
+		pr_err("unable to create ACLINT SSWI IRQ mapping\n");
+		return -ENOMEM;
+	}
+
+	/* Register SSWI irq and handler */
+	virq = ipi_mux_create(BITS_PER_BYTE, aclint_sswi_ipi_send);
+	if (virq <= 0) {
+		pr_err("unable to create muxed IPIs\n");
+		irq_dispose_mapping(sswi_ipi_virq);
+		return virq < 0 ? virq : -ENOMEM;
+	}
+
+	irq_set_chained_handler(sswi_ipi_virq, aclint_sswi_ipi_handle);
+
+	cpuhp_setup_state(CPUHP_AP_IRQ_ACLINT_SSWI_STARTING,
+			  "irqchip/aclint-sswi:starting",
+			  aclint_sswi_starting_cpu,
+			  aclint_sswi_dying_cpu);
+
+	riscv_ipi_set_virq_range(virq, BITS_PER_BYTE);
+
+	return 0;
+}
+
+/* generic/MIPS variant */
+static int __init generic_aclint_sswi_probe(struct fwnode_handle *fwnode)
+{
+	int rc;
+
+	rc = aclint_sswi_probe(fwnode);
+	if (rc)
+		return rc;
+
+	/* Announce that SSWI is providing IPIs */
+	pr_info("providing IPIs using ACLINT SSWI\n");
+
+	return 0;
+}
+
+static int __init generic_aclint_sswi_early_probe(struct device_node *node,
+						  struct device_node *parent)
+{
+	return generic_aclint_sswi_probe(&node->fwnode);
+}
+IRQCHIP_DECLARE(generic_aclint_sswi, "mips,p8700-aclint-sswi", generic_aclint_sswi_early_probe);
+
+/* THEAD variant */
+#define THEAD_C9XX_CSR_SXSTATUS			0x5c0
+#define THEAD_C9XX_SXSTATUS_CLINTEE		BIT(17)
+
+static int __init thead_aclint_sswi_probe(struct fwnode_handle *fwnode)
+{
+	int rc;
+
+	/* If it is T-HEAD CPU, check whether SSWI is enabled */
+	if (riscv_cached_mvendorid(0) == THEAD_VENDOR_ID &&
+	    !(csr_read(THEAD_C9XX_CSR_SXSTATUS) & THEAD_C9XX_SXSTATUS_CLINTEE))
+		return -ENOTSUPP;
+
+	rc = aclint_sswi_probe(fwnode);
+	if (rc)
+		return rc;
+
+	/* Announce that SSWI is providing IPIs */
+	pr_info("providing IPIs using THEAD ACLINT SSWI\n");
+
+	return 0;
+}
+
+static int __init thead_aclint_sswi_early_probe(struct device_node *node,
+						struct device_node *parent)
+{
+	return thead_aclint_sswi_probe(&node->fwnode);
+}
+IRQCHIP_DECLARE(thead_aclint_sswi, "thead,c900-aclint-sswi", thead_aclint_sswi_early_probe);
diff --git a/drivers/irqchip/irq-thead-c900-aclint-sswi.c b/drivers/irqchip/irq-thead-c900-aclint-sswi.c
deleted file mode 100644
index 8ff6e7a1363b..000000000000
--- a/drivers/irqchip/irq-thead-c900-aclint-sswi.c
+++ /dev/null
@@ -1,176 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2024 Inochi Amaoto <inochiama@gmail.com>
- */
-
-#define pr_fmt(fmt) "thead-c900-aclint-sswi: " fmt
-#include <linux/cpu.h>
-#include <linux/interrupt.h>
-#include <linux/io.h>
-#include <linux/irq.h>
-#include <linux/irqchip.h>
-#include <linux/irqchip/chained_irq.h>
-#include <linux/module.h>
-#include <linux/of.h>
-#include <linux/of_address.h>
-#include <linux/of_irq.h>
-#include <linux/pci.h>
-#include <linux/spinlock.h>
-#include <linux/smp.h>
-#include <linux/string_choices.h>
-#include <asm/sbi.h>
-#include <asm/vendorid_list.h>
-
-#define THEAD_ACLINT_xSWI_REGISTER_SIZE		4
-
-#define THEAD_C9XX_CSR_SXSTATUS			0x5c0
-#define THEAD_C9XX_SXSTATUS_CLINTEE		BIT(17)
-
-static int sswi_ipi_virq __ro_after_init;
-static DEFINE_PER_CPU(void __iomem *, sswi_cpu_regs);
-
-static void thead_aclint_sswi_ipi_send(unsigned int cpu)
-{
-	writel(0x1, per_cpu(sswi_cpu_regs, cpu));
-}
-
-static void thead_aclint_sswi_ipi_clear(void)
-{
-	writel_relaxed(0x0, this_cpu_read(sswi_cpu_regs));
-}
-
-static void thead_aclint_sswi_ipi_handle(struct irq_desc *desc)
-{
-	struct irq_chip *chip = irq_desc_get_chip(desc);
-
-	chained_irq_enter(chip, desc);
-
-	csr_clear(CSR_IP, IE_SIE);
-	thead_aclint_sswi_ipi_clear();
-
-	ipi_mux_process();
-
-	chained_irq_exit(chip, desc);
-}
-
-static int thead_aclint_sswi_starting_cpu(unsigned int cpu)
-{
-	enable_percpu_irq(sswi_ipi_virq, irq_get_trigger_type(sswi_ipi_virq));
-
-	return 0;
-}
-
-static int thead_aclint_sswi_dying_cpu(unsigned int cpu)
-{
-	thead_aclint_sswi_ipi_clear();
-
-	disable_percpu_irq(sswi_ipi_virq);
-
-	return 0;
-}
-
-static int __init thead_aclint_sswi_parse_irq(struct fwnode_handle *fwnode,
-					      void __iomem *reg)
-{
-	struct of_phandle_args parent;
-	unsigned long hartid;
-	u32 contexts, i;
-	int rc, cpu;
-
-	contexts = of_irq_count(to_of_node(fwnode));
-	if (!(contexts)) {
-		pr_err("%pfwP: no ACLINT SSWI context available\n", fwnode);
-		return -EINVAL;
-	}
-
-	for (i = 0; i < contexts; i++) {
-		rc = of_irq_parse_one(to_of_node(fwnode), i, &parent);
-		if (rc)
-			return rc;
-
-		rc = riscv_of_parent_hartid(parent.np, &hartid);
-		if (rc)
-			return rc;
-
-		if (parent.args[0] != RV_IRQ_SOFT)
-			return -ENOTSUPP;
-
-		cpu = riscv_hartid_to_cpuid(hartid);
-
-		per_cpu(sswi_cpu_regs, cpu) = reg + i * THEAD_ACLINT_xSWI_REGISTER_SIZE;
-	}
-
-	pr_info("%pfwP: register %u CPU%s\n", fwnode, contexts, str_plural(contexts));
-
-	return 0;
-}
-
-static int __init thead_aclint_sswi_probe(struct fwnode_handle *fwnode)
-{
-	struct irq_domain *domain;
-	void __iomem *reg;
-	int virq, rc;
-
-	/* If it is T-HEAD CPU, check whether SSWI is enabled */
-	if (riscv_cached_mvendorid(0) == THEAD_VENDOR_ID &&
-	    !(csr_read(THEAD_C9XX_CSR_SXSTATUS) & THEAD_C9XX_SXSTATUS_CLINTEE))
-		return -ENOTSUPP;
-
-	if (!is_of_node(fwnode))
-		return -EINVAL;
-
-	reg = of_iomap(to_of_node(fwnode), 0);
-	if (!reg)
-		return -ENOMEM;
-
-	/* Parse SSWI setting */
-	rc = thead_aclint_sswi_parse_irq(fwnode, reg);
-	if (rc < 0)
-		return rc;
-
-	/* If mulitple SSWI devices are present, do not register irq again */
-	if (sswi_ipi_virq)
-		return 0;
-
-	/* Find riscv intc domain and create IPI irq mapping */
-	domain = irq_find_matching_fwnode(riscv_get_intc_hwnode(), DOMAIN_BUS_ANY);
-	if (!domain) {
-		pr_err("%pfwP: Failed to find INTC domain\n", fwnode);
-		return -ENOENT;
-	}
-
-	sswi_ipi_virq = irq_create_mapping(domain, RV_IRQ_SOFT);
-	if (!sswi_ipi_virq) {
-		pr_err("unable to create ACLINT SSWI IRQ mapping\n");
-		return -ENOMEM;
-	}
-
-	/* Register SSWI irq and handler */
-	virq = ipi_mux_create(BITS_PER_BYTE, thead_aclint_sswi_ipi_send);
-	if (virq <= 0) {
-		pr_err("unable to create muxed IPIs\n");
-		irq_dispose_mapping(sswi_ipi_virq);
-		return virq < 0 ? virq : -ENOMEM;
-	}
-
-	irq_set_chained_handler(sswi_ipi_virq, thead_aclint_sswi_ipi_handle);
-
-	cpuhp_setup_state(CPUHP_AP_IRQ_THEAD_ACLINT_SSWI_STARTING,
-			  "irqchip/thead-aclint-sswi:starting",
-			  thead_aclint_sswi_starting_cpu,
-			  thead_aclint_sswi_dying_cpu);
-
-	riscv_ipi_set_virq_range(virq, BITS_PER_BYTE);
-
-	/* Announce that SSWI is providing IPIs */
-	pr_info("providing IPIs using THEAD ACLINT SSWI\n");
-
-	return 0;
-}
-
-static int __init thead_aclint_sswi_early_probe(struct device_node *node,
-						struct device_node *parent)
-{
-	return thead_aclint_sswi_probe(&node->fwnode);
-}
-IRQCHIP_DECLARE(thead_aclint_sswi, "thead,c900-aclint-sswi", thead_aclint_sswi_early_probe);
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index df366ee15456..d381420bbd5f 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -145,7 +145,7 @@ enum cpuhp_state {
 	CPUHP_AP_IRQ_EIOINTC_STARTING,
 	CPUHP_AP_IRQ_AVECINTC_STARTING,
 	CPUHP_AP_IRQ_SIFIVE_PLIC_STARTING,
-	CPUHP_AP_IRQ_THEAD_ACLINT_SSWI_STARTING,
+	CPUHP_AP_IRQ_ACLINT_SSWI_STARTING,
 	CPUHP_AP_IRQ_RISCV_IMSIC_STARTING,
 	CPUHP_AP_IRQ_RISCV_SBI_IPI_STARTING,
 	CPUHP_AP_ARM_MVEBU_COHERENCY,
-- 
cgit v1.2.3


From f86ad0ed620cb3c91ec7d5468e93ac68d727539d Mon Sep 17 00:00:00 2001
From: Matthew Brost <matthew.brost@intel.com>
Date: Thu, 19 Jun 2025 15:40:33 +0200
Subject: drm/gpusvm, drm/pagemap: Move migration functionality to drm_pagemap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The migration functionality and the bookkeeping of per-pagemap VRAM
mapped to the CPU mm are not per GPU VM, but rather per pagemap.
This is also reflected by the functions not needing the drm_gpusvm
structures. So move them to drm_pagemap.

With this, drm_gpusvm shouldn't really access the page zone-device-data
since its meaning is internal to drm_pagemap. Currently it's used to
reject mapping ranges backed by multiple drm_pagemap allocations.
For now, make the zone-device-data a void pointer.

Alter the interface of drm_gpusvm_migrate_to_devmem() to ensure we don't
pass a gpusvm pointer.

Rename CONFIG_DRM_XE_DEVMEM_MIRROR to CONFIG_DRM_XE_PAGEMAP.

Matt is listed as author of this commit since he wrote most of the code,
and it makes sense to retain his git authorship.
Thomas mostly moved the code around.

v3:
- Kerneldoc fixes (CI)
- Don't update documentation about how the drm_pagemap
  migration should be interpreted until upcoming
  patches where the functionality is implemented.
  (Matt Brost)
v4:
- More kerneldoc fixes around timeslice_ms
  (Himal Ghimiray, Matt Brost)
v6:
- Fix an uninitialized pagemap pointer (CI)

Co-developed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Reviewed-by: Himal Prasad Ghimiray <himal.prasad.ghimiray@intel.com>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Link: https://lore.kernel.org/r/20250619134035.170086-2-thomas.hellstrom@linux.intel.com
---
 Documentation/gpu/rfc/gpusvm.rst     |  12 +-
 drivers/gpu/drm/Makefile             |   6 +-
 drivers/gpu/drm/drm_gpusvm.c         | 761 +--------------------------------
 drivers/gpu/drm/drm_pagemap.c        | 794 +++++++++++++++++++++++++++++++++++
 drivers/gpu/drm/xe/Kconfig           |  10 +-
 drivers/gpu/drm/xe/xe_bo_types.h     |   2 +-
 drivers/gpu/drm/xe/xe_device_types.h |   2 +-
 drivers/gpu/drm/xe/xe_svm.c          |  47 ++-
 include/drm/drm_gpusvm.h             |  96 -----
 include/drm/drm_pagemap.h            | 101 +++++
 10 files changed, 958 insertions(+), 873 deletions(-)
 create mode 100644 drivers/gpu/drm/drm_pagemap.c

(limited to 'include')

diff --git a/Documentation/gpu/rfc/gpusvm.rst b/Documentation/gpu/rfc/gpusvm.rst
index bcf66a8137a6..469db1372f16 100644
--- a/Documentation/gpu/rfc/gpusvm.rst
+++ b/Documentation/gpu/rfc/gpusvm.rst
@@ -73,15 +73,21 @@ Overview of baseline design
 .. kernel-doc:: drivers/gpu/drm/drm_gpusvm.c
    :doc: Locking
 
-.. kernel-doc:: drivers/gpu/drm/drm_gpusvm.c
-   :doc: Migration
-
 .. kernel-doc:: drivers/gpu/drm/drm_gpusvm.c
    :doc: Partial Unmapping of Ranges
 
 .. kernel-doc:: drivers/gpu/drm/drm_gpusvm.c
    :doc: Examples
 
+Overview of drm_pagemap design
+==============================
+
+.. kernel-doc:: drivers/gpu/drm/drm_pagemap.c
+   :doc: Overview
+
+.. kernel-doc:: drivers/gpu/drm/drm_pagemap.c
+   :doc: Migration
+
 Possible future design features
 ===============================
 
diff --git a/drivers/gpu/drm/Makefile b/drivers/gpu/drm/Makefile
index 5050ac32bba2..4dafbdc8f86a 100644
--- a/drivers/gpu/drm/Makefile
+++ b/drivers/gpu/drm/Makefile
@@ -104,7 +104,11 @@ obj-$(CONFIG_DRM_PANEL_BACKLIGHT_QUIRKS) += drm_panel_backlight_quirks.o
 #
 obj-$(CONFIG_DRM_EXEC) += drm_exec.o
 obj-$(CONFIG_DRM_GPUVM) += drm_gpuvm.o
-obj-$(CONFIG_DRM_GPUSVM) += drm_gpusvm.o
+
+drm_gpusvm_helper-y := \
+	drm_gpusvm.o\
+	drm_pagemap.o
+obj-$(CONFIG_DRM_GPUSVM) += drm_gpusvm_helper.o
 
 obj-$(CONFIG_DRM_BUDDY) += drm_buddy.o
 
diff --git a/drivers/gpu/drm/drm_gpusvm.c b/drivers/gpu/drm/drm_gpusvm.c
index ab198645d90f..e454bb806c72 100644
--- a/drivers/gpu/drm/drm_gpusvm.c
+++ b/drivers/gpu/drm/drm_gpusvm.c
@@ -9,10 +9,9 @@
 #include <linux/dma-mapping.h>
 #include <linux/export.h>
 #include <linux/hmm.h>
+#include <linux/hugetlb_inline.h>
 #include <linux/memremap.h>
-#include <linux/migrate.h>
 #include <linux/mm_types.h>
-#include <linux/pagemap.h>
 #include <linux/slab.h>
 
 #include <drm/drm_device.h>
@@ -108,21 +107,6 @@
  * to add annotations to GPU SVM.
  */
 
-/**
- * DOC: Migration
- *
- * The migration support is quite simple, allowing migration between RAM and
- * device memory at the range granularity. For example, GPU SVM currently does
- * not support mixing RAM and device memory pages within a range. This means
- * that upon GPU fault, the entire range can be migrated to device memory, and
- * upon CPU fault, the entire range is migrated to RAM. Mixed RAM and device
- * memory storage within a range could be added in the future if required.
- *
- * The reasoning for only supporting range granularity is as follows: it
- * simplifies the implementation, and range sizes are driver-defined and should
- * be relatively small.
- */
-
 /**
  * DOC: Partial Unmapping of Ranges
  *
@@ -194,10 +178,10 @@
  *		if (driver_migration_policy(range)) {
  *			mmap_read_lock(mm);
  *			devmem = driver_alloc_devmem();
- *			err = drm_gpusvm_migrate_to_devmem(gpusvm, range,
- *							   devmem_allocation,
- *							   &ctx);
- *			mmap_read_unlock(mm);
+ *			err = drm_pagemap_migrate_to_devmem(devmem, gpusvm->mm, gpuva_start,
+ *							    gpuva_end, ctx.timeslice_ms,
+ *							    driver_pgmap_owner());
+ *			mmap_read_unlock(mm);
  *			if (err)	// CPU mappings may have changed
  *				goto retry;
  *		}
@@ -289,97 +273,6 @@ npages_in_range(unsigned long start, unsigned long end)
 	return (end - start) >> PAGE_SHIFT;
 }
 
-/**
- * struct drm_gpusvm_zdd - GPU SVM zone device data
- *
- * @refcount: Reference count for the zdd
- * @devmem_allocation: device memory allocation
- * @device_private_page_owner: Device private pages owner
- *
- * This structure serves as a generic wrapper installed in
- * page->zone_device_data. It provides infrastructure for looking up a device
- * memory allocation upon CPU page fault and asynchronously releasing device
- * memory once the CPU has no page references. Asynchronous release is useful
- * because CPU page references can be dropped in IRQ contexts, while releasing
- * device memory likely requires sleeping locks.
- */
-struct drm_gpusvm_zdd {
-	struct kref refcount;
-	struct drm_gpusvm_devmem *devmem_allocation;
-	void *device_private_page_owner;
-};
-
-/**
- * drm_gpusvm_zdd_alloc() - Allocate a zdd structure.
- * @device_private_page_owner: Device private pages owner
- *
- * This function allocates and initializes a new zdd structure. It sets up the
- * reference count and initializes the destroy work.
- *
- * Return: Pointer to the allocated zdd on success, ERR_PTR() on failure.
- */
-static struct drm_gpusvm_zdd *
-drm_gpusvm_zdd_alloc(void *device_private_page_owner)
-{
-	struct drm_gpusvm_zdd *zdd;
-
-	zdd = kmalloc(sizeof(*zdd), GFP_KERNEL);
-	if (!zdd)
-		return NULL;
-
-	kref_init(&zdd->refcount);
-	zdd->devmem_allocation = NULL;
-	zdd->device_private_page_owner = device_private_page_owner;
-
-	return zdd;
-}
-
-/**
- * drm_gpusvm_zdd_get() - Get a reference to a zdd structure.
- * @zdd: Pointer to the zdd structure.
- *
- * This function increments the reference count of the provided zdd structure.
- *
- * Return: Pointer to the zdd structure.
- */
-static struct drm_gpusvm_zdd *drm_gpusvm_zdd_get(struct drm_gpusvm_zdd *zdd)
-{
-	kref_get(&zdd->refcount);
-	return zdd;
-}
-
-/**
- * drm_gpusvm_zdd_destroy() - Destroy a zdd structure.
- * @ref: Pointer to the reference count structure.
- *
- * This function queues the destroy_work of the zdd for asynchronous destruction.
- */
-static void drm_gpusvm_zdd_destroy(struct kref *ref)
-{
-	struct drm_gpusvm_zdd *zdd =
-		container_of(ref, struct drm_gpusvm_zdd, refcount);
-	struct drm_gpusvm_devmem *devmem = zdd->devmem_allocation;
-
-	if (devmem) {
-		complete_all(&devmem->detached);
-		if (devmem->ops->devmem_release)
-			devmem->ops->devmem_release(devmem);
-	}
-	kfree(zdd);
-}
-
-/**
- * drm_gpusvm_zdd_put() - Put a zdd reference.
- * @zdd: Pointer to the zdd structure.
- *
- * This function decrements the reference count of the provided zdd structure
- * and schedules its destruction if the count drops to zero.
- */
-static void drm_gpusvm_zdd_put(struct drm_gpusvm_zdd *zdd)
-{
-	kref_put(&zdd->refcount, drm_gpusvm_zdd_destroy);
-}
-
 /**
  * drm_gpusvm_range_find() - Find GPU SVM range from GPU SVM notifier
  * @notifier: Pointer to the GPU SVM notifier structure.
@@ -946,7 +839,7 @@ retry:
 		 * process-many-malloc' fails. In the failure case, each process
 		 * mallocs 16k but the CPU VMA is ~128k which results in 64k SVM
 		 * ranges. When migrating the SVM ranges, some processes fail in
-		 * drm_gpusvm_migrate_to_devmem with 'migrate.cpages != npages'
+		 * drm_pagemap_migrate_to_devmem with 'migrate.cpages != npages'
 		 * and then upon drm_gpusvm_range_get_pages device pages from
 		 * other processes are collected + faulted in which creates all
 		 * sorts of problems. Unsure exactly how this happening, also
@@ -1364,7 +1257,7 @@ int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
 		.dev_private_owner = gpusvm->device_private_page_owner,
 	};
 	struct mm_struct *mm = gpusvm->mm;
-	struct drm_gpusvm_zdd *zdd;
+	void *zdd;
 	unsigned long timeout =
 		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
 	unsigned long i, j;
@@ -1447,6 +1340,7 @@ map_pages:
 	}
 
 	zdd = NULL;
+	pagemap = NULL;
 	num_dma_mapped = 0;
 	for (i = 0, j = 0; i < npages; ++j) {
 		struct page *page = hmm_pfn_to_page(pfns[i]);
@@ -1466,7 +1360,7 @@ map_pages:
 				}
 
 				pagemap = page_pgmap(page);
-				dpagemap = zdd->devmem_allocation->dpagemap;
+				dpagemap = drm_pagemap_page_to_dpagemap(page);
 				if (drm_WARN_ON(gpusvm->drm, !dpagemap)) {
 					/*
 					 * Raced. This is not supposed to happen
@@ -1490,7 +1384,7 @@ map_pages:
 		} else {
 			dma_addr_t addr;
 
-			if (is_zone_device_page(page) || zdd) {
+			if (is_zone_device_page(page) || pagemap) {
 				err = -EOPNOTSUPP;
 				goto err_unmap;
 			}
@@ -1518,7 +1412,7 @@ map_pages:
 		flags.has_dma_mapping = true;
 	}
 
-	if (zdd) {
+	if (pagemap) {
 		flags.has_devmem_pages = true;
 		range->dpagemap = dpagemap;
 	}
@@ -1546,6 +1440,7 @@ EXPORT_SYMBOL_GPL(drm_gpusvm_range_get_pages);
 
 /**
  * drm_gpusvm_range_unmap_pages() - Unmap pages associated with a GPU SVM range
+ * drm_gpusvm_range_evict() - Evict GPU SVM range
  * @gpusvm: Pointer to the GPU SVM structure
  * @range: Pointer to the GPU SVM range structure
  * @ctx: GPU SVM context
@@ -1576,562 +1471,11 @@ void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
 EXPORT_SYMBOL_GPL(drm_gpusvm_range_unmap_pages);
 
 /**
- * drm_gpusvm_migration_unlock_put_page() - Put a migration page
- * @page: Pointer to the page to put
- *
- * This function unlocks and puts a page.
- */
-static void drm_gpusvm_migration_unlock_put_page(struct page *page)
-{
-	unlock_page(page);
-	put_page(page);
-}
-
-/**
- * drm_gpusvm_migration_unlock_put_pages() - Put migration pages
- * @npages: Number of pages
- * @migrate_pfn: Array of migrate page frame numbers
- *
- * This function unlocks and puts an array of pages.
- */
-static void drm_gpusvm_migration_unlock_put_pages(unsigned long npages,
-						  unsigned long *migrate_pfn)
-{
-	unsigned long i;
-
-	for (i = 0; i < npages; ++i) {
-		struct page *page;
-
-		if (!migrate_pfn[i])
-			continue;
-
-		page = migrate_pfn_to_page(migrate_pfn[i]);
-		drm_gpusvm_migration_unlock_put_page(page);
-		migrate_pfn[i] = 0;
-	}
-}
-
-/**
- * drm_gpusvm_get_devmem_page() - Get a reference to a device memory page
- * @page: Pointer to the page
- * @zdd: Pointer to the GPU SVM zone device data
- *
- * This function associates the given page with the specified GPU SVM zone
- * device data and initializes it for zone device usage.
- */
-static void drm_gpusvm_get_devmem_page(struct page *page,
-				       struct drm_gpusvm_zdd *zdd)
-{
-	page->zone_device_data = drm_gpusvm_zdd_get(zdd);
-	zone_device_page_init(page);
-}
-
-/**
- * drm_gpusvm_migrate_map_pages() - Map migration pages for GPU SVM migration
- * @dev: The device for which the pages are being mapped
- * @dma_addr: Array to store DMA addresses corresponding to mapped pages
- * @migrate_pfn: Array of migrate page frame numbers to map
- * @npages: Number of pages to map
- * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
- *
- * This function maps pages of memory for migration usage in GPU SVM. It
- * iterates over each page frame number provided in @migrate_pfn, maps the
- * corresponding page, and stores the DMA address in the provided @dma_addr
- * array.
- *
- * Return: 0 on success, -EFAULT if an error occurs during mapping.
- */
-static int drm_gpusvm_migrate_map_pages(struct device *dev,
-					dma_addr_t *dma_addr,
-					unsigned long *migrate_pfn,
-					unsigned long npages,
-					enum dma_data_direction dir)
-{
-	unsigned long i;
-
-	for (i = 0; i < npages; ++i) {
-		struct page *page = migrate_pfn_to_page(migrate_pfn[i]);
-
-		if (!page)
-			continue;
-
-		if (WARN_ON_ONCE(is_zone_device_page(page)))
-			return -EFAULT;
-
-		dma_addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir);
-		if (dma_mapping_error(dev, dma_addr[i]))
-			return -EFAULT;
-	}
-
-	return 0;
-}
-
-/**
- * drm_gpusvm_migrate_unmap_pages() - Unmap pages previously mapped for GPU SVM migration
- * @dev: The device for which the pages were mapped
- * @dma_addr: Array of DMA addresses corresponding to mapped pages
- * @npages: Number of pages to unmap
- * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
- *
- * This function unmaps previously mapped pages of memory for GPU Shared Virtual
- * Memory (SVM). It iterates over each DMA address provided in @dma_addr, checks
- * if it's valid and not already unmapped, and unmaps the corresponding page.
- */
-static void drm_gpusvm_migrate_unmap_pages(struct device *dev,
-					   dma_addr_t *dma_addr,
-					   unsigned long npages,
-					   enum dma_data_direction dir)
-{
-	unsigned long i;
-
-	for (i = 0; i < npages; ++i) {
-		if (!dma_addr[i] || dma_mapping_error(dev, dma_addr[i]))
-			continue;
-
-		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
-	}
-}
-
-/**
- * drm_gpusvm_migrate_to_devmem() - Migrate GPU SVM range to device memory
+ * drm_gpusvm_range_evict() - Evict GPU SVM range
  * @gpusvm: Pointer to the GPU SVM structure
- * @range: Pointer to the GPU SVM range structure
- * @devmem_allocation: Pointer to the device memory allocation. The caller
- *                     should hold a reference to the device memory allocation,
- *                     which should be dropped via ops->devmem_release or upon
- *                     the failure of this function.
- * @ctx: GPU SVM context
- *
- * This function migrates the specified GPU SVM range to device memory. It
- * performs the necessary setup and invokes the driver-specific operations for
- * migration to device memory. Upon successful return, @devmem_allocation can
- * safely reference @range until ops->devmem_release is called which only upon
- * successful return. Expected to be called while holding the mmap lock in read
- * mode.
- *
- * Return: 0 on success, negative error code on failure.
- */
-int drm_gpusvm_migrate_to_devmem(struct drm_gpusvm *gpusvm,
-				 struct drm_gpusvm_range *range,
-				 struct drm_gpusvm_devmem *devmem_allocation,
-				 const struct drm_gpusvm_ctx *ctx)
-{
-	const struct drm_gpusvm_devmem_ops *ops = devmem_allocation->ops;
-	unsigned long start = drm_gpusvm_range_start(range),
-		      end = drm_gpusvm_range_end(range);
-	struct migrate_vma migrate = {
-		.start		= start,
-		.end		= end,
-		.pgmap_owner	= gpusvm->device_private_page_owner,
-		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
-	};
-	struct mm_struct *mm = gpusvm->mm;
-	unsigned long i, npages = npages_in_range(start, end);
-	struct vm_area_struct *vas;
-	struct drm_gpusvm_zdd *zdd = NULL;
-	struct page **pages;
-	dma_addr_t *dma_addr;
-	void *buf;
-	int err;
-
-	mmap_assert_locked(gpusvm->mm);
-
-	if (!range->flags.migrate_devmem)
-		return -EINVAL;
-
-	if (!ops->populate_devmem_pfn || !ops->copy_to_devmem ||
-	    !ops->copy_to_ram)
-		return -EOPNOTSUPP;
-
-	vas = vma_lookup(mm, start);
-	if (!vas) {
-		err = -ENOENT;
-		goto err_out;
-	}
-
-	if (end > vas->vm_end || start < vas->vm_start) {
-		err = -EINVAL;
-		goto err_out;
-	}
-
-	if (!vma_is_anonymous(vas)) {
-		err = -EBUSY;
-		goto err_out;
-	}
-
-	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
-		       sizeof(*pages), GFP_KERNEL);
-	if (!buf) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
-	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
-
-	zdd = drm_gpusvm_zdd_alloc(gpusvm->device_private_page_owner);
-	if (!zdd) {
-		err = -ENOMEM;
-		goto err_free;
-	}
-
-	migrate.vma = vas;
-	migrate.src = buf;
-	migrate.dst = migrate.src + npages;
-
-	err = migrate_vma_setup(&migrate);
-	if (err)
-		goto err_free;
-
-	if (!migrate.cpages) {
-		err = -EFAULT;
-		goto err_free;
-	}
-
-	if (migrate.cpages != npages) {
-		err = -EBUSY;
-		goto err_finalize;
-	}
-
-	err = ops->populate_devmem_pfn(devmem_allocation, npages, migrate.dst);
-	if (err)
-		goto err_finalize;
-
-	err = drm_gpusvm_migrate_map_pages(devmem_allocation->dev, dma_addr,
-					   migrate.src, npages, DMA_TO_DEVICE);
-	if (err)
-		goto err_finalize;
-
-	for (i = 0; i < npages; ++i) {
-		struct page *page = pfn_to_page(migrate.dst[i]);
-
-		pages[i] = page;
-		migrate.dst[i] = migrate_pfn(migrate.dst[i]);
-		drm_gpusvm_get_devmem_page(page, zdd);
-	}
-
-	err = ops->copy_to_devmem(pages, dma_addr, npages);
-	if (err)
-		goto err_finalize;
-
-	/* Upon success bind devmem allocation to range and zdd */
-	devmem_allocation->timeslice_expiration = get_jiffies_64() +
-		msecs_to_jiffies(ctx->timeslice_ms);
-	zdd->devmem_allocation = devmem_allocation;	/* Owns ref */
-
-err_finalize:
-	if (err)
-		drm_gpusvm_migration_unlock_put_pages(npages, migrate.dst);
-	migrate_vma_pages(&migrate);
-	migrate_vma_finalize(&migrate);
-	drm_gpusvm_migrate_unmap_pages(devmem_allocation->dev, dma_addr, npages,
-				       DMA_TO_DEVICE);
-err_free:
-	if (zdd)
-		drm_gpusvm_zdd_put(zdd);
-	kvfree(buf);
-err_out:
-	return err;
-}
-EXPORT_SYMBOL_GPL(drm_gpusvm_migrate_to_devmem);
-
-/**
- * drm_gpusvm_migrate_populate_ram_pfn() - Populate RAM PFNs for a VM area
- * @vas: Pointer to the VM area structure, can be NULL
- * @fault_page: Fault page
- * @npages: Number of pages to populate
- * @mpages: Number of pages to migrate
- * @src_mpfn: Source array of migrate PFNs
- * @mpfn: Array of migrate PFNs to populate
- * @addr: Start address for PFN allocation
- *
- * This function populates the RAM migrate page frame numbers (PFNs) for the
- * specified VM area structure. It allocates and locks pages in the VM area for
- * RAM usage. If vas is non-NULL use alloc_page_vma for allocation, if NULL use
- * alloc_page for allocation.
- *
- * Return: 0 on success, negative error code on failure.
- */
-static int drm_gpusvm_migrate_populate_ram_pfn(struct vm_area_struct *vas,
-					       struct page *fault_page,
-					       unsigned long npages,
-					       unsigned long *mpages,
-					       unsigned long *src_mpfn,
-					       unsigned long *mpfn,
-					       unsigned long addr)
-{
-	unsigned long i;
-
-	for (i = 0; i < npages; ++i, addr += PAGE_SIZE) {
-		struct page *page, *src_page;
-
-		if (!(src_mpfn[i] & MIGRATE_PFN_MIGRATE))
-			continue;
-
-		src_page = migrate_pfn_to_page(src_mpfn[i]);
-		if (!src_page)
-			continue;
-
-		if (fault_page) {
-			if (src_page->zone_device_data !=
-			    fault_page->zone_device_data)
-				continue;
-		}
-
-		if (vas)
-			page = alloc_page_vma(GFP_HIGHUSER, vas, addr);
-		else
-			page = alloc_page(GFP_HIGHUSER);
-
-		if (!page)
-			goto free_pages;
-
-		mpfn[i] = migrate_pfn(page_to_pfn(page));
-	}
-
-	for (i = 0; i < npages; ++i) {
-		struct page *page = migrate_pfn_to_page(mpfn[i]);
-
-		if (!page)
-			continue;
-
-		WARN_ON_ONCE(!trylock_page(page));
-		++*mpages;
-	}
-
-	return 0;
-
-free_pages:
-	for (i = 0; i < npages; ++i) {
-		struct page *page = migrate_pfn_to_page(mpfn[i]);
-
-		if (!page)
-			continue;
-
-		put_page(page);
-		mpfn[i] = 0;
-	}
-	return -ENOMEM;
-}
-
-/**
- * drm_gpusvm_evict_to_ram() - Evict GPU SVM range to RAM
- * @devmem_allocation: Pointer to the device memory allocation
- *
- * Similar to __drm_gpusvm_migrate_to_ram but does not require mmap lock and
- * migration done via migrate_device_* functions.
- *
- * Return: 0 on success, negative error code on failure.
- */
-int drm_gpusvm_evict_to_ram(struct drm_gpusvm_devmem *devmem_allocation)
-{
-	const struct drm_gpusvm_devmem_ops *ops = devmem_allocation->ops;
-	unsigned long npages, mpages = 0;
-	struct page **pages;
-	unsigned long *src, *dst;
-	dma_addr_t *dma_addr;
-	void *buf;
-	int i, err = 0;
-	unsigned int retry_count = 2;
-
-	npages = devmem_allocation->size >> PAGE_SHIFT;
-
-retry:
-	if (!mmget_not_zero(devmem_allocation->mm))
-		return -EFAULT;
-
-	buf = kvcalloc(npages, 2 * sizeof(*src) + sizeof(*dma_addr) +
-		       sizeof(*pages), GFP_KERNEL);
-	if (!buf) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-	src = buf;
-	dst = buf + (sizeof(*src) * npages);
-	dma_addr = buf + (2 * sizeof(*src) * npages);
-	pages = buf + (2 * sizeof(*src) + sizeof(*dma_addr)) * npages;
-
-	err = ops->populate_devmem_pfn(devmem_allocation, npages, src);
-	if (err)
-		goto err_free;
-
-	err = migrate_device_pfns(src, npages);
-	if (err)
-		goto err_free;
-
-	err = drm_gpusvm_migrate_populate_ram_pfn(NULL, NULL, npages, &mpages,
-						  src, dst, 0);
-	if (err || !mpages)
-		goto err_finalize;
-
-	err = drm_gpusvm_migrate_map_pages(devmem_allocation->dev, dma_addr,
-					   dst, npages, DMA_FROM_DEVICE);
-	if (err)
-		goto err_finalize;
-
-	for (i = 0; i < npages; ++i)
-		pages[i] = migrate_pfn_to_page(src[i]);
-
-	err = ops->copy_to_ram(pages, dma_addr, npages);
-	if (err)
-		goto err_finalize;
-
-err_finalize:
-	if (err)
-		drm_gpusvm_migration_unlock_put_pages(npages, dst);
-	migrate_device_pages(src, dst, npages);
-	migrate_device_finalize(src, dst, npages);
-	drm_gpusvm_migrate_unmap_pages(devmem_allocation->dev, dma_addr, npages,
-				       DMA_FROM_DEVICE);
-err_free:
-	kvfree(buf);
-err_out:
-	mmput_async(devmem_allocation->mm);
-
-	if (completion_done(&devmem_allocation->detached))
-		return 0;
-
-	if (retry_count--) {
-		cond_resched();
-		goto retry;
-	}
-
-	return err ?: -EBUSY;
-}
-EXPORT_SYMBOL_GPL(drm_gpusvm_evict_to_ram);
-
-/**
- * __drm_gpusvm_migrate_to_ram() - Migrate GPU SVM range to RAM (internal)
- * @vas: Pointer to the VM area structure
- * @device_private_page_owner: Device private pages owner
- * @page: Pointer to the page for fault handling (can be NULL)
- * @fault_addr: Fault address
- * @size: Size of migration
- *
- * This internal function performs the migration of the specified GPU SVM range
- * to RAM. It sets up the migration, populates + dma maps RAM PFNs, and
- * invokes the driver-specific operations for migration to RAM.
- *
- * Return: 0 on success, negative error code on failure.
- */
-static int __drm_gpusvm_migrate_to_ram(struct vm_area_struct *vas,
-				       void *device_private_page_owner,
-				       struct page *page,
-				       unsigned long fault_addr,
-				       unsigned long size)
-{
-	struct migrate_vma migrate = {
-		.vma		= vas,
-		.pgmap_owner	= device_private_page_owner,
-		.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE |
-			MIGRATE_VMA_SELECT_DEVICE_COHERENT,
-		.fault_page	= page,
-	};
-	struct drm_gpusvm_zdd *zdd;
-	const struct drm_gpusvm_devmem_ops *ops;
-	struct device *dev = NULL;
-	unsigned long npages, mpages = 0;
-	struct page **pages;
-	dma_addr_t *dma_addr;
-	unsigned long start, end;
-	void *buf;
-	int i, err = 0;
-
-	if (page) {
-		zdd = page->zone_device_data;
-		if (time_before64(get_jiffies_64(),
-				  zdd->devmem_allocation->timeslice_expiration))
-			return 0;
-	}
-
-	start = ALIGN_DOWN(fault_addr, size);
-	end = ALIGN(fault_addr + 1, size);
-
-	/* Corner where VMA area struct has been partially unmapped */
-	if (start < vas->vm_start)
-		start = vas->vm_start;
-	if (end > vas->vm_end)
-		end = vas->vm_end;
-
-	migrate.start = start;
-	migrate.end = end;
-	npages = npages_in_range(start, end);
-
-	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
-		       sizeof(*pages), GFP_KERNEL);
-	if (!buf) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
-	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
-
-	migrate.vma = vas;
-	migrate.src = buf;
-	migrate.dst = migrate.src + npages;
-
-	err = migrate_vma_setup(&migrate);
-	if (err)
-		goto err_free;
-
-	/* Raced with another CPU fault, nothing to do */
-	if (!migrate.cpages)
-		goto err_free;
-
-	if (!page) {
-		for (i = 0; i < npages; ++i) {
-			if (!(migrate.src[i] & MIGRATE_PFN_MIGRATE))
-				continue;
-
-			page = migrate_pfn_to_page(migrate.src[i]);
-			break;
-		}
-
-		if (!page)
-			goto err_finalize;
-	}
-	zdd = page->zone_device_data;
-	ops = zdd->devmem_allocation->ops;
-	dev = zdd->devmem_allocation->dev;
-
-	err = drm_gpusvm_migrate_populate_ram_pfn(vas, page, npages, &mpages,
-						  migrate.src, migrate.dst,
-						  start);
-	if (err)
-		goto err_finalize;
-
-	err = drm_gpusvm_migrate_map_pages(dev, dma_addr, migrate.dst, npages,
-					   DMA_FROM_DEVICE);
-	if (err)
-		goto err_finalize;
-
-	for (i = 0; i < npages; ++i)
-		pages[i] = migrate_pfn_to_page(migrate.src[i]);
-
-	err = ops->copy_to_ram(pages, dma_addr, npages);
-	if (err)
-		goto err_finalize;
-
-err_finalize:
-	if (err)
-		drm_gpusvm_migration_unlock_put_pages(npages, migrate.dst);
-	migrate_vma_pages(&migrate);
-	migrate_vma_finalize(&migrate);
-	if (dev)
-		drm_gpusvm_migrate_unmap_pages(dev, dma_addr, npages,
-					       DMA_FROM_DEVICE);
-err_free:
-	kvfree(buf);
-err_out:
-
-	return err;
-}
-
-/**
- * drm_gpusvm_range_evict - Evict GPU SVM range
  * @range: Pointer to the GPU SVM range to be removed
  *
- * This function evicts the specified GPU SVM range. This function will not
- * evict coherent pages.
+ * This function evicts the specified GPU SVM range.
  *
  * Return: 0 on success, a negative error code on failure.
  */
@@ -2183,60 +1527,6 @@ int drm_gpusvm_range_evict(struct drm_gpusvm *gpusvm,
 }
 EXPORT_SYMBOL_GPL(drm_gpusvm_range_evict);
 
-/**
- * drm_gpusvm_page_free() - Put GPU SVM zone device data associated with a page
- * @page: Pointer to the page
- *
- * This function is a callback used to put the GPU SVM zone device data
- * associated with a page when it is being released.
- */
-static void drm_gpusvm_page_free(struct page *page)
-{
-	drm_gpusvm_zdd_put(page->zone_device_data);
-}
-
-/**
- * drm_gpusvm_migrate_to_ram() - Migrate GPU SVM range to RAM (page fault handler)
- * @vmf: Pointer to the fault information structure
- *
- * This function is a page fault handler used to migrate a GPU SVM range to RAM.
- * It retrieves the GPU SVM range information from the faulting page and invokes
- * the internal migration function to migrate the range back to RAM.
- *
- * Return: VM_FAULT_SIGBUS on failure, 0 on success.
- */
-static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault *vmf)
-{
-	struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
-	int err;
-
-	err = __drm_gpusvm_migrate_to_ram(vmf->vma,
-					  zdd->device_private_page_owner,
-					  vmf->page, vmf->address,
-					  zdd->devmem_allocation->size);
-
-	return err ? VM_FAULT_SIGBUS : 0;
-}
-
-/*
- * drm_gpusvm_pagemap_ops - Device page map operations for GPU SVM
- */
-static const struct dev_pagemap_ops drm_gpusvm_pagemap_ops = {
-	.page_free = drm_gpusvm_page_free,
-	.migrate_to_ram = drm_gpusvm_migrate_to_ram,
-};
-
-/**
- * drm_gpusvm_pagemap_ops_get() - Retrieve GPU SVM device page map operations
- *
- * Return: Pointer to the GPU SVM device page map operations structure.
- */
-const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void)
-{
-	return &drm_gpusvm_pagemap_ops;
-}
-EXPORT_SYMBOL_GPL(drm_gpusvm_pagemap_ops_get);
-
 /**
  * drm_gpusvm_has_mapping() - Check if GPU SVM has mapping for the given address range
  * @gpusvm: Pointer to the GPU SVM structure.
@@ -2281,28 +1571,5 @@ void drm_gpusvm_range_set_unmapped(struct drm_gpusvm_range *range,
 }
 EXPORT_SYMBOL_GPL(drm_gpusvm_range_set_unmapped);
 
-/**
- * drm_gpusvm_devmem_init() - Initialize a GPU SVM device memory allocation
- *
- * @dev: Pointer to the device structure which device memory allocation belongs to
- * @mm: Pointer to the mm_struct for the address space
- * @ops: Pointer to the operations structure for GPU SVM device memory
- * @dpagemap: The struct drm_pagemap we're allocating from.
- * @size: Size of device memory allocation
- */
-void drm_gpusvm_devmem_init(struct drm_gpusvm_devmem *devmem_allocation,
-			    struct device *dev, struct mm_struct *mm,
-			    const struct drm_gpusvm_devmem_ops *ops,
-			    struct drm_pagemap *dpagemap, size_t size)
-{
-	init_completion(&devmem_allocation->detached);
-	devmem_allocation->dev = dev;
-	devmem_allocation->mm = mm;
-	devmem_allocation->ops = ops;
-	devmem_allocation->dpagemap = dpagemap;
-	devmem_allocation->size = size;
-}
-EXPORT_SYMBOL_GPL(drm_gpusvm_devmem_init);
-
 MODULE_DESCRIPTION("DRM GPUSVM");
 MODULE_LICENSE("GPL");
diff --git a/drivers/gpu/drm/drm_pagemap.c b/drivers/gpu/drm/drm_pagemap.c
new file mode 100644
index 000000000000..cef4657b6e8a
--- /dev/null
+++ b/drivers/gpu/drm/drm_pagemap.c
@@ -0,0 +1,794 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+/*
+ * Copyright © 2024-2025 Intel Corporation
+ */
+
+#include <linux/dma-mapping.h>
+#include <linux/migrate.h>
+#include <linux/pagemap.h>
+#include <drm/drm_pagemap.h>
+
+/**
+ * DOC: Overview
+ *
+ * The DRM pagemap layer is intended to augment the dev_pagemap functionality by
+ * providing a way to populate a struct mm_struct virtual range with device
+ * private pages and to provide helpers to abstract device memory allocations,
+ * to migrate memory back and forth between device memory and system RAM and
+ * to handle access (and in the future migration) between devices implementing
+ * a fast interconnect that is not necessarily visible to the rest of the
+ * system.
+ *
+ * Typically the DRM pagemap receives requests from one or more DRM GPU SVM
+ * instances to populate struct mm_struct virtual ranges with memory.
+ */
+
+/**
+ * DOC: Migration
+ *
+ * The migration support is quite simple, allowing migration between RAM and
+ * device memory at the range granularity. For example, GPU SVM currently does
+ * not support mixing RAM and device memory pages within a range. This means
+ * that upon GPU fault, the entire range can be migrated to device memory, and
+ * upon CPU fault, the entire range is migrated to RAM. Mixed RAM and device
+ * memory storage within a range could be added in the future if required.
+ *
+ * The reasoning for only supporting range granularity is as follows: it
+ * simplifies the implementation, and range sizes are driver-defined and should
+ * be relatively small.
+ *
+ *
+ * Key DRM pagemap components:
+ *
+ * - Device Memory Allocations:
+ *      Embedded structure containing enough information for the drm_pagemap to
+ *      migrate to / from device memory.
+ *
+ * - Device Memory Operations:
+ *      Define the interface for driver-specific device memory operations:
+ *      release memory, populate pfns, and copy to / from device memory.
+ */
+
+/**
+ * struct drm_pagemap_zdd - GPU SVM zone device data
+ *
+ * @refcount: Reference count for the zdd
+ * @devmem_allocation: device memory allocation
+ * @device_private_page_owner: Device private pages owner
+ *
+ * This structure serves as a generic wrapper installed in
+ * page->zone_device_data. It provides infrastructure for looking up a device
+ * memory allocation upon CPU page fault and asynchronously releasing device
+ * memory once the CPU has no page references. Asynchronous release is useful
+ * because CPU page references can be dropped in IRQ contexts, while releasing
+ * device memory likely requires sleeping locks.
+ */
+struct drm_pagemap_zdd {
+	struct kref refcount;
+	struct drm_pagemap_devmem *devmem_allocation;
+	void *device_private_page_owner;
+};
+
+/**
+ * drm_pagemap_zdd_alloc() - Allocate a zdd structure.
+ * @device_private_page_owner: Device private pages owner
+ *
+ * This function allocates and initializes a new zdd structure. It sets up the
+ * reference count and records the device private page owner.
+ *
+ * Return: Pointer to the allocated zdd on success, NULL on failure.
+ */
+static struct drm_pagemap_zdd *
+drm_pagemap_zdd_alloc(void *device_private_page_owner)
+{
+	struct drm_pagemap_zdd *zdd;
+
+	zdd = kmalloc(sizeof(*zdd), GFP_KERNEL);
+	if (!zdd)
+		return NULL;
+
+	kref_init(&zdd->refcount);
+	zdd->devmem_allocation = NULL;
+	zdd->device_private_page_owner = device_private_page_owner;
+
+	return zdd;
+}
+
+/**
+ * drm_pagemap_zdd_get() - Get a reference to a zdd structure.
+ * @zdd: Pointer to the zdd structure.
+ *
+ * This function increments the reference count of the provided zdd structure.
+ *
+ * Return: Pointer to the zdd structure.
+ */
+static struct drm_pagemap_zdd *drm_pagemap_zdd_get(struct drm_pagemap_zdd *zdd)
+{
+	kref_get(&zdd->refcount);
+	return zdd;
+}
+
+/**
+ * drm_pagemap_zdd_destroy() - Destroy a zdd structure.
+ * @ref: Pointer to the reference count structure.
+ *
+ * Signals and releases the bound devmem allocation, if any, then frees the zdd.
+ */
+static void drm_pagemap_zdd_destroy(struct kref *ref)
+{
+	struct drm_pagemap_zdd *zdd =
+		container_of(ref, struct drm_pagemap_zdd, refcount);
+	struct drm_pagemap_devmem *devmem = zdd->devmem_allocation;
+
+	if (devmem) {
+		complete_all(&devmem->detached);
+		if (devmem->ops->devmem_release)
+			devmem->ops->devmem_release(devmem);
+	}
+	kfree(zdd);
+}
+
+/**
+ * drm_pagemap_zdd_put() - Put a zdd reference.
+ * @zdd: Pointer to the zdd structure.
+ *
+ * This function decrements the reference count of the provided zdd structure
+ * and destroys it if the count drops to zero.
+ */
+static void drm_pagemap_zdd_put(struct drm_pagemap_zdd *zdd)
+{
+	kref_put(&zdd->refcount, drm_pagemap_zdd_destroy);
+}
+
+/**
+ * drm_pagemap_migration_unlock_put_page() - Put a migration page
+ * @page: Pointer to the page to put
+ *
+ * This function unlocks and puts a page.
+ */
+static void drm_pagemap_migration_unlock_put_page(struct page *page)
+{
+	unlock_page(page);
+	put_page(page);
+}
+
+/**
+ * drm_pagemap_migration_unlock_put_pages() - Put migration pages
+ * @npages: Number of pages
+ * @migrate_pfn: Array of migrate page frame numbers
+ *
+ * This function unlocks and puts an array of pages.
+ */
+static void drm_pagemap_migration_unlock_put_pages(unsigned long npages,
+						   unsigned long *migrate_pfn)
+{
+	unsigned long i;
+
+	for (i = 0; i < npages; ++i) {
+		struct page *page;
+
+		if (!migrate_pfn[i])
+			continue;
+
+		page = migrate_pfn_to_page(migrate_pfn[i]);
+		drm_pagemap_migration_unlock_put_page(page);
+		migrate_pfn[i] = 0;
+	}
+}
+
+/**
+ * drm_pagemap_get_devmem_page() - Get a reference to a device memory page
+ * @page: Pointer to the page
+ * @zdd: Pointer to the GPU SVM zone device data
+ *
+ * This function associates the given page with the specified GPU SVM zone
+ * device data and initializes it for zone device usage.
+ */
+static void drm_pagemap_get_devmem_page(struct page *page,
+					struct drm_pagemap_zdd *zdd)
+{
+	page->zone_device_data = drm_pagemap_zdd_get(zdd);
+	zone_device_page_init(page);
+}
+
+/**
+ * drm_pagemap_migrate_map_pages() - Map migration pages for GPU SVM migration
+ * @dev: The device for which the pages are being mapped
+ * @dma_addr: Array to store DMA addresses corresponding to mapped pages
+ * @migrate_pfn: Array of migrate page frame numbers to map
+ * @npages: Number of pages to map
+ * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
+ *
+ * This function maps pages of memory for migration usage in GPU SVM. It
+ * iterates over each page frame number provided in @migrate_pfn, maps the
+ * corresponding page, and stores the DMA address in the provided @dma_addr
+ * array.
+ *
+ * Returns: 0 on success, -EFAULT if an error occurs during mapping.
+ */
+static int drm_pagemap_migrate_map_pages(struct device *dev,
+					 dma_addr_t *dma_addr,
+					 unsigned long *migrate_pfn,
+					 unsigned long npages,
+					 enum dma_data_direction dir)
+{
+	unsigned long i;
+
+	for (i = 0; i < npages; ++i) {
+		struct page *page = migrate_pfn_to_page(migrate_pfn[i]);
+
+		if (!page)
+			continue;
+
+		if (WARN_ON_ONCE(is_zone_device_page(page)))
+			return -EFAULT;
+
+		dma_addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir);
+		if (dma_mapping_error(dev, dma_addr[i]))
+			return -EFAULT;
+	}
+
+	return 0;
+}
+
+/**
+ * drm_pagemap_migrate_unmap_pages() - Unmap pages previously mapped for GPU SVM migration
+ * @dev: The device for which the pages were mapped
+ * @dma_addr: Array of DMA addresses corresponding to mapped pages
+ * @npages: Number of pages to unmap
+ * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
+ *
+ * This function unmaps previously mapped pages of memory for GPU Shared Virtual
+ * Memory (SVM). It iterates over each DMA address provided in @dma_addr, checks
+ * if it's valid and not already unmapped, and unmaps the corresponding page.
+ */
+static void drm_pagemap_migrate_unmap_pages(struct device *dev,
+					    dma_addr_t *dma_addr,
+					    unsigned long npages,
+					    enum dma_data_direction dir)
+{
+	unsigned long i;
+
+	for (i = 0; i < npages; ++i) {
+		if (!dma_addr[i] || dma_mapping_error(dev, dma_addr[i]))
+			continue;
+
+		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
+	}
+}
+
+static unsigned long
+npages_in_range(unsigned long start, unsigned long end)
+{
+	return (end - start) >> PAGE_SHIFT;
+}
+
+/**
+ * drm_pagemap_migrate_to_devmem() - Migrate a struct mm_struct range to device memory
+ * @devmem_allocation: The device memory allocation to migrate to.
+ * The caller should hold a reference to the device memory allocation,
+ * and the reference is consumed by this function unless it returns with
+ * an error.
+ * @mm: Pointer to the struct mm_struct.
+ * @start: Start of the virtual address range to migrate.
+ * @end: End of the virtual address range to migrate.
+ * @timeslice_ms: The time requested for the migrated pagemap pages to
+ * be present in @mm before being allowed to be migrated back.
+ * @pgmap_owner: Not used currently, since only system memory is considered.
+ *
+ * This function migrates the specified virtual address range to device memory.
+ * It performs the necessary setup and invokes the driver-specific operations for
+ * migration to device memory. Expected to be called while holding the mmap lock in
+ * at least read mode.
+ *
+ * Note: The @timeslice_ms parameter can typically be used to force data to
+ * remain in pagemap pages long enough for a GPU to perform a task and to prevent
+ * a migration livelock. One alternative would be for the GPU driver to block
+ * in a mmu_notifier for the specified amount of time, but adding the
+ * functionality to the pagemap is likely nicer to the system as a whole.
+ *
+ * Return: %0 on success, negative error code on failure.
+ */
+int drm_pagemap_migrate_to_devmem(struct drm_pagemap_devmem *devmem_allocation,
+				  struct mm_struct *mm,
+				  unsigned long start, unsigned long end,
+				  unsigned long timeslice_ms,
+				  void *pgmap_owner)
+{
+	const struct drm_pagemap_devmem_ops *ops = devmem_allocation->ops;
+	struct migrate_vma migrate = {
+		.start		= start,
+		.end		= end,
+		.pgmap_owner	= pgmap_owner,
+		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
+	};
+	unsigned long i, npages = npages_in_range(start, end);
+	struct vm_area_struct *vas;
+	struct drm_pagemap_zdd *zdd = NULL;
+	struct page **pages;
+	dma_addr_t *dma_addr;
+	void *buf;
+	int err;
+
+	mmap_assert_locked(mm);
+
+	if (!ops->populate_devmem_pfn || !ops->copy_to_devmem ||
+	    !ops->copy_to_ram)
+		return -EOPNOTSUPP;
+
+	vas = vma_lookup(mm, start);
+	if (!vas) {
+		err = -ENOENT;
+		goto err_out;
+	}
+
+	if (end > vas->vm_end || start < vas->vm_start) {
+		err = -EINVAL;
+		goto err_out;
+	}
+
+	if (!vma_is_anonymous(vas)) {
+		err = -EBUSY;
+		goto err_out;
+	}
+
+	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
+		       sizeof(*pages), GFP_KERNEL);
+	if (!buf) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
+	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
+
+	zdd = drm_pagemap_zdd_alloc(pgmap_owner);
+	if (!zdd) {
+		err = -ENOMEM;
+		goto err_free;
+	}
+
+	migrate.vma = vas;
+	migrate.src = buf;
+	migrate.dst = migrate.src + npages;
+
+	err = migrate_vma_setup(&migrate);
+	if (err)
+		goto err_free;
+
+	if (!migrate.cpages) {
+		err = -EFAULT;
+		goto err_free;
+	}
+
+	if (migrate.cpages != npages) {
+		err = -EBUSY;
+		goto err_finalize;
+	}
+
+	err = ops->populate_devmem_pfn(devmem_allocation, npages, migrate.dst);
+	if (err)
+		goto err_finalize;
+
+	err = drm_pagemap_migrate_map_pages(devmem_allocation->dev, dma_addr,
+					    migrate.src, npages, DMA_TO_DEVICE);
+	if (err)
+		goto err_finalize;
+
+	for (i = 0; i < npages; ++i) {
+		struct page *page = pfn_to_page(migrate.dst[i]);
+
+		pages[i] = page;
+		migrate.dst[i] = migrate_pfn(migrate.dst[i]);
+		drm_pagemap_get_devmem_page(page, zdd);
+	}
+
+	err = ops->copy_to_devmem(pages, dma_addr, npages);
+	if (err)
+		goto err_finalize;
+
+	/* Upon success bind the devmem allocation to the zdd */
+	devmem_allocation->timeslice_expiration = get_jiffies_64() +
+		msecs_to_jiffies(timeslice_ms);
+	zdd->devmem_allocation = devmem_allocation;	/* Owns ref */
+
+err_finalize:
+	if (err)
+		drm_pagemap_migration_unlock_put_pages(npages, migrate.dst);
+	migrate_vma_pages(&migrate);
+	migrate_vma_finalize(&migrate);
+	drm_pagemap_migrate_unmap_pages(devmem_allocation->dev, dma_addr, npages,
+					DMA_TO_DEVICE);
+err_free:
+	if (zdd)
+		drm_pagemap_zdd_put(zdd);
+	kvfree(buf);
+err_out:
+	return err;
+}
+EXPORT_SYMBOL_GPL(drm_pagemap_migrate_to_devmem);
+
+/**
+ * drm_pagemap_migrate_populate_ram_pfn() - Populate RAM PFNs for a VM area
+ * @vas: Pointer to the VM area structure, can be NULL
+ * @fault_page: Fault page
+ * @npages: Number of pages to populate
+ * @mpages: Incremented once per RAM page allocated and locked for migration
+ * @src_mpfn: Source array of migrate PFNs
+ * @mpfn: Array of migrate PFNs to populate
+ * @addr: Start address for PFN allocation
+ *
+ * This function populates the RAM migrate page frame numbers (PFNs) for the
+ * specified VM area structure. It allocates and locks pages in the VM area for
+ * RAM usage. If vas is non-NULL use alloc_page_vma for allocation, if NULL use
+ * alloc_page for allocation.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+static int drm_pagemap_migrate_populate_ram_pfn(struct vm_area_struct *vas,
+						struct page *fault_page,
+						unsigned long npages,
+						unsigned long *mpages,
+						unsigned long *src_mpfn,
+						unsigned long *mpfn,
+						unsigned long addr)
+{
+	unsigned long i;
+
+	for (i = 0; i < npages; ++i, addr += PAGE_SIZE) {
+		struct page *page, *src_page;
+
+		if (!(src_mpfn[i] & MIGRATE_PFN_MIGRATE))
+			continue;
+
+		src_page = migrate_pfn_to_page(src_mpfn[i]);
+		if (!src_page)
+			continue;
+
+		if (fault_page) {
+			if (src_page->zone_device_data !=
+			    fault_page->zone_device_data)
+				continue;
+		}
+
+		if (vas)
+			page = alloc_page_vma(GFP_HIGHUSER, vas, addr);
+		else
+			page = alloc_page(GFP_HIGHUSER);
+
+		if (!page)
+			goto free_pages;
+
+		mpfn[i] = migrate_pfn(page_to_pfn(page));
+	}
+
+	for (i = 0; i < npages; ++i) {
+		struct page *page = migrate_pfn_to_page(mpfn[i]);
+
+		if (!page)
+			continue;
+
+		WARN_ON_ONCE(!trylock_page(page));
+		++*mpages;
+	}
+
+	return 0;
+
+free_pages:
+	for (i = 0; i < npages; ++i) {
+		struct page *page = migrate_pfn_to_page(mpfn[i]);
+
+		if (!page)
+			continue;
+
+		put_page(page);
+		mpfn[i] = 0;
+	}
+	return -ENOMEM;
+}
+
+/**
+ * drm_pagemap_evict_to_ram() - Evict GPU SVM range to RAM
+ * @devmem_allocation: Pointer to the device memory allocation
+ *
+ * Similar to __drm_pagemap_migrate_to_ram() but does not require the mmap lock;
+ * migration is done via the migrate_device_* functions.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int drm_pagemap_evict_to_ram(struct drm_pagemap_devmem *devmem_allocation)
+{
+	const struct drm_pagemap_devmem_ops *ops = devmem_allocation->ops;
+	unsigned long npages, mpages = 0;
+	struct page **pages;
+	unsigned long *src, *dst;
+	dma_addr_t *dma_addr;
+	void *buf;
+	int i, err = 0;
+	unsigned int retry_count = 2;
+
+	npages = devmem_allocation->size >> PAGE_SHIFT;
+
+retry:
+	if (!mmget_not_zero(devmem_allocation->mm))
+		return -EFAULT;
+
+	buf = kvcalloc(npages, 2 * sizeof(*src) + sizeof(*dma_addr) +
+		       sizeof(*pages), GFP_KERNEL);
+	if (!buf) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	src = buf;
+	dst = buf + (sizeof(*src) * npages);
+	dma_addr = buf + (2 * sizeof(*src) * npages);
+	pages = buf + (2 * sizeof(*src) + sizeof(*dma_addr)) * npages;
+
+	err = ops->populate_devmem_pfn(devmem_allocation, npages, src);
+	if (err)
+		goto err_free;
+
+	err = migrate_device_pfns(src, npages);
+	if (err)
+		goto err_free;
+
+	err = drm_pagemap_migrate_populate_ram_pfn(NULL, NULL, npages, &mpages,
+						   src, dst, 0);
+	if (err || !mpages)
+		goto err_finalize;
+
+	err = drm_pagemap_migrate_map_pages(devmem_allocation->dev, dma_addr,
+					    dst, npages, DMA_FROM_DEVICE);
+	if (err)
+		goto err_finalize;
+
+	for (i = 0; i < npages; ++i)
+		pages[i] = migrate_pfn_to_page(src[i]);
+
+	err = ops->copy_to_ram(pages, dma_addr, npages);
+	if (err)
+		goto err_finalize;
+
+err_finalize:
+	if (err)
+		drm_pagemap_migration_unlock_put_pages(npages, dst);
+	migrate_device_pages(src, dst, npages);
+	migrate_device_finalize(src, dst, npages);
+	drm_pagemap_migrate_unmap_pages(devmem_allocation->dev, dma_addr, npages,
+					DMA_FROM_DEVICE);
+err_free:
+	kvfree(buf);
+err_out:
+	mmput_async(devmem_allocation->mm);
+
+	if (completion_done(&devmem_allocation->detached))
+		return 0;
+
+	if (retry_count--) {
+		cond_resched();
+		goto retry;
+	}
+
+	return err ?: -EBUSY;
+}
+EXPORT_SYMBOL_GPL(drm_pagemap_evict_to_ram);
+
+/**
+ * __drm_pagemap_migrate_to_ram() - Migrate GPU SVM range to RAM (internal)
+ * @vas: Pointer to the VM area structure
+ * @device_private_page_owner: Device private pages owner
+ * @page: Pointer to the page for fault handling (can be NULL)
+ * @fault_addr: Fault address
+ * @size: Size of migration
+ *
+ * This internal function performs the migration of the specified GPU SVM range
+ * to RAM. It sets up the migration, populates + dma maps RAM PFNs, and
+ * invokes the driver-specific operations for migration to RAM.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+static int __drm_pagemap_migrate_to_ram(struct vm_area_struct *vas,
+					void *device_private_page_owner,
+					struct page *page,
+					unsigned long fault_addr,
+					unsigned long size)
+{
+	struct migrate_vma migrate = {
+		.vma		= vas,
+		.pgmap_owner	= device_private_page_owner,
+		.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE |
+		MIGRATE_VMA_SELECT_DEVICE_COHERENT,
+		.fault_page	= page,
+	};
+	struct drm_pagemap_zdd *zdd;
+	const struct drm_pagemap_devmem_ops *ops;
+	struct device *dev = NULL;
+	unsigned long npages, mpages = 0;
+	struct page **pages;
+	dma_addr_t *dma_addr;
+	unsigned long start, end;
+	void *buf;
+	int i, err = 0;
+
+	if (page) {
+		zdd = page->zone_device_data;
+		if (time_before64(get_jiffies_64(),
+				  zdd->devmem_allocation->timeslice_expiration))
+			return 0;
+	}
+
+	start = ALIGN_DOWN(fault_addr, size);
+	end = ALIGN(fault_addr + 1, size);
+
+	/* Corner case where the VMA has been partially unmapped */
+	if (start < vas->vm_start)
+		start = vas->vm_start;
+	if (end > vas->vm_end)
+		end = vas->vm_end;
+
+	migrate.start = start;
+	migrate.end = end;
+	npages = npages_in_range(start, end);
+
+	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
+		       sizeof(*pages), GFP_KERNEL);
+	if (!buf) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
+	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
+
+	migrate.vma = vas;
+	migrate.src = buf;
+	migrate.dst = migrate.src + npages;
+
+	err = migrate_vma_setup(&migrate);
+	if (err)
+		goto err_free;
+
+	/* Raced with another CPU fault, nothing to do */
+	if (!migrate.cpages)
+		goto err_free;
+
+	if (!page) {
+		for (i = 0; i < npages; ++i) {
+			if (!(migrate.src[i] & MIGRATE_PFN_MIGRATE))
+				continue;
+
+			page = migrate_pfn_to_page(migrate.src[i]);
+			break;
+		}
+
+		if (!page)
+			goto err_finalize;
+	}
+	zdd = page->zone_device_data;
+	ops = zdd->devmem_allocation->ops;
+	dev = zdd->devmem_allocation->dev;
+
+	err = drm_pagemap_migrate_populate_ram_pfn(vas, page, npages, &mpages,
+						   migrate.src, migrate.dst,
+						   start);
+	if (err)
+		goto err_finalize;
+
+	err = drm_pagemap_migrate_map_pages(dev, dma_addr, migrate.dst, npages,
+					    DMA_FROM_DEVICE);
+	if (err)
+		goto err_finalize;
+
+	for (i = 0; i < npages; ++i)
+		pages[i] = migrate_pfn_to_page(migrate.src[i]);
+
+	err = ops->copy_to_ram(pages, dma_addr, npages);
+	if (err)
+		goto err_finalize;
+
+err_finalize:
+	if (err)
+		drm_pagemap_migration_unlock_put_pages(npages, migrate.dst);
+	migrate_vma_pages(&migrate);
+	migrate_vma_finalize(&migrate);
+	if (dev)
+		drm_pagemap_migrate_unmap_pages(dev, dma_addr, npages,
+						DMA_FROM_DEVICE);
+err_free:
+	kvfree(buf);
+err_out:
+
+	return err;
+}
+
+/**
+ * drm_pagemap_page_free() - Put GPU SVM zone device data associated with a page
+ * @page: Pointer to the page
+ *
+ * This function is a callback used to put the GPU SVM zone device data
+ * associated with a page when it is being released.
+ */
+static void drm_pagemap_page_free(struct page *page)
+{
+	drm_pagemap_zdd_put(page->zone_device_data);
+}
+
+/**
+ * drm_pagemap_migrate_to_ram() - Migrate a virtual range to RAM (page fault handler)
+ * @vmf: Pointer to the fault information structure
+ *
+ * This function is a page fault handler used to migrate a virtual range
+ * to ram. The device memory allocation in which the device page is found is
+ * migrated in its entirety.
+ *
+ * Returns:
+ * VM_FAULT_SIGBUS on failure, 0 on success.
+ */
+static vm_fault_t drm_pagemap_migrate_to_ram(struct vm_fault *vmf)
+{
+	struct drm_pagemap_zdd *zdd = vmf->page->zone_device_data;
+	int err;
+
+	err = __drm_pagemap_migrate_to_ram(vmf->vma,
+					   zdd->device_private_page_owner,
+					   vmf->page, vmf->address,
+					   zdd->devmem_allocation->size);
+
+	return err ? VM_FAULT_SIGBUS : 0;
+}
+
+static const struct dev_pagemap_ops drm_pagemap_pagemap_ops = {
+	.page_free = drm_pagemap_page_free,
+	.migrate_to_ram = drm_pagemap_migrate_to_ram,
+};
+
+/**
+ * drm_pagemap_pagemap_ops_get() - Retrieve GPU SVM device page map operations
+ *
+ * Returns:
+ * Pointer to the GPU SVM device page map operations structure.
+ */
+const struct dev_pagemap_ops *drm_pagemap_pagemap_ops_get(void)
+{
+	return &drm_pagemap_pagemap_ops;
+}
+EXPORT_SYMBOL_GPL(drm_pagemap_pagemap_ops_get);
+
+/**
+ * drm_pagemap_devmem_init() - Initialize a drm_pagemap device memory allocation
+ *
+ * @devmem_allocation: The struct drm_pagemap_devmem to initialize.
+ * @dev: Pointer to the device structure which device memory allocation belongs to
+ * @mm: Pointer to the mm_struct for the address space
+ * @ops: Pointer to the operations structure for GPU SVM device memory
+ * @dpagemap: The struct drm_pagemap we're allocating from.
+ * @size: Size of device memory allocation
+ */
+void drm_pagemap_devmem_init(struct drm_pagemap_devmem *devmem_allocation,
+			     struct device *dev, struct mm_struct *mm,
+			     const struct drm_pagemap_devmem_ops *ops,
+			     struct drm_pagemap *dpagemap, size_t size)
+{
+	init_completion(&devmem_allocation->detached);
+	devmem_allocation->dev = dev;
+	devmem_allocation->mm = mm;
+	devmem_allocation->ops = ops;
+	devmem_allocation->dpagemap = dpagemap;
+	devmem_allocation->size = size;
+}
+EXPORT_SYMBOL_GPL(drm_pagemap_devmem_init);
+
+/**
+ * drm_pagemap_page_to_dpagemap() - Return a pointer to the drm_pagemap of a page
+ * @page: The struct page.
+ *
+ * Return: A pointer to the struct drm_pagemap of a device private page that
+ * was populated from the struct drm_pagemap. If the page was *not* populated
+ * from a struct drm_pagemap, the result is undefined and the function call
+ * may result in dereferencing an invalid address.
+ */
+struct drm_pagemap *drm_pagemap_page_to_dpagemap(struct page *page)
+{
+	struct drm_pagemap_zdd *zdd = page->zone_device_data;
+
+	return zdd->devmem_allocation->dpagemap;
+}
+EXPORT_SYMBOL_GPL(drm_pagemap_page_to_dpagemap);
diff --git a/drivers/gpu/drm/xe/Kconfig b/drivers/gpu/drm/xe/Kconfig
index 30ed74ad29ab..553c29e1030b 100644
--- a/drivers/gpu/drm/xe/Kconfig
+++ b/drivers/gpu/drm/xe/Kconfig
@@ -87,14 +87,16 @@ config DRM_XE_GPUSVM
 
 	  If in doubut say "Y".
 
-config DRM_XE_DEVMEM_MIRROR
-	bool "Enable device memory mirror"
+config DRM_XE_PAGEMAP
+	bool "Enable device memory pool for SVM"
 	depends on DRM_XE_GPUSVM
 	select GET_FREE_REGION
 	default y
 	help
-	  Disable this option only if you want to compile out without device
-	  memory mirror. Will reduce KMD memory footprint when disabled.
+	  Disable this option only if you don't want to expose local device
+	  memory for SVM. Will reduce KMD memory footprint when disabled.
+
+	  If in doubt say "Y".
 
 config DRM_XE_FORCE_PROBE
 	string "Force probe xe for selected Intel hardware IDs"
diff --git a/drivers/gpu/drm/xe/xe_bo_types.h b/drivers/gpu/drm/xe/xe_bo_types.h
index eb5e83c5f233..e0efaf23d051 100644
--- a/drivers/gpu/drm/xe/xe_bo_types.h
+++ b/drivers/gpu/drm/xe/xe_bo_types.h
@@ -86,7 +86,7 @@ struct xe_bo {
 	u16 cpu_caching;
 
 	/** @devmem_allocation: SVM device memory allocation */
-	struct drm_gpusvm_devmem devmem_allocation;
+	struct drm_pagemap_devmem devmem_allocation;
 
 	/** @vram_userfault_link: Link into @mem_access.vram_userfault.list */
 		struct list_head vram_userfault_link;
diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
index 6aca4b1a2824..7e4f6d846af6 100644
--- a/drivers/gpu/drm/xe/xe_device_types.h
+++ b/drivers/gpu/drm/xe/xe_device_types.h
@@ -106,7 +106,7 @@ struct xe_vram_region {
 	void __iomem *mapping;
 	/** @ttm: VRAM TTM manager */
 	struct xe_ttm_vram_mgr ttm;
-#if IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR)
+#if IS_ENABLED(CONFIG_DRM_XE_PAGEMAP)
 	/** @pagemap: Used to remap device memory as ZONE_DEVICE */
 	struct dev_pagemap pagemap;
 	/**
diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c
index 26418e9bdff0..a4bb219b2407 100644
--- a/drivers/gpu/drm/xe/xe_svm.c
+++ b/drivers/gpu/drm/xe/xe_svm.c
@@ -295,7 +295,7 @@ static void xe_svm_garbage_collector_work_func(struct work_struct *w)
 	up_write(&vm->lock);
 }
 
-#if IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR)
+#if IS_ENABLED(CONFIG_DRM_XE_PAGEMAP)
 
 static struct xe_vram_region *page_to_vr(struct page *page)
 {
@@ -483,12 +483,12 @@ static int xe_svm_copy_to_ram(struct page **pages, dma_addr_t *dma_addr,
 	return xe_svm_copy(pages, dma_addr, npages, XE_SVM_COPY_TO_SRAM);
 }
 
-static struct xe_bo *to_xe_bo(struct drm_gpusvm_devmem *devmem_allocation)
+static struct xe_bo *to_xe_bo(struct drm_pagemap_devmem *devmem_allocation)
 {
 	return container_of(devmem_allocation, struct xe_bo, devmem_allocation);
 }
 
-static void xe_svm_devmem_release(struct drm_gpusvm_devmem *devmem_allocation)
+static void xe_svm_devmem_release(struct drm_pagemap_devmem *devmem_allocation)
 {
 	struct xe_bo *bo = to_xe_bo(devmem_allocation);
 
@@ -505,7 +505,7 @@ static struct drm_buddy *tile_to_buddy(struct xe_tile *tile)
 	return &tile->mem.vram.ttm.mm;
 }
 
-static int xe_svm_populate_devmem_pfn(struct drm_gpusvm_devmem *devmem_allocation,
+static int xe_svm_populate_devmem_pfn(struct drm_pagemap_devmem *devmem_allocation,
 				      unsigned long npages, unsigned long *pfn)
 {
 	struct xe_bo *bo = to_xe_bo(devmem_allocation);
@@ -528,7 +528,7 @@ static int xe_svm_populate_devmem_pfn(struct drm_gpusvm_devmem *devmem_allocatio
 	return 0;
 }
 
-static const struct drm_gpusvm_devmem_ops gpusvm_devmem_ops = {
+static const struct drm_pagemap_devmem_ops dpagemap_devmem_ops = {
 	.devmem_release = xe_svm_devmem_release,
 	.populate_devmem_pfn = xe_svm_populate_devmem_pfn,
 	.copy_to_devmem = xe_svm_copy_to_devmem,
@@ -676,7 +676,7 @@ u64 xe_svm_find_vma_start(struct xe_vm *vm, u64 start, u64 end, struct xe_vma *v
 					 min(end, xe_vma_end(vma)));
 }
 
-#if IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR)
+#if IS_ENABLED(CONFIG_DRM_XE_PAGEMAP)
 static struct xe_vram_region *tile_to_vr(struct xe_tile *tile)
 {
 	return &tile->mem.vram;
@@ -704,6 +704,9 @@ int xe_svm_alloc_vram(struct xe_vm *vm, struct xe_tile *tile,
 	ktime_t end = 0;
 	int err;
 
+	if (!range->base.flags.migrate_devmem)
+		return -EINVAL;
+
 	range_debug(range, "ALLOCATE VRAM");
 
 	if (!mmget_not_zero(mm))
@@ -723,19 +726,23 @@ retry:
 		goto unlock;
 	}
 
-	drm_gpusvm_devmem_init(&bo->devmem_allocation,
-			       vm->xe->drm.dev, mm,
-			       &gpusvm_devmem_ops,
-			       &tile->mem.vram.dpagemap,
-			       xe_svm_range_size(range));
+	drm_pagemap_devmem_init(&bo->devmem_allocation,
+				vm->xe->drm.dev, mm,
+				&dpagemap_devmem_ops,
+				&tile->mem.vram.dpagemap,
+				xe_svm_range_size(range));
 
 	blocks = &to_xe_ttm_vram_mgr_resource(bo->ttm.resource)->blocks;
 	list_for_each_entry(block, blocks, link)
 		block->private = vr;
 
 	xe_bo_get(bo);
-	err = drm_gpusvm_migrate_to_devmem(&vm->svm.gpusvm, &range->base,
-					   &bo->devmem_allocation, ctx);
+	err = drm_pagemap_migrate_to_devmem(&bo->devmem_allocation,
+					    mm,
+					    xe_svm_range_start(range),
+					    xe_svm_range_end(range),
+					    ctx->timeslice_ms,
+					    xe_svm_devm_owner(vm->xe));
 	if (err)
 		xe_svm_devmem_release(&bo->devmem_allocation);
 
@@ -810,13 +817,13 @@ int xe_svm_handle_pagefault(struct xe_vm *vm, struct xe_vma *vma,
 	struct drm_gpusvm_ctx ctx = {
 		.read_only = xe_vma_read_only(vma),
 		.devmem_possible = IS_DGFX(vm->xe) &&
-			IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR),
+			IS_ENABLED(CONFIG_DRM_XE_PAGEMAP),
 		.check_pages_threshold = IS_DGFX(vm->xe) &&
-			IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR) ? SZ_64K : 0,
+			IS_ENABLED(CONFIG_DRM_XE_PAGEMAP) ? SZ_64K : 0,
 		.devmem_only = atomic && IS_DGFX(vm->xe) &&
-			IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR),
+			IS_ENABLED(CONFIG_DRM_XE_PAGEMAP),
 		.timeslice_ms = atomic && IS_DGFX(vm->xe) &&
-			IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR) ?
+			IS_ENABLED(CONFIG_DRM_XE_PAGEMAP) ?
 			vm->xe->atomic_svm_timeslice_ms : 0,
 	};
 	struct xe_svm_range *range;
@@ -944,7 +951,7 @@ bool xe_svm_has_mapping(struct xe_vm *vm, u64 start, u64 end)
  */
 int xe_svm_bo_evict(struct xe_bo *bo)
 {
-	return drm_gpusvm_evict_to_ram(&bo->devmem_allocation);
+	return drm_pagemap_evict_to_ram(&bo->devmem_allocation);
 }
 
 /**
@@ -997,7 +1004,7 @@ int xe_svm_range_get_pages(struct xe_vm *vm, struct xe_svm_range *range,
 	return err;
 }
 
-#if IS_ENABLED(CONFIG_DRM_XE_DEVMEM_MIRROR)
+#if IS_ENABLED(CONFIG_DRM_XE_PAGEMAP)
 
 static struct drm_pagemap_device_addr
 xe_drm_pagemap_device_map(struct drm_pagemap *dpagemap,
@@ -1054,7 +1061,7 @@ int xe_devm_add(struct xe_tile *tile, struct xe_vram_region *vr)
 	vr->pagemap.range.start = res->start;
 	vr->pagemap.range.end = res->end;
 	vr->pagemap.nr_range = 1;
-	vr->pagemap.ops = drm_gpusvm_pagemap_ops_get();
+	vr->pagemap.ops = drm_pagemap_pagemap_ops_get();
 	vr->pagemap.owner = xe_svm_devm_owner(xe);
 	addr = devm_memremap_pages(dev, &vr->pagemap);
 
diff --git a/include/drm/drm_gpusvm.h b/include/drm/drm_gpusvm.h
index 6a5156476bf4..4aedc5423aff 100644
--- a/include/drm/drm_gpusvm.h
+++ b/include/drm/drm_gpusvm.h
@@ -16,91 +16,9 @@ struct drm_gpusvm;
 struct drm_gpusvm_notifier;
 struct drm_gpusvm_ops;
 struct drm_gpusvm_range;
-struct drm_gpusvm_devmem;
 struct drm_pagemap;
 struct drm_pagemap_device_addr;
 
-/**
- * struct drm_gpusvm_devmem_ops - Operations structure for GPU SVM device memory
- *
- * This structure defines the operations for GPU Shared Virtual Memory (SVM)
- * device memory. These operations are provided by the GPU driver to manage device memory
- * allocations and perform operations such as migration between device memory and system
- * RAM.
- */
-struct drm_gpusvm_devmem_ops {
-	/**
-	 * @devmem_release: Release device memory allocation (optional)
-	 * @devmem_allocation: device memory allocation
-	 *
-	 * Release device memory allocation and drop a reference to device
-	 * memory allocation.
-	 */
-	void (*devmem_release)(struct drm_gpusvm_devmem *devmem_allocation);
-
-	/**
-	 * @populate_devmem_pfn: Populate device memory PFN (required for migration)
-	 * @devmem_allocation: device memory allocation
-	 * @npages: Number of pages to populate
-	 * @pfn: Array of page frame numbers to populate
-	 *
-	 * Populate device memory page frame numbers (PFN).
-	 *
-	 * Return: 0 on success, a negative error code on failure.
-	 */
-	int (*populate_devmem_pfn)(struct drm_gpusvm_devmem *devmem_allocation,
-				   unsigned long npages, unsigned long *pfn);
-
-	/**
-	 * @copy_to_devmem: Copy to device memory (required for migration)
-	 * @pages: Pointer to array of device memory pages (destination)
-	 * @dma_addr: Pointer to array of DMA addresses (source)
-	 * @npages: Number of pages to copy
-	 *
-	 * Copy pages to device memory.
-	 *
-	 * Return: 0 on success, a negative error code on failure.
-	 */
-	int (*copy_to_devmem)(struct page **pages,
-			      dma_addr_t *dma_addr,
-			      unsigned long npages);
-
-	/**
-	 * @copy_to_ram: Copy to system RAM (required for migration)
-	 * @pages: Pointer to array of device memory pages (source)
-	 * @dma_addr: Pointer to array of DMA addresses (destination)
-	 * @npages: Number of pages to copy
-	 *
-	 * Copy pages to system RAM.
-	 *
-	 * Return: 0 on success, a negative error code on failure.
-	 */
-	int (*copy_to_ram)(struct page **pages,
-			   dma_addr_t *dma_addr,
-			   unsigned long npages);
-};
-
-/**
- * struct drm_gpusvm_devmem - Structure representing a GPU SVM device memory allocation
- *
- * @dev: Pointer to the device structure which device memory allocation belongs to
- * @mm: Pointer to the mm_struct for the address space
- * @detached: device memory allocations is detached from device pages
- * @ops: Pointer to the operations structure for GPU SVM device memory
- * @dpagemap: The struct drm_pagemap of the pages this allocation belongs to.
- * @size: Size of device memory allocation
- * @timeslice_expiration: Timeslice expiration in jiffies
- */
-struct drm_gpusvm_devmem {
-	struct device *dev;
-	struct mm_struct *mm;
-	struct completion detached;
-	const struct drm_gpusvm_devmem_ops *ops;
-	struct drm_pagemap *dpagemap;
-	size_t size;
-	u64 timeslice_expiration;
-};
-
 /**
  * struct drm_gpusvm_ops - Operations structure for GPU SVM
  *
@@ -361,15 +279,6 @@ void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
 				  struct drm_gpusvm_range *range,
 				  const struct drm_gpusvm_ctx *ctx);
 
-int drm_gpusvm_migrate_to_devmem(struct drm_gpusvm *gpusvm,
-				 struct drm_gpusvm_range *range,
-				 struct drm_gpusvm_devmem *devmem_allocation,
-				 const struct drm_gpusvm_ctx *ctx);
-
-int drm_gpusvm_evict_to_ram(struct drm_gpusvm_devmem *devmem_allocation);
-
-const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void);
-
 bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, unsigned long start,
 			    unsigned long end);
 
@@ -380,11 +289,6 @@ drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, unsigned long start,
 void drm_gpusvm_range_set_unmapped(struct drm_gpusvm_range *range,
 				   const struct mmu_notifier_range *mmu_range);
 
-void drm_gpusvm_devmem_init(struct drm_gpusvm_devmem *devmem_allocation,
-			    struct device *dev, struct mm_struct *mm,
-			    const struct drm_gpusvm_devmem_ops *ops,
-			    struct drm_pagemap *dpagemap, size_t size);
-
 #ifdef CONFIG_LOCKDEP
 /**
  * drm_gpusvm_driver_set_lock() - Set the lock protecting accesses to GPU SVM
diff --git a/include/drm/drm_pagemap.h b/include/drm/drm_pagemap.h
index 202c157ff4d7..dabc9c365df4 100644
--- a/include/drm/drm_pagemap.h
+++ b/include/drm/drm_pagemap.h
@@ -7,6 +7,7 @@
 #include <linux/types.h>
 
 struct drm_pagemap;
+struct drm_pagemap_zdd;
 struct device;
 
 /**
@@ -104,4 +105,104 @@ struct drm_pagemap {
 	struct device *dev;
 };
 
+struct drm_pagemap_devmem;
+
+/**
+ * struct drm_pagemap_devmem_ops - Operations structure for GPU SVM device memory
+ *
+ * This structure defines the operations for GPU Shared Virtual Memory (SVM)
+ * device memory. These operations are provided by the GPU driver to manage device memory
+ * allocations and perform operations such as migration between device memory and system
+ * RAM.
+ */
+struct drm_pagemap_devmem_ops {
+	/**
+	 * @devmem_release: Release device memory allocation (optional)
+	 * @devmem_allocation: device memory allocation
+	 *
+	 * Release device memory allocation and drop a reference to device
+	 * memory allocation.
+	 */
+	void (*devmem_release)(struct drm_pagemap_devmem *devmem_allocation);
+
+	/**
+	 * @populate_devmem_pfn: Populate device memory PFN (required for migration)
+	 * @devmem_allocation: device memory allocation
+	 * @npages: Number of pages to populate
+	 * @pfn: Array of page frame numbers to populate
+	 *
+	 * Populate device memory page frame numbers (PFN).
+	 *
+	 * Return: 0 on success, a negative error code on failure.
+	 */
+	int (*populate_devmem_pfn)(struct drm_pagemap_devmem *devmem_allocation,
+				   unsigned long npages, unsigned long *pfn);
+
+	/**
+	 * @copy_to_devmem: Copy to device memory (required for migration)
+	 * @pages: Pointer to array of device memory pages (destination)
+	 * @dma_addr: Pointer to array of DMA addresses (source)
+	 * @npages: Number of pages to copy
+	 *
+	 * Copy pages to device memory.
+	 *
+	 * Return: 0 on success, a negative error code on failure.
+	 */
+	int (*copy_to_devmem)(struct page **pages,
+			      dma_addr_t *dma_addr,
+			      unsigned long npages);
+
+	/**
+	 * @copy_to_ram: Copy to system RAM (required for migration)
+	 * @pages: Pointer to array of device memory pages (source)
+	 * @dma_addr: Pointer to array of DMA addresses (destination)
+	 * @npages: Number of pages to copy
+	 *
+	 * Copy pages to system RAM.
+	 *
+	 * Return: 0 on success, a negative error code on failure.
+	 */
+	int (*copy_to_ram)(struct page **pages,
+			   dma_addr_t *dma_addr,
+			   unsigned long npages);
+};
+
+/**
+ * struct drm_pagemap_devmem - Structure representing a GPU SVM device memory allocation
+ *
+ * @dev: Pointer to the device structure which device memory allocation belongs to
+ * @mm: Pointer to the mm_struct for the address space
+ * @detached: device memory allocation is detached from device pages
+ * @ops: Pointer to the operations structure for GPU SVM device memory
+ * @dpagemap: The struct drm_pagemap of the pages this allocation belongs to.
+ * @size: Size of device memory allocation
+ * @timeslice_expiration: Timeslice expiration in jiffies
+ */
+struct drm_pagemap_devmem {
+	struct device *dev;
+	struct mm_struct *mm;
+	struct completion detached;
+	const struct drm_pagemap_devmem_ops *ops;
+	struct drm_pagemap *dpagemap;
+	size_t size;
+	u64 timeslice_expiration;
+};
+
+int drm_pagemap_migrate_to_devmem(struct drm_pagemap_devmem *devmem_allocation,
+				  struct mm_struct *mm,
+				  unsigned long start, unsigned long end,
+				  unsigned long timeslice_ms,
+				  void *pgmap_owner);
+
+int drm_pagemap_evict_to_ram(struct drm_pagemap_devmem *devmem_allocation);
+
+const struct dev_pagemap_ops *drm_pagemap_pagemap_ops_get(void);
+
+struct drm_pagemap *drm_pagemap_page_to_dpagemap(struct page *page);
+
+void drm_pagemap_devmem_init(struct drm_pagemap_devmem *devmem_allocation,
+			     struct device *dev, struct mm_struct *mm,
+			     const struct drm_pagemap_devmem_ops *ops,
+			     struct drm_pagemap *dpagemap, size_t size);
+
 #endif
-- 
cgit v1.2.3


From 2ef19be2a545a63310c5c0fae0e173fc0c33bb6a Mon Sep 17 00:00:00 2001
From: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Date: Thu, 19 Jun 2025 15:40:34 +0200
Subject: drm/pagemap: Add a populate_mm op
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add an operation to populate a part of an mm_struct with device
private memory. Clarify how migration using it is intended
to work.

v3:
- Kerneldoc fixes and updates (Matt Brost).
v4:
- More kerneldoc fixes. Rebase.

Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
Link: https://lore.kernel.org/r/20250619134035.170086-3-thomas.hellstrom@linux.intel.com
---
 drivers/gpu/drm/drm_gpusvm.c  |  9 ++----
 drivers/gpu/drm/drm_pagemap.c | 67 +++++++++++++++++++++++++++++++++++--------
 include/drm/drm_pagemap.h     | 34 ++++++++++++++++++++++
 3 files changed, 92 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_gpusvm.c b/drivers/gpu/drm/drm_gpusvm.c
index e454bb806c72..5bb4c77db2c3 100644
--- a/drivers/gpu/drm/drm_gpusvm.c
+++ b/drivers/gpu/drm/drm_gpusvm.c
@@ -176,12 +176,9 @@
  *		}
  *
  *		if (driver_migration_policy(range)) {
- *			mmap_read_lock(mm);
- *			devmem = driver_alloc_devmem();
- *			err = drm_pagemap_migrate_to_devmem(devmem, gpusvm->mm, gpuva_start,
- *                                                          gpuva_end, ctx->timeslice_ms,
- *                                                          driver_pgmap_owner());
- *                      mmap_read_unlock(mm);
+ *			err = drm_pagemap_populate_mm(driver_choose_drm_pagemap(),
+ *						      gpuva_start, gpuva_end, gpusvm->mm,
+ *						      ctx->timeslice_ms);
  *			if (err)	// CPU mappings may have changed
  *				goto retry;
  *		}
diff --git a/drivers/gpu/drm/drm_pagemap.c b/drivers/gpu/drm/drm_pagemap.c
index cef4657b6e8a..13e1519aa6d6 100644
--- a/drivers/gpu/drm/drm_pagemap.c
+++ b/drivers/gpu/drm/drm_pagemap.c
@@ -6,6 +6,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/migrate.h>
 #include <linux/pagemap.h>
+#include <drm/drm_drv.h>
 #include <drm/drm_pagemap.h>
 
 /**
@@ -20,23 +21,30 @@
  * system.
  *
  * Typically the DRM pagemap receives requests from one or more DRM GPU SVM
- * instances to populate struct mm_struct virtual ranges with memory.
+ * instances to populate struct mm_struct virtual ranges with memory, and the
+ * migration is best effort only and may thus fail. The implementation should
+ * also handle device unbinding by blocking (returning an -ENODEV error) new
+ * population requests and after that migrate all device pages to system ram.
  */
 
 /**
  * DOC: Migration
  *
- * The migration support is quite simple, allowing migration between RAM and
- * device memory at the range granularity. For example, GPU SVM currently does
- * not support mixing RAM and device memory pages within a range. This means
- * that upon GPU fault, the entire range can be migrated to device memory, and
- * upon CPU fault, the entire range is migrated to RAM. Mixed RAM and device
- * memory storage within a range could be added in the future if required.
- *
- * The reasoning for only supporting range granularity is as follows: it
- * simplifies the implementation, and range sizes are driver-defined and should
- * be relatively small.
- *
+ * Migration granularity typically follows the GPU SVM range requests, but
+ * if there are clashes, due to races or due to the fact that multiple GPU
+ * SVM instances have different views of the ranges used, and because of that
+ * part of a requested range is already present in the requested device memory,
+ * the implementation has a variety of options. It can fail and it can choose
+ * to populate only the part of the range that isn't already in device memory,
+ * and it can evict the range to system before trying to migrate. Ideally an
+ * implementation would just try to migrate the missing part of the range and
+ * allocate just enough memory to do so.
+ *
+ * When migrating to system memory as a response to a cpu fault or a device
+ * memory eviction request, currently a full device memory allocation is
+ * migrated back to system. Moving forward this might need improvement for
+ * situations where a single page needs bouncing between system memory and
+ * device memory due to, for example, atomic operations.
  *
  * Key DRM pagemap components:
  *
@@ -792,3 +800,38 @@ struct drm_pagemap *drm_pagemap_page_to_dpagemap(struct page *page)
 	return zdd->devmem_allocation->dpagemap;
 }
 EXPORT_SYMBOL_GPL(drm_pagemap_page_to_dpagemap);
+
+/**
+ * drm_pagemap_populate_mm() - Populate a virtual range with device memory pages
+ * @dpagemap: Pointer to the drm_pagemap managing the device memory
+ * @start: Start of the virtual range to populate.
+ * @end: End of the virtual range to populate.
+ * @mm: Pointer to the virtual address space.
+ * @timeslice_ms: The time requested for the migrated pagemap pages to
+ * be present in @mm before being allowed to be migrated back.
+ *
+ * Attempt to populate a virtual range with device memory pages,
+ * clearing them or migrating data from the existing pages if necessary.
+ * The function is best effort only, and implementations may vary
+ * in how hard they try to satisfy the request.
+ *
+ * Return: %0 on success, negative error code on error. If the hardware
+ * device was removed / unbound the function will return %-ENODEV.
+ */
+int drm_pagemap_populate_mm(struct drm_pagemap *dpagemap,
+			    unsigned long start, unsigned long end,
+			    struct mm_struct *mm,
+			    unsigned long timeslice_ms)
+{
+	int err;
+
+	if (!mmget_not_zero(mm))
+		return -EFAULT;
+	mmap_read_lock(mm);
+	err = dpagemap->ops->populate_mm(dpagemap, start, end, mm,
+					 timeslice_ms);
+	mmap_read_unlock(mm);
+	mmput(mm);
+
+	return err;
+}
diff --git a/include/drm/drm_pagemap.h b/include/drm/drm_pagemap.h
index dabc9c365df4..e5f20a1235be 100644
--- a/include/drm/drm_pagemap.h
+++ b/include/drm/drm_pagemap.h
@@ -92,6 +92,35 @@ struct drm_pagemap_ops {
 			     struct device *dev,
 			     struct drm_pagemap_device_addr addr);
 
+	/**
+	 * @populate_mm: Populate part of the mm with @dpagemap memory,
+	 * migrating existing data.
+	 * @dpagemap: The struct drm_pagemap managing the memory.
+	 * @start: The virtual start address in @mm
+	 * @end: The virtual end address in @mm
+	 * @mm: Pointer to a live mm. The caller must have an mmget()
+	 * reference.
+	 *
+	 * The caller will have the mm lock at least in read mode.
+	 * Note that there is no guarantee that the memory is resident
+	 * after the function returns, it's best effort only.
+	 * When the mm is not using the memory anymore,
+	 * it will be released. The struct drm_pagemap might have a
+	 * mechanism in place to reclaim the memory and the data will
+	 * then be migrated. Typically to system memory.
+	 * The implementation should hold sufficient runtime power-
+	 * references while pages are used in an address space and
+	 * should ideally guard against hardware device unbind in
+	 * a way such that device pages are migrated back to system
+	 * followed by device page removal. The implementation should
+	 * return -ENODEV after device removal.
+	 *
+	 * Return: 0 if successful. Negative error code on error.
+	 */
+	int (*populate_mm)(struct drm_pagemap *dpagemap,
+			   unsigned long start, unsigned long end,
+			   struct mm_struct *mm,
+			   unsigned long timeslice_ms);
 };
 
 /**
@@ -205,4 +234,9 @@ void drm_pagemap_devmem_init(struct drm_pagemap_devmem *devmem_allocation,
 			     const struct drm_pagemap_devmem_ops *ops,
 			     struct drm_pagemap *dpagemap, size_t size);
 
+int drm_pagemap_populate_mm(struct drm_pagemap *dpagemap,
+			    unsigned long start, unsigned long end,
+			    struct mm_struct *mm,
+			    unsigned long timeslice_ms);
+
 #endif
-- 
cgit v1.2.3


From 3a95a561f2763e3854e207de3ea821e795a1f1e0 Mon Sep 17 00:00:00 2001
From: Viktor Malik <vmalik@redhat.com>
Date: Thu, 26 Jun 2025 08:08:28 +0200
Subject: uaccess: Define pagefault lock guard

Define a pagefault lock guard which allows simplifying functions that
need to disable page faults.

Signed-off-by: Viktor Malik <vmalik@redhat.com>
Link: https://lore.kernel.org/r/8a01beb0b671923976f08297d81242bb2129881d.1750917800.git.vmalik@redhat.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/uaccess.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 7c06f4795670..1beb5b395d81 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -296,6 +296,8 @@ static inline bool pagefault_disabled(void)
  */
 #define faulthandler_disabled() (pagefault_disabled() || in_atomic())
 
+DEFINE_LOCK_GUARD_0(pagefault, pagefault_disable(), pagefault_enable())
+
 #ifndef CONFIG_ARCH_HAS_SUBPAGE_FAULTS
 
 /**
-- 
cgit v1.2.3


From c430955d0cb87fb7c6b186e457cb3beca4a9c89a Mon Sep 17 00:00:00 2001
From: Gwendal Grignou <gwendal@chromium.org>
Date: Tue, 3 Jun 2025 22:39:03 -0700
Subject: iio: cros_ec_sensors: add cros_ec_activity driver

ChromeOS EC can report activity information derived from the
accelerometer:
- Reports on-body/off-body as a proximity event.
- Reports significant motion as an activity event.

This new sensor is a virtual sensor, included only when the EC firmware
is compiled with the appropriate module.

Signed-off-by: Gwendal Grignou <gwendal@google.com>
Reviewed-by: Tzung-Bi Shih <tzungbi@kernel.org>
Link: https://patch.msgid.link/20250604053903.1376465-1-gwendal@google.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/common/cros_ec_sensors/Kconfig         |   9 +
 drivers/iio/common/cros_ec_sensors/Makefile        |   1 +
 .../iio/common/cros_ec_sensors/cros_ec_activity.c  | 307 +++++++++++++++++++++
 .../common/cros_ec_sensors/cros_ec_sensors_core.c  |  10 +
 include/linux/iio/common/cros_ec_sensors_core.h    |   1 +
 include/linux/platform_data/cros_ec_commands.h     |  26 +-
 6 files changed, 352 insertions(+), 2 deletions(-)
 create mode 100644 drivers/iio/common/cros_ec_sensors/cros_ec_activity.c

(limited to 'include')

diff --git a/drivers/iio/common/cros_ec_sensors/Kconfig b/drivers/iio/common/cros_ec_sensors/Kconfig
index fefad9572790..394e319c9c97 100644
--- a/drivers/iio/common/cros_ec_sensors/Kconfig
+++ b/drivers/iio/common/cros_ec_sensors/Kconfig
@@ -30,3 +30,12 @@ config IIO_CROS_EC_SENSORS_LID_ANGLE
 	  convertible devices.
 	  This module is loaded when the EC can calculate the angle between the base
 	  and the lid.
+
+config IIO_CROS_EC_ACTIVITY
+	tristate "ChromeOS EC Activity Sensors"
+	depends on IIO_CROS_EC_SENSORS_CORE
+	help
+	  Module to handle activity events presented by the ChromeOS EC sensor hub.
+	  Activities can be a proximity detector (on body/off body detection)
+	  or a significant motion detector.
+	  Creates an IIO device to manage all activities.
diff --git a/drivers/iio/common/cros_ec_sensors/Makefile b/drivers/iio/common/cros_ec_sensors/Makefile
index c358fa0328ab..a7dfb5794cae 100644
--- a/drivers/iio/common/cros_ec_sensors/Makefile
+++ b/drivers/iio/common/cros_ec_sensors/Makefile
@@ -7,3 +7,4 @@ cros-ec-sensors-core-objs += cros_ec_sensors_core.o cros_ec_sensors_trace.o
 obj-$(CONFIG_IIO_CROS_EC_SENSORS_CORE) += cros-ec-sensors-core.o
 obj-$(CONFIG_IIO_CROS_EC_SENSORS) += cros_ec_sensors.o
 obj-$(CONFIG_IIO_CROS_EC_SENSORS_LID_ANGLE) += cros_ec_lid_angle.o
+obj-$(CONFIG_IIO_CROS_EC_ACTIVITY) += cros_ec_activity.o
diff --git a/drivers/iio/common/cros_ec_sensors/cros_ec_activity.c b/drivers/iio/common/cros_ec_sensors/cros_ec_activity.c
new file mode 100644
index 000000000000..6e38d115b6fe
--- /dev/null
+++ b/drivers/iio/common/cros_ec_sensors/cros_ec_activity.c
@@ -0,0 +1,307 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * cros_ec_activity - Driver for activities/gesture recognition.
+ *
+ * Copyright 2025 Google, Inc
+ *
+ * This driver uses the cros-ec interface to communicate with the ChromeOS
+ * EC about activity data.
+ */
+
+#include <linux/bits.h>
+#include <linux/cleanup.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/platform_device.h>
+#include <linux/types.h>
+
+#include <linux/platform_data/cros_ec_commands.h>
+#include <linux/platform_data/cros_ec_proto.h>
+
+#include <linux/iio/common/cros_ec_sensors_core.h>
+#include <linux/iio/events.h>
+#include <linux/iio/iio.h>
+#include <linux/iio/trigger_consumer.h>
+
+#define DRV_NAME "cros-ec-activity"
+
+/* state data for ec_sensors iio driver. */
+struct cros_ec_sensors_state {
+	/* Shared by all sensors */
+	struct cros_ec_sensors_core_state core;
+
+	struct iio_chan_spec *channels;
+
+	int body_detection_channel_index;
+	int sig_motion_channel_index;
+};
+
+static const struct iio_event_spec cros_ec_activity_single_shot[] = {
+	{
+		.type = IIO_EV_TYPE_CHANGE,
+		/* Significant motion triggers when we leave the still state. */
+		.dir = IIO_EV_DIR_FALLING,
+		.mask_separate = BIT(IIO_EV_INFO_ENABLE),
+	},
+};
+
+static const struct iio_event_spec cros_ec_body_detect_events[] = {
+	{
+		.type = IIO_EV_TYPE_CHANGE,
+		.dir = IIO_EV_DIR_EITHER,
+		.mask_separate = BIT(IIO_EV_INFO_ENABLE),
+	},
+};
+
+static int cros_ec_activity_sensors_read_raw(struct iio_dev *indio_dev,
+					     struct iio_chan_spec const *chan,
+					     int *val, int *val2, long mask)
+{
+	struct cros_ec_sensors_state *st = iio_priv(indio_dev);
+	int ret;
+
+	if (chan->type != IIO_PROXIMITY || mask != IIO_CHAN_INFO_RAW)
+		return -EINVAL;
+
+	guard(mutex)(&st->core.cmd_lock);
+	st->core.param.cmd = MOTIONSENSE_CMD_GET_ACTIVITY;
+	st->core.param.get_activity.activity =
+		MOTIONSENSE_ACTIVITY_BODY_DETECTION;
+	ret = cros_ec_motion_send_host_cmd(&st->core, 0);
+	if (ret)
+		return ret;
+
+	/*
+	 * The EC reports whether a body is near (1) or far (0).
+	 * Units for a proximity sensor after scaling are meters,
+	 * so invert the result to return 0m when near and 1m when far.
+	 */
+	*val = !st->core.resp->get_activity.state;
+	return IIO_VAL_INT;
+}
+
+static int cros_ec_activity_read_event_config(struct iio_dev *indio_dev,
+					      const struct iio_chan_spec *chan,
+					      enum iio_event_type type,
+					      enum iio_event_direction dir)
+{
+	struct cros_ec_sensors_state *st = iio_priv(indio_dev);
+	int ret;
+
+	if (chan->type != IIO_ACTIVITY && chan->type != IIO_PROXIMITY)
+		return -EINVAL;
+
+	guard(mutex)(&st->core.cmd_lock);
+	st->core.param.cmd = MOTIONSENSE_CMD_LIST_ACTIVITIES;
+	ret = cros_ec_motion_send_host_cmd(&st->core, 0);
+	if (ret)
+		return ret;
+
+	switch (chan->type) {
+	case IIO_PROXIMITY:
+		return !!(st->core.resp->list_activities.enabled &
+			 (1 << MOTIONSENSE_ACTIVITY_BODY_DETECTION));
+	case IIO_ACTIVITY:
+		if (chan->channel2 == IIO_MOD_STILL) {
+			return !!(st->core.resp->list_activities.enabled &
+				 (1 << MOTIONSENSE_ACTIVITY_SIG_MOTION));
+		}
+
+		dev_warn(&indio_dev->dev, "Unknown activity: %d\n",
+			 chan->channel2);
+		return -EINVAL;
+	default:
+		dev_warn(&indio_dev->dev, "Unknown channel type: %d\n",
+			 chan->type);
+		return -EINVAL;
+	}
+}
+
+static int cros_ec_activity_write_event_config(struct iio_dev *indio_dev,
+					       const struct iio_chan_spec *chan,
+					       enum iio_event_type type,
+					       enum iio_event_direction dir,
+					       bool state)
+{
+	struct cros_ec_sensors_state *st = iio_priv(indio_dev);
+
+	guard(mutex)(&st->core.cmd_lock);
+	st->core.param.cmd = MOTIONSENSE_CMD_SET_ACTIVITY;
+	switch (chan->type) {
+	case IIO_PROXIMITY:
+		st->core.param.set_activity.activity =
+			MOTIONSENSE_ACTIVITY_BODY_DETECTION;
+		break;
+	case IIO_ACTIVITY:
+		if (chan->channel2 == IIO_MOD_STILL) {
+			st->core.param.set_activity.activity =
+				MOTIONSENSE_ACTIVITY_SIG_MOTION;
+			break;
+		}
+		dev_warn(&indio_dev->dev, "Unknown activity: %d\n",
+			 chan->channel2);
+		return -EINVAL;
+	default:
+		dev_warn(&indio_dev->dev, "Unknown channel type: %d\n",
+			 chan->type);
+		return -EINVAL;
+	}
+	st->core.param.set_activity.enable = state;
+	return cros_ec_motion_send_host_cmd(&st->core, 0);
+}
+
+static int cros_ec_activity_push_data(struct iio_dev *indio_dev,
+				      s16 *data, s64 timestamp)
+{
+	struct ec_response_activity_data *activity_data =
+			(struct ec_response_activity_data *)data;
+	enum motionsensor_activity activity = activity_data->activity;
+	u8 state = activity_data->state;
+	const struct cros_ec_sensors_state *st = iio_priv(indio_dev);
+	const struct iio_chan_spec *chan;
+	enum iio_event_direction dir;
+	int index;
+
+	switch (activity) {
+	case MOTIONSENSE_ACTIVITY_BODY_DETECTION:
+		index = st->body_detection_channel_index;
+		dir = state ? IIO_EV_DIR_FALLING : IIO_EV_DIR_RISING;
+		break;
+	case MOTIONSENSE_ACTIVITY_SIG_MOTION:
+		index = st->sig_motion_channel_index;
+		dir = IIO_EV_DIR_FALLING;
+		break;
+	default:
+		dev_warn(&indio_dev->dev, "Unknown activity: %d\n", activity);
+		return 0;
+	}
+	chan = &st->channels[index];
+	iio_push_event(indio_dev,
+		       IIO_UNMOD_EVENT_CODE(chan->type, index, chan->event_spec[0].type, dir),
+		       timestamp);
+	return 0;
+}
+
+static irqreturn_t cros_ec_activity_capture(int irq, void *p)
+{
+	struct iio_poll_func *pf = p;
+	struct iio_dev *indio_dev = pf->indio_dev;
+
+	/*
+	 * This callback would be called when a software trigger is
+	 * used. But when this virtual sensor is present, it is guaranteed
+	 * the sensor hub is advanced enough to not need a software trigger.
+	 */
+	dev_warn(&indio_dev->dev, "%s: Not Expected\n", __func__);
+	return IRQ_NONE;
+}
+
+static const struct iio_info ec_sensors_info = {
+	.read_raw = &cros_ec_activity_sensors_read_raw,
+	.read_event_config = cros_ec_activity_read_event_config,
+	.write_event_config = cros_ec_activity_write_event_config,
+};
+
+static int cros_ec_sensors_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct cros_ec_device *ec_device = dev_get_drvdata(dev->parent);
+	struct iio_dev *indio_dev;
+	struct cros_ec_sensors_state *st;
+	struct iio_chan_spec *channel;
+	unsigned long activities;
+	int i, index, ret, nb_activities;
+
+	if (!ec_device) {
+		dev_warn(dev, "No CROS EC device found.\n");
+		return -EINVAL;
+	}
+
+	indio_dev = devm_iio_device_alloc(dev, sizeof(*st));
+	if (!indio_dev)
+		return -ENOMEM;
+
+	ret = cros_ec_sensors_core_init(pdev, indio_dev, true,
+					cros_ec_activity_capture);
+	if (ret)
+		return ret;
+
+	indio_dev->info = &ec_sensors_info;
+	st = iio_priv(indio_dev);
+	st->core.type = st->core.resp->info.type;
+	st->core.read_ec_sensors_data = cros_ec_sensors_read_cmd;
+
+	st->core.param.cmd = MOTIONSENSE_CMD_LIST_ACTIVITIES;
+	ret = cros_ec_motion_send_host_cmd(&st->core, 0);
+	if (ret)
+		return ret;
+
+	activities = st->core.resp->list_activities.enabled |
+		     st->core.resp->list_activities.disabled;
+	if (!activities)
+		return -ENODEV;
+
+	/* Allocate a channel per activity and one for timestamp */
+	nb_activities = hweight_long(activities) + 1;
+	st->channels = devm_kcalloc(dev, nb_activities,
+				    sizeof(*st->channels), GFP_KERNEL);
+	if (!st->channels)
+		return -ENOMEM;
+
+	channel = &st->channels[0];
+	index = 0;
+	for_each_set_bit(i, &activities, BITS_PER_LONG) {
+		/* List all available triggers */
+		if (i == MOTIONSENSE_ACTIVITY_BODY_DETECTION) {
+			channel->type = IIO_PROXIMITY;
+			channel->info_mask_separate = BIT(IIO_CHAN_INFO_RAW);
+			channel->event_spec = cros_ec_body_detect_events;
+			channel->num_event_specs =
+				ARRAY_SIZE(cros_ec_body_detect_events);
+			st->body_detection_channel_index = index;
+		} else {
+			channel->type = IIO_ACTIVITY;
+			channel->modified = 1;
+			channel->event_spec = cros_ec_activity_single_shot;
+			channel->num_event_specs =
+				ARRAY_SIZE(cros_ec_activity_single_shot);
+			if (i == MOTIONSENSE_ACTIVITY_SIG_MOTION) {
+				channel->channel2 = IIO_MOD_STILL;
+				st->sig_motion_channel_index = index;
+			} else {
+				dev_warn(dev, "Unknown activity: %d\n", i);
+				continue;
+			}
+		}
+		channel->ext_info = cros_ec_sensors_limited_info;
+		channel->scan_index = index++;
+		channel++;
+	}
+
+	/* Timestamp */
+	channel->scan_index = index;
+	channel->type = IIO_TIMESTAMP;
+	channel->channel = -1;
+	channel->scan_type.sign = 's';
+	channel->scan_type.realbits = 64;
+	channel->scan_type.storagebits = 64;
+
+	indio_dev->channels = st->channels;
+	indio_dev->num_channels = index + 1;
+
+	return cros_ec_sensors_core_register(dev, indio_dev,
+					     cros_ec_activity_push_data);
+}
+
+static struct platform_driver cros_ec_sensors_platform_driver = {
+	.driver = {
+		.name	= DRV_NAME,
+	},
+	.probe		= cros_ec_sensors_probe,
+};
+module_platform_driver(cros_ec_sensors_platform_driver);
+
+MODULE_DESCRIPTION("ChromeOS EC activity sensors driver");
+MODULE_ALIAS("platform:" DRV_NAME);
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/iio/common/cros_ec_sensors/cros_ec_sensors_core.c b/drivers/iio/common/cros_ec_sensors/cros_ec_sensors_core.c
index 700ebcd68ff4..9ac80e4b7d75 100644
--- a/drivers/iio/common/cros_ec_sensors/cros_ec_sensors_core.c
+++ b/drivers/iio/common/cros_ec_sensors/cros_ec_sensors_core.c
@@ -486,6 +486,16 @@ const struct iio_chan_spec_ext_info cros_ec_sensors_ext_info[] = {
 };
 EXPORT_SYMBOL_GPL(cros_ec_sensors_ext_info);
 
+const struct iio_chan_spec_ext_info cros_ec_sensors_limited_info[] = {
+	{
+		.name = "id",
+		.shared = IIO_SHARED_BY_ALL,
+		.read = cros_ec_sensors_id
+	},
+	{ }
+};
+EXPORT_SYMBOL_GPL(cros_ec_sensors_limited_info);
+
 /**
  * cros_ec_sensors_idx_to_reg - convert index into offset in shared memory
  * @st:		pointer to state information for device
diff --git a/include/linux/iio/common/cros_ec_sensors_core.h b/include/linux/iio/common/cros_ec_sensors_core.h
index e72167b96d27..bb966abcde53 100644
--- a/include/linux/iio/common/cros_ec_sensors_core.h
+++ b/include/linux/iio/common/cros_ec_sensors_core.h
@@ -126,5 +126,6 @@ extern const struct dev_pm_ops cros_ec_sensors_pm_ops;
 
 /* List of extended channel specification for all sensors. */
 extern const struct iio_chan_spec_ext_info cros_ec_sensors_ext_info[];
+extern const struct iio_chan_spec_ext_info cros_ec_sensors_limited_info[];
 
 #endif  /* __CROS_EC_SENSORS_CORE_H */
diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h
index 1f4e4f2b89bb..c19b404e3d8d 100644
--- a/include/linux/platform_data/cros_ec_commands.h
+++ b/include/linux/platform_data/cros_ec_commands.h
@@ -2388,6 +2388,12 @@ enum motionsense_command {
 	 */
 	MOTIONSENSE_CMD_SENSOR_SCALE = 18,
 
+	/*
+	 * Activity management
+	 * Retrieve current status of given activity.
+	 */
+	MOTIONSENSE_CMD_GET_ACTIVITY = 20,
+
 	/* Number of motionsense sub-commands. */
 	MOTIONSENSE_NUM_CMDS
 };
@@ -2447,6 +2453,11 @@ enum motionsensor_orientation {
 	MOTIONSENSE_ORIENTATION_UNKNOWN = 4,
 };
 
+struct ec_response_activity_data {
+	uint8_t activity; /* motionsensor_activity */
+	uint8_t state;
+} __ec_todo_packed;
+
 struct ec_response_motion_sensor_data {
 	/* Flags for each sensor. */
 	uint8_t flags;
@@ -2460,8 +2471,7 @@ struct ec_response_motion_sensor_data {
 			uint32_t    timestamp;
 		};
 		struct __ec_todo_unpacked {
-			uint8_t     activity; /* motionsensor_activity */
-			uint8_t     state;
+			struct ec_response_activity_data activity_data;
 			int16_t     add_info[2];
 		};
 	};
@@ -2494,6 +2504,7 @@ enum motionsensor_activity {
 	MOTIONSENSE_ACTIVITY_SIG_MOTION = 1,
 	MOTIONSENSE_ACTIVITY_DOUBLE_TAP = 2,
 	MOTIONSENSE_ACTIVITY_ORIENTATION = 3,
+	MOTIONSENSE_ACTIVITY_BODY_DETECTION = 4,
 };
 
 struct ec_motion_sense_activity {
@@ -2671,6 +2682,7 @@ struct ec_params_motion_sense {
 			uint32_t max_data_vector;
 		} fifo_read;
 
+		/* Used for MOTIONSENSE_CMD_SET_ACTIVITY */
 		struct ec_motion_sense_activity set_activity;
 
 		/* Used for MOTIONSENSE_CMD_LID_ANGLE */
@@ -2716,6 +2728,12 @@ struct ec_params_motion_sense {
 			 */
 			int16_t hys_degree;
 		} tablet_mode_threshold;
+
+		/* Used for MOTIONSENSE_CMD_GET_ACTIVITY */
+		struct __ec_todo_unpacked {
+			uint8_t sensor_num;
+			uint8_t activity;  /* enum motionsensor_activity */
+		} get_activity;
 	};
 } __ec_todo_packed;
 
@@ -2833,6 +2851,10 @@ struct ec_response_motion_sense {
 			uint16_t hys_degree;
 		} tablet_mode_threshold;
 
+		/* Used for MOTIONSENSE_CMD_GET_ACTIVITY. */
+		struct __ec_todo_unpacked {
+			uint8_t     state;
+		} get_activity;
 	};
 } __ec_todo_packed;
 
-- 
cgit v1.2.3


From 97e6882ed1a16cd22184127abf2bc9b8202f37e0 Mon Sep 17 00:00:00 2001
From: Pop Ioan Daniel <pop.ioan-daniel@analog.com>
Date: Thu, 5 Jun 2025 18:09:40 +0300
Subject: iio: backend: update iio_backend_oversampling_ratio_set
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add chan parameter to iio_backend_oversampling_ratio_set() to allow
for contexts where the channel must be specified. Modify all
existing users.

Reviewed-by: David Lechner <dlechner@baylibre.com>
Reviewed-by: Nuno Sá <nuno.sa@analog.com>
Signed-off-by: Pop Ioan Daniel <pop.ioan-daniel@analog.com>
Link: https://patch.msgid.link/20250605150948.3091827-3-pop.ioan-daniel@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/adc/ad4851.c           | 3 ++-
 drivers/iio/adc/adi-axi-adc.c      | 3 ++-
 drivers/iio/industrialio-backend.c | 3 ++-
 include/linux/iio/backend.h        | 3 ++-
 4 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/iio/adc/ad4851.c b/drivers/iio/adc/ad4851.c
index 1751f601e7f7..31e1e02c0ce3 100644
--- a/drivers/iio/adc/ad4851.c
+++ b/drivers/iio/adc/ad4851.c
@@ -320,7 +320,8 @@ static int ad4851_set_oversampling_ratio(struct iio_dev *indio_dev,
 			return ret;
 	}
 
-	ret = iio_backend_oversampling_ratio_set(st->back, osr);
+	/* Channel is ignored by the backend being used here */
+	ret = iio_backend_oversampling_ratio_set(st->back, 0, osr);
 	if (ret)
 		return ret;
 
diff --git a/drivers/iio/adc/adi-axi-adc.c b/drivers/iio/adc/adi-axi-adc.c
index ec08a62f0ef7..d0ad318d5400 100644
--- a/drivers/iio/adc/adi-axi-adc.c
+++ b/drivers/iio/adc/adi-axi-adc.c
@@ -387,7 +387,8 @@ static int axi_adc_ad485x_data_size_set(struct iio_backend *back,
 }
 
 static int axi_adc_ad485x_oversampling_ratio_set(struct iio_backend *back,
-					  unsigned int ratio)
+						 unsigned int chan,
+						 unsigned int ratio)
 {
 	struct adi_axi_adc_state *st = iio_backend_get_priv(back);
 
diff --git a/drivers/iio/industrialio-backend.c b/drivers/iio/industrialio-backend.c
index 6b2d3dac52b3..decd74caf305 100644
--- a/drivers/iio/industrialio-backend.c
+++ b/drivers/iio/industrialio-backend.c
@@ -720,9 +720,10 @@ EXPORT_SYMBOL_NS_GPL(iio_backend_data_size_set, "IIO_BACKEND");
  * 0 on success, negative error number on failure.
  */
 int iio_backend_oversampling_ratio_set(struct iio_backend *back,
+				       unsigned int chan,
 				       unsigned int ratio)
 {
-	return iio_backend_op_call(back, oversampling_ratio_set, ratio);
+	return iio_backend_op_call(back, oversampling_ratio_set, chan, ratio);
 }
 EXPORT_SYMBOL_NS_GPL(iio_backend_oversampling_ratio_set, "IIO_BACKEND");
 
diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h
index 1f528fbd9d11..7f815f3fed6a 100644
--- a/include/linux/iio/backend.h
+++ b/include/linux/iio/backend.h
@@ -155,7 +155,7 @@ struct iio_backend_ops {
 				  enum iio_backend_interface_type *type);
 	int (*data_size_set)(struct iio_backend *back, unsigned int size);
 	int (*oversampling_ratio_set)(struct iio_backend *back,
-				      unsigned int ratio);
+				      unsigned int chan, unsigned int ratio);
 	int (*read_raw)(struct iio_backend *back,
 			struct iio_chan_spec const *chan, int *val, int *val2,
 			long mask);
@@ -228,6 +228,7 @@ int iio_backend_interface_type_get(struct iio_backend *back,
 				   enum iio_backend_interface_type *type);
 int iio_backend_data_size_set(struct iio_backend *back, unsigned int size);
 int iio_backend_oversampling_ratio_set(struct iio_backend *back,
+				       unsigned int chan,
 				       unsigned int ratio);
 int iio_backend_read_raw(struct iio_backend *back,
 			 struct iio_chan_spec const *chan, int *val, int *val2,
-- 
cgit v1.2.3


From 1905e6c9ce018bd7eb9a1722ae689e9cebed24ad Mon Sep 17 00:00:00 2001
From: Jonathan Santos <Jonathan.Santos@analog.com>
Date: Wed, 11 Jun 2025 08:50:27 -0300
Subject: dt-bindings: iio: adc: ad7768-1: add trigger-sources property

In addition to GPIO synchronization, the AD7768-1 also supports
synchronization over SPI, whose use is recommended when the GPIO
cannot provide a pulse synchronous with the base MCLK signal. It
consists of looping back the SYNC_OUT to the SYNC_IN pin and sending
a command via SPI to trigger the synchronization.

Introduce the 'trigger-sources' property to enable SPI-based
synchronization via SYNC_OUT pin, along with additional optional
entries for GPIO3 and DRDY pins.

Also create the #trigger-source-cells property to differentiate the
trigger sources provided by the ADC. To improve readability, create an
adi,ad7768-1.h header with the macros for the cell values.

While at it, add description to the interrupts property.

Acked-by: Conor Dooley <conor.dooley@microchip.com>
Reviewed-by: David Lechner <dlechner@baylibre.com>
Signed-off-by: Jonathan Santos <Jonathan.Santos@analog.com>
Link: https://patch.msgid.link/713fd786010c75858700efaec8bb285274e7057e.1749569957.git.Jonathan.Santos@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 .../devicetree/bindings/iio/adc/adi,ad7768-1.yaml  | 39 +++++++++++++++++++++-
 MAINTAINERS                                        |  1 +
 include/dt-bindings/iio/adc/adi,ad7768-1.h         | 10 ++++++
 3 files changed, 49 insertions(+), 1 deletion(-)
 create mode 100644 include/dt-bindings/iio/adc/adi,ad7768-1.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/iio/adc/adi,ad7768-1.yaml b/Documentation/devicetree/bindings/iio/adc/adi,ad7768-1.yaml
index 89db6f56a379..c06d0fc791d3 100644
--- a/Documentation/devicetree/bindings/iio/adc/adi,ad7768-1.yaml
+++ b/Documentation/devicetree/bindings/iio/adc/adi,ad7768-1.yaml
@@ -26,7 +26,26 @@ properties:
   clock-names:
     const: mclk
 
+  trigger-sources:
+    $ref: /schemas/types.yaml#/definitions/phandle-array
+    minItems: 1
+    maxItems: 2
+    description: |
+      A list of phandles referencing trigger source providers. Each entry
+      represents a trigger source for the ADC:
+
+        - First entry specifies the device responsible for driving the
+          synchronization (SYNC_IN) pin, as an alternative to adi,sync-in-gpios.
+          This can be a `gpio-trigger` or another `ad7768-1` device. If the
+          device's own SYNC_OUT pin is internally connected to its SYNC_IN pin,
+          reference the device itself or omit this property.
+        - Second entry optionally defines a GPIO3 pin used as a START signal trigger.
+
+      Use the accompanying trigger source cell to identify the type of each entry.
+
   interrupts:
+    description:
+      DRDY (Data Ready) pin, which signals conversion results are available.
     maxItems: 1
 
   '#address-cells':
@@ -70,6 +89,15 @@ properties:
   "#io-channel-cells":
     const: 1
 
+  "#trigger-source-cells":
+    description: |
+      Cell indicates the trigger output signal: 0 = SYNC_OUT, 1 = GPIO3,
+      2 = DRDY.
+
+      For better readability, macros for these values are available in
+      dt-bindings/iio/adc/adi,ad7768-1.h.
+    const: 1
+
   gpio-controller: true
 
   "#gpio-cells":
@@ -86,7 +114,16 @@ required:
   - vref-supply
   - spi-cpol
   - spi-cpha
-  - adi,sync-in-gpios
+
+dependencies:
+  adi,sync-in-gpios:
+    not:
+      required:
+        - trigger-sources
+  trigger-sources:
+    not:
+      required:
+        - adi,sync-in-gpios
 
 patternProperties:
   "^channel@([0-9]|1[0-5])$":
diff --git a/MAINTAINERS b/MAINTAINERS
index daa83ce96249..d0809d62ff48 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1490,6 +1490,7 @@ S:	Supported
 W:	https://ez.analog.com/linux-software-drivers
 F:	Documentation/devicetree/bindings/iio/adc/adi,ad7768-1.yaml
 F:	drivers/iio/adc/ad7768-1.c
+F:	include/dt-bindings/iio/adc/adi,ad7768-1.h
 
 ANALOG DEVICES INC AD7780 DRIVER
 M:	Michael Hennerich <Michael.Hennerich@analog.com>
diff --git a/include/dt-bindings/iio/adc/adi,ad7768-1.h b/include/dt-bindings/iio/adc/adi,ad7768-1.h
new file mode 100644
index 000000000000..34d92856a50b
--- /dev/null
+++ b/include/dt-bindings/iio/adc/adi,ad7768-1.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+
+#ifndef _DT_BINDINGS_ADI_AD7768_1_H
+#define _DT_BINDINGS_ADI_AD7768_1_H
+
+#define AD7768_TRIGGER_SOURCE_SYNC_OUT  0
+#define AD7768_TRIGGER_SOURCE_GPIO3     1
+#define AD7768_TRIGGER_SOURCE_DRDY      2
+
+#endif /* _DT_BINDINGS_ADI_AD7768_1_H */
-- 
cgit v1.2.3


From 12ffc3b1513ebc1f11ae77d053948504a94a68a6 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Fri, 13 Jun 2025 16:43:44 -0500
Subject: PM: Restrict swap use to later in the suspend sequence

Currently swap is restricted before drivers have had a chance to do
their prepare() PM callbacks. Restricting swap this early means that if
a driver needs to evict some content from memory into swap in its
prepare callback, it won't be able to.

On AMD dGPUs this can lead to failed suspends under memory pressure
situations as all VRAM must be evicted to system memory or swap.

Move the swap restriction to right after all devices have had a chance
to do the prepare() callback.  If there is any problem with the sequence,
restore swap in the appropriate dpm resume callbacks or error handling
paths.

Closes: https://github.com/ROCm/ROCK-Kernel-Driver/issues/174
Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/2362
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Tested-by: Nat Wittstock <nat@fardog.io>
Tested-by: Lucian Langa <lucilanga@7pot.org>
Link: https://patch.msgid.link/20250613214413.4127087-1-superm1@kernel.org
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/main.c | 5 ++++-
 include/linux/suspend.h   | 5 +++++
 kernel/kexec_core.c       | 1 +
 kernel/power/hibernate.c  | 3 ---
 kernel/power/power.h      | 5 -----
 kernel/power/suspend.c    | 3 +--
 6 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index eebe699fdf4f..bf77d28e959f 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -1236,6 +1236,7 @@ void dpm_complete(pm_message_t state)
  */
 void dpm_resume_end(pm_message_t state)
 {
+	pm_restore_gfp_mask();
 	dpm_resume(state);
 	dpm_complete(state);
 }
@@ -2176,8 +2177,10 @@ int dpm_suspend_start(pm_message_t state)
 	error = dpm_prepare(state);
 	if (error)
 		dpm_save_failed_step(SUSPEND_PREPARE);
-	else
+	else {
+		pm_restrict_gfp_mask();
 		error = dpm_suspend(state);
+	}
 
 	dpm_show_time(starttime, state, error, "start");
 	return error;
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index b1c76c8f2c82..6a3f92098872 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -446,6 +446,8 @@ extern int unregister_pm_notifier(struct notifier_block *nb);
 extern void ksys_sync_helper(void);
 extern void pm_report_hw_sleep_time(u64 t);
 extern void pm_report_max_hw_sleep(u64 t);
+void pm_restrict_gfp_mask(void);
+void pm_restore_gfp_mask(void);
 
 #define pm_notifier(fn, pri) {				\
 	static struct notifier_block fn##_nb =			\
@@ -492,6 +494,9 @@ static inline int unregister_pm_notifier(struct notifier_block *nb)
 static inline void pm_report_hw_sleep_time(u64 t) {};
 static inline void pm_report_max_hw_sleep(u64 t) {};
 
+static inline void pm_restrict_gfp_mask(void) {}
+static inline void pm_restore_gfp_mask(void) {}
+
 static inline void ksys_sync_helper(void) {}
 
 #define pm_notifier(fn, pri)	do { (void)(fn); } while (0)
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 9c59fa480b0b..3a9a9f240dbc 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1136,6 +1136,7 @@ int kernel_kexec(void)
  Resume_devices:
 		dpm_resume_end(PMSG_RESTORE);
  Resume_console:
+		pm_restore_gfp_mask();
 		console_resume_all();
 		thaw_processes();
  Restore_console:
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 519fb09de5e0..9216e3b91d3b 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -423,7 +423,6 @@ int hibernation_snapshot(int platform_mode)
 	}
 
 	console_suspend_all();
-	pm_restrict_gfp_mask();
 
 	error = dpm_suspend(PMSG_FREEZE);
 
@@ -559,7 +558,6 @@ int hibernation_restore(int platform_mode)
 
 	pm_prepare_console();
 	console_suspend_all();
-	pm_restrict_gfp_mask();
 	error = dpm_suspend_start(PMSG_QUIESCE);
 	if (!error) {
 		error = resume_target_kernel(platform_mode);
@@ -571,7 +569,6 @@ int hibernation_restore(int platform_mode)
 		BUG_ON(!error);
 	}
 	dpm_resume_end(PMSG_RECOVER);
-	pm_restore_gfp_mask();
 	console_resume_all();
 	pm_restore_console();
 	return error;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index cb1d71562002..7ccd709af93f 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -239,11 +239,6 @@ static inline void suspend_test_finish(const char *label) {}
 /* kernel/power/main.c */
 extern int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down);
 extern int pm_notifier_call_chain(unsigned long val);
-void pm_restrict_gfp_mask(void);
-void pm_restore_gfp_mask(void);
-#else
-static inline void pm_restrict_gfp_mask(void) {}
-static inline void pm_restore_gfp_mask(void) {}
 #endif
 
 #ifdef CONFIG_HIGHMEM
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 76b141b9aac0..bb608b68fb30 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -540,6 +540,7 @@ int suspend_devices_and_enter(suspend_state_t state)
 	return error;
 
  Recover_platform:
+	pm_restore_gfp_mask();
 	platform_recover(state);
 	goto Resume_devices;
 }
@@ -606,9 +607,7 @@ static int enter_state(suspend_state_t state)
 
 	trace_suspend_resume(TPS("suspend_enter"), state, false);
 	pm_pr_dbg("Suspending system (%s)\n", mem_sleep_labels[state]);
-	pm_restrict_gfp_mask();
 	error = suspend_devices_and_enter(state);
-	pm_restore_gfp_mask();
 
  Finish:
 	events_check_enabled = false;
-- 
cgit v1.2.3


From 08bf1663c21a3e815eda28fa242d84c945ca3b94 Mon Sep 17 00:00:00 2001
From: Bence Csókás <csokas.bence@prolan.hu>
Date: Tue, 10 Jun 2025 10:22:53 +0200
Subject: dmaengine: Add devm_dma_request_chan()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Expand the arsenal of devm functions for DMA devices, this time for
requesting channels.

Signed-off-by: Bence Csókás <csokas.bence@prolan.hu>
Link: https://lore.kernel.org/r/20250610082256.400492-2-csokas.bence@prolan.hu
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/dmaengine.c   | 30 ++++++++++++++++++++++++++++++
 include/linux/dmaengine.h |  7 +++++++
 2 files changed, 37 insertions(+)

(limited to 'include')

diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index 758fcd0546d8..ca13cd39330b 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -926,6 +926,36 @@ void dma_release_channel(struct dma_chan *chan)
 }
 EXPORT_SYMBOL_GPL(dma_release_channel);
 
+static void dmaenginem_release_channel(void *chan)
+{
+	dma_release_channel(chan);
+}
+
+/**
+ * devm_dma_request_chan - try to allocate an exclusive slave channel
+ * @dev:	pointer to client device structure
+ * @name:	slave channel name
+ *
+ * Returns pointer to appropriate DMA channel on success or an error pointer.
+ *
+ * The operation is managed and will be undone on driver detach.
+ */
+
+struct dma_chan *devm_dma_request_chan(struct device *dev, const char *name)
+{
+	struct dma_chan *chan = dma_request_chan(dev, name);
+	int ret = 0;
+
+	if (!IS_ERR(chan))
+		ret = devm_add_action_or_reset(dev, dmaenginem_release_channel, chan);
+
+	if (ret)
+		return ERR_PTR(ret);
+
+	return chan;
+}
+EXPORT_SYMBOL_GPL(devm_dma_request_chan);
+
 /**
  * dmaengine_get - register interest in dma_channels
  */
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index bb146c5ac3e4..6de7c05d6bd8 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -1524,6 +1524,7 @@ struct dma_chan *__dma_request_channel(const dma_cap_mask_t *mask,
 
 struct dma_chan *dma_request_chan(struct device *dev, const char *name);
 struct dma_chan *dma_request_chan_by_mask(const dma_cap_mask_t *mask);
+struct dma_chan *devm_dma_request_chan(struct device *dev, const char *name);
 
 void dma_release_channel(struct dma_chan *chan);
 int dma_get_slave_caps(struct dma_chan *chan, struct dma_slave_caps *caps);
@@ -1560,6 +1561,12 @@ static inline struct dma_chan *dma_request_chan_by_mask(
 {
 	return ERR_PTR(-ENODEV);
 }
+
+static inline struct dma_chan *devm_dma_request_chan(struct device *dev, const char *name)
+{
+	return ERR_PTR(-ENODEV);
+}
+
 static inline void dma_release_channel(struct dma_chan *chan)
 {
 }
-- 
cgit v1.2.3


From ce57bc9771411d6d27f2ca7b40396cbd7d684ba9 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 26 Jun 2025 18:23:07 +0300
Subject: regulator: core: Don't use "proxy" headers

Update header inclusions to follow IWYU (Include What You Use)
principle.

Note that including kernel.h is discouraged, as stated in the comment
at the top of that file.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://patch.msgid.link/20250626152307.322627-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regulator/coupler.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/regulator/coupler.h b/include/linux/regulator/coupler.h
index 73291f280a23..5e314a4294fb 100644
--- a/include/linux/regulator/coupler.h
+++ b/include/linux/regulator/coupler.h
@@ -8,7 +8,8 @@
 #ifndef __LINUX_REGULATOR_COUPLER_H_
 #define __LINUX_REGULATOR_COUPLER_H_
 
-#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/types.h>
 #include <linux/suspend.h>
 
 struct regulator_coupler;
-- 
cgit v1.2.3


From 040ae95a984f04afa5d300a97d11643a1577aa72 Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Wed, 25 Jun 2025 18:21:55 +0800
Subject: net: Remove unused function first_net_device_rcu()

This is unused since commit f04565ddf52e ("dev: use name hash for
dev_seq_ops")

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20250625102155.483570-1-yuehaibing@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index db5bfd4e7ec8..5847c20994d3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3316,13 +3316,6 @@ static inline struct net_device *first_net_device(struct net *net)
 		net_device_entry(net->dev_base_head.next);
 }
 
-static inline struct net_device *first_net_device_rcu(struct net *net)
-{
-	struct list_head *lh = rcu_dereference(list_next_rcu(&net->dev_base_head));
-
-	return lh == &net->dev_base_head ? NULL : net_device_entry(lh);
-}
-
 int netdev_boot_setup_check(struct net_device *dev);
 struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type,
 				   const char *hwaddr);
-- 
cgit v1.2.3


From 2d22b63f3a5aae2088708941d08cf0f01f430a58 Mon Sep 17 00:00:00 2001
From: Maxime Ripard <mripard@kernel.org>
Date: Thu, 26 Jun 2025 12:04:59 +0200
Subject: drm/mipi-dsi: Add dev_is_mipi_dsi function

This will be especially useful for generic panels (like panel-simple)
which can take different code paths depending on whether they are
MIPI-DSI devices or platform devices.

Reviewed-by: Javier Martinez Canillas <javierm@redhat.com>
Tested-by: Francesco Dolcini <francesco.dolcini@toradex.com> # Toradex Colibri iMX6
Link: https://lore.kernel.org/r/20250626-drm-panel-simple-fixes-v2-1-5afcaa608bdc@kernel.org
Signed-off-by: Maxime Ripard <mripard@kernel.org>
---
 drivers/gpu/drm/drm_mipi_dsi.c | 3 ++-
 include/drm/drm_mipi_dsi.h     | 3 +++
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_mipi_dsi.c b/drivers/gpu/drm/drm_mipi_dsi.c
index e5184a0c2465..21fd647f8ce1 100644
--- a/drivers/gpu/drm/drm_mipi_dsi.c
+++ b/drivers/gpu/drm/drm_mipi_dsi.c
@@ -91,12 +91,13 @@ static const struct dev_pm_ops mipi_dsi_device_pm_ops = {
 	.restore = pm_generic_restore,
 };
 
-static const struct bus_type mipi_dsi_bus_type = {
+const struct bus_type mipi_dsi_bus_type = {
 	.name = "mipi-dsi",
 	.match = mipi_dsi_device_match,
 	.uevent = mipi_dsi_uevent,
 	.pm = &mipi_dsi_device_pm_ops,
 };
+EXPORT_SYMBOL_GPL(mipi_dsi_bus_type);
 
 /**
  * of_find_mipi_dsi_device_by_node() - find the MIPI DSI device matching a
diff --git a/include/drm/drm_mipi_dsi.h b/include/drm/drm_mipi_dsi.h
index b37860f4a895..6d2c08e81101 100644
--- a/include/drm/drm_mipi_dsi.h
+++ b/include/drm/drm_mipi_dsi.h
@@ -223,6 +223,9 @@ struct mipi_dsi_multi_context {
 
 #define to_mipi_dsi_device(__dev)	container_of_const(__dev, struct mipi_dsi_device, dev)
 
+extern const struct bus_type mipi_dsi_bus_type;
+#define dev_is_mipi_dsi(dev)	((dev)->bus == &mipi_dsi_bus_type)
+
 /**
  * mipi_dsi_pixel_format_to_bpp - obtain the number of bits per pixel for any
  *                                given pixel format defined by the MIPI DSI
-- 
cgit v1.2.3


From 792ea7b6cafa46f8e6d6f40c557e614358a89520 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Mon, 9 Jun 2025 17:41:31 -0300
Subject: iommu: Remove ops->pgsize_bitmap

No driver uses it now, remove the core code.

Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Link: https://lore.kernel.org/r/7-v2-68a2e1ba507c+1fb-iommu_rm_ops_pgsize_jgg@nvidia.com
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 drivers/iommu/iommu.c | 7 -------
 include/linux/iommu.h | 2 --
 2 files changed, 9 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index a4b606c591da..060ebe330ee1 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2002,13 +2002,6 @@ static void iommu_domain_init(struct iommu_domain *domain, unsigned int type,
 	domain->owner = ops;
 	if (!domain->ops)
 		domain->ops = ops->default_domain_ops;
-
-	/*
-	 * If not already set, assume all sizes by default; the driver
-	 * may override this later
-	 */
-	if (!domain->pgsize_bitmap)
-		domain->pgsize_bitmap = ops->pgsize_bitmap;
 }
 
 static struct iommu_domain *
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 156732807994..7073be1d8841 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -604,7 +604,6 @@ iommu_copy_struct_from_full_user_array(void *kdst, size_t kdst_entry_size,
  *                It is required to call iommufd_viommu_alloc() helper for
  *                a bundled allocation of the core and the driver structures,
  *                using the given @ictx pointer.
- * @pgsize_bitmap: bitmap of all possible supported page sizes
  * @owner: Driver module providing these ops
  * @identity_domain: An always available, always attachable identity
  *                   translation.
@@ -659,7 +658,6 @@ struct iommu_ops {
 		struct iommufd_ctx *ictx, unsigned int viommu_type);
 
 	const struct iommu_domain_ops *default_domain_ops;
-	unsigned long pgsize_bitmap;
 	struct module *owner;
 	struct iommu_domain *identity_domain;
 	struct iommu_domain *blocked_domain;
-- 
cgit v1.2.3


From 3c2968fcd72c4b130894d1a2b835e18474c559e2 Mon Sep 17 00:00:00 2001
From: Junhui Liu <junhui.liu@pigmoral.tech>
Date: Fri, 13 Jun 2025 16:49:23 +0800
Subject: dt-bindings: reset: add support for canaan,k230-rst

Introduces a reset controller driver for the Kendryte K230 SoC,
responsible for managing the reset functionality of the CPUs and
various sub-modules.

Reviewed-by: Conor Dooley <conor.dooley@microchip.com>
Reviewed-by: Chen Wang <unicorn_wang@outlook.com>
Signed-off-by: Junhui Liu <junhui.liu@pigmoral.tech>
Link: https://lore.kernel.org/r/20250613-k230-reset-v4-1-e5266d2be440@pigmoral.tech
Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
---
 .../devicetree/bindings/reset/canaan,k230-rst.yaml | 39 ++++++++++
 include/dt-bindings/reset/canaan,k230-rst.h        | 90 ++++++++++++++++++++++
 2 files changed, 129 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/reset/canaan,k230-rst.yaml
 create mode 100644 include/dt-bindings/reset/canaan,k230-rst.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/reset/canaan,k230-rst.yaml b/Documentation/devicetree/bindings/reset/canaan,k230-rst.yaml
new file mode 100644
index 000000000000..d352d0e12d81
--- /dev/null
+++ b/Documentation/devicetree/bindings/reset/canaan,k230-rst.yaml
@@ -0,0 +1,39 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/reset/canaan,k230-rst.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Canaan Kendryte K230 Reset Controller
+
+maintainers:
+  - Junhui Liu <junhui.liu@pigmoral.tech>
+
+description:
+  The Canaan Kendryte K230 reset controller is part of the SoC's system
+  controller and controls the reset registers for CPUs and various peripherals.
+
+properties:
+  compatible:
+    const: canaan,k230-rst
+
+  reg:
+    maxItems: 1
+
+  '#reset-cells':
+    const: 1
+
+required:
+  - compatible
+  - reg
+  - '#reset-cells'
+
+additionalProperties: false
+
+examples:
+  - |
+    reset-controller@91101000 {
+      compatible = "canaan,k230-rst";
+      reg = <0x91101000 0x1000>;
+      #reset-cells = <1>;
+    };
diff --git a/include/dt-bindings/reset/canaan,k230-rst.h b/include/dt-bindings/reset/canaan,k230-rst.h
new file mode 100644
index 000000000000..e4f6612607fe
--- /dev/null
+++ b/include/dt-bindings/reset/canaan,k230-rst.h
@@ -0,0 +1,90 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (C) 2023-2024 Canaan Bright Sight Co., Ltd
+ * Copyright (C) 2024-2025 Junhui Liu <junhui.liu@pigmoral.tech>
+ */
+#ifndef _DT_BINDINGS_CANAAN_K230_RST_H_
+#define _DT_BINDINGS_CANAAN_K230_RST_H_
+
+#define RST_CPU0		0
+#define RST_CPU1		1
+#define RST_CPU0_FLUSH		2
+#define RST_CPU1_FLUSH		3
+#define RST_AI			4
+#define RST_VPU			5
+#define RST_HISYS		6
+#define RST_HISYS_AHB		7
+#define RST_SDIO0		8
+#define RST_SDIO1		9
+#define RST_SDIO_AXI		10
+#define RST_USB0		11
+#define RST_USB1		12
+#define RST_USB0_AHB		13
+#define RST_USB1_AHB		14
+#define RST_SPI0		15
+#define RST_SPI1		16
+#define RST_SPI2		17
+#define RST_SEC			18
+#define RST_PDMA		19
+#define RST_SDMA		20
+#define RST_DECOMPRESS		21
+#define RST_SRAM		22
+#define RST_SHRM_AXIM		23
+#define RST_SHRM_AXIS		24
+#define RST_NONAI2D		25
+#define RST_MCTL		26
+#define RST_ISP			27
+#define RST_ISP_DW		28
+#define RST_DPU			29
+#define RST_DISP		30
+#define RST_GPU			31
+#define RST_AUDIO		32
+#define RST_TIMER0		33
+#define RST_TIMER1		34
+#define RST_TIMER2		35
+#define RST_TIMER3		36
+#define RST_TIMER4		37
+#define RST_TIMER5		38
+#define RST_TIMER_APB		39
+#define RST_HDI			40
+#define RST_WDT0		41
+#define RST_WDT1		42
+#define RST_WDT0_APB		43
+#define RST_WDT1_APB		44
+#define RST_TS_APB		45
+#define RST_MAILBOX		46
+#define RST_STC			47
+#define RST_PMU			48
+#define RST_LOSYS_APB		49
+#define RST_UART0		50
+#define RST_UART1		51
+#define RST_UART2		52
+#define RST_UART3		53
+#define RST_UART4		54
+#define RST_I2C0		55
+#define RST_I2C1		56
+#define RST_I2C2		57
+#define RST_I2C3		58
+#define RST_I2C4		59
+#define RST_JAMLINK0_APB	60
+#define RST_JAMLINK1_APB	61
+#define RST_JAMLINK2_APB	62
+#define RST_JAMLINK3_APB	63
+#define RST_CODEC_APB		64
+#define RST_GPIO_DB		65
+#define RST_GPIO_APB		66
+#define RST_ADC			67
+#define RST_ADC_APB		68
+#define RST_PWM_APB		69
+#define RST_SHRM_APB		70
+#define RST_CSI0		71
+#define RST_CSI1		72
+#define RST_CSI2		73
+#define RST_CSI_DPHY		74
+#define RST_ISP_AHB		75
+#define RST_M0			76
+#define RST_M1			77
+#define RST_M2			78
+#define RST_SPI2AXI		79
+
+#endif
-- 
cgit v1.2.3


From 05bc6e6290f91d2d40086ab4ef52da21c14ec4b6 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 25 Jun 2025 20:38:31 +0200
Subject: timekeeping: Provide time getters for auxiliary clocks

Provide interfaces similar to the ktime_get*() family which provide access
to the auxiliary clocks.

These interfaces have a boolean return value, which indicates whether the
accessed clock is valid or not.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: John Stultz <jstultz@google.com>
Link: https://lore.kernel.org/all/20250625183757.868342628@linutronix.de
---
 include/linux/posix-timers.h |  5 ++++
 include/linux/timekeeping.h  | 11 ++++++++
 kernel/time/timekeeping.c    | 65 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 81 insertions(+)

(limited to 'include')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index dd48c64b605e..4d3dbcef379e 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -37,6 +37,11 @@ static inline int clockid_to_fd(const clockid_t clk)
 	return ~(clk >> 3);
 }
 
+static inline bool clockid_aux_valid(clockid_t id)
+{
+	return IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS) && id >= CLOCK_AUX && id <= CLOCK_AUX_LAST;
+}
+
 #ifdef CONFIG_POSIX_TIMERS
 
 #include <linux/signal_types.h>
diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index 542773650200..de9a3b7d7d0d 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -263,6 +263,17 @@ extern bool timekeeping_rtc_skipresume(void);
 
 extern void timekeeping_inject_sleeptime64(const struct timespec64 *delta);
 
+/*
+ * Auxiliary clock interfaces
+ */
+#ifdef CONFIG_POSIX_AUX_CLOCKS
+extern bool ktime_get_aux(clockid_t id, ktime_t *kt);
+extern bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *kt);
+#else
+static inline bool ktime_get_aux(clockid_t id, ktime_t *kt) { return false; }
+static inline bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *kt) { return false; }
+#endif
+
 /**
  * struct system_time_snapshot - simultaneous raw/real time capture with
  *				 counter value
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index ee9757018341..c7d2913e68c3 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -2664,6 +2664,18 @@ EXPORT_SYMBOL(hardpps);
  */
 static unsigned long aux_timekeepers;
 
+static inline unsigned int clockid_to_tkid(unsigned int id)
+{
+	return TIMEKEEPER_AUX_FIRST + id - CLOCK_AUX;
+}
+
+static inline struct tk_data *aux_get_tk_data(clockid_t id)
+{
+	if (!clockid_aux_valid(id))
+		return NULL;
+	return &timekeeper_data[clockid_to_tkid(id)];
+}
+
 /* Invoked from timekeeping after a clocksource change */
 static void tk_aux_update_clocksource(void)
 {
@@ -2684,6 +2696,59 @@ static void tk_aux_update_clocksource(void)
 	}
 }
 
+/**
+ * ktime_get_aux - Get time for an AUX clock
+ * @id:	ID of the clock to read (CLOCK_AUX...)
+ * @kt:	Pointer to ktime_t to store the time stamp
+ *
+ * Returns: True if the timestamp is valid, false otherwise
+ */
+bool ktime_get_aux(clockid_t id, ktime_t *kt)
+{
+	struct tk_data *aux_tkd = aux_get_tk_data(id);
+	struct timekeeper *aux_tk;
+	unsigned int seq;
+	ktime_t base;
+	u64 nsecs;
+
+	WARN_ON(timekeeping_suspended);
+
+	if (!aux_tkd)
+		return false;
+
+	aux_tk = &aux_tkd->timekeeper;
+	do {
+		seq = read_seqcount_begin(&aux_tkd->seq);
+		if (!aux_tk->clock_valid)
+			return false;
+
+		base = ktime_add(aux_tk->tkr_mono.base, aux_tk->offs_aux);
+		nsecs = timekeeping_get_ns(&aux_tk->tkr_mono);
+	} while (read_seqcount_retry(&aux_tkd->seq, seq));
+
+	*kt = ktime_add_ns(base, nsecs);
+	return true;
+}
+EXPORT_SYMBOL_GPL(ktime_get_aux);
+
+/**
+ * ktime_get_aux_ts64 - Get time for an AUX clock
+ * @id:	ID of the clock to read (CLOCK_AUX...)
+ * @ts:	Pointer to timespec64 to store the time stamp
+ *
+ * Returns: True if the timestamp is valid, false otherwise
+ */
+bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *ts)
+{
+	ktime_t now;
+
+	if (!ktime_get_aux(id, &now))
+		return false;
+	*ts = ktime_to_timespec64(now);
+	return true;
+}
+EXPORT_SYMBOL_GPL(ktime_get_aux_ts64);
+
 static __init void tk_aux_setup(void)
 {
 	for (int i = TIMEKEEPER_AUX_FIRST; i <= TIMEKEEPER_AUX_LAST; i++)
-- 
cgit v1.2.3


From c5b60592886f97b01503c1bb553f88d6a7df42ea Mon Sep 17 00:00:00 2001
From: Gabor Juhos <j4g8y7@gmail.com>
Date: Fri, 27 Jun 2025 09:58:54 +0200
Subject: interconnect: avoid memory allocation when 'icc_bw_lock' is held

The 'icc_bw_lock' mutex is introduced in commit af42269c3523
("interconnect: Fix locking for runpm vs reclaim") in order to decouple
serialization of bw aggregation from codepaths that require memory
allocation.

However commit d30f83d278a9 ("interconnect: core: Add dynamic id
allocation support") added a devm_kasprintf() call into a path protected
by the 'icc_bw_lock' which causes the following lockdep warning on
machines like the Lenovo ThinkPad X13s:

    ======================================================
    WARNING: possible circular locking dependency detected
    6.16.0-rc3 #15 Not tainted
    ------------------------------------------------------
    (udev-worker)/342 is trying to acquire lock:
    ffffb973f7ec4638 (fs_reclaim){+.+.}-{0:0}, at: __kmalloc_node_track_caller_noprof+0xa0/0x3e0

    but task is already holding lock:
    ffffb973f7f7f0e8 (icc_bw_lock){+.+.}-{4:4}, at: icc_node_add+0x44/0x154

    which lock already depends on the new lock.

    the existing dependency chain (in reverse order) is:

    -> #1 (icc_bw_lock){+.+.}-{4:4}:
           icc_init+0x48/0x108
           do_one_initcall+0x64/0x30c
           kernel_init_freeable+0x27c/0x500
           kernel_init+0x20/0x1d8
           ret_from_fork+0x10/0x20

    -> #0 (fs_reclaim){+.+.}-{0:0}:
           __lock_acquire+0x136c/0x2114
           lock_acquire+0x1c8/0x354
           fs_reclaim_acquire+0x74/0xa8
           __kmalloc_node_track_caller_noprof+0xa0/0x3e0
           devm_kmalloc+0x54/0x124
           devm_kvasprintf+0x74/0xd4
           devm_kasprintf+0x58/0x80
           icc_node_add+0xb4/0x154
           qcom_osm_l3_probe+0x20c/0x314 [icc_osm_l3]
           platform_probe+0x68/0xd8
           really_probe+0xc0/0x38c
           __driver_probe_device+0x7c/0x160
           driver_probe_device+0x40/0x110
           __driver_attach+0xfc/0x208
           bus_for_each_dev+0x74/0xd0
           driver_attach+0x24/0x30
           bus_add_driver+0x110/0x234
           driver_register+0x60/0x128
           __platform_driver_register+0x24/0x30
           osm_l3_driver_init+0x20/0x1000 [icc_osm_l3]
           do_one_initcall+0x64/0x30c
           do_init_module+0x58/0x23c
           load_module+0x1df8/0x1f70
           init_module_from_file+0x88/0xc4
           idempotent_init_module+0x188/0x280
           __arm64_sys_finit_module+0x6c/0xd8
           invoke_syscall+0x48/0x110
           el0_svc_common.constprop.0+0xc0/0xe0
           do_el0_svc+0x1c/0x28
           el0_svc+0x4c/0x158
           el0t_64_sync_handler+0xc8/0xcc
           el0t_64_sync+0x198/0x19c

    other info that might help us debug this:

     Possible unsafe locking scenario:

           CPU0                    CPU1
           ----                    ----
      lock(icc_bw_lock);
                                   lock(fs_reclaim);
                                   lock(icc_bw_lock);
      lock(fs_reclaim);

     *** DEADLOCK ***

The icc_node_add() function is not designed to fail, and as such it
should not do any memory allocation. In order to avoid this, add a new
helper function for the name generation to be called by drivers which
are using the new dynamic id feature.

Fixes: d30f83d278a9 ("interconnect: core: Add dynamic id allocation support")
Signed-off-by: Gabor Juhos <j4g8y7@gmail.com>
Link: https://lore.kernel.org/r/20250625-icc-bw-lockdep-v3-1-2b8f8b8987c4@gmail.com
Co-developed-by: Johan Hovold <johan+linaro@kernel.org>
Signed-off-by: Johan Hovold <johan+linaro@kernel.org>
Link: https://lore.kernel.org/r/20250627075854.26943-1-johan+linaro@kernel.org
Signed-off-by: Georgi Djakov <djakov@kernel.org>
---
 drivers/interconnect/core.c           | 29 +++++++++++++++++++++++++----
 drivers/interconnect/qcom/icc-rpmh.c  |  7 ++++++-
 drivers/interconnect/qcom/osm-l3.c    |  7 ++++++-
 include/linux/interconnect-provider.h |  7 +++++++
 4 files changed, 44 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/interconnect/core.c b/drivers/interconnect/core.c
index 3a41b2717edd..3ebf37ddfc18 100644
--- a/drivers/interconnect/core.c
+++ b/drivers/interconnect/core.c
@@ -909,10 +909,35 @@ void icc_node_destroy(int id)
 		return;
 
 	kfree(node->links);
+	if (node->id >= ICC_DYN_ID_START)
+		kfree(node->name);
 	kfree(node);
 }
 EXPORT_SYMBOL_GPL(icc_node_destroy);
 
+/**
+ * icc_node_set_name() - set node name
+ * @node: node
+ * @provider: node provider
+ * @name: node name
+ *
+ * Return: 0 on success, or -ENOMEM on allocation failure
+ */
+int icc_node_set_name(struct icc_node *node, const struct icc_provider *provider, const char *name)
+{
+	if (node->id >= ICC_DYN_ID_START) {
+		node->name = kasprintf(GFP_KERNEL, "%s@%s", name,
+				       dev_name(provider->dev));
+		if (!node->name)
+			return -ENOMEM;
+	} else {
+		node->name = name;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(icc_node_set_name);
+
 /**
  * icc_link_nodes() - create link between two nodes
  * @src_node: source node
@@ -1041,10 +1066,6 @@ void icc_node_add(struct icc_node *node, struct icc_provider *provider)
 	node->avg_bw = node->init_avg;
 	node->peak_bw = node->init_peak;
 
-	if (node->id >= ICC_DYN_ID_START)
-		node->name = devm_kasprintf(provider->dev, GFP_KERNEL, "%s@%s",
-					    node->name, dev_name(provider->dev));
-
 	if (node->avg_bw || node->peak_bw) {
 		if (provider->pre_aggregate)
 			provider->pre_aggregate(node);
diff --git a/drivers/interconnect/qcom/icc-rpmh.c b/drivers/interconnect/qcom/icc-rpmh.c
index 41bfc6e7ee1d..001404e91041 100644
--- a/drivers/interconnect/qcom/icc-rpmh.c
+++ b/drivers/interconnect/qcom/icc-rpmh.c
@@ -293,7 +293,12 @@ int qcom_icc_rpmh_probe(struct platform_device *pdev)
 			goto err_remove_nodes;
 		}
 
-		node->name = qn->name;
+		ret = icc_node_set_name(node, provider, qn->name);
+		if (ret) {
+			icc_node_destroy(node->id);
+			goto err_remove_nodes;
+		}
+
 		node->data = qn;
 		icc_node_add(node, provider);
 
diff --git a/drivers/interconnect/qcom/osm-l3.c b/drivers/interconnect/qcom/osm-l3.c
index baecbf2533f7..b33f00da1880 100644
--- a/drivers/interconnect/qcom/osm-l3.c
+++ b/drivers/interconnect/qcom/osm-l3.c
@@ -236,7 +236,12 @@ static int qcom_osm_l3_probe(struct platform_device *pdev)
 			goto err;
 		}
 
-		node->name = qnodes[i]->name;
+		ret = icc_node_set_name(node, provider, qnodes[i]->name);
+		if (ret) {
+			icc_node_destroy(node->id);
+			goto err;
+		}
+
 		/* Cast away const and add it back in qcom_osm_l3_set() */
 		node->data = (void *)qnodes[i];
 		icc_node_add(node, provider);
diff --git a/include/linux/interconnect-provider.h b/include/linux/interconnect-provider.h
index 55cfebc658e6..8a2f652a05ff 100644
--- a/include/linux/interconnect-provider.h
+++ b/include/linux/interconnect-provider.h
@@ -119,6 +119,7 @@ int icc_std_aggregate(struct icc_node *node, u32 tag, u32 avg_bw,
 struct icc_node *icc_node_create_dyn(void);
 struct icc_node *icc_node_create(int id);
 void icc_node_destroy(int id);
+int icc_node_set_name(struct icc_node *node, const struct icc_provider *provider, const char *name);
 int icc_link_nodes(struct icc_node *src_node, struct icc_node **dst_node);
 int icc_link_create(struct icc_node *node, const int dst_id);
 void icc_node_add(struct icc_node *node, struct icc_provider *provider);
@@ -152,6 +153,12 @@ static inline void icc_node_destroy(int id)
 {
 }
 
+static inline int icc_node_set_name(struct icc_node *node, const struct icc_provider *provider,
+				    const char *name)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline int icc_link_nodes(struct icc_node *src_node, struct icc_node **dst_node)
 {
 	return -EOPNOTSUPP;
-- 
cgit v1.2.3


From 8d68411a128705f86da7f037e1c33d81786fee96 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 26 Jun 2025 15:30:16 +0000
Subject: tcp: remove rtx_syn_ack field

Now inet_rtx_syn_ack() is only used by TCP, it can directly
call tcp_rtx_synack() instead of using an indirect call
to req->rsk_ops->rtx_syn_ack().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20250626153017.2156274-2-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/request_sock.h      | 2 --
 net/ipv4/inet_connection_sock.c | 2 +-
 net/ipv4/tcp_ipv4.c             | 1 -
 net/ipv6/tcp_ipv6.c             | 1 -
 4 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index b07b1cd14e9f..bad7d16a5515 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -30,8 +30,6 @@ struct request_sock_ops {
 	unsigned int	obj_size;
 	struct kmem_cache	*slab;
 	char		*slab_name;
-	int		(*rtx_syn_ack)(const struct sock *sk,
-				       struct request_sock *req);
 	void		(*send_ack)(const struct sock *sk, struct sk_buff *skb,
 				    struct request_sock *req);
 	void		(*send_reset)(const struct sock *sk,
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index f4157d26ec9e..d61eef748851 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -886,7 +886,7 @@ static void syn_ack_recalc(struct request_sock *req,
 
 int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req)
 {
-	int err = req->rsk_ops->rtx_syn_ack(parent, req);
+	int err = tcp_rtx_synack(parent, req);
 
 	if (!err)
 		req->num_retrans++;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 429fb34b075e..56223338bc0f 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1703,7 +1703,6 @@ static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
 	.family		=	PF_INET,
 	.obj_size	=	sizeof(struct tcp_request_sock),
-	.rtx_syn_ack	=	tcp_rtx_synack,
 	.send_ack	=	tcp_v4_reqsk_send_ack,
 	.destructor	=	tcp_v4_reqsk_destructor,
 	.send_reset	=	tcp_v4_send_reset,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index f0ce62549d90..9fb614e17bde 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -835,7 +835,6 @@ static struct dst_entry *tcp_v6_route_req(const struct sock *sk,
 struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
 	.family		=	AF_INET6,
 	.obj_size	=	sizeof(struct tcp6_request_sock),
-	.rtx_syn_ack	=	tcp_rtx_synack,
 	.send_ack	=	tcp_v6_reqsk_send_ack,
 	.destructor	=	tcp_v6_reqsk_destructor,
 	.send_reset	=	tcp_v6_send_reset,
-- 
cgit v1.2.3


From cf56a98202970adf298df5caaa225ed68350e9ab Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 26 Jun 2025 15:30:17 +0000
Subject: tcp: remove inet_rtx_syn_ack()

inet_rtx_syn_ack() is a simple wrapper around tcp_rtx_synack(),
if we move req->num_retrans update.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20250626153017.2156274-3-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/request_sock.h      |  2 --
 net/ipv4/inet_connection_sock.c | 11 +----------
 net/ipv4/tcp_minisocks.c        |  2 +-
 net/ipv4/tcp_output.c           |  1 +
 net/ipv4/tcp_timer.c            |  2 +-
 5 files changed, 4 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index bad7d16a5515..6a5ec1418e85 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -39,8 +39,6 @@ struct request_sock_ops {
 	void		(*syn_ack_timeout)(const struct request_sock *req);
 };
 
-int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req);
-
 struct saved_syn {
 	u32 mac_hdrlen;
 	u32 network_hdrlen;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index d61eef748851..1e2df51427fe 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -884,15 +884,6 @@ static void syn_ack_recalc(struct request_sock *req,
 		  req->num_timeout >= rskq_defer_accept - 1;
 }
 
-int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req)
-{
-	int err = tcp_rtx_synack(parent, req);
-
-	if (!err)
-		req->num_retrans++;
-	return err;
-}
-
 static struct request_sock *
 reqsk_alloc_noprof(const struct request_sock_ops *ops, struct sock *sk_listener,
 		   bool attach_listener)
@@ -1132,7 +1123,7 @@ static void reqsk_timer_handler(struct timer_list *t)
 	req->rsk_ops->syn_ack_timeout(req);
 	if (!expire &&
 	    (!resend ||
-	     !inet_rtx_syn_ack(sk_listener, req) ||
+	     !tcp_rtx_synack(sk_listener, req) ||
 	     inet_rsk(req)->acked)) {
 		if (req->num_timeout++ == 0)
 			atomic_dec(&queue->young);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 43d7852ce07e..2994c9222c9c 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -726,7 +726,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 					  LINUX_MIB_TCPACKSKIPPEDSYNRECV,
 					  &tcp_rsk(req)->last_oow_ack_time) &&
 
-		    !inet_rtx_syn_ack(sk, req)) {
+		    !tcp_rtx_synack(sk, req)) {
 			unsigned long expires = jiffies;
 
 			expires += reqsk_timeout(req, TCP_RTO_MAX);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 28f840724fe8..b616776e3354 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -4425,6 +4425,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
 			tcp_sk_rw(sk)->total_retrans++;
 		}
 		trace_tcp_retransmit_synack(sk, req);
+		req->num_retrans++;
 	}
 	return res;
 }
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index bb37e24b97a7..a207877270fb 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -478,7 +478,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req)
 	 * regular retransmit because if the child socket has been accepted
 	 * it's not good to give up too easily.
 	 */
-	inet_rtx_syn_ack(sk, req);
+	tcp_rtx_synack(sk, req);
 	req->num_timeout++;
 	tcp_update_rto_stats(sk);
 	if (!tp->retrans_stamp)
-- 
cgit v1.2.3


From 7f15ee35972dd3dee37704bfd0f136290f6d63d9 Mon Sep 17 00:00:00 2001
From: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Date: Thu, 26 Jun 2025 15:52:17 +0200
Subject: dpll: add reference-sync netlink attribute

Add new netlink attribute to allow user space configuration of reference
sync pin pairs, where both pins are used to provide one clock signal
consisting of both: base frequency and sync signal.

Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Reviewed-by: Milena Olech <milena.olech@intel.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Link: https://patch.msgid.link/20250626135219.1769350-2-arkadiusz.kubalewski@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/driver-api/dpll.rst     | 25 +++++++++++++++++++++++++
 Documentation/netlink/specs/dpll.yaml | 19 +++++++++++++++++++
 drivers/dpll/dpll_nl.c                | 10 ++++++++--
 drivers/dpll/dpll_nl.h                |  1 +
 include/uapi/linux/dpll.h             |  1 +
 5 files changed, 54 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/Documentation/driver-api/dpll.rst b/Documentation/driver-api/dpll.rst
index 195e1e5d9a58..eca72d9b9ed8 100644
--- a/Documentation/driver-api/dpll.rst
+++ b/Documentation/driver-api/dpll.rst
@@ -253,6 +253,31 @@ the pin.
   ``DPLL_A_PIN_ESYNC_PULSE``                pulse type of Embedded SYNC
   ========================================= =================================
 
+Reference SYNC
+==============
+
+The device may support the Reference SYNC feature, which allows the combination
+of two inputs into an input pair. In this configuration, clock signals
+from both inputs are used to synchronize the DPLL device. The higher frequency
+signal is utilized for the loop bandwidth of the DPLL, while the lower frequency
+signal is used to syntonize the output signal of the DPLL device. This feature
+enables the provision of a high-quality loop bandwidth signal from an external
+source.
+
+A capable input provides a list of inputs that it can be bound with to create
+a Reference SYNC pair. To control this feature, the user must request a desired
+state for a target pin: use ``DPLL_PIN_STATE_CONNECTED`` to enable or
+``DPLL_PIN_STATE_DISCONNECTED`` to disable the feature. An input pin can be
+bound to only one other pin at any given time.
+
+  ============================== ==========================================
+  ``DPLL_A_PIN_REFERENCE_SYNC``  nested attribute for providing info or
+                                 requesting configuration of the Reference
+                                 SYNC feature
+    ``DPLL_A_PIN_ID``            target pin id for Reference SYNC feature
+    ``DPLL_A_PIN_STATE``         state of Reference SYNC connection
+  ============================== ==========================================
+
 Configuration commands group
 ============================
 
diff --git a/Documentation/netlink/specs/dpll.yaml b/Documentation/netlink/specs/dpll.yaml
index c13440efab24..5decee61a2c4 100644
--- a/Documentation/netlink/specs/dpll.yaml
+++ b/Documentation/netlink/specs/dpll.yaml
@@ -428,6 +428,15 @@ attribute-sets:
         doc: |
           A ratio of high to low state of a SYNC signal pulse embedded
           into base clock frequency. Value is in percents.
+      -
+        name: reference-sync
+        type: nest
+        multi-attr: true
+        nested-attributes: reference-sync
+        doc: |
+          Capable pin provides list of pins that can be bound to create a
+          reference-sync pin pair.
+
   -
     name: pin-parent-device
     subset-of: pin
@@ -458,6 +467,14 @@ attribute-sets:
         name: frequency-min
       -
         name: frequency-max
+  -
+    name: reference-sync
+    subset-of: pin
+    attributes:
+      -
+        name: id
+      -
+        name: state
 
 operations:
   enum-name: dpll_cmd
@@ -598,6 +615,7 @@ operations:
             - esync-frequency
             - esync-frequency-supported
             - esync-pulse
+            - reference-sync
 
       dump:
         request:
@@ -625,6 +643,7 @@ operations:
             - parent-pin
             - phase-adjust
             - esync-frequency
+            - reference-sync
     -
       name: pin-create-ntf
       doc: Notification about pin appearing
diff --git a/drivers/dpll/dpll_nl.c b/drivers/dpll/dpll_nl.c
index 8de90310c3be..9f2efaf25268 100644
--- a/drivers/dpll/dpll_nl.c
+++ b/drivers/dpll/dpll_nl.c
@@ -24,6 +24,11 @@ const struct nla_policy dpll_pin_parent_pin_nl_policy[DPLL_A_PIN_STATE + 1] = {
 	[DPLL_A_PIN_STATE] = NLA_POLICY_RANGE(NLA_U32, 1, 3),
 };
 
+const struct nla_policy dpll_reference_sync_nl_policy[DPLL_A_PIN_STATE + 1] = {
+	[DPLL_A_PIN_ID] = { .type = NLA_U32, },
+	[DPLL_A_PIN_STATE] = NLA_POLICY_RANGE(NLA_U32, 1, 3),
+};
+
 /* DPLL_CMD_DEVICE_ID_GET - do */
 static const struct nla_policy dpll_device_id_get_nl_policy[DPLL_A_TYPE + 1] = {
 	[DPLL_A_MODULE_NAME] = { .type = NLA_NUL_STRING, },
@@ -63,7 +68,7 @@ static const struct nla_policy dpll_pin_get_dump_nl_policy[DPLL_A_PIN_ID + 1] =
 };
 
 /* DPLL_CMD_PIN_SET - do */
-static const struct nla_policy dpll_pin_set_nl_policy[DPLL_A_PIN_ESYNC_FREQUENCY + 1] = {
+static const struct nla_policy dpll_pin_set_nl_policy[DPLL_A_PIN_REFERENCE_SYNC + 1] = {
 	[DPLL_A_PIN_ID] = { .type = NLA_U32, },
 	[DPLL_A_PIN_FREQUENCY] = { .type = NLA_U64, },
 	[DPLL_A_PIN_DIRECTION] = NLA_POLICY_RANGE(NLA_U32, 1, 2),
@@ -73,6 +78,7 @@ static const struct nla_policy dpll_pin_set_nl_policy[DPLL_A_PIN_ESYNC_FREQUENCY
 	[DPLL_A_PIN_PARENT_PIN] = NLA_POLICY_NESTED(dpll_pin_parent_pin_nl_policy),
 	[DPLL_A_PIN_PHASE_ADJUST] = { .type = NLA_S32, },
 	[DPLL_A_PIN_ESYNC_FREQUENCY] = { .type = NLA_U64, },
+	[DPLL_A_PIN_REFERENCE_SYNC] = NLA_POLICY_NESTED(dpll_reference_sync_nl_policy),
 };
 
 /* Ops table for dpll */
@@ -140,7 +146,7 @@ static const struct genl_split_ops dpll_nl_ops[] = {
 		.doit		= dpll_nl_pin_set_doit,
 		.post_doit	= dpll_pin_post_doit,
 		.policy		= dpll_pin_set_nl_policy,
-		.maxattr	= DPLL_A_PIN_ESYNC_FREQUENCY,
+		.maxattr	= DPLL_A_PIN_REFERENCE_SYNC,
 		.flags		= GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
 	},
 };
diff --git a/drivers/dpll/dpll_nl.h b/drivers/dpll/dpll_nl.h
index f491262bee4f..3da10cfe9a6e 100644
--- a/drivers/dpll/dpll_nl.h
+++ b/drivers/dpll/dpll_nl.h
@@ -14,6 +14,7 @@
 /* Common nested types */
 extern const struct nla_policy dpll_pin_parent_device_nl_policy[DPLL_A_PIN_PHASE_OFFSET + 1];
 extern const struct nla_policy dpll_pin_parent_pin_nl_policy[DPLL_A_PIN_STATE + 1];
+extern const struct nla_policy dpll_reference_sync_nl_policy[DPLL_A_PIN_STATE + 1];
 
 int dpll_lock_doit(const struct genl_split_ops *ops, struct sk_buff *skb,
 		   struct genl_info *info);
diff --git a/include/uapi/linux/dpll.h b/include/uapi/linux/dpll.h
index 349e1b3ca1ae..37b438ce8efc 100644
--- a/include/uapi/linux/dpll.h
+++ b/include/uapi/linux/dpll.h
@@ -249,6 +249,7 @@ enum dpll_a_pin {
 	DPLL_A_PIN_ESYNC_FREQUENCY,
 	DPLL_A_PIN_ESYNC_FREQUENCY_SUPPORTED,
 	DPLL_A_PIN_ESYNC_PULSE,
+	DPLL_A_PIN_REFERENCE_SYNC,
 
 	__DPLL_A_PIN_MAX,
 	DPLL_A_PIN_MAX = (__DPLL_A_PIN_MAX - 1)
-- 
cgit v1.2.3


From 58256a26bfb37a94738dd65618b1f31f460f8d91 Mon Sep 17 00:00:00 2001
From: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Date: Thu, 26 Jun 2025 15:52:18 +0200
Subject: dpll: add reference sync get/set

Define function for reference sync pin registration and callback ops to
set/get current feature state.

Implement netlink handler to fill netlink messages with reference sync
pin configuration of capable pins (pin-get).

Implement netlink handler to call proper ops and configure reference
sync pin state (pin-set).

Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Reviewed-by: Milena Olech <milena.olech@intel.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Link: https://patch.msgid.link/20250626135219.1769350-3-arkadiusz.kubalewski@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/dpll/dpll_core.c    |  45 +++++++++++
 drivers/dpll/dpll_core.h    |   2 +
 drivers/dpll/dpll_netlink.c | 190 +++++++++++++++++++++++++++++++++++++++-----
 drivers/dpll/dpll_netlink.h |   2 +
 include/linux/dpll.h        |  13 +++
 5 files changed, 233 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/drivers/dpll/dpll_core.c b/drivers/dpll/dpll_core.c
index 20bdc52f63a5..a461095efd8a 100644
--- a/drivers/dpll/dpll_core.c
+++ b/drivers/dpll/dpll_core.c
@@ -506,6 +506,7 @@ dpll_pin_alloc(u64 clock_id, u32 pin_idx, struct module *module,
 	refcount_set(&pin->refcount, 1);
 	xa_init_flags(&pin->dpll_refs, XA_FLAGS_ALLOC);
 	xa_init_flags(&pin->parent_refs, XA_FLAGS_ALLOC);
+	xa_init_flags(&pin->ref_sync_pins, XA_FLAGS_ALLOC);
 	ret = xa_alloc_cyclic(&dpll_pin_xa, &pin->id, pin, xa_limit_32b,
 			      &dpll_pin_xa_id, GFP_KERNEL);
 	if (ret < 0)
@@ -514,6 +515,7 @@ dpll_pin_alloc(u64 clock_id, u32 pin_idx, struct module *module,
 err_xa_alloc:
 	xa_destroy(&pin->dpll_refs);
 	xa_destroy(&pin->parent_refs);
+	xa_destroy(&pin->ref_sync_pins);
 	dpll_pin_prop_free(&pin->prop);
 err_pin_prop:
 	kfree(pin);
@@ -595,6 +597,7 @@ void dpll_pin_put(struct dpll_pin *pin)
 		xa_erase(&dpll_pin_xa, pin->id);
 		xa_destroy(&pin->dpll_refs);
 		xa_destroy(&pin->parent_refs);
+		xa_destroy(&pin->ref_sync_pins);
 		dpll_pin_prop_free(&pin->prop);
 		kfree_rcu(pin, rcu);
 	}
@@ -659,11 +662,26 @@ dpll_pin_register(struct dpll_device *dpll, struct dpll_pin *pin,
 }
 EXPORT_SYMBOL_GPL(dpll_pin_register);
 
+static void dpll_pin_ref_sync_pair_del(u32 ref_sync_pin_id)
+{
+	struct dpll_pin *pin, *ref_sync_pin;
+	unsigned long i;
+
+	xa_for_each(&dpll_pin_xa, i, pin) {
+		ref_sync_pin = xa_load(&pin->ref_sync_pins, ref_sync_pin_id);
+		if (ref_sync_pin) {
+			xa_erase(&pin->ref_sync_pins, ref_sync_pin_id);
+			__dpll_pin_change_ntf(pin);
+		}
+	}
+}
+
 static void
 __dpll_pin_unregister(struct dpll_device *dpll, struct dpll_pin *pin,
 		      const struct dpll_pin_ops *ops, void *priv, void *cookie)
 {
 	ASSERT_DPLL_PIN_REGISTERED(pin);
+	dpll_pin_ref_sync_pair_del(pin->id);
 	dpll_xa_ref_pin_del(&dpll->pin_refs, pin, ops, priv, cookie);
 	dpll_xa_ref_dpll_del(&pin->dpll_refs, dpll, ops, priv, cookie);
 	if (xa_empty(&pin->dpll_refs))
@@ -783,6 +801,33 @@ void dpll_pin_on_pin_unregister(struct dpll_pin *parent, struct dpll_pin *pin,
 }
 EXPORT_SYMBOL_GPL(dpll_pin_on_pin_unregister);
 
+/**
+ * dpll_pin_ref_sync_pair_add - create a reference sync signal pin pair
+ * @pin: pin which produces the base frequency
+ * @ref_sync_pin: pin which produces the sync signal
+ *
+ * Once pins are paired, the user-space configuration of reference sync pair
+ * is possible.
+ * Context: Acquires a lock (dpll_lock)
+ * Return:
+ * * 0 on success
+ * * negative - error value
+ */
+int dpll_pin_ref_sync_pair_add(struct dpll_pin *pin,
+			       struct dpll_pin *ref_sync_pin)
+{
+	int ret;
+
+	mutex_lock(&dpll_lock);
+	ret = xa_insert(&pin->ref_sync_pins, ref_sync_pin->id,
+			ref_sync_pin, GFP_KERNEL);
+	__dpll_pin_change_ntf(pin);
+	mutex_unlock(&dpll_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dpll_pin_ref_sync_pair_add);
+
 static struct dpll_device_registration *
 dpll_device_registration_first(struct dpll_device *dpll)
 {
diff --git a/drivers/dpll/dpll_core.h b/drivers/dpll/dpll_core.h
index 9b11e637397b..8ce969bbeb64 100644
--- a/drivers/dpll/dpll_core.h
+++ b/drivers/dpll/dpll_core.h
@@ -44,6 +44,7 @@ struct dpll_device {
  * @module:		module of creator
  * @dpll_refs:		hold referencees to dplls pin was registered with
  * @parent_refs:	hold references to parent pins pin was registered with
+ * @ref_sync_pins:	hold references to pins for Reference SYNC feature
  * @prop:		pin properties copied from the registerer
  * @refcount:		refcount
  * @rcu:		rcu_head for kfree_rcu()
@@ -55,6 +56,7 @@ struct dpll_pin {
 	struct module *module;
 	struct xarray dpll_refs;
 	struct xarray parent_refs;
+	struct xarray ref_sync_pins;
 	struct dpll_pin_properties prop;
 	refcount_t refcount;
 	struct rcu_head rcu;
diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c
index 4619aaa18b9c..036f21cac0a9 100644
--- a/drivers/dpll/dpll_netlink.c
+++ b/drivers/dpll/dpll_netlink.c
@@ -48,6 +48,24 @@ dpll_msg_add_dev_parent_handle(struct sk_buff *msg, u32 id)
 	return 0;
 }
 
+static bool dpll_pin_available(struct dpll_pin *pin)
+{
+	struct dpll_pin_ref *par_ref;
+	unsigned long i;
+
+	if (!xa_get_mark(&dpll_pin_xa, pin->id, DPLL_REGISTERED))
+		return false;
+	xa_for_each(&pin->parent_refs, i, par_ref)
+		if (xa_get_mark(&dpll_pin_xa, par_ref->pin->id,
+				DPLL_REGISTERED))
+			return true;
+	xa_for_each(&pin->dpll_refs, i, par_ref)
+		if (xa_get_mark(&dpll_device_xa, par_ref->dpll->id,
+				DPLL_REGISTERED))
+			return true;
+	return false;
+}
+
 /**
  * dpll_msg_add_pin_handle - attach pin handle attribute to a given message
  * @msg: pointer to sk_buff message to attach a pin handle
@@ -428,6 +446,47 @@ nest_cancel:
 	return -EMSGSIZE;
 }
 
+static int
+dpll_msg_add_pin_ref_sync(struct sk_buff *msg, struct dpll_pin *pin,
+			  struct dpll_pin_ref *ref,
+			  struct netlink_ext_ack *extack)
+{
+	const struct dpll_pin_ops *ops = dpll_pin_ops(ref);
+	struct dpll_device *dpll = ref->dpll;
+	void *pin_priv, *ref_sync_pin_priv;
+	struct dpll_pin *ref_sync_pin;
+	enum dpll_pin_state state;
+	struct nlattr *nest;
+	unsigned long index;
+	int ret;
+
+	pin_priv = dpll_pin_on_dpll_priv(dpll, pin);
+	xa_for_each(&pin->ref_sync_pins, index, ref_sync_pin) {
+		if (!dpll_pin_available(ref_sync_pin))
+			continue;
+		ref_sync_pin_priv = dpll_pin_on_dpll_priv(dpll, ref_sync_pin);
+		if (WARN_ON(!ops->ref_sync_get))
+			return -EOPNOTSUPP;
+		ret = ops->ref_sync_get(pin, pin_priv, ref_sync_pin,
+					ref_sync_pin_priv, &state, extack);
+		if (ret)
+			return ret;
+		nest = nla_nest_start(msg, DPLL_A_PIN_REFERENCE_SYNC);
+		if (!nest)
+			return -EMSGSIZE;
+		if (nla_put_s32(msg, DPLL_A_PIN_ID, ref_sync_pin->id))
+			goto nest_cancel;
+		if (nla_put_s32(msg, DPLL_A_PIN_STATE, state))
+			goto nest_cancel;
+		nla_nest_end(msg, nest);
+	}
+	return 0;
+
+nest_cancel:
+	nla_nest_cancel(msg, nest);
+	return -EMSGSIZE;
+}
+
 static bool dpll_pin_is_freq_supported(struct dpll_pin *pin, u32 freq)
 {
 	int fs;
@@ -570,6 +629,10 @@ dpll_cmd_pin_get_one(struct sk_buff *msg, struct dpll_pin *pin,
 	if (ret)
 		return ret;
 	ret = dpll_msg_add_pin_esync(msg, pin, ref, extack);
+	if (ret)
+		return ret;
+	if (!xa_empty(&pin->ref_sync_pins))
+		ret = dpll_msg_add_pin_ref_sync(msg, pin, ref, extack);
 	if (ret)
 		return ret;
 	if (xa_empty(&pin->parent_refs))
@@ -665,24 +728,6 @@ __dpll_device_change_ntf(struct dpll_device *dpll)
 	return dpll_device_event_send(DPLL_CMD_DEVICE_CHANGE_NTF, dpll);
 }
 
-static bool dpll_pin_available(struct dpll_pin *pin)
-{
-	struct dpll_pin_ref *par_ref;
-	unsigned long i;
-
-	if (!xa_get_mark(&dpll_pin_xa, pin->id, DPLL_REGISTERED))
-		return false;
-	xa_for_each(&pin->parent_refs, i, par_ref)
-		if (xa_get_mark(&dpll_pin_xa, par_ref->pin->id,
-				DPLL_REGISTERED))
-			return true;
-	xa_for_each(&pin->dpll_refs, i, par_ref)
-		if (xa_get_mark(&dpll_device_xa, par_ref->dpll->id,
-				DPLL_REGISTERED))
-			return true;
-	return false;
-}
-
 /**
  * dpll_device_change_ntf - notify that the dpll device has been changed
  * @dpll: registered dpll pointer
@@ -745,7 +790,7 @@ int dpll_pin_delete_ntf(struct dpll_pin *pin)
 	return dpll_pin_event_send(DPLL_CMD_PIN_DELETE_NTF, pin);
 }
 
-static int __dpll_pin_change_ntf(struct dpll_pin *pin)
+int __dpll_pin_change_ntf(struct dpll_pin *pin)
 {
 	return dpll_pin_event_send(DPLL_CMD_PIN_CHANGE_NTF, pin);
 }
@@ -935,6 +980,108 @@ rollback:
 	return ret;
 }
 
+static int
+dpll_pin_ref_sync_state_set(struct dpll_pin *pin,
+			    unsigned long ref_sync_pin_idx,
+			    const enum dpll_pin_state state,
+			    struct netlink_ext_ack *extack)
+
+{
+	struct dpll_pin_ref *ref, *failed;
+	const struct dpll_pin_ops *ops;
+	enum dpll_pin_state old_state;
+	struct dpll_pin *ref_sync_pin;
+	struct dpll_device *dpll;
+	unsigned long i;
+	int ret;
+
+	ref_sync_pin = xa_find(&pin->ref_sync_pins, &ref_sync_pin_idx,
+			       ULONG_MAX, XA_PRESENT);
+	if (!ref_sync_pin) {
+		NL_SET_ERR_MSG(extack, "reference sync pin not found");
+		return -EINVAL;
+	}
+	if (!dpll_pin_available(ref_sync_pin)) {
+		NL_SET_ERR_MSG(extack, "reference sync pin not available");
+		return -EINVAL;
+	}
+	ref = dpll_xa_ref_dpll_first(&pin->dpll_refs);
+	ASSERT_NOT_NULL(ref);
+	ops = dpll_pin_ops(ref);
+	if (!ops->ref_sync_set || !ops->ref_sync_get) {
+		NL_SET_ERR_MSG(extack, "reference sync not supported by this pin");
+		return -EOPNOTSUPP;
+	}
+	dpll = ref->dpll;
+	ret = ops->ref_sync_get(pin, dpll_pin_on_dpll_priv(dpll, pin),
+				ref_sync_pin,
+				dpll_pin_on_dpll_priv(dpll, ref_sync_pin),
+				&old_state, extack);
+	if (ret) {
+		NL_SET_ERR_MSG(extack, "unable to get old reference sync state");
+		return ret;
+	}
+	if (state == old_state)
+		return 0;
+	xa_for_each(&pin->dpll_refs, i, ref) {
+		ops = dpll_pin_ops(ref);
+		dpll = ref->dpll;
+		ret = ops->ref_sync_set(pin, dpll_pin_on_dpll_priv(dpll, pin),
+					ref_sync_pin,
+					dpll_pin_on_dpll_priv(dpll,
+							      ref_sync_pin),
+					state, extack);
+		if (ret) {
+			failed = ref;
+			NL_SET_ERR_MSG_FMT(extack, "reference sync set failed for dpll_id:%u",
+					   dpll->id);
+			goto rollback;
+		}
+	}
+	__dpll_pin_change_ntf(pin);
+
+	return 0;
+
+rollback:
+	xa_for_each(&pin->dpll_refs, i, ref) {
+		if (ref == failed)
+			break;
+		ops = dpll_pin_ops(ref);
+		dpll = ref->dpll;
+		if (ops->ref_sync_set(pin, dpll_pin_on_dpll_priv(dpll, pin),
+				      ref_sync_pin,
+				      dpll_pin_on_dpll_priv(dpll, ref_sync_pin),
+				      old_state, extack))
+			NL_SET_ERR_MSG(extack, "set reference sync rollback failed");
+	}
+	return ret;
+}
+
+static int
+dpll_pin_ref_sync_set(struct dpll_pin *pin, struct nlattr *nest,
+		      struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[DPLL_A_PIN_MAX + 1];
+	enum dpll_pin_state state;
+	u32 sync_pin_id;
+
+	nla_parse_nested(tb, DPLL_A_PIN_MAX, nest,
+			 dpll_reference_sync_nl_policy, extack);
+	if (!tb[DPLL_A_PIN_ID]) {
+		NL_SET_ERR_MSG(extack, "sync pin id expected");
+		return -EINVAL;
+	}
+	sync_pin_id = nla_get_u32(tb[DPLL_A_PIN_ID]);
+
+	if (!tb[DPLL_A_PIN_STATE]) {
+		NL_SET_ERR_MSG(extack, "sync pin state expected");
+		return -EINVAL;
+	}
+	state = nla_get_u32(tb[DPLL_A_PIN_STATE]);
+
+	return dpll_pin_ref_sync_state_set(pin, sync_pin_id, state, extack);
+}
+
 static int
 dpll_pin_on_pin_state_set(struct dpll_pin *pin, u32 parent_idx,
 			  enum dpll_pin_state state,
@@ -1241,6 +1388,11 @@ dpll_pin_set_from_nlattr(struct dpll_pin *pin, struct genl_info *info)
 			if (ret)
 				return ret;
 			break;
+		case DPLL_A_PIN_REFERENCE_SYNC:
+			ret = dpll_pin_ref_sync_set(pin, a, info->extack);
+			if (ret)
+				return ret;
+			break;
 		}
 	}
 
diff --git a/drivers/dpll/dpll_netlink.h b/drivers/dpll/dpll_netlink.h
index a9cfd55f57fc..dd28b56d27c5 100644
--- a/drivers/dpll/dpll_netlink.h
+++ b/drivers/dpll/dpll_netlink.h
@@ -11,3 +11,5 @@ int dpll_device_delete_ntf(struct dpll_device *dpll);
 int dpll_pin_create_ntf(struct dpll_pin *pin);
 
 int dpll_pin_delete_ntf(struct dpll_pin *pin);
+
+int __dpll_pin_change_ntf(struct dpll_pin *pin);
diff --git a/include/linux/dpll.h b/include/linux/dpll.h
index 6ad6c2968a28..fa1e76920d0e 100644
--- a/include/linux/dpll.h
+++ b/include/linux/dpll.h
@@ -103,6 +103,16 @@ struct dpll_pin_ops {
 			 const struct dpll_device *dpll, void *dpll_priv,
 			 struct dpll_pin_esync *esync,
 			 struct netlink_ext_ack *extack);
+	int (*ref_sync_set)(const struct dpll_pin *pin, void *pin_priv,
+			    const struct dpll_pin *ref_sync_pin,
+			    void *ref_sync_pin_priv,
+			    const enum dpll_pin_state state,
+			    struct netlink_ext_ack *extack);
+	int (*ref_sync_get)(const struct dpll_pin *pin, void *pin_priv,
+			    const struct dpll_pin *ref_sync_pin,
+			    void *ref_sync_pin_priv,
+			    enum dpll_pin_state *state,
+			    struct netlink_ext_ack *extack);
 };
 
 struct dpll_pin_frequency {
@@ -202,6 +212,9 @@ int dpll_pin_on_pin_register(struct dpll_pin *parent, struct dpll_pin *pin,
 void dpll_pin_on_pin_unregister(struct dpll_pin *parent, struct dpll_pin *pin,
 				const struct dpll_pin_ops *ops, void *priv);
 
+int dpll_pin_ref_sync_pair_add(struct dpll_pin *pin,
+			       struct dpll_pin *ref_sync_pin);
+
 int dpll_device_change_ntf(struct dpll_device *dpll);
 
 int dpll_pin_change_ntf(struct dpll_pin *pin);
-- 
cgit v1.2.3


From ddaad4ad774d4ae02047ef873a8e38f62a4b7b01 Mon Sep 17 00:00:00 2001
From: Gabor Juhos <j4g8y7@gmail.com>
Date: Wed, 18 Jun 2025 22:22:50 +0200
Subject: mtd: nand: qpic_common: prevent out of bounds access of BAM arrays

The common QPIC code does not do any boundary checking when it handles
the command elements and scatter gather list arrays of a BAM transaction,
thus it allows to access out of bounds elements in those.

Although it is the responsibility of the given driver to allocate enough
space for all possible BAM transaction variations, there can be
mistakes in the driver code which can lead to hidden memory corruption
issues which are hard to debug.

This kind of problem has been observed while testing the 'spi-qpic-snand'
driver. Although the driver has been fixed with a preceding patch, it
still makes sense to reduce the chance of having such errors again later.

In order to prevent such errors, change the qcom_alloc_bam_transaction()
function to store the number of elements of the arrays in the
'bam_transaction' structure during allocation. Also, add sanity checks to
the qcom_prep_bam_dma_desc_{cmd,data}() functions to avoid using out of
bounds indices for the arrays.

Tested-by: Lakshmi Sowjanya D <quic_laksd@quicinc.com>     # on SDX75
Acked-by: Miquel Raynal <miquel.raynal@bootlin.com>
Signed-off-by: Gabor Juhos <j4g8y7@gmail.com>
Link: https://patch.msgid.link/20250618-qpic-snand-avoid-mem-corruption-v3-2-319c71296cda@gmail.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/mtd/nand/qpic_common.c       | 30 ++++++++++++++++++++++++++----
 include/linux/mtd/nand-qpic-common.h |  8 ++++++++
 2 files changed, 34 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/mtd/nand/qpic_common.c b/drivers/mtd/nand/qpic_common.c
index 4dc4d65e7d32..8e604cc22ca3 100644
--- a/drivers/mtd/nand/qpic_common.c
+++ b/drivers/mtd/nand/qpic_common.c
@@ -57,14 +57,15 @@ qcom_alloc_bam_transaction(struct qcom_nand_controller *nandc)
 	bam_txn_buf += sizeof(*bam_txn);
 
 	bam_txn->bam_ce = bam_txn_buf;
-	bam_txn_buf +=
-		sizeof(*bam_txn->bam_ce) * QPIC_PER_CW_CMD_ELEMENTS * num_cw;
+	bam_txn->bam_ce_nitems = QPIC_PER_CW_CMD_ELEMENTS * num_cw;
+	bam_txn_buf += sizeof(*bam_txn->bam_ce) * bam_txn->bam_ce_nitems;
 
 	bam_txn->cmd_sgl = bam_txn_buf;
-	bam_txn_buf +=
-		sizeof(*bam_txn->cmd_sgl) * QPIC_PER_CW_CMD_SGL * num_cw;
+	bam_txn->cmd_sgl_nitems = QPIC_PER_CW_CMD_SGL * num_cw;
+	bam_txn_buf += sizeof(*bam_txn->cmd_sgl) * bam_txn->cmd_sgl_nitems;
 
 	bam_txn->data_sgl = bam_txn_buf;
+	bam_txn->data_sgl_nitems = QPIC_PER_CW_DATA_SGL * num_cw;
 
 	init_completion(&bam_txn->txn_done);
 
@@ -238,6 +239,11 @@ int qcom_prep_bam_dma_desc_cmd(struct qcom_nand_controller *nandc, bool read,
 	struct bam_transaction *bam_txn = nandc->bam_txn;
 	u32 offset;
 
+	if (bam_txn->bam_ce_pos + size > bam_txn->bam_ce_nitems) {
+		dev_err(nandc->dev, "BAM %s array is full\n", "CE");
+		return -EINVAL;
+	}
+
 	bam_ce_buffer = &bam_txn->bam_ce[bam_txn->bam_ce_pos];
 
 	/* fill the command desc */
@@ -258,6 +264,12 @@ int qcom_prep_bam_dma_desc_cmd(struct qcom_nand_controller *nandc, bool read,
 
 	/* use the separate sgl after this command */
 	if (flags & NAND_BAM_NEXT_SGL) {
+		if (bam_txn->cmd_sgl_pos >= bam_txn->cmd_sgl_nitems) {
+			dev_err(nandc->dev, "BAM %s array is full\n",
+				"CMD sgl");
+			return -EINVAL;
+		}
+
 		bam_ce_buffer = &bam_txn->bam_ce[bam_txn->bam_ce_start];
 		bam_ce_size = (bam_txn->bam_ce_pos -
 				bam_txn->bam_ce_start) *
@@ -297,10 +309,20 @@ int qcom_prep_bam_dma_desc_data(struct qcom_nand_controller *nandc, bool read,
 	struct bam_transaction *bam_txn = nandc->bam_txn;
 
 	if (read) {
+		if (bam_txn->rx_sgl_pos >= bam_txn->data_sgl_nitems) {
+			dev_err(nandc->dev, "BAM %s array is full\n", "RX sgl");
+			return -EINVAL;
+		}
+
 		sg_set_buf(&bam_txn->data_sgl[bam_txn->rx_sgl_pos],
 			   vaddr, size);
 		bam_txn->rx_sgl_pos++;
 	} else {
+		if (bam_txn->tx_sgl_pos >= bam_txn->data_sgl_nitems) {
+			dev_err(nandc->dev, "BAM %s array is full\n", "TX sgl");
+			return -EINVAL;
+		}
+
 		sg_set_buf(&bam_txn->data_sgl[bam_txn->tx_sgl_pos],
 			   vaddr, size);
 		bam_txn->tx_sgl_pos++;
diff --git a/include/linux/mtd/nand-qpic-common.h b/include/linux/mtd/nand-qpic-common.h
index e8462deda6db..f0aa098a395f 100644
--- a/include/linux/mtd/nand-qpic-common.h
+++ b/include/linux/mtd/nand-qpic-common.h
@@ -237,6 +237,9 @@
  * @last_data_desc - last DMA desc in data channel (tx/rx).
  * @last_cmd_desc - last DMA desc in command channel.
  * @txn_done - completion for NAND transfer.
+ * @bam_ce_nitems - the number of elements in the @bam_ce array
+ * @cmd_sgl_nitems - the number of elements in the @cmd_sgl array
+ * @data_sgl_nitems - the number of elements in the @data_sgl array
  * @bam_ce_pos - the index in bam_ce which is available for next sgl
  * @bam_ce_start - the index in bam_ce which marks the start position ce
  *		   for current sgl. It will be used for size calculation
@@ -255,6 +258,11 @@ struct bam_transaction {
 	struct dma_async_tx_descriptor *last_data_desc;
 	struct dma_async_tx_descriptor *last_cmd_desc;
 	struct completion txn_done;
+
+	unsigned int bam_ce_nitems;
+	unsigned int cmd_sgl_nitems;
+	unsigned int data_sgl_nitems;
+
 	struct_group(bam_positions,
 		u32 bam_ce_pos;
 		u32 bam_ce_start;
-- 
cgit v1.2.3


From 96893cdd4760ad94a438c1523cc5ca2470e04670 Mon Sep 17 00:00:00 2001
From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Sun, 29 Jun 2025 18:15:32 +0200
Subject: spi: Raise limit on number of chip selects to 24

We have a system which uses 24 SPI chip selects, raise the hard coded
limit accordingly.

Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
Link: https://patch.msgid.link/20250629-spi-increase-number-of-cs-v2-1-85a0a09bab32@pengutronix.de
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/spi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 4789f91dae94..e9ea43234d9a 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -21,7 +21,7 @@
 #include <uapi/linux/spi/spi.h>
 
 /* Max no. of CS supported per spi device */
-#define SPI_CS_CNT_MAX 16
+#define SPI_CS_CNT_MAX 24
 
 struct dma_chan;
 struct software_node;
-- 
cgit v1.2.3


From 24368a744bafce7daf1eafd6a163871925ee5892 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 2 May 2025 21:32:01 -0400
Subject: sanitize handling of long-term internal mounts

Original rationale for those had been the reduced cost of mntput()
for the stuff that is mounted somewhere.  Mount refcount increments and
decrements are frequent; what's worse, they tend to concentrate on the
same instances and cacheline pingpong is quite noticeable.

As the result, mount refcounts are per-cpu; that allows a very cheap
increment.  Plain decrement would be just as easy, but decrement-and-test
is anything but (we need to add the components up, with exclusion against
possible increment-from-zero, etc.).

Fortunately, there is a very common case where we can tell that decrement
won't be the final one - if the thing we are dropping is currently
mounted somewhere.  We have an RCU delay between the removal from mount
tree and dropping the reference that used to pin it there, so we can
just take rcu_read_lock() and check if the victim is mounted somewhere.
If it is, we can go ahead and decrement without any further checks -
the reference we are dropping is not the last one.  If it isn't, we
get all the fun with locking, carefully adding up components, etc.,
but the majority of refcount decrements end up taking the fast path.

There is a major exception, though - pipes and sockets.  Those live
on the internal filesystems that are not going to be mounted anywhere.
They are not going to be _un_mounted, of course, so having to take the
slow path every time a pipe or socket gets closed is really obnoxious.
Solution had been to mark them as long-lived ones - essentially faking
"they are mounted somewhere" indicator.

With minor modification that works even for ones that do eventually get
dropped - all it takes is making sure we have an RCU delay between
clearing the "mounted somewhere" indicator and dropping the reference.

There are some additional twists (if you want to drop a dozen of such
internal mounts, you'd be better off with clearing the indicator on
all of them, doing an RCU delay once, then dropping the references),
but in the basic form it had been
	* use kern_mount() if you want your internal mount to be
a long-term one.
	* use kern_unmount() to undo that.

Unfortunately, the things did rot a bit during the mount API reshuffling.
In several cases we have lost the "fake the indicator" part; kern_unmount()
on the unmount side remained (it doesn't warn if you use it on a mount
without the indicator), but all benefits regarding mntput() cost had been
lost.

To get rid of that bitrot, let's add a new helper that would work
with fs_context-based API: fc_mount_longterm().  It's a counterpart
of fc_mount() that does, on success, mark its result as long-term.
It must be paired with kern_unmount() or equivalents.

Converted:
	1) mqueue (it used to use kern_mount_data() and the umount side
is still as it used to be)
	2) hugetlbfs (used to use kern_mount_data(), internal mount is
never unmounted in this one)
	3) i915 gemfs (used to be kern_mount() + manual remount to set
options, still uses kern_unmount() on umount side)
	4) v3d gemfs (copied from i915)

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/gpu/drm/i915/gem/i915_gemfs.c | 21 ++++++++++++++++++---
 drivers/gpu/drm/v3d/v3d_gemfs.c       | 21 ++++++++++++++++++---
 fs/hugetlbfs/inode.c                  |  2 +-
 fs/namespace.c                        |  9 +++++++++
 include/linux/mount.h                 |  1 +
 ipc/mqueue.c                          |  2 +-
 6 files changed, 48 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/i915/gem/i915_gemfs.c b/drivers/gpu/drm/i915/gem/i915_gemfs.c
index 65d84a93c525..a09e2eb47175 100644
--- a/drivers/gpu/drm/i915/gem/i915_gemfs.c
+++ b/drivers/gpu/drm/i915/gem/i915_gemfs.c
@@ -5,16 +5,23 @@
 
 #include <linux/fs.h>
 #include <linux/mount.h>
+#include <linux/fs_context.h>
 
 #include "i915_drv.h"
 #include "i915_gemfs.h"
 #include "i915_utils.h"
 
+static int add_param(struct fs_context *fc, const char *key, const char *val)
+{
+	return vfs_parse_fs_string(fc, key, val, strlen(val));
+}
+
 void i915_gemfs_init(struct drm_i915_private *i915)
 {
-	char huge_opt[] = "huge=within_size"; /* r/w */
 	struct file_system_type *type;
+	struct fs_context *fc;
 	struct vfsmount *gemfs;
+	int ret;
 
 	/*
 	 * By creating our own shmemfs mountpoint, we can pass in
@@ -38,8 +45,16 @@ void i915_gemfs_init(struct drm_i915_private *i915)
 	if (!type)
 		goto err;
 
-	gemfs = vfs_kern_mount(type, SB_KERNMOUNT, type->name, huge_opt);
-	if (IS_ERR(gemfs))
+	fc = fs_context_for_mount(type, SB_KERNMOUNT);
+	if (IS_ERR(fc))
+		goto err;
+	ret = add_param(fc, "source", "tmpfs");
+	if (!ret)
+		ret = add_param(fc, "huge", "within_size");
+	if (!ret)
+		gemfs = fc_mount_longterm(fc);
+	put_fs_context(fc);
+	if (ret)
 		goto err;
 
 	i915->mm.gemfs = gemfs;
diff --git a/drivers/gpu/drm/v3d/v3d_gemfs.c b/drivers/gpu/drm/v3d/v3d_gemfs.c
index 4c5e18590a5c..8ec6ed82b3d9 100644
--- a/drivers/gpu/drm/v3d/v3d_gemfs.c
+++ b/drivers/gpu/drm/v3d/v3d_gemfs.c
@@ -3,14 +3,21 @@
 
 #include <linux/fs.h>
 #include <linux/mount.h>
+#include <linux/fs_context.h>
 
 #include "v3d_drv.h"
 
+static int add_param(struct fs_context *fc, const char *key, const char *val)
+{
+	return vfs_parse_fs_string(fc, key, val, strlen(val));
+}
+
 void v3d_gemfs_init(struct v3d_dev *v3d)
 {
-	char huge_opt[] = "huge=within_size";
 	struct file_system_type *type;
+	struct fs_context *fc;
 	struct vfsmount *gemfs;
+	int ret;
 
 	/*
 	 * By creating our own shmemfs mountpoint, we can pass in
@@ -28,8 +35,16 @@ void v3d_gemfs_init(struct v3d_dev *v3d)
 	if (!type)
 		goto err;
 
-	gemfs = vfs_kern_mount(type, SB_KERNMOUNT, type->name, huge_opt);
-	if (IS_ERR(gemfs))
+	fc = fs_context_for_mount(type, SB_KERNMOUNT);
+	if (IS_ERR(fc))
+		goto err;
+	ret = add_param(fc, "source", "tmpfs");
+	if (!ret)
+		ret = add_param(fc, "huge", "within_size");
+	if (!ret)
+		gemfs = fc_mount_longterm(fc);
+	put_fs_context(fc);
+	if (ret)
 		goto err;
 
 	v3d->gemfs = gemfs;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index e4de5425838d..4e0397775167 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -1587,7 +1587,7 @@ static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h)
 	} else {
 		struct hugetlbfs_fs_context *ctx = fc->fs_private;
 		ctx->hstate = h;
-		mnt = fc_mount(fc);
+		mnt = fc_mount_longterm(fc);
 		put_fs_context(fc);
 	}
 	if (IS_ERR(mnt))
diff --git a/fs/namespace.c b/fs/namespace.c
index 57b0974a5d1e..6a0697eeda74 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1260,6 +1260,15 @@ struct vfsmount *fc_mount(struct fs_context *fc)
 }
 EXPORT_SYMBOL(fc_mount);
 
+struct vfsmount *fc_mount_longterm(struct fs_context *fc)
+{
+	struct vfsmount *mnt = fc_mount(fc);
+	if (!IS_ERR(mnt))
+		real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
+	return mnt;
+}
+EXPORT_SYMBOL(fc_mount_longterm);
+
 struct vfsmount *vfs_kern_mount(struct file_system_type *type,
 				int flags, const char *name,
 				void *data)
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 1a508beba446..c145820fcbbf 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -98,6 +98,7 @@ int mnt_get_write_access(struct vfsmount *mnt);
 void mnt_put_write_access(struct vfsmount *mnt);
 
 extern struct vfsmount *fc_mount(struct fs_context *fc);
+extern struct vfsmount *fc_mount_longterm(struct fs_context *fc);
 extern struct vfsmount *vfs_create_mount(struct fs_context *fc);
 extern struct vfsmount *vfs_kern_mount(struct file_system_type *type,
 				      int flags, const char *name,
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 82ed2d3c9846..de7432efbf4a 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -482,7 +482,7 @@ static struct vfsmount *mq_create_mount(struct ipc_namespace *ns)
 	put_user_ns(fc->user_ns);
 	fc->user_ns = get_user_ns(ctx->ipc_ns->user_ns);
 
-	mnt = fc_mount(fc);
+	mnt = fc_mount_longterm(fc);
 	put_fs_context(fc);
 	return mnt;
 }
-- 
cgit v1.2.3


From f0d0ba19985d23a3e83d654318ccb6e9c5f1b095 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 14 May 2025 20:50:06 -0400
Subject: Rewrite of propagate_umount()

The variant currently in the tree has problems; trying to prove
correctness has caught at least one class of bugs (reparenting
that ends up moving the visible location of reparented mount, due
to not excluding some of the counterparts on propagation that
should've been included).

I tried to prove that it's the only bug there; I'm still not sure
whether it is.  If anyone can reconstruct and write down an analysis
of the mainline implementation, I'll gladly review it; as it is,
I ended up doing a different implementation.  Candidate collection
phase is similar, but trimming the set down until it satisfies the
constraints turned out pretty different.

I hoped to do transformation as a massage series, but that turns out
to be too convoluted.  So it's a single patch replacing propagate_umount()
and friends in one go, with notes and analysis in D/f/propagate_umount.txt
(in addition to inline comments).

As far as I can tell, it is provably correct and provably linear by the number
of mounts we need to look at in order to decide what should be unmounted.
It even builds and seems to survive testing...

Another nice thing that fell out of that is that ->mnt_umounting is no longer
needed.

Compared to the first version:
	* explicit MNT_UMOUNT_CANDIDATE flag for is_candidate()
	* trim_ancestors() only clears that flag, leaving the suckers on list
	* trim_one() and handle_locked() take the stuff with flag cleared off
the list.  That allows to iterate with list_for_each_entry_safe() when calling
trim_one() - it removes at most one element from the list now.
	* no globals - I didn't bother with any kind of context, not worth it.

	* Notes updated accordingly; I have not touched the terms yet.

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/propagate_umount.txt | 484 +++++++++++++++++++++++++
 fs/mount.h                                     |   1 -
 fs/namespace.c                                 |   1 -
 fs/pnode.c                                     | 362 +++++++++---------
 fs/pnode.h                                     |   2 +-
 include/linux/mount.h                          |   3 +-
 6 files changed, 685 insertions(+), 168 deletions(-)
 create mode 100644 Documentation/filesystems/propagate_umount.txt

(limited to 'include')

diff --git a/Documentation/filesystems/propagate_umount.txt b/Documentation/filesystems/propagate_umount.txt
new file mode 100644
index 000000000000..6906903a8aa2
--- /dev/null
+++ b/Documentation/filesystems/propagate_umount.txt
@@ -0,0 +1,484 @@
+	Notes on propagate_umount()
+
+Umount propagation starts with a set of mounts we are already going to
+take out.  Ideally, we would like to add all downstream cognates to
+that set - anything with the same mountpoint as one of the removed
+mounts and with parent that would receive events from the parent of that
+mount.  However, there are some constraints the resulting set must
+satisfy.
+
+It is convenient to define several properties of sets of mounts:
+
+1) A set S of mounts is non-shifting if for any mount X belonging
+to S all subtrees mounted strictly inside of X (i.e. not overmounting
+the root of X) contain only elements of S.
+
+2) A set S is non-revealing if all locked mounts that belong to S have
+parents that also belong to S.
+
+3) A set S is closed if it contains all children of its elements.
+
+The set of mounts taken out by umount(2) must be non-shifting and
+non-revealing; the first constraint is what allows to reparent
+any remaining mounts and the second is what prevents the exposure
+of any concealed mountpoints.
+
+propagate_umount() takes the original set as an argument and tries to
+extend that set.  The original set is a full subtree and its root is
+unlocked; what matters is that it's closed and non-revealing.
+Resulting set may not be closed; there might still be mounts outside
+of that set, but only on top of stacks of root-overmounting elements
+of set.  They can be reparented to the place where the bottom of
+stack is attached to a mount that will survive.  NOTE: doing that
+will violate a constraint on having no more than one mount with
+the same parent/mountpoint pair; however, the caller (umount_tree())
+will immediately remedy that - it may keep unmounted element attached
+to parent, but only if the parent itself is unmounted.  Since all
+conflicts created by reparenting have common parent *not* in the
+set and one side of the conflict (bottom of the stack of overmounts)
+is in the set, it will be resolved.  However, we rely upon umount_tree()
+doing that pretty much immediately after the call of propagate_umount().
+
+Algorithm is based on two statements:
+	1) for any set S, there is a maximal non-shifting subset of S
+and it can be calculated in O(#S) time.
+	2) for any non-shifting set S, there is a maximal non-revealing
+subset of S.  That subset is also non-shifting and it can be calculated
+in O(#S) time.
+
+		Finding candidates.
+
+We are given a closed set U and we want to find all mounts that have
+the same mountpoint as some mount m in U *and* whose parent receives
+propagation from the parent of the same mount m.  Naive implementation
+would be
+	S = {}
+	for each m in U
+		add m to S
+		p = parent(m)
+		for each q in Propagation(p) - {p}
+			child = look_up(q, mountpoint(m))
+			if child
+				add child to S
+but that can lead to excessive work - there might be propagation among the
+subtrees of U, in which case we'd end up examining the same candidates
+many times.  Since propagation is transitive, the same will happen to
+everything downstream of that candidate and it's not hard to construct
+cases where the approach above leads to the time quadratic by the actual
+number of candidates.
+
+Note that if we run into a candidate we'd already seen, it must've been
+added on an earlier iteration of the outer loop - all additions made
+during one iteration of the outer loop have different parents.  So
+if we find a child already added to the set, we know that everything
+in Propagation(parent(child)) with the same mountpoint has been already
+added.
+	S = {}
+	for each m in U
+		if m in S
+			continue
+		add m to S
+		p = parent(m)
+		q = propagation_next(p, p)
+		while q
+			child = look_up(q, mountpoint(m))
+			if child
+				if child in S
+					q = skip_them(q, p)
+					continue;
+				add child to S
+			q = propagation_next(q, p)
+where
+skip_them(q, p)
+	keep walking Propagation(p) from q until we find something
+	not in Propagation(q)
+
+would get rid of that problem, but we need a sane implementation of
+skip_them().  That's not hard to do - split propagation_next() into
+"down into mnt_slave_list" and "forward-and-up" parts, with the
+skip_them() being "repeat the forward-and-up part until we get NULL
+or something that isn't a peer of the one we are skipping".
+
+Note that there can be no absolute roots among the extra candidates -
+they all come from mount lookups.  Absolute root among the original
+set is _currently_ impossible, but it might be worth protecting
+against.
+
+		Maximal non-shifting subsets.
+
+Let's call a mount m in a set S forbidden in that set if there is a
+subtree mounted strictly inside m and containing mounts that do not
+belong to S.
+
+The set is non-shifting when none of its elements are forbidden in it.
+
+If mount m is forbidden in a set S, it is forbidden in any subset S' it
+belongs to.  In other words, it can't belong to any of the non-shifting
+subsets of S.  If we had a way to find a forbidden mount or show that
+there's none, we could use it to find the maximal non-shifting subset
+simply by finding and removing them until none remain.
+
+Suppose mount m is forbidden in S; then any mounts forbidden in S - {m}
+must have been forbidden in S itself.  Indeed, since m has descendents
+that do not belong to S, any subtree that fits into S will fit into
+S - {m} as well.
+
+So in principle we could go through elements of S, checking if they
+are forbidden in S and removing the ones that are.  Removals will
+not invalidate the checks done for earlier mounts - if they were not
+forbidden at the time we checked, they won't become forbidden later.
+It's too costly to be practical, but there is a similar approach that
+is linear by size of S.
+
+Let's say that mount x in a set S is forbidden by mount y, if
+	* both x and y belong to S.
+	* there is a chain of mounts starting at x and leaving S
+	  immediately after passing through y, with the first
+	  mountpoint strictly inside x.
+Note 1: x may be equal to y - that's the case when something not
+belonging to S is mounted strictly inside x.
+Note 2: if y does not belong to S, it can't forbid anything in S.
+Note 3: if y has no children outside of S, it can't forbid anything in S.
+
+It's easy to show that mount x is forbidden in S if and only if x is
+forbidden in S by some mount y.  And it's easy to find all mounts in S
+forbidden by a given mount.
+
+Consider the following operation:
+	Trim(S, m) = S - {x : x is forbidden by m in S}
+
+Note that if m does not belong to S or has no children outside of S we
+are guaranteed that Trim(S, m) is equal to S.
+
+The following is true: if x is forbidden by y in Trim(S, m), it was
+already forbidden by y in S.
+
+Proof: Suppose x is forbidden by y in Trim(S, m).  Then there is a
+chain of mounts (x_0 = x, ..., x_k = y, x_{k+1} = r), such that x_{k+1}
+is the first element that doesn't belong to Trim(S, m) and the
+mountpoint of x_1 is strictly inside x.  If mount r belongs to S, it must
+have been removed by Trim(S, m), i.e. it was forbidden in S by m.
+Then there was a mount chain from r to some child of m that stayed in
+S all the way until m, but that's impossible since x belongs to Trim(S, m)
+and prepending (x_0, ..., x_k) to that chain demonstrates that x is also
+forbidden in S by m, and thus can't belong to Trim(S, m).
+Therefore r can not belong to S and our chain demonstrates that
+x is forbidden by y in S.  QED.
+
+Corollary: no mount is forbidden by m in Trim(S, m).  Indeed, any
+such mount would have been forbidden by m in S and thus would have been
+in the part of S removed in Trim(S, m).
+
+Corollary: no mount is forbidden by m in Trim(Trim(S, m), n).  Indeed,
+any such would have to have been forbidden by m in Trim(S, m), which
+is impossible.
+
+Corollary: after
+	S = Trim(S, x_1)
+	S = Trim(S, x_2)
+	...
+	S = Trim(S, x_k)
+no mount remaining in S will be forbidden by any of x_1,...,x_k.
+
+The following will reduce S to its maximal non-shifting subset:
+	visited = {}
+	while S contains elements not belonging to visited
+		let m be an arbitrary such element of S
+		S = Trim(S, m)
+		add m to visited
+
+S never grows, so the number of elements of S not belonging to visited
+decreases at least by one on each iteration.  When the loop terminates,
+all mounts remaining in S belong to visited.  It's easy to see that at
+the beginning of each iteration no mount remaining in S will be forbidden
+by any element of visited.  In other words, no mount remaining in S will
+be forbidden, i.e. final value of S will be non-shifting.  It will be
+the maximal non-shifting subset, since we were removing only forbidden
+elements.
+
+	There are two difficulties in implementing the above in linear
+time, both due to the fact that Trim() might need to remove more than one
+element.  Naive implementation of Trim() is vulnerable to running into a
+long chain of mounts, each mounted on top of parent's root.  Nothing in
+that chain is forbidden, so nothing gets removed from it.  We need to
+recognize such chains and avoid walking them again on subsequent calls of
+Trim(), otherwise we will end up with worst-case time being quadratic in
+the number of elements in S.  Another difficulty is in implementing the
+outer loop - we need to iterate through all elements of a shrinking set.
+That would be trivial if we never removed more than one element at a time
+(linked list, with list_for_each_entry_safe for iterator), but we may
+need to remove more than one entry, possibly including the ones we have
+already visited.
+
+	Let's start with naive algorithm for Trim():
+
+Trim_one(m)
+	found = false
+	for each n in children(m)
+		if n not in S
+			found = true
+			if (mountpoint(n) != root(m))
+				remove m from S
+				break
+	if found
+		Trim_ancestors(m)
+
+Trim_ancestors(m)
+	for (; parent(m) in S; m = parent(m)) {
+		if (mountpoint(m) != root(parent(m)))
+			remove parent(m) from S
+	}
+
+If m belongs to S, Trim_one(m) will replace S with Trim(S, m).
+Proof:
+	Consider the chains excluding elements from Trim(S, m).  The last
+two elements in such chain are m and some child of m that does not belong
+to S.  If m has no such children, Trim(S, m) is equal to S.
+	m itself is removed if and only if the chain has exactly two
+elements, i.e. when the last element does not overmount the root of m.
+In other words, that happens when m has a child not in S that does not
+overmount the root of m.
+	All other elements to remove will be ancestors of m, such that
+the entire descent chain from them to m is contained in S.  Let
+(x_0, x_1, ..., x_k = m) be the longest such chain.  x_i needs to be
+removed if and only if x_{i+1} does not overmount its root.  It's easy
+to see that Trim_ancestors(m) will iterate through that chain from
+x_k to x_1 and that it will remove exactly the elements that need to be
+removed.
+
+	Note that if the loop in Trim_ancestors() walks into an already
+visited element, we are guaranteed that remaining iterations will see
+only elements that had already been visited and remove none of them.
+That's the weakness that makes it vulnerable to long chains of full
+overmounts.
+
+	It's easy to deal with, if we can afford setting marks on
+elements of S; we would mark all elements already visited by
+Trim_ancestors() and have it bail out as soon as it sees an already
+marked element.
+
+	The problems with iterating through the set can be dealt with in
+several ways, depending upon the representation we choose for our set.
+One useful observation is that we are given a closed subset in S - the
+original set passed to propagate_umount().  Its elements can neither
+forbid anything nor be forbidden by anything - all their descendants
+belong to S, so they can not occur anywhere in any excluding chain.
+In other words, the elements of that subset will remain in S until
+the end and Trim_one(m) is a no-op for all m from that subset.
+
+	That suggests keeping S as a disjoint union of a closed set U
+('will be unmounted, no matter what') and the set of all elements of
+S that do not belong to U.  That set ('candidates') is all we need
+to iterate through.  Let's represent it as a subset in a cyclic list,
+consisting of all list elements that are marked as candidates (initially -
+all of them).  Then we could have Trim_ancestors() only remove the mark,
+leaving the elements on the list.  Then Trim_one() would never remove
+anything other than its argument from the containing list, allowing us to
+use list_for_each_entry_safe() as iterator.
+
+	Assuming that representation we get the following:
+
+	list_for_each_entry_safe(m, ..., Candidates, ...)
+		Trim_one(m)
+where
+Trim_one(m)
+	if (m is not marked as a candidate)
+		strip the "seen by Trim_ancestors" mark from m
+		remove m from the Candidates list
+		return
+		
+	remove_this = false
+	found = false
+	for each n in children(m)
+		if n not in S
+			found = true
+			if (mountpoint(n) != root(m))
+				remove_this = true
+				break
+	if found
+		Trim_ancestors(m)
+	if remove_this
+		strip the "seen by Trim_ancestors" mark from m
+		strip the "candidate" mark from m
+		remove m from the Candidates list
+
+Trim_ancestors(m)
+	for (p = parent(m); p is marked as candidate ; m = p, p = parent(p)) {
+		if m is marked as seen by Trim_ancestors
+			return
+		mark m as seen by Trim_ancestors
+		if (mountpoint(m) != root(p))
+			strip the "candidate" mark from p
+	}
+
+	Terminating condition in the loop in Trim_ancestors() is correct,
+since that loop will never run into p belonging to U - p is always
+an ancestor of argument of Trim_one() and since U is closed, the argument
+of Trim_one() would also have to belong to U.  But Trim_one() is never
+called for elements of U.  In other words, p belongs to S if and only
+if it belongs to candidates.
+
+	Time complexity:
+* we get no more than O(#S) calls of Trim_one()
+* the loop over children in Trim_one() never looks at the same child
+twice through all the calls.
+* iterations of that loop for children in S are no more than O(#S)
+in the worst case
+* at most two children that are not elements of S are considered per
+call of Trim_one().
+* the loop in Trim_ancestors() sets its mark once per iteration and
+no element of S has it set more than once.
+
+	In the end we may have some elements excluded from S by
+Trim_ancestors() still stuck on the list.  We could do a separate
+loop removing them from the list (also no worse than O(#S) time),
+but it's easier to leave that until the next phase - there we will
+iterate through the candidates anyway.
+
+	The caller has already removed all elements of U from their parents'
+lists of children, which means that checking if child belongs to S is
+equivalent to checking if it's marked as a candidate; we'll never see
+the elements of U in the loop over children in Trim_one().
+
+	What's more, if we see that children(m) is empty and m is not
+locked, we can immediately move m into the committed subset (remove
+from the parent's list of children, etc.).  That's one fewer mount we'll
+have to look into when we check the list of children of its parent *and*
+when we get to building the non-revealing subset.
+
+		Maximal non-revealing subsets
+
+If S is not a non-revealing subset, there is a locked element x in S
+such that parent of x is not in S.
+
+Obviously, no non-revealing subset of S may contain x.  Removing such
+elements one by one will obviously end with the maximal non-revealing
+subset (possibly empty one).  Note that removal of an element will
+require removal of all its locked children, etc.
+
+If the set had been non-shifting, it will remain non-shifting after
+such removals.
+Proof: suppose S was non-shifting, x is a locked element of S, parent of x
+is not in S and S - {x} is not non-shifting.  Then there is an element m
+in S - {x} and a subtree mounted strictly inside m, such that m contains
+an element not in S - {x}.  Since S is non-shifting, everything in
+that subtree must belong to S.  But that means that this subtree must
+contain x somewhere *and* that parent of x either belongs to that subtree
+or is equal to m.  Either way it must belong to S.  Contradiction.
+
+// same representation as for finding maximal non-shifting subsets:
+// S is a disjoint union of a non-revealing set U (the ones we are committed
+// to unmount) and a set of candidates, represented as a subset of list
+// elements that have "is a candidate" mark on them.
+// Elements of U are removed from their parents' lists of children.
+// In the end candidates becomes empty and maximal non-revealing non-shifting
+// subset of S is now in U
+	while (Candidates list is non-empty)
+		handle_locked(first(Candidates))
+
+handle_locked(m)
+	if m is not marked as a candidate
+		strip the "seen by Trim_ancestors" mark from m
+		remove m from the list
+		return
+	cutoff = m
+	for (p = m; p in candidates; p = parent(p)) {
+		strip the "seen by Trim_ancestors" mark from p
+		strip the "candidate" mark from p
+		remove p from the Candidates list
+		if (!locked(p))
+			cutoff = parent(p)
+	}
+	if p in U
+		cutoff = p
+	while m != cutoff
+		remove m from children(parent(m))
+		add m to U
+		m = parent(m)
+
+Let (x_0, ..., x_n = m) be the maximal chain of descent of m within S.
+* If it contains some elements of U, let x_k be the last one of those.
+Then union of U with {x_{k+1}, ..., x_n} is obviously non-revealing.
+* otherwise if all its elements are locked, then none of {x_0, ..., x_n}
+may be elements of a non-revealing subset of S.
+* otherwise let x_k be the first unlocked element of the chain.  Then none
+of {x_0, ..., x_{k-1}} may be an element of a non-revealing subset of
+S and union of U and {x_k, ..., x_n} is non-revealing.
+
+handle_locked(m) finds which of these cases applies and adjusts Candidates
+and U accordingly.  U remains non-revealing, union of Candidates and
+U still contains any non-revealing subset of S and after the call of
+handle_locked(m) m is guaranteed to be not in Candidates list.  So having
+it called for each element of S would suffice to empty Candidates,
+leaving U the maximal non-revealing subset of S.
+
+However, handle_locked(m) is a no-op when m belongs to U, so it's enough
+to have it called for elements of Candidates list until none remain.
+
+Time complexity: number of calls of handle_locked() is limited by
+#Candidates, each iteration of the first loop in handle_locked() removes
+an element from the list, so their total number of executions is also
+limited by #Candidates; number of iterations in the second loop is no
+greater than the number of iterations of the first loop.
+
+
+		Reparenting
+
+After we'd calculated the final set, we still need to deal with
+reparenting - if an element of the final set has a child not in it,
+we need to reparent such child.
+
+Such children can only be root-overmounting (otherwise the set wouldn't
+be non-shifting) and their parents can not belong to the original set,
+since the original is guaranteed to be closed.
+
+
+		Putting all of that together
+
+The plan is to
+	* find all candidates
+	* trim down to maximal non-shifting subset
+	* trim down to maximal non-revealing subset
+	* reparent anything that needs to be reparented
+	* return the resulting set to the caller
+
+For the 2nd and 3rd steps we want to separate the set into growing
+non-revealing subset, initially containing the original set ("U" in
+terms of the pseudocode above) and everything we are still not sure about
+("candidates").  It means that for the output of the 1st step we'd like
+the extra candidates separated from the stuff already in the original set.
+For the 4th step we would like the additions to U separate from the
+original set.
+
+So let's go for
+	* original set ("set").  Linkage via mnt_list
+	* undecided candidates ("candidates").  Subset of a list,
+consisting of all its elements marked with a new flag (MNT_UMOUNT_CANDIDATE).
+Initially all elements of the list will be marked that way; in the
+end the list will become empty and no mounts will remain marked with
+that flag.
+	* Reuse MNT_MARKED for "has been already seen by trim_ancestors()".
+	* anything in U that hadn't been in the original set - elements of
+candidates will gradually be either discarded or moved there.  In other
+words, it's the candidates we have already decided to unmount.	Its role
+is reasonably close to the old "to_umount", so let's use that name.
+Linkage via mnt_list.
+
+For gather_candidates() we'll need to maintain both candidates (S -
+set) and intersection of S with set.  Use MNT_UMOUNT_CANDIDATE for
+all elements we encounter, putting the ones not already in the original
+set into the list of candidates.  When we are done, strip that flag from
+all elements of the original set.  That gives a cheap way to check
+if element belongs to S (in gather_candidates) and to candidates
+itself (at later stages).  Call that predicate is_candidate(); it would
+be m->mnt_flags & MNT_UMOUNT_CANDIDATE.
+
+All elements of the original set are marked with MNT_UMOUNT and we'll
+need the same for elements added when joining the contents of to_umount
+to set in the end.  Let's set MNT_UMOUNT at the time we add an element
+to to_umount; that's close to what the old 'umount_one' is doing, so
+let's keep that name.  It also gives us another predicate we need -
+"belongs to union of set and to_umount"; will_be_unmounted() for now.
+
+Removals from the candidates list should strip both MNT_MARKED and
+MNT_UMOUNT_CANDIDATE; call it remove_from_candidates_list().
diff --git a/fs/mount.h b/fs/mount.h
index f20e6ed845fe..fb93d3e16724 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -79,7 +79,6 @@ struct mount {
 		struct hlist_node mnt_mp_list;	/* list mounts with the same mountpoint */
 		struct hlist_node mnt_umount;
 	};
-	struct list_head mnt_umounting; /* list entry for umount propagation */
 #ifdef CONFIG_FSNOTIFY
 	struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
 	__u32 mnt_fsnotify_mask;
diff --git a/fs/namespace.c b/fs/namespace.c
index 6a0697eeda74..f64895d47d70 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -383,7 +383,6 @@ static struct mount *alloc_vfsmnt(const char *name)
 		INIT_LIST_HEAD(&mnt->mnt_slave_list);
 		INIT_LIST_HEAD(&mnt->mnt_slave);
 		INIT_HLIST_NODE(&mnt->mnt_mp_list);
-		INIT_LIST_HEAD(&mnt->mnt_umounting);
 		INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
 		RB_CLEAR_NODE(&mnt->mnt_node);
 		mnt->mnt.mnt_idmap = &nop_mnt_idmap;
diff --git a/fs/pnode.c b/fs/pnode.c
index aa187144e389..901d40946d34 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -24,11 +24,6 @@ static inline struct mount *first_slave(struct mount *p)
 	return list_entry(p->mnt_slave_list.next, struct mount, mnt_slave);
 }
 
-static inline struct mount *last_slave(struct mount *p)
-{
-	return list_entry(p->mnt_slave_list.prev, struct mount, mnt_slave);
-}
-
 static inline struct mount *next_slave(struct mount *p)
 {
 	return list_entry(p->mnt_slave.next, struct mount, mnt_slave);
@@ -136,6 +131,23 @@ void change_mnt_propagation(struct mount *mnt, int type)
 	}
 }
 
+static struct mount *__propagation_next(struct mount *m,
+					 struct mount *origin)
+{
+	while (1) {
+		struct mount *master = m->mnt_master;
+
+		if (master == origin->mnt_master) {
+			struct mount *next = next_peer(m);
+			return (next == origin) ? NULL : next;
+		} else if (m->mnt_slave.next != &master->mnt_slave_list)
+			return next_slave(m);
+
+		/* back at master */
+		m = master;
+	}
+}
+
 /*
  * get the next mount in the propagation tree.
  * @m: the mount seen last
@@ -153,31 +165,21 @@ static struct mount *propagation_next(struct mount *m,
 	if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
 		return first_slave(m);
 
-	while (1) {
-		struct mount *master = m->mnt_master;
-
-		if (master == origin->mnt_master) {
-			struct mount *next = next_peer(m);
-			return (next == origin) ? NULL : next;
-		} else if (m->mnt_slave.next != &master->mnt_slave_list)
-			return next_slave(m);
-
-		/* back at master */
-		m = master;
-	}
+	return __propagation_next(m, origin);
 }
 
 static struct mount *skip_propagation_subtree(struct mount *m,
 						struct mount *origin)
 {
 	/*
-	 * Advance m such that propagation_next will not return
-	 * the slaves of m.
+	 * Advance m past everything that gets propagation from it.
 	 */
-	if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
-		m = last_slave(m);
+	struct mount *p = __propagation_next(m, origin);
+
+	while (p && peers(m, p))
+		p = __propagation_next(p, origin);
 
-	return m;
+	return p;
 }
 
 static struct mount *next_group(struct mount *m, struct mount *origin)
@@ -458,181 +460,213 @@ void propagate_mount_unlock(struct mount *mnt)
 	}
 }
 
-static void umount_one(struct mount *mnt, struct list_head *to_umount)
+static inline bool is_candidate(struct mount *m)
 {
-	CLEAR_MNT_MARK(mnt);
-	mnt->mnt.mnt_flags |= MNT_UMOUNT;
-	list_del_init(&mnt->mnt_child);
-	list_del_init(&mnt->mnt_umounting);
-	move_from_ns(mnt, to_umount);
+	return m->mnt.mnt_flags & MNT_UMOUNT_CANDIDATE;
 }
 
-/*
- * NOTE: unmounting 'mnt' naturally propagates to all other mounts its
- * parent propagates to.
- */
-static bool __propagate_umount(struct mount *mnt,
-			       struct list_head *to_umount,
-			       struct list_head *to_restore)
+static inline bool will_be_unmounted(struct mount *m)
 {
-	bool progress = false;
-	struct mount *child;
+	return m->mnt.mnt_flags & MNT_UMOUNT;
+}
 
-	/*
-	 * The state of the parent won't change if this mount is
-	 * already unmounted or marked as without children.
-	 */
-	if (mnt->mnt.mnt_flags & (MNT_UMOUNT | MNT_MARKED))
-		goto out;
+static void umount_one(struct mount *m, struct list_head *to_umount)
+{
+	m->mnt.mnt_flags |= MNT_UMOUNT;
+	list_del_init(&m->mnt_child);
+	move_from_ns(m, to_umount);
+}
 
-	/* Verify topper is the only grandchild that has not been
-	 * speculatively unmounted.
-	 */
-	list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
-		if (child->mnt_mountpoint == mnt->mnt.mnt_root)
-			continue;
-		if (!list_empty(&child->mnt_umounting) && IS_MNT_MARKED(child))
-			continue;
-		/* Found a mounted child */
-		goto children;
-	}
+static void remove_from_candidate_list(struct mount *m)
+{
+	m->mnt.mnt_flags &= ~(MNT_MARKED | MNT_UMOUNT_CANDIDATE);
+	list_del_init(&m->mnt_list);
+}
 
-	/* Mark mounts that can be unmounted if not locked */
-	SET_MNT_MARK(mnt);
-	progress = true;
+static void gather_candidates(struct list_head *set,
+			      struct list_head *candidates)
+{
+	struct mount *m, *p, *q;
 
-	/* If a mount is without children and not locked umount it. */
-	if (!IS_MNT_LOCKED(mnt)) {
-		umount_one(mnt, to_umount);
-	} else {
-children:
-		list_move_tail(&mnt->mnt_umounting, to_restore);
+	list_for_each_entry(m, set, mnt_list) {
+		if (is_candidate(m))
+			continue;
+		m->mnt.mnt_flags |= MNT_UMOUNT_CANDIDATE;
+		p = m->mnt_parent;
+		q = propagation_next(p, p);
+		while (q) {
+			struct mount *child = __lookup_mnt(&q->mnt,
+							   m->mnt_mountpoint);
+			if (child) {
+				/*
+				 * We might've already run into this one.  That
+				 * must've happened on earlier iteration of the
+				 * outer loop; in that case we can skip those
+				 * parents that get propagation from q - there
+				 * will be nothing new on those as well.
+				 */
+				if (is_candidate(child)) {
+					q = skip_propagation_subtree(q, p);
+					continue;
+				}
+				child->mnt.mnt_flags |= MNT_UMOUNT_CANDIDATE;
+				if (!will_be_unmounted(child))
+					list_add(&child->mnt_list, candidates);
+			}
+			q = propagation_next(q, p);
+		}
 	}
-out:
-	return progress;
+	list_for_each_entry(m, set, mnt_list)
+		m->mnt.mnt_flags &= ~MNT_UMOUNT_CANDIDATE;
 }
 
-static void umount_list(struct list_head *to_umount,
-			struct list_head *to_restore)
+/*
+ * We know that some child of @m can't be unmounted.  In all places where the
+ * chain of descent of @m has child not overmounting the root of parent,
+ * the parent can't be unmounted either.
+ */
+static void trim_ancestors(struct mount *m)
 {
-	struct mount *mnt, *child, *tmp;
-	list_for_each_entry(mnt, to_umount, mnt_list) {
-		list_for_each_entry_safe(child, tmp, &mnt->mnt_mounts, mnt_child) {
-			/* topper? */
-			if (child->mnt_mountpoint == mnt->mnt.mnt_root)
-				list_move_tail(&child->mnt_umounting, to_restore);
-			else
-				umount_one(child, to_umount);
-		}
+	struct mount *p;
+
+	for (p = m->mnt_parent; is_candidate(p); m = p, p = p->mnt_parent) {
+		if (IS_MNT_MARKED(m))	// all candidates beneath are overmounts
+			return;
+		SET_MNT_MARK(m);
+		if (m != p->overmount)
+			p->mnt.mnt_flags &= ~MNT_UMOUNT_CANDIDATE;
 	}
 }
 
-static void restore_mounts(struct list_head *to_restore)
+/*
+ * Find and exclude all umount candidates forbidden by @m
+ * (see Documentation/filesystems/propagate_umount.txt)
+ * If we can immediately tell that @m is OK to unmount (unlocked
+ * and all children are already committed to unmounting) commit
+ * to unmounting it.
+ * Only @m itself might be taken from the candidates list;
+ * anything found by trim_ancestors() is marked non-candidate
+ * and left on the list.
+ */
+static void trim_one(struct mount *m, struct list_head *to_umount)
 {
-	/* Restore mounts to a clean working state */
-	while (!list_empty(to_restore)) {
-		struct mount *mnt, *parent;
-		struct mountpoint *mp;
-
-		mnt = list_first_entry(to_restore, struct mount, mnt_umounting);
-		CLEAR_MNT_MARK(mnt);
-		list_del_init(&mnt->mnt_umounting);
-
-		/* Should this mount be reparented? */
-		mp = mnt->mnt_mp;
-		parent = mnt->mnt_parent;
-		while (parent->mnt.mnt_flags & MNT_UMOUNT) {
-			mp = parent->mnt_mp;
-			parent = parent->mnt_parent;
-		}
-		if (parent != mnt->mnt_parent) {
-			mnt_change_mountpoint(parent, mp, mnt);
-			mnt_notify_add(mnt);
+	bool remove_this = false, found = false, umount_this = false;
+	struct mount *n;
+
+	if (!is_candidate(m)) { // trim_ancestors() left it on list
+		remove_from_candidate_list(m);
+		return;
+	}
+
+	list_for_each_entry(n, &m->mnt_mounts, mnt_child) {
+		if (!is_candidate(n)) {
+			found = true;
+			if (n != m->overmount) {
+				remove_this = true;
+				break;
+			}
 		}
 	}
+	if (found) {
+		trim_ancestors(m);
+	} else if (!IS_MNT_LOCKED(m) && list_empty(&m->mnt_mounts)) {
+		remove_this = true;
+		umount_this = true;
+	}
+	if (remove_this) {
+		remove_from_candidate_list(m);
+		if (umount_this)
+			umount_one(m, to_umount);
+	}
 }
 
-static void cleanup_umount_visitations(struct list_head *visited)
+static void handle_locked(struct mount *m, struct list_head *to_umount)
 {
-	while (!list_empty(visited)) {
-		struct mount *mnt =
-			list_first_entry(visited, struct mount, mnt_umounting);
-		list_del_init(&mnt->mnt_umounting);
+	struct mount *cutoff = m, *p;
+
+	if (!is_candidate(m)) { // trim_ancestors() left it on list
+		remove_from_candidate_list(m);
+		return;
+	}
+	for (p = m; is_candidate(p); p = p->mnt_parent) {
+		remove_from_candidate_list(p);
+		if (!IS_MNT_LOCKED(p))
+			cutoff = p->mnt_parent;
+	}
+	if (will_be_unmounted(p))
+		cutoff = p;
+	while (m != cutoff) {
+		umount_one(m, to_umount);
+		m = m->mnt_parent;
 	}
 }
 
 /*
- * collect all mounts that receive propagation from the mount in @list,
- * and return these additional mounts in the same list.
- * @list: the list of mounts to be unmounted.
+ * @m is not going away, and it overmounts the top of a stack of mounts
+ * that are going away.  We know that all of those are fully overmounted
+ * by the one above (@m being the topmost of the chain), so @m can be slid
+ * in place where the bottom of the stack is attached.
  *
- * vfsmount lock must be held for write
+ * NOTE: here we temporarily violate a constraint - two mounts end up with
+ * the same parent and mountpoint; that will be remedied as soon as we
+ * return from propagate_umount() - its caller (umount_tree()) will detach
+ * the stack from the parent it (and now @m) is attached to.  umount_tree()
+ * might choose to keep unmounted pieces stuck to each other, but it always
+ * detaches them from the mounts that remain in the tree.
  */
-int propagate_umount(struct list_head *list)
+static void reparent(struct mount *m)
 {
-	struct mount *mnt;
-	LIST_HEAD(to_restore);
-	LIST_HEAD(to_umount);
-	LIST_HEAD(visited);
-
-	/* Find candidates for unmounting */
-	list_for_each_entry_reverse(mnt, list, mnt_list) {
-		struct mount *parent = mnt->mnt_parent;
-		struct mount *m;
+	struct mount *p = m;
+	struct mountpoint *mp;
 
-		/*
-		 * If this mount has already been visited it is known that it's
-		 * entire peer group and all of their slaves in the propagation
-		 * tree for the mountpoint has already been visited and there is
-		 * no need to visit them again.
-		 */
-		if (!list_empty(&mnt->mnt_umounting))
-			continue;
+	do {
+		mp = p->mnt_mp;
+		p = p->mnt_parent;
+	} while (will_be_unmounted(p));
 
-		list_add_tail(&mnt->mnt_umounting, &visited);
-		for (m = propagation_next(parent, parent); m;
-		     m = propagation_next(m, parent)) {
-			struct mount *child = __lookup_mnt(&m->mnt,
-							   mnt->mnt_mountpoint);
-			if (!child)
-				continue;
+	mnt_change_mountpoint(p, mp, m);
+	mnt_notify_add(m);
+}
 
-			if (!list_empty(&child->mnt_umounting)) {
-				/*
-				 * If the child has already been visited it is
-				 * know that it's entire peer group and all of
-				 * their slaves in the propgation tree for the
-				 * mountpoint has already been visited and there
-				 * is no need to visit this subtree again.
-				 */
-				m = skip_propagation_subtree(m, parent);
-				continue;
-			} else if (child->mnt.mnt_flags & MNT_UMOUNT) {
-				/*
-				 * We have come across a partially unmounted
-				 * mount in a list that has not been visited
-				 * yet. Remember it has been visited and
-				 * continue about our merry way.
-				 */
-				list_add_tail(&child->mnt_umounting, &visited);
-				continue;
-			}
+/**
+ * propagate_umount - apply propagation rules to the set of mounts for umount()
+ * @set: the list of mounts to be unmounted.
+ *
+ * Collect all mounts that receive propagation from the mount in @set and have
+ * no obstacles to being unmounted.  Add these additional mounts to the set.
+ *
+ * See Documentation/filesystems/propagate_umount.txt if you do anything in
+ * this area.
+ *
+ * Locks held:
+ * mount_lock (write_seqlock), namespace_sem (exclusive).
+ */
+void propagate_umount(struct list_head *set)
+{
+	struct mount *m, *p;
+	LIST_HEAD(to_umount);	// committed to unmounting
+	LIST_HEAD(candidates);	// undecided umount candidates
 
-			/* Check the child and parents while progress is made */
-			while (__propagate_umount(child,
-						  &to_umount, &to_restore)) {
-				/* Is the parent a umount candidate? */
-				child = child->mnt_parent;
-				if (list_empty(&child->mnt_umounting))
-					break;
-			}
-		}
+	// collect all candidates
+	gather_candidates(set, &candidates);
+
+	// reduce the set until it's non-shifting
+	list_for_each_entry_safe(m, p, &candidates, mnt_list)
+		trim_one(m, &to_umount);
+
+	// ... and non-revealing
+	while (!list_empty(&candidates)) {
+		m = list_first_entry(&candidates,struct mount, mnt_list);
+		handle_locked(m, &to_umount);
 	}
 
-	umount_list(&to_umount, &to_restore);
-	restore_mounts(&to_restore);
-	cleanup_umount_visitations(&visited);
-	list_splice_tail(&to_umount, list);
+	// now to_umount consists of all acceptable candidates
+	// deal with reparenting of remaining overmounts on those
+	list_for_each_entry(m, &to_umount, mnt_list) {
+		if (m->overmount)
+			reparent(m->overmount);
+	}
 
-	return 0;
+	// and fold them into the set
+	list_splice_tail_init(&to_umount, set);
 }
diff --git a/fs/pnode.h b/fs/pnode.h
index 93fa9311bd07..04f1ac53aa49 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -42,7 +42,7 @@ static inline bool peers(const struct mount *m1, const struct mount *m2)
 void change_mnt_propagation(struct mount *, int);
 int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
 		struct hlist_head *);
-int propagate_umount(struct list_head *);
+void propagate_umount(struct list_head *);
 int propagate_mount_busy(struct mount *, int);
 void propagate_mount_unlock(struct mount *);
 void mnt_release_group_id(struct mount *);
diff --git a/include/linux/mount.h b/include/linux/mount.h
index c145820fcbbf..65fa8442c00a 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -40,6 +40,7 @@ enum mount_flags {
 
 	MNT_INTERNAL	= 0x4000,
 
+	MNT_UMOUNT_CANDIDATE	= 0x020000,
 	MNT_LOCK_ATIME		= 0x040000,
 	MNT_LOCK_NOEXEC		= 0x080000,
 	MNT_LOCK_NOSUID		= 0x100000,
@@ -66,7 +67,7 @@ enum mount_flags {
 
 	MNT_INTERNAL_FLAGS = MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL |
 			     MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED |
-			     MNT_LOCKED,
+			     MNT_LOCKED | MNT_UMOUNT_CANDIDATE,
 };
 
 struct vfsmount {
-- 
cgit v1.2.3


From 406fea79992561f47fd3511dd8b7c8abeeff7045 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 21 Jun 2025 18:06:19 -0400
Subject: mount: separate the flags accessed only under namespace_sem

Several flags are updated and checked only under namespace_sem; we are
already making use of that when we are checking them without mount_lock,
but we have to hold mount_lock for all updates, which makes things
clumsier than they have to be.

Take MNT_SHARED, MNT_UNBINDABLE, MNT_MARKED and MNT_UMOUNT_CANDIDATE
into a separate field (->mnt_t_flags), renaming them to T_SHARED,
etc. to avoid confusion.  All accesses must be under namespace_sem.

That changes locking requirements for mnt_change_propagation() and
set_mnt_shared() - only namespace_sem is needed now.  The same goes
for SET_MNT_MARKED et.al.

There might be more flags moved from ->mnt_flags to that field;
this is just the initial set.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/propagate_umount.txt | 12 ++++++------
 fs/mount.h                                     | 17 +++++++++++++++++
 fs/namespace.c                                 |  4 ----
 fs/pnode.c                                     | 22 ++++++++++------------
 fs/pnode.h                                     | 19 +++++++++++--------
 include/linux/mount.h                          | 18 ++----------------
 6 files changed, 46 insertions(+), 46 deletions(-)

(limited to 'include')

diff --git a/Documentation/filesystems/propagate_umount.txt b/Documentation/filesystems/propagate_umount.txt
index 6906903a8aa2..c90349e5b889 100644
--- a/Documentation/filesystems/propagate_umount.txt
+++ b/Documentation/filesystems/propagate_umount.txt
@@ -453,11 +453,11 @@ original set.
 So let's go for
 	* original set ("set").  Linkage via mnt_list
 	* undecided candidates ("candidates").  Subset of a list,
-consisting of all its elements marked with a new flag (MNT_UMOUNT_CANDIDATE).
+consisting of all its elements marked with a new flag (T_UMOUNT_CANDIDATE).
 Initially all elements of the list will be marked that way; in the
 end the list will become empty and no mounts will remain marked with
 that flag.
-	* Reuse MNT_MARKED for "has been already seen by trim_ancestors()".
+	* Reuse T_MARKED for "has been already seen by trim_ancestors()".
 	* anything in U that hadn't been in the original set - elements of
 candidates will gradually be either discarded or moved there.  In other
 words, it's the candidates we have already decided to unmount.	Its role
@@ -465,13 +465,13 @@ is reasonably close to the old "to_umount", so let's use that name.
 Linkage via mnt_list.
 
 For gather_candidates() we'll need to maintain both candidates (S -
-set) and intersection of S with set.  Use MNT_UMOUNT_CANDIDATE for
+set) and intersection of S with set.  Use T_UMOUNT_CANDIDATE for
 all elements we encounter, putting the ones not already in the original
 set into the list of candidates.  When we are done, strip that flag from
 all elements of the original set.  That gives a cheap way to check
 if element belongs to S (in gather_candidates) and to candidates
 itself (at later stages).  Call that predicate is_candidate(); it would
-be m->mnt_flags & MNT_UMOUNT_CANDIDATE.
+be m->mnt_t_flags & T_UMOUNT_CANDIDATE.
 
 All elements of the original set are marked with MNT_UMOUNT and we'll
 need the same for elements added when joining the contents of to_umount
@@ -480,5 +480,5 @@ to to_umount; that's close to what the old 'umount_one' is doing, so
 let's keep that name.  It also gives us another predicate we need -
 "belongs to union of set and to_umount"; will_be_unmounted() for now.
 
-Removals from the candidates list should strip both MNT_MARKED and
-MNT_UMOUNT_CANDIDATE; call it remove_from_candidates_list().
+Removals from the candidates list should strip both T_MARKED and
+T_UMOUNT_CANDIDATE; call it remove_from_candidates_list().
diff --git a/fs/mount.h b/fs/mount.h
index 4355c482a841..f299dc85446d 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -84,6 +84,7 @@ struct mount {
 	struct list_head to_notify;	/* need to queue notification */
 	struct mnt_namespace *prev_ns;	/* previous namespace (NULL if none) */
 #endif
+	int mnt_t_flags;		/* namespace_sem-protected flags */
 	int mnt_id;			/* mount identifier, reused */
 	u64 mnt_id_unique;		/* mount ID unique until reboot */
 	int mnt_group_id;		/* peer group identifier */
@@ -93,6 +94,22 @@ struct mount {
 	struct mount *overmount;	/* mounted on ->mnt_root */
 } __randomize_layout;
 
+enum {
+	T_SHARED		= 1, /* mount is shared */
+	T_UNBINDABLE		= 2, /* mount is unbindable */
+	T_MARKED		= 4, /* internal mark for propagate_... */
+	T_UMOUNT_CANDIDATE	= 8, /* for propagate_umount */
+
+	/*
+	 * T_SHARED_MASK is the set of flags that should be cleared when a
+	 * mount becomes shared.  Currently, this is only the flag that says a
+	 * mount cannot be bind mounted, since this is how we create a mount
+	 * that shares events with another mount.  If you add a new T_*
+	 * flag, consider how it interacts with shared mounts.
+	 */
+	T_SHARED_MASK	= T_UNBINDABLE,
+};
+
 #define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */
 
 static inline struct mount *real_mount(struct vfsmount *mnt)
diff --git a/fs/namespace.c b/fs/namespace.c
index 4bdf6a6e75ca..da27365418a5 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2917,10 +2917,8 @@ static int do_change_type(struct path *path, int ms_flags)
 			goto out_unlock;
 	}
 
-	lock_mount_hash();
 	for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
 		change_mnt_propagation(m, type);
-	unlock_mount_hash();
 
  out_unlock:
 	namespace_unlock();
@@ -3409,9 +3407,7 @@ static int do_set_group(struct path *from_path, struct path *to_path)
 	if (IS_MNT_SHARED(from)) {
 		to->mnt_group_id = from->mnt_group_id;
 		list_add(&to->mnt_share, &from->mnt_share);
-		lock_mount_hash();
 		set_mnt_shared(to);
-		unlock_mount_hash();
 	}
 
 	err = 0;
diff --git a/fs/pnode.c b/fs/pnode.c
index 827d71736ac5..b997663de6d0 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -112,7 +112,7 @@ static int do_make_slave(struct mount *mnt)
 }
 
 /*
- * vfsmount lock must be held for write
+ * EXCL[namespace_sem]
  */
 void change_mnt_propagation(struct mount *mnt, int type)
 {
@@ -125,9 +125,9 @@ void change_mnt_propagation(struct mount *mnt, int type)
 		list_del_init(&mnt->mnt_slave);
 		mnt->mnt_master = NULL;
 		if (type == MS_UNBINDABLE)
-			mnt->mnt.mnt_flags |= MNT_UNBINDABLE;
+			mnt->mnt_t_flags |= T_UNBINDABLE;
 		else
-			mnt->mnt.mnt_flags &= ~MNT_UNBINDABLE;
+			mnt->mnt_t_flags &= ~T_UNBINDABLE;
 	}
 }
 
@@ -263,9 +263,9 @@ static int propagate_one(struct mount *m, struct mountpoint *dest_mp)
 		return PTR_ERR(child);
 	read_seqlock_excl(&mount_lock);
 	mnt_set_mountpoint(m, dest_mp, child);
+	read_sequnlock_excl(&mount_lock);
 	if (m->mnt_master != dest_master)
 		SET_MNT_MARK(m->mnt_master);
-	read_sequnlock_excl(&mount_lock);
 	last_dest = m;
 	last_source = child;
 	hlist_add_head(&child->mnt_hash, list);
@@ -322,13 +322,11 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
 		} while (n != m);
 	}
 out:
-	read_seqlock_excl(&mount_lock);
 	hlist_for_each_entry(n, tree_list, mnt_hash) {
 		m = n->mnt_parent;
 		if (m->mnt_master != dest_mnt->mnt_master)
 			CLEAR_MNT_MARK(m->mnt_master);
 	}
-	read_sequnlock_excl(&mount_lock);
 	return ret;
 }
 
@@ -447,7 +445,7 @@ void propagate_mount_unlock(struct mount *mnt)
 
 static inline bool is_candidate(struct mount *m)
 {
-	return m->mnt.mnt_flags & MNT_UMOUNT_CANDIDATE;
+	return m->mnt_t_flags & T_UMOUNT_CANDIDATE;
 }
 
 static inline bool will_be_unmounted(struct mount *m)
@@ -464,7 +462,7 @@ static void umount_one(struct mount *m, struct list_head *to_umount)
 
 static void remove_from_candidate_list(struct mount *m)
 {
-	m->mnt.mnt_flags &= ~(MNT_MARKED | MNT_UMOUNT_CANDIDATE);
+	m->mnt_t_flags &= ~(T_MARKED | T_UMOUNT_CANDIDATE);
 	list_del_init(&m->mnt_list);
 }
 
@@ -476,7 +474,7 @@ static void gather_candidates(struct list_head *set,
 	list_for_each_entry(m, set, mnt_list) {
 		if (is_candidate(m))
 			continue;
-		m->mnt.mnt_flags |= MNT_UMOUNT_CANDIDATE;
+		m->mnt_t_flags |= T_UMOUNT_CANDIDATE;
 		p = m->mnt_parent;
 		q = propagation_next(p, p);
 		while (q) {
@@ -494,7 +492,7 @@ static void gather_candidates(struct list_head *set,
 					q = skip_propagation_subtree(q, p);
 					continue;
 				}
-				child->mnt.mnt_flags |= MNT_UMOUNT_CANDIDATE;
+				child->mnt_t_flags |= T_UMOUNT_CANDIDATE;
 				if (!will_be_unmounted(child))
 					list_add(&child->mnt_list, candidates);
 			}
@@ -502,7 +500,7 @@ static void gather_candidates(struct list_head *set,
 		}
 	}
 	list_for_each_entry(m, set, mnt_list)
-		m->mnt.mnt_flags &= ~MNT_UMOUNT_CANDIDATE;
+		m->mnt_t_flags &= ~T_UMOUNT_CANDIDATE;
 }
 
 /*
@@ -519,7 +517,7 @@ static void trim_ancestors(struct mount *m)
 			return;
 		SET_MNT_MARK(m);
 		if (m != p->overmount)
-			p->mnt.mnt_flags &= ~MNT_UMOUNT_CANDIDATE;
+			p->mnt_t_flags &= ~T_UMOUNT_CANDIDATE;
 	}
 }
 
diff --git a/fs/pnode.h b/fs/pnode.h
index 04f1ac53aa49..507e30e7a420 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -10,14 +10,14 @@
 #include <linux/list.h>
 #include "mount.h"
 
-#define IS_MNT_SHARED(m) ((m)->mnt.mnt_flags & MNT_SHARED)
+#define IS_MNT_SHARED(m) ((m)->mnt_t_flags & T_SHARED)
 #define IS_MNT_SLAVE(m) ((m)->mnt_master)
 #define IS_MNT_NEW(m) (!(m)->mnt_ns)
-#define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED)
-#define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE)
-#define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED)
-#define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED)
-#define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED)
+#define CLEAR_MNT_SHARED(m) ((m)->mnt_t_flags &= ~T_SHARED)
+#define IS_MNT_UNBINDABLE(m) ((m)->mnt_t_flags & T_UNBINDABLE)
+#define IS_MNT_MARKED(m) ((m)->mnt_t_flags & T_MARKED)
+#define SET_MNT_MARK(m) ((m)->mnt_t_flags |= T_MARKED)
+#define CLEAR_MNT_MARK(m) ((m)->mnt_t_flags &= ~T_MARKED)
 #define IS_MNT_LOCKED(m) ((m)->mnt.mnt_flags & MNT_LOCKED)
 
 #define CL_EXPIRE    		0x01
@@ -28,10 +28,13 @@
 #define CL_SHARED_TO_SLAVE	0x20
 #define CL_COPY_MNT_NS_FILE	0x40
 
+/*
+ * EXCL[namespace_sem]
+ */
 static inline void set_mnt_shared(struct mount *mnt)
 {
-	mnt->mnt.mnt_flags &= ~MNT_SHARED_MASK;
-	mnt->mnt.mnt_flags |= MNT_SHARED;
+	mnt->mnt_t_flags &= ~T_SHARED_MASK;
+	mnt->mnt_t_flags |= T_SHARED;
 }
 
 static inline bool peers(const struct mount *m1, const struct mount *m2)
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 65fa8442c00a..5f9c053b0897 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -35,12 +35,8 @@ enum mount_flags {
 	MNT_SHRINKABLE	= 0x100,
 	MNT_WRITE_HOLD	= 0x200,
 
-	MNT_SHARED	= 0x1000, /* if the vfsmount is a shared mount */
-	MNT_UNBINDABLE	= 0x2000, /* if the vfsmount is a unbindable mount */
-
 	MNT_INTERNAL	= 0x4000,
 
-	MNT_UMOUNT_CANDIDATE	= 0x020000,
 	MNT_LOCK_ATIME		= 0x040000,
 	MNT_LOCK_NOEXEC		= 0x080000,
 	MNT_LOCK_NOSUID		= 0x100000,
@@ -49,25 +45,15 @@ enum mount_flags {
 	MNT_LOCKED		= 0x800000,
 	MNT_DOOMED		= 0x1000000,
 	MNT_SYNC_UMOUNT		= 0x2000000,
-	MNT_MARKED		= 0x4000000,
 	MNT_UMOUNT		= 0x8000000,
 
-	/*
-	 * MNT_SHARED_MASK is the set of flags that should be cleared when a
-	 * mount becomes shared.  Currently, this is only the flag that says a
-	 * mount cannot be bind mounted, since this is how we create a mount
-	 * that shares events with another mount.  If you add a new MNT_*
-	 * flag, consider how it interacts with shared mounts.
-	 */
-	MNT_SHARED_MASK	= MNT_UNBINDABLE,
 	MNT_USER_SETTABLE_MASK  = MNT_NOSUID | MNT_NODEV | MNT_NOEXEC
 				  | MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME
 				  | MNT_READONLY | MNT_NOSYMFOLLOW,
 	MNT_ATIME_MASK = MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME,
 
-	MNT_INTERNAL_FLAGS = MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL |
-			     MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED |
-			     MNT_LOCKED | MNT_UMOUNT_CANDIDATE,
+	MNT_INTERNAL_FLAGS = MNT_WRITE_HOLD | MNT_INTERNAL | MNT_DOOMED |
+			     MNT_SYNC_UMOUNT | MNT_LOCKED
 };
 
 struct vfsmount {
-- 
cgit v1.2.3


From a0f26fcc383965e0522b81269062a9278bc802fe Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Fri, 27 Jun 2025 09:42:33 +0900
Subject: ata: libata: Remove ATA_DFLAG_ZAC device flag

The ATA device flag ATA_DFLAG_ZAC is used to indicate if a device is a
host managed or host aware zoned device. However, this flag is not used
in the hot path and only used during device scanning/revalidation and
for inquiry and sense SCSI command translation.

Save one bit from struct ata_device flags field by replacing this flag
with the internal helper function ata_dev_is_zac(). This function
returns true if the device class is ATA_DEV_ZAC (host managed ZAC device
case) or if its identify data reports it supports the zoned command set
(host aware ZAC device case).

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Niklas Cassel <cassel@kernel.org>
---
 drivers/ata/libata-core.c | 13 +------------
 drivers/ata/libata-scsi.c |  5 ++---
 drivers/ata/libata.h      |  7 +++++++
 include/linux/libata.h    |  1 -
 4 files changed, 10 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 79b20da0a256..3918ea624e0b 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -2421,18 +2421,7 @@ static void ata_dev_config_zac(struct ata_device *dev)
 	dev->zac_zones_optimal_nonseq = U32_MAX;
 	dev->zac_zones_max_open = U32_MAX;
 
-	/*
-	 * Always set the 'ZAC' flag for Host-managed devices.
-	 */
-	if (dev->class == ATA_DEV_ZAC)
-		dev->flags |= ATA_DFLAG_ZAC;
-	else if (ata_id_zoned_cap(dev->id) == 0x01)
-		/*
-		 * Check for host-aware devices.
-		 */
-		dev->flags |= ATA_DFLAG_ZAC;
-
-	if (!(dev->flags & ATA_DFLAG_ZAC))
+	if (!ata_dev_is_zac(dev))
 		return;
 
 	if (!ata_identify_page_supported(dev, ATA_LOG_ZONED_INFORMATION)) {
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index a21c9895408d..ccd7651710be 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -1923,8 +1923,7 @@ static unsigned int ata_scsiop_inq_00(struct ata_device *dev,
 	};
 
 	for (i = 0; i < sizeof(pages); i++) {
-		if (pages[i] == 0xb6 &&
-		    !(dev->flags & ATA_DFLAG_ZAC))
+		if (pages[i] == 0xb6 && !ata_dev_is_zac(dev))
 			continue;
 		rbuf[num_pages + 4] = pages[i];
 		num_pages++;
@@ -2181,7 +2180,7 @@ static unsigned int ata_scsiop_inq_b2(struct ata_device *dev,
 static unsigned int ata_scsiop_inq_b6(struct ata_device *dev,
 				      struct scsi_cmnd *cmd, u8 *rbuf)
 {
-	if (!(dev->flags & ATA_DFLAG_ZAC)) {
+	if (!ata_dev_is_zac(dev)) {
 		ata_scsi_set_invalid_field(dev, cmd, 2, 0xff);
 		return 0;
 	}
diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h
index ce5c628fa6fd..48ee7acb87af 100644
--- a/drivers/ata/libata.h
+++ b/drivers/ata/libata.h
@@ -44,6 +44,13 @@ static inline bool ata_sstatus_online(u32 sstatus)
 	return (sstatus & 0xf) == 0x3;
 }
 
+static inline bool ata_dev_is_zac(struct ata_device *dev)
+{
+	/* Host managed device or host aware device */
+	return dev->class == ATA_DEV_ZAC ||
+		ata_id_zoned_cap(dev->id) == 0x01;
+}
+
 #ifdef CONFIG_ATA_FORCE
 extern void ata_force_cbl(struct ata_port *ap);
 #else
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 1e5aec839041..721f0805b6c9 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -144,7 +144,6 @@ enum {
 	ATA_DFLAG_DEVSLP	= (1 << 27), /* device supports Device Sleep */
 	ATA_DFLAG_ACPI_DISABLED = (1 << 28), /* ACPI for the device is disabled */
 	ATA_DFLAG_D_SENSE	= (1 << 29), /* Descriptor sense requested */
-	ATA_DFLAG_ZAC		= (1 << 30), /* ZAC device */
 
 	ATA_DFLAG_FEATURES_MASK	= (ATA_DFLAG_TRUSTED | ATA_DFLAG_DA |	\
 				   ATA_DFLAG_DEVSLP | ATA_DFLAG_NCQ_SEND_RECV | \
-- 
cgit v1.2.3


From 2b89eb177c466bb1f84dff8db04d614b33a7ab95 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Thu, 19 Jun 2025 18:35:48 +0900
Subject: ata: libata: Improve LPM policies description

Improve the comment describing enum ata_lpm_policy and add comments
within that enum to describe each of the different possible values.
The enum values comments match the description given for the
CONFIG_SATA_MOBILE_LPM_POLICY config parameter.

No functional changes.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Niklas Cassel <cassel@kernel.org>
---
 include/linux/libata.h | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 721f0805b6c9..7462218312ad 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -499,16 +499,28 @@ enum ata_completion_errors {
 };
 
 /*
- * Link power management policy: If you alter this, you also need to
- * alter libata-sata.c (for the ascii descriptions)
+ * Link Power Management (LPM) policies.
+ *
+ * The default LPM policy to use for a device link is defined using these values
+ * with the CONFIG_SATA_MOBILE_LPM_POLICY config option and applied through the
+ * target_lpm_policy field of struct ata_port.
+ *
+ * If you alter this, you also need to alter the policy names used with the
+ * sysfs attribute link_power_management_policy defined in libata-sata.c.
  */
 enum ata_lpm_policy {
+	/* Keep firmware settings */
 	ATA_LPM_UNKNOWN,
+	/* No power savings (maximum performance) */
 	ATA_LPM_MAX_POWER,
+	/* HIPM (Partial) */
 	ATA_LPM_MED_POWER,
-	ATA_LPM_MED_POWER_WITH_DIPM, /* Med power + DIPM as win IRST does */
-	ATA_LPM_MIN_POWER_WITH_PARTIAL, /* Min Power + partial and slumber */
-	ATA_LPM_MIN_POWER, /* Min power + no partial (slumber only) */
+	/* HIPM (Partial) and DIPM (Partial and Slumber) */
+	ATA_LPM_MED_POWER_WITH_DIPM,
+	/* HIPM (Partial and DevSleep) and DIPM (Partial and Slumber) */
+	ATA_LPM_MIN_POWER_WITH_PARTIAL,
+	/* HIPM (Slumber and DevSleep) and DIPM (Partial and Slumber) */
+	ATA_LPM_MIN_POWER,
 };
 
 enum ata_lpm_hints {
-- 
cgit v1.2.3


From 04f541cef2dba78b17c83ad8b5a0742012399530 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Thu, 19 Jun 2025 12:21:51 +0300
Subject: media: v4l2-ctrls: Return the handler's error in
 v4l2_ctrl_handler_free()

v4l2_ctrl_handler_free() used to return void but changing this to int,
returning the handler's error code, enables the drivers to simply return
the handler's error in this common error handling pattern:

	if (handler->error)
		return v4l2_ctrl_handler_free(handler);

Suggested-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Reviewed-by: Hans Verkuil <hverkuil@xs4all.nl>
Signed-off-by: Hans Verkuil <hverkuil@xs4all.nl>
---
 drivers/media/v4l2-core/v4l2-ctrls-core.c | 11 ++++++++---
 include/media/v4l2-ctrls.h                |  4 +++-
 2 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/media/v4l2-core/v4l2-ctrls-core.c b/drivers/media/v4l2-core/v4l2-ctrls-core.c
index d28596c720d8..98b960775e87 100644
--- a/drivers/media/v4l2-core/v4l2-ctrls-core.c
+++ b/drivers/media/v4l2-core/v4l2-ctrls-core.c
@@ -1631,14 +1631,17 @@ int v4l2_ctrl_handler_init_class(struct v4l2_ctrl_handler *hdl,
 EXPORT_SYMBOL(v4l2_ctrl_handler_init_class);
 
 /* Free all controls and control refs */
-void v4l2_ctrl_handler_free(struct v4l2_ctrl_handler *hdl)
+int v4l2_ctrl_handler_free(struct v4l2_ctrl_handler *hdl)
 {
 	struct v4l2_ctrl_ref *ref, *next_ref;
 	struct v4l2_ctrl *ctrl, *next_ctrl;
 	struct v4l2_subscribed_event *sev, *next_sev;
 
-	if (hdl == NULL || hdl->buckets == NULL)
-		return;
+	if (!hdl)
+		return 0;
+
+	if (!hdl->buckets)
+		return hdl->error;
 
 	v4l2_ctrl_handler_free_request(hdl);
 
@@ -1663,6 +1666,8 @@ void v4l2_ctrl_handler_free(struct v4l2_ctrl_handler *hdl)
 	hdl->cached = NULL;
 	mutex_unlock(hdl->lock);
 	mutex_destroy(&hdl->_lock);
+
+	return hdl->error;
 }
 EXPORT_SYMBOL(v4l2_ctrl_handler_free);
 
diff --git a/include/media/v4l2-ctrls.h b/include/media/v4l2-ctrls.h
index 3a87096e064f..c32c46286441 100644
--- a/include/media/v4l2-ctrls.h
+++ b/include/media/v4l2-ctrls.h
@@ -579,8 +579,10 @@ int v4l2_ctrl_handler_init_class(struct v4l2_ctrl_handler *hdl,
  * @hdl:	The control handler.
  *
  * Does nothing if @hdl == NULL.
+ *
+ * Return: @hdl's error field or 0 if @hdl is NULL.
  */
-void v4l2_ctrl_handler_free(struct v4l2_ctrl_handler *hdl);
+int v4l2_ctrl_handler_free(struct v4l2_ctrl_handler *hdl);
 
 /**
  * v4l2_ctrl_lock() - Helper function to lock the handler
-- 
cgit v1.2.3


From 08ad63bbd681ae4eeb50644564435035c38e5795 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Fri, 20 Jun 2025 14:58:01 +0200
Subject: gpio: constify arguments of gpiod_is_equal()

This function is not meant to modify the GPIO descriptors in any way so
we can safely constify both arguments.

Link: https://lore.kernel.org/r/20250620-gpiod-is-equal-improv-v1-1-a75060505d2c@linaro.org
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 drivers/gpio/gpiolib.c        | 2 +-
 include/linux/gpio/consumer.h | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 5b0b4fc97543..6b4f94c3887f 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -286,7 +286,7 @@ EXPORT_SYMBOL_GPL(gpiod_to_gpio_device);
  * Returns:
  * True if the descriptors refer to the same physical pin. False otherwise.
  */
-bool gpiod_is_equal(struct gpio_desc *desc, struct gpio_desc *other)
+bool gpiod_is_equal(const struct gpio_desc *desc, const struct gpio_desc *other)
 {
 	return desc == other;
 }
diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h
index f0b1982da0cc..00df68c51405 100644
--- a/include/linux/gpio/consumer.h
+++ b/include/linux/gpio/consumer.h
@@ -181,7 +181,8 @@ struct gpio_desc *devm_fwnode_gpiod_get_index(struct device *dev,
 					      enum gpiod_flags flags,
 					      const char *label);
 
-bool gpiod_is_equal(struct gpio_desc *desc, struct gpio_desc *other);
+bool gpiod_is_equal(const struct gpio_desc *desc,
+		    const struct gpio_desc *other);
 
 #else /* CONFIG_GPIOLIB */
 
@@ -551,7 +552,7 @@ struct gpio_desc *devm_fwnode_gpiod_get_index(struct device *dev,
 }
 
 static inline bool
-gpiod_is_equal(struct gpio_desc *desc, struct gpio_desc *other)
+gpiod_is_equal(const struct gpio_desc *desc, const struct gpio_desc *other)
 {
 	WARN_ON(desc || other);
 	return false;
-- 
cgit v1.2.3


From 62b5848f73dd4f8ae17304dae54562d0c9ecdd3d Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Tue, 24 Jun 2025 16:32:20 +0200
Subject: power: sequencing: add defines for return values of the match()
 callback

Instead of using 0 and 1 as magic numbers, let's add proper defines
whose names tell the reader what the meaning behind them is.

Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://lore.kernel.org/r/20250624-pwrseq-match-defines-v1-3-a59d90a951f1@linaro.org
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 drivers/power/sequencing/core.c | 6 +++---
 include/linux/pwrseq/provider.h | 3 +++
 2 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/power/sequencing/core.c b/drivers/power/sequencing/core.c
index 0ffc259c6bb6..190564e55988 100644
--- a/drivers/power/sequencing/core.c
+++ b/drivers/power/sequencing/core.c
@@ -628,7 +628,7 @@ static int pwrseq_match_device(struct device *pwrseq_dev, void *data)
 		return 0;
 
 	ret = pwrseq->match(pwrseq, match_data->dev);
-	if (ret <= 0)
+	if (ret == PWRSEQ_NO_MATCH || ret < 0)
 		return ret;
 
 	/* We got the matching device, let's find the right target. */
@@ -651,7 +651,7 @@ static int pwrseq_match_device(struct device *pwrseq_dev, void *data)
 
 	match_data->desc->pwrseq = pwrseq_device_get(pwrseq);
 
-	return 1;
+	return PWRSEQ_MATCH_OK;
 }
 
 /**
@@ -684,7 +684,7 @@ struct pwrseq_desc *pwrseq_get(struct device *dev, const char *target)
 			       pwrseq_match_device);
 	if (ret < 0)
 		return ERR_PTR(ret);
-	if (ret == 0)
+	if (ret == PWRSEQ_NO_MATCH)
 		/* No device matched. */
 		return ERR_PTR(-EPROBE_DEFER);
 
diff --git a/include/linux/pwrseq/provider.h b/include/linux/pwrseq/provider.h
index cbc3607cbfcf..33b3d2c2e39d 100644
--- a/include/linux/pwrseq/provider.h
+++ b/include/linux/pwrseq/provider.h
@@ -13,6 +13,9 @@ struct pwrseq_device;
 typedef int (*pwrseq_power_state_func)(struct pwrseq_device *);
 typedef int (*pwrseq_match_func)(struct pwrseq_device *, struct device *);
 
+#define PWRSEQ_NO_MATCH 0
+#define PWRSEQ_MATCH_OK 1
+
 /**
  * struct pwrseq_unit_data - Configuration of a single power sequencing
  *                           unit.
-- 
cgit v1.2.3


From b1c26e059536d8acbf9d508374f4b76537e20fb7 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Tue, 10 Jun 2025 15:58:16 -0500
Subject: Move FCH header to a location accessible by all archs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A new header fch.h was created to store registers used by different AMD
drivers.  This header was included by i2c-piix4 in
commit 624b0d5696a8 ("i2c: piix4, x86/platform: Move the SB800 PIIX4 FCH
definitions to <asm/amd/fch.h>"). To prevent compile failures on non-x86
archs i2c-piix4 was set to only compile on x86 by commit 7e173eb82ae9717
("i2c: piix4: Make CONFIG_I2C_PIIX4 dependent on CONFIG_X86").
This was not a good decision because loongarch and mips both actually
support i2c-piix4 and set it enabled in the defconfig.

Move the header to a location accessible by all architectures.

Fixes: 624b0d5696a89 ("i2c: piix4, x86/platform: Move the SB800 PIIX4 FCH definitions to <asm/amd/fch.h>")
Suggested-by: Hans de Goede <hansg@kernel.org>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Hans de Goede <hansg@kernel.org>
Link: https://lore.kernel.org/r/20250610205817.3912944-1-superm1@kernel.org
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 arch/x86/include/asm/amd/fch.h            | 13 -------------
 arch/x86/kernel/cpu/amd.c                 |  2 +-
 drivers/i2c/busses/i2c-piix4.c            |  2 +-
 drivers/platform/x86/amd/pmc/pmc-quirks.c |  2 +-
 include/linux/platform_data/x86/amd-fch.h | 13 +++++++++++++
 5 files changed, 16 insertions(+), 16 deletions(-)
 delete mode 100644 arch/x86/include/asm/amd/fch.h
 create mode 100644 include/linux/platform_data/x86/amd-fch.h

(limited to 'include')

diff --git a/arch/x86/include/asm/amd/fch.h b/arch/x86/include/asm/amd/fch.h
deleted file mode 100644
index 2cf5153edbc2..000000000000
--- a/arch/x86/include/asm/amd/fch.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_AMD_FCH_H_
-#define _ASM_X86_AMD_FCH_H_
-
-#define FCH_PM_BASE			0xFED80300
-
-/* Register offsets from PM base: */
-#define FCH_PM_DECODEEN			0x00
-#define FCH_PM_DECODEEN_SMBUS0SEL	GENMASK(20, 19)
-#define FCH_PM_SCRATCH			0x80
-#define FCH_PM_S5_RESET_STATUS		0xC0
-
-#endif /* _ASM_X86_AMD_FCH_H_ */
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 93da466dfe2c..9543d5de4e7d 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -9,7 +9,7 @@
 #include <linux/sched/clock.h>
 #include <linux/random.h>
 #include <linux/topology.h>
-#include <asm/amd/fch.h>
+#include <linux/platform_data/x86/amd-fch.h>
 #include <asm/processor.h>
 #include <asm/apic.h>
 #include <asm/cacheinfo.h>
diff --git a/drivers/i2c/busses/i2c-piix4.c b/drivers/i2c/busses/i2c-piix4.c
index 9d3a4dc2bd60..ac3bb550303f 100644
--- a/drivers/i2c/busses/i2c-piix4.c
+++ b/drivers/i2c/busses/i2c-piix4.c
@@ -34,7 +34,7 @@
 #include <linux/dmi.h>
 #include <linux/acpi.h>
 #include <linux/io.h>
-#include <asm/amd/fch.h>
+#include <linux/platform_data/x86/amd-fch.h>
 
 #include "i2c-piix4.h"
 
diff --git a/drivers/platform/x86/amd/pmc/pmc-quirks.c b/drivers/platform/x86/amd/pmc/pmc-quirks.c
index f292111bd065..131f10b68308 100644
--- a/drivers/platform/x86/amd/pmc/pmc-quirks.c
+++ b/drivers/platform/x86/amd/pmc/pmc-quirks.c
@@ -11,7 +11,7 @@
 #include <linux/dmi.h>
 #include <linux/io.h>
 #include <linux/ioport.h>
-#include <asm/amd/fch.h>
+#include <linux/platform_data/x86/amd-fch.h>
 
 #include "pmc.h"
 
diff --git a/include/linux/platform_data/x86/amd-fch.h b/include/linux/platform_data/x86/amd-fch.h
new file mode 100644
index 000000000000..2cf5153edbc2
--- /dev/null
+++ b/include/linux/platform_data/x86/amd-fch.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_AMD_FCH_H_
+#define _ASM_X86_AMD_FCH_H_
+
+#define FCH_PM_BASE			0xFED80300
+
+/* Register offsets from PM base: */
+#define FCH_PM_DECODEEN			0x00
+#define FCH_PM_DECODEEN_SMBUS0SEL	GENMASK(20, 19)
+#define FCH_PM_SCRATCH			0x80
+#define FCH_PM_S5_RESET_STATUS		0xC0
+
+#endif /* _ASM_X86_AMD_FCH_H_ */
-- 
cgit v1.2.3


From 37d2aa62138daa8ecb6442ae4753704e9c92346f Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Tue, 24 Jun 2025 13:28:41 +0100
Subject: ASoC: SDCA: Minor selected/detected mode control fixups

Make the names a slightly better match for the specification and add
some constants for the values rather than hard coding.

Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.dev>
Link: https://patch.msgid.link/20250624122844.2761627-5-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sdca_function.h | 9 +++++++++
 sound/soc/sdca/sdca_asoc.c    | 8 ++++----
 2 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/sound/sdca_function.h b/include/sound/sdca_function.h
index 856b0f40ce5e..4b278513597e 100644
--- a/include/sound/sdca_function.h
+++ b/include/sound/sdca_function.h
@@ -319,6 +319,15 @@ enum sdca_selected_mode_range {
 	SDCA_SELECTED_MODE_NCOLS			= 2,
 };
 
+/**
+ * enum sdca_detected_mode_values - Predefined GE Detected Mode values
+ */
+enum sdca_detected_mode_values {
+	SDCA_DETECTED_MODE_JACK_UNPLUGGED		= 0,
+	SDCA_DETECTED_MODE_JACK_UNKNOWN			= 1,
+	SDCA_DETECTED_MODE_DETECTION_IN_PROGRESS	= 2,
+};
+
 /**
  * enum sdca_spe_controls - SDCA Controls for Security & Privacy Unit
  *
diff --git a/sound/soc/sdca/sdca_asoc.c b/sound/soc/sdca/sdca_asoc.c
index 83911dab73ae..dd7b19083c85 100644
--- a/sound/soc/sdca/sdca_asoc.c
+++ b/sound/soc/sdca/sdca_asoc.c
@@ -246,12 +246,12 @@ static int entity_early_parse_ge(struct device *dev,
 	if (!values)
 		return -ENOMEM;
 
-	texts[0] = "No Jack";
+	texts[0] = "Jack Unplugged";
 	texts[1] = "Jack Unknown";
 	texts[2] = "Detection in Progress";
-	values[0] = 0;
-	values[1] = 1;
-	values[2] = 2;
+	values[0] = SDCA_DETECTED_MODE_JACK_UNPLUGGED;
+	values[1] = SDCA_DETECTED_MODE_JACK_UNKNOWN;
+	values[2] = SDCA_DETECTED_MODE_DETECTION_IN_PROGRESS;
 	for (i = 0; i < range->rows; i++) {
 		enum sdca_terminal_type type;
 
-- 
cgit v1.2.3


From 775f5729b47d8737f4f98e0141f61b3358245398 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Tue, 24 Jun 2025 13:28:42 +0100
Subject: ASoC: SDCA: Add flag for unused IRQs

Zero is a valid SDCA IRQ interrupt position so add a special value to
indicate that the IRQ is not used.

Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.dev>
Link: https://patch.msgid.link/20250624122844.2761627-6-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sdca_function.h   | 2 ++
 sound/soc/sdca/sdca_functions.c | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/sound/sdca_function.h b/include/sound/sdca_function.h
index 4b278513597e..b4a97ff08729 100644
--- a/include/sound/sdca_function.h
+++ b/include/sound/sdca_function.h
@@ -17,6 +17,8 @@ struct device;
 struct sdca_entity;
 struct sdca_function_desc;
 
+#define SDCA_NO_INTERRUPT -1
+
 /*
  * The addressing space for SDCA relies on 7 bits for Entities, so a
  * maximum of 128 Entities per function can be represented.
diff --git a/sound/soc/sdca/sdca_functions.c b/sound/soc/sdca/sdca_functions.c
index 093c681e9387..c34f3bf62983 100644
--- a/sound/soc/sdca/sdca_functions.c
+++ b/sound/soc/sdca/sdca_functions.c
@@ -912,6 +912,8 @@ static int find_sdca_entity_control(struct device *dev, struct sdca_entity *enti
 				       &tmp);
 	if (!ret)
 		control->interrupt_position = tmp;
+	else
+		control->interrupt_position = SDCA_NO_INTERRUPT;
 
 	control->label = find_sdca_control_label(dev, entity, control);
 	if (!control->label)
-- 
cgit v1.2.3


From b126394d9ec6f9d8322cf392ba23d4a5f96faf5a Mon Sep 17 00:00:00 2001
From: Maciej Strozek <mstrozek@opensource.cirrus.com>
Date: Tue, 24 Jun 2025 13:28:43 +0100
Subject: ASoC: SDCA: Generic interrupt support

Add a library supporting usage of SDCA interrupts, using regmap irq
framework. The library adds functions for parsing ACPI for
interrupt-related information, configuring irq chip and requesting
individual irqs. Calling code (SDCA function code) is expected to also
substitute the library's base irq handler for its own, appropriate
callback.

Signed-off-by: Maciej Strozek <mstrozek@opensource.cirrus.com>
Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.dev>
Link: https://patch.msgid.link/20250624122844.2761627-7-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sdca_interrupts.h  |  75 +++++++++++
 sound/soc/sdca/Kconfig           |   7 +
 sound/soc/sdca/Makefile          |   5 +-
 sound/soc/sdca/sdca_interrupts.c | 284 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 369 insertions(+), 2 deletions(-)
 create mode 100644 include/sound/sdca_interrupts.h
 create mode 100644 sound/soc/sdca/sdca_interrupts.c

(limited to 'include')

diff --git a/include/sound/sdca_interrupts.h b/include/sound/sdca_interrupts.h
new file mode 100644
index 000000000000..4cda8b75bae0
--- /dev/null
+++ b/include/sound/sdca_interrupts.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * The MIPI SDCA specification is available for public downloads at
+ * https://www.mipi.org/mipi-sdca-v1-0-download
+ *
+ * Copyright (C) 2025 Cirrus Logic, Inc. and
+ *                    Cirrus Logic International Semiconductor Ltd.
+ */
+
+#ifndef __SDCA_INTERRUPTS_H__
+#define __SDCA_INTERRUPTS_H__
+
+#include <linux/interrupt.h>
+#include <linux/mutex.h>
+#include <linux/regmap.h>
+
+struct device;
+struct snd_soc_component;
+struct sdca_function_data;
+
+#define SDCA_MAX_INTERRUPTS 31 /* the last bit is reserved for future extensions */
+
+/**
+ * struct sdca_interrupt - contains information about a single SDCA interrupt
+ * @name: The name of the interrupt.
+ * @component: Pointer to the ASoC component that owns the interrupt.
+ * @function: Pointer to the Function that the interrupt is associated with.
+ * @entity: Pointer to the Entity that the interrupt is associated with.
+ * @control: Pointer to the Control that the interrupt is associated with.
+ * @externally_requested: Internal flag used to check if a client driver has
+ * already requested the interrupt, for custom handling, allowing the core to
+ * skip handling this interrupt.
+ */
+struct sdca_interrupt {
+	const char *name;
+
+	struct snd_soc_component *component;
+	struct sdca_function_data *function;
+	struct sdca_entity *entity;
+	struct sdca_control *control;
+
+	bool externally_requested;
+};
+
+/**
+ * struct sdca_interrupt_info - contains top-level SDCA interrupt information
+ * @irq_chip: regmap irq chip structure.
+ * @irq_data: regmap irq chip data structure.
+ * @irqs: Array of data for each individual IRQ.
+ * @irq_lock: Protects access to the list of sdca_interrupt structures.
+ */
+struct sdca_interrupt_info {
+	struct regmap_irq_chip irq_chip;
+	struct regmap_irq_chip_data *irq_data;
+
+	struct sdca_interrupt irqs[SDCA_MAX_INTERRUPTS];
+
+	struct mutex irq_lock; /* Protect irqs list across functions */
+};
+
+int sdca_irq_request(struct device *dev, struct sdca_interrupt_info *interrupt_info,
+		     int sdca_irq, const char *name, irq_handler_t handler,
+		     void *data);
+int sdca_irq_data_populate(struct snd_soc_component *component,
+			   struct sdca_function_data *function,
+			   struct sdca_entity *entity,
+			   struct sdca_control *control,
+			   struct sdca_interrupt *interrupt);
+int sdca_irq_populate(struct sdca_function_data *function,
+		      struct snd_soc_component *component,
+		      struct sdca_interrupt_info *info);
+struct sdca_interrupt_info *sdca_irq_allocate(struct device *dev,
+					      struct regmap *regmap, int irq);
+
+#endif
diff --git a/sound/soc/sdca/Kconfig b/sound/soc/sdca/Kconfig
index ec28855fe3b0..a633d5a0fea5 100644
--- a/sound/soc/sdca/Kconfig
+++ b/sound/soc/sdca/Kconfig
@@ -13,3 +13,10 @@ config SND_SOC_SDCA_OPTIONAL
 config SND_SOC_SDCA_HID
 	tristate "SDCA HID support"
 	depends on SND_SOC_SDCA && HID
+
+config SND_SOC_SDCA_IRQ
+	tristate
+	select REGMAP
+	select REGMAP_IRQ
+	help
+	  This option enables support for SDCA IRQs.
diff --git a/sound/soc/sdca/Makefile b/sound/soc/sdca/Makefile
index 9af46e7edfd2..2a3938d11ca9 100644
--- a/sound/soc/sdca/Makefile
+++ b/sound/soc/sdca/Makefile
@@ -1,8 +1,9 @@
 # SPDX-License-Identifier: GPL-2.0-only
 
 snd-soc-sdca-y	:= sdca_functions.o sdca_device.o sdca_regmap.o sdca_asoc.o
-
 snd-soc-sdca-hid-y := sdca_hid.o
+snd-soc-sdca-irq-y := sdca_interrupts.o
 
-obj-$(CONFIG_SND_SOC_SDCA_HID)	+= snd-soc-sdca-hid.o
 obj-$(CONFIG_SND_SOC_SDCA)	+= snd-soc-sdca.o
+obj-$(CONFIG_SND_SOC_SDCA_HID)	+= snd-soc-sdca-hid.o
+obj-$(CONFIG_SND_SOC_SDCA_IRQ)	+= snd-soc-sdca-irq.o
diff --git a/sound/soc/sdca/sdca_interrupts.c b/sound/soc/sdca/sdca_interrupts.c
new file mode 100644
index 000000000000..7272d11cb6d4
--- /dev/null
+++ b/sound/soc/sdca/sdca_interrupts.c
@@ -0,0 +1,284 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2025 Cirrus Logic, Inc. and
+//                    Cirrus Logic International Semiconductor Ltd.
+
+/*
+ * The MIPI SDCA specification is available for public downloads at
+ * https://www.mipi.org/mipi-sdca-v1-0-download
+ */
+
+#include <linux/bits.h>
+#include <linux/cleanup.h>
+#include <linux/device.h>
+#include <linux/interrupt.h>
+#include <linux/regmap.h>
+#include <linux/soundwire/sdw.h>
+#include <linux/soundwire/sdw_registers.h>
+#include <sound/sdca.h>
+#include <sound/sdca_function.h>
+#include <sound/sdca_interrupts.h>
+#include <sound/soc-component.h>
+
+#define IRQ_SDCA(number) REGMAP_IRQ_REG(number, ((number) / BITS_PER_BYTE), \
+					SDW_SCP_SDCA_INTMASK_SDCA_##number)
+
+static const struct regmap_irq regmap_irqs[SDCA_MAX_INTERRUPTS] = {
+	IRQ_SDCA(0),
+	IRQ_SDCA(1),
+	IRQ_SDCA(2),
+	IRQ_SDCA(3),
+	IRQ_SDCA(4),
+	IRQ_SDCA(5),
+	IRQ_SDCA(6),
+	IRQ_SDCA(7),
+	IRQ_SDCA(8),
+	IRQ_SDCA(9),
+	IRQ_SDCA(10),
+	IRQ_SDCA(11),
+	IRQ_SDCA(12),
+	IRQ_SDCA(13),
+	IRQ_SDCA(14),
+	IRQ_SDCA(15),
+	IRQ_SDCA(16),
+	IRQ_SDCA(17),
+	IRQ_SDCA(18),
+	IRQ_SDCA(19),
+	IRQ_SDCA(20),
+	IRQ_SDCA(21),
+	IRQ_SDCA(22),
+	IRQ_SDCA(23),
+	IRQ_SDCA(24),
+	IRQ_SDCA(25),
+	IRQ_SDCA(26),
+	IRQ_SDCA(27),
+	IRQ_SDCA(28),
+	IRQ_SDCA(29),
+	IRQ_SDCA(30),
+};
+
+static const struct regmap_irq_chip sdca_irq_chip = {
+	.name = "sdca_irq",
+
+	.status_base = SDW_SCP_SDCA_INT1,
+	.unmask_base = SDW_SCP_SDCA_INTMASK1,
+	.ack_base = SDW_SCP_SDCA_INT1,
+	.num_regs = 4,
+
+	.irqs = regmap_irqs,
+	.num_irqs = SDCA_MAX_INTERRUPTS,
+
+	.runtime_pm = true,
+};
+
+static irqreturn_t base_handler(int irq, void *data)
+{
+	struct sdca_interrupt *interrupt = data;
+	struct device *dev = interrupt->component->dev;
+
+	dev_info(dev, "%s irq without full handling\n", interrupt->name);
+
+	return IRQ_HANDLED;
+}
+
+static int sdca_irq_request_locked(struct device *dev,
+				   struct sdca_interrupt_info *info,
+				   int sdca_irq, const char *name,
+				   irq_handler_t handler, void *data)
+{
+	int irq;
+	int ret;
+
+	irq = regmap_irq_get_virq(info->irq_data, sdca_irq);
+	if (irq < 0)
+		return irq;
+
+	ret = devm_request_threaded_irq(dev, irq, NULL, handler,
+					IRQF_ONESHOT, name, data);
+	if (ret)
+		return ret;
+
+	dev_dbg(dev, "requested irq %d for %s\n", irq, name);
+
+	return 0;
+}
+
+/**
+ * sdca_irq_request - request an individual SDCA interrupt
+ * @dev: Pointer to the struct device against which things should be allocated.
+ * @info: Pointer to the interrupt information structure.
+ * @sdca_irq: SDCA interrupt position, from 0 to SDCA_MAX_INTERRUPTS - 1.
+ * @name: Name to be given to the IRQ.
+ * @handler: A callback thread function to be called for the IRQ.
+ * @data: Private data pointer that will be passed to the handler.
+ *
+ * Typically this is handled internally by sdca_irq_populate, however if
+ * a device requires custom IRQ handling this can be called manually before
+ * calling sdca_irq_populate, which will then skip that IRQ whilst processing.
+ *
+ * Return: Zero on success, and a negative error code on failure.
+ */
+int sdca_irq_request(struct device *dev, struct sdca_interrupt_info *info,
+		     int sdca_irq, const char *name, irq_handler_t handler,
+		     void *data)
+{
+	int ret;
+
+	if (sdca_irq < 0 || sdca_irq >= SDCA_MAX_INTERRUPTS) {
+		dev_err(dev, "bad irq request: %d\n", sdca_irq);
+		return -EINVAL;
+	}
+
+	guard(mutex)(&info->irq_lock);
+
+	ret = sdca_irq_request_locked(dev, info, sdca_irq, name, handler, data);
+	if (ret) {
+		dev_err(dev, "failed to request irq %s: %d\n", name, ret);
+		return ret;
+	}
+
+	info->irqs[sdca_irq].externally_requested = true;
+
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(sdca_irq_request, "SND_SOC_SDCA_IRQ");
+
+/**
+ * sdca_irq_data_populate - Populate common interrupt data
+ * @component: Pointer to the ASoC component for the Function.
+ * @function: Pointer to the SDCA Function.
+ * @entity: Pointer to the SDCA Entity.
+ * @control: Pointer to the SDCA Control.
+ * @interrupt: Pointer to the SDCA interrupt for this IRQ.
+ *
+ * Return: Zero on success, and a negative error code on failure.
+ */
+int sdca_irq_data_populate(struct snd_soc_component *component,
+			   struct sdca_function_data *function,
+			   struct sdca_entity *entity,
+			   struct sdca_control *control,
+			   struct sdca_interrupt *interrupt)
+{
+	struct device *dev = component->dev;
+	const char *name;
+
+	name = devm_kasprintf(dev, GFP_KERNEL, "%s %s %s", function->desc->name,
+			      entity->label, control->label);
+	if (!name)
+		return -ENOMEM;
+
+	interrupt->name = name;
+	interrupt->component = component;
+	interrupt->function = function;
+	interrupt->entity = entity;
+	interrupt->control = control;
+
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(sdca_irq_data_populate, "SND_SOC_SDCA_IRQ");
+
+/**
+ * sdca_irq_populate - Request all the individual IRQs for an SDCA Function
+ * @function: Pointer to the SDCA Function.
+ * @component: Pointer to the ASoC component for the Function.
+ * @info: Pointer to the SDCA interrupt info for this device.
+ *
+ * Typically this would be called from the driver for a single SDCA Function.
+ *
+ * Return: Zero on success, and a negative error code on failure.
+ */
+int sdca_irq_populate(struct sdca_function_data *function,
+		      struct snd_soc_component *component,
+		      struct sdca_interrupt_info *info)
+{
+	struct device *dev = component->dev;
+	int i, j;
+
+	guard(mutex)(&info->irq_lock);
+
+	for (i = 0; i < function->num_entities; i++) {
+		struct sdca_entity *entity = &function->entities[i];
+
+		for (j = 0; j < entity->num_controls; j++) {
+			struct sdca_control *control = &entity->controls[j];
+			int irq = control->interrupt_position;
+			struct sdca_interrupt *interrupt;
+			const char *name;
+			int ret;
+
+			if (irq == SDCA_NO_INTERRUPT) {
+				continue;
+			} else if (irq < 0 || irq >= SDCA_MAX_INTERRUPTS) {
+				dev_err(dev, "bad irq position: %d\n", irq);
+				return -EINVAL;
+			}
+
+			interrupt = &info->irqs[irq];
+
+			if (interrupt->externally_requested) {
+				dev_dbg(dev,
+					"skipping irq %d, externally requested\n",
+					irq);
+				continue;
+			}
+
+			ret = sdca_irq_data_populate(component, function, entity,
+						     control, interrupt);
+			if (ret)
+				return ret;
+
+			ret = sdca_irq_request_locked(dev, info, irq, interrupt->name,
+						      base_handler, interrupt);
+			if (ret) {
+				dev_err(dev, "failed to request irq %s: %d\n",
+					name, ret);
+				return ret;
+			}
+		}
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(sdca_irq_populate, "SND_SOC_SDCA_IRQ");
+
+/**
+ * sdca_irq_allocate - allocate an SDCA interrupt structure for a device
+ * @dev: Device pointer against which things should be allocated.
+ * @regmap: regmap to be used for accessing the SDCA IRQ registers.
+ * @irq: The interrupt number.
+ *
+ * Typically called from the top level driver for the whole SDCA device, as
+ * only a single instance is required across all Functions on the device.
+ *
+ * Return: The allocated sdca_interrupt_info struct, or an ERR_PTR() on error.
+ */
+struct sdca_interrupt_info *sdca_irq_allocate(struct device *dev,
+					      struct regmap *regmap, int irq)
+{
+	struct sdca_interrupt_info *info;
+	int ret;
+
+	info = devm_kzalloc(dev, sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return ERR_PTR(-ENOMEM);
+
+	info->irq_chip = sdca_irq_chip;
+
+	ret = devm_mutex_init(dev, &info->irq_lock);
+	if (ret)
+		return ERR_PTR(ret);
+
+	ret = devm_regmap_add_irq_chip(dev, regmap, irq, IRQF_ONESHOT, 0,
+				       &info->irq_chip, &info->irq_data);
+	if (ret) {
+		dev_err(dev, "failed to register irq chip: %d\n", ret);
+		return ERR_PTR(ret);
+	}
+
+	dev_dbg(dev, "registered on irq %d\n", irq);
+
+	return info;
+}
+EXPORT_SYMBOL_NS_GPL(sdca_irq_allocate, "SND_SOC_SDCA_IRQ");
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SDCA IRQ library");
-- 
cgit v1.2.3


From b9ab3b61824190b1c6b2c59e7ba4de591f24eb92 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Tue, 24 Jun 2025 13:28:44 +0100
Subject: ASoC: SDCA: Add some initial IRQ handlers

Add basic IRQ handlers for the function status and jack detection
interrupts.

Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.dev>
Link: https://patch.msgid.link/20250624122844.2761627-8-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sdca_interrupts.h  |   3 +
 sound/soc/sdca/sdca_interrupts.c | 157 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 159 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/sound/sdca_interrupts.h b/include/sound/sdca_interrupts.h
index 4cda8b75bae0..bbbc3ab27eba 100644
--- a/include/sound/sdca_interrupts.h
+++ b/include/sound/sdca_interrupts.h
@@ -27,6 +27,7 @@ struct sdca_function_data;
  * @function: Pointer to the Function that the interrupt is associated with.
  * @entity: Pointer to the Entity that the interrupt is associated with.
  * @control: Pointer to the Control that the interrupt is associated with.
+ * @priv: Pointer to private data for use by the handler.
  * @externally_requested: Internal flag used to check if a client driver has
  * already requested the interrupt, for custom handling, allowing the core to
  * skip handling this interrupt.
@@ -39,6 +40,8 @@ struct sdca_interrupt {
 	struct sdca_entity *entity;
 	struct sdca_control *control;
 
+	void *priv;
+
 	bool externally_requested;
 };
 
diff --git a/sound/soc/sdca/sdca_interrupts.c b/sound/soc/sdca/sdca_interrupts.c
index 7272d11cb6d4..edb045c7ebb0 100644
--- a/sound/soc/sdca/sdca_interrupts.c
+++ b/sound/soc/sdca/sdca_interrupts.c
@@ -7,6 +7,7 @@
  * https://www.mipi.org/mipi-sdca-v1-0-download
  */
 
+#include <linux/bitmap.h>
 #include <linux/bits.h>
 #include <linux/cleanup.h>
 #include <linux/device.h>
@@ -18,6 +19,7 @@
 #include <sound/sdca_function.h>
 #include <sound/sdca_interrupts.h>
 #include <sound/soc-component.h>
+#include <sound/soc.h>
 
 #define IRQ_SDCA(number) REGMAP_IRQ_REG(number, ((number) / BITS_PER_BYTE), \
 					SDW_SCP_SDCA_INTMASK_SDCA_##number)
@@ -80,6 +82,143 @@ static irqreturn_t base_handler(int irq, void *data)
 	return IRQ_HANDLED;
 }
 
+static irqreturn_t function_status_handler(int irq, void *data)
+{
+	struct sdca_interrupt *interrupt = data;
+	struct device *dev = interrupt->component->dev;
+	unsigned int reg, val, bit;
+	unsigned long status;
+	int ret;
+
+	reg = SDW_SDCA_CTL(interrupt->function->desc->adr, interrupt->entity->id,
+			   interrupt->control->sel, 0);
+
+	ret = regmap_read(interrupt->component->regmap, reg, &val);
+	if (ret < 0) {
+		dev_err(dev, "failed to read function status: %d\n", ret);
+		return IRQ_NONE;
+	}
+
+	dev_dbg(dev, "function status: %#x\n", val);
+
+	status = val;
+	/* for_each_set_bit() resumes from bit, so it must stay unmodified */
+	for_each_set_bit(bit, &status, BITS_PER_BYTE) {
+		unsigned int mask = BIT(bit);
+
+		switch (mask) {
+		case SDCA_CTL_ENTITY_0_FUNCTION_NEEDS_INITIALIZATION:
+			//FIXME: Add init writes
+			break;
+		case SDCA_CTL_ENTITY_0_FUNCTION_FAULT:
+			dev_err(dev, "function fault\n");
+			break;
+		case SDCA_CTL_ENTITY_0_UMP_SEQUENCE_FAULT:
+			dev_err(dev, "ump sequence fault\n");
+			break;
+		case SDCA_CTL_ENTITY_0_FUNCTION_BUSY:
+			dev_info(dev, "unexpected function busy\n");
+			break;
+		case SDCA_CTL_ENTITY_0_DEVICE_NEWLY_ATTACHED:
+		case SDCA_CTL_ENTITY_0_INTS_DISABLED_ABNORMALLY:
+		case SDCA_CTL_ENTITY_0_STREAMING_STOPPED_ABNORMALLY:
+		case SDCA_CTL_ENTITY_0_FUNCTION_HAS_BEEN_RESET:
+			break;
+		}
+	}
+
+	ret = regmap_write(interrupt->component->regmap, reg, val);
+	if (ret < 0) {
+		dev_err(dev, "failed to clear function status: %d\n", ret);
+		return IRQ_NONE;
+	}
+
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t detected_mode_handler(int irq, void *data)
+{
+	struct sdca_interrupt *interrupt = data;
+	struct snd_soc_component *component = interrupt->component;
+	struct device *dev = component->dev;
+	struct snd_soc_card *card = component->card;
+	struct rw_semaphore *rwsem = &card->snd_card->controls_rwsem;
+	struct snd_kcontrol *kctl = interrupt->priv;
+	struct snd_ctl_elem_value ucontrol;
+	struct soc_enum *soc_enum;
+	unsigned int reg, val;
+	int ret;
+
+	if (!kctl) {
+		const char *name __free(kfree) = kasprintf(GFP_KERNEL, "%s %s",
+							   interrupt->entity->label,
+							   SDCA_CTL_SELECTED_MODE_NAME);
+
+		if (!name)
+			return IRQ_NONE;
+
+		kctl = snd_soc_component_get_kcontrol(component, name);
+		if (!kctl) {
+			dev_dbg(dev, "control not found: %s\n", name);
+			return IRQ_NONE;
+		}
+
+		interrupt->priv = kctl;
+	}
+
+	soc_enum = (struct soc_enum *)kctl->private_value;
+
+	reg = SDW_SDCA_CTL(interrupt->function->desc->adr, interrupt->entity->id,
+			   interrupt->control->sel, 0);
+
+	ret = regmap_read(component->regmap, reg, &val);
+	if (ret < 0) {
+		dev_err(dev, "failed to read detected mode: %d\n", ret);
+		return IRQ_NONE;
+	}
+
+	switch (val) {
+	case SDCA_DETECTED_MODE_DETECTION_IN_PROGRESS:
+	case SDCA_DETECTED_MODE_JACK_UNKNOWN:
+		reg = SDW_SDCA_CTL(interrupt->function->desc->adr,
+				   interrupt->entity->id,
+				   SDCA_CTL_GE_SELECTED_MODE, 0);
+
+		/*
+		 * Selected mode is not normally marked as volatile register
+		 * (RW), but here force a read from the hardware. If the
+		 * detected mode is unknown we need to see what the device
+		 * selected as a "safe" option.
+		 */
+		regcache_drop_region(component->regmap, reg, reg);
+
+		ret = regmap_read(component->regmap, reg, &val);
+		if (ret) {
+			dev_err(dev, "failed to re-check selected mode: %d\n", ret);
+			return IRQ_NONE;
+		}
+		break;
+	default:
+		break;
+	}
+
+	dev_dbg(dev, "%s: %#x\n", interrupt->name, val);
+
+	ucontrol.value.enumerated.item[0] = snd_soc_enum_val_to_item(soc_enum, val);
+
+	down_write(rwsem);
+	ret = kctl->put(kctl, &ucontrol);
+	up_write(rwsem);
+	if (ret < 0) {
+		dev_err(dev, "failed to update selected mode: %d\n", ret);
+		return IRQ_NONE;
+	}
+
+	snd_ctl_notify(card->snd_card, SNDRV_CTL_EVENT_MASK_VALUE, &kctl->id);
+
+	return IRQ_HANDLED;
+}
+
 static int sdca_irq_request_locked(struct device *dev,
 				   struct sdca_interrupt_info *info,
 				   int sdca_irq, const char *name,
@@ -202,6 +341,6 @@ int sdca_irq_populate(struct sdca_function_data *function,
 			struct sdca_control *control = &entity->controls[j];
 			int irq = control->interrupt_position;
 			struct sdca_interrupt *interrupt;
-			const char *name;
+			irq_handler_t handler;
 			int ret;
 
@@ -226,11 +365,26 @@ int sdca_irq_populate(struct sdca_function_data *function,
 			if (ret)
 				return ret;
 
+			handler = base_handler;
+
+			switch (entity->type) {
+			case SDCA_ENTITY_TYPE_ENTITY_0:
+				if (control->sel == SDCA_CTL_ENTITY_0_FUNCTION_STATUS)
+					handler = function_status_handler;
+				break;
+			case SDCA_ENTITY_TYPE_GE:
+				if (control->sel == SDCA_CTL_GE_DETECTED_MODE)
+					handler = detected_mode_handler;
+				break;
+			default:
+				break;
+			}
+
 			ret = sdca_irq_request_locked(dev, info, irq, interrupt->name,
-						      base_handler, interrupt);
+						      handler, interrupt);
 			if (ret) {
 				dev_err(dev, "failed to request irq %s: %d\n",
-					name, ret);
+					interrupt->name, ret);
 				return ret;
 			}
 		}
-- 
cgit v1.2.3


From 7ff495e26a39f3e7a3d4058df59b5b6d6f943cab Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 30 Jun 2025 09:51:38 +0200
Subject: local_lock: Move this_cpu_ptr() notation from internal to main header

local_lock.h is the main header for the local_lock_t type and provides
wrappers around internal functions prefixed with __ in
local_lock_internal.h.

Move the this_cpu_ptr() dereference of the variable from the internal to
the main header. Since it is all macro implemented, this_cpu_ptr() will
still happen within the preempt/ IRQ disabled section.

This frees the internal implementation (__) to be used on local_lock_t
types which are local variables and must not be accessed via
this_cpu_ptr().

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Waiman Long <longman@redhat.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/all/20250630075138.3448715-2-bigeasy@linutronix.de
---
 include/linux/local_lock.h          | 20 ++++++++++----------
 include/linux/local_lock_internal.h | 30 +++++++++++++++---------------
 2 files changed, 25 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/include/linux/local_lock.h b/include/linux/local_lock.h
index 16a2ee4f8310..2ba846419524 100644
--- a/include/linux/local_lock.h
+++ b/include/linux/local_lock.h
@@ -13,13 +13,13 @@
  * local_lock - Acquire a per CPU local lock
  * @lock:	The lock variable
  */
-#define local_lock(lock)		__local_lock(lock)
+#define local_lock(lock)		__local_lock(this_cpu_ptr(lock))
 
 /**
  * local_lock_irq - Acquire a per CPU local lock and disable interrupts
  * @lock:	The lock variable
  */
-#define local_lock_irq(lock)		__local_lock_irq(lock)
+#define local_lock_irq(lock)		__local_lock_irq(this_cpu_ptr(lock))
 
 /**
  * local_lock_irqsave - Acquire a per CPU local lock, save and disable
@@ -28,19 +28,19 @@
  * @flags:	Storage for interrupt flags
  */
 #define local_lock_irqsave(lock, flags)				\
-	__local_lock_irqsave(lock, flags)
+	__local_lock_irqsave(this_cpu_ptr(lock), flags)
 
 /**
  * local_unlock - Release a per CPU local lock
  * @lock:	The lock variable
  */
-#define local_unlock(lock)		__local_unlock(lock)
+#define local_unlock(lock)		__local_unlock(this_cpu_ptr(lock))
 
 /**
  * local_unlock_irq - Release a per CPU local lock and enable interrupts
  * @lock:	The lock variable
  */
-#define local_unlock_irq(lock)		__local_unlock_irq(lock)
+#define local_unlock_irq(lock)		__local_unlock_irq(this_cpu_ptr(lock))
 
 /**
  * local_unlock_irqrestore - Release a per CPU local lock and restore
@@ -49,7 +49,7 @@
  * @flags:      Interrupt flags to restore
  */
 #define local_unlock_irqrestore(lock, flags)			\
-	__local_unlock_irqrestore(lock, flags)
+	__local_unlock_irqrestore(this_cpu_ptr(lock), flags)
 
 /**
  * local_lock_init - Runtime initialize a lock instance
@@ -64,7 +64,7 @@
  * locking constrains it will _always_ fail to acquire the lock in NMI or
  * HARDIRQ context on PREEMPT_RT.
  */
-#define local_trylock(lock)		__local_trylock(lock)
+#define local_trylock(lock)		__local_trylock(this_cpu_ptr(lock))
 
 /**
  * local_trylock_irqsave - Try to acquire a per CPU local lock, save and disable
@@ -77,7 +77,7 @@
  * HARDIRQ context on PREEMPT_RT.
  */
 #define local_trylock_irqsave(lock, flags)			\
-	__local_trylock_irqsave(lock, flags)
+	__local_trylock_irqsave(this_cpu_ptr(lock), flags)
 
 DEFINE_GUARD(local_lock, local_lock_t __percpu*,
 	     local_lock(_T),
@@ -91,10 +91,10 @@ DEFINE_LOCK_GUARD_1(local_lock_irqsave, local_lock_t __percpu,
 		    unsigned long flags)
 
 #define local_lock_nested_bh(_lock)				\
-	__local_lock_nested_bh(_lock)
+	__local_lock_nested_bh(this_cpu_ptr(_lock))
 
 #define local_unlock_nested_bh(_lock)				\
-	__local_unlock_nested_bh(_lock)
+	__local_unlock_nested_bh(this_cpu_ptr(_lock))
 
 DEFINE_GUARD(local_lock_nested_bh, local_lock_t __percpu*,
 	     local_lock_nested_bh(_T),
diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h
index 8d5ac16a9b17..d80b5306a2c0 100644
--- a/include/linux/local_lock_internal.h
+++ b/include/linux/local_lock_internal.h
@@ -99,14 +99,14 @@ do {								\
 		local_trylock_t *tl;					\
 		local_lock_t *l;					\
 									\
-		l = (local_lock_t *)this_cpu_ptr(lock);			\
+		l = (local_lock_t *)(lock);				\
 		tl = (local_trylock_t *)l;				\
 		_Generic((lock),					\
-			__percpu local_trylock_t *: ({			\
+			local_trylock_t *: ({				\
 				lockdep_assert(tl->acquired == 0);	\
 				WRITE_ONCE(tl->acquired, 1);		\
 			}),						\
-			__percpu local_lock_t *: (void)0);		\
+			local_lock_t *: (void)0);			\
 		local_lock_acquire(l);					\
 	} while (0)
 
@@ -133,7 +133,7 @@ do {								\
 		local_trylock_t *tl;				\
 								\
 		preempt_disable();				\
-		tl = this_cpu_ptr(lock);			\
+		tl = (lock);					\
 		if (READ_ONCE(tl->acquired)) {			\
 			preempt_enable();			\
 			tl = NULL;				\
@@ -150,7 +150,7 @@ do {								\
 		local_trylock_t *tl;				\
 								\
 		local_irq_save(flags);				\
-		tl = this_cpu_ptr(lock);			\
+		tl = (lock);					\
 		if (READ_ONCE(tl->acquired)) {			\
 			local_irq_restore(flags);		\
 			tl = NULL;				\
@@ -167,15 +167,15 @@ do {								\
 		local_trylock_t *tl;					\
 		local_lock_t *l;					\
 									\
-		l = (local_lock_t *)this_cpu_ptr(lock);			\
+		l = (local_lock_t *)(lock);				\
 		tl = (local_trylock_t *)l;				\
 		local_lock_release(l);					\
 		_Generic((lock),					\
-			__percpu local_trylock_t *: ({			\
+			local_trylock_t *: ({				\
 				lockdep_assert(tl->acquired == 1);	\
 				WRITE_ONCE(tl->acquired, 0);		\
 			}),						\
-			__percpu local_lock_t *: (void)0);		\
+			local_lock_t *: (void)0);			\
 	} while (0)
 
 #define __local_unlock(lock)					\
@@ -199,11 +199,11 @@ do {								\
 #define __local_lock_nested_bh(lock)				\
 	do {							\
 		lockdep_assert_in_softirq();			\
-		local_lock_acquire(this_cpu_ptr(lock));	\
+		local_lock_acquire((lock));			\
 	} while (0)
 
 #define __local_unlock_nested_bh(lock)				\
-	local_lock_release(this_cpu_ptr(lock))
+	local_lock_release((lock))
 
 #else /* !CONFIG_PREEMPT_RT */
 
@@ -227,7 +227,7 @@ typedef spinlock_t local_trylock_t;
 #define __local_lock(__lock)					\
 	do {							\
 		migrate_disable();				\
-		spin_lock(this_cpu_ptr((__lock)));		\
+		spin_lock((__lock));				\
 	} while (0)
 
 #define __local_lock_irq(lock)			__local_lock(lock)
@@ -241,7 +241,7 @@ typedef spinlock_t local_trylock_t;
 
 #define __local_unlock(__lock)					\
 	do {							\
-		spin_unlock(this_cpu_ptr((__lock)));		\
+		spin_unlock((__lock));				\
 		migrate_enable();				\
 	} while (0)
 
@@ -252,12 +252,12 @@ typedef spinlock_t local_trylock_t;
 #define __local_lock_nested_bh(lock)				\
 do {								\
 	lockdep_assert_in_softirq_func();			\
-	spin_lock(this_cpu_ptr(lock));				\
+	spin_lock((lock));					\
 } while (0)
 
 #define __local_unlock_nested_bh(lock)				\
 do {								\
-	spin_unlock(this_cpu_ptr((lock)));			\
+	spin_unlock((lock));					\
 } while (0)
 
 #define __local_trylock(lock)					\
@@ -268,7 +268,7 @@ do {								\
 			__locked = 0;				\
 		} else {					\
 			migrate_disable();			\
-			__locked = spin_trylock(this_cpu_ptr((lock)));	\
+			__locked = spin_trylock((lock));	\
 			if (!__locked)				\
 				migrate_enable();		\
 		}						\
-- 
cgit v1.2.3


From b693c703accb08cbd52f0b94d810d6abbca3bfb9 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Mon, 30 Jun 2025 09:03:06 -0700
Subject: lib/crypto: sha512: Add support for SHA-384 and SHA-512

Add basic support for SHA-384 and SHA-512 to lib/crypto/.

Various in-kernel users will be able to use this instead of the
old-school crypto API, which is harder to use and has more overhead.

The basic support added by this commit consists of the API and its
documentation, backed by a C implementation of the algorithms.
sha512_block_generic() is derived from crypto/sha512_generic.c.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250630160320.2888-3-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 include/crypto/sha2.h | 128 ++++++++++++++++++++++++
 lib/crypto/Kconfig    |  10 ++
 lib/crypto/Makefile   |   6 ++
 lib/crypto/sha512.c   | 265 ++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 409 insertions(+)
 create mode 100644 lib/crypto/sha512.c

(limited to 'include')

diff --git a/include/crypto/sha2.h b/include/crypto/sha2.h
index 4912572578dc..f2a6e84b2840 100644
--- a/include/crypto/sha2.h
+++ b/include/crypto/sha2.h
@@ -129,4 +129,132 @@ static inline void sha224_init(struct sha256_state *sctx)
 /* Simply use sha256_update as it is equivalent to sha224_update. */
 void sha224_final(struct sha256_state *sctx, u8 out[SHA224_DIGEST_SIZE]);
 
+/* State for the SHA-512 (and SHA-384) compression function */
+struct sha512_block_state {
+	u64 h[8];
+};
+
+/*
+ * Context structure, shared by SHA-384 and SHA-512.  The sha384_ctx and
+ * sha512_ctx structs wrap this one so that the API has proper typing and
+ * doesn't allow mixing the SHA-384 and SHA-512 functions arbitrarily.
+ */
+struct __sha512_ctx {
+	struct sha512_block_state state;
+	u64 bytecount_lo;
+	u64 bytecount_hi;
+	u8 buf[SHA512_BLOCK_SIZE] __aligned(__alignof__(__be64));
+};
+void __sha512_update(struct __sha512_ctx *ctx, const u8 *data, size_t len);
+
+/**
+ * struct sha384_ctx - Context for hashing a message with SHA-384
+ * @ctx: private
+ */
+struct sha384_ctx {
+	struct __sha512_ctx ctx;
+};
+
+/**
+ * sha384_init() - Initialize a SHA-384 context for a new message
+ * @ctx: the context to initialize
+ *
+ * If you don't need incremental computation, consider sha384() instead.
+ *
+ * Context: Any context.
+ */
+void sha384_init(struct sha384_ctx *ctx);
+
+/**
+ * sha384_update() - Update a SHA-384 context with message data
+ * @ctx: the context to update; must have been initialized
+ * @data: the message data
+ * @len: the data length in bytes
+ *
+ * This can be called any number of times.
+ *
+ * Context: Any context.
+ */
+static inline void sha384_update(struct sha384_ctx *ctx,
+				 const u8 *data, size_t len)
+{
+	__sha512_update(&ctx->ctx, data, len);
+}
+
+/**
+ * sha384_final() - Finish computing a SHA-384 message digest
+ * @ctx: the context to finalize; must have been initialized
+ * @out: (output) the resulting SHA-384 message digest
+ *
+ * After finishing, this zeroizes @ctx.  So the caller does not need to do it.
+ *
+ * Context: Any context.
+ */
+void sha384_final(struct sha384_ctx *ctx, u8 out[SHA384_DIGEST_SIZE]);
+
+/**
+ * sha384() - Compute SHA-384 message digest in one shot
+ * @data: the message data
+ * @len: the data length in bytes
+ * @out: (output) the resulting SHA-384 message digest
+ *
+ * Context: Any context.
+ */
+void sha384(const u8 *data, size_t len, u8 out[SHA384_DIGEST_SIZE]);
+
+/**
+ * struct sha512_ctx - Context for hashing a message with SHA-512
+ * @ctx: private
+ */
+struct sha512_ctx {
+	struct __sha512_ctx ctx;
+};
+
+/**
+ * sha512_init() - Initialize a SHA-512 context for a new message
+ * @ctx: the context to initialize
+ *
+ * If you don't need incremental computation, consider sha512() instead.
+ *
+ * Context: Any context.
+ */
+void sha512_init(struct sha512_ctx *ctx);
+
+/**
+ * sha512_update() - Update a SHA-512 context with message data
+ * @ctx: the context to update; must have been initialized
+ * @data: the message data
+ * @len: the data length in bytes
+ *
+ * This can be called any number of times.
+ *
+ * Context: Any context.
+ */
+static inline void sha512_update(struct sha512_ctx *ctx,
+				 const u8 *data, size_t len)
+{
+	__sha512_update(&ctx->ctx, data, len);
+}
+
+/**
+ * sha512_final() - Finish computing a SHA-512 message digest
+ * @ctx: the context to finalize; must have been initialized
+ * @out: (output) the resulting SHA-512 message digest
+ *
+ * After finishing, this zeroizes @ctx.  So the caller does not need to do it.
+ *
+ * Context: Any context.
+ */
+void sha512_final(struct sha512_ctx *ctx, u8 out[SHA512_DIGEST_SIZE]);
+
+/**
+ * sha512() - Compute SHA-512 message digest in one shot
+ * @data: the message data
+ * @len: the data length in bytes
+ * @out: (output) the resulting SHA-512 message digest
+ *
+ * Context: Any context.
+ */
+void sha512(const u8 *data, size_t len, u8 out[SHA512_DIGEST_SIZE]);
+
 #endif /* _CRYPTO_SHA2_H */
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index 1ec1466108cc..2d295c0e0f79 100644
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -167,6 +167,16 @@ config CRYPTO_LIB_SHA256_GENERIC
 	  for SIMD implementations. If no arch specific implementation is
 	  enabled, this implementation serves the users of CRYPTO_LIB_SHA256.
 
+config CRYPTO_LIB_SHA512
+	tristate
+	help
+	  The SHA-384 and SHA-512 library functions.  Select this if your module
+	  uses any of these functions from <crypto/sha2.h>.
+
+config CRYPTO_LIB_SHA512_ARCH
+	bool
+	depends on CRYPTO_LIB_SHA512 && !UML
+
 config CRYPTO_LIB_SM3
 	tristate
 
diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile
index b0c0f8aea269..f6b6f370451e 100644
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -64,6 +64,12 @@ libsha256-y					:= sha256.o
 obj-$(CONFIG_CRYPTO_LIB_SHA256_GENERIC)		+= libsha256-generic.o
 libsha256-generic-y				:= sha256-generic.o
 
+obj-$(CONFIG_CRYPTO_LIB_SHA512) += libsha512.o
+libsha512-y := sha512.o
+ifeq ($(CONFIG_CRYPTO_LIB_SHA512_ARCH),y)
+CFLAGS_sha512.o += -I$(src)/$(SRCARCH)
+endif # CONFIG_CRYPTO_LIB_SHA512_ARCH
+
 obj-$(CONFIG_MPILIB) += mpi/
 
 obj-$(CONFIG_CRYPTO_SELFTESTS_FULL)		+= simd.o
diff --git a/lib/crypto/sha512.c b/lib/crypto/sha512.c
new file mode 100644
index 000000000000..536b71481b1c
--- /dev/null
+++ b/lib/crypto/sha512.c
@@ -0,0 +1,265 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SHA-384 and SHA-512 library functions
+ *
+ * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) 2003 Kyle McMartin <kyle@debian.org>
+ * Copyright 2025 Google LLC
+ */
+
+#include <crypto/sha2.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/overflow.h>
+#include <linux/string.h>
+#include <linux/unaligned.h>
+
+static const struct sha512_block_state sha384_iv = {
+	.h = {
+		SHA384_H0, SHA384_H1, SHA384_H2, SHA384_H3,
+		SHA384_H4, SHA384_H5, SHA384_H6, SHA384_H7,
+	},	/* starting state that sha384_init() installs */
+};
+
+static const struct sha512_block_state sha512_iv = {
+	.h = {
+		SHA512_H0, SHA512_H1, SHA512_H2, SHA512_H3,
+		SHA512_H4, SHA512_H5, SHA512_H6, SHA512_H7,
+	},	/* starting state that sha512_init() installs */
+};
+
+static const u64 sha512_K[80] = {	/* one additive constant per round */
+	0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, 0xb5c0fbcfec4d3b2fULL,
+	0xe9b5dba58189dbbcULL, 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
+	0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, 0xd807aa98a3030242ULL,
+	0x12835b0145706fbeULL, 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
+	0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, 0x9bdc06a725c71235ULL,
+	0xc19bf174cf692694ULL, 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
+	0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, 0x2de92c6f592b0275ULL,
+	0x4a7484aa6ea6e483ULL, 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
+	0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, 0xb00327c898fb213fULL,
+	0xbf597fc7beef0ee4ULL, 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
+	0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, 0x27b70a8546d22ffcULL,
+	0x2e1b21385c26c926ULL, 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
+	0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, 0x81c2c92e47edaee6ULL,
+	0x92722c851482353bULL, 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
+	0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, 0xd192e819d6ef5218ULL,
+	0xd69906245565a910ULL, 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
+	0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, 0x2748774cdf8eeb99ULL,
+	0x34b0bcb5e19b48a8ULL, 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
+	0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, 0x748f82ee5defb2fcULL,
+	0x78a5636f43172f60ULL, 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
+	0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, 0xbef9a3f7b2c67915ULL,
+	0xc67178f2e372532bULL, 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
+	0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, 0x06f067aa72176fbaULL,
+	0x0a637dc5a2c898a6ULL, 0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
+	0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, 0x3c9ebe0a15c9bebcULL,
+	0x431d67c49c100d4cULL, 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
+	0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL,
+};
+
+#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))	/* bit of y where x is set, else z */
+#define Maj(x, y, z) (((x) & (y)) | ((z) & ((x) | (y))))	/* bitwise majority */
+#define e0(x) (ror64((x), 28) ^ ror64((x), 34) ^ ror64((x), 39))	/* FIPS 180-4 Sigma0 */
+#define e1(x) (ror64((x), 14) ^ ror64((x), 18) ^ ror64((x), 41))	/* FIPS 180-4 Sigma1 */
+#define s0(x) (ror64((x), 1) ^ ror64((x), 8) ^ ((x) >> 7))	/* FIPS 180-4 sigma0 */
+#define s1(x) (ror64((x), 19) ^ ror64((x), 61) ^ ((x) >> 6))	/* FIPS 180-4 sigma1 */
+
+static void sha512_block_generic(struct sha512_block_state *state,
+				 const u8 *data)
+{
+	u64 a = state->h[0];	/* working variables a..h start from the state */
+	u64 b = state->h[1];
+	u64 c = state->h[2];
+	u64 d = state->h[3];
+	u64 e = state->h[4];
+	u64 f = state->h[5];
+	u64 g = state->h[6];
+	u64 h = state->h[7];
+	u64 t1, t2;
+	u64 W[16];	/* message schedule, kept as a rolling 16-word window */
+
+	for (int j = 0; j < 16; j++)	/* load the block as big-endian words */
+		W[j] = get_unaligned_be64(data + j * sizeof(u64));
+
+	for (int i = 0; i < 80; i += 8) {	/* 80 rounds, eight per pass */
+		if ((i & 15) == 0 && i != 0) {	/* advance W by 16 words */
+			for (int j = 0; j < 16; j++) {
+				W[j & 15] += s1(W[(j - 2) & 15]) +
+					     W[(j - 7) & 15] +
+					     s0(W[(j - 15) & 15]);
+			}
+		}
+		t1 = h + e1(e) + Ch(e, f, g) + sha512_K[i]   + W[(i & 15)];
+		t2 = e0(a) + Maj(a, b, c);    d += t1;    h = t1 + t2;
+		t1 = g + e1(d) + Ch(d, e, f) + sha512_K[i+1] + W[(i & 15) + 1];
+		t2 = e0(h) + Maj(h, a, b);    c += t1;    g = t1 + t2;
+		t1 = f + e1(c) + Ch(c, d, e) + sha512_K[i+2] + W[(i & 15) + 2];
+		t2 = e0(g) + Maj(g, h, a);    b += t1;    f = t1 + t2;
+		t1 = e + e1(b) + Ch(b, c, d) + sha512_K[i+3] + W[(i & 15) + 3];
+		t2 = e0(f) + Maj(f, g, h);    a += t1;    e = t1 + t2;
+		t1 = d + e1(a) + Ch(a, b, c) + sha512_K[i+4] + W[(i & 15) + 4];
+		t2 = e0(e) + Maj(e, f, g);    h += t1;    d = t1 + t2;
+		t1 = c + e1(h) + Ch(h, a, b) + sha512_K[i+5] + W[(i & 15) + 5];
+		t2 = e0(d) + Maj(d, e, f);    g += t1;    c = t1 + t2;
+		t1 = b + e1(g) + Ch(g, h, a) + sha512_K[i+6] + W[(i & 15) + 6];
+		t2 = e0(c) + Maj(c, d, e);    f += t1;    b = t1 + t2;
+		t1 = a + e1(f) + Ch(f, g, h) + sha512_K[i+7] + W[(i & 15) + 7];
+		t2 = e0(b) + Maj(b, c, d);    e += t1;    a = t1 + t2;
+	}
+
+	state->h[0] += a;	/* add this block's result into the running state */
+	state->h[1] += b;
+	state->h[2] += c;
+	state->h[3] += d;
+	state->h[4] += e;
+	state->h[5] += f;
+	state->h[6] += g;
+	state->h[7] += h;
+}
+
+static void __maybe_unused
+sha512_blocks_generic(struct sha512_block_state *state,
+		      const u8 *data, size_t nblocks)
+{
+	do {	/* body runs before nblocks is tested, so 0 is not a valid count */
+		sha512_block_generic(state, data);
+		data += SHA512_BLOCK_SIZE;
+	} while (--nblocks);
+}
+
+#ifdef CONFIG_CRYPTO_LIB_SHA512_ARCH
+#include "sha512.h" /* $(SRCARCH)/sha512.h */
+#else
+#define sha512_blocks sha512_blocks_generic	/* no arch code: use the C version */
+#endif
+
+static void __sha512_init(struct __sha512_ctx *ctx,
+			  const struct sha512_block_state *iv,
+			  u64 initial_bytecount)
+{
+	ctx->state = *iv;
+	ctx->bytecount_lo = initial_bytecount;	/* bytes already hashed into *iv */
+	ctx->bytecount_hi = 0;
+}
+
+void sha384_init(struct sha384_ctx *ctx)
+{
+	__sha512_init(&ctx->ctx, &sha384_iv, 0);	/* 0: no bytes hashed so far */
+}
+EXPORT_SYMBOL_GPL(sha384_init);
+
+void sha512_init(struct sha512_ctx *ctx)
+{
+	__sha512_init(&ctx->ctx, &sha512_iv, 0);	/* 0: no bytes hashed so far */
+}
+EXPORT_SYMBOL_GPL(sha512_init);
+
+void __sha512_update(struct __sha512_ctx *ctx, const u8 *data, size_t len)
+{
+	size_t buffered = ctx->bytecount_lo % SHA512_BLOCK_SIZE;
+
+	/* Carry into the high counter word when the low one wraps. */
+	if (check_add_overflow(ctx->bytecount_lo, len, &ctx->bytecount_lo))
+		ctx->bytecount_hi++;
+	if (buffered + len >= SHA512_BLOCK_SIZE) {
+		size_t full;
+
+		if (buffered) {
+			size_t fill = SHA512_BLOCK_SIZE - buffered;
+
+			memcpy(&ctx->buf[buffered], data, fill);
+			sha512_blocks(&ctx->state, ctx->buf, 1);
+			data += fill;
+			len -= fill;
+		}
+
+		/* Hash whole blocks straight from the caller's buffer. */
+		full = len / SHA512_BLOCK_SIZE;
+		len -= full * SHA512_BLOCK_SIZE;
+		if (full) {
+			sha512_blocks(&ctx->state, data, full);
+			data += full * SHA512_BLOCK_SIZE;
+		}
+		buffered = 0;
+	}
+	/* Keep any tail bytes for the next update or the final call. */
+	if (len)
+		memcpy(&ctx->buf[buffered], data, len);
+}
+EXPORT_SYMBOL_GPL(__sha512_update);
+
+static void __sha512_final(struct __sha512_ctx *ctx,
+			   u8 *out, size_t digest_size)
+{
+	u64 bitcount_hi = (ctx->bytecount_hi << 3) | (ctx->bytecount_lo >> 61);
+	u64 bitcount_lo = ctx->bytecount_lo << 3;	/* length in bits = bytes * 8 */
+	size_t partial = ctx->bytecount_lo % SHA512_BLOCK_SIZE;
+
+	ctx->buf[partial++] = 0x80;	/* padding starts with a single 1 bit */
+	if (partial > SHA512_BLOCK_SIZE - 16) {	/* length field won't fit */
+		memset(&ctx->buf[partial], 0, SHA512_BLOCK_SIZE - partial);
+		sha512_blocks(&ctx->state, ctx->buf, 1);
+		partial = 0;
+	}
+	memset(&ctx->buf[partial], 0, SHA512_BLOCK_SIZE - 16 - partial);
+	*(__be64 *)&ctx->buf[SHA512_BLOCK_SIZE - 16] = cpu_to_be64(bitcount_hi);
+	*(__be64 *)&ctx->buf[SHA512_BLOCK_SIZE - 8] = cpu_to_be64(bitcount_lo);
+	sha512_blocks(&ctx->state, ctx->buf, 1);
+
+	for (size_t i = 0; i < digest_size; i += 8)	/* big-endian state words */
+		put_unaligned_be64(ctx->state.h[i / 8], out + i);
+}
+
+void sha384_final(struct sha384_ctx *ctx, u8 out[SHA384_DIGEST_SIZE])
+{
+	__sha512_final(&ctx->ctx, out, SHA384_DIGEST_SIZE);
+	memzero_explicit(ctx, sizeof(*ctx));	/* leave no hash state behind */
+}
+EXPORT_SYMBOL_GPL(sha384_final);
+
+void sha512_final(struct sha512_ctx *ctx, u8 out[SHA512_DIGEST_SIZE])
+{
+	__sha512_final(&ctx->ctx, out, SHA512_DIGEST_SIZE);
+	memzero_explicit(ctx, sizeof(*ctx));	/* leave no hash state behind */
+}
+EXPORT_SYMBOL_GPL(sha512_final);
+
+void sha384(const u8 *data, size_t len, u8 out[SHA384_DIGEST_SIZE])
+{
+	struct sha384_ctx ctx;
+
+	sha384_init(&ctx);
+	sha384_update(&ctx, data, len);
+	sha384_final(&ctx, out);	/* also wipes the on-stack ctx */
+}
+EXPORT_SYMBOL_GPL(sha384);
+
+void sha512(const u8 *data, size_t len, u8 out[SHA512_DIGEST_SIZE])
+{
+	struct sha512_ctx ctx;
+
+	sha512_init(&ctx);
+	sha512_update(&ctx, data, len);
+	sha512_final(&ctx, out);	/* also wipes the on-stack ctx */
+}
+EXPORT_SYMBOL_GPL(sha512);
+
+#ifdef sha512_mod_init_arch
+static int __init sha512_mod_init(void)
+{
+	sha512_mod_init_arch();	/* compiled in only when the macro is defined */
+	return 0;
+}
+subsys_initcall(sha512_mod_init);
+
+static void __exit sha512_mod_exit(void)
+{
+}	/* nothing to undo on unload */
+module_exit(sha512_mod_exit);
+#endif
+
+MODULE_DESCRIPTION("SHA-384 and SHA-512 library functions");
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 23e8b4371dbd5907d633262f36903144a378a114 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Mon, 30 Jun 2025 09:03:07 -0700
Subject: lib/crypto: sha512: Add HMAC-SHA384 and HMAC-SHA512 support

Since HMAC support is commonly needed and is fairly simple, include it
as a first-class citizen of the SHA-512 library.

The API supports both incremental and one-shot computation, and either
preparing the key ahead of time or just using a raw key.  The
implementation is much more streamlined than crypto/hmac.c.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250630160320.2888-4-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 include/crypto/sha2.h | 222 ++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/crypto/Kconfig    |   5 +-
 lib/crypto/sha512.c   | 141 +++++++++++++++++++++++++++++++-
 3 files changed, 364 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/crypto/sha2.h b/include/crypto/sha2.h
index f2a6e84b2840..296ce9d468bf 100644
--- a/include/crypto/sha2.h
+++ b/include/crypto/sha2.h
@@ -147,6 +147,22 @@ struct __sha512_ctx {
 };
 void __sha512_update(struct __sha512_ctx *ctx, const u8 *data, size_t len);
 
+/*
+ * HMAC key and message context structs, shared by HMAC-SHA384 and HMAC-SHA512.
+ * The hmac_sha384_* and hmac_sha512_* structs wrap this one so that the API has
+ * proper typing and doesn't allow mixing the functions arbitrarily.
+ */
+struct __hmac_sha512_key {
+	struct sha512_block_state istate;	/* IV after hashing key ^ ipad */
+	struct sha512_block_state ostate;	/* IV after hashing key ^ opad */
+};
+struct __hmac_sha512_ctx {
+	struct __sha512_ctx sha_ctx;	/* inner hash in progress */
+	struct sha512_block_state ostate;	/* outer state, used at final */
+};
+void __hmac_sha512_init(struct __hmac_sha512_ctx *ctx,
+			const struct __hmac_sha512_key *key);
+
 /**
  * struct sha384_ctx - Context for hashing a message with SHA-384
  * @ctx: private
@@ -202,6 +218,109 @@ void sha384_final(struct sha384_ctx *ctx, u8 out[SHA384_DIGEST_SIZE]);
  */
 void sha384(const u8 *data, size_t len, u8 out[SHA384_DIGEST_SIZE]);
 
+/**
+ * struct hmac_sha384_key - Prepared key for HMAC-SHA384
+ * @key: private
+ */
+struct hmac_sha384_key {
+	struct __hmac_sha512_key key;
+};
+
+/**
+ * struct hmac_sha384_ctx - Context for computing HMAC-SHA384 of a message
+ * @ctx: private
+ */
+struct hmac_sha384_ctx {
+	struct __hmac_sha512_ctx ctx;
+};
+
+/**
+ * hmac_sha384_preparekey() - Prepare a key for HMAC-SHA384
+ * @key: (output) the key structure to initialize
+ * @raw_key: the raw HMAC-SHA384 key
+ * @raw_key_len: the key length in bytes.  All key lengths are supported.
+ *
+ * Note: the caller is responsible for zeroizing both the struct hmac_sha384_key
+ * and the raw key once they are no longer needed.
+ *
+ * Context: Any context.
+ */
+void hmac_sha384_preparekey(struct hmac_sha384_key *key,
+			    const u8 *raw_key, size_t raw_key_len);
+
+/**
+ * hmac_sha384_init() - Begin computing HMAC-SHA384 over a new message
+ * @ctx: (output) the HMAC context to initialize
+ * @key: the prepared HMAC key; it is only read, so it can be reused
+ *
+ * For a message that is available all at once, hmac_sha384() is simpler.
+ *
+ * Context: Any context.
+ */
+static inline void
+hmac_sha384_init(struct hmac_sha384_ctx *ctx, const struct hmac_sha384_key *key)
+{
+	__hmac_sha512_init(&ctx->ctx, &key->key);
+}
+
+/**
+ * hmac_sha384_update() - Feed message data into an HMAC-SHA384 context
+ * @ctx: the HMAC context to update; must have been initialized
+ * @data: the message data
+ * @data_len: the data length in bytes
+ *
+ * May be called repeatedly; each call continues the same message.
+ *
+ * Context: Any context.
+ */
+static inline void
+hmac_sha384_update(struct hmac_sha384_ctx *ctx, const u8 *data, size_t data_len)
+{
+	__sha512_update(&ctx->ctx.sha_ctx, data, data_len);
+}
+
+/**
+ * hmac_sha384_final() - Finish computing an HMAC-SHA384 value
+ * @ctx: the HMAC context to finalize; must have been initialized
+ * @out: (output) the resulting HMAC-SHA384 value
+ *
+ * After finishing, this zeroizes @ctx.  So the caller does not need to do it.
+ *
+ * Context: Any context.
+ */
+void hmac_sha384_final(struct hmac_sha384_ctx *ctx, u8 out[SHA384_DIGEST_SIZE]);
+
+/**
+ * hmac_sha384() - Compute HMAC-SHA384 in one shot, using a prepared key
+ * @key: the prepared HMAC key
+ * @data: the message data
+ * @data_len: the data length in bytes
+ * @out: (output) the resulting HMAC-SHA384 value
+ *
+ * If you're using the key only once, consider using hmac_sha384_usingrawkey().
+ *
+ * Context: Any context.
+ */
+void hmac_sha384(const struct hmac_sha384_key *key,
+		 const u8 *data, size_t data_len, u8 out[SHA384_DIGEST_SIZE]);
+
+/**
+ * hmac_sha384_usingrawkey() - Compute HMAC-SHA384 in one shot, using a raw key
+ * @raw_key: the raw HMAC-SHA384 key
+ * @raw_key_len: the key length in bytes.  All key lengths are supported.
+ * @data: the message data
+ * @data_len: the data length in bytes
+ * @out: (output) the resulting HMAC-SHA384 value
+ *
+ * If you're using the key multiple times, prefer to use
+ * hmac_sha384_preparekey() followed by multiple calls to hmac_sha384() instead.
+ *
+ * Context: Any context.
+ */
+void hmac_sha384_usingrawkey(const u8 *raw_key, size_t raw_key_len,
+			     const u8 *data, size_t data_len,
+			     u8 out[SHA384_DIGEST_SIZE]);
+
 /**
  * struct sha512_ctx - Context for hashing a message with SHA-512
  * @ctx: private
@@ -257,4 +376,107 @@ void sha512_final(struct sha512_ctx *ctx, u8 out[SHA512_DIGEST_SIZE]);
  */
 void sha512(const u8 *data, size_t len, u8 out[SHA512_DIGEST_SIZE]);
 
+/**
+ * struct hmac_sha512_key - Prepared key for HMAC-SHA512
+ * @key: private
+ */
+struct hmac_sha512_key {
+	struct __hmac_sha512_key key;
+};
+
+/**
+ * struct hmac_sha512_ctx - Context for computing HMAC-SHA512 of a message
+ * @ctx: private
+ */
+struct hmac_sha512_ctx {
+	struct __hmac_sha512_ctx ctx;
+};
+
+/**
+ * hmac_sha512_preparekey() - Prepare a key for HMAC-SHA512
+ * @key: (output) the key structure to initialize
+ * @raw_key: the raw HMAC-SHA512 key
+ * @raw_key_len: the key length in bytes.  All key lengths are supported.
+ *
+ * Note: the caller is responsible for zeroizing both the struct hmac_sha512_key
+ * and the raw key once they are no longer needed.
+ *
+ * Context: Any context.
+ */
+void hmac_sha512_preparekey(struct hmac_sha512_key *key,
+			    const u8 *raw_key, size_t raw_key_len);
+
+/**
+ * hmac_sha512_init() - Begin computing HMAC-SHA512 over a new message
+ * @ctx: (output) the HMAC context to initialize
+ * @key: the prepared HMAC key; it is only read, so it can be reused
+ *
+ * For a message that is available all at once, hmac_sha512() is simpler.
+ *
+ * Context: Any context.
+ */
+static inline void
+hmac_sha512_init(struct hmac_sha512_ctx *ctx, const struct hmac_sha512_key *key)
+{
+	__hmac_sha512_init(&ctx->ctx, &key->key);
+}
+
+/**
+ * hmac_sha512_update() - Feed message data into an HMAC-SHA512 context
+ * @ctx: the HMAC context to update; must have been initialized
+ * @data: the message data
+ * @data_len: the data length in bytes
+ *
+ * May be called repeatedly; each call continues the same message.
+ *
+ * Context: Any context.
+ */
+static inline void
+hmac_sha512_update(struct hmac_sha512_ctx *ctx, const u8 *data, size_t data_len)
+{
+	__sha512_update(&ctx->ctx.sha_ctx, data, data_len);
+}
+
+/**
+ * hmac_sha512_final() - Finish computing an HMAC-SHA512 value
+ * @ctx: the HMAC context to finalize; must have been initialized
+ * @out: (output) the resulting HMAC-SHA512 value
+ *
+ * After finishing, this zeroizes @ctx.  So the caller does not need to do it.
+ *
+ * Context: Any context.
+ */
+void hmac_sha512_final(struct hmac_sha512_ctx *ctx, u8 out[SHA512_DIGEST_SIZE]);
+
+/**
+ * hmac_sha512() - Compute HMAC-SHA512 in one shot, using a prepared key
+ * @key: the prepared HMAC key
+ * @data: the message data
+ * @data_len: the data length in bytes
+ * @out: (output) the resulting HMAC-SHA512 value
+ *
+ * If you're using the key only once, consider using hmac_sha512_usingrawkey().
+ *
+ * Context: Any context.
+ */
+void hmac_sha512(const struct hmac_sha512_key *key,
+		 const u8 *data, size_t data_len, u8 out[SHA512_DIGEST_SIZE]);
+
+/**
+ * hmac_sha512_usingrawkey() - Compute HMAC-SHA512 in one shot, using a raw key
+ * @raw_key: the raw HMAC-SHA512 key
+ * @raw_key_len: the key length in bytes.  All key lengths are supported.
+ * @data: the message data
+ * @data_len: the data length in bytes
+ * @out: (output) the resulting HMAC-SHA512 value
+ *
+ * If you're using the key multiple times, prefer to use
+ * hmac_sha512_preparekey() followed by multiple calls to hmac_sha512() instead.
+ *
+ * Context: Any context.
+ */
+void hmac_sha512_usingrawkey(const u8 *raw_key, size_t raw_key_len,
+			     const u8 *data, size_t data_len,
+			     u8 out[SHA512_DIGEST_SIZE]);
+
 #endif /* _CRYPTO_SHA2_H */
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index 2d295c0e0f79..d1bee3787eb3 100644
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -170,8 +170,9 @@ config CRYPTO_LIB_SHA256_GENERIC
 config CRYPTO_LIB_SHA512
 	tristate
 	help
-	  The SHA-384 and SHA-512 library functions.  Select this if your module
-	  uses any of these functions from <crypto/sha2.h>.
+	  The SHA-384, SHA-512, HMAC-SHA384, and HMAC-SHA512 library functions.
+	  Select this if your module uses any of these functions from
+	  <crypto/sha2.h>.
 
 config CRYPTO_LIB_SHA512_ARCH
 	bool
diff --git a/lib/crypto/sha512.c b/lib/crypto/sha512.c
index 536b71481b1c..d514721491ca 100644
--- a/lib/crypto/sha512.c
+++ b/lib/crypto/sha512.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * SHA-384 and SHA-512 library functions
+ * SHA-384, SHA-512, HMAC-SHA384, and HMAC-SHA512 library functions
  *
  * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
  * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
@@ -8,6 +8,7 @@
  * Copyright 2025 Google LLC
  */
 
+#include <crypto/hmac.h>
 #include <crypto/sha2.h>
 #include <linux/export.h>
 #include <linux/kernel.h>
@@ -15,6 +16,7 @@
 #include <linux/overflow.h>
 #include <linux/string.h>
 #include <linux/unaligned.h>
+#include <linux/wordpart.h>
 
 static const struct sha512_block_state sha384_iv = {
 	.h = {
@@ -247,6 +249,141 @@ void sha512(const u8 *data, size_t len, u8 out[SHA512_DIGEST_SIZE])
 }
 EXPORT_SYMBOL_GPL(sha512);
 
+static void __hmac_sha512_preparekey(struct __hmac_sha512_key *key,
+				     const u8 *raw_key, size_t raw_key_len,
+				     const struct sha512_block_state *iv)
+{
+	union {
+		u8 b[SHA512_BLOCK_SIZE];
+		unsigned long w[SHA512_BLOCK_SIZE / sizeof(unsigned long)];
+	} pad = { 0 };
+	const unsigned long ipad = REPEAT_BYTE(HMAC_IPAD_VALUE);
+	const unsigned long flip = REPEAT_BYTE(HMAC_OPAD_VALUE ^ HMAC_IPAD_VALUE);
+
+	/* A key longer than one block is replaced by its digest. */
+	if (likely(raw_key_len <= SHA512_BLOCK_SIZE))
+		memcpy(pad.b, raw_key, raw_key_len);
+	else if (iv == &sha384_iv)
+		sha384(raw_key, raw_key_len, pad.b);
+	else
+		sha512(raw_key, raw_key_len, pad.b);
+
+	/* Inner state: the IV after absorbing one block of key ^ ipad. */
+	for (size_t i = 0; i < ARRAY_SIZE(pad.w); i++)
+		pad.w[i] ^= ipad;
+	key->istate = *iv;
+	sha512_blocks(&key->istate, pad.b, 1);
+	/* Outer state: turn ipad into opad in place, then do the same. */
+	for (size_t i = 0; i < ARRAY_SIZE(pad.w); i++)
+		pad.w[i] ^= flip;
+	key->ostate = *iv;
+	sha512_blocks(&key->ostate, pad.b, 1);
+	memzero_explicit(&pad, sizeof(pad));
+}
+
+void hmac_sha384_preparekey(struct hmac_sha384_key *key,
+			    const u8 *raw_key, size_t raw_key_len)
+{
+	__hmac_sha512_preparekey(&key->key, raw_key, raw_key_len, &sha384_iv);
+}
+EXPORT_SYMBOL_GPL(hmac_sha384_preparekey);
+
+void hmac_sha512_preparekey(struct hmac_sha512_key *key,
+			    const u8 *raw_key, size_t raw_key_len)
+{
+	__hmac_sha512_preparekey(&key->key, raw_key, raw_key_len, &sha512_iv);
+}
+EXPORT_SYMBOL_GPL(hmac_sha512_preparekey);
+
+void __hmac_sha512_init(struct __hmac_sha512_ctx *ctx,
+			const struct __hmac_sha512_key *key)
+{
+	__sha512_init(&ctx->sha_ctx, &key->istate, SHA512_BLOCK_SIZE);
+	ctx->ostate = key->ostate;	/* kept for the outer hash at final */
+}
+EXPORT_SYMBOL_GPL(__hmac_sha512_init);
+
+static void __hmac_sha512_final(struct __hmac_sha512_ctx *ctx,
+				u8 *out, size_t digest_size)
+{
+	u8 *block = ctx->sha_ctx.buf;
+
+	/* Write the inner digest into the block buffer, then pad it in place. */
+	__sha512_final(&ctx->sha_ctx, block, digest_size);
+	memset(block + digest_size, 0, SHA512_BLOCK_SIZE - digest_size);
+	block[digest_size] = 0x80;
+	*(__be32 *)(block + SHA512_BLOCK_SIZE - 4) =
+		cpu_to_be32(8 * (SHA512_BLOCK_SIZE + digest_size));
+	/* One compression on the prepared outer state yields the HMAC value. */
+	sha512_blocks(&ctx->ostate, block, 1);
+	for (size_t off = 0; off < digest_size; off += 8)
+		put_unaligned_be64(ctx->ostate.h[off / 8], out + off);
+
+	memzero_explicit(ctx, sizeof(*ctx));
+}
+
+void hmac_sha384_final(struct hmac_sha384_ctx *ctx,
+		       u8 out[SHA384_DIGEST_SIZE])
+{
+	__hmac_sha512_final(&ctx->ctx, out, SHA384_DIGEST_SIZE);
+}
+EXPORT_SYMBOL_GPL(hmac_sha384_final);
+
+void hmac_sha512_final(struct hmac_sha512_ctx *ctx,
+		       u8 out[SHA512_DIGEST_SIZE])
+{
+	__hmac_sha512_final(&ctx->ctx, out, SHA512_DIGEST_SIZE);
+}
+EXPORT_SYMBOL_GPL(hmac_sha512_final);
+
+void hmac_sha384(const struct hmac_sha384_key *key,
+		 const u8 *data, size_t data_len, u8 out[SHA384_DIGEST_SIZE])
+{
+	struct hmac_sha384_ctx ctx;
+
+	hmac_sha384_init(&ctx, key);
+	hmac_sha384_update(&ctx, data, data_len);
+	hmac_sha384_final(&ctx, out);	/* also wipes the on-stack ctx */
+}
+EXPORT_SYMBOL_GPL(hmac_sha384);
+
+void hmac_sha512(const struct hmac_sha512_key *key,
+		 const u8 *data, size_t data_len, u8 out[SHA512_DIGEST_SIZE])
+{
+	struct hmac_sha512_ctx ctx;
+
+	hmac_sha512_init(&ctx, key);
+	hmac_sha512_update(&ctx, data, data_len);
+	hmac_sha512_final(&ctx, out);	/* also wipes the on-stack ctx */
+}
+EXPORT_SYMBOL_GPL(hmac_sha512);
+
+void hmac_sha384_usingrawkey(const u8 *raw_key, size_t raw_key_len,
+			     const u8 *data, size_t data_len,
+			     u8 out[SHA384_DIGEST_SIZE])
+{
+	struct hmac_sha384_key key;
+
+	hmac_sha384_preparekey(&key, raw_key, raw_key_len);
+	hmac_sha384(&key, data, data_len, out);
+
+	memzero_explicit(&key, sizeof(key));	/* wipe the key-derived states */
+}
+EXPORT_SYMBOL_GPL(hmac_sha384_usingrawkey);
+
+void hmac_sha512_usingrawkey(const u8 *raw_key, size_t raw_key_len,
+			     const u8 *data, size_t data_len,
+			     u8 out[SHA512_DIGEST_SIZE])
+{
+	struct hmac_sha512_key key;
+
+	hmac_sha512_preparekey(&key, raw_key, raw_key_len);
+	hmac_sha512(&key, data, data_len, out);
+
+	memzero_explicit(&key, sizeof(key));	/* wipe the key-derived states */
+}
+EXPORT_SYMBOL_GPL(hmac_sha512_usingrawkey);
+
 #ifdef sha512_mod_init_arch
 static int __init sha512_mod_init(void)
 {
@@ -261,5 +398,5 @@ static void __exit sha512_mod_exit(void)
 module_exit(sha512_mod_exit);
 #endif
 
-MODULE_DESCRIPTION("SHA-384 and SHA-512 library functions");
+MODULE_DESCRIPTION("SHA-384, SHA-512, HMAC-SHA384, and HMAC-SHA512 library functions");
 MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 469acaa12502e05eefd439693361fe4b851a4fd5 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Mon, 30 Jun 2025 09:03:09 -0700
Subject: crypto: sha512 - Replace sha512_generic with wrapper around SHA-512
 library

Delete crypto/sha512_generic.c, which provided "generic" SHA-384 and
SHA-512 crypto_shash algorithms.  Replace it with crypto/sha512.c which
provides SHA-384, SHA-512, HMAC-SHA384, and HMAC-SHA512 crypto_shash
algorithms using the corresponding library functions.

This is a prerequisite for migrating all the arch-optimized SHA-512 code
(which is almost 3000 lines) to lib/crypto/ rather than duplicating it.

Since the replacement crypto_shash algorithms are implemented using the
(potentially arch-optimized) library functions, give them
cra_driver_names ending with "-lib" rather than "-generic".  Update
crypto/testmgr.c and one odd driver to take this change in driver name
into account.  Besides these cases which are accounted for, there are no
known cases where the cra_driver_name was being depended on.

This change does mean that the abstract partial block handling code in
crypto/shash.c, which got added in 6.16, no longer gets used.  But
that's fine; the library has to implement the partial block handling
anyway, and it's better to do it in the library since the block size and
other properties of the algorithm are all fixed at compile time there,
resulting in more streamlined code.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250630160320.2888-6-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 crypto/Kconfig                        |   4 +-
 crypto/Makefile                       |   2 +-
 crypto/sha512.c                       | 258 ++++++++++++++++++++++++++++++++++
 crypto/sha512_generic.c               | 217 ----------------------------
 crypto/testmgr.c                      |  10 ++
 drivers/crypto/starfive/jh7110-hash.c |   8 +-
 include/crypto/sha512_base.h          |   3 -
 7 files changed, 276 insertions(+), 226 deletions(-)
 create mode 100644 crypto/sha512.c
 delete mode 100644 crypto/sha512_generic.c

(limited to 'include')

diff --git a/crypto/Kconfig b/crypto/Kconfig
index e1cfd0d4cc8f..cb40a9b46972 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1002,8 +1002,10 @@ config CRYPTO_SHA256
 config CRYPTO_SHA512
 	tristate "SHA-384 and SHA-512"
 	select CRYPTO_HASH
+	select CRYPTO_LIB_SHA512
 	help
-	  SHA-384 and SHA-512 secure hash algorithms (FIPS 180, ISO/IEC 10118-3)
+	  SHA-384 and SHA-512 secure hash algorithms (FIPS 180, ISO/IEC
+	  10118-3), including HMAC support.
 
 config CRYPTO_SHA3
 	tristate "SHA-3"
diff --git a/crypto/Makefile b/crypto/Makefile
index 017df3a2e4bb..271c77462cec 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -78,7 +78,7 @@ obj-$(CONFIG_CRYPTO_RMD160) += rmd160.o
 obj-$(CONFIG_CRYPTO_SHA1) += sha1_generic.o
 obj-$(CONFIG_CRYPTO_SHA256) += sha256.o
 CFLAGS_sha256.o += -DARCH=$(ARCH)
-obj-$(CONFIG_CRYPTO_SHA512) += sha512_generic.o
+obj-$(CONFIG_CRYPTO_SHA512) += sha512.o
 obj-$(CONFIG_CRYPTO_SHA3) += sha3_generic.o
 obj-$(CONFIG_CRYPTO_SM3_GENERIC) += sm3_generic.o
 obj-$(CONFIG_CRYPTO_STREEBOG) += streebog_generic.o
diff --git a/crypto/sha512.c b/crypto/sha512.c
new file mode 100644
index 000000000000..d1e5400fe590
--- /dev/null
+++ b/crypto/sha512.c
@@ -0,0 +1,258 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Crypto API support for SHA-384, SHA-512, HMAC-SHA384, and HMAC-SHA512
+ *
+ * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) 2003 Kyle McMartin <kyle@debian.org>
+ * Copyright 2025 Google LLC
+ */
+#include <crypto/internal/hash.h>
+#include <crypto/sha2.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+/* SHA-384 */
+
+const u8 sha384_zero_message_hash[SHA384_DIGEST_SIZE] = {
+	0x38, 0xb0, 0x60, 0xa7, 0x51, 0xac, 0x96, 0x38,
+	0x4c, 0xd9, 0x32, 0x7e, 0xb1, 0xb1, 0xe3, 0x6a,
+	0x21, 0xfd, 0xb7, 0x11, 0x14, 0xbe, 0x07, 0x43,
+	0x4c, 0x0c, 0xc7, 0xbf, 0x63, 0xf6, 0xe1, 0xda,
+	0x27, 0x4e, 0xde, 0xbf, 0xe7, 0x6f, 0x65, 0xfb,
+	0xd5, 0x1a, 0xd2, 0xf1, 0x48, 0x98, 0xb9, 0x5b
+};	/* SHA-384 digest of the empty message */
+EXPORT_SYMBOL_GPL(sha384_zero_message_hash);
+
+#define SHA384_CTX(desc) ((struct sha384_ctx *)shash_desc_ctx(desc))
+
+static int crypto_sha384_init(struct shash_desc *desc)
+{
+	sha384_init(SHA384_CTX(desc));
+	return 0;
+}
+
+static int crypto_sha384_update(struct shash_desc *desc,
+				const u8 *data, unsigned int len)
+{
+	sha384_update(SHA384_CTX(desc), data, len);
+	return 0;
+}
+
+static int crypto_sha384_final(struct shash_desc *desc, u8 *out)
+{
+	sha384_final(SHA384_CTX(desc), out);
+	return 0;
+}
+
+static int crypto_sha384_digest(struct shash_desc *desc,
+				const u8 *data, unsigned int len, u8 *out)
+{
+	sha384(data, len, out);	/* one-shot; desc's context is not touched */
+	return 0;
+}
+
+/* SHA-512 */
+
+const u8 sha512_zero_message_hash[SHA512_DIGEST_SIZE] = {
+	0xcf, 0x83, 0xe1, 0x35, 0x7e, 0xef, 0xb8, 0xbd,
+	0xf1, 0x54, 0x28, 0x50, 0xd6, 0x6d, 0x80, 0x07,
+	0xd6, 0x20, 0xe4, 0x05, 0x0b, 0x57, 0x15, 0xdc,
+	0x83, 0xf4, 0xa9, 0x21, 0xd3, 0x6c, 0xe9, 0xce,
+	0x47, 0xd0, 0xd1, 0x3c, 0x5d, 0x85, 0xf2, 0xb0,
+	0xff, 0x83, 0x18, 0xd2, 0x87, 0x7e, 0xec, 0x2f,
+	0x63, 0xb9, 0x31, 0xbd, 0x47, 0x41, 0x7a, 0x81,
+	0xa5, 0x38, 0x32, 0x7a, 0xf9, 0x27, 0xda, 0x3e
+};	/* SHA-512 digest of the empty message */
+EXPORT_SYMBOL_GPL(sha512_zero_message_hash);
+
+#define SHA512_CTX(desc) ((struct sha512_ctx *)shash_desc_ctx(desc))
+
+static int crypto_sha512_init(struct shash_desc *desc)
+{
+	sha512_init(SHA512_CTX(desc));
+	return 0;
+}
+
+static int crypto_sha512_update(struct shash_desc *desc,
+				const u8 *data, unsigned int len)
+{
+	sha512_update(SHA512_CTX(desc), data, len);
+	return 0;
+}
+
+static int crypto_sha512_final(struct shash_desc *desc, u8 *out)
+{
+	sha512_final(SHA512_CTX(desc), out);
+	return 0;
+}
+
+static int crypto_sha512_digest(struct shash_desc *desc,
+				const u8 *data, unsigned int len, u8 *out)
+{
+	sha512(data, len, out);	/* one-shot; desc's context is not touched */
+	return 0;
+}
+
+/* HMAC-SHA384 */
+
+#define HMAC_SHA384_KEY(tfm) ((struct hmac_sha384_key *)crypto_shash_ctx(tfm))
+#define HMAC_SHA384_CTX(desc) ((struct hmac_sha384_ctx *)shash_desc_ctx(desc))
+
+static int crypto_hmac_sha384_setkey(struct crypto_shash *tfm,
+				     const u8 *raw_key, unsigned int keylen)
+{
+	hmac_sha384_preparekey(HMAC_SHA384_KEY(tfm), raw_key, keylen);
+	return 0;	/* every key length is accepted, so this never fails */
+}
+
+static int crypto_hmac_sha384_init(struct shash_desc *desc)
+{
+	hmac_sha384_init(HMAC_SHA384_CTX(desc), HMAC_SHA384_KEY(desc->tfm));
+	return 0;
+}
+
+static int crypto_hmac_sha384_update(struct shash_desc *desc,
+				     const u8 *data, unsigned int len)
+{
+	hmac_sha384_update(HMAC_SHA384_CTX(desc), data, len);
+	return 0;
+}
+
+static int crypto_hmac_sha384_final(struct shash_desc *desc, u8 *out)
+{
+	hmac_sha384_final(HMAC_SHA384_CTX(desc), out);
+	return 0;
+}
+
+static int crypto_hmac_sha384_digest(struct shash_desc *desc,
+				     const u8 *data, unsigned int len,
+				     u8 *out)
+{
+	hmac_sha384(HMAC_SHA384_KEY(desc->tfm), data, len, out);	/* one-shot */
+	return 0;
+}
+
+/* HMAC-SHA512 */
+
+#define HMAC_SHA512_KEY(tfm) ((struct hmac_sha512_key *)crypto_shash_ctx(tfm))
+#define HMAC_SHA512_CTX(desc) ((struct hmac_sha512_ctx *)shash_desc_ctx(desc))
+
+static int crypto_hmac_sha512_setkey(struct crypto_shash *tfm,
+				     const u8 *raw_key, unsigned int keylen)
+{
+	hmac_sha512_preparekey(HMAC_SHA512_KEY(tfm), raw_key, keylen);
+	return 0;	/* every key length is accepted, so this never fails */
+}
+
+static int crypto_hmac_sha512_init(struct shash_desc *desc)
+{
+	hmac_sha512_init(HMAC_SHA512_CTX(desc), HMAC_SHA512_KEY(desc->tfm));
+	return 0;
+}
+
+static int crypto_hmac_sha512_update(struct shash_desc *desc,
+				     const u8 *data, unsigned int len)
+{
+	hmac_sha512_update(HMAC_SHA512_CTX(desc), data, len);
+	return 0;
+}
+
+static int crypto_hmac_sha512_final(struct shash_desc *desc, u8 *out)
+{
+	hmac_sha512_final(HMAC_SHA512_CTX(desc), out);
+	return 0;
+}
+
+static int crypto_hmac_sha512_digest(struct shash_desc *desc,
+				     const u8 *data, unsigned int len,
+				     u8 *out)
+{
+	hmac_sha512(HMAC_SHA512_KEY(desc->tfm), data, len, out);	/* one-shot */
+	return 0;
+}
+
+/* Algorithm definitions */
+
+static struct shash_alg algs[] = {	/* all registered together at module init */
+	{
+		.base.cra_name		= "sha384",
+		.base.cra_driver_name	= "sha384-lib",
+		.base.cra_priority	= 300,
+		.base.cra_blocksize	= SHA384_BLOCK_SIZE,
+		.base.cra_module	= THIS_MODULE,
+		.digestsize		= SHA384_DIGEST_SIZE,
+		.init			= crypto_sha384_init,
+		.update			= crypto_sha384_update,
+		.final			= crypto_sha384_final,
+		.digest			= crypto_sha384_digest,
+		.descsize		= sizeof(struct sha384_ctx),
+	},
+	{
+		.base.cra_name		= "sha512",
+		.base.cra_driver_name	= "sha512-lib",
+		.base.cra_priority	= 300,
+		.base.cra_blocksize	= SHA512_BLOCK_SIZE,
+		.base.cra_module	= THIS_MODULE,
+		.digestsize		= SHA512_DIGEST_SIZE,
+		.init			= crypto_sha512_init,
+		.update			= crypto_sha512_update,
+		.final			= crypto_sha512_final,
+		.digest			= crypto_sha512_digest,
+		.descsize		= sizeof(struct sha512_ctx),
+	},
+	{
+		.base.cra_name		= "hmac(sha384)",
+		.base.cra_driver_name	= "hmac-sha384-lib",
+		.base.cra_priority	= 300,
+		.base.cra_blocksize	= SHA384_BLOCK_SIZE,
+		.base.cra_ctxsize	= sizeof(struct hmac_sha384_key),
+		.base.cra_module	= THIS_MODULE,
+		.digestsize		= SHA384_DIGEST_SIZE,
+		.setkey			= crypto_hmac_sha384_setkey,
+		.init			= crypto_hmac_sha384_init,
+		.update			= crypto_hmac_sha384_update,
+		.final			= crypto_hmac_sha384_final,
+		.digest			= crypto_hmac_sha384_digest,
+		.descsize		= sizeof(struct hmac_sha384_ctx),
+	},
+	{
+		.base.cra_name		= "hmac(sha512)",
+		.base.cra_driver_name	= "hmac-sha512-lib",
+		.base.cra_priority	= 300,
+		.base.cra_blocksize	= SHA512_BLOCK_SIZE,
+		.base.cra_ctxsize	= sizeof(struct hmac_sha512_key),
+		.base.cra_module	= THIS_MODULE,
+		.digestsize		= SHA512_DIGEST_SIZE,
+		.setkey			= crypto_hmac_sha512_setkey,
+		.init			= crypto_hmac_sha512_init,
+		.update			= crypto_hmac_sha512_update,
+		.final			= crypto_hmac_sha512_final,
+		.digest			= crypto_hmac_sha512_digest,
+		.descsize		= sizeof(struct hmac_sha512_ctx),
+	},
+};
+
+static int __init crypto_sha512_mod_init(void)
+{
+	return crypto_register_shashes(algs, ARRAY_SIZE(algs));
+}
+module_init(crypto_sha512_mod_init);
+
+static void __exit crypto_sha512_mod_exit(void)
+{
+	crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+}
+module_exit(crypto_sha512_mod_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Crypto API support for SHA-384, SHA-512, HMAC-SHA384, and HMAC-SHA512");
+
+MODULE_ALIAS_CRYPTO("sha384");
+MODULE_ALIAS_CRYPTO("sha384-lib");
+MODULE_ALIAS_CRYPTO("sha512");
+MODULE_ALIAS_CRYPTO("sha512-lib");
+MODULE_ALIAS_CRYPTO("hmac(sha384)");
+MODULE_ALIAS_CRYPTO("hmac-sha384-lib");
+MODULE_ALIAS_CRYPTO("hmac(sha512)");
+MODULE_ALIAS_CRYPTO("hmac-sha512-lib");
diff --git a/crypto/sha512_generic.c b/crypto/sha512_generic.c
deleted file mode 100644
index 7368173f545e..000000000000
--- a/crypto/sha512_generic.c
+++ /dev/null
@@ -1,217 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/* SHA-512 code by Jean-Luc Cooke <jlcooke@certainkey.com>
- *
- * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
- * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
- * Copyright (c) 2003 Kyle McMartin <kyle@debian.org>
- */
-#include <crypto/internal/hash.h>
-#include <crypto/sha2.h>
-#include <crypto/sha512_base.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/unaligned.h>
-
-const u8 sha384_zero_message_hash[SHA384_DIGEST_SIZE] = {
-	0x38, 0xb0, 0x60, 0xa7, 0x51, 0xac, 0x96, 0x38,
-	0x4c, 0xd9, 0x32, 0x7e, 0xb1, 0xb1, 0xe3, 0x6a,
-	0x21, 0xfd, 0xb7, 0x11, 0x14, 0xbe, 0x07, 0x43,
-	0x4c, 0x0c, 0xc7, 0xbf, 0x63, 0xf6, 0xe1, 0xda,
-	0x27, 0x4e, 0xde, 0xbf, 0xe7, 0x6f, 0x65, 0xfb,
-	0xd5, 0x1a, 0xd2, 0xf1, 0x48, 0x98, 0xb9, 0x5b
-};
-EXPORT_SYMBOL_GPL(sha384_zero_message_hash);
-
-const u8 sha512_zero_message_hash[SHA512_DIGEST_SIZE] = {
-	0xcf, 0x83, 0xe1, 0x35, 0x7e, 0xef, 0xb8, 0xbd,
-	0xf1, 0x54, 0x28, 0x50, 0xd6, 0x6d, 0x80, 0x07,
-	0xd6, 0x20, 0xe4, 0x05, 0x0b, 0x57, 0x15, 0xdc,
-	0x83, 0xf4, 0xa9, 0x21, 0xd3, 0x6c, 0xe9, 0xce,
-	0x47, 0xd0, 0xd1, 0x3c, 0x5d, 0x85, 0xf2, 0xb0,
-	0xff, 0x83, 0x18, 0xd2, 0x87, 0x7e, 0xec, 0x2f,
-	0x63, 0xb9, 0x31, 0xbd, 0x47, 0x41, 0x7a, 0x81,
-	0xa5, 0x38, 0x32, 0x7a, 0xf9, 0x27, 0xda, 0x3e
-};
-EXPORT_SYMBOL_GPL(sha512_zero_message_hash);
-
-static inline u64 Ch(u64 x, u64 y, u64 z)
-{
-        return z ^ (x & (y ^ z));
-}
-
-static inline u64 Maj(u64 x, u64 y, u64 z)
-{
-        return (x & y) | (z & (x | y));
-}
-
-static const u64 sha512_K[80] = {
-        0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, 0xb5c0fbcfec4d3b2fULL,
-        0xe9b5dba58189dbbcULL, 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
-        0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, 0xd807aa98a3030242ULL,
-        0x12835b0145706fbeULL, 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
-        0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, 0x9bdc06a725c71235ULL,
-        0xc19bf174cf692694ULL, 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
-        0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, 0x2de92c6f592b0275ULL,
-        0x4a7484aa6ea6e483ULL, 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
-        0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, 0xb00327c898fb213fULL,
-        0xbf597fc7beef0ee4ULL, 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
-        0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, 0x27b70a8546d22ffcULL,
-        0x2e1b21385c26c926ULL, 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
-        0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, 0x81c2c92e47edaee6ULL,
-        0x92722c851482353bULL, 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
-        0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, 0xd192e819d6ef5218ULL,
-        0xd69906245565a910ULL, 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
-        0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, 0x2748774cdf8eeb99ULL,
-        0x34b0bcb5e19b48a8ULL, 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
-        0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, 0x748f82ee5defb2fcULL,
-        0x78a5636f43172f60ULL, 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
-        0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, 0xbef9a3f7b2c67915ULL,
-        0xc67178f2e372532bULL, 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
-        0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, 0x06f067aa72176fbaULL,
-        0x0a637dc5a2c898a6ULL, 0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
-        0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, 0x3c9ebe0a15c9bebcULL,
-        0x431d67c49c100d4cULL, 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
-        0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL,
-};
-
-#define e0(x)       (ror64(x,28) ^ ror64(x,34) ^ ror64(x,39))
-#define e1(x)       (ror64(x,14) ^ ror64(x,18) ^ ror64(x,41))
-#define s0(x)       (ror64(x, 1) ^ ror64(x, 8) ^ (x >> 7))
-#define s1(x)       (ror64(x,19) ^ ror64(x,61) ^ (x >> 6))
-
-static inline void LOAD_OP(int I, u64 *W, const u8 *input)
-{
-	W[I] = get_unaligned_be64((__u64 *)input + I);
-}
-
-static inline void BLEND_OP(int I, u64 *W)
-{
-	W[I & 15] += s1(W[(I-2) & 15]) + W[(I-7) & 15] + s0(W[(I-15) & 15]);
-}
-
-static void
-sha512_transform(u64 *state, const u8 *input)
-{
-	u64 a, b, c, d, e, f, g, h, t1, t2;
-
-	int i;
-	u64 W[16];
-
-	/* load the state into our registers */
-	a=state[0];   b=state[1];   c=state[2];   d=state[3];
-	e=state[4];   f=state[5];   g=state[6];   h=state[7];
-
-	/* now iterate */
-	for (i=0; i<80; i+=8) {
-		if (!(i & 8)) {
-			int j;
-
-			if (i < 16) {
-				/* load the input */
-				for (j = 0; j < 16; j++)
-					LOAD_OP(i + j, W, input);
-			} else {
-				for (j = 0; j < 16; j++) {
-					BLEND_OP(i + j, W);
-				}
-			}
-		}
-
-		t1 = h + e1(e) + Ch(e,f,g) + sha512_K[i  ] + W[(i & 15)];
-		t2 = e0(a) + Maj(a,b,c);    d+=t1;    h=t1+t2;
-		t1 = g + e1(d) + Ch(d,e,f) + sha512_K[i+1] + W[(i & 15) + 1];
-		t2 = e0(h) + Maj(h,a,b);    c+=t1;    g=t1+t2;
-		t1 = f + e1(c) + Ch(c,d,e) + sha512_K[i+2] + W[(i & 15) + 2];
-		t2 = e0(g) + Maj(g,h,a);    b+=t1;    f=t1+t2;
-		t1 = e + e1(b) + Ch(b,c,d) + sha512_K[i+3] + W[(i & 15) + 3];
-		t2 = e0(f) + Maj(f,g,h);    a+=t1;    e=t1+t2;
-		t1 = d + e1(a) + Ch(a,b,c) + sha512_K[i+4] + W[(i & 15) + 4];
-		t2 = e0(e) + Maj(e,f,g);    h+=t1;    d=t1+t2;
-		t1 = c + e1(h) + Ch(h,a,b) + sha512_K[i+5] + W[(i & 15) + 5];
-		t2 = e0(d) + Maj(d,e,f);    g+=t1;    c=t1+t2;
-		t1 = b + e1(g) + Ch(g,h,a) + sha512_K[i+6] + W[(i & 15) + 6];
-		t2 = e0(c) + Maj(c,d,e);    f+=t1;    b=t1+t2;
-		t1 = a + e1(f) + Ch(f,g,h) + sha512_K[i+7] + W[(i & 15) + 7];
-		t2 = e0(b) + Maj(b,c,d);    e+=t1;    a=t1+t2;
-	}
-
-	state[0] += a; state[1] += b; state[2] += c; state[3] += d;
-	state[4] += e; state[5] += f; state[6] += g; state[7] += h;
-}
-
-void sha512_generic_block_fn(struct sha512_state *sst, u8 const *src,
-			     int blocks)
-{
-	do {
-		sha512_transform(sst->state, src);
-		src += SHA512_BLOCK_SIZE;
-	} while (--blocks);
-}
-EXPORT_SYMBOL_GPL(sha512_generic_block_fn);
-
-static int crypto_sha512_update(struct shash_desc *desc, const u8 *data,
-				unsigned int len)
-{
-	return sha512_base_do_update_blocks(desc, data, len,
-					    sha512_generic_block_fn);
-}
-
-static int crypto_sha512_finup(struct shash_desc *desc, const u8 *data,
-			       unsigned int len, u8 *hash)
-{
-	sha512_base_do_finup(desc, data, len, sha512_generic_block_fn);
-	return sha512_base_finish(desc, hash);
-}
-
-static struct shash_alg sha512_algs[2] = { {
-	.digestsize	=	SHA512_DIGEST_SIZE,
-	.init		=	sha512_base_init,
-	.update		=	crypto_sha512_update,
-	.finup		=	crypto_sha512_finup,
-	.descsize	=	SHA512_STATE_SIZE,
-	.base		=	{
-		.cra_name	=	"sha512",
-		.cra_driver_name =	"sha512-generic",
-		.cra_priority	=	100,
-		.cra_flags	=	CRYPTO_AHASH_ALG_BLOCK_ONLY |
-					CRYPTO_AHASH_ALG_FINUP_MAX,
-		.cra_blocksize	=	SHA512_BLOCK_SIZE,
-		.cra_module	=	THIS_MODULE,
-	}
-}, {
-	.digestsize	=	SHA384_DIGEST_SIZE,
-	.init		=	sha384_base_init,
-	.update		=	crypto_sha512_update,
-	.finup		=	crypto_sha512_finup,
-	.descsize	=	SHA512_STATE_SIZE,
-	.base		=	{
-		.cra_name	=	"sha384",
-		.cra_driver_name =	"sha384-generic",
-		.cra_priority	=	100,
-		.cra_flags	=	CRYPTO_AHASH_ALG_BLOCK_ONLY |
-					CRYPTO_AHASH_ALG_FINUP_MAX,
-		.cra_blocksize	=	SHA384_BLOCK_SIZE,
-		.cra_module	=	THIS_MODULE,
-	}
-} };
-
-static int __init sha512_generic_mod_init(void)
-{
-	return crypto_register_shashes(sha512_algs, ARRAY_SIZE(sha512_algs));
-}
-
-static void __exit sha512_generic_mod_fini(void)
-{
-	crypto_unregister_shashes(sha512_algs, ARRAY_SIZE(sha512_algs));
-}
-
-module_init(sha512_generic_mod_init);
-module_exit(sha512_generic_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA-512 and SHA-384 Secure Hash Algorithms");
-
-MODULE_ALIAS_CRYPTO("sha384");
-MODULE_ALIAS_CRYPTO("sha384-generic");
-MODULE_ALIAS_CRYPTO("sha512");
-MODULE_ALIAS_CRYPTO("sha512-generic");
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 32f753d6c430..9d8b11ea4af7 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -4315,12 +4315,14 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 	}, {
 		.alg = "authenc(hmac(sha384),cbc(des))",
+		.generic_driver = "authenc(hmac-sha384-lib,cbc(des-generic))",
 		.test = alg_test_aead,
 		.suite = {
 			.aead = __VECS(hmac_sha384_des_cbc_tv_temp)
 		}
 	}, {
 		.alg = "authenc(hmac(sha384),cbc(des3_ede))",
+		.generic_driver = "authenc(hmac-sha384-lib,cbc(des3_ede-generic))",
 		.test = alg_test_aead,
 		.suite = {
 			.aead = __VECS(hmac_sha384_des3_ede_cbc_tv_temp)
@@ -4331,6 +4333,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 	}, {
 		.alg = "authenc(hmac(sha384),cts(cbc(aes)))",
+		.generic_driver = "authenc(hmac-sha384-lib,cts(cbc(aes-generic)))",
 		.test = alg_test_aead,
 		.suite = {
 			.aead = __VECS(krb5_test_aes256_cts_hmac_sha384_192)
@@ -4341,6 +4344,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 	}, {
 		.alg = "authenc(hmac(sha512),cbc(aes))",
+		.generic_driver = "authenc(hmac-sha512-lib,cbc(aes-generic))",
 		.fips_allowed = 1,
 		.test = alg_test_aead,
 		.suite = {
@@ -4348,12 +4352,14 @@ static const struct alg_test_desc alg_test_descs[] = {
 		}
 	}, {
 		.alg = "authenc(hmac(sha512),cbc(des))",
+		.generic_driver = "authenc(hmac-sha512-lib,cbc(des-generic))",
 		.test = alg_test_aead,
 		.suite = {
 			.aead = __VECS(hmac_sha512_des_cbc_tv_temp)
 		}
 	}, {
 		.alg = "authenc(hmac(sha512),cbc(des3_ede))",
+		.generic_driver = "authenc(hmac-sha512-lib,cbc(des3_ede-generic))",
 		.test = alg_test_aead,
 		.suite = {
 			.aead = __VECS(hmac_sha512_des3_ede_cbc_tv_temp)
@@ -5157,6 +5163,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		}
 	}, {
 		.alg = "hmac(sha384)",
+		.generic_driver = "hmac-sha384-lib",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
@@ -5164,6 +5171,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		}
 	}, {
 		.alg = "hmac(sha512)",
+		.generic_driver = "hmac-sha512-lib",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
@@ -5493,6 +5501,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		}
 	}, {
 		.alg = "sha384",
+		.generic_driver = "sha384-lib",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
@@ -5500,6 +5509,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		}
 	}, {
 		.alg = "sha512",
+		.generic_driver = "sha512-lib",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
diff --git a/drivers/crypto/starfive/jh7110-hash.c b/drivers/crypto/starfive/jh7110-hash.c
index 2c60a1047bc3..4abbff07412f 100644
--- a/drivers/crypto/starfive/jh7110-hash.c
+++ b/drivers/crypto/starfive/jh7110-hash.c
@@ -505,13 +505,13 @@ static int starfive_sha256_init_tfm(struct crypto_ahash *hash)
 
 static int starfive_sha384_init_tfm(struct crypto_ahash *hash)
 {
-	return starfive_hash_init_tfm(hash, "sha384-generic",
+	return starfive_hash_init_tfm(hash, "sha384-lib",
 				      STARFIVE_HASH_SHA384, 0);
 }
 
 static int starfive_sha512_init_tfm(struct crypto_ahash *hash)
 {
-	return starfive_hash_init_tfm(hash, "sha512-generic",
+	return starfive_hash_init_tfm(hash, "sha512-lib",
 				      STARFIVE_HASH_SHA512, 0);
 }
 
@@ -535,13 +535,13 @@ static int starfive_hmac_sha256_init_tfm(struct crypto_ahash *hash)
 
 static int starfive_hmac_sha384_init_tfm(struct crypto_ahash *hash)
 {
-	return starfive_hash_init_tfm(hash, "hmac(sha384-generic)",
+	return starfive_hash_init_tfm(hash, "hmac-sha384-lib",
 				      STARFIVE_HASH_SHA384, 1);
 }
 
 static int starfive_hmac_sha512_init_tfm(struct crypto_ahash *hash)
 {
-	return starfive_hash_init_tfm(hash, "hmac(sha512-generic)",
+	return starfive_hash_init_tfm(hash, "hmac-sha512-lib",
 				      STARFIVE_HASH_SHA512, 1);
 }
 
diff --git a/include/crypto/sha512_base.h b/include/crypto/sha512_base.h
index aa814bab442d..d1361b3eb70b 100644
--- a/include/crypto/sha512_base.h
+++ b/include/crypto/sha512_base.h
@@ -114,7 +114,4 @@ static inline int sha512_base_finish(struct shash_desc *desc, u8 *out)
 	return 0;
 }
 
-void sha512_generic_block_fn(struct sha512_state *sst, u8 const *src,
-			     int blocks);
-
 #endif /* _CRYPTO_SHA512_BASE_H */
-- 
cgit v1.2.3


From 9b5c0d82b26d10733f67e10ea1889fc24aa6840a Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Mon, 30 Jun 2025 09:03:20 -0700
Subject: crypto: sha512 - Remove sha512_base.h

sha512_base.h is no longer used, so remove it.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250630160320.2888-17-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 include/crypto/sha512_base.h | 117 -------------------------------------------
 1 file changed, 117 deletions(-)
 delete mode 100644 include/crypto/sha512_base.h

(limited to 'include')

diff --git a/include/crypto/sha512_base.h b/include/crypto/sha512_base.h
deleted file mode 100644
index d1361b3eb70b..000000000000
--- a/include/crypto/sha512_base.h
+++ /dev/null
@@ -1,117 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * sha512_base.h - core logic for SHA-512 implementations
- *
- * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#ifndef _CRYPTO_SHA512_BASE_H
-#define _CRYPTO_SHA512_BASE_H
-
-#include <crypto/internal/hash.h>
-#include <crypto/sha2.h>
-#include <linux/compiler.h>
-#include <linux/math.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/unaligned.h>
-
-typedef void (sha512_block_fn)(struct sha512_state *sst, u8 const *src,
-			       int blocks);
-
-static inline int sha384_base_init(struct shash_desc *desc)
-{
-	struct sha512_state *sctx = shash_desc_ctx(desc);
-
-	sctx->state[0] = SHA384_H0;
-	sctx->state[1] = SHA384_H1;
-	sctx->state[2] = SHA384_H2;
-	sctx->state[3] = SHA384_H3;
-	sctx->state[4] = SHA384_H4;
-	sctx->state[5] = SHA384_H5;
-	sctx->state[6] = SHA384_H6;
-	sctx->state[7] = SHA384_H7;
-	sctx->count[0] = sctx->count[1] = 0;
-
-	return 0;
-}
-
-static inline int sha512_base_init(struct shash_desc *desc)
-{
-	struct sha512_state *sctx = shash_desc_ctx(desc);
-
-	sctx->state[0] = SHA512_H0;
-	sctx->state[1] = SHA512_H1;
-	sctx->state[2] = SHA512_H2;
-	sctx->state[3] = SHA512_H3;
-	sctx->state[4] = SHA512_H4;
-	sctx->state[5] = SHA512_H5;
-	sctx->state[6] = SHA512_H6;
-	sctx->state[7] = SHA512_H7;
-	sctx->count[0] = sctx->count[1] = 0;
-
-	return 0;
-}
-
-static inline int sha512_base_do_update_blocks(struct shash_desc *desc,
-					       const u8 *data,
-					       unsigned int len,
-					       sha512_block_fn *block_fn)
-{
-	unsigned int remain = len - round_down(len, SHA512_BLOCK_SIZE);
-	struct sha512_state *sctx = shash_desc_ctx(desc);
-
-	len -= remain;
-	sctx->count[0] += len;
-	if (sctx->count[0] < len)
-		sctx->count[1]++;
-	block_fn(sctx, data, len / SHA512_BLOCK_SIZE);
-	return remain;
-}
-
-static inline int sha512_base_do_finup(struct shash_desc *desc, const u8 *src,
-				       unsigned int len,
-				       sha512_block_fn *block_fn)
-{
-	unsigned int bit_offset = SHA512_BLOCK_SIZE / 8 - 2;
-	struct sha512_state *sctx = shash_desc_ctx(desc);
-	union {
-		__be64 b64[SHA512_BLOCK_SIZE / 4];
-		u8 u8[SHA512_BLOCK_SIZE * 2];
-	} block = {};
-
-	if (len >= SHA512_BLOCK_SIZE) {
-		int remain;
-
-		remain = sha512_base_do_update_blocks(desc, src, len, block_fn);
-		src += len - remain;
-		len = remain;
-	}
-
-	if (len >= bit_offset * 8)
-		bit_offset += SHA512_BLOCK_SIZE / 8;
-	memcpy(&block, src, len);
-	block.u8[len] = 0x80;
-	sctx->count[0] += len;
-	block.b64[bit_offset] = cpu_to_be64(sctx->count[1] << 3 |
-					    sctx->count[0] >> 61);
-	block.b64[bit_offset + 1] = cpu_to_be64(sctx->count[0] << 3);
-	block_fn(sctx, block.u8, (bit_offset + 2) * 8 / SHA512_BLOCK_SIZE);
-	memzero_explicit(&block, sizeof(block));
-
-	return 0;
-}
-
-static inline int sha512_base_finish(struct shash_desc *desc, u8 *out)
-{
-	unsigned int digest_size = crypto_shash_digestsize(desc->tfm);
-	struct sha512_state *sctx = shash_desc_ctx(desc);
-	__be64 *digest = (__be64 *)out;
-	int i;
-
-	for (i = 0; digest_size > 0; i++, digest_size -= sizeof(__be64))
-		put_unaligned_be64(sctx->state[i], digest++);
-	return 0;
-}
-
-#endif /* _CRYPTO_SHA512_BASE_H */
-- 
cgit v1.2.3


From f2703a104e89077e622e2f34ac686262c5180d71 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Fri, 6 Jun 2025 20:22:28 -0700
Subject: lib/crc32: Remove unused combination support

Remove crc32_le_combine() and crc32_le_shift(), since they are no longer
used.

Although combination is an interesting thing that can be done with CRCs,
it turned out that none of the users of it in the kernel were even close
to being worthwhile.  All were much better off simply chaining the CRCs
or processing zeroes.

Let's remove the CRC32 combination code for now.  It can come back
(potentially optimized with carryless multiplication instructions) if
there is ever a case where it would actually be worthwhile.

Link: https://lore.kernel.org/r/20250607032228.27868-1-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 include/linux/crc32.h | 25 -------------------
 lib/crc32.c           | 67 ---------------------------------------------------
 lib/tests/crc_kunit.c | 39 +-----------------------------
 3 files changed, 1 insertion(+), 130 deletions(-)

(limited to 'include')

diff --git a/include/linux/crc32.h b/include/linux/crc32.h
index 8c1883b81b42..36bbc0405aa0 100644
--- a/include/linux/crc32.h
+++ b/include/linux/crc32.h
@@ -48,31 +48,6 @@ u32 crc32_optimizations(void);
 static inline u32 crc32_optimizations(void) { return 0; }
 #endif
 
-/**
- * crc32_le_combine - Combine two crc32 check values into one. For two
- * 		      sequences of bytes, seq1 and seq2 with lengths len1
- * 		      and len2, crc32_le() check values were calculated
- * 		      for each, crc1 and crc2.
- *
- * @crc1: crc32 of the first block
- * @crc2: crc32 of the second block
- * @len2: length of the second block
- *
- * Return: The crc32_le() check value of seq1 and seq2 concatenated,
- * 	   requiring only crc1, crc2, and len2. Note: If seq_full denotes
- * 	   the concatenated memory area of seq1 with seq2, and crc_full
- * 	   the crc32_le() value of seq_full, then crc_full ==
- * 	   crc32_le_combine(crc1, crc2, len2) when crc_full was seeded
- * 	   with the same initializer as crc1, and crc2 seed was 0. See
- * 	   also crc32_combine_test().
- */
-u32 crc32_le_shift(u32 crc, size_t len);
-
-static inline u32 crc32_le_combine(u32 crc1, u32 crc2, size_t len2)
-{
-	return crc32_le_shift(crc1, len2) ^ crc2;
-}
-
 #define crc32(seed, data, length)  crc32_le(seed, (unsigned char const *)(data), length)
 
 /*
diff --git a/lib/crc32.c b/lib/crc32.c
index 95429861d3ac..6811b37df2aa 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -25,7 +25,6 @@
 /* see: Documentation/staging/crc32.rst for a description of algorithms */
 
 #include <linux/crc32.h>
-#include <linux/crc32poly.h>
 #include <linux/module.h>
 #include <linux/types.h>
 
@@ -51,72 +50,6 @@ u32 crc32c_base(u32 crc, const u8 *p, size_t len)
 }
 EXPORT_SYMBOL(crc32c_base);
 
-/*
- * This multiplies the polynomials x and y modulo the given modulus.
- * This follows the "little-endian" CRC convention that the lsbit
- * represents the highest power of x, and the msbit represents x^0.
- */
-static u32 gf2_multiply(u32 x, u32 y, u32 modulus)
-{
-	u32 product = x & 1 ? y : 0;
-	int i;
-
-	for (i = 0; i < 31; i++) {
-		product = (product >> 1) ^ (product & 1 ? modulus : 0);
-		x >>= 1;
-		product ^= x & 1 ? y : 0;
-	}
-
-	return product;
-}
-
-/**
- * crc32_generic_shift - Append @len 0 bytes to crc, in logarithmic time
- * @crc: The original little-endian CRC (i.e. lsbit is x^31 coefficient)
- * @len: The number of bytes. @crc is multiplied by x^(8*@len)
- * @polynomial: The modulus used to reduce the result to 32 bits.
- *
- * It's possible to parallelize CRC computations by computing a CRC
- * over separate ranges of a buffer, then summing them.
- * This shifts the given CRC by 8*len bits (i.e. produces the same effect
- * as appending len bytes of zero to the data), in time proportional
- * to log(len).
- */
-static u32 crc32_generic_shift(u32 crc, size_t len, u32 polynomial)
-{
-	u32 power = polynomial;	/* CRC of x^32 */
-	int i;
-
-	/* Shift up to 32 bits in the simple linear way */
-	for (i = 0; i < 8 * (int)(len & 3); i++)
-		crc = (crc >> 1) ^ (crc & 1 ? polynomial : 0);
-
-	len >>= 2;
-	if (!len)
-		return crc;
-
-	for (;;) {
-		/* "power" is x^(2^i), modulo the polynomial */
-		if (len & 1)
-			crc = gf2_multiply(crc, power, polynomial);
-
-		len >>= 1;
-		if (!len)
-			break;
-
-		/* Square power, advancing to x^(2^(i+1)) */
-		power = gf2_multiply(power, power, polynomial);
-	}
-
-	return crc;
-}
-
-u32 crc32_le_shift(u32 crc, size_t len)
-{
-	return crc32_generic_shift(crc, len, CRC32_POLY_LE);
-}
-EXPORT_SYMBOL(crc32_le_shift);
-
 u32 crc32_be_base(u32 crc, const u8 *p, size_t len)
 {
 	while (len--)
diff --git a/lib/tests/crc_kunit.c b/lib/tests/crc_kunit.c
index 064c2d581557..f08d985d8860 100644
--- a/lib/tests/crc_kunit.c
+++ b/lib/tests/crc_kunit.c
@@ -36,14 +36,12 @@ static size_t test_buflen;
  *	  can fit any CRC up to CRC-64.  The CRC is passed in, and is expected
  *	  to be returned in, the least significant bits of the u64.  The
  *	  function is expected to *not* invert the CRC at the beginning and end.
- * @combine_func: Optional function to combine two CRCs.
  */
 struct crc_variant {
 	int bits;
 	bool le;
 	u64 poly;
 	u64 (*func)(u64 crc, const u8 *p, size_t len);
-	u64 (*combine_func)(u64 crc1, u64 crc2, size_t len2);
 };
 
 static u32 rand32(void)
@@ -144,7 +142,7 @@ static size_t generate_random_length(size_t max_length)
 }
 
 /* Test that v->func gives the same CRCs as a reference implementation. */
-static void crc_main_test(struct kunit *test, const struct crc_variant *v)
+static void crc_test(struct kunit *test, const struct crc_variant *v)
 {
 	size_t i;
 
@@ -188,35 +186,6 @@ static void crc_main_test(struct kunit *test, const struct crc_variant *v)
 	}
 }
 
-/* Test that CRC(concat(A, B)) == combine_CRCs(CRC(A), CRC(B), len(B)). */
-static void crc_combine_test(struct kunit *test, const struct crc_variant *v)
-{
-	int i;
-
-	for (i = 0; i < 100; i++) {
-		u64 init_crc = generate_random_initial_crc(v);
-		size_t len1 = generate_random_length(CRC_KUNIT_MAX_LEN);
-		size_t len2 = generate_random_length(CRC_KUNIT_MAX_LEN - len1);
-		u64 crc1, crc2, expected_crc, actual_crc;
-
-		prandom_bytes_state(&rng, test_buffer, len1 + len2);
-		crc1 = v->func(init_crc, test_buffer, len1);
-		crc2 = v->func(0, &test_buffer[len1], len2);
-		expected_crc = v->func(init_crc, test_buffer, len1 + len2);
-		actual_crc = v->combine_func(crc1, crc2, len2);
-		KUNIT_EXPECT_EQ_MSG(test, expected_crc, actual_crc,
-				    "CRC combination gave wrong result with len1=%zu len2=%zu\n",
-				    len1, len2);
-	}
-}
-
-static void crc_test(struct kunit *test, const struct crc_variant *v)
-{
-	crc_main_test(test, v);
-	if (v->combine_func)
-		crc_combine_test(test, v);
-}
-
 static __always_inline void
 crc_benchmark(struct kunit *test,
 	      u64 (*crc_func)(u64 crc, const u8 *p, size_t len))
@@ -337,17 +306,11 @@ static u64 crc32_le_wrapper(u64 crc, const u8 *p, size_t len)
 	return crc32_le(crc, p, len);
 }
 
-static u64 crc32_le_combine_wrapper(u64 crc1, u64 crc2, size_t len2)
-{
-	return crc32_le_combine(crc1, crc2, len2);
-}
-
 static const struct crc_variant crc_variant_crc32_le = {
 	.bits = 32,
 	.le = true,
 	.poly = 0xedb88320,
 	.func = crc32_le_wrapper,
-	.combine_func = crc32_le_combine_wrapper,
 };
 
 static void crc32_le_test(struct kunit *test)
-- 
cgit v1.2.3


From 89a51591405e09a862b9ca1ccfa880986c495c3c Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Sat, 7 Jun 2025 13:04:43 -0700
Subject: lib/crc: Move files into lib/crc/

Move all CRC files in lib/ into a subdirectory lib/crc/ to keep them
from cluttering up the main lib/ directory.

Reviewed-by: "Martin K. Petersen" <martin.petersen@oracle.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Acked-by: "Jason A. Donenfeld" <Jason@zx2c4.com>
Link: https://lore.kernel.org/r/20250607200454.73587-2-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 Documentation/core-api/kernel-api.rst |  14 +-
 MAINTAINERS                           |   3 +-
 include/linux/crc64.h                 |   3 -
 lib/Kconfig                           |  88 +------
 lib/Kconfig.debug                     |  21 --
 lib/Makefile                          |  32 +--
 lib/crc-ccitt.c                       |  68 -----
 lib/crc-itu-t.c                       |  67 -----
 lib/crc-t10dif.c                      |  65 -----
 lib/crc/.gitignore                    |   5 +
 lib/crc/Kconfig                       | 111 +++++++++
 lib/crc/Makefile                      |  32 +++
 lib/crc/crc-ccitt.c                   |  65 +++++
 lib/crc/crc-itu-t.c                   |  67 +++++
 lib/crc/crc-t10dif.c                  |  65 +++++
 lib/crc/crc16.c                       |  64 +++++
 lib/crc/crc32.c                       |  59 +++++
 lib/crc/crc4.c                        |  44 ++++
 lib/crc/crc64.c                       |  58 +++++
 lib/crc/crc7.c                        |  72 ++++++
 lib/crc/crc8.c                        |  86 +++++++
 lib/crc/gen_crc32table.c              |  89 +++++++
 lib/crc/gen_crc64table.c              |  88 +++++++
 lib/crc/tests/Makefile                |   2 +
 lib/crc/tests/crc_kunit.c             | 452 ++++++++++++++++++++++++++++++++++
 lib/crc16.c                           |  64 -----
 lib/crc32.c                           |  59 -----
 lib/crc4.c                            |  44 ----
 lib/crc64.c                           |  58 -----
 lib/crc7.c                            |  72 ------
 lib/crc8.c                            |  86 -------
 lib/gen_crc32table.c                  |  89 -------
 lib/gen_crc64table.c                  |  95 -------
 lib/tests/Makefile                    |   1 -
 lib/tests/crc_kunit.c                 | 452 ----------------------------------
 35 files changed, 1369 insertions(+), 1371 deletions(-)
 delete mode 100644 lib/crc-ccitt.c
 delete mode 100644 lib/crc-itu-t.c
 delete mode 100644 lib/crc-t10dif.c
 create mode 100644 lib/crc/.gitignore
 create mode 100644 lib/crc/Kconfig
 create mode 100644 lib/crc/Makefile
 create mode 100644 lib/crc/crc-ccitt.c
 create mode 100644 lib/crc/crc-itu-t.c
 create mode 100644 lib/crc/crc-t10dif.c
 create mode 100644 lib/crc/crc16.c
 create mode 100644 lib/crc/crc32.c
 create mode 100644 lib/crc/crc4.c
 create mode 100644 lib/crc/crc64.c
 create mode 100644 lib/crc/crc7.c
 create mode 100644 lib/crc/crc8.c
 create mode 100644 lib/crc/gen_crc32table.c
 create mode 100644 lib/crc/gen_crc64table.c
 create mode 100644 lib/crc/tests/Makefile
 create mode 100644 lib/crc/tests/crc_kunit.c
 delete mode 100644 lib/crc16.c
 delete mode 100644 lib/crc32.c
 delete mode 100644 lib/crc4.c
 delete mode 100644 lib/crc64.c
 delete mode 100644 lib/crc7.c
 delete mode 100644 lib/crc8.c
 delete mode 100644 lib/gen_crc32table.c
 delete mode 100644 lib/gen_crc64table.c
 delete mode 100644 lib/tests/crc_kunit.c

(limited to 'include')

diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst
index ae92a2571388..c4642d9f13a9 100644
--- a/Documentation/core-api/kernel-api.rst
+++ b/Documentation/core-api/kernel-api.rst
@@ -136,24 +136,24 @@ Arithmetic Overflow Checking
 CRC Functions
 -------------
 
-.. kernel-doc:: lib/crc4.c
+.. kernel-doc:: lib/crc/crc4.c
    :export:
 
-.. kernel-doc:: lib/crc7.c
+.. kernel-doc:: lib/crc/crc7.c
    :export:
 
-.. kernel-doc:: lib/crc8.c
+.. kernel-doc:: lib/crc/crc8.c
    :export:
 
-.. kernel-doc:: lib/crc16.c
+.. kernel-doc:: lib/crc/crc16.c
    :export:
 
-.. kernel-doc:: lib/crc32.c
+.. kernel-doc:: lib/crc/crc32.c
 
-.. kernel-doc:: lib/crc-ccitt.c
+.. kernel-doc:: lib/crc/crc-ccitt.c
    :export:
 
-.. kernel-doc:: lib/crc-itu-t.c
+.. kernel-doc:: lib/crc/crc-itu-t.c
    :export:
 
 Base 2 log and power Functions
diff --git a/MAINTAINERS b/MAINTAINERS
index 4bac4ea21b64..b0ec17dfbe03 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6361,8 +6361,7 @@ T:	git https://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux.git crc-ne
 F:	Documentation/staging/crc*
 F:	arch/*/lib/crc*
 F:	include/linux/crc*
-F:	lib/crc*
-F:	lib/tests/crc_kunit.c
+F:	lib/crc/
 F:	scripts/gen-crc-consts.py
 
 CREATIVE SB0540
diff --git a/include/linux/crc64.h b/include/linux/crc64.h
index 41de30b907df..b6aa290a7931 100644
--- a/include/linux/crc64.h
+++ b/include/linux/crc64.h
@@ -1,7 +1,4 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/*
- * See lib/crc64.c for the related specification and polynomial arithmetic.
- */
 #ifndef _LINUX_CRC64_H
 #define _LINUX_CRC64_H
 
diff --git a/lib/Kconfig b/lib/Kconfig
index 37db228f70a9..c483951b624f 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -136,95 +136,9 @@ config TRACE_MMIO_ACCESS
 	  Create tracepoints for MMIO read/write operations. These trace events
 	  can be used for logging all MMIO read/write operations.
 
+source "lib/crc/Kconfig"
 source "lib/crypto/Kconfig"
 
-config CRC_CCITT
-	tristate
-	help
-	  The CRC-CCITT library functions.  Select this if your module uses any
-	  of the functions from <linux/crc-ccitt.h>.
-
-config CRC16
-	tristate
-	help
-	  The CRC16 library functions.  Select this if your module uses any of
-	  the functions from <linux/crc16.h>.
-
-config CRC_T10DIF
-	tristate
-	help
-	  The CRC-T10DIF library functions.  Select this if your module uses
-	  any of the functions from <linux/crc-t10dif.h>.
-
-config ARCH_HAS_CRC_T10DIF
-	bool
-
-config CRC_T10DIF_ARCH
-	tristate
-	default CRC_T10DIF if ARCH_HAS_CRC_T10DIF && CRC_OPTIMIZATIONS
-
-config CRC_ITU_T
-	tristate
-	help
-	  The CRC-ITU-T library functions.  Select this if your module uses
-	  any of the functions from <linux/crc-itu-t.h>.
-
-config CRC32
-	tristate
-	select BITREVERSE
-	help
-	  The CRC32 library functions.  Select this if your module uses any of
-	  the functions from <linux/crc32.h> or <linux/crc32c.h>.
-
-config ARCH_HAS_CRC32
-	bool
-
-config CRC32_ARCH
-	tristate
-	default CRC32 if ARCH_HAS_CRC32 && CRC_OPTIMIZATIONS
-
-config CRC64
-	tristate
-	help
-	  The CRC64 library functions.  Select this if your module uses any of
-	  the functions from <linux/crc64.h>.
-
-config ARCH_HAS_CRC64
-	bool
-
-config CRC64_ARCH
-	tristate
-	default CRC64 if ARCH_HAS_CRC64 && CRC_OPTIMIZATIONS
-
-config CRC4
-	tristate
-	help
-	  The CRC4 library functions.  Select this if your module uses any of
-	  the functions from <linux/crc4.h>.
-
-config CRC7
-	tristate
-	help
-	  The CRC7 library functions.  Select this if your module uses any of
-	  the functions from <linux/crc7.h>.
-
-config CRC8
-	tristate
-	help
-	  The CRC8 library functions.  Select this if your module uses any of
-	  the functions from <linux/crc8.h>.
-
-config CRC_OPTIMIZATIONS
-	bool "Enable optimized CRC implementations" if EXPERT
-	default y
-	help
-	  Disabling this option reduces code size slightly by disabling the
-	  architecture-optimized implementations of any CRC variants that are
-	  enabled.  CRC checksumming performance may get much slower.
-
-	  Keep this enabled unless you're really trying to minimize the size of
-	  the kernel.
-
 config XXHASH
 	tristate
 
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index ebe33181b6e6..3fda96761adb 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2901,27 +2901,6 @@ config HW_BREAKPOINT_KUNIT_TEST
 
 	  If unsure, say N.
 
-config CRC_KUNIT_TEST
-	tristate "KUnit tests for CRC functions" if !KUNIT_ALL_TESTS
-	depends on KUNIT
-	default KUNIT_ALL_TESTS
-	select CRC7
-	select CRC16
-	select CRC_T10DIF
-	select CRC32
-	select CRC64
-	help
-	  Unit tests for the CRC library functions.
-
-	  This is intended to help people writing architecture-specific
-	  optimized versions.  If unsure, say N.
-
-config CRC_BENCHMARK
-	bool "Benchmark for the CRC functions"
-	depends on CRC_KUNIT_TEST
-	help
-	  Include benchmarks in the KUnit test suite for the CRC functions.
-
 config SIPHASH_KUNIT_TEST
 	tristate "Perform selftest on siphash functions" if !KUNIT_ALL_TESTS
 	depends on KUNIT
diff --git a/lib/Makefile b/lib/Makefile
index c38582f187dd..14a5928bb57f 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -122,7 +122,7 @@ endif
 obj-$(CONFIG_DEBUG_INFO_REDUCED) += debug_info.o
 CFLAGS_debug_info.o += $(call cc-option, -femit-struct-debug-detailed=any)
 
-obj-y += math/ crypto/ tests/ vdso/
+obj-y += math/ crc/ crypto/ tests/ vdso/
 
 obj-$(CONFIG_GENERIC_IOMAP) += iomap.o
 obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o
@@ -148,15 +148,6 @@ obj-$(CONFIG_BITREVERSE) += bitrev.o
 obj-$(CONFIG_LINEAR_RANGES) += linear_ranges.o
 obj-$(CONFIG_PACKING)	+= packing.o
 obj-$(CONFIG_PACKING_KUNIT_TEST) += packing_test.o
-obj-$(CONFIG_CRC_CCITT)	+= crc-ccitt.o
-obj-$(CONFIG_CRC16)	+= crc16.o
-obj-$(CONFIG_CRC_T10DIF)+= crc-t10dif.o
-obj-$(CONFIG_CRC_ITU_T)	+= crc-itu-t.o
-obj-$(CONFIG_CRC32)	+= crc32.o
-obj-$(CONFIG_CRC64)     += crc64.o
-obj-$(CONFIG_CRC4)	+= crc4.o
-obj-$(CONFIG_CRC7)	+= crc7.o
-obj-$(CONFIG_CRC8)	+= crc8.o
 obj-$(CONFIG_XXHASH)	+= xxhash.o
 obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o
 
@@ -294,27 +285,6 @@ obj-$(CONFIG_ASN1_ENCODER) += asn1_encoder.o
 
 obj-$(CONFIG_FONT_SUPPORT) += fonts/
 
-hostprogs	:= gen_crc32table
-hostprogs	+= gen_crc64table
-clean-files	:= crc32table.h
-clean-files	+= crc64table.h
-
-$(obj)/crc32.o: $(obj)/crc32table.h
-
-quiet_cmd_crc32 = GEN     $@
-      cmd_crc32 = $< > $@
-
-$(obj)/crc32table.h: $(obj)/gen_crc32table
-	$(call cmd,crc32)
-
-$(obj)/crc64.o: $(obj)/crc64table.h
-
-quiet_cmd_crc64 = GEN     $@
-      cmd_crc64 = $< > $@
-
-$(obj)/crc64table.h: $(obj)/gen_crc64table
-	$(call cmd,crc64)
-
 #
 # Build a fast OID lookip registry from include/linux/oid_registry.h
 #
diff --git a/lib/crc-ccitt.c b/lib/crc-ccitt.c
deleted file mode 100644
index 9cddf35d3b66..000000000000
--- a/lib/crc-ccitt.c
+++ /dev/null
@@ -1,68 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *	linux/lib/crc-ccitt.c
- */
-
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/crc-ccitt.h>
-
-/*
- * This mysterious table is just the CRC of each possible byte. It can be
- * computed using the standard bit-at-a-time methods. The polynomial can
- * be seen in entry 128, 0x8408. This corresponds to x^0 + x^5 + x^12.
- * Add the implicit x^16, and you have the standard CRC-CCITT.
- */
-u16 const crc_ccitt_table[256] = {
-	0x0000, 0x1189, 0x2312, 0x329b, 0x4624, 0x57ad, 0x6536, 0x74bf,
-	0x8c48, 0x9dc1, 0xaf5a, 0xbed3, 0xca6c, 0xdbe5, 0xe97e, 0xf8f7,
-	0x1081, 0x0108, 0x3393, 0x221a, 0x56a5, 0x472c, 0x75b7, 0x643e,
-	0x9cc9, 0x8d40, 0xbfdb, 0xae52, 0xdaed, 0xcb64, 0xf9ff, 0xe876,
-	0x2102, 0x308b, 0x0210, 0x1399, 0x6726, 0x76af, 0x4434, 0x55bd,
-	0xad4a, 0xbcc3, 0x8e58, 0x9fd1, 0xeb6e, 0xfae7, 0xc87c, 0xd9f5,
-	0x3183, 0x200a, 0x1291, 0x0318, 0x77a7, 0x662e, 0x54b5, 0x453c,
-	0xbdcb, 0xac42, 0x9ed9, 0x8f50, 0xfbef, 0xea66, 0xd8fd, 0xc974,
-	0x4204, 0x538d, 0x6116, 0x709f, 0x0420, 0x15a9, 0x2732, 0x36bb,
-	0xce4c, 0xdfc5, 0xed5e, 0xfcd7, 0x8868, 0x99e1, 0xab7a, 0xbaf3,
-	0x5285, 0x430c, 0x7197, 0x601e, 0x14a1, 0x0528, 0x37b3, 0x263a,
-	0xdecd, 0xcf44, 0xfddf, 0xec56, 0x98e9, 0x8960, 0xbbfb, 0xaa72,
-	0x6306, 0x728f, 0x4014, 0x519d, 0x2522, 0x34ab, 0x0630, 0x17b9,
-	0xef4e, 0xfec7, 0xcc5c, 0xddd5, 0xa96a, 0xb8e3, 0x8a78, 0x9bf1,
-	0x7387, 0x620e, 0x5095, 0x411c, 0x35a3, 0x242a, 0x16b1, 0x0738,
-	0xffcf, 0xee46, 0xdcdd, 0xcd54, 0xb9eb, 0xa862, 0x9af9, 0x8b70,
-	0x8408, 0x9581, 0xa71a, 0xb693, 0xc22c, 0xd3a5, 0xe13e, 0xf0b7,
-	0x0840, 0x19c9, 0x2b52, 0x3adb, 0x4e64, 0x5fed, 0x6d76, 0x7cff,
-	0x9489, 0x8500, 0xb79b, 0xa612, 0xd2ad, 0xc324, 0xf1bf, 0xe036,
-	0x18c1, 0x0948, 0x3bd3, 0x2a5a, 0x5ee5, 0x4f6c, 0x7df7, 0x6c7e,
-	0xa50a, 0xb483, 0x8618, 0x9791, 0xe32e, 0xf2a7, 0xc03c, 0xd1b5,
-	0x2942, 0x38cb, 0x0a50, 0x1bd9, 0x6f66, 0x7eef, 0x4c74, 0x5dfd,
-	0xb58b, 0xa402, 0x9699, 0x8710, 0xf3af, 0xe226, 0xd0bd, 0xc134,
-	0x39c3, 0x284a, 0x1ad1, 0x0b58, 0x7fe7, 0x6e6e, 0x5cf5, 0x4d7c,
-	0xc60c, 0xd785, 0xe51e, 0xf497, 0x8028, 0x91a1, 0xa33a, 0xb2b3,
-	0x4a44, 0x5bcd, 0x6956, 0x78df, 0x0c60, 0x1de9, 0x2f72, 0x3efb,
-	0xd68d, 0xc704, 0xf59f, 0xe416, 0x90a9, 0x8120, 0xb3bb, 0xa232,
-	0x5ac5, 0x4b4c, 0x79d7, 0x685e, 0x1ce1, 0x0d68, 0x3ff3, 0x2e7a,
-	0xe70e, 0xf687, 0xc41c, 0xd595, 0xa12a, 0xb0a3, 0x8238, 0x93b1,
-	0x6b46, 0x7acf, 0x4854, 0x59dd, 0x2d62, 0x3ceb, 0x0e70, 0x1ff9,
-	0xf78f, 0xe606, 0xd49d, 0xc514, 0xb1ab, 0xa022, 0x92b9, 0x8330,
-	0x7bc7, 0x6a4e, 0x58d5, 0x495c, 0x3de3, 0x2c6a, 0x1ef1, 0x0f78
-};
-EXPORT_SYMBOL(crc_ccitt_table);
-
-/**
- *	crc_ccitt - recompute the CRC (CRC-CCITT variant) for the data
- *	buffer
- *	@crc: previous CRC value
- *	@buffer: data pointer
- *	@len: number of bytes in the buffer
- */
-u16 crc_ccitt(u16 crc, u8 const *buffer, size_t len)
-{
-	while (len--)
-		crc = crc_ccitt_byte(crc, *buffer++);
-	return crc;
-}
-EXPORT_SYMBOL(crc_ccitt);
-
-MODULE_DESCRIPTION("CRC-CCITT calculations");
-MODULE_LICENSE("GPL");
diff --git a/lib/crc-itu-t.c b/lib/crc-itu-t.c
deleted file mode 100644
index 1d26a1647da5..000000000000
--- a/lib/crc-itu-t.c
+++ /dev/null
@@ -1,67 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *      crc-itu-t.c
- */
-
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/crc-itu-t.h>
-
-/* CRC table for the CRC ITU-T V.41 0x1021 (x^16 + x^12 + x^5 + 1) */
-const u16 crc_itu_t_table[256] = {
-	0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50a5, 0x60c6, 0x70e7,
-	0x8108, 0x9129, 0xa14a, 0xb16b, 0xc18c, 0xd1ad, 0xe1ce, 0xf1ef,
-	0x1231, 0x0210, 0x3273, 0x2252, 0x52b5, 0x4294, 0x72f7, 0x62d6,
-	0x9339, 0x8318, 0xb37b, 0xa35a, 0xd3bd, 0xc39c, 0xf3ff, 0xe3de,
-	0x2462, 0x3443, 0x0420, 0x1401, 0x64e6, 0x74c7, 0x44a4, 0x5485,
-	0xa56a, 0xb54b, 0x8528, 0x9509, 0xe5ee, 0xf5cf, 0xc5ac, 0xd58d,
-	0x3653, 0x2672, 0x1611, 0x0630, 0x76d7, 0x66f6, 0x5695, 0x46b4,
-	0xb75b, 0xa77a, 0x9719, 0x8738, 0xf7df, 0xe7fe, 0xd79d, 0xc7bc,
-	0x48c4, 0x58e5, 0x6886, 0x78a7, 0x0840, 0x1861, 0x2802, 0x3823,
-	0xc9cc, 0xd9ed, 0xe98e, 0xf9af, 0x8948, 0x9969, 0xa90a, 0xb92b,
-	0x5af5, 0x4ad4, 0x7ab7, 0x6a96, 0x1a71, 0x0a50, 0x3a33, 0x2a12,
-	0xdbfd, 0xcbdc, 0xfbbf, 0xeb9e, 0x9b79, 0x8b58, 0xbb3b, 0xab1a,
-	0x6ca6, 0x7c87, 0x4ce4, 0x5cc5, 0x2c22, 0x3c03, 0x0c60, 0x1c41,
-	0xedae, 0xfd8f, 0xcdec, 0xddcd, 0xad2a, 0xbd0b, 0x8d68, 0x9d49,
-	0x7e97, 0x6eb6, 0x5ed5, 0x4ef4, 0x3e13, 0x2e32, 0x1e51, 0x0e70,
-	0xff9f, 0xefbe, 0xdfdd, 0xcffc, 0xbf1b, 0xaf3a, 0x9f59, 0x8f78,
-	0x9188, 0x81a9, 0xb1ca, 0xa1eb, 0xd10c, 0xc12d, 0xf14e, 0xe16f,
-	0x1080, 0x00a1, 0x30c2, 0x20e3, 0x5004, 0x4025, 0x7046, 0x6067,
-	0x83b9, 0x9398, 0xa3fb, 0xb3da, 0xc33d, 0xd31c, 0xe37f, 0xf35e,
-	0x02b1, 0x1290, 0x22f3, 0x32d2, 0x4235, 0x5214, 0x6277, 0x7256,
-	0xb5ea, 0xa5cb, 0x95a8, 0x8589, 0xf56e, 0xe54f, 0xd52c, 0xc50d,
-	0x34e2, 0x24c3, 0x14a0, 0x0481, 0x7466, 0x6447, 0x5424, 0x4405,
-	0xa7db, 0xb7fa, 0x8799, 0x97b8, 0xe75f, 0xf77e, 0xc71d, 0xd73c,
-	0x26d3, 0x36f2, 0x0691, 0x16b0, 0x6657, 0x7676, 0x4615, 0x5634,
-	0xd94c, 0xc96d, 0xf90e, 0xe92f, 0x99c8, 0x89e9, 0xb98a, 0xa9ab,
-	0x5844, 0x4865, 0x7806, 0x6827, 0x18c0, 0x08e1, 0x3882, 0x28a3,
-	0xcb7d, 0xdb5c, 0xeb3f, 0xfb1e, 0x8bf9, 0x9bd8, 0xabbb, 0xbb9a,
-	0x4a75, 0x5a54, 0x6a37, 0x7a16, 0x0af1, 0x1ad0, 0x2ab3, 0x3a92,
-	0xfd2e, 0xed0f, 0xdd6c, 0xcd4d, 0xbdaa, 0xad8b, 0x9de8, 0x8dc9,
-	0x7c26, 0x6c07, 0x5c64, 0x4c45, 0x3ca2, 0x2c83, 0x1ce0, 0x0cc1,
-	0xef1f, 0xff3e, 0xcf5d, 0xdf7c, 0xaf9b, 0xbfba, 0x8fd9, 0x9ff8,
-	0x6e17, 0x7e36, 0x4e55, 0x5e74, 0x2e93, 0x3eb2, 0x0ed1, 0x1ef0
-};
-
-EXPORT_SYMBOL(crc_itu_t_table);
-
-/**
- * crc_itu_t - Compute the CRC-ITU-T for the data buffer
- *
- * @crc:     previous CRC value
- * @buffer:  data pointer
- * @len:     number of bytes in the buffer
- *
- * Returns the updated CRC value
- */
-u16 crc_itu_t(u16 crc, const u8 *buffer, size_t len)
-{
-	while (len--)
-		crc = crc_itu_t_byte(crc, *buffer++);
-	return crc;
-}
-EXPORT_SYMBOL(crc_itu_t);
-
-MODULE_DESCRIPTION("CRC ITU-T V.41 calculations");
-MODULE_LICENSE("GPL");
-
diff --git a/lib/crc-t10dif.c b/lib/crc-t10dif.c
deleted file mode 100644
index 311c2ab829f1..000000000000
--- a/lib/crc-t10dif.c
+++ /dev/null
@@ -1,65 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * T10 Data Integrity Field CRC16 calculation
- *
- * Copyright (c) 2007 Oracle Corporation.  All rights reserved.
- * Written by Martin K. Petersen <martin.petersen@oracle.com>
- */
-
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/crc-t10dif.h>
-
-/*
- * Table generated using the following polynomial:
- * x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1
- * gt: 0x8bb7
- */
-static const u16 t10_dif_crc_table[256] = {
-	0x0000, 0x8BB7, 0x9CD9, 0x176E, 0xB205, 0x39B2, 0x2EDC, 0xA56B,
-	0xEFBD, 0x640A, 0x7364, 0xF8D3, 0x5DB8, 0xD60F, 0xC161, 0x4AD6,
-	0x54CD, 0xDF7A, 0xC814, 0x43A3, 0xE6C8, 0x6D7F, 0x7A11, 0xF1A6,
-	0xBB70, 0x30C7, 0x27A9, 0xAC1E, 0x0975, 0x82C2, 0x95AC, 0x1E1B,
-	0xA99A, 0x222D, 0x3543, 0xBEF4, 0x1B9F, 0x9028, 0x8746, 0x0CF1,
-	0x4627, 0xCD90, 0xDAFE, 0x5149, 0xF422, 0x7F95, 0x68FB, 0xE34C,
-	0xFD57, 0x76E0, 0x618E, 0xEA39, 0x4F52, 0xC4E5, 0xD38B, 0x583C,
-	0x12EA, 0x995D, 0x8E33, 0x0584, 0xA0EF, 0x2B58, 0x3C36, 0xB781,
-	0xD883, 0x5334, 0x445A, 0xCFED, 0x6A86, 0xE131, 0xF65F, 0x7DE8,
-	0x373E, 0xBC89, 0xABE7, 0x2050, 0x853B, 0x0E8C, 0x19E2, 0x9255,
-	0x8C4E, 0x07F9, 0x1097, 0x9B20, 0x3E4B, 0xB5FC, 0xA292, 0x2925,
-	0x63F3, 0xE844, 0xFF2A, 0x749D, 0xD1F6, 0x5A41, 0x4D2F, 0xC698,
-	0x7119, 0xFAAE, 0xEDC0, 0x6677, 0xC31C, 0x48AB, 0x5FC5, 0xD472,
-	0x9EA4, 0x1513, 0x027D, 0x89CA, 0x2CA1, 0xA716, 0xB078, 0x3BCF,
-	0x25D4, 0xAE63, 0xB90D, 0x32BA, 0x97D1, 0x1C66, 0x0B08, 0x80BF,
-	0xCA69, 0x41DE, 0x56B0, 0xDD07, 0x786C, 0xF3DB, 0xE4B5, 0x6F02,
-	0x3AB1, 0xB106, 0xA668, 0x2DDF, 0x88B4, 0x0303, 0x146D, 0x9FDA,
-	0xD50C, 0x5EBB, 0x49D5, 0xC262, 0x6709, 0xECBE, 0xFBD0, 0x7067,
-	0x6E7C, 0xE5CB, 0xF2A5, 0x7912, 0xDC79, 0x57CE, 0x40A0, 0xCB17,
-	0x81C1, 0x0A76, 0x1D18, 0x96AF, 0x33C4, 0xB873, 0xAF1D, 0x24AA,
-	0x932B, 0x189C, 0x0FF2, 0x8445, 0x212E, 0xAA99, 0xBDF7, 0x3640,
-	0x7C96, 0xF721, 0xE04F, 0x6BF8, 0xCE93, 0x4524, 0x524A, 0xD9FD,
-	0xC7E6, 0x4C51, 0x5B3F, 0xD088, 0x75E3, 0xFE54, 0xE93A, 0x628D,
-	0x285B, 0xA3EC, 0xB482, 0x3F35, 0x9A5E, 0x11E9, 0x0687, 0x8D30,
-	0xE232, 0x6985, 0x7EEB, 0xF55C, 0x5037, 0xDB80, 0xCCEE, 0x4759,
-	0x0D8F, 0x8638, 0x9156, 0x1AE1, 0xBF8A, 0x343D, 0x2353, 0xA8E4,
-	0xB6FF, 0x3D48, 0x2A26, 0xA191, 0x04FA, 0x8F4D, 0x9823, 0x1394,
-	0x5942, 0xD2F5, 0xC59B, 0x4E2C, 0xEB47, 0x60F0, 0x779E, 0xFC29,
-	0x4BA8, 0xC01F, 0xD771, 0x5CC6, 0xF9AD, 0x721A, 0x6574, 0xEEC3,
-	0xA415, 0x2FA2, 0x38CC, 0xB37B, 0x1610, 0x9DA7, 0x8AC9, 0x017E,
-	0x1F65, 0x94D2, 0x83BC, 0x080B, 0xAD60, 0x26D7, 0x31B9, 0xBA0E,
-	0xF0D8, 0x7B6F, 0x6C01, 0xE7B6, 0x42DD, 0xC96A, 0xDE04, 0x55B3
-};
-
-u16 crc_t10dif_generic(u16 crc, const u8 *p, size_t len)
-{
-	size_t i;
-
-	for (i = 0; i < len; i++)
-		crc = (crc << 8) ^ t10_dif_crc_table[(crc >> 8) ^ p[i]];
-
-	return crc;
-}
-EXPORT_SYMBOL(crc_t10dif_generic);
-
-MODULE_DESCRIPTION("T10 DIF CRC calculation");
-MODULE_LICENSE("GPL");
diff --git a/lib/crc/.gitignore b/lib/crc/.gitignore
new file mode 100644
index 000000000000..a9e48103c9fb
--- /dev/null
+++ b/lib/crc/.gitignore
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0-only
+/crc32table.h
+/crc64table.h
+/gen_crc32table
+/gen_crc64table
diff --git a/lib/crc/Kconfig b/lib/crc/Kconfig
new file mode 100644
index 000000000000..e0e7168b74c7
--- /dev/null
+++ b/lib/crc/Kconfig
@@ -0,0 +1,111 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+# Kconfig for the kernel's cyclic redundancy check (CRC) library code
+
+config CRC4
+	tristate
+	help
+	  The CRC4 library functions.  Select this if your module uses any of
+	  the functions from <linux/crc4.h>.
+
+config CRC7
+	tristate
+	help
+	  The CRC7 library functions.  Select this if your module uses any of
+	  the functions from <linux/crc7.h>.
+
+config CRC8
+	tristate
+	help
+	  The CRC8 library functions.  Select this if your module uses any of
+	  the functions from <linux/crc8.h>.
+
+config CRC16
+	tristate
+	help
+	  The CRC16 library functions.  Select this if your module uses any of
+	  the functions from <linux/crc16.h>.
+
+config CRC_CCITT
+	tristate
+	help
+	  The CRC-CCITT library functions.  Select this if your module uses any
+	  of the functions from <linux/crc-ccitt.h>.
+
+config CRC_ITU_T
+	tristate
+	help
+	  The CRC-ITU-T library functions.  Select this if your module uses
+	  any of the functions from <linux/crc-itu-t.h>.
+
+config CRC_T10DIF
+	tristate
+	help
+	  The CRC-T10DIF library functions.  Select this if your module uses
+	  any of the functions from <linux/crc-t10dif.h>.
+
+config ARCH_HAS_CRC_T10DIF
+	bool
+
+config CRC_T10DIF_ARCH
+	tristate
+	default CRC_T10DIF if ARCH_HAS_CRC_T10DIF && CRC_OPTIMIZATIONS
+
+config CRC32
+	tristate
+	select BITREVERSE
+	help
+	  The CRC32 library functions.  Select this if your module uses any of
+	  the functions from <linux/crc32.h> or <linux/crc32c.h>.
+
+config ARCH_HAS_CRC32
+	bool
+
+config CRC32_ARCH
+	tristate
+	default CRC32 if ARCH_HAS_CRC32 && CRC_OPTIMIZATIONS
+
+config CRC64
+	tristate
+	help
+	  The CRC64 library functions.  Select this if your module uses any of
+	  the functions from <linux/crc64.h>.
+
+config ARCH_HAS_CRC64
+	bool
+
+config CRC64_ARCH
+	tristate
+	default CRC64 if ARCH_HAS_CRC64 && CRC_OPTIMIZATIONS
+
+config CRC_OPTIMIZATIONS
+	bool "Enable optimized CRC implementations" if EXPERT
+	default y
+	help
+	  Disabling this option reduces code size slightly by disabling the
+	  architecture-optimized implementations of any CRC variants that are
+	  enabled.  CRC checksumming performance may get much slower.
+
+	  Keep this enabled unless you're really trying to minimize the size of
+	  the kernel.
+
+config CRC_KUNIT_TEST
+	tristate "KUnit tests for CRC functions" if !KUNIT_ALL_TESTS
+	depends on KUNIT
+	default KUNIT_ALL_TESTS
+	select CRC7
+	select CRC16
+	select CRC_T10DIF
+	select CRC32
+	select CRC64
+	help
+	  Unit tests for the CRC library functions.
+
+	  This is intended to help people writing architecture-specific
+	  optimized versions.  If unsure, say N.
+
+config CRC_BENCHMARK
+	bool "Benchmark for the CRC functions"
+	depends on CRC_KUNIT_TEST
+	help
+	  Include benchmarks in the KUnit test suite for the CRC functions.
diff --git a/lib/crc/Makefile b/lib/crc/Makefile
new file mode 100644
index 000000000000..ff4c30dda452
--- /dev/null
+++ b/lib/crc/Makefile
@@ -0,0 +1,32 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+# Makefile for the kernel's cyclic redundancy check (CRC) library code
+
+obj-$(CONFIG_CRC4) += crc4.o
+obj-$(CONFIG_CRC7) += crc7.o
+obj-$(CONFIG_CRC8) += crc8.o
+obj-$(CONFIG_CRC16) += crc16.o
+obj-$(CONFIG_CRC_CCITT) += crc-ccitt.o
+obj-$(CONFIG_CRC_ITU_T) += crc-itu-t.o
+obj-$(CONFIG_CRC_T10DIF) += crc-t10dif.o
+obj-$(CONFIG_CRC32) += crc32.o
+obj-$(CONFIG_CRC64) += crc64.o
+obj-y += tests/
+
+hostprogs := gen_crc32table gen_crc64table
+clean-files := crc32table.h crc64table.h
+
+$(obj)/crc32.o: $(obj)/crc32table.h
+$(obj)/crc64.o: $(obj)/crc64table.h
+
+quiet_cmd_crc32 = GEN     $@
+      cmd_crc32 = $< > $@
+
+quiet_cmd_crc64 = GEN     $@
+      cmd_crc64 = $< > $@
+
+$(obj)/crc32table.h: $(obj)/gen_crc32table
+	$(call cmd,crc32)
+
+$(obj)/crc64table.h: $(obj)/gen_crc64table
+	$(call cmd,crc64)
diff --git a/lib/crc/crc-ccitt.c b/lib/crc/crc-ccitt.c
new file mode 100644
index 000000000000..8d2bc419230b
--- /dev/null
+++ b/lib/crc/crc-ccitt.c
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/crc-ccitt.h>
+
+/*
+ * This mysterious table is just the CRC of each possible byte. It can be
+ * computed using the standard bit-at-a-time methods. The polynomial can
+ * be seen in entry 128, 0x8408. This corresponds to x^0 + x^5 + x^12.
+ * Add the implicit x^16, and you have the standard CRC-CCITT.
+ */
+u16 const crc_ccitt_table[256] = {
+	0x0000, 0x1189, 0x2312, 0x329b, 0x4624, 0x57ad, 0x6536, 0x74bf,
+	0x8c48, 0x9dc1, 0xaf5a, 0xbed3, 0xca6c, 0xdbe5, 0xe97e, 0xf8f7,
+	0x1081, 0x0108, 0x3393, 0x221a, 0x56a5, 0x472c, 0x75b7, 0x643e,
+	0x9cc9, 0x8d40, 0xbfdb, 0xae52, 0xdaed, 0xcb64, 0xf9ff, 0xe876,
+	0x2102, 0x308b, 0x0210, 0x1399, 0x6726, 0x76af, 0x4434, 0x55bd,
+	0xad4a, 0xbcc3, 0x8e58, 0x9fd1, 0xeb6e, 0xfae7, 0xc87c, 0xd9f5,
+	0x3183, 0x200a, 0x1291, 0x0318, 0x77a7, 0x662e, 0x54b5, 0x453c,
+	0xbdcb, 0xac42, 0x9ed9, 0x8f50, 0xfbef, 0xea66, 0xd8fd, 0xc974,
+	0x4204, 0x538d, 0x6116, 0x709f, 0x0420, 0x15a9, 0x2732, 0x36bb,
+	0xce4c, 0xdfc5, 0xed5e, 0xfcd7, 0x8868, 0x99e1, 0xab7a, 0xbaf3,
+	0x5285, 0x430c, 0x7197, 0x601e, 0x14a1, 0x0528, 0x37b3, 0x263a,
+	0xdecd, 0xcf44, 0xfddf, 0xec56, 0x98e9, 0x8960, 0xbbfb, 0xaa72,
+	0x6306, 0x728f, 0x4014, 0x519d, 0x2522, 0x34ab, 0x0630, 0x17b9,
+	0xef4e, 0xfec7, 0xcc5c, 0xddd5, 0xa96a, 0xb8e3, 0x8a78, 0x9bf1,
+	0x7387, 0x620e, 0x5095, 0x411c, 0x35a3, 0x242a, 0x16b1, 0x0738,
+	0xffcf, 0xee46, 0xdcdd, 0xcd54, 0xb9eb, 0xa862, 0x9af9, 0x8b70,
+	0x8408, 0x9581, 0xa71a, 0xb693, 0xc22c, 0xd3a5, 0xe13e, 0xf0b7,
+	0x0840, 0x19c9, 0x2b52, 0x3adb, 0x4e64, 0x5fed, 0x6d76, 0x7cff,
+	0x9489, 0x8500, 0xb79b, 0xa612, 0xd2ad, 0xc324, 0xf1bf, 0xe036,
+	0x18c1, 0x0948, 0x3bd3, 0x2a5a, 0x5ee5, 0x4f6c, 0x7df7, 0x6c7e,
+	0xa50a, 0xb483, 0x8618, 0x9791, 0xe32e, 0xf2a7, 0xc03c, 0xd1b5,
+	0x2942, 0x38cb, 0x0a50, 0x1bd9, 0x6f66, 0x7eef, 0x4c74, 0x5dfd,
+	0xb58b, 0xa402, 0x9699, 0x8710, 0xf3af, 0xe226, 0xd0bd, 0xc134,
+	0x39c3, 0x284a, 0x1ad1, 0x0b58, 0x7fe7, 0x6e6e, 0x5cf5, 0x4d7c,
+	0xc60c, 0xd785, 0xe51e, 0xf497, 0x8028, 0x91a1, 0xa33a, 0xb2b3,
+	0x4a44, 0x5bcd, 0x6956, 0x78df, 0x0c60, 0x1de9, 0x2f72, 0x3efb,
+	0xd68d, 0xc704, 0xf59f, 0xe416, 0x90a9, 0x8120, 0xb3bb, 0xa232,
+	0x5ac5, 0x4b4c, 0x79d7, 0x685e, 0x1ce1, 0x0d68, 0x3ff3, 0x2e7a,
+	0xe70e, 0xf687, 0xc41c, 0xd595, 0xa12a, 0xb0a3, 0x8238, 0x93b1,
+	0x6b46, 0x7acf, 0x4854, 0x59dd, 0x2d62, 0x3ceb, 0x0e70, 0x1ff9,
+	0xf78f, 0xe606, 0xd49d, 0xc514, 0xb1ab, 0xa022, 0x92b9, 0x8330,
+	0x7bc7, 0x6a4e, 0x58d5, 0x495c, 0x3de3, 0x2c6a, 0x1ef1, 0x0f78
+};
+EXPORT_SYMBOL(crc_ccitt_table);
+
+/**
+ * crc_ccitt - update a CRC-CCITT value with the bytes of a buffer
+ * @crc: previous CRC value
+ * @buffer: data pointer
+ * @len: number of bytes in the buffer
+ * Return: @crc after every byte was passed through crc_ccitt_byte().
+ */
+u16 crc_ccitt(u16 crc, u8 const *buffer, size_t len)
+{
+	for (; len; len--, buffer++)
+		crc = crc_ccitt_byte(crc, *buffer);
+	return crc;
+}
+EXPORT_SYMBOL(crc_ccitt);
+
+MODULE_DESCRIPTION("CRC-CCITT calculations");
+MODULE_LICENSE("GPL");
diff --git a/lib/crc/crc-itu-t.c b/lib/crc/crc-itu-t.c
new file mode 100644
index 000000000000..1d26a1647da5
--- /dev/null
+++ b/lib/crc/crc-itu-t.c
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *      crc-itu-t.c
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/crc-itu-t.h>
+
+/* CRC table for the CRC ITU-T V.41 0x1021 (x^16 + x^12 + x^5 + 1) */
+const u16 crc_itu_t_table[256] = {
+	0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50a5, 0x60c6, 0x70e7,
+	0x8108, 0x9129, 0xa14a, 0xb16b, 0xc18c, 0xd1ad, 0xe1ce, 0xf1ef,
+	0x1231, 0x0210, 0x3273, 0x2252, 0x52b5, 0x4294, 0x72f7, 0x62d6,
+	0x9339, 0x8318, 0xb37b, 0xa35a, 0xd3bd, 0xc39c, 0xf3ff, 0xe3de,
+	0x2462, 0x3443, 0x0420, 0x1401, 0x64e6, 0x74c7, 0x44a4, 0x5485,
+	0xa56a, 0xb54b, 0x8528, 0x9509, 0xe5ee, 0xf5cf, 0xc5ac, 0xd58d,
+	0x3653, 0x2672, 0x1611, 0x0630, 0x76d7, 0x66f6, 0x5695, 0x46b4,
+	0xb75b, 0xa77a, 0x9719, 0x8738, 0xf7df, 0xe7fe, 0xd79d, 0xc7bc,
+	0x48c4, 0x58e5, 0x6886, 0x78a7, 0x0840, 0x1861, 0x2802, 0x3823,
+	0xc9cc, 0xd9ed, 0xe98e, 0xf9af, 0x8948, 0x9969, 0xa90a, 0xb92b,
+	0x5af5, 0x4ad4, 0x7ab7, 0x6a96, 0x1a71, 0x0a50, 0x3a33, 0x2a12,
+	0xdbfd, 0xcbdc, 0xfbbf, 0xeb9e, 0x9b79, 0x8b58, 0xbb3b, 0xab1a,
+	0x6ca6, 0x7c87, 0x4ce4, 0x5cc5, 0x2c22, 0x3c03, 0x0c60, 0x1c41,
+	0xedae, 0xfd8f, 0xcdec, 0xddcd, 0xad2a, 0xbd0b, 0x8d68, 0x9d49,
+	0x7e97, 0x6eb6, 0x5ed5, 0x4ef4, 0x3e13, 0x2e32, 0x1e51, 0x0e70,
+	0xff9f, 0xefbe, 0xdfdd, 0xcffc, 0xbf1b, 0xaf3a, 0x9f59, 0x8f78,
+	0x9188, 0x81a9, 0xb1ca, 0xa1eb, 0xd10c, 0xc12d, 0xf14e, 0xe16f,
+	0x1080, 0x00a1, 0x30c2, 0x20e3, 0x5004, 0x4025, 0x7046, 0x6067,
+	0x83b9, 0x9398, 0xa3fb, 0xb3da, 0xc33d, 0xd31c, 0xe37f, 0xf35e,
+	0x02b1, 0x1290, 0x22f3, 0x32d2, 0x4235, 0x5214, 0x6277, 0x7256,
+	0xb5ea, 0xa5cb, 0x95a8, 0x8589, 0xf56e, 0xe54f, 0xd52c, 0xc50d,
+	0x34e2, 0x24c3, 0x14a0, 0x0481, 0x7466, 0x6447, 0x5424, 0x4405,
+	0xa7db, 0xb7fa, 0x8799, 0x97b8, 0xe75f, 0xf77e, 0xc71d, 0xd73c,
+	0x26d3, 0x36f2, 0x0691, 0x16b0, 0x6657, 0x7676, 0x4615, 0x5634,
+	0xd94c, 0xc96d, 0xf90e, 0xe92f, 0x99c8, 0x89e9, 0xb98a, 0xa9ab,
+	0x5844, 0x4865, 0x7806, 0x6827, 0x18c0, 0x08e1, 0x3882, 0x28a3,
+	0xcb7d, 0xdb5c, 0xeb3f, 0xfb1e, 0x8bf9, 0x9bd8, 0xabbb, 0xbb9a,
+	0x4a75, 0x5a54, 0x6a37, 0x7a16, 0x0af1, 0x1ad0, 0x2ab3, 0x3a92,
+	0xfd2e, 0xed0f, 0xdd6c, 0xcd4d, 0xbdaa, 0xad8b, 0x9de8, 0x8dc9,
+	0x7c26, 0x6c07, 0x5c64, 0x4c45, 0x3ca2, 0x2c83, 0x1ce0, 0x0cc1,
+	0xef1f, 0xff3e, 0xcf5d, 0xdf7c, 0xaf9b, 0xbfba, 0x8fd9, 0x9ff8,
+	0x6e17, 0x7e36, 0x4e55, 0x5e74, 0x2e93, 0x3eb2, 0x0ed1, 0x1ef0
+};
+
+EXPORT_SYMBOL(crc_itu_t_table);
+
+/**
+ * crc_itu_t - update a CRC-ITU-T value with the bytes of a buffer
+ * @crc:     previous CRC value
+ * @buffer:  data pointer
+ * @len:     number of bytes in the buffer
+ *
+ * Folds each byte in with crc_itu_t_byte(), first byte first.
+ * Return: the updated CRC value
+ */
+u16 crc_itu_t(u16 crc, const u8 *buffer, size_t len)
+{
+	for (; len; len--, buffer++)
+		crc = crc_itu_t_byte(crc, *buffer);
+	return crc;
+}
+EXPORT_SYMBOL(crc_itu_t);
+
+MODULE_DESCRIPTION("CRC ITU-T V.41 calculations");
+MODULE_LICENSE("GPL");
+
diff --git a/lib/crc/crc-t10dif.c b/lib/crc/crc-t10dif.c
new file mode 100644
index 000000000000..311c2ab829f1
--- /dev/null
+++ b/lib/crc/crc-t10dif.c
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * T10 Data Integrity Field CRC16 calculation
+ *
+ * Copyright (c) 2007 Oracle Corporation.  All rights reserved.
+ * Written by Martin K. Petersen <martin.petersen@oracle.com>
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/crc-t10dif.h>
+
+/*
+ * Table generated using the following polynomial:
+ * x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1
+ * gt: 0x8bb7
+ */
+static const u16 t10_dif_crc_table[256] = {
+	0x0000, 0x8BB7, 0x9CD9, 0x176E, 0xB205, 0x39B2, 0x2EDC, 0xA56B,
+	0xEFBD, 0x640A, 0x7364, 0xF8D3, 0x5DB8, 0xD60F, 0xC161, 0x4AD6,
+	0x54CD, 0xDF7A, 0xC814, 0x43A3, 0xE6C8, 0x6D7F, 0x7A11, 0xF1A6,
+	0xBB70, 0x30C7, 0x27A9, 0xAC1E, 0x0975, 0x82C2, 0x95AC, 0x1E1B,
+	0xA99A, 0x222D, 0x3543, 0xBEF4, 0x1B9F, 0x9028, 0x8746, 0x0CF1,
+	0x4627, 0xCD90, 0xDAFE, 0x5149, 0xF422, 0x7F95, 0x68FB, 0xE34C,
+	0xFD57, 0x76E0, 0x618E, 0xEA39, 0x4F52, 0xC4E5, 0xD38B, 0x583C,
+	0x12EA, 0x995D, 0x8E33, 0x0584, 0xA0EF, 0x2B58, 0x3C36, 0xB781,
+	0xD883, 0x5334, 0x445A, 0xCFED, 0x6A86, 0xE131, 0xF65F, 0x7DE8,
+	0x373E, 0xBC89, 0xABE7, 0x2050, 0x853B, 0x0E8C, 0x19E2, 0x9255,
+	0x8C4E, 0x07F9, 0x1097, 0x9B20, 0x3E4B, 0xB5FC, 0xA292, 0x2925,
+	0x63F3, 0xE844, 0xFF2A, 0x749D, 0xD1F6, 0x5A41, 0x4D2F, 0xC698,
+	0x7119, 0xFAAE, 0xEDC0, 0x6677, 0xC31C, 0x48AB, 0x5FC5, 0xD472,
+	0x9EA4, 0x1513, 0x027D, 0x89CA, 0x2CA1, 0xA716, 0xB078, 0x3BCF,
+	0x25D4, 0xAE63, 0xB90D, 0x32BA, 0x97D1, 0x1C66, 0x0B08, 0x80BF,
+	0xCA69, 0x41DE, 0x56B0, 0xDD07, 0x786C, 0xF3DB, 0xE4B5, 0x6F02,
+	0x3AB1, 0xB106, 0xA668, 0x2DDF, 0x88B4, 0x0303, 0x146D, 0x9FDA,
+	0xD50C, 0x5EBB, 0x49D5, 0xC262, 0x6709, 0xECBE, 0xFBD0, 0x7067,
+	0x6E7C, 0xE5CB, 0xF2A5, 0x7912, 0xDC79, 0x57CE, 0x40A0, 0xCB17,
+	0x81C1, 0x0A76, 0x1D18, 0x96AF, 0x33C4, 0xB873, 0xAF1D, 0x24AA,
+	0x932B, 0x189C, 0x0FF2, 0x8445, 0x212E, 0xAA99, 0xBDF7, 0x3640,
+	0x7C96, 0xF721, 0xE04F, 0x6BF8, 0xCE93, 0x4524, 0x524A, 0xD9FD,
+	0xC7E6, 0x4C51, 0x5B3F, 0xD088, 0x75E3, 0xFE54, 0xE93A, 0x628D,
+	0x285B, 0xA3EC, 0xB482, 0x3F35, 0x9A5E, 0x11E9, 0x0687, 0x8D30,
+	0xE232, 0x6985, 0x7EEB, 0xF55C, 0x5037, 0xDB80, 0xCCEE, 0x4759,
+	0x0D8F, 0x8638, 0x9156, 0x1AE1, 0xBF8A, 0x343D, 0x2353, 0xA8E4,
+	0xB6FF, 0x3D48, 0x2A26, 0xA191, 0x04FA, 0x8F4D, 0x9823, 0x1394,
+	0x5942, 0xD2F5, 0xC59B, 0x4E2C, 0xEB47, 0x60F0, 0x779E, 0xFC29,
+	0x4BA8, 0xC01F, 0xD771, 0x5CC6, 0xF9AD, 0x721A, 0x6574, 0xEEC3,
+	0xA415, 0x2FA2, 0x38CC, 0xB37B, 0x1610, 0x9DA7, 0x8AC9, 0x017E,
+	0x1F65, 0x94D2, 0x83BC, 0x080B, 0xAD60, 0x26D7, 0x31B9, 0xBA0E,
+	0xF0D8, 0x7B6F, 0x6C01, 0xE7B6, 0x42DD, 0xC96A, 0xDE04, 0x55B3
+};
+
+u16 crc_t10dif_generic(u16 crc, const u8 *p, size_t len)
+{
+	while (len--) {
+		u8 idx = (crc >> 8) ^ *p++;
+
+		crc = (crc << 8) ^ t10_dif_crc_table[idx];
+	}
+	return crc;
+}
+EXPORT_SYMBOL(crc_t10dif_generic);
+
+MODULE_DESCRIPTION("T10 DIF CRC calculation");
+MODULE_LICENSE("GPL");
diff --git a/lib/crc/crc16.c b/lib/crc/crc16.c
new file mode 100644
index 000000000000..9c71eda9bf4b
--- /dev/null
+++ b/lib/crc/crc16.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *      crc16.c
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/crc16.h>
+
+/* CRC table for the CRC-16. The poly is 0x8005 (x^16 + x^15 + x^2 + 1) */
+static const u16 crc16_table[256] = {
+	0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
+	0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
+	0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
+	0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
+	0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
+	0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
+	0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
+	0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
+	0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
+	0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
+	0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
+	0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
+	0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
+	0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
+	0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
+	0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
+	0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
+	0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
+	0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
+	0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
+	0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
+	0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
+	0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
+	0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
+	0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
+	0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
+	0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
+	0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
+	0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
+	0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
+	0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
+	0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
+};
+
+/**
+ * crc16 - update a CRC-16 value with the bytes of a buffer
+ * @crc:	previous CRC value
+ * @p:		data pointer
+ * @len:	number of bytes in the buffer
+ *
+ * Return: the CRC after all @len bytes have been folded in.
+ */
+u16 crc16(u16 crc, const u8 *p, size_t len)
+{
+	for (; len; len--, p++)
+		crc = (crc >> 8) ^ crc16_table[(crc & 0xff) ^ *p];
+	return crc;
+}
+EXPORT_SYMBOL(crc16);
+
+MODULE_DESCRIPTION("CRC16 calculations");
+MODULE_LICENSE("GPL");
+
diff --git a/lib/crc/crc32.c b/lib/crc/crc32.c
new file mode 100644
index 000000000000..6811b37df2aa
--- /dev/null
+++ b/lib/crc/crc32.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Aug 8, 2011 Bob Pearson with help from Joakim Tjernlund and George Spelvin
+ * cleaned up code to current version of sparse and added the slicing-by-8
+ * algorithm to the closely similar existing slicing-by-4 algorithm.
+ *
+ * Oct 15, 2000 Matt Domsch <Matt_Domsch@dell.com>
+ * Nicer crc32 functions/docs submitted by linux@horizon.com.  Thanks!
+ * Code was from the public domain, copyright abandoned.  Code was
+ * subsequently included in the kernel, thus was re-licensed under the
+ * GNU GPL v2.
+ *
+ * Oct 12, 2000 Matt Domsch <Matt_Domsch@dell.com>
+ * Same crc32 function was used in 5 other places in the kernel.
+ * I made one version, and deleted the others.
+ * There are various incantations of crc32().  Some use a seed of 0 or ~0.
+ * Some xor at the end with ~0.  The generic crc32() function takes
+ * seed as an argument, and doesn't xor at the end.  Then individual
+ * users can do whatever they need.
+ *   drivers/net/smc9194.c uses seed ~0, doesn't xor with ~0.
+ *   fs/jffs2 uses seed 0, doesn't xor with ~0.
+ *   fs/partitions/efi.c uses seed ~0, xor's with ~0.
+ */
+
+/* see: Documentation/staging/crc32.rst for a description of algorithms */
+
+#include <linux/crc32.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+#include "crc32table.h"
+
+MODULE_AUTHOR("Matt Domsch <Matt_Domsch@dell.com>");
+MODULE_DESCRIPTION("Various CRC32 calculations");
+MODULE_LICENSE("GPL");
+
+u32 crc32_le_base(u32 crc, const u8 *p, size_t len)
+{
+	for (; len; len--, p++)	/* low byte of @crc indexes the table */
+		crc = (crc >> 8) ^ crc32table_le[(crc & 255) ^ *p];
+	return crc;
+}
+EXPORT_SYMBOL(crc32_le_base);
+
+u32 crc32c_base(u32 crc, const u8 *p, size_t len)
+{
+	for (; len; len--, p++)	/* low byte of @crc indexes the table */
+		crc = (crc >> 8) ^ crc32ctable_le[(crc & 255) ^ *p];
+	return crc;
+}
+EXPORT_SYMBOL(crc32c_base);
+
+u32 crc32_be_base(u32 crc, const u8 *p, size_t len)
+{
+	for (; len; len--, p++)	/* top byte of @crc indexes the table */
+		crc = (crc << 8) ^ crc32table_be[(crc >> 24) ^ *p];
+	return crc;
+}
+EXPORT_SYMBOL(crc32_be_base);
diff --git a/lib/crc/crc4.c b/lib/crc/crc4.c
new file mode 100644
index 000000000000..e7e1779c67d9
--- /dev/null
+++ b/lib/crc/crc4.c
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * crc4.c - simple crc-4 calculations.
+ */
+
+#include <linux/crc4.h>
+#include <linux/module.h>
+
+static const uint8_t crc4_tab[] = {
+	0x0, 0x7, 0xe, 0x9, 0xb, 0xc, 0x5, 0x2,
+	0x1, 0x6, 0xf, 0x8, 0xa, 0xd, 0x4, 0x3,
+};
+
+/**
+ * crc4 - calculate the 4-bit crc of a value.
+ * @c:    starting crc4
+ * @x:    value to checksum
+ * @bits: number of bits in @x to checksum, 0 to 64 inclusive
+ *
+ * Returns the crc4 value of @x, using polynomial 0b10111.
+ *
+ * The @x value is treated as left-aligned, and bits above @bits are ignored
+ * in the crc calculations.
+ */
+uint8_t crc4(uint8_t c, uint64_t x, int bits)
+{
+	int i;
+
+	/* mask off anything above the top bit; a 64-bit shift would be UB */
+	x &= bits < 64 ? (1ull << bits) - 1 : ~0ull;
+
+	/* Align to 4-bits */
+	bits = (bits + 3) & ~0x3;
+
+	/* Calculate crc4 over four-bit nibbles, starting at the MSbit */
+	for (i = bits - 4; i >= 0; i -= 4)
+		c = crc4_tab[c ^ ((x >> i) & 0xf)];
+
+	return c;
+}
+EXPORT_SYMBOL_GPL(crc4);
+
+MODULE_DESCRIPTION("CRC4 calculations");
+MODULE_LICENSE("GPL");
diff --git a/lib/crc/crc64.c b/lib/crc/crc64.c
new file mode 100644
index 000000000000..5b1b17057f0a
--- /dev/null
+++ b/lib/crc/crc64.c
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Normal 64-bit CRC calculation.
+ *
+ * This is a basic crc64 implementation following the ECMA-182 specification,
+ * which can be found at
+ * https://www.ecma-international.org/publications/standards/Ecma-182.htm
+ *
+ * Dr. Ross N. Williams has written a great document introducing the idea of
+ * CRC algorithms; the CRC64 code here is also inspired by the table-driven
+ * algorithm and detailed example from that paper, which can be found
+ * at
+ * http://www.ross.net/crc/download/crc_v3.txt
+ *
+ * crc64table[256] is the lookup table of a table-driven 64-bit CRC
+ * calculation, which is generated by gen_crc64table.c in kernel build
+ * time. The polynomial of crc64 arithmetic is from ECMA-182 specification
+ * as well, which is defined as,
+ *
+ * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
+ * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
+ * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
+ * x^7 + x^4 + x + 1
+ *
+ * crc64nvmetable[256] uses the CRC64 polynomial from the NVME NVM Command Set
+ * Specification and uses least-significant-bit first bit order:
+ *
+ * x^64 + x^63 + x^61 + x^59 + x^58 + x^56 + x^55 + x^52 + x^49 + x^48 + x^47 +
+ * x^46 + x^44 + x^41 + x^37 + x^36 + x^34 + x^32 + x^31 + x^28 + x^26 + x^23 +
+ * x^22 + x^19 + x^16 + x^13 + x^12 + x^10 + x^9 + x^6 + x^4 + x^3 + 1
+ *
+ * Copyright 2018 SUSE Linux.
+ *   Author: Coly Li <colyli@suse.de>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/crc64.h>
+#include "crc64table.h"
+
+MODULE_DESCRIPTION("CRC64 calculations");
+MODULE_LICENSE("GPL v2");
+
+u64 crc64_be_generic(u64 crc, const u8 *p, size_t len)
+{
+	while (len--)
+		crc = (crc << 8) ^ crc64table[(crc >> 56) ^ *p++];
+	return crc;
+}
+EXPORT_SYMBOL_GPL(crc64_be_generic);
+
+u64 crc64_nvme_generic(u64 crc, const u8 *p, size_t len)
+{
+	while (len--)
+		crc = (crc >> 8) ^ crc64nvmetable[(crc & 0xff) ^ *p++];
+	return crc;
+}
+EXPORT_SYMBOL_GPL(crc64_nvme_generic);
diff --git a/lib/crc/crc7.c b/lib/crc/crc7.c
new file mode 100644
index 000000000000..8dd991cc6114
--- /dev/null
+++ b/lib/crc/crc7.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *      crc7.c
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/crc7.h>
+
+/*
+ * Table for CRC-7 (polynomial x^7 + x^3 + 1).
+ * This is a big-endian CRC (msbit is highest power of x),
+ * aligned so the msbit of the byte is the x^6 coefficient
+ * and the lsbit is not used.
+ */
+static const u8 crc7_be_syndrome_table[256] = {
+	0x00, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e,
+	0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee,
+	0x32, 0x20, 0x16, 0x04, 0x7a, 0x68, 0x5e, 0x4c,
+	0xa2, 0xb0, 0x86, 0x94, 0xea, 0xf8, 0xce, 0xdc,
+	0x64, 0x76, 0x40, 0x52, 0x2c, 0x3e, 0x08, 0x1a,
+	0xf4, 0xe6, 0xd0, 0xc2, 0xbc, 0xae, 0x98, 0x8a,
+	0x56, 0x44, 0x72, 0x60, 0x1e, 0x0c, 0x3a, 0x28,
+	0xc6, 0xd4, 0xe2, 0xf0, 0x8e, 0x9c, 0xaa, 0xb8,
+	0xc8, 0xda, 0xec, 0xfe, 0x80, 0x92, 0xa4, 0xb6,
+	0x58, 0x4a, 0x7c, 0x6e, 0x10, 0x02, 0x34, 0x26,
+	0xfa, 0xe8, 0xde, 0xcc, 0xb2, 0xa0, 0x96, 0x84,
+	0x6a, 0x78, 0x4e, 0x5c, 0x22, 0x30, 0x06, 0x14,
+	0xac, 0xbe, 0x88, 0x9a, 0xe4, 0xf6, 0xc0, 0xd2,
+	0x3c, 0x2e, 0x18, 0x0a, 0x74, 0x66, 0x50, 0x42,
+	0x9e, 0x8c, 0xba, 0xa8, 0xd6, 0xc4, 0xf2, 0xe0,
+	0x0e, 0x1c, 0x2a, 0x38, 0x46, 0x54, 0x62, 0x70,
+	0x82, 0x90, 0xa6, 0xb4, 0xca, 0xd8, 0xee, 0xfc,
+	0x12, 0x00, 0x36, 0x24, 0x5a, 0x48, 0x7e, 0x6c,
+	0xb0, 0xa2, 0x94, 0x86, 0xf8, 0xea, 0xdc, 0xce,
+	0x20, 0x32, 0x04, 0x16, 0x68, 0x7a, 0x4c, 0x5e,
+	0xe6, 0xf4, 0xc2, 0xd0, 0xae, 0xbc, 0x8a, 0x98,
+	0x76, 0x64, 0x52, 0x40, 0x3e, 0x2c, 0x1a, 0x08,
+	0xd4, 0xc6, 0xf0, 0xe2, 0x9c, 0x8e, 0xb8, 0xaa,
+	0x44, 0x56, 0x60, 0x72, 0x0c, 0x1e, 0x28, 0x3a,
+	0x4a, 0x58, 0x6e, 0x7c, 0x02, 0x10, 0x26, 0x34,
+	0xda, 0xc8, 0xfe, 0xec, 0x92, 0x80, 0xb6, 0xa4,
+	0x78, 0x6a, 0x5c, 0x4e, 0x30, 0x22, 0x14, 0x06,
+	0xe8, 0xfa, 0xcc, 0xde, 0xa0, 0xb2, 0x84, 0x96,
+	0x2e, 0x3c, 0x0a, 0x18, 0x66, 0x74, 0x42, 0x50,
+	0xbe, 0xac, 0x9a, 0x88, 0xf6, 0xe4, 0xd2, 0xc0,
+	0x1c, 0x0e, 0x38, 0x2a, 0x54, 0x46, 0x70, 0x62,
+	0x8c, 0x9e, 0xa8, 0xba, 0xc4, 0xd6, 0xe0, 0xf2
+};
+
+/**
+ * crc7_be - update the CRC7 for the data buffer
+ * @crc:     previous CRC7 value
+ * @buffer:  data pointer
+ * @len:     number of bytes in the buffer
+ * Context: any
+ *
+ * Returns the updated CRC7 value.
+ * The CRC7 is left-aligned in the byte (the lsbit is always 0), as that
+ * makes the computation easier, and all callers want it in that form.
+ *
+ */
+u8 crc7_be(u8 crc, const u8 *buffer, size_t len)
+{
+	while (len--)
+		crc = crc7_be_syndrome_table[crc ^ *buffer++];
+	return crc;
+}
+EXPORT_SYMBOL(crc7_be);
+
+MODULE_DESCRIPTION("CRC7 calculations");
+MODULE_LICENSE("GPL");
diff --git a/lib/crc/crc8.c b/lib/crc/crc8.c
new file mode 100644
index 000000000000..1ad8e501d9b6
--- /dev/null
+++ b/lib/crc/crc8.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2011 Broadcom Corporation
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#define pr_fmt(fmt)		KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/crc8.h>
+#include <linux/printk.h>
+
+/**
+ * crc8_populate_msb - fill crc table for given polynomial in reverse bit order.
+ *
+ * @table:	table to be filled.
+ * @polynomial:	polynomial for which table is to be filled.
+ */
+void crc8_populate_msb(u8 table[CRC8_TABLE_SIZE], u8 polynomial)
+{
+	int i, j;
+	const u8 msbit = 0x80;
+	u8 t = msbit;
+
+	table[0] = 0;
+
+	for (i = 1; i < CRC8_TABLE_SIZE; i *= 2) {
+		t = (t << 1) ^ (t & msbit ? polynomial : 0);
+		for (j = 0; j < i; j++)
+			table[i+j] = table[j] ^ t;
+	}
+}
+EXPORT_SYMBOL(crc8_populate_msb);
+
+/**
+ * crc8_populate_lsb - fill crc table for given polynomial in regular bit order.
+ *
+ * @table:	table to be filled.
+ * @polynomial:	polynomial for which table is to be filled.
+ */
+void crc8_populate_lsb(u8 table[CRC8_TABLE_SIZE], u8 polynomial)
+{
+	int i, j;
+	u8 t = 1;
+
+	table[0] = 0;
+
+	for (i = (CRC8_TABLE_SIZE >> 1); i; i >>= 1) {
+		t = (t >> 1) ^ (t & 1 ? polynomial : 0);
+		for (j = 0; j < CRC8_TABLE_SIZE; j += 2*i)
+			table[i+j] = table[j] ^ t;
+	}
+}
+EXPORT_SYMBOL(crc8_populate_lsb);
+
+/**
+ * crc8 - calculate a crc8 over the given input data.
+ *
+ * @table: crc table used for calculation.
+ * @pdata: pointer to data buffer.
+ * @nbytes: number of bytes in data buffer.
+ * @crc: previous returned crc8 value.
+ */
+u8 crc8(const u8 table[CRC8_TABLE_SIZE], const u8 *pdata, size_t nbytes, u8 crc)
+{
+	/* loop over the buffer data */
+	while (nbytes-- > 0)
+		crc = table[(crc ^ *pdata++) & 0xff];
+
+	return crc;
+}
+EXPORT_SYMBOL(crc8);
+
+MODULE_DESCRIPTION("CRC8 (by Williams, Ross N.) function");
+MODULE_AUTHOR("Broadcom Corporation");
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/lib/crc/gen_crc32table.c b/lib/crc/gen_crc32table.c
new file mode 100644
index 000000000000..9a7f31658e35
--- /dev/null
+++ b/lib/crc/gen_crc32table.c
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include "../../include/linux/crc32poly.h"
+#include "../../include/generated/autoconf.h"
+#include <inttypes.h>
+
+static uint32_t crc32table_le[256];
+static uint32_t crc32table_be[256];
+static uint32_t crc32ctable_le[256];
+
+/**
+ * crc32init_le_generic() - fill @tab with the LE CRC table for @polynomial
+ *
+ * At each outer-loop step crc is the table entry for the single-bit byte
+ * value i; other entries are filled in based on the fact that
+ * crctable[i^j] = crctable[i] ^ crctable[j].
+ */
+static void crc32init_le_generic(const uint32_t polynomial, uint32_t tab[256])
+{
+	unsigned i, j;
+	uint32_t crc = 1;
+
+	tab[0] = 0;
+
+	for (i = 128; i; i >>= 1) {
+		crc = (crc >> 1) ^ ((crc & 1) ? polynomial : 0);
+		for (j = 0; j < 256; j += 2 * i)
+			tab[i + j] = crc ^ tab[j];
+	}
+}
+
+static void crc32init_le(void)
+{
+	crc32init_le_generic(CRC32_POLY_LE, crc32table_le);
+}
+
+static void crc32cinit_le(void)
+{
+	crc32init_le_generic(CRC32C_POLY_LE, crc32ctable_le);
+}
+
+/**
+ * crc32init_be() - fill crc32table_be[] for CRC32_POLY_BE (nothing is allocated)
+ */
+static void crc32init_be(void)
+{
+	unsigned i, j;
+	uint32_t crc = 0x80000000;
+
+	crc32table_be[0] = 0;
+
+	for (i = 1; i < 256; i <<= 1) {
+		crc = (crc << 1) ^ ((crc & 0x80000000) ? CRC32_POLY_BE : 0);
+		for (j = 0; j < i; j++)
+			crc32table_be[i + j] = crc ^ crc32table_be[j];
+	}
+}
+
+/* Print @table four entries per row; PRIx32 is the specifier for uint32_t. */
+static void output_table(const uint32_t table[256])
+{
+	int i;
+
+	for (i = 0; i < 256; i += 4)
+		printf("\t0x%08" PRIx32 ", 0x%08" PRIx32 ", 0x%08" PRIx32 ", 0x%08" PRIx32 ",\n",
+		       table[i], table[i + 1], table[i + 2], table[i + 3]);
+}
+
+int main(int argc, char** argv)
+{
+	printf("/* this file is generated - do not edit */\n\n");
+
+	crc32init_le();
+	printf("static const u32 ____cacheline_aligned crc32table_le[256] = {\n");
+	output_table(crc32table_le);
+	printf("};\n");
+
+	crc32init_be();
+	printf("static const u32 ____cacheline_aligned crc32table_be[256] = {\n");
+	output_table(crc32table_be);
+	printf("};\n");
+
+	crc32cinit_le();
+	printf("static const u32 ____cacheline_aligned crc32ctable_le[256] = {\n");
+	output_table(crc32ctable_le);
+	printf("};\n");
+
+	return 0;
+}
diff --git a/lib/crc/gen_crc64table.c b/lib/crc/gen_crc64table.c
new file mode 100644
index 000000000000..f2be9f62bab7
--- /dev/null
+++ b/lib/crc/gen_crc64table.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This host program runs at kernel build time and generates the lookup tables
+ * used by the generic CRC64 code.
+ *
+ * Copyright 2018 SUSE Linux.
+ *   Author: Coly Li <colyli@suse.de>
+ */
+#include <inttypes.h>
+#include <stdio.h>
+
+#define CRC64_ECMA182_POLY 0x42F0E1EBA9EA3693ULL
+#define CRC64_NVME_POLY 0x9A6C9329AC4BC9B5ULL
+
+static uint64_t crc64_table[256] = {0};
+static uint64_t crc64_nvme_table[256] = {0};
+
+/*
+ * Entry n of @table is the CRC of the single byte n, fed in bit 0 first.
+ */
+static void generate_reflected_crc64_table(uint64_t table[256], uint64_t poly)
+{
+	unsigned int byte, bit;
+
+	for (byte = 0; byte < 256; byte++) {
+		uint64_t crc = 0;
+
+		for (bit = 0; bit < 8; bit++) {
+			uint64_t feedback = (crc ^ (byte >> bit)) & 1;
+			crc = (crc >> 1) ^ (feedback ? poly : 0);
+		}
+		table[byte] = crc;
+	}
+}
+
+/*
+ * Entry n of @table is the CRC of the single byte n, fed in bit 7 first.
+ */
+static void generate_crc64_table(uint64_t table[256], uint64_t poly)
+{
+	const uint64_t top_bit = 1ULL << 63;
+	unsigned int byte, bit;
+
+	for (byte = 0; byte < 256; byte++) {
+		uint64_t data = (uint64_t)byte << 56;
+		uint64_t crc = 0;
+
+		for (bit = 0; bit < 8; bit++, data <<= 1) {
+			uint64_t feedback = (crc ^ data) & top_bit;
+			crc = (crc << 1) ^ (feedback ? poly : 0);
+		}
+		table[byte] = crc;
+	}
+}
+
+static void output_table(uint64_t table[256])
+{
+	int i;
+
+	for (i = 0; i < 256; i++) {
+		printf("\t0x%016" PRIx64 "ULL", table[i]);
+		if (i & 0x1)
+			printf(",\n");
+		else
+			printf(", ");
+	}
+	printf("};\n");
+}
+
+static void print_crc64_tables(void)
+{
+	printf("/* this file is generated - do not edit */\n\n");
+	printf("#include <linux/types.h>\n");
+	printf("#include <linux/cache.h>\n\n");
+	printf("static const u64 ____cacheline_aligned crc64table[256] = {\n");
+	output_table(crc64_table);
+
+	printf("\nstatic const u64 ____cacheline_aligned crc64nvmetable[256] = {\n");
+	output_table(crc64_nvme_table);
+}
+
+int main(int argc, char *argv[])
+{
+	generate_crc64_table(crc64_table, CRC64_ECMA182_POLY);
+	generate_reflected_crc64_table(crc64_nvme_table, CRC64_NVME_POLY);
+	print_crc64_tables();
+	return 0;
+}
diff --git a/lib/crc/tests/Makefile b/lib/crc/tests/Makefile
new file mode 100644
index 000000000000..65f63c318ef5
--- /dev/null
+++ b/lib/crc/tests/Makefile
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_CRC_KUNIT_TEST) += crc_kunit.o
diff --git a/lib/crc/tests/crc_kunit.c b/lib/crc/tests/crc_kunit.c
new file mode 100644
index 000000000000..f08d985d8860
--- /dev/null
+++ b/lib/crc/tests/crc_kunit.c
@@ -0,0 +1,452 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Unit tests and benchmarks for the CRC library functions
+ *
+ * Copyright 2024 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+#include <kunit/test.h>
+#include <linux/crc7.h>
+#include <linux/crc16.h>
+#include <linux/crc-t10dif.h>
+#include <linux/crc32.h>
+#include <linux/crc32c.h>
+#include <linux/crc64.h>
+#include <linux/prandom.h>
+#include <linux/vmalloc.h>
+
+#define CRC_KUNIT_SEED			42
+#define CRC_KUNIT_MAX_LEN		16384
+#define CRC_KUNIT_NUM_TEST_ITERS	1000
+
+static struct rnd_state rng;
+static u8 *test_buffer;
+static size_t test_buflen;
+
+/**
+ * struct crc_variant - describes a CRC variant
+ * @bits: Number of bits in the CRC, 1 <= @bits <= 64.
+ * @le: true if it's a "little endian" CRC (reversed mapping between bits and
+ *	polynomial coefficients in each byte), false if it's a "big endian" CRC
+ *	(natural mapping between bits and polynomial coefficients in each byte)
+ * @poly: The generator polynomial with the highest-order term omitted.
+ *	  Bit-reversed if @le is true.
+ * @func: The function to compute a CRC.  The type signature uses u64 so that it
+ *	  can fit any CRC up to CRC-64.  The CRC is passed in, and is expected
+ *	  to be returned in, the least significant bits of the u64.  The
+ *	  function is expected to *not* invert the CRC at the beginning and end.
+ */
+struct crc_variant {
+	int bits;
+	bool le;
+	u64 poly;
+	u64 (*func)(u64 crc, const u8 *p, size_t len);
+};
+
+static u32 rand32(void)
+{
+	return prandom_u32_state(&rng);
+}
+
+static u64 rand64(void)
+{
+	u32 n = rand32();
+
+	return ((u64)n << 32) | rand32();
+}
+
+static u64 crc_mask(const struct crc_variant *v)
+{
+	return (u64)-1 >> (64 - v->bits);
+}
+
+/* Reference implementation of any CRC variant */
+static u64 crc_ref(const struct crc_variant *v,
+		   u64 crc, const u8 *p, size_t len)
+{
+	size_t i, j;
+
+	for (i = 0; i < len; i++) {
+		for (j = 0; j < 8; j++) {
+			if (v->le) {
+				crc ^= (p[i] >> j) & 1;
+				crc = (crc >> 1) ^ ((crc & 1) ? v->poly : 0);
+			} else {
+				crc ^= (u64)((p[i] >> (7 - j)) & 1) <<
+				       (v->bits - 1);
+				if (crc & (1ULL << (v->bits - 1)))
+					crc = ((crc << 1) ^ v->poly) &
+					      crc_mask(v);
+				else
+					crc <<= 1;
+			}
+		}
+	}
+	return crc;
+}
+
+static int crc_suite_init(struct kunit_suite *suite)
+{
+	/*
+	 * Allocate the test buffer using vmalloc() with a page-aligned length
+	 * so that it is immediately followed by a guard page.  This allows
+	 * buffer overreads to be detected, even in assembly code.
+	 */
+	test_buflen = round_up(CRC_KUNIT_MAX_LEN, PAGE_SIZE);
+	test_buffer = vmalloc(test_buflen);
+	if (!test_buffer)
+		return -ENOMEM;
+
+	prandom_seed_state(&rng, CRC_KUNIT_SEED);
+	prandom_bytes_state(&rng, test_buffer, test_buflen);
+	return 0;
+}
+
+static void crc_suite_exit(struct kunit_suite *suite)
+{
+	vfree(test_buffer);
+	test_buffer = NULL;
+}
+
+/* Initial CRC: 0 when rand32() % 4 is 0, all ones when 1, otherwise random. */
+static u64 generate_random_initial_crc(const struct crc_variant *v)
+{
+	switch (rand32() % 4) {
+	case 0:
+		return 0;
+	case 1:
+		return crc_mask(v); /* All 1 bits */
+	default:
+		return rand64() & crc_mask(v);
+	}
+}
+
+/* Generate a random length, preferring small lengths. */
+static size_t generate_random_length(size_t max_length)
+{
+	size_t len;
+
+	switch (rand32() % 3) {
+	case 0:
+		len = rand32() % 128;
+		break;
+	case 1:
+		len = rand32() % 3072;
+		break;
+	default:
+		len = rand32();
+		break;
+	}
+	return len % (max_length + 1);
+}
+
+/* Test that v->func gives the same CRCs as a reference implementation. */
+static void crc_test(struct kunit *test, const struct crc_variant *v)
+{
+	size_t i;
+
+	for (i = 0; i < CRC_KUNIT_NUM_TEST_ITERS; i++) {
+		u64 init_crc, expected_crc, actual_crc;
+		size_t len, offset;
+		bool nosimd;
+
+		init_crc = generate_random_initial_crc(v);
+		len = generate_random_length(CRC_KUNIT_MAX_LEN);
+
+		/* Generate a random offset. */
+		if (rand32() % 2 == 0) {
+			/* Use a random alignment mod 64 */
+			offset = rand32() % 64;
+			offset = min(offset, CRC_KUNIT_MAX_LEN - len);
+		} else {
+			/* Go up to the guard page, to catch buffer overreads */
+			offset = test_buflen - len;
+		}
+
+		if (rand32() % 8 == 0)
+			/* Refresh the data occasionally. */
+			prandom_bytes_state(&rng, &test_buffer[offset], len);
+
+		nosimd = rand32() % 8 == 0;
+
+		/*
+		 * Compute the CRC, and verify that it equals the CRC computed
+		 * by a simple bit-at-a-time reference implementation.
+		 */
+		expected_crc = crc_ref(v, init_crc, &test_buffer[offset], len);
+		if (nosimd)
+			local_irq_disable();
+		actual_crc = v->func(init_crc, &test_buffer[offset], len);
+		if (nosimd)
+			local_irq_enable();
+		KUNIT_EXPECT_EQ_MSG(test, expected_crc, actual_crc,
+				    "Wrong result with len=%zu offset=%zu nosimd=%d",
+				    len, offset, nosimd);
+	}
+}
+
+static __always_inline void
+crc_benchmark(struct kunit *test,
+	      u64 (*crc_func)(u64 crc, const u8 *p, size_t len))
+{
+	static const size_t lens_to_test[] = {
+		1, 16, 64, 127, 128, 200, 256, 511, 512, 1024, 3173, 4096, 16384,
+	};
+	size_t len, i, j, num_iters;
+	/*
+	 * The CRC value that this function computes in a series of calls to
+	 * crc_func is never actually used, so use volatile to ensure that the
+	 * computations are done as intended and don't all get optimized out.
+	 */
+	volatile u64 crc = 0;
+	u64 t;
+
+	if (!IS_ENABLED(CONFIG_CRC_BENCHMARK))
+		kunit_skip(test, "not enabled");
+
+	/* warm-up */
+	for (i = 0; i < 10000000; i += CRC_KUNIT_MAX_LEN)
+		crc = crc_func(crc, test_buffer, CRC_KUNIT_MAX_LEN);
+
+	for (i = 0; i < ARRAY_SIZE(lens_to_test); i++) {
+		len = lens_to_test[i];
+		KUNIT_ASSERT_LE(test, len, CRC_KUNIT_MAX_LEN);
+		num_iters = 10000000 / (len + 128);
+		preempt_disable();
+		t = ktime_get_ns();
+		for (j = 0; j < num_iters; j++)
+			crc = crc_func(crc, test_buffer, len);
+		t = ktime_get_ns() - t;
+		preempt_enable();
+		kunit_info(test, "len=%zu: %llu MB/s\n",
+			   len, div64_u64((u64)len * num_iters * 1000, t));
+	}
+}
+
+/* crc7_be */
+
+static u64 crc7_be_wrapper(u64 crc, const u8 *p, size_t len)
+{
+	/*
+	 * crc7_be() left-aligns the 7-bit CRC in a u8, whereas the test wants a
+	 * right-aligned CRC (in a u64).  Convert between the conventions.
+	 */
+	return crc7_be(crc << 1, p, len) >> 1;
+}
+
+static const struct crc_variant crc_variant_crc7_be = {
+	.bits = 7,
+	.poly = 0x9,
+	.func = crc7_be_wrapper,
+};
+
+static void crc7_be_test(struct kunit *test)
+{
+	crc_test(test, &crc_variant_crc7_be);
+}
+
+static void crc7_be_benchmark(struct kunit *test)
+{
+	crc_benchmark(test, crc7_be_wrapper);
+}
+
+/* crc16 */
+
+static u64 crc16_wrapper(u64 crc, const u8 *p, size_t len)
+{
+	return crc16(crc, p, len);
+}
+
+static const struct crc_variant crc_variant_crc16 = {
+	.bits = 16,
+	.le = true,
+	.poly = 0xa001,
+	.func = crc16_wrapper,
+};
+
+static void crc16_test(struct kunit *test)
+{
+	crc_test(test, &crc_variant_crc16);
+}
+
+static void crc16_benchmark(struct kunit *test)
+{
+	crc_benchmark(test, crc16_wrapper);
+}
+
+/* crc_t10dif */
+
+static u64 crc_t10dif_wrapper(u64 crc, const u8 *p, size_t len)
+{
+	return crc_t10dif_update(crc, p, len);
+}
+
+static const struct crc_variant crc_variant_crc_t10dif = {
+	.bits = 16,
+	.le = false,
+	.poly = 0x8bb7,
+	.func = crc_t10dif_wrapper,
+};
+
+static void crc_t10dif_test(struct kunit *test)
+{
+	crc_test(test, &crc_variant_crc_t10dif);
+}
+
+static void crc_t10dif_benchmark(struct kunit *test)
+{
+	crc_benchmark(test, crc_t10dif_wrapper);
+}
+
+/* crc32_le */
+
+static u64 crc32_le_wrapper(u64 crc, const u8 *p, size_t len)
+{
+	return crc32_le(crc, p, len);
+}
+
+static const struct crc_variant crc_variant_crc32_le = {
+	.bits = 32,
+	.le = true,
+	.poly = 0xedb88320,
+	.func = crc32_le_wrapper,
+};
+
+static void crc32_le_test(struct kunit *test)
+{
+	crc_test(test, &crc_variant_crc32_le);
+}
+
+static void crc32_le_benchmark(struct kunit *test)
+{
+	crc_benchmark(test, crc32_le_wrapper);
+}
+
+/* crc32_be */
+
+static u64 crc32_be_wrapper(u64 crc, const u8 *p, size_t len)
+{
+	return crc32_be(crc, p, len);
+}
+
+static const struct crc_variant crc_variant_crc32_be = {
+	.bits = 32,
+	.le = false,
+	.poly = 0x04c11db7,
+	.func = crc32_be_wrapper,
+};
+
+static void crc32_be_test(struct kunit *test)
+{
+	crc_test(test, &crc_variant_crc32_be);
+}
+
+static void crc32_be_benchmark(struct kunit *test)
+{
+	crc_benchmark(test, crc32_be_wrapper);
+}
+
+/* crc32c */
+
+static u64 crc32c_wrapper(u64 crc, const u8 *p, size_t len)
+{
+	return crc32c(crc, p, len);
+}
+
+static const struct crc_variant crc_variant_crc32c = {
+	.bits = 32,
+	.le = true,
+	.poly = 0x82f63b78,
+	.func = crc32c_wrapper,
+};
+
+static void crc32c_test(struct kunit *test)
+{
+	crc_test(test, &crc_variant_crc32c);
+}
+
+static void crc32c_benchmark(struct kunit *test)
+{
+	crc_benchmark(test, crc32c_wrapper);
+}
+
+/* crc64_be */
+
+static u64 crc64_be_wrapper(u64 crc, const u8 *p, size_t len)
+{
+	return crc64_be(crc, p, len);
+}
+
+static const struct crc_variant crc_variant_crc64_be = {
+	.bits = 64,
+	.le = false,
+	.poly = 0x42f0e1eba9ea3693,
+	.func = crc64_be_wrapper,
+};
+
+static void crc64_be_test(struct kunit *test)
+{
+	crc_test(test, &crc_variant_crc64_be);
+}
+
+static void crc64_be_benchmark(struct kunit *test)
+{
+	crc_benchmark(test, crc64_be_wrapper);
+}
+
+/* crc64_nvme */
+
+static u64 crc64_nvme_wrapper(u64 crc, const u8 *p, size_t len)
+{
+	/* The inversions that crc64_nvme() does have to be undone here. */
+	return ~crc64_nvme(~crc, p, len);
+}
+
+static const struct crc_variant crc_variant_crc64_nvme = {
+	.bits = 64,
+	.le = true,
+	.poly = 0x9a6c9329ac4bc9b5,
+	.func = crc64_nvme_wrapper,
+};
+
+static void crc64_nvme_test(struct kunit *test)
+{
+	crc_test(test, &crc_variant_crc64_nvme);
+}
+
+static void crc64_nvme_benchmark(struct kunit *test)
+{
+	crc_benchmark(test, crc64_nvme_wrapper);
+}
+
+static struct kunit_case crc_test_cases[] = {
+	KUNIT_CASE(crc7_be_test),
+	KUNIT_CASE(crc7_be_benchmark),
+	KUNIT_CASE(crc16_test),
+	KUNIT_CASE(crc16_benchmark),
+	KUNIT_CASE(crc_t10dif_test),
+	KUNIT_CASE(crc_t10dif_benchmark),
+	KUNIT_CASE(crc32_le_test),
+	KUNIT_CASE(crc32_le_benchmark),
+	KUNIT_CASE(crc32_be_test),
+	KUNIT_CASE(crc32_be_benchmark),
+	KUNIT_CASE(crc32c_test),
+	KUNIT_CASE(crc32c_benchmark),
+	KUNIT_CASE(crc64_be_test),
+	KUNIT_CASE(crc64_be_benchmark),
+	KUNIT_CASE(crc64_nvme_test),
+	KUNIT_CASE(crc64_nvme_benchmark),
+	{},
+};
+
+static struct kunit_suite crc_test_suite = {
+	.name = "crc",
+	.test_cases = crc_test_cases,
+	.suite_init = crc_suite_init,
+	.suite_exit = crc_suite_exit,
+};
+kunit_test_suite(crc_test_suite);
+
+MODULE_DESCRIPTION("Unit tests and benchmarks for the CRC library functions");
+MODULE_LICENSE("GPL");
diff --git a/lib/crc16.c b/lib/crc16.c
deleted file mode 100644
index 9c71eda9bf4b..000000000000
--- a/lib/crc16.c
+++ /dev/null
@@ -1,64 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *      crc16.c
- */
-
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/crc16.h>
-
-/** CRC table for the CRC-16. The poly is 0x8005 (x^16 + x^15 + x^2 + 1) */
-static const u16 crc16_table[256] = {
-	0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
-	0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
-	0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
-	0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
-	0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
-	0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
-	0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
-	0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
-	0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
-	0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
-	0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
-	0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
-	0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
-	0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
-	0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
-	0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
-	0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
-	0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
-	0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
-	0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
-	0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
-	0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
-	0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
-	0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
-	0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
-	0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
-	0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
-	0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
-	0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
-	0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
-	0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
-	0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
-};
-
-/**
- * crc16 - compute the CRC-16 for the data buffer
- * @crc:	previous CRC value
- * @p:		data pointer
- * @len:	number of bytes in the buffer
- *
- * Returns the updated CRC value.
- */
-u16 crc16(u16 crc, const u8 *p, size_t len)
-{
-	while (len--)
-		crc = (crc >> 8) ^ crc16_table[(crc & 0xff) ^ *p++];
-	return crc;
-}
-EXPORT_SYMBOL(crc16);
-
-MODULE_DESCRIPTION("CRC16 calculations");
-MODULE_LICENSE("GPL");
-
diff --git a/lib/crc32.c b/lib/crc32.c
deleted file mode 100644
index 6811b37df2aa..000000000000
--- a/lib/crc32.c
+++ /dev/null
@@ -1,59 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Aug 8, 2011 Bob Pearson with help from Joakim Tjernlund and George Spelvin
- * cleaned up code to current version of sparse and added the slicing-by-8
- * algorithm to the closely similar existing slicing-by-4 algorithm.
- *
- * Oct 15, 2000 Matt Domsch <Matt_Domsch@dell.com>
- * Nicer crc32 functions/docs submitted by linux@horizon.com.  Thanks!
- * Code was from the public domain, copyright abandoned.  Code was
- * subsequently included in the kernel, thus was re-licensed under the
- * GNU GPL v2.
- *
- * Oct 12, 2000 Matt Domsch <Matt_Domsch@dell.com>
- * Same crc32 function was used in 5 other places in the kernel.
- * I made one version, and deleted the others.
- * There are various incantations of crc32().  Some use a seed of 0 or ~0.
- * Some xor at the end with ~0.  The generic crc32() function takes
- * seed as an argument, and doesn't xor at the end.  Then individual
- * users can do whatever they need.
- *   drivers/net/smc9194.c uses seed ~0, doesn't xor with ~0.
- *   fs/jffs2 uses seed 0, doesn't xor with ~0.
- *   fs/partitions/efi.c uses seed ~0, xor's with ~0.
- */
-
-/* see: Documentation/staging/crc32.rst for a description of algorithms */
-
-#include <linux/crc32.h>
-#include <linux/module.h>
-#include <linux/types.h>
-
-#include "crc32table.h"
-
-MODULE_AUTHOR("Matt Domsch <Matt_Domsch@dell.com>");
-MODULE_DESCRIPTION("Various CRC32 calculations");
-MODULE_LICENSE("GPL");
-
-u32 crc32_le_base(u32 crc, const u8 *p, size_t len)
-{
-	while (len--)
-		crc = (crc >> 8) ^ crc32table_le[(crc & 255) ^ *p++];
-	return crc;
-}
-EXPORT_SYMBOL(crc32_le_base);
-
-u32 crc32c_base(u32 crc, const u8 *p, size_t len)
-{
-	while (len--)
-		crc = (crc >> 8) ^ crc32ctable_le[(crc & 255) ^ *p++];
-	return crc;
-}
-EXPORT_SYMBOL(crc32c_base);
-
-u32 crc32_be_base(u32 crc, const u8 *p, size_t len)
-{
-	while (len--)
-		crc = (crc << 8) ^ crc32table_be[(crc >> 24) ^ *p++];
-	return crc;
-}
-EXPORT_SYMBOL(crc32_be_base);
diff --git a/lib/crc4.c b/lib/crc4.c
deleted file mode 100644
index e7e1779c67d9..000000000000
--- a/lib/crc4.c
+++ /dev/null
@@ -1,44 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * crc4.c - simple crc-4 calculations.
- */
-
-#include <linux/crc4.h>
-#include <linux/module.h>
-
-static const uint8_t crc4_tab[] = {
-	0x0, 0x7, 0xe, 0x9, 0xb, 0xc, 0x5, 0x2,
-	0x1, 0x6, 0xf, 0x8, 0xa, 0xd, 0x4, 0x3,
-};
-
-/**
- * crc4 - calculate the 4-bit crc of a value.
- * @c:    starting crc4
- * @x:    value to checksum
- * @bits: number of bits in @x to checksum
- *
- * Returns the crc4 value of @x, using polynomial 0b10111.
- *
- * The @x value is treated as left-aligned, and bits above @bits are ignored
- * in the crc calculations.
- */
-uint8_t crc4(uint8_t c, uint64_t x, int bits)
-{
-	int i;
-
-	/* mask off anything above the top bit */
-	x &= (1ull << bits) - 1;
-
-	/* Align to 4-bits */
-	bits = (bits + 3) & ~0x3;
-
-	/* Calculate crc4 over four-bit nibbles, starting at the MSbit */
-	for (i = bits - 4; i >= 0; i -= 4)
-		c = crc4_tab[c ^ ((x >> i) & 0xf)];
-
-	return c;
-}
-EXPORT_SYMBOL_GPL(crc4);
-
-MODULE_DESCRIPTION("CRC4 calculations");
-MODULE_LICENSE("GPL");
diff --git a/lib/crc64.c b/lib/crc64.c
deleted file mode 100644
index 5b1b17057f0a..000000000000
--- a/lib/crc64.c
+++ /dev/null
@@ -1,58 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Normal 64-bit CRC calculation.
- *
- * This is a basic crc64 implementation following ECMA-182 specification,
- * which can be found from,
- * https://www.ecma-international.org/publications/standards/Ecma-182.htm
- *
- * Dr. Ross N. Williams has a great document to introduce the idea of CRC
- * algorithm, here the CRC64 code is also inspired by the table-driven
- * algorithm and detail example from this paper. This paper can be found
- * from,
- * http://www.ross.net/crc/download/crc_v3.txt
- *
- * crc64table[256] is the lookup table of a table-driven 64-bit CRC
- * calculation, which is generated by gen_crc64table.c in kernel build
- * time. The polynomial of crc64 arithmetic is from ECMA-182 specification
- * as well, which is defined as,
- *
- * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
- * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
- * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
- * x^7 + x^4 + x + 1
- *
- * crc64nvmetable[256] uses the CRC64 polynomial from the NVME NVM Command Set
- * Specification and uses least-significant-bit first bit order:
- *
- * x^64 + x^63 + x^61 + x^59 + x^58 + x^56 + x^55 + x^52 + x^49 + x^48 + x^47 +
- * x^46 + x^44 + x^41 + x^37 + x^36 + x^34 + x^32 + x^31 + x^28 + x^26 + x^23 +
- * x^22 + x^19 + x^16 + x^13 + x^12 + x^10 + x^9 + x^6 + x^4 + x^3 + 1
- *
- * Copyright 2018 SUSE Linux.
- *   Author: Coly Li <colyli@suse.de>
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/crc64.h>
-#include "crc64table.h"
-
-MODULE_DESCRIPTION("CRC64 calculations");
-MODULE_LICENSE("GPL v2");
-
-u64 crc64_be_generic(u64 crc, const u8 *p, size_t len)
-{
-	while (len--)
-		crc = (crc << 8) ^ crc64table[(crc >> 56) ^ *p++];
-	return crc;
-}
-EXPORT_SYMBOL_GPL(crc64_be_generic);
-
-u64 crc64_nvme_generic(u64 crc, const u8 *p, size_t len)
-{
-	while (len--)
-		crc = (crc >> 8) ^ crc64nvmetable[(crc & 0xff) ^ *p++];
-	return crc;
-}
-EXPORT_SYMBOL_GPL(crc64_nvme_generic);
diff --git a/lib/crc7.c b/lib/crc7.c
deleted file mode 100644
index 8dd991cc6114..000000000000
--- a/lib/crc7.c
+++ /dev/null
@@ -1,72 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *      crc7.c
- */
-
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/crc7.h>
-
-/*
- * Table for CRC-7 (polynomial x^7 + x^3 + 1).
- * This is a big-endian CRC (msbit is highest power of x),
- * aligned so the msbit of the byte is the x^6 coefficient
- * and the lsbit is not used.
- */
-static const u8 crc7_be_syndrome_table[256] = {
-	0x00, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e,
-	0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee,
-	0x32, 0x20, 0x16, 0x04, 0x7a, 0x68, 0x5e, 0x4c,
-	0xa2, 0xb0, 0x86, 0x94, 0xea, 0xf8, 0xce, 0xdc,
-	0x64, 0x76, 0x40, 0x52, 0x2c, 0x3e, 0x08, 0x1a,
-	0xf4, 0xe6, 0xd0, 0xc2, 0xbc, 0xae, 0x98, 0x8a,
-	0x56, 0x44, 0x72, 0x60, 0x1e, 0x0c, 0x3a, 0x28,
-	0xc6, 0xd4, 0xe2, 0xf0, 0x8e, 0x9c, 0xaa, 0xb8,
-	0xc8, 0xda, 0xec, 0xfe, 0x80, 0x92, 0xa4, 0xb6,
-	0x58, 0x4a, 0x7c, 0x6e, 0x10, 0x02, 0x34, 0x26,
-	0xfa, 0xe8, 0xde, 0xcc, 0xb2, 0xa0, 0x96, 0x84,
-	0x6a, 0x78, 0x4e, 0x5c, 0x22, 0x30, 0x06, 0x14,
-	0xac, 0xbe, 0x88, 0x9a, 0xe4, 0xf6, 0xc0, 0xd2,
-	0x3c, 0x2e, 0x18, 0x0a, 0x74, 0x66, 0x50, 0x42,
-	0x9e, 0x8c, 0xba, 0xa8, 0xd6, 0xc4, 0xf2, 0xe0,
-	0x0e, 0x1c, 0x2a, 0x38, 0x46, 0x54, 0x62, 0x70,
-	0x82, 0x90, 0xa6, 0xb4, 0xca, 0xd8, 0xee, 0xfc,
-	0x12, 0x00, 0x36, 0x24, 0x5a, 0x48, 0x7e, 0x6c,
-	0xb0, 0xa2, 0x94, 0x86, 0xf8, 0xea, 0xdc, 0xce,
-	0x20, 0x32, 0x04, 0x16, 0x68, 0x7a, 0x4c, 0x5e,
-	0xe6, 0xf4, 0xc2, 0xd0, 0xae, 0xbc, 0x8a, 0x98,
-	0x76, 0x64, 0x52, 0x40, 0x3e, 0x2c, 0x1a, 0x08,
-	0xd4, 0xc6, 0xf0, 0xe2, 0x9c, 0x8e, 0xb8, 0xaa,
-	0x44, 0x56, 0x60, 0x72, 0x0c, 0x1e, 0x28, 0x3a,
-	0x4a, 0x58, 0x6e, 0x7c, 0x02, 0x10, 0x26, 0x34,
-	0xda, 0xc8, 0xfe, 0xec, 0x92, 0x80, 0xb6, 0xa4,
-	0x78, 0x6a, 0x5c, 0x4e, 0x30, 0x22, 0x14, 0x06,
-	0xe8, 0xfa, 0xcc, 0xde, 0xa0, 0xb2, 0x84, 0x96,
-	0x2e, 0x3c, 0x0a, 0x18, 0x66, 0x74, 0x42, 0x50,
-	0xbe, 0xac, 0x9a, 0x88, 0xf6, 0xe4, 0xd2, 0xc0,
-	0x1c, 0x0e, 0x38, 0x2a, 0x54, 0x46, 0x70, 0x62,
-	0x8c, 0x9e, 0xa8, 0xba, 0xc4, 0xd6, 0xe0, 0xf2
-};
-
-/**
- * crc7_be - update the CRC7 for the data buffer
- * @crc:     previous CRC7 value
- * @buffer:  data pointer
- * @len:     number of bytes in the buffer
- * Context: any
- *
- * Returns the updated CRC7 value.
- * The CRC7 is left-aligned in the byte (the lsbit is always 0), as that
- * makes the computation easier, and all callers want it in that form.
- *
- */
-u8 crc7_be(u8 crc, const u8 *buffer, size_t len)
-{
-	while (len--)
-		crc = crc7_be_syndrome_table[crc ^ *buffer++];
-	return crc;
-}
-EXPORT_SYMBOL(crc7_be);
-
-MODULE_DESCRIPTION("CRC7 calculations");
-MODULE_LICENSE("GPL");
diff --git a/lib/crc8.c b/lib/crc8.c
deleted file mode 100644
index 1ad8e501d9b6..000000000000
--- a/lib/crc8.c
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2011 Broadcom Corporation
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#define pr_fmt(fmt)		KBUILD_MODNAME ": " fmt
-
-#include <linux/module.h>
-#include <linux/crc8.h>
-#include <linux/printk.h>
-
-/**
- * crc8_populate_msb - fill crc table for given polynomial in reverse bit order.
- *
- * @table:	table to be filled.
- * @polynomial:	polynomial for which table is to be filled.
- */
-void crc8_populate_msb(u8 table[CRC8_TABLE_SIZE], u8 polynomial)
-{
-	int i, j;
-	const u8 msbit = 0x80;
-	u8 t = msbit;
-
-	table[0] = 0;
-
-	for (i = 1; i < CRC8_TABLE_SIZE; i *= 2) {
-		t = (t << 1) ^ (t & msbit ? polynomial : 0);
-		for (j = 0; j < i; j++)
-			table[i+j] = table[j] ^ t;
-	}
-}
-EXPORT_SYMBOL(crc8_populate_msb);
-
-/**
- * crc8_populate_lsb - fill crc table for given polynomial in regular bit order.
- *
- * @table:	table to be filled.
- * @polynomial:	polynomial for which table is to be filled.
- */
-void crc8_populate_lsb(u8 table[CRC8_TABLE_SIZE], u8 polynomial)
-{
-	int i, j;
-	u8 t = 1;
-
-	table[0] = 0;
-
-	for (i = (CRC8_TABLE_SIZE >> 1); i; i >>= 1) {
-		t = (t >> 1) ^ (t & 1 ? polynomial : 0);
-		for (j = 0; j < CRC8_TABLE_SIZE; j += 2*i)
-			table[i+j] = table[j] ^ t;
-	}
-}
-EXPORT_SYMBOL(crc8_populate_lsb);
-
-/**
- * crc8 - calculate a crc8 over the given input data.
- *
- * @table: crc table used for calculation.
- * @pdata: pointer to data buffer.
- * @nbytes: number of bytes in data buffer.
- * @crc: previous returned crc8 value.
- */
-u8 crc8(const u8 table[CRC8_TABLE_SIZE], const u8 *pdata, size_t nbytes, u8 crc)
-{
-	/* loop over the buffer data */
-	while (nbytes-- > 0)
-		crc = table[(crc ^ *pdata++) & 0xff];
-
-	return crc;
-}
-EXPORT_SYMBOL(crc8);
-
-MODULE_DESCRIPTION("CRC8 (by Williams, Ross N.) function");
-MODULE_AUTHOR("Broadcom Corporation");
-MODULE_LICENSE("Dual BSD/GPL");
diff --git a/lib/gen_crc32table.c b/lib/gen_crc32table.c
deleted file mode 100644
index 6d03425b849e..000000000000
--- a/lib/gen_crc32table.c
+++ /dev/null
@@ -1,89 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <stdio.h>
-#include "../include/linux/crc32poly.h"
-#include "../include/generated/autoconf.h"
-#include <inttypes.h>
-
-static uint32_t crc32table_le[256];
-static uint32_t crc32table_be[256];
-static uint32_t crc32ctable_le[256];
-
-/**
- * crc32init_le() - allocate and initialize LE table data
- *
- * crc is the crc of the byte i; other entries are filled in based on the
- * fact that crctable[i^j] = crctable[i] ^ crctable[j].
- *
- */
-static void crc32init_le_generic(const uint32_t polynomial, uint32_t tab[256])
-{
-	unsigned i, j;
-	uint32_t crc = 1;
-
-	tab[0] = 0;
-
-	for (i = 128; i; i >>= 1) {
-		crc = (crc >> 1) ^ ((crc & 1) ? polynomial : 0);
-		for (j = 0; j < 256; j += 2 * i)
-			tab[i + j] = crc ^ tab[j];
-	}
-}
-
-static void crc32init_le(void)
-{
-	crc32init_le_generic(CRC32_POLY_LE, crc32table_le);
-}
-
-static void crc32cinit_le(void)
-{
-	crc32init_le_generic(CRC32C_POLY_LE, crc32ctable_le);
-}
-
-/**
- * crc32init_be() - allocate and initialize BE table data
- */
-static void crc32init_be(void)
-{
-	unsigned i, j;
-	uint32_t crc = 0x80000000;
-
-	crc32table_be[0] = 0;
-
-	for (i = 1; i < 256; i <<= 1) {
-		crc = (crc << 1) ^ ((crc & 0x80000000) ? CRC32_POLY_BE : 0);
-		for (j = 0; j < i; j++)
-			crc32table_be[i + j] = crc ^ crc32table_be[j];
-	}
-}
-
-static void output_table(const uint32_t table[256])
-{
-	int i;
-
-	for (i = 0; i < 256; i += 4) {
-		printf("\t0x%08x, 0x%08x, 0x%08x, 0x%08x,\n",
-		       table[i], table[i + 1], table[i + 2], table[i + 3]);
-	}
-}
-
-int main(int argc, char** argv)
-{
-	printf("/* this file is generated - do not edit */\n\n");
-
-	crc32init_le();
-	printf("static const u32 ____cacheline_aligned crc32table_le[256] = {\n");
-	output_table(crc32table_le);
-	printf("};\n");
-
-	crc32init_be();
-	printf("static const u32 ____cacheline_aligned crc32table_be[256] = {\n");
-	output_table(crc32table_be);
-	printf("};\n");
-
-	crc32cinit_le();
-	printf("static const u32 ____cacheline_aligned crc32ctable_le[256] = {\n");
-	output_table(crc32ctable_le);
-	printf("};\n");
-
-	return 0;
-}
diff --git a/lib/gen_crc64table.c b/lib/gen_crc64table.c
deleted file mode 100644
index e05a4230a0a0..000000000000
--- a/lib/gen_crc64table.c
+++ /dev/null
@@ -1,95 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Generate lookup table for the table-driven CRC64 calculation.
- *
- * gen_crc64table is executed in kernel build time and generates
- * lib/crc64table.h. This header is included by lib/crc64.c for
- * the table-driven CRC64 calculation.
- *
- * See lib/crc64.c for more information about which specification
- * and polynomial arithmetic that gen_crc64table.c follows to
- * generate the lookup table.
- *
- * Copyright 2018 SUSE Linux.
- *   Author: Coly Li <colyli@suse.de>
- */
-#include <inttypes.h>
-#include <stdio.h>
-
-#define CRC64_ECMA182_POLY 0x42F0E1EBA9EA3693ULL
-#define CRC64_NVME_POLY 0x9A6C9329AC4BC9B5ULL
-
-static uint64_t crc64_table[256] = {0};
-static uint64_t crc64_nvme_table[256] = {0};
-
-static void generate_reflected_crc64_table(uint64_t table[256], uint64_t poly)
-{
-	uint64_t i, j, c, crc;
-
-	for (i = 0; i < 256; i++) {
-		crc = 0ULL;
-		c = i;
-
-		for (j = 0; j < 8; j++) {
-			if ((crc ^ (c >> j)) & 1)
-				crc = (crc >> 1) ^ poly;
-			else
-				crc >>= 1;
-		}
-		table[i] = crc;
-	}
-}
-
-static void generate_crc64_table(uint64_t table[256], uint64_t poly)
-{
-	uint64_t i, j, c, crc;
-
-	for (i = 0; i < 256; i++) {
-		crc = 0;
-		c = i << 56;
-
-		for (j = 0; j < 8; j++) {
-			if ((crc ^ c) & 0x8000000000000000ULL)
-				crc = (crc << 1) ^ poly;
-			else
-				crc <<= 1;
-			c <<= 1;
-		}
-
-		table[i] = crc;
-	}
-}
-
-static void output_table(uint64_t table[256])
-{
-	int i;
-
-	for (i = 0; i < 256; i++) {
-		printf("\t0x%016" PRIx64 "ULL", table[i]);
-		if (i & 0x1)
-			printf(",\n");
-		else
-			printf(", ");
-	}
-	printf("};\n");
-}
-
-static void print_crc64_tables(void)
-{
-	printf("/* this file is generated - do not edit */\n\n");
-	printf("#include <linux/types.h>\n");
-	printf("#include <linux/cache.h>\n\n");
-	printf("static const u64 ____cacheline_aligned crc64table[256] = {\n");
-	output_table(crc64_table);
-
-	printf("\nstatic const u64 ____cacheline_aligned crc64nvmetable[256] = {\n");
-	output_table(crc64_nvme_table);
-}
-
-int main(int argc, char *argv[])
-{
-	generate_crc64_table(crc64_table, CRC64_ECMA182_POLY);
-	generate_reflected_crc64_table(crc64_nvme_table, CRC64_NVME_POLY);
-	print_crc64_tables();
-	return 0;
-}
diff --git a/lib/tests/Makefile b/lib/tests/Makefile
index 56d645014482..741d3ac2cba2 100644
--- a/lib/tests/Makefile
+++ b/lib/tests/Makefile
@@ -10,7 +10,6 @@ obj-$(CONFIG_BLACKHOLE_DEV_KUNIT_TEST) += blackhole_dev_kunit.o
 obj-$(CONFIG_CHECKSUM_KUNIT) += checksum_kunit.o
 obj-$(CONFIG_CMDLINE_KUNIT_TEST) += cmdline_kunit.o
 obj-$(CONFIG_CPUMASK_KUNIT_TEST) += cpumask_kunit.o
-obj-$(CONFIG_CRC_KUNIT_TEST) += crc_kunit.o
 CFLAGS_fortify_kunit.o += $(call cc-disable-warning, unsequenced)
 CFLAGS_fortify_kunit.o += $(call cc-disable-warning, stringop-overread)
 CFLAGS_fortify_kunit.o += $(call cc-disable-warning, stringop-truncation)
diff --git a/lib/tests/crc_kunit.c b/lib/tests/crc_kunit.c
deleted file mode 100644
index f08d985d8860..000000000000
--- a/lib/tests/crc_kunit.c
+++ /dev/null
@@ -1,452 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Unit tests and benchmarks for the CRC library functions
- *
- * Copyright 2024 Google LLC
- *
- * Author: Eric Biggers <ebiggers@google.com>
- */
-#include <kunit/test.h>
-#include <linux/crc7.h>
-#include <linux/crc16.h>
-#include <linux/crc-t10dif.h>
-#include <linux/crc32.h>
-#include <linux/crc32c.h>
-#include <linux/crc64.h>
-#include <linux/prandom.h>
-#include <linux/vmalloc.h>
-
-#define CRC_KUNIT_SEED			42
-#define CRC_KUNIT_MAX_LEN		16384
-#define CRC_KUNIT_NUM_TEST_ITERS	1000
-
-static struct rnd_state rng;
-static u8 *test_buffer;
-static size_t test_buflen;
-
-/**
- * struct crc_variant - describes a CRC variant
- * @bits: Number of bits in the CRC, 1 <= @bits <= 64.
- * @le: true if it's a "little endian" CRC (reversed mapping between bits and
- *	polynomial coefficients in each byte), false if it's a "big endian" CRC
- *	(natural mapping between bits and polynomial coefficients in each byte)
- * @poly: The generator polynomial with the highest-order term omitted.
- *	  Bit-reversed if @le is true.
- * @func: The function to compute a CRC.  The type signature uses u64 so that it
- *	  can fit any CRC up to CRC-64.  The CRC is passed in, and is expected
- *	  to be returned in, the least significant bits of the u64.  The
- *	  function is expected to *not* invert the CRC at the beginning and end.
- */
-struct crc_variant {
-	int bits;
-	bool le;
-	u64 poly;
-	u64 (*func)(u64 crc, const u8 *p, size_t len);
-};
-
-static u32 rand32(void)
-{
-	return prandom_u32_state(&rng);
-}
-
-static u64 rand64(void)
-{
-	u32 n = rand32();
-
-	return ((u64)n << 32) | rand32();
-}
-
-static u64 crc_mask(const struct crc_variant *v)
-{
-	return (u64)-1 >> (64 - v->bits);
-}
-
-/* Reference implementation of any CRC variant */
-static u64 crc_ref(const struct crc_variant *v,
-		   u64 crc, const u8 *p, size_t len)
-{
-	size_t i, j;
-
-	for (i = 0; i < len; i++) {
-		for (j = 0; j < 8; j++) {
-			if (v->le) {
-				crc ^= (p[i] >> j) & 1;
-				crc = (crc >> 1) ^ ((crc & 1) ? v->poly : 0);
-			} else {
-				crc ^= (u64)((p[i] >> (7 - j)) & 1) <<
-				       (v->bits - 1);
-				if (crc & (1ULL << (v->bits - 1)))
-					crc = ((crc << 1) ^ v->poly) &
-					      crc_mask(v);
-				else
-					crc <<= 1;
-			}
-		}
-	}
-	return crc;
-}
-
-static int crc_suite_init(struct kunit_suite *suite)
-{
-	/*
-	 * Allocate the test buffer using vmalloc() with a page-aligned length
-	 * so that it is immediately followed by a guard page.  This allows
-	 * buffer overreads to be detected, even in assembly code.
-	 */
-	test_buflen = round_up(CRC_KUNIT_MAX_LEN, PAGE_SIZE);
-	test_buffer = vmalloc(test_buflen);
-	if (!test_buffer)
-		return -ENOMEM;
-
-	prandom_seed_state(&rng, CRC_KUNIT_SEED);
-	prandom_bytes_state(&rng, test_buffer, test_buflen);
-	return 0;
-}
-
-static void crc_suite_exit(struct kunit_suite *suite)
-{
-	vfree(test_buffer);
-	test_buffer = NULL;
-}
-
-/* Generate a random initial CRC. */
-static u64 generate_random_initial_crc(const struct crc_variant *v)
-{
-	switch (rand32() % 4) {
-	case 0:
-		return 0;
-	case 1:
-		return crc_mask(v); /* All 1 bits */
-	default:
-		return rand64() & crc_mask(v);
-	}
-}
-
-/* Generate a random length, preferring small lengths. */
-static size_t generate_random_length(size_t max_length)
-{
-	size_t len;
-
-	switch (rand32() % 3) {
-	case 0:
-		len = rand32() % 128;
-		break;
-	case 1:
-		len = rand32() % 3072;
-		break;
-	default:
-		len = rand32();
-		break;
-	}
-	return len % (max_length + 1);
-}
-
-/* Test that v->func gives the same CRCs as a reference implementation. */
-static void crc_test(struct kunit *test, const struct crc_variant *v)
-{
-	size_t i;
-
-	for (i = 0; i < CRC_KUNIT_NUM_TEST_ITERS; i++) {
-		u64 init_crc, expected_crc, actual_crc;
-		size_t len, offset;
-		bool nosimd;
-
-		init_crc = generate_random_initial_crc(v);
-		len = generate_random_length(CRC_KUNIT_MAX_LEN);
-
-		/* Generate a random offset. */
-		if (rand32() % 2 == 0) {
-			/* Use a random alignment mod 64 */
-			offset = rand32() % 64;
-			offset = min(offset, CRC_KUNIT_MAX_LEN - len);
-		} else {
-			/* Go up to the guard page, to catch buffer overreads */
-			offset = test_buflen - len;
-		}
-
-		if (rand32() % 8 == 0)
-			/* Refresh the data occasionally. */
-			prandom_bytes_state(&rng, &test_buffer[offset], len);
-
-		nosimd = rand32() % 8 == 0;
-
-		/*
-		 * Compute the CRC, and verify that it equals the CRC computed
-		 * by a simple bit-at-a-time reference implementation.
-		 */
-		expected_crc = crc_ref(v, init_crc, &test_buffer[offset], len);
-		if (nosimd)
-			local_irq_disable();
-		actual_crc = v->func(init_crc, &test_buffer[offset], len);
-		if (nosimd)
-			local_irq_enable();
-		KUNIT_EXPECT_EQ_MSG(test, expected_crc, actual_crc,
-				    "Wrong result with len=%zu offset=%zu nosimd=%d",
-				    len, offset, nosimd);
-	}
-}
-
-static __always_inline void
-crc_benchmark(struct kunit *test,
-	      u64 (*crc_func)(u64 crc, const u8 *p, size_t len))
-{
-	static const size_t lens_to_test[] = {
-		1, 16, 64, 127, 128, 200, 256, 511, 512, 1024, 3173, 4096, 16384,
-	};
-	size_t len, i, j, num_iters;
-	/*
-	 * The CRC value that this function computes in a series of calls to
-	 * crc_func is never actually used, so use volatile to ensure that the
-	 * computations are done as intended and don't all get optimized out.
-	 */
-	volatile u64 crc = 0;
-	u64 t;
-
-	if (!IS_ENABLED(CONFIG_CRC_BENCHMARK))
-		kunit_skip(test, "not enabled");
-
-	/* warm-up */
-	for (i = 0; i < 10000000; i += CRC_KUNIT_MAX_LEN)
-		crc = crc_func(crc, test_buffer, CRC_KUNIT_MAX_LEN);
-
-	for (i = 0; i < ARRAY_SIZE(lens_to_test); i++) {
-		len = lens_to_test[i];
-		KUNIT_ASSERT_LE(test, len, CRC_KUNIT_MAX_LEN);
-		num_iters = 10000000 / (len + 128);
-		preempt_disable();
-		t = ktime_get_ns();
-		for (j = 0; j < num_iters; j++)
-			crc = crc_func(crc, test_buffer, len);
-		t = ktime_get_ns() - t;
-		preempt_enable();
-		kunit_info(test, "len=%zu: %llu MB/s\n",
-			   len, div64_u64((u64)len * num_iters * 1000, t));
-	}
-}
-
-/* crc7_be */
-
-static u64 crc7_be_wrapper(u64 crc, const u8 *p, size_t len)
-{
-	/*
-	 * crc7_be() left-aligns the 7-bit CRC in a u8, whereas the test wants a
-	 * right-aligned CRC (in a u64).  Convert between the conventions.
-	 */
-	return crc7_be(crc << 1, p, len) >> 1;
-}
-
-static const struct crc_variant crc_variant_crc7_be = {
-	.bits = 7,
-	.poly = 0x9,
-	.func = crc7_be_wrapper,
-};
-
-static void crc7_be_test(struct kunit *test)
-{
-	crc_test(test, &crc_variant_crc7_be);
-}
-
-static void crc7_be_benchmark(struct kunit *test)
-{
-	crc_benchmark(test, crc7_be_wrapper);
-}
-
-/* crc16 */
-
-static u64 crc16_wrapper(u64 crc, const u8 *p, size_t len)
-{
-	return crc16(crc, p, len);
-}
-
-static const struct crc_variant crc_variant_crc16 = {
-	.bits = 16,
-	.le = true,
-	.poly = 0xa001,
-	.func = crc16_wrapper,
-};
-
-static void crc16_test(struct kunit *test)
-{
-	crc_test(test, &crc_variant_crc16);
-}
-
-static void crc16_benchmark(struct kunit *test)
-{
-	crc_benchmark(test, crc16_wrapper);
-}
-
-/* crc_t10dif */
-
-static u64 crc_t10dif_wrapper(u64 crc, const u8 *p, size_t len)
-{
-	return crc_t10dif_update(crc, p, len);
-}
-
-static const struct crc_variant crc_variant_crc_t10dif = {
-	.bits = 16,
-	.le = false,
-	.poly = 0x8bb7,
-	.func = crc_t10dif_wrapper,
-};
-
-static void crc_t10dif_test(struct kunit *test)
-{
-	crc_test(test, &crc_variant_crc_t10dif);
-}
-
-static void crc_t10dif_benchmark(struct kunit *test)
-{
-	crc_benchmark(test, crc_t10dif_wrapper);
-}
-
-/* crc32_le */
-
-static u64 crc32_le_wrapper(u64 crc, const u8 *p, size_t len)
-{
-	return crc32_le(crc, p, len);
-}
-
-static const struct crc_variant crc_variant_crc32_le = {
-	.bits = 32,
-	.le = true,
-	.poly = 0xedb88320,
-	.func = crc32_le_wrapper,
-};
-
-static void crc32_le_test(struct kunit *test)
-{
-	crc_test(test, &crc_variant_crc32_le);
-}
-
-static void crc32_le_benchmark(struct kunit *test)
-{
-	crc_benchmark(test, crc32_le_wrapper);
-}
-
-/* crc32_be */
-
-static u64 crc32_be_wrapper(u64 crc, const u8 *p, size_t len)
-{
-	return crc32_be(crc, p, len);
-}
-
-static const struct crc_variant crc_variant_crc32_be = {
-	.bits = 32,
-	.le = false,
-	.poly = 0x04c11db7,
-	.func = crc32_be_wrapper,
-};
-
-static void crc32_be_test(struct kunit *test)
-{
-	crc_test(test, &crc_variant_crc32_be);
-}
-
-static void crc32_be_benchmark(struct kunit *test)
-{
-	crc_benchmark(test, crc32_be_wrapper);
-}
-
-/* crc32c */
-
-static u64 crc32c_wrapper(u64 crc, const u8 *p, size_t len)
-{
-	return crc32c(crc, p, len);
-}
-
-static const struct crc_variant crc_variant_crc32c = {
-	.bits = 32,
-	.le = true,
-	.poly = 0x82f63b78,
-	.func = crc32c_wrapper,
-};
-
-static void crc32c_test(struct kunit *test)
-{
-	crc_test(test, &crc_variant_crc32c);
-}
-
-static void crc32c_benchmark(struct kunit *test)
-{
-	crc_benchmark(test, crc32c_wrapper);
-}
-
-/* crc64_be */
-
-static u64 crc64_be_wrapper(u64 crc, const u8 *p, size_t len)
-{
-	return crc64_be(crc, p, len);
-}
-
-static const struct crc_variant crc_variant_crc64_be = {
-	.bits = 64,
-	.le = false,
-	.poly = 0x42f0e1eba9ea3693,
-	.func = crc64_be_wrapper,
-};
-
-static void crc64_be_test(struct kunit *test)
-{
-	crc_test(test, &crc_variant_crc64_be);
-}
-
-static void crc64_be_benchmark(struct kunit *test)
-{
-	crc_benchmark(test, crc64_be_wrapper);
-}
-
-/* crc64_nvme */
-
-static u64 crc64_nvme_wrapper(u64 crc, const u8 *p, size_t len)
-{
-	/* The inversions that crc64_nvme() does have to be undone here. */
-	return ~crc64_nvme(~crc, p, len);
-}
-
-static const struct crc_variant crc_variant_crc64_nvme = {
-	.bits = 64,
-	.le = true,
-	.poly = 0x9a6c9329ac4bc9b5,
-	.func = crc64_nvme_wrapper,
-};
-
-static void crc64_nvme_test(struct kunit *test)
-{
-	crc_test(test, &crc_variant_crc64_nvme);
-}
-
-static void crc64_nvme_benchmark(struct kunit *test)
-{
-	crc_benchmark(test, crc64_nvme_wrapper);
-}
-
-static struct kunit_case crc_test_cases[] = {
-	KUNIT_CASE(crc7_be_test),
-	KUNIT_CASE(crc7_be_benchmark),
-	KUNIT_CASE(crc16_test),
-	KUNIT_CASE(crc16_benchmark),
-	KUNIT_CASE(crc_t10dif_test),
-	KUNIT_CASE(crc_t10dif_benchmark),
-	KUNIT_CASE(crc32_le_test),
-	KUNIT_CASE(crc32_le_benchmark),
-	KUNIT_CASE(crc32_be_test),
-	KUNIT_CASE(crc32_be_benchmark),
-	KUNIT_CASE(crc32c_test),
-	KUNIT_CASE(crc32c_benchmark),
-	KUNIT_CASE(crc64_be_test),
-	KUNIT_CASE(crc64_be_benchmark),
-	KUNIT_CASE(crc64_nvme_test),
-	KUNIT_CASE(crc64_nvme_benchmark),
-	{},
-};
-
-static struct kunit_suite crc_test_suite = {
-	.name = "crc",
-	.test_cases = crc_test_cases,
-	.suite_init = crc_suite_init,
-	.suite_exit = crc_suite_exit,
-};
-kunit_test_suite(crc_test_suite);
-
-MODULE_DESCRIPTION("Unit tests and benchmarks for the CRC library functions");
-MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 0bcfca56406dc6342e30fafe41a2f34cdde029b4 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Sat, 7 Jun 2025 13:04:44 -0700
Subject: lib/crc: Prepare for arch-optimized code in subdirs of lib/crc/

Rework how lib/crc/ supports arch-optimized code.  First, instead of the
arch-optimized CRC code being in arch/$(SRCARCH)/lib/, it will now be in
lib/crc/$(SRCARCH)/.  Second, the API functions (e.g. crc32c()),
arch-optimized functions (e.g. crc32c_arch()), and generic functions
(e.g. crc32c_base()) will now be part of a single module for each CRC
type, allowing better inlining and dead code elimination.  The second
change is made possible by the first.

As an example, consider CONFIG_CRC32=m on x86.  We'll now have just
crc32.ko instead of both crc32-x86.ko and crc32.ko.  The two modules
were already coupled together and always both got loaded together via
direct symbol dependency, so the separation provided no benefit.

Note: later I'd like to apply the same design to lib/crypto/ too, where
often the API functions are out-of-line so this will work even better.
In those cases, for each algorithm we currently have 3 modules all
coupled together, e.g. libsha256.ko, libsha256-generic.ko, and
sha256-x86.ko.  We should have just one, inline things properly, and
rely on the compiler's dead code elimination to decide the inclusion of
the generic code instead of manually setting it via kconfig.

Having arch-specific code outside arch/ was somewhat controversial when
Zinc proposed it back in 2018.  But I don't think the concerns are
warranted.  It's better from a technical perspective, as it enables the
improvements mentioned above.  This model is already successfully used
in other places in the kernel such as lib/raid6/.  The community of each
architecture still remains free to work on the code, even if it's not in
arch/.  At the time there was also a desire to put the library code in
the same files as the old-school crypto API, but that was a mistake; now
that the library is separate, that's no longer a constraint either.

Reviewed-by: "Martin K. Petersen" <martin.petersen@oracle.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Acked-by: "Jason A. Donenfeld" <Jason@zx2c4.com>
Link: https://lore.kernel.org/r/20250607200454.73587-3-ebiggers@kernel.org
Link: https://lore.kernel.org/r/20250612054514.142728-1-ebiggers@kernel.org
Link: https://lore.kernel.org/r/20250621012221.4351-1-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 Documentation/core-api/kernel-api.rst |   2 +-
 MAINTAINERS                           |   1 -
 include/linux/crc-t10dif.h            |  10 +---
 include/linux/crc32.h                 |  30 +---------
 include/linux/crc64.h                 |  19 +------
 lib/crc/Kconfig                       |  13 +++--
 lib/crc/Makefile                      |  20 ++++++-
 lib/crc/crc-t10dif-main.c             |  88 ++++++++++++++++++++++++++++
 lib/crc/crc-t10dif.c                  |  65 ---------------------
 lib/crc/crc32-main.c                  | 104 ++++++++++++++++++++++++++++++++++
 lib/crc/crc32.c                       |  59 -------------------
 lib/crc/crc64-main.c                  |  91 +++++++++++++++++++++++++++++
 lib/crc/crc64.c                       |  58 -------------------
 13 files changed, 315 insertions(+), 245 deletions(-)
 create mode 100644 lib/crc/crc-t10dif-main.c
 delete mode 100644 lib/crc/crc-t10dif.c
 create mode 100644 lib/crc/crc32-main.c
 delete mode 100644 lib/crc/crc32.c
 create mode 100644 lib/crc/crc64-main.c
 delete mode 100644 lib/crc/crc64.c

(limited to 'include')

diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst
index c4642d9f13a9..9c8370891a39 100644
--- a/Documentation/core-api/kernel-api.rst
+++ b/Documentation/core-api/kernel-api.rst
@@ -148,7 +148,7 @@ CRC Functions
 .. kernel-doc:: lib/crc/crc16.c
    :export:
 
-.. kernel-doc:: lib/crc/crc32.c
+.. kernel-doc:: lib/crc/crc32-main.c
 
 .. kernel-doc:: lib/crc/crc-ccitt.c
    :export:
diff --git a/MAINTAINERS b/MAINTAINERS
index b0ec17dfbe03..d75676e17ac7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6359,7 +6359,6 @@ L:	linux-crypto@vger.kernel.org
 S:	Maintained
 T:	git https://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux.git crc-next
 F:	Documentation/staging/crc*
-F:	arch/*/lib/crc*
 F:	include/linux/crc*
 F:	lib/crc/
 F:	scripts/gen-crc-consts.py
diff --git a/include/linux/crc-t10dif.h b/include/linux/crc-t10dif.h
index a559fdff3f7e..ecc8bc2dd7f4 100644
--- a/include/linux/crc-t10dif.h
+++ b/include/linux/crc-t10dif.h
@@ -4,15 +4,7 @@
 
 #include <linux/types.h>
 
-u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len);
-u16 crc_t10dif_generic(u16 crc, const u8 *p, size_t len);
-
-static inline u16 crc_t10dif_update(u16 crc, const u8 *p, size_t len)
-{
-	if (IS_ENABLED(CONFIG_CRC_T10DIF_ARCH))
-		return crc_t10dif_arch(crc, p, len);
-	return crc_t10dif_generic(crc, p, len);
-}
+u16 crc_t10dif_update(u16 crc, const u8 *p, size_t len);
 
 static inline u16 crc_t10dif(const u8 *p, size_t len)
 {
diff --git a/include/linux/crc32.h b/include/linux/crc32.h
index 36bbc0405aa0..22dbe7144eb4 100644
--- a/include/linux/crc32.h
+++ b/include/linux/crc32.h
@@ -5,33 +5,9 @@
 #include <linux/types.h>
 #include <linux/bitrev.h>
 
-u32 crc32_le_arch(u32 crc, const u8 *p, size_t len);
-u32 crc32_le_base(u32 crc, const u8 *p, size_t len);
-u32 crc32_be_arch(u32 crc, const u8 *p, size_t len);
-u32 crc32_be_base(u32 crc, const u8 *p, size_t len);
-u32 crc32c_arch(u32 crc, const u8 *p, size_t len);
-u32 crc32c_base(u32 crc, const u8 *p, size_t len);
-
-static inline u32 crc32_le(u32 crc, const void *p, size_t len)
-{
-	if (IS_ENABLED(CONFIG_CRC32_ARCH))
-		return crc32_le_arch(crc, p, len);
-	return crc32_le_base(crc, p, len);
-}
-
-static inline u32 crc32_be(u32 crc, const void *p, size_t len)
-{
-	if (IS_ENABLED(CONFIG_CRC32_ARCH))
-		return crc32_be_arch(crc, p, len);
-	return crc32_be_base(crc, p, len);
-}
-
-static inline u32 crc32c(u32 crc, const void *p, size_t len)
-{
-	if (IS_ENABLED(CONFIG_CRC32_ARCH))
-		return crc32c_arch(crc, p, len);
-	return crc32c_base(crc, p, len);
-}
+u32 crc32_le(u32 crc, const void *p, size_t len);
+u32 crc32_be(u32 crc, const void *p, size_t len);
+u32 crc32c(u32 crc, const void *p, size_t len);
 
 /*
  * crc32_optimizations() returns flags that indicate which CRC32 library
diff --git a/include/linux/crc64.h b/include/linux/crc64.h
index b6aa290a7931..fc0c06ab1993 100644
--- a/include/linux/crc64.h
+++ b/include/linux/crc64.h
@@ -4,11 +4,6 @@
 
 #include <linux/types.h>
 
-u64 crc64_be_arch(u64 crc, const u8 *p, size_t len);
-u64 crc64_be_generic(u64 crc, const u8 *p, size_t len);
-u64 crc64_nvme_arch(u64 crc, const u8 *p, size_t len);
-u64 crc64_nvme_generic(u64 crc, const u8 *p, size_t len);
-
 /**
  * crc64_be - Calculate bitwise big-endian ECMA-182 CRC64
  * @crc: seed value for computation. 0 or (u64)~0 for a new CRC calculation,
@@ -16,12 +11,7 @@ u64 crc64_nvme_generic(u64 crc, const u8 *p, size_t len);
  * @p: pointer to buffer over which CRC64 is run
  * @len: length of buffer @p
  */
-static inline u64 crc64_be(u64 crc, const void *p, size_t len)
-{
-	if (IS_ENABLED(CONFIG_CRC64_ARCH))
-		return crc64_be_arch(crc, p, len);
-	return crc64_be_generic(crc, p, len);
-}
+u64 crc64_be(u64 crc, const void *p, size_t len);
 
 /**
  * crc64_nvme - Calculate CRC64-NVME
@@ -33,11 +23,6 @@ static inline u64 crc64_be(u64 crc, const void *p, size_t len)
  * This computes the CRC64 defined in the NVME NVM Command Set Specification,
  * *including the bitwise inversion at the beginning and end*.
  */
-static inline u64 crc64_nvme(u64 crc, const void *p, size_t len)
-{
-	if (IS_ENABLED(CONFIG_CRC64_ARCH))
-		return ~crc64_nvme_arch(~crc, p, len);
-	return ~crc64_nvme_generic(~crc, p, len);
-}
+u64 crc64_nvme(u64 crc, const void *p, size_t len);
 
 #endif /* _LINUX_CRC64_H */
diff --git a/lib/crc/Kconfig b/lib/crc/Kconfig
index e0e7168b74c7..04c7aeb0fc8e 100644
--- a/lib/crc/Kconfig
+++ b/lib/crc/Kconfig
@@ -48,8 +48,8 @@ config ARCH_HAS_CRC_T10DIF
 	bool
 
 config CRC_T10DIF_ARCH
-	tristate
-	default CRC_T10DIF if ARCH_HAS_CRC_T10DIF && CRC_OPTIMIZATIONS
+	bool
+	depends on CRC_T10DIF && CRC_OPTIMIZATIONS
 
 config CRC32
 	tristate
@@ -62,8 +62,8 @@ config ARCH_HAS_CRC32
 	bool
 
 config CRC32_ARCH
-	tristate
-	default CRC32 if ARCH_HAS_CRC32 && CRC_OPTIMIZATIONS
+	bool
+	depends on CRC32 && CRC_OPTIMIZATIONS
 
 config CRC64
 	tristate
@@ -75,11 +75,12 @@ config ARCH_HAS_CRC64
 	bool
 
 config CRC64_ARCH
-	tristate
-	default CRC64 if ARCH_HAS_CRC64 && CRC_OPTIMIZATIONS
+	bool
+	depends on CRC64 && CRC_OPTIMIZATIONS
 
 config CRC_OPTIMIZATIONS
 	bool "Enable optimized CRC implementations" if EXPERT
+	depends on !UML
 	default y
 	help
 	  Disabling this option reduces code size slightly by disabling the
diff --git a/lib/crc/Makefile b/lib/crc/Makefile
index ff4c30dda452..926edc3b035f 100644
--- a/lib/crc/Makefile
+++ b/lib/crc/Makefile
@@ -8,16 +8,32 @@ obj-$(CONFIG_CRC8) += crc8.o
 obj-$(CONFIG_CRC16) += crc16.o
 obj-$(CONFIG_CRC_CCITT) += crc-ccitt.o
 obj-$(CONFIG_CRC_ITU_T) += crc-itu-t.o
+
 obj-$(CONFIG_CRC_T10DIF) += crc-t10dif.o
+crc-t10dif-y := crc-t10dif-main.o
+ifeq ($(CONFIG_CRC_T10DIF_ARCH),y)
+CFLAGS_crc-t10dif-main.o += -I$(src)/$(SRCARCH)
+endif
+
 obj-$(CONFIG_CRC32) += crc32.o
+crc32-y := crc32-main.o
+ifeq ($(CONFIG_CRC32_ARCH),y)
+CFLAGS_crc32-main.o += -I$(src)/$(SRCARCH)
+endif
+
 obj-$(CONFIG_CRC64) += crc64.o
+crc64-y := crc64-main.o
+ifeq ($(CONFIG_CRC64_ARCH),y)
+CFLAGS_crc64-main.o += -I$(src)/$(SRCARCH)
+endif
+
 obj-y += tests/
 
 hostprogs := gen_crc32table gen_crc64table
 clean-files := crc32table.h crc64table.h
 
-$(obj)/crc32.o: $(obj)/crc32table.h
-$(obj)/crc64.o: $(obj)/crc64table.h
+$(obj)/crc32-main.o: $(obj)/crc32table.h
+$(obj)/crc64-main.o: $(obj)/crc64table.h
 
 quiet_cmd_crc32 = GEN     $@
       cmd_crc32 = $< > $@
diff --git a/lib/crc/crc-t10dif-main.c b/lib/crc/crc-t10dif-main.c
new file mode 100644
index 000000000000..bc91f3888184
--- /dev/null
+++ b/lib/crc/crc-t10dif-main.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * T10 Data Integrity Field CRC16 calculation
+ *
+ * Copyright (c) 2007 Oracle Corporation.  All rights reserved.
+ * Written by Martin K. Petersen <martin.petersen@oracle.com>
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/crc-t10dif.h>
+
+/*
+ * Table generated using the following polynomial:
+ * x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1
+ * gt: 0x8bb7
+ */
+static const u16 t10_dif_crc_table[256] = {
+	0x0000, 0x8BB7, 0x9CD9, 0x176E, 0xB205, 0x39B2, 0x2EDC, 0xA56B,
+	0xEFBD, 0x640A, 0x7364, 0xF8D3, 0x5DB8, 0xD60F, 0xC161, 0x4AD6,
+	0x54CD, 0xDF7A, 0xC814, 0x43A3, 0xE6C8, 0x6D7F, 0x7A11, 0xF1A6,
+	0xBB70, 0x30C7, 0x27A9, 0xAC1E, 0x0975, 0x82C2, 0x95AC, 0x1E1B,
+	0xA99A, 0x222D, 0x3543, 0xBEF4, 0x1B9F, 0x9028, 0x8746, 0x0CF1,
+	0x4627, 0xCD90, 0xDAFE, 0x5149, 0xF422, 0x7F95, 0x68FB, 0xE34C,
+	0xFD57, 0x76E0, 0x618E, 0xEA39, 0x4F52, 0xC4E5, 0xD38B, 0x583C,
+	0x12EA, 0x995D, 0x8E33, 0x0584, 0xA0EF, 0x2B58, 0x3C36, 0xB781,
+	0xD883, 0x5334, 0x445A, 0xCFED, 0x6A86, 0xE131, 0xF65F, 0x7DE8,
+	0x373E, 0xBC89, 0xABE7, 0x2050, 0x853B, 0x0E8C, 0x19E2, 0x9255,
+	0x8C4E, 0x07F9, 0x1097, 0x9B20, 0x3E4B, 0xB5FC, 0xA292, 0x2925,
+	0x63F3, 0xE844, 0xFF2A, 0x749D, 0xD1F6, 0x5A41, 0x4D2F, 0xC698,
+	0x7119, 0xFAAE, 0xEDC0, 0x6677, 0xC31C, 0x48AB, 0x5FC5, 0xD472,
+	0x9EA4, 0x1513, 0x027D, 0x89CA, 0x2CA1, 0xA716, 0xB078, 0x3BCF,
+	0x25D4, 0xAE63, 0xB90D, 0x32BA, 0x97D1, 0x1C66, 0x0B08, 0x80BF,
+	0xCA69, 0x41DE, 0x56B0, 0xDD07, 0x786C, 0xF3DB, 0xE4B5, 0x6F02,
+	0x3AB1, 0xB106, 0xA668, 0x2DDF, 0x88B4, 0x0303, 0x146D, 0x9FDA,
+	0xD50C, 0x5EBB, 0x49D5, 0xC262, 0x6709, 0xECBE, 0xFBD0, 0x7067,
+	0x6E7C, 0xE5CB, 0xF2A5, 0x7912, 0xDC79, 0x57CE, 0x40A0, 0xCB17,
+	0x81C1, 0x0A76, 0x1D18, 0x96AF, 0x33C4, 0xB873, 0xAF1D, 0x24AA,
+	0x932B, 0x189C, 0x0FF2, 0x8445, 0x212E, 0xAA99, 0xBDF7, 0x3640,
+	0x7C96, 0xF721, 0xE04F, 0x6BF8, 0xCE93, 0x4524, 0x524A, 0xD9FD,
+	0xC7E6, 0x4C51, 0x5B3F, 0xD088, 0x75E3, 0xFE54, 0xE93A, 0x628D,
+	0x285B, 0xA3EC, 0xB482, 0x3F35, 0x9A5E, 0x11E9, 0x0687, 0x8D30,
+	0xE232, 0x6985, 0x7EEB, 0xF55C, 0x5037, 0xDB80, 0xCCEE, 0x4759,
+	0x0D8F, 0x8638, 0x9156, 0x1AE1, 0xBF8A, 0x343D, 0x2353, 0xA8E4,
+	0xB6FF, 0x3D48, 0x2A26, 0xA191, 0x04FA, 0x8F4D, 0x9823, 0x1394,
+	0x5942, 0xD2F5, 0xC59B, 0x4E2C, 0xEB47, 0x60F0, 0x779E, 0xFC29,
+	0x4BA8, 0xC01F, 0xD771, 0x5CC6, 0xF9AD, 0x721A, 0x6574, 0xEEC3,
+	0xA415, 0x2FA2, 0x38CC, 0xB37B, 0x1610, 0x9DA7, 0x8AC9, 0x017E,
+	0x1F65, 0x94D2, 0x83BC, 0x080B, 0xAD60, 0x26D7, 0x31B9, 0xBA0E,
+	0xF0D8, 0x7B6F, 0x6C01, 0xE7B6, 0x42DD, 0xC96A, 0xDE04, 0x55B3
+};
+
+static inline u16 __maybe_unused
+crc_t10dif_generic(u16 crc, const u8 *p, size_t len)
+{
+	while (len--)
+		crc = (crc << 8) ^ t10_dif_crc_table[(crc >> 8) ^ *p++];
+	return crc;
+}
+
+#ifdef CONFIG_CRC_T10DIF_ARCH
+#include "crc-t10dif.h" /* $(SRCARCH)/crc-t10dif.h */
+#else
+#define crc_t10dif_arch crc_t10dif_generic
+#endif
+
+u16 crc_t10dif_update(u16 crc, const u8 *p, size_t len)
+{
+	return crc_t10dif_arch(crc, p, len);
+}
+EXPORT_SYMBOL(crc_t10dif_update);
+
+#ifdef crc_t10dif_mod_init_arch
+static int __init crc_t10dif_mod_init(void)
+{
+	crc_t10dif_mod_init_arch();
+	return 0;
+}
+subsys_initcall(crc_t10dif_mod_init);
+
+static void __exit crc_t10dif_mod_exit(void)
+{
+}
+module_exit(crc_t10dif_mod_exit);
+#endif
+
+MODULE_DESCRIPTION("CRC-T10DIF library functions");
+MODULE_LICENSE("GPL");
diff --git a/lib/crc/crc-t10dif.c b/lib/crc/crc-t10dif.c
deleted file mode 100644
index 311c2ab829f1..000000000000
--- a/lib/crc/crc-t10dif.c
+++ /dev/null
@@ -1,65 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * T10 Data Integrity Field CRC16 calculation
- *
- * Copyright (c) 2007 Oracle Corporation.  All rights reserved.
- * Written by Martin K. Petersen <martin.petersen@oracle.com>
- */
-
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/crc-t10dif.h>
-
-/*
- * Table generated using the following polynomial:
- * x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1
- * gt: 0x8bb7
- */
-static const u16 t10_dif_crc_table[256] = {
-	0x0000, 0x8BB7, 0x9CD9, 0x176E, 0xB205, 0x39B2, 0x2EDC, 0xA56B,
-	0xEFBD, 0x640A, 0x7364, 0xF8D3, 0x5DB8, 0xD60F, 0xC161, 0x4AD6,
-	0x54CD, 0xDF7A, 0xC814, 0x43A3, 0xE6C8, 0x6D7F, 0x7A11, 0xF1A6,
-	0xBB70, 0x30C7, 0x27A9, 0xAC1E, 0x0975, 0x82C2, 0x95AC, 0x1E1B,
-	0xA99A, 0x222D, 0x3543, 0xBEF4, 0x1B9F, 0x9028, 0x8746, 0x0CF1,
-	0x4627, 0xCD90, 0xDAFE, 0x5149, 0xF422, 0x7F95, 0x68FB, 0xE34C,
-	0xFD57, 0x76E0, 0x618E, 0xEA39, 0x4F52, 0xC4E5, 0xD38B, 0x583C,
-	0x12EA, 0x995D, 0x8E33, 0x0584, 0xA0EF, 0x2B58, 0x3C36, 0xB781,
-	0xD883, 0x5334, 0x445A, 0xCFED, 0x6A86, 0xE131, 0xF65F, 0x7DE8,
-	0x373E, 0xBC89, 0xABE7, 0x2050, 0x853B, 0x0E8C, 0x19E2, 0x9255,
-	0x8C4E, 0x07F9, 0x1097, 0x9B20, 0x3E4B, 0xB5FC, 0xA292, 0x2925,
-	0x63F3, 0xE844, 0xFF2A, 0x749D, 0xD1F6, 0x5A41, 0x4D2F, 0xC698,
-	0x7119, 0xFAAE, 0xEDC0, 0x6677, 0xC31C, 0x48AB, 0x5FC5, 0xD472,
-	0x9EA4, 0x1513, 0x027D, 0x89CA, 0x2CA1, 0xA716, 0xB078, 0x3BCF,
-	0x25D4, 0xAE63, 0xB90D, 0x32BA, 0x97D1, 0x1C66, 0x0B08, 0x80BF,
-	0xCA69, 0x41DE, 0x56B0, 0xDD07, 0x786C, 0xF3DB, 0xE4B5, 0x6F02,
-	0x3AB1, 0xB106, 0xA668, 0x2DDF, 0x88B4, 0x0303, 0x146D, 0x9FDA,
-	0xD50C, 0x5EBB, 0x49D5, 0xC262, 0x6709, 0xECBE, 0xFBD0, 0x7067,
-	0x6E7C, 0xE5CB, 0xF2A5, 0x7912, 0xDC79, 0x57CE, 0x40A0, 0xCB17,
-	0x81C1, 0x0A76, 0x1D18, 0x96AF, 0x33C4, 0xB873, 0xAF1D, 0x24AA,
-	0x932B, 0x189C, 0x0FF2, 0x8445, 0x212E, 0xAA99, 0xBDF7, 0x3640,
-	0x7C96, 0xF721, 0xE04F, 0x6BF8, 0xCE93, 0x4524, 0x524A, 0xD9FD,
-	0xC7E6, 0x4C51, 0x5B3F, 0xD088, 0x75E3, 0xFE54, 0xE93A, 0x628D,
-	0x285B, 0xA3EC, 0xB482, 0x3F35, 0x9A5E, 0x11E9, 0x0687, 0x8D30,
-	0xE232, 0x6985, 0x7EEB, 0xF55C, 0x5037, 0xDB80, 0xCCEE, 0x4759,
-	0x0D8F, 0x8638, 0x9156, 0x1AE1, 0xBF8A, 0x343D, 0x2353, 0xA8E4,
-	0xB6FF, 0x3D48, 0x2A26, 0xA191, 0x04FA, 0x8F4D, 0x9823, 0x1394,
-	0x5942, 0xD2F5, 0xC59B, 0x4E2C, 0xEB47, 0x60F0, 0x779E, 0xFC29,
-	0x4BA8, 0xC01F, 0xD771, 0x5CC6, 0xF9AD, 0x721A, 0x6574, 0xEEC3,
-	0xA415, 0x2FA2, 0x38CC, 0xB37B, 0x1610, 0x9DA7, 0x8AC9, 0x017E,
-	0x1F65, 0x94D2, 0x83BC, 0x080B, 0xAD60, 0x26D7, 0x31B9, 0xBA0E,
-	0xF0D8, 0x7B6F, 0x6C01, 0xE7B6, 0x42DD, 0xC96A, 0xDE04, 0x55B3
-};
-
-u16 crc_t10dif_generic(u16 crc, const u8 *p, size_t len)
-{
-	size_t i;
-
-	for (i = 0; i < len; i++)
-		crc = (crc << 8) ^ t10_dif_crc_table[(crc >> 8) ^ p[i]];
-
-	return crc;
-}
-EXPORT_SYMBOL(crc_t10dif_generic);
-
-MODULE_DESCRIPTION("T10 DIF CRC calculation");
-MODULE_LICENSE("GPL");
diff --git a/lib/crc/crc32-main.c b/lib/crc/crc32-main.c
new file mode 100644
index 000000000000..7843512ffef4
--- /dev/null
+++ b/lib/crc/crc32-main.c
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Aug 8, 2011 Bob Pearson with help from Joakim Tjernlund and George Spelvin
+ * cleaned up code to current version of sparse and added the slicing-by-8
+ * algorithm to the closely similar existing slicing-by-4 algorithm.
+ *
+ * Oct 15, 2000 Matt Domsch <Matt_Domsch@dell.com>
+ * Nicer crc32 functions/docs submitted by linux@horizon.com.  Thanks!
+ * Code was from the public domain, copyright abandoned.  Code was
+ * subsequently included in the kernel, thus was re-licensed under the
+ * GNU GPL v2.
+ *
+ * Oct 12, 2000 Matt Domsch <Matt_Domsch@dell.com>
+ * Same crc32 function was used in 5 other places in the kernel.
+ * I made one version, and deleted the others.
+ * There are various incantations of crc32().  Some use a seed of 0 or ~0.
+ * Some xor at the end with ~0.  The generic crc32() function takes
+ * seed as an argument, and doesn't xor at the end.  Then individual
+ * users can do whatever they need.
+ *   drivers/net/smc9194.c uses seed ~0, doesn't xor with ~0.
+ *   fs/jffs2 uses seed 0, doesn't xor with ~0.
+ *   fs/partitions/efi.c uses seed ~0, xor's with ~0.
+ */
+
+/* see: Documentation/staging/crc32.rst for a description of algorithms */
+
+#include <linux/crc32.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+#include "crc32table.h"
+
+static inline u32 __maybe_unused
+crc32_le_base(u32 crc, const u8 *p, size_t len)
+{
+	while (len--)
+		crc = (crc >> 8) ^ crc32table_le[(crc & 255) ^ *p++];
+	return crc;
+}
+
+static inline u32 __maybe_unused
+crc32_be_base(u32 crc, const u8 *p, size_t len)
+{
+	while (len--)
+		crc = (crc << 8) ^ crc32table_be[(crc >> 24) ^ *p++];
+	return crc;
+}
+
+static inline u32 __maybe_unused
+crc32c_base(u32 crc, const u8 *p, size_t len)
+{
+	while (len--)
+		crc = (crc >> 8) ^ crc32ctable_le[(crc & 255) ^ *p++];
+	return crc;
+}
+
+#ifdef CONFIG_CRC32_ARCH
+#include "crc32.h" /* $(SRCARCH)/crc32.h */
+
+u32 crc32_optimizations(void)
+{
+	return crc32_optimizations_arch();
+}
+EXPORT_SYMBOL(crc32_optimizations);
+#else
+#define crc32_le_arch crc32_le_base
+#define crc32_be_arch crc32_be_base
+#define crc32c_arch crc32c_base
+#endif
+
+u32 crc32_le(u32 crc, const void *p, size_t len)
+{
+	return crc32_le_arch(crc, p, len);
+}
+EXPORT_SYMBOL(crc32_le);
+
+u32 crc32_be(u32 crc, const void *p, size_t len)
+{
+	return crc32_be_arch(crc, p, len);
+}
+EXPORT_SYMBOL(crc32_be);
+
+u32 crc32c(u32 crc, const void *p, size_t len)
+{
+	return crc32c_arch(crc, p, len);
+}
+EXPORT_SYMBOL(crc32c);
+
+#ifdef crc32_mod_init_arch
+static int __init crc32_mod_init(void)
+{
+	crc32_mod_init_arch();
+	return 0;
+}
+subsys_initcall(crc32_mod_init);
+
+static void __exit crc32_mod_exit(void)
+{
+}
+module_exit(crc32_mod_exit);
+#endif
+
+MODULE_DESCRIPTION("CRC32 library functions");
+MODULE_LICENSE("GPL");
diff --git a/lib/crc/crc32.c b/lib/crc/crc32.c
deleted file mode 100644
index 6811b37df2aa..000000000000
--- a/lib/crc/crc32.c
+++ /dev/null
@@ -1,59 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Aug 8, 2011 Bob Pearson with help from Joakim Tjernlund and George Spelvin
- * cleaned up code to current version of sparse and added the slicing-by-8
- * algorithm to the closely similar existing slicing-by-4 algorithm.
- *
- * Oct 15, 2000 Matt Domsch <Matt_Domsch@dell.com>
- * Nicer crc32 functions/docs submitted by linux@horizon.com.  Thanks!
- * Code was from the public domain, copyright abandoned.  Code was
- * subsequently included in the kernel, thus was re-licensed under the
- * GNU GPL v2.
- *
- * Oct 12, 2000 Matt Domsch <Matt_Domsch@dell.com>
- * Same crc32 function was used in 5 other places in the kernel.
- * I made one version, and deleted the others.
- * There are various incantations of crc32().  Some use a seed of 0 or ~0.
- * Some xor at the end with ~0.  The generic crc32() function takes
- * seed as an argument, and doesn't xor at the end.  Then individual
- * users can do whatever they need.
- *   drivers/net/smc9194.c uses seed ~0, doesn't xor with ~0.
- *   fs/jffs2 uses seed 0, doesn't xor with ~0.
- *   fs/partitions/efi.c uses seed ~0, xor's with ~0.
- */
-
-/* see: Documentation/staging/crc32.rst for a description of algorithms */
-
-#include <linux/crc32.h>
-#include <linux/module.h>
-#include <linux/types.h>
-
-#include "crc32table.h"
-
-MODULE_AUTHOR("Matt Domsch <Matt_Domsch@dell.com>");
-MODULE_DESCRIPTION("Various CRC32 calculations");
-MODULE_LICENSE("GPL");
-
-u32 crc32_le_base(u32 crc, const u8 *p, size_t len)
-{
-	while (len--)
-		crc = (crc >> 8) ^ crc32table_le[(crc & 255) ^ *p++];
-	return crc;
-}
-EXPORT_SYMBOL(crc32_le_base);
-
-u32 crc32c_base(u32 crc, const u8 *p, size_t len)
-{
-	while (len--)
-		crc = (crc >> 8) ^ crc32ctable_le[(crc & 255) ^ *p++];
-	return crc;
-}
-EXPORT_SYMBOL(crc32c_base);
-
-u32 crc32_be_base(u32 crc, const u8 *p, size_t len)
-{
-	while (len--)
-		crc = (crc << 8) ^ crc32table_be[(crc >> 24) ^ *p++];
-	return crc;
-}
-EXPORT_SYMBOL(crc32_be_base);
diff --git a/lib/crc/crc64-main.c b/lib/crc/crc64-main.c
new file mode 100644
index 000000000000..038afc7f4d44
--- /dev/null
+++ b/lib/crc/crc64-main.c
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Normal 64-bit CRC calculation.
+ *
+ * This is a basic crc64 implementation following ECMA-182 specification,
+ * which can be found from,
+ * https://www.ecma-international.org/publications/standards/Ecma-182.htm
+ *
+ * Dr. Ross N. Williams has a great document to introduce the idea of CRC
+ * algorithm, here the CRC64 code is also inspired by the table-driven
+ * algorithm and detail example from this paper. This paper can be found
+ * from,
+ * http://www.ross.net/crc/download/crc_v3.txt
+ *
+ * crc64table[256] is the lookup table of a table-driven 64-bit CRC
+ * calculation, which is generated by gen_crc64table.c in kernel build
+ * time. The polynomial of crc64 arithmetic is from ECMA-182 specification
+ * as well, which is defined as,
+ *
+ * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
+ * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
+ * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
+ * x^7 + x^4 + x + 1
+ *
+ * crc64nvmetable[256] uses the CRC64 polynomial from the NVME NVM Command Set
+ * Specification and uses least-significant-bit first bit order:
+ *
+ * x^64 + x^63 + x^61 + x^59 + x^58 + x^56 + x^55 + x^52 + x^49 + x^48 + x^47 +
+ * x^46 + x^44 + x^41 + x^37 + x^36 + x^34 + x^32 + x^31 + x^28 + x^26 + x^23 +
+ * x^22 + x^19 + x^16 + x^13 + x^12 + x^10 + x^9 + x^6 + x^4 + x^3 + 1
+ *
+ * Copyright 2018 SUSE Linux.
+ *   Author: Coly Li <colyli@suse.de>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/crc64.h>
+#include "crc64table.h"
+
+static inline u64 __maybe_unused
+crc64_be_generic(u64 crc, const u8 *p, size_t len)
+{
+	while (len--)
+		crc = (crc << 8) ^ crc64table[(crc >> 56) ^ *p++];
+	return crc;
+}
+
+static inline u64 __maybe_unused
+crc64_nvme_generic(u64 crc, const u8 *p, size_t len)
+{
+	while (len--)
+		crc = (crc >> 8) ^ crc64nvmetable[(crc & 0xff) ^ *p++];
+	return crc;
+}
+
+#ifdef CONFIG_CRC64_ARCH
+#include "crc64.h" /* $(SRCARCH)/crc64.h */
+#else
+#define crc64_be_arch crc64_be_generic
+#define crc64_nvme_arch crc64_nvme_generic
+#endif
+
+u64 crc64_be(u64 crc, const void *p, size_t len)
+{
+	return crc64_be_arch(crc, p, len);
+}
+EXPORT_SYMBOL_GPL(crc64_be);
+
+u64 crc64_nvme(u64 crc, const void *p, size_t len)
+{
+	return ~crc64_nvme_arch(~crc, p, len);
+}
+EXPORT_SYMBOL_GPL(crc64_nvme);
+
+#ifdef crc64_mod_init_arch
+static int __init crc64_mod_init(void)
+{
+	crc64_mod_init_arch();
+	return 0;
+}
+subsys_initcall(crc64_mod_init);
+
+static void __exit crc64_mod_exit(void)
+{
+}
+module_exit(crc64_mod_exit);
+#endif
+
+MODULE_DESCRIPTION("CRC64 library functions");
+MODULE_LICENSE("GPL");
diff --git a/lib/crc/crc64.c b/lib/crc/crc64.c
deleted file mode 100644
index 5b1b17057f0a..000000000000
--- a/lib/crc/crc64.c
+++ /dev/null
@@ -1,58 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Normal 64-bit CRC calculation.
- *
- * This is a basic crc64 implementation following ECMA-182 specification,
- * which can be found from,
- * https://www.ecma-international.org/publications/standards/Ecma-182.htm
- *
- * Dr. Ross N. Williams has a great document to introduce the idea of CRC
- * algorithm, here the CRC64 code is also inspired by the table-driven
- * algorithm and detail example from this paper. This paper can be found
- * from,
- * http://www.ross.net/crc/download/crc_v3.txt
- *
- * crc64table[256] is the lookup table of a table-driven 64-bit CRC
- * calculation, which is generated by gen_crc64table.c in kernel build
- * time. The polynomial of crc64 arithmetic is from ECMA-182 specification
- * as well, which is defined as,
- *
- * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
- * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
- * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
- * x^7 + x^4 + x + 1
- *
- * crc64nvmetable[256] uses the CRC64 polynomial from the NVME NVM Command Set
- * Specification and uses least-significant-bit first bit order:
- *
- * x^64 + x^63 + x^61 + x^59 + x^58 + x^56 + x^55 + x^52 + x^49 + x^48 + x^47 +
- * x^46 + x^44 + x^41 + x^37 + x^36 + x^34 + x^32 + x^31 + x^28 + x^26 + x^23 +
- * x^22 + x^19 + x^16 + x^13 + x^12 + x^10 + x^9 + x^6 + x^4 + x^3 + 1
- *
- * Copyright 2018 SUSE Linux.
- *   Author: Coly Li <colyli@suse.de>
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/crc64.h>
-#include "crc64table.h"
-
-MODULE_DESCRIPTION("CRC64 calculations");
-MODULE_LICENSE("GPL v2");
-
-u64 crc64_be_generic(u64 crc, const u8 *p, size_t len)
-{
-	while (len--)
-		crc = (crc << 8) ^ crc64table[(crc >> 56) ^ *p++];
-	return crc;
-}
-EXPORT_SYMBOL_GPL(crc64_be_generic);
-
-u64 crc64_nvme_generic(u64 crc, const u8 *p, size_t len)
-{
-	while (len--)
-		crc = (crc >> 8) ^ crc64nvmetable[(crc & 0xff) ^ *p++];
-	return crc;
-}
-EXPORT_SYMBOL_GPL(crc64_nvme_generic);
-- 
cgit v1.2.3


From 8c7c675155ce3f0f6b12c57a47298a390661652d Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Thu, 19 Jun 2025 11:34:12 -0700
Subject: lib/crc: crc32: Document crc32_le(), crc32_be(), and crc32c()

Document these widely used functions.

Update kernel-api.rst to point to the correct place, instead of to
crc32-main.c which no longer contains kerneldoc comments.

Simplify the documentation in crc32poly.h to just point to the
corresponding functions, now that they are properly documented.  Change
the value of CRC32C_POLY_LE to lower case, for consistency.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250619183414.100082-2-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 Documentation/core-api/kernel-api.rst |  4 +--
 include/linux/crc32.h                 | 66 +++++++++++++++++++++++++++++++++++
 include/linux/crc32poly.h             | 16 +++------
 3 files changed, 73 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst
index 9c8370891a39..0096463c7d7f 100644
--- a/Documentation/core-api/kernel-api.rst
+++ b/Documentation/core-api/kernel-api.rst
@@ -148,14 +148,14 @@ CRC Functions
 .. kernel-doc:: lib/crc/crc16.c
    :export:
 
-.. kernel-doc:: lib/crc/crc32-main.c
-
 .. kernel-doc:: lib/crc/crc-ccitt.c
    :export:
 
 .. kernel-doc:: lib/crc/crc-itu-t.c
    :export:
 
+.. kernel-doc:: include/linux/crc32.h
+
 Base 2 log and power Functions
 ------------------------------
 
diff --git a/include/linux/crc32.h b/include/linux/crc32.h
index 22dbe7144eb4..f9c173206d4d 100644
--- a/include/linux/crc32.h
+++ b/include/linux/crc32.h
@@ -5,8 +5,74 @@
 #include <linux/types.h>
 #include <linux/bitrev.h>
 
+/**
+ * crc32_le() - Compute least-significant-bit-first IEEE CRC-32
+ * @crc: Initial CRC value.  ~0 (recommended) or 0 for a new CRC computation, or
+ *	 the previous CRC value if computing incrementally.
+ * @p: Pointer to the data buffer
+ * @len: Length of data in bytes
+ *
+ * This implements the CRC variant that is often known as the IEEE CRC-32, or
+ * simply CRC-32, and is widely used in Ethernet and other applications:
+ *
+ * - Polynomial: x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 +
+ *		 x^7 + x^5 + x^4 + x^2 + x^1 + x^0
+ * - Bit order: Least-significant-bit-first
+ * - Polynomial in integer form: 0xedb88320
+ *
+ * This does *not* invert the CRC at the beginning or end.  The caller is
+ * expected to do that if it needs to.  Inverting at both ends is recommended.
+ *
+ * For new applications, prefer to use CRC-32C instead.  See crc32c().
+ *
+ * Context: Any context
+ * Return: The new CRC value
+ */
 u32 crc32_le(u32 crc, const void *p, size_t len);
+
+/**
+ * crc32_be() - Compute most-significant-bit-first IEEE CRC-32
+ * @crc: Initial CRC value.  ~0 (recommended) or 0 for a new CRC computation, or
+ *	 the previous CRC value if computing incrementally.
+ * @p: Pointer to the data buffer
+ * @len: Length of data in bytes
+ *
+ * crc32_be() is the same as crc32_le() except that crc32_be() computes the
+ * *most-significant-bit-first* variant of the CRC.  I.e., within each byte, the
+ * most significant bit is processed first (treated as highest order polynomial
+ * coefficient).  The same bit order is also used for the CRC value itself:
+ *
+ * - Polynomial: x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 +
+ *		 x^7 + x^5 + x^4 + x^2 + x^1 + x^0
+ * - Bit order: Most-significant-bit-first
+ * - Polynomial in integer form: 0x04c11db7
+ *
+ * Context: Any context
+ * Return: The new CRC value
+ */
 u32 crc32_be(u32 crc, const void *p, size_t len);
+
+/**
+ * crc32c() - Compute CRC-32C
+ * @crc: Initial CRC value.  ~0 (recommended) or 0 for a new CRC computation, or
+ *	 the previous CRC value if computing incrementally.
+ * @p: Pointer to the data buffer
+ * @len: Length of data in bytes
+ *
+ * This implements CRC-32C, i.e. the Castagnoli CRC.  This is the recommended
+ * CRC variant to use in new applications that want a 32-bit CRC.
+ *
+ * - Polynomial: x^32 + x^28 + x^27 + x^26 + x^25 + x^23 + x^22 + x^20 + x^19 +
+ *		 x^18 + x^14 + x^13 + x^11 + x^10 + x^9 + x^8 + x^6 + x^0
+ * - Bit order: Least-significant-bit-first
+ * - Polynomial in integer form: 0x82f63b78
+ *
+ * This does *not* invert the CRC at the beginning or end.  The caller is
+ * expected to do that if it needs to.  Inverting at both ends is recommended.
+ *
+ * Context: Any context
+ * Return: The new CRC value
+ */
 u32 crc32c(u32 crc, const void *p, size_t len);
 
 /*
diff --git a/include/linux/crc32poly.h b/include/linux/crc32poly.h
index 62c4b7790a28..ccab711295fa 100644
--- a/include/linux/crc32poly.h
+++ b/include/linux/crc32poly.h
@@ -2,19 +2,13 @@
 #ifndef _LINUX_CRC32_POLY_H
 #define _LINUX_CRC32_POLY_H
 
-/*
- * There are multiple 16-bit CRC polynomials in common use, but this is
- * *the* standard CRC-32 polynomial, first popularized by Ethernet.
- * x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x^1+x^0
- */
+/* The polynomial used by crc32_le(), in integer form.  See crc32_le(). */
 #define CRC32_POLY_LE 0xedb88320
+
+/* The polynomial used by crc32_be(), in integer form.  See crc32_be(). */
 #define CRC32_POLY_BE 0x04c11db7
 
-/*
- * This is the CRC32c polynomial, as outlined by Castagnoli.
- * x^32+x^28+x^27+x^26+x^25+x^23+x^22+x^20+x^19+x^18+x^14+x^13+x^11+x^10+x^9+
- * x^8+x^6+x^0
- */
-#define CRC32C_POLY_LE 0x82F63B78
+/* The polynomial used by crc32c(), in integer form.  See crc32c(). */
+#define CRC32C_POLY_LE 0x82f63b78
 
 #endif /* _LINUX_CRC32_POLY_H */
-- 
cgit v1.2.3


From 0b5a58c078167f73d10711734cdc8ea592561ca9 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Thu, 19 Jun 2025 11:34:13 -0700
Subject: lib/crc: crc32: Change crc32() from macro to inline function and
 remove cast

There's no need for crc32() to be a macro.  Make it an inline function
instead.  Also, remove the cast of the data pointer to
'unsigned char const *', which is no longer necessary now that the type
used in the function prototype is 'const void *'.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250619183414.100082-3-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 include/linux/crc32.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/crc32.h b/include/linux/crc32.h
index f9c173206d4d..da78b215ff2e 100644
--- a/include/linux/crc32.h
+++ b/include/linux/crc32.h
@@ -30,6 +30,12 @@
  */
 u32 crc32_le(u32 crc, const void *p, size_t len);
 
+/* This is just an alias for crc32_le(). */
+static inline u32 crc32(u32 crc, const void *p, size_t len)
+{
+	return crc32_le(crc, p, len);
+}
+
 /**
  * crc32_be() - Compute most-significant-bit-first IEEE CRC-32
  * @crc: Initial CRC value.  ~0 (recommended) or 0 for a new CRC computation, or
@@ -90,8 +96,6 @@ u32 crc32_optimizations(void);
 static inline u32 crc32_optimizations(void) { return 0; }
 #endif
 
-#define crc32(seed, data, length)  crc32_le(seed, (unsigned char const *)(data), length)
-
 /*
  * Helpers for hash table generation of ethernet nics:
  *
-- 
cgit v1.2.3


From 94b2030968be70b33fed9a5514a5967c7f20aebc Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 30 Jun 2025 11:36:54 -0600
Subject: io_uring: remove errant ';' from IORING_CQE_F_TSTAMP_HW definition

An errant ';' slipped into that definition, which will cause some
compilers to complain when it's used in an application:

timestamp.c:257:45: error: empty expression statement has no effect; remove unnecessary ';' to silence this warning [-Werror,-Wextra-semi-stmt]
  257 |                 hwts = cqe->flags & IORING_CQE_F_TSTAMP_HW;
      |                                                           ^

Fixes: 9e4ed359b8ef ("io_uring/netcmd: add tx timestamping cmd support")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 85600ad0ac08..b6be063693c8 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -980,7 +980,7 @@ enum io_uring_socket_op {
 /* The cqe->flags bit from which the timestamp type is stored */
 #define IORING_TIMESTAMP_TYPE_SHIFT	(IORING_TIMESTAMP_HW_SHIFT + 1)
 /* The cqe->flags flag signifying whether it's a hardware timestamp */
-#define IORING_CQE_F_TSTAMP_HW		((__u32)1 << IORING_TIMESTAMP_HW_SHIFT);
+#define IORING_CQE_F_TSTAMP_HW		((__u32)1 << IORING_TIMESTAMP_HW_SHIFT)
 
 struct io_timespec {
 	__u64		tv_sec;
-- 
cgit v1.2.3


From a70e9f647f501e36a6a092888b1ea7386b7c5664 Mon Sep 17 00:00:00 2001
From: Jinjie Ruan <ruanjinjie@huawei.com>
Date: Tue, 24 Jun 2025 20:35:56 +0200
Subject: entry: Split generic entry into generic exception and syscall entry

Currently CONFIG_GENERIC_ENTRY enables both the generic exception
entry logic and the generic syscall entry logic, which are otherwise
loosely coupled.

Introduce separate config options for these so that architectures can
select the two independently. This will make it easier for
architectures to migrate to generic entry code.

Suggested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Jinjie Ruan <ruanjinjie@huawei.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/20250213130007.1418890-2-ruanjinjie@huawei.com
Link: https://lore.kernel.org/all/20250624-generic-entry-split-v1-1-53d5ef4f94df@linaro.org

[Linus Walleij: rebase onto v6.16-rc1]
---
 MAINTAINERS                      |   1 +
 arch/Kconfig                     |   9 +
 include/linux/entry-common.h     | 382 +-------------------------------------
 include/linux/irq-entry-common.h | 389 +++++++++++++++++++++++++++++++++++++++
 kernel/entry/Makefile            |   3 +-
 kernel/entry/common.c            | 113 +-----------
 kernel/entry/syscall-common.c    | 112 +++++++++++
 kernel/sched/core.c              |   8 +-
 8 files changed, 519 insertions(+), 498 deletions(-)
 create mode 100644 include/linux/irq-entry-common.h
 create mode 100644 kernel/entry/syscall-common.c

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index a92290fffa16..e92292a0e480 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10052,6 +10052,7 @@ S:	Maintained
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git core/entry
 F:	include/linux/entry-common.h
 F:	include/linux/entry-kvm.h
+F:	include/linux/irq-entry-common.h
 F:	kernel/entry/
 
 GENERIC GPIO I2C DRIVER
diff --git a/arch/Kconfig b/arch/Kconfig
index a3308a220f86..9233fbfd8dd3 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -64,8 +64,17 @@ config HOTPLUG_PARALLEL
 	bool
 	select HOTPLUG_SPLIT_STARTUP
 
+config GENERIC_IRQ_ENTRY
+	bool
+
+config GENERIC_SYSCALL
+	bool
+	depends on GENERIC_IRQ_ENTRY
+
 config GENERIC_ENTRY
 	bool
+	select GENERIC_IRQ_ENTRY
+	select GENERIC_SYSCALL
 
 config KPROBES
 	bool "Kprobes"
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index f94f3fdf15fc..7177436f0f9e 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -2,28 +2,16 @@
 #ifndef __LINUX_ENTRYCOMMON_H
 #define __LINUX_ENTRYCOMMON_H
 
-#include <linux/static_call_types.h>
+#include <linux/irq-entry-common.h>
 #include <linux/ptrace.h>
-#include <linux/syscalls.h>
 #include <linux/seccomp.h>
 #include <linux/sched.h>
-#include <linux/context_tracking.h>
 #include <linux/livepatch.h>
 #include <linux/resume_user_mode.h>
-#include <linux/tick.h>
-#include <linux/kmsan.h>
 
 #include <asm/entry-common.h>
 #include <asm/syscall.h>
 
-/*
- * Define dummy _TIF work flags if not defined by the architecture or for
- * disabled functionality.
- */
-#ifndef _TIF_PATCH_PENDING
-# define _TIF_PATCH_PENDING		(0)
-#endif
-
 #ifndef _TIF_UPROBE
 # define _TIF_UPROBE			(0)
 #endif
@@ -56,69 +44,6 @@
 				 SYSCALL_WORK_SYSCALL_EXIT_TRAP	|	\
 				 ARCH_SYSCALL_WORK_EXIT)
 
-/*
- * TIF flags handled in exit_to_user_mode_loop()
- */
-#ifndef ARCH_EXIT_TO_USER_MODE_WORK
-# define ARCH_EXIT_TO_USER_MODE_WORK		(0)
-#endif
-
-#define EXIT_TO_USER_MODE_WORK						\
-	(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |		\
-	 _TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY |			\
-	 _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL |			\
-	 ARCH_EXIT_TO_USER_MODE_WORK)
-
-/**
- * arch_enter_from_user_mode - Architecture specific sanity check for user mode regs
- * @regs:	Pointer to currents pt_regs
- *
- * Defaults to an empty implementation. Can be replaced by architecture
- * specific code.
- *
- * Invoked from syscall_enter_from_user_mode() in the non-instrumentable
- * section. Use __always_inline so the compiler cannot push it out of line
- * and make it instrumentable.
- */
-static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs);
-
-#ifndef arch_enter_from_user_mode
-static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs) {}
-#endif
-
-/**
- * enter_from_user_mode - Establish state when coming from user mode
- *
- * Syscall/interrupt entry disables interrupts, but user mode is traced as
- * interrupts enabled. Also with NO_HZ_FULL RCU might be idle.
- *
- * 1) Tell lockdep that interrupts are disabled
- * 2) Invoke context tracking if enabled to reactivate RCU
- * 3) Trace interrupts off state
- *
- * Invoked from architecture specific syscall entry code with interrupts
- * disabled. The calling code has to be non-instrumentable. When the
- * function returns all state is correct and interrupts are still
- * disabled. The subsequent functions can be instrumented.
- *
- * This is invoked when there is architecture specific functionality to be
- * done between establishing state and enabling interrupts. The caller must
- * enable interrupts before invoking syscall_enter_from_user_mode_work().
- */
-static __always_inline void enter_from_user_mode(struct pt_regs *regs)
-{
-	arch_enter_from_user_mode(regs);
-	lockdep_hardirqs_off(CALLER_ADDR0);
-
-	CT_WARN_ON(__ct_state() != CT_STATE_USER);
-	user_exit_irqoff();
-
-	instrumentation_begin();
-	kmsan_unpoison_entry_regs(regs);
-	trace_hardirqs_off_finish();
-	instrumentation_end();
-}
-
 /**
  * syscall_enter_from_user_mode_prepare - Establish state and enable interrupts
  * @regs:	Pointer to currents pt_regs
@@ -203,170 +128,6 @@ static __always_inline long syscall_enter_from_user_mode(struct pt_regs *regs, l
 	return ret;
 }
 
-/**
- * local_irq_enable_exit_to_user - Exit to user variant of local_irq_enable()
- * @ti_work:	Cached TIF flags gathered with interrupts disabled
- *
- * Defaults to local_irq_enable(). Can be supplied by architecture specific
- * code.
- */
-static inline void local_irq_enable_exit_to_user(unsigned long ti_work);
-
-#ifndef local_irq_enable_exit_to_user
-static inline void local_irq_enable_exit_to_user(unsigned long ti_work)
-{
-	local_irq_enable();
-}
-#endif
-
-/**
- * local_irq_disable_exit_to_user - Exit to user variant of local_irq_disable()
- *
- * Defaults to local_irq_disable(). Can be supplied by architecture specific
- * code.
- */
-static inline void local_irq_disable_exit_to_user(void);
-
-#ifndef local_irq_disable_exit_to_user
-static inline void local_irq_disable_exit_to_user(void)
-{
-	local_irq_disable();
-}
-#endif
-
-/**
- * arch_exit_to_user_mode_work - Architecture specific TIF work for exit
- *				 to user mode.
- * @regs:	Pointer to currents pt_regs
- * @ti_work:	Cached TIF flags gathered with interrupts disabled
- *
- * Invoked from exit_to_user_mode_loop() with interrupt enabled
- *
- * Defaults to NOOP. Can be supplied by architecture specific code.
- */
-static inline void arch_exit_to_user_mode_work(struct pt_regs *regs,
-					       unsigned long ti_work);
-
-#ifndef arch_exit_to_user_mode_work
-static inline void arch_exit_to_user_mode_work(struct pt_regs *regs,
-					       unsigned long ti_work)
-{
-}
-#endif
-
-/**
- * arch_exit_to_user_mode_prepare - Architecture specific preparation for
- *				    exit to user mode.
- * @regs:	Pointer to currents pt_regs
- * @ti_work:	Cached TIF flags gathered with interrupts disabled
- *
- * Invoked from exit_to_user_mode_prepare() with interrupt disabled as the last
- * function before return. Defaults to NOOP.
- */
-static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
-						  unsigned long ti_work);
-
-#ifndef arch_exit_to_user_mode_prepare
-static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
-						  unsigned long ti_work)
-{
-}
-#endif
-
-/**
- * arch_exit_to_user_mode - Architecture specific final work before
- *			    exit to user mode.
- *
- * Invoked from exit_to_user_mode() with interrupt disabled as the last
- * function before return. Defaults to NOOP.
- *
- * This needs to be __always_inline because it is non-instrumentable code
- * invoked after context tracking switched to user mode.
- *
- * An architecture implementation must not do anything complex, no locking
- * etc. The main purpose is for speculation mitigations.
- */
-static __always_inline void arch_exit_to_user_mode(void);
-
-#ifndef arch_exit_to_user_mode
-static __always_inline void arch_exit_to_user_mode(void) { }
-#endif
-
-/**
- * arch_do_signal_or_restart -  Architecture specific signal delivery function
- * @regs:	Pointer to currents pt_regs
- *
- * Invoked from exit_to_user_mode_loop().
- */
-void arch_do_signal_or_restart(struct pt_regs *regs);
-
-/**
- * exit_to_user_mode_loop - do any pending work before leaving to user space
- */
-unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
-				     unsigned long ti_work);
-
-/**
- * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
- * @regs:	Pointer to pt_regs on entry stack
- *
- * 1) check that interrupts are disabled
- * 2) call tick_nohz_user_enter_prepare()
- * 3) call exit_to_user_mode_loop() if any flags from
- *    EXIT_TO_USER_MODE_WORK are set
- * 4) check that interrupts are still disabled
- */
-static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
-{
-	unsigned long ti_work;
-
-	lockdep_assert_irqs_disabled();
-
-	/* Flush pending rcuog wakeup before the last need_resched() check */
-	tick_nohz_user_enter_prepare();
-
-	ti_work = read_thread_flags();
-	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
-		ti_work = exit_to_user_mode_loop(regs, ti_work);
-
-	arch_exit_to_user_mode_prepare(regs, ti_work);
-
-	/* Ensure that kernel state is sane for a return to userspace */
-	kmap_assert_nomap();
-	lockdep_assert_irqs_disabled();
-	lockdep_sys_exit();
-}
-
-/**
- * exit_to_user_mode - Fixup state when exiting to user mode
- *
- * Syscall/interrupt exit enables interrupts, but the kernel state is
- * interrupts disabled when this is invoked. Also tell RCU about it.
- *
- * 1) Trace interrupts on state
- * 2) Invoke context tracking if enabled to adjust RCU state
- * 3) Invoke architecture specific last minute exit code, e.g. speculation
- *    mitigations, etc.: arch_exit_to_user_mode()
- * 4) Tell lockdep that interrupts are enabled
- *
- * Invoked from architecture specific code when syscall_exit_to_user_mode()
- * is not suitable as the last step before returning to userspace. Must be
- * invoked with interrupts disabled and the caller must be
- * non-instrumentable.
- * The caller has to invoke syscall_exit_to_user_mode_work() before this.
- */
-static __always_inline void exit_to_user_mode(void)
-{
-	instrumentation_begin();
-	trace_hardirqs_on_prepare();
-	lockdep_hardirqs_on_prepare();
-	instrumentation_end();
-
-	user_enter_irqoff();
-	arch_exit_to_user_mode();
-	lockdep_hardirqs_on(CALLER_ADDR0);
-}
-
 /**
  * syscall_exit_work - Handle work before returning to user mode
  * @regs:	Pointer to current pt_regs
@@ -451,145 +212,4 @@ static __always_inline void syscall_exit_to_user_mode(struct pt_regs *regs)
 	exit_to_user_mode();
 }
 
-/**
- * irqentry_enter_from_user_mode - Establish state before invoking the irq handler
- * @regs:	Pointer to currents pt_regs
- *
- * Invoked from architecture specific entry code with interrupts disabled.
- * Can only be called when the interrupt entry came from user mode. The
- * calling code must be non-instrumentable.  When the function returns all
- * state is correct and the subsequent functions can be instrumented.
- *
- * The function establishes state (lockdep, RCU (context tracking), tracing)
- */
-void irqentry_enter_from_user_mode(struct pt_regs *regs);
-
-/**
- * irqentry_exit_to_user_mode - Interrupt exit work
- * @regs:	Pointer to current's pt_regs
- *
- * Invoked with interrupts disabled and fully valid regs. Returns with all
- * work handled, interrupts disabled such that the caller can immediately
- * switch to user mode. Called from architecture specific interrupt
- * handling code.
- *
- * The call order is #2 and #3 as described in syscall_exit_to_user_mode().
- * Interrupt exit is not invoking #1 which is the syscall specific one time
- * work.
- */
-void irqentry_exit_to_user_mode(struct pt_regs *regs);
-
-#ifndef irqentry_state
-/**
- * struct irqentry_state - Opaque object for exception state storage
- * @exit_rcu: Used exclusively in the irqentry_*() calls; signals whether the
- *            exit path has to invoke ct_irq_exit().
- * @lockdep: Used exclusively in the irqentry_nmi_*() calls; ensures that
- *           lockdep state is restored correctly on exit from nmi.
- *
- * This opaque object is filled in by the irqentry_*_enter() functions and
- * must be passed back into the corresponding irqentry_*_exit() functions
- * when the exception is complete.
- *
- * Callers of irqentry_*_[enter|exit]() must consider this structure opaque
- * and all members private.  Descriptions of the members are provided to aid in
- * the maintenance of the irqentry_*() functions.
- */
-typedef struct irqentry_state {
-	union {
-		bool	exit_rcu;
-		bool	lockdep;
-	};
-} irqentry_state_t;
-#endif
-
-/**
- * irqentry_enter - Handle state tracking on ordinary interrupt entries
- * @regs:	Pointer to pt_regs of interrupted context
- *
- * Invokes:
- *  - lockdep irqflag state tracking as low level ASM entry disabled
- *    interrupts.
- *
- *  - Context tracking if the exception hit user mode.
- *
- *  - The hardirq tracer to keep the state consistent as low level ASM
- *    entry disabled interrupts.
- *
- * As a precondition, this requires that the entry came from user mode,
- * idle, or a kernel context in which RCU is watching.
- *
- * For kernel mode entries RCU handling is done conditional. If RCU is
- * watching then the only RCU requirement is to check whether the tick has
- * to be restarted. If RCU is not watching then ct_irq_enter() has to be
- * invoked on entry and ct_irq_exit() on exit.
- *
- * Avoiding the ct_irq_enter/exit() calls is an optimization but also
- * solves the problem of kernel mode pagefaults which can schedule, which
- * is not possible after invoking ct_irq_enter() without undoing it.
- *
- * For user mode entries irqentry_enter_from_user_mode() is invoked to
- * establish the proper context for NOHZ_FULL. Otherwise scheduling on exit
- * would not be possible.
- *
- * Returns: An opaque object that must be passed to idtentry_exit()
- */
-irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs);
-
-/**
- * irqentry_exit_cond_resched - Conditionally reschedule on return from interrupt
- *
- * Conditional reschedule with additional sanity checks.
- */
-void raw_irqentry_exit_cond_resched(void);
-#ifdef CONFIG_PREEMPT_DYNAMIC
-#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
-#define irqentry_exit_cond_resched_dynamic_enabled	raw_irqentry_exit_cond_resched
-#define irqentry_exit_cond_resched_dynamic_disabled	NULL
-DECLARE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched);
-#define irqentry_exit_cond_resched()	static_call(irqentry_exit_cond_resched)()
-#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
-DECLARE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
-void dynamic_irqentry_exit_cond_resched(void);
-#define irqentry_exit_cond_resched()	dynamic_irqentry_exit_cond_resched()
-#endif
-#else /* CONFIG_PREEMPT_DYNAMIC */
-#define irqentry_exit_cond_resched()	raw_irqentry_exit_cond_resched()
-#endif /* CONFIG_PREEMPT_DYNAMIC */
-
-/**
- * irqentry_exit - Handle return from exception that used irqentry_enter()
- * @regs:	Pointer to pt_regs (exception entry regs)
- * @state:	Return value from matching call to irqentry_enter()
- *
- * Depending on the return target (kernel/user) this runs the necessary
- * preemption and work checks if possible and required and returns to
- * the caller with interrupts disabled and no further work pending.
- *
- * This is the last action before returning to the low level ASM code which
- * just needs to return to the appropriate context.
- *
- * Counterpart to irqentry_enter().
- */
-void noinstr irqentry_exit(struct pt_regs *regs, irqentry_state_t state);
-
-/**
- * irqentry_nmi_enter - Handle NMI entry
- * @regs:	Pointer to currents pt_regs
- *
- * Similar to irqentry_enter() but taking care of the NMI constraints.
- */
-irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs);
-
-/**
- * irqentry_nmi_exit - Handle return from NMI handling
- * @regs:	Pointer to pt_regs (NMI entry regs)
- * @irq_state:	Return value from matching call to irqentry_nmi_enter()
- *
- * Last action before returning to the low level assembly code.
- *
- * Counterpart to irqentry_nmi_enter().
- */
-void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state);
-
 #endif
diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h
new file mode 100644
index 000000000000..8af374331900
--- /dev/null
+++ b/include/linux/irq-entry-common.h
@@ -0,0 +1,389 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_IRQENTRYCOMMON_H
+#define __LINUX_IRQENTRYCOMMON_H
+
+#include <linux/static_call_types.h>
+#include <linux/syscalls.h>
+#include <linux/context_tracking.h>
+#include <linux/tick.h>
+#include <linux/kmsan.h>
+
+#include <asm/entry-common.h>
+
+/*
+ * Define dummy _TIF work flags if not defined by the architecture or for
+ * disabled functionality.
+ */
+#ifndef _TIF_PATCH_PENDING
+# define _TIF_PATCH_PENDING		(0)
+#endif
+
+/*
+ * TIF flags handled in exit_to_user_mode_loop()
+ */
+#ifndef ARCH_EXIT_TO_USER_MODE_WORK
+# define ARCH_EXIT_TO_USER_MODE_WORK		(0)
+#endif
+
+#define EXIT_TO_USER_MODE_WORK						\
+	(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |		\
+	 _TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY |			\
+	 _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL |			\
+	 ARCH_EXIT_TO_USER_MODE_WORK)
+
+/**
+ * arch_enter_from_user_mode - Architecture specific sanity check for user mode regs
+ * @regs:	Pointer to currents pt_regs
+ *
+ * Defaults to an empty implementation. Can be replaced by architecture
+ * specific code.
+ *
+ * Invoked from syscall_enter_from_user_mode() in the non-instrumentable
+ * section. Use __always_inline so the compiler cannot push it out of line
+ * and make it instrumentable.
+ */
+static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs);
+
+#ifndef arch_enter_from_user_mode
+static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs) {}
+#endif
+
+/**
+ * enter_from_user_mode - Establish state when coming from user mode
+ *
+ * Syscall/interrupt entry disables interrupts, but user mode is traced as
+ * interrupts enabled. Also with NO_HZ_FULL RCU might be idle.
+ *
+ * 1) Tell lockdep that interrupts are disabled
+ * 2) Invoke context tracking if enabled to reactivate RCU
+ * 3) Trace interrupts off state
+ *
+ * Invoked from architecture specific syscall entry code with interrupts
+ * disabled. The calling code has to be non-instrumentable. When the
+ * function returns all state is correct and interrupts are still
+ * disabled. The subsequent functions can be instrumented.
+ *
+ * This is invoked when there is architecture specific functionality to be
+ * done between establishing state and enabling interrupts. The caller must
+ * enable interrupts before invoking syscall_enter_from_user_mode_work().
+ */
+static __always_inline void enter_from_user_mode(struct pt_regs *regs)
+{
+	arch_enter_from_user_mode(regs);
+	lockdep_hardirqs_off(CALLER_ADDR0);
+
+	CT_WARN_ON(__ct_state() != CT_STATE_USER);
+	user_exit_irqoff();
+
+	instrumentation_begin();
+	kmsan_unpoison_entry_regs(regs);
+	trace_hardirqs_off_finish();
+	instrumentation_end();
+}
+
+/**
+ * local_irq_enable_exit_to_user - Exit to user variant of local_irq_enable()
+ * @ti_work:	Cached TIF flags gathered with interrupts disabled
+ *
+ * Defaults to local_irq_enable(). Can be supplied by architecture specific
+ * code.
+ */
+static inline void local_irq_enable_exit_to_user(unsigned long ti_work);
+
+#ifndef local_irq_enable_exit_to_user
+static inline void local_irq_enable_exit_to_user(unsigned long ti_work)
+{
+	local_irq_enable();
+}
+#endif
+
+/**
+ * local_irq_disable_exit_to_user - Exit to user variant of local_irq_disable()
+ *
+ * Defaults to local_irq_disable(). Can be supplied by architecture specific
+ * code.
+ */
+static inline void local_irq_disable_exit_to_user(void);
+
+#ifndef local_irq_disable_exit_to_user
+static inline void local_irq_disable_exit_to_user(void)
+{
+	local_irq_disable();
+}
+#endif
+
+/**
+ * arch_exit_to_user_mode_work - Architecture specific TIF work for exit
+ *				 to user mode.
+ * @regs:	Pointer to currents pt_regs
+ * @ti_work:	Cached TIF flags gathered with interrupts disabled
+ *
+ * Invoked from exit_to_user_mode_loop() with interrupt enabled
+ *
+ * Defaults to NOOP. Can be supplied by architecture specific code.
+ */
+static inline void arch_exit_to_user_mode_work(struct pt_regs *regs,
+					       unsigned long ti_work);
+
+#ifndef arch_exit_to_user_mode_work
+static inline void arch_exit_to_user_mode_work(struct pt_regs *regs,
+					       unsigned long ti_work)
+{
+}
+#endif
+
+/**
+ * arch_exit_to_user_mode_prepare - Architecture specific preparation for
+ *				    exit to user mode.
+ * @regs:	Pointer to currents pt_regs
+ * @ti_work:	Cached TIF flags gathered with interrupts disabled
+ *
+ * Invoked from exit_to_user_mode_prepare() with interrupt disabled as the last
+ * function before return. Defaults to NOOP.
+ */
+static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
+						  unsigned long ti_work);
+
+#ifndef arch_exit_to_user_mode_prepare
+static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
+						  unsigned long ti_work)
+{
+}
+#endif
+
+/**
+ * arch_exit_to_user_mode - Architecture specific final work before
+ *			    exit to user mode.
+ *
+ * Invoked from exit_to_user_mode() with interrupt disabled as the last
+ * function before return. Defaults to NOOP.
+ *
+ * This needs to be __always_inline because it is non-instrumentable code
+ * invoked after context tracking switched to user mode.
+ *
+ * An architecture implementation must not do anything complex, no locking
+ * etc. The main purpose is for speculation mitigations.
+ */
+static __always_inline void arch_exit_to_user_mode(void);
+
+#ifndef arch_exit_to_user_mode
+static __always_inline void arch_exit_to_user_mode(void) { }
+#endif
+
+/**
+ * arch_do_signal_or_restart -  Architecture specific signal delivery function
+ * @regs:	Pointer to currents pt_regs
+ *
+ * Invoked from exit_to_user_mode_loop().
+ */
+void arch_do_signal_or_restart(struct pt_regs *regs);
+
+/**
+ * exit_to_user_mode_loop - do any pending work before leaving to user space
+ */
+unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
+				     unsigned long ti_work);
+
+/**
+ * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
+ * @regs:	Pointer to pt_regs on entry stack
+ *
+ * 1) check that interrupts are disabled
+ * 2) call tick_nohz_user_enter_prepare()
+ * 3) call exit_to_user_mode_loop() if any flags from
+ *    EXIT_TO_USER_MODE_WORK are set
+ * 4) check that interrupts are still disabled
+ */
+static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
+{
+	unsigned long ti_work;
+
+	lockdep_assert_irqs_disabled();
+
+	/* Flush pending rcuog wakeup before the last need_resched() check */
+	tick_nohz_user_enter_prepare();
+
+	ti_work = read_thread_flags();
+	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
+		ti_work = exit_to_user_mode_loop(regs, ti_work);
+
+	arch_exit_to_user_mode_prepare(regs, ti_work);
+
+	/* Ensure that kernel state is sane for a return to userspace */
+	kmap_assert_nomap();
+	lockdep_assert_irqs_disabled();
+	lockdep_sys_exit();
+}
+
+/**
+ * exit_to_user_mode - Fixup state when exiting to user mode
+ *
+ * Syscall/interrupt exit enables interrupts, but the kernel state is
+ * interrupts disabled when this is invoked. Also tell RCU about it.
+ *
+ * 1) Trace interrupts on state
+ * 2) Invoke context tracking if enabled to adjust RCU state
+ * 3) Invoke architecture specific last minute exit code, e.g. speculation
+ *    mitigations, etc.: arch_exit_to_user_mode()
+ * 4) Tell lockdep that interrupts are enabled
+ *
+ * Invoked from architecture specific code when syscall_exit_to_user_mode()
+ * is not suitable as the last step before returning to userspace. Must be
+ * invoked with interrupts disabled and the caller must be
+ * non-instrumentable.
+ * The caller has to invoke syscall_exit_to_user_mode_work() before this.
+ */
+static __always_inline void exit_to_user_mode(void)
+{
+	instrumentation_begin();
+	trace_hardirqs_on_prepare();
+	lockdep_hardirqs_on_prepare();
+	instrumentation_end();
+
+	user_enter_irqoff();
+	arch_exit_to_user_mode();
+	lockdep_hardirqs_on(CALLER_ADDR0);
+}
+
+/**
+ * irqentry_enter_from_user_mode - Establish state before invoking the irq handler
+ * @regs:	Pointer to currents pt_regs
+ *
+ * Invoked from architecture specific entry code with interrupts disabled.
+ * Can only be called when the interrupt entry came from user mode. The
+ * calling code must be non-instrumentable.  When the function returns all
+ * state is correct and the subsequent functions can be instrumented.
+ *
+ * The function establishes state (lockdep, RCU (context tracking), tracing)
+ */
+void irqentry_enter_from_user_mode(struct pt_regs *regs);
+
+/**
+ * irqentry_exit_to_user_mode - Interrupt exit work
+ * @regs:	Pointer to current's pt_regs
+ *
+ * Invoked with interrupts disabled and fully valid regs. Returns with all
+ * work handled, interrupts disabled such that the caller can immediately
+ * switch to user mode. Called from architecture specific interrupt
+ * handling code.
+ *
+ * The call order is #2 and #3 as described in syscall_exit_to_user_mode().
+ * Interrupt exit is not invoking #1 which is the syscall specific one time
+ * work.
+ */
+void irqentry_exit_to_user_mode(struct pt_regs *regs);
+
+#ifndef irqentry_state
+/**
+ * struct irqentry_state - Opaque object for exception state storage
+ * @exit_rcu: Used exclusively in the irqentry_*() calls; signals whether the
+ *            exit path has to invoke ct_irq_exit().
+ * @lockdep: Used exclusively in the irqentry_nmi_*() calls; ensures that
+ *           lockdep state is restored correctly on exit from nmi.
+ *
+ * This opaque object is filled in by the irqentry_*_enter() functions and
+ * must be passed back into the corresponding irqentry_*_exit() functions
+ * when the exception is complete.
+ *
+ * Callers of irqentry_*_[enter|exit]() must consider this structure opaque
+ * and all members private.  Descriptions of the members are provided to aid in
+ * the maintenance of the irqentry_*() functions.
+ */
+typedef struct irqentry_state {
+	union {
+		bool	exit_rcu;
+		bool	lockdep;
+	};
+} irqentry_state_t;
+#endif
+
+/**
+ * irqentry_enter - Handle state tracking on ordinary interrupt entries
+ * @regs:	Pointer to pt_regs of interrupted context
+ *
+ * Invokes:
+ *  - lockdep irqflag state tracking as low level ASM entry disabled
+ *    interrupts.
+ *
+ *  - Context tracking if the exception hit user mode.
+ *
+ *  - The hardirq tracer to keep the state consistent as low level ASM
+ *    entry disabled interrupts.
+ *
+ * As a precondition, this requires that the entry came from user mode,
+ * idle, or a kernel context in which RCU is watching.
+ *
+ * For kernel mode entries RCU handling is done conditionally. If RCU is
+ * watching then the only RCU requirement is to check whether the tick has
+ * to be restarted. If RCU is not watching then ct_irq_enter() has to be
+ * invoked on entry and ct_irq_exit() on exit.
+ *
+ * Avoiding the ct_irq_enter/exit() calls is an optimization but also
+ * solves the problem of kernel mode pagefaults which can schedule, which
+ * is not possible after invoking ct_irq_enter() without undoing it.
+ *
+ * For user mode entries irqentry_enter_from_user_mode() is invoked to
+ * establish the proper context for NOHZ_FULL. Otherwise scheduling on exit
+ * would not be possible.
+ *
+ * Returns: An opaque object that must be passed to irqentry_exit()
+ */
+irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs);
+
+/**
+ * irqentry_exit_cond_resched - Conditionally reschedule on return from interrupt
+ *
+ * Conditional reschedule with additional sanity checks.
+ */
+void raw_irqentry_exit_cond_resched(void);
+#ifdef CONFIG_PREEMPT_DYNAMIC
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+#define irqentry_exit_cond_resched_dynamic_enabled	raw_irqentry_exit_cond_resched
+#define irqentry_exit_cond_resched_dynamic_disabled	NULL
+DECLARE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched);
+#define irqentry_exit_cond_resched()	static_call(irqentry_exit_cond_resched)()
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+DECLARE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
+void dynamic_irqentry_exit_cond_resched(void);
+#define irqentry_exit_cond_resched()	dynamic_irqentry_exit_cond_resched()
+#endif
+#else /* CONFIG_PREEMPT_DYNAMIC */
+#define irqentry_exit_cond_resched()	raw_irqentry_exit_cond_resched()
+#endif /* CONFIG_PREEMPT_DYNAMIC */
+
+/**
+ * irqentry_exit - Handle return from exception that used irqentry_enter()
+ * @regs:	Pointer to pt_regs (exception entry regs)
+ * @state:	Return value from matching call to irqentry_enter()
+ *
+ * Depending on the return target (kernel/user) this runs the necessary
+ * preemption and work checks if possible and required and returns to
+ * the caller with interrupts disabled and no further work pending.
+ *
+ * This is the last action before returning to the low level ASM code which
+ * just needs to return to the appropriate context.
+ *
+ * Counterpart to irqentry_enter().
+ */
+void noinstr irqentry_exit(struct pt_regs *regs, irqentry_state_t state);
+
+/**
+ * irqentry_nmi_enter - Handle NMI entry
+ * @regs:	Pointer to currents pt_regs
+ *
+ * Similar to irqentry_enter() but taking care of the NMI constraints.
+ */
+irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs);
+
+/**
+ * irqentry_nmi_exit - Handle return from NMI handling
+ * @regs:	Pointer to pt_regs (NMI entry regs)
+ * @irq_state:	Return value from matching call to irqentry_nmi_enter()
+ *
+ * Last action before returning to the low level assembly code.
+ *
+ * Counterpart to irqentry_nmi_enter().
+ */
+void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state);
+
+#endif
diff --git a/kernel/entry/Makefile b/kernel/entry/Makefile
index d4b8bd0af79b..77fcd83dd663 100644
--- a/kernel/entry/Makefile
+++ b/kernel/entry/Makefile
@@ -12,5 +12,6 @@ ccflags-$(CONFIG_TRACE_BRANCH_PROFILING) += -DDISABLE_BRANCH_PROFILING
 CFLAGS_REMOVE_common.o	 = -fstack-protector -fstack-protector-strong
 CFLAGS_common.o		+= -fno-stack-protector
 
-obj-$(CONFIG_GENERIC_ENTRY) 		+= common.o syscall_user_dispatch.o
+obj-$(CONFIG_GENERIC_IRQ_ENTRY) 	+= common.o
+obj-$(CONFIG_GENERIC_SYSCALL) 		+= syscall-common.o syscall_user_dispatch.o
 obj-$(CONFIG_KVM_XFER_TO_GUEST_WORK)	+= kvm.o
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index a8dd1f27417c..b82032777310 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -1,84 +1,13 @@
 // SPDX-License-Identifier: GPL-2.0
 
-#include <linux/context_tracking.h>
-#include <linux/entry-common.h>
+#include <linux/irq-entry-common.h>
 #include <linux/resume_user_mode.h>
 #include <linux/highmem.h>
 #include <linux/jump_label.h>
 #include <linux/kmsan.h>
 #include <linux/livepatch.h>
-#include <linux/audit.h>
 #include <linux/tick.h>
 
-#include "common.h"
-
-#define CREATE_TRACE_POINTS
-#include <trace/events/syscalls.h>
-
-static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
-{
-	if (unlikely(audit_context())) {
-		unsigned long args[6];
-
-		syscall_get_arguments(current, regs, args);
-		audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
-	}
-}
-
-long syscall_trace_enter(struct pt_regs *regs, long syscall,
-				unsigned long work)
-{
-	long ret = 0;
-
-	/*
-	 * Handle Syscall User Dispatch.  This must comes first, since
-	 * the ABI here can be something that doesn't make sense for
-	 * other syscall_work features.
-	 */
-	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
-		if (syscall_user_dispatch(regs))
-			return -1L;
-	}
-
-	/* Handle ptrace */
-	if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
-		ret = ptrace_report_syscall_entry(regs);
-		if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
-			return -1L;
-	}
-
-	/* Do seccomp after ptrace, to catch any tracer changes. */
-	if (work & SYSCALL_WORK_SECCOMP) {
-		ret = __secure_computing();
-		if (ret == -1L)
-			return ret;
-	}
-
-	/* Either of the above might have changed the syscall number */
-	syscall = syscall_get_nr(current, regs);
-
-	if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) {
-		trace_sys_enter(regs, syscall);
-		/*
-		 * Probes or BPF hooks in the tracepoint may have changed the
-		 * system call number as well.
-		 */
-		syscall = syscall_get_nr(current, regs);
-	}
-
-	syscall_enter_audit(regs, syscall);
-
-	return ret ? : syscall;
-}
-
-noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
-{
-	enter_from_user_mode(regs);
-	instrumentation_begin();
-	local_irq_enable();
-	instrumentation_end();
-}
-
 /* Workaround to allow gradual conversion of architecture code */
 void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
 
@@ -133,46 +62,6 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
 	return ti_work;
 }
 
-/*
- * If SYSCALL_EMU is set, then the only reason to report is when
- * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP).  This syscall
- * instruction has been already reported in syscall_enter_from_user_mode().
- */
-static inline bool report_single_step(unsigned long work)
-{
-	if (work & SYSCALL_WORK_SYSCALL_EMU)
-		return false;
-
-	return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
-}
-
-void syscall_exit_work(struct pt_regs *regs, unsigned long work)
-{
-	bool step;
-
-	/*
-	 * If the syscall was rolled back due to syscall user dispatching,
-	 * then the tracers below are not invoked for the same reason as
-	 * the entry side was not invoked in syscall_trace_enter(): The ABI
-	 * of these syscalls is unknown.
-	 */
-	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
-		if (unlikely(current->syscall_dispatch.on_dispatch)) {
-			current->syscall_dispatch.on_dispatch = false;
-			return;
-		}
-	}
-
-	audit_syscall_exit(regs);
-
-	if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
-		trace_sys_exit(regs, syscall_get_return_value(current, regs));
-
-	step = report_single_step(work);
-	if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
-		ptrace_report_syscall_exit(regs, step);
-}
-
 noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
 {
 	enter_from_user_mode(regs);
diff --git a/kernel/entry/syscall-common.c b/kernel/entry/syscall-common.c
new file mode 100644
index 000000000000..66e6ba7fa80c
--- /dev/null
+++ b/kernel/entry/syscall-common.c
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/audit.h>
+#include <linux/entry-common.h>
+#include "common.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/syscalls.h>
+
+static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
+{
+	if (unlikely(audit_context())) {
+		unsigned long args[6];
+
+		syscall_get_arguments(current, regs, args);
+		audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
+	}
+}
+
+long syscall_trace_enter(struct pt_regs *regs, long syscall,
+				unsigned long work)
+{
+	long ret = 0;
+
+	/*
+	 * Handle Syscall User Dispatch.  This must come first, since
+	 * the ABI here can be something that doesn't make sense for
+	 * other syscall_work features.
+	 */
+	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
+		if (syscall_user_dispatch(regs))
+			return -1L;
+	}
+
+	/* Handle ptrace */
+	if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
+		ret = ptrace_report_syscall_entry(regs);
+		if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
+			return -1L;
+	}
+
+	/* Do seccomp after ptrace, to catch any tracer changes. */
+	if (work & SYSCALL_WORK_SECCOMP) {
+		ret = __secure_computing();
+		if (ret == -1L)
+			return ret;
+	}
+
+	/* Either of the above might have changed the syscall number */
+	syscall = syscall_get_nr(current, regs);
+
+	if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) {
+		trace_sys_enter(regs, syscall);
+		/*
+		 * Probes or BPF hooks in the tracepoint may have changed the
+		 * system call number as well.
+		 */
+		syscall = syscall_get_nr(current, regs);
+	}
+
+	syscall_enter_audit(regs, syscall);
+
+	return ret ? : syscall;
+}
+
+noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
+{
+	enter_from_user_mode(regs);
+	instrumentation_begin();
+	local_irq_enable();
+	instrumentation_end();
+}
+
+/*
+ * If SYSCALL_EMU is set, then the only reason to report is when
+ * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP).  This syscall
+ * instruction has been already reported in syscall_enter_from_user_mode().
+ */
+static inline bool report_single_step(unsigned long work)
+{
+	if (work & SYSCALL_WORK_SYSCALL_EMU)
+		return false;
+
+	return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
+}
+
+void syscall_exit_work(struct pt_regs *regs, unsigned long work)
+{
+	bool step;
+
+	/*
+	 * If the syscall was rolled back due to syscall user dispatching,
+	 * then the tracers below are not invoked for the same reason as
+	 * the entry side was not invoked in syscall_trace_enter(): The ABI
+	 * of these syscalls is unknown.
+	 */
+	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
+		if (unlikely(current->syscall_dispatch.on_dispatch)) {
+			current->syscall_dispatch.on_dispatch = false;
+			return;
+		}
+	}
+
+	audit_syscall_exit(regs);
+
+	if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
+		trace_sys_exit(regs, syscall_get_return_value(current, regs));
+
+	step = report_single_step(work);
+	if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
+		ptrace_report_syscall_exit(regs, step);
+}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index dce50fa57471..e6269e7954f3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -69,8 +69,8 @@
 #include <linux/livepatch_sched.h>
 
 #ifdef CONFIG_PREEMPT_DYNAMIC
-# ifdef CONFIG_GENERIC_ENTRY
-#  include <linux/entry-common.h>
+# ifdef CONFIG_GENERIC_IRQ_ENTRY
+#  include <linux/irq-entry-common.h>
 # endif
 #endif
 
@@ -7427,8 +7427,8 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write);
 
 #ifdef CONFIG_PREEMPT_DYNAMIC
 
-#ifdef CONFIG_GENERIC_ENTRY
-#include <linux/entry-common.h>
+#ifdef CONFIG_GENERIC_IRQ_ENTRY
+#include <linux/irq-entry-common.h>
 #endif
 
 /*
-- 
cgit v1.2.3


From 9b8b84879d4adc506b0d3944e20b28d9f3f6994b Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Wed, 18 Jun 2025 15:00:45 +0900
Subject: block: Increase BLK_DEF_MAX_SECTORS_CAP

Back in 2015, commit d2be537c3ba3 ("block: bump BLK_DEF_MAX_SECTORS to
2560") increased the default maximum size of a block device I/O to 2560
sectors (1280 KiB) to "accommodate a 10-data-disk stripe write with
chunk size 128k". This choice is rather arbitrary and since then,
improvements to the block layer have software RAID drivers correctly
advertise their stripe width through chunk_sectors and abuses of
BLK_DEF_MAX_SECTORS_CAP by drivers (to set the HW limit rather than the
default user controlled maximum I/O size) have been fixed.

Since many block devices can benefit from a larger value of
BLK_DEF_MAX_SECTORS_CAP, and in particular HDDs, increase this value to
be 4MiB, or 8192 sectors.

And given that BLK_DEF_MAX_SECTORS_CAP is only used in the block layer
and should not be used by drivers directly, move this macro definition
to the block layer internal header file block/blk.h.

Suggested-by: Martin K . Petersen <martin.petersen@oracle.com>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Link: https://lore.kernel.org/r/20250618060045.37593-1-dlemoal@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk.h            | 9 +++++++++
 include/linux/blkdev.h | 9 ---------
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/block/blk.h b/block/blk.h
index 37ec459fe656..1141b343d0b5 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -13,6 +13,15 @@
 
 struct elevator_type;
 
+/*
+ * Default upper limit for the software max_sectors limit used for regular I/Os.
+ * This can be increased through sysfs.
+ *
+ * This should not be confused with the max_hw_sector limit that is entirely
+ * controlled by the block device driver, usually based on hardware limits.
+ */
+#define BLK_DEF_MAX_SECTORS_CAP	(SZ_4M >> SECTOR_SHIFT)
+
 #define	BLK_DEV_MAX_SECTORS	(LLONG_MAX >> 9)
 #define	BLK_MIN_SEGMENT_SIZE	4096
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a59880c809c7..5c626d23cbb2 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1220,15 +1220,6 @@ enum blk_default_limits {
 	BLK_SEG_BOUNDARY_MASK	= 0xFFFFFFFFUL,
 };
 
-/*
- * Default upper limit for the software max_sectors limit used for
- * regular file system I/O.  This can be increased through sysfs.
- *
- * Not to be confused with the max_hw_sector limit that is entirely
- * controlled by the driver, usually based on hardware limits.
- */
-#define BLK_DEF_MAX_SECTORS_CAP	2560u
-
 static inline struct queue_limits *bdev_limits(struct block_device *bdev)
 {
 	return &bdev_get_queue(bdev)->limits;
-- 
cgit v1.2.3


From 3f66ccbaaef3a0c5bd844eab04e3207b4061c546 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Wed, 25 Jun 2025 18:33:23 +0900
Subject: block: Make REQ_OP_ZONE_FINISH a write operation

REQ_OP_ZONE_FINISH is defined as "12", which makes
op_is_write(REQ_OP_ZONE_FINISH) return false, despite the fact that a
zone finish operation is an operation that modifies a zone (transition
it to full) and so should be considered as a write operation (albeit
one that does not transfer any data to the device).

Fix this by redefining REQ_OP_ZONE_FINISH to be an odd number (13), and
redefine REQ_OP_ZONE_RESET and REQ_OP_ZONE_RESET_ALL using sequential
odd numbers from that new value.

Fixes: 6c1b1da58f8c ("block: add zone open, close and finish operations")
Cc: stable@vger.kernel.org
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20250625093327.548866-2-dlemoal@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk_types.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 3d1577f07c1c..930daff207df 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -350,11 +350,11 @@ enum req_op {
 	/* Close a zone */
 	REQ_OP_ZONE_CLOSE	= (__force blk_opf_t)11,
 	/* Transition a zone to full */
-	REQ_OP_ZONE_FINISH	= (__force blk_opf_t)12,
+	REQ_OP_ZONE_FINISH	= (__force blk_opf_t)13,
 	/* reset a zone write pointer */
-	REQ_OP_ZONE_RESET	= (__force blk_opf_t)13,
+	REQ_OP_ZONE_RESET	= (__force blk_opf_t)15,
 	/* reset all the zone present on the device */
-	REQ_OP_ZONE_RESET_ALL	= (__force blk_opf_t)15,
+	REQ_OP_ZONE_RESET_ALL	= (__force blk_opf_t)17,
 
 	/* Driver private requests */
 	REQ_OP_DRV_IN		= (__force blk_opf_t)34,
-- 
cgit v1.2.3


From f70291411ba20d50008db90a6f0731efac27872c Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Wed, 25 Jun 2025 18:33:24 +0900
Subject: block: Introduce bio_needs_zone_write_plugging()

In preparation for fixing device mapper zone write handling, introduce
the inline helper function bio_needs_zone_write_plugging() to test if a
BIO requires handling through zone write plugging using the function
blk_zone_plug_bio(). This function returns true for any write
(op_is_write(bio) == true) operation directed at a zoned block device
using zone write plugging, that is, a block device with a disk that has
a zone write plug hash table.

This helper allows simplifying the check on entry to blk_zone_plug_bio()
and is used to protect calls to it for blk-mq devices and DM devices.

Fixes: f211268ed1f9 ("dm: Use the block layer zone append emulation")
Cc: stable@vger.kernel.org
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20250625093327.548866-3-dlemoal@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c         |  6 ++++--
 block/blk-zoned.c      | 20 +-----------------
 drivers/md/dm.c        |  4 +++-
 include/linux/blkdev.h | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 63 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4806b867e37d..0c61492724d2 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3169,8 +3169,10 @@ void blk_mq_submit_bio(struct bio *bio)
 	if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
 		goto queue_exit;
 
-	if (blk_queue_is_zoned(q) && blk_zone_plug_bio(bio, nr_segs))
-		goto queue_exit;
+	if (bio_needs_zone_write_plugging(bio)) {
+		if (blk_zone_plug_bio(bio, nr_segs))
+			goto queue_exit;
+	}
 
 new_request:
 	if (rq) {
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 351d659280e1..efe71b1a1da1 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -1116,25 +1116,7 @@ bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
 {
 	struct block_device *bdev = bio->bi_bdev;
 
-	if (!bdev->bd_disk->zone_wplugs_hash)
-		return false;
-
-	/*
-	 * If the BIO already has the plugging flag set, then it was already
-	 * handled through this path and this is a submission from the zone
-	 * plug bio submit work.
-	 */
-	if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
-		return false;
-
-	/*
-	 * We do not need to do anything special for empty flush BIOs, e.g
-	 * BIOs such as issued by blkdev_issue_flush(). The is because it is
-	 * the responsibility of the user to first wait for the completion of
-	 * write operations for flush to have any effect on the persistence of
-	 * the written data.
-	 */
-	if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
+	if (WARN_ON_ONCE(!bdev->bd_disk->zone_wplugs_hash))
 		return false;
 
 	/*
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 1726f0f828cc..6e3de50eae47 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1788,7 +1788,9 @@ static inline bool dm_zone_bio_needs_split(struct mapped_device *md,
 }
 static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio)
 {
-	return dm_emulate_zone_append(md) && blk_zone_plug_bio(bio, 0);
+	if (!bio_needs_zone_write_plugging(bio))
+		return false;
+	return blk_zone_plug_bio(bio, 0);
 }
 
 static blk_status_t __send_zone_reset_all_emulated(struct clone_info *ci,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 5c626d23cbb2..a51f92b6c340 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -837,6 +837,55 @@ static inline unsigned int disk_nr_zones(struct gendisk *disk)
 {
 	return disk->nr_zones;
 }
+
+/**
+ * bio_needs_zone_write_plugging - Check if a BIO needs to be handled with zone
+ *				   write plugging
+ * @bio: The BIO being submitted
+ *
+ * Return true whenever @bio execution needs to be handled through zone
+ * write plugging (using blk_zone_plug_bio()). Return false otherwise.
+ */
+static inline bool bio_needs_zone_write_plugging(struct bio *bio)
+{
+	enum req_op op = bio_op(bio);
+
+	/*
+	 * Only zoned block devices have a zone write plug hash table. But not
+	 * all of them have one (e.g. DM devices may not need one).
+	 */
+	if (!bio->bi_bdev->bd_disk->zone_wplugs_hash)
+		return false;
+
+	/* Only write operations need zone write plugging. */
+	if (!op_is_write(op))
+		return false;
+
+	/* Ignore empty flush */
+	if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
+		return false;
+
+	/* Ignore BIOs that already have been handled by zone write plugging. */
+	if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
+		return false;
+
+	/*
+	 * All zone write operations must be handled through zone write plugging
+	 * using blk_zone_plug_bio().
+	 */
+	switch (op) {
+	case REQ_OP_ZONE_APPEND:
+	case REQ_OP_WRITE:
+	case REQ_OP_WRITE_ZEROES:
+	case REQ_OP_ZONE_FINISH:
+	case REQ_OP_ZONE_RESET:
+	case REQ_OP_ZONE_RESET_ALL:
+		return true;
+	default:
+		return false;
+	}
+}
+
 bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs);
 
 /**
@@ -866,6 +915,12 @@ static inline unsigned int disk_nr_zones(struct gendisk *disk)
 {
 	return 0;
 }
+
+static inline bool bio_needs_zone_write_plugging(struct bio *bio)
+{
+	return false;
+}
+
 static inline bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
 {
 	return false;
-- 
cgit v1.2.3


From 38446014648c9f7b2843f87517c8f2b73906bb40 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 25 Jun 2025 13:34:58 +0200
Subject: block: don't merge different kinds of P2P transfers in a single bio

To get out of the DMA mapping helpers having to check every segment for
its P2P status, ensure that bios either contain P2P transfers or non-P2P
transfers, and that a P2P bio only contains ranges from a single device.

This means we do the page zone access in the bio add path where it should
be still page hot, and will only have to do the fairly expensive P2P topology
lookup once per bio down in the DMA mapping path, and only for already
marked bios.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Link: https://lore.kernel.org/r/20250625113531.522027-2-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio-integrity.c     |  3 +++
 block/bio.c               | 20 +++++++++++++-------
 include/linux/blk_types.h |  2 ++
 3 files changed, 18 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 10912988c8f5..6b077ca937f6 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -128,6 +128,9 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
 	if (bip->bip_vcnt > 0) {
 		struct bio_vec *bv = &bip->bip_vec[bip->bip_vcnt - 1];
 
+		if (!zone_device_pages_have_same_pgmap(bv->bv_page, page))
+			return 0;
+
 		if (bvec_try_merge_hw_page(q, bv, page, len, offset)) {
 			bip->bip_iter.bi_size += len;
 			return len;
diff --git a/block/bio.c b/block/bio.c
index 3c0a558c90f5..92c512e876c8 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -930,8 +930,6 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page,
 		return false;
 	if (xen_domain() && !xen_biovec_phys_mergeable(bv, page))
 		return false;
-	if (!zone_device_pages_have_same_pgmap(bv->bv_page, page))
-		return false;
 
 	if ((vec_end_addr & PAGE_MASK) != ((page_addr + off) & PAGE_MASK)) {
 		if (IS_ENABLED(CONFIG_KMSAN))
@@ -982,6 +980,9 @@ void __bio_add_page(struct bio *bio, struct page *page,
 	WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
 	WARN_ON_ONCE(bio_full(bio, len));
 
+	if (is_pci_p2pdma_page(page))
+		bio->bi_opf |= REQ_P2PDMA | REQ_NOMERGE;
+
 	bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, off);
 	bio->bi_iter.bi_size += len;
 	bio->bi_vcnt++;
@@ -1022,11 +1023,16 @@ int bio_add_page(struct bio *bio, struct page *page,
 	if (bio->bi_iter.bi_size > UINT_MAX - len)
 		return 0;
 
-	if (bio->bi_vcnt > 0 &&
-	    bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
-				page, len, offset)) {
-		bio->bi_iter.bi_size += len;
-		return len;
+	if (bio->bi_vcnt > 0) {
+		struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
+
+		if (!zone_device_pages_have_same_pgmap(bv->bv_page, page))
+			return 0;
+
+		if (bvec_try_merge_page(bv, page, len, offset)) {
+			bio->bi_iter.bi_size += len;
+			return len;
+		}
 	}
 
 	if (bio->bi_vcnt >= bio->bi_max_vecs)
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 930daff207df..09b99d52fd36 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -386,6 +386,7 @@ enum req_flag_bits {
 	__REQ_DRV,		/* for driver use */
 	__REQ_FS_PRIVATE,	/* for file system (submitter) use */
 	__REQ_ATOMIC,		/* for atomic write operations */
+	__REQ_P2PDMA,		/* contains P2P DMA pages */
 	/*
 	 * Command specific flags, keep last:
 	 */
@@ -418,6 +419,7 @@ enum req_flag_bits {
 #define REQ_DRV		(__force blk_opf_t)(1ULL << __REQ_DRV)
 #define REQ_FS_PRIVATE	(__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE)
 #define REQ_ATOMIC	(__force blk_opf_t)(1ULL << __REQ_ATOMIC)
+#define REQ_P2PDMA	(__force blk_opf_t)(1ULL << __REQ_P2PDMA)
 
 #define REQ_NOUNMAP	(__force blk_opf_t)(1ULL << __REQ_NOUNMAP)
 
-- 
cgit v1.2.3


From 858299dc61603670823f8c1d62bf3fc7af44b18b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 25 Jun 2025 13:34:59 +0200
Subject: block: add scatterlist-less DMA mapping helpers

Add a new blk_rq_dma_map / blk_rq_dma_unmap pair that does away with
the wasteful scatterlist structure.  Instead it uses the mapping iterator
to either add segments to the IOVA for IOMMU operations, or just maps
them one by one for the direct mapping.  For the IOMMU case instead of
a scatterlist with an entry for each segment, only a single [dma_addr,len]
pair needs to be stored for processing a request, and for the direct
mapping the per-segment allocation shrinks from
[page,offset,len,dma_addr,dma_len] to just [dma_addr,len].

One big difference to the scatterlist API, which could be considered a
downside, is that the IOVA collapsing only works when the driver sets
a virt_boundary that matches the IOMMU granule.  For NVMe this is done
already so it works perfectly.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Link: https://lore.kernel.org/r/20250625113531.522027-3-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-dma.c         | 161 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/blk-mq-dma.h |  63 ++++++++++++++++++
 2 files changed, 224 insertions(+)
 create mode 100644 include/linux/blk-mq-dma.h

(limited to 'include')

diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
index 82bae475dfa4..ad283017caef 100644
--- a/block/blk-mq-dma.c
+++ b/block/blk-mq-dma.c
@@ -2,6 +2,7 @@
 /*
  * Copyright (C) 2025 Christoph Hellwig
  */
+#include <linux/blk-mq-dma.h>
 #include "blk.h"
 
 struct phys_vec {
@@ -61,6 +62,166 @@ static bool blk_map_iter_next(struct request *req, struct req_iterator *iter,
 	return true;
 }
 
+/*
+ * The IOVA-based DMA API wants to be able to coalesce at the minimal IOMMU page
+ * size granularity (which is guaranteed to be <= PAGE_SIZE and usually 4k), so
+ * we need to ensure our segments are aligned to this as well.
+ *
+ * Note that there is no point in using the slightly more complicated IOVA based
+ * path for single segment mappings.
+ */
+static inline bool blk_can_dma_map_iova(struct request *req,
+		struct device *dma_dev)
+{
+	return !((queue_virt_boundary(req->q) + 1) &
+		dma_get_merge_boundary(dma_dev));
+}
+
+static bool blk_dma_map_bus(struct blk_dma_iter *iter, struct phys_vec *vec)
+{
+	iter->addr = pci_p2pdma_bus_addr_map(&iter->p2pdma, vec->paddr);
+	iter->len = vec->len;
+	return true;
+}
+
+static bool blk_dma_map_direct(struct request *req, struct device *dma_dev,
+		struct blk_dma_iter *iter, struct phys_vec *vec)
+{
+	iter->addr = dma_map_page(dma_dev, phys_to_page(vec->paddr),
+			offset_in_page(vec->paddr), vec->len, rq_dma_dir(req));
+	if (dma_mapping_error(dma_dev, iter->addr)) {
+		iter->status = BLK_STS_RESOURCE;
+		return false;
+	}
+	iter->len = vec->len;
+	return true;
+}
+
+static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev,
+		struct dma_iova_state *state, struct blk_dma_iter *iter,
+		struct phys_vec *vec)
+{
+	enum dma_data_direction dir = rq_dma_dir(req);
+	unsigned int mapped = 0;
+	int error;
+
+	iter->addr = state->addr;
+	iter->len = dma_iova_size(state);
+
+	do {
+		error = dma_iova_link(dma_dev, state, vec->paddr, mapped,
+				vec->len, dir, 0);
+		if (error)
+			break;
+		mapped += vec->len;
+	} while (blk_map_iter_next(req, &iter->iter, vec));
+
+	error = dma_iova_sync(dma_dev, state, 0, mapped);
+	if (error) {
+		iter->status = errno_to_blk_status(error);
+		return false;
+	}
+
+	return true;
+}
+
+/**
+ * blk_rq_dma_map_iter_start - map the first DMA segment for a request
+ * @req:	request to map
+ * @dma_dev:	device to map to
+ * @state:	DMA IOVA state
+ * @iter:	block layer DMA iterator
+ *
+ * Start DMA mapping @req to @dma_dev.  @state and @iter are provided by the
+ * caller and don't need to be initialized.  @state needs to be stored for use
+ * at unmap time, @iter is only needed at map time.
+ *
+ * Returns %false if there is no segment to map, including due to an error, or
+ * %true if it did map a segment.
+ *
+ * If a segment was mapped, the DMA address for it is returned in @iter.addr and
+ * the length in @iter.len.  If no segment was mapped the status code is
+ * returned in @iter.status.
+ *
+ * The caller can call blk_rq_dma_map_coalesce() to check if further segments
+ * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next()
+ * to try to map the following segments.
+ */
+bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
+		struct dma_iova_state *state, struct blk_dma_iter *iter)
+{
+	unsigned int total_len = blk_rq_payload_bytes(req);
+	struct phys_vec vec;
+
+	iter->iter.bio = req->bio;
+	iter->iter.iter = req->bio->bi_iter;
+	memset(&iter->p2pdma, 0, sizeof(iter->p2pdma));
+	iter->status = BLK_STS_OK;
+
+	/*
+	 * Grab the first segment ASAP because we'll need it to check for P2P
+	 * transfers.
+	 */
+	if (!blk_map_iter_next(req, &iter->iter, &vec))
+		return false;
+
+	if (IS_ENABLED(CONFIG_PCI_P2PDMA) && (req->cmd_flags & REQ_P2PDMA)) {
+		switch (pci_p2pdma_state(&iter->p2pdma, dma_dev,
+					 phys_to_page(vec.paddr))) {
+		case PCI_P2PDMA_MAP_BUS_ADDR:
+			return blk_dma_map_bus(iter, &vec);
+		case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+			/*
+			 * P2P transfers through the host bridge are treated the
+			 * same as non-P2P transfers below and during unmap.
+			 */
+			req->cmd_flags &= ~REQ_P2PDMA;
+			break;
+		default:
+			iter->status = BLK_STS_INVAL;
+			return false;
+		}
+	}
+
+	if (blk_can_dma_map_iova(req, dma_dev) &&
+	    dma_iova_try_alloc(dma_dev, state, vec.paddr, total_len))
+		return blk_rq_dma_map_iova(req, dma_dev, state, iter, &vec);
+	return blk_dma_map_direct(req, dma_dev, iter, &vec);
+}
+EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start);
+
+/**
+ * blk_rq_dma_map_iter_next - map the next DMA segment for a request
+ * @req:	request to map
+ * @dma_dev:	device to map to
+ * @state:	DMA IOVA state
+ * @iter:	block layer DMA iterator
+ *
+ * Iterate to the next mapping after a previous call to
+ * blk_rq_dma_map_iter_start().  See there for a detailed description of the
+ * arguments.
+ *
+ * Returns %false if there is no segment to map, including due to an error, or
+ * %true if it did map a segment.
+ *
+ * If a segment was mapped, the DMA address for it is returned in @iter.addr and
+ * the length in @iter.len.  If no segment was mapped the status code is
+ * returned in @iter.status.
+ */
+bool blk_rq_dma_map_iter_next(struct request *req, struct device *dma_dev,
+		struct dma_iova_state *state, struct blk_dma_iter *iter)
+{
+	struct phys_vec vec;
+
+	if (!blk_map_iter_next(req, &iter->iter, &vec))
+		return false;
+
+	if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR)
+		return blk_dma_map_bus(iter, &vec);
+	return blk_dma_map_direct(req, dma_dev, iter, &vec);
+}
+EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_next);
+
 static inline struct scatterlist *
 blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist)
 {
diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h
new file mode 100644
index 000000000000..c26a01aeae00
--- /dev/null
+++ b/include/linux/blk-mq-dma.h
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef BLK_MQ_DMA_H
+#define BLK_MQ_DMA_H
+
+#include <linux/blk-mq.h>
+#include <linux/pci-p2pdma.h>
+
+struct blk_dma_iter {
+	/* Output address range for this iteration */
+	dma_addr_t			addr;
+	u32				len;
+
+	/* Status code. Only valid when blk_rq_dma_map_iter_* returned false */
+	blk_status_t			status;
+
+	/* Internal to blk_rq_dma_map_iter_* */
+	struct req_iterator		iter;
+	struct pci_p2pdma_map_state	p2pdma;
+};
+
+bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
+		struct dma_iova_state *state, struct blk_dma_iter *iter);
+bool blk_rq_dma_map_iter_next(struct request *req, struct device *dma_dev,
+		struct dma_iova_state *state, struct blk_dma_iter *iter);
+
+/**
+ * blk_rq_dma_map_coalesce - were all segments coalesced?
+ * @state: DMA state to check
+ *
+ * Returns true if blk_rq_dma_map_iter_start coalesced all segments into a
+ * single DMA range.
+ */
+static inline bool blk_rq_dma_map_coalesce(struct dma_iova_state *state)
+{
+	return dma_use_iova(state);
+}
+
+/**
+ * blk_rq_dma_unmap - try to DMA unmap a request
+ * @req:	request to unmap
+ * @dma_dev:	device to unmap from
+ * @state:	DMA IOVA state
+ * @mapped_len: number of bytes to unmap
+ *
+ * Returns %false if the callers need to manually unmap every DMA segment
+ * mapped using @iter or %true if no work is left to be done.
+ */
+static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev,
+		struct dma_iova_state *state, size_t mapped_len)
+{
+	if (req->cmd_flags & REQ_P2PDMA)
+		return true;
+
+	if (dma_use_iova(state)) {
+		dma_iova_destroy(dma_dev, state, mapped_len, rq_dma_dir(req),
+				 0);
+		return true;
+	}
+
+	return !dma_need_unmap(dma_dev);
+}
+
+#endif /* BLK_MQ_DMA_H */
-- 
cgit v1.2.3


From 2ad26b7bedcd4941e6dafa1851e2054b369b9d25 Mon Sep 17 00:00:00 2001
From: Nuno Sá <nuno.sa@analog.com>
Date: Mon, 19 May 2025 16:41:08 +0100
Subject: include: linux: move adi-axi-common.h out of fpga
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The adi-axi-common.h header has some common defines used in various ADI
IPs. However they are not specific for any fpga manager so it's
questionable for the header to live under include/linux/fpga. Hence
let's just move one directory up and update all users.

Suggested-by: Xu Yilun <yilun.xu@linux.intel.com>
Acked-by: Xu Yilun <yilun.xu@intel.com>
Acked-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> # for IIO
Signed-off-by: Nuno Sá <nuno.sa@analog.com>
Link: https://lore.kernel.org/r/20250519-dev-axi-clkgen-limits-v6-3-bc4b3b61d1d4@analog.com
Acked-by: Mark Brown <broonie@kernel.org>
Acked-by: Uwe Kleine-König <ukleinek@kernel.org>
Reviewed-by: David Lechner <dlechner@baylibre.com>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/dma/dma-axi-dmac.c          |  2 +-
 drivers/hwmon/axi-fan-control.c     |  2 +-
 drivers/iio/adc/adi-axi-adc.c       |  3 +--
 drivers/iio/dac/adi-axi-dac.c       |  2 +-
 drivers/pwm/pwm-axi-pwmgen.c        |  2 +-
 drivers/spi/spi-axi-spi-engine.c    |  2 +-
 include/linux/adi-axi-common.h      | 23 +++++++++++++++++++++++
 include/linux/fpga/adi-axi-common.h | 23 -----------------------
 8 files changed, 29 insertions(+), 30 deletions(-)
 create mode 100644 include/linux/adi-axi-common.h
 delete mode 100644 include/linux/fpga/adi-axi-common.h

(limited to 'include')

diff --git a/drivers/dma/dma-axi-dmac.c b/drivers/dma/dma-axi-dmac.c
index 36943b0c6d60..5b06b0dc67ee 100644
--- a/drivers/dma/dma-axi-dmac.c
+++ b/drivers/dma/dma-axi-dmac.c
@@ -6,6 +6,7 @@
  *  Author: Lars-Peter Clausen <lars@metafoo.de>
  */
 
+#include <linux/adi-axi-common.h>
 #include <linux/bitfield.h>
 #include <linux/clk.h>
 #include <linux/device.h>
@@ -22,7 +23,6 @@
 #include <linux/platform_device.h>
 #include <linux/regmap.h>
 #include <linux/slab.h>
-#include <linux/fpga/adi-axi-common.h>
 
 #include <dt-bindings/dma/axi-dmac.h>
 
diff --git a/drivers/hwmon/axi-fan-control.c b/drivers/hwmon/axi-fan-control.c
index 35c862eb158b..b7bb325c3ad9 100644
--- a/drivers/hwmon/axi-fan-control.c
+++ b/drivers/hwmon/axi-fan-control.c
@@ -4,9 +4,9 @@
  *
  * Copyright 2019 Analog Devices Inc.
  */
+#include <linux/adi-axi-common.h>
 #include <linux/bits.h>
 #include <linux/clk.h>
-#include <linux/fpga/adi-axi-common.h>
 #include <linux/hwmon.h>
 #include <linux/hwmon-sysfs.h>
 #include <linux/interrupt.h>
diff --git a/drivers/iio/adc/adi-axi-adc.c b/drivers/iio/adc/adi-axi-adc.c
index 4116c44197b8..b3cc78187c9d 100644
--- a/drivers/iio/adc/adi-axi-adc.c
+++ b/drivers/iio/adc/adi-axi-adc.c
@@ -6,6 +6,7 @@
  * Copyright 2012-2020 Analog Devices Inc.
  */
 
+#include <linux/adi-axi-common.h>
 #include <linux/bitfield.h>
 #include <linux/cleanup.h>
 #include <linux/clk.h>
@@ -20,8 +21,6 @@
 #include <linux/regmap.h>
 #include <linux/slab.h>
 
-#include <linux/fpga/adi-axi-common.h>
-
 #include <linux/iio/backend.h>
 #include <linux/iio/buffer-dmaengine.h>
 #include <linux/iio/buffer.h>
diff --git a/drivers/iio/dac/adi-axi-dac.c b/drivers/iio/dac/adi-axi-dac.c
index 33faba4b02c2..890a0ac0d85e 100644
--- a/drivers/iio/dac/adi-axi-dac.c
+++ b/drivers/iio/dac/adi-axi-dac.c
@@ -5,6 +5,7 @@
  *
  * Copyright 2016-2024 Analog Devices Inc.
  */
+#include <linux/adi-axi-common.h>
 #include <linux/bitfield.h>
 #include <linux/bits.h>
 #include <linux/cleanup.h>
@@ -23,7 +24,6 @@
 #include <linux/regmap.h>
 #include <linux/units.h>
 
-#include <linux/fpga/adi-axi-common.h>
 #include <linux/iio/backend.h>
 #include <linux/iio/buffer-dmaengine.h>
 #include <linux/iio/buffer.h>
diff --git a/drivers/pwm/pwm-axi-pwmgen.c b/drivers/pwm/pwm-axi-pwmgen.c
index 60dcd3542373..b40522f01002 100644
--- a/drivers/pwm/pwm-axi-pwmgen.c
+++ b/drivers/pwm/pwm-axi-pwmgen.c
@@ -18,10 +18,10 @@
  * - Supports normal polarity. Does not support changing polarity.
  * - On disable, the PWM output becomes low (inactive).
  */
+#include <linux/adi-axi-common.h>
 #include <linux/bits.h>
 #include <linux/clk.h>
 #include <linux/err.h>
-#include <linux/fpga/adi-axi-common.h>
 #include <linux/io.h>
 #include <linux/minmax.h>
 #include <linux/module.h>
diff --git a/drivers/spi/spi-axi-spi-engine.c b/drivers/spi/spi-axi-spi-engine.c
index 8cc19934b48b..512d53a8ef4d 100644
--- a/drivers/spi/spi-axi-spi-engine.c
+++ b/drivers/spi/spi-axi-spi-engine.c
@@ -6,12 +6,12 @@
  *  Author: Lars-Peter Clausen <lars@metafoo.de>
  */
 
+#include <linux/adi-axi-common.h>
 #include <linux/bitfield.h>
 #include <linux/bitops.h>
 #include <linux/clk.h>
 #include <linux/completion.h>
 #include <linux/dmaengine.h>
-#include <linux/fpga/adi-axi-common.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
 #include <linux/iopoll.h>
diff --git a/include/linux/adi-axi-common.h b/include/linux/adi-axi-common.h
new file mode 100644
index 000000000000..141ac3f251e6
--- /dev/null
+++ b/include/linux/adi-axi-common.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Analog Devices AXI common registers & definitions
+ *
+ * Copyright 2019 Analog Devices Inc.
+ *
+ * https://wiki.analog.com/resources/fpga/docs/axi_ip
+ * https://wiki.analog.com/resources/fpga/docs/hdl/regmap
+ */
+
+#ifndef ADI_AXI_COMMON_H_
+#define ADI_AXI_COMMON_H_
+
+#define ADI_AXI_REG_VERSION			0x0000
+
+#define ADI_AXI_PCORE_VER(major, minor, patch)	\
+	(((major) << 16) | ((minor) << 8) | (patch))
+
+#define ADI_AXI_PCORE_VER_MAJOR(version)	(((version) >> 16) & 0xff)
+#define ADI_AXI_PCORE_VER_MINOR(version)	(((version) >> 8) & 0xff)
+#define ADI_AXI_PCORE_VER_PATCH(version)	((version) & 0xff)
+
+#endif /* ADI_AXI_COMMON_H_ */
diff --git a/include/linux/fpga/adi-axi-common.h b/include/linux/fpga/adi-axi-common.h
deleted file mode 100644
index 141ac3f251e6..000000000000
--- a/include/linux/fpga/adi-axi-common.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Analog Devices AXI common registers & definitions
- *
- * Copyright 2019 Analog Devices Inc.
- *
- * https://wiki.analog.com/resources/fpga/docs/axi_ip
- * https://wiki.analog.com/resources/fpga/docs/hdl/regmap
- */
-
-#ifndef ADI_AXI_COMMON_H_
-#define ADI_AXI_COMMON_H_
-
-#define ADI_AXI_REG_VERSION			0x0000
-
-#define ADI_AXI_PCORE_VER(major, minor, patch)	\
-	(((major) << 16) | ((minor) << 8) | (patch))
-
-#define ADI_AXI_PCORE_VER_MAJOR(version)	(((version) >> 16) & 0xff)
-#define ADI_AXI_PCORE_VER_MINOR(version)	(((version) >> 8) & 0xff)
-#define ADI_AXI_PCORE_VER_PATCH(version)	((version) & 0xff)
-
-#endif /* ADI_AXI_COMMON_H_ */
-- 
cgit v1.2.3


From 6fc942f777b1a94fd2dacd4836aaf7e3b440baf8 Mon Sep 17 00:00:00 2001
From: Nuno Sá <nuno.sa@analog.com>
Date: Mon, 19 May 2025 16:41:09 +0100
Subject: include: adi-axi-common: add new helper macros
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add new helper macros and enums to help identifying the platform and some
characteristics of it at runtime.

Signed-off-by: Nuno Sá <nuno.sa@analog.com>
Link: https://lore.kernel.org/r/20250519-dev-axi-clkgen-limits-v6-4-bc4b3b61d1d4@analog.com
Reviewed-by: David Lechner <dlechner@baylibre.com>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/linux/adi-axi-common.h | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

(limited to 'include')

diff --git a/include/linux/adi-axi-common.h b/include/linux/adi-axi-common.h
index 141ac3f251e6..f64f4ad4beda 100644
--- a/include/linux/adi-axi-common.h
+++ b/include/linux/adi-axi-common.h
@@ -12,6 +12,7 @@
 #define ADI_AXI_COMMON_H_
 
 #define ADI_AXI_REG_VERSION			0x0000
+#define ADI_AXI_REG_FPGA_INFO			0x001C
 
 #define ADI_AXI_PCORE_VER(major, minor, patch)	\
 	(((major) << 16) | ((minor) << 8) | (patch))
@@ -20,4 +21,36 @@
 #define ADI_AXI_PCORE_VER_MINOR(version)	(((version) >> 8) & 0xff)
 #define ADI_AXI_PCORE_VER_PATCH(version)	((version) & 0xff)
 
+#define ADI_AXI_INFO_FPGA_TECH(info)            (((info) >> 24) & 0xff)
+#define ADI_AXI_INFO_FPGA_FAMILY(info)          (((info) >> 16) & 0xff)
+#define ADI_AXI_INFO_FPGA_SPEED_GRADE(info)     (((info) >> 8) & 0xff)
+
+enum adi_axi_fpga_technology {
+	ADI_AXI_FPGA_TECH_UNKNOWN = 0,
+	ADI_AXI_FPGA_TECH_SERIES7,
+	ADI_AXI_FPGA_TECH_ULTRASCALE,
+	ADI_AXI_FPGA_TECH_ULTRASCALE_PLUS,
+};
+
+enum adi_axi_fpga_family {
+	ADI_AXI_FPGA_FAMILY_UNKNOWN = 0,
+	ADI_AXI_FPGA_FAMILY_ARTIX,
+	ADI_AXI_FPGA_FAMILY_KINTEX,
+	ADI_AXI_FPGA_FAMILY_VIRTEX,
+	ADI_AXI_FPGA_FAMILY_ZYNQ,
+};
+
+enum adi_axi_fpga_speed_grade {
+	ADI_AXI_FPGA_SPEED_UNKNOWN      = 0,
+	ADI_AXI_FPGA_SPEED_1    = 10,
+	ADI_AXI_FPGA_SPEED_1L   = 11,
+	ADI_AXI_FPGA_SPEED_1H   = 12,
+	ADI_AXI_FPGA_SPEED_1HV  = 13,
+	ADI_AXI_FPGA_SPEED_1LV  = 14,
+	ADI_AXI_FPGA_SPEED_2    = 20,
+	ADI_AXI_FPGA_SPEED_2L   = 21,
+	ADI_AXI_FPGA_SPEED_2LV  = 22,
+	ADI_AXI_FPGA_SPEED_3    = 30,
+};
+
 #endif /* ADI_AXI_COMMON_H_ */
-- 
cgit v1.2.3


From 03dc03fa0432a9160c4fcbdb86f274e6b4587972 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Thu, 26 Jun 2025 10:31:10 +0300
Subject: neighbor: Add NTF_EXT_VALIDATED flag for externally validated entries

tl;dr
=====

Add a new neighbor flag ("extern_valid") that can be used to indicate to
the kernel that a neighbor entry was learned and determined to be valid
externally. The kernel will not try to remove or invalidate such an
entry, leaving these decisions to the user space control plane. This is
needed for EVPN multi-homing where a neighbor entry for a multi-homed
host needs to be synced across all the VTEPs among which the host is
multi-homed.

Background
==========

In a typical EVPN multi-homing setup each host is multi-homed using a
set of links called ES (Ethernet Segment, i.e., LAG) to multiple leaf
switches (VTEPs). VTEPs that are connected to the same ES are called ES
peers.

When a neighbor entry is learned on a VTEP, it is distributed to both ES
peers and remote VTEPs using EVPN MAC/IP advertisement routes. ES peers
use the neighbor entry when routing traffic towards the multi-homed host
and remote VTEPs use it for ARP/NS suppression.

Motivation
==========

If the ES link between a host and the VTEP on which the neighbor entry
was locally learned goes down, the EVPN MAC/IP advertisement route will
be withdrawn and the neighbor entries will be removed from both ES peers
and remote VTEPs. Routing towards the multi-homed host and ARP/NS
suppression can fail until another ES peer locally learns the neighbor
entry and distributes it via an EVPN MAC/IP advertisement route.

"draft-rbickhart-evpn-ip-mac-proxy-adv-03" [1] suggests avoiding these
intermittent failures by having the ES peers install the neighbor
entries as before, but also injecting EVPN MAC/IP advertisement routes
with a proxy indication. When the previously mentioned ES link goes down
and the original EVPN MAC/IP advertisement route is withdrawn, the ES
peers will not withdraw their neighbor entries, but instead start aging
timers for the proxy indication.

If an ES peer locally learns the neighbor entry (i.e., it becomes
"reachable"), it will restart its aging timer for the entry and emit an
EVPN MAC/IP advertisement route without a proxy indication. An ES peer
will stop its aging timer for the proxy indication if it observes the
removal of the proxy indication from at least one of the ES peers
advertising the entry.

In the event that the aging timer for the proxy indication expired, an
ES peer will withdraw its EVPN MAC/IP advertisement route. If the timer
expired on all ES peers and they all withdrew their proxy
advertisements, the neighbor entry will be completely removed from the
EVPN fabric.

Implementation
==============

In the above scheme, when the control plane (e.g., FRR) advertises a
neighbor entry with a proxy indication, it expects the corresponding
entry in the data plane (i.e., the kernel) to remain valid and not be
removed due to garbage collection or loss of carrier. The control plane
also expects the kernel to notify it if the entry was learned locally
(i.e., became "reachable") so that it will remove the proxy indication
from the EVPN MAC/IP advertisement route. That is why these entries
cannot be programmed with dummy states such as "permanent" or "noarp".

Instead, add a new neighbor flag ("extern_valid") which indicates that
the entry was learned and determined to be valid externally and should
not be removed or invalidated by the kernel. The kernel can probe the
entry and notify user space when it becomes "reachable" (it is initially
installed as "stale"). However, if the kernel does not receive a
confirmation, have it return the entry to the "stale" state instead of
the "failed" state.

In other words, an entry marked with the "extern_valid" flag behaves
like any other dynamically learned entry other than the fact that the
kernel cannot remove or invalidate it.

One can argue that the "extern_valid" flag should not prevent garbage
collection and that instead a neighbor entry should be programmed with
both the "extern_valid" and "extern_learn" flags. There are two reasons
for not doing that:

1. Unclear why a control plane would like to program an entry that the
   kernel cannot invalidate but can completely remove.

2. The "extern_learn" flag is used by FRR for neighbor entries learned
   on remote VTEPs (for ARP/NS suppression) whereas here we are
   concerned with local entries. This distinction is currently irrelevant
   for the kernel, but might be relevant in the future.

Given that the flag only makes sense when the neighbor has a valid
state, reject attempts to add a neighbor with an invalid state and with
this flag set. For example:

 # ip neigh add 192.0.2.1 nud none dev br0.10 extern_valid
 Error: Cannot create externally validated neighbor with an invalid state.
 # ip neigh add 192.0.2.1 lladdr 00:11:22:33:44:55 nud stale dev br0.10 extern_valid
 # ip neigh replace 192.0.2.1 nud failed dev br0.10 extern_valid
 Error: Cannot mark neighbor as externally validated with an invalid state.

The above means that a neighbor cannot be created with the
"extern_valid" flag and flags such as "use" or "managed" as they result
in a neighbor being created with an invalid state ("none") and
immediately getting probed:

 # ip neigh add 192.0.2.1 lladdr 00:11:22:33:44:55 nud stale dev br0.10 extern_valid use
 Error: Cannot create externally validated neighbor with an invalid state.

However, these flags can be used together with "extern_valid" after the
neighbor was created with a valid state:

 # ip neigh add 192.0.2.1 lladdr 00:11:22:33:44:55 nud stale dev br0.10 extern_valid
 # ip neigh replace 192.0.2.1 lladdr 00:11:22:33:44:55 nud stale dev br0.10 extern_valid use

One consequence of preventing the kernel from invalidating a neighbor
entry is that by default it will only try to determine reachability
using unicast probes. This can be changed using the "mcast_resolicit"
sysctl:

 # sysctl net.ipv4.neigh.br0/10.mcast_resolicit
 0
 # tcpdump -nn -e -i br0.10 -Q out arp &
 # ip neigh replace 192.0.2.1 lladdr 00:11:22:33:44:55 nud stale dev br0.10 extern_valid use
 62:50:1d:11:93:6f > 00:11:22:33:44:55, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28
 62:50:1d:11:93:6f > 00:11:22:33:44:55, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28
 62:50:1d:11:93:6f > 00:11:22:33:44:55, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28
 # sysctl -wq net.ipv4.neigh.br0/10.mcast_resolicit=3
 # ip neigh replace 192.0.2.1 lladdr 00:11:22:33:44:55 nud stale dev br0.10 extern_valid use
 62:50:1d:11:93:6f > 00:11:22:33:44:55, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28
 62:50:1d:11:93:6f > 00:11:22:33:44:55, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28
 62:50:1d:11:93:6f > 00:11:22:33:44:55, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28
 62:50:1d:11:93:6f > ff:ff:ff:ff:ff:ff, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28
 62:50:1d:11:93:6f > ff:ff:ff:ff:ff:ff, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28
 62:50:1d:11:93:6f > ff:ff:ff:ff:ff:ff, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28

iproute2 patches can be found here [2].

[1] https://datatracker.ietf.org/doc/html/draft-rbickhart-evpn-ip-mac-proxy-adv-03
[2] https://github.com/idosch/iproute2/tree/submit/extern_valid_v1

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://patch.msgid.link/20250626073111.244534-2-idosch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/rt-neigh.yaml |  1 +
 include/net/neighbour.h                   |  4 +-
 include/uapi/linux/neighbour.h            |  5 ++
 net/core/neighbour.c                      | 79 +++++++++++++++++++++++++++----
 4 files changed, 78 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/Documentation/netlink/specs/rt-neigh.yaml b/Documentation/netlink/specs/rt-neigh.yaml
index 25cc2d528d2f..30a9ee16f128 100644
--- a/Documentation/netlink/specs/rt-neigh.yaml
+++ b/Documentation/netlink/specs/rt-neigh.yaml
@@ -79,6 +79,7 @@ definitions:
     entries:
       - managed
       - locked
+      - ext-validated
   -
     name: rtm-type
     type: enum
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index c7ce5ec7be23..7e865b14749d 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -261,13 +261,15 @@ static inline void *neighbour_priv(const struct neighbour *n)
 #define NEIGH_UPDATE_F_EXT_LEARNED		BIT(5)
 #define NEIGH_UPDATE_F_ISROUTER			BIT(6)
 #define NEIGH_UPDATE_F_ADMIN			BIT(7)
+#define NEIGH_UPDATE_F_EXT_VALIDATED		BIT(8)
 
 /* In-kernel representation for NDA_FLAGS_EXT flags: */
 #define NTF_OLD_MASK		0xff
 #define NTF_EXT_SHIFT		8
-#define NTF_EXT_MASK		(NTF_EXT_MANAGED)
+#define NTF_EXT_MASK		(NTF_EXT_MANAGED | NTF_EXT_EXT_VALIDATED)
 
 #define NTF_MANAGED		(NTF_EXT_MANAGED << NTF_EXT_SHIFT)
+#define NTF_EXT_VALIDATED	(NTF_EXT_EXT_VALIDATED << NTF_EXT_SHIFT)
 
 extern const struct nla_policy nda_policy[];
 
diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h
index b851c36ad25d..c34a81245f87 100644
--- a/include/uapi/linux/neighbour.h
+++ b/include/uapi/linux/neighbour.h
@@ -54,6 +54,7 @@ enum {
 /* Extended flags under NDA_FLAGS_EXT: */
 #define NTF_EXT_MANAGED		(1 << 0)
 #define NTF_EXT_LOCKED		(1 << 1)
+#define NTF_EXT_EXT_VALIDATED	(1 << 2)
 
 /*
  *	Neighbor Cache Entry States.
@@ -92,6 +93,10 @@ enum {
  * bridge in response to a host trying to communicate via a locked bridge port
  * with MAB enabled. Their purpose is to notify user space that a host requires
  * authentication.
+ *
+ * NTF_EXT_EXT_VALIDATED flagged neighbor entries were externally validated by
+ * a user space control plane. The kernel will not remove or invalidate them,
+ * but it can probe them and notify user space when they become reachable.
  */
 
 struct nda_cacheinfo {
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 8ad9898f8e42..e5f0992ac364 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -154,11 +154,12 @@ static void neigh_update_gc_list(struct neighbour *n)
 	if (n->dead)
 		goto out;
 
-	/* remove from the gc list if new state is permanent or if neighbor
-	 * is externally learned; otherwise entry should be on the gc list
+	/* remove from the gc list if new state is permanent or if neighbor is
+	 * externally learned / validated; otherwise entry should be on the gc
+	 * list
 	 */
 	exempt_from_gc = n->nud_state & NUD_PERMANENT ||
-			 n->flags & NTF_EXT_LEARNED;
+			 n->flags & (NTF_EXT_LEARNED | NTF_EXT_VALIDATED);
 	on_gc_list = !list_empty(&n->gc_list);
 
 	if (exempt_from_gc && on_gc_list) {
@@ -205,6 +206,7 @@ static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify,
 
 	ndm_flags  = (flags & NEIGH_UPDATE_F_EXT_LEARNED) ? NTF_EXT_LEARNED : 0;
 	ndm_flags |= (flags & NEIGH_UPDATE_F_MANAGED) ? NTF_MANAGED : 0;
+	ndm_flags |= (flags & NEIGH_UPDATE_F_EXT_VALIDATED) ? NTF_EXT_VALIDATED : 0;
 
 	if ((old_flags ^ ndm_flags) & NTF_EXT_LEARNED) {
 		if (ndm_flags & NTF_EXT_LEARNED)
@@ -222,6 +224,14 @@ static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify,
 		*notify = 1;
 		*managed_update = true;
 	}
+	if ((old_flags ^ ndm_flags) & NTF_EXT_VALIDATED) {
+		if (ndm_flags & NTF_EXT_VALIDATED)
+			neigh->flags |= NTF_EXT_VALIDATED;
+		else
+			neigh->flags &= ~NTF_EXT_VALIDATED;
+		*notify = 1;
+		*gc_update = true;
+	}
 }
 
 bool neigh_remove_one(struct neighbour *n)
@@ -379,7 +389,9 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev,
 	dev_head = neigh_get_dev_table(dev, tbl->family);
 
 	hlist_for_each_entry_safe(n, tmp, dev_head, dev_list) {
-		if (skip_perm && n->nud_state & NUD_PERMANENT)
+		if (skip_perm &&
+		    (n->nud_state & NUD_PERMANENT ||
+		     n->flags & NTF_EXT_VALIDATED))
 			continue;
 
 		hlist_del_rcu(&n->hash);
@@ -942,7 +954,8 @@ static void neigh_periodic_work(struct work_struct *work)
 
 			state = n->nud_state;
 			if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) ||
-			    (n->flags & NTF_EXT_LEARNED)) {
+			    (n->flags &
+			     (NTF_EXT_LEARNED | NTF_EXT_VALIDATED))) {
 				write_unlock(&n->lock);
 				continue;
 			}
@@ -1095,9 +1108,15 @@ static void neigh_timer_handler(struct timer_list *t)
 
 	if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) &&
 	    atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) {
-		WRITE_ONCE(neigh->nud_state, NUD_FAILED);
+		if (neigh->nud_state == NUD_PROBE &&
+		    neigh->flags & NTF_EXT_VALIDATED) {
+			WRITE_ONCE(neigh->nud_state, NUD_STALE);
+			neigh->updated = jiffies;
+		} else {
+			WRITE_ONCE(neigh->nud_state, NUD_FAILED);
+			neigh_invalidate(neigh);
+		}
 		notify = 1;
-		neigh_invalidate(neigh);
 		goto out;
 	}
 
@@ -1245,6 +1264,8 @@ static void neigh_update_hhs(struct neighbour *neigh)
 				NTF_ROUTER flag.
 	NEIGH_UPDATE_F_ISROUTER	indicates if the neighbour is known as
 				a router.
+	NEIGH_UPDATE_F_EXT_VALIDATED means that the entry will not be removed
+				or invalidated.
 
    Caller MUST hold reference count on the entry.
  */
@@ -1979,7 +2000,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (ndm_flags & NTF_PROXY) {
 		struct pneigh_entry *pn;
 
-		if (ndm_flags & NTF_MANAGED) {
+		if (ndm_flags & (NTF_MANAGED | NTF_EXT_VALIDATED)) {
 			NL_SET_ERR_MSG(extack, "Invalid NTF_* flag combination");
 			goto out;
 		}
@@ -2010,7 +2031,8 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (neigh == NULL) {
 		bool ndm_permanent  = ndm->ndm_state & NUD_PERMANENT;
 		bool exempt_from_gc = ndm_permanent ||
-				      ndm_flags & NTF_EXT_LEARNED;
+				      ndm_flags & (NTF_EXT_LEARNED |
+						   NTF_EXT_VALIDATED);
 
 		if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
 			err = -ENOENT;
@@ -2021,10 +2043,27 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
 			err = -EINVAL;
 			goto out;
 		}
+		if (ndm_flags & NTF_EXT_VALIDATED) {
+			u8 state = ndm->ndm_state;
+
+			/* NTF_USE and NTF_MANAGED will result in the neighbor
+			 * being created with an invalid state (NUD_NONE).
+			 */
+			if (ndm_flags & (NTF_USE | NTF_MANAGED))
+				state = NUD_NONE;
+
+			if (!(state & NUD_VALID)) {
+				NL_SET_ERR_MSG(extack,
+					       "Cannot create externally validated neighbor with an invalid state");
+				err = -EINVAL;
+				goto out;
+			}
+		}
 
 		neigh = ___neigh_create(tbl, dst, dev,
 					ndm_flags &
-					(NTF_EXT_LEARNED | NTF_MANAGED),
+					(NTF_EXT_LEARNED | NTF_MANAGED |
+					 NTF_EXT_VALIDATED),
 					exempt_from_gc, true);
 		if (IS_ERR(neigh)) {
 			err = PTR_ERR(neigh);
@@ -2036,6 +2075,24 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
 			neigh_release(neigh);
 			goto out;
 		}
+		if (ndm_flags & NTF_EXT_VALIDATED) {
+			u8 state = ndm->ndm_state;
+
+			/* NTF_USE and NTF_MANAGED do not update the existing
+			 * state other than clearing it if it was
+			 * NUD_PERMANENT.
+			 */
+			if (ndm_flags & (NTF_USE | NTF_MANAGED))
+				state = READ_ONCE(neigh->nud_state) & ~NUD_PERMANENT;
+
+			if (!(state & NUD_VALID)) {
+				NL_SET_ERR_MSG(extack,
+					       "Cannot mark neighbor as externally validated with an invalid state");
+				err = -EINVAL;
+				neigh_release(neigh);
+				goto out;
+			}
+		}
 
 		if (!(nlh->nlmsg_flags & NLM_F_REPLACE))
 			flags &= ~(NEIGH_UPDATE_F_OVERRIDE |
@@ -2052,6 +2109,8 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
 		flags |= NEIGH_UPDATE_F_MANAGED;
 	if (ndm_flags & NTF_USE)
 		flags |= NEIGH_UPDATE_F_USE;
+	if (ndm_flags & NTF_EXT_VALIDATED)
+		flags |= NEIGH_UPDATE_F_EXT_VALIDATED;
 
 	err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags,
 			     NETLINK_CB(skb).portid, extack);
-- 
cgit v1.2.3


From 763ff02ce287c2e5c8a012d40bd2f3dab99ae5d5 Mon Sep 17 00:00:00 2001
From: Caleb Sander Mateos <csander@purestorage.com>
Date: Fri, 20 Jun 2025 09:10:03 -0600
Subject: ublk: allow UBLK_IO_(UN)REGISTER_IO_BUF on any task

Currently, UBLK_IO_REGISTER_IO_BUF and UBLK_IO_UNREGISTER_IO_BUF are
only permitted on the ublk_io's daemon task. But this restriction is
unnecessary. ublk_register_io_buf() calls __ublk_check_and_get_req() to
look up the request from the tagset and atomically take a reference on
the request without accessing the ublk_io. ublk_unregister_io_buf()
doesn't use the q_id or tag at all.

So allow these opcodes even on tasks other than io->task.

Handle UBLK_IO_UNREGISTER_IO_BUF before obtaining the ubq and io since
the buffer index being unregistered is not necessarily related to the
specified q_id and tag.

Add a feature flag UBLK_F_BUF_REG_OFF_DAEMON that userspace can use to
determine whether the kernel supports off-daemon buffer registration.

Suggested-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20250620151008.3976463-10-csander@purestorage.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/ublk_drv.c      | 27 ++++++++++++++++++++++-----
 include/uapi/linux/ublk_cmd.h | 10 ++++++++++
 2 files changed, 32 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 0c244fe76d27..fa1859c06211 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -70,7 +70,8 @@
 		| UBLK_F_UPDATE_SIZE \
 		| UBLK_F_AUTO_BUF_REG \
 		| UBLK_F_QUIESCE \
-		| UBLK_F_PER_IO_DAEMON)
+		| UBLK_F_PER_IO_DAEMON \
+		| UBLK_F_BUF_REG_OFF_DAEMON)
 
 #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
 		| UBLK_F_USER_RECOVERY_REISSUE \
@@ -2204,6 +2205,14 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
 	if (ret)
 		goto out;
 
+	/*
+	 * io_buffer_unregister_bvec() doesn't access the ubq or io,
+	 * so no need to validate the q_id, tag, or task
+	 */
+	if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF)
+		return ublk_unregister_io_buf(cmd, ub, ub_cmd->addr,
+					      issue_flags);
+
 	ret = -EINVAL;
 	if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues)
 		goto out;
@@ -2224,8 +2233,17 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
 		return -EIOCBQUEUED;
 	}
 
-	if (READ_ONCE(io->task) != current)
+	if (READ_ONCE(io->task) != current) {
+		/*
+		 * ublk_register_io_buf() accesses only the io's refcount,
+		 * so can be handled on any task
+		 */
+		if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF)
+			return ublk_register_io_buf(cmd, ubq, io, ub_cmd->addr,
+						    issue_flags);
+
 		goto out;
+	}
 
 	/* there is pending io cmd, something must be wrong */
 	if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) {
@@ -2244,8 +2262,6 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
 	switch (_IOC_NR(cmd_op)) {
 	case UBLK_IO_REGISTER_IO_BUF:
 		return ublk_register_io_buf(cmd, ubq, io, ub_cmd->addr, issue_flags);
-	case UBLK_IO_UNREGISTER_IO_BUF:
-		return ublk_unregister_io_buf(cmd, ub, ub_cmd->addr, issue_flags);
 	case UBLK_IO_COMMIT_AND_FETCH_REQ:
 		ret = ublk_commit_and_fetch(ubq, io, cmd, ub_cmd, issue_flags);
 		if (ret)
@@ -2961,7 +2977,8 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
 
 	ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE |
 		UBLK_F_URING_CMD_COMP_IN_TASK |
-		UBLK_F_PER_IO_DAEMON;
+		UBLK_F_PER_IO_DAEMON |
+		UBLK_F_BUF_REG_OFF_DAEMON;
 
 	/* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */
 	if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index c9751bdfd937..ec77dabba45b 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -301,6 +301,16 @@
  */
 #define UBLK_F_PER_IO_DAEMON (1ULL << 13)
 
+/*
+ * If this feature is set, UBLK_U_IO_REGISTER_IO_BUF/UBLK_U_IO_UNREGISTER_IO_BUF
+ * can be issued for an I/O on any task. q_id and tag are also ignored in
+ * UBLK_U_IO_UNREGISTER_IO_BUF's ublksrv_io_cmd.
+ * If it is unset, zero-copy buffers can only be registered and unregistered by
+ * the I/O's daemon task. The q_id and tag of the registered buffer are required
+ * in UBLK_U_IO_UNREGISTER_IO_BUF's ublksrv_io_cmd.
+ */
+#define UBLK_F_BUF_REG_OFF_DAEMON (1ULL << 14)
+
 /* device state */
 #define UBLK_S_DEV_DEAD	0
 #define UBLK_S_DEV_LIVE	1
-- 
cgit v1.2.3


From f458ccd2aa2c5a6f0129a9b1548f2825071fdc6b Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Thu, 26 Jun 2025 21:58:04 +0300
Subject: RDMA/uverbs: Check CAP_NET_RAW in user namespace for flow create

Currently, the capability check is done in the default
init_user_ns user namespace. When a process runs in a
non-default user namespace, this check fails. As a result,
a process running under Podman fails to create the flow
resource.

Since the RDMA device is a resource within a network namespace,
use the network namespace associated with the RDMA device to
determine its owning user namespace.

Fixes: 436f2ad05a0b ("IB/core: Export ib_create/destroy_flow through uverbs")
Signed-off-by: Parav Pandit <parav@nvidia.com>
Suggested-by: Eric W. Biederman <ebiederm@xmission.com>
Link: https://patch.msgid.link/6df6f2f24627874c4f6d041c19dc1f6f29f68f84.1750963874.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 drivers/infiniband/core/device.c     | 27 +++++++++++++++++++++++++++
 drivers/infiniband/core/rdma_core.c  | 29 +++++++++++++++++++++++++++++
 drivers/infiniband/core/uverbs_cmd.c |  2 +-
 include/rdma/ib_verbs.h              |  3 +++
 4 files changed, 60 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index c0f8b8cba7c0..1ca6a9b7ba1a 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -145,6 +145,33 @@ bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net)
 }
 EXPORT_SYMBOL(rdma_dev_access_netns);
 
+/**
+ * rdma_dev_has_raw_cap() - Returns whether a specified rdma device has
+ *			    CAP_NET_RAW capability or not.
+ *
+ * @dev:	Pointer to rdma device whose capability to be checked
+ *
+ * Returns true if a rdma device's owning user namespace has CAP_NET_RAW
+ * capability, otherwise false. When rdma subsystem is in legacy shared network,
+ * namespace mode, the default net namespace is considered.
+ */
+bool rdma_dev_has_raw_cap(const struct ib_device *dev)
+{
+	const struct net *net;
+
+	/* Network namespace is the resource whose user namespace
+	 * to be considered. When in shared mode, there is no reliable
+	 * network namespace resource, so consider the default net namespace.
+	 */
+	if (ib_devices_shared_netns)
+		net = &init_net;
+	else
+		net = read_pnet(&dev->coredev.rdma_net);
+
+	return ns_capable(net->user_ns, CAP_NET_RAW);
+}
+EXPORT_SYMBOL(rdma_dev_has_raw_cap);
+
 /*
  * xarray has this behavior where it won't iterate over NULL values stored in
  * allocated arrays.  So we need our own iterator to see all values stored in
diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c
index 90c177edf9b0..18918f463361 100644
--- a/drivers/infiniband/core/rdma_core.c
+++ b/drivers/infiniband/core/rdma_core.c
@@ -1019,3 +1019,32 @@ void uverbs_finalize_object(struct ib_uobject *uobj,
 		WARN_ON(true);
 	}
 }
+
+/**
+ * rdma_uattrs_has_raw_cap() - Returns whether a rdma device linked to the
+ *			       uverbs attributes file has CAP_NET_RAW
+ *			       capability or not.
+ *
+ * @attrs:       Pointer to uverbs attributes
+ *
+ * Returns true if a rdma device's owning user namespace has CAP_NET_RAW
+ * capability, otherwise false.
+ */
+bool rdma_uattrs_has_raw_cap(const struct uverbs_attr_bundle *attrs)
+{
+	struct ib_uverbs_file *ufile = attrs->ufile;
+	struct ib_ucontext *ucontext;
+	bool has_cap = false;
+	int srcu_key;
+
+	srcu_key = srcu_read_lock(&ufile->device->disassociate_srcu);
+	ucontext = ib_uverbs_get_ucontext_file(ufile);
+	if (IS_ERR(ucontext))
+		goto out;
+	has_cap = rdma_dev_has_raw_cap(ucontext->device);
+
+out:
+	srcu_read_unlock(&ufile->device->disassociate_srcu, srcu_key);
+	return has_cap;
+}
+EXPORT_SYMBOL(rdma_uattrs_has_raw_cap);
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index bc9fe3ceca4d..6700c2c66167 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -3225,7 +3225,7 @@ static int ib_uverbs_ex_create_flow(struct uverbs_attr_bundle *attrs)
 	if (cmd.comp_mask)
 		return -EINVAL;
 
-	if (!capable(CAP_NET_RAW))
+	if (!rdma_uattrs_has_raw_cap(attrs))
 		return -EPERM;
 
 	if (cmd.flow_attr.flags >= IB_FLOW_ATTR_FLAGS_RESERVED)
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 7da27f01eeb6..010594dc755b 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -4817,6 +4817,8 @@ static inline int uverbs_destroy_def_handler(struct uverbs_attr_bundle *attrs)
 }
 #endif
 
+bool rdma_uattrs_has_raw_cap(const struct uverbs_attr_bundle *attrs);
+
 struct net_device *rdma_alloc_netdev(struct ib_device *device, u32 port_num,
 				     enum rdma_netdev_t type, const char *name,
 				     unsigned char name_assign_type,
@@ -4871,6 +4873,7 @@ static inline int ibdev_to_node(struct ib_device *ibdev)
 bool rdma_dev_access_netns(const struct ib_device *device,
 			   const struct net *net);
 
+bool rdma_dev_has_raw_cap(const struct ib_device *dev);
 static inline struct net *rdma_dev_net(struct ib_device *device)
 {
 	return read_pnet(&device->coredev.rdma_net);
-- 
cgit v1.2.3


From 12c409aa1ec2592280a2ddcc66ff8f3c7f7bb171 Mon Sep 17 00:00:00 2001
From: Denis OSTERLAND-HEIM <denis.osterland@diehl.com>
Date: Wed, 28 May 2025 12:57:50 +0200
Subject: pps: fix poll support

Because pps_cdev_poll() unconditionally returns EPOLLIN,
a user space program that calls select/poll always gets an immediate
data-ready-to-read response. As a result, the intended use of waiting
until the next data becomes ready does not work.

User space snippet:

    struct pollfd pollfd = {
      .fd = open("/dev/pps0", O_RDONLY),
      .events = POLLIN|POLLERR,
      .revents = 0 };
    while(1) {
      poll(&pollfd, 1, 2000/*ms*/); // returns immediately, but should wait
      if(pollfd.revents & POLLIN) { // always true
        struct pps_fdata fdata;
        memset(&fdata, 0, sizeof(fdata));
        ioctl(pollfd.fd, PPS_FETCH, &fdata); // currently fetches data at max speed
      }
    }

Let's remember the event counter of the last fetch and compare it
in pps_cdev_poll() with the most recent event counter,
returning 0 if they are equal.

Signed-off-by: Denis OSTERLAND-HEIM <denis.osterland@diehl.com>
Co-developed-by: Rodolfo Giometti <giometti@enneenne.com>
Signed-off-by: Rodolfo Giometti <giometti@enneenne.com>
Fixes: eae9d2ba0cfc ("LinuxPPS: core support")
Link: https://lore.kernel.org/all/f6bed779-6d59-4f0f-8a59-b6312bd83b4e@enneenne.com/
Acked-by: Rodolfo Giometti <giometti@enneenne.com>
Link: https://lore.kernel.org/r/c3c50ad1eb19ef553eca8a57c17f4c006413ab70.camel@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/pps/pps.c          | 11 +++++++++--
 include/linux/pps_kernel.h |  1 +
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/pps/pps.c b/drivers/pps/pps.c
index 6a02245ea35f..9463232af8d2 100644
--- a/drivers/pps/pps.c
+++ b/drivers/pps/pps.c
@@ -41,6 +41,9 @@ static __poll_t pps_cdev_poll(struct file *file, poll_table *wait)
 
 	poll_wait(file, &pps->queue, wait);
 
+	if (pps->last_fetched_ev == pps->last_ev)
+		return 0;
+
 	return EPOLLIN | EPOLLRDNORM;
 }
 
@@ -186,9 +189,11 @@ static long pps_cdev_ioctl(struct file *file,
 		if (err)
 			return err;
 
-		/* Return the fetched timestamp */
+		/* Return the fetched timestamp and save last fetched event  */
 		spin_lock_irq(&pps->lock);
 
+		pps->last_fetched_ev = pps->last_ev;
+
 		fdata.info.assert_sequence = pps->assert_sequence;
 		fdata.info.clear_sequence = pps->clear_sequence;
 		fdata.info.assert_tu = pps->assert_tu;
@@ -272,9 +277,11 @@ static long pps_cdev_compat_ioctl(struct file *file,
 		if (err)
 			return err;
 
-		/* Return the fetched timestamp */
+		/* Return the fetched timestamp and save last fetched event  */
 		spin_lock_irq(&pps->lock);
 
+		pps->last_fetched_ev = pps->last_ev;
+
 		compat.info.assert_sequence = pps->assert_sequence;
 		compat.info.clear_sequence = pps->clear_sequence;
 		compat.info.current_mode = pps->current_mode;
diff --git a/include/linux/pps_kernel.h b/include/linux/pps_kernel.h
index c7abce28ed29..aab0aebb529e 100644
--- a/include/linux/pps_kernel.h
+++ b/include/linux/pps_kernel.h
@@ -52,6 +52,7 @@ struct pps_device {
 	int current_mode;			/* PPS mode at event time */
 
 	unsigned int last_ev;			/* last PPS event id */
+	unsigned int last_fetched_ev;		/* last fetched PPS event id */
 	wait_queue_head_t queue;		/* PPS event queue */
 
 	unsigned int id;			/* PPS source unique ID */
-- 
cgit v1.2.3


From b75e1f0619bd707e027812e262af3fbce445e71a Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <mazziesaccount@gmail.com>
Date: Mon, 9 Jun 2025 10:26:47 +0300
Subject: device property: Use tidy for_each_named_* macros

Implementing if-conditions inside for_each_x() macros requires some
thinking to avoid side effects in the calling code. The resulting code
may look somewhat awkward, and there are a couple of different ways it is
usually done.

Standardizing this to one way can help make it more obvious for a code
reader and writer. The newly added for_each_if() is a way to achieve this.

Use for_each_if() to make these macros look like many others, which
should in the long run help with reading the code.

Signed-off-by: Matti Vaittinen <mazziesaccount@gmail.com>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/c98b39a7195006fdd24590b8d11bb271a72a0c8a.1749453752.git.mazziesaccount@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/property.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/property.h b/include/linux/property.h
index f718dd4789e5..82f0cb3abd1e 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -17,6 +17,7 @@
 #include <linux/fwnode.h>
 #include <linux/stddef.h>
 #include <linux/types.h>
+#include <linux/util_macros.h>
 
 struct device;
 
@@ -169,7 +170,7 @@ struct fwnode_handle *fwnode_get_next_available_child_node(
 
 #define fwnode_for_each_named_child_node(fwnode, child, name)		\
 	fwnode_for_each_child_node(fwnode, child)			\
-		if (!fwnode_name_eq(child, name)) { } else
+		for_each_if(fwnode_name_eq(child, name))
 
 #define fwnode_for_each_available_child_node(fwnode, child)		       \
 	for (child = fwnode_get_next_available_child_node(fwnode, NULL); child;\
@@ -184,7 +185,7 @@ struct fwnode_handle *device_get_next_child_node(const struct device *dev,
 
 #define device_for_each_named_child_node(dev, child, name)		\
 	device_for_each_child_node(dev, child)				\
-		if (!fwnode_name_eq(child, name)) { } else
+		for_each_if(fwnode_name_eq(child, name))
 
 #define device_for_each_child_node_scoped(dev, child)			\
 	for (struct fwnode_handle *child __free(fwnode_handle) =	\
@@ -193,7 +194,7 @@ struct fwnode_handle *device_get_next_child_node(const struct device *dev,
 
 #define device_for_each_named_child_node_scoped(dev, child, name)	\
 	device_for_each_child_node_scoped(dev, child)			\
-		if (!fwnode_name_eq(child, name)) { } else
+		for_each_if(fwnode_name_eq(child, name))
 
 struct fwnode_handle *fwnode_get_named_child_node(const struct fwnode_handle *fwnode,
 						  const char *childname);
-- 
cgit v1.2.3


From c6603b1d6556cc02d0169f74508ab0e3e3e4bd76 Mon Sep 17 00:00:00 2001
From: Anuj Gupta <anuj20.g@samsung.com>
Date: Mon, 30 Jun 2025 14:35:45 +0530
Subject: block: rename tuple_size field in blk_integrity to metadata_size

The tuple_size field in blk_integrity currently represents the total
size of metadata associated with each data interval. To make the meaning
more explicit, rename tuple_size to metadata_size. This is a purely
mechanical rename with no functional changes.

Suggested-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
Link: https://lore.kernel.org/20250630090548.3317-2-anuj20.g@samsung.com
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 block/bio-integrity-auto.c        |  4 ++--
 block/blk-integrity.c             |  2 +-
 block/blk-settings.c              |  6 +++---
 block/t10-pi.c                    | 16 ++++++++--------
 drivers/md/dm-crypt.c             |  4 ++--
 drivers/md/dm-integrity.c         | 12 ++++++------
 drivers/nvdimm/btt.c              |  2 +-
 drivers/nvme/host/core.c          |  2 +-
 drivers/nvme/target/io-cmd-bdev.c |  2 +-
 drivers/scsi/sd_dif.c             |  2 +-
 include/linux/blk-integrity.h     |  4 ++--
 include/linux/blkdev.h            |  2 +-
 12 files changed, 29 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/block/bio-integrity-auto.c b/block/bio-integrity-auto.c
index 9c6657664792..687952f63bbb 100644
--- a/block/bio-integrity-auto.c
+++ b/block/bio-integrity-auto.c
@@ -54,10 +54,10 @@ static bool bi_offload_capable(struct blk_integrity *bi)
 {
 	switch (bi->csum_type) {
 	case BLK_INTEGRITY_CSUM_CRC64:
-		return bi->tuple_size == sizeof(struct crc64_pi_tuple);
+		return bi->metadata_size == sizeof(struct crc64_pi_tuple);
 	case BLK_INTEGRITY_CSUM_CRC:
 	case BLK_INTEGRITY_CSUM_IP:
-		return bi->tuple_size == sizeof(struct t10_pi_tuple);
+		return bi->metadata_size == sizeof(struct t10_pi_tuple);
 	default:
 		pr_warn_once("%s: unknown integrity checksum type:%d\n",
 			__func__, bi->csum_type);
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index e4e2567061f9..c1102bf4cd8d 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -239,7 +239,7 @@ static ssize_t format_show(struct device *dev, struct device_attribute *attr,
 {
 	struct blk_integrity *bi = dev_to_bi(dev);
 
-	if (!bi->tuple_size)
+	if (!bi->metadata_size)
 		return sysfs_emit(page, "none\n");
 	return sysfs_emit(page, "%s\n", blk_integrity_profile_name(bi));
 }
diff --git a/block/blk-settings.c b/block/blk-settings.c
index a000daafbfb4..787500ff00c3 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -114,7 +114,7 @@ static int blk_validate_integrity_limits(struct queue_limits *lim)
 {
 	struct blk_integrity *bi = &lim->integrity;
 
-	if (!bi->tuple_size) {
+	if (!bi->metadata_size) {
 		if (bi->csum_type != BLK_INTEGRITY_CSUM_NONE ||
 		    bi->tag_size || ((bi->flags & BLK_INTEGRITY_REF_TAG))) {
 			pr_warn("invalid PI settings.\n");
@@ -875,7 +875,7 @@ bool queue_limits_stack_integrity(struct queue_limits *t,
 		return true;
 
 	if (ti->flags & BLK_INTEGRITY_STACKED) {
-		if (ti->tuple_size != bi->tuple_size)
+		if (ti->metadata_size != bi->metadata_size)
 			goto incompatible;
 		if (ti->interval_exp != bi->interval_exp)
 			goto incompatible;
@@ -891,7 +891,7 @@ bool queue_limits_stack_integrity(struct queue_limits *t,
 		ti->flags |= (bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE) |
 			     (bi->flags & BLK_INTEGRITY_REF_TAG);
 		ti->csum_type = bi->csum_type;
-		ti->tuple_size = bi->tuple_size;
+		ti->metadata_size = bi->metadata_size;
 		ti->pi_offset = bi->pi_offset;
 		ti->interval_exp = bi->interval_exp;
 		ti->tag_size = bi->tag_size;
diff --git a/block/t10-pi.c b/block/t10-pi.c
index 851db518ee5e..0c4ed9702146 100644
--- a/block/t10-pi.c
+++ b/block/t10-pi.c
@@ -56,7 +56,7 @@ static void t10_pi_generate(struct blk_integrity_iter *iter,
 			pi->ref_tag = 0;
 
 		iter->data_buf += iter->interval;
-		iter->prot_buf += bi->tuple_size;
+		iter->prot_buf += bi->metadata_size;
 		iter->seed++;
 	}
 }
@@ -105,7 +105,7 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter,
 
 next:
 		iter->data_buf += iter->interval;
-		iter->prot_buf += bi->tuple_size;
+		iter->prot_buf += bi->metadata_size;
 		iter->seed++;
 	}
 
@@ -125,7 +125,7 @@ next:
 static void t10_pi_type1_prepare(struct request *rq)
 {
 	struct blk_integrity *bi = &rq->q->limits.integrity;
-	const int tuple_sz = bi->tuple_size;
+	const int tuple_sz = bi->metadata_size;
 	u32 ref_tag = t10_pi_ref_tag(rq);
 	u8 offset = bi->pi_offset;
 	struct bio *bio;
@@ -177,7 +177,7 @@ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
 {
 	struct blk_integrity *bi = &rq->q->limits.integrity;
 	unsigned intervals = nr_bytes >> bi->interval_exp;
-	const int tuple_sz = bi->tuple_size;
+	const int tuple_sz = bi->metadata_size;
 	u32 ref_tag = t10_pi_ref_tag(rq);
 	u8 offset = bi->pi_offset;
 	struct bio *bio;
@@ -234,7 +234,7 @@ static void ext_pi_crc64_generate(struct blk_integrity_iter *iter,
 			put_unaligned_be48(0ULL, pi->ref_tag);
 
 		iter->data_buf += iter->interval;
-		iter->prot_buf += bi->tuple_size;
+		iter->prot_buf += bi->metadata_size;
 		iter->seed++;
 	}
 }
@@ -289,7 +289,7 @@ static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter,
 
 next:
 		iter->data_buf += iter->interval;
-		iter->prot_buf += bi->tuple_size;
+		iter->prot_buf += bi->metadata_size;
 		iter->seed++;
 	}
 
@@ -299,7 +299,7 @@ next:
 static void ext_pi_type1_prepare(struct request *rq)
 {
 	struct blk_integrity *bi = &rq->q->limits.integrity;
-	const int tuple_sz = bi->tuple_size;
+	const int tuple_sz = bi->metadata_size;
 	u64 ref_tag = ext_pi_ref_tag(rq);
 	u8 offset = bi->pi_offset;
 	struct bio *bio;
@@ -340,7 +340,7 @@ static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
 {
 	struct blk_integrity *bi = &rq->q->limits.integrity;
 	unsigned intervals = nr_bytes >> bi->interval_exp;
-	const int tuple_sz = bi->tuple_size;
+	const int tuple_sz = bi->metadata_size;
 	u64 ref_tag = ext_pi_ref_tag(rq);
 	u8 offset = bi->pi_offset;
 	struct bio *bio;
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 9dfdb63220d7..3d6d06b94c9f 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1189,11 +1189,11 @@ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti)
 		return -EINVAL;
 	}
 
-	if (bi->tuple_size < cc->used_tag_size) {
+	if (bi->metadata_size < cc->used_tag_size) {
 		ti->error = "Integrity profile tag size mismatch.";
 		return -EINVAL;
 	}
-	cc->tuple_size = bi->tuple_size;
+	cc->tuple_size = bi->metadata_size;
 	if (1 << bi->interval_exp != cc->sector_size) {
 		ti->error = "Integrity profile sector size mismatch.";
 		return -EINVAL;
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 4395657fa583..efeee0a873c0 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -3906,8 +3906,8 @@ static void dm_integrity_io_hints(struct dm_target *ti, struct queue_limits *lim
 		struct blk_integrity *bi = &limits->integrity;
 
 		memset(bi, 0, sizeof(*bi));
-		bi->tuple_size = ic->tag_size;
-		bi->tag_size = bi->tuple_size;
+		bi->metadata_size = ic->tag_size;
+		bi->tag_size = bi->metadata_size;
 		bi->interval_exp =
 			ic->sb->log2_sectors_per_block + SECTOR_SHIFT;
 	}
@@ -4746,18 +4746,18 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv
 			ti->error = "Integrity profile not supported";
 			goto bad;
 		}
-		/*printk("tag_size: %u, tuple_size: %u\n", bi->tag_size, bi->tuple_size);*/
-		if (bi->tuple_size < ic->tag_size) {
+		/*printk("tag_size: %u, metadata_size: %u\n", bi->tag_size, bi->metadata_size);*/
+		if (bi->metadata_size < ic->tag_size) {
 			r = -EINVAL;
 			ti->error = "The integrity profile is smaller than tag size";
 			goto bad;
 		}
-		if ((unsigned long)bi->tuple_size > PAGE_SIZE / 2) {
+		if ((unsigned long)bi->metadata_size > PAGE_SIZE / 2) {
 			r = -EINVAL;
 			ti->error = "Too big tuple size";
 			goto bad;
 		}
-		ic->tuple_size = bi->tuple_size;
+		ic->tuple_size = bi->metadata_size;
 		if (1 << bi->interval_exp != ic->sectors_per_block << SECTOR_SHIFT) {
 			r = -EINVAL;
 			ti->error = "Integrity profile sector size mismatch";
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index 423dcd190906..2a1aa32e6693 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1506,7 +1506,7 @@ static int btt_blk_init(struct btt *btt)
 	int rc;
 
 	if (btt_meta_size(btt) && IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) {
-		lim.integrity.tuple_size = btt_meta_size(btt);
+		lim.integrity.metadata_size = btt_meta_size(btt);
 		lim.integrity.tag_size = btt_meta_size(btt);
 	}
 
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 92697f98c601..b027dda38e69 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1866,7 +1866,7 @@ static bool nvme_init_integrity(struct nvme_ns_head *head,
 		break;
 	}
 
-	bi->tuple_size = head->ms;
+	bi->metadata_size = head->ms;
 	bi->pi_offset = info->pi_offset;
 	return true;
 }
diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index eba42df2f821..42fb19f94ab8 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -65,7 +65,7 @@ static void nvmet_bdev_ns_enable_integrity(struct nvmet_ns *ns)
 		return;
 
 	if (bi->csum_type == BLK_INTEGRITY_CSUM_CRC) {
-		ns->metadata_size = bi->tuple_size;
+		ns->metadata_size = bi->metadata_size;
 		if (bi->flags & BLK_INTEGRITY_REF_TAG)
 			ns->pi_type = NVME_NS_DPS_PI_TYPE1;
 		else
diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c
index ae6ce6f5d622..18bfca1f1c78 100644
--- a/drivers/scsi/sd_dif.c
+++ b/drivers/scsi/sd_dif.c
@@ -52,7 +52,7 @@ void sd_dif_config_host(struct scsi_disk *sdkp, struct queue_limits *lim)
 	if (type != T10_PI_TYPE3_PROTECTION)
 		bi->flags |= BLK_INTEGRITY_REF_TAG;
 
-	bi->tuple_size = sizeof(struct t10_pi_tuple);
+	bi->metadata_size = sizeof(struct t10_pi_tuple);
 
 	if (dif && type) {
 		bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h
index c7eae0bfb013..d27730da47f3 100644
--- a/include/linux/blk-integrity.h
+++ b/include/linux/blk-integrity.h
@@ -33,7 +33,7 @@ int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf,
 static inline bool
 blk_integrity_queue_supports_integrity(struct request_queue *q)
 {
-	return q->limits.integrity.tuple_size;
+	return q->limits.integrity.metadata_size;
 }
 
 static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk)
@@ -74,7 +74,7 @@ static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
 static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
 					       unsigned int sectors)
 {
-	return bio_integrity_intervals(bi, sectors) * bi->tuple_size;
+	return bio_integrity_intervals(bi, sectors) * bi->metadata_size;
 }
 
 static inline bool blk_integrity_rq(struct request *rq)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a59880c809c7..ccda87d06a38 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -116,7 +116,7 @@ enum blk_integrity_checksum {
 struct blk_integrity {
 	unsigned char				flags;
 	enum blk_integrity_checksum		csum_type;
-	unsigned char				tuple_size;
+	unsigned char				metadata_size;
 	unsigned char				pi_offset;
 	unsigned char				interval_exp;
 	unsigned char				tag_size;
-- 
cgit v1.2.3


From 76e45252a4cefa205439eb6610a244771e7d88da Mon Sep 17 00:00:00 2001
From: Anuj Gupta <anuj20.g@samsung.com>
Date: Mon, 30 Jun 2025 14:35:46 +0530
Subject: block: introduce pi_tuple_size field in blk_integrity

Introduce a new pi_tuple_size field in struct blk_integrity to
explicitly represent the size (in bytes) of the protection information
(PI) tuple. This is a prep patch.
Add validation in blk_validate_integrity_limits() to ensure that
pi_tuple_size matches the expected size for known checksum types and
never exceeds metadata_size.

Suggested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
Link: https://lore.kernel.org/20250630090548.3317-3-anuj20.g@samsung.com
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 block/blk-settings.c     | 38 ++++++++++++++++++++++++++++++++++++++
 drivers/nvme/host/core.c |  2 ++
 drivers/scsi/sd_dif.c    |  1 +
 include/linux/blkdev.h   |  1 +
 4 files changed, 42 insertions(+)

(limited to 'include')

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 787500ff00c3..32f3cdc9835a 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -14,6 +14,8 @@
 #include <linux/jiffies.h>
 #include <linux/gfp.h>
 #include <linux/dma-mapping.h>
+#include <linux/t10-pi.h>
+#include <linux/crc64.h>
 
 #include "blk.h"
 #include "blk-rq-qos.h"
@@ -135,6 +137,42 @@ static int blk_validate_integrity_limits(struct queue_limits *lim)
 		return -EINVAL;
 	}
 
+	if (bi->pi_tuple_size > bi->metadata_size) {
+		pr_warn("pi_tuple_size (%u) exceeds metadata_size (%u)\n",
+			 bi->pi_tuple_size,
+			 bi->metadata_size);
+		return -EINVAL;
+	}
+
+	switch (bi->csum_type) {
+	case BLK_INTEGRITY_CSUM_NONE:
+		if (bi->pi_tuple_size) {
+			pr_warn("pi_tuple_size must be 0 when checksum type \
+				 is none\n");
+			return -EINVAL;
+		}
+		break;
+	case BLK_INTEGRITY_CSUM_CRC:
+	case BLK_INTEGRITY_CSUM_IP:
+		if (bi->pi_tuple_size != sizeof(struct t10_pi_tuple)) {
+			pr_warn("pi_tuple_size mismatch for T10 PI: expected \
+				 %zu, got %u\n",
+				 sizeof(struct t10_pi_tuple),
+				 bi->pi_tuple_size);
+			return -EINVAL;
+		}
+		break;
+	case BLK_INTEGRITY_CSUM_CRC64:
+		if (bi->pi_tuple_size != sizeof(struct crc64_pi_tuple)) {
+			pr_warn("pi_tuple_size mismatch for CRC64 PI: \
+				 expected %zu, got %u\n",
+				 sizeof(struct crc64_pi_tuple),
+				 bi->pi_tuple_size);
+			return -EINVAL;
+		}
+		break;
+	}
+
 	if (!bi->interval_exp)
 		bi->interval_exp = ilog2(lim->logical_block_size);
 
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index b027dda38e69..fe72accab516 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1867,6 +1867,8 @@ static bool nvme_init_integrity(struct nvme_ns_head *head,
 	}
 
 	bi->metadata_size = head->ms;
+	if (bi->csum_type)
+		bi->pi_tuple_size = head->pi_size;
 	bi->pi_offset = info->pi_offset;
 	return true;
 }
diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c
index 18bfca1f1c78..ff4217fef93b 100644
--- a/drivers/scsi/sd_dif.c
+++ b/drivers/scsi/sd_dif.c
@@ -53,6 +53,7 @@ void sd_dif_config_host(struct scsi_disk *sdkp, struct queue_limits *lim)
 		bi->flags |= BLK_INTEGRITY_REF_TAG;
 
 	bi->metadata_size = sizeof(struct t10_pi_tuple);
+	bi->pi_tuple_size = bi->metadata_size;
 
 	if (dif && type) {
 		bi->flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ccda87d06a38..0d4011dcfed3 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -120,6 +120,7 @@ struct blk_integrity {
 	unsigned char				pi_offset;
 	unsigned char				interval_exp;
 	unsigned char				tag_size;
+	unsigned char				pi_tuple_size;
 };
 
 typedef unsigned int __bitwise blk_mode_t;
-- 
cgit v1.2.3


From 9eb22f7fedfc9eb1b7f431a5359abd4d15b0b0cd Mon Sep 17 00:00:00 2001
From: Anuj Gupta <anuj20.g@samsung.com>
Date: Mon, 30 Jun 2025 14:35:48 +0530
Subject: fs: add ioctl to query metadata and protection info capabilities

Add a new ioctl, FS_IOC_GETLBMD_CAP, to query metadata and protection
info (PI) capabilities. This ioctl returns information about the file's
integrity profile. This is useful for userspace applications to
understand a file's end-to-end data protection support and configure the
I/O accordingly.

For now this interface is only supported by block devices. However, the
design and placement of this ioctl in generic FS ioctl space allows us
to extend it to work over files as well. This may be useful when
filesystems start supporting PI-aware layouts.

A new structure struct logical_block_metadata_cap is introduced, which
contains the following fields:

1. lbmd_flags: bitmask of logical block metadata capability flags
2. lbmd_interval: the amount of data described by each unit of logical
block metadata
3. lbmd_size: size in bytes of the logical block metadata associated
with each interval
4. lbmd_opaque_size: size in bytes of the opaque block tag associated
with each interval
5. lbmd_opaque_offset: offset in bytes of the opaque block tag within
the logical block metadata
6. lbmd_pi_size: size in bytes of the T10 PI tuple associated with each
interval
7. lbmd_pi_offset: offset in bytes of T10 PI tuple within the logical
block metadata
8. lbmd_guard_tag_type: T10 PI guard tag type
9. lbmd_app_tag_size: size in bytes of the T10 PI application tag
10. lbmd_ref_tag_size: size in bytes of the T10 PI reference tag
11. lbmd_storage_tag_size: size in bytes of the T10 PI storage tag

The internal logic to fetch the capability is encapsulated in a helper
function blk_get_meta_cap(), which uses the blk_integrity profile
associated with the device. The ioctl returns -EOPNOTSUPP, if
CONFIG_BLK_DEV_INTEGRITY is not enabled.

Suggested-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
Link: https://lore.kernel.org/20250630090548.3317-5-anuj20.g@samsung.com
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 block/blk-integrity.c         | 52 ++++++++++++++++++++++++++++++++++++++
 block/ioctl.c                 |  4 +++
 include/linux/blk-integrity.h |  7 +++++
 include/uapi/linux/fs.h       | 59 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 122 insertions(+)

(limited to 'include')

diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index c1102bf4cd8d..9d9dc9c32083 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -13,6 +13,7 @@
 #include <linux/scatterlist.h>
 #include <linux/export.h>
 #include <linux/slab.h>
+#include <linux/t10-pi.h>
 
 #include "blk.h"
 
@@ -54,6 +55,57 @@ new_segment:
 	return segments;
 }
 
+int blk_get_meta_cap(struct block_device *bdev, unsigned int cmd,
+		     struct logical_block_metadata_cap __user *argp)
+{
+	struct blk_integrity *bi = blk_get_integrity(bdev->bd_disk);
+	struct logical_block_metadata_cap meta_cap = {};
+	size_t usize = _IOC_SIZE(cmd);
+
+	if (!argp)
+		return -EINVAL;
+	if (usize < LBMD_SIZE_VER0)
+		return -EINVAL;
+	if (!bi)
+		goto out;
+
+	if (bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE)
+		meta_cap.lbmd_flags |= LBMD_PI_CAP_INTEGRITY;
+	if (bi->flags & BLK_INTEGRITY_REF_TAG)
+		meta_cap.lbmd_flags |= LBMD_PI_CAP_REFTAG;
+	meta_cap.lbmd_interval = 1 << bi->interval_exp;
+	meta_cap.lbmd_size = bi->metadata_size;
+	meta_cap.lbmd_pi_size = bi->pi_tuple_size;
+	meta_cap.lbmd_pi_offset = bi->pi_offset;
+	meta_cap.lbmd_opaque_size = bi->metadata_size - bi->pi_tuple_size;
+	if (meta_cap.lbmd_opaque_size && !bi->pi_offset)
+		meta_cap.lbmd_opaque_offset = bi->pi_tuple_size;
+
+	meta_cap.lbmd_guard_tag_type = bi->csum_type;
+	if (bi->csum_type != BLK_INTEGRITY_CSUM_NONE)
+		meta_cap.lbmd_app_tag_size = 2;
+
+	if (bi->flags & BLK_INTEGRITY_REF_TAG) {
+		switch (bi->csum_type) {
+		case BLK_INTEGRITY_CSUM_CRC64:
+			meta_cap.lbmd_ref_tag_size =
+				sizeof_field(struct crc64_pi_tuple, ref_tag);
+			break;
+		case BLK_INTEGRITY_CSUM_CRC:
+		case BLK_INTEGRITY_CSUM_IP:
+			meta_cap.lbmd_ref_tag_size =
+				sizeof_field(struct t10_pi_tuple, ref_tag);
+			break;
+		default:
+			break;
+		}
+	}
+
+out:
+	return copy_struct_to_user(argp, usize, &meta_cap, sizeof(meta_cap),
+				   NULL);
+}
+
 /**
  * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist
  * @rq:		request to map
diff --git a/block/ioctl.c b/block/ioctl.c
index e472cc1030c6..9ad403733e19 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -13,6 +13,7 @@
 #include <linux/uaccess.h>
 #include <linux/pagemap.h>
 #include <linux/io_uring/cmd.h>
+#include <linux/blk-integrity.h>
 #include <uapi/linux/blkdev.h>
 #include "blk.h"
 #include "blk-crypto-internal.h"
@@ -566,6 +567,9 @@ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode,
 {
 	unsigned int max_sectors;
 
+	if (_IOC_NR(cmd) == _IOC_NR(FS_IOC_GETLBMD_CAP))
+		return blk_get_meta_cap(bdev, cmd, argp);
+
 	switch (cmd) {
 	case BLKFLSBUF:
 		return blkdev_flushbuf(bdev, cmd, arg);
diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h
index d27730da47f3..e04c6e5bf1c6 100644
--- a/include/linux/blk-integrity.h
+++ b/include/linux/blk-integrity.h
@@ -29,6 +29,8 @@ int blk_rq_map_integrity_sg(struct request *, struct scatterlist *);
 int blk_rq_count_integrity_sg(struct request_queue *, struct bio *);
 int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf,
 			      ssize_t bytes);
+int blk_get_meta_cap(struct block_device *bdev, unsigned int cmd,
+		     struct logical_block_metadata_cap __user *argp);
 
 static inline bool
 blk_integrity_queue_supports_integrity(struct request_queue *q)
@@ -92,6 +94,11 @@ static inline struct bio_vec rq_integrity_vec(struct request *rq)
 				 rq->bio->bi_integrity->bip_iter);
 }
 #else /* CONFIG_BLK_DEV_INTEGRITY */
+static inline int blk_get_meta_cap(struct block_device *bdev, unsigned int cmd,
+				   struct logical_block_metadata_cap __user *argp)
+{
+	return -EOPNOTSUPP;
+}
 static inline int blk_rq_count_integrity_sg(struct request_queue *q,
 					    struct bio *b)
 {
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 0098b0ce8ccb..83720a2fd20d 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -91,6 +91,63 @@ struct fs_sysfs_path {
 	__u8			name[128];
 };
 
+/* Protection info capability flags */
+#define	LBMD_PI_CAP_INTEGRITY		(1 << 0)
+#define	LBMD_PI_CAP_REFTAG		(1 << 1)
+
+/* Checksum types for Protection Information */
+#define LBMD_PI_CSUM_NONE		0
+#define LBMD_PI_CSUM_IP			1
+#define LBMD_PI_CSUM_CRC16_T10DIF	2
+#define LBMD_PI_CSUM_CRC64_NVME		4
+
+/* sizeof first published struct */
+#define LBMD_SIZE_VER0			16
+
+/*
+ * Logical block metadata capability descriptor
+ * If the device does not support metadata, all the fields will be zero.
+ * Applications must check lbmd_flags to determine whether metadata is
+ * supported or not.
+ */
+struct logical_block_metadata_cap {
+	/* Bitmask of logical block metadata capability flags */
+	__u32	lbmd_flags;
+	/*
+	 * The amount of data described by each unit of logical block
+	 * metadata
+	 */
+	__u16	lbmd_interval;
+	/*
+	 * Size in bytes of the logical block metadata associated with each
+	 * interval
+	 */
+	__u8	lbmd_size;
+	/*
+	 * Size in bytes of the opaque block tag associated with each
+	 * interval
+	 */
+	__u8	lbmd_opaque_size;
+	/*
+	 * Offset in bytes of the opaque block tag within the logical block
+	 * metadata
+	 */
+	__u8	lbmd_opaque_offset;
+	/* Size in bytes of the T10 PI tuple associated with each interval */
+	__u8	lbmd_pi_size;
+	/* Offset in bytes of T10 PI tuple within the logical block metadata */
+	__u8	lbmd_pi_offset;
+	/* T10 PI guard tag type */
+	__u8	lbmd_guard_tag_type;
+	/* Size in bytes of the T10 PI application tag */
+	__u8	lbmd_app_tag_size;
+	/* Size in bytes of the T10 PI reference tag */
+	__u8	lbmd_ref_tag_size;
+	/* Size in bytes of the T10 PI storage tag */
+	__u8	lbmd_storage_tag_size;
+	__u8	pad;
+};
+
 /* extent-same (dedupe) ioctls; these MUST match the btrfs ioctl definitions */
 #define FILE_DEDUPE_RANGE_SAME		0
 #define FILE_DEDUPE_RANGE_DIFFERS	1
@@ -247,6 +304,8 @@ struct fsxattr {
  * also /sys/kernel/debug/ for filesystems with debugfs exports
  */
 #define FS_IOC_GETFSSYSFSPATH		_IOR(0x15, 1, struct fs_sysfs_path)
+/* Get logical block metadata capability details */
+#define FS_IOC_GETLBMD_CAP		_IOWR(0x15, 2, struct logical_block_metadata_cap)
 
 /*
  * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS)
-- 
cgit v1.2.3


From e78f70bad29c5ae1e1076698b690b15794e9b81e Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 1 Jul 2025 14:32:25 +0200
Subject: time/timecounter: Fix the lie that struct cyclecounter is const

In both the read callback for struct cyclecounter, and in struct
timecounter, struct cyclecounter is declared as a const pointer.

Unfortunately, a number of users of this pointer treat it as a non-const
pointer as it is buried in a larger structure that is heavily modified by
the callback function when accessed.  This lie had been hidden by the fact
that container_of() "casts away" a const attribute of a pointer without any
compiler warning happening at all.

Fix this all up by removing the const attribute in the needed places so
that everyone can see that the structure really isn't const, but can,
and is, modified by the users of it.

Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/2025070124-backyard-hurt-783a@gregkh
---
 arch/microblaze/kernel/timer.c                        | 2 +-
 drivers/clocksource/arm_arch_timer.c                  | 2 +-
 drivers/net/can/rockchip/rockchip_canfd-timestamp.c   | 2 +-
 drivers/net/can/spi/mcp251xfd/mcp251xfd-timestamp.c   | 2 +-
 drivers/net/can/usb/gs_usb.c                          | 2 +-
 drivers/net/dsa/mv88e6xxx/chip.h                      | 2 +-
 drivers/net/dsa/mv88e6xxx/ptp.c                       | 6 +++---
 drivers/net/ethernet/amd/xgbe/xgbe-ptp.c              | 2 +-
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c      | 2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c         | 2 +-
 drivers/net/ethernet/cavium/common/cavium_ptp.c       | 2 +-
 drivers/net/ethernet/freescale/fec_ptp.c              | 2 +-
 drivers/net/ethernet/intel/e1000e/netdev.c            | 2 +-
 drivers/net/ethernet/intel/igb/igb_ptp.c              | 4 ++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c          | 4 ++--
 drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c | 2 +-
 drivers/net/ethernet/mellanox/mlx4/en_clock.c         | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c   | 2 +-
 drivers/net/ethernet/mellanox/mlxsw/spectrum_ptp.c    | 2 +-
 drivers/net/ethernet/pensando/ionic/ionic_phc.c       | 2 +-
 drivers/net/ethernet/qlogic/qede/qede_ptp.c           | 2 +-
 drivers/net/ethernet/ti/cpts.c                        | 2 +-
 drivers/net/ethernet/wangxun/libwx/wx_ptp.c           | 2 +-
 drivers/ptp/ptp_mock.c                                | 2 +-
 drivers/ptp/ptp_vclock.c                              | 2 +-
 include/linux/timecounter.h                           | 6 +++---
 kernel/time/timecounter.c                             | 2 +-
 sound/hda/hdac_stream.c                               | 2 +-
 28 files changed, 34 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/arch/microblaze/kernel/timer.c b/arch/microblaze/kernel/timer.c
index ccb4b4b59bca..a2ab67b747a1 100644
--- a/arch/microblaze/kernel/timer.c
+++ b/arch/microblaze/kernel/timer.c
@@ -193,7 +193,7 @@ static struct timecounter xilinx_tc = {
 	.cc = NULL,
 };
 
-static u64 xilinx_cc_read(const struct cyclecounter *cc)
+static u64 xilinx_cc_read(struct cyclecounter *cc)
 {
 	return xilinx_read(NULL);
 }
diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c
index 981a578043a5..80ba6a54248c 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -243,7 +243,7 @@ static u64 arch_counter_read(struct clocksource *cs)
 	return arch_timer_read_counter();
 }
 
-static u64 arch_counter_read_cc(const struct cyclecounter *cc)
+static u64 arch_counter_read_cc(struct cyclecounter *cc)
 {
 	return arch_timer_read_counter();
 }
diff --git a/drivers/net/can/rockchip/rockchip_canfd-timestamp.c b/drivers/net/can/rockchip/rockchip_canfd-timestamp.c
index fa85a75be65a..72774cd2f94b 100644
--- a/drivers/net/can/rockchip/rockchip_canfd-timestamp.c
+++ b/drivers/net/can/rockchip/rockchip_canfd-timestamp.c
@@ -8,7 +8,7 @@
 
 #include "rockchip_canfd.h"
 
-static u64 rkcanfd_timestamp_read(const struct cyclecounter *cc)
+static u64 rkcanfd_timestamp_read(struct cyclecounter *cc)
 {
 	const struct rkcanfd_priv *priv = container_of(cc, struct rkcanfd_priv, cc);
 
diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-timestamp.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-timestamp.c
index 202ca0d24d03..413a5cb75c13 100644
--- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-timestamp.c
+++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-timestamp.c
@@ -11,7 +11,7 @@
 
 #include "mcp251xfd.h"
 
-static u64 mcp251xfd_timestamp_raw_read(const struct cyclecounter *cc)
+static u64 mcp251xfd_timestamp_raw_read(struct cyclecounter *cc)
 {
 	const struct mcp251xfd_priv *priv;
 	u32 ts_raw = 0;
diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c
index bb6335278e46..c9482d6e947b 100644
--- a/drivers/net/can/usb/gs_usb.c
+++ b/drivers/net/can/usb/gs_usb.c
@@ -420,7 +420,7 @@ static inline int gs_usb_get_timestamp(const struct gs_usb *parent,
 	return 0;
 }
 
-static u64 gs_usb_timestamp_read(const struct cyclecounter *cc) __must_hold(&dev->tc_lock)
+static u64 gs_usb_timestamp_read(struct cyclecounter *cc) __must_hold(&dev->tc_lock)
 {
 	struct gs_usb *parent = container_of(cc, struct gs_usb, cc);
 	u32 timestamp = 0;
diff --git a/drivers/net/dsa/mv88e6xxx/chip.h b/drivers/net/dsa/mv88e6xxx/chip.h
index 7d00482f53a3..feddf505c918 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.h
+++ b/drivers/net/dsa/mv88e6xxx/chip.h
@@ -732,7 +732,7 @@ struct mv88e6xxx_avb_ops {
 };
 
 struct mv88e6xxx_ptp_ops {
-	u64 (*clock_read)(const struct cyclecounter *cc);
+	u64 (*clock_read)(struct cyclecounter *cc);
 	int (*ptp_enable)(struct ptp_clock_info *ptp,
 			  struct ptp_clock_request *rq, int on);
 	int (*ptp_verify)(struct ptp_clock_info *ptp, unsigned int pin,
diff --git a/drivers/net/dsa/mv88e6xxx/ptp.c b/drivers/net/dsa/mv88e6xxx/ptp.c
index 1d3b2c94c53e..e8c9207e932e 100644
--- a/drivers/net/dsa/mv88e6xxx/ptp.c
+++ b/drivers/net/dsa/mv88e6xxx/ptp.c
@@ -138,7 +138,7 @@ mv88e6xxx_cc_coeff_get(struct mv88e6xxx_chip *chip)
 	}
 }
 
-static u64 mv88e6352_ptp_clock_read(const struct cyclecounter *cc)
+static u64 mv88e6352_ptp_clock_read(struct cyclecounter *cc)
 {
 	struct mv88e6xxx_chip *chip = cc_to_chip(cc);
 	u16 phc_time[2];
@@ -152,7 +152,7 @@ static u64 mv88e6352_ptp_clock_read(const struct cyclecounter *cc)
 		return ((u32)phc_time[1] << 16) | phc_time[0];
 }
 
-static u64 mv88e6165_ptp_clock_read(const struct cyclecounter *cc)
+static u64 mv88e6165_ptp_clock_read(struct cyclecounter *cc)
 {
 	struct mv88e6xxx_chip *chip = cc_to_chip(cc);
 	u16 phc_time[2];
@@ -483,7 +483,7 @@ const struct mv88e6xxx_ptp_ops mv88e6390_ptp_ops = {
 		(1 << HWTSTAMP_FILTER_PTP_V2_DELAY_REQ),
 };
 
-static u64 mv88e6xxx_ptp_clock_read(const struct cyclecounter *cc)
+static u64 mv88e6xxx_ptp_clock_read(struct cyclecounter *cc)
 {
 	struct mv88e6xxx_chip *chip = cc_to_chip(cc);
 
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-ptp.c b/drivers/net/ethernet/amd/xgbe/xgbe-ptp.c
index 978c4dd01fa0..e8d5c05de77a 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-ptp.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-ptp.c
@@ -13,7 +13,7 @@
 #include "xgbe.h"
 #include "xgbe-common.h"
 
-static u64 xgbe_cc_read(const struct cyclecounter *cc)
+static u64 xgbe_cc_read(struct cyclecounter *cc)
 {
 	struct xgbe_prv_data *pdata = container_of(cc,
 						   struct xgbe_prv_data,
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index c9a1a1d504c0..48ad2d6e125b 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -15176,7 +15176,7 @@ void bnx2x_set_rx_ts(struct bnx2x *bp, struct sk_buff *skb)
 }
 
 /* Read the PHC */
-static u64 bnx2x_cyclecounter_read(const struct cyclecounter *cc)
+static u64 bnx2x_cyclecounter_read(struct cyclecounter *cc)
 {
 	struct bnx2x *bp = container_of(cc, struct bnx2x, cyclecounter);
 	int port = BP_PORT(bp);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c
index 0669d43472f5..7542b6d2568b 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c
@@ -702,7 +702,7 @@ static void bnxt_unmap_ptp_regs(struct bnxt *bp)
 		  (BNXT_PTP_GRC_WIN - 1) * 4);
 }
 
-static u64 bnxt_cc_read(const struct cyclecounter *cc)
+static u64 bnxt_cc_read(struct cyclecounter *cc)
 {
 	struct bnxt_ptp_cfg *ptp = container_of(cc, struct bnxt_ptp_cfg, cc);
 	u64 ns = 0;
diff --git a/drivers/net/ethernet/cavium/common/cavium_ptp.c b/drivers/net/ethernet/cavium/common/cavium_ptp.c
index 984f0dd7b62e..61e261657073 100644
--- a/drivers/net/ethernet/cavium/common/cavium_ptp.c
+++ b/drivers/net/ethernet/cavium/common/cavium_ptp.c
@@ -209,7 +209,7 @@ static int cavium_ptp_enable(struct ptp_clock_info *ptp_info,
 	return -EOPNOTSUPP;
 }
 
-static u64 cavium_ptp_cc_read(const struct cyclecounter *cc)
+static u64 cavium_ptp_cc_read(struct cyclecounter *cc)
 {
 	struct cavium_ptp *clock =
 		container_of(cc, struct cavium_ptp, cycle_counter);
diff --git a/drivers/net/ethernet/freescale/fec_ptp.c b/drivers/net/ethernet/freescale/fec_ptp.c
index 876d90832596..c28ca17a81fd 100644
--- a/drivers/net/ethernet/freescale/fec_ptp.c
+++ b/drivers/net/ethernet/freescale/fec_ptp.c
@@ -96,7 +96,7 @@
  * cyclecounter structure used to construct a ns counter from the
  * arbitrary fixed point registers
  */
-static u64 fec_ptp_read(const struct cyclecounter *cc)
+static u64 fec_ptp_read(struct cyclecounter *cc)
 {
 	struct fec_enet_private *fep =
 		container_of(cc, struct fec_enet_private, cc);
diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index 7719e15813ee..b27a61fab371 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -4436,7 +4436,7 @@ u64 e1000e_read_systim(struct e1000_adapter *adapter,
  * e1000e_cyclecounter_read - read raw cycle counter (used by time counter)
  * @cc: cyclecounter structure
  **/
-static u64 e1000e_cyclecounter_read(const struct cyclecounter *cc)
+static u64 e1000e_cyclecounter_read(struct cyclecounter *cc)
 {
 	struct e1000_adapter *adapter = container_of(cc, struct e1000_adapter,
 						     cc);
diff --git a/drivers/net/ethernet/intel/igb/igb_ptp.c b/drivers/net/ethernet/intel/igb/igb_ptp.c
index 793c96016288..2f1fae2bcdd2 100644
--- a/drivers/net/ethernet/intel/igb/igb_ptp.c
+++ b/drivers/net/ethernet/intel/igb/igb_ptp.c
@@ -73,7 +73,7 @@ static void igb_ptp_tx_hwtstamp(struct igb_adapter *adapter);
 static void igb_ptp_sdp_init(struct igb_adapter *adapter);
 
 /* SYSTIM read access for the 82576 */
-static u64 igb_ptp_read_82576(const struct cyclecounter *cc)
+static u64 igb_ptp_read_82576(struct cyclecounter *cc)
 {
 	struct igb_adapter *igb = container_of(cc, struct igb_adapter, cc);
 	struct e1000_hw *hw = &igb->hw;
@@ -90,7 +90,7 @@ static u64 igb_ptp_read_82576(const struct cyclecounter *cc)
 }
 
 /* SYSTIM read access for the 82580 */
-static u64 igb_ptp_read_82580(const struct cyclecounter *cc)
+static u64 igb_ptp_read_82580(struct cyclecounter *cc)
 {
 	struct igb_adapter *igb = container_of(cc, struct igb_adapter, cc);
 	struct e1000_hw *hw = &igb->hw;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c
index eef25e11d938..eafb61415bd1 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c
@@ -327,7 +327,7 @@ static void ixgbe_ptp_setup_sdp_X550(struct ixgbe_adapter *adapter)
  * result of SYSTIME is 32bits of "billions of cycles" and 32 bits of
  * "cycles", rather than seconds and nanoseconds.
  */
-static u64 ixgbe_ptp_read_X550(const struct cyclecounter *cc)
+static u64 ixgbe_ptp_read_X550(struct cyclecounter *cc)
 {
 	struct ixgbe_adapter *adapter =
 		container_of(cc, struct ixgbe_adapter, hw_cc);
@@ -364,7 +364,7 @@ static u64 ixgbe_ptp_read_X550(const struct cyclecounter *cc)
  * cyclecounter structure used to construct a ns counter from the
  * arbitrary fixed point registers
  */
-static u64 ixgbe_ptp_read_82599(const struct cyclecounter *cc)
+static u64 ixgbe_ptp_read_82599(struct cyclecounter *cc)
 {
 	struct ixgbe_adapter *adapter =
 		container_of(cc, struct ixgbe_adapter, hw_cc);
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c
index 63130ba37e9d..e52cc6b1a26c 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c
@@ -193,7 +193,7 @@ static int ptp_pps_on(struct otx2_ptp *ptp, int on, u64 period)
 	return otx2_sync_mbox_msg(&ptp->nic->mbox);
 }
 
-static u64 ptp_cc_read(const struct cyclecounter *cc)
+static u64 ptp_cc_read(struct cyclecounter *cc)
 {
 	struct otx2_ptp *ptp = container_of(cc, struct otx2_ptp, cycle_counter);
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_clock.c b/drivers/net/ethernet/mellanox/mlx4/en_clock.c
index d73a2044dc26..2aeaafcfb993 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_clock.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_clock.c
@@ -38,7 +38,7 @@
 
 /* mlx4_en_read_clock - read raw cycle counter (to be used by time counter)
  */
-static u64 mlx4_en_read_clock(const struct cyclecounter *tc)
+static u64 mlx4_en_read_clock(struct cyclecounter *tc)
 {
 	struct mlx4_en_dev *mdev =
 		container_of(tc, struct mlx4_en_dev, cycles);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c
index cec18efadc73..214d732d18e9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c
@@ -343,7 +343,7 @@ static u64 mlx5_read_time(struct mlx5_core_dev *dev,
 			   (u64)timer_l | (u64)timer_h1 << 32;
 }
 
-static u64 read_internal_timer(const struct cyclecounter *cc)
+static u64 read_internal_timer(struct cyclecounter *cc)
 {
 	struct mlx5_timer *timer = container_of(cc, struct mlx5_timer, cycles);
 	struct mlx5_clock *clock = container_of(timer, struct mlx5_clock, timer);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ptp.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ptp.c
index e8182dd76c7d..5b9f0844b8f6 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ptp.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ptp.c
@@ -131,7 +131,7 @@ static u64 __mlxsw_sp1_ptp_read_frc(struct mlxsw_sp1_ptp_clock *clock,
 	return (u64) frc_l | (u64) frc_h2 << 32;
 }
 
-static u64 mlxsw_sp1_ptp_read_frc(const struct cyclecounter *cc)
+static u64 mlxsw_sp1_ptp_read_frc(struct cyclecounter *cc)
 {
 	struct mlxsw_sp1_ptp_clock *clock =
 		container_of(cc, struct mlxsw_sp1_ptp_clock, cycles);
diff --git a/drivers/net/ethernet/pensando/ionic/ionic_phc.c b/drivers/net/ethernet/pensando/ionic/ionic_phc.c
index 7505efdff8e9..9f5c81d44f99 100644
--- a/drivers/net/ethernet/pensando/ionic/ionic_phc.c
+++ b/drivers/net/ethernet/pensando/ionic/ionic_phc.c
@@ -290,7 +290,7 @@ static u64 ionic_hwstamp_read(struct ionic *ionic,
 	return (u64)tick_low | ((u64)tick_high << 32);
 }
 
-static u64 ionic_cc_read(const struct cyclecounter *cc)
+static u64 ionic_cc_read(struct cyclecounter *cc)
 {
 	struct ionic_phc *phc = container_of(cc, struct ionic_phc, cc);
 	struct ionic *ionic = phc->lif->ionic;
diff --git a/drivers/net/ethernet/qlogic/qede/qede_ptp.c b/drivers/net/ethernet/qlogic/qede/qede_ptp.c
index 9d6399a5c780..a38f1e72c62b 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_ptp.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_ptp.c
@@ -181,7 +181,7 @@ static void qede_ptp_task(struct work_struct *work)
 }
 
 /* Read the PHC. This API is invoked with ptp_lock held. */
-static u64 qede_ptp_read_cc(const struct cyclecounter *cc)
+static u64 qede_ptp_read_cc(struct cyclecounter *cc)
 {
 	struct qede_dev *edev;
 	struct qede_ptp *ptp;
diff --git a/drivers/net/ethernet/ti/cpts.c b/drivers/net/ethernet/ti/cpts.c
index dbbea9146040..2ba4c8795d60 100644
--- a/drivers/net/ethernet/ti/cpts.c
+++ b/drivers/net/ethernet/ti/cpts.c
@@ -181,7 +181,7 @@ void cpts_misc_interrupt(struct cpts *cpts)
 }
 EXPORT_SYMBOL_GPL(cpts_misc_interrupt);
 
-static u64 cpts_systim_read(const struct cyclecounter *cc)
+static u64 cpts_systim_read(struct cyclecounter *cc)
 {
 	struct cpts *cpts = container_of(cc, struct cpts, cc);
 
diff --git a/drivers/net/ethernet/wangxun/libwx/wx_ptp.c b/drivers/net/ethernet/wangxun/libwx/wx_ptp.c
index 2c39b879f977..44f3e6505246 100644
--- a/drivers/net/ethernet/wangxun/libwx/wx_ptp.c
+++ b/drivers/net/ethernet/wangxun/libwx/wx_ptp.c
@@ -652,7 +652,7 @@ static int wx_ptp_set_timestamp_mode(struct wx *wx,
 	return 0;
 }
 
-static u64 wx_ptp_read(const struct cyclecounter *hw_cc)
+static u64 wx_ptp_read(struct cyclecounter *hw_cc)
 {
 	struct wx *wx = container_of(hw_cc, struct wx, hw_cc);
 
diff --git a/drivers/ptp/ptp_mock.c b/drivers/ptp/ptp_mock.c
index e7b459c846a2..bbd14ce24b34 100644
--- a/drivers/ptp/ptp_mock.c
+++ b/drivers/ptp/ptp_mock.c
@@ -41,7 +41,7 @@ struct mock_phc {
 	spinlock_t lock;
 };
 
-static u64 mock_phc_cc_read(const struct cyclecounter *cc)
+static u64 mock_phc_cc_read(struct cyclecounter *cc)
 {
 	return ktime_get_raw_ns();
 }
diff --git a/drivers/ptp/ptp_vclock.c b/drivers/ptp/ptp_vclock.c
index 7febfdcbde8b..2fdeedd60e21 100644
--- a/drivers/ptp/ptp_vclock.c
+++ b/drivers/ptp/ptp_vclock.c
@@ -164,7 +164,7 @@ static const struct ptp_clock_info ptp_vclock_info = {
 	.do_aux_work	= ptp_vclock_refresh,
 };
 
-static u64 ptp_vclock_read(const struct cyclecounter *cc)
+static u64 ptp_vclock_read(struct cyclecounter *cc)
 {
 	struct ptp_vclock *vclock = cc_to_vclock(cc);
 	struct ptp_clock *ptp = vclock->pclock;
diff --git a/include/linux/timecounter.h b/include/linux/timecounter.h
index 0982d1d52b24..dce03a5cafb7 100644
--- a/include/linux/timecounter.h
+++ b/include/linux/timecounter.h
@@ -28,7 +28,7 @@
  * @shift:		cycle to nanosecond divisor (power of two)
  */
 struct cyclecounter {
-	u64 (*read)(const struct cyclecounter *cc);
+	u64 (*read)(struct cyclecounter *cc);
 	u64 mask;
 	u32 mult;
 	u32 shift;
@@ -53,7 +53,7 @@ struct cyclecounter {
  * @frac:		accumulated fractional nanoseconds
  */
 struct timecounter {
-	const struct cyclecounter *cc;
+	struct cyclecounter *cc;
 	u64 cycle_last;
 	u64 nsec;
 	u64 mask;
@@ -100,7 +100,7 @@ static inline void timecounter_adjtime(struct timecounter *tc, s64 delta)
  * the time stamp counter by the number of elapsed nanoseconds.
  */
 extern void timecounter_init(struct timecounter *tc,
-			     const struct cyclecounter *cc,
+			     struct cyclecounter *cc,
 			     u64 start_tstamp);
 
 /**
diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c
index e6285288d765..3d2a354cfe1c 100644
--- a/kernel/time/timecounter.c
+++ b/kernel/time/timecounter.c
@@ -6,7 +6,7 @@
 #include <linux/timecounter.h>
 
 void timecounter_init(struct timecounter *tc,
-		      const struct cyclecounter *cc,
+		      struct cyclecounter *cc,
 		      u64 start_tstamp)
 {
 	tc->cc = cc;
diff --git a/sound/hda/hdac_stream.c b/sound/hda/hdac_stream.c
index e7f6208af5b0..4a87bef8834f 100644
--- a/sound/hda/hdac_stream.c
+++ b/sound/hda/hdac_stream.c
@@ -634,7 +634,7 @@ int snd_hdac_stream_set_params(struct hdac_stream *azx_dev,
 }
 EXPORT_SYMBOL_GPL(snd_hdac_stream_set_params);
 
-static u64 azx_cc_read(const struct cyclecounter *cc)
+static u64 azx_cc_read(struct cyclecounter *cc)
 {
 	struct hdac_stream *azx_dev = container_of(cc, struct hdac_stream, cc);
 
-- 
cgit v1.2.3


From b9f58d3572a8e1ef707b941eae58ec4014b9269d Mon Sep 17 00:00:00 2001
From: Li Chen <chenl311@chinatelecom.cn>
Date: Fri, 20 Jun 2025 21:13:07 +0800
Subject: ACPI: Return -ENODEV from acpi_parse_spcr() when SPCR support is
 disabled

If CONFIG_ACPI_SPCR_TABLE is disabled, acpi_parse_spcr()
currently returns 0, which may incorrectly suggest that
SPCR parsing was successful. This patch changes the behavior
to return -ENODEV to clearly indicate that SPCR support
is not available.

This prepares the codebase for future changes that depend
on acpi_parse_spcr() failure detection, such as suppressing
misleading console messages.

Signed-off-by: Li Chen <chenl311@chinatelecom.cn>
Acked-by: Hanjun Guo <guohanjun@huawei.com>
Link: https://lore.kernel.org/r/20250620131309.126555-2-me@linux.beauty
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 include/linux/acpi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index f102c0fe3431..71e692f95290 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1503,7 +1503,7 @@ int acpi_parse_spcr(bool enable_earlycon, bool enable_console);
 #else
 static inline int acpi_parse_spcr(bool enable_earlycon, bool enable_console)
 {
-	return 0;
+	return -ENODEV;
 }
 #endif
 
-- 
cgit v1.2.3


From 536f5941adde41c99a18a0ba03b457adc9702ab8 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 27 Jun 2025 17:19:23 +0300
Subject: libnvdimm: Don't use "proxy" headers

Update header inclusions to follow IWYU (Include What You Use)
principle.

Note that including kernel.h is discouraged, as noted at the top of
that file.

While doing that, sort headers alphabetically.

Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://patch.msgid.link/20250627142001.994860-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
---
 include/linux/libnvdimm.h | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index e772aae71843..28f086c4a187 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -6,12 +6,12 @@
  */
 #ifndef __LIBNVDIMM_H__
 #define __LIBNVDIMM_H__
-#include <linux/kernel.h>
+
+#include <linux/io.h>
 #include <linux/sizes.h>
+#include <linux/spinlock.h>
 #include <linux/types.h>
 #include <linux/uuid.h>
-#include <linux/spinlock.h>
-#include <linux/bio.h>
 
 struct badrange_entry {
 	u64 start;
@@ -80,7 +80,9 @@ typedef int (*ndctl_fn)(struct nvdimm_bus_descriptor *nd_desc,
 		struct nvdimm *nvdimm, unsigned int cmd, void *buf,
 		unsigned int buf_len, int *cmd_rc);
 
+struct attribute_group;
 struct device_node;
+struct module;
 struct nvdimm_bus_descriptor {
 	const struct attribute_group **attr_groups;
 	unsigned long cmd_mask;
@@ -121,6 +123,8 @@ struct nd_mapping_desc {
 	int position;
 };
 
+struct bio;
+struct resource;
 struct nd_region;
 struct nd_region_desc {
 	struct resource *res;
@@ -147,8 +151,6 @@ static inline void __iomem *devm_nvdimm_ioremap(struct device *dev,
 	return (void __iomem *) devm_nvdimm_memremap(dev, offset, size, 0);
 }
 
-struct nvdimm_bus;
-
 /*
  * Note that separate bits for locked + unlocked are defined so that
  * 'flags == 0' corresponds to an error / not-supported state.
@@ -238,6 +240,9 @@ struct nvdimm_fw_ops {
 	int (*arm)(struct nvdimm *nvdimm, enum nvdimm_fwa_trigger arg);
 };
 
+struct kobject;
+struct nvdimm_bus;
+
 void badrange_init(struct badrange *badrange);
 int badrange_add(struct badrange *badrange, u64 addr, u64 length);
 void badrange_forget(struct badrange *badrange, phys_addr_t start,
-- 
cgit v1.2.3


From b6139a6abf673029008f80d42abd3848d80a9108 Mon Sep 17 00:00:00 2001
From: Daniel Wagner <wagi@kernel.org>
Date: Tue, 17 Jun 2025 15:43:23 +0200
Subject: lib/group_cpus: Let group_cpus_evenly() return the number of
 initialized masks

group_cpus_evenly() might have allocated fewer groups than requested:

group_cpus_evenly()
  __group_cpus_evenly()
    alloc_nodes_groups()
      # allocated total groups may be less than numgrps when
      # active total CPU number is less than numgrps

In this case, the caller will do an out-of-bounds access because it
assumes the returned masks array has numgrps entries.

Return the number of groups created so the caller can limit the access
range accordingly.

Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Daniel Wagner <wagi@kernel.org>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20250617-isolcpus-queue-counters-v1-1-13923686b54b@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-cpumap.c        |  6 +++---
 drivers/virtio/virtio_vdpa.c |  9 +++++----
 fs/fuse/virtio_fs.c          |  6 +++---
 include/linux/group_cpus.h   |  2 +-
 kernel/irq/affinity.c        | 11 +++++------
 lib/group_cpus.c             | 16 ++++++++--------
 6 files changed, 25 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 444798c5374f..269161252add 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -19,9 +19,9 @@
 void blk_mq_map_queues(struct blk_mq_queue_map *qmap)
 {
 	const struct cpumask *masks;
-	unsigned int queue, cpu;
+	unsigned int queue, cpu, nr_masks;
 
-	masks = group_cpus_evenly(qmap->nr_queues);
+	masks = group_cpus_evenly(qmap->nr_queues, &nr_masks);
 	if (!masks) {
 		for_each_possible_cpu(cpu)
 			qmap->mq_map[cpu] = qmap->queue_offset;
@@ -29,7 +29,7 @@ void blk_mq_map_queues(struct blk_mq_queue_map *qmap)
 	}
 
 	for (queue = 0; queue < qmap->nr_queues; queue++) {
-		for_each_cpu(cpu, &masks[queue])
+		for_each_cpu(cpu, &masks[queue % nr_masks])
 			qmap->mq_map[cpu] = qmap->queue_offset + queue;
 	}
 	kfree(masks);
diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
index 1f60c9d5cb18..a7b297dae489 100644
--- a/drivers/virtio/virtio_vdpa.c
+++ b/drivers/virtio/virtio_vdpa.c
@@ -329,20 +329,21 @@ create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
 
 	for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) {
 		unsigned int this_vecs = affd->set_size[i];
+		unsigned int nr_masks;
 		int j;
-		struct cpumask *result = group_cpus_evenly(this_vecs);
+		struct cpumask *result = group_cpus_evenly(this_vecs, &nr_masks);
 
 		if (!result) {
 			kfree(masks);
 			return NULL;
 		}
 
-		for (j = 0; j < this_vecs; j++)
+		for (j = 0; j < nr_masks; j++)
 			cpumask_copy(&masks[curvec + j], &result[j]);
 		kfree(result);
 
-		curvec += this_vecs;
-		usedvecs += this_vecs;
+		curvec += nr_masks;
+		usedvecs += nr_masks;
 	}
 
 	/* Fill out vectors at the end that don't need affinity */
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 53c2626e90e7..3fbfb1a2942b 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -862,7 +862,7 @@ static void virtio_fs_requests_done_work(struct work_struct *work)
 static void virtio_fs_map_queues(struct virtio_device *vdev, struct virtio_fs *fs)
 {
 	const struct cpumask *mask, *masks;
-	unsigned int q, cpu;
+	unsigned int q, cpu, nr_masks;
 
 	/* First attempt to map using existing transport layer affinities
 	 * e.g. PCIe MSI-X
@@ -882,7 +882,7 @@ static void virtio_fs_map_queues(struct virtio_device *vdev, struct virtio_fs *f
 	return;
 fallback:
 	/* Attempt to map evenly in groups over the CPUs */
-	masks = group_cpus_evenly(fs->num_request_queues);
+	masks = group_cpus_evenly(fs->num_request_queues, &nr_masks);
 	/* If even this fails we default to all CPUs use first request queue */
 	if (!masks) {
 		for_each_possible_cpu(cpu)
@@ -891,7 +891,7 @@ fallback:
 	}
 
 	for (q = 0; q < fs->num_request_queues; q++) {
-		for_each_cpu(cpu, &masks[q])
+		for_each_cpu(cpu, &masks[q % nr_masks])
 			fs->mq_map[cpu] = q + VQ_REQUEST;
 	}
 	kfree(masks);
diff --git a/include/linux/group_cpus.h b/include/linux/group_cpus.h
index e42807ec61f6..9d4e5ab6c314 100644
--- a/include/linux/group_cpus.h
+++ b/include/linux/group_cpus.h
@@ -9,6 +9,6 @@
 #include <linux/kernel.h>
 #include <linux/cpu.h>
 
-struct cpumask *group_cpus_evenly(unsigned int numgrps);
+struct cpumask *group_cpus_evenly(unsigned int numgrps, unsigned int *nummasks);
 
 #endif
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 44a4eba80315..4013e6ad2b2f 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -69,21 +69,20 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
 	 * have multiple sets, build each sets affinity mask separately.
 	 */
 	for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) {
-		unsigned int this_vecs = affd->set_size[i];
-		int j;
-		struct cpumask *result = group_cpus_evenly(this_vecs);
+		unsigned int nr_masks, this_vecs = affd->set_size[i];
+		struct cpumask *result = group_cpus_evenly(this_vecs, &nr_masks);
 
 		if (!result) {
 			kfree(masks);
 			return NULL;
 		}
 
-		for (j = 0; j < this_vecs; j++)
+		for (int j = 0; j < nr_masks; j++)
 			cpumask_copy(&masks[curvec + j].mask, &result[j]);
 		kfree(result);
 
-		curvec += this_vecs;
-		usedvecs += this_vecs;
+		curvec += nr_masks;
+		usedvecs += nr_masks;
 	}
 
 	/* Fill out vectors at the end that don't need affinity */
diff --git a/lib/group_cpus.c b/lib/group_cpus.c
index 18d43a406114..6d08ac05f371 100644
--- a/lib/group_cpus.c
+++ b/lib/group_cpus.c
@@ -332,9 +332,11 @@ static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
 /**
  * group_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality
  * @numgrps: number of groups
+ * @nummasks: number of initialized cpumasks
  *
  * Return: cpumask array if successful, NULL otherwise. And each element
- * includes CPUs assigned to this group
+ * includes CPUs assigned to this group. nummasks contains the number
+ * of initialized masks which can be less than numgrps.
  *
  * Try to put close CPUs from viewpoint of CPU and NUMA locality into
  * same group, and run two-stage grouping:
@@ -344,7 +346,7 @@ static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
  * We guarantee in the resulted grouping that all CPUs are covered, and
  * no same CPU is assigned to multiple groups
  */
-struct cpumask *group_cpus_evenly(unsigned int numgrps)
+struct cpumask *group_cpus_evenly(unsigned int numgrps, unsigned int *nummasks)
 {
 	unsigned int curgrp = 0, nr_present = 0, nr_others = 0;
 	cpumask_var_t *node_to_cpumask;
@@ -389,7 +391,7 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps)
 	ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
 				  npresmsk, nmsk, masks);
 	if (ret < 0)
-		goto fail_build_affinity;
+		goto fail_node_to_cpumask;
 	nr_present = ret;
 
 	/*
@@ -408,10 +410,6 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps)
 	if (ret >= 0)
 		nr_others = ret;
 
- fail_build_affinity:
-	if (ret >= 0)
-		WARN_ON(nr_present + nr_others < numgrps);
-
  fail_node_to_cpumask:
 	free_node_to_cpumask(node_to_cpumask);
 
@@ -424,10 +422,11 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps)
 		kfree(masks);
 		return NULL;
 	}
+	*nummasks = min(nr_present + nr_others, numgrps);
 	return masks;
 }
 #else /* CONFIG_SMP */
-struct cpumask *group_cpus_evenly(unsigned int numgrps)
+struct cpumask *group_cpus_evenly(unsigned int numgrps, unsigned int *nummasks)
 {
 	struct cpumask *masks;
 
@@ -440,6 +439,7 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps)
 
 	/* assign all CPUs(cpu 0) to the 1st group only */
 	cpumask_copy(&masks[0], cpu_possible_mask);
+	*nummasks = 1;
 	return masks;
 }
 #endif /* CONFIG_SMP */
-- 
cgit v1.2.3


From 3f27c1de5df265f9d8edf0cc5d75dc92e328484a Mon Sep 17 00:00:00 2001
From: Daniel Wagner <wagi@kernel.org>
Date: Tue, 17 Jun 2025 15:43:24 +0200
Subject: blk-mq: add number of queue calc helper

Add two variants of helper functions that calculate the correct number
of queues to use. Two variants are needed because some drivers base
their maximum number of queues on the possible CPU mask, while others
use the online CPU mask.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Daniel Wagner <wagi@kernel.org>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Link: https://lore.kernel.org/r/20250617-isolcpus-queue-counters-v1-2-13923686b54b@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-cpumap.c  | 40 ++++++++++++++++++++++++++++++++++++++++
 include/linux/blk-mq.h |  2 ++
 2 files changed, 42 insertions(+)

(limited to 'include')

diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 269161252add..705da074ad6c 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -12,10 +12,50 @@
 #include <linux/cpu.h>
 #include <linux/group_cpus.h>
 #include <linux/device/bus.h>
+#include <linux/sched/isolation.h>
 
 #include "blk.h"
 #include "blk-mq.h"
 
+static unsigned int blk_mq_num_queues(const struct cpumask *mask,
+				      unsigned int max_queues)
+{
+	unsigned int num;
+
+	num = cpumask_weight(mask);
+	return min_not_zero(num, max_queues);
+}
+
+/**
+ * blk_mq_num_possible_queues - Calc nr of queues for multiqueue devices
+ * @max_queues:	The maximum number of queues the hardware/driver
+ *		supports. If max_queues is 0, the argument is
+ *		ignored.
+ *
+ * Calculates the number of queues to be used for a multiqueue
+ * device based on the number of possible CPUs.
+ */
+unsigned int blk_mq_num_possible_queues(unsigned int max_queues)
+{
+	return blk_mq_num_queues(cpu_possible_mask, max_queues);
+}
+EXPORT_SYMBOL_GPL(blk_mq_num_possible_queues);
+
+/**
+ * blk_mq_num_online_queues - Calc nr of queues for multiqueue devices
+ * @max_queues:	The maximum number of queues the hardware/driver
+ *		supports. If max_queues is 0, the argument is
+ *		ignored.
+ *
+ * Calculates the number of queues to be used for a multiqueue
+ * device based on the number of online CPUs.
+ */
+unsigned int blk_mq_num_online_queues(unsigned int max_queues)
+{
+	return blk_mq_num_queues(cpu_online_mask, max_queues);
+}
+EXPORT_SYMBOL_GPL(blk_mq_num_online_queues);
+
 void blk_mq_map_queues(struct blk_mq_queue_map *qmap)
 {
 	const struct cpumask *masks;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index de8c85a03bb7..2a5a828f19a0 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -947,6 +947,8 @@ int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
 void blk_mq_unfreeze_queue_non_owner(struct request_queue *q);
 void blk_freeze_queue_start_non_owner(struct request_queue *q);
 
+unsigned int blk_mq_num_possible_queues(unsigned int max_queues);
+unsigned int blk_mq_num_online_queues(unsigned int max_queues);
 void blk_mq_map_queues(struct blk_mq_queue_map *qmap);
 void blk_mq_map_hw_queues(struct blk_mq_queue_map *qmap,
 			  struct device *dev, unsigned int offset);
-- 
cgit v1.2.3


From fc9a099567813a9fef0da07b94ecb8dee64703c4 Mon Sep 17 00:00:00 2001
From: Petr Tesarik <ptesarik@suse.com>
Date: Fri, 27 Jun 2025 12:10:11 +0200
Subject: docs: dma-api: add a kernel-doc comment for dma_pool_zalloc()

Document the dma_pool_zalloc() wrapper.

Signed-off-by: Petr Tesarik <ptesarik@suse.com>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Acked-by: Marek Szyprowski <m.szyprowski@samsung.com>
[jc: fixed up dma_pool_alloc() reference in dmapool.h]
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Link: https://lore.kernel.org/r/20250627101015.1600042-5-ptesarik@suse.com
---
 Documentation/core-api/mm-api.rst | 2 ++
 include/linux/dmapool.h           | 8 ++++++++
 2 files changed, 10 insertions(+)

(limited to 'include')

diff --git a/Documentation/core-api/mm-api.rst b/Documentation/core-api/mm-api.rst
index af8151db88b2..a61766328ac0 100644
--- a/Documentation/core-api/mm-api.rst
+++ b/Documentation/core-api/mm-api.rst
@@ -97,6 +97,8 @@ DMA pools
 .. kernel-doc:: mm/dmapool.c
    :export:
 
+.. kernel-doc:: include/linux/dmapool.h
+
 More Memory Management Functions
 ================================
 
diff --git a/include/linux/dmapool.h b/include/linux/dmapool.h
index 06c4de602b2f..7d40b51933d1 100644
--- a/include/linux/dmapool.h
+++ b/include/linux/dmapool.h
@@ -60,6 +60,14 @@ static inline struct dma_pool *dma_pool_create(const char *name,
 				    NUMA_NO_NODE);
 }
 
+/**
+ * dma_pool_zalloc - Get a zero-initialized block of DMA coherent memory.
+ * @pool: dma pool that will produce the block
+ * @mem_flags: GFP_* bitmask
+ * @handle: pointer to dma address of block
+ *
+ * Same as dma_pool_alloc(), but the returned memory is zeroed.
+ */
 static inline void *dma_pool_zalloc(struct dma_pool *pool, gfp_t mem_flags,
 				    dma_addr_t *handle)
 {
-- 
cgit v1.2.3


From 9df7b5ebead649b00bf9a53a798e4bf83a1318fd Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 1 Jul 2025 17:38:37 +0100
Subject: netfs: Fix double put of request

If a netfs request finishes during the pause loop, it will have the ref
that belongs to the IN_PROGRESS flag removed at that point - however, if it
then goes to the final wait loop, that will *also* put the ref because it
sees that the IN_PROGRESS flag is clear and incorrectly assumes that this
happened when it called the collector.

In fact, since IN_PROGRESS is clear, we shouldn't call the collector again
since it's done all the cleanup, such as calling ->ki_complete().

Fix this by making netfs_collect_in_app() just return, indicating that
we're done if IN_PROGRESS is removed.

Fixes: 2b1424cd131c ("netfs: Fix wait/wake to be consistent about the waitqueue used")
Signed-off-by: David Howells <dhowells@redhat.com>
Link: https://lore.kernel.org/20250701163852.2171681-3-dhowells@redhat.com
Tested-by: Steve French <sfrench@samba.org>
Reviewed-by: Paulo Alcantara <pc@manguebit.org>
cc: Steve French <sfrench@samba.org>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
cc: linux-cifs@vger.kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/netfs/misc.c              | 5 +++++
 include/trace/events/netfs.h | 1 +
 2 files changed, 6 insertions(+)

(limited to 'include')

diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c
index 0a54b1203486..8cf73b237269 100644
--- a/fs/netfs/misc.c
+++ b/fs/netfs/misc.c
@@ -383,6 +383,11 @@ static int netfs_collect_in_app(struct netfs_io_request *rreq,
 {
 	bool need_collect = false, inactive = true, done = true;
 
+	if (!netfs_check_rreq_in_progress(rreq)) {
+		trace_netfs_rreq(rreq, netfs_rreq_trace_recollect);
+		return 1; /* Done */
+	}
+
 	for (int i = 0; i < NR_IO_STREAMS; i++) {
 		struct netfs_io_subrequest *subreq;
 		struct netfs_io_stream *stream = &rreq->io_streams[i];
diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h
index 333d2e38dd2c..ba35dc66e986 100644
--- a/include/trace/events/netfs.h
+++ b/include/trace/events/netfs.h
@@ -56,6 +56,7 @@
 	EM(netfs_rreq_trace_dirty,		"DIRTY  ")	\
 	EM(netfs_rreq_trace_done,		"DONE   ")	\
 	EM(netfs_rreq_trace_free,		"FREE   ")	\
+	EM(netfs_rreq_trace_recollect,		"RECLLCT")	\
 	EM(netfs_rreq_trace_redirty,		"REDIRTY")	\
 	EM(netfs_rreq_trace_resubmit,		"RESUBMT")	\
 	EM(netfs_rreq_trace_set_abandon,	"S-ABNDN")	\
-- 
cgit v1.2.3


From 5e1e6ec2e346c0850f304c31abdef4158007474e Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 1 Jul 2025 17:38:46 +0100
Subject: netfs: Merge i_size update functions

Netfslib has two functions for updating the i_size after a write: one for
buffered writes into the pagecache and one for direct/unbuffered writes.
However, what needs to be done is much the same in both cases, so merge
them together.

This does raise one question, though: should updating the i_size after a
direct write do the same estimated update of i_blocks as is done for
buffered writes?

Also get rid of the cleanup function pointer from netfs_io_request as it's
only used for direct write to update i_size; instead do the i_size setting
directly from write collection.

Signed-off-by: David Howells <dhowells@redhat.com>
Link: https://lore.kernel.org/20250701163852.2171681-12-dhowells@redhat.com
cc: Steve French <sfrench@samba.org>
cc: Paulo Alcantara <pc@manguebit.org>
cc: linux-cifs@vger.kernel.org
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/netfs/buffered_write.c | 36 +++++++++++++++++++++---------------
 fs/netfs/direct_write.c   | 19 -------------------
 fs/netfs/internal.h       |  6 ++++++
 fs/netfs/write_collect.c  |  6 ++++--
 include/linux/netfs.h     |  1 -
 5 files changed, 31 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index b87ef3fe4ea4..f27ea5099a68 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -53,30 +53,38 @@ static struct folio *netfs_grab_folio_for_write(struct address_space *mapping,
  * data written into the pagecache until we can find out from the server what
  * the values actually are.
  */
-static void netfs_update_i_size(struct netfs_inode *ctx, struct inode *inode,
-				loff_t i_size, loff_t pos, size_t copied)
+void netfs_update_i_size(struct netfs_inode *ctx, struct inode *inode,
+			 loff_t pos, size_t copied)
 {
+	loff_t i_size, end = pos + copied;
 	blkcnt_t add;
 	size_t gap;
 
+	if (end <= i_size_read(inode))
+		return;
+
 	if (ctx->ops->update_i_size) {
-		ctx->ops->update_i_size(inode, pos);
+		ctx->ops->update_i_size(inode, end);
 		return;
 	}
 
 	spin_lock(&inode->i_lock);
-	i_size_write(inode, pos);
+
+	i_size = i_size_read(inode);
+	if (end > i_size) {
+		i_size_write(inode, end);
 #if IS_ENABLED(CONFIG_FSCACHE)
-	fscache_update_cookie(ctx->cache, NULL, &pos);
+		fscache_update_cookie(ctx->cache, NULL, &end);
 #endif
 
-	gap = SECTOR_SIZE - (i_size & (SECTOR_SIZE - 1));
-	if (copied > gap) {
-		add = DIV_ROUND_UP(copied - gap, SECTOR_SIZE);
+		gap = SECTOR_SIZE - (i_size & (SECTOR_SIZE - 1));
+		if (copied > gap) {
+			add = DIV_ROUND_UP(copied - gap, SECTOR_SIZE);
 
-		inode->i_blocks = min_t(blkcnt_t,
-					DIV_ROUND_UP(pos, SECTOR_SIZE),
-					inode->i_blocks + add);
+			inode->i_blocks = min_t(blkcnt_t,
+						DIV_ROUND_UP(end, SECTOR_SIZE),
+						inode->i_blocks + add);
+		}
 	}
 	spin_unlock(&inode->i_lock);
 }
@@ -113,7 +121,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
 	struct folio *folio = NULL, *writethrough = NULL;
 	unsigned int bdp_flags = (iocb->ki_flags & IOCB_NOWAIT) ? BDP_ASYNC : 0;
 	ssize_t written = 0, ret, ret2;
-	loff_t i_size, pos = iocb->ki_pos;
+	loff_t pos = iocb->ki_pos;
 	size_t max_chunk = mapping_max_folio_size(mapping);
 	bool maybe_trouble = false;
 
@@ -346,10 +354,8 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
 		flush_dcache_folio(folio);
 
 		/* Update the inode size if we moved the EOF marker */
+		netfs_update_i_size(ctx, inode, pos, copied);
 		pos += copied;
-		i_size = i_size_read(inode);
-		if (pos > i_size)
-			netfs_update_i_size(ctx, inode, i_size, pos, copied);
 		written += copied;
 
 		if (likely(!wreq)) {
diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c
index 3efa5894b2c0..dcf2b096cc4e 100644
--- a/fs/netfs/direct_write.c
+++ b/fs/netfs/direct_write.c
@@ -9,24 +9,6 @@
 #include <linux/uio.h>
 #include "internal.h"
 
-static void netfs_cleanup_dio_write(struct netfs_io_request *wreq)
-{
-	struct inode *inode = wreq->inode;
-	unsigned long long end = wreq->start + wreq->transferred;
-
-	if (wreq->error || end <= i_size_read(inode))
-		return;
-
-	spin_lock(&inode->i_lock);
-	if (end > i_size_read(inode)) {
-		if (wreq->netfs_ops->update_i_size)
-			wreq->netfs_ops->update_i_size(inode, end);
-		else
-			i_size_write(inode, end);
-	}
-	spin_unlock(&inode->i_lock);
-}
-
 /*
  * Perform an unbuffered write where we may have to do an RMW operation on an
  * encrypted file.  This can also be used for direct I/O writes.
@@ -102,7 +84,6 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
 	if (async)
 		wreq->iocb = iocb;
 	wreq->len = iov_iter_count(&wreq->buffer.iter);
-	wreq->cleanup = netfs_cleanup_dio_write;
 	ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), wreq->len);
 	if (ret < 0) {
 		_debug("begin = %zd", ret);
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index d6656d2b54ab..f9bb9464a147 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -27,6 +27,12 @@ void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error);
 int netfs_prefetch_for_write(struct file *file, struct folio *folio,
 			     size_t offset, size_t len);
 
+/*
+ * buffered_write.c
+ */
+void netfs_update_i_size(struct netfs_inode *ctx, struct inode *inode,
+			 loff_t pos, size_t copied);
+
 /*
  * main.c
  */
diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c
index 2ac85a819b71..33a93258f36e 100644
--- a/fs/netfs/write_collect.c
+++ b/fs/netfs/write_collect.c
@@ -393,8 +393,10 @@ bool netfs_write_collection(struct netfs_io_request *wreq)
 		ictx->ops->invalidate_cache(wreq);
 	}
 
-	if (wreq->cleanup)
-		wreq->cleanup(wreq);
+	if ((wreq->origin == NETFS_UNBUFFERED_WRITE ||
+	     wreq->origin == NETFS_DIO_WRITE) &&
+	    !wreq->error)
+		netfs_update_i_size(ictx, &ictx->inode, wreq->start, wreq->transferred);
 
 	if (wreq->origin == NETFS_DIO_WRITE &&
 	    wreq->mapping->nrpages) {
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 065c17385e53..d8186b90fb38 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -279,7 +279,6 @@ struct netfs_io_request {
 #define NETFS_RREQ_USE_PGPRIV2		31	/* [DEPRECATED] Use PG_private_2 to mark
 						 * write to cache on read */
 	const struct netfs_request_ops *netfs_ops;
-	void (*cleanup)(struct netfs_io_request *req);
 };
 
 /*
-- 
cgit v1.2.3


From 4e32541076833f5ce2e23523c9faa25f7b2cc96f Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 1 Jul 2025 17:38:47 +0100
Subject: netfs: Renumber the NETFS_RREQ_* flags to make traces easier to read

Renumber the NETFS_RREQ_* flags to put the most useful status bits in the
bottom nibble - and therefore the last hex digit in the trace output -
making it easier to grasp the state at a glance.

In particular, put the IN_PROGRESS flag in bit 0 and ALL_QUEUED at bit 1.

Also make the flags field in /proc/fs/netfs/requests larger to accommodate
all the flags.

Also make the flags field in the netfs_sreq tracepoint larger to
accommodate all the NETFS_SREQ_* flags.

Signed-off-by: David Howells <dhowells@redhat.com>
Link: https://lore.kernel.org/20250701163852.2171681-13-dhowells@redhat.com
Reviewed-by: Paulo Alcantara <pc@manguebit.org>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/netfs/main.c              |  6 +++---
 include/linux/netfs.h        | 20 ++++++++++----------
 include/trace/events/netfs.h |  2 +-
 3 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/fs/netfs/main.c b/fs/netfs/main.c
index 3db401d269e7..73da6c9f5777 100644
--- a/fs/netfs/main.c
+++ b/fs/netfs/main.c
@@ -58,15 +58,15 @@ static int netfs_requests_seq_show(struct seq_file *m, void *v)
 
 	if (v == &netfs_io_requests) {
 		seq_puts(m,
-			 "REQUEST  OR REF FL ERR  OPS COVERAGE\n"
-			 "======== == === == ==== === =========\n"
+			 "REQUEST  OR REF FLAG ERR  OPS COVERAGE\n"
+			 "======== == === ==== ==== === =========\n"
 			 );
 		return 0;
 	}
 
 	rreq = list_entry(v, struct netfs_io_request, proc_link);
 	seq_printf(m,
-		   "%08x %s %3d %2lx %4ld %3d @%04llx %llx/%llx",
+		   "%08x %s %3d %4lx %4ld %3d @%04llx %llx/%llx",
 		   rreq->debug_id,
 		   netfs_origins[rreq->origin],
 		   refcount_read(&rreq->ref),
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index d8186b90fb38..f43f075852c0 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -265,17 +265,17 @@ struct netfs_io_request {
 	bool			direct_bv_unpin; /* T if direct_bv[] must be unpinned */
 	refcount_t		ref;
 	unsigned long		flags;
-#define NETFS_RREQ_OFFLOAD_COLLECTION	0	/* Offload collection to workqueue */
-#define NETFS_RREQ_NO_UNLOCK_FOLIO	2	/* Don't unlock no_unlock_folio on completion */
-#define NETFS_RREQ_FAILED		4	/* The request failed */
-#define NETFS_RREQ_IN_PROGRESS		5	/* Unlocked when the request completes (has ref) */
-#define NETFS_RREQ_FOLIO_COPY_TO_CACHE	6	/* Copy current folio to cache from read */
-#define NETFS_RREQ_UPLOAD_TO_SERVER	8	/* Need to write to the server */
-#define NETFS_RREQ_PAUSE		11	/* Pause subrequest generation */
+#define NETFS_RREQ_IN_PROGRESS		0	/* Unlocked when the request completes (has ref) */
+#define NETFS_RREQ_ALL_QUEUED		1	/* All subreqs are now queued */
+#define NETFS_RREQ_PAUSE		2	/* Pause subrequest generation */
+#define NETFS_RREQ_FAILED		3	/* The request failed */
+#define NETFS_RREQ_RETRYING		4	/* Set if we're in the retry path */
+#define NETFS_RREQ_SHORT_TRANSFER	5	/* Set if we have a short transfer */
+#define NETFS_RREQ_OFFLOAD_COLLECTION	8	/* Offload collection to workqueue */
+#define NETFS_RREQ_NO_UNLOCK_FOLIO	9	/* Don't unlock no_unlock_folio on completion */
+#define NETFS_RREQ_FOLIO_COPY_TO_CACHE	10	/* Copy current folio to cache from read */
+#define NETFS_RREQ_UPLOAD_TO_SERVER	11	/* Need to write to the server */
 #define NETFS_RREQ_USE_IO_ITER		12	/* Use ->io_iter rather than ->i_pages */
-#define NETFS_RREQ_ALL_QUEUED		13	/* All subreqs are now queued */
-#define NETFS_RREQ_RETRYING		14	/* Set if we're in the retry path */
-#define NETFS_RREQ_SHORT_TRANSFER	15	/* Set if we have a short transfer */
 #define NETFS_RREQ_USE_PGPRIV2		31	/* [DEPRECATED] Use PG_private_2 to mark
 						 * write to cache on read */
 	const struct netfs_request_ops *netfs_ops;
diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h
index ba35dc66e986..c2d581429a7b 100644
--- a/include/trace/events/netfs.h
+++ b/include/trace/events/netfs.h
@@ -367,7 +367,7 @@ TRACE_EVENT(netfs_sreq,
 		    __entry->slot	= sreq->io_iter.folioq_slot;
 			   ),
 
-	    TP_printk("R=%08x[%x] %s %s f=%02x s=%llx %zx/%zx s=%u e=%d",
+	    TP_printk("R=%08x[%x] %s %s f=%03x s=%llx %zx/%zx s=%u e=%d",
 		      __entry->rreq, __entry->index,
 		      __print_symbolic(__entry->source, netfs_sreq_sources),
 		      __print_symbolic(__entry->what, netfs_sreq_traces),
-- 
cgit v1.2.3


From 90b3ccf514578ca3a6ac25db51a29a48e34e0f1b Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 1 Jul 2025 17:38:48 +0100
Subject: netfs: Update tracepoints in a number of ways

Make a number of updates to the netfs tracepoints:

 (1) Remove a duplicate trace from netfs_unbuffered_write_iter_locked().

 (2) Move the trace in netfs_wake_rreq_flag() to after the flag is cleared
     so that the change appears in the trace.

 (3) Differentiate the use of netfs_rreq_trace_wait/woke_queue symbols.

 (4) Don't do so many trace emissions in the wait functions as some of them
     are redundant.

 (5) In netfs_collect_read_results(), differentiate a subreq that's being
     abandoned vs one that has been consumed in a regular way.

 (6) Add a tracepoint to indicate the call to ->ki_complete().

 (7) Don't double-increment the subreq_counter when retrying a write.

 (8) Move the netfs_sreq_trace_io_progress tracepoint within cifs code to
     just MID_RESPONSE_RECEIVED and add different tracepoints for other MID
     states and note check failure.

Signed-off-by: David Howells <dhowells@redhat.com>
Co-developed-by: Paulo Alcantara <pc@manguebit.org>
Signed-off-by: Paulo Alcantara <pc@manguebit.org>
Link: https://lore.kernel.org/20250701163852.2171681-14-dhowells@redhat.com
cc: Steve French <sfrench@samba.org>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
cc: linux-cifs@vger.kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/netfs/direct_write.c      |  1 -
 fs/netfs/internal.h          |  2 +-
 fs/netfs/misc.c              | 14 ++++++--------
 fs/netfs/read_collect.c      | 12 +++++++++---
 fs/netfs/write_collect.c     |  4 +++-
 fs/netfs/write_retry.c       |  1 -
 fs/smb/client/cifssmb.c      | 20 ++++++++++++++++++++
 fs/smb/client/smb2pdu.c      | 26 ++++++++++++++++++++++----
 include/trace/events/netfs.h | 26 ++++++++++++++++++--------
 9 files changed, 79 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c
index dcf2b096cc4e..a16660ab7f83 100644
--- a/fs/netfs/direct_write.c
+++ b/fs/netfs/direct_write.c
@@ -91,7 +91,6 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
 	}
 
 	if (!async) {
-		trace_netfs_rreq(wreq, netfs_rreq_trace_wait_ip);
 		ret = netfs_wait_for_write(wreq);
 		if (ret > 0)
 			iocb->ki_pos += ret;
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index f9bb9464a147..d4f16fefd965 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -273,9 +273,9 @@ static inline void netfs_wake_rreq_flag(struct netfs_io_request *rreq,
 					enum netfs_rreq_trace trace)
 {
 	if (test_bit(rreq_flag, &rreq->flags)) {
-		trace_netfs_rreq(rreq, trace);
 		clear_bit_unlock(rreq_flag, &rreq->flags);
 		smp_mb__after_atomic(); /* Set flag before task state */
+		trace_netfs_rreq(rreq, trace);
 		wake_up(&rreq->waitq);
 	}
 }
diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c
index 127a269938bb..20748bcfbf59 100644
--- a/fs/netfs/misc.c
+++ b/fs/netfs/misc.c
@@ -359,7 +359,7 @@ void netfs_wait_for_in_progress_stream(struct netfs_io_request *rreq,
 		if (!netfs_check_subreq_in_progress(subreq))
 			continue;
 
-		trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue);
+		trace_netfs_rreq(rreq, netfs_rreq_trace_wait_quiesce);
 		for (;;) {
 			prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE);
 
@@ -368,10 +368,10 @@ void netfs_wait_for_in_progress_stream(struct netfs_io_request *rreq,
 
 			trace_netfs_sreq(subreq, netfs_sreq_trace_wait_for);
 			schedule();
-			trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue);
 		}
 	}
 
+	trace_netfs_rreq(rreq, netfs_rreq_trace_waited_quiesce);
 	finish_wait(&rreq->waitq, &myself);
 }
 
@@ -437,7 +437,6 @@ static ssize_t netfs_wait_for_in_progress(struct netfs_io_request *rreq,
 	ssize_t ret;
 
 	for (;;) {
-		trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue);
 		prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE);
 
 		if (!test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags)) {
@@ -457,11 +456,12 @@ static ssize_t netfs_wait_for_in_progress(struct netfs_io_request *rreq,
 		if (!netfs_check_rreq_in_progress(rreq))
 			break;
 
+		trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip);
 		schedule();
-		trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue);
 	}
 
 all_collected:
+	trace_netfs_rreq(rreq, netfs_rreq_trace_waited_ip);
 	finish_wait(&rreq->waitq, &myself);
 
 	ret = rreq->error;
@@ -504,10 +504,8 @@ static void netfs_wait_for_pause(struct netfs_io_request *rreq,
 {
 	DEFINE_WAIT(myself);
 
-	trace_netfs_rreq(rreq, netfs_rreq_trace_wait_pause);
-
 	for (;;) {
-		trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue);
+		trace_netfs_rreq(rreq, netfs_rreq_trace_wait_pause);
 		prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE);
 
 		if (!test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags)) {
@@ -530,10 +528,10 @@ static void netfs_wait_for_pause(struct netfs_io_request *rreq,
 			break;
 
 		schedule();
-		trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue);
 	}
 
 all_collected:
+	trace_netfs_rreq(rreq, netfs_rreq_trace_waited_pause);
 	finish_wait(&rreq->waitq, &myself);
 }
 
diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c
index cceed9d629c6..3e804da1e1eb 100644
--- a/fs/netfs/read_collect.c
+++ b/fs/netfs/read_collect.c
@@ -293,7 +293,9 @@ reassess:
 		spin_lock(&rreq->lock);
 
 		remove = front;
-		trace_netfs_sreq(front, netfs_sreq_trace_discard);
+		trace_netfs_sreq(front,
+				 notes & ABANDON_SREQ ?
+				 netfs_sreq_trace_abandoned : netfs_sreq_trace_consumed);
 		list_del_init(&front->rreq_link);
 		front = list_first_entry_or_null(&stream->subrequests,
 						 struct netfs_io_subrequest, rreq_link);
@@ -353,9 +355,11 @@ static void netfs_rreq_assess_dio(struct netfs_io_request *rreq)
 
 	if (rreq->iocb) {
 		rreq->iocb->ki_pos += rreq->transferred;
-		if (rreq->iocb->ki_complete)
+		if (rreq->iocb->ki_complete) {
+			trace_netfs_rreq(rreq, netfs_rreq_trace_ki_complete);
 			rreq->iocb->ki_complete(
 				rreq->iocb, rreq->error ? rreq->error : rreq->transferred);
+		}
 	}
 	if (rreq->netfs_ops->done)
 		rreq->netfs_ops->done(rreq);
@@ -379,9 +383,11 @@ static void netfs_rreq_assess_single(struct netfs_io_request *rreq)
 
 	if (rreq->iocb) {
 		rreq->iocb->ki_pos += rreq->transferred;
-		if (rreq->iocb->ki_complete)
+		if (rreq->iocb->ki_complete) {
+			trace_netfs_rreq(rreq, netfs_rreq_trace_ki_complete);
 			rreq->iocb->ki_complete(
 				rreq->iocb, rreq->error ? rreq->error : rreq->transferred);
+		}
 	}
 	if (rreq->netfs_ops->done)
 		rreq->netfs_ops->done(rreq);
diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c
index 33a93258f36e..0f3a36852a4d 100644
--- a/fs/netfs/write_collect.c
+++ b/fs/netfs/write_collect.c
@@ -421,9 +421,11 @@ bool netfs_write_collection(struct netfs_io_request *wreq)
 	if (wreq->iocb) {
 		size_t written = min(wreq->transferred, wreq->len);
 		wreq->iocb->ki_pos += written;
-		if (wreq->iocb->ki_complete)
+		if (wreq->iocb->ki_complete) {
+			trace_netfs_rreq(wreq, netfs_rreq_trace_ki_complete);
 			wreq->iocb->ki_complete(
 				wreq->iocb, wreq->error ? wreq->error : written);
+		}
 		wreq->iocb = VFS_PTR_POISON;
 	}
 
diff --git a/fs/netfs/write_retry.c b/fs/netfs/write_retry.c
index 7158657061e9..fc9c3e0d34d8 100644
--- a/fs/netfs/write_retry.c
+++ b/fs/netfs/write_retry.c
@@ -146,7 +146,6 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,
 			subreq = netfs_alloc_subrequest(wreq);
 			subreq->source		= to->source;
 			subreq->start		= start;
-			subreq->debug_index	= atomic_inc_return(&wreq->subreq_counter);
 			subreq->stream_nr	= to->stream_nr;
 			subreq->retry_count	= 1;
 
diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c
index 0e509a0433fb..75142f49d65d 100644
--- a/fs/smb/client/cifssmb.c
+++ b/fs/smb/client/cifssmb.c
@@ -1334,7 +1334,11 @@ cifs_readv_callback(struct mid_q_entry *mid)
 		cifs_stats_bytes_read(tcon, rdata->got_bytes);
 		break;
 	case MID_REQUEST_SUBMITTED:
+		trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_req_submitted);
+		goto do_retry;
 	case MID_RETRY_NEEDED:
+		trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_retry_needed);
+do_retry:
 		__set_bit(NETFS_SREQ_NEED_RETRY, &rdata->subreq.flags);
 		rdata->result = -EAGAIN;
 		if (server->sign && rdata->got_bytes)
@@ -1344,8 +1348,14 @@ cifs_readv_callback(struct mid_q_entry *mid)
 		task_io_account_read(rdata->got_bytes);
 		cifs_stats_bytes_read(tcon, rdata->got_bytes);
 		break;
+	case MID_RESPONSE_MALFORMED:
+		trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_malformed);
+		rdata->result = -EIO;
+		break;
 	default:
+		trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_unknown);
 		rdata->result = -EIO;
+		break;
 	}
 
 	if (rdata->result == -ENODATA) {
@@ -1714,11 +1724,21 @@ cifs_writev_callback(struct mid_q_entry *mid)
 		}
 		break;
 	case MID_REQUEST_SUBMITTED:
+		trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_req_submitted);
+		__set_bit(NETFS_SREQ_NEED_RETRY, &wdata->subreq.flags);
+		result = -EAGAIN;
+		break;
 	case MID_RETRY_NEEDED:
+		trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_retry_needed);
 		__set_bit(NETFS_SREQ_NEED_RETRY, &wdata->subreq.flags);
 		result = -EAGAIN;
 		break;
+	case MID_RESPONSE_MALFORMED:
+		trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_malformed);
+		result = -EIO;
+		break;
 	default:
+		trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_unknown);
 		result = -EIO;
 		break;
 	}
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 084ee66e73fd..7f6186c2e60d 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -4567,7 +4567,11 @@ smb2_readv_callback(struct mid_q_entry *mid)
 		cifs_stats_bytes_read(tcon, rdata->got_bytes);
 		break;
 	case MID_REQUEST_SUBMITTED:
+		trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_req_submitted);
+		goto do_retry;
 	case MID_RETRY_NEEDED:
+		trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_retry_needed);
+do_retry:
 		__set_bit(NETFS_SREQ_NEED_RETRY, &rdata->subreq.flags);
 		rdata->result = -EAGAIN;
 		if (server->sign && rdata->got_bytes)
@@ -4578,11 +4582,15 @@ smb2_readv_callback(struct mid_q_entry *mid)
 		cifs_stats_bytes_read(tcon, rdata->got_bytes);
 		break;
 	case MID_RESPONSE_MALFORMED:
+		trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_malformed);
 		credits.value = le16_to_cpu(shdr->CreditRequest);
 		credits.instance = server->reconnect_instance;
-		fallthrough;
+		rdata->result = -EIO;
+		break;
 	default:
+		trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_unknown);
 		rdata->result = -EIO;
+		break;
 	}
 #ifdef CONFIG_CIFS_SMB_DIRECT
 	/*
@@ -4835,11 +4843,14 @@ smb2_writev_callback(struct mid_q_entry *mid)
 
 	switch (mid->mid_state) {
 	case MID_RESPONSE_RECEIVED:
+		trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_progress);
 		credits.value = le16_to_cpu(rsp->hdr.CreditRequest);
 		credits.instance = server->reconnect_instance;
 		result = smb2_check_receive(mid, server, 0);
-		if (result != 0)
+		if (result != 0) {
+			trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_bad);
 			break;
+		}
 
 		written = le32_to_cpu(rsp->DataLength);
 		/*
@@ -4861,15 +4872,23 @@ smb2_writev_callback(struct mid_q_entry *mid)
 		}
 		break;
 	case MID_REQUEST_SUBMITTED:
+		trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_req_submitted);
+		__set_bit(NETFS_SREQ_NEED_RETRY, &wdata->subreq.flags);
+		result = -EAGAIN;
+		break;
 	case MID_RETRY_NEEDED:
+		trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_retry_needed);
 		__set_bit(NETFS_SREQ_NEED_RETRY, &wdata->subreq.flags);
 		result = -EAGAIN;
 		break;
 	case MID_RESPONSE_MALFORMED:
+		trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_malformed);
 		credits.value = le16_to_cpu(rsp->hdr.CreditRequest);
 		credits.instance = server->reconnect_instance;
-		fallthrough;
+		result = -EIO;
+		break;
 	default:
+		trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_unknown);
 		result = -EIO;
 		break;
 	}
@@ -4909,7 +4928,6 @@ smb2_writev_callback(struct mid_q_entry *mid)
 			      server->credits, server->in_flight,
 			      0, cifs_trace_rw_credits_write_response_clear);
 	wdata->credits.value = 0;
-	trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_progress);
 	cifs_write_subrequest_terminated(wdata, result ?: written);
 	release_mid(mid);
 	trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, 0,
diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h
index c2d581429a7b..73e96ccbe830 100644
--- a/include/trace/events/netfs.h
+++ b/include/trace/events/netfs.h
@@ -50,12 +50,13 @@
 
 #define netfs_rreq_traces					\
 	EM(netfs_rreq_trace_assess,		"ASSESS ")	\
-	EM(netfs_rreq_trace_copy,		"COPY   ")	\
 	EM(netfs_rreq_trace_collect,		"COLLECT")	\
 	EM(netfs_rreq_trace_complete,		"COMPLET")	\
+	EM(netfs_rreq_trace_copy,		"COPY   ")	\
 	EM(netfs_rreq_trace_dirty,		"DIRTY  ")	\
 	EM(netfs_rreq_trace_done,		"DONE   ")	\
 	EM(netfs_rreq_trace_free,		"FREE   ")	\
+	EM(netfs_rreq_trace_ki_complete,	"KI-CMPL")	\
 	EM(netfs_rreq_trace_recollect,		"RECLLCT")	\
 	EM(netfs_rreq_trace_redirty,		"REDIRTY")	\
 	EM(netfs_rreq_trace_resubmit,		"RESUBMT")	\
@@ -64,13 +65,15 @@
 	EM(netfs_rreq_trace_unlock,		"UNLOCK ")	\
 	EM(netfs_rreq_trace_unlock_pgpriv2,	"UNLCK-2")	\
 	EM(netfs_rreq_trace_unmark,		"UNMARK ")	\
+	EM(netfs_rreq_trace_unpause,		"UNPAUSE")	\
 	EM(netfs_rreq_trace_wait_ip,		"WAIT-IP")	\
-	EM(netfs_rreq_trace_wait_pause,		"WT-PAUS")	\
-	EM(netfs_rreq_trace_wait_queue,		"WAIT-Q ")	\
+	EM(netfs_rreq_trace_wait_pause,		"--PAUSED--")	\
+	EM(netfs_rreq_trace_wait_quiesce,	"WAIT-QUIESCE")	\
+	EM(netfs_rreq_trace_waited_ip,		"DONE-IP")	\
+	EM(netfs_rreq_trace_waited_pause,	"--UNPAUSED--")	\
+	EM(netfs_rreq_trace_waited_quiesce,	"DONE-QUIESCE")	\
 	EM(netfs_rreq_trace_wake_ip,		"WAKE-IP")	\
 	EM(netfs_rreq_trace_wake_queue,		"WAKE-Q ")	\
-	EM(netfs_rreq_trace_woke_queue,		"WOKE-Q ")	\
-	EM(netfs_rreq_trace_unpause,		"UNPAUSE")	\
 	E_(netfs_rreq_trace_write_done,		"WR-DONE")
 
 #define netfs_sreq_sources					\
@@ -83,6 +86,7 @@
 	E_(NETFS_WRITE_TO_CACHE,		"WRIT")
 
 #define netfs_sreq_traces					\
+	EM(netfs_sreq_trace_abandoned,		"ABNDN")	\
 	EM(netfs_sreq_trace_add_donations,	"+DON ")	\
 	EM(netfs_sreq_trace_added,		"ADD  ")	\
 	EM(netfs_sreq_trace_cache_nowrite,	"CA-NW")	\
@@ -90,6 +94,7 @@
 	EM(netfs_sreq_trace_cache_write,	"CA-WR")	\
 	EM(netfs_sreq_trace_cancel,		"CANCL")	\
 	EM(netfs_sreq_trace_clear,		"CLEAR")	\
+	EM(netfs_sreq_trace_consumed,		"CONSM")	\
 	EM(netfs_sreq_trace_discard,		"DSCRD")	\
 	EM(netfs_sreq_trace_donate_to_prev,	"DON-P")	\
 	EM(netfs_sreq_trace_donate_to_next,	"DON-N")	\
@@ -97,7 +102,12 @@
 	EM(netfs_sreq_trace_fail,		"FAIL ")	\
 	EM(netfs_sreq_trace_free,		"FREE ")	\
 	EM(netfs_sreq_trace_hit_eof,		"EOF  ")	\
-	EM(netfs_sreq_trace_io_progress,	"IO   ")	\
+	EM(netfs_sreq_trace_io_bad,		"I-BAD")	\
+	EM(netfs_sreq_trace_io_malformed,	"I-MLF")	\
+	EM(netfs_sreq_trace_io_unknown,		"I-UNK")	\
+	EM(netfs_sreq_trace_io_progress,	"I-OK ")	\
+	EM(netfs_sreq_trace_io_req_submitted,	"I-RSB")	\
+	EM(netfs_sreq_trace_io_retry_needed,	"I-RTR")	\
 	EM(netfs_sreq_trace_limited,		"LIMIT")	\
 	EM(netfs_sreq_trace_need_clear,		"N-CLR")	\
 	EM(netfs_sreq_trace_partial_read,	"PARTR")	\
@@ -143,8 +153,8 @@
 
 #define netfs_sreq_ref_traces					\
 	EM(netfs_sreq_trace_get_copy_to_cache,	"GET COPY2C ")	\
-	EM(netfs_sreq_trace_get_resubmit,	"GET RESUBMIT")	\
-	EM(netfs_sreq_trace_get_submit,		"GET SUBMIT")	\
+	EM(netfs_sreq_trace_get_resubmit,	"GET RESUBMT")	\
+	EM(netfs_sreq_trace_get_submit,		"GET SUBMIT ")	\
 	EM(netfs_sreq_trace_get_short_read,	"GET SHORTRD")	\
 	EM(netfs_sreq_trace_new,		"NEW        ")	\
 	EM(netfs_sreq_trace_put_abandon,	"PUT ABANDON")	\
-- 
cgit v1.2.3


From 2f952c9e8fe13c6ee15c05189f1f87c1a70b866c Mon Sep 17 00:00:00 2001
From: Andrey Albershteyn <aalbersh@kernel.org>
Date: Mon, 30 Jun 2025 18:20:11 +0200
Subject: fs: split fileattr related helpers into separate file

This patch moves the functions related to file extended attribute
manipulation to a separate file. Refactoring only.

Signed-off-by: Andrey Albershteyn <aalbersh@kernel.org>
Link: https://lore.kernel.org/20250630-xattrat-syscall-v6-1-c4e3bc35227b@kernel.org
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/Makefile              |   3 +-
 fs/file_attr.c           | 319 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/ioctl.c               | 309 ---------------------------------------------
 include/linux/fileattr.h |   4 +
 4 files changed, 325 insertions(+), 310 deletions(-)
 create mode 100644 fs/file_attr.c

(limited to 'include')

diff --git a/fs/Makefile b/fs/Makefile
index 79c08b914c47..334654f9584b 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -15,7 +15,8 @@ obj-y :=	open.o read_write.o file_table.o super.o \
 		pnode.o splice.o sync.o utimes.o d_path.o \
 		stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
 		fs_types.o fs_context.o fs_parser.o fsopen.o init.o \
-		kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o
+		kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o \
+		file_attr.o
 
 obj-$(CONFIG_BUFFER_HEAD)	+= buffer.o mpage.o
 obj-$(CONFIG_PROC_FS)		+= proc_namespace.o
diff --git a/fs/file_attr.c b/fs/file_attr.c
new file mode 100644
index 000000000000..a4dd1d9646c8
--- /dev/null
+++ b/fs/file_attr.c
@@ -0,0 +1,319 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/fs.h>
+#include <linux/security.h>
+#include <linux/fscrypt.h>
+#include <linux/fileattr.h>
+#include <linux/export.h>
+
+/**
+ * fileattr_fill_xflags - initialize fileattr with xflags
+ * @fa:		fileattr pointer
+ * @xflags:	FS_XFLAG_* flags
+ *
+ * Set ->fsx_xflags, ->fsx_valid and ->flags (translated xflags).  All
+ * other fields are zeroed.
+ */
+void fileattr_fill_xflags(struct fileattr *fa, u32 xflags)
+{
+	memset(fa, 0, sizeof(*fa));
+	fa->fsx_valid = true;
+	fa->fsx_xflags = xflags;
+	if (fa->fsx_xflags & FS_XFLAG_IMMUTABLE)
+		fa->flags |= FS_IMMUTABLE_FL;
+	if (fa->fsx_xflags & FS_XFLAG_APPEND)
+		fa->flags |= FS_APPEND_FL;
+	if (fa->fsx_xflags & FS_XFLAG_SYNC)
+		fa->flags |= FS_SYNC_FL;
+	if (fa->fsx_xflags & FS_XFLAG_NOATIME)
+		fa->flags |= FS_NOATIME_FL;
+	if (fa->fsx_xflags & FS_XFLAG_NODUMP)
+		fa->flags |= FS_NODUMP_FL;
+	if (fa->fsx_xflags & FS_XFLAG_DAX)
+		fa->flags |= FS_DAX_FL;
+	if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
+		fa->flags |= FS_PROJINHERIT_FL;
+}
+EXPORT_SYMBOL(fileattr_fill_xflags);
+
+/**
+ * fileattr_fill_flags - initialize fileattr with flags
+ * @fa:		fileattr pointer
+ * @flags:	FS_*_FL flags
+ *
+ * Set ->flags, ->flags_valid and ->fsx_xflags (translated flags).
+ * All other fields are zeroed.
+ */
+void fileattr_fill_flags(struct fileattr *fa, u32 flags)
+{
+	memset(fa, 0, sizeof(*fa));
+	fa->flags_valid = true;
+	fa->flags = flags;
+	if (fa->flags & FS_SYNC_FL)
+		fa->fsx_xflags |= FS_XFLAG_SYNC;
+	if (fa->flags & FS_IMMUTABLE_FL)
+		fa->fsx_xflags |= FS_XFLAG_IMMUTABLE;
+	if (fa->flags & FS_APPEND_FL)
+		fa->fsx_xflags |= FS_XFLAG_APPEND;
+	if (fa->flags & FS_NODUMP_FL)
+		fa->fsx_xflags |= FS_XFLAG_NODUMP;
+	if (fa->flags & FS_NOATIME_FL)
+		fa->fsx_xflags |= FS_XFLAG_NOATIME;
+	if (fa->flags & FS_DAX_FL)
+		fa->fsx_xflags |= FS_XFLAG_DAX;
+	if (fa->flags & FS_PROJINHERIT_FL)
+		fa->fsx_xflags |= FS_XFLAG_PROJINHERIT;
+}
+EXPORT_SYMBOL(fileattr_fill_flags);
+
+/**
+ * vfs_fileattr_get - retrieve miscellaneous file attributes
+ * @dentry:	the object to retrieve from
+ * @fa:		fileattr pointer
+ *
+ * Call i_op->fileattr_get() callback, if exists.
+ *
+ * Return: 0 on success, or a negative error on failure.
+ */
+int vfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+{
+	struct inode *inode = d_inode(dentry);
+
+	if (!inode->i_op->fileattr_get)
+		return -ENOIOCTLCMD;
+
+	return inode->i_op->fileattr_get(dentry, fa);
+}
+EXPORT_SYMBOL(vfs_fileattr_get);
+
+/**
+ * copy_fsxattr_to_user - copy fsxattr to userspace.
+ * @fa:		fileattr pointer
+ * @ufa:	fsxattr user pointer
+ *
+ * Return: 0 on success, or -EFAULT on failure.
+ */
+int copy_fsxattr_to_user(const struct fileattr *fa, struct fsxattr __user *ufa)
+{
+	struct fsxattr xfa;
+
+	memset(&xfa, 0, sizeof(xfa));
+	xfa.fsx_xflags = fa->fsx_xflags;
+	xfa.fsx_extsize = fa->fsx_extsize;
+	xfa.fsx_nextents = fa->fsx_nextents;
+	xfa.fsx_projid = fa->fsx_projid;
+	xfa.fsx_cowextsize = fa->fsx_cowextsize;
+
+	if (copy_to_user(ufa, &xfa, sizeof(xfa)))
+		return -EFAULT;
+
+	return 0;
+}
+EXPORT_SYMBOL(copy_fsxattr_to_user);
+
+static int copy_fsxattr_from_user(struct fileattr *fa,
+				  struct fsxattr __user *ufa)
+{
+	struct fsxattr xfa;
+
+	if (copy_from_user(&xfa, ufa, sizeof(xfa)))
+		return -EFAULT;
+
+	fileattr_fill_xflags(fa, xfa.fsx_xflags);
+	fa->fsx_extsize = xfa.fsx_extsize;
+	fa->fsx_nextents = xfa.fsx_nextents;
+	fa->fsx_projid = xfa.fsx_projid;
+	fa->fsx_cowextsize = xfa.fsx_cowextsize;
+
+	return 0;
+}
+
+/*
+ * Generic function to check FS_IOC_FSSETXATTR/FS_IOC_SETFLAGS values and reject
+ * any invalid configurations.
+ *
+ * Note: must be called with inode lock held.
+ */
+static int fileattr_set_prepare(struct inode *inode,
+			      const struct fileattr *old_ma,
+			      struct fileattr *fa)
+{
+	int err;
+
+	/*
+	 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
+	 * the relevant capability.
+	 */
+	if ((fa->flags ^ old_ma->flags) & (FS_APPEND_FL | FS_IMMUTABLE_FL) &&
+	    !capable(CAP_LINUX_IMMUTABLE))
+		return -EPERM;
+
+	err = fscrypt_prepare_setflags(inode, old_ma->flags, fa->flags);
+	if (err)
+		return err;
+
+	/*
+	 * Project Quota ID state is only allowed to change from within the init
+	 * namespace. Enforce that restriction only if we are trying to change
+	 * the quota ID state. Everything else is allowed in user namespaces.
+	 */
+	if (current_user_ns() != &init_user_ns) {
+		if (old_ma->fsx_projid != fa->fsx_projid)
+			return -EINVAL;
+		if ((old_ma->fsx_xflags ^ fa->fsx_xflags) &
+				FS_XFLAG_PROJINHERIT)
+			return -EINVAL;
+	} else {
+		/*
+		 * Caller is allowed to change the project ID. If it is being
+		 * changed, make sure that the new value is valid.
+		 */
+		if (old_ma->fsx_projid != fa->fsx_projid &&
+		    !projid_valid(make_kprojid(&init_user_ns, fa->fsx_projid)))
+			return -EINVAL;
+	}
+
+	/* Check extent size hints. */
+	if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(inode->i_mode))
+		return -EINVAL;
+
+	if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) &&
+			!S_ISDIR(inode->i_mode))
+		return -EINVAL;
+
+	if ((fa->fsx_xflags & FS_XFLAG_COWEXTSIZE) &&
+	    !S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
+		return -EINVAL;
+
+	/*
+	 * It is only valid to set the DAX flag on regular files and
+	 * directories on filesystems.
+	 */
+	if ((fa->fsx_xflags & FS_XFLAG_DAX) &&
+	    !(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
+		return -EINVAL;
+
+	/* Extent size hints of zero turn off the flags. */
+	if (fa->fsx_extsize == 0)
+		fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT);
+	if (fa->fsx_cowextsize == 0)
+		fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;
+
+	return 0;
+}
+
+/**
+ * vfs_fileattr_set - change miscellaneous file attributes
+ * @idmap:	idmap of the mount
+ * @dentry:	the object to change
+ * @fa:		fileattr pointer
+ *
+ * After verifying permissions, call i_op->fileattr_set() callback, if
+ * exists.
+ *
+ * Verifying attributes involves retrieving current attributes with
+ * i_op->fileattr_get(), this also allows initializing attributes that have
+ * not been set by the caller to current values.  Inode lock is held
+ * thoughout to prevent racing with another instance.
+ *
+ * Return: 0 on success, or a negative error on failure.
+ */
+int vfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry,
+		     struct fileattr *fa)
+{
+	struct inode *inode = d_inode(dentry);
+	struct fileattr old_ma = {};
+	int err;
+
+	if (!inode->i_op->fileattr_set)
+		return -ENOIOCTLCMD;
+
+	if (!inode_owner_or_capable(idmap, inode))
+		return -EPERM;
+
+	inode_lock(inode);
+	err = vfs_fileattr_get(dentry, &old_ma);
+	if (!err) {
+		/* initialize missing bits from old_ma */
+		if (fa->flags_valid) {
+			fa->fsx_xflags |= old_ma.fsx_xflags & ~FS_XFLAG_COMMON;
+			fa->fsx_extsize = old_ma.fsx_extsize;
+			fa->fsx_nextents = old_ma.fsx_nextents;
+			fa->fsx_projid = old_ma.fsx_projid;
+			fa->fsx_cowextsize = old_ma.fsx_cowextsize;
+		} else {
+			fa->flags |= old_ma.flags & ~FS_COMMON_FL;
+		}
+		err = fileattr_set_prepare(inode, &old_ma, fa);
+		if (!err)
+			err = inode->i_op->fileattr_set(idmap, dentry, fa);
+	}
+	inode_unlock(inode);
+
+	return err;
+}
+EXPORT_SYMBOL(vfs_fileattr_set);
+
+int ioctl_getflags(struct file *file, unsigned int __user *argp)
+{
+	struct fileattr fa = { .flags_valid = true }; /* hint only */
+	int err;
+
+	err = vfs_fileattr_get(file->f_path.dentry, &fa);
+	if (!err)
+		err = put_user(fa.flags, argp);
+	return err;
+}
+EXPORT_SYMBOL(ioctl_getflags);
+
+int ioctl_setflags(struct file *file, unsigned int __user *argp)
+{
+	struct mnt_idmap *idmap = file_mnt_idmap(file);
+	struct dentry *dentry = file->f_path.dentry;
+	struct fileattr fa;
+	unsigned int flags;
+	int err;
+
+	err = get_user(flags, argp);
+	if (!err) {
+		err = mnt_want_write_file(file);
+		if (!err) {
+			fileattr_fill_flags(&fa, flags);
+			err = vfs_fileattr_set(idmap, dentry, &fa);
+			mnt_drop_write_file(file);
+		}
+	}
+	return err;
+}
+EXPORT_SYMBOL(ioctl_setflags);
+
+int ioctl_fsgetxattr(struct file *file, void __user *argp)
+{
+	struct fileattr fa = { .fsx_valid = true }; /* hint only */
+	int err;
+
+	err = vfs_fileattr_get(file->f_path.dentry, &fa);
+	if (!err)
+		err = copy_fsxattr_to_user(&fa, argp);
+
+	return err;
+}
+EXPORT_SYMBOL(ioctl_fsgetxattr);
+
+int ioctl_fssetxattr(struct file *file, void __user *argp)
+{
+	struct mnt_idmap *idmap = file_mnt_idmap(file);
+	struct dentry *dentry = file->f_path.dentry;
+	struct fileattr fa;
+	int err;
+
+	err = copy_fsxattr_from_user(&fa, argp);
+	if (!err) {
+		err = mnt_want_write_file(file);
+		if (!err) {
+			err = vfs_fileattr_set(idmap, dentry, &fa);
+			mnt_drop_write_file(file);
+		}
+	}
+	return err;
+}
+EXPORT_SYMBOL(ioctl_fssetxattr);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 69107a245b4c..0248cb8db2d3 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -453,315 +453,6 @@ out:
 	return ret;
 }
 
-/**
- * fileattr_fill_xflags - initialize fileattr with xflags
- * @fa:		fileattr pointer
- * @xflags:	FS_XFLAG_* flags
- *
- * Set ->fsx_xflags, ->fsx_valid and ->flags (translated xflags).  All
- * other fields are zeroed.
- */
-void fileattr_fill_xflags(struct fileattr *fa, u32 xflags)
-{
-	memset(fa, 0, sizeof(*fa));
-	fa->fsx_valid = true;
-	fa->fsx_xflags = xflags;
-	if (fa->fsx_xflags & FS_XFLAG_IMMUTABLE)
-		fa->flags |= FS_IMMUTABLE_FL;
-	if (fa->fsx_xflags & FS_XFLAG_APPEND)
-		fa->flags |= FS_APPEND_FL;
-	if (fa->fsx_xflags & FS_XFLAG_SYNC)
-		fa->flags |= FS_SYNC_FL;
-	if (fa->fsx_xflags & FS_XFLAG_NOATIME)
-		fa->flags |= FS_NOATIME_FL;
-	if (fa->fsx_xflags & FS_XFLAG_NODUMP)
-		fa->flags |= FS_NODUMP_FL;
-	if (fa->fsx_xflags & FS_XFLAG_DAX)
-		fa->flags |= FS_DAX_FL;
-	if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
-		fa->flags |= FS_PROJINHERIT_FL;
-}
-EXPORT_SYMBOL(fileattr_fill_xflags);
-
-/**
- * fileattr_fill_flags - initialize fileattr with flags
- * @fa:		fileattr pointer
- * @flags:	FS_*_FL flags
- *
- * Set ->flags, ->flags_valid and ->fsx_xflags (translated flags).
- * All other fields are zeroed.
- */
-void fileattr_fill_flags(struct fileattr *fa, u32 flags)
-{
-	memset(fa, 0, sizeof(*fa));
-	fa->flags_valid = true;
-	fa->flags = flags;
-	if (fa->flags & FS_SYNC_FL)
-		fa->fsx_xflags |= FS_XFLAG_SYNC;
-	if (fa->flags & FS_IMMUTABLE_FL)
-		fa->fsx_xflags |= FS_XFLAG_IMMUTABLE;
-	if (fa->flags & FS_APPEND_FL)
-		fa->fsx_xflags |= FS_XFLAG_APPEND;
-	if (fa->flags & FS_NODUMP_FL)
-		fa->fsx_xflags |= FS_XFLAG_NODUMP;
-	if (fa->flags & FS_NOATIME_FL)
-		fa->fsx_xflags |= FS_XFLAG_NOATIME;
-	if (fa->flags & FS_DAX_FL)
-		fa->fsx_xflags |= FS_XFLAG_DAX;
-	if (fa->flags & FS_PROJINHERIT_FL)
-		fa->fsx_xflags |= FS_XFLAG_PROJINHERIT;
-}
-EXPORT_SYMBOL(fileattr_fill_flags);
-
-/**
- * vfs_fileattr_get - retrieve miscellaneous file attributes
- * @dentry:	the object to retrieve from
- * @fa:		fileattr pointer
- *
- * Call i_op->fileattr_get() callback, if exists.
- *
- * Return: 0 on success, or a negative error on failure.
- */
-int vfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
-{
-	struct inode *inode = d_inode(dentry);
-
-	if (!inode->i_op->fileattr_get)
-		return -ENOIOCTLCMD;
-
-	return inode->i_op->fileattr_get(dentry, fa);
-}
-EXPORT_SYMBOL(vfs_fileattr_get);
-
-/**
- * copy_fsxattr_to_user - copy fsxattr to userspace.
- * @fa:		fileattr pointer
- * @ufa:	fsxattr user pointer
- *
- * Return: 0 on success, or -EFAULT on failure.
- */
-int copy_fsxattr_to_user(const struct fileattr *fa, struct fsxattr __user *ufa)
-{
-	struct fsxattr xfa;
-
-	memset(&xfa, 0, sizeof(xfa));
-	xfa.fsx_xflags = fa->fsx_xflags;
-	xfa.fsx_extsize = fa->fsx_extsize;
-	xfa.fsx_nextents = fa->fsx_nextents;
-	xfa.fsx_projid = fa->fsx_projid;
-	xfa.fsx_cowextsize = fa->fsx_cowextsize;
-
-	if (copy_to_user(ufa, &xfa, sizeof(xfa)))
-		return -EFAULT;
-
-	return 0;
-}
-EXPORT_SYMBOL(copy_fsxattr_to_user);
-
-static int copy_fsxattr_from_user(struct fileattr *fa,
-				  struct fsxattr __user *ufa)
-{
-	struct fsxattr xfa;
-
-	if (copy_from_user(&xfa, ufa, sizeof(xfa)))
-		return -EFAULT;
-
-	fileattr_fill_xflags(fa, xfa.fsx_xflags);
-	fa->fsx_extsize = xfa.fsx_extsize;
-	fa->fsx_nextents = xfa.fsx_nextents;
-	fa->fsx_projid = xfa.fsx_projid;
-	fa->fsx_cowextsize = xfa.fsx_cowextsize;
-
-	return 0;
-}
-
-/*
- * Generic function to check FS_IOC_FSSETXATTR/FS_IOC_SETFLAGS values and reject
- * any invalid configurations.
- *
- * Note: must be called with inode lock held.
- */
-static int fileattr_set_prepare(struct inode *inode,
-			      const struct fileattr *old_ma,
-			      struct fileattr *fa)
-{
-	int err;
-
-	/*
-	 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
-	 * the relevant capability.
-	 */
-	if ((fa->flags ^ old_ma->flags) & (FS_APPEND_FL | FS_IMMUTABLE_FL) &&
-	    !capable(CAP_LINUX_IMMUTABLE))
-		return -EPERM;
-
-	err = fscrypt_prepare_setflags(inode, old_ma->flags, fa->flags);
-	if (err)
-		return err;
-
-	/*
-	 * Project Quota ID state is only allowed to change from within the init
-	 * namespace. Enforce that restriction only if we are trying to change
-	 * the quota ID state. Everything else is allowed in user namespaces.
-	 */
-	if (current_user_ns() != &init_user_ns) {
-		if (old_ma->fsx_projid != fa->fsx_projid)
-			return -EINVAL;
-		if ((old_ma->fsx_xflags ^ fa->fsx_xflags) &
-				FS_XFLAG_PROJINHERIT)
-			return -EINVAL;
-	} else {
-		/*
-		 * Caller is allowed to change the project ID. If it is being
-		 * changed, make sure that the new value is valid.
-		 */
-		if (old_ma->fsx_projid != fa->fsx_projid &&
-		    !projid_valid(make_kprojid(&init_user_ns, fa->fsx_projid)))
-			return -EINVAL;
-	}
-
-	/* Check extent size hints. */
-	if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(inode->i_mode))
-		return -EINVAL;
-
-	if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) &&
-			!S_ISDIR(inode->i_mode))
-		return -EINVAL;
-
-	if ((fa->fsx_xflags & FS_XFLAG_COWEXTSIZE) &&
-	    !S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
-		return -EINVAL;
-
-	/*
-	 * It is only valid to set the DAX flag on regular files and
-	 * directories on filesystems.
-	 */
-	if ((fa->fsx_xflags & FS_XFLAG_DAX) &&
-	    !(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
-		return -EINVAL;
-
-	/* Extent size hints of zero turn off the flags. */
-	if (fa->fsx_extsize == 0)
-		fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT);
-	if (fa->fsx_cowextsize == 0)
-		fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;
-
-	return 0;
-}
-
-/**
- * vfs_fileattr_set - change miscellaneous file attributes
- * @idmap:	idmap of the mount
- * @dentry:	the object to change
- * @fa:		fileattr pointer
- *
- * After verifying permissions, call i_op->fileattr_set() callback, if
- * exists.
- *
- * Verifying attributes involves retrieving current attributes with
- * i_op->fileattr_get(), this also allows initializing attributes that have
- * not been set by the caller to current values.  Inode lock is held
- * thoughout to prevent racing with another instance.
- *
- * Return: 0 on success, or a negative error on failure.
- */
-int vfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry,
-		     struct fileattr *fa)
-{
-	struct inode *inode = d_inode(dentry);
-	struct fileattr old_ma = {};
-	int err;
-
-	if (!inode->i_op->fileattr_set)
-		return -ENOIOCTLCMD;
-
-	if (!inode_owner_or_capable(idmap, inode))
-		return -EPERM;
-
-	inode_lock(inode);
-	err = vfs_fileattr_get(dentry, &old_ma);
-	if (!err) {
-		/* initialize missing bits from old_ma */
-		if (fa->flags_valid) {
-			fa->fsx_xflags |= old_ma.fsx_xflags & ~FS_XFLAG_COMMON;
-			fa->fsx_extsize = old_ma.fsx_extsize;
-			fa->fsx_nextents = old_ma.fsx_nextents;
-			fa->fsx_projid = old_ma.fsx_projid;
-			fa->fsx_cowextsize = old_ma.fsx_cowextsize;
-		} else {
-			fa->flags |= old_ma.flags & ~FS_COMMON_FL;
-		}
-		err = fileattr_set_prepare(inode, &old_ma, fa);
-		if (!err)
-			err = inode->i_op->fileattr_set(idmap, dentry, fa);
-	}
-	inode_unlock(inode);
-
-	return err;
-}
-EXPORT_SYMBOL(vfs_fileattr_set);
-
-static int ioctl_getflags(struct file *file, unsigned int __user *argp)
-{
-	struct fileattr fa = { .flags_valid = true }; /* hint only */
-	int err;
-
-	err = vfs_fileattr_get(file->f_path.dentry, &fa);
-	if (!err)
-		err = put_user(fa.flags, argp);
-	return err;
-}
-
-static int ioctl_setflags(struct file *file, unsigned int __user *argp)
-{
-	struct mnt_idmap *idmap = file_mnt_idmap(file);
-	struct dentry *dentry = file->f_path.dentry;
-	struct fileattr fa;
-	unsigned int flags;
-	int err;
-
-	err = get_user(flags, argp);
-	if (!err) {
-		err = mnt_want_write_file(file);
-		if (!err) {
-			fileattr_fill_flags(&fa, flags);
-			err = vfs_fileattr_set(idmap, dentry, &fa);
-			mnt_drop_write_file(file);
-		}
-	}
-	return err;
-}
-
-static int ioctl_fsgetxattr(struct file *file, void __user *argp)
-{
-	struct fileattr fa = { .fsx_valid = true }; /* hint only */
-	int err;
-
-	err = vfs_fileattr_get(file->f_path.dentry, &fa);
-	if (!err)
-		err = copy_fsxattr_to_user(&fa, argp);
-
-	return err;
-}
-
-static int ioctl_fssetxattr(struct file *file, void __user *argp)
-{
-	struct mnt_idmap *idmap = file_mnt_idmap(file);
-	struct dentry *dentry = file->f_path.dentry;
-	struct fileattr fa;
-	int err;
-
-	err = copy_fsxattr_from_user(&fa, argp);
-	if (!err) {
-		err = mnt_want_write_file(file);
-		if (!err) {
-			err = vfs_fileattr_set(idmap, dentry, &fa);
-			mnt_drop_write_file(file);
-		}
-	}
-	return err;
-}
-
 static int ioctl_getfsuuid(struct file *file, void __user *argp)
 {
 	struct super_block *sb = file_inode(file)->i_sb;
diff --git a/include/linux/fileattr.h b/include/linux/fileattr.h
index 47c05a9851d0..6030d0bf7ad3 100644
--- a/include/linux/fileattr.h
+++ b/include/linux/fileattr.h
@@ -55,5 +55,9 @@ static inline bool fileattr_has_fsx(const struct fileattr *fa)
 int vfs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
 int vfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry,
 		     struct fileattr *fa);
+int ioctl_getflags(struct file *file, unsigned int __user *argp);
+int ioctl_setflags(struct file *file, unsigned int __user *argp);
+int ioctl_fsgetxattr(struct file *file, void __user *argp);
+int ioctl_fssetxattr(struct file *file, void __user *argp);
 
 #endif /* _LINUX_FILEATTR_H */
-- 
cgit v1.2.3


From defdd02d783c6fa22d0005bdc238ccd9174faf20 Mon Sep 17 00:00:00 2001
From: Andrey Albershteyn <aalbersh@redhat.com>
Date: Mon, 30 Jun 2025 18:20:12 +0200
Subject: lsm: introduce new hooks for setting/getting inode fsxattr

Introduce new hooks for setting and getting filesystem extended
attributes on an inode (FS_IOC_FSSETXATTR and FS_IOC_FSGETXATTR).

Cc: selinux@vger.kernel.org
Cc: Paul Moore <paul@paul-moore.com>

Acked-by: Paul Moore <paul@paul-moore.com>
Signed-off-by: Andrey Albershteyn <aalbersh@kernel.org>
Link: https://lore.kernel.org/20250630-xattrat-syscall-v6-2-c4e3bc35227b@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/file_attr.c                | 19 ++++++++++++++++---
 include/linux/lsm_hook_defs.h |  2 ++
 include/linux/security.h      | 16 ++++++++++++++++
 security/security.c           | 30 ++++++++++++++++++++++++++++++
 4 files changed, 64 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/fs/file_attr.c b/fs/file_attr.c
index a4dd1d9646c8..0aee402acde4 100644
--- a/fs/file_attr.c
+++ b/fs/file_attr.c
@@ -77,10 +77,15 @@ EXPORT_SYMBOL(fileattr_fill_flags);
 int vfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
 {
 	struct inode *inode = d_inode(dentry);
+	int error;
 
 	if (!inode->i_op->fileattr_get)
 		return -ENOIOCTLCMD;
 
+	error = security_inode_file_getattr(dentry, fa);
+	if (error)
+		return error;
+
 	return inode->i_op->fileattr_get(dentry, fa);
 }
 EXPORT_SYMBOL(vfs_fileattr_get);
@@ -243,12 +248,20 @@ int vfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry,
 		} else {
 			fa->flags |= old_ma.flags & ~FS_COMMON_FL;
 		}
+
 		err = fileattr_set_prepare(inode, &old_ma, fa);
-		if (!err)
-			err = inode->i_op->fileattr_set(idmap, dentry, fa);
+		if (err)
+			goto out;
+		err = security_inode_file_setattr(dentry, fa);
+		if (err)
+			goto out;
+		err = inode->i_op->fileattr_set(idmap, dentry, fa);
+		if (err)
+			goto out;
 	}
-	inode_unlock(inode);
 
+out:
+	inode_unlock(inode);
 	return err;
 }
 EXPORT_SYMBOL(vfs_fileattr_set);
diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index bf3bbac4e02a..9600a4350e79 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -157,6 +157,8 @@ LSM_HOOK(int, 0, inode_removexattr, struct mnt_idmap *idmap,
 	 struct dentry *dentry, const char *name)
 LSM_HOOK(void, LSM_RET_VOID, inode_post_removexattr, struct dentry *dentry,
 	 const char *name)
+LSM_HOOK(int, 0, inode_file_setattr, struct dentry *dentry, struct fileattr *fa)
+LSM_HOOK(int, 0, inode_file_getattr, struct dentry *dentry, struct fileattr *fa)
 LSM_HOOK(int, 0, inode_set_acl, struct mnt_idmap *idmap,
 	 struct dentry *dentry, const char *acl_name, struct posix_acl *kacl)
 LSM_HOOK(void, LSM_RET_VOID, inode_post_set_acl, struct dentry *dentry,
diff --git a/include/linux/security.h b/include/linux/security.h
index dba349629229..9ed0d0e0c81f 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -451,6 +451,10 @@ int security_inode_listxattr(struct dentry *dentry);
 int security_inode_removexattr(struct mnt_idmap *idmap,
 			       struct dentry *dentry, const char *name);
 void security_inode_post_removexattr(struct dentry *dentry, const char *name);
+int security_inode_file_setattr(struct dentry *dentry,
+			      struct fileattr *fa);
+int security_inode_file_getattr(struct dentry *dentry,
+			      struct fileattr *fa);
 int security_inode_need_killpriv(struct dentry *dentry);
 int security_inode_killpriv(struct mnt_idmap *idmap, struct dentry *dentry);
 int security_inode_getsecurity(struct mnt_idmap *idmap,
@@ -1052,6 +1056,18 @@ static inline void security_inode_post_removexattr(struct dentry *dentry,
 						   const char *name)
 { }
 
+static inline int security_inode_file_setattr(struct dentry *dentry,
+					      struct fileattr *fa)
+{
+	return 0;
+}
+
+static inline int security_inode_file_getattr(struct dentry *dentry,
+					      struct fileattr *fa)
+{
+	return 0;
+}
+
 static inline int security_inode_need_killpriv(struct dentry *dentry)
 {
 	return cap_inode_need_killpriv(dentry);
diff --git a/security/security.c b/security/security.c
index 596d41818577..711b4de40b8d 100644
--- a/security/security.c
+++ b/security/security.c
@@ -2622,6 +2622,36 @@ void security_inode_post_removexattr(struct dentry *dentry, const char *name)
 	call_void_hook(inode_post_removexattr, dentry, name);
 }
 
+/**
+ * security_inode_file_setattr() - check if setting fsxattr is allowed
+ * @dentry: file to set filesystem extended attributes on
+ * @fa: extended attributes to set on the inode
+ *
+ * Called when file_setattr() syscall or FS_IOC_FSSETXATTR ioctl() is called on
+ * inode
+ *
+ * Return: Returns 0 if permission is granted.
+ */
+int security_inode_file_setattr(struct dentry *dentry, struct fileattr *fa)
+{
+	return call_int_hook(inode_file_setattr, dentry, fa);
+}
+
+/**
+ * security_inode_file_getattr() - check if retrieving fsxattr is allowed
+ * @dentry: file to retrieve filesystem extended attributes from
+ * @fa: extended attributes to get
+ *
+ * Called when file_getattr() syscall or FS_IOC_FSGETXATTR ioctl() is called on
+ * inode
+ *
+ * Return: Returns 0 if permission is granted.
+ */
+int security_inode_file_getattr(struct dentry *dentry, struct fileattr *fa)
+{
+	return call_int_hook(inode_file_getattr, dentry, fa);
+}
+
 /**
  * security_inode_need_killpriv() - Check if security_inode_killpriv() required
  * @dentry: associated dentry
-- 
cgit v1.2.3


From 1a4eabf662543c62ae1e71a26d1c8e6643c66388 Mon Sep 17 00:00:00 2001
From: Nuno Sá <nuno.sa@analog.com>
Date: Tue, 1 Jul 2025 15:32:01 +0100
Subject: mfd: adp5585: Refactor how regmap defaults are handled
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The only thing changing between variants is the regmap default
registers. Hence, instead of having a regmap configuration for every
variant (duplicating lots of fields), add a chip info type of structure
with a regmap ID to identify which defaults to use and populate
regmap_config at runtime given a template plus the id. Also note that
between variants, the defaults can be the same which means the chip info
structure can be used in more than one compatible.

This will also make it simpler to add new chips with more variants.

Also note that the chip info structures are deliberately not const, as
they will also contain lots of members that are the same between the
different device variants, so we will fill those in at runtime.

Signed-off-by: Nuno Sá <nuno.sa@analog.com>
Link: https://lore.kernel.org/r/20250701-dev-adp5589-fw-v7-6-b1fcfe9e9826@analog.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/mfd/adp5585.c       | 80 +++++++++++++++++++++++----------------------
 include/linux/mfd/adp5585.h | 11 +++++++
 2 files changed, 52 insertions(+), 39 deletions(-)

(limited to 'include')

diff --git a/drivers/mfd/adp5585.c b/drivers/mfd/adp5585.c
index c764f4818758..4d92b6362928 100644
--- a/drivers/mfd/adp5585.c
+++ b/drivers/mfd/adp5585.c
@@ -81,42 +81,36 @@ static const u8 adp5585_regmap_defaults_04[ADP5585_MAX_REG + 1] = {
 	/* 0x38 */ 0x00, 0x00, 0x00, 0x00, 0x00,
 };
 
-enum adp5585_regmap_type {
-	ADP5585_REGMAP_00,
-	ADP5585_REGMAP_02,
-	ADP5585_REGMAP_04,
+static const u8 *adp5585_regmap_defaults[ADP5585_MAX] = {
+	[ADP5585_00] = adp5585_regmap_defaults_00,
+	[ADP5585_01] = adp5585_regmap_defaults_00,
+	[ADP5585_02] = adp5585_regmap_defaults_02,
+	[ADP5585_03] = adp5585_regmap_defaults_00,
+	[ADP5585_04] = adp5585_regmap_defaults_04,
 };
 
-static const struct regmap_config adp5585_regmap_configs[] = {
-	[ADP5585_REGMAP_00] = {
-		.reg_bits = 8,
-		.val_bits = 8,
-		.max_register = ADP5585_MAX_REG,
-		.volatile_table = &adp5585_volatile_regs,
-		.cache_type = REGCACHE_MAPLE,
-		.reg_defaults_raw = adp5585_regmap_defaults_00,
-		.num_reg_defaults_raw = sizeof(adp5585_regmap_defaults_00),
-	},
-	[ADP5585_REGMAP_02] = {
-		.reg_bits = 8,
-		.val_bits = 8,
-		.max_register = ADP5585_MAX_REG,
-		.volatile_table = &adp5585_volatile_regs,
-		.cache_type = REGCACHE_MAPLE,
-		.reg_defaults_raw = adp5585_regmap_defaults_02,
-		.num_reg_defaults_raw = sizeof(adp5585_regmap_defaults_02),
-	},
-	[ADP5585_REGMAP_04] = {
-		.reg_bits = 8,
-		.val_bits = 8,
-		.max_register = ADP5585_MAX_REG,
-		.volatile_table = &adp5585_volatile_regs,
-		.cache_type = REGCACHE_MAPLE,
-		.reg_defaults_raw = adp5585_regmap_defaults_04,
-		.num_reg_defaults_raw = sizeof(adp5585_regmap_defaults_04),
-	},
+static const struct regmap_config adp5585_regmap_config_template = {
+	.reg_bits = 8,
+	.val_bits = 8,
+	.max_register = ADP5585_MAX_REG,
+	.volatile_table = &adp5585_volatile_regs,
+	.cache_type = REGCACHE_MAPLE,
+	.num_reg_defaults_raw = ADP5585_MAX_REG + 1,
 };
 
+static struct regmap_config *adp5585_fill_regmap_config(const struct adp5585_dev *adp5585)
+{
+	struct regmap_config *regmap_config;
+
+	regmap_config = devm_kmemdup(adp5585->dev, &adp5585_regmap_config_template,
+				     sizeof(*regmap_config), GFP_KERNEL);
+	if (!regmap_config)
+		return ERR_PTR(-ENOMEM);
+
+	regmap_config->reg_defaults_raw = adp5585_regmap_defaults[adp5585->variant];
+	return regmap_config;
+}
+
 static int adp5585_add_devices(struct device *dev)
 {
 	int ret;
@@ -147,7 +141,7 @@ static void adp5585_osc_disable(void *data)
 
 static int adp5585_i2c_probe(struct i2c_client *i2c)
 {
-	const struct regmap_config *regmap_config;
+	struct regmap_config *regmap_config;
 	struct adp5585_dev *adp5585;
 	unsigned int id;
 	int ret;
@@ -157,8 +151,16 @@ static int adp5585_i2c_probe(struct i2c_client *i2c)
 		return -ENOMEM;
 
 	i2c_set_clientdata(i2c, adp5585);
+	adp5585->dev = &i2c->dev;
+
+	adp5585->variant = (enum adp5585_variant)(uintptr_t)i2c_get_match_data(i2c);
+	if (!adp5585->variant)
+		return -ENODEV;
+
+	regmap_config = adp5585_fill_regmap_config(adp5585);
+	if (IS_ERR(regmap_config))
+		return PTR_ERR(regmap_config);
 
-	regmap_config = i2c_get_match_data(i2c);
 	adp5585->regmap = devm_regmap_init_i2c(i2c, regmap_config);
 	if (IS_ERR(adp5585->regmap))
 		return dev_err_probe(&i2c->dev, PTR_ERR(adp5585->regmap),
@@ -212,19 +214,19 @@ static DEFINE_SIMPLE_DEV_PM_OPS(adp5585_pm, adp5585_suspend, adp5585_resume);
 static const struct of_device_id adp5585_of_match[] = {
 	{
 		.compatible = "adi,adp5585-00",
-		.data = &adp5585_regmap_configs[ADP5585_REGMAP_00],
+		.data = (void *)ADP5585_00,
 	}, {
 		.compatible = "adi,adp5585-01",
-		.data = &adp5585_regmap_configs[ADP5585_REGMAP_00],
+		.data = (void *)ADP5585_01,
 	}, {
 		.compatible = "adi,adp5585-02",
-		.data = &adp5585_regmap_configs[ADP5585_REGMAP_02],
+		.data = (void *)ADP5585_02,
 	}, {
 		.compatible = "adi,adp5585-03",
-		.data = &adp5585_regmap_configs[ADP5585_REGMAP_00],
+		.data = (void *)ADP5585_03,
 	}, {
 		.compatible = "adi,adp5585-04",
-		.data = &adp5585_regmap_configs[ADP5585_REGMAP_04],
+		.data = (void *)ADP5585_04,
 	},
 	{ /* sentinel */ }
 };
diff --git a/include/linux/mfd/adp5585.h b/include/linux/mfd/adp5585.h
index 016033cd68e4..c56af8d8d76c 100644
--- a/include/linux/mfd/adp5585.h
+++ b/include/linux/mfd/adp5585.h
@@ -119,8 +119,19 @@
 
 struct regmap;
 
+enum adp5585_variant {
+	ADP5585_00 = 1,
+	ADP5585_01,
+	ADP5585_02,
+	ADP5585_03,
+	ADP5585_04,
+	ADP5585_MAX
+};
+
 struct adp5585_dev {
+	struct device *dev;
 	struct regmap *regmap;
+	enum adp5585_variant variant;
 };
 
 #endif
-- 
cgit v1.2.3


From 0190a72f28ee0995c546fd4fcf80ed25a0fc4b28 Mon Sep 17 00:00:00 2001
From: Nuno Sá <nuno.sa@analog.com>
Date: Tue, 1 Jul 2025 15:32:02 +0100
Subject: mfd: adp5585: Add support for adp5589
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The ADP5589 is a 19 I/O port expander with built-in keypad matrix decoder,
programmable logic, reset generator, and PWM generator.

Signed-off-by: Nuno Sá <nuno.sa@analog.com>
Link: https://lore.kernel.org/r/20250701-dev-adp5589-fw-v7-7-b1fcfe9e9826@analog.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/mfd/adp5585.c       | 123 ++++++++++++++++++++++++++++++++++++++++----
 include/linux/mfd/adp5585.h |  10 ++++
 2 files changed, 124 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/mfd/adp5585.c b/drivers/mfd/adp5585.c
index 4d92b6362928..00996571ef90 100644
--- a/drivers/mfd/adp5585.c
+++ b/drivers/mfd/adp5585.c
@@ -29,6 +29,11 @@ static const struct mfd_cell adp5585_devs[ADP5585_DEV_MAX] = {
 	MFD_CELL_NAME("adp5585-pwm"),
 };
 
+static const struct mfd_cell adp5589_devs[] = {
+	MFD_CELL_NAME("adp5589-gpio"),
+	MFD_CELL_NAME("adp5589-pwm"),
+};
+
 static const struct regmap_range adp5585_volatile_ranges[] = {
 	regmap_reg_range(ADP5585_ID, ADP5585_GPI_STATUS_B),
 };
@@ -38,6 +43,15 @@ static const struct regmap_access_table adp5585_volatile_regs = {
 	.n_yes_ranges = ARRAY_SIZE(adp5585_volatile_ranges),
 };
 
+static const struct regmap_range adp5589_volatile_ranges[] = {
+	regmap_reg_range(ADP5585_ID, ADP5589_GPI_STATUS_C),
+};
+
+static const struct regmap_access_table adp5589_volatile_regs = {
+	.yes_ranges = adp5589_volatile_ranges,
+	.n_yes_ranges = ARRAY_SIZE(adp5589_volatile_ranges),
+};
+
 /*
  * Chip variants differ in the default configuration of pull-up and pull-down
  * resistors, and therefore have different default register values:
@@ -81,12 +95,54 @@ static const u8 adp5585_regmap_defaults_04[ADP5585_MAX_REG + 1] = {
 	/* 0x38 */ 0x00, 0x00, 0x00, 0x00, 0x00,
 };
 
+static const u8 adp5589_regmap_defaults_00[ADP5589_MAX_REG + 1] = {
+	/* 0x00 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	/* 0x08 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	/* 0x10 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	/* 0x18 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	/* 0x20 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	/* 0x28 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	/* 0x30 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	/* 0x38 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	/* 0x40 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	/* 0x48 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+};
+
+static const u8 adp5589_regmap_defaults_01[ADP5589_MAX_REG + 1] = {
+	/* 0x00 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	/* 0x08 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	/* 0x10 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	/* 0x18 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	/* 0x20 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	/* 0x28 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	/* 0x30 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	/* 0x38 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00,
+	/* 0x40 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	/* 0x48 */ 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00,
+};
+
+static const u8 adp5589_regmap_defaults_02[ADP5589_MAX_REG + 1] = {
+	/* 0x00 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	/* 0x08 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	/* 0x10 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	/* 0x18 */ 0x00, 0x41, 0x01, 0x00, 0x11, 0x04, 0x00, 0x00,
+	/* 0x20 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	/* 0x28 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	/* 0x30 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	/* 0x38 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	/* 0x40 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	/* 0x48 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+};
+
 static const u8 *adp5585_regmap_defaults[ADP5585_MAX] = {
 	[ADP5585_00] = adp5585_regmap_defaults_00,
 	[ADP5585_01] = adp5585_regmap_defaults_00,
 	[ADP5585_02] = adp5585_regmap_defaults_02,
 	[ADP5585_03] = adp5585_regmap_defaults_00,
 	[ADP5585_04] = adp5585_regmap_defaults_04,
+	[ADP5589_00] = adp5589_regmap_defaults_00,
+	[ADP5589_01] = adp5589_regmap_defaults_01,
+	[ADP5589_02] = adp5589_regmap_defaults_02,
 };
 
 static const struct regmap_config adp5585_regmap_config_template = {
@@ -98,33 +154,69 @@ static const struct regmap_config adp5585_regmap_config_template = {
 	.num_reg_defaults_raw = ADP5585_MAX_REG + 1,
 };
 
-static struct regmap_config *adp5585_fill_regmap_config(const struct adp5585_dev *adp5585)
+static const struct regmap_config adp5589_regmap_config_template = {
+	.reg_bits = 8,
+	.val_bits = 8,
+	.max_register = ADP5589_MAX_REG,
+	.volatile_table = &adp5589_volatile_regs,
+	.cache_type = REGCACHE_MAPLE,
+	.num_reg_defaults_raw = ADP5589_MAX_REG + 1,
+};
+
+static struct regmap_config *adp5585_fill_variant_config(struct adp5585_dev *adp5585)
 {
 	struct regmap_config *regmap_config;
 
-	regmap_config = devm_kmemdup(adp5585->dev, &adp5585_regmap_config_template,
-				     sizeof(*regmap_config), GFP_KERNEL);
+	switch (adp5585->variant) {
+	case ADP5585_00:
+	case ADP5585_01:
+	case ADP5585_02:
+	case ADP5585_03:
+	case ADP5585_04:
+		adp5585->id = ADP5585_MAN_ID_VALUE;
+		regmap_config = devm_kmemdup(adp5585->dev, &adp5585_regmap_config_template,
+					     sizeof(*regmap_config), GFP_KERNEL);
+		break;
+	case ADP5589_00:
+	case ADP5589_01:
+	case ADP5589_02:
+		adp5585->id = ADP5589_MAN_ID_VALUE;
+		regmap_config = devm_kmemdup(adp5585->dev, &adp5589_regmap_config_template,
+					     sizeof(*regmap_config), GFP_KERNEL);
+		break;
+	default:
+		return ERR_PTR(-ENODEV);
+	}
+
 	if (!regmap_config)
 		return ERR_PTR(-ENOMEM);
 
 	regmap_config->reg_defaults_raw = adp5585_regmap_defaults[adp5585->variant];
+
 	return regmap_config;
 }
 
-static int adp5585_add_devices(struct device *dev)
+static int adp5585_add_devices(const struct adp5585_dev *adp5585)
 {
+	struct device *dev = adp5585->dev;
+	const struct mfd_cell *cells;
 	int ret;
 
+	if (adp5585->id == ADP5585_MAN_ID_VALUE)
+		cells = adp5585_devs;
+	else
+		cells = adp5589_devs;
+
 	if (device_property_present(dev, "#pwm-cells")) {
 		ret = devm_mfd_add_devices(dev, PLATFORM_DEVID_AUTO,
-					   &adp5585_devs[ADP5585_DEV_PWM], 1, NULL, 0, NULL);
+					   &cells[ADP5585_DEV_PWM], 1, NULL, 0, NULL);
 		if (ret)
 			return dev_err_probe(dev, ret, "Failed to add PWM device\n");
 	}
 
 	if (device_property_present(dev, "#gpio-cells")) {
 		ret = devm_mfd_add_devices(dev, PLATFORM_DEVID_AUTO,
-					   &adp5585_devs[ADP5585_DEV_GPIO], 1, NULL, 0, NULL);
+					   &cells[ADP5585_DEV_GPIO], 1, NULL, 0, NULL);
 		if (ret)
 			return dev_err_probe(dev, ret, "Failed to add GPIO device\n");
 	}
@@ -157,7 +249,7 @@ static int adp5585_i2c_probe(struct i2c_client *i2c)
 	if (!adp5585->variant)
 		return -ENODEV;
 
-	regmap_config = adp5585_fill_regmap_config(adp5585);
+	regmap_config = adp5585_fill_variant_config(adp5585);
 	if (IS_ERR(regmap_config))
 		return PTR_ERR(regmap_config);
 
@@ -171,7 +263,8 @@ static int adp5585_i2c_probe(struct i2c_client *i2c)
 		return dev_err_probe(&i2c->dev, ret,
 				     "Failed to read device ID\n");
 
-	if ((id & ADP5585_MAN_ID_MASK) != ADP5585_MAN_ID_VALUE)
+	id &= ADP5585_MAN_ID_MASK;
+	if (id != adp5585->id)
 		return dev_err_probe(&i2c->dev, -ENODEV,
 				     "Invalid device ID 0x%02x\n", id);
 
@@ -187,7 +280,7 @@ static int adp5585_i2c_probe(struct i2c_client *i2c)
 	if (ret)
 		return ret;
 
-	return adp5585_add_devices(&i2c->dev);
+	return adp5585_add_devices(adp5585);
 }
 
 static int adp5585_suspend(struct device *dev)
@@ -227,6 +320,18 @@ static const struct of_device_id adp5585_of_match[] = {
 	}, {
 		.compatible = "adi,adp5585-04",
 		.data = (void *)ADP5585_04,
+	}, {
+		.compatible = "adi,adp5589-00",
+		.data = (void *)ADP5589_00,
+	}, {
+		.compatible = "adi,adp5589-01",
+		.data = (void *)ADP5589_01,
+	}, {
+		.compatible = "adi,adp5589-02",
+		.data = (void *)ADP5589_02,
+	}, {
+		.compatible = "adi,adp5589",
+		.data = (void *)ADP5589_00,
 	},
 	{ /* sentinel */ }
 };
diff --git a/include/linux/mfd/adp5585.h b/include/linux/mfd/adp5585.h
index c56af8d8d76c..70e58122a36a 100644
--- a/include/linux/mfd/adp5585.h
+++ b/include/linux/mfd/adp5585.h
@@ -117,6 +117,12 @@
 #define ADP5585_BANK(n)			((n) >= 6 ? 1 : 0)
 #define ADP5585_BIT(n)			((n) >= 6 ? BIT((n) - 6) : BIT(n))
 
+/* ADP5589 */
+#define		ADP5589_MAN_ID_VALUE		0x10
+#define ADP5589_GPI_STATUS_C		0x18
+#define ADP5589_INT_EN			0x4e
+#define ADP5589_MAX_REG			ADP5589_INT_EN
+
 struct regmap;
 
 enum adp5585_variant {
@@ -125,6 +131,9 @@ enum adp5585_variant {
 	ADP5585_02,
 	ADP5585_03,
 	ADP5585_04,
+	ADP5589_00,
+	ADP5589_01,
+	ADP5589_02,
 	ADP5585_MAX
 };
 
@@ -132,6 +141,7 @@ struct adp5585_dev {
 	struct device *dev;
 	struct regmap *regmap;
 	enum adp5585_variant variant;
+	unsigned int id;
 };
 
 #endif
-- 
cgit v1.2.3


From 7077fb501b95360c7fe35553f2bdb1ccf34edd16 Mon Sep 17 00:00:00 2001
From: Nuno Sá <nuno.sa@analog.com>
Date: Tue, 1 Jul 2025 15:32:03 +0100
Subject: mfd: adp5585: Add a per chip reg structure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are some differences in the register map between the devices.
Hence, add a register structure per device. This will be needed in
following patches.

On top of that adp5585_fill_regmap_config() is renamed and reworked so
that the current struct adp5585_info acts as a template (it indeed
contains all the data that differs between variants) which can then be
complemented depending on the device (as identified by the id register).
It is done this way since a lot of the data is pretty much the same
between variants of the same device.

Reviewed-by: Lee Jones <lee@kernel.org>
Signed-off-by: Nuno Sá <nuno.sa@analog.com>
Link: https://lore.kernel.org/r/20250701-dev-adp5589-fw-v7-8-b1fcfe9e9826@analog.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/mfd/adp5585.c       | 10 ++++++++++
 include/linux/mfd/adp5585.h |  6 ++++++
 2 files changed, 16 insertions(+)

(limited to 'include')

diff --git a/drivers/mfd/adp5585.c b/drivers/mfd/adp5585.c
index 00996571ef90..ae12372bdde9 100644
--- a/drivers/mfd/adp5585.c
+++ b/drivers/mfd/adp5585.c
@@ -163,6 +163,14 @@ static const struct regmap_config adp5589_regmap_config_template = {
 	.num_reg_defaults_raw = ADP5589_MAX_REG + 1,
 };
 
+static const struct adp5585_regs adp5585_regs = {
+	.ext_cfg = ADP5585_PIN_CONFIG_C,
+};
+
+static const struct adp5585_regs adp5589_regs = {
+	.ext_cfg = ADP5589_PIN_CONFIG_D,
+};
+
 static struct regmap_config *adp5585_fill_variant_config(struct adp5585_dev *adp5585)
 {
 	struct regmap_config *regmap_config;
@@ -174,6 +182,7 @@ static struct regmap_config *adp5585_fill_variant_config(struct adp5585_dev *adp
 	case ADP5585_03:
 	case ADP5585_04:
 		adp5585->id = ADP5585_MAN_ID_VALUE;
+		adp5585->regs = &adp5585_regs;
 		regmap_config = devm_kmemdup(adp5585->dev, &adp5585_regmap_config_template,
 					     sizeof(*regmap_config), GFP_KERNEL);
 		break;
@@ -181,6 +190,7 @@ static struct regmap_config *adp5585_fill_variant_config(struct adp5585_dev *adp
 	case ADP5589_01:
 	case ADP5589_02:
 		adp5585->id = ADP5589_MAN_ID_VALUE;
+		adp5585->regs = &adp5589_regs;
 		regmap_config = devm_kmemdup(adp5585->dev, &adp5589_regmap_config_template,
 					     sizeof(*regmap_config), GFP_KERNEL);
 		break;
diff --git a/include/linux/mfd/adp5585.h b/include/linux/mfd/adp5585.h
index 70e58122a36a..6ecb90a6276c 100644
--- a/include/linux/mfd/adp5585.h
+++ b/include/linux/mfd/adp5585.h
@@ -120,6 +120,7 @@
 /* ADP5589 */
 #define		ADP5589_MAN_ID_VALUE		0x10
 #define ADP5589_GPI_STATUS_C		0x18
+#define ADP5589_PIN_CONFIG_D		0x4C
 #define ADP5589_INT_EN			0x4e
 #define ADP5589_MAX_REG			ADP5589_INT_EN
 
@@ -137,9 +138,14 @@ enum adp5585_variant {
 	ADP5585_MAX
 };
 
+struct adp5585_regs {
+	unsigned int ext_cfg;
+};
+
 struct adp5585_dev {
 	struct device *dev;
 	struct regmap *regmap;
+	const struct adp5585_regs *regs;
 	enum adp5585_variant variant;
 	unsigned int id;
 };
-- 
cgit v1.2.3


From 9f425bf713b511b1078e0fea5a88c497e13dbb64 Mon Sep 17 00:00:00 2001
From: Nuno Sá <nuno.sa@analog.com>
Date: Tue, 1 Jul 2025 15:32:04 +0100
Subject: gpio: adp5585: add support for the adp5589 expander
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Support the adp5589 I/O expander which supports up to 19 pins. We need
to add a chip_info based struct since accessing register "banks"
and "bits" differs between devices.

Also some register addresses are different.

While at it move ADP558X_GPIO_MAX defines to the main header file and
rename them. That information will be needed by the top level device in
a following change.

Acked-by: Linus Walleij <linus.walleij@linaro.org>
Acked-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Signed-off-by: Nuno Sá <nuno.sa@analog.com>
Link: https://lore.kernel.org/r/20250701-dev-adp5589-fw-v7-9-b1fcfe9e9826@analog.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/gpio/gpio-adp5585.c | 151 ++++++++++++++++++++++++++++++++++----------
 include/linux/mfd/adp5585.h |  18 +++---
 2 files changed, 126 insertions(+), 43 deletions(-)

(limited to 'include')

diff --git a/drivers/gpio/gpio-adp5585.c b/drivers/gpio/gpio-adp5585.c
index d5c0f1b267c8..cdf107742579 100644
--- a/drivers/gpio/gpio-adp5585.c
+++ b/drivers/gpio/gpio-adp5585.c
@@ -4,6 +4,7 @@
  *
  * Copyright 2022 NXP
  * Copyright 2024 Ideas on Board Oy
+ * Copyright 2025 Analog Devices, Inc.
  */
 
 #include <linux/device.h>
@@ -14,57 +15,106 @@
 #include <linux/regmap.h>
 #include <linux/types.h>
 
-#define ADP5585_GPIO_MAX	11
+/*
+ * Bank 0 covers pins "GPIO 1/R0" to "GPIO 6/R5", numbered 0 to 5 by the
+ * driver, and bank 1 covers pins "GPIO 7/C0" to "GPIO 11/C4", numbered 6 to
+ * 10. Some variants of the ADP5585 don't support "GPIO 6/R5". As the driver
+ * uses identical GPIO numbering for all variants to avoid confusion, GPIO 5 is
+ * marked as reserved in the device tree for variants that don't support it.
+ */
+#define ADP5585_BANK(n)			((n) >= 6 ? 1 : 0)
+#define ADP5585_BIT(n)			((n) >= 6 ? BIT((n) - 6) : BIT(n))
+
+/*
+ * Bank 0 covers pins "GPIO 1/R0" to "GPIO 8/R7", numbered 0 to 7 by the
+ * driver, bank 1 covers pins "GPIO 9/C0" to "GPIO 16/C7", numbered 8 to
+ * 15 and bank 2 covers pins "GPIO 17/C8" to "GPIO 19/C10", numbered 16 to 18.
+ */
+#define ADP5589_BANK(n)			((n) >> 3)
+#define ADP5589_BIT(n)			BIT((n) & 0x7)
+
+struct adp5585_gpio_chip {
+	int (*bank)(unsigned int off);
+	int (*bit)(unsigned int off);
+	unsigned int max_gpio;
+	unsigned int debounce_dis_a;
+	unsigned int rpull_cfg_a;
+	unsigned int gpo_data_a;
+	unsigned int gpo_out_a;
+	unsigned int gpio_dir_a;
+	unsigned int gpi_stat_a;
+	bool has_bias_hole;
+};
 
 struct adp5585_gpio_dev {
 	struct gpio_chip gpio_chip;
+	const struct adp5585_gpio_chip *info;
 	struct regmap *regmap;
 };
 
+static int adp5585_gpio_bank(unsigned int off)
+{
+	return ADP5585_BANK(off);
+}
+
+static int adp5585_gpio_bit(unsigned int off)
+{
+	return ADP5585_BIT(off);
+}
+
+static int adp5589_gpio_bank(unsigned int off)
+{
+	return ADP5589_BANK(off);
+}
+
+static int adp5589_gpio_bit(unsigned int off)
+{
+	return ADP5589_BIT(off);
+}
+
 static int adp5585_gpio_get_direction(struct gpio_chip *chip, unsigned int off)
 {
 	struct adp5585_gpio_dev *adp5585_gpio = gpiochip_get_data(chip);
-	unsigned int bank = ADP5585_BANK(off);
-	unsigned int bit = ADP5585_BIT(off);
+	const struct adp5585_gpio_chip *info = adp5585_gpio->info;
 	unsigned int val;
 
-	regmap_read(adp5585_gpio->regmap, ADP5585_GPIO_DIRECTION_A + bank, &val);
+	regmap_read(adp5585_gpio->regmap, info->gpio_dir_a + info->bank(off), &val);
 
-	return val & bit ? GPIO_LINE_DIRECTION_OUT : GPIO_LINE_DIRECTION_IN;
+	return val & info->bit(off) ? GPIO_LINE_DIRECTION_OUT : GPIO_LINE_DIRECTION_IN;
 }
 
 static int adp5585_gpio_direction_input(struct gpio_chip *chip, unsigned int off)
 {
 	struct adp5585_gpio_dev *adp5585_gpio = gpiochip_get_data(chip);
-	unsigned int bank = ADP5585_BANK(off);
-	unsigned int bit = ADP5585_BIT(off);
+	const struct adp5585_gpio_chip *info = adp5585_gpio->info;
 
-	return regmap_clear_bits(adp5585_gpio->regmap,
-				 ADP5585_GPIO_DIRECTION_A + bank, bit);
+	return regmap_clear_bits(adp5585_gpio->regmap, info->gpio_dir_a + info->bank(off),
+				 info->bit(off));
 }
 
 static int adp5585_gpio_direction_output(struct gpio_chip *chip, unsigned int off, int val)
 {
 	struct adp5585_gpio_dev *adp5585_gpio = gpiochip_get_data(chip);
-	unsigned int bank = ADP5585_BANK(off);
-	unsigned int bit = ADP5585_BIT(off);
+	const struct adp5585_gpio_chip *info = adp5585_gpio->info;
+	unsigned int bank = info->bank(off);
+	unsigned int bit = info->bit(off);
 	int ret;
 
-	ret = regmap_update_bits(adp5585_gpio->regmap,
-				 ADP5585_GPO_DATA_OUT_A + bank, bit,
-				 val ? bit : 0);
+	ret = regmap_update_bits(adp5585_gpio->regmap, info->gpo_data_a + bank,
+				 bit, val ? bit : 0);
 	if (ret)
 		return ret;
 
-	return regmap_set_bits(adp5585_gpio->regmap,
-			       ADP5585_GPIO_DIRECTION_A + bank, bit);
+	return regmap_set_bits(adp5585_gpio->regmap, info->gpio_dir_a + bank,
+			       bit);
 }
 
 static int adp5585_gpio_get_value(struct gpio_chip *chip, unsigned int off)
 {
 	struct adp5585_gpio_dev *adp5585_gpio = gpiochip_get_data(chip);
-	unsigned int bank = ADP5585_BANK(off);
-	unsigned int bit = ADP5585_BIT(off);
+	const struct adp5585_gpio_chip *info = adp5585_gpio->info;
+	unsigned int bank = info->bank(off);
+	unsigned int bit = info->bit(off);
 	unsigned int reg;
 	unsigned int val;
 
@@ -79,8 +129,8 @@ static int adp5585_gpio_get_value(struct gpio_chip *chip, unsigned int off)
 	 * .direction_input(), .direction_output() or .set() operations racing
 	 * with this.
 	 */
-	regmap_read(adp5585_gpio->regmap, ADP5585_GPIO_DIRECTION_A + bank, &val);
-	reg = val & bit ? ADP5585_GPO_DATA_OUT_A : ADP5585_GPI_STATUS_A;
+	regmap_read(adp5585_gpio->regmap, info->gpio_dir_a + bank, &val);
+	reg = val & bit ? info->gpo_data_a : info->gpi_stat_a;
 	regmap_read(adp5585_gpio->regmap, reg + bank, &val);
 
 	return !!(val & bit);
@@ -90,17 +140,17 @@ static int adp5585_gpio_set_value(struct gpio_chip *chip, unsigned int off,
 				  int val)
 {
 	struct adp5585_gpio_dev *adp5585_gpio = gpiochip_get_data(chip);
-	unsigned int bank = ADP5585_BANK(off);
-	unsigned int bit = ADP5585_BIT(off);
+	const struct adp5585_gpio_chip *info = adp5585_gpio->info;
+	unsigned int bit = adp5585_gpio->info->bit(off);
 
-	return regmap_update_bits(adp5585_gpio->regmap,
-				  ADP5585_GPO_DATA_OUT_A + bank,
+	return regmap_update_bits(adp5585_gpio->regmap, info->gpo_data_a + info->bank(off),
 				  bit, val ? bit : 0);
 }
 
 static int adp5585_gpio_set_bias(struct adp5585_gpio_dev *adp5585_gpio,
 				 unsigned int off, unsigned int bias)
 {
+	const struct adp5585_gpio_chip *info = adp5585_gpio->info;
 	unsigned int bit, reg, mask, val;
 
 	/*
@@ -108,8 +158,10 @@ static int adp5585_gpio_set_bias(struct adp5585_gpio_dev *adp5585_gpio,
 	 * consecutive registers ADP5585_RPULL_CONFIG_*, with a hole of 4 bits
 	 * after R5.
 	 */
-	bit = off * 2 + (off > 5 ? 4 : 0);
-	reg = ADP5585_RPULL_CONFIG_A + bit / 8;
+	bit = off * 2;
+	if (info->has_bias_hole)
+		bit += (off > 5 ? 4 : 0);
+	reg = info->rpull_cfg_a + bit / 8;
 	mask = ADP5585_Rx_PULL_CFG_MASK << (bit % 8);
 	val = bias << (bit % 8);
 
@@ -119,22 +171,22 @@ static int adp5585_gpio_set_bias(struct adp5585_gpio_dev *adp5585_gpio,
 static int adp5585_gpio_set_drive(struct adp5585_gpio_dev *adp5585_gpio,
 				  unsigned int off, enum pin_config_param drive)
 {
-	unsigned int bank = ADP5585_BANK(off);
-	unsigned int bit = ADP5585_BIT(off);
+	const struct adp5585_gpio_chip *info = adp5585_gpio->info;
+	unsigned int bit = adp5585_gpio->info->bit(off);
 
 	return regmap_update_bits(adp5585_gpio->regmap,
-				  ADP5585_GPO_OUT_MODE_A + bank, bit,
+				  info->gpo_out_a + info->bank(off), bit,
 				  drive == PIN_CONFIG_DRIVE_OPEN_DRAIN ? bit : 0);
 }
 
 static int adp5585_gpio_set_debounce(struct adp5585_gpio_dev *adp5585_gpio,
 				     unsigned int off, unsigned int debounce)
 {
-	unsigned int bank = ADP5585_BANK(off);
-	unsigned int bit = ADP5585_BIT(off);
+	const struct adp5585_gpio_chip *info = adp5585_gpio->info;
+	unsigned int bit = adp5585_gpio->info->bit(off);
 
 	return regmap_update_bits(adp5585_gpio->regmap,
-				  ADP5585_DEBOUNCE_DIS_A + bank, bit,
+				  info->debounce_dis_a + info->bank(off), bit,
 				  debounce ? 0 : bit);
 }
 
@@ -175,6 +227,7 @@ static int adp5585_gpio_set_config(struct gpio_chip *chip, unsigned int off,
 static int adp5585_gpio_probe(struct platform_device *pdev)
 {
 	struct adp5585_dev *adp5585 = dev_get_drvdata(pdev->dev.parent);
+	const struct platform_device_id *id = platform_get_device_id(pdev);
 	struct adp5585_gpio_dev *adp5585_gpio;
 	struct device *dev = &pdev->dev;
 	struct gpio_chip *gc;
@@ -186,6 +239,10 @@ static int adp5585_gpio_probe(struct platform_device *pdev)
 
 	adp5585_gpio->regmap = adp5585->regmap;
 
+	adp5585_gpio->info = (const struct adp5585_gpio_chip *)id->driver_data;
+	if (!adp5585_gpio->info)
+		return -ENODEV;
+
 	device_set_of_node_from_dev(dev, dev->parent);
 
 	gc = &adp5585_gpio->gpio_chip;
@@ -199,7 +256,7 @@ static int adp5585_gpio_probe(struct platform_device *pdev)
 	gc->can_sleep = true;
 
 	gc->base = -1;
-	gc->ngpio = ADP5585_GPIO_MAX;
+	gc->ngpio = adp5585_gpio->info->max_gpio;
 	gc->label = pdev->name;
 	gc->owner = THIS_MODULE;
 
@@ -211,8 +268,34 @@ static int adp5585_gpio_probe(struct platform_device *pdev)
 	return 0;
 }
 
+static const struct adp5585_gpio_chip adp5585_gpio_chip_info = {
+	.bank = adp5585_gpio_bank,
+	.bit = adp5585_gpio_bit,
+	.debounce_dis_a = ADP5585_DEBOUNCE_DIS_A,
+	.rpull_cfg_a = ADP5585_RPULL_CONFIG_A,
+	.gpo_data_a = ADP5585_GPO_DATA_OUT_A,
+	.gpo_out_a = ADP5585_GPO_OUT_MODE_A,
+	.gpio_dir_a = ADP5585_GPIO_DIRECTION_A,
+	.gpi_stat_a = ADP5585_GPI_STATUS_A,
+	.max_gpio = ADP5585_PIN_MAX,
+	.has_bias_hole = true,
+};
+
+static const struct adp5585_gpio_chip adp5589_gpio_chip_info = {
+	.bank = adp5589_gpio_bank,
+	.bit = adp5589_gpio_bit,
+	.debounce_dis_a = ADP5589_DEBOUNCE_DIS_A,
+	.rpull_cfg_a = ADP5589_RPULL_CONFIG_A,
+	.gpo_data_a = ADP5589_GPO_DATA_OUT_A,
+	.gpo_out_a = ADP5589_GPO_OUT_MODE_A,
+	.gpio_dir_a = ADP5589_GPIO_DIRECTION_A,
+	.gpi_stat_a = ADP5589_GPI_STATUS_A,
+	.max_gpio = ADP5589_PIN_MAX,
+};
+
 static const struct platform_device_id adp5585_gpio_id_table[] = {
-	{ "adp5585-gpio" },
+	{ "adp5585-gpio", (kernel_ulong_t)&adp5585_gpio_chip_info },
+	{ "adp5589-gpio", (kernel_ulong_t)&adp5589_gpio_chip_info },
 	{ /* Sentinel */ }
 };
 MODULE_DEVICE_TABLE(platform, adp5585_gpio_id_table);
diff --git a/include/linux/mfd/adp5585.h b/include/linux/mfd/adp5585.h
index 6ecb90a6276c..d26f722cf31a 100644
--- a/include/linux/mfd/adp5585.h
+++ b/include/linux/mfd/adp5585.h
@@ -107,23 +107,23 @@
 
 #define ADP5585_MAX_REG			ADP5585_INT_EN
 
-/*
- * Bank 0 covers pins "GPIO 1/R0" to "GPIO 6/R5", numbered 0 to 5 by the
- * driver, and bank 1 covers pins "GPIO 7/C0" to "GPIO 11/C4", numbered 6 to
- * 10. Some variants of the ADP5585 don't support "GPIO 6/R5". As the driver
- * uses identical GPIO numbering for all variants to avoid confusion, GPIO 5 is
- * marked as reserved in the device tree for variants that don't support it.
- */
-#define ADP5585_BANK(n)			((n) >= 6 ? 1 : 0)
-#define ADP5585_BIT(n)			((n) >= 6 ? BIT((n) - 6) : BIT(n))
+#define ADP5585_PIN_MAX			11
 
 /* ADP5589 */
 #define		ADP5589_MAN_ID_VALUE		0x10
+#define ADP5589_GPI_STATUS_A		0x16
 #define ADP5589_GPI_STATUS_C		0x18
+#define ADP5589_RPULL_CONFIG_A		0x19
+#define ADP5589_DEBOUNCE_DIS_A		0x27
+#define ADP5589_GPO_DATA_OUT_A		0x2a
+#define ADP5589_GPO_OUT_MODE_A		0x2d
+#define		ADP5589_GPIO_DIRECTION_A	0x30
 #define ADP5589_PIN_CONFIG_D		0x4C
 #define ADP5589_INT_EN			0x4e
 #define ADP5589_MAX_REG			ADP5589_INT_EN
 
+#define ADP5589_PIN_MAX			19
+
 struct regmap;
 
 enum adp5585_variant {
-- 
cgit v1.2.3


From 75024f97e82e63d02b0743500efb1e264a1c2dd4 Mon Sep 17 00:00:00 2001
From: Nuno Sá <nuno.sa@analog.com>
Date: Tue, 1 Jul 2025 15:32:05 +0100
Subject: pwm: adp5585: add support for adp5589
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add support for the adp5589 I/O expander. From a PWM point of view it is
pretty similar to adp5585. Main difference is the address
of registers meaningful for configuring the PWM.

Acked-by: Uwe Kleine-König <ukleinek@kernel.org>
Signed-off-by: Nuno Sá <nuno.sa@analog.com>
Link: https://lore.kernel.org/r/20250701-dev-adp5589-fw-v7-10-b1fcfe9e9826@analog.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/pwm/pwm-adp5585.c   | 73 ++++++++++++++++++++++++++++++++++-----------
 include/linux/mfd/adp5585.h |  3 ++
 2 files changed, 59 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/drivers/pwm/pwm-adp5585.c b/drivers/pwm/pwm-adp5585.c
index add36bc7d221..dc2860979e24 100644
--- a/drivers/pwm/pwm-adp5585.c
+++ b/drivers/pwm/pwm-adp5585.c
@@ -33,21 +33,33 @@
 #define ADP5585_PWM_MIN_PERIOD_NS	(2ULL * NSEC_PER_SEC / ADP5585_PWM_OSC_FREQ_HZ)
 #define ADP5585_PWM_MAX_PERIOD_NS	(2ULL * 0xffff * NSEC_PER_SEC / ADP5585_PWM_OSC_FREQ_HZ)
 
+struct adp5585_pwm_chip {
+	unsigned int pwm_cfg;
+	unsigned int pwm_offt_low;
+	unsigned int pwm_ont_low;
+};
+
+struct adp5585_pwm {
+	const struct adp5585_pwm_chip *info;
+	struct regmap *regmap;
+	unsigned int ext_cfg;
+};
+
 static int pwm_adp5585_request(struct pwm_chip *chip, struct pwm_device *pwm)
 {
-	struct regmap *regmap = pwmchip_get_drvdata(chip);
+	struct adp5585_pwm *adp5585_pwm = pwmchip_get_drvdata(chip);
 
 	/* Configure the R3 pin as PWM output. */
-	return regmap_update_bits(regmap, ADP5585_PIN_CONFIG_C,
+	return regmap_update_bits(adp5585_pwm->regmap, adp5585_pwm->ext_cfg,
 				  ADP5585_R3_EXTEND_CFG_MASK,
 				  ADP5585_R3_EXTEND_CFG_PWM_OUT);
 }
 
 static void pwm_adp5585_free(struct pwm_chip *chip, struct pwm_device *pwm)
 {
-	struct regmap *regmap = pwmchip_get_drvdata(chip);
+	struct adp5585_pwm *adp5585_pwm = pwmchip_get_drvdata(chip);
 
-	regmap_update_bits(regmap, ADP5585_PIN_CONFIG_C,
+	regmap_update_bits(adp5585_pwm->regmap, adp5585_pwm->ext_cfg,
 			   ADP5585_R3_EXTEND_CFG_MASK,
 			   ADP5585_R3_EXTEND_CFG_GPIO4);
 }
@@ -56,14 +68,16 @@ static int pwm_adp5585_apply(struct pwm_chip *chip,
 			     struct pwm_device *pwm,
 			     const struct pwm_state *state)
 {
-	struct regmap *regmap = pwmchip_get_drvdata(chip);
+	struct adp5585_pwm *adp5585_pwm = pwmchip_get_drvdata(chip);
+	const struct adp5585_pwm_chip *info = adp5585_pwm->info;
+	struct regmap *regmap = adp5585_pwm->regmap;
 	u64 period, duty_cycle;
 	u32 on, off;
 	__le16 val;
 	int ret;
 
 	if (!state->enabled) {
-		regmap_clear_bits(regmap, ADP5585_PWM_CFG, ADP5585_PWM_EN);
+		regmap_clear_bits(regmap, info->pwm_cfg, ADP5585_PWM_EN);
 		return 0;
 	}
 
@@ -84,41 +98,43 @@ static int pwm_adp5585_apply(struct pwm_chip *chip,
 	off = div_u64(period, NSEC_PER_SEC / ADP5585_PWM_OSC_FREQ_HZ) - on;
 
 	val = cpu_to_le16(off);
-	ret = regmap_bulk_write(regmap, ADP5585_PWM_OFFT_LOW, &val, 2);
+	ret = regmap_bulk_write(regmap, info->pwm_offt_low, &val, 2);
 	if (ret)
 		return ret;
 
 	val = cpu_to_le16(on);
-	ret = regmap_bulk_write(regmap, ADP5585_PWM_ONT_LOW, &val, 2);
+	ret = regmap_bulk_write(regmap, info->pwm_ont_low, &val, 2);
 	if (ret)
 		return ret;
 
 	/* Enable PWM in continuous mode and no external AND'ing. */
-	ret = regmap_update_bits(regmap, ADP5585_PWM_CFG,
+	ret = regmap_update_bits(regmap, info->pwm_cfg,
 				 ADP5585_PWM_IN_AND | ADP5585_PWM_MODE |
 				 ADP5585_PWM_EN, ADP5585_PWM_EN);
 	if (ret)
 		return ret;
 
-	return regmap_set_bits(regmap, ADP5585_PWM_CFG, ADP5585_PWM_EN);
+	return regmap_set_bits(regmap, info->pwm_cfg, ADP5585_PWM_EN);
 }
 
 static int pwm_adp5585_get_state(struct pwm_chip *chip,
 				 struct pwm_device *pwm,
 				 struct pwm_state *state)
 {
-	struct regmap *regmap = pwmchip_get_drvdata(chip);
+	struct adp5585_pwm *adp5585_pwm = pwmchip_get_drvdata(chip);
+	const struct adp5585_pwm_chip *info = adp5585_pwm->info;
+	struct regmap *regmap = adp5585_pwm->regmap;
 	unsigned int on, off;
 	unsigned int val;
 	__le16 on_off;
 	int ret;
 
-	ret = regmap_bulk_read(regmap, ADP5585_PWM_OFFT_LOW, &on_off, 2);
+	ret = regmap_bulk_read(regmap, info->pwm_offt_low, &on_off, 2);
 	if (ret)
 		return ret;
 	off = le16_to_cpu(on_off);
 
-	ret = regmap_bulk_read(regmap, ADP5585_PWM_ONT_LOW, &on_off, 2);
+	ret = regmap_bulk_read(regmap, info->pwm_ont_low, &on_off, 2);
 	if (ret)
 		return ret;
 	on = le16_to_cpu(on_off);
@@ -128,7 +144,7 @@ static int pwm_adp5585_get_state(struct pwm_chip *chip,
 
 	state->polarity = PWM_POLARITY_NORMAL;
 
-	regmap_read(regmap, ADP5585_PWM_CFG, &val);
+	regmap_read(regmap, info->pwm_cfg, &val);
 	state->enabled = !!(val & ADP5585_PWM_EN);
 
 	return 0;
@@ -143,18 +159,28 @@ static const struct pwm_ops adp5585_pwm_ops = {
 
 static int adp5585_pwm_probe(struct platform_device *pdev)
 {
+	const struct platform_device_id *id = platform_get_device_id(pdev);
 	struct device *dev = &pdev->dev;
 	struct adp5585_dev *adp5585 = dev_get_drvdata(dev->parent);
+	struct adp5585_pwm *adp5585_pwm;
 	struct pwm_chip *chip;
 	int ret;
 
-	chip = devm_pwmchip_alloc(dev, ADP5585_PWM_CHAN_NUM, 0);
+	chip = devm_pwmchip_alloc(dev, ADP5585_PWM_CHAN_NUM,
+				  sizeof(*adp5585_pwm));
 	if (IS_ERR(chip))
 		return PTR_ERR(chip);
 
+	adp5585_pwm = pwmchip_get_drvdata(chip);
+	adp5585_pwm->regmap = adp5585->regmap;
+	adp5585_pwm->ext_cfg = adp5585->regs->ext_cfg;
+
+	adp5585_pwm->info = (const struct adp5585_pwm_chip *)id->driver_data;
+	if (!adp5585_pwm->info)
+		return -ENODEV;
+
 	device_set_of_node_from_dev(dev, dev->parent);
 
-	pwmchip_set_drvdata(chip, adp5585->regmap);
 	chip->ops = &adp5585_pwm_ops;
 
 	ret = devm_pwmchip_add(dev, chip);
@@ -164,8 +190,21 @@ static int adp5585_pwm_probe(struct platform_device *pdev)
 	return 0;
 }
 
+static const struct adp5585_pwm_chip adp5585_pwm_chip_info = {
+	.pwm_cfg = ADP5585_PWM_CFG,
+	.pwm_offt_low = ADP5585_PWM_OFFT_LOW,
+	.pwm_ont_low = ADP5585_PWM_ONT_LOW,
+};
+
+static const struct adp5585_pwm_chip adp5589_pwm_chip_info = {
+	.pwm_cfg = ADP5589_PWM_CFG,
+	.pwm_offt_low = ADP5589_PWM_OFFT_LOW,
+	.pwm_ont_low = ADP5589_PWM_ONT_LOW,
+};
+
 static const struct platform_device_id adp5585_pwm_id_table[] = {
-	{ "adp5585-pwm" },
+	{ "adp5585-pwm", (kernel_ulong_t)&adp5585_pwm_chip_info },
+	{ "adp5589-pwm", (kernel_ulong_t)&adp5589_pwm_chip_info },
 	{ /* Sentinel */ }
 };
 MODULE_DEVICE_TABLE(platform, adp5585_pwm_id_table);
diff --git a/include/linux/mfd/adp5585.h b/include/linux/mfd/adp5585.h
index d26f722cf31a..77f7c74f084d 100644
--- a/include/linux/mfd/adp5585.h
+++ b/include/linux/mfd/adp5585.h
@@ -118,6 +118,9 @@
 #define ADP5589_GPO_DATA_OUT_A		0x2a
 #define ADP5589_GPO_OUT_MODE_A		0x2d
 #define		ADP5589_GPIO_DIRECTION_A	0x30
+#define ADP5589_PWM_OFFT_LOW		0x3e
+#define ADP5589_PWM_ONT_LOW		0x40
+#define ADP5589_PWM_CFG			0x42
 #define ADP5589_PIN_CONFIG_D		0x4C
 #define ADP5589_INT_EN			0x4e
 #define ADP5589_MAX_REG			ADP5589_INT_EN
-- 
cgit v1.2.3


From 47a1f759b776ec9287f675f5d4fbf60b94cc566d Mon Sep 17 00:00:00 2001
From: Nuno Sá <nuno.sa@analog.com>
Date: Tue, 1 Jul 2025 15:32:07 +0100
Subject: mfd: adp5585: Add support for event handling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These devices are capable of generating FIFO based events based on KEY or
GPI presses. Add support for handling these events. This is in
preparation for adding full support for keymap and GPI based events.

Reviewed-by: Lee Jones <lee@kernel.org>
Signed-off-by: Nuno Sá <nuno.sa@analog.com>
Link: https://lore.kernel.org/r/20250701-dev-adp5589-fw-v7-12-b1fcfe9e9826@analog.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/mfd/adp5585.c       | 176 ++++++++++++++++++++++++++++++++++++++++++--
 include/linux/mfd/adp5585.h |  18 +++++
 2 files changed, 186 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/mfd/adp5585.c b/drivers/mfd/adp5585.c
index ae12372bdde9..ae2448697ef4 100644
--- a/drivers/mfd/adp5585.c
+++ b/drivers/mfd/adp5585.c
@@ -8,6 +8,7 @@
  */
 
 #include <linux/array_size.h>
+#include <linux/bitfield.h>
 #include <linux/device.h>
 #include <linux/err.h>
 #include <linux/i2c.h>
@@ -165,10 +166,16 @@ static const struct regmap_config adp5589_regmap_config_template = {
 
 static const struct adp5585_regs adp5585_regs = {
 	.ext_cfg = ADP5585_PIN_CONFIG_C,
+	.int_en = ADP5585_INT_EN,
+	.gen_cfg = ADP5585_GENERAL_CFG,
+	.poll_ptime_cfg = ADP5585_POLL_PTIME_CFG,
 };
 
 static const struct adp5585_regs adp5589_regs = {
 	.ext_cfg = ADP5589_PIN_CONFIG_D,
+	.int_en = ADP5589_INT_EN,
+	.gen_cfg = ADP5589_GENERAL_CFG,
+	.poll_ptime_cfg = ADP5589_POLL_PTIME_CFG,
 };
 
 static struct regmap_config *adp5585_fill_variant_config(struct adp5585_dev *adp5585)
@@ -241,6 +248,146 @@ static void adp5585_osc_disable(void *data)
 	regmap_write(adp5585->regmap, ADP5585_GENERAL_CFG, 0);
 }
 
+static void adp5585_report_events(struct adp5585_dev *adp5585, int ev_cnt)
+{
+	unsigned int i;
+
+	for (i = 0; i < ev_cnt; i++) {
+		unsigned long key_val, key_press;
+		unsigned int key;
+		int ret;
+
+		ret = regmap_read(adp5585->regmap, ADP5585_FIFO_1 + i, &key);
+		if (ret)
+			return;
+
+		key_val = FIELD_GET(ADP5585_KEY_EVENT_MASK, key);
+		key_press = FIELD_GET(ADP5585_KEV_EV_PRESS_MASK, key);
+
+		blocking_notifier_call_chain(&adp5585->event_notifier, key_val, (void *)key_press);
+	}
+}
+
+static irqreturn_t adp5585_irq(int irq, void *data)
+{
+	struct adp5585_dev *adp5585 = data;
+	unsigned int status, ev_cnt;
+	int ret;
+
+	ret = regmap_read(adp5585->regmap, ADP5585_INT_STATUS, &status);
+	if (ret)
+		return IRQ_HANDLED;
+
+	if (status & ADP5585_OVRFLOW_INT)
+		dev_err_ratelimited(adp5585->dev, "Event overflow error\n");
+
+	if (!(status & ADP5585_EVENT_INT))
+		goto out_irq;
+
+	ret = regmap_read(adp5585->regmap, ADP5585_STATUS, &ev_cnt);
+	if (ret)
+		goto out_irq;
+
+	ev_cnt = FIELD_GET(ADP5585_EC_MASK, ev_cnt);
+	if (!ev_cnt)
+		goto out_irq;
+
+	adp5585_report_events(adp5585, ev_cnt);
+out_irq:
+	regmap_write(adp5585->regmap, ADP5585_INT_STATUS, status);
+	return IRQ_HANDLED;
+}
+
+static int adp5585_setup(struct adp5585_dev *adp5585)
+{
+	const struct adp5585_regs *regs = adp5585->regs;
+	unsigned int reg_val, i;
+	int ret;
+
+	/* Clear any possible event by reading all the FIFO entries */
+	for (i = 0; i < ADP5585_EV_MAX; i++) {
+		ret = regmap_read(adp5585->regmap, ADP5585_FIFO_1 + i, &reg_val);
+		if (ret)
+			return ret;
+	}
+
+	ret = regmap_write(adp5585->regmap, regs->poll_ptime_cfg, adp5585->ev_poll_time);
+	if (ret)
+		return ret;
+
+	/*
+	 * Enable the internal oscillator, as it's shared between multiple
+	 * functions.
+	 */
+	ret = regmap_write(adp5585->regmap, regs->gen_cfg,
+			   ADP5585_OSC_FREQ_500KHZ | ADP5585_INT_CFG | ADP5585_OSC_EN);
+	if (ret)
+		return ret;
+
+	return devm_add_action_or_reset(adp5585->dev, adp5585_osc_disable, adp5585);
+}
+
+static int adp5585_parse_fw(struct adp5585_dev *adp5585)
+{
+	unsigned int prop_val;
+	int ret;
+
+	ret = device_property_read_u32(adp5585->dev, "poll-interval", &prop_val);
+	if (!ret) {
+		adp5585->ev_poll_time = prop_val / 10 - 1;
+		/*
+		 * ev_poll_time is the raw value to be written on the register and 0 to 3 are the
+		 * valid values.
+		 */
+		if (adp5585->ev_poll_time > 3)
+			return dev_err_probe(adp5585->dev, -EINVAL,
+					     "Invalid value(%u) for poll-interval\n", prop_val);
+	}
+
+	return 0;
+}
+
+static void adp5585_irq_disable(void *data)
+{
+	struct adp5585_dev *adp5585 = data;
+
+	regmap_write(adp5585->regmap, adp5585->regs->int_en, 0);
+}
+
+static int adp5585_irq_enable(struct i2c_client *i2c,
+			      struct adp5585_dev *adp5585)
+{
+	const struct adp5585_regs *regs = adp5585->regs;
+	unsigned int stat;
+	int ret;
+
+	if (i2c->irq <= 0)
+		return 0;
+
+	ret = devm_request_threaded_irq(&i2c->dev, i2c->irq, NULL, adp5585_irq,
+					IRQF_ONESHOT, i2c->name, adp5585);
+	if (ret)
+		return ret;
+
+	/*
+	 * Clear any possible outstanding interrupt before enabling them. We do that by reading
+	 * the status register and writing back the same value.
+	 */
+	ret = regmap_read(adp5585->regmap, ADP5585_INT_STATUS, &stat);
+	if (ret)
+		return ret;
+
+	ret = regmap_write(adp5585->regmap, ADP5585_INT_STATUS, stat);
+	if (ret)
+		return ret;
+
+	ret = regmap_write(adp5585->regmap, regs->int_en, ADP5585_OVRFLOW_IEN | ADP5585_EVENT_IEN);
+	if (ret)
+		return ret;
+
+	return devm_add_action_or_reset(&i2c->dev, adp5585_irq_disable, adp5585);
+}
+
 static int adp5585_i2c_probe(struct i2c_client *i2c)
 {
 	struct regmap_config *regmap_config;
@@ -254,6 +401,8 @@ static int adp5585_i2c_probe(struct i2c_client *i2c)
 
 	i2c_set_clientdata(i2c, adp5585);
 	adp5585->dev = &i2c->dev;
+	adp5585->irq = i2c->irq;
+	BLOCKING_INIT_NOTIFIER_HEAD(&adp5585->event_notifier);
 
 	adp5585->variant = (enum adp5585_variant)(uintptr_t)i2c_get_match_data(i2c);
 	if (!adp5585->variant)
@@ -278,25 +427,28 @@ static int adp5585_i2c_probe(struct i2c_client *i2c)
 		return dev_err_probe(&i2c->dev, -ENODEV,
 				     "Invalid device ID 0x%02x\n", id);
 
-	/*
-	 * Enable the internal oscillator, as it's shared between multiple
-	 * functions.
-	 */
-	ret = regmap_set_bits(adp5585->regmap, ADP5585_GENERAL_CFG, ADP5585_OSC_EN);
+	ret = adp5585_parse_fw(adp5585);
 	if (ret)
 		return ret;
 
-	ret = devm_add_action_or_reset(&i2c->dev, adp5585_osc_disable, adp5585);
+	ret = adp5585_setup(adp5585);
 	if (ret)
 		return ret;
 
-	return adp5585_add_devices(adp5585);
+	ret = adp5585_add_devices(adp5585);
+	if (ret)
+		return ret;
+
+	return adp5585_irq_enable(i2c, adp5585);
 }
 
 static int adp5585_suspend(struct device *dev)
 {
 	struct adp5585_dev *adp5585 = dev_get_drvdata(dev);
 
+	if (adp5585->irq)
+		disable_irq(adp5585->irq);
+
 	regcache_cache_only(adp5585->regmap, true);
 
 	return 0;
@@ -305,11 +457,19 @@ static int adp5585_suspend(struct device *dev)
 static int adp5585_resume(struct device *dev)
 {
 	struct adp5585_dev *adp5585 = dev_get_drvdata(dev);
+	int ret;
 
 	regcache_cache_only(adp5585->regmap, false);
 	regcache_mark_dirty(adp5585->regmap);
 
-	return regcache_sync(adp5585->regmap);
+	ret = regcache_sync(adp5585->regmap);
+	if (ret)
+		return ret;
+
+	if (adp5585->irq)
+		enable_irq(adp5585->irq);
+
+	return 0;
 }
 
 static DEFINE_SIMPLE_DEV_PM_OPS(adp5585_pm, adp5585_suspend, adp5585_resume);
diff --git a/include/linux/mfd/adp5585.h b/include/linux/mfd/adp5585.h
index 77f7c74f084d..43a33a3d3f5a 100644
--- a/include/linux/mfd/adp5585.h
+++ b/include/linux/mfd/adp5585.h
@@ -10,13 +10,20 @@
 #define __MFD_ADP5585_H_
 
 #include <linux/bits.h>
+#include <linux/notifier.h>
 
 #define ADP5585_ID			0x00
 #define		ADP5585_MAN_ID_VALUE		0x20
 #define		ADP5585_MAN_ID_MASK		GENMASK(7, 4)
+#define		ADP5585_REV_ID_MASK		GENMASK(3, 0)
 #define ADP5585_INT_STATUS		0x01
+#define		ADP5585_OVRFLOW_INT		BIT(2)
+#define		ADP5585_EVENT_INT		BIT(0)
 #define ADP5585_STATUS			0x02
+#define		ADP5585_EC_MASK			GENMASK(4, 0)
 #define ADP5585_FIFO_1			0x03
+#define		ADP5585_KEV_EV_PRESS_MASK	BIT(7)
+#define		ADP5585_KEY_EVENT_MASK		GENMASK(6, 0)
 #define ADP5585_FIFO_2			0x04
 #define ADP5585_FIFO_3			0x05
 #define ADP5585_FIFO_4			0x06
@@ -32,6 +39,7 @@
 #define ADP5585_FIFO_14			0x10
 #define ADP5585_FIFO_15			0x11
 #define ADP5585_FIFO_16			0x12
+#define		ADP5585_EV_MAX			(ADP5585_FIFO_16 - ADP5585_FIFO_1 + 1)
 #define ADP5585_GPI_INT_STAT_A		0x13
 #define ADP5585_GPI_INT_STAT_B		0x14
 #define ADP5585_GPI_STATUS_A		0x15
@@ -104,6 +112,8 @@
 #define		ADP5585_INT_CFG			BIT(1)
 #define		ADP5585_RST_CFG			BIT(0)
 #define ADP5585_INT_EN			0x3c
+#define		ADP5585_OVRFLOW_IEN		BIT(2)
+#define		ADP5585_EVENT_IEN		BIT(0)
 
 #define ADP5585_MAX_REG			ADP5585_INT_EN
 
@@ -121,7 +131,9 @@
 #define ADP5589_PWM_OFFT_LOW		0x3e
 #define ADP5589_PWM_ONT_LOW		0x40
 #define ADP5589_PWM_CFG			0x42
+#define ADP5589_POLL_PTIME_CFG		0x48
 #define ADP5589_PIN_CONFIG_D		0x4C
+#define ADP5589_GENERAL_CFG		0x4d
 #define ADP5589_INT_EN			0x4e
 #define ADP5589_MAX_REG			ADP5589_INT_EN
 
@@ -142,15 +154,21 @@ enum adp5585_variant {
 };
 
 struct adp5585_regs {
+	unsigned int gen_cfg;
 	unsigned int ext_cfg;
+	unsigned int int_en;
+	unsigned int poll_ptime_cfg;
 };
 
 struct adp5585_dev {
 	struct device *dev;
 	struct regmap *regmap;
 	const struct adp5585_regs *regs;
+	struct blocking_notifier_head event_notifier;
 	enum adp5585_variant variant;
 	unsigned int id;
+	int irq;
+	unsigned int ev_poll_time;
 };
 
 #endif
-- 
cgit v1.2.3


From 333812da70d5f71bf5e176f6d55a5f716301b5fc Mon Sep 17 00:00:00 2001
From: Nuno Sá <nuno.sa@analog.com>
Date: Tue, 1 Jul 2025 15:32:08 +0100
Subject: mfd: adp5585: Support reset and unlock events
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The ADP558x family of devices can be programmed to respond to some
special events. In the case of unlock events, one can lock the keypad
and use KEY or GPI events to unlock it. For reset events, one can
again use a combination of GPIs/KEYs in order to generate an event that
will trigger the device to generate an output reset pulse.

Signed-off-by: Nuno Sá <nuno.sa@analog.com>
Link: https://lore.kernel.org/r/20250701-dev-adp5589-fw-v7-13-b1fcfe9e9826@analog.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/mfd/adp5585.c       | 274 +++++++++++++++++++++++++++++++++++++++++++-
 include/linux/mfd/adp5585.h |  40 +++++++
 2 files changed, 312 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/mfd/adp5585.c b/drivers/mfd/adp5585.c
index ae2448697ef4..30014deee41f 100644
--- a/drivers/mfd/adp5585.c
+++ b/drivers/mfd/adp5585.c
@@ -169,6 +169,9 @@ static const struct adp5585_regs adp5585_regs = {
 	.int_en = ADP5585_INT_EN,
 	.gen_cfg = ADP5585_GENERAL_CFG,
 	.poll_ptime_cfg = ADP5585_POLL_PTIME_CFG,
+	.reset_cfg = ADP5585_RESET_CFG,
+	.reset1_event_a = ADP5585_RESET1_EVENT_A,
+	.reset2_event_a = ADP5585_RESET2_EVENT_A,
 };
 
 static const struct adp5585_regs adp5589_regs = {
@@ -176,8 +179,54 @@ static const struct adp5585_regs adp5589_regs = {
 	.int_en = ADP5589_INT_EN,
 	.gen_cfg = ADP5589_GENERAL_CFG,
 	.poll_ptime_cfg = ADP5589_POLL_PTIME_CFG,
+	.reset_cfg = ADP5589_RESET_CFG,
+	.reset1_event_a = ADP5589_RESET1_EVENT_A,
+	.reset2_event_a = ADP5589_RESET2_EVENT_A,
 };
 
+static int adp5585_validate_event(const struct adp5585_dev *adp5585, unsigned int ev)
+{
+	if (adp5585->has_pin6) {
+		if (ev >= ADP5585_ROW5_KEY_EVENT_START && ev <= ADP5585_ROW5_KEY_EVENT_END)
+			return 0;
+		if (ev >= ADP5585_GPI_EVENT_START && ev <= ADP5585_GPI_EVENT_END)
+			return 0;
+
+		return dev_err_probe(adp5585->dev, -EINVAL,
+				     "Invalid unlock/reset event(%u) for this device\n", ev);
+	}
+
+	if (ev >= ADP5585_KEY_EVENT_START && ev <= ADP5585_KEY_EVENT_END)
+		return 0;
+	if (ev >= ADP5585_GPI_EVENT_START && ev <= ADP5585_GPI_EVENT_END) {
+		/*
+		 * Some variants of the adp5585 do not have the Row 5
+		 * (meaning pin 6 or GPIO 6) available. Instead that pin serves
+		 * as a reset pin. So, we need to make sure no event is
+		 * configured for it.
+		 */
+		if (ev == (ADP5585_GPI_EVENT_START + 5))
+			return dev_err_probe(adp5585->dev, -EINVAL,
+					     "Invalid unlock/reset event(%u). R5 not available\n",
+					     ev);
+		return 0;
+	}
+
+	return dev_err_probe(adp5585->dev, -EINVAL,
+			     "Invalid unlock/reset event(%u) for this device\n", ev);
+}
+
+static int adp5589_validate_event(const struct adp5585_dev *adp5585, unsigned int ev)
+{
+	if (ev >= ADP5589_KEY_EVENT_START && ev <= ADP5589_KEY_EVENT_END)
+		return 0;
+	if (ev >= ADP5589_GPI_EVENT_START && ev <= ADP5589_GPI_EVENT_END)
+		return 0;
+
+	return dev_err_probe(adp5585->dev, -EINVAL,
+			     "Invalid unlock/reset event(%u) for this device\n", ev);
+}
+
 static struct regmap_config *adp5585_fill_variant_config(struct adp5585_dev *adp5585)
 {
 	struct regmap_config *regmap_config;
@@ -190,6 +239,8 @@ static struct regmap_config *adp5585_fill_variant_config(struct adp5585_dev *adp
 	case ADP5585_04:
 		adp5585->id = ADP5585_MAN_ID_VALUE;
 		adp5585->regs = &adp5585_regs;
+		if (adp5585->variant == ADP5585_01)
+			adp5585->has_pin6 = true;
 		regmap_config = devm_kmemdup(adp5585->dev, &adp5585_regmap_config_template,
 					     sizeof(*regmap_config), GFP_KERNEL);
 		break;
@@ -198,6 +249,8 @@ static struct regmap_config *adp5585_fill_variant_config(struct adp5585_dev *adp
 	case ADP5589_02:
 		adp5585->id = ADP5589_MAN_ID_VALUE;
 		adp5585->regs = &adp5589_regs;
+		adp5585->has_unlock = true;
+		adp5585->has_pin6 = true;
 		regmap_config = devm_kmemdup(adp5585->dev, &adp5589_regmap_config_template,
 					     sizeof(*regmap_config), GFP_KERNEL);
 		break;
@@ -213,6 +266,167 @@ static struct regmap_config *adp5585_fill_variant_config(struct adp5585_dev *adp
 	return regmap_config;
 }
 
+static int adp5585_parse_ev_array(const struct adp5585_dev *adp5585, const char *prop, u32 *events,
+				  u32 *n_events, u32 max_evs, bool reset_ev)
+{
+	struct device *dev = adp5585->dev;
+	unsigned int ev;
+	int ret;
+
+	/*
+	 * The device has the capability of handling special events through GPIs or a Keypad:
+	 *  unlock events: Unlock the keymap until one of the configured events is detected.
+	 *  reset events: Generate a reset pulse when one of the configured events is detected.
+	 */
+	ret = device_property_count_u32(dev, prop);
+	if (ret < 0)
+		return 0;
+
+	*n_events = ret;
+
+	if (!adp5585->has_unlock && !reset_ev)
+		return dev_err_probe(dev, -EOPNOTSUPP, "Unlock keys not supported\n");
+
+	if (*n_events > max_evs)
+		return dev_err_probe(dev, -EINVAL,
+				     "Invalid number of keys(%u > %u) for %s\n",
+				     *n_events, max_evs, prop);
+
+	ret = device_property_read_u32_array(dev, prop, events, *n_events);
+	if (ret)
+		return ret;
+
+	for (ev = 0; ev < *n_events; ev++) {
+		if (!reset_ev && events[ev] == ADP5589_UNLOCK_WILDCARD)
+			continue;
+
+		if (adp5585->id == ADP5585_MAN_ID_VALUE)
+			ret = adp5585_validate_event(adp5585, events[ev]);
+		else
+			ret = adp5589_validate_event(adp5585, events[ev]);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int adp5585_unlock_ev_parse(struct adp5585_dev *adp5585)
+{
+	struct device *dev = adp5585->dev;
+	int ret;
+
+	ret = adp5585_parse_ev_array(adp5585, "adi,unlock-events", adp5585->unlock_keys,
+				     &adp5585->nkeys_unlock, ARRAY_SIZE(adp5585->unlock_keys),
+				     false);
+	if (ret)
+		return ret;
+	if (!adp5585->nkeys_unlock)
+		return 0;
+
+	ret = device_property_read_u32(dev, "adi,unlock-trigger-sec", &adp5585->unlock_time);
+	if (!ret) {
+		if (adp5585->unlock_time > ADP5585_MAX_UNLOCK_TIME_SEC)
+			return dev_err_probe(dev, -EINVAL,
+					     "Invalid unlock time(%u > %d)\n",
+					     adp5585->unlock_time,
+					     ADP5585_MAX_UNLOCK_TIME_SEC);
+	}
+
+	return 0;
+}
+
+static int adp5585_reset_ev_parse(struct adp5585_dev *adp5585)
+{
+	struct device *dev = adp5585->dev;
+	u32 prop_val;
+	int ret;
+
+	ret = adp5585_parse_ev_array(adp5585, "adi,reset1-events", adp5585->reset1_keys,
+				     &adp5585->nkeys_reset1,
+				     ARRAY_SIZE(adp5585->reset1_keys), true);
+	if (ret)
+		return ret;
+
+	ret = adp5585_parse_ev_array(adp5585, "adi,reset2-events",
+				     adp5585->reset2_keys,
+				     &adp5585->nkeys_reset2,
+				     ARRAY_SIZE(adp5585->reset2_keys), true);
+	if (ret)
+		return ret;
+
+	if (!adp5585->nkeys_reset1 && !adp5585->nkeys_reset2)
+		return 0;
+
+	if (adp5585->nkeys_reset1 && device_property_read_bool(dev, "adi,reset1-active-high"))
+		adp5585->reset_cfg |= FIELD_PREP(ADP5585_RESET1_POL, 1);
+
+	if (adp5585->nkeys_reset2 && device_property_read_bool(dev, "adi,reset2-active-high"))
+		adp5585->reset_cfg |= FIELD_PREP(ADP5585_RESET2_POL, 1);
+
+	if (device_property_read_bool(dev, "adi,rst-passthrough-enable"))
+		adp5585->reset_cfg |= FIELD_PREP(ADP5585_RST_PASSTHRU_EN, 1);
+
+	ret = device_property_read_u32(dev, "adi,reset-trigger-ms", &prop_val);
+	if (!ret) {
+		switch (prop_val) {
+		case 0:
+			adp5585->reset_cfg |= FIELD_PREP(ADP5585_RESET_TRIG_TIME, 0);
+			break;
+		case 1000:
+			adp5585->reset_cfg |= FIELD_PREP(ADP5585_RESET_TRIG_TIME, 1);
+			break;
+		case 1500:
+			adp5585->reset_cfg |= FIELD_PREP(ADP5585_RESET_TRIG_TIME, 2);
+			break;
+		case 2000:
+			adp5585->reset_cfg |= FIELD_PREP(ADP5585_RESET_TRIG_TIME, 3);
+			break;
+		case 2500:
+			adp5585->reset_cfg |= FIELD_PREP(ADP5585_RESET_TRIG_TIME, 4);
+			break;
+		case 3000:
+			adp5585->reset_cfg |= FIELD_PREP(ADP5585_RESET_TRIG_TIME, 5);
+			break;
+		case 3500:
+			adp5585->reset_cfg |= FIELD_PREP(ADP5585_RESET_TRIG_TIME, 6);
+			break;
+		case 4000:
+			adp5585->reset_cfg |= FIELD_PREP(ADP5585_RESET_TRIG_TIME, 7);
+			break;
+		default:
+			return dev_err_probe(dev, -EINVAL,
+					     "Invalid value(%u) for adi,reset-trigger-ms\n",
+					     prop_val);
+		}
+	}
+
+	ret = device_property_read_u32(dev, "adi,reset-pulse-width-us", &prop_val);
+	if (!ret) {
+		switch (prop_val) {
+		case 500:
+			adp5585->reset_cfg |= FIELD_PREP(ADP5585_PULSE_WIDTH, 0);
+			break;
+		case 1000:
+			adp5585->reset_cfg |= FIELD_PREP(ADP5585_PULSE_WIDTH, 1);
+			break;
+		case 2000:
+			adp5585->reset_cfg |= FIELD_PREP(ADP5585_PULSE_WIDTH, 2);
+			break;
+		case 10000:
+			adp5585->reset_cfg |= FIELD_PREP(ADP5585_PULSE_WIDTH, 3);
+			break;
+		default:
+			return dev_err_probe(dev, -EINVAL,
+					     "Invalid value(%u) for adi,reset-pulse-width-us\n",
+					     prop_val);
+		}
+		return ret;
+	}
+
+	return 0;
+}
+
 static int adp5585_add_devices(const struct adp5585_dev *adp5585)
 {
 	struct device *dev = adp5585->dev;
@@ -301,9 +515,61 @@ out_irq:
 static int adp5585_setup(struct adp5585_dev *adp5585)
 {
 	const struct adp5585_regs *regs = adp5585->regs;
-	unsigned int reg_val, i;
+	unsigned int reg_val = 0, i;
 	int ret;
 
+	/* Configure the device with reset and unlock events */
+	for (i = 0; i < adp5585->nkeys_unlock; i++) {
+		ret = regmap_write(adp5585->regmap, ADP5589_UNLOCK1 + i,
+				   adp5585->unlock_keys[i] | ADP5589_UNLOCK_EV_PRESS);
+		if (ret)
+			return ret;
+	}
+
+	if (adp5585->nkeys_unlock) {
+		ret = regmap_update_bits(adp5585->regmap, ADP5589_UNLOCK_TIMERS,
+					 ADP5589_UNLOCK_TIMER, adp5585->unlock_time);
+		if (ret)
+			return ret;
+
+		ret = regmap_set_bits(adp5585->regmap, ADP5589_LOCK_CFG, ADP5589_LOCK_EN);
+		if (ret)
+			return ret;
+	}
+
+	for (i = 0; i < adp5585->nkeys_reset1; i++) {
+		ret = regmap_write(adp5585->regmap, regs->reset1_event_a + i,
+				   adp5585->reset1_keys[i] | ADP5585_RESET_EV_PRESS);
+		if (ret)
+			return ret;
+	}
+
+	for (i = 0; i < adp5585->nkeys_reset2; i++) {
+		ret = regmap_write(adp5585->regmap, regs->reset2_event_a + i,
+				   adp5585->reset2_keys[i] | ADP5585_RESET_EV_PRESS);
+		if (ret)
+			return ret;
+	}
+
+	if (adp5585->nkeys_reset1 || adp5585->nkeys_reset2) {
+		ret = regmap_write(adp5585->regmap, regs->reset_cfg, adp5585->reset_cfg);
+		if (ret)
+			return ret;
+
+		/* If there's a reset1 event, then R4 is used as an output for the reset signal */
+		if (adp5585->nkeys_reset1)
+			reg_val = ADP5585_R4_EXTEND_CFG_RESET1;
+		/* If there's a reset2 event, then C4 is used as an output for the reset signal */
+		if (adp5585->nkeys_reset2)
+			reg_val |= ADP5585_C4_EXTEND_CFG_RESET2;
+
+		ret = regmap_update_bits(adp5585->regmap, regs->ext_cfg,
+					 ADP5585_C4_EXTEND_CFG_MASK | ADP5585_R4_EXTEND_CFG_MASK,
+					 reg_val);
+		if (ret)
+			return ret;
+	}
+
 	/* Clear any possible event by reading all the FIFO entries */
 	for (i = 0; i < ADP5585_EV_MAX; i++) {
 		ret = regmap_read(adp5585->regmap, ADP5585_FIFO_1 + i, &reg_val);
@@ -344,7 +610,11 @@ static int adp5585_parse_fw(struct adp5585_dev *adp5585)
 					     "Invalid value(%u) for poll-interval\n", prop_val);
 	}
 
-	return 0;
+	ret = adp5585_unlock_ev_parse(adp5585);
+	if (ret)
+		return ret;
+
+	return adp5585_reset_ev_parse(adp5585);
 }
 
 static void adp5585_irq_disable(void *data)
diff --git a/include/linux/mfd/adp5585.h b/include/linux/mfd/adp5585.h
index 43a33a3d3f5a..db483ef9693a 100644
--- a/include/linux/mfd/adp5585.h
+++ b/include/linux/mfd/adp5585.h
@@ -68,6 +68,7 @@
 #define ADP5585_GPIO_DIRECTION_A	0x27
 #define ADP5585_GPIO_DIRECTION_B	0x28
 #define ADP5585_RESET1_EVENT_A		0x29
+#define		ADP5585_RESET_EV_PRESS		BIT(7)
 #define ADP5585_RESET1_EVENT_B		0x2a
 #define ADP5585_RESET1_EVENT_C		0x2b
 #define ADP5585_RESET2_EVENT_A		0x2c
@@ -118,6 +119,13 @@
 #define ADP5585_MAX_REG			ADP5585_INT_EN
 
 #define ADP5585_PIN_MAX			11
+#define ADP5585_MAX_UNLOCK_TIME_SEC	7
+#define ADP5585_KEY_EVENT_START		1
+#define ADP5585_KEY_EVENT_END		25
+#define ADP5585_GPI_EVENT_START		37
+#define ADP5585_GPI_EVENT_END		47
+#define ADP5585_ROW5_KEY_EVENT_START	1
+#define ADP5585_ROW5_KEY_EVENT_END	30
 
 /* ADP5589 */
 #define		ADP5589_MAN_ID_VALUE		0x10
@@ -128,6 +136,20 @@
 #define ADP5589_GPO_DATA_OUT_A		0x2a
 #define ADP5589_GPO_OUT_MODE_A		0x2d
 #define		ADP5589_GPIO_DIRECTION_A	0x30
+#define ADP5589_UNLOCK1			0x33
+#define		ADP5589_UNLOCK_EV_PRESS		BIT(7)
+#define ADP5589_UNLOCK_TIMERS		0x36
+#define		ADP5589_UNLOCK_TIMER		GENMASK(2, 0)
+#define ADP5589_LOCK_CFG		0x37
+#define		ADP5589_LOCK_EN			BIT(0)
+#define ADP5589_RESET1_EVENT_A		0x38
+#define ADP5589_RESET2_EVENT_A		0x3B
+#define ADP5589_RESET_CFG		0x3D
+#define		ADP5585_RESET2_POL		BIT(7)
+#define		ADP5585_RESET1_POL		BIT(6)
+#define		ADP5585_RST_PASSTHRU_EN		BIT(5)
+#define		ADP5585_RESET_TRIG_TIME		GENMASK(4, 2)
+#define		ADP5585_PULSE_WIDTH		GENMASK(1, 0)
 #define ADP5589_PWM_OFFT_LOW		0x3e
 #define ADP5589_PWM_ONT_LOW		0x40
 #define ADP5589_PWM_CFG			0x42
@@ -138,6 +160,11 @@
 #define ADP5589_MAX_REG			ADP5589_INT_EN
 
 #define ADP5589_PIN_MAX			19
+#define ADP5589_KEY_EVENT_START		1
+#define ADP5589_KEY_EVENT_END		88
+#define ADP5589_GPI_EVENT_START		97
+#define ADP5589_GPI_EVENT_END		115
+#define ADP5589_UNLOCK_WILDCARD		127
 
 struct regmap;
 
@@ -158,6 +185,9 @@ struct adp5585_regs {
 	unsigned int ext_cfg;
 	unsigned int int_en;
 	unsigned int poll_ptime_cfg;
+	unsigned int reset_cfg;
+	unsigned int reset1_event_a;
+	unsigned int reset2_event_a;
 };
 
 struct adp5585_dev {
@@ -167,8 +197,18 @@ struct adp5585_dev {
 	struct blocking_notifier_head event_notifier;
 	enum adp5585_variant variant;
 	unsigned int id;
+	bool has_unlock;
+	bool has_pin6;
 	int irq;
 	unsigned int ev_poll_time;
+	unsigned int unlock_time;
+	unsigned int unlock_keys[2];
+	unsigned int nkeys_unlock;
+	unsigned int reset1_keys[3];
+	unsigned int nkeys_reset1;
+	unsigned int reset2_keys[2];
+	unsigned int nkeys_reset2;
+	u8 reset_cfg;
 };
 
 #endif
-- 
cgit v1.2.3


From bd113a13e1fa51789f55987369b80e1d8bc19389 Mon Sep 17 00:00:00 2001
From: Nuno Sá <nuno.sa@analog.com>
Date: Tue, 1 Jul 2025 15:32:09 +0100
Subject: mfd: adp5585: Add support for input devices
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The ADP558x family supports a built-in keypad matrix decoder which can
be added as an Input device. In order to support both the Input and the
GPIO devices, we need to create a bitmap of the supported pins and track
their usage since they can either be used as GPIOs (GPIs) or as part of
the keymap.

We also need to mark special pins busy in case some features are being
used (ex: pwm or reset events).

Signed-off-by: Nuno Sá <nuno.sa@analog.com>
Link: https://lore.kernel.org/r/20250701-dev-adp5589-fw-v7-14-b1fcfe9e9826@analog.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/mfd/adp5585.c       | 31 +++++++++++++++++++++++++++++++
 include/linux/mfd/adp5585.h | 10 ++++++++++
 2 files changed, 41 insertions(+)

(limited to 'include')

diff --git a/drivers/mfd/adp5585.c b/drivers/mfd/adp5585.c
index 30014deee41f..8f0fd7374426 100644
--- a/drivers/mfd/adp5585.c
+++ b/drivers/mfd/adp5585.c
@@ -22,17 +22,20 @@
 enum {
 	ADP5585_DEV_GPIO,
 	ADP5585_DEV_PWM,
+	ADP5585_DEV_INPUT,
 	ADP5585_DEV_MAX
 };
 
 static const struct mfd_cell adp5585_devs[ADP5585_DEV_MAX] = {
 	MFD_CELL_NAME("adp5585-gpio"),
 	MFD_CELL_NAME("adp5585-pwm"),
+	MFD_CELL_NAME("adp5585-keys"),
 };
 
 static const struct mfd_cell adp5589_devs[] = {
 	MFD_CELL_NAME("adp5589-gpio"),
 	MFD_CELL_NAME("adp5589-pwm"),
+	MFD_CELL_NAME("adp5589-keys"),
 };
 
 static const struct regmap_range adp5585_volatile_ranges[] = {
@@ -172,6 +175,7 @@ static const struct adp5585_regs adp5585_regs = {
 	.reset_cfg = ADP5585_RESET_CFG,
 	.reset1_event_a = ADP5585_RESET1_EVENT_A,
 	.reset2_event_a = ADP5585_RESET2_EVENT_A,
+	.pin_cfg_a = ADP5585_PIN_CONFIG_A,
 };
 
 static const struct adp5585_regs adp5589_regs = {
@@ -182,6 +186,7 @@ static const struct adp5585_regs adp5589_regs = {
 	.reset_cfg = ADP5589_RESET_CFG,
 	.reset1_event_a = ADP5589_RESET1_EVENT_A,
 	.reset2_event_a = ADP5589_RESET2_EVENT_A,
+	.pin_cfg_a = ADP5589_PIN_CONFIG_A,
 };
 
 static int adp5585_validate_event(const struct adp5585_dev *adp5585, unsigned int ev)
@@ -239,6 +244,8 @@ static struct regmap_config *adp5585_fill_variant_config(struct adp5585_dev *adp
 	case ADP5585_04:
 		adp5585->id = ADP5585_MAN_ID_VALUE;
 		adp5585->regs = &adp5585_regs;
+		adp5585->n_pins = ADP5585_PIN_MAX;
+		adp5585->reset2_out = ADP5585_RESET2_OUT;
 		if (adp5585->variant == ADP5585_01)
 			adp5585->has_pin6 = true;
 		regmap_config = devm_kmemdup(adp5585->dev, &adp5585_regmap_config_template,
@@ -251,6 +258,8 @@ static struct regmap_config *adp5585_fill_variant_config(struct adp5585_dev *adp
 		adp5585->regs = &adp5589_regs;
 		adp5585->has_unlock = true;
 		adp5585->has_pin6 = true;
+		adp5585->n_pins = ADP5589_PIN_MAX;
+		adp5585->reset2_out = ADP5589_RESET2_OUT;
 		regmap_config = devm_kmemdup(adp5585->dev, &adp5589_regmap_config_template,
 					     sizeof(*regmap_config), GFP_KERNEL);
 		break;
@@ -439,6 +448,8 @@ static int adp5585_add_devices(const struct adp5585_dev *adp5585)
 		cells = adp5589_devs;
 
 	if (device_property_present(dev, "#pwm-cells")) {
+		/* Make sure the PWM output pin is not used by the GPIO or INPUT devices */
+		__set_bit(ADP5585_PWM_OUT, adp5585->pin_usage);
 		ret = devm_mfd_add_devices(dev, PLATFORM_DEVID_AUTO,
 					   &cells[ADP5585_DEV_PWM], 1, NULL, 0, NULL);
 		if (ret)
@@ -452,6 +463,13 @@ static int adp5585_add_devices(const struct adp5585_dev *adp5585)
 			return dev_err_probe(dev, ret, "Failed to add GPIO device\n");
 	}
 
+	if (device_property_present(adp5585->dev, "adi,keypad-pins")) {
+		ret = devm_mfd_add_devices(adp5585->dev, PLATFORM_DEVID_AUTO,
+					   &cells[ADP5585_DEV_INPUT], 1, NULL, 0, NULL);
+		if (ret)
+			return dev_err_probe(dev, ret, "Failed to add input device\n");
+	}
+
 	return 0;
 }
 
@@ -518,6 +536,10 @@ static int adp5585_setup(struct adp5585_dev *adp5585)
 	unsigned int reg_val = 0, i;
 	int ret;
 
+	/* If pin_6 (ROW5/GPI6) is not available, make sure to mark it as "busy" */
+	if (!adp5585->has_pin6)
+		__set_bit(ADP5585_ROW5, adp5585->pin_usage);
+
 	/* Configure the device with reset and unlock events */
 	for (i = 0; i < adp5585->nkeys_unlock; i++) {
 		ret = regmap_write(adp5585->regmap, ADP5589_UNLOCK1 + i,
@@ -542,6 +564,9 @@ static int adp5585_setup(struct adp5585_dev *adp5585)
 				   adp5585->reset1_keys[i] | ADP5585_RESET_EV_PRESS);
 		if (ret)
 			return ret;
+
+		/* Mark that pin as not usable for the INPUT and GPIO devices. */
+		__set_bit(ADP5585_RESET1_OUT, adp5585->pin_usage);
 	}
 
 	for (i = 0; i < adp5585->nkeys_reset2; i++) {
@@ -549,6 +574,8 @@ static int adp5585_setup(struct adp5585_dev *adp5585)
 				   adp5585->reset2_keys[i] | ADP5585_RESET_EV_PRESS);
 		if (ret)
 			return ret;
+
+		__set_bit(adp5585->reset2_out, adp5585->pin_usage);
 	}
 
 	if (adp5585->nkeys_reset1 || adp5585->nkeys_reset2) {
@@ -697,6 +724,10 @@ static int adp5585_i2c_probe(struct i2c_client *i2c)
 		return dev_err_probe(&i2c->dev, -ENODEV,
 				     "Invalid device ID 0x%02x\n", id);
 
+	adp5585->pin_usage = devm_bitmap_zalloc(&i2c->dev, adp5585->n_pins, GFP_KERNEL);
+	if (!adp5585->pin_usage)
+		return -ENOMEM;
+
 	ret = adp5585_parse_fw(adp5585);
 	if (ret)
 		return ret;
diff --git a/include/linux/mfd/adp5585.h b/include/linux/mfd/adp5585.h
index db483ef9693a..41c5d2e1cc7c 100644
--- a/include/linux/mfd/adp5585.h
+++ b/include/linux/mfd/adp5585.h
@@ -126,6 +126,10 @@
 #define ADP5585_GPI_EVENT_END		47
 #define ADP5585_ROW5_KEY_EVENT_START	1
 #define ADP5585_ROW5_KEY_EVENT_END	30
+#define ADP5585_PWM_OUT			3
+#define ADP5585_RESET1_OUT		4
+#define ADP5585_RESET2_OUT		9
+#define ADP5585_ROW5			5
 
 /* ADP5589 */
 #define		ADP5589_MAN_ID_VALUE		0x10
@@ -154,6 +158,7 @@
 #define ADP5589_PWM_ONT_LOW		0x40
 #define ADP5589_PWM_CFG			0x42
 #define ADP5589_POLL_PTIME_CFG		0x48
+#define ADP5589_PIN_CONFIG_A		0x49
 #define ADP5589_PIN_CONFIG_D		0x4C
 #define ADP5589_GENERAL_CFG		0x4d
 #define ADP5589_INT_EN			0x4e
@@ -165,6 +170,7 @@
 #define ADP5589_GPI_EVENT_START		97
 #define ADP5589_GPI_EVENT_END		115
 #define ADP5589_UNLOCK_WILDCARD		127
+#define ADP5589_RESET2_OUT		12
 
 struct regmap;
 
@@ -188,6 +194,7 @@ struct adp5585_regs {
 	unsigned int reset_cfg;
 	unsigned int reset1_event_a;
 	unsigned int reset2_event_a;
+	unsigned int pin_cfg_a;
 };
 
 struct adp5585_dev {
@@ -195,6 +202,9 @@ struct adp5585_dev {
 	struct regmap *regmap;
 	const struct adp5585_regs *regs;
 	struct blocking_notifier_head event_notifier;
+	unsigned long *pin_usage;
+	unsigned int n_pins;
+	unsigned int reset2_out;
 	enum adp5585_variant variant;
 	unsigned int id;
 	bool has_unlock;
-- 
cgit v1.2.3


From 988b28a83b658137e58123f4dafc3a1588e1cb2b Mon Sep 17 00:00:00 2001
From: Nuno Sá <nuno.sa@analog.com>
Date: Tue, 1 Jul 2025 15:32:10 +0100
Subject: gpio: adp5585: support gpi events
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add support for adding GPIs to the event FIFO. This is done by adding
irq_chip support. Like this, one can use the input gpio_keys driver as a
"frontend" device and input handler.

As part of this change, we now implement .request() and .free() as we can't
blindly consume all available pins as GPIOs (example: some pins can be
used for forming a keymap matrix).

Also note that the number of pins can now be obtained from the parent,
top level device. Hence the 'max_gpio' variable can be removed.

Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Acked-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Signed-off-by: Nuno Sá <nuno.sa@analog.com>
Link: https://lore.kernel.org/r/20250701-dev-adp5589-fw-v7-15-b1fcfe9e9826@analog.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/gpio/Kconfig        |   1 +
 drivers/gpio/gpio-adp5585.c | 221 +++++++++++++++++++++++++++++++++++++++++++-
 include/linux/mfd/adp5585.h |   2 +
 3 files changed, 220 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig
index 44f922e10db2..b552401e3f73 100644
--- a/drivers/gpio/Kconfig
+++ b/drivers/gpio/Kconfig
@@ -1263,6 +1263,7 @@ config GPIO_ADP5520
 config GPIO_ADP5585
 	tristate "GPIO Support for ADP5585"
 	depends on MFD_ADP5585
+	select GPIOLIB_IRQCHIP
 	help
 	  This option enables support for the GPIO function found in the Analog
 	  Devices ADP5585.
diff --git a/drivers/gpio/gpio-adp5585.c b/drivers/gpio/gpio-adp5585.c
index cdf107742579..b2c8836c5f84 100644
--- a/drivers/gpio/gpio-adp5585.c
+++ b/drivers/gpio/gpio-adp5585.c
@@ -7,10 +7,15 @@
  * Copyright 2025 Analog Devices, Inc.
  */
 
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+#include <linux/container_of.h>
 #include <linux/device.h>
 #include <linux/gpio/driver.h>
 #include <linux/mfd/adp5585.h>
 #include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
 #include <linux/platform_device.h>
 #include <linux/regmap.h>
 #include <linux/types.h>
@@ -36,20 +41,29 @@
 struct adp5585_gpio_chip {
 	int (*bank)(unsigned int off);
 	int (*bit)(unsigned int off);
-	unsigned int max_gpio;
 	unsigned int debounce_dis_a;
 	unsigned int rpull_cfg_a;
 	unsigned int gpo_data_a;
 	unsigned int gpo_out_a;
 	unsigned int gpio_dir_a;
 	unsigned int gpi_stat_a;
+	unsigned int gpi_int_lvl_a;
+	unsigned int gpi_ev_a;
+	unsigned int gpi_ev_min;
+	unsigned int gpi_ev_max;
 	bool has_bias_hole;
 };
 
 struct adp5585_gpio_dev {
 	struct gpio_chip gpio_chip;
+	struct notifier_block nb;
 	const struct adp5585_gpio_chip *info;
 	struct regmap *regmap;
+	unsigned long irq_mask;
+	unsigned long irq_en;
+	unsigned long irq_active_high;
+	/* used for irqchip bus locking */
+	struct mutex bus_lock;
 };
 
 static int adp5585_gpio_bank(unsigned int off)
@@ -224,12 +238,175 @@ static int adp5585_gpio_set_config(struct gpio_chip *chip, unsigned int off,
 	};
 }
 
+static int adp5585_gpio_request(struct gpio_chip *chip, unsigned int off)
+{
+	struct adp5585_gpio_dev *adp5585_gpio = gpiochip_get_data(chip);
+	const struct adp5585_gpio_chip *info = adp5585_gpio->info;
+	struct device *dev = chip->parent;
+	struct adp5585_dev *adp5585 = dev_get_drvdata(dev->parent);
+	const struct adp5585_regs *regs = adp5585->regs;
+	int ret;
+
+	ret = test_and_set_bit(off, adp5585->pin_usage);
+	if (ret)
+		return -EBUSY;
+
+	/* make sure it's configured for GPIO */
+	return regmap_clear_bits(adp5585_gpio->regmap,
+				 regs->pin_cfg_a + info->bank(off),
+				 info->bit(off));
+}
+
+static void adp5585_gpio_free(struct gpio_chip *chip, unsigned int off)
+{
+	struct device *dev = chip->parent;
+	struct adp5585_dev *adp5585 = dev_get_drvdata(dev->parent);
+
+	clear_bit(off, adp5585->pin_usage);
+}
+
+static int adp5585_gpio_key_event(struct notifier_block *nb, unsigned long key,
+				  void *data)
+{
+	struct adp5585_gpio_dev *adp5585_gpio = container_of(nb, struct adp5585_gpio_dev, nb);
+	struct device *dev = adp5585_gpio->gpio_chip.parent;
+	unsigned long key_press = (unsigned long)data;
+	unsigned int irq, irq_type;
+	struct irq_data *irqd;
+	bool active_high;
+	unsigned int off;
+
+	/* make sure the event is for me */
+	if (key < adp5585_gpio->info->gpi_ev_min || key > adp5585_gpio->info->gpi_ev_max)
+		return NOTIFY_DONE;
+
+	off = key - adp5585_gpio->info->gpi_ev_min;
+	active_high = test_bit(off, &adp5585_gpio->irq_active_high);
+
+	irq = irq_find_mapping(adp5585_gpio->gpio_chip.irq.domain, off);
+	if (!irq)
+		return NOTIFY_BAD;
+
+	irqd = irq_get_irq_data(irq);
+	if (!irqd) {
+		dev_err(dev, "Could not get irq(%u) data\n", irq);
+		return NOTIFY_BAD;
+	}
+
+	dev_dbg_ratelimited(dev, "gpio-keys event(%u) press=%lu, a_high=%u\n",
+			    off, key_press, active_high);
+
+	if (!active_high)
+		key_press = !key_press;
+
+	irq_type = irqd_get_trigger_type(irqd);
+
+	if ((irq_type & IRQ_TYPE_EDGE_RISING && key_press) ||
+	    (irq_type & IRQ_TYPE_EDGE_FALLING && !key_press))
+		handle_nested_irq(irq);
+
+	return NOTIFY_STOP;
+}
+
+static void adp5585_irq_bus_lock(struct irq_data *d)
+{
+	struct gpio_chip *gc = irq_data_get_irq_chip_data(d);
+	struct adp5585_gpio_dev *adp5585_gpio = gpiochip_get_data(gc);
+
+	mutex_lock(&adp5585_gpio->bus_lock);
+}
+
+static void adp5585_irq_bus_sync_unlock(struct irq_data *d)
+{
+	struct gpio_chip *chip = irq_data_get_irq_chip_data(d);
+	struct adp5585_gpio_dev *adp5585_gpio = gpiochip_get_data(chip);
+	const struct adp5585_gpio_chip *info = adp5585_gpio->info;
+	irq_hw_number_t hwirq = irqd_to_hwirq(d);
+	bool active_high = test_bit(hwirq, &adp5585_gpio->irq_active_high);
+	bool enabled = test_bit(hwirq, &adp5585_gpio->irq_en);
+	bool masked = test_bit(hwirq, &adp5585_gpio->irq_mask);
+	unsigned int bank = adp5585_gpio->info->bank(hwirq);
+	unsigned int bit = adp5585_gpio->info->bit(hwirq);
+
+	if (masked && !enabled)
+		goto out_unlock;
+	if (!masked && enabled)
+		goto out_unlock;
+
+	regmap_update_bits(adp5585_gpio->regmap, info->gpi_int_lvl_a + bank, bit,
+			   active_high ? bit : 0);
+	regmap_update_bits(adp5585_gpio->regmap, info->gpi_ev_a + bank, bit,
+			   masked ? 0 : bit);
+	assign_bit(hwirq, &adp5585_gpio->irq_en, !masked);
+
+out_unlock:
+	mutex_unlock(&adp5585_gpio->bus_lock);
+}
+
+static void adp5585_irq_mask(struct irq_data *d)
+{
+	struct gpio_chip *gc = irq_data_get_irq_chip_data(d);
+	struct adp5585_gpio_dev *adp5585_gpio = gpiochip_get_data(gc);
+	irq_hw_number_t hwirq = irqd_to_hwirq(d);
+
+	__set_bit(hwirq, &adp5585_gpio->irq_mask);
+	gpiochip_disable_irq(gc, hwirq);
+}
+
+static void adp5585_irq_unmask(struct irq_data *d)
+{
+	struct gpio_chip *gc = irq_data_get_irq_chip_data(d);
+	struct adp5585_gpio_dev *adp5585_gpio = gpiochip_get_data(gc);
+	irq_hw_number_t hwirq = irqd_to_hwirq(d);
+
+	gpiochip_enable_irq(gc, hwirq);
+	__clear_bit(hwirq, &adp5585_gpio->irq_mask);
+}
+
+static int adp5585_irq_set_type(struct irq_data *d, unsigned int type)
+{
+	struct gpio_chip *gc = irq_data_get_irq_chip_data(d);
+	struct adp5585_gpio_dev *adp5585_gpio = gpiochip_get_data(gc);
+	irq_hw_number_t hwirq = irqd_to_hwirq(d);
+
+	if (!(type & IRQ_TYPE_EDGE_BOTH))
+		return -EINVAL;
+
+	assign_bit(hwirq, &adp5585_gpio->irq_active_high,
+		   type == IRQ_TYPE_EDGE_RISING);
+
+	irq_set_handler_locked(d, handle_edge_irq);
+	return 0;
+}
+
+static const struct irq_chip adp5585_irq_chip = {
+	.name = "adp5585",
+	.irq_mask = adp5585_irq_mask,
+	.irq_unmask = adp5585_irq_unmask,
+	.irq_bus_lock = adp5585_irq_bus_lock,
+	.irq_bus_sync_unlock = adp5585_irq_bus_sync_unlock,
+	.irq_set_type = adp5585_irq_set_type,
+	.flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_IMMUTABLE,
+	GPIOCHIP_IRQ_RESOURCE_HELPERS,
+};
+
+static void adp5585_gpio_unreg_notifier(void *data)
+{
+	struct adp5585_gpio_dev *adp5585_gpio = data;
+	struct device *dev = adp5585_gpio->gpio_chip.parent;
+	struct adp5585_dev *adp5585 = dev_get_drvdata(dev->parent);
+
+	blocking_notifier_chain_unregister(&adp5585->event_notifier,
+					   &adp5585_gpio->nb);
+}
+
 static int adp5585_gpio_probe(struct platform_device *pdev)
 {
 	struct adp5585_dev *adp5585 = dev_get_drvdata(pdev->dev.parent);
 	const struct platform_device_id *id = platform_get_device_id(pdev);
 	struct adp5585_gpio_dev *adp5585_gpio;
 	struct device *dev = &pdev->dev;
+	struct gpio_irq_chip *girq;
 	struct gpio_chip *gc;
 	int ret;
 
@@ -253,13 +430,43 @@ static int adp5585_gpio_probe(struct platform_device *pdev)
 	gc->get = adp5585_gpio_get_value;
 	gc->set_rv = adp5585_gpio_set_value;
 	gc->set_config = adp5585_gpio_set_config;
+	gc->request = adp5585_gpio_request;
+	gc->free = adp5585_gpio_free;
 	gc->can_sleep = true;
 
 	gc->base = -1;
-	gc->ngpio = adp5585_gpio->info->max_gpio;
+	gc->ngpio = adp5585->n_pins;
 	gc->label = pdev->name;
 	gc->owner = THIS_MODULE;
 
+	if (device_property_present(dev->parent, "interrupt-controller")) {
+		if (!adp5585->irq)
+			return dev_err_probe(dev, -EINVAL,
+					     "Unable to serve as interrupt controller without IRQ\n");
+
+		girq = &adp5585_gpio->gpio_chip.irq;
+		gpio_irq_chip_set_chip(girq, &adp5585_irq_chip);
+		girq->handler = handle_bad_irq;
+		girq->threaded = true;
+
+		adp5585_gpio->nb.notifier_call = adp5585_gpio_key_event;
+		ret = blocking_notifier_chain_register(&adp5585->event_notifier,
+						       &adp5585_gpio->nb);
+		if (ret)
+			return ret;
+
+		ret = devm_add_action_or_reset(dev, adp5585_gpio_unreg_notifier,
+					       adp5585_gpio);
+		if (ret)
+			return ret;
+	}
+
+	/* everything masked by default */
+	adp5585_gpio->irq_mask = ~0UL;
+
+	ret = devm_mutex_init(dev, &adp5585_gpio->bus_lock);
+	if (ret)
+		return ret;
 	ret = devm_gpiochip_add_data(dev, &adp5585_gpio->gpio_chip,
 				     adp5585_gpio);
 	if (ret)
@@ -277,8 +484,11 @@ static const struct adp5585_gpio_chip adp5585_gpio_chip_info = {
 	.gpo_out_a = ADP5585_GPO_OUT_MODE_A,
 	.gpio_dir_a = ADP5585_GPIO_DIRECTION_A,
 	.gpi_stat_a = ADP5585_GPI_STATUS_A,
-	.max_gpio = ADP5585_PIN_MAX,
 	.has_bias_hole = true,
+	.gpi_ev_min = ADP5585_GPI_EVENT_START,
+	.gpi_ev_max = ADP5585_GPI_EVENT_END,
+	.gpi_int_lvl_a = ADP5585_GPI_INT_LEVEL_A,
+	.gpi_ev_a = ADP5585_GPI_EVENT_EN_A,
 };
 
 static const struct adp5585_gpio_chip adp5589_gpio_chip_info = {
@@ -290,7 +500,10 @@ static const struct adp5585_gpio_chip adp5589_gpio_chip_info = {
 	.gpo_out_a = ADP5589_GPO_OUT_MODE_A,
 	.gpio_dir_a = ADP5589_GPIO_DIRECTION_A,
 	.gpi_stat_a = ADP5589_GPI_STATUS_A,
-	.max_gpio = ADP5589_PIN_MAX,
+	.gpi_ev_min = ADP5589_GPI_EVENT_START,
+	.gpi_ev_max = ADP5589_GPI_EVENT_END,
+	.gpi_int_lvl_a = ADP5589_GPI_INT_LEVEL_A,
+	.gpi_ev_a = ADP5589_GPI_EVENT_EN_A,
 };
 
 static const struct platform_device_id adp5585_gpio_id_table[] = {
diff --git a/include/linux/mfd/adp5585.h b/include/linux/mfd/adp5585.h
index 41c5d2e1cc7c..5237da6b4a9f 100644
--- a/include/linux/mfd/adp5585.h
+++ b/include/linux/mfd/adp5585.h
@@ -136,6 +136,8 @@
 #define ADP5589_GPI_STATUS_A		0x16
 #define ADP5589_GPI_STATUS_C		0x18
 #define ADP5589_RPULL_CONFIG_A		0x19
+#define ADP5589_GPI_INT_LEVEL_A		0x1e
+#define ADP5589_GPI_EVENT_EN_A		0x21
 #define ADP5589_DEBOUNCE_DIS_A		0x27
 #define ADP5589_GPO_DATA_OUT_A		0x2a
 #define ADP5589_GPO_OUT_MODE_A		0x2d
-- 
cgit v1.2.3


From 8af39ec5cf2be522c8eb43a3d8005ed59e4daaee Mon Sep 17 00:00:00 2001
From: Matthew Brost <matthew.brost@intel.com>
Date: Fri, 13 Jun 2025 14:20:13 -0700
Subject: drm/sched: Increment job count before swapping tail spsc queue
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A small race exists between spsc_queue_push and the run-job worker, in
which spsc_queue_push may return not-first while the run-job worker has
already idled due to the job count being zero. If this race occurs, job
scheduling stops, leading to hangs while waiting on the job’s DMA
fences.

Seal this race by incrementing the job count before appending to the
SPSC queue.

This race was observed on a drm-tip 6.16-rc1 build with the Xe driver in
an SVM test case.

Fixes: 1b1f42d8fde4 ("drm: move amd_gpu_scheduler into common location")
Fixes: 27105db6c63a ("drm/amdgpu: Add SPSC queue to scheduler.")
Cc: stable@vger.kernel.org
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
Link: https://lore.kernel.org/r/20250613212013.719312-1-matthew.brost@intel.com
---
 include/drm/spsc_queue.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/drm/spsc_queue.h b/include/drm/spsc_queue.h
index 125f096c88cb..ee9df8cc67b7 100644
--- a/include/drm/spsc_queue.h
+++ b/include/drm/spsc_queue.h
@@ -70,9 +70,11 @@ static inline bool spsc_queue_push(struct spsc_queue *queue, struct spsc_node *n
 
 	preempt_disable();
 
+	atomic_inc(&queue->job_count);
+	smp_mb__after_atomic();
+
 	tail = (struct spsc_node **)atomic_long_xchg(&queue->tail, (long)&node->next);
 	WRITE_ONCE(*tail, node);
-	atomic_inc(&queue->job_count);
 
 	/*
 	 * In case of first element verify new node will be visible to the consumer
-- 
cgit v1.2.3


From e3d6e1b9a34c745b635f122ac471a198867cd0ec Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Tue, 1 Apr 2025 00:36:02 +0900
Subject: tracing: tprobe-events: Support multiple tprobes on the same
 tracepoint

Allow users to set multiple tracepoint-probe events on the same
tracepoint. After the last tprobe-event is removed, the tracepoint
callback is unregistered.

Link: https://lore.kernel.org/all/174343536245.843280.6548776576601537671.stgit@devnote2/

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 include/linux/module.h      |   4 +
 kernel/trace/trace_fprobe.c | 251 +++++++++++++++++++++++++++++++++++---------
 2 files changed, 205 insertions(+), 50 deletions(-)

(limited to 'include')

diff --git a/include/linux/module.h b/include/linux/module.h
index 5faa1fb1f4b4..2cb5be5cc7e6 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -14,6 +14,7 @@
 #include <linux/buildid.h>
 #include <linux/compiler.h>
 #include <linux/cache.h>
+#include <linux/cleanup.h>
 #include <linux/kmod.h>
 #include <linux/init.h>
 #include <linux/elf.h>
@@ -1019,4 +1020,7 @@ static inline unsigned long find_kallsyms_symbol_value(struct module *mod,
 
 #endif  /* CONFIG_MODULES && CONFIG_KALLSYMS */
 
+/* Define __free(module_put) macro for struct module *. */
+DEFINE_FREE(module_put, struct module *, if (_T) module_put(_T))
+
 #endif /* _LINUX_MODULE_H */
diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c
index 017d6ebfdf8b..4a9ce7b0e084 100644
--- a/kernel/trace/trace_fprobe.c
+++ b/kernel/trace/trace_fprobe.c
@@ -21,7 +21,6 @@
 #define FPROBE_EVENT_SYSTEM "fprobes"
 #define TRACEPOINT_EVENT_SYSTEM "tracepoints"
 #define RETHOOK_MAXACTIVE_MAX 4096
-#define TRACEPOINT_STUB ERR_PTR(-ENOENT)
 
 static int trace_fprobe_create(const char *raw_command);
 static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev);
@@ -38,6 +37,89 @@ static struct dyn_event_operations trace_fprobe_ops = {
 	.match = trace_fprobe_match,
 };
 
+struct tracepoint_user {
+	struct tracepoint	*tpoint;
+	unsigned int		refcount;
+};
+
+static bool tracepoint_user_is_registered(struct tracepoint_user *tuser)
+{
+	return tuser && tuser->tpoint;
+}
+
+static int tracepoint_user_register(struct tracepoint_user *tuser)
+{
+	struct tracepoint *tpoint = tuser->tpoint;
+
+	if (!tpoint)
+		return 0;
+
+	return tracepoint_probe_register_prio_may_exist(tpoint,
+					tpoint->probestub, NULL, 0);
+}
+
+static void tracepoint_user_unregister(struct tracepoint_user *tuser)
+{
+	if (!tuser->tpoint)
+		return;
+
+	WARN_ON_ONCE(tracepoint_probe_unregister(tuser->tpoint, tuser->tpoint->probestub, NULL));
+	tuser->tpoint = NULL;
+}
+
+static unsigned long tracepoint_user_ip(struct tracepoint_user *tuser)
+{
+	if (!tuser->tpoint)
+		return 0UL;
+
+	return (unsigned long)tuser->tpoint->probestub;
+}
+
+static bool tracepoint_user_within_module(struct tracepoint_user *tuser,
+					  struct module *mod)
+{
+	return within_module(tracepoint_user_ip(tuser), mod);
+}
+
+static struct tracepoint_user *tracepoint_user_allocate(struct tracepoint *tpoint)
+{
+	struct tracepoint_user *tuser;
+
+	tuser = kzalloc(sizeof(*tuser), GFP_KERNEL);
+	if (!tuser)
+		return NULL;
+	tuser->tpoint = tpoint;
+	tuser->refcount = 1;
+
+	return tuser;
+}
+
+/* These must be called with event_mutex */
+static void tracepoint_user_get(struct tracepoint_user *tuser)
+{
+	tuser->refcount++;
+}
+
+static void tracepoint_user_put(struct tracepoint_user *tuser)
+{
+	if (--tuser->refcount > 0)
+		return;
+
+	if (tracepoint_user_is_registered(tuser))
+		tracepoint_user_unregister(tuser);
+	kfree(tuser);
+}
+
+static const char *tracepoint_user_lookup(struct tracepoint_user *tuser, char *buf)
+{
+	struct tracepoint *tpoint = tuser->tpoint;
+
+	if (!tpoint)
+		return NULL;
+
+	return kallsyms_lookup((unsigned long)tpoint->probestub, NULL, NULL, NULL, buf);
+}
+
 /*
  * Fprobe event core functions
  */
@@ -45,7 +127,7 @@ struct trace_fprobe {
 	struct dyn_event	devent;
 	struct fprobe		fp;
 	const char		*symbol;
-	struct tracepoint	*tpoint;
+	struct tracepoint_user	*tuser;
 	struct trace_probe	tp;
 };
 
@@ -75,7 +157,7 @@ static bool trace_fprobe_is_return(struct trace_fprobe *tf)
 
 static bool trace_fprobe_is_tracepoint(struct trace_fprobe *tf)
 {
-	return tf->tpoint != NULL;
+	return tf->tuser != NULL;
 }
 
 static const char *trace_fprobe_symbol(struct trace_fprobe *tf)
@@ -125,6 +207,56 @@ static bool trace_fprobe_is_registered(struct trace_fprobe *tf)
 	return fprobe_is_registered(&tf->fp);
 }
 
+static struct tracepoint *find_tracepoint(const char *tp_name,
+	struct module **tp_mod);
+
+/*
+ * Get the tracepoint_user if it exists, or allocate a new one. If the
+ * tracepoint is on a module, get that module's refcount.
+ */
+static struct tracepoint_user *
+trace_fprobe_get_tracepoint_user(const char *name, struct module **pmod)
+{
+	struct tracepoint_user *tuser __free(kfree) = NULL;
+	struct tracepoint *tpoint;
+	struct trace_fprobe *tf;
+	struct dyn_event *dpos;
+	struct module *mod __free(module_put) = NULL;
+	int ret;
+
+	/*
+	 * Find appropriate tracepoint and locking module.
+	 * Note: tpoint can be NULL if it is unloaded (or failed to get module.)
+	 */
+	tpoint = find_tracepoint(name, &mod);
+
+	/* Search existing tracepoint_user */
+	for_each_trace_fprobe(tf, dpos) {
+		if (!trace_fprobe_is_tracepoint(tf))
+			continue;
+		if (!strcmp(tf->symbol, name)) {
+			tracepoint_user_get(tf->tuser);
+			*pmod = no_free_ptr(mod);
+			return tf->tuser;
+		}
+	}
+
+	/* Not found, allocate and register new tracepoint_user. */
+	tuser = tracepoint_user_allocate(tpoint);
+	if (!tuser)
+		return NULL;
+
+	if (tpoint) {
+		/* If the tracepoint is not loaded, tpoint can be NULL. */
+		ret = tracepoint_user_register(tuser);
+		if (ret)
+			return ERR_PTR(ret);
+	}
+
+	*pmod = no_free_ptr(mod);
+	return_ptr(tuser);
+}
+
 /*
  * Note that we don't verify the fetch_insn code, since it does not come
  * from user space.
@@ -410,6 +542,8 @@ static void free_trace_fprobe(struct trace_fprobe *tf)
 {
 	if (tf) {
 		trace_probe_cleanup(&tf->tp);
+		if (tf->tuser)
+			tracepoint_user_put(tf->tuser);
 		kfree(tf->symbol);
 		kfree(tf);
 	}
@@ -424,7 +558,7 @@ DEFINE_FREE(free_trace_fprobe, struct trace_fprobe *, if (!IS_ERR_OR_NULL(_T)) f
 static struct trace_fprobe *alloc_trace_fprobe(const char *group,
 					       const char *event,
 					       const char *symbol,
-					       struct tracepoint *tpoint,
+					       struct tracepoint_user *tuser,
 					       int nargs, bool is_return)
 {
 	struct trace_fprobe *tf __free(free_trace_fprobe) = NULL;
@@ -443,7 +577,7 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group,
 	else
 		tf->fp.entry_handler = fentry_dispatcher;
 
-	tf->tpoint = tpoint;
+	tf->tuser = tuser;
 
 	ret = trace_probe_init(&tf->tp, event, group, false, nargs);
 	if (ret < 0)
@@ -709,19 +843,11 @@ static int unregister_fprobe_event(struct trace_fprobe *tf)
 
 static int __regsiter_tracepoint_fprobe(struct trace_fprobe *tf)
 {
-	struct tracepoint *tpoint = tf->tpoint;
-	unsigned long ip = (unsigned long)tpoint->probestub;
-	int ret;
+	unsigned long ip = tracepoint_user_ip(tf->tuser);
+
+	if (!ip)
+		return -ENOENT;
 
-	/*
-	 * Here, we do 2 steps to enable fprobe on a tracepoint.
-	 * At first, put __probestub_##TP function on the tracepoint
-	 * and put a fprobe on the stub function.
-	 */
-	ret = tracepoint_probe_register_prio_may_exist(tpoint,
-				tpoint->probestub, NULL, 0);
-	if (ret < 0)
-		return ret;
 	return register_fprobe_ips(&tf->fp, &ip, 1);
 }
 
@@ -753,7 +879,7 @@ static int __register_trace_fprobe(struct trace_fprobe *tf)
 	if (trace_fprobe_is_tracepoint(tf)) {
 
 		/* This tracepoint is not loaded yet */
-		if (tf->tpoint == TRACEPOINT_STUB)
+		if (!tracepoint_user_is_registered(tf->tuser))
 			return 0;
 
 		return __regsiter_tracepoint_fprobe(tf);
@@ -770,9 +896,8 @@ static void __unregister_trace_fprobe(struct trace_fprobe *tf)
 		unregister_fprobe(&tf->fp);
 		memset(&tf->fp, 0, sizeof(tf->fp));
 		if (trace_fprobe_is_tracepoint(tf)) {
-			tracepoint_probe_unregister(tf->tpoint,
-					tf->tpoint->probestub, NULL);
-			tf->tpoint = NULL;
+			tracepoint_user_put(tf->tuser);
+			tf->tuser = NULL;
 		}
 	}
 }
@@ -988,7 +1113,7 @@ static int __tracepoint_probe_module_cb(struct notifier_block *self,
 					unsigned long val, void *data)
 {
 	struct tp_module *tp_mod = data;
-	struct tracepoint *tpoint;
+	struct tracepoint_user *tuser;
 	struct trace_fprobe *tf;
 	struct dyn_event *pos;
 
@@ -999,21 +1124,46 @@ static int __tracepoint_probe_module_cb(struct notifier_block *self,
 	for_each_trace_fprobe(tf, pos) {
 		if (!trace_fprobe_is_tracepoint(tf))
 			continue;
-		if (val == MODULE_STATE_COMING && tf->tpoint == TRACEPOINT_STUB) {
+
+		if (val == MODULE_STATE_COMING) {
+			/*
+			 * If any tracepoint used by tprobe is in the module,
+			 * register the stub.
+			 */
+			struct tracepoint *tpoint;
+
 			tpoint = find_tracepoint_in_module(tp_mod->mod, tf->symbol);
-			if (tpoint) {
-				tf->tpoint = tpoint;
-				if (!WARN_ON_ONCE(__regsiter_tracepoint_fprobe(tf)) &&
-				    trace_probe_is_enabled(&tf->tp))
-					reenable_trace_fprobe(tf);
+			/* This is not a tracepoint in this module. Skip it. */
+			if (!tpoint)
+				continue;
+
+			tuser = tf->tuser;
+			/* If the tracepoint is not registered yet, register it. */
+			if (!tracepoint_user_is_registered(tuser)) {
+				tuser->tpoint = tpoint;
+				if (WARN_ON_ONCE(tracepoint_user_register(tuser)))
+					continue;
 			}
-		} else if (val == MODULE_STATE_GOING &&
-			   tf->tpoint != TRACEPOINT_STUB &&
-			   within_module((unsigned long)tf->tpoint->probestub, tp_mod->mod)) {
-			unregister_fprobe(&tf->fp);
-			tracepoint_probe_unregister(tf->tpoint,
-				tf->tpoint->probestub, NULL);
-			tf->tpoint = TRACEPOINT_STUB;
+
+			/* Finally enable fprobe on this module. */
+			if (!WARN_ON_ONCE(__regsiter_tracepoint_fprobe(tf)) &&
+			    trace_probe_is_enabled(&tf->tp))
+				reenable_trace_fprobe(tf);
+		} else if (val == MODULE_STATE_GOING) {
+			tuser = tf->tuser;
+			/* Unregister all tracepoint_user in this module. */
+			if (tracepoint_user_is_registered(tuser) &&
+			    tracepoint_user_within_module(tuser, tp_mod->mod))
+				tracepoint_user_unregister(tuser);
+
+			/*
+			 * Here we need to handle shared tracepoint_user case.
+			 * Such tuser is unregistered, but trace_fprobe itself
+			 * is registered. (Note this only handles tprobes.)
+			 */
+			if (!tracepoint_user_is_registered(tuser) &&
+			    trace_fprobe_is_registered(tf))
+				unregister_fprobe(&tf->fp);
 		}
 	}
 	mutex_unlock(&event_mutex);
@@ -1082,7 +1232,9 @@ static int parse_symbol_and_return(int argc, const char *argv[],
 	return 0;
 }
 
-DEFINE_FREE(module_put, struct module *, if (_T) module_put(_T))
+DEFINE_FREE(tuser_put, struct tracepoint_user *,
+	if (!IS_ERR_OR_NULL(_T))
+		tracepoint_user_put(_T))
 
 static int trace_fprobe_create_internal(int argc, const char *argv[],
 					struct traceprobe_parse_context *ctx)
@@ -1112,6 +1264,8 @@ static int trace_fprobe_create_internal(int argc, const char *argv[],
 	 *  FETCHARG:TYPE : use TYPE instead of unsigned long.
 	 */
 	struct trace_fprobe *tf __free(free_trace_fprobe) = NULL;
+	struct tracepoint_user *tuser __free(tuser_put) = NULL;
+	struct module *mod __free(module_put) = NULL;
 	int i, new_argc = 0, ret = 0;
 	bool is_return = false;
 	char *symbol __free(kfree) = NULL;
@@ -1123,8 +1277,6 @@ static int trace_fprobe_create_internal(int argc, const char *argv[],
 	char abuf[MAX_BTF_ARGS_LEN];
 	char *dbuf __free(kfree) = NULL;
 	bool is_tracepoint = false;
-	struct module *tp_mod __free(module_put) = NULL;
-	struct tracepoint *tpoint = NULL;
 
 	if ((argv[0][0] != 'f' && argv[0][0] != 't') || argc < 2)
 		return -ECANCELED;
@@ -1177,20 +1329,18 @@ static int trace_fprobe_create_internal(int argc, const char *argv[],
 
 	if (is_tracepoint) {
 		ctx->flags |= TPARG_FL_TPOINT;
-		tpoint = find_tracepoint(symbol, &tp_mod);
-		if (tpoint) {
-			ctx->funcname = kallsyms_lookup(
-				(unsigned long)tpoint->probestub,
-				NULL, NULL, NULL, sbuf);
-		} else if (IS_ENABLED(CONFIG_MODULES)) {
-				/* This *may* be loaded afterwards */
-				tpoint = TRACEPOINT_STUB;
-				ctx->funcname = symbol;
-		} else {
+		tuser = trace_fprobe_get_tracepoint_user(symbol, &mod);
+		if (!tuser)
+			return -ENOMEM;
+		if (IS_ERR(tuser)) {
 			trace_probe_log_set_index(1);
 			trace_probe_log_err(0, NO_TRACEPOINT);
-			return -EINVAL;
+			return PTR_ERR(tuser);
 		}
+		ctx->funcname = tracepoint_user_lookup(tuser, sbuf);
+		/* If tracepoint is not loaded yet, use symbol name as funcname. */
+		if (!ctx->funcname)
+			ctx->funcname = symbol;
 	} else
 		ctx->funcname = symbol;
 
@@ -1214,13 +1364,14 @@ static int trace_fprobe_create_internal(int argc, const char *argv[],
 		return ret;
 
 	/* setup a probe */
-	tf = alloc_trace_fprobe(group, event, symbol, tpoint, argc, is_return);
+	tf = alloc_trace_fprobe(group, event, symbol, tuser, argc, is_return);
 	if (IS_ERR(tf)) {
 		ret = PTR_ERR(tf);
 		/* This must return -ENOMEM, else there is a bug */
 		WARN_ON_ONCE(ret != -ENOMEM);
 		return ret;
 	}
+	tuser = NULL; /* Move tuser to tf. */
 
 	/* parse arguments */
 	for (i = 0; i < argc; i++) {
-- 
cgit v1.2.3


From 2db832ec9090d3b5f726f49ad4d0322d6b68a490 Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Tue, 1 Apr 2025 00:36:11 +0900
Subject: tracing: fprobe-events: Register fprobe-events only when it is
 enabled

Currently fprobe events are registered when they are defined, so they add
some overhead even while they are disabled. Change this to register the
fprobe only when the event is enabled.

Link: https://lore.kernel.org/all/174343537128.843280.16131300052837035043.stgit@devnote2/

Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 include/linux/fprobe.h      |   5 +
 kernel/trace/fprobe.c       |   5 +
 kernel/trace/trace_fprobe.c | 237 ++++++++++++++++++++++----------------------
 3 files changed, 126 insertions(+), 121 deletions(-)

(limited to 'include')

diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h
index 702099f08929..7964db96e41a 100644
--- a/include/linux/fprobe.h
+++ b/include/linux/fprobe.h
@@ -94,6 +94,7 @@ int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num);
 int register_fprobe_syms(struct fprobe *fp, const char **syms, int num);
 int unregister_fprobe(struct fprobe *fp);
 bool fprobe_is_registered(struct fprobe *fp);
+int fprobe_count_ips_from_filter(const char *filter, const char *notfilter);
 #else
 static inline int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter)
 {
@@ -115,6 +116,10 @@ static inline bool fprobe_is_registered(struct fprobe *fp)
 {
 	return false;
 }
+static inline int fprobe_count_ips_from_filter(const char *filter, const char *notfilter)
+{
+	return -EOPNOTSUPP;
+}
 #endif
 
 /**
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index ba7ff14f5339..b78fce0982c7 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -648,6 +648,11 @@ static int fprobe_init(struct fprobe *fp, unsigned long *addrs, int num)
 
 #define FPROBE_IPS_MAX	INT_MAX
 
+int fprobe_count_ips_from_filter(const char *filter, const char *notfilter)
+{
+	return get_ips_from_filter(filter, notfilter, NULL, NULL, FPROBE_IPS_MAX);
+}
+
 /**
  * register_fprobe() - Register fprobe to ftrace by pattern.
  * @fp: A fprobe data structure to be registered.
diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c
index 4a9ce7b0e084..79ffc1b2d519 100644
--- a/kernel/trace/trace_fprobe.c
+++ b/kernel/trace/trace_fprobe.c
@@ -600,98 +600,6 @@ static struct trace_fprobe *find_trace_fprobe(const char *event,
 	return NULL;
 }
 
-static inline int __enable_trace_fprobe(struct trace_fprobe *tf)
-{
-	if (trace_fprobe_is_registered(tf))
-		enable_fprobe(&tf->fp);
-
-	return 0;
-}
-
-static void __disable_trace_fprobe(struct trace_probe *tp)
-{
-	struct trace_fprobe *tf;
-
-	list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) {
-		if (!trace_fprobe_is_registered(tf))
-			continue;
-		disable_fprobe(&tf->fp);
-	}
-}
-
-/*
- * Enable trace_probe
- * if the file is NULL, enable "perf" handler, or enable "trace" handler.
- */
-static int enable_trace_fprobe(struct trace_event_call *call,
-			       struct trace_event_file *file)
-{
-	struct trace_probe *tp;
-	struct trace_fprobe *tf;
-	bool enabled;
-	int ret = 0;
-
-	tp = trace_probe_primary_from_call(call);
-	if (WARN_ON_ONCE(!tp))
-		return -ENODEV;
-	enabled = trace_probe_is_enabled(tp);
-
-	/* This also changes "enabled" state */
-	if (file) {
-		ret = trace_probe_add_file(tp, file);
-		if (ret)
-			return ret;
-	} else
-		trace_probe_set_flag(tp, TP_FLAG_PROFILE);
-
-	if (!enabled) {
-		list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) {
-			/* TODO: check the fprobe is gone */
-			__enable_trace_fprobe(tf);
-		}
-	}
-
-	return 0;
-}
-
-/*
- * Disable trace_probe
- * if the file is NULL, disable "perf" handler, or disable "trace" handler.
- */
-static int disable_trace_fprobe(struct trace_event_call *call,
-				struct trace_event_file *file)
-{
-	struct trace_probe *tp;
-
-	tp = trace_probe_primary_from_call(call);
-	if (WARN_ON_ONCE(!tp))
-		return -ENODEV;
-
-	if (file) {
-		if (!trace_probe_get_file_link(tp, file))
-			return -ENOENT;
-		if (!trace_probe_has_single_file(tp))
-			goto out;
-		trace_probe_clear_flag(tp, TP_FLAG_TRACE);
-	} else
-		trace_probe_clear_flag(tp, TP_FLAG_PROFILE);
-
-	if (!trace_probe_is_enabled(tp))
-		__disable_trace_fprobe(tp);
-
- out:
-	if (file)
-		/*
-		 * Synchronization is done in below function. For perf event,
-		 * file == NULL and perf_trace_event_unreg() calls
-		 * tracepoint_synchronize_unregister() to ensure synchronize
-		 * event. We don't need to care about it.
-		 */
-		trace_probe_remove_file(tp, file);
-
-	return 0;
-}
-
 /* Event entry printers */
 static enum print_line_t
 print_fentry_event(struct trace_iterator *iter, int flags,
@@ -851,6 +759,29 @@ static int __regsiter_tracepoint_fprobe(struct trace_fprobe *tf)
 	return register_fprobe_ips(&tf->fp, &ip, 1);
 }
 
+/* Returns an error if the target function is not available, or 0 */
+static int trace_fprobe_verify_target(struct trace_fprobe *tf)
+{
+	int ret;
+
+	if (trace_fprobe_is_tracepoint(tf)) {
+
+		/* This tracepoint is not loaded yet */
+		if (!tracepoint_user_is_registered(tf->tuser))
+			return 0;
+
+		/* We assume all stub functions are traceable. */
+		return tracepoint_user_ip(tf->tuser) ? 0 : -ENOENT;
+	}
+
+	/*
+	 * Note: since we don't lock the module, even if this succeeded,
+	 * register_fprobe() later can fail.
+	 */
+	ret = fprobe_count_ips_from_filter(tf->symbol, NULL);
+	return (ret < 0) ? ret : 0;
+}
+
 /* Internal register function - just handle fprobe and flags */
 static int __register_trace_fprobe(struct trace_fprobe *tf)
 {
@@ -870,11 +801,7 @@ static int __register_trace_fprobe(struct trace_fprobe *tf)
 			return ret;
 	}
 
-	/* Set/clear disabled flag according to tp->flag */
-	if (trace_probe_is_enabled(&tf->tp))
-		tf->fp.flags &= ~FPROBE_FL_DISABLED;
-	else
-		tf->fp.flags |= FPROBE_FL_DISABLED;
+	tf->fp.flags &= ~FPROBE_FL_DISABLED;
 
 	if (trace_fprobe_is_tracepoint(tf)) {
 
@@ -895,10 +822,10 @@ static void __unregister_trace_fprobe(struct trace_fprobe *tf)
 	if (trace_fprobe_is_registered(tf)) {
 		unregister_fprobe(&tf->fp);
 		memset(&tf->fp, 0, sizeof(tf->fp));
-		if (trace_fprobe_is_tracepoint(tf)) {
-			tracepoint_user_put(tf->tuser);
-			tf->tuser = NULL;
-		}
+	}
+	if (trace_fprobe_is_tracepoint(tf)) {
+		tracepoint_user_put(tf->tuser);
+		tf->tuser = NULL;
 	}
 }
 
@@ -958,7 +885,7 @@ static bool trace_fprobe_has_same_fprobe(struct trace_fprobe *orig,
 	return false;
 }
 
-static int append_trace_fprobe(struct trace_fprobe *tf, struct trace_fprobe *to)
+static int append_trace_fprobe_event(struct trace_fprobe *tf, struct trace_fprobe *to)
 {
 	int ret;
 
@@ -986,7 +913,7 @@ static int append_trace_fprobe(struct trace_fprobe *tf, struct trace_fprobe *to)
 	if (ret)
 		return ret;
 
-	ret = __register_trace_fprobe(tf);
+	ret = trace_fprobe_verify_target(tf);
 	if (ret)
 		trace_probe_unlink(&tf->tp);
 	else
@@ -995,8 +922,8 @@ static int append_trace_fprobe(struct trace_fprobe *tf, struct trace_fprobe *to)
 	return ret;
 }
 
-/* Register a trace_probe and probe_event */
-static int register_trace_fprobe(struct trace_fprobe *tf)
+/* Register a trace_probe and probe_event, and check the fprobe is available. */
+static int register_trace_fprobe_event(struct trace_fprobe *tf)
 {
 	struct trace_fprobe *old_tf;
 	int ret;
@@ -1006,7 +933,7 @@ static int register_trace_fprobe(struct trace_fprobe *tf)
 	old_tf = find_trace_fprobe(trace_probe_name(&tf->tp),
 				   trace_probe_group_name(&tf->tp));
 	if (old_tf)
-		return append_trace_fprobe(tf, old_tf);
+		return append_trace_fprobe_event(tf, old_tf);
 
 	/* Register new event */
 	ret = register_fprobe_event(tf);
@@ -1019,8 +946,8 @@ static int register_trace_fprobe(struct trace_fprobe *tf)
 		return ret;
 	}
 
-	/* Register fprobe */
-	ret = __register_trace_fprobe(tf);
+	/* Verify fprobe is sane. */
+	ret = trace_fprobe_verify_target(tf);
 	if (ret < 0)
 		unregister_fprobe_event(tf);
 	else
@@ -1084,15 +1011,6 @@ static struct tracepoint *find_tracepoint(const char *tp_name,
 }
 
 #ifdef CONFIG_MODULES
-static void reenable_trace_fprobe(struct trace_fprobe *tf)
-{
-	struct trace_probe *tp = &tf->tp;
-
-	list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) {
-		__enable_trace_fprobe(tf);
-	}
-}
-
 /*
  * Find a tracepoint from specified module. In this case, this does not get the
  * module's refcount. The caller must ensure the module is not freed.
@@ -1146,9 +1064,8 @@ static int __tracepoint_probe_module_cb(struct notifier_block *self,
 			}
 
 			/* Finally enable fprobe on this module. */
-			if (!WARN_ON_ONCE(__regsiter_tracepoint_fprobe(tf)) &&
-			    trace_probe_is_enabled(&tf->tp))
-				reenable_trace_fprobe(tf);
+			if (trace_probe_is_enabled(&tf->tp) && !trace_fprobe_is_registered(tf))
+				WARN_ON_ONCE(__regsiter_tracepoint_fprobe(tf));
 		} else if (val == MODULE_STATE_GOING) {
 			tuser = tf->tuser;
 			/* Unregister all tracepoint_user in this module. */
@@ -1397,7 +1314,7 @@ static int trace_fprobe_create_internal(int argc, const char *argv[],
 	if (ret < 0)
 		return ret;
 
-	ret = register_trace_fprobe(tf);
+	ret = register_trace_fprobe_event(tf);
 	if (ret) {
 		trace_probe_log_set_index(1);
 		if (ret == -EILSEQ)
@@ -1466,6 +1383,84 @@ static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev)
 	return 0;
 }
 
+/*
+ * Enable trace_probe
+ * if the file is NULL, enable "perf" handler, or enable "trace" handler.
+ */
+static int enable_trace_fprobe(struct trace_event_call *call,
+			       struct trace_event_file *file)
+{
+	struct trace_probe *tp;
+	struct trace_fprobe *tf;
+	bool enabled;
+	int ret = 0;
+
+	tp = trace_probe_primary_from_call(call);
+	if (WARN_ON_ONCE(!tp))
+		return -ENODEV;
+	enabled = trace_probe_is_enabled(tp);
+
+	/* This also changes "enabled" state */
+	if (file) {
+		ret = trace_probe_add_file(tp, file);
+		if (ret)
+			return ret;
+	} else
+		trace_probe_set_flag(tp, TP_FLAG_PROFILE);
+
+	if (!enabled) {
+		list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) {
+			ret = __register_trace_fprobe(tf);
+			if (ret < 0)
+				return ret;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Disable trace_probe
+ * if the file is NULL, disable "perf" handler, or disable "trace" handler.
+ */
+static int disable_trace_fprobe(struct trace_event_call *call,
+				struct trace_event_file *file)
+{
+	struct trace_fprobe *tf;
+	struct trace_probe *tp;
+
+	tp = trace_probe_primary_from_call(call);
+	if (WARN_ON_ONCE(!tp))
+		return -ENODEV;
+
+	if (file) {
+		if (!trace_probe_get_file_link(tp, file))
+			return -ENOENT;
+		if (!trace_probe_has_single_file(tp))
+			goto out;
+		trace_probe_clear_flag(tp, TP_FLAG_TRACE);
+	} else
+		trace_probe_clear_flag(tp, TP_FLAG_PROFILE);
+
+	if (!trace_probe_is_enabled(tp)) {
+		list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) {
+			unregister_fprobe(&tf->fp);
+		}
+	}
+
+ out:
+	if (file)
+		/*
+		 * Synchronization is done in below function. For perf event,
+		 * file == NULL and perf_trace_event_unreg() calls
+		 * tracepoint_synchronize_unregister() to ensure synchronize
+		 * event. We don't need to care about it.
+		 */
+		trace_probe_remove_file(tp, file);
+
+	return 0;
+}
+
 /*
  * called by perf_trace_init() or __ftrace_set_clr_event() under event_mutex.
  */
-- 
cgit v1.2.3


From fbe346ce9d626680a4dd0f079e17c7b5dd32ffad Mon Sep 17 00:00:00 2001
From: Haiyang Zhang <haiyangz@microsoft.com>
Date: Fri, 27 Jun 2025 13:26:23 -0700
Subject: net: mana: Handle Reset Request from MANA NIC

Upon receiving the Reset Request, pause the connection and clean up
queues, wait for the specified period, then resume the NIC.
In the cleanup phase, the HWC is no longer responding, so set hwc_timeout
to zero to skip waiting on the response.

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Link: https://patch.msgid.link/1751055983-29760-1-git-send-email-haiyangz@linux.microsoft.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/microsoft/mana/gdma_main.c  | 127 ++++++++++++++++++-----
 drivers/net/ethernet/microsoft/mana/hw_channel.c |   4 +-
 drivers/net/ethernet/microsoft/mana/mana_en.c    |  37 +++++--
 include/net/mana/gdma.h                          |  10 ++
 4 files changed, 143 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 55dd7dee718c..a468cd8e5f36 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -10,6 +10,7 @@
 #include <linux/irqdomain.h>
 
 #include <net/mana/mana.h>
+#include <net/mana/hw_channel.h>
 
 struct dentry *mana_debugfs_root;
 
@@ -68,6 +69,24 @@ static void mana_gd_init_registers(struct pci_dev *pdev)
 		mana_gd_init_vf_regs(pdev);
 }
 
+/* Suppress logging when we set timeout to zero */
+bool mana_need_log(struct gdma_context *gc, int err)
+{
+	struct hw_channel_context *hwc;
+
+	if (err != -ETIMEDOUT)
+		return true;
+
+	if (!gc)
+		return true;
+
+	hwc = gc->hwc.driver_data;
+	if (hwc && hwc->hwc_timeout == 0)
+		return false;
+
+	return true;
+}
+
 static int mana_gd_query_max_resources(struct pci_dev *pdev)
 {
 	struct gdma_context *gc = pci_get_drvdata(pdev);
@@ -278,8 +297,9 @@ static int mana_gd_disable_queue(struct gdma_queue *queue)
 
 	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
 	if (err || resp.hdr.status) {
-		dev_err(gc->dev, "Failed to disable queue: %d, 0x%x\n", err,
-			resp.hdr.status);
+		if (mana_need_log(gc, err))
+			dev_err(gc->dev, "Failed to disable queue: %d, 0x%x\n", err,
+				resp.hdr.status);
 		return err ? err : -EPROTO;
 	}
 
@@ -366,25 +386,12 @@ EXPORT_SYMBOL_NS(mana_gd_ring_cq, "NET_MANA");
 
 #define MANA_SERVICE_PERIOD 10
 
-struct mana_serv_work {
-	struct work_struct serv_work;
-	struct pci_dev *pdev;
-};
-
-static void mana_serv_func(struct work_struct *w)
+static void mana_serv_fpga(struct pci_dev *pdev)
 {
-	struct mana_serv_work *mns_wk;
 	struct pci_bus *bus, *parent;
-	struct pci_dev *pdev;
-
-	mns_wk = container_of(w, struct mana_serv_work, serv_work);
-	pdev = mns_wk->pdev;
 
 	pci_lock_rescan_remove();
 
-	if (!pdev)
-		goto out;
-
 	bus = pdev->bus;
 	if (!bus) {
 		dev_err(&pdev->dev, "MANA service: no bus\n");
@@ -405,7 +412,74 @@ static void mana_serv_func(struct work_struct *w)
 
 out:
 	pci_unlock_rescan_remove();
+}
+
+static void mana_serv_reset(struct pci_dev *pdev)
+{
+	struct gdma_context *gc = pci_get_drvdata(pdev);
+	struct hw_channel_context *hwc;
+
+	if (!gc) {
+		dev_err(&pdev->dev, "MANA service: no GC\n");
+		return;
+	}
+
+	hwc = gc->hwc.driver_data;
+	if (!hwc) {
+		dev_err(&pdev->dev, "MANA service: no HWC\n");
+		goto out;
+	}
+
+	/* HWC is not responding in this case, so don't wait */
+	hwc->hwc_timeout = 0;
+
+	dev_info(&pdev->dev, "MANA reset cycle start\n");
 
+	mana_gd_suspend(pdev, PMSG_SUSPEND);
+
+	msleep(MANA_SERVICE_PERIOD * 1000);
+
+	mana_gd_resume(pdev);
+
+	dev_info(&pdev->dev, "MANA reset cycle completed\n");
+
+out:
+	gc->in_service = false;
+}
+
+struct mana_serv_work {
+	struct work_struct serv_work;
+	struct pci_dev *pdev;
+	enum gdma_eqe_type type;
+};
+
+static void mana_serv_func(struct work_struct *w)
+{
+	struct mana_serv_work *mns_wk;
+	struct pci_dev *pdev;
+
+	mns_wk = container_of(w, struct mana_serv_work, serv_work);
+	pdev = mns_wk->pdev;
+
+	if (!pdev)
+		goto out;
+
+	switch (mns_wk->type) {
+	case GDMA_EQE_HWC_FPGA_RECONFIG:
+		mana_serv_fpga(pdev);
+		break;
+
+	case GDMA_EQE_HWC_RESET_REQUEST:
+		mana_serv_reset(pdev);
+		break;
+
+	default:
+		dev_err(&pdev->dev, "MANA service: unknown type %d\n",
+			mns_wk->type);
+		break;
+	}
+
+out:
 	pci_dev_put(pdev);
 	kfree(mns_wk);
 	module_put(THIS_MODULE);
@@ -462,6 +536,7 @@ static void mana_gd_process_eqe(struct gdma_queue *eq)
 		break;
 
 	case GDMA_EQE_HWC_FPGA_RECONFIG:
+	case GDMA_EQE_HWC_RESET_REQUEST:
 		dev_info(gc->dev, "Recv MANA service type:%d\n", type);
 
 		if (gc->in_service) {
@@ -483,6 +558,7 @@ static void mana_gd_process_eqe(struct gdma_queue *eq)
 		dev_info(gc->dev, "Start MANA service type:%d\n", type);
 		gc->in_service = true;
 		mns_wk->pdev = to_pci_dev(gc->dev);
+		mns_wk->type = type;
 		pci_dev_get(mns_wk->pdev);
 		INIT_WORK(&mns_wk->serv_work, mana_serv_func);
 		schedule_work(&mns_wk->serv_work);
@@ -634,7 +710,8 @@ int mana_gd_test_eq(struct gdma_context *gc, struct gdma_queue *eq)
 
 	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
 	if (err) {
-		dev_err(dev, "test_eq failed: %d\n", err);
+		if (mana_need_log(gc, err))
+			dev_err(dev, "test_eq failed: %d\n", err);
 		goto out;
 	}
 
@@ -669,7 +746,7 @@ static void mana_gd_destroy_eq(struct gdma_context *gc, bool flush_evenets,
 
 	if (flush_evenets) {
 		err = mana_gd_test_eq(gc, queue);
-		if (err)
+		if (err && mana_need_log(gc, err))
 			dev_warn(gc->dev, "Failed to flush EQ: %d\n", err);
 	}
 
@@ -815,8 +892,9 @@ int mana_gd_destroy_dma_region(struct gdma_context *gc, u64 dma_region_handle)
 
 	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
 	if (err || resp.hdr.status) {
-		dev_err(gc->dev, "Failed to destroy DMA region: %d, 0x%x\n",
-			err, resp.hdr.status);
+		if (mana_need_log(gc, err))
+			dev_err(gc->dev, "Failed to destroy DMA region: %d, 0x%x\n",
+				err, resp.hdr.status);
 		return -EPROTO;
 	}
 
@@ -1116,8 +1194,9 @@ int mana_gd_deregister_device(struct gdma_dev *gd)
 
 	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
 	if (err || resp.hdr.status) {
-		dev_err(gc->dev, "Failed to deregister device: %d, 0x%x\n",
-			err, resp.hdr.status);
+		if (mana_need_log(gc, err))
+			dev_err(gc->dev, "Failed to deregister device: %d, 0x%x\n",
+				err, resp.hdr.status);
 		if (!err)
 			err = -EPROTO;
 	}
@@ -1915,7 +1994,7 @@ static void mana_gd_remove(struct pci_dev *pdev)
 }
 
 /* The 'state' parameter is not used. */
-static int mana_gd_suspend(struct pci_dev *pdev, pm_message_t state)
+int mana_gd_suspend(struct pci_dev *pdev, pm_message_t state)
 {
 	struct gdma_context *gc = pci_get_drvdata(pdev);
 
@@ -1931,7 +2010,7 @@ static int mana_gd_suspend(struct pci_dev *pdev, pm_message_t state)
  * fail -- if this happens, it's safer to just report an error than try to undo
  * what has been done.
  */
-static int mana_gd_resume(struct pci_dev *pdev)
+int mana_gd_resume(struct pci_dev *pdev)
 {
 	struct gdma_context *gc = pci_get_drvdata(pdev);
 	int err;
diff --git a/drivers/net/ethernet/microsoft/mana/hw_channel.c b/drivers/net/ethernet/microsoft/mana/hw_channel.c
index 650d22654d49..ef072e24c46d 100644
--- a/drivers/net/ethernet/microsoft/mana/hw_channel.c
+++ b/drivers/net/ethernet/microsoft/mana/hw_channel.c
@@ -880,7 +880,9 @@ int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len,
 
 	if (!wait_for_completion_timeout(&ctx->comp_event,
 					 (msecs_to_jiffies(hwc->hwc_timeout)))) {
-		dev_err(hwc->dev, "HWC: Request timed out!\n");
+		if (hwc->hwc_timeout != 0)
+			dev_err(hwc->dev, "HWC: Request timed out!\n");
+
 		err = -ETIMEDOUT;
 		goto out;
 	}
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 016fd808ccad..a7973651ae51 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -47,6 +47,15 @@ static const struct file_operations mana_dbg_q_fops = {
 	.read   = mana_dbg_q_read,
 };
 
+static bool mana_en_need_log(struct mana_port_context *apc, int err)
+{
+	if (apc && apc->ac && apc->ac->gdma_dev &&
+	    apc->ac->gdma_dev->gdma_context)
+		return mana_need_log(apc->ac->gdma_dev->gdma_context, err);
+	else
+		return true;
+}
+
 /* Microsoft Azure Network Adapter (MANA) functions */
 
 static int mana_open(struct net_device *ndev)
@@ -854,7 +863,8 @@ static int mana_send_request(struct mana_context *ac, void *in_buf,
 		if (err == -EOPNOTSUPP)
 			return err;
 
-		if (req->req.msg_type != MANA_QUERY_PHY_STAT)
+		if (req->req.msg_type != MANA_QUERY_PHY_STAT &&
+		    mana_need_log(gc, err))
 			dev_err(dev, "Failed to send mana message: %d, 0x%x\n",
 				err, resp->status);
 		return err ? err : -EPROTO;
@@ -931,8 +941,10 @@ static void mana_pf_deregister_hw_vport(struct mana_port_context *apc)
 	err = mana_send_request(apc->ac, &req, sizeof(req), &resp,
 				sizeof(resp));
 	if (err) {
-		netdev_err(apc->ndev, "Failed to unregister hw vPort: %d\n",
-			   err);
+		if (mana_en_need_log(apc, err))
+			netdev_err(apc->ndev, "Failed to unregister hw vPort: %d\n",
+				   err);
+
 		return;
 	}
 
@@ -987,8 +999,10 @@ static void mana_pf_deregister_filter(struct mana_port_context *apc)
 	err = mana_send_request(apc->ac, &req, sizeof(req), &resp,
 				sizeof(resp));
 	if (err) {
-		netdev_err(apc->ndev, "Failed to unregister filter: %d\n",
-			   err);
+		if (mana_en_need_log(apc, err))
+			netdev_err(apc->ndev, "Failed to unregister filter: %d\n",
+				   err);
+
 		return;
 	}
 
@@ -1218,7 +1232,9 @@ static int mana_cfg_vport_steering(struct mana_port_context *apc,
 	err = mana_send_request(apc->ac, req, req_buf_size, &resp,
 				sizeof(resp));
 	if (err) {
-		netdev_err(ndev, "Failed to configure vPort RX: %d\n", err);
+		if (mana_en_need_log(apc, err))
+			netdev_err(ndev, "Failed to configure vPort RX: %d\n", err);
+
 		goto out;
 	}
 
@@ -1402,7 +1418,9 @@ void mana_destroy_wq_obj(struct mana_port_context *apc, u32 wq_type,
 	err = mana_send_request(apc->ac, &req, sizeof(req), &resp,
 				sizeof(resp));
 	if (err) {
-		netdev_err(ndev, "Failed to destroy WQ object: %d\n", err);
+		if (mana_en_need_log(apc, err))
+			netdev_err(ndev, "Failed to destroy WQ object: %d\n", err);
+
 		return;
 	}
 
@@ -3067,11 +3085,10 @@ static int mana_dealloc_queues(struct net_device *ndev)
 
 	apc->rss_state = TRI_STATE_FALSE;
 	err = mana_config_rss(apc, TRI_STATE_FALSE, false, false);
-	if (err) {
+	if (err && mana_en_need_log(apc, err))
 		netdev_err(ndev, "Failed to disable vPort: %d\n", err);
-		return err;
-	}
 
+	/* Even in err case, still need to cleanup the vPort */
 	mana_destroy_vport(apc);
 
 	return 0;
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index 92ab85061df0..57df78cfbf82 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -62,6 +62,7 @@ enum gdma_eqe_type {
 	GDMA_EQE_HWC_FPGA_RECONFIG	= 132,
 	GDMA_EQE_HWC_SOC_RECONFIG_DATA	= 133,
 	GDMA_EQE_HWC_SOC_SERVICE	= 134,
+	GDMA_EQE_HWC_RESET_REQUEST	= 135,
 	GDMA_EQE_RNIC_QP_FATAL		= 176,
 };
 
@@ -584,6 +585,9 @@ enum {
 /* Driver supports dynamic MSI-X vector allocation */
 #define GDMA_DRV_CAP_FLAG_1_DYNAMIC_IRQ_ALLOC_SUPPORT BIT(13)
 
+/* Driver can self reset on EQE notification */
+#define GDMA_DRV_CAP_FLAG_1_SELF_RESET_ON_EQE BIT(14)
+
 /* Driver can self reset on FPGA Reconfig EQE notification */
 #define GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE BIT(17)
 
@@ -594,6 +598,7 @@ enum {
 	 GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT | \
 	 GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP | \
 	 GDMA_DRV_CAP_FLAG_1_DYNAMIC_IRQ_ALLOC_SUPPORT | \
+	 GDMA_DRV_CAP_FLAG_1_SELF_RESET_ON_EQE | \
 	 GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE)
 
 #define GDMA_DRV_CAP_FLAGS2 0
@@ -921,4 +926,9 @@ void mana_unregister_debugfs(void);
 
 int mana_rdma_service_event(struct gdma_context *gc, enum gdma_service_type event);
 
+int mana_gd_suspend(struct pci_dev *pdev, pm_message_t state);
+int mana_gd_resume(struct pci_dev *pdev);
+
+bool mana_need_log(struct gdma_context *gc, int err);
+
 #endif /* _GDMA_H */
-- 
cgit v1.2.3


From 002d90627cdbf7efd16f0846a814889409e42f07 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 26 Jun 2025 20:59:26 +0300
Subject: firmware/nvram: bcm47xx: Don't use "proxy" headers

Update header inclusions to follow IWYU (Include What You Use)
principle.

Note that including kernel.h is discouraged, as stated at the top
of that file.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 include/linux/bcm47xx_nvram.h | 1 -
 include/linux/bcm47xx_sprom.h | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/bcm47xx_nvram.h b/include/linux/bcm47xx_nvram.h
index 7615f8d7b1ed..e4b6ce953ddb 100644
--- a/include/linux/bcm47xx_nvram.h
+++ b/include/linux/bcm47xx_nvram.h
@@ -7,7 +7,6 @@
 
 #include <linux/errno.h>
 #include <linux/types.h>
-#include <linux/kernel.h>
 #include <linux/vmalloc.h>
 
 #ifdef CONFIG_BCM47XX_NVRAM
diff --git a/include/linux/bcm47xx_sprom.h b/include/linux/bcm47xx_sprom.h
index f8254fd53e15..40a7da3ef50e 100644
--- a/include/linux/bcm47xx_sprom.h
+++ b/include/linux/bcm47xx_sprom.h
@@ -5,8 +5,8 @@
 #ifndef __BCM47XX_SPROM_H
 #define __BCM47XX_SPROM_H
 
+#include <linux/errno.h>
 #include <linux/types.h>
-#include <linux/kernel.h>
 #include <linux/vmalloc.h>
 
 struct ssb_sprom;
-- 
cgit v1.2.3


From 276e136bff7edcdecc6e206c81594ef06aa40743 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Mon, 30 Jun 2025 18:20:15 +0200
Subject: fs: prepare for extending file_get/setattr()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We intend to add support for more xflags to selected filesystems, and
we cannot rely on copy_struct_from_user() to detect this extension.

In preparation of extending the API, do not allow setting xflags unknown
by this kernel version.

Also do not pass the read-only flags and read-only field fsx_nextents to
filesystem.

These changes should not affect existing chattr programs that use the
ioctl to get fsxattr before setting the new values.

Link: https://lore.kernel.org/linux-fsdevel/20250216164029.20673-4-pali@kernel.org/
Cc: Pali Rohár <pali@kernel.org>
Cc: Andrey Albershteyn <aalbersh@redhat.com>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Andrey Albershteyn <aalbersh@kernel.org>
Link: https://lore.kernel.org/20250630-xattrat-syscall-v6-5-c4e3bc35227b@kernel.org
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/file_attr.c           |  8 +++++++-
 include/linux/fileattr.h | 20 ++++++++++++++++++++
 2 files changed, 27 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/fs/file_attr.c b/fs/file_attr.c
index 5c98cf8d5519..775f43fc9687 100644
--- a/fs/file_attr.c
+++ b/fs/file_attr.c
@@ -100,9 +100,10 @@ EXPORT_SYMBOL(vfs_fileattr_get);
 int copy_fsxattr_to_user(const struct fileattr *fa, struct fsxattr __user *ufa)
 {
 	struct fsxattr xfa;
+	__u32 mask = FS_XFLAGS_MASK;
 
 	memset(&xfa, 0, sizeof(xfa));
-	xfa.fsx_xflags = fa->fsx_xflags;
+	xfa.fsx_xflags = fa->fsx_xflags & mask;
 	xfa.fsx_extsize = fa->fsx_extsize;
 	xfa.fsx_nextents = fa->fsx_nextents;
 	xfa.fsx_projid = fa->fsx_projid;
@@ -119,11 +120,16 @@ static int copy_fsxattr_from_user(struct fileattr *fa,
 				  struct fsxattr __user *ufa)
 {
 	struct fsxattr xfa;
+	__u32 mask = FS_XFLAGS_MASK;
 
 	if (copy_from_user(&xfa, ufa, sizeof(xfa)))
 		return -EFAULT;
 
+	if (xfa.fsx_xflags & ~mask)
+		return -EOPNOTSUPP;
+
 	fileattr_fill_xflags(fa, xfa.fsx_xflags);
+	fa->fsx_xflags &= ~FS_XFLAG_RDONLY_MASK;
 	fa->fsx_extsize = xfa.fsx_extsize;
 	fa->fsx_nextents = xfa.fsx_nextents;
 	fa->fsx_projid = xfa.fsx_projid;
diff --git a/include/linux/fileattr.h b/include/linux/fileattr.h
index 6030d0bf7ad3..e2a2f4ae242d 100644
--- a/include/linux/fileattr.h
+++ b/include/linux/fileattr.h
@@ -14,6 +14,26 @@
 	 FS_XFLAG_NODUMP | FS_XFLAG_NOATIME | FS_XFLAG_DAX | \
 	 FS_XFLAG_PROJINHERIT)
 
+/* Read-only inode flags */
+#define FS_XFLAG_RDONLY_MASK \
+	(FS_XFLAG_PREALLOC | FS_XFLAG_HASATTR)
+
+/* Flags to indicate valid value of fsx_ fields */
+#define FS_XFLAG_VALUES_MASK \
+	(FS_XFLAG_EXTSIZE | FS_XFLAG_COWEXTSIZE)
+
+/* Flags for directories */
+#define FS_XFLAG_DIRONLY_MASK \
+	(FS_XFLAG_RTINHERIT | FS_XFLAG_NOSYMLINKS | FS_XFLAG_EXTSZINHERIT)
+
+/* Misc settable flags */
+#define FS_XFLAG_MISC_MASK \
+	(FS_XFLAG_REALTIME | FS_XFLAG_NODEFRAG | FS_XFLAG_FILESTREAM)
+
+#define FS_XFLAGS_MASK \
+	(FS_XFLAG_COMMON | FS_XFLAG_RDONLY_MASK | FS_XFLAG_VALUES_MASK | \
+	 FS_XFLAG_DIRONLY_MASK | FS_XFLAG_MISC_MASK)
+
 /*
  * Merged interface for miscellaneous file attributes.  'flags' originates from
  * ext* and 'fsx_flags' from xfs.  There's some overlap between the two, which
-- 
cgit v1.2.3


From 3a0ae385f69e9b2d87c9b017c4ffb5567c015197 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 30 Jun 2025 19:16:51 +0100
Subject: io_uring/mock: add basic infra for test mock files

io_uring commands provide an ioctl style interface for files to
implement file specific operations. io_uring provides many features and
advanced api to commands, and it's getting hard to test as it requires
specific files/devices.

Add basic infrastructure for creating special mock files that will be
implementing the cmd api and using various io_uring features we want to
test. It'll also be useful to test some more obscure read/write/polling
edge cases in the future.

Suggested-by: chase xd <sl1589472800@gmail.com>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/93f21b0af58c1367a2b22635d5a7d694ad0272fc.1750599274.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 MAINTAINERS                             |   1 +
 include/uapi/linux/io_uring/mock_file.h |  22 +++++
 init/Kconfig                            |  11 +++
 io_uring/Makefile                       |   1 +
 io_uring/mock_file.c                    | 148 ++++++++++++++++++++++++++++++++
 5 files changed, 183 insertions(+)
 create mode 100644 include/uapi/linux/io_uring/mock_file.h
 create mode 100644 io_uring/mock_file.c

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index c3f7fbd0d67a..24e11687f8b6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -12679,6 +12679,7 @@ F:	include/linux/io_uring.h
 F:	include/linux/io_uring_types.h
 F:	include/trace/events/io_uring.h
 F:	include/uapi/linux/io_uring.h
+F:	include/uapi/linux/io_uring/
 F:	io_uring/
 
 IPMI SUBSYSTEM
diff --git a/include/uapi/linux/io_uring/mock_file.h b/include/uapi/linux/io_uring/mock_file.h
new file mode 100644
index 000000000000..a44273fd526d
--- /dev/null
+++ b/include/uapi/linux/io_uring/mock_file.h
@@ -0,0 +1,22 @@
+#ifndef LINUX_IO_URING_MOCK_FILE_H
+#define LINUX_IO_URING_MOCK_FILE_H
+
+#include <linux/types.h>
+
+struct io_uring_mock_probe {
+	__u64		features;
+	__u64		__resv[9];
+};
+
+struct io_uring_mock_create {
+	__u32		out_fd;
+	__u32		flags;
+	__u64		__resv[15];
+};
+
+enum {
+	IORING_MOCK_MGR_CMD_PROBE,
+	IORING_MOCK_MGR_CMD_CREATE,
+};
+
+#endif
diff --git a/init/Kconfig b/init/Kconfig
index af4c2f085455..c40a7c65fb4c 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1801,6 +1801,17 @@ config GCOV_PROFILE_URING
 	  the io_uring subsystem, hence this should only be enabled for
 	  specific test purposes.
 
+config IO_URING_MOCK_FILE
+	tristate "Enable io_uring mock files (Experimental)" if EXPERT
+	default n
+	depends on IO_URING
+	help
+	  Enable mock files for io_uring subsystem testing. The ABI might
+	  still change, so it's still experimental and should only be enabled
+	  for specific test purposes.
+
+	  If unsure, say N.
+
 config ADVISE_SYSCALLS
 	bool "Enable madvise/fadvise syscalls" if EXPERT
 	default y
diff --git a/io_uring/Makefile b/io_uring/Makefile
index d97c6b51d584..b3f1bd492804 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -21,3 +21,4 @@ obj-$(CONFIG_EPOLL)		+= epoll.o
 obj-$(CONFIG_NET_RX_BUSY_POLL)	+= napi.o
 obj-$(CONFIG_NET) += net.o cmd_net.o
 obj-$(CONFIG_PROC_FS) += fdinfo.o
+obj-$(CONFIG_IO_URING_MOCK_FILE) += mock_file.o
diff --git a/io_uring/mock_file.c b/io_uring/mock_file.c
new file mode 100644
index 000000000000..3681d0b8d8de
--- /dev/null
+++ b/io_uring/mock_file.c
@@ -0,0 +1,148 @@
+#include <linux/device.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/anon_inodes.h>
+
+#include <linux/io_uring/cmd.h>
+#include <linux/io_uring_types.h>
+#include <uapi/linux/io_uring/mock_file.h>
+
+static int io_mock_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+	return -ENOTSUPP;
+}
+
+static const struct file_operations io_mock_fops = {
+	.owner		= THIS_MODULE,
+	.uring_cmd	= io_mock_cmd,
+};
+
+static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+	const struct io_uring_sqe *sqe = cmd->sqe;
+	struct io_uring_mock_create mc, __user *uarg;
+	struct file *file = NULL;
+	size_t uarg_size;
+	int fd, ret;
+
+	/*
+	 * It's a testing only driver that allows exercising edge cases
+	 * that wouldn't be possible to hit otherwise.
+	 */
+	add_taint(TAINT_TEST, LOCKDEP_STILL_OK);
+
+	uarg = u64_to_user_ptr(READ_ONCE(sqe->addr));
+	uarg_size = READ_ONCE(sqe->len);
+
+	if (sqe->ioprio || sqe->__pad1 || sqe->addr3 || sqe->file_index)
+		return -EINVAL;
+	if (uarg_size != sizeof(mc))
+		return -EINVAL;
+
+	memset(&mc, 0, sizeof(mc));
+	if (copy_from_user(&mc, uarg, uarg_size))
+		return -EFAULT;
+	if (!mem_is_zero(mc.__resv, sizeof(mc.__resv)) || mc.flags)
+		return -EINVAL;
+
+	fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+	if (fd < 0)
+		return fd;
+
+	file = anon_inode_create_getfile("[io_uring_mock]", &io_mock_fops,
+					 NULL, O_RDWR | O_CLOEXEC, NULL);
+	if (IS_ERR(file)) {
+		ret = PTR_ERR(file);
+		goto fail;
+	}
+
+	mc.out_fd = fd;
+	if (copy_to_user(uarg, &mc, uarg_size)) {
+		fput(file);
+		ret = -EFAULT;
+		goto fail;
+	}
+
+	fd_install(fd, file);
+	return 0;
+fail:
+	put_unused_fd(fd);
+	return ret;
+}
+
+static int io_probe_mock(struct io_uring_cmd *cmd)
+{
+	const struct io_uring_sqe *sqe = cmd->sqe;
+	struct io_uring_mock_probe mp, __user *uarg;
+	size_t uarg_size;
+
+	uarg = u64_to_user_ptr(READ_ONCE(sqe->addr));
+	uarg_size = READ_ONCE(sqe->len);
+
+	if (sqe->ioprio || sqe->__pad1 || sqe->addr3 || sqe->file_index ||
+	    uarg_size != sizeof(mp))
+		return -EINVAL;
+
+	memset(&mp, 0, sizeof(mp));
+	if (copy_from_user(&mp, uarg, uarg_size))
+		return -EFAULT;
+	if (!mem_is_zero(&mp, sizeof(mp)))
+		return -EINVAL;
+
+	mp.features = 0;
+
+	if (copy_to_user(uarg, &mp, uarg_size))
+		return -EFAULT;
+	return 0;
+}
+
+static int iou_mock_mgr_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	switch (cmd->cmd_op) {
+	case IORING_MOCK_MGR_CMD_PROBE:
+		return io_probe_mock(cmd);
+	case IORING_MOCK_MGR_CMD_CREATE:
+		return io_create_mock_file(cmd, issue_flags);
+	}
+	return -EOPNOTSUPP;
+}
+
+static const struct file_operations iou_mock_dev_fops = {
+	.owner		= THIS_MODULE,
+	.uring_cmd	= iou_mock_mgr_cmd,
+};
+
+static struct miscdevice iou_mock_miscdev = {
+	.minor			= MISC_DYNAMIC_MINOR,
+	.name			= "io_uring_mock",
+	.fops			= &iou_mock_dev_fops,
+};
+
+static int __init io_mock_init(void)
+{
+	int ret;
+
+	ret = misc_register(&iou_mock_miscdev);
+	if (ret < 0) {
+		pr_err("Could not initialize io_uring mock device\n");
+		return ret;
+	}
+	return 0;
+}
+
+static void __exit io_mock_exit(void)
+{
+	misc_deregister(&iou_mock_miscdev);
+}
+
+module_init(io_mock_init)
+module_exit(io_mock_exit)
+
+MODULE_AUTHOR("Pavel Begunkov <asml.silence@gmail.com>");
+MODULE_DESCRIPTION("io_uring mock file");
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 4aac001f780388b252534396feaf49b250eae27f Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 30 Jun 2025 19:16:52 +0100
Subject: io_uring/mock: add cmd using vectored regbufs

There is a command API that allows importing vectored registered buffers.
Add a new mock command that uses the feature and simply copies the
specified registered buffer into user space or vice versa.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/229a113fd7de6b27dbef9567f7c0bf4475c9017d.1750599274.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring/mock_file.h | 14 +++++++
 io_uring/mock_file.c                    | 70 ++++++++++++++++++++++++++++++++-
 2 files changed, 83 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring/mock_file.h b/include/uapi/linux/io_uring/mock_file.h
index a44273fd526d..73aca477d5c8 100644
--- a/include/uapi/linux/io_uring/mock_file.h
+++ b/include/uapi/linux/io_uring/mock_file.h
@@ -3,6 +3,12 @@
 
 #include <linux/types.h>
 
+enum {
+	IORING_MOCK_FEAT_CMD_COPY,
+
+	IORING_MOCK_FEAT_END,
+};
+
 struct io_uring_mock_probe {
 	__u64		features;
 	__u64		__resv[9];
@@ -19,4 +25,12 @@ enum {
 	IORING_MOCK_MGR_CMD_CREATE,
 };
 
+enum {
+	IORING_MOCK_CMD_COPY_REGBUF,
+};
+
+enum {
+	IORING_MOCK_COPY_FROM			= 1,
+};
+
 #endif
diff --git a/io_uring/mock_file.c b/io_uring/mock_file.c
index 3681d0b8d8de..8285393f4a5b 100644
--- a/io_uring/mock_file.c
+++ b/io_uring/mock_file.c
@@ -9,8 +9,76 @@
 #include <linux/io_uring_types.h>
 #include <uapi/linux/io_uring/mock_file.h>
 
+#define IO_VALID_COPY_CMD_FLAGS		IORING_MOCK_COPY_FROM
+
+static int io_copy_regbuf(struct iov_iter *reg_iter, void __user *ubuf)
+{
+	size_t ret, copied = 0;
+	size_t buflen = PAGE_SIZE;
+	void *tmp_buf;
+
+	tmp_buf = kzalloc(buflen, GFP_KERNEL);
+	if (!tmp_buf)
+		return -ENOMEM;
+
+	while (iov_iter_count(reg_iter)) {
+		size_t len = min(iov_iter_count(reg_iter), buflen);
+
+		if (iov_iter_rw(reg_iter) == ITER_SOURCE) {
+			ret = copy_from_iter(tmp_buf, len, reg_iter);
+			if (ret <= 0)
+				break;
+			if (copy_to_user(ubuf, tmp_buf, ret))
+				break;
+		} else {
+			if (copy_from_user(tmp_buf, ubuf, len))
+				break;
+			ret = copy_to_iter(tmp_buf, len, reg_iter);
+			if (ret <= 0)
+				break;
+		}
+		ubuf += ret;
+		copied += ret;
+	}
+
+	kfree(tmp_buf);
+	return copied;
+}
+
+static int io_cmd_copy_regbuf(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+	const struct io_uring_sqe *sqe = cmd->sqe;
+	const struct iovec __user *iovec;
+	unsigned flags, iovec_len;
+	struct iov_iter iter;
+	void __user *ubuf;
+	int dir, ret;
+
+	ubuf = u64_to_user_ptr(READ_ONCE(sqe->addr3));
+	iovec = u64_to_user_ptr(READ_ONCE(sqe->addr));
+	iovec_len = READ_ONCE(sqe->len);
+	flags = READ_ONCE(sqe->file_index);
+
+	if (unlikely(sqe->ioprio || sqe->__pad1))
+		return -EINVAL;
+	if (flags & ~IO_VALID_COPY_CMD_FLAGS)
+		return -EINVAL;
+
+	dir = (flags & IORING_MOCK_COPY_FROM) ? ITER_SOURCE : ITER_DEST;
+	ret = io_uring_cmd_import_fixed_vec(cmd, iovec, iovec_len, dir, &iter,
+					    issue_flags);
+	if (ret)
+		return ret;
+	ret = io_copy_regbuf(&iter, ubuf);
+	return ret ? ret : -EFAULT;
+}
+
 static int io_mock_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
 {
+	switch (cmd->cmd_op) {
+	case IORING_MOCK_CMD_COPY_REGBUF:
+		return io_cmd_copy_regbuf(cmd, issue_flags);
+	}
 	return -ENOTSUPP;
 }
 
@@ -91,7 +159,7 @@ static int io_probe_mock(struct io_uring_cmd *cmd)
 	if (!mem_is_zero(&mp, sizeof(mp)))
 		return -EINVAL;
 
-	mp.features = 0;
+	mp.features = IORING_MOCK_FEAT_END;
 
 	if (copy_to_user(uarg, &mp, uarg_size))
 		return -EFAULT;
-- 
cgit v1.2.3


From d1aa0346571436203a24cc3fc0c80f14cabbd630 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 30 Jun 2025 19:16:53 +0100
Subject: io_uring/mock: add sync read/write

Add support for synchronous zero read/write for mock files.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/571f3c9fe688e918256a06a722d3db6ced9ca3d5.1750599274.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring/mock_file.h |  4 +-
 io_uring/mock_file.c                    | 67 ++++++++++++++++++++++++++++++---
 2 files changed, 65 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring/mock_file.h b/include/uapi/linux/io_uring/mock_file.h
index 73aca477d5c8..de27295bb365 100644
--- a/include/uapi/linux/io_uring/mock_file.h
+++ b/include/uapi/linux/io_uring/mock_file.h
@@ -5,6 +5,7 @@
 
 enum {
 	IORING_MOCK_FEAT_CMD_COPY,
+	IORING_MOCK_FEAT_RW_ZERO,
 
 	IORING_MOCK_FEAT_END,
 };
@@ -17,7 +18,8 @@ struct io_uring_mock_probe {
 struct io_uring_mock_create {
 	__u32		out_fd;
 	__u32		flags;
-	__u64		__resv[15];
+	__u64		file_size;
+	__u64		__resv[14];
 };
 
 enum {
diff --git a/io_uring/mock_file.c b/io_uring/mock_file.c
index 8285393f4a5b..90160ccb50f0 100644
--- a/io_uring/mock_file.c
+++ b/io_uring/mock_file.c
@@ -9,6 +9,10 @@
 #include <linux/io_uring_types.h>
 #include <uapi/linux/io_uring/mock_file.h>
 
+struct io_mock_file {
+	size_t size;
+};
+
 #define IO_VALID_COPY_CMD_FLAGS		IORING_MOCK_COPY_FROM
 
 static int io_copy_regbuf(struct iov_iter *reg_iter, void __user *ubuf)
@@ -82,18 +86,59 @@ static int io_mock_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
 	return -ENOTSUPP;
 }
 
+static ssize_t io_mock_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+	struct io_mock_file *mf = iocb->ki_filp->private_data;
+	size_t len = iov_iter_count(to);
+
+	if (iocb->ki_pos + len > mf->size)
+		return -EINVAL;
+	return iov_iter_zero(len, to);
+}
+
+static ssize_t io_mock_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct io_mock_file *mf = iocb->ki_filp->private_data;
+	size_t len = iov_iter_count(from);
+
+	if (iocb->ki_pos + len > mf->size)
+		return -EINVAL;
+	iov_iter_advance(from, len);
+	return len;
+}
+
+static loff_t io_mock_llseek(struct file *file, loff_t offset, int whence)
+{
+	struct io_mock_file *mf = file->private_data;
+
+	return fixed_size_llseek(file, offset, whence, mf->size);
+}
+
+static int io_mock_release(struct inode *inode, struct file *file)
+{
+	struct io_mock_file *mf = file->private_data;
+
+	kfree(mf);
+	return 0;
+}
+
 static const struct file_operations io_mock_fops = {
 	.owner		= THIS_MODULE,
+	.release	= io_mock_release,
 	.uring_cmd	= io_mock_cmd,
+	.read_iter	= io_mock_read_iter,
+	.write_iter	= io_mock_write_iter,
+	.llseek		= io_mock_llseek,
 };
 
 static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flags)
 {
 	const struct io_uring_sqe *sqe = cmd->sqe;
 	struct io_uring_mock_create mc, __user *uarg;
+	struct io_mock_file *mf = NULL;
 	struct file *file = NULL;
 	size_t uarg_size;
-	int fd, ret;
+	int fd = -1, ret;
 
 	/*
 	 * It's a testing only driver that allows exercising edge cases
@@ -114,18 +159,28 @@ static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flag
 		return -EFAULT;
 	if (!mem_is_zero(mc.__resv, sizeof(mc.__resv)) || mc.flags)
 		return -EINVAL;
+	if (mc.file_size > SZ_1G)
+		return -EINVAL;
+	mf = kzalloc(sizeof(*mf), GFP_KERNEL_ACCOUNT);
+	if (!mf)
+		return -ENOMEM;
 
-	fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+	ret = fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
 	if (fd < 0)
-		return fd;
+		goto fail;
 
+	mf->size = mc.file_size;
 	file = anon_inode_create_getfile("[io_uring_mock]", &io_mock_fops,
-					 NULL, O_RDWR | O_CLOEXEC, NULL);
+					 mf, O_RDWR | O_CLOEXEC, NULL);
 	if (IS_ERR(file)) {
 		ret = PTR_ERR(file);
 		goto fail;
 	}
 
+	file->f_mode |= FMODE_READ | FMODE_CAN_READ |
+			FMODE_WRITE | FMODE_CAN_WRITE |
+			FMODE_LSEEK;
+
 	mc.out_fd = fd;
 	if (copy_to_user(uarg, &mc, uarg_size)) {
 		fput(file);
@@ -136,7 +191,9 @@ static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flag
 	fd_install(fd, file);
 	return 0;
 fail:
-	put_unused_fd(fd);
+	if (fd >= 0)
+		put_unused_fd(fd);
+	kfree(mf);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 2f71d2386f4feed5bfb9ee7b3d2c0ad953db1fa5 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 30 Jun 2025 19:16:54 +0100
Subject: io_uring/mock: allow to choose FMODE_NOWAIT

Add an option to choose whether the file supports FMODE_NOWAIT, which
changes the execution path an io_uring request takes.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/1e532565b05a05b23589d237c24ee1a3d90c2fd9.1750599274.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring/mock_file.h | 5 +++++
 io_uring/mock_file.c                    | 8 +++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring/mock_file.h b/include/uapi/linux/io_uring/mock_file.h
index de27295bb365..125949d2b5ce 100644
--- a/include/uapi/linux/io_uring/mock_file.h
+++ b/include/uapi/linux/io_uring/mock_file.h
@@ -6,6 +6,7 @@
 enum {
 	IORING_MOCK_FEAT_CMD_COPY,
 	IORING_MOCK_FEAT_RW_ZERO,
+	IORING_MOCK_FEAT_RW_NOWAIT,
 
 	IORING_MOCK_FEAT_END,
 };
@@ -15,6 +16,10 @@ struct io_uring_mock_probe {
 	__u64		__resv[9];
 };
 
+enum {
+	IORING_MOCK_CREATE_F_SUPPORT_NOWAIT			= 1,
+};
+
 struct io_uring_mock_create {
 	__u32		out_fd;
 	__u32		flags;
diff --git a/io_uring/mock_file.c b/io_uring/mock_file.c
index 90160ccb50f0..0eb1d3bd6368 100644
--- a/io_uring/mock_file.c
+++ b/io_uring/mock_file.c
@@ -131,6 +131,8 @@ static const struct file_operations io_mock_fops = {
 	.llseek		= io_mock_llseek,
 };
 
+#define IO_VALID_CREATE_FLAGS (IORING_MOCK_CREATE_F_SUPPORT_NOWAIT)
+
 static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flags)
 {
 	const struct io_uring_sqe *sqe = cmd->sqe;
@@ -157,7 +159,9 @@ static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flag
 	memset(&mc, 0, sizeof(mc));
 	if (copy_from_user(&mc, uarg, uarg_size))
 		return -EFAULT;
-	if (!mem_is_zero(mc.__resv, sizeof(mc.__resv)) || mc.flags)
+	if (!mem_is_zero(mc.__resv, sizeof(mc.__resv)))
+		return -EINVAL;
+	if (mc.flags & ~IO_VALID_CREATE_FLAGS)
 		return -EINVAL;
 	if (mc.file_size > SZ_1G)
 		return -EINVAL;
@@ -180,6 +184,8 @@ static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flag
 	file->f_mode |= FMODE_READ | FMODE_CAN_READ |
 			FMODE_WRITE | FMODE_CAN_WRITE |
 			FMODE_LSEEK;
+	if (mc.flags & IORING_MOCK_CREATE_F_SUPPORT_NOWAIT)
+		file->f_mode |= FMODE_NOWAIT;
 
 	mc.out_fd = fd;
 	if (copy_to_user(uarg, &mc, uarg_size)) {
-- 
cgit v1.2.3


From 0c98a44329c10bf904434524425cb42043513bd6 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 30 Jun 2025 19:16:55 +0100
Subject: io_uring/mock: support for async read/write

Let the user specify a delay for read/write requests. io_uring will
start a timer, return -EIOCBQUEUED and complete the request
asynchronously after the delay passes.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/38f9d2e143fda8522c90a724b74630e68f9bbd16.1750599274.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring/mock_file.h |  4 ++-
 io_uring/mock_file.c                    | 59 ++++++++++++++++++++++++++++++---
 2 files changed, 58 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring/mock_file.h b/include/uapi/linux/io_uring/mock_file.h
index 125949d2b5ce..c8fa77e39c68 100644
--- a/include/uapi/linux/io_uring/mock_file.h
+++ b/include/uapi/linux/io_uring/mock_file.h
@@ -7,6 +7,7 @@ enum {
 	IORING_MOCK_FEAT_CMD_COPY,
 	IORING_MOCK_FEAT_RW_ZERO,
 	IORING_MOCK_FEAT_RW_NOWAIT,
+	IORING_MOCK_FEAT_RW_ASYNC,
 
 	IORING_MOCK_FEAT_END,
 };
@@ -24,7 +25,8 @@ struct io_uring_mock_create {
 	__u32		out_fd;
 	__u32		flags;
 	__u64		file_size;
-	__u64		__resv[14];
+	__u64		rw_delay_ns;
+	__u64		__resv[13];
 };
 
 enum {
diff --git a/io_uring/mock_file.c b/io_uring/mock_file.c
index 0eb1d3bd6368..ed6a5505763e 100644
--- a/io_uring/mock_file.c
+++ b/io_uring/mock_file.c
@@ -4,13 +4,22 @@
 #include <linux/miscdevice.h>
 #include <linux/module.h>
 #include <linux/anon_inodes.h>
+#include <linux/ktime.h>
+#include <linux/hrtimer.h>
 
 #include <linux/io_uring/cmd.h>
 #include <linux/io_uring_types.h>
 #include <uapi/linux/io_uring/mock_file.h>
 
+struct io_mock_iocb {
+	struct kiocb		*iocb;
+	struct hrtimer		timer;
+	int			res;
+};
+
 struct io_mock_file {
-	size_t size;
+	size_t			size;
+	u64			rw_delay_ns;
 };
 
 #define IO_VALID_COPY_CMD_FLAGS		IORING_MOCK_COPY_FROM
@@ -86,14 +95,48 @@ static int io_mock_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
 	return -ENOTSUPP;
 }
 
+static enum hrtimer_restart io_mock_rw_timer_expired(struct hrtimer *timer)
+{
+	struct io_mock_iocb *mio = container_of(timer, struct io_mock_iocb, timer);
+	struct kiocb *iocb = mio->iocb;
+
+	WRITE_ONCE(iocb->private, NULL);
+	iocb->ki_complete(iocb, mio->res);
+	kfree(mio);
+	return HRTIMER_NORESTART;
+}
+
+static ssize_t io_mock_delay_rw(struct kiocb *iocb, size_t len)
+{
+	struct io_mock_file *mf = iocb->ki_filp->private_data;
+	struct io_mock_iocb *mio;
+
+	mio = kzalloc(sizeof(*mio), GFP_KERNEL);
+	if (!mio)
+		return -ENOMEM;
+
+	mio->iocb = iocb;
+	mio->res = len;
+	hrtimer_setup(&mio->timer, io_mock_rw_timer_expired,
+		      CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hrtimer_start(&mio->timer, ns_to_ktime(mf->rw_delay_ns),
+		      HRTIMER_MODE_REL);
+	return -EIOCBQUEUED;
+}
+
 static ssize_t io_mock_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
 	struct io_mock_file *mf = iocb->ki_filp->private_data;
 	size_t len = iov_iter_count(to);
+	size_t nr_zeroed;
 
 	if (iocb->ki_pos + len > mf->size)
 		return -EINVAL;
-	return iov_iter_zero(len, to);
+	nr_zeroed = iov_iter_zero(len, to);
+	if (!mf->rw_delay_ns || nr_zeroed != len)
+		return nr_zeroed;
+
+	return io_mock_delay_rw(iocb, len);
 }
 
 static ssize_t io_mock_write_iter(struct kiocb *iocb, struct iov_iter *from)
@@ -103,8 +146,12 @@ static ssize_t io_mock_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
 	if (iocb->ki_pos + len > mf->size)
 		return -EINVAL;
-	iov_iter_advance(from, len);
-	return len;
+	if (!mf->rw_delay_ns) {
+		iov_iter_advance(from, len);
+		return len;
+	}
+
+	return io_mock_delay_rw(iocb, len);
 }
 
 static loff_t io_mock_llseek(struct file *file, loff_t offset, int whence)
@@ -165,6 +212,9 @@ static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flag
 		return -EINVAL;
 	if (mc.file_size > SZ_1G)
 		return -EINVAL;
+	if (mc.rw_delay_ns > NSEC_PER_SEC)
+		return -EINVAL;
+
 	mf = kzalloc(sizeof(*mf), GFP_KERNEL_ACCOUNT);
 	if (!mf)
 		return -ENOMEM;
@@ -174,6 +224,7 @@ static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flag
 		goto fail;
 
 	mf->size = mc.file_size;
+	mf->rw_delay_ns = mc.rw_delay_ns;
 	file = anon_inode_create_getfile("[io_uring_mock]", &io_mock_fops,
 					 mf, O_RDWR | O_CLOEXEC, NULL);
 	if (IS_ERR(file)) {
-- 
cgit v1.2.3


From e448d578264a9512d38deb8c418954d5f3e20712 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 30 Jun 2025 19:16:56 +0100
Subject: io_uring/mock: add trivial poll handler

Add a flag that enables polling on the mock file. For now it trivially
says that there is always data available; it'll be extended in the
future.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/f16de043ec4876d65fae294fc99ade57415fba0c.1750599274.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring/mock_file.h |  2 ++
 io_uring/mock_file.c                    | 37 +++++++++++++++++++++++++++++++--
 2 files changed, 37 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring/mock_file.h b/include/uapi/linux/io_uring/mock_file.h
index c8fa77e39c68..debeee8e4527 100644
--- a/include/uapi/linux/io_uring/mock_file.h
+++ b/include/uapi/linux/io_uring/mock_file.h
@@ -8,6 +8,7 @@ enum {
 	IORING_MOCK_FEAT_RW_ZERO,
 	IORING_MOCK_FEAT_RW_NOWAIT,
 	IORING_MOCK_FEAT_RW_ASYNC,
+	IORING_MOCK_FEAT_POLL,
 
 	IORING_MOCK_FEAT_END,
 };
@@ -19,6 +20,7 @@ struct io_uring_mock_probe {
 
 enum {
 	IORING_MOCK_CREATE_F_SUPPORT_NOWAIT			= 1,
+	IORING_MOCK_CREATE_F_POLL				= 2,
 };
 
 struct io_uring_mock_create {
diff --git a/io_uring/mock_file.c b/io_uring/mock_file.c
index ed6a5505763e..45d3735b2708 100644
--- a/io_uring/mock_file.c
+++ b/io_uring/mock_file.c
@@ -6,6 +6,7 @@
 #include <linux/anon_inodes.h>
 #include <linux/ktime.h>
 #include <linux/hrtimer.h>
+#include <linux/poll.h>
 
 #include <linux/io_uring/cmd.h>
 #include <linux/io_uring_types.h>
@@ -20,6 +21,8 @@ struct io_mock_iocb {
 struct io_mock_file {
 	size_t			size;
 	u64			rw_delay_ns;
+	bool			pollable;
+	struct wait_queue_head	poll_wq;
 };
 
 #define IO_VALID_COPY_CMD_FLAGS		IORING_MOCK_COPY_FROM
@@ -161,6 +164,18 @@ static loff_t io_mock_llseek(struct file *file, loff_t offset, int whence)
 	return fixed_size_llseek(file, offset, whence, mf->size);
 }
 
+static __poll_t io_mock_poll(struct file *file, struct poll_table_struct *pt)
+{
+	struct io_mock_file *mf = file->private_data;
+	__poll_t mask = 0;
+
+	poll_wait(file, &mf->poll_wq, pt);
+
+	mask |= EPOLLOUT | EPOLLWRNORM;
+	mask |= EPOLLIN | EPOLLRDNORM;
+	return mask;
+}
+
 static int io_mock_release(struct inode *inode, struct file *file)
 {
 	struct io_mock_file *mf = file->private_data;
@@ -178,10 +193,22 @@ static const struct file_operations io_mock_fops = {
 	.llseek		= io_mock_llseek,
 };
 
-#define IO_VALID_CREATE_FLAGS (IORING_MOCK_CREATE_F_SUPPORT_NOWAIT)
+static const struct file_operations io_mock_poll_fops = {
+	.owner		= THIS_MODULE,
+	.release	= io_mock_release,
+	.uring_cmd	= io_mock_cmd,
+	.read_iter	= io_mock_read_iter,
+	.write_iter	= io_mock_write_iter,
+	.llseek		= io_mock_llseek,
+	.poll		= io_mock_poll,
+};
+
+#define IO_VALID_CREATE_FLAGS (IORING_MOCK_CREATE_F_SUPPORT_NOWAIT | \
+				IORING_MOCK_CREATE_F_POLL)
 
 static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flags)
 {
+	const struct file_operations *fops = &io_mock_fops;
 	const struct io_uring_sqe *sqe = cmd->sqe;
 	struct io_uring_mock_create mc, __user *uarg;
 	struct io_mock_file *mf = NULL;
@@ -223,9 +250,15 @@ static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flag
 	if (fd < 0)
 		goto fail;
 
+	init_waitqueue_head(&mf->poll_wq);
 	mf->size = mc.file_size;
 	mf->rw_delay_ns = mc.rw_delay_ns;
-	file = anon_inode_create_getfile("[io_uring_mock]", &io_mock_fops,
+	if (mc.flags & IORING_MOCK_CREATE_F_POLL) {
+		fops = &io_mock_poll_fops;
+		mf->pollable = true;
+	}
+
+	file = anon_inode_create_getfile("[io_uring_mock]", fops,
 					 mf, O_RDWR | O_CLOEXEC, NULL);
 	if (IS_ERR(file)) {
 		ret = PTR_ERR(file);
-- 
cgit v1.2.3


From be7efb2d20d67f334a7de2aef77ae6c69367e646 Mon Sep 17 00:00:00 2001
From: Andrey Albershteyn <aalbersh@redhat.com>
Date: Mon, 30 Jun 2025 18:20:16 +0200
Subject: fs: introduce file_getattr and file_setattr syscalls

Introduce file_getattr() and file_setattr() syscalls to manipulate inode
extended attributes. The syscalls take a pair of file descriptor and
pathname, then operate on the inode opened according to openat()
semantics. The struct file_attr is passed to obtain/change extended
attributes.

This is an alternative to the FS_IOC_FSSETXATTR ioctl, with the difference
that the file doesn't need to be open, as we can reference it with a path
instead of an fd. By having this we can manipulate inode extended
attributes not only on regular files but also on special ones. This
is not possible with the FS_IOC_FSSETXATTR ioctl, as with special files
we cannot call ioctl() directly on the filesystem inode using an fd.

This patch adds two new syscalls which allow userspace to get/set
extended inode attributes on special files by using a parent directory
and a path - *at() like syscalls.

CC: linux-api@vger.kernel.org
CC: linux-fsdevel@vger.kernel.org
CC: linux-xfs@vger.kernel.org
Signed-off-by: Andrey Albershteyn <aalbersh@kernel.org>
Link: https://lore.kernel.org/20250630-xattrat-syscall-v6-6-c4e3bc35227b@kernel.org
Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 arch/alpha/kernel/syscalls/syscall.tbl      |   2 +
 arch/arm/tools/syscall.tbl                  |   2 +
 arch/arm64/tools/syscall_32.tbl             |   2 +
 arch/m68k/kernel/syscalls/syscall.tbl       |   2 +
 arch/microblaze/kernel/syscalls/syscall.tbl |   2 +
 arch/mips/kernel/syscalls/syscall_n32.tbl   |   2 +
 arch/mips/kernel/syscalls/syscall_n64.tbl   |   2 +
 arch/mips/kernel/syscalls/syscall_o32.tbl   |   2 +
 arch/parisc/kernel/syscalls/syscall.tbl     |   2 +
 arch/powerpc/kernel/syscalls/syscall.tbl    |   2 +
 arch/s390/kernel/syscalls/syscall.tbl       |   2 +
 arch/sh/kernel/syscalls/syscall.tbl         |   2 +
 arch/sparc/kernel/syscalls/syscall.tbl      |   2 +
 arch/x86/entry/syscalls/syscall_32.tbl      |   2 +
 arch/x86/entry/syscalls/syscall_64.tbl      |   2 +
 arch/xtensa/kernel/syscalls/syscall.tbl     |   2 +
 fs/file_attr.c                              | 152 ++++++++++++++++++++++++++++
 include/linux/syscalls.h                    |   7 ++
 include/uapi/asm-generic/unistd.h           |   8 +-
 include/uapi/linux/fs.h                     |  18 ++++
 scripts/syscall.tbl                         |   2 +
 21 files changed, 218 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index 2dd6340de6b4..16dca28ebf17 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -507,3 +507,5 @@
 575	common	listxattrat			sys_listxattrat
 576	common	removexattrat			sys_removexattrat
 577	common	open_tree_attr			sys_open_tree_attr
+578	common	file_getattr			sys_file_getattr
+579	common	file_setattr			sys_file_setattr
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index 27c1d5ebcd91..b07e699aaa3c 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -482,3 +482,5 @@
 465	common	listxattrat			sys_listxattrat
 466	common	removexattrat			sys_removexattrat
 467	common	open_tree_attr			sys_open_tree_attr
+468	common	file_getattr			sys_file_getattr
+469	common	file_setattr			sys_file_setattr
diff --git a/arch/arm64/tools/syscall_32.tbl b/arch/arm64/tools/syscall_32.tbl
index 0765b3a8d6d6..8d9088bc577d 100644
--- a/arch/arm64/tools/syscall_32.tbl
+++ b/arch/arm64/tools/syscall_32.tbl
@@ -479,3 +479,5 @@
 465	common	listxattrat			sys_listxattrat
 466	common	removexattrat			sys_removexattrat
 467	common	open_tree_attr			sys_open_tree_attr
+468	common	file_getattr			sys_file_getattr
+469	common	file_setattr			sys_file_setattr
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 9fe47112c586..f41d38dfbf13 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -467,3 +467,5 @@
 465	common	listxattrat			sys_listxattrat
 466	common	removexattrat			sys_removexattrat
 467	common	open_tree_attr			sys_open_tree_attr
+468	common	file_getattr			sys_file_getattr
+469	common	file_setattr			sys_file_setattr
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index 7b6e97828e55..580af574fe73 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -473,3 +473,5 @@
 465	common	listxattrat			sys_listxattrat
 466	common	removexattrat			sys_removexattrat
 467	common	open_tree_attr			sys_open_tree_attr
+468	common	file_getattr			sys_file_getattr
+469	common	file_setattr			sys_file_setattr
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index aa70e371bb54..d824ffe9a014 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -406,3 +406,5 @@
 465	n32	listxattrat			sys_listxattrat
 466	n32	removexattrat			sys_removexattrat
 467	n32	open_tree_attr			sys_open_tree_attr
+468	n32	file_getattr			sys_file_getattr
+469	n32	file_setattr			sys_file_setattr
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index 1e8c44c7b614..7a7049c2c307 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -382,3 +382,5 @@
 465	n64	listxattrat			sys_listxattrat
 466	n64	removexattrat			sys_removexattrat
 467	n64	open_tree_attr			sys_open_tree_attr
+468	n64	file_getattr			sys_file_getattr
+469	n64	file_setattr			sys_file_setattr
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 114a5a1a6230..d330274f0601 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -455,3 +455,5 @@
 465	o32	listxattrat			sys_listxattrat
 466	o32	removexattrat			sys_removexattrat
 467	o32	open_tree_attr			sys_open_tree_attr
+468	o32	file_getattr			sys_file_getattr
+469	o32	file_setattr			sys_file_setattr
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index 94df3cb957e9..88a788a7b18d 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -466,3 +466,5 @@
 465	common	listxattrat			sys_listxattrat
 466	common	removexattrat			sys_removexattrat
 467	common	open_tree_attr			sys_open_tree_attr
+468	common	file_getattr			sys_file_getattr
+469	common	file_setattr			sys_file_setattr
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index 9a084bdb8926..b453e80dfc00 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -558,3 +558,5 @@
 465	common	listxattrat			sys_listxattrat
 466	common	removexattrat			sys_removexattrat
 467	common	open_tree_attr			sys_open_tree_attr
+468	common	file_getattr			sys_file_getattr
+469	common	file_setattr			sys_file_setattr
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index a4569b96ef06..8a6744d658db 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -470,3 +470,5 @@
 465  common	listxattrat		sys_listxattrat			sys_listxattrat
 466  common	removexattrat		sys_removexattrat		sys_removexattrat
 467  common	open_tree_attr		sys_open_tree_attr		sys_open_tree_attr
+468  common	file_getattr		sys_file_getattr		sys_file_getattr
+469  common	file_setattr		sys_file_setattr		sys_file_setattr
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index 52a7652fcff6..5e9c9eff5539 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -471,3 +471,5 @@
 465	common	listxattrat			sys_listxattrat
 466	common	removexattrat			sys_removexattrat
 467	common	open_tree_attr			sys_open_tree_attr
+468	common	file_getattr			sys_file_getattr
+469	common	file_setattr			sys_file_setattr
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index 83e45eb6c095..ebb7d06d1044 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -513,3 +513,5 @@
 465	common	listxattrat			sys_listxattrat
 466	common	removexattrat			sys_removexattrat
 467	common	open_tree_attr			sys_open_tree_attr
+468	common	file_getattr			sys_file_getattr
+469	common	file_setattr			sys_file_setattr
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index ac007ea00979..4877e16da69a 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -473,3 +473,5 @@
 465	i386	listxattrat		sys_listxattrat
 466	i386	removexattrat		sys_removexattrat
 467	i386	open_tree_attr		sys_open_tree_attr
+468	i386	file_getattr		sys_file_getattr
+469	i386	file_setattr		sys_file_setattr
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index cfb5ca41e30d..92cf0fe2291e 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -391,6 +391,8 @@
 465	common	listxattrat		sys_listxattrat
 466	common	removexattrat		sys_removexattrat
 467	common	open_tree_attr		sys_open_tree_attr
+468	common	file_getattr		sys_file_getattr
+469	common	file_setattr		sys_file_setattr
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index f657a77314f8..374e4cb788d8 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -438,3 +438,5 @@
 465	common	listxattrat			sys_listxattrat
 466	common	removexattrat			sys_removexattrat
 467	common	open_tree_attr			sys_open_tree_attr
+468	common	file_getattr			sys_file_getattr
+469	common	file_setattr			sys_file_setattr
diff --git a/fs/file_attr.c b/fs/file_attr.c
index 775f43fc9687..21d6a0607345 100644
--- a/fs/file_attr.c
+++ b/fs/file_attr.c
@@ -4,6 +4,10 @@
 #include <linux/fscrypt.h>
 #include <linux/fileattr.h>
 #include <linux/export.h>
+#include <linux/syscalls.h>
+#include <linux/namei.h>
+
+#include "internal.h"
 
 /**
  * fileattr_fill_xflags - initialize fileattr with xflags
@@ -90,6 +94,19 @@ int vfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
 }
 EXPORT_SYMBOL(vfs_fileattr_get);
 
+static void fileattr_to_file_attr(const struct fileattr *fa,
+				  struct file_attr *fattr)
+{
+	__u32 mask = FS_XFLAGS_MASK;
+
+	memset(fattr, 0, sizeof(struct file_attr));
+	fattr->fa_xflags = fa->fsx_xflags & mask;
+	fattr->fa_extsize = fa->fsx_extsize;
+	fattr->fa_nextents = fa->fsx_nextents;
+	fattr->fa_projid = fa->fsx_projid;
+	fattr->fa_cowextsize = fa->fsx_cowextsize;
+}
+
 /**
  * copy_fsxattr_to_user - copy fsxattr to userspace.
  * @fa:		fileattr pointer
@@ -116,6 +133,23 @@ int copy_fsxattr_to_user(const struct fileattr *fa, struct fsxattr __user *ufa)
 }
 EXPORT_SYMBOL(copy_fsxattr_to_user);
 
+static int file_attr_to_fileattr(const struct file_attr *fattr,
+				 struct fileattr *fa)
+{
+	__u32 mask = FS_XFLAGS_MASK;
+
+	if (fattr->fa_xflags & ~mask)
+		return -EINVAL;
+
+	fileattr_fill_xflags(fa, fattr->fa_xflags);
+	fa->fsx_xflags &= ~FS_XFLAG_RDONLY_MASK;
+	fa->fsx_extsize = fattr->fa_extsize;
+	fa->fsx_projid = fattr->fa_projid;
+	fa->fsx_cowextsize = fattr->fa_cowextsize;
+
+	return 0;
+}
+
 static int copy_fsxattr_from_user(struct fileattr *fa,
 				  struct fsxattr __user *ufa)
 {
@@ -344,3 +378,121 @@ int ioctl_fssetxattr(struct file *file, void __user *argp)
 	return err;
 }
 EXPORT_SYMBOL(ioctl_fssetxattr);
+
+SYSCALL_DEFINE5(file_getattr, int, dfd, const char __user *, filename,
+		struct file_attr __user *, ufattr, size_t, usize,
+		unsigned int, at_flags)
+{
+	struct path filepath __free(path_put) = {};
+	struct filename *name __free(putname) = NULL;
+	unsigned int lookup_flags = 0;
+	struct file_attr fattr;
+	struct fileattr fa;
+	int error;
+
+	BUILD_BUG_ON(sizeof(struct file_attr) < FILE_ATTR_SIZE_VER0);
+	BUILD_BUG_ON(sizeof(struct file_attr) != FILE_ATTR_SIZE_LATEST);
+
+	if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+		return -EINVAL;
+
+	if (!(at_flags & AT_SYMLINK_NOFOLLOW))
+		lookup_flags |= LOOKUP_FOLLOW;
+
+	if (usize > PAGE_SIZE)
+		return -E2BIG;
+
+	if (usize < FILE_ATTR_SIZE_VER0)
+		return -EINVAL;
+
+	name = getname_maybe_null(filename, at_flags);
+	if (IS_ERR(name))
+		return PTR_ERR(name);
+
+	if (!name && dfd >= 0) {
+		CLASS(fd, f)(dfd);
+		if (fd_empty(f))
+			return -EBADF;
+
+		filepath = fd_file(f)->f_path;
+		path_get(&filepath);
+	} else {
+		error = filename_lookup(dfd, name, lookup_flags, &filepath,
+					NULL);
+		if (error)
+			return error;
+	}
+
+	error = vfs_fileattr_get(filepath.dentry, &fa);
+	if (error)
+		return error;
+
+	fileattr_to_file_attr(&fa, &fattr);
+	error = copy_struct_to_user(ufattr, usize, &fattr,
+				    sizeof(struct file_attr), NULL);
+
+	return error;
+}
+
+SYSCALL_DEFINE5(file_setattr, int, dfd, const char __user *, filename,
+		struct file_attr __user *, ufattr, size_t, usize,
+		unsigned int, at_flags)
+{
+	struct path filepath __free(path_put) = {};
+	struct filename *name __free(putname) = NULL;
+	unsigned int lookup_flags = 0;
+	struct file_attr fattr;
+	struct fileattr fa;
+	int error;
+
+	BUILD_BUG_ON(sizeof(struct file_attr) < FILE_ATTR_SIZE_VER0);
+	BUILD_BUG_ON(sizeof(struct file_attr) != FILE_ATTR_SIZE_LATEST);
+
+	if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+		return -EINVAL;
+
+	if (!(at_flags & AT_SYMLINK_NOFOLLOW))
+		lookup_flags |= LOOKUP_FOLLOW;
+
+	if (usize > PAGE_SIZE)
+		return -E2BIG;
+
+	if (usize < FILE_ATTR_SIZE_VER0)
+		return -EINVAL;
+
+	error = copy_struct_from_user(&fattr, sizeof(struct file_attr), ufattr,
+				      usize);
+	if (error)
+		return error;
+
+	error = file_attr_to_fileattr(&fattr, &fa);
+	if (error)
+		return error;
+
+	name = getname_maybe_null(filename, at_flags);
+	if (IS_ERR(name))
+		return PTR_ERR(name);
+
+	if (!name && dfd >= 0) {
+		CLASS(fd, f)(dfd);
+		if (fd_empty(f))
+			return -EBADF;
+
+		filepath = fd_file(f)->f_path;
+		path_get(&filepath);
+	} else {
+		error = filename_lookup(dfd, name, lookup_flags, &filepath,
+					NULL);
+		if (error)
+			return error;
+	}
+
+	error = mnt_want_write(filepath.mnt);
+	if (!error) {
+		error = vfs_fileattr_set(mnt_idmap(filepath.mnt),
+					 filepath.dentry, &fa);
+		mnt_drop_write(filepath.mnt);
+	}
+
+	return error;
+}
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index e5603cc91963..77f45e5d4413 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -78,6 +78,7 @@ struct cachestat;
 struct statmount;
 struct mnt_id_req;
 struct xattr_args;
+struct file_attr;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -371,6 +372,12 @@ asmlinkage long sys_removexattrat(int dfd, const char __user *path,
 asmlinkage long sys_lremovexattr(const char __user *path,
 				 const char __user *name);
 asmlinkage long sys_fremovexattr(int fd, const char __user *name);
+asmlinkage long sys_file_getattr(int dfd, const char __user *filename,
+				 struct file_attr __user *attr, size_t usize,
+				 unsigned int at_flags);
+asmlinkage long sys_file_setattr(int dfd, const char __user *filename,
+				 struct file_attr __user *attr, size_t usize,
+				 unsigned int at_flags);
 asmlinkage long sys_getcwd(char __user *buf, unsigned long size);
 asmlinkage long sys_eventfd2(unsigned int count, int flags);
 asmlinkage long sys_epoll_create1(int flags);
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 2892a45023af..04e0077fb4c9 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -852,8 +852,14 @@ __SYSCALL(__NR_removexattrat, sys_removexattrat)
 #define __NR_open_tree_attr 467
 __SYSCALL(__NR_open_tree_attr, sys_open_tree_attr)
 
+/* fs/file_attr.c */
+#define __NR_file_getattr 468
+__SYSCALL(__NR_file_getattr, sys_file_getattr)
+#define __NR_file_setattr 469
+__SYSCALL(__NR_file_setattr, sys_file_setattr)
+
 #undef __NR_syscalls
-#define __NR_syscalls 468
+#define __NR_syscalls 470
 
 /*
  * 32 bit systems traditionally used different
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 0098b0ce8ccb..9663dbdda181 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -148,6 +148,24 @@ struct fsxattr {
 	unsigned char	fsx_pad[8];
 };
 
+/*
+ * Variable size structure for file_[sg]etattr().
+ *
+ * Note: this is an alternative to 'struct fileattr'/'struct fsxattr'.
+ * As this structure is passed to/from userspace with its size, it can
+ * be versioned based on the size.
+ */
+struct file_attr {
+	__u64 fa_xflags;	/* xflags field value (get/set) */
+	__u32 fa_extsize;	/* extsize field value (get/set)*/
+	__u32 fa_nextents;	/* nextents field value (get)   */
+	__u32 fa_projid;	/* project identifier (get/set) */
+	__u32 fa_cowextsize;	/* CoW extsize field value (get/set) */
+};
+
+#define FILE_ATTR_SIZE_VER0 24
+#define FILE_ATTR_SIZE_LATEST FILE_ATTR_SIZE_VER0
+
 /*
  * Flags for the fsx_xflags field
  */
diff --git a/scripts/syscall.tbl b/scripts/syscall.tbl
index 580b4e246aec..d1ae5e92c615 100644
--- a/scripts/syscall.tbl
+++ b/scripts/syscall.tbl
@@ -408,3 +408,5 @@
 465	common	listxattrat			sys_listxattrat
 466	common	removexattrat			sys_removexattrat
 467	common	open_tree_attr			sys_open_tree_attr
+468	common	file_getattr			sys_file_getattr
+469	common	file_setattr			sys_file_setattr
-- 
cgit v1.2.3


From 0dc7e656ddd54c3267b7cc18c1ac8ec1297ed02f Mon Sep 17 00:00:00 2001
From: Gabor Juhos <j4g8y7@gmail.com>
Date: Wed, 2 Jul 2025 14:35:23 +0200
Subject: mtd: nand: qpic-common: add defines for ECC_MODE values

Add defines for the values of the ECC_MODE field of the NAND_DEV0_ECC_CFG
register and change both the 'qcom-nandc' and 'spi-qpic-snand' drivers to
use those instead of magic numbers.

No functional changes. This is in preparation for adding 8 bit ECC strength
support for the 'spi-qpic-snand' driver.

Reviewed-by: Md Sadre Alam <quic_mdalam@quicinc.com>
Signed-off-by: Gabor Juhos <j4g8y7@gmail.com>
Acked-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://patch.msgid.link/20250702-qpic-snand-8bit-ecc-v2-1-ae2c17a30bb7@gmail.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/mtd/nand/raw/qcom_nandc.c    | 6 +++---
 drivers/spi/spi-qpic-snand.c         | 2 +-
 include/linux/mtd/nand-qpic-common.h | 2 ++
 3 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/mtd/nand/raw/qcom_nandc.c b/drivers/mtd/nand/raw/qcom_nandc.c
index 1003cf118c01..4dd6f1a4e797 100644
--- a/drivers/mtd/nand/raw/qcom_nandc.c
+++ b/drivers/mtd/nand/raw/qcom_nandc.c
@@ -1379,7 +1379,7 @@ static int qcom_nand_attach_chip(struct nand_chip *chip)
 	struct qcom_nand_controller *nandc = get_qcom_nand_controller(chip);
 	int cwperpage, bad_block_byte, ret;
 	bool wide_bus;
-	int ecc_mode = 1;
+	int ecc_mode = ECC_MODE_8BIT;
 
 	/* controller only supports 512 bytes data steps */
 	ecc->size = NANDC_STEP_SIZE;
@@ -1400,7 +1400,7 @@ static int qcom_nand_attach_chip(struct nand_chip *chip)
 	if (ecc->strength >= 8) {
 		/* 8 bit ECC defaults to BCH ECC on all platforms */
 		host->bch_enabled = true;
-		ecc_mode = 1;
+		ecc_mode = ECC_MODE_8BIT;
 
 		if (wide_bus) {
 			host->ecc_bytes_hw = 14;
@@ -1420,7 +1420,7 @@ static int qcom_nand_attach_chip(struct nand_chip *chip)
 		if (nandc->props->ecc_modes & ECC_BCH_4BIT) {
 			/* BCH */
 			host->bch_enabled = true;
-			ecc_mode = 0;
+			ecc_mode = ECC_MODE_4BIT;
 
 			if (wide_bus) {
 				host->ecc_bytes_hw = 8;
diff --git a/drivers/spi/spi-qpic-snand.c b/drivers/spi/spi-qpic-snand.c
index ca55f9bcd17b..7219bcaf4055 100644
--- a/drivers/spi/spi-qpic-snand.c
+++ b/drivers/spi/spi-qpic-snand.c
@@ -343,7 +343,7 @@ static int qcom_spi_ecc_init_ctx_pipelined(struct nand_device *nand)
 			       FIELD_PREP(ECC_SW_RESET, 0) |
 			       FIELD_PREP(ECC_NUM_DATA_BYTES_MASK, ecc_cfg->cw_data) |
 			       FIELD_PREP(ECC_FORCE_CLK_OPEN, 1) |
-			       FIELD_PREP(ECC_MODE_MASK, 0) |
+			       FIELD_PREP(ECC_MODE_MASK, ECC_MODE_4BIT) |
 			       FIELD_PREP(ECC_PARITY_SIZE_BYTES_BCH_MASK, ecc_cfg->ecc_bytes_hw);
 
 	ecc_cfg->ecc_buf_cfg = FIELD_PREP(NUM_STEPS_MASK, 0x203);
diff --git a/include/linux/mtd/nand-qpic-common.h b/include/linux/mtd/nand-qpic-common.h
index e8462deda6db..0d944db363cd 100644
--- a/include/linux/mtd/nand-qpic-common.h
+++ b/include/linux/mtd/nand-qpic-common.h
@@ -101,6 +101,8 @@
 #define	ECC_SW_RESET			BIT(1)
 #define	ECC_MODE			4
 #define	ECC_MODE_MASK			GENMASK(5, 4)
+#define	ECC_MODE_4BIT			0
+#define	ECC_MODE_8BIT			1
 #define	ECC_PARITY_SIZE_BYTES_BCH	8
 #define	ECC_PARITY_SIZE_BYTES_BCH_MASK	GENMASK(12, 8)
 #define	ECC_NUM_DATA_BYTES		16
-- 
cgit v1.2.3


From cb335325b1431152f696c53e32465ba192cd119a Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 30 Jun 2025 12:26:39 +0300
Subject: leds: Unexport of_led_get()

There are no users outside the module.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20250630092639.1574860-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/leds/led-class.c | 3 +--
 include/linux/leds.h     | 1 -
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c
index c20ac8ccf52b..669c21ef8611 100644
--- a/drivers/leds/led-class.c
+++ b/drivers/leds/led-class.c
@@ -256,7 +256,7 @@ static const struct class leds_class = {
  * Returns the LED device parsed from the phandle specified in the "leds"
  * property of a device tree node or a negative error-code on failure.
  */
-struct led_classdev *of_led_get(struct device_node *np, int index)
+static struct led_classdev *of_led_get(struct device_node *np, int index)
 {
 	struct device *led_dev;
 	struct device_node *led_node;
@@ -270,7 +270,6 @@ struct led_classdev *of_led_get(struct device_node *np, int index)
 
 	return led_module_get(led_dev);
 }
-EXPORT_SYMBOL_GPL(of_led_get);
 
 /**
  * led_put() - release a LED device
diff --git a/include/linux/leds.h b/include/linux/leds.h
index b3f0aa081064..b16b803cc1ac 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -294,7 +294,6 @@ void led_remove_lookup(struct led_lookup_data *led_lookup);
 struct led_classdev *__must_check led_get(struct device *dev, char *con_id);
 struct led_classdev *__must_check devm_led_get(struct device *dev, char *con_id);
 
-extern struct led_classdev *of_led_get(struct device_node *np, int index);
 extern void led_put(struct led_classdev *led_cdev);
 struct led_classdev *__must_check devm_of_led_get(struct device *dev,
 						  int index);
-- 
cgit v1.2.3


From b1fabef37bd504f378a203fd8b9227b8fa65b193 Mon Sep 17 00:00:00 2001
From: Yeoreum Yun <yeoreum.yun@arm.com>
Date: Wed, 18 Jun 2025 10:29:51 +0100
Subject: prctl: Introduce PR_MTE_STORE_ONLY

PR_MTE_STORE_ONLY is used to restrict MTE tag checking to store
operations only.

Signed-off-by: Yeoreum Yun <yeoreum.yun@arm.com>
Reviewed-by: Mark Brown <broonie@kernel.org>
Tested-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20250618092957.2069907-3-yeoreum.yun@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 include/uapi/linux/prctl.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 43dec6eed559..f6fb137c407f 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -244,6 +244,8 @@ struct prctl_mm_map {
 # define PR_MTE_TAG_MASK		(0xffffUL << PR_MTE_TAG_SHIFT)
 /* Unused; kept only for source compatibility */
 # define PR_MTE_TCF_SHIFT		1
+/* MTE tag check store only */
+# define PR_MTE_STORE_ONLY		(1UL << 19)
 /* RISC-V pointer masking tag length */
 # define PR_PMLEN_SHIFT			24
 # define PR_PMLEN_MASK			(0x7fUL << PR_PMLEN_SHIFT)
-- 
cgit v1.2.3


From 5e4e8c1415c181ce311a0b5936ef301edd57c5d1 Mon Sep 17 00:00:00 2001
From: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Date: Fri, 27 Jun 2025 21:42:32 +0100
Subject: dt-bindings: clock: renesas,r9a09g056/57-cpg: Add XSPI core clock

Add XSPI core clock definitions to the clock bindings for the Renesas
R9A09G056 and R9A09G057 SoCs. These clock IDs are used to support the
XSPI interface.

Signed-off-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Link: https://lore.kernel.org/20250627204237.214635-2-prabhakar.mahadev-lad.rj@bp.renesas.com
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
---
 include/dt-bindings/clock/renesas,r9a09g056-cpg.h | 1 +
 include/dt-bindings/clock/renesas,r9a09g057-cpg.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/renesas,r9a09g056-cpg.h b/include/dt-bindings/clock/renesas,r9a09g056-cpg.h
index f4905b27f8d9..a9af5af9e3a1 100644
--- a/include/dt-bindings/clock/renesas,r9a09g056-cpg.h
+++ b/include/dt-bindings/clock/renesas,r9a09g056-cpg.h
@@ -20,5 +20,6 @@
 #define R9A09G056_USB2_0_CLK_CORE0		9
 #define R9A09G056_GBETH_0_CLK_PTP_REF_I		10
 #define R9A09G056_GBETH_1_CLK_PTP_REF_I		11
+#define R9A09G056_SPI_CLK_SPI			12
 
 #endif /* __DT_BINDINGS_CLOCK_RENESAS_R9A09G056_CPG_H__ */
diff --git a/include/dt-bindings/clock/renesas,r9a09g057-cpg.h b/include/dt-bindings/clock/renesas,r9a09g057-cpg.h
index 884dbeb1e139..5346a898ab60 100644
--- a/include/dt-bindings/clock/renesas,r9a09g057-cpg.h
+++ b/include/dt-bindings/clock/renesas,r9a09g057-cpg.h
@@ -21,5 +21,6 @@
 #define R9A09G057_USB2_0_CLK_CORE1		10
 #define R9A09G057_GBETH_0_CLK_PTP_REF_I		11
 #define R9A09G057_GBETH_1_CLK_PTP_REF_I		12
+#define R9A09G057_SPI_CLK_SPI			13
 
 #endif /* __DT_BINDINGS_CLOCK_RENESAS_R9A09G057_CPG_H__ */
-- 
cgit v1.2.3


From 2a76193f7cc03de5b2745d069926ebc431dd5ba4 Mon Sep 17 00:00:00 2001
From: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Date: Wed, 25 Jun 2025 15:17:03 +0100
Subject: dt-bindings: clock: renesas,r9a09g077/87: Add SDHI_CLKHS clock ID

Add the SDHI high-speed clock (SDHI_CLKHS) definition for the Renesas
RZ/T2H (R9A09G077) and RZ/N2H (R9A09G087) SoCs. SDHI_CLKHS is used as
a core clock for the SDHI IP and operates at 800MHz.

Signed-off-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Link: https://lore.kernel.org/20250625141705.151383-2-prabhakar.mahadev-lad.rj@bp.renesas.com
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
---
 include/dt-bindings/clock/renesas,r9a09g077-cpg-mssr.h | 1 +
 include/dt-bindings/clock/renesas,r9a09g087-cpg-mssr.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/renesas,r9a09g077-cpg-mssr.h b/include/dt-bindings/clock/renesas,r9a09g077-cpg-mssr.h
index f6e5f62b07c4..7ecc4f0b235a 100644
--- a/include/dt-bindings/clock/renesas,r9a09g077-cpg-mssr.h
+++ b/include/dt-bindings/clock/renesas,r9a09g077-cpg-mssr.h
@@ -24,5 +24,6 @@
 #define R9A09G077_CLK_PCLKH		12
 #define R9A09G077_CLK_PCLKM		13
 #define R9A09G077_CLK_PCLKL		14
+#define R9A09G077_SDHI_CLKHS		15
 
 #endif /* __DT_BINDINGS_CLOCK_RENESAS_R9A09G077_CPG_H__ */
diff --git a/include/dt-bindings/clock/renesas,r9a09g087-cpg-mssr.h b/include/dt-bindings/clock/renesas,r9a09g087-cpg-mssr.h
index f28166d6015f..925e57703925 100644
--- a/include/dt-bindings/clock/renesas,r9a09g087-cpg-mssr.h
+++ b/include/dt-bindings/clock/renesas,r9a09g087-cpg-mssr.h
@@ -24,5 +24,6 @@
 #define R9A09G087_CLK_PCLKH		12
 #define R9A09G087_CLK_PCLKM		13
 #define R9A09G087_CLK_PCLKL		14
+#define R9A09G087_SDHI_CLKHS		15
 
 #endif /* __DT_BINDINGS_CLOCK_RENESAS_R9A09G087_CPG_H__ */
-- 
cgit v1.2.3


From 3715b5df09b92168a4492b48bb7ea70d89f9d8f3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 30 Jun 2025 09:35:37 +0000
Subject: net: add struct net_aligned_data

This structure will hold networking data that must
consume a full cache line to avoid accidental false sharing.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20250630093540.3052835-2-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/aligned_data.h | 16 ++++++++++++++++
 net/core/hotdata.c         |  3 +++
 2 files changed, 19 insertions(+)
 create mode 100644 include/net/aligned_data.h

(limited to 'include')

diff --git a/include/net/aligned_data.h b/include/net/aligned_data.h
new file mode 100644
index 000000000000..cf3329d7c227
--- /dev/null
+++ b/include/net/aligned_data.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _NET_ALIGNED_DATA_H
+#define _NET_ALIGNED_DATA_H
+
+#include <linux/types.h>
+
+/* Structure holding cacheline aligned fields on SMP builds.
+ * Each field or group should have an ____cacheline_aligned_in_smp
+ * attribute to ensure no accidental false sharing can happen.
+ */
+struct net_aligned_data {
+};
+
+extern struct net_aligned_data net_aligned_data;
+
+#endif /* _NET_ALIGNED_DATA_H */
diff --git a/net/core/hotdata.c b/net/core/hotdata.c
index 0bc893d5f07b..e9c03491ab00 100644
--- a/net/core/hotdata.c
+++ b/net/core/hotdata.c
@@ -2,6 +2,7 @@
 #include <linux/cache.h>
 #include <linux/jiffies.h>
 #include <linux/list.h>
+#include <net/aligned_data.h>
 #include <net/hotdata.h>
 #include <net/proto_memory.h>
 
@@ -22,3 +23,5 @@ struct net_hotdata net_hotdata __cacheline_aligned = {
 	.sysctl_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE
 };
 EXPORT_SYMBOL(net_hotdata);
+
+struct net_aligned_data net_aligned_data;
-- 
cgit v1.2.3


From 998642e999d23324c5dbf38149606d09cec2c377 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 30 Jun 2025 09:35:38 +0000
Subject: net: move net_cookie into net_aligned_data

Using per-cpu data for net->net_cookie generation is overkill,
because even busy hosts do not create hundreds of netns per second.

Make sure to put net_cookie in a private cache line to avoid
potential false sharing.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20250630093540.3052835-3-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/aligned_data.h | 2 ++
 net/core/net_namespace.c   | 8 ++------
 2 files changed, 4 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/aligned_data.h b/include/net/aligned_data.h
index cf3329d7c227..5c7badf71f04 100644
--- a/include/net/aligned_data.h
+++ b/include/net/aligned_data.h
@@ -2,6 +2,7 @@
 #ifndef _NET_ALIGNED_DATA_H
 #define _NET_ALIGNED_DATA_H
 
+#include <linux/atomic.h>
 #include <linux/types.h>
 
 /* Structure holding cacheline aligned fields on SMP builds.
@@ -9,6 +10,7 @@
  * attribute to ensure no accidental false sharing can happen.
  */
 struct net_aligned_data {
+	atomic64_t	net_cookie ____cacheline_aligned_in_smp;
 };
 
 extern struct net_aligned_data net_aligned_data;
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 419604d9cf32..f58ef920a3a1 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -19,9 +19,9 @@
 #include <linux/net_namespace.h>
 #include <linux/sched/task.h>
 #include <linux/uidgid.h>
-#include <linux/cookie.h>
 #include <linux/proc_fs.h>
 
+#include <net/aligned_data.h>
 #include <net/sock.h>
 #include <net/netlink.h>
 #include <net/net_namespace.h>
@@ -64,8 +64,6 @@ DECLARE_RWSEM(pernet_ops_rwsem);
 
 static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
 
-DEFINE_COOKIE(net_cookie);
-
 static struct net_generic *net_alloc_generic(void)
 {
 	unsigned int gen_ptrs = READ_ONCE(max_gen_ptrs);
@@ -434,9 +432,7 @@ static __net_init int setup_net(struct net *net)
 	LIST_HEAD(net_exit_list);
 	int error = 0;
 
-	preempt_disable();
-	net->net_cookie = gen_cookie_next(&net_cookie);
-	preempt_enable();
+	net->net_cookie = atomic64_inc_return(&net_aligned_data.net_cookie);
 
 	list_for_each_entry(ops, &pernet_list, list) {
 		error = ops_init(ops, net);
-- 
cgit v1.2.3


From 83081337419cb692eca4ee475d936b1fdcfd49f6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 30 Jun 2025 09:35:39 +0000
Subject: tcp: move tcp_memory_allocated into net_aligned_data

The ____cacheline_aligned_in_smp attribute only ensures that a field
is aligned to a cache line. It does not prevent the linker from using
the remainder of that cache line for other variables, which can cause
false sharing.

Move tcp_memory_allocated into a dedicated cache line.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20250630093540.3052835-4-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/aligned_data.h | 3 +++
 include/net/tcp.h          | 1 -
 net/core/hotdata.c         | 2 ++
 net/ipv4/tcp.c             | 2 --
 net/ipv4/tcp_ipv4.c        | 3 ++-
 net/ipv6/tcp_ipv6.c        | 3 ++-
 net/mptcp/protocol.c       | 3 ++-
 7 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/aligned_data.h b/include/net/aligned_data.h
index 5c7badf71f04..bedb4f86b0fe 100644
--- a/include/net/aligned_data.h
+++ b/include/net/aligned_data.h
@@ -11,6 +11,9 @@
  */
 struct net_aligned_data {
 	atomic64_t	net_cookie ____cacheline_aligned_in_smp;
+#if defined(CONFIG_INET)
+	atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp;
+#endif
 };
 
 extern struct net_aligned_data net_aligned_data;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 761c4a0ad386..bc08de49805c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -267,7 +267,6 @@ extern long sysctl_tcp_mem[3];
 #define TCP_RACK_STATIC_REO_WND  0x2 /* Use static RACK reo wnd */
 #define TCP_RACK_NO_DUPTHRESH    0x4 /* Do not use DUPACK threshold in RACK */
 
-extern atomic_long_t tcp_memory_allocated;
 DECLARE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc);
 
 extern struct percpu_counter tcp_sockets_allocated;
diff --git a/net/core/hotdata.c b/net/core/hotdata.c
index e9c03491ab00..95d0a4df1006 100644
--- a/net/core/hotdata.c
+++ b/net/core/hotdata.c
@@ -4,6 +4,7 @@
 #include <linux/list.h>
 #include <net/aligned_data.h>
 #include <net/hotdata.h>
+#include <net/ip.h>
 #include <net/proto_memory.h>
 
 struct net_hotdata net_hotdata __cacheline_aligned = {
@@ -25,3 +26,4 @@ struct net_hotdata net_hotdata __cacheline_aligned = {
 EXPORT_SYMBOL(net_hotdata);
 
 struct net_aligned_data net_aligned_data;
+EXPORT_IPV6_MOD(net_aligned_data);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8a3c99246d2e..925b2c572ca2 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -302,8 +302,6 @@ EXPORT_PER_CPU_SYMBOL_GPL(tcp_tw_isn);
 long sysctl_tcp_mem[3] __read_mostly;
 EXPORT_IPV6_MOD(sysctl_tcp_mem);
 
-atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp;	/* Current allocated memory. */
-EXPORT_IPV6_MOD(tcp_memory_allocated);
 DEFINE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc);
 EXPORT_PER_CPU_SYMBOL_GPL(tcp_memory_per_cpu_fw_alloc);
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 56223338bc0f..b406fd012b2e 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -59,6 +59,7 @@
 #include <linux/slab.h>
 #include <linux/sched.h>
 
+#include <net/aligned_data.h>
 #include <net/net_namespace.h>
 #include <net/icmp.h>
 #include <net/inet_hashtables.h>
@@ -3390,7 +3391,7 @@ struct proto tcp_prot = {
 	.sockets_allocated	= &tcp_sockets_allocated,
 	.orphan_count		= &tcp_orphan_count,
 
-	.memory_allocated	= &tcp_memory_allocated,
+	.memory_allocated	= &net_aligned_data.tcp_memory_allocated,
 	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,
 
 	.memory_pressure	= &tcp_memory_pressure,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 9fb614e17bde..ed0b891885d8 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -41,6 +41,7 @@
 #include <linux/random.h>
 #include <linux/indirect_call_wrapper.h>
 
+#include <net/aligned_data.h>
 #include <net/tcp.h>
 #include <net/ndisc.h>
 #include <net/inet6_hashtables.h>
@@ -2356,7 +2357,7 @@ struct proto tcpv6_prot = {
 	.stream_memory_free	= tcp_stream_memory_free,
 	.sockets_allocated	= &tcp_sockets_allocated,
 
-	.memory_allocated	= &tcp_memory_allocated,
+	.memory_allocated	= &net_aligned_data.tcp_memory_allocated,
 	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,
 
 	.memory_pressure	= &tcp_memory_pressure,
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index e7972e633236..5f904fc5ac4c 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -11,6 +11,7 @@
 #include <linux/netdevice.h>
 #include <linux/sched/signal.h>
 #include <linux/atomic.h>
+#include <net/aligned_data.h>
 #include <net/sock.h>
 #include <net/inet_common.h>
 #include <net/inet_hashtables.h>
@@ -3729,7 +3730,7 @@ static struct proto mptcp_prot = {
 	.stream_memory_free	= mptcp_stream_memory_free,
 	.sockets_allocated	= &mptcp_sockets_allocated,
 
-	.memory_allocated	= &tcp_memory_allocated,
+	.memory_allocated	= &net_aligned_data.tcp_memory_allocated,
 	.per_cpu_fw_alloc	= &tcp_memory_per_cpu_fw_alloc,
 
 	.memory_pressure	= &tcp_memory_pressure,
-- 
cgit v1.2.3


From e3d4825124bce0d1f72187fabcf972b7c0b6cb9b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 30 Jun 2025 09:35:40 +0000
Subject: udp: move udp_memory_allocated into net_aligned_data

The ____cacheline_aligned_in_smp attribute only ensures that a field
is aligned to a cache line. It does not prevent the linker from using
the remainder of that cache line for other variables, which can cause
false sharing.

Move udp_memory_allocated into a dedicated cache line.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20250630093540.3052835-5-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/aligned_data.h | 1 +
 include/net/udp.h          | 1 -
 net/ipv4/udp.c             | 4 +---
 net/ipv4/udp_impl.h        | 1 +
 net/ipv4/udplite.c         | 2 +-
 net/ipv6/udp.c             | 2 +-
 net/ipv6/udp_impl.h        | 1 +
 net/ipv6/udplite.c         | 2 +-
 8 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/aligned_data.h b/include/net/aligned_data.h
index bedb4f86b0fe..e1a1c8aedc79 100644
--- a/include/net/aligned_data.h
+++ b/include/net/aligned_data.h
@@ -13,6 +13,7 @@ struct net_aligned_data {
 	atomic64_t	net_cookie ____cacheline_aligned_in_smp;
 #if defined(CONFIG_INET)
 	atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp;
+	atomic_long_t udp_memory_allocated ____cacheline_aligned_in_smp;
 #endif
 };
 
diff --git a/include/net/udp.h b/include/net/udp.h
index a772510b2aa5..f8ae2c4ade14 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -205,7 +205,6 @@ static inline void udp_hash4_dec(struct udp_hslot *hslot2)
 
 extern struct proto udp_prot;
 
-extern atomic_long_t udp_memory_allocated;
 DECLARE_PER_CPU(int, udp_memory_per_cpu_fw_alloc);
 
 /* sysctl variables for udp */
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 19573ee64a0f..49f43c54cfb0 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -127,8 +127,6 @@ struct udp_table udp_table __read_mostly;
 long sysctl_udp_mem[3] __read_mostly;
 EXPORT_IPV6_MOD(sysctl_udp_mem);
 
-atomic_long_t udp_memory_allocated ____cacheline_aligned_in_smp;
-EXPORT_IPV6_MOD(udp_memory_allocated);
 DEFINE_PER_CPU(int, udp_memory_per_cpu_fw_alloc);
 EXPORT_PER_CPU_SYMBOL_GPL(udp_memory_per_cpu_fw_alloc);
 
@@ -3235,7 +3233,7 @@ struct proto udp_prot = {
 #ifdef CONFIG_BPF_SYSCALL
 	.psock_update_sk_prot	= udp_bpf_update_proto,
 #endif
-	.memory_allocated	= &udp_memory_allocated,
+	.memory_allocated	= &net_aligned_data.udp_memory_allocated,
 	.per_cpu_fw_alloc	= &udp_memory_per_cpu_fw_alloc,
 
 	.sysctl_mem		= sysctl_udp_mem,
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index e1ff3a375996..c7142213fc21 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -1,6 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _UDP4_IMPL_H
 #define _UDP4_IMPL_H
+#include <net/aligned_data.h>
 #include <net/udp.h>
 #include <net/udplite.h>
 #include <net/protocol.h>
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index af37af3ab727..d3e621a11a1a 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -60,7 +60,7 @@ struct proto 	udplite_prot = {
 	.rehash		   = udp_v4_rehash,
 	.get_port	   = udp_v4_get_port,
 
-	.memory_allocated  = &udp_memory_allocated,
+	.memory_allocated  = &net_aligned_data.udp_memory_allocated,
 	.per_cpu_fw_alloc  = &udp_memory_per_cpu_fw_alloc,
 
 	.sysctl_mem	   = sysctl_udp_mem,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index ebb95d8bc681..6bbdadbd5fec 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1925,7 +1925,7 @@ struct proto udpv6_prot = {
 	.psock_update_sk_prot	= udp_bpf_update_proto,
 #endif
 
-	.memory_allocated	= &udp_memory_allocated,
+	.memory_allocated	= &net_aligned_data.udp_memory_allocated,
 	.per_cpu_fw_alloc	= &udp_memory_per_cpu_fw_alloc,
 
 	.sysctl_mem		= sysctl_udp_mem,
diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h
index 0590f566379d..8a406be25a3a 100644
--- a/net/ipv6/udp_impl.h
+++ b/net/ipv6/udp_impl.h
@@ -1,6 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _UDP6_IMPL_H
 #define _UDP6_IMPL_H
+#include <net/aligned_data.h>
 #include <net/udp.h>
 #include <net/udplite.h>
 #include <net/protocol.h>
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index a60bec9b14f1..2cec542437f7 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -59,7 +59,7 @@ struct proto udplitev6_prot = {
 	.rehash		   = udp_v6_rehash,
 	.get_port	   = udp_v6_get_port,
 
-	.memory_allocated  = &udp_memory_allocated,
+	.memory_allocated  = &net_aligned_data.udp_memory_allocated,
 	.per_cpu_fw_alloc  = &udp_memory_per_cpu_fw_alloc,
 
 	.sysctl_mem	   = sysctl_udp_mem,
-- 
cgit v1.2.3


From 8a402bbe54760dea67f1b2980c727761b47994d7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 30 Jun 2025 12:19:25 +0000
Subject: net: dst: annotate data-races around dst->obsolete

(dst_entry)->obsolete is read locklessly. Add the corresponding
READ_ONCE()/WRITE_ONCE() annotations.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20250630121934.3399505-2-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dst.h               |  2 +-
 net/core/dst.c                  |  2 +-
 net/core/dst_cache.c            |  2 +-
 net/core/neighbour.c            |  3 ++-
 net/core/sock.c                 |  4 ++--
 net/ipv4/datagram.c             |  2 +-
 net/ipv4/route.c                | 15 ++++++++-------
 net/ipv6/datagram.c             |  2 +-
 net/ipv6/route.c                |  9 ++++-----
 net/netfilter/ipvs/ip_vs_xmit.c |  2 +-
 net/sctp/transport.c            |  2 +-
 net/xfrm/xfrm_policy.c          |  4 ++--
 12 files changed, 25 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/net/dst.h b/include/net/dst.h
index 78c78cdce0e9..76c30c3b22dd 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -476,7 +476,7 @@ INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
 							   u32));
 static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie)
 {
-	if (dst->obsolete)
+	if (READ_ONCE(dst->obsolete))
 		dst = INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check,
 					 ipv4_dst_check, dst, cookie);
 	return dst;
diff --git a/net/core/dst.c b/net/core/dst.c
index 795ca07e28a4..8f2a3138d60c 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -145,7 +145,7 @@ void dst_dev_put(struct dst_entry *dst)
 {
 	struct net_device *dev = dst->dev;
 
-	dst->obsolete = DST_OBSOLETE_DEAD;
+	WRITE_ONCE(dst->obsolete, DST_OBSOLETE_DEAD);
 	if (dst->ops->ifdown)
 		dst->ops->ifdown(dst, dev);
 	dst->input = dst_discard;
diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c
index 93a04d18e505..9ab4902324e1 100644
--- a/net/core/dst_cache.c
+++ b/net/core/dst_cache.c
@@ -52,7 +52,7 @@ static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache,
 
 	if (unlikely(!time_after(idst->refresh_ts,
 				 READ_ONCE(dst_cache->reset_ts)) ||
-		     (dst->obsolete && !dst->ops->check(dst, idst->cookie)))) {
+		     (READ_ONCE(dst->obsolete) && !dst->ops->check(dst, idst->cookie)))) {
 		dst_cache_per_cpu_dst_set(idst, NULL, 0);
 		dst_release(dst);
 		goto fail;
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index e5f0992ac364..d1de7f292eea 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1428,7 +1428,8 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr,
 			 * we can reinject the packet there.
 			 */
 			n2 = NULL;
-			if (dst && dst->obsolete != DST_OBSOLETE_DEAD) {
+			if (dst &&
+			    READ_ONCE(dst->obsolete) != DST_OBSOLETE_DEAD) {
 				n2 = dst_neigh_lookup_skb(dst, skb);
 				if (n2)
 					n1 = n2;
diff --git a/net/core/sock.c b/net/core/sock.c
index 3a71d6c4ccf0..dc59fb7760a3 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -602,7 +602,7 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 {
 	struct dst_entry *dst = __sk_dst_get(sk);
 
-	if (dst && dst->obsolete &&
+	if (dst && READ_ONCE(dst->obsolete) &&
 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 			       dst, cookie) == NULL) {
 		sk_tx_queue_clear(sk);
@@ -620,7 +620,7 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 {
 	struct dst_entry *dst = sk_dst_get(sk);
 
-	if (dst && dst->obsolete &&
+	if (dst && READ_ONCE(dst->obsolete) &&
 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 			       dst, cookie) == NULL) {
 		sk_dst_reset(sk);
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 4b5bc6eb52e7..c2b2cda1a7e5 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -109,7 +109,7 @@ void ip4_datagram_release_cb(struct sock *sk)
 	rcu_read_lock();
 
 	dst = __sk_dst_get(sk);
-	if (!dst || !dst->obsolete || dst->ops->check(dst, 0)) {
+	if (!dst || !READ_ONCE(dst->obsolete) || dst->ops->check(dst, 0)) {
 		rcu_read_unlock();
 		return;
 	}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a2b7cadf66af..d32af8c16727 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -717,7 +717,7 @@ static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
 		 */
 		rt = rcu_dereference(nhc->nhc_rth_input);
 		if (rt)
-			rt->dst.obsolete = DST_OBSOLETE_KILL;
+			WRITE_ONCE(rt->dst.obsolete, DST_OBSOLETE_KILL);
 
 		for_each_possible_cpu(i) {
 			struct rtable __rcu **prt;
@@ -725,7 +725,7 @@ static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
 			prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
 			rt = rcu_dereference(*prt);
 			if (rt)
-				rt->dst.obsolete = DST_OBSOLETE_KILL;
+				WRITE_ONCE(rt->dst.obsolete, DST_OBSOLETE_KILL);
 		}
 	}
 
@@ -797,7 +797,7 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
 						jiffies + ip_rt_gc_timeout);
 			}
 			if (kill_route)
-				rt->dst.obsolete = DST_OBSOLETE_KILL;
+				WRITE_ONCE(rt->dst.obsolete, DST_OBSOLETE_KILL);
 			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 		}
 		neigh_release(n);
@@ -842,7 +842,7 @@ static void ipv4_negative_advice(struct sock *sk,
 {
 	struct rtable *rt = dst_rtable(dst);
 
-	if ((dst->obsolete > 0) ||
+	if ((READ_ONCE(dst->obsolete) > 0) ||
 	    (rt->rt_flags & RTCF_REDIRECTED) ||
 	    rt->dst.expires)
 		sk_dst_reset(sk);
@@ -1136,7 +1136,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
 
 	rt = dst_rtable(odst);
-	if (odst->obsolete && !odst->ops->check(odst, 0)) {
+	if (READ_ONCE(odst->obsolete) && !odst->ops->check(odst, 0)) {
 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
 		if (IS_ERR(rt))
 			goto out;
@@ -1211,7 +1211,8 @@ INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst,
 	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
 	 * DST_OBSOLETE_DEAD.
 	 */
-	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
+	if (READ_ONCE(dst->obsolete) != DST_OBSOLETE_FORCE_CHK ||
+	    rt_is_expired(rt))
 		return NULL;
 	return dst;
 }
@@ -1571,7 +1572,7 @@ void rt_flush_dev(struct net_device *dev)
 static bool rt_cache_valid(const struct rtable *rt)
 {
 	return	rt &&
-		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
+		READ_ONCE(rt->dst.obsolete) == DST_OBSOLETE_FORCE_CHK &&
 		!rt_is_expired(rt);
 }
 
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 281722817a65..972bf0426d59 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -127,7 +127,7 @@ void ip6_datagram_release_cb(struct sock *sk)
 
 	rcu_read_lock();
 	dst = __sk_dst_get(sk);
-	if (!dst || !dst->obsolete ||
+	if (!dst || !READ_ONCE(dst->obsolete) ||
 	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) {
 		rcu_read_unlock();
 		return;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 46a4f9d1900f..ace2071f77bd 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -406,7 +406,7 @@ static bool rt6_check_expired(const struct rt6_info *rt)
 		if (time_after(jiffies, rt->dst.expires))
 			return true;
 	} else if (from) {
-		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
+		return READ_ONCE(rt->dst.obsolete) != DST_OBSOLETE_FORCE_CHK ||
 			fib6_check_expired(from);
 	}
 	return false;
@@ -2777,11 +2777,10 @@ static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
 					    u32 cookie)
 {
 	if (!__rt6_check_expired(rt) &&
-	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
+	    READ_ONCE(rt->dst.obsolete) == DST_OBSOLETE_FORCE_CHK &&
 	    fib6_check(from, cookie))
 		return &rt->dst;
-	else
-		return NULL;
+	return NULL;
 }
 
 INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst,
@@ -3014,7 +3013,7 @@ void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
 			sk_uid(sk));
 
 	dst = __sk_dst_get(sk);
-	if (!dst || !dst->obsolete ||
+	if (!dst || !READ_ONCE(dst->obsolete) ||
 	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
 		return;
 
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 014f07740369..95af252b2939 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -97,7 +97,7 @@ __ip_vs_dst_check(struct ip_vs_dest *dest)
 	if (!dest_dst)
 		return NULL;
 	dst = dest_dst->dst_cache;
-	if (dst->obsolete &&
+	if (READ_ONCE(dst->obsolete) &&
 	    dst->ops->check(dst, dest_dst->dst_cookie) == NULL)
 		return NULL;
 	return dest_dst;
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 6946c1462793..4d258a6e8033 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -240,7 +240,7 @@ void sctp_transport_set_owner(struct sctp_transport *transport,
 void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk)
 {
 	/* If we don't have a fresh route, look one up */
-	if (!transport->dst || transport->dst->obsolete) {
+	if (!transport->dst || READ_ONCE(transport->dst->obsolete)) {
 		sctp_transport_dst_release(transport);
 		transport->af_specific->get_dst(transport, &transport->saddr,
 						&transport->fl, sk);
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 094d2454602e..c5035a9bc3bb 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -3925,7 +3925,7 @@ static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie)
 	 * This will force stale_bundle() to fail on any xdst bundle with
 	 * this dst linked in it.
 	 */
-	if (dst->obsolete < 0 && !stale_bundle(dst))
+	if (READ_ONCE(dst->obsolete) < 0 && !stale_bundle(dst))
 		return dst;
 
 	return NULL;
@@ -3953,7 +3953,7 @@ static void xfrm_link_failure(struct sk_buff *skb)
 
 static void xfrm_negative_advice(struct sock *sk, struct dst_entry *dst)
 {
-	if (dst->obsolete)
+	if (READ_ONCE(dst->obsolete))
 		sk_dst_reset(sk);
 }
 
-- 
cgit v1.2.3


From 36229b2caca2228b834c03fb83867022485a0563 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 30 Jun 2025 12:19:26 +0000
Subject: net: dst: annotate data-races around dst->expires

(dst_entry)->expires is read and written locklessly;
add the corresponding annotations.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20250630121934.3399505-3-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dst.h |  8 +++++---
 include/net/ip.h  |  2 +-
 net/ipv4/route.c  |  7 ++++---
 net/ipv6/route.c  | 13 ++++++-------
 4 files changed, 16 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/net/dst.h b/include/net/dst.h
index 76c30c3b22dd..1efe1e5d51a9 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -431,13 +431,15 @@ static inline void dst_link_failure(struct sk_buff *skb)
 
 static inline void dst_set_expires(struct dst_entry *dst, int timeout)
 {
-	unsigned long expires = jiffies + timeout;
+	unsigned long old, expires = jiffies + timeout;
 
 	if (expires == 0)
 		expires = 1;
 
-	if (dst->expires == 0 || time_before(expires, dst->expires))
-		dst->expires = expires;
+	old = READ_ONCE(dst->expires);
+
+	if (!old || time_before(expires, old))
+		WRITE_ONCE(dst->expires, expires);
 }
 
 static inline unsigned int dst_dev_overhead(struct dst_entry *dst,
diff --git a/include/net/ip.h b/include/net/ip.h
index 375304bb99f6..391af454422e 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -477,7 +477,7 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
 	    ip_mtu_locked(dst) ||
 	    !forwarding) {
 		mtu = rt->rt_pmtu;
-		if (mtu && time_before(jiffies, rt->dst.expires))
+		if (mtu && time_before(jiffies, READ_ONCE(rt->dst.expires)))
 			goto out;
 	}
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d32af8c16727..d7a534a5f1ff 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -844,7 +844,7 @@ static void ipv4_negative_advice(struct sock *sk,
 
 	if ((READ_ONCE(dst->obsolete) > 0) ||
 	    (rt->rt_flags & RTCF_REDIRECTED) ||
-	    rt->dst.expires)
+	    READ_ONCE(rt->dst.expires))
 		sk_dst_reset(sk);
 }
 
@@ -1033,7 +1033,8 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 	}
 
 	if (rt->rt_pmtu == mtu && !lock &&
-	    time_before(jiffies, dst->expires - net->ipv4.ip_rt_mtu_expires / 2))
+	    time_before(jiffies, READ_ONCE(dst->expires) -
+				 net->ipv4.ip_rt_mtu_expires / 2))
 		goto out;
 
 	if (fib_lookup(net, fl4, &res, 0) == 0) {
@@ -3010,7 +3011,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
 		}
 	}
 
-	expires = rt->dst.expires;
+	expires = READ_ONCE(rt->dst.expires);
 	if (expires) {
 		unsigned long now = jiffies;
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index ace2071f77bd..1014dcea1200 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -391,9 +391,8 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev)
 static bool __rt6_check_expired(const struct rt6_info *rt)
 {
 	if (rt->rt6i_flags & RTF_EXPIRES)
-		return time_after(jiffies, rt->dst.expires);
-	else
-		return false;
+		return time_after(jiffies, READ_ONCE(rt->dst.expires));
+	return false;
 }
 
 static bool rt6_check_expired(const struct rt6_info *rt)
@@ -403,7 +402,7 @@ static bool rt6_check_expired(const struct rt6_info *rt)
 	from = rcu_dereference(rt->from);
 
 	if (rt->rt6i_flags & RTF_EXPIRES) {
-		if (time_after(jiffies, rt->dst.expires))
+		if (time_after(jiffies, READ_ONCE(rt->dst.expires)))
 			return true;
 	} else if (from) {
 		return READ_ONCE(rt->dst.obsolete) != DST_OBSOLETE_FORCE_CHK ||
@@ -2139,7 +2138,7 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
 			rt6_remove_exception(bucket, rt6_ex);
 			return;
 		}
-	} else if (time_after(jiffies, rt->dst.expires)) {
+	} else if (time_after(jiffies, READ_ONCE(rt->dst.expires))) {
 		pr_debug("purging expired route %p\n", rt);
 		rt6_remove_exception(bucket, rt6_ex);
 		return;
@@ -2870,7 +2869,7 @@ static void rt6_update_expires(struct rt6_info *rt0, int timeout)
 		rcu_read_lock();
 		from = rcu_dereference(rt0->from);
 		if (from)
-			rt0->dst.expires = from->expires;
+			WRITE_ONCE(rt0->dst.expires, from->expires);
 		rcu_read_unlock();
 	}
 
@@ -5903,7 +5902,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 	}
 
 	if (rt6_flags & RTF_EXPIRES) {
-		expires = dst ? dst->expires : rt->expires;
+		expires = dst ? READ_ONCE(dst->expires) : rt->expires;
 		expires -= jiffies;
 	}
 
-- 
cgit v1.2.3


From 8f2b2282d04a5d5bcbec22f91572bb6803cfc771 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 30 Jun 2025 12:19:27 +0000
Subject: net: dst: annotate data-races around dst->lastuse

(dst_entry)->lastuse is read and written locklessly;
add the corresponding annotations.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20250630121934.3399505-4-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dst.h    | 4 ++--
 net/core/rtnetlink.c | 4 +++-
 net/ipv6/route.c     | 3 ++-
 3 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/dst.h b/include/net/dst.h
index 1efe1e5d51a9..bef2f41c7220 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -240,9 +240,9 @@ static inline void dst_hold(struct dst_entry *dst)
 
 static inline void dst_use_noref(struct dst_entry *dst, unsigned long time)
 {
-	if (unlikely(time != dst->lastuse)) {
+	if (unlikely(time != READ_ONCE(dst->lastuse))) {
 		dst->__use++;
-		dst->lastuse = time;
+		WRITE_ONCE(dst->lastuse, time);
 	}
 }
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index c57692eb8da9..a9555bfc372f 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1026,9 +1026,11 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
 		.rta_error = error,
 		.rta_id =  id,
 	};
+	unsigned long delta;
 
 	if (dst) {
-		ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse);
+		delta = jiffies - READ_ONCE(dst->lastuse);
+		ci.rta_lastuse = jiffies_delta_to_clock_t(delta);
 		ci.rta_used = dst->__use;
 		ci.rta_clntref = rcuref_read(&dst->__rcuref);
 	}
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 1014dcea1200..375112a59492 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2133,7 +2133,8 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
 	 * expired, independently from their aging, as per RFC 8201 section 4
 	 */
 	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
-		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
+		if (time_after_eq(now, READ_ONCE(rt->dst.lastuse) +
+				       gc_args->timeout)) {
 			pr_debug("aging clone %p\n", rt);
 			rt6_remove_exception(bucket, rt6_ex);
 			return;
-- 
cgit v1.2.3


From f1c5fd34891a1c242885f48c2e4dc52df180f311 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 30 Jun 2025 12:19:28 +0000
Subject: net: dst: annotate data-races around dst->input

dst_dev_put() can overwrite dst->input while other
CPUs might read this field (for instance from dst_input()).

Add READ_ONCE()/WRITE_ONCE() annotations to suppress
potential issues.

We will likely need full RCU protection later.

Fixes: 4a6ce2b6f2ec ("net: introduce a new function dst_dev_put()")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20250630121934.3399505-5-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dst.h      | 2 +-
 include/net/lwtunnel.h | 4 ++--
 net/core/dst.c         | 2 +-
 net/ipv4/route.c       | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/dst.h b/include/net/dst.h
index bef2f41c7220..c0f8b6d8e707 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -468,7 +468,7 @@ INDIRECT_CALLABLE_DECLARE(int ip_local_deliver(struct sk_buff *));
 /* Input packet from network to transport.  */
 static inline int dst_input(struct sk_buff *skb)
 {
-	return INDIRECT_CALL_INET(skb_dst(skb)->input,
+	return INDIRECT_CALL_INET(READ_ONCE(skb_dst(skb)->input),
 				  ip6_input, ip_local_deliver, skb);
 }
 
diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index c306ebe379a0..eaac07d50595 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -142,8 +142,8 @@ static inline void lwtunnel_set_redirect(struct dst_entry *dst)
 		dst->output = lwtunnel_output;
 	}
 	if (lwtunnel_input_redirect(dst->lwtstate)) {
-		dst->lwtstate->orig_input = dst->input;
-		dst->input = lwtunnel_input;
+		dst->lwtstate->orig_input = READ_ONCE(dst->input);
+		WRITE_ONCE(dst->input, lwtunnel_input);
 	}
 }
 #else
diff --git a/net/core/dst.c b/net/core/dst.c
index 8f2a3138d60c..13c629dc7123 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -148,7 +148,7 @@ void dst_dev_put(struct dst_entry *dst)
 	WRITE_ONCE(dst->obsolete, DST_OBSOLETE_DEAD);
 	if (dst->ops->ifdown)
 		dst->ops->ifdown(dst, dev);
-	dst->input = dst_discard;
+	WRITE_ONCE(dst->input, dst_discard);
 	dst->output = dst_discard_out;
 	dst->dev = blackhole_netdev;
 	netdev_ref_replace(dev, blackhole_netdev, &dst->dev_tracker,
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d7a534a5f1ff..75a1f9eabd6b 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1687,7 +1687,7 @@ struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
 		else if (rt->rt_gw_family == AF_INET6)
 			new_rt->rt_gw6 = rt->rt_gw6;
 
-		new_rt->dst.input = rt->dst.input;
+		new_rt->dst.input = READ_ONCE(rt->dst.input);
 		new_rt->dst.output = rt->dst.output;
 		new_rt->dst.error = rt->dst.error;
 		new_rt->dst.lastuse = jiffies;
-- 
cgit v1.2.3


From 2dce8c52a98995c4719def6f88629ab1581c0b82 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 30 Jun 2025 12:19:29 +0000
Subject: net: dst: annotate data-races around dst->output

dst_dev_put() can overwrite dst->output while other
CPUs might read this field (for instance from dst_output()).

Add READ_ONCE()/WRITE_ONCE() annotations to suppress
potential issues.

We will likely need RCU protection in the future.

Fixes: 4a6ce2b6f2ec ("net: introduce a new function dst_dev_put()")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20250630121934.3399505-6-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dst.h      | 2 +-
 include/net/lwtunnel.h | 4 ++--
 net/core/dst.c         | 2 +-
 net/ipv4/route.c       | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/dst.h b/include/net/dst.h
index c0f8b6d8e707..b6acfde7d587 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -458,7 +458,7 @@ INDIRECT_CALLABLE_DECLARE(int ip_output(struct net *, struct sock *,
 /* Output packet to network from transport.  */
 static inline int dst_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
-	return INDIRECT_CALL_INET(skb_dst(skb)->output,
+	return INDIRECT_CALL_INET(READ_ONCE(skb_dst(skb)->output),
 				  ip6_output, ip_output,
 				  net, sk, skb);
 }
diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index eaac07d50595..26232f603e33 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -138,8 +138,8 @@ int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
 static inline void lwtunnel_set_redirect(struct dst_entry *dst)
 {
 	if (lwtunnel_output_redirect(dst->lwtstate)) {
-		dst->lwtstate->orig_output = dst->output;
-		dst->output = lwtunnel_output;
+		dst->lwtstate->orig_output = READ_ONCE(dst->output);
+		WRITE_ONCE(dst->output, lwtunnel_output);
 	}
 	if (lwtunnel_input_redirect(dst->lwtstate)) {
 		dst->lwtstate->orig_input = READ_ONCE(dst->input);
diff --git a/net/core/dst.c b/net/core/dst.c
index 13c629dc7123..52e824e57c17 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -149,7 +149,7 @@ void dst_dev_put(struct dst_entry *dst)
 	if (dst->ops->ifdown)
 		dst->ops->ifdown(dst, dev);
 	WRITE_ONCE(dst->input, dst_discard);
-	dst->output = dst_discard_out;
+	WRITE_ONCE(dst->output, dst_discard_out);
 	dst->dev = blackhole_netdev;
 	netdev_ref_replace(dev, blackhole_netdev, &dst->dev_tracker,
 			   GFP_ATOMIC);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 75a1f9eabd6b..ce6aba4f01ff 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1688,7 +1688,7 @@ struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
 			new_rt->rt_gw6 = rt->rt_gw6;
 
 		new_rt->dst.input = READ_ONCE(rt->dst.input);
-		new_rt->dst.output = rt->dst.output;
+		new_rt->dst.output = READ_ONCE(rt->dst.output);
 		new_rt->dst.error = rt->dst.error;
 		new_rt->dst.lastuse = jiffies;
 		new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
-- 
cgit v1.2.3


From 88fe14253e181878c2ddb51a298ae8c468a63010 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 30 Jun 2025 12:19:30 +0000
Subject: net: dst: add four helpers to annotate data-races around dst->dev

dst->dev is read locklessly in many contexts,
and written in dst_dev_put().

Fixing all the races is going to need many changes.

We probably will have to add full RCU protection.

Add four helpers to ease this painful process.

static inline struct net_device *dst_dev(const struct dst_entry *dst)
{
       return READ_ONCE(dst->dev);
}

static inline struct net_device *skb_dst_dev(const struct sk_buff *skb)
{
       return dst_dev(skb_dst(skb));
}

static inline struct net *skb_dst_dev_net(const struct sk_buff *skb)
{
       return dev_net(skb_dst_dev(skb));
}

static inline struct net *skb_dst_dev_net_rcu(const struct sk_buff *skb)
{
       return dev_net_rcu(skb_dst_dev(skb));
}

Fixes: 4a6ce2b6f2ec ("net: introduce a new function dst_dev_put()")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20250630121934.3399505-7-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dst.h | 20 ++++++++++++++++++++
 net/core/dst.c    |  4 ++--
 net/core/sock.c   |  8 ++++----
 3 files changed, 26 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/dst.h b/include/net/dst.h
index b6acfde7d587..00467c1b5093 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -563,6 +563,26 @@ static inline void skb_dst_update_pmtu_no_confirm(struct sk_buff *skb, u32 mtu)
 		dst->ops->update_pmtu(dst, NULL, skb, mtu, false);
 }
 
+static inline struct net_device *dst_dev(const struct dst_entry *dst)
+{
+	return READ_ONCE(dst->dev);
+}
+
+static inline struct net_device *skb_dst_dev(const struct sk_buff *skb)
+{
+	return dst_dev(skb_dst(skb));
+}
+
+static inline struct net *skb_dst_dev_net(const struct sk_buff *skb)
+{
+	return dev_net(skb_dst_dev(skb));
+}
+
+static inline struct net *skb_dst_dev_net_rcu(const struct sk_buff *skb)
+{
+	return dev_net_rcu(skb_dst_dev(skb));
+}
+
 struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie);
 void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
 			       struct sk_buff *skb, u32 mtu, bool confirm_neigh);
diff --git a/net/core/dst.c b/net/core/dst.c
index 52e824e57c17..e2de8b68c41d 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -150,7 +150,7 @@ void dst_dev_put(struct dst_entry *dst)
 		dst->ops->ifdown(dst, dev);
 	WRITE_ONCE(dst->input, dst_discard);
 	WRITE_ONCE(dst->output, dst_discard_out);
-	dst->dev = blackhole_netdev;
+	WRITE_ONCE(dst->dev, blackhole_netdev);
 	netdev_ref_replace(dev, blackhole_netdev, &dst->dev_tracker,
 			   GFP_ATOMIC);
 }
@@ -263,7 +263,7 @@ unsigned int dst_blackhole_mtu(const struct dst_entry *dst)
 {
 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
 
-	return mtu ? : dst->dev->mtu;
+	return mtu ? : dst_dev(dst)->mtu;
 }
 EXPORT_SYMBOL_GPL(dst_blackhole_mtu);
 
diff --git a/net/core/sock.c b/net/core/sock.c
index dc59fb7760a3..8b7623c7d547 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2588,8 +2588,8 @@ static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
 		   !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
 #endif
 	/* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
-	max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
-			READ_ONCE(dst->dev->gso_ipv4_max_size);
+	max_size = is_ipv6 ? READ_ONCE(dst_dev(dst)->gso_max_size) :
+			READ_ONCE(dst_dev(dst)->gso_ipv4_max_size);
 	if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
 		max_size = GSO_LEGACY_MAX_SIZE;
 
@@ -2600,7 +2600,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
 {
 	u32 max_segs = 1;
 
-	sk->sk_route_caps = dst->dev->features;
+	sk->sk_route_caps = dst_dev(dst)->features;
 	if (sk_is_tcp(sk)) {
 		struct inet_connection_sock *icsk = inet_csk(sk);
 
@@ -2618,7 +2618,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
 			sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
 			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
-			max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
+			max_segs = max_t(u32, READ_ONCE(dst_dev(dst)->gso_max_segs), 1);
 		}
 	}
 	sk->sk_gso_max_segs = max_segs;
-- 
cgit v1.2.3


From a74fc62eec155ca5a6da8ff3856f3dc87fe24558 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 30 Jun 2025 12:19:31 +0000
Subject: ipv4: adopt dst_dev, skb_dst_dev and skb_dst_dev_net[_rcu]

Use the new helpers as a first step to deal with
potential dst->dev races.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20250630121934.3399505-8-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/inet_hashtables.h |  2 +-
 include/net/ip.h              | 11 ++++++-----
 include/net/route.h           |  2 +-
 net/ipv4/icmp.c               | 24 +++++++++++++-----------
 net/ipv4/igmp.c               |  2 +-
 net/ipv4/ip_fragment.c        |  2 +-
 net/ipv4/ip_output.c          |  6 +++---
 net/ipv4/ip_vti.c             |  4 ++--
 net/ipv4/netfilter.c          |  4 ++--
 net/ipv4/route.c              |  8 ++++----
 net/ipv4/tcp_fastopen.c       |  4 +++-
 net/ipv4/tcp_ipv4.c           |  2 +-
 net/ipv4/tcp_metrics.c        |  8 ++++----
 net/ipv4/xfrm4_output.c       |  2 +-
 14 files changed, 43 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index ae09e91398a5..19dbd9081d5a 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -481,7 +481,7 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo,
 					     const int sdif,
 					     bool *refcounted)
 {
-	struct net *net = dev_net_rcu(skb_dst(skb)->dev);
+	struct net *net = skb_dst_dev_net_rcu(skb);
 	const struct iphdr *iph = ip_hdr(skb);
 	struct sock *sk;
 
diff --git a/include/net/ip.h b/include/net/ip.h
index 391af454422e..befcba575129 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -472,7 +472,7 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
 
 	rcu_read_lock();
 
-	net = dev_net_rcu(dst->dev);
+	net = dev_net_rcu(dst_dev(dst));
 	if (READ_ONCE(net->ipv4.sysctl_ip_fwd_use_pmtu) ||
 	    ip_mtu_locked(dst) ||
 	    !forwarding) {
@@ -486,7 +486,7 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
 	if (mtu)
 		goto out;
 
-	mtu = READ_ONCE(dst->dev->mtu);
+	mtu = READ_ONCE(dst_dev(dst)->mtu);
 
 	if (unlikely(ip_mtu_locked(dst))) {
 		if (rt->rt_uses_gateway && mtu > 576)
@@ -506,16 +506,17 @@ out:
 static inline unsigned int ip_skb_dst_mtu(struct sock *sk,
 					  const struct sk_buff *skb)
 {
+	const struct dst_entry *dst = skb_dst(skb);
 	unsigned int mtu;
 
 	if (!sk || !sk_fullsock(sk) || ip_sk_use_pmtu(sk)) {
 		bool forwarding = IPCB(skb)->flags & IPSKB_FORWARDED;
 
-		return ip_dst_mtu_maybe_forward(skb_dst(skb), forwarding);
+		return ip_dst_mtu_maybe_forward(dst, forwarding);
 	}
 
-	mtu = min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU);
-	return mtu - lwtunnel_headroom(skb_dst(skb)->lwtstate, mtu);
+	mtu = min(READ_ONCE(dst_dev(dst)->mtu), IP_MAX_MTU);
+	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
 }
 
 struct dst_metrics *ip_fib_metrics_init(struct nlattr *fc_mx, int fc_mx_len,
diff --git a/include/net/route.h b/include/net/route.h
index 3d3d6048ffca..7ea840daa775 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -390,7 +390,7 @@ static inline int ip4_dst_hoplimit(const struct dst_entry *dst)
 		const struct net *net;
 
 		rcu_read_lock();
-		net = dev_net_rcu(dst->dev);
+		net = dev_net_rcu(dst_dev(dst));
 		hoplimit = READ_ONCE(net->ipv4.sysctl_ip_default_ttl);
 		rcu_read_unlock();
 	}
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 717cb7d3607a..2ffe73ea644f 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -311,18 +311,20 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
 {
 	struct dst_entry *dst = &rt->dst;
 	struct inet_peer *peer;
+	struct net_device *dev;
 	bool rc = true;
 
 	if (!apply_ratelimit)
 		return true;
 
 	/* No rate limit on loopback */
-	if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
+	dev = dst_dev(dst);
+	if (dev && (dev->flags & IFF_LOOPBACK))
 		goto out;
 
 	rcu_read_lock();
 	peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr,
-			       l3mdev_master_ifindex_rcu(dst->dev));
+			       l3mdev_master_ifindex_rcu(dev));
 	rc = inet_peer_xrlim_allow(peer,
 				   READ_ONCE(net->ipv4.sysctl_icmp_ratelimit));
 	rcu_read_unlock();
@@ -466,13 +468,13 @@ out_bh_enable:
  */
 static struct net_device *icmp_get_route_lookup_dev(struct sk_buff *skb)
 {
-	struct net_device *route_lookup_dev = NULL;
+	struct net_device *dev = skb->dev;
+	const struct dst_entry *dst;
 
-	if (skb->dev)
-		route_lookup_dev = skb->dev;
-	else if (skb_dst(skb))
-		route_lookup_dev = skb_dst(skb)->dev;
-	return route_lookup_dev;
+	if (dev)
+		return dev;
+	dst = skb_dst(skb);
+	return dst ? dst_dev(dst) : NULL;
 }
 
 static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4,
@@ -869,7 +871,7 @@ static enum skb_drop_reason icmp_unreach(struct sk_buff *skb)
 	struct net *net;
 	u32 info = 0;
 
-	net = dev_net_rcu(skb_dst(skb)->dev);
+	net = skb_dst_dev_net_rcu(skb);
 
 	/*
 	 *	Incomplete header ?
@@ -1012,7 +1014,7 @@ static enum skb_drop_reason icmp_echo(struct sk_buff *skb)
 	struct icmp_bxm icmp_param;
 	struct net *net;
 
-	net = dev_net_rcu(skb_dst(skb)->dev);
+	net = skb_dst_dev_net_rcu(skb);
 	/* should there be an ICMP stat for ignored echos? */
 	if (READ_ONCE(net->ipv4.sysctl_icmp_echo_ignore_all))
 		return SKB_NOT_DROPPED_YET;
@@ -1182,7 +1184,7 @@ static enum skb_drop_reason icmp_timestamp(struct sk_buff *skb)
 	return SKB_NOT_DROPPED_YET;
 
 out_err:
-	__ICMP_INC_STATS(dev_net_rcu(skb_dst(skb)->dev), ICMP_MIB_INERRORS);
+	__ICMP_INC_STATS(skb_dst_dev_net_rcu(skb), ICMP_MIB_INERRORS);
 	return SKB_DROP_REASON_PKT_TOO_SMALL;
 }
 
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index d1769034b643..7182f1419c2a 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -427,7 +427,7 @@ static int igmpv3_sendpack(struct sk_buff *skb)
 
 	pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen);
 
-	return ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb);
+	return ip_local_out(skb_dst_dev_net(skb), skb->sk, skb);
 }
 
 static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel)
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 64b3fb3208af..b2584cce90ae 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -476,7 +476,7 @@ out_fail:
 /* Process an incoming IP datagram fragment. */
 int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
 {
-	struct net_device *dev = skb->dev ? : skb_dst(skb)->dev;
+	struct net_device *dev = skb->dev ? : skb_dst_dev(skb);
 	int vif = l3mdev_master_ifindex_rcu(dev);
 	struct ipq *qp;
 
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index a2705d454fd6..414b47a0d513 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -116,7 +116,7 @@ int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
 	skb->protocol = htons(ETH_P_IP);
 
 	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
-		       net, sk, skb, NULL, skb_dst(skb)->dev,
+		       net, sk, skb, NULL, skb_dst_dev(skb),
 		       dst_output);
 }
 
@@ -199,7 +199,7 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s
 {
 	struct dst_entry *dst = skb_dst(skb);
 	struct rtable *rt = dst_rtable(dst);
-	struct net_device *dev = dst->dev;
+	struct net_device *dev = dst_dev(dst);
 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
 	struct neighbour *neigh;
 	bool is_v6gw = false;
@@ -425,7 +425,7 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 
 int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
-	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
+	struct net_device *dev = skb_dst_dev(skb), *indev = skb->dev;
 
 	skb->dev = dev;
 	skb->protocol = htons(ETH_P_IP);
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 686e4f3d83aa..95b6bb78fcd2 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -229,7 +229,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
 		goto tx_error_icmp;
 	}
 
-	tdev = dst->dev;
+	tdev = dst_dev(dst);
 
 	if (tdev == dev) {
 		dst_release(dst);
@@ -259,7 +259,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
 xmit:
 	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev)));
 	skb_dst_set(skb, dst);
-	skb->dev = skb_dst(skb)->dev;
+	skb->dev = skb_dst_dev(skb);
 
 	err = dst_output(tunnel->net, skb->sk, skb);
 	if (net_xmit_eval(err) == 0)
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 08bc3f2c0078..0565f001120d 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -20,12 +20,12 @@
 /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
 int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int addr_type)
 {
+	struct net_device *dev = skb_dst_dev(skb);
 	const struct iphdr *iph = ip_hdr(skb);
 	struct rtable *rt;
 	struct flowi4 fl4 = {};
 	__be32 saddr = iph->saddr;
 	__u8 flags;
-	struct net_device *dev = skb_dst(skb)->dev;
 	struct flow_keys flkeys;
 	unsigned int hh_len;
 
@@ -74,7 +74,7 @@ int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, un
 #endif
 
 	/* Change in oif may mean change in hh_len. */
-	hh_len = skb_dst(skb)->dev->hard_header_len;
+	hh_len = skb_dst_dev(skb)->hard_header_len;
 	if (skb_headroom(skb) < hh_len &&
 	    pskb_expand_head(skb, HH_DATA_ALIGN(hh_len - skb_headroom(skb)),
 				0, GFP_ATOMIC))
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index ce6aba4f01ff..64ba377cd6cc 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -413,7 +413,7 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 					   const void *daddr)
 {
 	const struct rtable *rt = container_of(dst, struct rtable, dst);
-	struct net_device *dev = dst->dev;
+	struct net_device *dev = dst_dev(dst);
 	struct neighbour *n;
 
 	rcu_read_lock();
@@ -440,7 +440,7 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 {
 	const struct rtable *rt = container_of(dst, struct rtable, dst);
-	struct net_device *dev = dst->dev;
+	struct net_device *dev = dst_dev(dst);
 	const __be32 *pkey = daddr;
 
 	if (rt->rt_gw_family == AF_INET) {
@@ -1026,7 +1026,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 		return;
 
 	rcu_read_lock();
-	net = dev_net_rcu(dst->dev);
+	net = dev_net_rcu(dst_dev(dst));
 	if (mtu < net->ipv4.ip_rt_min_pmtu) {
 		lock = true;
 		mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu);
@@ -1326,7 +1326,7 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
 	struct net *net;
 
 	rcu_read_lock();
-	net = dev_net_rcu(dst->dev);
+	net = dev_net_rcu(dst_dev(dst));
 	advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
 				   net->ipv4.ip_rt_min_advmss);
 	rcu_read_unlock();
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 5107121c5e37..f1884f0c9e52 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -559,6 +559,7 @@ bool tcp_fastopen_active_should_disable(struct sock *sk)
 void tcp_fastopen_active_disable_ofo_check(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct net_device *dev;
 	struct dst_entry *dst;
 	struct sk_buff *skb;
 
@@ -576,7 +577,8 @@ void tcp_fastopen_active_disable_ofo_check(struct sock *sk)
 	} else if (tp->syn_fastopen_ch &&
 		   atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times)) {
 		dst = sk_dst_get(sk);
-		if (!(dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK)))
+		dev = dst ? dst_dev(dst) : NULL;
+		if (!(dev && (dev->flags & IFF_LOOPBACK)))
 			atomic_set(&sock_net(sk)->ipv4.tfo_active_disable_times, 0);
 		dst_release(dst);
 	}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index b406fd012b2e..a847d894ace3 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -788,7 +788,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
 	arg.iov[0].iov_base = (unsigned char *)&rep;
 	arg.iov[0].iov_len  = sizeof(rep.th);
 
-	net = sk ? sock_net(sk) : dev_net_rcu(skb_dst(skb)->dev);
+	net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb);
 
 	/* Invalid TCP option size or twice included auth */
 	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 4251670e328c..03c068ea27b6 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -166,11 +166,11 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
 					  unsigned int hash)
 {
 	struct tcp_metrics_block *tm;
-	struct net *net;
 	bool reclaim = false;
+	struct net *net;
 
 	spin_lock_bh(&tcp_metrics_lock);
-	net = dev_net_rcu(dst->dev);
+	net = dev_net_rcu(dst_dev(dst));
 
 	/* While waiting for the spin-lock the cache might have been populated
 	 * with this entry and so we have to check again.
@@ -273,7 +273,7 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
 		return NULL;
 	}
 
-	net = dev_net_rcu(dst->dev);
+	net = dev_net_rcu(dst_dev(dst));
 	hash ^= net_hash_mix(net);
 	hash = hash_32(hash, tcp_metrics_hash_log);
 
@@ -318,7 +318,7 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
 	else
 		return NULL;
 
-	net = dev_net_rcu(dst->dev);
+	net = dev_net_rcu(dst_dev(dst));
 	hash ^= net_hash_mix(net);
 	hash = hash_32(hash, tcp_metrics_hash_log);
 
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 3cff51ba72bb..0ae67d537499 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -31,7 +31,7 @@ static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
-			    net, sk, skb, skb->dev, skb_dst(skb)->dev,
+			    net, sk, skb, skb->dev, skb_dst_dev(skb),
 			    __xfrm4_output,
 			    !(IPCB(skb)->flags & IPSKB_REROUTED));
 }
-- 
cgit v1.2.3


From 1caf27297215a5241f9bfc9c07336349d9034ee3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 30 Jun 2025 12:19:32 +0000
Subject: ipv6: adopt dst_dev() helper

Use the new helper as a step to deal with potential dst->dev races.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20250630121934.3399505-9-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/ip6_route.h          |  4 ++--
 net/ipv6/exthdrs.c               |  2 +-
 net/ipv6/icmp.c                  |  4 +++-
 net/ipv6/ila/ila_lwt.c           |  2 +-
 net/ipv6/ioam6_iptunnel.c        |  4 ++--
 net/ipv6/ip6_gre.c               |  8 +++++---
 net/ipv6/ip6_output.c            | 19 ++++++++++---------
 net/ipv6/ip6_tunnel.c            |  4 ++--
 net/ipv6/ip6_udp_tunnel.c        |  2 +-
 net/ipv6/ip6_vti.c               |  2 +-
 net/ipv6/ndisc.c                 |  6 ++++--
 net/ipv6/netfilter/nf_dup_ipv6.c |  2 +-
 net/ipv6/output_core.c           |  2 +-
 net/ipv6/route.c                 | 20 ++++++++++++--------
 net/ipv6/rpl_iptunnel.c          |  4 ++--
 net/ipv6/seg6_iptunnel.c         | 20 +++++++++++---------
 net/ipv6/seg6_local.c            |  2 +-
 17 files changed, 60 insertions(+), 47 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 6dbdf60b342f..9255f21818ee 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -274,7 +274,7 @@ static inline unsigned int ip6_skb_dst_mtu(const struct sk_buff *skb)
 	unsigned int mtu;
 
 	if (np && READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE) {
-		mtu = READ_ONCE(dst->dev->mtu);
+		mtu = READ_ONCE(dst_dev(dst)->mtu);
 		mtu -= lwtunnel_headroom(dst->lwtstate, mtu);
 	} else {
 		mtu = dst_mtu(dst);
@@ -337,7 +337,7 @@ static inline unsigned int ip6_dst_mtu_maybe_forward(const struct dst_entry *dst
 
 	mtu = IPV6_MIN_MTU;
 	rcu_read_lock();
-	idev = __in6_dev_get(dst->dev);
+	idev = __in6_dev_get(dst_dev(dst));
 	if (idev)
 		mtu = READ_ONCE(idev->cnf.mtu6);
 	rcu_read_unlock();
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 457de0745a33..1947530fb20a 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -306,7 +306,7 @@ static int ipv6_destopt_rcv(struct sk_buff *skb)
 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) ||
 	    !pskb_may_pull(skb, (skb_transport_offset(skb) +
 				 ((skb_transport_header(skb)[1] + 1) << 3)))) {
-		__IP6_INC_STATS(dev_net(dst->dev), idev,
+		__IP6_INC_STATS(dev_net(dst_dev(dst)), idev,
 				IPSTATS_MIB_INHDRERRORS);
 fail_and_free:
 		kfree_skb(skb);
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 3fd19a84b358..44550957fd4e 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -196,6 +196,7 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
 			       struct flowi6 *fl6, bool apply_ratelimit)
 {
 	struct net *net = sock_net(sk);
+	struct net_device *dev;
 	struct dst_entry *dst;
 	bool res = false;
 
@@ -208,10 +209,11 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
 	 * this lookup should be more aggressive (not longer than timeout).
 	 */
 	dst = ip6_route_output(net, sk, fl6);
+	dev = dst_dev(dst);
 	if (dst->error) {
 		IP6_INC_STATS(net, ip6_dst_idev(dst),
 			      IPSTATS_MIB_OUTNOROUTES);
-	} else if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) {
+	} else if (dev && (dev->flags & IFF_LOOPBACK)) {
 		res = true;
 	} else {
 		struct rt6_info *rt = dst_rt6_info(dst);
diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c
index 7d574f5132e2..7bb9edc5c28c 100644
--- a/net/ipv6/ila/ila_lwt.c
+++ b/net/ipv6/ila/ila_lwt.c
@@ -70,7 +70,7 @@ static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 		 */
 
 		memset(&fl6, 0, sizeof(fl6));
-		fl6.flowi6_oif = orig_dst->dev->ifindex;
+		fl6.flowi6_oif = dst_dev(orig_dst)->ifindex;
 		fl6.flowi6_iif = LOOPBACK_IFINDEX;
 		fl6.daddr = *rt6_nexthop(dst_rt6_info(orig_dst),
 					 &ip6h->daddr);
diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c
index 40df8bdfaacd..1fe7894f14dd 100644
--- a/net/ipv6/ioam6_iptunnel.c
+++ b/net/ipv6/ioam6_iptunnel.c
@@ -335,7 +335,7 @@ static int ioam6_do_encap(struct net *net, struct sk_buff *skb,
 	if (has_tunsrc)
 		memcpy(&hdr->saddr, tunsrc, sizeof(*tunsrc));
 	else
-		ipv6_dev_get_saddr(net, dst->dev, &hdr->daddr,
+		ipv6_dev_get_saddr(net, dst_dev(dst), &hdr->daddr,
 				   IPV6_PREFER_SRC_PUBLIC, &hdr->saddr);
 
 	skb_postpush_rcsum(skb, hdr, len);
@@ -442,7 +442,7 @@ do_encap:
 			dst_cache_set_ip6(&ilwt->cache, dst, &fl6.saddr);
 		local_bh_enable();
 
-		err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
+		err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst)));
 		if (unlikely(err))
 			goto drop;
 	}
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 2dc9dcffe2ca..a1210fd6404e 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1085,9 +1085,11 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
 			 htonl(atomic_fetch_inc(&t->o_seqno)));
 
 	/* TooBig packet may have updated dst->dev's mtu */
-	if (!t->parms.collect_md && dst && dst_mtu(dst) > dst->dev->mtu)
-		dst->ops->update_pmtu(dst, NULL, skb, dst->dev->mtu, false);
-
+	if (!t->parms.collect_md && dst) {
+		mtu = READ_ONCE(dst_dev(dst)->mtu);
+		if (dst_mtu(dst) > mtu)
+			dst->ops->update_pmtu(dst, NULL, skb, mtu, false);
+	}
 	err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu,
 			   NEXTHDR_GRE);
 	if (err != 0) {
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 7bd29a9ff0db..f494b4ece6b7 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -60,7 +60,7 @@
 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
-	struct net_device *dev = dst->dev;
+	struct net_device *dev = dst_dev(dst);
 	struct inet6_dev *idev = ip6_dst_idev(dst);
 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
 	const struct in6_addr *daddr, *nexthop;
@@ -271,7 +271,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 	const struct ipv6_pinfo *np = inet6_sk(sk);
 	struct in6_addr *first_hop = &fl6->daddr;
 	struct dst_entry *dst = skb_dst(skb);
-	struct net_device *dev = dst->dev;
+	struct net_device *dev = dst_dev(dst);
 	struct inet6_dev *idev = ip6_dst_idev(dst);
 	struct hop_jumbo_hdr *hop_jumbo;
 	int hoplen = sizeof(*hop_jumbo);
@@ -503,7 +503,8 @@ int ip6_forward(struct sk_buff *skb)
 	struct dst_entry *dst = skb_dst(skb);
 	struct ipv6hdr *hdr = ipv6_hdr(skb);
 	struct inet6_skb_parm *opt = IP6CB(skb);
-	struct net *net = dev_net(dst->dev);
+	struct net *net = dev_net(dst_dev(dst));
+	struct net_device *dev;
 	struct inet6_dev *idev;
 	SKB_DR(reason);
 	u32 mtu;
@@ -591,12 +592,12 @@ int ip6_forward(struct sk_buff *skb)
 		goto drop;
 	}
 	dst = skb_dst(skb);
-
+	dev = dst_dev(dst);
 	/* IPv6 specs say nothing about it, but it is clear that we cannot
 	   send redirects to source routed frames.
 	   We don't send redirects to frames decapsulated from IPsec.
 	 */
-	if (IP6CB(skb)->iif == dst->dev->ifindex &&
+	if (IP6CB(skb)->iif == dev->ifindex &&
 	    opt->srcrt == 0 && !skb_sec_path(skb)) {
 		struct in6_addr *target = NULL;
 		struct inet_peer *peer;
@@ -644,7 +645,7 @@ int ip6_forward(struct sk_buff *skb)
 
 	if (ip6_pkt_too_big(skb, mtu)) {
 		/* Again, force OUTPUT device used as source address */
-		skb->dev = dst->dev;
+		skb->dev = dev;
 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
@@ -653,7 +654,7 @@ int ip6_forward(struct sk_buff *skb)
 		return -EMSGSIZE;
 	}
 
-	if (skb_cow(skb, dst->dev->hard_header_len)) {
+	if (skb_cow(skb, dev->hard_header_len)) {
 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
 				IPSTATS_MIB_OUTDISCARDS);
 		goto drop;
@@ -666,7 +667,7 @@ int ip6_forward(struct sk_buff *skb)
 	hdr->hop_limit--;
 
 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
-		       net, NULL, skb, skb->dev, dst->dev,
+		       net, NULL, skb, skb->dev, dev,
 		       ip6_forward_finish);
 
 error:
@@ -1093,7 +1094,7 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 #ifdef CONFIG_IPV6_SUBTREES
 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
 #endif
-	   (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
+	   (fl6->flowi6_oif && fl6->flowi6_oif != dst_dev(dst)->ifindex)) {
 		dst_release(dst);
 		dst = NULL;
 	}
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 436e077061d1..cd8a4141e5c0 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1179,7 +1179,7 @@ route_lookup:
 		ndst = dst;
 	}
 
-	tdev = dst->dev;
+	tdev = dst_dev(dst);
 
 	if (tdev == dev) {
 		DEV_STATS_INC(dev, collisions);
@@ -1255,7 +1255,7 @@ route_lookup:
 	/* Calculate max headroom for all the headers and adjust
 	 * needed_headroom if necessary.
 	 */
-	max_headroom = LL_RESERVED_SPACE(dst->dev) + sizeof(struct ipv6hdr)
+	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr)
 			+ dst->header_len + t->hlen;
 	if (max_headroom > READ_ONCE(dev->needed_headroom))
 		WRITE_ONCE(dev->needed_headroom, max_headroom);
diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c
index 8ebe17a6058a..0ff547a4bff7 100644
--- a/net/ipv6/ip6_udp_tunnel.c
+++ b/net/ipv6/ip6_udp_tunnel.c
@@ -168,7 +168,7 @@ struct dst_entry *udp_tunnel6_dst_lookup(struct sk_buff *skb,
 		netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
 		return ERR_PTR(-ENETUNREACH);
 	}
-	if (dst->dev == dev) { /* is this necessary? */
+	if (dst_dev(dst) == dev) { /* is this necessary? */
 		netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
 		dst_release(dst);
 		return ERR_PTR(-ELOOP);
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index 40464a88bca6..2a86de922d42 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -497,7 +497,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
 			      (const struct in6_addr *)&x->id.daddr))
 		goto tx_err_link_failure;
 
-	tdev = dst->dev;
+	tdev = dst_dev(dst);
 
 	if (tdev == dev) {
 		DEV_STATS_INC(dev, collisions);
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index ecb5c4b8518f..f2299b61221b 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -473,6 +473,7 @@ void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr,
 {
 	struct icmp6hdr *icmp6h = icmp6_hdr(skb);
 	struct dst_entry *dst = skb_dst(skb);
+	struct net_device *dev;
 	struct inet6_dev *idev;
 	struct net *net;
 	struct sock *sk;
@@ -507,11 +508,12 @@ void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr,
 
 	ip6_nd_hdr(skb, saddr, daddr, READ_ONCE(inet6_sk(sk)->hop_limit), skb->len);
 
-	idev = __in6_dev_get(dst->dev);
+	dev = dst_dev(dst);
+	idev = __in6_dev_get(dev);
 	IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);
 
 	err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
-		      net, sk, skb, NULL, dst->dev,
+		      net, sk, skb, NULL, dev,
 		      dst_output);
 	if (!err) {
 		ICMP6MSGOUT_INC_STATS(net, idev, type);
diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c
index b903c62c00c9..6da3102b7c1b 100644
--- a/net/ipv6/netfilter/nf_dup_ipv6.c
+++ b/net/ipv6/netfilter/nf_dup_ipv6.c
@@ -38,7 +38,7 @@ static bool nf_dup_ipv6_route(struct net *net, struct sk_buff *skb,
 	}
 	skb_dst_drop(skb);
 	skb_dst_set(skb, dst);
-	skb->dev      = dst->dev;
+	skb->dev      = dst_dev(dst);
 	skb->protocol = htons(ETH_P_IPV6);
 
 	return true;
diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c
index 806d4b5dd1e6..90a178dd24aa 100644
--- a/net/ipv6/output_core.c
+++ b/net/ipv6/output_core.c
@@ -105,7 +105,7 @@ int ip6_dst_hoplimit(struct dst_entry *dst)
 {
 	int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
 	if (hoplimit == 0) {
-		struct net_device *dev = dst->dev;
+		struct net_device *dev = dst_dev(dst);
 		struct inet6_dev *idev;
 
 		rcu_read_lock();
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 375112a59492..dacfe1284918 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -228,13 +228,13 @@ static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
 	const struct rt6_info *rt = dst_rt6_info(dst);
 
 	return ip6_neigh_lookup(rt6_nexthop(rt, &in6addr_any),
-				dst->dev, skb, daddr);
+				dst_dev(dst), skb, daddr);
 }
 
 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 {
 	const struct rt6_info *rt = dst_rt6_info(dst);
-	struct net_device *dev = dst->dev;
+	struct net_device *dev = dst_dev(dst);
 
 	daddr = choose_neigh_daddr(rt6_nexthop(rt, &in6addr_any), NULL, daddr);
 	if (!daddr)
@@ -2943,7 +2943,7 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
 
 		if (res.f6i->nh) {
 			struct fib6_nh_match_arg arg = {
-				.dev = dst->dev,
+				.dev = dst_dev(dst),
 				.gw = &rt6->rt6i_gateway,
 			};
 
@@ -3238,7 +3238,7 @@ EXPORT_SYMBOL_GPL(ip6_sk_redirect);
 
 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
 {
-	struct net_device *dev = dst->dev;
+	struct net_device *dev = dst_dev(dst);
 	unsigned int mtu = dst_mtu(dst);
 	struct net *net;
 
@@ -4301,7 +4301,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
 
 	if (res.f6i->nh) {
 		struct fib6_nh_match_arg arg = {
-			.dev = dst->dev,
+			.dev = dst_dev(dst),
 			.gw = &rt->rt6i_gateway,
 		};
 
@@ -4587,13 +4587,14 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg)
 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
 {
 	struct dst_entry *dst = skb_dst(skb);
-	struct net *net = dev_net(dst->dev);
+	struct net_device *dev = dst_dev(dst);
+	struct net *net = dev_net(dev);
 	struct inet6_dev *idev;
 	SKB_DR(reason);
 	int type;
 
 	if (netif_is_l3_master(skb->dev) ||
-	    dst->dev == net->loopback_dev)
+	    dev == net->loopback_dev)
 		idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
 	else
 		idev = ip6_dst_idev(dst);
@@ -5844,11 +5845,14 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 	 * each as a nexthop within RTA_MULTIPATH.
 	 */
 	if (rt6) {
+		struct net_device *dev;
+
 		if (rt6_flags & RTF_GATEWAY &&
 		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
 			goto nla_put_failure;
 
-		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
+		dev = dst_dev(dst);
+		if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex))
 			goto nla_put_failure;
 
 		if (lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c
index 7c05ac846646..1f41f53fbaff 100644
--- a/net/ipv6/rpl_iptunnel.c
+++ b/net/ipv6/rpl_iptunnel.c
@@ -242,7 +242,7 @@ static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 			local_bh_enable();
 		}
 
-		err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
+		err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst)));
 		if (unlikely(err))
 			goto drop;
 	}
@@ -297,7 +297,7 @@ static int rpl_input(struct sk_buff *skb)
 			local_bh_enable();
 		}
 
-		err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
+		err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst)));
 		if (unlikely(err))
 			goto drop;
 	} else {
diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
index 51583461ae29..27918fc0c972 100644
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@ -128,7 +128,8 @@ static int __seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh,
 			       int proto, struct dst_entry *cache_dst)
 {
 	struct dst_entry *dst = skb_dst(skb);
-	struct net *net = dev_net(dst->dev);
+	struct net_device *dev = dst_dev(dst);
+	struct net *net = dev_net(dev);
 	struct ipv6hdr *hdr, *inner_hdr;
 	struct ipv6_sr_hdr *isrh;
 	int hdrlen, tot_len, err;
@@ -181,7 +182,7 @@ static int __seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh,
 	isrh->nexthdr = proto;
 
 	hdr->daddr = isrh->segments[isrh->first_segment];
-	set_tun_src(net, dst->dev, &hdr->daddr, &hdr->saddr);
+	set_tun_src(net, dev, &hdr->daddr, &hdr->saddr);
 
 #ifdef CONFIG_IPV6_SEG6_HMAC
 	if (sr_has_hmac(isrh)) {
@@ -212,7 +213,8 @@ static int seg6_do_srh_encap_red(struct sk_buff *skb,
 {
 	__u8 first_seg = osrh->first_segment;
 	struct dst_entry *dst = skb_dst(skb);
-	struct net *net = dev_net(dst->dev);
+	struct net_device *dev = dst_dev(dst);
+	struct net *net = dev_net(dev);
 	struct ipv6hdr *hdr, *inner_hdr;
 	int hdrlen = ipv6_optlen(osrh);
 	int red_tlv_offset, tlv_offset;
@@ -270,7 +272,7 @@ static int seg6_do_srh_encap_red(struct sk_buff *skb,
 	if (skip_srh) {
 		hdr->nexthdr = proto;
 
-		set_tun_src(net, dst->dev, &hdr->daddr, &hdr->saddr);
+		set_tun_src(net, dev, &hdr->daddr, &hdr->saddr);
 		goto out;
 	}
 
@@ -306,7 +308,7 @@ static int seg6_do_srh_encap_red(struct sk_buff *skb,
 
 srcaddr:
 	isrh->nexthdr = proto;
-	set_tun_src(net, dst->dev, &hdr->daddr, &hdr->saddr);
+	set_tun_src(net, dev, &hdr->daddr, &hdr->saddr);
 
 #ifdef CONFIG_IPV6_SEG6_HMAC
 	if (unlikely(!skip_srh && sr_has_hmac(isrh))) {
@@ -507,7 +509,7 @@ static int seg6_input_core(struct net *net, struct sock *sk,
 			local_bh_enable();
 		}
 
-		err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
+		err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst)));
 		if (unlikely(err))
 			goto drop;
 	} else {
@@ -518,7 +520,7 @@ static int seg6_input_core(struct net *net, struct sock *sk,
 	if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled))
 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
 			       dev_net(skb->dev), NULL, skb, NULL,
-			       skb_dst(skb)->dev, seg6_input_finish);
+			       skb_dst_dev(skb), seg6_input_finish);
 
 	return seg6_input_finish(dev_net(skb->dev), NULL, skb);
 drop:
@@ -593,7 +595,7 @@ static int seg6_output_core(struct net *net, struct sock *sk,
 			local_bh_enable();
 		}
 
-		err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
+		err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst)));
 		if (unlikely(err))
 			goto drop;
 	}
@@ -603,7 +605,7 @@ static int seg6_output_core(struct net *net, struct sock *sk,
 
 	if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled))
 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb,
-			       NULL, skb_dst(skb)->dev, dst_output);
+			       NULL, dst_dev(dst), dst_output);
 
 	return dst_output(net, sk, skb);
 drop:
diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index 4834d72624cf..2b41e4c0dddd 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -313,7 +313,7 @@ seg6_lookup_any_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
 	if (!local_delivery)
 		dev_flags |= IFF_LOOPBACK;
 
-	if (dst && (dst->dev->flags & dev_flags) && !dst->error) {
+	if (dst && (dst_dev(dst)->flags & dev_flags) && !dst->error) {
 		dst_release(dst);
 		dst = NULL;
 	}
-- 
cgit v1.2.3


From 93d1cff35adc522a5d21e722eee1071f3f7dc716 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 30 Jun 2025 12:19:33 +0000
Subject: ipv6: adopt skb_dst_dev() and skb_dst_dev_net[_rcu]() helpers

Use the new helpers as a step to deal with potential dst->dev races.

v2: fix typo in ipv6_rthdr_rcv() (kernel test robot <lkp@intel.com>)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20250630121934.3399505-10-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/inet6_hashtables.h      |  2 +-
 include/net/ip6_tunnel.h            |  2 +-
 net/ipv6/exthdrs.c                  |  8 ++++----
 net/ipv6/ioam6.c                    | 17 +++++++++--------
 net/ipv6/ip6_input.c                |  6 ++++--
 net/ipv6/ip6_output.c               |  5 +++--
 net/ipv6/ip6_tunnel.c               |  2 +-
 net/ipv6/ip6_vti.c                  |  2 +-
 net/ipv6/netfilter.c                |  4 ++--
 net/ipv6/netfilter/nf_reject_ipv6.c |  2 +-
 net/ipv6/output_core.c              |  2 +-
 net/ipv6/reassembly.c               | 10 +++++-----
 net/ipv6/route.c                    |  4 ++--
 net/ipv6/seg6_iptunnel.c            |  6 +++---
 net/ipv6/tcp_ipv6.c                 |  4 ++--
 net/ipv6/xfrm6_output.c             |  2 +-
 16 files changed, 41 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index c32878c69179..ab3929a2a956 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -150,7 +150,7 @@ static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo,
 					      int iif, int sdif,
 					      bool *refcounted)
 {
-	struct net *net = dev_net_rcu(skb_dst(skb)->dev);
+	struct net *net = skb_dst_dev_net_rcu(skb);
 	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
 	struct sock *sk;
 
diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h
index dd163495f353..120db2865811 100644
--- a/include/net/ip6_tunnel.h
+++ b/include/net/ip6_tunnel.h
@@ -159,7 +159,7 @@ static inline void ip6tunnel_xmit(struct sock *sk, struct sk_buff *skb,
 	memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
 	IP6CB(skb)->flags = ip6cb_flags;
 	pkt_len = skb->len - skb_inner_network_offset(skb);
-	err = ip6_local_out(dev_net(skb_dst(skb)->dev), sk, skb);
+	err = ip6_local_out(skb_dst_dev_net(skb), sk, skb);
 
 	if (dev) {
 		if (unlikely(net_xmit_eval(err)))
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 1947530fb20a..d1ef9644f826 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -460,7 +460,7 @@ looped_back:
 		return -1;
 	}
 
-	if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) {
+	if (skb_dst_dev(skb)->flags & IFF_LOOPBACK) {
 		if (ipv6_hdr(skb)->hop_limit <= 1) {
 			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
 			icmpv6_send(skb, ICMPV6_TIME_EXCEED,
@@ -621,7 +621,7 @@ looped_back:
 		return -1;
 	}
 
-	if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) {
+	if (skb_dst_dev(skb)->flags & IFF_LOOPBACK) {
 		if (ipv6_hdr(skb)->hop_limit <= 1) {
 			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
 			icmpv6_send(skb, ICMPV6_TIME_EXCEED,
@@ -783,7 +783,7 @@ looped_back:
 			kfree_skb(skb);
 			return -1;
 		}
-		if (!ipv6_chk_home_addr(dev_net(skb_dst(skb)->dev), addr)) {
+		if (!ipv6_chk_home_addr(skb_dst_dev_net(skb), addr)) {
 			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
 			kfree_skb(skb);
 			return -1;
@@ -809,7 +809,7 @@ looped_back:
 		return -1;
 	}
 
-	if (skb_dst(skb)->dev->flags&IFF_LOOPBACK) {
+	if (skb_dst_dev(skb)->flags & IFF_LOOPBACK) {
 		if (ipv6_hdr(skb)->hop_limit <= 1) {
 			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
 			icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
diff --git a/net/ipv6/ioam6.c b/net/ipv6/ioam6.c
index a84d332f952f..9553a3200081 100644
--- a/net/ipv6/ioam6.c
+++ b/net/ipv6/ioam6.c
@@ -696,6 +696,7 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb,
 				    struct ioam6_schema *sc,
 				    u8 sclen, bool is_input)
 {
+	struct net_device *dev = skb_dst_dev(skb);
 	struct timespec64 ts;
 	ktime_t tstamp;
 	u64 raw64;
@@ -712,7 +713,7 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb,
 		if (is_input)
 			byte--;
 
-		raw32 = dev_net(skb_dst(skb)->dev)->ipv6.sysctl.ioam6_id;
+		raw32 = dev_net(dev)->ipv6.sysctl.ioam6_id;
 
 		*(__be32 *)data = cpu_to_be32((byte << 24) | raw32);
 		data += sizeof(__be32);
@@ -728,10 +729,10 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb,
 		*(__be16 *)data = cpu_to_be16(raw16);
 		data += sizeof(__be16);
 
-		if (skb_dst(skb)->dev->flags & IFF_LOOPBACK)
+		if (dev->flags & IFF_LOOPBACK)
 			raw16 = IOAM6_U16_UNAVAILABLE;
 		else
-			raw16 = (__force u16)READ_ONCE(__in6_dev_get(skb_dst(skb)->dev)->cnf.ioam6_id);
+			raw16 = (__force u16)READ_ONCE(__in6_dev_get(dev)->cnf.ioam6_id);
 
 		*(__be16 *)data = cpu_to_be16(raw16);
 		data += sizeof(__be16);
@@ -783,10 +784,10 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb,
 		struct Qdisc *qdisc;
 		__u32 qlen, backlog;
 
-		if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) {
+		if (dev->flags & IFF_LOOPBACK) {
 			*(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
 		} else {
-			queue = skb_get_tx_queue(skb_dst(skb)->dev, skb);
+			queue = skb_get_tx_queue(dev, skb);
 			qdisc = rcu_dereference(queue->qdisc);
 			qdisc_qstats_qlen_backlog(qdisc, &qlen, &backlog);
 
@@ -807,7 +808,7 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb,
 		if (is_input)
 			byte--;
 
-		raw64 = dev_net(skb_dst(skb)->dev)->ipv6.sysctl.ioam6_id_wide;
+		raw64 = dev_net(dev)->ipv6.sysctl.ioam6_id_wide;
 
 		*(__be64 *)data = cpu_to_be64(((u64)byte << 56) | raw64);
 		data += sizeof(__be64);
@@ -823,10 +824,10 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb,
 		*(__be32 *)data = cpu_to_be32(raw32);
 		data += sizeof(__be32);
 
-		if (skb_dst(skb)->dev->flags & IFF_LOOPBACK)
+		if (dev->flags & IFF_LOOPBACK)
 			raw32 = IOAM6_U32_UNAVAILABLE;
 		else
-			raw32 = READ_ONCE(__in6_dev_get(skb_dst(skb)->dev)->cnf.ioam6_id_wide);
+			raw32 = READ_ONCE(__in6_dev_get(dev)->cnf.ioam6_id_wide);
 
 		*(__be32 *)data = cpu_to_be32(raw32);
 		data += sizeof(__be32);
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 39da6a7ce5f1..16953bd00960 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -187,7 +187,9 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev,
 	 * arrived via the sending interface (ethX), because of the
 	 * nature of scoping architecture. --yoshfuji
 	 */
-	IP6CB(skb)->iif = skb_valid_dst(skb) ? ip6_dst_idev(skb_dst(skb))->dev->ifindex : dev->ifindex;
+	IP6CB(skb)->iif = skb_valid_dst(skb) ?
+				ip6_dst_idev(skb_dst(skb))->dev->ifindex :
+				dev->ifindex;
 
 	if (unlikely(!pskb_may_pull(skb, sizeof(*hdr))))
 		goto err;
@@ -504,7 +506,7 @@ int ip6_mc_input(struct sk_buff *skb)
 	struct net_device *dev;
 	bool deliver;
 
-	__IP6_UPD_PO_STATS(dev_net(skb_dst(skb)->dev),
+	__IP6_UPD_PO_STATS(skb_dst_dev_net(skb),
 			 __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INMCAST,
 			 skb->len);
 
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index f494b4ece6b7..877bee7ffee9 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -232,8 +232,9 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s
 
 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
-	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
-	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
+	struct dst_entry *dst = skb_dst(skb);
+	struct net_device *dev = dst_dev(dst), *indev = skb->dev;
+	struct inet6_dev *idev = ip6_dst_idev(dst);
 
 	skb->protocol = htons(ETH_P_IPV6);
 	skb->dev = dev;
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index cd8a4141e5c0..3262e81223df 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -632,7 +632,7 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	} else {
 		if (ip_route_input(skb2, eiph->daddr, eiph->saddr,
 				   ip4h_dscp(eiph), skb2->dev) ||
-		    skb_dst(skb2)->dev->type != ARPHRD_TUNNEL6)
+		    skb_dst_dev(skb2)->type != ARPHRD_TUNNEL6)
 			goto out;
 	}
 
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index 2a86de922d42..ad5290be4dd6 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -529,7 +529,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
 xmit:
 	skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev)));
 	skb_dst_set(skb, dst);
-	skb->dev = skb_dst(skb)->dev;
+	skb->dev = dst_dev(dst);
 
 	err = dst_output(t->net, skb->sk, skb);
 	if (net_xmit_eval(err) == 0)
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 4541836ee3da..45f9105f9ac1 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -24,7 +24,7 @@ int ip6_route_me_harder(struct net *net, struct sock *sk_partial, struct sk_buff
 {
 	const struct ipv6hdr *iph = ipv6_hdr(skb);
 	struct sock *sk = sk_to_full_sk(sk_partial);
-	struct net_device *dev = skb_dst(skb)->dev;
+	struct net_device *dev = skb_dst_dev(skb);
 	struct flow_keys flkeys;
 	unsigned int hh_len;
 	struct dst_entry *dst;
@@ -72,7 +72,7 @@ int ip6_route_me_harder(struct net *net, struct sock *sk_partial, struct sk_buff
 #endif
 
 	/* Change in oif may mean change in hh_len. */
-	hh_len = skb_dst(skb)->dev->hard_header_len;
+	hh_len = skb_dst_dev(skb)->hard_header_len;
 	if (skb_headroom(skb) < hh_len &&
 	    pskb_expand_head(skb, HH_DATA_ALIGN(hh_len - skb_headroom(skb)),
 			     0, GFP_ATOMIC))
diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c
index 9ae2b2725bf9..838295fa32e3 100644
--- a/net/ipv6/netfilter/nf_reject_ipv6.c
+++ b/net/ipv6/netfilter/nf_reject_ipv6.c
@@ -300,7 +300,7 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb,
 		skb_dst_set(oldskb, dst);
 	}
 
-	fl6.flowi6_oif = l3mdev_master_ifindex(skb_dst(oldskb)->dev);
+	fl6.flowi6_oif = l3mdev_master_ifindex(skb_dst_dev(oldskb));
 	fl6.flowi6_mark = IP6_REPLY_MARK(net, oldskb->mark);
 	security_skb_classify_flow(oldskb, flowi6_to_flowi_common(&fl6));
 	dst = ip6_route_output(net, NULL, &fl6);
diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c
index 90a178dd24aa..d21fe27fe21e 100644
--- a/net/ipv6/output_core.c
+++ b/net/ipv6/output_core.c
@@ -141,7 +141,7 @@ int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
 	skb->protocol = htons(ETH_P_IPV6);
 
 	return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
-		       net, sk, skb, NULL, skb_dst(skb)->dev,
+		       net, sk, skb, NULL, skb_dst_dev(skb),
 		       dst_output);
 }
 EXPORT_SYMBOL_GPL(__ip6_local_out);
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 7d4bcf3fda5b..25ec8001898d 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -104,11 +104,11 @@ fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif)
 	return container_of(q, struct frag_queue, q);
 }
 
-static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
+static int ip6_frag_queue(struct net *net,
+			  struct frag_queue *fq, struct sk_buff *skb,
 			  struct frag_hdr *fhdr, int nhoff,
 			  u32 *prob_offset, int *refs)
 {
-	struct net *net = dev_net(skb_dst(skb)->dev);
 	int offset, end, fragsize;
 	struct sk_buff *prev_tail;
 	struct net_device *dev;
@@ -324,10 +324,10 @@ out_fail:
 
 static int ipv6_frag_rcv(struct sk_buff *skb)
 {
+	const struct ipv6hdr *hdr = ipv6_hdr(skb);
+	struct net *net = skb_dst_dev_net(skb);
 	struct frag_hdr *fhdr;
 	struct frag_queue *fq;
-	const struct ipv6hdr *hdr = ipv6_hdr(skb);
-	struct net *net = dev_net(skb_dst(skb)->dev);
 	u8 nexthdr;
 	int iif;
 
@@ -384,7 +384,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
 		spin_lock(&fq->q.lock);
 
 		fq->iif = iif;
-		ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff,
+		ret = ip6_frag_queue(net, fq, skb, fhdr, IP6CB(skb)->nhoff,
 				     &prob_offset, &refs);
 
 		spin_unlock(&fq->q.lock);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index dacfe1284918..3fbe0885c21c 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -4631,7 +4631,7 @@ static int ip6_pkt_discard(struct sk_buff *skb)
 
 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
-	skb->dev = skb_dst(skb)->dev;
+	skb->dev = skb_dst_dev(skb);
 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
 }
 
@@ -4642,7 +4642,7 @@ static int ip6_pkt_prohibit(struct sk_buff *skb)
 
 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
-	skb->dev = skb_dst(skb)->dev;
+	skb->dev = skb_dst_dev(skb);
 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
 }
 
diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
index 27918fc0c972..3e1b9991131a 100644
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@ -364,7 +364,7 @@ static int __seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh,
 
 #ifdef CONFIG_IPV6_SEG6_HMAC
 	if (sr_has_hmac(isrh)) {
-		struct net *net = dev_net(skb_dst(skb)->dev);
+		struct net *net = skb_dst_dev_net(skb);
 
 		err = seg6_push_hmac(net, &hdr->saddr, isrh);
 		if (unlikely(err))
@@ -530,7 +530,7 @@ drop:
 
 static int seg6_input_nf(struct sk_buff *skb)
 {
-	struct net_device *dev = skb_dst(skb)->dev;
+	struct net_device *dev = skb_dst_dev(skb);
 	struct net *net = dev_net(skb->dev);
 
 	switch (skb->protocol) {
@@ -616,7 +616,7 @@ drop:
 
 static int seg6_output_nf(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
-	struct net_device *dev = skb_dst(skb)->dev;
+	struct net_device *dev = skb_dst_dev(skb);
 
 	switch (skb->protocol) {
 	case htons(ETH_P_IP):
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index ed0b891885d8..8f2c3cba1f1f 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -868,7 +868,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
 				 int oif, int rst, u8 tclass, __be32 label,
 				 u32 priority, u32 txhash, struct tcp_key *key)
 {
-	struct net *net = sk ? sock_net(sk) : dev_net_rcu(skb_dst(skb)->dev);
+	struct net *net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb);
 	unsigned int tot_len = sizeof(struct tcphdr);
 	struct sock *ctl_sk = net->ipv6.tcp_sk;
 	const struct tcphdr *th = tcp_hdr(skb);
@@ -1043,7 +1043,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb,
 	if (!sk && !ipv6_unicast_destination(skb))
 		return;
 
-	net = sk ? sock_net(sk) : dev_net_rcu(skb_dst(skb)->dev);
+	net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb);
 	/* Invalid TCP option size or twice included auth */
 	if (tcp_parse_auth_options(th, &md5_hash_location, &aoh))
 		return;
diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
index b3d5d1f266ee..512bdaf13699 100644
--- a/net/ipv6/xfrm6_output.c
+++ b/net/ipv6/xfrm6_output.c
@@ -106,7 +106,7 @@ skip_frag:
 int xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
-			    net, sk, skb,  skb->dev, skb_dst(skb)->dev,
+			    net, sk, skb,  skb->dev, skb_dst_dev(skb),
 			    __xfrm6_output,
 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 }
-- 
cgit v1.2.3


From 4d313f2bd22213caace3fe4fb02977b527f9c6c3 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Tue, 1 Jul 2025 09:03:51 +0800
Subject: tun: remove unnecessary tun_xdp_hdr structure

With commit f95f0f95cfb7 ("net, xdp: Introduce xdp_init_buff utility
routine"), the buffer length can be stored as the frame size, so there
is no need for a dedicated tun_xdp_hdr structure. We can simply store
the virtio net header instead.

Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
Link: https://patch.msgid.link/20250701010352.74515-1-jasowang@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/tap.c      | 5 ++---
 drivers/net/tun.c      | 5 ++---
 drivers/vhost/net.c    | 8 ++------
 include/linux/if_tun.h | 5 -----
 4 files changed, 6 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index bdf0788d8e66..d82eb7276a8b 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -1044,9 +1044,8 @@ static const struct file_operations tap_fops = {
 
 static int tap_get_user_xdp(struct tap_queue *q, struct xdp_buff *xdp)
 {
-	struct tun_xdp_hdr *hdr = xdp->data_hard_start;
-	struct virtio_net_hdr *gso = &hdr->gso;
-	int buflen = hdr->buflen;
+	struct virtio_net_hdr *gso = xdp->data_hard_start;
+	int buflen = xdp->frame_sz;
 	int vnet_hdr_len = 0;
 	struct tap_dev *tap;
 	struct sk_buff *skb;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index f8c5e2fd04df..447c37959504 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2356,13 +2356,12 @@ static int tun_xdp_one(struct tun_struct *tun,
 		       struct tun_page *tpage)
 {
 	unsigned int datasize = xdp->data_end - xdp->data;
-	struct tun_xdp_hdr *hdr = xdp->data_hard_start;
-	struct virtio_net_hdr *gso = &hdr->gso;
+	struct virtio_net_hdr *gso = xdp->data_hard_start;
 	struct bpf_prog *xdp_prog;
 	struct sk_buff *skb = NULL;
 	struct sk_buff_head *queue;
 	u32 rxhash = 0, act;
-	int buflen = hdr->buflen;
+	int buflen = xdp->frame_sz;
 	int metasize = 0;
 	int ret = 0;
 	bool skb_xdp = false;
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 7cbfc7d718b3..777eb6193985 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -668,7 +668,6 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
 	struct socket *sock = vhost_vq_get_backend(vq);
 	struct virtio_net_hdr *gso;
 	struct xdp_buff *xdp = &nvq->xdp[nvq->batched_xdp];
-	struct tun_xdp_hdr *hdr;
 	size_t len = iov_iter_count(from);
 	int headroom = vhost_sock_xdp(sock) ? XDP_PACKET_HEADROOM : 0;
 	int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
@@ -691,15 +690,13 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
 	if (unlikely(!buf))
 		return -ENOMEM;
 
-	copied = copy_from_iter(buf + offsetof(struct tun_xdp_hdr, gso),
-				sock_hlen, from);
+	copied = copy_from_iter(buf, sock_hlen, from);
 	if (copied != sock_hlen) {
 		ret = -EFAULT;
 		goto err;
 	}
 
-	hdr = buf;
-	gso = &hdr->gso;
+	gso = buf;
 
 	if (!sock_hlen)
 		memset(buf, 0, pad);
@@ -727,7 +724,6 @@ static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
 
 	xdp_init_buff(xdp, buflen, NULL);
 	xdp_prepare_buff(xdp, buf, pad, len, true);
-	hdr->buflen = buflen;
 
 	++nvq->batched_xdp;
 
diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h
index 043d442994b0..80166eb62f41 100644
--- a/include/linux/if_tun.h
+++ b/include/linux/if_tun.h
@@ -19,11 +19,6 @@ struct tun_msg_ctl {
 	void *ptr;
 };
 
-struct tun_xdp_hdr {
-	int buflen;
-	struct virtio_net_hdr gso;
-};
-
 #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
 struct socket *tun_get_socket(struct file *);
 struct ptr_ring *tun_get_tx_ring(struct file *file);
-- 
cgit v1.2.3


From 42401c42389622424f2973ec57414f033ae6be8f Mon Sep 17 00:00:00 2001
From: Carolina Jubran <cjubran@nvidia.com>
Date: Sun, 29 Jun 2025 17:21:31 +0300
Subject: netlink: introduce type-checking attribute iteration for nlmsg

Add the nlmsg_for_each_attr_type() macro to simplify iteration over
attributes of a specific type in a Netlink message.

Convert existing users in vxlan and nfsd to use the new macro.

Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Carolina Jubran <cjubran@nvidia.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Link: https://patch.msgid.link/20250629142138.361537-2-mbloch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/vxlan/vxlan_vnifilter.c | 13 ++++---------
 fs/nfsd/nfsctl.c                    | 36 ++++++++++++++----------------------
 include/net/netlink.h               | 14 ++++++++++++++
 3 files changed, 32 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/drivers/net/vxlan/vxlan_vnifilter.c b/drivers/net/vxlan/vxlan_vnifilter.c
index 4ff56d9f8f28..adc89e651e27 100644
--- a/drivers/net/vxlan/vxlan_vnifilter.c
+++ b/drivers/net/vxlan/vxlan_vnifilter.c
@@ -971,15 +971,10 @@ static int vxlan_vnifilter_process(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (!(vxlan->cfg.flags & VXLAN_F_VNIFILTER))
 		return -EOPNOTSUPP;
 
-	nlmsg_for_each_attr(attr, nlh, sizeof(*tmsg), rem) {
-		switch (nla_type(attr)) {
-		case VXLAN_VNIFILTER_ENTRY:
-			err = vxlan_process_vni_filter(vxlan, attr,
-						       nlh->nlmsg_type, extack);
-			break;
-		default:
-			continue;
-		}
+	nlmsg_for_each_attr_type(attr, VXLAN_VNIFILTER_ENTRY, nlh,
+				 sizeof(*tmsg), rem) {
+		err = vxlan_process_vni_filter(vxlan, attr, nlh->nlmsg_type,
+					       extack);
 		vnis++;
 		if (err)
 			break;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 6a42cc7a845a..657d44afc062 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1621,10 +1621,9 @@ int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info)
 		return -EINVAL;
 
 	/* count number of SERVER_THREADS values */
-	nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) {
-		if (nla_type(attr) == NFSD_A_SERVER_THREADS)
-			nrpools++;
-	}
+	nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_THREADS, info->nlhdr,
+				 GENL_HDRLEN, rem)
+		nrpools++;
 
 	mutex_lock(&nfsd_mutex);
 
@@ -1635,12 +1634,11 @@ int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info)
 	}
 
 	i = 0;
-	nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) {
-		if (nla_type(attr) == NFSD_A_SERVER_THREADS) {
-			nthreads[i++] = nla_get_u32(attr);
-			if (i >= nrpools)
-				break;
-		}
+	nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_THREADS, info->nlhdr,
+				 GENL_HDRLEN, rem) {
+		nthreads[i++] = nla_get_u32(attr);
+		if (i >= nrpools)
+			break;
 	}
 
 	if (info->attrs[NFSD_A_SERVER_GRACETIME] ||
@@ -1781,14 +1779,12 @@ int nfsd_nl_version_set_doit(struct sk_buff *skb, struct genl_info *info)
 	for (i = 0; i <= NFSD_SUPPORTED_MINOR_VERSION; i++)
 		nfsd_minorversion(nn, i, NFSD_CLEAR);
 
-	nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) {
+	nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_PROTO_VERSION, info->nlhdr,
+				 GENL_HDRLEN, rem) {
 		struct nlattr *tb[NFSD_A_VERSION_MAX + 1];
 		u32 major, minor = 0;
 		bool enabled;
 
-		if (nla_type(attr) != NFSD_A_SERVER_PROTO_VERSION)
-			continue;
-
 		if (nla_parse_nested(tb, NFSD_A_VERSION_MAX, attr,
 				     nfsd_version_nl_policy, info->extack) < 0)
 			continue;
@@ -1939,14 +1935,12 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info)
 	 * Walk the list of server_socks from userland and move any that match
 	 * back to sv_permsocks
 	 */
-	nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) {
+	nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_SOCK_ADDR, info->nlhdr,
+				 GENL_HDRLEN, rem) {
 		struct nlattr *tb[NFSD_A_SOCK_MAX + 1];
 		const char *xcl_name;
 		struct sockaddr *sa;
 
-		if (nla_type(attr) != NFSD_A_SERVER_SOCK_ADDR)
-			continue;
-
 		if (nla_parse_nested(tb, NFSD_A_SOCK_MAX, attr,
 				     nfsd_sock_nl_policy, info->extack) < 0)
 			continue;
@@ -2001,15 +1995,13 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info)
 		svc_xprt_destroy_all(serv, net);
 
 	/* walk list of addrs again, open any that still don't exist */
-	nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) {
+	nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_SOCK_ADDR, info->nlhdr,
+				 GENL_HDRLEN, rem) {
 		struct nlattr *tb[NFSD_A_SOCK_MAX + 1];
 		const char *xcl_name;
 		struct sockaddr *sa;
 		int ret;
 
-		if (nla_type(attr) != NFSD_A_SERVER_SOCK_ADDR)
-			continue;
-
 		if (nla_parse_nested(tb, NFSD_A_SOCK_MAX, attr,
 				     nfsd_sock_nl_policy, info->extack) < 0)
 			continue;
diff --git a/include/net/netlink.h b/include/net/netlink.h
index 90a560dc167a..1a8356ca4b78 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -68,6 +68,8 @@
  *   nlmsg_for_each_msg()		loop over all messages
  *   nlmsg_validate()			validate netlink message incl. attrs
  *   nlmsg_for_each_attr()		loop over all attributes
+ *   nlmsg_for_each_attr_type()		loop over all attributes with the
+ *					given type
  *
  * Misc:
  *   nlmsg_report()			report back to application?
@@ -966,6 +968,18 @@ static inline u32 nlmsg_seq(const struct nlmsghdr *nlh)
 	nla_for_each_attr(pos, nlmsg_attrdata(nlh, hdrlen), \
 			  nlmsg_attrlen(nlh, hdrlen), rem)
 
+/**
+ * nlmsg_for_each_attr_type - iterate over attributes of a given type
+ * @pos: loop counter, set to the current attribute
+ * @type: required attribute type for @pos
+ * @nlh: netlink message header
+ * @hdrlen: length of the family specific header
+ * @rem: initialized to len, holds bytes currently remaining in stream
+ */
+#define nlmsg_for_each_attr_type(pos, type, nlh, hdrlen, rem) \
+	nlmsg_for_each_attr(pos, nlh, hdrlen, rem) \
+		if (nla_type(pos) == type)
+
 /**
  * nlmsg_put - Add a new netlink message to an skb
  * @skb: socket buffer to store message in
-- 
cgit v1.2.3


From 566e8f108fc7847f2a8676ec6a101d37b7dd0fb4 Mon Sep 17 00:00:00 2001
From: Carolina Jubran <cjubran@nvidia.com>
Date: Sun, 29 Jun 2025 17:21:32 +0300
Subject: devlink: Extend devlink rate API with traffic classes bandwidth
 management

Introduce support for specifying relative bandwidth shares between
traffic classes (TC) in the devlink-rate API. This new option allows
users to allocate bandwidth across multiple traffic classes in a
single command.

This feature provides more granular control over traffic management,
especially for scenarios requiring Enhanced Transmission Selection.

Users can now define a relative bandwidth share for each traffic class.
For example, assigning share values of 20 to TC0 (TCP/UDP) and 80 to TC5
(RoCE) will result in TC0 receiving 20% and TC5 receiving 80% of the
total bandwidth. The actual percentage each class receives depends on
the ratio of its share value to the sum of all shares.

Example:
DEV=pci/0000:08:00.0

$ devlink port function rate add $DEV/vfs_group tx_share 10Gbit \
  tx_max 50Gbit tc-bw 0:20 1:0 2:0 3:0 4:0 5:80 6:0 7:0

$ devlink port function rate set $DEV/vfs_group \
  tc-bw 0:20 1:0 2:0 3:0 4:0 5:20 6:60 7:0

Example usage with ynl:

./tools/net/ynl/cli.py --spec Documentation/netlink/specs/devlink.yaml \
  --do rate-set --json '{
  "bus-name": "pci",
  "dev-name": "0000:08:00.0",
  "port-index": 1,
  "rate-tc-bws": [
    {"rate-tc-index": 0, "rate-tc-bw": 50},
    {"rate-tc-index": 1, "rate-tc-bw": 50},
    {"rate-tc-index": 2, "rate-tc-bw": 0},
    {"rate-tc-index": 3, "rate-tc-bw": 0},
    {"rate-tc-index": 4, "rate-tc-bw": 0},
    {"rate-tc-index": 5, "rate-tc-bw": 0},
    {"rate-tc-index": 6, "rate-tc-bw": 0},
    {"rate-tc-index": 7, "rate-tc-bw": 0}
  ]
}'

./tools/net/ynl/cli.py --spec Documentation/netlink/specs/devlink.yaml \
  --do rate-get --json '{
  "bus-name": "pci",
  "dev-name": "0000:08:00.0",
  "port-index": 1
}'

output for rate-get:
{'bus-name': 'pci',
 'dev-name': '0000:08:00.0',
 'port-index': 1,
 'rate-tc-bws': [{'rate-tc-bw': 50, 'rate-tc-index': 0},
                 {'rate-tc-bw': 50, 'rate-tc-index': 1},
                 {'rate-tc-bw': 0, 'rate-tc-index': 2},
                 {'rate-tc-bw': 0, 'rate-tc-index': 3},
                 {'rate-tc-bw': 0, 'rate-tc-index': 4},
                 {'rate-tc-bw': 0, 'rate-tc-index': 5},
                 {'rate-tc-bw': 0, 'rate-tc-index': 6},
                 {'rate-tc-bw': 0, 'rate-tc-index': 7}],
 'rate-tx-max': 0,
 'rate-tx-priority': 0,
 'rate-tx-share': 0,
 'rate-tx-weight': 0,
 'rate-type': 'leaf'}

Signed-off-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Link: https://patch.msgid.link/20250629142138.361537-3-mbloch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/devlink.yaml          |  32 +++++-
 Documentation/networking/devlink/devlink-port.rst |   8 ++
 include/net/devlink.h                             |   8 ++
 include/uapi/linux/devlink.h                      |   9 ++
 net/devlink/netlink_gen.c                         |  15 ++-
 net/devlink/netlink_gen.h                         |   1 +
 net/devlink/rate.c                                | 127 ++++++++++++++++++++++
 7 files changed, 195 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/Documentation/netlink/specs/devlink.yaml b/Documentation/netlink/specs/devlink.yaml
index bfba466d694a..1c4bb0cbe5f0 100644
--- a/Documentation/netlink/specs/devlink.yaml
+++ b/Documentation/netlink/specs/devlink.yaml
@@ -224,6 +224,10 @@ definitions:
         value: 10
       -
         name: binary
+  -
+    name: rate-tc-index-max
+    type: const
+    value: 7
 
 attribute-sets:
   -
@@ -844,7 +848,23 @@ attribute-sets:
       -
         name: region-direct
         type: flag
-
+      -
+        name: rate-tc-bws
+        type: nest
+        multi-attr: true
+        nested-attributes: dl-rate-tc-bws
+      -
+        name: rate-tc-index
+        type: u8
+        checks:
+          max: rate-tc-index-max
+      -
+        name: rate-tc-bw
+        type: u32
+        doc: |
+             Specifies the bandwidth share assigned to the Traffic Class.
+             The bandwidth for the traffic class is determined
+             in proportion to the sum of the shares of all configured classes.
   -
     name: dl-dev-stats
     subset-of: devlink
@@ -1249,6 +1269,14 @@ attribute-sets:
       -
         name: flash
         type: flag
+  -
+    name: dl-rate-tc-bws
+    subset-of: devlink
+    attributes:
+      -
+        name: rate-tc-index
+      -
+        name: rate-tc-bw
 
 operations:
   enum-model: directional
@@ -2176,6 +2204,7 @@ operations:
             - rate-tx-priority
             - rate-tx-weight
             - rate-parent-node-name
+            - rate-tc-bws
 
     -
       name: rate-new
@@ -2196,6 +2225,7 @@ operations:
             - rate-tx-priority
             - rate-tx-weight
             - rate-parent-node-name
+            - rate-tc-bws
 
     -
       name: rate-del
diff --git a/Documentation/networking/devlink/devlink-port.rst b/Documentation/networking/devlink/devlink-port.rst
index 9d22d41a7cd1..5e397798a402 100644
--- a/Documentation/networking/devlink/devlink-port.rst
+++ b/Documentation/networking/devlink/devlink-port.rst
@@ -418,6 +418,14 @@ API allows to configure following rate object's parameters:
   to all node children limits. ``tx_max`` is an upper limit for children.
   ``tx_share`` is a total bandwidth distributed among children.
 
+``tc_bw``
+  Allow users to set the bandwidth allocation per traffic class on rate
+  objects. This enables fine-grained QoS configurations by assigning a relative
+  share value to each traffic class. The bandwidth is distributed in proportion
+  to the share value for each class, relative to the sum of all shares.
+  When applied to a non-leaf node, tc_bw determines how bandwidth is shared
+  among its child elements.
+
 ``tx_priority`` and ``tx_weight`` can be used simultaneously. In that case
 nodes with the same priority form a WFQ subgroup in the sibling group
 and arbitration among them is based on assigned weights.
diff --git a/include/net/devlink.h b/include/net/devlink.h
index 63517646a497..d0ce5a7e984c 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -118,6 +118,8 @@ struct devlink_rate {
 
 	u32 tx_priority;
 	u32 tx_weight;
+
+	u32 tc_bw[DEVLINK_RATE_TCS_MAX];
 };
 
 struct devlink_port {
@@ -1486,6 +1488,9 @@ struct devlink_ops {
 					 u32 tx_priority, struct netlink_ext_ack *extack);
 	int (*rate_leaf_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv,
 				       u32 tx_weight, struct netlink_ext_ack *extack);
+	int (*rate_leaf_tc_bw_set)(struct devlink_rate *devlink_rate,
+				   void *priv, u32 *tc_bw,
+				   struct netlink_ext_ack *extack);
 	int (*rate_node_tx_share_set)(struct devlink_rate *devlink_rate, void *priv,
 				      u64 tx_share, struct netlink_ext_ack *extack);
 	int (*rate_node_tx_max_set)(struct devlink_rate *devlink_rate, void *priv,
@@ -1494,6 +1499,9 @@ struct devlink_ops {
 					 u32 tx_priority, struct netlink_ext_ack *extack);
 	int (*rate_node_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv,
 				       u32 tx_weight, struct netlink_ext_ack *extack);
+	int (*rate_node_tc_bw_set)(struct devlink_rate *devlink_rate,
+				   void *priv, u32 *tc_bw,
+				   struct netlink_ext_ack *extack);
 	int (*rate_node_new)(struct devlink_rate *rate_node, void **priv,
 			     struct netlink_ext_ack *extack);
 	int (*rate_node_del)(struct devlink_rate *rate_node, void *priv,
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index a5ee0f13740a..e72bcc239afd 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -221,6 +221,11 @@ enum devlink_port_flavour {
 				      */
 };
 
+/* IEEE 802.1Qaz standard supported values. */
+
+#define DEVLINK_RATE_TCS_MAX 8
+#define DEVLINK_RATE_TC_INDEX_MAX (DEVLINK_RATE_TCS_MAX - 1)
+
 enum devlink_rate_type {
 	DEVLINK_RATE_TYPE_LEAF,
 	DEVLINK_RATE_TYPE_NODE,
@@ -629,6 +634,10 @@ enum devlink_attr {
 
 	DEVLINK_ATTR_REGION_DIRECT,		/* flag */
 
+	DEVLINK_ATTR_RATE_TC_BWS,		/* nested */
+	DEVLINK_ATTR_RATE_TC_INDEX,		/* u8 */
+	DEVLINK_ATTR_RATE_TC_BW,		/* u32 */
+
 	/* Add new attributes above here, update the spec in
 	 * Documentation/netlink/specs/devlink.yaml and re-generate
 	 * net/devlink/netlink_gen.c.
diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c
index e340d955cf3b..c50436433c18 100644
--- a/net/devlink/netlink_gen.c
+++ b/net/devlink/netlink_gen.c
@@ -45,6 +45,11 @@ const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_
 	[DEVLINK_PORT_FN_ATTR_CAPS] = NLA_POLICY_BITFIELD32(15),
 };
 
+const struct nla_policy devlink_dl_rate_tc_bws_nl_policy[DEVLINK_ATTR_RATE_TC_BW + 1] = {
+	[DEVLINK_ATTR_RATE_TC_INDEX] = NLA_POLICY_MAX(NLA_U8, DEVLINK_RATE_TC_INDEX_MAX),
+	[DEVLINK_ATTR_RATE_TC_BW] = { .type = NLA_U32, },
+};
+
 const struct nla_policy devlink_dl_selftest_id_nl_policy[DEVLINK_ATTR_SELFTEST_ID_FLASH + 1] = {
 	[DEVLINK_ATTR_SELFTEST_ID_FLASH] = { .type = NLA_FLAG, },
 };
@@ -523,7 +528,7 @@ static const struct nla_policy devlink_rate_get_dump_nl_policy[DEVLINK_ATTR_DEV_
 };
 
 /* DEVLINK_CMD_RATE_SET - do */
-static const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TX_WEIGHT + 1] = {
+static const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TC_BWS + 1] = {
 	[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
 	[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
 	[DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING, },
@@ -532,10 +537,11 @@ static const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TX_W
 	[DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32, },
 	[DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32, },
 	[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING, },
+	[DEVLINK_ATTR_RATE_TC_BWS] = NLA_POLICY_NESTED(devlink_dl_rate_tc_bws_nl_policy),
 };
 
 /* DEVLINK_CMD_RATE_NEW - do */
-static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TX_WEIGHT + 1] = {
+static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TC_BWS + 1] = {
 	[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
 	[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
 	[DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING, },
@@ -544,6 +550,7 @@ static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TX_W
 	[DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32, },
 	[DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32, },
 	[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING, },
+	[DEVLINK_ATTR_RATE_TC_BWS] = NLA_POLICY_NESTED(devlink_dl_rate_tc_bws_nl_policy),
 };
 
 /* DEVLINK_CMD_RATE_DEL - do */
@@ -1191,7 +1198,7 @@ const struct genl_split_ops devlink_nl_ops[74] = {
 		.doit		= devlink_nl_rate_set_doit,
 		.post_doit	= devlink_nl_post_doit,
 		.policy		= devlink_rate_set_nl_policy,
-		.maxattr	= DEVLINK_ATTR_RATE_TX_WEIGHT,
+		.maxattr	= DEVLINK_ATTR_RATE_TC_BWS,
 		.flags		= GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
 	},
 	{
@@ -1201,7 +1208,7 @@ const struct genl_split_ops devlink_nl_ops[74] = {
 		.doit		= devlink_nl_rate_new_doit,
 		.post_doit	= devlink_nl_post_doit,
 		.policy		= devlink_rate_new_nl_policy,
-		.maxattr	= DEVLINK_ATTR_RATE_TX_WEIGHT,
+		.maxattr	= DEVLINK_ATTR_RATE_TC_BWS,
 		.flags		= GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
 	},
 	{
diff --git a/net/devlink/netlink_gen.h b/net/devlink/netlink_gen.h
index 8f2bd50ddf5e..fb733b5d4ff1 100644
--- a/net/devlink/netlink_gen.h
+++ b/net/devlink/netlink_gen.h
@@ -13,6 +13,7 @@
 
 /* Common nested types */
 extern const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_CAPS + 1];
+extern const struct nla_policy devlink_dl_rate_tc_bws_nl_policy[DEVLINK_ATTR_RATE_TC_BW + 1];
 extern const struct nla_policy devlink_dl_selftest_id_nl_policy[DEVLINK_ATTR_SELFTEST_ID_FLASH + 1];
 
 /* Ops table for devlink */
diff --git a/net/devlink/rate.c b/net/devlink/rate.c
index 8828ffaf6cbc..d39300a9b3d4 100644
--- a/net/devlink/rate.c
+++ b/net/devlink/rate.c
@@ -80,6 +80,29 @@ devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info)
 		return ERR_PTR(-EINVAL);
 }
 
+static int devlink_rate_put_tc_bws(struct sk_buff *msg, u32 *tc_bw)
+{
+	struct nlattr *nla_tc_bw;
+	int i;
+
+	for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++) {
+		nla_tc_bw = nla_nest_start(msg, DEVLINK_ATTR_RATE_TC_BWS);
+		if (!nla_tc_bw)
+			return -EMSGSIZE;
+
+		if (nla_put_u8(msg, DEVLINK_ATTR_RATE_TC_INDEX, i) ||
+		    nla_put_u32(msg, DEVLINK_ATTR_RATE_TC_BW, tc_bw[i]))
+			goto nla_put_failure;
+
+		nla_nest_end(msg, nla_tc_bw);
+	}
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(msg, nla_tc_bw);
+	return -EMSGSIZE;
+}
+
 static int devlink_nl_rate_fill(struct sk_buff *msg,
 				struct devlink_rate *devlink_rate,
 				enum devlink_command cmd, u32 portid, u32 seq,
@@ -129,6 +152,9 @@ static int devlink_nl_rate_fill(struct sk_buff *msg,
 				   devlink_rate->parent->name))
 			goto nla_put_failure;
 
+	if (devlink_rate_put_tc_bws(msg, devlink_rate->tc_bw))
+		goto nla_put_failure;
+
 	genlmsg_end(msg, hdr);
 	return 0;
 
@@ -316,6 +342,87 @@ devlink_nl_rate_parent_node_set(struct devlink_rate *devlink_rate,
 	return 0;
 }
 
+static int devlink_nl_rate_tc_bw_parse(struct nlattr *parent_nest, u32 *tc_bw,
+				       unsigned long *bitmap,
+				       struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[DEVLINK_ATTR_MAX + 1];
+	u8 tc_index;
+	int err;
+
+	err = nla_parse_nested(tb, DEVLINK_ATTR_MAX, parent_nest,
+			       devlink_dl_rate_tc_bws_nl_policy, extack);
+	if (err)
+		return err;
+
+	if (!tb[DEVLINK_ATTR_RATE_TC_INDEX]) {
+		NL_SET_ERR_ATTR_MISS(extack, parent_nest,
+				     DEVLINK_ATTR_RATE_TC_INDEX);
+		return -EINVAL;
+	}
+
+	tc_index = nla_get_u8(tb[DEVLINK_ATTR_RATE_TC_INDEX]);
+
+	if (!tb[DEVLINK_ATTR_RATE_TC_BW]) {
+		NL_SET_ERR_ATTR_MISS(extack, parent_nest,
+				     DEVLINK_ATTR_RATE_TC_BW);
+		return -EINVAL;
+	}
+
+	if (test_and_set_bit(tc_index, bitmap)) {
+		NL_SET_ERR_MSG_FMT(extack,
+				   "Duplicate traffic class index specified (%u)",
+				   tc_index);
+		return -EINVAL;
+	}
+
+	tc_bw[tc_index] = nla_get_u32(tb[DEVLINK_ATTR_RATE_TC_BW]);
+
+	return 0;
+}
+
+static int devlink_nl_rate_tc_bw_set(struct devlink_rate *devlink_rate,
+				     struct genl_info *info)
+{
+	DECLARE_BITMAP(bitmap, DEVLINK_RATE_TCS_MAX) = {};
+	struct devlink *devlink = devlink_rate->devlink;
+	const struct devlink_ops *ops = devlink->ops;
+	u32 tc_bw[DEVLINK_RATE_TCS_MAX] = {};
+	int rem, err = -EOPNOTSUPP, i;
+	struct nlattr *attr;
+
+	nlmsg_for_each_attr_type(attr, DEVLINK_ATTR_RATE_TC_BWS, info->nlhdr,
+				 GENL_HDRLEN, rem) {
+		err = devlink_nl_rate_tc_bw_parse(attr, tc_bw, bitmap,
+						  info->extack);
+		if (err)
+			return err;
+	}
+
+	for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++) {
+		if (!test_bit(i, bitmap)) {
+			NL_SET_ERR_MSG_FMT(info->extack,
+					   "Bandwidth values must be specified for all %u traffic classes",
+					   DEVLINK_RATE_TCS_MAX);
+			return -EINVAL;
+		}
+	}
+
+	if (devlink_rate_is_leaf(devlink_rate))
+		err = ops->rate_leaf_tc_bw_set(devlink_rate, devlink_rate->priv,
+					       tc_bw, info->extack);
+	else if (devlink_rate_is_node(devlink_rate))
+		err = ops->rate_node_tc_bw_set(devlink_rate, devlink_rate->priv,
+					       tc_bw, info->extack);
+
+	if (err)
+		return err;
+
+	memcpy(devlink_rate->tc_bw, tc_bw, sizeof(tc_bw));
+
+	return 0;
+}
+
 static int devlink_nl_rate_set(struct devlink_rate *devlink_rate,
 			       const struct devlink_ops *ops,
 			       struct genl_info *info)
@@ -388,6 +495,12 @@ static int devlink_nl_rate_set(struct devlink_rate *devlink_rate,
 			return err;
 	}
 
+	if (attrs[DEVLINK_ATTR_RATE_TC_BWS]) {
+		err = devlink_nl_rate_tc_bw_set(devlink_rate, info);
+		if (err)
+			return err;
+	}
+
 	return 0;
 }
 
@@ -423,6 +536,13 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops,
 					    "TX weight set isn't supported for the leafs");
 			return false;
 		}
+		if (attrs[DEVLINK_ATTR_RATE_TC_BWS] &&
+		    !ops->rate_leaf_tc_bw_set) {
+			NL_SET_ERR_MSG_ATTR(info->extack,
+					    attrs[DEVLINK_ATTR_RATE_TC_BWS],
+					    "TC bandwidth set isn't supported for the leafs");
+			return false;
+		}
 	} else if (type == DEVLINK_RATE_TYPE_NODE) {
 		if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) {
 			NL_SET_ERR_MSG(info->extack, "TX share set isn't supported for the nodes");
@@ -449,6 +569,13 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops,
 					    "TX weight set isn't supported for the nodes");
 			return false;
 		}
+		if (attrs[DEVLINK_ATTR_RATE_TC_BWS] &&
+		    !ops->rate_node_tc_bw_set) {
+			NL_SET_ERR_MSG_ATTR(info->extack,
+					    attrs[DEVLINK_ATTR_RATE_TC_BWS],
+					    "TC bandwidth set isn't supported for the nodes");
+			return false;
+		}
 	} else {
 		WARN(1, "Unknown type of rate object");
 		return false;
-- 
cgit v1.2.3


From cf73d9970ea4f8cace5d8f02d2565a2723003112 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 2 Jul 2025 21:31:54 +0100
Subject: io_uring: don't use int for ABI

__kernel_rwf_t is defined as int, the actual size of which is
implementation defined. It won't go well if some compiler / arch
ever defines it as i64, so replace it with __u32, hoping that
there is no one using i16 for it.

Cc: stable@vger.kernel.org
Fixes: 2b188cc1bb857 ("Add io_uring IO interface")
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/47c666c4ee1df2018863af3a2028af18feef11ed.1751412511.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index b6be063693c8..b8a0e70ee2fd 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -50,7 +50,7 @@ struct io_uring_sqe {
 	};
 	__u32	len;		/* buffer size or number of iovecs */
 	union {
-		__kernel_rwf_t	rw_flags;
+		__u32		rw_flags;
 		__u32		fsync_flags;
 		__u16		poll_events;	/* compatibility */
 		__u32		poll32_events;	/* word-reversed for BE */
-- 
cgit v1.2.3


From 9fd45235fdd2c2615a03c86ebe5a88b050dc5680 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 9 May 2024 16:32:51 -0400
Subject: add locked_recursive_removal()

simple_recursive_removal() assumes that parent is not locked and
locks it when it finally gets to removing the victim itself.
Usually that's what we want, but there are places where the
parent is *already* locked and we need it to stay that way.
In those cases simple_recursive_removal() would, of course,
deadlock, so we have to play racy games with unlocking/relocking
the parent around the call or open-code the entire thing.

A better solution is to provide a variant that expects to
be called with the parent already locked by the caller.
Parent should be locked with I_MUTEX_PARENT, to avoid false
positives from lockdep.

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/libfs.c         | 25 +++++++++++++++++++++----
 include/linux/fs.h |  2 ++
 2 files changed, 23 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/fs/libfs.c b/fs/libfs.c
index 20b05a6db770..429caacc5229 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -605,8 +605,9 @@ struct dentry *find_next_child(struct dentry *parent, struct dentry *prev)
 }
 EXPORT_SYMBOL(find_next_child);
 
-void simple_recursive_removal(struct dentry *dentry,
-                              void (*callback)(struct dentry *))
+static void __simple_recursive_removal(struct dentry *dentry,
+                              void (*callback)(struct dentry *),
+			      bool locked)
 {
 	struct dentry *this = dget(dentry);
 	while (true) {
@@ -625,7 +626,8 @@ void simple_recursive_removal(struct dentry *dentry,
 			victim = this;
 			this = this->d_parent;
 			inode = this->d_inode;
-			inode_lock_nested(inode, I_MUTEX_CHILD);
+			if (!locked || victim != dentry)
+				inode_lock_nested(inode, I_MUTEX_CHILD);
 			if (simple_positive(victim)) {
 				d_invalidate(victim);	// avoid lost mounts
 				if (callback)
@@ -638,7 +640,8 @@ void simple_recursive_removal(struct dentry *dentry,
 						      inode_set_ctime_current(inode));
 				if (d_is_dir(dentry))
 					drop_nlink(inode);
-				inode_unlock(inode);
+				if (!locked)
+					inode_unlock(inode);
 				dput(dentry);
 				return;
 			}
@@ -647,8 +650,22 @@ void simple_recursive_removal(struct dentry *dentry,
 		this = child;
 	}
 }
+
+void simple_recursive_removal(struct dentry *dentry,
+                              void (*callback)(struct dentry *))
+{
+	return __simple_recursive_removal(dentry, callback, false);
+}
 EXPORT_SYMBOL(simple_recursive_removal);
 
+/* caller holds parent directory with I_MUTEX_PARENT */
+void locked_recursive_removal(struct dentry *dentry,
+                              void (*callback)(struct dentry *))
+{
+	return __simple_recursive_removal(dentry, callback, true);
+}
+EXPORT_SYMBOL(locked_recursive_removal);
+
 static const struct super_operations simple_super_operations = {
 	.statfs		= simple_statfs,
 };
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 96c7925a6551..4f0c6bf8d652 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3595,6 +3595,8 @@ extern int simple_rename(struct mnt_idmap *, struct inode *,
 			 unsigned int);
 extern void simple_recursive_removal(struct dentry *,
                               void (*callback)(struct dentry *));
+extern void locked_recursive_removal(struct dentry *,
+                              void (*callback)(struct dentry *));
 extern int noop_fsync(struct file *, loff_t, loff_t, int);
 extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
 extern int simple_empty(struct dentry *);
-- 
cgit v1.2.3


From 59200f45267481582f4e42334a510f01d0b89449 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 22 Mar 2025 21:06:11 -0400
Subject: new helper: simple_start_creating()

Set things up for kernel-initiated creation of an object in
a tree-in-dcache filesystem.  With respect to locking it's
an equivalent of filename_create() - we either get a negative
dentry with locked parent, or ERR_PTR() and no locks taken.

tracefs and debugfs had that open-coded as part of their
object creation machinery; switched to calling new helper.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/debugfs/inode.c | 21 +++------------------
 fs/libfs.c         | 25 +++++++++++++++++++++++++
 fs/tracefs/inode.c | 15 ++-------------
 include/linux/fs.h |  1 +
 4 files changed, 31 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 30c4944e1862..52befd94acee 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -384,27 +384,12 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
 	if (!parent)
 		parent = debugfs_mount->mnt_root;
 
-	inode_lock(d_inode(parent));
-	if (unlikely(IS_DEADDIR(d_inode(parent))))
-		dentry = ERR_PTR(-ENOENT);
-	else
-		dentry = lookup_noperm(&QSTR(name), parent);
-	if (!IS_ERR(dentry) && d_really_is_positive(dentry)) {
-		if (d_is_dir(dentry))
-			pr_err("Directory '%s' with parent '%s' already present!\n",
-			       name, parent->d_name.name);
-		else
-			pr_err("File '%s' in directory '%s' already present!\n",
-			       name, parent->d_name.name);
-		dput(dentry);
-		dentry = ERR_PTR(-EEXIST);
-	}
-
+	dentry = simple_start_creating(parent, name);
 	if (IS_ERR(dentry)) {
-		inode_unlock(d_inode(parent));
+		if (dentry == ERR_PTR(-EEXIST))
+			pr_err("'%s' already exists in '%pd'\n", name, parent);
 		simple_release_fs(&debugfs_mount, &debugfs_mount_count);
 	}
-
 	return dentry;
 }
 
diff --git a/fs/libfs.c b/fs/libfs.c
index 20b05a6db770..dbbfa3703937 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -2260,3 +2260,28 @@ void stashed_dentry_prune(struct dentry *dentry)
 	 */
 	cmpxchg(stashed, dentry, NULL);
 }
+
+/* locks parent; returns negative dentry with parent locked, or ERR_PTR() with parent unlocked */
+struct dentry *simple_start_creating(struct dentry *parent, const char *name)
+{
+	struct dentry *dentry;
+	struct inode *dir = d_inode(parent);
+
+	inode_lock(dir);
+	if (unlikely(IS_DEADDIR(dir))) {
+		inode_unlock(dir);
+		return ERR_PTR(-ENOENT);
+	}
+	dentry = lookup_noperm(&QSTR(name), parent);
+	if (IS_ERR(dentry)) {
+		inode_unlock(dir);
+		return dentry;
+	}
+	if (dentry->d_inode) {
+		dput(dentry);
+		inode_unlock(dir);
+		return ERR_PTR(-EEXIST);
+	}
+	return dentry;
+}
+EXPORT_SYMBOL(simple_start_creating);
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index a3fd3cc591bd..4e5d091e9263 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -551,20 +551,9 @@ struct dentry *tracefs_start_creating(const char *name, struct dentry *parent)
 	if (!parent)
 		parent = tracefs_mount->mnt_root;
 
-	inode_lock(d_inode(parent));
-	if (unlikely(IS_DEADDIR(d_inode(parent))))
-		dentry = ERR_PTR(-ENOENT);
-	else
-		dentry = lookup_noperm(&QSTR(name), parent);
-	if (!IS_ERR(dentry) && d_inode(dentry)) {
-		dput(dentry);
-		dentry = ERR_PTR(-EEXIST);
-	}
-
-	if (IS_ERR(dentry)) {
-		inode_unlock(d_inode(parent));
+	dentry = simple_start_creating(parent, name);
+	if (IS_ERR(dentry))
 		simple_release_fs(&tracefs_mount, &tracefs_mount_count);
-	}
 
 	return dentry;
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 96c7925a6551..9f75f8836bbd 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3619,6 +3619,7 @@ extern int simple_fill_super(struct super_block *, unsigned long,
 			     const struct tree_descr *);
 extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count);
 extern void simple_release_fs(struct vfsmount **mount, int *count);
+struct dentry *simple_start_creating(struct dentry *, const char *);
 
 extern ssize_t simple_read_from_buffer(void __user *to, size_t count,
 			loff_t *ppos, const void *from, size_t available);
-- 
cgit v1.2.3


From bccea4ed060f1f6476ac7a0649ffa73f77d6e94c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 20 Feb 2024 00:24:19 -0500
Subject: rpc_unlink(): saner calling conventions

1) pass it pipe instead of pipe->dentry
2) zero pipe->dentry afterwards
3) it always returns 0; why bother?

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfs/blocklayout/rpc_pipefs.c    | 12 ++----------
 fs/nfs/nfs4idmap.c                 |  6 +-----
 fs/nfsd/nfs4recover.c              | 12 ++----------
 include/linux/sunrpc/rpc_pipe_fs.h |  2 +-
 net/sunrpc/auth_gss/auth_gss.c     |  6 +-----
 net/sunrpc/rpc_pipe.c              | 12 +++++++-----
 6 files changed, 14 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
index d8d50a88de04..25d429e44eb4 100644
--- a/fs/nfs/blocklayout/rpc_pipefs.c
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -154,13 +154,6 @@ static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb,
 	return dentry;
 }
 
-static void nfs4blocklayout_unregister_sb(struct super_block *sb,
-					  struct rpc_pipe *pipe)
-{
-	if (pipe->dentry)
-		rpc_unlink(pipe->dentry);
-}
-
 static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
 			   void *ptr)
 {
@@ -188,8 +181,7 @@ static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
 		nn->bl_device_pipe->dentry = dentry;
 		break;
 	case RPC_PIPEFS_UMOUNT:
-		if (nn->bl_device_pipe->dentry)
-			nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe);
+		rpc_unlink(nn->bl_device_pipe);
 		break;
 	default:
 		ret = -ENOTSUPP;
@@ -224,7 +216,7 @@ static void nfs4blocklayout_unregister_net(struct net *net,
 
 	pipefs_sb = rpc_get_sb_net(net);
 	if (pipefs_sb) {
-		nfs4blocklayout_unregister_sb(pipefs_sb, pipe);
+		rpc_unlink(pipe);
 		rpc_put_sb_net(net);
 	}
 }
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index 25a7c771cfd8..adc03232b851 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -424,12 +424,8 @@ static void nfs_idmap_pipe_destroy(struct dentry *dir,
 		struct rpc_pipe_dir_object *pdo)
 {
 	struct idmap *idmap = pdo->pdo_data;
-	struct rpc_pipe *pipe = idmap->idmap_pipe;
 
-	if (pipe->dentry) {
-		rpc_unlink(pipe->dentry);
-		pipe->dentry = NULL;
-	}
+	rpc_unlink(idmap->idmap_pipe);
 }
 
 static int nfs_idmap_pipe_create(struct dentry *dir,
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 82785db730d9..bbd29b3b573f 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -963,13 +963,6 @@ nfsd4_cld_register_sb(struct super_block *sb, struct rpc_pipe *pipe)
 	return dentry;
 }
 
-static void
-nfsd4_cld_unregister_sb(struct rpc_pipe *pipe)
-{
-	if (pipe->dentry)
-		rpc_unlink(pipe->dentry);
-}
-
 static struct dentry *
 nfsd4_cld_register_net(struct net *net, struct rpc_pipe *pipe)
 {
@@ -991,7 +984,7 @@ nfsd4_cld_unregister_net(struct net *net, struct rpc_pipe *pipe)
 
 	sb = rpc_get_sb_net(net);
 	if (sb) {
-		nfsd4_cld_unregister_sb(pipe);
+		rpc_unlink(pipe);
 		rpc_put_sb_net(net);
 	}
 }
@@ -2142,8 +2135,7 @@ rpc_pipefs_event(struct notifier_block *nb, unsigned long event, void *ptr)
 		cn->cn_pipe->dentry = dentry;
 		break;
 	case RPC_PIPEFS_UMOUNT:
-		if (cn->cn_pipe->dentry)
-			nfsd4_cld_unregister_sb(cn->cn_pipe);
+		rpc_unlink(cn->cn_pipe);
 		break;
 	default:
 		ret = -ENOTSUPP;
diff --git a/include/linux/sunrpc/rpc_pipe_fs.h b/include/linux/sunrpc/rpc_pipe_fs.h
index 3b35b6f6533a..a8c0a500d55c 100644
--- a/include/linux/sunrpc/rpc_pipe_fs.h
+++ b/include/linux/sunrpc/rpc_pipe_fs.h
@@ -129,7 +129,7 @@ struct rpc_pipe *rpc_mkpipe_data(const struct rpc_pipe_ops *ops, int flags);
 void rpc_destroy_pipe_data(struct rpc_pipe *pipe);
 extern struct dentry *rpc_mkpipe_dentry(struct dentry *, const char *, void *,
 					struct rpc_pipe *);
-extern int rpc_unlink(struct dentry *);
+extern void rpc_unlink(struct rpc_pipe *);
 extern int register_rpc_pipefs(void);
 extern void unregister_rpc_pipefs(void);
 
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 0fa244f16876..f2a44d589cfb 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -887,12 +887,8 @@ static void gss_pipe_dentry_destroy(struct dentry *dir,
 		struct rpc_pipe_dir_object *pdo)
 {
 	struct gss_pipe *gss_pipe = pdo->pdo_data;
-	struct rpc_pipe *pipe = gss_pipe->pipe;
 
-	if (pipe->dentry != NULL) {
-		rpc_unlink(pipe->dentry);
-		pipe->dentry = NULL;
-	}
+	rpc_unlink(gss_pipe->pipe);
 }
 
 static int gss_pipe_dentry_create(struct dentry *dir,
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 46fa00ac5e0e..2046582c4f35 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -737,17 +737,19 @@ EXPORT_SYMBOL_GPL(rpc_mkpipe_dentry);
 
 /**
  * rpc_unlink - remove a pipe
- * @dentry: dentry for the pipe, as returned from rpc_mkpipe
+ * @pipe: the pipe to be removed
  *
  * After this call, lookups will no longer find the pipe, and any
  * attempts to read or write using preexisting opens of the pipe will
  * return -EPIPE.
  */
-int
-rpc_unlink(struct dentry *dentry)
+void
+rpc_unlink(struct rpc_pipe *pipe)
 {
-	simple_recursive_removal(dentry, rpc_close_pipes);
-	return 0;
+	if (pipe->dentry) {
+		simple_recursive_removal(pipe->dentry, rpc_close_pipes);
+		pipe->dentry = NULL;
+	}
 }
 EXPORT_SYMBOL_GPL(rpc_unlink);
 
-- 
cgit v1.2.3


From 19a6314a997f6adde0c100ecf9224d1ab43c9603 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 20 Feb 2024 02:41:59 -0500
Subject: rpc_mkpipe_dentry(): saner calling conventions

Instead of returning a dentry or ERR_PTR(-E...), return 0 and store
dentry into pipe->dentry on success and return -E... on failure.

Callers are happier that way...

NOTE: dummy rpc_pipe is getting ->dentry set; we never access that,
since we
	1) never call rpc_unlink() for it (dentry is taken out by
->kill_sb())
	2) never call rpc_queue_upcall() for it (writing to that
sucker fails; no downcalls are ever submitted, so no replies are
going to arrive)
IOW, having that ->dentry set (and left dangling) is harmless,
if ugly; cleaner solution will take more massage.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfs/blocklayout/rpc_pipefs.c    | 41 +++++++++++++++-----------------------
 fs/nfs/nfs4idmap.c                 |  8 +-------
 fs/nfsd/nfs4recover.c              | 37 +++++++++++++---------------------
 include/linux/sunrpc/rpc_pipe_fs.h |  2 +-
 net/sunrpc/auth_gss/auth_gss.c     |  7 +------
 net/sunrpc/rpc_pipe.c              | 29 +++++++++++----------------
 6 files changed, 45 insertions(+), 79 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
index 25d429e44eb4..d526f5ba7887 100644
--- a/fs/nfs/blocklayout/rpc_pipefs.c
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -141,17 +141,18 @@ static const struct rpc_pipe_ops bl_upcall_ops = {
 	.destroy_msg	= bl_pipe_destroy_msg,
 };
 
-static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb,
+static int nfs4blocklayout_register_sb(struct super_block *sb,
 					    struct rpc_pipe *pipe)
 {
-	struct dentry *dir, *dentry;
+	struct dentry *dir;
+	int err;
 
 	dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME);
 	if (dir == NULL)
-		return ERR_PTR(-ENOENT);
-	dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe);
+		return -ENOENT;
+	err = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe);
 	dput(dir);
-	return dentry;
+	return err;
 }
 
 static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
@@ -160,7 +161,6 @@ static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
 	struct super_block *sb = ptr;
 	struct net *net = sb->s_fs_info;
 	struct nfs_net *nn = net_generic(net, nfs_net_id);
-	struct dentry *dentry;
 	int ret = 0;
 
 	if (!try_module_get(THIS_MODULE))
@@ -173,12 +173,7 @@ static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
 
 	switch (event) {
 	case RPC_PIPEFS_MOUNT:
-		dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe);
-		if (IS_ERR(dentry)) {
-			ret = PTR_ERR(dentry);
-			break;
-		}
-		nn->bl_device_pipe->dentry = dentry;
+		ret = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe);
 		break;
 	case RPC_PIPEFS_UMOUNT:
 		rpc_unlink(nn->bl_device_pipe);
@@ -195,18 +190,17 @@ static struct notifier_block nfs4blocklayout_block = {
 	.notifier_call = rpc_pipefs_event,
 };
 
-static struct dentry *nfs4blocklayout_register_net(struct net *net,
-						   struct rpc_pipe *pipe)
+static int nfs4blocklayout_register_net(struct net *net, struct rpc_pipe *pipe)
 {
 	struct super_block *pipefs_sb;
-	struct dentry *dentry;
+	int ret;
 
 	pipefs_sb = rpc_get_sb_net(net);
 	if (!pipefs_sb)
-		return NULL;
-	dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe);
+		return 0;
+	ret = nfs4blocklayout_register_sb(pipefs_sb, pipe);
 	rpc_put_sb_net(net);
-	return dentry;
+	return ret;
 }
 
 static void nfs4blocklayout_unregister_net(struct net *net,
@@ -224,20 +218,17 @@ static void nfs4blocklayout_unregister_net(struct net *net,
 static int nfs4blocklayout_net_init(struct net *net)
 {
 	struct nfs_net *nn = net_generic(net, nfs_net_id);
-	struct dentry *dentry;
+	int err;
 
 	mutex_init(&nn->bl_mutex);
 	init_waitqueue_head(&nn->bl_wq);
 	nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
 	if (IS_ERR(nn->bl_device_pipe))
 		return PTR_ERR(nn->bl_device_pipe);
-	dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe);
-	if (IS_ERR(dentry)) {
+	err = nfs4blocklayout_register_net(net, nn->bl_device_pipe);
+	if (unlikely(err))
 		rpc_destroy_pipe_data(nn->bl_device_pipe);
-		return PTR_ERR(dentry);
-	}
-	nn->bl_device_pipe->dentry = dentry;
-	return 0;
+	return err;
 }
 
 static void nfs4blocklayout_net_exit(struct net *net)
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index adc03232b851..00932500fce4 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -432,14 +432,8 @@ static int nfs_idmap_pipe_create(struct dentry *dir,
 		struct rpc_pipe_dir_object *pdo)
 {
 	struct idmap *idmap = pdo->pdo_data;
-	struct rpc_pipe *pipe = idmap->idmap_pipe;
-	struct dentry *dentry;
 
-	dentry = rpc_mkpipe_dentry(dir, "idmap", idmap, pipe);
-	if (IS_ERR(dentry))
-		return PTR_ERR(dentry);
-	pipe->dentry = dentry;
-	return 0;
+	return rpc_mkpipe_dentry(dir, "idmap", idmap, idmap->idmap_pipe);
 }
 
 static const struct rpc_pipe_dir_object_ops nfs_idmap_pipe_dir_object_ops = {
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index bbd29b3b573f..2231192ec33f 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -950,31 +950,32 @@ static const struct rpc_pipe_ops cld_upcall_ops = {
 	.destroy_msg	= cld_pipe_destroy_msg,
 };
 
-static struct dentry *
+static int
 nfsd4_cld_register_sb(struct super_block *sb, struct rpc_pipe *pipe)
 {
-	struct dentry *dir, *dentry;
+	struct dentry *dir;
+	int err;
 
 	dir = rpc_d_lookup_sb(sb, NFSD_PIPE_DIR);
 	if (dir == NULL)
-		return ERR_PTR(-ENOENT);
-	dentry = rpc_mkpipe_dentry(dir, NFSD_CLD_PIPE, NULL, pipe);
+		return -ENOENT;
+	err = rpc_mkpipe_dentry(dir, NFSD_CLD_PIPE, NULL, pipe);
 	dput(dir);
-	return dentry;
+	return err;
 }
 
-static struct dentry *
+static int
 nfsd4_cld_register_net(struct net *net, struct rpc_pipe *pipe)
 {
 	struct super_block *sb;
-	struct dentry *dentry;
+	int err;
 
 	sb = rpc_get_sb_net(net);
 	if (!sb)
-		return NULL;
-	dentry = nfsd4_cld_register_sb(sb, pipe);
+		return 0;
+	err = nfsd4_cld_register_sb(sb, pipe);
 	rpc_put_sb_net(net);
-	return dentry;
+	return err;
 }
 
 static void
@@ -994,7 +995,6 @@ static int
 __nfsd4_init_cld_pipe(struct net *net)
 {
 	int ret;
-	struct dentry *dentry;
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 	struct cld_net *cn;
 
@@ -1015,13 +1015,10 @@ __nfsd4_init_cld_pipe(struct net *net)
 	spin_lock_init(&cn->cn_lock);
 	INIT_LIST_HEAD(&cn->cn_list);
 
-	dentry = nfsd4_cld_register_net(net, cn->cn_pipe);
-	if (IS_ERR(dentry)) {
-		ret = PTR_ERR(dentry);
+	ret = nfsd4_cld_register_net(net, cn->cn_pipe);
+	if (unlikely(ret))
 		goto err_destroy_data;
-	}
 
-	cn->cn_pipe->dentry = dentry;
 #ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING
 	cn->cn_has_legacy = false;
 #endif
@@ -2114,7 +2111,6 @@ rpc_pipefs_event(struct notifier_block *nb, unsigned long event, void *ptr)
 	struct net *net = sb->s_fs_info;
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 	struct cld_net *cn = nn->cld_net;
-	struct dentry *dentry;
 	int ret = 0;
 
 	if (!try_module_get(THIS_MODULE))
@@ -2127,12 +2123,7 @@ rpc_pipefs_event(struct notifier_block *nb, unsigned long event, void *ptr)
 
 	switch (event) {
 	case RPC_PIPEFS_MOUNT:
-		dentry = nfsd4_cld_register_sb(sb, cn->cn_pipe);
-		if (IS_ERR(dentry)) {
-			ret = PTR_ERR(dentry);
-			break;
-		}
-		cn->cn_pipe->dentry = dentry;
+		ret = nfsd4_cld_register_sb(sb, cn->cn_pipe);
 		break;
 	case RPC_PIPEFS_UMOUNT:
 		rpc_unlink(cn->cn_pipe);
diff --git a/include/linux/sunrpc/rpc_pipe_fs.h b/include/linux/sunrpc/rpc_pipe_fs.h
index a8c0a500d55c..8cc3a5df9801 100644
--- a/include/linux/sunrpc/rpc_pipe_fs.h
+++ b/include/linux/sunrpc/rpc_pipe_fs.h
@@ -127,7 +127,7 @@ extern void rpc_remove_cache_dir(struct dentry *);
 
 struct rpc_pipe *rpc_mkpipe_data(const struct rpc_pipe_ops *ops, int flags);
 void rpc_destroy_pipe_data(struct rpc_pipe *pipe);
-extern struct dentry *rpc_mkpipe_dentry(struct dentry *, const char *, void *,
+extern int rpc_mkpipe_dentry(struct dentry *, const char *, void *,
 					struct rpc_pipe *);
 extern void rpc_unlink(struct rpc_pipe *);
 extern int register_rpc_pipefs(void);
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index f2a44d589cfb..6c23d46a1dcc 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -895,13 +895,8 @@ static int gss_pipe_dentry_create(struct dentry *dir,
 		struct rpc_pipe_dir_object *pdo)
 {
 	struct gss_pipe *p = pdo->pdo_data;
-	struct dentry *dentry;
 
-	dentry = rpc_mkpipe_dentry(dir, p->name, p->clnt, p->pipe);
-	if (IS_ERR(dentry))
-		return PTR_ERR(dentry);
-	p->pipe->dentry = dentry;
-	return 0;
+	return rpc_mkpipe_dentry(dir, p->name, p->clnt, p->pipe);
 }
 
 static const struct rpc_pipe_dir_object_ops gss_pipe_dir_object_ops = {
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 2046582c4f35..dac1c35a642f 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -702,7 +702,7 @@ out_err:
  * The @private argument passed here will be available to all these methods
  * from the file pointer, via RPC_I(file_inode(file))->private.
  */
-struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name,
+int rpc_mkpipe_dentry(struct dentry *parent, const char *name,
 				 void *private, struct rpc_pipe *pipe)
 {
 	struct dentry *dentry;
@@ -717,21 +717,19 @@ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name,
 
 	inode_lock_nested(dir, I_MUTEX_PARENT);
 	dentry = __rpc_lookup_create_exclusive(parent, name);
-	if (IS_ERR(dentry))
-		goto out;
+	if (IS_ERR(dentry)) {
+		inode_unlock(dir);
+		return PTR_ERR(dentry);
+	}
 	err = __rpc_mkpipe_dentry(dir, dentry, umode, &rpc_pipe_fops,
 				  private, pipe);
-	if (err)
-		goto out_err;
-out:
+	if (unlikely(err))
+		pr_warn("%s() failed to create pipe %pd/%s (errno = %d)\n",
+			__func__, parent, name, err);
+	else
+		pipe->dentry = dentry;
 	inode_unlock(dir);
-	return dentry;
-out_err:
-	dentry = ERR_PTR(err);
-	printk(KERN_WARNING "%s: %s() failed to create pipe %pd/%s (errno = %d)\n",
-			__FILE__, __func__, parent, name,
-			err);
-	goto out;
+	return err;
 }
 EXPORT_SYMBOL_GPL(rpc_mkpipe_dentry);
 
@@ -1185,7 +1183,6 @@ rpc_gssd_dummy_populate(struct dentry *root, struct rpc_pipe *pipe_data)
 	int ret = 0;
 	struct dentry *gssd_dentry;
 	struct dentry *clnt_dentry = NULL;
-	struct dentry *pipe_dentry = NULL;
 
 	/* We should never get this far if "gssd" doesn't exist */
 	gssd_dentry = try_lookup_noperm(&QSTR(files[RPCAUTH_gssd].name), root);
@@ -1209,10 +1206,8 @@ rpc_gssd_dummy_populate(struct dentry *root, struct rpc_pipe *pipe_data)
 		dput(clnt_dentry);
 		return ret;
 	}
-	pipe_dentry = rpc_mkpipe_dentry(clnt_dentry, "gssd", NULL, pipe_data);
+	ret = rpc_mkpipe_dentry(clnt_dentry, "gssd", NULL, pipe_data);
 	dput(clnt_dentry);
-	if (IS_ERR(pipe_dentry))
-		ret = PTR_ERR(pipe_dentry);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 350db61fbeb940502a16e74153ee5954d03622e9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 23 Mar 2025 00:51:10 -0400
Subject: rpc_create_client_dir(): return 0 or -E...

Callers couldn't care less which dentry we got - anything
valid is treated as success.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/sunrpc/rpc_pipe_fs.h |  2 +-
 net/sunrpc/clnt.c                  | 36 ++++++++++++++----------------------
 net/sunrpc/rpc_pipe.c              | 12 ++++++------
 3 files changed, 21 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/rpc_pipe_fs.h b/include/linux/sunrpc/rpc_pipe_fs.h
index 8cc3a5df9801..2cb406f8ff4e 100644
--- a/include/linux/sunrpc/rpc_pipe_fs.h
+++ b/include/linux/sunrpc/rpc_pipe_fs.h
@@ -98,7 +98,7 @@ static inline bool rpc_msg_is_inflight(const struct rpc_pipe_msg *msg) {
 }
 
 struct rpc_clnt;
-extern struct dentry *rpc_create_client_dir(struct dentry *, const char *, struct rpc_clnt *);
+extern int rpc_create_client_dir(struct dentry *, const char *, struct rpc_clnt *);
 extern int rpc_remove_client_dir(struct rpc_clnt *);
 
 extern void rpc_init_pipe_dir_head(struct rpc_pipe_dir_head *pdh);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 21426c3049d3..8ca354ecfd02 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -112,47 +112,46 @@ static void rpc_clnt_remove_pipedir(struct rpc_clnt *clnt)
 	}
 }
 
-static struct dentry *rpc_setup_pipedir_sb(struct super_block *sb,
+static int rpc_setup_pipedir_sb(struct super_block *sb,
 				    struct rpc_clnt *clnt)
 {
 	static uint32_t clntid;
 	const char *dir_name = clnt->cl_program->pipe_dir_name;
 	char name[15];
-	struct dentry *dir, *dentry;
+	struct dentry *dir;
+	int err;
 
 	dir = rpc_d_lookup_sb(sb, dir_name);
 	if (dir == NULL) {
 		pr_info("RPC: pipefs directory doesn't exist: %s\n", dir_name);
-		return dir;
+		return -ENOENT;
 	}
 	for (;;) {
 		snprintf(name, sizeof(name), "clnt%x", (unsigned int)clntid++);
 		name[sizeof(name) - 1] = '\0';
-		dentry = rpc_create_client_dir(dir, name, clnt);
-		if (!IS_ERR(dentry))
+		err = rpc_create_client_dir(dir, name, clnt);
+		if (!err)
 			break;
-		if (dentry == ERR_PTR(-EEXIST))
+		if (err == -EEXIST)
 			continue;
 		printk(KERN_INFO "RPC: Couldn't create pipefs entry"
-				" %s/%s, error %ld\n",
-				dir_name, name, PTR_ERR(dentry));
+				" %s/%s, error %d\n",
+				dir_name, name, err);
 		break;
 	}
 	dput(dir);
-	return dentry;
+	return err;
 }
 
 static int
 rpc_setup_pipedir(struct super_block *pipefs_sb, struct rpc_clnt *clnt)
 {
-	struct dentry *dentry;
-
 	clnt->pipefs_sb = pipefs_sb;
 
 	if (clnt->cl_program->pipe_dir_name != NULL) {
-		dentry = rpc_setup_pipedir_sb(pipefs_sb, clnt);
-		if (IS_ERR(dentry))
-			return PTR_ERR(dentry);
+		int err = rpc_setup_pipedir_sb(pipefs_sb, clnt);
+		if (err && err != -ENOENT)
+			return err;
 	}
 	return 0;
 }
@@ -180,16 +179,9 @@ static int rpc_clnt_skip_event(struct rpc_clnt *clnt, unsigned long event)
 static int __rpc_clnt_handle_event(struct rpc_clnt *clnt, unsigned long event,
 				   struct super_block *sb)
 {
-	struct dentry *dentry;
-
 	switch (event) {
 	case RPC_PIPEFS_MOUNT:
-		dentry = rpc_setup_pipedir_sb(sb, clnt);
-		if (!dentry)
-			return -ENOENT;
-		if (IS_ERR(dentry))
-			return PTR_ERR(dentry);
-		break;
+		return rpc_setup_pipedir_sb(sb, clnt);
 	case RPC_PIPEFS_UMOUNT:
 		__rpc_clnt_remove_pipedir(clnt);
 		break;
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index e4b53530eb1b..a12ec709c445 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -863,27 +863,27 @@ rpc_destroy_pipe_dir_objects(struct rpc_pipe_dir_head *pdh)
  * information about the client, together with any "pipes" that may
  * later be created using rpc_mkpipe().
  */
-struct dentry *rpc_create_client_dir(struct dentry *dentry,
-				   const char *name,
-				   struct rpc_clnt *rpc_client)
+int rpc_create_client_dir(struct dentry *dentry,
+			   const char *name,
+			   struct rpc_clnt *rpc_client)
 {
 	struct dentry *ret;
 	int err;
 
 	ret = rpc_new_dir(dentry, name, 0555);
 	if (IS_ERR(ret))
-		return ret;
+		return PTR_ERR(ret);
 	err = rpc_new_file(ret, "info", S_IFREG | 0400,
 			      &rpc_info_operations, rpc_client);
 	if (err) {
 		pr_warn("%s failed to populate directory %pd\n",
 				__func__, ret);
 		simple_recursive_removal(ret, NULL);
-		return ERR_PTR(err);
+		return err;
 	}
 	rpc_client->cl_pipedir_objects.pdh_dentry = ret;
 	rpc_create_pipe_dir_objects(&rpc_client->cl_pipedir_objects);
-	return ret;
+	return 0;
 }
 
 /**
-- 
cgit v1.2.3


From aa89281bbc0b61610c96074c6390aed44474ebd0 Mon Sep 17 00:00:00 2001
From: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
Date: Thu, 12 Jun 2025 16:58:21 +0200
Subject: media: pisp_be: Use clamp() and define max sizes

Use the clamp() function from minmax.h and provide a define for the max
sizes as they will be used in subsequent patches.

Reviewed-by: Daniel Scally <dan.scally@ideasonboard.com>
Reviewed-by: Stefan Klug <stefan.klug@ideasonboard.com>
Signed-off-by: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil@xs4all.nl>
---
 drivers/media/platform/raspberrypi/pisp_be/pisp_be.c  | 11 +++++++----
 include/uapi/linux/media/raspberrypi/pisp_be_config.h |  9 +++++----
 2 files changed, 12 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/media/platform/raspberrypi/pisp_be/pisp_be.c b/drivers/media/platform/raspberrypi/pisp_be/pisp_be.c
index be794a123620..b30891718d8d 100644
--- a/drivers/media/platform/raspberrypi/pisp_be/pisp_be.c
+++ b/drivers/media/platform/raspberrypi/pisp_be/pisp_be.c
@@ -9,6 +9,7 @@
 #include <linux/io.h>
 #include <linux/kernel.h>
 #include <linux/lockdep.h>
+#include <linux/minmax.h>
 #include <linux/module.h>
 #include <linux/platform_device.h>
 #include <linux/pm_runtime.h>
@@ -1112,10 +1113,12 @@ static void pispbe_try_format(struct v4l2_format *f, struct pispbe_node *node)
 	f->fmt.pix_mp.pixelformat = fmt->fourcc;
 	f->fmt.pix_mp.num_planes = fmt->num_planes;
 	f->fmt.pix_mp.field = V4L2_FIELD_NONE;
-	f->fmt.pix_mp.width = max(min(f->fmt.pix_mp.width, 65536u),
-				  PISP_BACK_END_MIN_TILE_WIDTH);
-	f->fmt.pix_mp.height = max(min(f->fmt.pix_mp.height, 65536u),
-				   PISP_BACK_END_MIN_TILE_HEIGHT);
+	f->fmt.pix_mp.width = clamp(f->fmt.pix_mp.width,
+				    PISP_BACK_END_MIN_TILE_WIDTH,
+				    PISP_BACK_END_MAX_TILE_WIDTH);
+	f->fmt.pix_mp.height = clamp(f->fmt.pix_mp.height,
+				     PISP_BACK_END_MIN_TILE_HEIGHT,
+				     PISP_BACK_END_MAX_TILE_HEIGHT);
 
 	/*
 	 * Fill in the actual colour space when the requested one was
diff --git a/include/uapi/linux/media/raspberrypi/pisp_be_config.h b/include/uapi/linux/media/raspberrypi/pisp_be_config.h
index cbeb714f4d61..2ad3b90684d7 100644
--- a/include/uapi/linux/media/raspberrypi/pisp_be_config.h
+++ b/include/uapi/linux/media/raspberrypi/pisp_be_config.h
@@ -21,10 +21,11 @@
 /* preferred byte alignment for outputs */
 #define PISP_BACK_END_OUTPUT_MAX_ALIGN 64u
 
-/* minimum allowed tile width anywhere in the pipeline */
-#define PISP_BACK_END_MIN_TILE_WIDTH 16u
-/* minimum allowed tile width anywhere in the pipeline */
-#define PISP_BACK_END_MIN_TILE_HEIGHT 16u
+/* minimum allowed tile sizes anywhere in the pipeline */
+#define PISP_BACK_END_MIN_TILE_WIDTH	16u
+#define PISP_BACK_END_MIN_TILE_HEIGHT	16u
+#define PISP_BACK_END_MAX_TILE_WIDTH	65536u
+#define PISP_BACK_END_MAX_TILE_HEIGHT	65536u
 
 #define PISP_BACK_END_NUM_OUTPUTS 2
 #define PISP_BACK_END_HOG_OUTPUT 1
-- 
cgit v1.2.3


From 78584431e2cea6b60909cfa23c90ac8b33ab4198 Mon Sep 17 00:00:00 2001
From: Daniel Scally <dan.scally+renesas@ideasonboard.com>
Date: Mon, 30 Jun 2025 23:27:34 +0100
Subject: media: v4l2: Add Renesas Camera Receiver Unit pixel formats

The Renesas Camera Receiver Unit in the RZ/V2H SoC can output RAW
data captured from an image sensor without conversion to an RGB/YUV
format. In that case the data are packed into 64-bit blocks, with a
variable amount of padding in the most significant bits depending on
the bitdepth of the data. Add new V4L2 pixel format codes for the new
formats, along with documentation to describe them.

Reviewed-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Reviewed-by: Jacopo Mondi <jacopo.mondi@ideasonboard.com>
Signed-off-by: Daniel Scally <dan.scally+renesas@ideasonboard.com>
Link: https://lore.kernel.org/r/20250630222734.2712390-1-dan.scally@ideasonboard.com
Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil@xs4all.nl>
---
 .../userspace-api/media/v4l/pixfmt-bayer.rst       |   1 +
 .../userspace-api/media/v4l/pixfmt-rawnn-cru.rst   | 143 +++++++++++++++++++++
 drivers/media/v4l2-core/v4l2-common.c              |   6 +
 drivers/media/v4l2-core/v4l2-ioctl.c               |   4 +
 include/uapi/linux/videodev2.h                     |   6 +
 5 files changed, 160 insertions(+)
 create mode 100644 Documentation/userspace-api/media/v4l/pixfmt-rawnn-cru.rst

(limited to 'include')

diff --git a/Documentation/userspace-api/media/v4l/pixfmt-bayer.rst b/Documentation/userspace-api/media/v4l/pixfmt-bayer.rst
index ed3eb432967d..b5ca501842b0 100644
--- a/Documentation/userspace-api/media/v4l/pixfmt-bayer.rst
+++ b/Documentation/userspace-api/media/v4l/pixfmt-bayer.rst
@@ -19,6 +19,7 @@ orders. See also `the Wikipedia article on Bayer filter
 .. toctree::
     :maxdepth: 1
 
+    pixfmt-rawnn-cru
     pixfmt-srggb8
     pixfmt-srggb8-pisp-comp
     pixfmt-srggb10
diff --git a/Documentation/userspace-api/media/v4l/pixfmt-rawnn-cru.rst b/Documentation/userspace-api/media/v4l/pixfmt-rawnn-cru.rst
new file mode 100644
index 000000000000..db81f1cfe0f5
--- /dev/null
+++ b/Documentation/userspace-api/media/v4l/pixfmt-rawnn-cru.rst
@@ -0,0 +1,143 @@
+.. SPDX-License-Identifier: GFDL-1.1-no-invariants-or-later
+
+.. _v4l2-pix-fmt-raw-cru10:
+.. _v4l2-pix-fmt-raw-cru12:
+.. _v4l2-pix-fmt-raw-cru14:
+.. _v4l2-pix-fmt-raw-cru20:
+
+**********************************************************************************************************************************
+V4L2_PIX_FMT_RAW_CRU10 ('CR10'), V4L2_PIX_FMT_RAW_CRU12 ('CR12'), V4L2_PIX_FMT_RAW_CRU14 ('CR14'), V4L2_PIX_FMT_RAW_CRU20 ('CR20')
+**********************************************************************************************************************************
+
+===============================================================
+Renesas RZ/V2H Camera Receiver Unit 64-bit packed pixel formats
+===============================================================
+
+| V4L2_PIX_FMT_RAW_CRU10 (CR10)
+| V4L2_PIX_FMT_RAW_CRU12 (CR12)
+| V4L2_PIX_FMT_RAW_CRU14 (CR14)
+| V4L2_PIX_FMT_RAW_CRU20 (CR20)
+
+Description
+===========
+
+These pixel formats are some of the RAW outputs for the Camera Receiver Unit in
+the Renesas RZ/V2H SoC. They are raw formats which pack pixels contiguously into
+64-bit units, with the 4 or 8 most significant bits padded.
+
+**Byte Order**
+
+.. flat-table:: RAW formats
+    :header-rows:  2
+    :stub-columns: 0
+    :widths: 36 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
+    :fill-cells:
+
+    * - :rspan:`1` Pixel Format Code
+      - :cspan:`63` Data organization
+    * - 63
+      - 62
+      - 61
+      - 60
+      - 59
+      - 58
+      - 57
+      - 56
+      - 55
+      - 54
+      - 53
+      - 52
+      - 51
+      - 50
+      - 49
+      - 48
+      - 47
+      - 46
+      - 45
+      - 44
+      - 43
+      - 42
+      - 41
+      - 40
+      - 39
+      - 38
+      - 37
+      - 36
+      - 35
+      - 34
+      - 33
+      - 32
+      - 31
+      - 30
+      - 29
+      - 28
+      - 27
+      - 26
+      - 25
+      - 24
+      - 23
+      - 22
+      - 21
+      - 20
+      - 19
+      - 18
+      - 17
+      - 16
+      - 15
+      - 14
+      - 13
+      - 12
+      - 11
+      - 10
+      - 9
+      - 8
+      - 7
+      - 6
+      - 5
+      - 4
+      - 3
+      - 2
+      - 1
+      - 0
+    * - V4L2_PIX_FMT_RAW_CRU10
+      - 0
+      - 0
+      - 0
+      - 0
+      - :cspan:`9` P5
+      - :cspan:`9` P4
+      - :cspan:`9` P3
+      - :cspan:`9` P2
+      - :cspan:`9` P1
+      - :cspan:`9` P0
+    * - V4L2_PIX_FMT_RAW_CRU12
+      - 0
+      - 0
+      - 0
+      - 0
+      - :cspan:`11` P4
+      - :cspan:`11` P3
+      - :cspan:`11` P2
+      - :cspan:`11` P1
+      - :cspan:`11` P0
+    * - V4L2_PIX_FMT_RAW_CRU14
+      - 0
+      - 0
+      - 0
+      - 0
+      - 0
+      - 0
+      - 0
+      - 0
+      - :cspan:`13` P3
+      - :cspan:`13` P2
+      - :cspan:`13` P1
+      - :cspan:`13` P0
+    * - V4L2_PIX_FMT_RAW_CRU20
+      - 0
+      - 0
+      - 0
+      - 0
+      - :cspan:`19` P2
+      - :cspan:`19` P1
+      - :cspan:`19` P0
diff --git a/drivers/media/v4l2-core/v4l2-common.c b/drivers/media/v4l2-core/v4l2-common.c
index e1fc8fe43b74..df40756ac8f6 100644
--- a/drivers/media/v4l2-core/v4l2-common.c
+++ b/drivers/media/v4l2-core/v4l2-common.c
@@ -344,6 +344,12 @@ const struct v4l2_format_info *v4l2_format_info(u32 format)
 		{ .format = V4L2_PIX_FMT_SGBRG12,	.pixel_enc = V4L2_PIXEL_ENC_BAYER, .mem_planes = 1, .comp_planes = 1, .bpp = { 2, 0, 0, 0 }, .bpp_div = { 1, 1, 1, 1 }, .hdiv = 1, .vdiv = 1 },
 		{ .format = V4L2_PIX_FMT_SGRBG12,	.pixel_enc = V4L2_PIXEL_ENC_BAYER, .mem_planes = 1, .comp_planes = 1, .bpp = { 2, 0, 0, 0 }, .bpp_div = { 1, 1, 1, 1 }, .hdiv = 1, .vdiv = 1 },
 		{ .format = V4L2_PIX_FMT_SRGGB12,	.pixel_enc = V4L2_PIXEL_ENC_BAYER, .mem_planes = 1, .comp_planes = 1, .bpp = { 2, 0, 0, 0 }, .bpp_div = { 1, 1, 1, 1 }, .hdiv = 1, .vdiv = 1 },
+
+		/* Renesas Camera Data Receiver Unit formats, bayer order agnostic */
+		{ .format = V4L2_PIX_FMT_RAW_CRU10,	.pixel_enc = V4L2_PIXEL_ENC_BAYER, .mem_planes = 1, .comp_planes = 1, .bpp = { 8, 0, 0, 0 }, .bpp_div = { 6, 1, 1, 1 }, .hdiv = 1, .vdiv = 1 },
+		{ .format = V4L2_PIX_FMT_RAW_CRU12,	.pixel_enc = V4L2_PIXEL_ENC_BAYER, .mem_planes = 1, .comp_planes = 1, .bpp = { 8, 0, 0, 0 }, .bpp_div = { 5, 1, 1, 1 }, .hdiv = 1, .vdiv = 1 },
+		{ .format = V4L2_PIX_FMT_RAW_CRU14,	.pixel_enc = V4L2_PIXEL_ENC_BAYER, .mem_planes = 1, .comp_planes = 1, .bpp = { 8, 0, 0, 0 }, .bpp_div = { 4, 1, 1, 1 }, .hdiv = 1, .vdiv = 1 },
+		{ .format = V4L2_PIX_FMT_RAW_CRU20,	.pixel_enc = V4L2_PIXEL_ENC_BAYER, .mem_planes = 1, .comp_planes = 1, .bpp = { 8, 0, 0, 0 }, .bpp_div = { 3, 1, 1, 1 }, .hdiv = 1, .vdiv = 1 },
 	};
 	unsigned int i;
 
diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index 650dc1956f73..be94a79b976e 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -1413,6 +1413,7 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt)
 	case V4L2_PIX_FMT_SGBRG10DPCM8:	descr = "8-bit Bayer GBGB/RGRG (DPCM)"; break;
 	case V4L2_PIX_FMT_SGRBG10DPCM8:	descr = "8-bit Bayer GRGR/BGBG (DPCM)"; break;
 	case V4L2_PIX_FMT_SRGGB10DPCM8:	descr = "8-bit Bayer RGRG/GBGB (DPCM)"; break;
+	case V4L2_PIX_FMT_RAW_CRU10:	descr = "10-bit Raw CRU Packed"; break;
 	case V4L2_PIX_FMT_SBGGR12:	descr = "12-bit Bayer BGBG/GRGR"; break;
 	case V4L2_PIX_FMT_SGBRG12:	descr = "12-bit Bayer GBGB/RGRG"; break;
 	case V4L2_PIX_FMT_SGRBG12:	descr = "12-bit Bayer GRGR/BGBG"; break;
@@ -1421,6 +1422,7 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt)
 	case V4L2_PIX_FMT_SGBRG12P:	descr = "12-bit Bayer GBGB/RGRG Packed"; break;
 	case V4L2_PIX_FMT_SGRBG12P:	descr = "12-bit Bayer GRGR/BGBG Packed"; break;
 	case V4L2_PIX_FMT_SRGGB12P:	descr = "12-bit Bayer RGRG/GBGB Packed"; break;
+	case V4L2_PIX_FMT_RAW_CRU12:	descr = "12-bit Raw CRU Packed"; break;
 	case V4L2_PIX_FMT_SBGGR14:	descr = "14-bit Bayer BGBG/GRGR"; break;
 	case V4L2_PIX_FMT_SGBRG14:	descr = "14-bit Bayer GBGB/RGRG"; break;
 	case V4L2_PIX_FMT_SGRBG14:	descr = "14-bit Bayer GRGR/BGBG"; break;
@@ -1429,10 +1431,12 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt)
 	case V4L2_PIX_FMT_SGBRG14P:	descr = "14-bit Bayer GBGB/RGRG Packed"; break;
 	case V4L2_PIX_FMT_SGRBG14P:	descr = "14-bit Bayer GRGR/BGBG Packed"; break;
 	case V4L2_PIX_FMT_SRGGB14P:	descr = "14-bit Bayer RGRG/GBGB Packed"; break;
+	case V4L2_PIX_FMT_RAW_CRU14:	descr = "14-bit Raw CRU Packed"; break;
 	case V4L2_PIX_FMT_SBGGR16:	descr = "16-bit Bayer BGBG/GRGR"; break;
 	case V4L2_PIX_FMT_SGBRG16:	descr = "16-bit Bayer GBGB/RGRG"; break;
 	case V4L2_PIX_FMT_SGRBG16:	descr = "16-bit Bayer GRGR/BGBG"; break;
 	case V4L2_PIX_FMT_SRGGB16:	descr = "16-bit Bayer RGRG/GBGB"; break;
+	case V4L2_PIX_FMT_RAW_CRU20:	descr = "20-bit Raw CRU Packed"; break;
 	case V4L2_PIX_FMT_SN9C20X_I420:	descr = "GSPCA SN9C20X I420"; break;
 	case V4L2_PIX_FMT_SPCA501:	descr = "GSPCA SPCA501"; break;
 	case V4L2_PIX_FMT_SPCA505:	descr = "GSPCA SPCA505"; break;
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index 9e3b366d5fc7..6f7bd38dd5aa 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -840,6 +840,12 @@ struct v4l2_pix_format {
 #define V4L2_PIX_FMT_PISP_COMP2_BGGR	v4l2_fourcc('P', 'C', '2', 'B') /* PiSP 8-bit mode 2 compressed BGGR bayer */
 #define V4L2_PIX_FMT_PISP_COMP2_MONO	v4l2_fourcc('P', 'C', '2', 'M') /* PiSP 8-bit mode 2 compressed monochrome */
 
+/* Renesas RZ/V2H CRU packed formats. 64-bit units with contiguous pixels */
+#define V4L2_PIX_FMT_RAW_CRU10	v4l2_fourcc('C', 'R', '1', '0')
+#define V4L2_PIX_FMT_RAW_CRU12	v4l2_fourcc('C', 'R', '1', '2')
+#define V4L2_PIX_FMT_RAW_CRU14	v4l2_fourcc('C', 'R', '1', '4')
+#define V4L2_PIX_FMT_RAW_CRU20	v4l2_fourcc('C', 'R', '2', '0')
+
 /* SDR formats - used only for Software Defined Radio devices */
 #define V4L2_SDR_FMT_CU8          v4l2_fourcc('C', 'U', '0', '8') /* IQ u8 */
 #define V4L2_SDR_FMT_CU16LE       v4l2_fourcc('C', 'U', '1', '6') /* IQ u16le */
-- 
cgit v1.2.3


From 8a67d4b49bbdebcd255abde9e652092c3de3b657 Mon Sep 17 00:00:00 2001
From: "David E. Box" <david.e.box@linux.intel.com>
Date: Wed, 2 Jul 2025 19:28:19 -0700
Subject: platform/x86/intel/vsec: Add device links to enforce dependencies
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

New Intel VSEC features will have dependencies on other features, requiring
certain supplier drivers to be probed before their consumers. To enforce
this dependency ordering, introduce device links using device_link_add(),
ensuring that suppliers are fully registered before consumers are probed.

- Add device link tracking by storing supplier devices and tracking their
  state.
- Implement intel_vsec_link_devices() to establish links between suppliers
  and consumers based on feature dependencies.
- Add get_consumer_dependencies() to retrieve supplier-consumer
  relationships.
- Modify feature registration logic:
  * Consumers now check that all required suppliers are registered before
    being initialized.
  * suppliers_ready() verifies that all required supplier devices are
    available.
- Prevent potential null consumer name issue in sysfs:
  - Use dev_set_name() when creating auxiliary devices to ensure a
    unique, non-null consumer name.
- Update intel_vsec_pci_probe() to loop up to the number of possible
  features or until all devices are registered, whichever comes first.
- Introduce VSEC_CAP_UNUSED to prevent sub-features (registered via
  exported APIs) from being mistakenly linked.

Signed-off-by: David E. Box <david.e.box@linux.intel.com>
Link: https://lore.kernel.org/r/20250703022832.1302928-5-david.e.box@linux.intel.com
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 drivers/platform/x86/intel/vsec.c | 223 ++++++++++++++++++++++++++++++++++++--
 include/linux/intel_vsec.h        |  28 ++++-
 2 files changed, 236 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/drivers/platform/x86/intel/vsec.c b/drivers/platform/x86/intel/vsec.c
index 8bdb74d86f24..aa1f7e63039d 100644
--- a/drivers/platform/x86/intel/vsec.c
+++ b/drivers/platform/x86/intel/vsec.c
@@ -15,9 +15,12 @@
 
 #include <linux/auxiliary_bus.h>
 #include <linux/bits.h>
+#include <linux/bitops.h>
+#include <linux/bug.h>
 #include <linux/cleanup.h>
 #include <linux/delay.h>
 #include <linux/idr.h>
+#include <linux/log2.h>
 #include <linux/intel_vsec.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
@@ -32,8 +35,17 @@ static DEFINE_IDA(intel_vsec_ida);
 static DEFINE_IDA(intel_vsec_sdsi_ida);
 static DEFINE_XARRAY_ALLOC(auxdev_array);
 
+enum vsec_device_state {
+	STATE_NOT_FOUND,
+	STATE_REGISTERED,
+	STATE_SKIP,
+};
+
 struct vsec_priv {
 	struct intel_vsec_platform_info *info;
+	struct device *suppliers[VSEC_FEATURE_COUNT];
+	enum vsec_device_state state[VSEC_FEATURE_COUNT];
+	unsigned long found_caps;
 };
 
 static const char *intel_vsec_name(enum intel_vsec_id id)
@@ -95,6 +107,74 @@ static void intel_vsec_dev_release(struct device *dev)
 	kfree(intel_vsec_dev);
 }
 
+static const struct vsec_feature_dependency *
+get_consumer_dependencies(struct vsec_priv *priv, int cap_id)
+{
+	const struct vsec_feature_dependency *deps = priv->info->deps;
+	int consumer_id = priv->info->num_deps;
+
+	if (!deps)
+		return NULL;
+
+	while (consumer_id--)
+		if (deps[consumer_id].feature == BIT(cap_id))
+			return &deps[consumer_id];
+
+	return NULL;
+}
+
+/*
+ * Although the pci_device_id table is available via the pdev, this forward
+ * declaration is necessary because the code using it can be called by an
+ * exported API that might pass a different pdev.
+ */
+static const struct pci_device_id intel_vsec_pci_ids[];
+
+static int intel_vsec_link_devices(struct pci_dev *pdev, struct device *dev,
+				   int consumer_id)
+{
+	const struct vsec_feature_dependency *deps;
+	enum vsec_device_state *state;
+	struct device **suppliers;
+	struct vsec_priv *priv;
+	int supplier_id;
+
+	if (!consumer_id)
+		return 0;
+
+	if (!pci_match_id(intel_vsec_pci_ids, pdev))
+		return 0;
+
+	priv = pci_get_drvdata(pdev);
+	state = priv->state;
+	suppliers = priv->suppliers;
+
+	priv->suppliers[consumer_id] = dev;
+
+	deps = get_consumer_dependencies(priv, consumer_id);
+	if (!deps)
+		return 0;
+
+	for_each_set_bit(supplier_id, &deps->supplier_bitmap, VSEC_FEATURE_COUNT) {
+		struct device_link *link;
+
+		if (state[supplier_id] != STATE_REGISTERED)
+			continue;
+
+		if (!suppliers[supplier_id]) {
+			dev_err(dev, "Bad supplier list\n");
+			return -EINVAL;
+		}
+
+		link = device_link_add(dev, suppliers[supplier_id],
+				       DL_FLAG_AUTOPROBE_CONSUMER);
+		if (!link)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
 int intel_vsec_add_aux(struct pci_dev *pdev, struct device *parent,
 		       struct intel_vsec_device *intel_vsec_dev,
 		       const char *name)
@@ -132,19 +212,37 @@ int intel_vsec_add_aux(struct pci_dev *pdev, struct device *parent,
 		return ret;
 	}
 
+	/*
+	 * Assign a name now to ensure that the device link doesn't contain
+	 * a null string for the consumer name. This is a problem when a supplier
+	 * supplies more than one consumer and can lead to a duplicate name error
+	 * when the link is created in sysfs.
+	 */
+	ret = dev_set_name(&auxdev->dev, "%s.%s.%d", KBUILD_MODNAME, auxdev->name,
+			   auxdev->id);
+	if (ret)
+		goto cleanup_aux;
+
+	ret = intel_vsec_link_devices(pdev, &auxdev->dev, intel_vsec_dev->cap_id);
+	if (ret)
+		goto cleanup_aux;
+
 	ret = auxiliary_device_add(auxdev);
-	if (ret < 0) {
-		auxiliary_device_uninit(auxdev);
-		return ret;
-	}
+	if (ret)
+		goto cleanup_aux;
 
 	return devm_add_action_or_reset(parent, intel_vsec_remove_aux,
 				       auxdev);
+
+cleanup_aux:
+	auxiliary_device_uninit(auxdev);
+	return ret;
 }
 EXPORT_SYMBOL_NS_GPL(intel_vsec_add_aux, "INTEL_VSEC");
 
 static int intel_vsec_add_dev(struct pci_dev *pdev, struct intel_vsec_header *header,
-			      struct intel_vsec_platform_info *info)
+			      struct intel_vsec_platform_info *info,
+			      unsigned long cap_id)
 {
 	struct intel_vsec_device __free(kfree) *intel_vsec_dev = NULL;
 	struct resource __free(kfree) *res = NULL;
@@ -211,6 +309,7 @@ static int intel_vsec_add_dev(struct pci_dev *pdev, struct intel_vsec_header *he
 	intel_vsec_dev->quirks = info->quirks;
 	intel_vsec_dev->base_addr = info->base_addr;
 	intel_vsec_dev->priv_data = info->priv_data;
+	intel_vsec_dev->cap_id = cap_id;
 
 	if (header->id == VSEC_ID_SDSI)
 		intel_vsec_dev->ida = &intel_vsec_sdsi_ida;
@@ -225,6 +324,101 @@ static int intel_vsec_add_dev(struct pci_dev *pdev, struct intel_vsec_header *he
 				  intel_vsec_name(header->id));
 }
 
+static bool suppliers_ready(struct vsec_priv *priv,
+			    const struct vsec_feature_dependency *consumer_deps,
+			    int cap_id)
+{
+	enum vsec_device_state *state = priv->state;
+	int supplier_id;
+
+	if (WARN_ON_ONCE(consumer_deps->feature != BIT(cap_id)))
+		return false;
+
+	/*
+	 * Verify that all required suppliers have been found. Return false
+	 * immediately if any are still missing.
+	 */
+	for_each_set_bit(supplier_id, &consumer_deps->supplier_bitmap, VSEC_FEATURE_COUNT) {
+		if (state[supplier_id] == STATE_SKIP)
+			continue;
+
+		if (state[supplier_id] == STATE_NOT_FOUND)
+			return false;
+	}
+
+	/*
+	 * All suppliers have been found and the consumer is ready to be
+	 * registered.
+	 */
+	return true;
+}
+
+static int get_cap_id(u32 header_id, unsigned long *cap_id)
+{
+	switch (header_id) {
+	case VSEC_ID_TELEMETRY:
+		*cap_id = ilog2(VSEC_CAP_TELEMETRY);
+		break;
+	case VSEC_ID_WATCHER:
+		*cap_id = ilog2(VSEC_CAP_WATCHER);
+		break;
+	case VSEC_ID_CRASHLOG:
+		*cap_id = ilog2(VSEC_CAP_CRASHLOG);
+		break;
+	case VSEC_ID_SDSI:
+		*cap_id = ilog2(VSEC_CAP_SDSI);
+		break;
+	case VSEC_ID_TPMI:
+		*cap_id = ilog2(VSEC_CAP_TPMI);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int intel_vsec_register_device(struct pci_dev *pdev,
+				      struct intel_vsec_header *header,
+				      struct intel_vsec_platform_info *info)
+{
+	const struct vsec_feature_dependency *consumer_deps;
+	struct vsec_priv *priv;
+	unsigned long cap_id;
+	int ret;
+
+	ret = get_cap_id(header->id, &cap_id);
+	if (ret)
+		return ret;
+
+	/*
+	 * Only track dependencies for devices probed by the VSEC driver.
+	 * For others using the exported APIs, add the device directly.
+	 */
+	if (!pci_match_id(intel_vsec_pci_ids, pdev))
+		return intel_vsec_add_dev(pdev, header, info, cap_id);
+
+	priv = pci_get_drvdata(pdev);
+	if (priv->state[cap_id] == STATE_REGISTERED ||
+	    priv->state[cap_id] == STATE_SKIP)
+		return -EEXIST;
+
+	priv->found_caps |= BIT(cap_id);
+
+	consumer_deps = get_consumer_dependencies(priv, cap_id);
+	if (!consumer_deps || suppliers_ready(priv, consumer_deps, cap_id)) {
+		ret = intel_vsec_add_dev(pdev, header, info, cap_id);
+		if (ret)
+			priv->state[cap_id] = STATE_SKIP;
+		else
+			priv->state[cap_id] = STATE_REGISTERED;
+
+		return ret;
+	}
+
+	return -EAGAIN;
+}
+
 static bool intel_vsec_walk_header(struct pci_dev *pdev,
 				   struct intel_vsec_platform_info *info)
 {
@@ -233,7 +427,7 @@ static bool intel_vsec_walk_header(struct pci_dev *pdev,
 	int ret;
 
 	for ( ; *header; header++) {
-		ret = intel_vsec_add_dev(pdev, *header, info);
+		ret = intel_vsec_register_device(pdev, *header, info);
 		if (!ret)
 			have_devices = true;
 	}
@@ -281,7 +475,7 @@ static bool intel_vsec_walk_dvsec(struct pci_dev *pdev,
 		pci_read_config_dword(pdev, pos + PCI_DVSEC_HEADER2, &hdr);
 		header.id = PCI_DVSEC_HEADER2_ID(hdr);
 
-		ret = intel_vsec_add_dev(pdev, &header, info);
+		ret = intel_vsec_register_device(pdev, &header, info);
 		if (ret)
 			continue;
 
@@ -326,7 +520,7 @@ static bool intel_vsec_walk_vsec(struct pci_dev *pdev,
 		header.tbir = INTEL_DVSEC_TABLE_BAR(table);
 		header.offset = INTEL_DVSEC_TABLE_OFFSET(table);
 
-		ret = intel_vsec_add_dev(pdev, &header, info);
+		ret = intel_vsec_register_device(pdev, &header, info);
 		if (ret)
 			continue;
 
@@ -378,7 +572,8 @@ static int intel_vsec_pci_probe(struct pci_dev *pdev, const struct pci_device_id
 {
 	struct intel_vsec_platform_info *info;
 	struct vsec_priv *priv;
-	int ret;
+	int num_caps, ret;
+	bool found_any = false;
 
 	ret = pcim_enable_device(pdev);
 	if (ret)
@@ -396,7 +591,15 @@ static int intel_vsec_pci_probe(struct pci_dev *pdev, const struct pci_device_id
 	priv->info = info;
 	pci_set_drvdata(pdev, priv);
 
-	if (!intel_vsec_get_features(pdev, info))
+	num_caps = hweight_long(info->caps);
+	while (num_caps--) {
+		found_any |= intel_vsec_get_features(pdev, info);
+
+		if (priv->found_caps == info->caps)
+			break;
+	}
+
+	if (!found_any)
 		return -ENODEV;
 
 	return 0;
diff --git a/include/linux/intel_vsec.h b/include/linux/intel_vsec.h
index bc95821f1bfb..71067afaca99 100644
--- a/include/linux/intel_vsec.h
+++ b/include/linux/intel_vsec.h
@@ -5,11 +5,18 @@
 #include <linux/auxiliary_bus.h>
 #include <linux/bits.h>
 
-#define VSEC_CAP_TELEMETRY	BIT(0)
-#define VSEC_CAP_WATCHER	BIT(1)
-#define VSEC_CAP_CRASHLOG	BIT(2)
-#define VSEC_CAP_SDSI		BIT(3)
-#define VSEC_CAP_TPMI		BIT(4)
+/*
+ * VSEC_CAP_UNUSED is reserved. It exists to prevent zero-initialized
+ * intel_vsec devices from being automatically set to a known
+ * capability with ID 0.
+ */
+#define VSEC_CAP_UNUSED		BIT(0)
+#define VSEC_CAP_TELEMETRY	BIT(1)
+#define VSEC_CAP_WATCHER	BIT(2)
+#define VSEC_CAP_CRASHLOG	BIT(3)
+#define VSEC_CAP_SDSI		BIT(4)
+#define VSEC_CAP_TPMI		BIT(5)
+#define VSEC_FEATURE_COUNT	6
 
 /* Intel DVSEC offsets */
 #define INTEL_DVSEC_ENTRIES		0xA
@@ -81,22 +88,31 @@ struct pmt_callbacks {
 	int (*read_telem)(struct pci_dev *pdev, u32 guid, u64 *data, loff_t off, u32 count);
 };
 
+struct vsec_feature_dependency {
+	unsigned long feature;
+	unsigned long supplier_bitmap;
+};
+
 /**
  * struct intel_vsec_platform_info - Platform specific data
  * @parent:    parent device in the auxbus chain
  * @headers:   list of headers to define the PMT client devices to create
+ * @deps:      array of feature dependencies
  * @priv_data: private data, usable by parent devices, currently a callback
  * @caps:      bitmask of PMT capabilities for the given headers
  * @quirks:    bitmask of VSEC device quirks
  * @base_addr: allow a base address to be specified (rather than derived)
+ * @num_deps:  number of entries in @deps
  */
 struct intel_vsec_platform_info {
 	struct device *parent;
 	struct intel_vsec_header **headers;
+	const struct vsec_feature_dependency *deps;
 	void *priv_data;
 	unsigned long caps;
 	unsigned long quirks;
 	u64 base_addr;
+	int num_deps;
 };
 
 /**
@@ -110,6 +126,7 @@ struct intel_vsec_platform_info {
  * @priv_data:     any private data needed
  * @quirks:        specified quirks
  * @base_addr:     base address of entries (if specified)
+ * @cap_id:        the enumerated id of the vsec feature
  */
 struct intel_vsec_device {
 	struct auxiliary_device auxdev;
@@ -122,6 +139,7 @@ struct intel_vsec_device {
 	size_t priv_data_size;
 	unsigned long quirks;
 	u64 base_addr;
+	unsigned long cap_id;
 };
 
 int intel_vsec_add_aux(struct pci_dev *pdev, struct device *parent,
-- 
cgit v1.2.3


From 10f32796e86c04f73b7f8580cc9483765ed19f49 Mon Sep 17 00:00:00 2001
From: "David E. Box" <david.e.box@linux.intel.com>
Date: Wed, 2 Jul 2025 19:28:22 -0700
Subject: platform/x86/intel/vsec: Add new Discovery feature
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add the PCIe VSEC ID for new Intel Platform Monitoring Technology
Capability Discovery feature. Discovery provides detailed information for
the various Intel VSEC features. Also make the driver a supplier for
TPMI and Telemetry drivers which will use the information.

Signed-off-by: David E. Box <david.e.box@linux.intel.com>
Link: https://lore.kernel.org/r/20250703022832.1302928-8-david.e.box@linux.intel.com
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 drivers/platform/x86/intel/vsec.c | 26 ++++++++++++++++++++++++--
 include/linux/intel_vsec.h        |  4 +++-
 2 files changed, 27 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/platform/x86/intel/vsec.c b/drivers/platform/x86/intel/vsec.c
index 30e558af6888..4d76f1ac3c8c 100644
--- a/drivers/platform/x86/intel/vsec.c
+++ b/drivers/platform/x86/intel/vsec.c
@@ -66,6 +66,9 @@ static const char *intel_vsec_name(enum intel_vsec_id id)
 	case VSEC_ID_TPMI:
 		return "tpmi";
 
+	case VSEC_ID_DISCOVERY:
+		return "discovery";
+
 	default:
 		return NULL;
 	}
@@ -84,6 +87,8 @@ static bool intel_vsec_supported(u16 id, unsigned long caps)
 		return !!(caps & VSEC_CAP_SDSI);
 	case VSEC_ID_TPMI:
 		return !!(caps & VSEC_CAP_TPMI);
+	case VSEC_ID_DISCOVERY:
+		return !!(caps & VSEC_CAP_DISCOVERY);
 	default:
 		return false;
 	}
@@ -138,6 +143,8 @@ static bool vsec_driver_present(int cap_id)
 		return IS_ENABLED(CONFIG_INTEL_SDSI);
 	case VSEC_CAP_TPMI:
 		return IS_ENABLED(CONFIG_INTEL_TPMI);
+	case VSEC_CAP_DISCOVERY:
+		return IS_ENABLED(CONFIG_INTEL_PMT_DISCOVERY);
 	default:
 		return false;
 	}
@@ -392,6 +399,9 @@ static int get_cap_id(u32 header_id, unsigned long *cap_id)
 	case VSEC_ID_TPMI:
 		*cap_id = ilog2(VSEC_CAP_TPMI);
 		break;
+	case VSEC_ID_DISCOVERY:
+		*cap_id = ilog2(VSEC_CAP_DISCOVERY);
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -681,14 +691,26 @@ static const struct intel_vsec_platform_info mtl_info = {
 	.caps = VSEC_CAP_TELEMETRY,
 };
 
+static const struct vsec_feature_dependency oobmsm_deps[] = {
+	{
+		.feature = VSEC_CAP_TELEMETRY,
+		.supplier_bitmap = VSEC_CAP_DISCOVERY,
+	},
+};
+
 /* OOBMSM info */
 static const struct intel_vsec_platform_info oobmsm_info = {
-	.caps = VSEC_CAP_TELEMETRY | VSEC_CAP_SDSI | VSEC_CAP_TPMI,
+	.caps = VSEC_CAP_TELEMETRY | VSEC_CAP_SDSI | VSEC_CAP_TPMI |
+		VSEC_CAP_DISCOVERY,
+	.deps = oobmsm_deps,
+	.num_deps = ARRAY_SIZE(oobmsm_deps),
 };
 
 /* DMR OOBMSM info */
 static const struct intel_vsec_platform_info dmr_oobmsm_info = {
-	.caps = VSEC_CAP_TELEMETRY | VSEC_CAP_TPMI,
+	.caps = VSEC_CAP_TELEMETRY | VSEC_CAP_TPMI | VSEC_CAP_DISCOVERY,
+	.deps = oobmsm_deps,
+	.num_deps = ARRAY_SIZE(oobmsm_deps),
 };
 
 /* TGL info */
diff --git a/include/linux/intel_vsec.h b/include/linux/intel_vsec.h
index 71067afaca99..a07796d7d43b 100644
--- a/include/linux/intel_vsec.h
+++ b/include/linux/intel_vsec.h
@@ -16,7 +16,8 @@
 #define VSEC_CAP_CRASHLOG	BIT(3)
 #define VSEC_CAP_SDSI		BIT(4)
 #define VSEC_CAP_TPMI		BIT(5)
-#define VSEC_FEATURE_COUNT	6
+#define VSEC_CAP_DISCOVERY	BIT(6)
+#define VSEC_FEATURE_COUNT	7
 
 /* Intel DVSEC offsets */
 #define INTEL_DVSEC_ENTRIES		0xA
@@ -33,6 +34,7 @@ enum intel_vsec_id {
 	VSEC_ID_TELEMETRY	= 2,
 	VSEC_ID_WATCHER		= 3,
 	VSEC_ID_CRASHLOG	= 4,
+	VSEC_ID_DISCOVERY	= 12,
 	VSEC_ID_SDSI		= 65,
 	VSEC_ID_TPMI		= 66,
 };
-- 
cgit v1.2.3


From d9a0788093565c300f7c8dd034dbfa6ac4da9aa6 Mon Sep 17 00:00:00 2001
From: "David E. Box" <david.e.box@linux.intel.com>
Date: Wed, 2 Jul 2025 19:28:23 -0700
Subject: platform/x86/intel/pmt: Add PMT Discovery driver
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch introduces a new driver to enumerate and expose Intel Platform
Monitoring Technology (PMT) capabilities via a simple discovery mechanism.
The PMT Discovery driver parses hardware-provided discovery tables from
Intel Out of Band Management Services Modules (OOBMSM) and extracts feature
information for various providers (such as TPMI, Telemetry, Crash Log,
etc). This unified interface simplifies the process of determining which
manageability and telemetry features are supported by a given platform.

This new feature is described in the Intel Platform Monitoring Technology
3.0 specification, section 6.6 Capability.

Key changes and additions:

New file drivers/platform/x86/intel/pmt/discovery.c:
  – Implements the discovery logic to map the discovery resource, read
    the feature discovery table, and validate feature parameters.
New file drivers/platform/x86/intel/pmt/features.c:
  – Defines feature names, layouts, and associated capability masks.
  – Provides a mapping between raw hardware attributes and sysfs
    representations for easier integration with user-space tools.
New header include/linux/intel_pmt_features.h:
  – Declares constants, masks, and feature identifiers used across the
    PMT framework.
Sysfs integration:
  – Feature attributes are exposed under /sys/class/intel_pmt.
  – Each device is represented by a subfolder within the intel_pmt class,
    named using its DBDF (Domain:Bus:Device.Function), e.g.:
        features-0000:00:03.1
  – Example directory layout for a device:

    /sys/class/intel_pmt/features-0000:00:03.1/
    ├── accelerator_telemetry
    ├── crash_log
    ├── per_core_environment_telemetry
    ├── per_core_performance_telemetry
    ├── per_rmid_energy_telemetry
    ├── per_rmid_perf_telemetry
    ├── tpmi_control
    ├── tracing
    └── uncore_telemetry

By exposing PMT feature details through sysfs and integrating with the
existing PMT class, this driver paves the way for more streamlined
integration of PMT-based manageability and telemetry tools.

Link: https://www.intel.com/content/www/us/en/content-details/710389/intel-platform-monitoring-technology-intel-pmt-external-specification.html
Signed-off-by: David E. Box <david.e.box@linux.intel.com>
Link: https://lore.kernel.org/r/20250703022832.1302928-9-david.e.box@linux.intel.com
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 drivers/platform/x86/intel/pmt/Kconfig     |  12 +
 drivers/platform/x86/intel/pmt/Makefile    |   2 +
 drivers/platform/x86/intel/pmt/class.c     |  35 +-
 drivers/platform/x86/intel/pmt/class.h     |   2 +
 drivers/platform/x86/intel/pmt/discovery.c | 602 +++++++++++++++++++++++++++++
 drivers/platform/x86/intel/pmt/features.c  | 205 ++++++++++
 include/linux/intel_pmt_features.h         | 157 ++++++++
 7 files changed, 1013 insertions(+), 2 deletions(-)
 create mode 100644 drivers/platform/x86/intel/pmt/discovery.c
 create mode 100644 drivers/platform/x86/intel/pmt/features.c
 create mode 100644 include/linux/intel_pmt_features.h

(limited to 'include')

diff --git a/drivers/platform/x86/intel/pmt/Kconfig b/drivers/platform/x86/intel/pmt/Kconfig
index e916fc966221..0ad91b5112e9 100644
--- a/drivers/platform/x86/intel/pmt/Kconfig
+++ b/drivers/platform/x86/intel/pmt/Kconfig
@@ -38,3 +38,15 @@ config INTEL_PMT_CRASHLOG
 
 	  To compile this driver as a module, choose M here: the module
 	  will be called intel_pmt_crashlog.
+
+config INTEL_PMT_DISCOVERY
+	tristate "Intel Platform Monitoring Technology (PMT) Discovery driver"
+	depends on INTEL_VSEC
+	select INTEL_PMT_CLASS
+	help
+	  The Intel Platform Monitoring Technology (PMT) discovery driver provides
+	  access to details about the various PMT features and feature specific
+	  attributes.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called pmt_discovery.
diff --git a/drivers/platform/x86/intel/pmt/Makefile b/drivers/platform/x86/intel/pmt/Makefile
index 279e158c7c23..8aed7e1592e4 100644
--- a/drivers/platform/x86/intel/pmt/Makefile
+++ b/drivers/platform/x86/intel/pmt/Makefile
@@ -10,3 +10,5 @@ obj-$(CONFIG_INTEL_PMT_TELEMETRY)	+= pmt_telemetry.o
 pmt_telemetry-y				:= telemetry.o
 obj-$(CONFIG_INTEL_PMT_CRASHLOG)	+= pmt_crashlog.o
 pmt_crashlog-y				:= crashlog.o
+obj-$(CONFIG_INTEL_PMT_DISCOVERY)	+= pmt_discovery.o
+pmt_discovery-y				:= discovery.o features.o
diff --git a/drivers/platform/x86/intel/pmt/class.c b/drivers/platform/x86/intel/pmt/class.c
index 7233b654bbad..a806a81ece52 100644
--- a/drivers/platform/x86/intel/pmt/class.c
+++ b/drivers/platform/x86/intel/pmt/class.c
@@ -9,11 +9,13 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/log2.h>
 #include <linux/intel_vsec.h>
 #include <linux/io-64-nonatomic-lo-hi.h>
 #include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/pci.h>
+#include <linux/sysfs.h>
 
 #include "class.h"
 
@@ -166,12 +168,41 @@ static struct attribute *intel_pmt_attrs[] = {
 	&dev_attr_offset.attr,
 	NULL
 };
-ATTRIBUTE_GROUPS(intel_pmt);
 
-static struct class intel_pmt_class = {
+static umode_t intel_pmt_attr_visible(struct kobject *kobj,
+				      struct attribute *attr, int n)
+{
+	struct device *dev = container_of(kobj, struct device, kobj);
+	struct auxiliary_device *auxdev = to_auxiliary_dev(dev->parent);
+	struct intel_vsec_device *ivdev = auxdev_to_ivdev(auxdev);
+
+	/*
+	 * Place the discovery features folder in /sys/class/intel_pmt, but
+	 * exclude the common attributes as they are not applicable.
+	 */
+	if (ivdev->cap_id == ilog2(VSEC_CAP_DISCOVERY))
+		return 0;
+
+	return attr->mode;
+}
+
+static bool intel_pmt_group_visible(struct kobject *kobj)
+{
+	return true;
+}
+DEFINE_SYSFS_GROUP_VISIBLE(intel_pmt);
+
+static const struct attribute_group intel_pmt_group = {
+	.attrs = intel_pmt_attrs,
+	.is_visible = SYSFS_GROUP_VISIBLE(intel_pmt),
+};
+__ATTRIBUTE_GROUPS(intel_pmt);
+
+struct class intel_pmt_class = {
 	.name = "intel_pmt",
 	.dev_groups = intel_pmt_groups,
 };
+EXPORT_SYMBOL_GPL(intel_pmt_class);
 
 static int intel_pmt_populate_entry(struct intel_pmt_entry *entry,
 				    struct intel_vsec_device *ivdev,
diff --git a/drivers/platform/x86/intel/pmt/class.h b/drivers/platform/x86/intel/pmt/class.h
index b2006d57779d..39c32357ee2c 100644
--- a/drivers/platform/x86/intel/pmt/class.h
+++ b/drivers/platform/x86/intel/pmt/class.h
@@ -20,6 +20,7 @@
 #define GET_ADDRESS(v)		((v) & GENMASK(31, 3))
 
 struct pci_dev;
+extern struct class intel_pmt_class;
 
 struct telem_endpoint {
 	struct pci_dev		*pcidev;
@@ -48,6 +49,7 @@ struct intel_pmt_entry {
 	unsigned long		base_addr;
 	size_t			size;
 	u32			guid;
+	u32			num_rmids; /* Number of Resource Monitoring IDs */
 	int			devid;
 };
 
diff --git a/drivers/platform/x86/intel/pmt/discovery.c b/drivers/platform/x86/intel/pmt/discovery.c
new file mode 100644
index 000000000000..4b4fa3137ad2
--- /dev/null
+++ b/drivers/platform/x86/intel/pmt/discovery.c
@@ -0,0 +1,602 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel Platform Monitoring Technology Discovery driver
+ *
+ * Copyright (c) 2025, Intel Corporation.
+ * All Rights Reserved.
+ */
+
+#include <linux/auxiliary_bus.h>
+#include <linux/bitfield.h>
+#include <linux/bits.h>
+#include <linux/bug.h>
+#include <linux/cleanup.h>
+#include <linux/container_of.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/ioport.h>
+#include <linux/kdev_t.h>
+#include <linux/kobject.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/overflow.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/string_choices.h>
+#include <linux/sysfs.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+
+#include <linux/intel_pmt_features.h>
+#include <linux/intel_vsec.h>
+
+#include "class.h"
+
+#define MAX_FEATURE_VERSION	0
+#define DT_TBIR			GENMASK(2, 0)
+#define FEAT_ATTR_SIZE(x)	((x) * sizeof(u32))
+#define PMT_GUID_SIZE(x)	((x) * sizeof(u32))
+#define PMT_ACCESS_TYPE_RSVD	0xF
+#define SKIP_FEATURE		1
+
+struct feature_discovery_table {
+	u32	access_type:4;
+	u32	version:8;
+	u32	size:16;
+	u32	reserved:4;
+	u32	id;
+	u32	offset;
+	u32	reserved2;
+};
+
+/* Common feature table header */
+struct feature_header {
+	u32	attr_size:8;
+	u32	num_guids:8;
+	u32	reserved:16;
+};
+
+/* Feature attribute fields */
+struct caps {
+	u32		caps;
+};
+
+struct command {
+	u32		max_stream_size:16;
+	u32		max_command_size:16;
+};
+
+struct watcher {
+	u32		reserved:21;
+	u32		period:11;
+	struct command	command;
+};
+
+struct rmid {
+	u32		num_rmids:16;	/* Number of Resource Monitoring IDs */
+	u32		reserved:16;
+	struct watcher	watcher;
+};
+
+struct feature_table {
+	struct feature_header	header;
+	struct caps		caps;
+	union {
+		struct command command;
+		struct watcher watcher;
+		struct rmid rmid;
+	};
+	u32			*guids;
+};
+
+/* For backreference in struct feature */
+struct pmt_features_priv;
+
+struct feature {
+	struct feature_table		table;
+	struct kobject			kobj;
+	struct pmt_features_priv	*priv;
+	struct list_head		list;
+	const struct attribute_group	*attr_group;
+	enum pmt_feature_id		id;
+};
+
+struct pmt_features_priv {
+	struct device		*parent;
+	struct device		*dev;
+	int			count;
+	u32			mask;
+	struct feature		feature[];
+};
+
+static LIST_HEAD(pmt_feature_list);
+static DEFINE_MUTEX(feature_list_lock);
+
+#define to_pmt_feature(x) container_of(x, struct feature, kobj)
+static void pmt_feature_release(struct kobject *kobj)
+{
+}
+
+static ssize_t caps_show(struct kobject *kobj, struct kobj_attribute *attr,
+			 char *buf)
+{
+	struct feature *feature = to_pmt_feature(kobj);
+	struct pmt_cap **pmt_caps;
+	u32 caps = feature->table.caps.caps;
+	ssize_t ret = 0;
+
+	switch (feature->id) {
+	case FEATURE_PER_CORE_PERF_TELEM:
+		pmt_caps = pmt_caps_pcpt;
+		break;
+	case FEATURE_PER_CORE_ENV_TELEM:
+		pmt_caps = pmt_caps_pcet;
+		break;
+	case FEATURE_PER_RMID_PERF_TELEM:
+		pmt_caps = pmt_caps_rmid_perf;
+		break;
+	case FEATURE_ACCEL_TELEM:
+		pmt_caps = pmt_caps_accel;
+		break;
+	case FEATURE_UNCORE_TELEM:
+		pmt_caps = pmt_caps_uncore;
+		break;
+	case FEATURE_CRASH_LOG:
+		pmt_caps = pmt_caps_crashlog;
+		break;
+	case FEATURE_PETE_LOG:
+		pmt_caps = pmt_caps_pete;
+		break;
+	case FEATURE_TPMI_CTRL:
+		pmt_caps = pmt_caps_tpmi;
+		break;
+	case FEATURE_TRACING:
+		pmt_caps = pmt_caps_tracing;
+		break;
+	case FEATURE_PER_RMID_ENERGY_TELEM:
+		pmt_caps = pmt_caps_rmid_energy;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	while (*pmt_caps) {
+		struct pmt_cap *pmt_cap = *pmt_caps;
+
+		while (pmt_cap->name) {
+			ret += sysfs_emit_at(buf, ret, "%-40s Available: %s\n", pmt_cap->name,
+					     str_yes_no(pmt_cap->mask & caps));
+			pmt_cap++;
+		}
+		pmt_caps++;
+	}
+
+	return ret;
+}
+static struct kobj_attribute caps_attribute = __ATTR_RO(caps);
+
+static struct watcher *get_watcher(struct feature *feature)
+{
+	switch (feature_layout[feature->id]) {
+	case LAYOUT_RMID:
+		return &feature->table.rmid.watcher;
+	case LAYOUT_WATCHER:
+		return &feature->table.watcher;
+	default:
+		return ERR_PTR(-EINVAL);
+	}
+}
+
+static struct command *get_command(struct feature *feature)
+{
+	switch (feature_layout[feature->id]) {
+	case LAYOUT_RMID:
+		return &feature->table.rmid.watcher.command;
+	case LAYOUT_WATCHER:
+		return &feature->table.watcher.command;
+	case LAYOUT_COMMAND:
+		return &feature->table.command;
+	default:
+		return ERR_PTR(-EINVAL);
+	}
+}
+
+static ssize_t num_rmids_show(struct kobject *kobj,
+			      struct kobj_attribute *attr, char *buf)
+{
+	struct feature *feature = to_pmt_feature(kobj);
+
+	return sysfs_emit(buf, "%u\n", feature->table.rmid.num_rmids);
+}
+static struct kobj_attribute num_rmids_attribute = __ATTR_RO(num_rmids);
+
+static ssize_t min_watcher_period_ms_show(struct kobject *kobj,
+					  struct kobj_attribute *attr, char *buf)
+{
+	struct feature *feature = to_pmt_feature(kobj);
+	struct watcher *watcher = get_watcher(feature);
+
+	if (IS_ERR(watcher))
+		return PTR_ERR(watcher);
+
+	return sysfs_emit(buf, "%u\n", watcher->period);
+}
+static struct kobj_attribute min_watcher_period_ms_attribute =
+	__ATTR_RO(min_watcher_period_ms);
+
+static ssize_t max_stream_size_show(struct kobject *kobj,
+				    struct kobj_attribute *attr, char *buf)
+{
+	struct feature *feature = to_pmt_feature(kobj);
+	struct command *command = get_command(feature);
+
+	if (IS_ERR(command))
+		return PTR_ERR(command);
+
+	return sysfs_emit(buf, "%u\n", command->max_stream_size);
+}
+static struct kobj_attribute max_stream_size_attribute =
+	__ATTR_RO(max_stream_size);
+
+static ssize_t max_command_size_show(struct kobject *kobj,
+				     struct kobj_attribute *attr, char *buf)
+{
+	struct feature *feature = to_pmt_feature(kobj);
+	struct command *command = get_command(feature);
+
+	if (IS_ERR(command))
+		return PTR_ERR(command);
+
+	return sysfs_emit(buf, "%u\n", command->max_command_size);
+}
+static struct kobj_attribute max_command_size_attribute =
+	__ATTR_RO(max_command_size);
+
+static ssize_t guids_show(struct kobject *kobj, struct kobj_attribute *attr,
+			  char *buf)
+{
+	struct feature *feature = to_pmt_feature(kobj);
+	int i, count = 0;
+
+	for (i = 0; i < feature->table.header.num_guids; i++)
+		count += sysfs_emit_at(buf, count, "0x%x\n",
+				       feature->table.guids[i]);
+
+	return count;
+}
+static struct kobj_attribute guids_attribute = __ATTR_RO(guids);
+
+static struct attribute *pmt_feature_rmid_attrs[] = {
+	&caps_attribute.attr,
+	&num_rmids_attribute.attr,
+	&min_watcher_period_ms_attribute.attr,
+	&max_stream_size_attribute.attr,
+	&max_command_size_attribute.attr,
+	&guids_attribute.attr,
+	NULL
+};
+ATTRIBUTE_GROUPS(pmt_feature_rmid);
+
+static const struct kobj_type pmt_feature_rmid_ktype = {
+	.sysfs_ops = &kobj_sysfs_ops,
+	.release = pmt_feature_release,
+	.default_groups = pmt_feature_rmid_groups,
+};
+
+static struct attribute *pmt_feature_watcher_attrs[] = {
+	&caps_attribute.attr,
+	&min_watcher_period_ms_attribute.attr,
+	&max_stream_size_attribute.attr,
+	&max_command_size_attribute.attr,
+	&guids_attribute.attr,
+	NULL
+};
+ATTRIBUTE_GROUPS(pmt_feature_watcher);
+
+static const struct kobj_type pmt_feature_watcher_ktype = {
+	.sysfs_ops = &kobj_sysfs_ops,
+	.release = pmt_feature_release,
+	.default_groups = pmt_feature_watcher_groups,
+};
+
+static struct attribute *pmt_feature_command_attrs[] = {
+	&caps_attribute.attr,
+	&max_stream_size_attribute.attr,
+	&max_command_size_attribute.attr,
+	&guids_attribute.attr,
+	NULL
+};
+ATTRIBUTE_GROUPS(pmt_feature_command);
+
+static const struct kobj_type pmt_feature_command_ktype = {
+	.sysfs_ops = &kobj_sysfs_ops,
+	.release = pmt_feature_release,
+	.default_groups = pmt_feature_command_groups,
+};
+
+static struct attribute *pmt_feature_guids_attrs[] = {
+	&caps_attribute.attr,
+	&guids_attribute.attr,
+	NULL
+};
+ATTRIBUTE_GROUPS(pmt_feature_guids);
+
+static const struct kobj_type pmt_feature_guids_ktype = {
+	.sysfs_ops = &kobj_sysfs_ops,
+	.release = pmt_feature_release,
+	.default_groups = pmt_feature_guids_groups,
+};
+
+/* Read one discovery entry: 0 if usable, SKIP_FEATURE to ignore it, or -errno */
+static int
+pmt_feature_get_disc_table(struct pmt_features_priv *priv,
+			   struct resource *disc_res,
+			   struct feature_discovery_table *disc_tbl)
+{
+	void __iomem *disc_base;
+
+	disc_base = devm_ioremap_resource(priv->dev, disc_res);
+	if (IS_ERR(disc_base))
+		return PTR_ERR(disc_base);
+
+	memcpy_fromio(disc_tbl, disc_base, sizeof(*disc_tbl));
+	devm_iounmap(priv->dev, disc_base);
+
+	/* The id is read from the device: check it before indexing with it */
+	if (!pmt_feature_id_is_valid(disc_tbl->id))
+		return SKIP_FEATURE;
+
+	if (priv->mask & BIT(disc_tbl->id))
+		return dev_err_probe(priv->dev, -EINVAL, "Duplicate feature: %s\n",
+				     pmt_feature_names[disc_tbl->id]);
+
+	/*
+	 * Some devices may expose non-functioning entries that are reserved
+	 * for future use. They have zero size. Do not fail probe, ignore them.
+	 */
+	if (disc_tbl->size == 0 || disc_tbl->access_type == PMT_ACCESS_TYPE_RSVD)
+		return SKIP_FEATURE;
+
+	if (disc_tbl->version > MAX_FEATURE_VERSION)
+		return SKIP_FEATURE;
+
+	priv->mask |= BIT(disc_tbl->id);
+
+	return 0;
+}
+
+/* Map the feature table described by @disc_tbl and copy it into @feature */
+static int
+pmt_feature_get_feature_table(struct pmt_features_priv *priv,
+			      struct feature *feature,
+			      struct feature_discovery_table *disc_tbl,
+			      struct resource *disc_res)
+{
+	struct feature_table *feat_tbl = &feature->table;
+	struct feature_header *header;
+	struct resource res = {};
+	resource_size_t res_size;
+	void __iomem *feat_base, *feat_offset;
+	void *tbl_offset;
+	size_t size;
+	u32 *guids;
+	u8 tbir;
+
+	tbir = FIELD_GET(DT_TBIR, disc_tbl->offset);
+
+	switch (disc_tbl->access_type) {
+	case ACCESS_LOCAL:
+		if (tbir)
+			return dev_err_probe(priv->dev, -EINVAL,
+				"Unsupported BAR index %u for access type %u\n",
+				tbir, disc_tbl->access_type);
+
+		/*
+		 * For access_type LOCAL, the base address is as follows:
+		 * base address = end of discovery region + base offset + 1
+		 */
+		res = DEFINE_RES_MEM(disc_res->end + disc_tbl->offset + 1,
+				     disc_tbl->size * sizeof(u32));
+		break;
+
+	default:
+		return dev_err_probe(priv->dev, -EINVAL, "Unrecognized access_type %u\n",
+				     disc_tbl->access_type);
+	}
+
+	feature->id = disc_tbl->id;
+
+	/* Get the feature table */
+	feat_base = devm_ioremap_resource(priv->dev, &res);
+	if (IS_ERR(feat_base))
+		return PTR_ERR(feat_base);
+
+	feat_offset = feat_base;
+	tbl_offset = feat_tbl;
+
+	/* Get the header */
+	header = &feat_tbl->header;
+	memcpy_fromio(header, feat_offset, sizeof(*header));
+
+	/* Validate fields fit within mapped resource */
+	size = sizeof(*header) + FEAT_ATTR_SIZE(header->attr_size) +
+	       PMT_GUID_SIZE(header->num_guids);
+	res_size = resource_size(&res);
+	if (WARN(size > res_size, "Bad table size %zu > %pa", size, &res_size))
+		return -EINVAL;
+
+	/* Get the attributes; cap the copy so it cannot run past ->guids */
+	tbl_offset += sizeof(*header);
+	feat_offset += sizeof(*header);
+	size = offsetof(struct feature_table, guids) - sizeof(*header);
+	if (size > FEAT_ATTR_SIZE(header->attr_size))
+		size = FEAT_ATTR_SIZE(header->attr_size);
+	memcpy_fromio(tbl_offset, feat_offset, size);
+
+	/* Finally, get the guids */
+	guids = devm_kmalloc(priv->dev, PMT_GUID_SIZE(header->num_guids), GFP_KERNEL);
+	if (!guids)
+		return -ENOMEM;
+
+	feat_offset += FEAT_ATTR_SIZE(header->attr_size);
+	memcpy_fromio(guids, feat_offset, PMT_GUID_SIZE(header->num_guids));
+	feat_tbl->guids = guids;
+
+	devm_iounmap(priv->dev, feat_base);
+
+	return 0;
+}
+
+static void pmt_features_add_feat(struct feature *feature)
+{
+	guard(mutex)(&feature_list_lock);
+	list_add(&feature->list, &pmt_feature_list);
+}
+
+static void pmt_features_remove_feat(struct feature *feature)
+{
+	guard(mutex)(&feature_list_lock);
+	list_del(&feature->list);
+}
+
+/* Discover the feature behind resource @idx and register its sysfs kobject */
+static int pmt_features_discovery(struct pmt_features_priv *priv,
+				  struct feature *feature,
+				  struct intel_vsec_device *ivdev,
+				  int idx)
+{
+	struct feature_discovery_table disc_tbl = {}; /* Avoid false warning */
+	struct resource *disc_res = &ivdev->resource[idx];
+	const struct kobj_type *ktype;
+	int ret;
+
+	ret = pmt_feature_get_disc_table(priv, disc_res, &disc_tbl);
+	if (!ret)
+		ret = pmt_feature_get_feature_table(priv, feature, &disc_tbl, disc_res);
+	if (ret)
+		return ret;
+
+	switch (feature_layout[feature->id]) {
+	case LAYOUT_RMID:
+		ktype = &pmt_feature_rmid_ktype;
+		feature->attr_group = &pmt_feature_rmid_group;
+		break;
+	case LAYOUT_WATCHER:
+		ktype = &pmt_feature_watcher_ktype;
+		feature->attr_group = &pmt_feature_watcher_group;
+		break;
+	case LAYOUT_COMMAND:
+		ktype = &pmt_feature_command_ktype;
+		feature->attr_group = &pmt_feature_command_group;
+		break;
+	case LAYOUT_CAPS_ONLY:
+		ktype = &pmt_feature_guids_ktype;
+		feature->attr_group = &pmt_feature_guids_group;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	ret = kobject_init_and_add(&feature->kobj, ktype, &priv->dev->kobj,
+				   "%s", pmt_feature_names[feature->id]);
+	if (ret) {
+		kobject_put(&feature->kobj); /* required after a failed add */
+		return ret;
+	}
+
+	kobject_uevent(&feature->kobj, KOBJ_ADD);
+	pmt_features_add_feat(feature);
+
+	return 0;
+}
+
+static void pmt_features_remove(struct auxiliary_device *auxdev)
+{
+	struct pmt_features_priv *priv = auxiliary_get_drvdata(auxdev);
+	int i;
+
+	for (i = 0; i < priv->count; i++) {
+		struct feature *feature = &priv->feature[i];
+
+		pmt_features_remove_feat(feature);
+		sysfs_remove_group(&feature->kobj, feature->attr_group);
+		kobject_put(&feature->kobj);
+	}
+
+	device_unregister(priv->dev);
+}
+
+/* Create the "features-<parent name>" class device and register each feature */
+static int pmt_features_probe(struct auxiliary_device *auxdev, const struct auxiliary_device_id *id)
+{
+	struct intel_vsec_device *ivdev = auxdev_to_ivdev(auxdev);
+	struct pmt_features_priv *priv;
+	size_t size;
+	int ret, i;
+
+	size = struct_size(priv, feature, ivdev->num_resources);
+	priv = devm_kzalloc(&auxdev->dev, size, GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	priv->parent = &ivdev->pcidev->dev;
+	auxiliary_set_drvdata(auxdev, priv);
+
+	priv->dev = device_create(&intel_pmt_class, &auxdev->dev, MKDEV(0, 0), priv,
+				  "%s-%s", "features", dev_name(priv->parent));
+	/* On failure priv->dev is an ERR_PTR, so log against the auxiliary device */
+	if (IS_ERR(priv->dev))
+		return dev_err_probe(&auxdev->dev, PTR_ERR(priv->dev),
+				     "Could not create %s-%s device node\n",
+				     "features", dev_name(priv->parent));
+
+	/* Initialize each feature */
+	for (i = 0; i < ivdev->num_resources; i++) {
+		struct feature *feature = &priv->feature[priv->count];
+
+		ret = pmt_features_discovery(priv, feature, ivdev, i);
+		if (ret == SKIP_FEATURE)
+			continue;
+		if (ret != 0)
+			goto abort_probe;
+
+		feature->priv = priv;
+		priv->count++;
+	}
+
+	return 0;
+
+abort_probe:
+	/*
+	 * Only fully initialized features are tracked in priv->count, which is
+	 * incremented only after a feature is completely set up (discovery and
+	 * sysfs registration). A failing feature's state is local and needs no
+	 * rollback, so on error we can safely call the driver's remove()
+	 * routine, pmt_features_remove(), to clean up only the counted features.
+	 * All other resources are device-managed and will be cleaned up
+	 * automatically during device_unregister().
+	 */
+	pmt_features_remove(auxdev);
+
+	return ret;
+}
+
+static const struct auxiliary_device_id pmt_features_id_table[] = {
+	{ .name = "intel_vsec.discovery" },
+	{}
+};
+MODULE_DEVICE_TABLE(auxiliary, pmt_features_id_table);
+
+static struct auxiliary_driver pmt_features_aux_driver = {
+	.id_table	= pmt_features_id_table,
+	.remove		= pmt_features_remove,
+	.probe		= pmt_features_probe,
+};
+module_auxiliary_driver(pmt_features_aux_driver);
+
+MODULE_AUTHOR("David E. Box <david.e.box@linux.intel.com>");
+MODULE_DESCRIPTION("Intel PMT Discovery driver");
+MODULE_LICENSE("GPL");
+MODULE_IMPORT_NS("INTEL_PMT");
diff --git a/drivers/platform/x86/intel/pmt/features.c b/drivers/platform/x86/intel/pmt/features.c
new file mode 100644
index 000000000000..8a39cddc75c8
--- /dev/null
+++ b/drivers/platform/x86/intel/pmt/features.c
@@ -0,0 +1,205 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2025, Intel Corporation.
+ * All Rights Reserved.
+ *
+ * Author: "David E. Box" <david.e.box@linux.intel.com>
+ */
+
+#include <linux/export.h>
+#include <linux/types.h>
+
+#include <linux/intel_pmt_features.h>
+
+const char * const pmt_feature_names[] = {
+	[FEATURE_PER_CORE_PERF_TELEM]	= "per_core_performance_telemetry",
+	[FEATURE_PER_CORE_ENV_TELEM]	= "per_core_environment_telemetry",
+	[FEATURE_PER_RMID_PERF_TELEM]	= "per_rmid_perf_telemetry",
+	[FEATURE_ACCEL_TELEM]		= "accelerator_telemetry",
+	[FEATURE_UNCORE_TELEM]		= "uncore_telemetry",
+	[FEATURE_CRASH_LOG]		= "crash_log",
+	[FEATURE_PETE_LOG]		= "pete_log",
+	[FEATURE_TPMI_CTRL]		= "tpmi_control",
+	[FEATURE_TRACING]		= "tracing",
+	[FEATURE_PER_RMID_ENERGY_TELEM]	= "per_rmid_energy_telemetry",
+};
+EXPORT_SYMBOL_NS_GPL(pmt_feature_names, "INTEL_PMT_DISCOVERY");
+
+enum feature_layout feature_layout[] = {
+	[FEATURE_PER_CORE_PERF_TELEM]	= LAYOUT_WATCHER,
+	[FEATURE_PER_CORE_ENV_TELEM]	= LAYOUT_WATCHER,
+	[FEATURE_PER_RMID_PERF_TELEM]	= LAYOUT_RMID,
+	[FEATURE_ACCEL_TELEM]		= LAYOUT_WATCHER,
+	[FEATURE_UNCORE_TELEM]		= LAYOUT_WATCHER,
+	[FEATURE_CRASH_LOG]		= LAYOUT_COMMAND,
+	[FEATURE_PETE_LOG]		= LAYOUT_COMMAND,
+	[FEATURE_TPMI_CTRL]		= LAYOUT_CAPS_ONLY,
+	[FEATURE_TRACING]		= LAYOUT_CAPS_ONLY,
+	[FEATURE_PER_RMID_ENERGY_TELEM]	= LAYOUT_RMID,
+};
+
+struct pmt_cap pmt_cap_common[] = {
+	{PMT_CAP_TELEM,		"telemetry"},
+	{PMT_CAP_WATCHER,	"watcher"},
+	{PMT_CAP_CRASHLOG,	"crashlog"},
+	{PMT_CAP_STREAMING,	"streaming"},
+	{PMT_CAP_THRESHOLD,	"threshold"},
+	{PMT_CAP_WINDOW,	"window"},
+	{PMT_CAP_CONFIG,	"config"},
+	{PMT_CAP_TRACING,	"tracing"},
+	{PMT_CAP_INBAND,	"inband"},
+	{PMT_CAP_OOB,		"oob"},
+	{PMT_CAP_SECURED_CHAN,	"secure_chan"},
+	{PMT_CAP_PMT_SP,	"pmt_sp"},
+	{PMT_CAP_PMT_SP_POLICY,	"pmt_sp_policy"},
+	{}
+};
+
+struct pmt_cap pmt_cap_pcpt[] = {
+	{PMT_CAP_PCPT_CORE_PERF,	"core_performance"},
+	{PMT_CAP_PCPT_CORE_C0_RES,	"core_c0_residency"},
+	{PMT_CAP_PCPT_CORE_ACTIVITY,	"core_activity"},
+	{PMT_CAP_PCPT_CACHE_PERF,	"cache_performance"},
+	{PMT_CAP_PCPT_QUALITY_TELEM,	"quality_telemetry"},
+	{}
+};
+
+struct pmt_cap *pmt_caps_pcpt[] = {
+	pmt_cap_common,
+	pmt_cap_pcpt,
+	NULL
+};
+
+struct pmt_cap pmt_cap_pcet[] = {
+	{PMT_CAP_PCET_WORKPOINT_HIST,	"workpoint_histogram"},
+	{PMT_CAP_PCET_CORE_CURR_TEMP,	"core_current_temp"},
+	{PMT_CAP_PCET_CORE_INST_RES,	"core_inst_residency"},
+	{PMT_CAP_PCET_QUALITY_TELEM,	"quality_telemetry"},
+	{PMT_CAP_PCET_CORE_CDYN_LVL,	"core_cdyn_level"},
+	{PMT_CAP_PCET_CORE_STRESS_LVL,	"core_stress_level"},
+	{PMT_CAP_PCET_CORE_DAS,		"core_digital_aging_sensor"},
+	{PMT_CAP_PCET_FIVR_HEALTH,	"fivr_health"},
+	{PMT_CAP_PCET_ENERGY,		"energy"},
+	{PMT_CAP_PCET_PEM_STATUS,	"pem_status"},
+	{PMT_CAP_PCET_CORE_C_STATE,	"core_c_state"},
+	{}
+};
+
+struct pmt_cap *pmt_caps_pcet[] = {
+	pmt_cap_common,
+	pmt_cap_pcet,
+	NULL
+};
+
+struct pmt_cap pmt_cap_rmid_perf[] = {
+	{PMT_CAP_RMID_CORES_PERF,	"core_performance"},
+	{PMT_CAP_RMID_CACHE_PERF,	"cache_performance"},
+	{PMT_CAP_RMID_PERF_QUAL,	"performance_quality"},
+	{}
+};
+
+struct pmt_cap *pmt_caps_rmid_perf[] = {
+	pmt_cap_common,
+	pmt_cap_rmid_perf,
+	NULL
+};
+
+struct pmt_cap pmt_cap_accel[] = {
+	{PMT_CAP_ACCEL_CPM_TELEM,	"content_processing_module"},
+	{PMT_CAP_ACCEL_TIP_TELEM,	"content_turbo_ip"},
+	{}
+};
+
+struct pmt_cap *pmt_caps_accel[] = {
+	pmt_cap_common,
+	pmt_cap_accel,
+	NULL
+};
+
+struct pmt_cap pmt_cap_uncore[] = {
+	{PMT_CAP_UNCORE_IO_CA_TELEM,	"io_ca"},
+	{PMT_CAP_UNCORE_RMID_TELEM,	"rmid"},
+	{PMT_CAP_UNCORE_D2D_ULA_TELEM,	"d2d_ula"},
+	{PMT_CAP_UNCORE_PKGC_TELEM,	"package_c"},
+	{}
+};
+
+struct pmt_cap *pmt_caps_uncore[] = {
+	pmt_cap_common,
+	pmt_cap_uncore,
+	NULL
+};
+
+struct pmt_cap pmt_cap_crashlog[] = {
+	{PMT_CAP_CRASHLOG_MAN_TRIG,	"manual_trigger"},
+	{PMT_CAP_CRASHLOG_CORE,		"core"},
+	{PMT_CAP_CRASHLOG_UNCORE,	"uncore"},
+	{PMT_CAP_CRASHLOG_TOR,		"tor"},
+	{PMT_CAP_CRASHLOG_S3M,		"s3m"},
+	{PMT_CAP_CRASHLOG_PERSISTENCY,	"persistency"},
+	{PMT_CAP_CRASHLOG_CLIP_GPIO,	"crashlog_in_progress"},
+	{PMT_CAP_CRASHLOG_PRE_RESET,	"pre_reset_extraction"},
+	{PMT_CAP_CRASHLOG_POST_RESET,	"post_reset_extraction"},
+	{}
+};
+
+struct pmt_cap *pmt_caps_crashlog[] = {
+	pmt_cap_common,
+	pmt_cap_crashlog,
+	NULL
+};
+
+struct pmt_cap pmt_cap_pete[] = {
+	{PMT_CAP_PETE_MAN_TRIG,		"manual_trigger"},
+	{PMT_CAP_PETE_ENCRYPTION,	"encryption"},
+	{PMT_CAP_PETE_PERSISTENCY,	"persistency"},
+	{PMT_CAP_PETE_REQ_TOKENS,	"required_tokens"},
+	{PMT_CAP_PETE_PROD_ENABLED,	"production_enabled"},
+	{PMT_CAP_PETE_DEBUG_ENABLED,	"debug_enabled"},
+	{}
+};
+
+struct pmt_cap *pmt_caps_pete[] = {
+	pmt_cap_common,
+	pmt_cap_pete,
+	NULL
+};
+
+struct pmt_cap pmt_cap_tpmi[] = {
+	{PMT_CAP_TPMI_MAILBOX,		"mailbox"},
+	{PMT_CAP_TPMI_LOCK,		"bios_lock"},
+	{}
+};
+
+struct pmt_cap *pmt_caps_tpmi[] = {
+	pmt_cap_common,
+	pmt_cap_tpmi,
+	NULL
+};
+
+struct pmt_cap pmt_cap_tracing[] = {
+	{PMT_CAP_TRACE_SRAR,		"srar_errors"},
+	{PMT_CAP_TRACE_CORRECTABLE,	"correctable_errors"},
+	{PMT_CAP_TRACE_MCTP,		"mctp"},
+	{PMT_CAP_TRACE_MRT,		"memory_resiliency"},
+	{}
+};
+
+struct pmt_cap *pmt_caps_tracing[] = {
+	pmt_cap_common,
+	pmt_cap_tracing,
+	NULL
+};
+
+struct pmt_cap pmt_cap_rmid_energy[] = {
+	{PMT_CAP_RMID_ENERGY,		"energy"},
+	{PMT_CAP_RMID_ACTIVITY,		"activity"},
+	{PMT_CAP_RMID_ENERGY_QUAL,	"energy_quality"},
+	{}
+};
+
+struct pmt_cap *pmt_caps_rmid_energy[] = {
+	pmt_cap_common,
+	pmt_cap_rmid_energy,
+	NULL
+};
diff --git a/include/linux/intel_pmt_features.h b/include/linux/intel_pmt_features.h
new file mode 100644
index 000000000000..53573a4a49b7
--- /dev/null
+++ b/include/linux/intel_pmt_features.h
@@ -0,0 +1,157 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FEATURES_H
+#define _FEATURES_H
+
+#include <linux/bits.h>
+#include <linux/types.h>
+
+/* Common masks */
+#define PMT_CAP_TELEM			BIT(0)
+#define PMT_CAP_WATCHER			BIT(1)
+#define PMT_CAP_CRASHLOG		BIT(2)
+#define PMT_CAP_STREAMING		BIT(3)
+#define PMT_CAP_THRESHOLD		BIT(4)
+#define PMT_CAP_WINDOW			BIT(5)
+#define PMT_CAP_CONFIG			BIT(6)
+#define PMT_CAP_TRACING			BIT(7)
+#define PMT_CAP_INBAND			BIT(8)
+#define PMT_CAP_OOB			BIT(9)
+#define PMT_CAP_SECURED_CHAN		BIT(10)
+
+#define PMT_CAP_PMT_SP			BIT(11)
+#define PMT_CAP_PMT_SP_POLICY		GENMASK(17, 12)
+
+/* Per Core Performance Telemetry (PCPT) specific masks */
+#define PMT_CAP_PCPT_CORE_PERF		BIT(18)
+#define PMT_CAP_PCPT_CORE_C0_RES	BIT(19)
+#define PMT_CAP_PCPT_CORE_ACTIVITY	BIT(20)
+#define PMT_CAP_PCPT_CACHE_PERF		BIT(21)
+#define PMT_CAP_PCPT_QUALITY_TELEM	BIT(22)
+
+/* Per Core Environmental Telemetry (PCET) specific masks */
+#define PMT_CAP_PCET_WORKPOINT_HIST	BIT(18)
+#define PMT_CAP_PCET_CORE_CURR_TEMP	BIT(19)
+#define PMT_CAP_PCET_CORE_INST_RES	BIT(20)
+#define PMT_CAP_PCET_QUALITY_TELEM	BIT(21)	/* Same as PMT_CAP_PCPT */
+#define PMT_CAP_PCET_CORE_CDYN_LVL	BIT(22)
+#define PMT_CAP_PCET_CORE_STRESS_LVL	BIT(23)
+#define PMT_CAP_PCET_CORE_DAS		BIT(24)
+#define PMT_CAP_PCET_FIVR_HEALTH	BIT(25)
+#define PMT_CAP_PCET_ENERGY		BIT(26)
+#define PMT_CAP_PCET_PEM_STATUS		BIT(27)
+#define PMT_CAP_PCET_CORE_C_STATE	BIT(28)
+
+/* Per RMID Performance Telemetry specific masks */
+#define PMT_CAP_RMID_CORES_PERF		BIT(18)
+#define PMT_CAP_RMID_CACHE_PERF		BIT(19)
+#define PMT_CAP_RMID_PERF_QUAL		BIT(20)
+
+/* Accelerator Telemetry specific masks */
+#define PMT_CAP_ACCEL_CPM_TELEM		BIT(18)
+#define PMT_CAP_ACCEL_TIP_TELEM		BIT(19)
+
+/* Uncore Telemetry specific masks */
+#define PMT_CAP_UNCORE_IO_CA_TELEM	BIT(18)
+#define PMT_CAP_UNCORE_RMID_TELEM	BIT(19)
+#define PMT_CAP_UNCORE_D2D_ULA_TELEM	BIT(20)
+#define PMT_CAP_UNCORE_PKGC_TELEM	BIT(21)
+
+/* Crash Log specific masks */
+#define PMT_CAP_CRASHLOG_MAN_TRIG	BIT(11)
+#define PMT_CAP_CRASHLOG_CORE		BIT(12)
+#define PMT_CAP_CRASHLOG_UNCORE		BIT(13)
+#define PMT_CAP_CRASHLOG_TOR		BIT(14)
+#define PMT_CAP_CRASHLOG_S3M		BIT(15)
+#define PMT_CAP_CRASHLOG_PERSISTENCY	BIT(16)
+#define PMT_CAP_CRASHLOG_CLIP_GPIO	BIT(17)
+#define PMT_CAP_CRASHLOG_PRE_RESET	BIT(18)
+#define PMT_CAP_CRASHLOG_POST_RESET	BIT(19)
+
+/* PeTe Log specific masks */
+#define PMT_CAP_PETE_MAN_TRIG		BIT(11)
+#define PMT_CAP_PETE_ENCRYPTION		BIT(12)
+#define PMT_CAP_PETE_PERSISTENCY	BIT(13)
+#define PMT_CAP_PETE_REQ_TOKENS		BIT(14)
+#define PMT_CAP_PETE_PROD_ENABLED	BIT(15)
+#define PMT_CAP_PETE_DEBUG_ENABLED	BIT(16)
+
+/* TPMI control specific masks */
+#define PMT_CAP_TPMI_MAILBOX		BIT(11)
+#define PMT_CAP_TPMI_LOCK		BIT(12)
+
+/* Tracing specific masks */
+#define PMT_CAP_TRACE_SRAR		BIT(11)
+#define PMT_CAP_TRACE_CORRECTABLE	BIT(12)
+#define PMT_CAP_TRACE_MCTP		BIT(13)
+#define PMT_CAP_TRACE_MRT		BIT(14)
+
+/* Per RMID Energy Telemetry specific masks */
+#define PMT_CAP_RMID_ENERGY		BIT(18)
+#define PMT_CAP_RMID_ACTIVITY		BIT(19)
+#define PMT_CAP_RMID_ENERGY_QUAL	BIT(20)
+
+enum pmt_feature_id {
+	FEATURE_INVALID			= 0x0,
+	FEATURE_PER_CORE_PERF_TELEM	= 0x1,
+	FEATURE_PER_CORE_ENV_TELEM	= 0x2,
+	FEATURE_PER_RMID_PERF_TELEM	= 0x3,
+	FEATURE_ACCEL_TELEM		= 0x4,
+	FEATURE_UNCORE_TELEM		= 0x5,
+	FEATURE_CRASH_LOG		= 0x6,
+	FEATURE_PETE_LOG		= 0x7,
+	FEATURE_TPMI_CTRL		= 0x8,
+	FEATURE_RESERVED		= 0x9,
+	FEATURE_TRACING			= 0xA,
+	FEATURE_PER_RMID_ENERGY_TELEM	= 0xB,
+	FEATURE_MAX			= 0xB,
+};
+
+enum feature_layout {
+	LAYOUT_RMID,
+	LAYOUT_WATCHER,
+	LAYOUT_COMMAND,
+	LAYOUT_CAPS_ONLY,
+};
+
+struct pmt_cap {
+	u32		mask;
+	const char	*name;
+};
+
+extern const char * const pmt_feature_names[];
+extern enum feature_layout feature_layout[];
+extern struct pmt_cap pmt_cap_common[];
+extern struct pmt_cap pmt_cap_pcpt[];
+extern struct pmt_cap *pmt_caps_pcpt[];
+extern struct pmt_cap pmt_cap_pcet[];
+extern struct pmt_cap *pmt_caps_pcet[];
+extern struct pmt_cap pmt_cap_rmid_perf[];
+extern struct pmt_cap *pmt_caps_rmid_perf[];
+extern struct pmt_cap pmt_cap_accel[];
+extern struct pmt_cap *pmt_caps_accel[];
+extern struct pmt_cap pmt_cap_uncore[];
+extern struct pmt_cap *pmt_caps_uncore[];
+extern struct pmt_cap pmt_cap_crashlog[];
+extern struct pmt_cap *pmt_caps_crashlog[];
+extern struct pmt_cap pmt_cap_pete[];
+extern struct pmt_cap *pmt_caps_pete[];
+extern struct pmt_cap pmt_cap_tpmi[];
+extern struct pmt_cap *pmt_caps_tpmi[];
+extern struct pmt_cap pmt_cap_s3m[];
+extern struct pmt_cap *pmt_caps_s3m[];
+extern struct pmt_cap pmt_cap_tracing[];
+extern struct pmt_cap *pmt_caps_tracing[];
+extern struct pmt_cap pmt_cap_rmid_energy[];
+extern struct pmt_cap *pmt_caps_rmid_energy[];
+
+static inline bool pmt_feature_id_is_valid(enum pmt_feature_id id)
+{
+	if (id > FEATURE_MAX)
+		return false;
+
+	if (id == FEATURE_INVALID || id == FEATURE_RESERVED)
+		return false;
+
+	return true;
+}
+#endif
-- 
cgit v1.2.3


From 934954df0f44de5e10afc1af84c06f78149f15fe Mon Sep 17 00:00:00 2001
From: "David E. Box" <david.e.box@linux.intel.com>
Date: Wed, 2 Jul 2025 19:28:25 -0700
Subject: platform/x86/intel/tpmi: Relocate platform info to intel_vsec.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The TPMI platform information provides a mapping of OOBMSM PCI devices to
logical CPUs. Since this mapping is consistent across all OOBMSM features
(e.g., TPMI, PMT, SDSi), it can be leveraged by multiple drivers. To
facilitate reuse, relocate the struct intel_tpmi_plat_info to intel_vsec.h,
renaming it to struct oobmsm_plat_info, making it accessible to other
features. While modifying headers, place them in alphabetical order.

Signed-off-by: David E. Box <david.e.box@linux.intel.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20250703022832.1302928-11-david.e.box@linux.intel.com
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 drivers/platform/x86/intel/plr_tpmi.c              |  3 ++-
 .../x86/intel/speed_select_if/isst_tpmi_core.c     |  9 ++++----
 .../intel/uncore-frequency/uncore-frequency-tpmi.c |  7 +++---
 drivers/platform/x86/intel/vsec_tpmi.c             |  4 ++--
 drivers/powercap/intel_rapl_tpmi.c                 |  9 ++++----
 include/linux/intel_tpmi.h                         | 27 +++-------------------
 include/linux/intel_vsec.h                         | 22 ++++++++++++++++++
 7 files changed, 43 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/drivers/platform/x86/intel/plr_tpmi.c b/drivers/platform/x86/intel/plr_tpmi.c
index 2b55347a5a93..58132da47745 100644
--- a/drivers/platform/x86/intel/plr_tpmi.c
+++ b/drivers/platform/x86/intel/plr_tpmi.c
@@ -14,6 +14,7 @@
 #include <linux/err.h>
 #include <linux/gfp_types.h>
 #include <linux/intel_tpmi.h>
+#include <linux/intel_vsec.h>
 #include <linux/io.h>
 #include <linux/iopoll.h>
 #include <linux/kstrtox.h>
@@ -256,7 +257,7 @@ DEFINE_SHOW_STORE_ATTRIBUTE(plr_status);
 
 static int intel_plr_probe(struct auxiliary_device *auxdev, const struct auxiliary_device_id *id)
 {
-	struct intel_tpmi_plat_info *plat_info;
+	struct oobmsm_plat_info *plat_info;
 	struct dentry *dentry;
 	int i, num_resources;
 	struct resource *res;
diff --git a/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c b/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c
index 18c035710eb9..34bff2f65a83 100644
--- a/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c
+++ b/drivers/platform/x86/intel/speed_select_if/isst_tpmi_core.c
@@ -22,6 +22,7 @@
 #include <linux/auxiliary_bus.h>
 #include <linux/delay.h>
 #include <linux/intel_tpmi.h>
+#include <linux/intel_vsec.h>
 #include <linux/fs.h>
 #include <linux/io.h>
 #include <linux/kernel.h>
@@ -1546,7 +1547,7 @@ int tpmi_sst_dev_add(struct auxiliary_device *auxdev)
 {
 	struct tpmi_per_power_domain_info *pd_info;
 	bool read_blocked = 0, write_blocked = 0;
-	struct intel_tpmi_plat_info *plat_info;
+	struct oobmsm_plat_info *plat_info;
 	struct device *dev = &auxdev->dev;
 	struct tpmi_sst_struct *tpmi_sst;
 	u8 i, num_resources, io_die_cnt;
@@ -1698,7 +1699,7 @@ EXPORT_SYMBOL_NS_GPL(tpmi_sst_dev_add, "INTEL_TPMI_SST");
 void tpmi_sst_dev_remove(struct auxiliary_device *auxdev)
 {
 	struct tpmi_sst_struct *tpmi_sst = auxiliary_get_drvdata(auxdev);
-	struct intel_tpmi_plat_info *plat_info;
+	struct oobmsm_plat_info *plat_info;
 
 	plat_info = tpmi_get_platform_data(auxdev);
 	if (!plat_info)
@@ -1720,7 +1721,7 @@ void tpmi_sst_dev_suspend(struct auxiliary_device *auxdev)
 {
 	struct tpmi_sst_struct *tpmi_sst = auxiliary_get_drvdata(auxdev);
 	struct tpmi_per_power_domain_info *power_domain_info;
-	struct intel_tpmi_plat_info *plat_info;
+	struct oobmsm_plat_info *plat_info;
 	void __iomem *cp_base;
 
 	plat_info = tpmi_get_platform_data(auxdev);
@@ -1748,7 +1749,7 @@ void tpmi_sst_dev_resume(struct auxiliary_device *auxdev)
 {
 	struct tpmi_sst_struct *tpmi_sst = auxiliary_get_drvdata(auxdev);
 	struct tpmi_per_power_domain_info *power_domain_info;
-	struct intel_tpmi_plat_info *plat_info;
+	struct oobmsm_plat_info *plat_info;
 	void __iomem *cp_base;
 
 	plat_info = tpmi_get_platform_data(auxdev);
diff --git a/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-tpmi.c b/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-tpmi.c
index 44d9948ed224..6df55c8e16b7 100644
--- a/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-tpmi.c
+++ b/drivers/platform/x86/intel/uncore-frequency/uncore-frequency-tpmi.c
@@ -22,9 +22,10 @@
 #include <linux/auxiliary_bus.h>
 #include <linux/bitfield.h>
 #include <linux/bits.h>
+#include <linux/intel_tpmi.h>
+#include <linux/intel_vsec.h>
 #include <linux/io.h>
 #include <linux/module.h>
-#include <linux/intel_tpmi.h>
 
 #include "../tpmi_power_domains.h"
 #include "uncore-frequency-common.h"
@@ -448,7 +449,7 @@ static void remove_cluster_entries(struct tpmi_uncore_struct *tpmi_uncore)
 }
 
 static void set_cdie_id(int domain_id, struct tpmi_uncore_cluster_info *cluster_info,
-		       struct intel_tpmi_plat_info *plat_info)
+			struct oobmsm_plat_info *plat_info)
 {
 
 	cluster_info->cdie_id = domain_id;
@@ -465,7 +466,7 @@ static void set_cdie_id(int domain_id, struct tpmi_uncore_cluster_info *cluster_
 static int uncore_probe(struct auxiliary_device *auxdev, const struct auxiliary_device_id *id)
 {
 	bool read_blocked = 0, write_blocked = 0;
-	struct intel_tpmi_plat_info *plat_info;
+	struct oobmsm_plat_info *plat_info;
 	struct tpmi_uncore_struct *tpmi_uncore;
 	bool uncore_sysfs_added = false;
 	int ret, i, pkg = 0;
diff --git a/drivers/platform/x86/intel/vsec_tpmi.c b/drivers/platform/x86/intel/vsec_tpmi.c
index 5c383a27bbe8..d95a0d994546 100644
--- a/drivers/platform/x86/intel/vsec_tpmi.c
+++ b/drivers/platform/x86/intel/vsec_tpmi.c
@@ -116,7 +116,7 @@ struct intel_tpmi_info {
 	struct intel_vsec_device *vsec_dev;
 	int feature_count;
 	u64 pfs_start;
-	struct intel_tpmi_plat_info plat_info;
+	struct oobmsm_plat_info plat_info;
 	void __iomem *tpmi_control_mem;
 	struct dentry *dbgfs_dir;
 };
@@ -187,7 +187,7 @@ struct tpmi_feature_state {
 /* Used during auxbus device creation */
 static DEFINE_IDA(intel_vsec_tpmi_ida);
 
-struct intel_tpmi_plat_info *tpmi_get_platform_data(struct auxiliary_device *auxdev)
+struct oobmsm_plat_info *tpmi_get_platform_data(struct auxiliary_device *auxdev)
 {
 	struct intel_vsec_device *vsec_dev = auxdev_to_ivdev(auxdev);
 
diff --git a/drivers/powercap/intel_rapl_tpmi.c b/drivers/powercap/intel_rapl_tpmi.c
index af2368f4db10..82201bf4685d 100644
--- a/drivers/powercap/intel_rapl_tpmi.c
+++ b/drivers/powercap/intel_rapl_tpmi.c
@@ -9,9 +9,10 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/auxiliary_bus.h>
-#include <linux/io.h>
-#include <linux/intel_tpmi.h>
 #include <linux/intel_rapl.h>
+#include <linux/intel_tpmi.h>
+#include <linux/intel_vsec.h>
+#include <linux/io.h>
 #include <linux/module.h>
 #include <linux/slab.h>
 
@@ -48,7 +49,7 @@ enum tpmi_rapl_register {
 
 struct tpmi_rapl_package {
 	struct rapl_if_priv priv;
-	struct intel_tpmi_plat_info *tpmi_info;
+	struct oobmsm_plat_info *tpmi_info;
 	struct rapl_package *rp;
 	void __iomem *base;
 	struct list_head node;
@@ -253,7 +254,7 @@ static int intel_rapl_tpmi_probe(struct auxiliary_device *auxdev,
 				 const struct auxiliary_device_id *id)
 {
 	struct tpmi_rapl_package *trp;
-	struct intel_tpmi_plat_info *info;
+	struct oobmsm_plat_info *info;
 	struct resource *res;
 	u32 offset;
 	int ret;
diff --git a/include/linux/intel_tpmi.h b/include/linux/intel_tpmi.h
index ff480b47ae64..94c06bf214fb 100644
--- a/include/linux/intel_tpmi.h
+++ b/include/linux/intel_tpmi.h
@@ -8,6 +8,8 @@
 
 #include <linux/bitfield.h>
 
+struct oobmsm_plat_info;
+
 #define TPMI_VERSION_INVALID	0xff
 #define TPMI_MINOR_VERSION(val)	FIELD_GET(GENMASK(4, 0), val)
 #define TPMI_MAJOR_VERSION(val)	FIELD_GET(GENMASK(7, 5), val)
@@ -26,30 +28,7 @@ enum intel_tpmi_id {
 	TPMI_INFO_ID = 0x81,	/* Special ID for PCI BDF and Package ID information */
 };
 
-/**
- * struct intel_tpmi_plat_info - Platform information for a TPMI device instance
- * @cdie_mask:       Mask of all compute dies in the partition
- * @package_id:      CPU Package id
- * @partition:       Package partition id when multiple VSEC PCI devices per package
- * @segment:         PCI segment ID
- * @bus_number:      PCI bus number
- * @device_number:   PCI device number
- * @function_number: PCI function number
- *
- * Structure to store platform data for a TPMI device instance. This
- * struct is used to return data via tpmi_get_platform_data().
- */
-struct intel_tpmi_plat_info {
-	u16 cdie_mask;
-	u8 package_id;
-	u8 partition;
-	u8 segment;
-	u8 bus_number;
-	u8 device_number;
-	u8 function_number;
-};
-
-struct intel_tpmi_plat_info *tpmi_get_platform_data(struct auxiliary_device *auxdev);
+struct oobmsm_plat_info *tpmi_get_platform_data(struct auxiliary_device *auxdev);
 struct resource *tpmi_get_resource_at_index(struct auxiliary_device *auxdev, int index);
 int tpmi_get_resource_count(struct auxiliary_device *auxdev);
 int tpmi_get_feature_status(struct auxiliary_device *auxdev, int feature_id, bool *read_blocked,
diff --git a/include/linux/intel_vsec.h b/include/linux/intel_vsec.h
index a07796d7d43b..cd78d0b2e623 100644
--- a/include/linux/intel_vsec.h
+++ b/include/linux/intel_vsec.h
@@ -144,6 +144,28 @@ struct intel_vsec_device {
 	unsigned long cap_id;
 };
 
+/**
+ * struct oobmsm_plat_info - Platform information for a device instance
+ * @cdie_mask:       Mask of all compute dies in the partition
+ * @package_id:      CPU Package id
+ * @partition:       Package partition id when multiple VSEC PCI devices per package
+ * @segment:         PCI segment ID
+ * @bus_number:      PCI bus number
+ * @device_number:   PCI device number
+ * @function_number: PCI function number
+ *
+ * Structure to store platform data for a OOBMSM device instance.
+ */
+struct oobmsm_plat_info {
+	u16 cdie_mask;
+	u8 package_id;
+	u8 partition;
+	u8 segment;
+	u8 bus_number;
+	u8 device_number;
+	u8 function_number;
+};
+
 int intel_vsec_add_aux(struct pci_dev *pdev, struct device *parent,
 		       struct intel_vsec_device *intel_vsec_dev,
 		       const char *name);
-- 
cgit v1.2.3


From a885a2780937afac4f31f00d11663f50d05dfb35 Mon Sep 17 00:00:00 2001
From: "David E. Box" <david.e.box@linux.intel.com>
Date: Wed, 2 Jul 2025 19:28:26 -0700
Subject: platform/x86/intel/vsec: Set OOBMSM to CPU mapping
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add functions, intel_vsec_set/get_mapping(), to set and retrieve the
OOBMSM-to-CPU mapping data in the private data of the parent Intel VSEC
driver. With this mapping information available, other Intel VSEC features
on the same OOBMSM device can easily access and use the mapping data,
allowing each of the OOBMSM features to map to the CPUs they provide data
for.

Signed-off-by: David E. Box <david.e.box@linux.intel.com>
Link: https://lore.kernel.org/r/20250703022832.1302928-12-david.e.box@linux.intel.com
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 drivers/platform/x86/intel/vsec.c | 31 +++++++++++++++++++++++++++++++
 include/linux/intel_vsec.h        | 12 ++++++++++++
 2 files changed, 43 insertions(+)

(limited to 'include')

diff --git a/drivers/platform/x86/intel/vsec.c b/drivers/platform/x86/intel/vsec.c
index 4d76f1ac3c8c..711ff4edfe21 100644
--- a/drivers/platform/x86/intel/vsec.c
+++ b/drivers/platform/x86/intel/vsec.c
@@ -44,6 +44,7 @@ enum vsec_device_state {
 struct vsec_priv {
 	struct intel_vsec_platform_info *info;
 	struct device *suppliers[VSEC_FEATURE_COUNT];
+	struct oobmsm_plat_info plat_info;
 	enum vsec_device_state state[VSEC_FEATURE_COUNT];
 	unsigned long found_caps;
 };
@@ -665,6 +666,36 @@ static int intel_vsec_pci_probe(struct pci_dev *pdev, const struct pci_device_id
 	return 0;
 }
 
+int intel_vsec_set_mapping(struct oobmsm_plat_info *plat_info,
+			   struct intel_vsec_device *vsec_dev)
+{
+	struct vsec_priv *priv;
+
+	priv = pci_get_drvdata(vsec_dev->pcidev);
+	if (!priv)
+		return -EINVAL;
+
+	priv->plat_info = *plat_info;
+
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(intel_vsec_set_mapping, "INTEL_VSEC");
+
+struct oobmsm_plat_info *intel_vsec_get_mapping(struct pci_dev *pdev)
+{
+	struct vsec_priv *priv;
+
+	if (!pci_match_id(intel_vsec_pci_ids, pdev))
+		return ERR_PTR(-EINVAL);
+
+	priv = pci_get_drvdata(pdev);
+	if (!priv)
+		return ERR_PTR(-EINVAL);
+
+	return &priv->plat_info;
+}
+EXPORT_SYMBOL_NS_GPL(intel_vsec_get_mapping, "INTEL_VSEC");
+
 /* DG1 info */
 static struct intel_vsec_header dg1_header = {
 	.length = 0x10,
diff --git a/include/linux/intel_vsec.h b/include/linux/intel_vsec.h
index cd78d0b2e623..4bd0c6e7857c 100644
--- a/include/linux/intel_vsec.h
+++ b/include/linux/intel_vsec.h
@@ -183,11 +183,23 @@ static inline struct intel_vsec_device *auxdev_to_ivdev(struct auxiliary_device
 #if IS_ENABLED(CONFIG_INTEL_VSEC)
 int intel_vsec_register(struct pci_dev *pdev,
 			 struct intel_vsec_platform_info *info);
+int intel_vsec_set_mapping(struct oobmsm_plat_info *plat_info,
+			   struct intel_vsec_device *vsec_dev);
+struct oobmsm_plat_info *intel_vsec_get_mapping(struct pci_dev *pdev);
 #else
 static inline int intel_vsec_register(struct pci_dev *pdev,
 				       struct intel_vsec_platform_info *info)
 {
 	return -ENODEV;
 }
+static inline int intel_vsec_set_mapping(struct oobmsm_plat_info *plat_info,
+					 struct intel_vsec_device *vsec_dev)
+{
+	return -ENODEV;
+}
+static inline struct oobmsm_plat_info *intel_vsec_get_mapping(struct pci_dev *pdev)
+{
+	return ERR_PTR(-ENODEV);
+}
 #endif
 #endif
-- 
cgit v1.2.3


From 86fc85c75bcd9b0f28afadd60c9f890669b42ba4 Mon Sep 17 00:00:00 2001
From: "David E. Box" <david.e.box@linux.intel.com>
Date: Wed, 2 Jul 2025 19:28:28 -0700
Subject: platform/x86/intel/pmt/discovery: Get telemetry attributes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add intel_pmt_get_features() in PMT Discovery to enable the PMT Telemetry
driver to obtain attributes of the aggregated telemetry spaces it
enumerates. The function gathers feature flags and associated data (like
the number of RMIDs) from each PMT entry, laying the groundwork for a
future kernel interface that will allow direct access to telemetry regions
based on their capabilities.

Signed-off-by: David E. Box <david.e.box@linux.intel.com>
Link: https://lore.kernel.org/r/20250703022832.1302928-14-david.e.box@linux.intel.com
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 drivers/platform/x86/intel/pmt/Kconfig     |  1 +
 drivers/platform/x86/intel/pmt/class.h     |  7 +++++++
 drivers/platform/x86/intel/pmt/discovery.c | 33 ++++++++++++++++++++++++++++++
 drivers/platform/x86/intel/pmt/telemetry.c |  5 +++++
 include/linux/intel_vsec.h                 | 16 +++++++++++++++
 5 files changed, 62 insertions(+)

(limited to 'include')

diff --git a/drivers/platform/x86/intel/pmt/Kconfig b/drivers/platform/x86/intel/pmt/Kconfig
index 0ad91b5112e9..83ae17eab462 100644
--- a/drivers/platform/x86/intel/pmt/Kconfig
+++ b/drivers/platform/x86/intel/pmt/Kconfig
@@ -18,6 +18,7 @@ config INTEL_PMT_CLASS
 config INTEL_PMT_TELEMETRY
 	tristate "Intel Platform Monitoring Technology (PMT) Telemetry driver"
 	depends on INTEL_VSEC
+	select INTEL_PMT_DISCOVERY
 	select INTEL_PMT_CLASS
 	help
 	  The Intel Platform Monitory Technology (PMT) Telemetry driver provides
diff --git a/drivers/platform/x86/intel/pmt/class.h b/drivers/platform/x86/intel/pmt/class.h
index 39c32357ee2c..fdf7e79d8c0d 100644
--- a/drivers/platform/x86/intel/pmt/class.h
+++ b/drivers/platform/x86/intel/pmt/class.h
@@ -48,6 +48,7 @@ struct intel_pmt_entry {
 	struct pmt_callbacks	*cb;
 	unsigned long		base_addr;
 	size_t			size;
+	u64			feature_flags;
 	u32			guid;
 	u32			num_rmids; /* Number of Resource Monitoring IDs */
 	int			devid;
@@ -71,4 +72,10 @@ int intel_pmt_dev_create(struct intel_pmt_entry *entry,
 			 struct intel_vsec_device *dev, int idx);
 void intel_pmt_dev_destroy(struct intel_pmt_entry *entry,
 			   struct intel_pmt_namespace *ns);
+#if IS_ENABLED(CONFIG_INTEL_PMT_DISCOVERY)
+void intel_pmt_get_features(struct intel_pmt_entry *entry);
+#else
+static inline void intel_pmt_get_features(struct intel_pmt_entry *entry) {}
+#endif
+
 #endif
diff --git a/drivers/platform/x86/intel/pmt/discovery.c b/drivers/platform/x86/intel/pmt/discovery.c
index 4b4fa3137ad2..e72d43b675b4 100644
--- a/drivers/platform/x86/intel/pmt/discovery.c
+++ b/drivers/platform/x86/intel/pmt/discovery.c
@@ -583,6 +583,39 @@ abort_probe:
 	return ret;
 }
 
+static void pmt_get_features(struct intel_pmt_entry *entry, struct feature *f)
+{
+	int num_guids = f->table.header.num_guids;
+	int i;
+
+	for (i = 0; i < num_guids; i++) {
+		if (f->table.guids[i] != entry->guid)
+			continue;
+
+		entry->feature_flags |= BIT(f->id);
+
+		if (feature_layout[f->id] == LAYOUT_RMID)
+			entry->num_rmids = f->table.rmid.num_rmids;
+		else
+			entry->num_rmids = 0; /* entry is kzalloc but set anyway */
+	}
+}
+
+void intel_pmt_get_features(struct intel_pmt_entry *entry)
+{
+	struct feature *feature;
+
+	mutex_lock(&feature_list_lock);
+	list_for_each_entry(feature, &pmt_feature_list, list) {
+		if (feature->priv->parent != &entry->ep->pcidev->dev)
+			continue;
+
+		pmt_get_features(entry, feature);
+	}
+	mutex_unlock(&feature_list_lock);
+}
+EXPORT_SYMBOL_NS_GPL(intel_pmt_get_features, "INTEL_PMT");
+
 static const struct auxiliary_device_id pmt_features_id_table[] = {
 	{ .name = "intel_vsec.discovery" },
 	{}
diff --git a/drivers/platform/x86/intel/pmt/telemetry.c b/drivers/platform/x86/intel/pmt/telemetry.c
index ac3a9bdf5601..58d06749e417 100644
--- a/drivers/platform/x86/intel/pmt/telemetry.c
+++ b/drivers/platform/x86/intel/pmt/telemetry.c
@@ -9,11 +9,14 @@
  */
 
 #include <linux/auxiliary_bus.h>
+#include <linux/intel_pmt_features.h>
 #include <linux/intel_vsec.h>
 #include <linux/kernel.h>
+#include <linux/kref.h>
 #include <linux/module.h>
 #include <linux/pci.h>
 #include <linux/slab.h>
+#include <linux/types.h>
 #include <linux/uaccess.h>
 #include <linux/overflow.h>
 
@@ -311,6 +314,8 @@ static int pmt_telem_probe(struct auxiliary_device *auxdev, const struct auxilia
 			continue;
 
 		priv->num_entries++;
+
+		intel_pmt_get_features(entry);
 	}
 
 	return 0;
diff --git a/include/linux/intel_vsec.h b/include/linux/intel_vsec.h
index 4bd0c6e7857c..f185e9c01c90 100644
--- a/include/linux/intel_vsec.h
+++ b/include/linux/intel_vsec.h
@@ -4,6 +4,7 @@
 
 #include <linux/auxiliary_bus.h>
 #include <linux/bits.h>
+#include <linux/intel_pmt_features.h>
 
 /*
  * VSEC_CAP_UNUSED is reserved. It exists to prevent zero initialized
@@ -166,6 +167,21 @@ struct oobmsm_plat_info {
 	u8 function_number;
 };
 
+struct telemetry_region {
+	struct oobmsm_plat_info	plat_info;
+	void __iomem		*addr;
+	size_t			size;
+	u32			guid;
+	u32			num_rmids;
+};
+
+struct pmt_feature_group {
+	enum pmt_feature_id	id;
+	int			count;
+	struct kref		kref;
+	struct telemetry_region	regions[];
+};
+
 int intel_vsec_add_aux(struct pci_dev *pdev, struct device *parent,
 		       struct intel_vsec_device *intel_vsec_dev,
 		       const char *name);
-- 
cgit v1.2.3


From 42dabe5442887946b16e64c6ebe91d2671a96fbb Mon Sep 17 00:00:00 2001
From: "David E. Box" <david.e.box@linux.intel.com>
Date: Wed, 2 Jul 2025 19:28:29 -0700
Subject: platform/x86/intel/pmt/telemetry: Add API to retrieve telemetry
 regions by feature
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce a new API, intel_pmt_get_regions_by_feature(), that gathers
telemetry regions based on a provided capability flag. This API enables
retrieval of regions with various capabilities (for example, RMID-based
telemetry) and provides a unified interface for accessing them. Resource
management is handled via reference counting using
intel_pmt_put_feature_group().

Signed-off-by: David E. Box <david.e.box@linux.intel.com>
Link: https://lore.kernel.org/r/20250703022832.1302928-15-david.e.box@linux.intel.com
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 drivers/platform/x86/intel/pmt/telemetry.c | 89 +++++++++++++++++++++++++++++-
 include/linux/intel_vsec.h                 | 18 ++++++
 2 files changed, 106 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/platform/x86/intel/pmt/telemetry.c b/drivers/platform/x86/intel/pmt/telemetry.c
index 58d06749e417..a4dfca6cac19 100644
--- a/drivers/platform/x86/intel/pmt/telemetry.c
+++ b/drivers/platform/x86/intel/pmt/telemetry.c
@@ -9,16 +9,21 @@
  */
 
 #include <linux/auxiliary_bus.h>
+#include <linux/bitops.h>
+#include <linux/cleanup.h>
+#include <linux/err.h>
 #include <linux/intel_pmt_features.h>
 #include <linux/intel_vsec.h>
 #include <linux/kernel.h>
 #include <linux/kref.h>
 #include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/overflow.h>
 #include <linux/pci.h>
 #include <linux/slab.h>
 #include <linux/types.h>
 #include <linux/uaccess.h>
-#include <linux/overflow.h>
+#include <linux/xarray.h>
 
 #include "class.h"
 
@@ -209,6 +214,87 @@ unlock:
 }
 EXPORT_SYMBOL_NS_GPL(pmt_telem_get_endpoint_info, "INTEL_PMT_TELEMETRY");
 
+static int pmt_copy_region(struct telemetry_region *region,
+			   struct intel_pmt_entry *entry)
+{
+
+	struct oobmsm_plat_info *plat_info;
+
+	plat_info = intel_vsec_get_mapping(entry->ep->pcidev);
+	if (IS_ERR(plat_info))
+		return PTR_ERR(plat_info);
+
+	region->plat_info = *plat_info;
+	region->guid = entry->guid;
+	region->addr = entry->ep->base;
+	region->size = entry->size;
+	region->num_rmids = entry->num_rmids;
+
+	return 0;
+}
+
+static void pmt_feature_group_release(struct kref *kref)
+{
+	struct pmt_feature_group *feature_group;
+
+	feature_group = container_of(kref, struct pmt_feature_group, kref);
+	kfree(feature_group);
+}
+
+struct pmt_feature_group *intel_pmt_get_regions_by_feature(enum pmt_feature_id id)
+{
+	struct pmt_feature_group *feature_group __free(kfree) = NULL;
+	struct telemetry_region *region;
+	struct intel_pmt_entry *entry;
+	unsigned long idx;
+	int count = 0;
+	size_t size;
+
+	if (!pmt_feature_id_is_valid(id))
+		return ERR_PTR(-EINVAL);
+
+	guard(mutex)(&ep_lock);
+	xa_for_each(&telem_array, idx, entry) {
+		if (entry->feature_flags & BIT(id))
+			count++;
+	}
+
+	if (!count)
+		return ERR_PTR(-ENOENT);
+
+	size = struct_size(feature_group, regions, count);
+	feature_group = kzalloc(size, GFP_KERNEL);
+	if (!feature_group)
+		return ERR_PTR(-ENOMEM);
+
+	feature_group->count = count;
+
+	region = feature_group->regions;
+	xa_for_each(&telem_array, idx, entry) {
+		int ret;
+
+		if (!(entry->feature_flags & BIT(id)))
+			continue;
+
+		ret = pmt_copy_region(region, entry);
+		if (ret)
+			return ERR_PTR(ret);
+
+		region++;
+	}
+
+	kref_init(&feature_group->kref);
+
+	return no_free_ptr(feature_group);
+}
+EXPORT_SYMBOL(intel_pmt_get_regions_by_feature);
+
+void intel_pmt_put_feature_group(struct pmt_feature_group *feature_group)
+{
+	kref_put(&feature_group->kref, pmt_feature_group_release);
+}
+EXPORT_SYMBOL(intel_pmt_put_feature_group);
+
 int pmt_telem_read(struct telem_endpoint *ep, u32 id, u64 *data, u32 count)
 {
 	u32 offset, size;
@@ -353,3 +439,4 @@ MODULE_AUTHOR("David E. Box <david.e.box@linux.intel.com>");
 MODULE_DESCRIPTION("Intel PMT Telemetry driver");
 MODULE_LICENSE("GPL v2");
 MODULE_IMPORT_NS("INTEL_PMT");
+MODULE_IMPORT_NS("INTEL_VSEC");
diff --git a/include/linux/intel_vsec.h b/include/linux/intel_vsec.h
index f185e9c01c90..53f6fe88e369 100644
--- a/include/linux/intel_vsec.h
+++ b/include/linux/intel_vsec.h
@@ -4,6 +4,7 @@
 
 #include <linux/auxiliary_bus.h>
 #include <linux/bits.h>
+#include <linux/err.h>
 #include <linux/intel_pmt_features.h>
 
 /*
@@ -218,4 +219,21 @@ static inline struct oobmsm_plat_info *intel_vsec_get_mapping(struct pci_dev *pd
 	return ERR_PTR(-ENODEV);
 }
 #endif
+
+#if IS_ENABLED(CONFIG_INTEL_PMT_TELEMETRY)
+struct pmt_feature_group *
+intel_pmt_get_regions_by_feature(enum pmt_feature_id id);
+
+void intel_pmt_put_feature_group(struct pmt_feature_group *feature_group);
+#else
+static inline struct pmt_feature_group *
+intel_pmt_get_regions_by_feature(enum pmt_feature_id id)
+{
+	return ERR_PTR(-ENODEV);
+}
+
+static inline void
+intel_pmt_put_feature_group(struct pmt_feature_group *feature_group) {}
+#endif
+
 #endif
-- 
cgit v1.2.3


From fd72f265bb00d2dd2a3bbad7ec45520025e3a926 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 22 May 2025 16:52:23 +0200
Subject: netfilter: conntrack: remove DCCP protocol support

The DCCP socket family has now been removed from this tree, see:

  8bb3212be4b4 ("Merge branch 'net-retire-dccp-socket'")

Remove connection tracking and NAT support for this protocol, this
should not pose a problem because no DCCP traffic is expected to be seen
on the wire.

As for the code for matching on dccp header for iptables and nftables,
mark it as deprecated and keep it in place. Ruleset restoration is an
atomic operation. Without dccp matching support, a stray match on dccp
could break this operation leaving your computer with no policy in
place, so let's follow a more conservative approach for matches.

Add CONFIG_NFT_EXTHDR_DCCP which is set to 'n' by default to deprecate
dccp extension support. Similarly, label CONFIG_NETFILTER_XT_MATCH_DCCP
as deprecated too and also set it to 'n' by default.

Code to match on DCCP protocol from ebtables also remains in place, this
is just a few checks on IPPROTO_DCCP from _check() path which is
exercised when ruleset is loaded. There is another use of IPPROTO_DCCP
from the _check() path in the iptables multiport match. Another check
for IPPROTO_DCCP from the packet in the reject target is also removed.

So let's schedule removal of the dccp matching for a second stage, this
should not interfere with the dccp retirement since this is only matching
on the dccp header.

Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 Documentation/networking/nf_conntrack-sysctl.rst |   1 -
 arch/arm/configs/omap2plus_defconfig             |   1 -
 arch/loongarch/configs/loongson3_defconfig       |   1 -
 arch/m68k/configs/amiga_defconfig                |   1 -
 arch/m68k/configs/apollo_defconfig               |   1 -
 arch/m68k/configs/atari_defconfig                |   1 -
 arch/m68k/configs/bvme6000_defconfig             |   1 -
 arch/m68k/configs/hp300_defconfig                |   1 -
 arch/m68k/configs/mac_defconfig                  |   1 -
 arch/m68k/configs/multi_defconfig                |   1 -
 arch/m68k/configs/mvme147_defconfig              |   1 -
 arch/m68k/configs/mvme16x_defconfig              |   1 -
 arch/m68k/configs/q40_defconfig                  |   1 -
 arch/m68k/configs/sun3_defconfig                 |   1 -
 arch/m68k/configs/sun3x_defconfig                |   1 -
 arch/mips/configs/fuloong2e_defconfig            |   1 -
 arch/mips/configs/ip22_defconfig                 |   1 -
 arch/mips/configs/loongson2k_defconfig           |   1 -
 arch/mips/configs/loongson3_defconfig            |   1 -
 arch/mips/configs/malta_defconfig                |   1 -
 arch/mips/configs/malta_kvm_defconfig            |   1 -
 arch/mips/configs/maltaup_xpa_defconfig          |   1 -
 arch/mips/configs/rb532_defconfig                |   1 -
 arch/mips/configs/rm200_defconfig                |   1 -
 arch/powerpc/configs/cell_defconfig              |   1 -
 arch/s390/configs/debug_defconfig                |   1 -
 arch/s390/configs/defconfig                      |   1 -
 arch/sh/configs/titan_defconfig                  |   1 -
 include/linux/netfilter/nf_conntrack_dccp.h      |  38 --
 include/net/netfilter/ipv4/nf_conntrack_ipv4.h   |   3 -
 include/net/netfilter/nf_conntrack.h             |   2 -
 include/net/netfilter/nf_conntrack_l4proto.h     |  13 -
 include/net/netfilter/nf_reject.h                |   1 -
 include/net/netns/conntrack.h                    |  13 -
 net/netfilter/Kconfig                            |  20 +-
 net/netfilter/Makefile                           |   1 -
 net/netfilter/nf_conntrack_core.c                |   8 -
 net/netfilter/nf_conntrack_netlink.c             |   1 -
 net/netfilter/nf_conntrack_proto.c               |   6 -
 net/netfilter/nf_conntrack_proto_dccp.c          | 826 -----------------------
 net/netfilter/nf_conntrack_standalone.c          |  92 ---
 net/netfilter/nf_nat_core.c                      |   6 -
 net/netfilter/nf_nat_proto.c                     |  43 --
 net/netfilter/nfnetlink_cttimeout.c              |   5 -
 net/netfilter/nft_exthdr.c                       |   8 +
 45 files changed, 16 insertions(+), 1098 deletions(-)
 delete mode 100644 include/linux/netfilter/nf_conntrack_dccp.h
 delete mode 100644 net/netfilter/nf_conntrack_proto_dccp.c

(limited to 'include')

diff --git a/Documentation/networking/nf_conntrack-sysctl.rst b/Documentation/networking/nf_conntrack-sysctl.rst
index 238b66d0e059..35f889259fcd 100644
--- a/Documentation/networking/nf_conntrack-sysctl.rst
+++ b/Documentation/networking/nf_conntrack-sysctl.rst
@@ -85,7 +85,6 @@ nf_conntrack_log_invalid - INTEGER
 	- 1   - log ICMP packets
 	- 6   - log TCP packets
 	- 17  - log UDP packets
-	- 33  - log DCCP packets
 	- 41  - log ICMPv6 packets
 	- 136 - log UDPLITE packets
 	- 255 - log packets of any protocol
diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig
index 9f9780c8e62a..fee43d156622 100644
--- a/arch/arm/configs/omap2plus_defconfig
+++ b/arch/arm/configs/omap2plus_defconfig
@@ -142,7 +142,6 @@ CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
 CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
 CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
 CONFIG_NETFILTER_XT_MATCH_CPU=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
 CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m
 CONFIG_NETFILTER_XT_MATCH_DSCP=m
 CONFIG_NETFILTER_XT_MATCH_ESP=m
diff --git a/arch/loongarch/configs/loongson3_defconfig b/arch/loongarch/configs/loongson3_defconfig
index 0d59af6007b7..68e337aed2bb 100644
--- a/arch/loongarch/configs/loongson3_defconfig
+++ b/arch/loongarch/configs/loongson3_defconfig
@@ -225,7 +225,6 @@ CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
 CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
 CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
 CONFIG_NETFILTER_XT_MATCH_CPU=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
 CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m
 CONFIG_NETFILTER_XT_MATCH_DSCP=m
 CONFIG_NETFILTER_XT_MATCH_ESP=m
diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig
index d05690289e33..83eab331872f 100644
--- a/arch/m68k/configs/amiga_defconfig
+++ b/arch/m68k/configs/amiga_defconfig
@@ -85,7 +85,6 @@ CONFIG_NETFILTER=y
 CONFIG_NETFILTER_NETLINK_HOOK=m
 CONFIG_NF_CONNTRACK=m
 CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CT_PROTO_DCCP is not set
 CONFIG_NF_CONNTRACK_AMANDA=m
 CONFIG_NF_CONNTRACK_FTP=m
 CONFIG_NF_CONNTRACK_H323=m
diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig
index a1747fbe23fb..0e5de7edd544 100644
--- a/arch/m68k/configs/apollo_defconfig
+++ b/arch/m68k/configs/apollo_defconfig
@@ -81,7 +81,6 @@ CONFIG_NETFILTER=y
 CONFIG_NETFILTER_NETLINK_HOOK=m
 CONFIG_NF_CONNTRACK=m
 CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CT_PROTO_DCCP is not set
 CONFIG_NF_CONNTRACK_AMANDA=m
 CONFIG_NF_CONNTRACK_FTP=m
 CONFIG_NF_CONNTRACK_H323=m
diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig
index 74293551f66b..35fc466095f4 100644
--- a/arch/m68k/configs/atari_defconfig
+++ b/arch/m68k/configs/atari_defconfig
@@ -88,7 +88,6 @@ CONFIG_NETFILTER=y
 CONFIG_NETFILTER_NETLINK_HOOK=m
 CONFIG_NF_CONNTRACK=m
 CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CT_PROTO_DCCP is not set
 CONFIG_NF_CONNTRACK_AMANDA=m
 CONFIG_NF_CONNTRACK_FTP=m
 CONFIG_NF_CONNTRACK_H323=m
diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig
index 419b13ae950a..53b7844cf301 100644
--- a/arch/m68k/configs/bvme6000_defconfig
+++ b/arch/m68k/configs/bvme6000_defconfig
@@ -78,7 +78,6 @@ CONFIG_NETFILTER=y
 CONFIG_NETFILTER_NETLINK_HOOK=m
 CONFIG_NF_CONNTRACK=m
 CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CT_PROTO_DCCP is not set
 CONFIG_NF_CONNTRACK_AMANDA=m
 CONFIG_NF_CONNTRACK_FTP=m
 CONFIG_NF_CONNTRACK_H323=m
diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig
index 4c81d756587c..560fdf3ed106 100644
--- a/arch/m68k/configs/hp300_defconfig
+++ b/arch/m68k/configs/hp300_defconfig
@@ -80,7 +80,6 @@ CONFIG_NETFILTER=y
 CONFIG_NETFILTER_NETLINK_HOOK=m
 CONFIG_NF_CONNTRACK=m
 CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CT_PROTO_DCCP is not set
 CONFIG_NF_CONNTRACK_AMANDA=m
 CONFIG_NF_CONNTRACK_FTP=m
 CONFIG_NF_CONNTRACK_H323=m
diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig
index daa01d7fb462..2e28e54b52f8 100644
--- a/arch/m68k/configs/mac_defconfig
+++ b/arch/m68k/configs/mac_defconfig
@@ -79,7 +79,6 @@ CONFIG_NETFILTER=y
 CONFIG_NETFILTER_NETLINK_HOOK=m
 CONFIG_NF_CONNTRACK=m
 CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CT_PROTO_DCCP is not set
 CONFIG_NF_CONNTRACK_AMANDA=m
 CONFIG_NF_CONNTRACK_FTP=m
 CONFIG_NF_CONNTRACK_H323=m
diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig
index 641ca22eb3b2..f5f6b8e65c26 100644
--- a/arch/m68k/configs/multi_defconfig
+++ b/arch/m68k/configs/multi_defconfig
@@ -99,7 +99,6 @@ CONFIG_NETFILTER=y
 CONFIG_NETFILTER_NETLINK_HOOK=m
 CONFIG_NF_CONNTRACK=m
 CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CT_PROTO_DCCP is not set
 CONFIG_NF_CONNTRACK_AMANDA=m
 CONFIG_NF_CONNTRACK_FTP=m
 CONFIG_NF_CONNTRACK_H323=m
diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig
index f98ffa7a1640..36bbf98d6aa4 100644
--- a/arch/m68k/configs/mvme147_defconfig
+++ b/arch/m68k/configs/mvme147_defconfig
@@ -77,7 +77,6 @@ CONFIG_NETFILTER=y
 CONFIG_NETFILTER_NETLINK_HOOK=m
 CONFIG_NF_CONNTRACK=m
 CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CT_PROTO_DCCP is not set
 CONFIG_NF_CONNTRACK_AMANDA=m
 CONFIG_NF_CONNTRACK_FTP=m
 CONFIG_NF_CONNTRACK_H323=m
diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig
index 2bfc3f4b48f9..e247bff8f1a4 100644
--- a/arch/m68k/configs/mvme16x_defconfig
+++ b/arch/m68k/configs/mvme16x_defconfig
@@ -78,7 +78,6 @@ CONFIG_NETFILTER=y
 CONFIG_NETFILTER_NETLINK_HOOK=m
 CONFIG_NF_CONNTRACK=m
 CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CT_PROTO_DCCP is not set
 CONFIG_NF_CONNTRACK_AMANDA=m
 CONFIG_NF_CONNTRACK_FTP=m
 CONFIG_NF_CONNTRACK_H323=m
diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig
index 2bd46cbcca2a..27aa4eb5d3f4 100644
--- a/arch/m68k/configs/q40_defconfig
+++ b/arch/m68k/configs/q40_defconfig
@@ -79,7 +79,6 @@ CONFIG_NETFILTER=y
 CONFIG_NETFILTER_NETLINK_HOOK=m
 CONFIG_NF_CONNTRACK=m
 CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CT_PROTO_DCCP is not set
 CONFIG_NF_CONNTRACK_AMANDA=m
 CONFIG_NF_CONNTRACK_FTP=m
 CONFIG_NF_CONNTRACK_H323=m
diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig
index dc7fc94fc669..b338f2043d97 100644
--- a/arch/m68k/configs/sun3_defconfig
+++ b/arch/m68k/configs/sun3_defconfig
@@ -74,7 +74,6 @@ CONFIG_NETFILTER=y
 CONFIG_NETFILTER_NETLINK_HOOK=m
 CONFIG_NF_CONNTRACK=m
 CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CT_PROTO_DCCP is not set
 CONFIG_NF_CONNTRACK_AMANDA=m
 CONFIG_NF_CONNTRACK_FTP=m
 CONFIG_NF_CONNTRACK_H323=m
diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig
index b026a54867f5..87ee47da4e31 100644
--- a/arch/m68k/configs/sun3x_defconfig
+++ b/arch/m68k/configs/sun3x_defconfig
@@ -75,7 +75,6 @@ CONFIG_NETFILTER=y
 CONFIG_NETFILTER_NETLINK_HOOK=m
 CONFIG_NF_CONNTRACK=m
 CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CT_PROTO_DCCP is not set
 CONFIG_NF_CONNTRACK_AMANDA=m
 CONFIG_NF_CONNTRACK_FTP=m
 CONFIG_NF_CONNTRACK_H323=m
diff --git a/arch/mips/configs/fuloong2e_defconfig b/arch/mips/configs/fuloong2e_defconfig
index 114fcd67898d..cdedbb8a8f53 100644
--- a/arch/mips/configs/fuloong2e_defconfig
+++ b/arch/mips/configs/fuloong2e_defconfig
@@ -44,7 +44,6 @@ CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
 CONFIG_NETFILTER_XT_TARGET_TRACE=m
 CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m
 CONFIG_NETFILTER_XT_MATCH_COMMENT=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
 CONFIG_NETFILTER_XT_MATCH_ESP=m
 CONFIG_NETFILTER_XT_MATCH_IPRANGE=m
 CONFIG_NETFILTER_XT_MATCH_LENGTH=m
diff --git a/arch/mips/configs/ip22_defconfig b/arch/mips/configs/ip22_defconfig
index f1a8ccf2c459..2decf8b98d31 100644
--- a/arch/mips/configs/ip22_defconfig
+++ b/arch/mips/configs/ip22_defconfig
@@ -79,7 +79,6 @@ CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m
 CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
 CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
 CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
 CONFIG_NETFILTER_XT_MATCH_DSCP=m
 CONFIG_NETFILTER_XT_MATCH_ESP=m
 CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m
diff --git a/arch/mips/configs/loongson2k_defconfig b/arch/mips/configs/loongson2k_defconfig
index 4b7f914d01d0..6aea6a5b1b66 100644
--- a/arch/mips/configs/loongson2k_defconfig
+++ b/arch/mips/configs/loongson2k_defconfig
@@ -52,7 +52,6 @@ CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
 CONFIG_NETFILTER_XT_TARGET_MARK=m
 CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
 CONFIG_NETFILTER_XT_MATCH_COMMENT=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
 CONFIG_NETFILTER_XT_MATCH_ESP=m
 CONFIG_NETFILTER_XT_MATCH_LENGTH=m
 CONFIG_NETFILTER_XT_MATCH_LIMIT=m
diff --git a/arch/mips/configs/loongson3_defconfig b/arch/mips/configs/loongson3_defconfig
index 98844b457b7f..43a72c410538 100644
--- a/arch/mips/configs/loongson3_defconfig
+++ b/arch/mips/configs/loongson3_defconfig
@@ -72,7 +72,6 @@ CONFIG_NETFILTER_XT_TARGET_MARK=m
 CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
 CONFIG_NETFILTER_XT_MATCH_COMMENT=m
 CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
 CONFIG_NETFILTER_XT_MATCH_ESP=m
 CONFIG_NETFILTER_XT_MATCH_LENGTH=m
 CONFIG_NETFILTER_XT_MATCH_LIMIT=m
diff --git a/arch/mips/configs/malta_defconfig b/arch/mips/configs/malta_defconfig
index 869a14b3184f..9fcbac829920 100644
--- a/arch/mips/configs/malta_defconfig
+++ b/arch/mips/configs/malta_defconfig
@@ -80,7 +80,6 @@ CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m
 CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
 CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
 CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
 CONFIG_NETFILTER_XT_MATCH_ESP=m
 CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m
 CONFIG_NETFILTER_XT_MATCH_HELPER=m
diff --git a/arch/mips/configs/malta_kvm_defconfig b/arch/mips/configs/malta_kvm_defconfig
index 41e1fea303ea..19102386a81c 100644
--- a/arch/mips/configs/malta_kvm_defconfig
+++ b/arch/mips/configs/malta_kvm_defconfig
@@ -84,7 +84,6 @@ CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m
 CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
 CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
 CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
 CONFIG_NETFILTER_XT_MATCH_ESP=m
 CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m
 CONFIG_NETFILTER_XT_MATCH_HELPER=m
diff --git a/arch/mips/configs/maltaup_xpa_defconfig b/arch/mips/configs/maltaup_xpa_defconfig
index 13ff1877e26e..1dd07c9d1812 100644
--- a/arch/mips/configs/maltaup_xpa_defconfig
+++ b/arch/mips/configs/maltaup_xpa_defconfig
@@ -82,7 +82,6 @@ CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m
 CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
 CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
 CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
 CONFIG_NETFILTER_XT_MATCH_ESP=m
 CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m
 CONFIG_NETFILTER_XT_MATCH_HELPER=m
diff --git a/arch/mips/configs/rb532_defconfig b/arch/mips/configs/rb532_defconfig
index 9fb114ef5e2d..30d18b084cda 100644
--- a/arch/mips/configs/rb532_defconfig
+++ b/arch/mips/configs/rb532_defconfig
@@ -56,7 +56,6 @@ CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
 CONFIG_NETFILTER_XT_TARGET_TRACE=m
 CONFIG_NETFILTER_XT_MATCH_COMMENT=m
 CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
 CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m
 CONFIG_NETFILTER_XT_MATCH_LIMIT=y
 CONFIG_NETFILTER_XT_MATCH_MULTIPORT=y
diff --git a/arch/mips/configs/rm200_defconfig b/arch/mips/configs/rm200_defconfig
index 7b5a5591ccc9..39a2419e1f3e 100644
--- a/arch/mips/configs/rm200_defconfig
+++ b/arch/mips/configs/rm200_defconfig
@@ -64,7 +64,6 @@ CONFIG_NETFILTER_XT_MATCH_COMMENT=m
 CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m
 CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
 CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
 CONFIG_NETFILTER_XT_MATCH_DSCP=m
 CONFIG_NETFILTER_XT_MATCH_ESP=m
 CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m
diff --git a/arch/powerpc/configs/cell_defconfig b/arch/powerpc/configs/cell_defconfig
index 3347192b77b8..7a31b52e92e1 100644
--- a/arch/powerpc/configs/cell_defconfig
+++ b/arch/powerpc/configs/cell_defconfig
@@ -62,7 +62,6 @@ CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
 CONFIG_NETFILTER_XT_TARGET_TCPMSS=m
 CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m
 CONFIG_NETFILTER_XT_MATCH_COMMENT=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
 CONFIG_NETFILTER_XT_MATCH_DSCP=m
 CONFIG_NETFILTER_XT_MATCH_ESP=m
 CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m
diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig
index 8ecad727497e..0808a3718298 100644
--- a/arch/s390/configs/debug_defconfig
+++ b/arch/s390/configs/debug_defconfig
@@ -248,7 +248,6 @@ CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
 CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
 CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
 CONFIG_NETFILTER_XT_MATCH_CPU=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
 CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m
 CONFIG_NETFILTER_XT_MATCH_DSCP=m
 CONFIG_NETFILTER_XT_MATCH_ESP=m
diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig
index c13a77765162..6118e3105adb 100644
--- a/arch/s390/configs/defconfig
+++ b/arch/s390/configs/defconfig
@@ -239,7 +239,6 @@ CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m
 CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
 CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
 CONFIG_NETFILTER_XT_MATCH_CPU=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
 CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m
 CONFIG_NETFILTER_XT_MATCH_DSCP=m
 CONFIG_NETFILTER_XT_MATCH_ESP=m
diff --git a/arch/sh/configs/titan_defconfig b/arch/sh/configs/titan_defconfig
index f022ada363b5..8ef72b8dbcd3 100644
--- a/arch/sh/configs/titan_defconfig
+++ b/arch/sh/configs/titan_defconfig
@@ -61,7 +61,6 @@ CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
 CONFIG_NETFILTER_XT_TARGET_MARK=m
 CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
 CONFIG_NETFILTER_XT_MATCH_COMMENT=m
-CONFIG_NETFILTER_XT_MATCH_DCCP=m
 CONFIG_NETFILTER_XT_MATCH_ESP=m
 CONFIG_NETFILTER_XT_MATCH_LENGTH=m
 CONFIG_NETFILTER_XT_MATCH_LIMIT=m
diff --git a/include/linux/netfilter/nf_conntrack_dccp.h b/include/linux/netfilter/nf_conntrack_dccp.h
deleted file mode 100644
index c509ed76e714..000000000000
--- a/include/linux/netfilter/nf_conntrack_dccp.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _NF_CONNTRACK_DCCP_H
-#define _NF_CONNTRACK_DCCP_H
-
-/* Exposed to userspace over nfnetlink */
-enum ct_dccp_states {
-	CT_DCCP_NONE,
-	CT_DCCP_REQUEST,
-	CT_DCCP_RESPOND,
-	CT_DCCP_PARTOPEN,
-	CT_DCCP_OPEN,
-	CT_DCCP_CLOSEREQ,
-	CT_DCCP_CLOSING,
-	CT_DCCP_TIMEWAIT,
-	CT_DCCP_IGNORE,
-	CT_DCCP_INVALID,
-	__CT_DCCP_MAX
-};
-#define CT_DCCP_MAX		(__CT_DCCP_MAX - 1)
-
-enum ct_dccp_roles {
-	CT_DCCP_ROLE_CLIENT,
-	CT_DCCP_ROLE_SERVER,
-	__CT_DCCP_ROLE_MAX
-};
-#define CT_DCCP_ROLE_MAX	(__CT_DCCP_ROLE_MAX - 1)
-
-#include <linux/netfilter/nf_conntrack_tuple_common.h>
-
-struct nf_ct_dccp {
-	u_int8_t	role[IP_CT_DIR_MAX];
-	u_int8_t	state;
-	u_int8_t	last_pkt;
-	u_int8_t	last_dir;
-	u_int64_t	handshake_seq;
-};
-
-#endif /* _NF_CONNTRACK_DCCP_H */
diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
index 2c8c2b023848..8d65ffbf57de 100644
--- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
+++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
@@ -13,9 +13,6 @@
 extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp;
 extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp;
 extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp;
-#ifdef CONFIG_NF_CT_PROTO_DCCP
-extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp;
-#endif
 #ifdef CONFIG_NF_CT_PROTO_SCTP
 extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp;
 #endif
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 3f02a45773e8..a844aa46d076 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -18,7 +18,6 @@
 
 #include <linux/netfilter/nf_conntrack_common.h>
 #include <linux/netfilter/nf_conntrack_tcp.h>
-#include <linux/netfilter/nf_conntrack_dccp.h>
 #include <linux/netfilter/nf_conntrack_sctp.h>
 #include <linux/netfilter/nf_conntrack_proto_gre.h>
 
@@ -31,7 +30,6 @@ struct nf_ct_udp {
 /* per conntrack: protocol private data */
 union nf_conntrack_proto {
 	/* insert conntrack proto private data here */
-	struct nf_ct_dccp dccp;
 	struct ip_ct_sctp sctp;
 	struct ip_ct_tcp tcp;
 	struct nf_ct_udp udp;
diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
index 1f47bef51722..6929f8daf1ed 100644
--- a/include/net/netfilter/nf_conntrack_l4proto.h
+++ b/include/net/netfilter/nf_conntrack_l4proto.h
@@ -117,11 +117,6 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
 			    unsigned int dataoff,
 			    enum ip_conntrack_info ctinfo,
 			    const struct nf_hook_state *state);
-int nf_conntrack_dccp_packet(struct nf_conn *ct,
-			     struct sk_buff *skb,
-			     unsigned int dataoff,
-			     enum ip_conntrack_info ctinfo,
-			     const struct nf_hook_state *state);
 int nf_conntrack_sctp_packet(struct nf_conn *ct,
 			     struct sk_buff *skb,
 			     unsigned int dataoff,
@@ -137,7 +132,6 @@ void nf_conntrack_generic_init_net(struct net *net);
 void nf_conntrack_tcp_init_net(struct net *net);
 void nf_conntrack_udp_init_net(struct net *net);
 void nf_conntrack_gre_init_net(struct net *net);
-void nf_conntrack_dccp_init_net(struct net *net);
 void nf_conntrack_sctp_init_net(struct net *net);
 void nf_conntrack_icmp_init_net(struct net *net);
 void nf_conntrack_icmpv6_init_net(struct net *net);
@@ -223,13 +217,6 @@ static inline bool nf_conntrack_tcp_established(const struct nf_conn *ct)
 }
 #endif
 
-#ifdef CONFIG_NF_CT_PROTO_DCCP
-static inline struct nf_dccp_net *nf_dccp_pernet(struct net *net)
-{
-	return &net->ct.nf_ct_proto.dccp;
-}
-#endif
-
 #ifdef CONFIG_NF_CT_PROTO_SCTP
 static inline struct nf_sctp_net *nf_sctp_pernet(struct net *net)
 {
diff --git a/include/net/netfilter/nf_reject.h b/include/net/netfilter/nf_reject.h
index 7c669792fb9c..f1db33bc6bf8 100644
--- a/include/net/netfilter/nf_reject.h
+++ b/include/net/netfilter/nf_reject.h
@@ -34,7 +34,6 @@ static inline bool nf_reject_verify_csum(struct sk_buff *skb, int dataoff,
 
 		/* Protocols with partial checksums. */
 		case IPPROTO_UDPLITE:
-		case IPPROTO_DCCP:
 			return false;
 	}
 	return true;
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index bae914815aa3..ab74b5ed0b01 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -7,9 +7,6 @@
 #include <linux/atomic.h>
 #include <linux/workqueue.h>
 #include <linux/netfilter/nf_conntrack_tcp.h>
-#ifdef CONFIG_NF_CT_PROTO_DCCP
-#include <linux/netfilter/nf_conntrack_dccp.h>
-#endif
 #ifdef CONFIG_NF_CT_PROTO_SCTP
 #include <linux/netfilter/nf_conntrack_sctp.h>
 #endif
@@ -50,13 +47,6 @@ struct nf_icmp_net {
 	unsigned int timeout;
 };
 
-#ifdef CONFIG_NF_CT_PROTO_DCCP
-struct nf_dccp_net {
-	u8 dccp_loose;
-	unsigned int dccp_timeout[CT_DCCP_MAX + 1];
-};
-#endif
-
 #ifdef CONFIG_NF_CT_PROTO_SCTP
 struct nf_sctp_net {
 	unsigned int timeouts[SCTP_CONNTRACK_MAX];
@@ -82,9 +72,6 @@ struct nf_ip_net {
 	struct nf_udp_net	udp;
 	struct nf_icmp_net	icmp;
 	struct nf_icmp_net	icmpv6;
-#ifdef CONFIG_NF_CT_PROTO_DCCP
-	struct nf_dccp_net	dccp;
-#endif
 #ifdef CONFIG_NF_CT_PROTO_SCTP
 	struct nf_sctp_net	sctp;
 #endif
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 2560416218d0..ba60b48d7567 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -195,16 +195,6 @@ config NF_CONNTRACK_LABELS
 config NF_CONNTRACK_OVS
 	bool
 
-config NF_CT_PROTO_DCCP
-	bool 'DCCP protocol connection tracking support'
-	depends on NETFILTER_ADVANCED
-	default y
-	help
-	  With this option enabled, the layer 3 independent connection
-	  tracking code will be able to do state tracking on DCCP connections.
-
-	  If unsure, say Y.
-
 config NF_CT_PROTO_GRE
 	bool
 
@@ -516,6 +506,12 @@ config NFT_CT
 	  This option adds the "ct" expression that you can use to match
 	  connection tracking information such as the flow state.
 
+config NFT_EXTHDR_DCCP
+	bool "Netfilter nf_tables exthdr DCCP support (DEPRECATED)"
+	default n
+	help
+	  This option adds support for matching on DCCP extension headers.
+
 config NFT_FLOW_OFFLOAD
 	depends on NF_CONNTRACK && NF_FLOW_TABLE
 	tristate "Netfilter nf_tables hardware flow offload module"
@@ -1278,9 +1274,9 @@ config NETFILTER_XT_MATCH_CPU
 	  To compile it as a module, choose M here.  If unsure, say N.
 
 config NETFILTER_XT_MATCH_DCCP
-	tristate '"dccp" protocol match support'
+	tristate '"dccp" protocol match support (DEPRECATED)'
 	depends on NETFILTER_ADVANCED
-	default IP_DCCP
+	default n
 	help
 	  With this option enabled, you will be able to use the iptables
 	  `dccp' match in order to match on DCCP source/destination ports
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index f0aa4d7ef499..e43e20f529f8 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -12,7 +12,6 @@ nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o
 nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o
 nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o
 nf_conntrack-$(CONFIG_NF_CONNTRACK_OVS) += nf_conntrack_ovs.o
-nf_conntrack-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o
 nf_conntrack-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o
 nf_conntrack-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o
 ifeq ($(CONFIG_NF_CONNTRACK),m)
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 201d3c4ec623..1097f26a6788 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -328,9 +328,6 @@ nf_ct_get_tuple(const struct sk_buff *skb,
 #endif
 #ifdef CONFIG_NF_CT_PROTO_SCTP
 	case IPPROTO_SCTP:
-#endif
-#ifdef CONFIG_NF_CT_PROTO_DCCP
-	case IPPROTO_DCCP:
 #endif
 		/* fallthrough */
 		return nf_ct_get_tuple_ports(skb, dataoff, tuple);
@@ -1982,11 +1979,6 @@ static int nf_conntrack_handle_packet(struct nf_conn *ct,
 		return nf_conntrack_sctp_packet(ct, skb, dataoff,
 						ctinfo, state);
 #endif
-#ifdef CONFIG_NF_CT_PROTO_DCCP
-	case IPPROTO_DCCP:
-		return nf_conntrack_dccp_packet(ct, skb, dataoff,
-						ctinfo, state);
-#endif
 #ifdef CONFIG_NF_CT_PROTO_GRE
 	case IPPROTO_GRE:
 		return nf_conntrack_gre_packet(ct, skb, dataoff,
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 2cc0fde23344..486d52b45fe5 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -2036,7 +2036,6 @@ static void ctnetlink_change_mark(struct nf_conn *ct,
 
 static const struct nla_policy protoinfo_policy[CTA_PROTOINFO_MAX+1] = {
 	[CTA_PROTOINFO_TCP]	= { .type = NLA_NESTED },
-	[CTA_PROTOINFO_DCCP]	= { .type = NLA_NESTED },
 	[CTA_PROTOINFO_SCTP]	= { .type = NLA_NESTED },
 };
 
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index f36727ed91e1..bc1d96686b9c 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -100,9 +100,6 @@ const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto)
 	case IPPROTO_UDP: return &nf_conntrack_l4proto_udp;
 	case IPPROTO_TCP: return &nf_conntrack_l4proto_tcp;
 	case IPPROTO_ICMP: return &nf_conntrack_l4proto_icmp;
-#ifdef CONFIG_NF_CT_PROTO_DCCP
-	case IPPROTO_DCCP: return &nf_conntrack_l4proto_dccp;
-#endif
 #ifdef CONFIG_NF_CT_PROTO_SCTP
 	case IPPROTO_SCTP: return &nf_conntrack_l4proto_sctp;
 #endif
@@ -681,9 +678,6 @@ void nf_conntrack_proto_pernet_init(struct net *net)
 #if IS_ENABLED(CONFIG_IPV6)
 	nf_conntrack_icmpv6_init_net(net);
 #endif
-#ifdef CONFIG_NF_CT_PROTO_DCCP
-	nf_conntrack_dccp_init_net(net);
-#endif
 #ifdef CONFIG_NF_CT_PROTO_SCTP
 	nf_conntrack_sctp_init_net(net);
 #endif
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
deleted file mode 100644
index ebc4f733bb2e..000000000000
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ /dev/null
@@ -1,826 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * DCCP connection tracking protocol helper
- *
- * Copyright (c) 2005, 2006, 2008 Patrick McHardy <kaber@trash.net>
- */
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/sysctl.h>
-#include <linux/spinlock.h>
-#include <linux/skbuff.h>
-#include <linux/dccp.h>
-#include <linux/slab.h>
-
-#include <net/net_namespace.h>
-#include <net/netns/generic.h>
-
-#include <linux/netfilter/nfnetlink_conntrack.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_l4proto.h>
-#include <net/netfilter/nf_conntrack_ecache.h>
-#include <net/netfilter/nf_conntrack_timeout.h>
-#include <net/netfilter/nf_log.h>
-
-/* Timeouts are based on values from RFC4340:
- *
- * - REQUEST:
- *
- *   8.1.2. Client Request
- *
- *   A client MAY give up on its DCCP-Requests after some time
- *   (3 minutes, for example).
- *
- * - RESPOND:
- *
- *   8.1.3. Server Response
- *
- *   It MAY also leave the RESPOND state for CLOSED after a timeout of
- *   not less than 4MSL (8 minutes);
- *
- * - PARTOPEN:
- *
- *   8.1.5. Handshake Completion
- *
- *   If the client remains in PARTOPEN for more than 4MSL (8 minutes),
- *   it SHOULD reset the connection with Reset Code 2, "Aborted".
- *
- * - OPEN:
- *
- *   The DCCP timestamp overflows after 11.9 hours. If the connection
- *   stays idle this long the sequence number won't be recognized
- *   as valid anymore.
- *
- * - CLOSEREQ/CLOSING:
- *
- *   8.3. Termination
- *
- *   The retransmission timer should initially be set to go off in two
- *   round-trip times and should back off to not less than once every
- *   64 seconds ...
- *
- * - TIMEWAIT:
- *
- *   4.3. States
- *
- *   A server or client socket remains in this state for 2MSL (4 minutes)
- *   after the connection has been town down, ...
- */
-
-#define DCCP_MSL (2 * 60 * HZ)
-
-#ifdef CONFIG_NF_CONNTRACK_PROCFS
-static const char * const dccp_state_names[] = {
-	[CT_DCCP_NONE]		= "NONE",
-	[CT_DCCP_REQUEST]	= "REQUEST",
-	[CT_DCCP_RESPOND]	= "RESPOND",
-	[CT_DCCP_PARTOPEN]	= "PARTOPEN",
-	[CT_DCCP_OPEN]		= "OPEN",
-	[CT_DCCP_CLOSEREQ]	= "CLOSEREQ",
-	[CT_DCCP_CLOSING]	= "CLOSING",
-	[CT_DCCP_TIMEWAIT]	= "TIMEWAIT",
-	[CT_DCCP_IGNORE]	= "IGNORE",
-	[CT_DCCP_INVALID]	= "INVALID",
-};
-#endif
-
-#define sNO	CT_DCCP_NONE
-#define sRQ	CT_DCCP_REQUEST
-#define sRS	CT_DCCP_RESPOND
-#define sPO	CT_DCCP_PARTOPEN
-#define sOP	CT_DCCP_OPEN
-#define sCR	CT_DCCP_CLOSEREQ
-#define sCG	CT_DCCP_CLOSING
-#define sTW	CT_DCCP_TIMEWAIT
-#define sIG	CT_DCCP_IGNORE
-#define sIV	CT_DCCP_INVALID
-
-/*
- * DCCP state transition table
- *
- * The assumption is the same as for TCP tracking:
- *
- * We are the man in the middle. All the packets go through us but might
- * get lost in transit to the destination. It is assumed that the destination
- * can't receive segments we haven't seen.
- *
- * The following states exist:
- *
- * NONE:	Initial state, expecting Request
- * REQUEST:	Request seen, waiting for Response from server
- * RESPOND:	Response from server seen, waiting for Ack from client
- * PARTOPEN:	Ack after Response seen, waiting for packet other than Response,
- * 		Reset or Sync from server
- * OPEN:	Packet other than Response, Reset or Sync seen
- * CLOSEREQ:	CloseReq from server seen, expecting Close from client
- * CLOSING:	Close seen, expecting Reset
- * TIMEWAIT:	Reset seen
- * IGNORE:	Not determinable whether packet is valid
- *
- * Some states exist only on one side of the connection: REQUEST, RESPOND,
- * PARTOPEN, CLOSEREQ. For the other side these states are equivalent to
- * the one it was in before.
- *
- * Packets are marked as ignored (sIG) if we don't know if they're valid
- * (for example a reincarnation of a connection we didn't notice is dead
- * already) and the server may send back a connection closing Reset or a
- * Response. They're also used for Sync/SyncAck packets, which we don't
- * care about.
- */
-static const u_int8_t
-dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] = {
-	[CT_DCCP_ROLE_CLIENT] = {
-		[DCCP_PKT_REQUEST] = {
-		/*
-		 * sNO -> sRQ		Regular Request
-		 * sRQ -> sRQ		Retransmitted Request or reincarnation
-		 * sRS -> sRS		Retransmitted Request (apparently Response
-		 * 			got lost after we saw it) or reincarnation
-		 * sPO -> sIG		Ignore, conntrack might be out of sync
-		 * sOP -> sIG		Ignore, conntrack might be out of sync
-		 * sCR -> sIG		Ignore, conntrack might be out of sync
-		 * sCG -> sIG		Ignore, conntrack might be out of sync
-		 * sTW -> sRQ		Reincarnation
-		 *
-		 *	sNO, sRQ, sRS, sPO. sOP, sCR, sCG, sTW, */
-			sRQ, sRQ, sRS, sIG, sIG, sIG, sIG, sRQ,
-		},
-		[DCCP_PKT_RESPONSE] = {
-		/*
-		 * sNO -> sIV		Invalid
-		 * sRQ -> sIG		Ignore, might be response to ignored Request
-		 * sRS -> sIG		Ignore, might be response to ignored Request
-		 * sPO -> sIG		Ignore, might be response to ignored Request
-		 * sOP -> sIG		Ignore, might be response to ignored Request
-		 * sCR -> sIG		Ignore, might be response to ignored Request
-		 * sCG -> sIG		Ignore, might be response to ignored Request
-		 * sTW -> sIV		Invalid, reincarnation in reverse direction
-		 *			goes through sRQ
-		 *
-		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
-			sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIV,
-		},
-		[DCCP_PKT_ACK] = {
-		/*
-		 * sNO -> sIV		No connection
-		 * sRQ -> sIV		No connection
-		 * sRS -> sPO		Ack for Response, move to PARTOPEN (8.1.5.)
-		 * sPO -> sPO		Retransmitted Ack for Response, remain in PARTOPEN
-		 * sOP -> sOP		Regular ACK, remain in OPEN
-		 * sCR -> sCR		Ack in CLOSEREQ MAY be processed (8.3.)
-		 * sCG -> sCG		Ack in CLOSING MAY be processed (8.3.)
-		 * sTW -> sIV
-		 *
-		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
-			sIV, sIV, sPO, sPO, sOP, sCR, sCG, sIV
-		},
-		[DCCP_PKT_DATA] = {
-		/*
-		 * sNO -> sIV		No connection
-		 * sRQ -> sIV		No connection
-		 * sRS -> sIV		No connection
-		 * sPO -> sIV		MUST use DataAck in PARTOPEN state (8.1.5.)
-		 * sOP -> sOP		Regular Data packet
-		 * sCR -> sCR		Data in CLOSEREQ MAY be processed (8.3.)
-		 * sCG -> sCG		Data in CLOSING MAY be processed (8.3.)
-		 * sTW -> sIV
-		 *
-		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
-			sIV, sIV, sIV, sIV, sOP, sCR, sCG, sIV,
-		},
-		[DCCP_PKT_DATAACK] = {
-		/*
-		 * sNO -> sIV		No connection
-		 * sRQ -> sIV		No connection
-		 * sRS -> sPO		Ack for Response, move to PARTOPEN (8.1.5.)
-		 * sPO -> sPO		Remain in PARTOPEN state
-		 * sOP -> sOP		Regular DataAck packet in OPEN state
-		 * sCR -> sCR		DataAck in CLOSEREQ MAY be processed (8.3.)
-		 * sCG -> sCG		DataAck in CLOSING MAY be processed (8.3.)
-		 * sTW -> sIV
-		 *
-		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
-			sIV, sIV, sPO, sPO, sOP, sCR, sCG, sIV
-		},
-		[DCCP_PKT_CLOSEREQ] = {
-		/*
-		 * CLOSEREQ may only be sent by the server.
-		 *
-		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
-			sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV
-		},
-		[DCCP_PKT_CLOSE] = {
-		/*
-		 * sNO -> sIV		No connection
-		 * sRQ -> sIV		No connection
-		 * sRS -> sIV		No connection
-		 * sPO -> sCG		Client-initiated close
-		 * sOP -> sCG		Client-initiated close
-		 * sCR -> sCG		Close in response to CloseReq (8.3.)
-		 * sCG -> sCG		Retransmit
-		 * sTW -> sIV		Late retransmit, already in TIME_WAIT
-		 *
-		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
-			sIV, sIV, sIV, sCG, sCG, sCG, sIV, sIV
-		},
-		[DCCP_PKT_RESET] = {
-		/*
-		 * sNO -> sIV		No connection
-		 * sRQ -> sTW		Sync received or timeout, SHOULD send Reset (8.1.1.)
-		 * sRS -> sTW		Response received without Request
-		 * sPO -> sTW		Timeout, SHOULD send Reset (8.1.5.)
-		 * sOP -> sTW		Connection reset
-		 * sCR -> sTW		Connection reset
-		 * sCG -> sTW		Connection reset
-		 * sTW -> sIG		Ignore (don't refresh timer)
-		 *
-		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
-			sIV, sTW, sTW, sTW, sTW, sTW, sTW, sIG
-		},
-		[DCCP_PKT_SYNC] = {
-		/*
-		 * We currently ignore Sync packets
-		 *
-		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
-			sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
-		},
-		[DCCP_PKT_SYNCACK] = {
-		/*
-		 * We currently ignore SyncAck packets
-		 *
-		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
-			sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
-		},
-	},
-	[CT_DCCP_ROLE_SERVER] = {
-		[DCCP_PKT_REQUEST] = {
-		/*
-		 * sNO -> sIV		Invalid
-		 * sRQ -> sIG		Ignore, conntrack might be out of sync
-		 * sRS -> sIG		Ignore, conntrack might be out of sync
-		 * sPO -> sIG		Ignore, conntrack might be out of sync
-		 * sOP -> sIG		Ignore, conntrack might be out of sync
-		 * sCR -> sIG		Ignore, conntrack might be out of sync
-		 * sCG -> sIG		Ignore, conntrack might be out of sync
-		 * sTW -> sRQ		Reincarnation, must reverse roles
-		 *
-		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
-			sIV, sIG, sIG, sIG, sIG, sIG, sIG, sRQ
-		},
-		[DCCP_PKT_RESPONSE] = {
-		/*
-		 * sNO -> sIV		Response without Request
-		 * sRQ -> sRS		Response to clients Request
-		 * sRS -> sRS		Retransmitted Response (8.1.3. SHOULD NOT)
-		 * sPO -> sIG		Response to an ignored Request or late retransmit
-		 * sOP -> sIG		Ignore, might be response to ignored Request
-		 * sCR -> sIG		Ignore, might be response to ignored Request
-		 * sCG -> sIG		Ignore, might be response to ignored Request
-		 * sTW -> sIV		Invalid, Request from client in sTW moves to sRQ
-		 *
-		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
-			sIV, sRS, sRS, sIG, sIG, sIG, sIG, sIV
-		},
-		[DCCP_PKT_ACK] = {
-		/*
-		 * sNO -> sIV		No connection
-		 * sRQ -> sIV		No connection
-		 * sRS -> sIV		No connection
-		 * sPO -> sOP		Enter OPEN state (8.1.5.)
-		 * sOP -> sOP		Regular Ack in OPEN state
-		 * sCR -> sIV		Waiting for Close from client
-		 * sCG -> sCG		Ack in CLOSING MAY be processed (8.3.)
-		 * sTW -> sIV
-		 *
-		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
-			sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV
-		},
-		[DCCP_PKT_DATA] = {
-		/*
-		 * sNO -> sIV		No connection
-		 * sRQ -> sIV		No connection
-		 * sRS -> sIV		No connection
-		 * sPO -> sOP		Enter OPEN state (8.1.5.)
-		 * sOP -> sOP		Regular Data packet in OPEN state
-		 * sCR -> sIV		Waiting for Close from client
-		 * sCG -> sCG		Data in CLOSING MAY be processed (8.3.)
-		 * sTW -> sIV
-		 *
-		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
-			sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV
-		},
-		[DCCP_PKT_DATAACK] = {
-		/*
-		 * sNO -> sIV		No connection
-		 * sRQ -> sIV		No connection
-		 * sRS -> sIV		No connection
-		 * sPO -> sOP		Enter OPEN state (8.1.5.)
-		 * sOP -> sOP		Regular DataAck in OPEN state
-		 * sCR -> sIV		Waiting for Close from client
-		 * sCG -> sCG		Data in CLOSING MAY be processed (8.3.)
-		 * sTW -> sIV
-		 *
-		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
-			sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV
-		},
-		[DCCP_PKT_CLOSEREQ] = {
-		/*
-		 * sNO -> sIV		No connection
-		 * sRQ -> sIV		No connection
-		 * sRS -> sIV		No connection
-		 * sPO -> sOP -> sCR	Move directly to CLOSEREQ (8.1.5.)
-		 * sOP -> sCR		CloseReq in OPEN state
-		 * sCR -> sCR		Retransmit
-		 * sCG -> sCR		Simultaneous close, client sends another Close
-		 * sTW -> sIV		Already closed
-		 *
-		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
-			sIV, sIV, sIV, sCR, sCR, sCR, sCR, sIV
-		},
-		[DCCP_PKT_CLOSE] = {
-		/*
-		 * sNO -> sIV		No connection
-		 * sRQ -> sIV		No connection
-		 * sRS -> sIV		No connection
-		 * sPO -> sOP -> sCG	Move direcly to CLOSING
-		 * sOP -> sCG		Move to CLOSING
-		 * sCR -> sIV		Close after CloseReq is invalid
-		 * sCG -> sCG		Retransmit
-		 * sTW -> sIV		Already closed
-		 *
-		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
-			sIV, sIV, sIV, sCG, sCG, sIV, sCG, sIV
-		},
-		[DCCP_PKT_RESET] = {
-		/*
-		 * sNO -> sIV		No connection
-		 * sRQ -> sTW		Reset in response to Request
-		 * sRS -> sTW		Timeout, SHOULD send Reset (8.1.3.)
-		 * sPO -> sTW		Timeout, SHOULD send Reset (8.1.3.)
-		 * sOP -> sTW
-		 * sCR -> sTW
-		 * sCG -> sTW
-		 * sTW -> sIG		Ignore (don't refresh timer)
-		 *
-		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW, sTW */
-			sIV, sTW, sTW, sTW, sTW, sTW, sTW, sTW, sIG
-		},
-		[DCCP_PKT_SYNC] = {
-		/*
-		 * We currently ignore Sync packets
-		 *
-		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
-			sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
-		},
-		[DCCP_PKT_SYNCACK] = {
-		/*
-		 * We currently ignore SyncAck packets
-		 *
-		 *	sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
-			sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
-		},
-	},
-};
-
-static noinline bool
-dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
-	 const struct dccp_hdr *dh,
-	 const struct nf_hook_state *hook_state)
-{
-	struct net *net = nf_ct_net(ct);
-	struct nf_dccp_net *dn;
-	const char *msg;
-	u_int8_t state;
-
-	state = dccp_state_table[CT_DCCP_ROLE_CLIENT][dh->dccph_type][CT_DCCP_NONE];
-	switch (state) {
-	default:
-		dn = nf_dccp_pernet(net);
-		if (dn->dccp_loose == 0) {
-			msg = "not picking up existing connection ";
-			goto out_invalid;
-		}
-		break;
-	case CT_DCCP_REQUEST:
-		break;
-	case CT_DCCP_INVALID:
-		msg = "invalid state transition ";
-		goto out_invalid;
-	}
-
-	ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT;
-	ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER;
-	ct->proto.dccp.state = CT_DCCP_NONE;
-	ct->proto.dccp.last_pkt = DCCP_PKT_REQUEST;
-	ct->proto.dccp.last_dir = IP_CT_DIR_ORIGINAL;
-	ct->proto.dccp.handshake_seq = 0;
-	return true;
-
-out_invalid:
-	nf_ct_l4proto_log_invalid(skb, ct, hook_state, "%s", msg);
-	return false;
-}
-
-static u64 dccp_ack_seq(const struct dccp_hdr *dh)
-{
-	const struct dccp_hdr_ack_bits *dhack;
-
-	dhack = (void *)dh + __dccp_basic_hdr_len(dh);
-	return ((u64)ntohs(dhack->dccph_ack_nr_high) << 32) +
-		     ntohl(dhack->dccph_ack_nr_low);
-}
-
-static bool dccp_error(const struct dccp_hdr *dh,
-		       struct sk_buff *skb, unsigned int dataoff,
-		       const struct nf_hook_state *state)
-{
-	static const unsigned long require_seq48 = 1 << DCCP_PKT_REQUEST |
-						   1 << DCCP_PKT_RESPONSE |
-						   1 << DCCP_PKT_CLOSEREQ |
-						   1 << DCCP_PKT_CLOSE |
-						   1 << DCCP_PKT_RESET |
-						   1 << DCCP_PKT_SYNC |
-						   1 << DCCP_PKT_SYNCACK;
-	unsigned int dccp_len = skb->len - dataoff;
-	unsigned int cscov;
-	const char *msg;
-	u8 type;
-
-	BUILD_BUG_ON(DCCP_PKT_INVALID >= BITS_PER_LONG);
-
-	if (dh->dccph_doff * 4 < sizeof(struct dccp_hdr) ||
-	    dh->dccph_doff * 4 > dccp_len) {
-		msg = "nf_ct_dccp: truncated/malformed packet ";
-		goto out_invalid;
-	}
-
-	cscov = dccp_len;
-	if (dh->dccph_cscov) {
-		cscov = (dh->dccph_cscov - 1) * 4;
-		if (cscov > dccp_len) {
-			msg = "nf_ct_dccp: bad checksum coverage ";
-			goto out_invalid;
-		}
-	}
-
-	if (state->hook == NF_INET_PRE_ROUTING &&
-	    state->net->ct.sysctl_checksum &&
-	    nf_checksum_partial(skb, state->hook, dataoff, cscov,
-				IPPROTO_DCCP, state->pf)) {
-		msg = "nf_ct_dccp: bad checksum ";
-		goto out_invalid;
-	}
-
-	type = dh->dccph_type;
-	if (type >= DCCP_PKT_INVALID) {
-		msg = "nf_ct_dccp: reserved packet type ";
-		goto out_invalid;
-	}
-
-	if (test_bit(type, &require_seq48) && !dh->dccph_x) {
-		msg = "nf_ct_dccp: type lacks 48bit sequence numbers";
-		goto out_invalid;
-	}
-
-	return false;
-out_invalid:
-	nf_l4proto_log_invalid(skb, state, IPPROTO_DCCP, "%s", msg);
-	return true;
-}
-
-struct nf_conntrack_dccp_buf {
-	struct dccp_hdr dh;	 /* generic header part */
-	struct dccp_hdr_ext ext; /* optional depending dh->dccph_x */
-	union {			 /* depends on header type */
-		struct dccp_hdr_ack_bits ack;
-		struct dccp_hdr_request req;
-		struct dccp_hdr_response response;
-		struct dccp_hdr_reset rst;
-	} u;
-};
-
-static struct dccp_hdr *
-dccp_header_pointer(const struct sk_buff *skb, int offset, const struct dccp_hdr *dh,
-		    struct nf_conntrack_dccp_buf *buf)
-{
-	unsigned int hdrlen = __dccp_hdr_len(dh);
-
-	if (hdrlen > sizeof(*buf))
-		return NULL;
-
-	return skb_header_pointer(skb, offset, hdrlen, buf);
-}
-
-int nf_conntrack_dccp_packet(struct nf_conn *ct, struct sk_buff *skb,
-			     unsigned int dataoff,
-			     enum ip_conntrack_info ctinfo,
-			     const struct nf_hook_state *state)
-{
-	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
-	struct nf_conntrack_dccp_buf _dh;
-	u_int8_t type, old_state, new_state;
-	enum ct_dccp_roles role;
-	unsigned int *timeouts;
-	struct dccp_hdr *dh;
-
-	dh = skb_header_pointer(skb, dataoff, sizeof(*dh), &_dh.dh);
-	if (!dh)
-		return -NF_ACCEPT;
-
-	if (dccp_error(dh, skb, dataoff, state))
-		return -NF_ACCEPT;
-
-	/* pull again, including possible 48 bit sequences and subtype header */
-	dh = dccp_header_pointer(skb, dataoff, dh, &_dh);
-	if (!dh)
-		return -NF_ACCEPT;
-
-	type = dh->dccph_type;
-	if (!nf_ct_is_confirmed(ct) && !dccp_new(ct, skb, dh, state))
-		return -NF_ACCEPT;
-
-	if (type == DCCP_PKT_RESET &&
-	    !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
-		/* Tear down connection immediately if only reply is a RESET */
-		nf_ct_kill_acct(ct, ctinfo, skb);
-		return NF_ACCEPT;
-	}
-
-	spin_lock_bh(&ct->lock);
-
-	role = ct->proto.dccp.role[dir];
-	old_state = ct->proto.dccp.state;
-	new_state = dccp_state_table[role][type][old_state];
-
-	switch (new_state) {
-	case CT_DCCP_REQUEST:
-		if (old_state == CT_DCCP_TIMEWAIT &&
-		    role == CT_DCCP_ROLE_SERVER) {
-			/* Reincarnation in the reverse direction: reopen and
-			 * reverse client/server roles. */
-			ct->proto.dccp.role[dir] = CT_DCCP_ROLE_CLIENT;
-			ct->proto.dccp.role[!dir] = CT_DCCP_ROLE_SERVER;
-		}
-		break;
-	case CT_DCCP_RESPOND:
-		if (old_state == CT_DCCP_REQUEST)
-			ct->proto.dccp.handshake_seq = dccp_hdr_seq(dh);
-		break;
-	case CT_DCCP_PARTOPEN:
-		if (old_state == CT_DCCP_RESPOND &&
-		    type == DCCP_PKT_ACK &&
-		    dccp_ack_seq(dh) == ct->proto.dccp.handshake_seq)
-			set_bit(IPS_ASSURED_BIT, &ct->status);
-		break;
-	case CT_DCCP_IGNORE:
-		/*
-		 * Connection tracking might be out of sync, so we ignore
-		 * packets that might establish a new connection and resync
-		 * if the server responds with a valid Response.
-		 */
-		if (ct->proto.dccp.last_dir == !dir &&
-		    ct->proto.dccp.last_pkt == DCCP_PKT_REQUEST &&
-		    type == DCCP_PKT_RESPONSE) {
-			ct->proto.dccp.role[!dir] = CT_DCCP_ROLE_CLIENT;
-			ct->proto.dccp.role[dir] = CT_DCCP_ROLE_SERVER;
-			ct->proto.dccp.handshake_seq = dccp_hdr_seq(dh);
-			new_state = CT_DCCP_RESPOND;
-			break;
-		}
-		ct->proto.dccp.last_dir = dir;
-		ct->proto.dccp.last_pkt = type;
-
-		spin_unlock_bh(&ct->lock);
-		nf_ct_l4proto_log_invalid(skb, ct, state, "%s", "invalid packet");
-		return NF_ACCEPT;
-	case CT_DCCP_INVALID:
-		spin_unlock_bh(&ct->lock);
-		nf_ct_l4proto_log_invalid(skb, ct, state, "%s", "invalid state transition");
-		return -NF_ACCEPT;
-	}
-
-	ct->proto.dccp.last_dir = dir;
-	ct->proto.dccp.last_pkt = type;
-	ct->proto.dccp.state = new_state;
-	spin_unlock_bh(&ct->lock);
-
-	if (new_state != old_state)
-		nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
-
-	timeouts = nf_ct_timeout_lookup(ct);
-	if (!timeouts)
-		timeouts = nf_dccp_pernet(nf_ct_net(ct))->dccp_timeout;
-	nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]);
-
-	return NF_ACCEPT;
-}
-
-static bool dccp_can_early_drop(const struct nf_conn *ct)
-{
-	switch (ct->proto.dccp.state) {
-	case CT_DCCP_CLOSEREQ:
-	case CT_DCCP_CLOSING:
-	case CT_DCCP_TIMEWAIT:
-		return true;
-	default:
-		break;
-	}
-
-	return false;
-}
-
-#ifdef CONFIG_NF_CONNTRACK_PROCFS
-static void dccp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
-{
-	seq_printf(s, "%s ", dccp_state_names[ct->proto.dccp.state]);
-}
-#endif
-
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
-static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
-			  struct nf_conn *ct, bool destroy)
-{
-	struct nlattr *nest_parms;
-
-	spin_lock_bh(&ct->lock);
-	nest_parms = nla_nest_start(skb, CTA_PROTOINFO_DCCP);
-	if (!nest_parms)
-		goto nla_put_failure;
-	if (nla_put_u8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state))
-		goto nla_put_failure;
-
-	if (destroy)
-		goto skip_state;
-
-	if (nla_put_u8(skb, CTA_PROTOINFO_DCCP_ROLE,
-		       ct->proto.dccp.role[IP_CT_DIR_ORIGINAL]) ||
-	    nla_put_be64(skb, CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ,
-			 cpu_to_be64(ct->proto.dccp.handshake_seq),
-			 CTA_PROTOINFO_DCCP_PAD))
-		goto nla_put_failure;
-skip_state:
-	nla_nest_end(skb, nest_parms);
-	spin_unlock_bh(&ct->lock);
-
-	return 0;
-
-nla_put_failure:
-	spin_unlock_bh(&ct->lock);
-	return -1;
-}
-
-static const struct nla_policy dccp_nla_policy[CTA_PROTOINFO_DCCP_MAX + 1] = {
-	[CTA_PROTOINFO_DCCP_STATE]	= { .type = NLA_U8 },
-	[CTA_PROTOINFO_DCCP_ROLE]	= { .type = NLA_U8 },
-	[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ] = { .type = NLA_U64 },
-	[CTA_PROTOINFO_DCCP_PAD]	= { .type = NLA_UNSPEC },
-};
-
-#define DCCP_NLATTR_SIZE ( \
-	NLA_ALIGN(NLA_HDRLEN + 1) + \
-	NLA_ALIGN(NLA_HDRLEN + 1) + \
-	NLA_ALIGN(NLA_HDRLEN + sizeof(u64)) + \
-	NLA_ALIGN(NLA_HDRLEN + 0))
-
-static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct)
-{
-	struct nlattr *attr = cda[CTA_PROTOINFO_DCCP];
-	struct nlattr *tb[CTA_PROTOINFO_DCCP_MAX + 1];
-	int err;
-
-	if (!attr)
-		return 0;
-
-	err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_DCCP_MAX, attr,
-					  dccp_nla_policy, NULL);
-	if (err < 0)
-		return err;
-
-	if (!tb[CTA_PROTOINFO_DCCP_STATE] ||
-	    !tb[CTA_PROTOINFO_DCCP_ROLE] ||
-	    nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) > CT_DCCP_ROLE_MAX ||
-	    nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]) >= CT_DCCP_IGNORE) {
-		return -EINVAL;
-	}
-
-	spin_lock_bh(&ct->lock);
-	ct->proto.dccp.state = nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]);
-	if (nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) == CT_DCCP_ROLE_CLIENT) {
-		ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT;
-		ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER;
-	} else {
-		ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_SERVER;
-		ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_CLIENT;
-	}
-	if (tb[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ]) {
-		ct->proto.dccp.handshake_seq =
-		be64_to_cpu(nla_get_be64(tb[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ]));
-	}
-	spin_unlock_bh(&ct->lock);
-	return 0;
-}
-#endif
-
-#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
-
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nfnetlink_cttimeout.h>
-
-static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[],
-				      struct net *net, void *data)
-{
-	struct nf_dccp_net *dn = nf_dccp_pernet(net);
-	unsigned int *timeouts = data;
-	int i;
-
-	if (!timeouts)
-		 timeouts = dn->dccp_timeout;
-
-	/* set default DCCP timeouts. */
-	for (i=0; i<CT_DCCP_MAX; i++)
-		timeouts[i] = dn->dccp_timeout[i];
-
-	/* there's a 1:1 mapping between attributes and protocol states. */
-	for (i=CTA_TIMEOUT_DCCP_UNSPEC+1; i<CTA_TIMEOUT_DCCP_MAX+1; i++) {
-		if (tb[i]) {
-			timeouts[i] = ntohl(nla_get_be32(tb[i])) * HZ;
-		}
-	}
-
-	timeouts[CTA_TIMEOUT_DCCP_UNSPEC] = timeouts[CTA_TIMEOUT_DCCP_REQUEST];
-	return 0;
-}
-
-static int
-dccp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
-{
-        const unsigned int *timeouts = data;
-	int i;
-
-	for (i=CTA_TIMEOUT_DCCP_UNSPEC+1; i<CTA_TIMEOUT_DCCP_MAX+1; i++) {
-		if (nla_put_be32(skb, i, htonl(timeouts[i] / HZ)))
-			goto nla_put_failure;
-	}
-	return 0;
-
-nla_put_failure:
-	return -ENOSPC;
-}
-
-static const struct nla_policy
-dccp_timeout_nla_policy[CTA_TIMEOUT_DCCP_MAX+1] = {
-	[CTA_TIMEOUT_DCCP_REQUEST]	= { .type = NLA_U32 },
-	[CTA_TIMEOUT_DCCP_RESPOND]	= { .type = NLA_U32 },
-	[CTA_TIMEOUT_DCCP_PARTOPEN]	= { .type = NLA_U32 },
-	[CTA_TIMEOUT_DCCP_OPEN]		= { .type = NLA_U32 },
-	[CTA_TIMEOUT_DCCP_CLOSEREQ]	= { .type = NLA_U32 },
-	[CTA_TIMEOUT_DCCP_CLOSING]	= { .type = NLA_U32 },
-	[CTA_TIMEOUT_DCCP_TIMEWAIT]	= { .type = NLA_U32 },
-};
-#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
-
-void nf_conntrack_dccp_init_net(struct net *net)
-{
-	struct nf_dccp_net *dn = nf_dccp_pernet(net);
-
-	/* default values */
-	dn->dccp_loose = 1;
-	dn->dccp_timeout[CT_DCCP_REQUEST]	= 2 * DCCP_MSL;
-	dn->dccp_timeout[CT_DCCP_RESPOND]	= 4 * DCCP_MSL;
-	dn->dccp_timeout[CT_DCCP_PARTOPEN]	= 4 * DCCP_MSL;
-	dn->dccp_timeout[CT_DCCP_OPEN]		= 12 * 3600 * HZ;
-	dn->dccp_timeout[CT_DCCP_CLOSEREQ]	= 64 * HZ;
-	dn->dccp_timeout[CT_DCCP_CLOSING]	= 64 * HZ;
-	dn->dccp_timeout[CT_DCCP_TIMEWAIT]	= 2 * DCCP_MSL;
-
-	/* timeouts[0] is unused, make it same as SYN_SENT so
-	 * ->timeouts[0] contains 'new' timeout, like udp or icmp.
-	 */
-	dn->dccp_timeout[CT_DCCP_NONE] = dn->dccp_timeout[CT_DCCP_REQUEST];
-}
-
-const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp = {
-	.l4proto		= IPPROTO_DCCP,
-	.can_early_drop		= dccp_can_early_drop,
-#ifdef CONFIG_NF_CONNTRACK_PROCFS
-	.print_conntrack	= dccp_print_conntrack,
-#endif
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
-	.nlattr_size		= DCCP_NLATTR_SIZE,
-	.to_nlattr		= dccp_to_nlattr,
-	.from_nlattr		= nlattr_to_dccp,
-	.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
-	.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,
-	.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
-	.nla_policy		= nf_ct_port_nla_policy,
-#endif
-#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
-	.ctnl_timeout		= {
-		.nlattr_to_obj	= dccp_timeout_nlattr_to_obj,
-		.obj_to_nlattr	= dccp_timeout_obj_to_nlattr,
-		.nlattr_max	= CTA_TIMEOUT_DCCP_MAX,
-		.obj_size	= sizeof(unsigned int) * CT_DCCP_MAX,
-		.nla_policy	= dccp_timeout_nla_policy,
-	},
-#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
-};
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 6c4cff10357d..829f60496008 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -67,11 +67,6 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
 			   ntohs(tuple->dst.u.udp.port));
 
 		break;
-	case IPPROTO_DCCP:
-		seq_printf(s, "sport=%hu dport=%hu ",
-			   ntohs(tuple->src.u.dccp.port),
-			   ntohs(tuple->dst.u.dccp.port));
-		break;
 	case IPPROTO_SCTP:
 		seq_printf(s, "sport=%hu dport=%hu ",
 			   ntohs(tuple->src.u.sctp.port),
@@ -279,7 +274,6 @@ static const char* l4proto_name(u16 proto)
 	case IPPROTO_ICMP: return "icmp";
 	case IPPROTO_TCP: return "tcp";
 	case IPPROTO_UDP: return "udp";
-	case IPPROTO_DCCP: return "dccp";
 	case IPPROTO_GRE: return "gre";
 	case IPPROTO_SCTP: return "sctp";
 	case IPPROTO_UDPLITE: return "udplite";
@@ -612,16 +606,6 @@ enum nf_ct_sysctl_index {
 	NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT,
 	NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_SENT,
 #endif
-#ifdef CONFIG_NF_CT_PROTO_DCCP
-	NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST,
-	NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_RESPOND,
-	NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_PARTOPEN,
-	NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_OPEN,
-	NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSEREQ,
-	NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSING,
-	NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_TIMEWAIT,
-	NF_SYSCTL_CT_PROTO_DCCP_LOOSE,
-#endif
 #ifdef CONFIG_NF_CT_PROTO_GRE
 	NF_SYSCTL_CT_PROTO_TIMEOUT_GRE,
 	NF_SYSCTL_CT_PROTO_TIMEOUT_GRE_STREAM,
@@ -895,58 +879,6 @@ static struct ctl_table nf_ct_sysctl_table[] = {
 		.proc_handler	= proc_dointvec_jiffies,
 	},
 #endif
-#ifdef CONFIG_NF_CT_PROTO_DCCP
-	[NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST] = {
-		.procname	= "nf_conntrack_dccp_timeout_request",
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_jiffies,
-	},
-	[NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_RESPOND] = {
-		.procname	= "nf_conntrack_dccp_timeout_respond",
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_jiffies,
-	},
-	[NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_PARTOPEN] = {
-		.procname	= "nf_conntrack_dccp_timeout_partopen",
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_jiffies,
-	},
-	[NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_OPEN] = {
-		.procname	= "nf_conntrack_dccp_timeout_open",
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_jiffies,
-	},
-	[NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSEREQ] = {
-		.procname	= "nf_conntrack_dccp_timeout_closereq",
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_jiffies,
-	},
-	[NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSING] = {
-		.procname	= "nf_conntrack_dccp_timeout_closing",
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_jiffies,
-	},
-	[NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_TIMEWAIT] = {
-		.procname	= "nf_conntrack_dccp_timeout_timewait",
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_jiffies,
-	},
-	[NF_SYSCTL_CT_PROTO_DCCP_LOOSE] = {
-		.procname	= "nf_conntrack_dccp_loose",
-		.maxlen		= sizeof(u8),
-		.mode		= 0644,
-		.proc_handler	= proc_dou8vec_minmax,
-		.extra1 	= SYSCTL_ZERO,
-		.extra2 	= SYSCTL_ONE,
-	},
-#endif
 #ifdef CONFIG_NF_CT_PROTO_GRE
 	[NF_SYSCTL_CT_PROTO_TIMEOUT_GRE] = {
 		.procname       = "nf_conntrack_gre_timeout",
@@ -1032,29 +964,6 @@ static void nf_conntrack_standalone_init_sctp_sysctl(struct net *net,
 #endif
 }
 
-static void nf_conntrack_standalone_init_dccp_sysctl(struct net *net,
-						     struct ctl_table *table)
-{
-#ifdef CONFIG_NF_CT_PROTO_DCCP
-	struct nf_dccp_net *dn = nf_dccp_pernet(net);
-
-#define XASSIGN(XNAME, dn) \
-	table[NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_ ## XNAME].data = \
-			&(dn)->dccp_timeout[CT_DCCP_ ## XNAME]
-
-	XASSIGN(REQUEST, dn);
-	XASSIGN(RESPOND, dn);
-	XASSIGN(PARTOPEN, dn);
-	XASSIGN(OPEN, dn);
-	XASSIGN(CLOSEREQ, dn);
-	XASSIGN(CLOSING, dn);
-	XASSIGN(TIMEWAIT, dn);
-#undef XASSIGN
-
-	table[NF_SYSCTL_CT_PROTO_DCCP_LOOSE].data = &dn->dccp_loose;
-#endif
-}
-
 static void nf_conntrack_standalone_init_gre_sysctl(struct net *net,
 						    struct ctl_table *table)
 {
@@ -1100,7 +1009,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
 
 	nf_conntrack_standalone_init_tcp_sysctl(net, table);
 	nf_conntrack_standalone_init_sctp_sysctl(net, table);
-	nf_conntrack_standalone_init_dccp_sysctl(net, table);
 	nf_conntrack_standalone_init_gre_sysctl(net, table);
 
 	/* Don't allow non-init_net ns to alter global sysctls */
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index f391cd267922..78a61dac4ade 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -69,7 +69,6 @@ static void nf_nat_ipv4_decode_session(struct sk_buff *skb,
 		if (t->dst.protonum == IPPROTO_TCP ||
 		    t->dst.protonum == IPPROTO_UDP ||
 		    t->dst.protonum == IPPROTO_UDPLITE ||
-		    t->dst.protonum == IPPROTO_DCCP ||
 		    t->dst.protonum == IPPROTO_SCTP)
 			fl4->fl4_dport = t->dst.u.all;
 	}
@@ -81,7 +80,6 @@ static void nf_nat_ipv4_decode_session(struct sk_buff *skb,
 		if (t->dst.protonum == IPPROTO_TCP ||
 		    t->dst.protonum == IPPROTO_UDP ||
 		    t->dst.protonum == IPPROTO_UDPLITE ||
-		    t->dst.protonum == IPPROTO_DCCP ||
 		    t->dst.protonum == IPPROTO_SCTP)
 			fl4->fl4_sport = t->src.u.all;
 	}
@@ -102,7 +100,6 @@ static void nf_nat_ipv6_decode_session(struct sk_buff *skb,
 		if (t->dst.protonum == IPPROTO_TCP ||
 		    t->dst.protonum == IPPROTO_UDP ||
 		    t->dst.protonum == IPPROTO_UDPLITE ||
-		    t->dst.protonum == IPPROTO_DCCP ||
 		    t->dst.protonum == IPPROTO_SCTP)
 			fl6->fl6_dport = t->dst.u.all;
 	}
@@ -114,7 +111,6 @@ static void nf_nat_ipv6_decode_session(struct sk_buff *skb,
 		if (t->dst.protonum == IPPROTO_TCP ||
 		    t->dst.protonum == IPPROTO_UDP ||
 		    t->dst.protonum == IPPROTO_UDPLITE ||
-		    t->dst.protonum == IPPROTO_DCCP ||
 		    t->dst.protonum == IPPROTO_SCTP)
 			fl6->fl6_sport = t->src.u.all;
 	}
@@ -432,7 +428,6 @@ static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple,
 	case IPPROTO_TCP:
 	case IPPROTO_UDP:
 	case IPPROTO_UDPLITE:
-	case IPPROTO_DCCP:
 	case IPPROTO_SCTP:
 		if (maniptype == NF_NAT_MANIP_SRC)
 			port = tuple->src.u.all;
@@ -632,7 +627,6 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
 	case IPPROTO_UDPLITE:
 	case IPPROTO_TCP:
 	case IPPROTO_SCTP:
-	case IPPROTO_DCCP:
 		if (maniptype == NF_NAT_MANIP_SRC)
 			keyptr = &tuple->src.u.all;
 		else
diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c
index dc450cc81222..b14a434b9561 100644
--- a/net/netfilter/nf_nat_proto.c
+++ b/net/netfilter/nf_nat_proto.c
@@ -179,46 +179,6 @@ tcp_manip_pkt(struct sk_buff *skb,
 	return true;
 }
 
-static bool
-dccp_manip_pkt(struct sk_buff *skb,
-	       unsigned int iphdroff, unsigned int hdroff,
-	       const struct nf_conntrack_tuple *tuple,
-	       enum nf_nat_manip_type maniptype)
-{
-#ifdef CONFIG_NF_CT_PROTO_DCCP
-	struct dccp_hdr *hdr;
-	__be16 *portptr, oldport, newport;
-	int hdrsize = 8; /* DCCP connection tracking guarantees this much */
-
-	if (skb->len >= hdroff + sizeof(struct dccp_hdr))
-		hdrsize = sizeof(struct dccp_hdr);
-
-	if (skb_ensure_writable(skb, hdroff + hdrsize))
-		return false;
-
-	hdr = (struct dccp_hdr *)(skb->data + hdroff);
-
-	if (maniptype == NF_NAT_MANIP_SRC) {
-		newport = tuple->src.u.dccp.port;
-		portptr = &hdr->dccph_sport;
-	} else {
-		newport = tuple->dst.u.dccp.port;
-		portptr = &hdr->dccph_dport;
-	}
-
-	oldport = *portptr;
-	*portptr = newport;
-
-	if (hdrsize < sizeof(*hdr))
-		return true;
-
-	nf_csum_update(skb, iphdroff, &hdr->dccph_checksum, tuple, maniptype);
-	inet_proto_csum_replace2(&hdr->dccph_checksum, skb, oldport, newport,
-				 false);
-#endif
-	return true;
-}
-
 static bool
 icmp_manip_pkt(struct sk_buff *skb,
 	       unsigned int iphdroff, unsigned int hdroff,
@@ -338,9 +298,6 @@ static bool l4proto_manip_pkt(struct sk_buff *skb,
 	case IPPROTO_ICMPV6:
 		return icmpv6_manip_pkt(skb, iphdroff, hdroff,
 					tuple, maniptype);
-	case IPPROTO_DCCP:
-		return dccp_manip_pkt(skb, iphdroff, hdroff,
-				      tuple, maniptype);
 	case IPPROTO_GRE:
 		return gre_manip_pkt(skb, iphdroff, hdroff,
 				     tuple, maniptype);
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index eab4f476b47f..38d75484e531 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -461,11 +461,6 @@ static int cttimeout_default_get(struct sk_buff *skb,
 	case IPPROTO_UDPLITE:
 		timeouts = nf_udp_pernet(info->net)->timeouts;
 		break;
-	case IPPROTO_DCCP:
-#ifdef CONFIG_NF_CT_PROTO_DCCP
-		timeouts = nf_dccp_pernet(info->net)->dccp_timeout;
-#endif
-		break;
 	case IPPROTO_ICMPV6:
 		timeouts = &nf_icmpv6_pernet(info->net)->timeout;
 		break;
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index c74012c99125..7eedf4e3ae9c 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -407,6 +407,7 @@ err:
 		regs->verdict.code = NFT_BREAK;
 }
 
+#ifdef CONFIG_NFT_EXTHDR_DCCP
 static void nft_exthdr_dccp_eval(const struct nft_expr *expr,
 				 struct nft_regs *regs,
 				 const struct nft_pktinfo *pkt)
@@ -482,6 +483,7 @@ static void nft_exthdr_dccp_eval(const struct nft_expr *expr,
 err:
 	*dest = 0;
 }
+#endif
 
 static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = {
 	[NFTA_EXTHDR_DREG]		= { .type = NLA_U32 },
@@ -634,6 +636,7 @@ static int nft_exthdr_ipv4_init(const struct nft_ctx *ctx,
 	return 0;
 }
 
+#ifdef CONFIG_NFT_EXTHDR_DCCP
 static int nft_exthdr_dccp_init(const struct nft_ctx *ctx,
 				const struct nft_expr *expr,
 				const struct nlattr * const tb[])
@@ -649,6 +652,7 @@ static int nft_exthdr_dccp_init(const struct nft_ctx *ctx,
 
 	return 0;
 }
+#endif
 
 static int nft_exthdr_dump_common(struct sk_buff *skb, const struct nft_exthdr *priv)
 {
@@ -779,6 +783,7 @@ static const struct nft_expr_ops nft_exthdr_sctp_ops = {
 	.reduce		= nft_exthdr_reduce,
 };
 
+#ifdef CONFIG_NFT_EXTHDR_DCCP
 static const struct nft_expr_ops nft_exthdr_dccp_ops = {
 	.type		= &nft_exthdr_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_exthdr)),
@@ -787,6 +792,7 @@ static const struct nft_expr_ops nft_exthdr_dccp_ops = {
 	.dump		= nft_exthdr_dump,
 	.reduce		= nft_exthdr_reduce,
 };
+#endif
 
 static const struct nft_expr_ops *
 nft_exthdr_select_ops(const struct nft_ctx *ctx,
@@ -822,10 +828,12 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx,
 		if (tb[NFTA_EXTHDR_DREG])
 			return &nft_exthdr_sctp_ops;
 		break;
+#ifdef CONFIG_NFT_EXTHDR_DCCP
 	case NFT_EXTHDR_OP_DCCP:
 		if (tb[NFTA_EXTHDR_DREG])
 			return &nft_exthdr_dccp_ops;
 		break;
+#endif
 	}
 
 	return ERR_PTR(-EOPNOTSUPP);
-- 
cgit v1.2.3


From 9931d2899eec3737f4e4fa9fc900be7329816e94 Mon Sep 17 00:00:00 2001
From: Shengjiu Wang <shengjiu.wang@nxp.com>
Date: Fri, 20 Jun 2025 13:52:28 +0800
Subject: ASoC: fsl_mqs: Distinguish different modules by system manager
 indices

On i.MX94, the MQS2 also needs to be configured via the SCMI interface.
Add an sm_index variable to struct fsl_mqs_soc_data to distinguish MQS1
from MQS2 on this platform.

Add the system manager indices for i.MX94 in the header file.

Signed-off-by: Shengjiu Wang <shengjiu.wang@nxp.com>
Reviewed-by: Peng Fan <peng.fan@nxp.com>
Link: https://patch.msgid.link/20250620055229.965942-2-shengjiu.wang@nxp.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/firmware/imx/sm.h |  8 ++++++++
 sound/soc/fsl/fsl_mqs.c         | 11 ++++++++---
 2 files changed, 16 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/firmware/imx/sm.h b/include/linux/firmware/imx/sm.h
index a8a17eeb7d90..a6220c500f7c 100644
--- a/include/linux/firmware/imx/sm.h
+++ b/include/linux/firmware/imx/sm.h
@@ -18,6 +18,14 @@
 #define SCMI_IMX_CTRL_SAI4_MCLK		4	/* WAKE SAI4 MCLK */
 #define SCMI_IMX_CTRL_SAI5_MCLK		5	/* WAKE SAI5 MCLK */
 
+#define SCMI_IMX94_CTRL_PDM_CLK_SEL	0U	/*!< AON PDM clock sel */
+#define SCMI_IMX94_CTRL_MQS1_SETTINGS	1U	/*!< AON MQS settings */
+#define SCMI_IMX94_CTRL_MQS2_SETTINGS	2U	/*!< WAKE MQS settings */
+#define SCMI_IMX94_CTRL_SAI1_MCLK	3U	/*!< AON SAI1 MCLK */
+#define SCMI_IMX94_CTRL_SAI2_MCLK	4U	/*!< WAKE SAI2 MCLK */
+#define SCMI_IMX94_CTRL_SAI3_MCLK	5U	/*!< WAKE SAI3 MCLK */
+#define SCMI_IMX94_CTRL_SAI4_MCLK	6U	/*!< WAKE SAI4 MCLK */
+
 int scmi_imx_misc_ctrl_get(u32 id, u32 *num, u32 *val);
 int scmi_imx_misc_ctrl_set(u32 id, u32 val);
 
diff --git a/sound/soc/fsl/fsl_mqs.c b/sound/soc/fsl/fsl_mqs.c
index e34e5ea98de5..11f2f3792dce 100644
--- a/sound/soc/fsl/fsl_mqs.c
+++ b/sound/soc/fsl/fsl_mqs.c
@@ -39,6 +39,7 @@ enum reg_type {
  * struct fsl_mqs_soc_data - soc specific data
  *
  * @type: control register space type
+ * @sm_index: index from definition in system manager
  * @ctrl_off: control register offset
  * @en_mask: enable bit mask
  * @en_shift: enable bit shift
@@ -51,6 +52,7 @@ enum reg_type {
  */
 struct fsl_mqs_soc_data {
 	enum reg_type type;
+	int  sm_index;
 	int  ctrl_off;
 	int  en_mask;
 	int  en_shift;
@@ -82,7 +84,7 @@ static int fsl_mqs_sm_read(void *context, unsigned int reg, unsigned int *val)
 
 	if (IS_ENABLED(CONFIG_IMX_SCMI_MISC_DRV) &&
 	    mqs_priv->soc->ctrl_off == reg)
-		return scmi_imx_misc_ctrl_get(SCMI_IMX_CTRL_MQS1_SETTINGS, &num, val);
+		return scmi_imx_misc_ctrl_get(mqs_priv->soc->sm_index, &num, val);
 
 	return -EINVAL;
 };
@@ -93,7 +95,7 @@ static int fsl_mqs_sm_write(void *context, unsigned int reg, unsigned int val)
 
 	if (IS_ENABLED(CONFIG_IMX_SCMI_MISC_DRV) &&
 	    mqs_priv->soc->ctrl_off == reg)
-		return scmi_imx_misc_ctrl_set(SCMI_IMX_CTRL_MQS1_SETTINGS, val);
+		return scmi_imx_misc_ctrl_set(mqs_priv->soc->sm_index, val);
 
 	return -EINVAL;
 };
@@ -386,6 +388,7 @@ static const struct fsl_mqs_soc_data fsl_mqs_imx93_data = {
 
 static const struct fsl_mqs_soc_data fsl_mqs_imx95_aon_data = {
 	.type = TYPE_REG_SM,
+	.sm_index = SCMI_IMX_CTRL_MQS1_SETTINGS,
 	.ctrl_off = 0x88,
 	.en_mask  = BIT(1),
 	.en_shift = 1,
@@ -412,6 +415,7 @@ static const struct fsl_mqs_soc_data fsl_mqs_imx95_netc_data = {
 
 static const struct fsl_mqs_soc_data fsl_mqs_imx943_aon_data = {
 	.type = TYPE_REG_SM,
+	.sm_index = SCMI_IMX94_CTRL_MQS1_SETTINGS,
 	.ctrl_off = 0x88,
 	.en_mask  = BIT(1),
 	.en_shift = 1,
@@ -424,7 +428,8 @@ static const struct fsl_mqs_soc_data fsl_mqs_imx943_aon_data = {
 };
 
 static const struct fsl_mqs_soc_data fsl_mqs_imx943_wakeup_data = {
-	.type = TYPE_REG_GPR,
+	.type = TYPE_REG_SM,
+	.sm_index = SCMI_IMX94_CTRL_MQS2_SETTINGS,
 	.ctrl_off = 0x10,
 	.en_mask  = BIT(1),
 	.en_shift = 1,
-- 
cgit v1.2.3


From baee26a9d6cd3d3c6c3c03c56270aa647a67e4bd Mon Sep 17 00:00:00 2001
From: Shengjiu Wang <shengjiu.wang@nxp.com>
Date: Fri, 20 Jun 2025 13:52:29 +0800
Subject: ASoC: fsl_mqs: rename system manager indices for i.MX95

The system manager index names are different for each platform; rename
the indices for i.MX95 to differentiate them from other platforms.

Signed-off-by: Shengjiu Wang <shengjiu.wang@nxp.com>
Reviewed-by: Peng Fan <peng.fan@nxp.com>
Link: https://patch.msgid.link/20250620055229.965942-3-shengjiu.wang@nxp.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/firmware/imx/sm.h | 12 ++++++------
 sound/soc/fsl/fsl_mqs.c         |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/firmware/imx/sm.h b/include/linux/firmware/imx/sm.h
index a6220c500f7c..d4212bc42b2c 100644
--- a/include/linux/firmware/imx/sm.h
+++ b/include/linux/firmware/imx/sm.h
@@ -11,12 +11,12 @@
 #include <linux/scmi_imx_protocol.h>
 #include <linux/types.h>
 
-#define SCMI_IMX_CTRL_PDM_CLK_SEL	0	/* AON PDM clock sel */
-#define SCMI_IMX_CTRL_MQS1_SETTINGS	1	/* AON MQS settings */
-#define SCMI_IMX_CTRL_SAI1_MCLK		2	/* AON SAI1 MCLK */
-#define SCMI_IMX_CTRL_SAI3_MCLK		3	/* WAKE SAI3 MCLK */
-#define SCMI_IMX_CTRL_SAI4_MCLK		4	/* WAKE SAI4 MCLK */
-#define SCMI_IMX_CTRL_SAI5_MCLK		5	/* WAKE SAI5 MCLK */
+#define SCMI_IMX95_CTRL_PDM_CLK_SEL	0	/* AON PDM clock sel */
+#define SCMI_IMX95_CTRL_MQS1_SETTINGS	1	/* AON MQS settings */
+#define SCMI_IMX95_CTRL_SAI1_MCLK	2	/* AON SAI1 MCLK */
+#define SCMI_IMX95_CTRL_SAI3_MCLK	3	/* WAKE SAI3 MCLK */
+#define SCMI_IMX95_CTRL_SAI4_MCLK	4	/* WAKE SAI4 MCLK */
+#define SCMI_IMX95_CTRL_SAI5_MCLK	5	/* WAKE SAI5 MCLK */
 
 #define SCMI_IMX94_CTRL_PDM_CLK_SEL	0U	/*!< AON PDM clock sel */
 #define SCMI_IMX94_CTRL_MQS1_SETTINGS	1U	/*!< AON MQS settings */
diff --git a/sound/soc/fsl/fsl_mqs.c b/sound/soc/fsl/fsl_mqs.c
index 11f2f3792dce..901f840df904 100644
--- a/sound/soc/fsl/fsl_mqs.c
+++ b/sound/soc/fsl/fsl_mqs.c
@@ -388,7 +388,7 @@ static const struct fsl_mqs_soc_data fsl_mqs_imx93_data = {
 
 static const struct fsl_mqs_soc_data fsl_mqs_imx95_aon_data = {
 	.type = TYPE_REG_SM,
-	.sm_index = SCMI_IMX_CTRL_MQS1_SETTINGS,
+	.sm_index = SCMI_IMX95_CTRL_MQS1_SETTINGS,
 	.ctrl_off = 0x88,
 	.en_mask  = BIT(1),
 	.en_shift = 1,
-- 
cgit v1.2.3


From 5b605dbee07dda8fd538af1f07cbf1baf0a49cbc Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 1 Jul 2025 15:26:58 +0200
Subject: timekeeping: Provide ktime_get_clock_ts64()

PTP implements an inline switch case for taking timestamps from various
POSIX clock IDs, which already consumes quite some text space. Expanding it
for auxiliary clocks really becomes too big for inlining.

Provide an out-of-line version.

The function invalidates the timestamp if the clock is invalid. The
invalidation allows implementing a validation check without the need to
propagate a return value through deep existing call chains.

Due to merge logistics this temporarily defines CLOCK_AUX[_LAST] if
undefined, so that the plain branch, which does not contain any of the core
timekeeper changes, can be pulled into the networking tree as prerequisite
for the PTP side changes. These temporary defines are removed after that
branch is merged into the tip::timers/ptp branch. That way the result in
-next or upstream in the next merge window has zero dependencies.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Acked-by: John Stultz <jstultz@google.com>
Link: https://lore.kernel.org/all/20250701132628.357686408@linutronix.de
---
 include/linux/timekeeping.h | 10 ++++++++++
 kernel/time/timekeeping.c   | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+)

(limited to 'include')

diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index 542773650200..4a4c2778abae 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -44,6 +44,7 @@ extern void ktime_get_ts64(struct timespec64 *ts);
 extern void ktime_get_real_ts64(struct timespec64 *tv);
 extern void ktime_get_coarse_ts64(struct timespec64 *ts);
 extern void ktime_get_coarse_real_ts64(struct timespec64 *ts);
+extern void ktime_get_clock_ts64(clockid_t id, struct timespec64 *ts);
 
 /* Multigrain timestamp interfaces */
 extern void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts);
@@ -345,4 +346,13 @@ void read_persistent_wall_and_boot_offset(struct timespec64 *wall_clock,
 extern int update_persistent_clock64(struct timespec64 now);
 #endif
 
+/* Temporary workaround to avoid merge dependencies and cross tree messes */
+#ifndef CLOCK_AUX
+#define CLOCK_AUX			MAX_CLOCKS
+#define MAX_AUX_CLOCKS			8
+#define CLOCK_AUX_LAST			(CLOCK_AUX + MAX_AUX_CLOCKS - 1)
+
+static inline bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *kt) { return false; }
+#endif
+
 #endif
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index a009c91f7b05..572e3bd0cc94 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1573,6 +1573,39 @@ void ktime_get_raw_ts64(struct timespec64 *ts)
 }
 EXPORT_SYMBOL(ktime_get_raw_ts64);
 
+/**
+ * ktime_get_clock_ts64 - Returns time of a clock in a timespec
+ * @id:		POSIX clock ID of the clock to read
+ * @ts:		Pointer to the timespec64 to be set
+ *
+ * The timestamp is invalidated (@ts->tv_sec is set to -1) if the
+ * clock @id is not available.
+ */
+void ktime_get_clock_ts64(clockid_t id, struct timespec64 *ts)
+{
+	/* Invalidate time stamp */
+	ts->tv_sec = -1;
+	ts->tv_nsec = 0;
+
+	switch (id) {
+	case CLOCK_REALTIME:
+		ktime_get_real_ts64(ts);
+		return;
+	case CLOCK_MONOTONIC:
+		ktime_get_ts64(ts);
+		return;
+	case CLOCK_MONOTONIC_RAW:
+		ktime_get_raw_ts64(ts);
+		return;
+	case CLOCK_AUX ... CLOCK_AUX_LAST:
+		if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS))
+			ktime_get_aux_ts64(id, ts);
+		return;
+	default:
+		WARN_ON_ONCE(1);
+	}
+}
+EXPORT_SYMBOL_GPL(ktime_get_clock_ts64);
 
 /**
  * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
-- 
cgit v1.2.3


From 8959338617a85e35820e3a7fa21801cf55b068bf Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 3 Jul 2025 14:39:28 +0200
Subject: timekeeping: Remove the temporary CLOCK_AUX workaround

ktime_get_clock_ts64() was provided for the networking tree as a stand
alone commit based on v6.16-rc1. It contains a temporary workaround for the
CLOCK_AUX* defines, which are only available in the timekeeping tree.

As this commit is now merged into the timers/ptp branch, which contains the
real CLOCK_AUX* defines, the workaround is obsolete.

Remove it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/20250701130923.579834908@linutronix.de
---
 include/linux/timekeeping.h | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index 6121924d93c4..aee2c1a46e47 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -357,13 +357,4 @@ void read_persistent_wall_and_boot_offset(struct timespec64 *wall_clock,
 extern int update_persistent_clock64(struct timespec64 now);
 #endif
 
-/* Temporary workaround to avoid merge dependencies and cross tree messes */
-#ifndef CLOCK_AUX
-#define CLOCK_AUX			MAX_CLOCKS
-#define MAX_AUX_CLOCKS			8
-#define CLOCK_AUX_LAST			(CLOCK_AUX + MAX_AUX_CLOCKS - 1)
-
-static inline bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *kt) { return false; }
-#endif
-
 #endif
-- 
cgit v1.2.3


From 4c09a4cebd0320c5381afad3fb6f997f20082a3b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 1 Jul 2025 15:27:00 +0200
Subject: ptp: Use ktime_get_clock_ts64() for timestamping

The inlined ptp_read_system_[pre|post]ts() switch cases expand to a copious
amount of text in drivers, e.g. ~500 bytes in e1000e. Adding auxiliary
clock support to the inlines would increase it further.

Replace the inline switch case with a call to ktime_get_clock_ts64(), which
reduces the code size in drivers and allows accessing auxiliary clocks once
they are enabled in the IOCTL parameter filter.

No functional change.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Acked-by: John Stultz <jstultz@google.com>
Link: https://patch.msgid.link/20250701132628.426168092@linutronix.de
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/ptp_clock_kernel.h | 34 ++++------------------------------
 1 file changed, 4 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h
index eced7e9bf69a..3d089bd4d5e9 100644
--- a/include/linux/ptp_clock_kernel.h
+++ b/include/linux/ptp_clock_kernel.h
@@ -477,40 +477,14 @@ static inline ktime_t ptp_convert_timestamp(const ktime_t *hwtstamp,
 
 static inline void ptp_read_system_prets(struct ptp_system_timestamp *sts)
 {
-	if (sts) {
-		switch (sts->clockid) {
-		case CLOCK_REALTIME:
-			ktime_get_real_ts64(&sts->pre_ts);
-			break;
-		case CLOCK_MONOTONIC:
-			ktime_get_ts64(&sts->pre_ts);
-			break;
-		case CLOCK_MONOTONIC_RAW:
-			ktime_get_raw_ts64(&sts->pre_ts);
-			break;
-		default:
-			break;
-		}
-	}
+	if (sts)
+		ktime_get_clock_ts64(sts->clockid, &sts->pre_ts);
 }
 
 static inline void ptp_read_system_postts(struct ptp_system_timestamp *sts)
 {
-	if (sts) {
-		switch (sts->clockid) {
-		case CLOCK_REALTIME:
-			ktime_get_real_ts64(&sts->post_ts);
-			break;
-		case CLOCK_MONOTONIC:
-			ktime_get_ts64(&sts->post_ts);
-			break;
-		case CLOCK_MONOTONIC_RAW:
-			ktime_get_raw_ts64(&sts->post_ts);
-			break;
-		default:
-			break;
-		}
-	}
+	if (sts)
+		ktime_get_clock_ts64(sts->clockid, &sts->post_ts);
 }
 
 #endif
-- 
cgit v1.2.3


From 858e65af91351f8842bbe2c5ae6f100778783f42 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 26 Jun 2025 16:48:58 +0200
Subject: irqdomain: Add device pointer to irq_domain_info and msi_domain_info

Add device pointer to irq_domain_info and msi_domain_info, so that the device
can be specified at domain creation time.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Nam Cao <namcao@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/943e52403b20cf13c320d55bd4446b4562466aab.1750860131.git.namcao@linutronix.de
---
 include/linux/irqdomain.h | 2 ++
 include/linux/msi.h       | 2 ++
 kernel/irq/irqdomain.c    | 1 +
 kernel/irq/msi.c          | 3 ++-
 4 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 7387d183029b..266b5e5bb8ce 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -279,6 +279,7 @@ struct irq_domain_chip_generic_info;
  *			domains are added using same fwnode
  * @ops:		Domain operation callbacks
  * @host_data:		Controller private data pointer
+ * @dev:		Device which creates the domain
  * @dgc_info:		Geneneric chip information structure pointer used to
  *			create generic chips for the domain if not NULL.
  * @init:		Function called when the domain is created.
@@ -298,6 +299,7 @@ struct irq_domain_info {
 	const char				*name_suffix;
 	const struct irq_domain_ops		*ops;
 	void					*host_data;
+	struct device				*dev;
 #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
 	/**
 	 * @parent: Pointer to the parent irq domain used in a hierarchy domain
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 6863540f4b71..77227d23ea84 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -488,6 +488,7 @@ struct msi_domain_ops {
  *			gets initialized to the maximum software index limit
  *			by the domain creation code.
  * @ops:		The callback data structure
+ * @dev:		Device which creates the domain
  * @chip:		Optional: associated interrupt chip
  * @chip_data:		Optional: associated interrupt chip data
  * @handler:		Optional: associated interrupt flow handler
@@ -501,6 +502,7 @@ struct msi_domain_info {
 	enum irq_domain_bus_token	bus_token;
 	unsigned int			hwsize;
 	struct msi_domain_ops		*ops;
+	struct device			*dev;
 	struct irq_chip			*chip;
 	void				*chip_data;
 	irq_flow_handler_t		handler;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index c8b6de09047b..4afbd3ac532f 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -317,6 +317,7 @@ static struct irq_domain *__irq_domain_instantiate(const struct irq_domain_info
 
 	domain->flags |= info->domain_flags;
 	domain->exit = info->exit;
+	domain->dev = info->dev;
 
 #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
 	if (info->parent) {
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 9febe797a5f6..9b09ad3f9914 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -889,6 +889,7 @@ static struct irq_domain *__msi_create_irq_domain(struct fwnode_handle *fwnode,
 
 	if (domain) {
 		irq_domain_update_bus_token(domain, info->bus_token);
+		domain->dev = info->dev;
 		if (info->flags & MSI_FLAG_PARENT_PM_DEV)
 			domain->pm_dev = parent->pm_dev;
 	}
@@ -1051,6 +1052,7 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
 	bundle->info.data = domain_data;
 	bundle->info.chip_data = chip_data;
 	bundle->info.alloc_data = &bundle->alloc_info;
+	bundle->info.dev = dev;
 
 	pops = parent->msi_parent_ops;
 	snprintf(bundle->name, sizeof(bundle->name), "%s%s-%s",
@@ -1089,7 +1091,6 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
 	if (!domain)
 		return false;
 
-	domain->dev = dev;
 	dev->msi.data->__domains[domid].domain = domain;
 
 	if (msi_domain_prepare_irqs(domain, dev, hwsize, &bundle->alloc_info)) {
-- 
cgit v1.2.3


From 25a59e813cd2ca728047f657d64f9b29480be393 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@riscstar.com>
Date: Wed, 2 Jul 2025 06:37:03 -0500
Subject: dt-bindings: soc: spacemit: define spacemit,k1-ccu resets

There are additional SpacemiT syscon CCUs whose registers control both
clocks and resets:  RCPU, RCPU2, and APBC2. Unlike those defined
previously, these will (initially) support only resets.  They do not
incorporate power domain functionality.

Previously the clock properties were required for all compatible nodes.
Make that requirement only apply to the three existing CCUs (APBC, APMU,
and MPMU), so that the new reset-only CCUs can go without specifying them.

Define the index values for resets associated with all SpacemiT K1
syscon nodes, including those with clocks already defined, as well as
the new ones (without clocks).

Signed-off-by: Alex Elder <elder@riscstar.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Reviewed-by: Yixun Lan <dlan@gentoo.org>
Link: https://lore.kernel.org/r/20250702113709.291748-2-elder@riscstar.com
Signed-off-by: Yixun Lan <dlan@gentoo.org>
---
 .../bindings/soc/spacemit/spacemit,k1-syscon.yaml  |  27 +++-
 include/dt-bindings/clock/spacemit,k1-syscon.h     | 141 +++++++++++++++++++++
 2 files changed, 162 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/soc/spacemit/spacemit,k1-syscon.yaml b/Documentation/devicetree/bindings/soc/spacemit/spacemit,k1-syscon.yaml
index 30aaf49da03d..133a391ee68c 100644
--- a/Documentation/devicetree/bindings/soc/spacemit/spacemit,k1-syscon.yaml
+++ b/Documentation/devicetree/bindings/soc/spacemit/spacemit,k1-syscon.yaml
@@ -19,6 +19,9 @@ properties:
       - spacemit,k1-syscon-apbc
       - spacemit,k1-syscon-apmu
       - spacemit,k1-syscon-mpmu
+      - spacemit,k1-syscon-rcpu
+      - spacemit,k1-syscon-rcpu2
+      - spacemit,k1-syscon-apbc2
 
   reg:
     maxItems: 1
@@ -47,9 +50,6 @@ properties:
 required:
   - compatible
   - reg
-  - clocks
-  - clock-names
-  - "#clock-cells"
   - "#reset-cells"
 
 allOf:
@@ -57,13 +57,28 @@ allOf:
       properties:
         compatible:
           contains:
-            const: spacemit,k1-syscon-apbc
+            enum:
+              - spacemit,k1-syscon-apmu
+              - spacemit,k1-syscon-mpmu
     then:
+      required:
+        - "#power-domain-cells"
+    else:
       properties:
         "#power-domain-cells": false
-    else:
+  - if:
+      properties:
+        compatible:
+          contains:
+            enum:
+              - spacemit,k1-syscon-apbc
+              - spacemit,k1-syscon-apmu
+              - spacemit,k1-syscon-mpmu
+    then:
       required:
-        - "#power-domain-cells"
+        - clocks
+        - clock-names
+        - "#clock-cells"
 
 additionalProperties: false
 
diff --git a/include/dt-bindings/clock/spacemit,k1-syscon.h b/include/dt-bindings/clock/spacemit,k1-syscon.h
index 35968ae98246..2714c3fe66cd 100644
--- a/include/dt-bindings/clock/spacemit,k1-syscon.h
+++ b/include/dt-bindings/clock/spacemit,k1-syscon.h
@@ -78,6 +78,9 @@
 #define CLK_APB			31
 #define CLK_WDT_BUS		32
 
+/* MPMU resets */
+#define RESET_WDT		0
+
 /* APBC clocks */
 #define CLK_UART0		0
 #define CLK_UART2		1
@@ -180,6 +183,59 @@
 #define CLK_TSEN_BUS		98
 #define CLK_IPC_AP2AUD_BUS	99
 
+/* APBC resets */
+#define RESET_UART0		0
+#define RESET_UART2		1
+#define RESET_UART3		2
+#define RESET_UART4		3
+#define RESET_UART5		4
+#define RESET_UART6		5
+#define RESET_UART7		6
+#define RESET_UART8		7
+#define RESET_UART9		8
+#define RESET_GPIO		9
+#define RESET_PWM0		10
+#define RESET_PWM1		11
+#define RESET_PWM2		12
+#define RESET_PWM3		13
+#define RESET_PWM4		14
+#define RESET_PWM5		15
+#define RESET_PWM6		16
+#define RESET_PWM7		17
+#define RESET_PWM8		18
+#define RESET_PWM9		19
+#define RESET_PWM10		20
+#define RESET_PWM11		21
+#define RESET_PWM12		22
+#define RESET_PWM13		23
+#define RESET_PWM14		24
+#define RESET_PWM15		25
+#define RESET_PWM16		26
+#define RESET_PWM17		27
+#define RESET_PWM18		28
+#define RESET_PWM19		29
+#define RESET_SSP3		30
+#define RESET_RTC		31
+#define RESET_TWSI0		32
+#define RESET_TWSI1		33
+#define RESET_TWSI2		34
+#define RESET_TWSI4		35
+#define RESET_TWSI5		36
+#define RESET_TWSI6		37
+#define RESET_TWSI7		38
+#define RESET_TWSI8		39
+#define RESET_TIMERS1		40
+#define RESET_TIMERS2		41
+#define RESET_AIB		42
+#define RESET_ONEWIRE		43
+#define RESET_SSPA0		44
+#define RESET_SSPA1		45
+#define RESET_DRO		46
+#define RESET_IR		47
+#define RESET_TSEN		48
+#define RESET_IPC_AP2AUD	49
+#define RESET_CAN0		50
+
 /* APMU clocks */
 #define CLK_CCI550		0
 #define CLK_CPU_C0_HI		1
@@ -244,4 +300,89 @@
 #define CLK_V2D			60
 #define CLK_EMMC_BUS		61
 
+/* APMU resets */
+#define RESET_CCIC_4X		0
+#define RESET_CCIC1_PHY		1
+#define RESET_SDH_AXI		2
+#define RESET_SDH0		3
+#define RESET_SDH1		4
+#define RESET_SDH2		5
+#define RESET_USBP1_AXI		6
+#define RESET_USB_AXI		7
+#define RESET_USB30_AHB		8
+#define RESET_USB30_VCC		9
+#define RESET_USB30_PHY		10
+#define RESET_QSPI		11
+#define RESET_QSPI_BUS		12
+#define RESET_DMA		13
+#define RESET_AES		14
+#define RESET_VPU		15
+#define RESET_GPU		16
+#define RESET_EMMC		17
+#define RESET_EMMC_X		18
+#define RESET_AUDIO_SYS		19
+#define RESET_AUDIO_MCU		20
+#define RESET_AUDIO_APMU	21
+#define RESET_HDMI		22
+#define RESET_PCIE0_MASTER	23
+#define RESET_PCIE0_SLAVE	24
+#define RESET_PCIE0_DBI		25
+#define RESET_PCIE0_GLOBAL	26
+#define RESET_PCIE1_MASTER	27
+#define RESET_PCIE1_SLAVE	28
+#define RESET_PCIE1_DBI		29
+#define RESET_PCIE1_GLOBAL	30
+#define RESET_PCIE2_MASTER	31
+#define RESET_PCIE2_SLAVE	32
+#define RESET_PCIE2_DBI		33
+#define RESET_PCIE2_GLOBAL	34
+#define RESET_EMAC0		35
+#define RESET_EMAC1		36
+#define RESET_JPG		37
+#define RESET_CCIC2PHY		38
+#define RESET_CCIC3PHY		39
+#define RESET_CSI		40
+#define RESET_ISP_CPP		41
+#define RESET_ISP_BUS		42
+#define RESET_ISP		43
+#define RESET_ISP_CI		44
+#define RESET_DPU_MCLK		45
+#define RESET_DPU_ESC		46
+#define RESET_DPU_HCLK		47
+#define RESET_DPU_SPIBUS	48
+#define RESET_DPU_SPI_HBUS	49
+#define RESET_V2D		50
+#define RESET_MIPI		51
+#define RESET_MC		52
+
+/*	RCPU resets	*/
+#define RESET_RCPU_SSP0		0
+#define RESET_RCPU_I2C0		1
+#define RESET_RCPU_UART1	2
+#define RESET_RCPU_IR		3
+#define RESET_RCPU_CAN		4
+#define RESET_RCPU_UART0	5
+#define RESET_RCPU_HDMI_AUDIO	6
+
+/*	RCPU2 resets	*/
+#define RESET_RCPU2_PWM0	0
+#define RESET_RCPU2_PWM1	1
+#define RESET_RCPU2_PWM2	2
+#define RESET_RCPU2_PWM3	3
+#define RESET_RCPU2_PWM4	4
+#define RESET_RCPU2_PWM5	5
+#define RESET_RCPU2_PWM6	6
+#define RESET_RCPU2_PWM7	7
+#define RESET_RCPU2_PWM8	8
+#define RESET_RCPU2_PWM9	9
+
+/*	APBC2 resets	*/
+#define RESET_APBC2_UART1	0
+#define RESET_APBC2_SSP2	1
+#define RESET_APBC2_TWSI3	2
+#define RESET_APBC2_RTC		3
+#define RESET_APBC2_TIMERS0	4
+#define RESET_APBC2_KPC		5
+#define RESET_APBC2_GPIO	6
+
 #endif /* _DT_BINDINGS_SPACEMIT_CCU_H_ */
-- 
cgit v1.2.3


From 9047685cfd2911c36ce89a16270aafa71057c507 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 26 Jun 2025 18:42:44 +0300
Subject: PM: Don't use "proxy" headers

Update header inclusions to follow IWYU (Include What You Use)
principle.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://patch.msgid.link/20250626154244.324265-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/pm.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/pm.h b/include/linux/pm.h
index f0bd8fbae4f2..938b1b446a5d 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -8,14 +8,15 @@
 #ifndef _LINUX_PM_H
 #define _LINUX_PM_H
 
+#include <linux/completion.h>
 #include <linux/export.h>
-#include <linux/list.h>
-#include <linux/workqueue.h>
+#include <linux/hrtimer_types.h>
+#include <linux/mutex.h>
 #include <linux/spinlock.h>
+#include <linux/types.h>
+#include <linux/util_macros.h>
 #include <linux/wait.h>
-#include <linux/timer.h>
-#include <linux/hrtimer.h>
-#include <linux/completion.h>
+#include <linux/workqueue_types.h>
 
 /*
  * Callbacks for platform drivers to implement.
-- 
cgit v1.2.3


From c021c1b38f90d639423c1369625daa703a8472ea Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Fri, 27 Jun 2025 21:08:48 +0200
Subject: PM: Move two sleep-related functions under CONFIG_PM_SLEEP

Since pm_runtime_force_resume() and pm_runtime_need_not_resume() are only
needed for handling system-wide PM transitions, there is no reason to
compile them in if CONFIG_PM_SLEEP is unset.

Accordingly, move them under CONFIG_PM_SLEEP and make the static
inline stub for pm_runtime_force_resume() return an error to indicate
that it should not be used outside CONFIG_PM_SLEEP.

Putting pm_runtime_force_resume() under CONFIG_PM_SLEEP also allows
subsequent changes to be more straightforward because this function is
going to access a device PM flag that is only defined when CONFIG_PM_SLEEP
is set.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://patch.msgid.link/3384523.aeNJFYEL58@rjwysocki.net
---
 drivers/base/power/runtime.c | 18 +++++++++++-------
 include/linux/pm_runtime.h   | 16 ++++++++++++----
 2 files changed, 23 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 7dd815ab83a0..fe1f7cc663ac 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -1941,13 +1941,6 @@ void pm_runtime_drop_link(struct device_link *link)
 	pm_request_idle(link->supplier);
 }
 
-bool pm_runtime_need_not_resume(struct device *dev)
-{
-	return atomic_read(&dev->power.usage_count) <= 1 &&
-		(atomic_read(&dev->power.child_count) == 0 ||
-		 dev->power.ignore_children);
-}
-
 /**
  * pm_runtime_force_suspend - Force a device into suspend state if needed.
  * @dev: Device to suspend.
@@ -2009,6 +2002,8 @@ err:
 }
 EXPORT_SYMBOL_GPL(pm_runtime_force_suspend);
 
+#ifdef CONFIG_PM_SLEEP
+
 /**
  * pm_runtime_force_resume - Force a device into resume state if needed.
  * @dev: Device to resume.
@@ -2052,3 +2047,12 @@ out:
 	return ret;
 }
 EXPORT_SYMBOL_GPL(pm_runtime_force_resume);
+
+bool pm_runtime_need_not_resume(struct device *dev)
+{
+	return atomic_read(&dev->power.usage_count) <= 1 &&
+		(atomic_read(&dev->power.child_count) == 0 ||
+		 dev->power.ignore_children);
+}
+
+#endif /* CONFIG_PM_SLEEP */
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index e7cb70fcc0af..9bea07f22041 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -66,9 +66,7 @@ static inline bool queue_pm_work(struct work_struct *work)
 
 extern int pm_generic_runtime_suspend(struct device *dev);
 extern int pm_generic_runtime_resume(struct device *dev);
-extern bool pm_runtime_need_not_resume(struct device *dev);
 extern int pm_runtime_force_suspend(struct device *dev);
-extern int pm_runtime_force_resume(struct device *dev);
 
 extern int __pm_runtime_idle(struct device *dev, int rpmflags);
 extern int __pm_runtime_suspend(struct device *dev, int rpmflags);
@@ -257,9 +255,7 @@ static inline bool queue_pm_work(struct work_struct *work) { return false; }
 
 static inline int pm_generic_runtime_suspend(struct device *dev) { return 0; }
 static inline int pm_generic_runtime_resume(struct device *dev) { return 0; }
-static inline bool pm_runtime_need_not_resume(struct device *dev) {return true; }
 static inline int pm_runtime_force_suspend(struct device *dev) { return 0; }
-static inline int pm_runtime_force_resume(struct device *dev) { return 0; }
 
 static inline int __pm_runtime_idle(struct device *dev, int rpmflags)
 {
@@ -330,6 +326,18 @@ static inline void pm_runtime_release_supplier(struct device_link *link) {}
 
 #endif /* !CONFIG_PM */
 
+#ifdef CONFIG_PM_SLEEP
+
+bool pm_runtime_need_not_resume(struct device *dev);
+int pm_runtime_force_resume(struct device *dev);
+
+#else /* !CONFIG_PM_SLEEP */
+
+static inline bool pm_runtime_need_not_resume(struct device *dev) {return true; }
+static inline int pm_runtime_force_resume(struct device *dev) { return -ENXIO; }
+
+#endif /* CONFIG_PM_SLEEP */
+
 /**
  * pm_runtime_idle - Conditionally set up autosuspend of a device or suspend it.
  * @dev: Target device.
-- 
cgit v1.2.3


From ffda4ca4608ea811aee2aace211bbf27c68a8853 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Fri, 27 Jun 2025 21:23:42 +0200
Subject: PM: sleep: Add strict_midlayer flag to struct dev_pm_info

Add a new flag, called strict_midlayer, to struct dev_pm_info, along
with helper functions for updating and reading its value, to allow
middle layer code that provides proper callbacks for device suspend-
resume during system-wide PM transitions to let pm_runtime_force_suspend()
and pm_runtime_force_resume() know that they should only invoke
runtime PM callbacks coming from the device's driver.

Namely, if this flag is set, pm_runtime_force_suspend() and
pm_runtime_force_resume() will invoke runtime PM callbacks
provided by the device's driver directly with the assumption that
they have been called via a middle layer callback for device suspend
or resume, respectively.

For instance, acpi_general_pm_domain provides specific
callback functions for system suspend, acpi_subsys_suspend(),
acpi_subsys_suspend_late() and acpi_subsys_suspend_noirq(), and
it does not expect its runtime suspend callback function,
acpi_subsys_runtime_suspend(), to be invoked at any point during
system suspend. In particular, it does not expect that function
to be called from within any of the system suspend callback functions
mentioned above which would happen if a device driver collaborating
with acpi_general_pm_domain used pm_runtime_force_suspend() as its
callback function for any system suspend phase later than "prepare".

The new flag allows this expectation of acpi_general_pm_domain to
be formally expressed, which is going to be done subsequently.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://patch.msgid.link/24017035.6Emhk5qWAg@rjwysocki.net
---
 drivers/base/power/runtime.c | 21 +++++++++++++++++++--
 include/linux/device.h       | 27 +++++++++++++++++++++++++++
 include/linux/pm.h           |  1 +
 3 files changed, 47 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 8cd1a4db5e84..05ff3d2209e6 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -1958,6 +1958,23 @@ void pm_runtime_drop_link(struct device_link *link)
 	pm_request_idle(link->supplier);
 }
 
+static pm_callback_t get_callback(struct device *dev, size_t cb_offset)
+{
+	/*
+	 * Setting power.strict_midlayer means that the middle layer
+	 * code does not want its runtime PM callbacks to be invoked via
+	 * pm_runtime_force_suspend() and pm_runtime_force_resume(), so
+	 * return a direct pointer to the driver callback in that case.
+	 */
+	if (dev_pm_strict_midlayer_is_set(dev))
+		return __rpm_get_driver_callback(dev, cb_offset);
+
+	return __rpm_get_callback(dev, cb_offset);
+}
+
+#define GET_CALLBACK(dev, callback) \
+		get_callback(dev, offsetof(struct dev_pm_ops, callback))
+
 /**
  * pm_runtime_force_suspend - Force a device into suspend state if needed.
  * @dev: Device to suspend.
@@ -1984,7 +2001,7 @@ int pm_runtime_force_suspend(struct device *dev)
 	if (pm_runtime_status_suspended(dev) || dev->power.needs_force_resume)
 		return 0;
 
-	callback = RPM_GET_CALLBACK(dev, runtime_suspend);
+	callback = GET_CALLBACK(dev, runtime_suspend);
 
 	dev_pm_enable_wake_irq_check(dev, true);
 	ret = callback ? callback(dev) : 0;
@@ -2046,7 +2063,7 @@ int pm_runtime_force_resume(struct device *dev)
 	    pm_runtime_status_suspended(dev)))
 		goto out;
 
-	callback = RPM_GET_CALLBACK(dev, runtime_resume);
+	callback = GET_CALLBACK(dev, runtime_resume);
 
 	dev_pm_disable_wake_irq_check(dev, false);
 	ret = callback ? callback(dev) : 0;
diff --git a/include/linux/device.h b/include/linux/device.h
index 4940db137fff..5137f9d213ec 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -879,6 +879,33 @@ static inline bool dev_pm_smart_suspend(struct device *dev)
 #endif
 }
 
+/*
+ * dev_pm_set_strict_midlayer - Update the device's power.strict_midlayer flag
+ * @dev: Target device.
+ * @val: New flag value.
+ *
+ * When set, power.strict_midlayer means that the middle layer power management
+ * code (typically, a bus type or a PM domain) does not expect its runtime PM
+ * suspend callback to be invoked at all during system-wide PM transitions and
+ * it does not expect its runtime PM resume callback to be invoked at any point
+ * when runtime PM is disabled for the device during system-wide PM transitions.
+ */
+static inline void dev_pm_set_strict_midlayer(struct device *dev, bool val)
+{
+#ifdef CONFIG_PM_SLEEP
+	dev->power.strict_midlayer = val;
+#endif
+}
+
+static inline bool dev_pm_strict_midlayer_is_set(struct device *dev)
+{
+#ifdef CONFIG_PM_SLEEP
+	return dev->power.strict_midlayer;
+#else
+	return false;
+#endif
+}
+
 static inline void device_lock(struct device *dev)
 {
 	mutex_lock(&dev->mutex);
diff --git a/include/linux/pm.h b/include/linux/pm.h
index f0bd8fbae4f2..4149d45f6f76 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -683,6 +683,7 @@ struct dev_pm_info {
 	bool			smart_suspend:1;	/* Owned by the PM core */
 	bool			must_resume:1;		/* Owned by the PM core */
 	bool			may_skip_resume:1;	/* Set by subsystems */
+	bool			strict_midlayer:1;
 #else
 	bool			should_wakeup:1;
 #endif
-- 
cgit v1.2.3


From f8e656382b4aa45ae51135b72262044550224920 Mon Sep 17 00:00:00 2001
From: Philip Radford <philip.radford@arm.com>
Date: Mon, 30 Jun 2025 10:55:43 +0000
Subject: include: trace: Add tracepoint support for inflight xfer count

Enhance the existing SCMI transfer tracepoints by including the current
in-flight transfer count in `scmi_xfer_begin` and `scmi_xfer_end`.

Introduce a new helper `scmi_inflight_count()` to retrieve the active
transfer count from the SCMI debug counters when debug is enabled.

This trace data is useful for visualizing transfer activity over time
and identifying congestion or unexpected behavior in SCMI messaging.

Reviewed-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Philip Radford <philip.radford@arm.com>
Message-Id: <20250630105544.531723-4-philip.radford@arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_scmi/common.h   |  1 +
 drivers/firmware/arm_scmi/driver.c   | 17 +++++++++++++++--
 drivers/firmware/arm_scmi/raw_mode.c |  6 ++++--
 include/trace/events/scmi.h          | 24 ++++++++++++++----------
 4 files changed, 34 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/drivers/firmware/arm_scmi/common.h b/drivers/firmware/arm_scmi/common.h
index ad9232c982ce..07b9e629276d 100644
--- a/drivers/firmware/arm_scmi/common.h
+++ b/drivers/firmware/arm_scmi/common.h
@@ -505,4 +505,5 @@ static struct platform_driver __drv = {					       \
 void scmi_notification_instance_data_set(const struct scmi_handle *handle,
 					 void *priv);
 void *scmi_notification_instance_data_get(const struct scmi_handle *handle);
+int scmi_inflight_count(const struct scmi_handle *handle);
 #endif /* _SCMI_COMMON_H */
diff --git a/drivers/firmware/arm_scmi/driver.c b/drivers/firmware/arm_scmi/driver.c
index 5a4dac27afdf..bd56a877fdfc 100644
--- a/drivers/firmware/arm_scmi/driver.c
+++ b/drivers/firmware/arm_scmi/driver.c
@@ -1443,7 +1443,8 @@ static int do_xfer(const struct scmi_protocol_handle *ph,
 
 	trace_scmi_xfer_begin(xfer->transfer_id, xfer->hdr.id,
 			      xfer->hdr.protocol_id, xfer->hdr.seq,
-			      xfer->hdr.poll_completion);
+			      xfer->hdr.poll_completion,
+			      scmi_inflight_count(&info->handle));
 
 	/* Clear any stale status */
 	xfer->hdr.status = SCMI_SUCCESS;
@@ -1479,7 +1480,8 @@ static int do_xfer(const struct scmi_protocol_handle *ph,
 		info->desc->ops->mark_txdone(cinfo, ret, xfer);
 
 	trace_scmi_xfer_end(xfer->transfer_id, xfer->hdr.id,
-			    xfer->hdr.protocol_id, xfer->hdr.seq, ret);
+			    xfer->hdr.protocol_id, xfer->hdr.seq, ret,
+			    scmi_inflight_count(&info->handle));
 
 	return ret;
 }
@@ -3416,6 +3418,17 @@ static struct dentry *scmi_debugfs_init(void)
 	return d;
 }
 
+int scmi_inflight_count(const struct scmi_handle *handle)
+{
+	if (IS_ENABLED(CONFIG_ARM_SCMI_DEBUG_COUNTERS)) {
+		struct scmi_info *info = handle_to_scmi_info(handle);
+
+		return atomic_read(&info->dbg->counters[XFERS_INFLIGHT]);
+	} else {
+		return 0;
+	}
+}
+
 static int __init scmi_driver_init(void)
 {
 	scmi_quirks_initialize();
diff --git a/drivers/firmware/arm_scmi/raw_mode.c b/drivers/firmware/arm_scmi/raw_mode.c
index 3d543b1d8947..73db5492ab44 100644
--- a/drivers/firmware/arm_scmi/raw_mode.c
+++ b/drivers/firmware/arm_scmi/raw_mode.c
@@ -475,7 +475,8 @@ static void scmi_xfer_raw_worker(struct work_struct *work)
 			raw->desc->ops->mark_txdone(rw->cinfo, ret, xfer);
 
 		trace_scmi_xfer_end(xfer->transfer_id, xfer->hdr.id,
-				    xfer->hdr.protocol_id, xfer->hdr.seq, ret);
+				    xfer->hdr.protocol_id, xfer->hdr.seq,
+				    ret, scmi_inflight_count(raw->handle));
 
 		/* Wait also for an async delayed response if needed */
 		if (!ret && xfer->async_done) {
@@ -642,7 +643,8 @@ static int scmi_do_xfer_raw_start(struct scmi_raw_mode_info *raw,
 
 	trace_scmi_xfer_begin(xfer->transfer_id, xfer->hdr.id,
 			      xfer->hdr.protocol_id, xfer->hdr.seq,
-			      xfer->hdr.poll_completion);
+			      xfer->hdr.poll_completion,
+			      scmi_inflight_count(raw->handle));
 
 	ret = raw->desc->ops->send_message(rw->cinfo, xfer);
 	if (ret) {
diff --git a/include/trace/events/scmi.h b/include/trace/events/scmi.h
index 127300481123..703b7bb68e44 100644
--- a/include/trace/events/scmi.h
+++ b/include/trace/events/scmi.h
@@ -36,8 +36,8 @@ TRACE_EVENT(scmi_fc_call,
 
 TRACE_EVENT(scmi_xfer_begin,
 	TP_PROTO(int transfer_id, u8 msg_id, u8 protocol_id, u16 seq,
-		 bool poll),
-	TP_ARGS(transfer_id, msg_id, protocol_id, seq, poll),
+		 bool poll, int inflight),
+	TP_ARGS(transfer_id, msg_id, protocol_id, seq, poll, inflight),
 
 	TP_STRUCT__entry(
 		__field(int, transfer_id)
@@ -45,6 +45,7 @@ TRACE_EVENT(scmi_xfer_begin,
 		__field(u8, protocol_id)
 		__field(u16, seq)
 		__field(bool, poll)
+		__field(int, inflight)
 	),
 
 	TP_fast_assign(
@@ -53,11 +54,12 @@ TRACE_EVENT(scmi_xfer_begin,
 		__entry->protocol_id = protocol_id;
 		__entry->seq = seq;
 		__entry->poll = poll;
+		__entry->inflight = inflight;
 	),
 
-	TP_printk("pt=%02X msg_id=%02X seq=%04X transfer_id=%X poll=%u",
-		__entry->protocol_id, __entry->msg_id, __entry->seq,
-		__entry->transfer_id, __entry->poll)
+	TP_printk("pt=%02X msg_id=%02X seq=%04X transfer_id=%X poll=%u inflight=%d",
+		  __entry->protocol_id, __entry->msg_id, __entry->seq,
+		  __entry->transfer_id, __entry->poll, __entry->inflight)
 );
 
 TRACE_EVENT(scmi_xfer_response_wait,
@@ -90,8 +92,8 @@ TRACE_EVENT(scmi_xfer_response_wait,
 
 TRACE_EVENT(scmi_xfer_end,
 	TP_PROTO(int transfer_id, u8 msg_id, u8 protocol_id, u16 seq,
-		 int status),
-	TP_ARGS(transfer_id, msg_id, protocol_id, seq, status),
+		 int status, int inflight),
+	TP_ARGS(transfer_id, msg_id, protocol_id, seq, status, inflight),
 
 	TP_STRUCT__entry(
 		__field(int, transfer_id)
@@ -99,6 +101,7 @@ TRACE_EVENT(scmi_xfer_end,
 		__field(u8, protocol_id)
 		__field(u16, seq)
 		__field(int, status)
+		__field(int, inflight)
 	),
 
 	TP_fast_assign(
@@ -107,11 +110,12 @@ TRACE_EVENT(scmi_xfer_end,
 		__entry->protocol_id = protocol_id;
 		__entry->seq = seq;
 		__entry->status = status;
+		__entry->inflight = inflight;
 	),
 
-	TP_printk("pt=%02X msg_id=%02X seq=%04X transfer_id=%X s=%d",
-		__entry->protocol_id, __entry->msg_id, __entry->seq,
-		__entry->transfer_id, __entry->status)
+	TP_printk("pt=%02X msg_id=%02X seq=%04X transfer_id=%X s=%d inflight=%d",
+		  __entry->protocol_id, __entry->msg_id, __entry->seq,
+		  __entry->transfer_id, __entry->status, __entry->inflight)
 );
 
 TRACE_EVENT(scmi_rx_done,
-- 
cgit v1.2.3


From bf6239ddaa6a73a44cd8ea3afec5fc82ed900038 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@riscstar.com>
Date: Wed, 2 Jul 2025 06:37:04 -0500
Subject: soc: spacemit: create a header for clock/reset registers

Move the definitions of register offsets and fields used by the SpacemiT
K1 SoC CCUs into a separate header file, so that they can be shared by
the reset driver that will be found under drivers/reset.

Signed-off-by: Alex Elder <elder@riscstar.com>
Reviewed-by: Haylen Chu <heylenay@4d2.org>
Reviewed-by: Yixun Lan <dlan@gentoo.org>
Link: https://lore.kernel.org/r/20250702113709.291748-3-elder@riscstar.com
Signed-off-by: Yixun Lan <dlan@gentoo.org>
---
 drivers/clk/spacemit/ccu-k1.c    | 111 +-----------------------------------
 include/soc/spacemit/k1-syscon.h | 118 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 119 insertions(+), 110 deletions(-)
 create mode 100644 include/soc/spacemit/k1-syscon.h

(limited to 'include')

diff --git a/drivers/clk/spacemit/ccu-k1.c b/drivers/clk/spacemit/ccu-k1.c
index cdde37a05235..801150f4ff0f 100644
--- a/drivers/clk/spacemit/ccu-k1.c
+++ b/drivers/clk/spacemit/ccu-k1.c
@@ -11,6 +11,7 @@
 #include <linux/minmax.h>
 #include <linux/module.h>
 #include <linux/platform_device.h>
+#include <soc/spacemit/k1-syscon.h>
 
 #include "ccu_common.h"
 #include "ccu_pll.h"
@@ -19,116 +20,6 @@
 
 #include <dt-bindings/clock/spacemit,k1-syscon.h>
 
-/* APBS register offset */
-#define APBS_PLL1_SWCR1			0x100
-#define APBS_PLL1_SWCR2			0x104
-#define APBS_PLL1_SWCR3			0x108
-#define APBS_PLL2_SWCR1			0x118
-#define APBS_PLL2_SWCR2			0x11c
-#define APBS_PLL2_SWCR3			0x120
-#define APBS_PLL3_SWCR1			0x124
-#define APBS_PLL3_SWCR2			0x128
-#define APBS_PLL3_SWCR3			0x12c
-
-/* MPMU register offset */
-#define MPMU_POSR			0x0010
-#define  POSR_PLL1_LOCK			BIT(27)
-#define  POSR_PLL2_LOCK			BIT(28)
-#define  POSR_PLL3_LOCK			BIT(29)
-#define MPMU_SUCCR			0x0014
-#define MPMU_ISCCR			0x0044
-#define MPMU_WDTPCR			0x0200
-#define MPMU_RIPCCR			0x0210
-#define MPMU_ACGR			0x1024
-#define MPMU_APBCSCR			0x1050
-#define MPMU_SUCCR_1			0x10b0
-
-/* APBC register offset */
-#define APBC_UART1_CLK_RST		0x00
-#define APBC_UART2_CLK_RST		0x04
-#define APBC_GPIO_CLK_RST		0x08
-#define APBC_PWM0_CLK_RST		0x0c
-#define APBC_PWM1_CLK_RST		0x10
-#define APBC_PWM2_CLK_RST		0x14
-#define APBC_PWM3_CLK_RST		0x18
-#define APBC_TWSI8_CLK_RST		0x20
-#define APBC_UART3_CLK_RST		0x24
-#define APBC_RTC_CLK_RST		0x28
-#define APBC_TWSI0_CLK_RST		0x2c
-#define APBC_TWSI1_CLK_RST		0x30
-#define APBC_TIMERS1_CLK_RST		0x34
-#define APBC_TWSI2_CLK_RST		0x38
-#define APBC_AIB_CLK_RST		0x3c
-#define APBC_TWSI4_CLK_RST		0x40
-#define APBC_TIMERS2_CLK_RST		0x44
-#define APBC_ONEWIRE_CLK_RST		0x48
-#define APBC_TWSI5_CLK_RST		0x4c
-#define APBC_DRO_CLK_RST		0x58
-#define APBC_IR_CLK_RST			0x5c
-#define APBC_TWSI6_CLK_RST		0x60
-#define APBC_COUNTER_CLK_SEL		0x64
-#define APBC_TWSI7_CLK_RST		0x68
-#define APBC_TSEN_CLK_RST		0x6c
-#define APBC_UART4_CLK_RST		0x70
-#define APBC_UART5_CLK_RST		0x74
-#define APBC_UART6_CLK_RST		0x78
-#define APBC_SSP3_CLK_RST		0x7c
-#define APBC_SSPA0_CLK_RST		0x80
-#define APBC_SSPA1_CLK_RST		0x84
-#define APBC_IPC_AP2AUD_CLK_RST		0x90
-#define APBC_UART7_CLK_RST		0x94
-#define APBC_UART8_CLK_RST		0x98
-#define APBC_UART9_CLK_RST		0x9c
-#define APBC_CAN0_CLK_RST		0xa0
-#define APBC_PWM4_CLK_RST		0xa8
-#define APBC_PWM5_CLK_RST		0xac
-#define APBC_PWM6_CLK_RST		0xb0
-#define APBC_PWM7_CLK_RST		0xb4
-#define APBC_PWM8_CLK_RST		0xb8
-#define APBC_PWM9_CLK_RST		0xbc
-#define APBC_PWM10_CLK_RST		0xc0
-#define APBC_PWM11_CLK_RST		0xc4
-#define APBC_PWM12_CLK_RST		0xc8
-#define APBC_PWM13_CLK_RST		0xcc
-#define APBC_PWM14_CLK_RST		0xd0
-#define APBC_PWM15_CLK_RST		0xd4
-#define APBC_PWM16_CLK_RST		0xd8
-#define APBC_PWM17_CLK_RST		0xdc
-#define APBC_PWM18_CLK_RST		0xe0
-#define APBC_PWM19_CLK_RST		0xe4
-
-/* APMU register offset */
-#define APMU_JPG_CLK_RES_CTRL		0x020
-#define APMU_CSI_CCIC2_CLK_RES_CTRL	0x024
-#define APMU_ISP_CLK_RES_CTRL		0x038
-#define APMU_LCD_CLK_RES_CTRL1		0x044
-#define APMU_LCD_SPI_CLK_RES_CTRL	0x048
-#define APMU_LCD_CLK_RES_CTRL2		0x04c
-#define APMU_CCIC_CLK_RES_CTRL		0x050
-#define APMU_SDH0_CLK_RES_CTRL		0x054
-#define APMU_SDH1_CLK_RES_CTRL		0x058
-#define APMU_USB_CLK_RES_CTRL		0x05c
-#define APMU_QSPI_CLK_RES_CTRL		0x060
-#define APMU_DMA_CLK_RES_CTRL		0x064
-#define APMU_AES_CLK_RES_CTRL		0x068
-#define APMU_VPU_CLK_RES_CTRL		0x0a4
-#define APMU_GPU_CLK_RES_CTRL		0x0cc
-#define APMU_SDH2_CLK_RES_CTRL		0x0e0
-#define APMU_PMUA_MC_CTRL		0x0e8
-#define APMU_PMU_CC2_AP			0x100
-#define APMU_PMUA_EM_CLK_RES_CTRL	0x104
-#define APMU_AUDIO_CLK_RES_CTRL		0x14c
-#define APMU_HDMI_CLK_RES_CTRL		0x1b8
-#define APMU_CCI550_CLK_CTRL		0x300
-#define APMU_ACLK_CLK_CTRL		0x388
-#define APMU_CPU_C0_CLK_CTRL		0x38C
-#define APMU_CPU_C1_CLK_CTRL		0x390
-#define APMU_PCIE_CLK_RES_CTRL_0	0x3cc
-#define APMU_PCIE_CLK_RES_CTRL_1	0x3d4
-#define APMU_PCIE_CLK_RES_CTRL_2	0x3dc
-#define APMU_EMAC0_CLK_RES_CTRL		0x3e4
-#define APMU_EMAC1_CLK_RES_CTRL		0x3ec
-
 struct spacemit_ccu_data {
 	struct clk_hw **hws;
 	size_t num;
diff --git a/include/soc/spacemit/k1-syscon.h b/include/soc/spacemit/k1-syscon.h
new file mode 100644
index 000000000000..039a448c51a0
--- /dev/null
+++ b/include/soc/spacemit/k1-syscon.h
@@ -0,0 +1,118 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+/* SpacemiT clock and reset driver definitions for the K1 SoC */
+
+#ifndef __SOC_K1_SYSCON_H__
+#define __SOC_K1_SYSCON_H__
+
+/* APBS register offset */
+#define APBS_PLL1_SWCR1			0x100
+#define APBS_PLL1_SWCR2			0x104
+#define APBS_PLL1_SWCR3			0x108
+#define APBS_PLL2_SWCR1			0x118
+#define APBS_PLL2_SWCR2			0x11c
+#define APBS_PLL2_SWCR3			0x120
+#define APBS_PLL3_SWCR1			0x124
+#define APBS_PLL3_SWCR2			0x128
+#define APBS_PLL3_SWCR3			0x12c
+
+/* MPMU register offset */
+#define MPMU_POSR			0x0010
+#define  POSR_PLL1_LOCK			BIT(27)
+#define  POSR_PLL2_LOCK			BIT(28)
+#define  POSR_PLL3_LOCK			BIT(29)
+#define MPMU_SUCCR			0x0014
+#define MPMU_ISCCR			0x0044
+#define MPMU_WDTPCR			0x0200
+#define MPMU_RIPCCR			0x0210
+#define MPMU_ACGR			0x1024
+#define MPMU_APBCSCR			0x1050
+#define MPMU_SUCCR_1			0x10b0
+
+/* APBC register offset */
+#define APBC_UART1_CLK_RST		0x00
+#define APBC_UART2_CLK_RST		0x04
+#define APBC_GPIO_CLK_RST		0x08
+#define APBC_PWM0_CLK_RST		0x0c
+#define APBC_PWM1_CLK_RST		0x10
+#define APBC_PWM2_CLK_RST		0x14
+#define APBC_PWM3_CLK_RST		0x18
+#define APBC_TWSI8_CLK_RST		0x20
+#define APBC_UART3_CLK_RST		0x24
+#define APBC_RTC_CLK_RST		0x28
+#define APBC_TWSI0_CLK_RST		0x2c
+#define APBC_TWSI1_CLK_RST		0x30
+#define APBC_TIMERS1_CLK_RST		0x34
+#define APBC_TWSI2_CLK_RST		0x38
+#define APBC_AIB_CLK_RST		0x3c
+#define APBC_TWSI4_CLK_RST		0x40
+#define APBC_TIMERS2_CLK_RST		0x44
+#define APBC_ONEWIRE_CLK_RST		0x48
+#define APBC_TWSI5_CLK_RST		0x4c
+#define APBC_DRO_CLK_RST		0x58
+#define APBC_IR_CLK_RST			0x5c
+#define APBC_TWSI6_CLK_RST		0x60
+#define APBC_COUNTER_CLK_SEL		0x64
+#define APBC_TWSI7_CLK_RST		0x68
+#define APBC_TSEN_CLK_RST		0x6c
+#define APBC_UART4_CLK_RST		0x70
+#define APBC_UART5_CLK_RST		0x74
+#define APBC_UART6_CLK_RST		0x78
+#define APBC_SSP3_CLK_RST		0x7c
+#define APBC_SSPA0_CLK_RST		0x80
+#define APBC_SSPA1_CLK_RST		0x84
+#define APBC_IPC_AP2AUD_CLK_RST		0x90
+#define APBC_UART7_CLK_RST		0x94
+#define APBC_UART8_CLK_RST		0x98
+#define APBC_UART9_CLK_RST		0x9c
+#define APBC_CAN0_CLK_RST		0xa0
+#define APBC_PWM4_CLK_RST		0xa8
+#define APBC_PWM5_CLK_RST		0xac
+#define APBC_PWM6_CLK_RST		0xb0
+#define APBC_PWM7_CLK_RST		0xb4
+#define APBC_PWM8_CLK_RST		0xb8
+#define APBC_PWM9_CLK_RST		0xbc
+#define APBC_PWM10_CLK_RST		0xc0
+#define APBC_PWM11_CLK_RST		0xc4
+#define APBC_PWM12_CLK_RST		0xc8
+#define APBC_PWM13_CLK_RST		0xcc
+#define APBC_PWM14_CLK_RST		0xd0
+#define APBC_PWM15_CLK_RST		0xd4
+#define APBC_PWM16_CLK_RST		0xd8
+#define APBC_PWM17_CLK_RST		0xdc
+#define APBC_PWM18_CLK_RST		0xe0
+#define APBC_PWM19_CLK_RST		0xe4
+
+/* APMU register offset */
+#define APMU_JPG_CLK_RES_CTRL		0x020
+#define APMU_CSI_CCIC2_CLK_RES_CTRL	0x024
+#define APMU_ISP_CLK_RES_CTRL		0x038
+#define APMU_LCD_CLK_RES_CTRL1		0x044
+#define APMU_LCD_SPI_CLK_RES_CTRL	0x048
+#define APMU_LCD_CLK_RES_CTRL2		0x04c
+#define APMU_CCIC_CLK_RES_CTRL		0x050
+#define APMU_SDH0_CLK_RES_CTRL		0x054
+#define APMU_SDH1_CLK_RES_CTRL		0x058
+#define APMU_USB_CLK_RES_CTRL		0x05c
+#define APMU_QSPI_CLK_RES_CTRL		0x060
+#define APMU_DMA_CLK_RES_CTRL		0x064
+#define APMU_AES_CLK_RES_CTRL		0x068
+#define APMU_VPU_CLK_RES_CTRL		0x0a4
+#define APMU_GPU_CLK_RES_CTRL		0x0cc
+#define APMU_SDH2_CLK_RES_CTRL		0x0e0
+#define APMU_PMUA_MC_CTRL		0x0e8
+#define APMU_PMU_CC2_AP			0x100
+#define APMU_PMUA_EM_CLK_RES_CTRL	0x104
+#define APMU_AUDIO_CLK_RES_CTRL		0x14c
+#define APMU_HDMI_CLK_RES_CTRL		0x1b8
+#define APMU_CCI550_CLK_CTRL		0x300
+#define APMU_ACLK_CLK_CTRL		0x388
+#define APMU_CPU_C0_CLK_CTRL		0x38C
+#define APMU_CPU_C1_CLK_CTRL		0x390
+#define APMU_PCIE_CLK_RES_CTRL_0	0x3cc
+#define APMU_PCIE_CLK_RES_CTRL_1	0x3d4
+#define APMU_PCIE_CLK_RES_CTRL_2	0x3dc
+#define APMU_EMAC0_CLK_RES_CTRL		0x3e4
+#define APMU_EMAC1_CLK_RES_CTRL		0x3ec
+
+#endif /* __SOC_K1_SYSCON_H__ */
-- 
cgit v1.2.3


From 988543522ebd6a9af53c288833503f0501e401b0 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@riscstar.com>
Date: Wed, 2 Jul 2025 06:37:05 -0500
Subject: clk: spacemit: set up reset auxiliary devices

Add a new reset_name field to the spacemit_ccu_data structure.  If it is
non-null, the CCU implements a reset controller, and the name will be
used in the name for the auxiliary device that implements it.

Define a new type to hold an auxiliary device as well as the regmap
pointer that will be needed by CCU reset controllers.  Set up code to
initialize and add an auxiliary device for any CCU that implements reset
functionality.

Make it optional for a CCU to implement a clock controller.  This
doesn't apply to any of the existing CCUs but will for some new ones
that will be added soon.

Signed-off-by: Alex Elder <elder@riscstar.com>
Reviewed-by: Haylen Chu <heylenay@4d2.org>
Reviewed-by: Yixun Lan <dlan@gentoo.org>
Link: https://lore.kernel.org/r/20250702113709.291748-4-elder@riscstar.com
Signed-off-by: Yixun Lan <dlan@gentoo.org>
---
 drivers/clk/spacemit/Kconfig     |   1 +
 drivers/clk/spacemit/ccu-k1.c    | 104 +++++++++++++++++++++++++++++++++++----
 include/soc/spacemit/k1-syscon.h |  12 +++++
 3 files changed, 107 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/clk/spacemit/Kconfig b/drivers/clk/spacemit/Kconfig
index 4c4df845b3cb..3854f6ae6d0e 100644
--- a/drivers/clk/spacemit/Kconfig
+++ b/drivers/clk/spacemit/Kconfig
@@ -3,6 +3,7 @@
 config SPACEMIT_CCU
 	tristate "Clock support for SpacemiT SoCs"
 	depends on ARCH_SPACEMIT || COMPILE_TEST
+	select AUXILIARY_BUS
 	select MFD_SYSCON
 	help
 	  Say Y to enable clock controller unit support for SpacemiT SoCs.
diff --git a/drivers/clk/spacemit/ccu-k1.c b/drivers/clk/spacemit/ccu-k1.c
index 801150f4ff0f..1c9ed434ae93 100644
--- a/drivers/clk/spacemit/ccu-k1.c
+++ b/drivers/clk/spacemit/ccu-k1.c
@@ -5,12 +5,15 @@
  */
 
 #include <linux/array_size.h>
+#include <linux/auxiliary_bus.h>
 #include <linux/clk-provider.h>
 #include <linux/delay.h>
+#include <linux/idr.h>
 #include <linux/mfd/syscon.h>
 #include <linux/minmax.h>
 #include <linux/module.h>
 #include <linux/platform_device.h>
+#include <linux/slab.h>
 #include <soc/spacemit/k1-syscon.h>
 
 #include "ccu_common.h"
@@ -21,10 +24,13 @@
 #include <dt-bindings/clock/spacemit,k1-syscon.h>
 
 struct spacemit_ccu_data {
+	const char *reset_name;
 	struct clk_hw **hws;
 	size_t num;
 };
 
+static DEFINE_IDA(auxiliary_ids);
+
 /* APBS clocks start, APBS region contains and only contains all PLL clocks */
 
 /*
@@ -710,8 +716,9 @@ static struct clk_hw *k1_ccu_pll_hws[] = {
 };
 
 static const struct spacemit_ccu_data k1_ccu_pll_data = {
-	.hws	= k1_ccu_pll_hws,
-	.num	= ARRAY_SIZE(k1_ccu_pll_hws),
+	/* The PLL CCU implements no resets */
+	.hws		= k1_ccu_pll_hws,
+	.num		= ARRAY_SIZE(k1_ccu_pll_hws),
 };
 
 static struct clk_hw *k1_ccu_mpmu_hws[] = {
@@ -751,8 +758,9 @@ static struct clk_hw *k1_ccu_mpmu_hws[] = {
 };
 
 static const struct spacemit_ccu_data k1_ccu_mpmu_data = {
-	.hws	= k1_ccu_mpmu_hws,
-	.num	= ARRAY_SIZE(k1_ccu_mpmu_hws),
+	.reset_name	= "mpmu-reset",
+	.hws		= k1_ccu_mpmu_hws,
+	.num		= ARRAY_SIZE(k1_ccu_mpmu_hws),
 };
 
 static struct clk_hw *k1_ccu_apbc_hws[] = {
@@ -859,8 +867,9 @@ static struct clk_hw *k1_ccu_apbc_hws[] = {
 };
 
 static const struct spacemit_ccu_data k1_ccu_apbc_data = {
-	.hws	= k1_ccu_apbc_hws,
-	.num	= ARRAY_SIZE(k1_ccu_apbc_hws),
+	.reset_name	= "apbc-reset",
+	.hws		= k1_ccu_apbc_hws,
+	.num		= ARRAY_SIZE(k1_ccu_apbc_hws),
 };
 
 static struct clk_hw *k1_ccu_apmu_hws[] = {
@@ -929,8 +938,9 @@ static struct clk_hw *k1_ccu_apmu_hws[] = {
 };
 
 static const struct spacemit_ccu_data k1_ccu_apmu_data = {
-	.hws	= k1_ccu_apmu_hws,
-	.num	= ARRAY_SIZE(k1_ccu_apmu_hws),
+	.reset_name	= "apmu-reset",
+	.hws		= k1_ccu_apmu_hws,
+	.num		= ARRAY_SIZE(k1_ccu_apmu_hws),
 };
 
 static int spacemit_ccu_register(struct device *dev,
@@ -941,6 +951,10 @@ static int spacemit_ccu_register(struct device *dev,
 	struct clk_hw_onecell_data *clk_data;
 	int i, ret;
 
+	/* Nothing to do if the CCU does not implement any clocks */
+	if (!data->hws)
+		return 0;
+
 	clk_data = devm_kzalloc(dev, struct_size(clk_data, hws, data->num),
 				GFP_KERNEL);
 	if (!clk_data)
@@ -981,9 +995,74 @@ static int spacemit_ccu_register(struct device *dev,
 	return ret;
 }
 
+static void spacemit_cadev_release(struct device *dev)
+{
+	struct auxiliary_device *adev = to_auxiliary_dev(dev);
+
+	ida_free(&auxiliary_ids, adev->id);
+	kfree(to_spacemit_ccu_adev(adev));
+}
+
+static void spacemit_adev_unregister(void *data)
+{
+	struct auxiliary_device *adev = data;
+
+	auxiliary_device_delete(adev);
+	auxiliary_device_uninit(adev);
+}
+
+static int spacemit_ccu_reset_register(struct device *dev,
+				       struct regmap *regmap,
+				       const char *reset_name)
+{
+	struct spacemit_ccu_adev *cadev;
+	struct auxiliary_device *adev;
+	int ret;
+
+	/* Nothing to do if the CCU does not implement a reset controller */
+	if (!reset_name)
+		return 0;
+
+	cadev = kzalloc(sizeof(*cadev), GFP_KERNEL);
+	if (!cadev)
+		return -ENOMEM;
+
+	cadev->regmap = regmap;
+
+	adev = &cadev->adev;
+	adev->name = reset_name;
+	adev->dev.parent = dev;
+	adev->dev.release = spacemit_cadev_release;
+	adev->dev.of_node = dev->of_node;
+	ret = ida_alloc(&auxiliary_ids, GFP_KERNEL);
+	if (ret < 0)
+		goto err_free_cadev;
+	adev->id = ret;
+
+	ret = auxiliary_device_init(adev);
+	if (ret)
+		goto err_free_aux_id;
+
+	ret = auxiliary_device_add(adev);
+	if (ret) {
+		auxiliary_device_uninit(adev);
+		return ret;
+	}
+
+	return devm_add_action_or_reset(dev, spacemit_adev_unregister, adev);
+
+err_free_aux_id:
+	ida_free(&auxiliary_ids, adev->id);
+err_free_cadev:
+	kfree(cadev);
+
+	return ret;
+}
+
 static int k1_ccu_probe(struct platform_device *pdev)
 {
 	struct regmap *base_regmap, *lock_regmap = NULL;
+	const struct spacemit_ccu_data *data;
 	struct device *dev = &pdev->dev;
 	int ret;
 
@@ -1012,11 +1091,16 @@ static int k1_ccu_probe(struct platform_device *pdev)
 					     "failed to get lock regmap\n");
 	}
 
-	ret = spacemit_ccu_register(dev, base_regmap, lock_regmap,
-				    of_device_get_match_data(dev));
+	data = of_device_get_match_data(dev);
+
+	ret = spacemit_ccu_register(dev, base_regmap, lock_regmap, data);
 	if (ret)
 		return dev_err_probe(dev, ret, "failed to register clocks\n");
 
+	ret = spacemit_ccu_reset_register(dev, base_regmap, data->reset_name);
+	if (ret)
+		return dev_err_probe(dev, ret, "failed to register resets\n");
+
 	return 0;
 }
 
diff --git a/include/soc/spacemit/k1-syscon.h b/include/soc/spacemit/k1-syscon.h
index 039a448c51a0..53eff7691f33 100644
--- a/include/soc/spacemit/k1-syscon.h
+++ b/include/soc/spacemit/k1-syscon.h
@@ -5,6 +5,18 @@
 #ifndef __SOC_K1_SYSCON_H__
 #define __SOC_K1_SYSCON_H__
 
+/* Auxiliary device used to represent a CCU reset controller */
+struct spacemit_ccu_adev {
+	struct auxiliary_device adev;
+	struct regmap *regmap;
+};
+
+static inline struct spacemit_ccu_adev *
+to_spacemit_ccu_adev(struct auxiliary_device *adev)
+{
+	return container_of(adev, struct spacemit_ccu_adev, adev);
+}
+
 /* APBS register offset */
 #define APBS_PLL1_SWCR1			0x100
 #define APBS_PLL1_SWCR2			0x104
-- 
cgit v1.2.3


From f45b2949b1a235881255132a119b8cc8c3738bd5 Mon Sep 17 00:00:00 2001
From: Paul Kocialkowski <paulk@sys-base.io>
Date: Tue, 1 Jul 2025 22:11:21 +0200
Subject: clk: sunxi-ng: v3s: Fix CSI SCLK clock name

The CSI SCLK clock is incorrectly named CSI1 SCLK, even though it is used
by both the CSI0 and CSI1 interfaces and is called CSI SCLK throughout
the documentation.

Fix the name in the driver, header and device-tree.

Fixes: d0f11d14b0bc ("clk: sunxi-ng: add support for V3s CCU")
Signed-off-by: Paul Kocialkowski <paulk@sys-base.io>
Reviewed-By: Icenowy Zheng <uwu@icenowy.me>
Link: https://patch.msgid.link/20250701201124.812882-3-paulk@sys-base.io
Signed-off-by: Chen-Yu Tsai <wens@csie.org>
---
 .../devicetree/bindings/media/allwinner,sun6i-a31-csi.yaml     |  2 +-
 .../devicetree/bindings/media/allwinner,sun6i-a31-isp.yaml     |  2 +-
 .../bindings/media/allwinner,sun6i-a31-mipi-csi2.yaml          |  2 +-
 arch/arm/boot/dts/allwinner/sun8i-v3s.dtsi                     |  2 +-
 drivers/clk/sunxi-ng/ccu-sun8i-v3s.c                           | 10 +++++-----
 include/dt-bindings/clock/sun8i-v3s-ccu.h                      |  2 +-
 6 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/media/allwinner,sun6i-a31-csi.yaml b/Documentation/devicetree/bindings/media/allwinner,sun6i-a31-csi.yaml
index b3d6db922693..1aa5775ba2bc 100644
--- a/Documentation/devicetree/bindings/media/allwinner,sun6i-a31-csi.yaml
+++ b/Documentation/devicetree/bindings/media/allwinner,sun6i-a31-csi.yaml
@@ -110,7 +110,7 @@ examples:
         reg = <0x01cb4000 0x1000>;
         interrupts = <GIC_SPI 84 IRQ_TYPE_LEVEL_HIGH>;
         clocks = <&ccu CLK_BUS_CSI>,
-                 <&ccu CLK_CSI1_SCLK>,
+                 <&ccu CLK_CSI_SCLK>,
                  <&ccu CLK_DRAM_CSI>;
         clock-names = "bus",
                       "mod",
diff --git a/Documentation/devicetree/bindings/media/allwinner,sun6i-a31-isp.yaml b/Documentation/devicetree/bindings/media/allwinner,sun6i-a31-isp.yaml
index a61a76bb611c..3ea4a4290f23 100644
--- a/Documentation/devicetree/bindings/media/allwinner,sun6i-a31-isp.yaml
+++ b/Documentation/devicetree/bindings/media/allwinner,sun6i-a31-isp.yaml
@@ -79,7 +79,7 @@ examples:
         reg = <0x01cb8000 0x1000>;
         interrupts = <GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>;
         clocks = <&ccu CLK_BUS_CSI>,
-             <&ccu CLK_CSI1_SCLK>,
+             <&ccu CLK_CSI_SCLK>,
              <&ccu CLK_DRAM_CSI>;
         clock-names = "bus", "mod", "ram";
         resets = <&ccu RST_BUS_CSI>;
diff --git a/Documentation/devicetree/bindings/media/allwinner,sun6i-a31-mipi-csi2.yaml b/Documentation/devicetree/bindings/media/allwinner,sun6i-a31-mipi-csi2.yaml
index 54e15ab8a7f5..627b28e94354 100644
--- a/Documentation/devicetree/bindings/media/allwinner,sun6i-a31-mipi-csi2.yaml
+++ b/Documentation/devicetree/bindings/media/allwinner,sun6i-a31-mipi-csi2.yaml
@@ -103,7 +103,7 @@ examples:
         reg = <0x01cb1000 0x1000>;
         interrupts = <GIC_SPI 90 IRQ_TYPE_LEVEL_HIGH>;
         clocks = <&ccu CLK_BUS_CSI>,
-                 <&ccu CLK_CSI1_SCLK>;
+                 <&ccu CLK_CSI_SCLK>;
         clock-names = "bus", "mod";
         resets = <&ccu RST_BUS_CSI>;
 
diff --git a/arch/arm/boot/dts/allwinner/sun8i-v3s.dtsi b/arch/arm/boot/dts/allwinner/sun8i-v3s.dtsi
index f909b1d4dbca..e82cf312da25 100644
--- a/arch/arm/boot/dts/allwinner/sun8i-v3s.dtsi
+++ b/arch/arm/boot/dts/allwinner/sun8i-v3s.dtsi
@@ -652,7 +652,7 @@
 			reg = <0x01cb4000 0x3000>;
 			interrupts = <GIC_SPI 84 IRQ_TYPE_LEVEL_HIGH>;
 			clocks = <&ccu CLK_BUS_CSI>,
-				 <&ccu CLK_CSI1_SCLK>,
+				 <&ccu CLK_CSI_SCLK>,
 				 <&ccu CLK_DRAM_CSI>;
 			clock-names = "bus", "mod", "ram";
 			resets = <&ccu RST_BUS_CSI>;
diff --git a/drivers/clk/sunxi-ng/ccu-sun8i-v3s.c b/drivers/clk/sunxi-ng/ccu-sun8i-v3s.c
index 579a81bb46df..d12791b31a9d 100644
--- a/drivers/clk/sunxi-ng/ccu-sun8i-v3s.c
+++ b/drivers/clk/sunxi-ng/ccu-sun8i-v3s.c
@@ -362,8 +362,8 @@ static const char * const csi_mclk_parents[] = { "osc24M", "pll-video",
 static SUNXI_CCU_M_WITH_MUX_GATE(csi0_mclk_clk, "csi0-mclk", csi_mclk_parents,
 				 0x130, 0, 5, 8, 3, BIT(15), 0);
 
-static const char * const csi1_sclk_parents[] = { "pll-video", "pll-isp" };
-static SUNXI_CCU_M_WITH_MUX_GATE(csi1_sclk_clk, "csi-sclk", csi1_sclk_parents,
+static const char * const csi_sclk_parents[] = { "pll-video", "pll-isp" };
+static SUNXI_CCU_M_WITH_MUX_GATE(csi_sclk_clk, "csi-sclk", csi_sclk_parents,
 				 0x134, 16, 4, 24, 3, BIT(31), 0);
 
 static SUNXI_CCU_M_WITH_MUX_GATE(csi1_mclk_clk, "csi-mclk", csi_mclk_parents,
@@ -452,7 +452,7 @@ static struct ccu_common *sun8i_v3s_ccu_clks[] = {
 	&tcon_clk.common,
 	&csi_misc_clk.common,
 	&csi0_mclk_clk.common,
-	&csi1_sclk_clk.common,
+	&csi_sclk_clk.common,
 	&csi1_mclk_clk.common,
 	&ve_clk.common,
 	&ac_dig_clk.common,
@@ -551,7 +551,7 @@ static struct clk_hw_onecell_data sun8i_v3s_hw_clks = {
 		[CLK_TCON0]		= &tcon_clk.common.hw,
 		[CLK_CSI_MISC]		= &csi_misc_clk.common.hw,
 		[CLK_CSI0_MCLK]		= &csi0_mclk_clk.common.hw,
-		[CLK_CSI1_SCLK]		= &csi1_sclk_clk.common.hw,
+		[CLK_CSI_SCLK]		= &csi_sclk_clk.common.hw,
 		[CLK_CSI1_MCLK]		= &csi1_mclk_clk.common.hw,
 		[CLK_VE]		= &ve_clk.common.hw,
 		[CLK_AC_DIG]		= &ac_dig_clk.common.hw,
@@ -633,7 +633,7 @@ static struct clk_hw_onecell_data sun8i_v3_hw_clks = {
 		[CLK_TCON0]		= &tcon_clk.common.hw,
 		[CLK_CSI_MISC]		= &csi_misc_clk.common.hw,
 		[CLK_CSI0_MCLK]		= &csi0_mclk_clk.common.hw,
-		[CLK_CSI1_SCLK]		= &csi1_sclk_clk.common.hw,
+		[CLK_CSI_SCLK]		= &csi_sclk_clk.common.hw,
 		[CLK_CSI1_MCLK]		= &csi1_mclk_clk.common.hw,
 		[CLK_VE]		= &ve_clk.common.hw,
 		[CLK_AC_DIG]		= &ac_dig_clk.common.hw,
diff --git a/include/dt-bindings/clock/sun8i-v3s-ccu.h b/include/dt-bindings/clock/sun8i-v3s-ccu.h
index 014ac6123d17..c4055629c9f9 100644
--- a/include/dt-bindings/clock/sun8i-v3s-ccu.h
+++ b/include/dt-bindings/clock/sun8i-v3s-ccu.h
@@ -96,7 +96,7 @@
 #define CLK_TCON0		64
 #define CLK_CSI_MISC		65
 #define CLK_CSI0_MCLK		66
-#define CLK_CSI1_SCLK		67
+#define CLK_CSI_SCLK		67
 #define CLK_CSI1_MCLK		68
 #define CLK_VE			69
 #define CLK_AC_DIG		70
-- 
cgit v1.2.3


From 59710a26a289ad4e7ef227d22063e964930928b0 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Mon, 30 Jun 2025 15:37:46 -0400
Subject: Bluetooth: hci_core: Remove check of BDADDR_ANY in
 hci_conn_hash_lookup_big_state

The check for destination to be BDADDR_ANY is no longer necessary with
the introduction of BIS_LINK.

Fixes: 23205562ffc8 ("Bluetooth: separate CIS_LINK and BIS_LINK link types")
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 9fc8f544e20e..0da011fc8146 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -1350,8 +1350,7 @@ hci_conn_hash_lookup_big_state(struct hci_dev *hdev, __u8 handle,  __u16 state)
 	rcu_read_lock();
 
 	list_for_each_entry_rcu(c, &h->list, list) {
-		if (c->type != BIS_LINK || bacmp(&c->dst, BDADDR_ANY) ||
-		    c->state != state)
+		if (c->type != BIS_LINK || c->state != state)
 			continue;
 
 		if (handle == c->iso_qos.bcast.big) {
-- 
cgit v1.2.3


From 803f0700a3bbf528c4c624a22f87d08178ca0fbe Mon Sep 17 00:00:00 2001
From: Tao Chen <chen.dylane@linux.dev>
Date: Wed, 2 Jul 2025 23:39:56 +0800
Subject: bpf: Show precise link_type for {uprobe,kprobe}_multi fdinfo

Alexei suggested that the 'link_type' shown in fdinfo could be more
precise, so that humans can tell the variants apart.
BPF_LINK_TYPE_KPROBE_MULTI also covers kretprobe_multi links, and
BPF_LINK_TYPE_UPROBE_MULTI likewise covers uretprobe_multi links, so
show the concrete type instead.

link_type:	kprobe_multi
link_id:	1
prog_tag:	d2b307e915f0dd37
...
link_type:	kretprobe_multi
link_id:	2
prog_tag:	ab9ea0545870781d
...
link_type:	uprobe_multi
link_id:	9
prog_tag:	e729f789e34a8eca
...
link_type:	uretprobe_multi
link_id:	10
prog_tag:	7db356c03e61a4d4

Co-developed-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Tao Chen <chen.dylane@linux.dev>
Link: https://lore.kernel.org/r/20250702153958.639852-1-chen.dylane@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h      |  1 +
 kernel/bpf/syscall.c     |  9 ++++++++-
 kernel/trace/bpf_trace.c | 10 ++++------
 3 files changed, 13 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5dd556e89cce..287c956cdbd2 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1702,6 +1702,7 @@ struct bpf_link {
 	 * link's semantics is determined by target attach hook
 	 */
 	bool sleepable;
+	u32 flags;
 	/* rcu is used before freeing, work can be used to schedule that
 	 * RCU-based freeing before that, so they never overlap
 	 */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 56500381c28a..f1d9ee9717a1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3228,7 +3228,14 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
 	char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
 
 	if (type < ARRAY_SIZE(bpf_link_type_strs) && bpf_link_type_strs[type]) {
-		seq_printf(m, "link_type:\t%s\n", bpf_link_type_strs[type]);
+		if (link->type == BPF_LINK_TYPE_KPROBE_MULTI)
+			seq_printf(m, "link_type:\t%s\n", link->flags == BPF_F_KPROBE_MULTI_RETURN ?
+				   "kretprobe_multi" : "kprobe_multi");
+		else if (link->type == BPF_LINK_TYPE_UPROBE_MULTI)
+			seq_printf(m, "link_type:\t%s\n", link->flags == BPF_F_UPROBE_MULTI_RETURN ?
+				   "uretprobe_multi" : "uprobe_multi");
+		else
+			seq_printf(m, "link_type:\t%s\n", bpf_link_type_strs[type]);
 	} else {
 		WARN_ONCE(1, "missing BPF_LINK_TYPE(...) for link type %u\n", type);
 		seq_printf(m, "link_type:\t<%u>\n", type);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 0a06ea6638fe..81d7a4e5ae15 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2466,7 +2466,6 @@ struct bpf_kprobe_multi_link {
 	u32 cnt;
 	u32 mods_cnt;
 	struct module **mods;
-	u32 flags;
 };
 
 struct bpf_kprobe_multi_run_ctx {
@@ -2586,7 +2585,7 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link,
 
 	kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
 	info->kprobe_multi.count = kmulti_link->cnt;
-	info->kprobe_multi.flags = kmulti_link->flags;
+	info->kprobe_multi.flags = kmulti_link->link.flags;
 	info->kprobe_multi.missed = kmulti_link->fp.nmissed;
 
 	if (!uaddrs)
@@ -2976,7 +2975,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 	link->addrs = addrs;
 	link->cookies = cookies;
 	link->cnt = cnt;
-	link->flags = flags;
+	link->link.flags = flags;
 
 	if (cookies) {
 		/*
@@ -3045,7 +3044,6 @@ struct bpf_uprobe_multi_link {
 	struct path path;
 	struct bpf_link link;
 	u32 cnt;
-	u32 flags;
 	struct bpf_uprobe *uprobes;
 	struct task_struct *task;
 };
@@ -3109,7 +3107,7 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link,
 
 	umulti_link = container_of(link, struct bpf_uprobe_multi_link, link);
 	info->uprobe_multi.count = umulti_link->cnt;
-	info->uprobe_multi.flags = umulti_link->flags;
+	info->uprobe_multi.flags = umulti_link->link.flags;
 	info->uprobe_multi.pid = umulti_link->task ?
 				 task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0;
 
@@ -3369,7 +3367,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 	link->uprobes = uprobes;
 	link->path = path;
 	link->task = task;
-	link->flags = flags;
+	link->link.flags = flags;
 
 	bpf_link_init(&link->link, BPF_LINK_TYPE_UPROBE_MULTI,
 		      &bpf_uprobe_multi_link_lops, prog);
-- 
cgit v1.2.3


From 0426729f46cd1f6354fad07267a21579186a5757 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Thu, 3 Jul 2025 13:48:07 -0700
Subject: bpf: Refactor bprintf buffer support

Refactor code to be able to get and put bprintf buffers and use
bpf_printf_prepare independently. This will be used in the next patch to
implement BPF streams support, particularly as a staging buffer for
strings that need to be formatted and then allocated and pushed into a
stream.

Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20250703204818.925464-2-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  | 15 ++++++++++++++-
 kernel/bpf/helpers.c | 26 +++++++++++---------------
 2 files changed, 25 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 287c956cdbd2..a07451457d9e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -3551,6 +3551,16 @@ bool btf_id_set_contains(const struct btf_id_set *set, u32 id);
 #define MAX_BPRINTF_VARARGS		12
 #define MAX_BPRINTF_BUF			1024
 
+/* Per-cpu temp buffers used by printf-like helpers to store the bprintf binary
+ * arguments representation.
+ */
+#define MAX_BPRINTF_BIN_ARGS	512
+
+struct bpf_bprintf_buffers {
+	char bin_args[MAX_BPRINTF_BIN_ARGS];
+	char buf[MAX_BPRINTF_BUF];
+};
+
 struct bpf_bprintf_data {
 	u32 *bin_args;
 	char *buf;
@@ -3558,9 +3568,12 @@ struct bpf_bprintf_data {
 	bool get_buf;
 };
 
-int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
+int bpf_bprintf_prepare(const char *fmt, u32 fmt_size, const u64 *raw_args,
 			u32 num_args, struct bpf_bprintf_data *data);
 void bpf_bprintf_cleanup(struct bpf_bprintf_data *data);
+int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs);
+void bpf_put_buffers(void);
+
 
 #ifdef CONFIG_BPF_LSM
 void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 5269381d6d3d..da66ce307e75 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -764,22 +764,13 @@ static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype,
 	return -EINVAL;
 }
 
-/* Per-cpu temp buffers used by printf-like helpers to store the bprintf binary
- * arguments representation.
- */
-#define MAX_BPRINTF_BIN_ARGS	512
-
 /* Support executing three nested bprintf helper calls on a given CPU */
 #define MAX_BPRINTF_NEST_LEVEL	3
-struct bpf_bprintf_buffers {
-	char bin_args[MAX_BPRINTF_BIN_ARGS];
-	char buf[MAX_BPRINTF_BUF];
-};
 
 static DEFINE_PER_CPU(struct bpf_bprintf_buffers[MAX_BPRINTF_NEST_LEVEL], bpf_bprintf_bufs);
 static DEFINE_PER_CPU(int, bpf_bprintf_nest_level);
 
-static int try_get_buffers(struct bpf_bprintf_buffers **bufs)
+int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs)
 {
 	int nest_level;
 
@@ -795,16 +786,21 @@ static int try_get_buffers(struct bpf_bprintf_buffers **bufs)
 	return 0;
 }
 
-void bpf_bprintf_cleanup(struct bpf_bprintf_data *data)
+void bpf_put_buffers(void)
 {
-	if (!data->bin_args && !data->buf)
-		return;
 	if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0))
 		return;
 	this_cpu_dec(bpf_bprintf_nest_level);
 	preempt_enable();
 }
 
+void bpf_bprintf_cleanup(struct bpf_bprintf_data *data)
+{
+	if (!data->bin_args && !data->buf)
+		return;
+	bpf_put_buffers();
+}
+
 /*
  * bpf_bprintf_prepare - Generic pass on format strings for bprintf-like helpers
  *
@@ -819,7 +815,7 @@ void bpf_bprintf_cleanup(struct bpf_bprintf_data *data)
  * In argument preparation mode, if 0 is returned, safe temporary buffers are
  * allocated and bpf_bprintf_cleanup should be called to free them after use.
  */
-int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
+int bpf_bprintf_prepare(const char *fmt, u32 fmt_size, const u64 *raw_args,
 			u32 num_args, struct bpf_bprintf_data *data)
 {
 	bool get_buffers = (data->get_bin_args && num_args) || data->get_buf;
@@ -835,7 +831,7 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
 		return -EINVAL;
 	fmt_size = fmt_end - fmt;
 
-	if (get_buffers && try_get_buffers(&buffers))
+	if (get_buffers && bpf_try_get_buffers(&buffers))
 		return -EBUSY;
 
 	if (data->get_bin_args) {
-- 
cgit v1.2.3


From 5ab154f1463a111e1dc8fd5d31eaa7a2a71fe2e6 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Thu, 3 Jul 2025 13:48:08 -0700
Subject: bpf: Introduce BPF standard streams

Add support for a stream API to the kernel and expose related kfuncs to
BPF programs. Two streams are exposed, BPF_STDOUT and BPF_STDERR. These
can be used for printing messages that can be consumed from user space,
thus it's similar in spirit to existing trace_pipe interface.

The kernel will use the BPF_STDERR stream to notify the program of any
errors encountered at runtime. BPF programs themselves may use both
streams for writing debug messages. BPF library-like code may use
BPF_STDERR to print warnings or errors on misuse at runtime.

The implementation of a stream is as follows. Every time a message is
emitted from the kernel (directly, or through a BPF program), a record
is allocated by bump allocating from per-cpu region backed by a page
obtained using alloc_pages_nolock(). This ensures that we can allocate
memory from any context. The eventual plan is to discard this scheme in
favor of Alexei's kmalloc_nolock() [0].

This record is then locklessly inserted into a list (llist_add()) so
that the printing side doesn't require holding any locks, and works in
any context. Each stream has a maximum capacity of 100,000 bytes of text
(BPF_STREAM_MAX_CAPACITY), and each printed message is accounted against
this limit.

Messages from a program are emitted using the bpf_stream_vprintk kfunc,
which takes a stream_id argument in addition to working otherwise
similar to bpf_trace_vprintk.

The bprintf buffer helpers are extracted out to be reused for printing
the string into them before copying it into the stream, so that we can
(with the defined max limit) format a string and know its true length
before performing allocations of the stream element.

For consuming elements from a stream, we expose a bpf(2) syscall command
named BPF_PROG_STREAM_READ_BY_FD, which allows reading data from the
stream of a given prog_fd into a user space buffer. The main logic is
implemented in bpf_stream_read(). The log messages are queued in
bpf_stream::log by the bpf_stream_vprintk kfunc, and then pulled and
ordered correctly in the stream backlog.

For this purpose, we hold a lock around bpf_stream_backlog_peek(), as
llist_del_first() (if we maintained a second lockless list for the
backlog) wouldn't be safe from multiple threads anyway. Then, if we
fail to find something in the backlog log, we splice out everything from
the lockless log, and place it in the backlog log, and then return the
head of the backlog. Once the full length of the element is consumed, we
will pop it and free it.

The lockless list bpf_stream::log is a LIFO stack. Elements obtained
using a llist_del_all() operation are in LIFO order, thus would break
the chronological ordering if printed directly. Hence, this batch of
messages is first reversed. Then, it is stashed into a separate list in
the stream, i.e. the backlog_log. The head of this list is the actual
message that should always be returned to the caller. All of this is
done in bpf_stream_backlog_fill().

From the kernel side, the writing into the stream will be a bit more
involved than the typical printk. First, the kernel typically may print
a collection of messages into the stream, and parallel writers into the
stream may suffer from interleaving of messages. To ensure each group of
messages is visible atomically, we can take advantage of using a
lockless list for pushing in messages.

To enable this, we add a bpf_stream_stage() macro, and require kernel
users to use bpf_stream_printk statements for the passed expression to
write into the stream. Underneath the macro, we have a message staging
API, where a bpf_stream_stage object on the stack accumulates the
messages being printed into a local llist_head, and then a commit
operation splices the whole batch into the stream's lockless log list.

This is especially pertinent for rqspinlock deadlock messages printed to
program streams. After this change, we see each deadlock invocation as a
non-interleaving contiguous message without any confusion on the
reader's part, improving their user experience in debugging the fault.

While programs cannot benefit from this staged stream writing API, they
could just as well hold an rqspinlock around their print statements to
serialize messages, hence this is kept kernel-internal for now.

Overall, this infrastructure provides NMI-safe any context printing of
messages to two dedicated streams.

Later patches will add support for printing splats in case of BPF arena
page faults, rqspinlock deadlocks, and cond_break timeouts, and
integration of this facility into bpftool for dumping messages to user
space.

  [0]: https://lore.kernel.org/bpf/20250501032718.65476-1-alexei.starovoitov@gmail.com

Reviewed-by: Eduard Zingerman <eddyz87@gmail.com>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20250703204818.925464-3-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h            |  52 +++++
 include/uapi/linux/bpf.h       |  24 +++
 kernel/bpf/Makefile            |   2 +-
 kernel/bpf/core.c              |   5 +
 kernel/bpf/helpers.c           |   1 +
 kernel/bpf/stream.c            | 478 +++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c           |  25 +++
 kernel/bpf/verifier.c          |   1 +
 tools/include/uapi/linux/bpf.h |  24 +++
 9 files changed, 611 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/stream.c

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a07451457d9e..f61aeccb23c3 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1538,6 +1538,37 @@ struct btf_mod_pair {
 
 struct bpf_kfunc_desc_tab;
 
+enum bpf_stream_id {
+	BPF_STDOUT = 1,
+	BPF_STDERR = 2,
+};
+
+struct bpf_stream_elem {
+	struct llist_node node;
+	int total_len;
+	int consumed_len;
+	char str[];
+};
+
+enum {
+	/* 100k bytes */
+	BPF_STREAM_MAX_CAPACITY = 100000ULL,
+};
+
+struct bpf_stream {
+	atomic_t capacity;
+	struct llist_head log;	/* list of in-flight stream elements in LIFO order */
+
+	struct mutex lock;  /* lock protecting backlog_{head,tail} */
+	struct llist_node *backlog_head; /* list of in-flight stream elements in FIFO order */
+	struct llist_node *backlog_tail; /* tail of the list above */
+};
+
+struct bpf_stream_stage {
+	struct llist_head log;
+	int len;
+};
+
 struct bpf_prog_aux {
 	atomic64_t refcnt;
 	u32 used_map_cnt;
@@ -1646,6 +1677,7 @@ struct bpf_prog_aux {
 		struct work_struct work;
 		struct rcu_head	rcu;
 	};
+	struct bpf_stream stream[2];
 };
 
 struct bpf_prog {
@@ -2409,6 +2441,7 @@ int  generic_map_delete_batch(struct bpf_map *map,
 struct bpf_map *bpf_map_get_curr_or_next(u32 *id);
 struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id);
 
+
 int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
 			unsigned long nr_pages, struct page **page_array);
 #ifdef CONFIG_MEMCG
@@ -3574,6 +3607,25 @@ void bpf_bprintf_cleanup(struct bpf_bprintf_data *data);
 int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs);
 void bpf_put_buffers(void);
 
+void bpf_prog_stream_init(struct bpf_prog *prog);
+void bpf_prog_stream_free(struct bpf_prog *prog);
+int bpf_prog_stream_read(struct bpf_prog *prog, enum bpf_stream_id stream_id, void __user *buf, int len);
+void bpf_stream_stage_init(struct bpf_stream_stage *ss);
+void bpf_stream_stage_free(struct bpf_stream_stage *ss);
+__printf(2, 3)
+int bpf_stream_stage_printk(struct bpf_stream_stage *ss, const char *fmt, ...);
+int bpf_stream_stage_commit(struct bpf_stream_stage *ss, struct bpf_prog *prog,
+			    enum bpf_stream_id stream_id);
+
+#define bpf_stream_printk(ss, ...) bpf_stream_stage_printk(&ss, __VA_ARGS__)
+
+#define bpf_stream_stage(ss, prog, stream_id, expr)            \
+	({                                                     \
+		bpf_stream_stage_init(&ss);                    \
+		(expr);                                        \
+		bpf_stream_stage_commit(&ss, prog, stream_id); \
+		bpf_stream_stage_free(&ss);                    \
+	})
 
 #ifdef CONFIG_BPF_LSM
 void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 719ba230032f..0670e15a6100 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -906,6 +906,17 @@ union bpf_iter_link_info {
  *		A new file descriptor (a nonnegative integer), or -1 if an
  *		error occurred (in which case, *errno* is set appropriately).
  *
+ * BPF_PROG_STREAM_READ_BY_FD
+ *	Description
+ *		Read data of a program's BPF stream. The program is identified
+ *		by *prog_fd*, and the stream is identified by the *stream_id*.
+ *		The data is copied to a buffer pointed to by *stream_buf*, and
+ *		filled less than or equal to *stream_buf_len* bytes.
+ *
+ *	Return
+ *		Number of bytes read from the stream on success, or -1 if an
+ *		error occurred (in which case, *errno* is set appropriately).
+ *
  * NOTES
  *	eBPF objects (maps and programs) can be shared between processes.
  *
@@ -961,6 +972,7 @@ enum bpf_cmd {
 	BPF_LINK_DETACH,
 	BPF_PROG_BIND_MAP,
 	BPF_TOKEN_CREATE,
+	BPF_PROG_STREAM_READ_BY_FD,
 	__MAX_BPF_CMD,
 };
 
@@ -1463,6 +1475,11 @@ struct bpf_stack_build_id {
 
 #define BPF_OBJ_NAME_LEN 16U
 
+enum {
+	BPF_STREAM_STDOUT = 1,
+	BPF_STREAM_STDERR = 2,
+};
+
 union bpf_attr {
 	struct { /* anonymous struct used by BPF_MAP_CREATE command */
 		__u32	map_type;	/* one of enum bpf_map_type */
@@ -1849,6 +1866,13 @@ union bpf_attr {
 		__u32		bpffs_fd;
 	} token_create;
 
+	struct {
+		__aligned_u64	stream_buf;
+		__u32		stream_buf_len;
+		__u32		stream_id;
+		__u32		prog_fd;
+	} prog_stream_read;
+
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 3a335c50e6e3..269c04a24664 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -14,7 +14,7 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
 obj-${CONFIG_BPF_LSM}	  += bpf_inode_storage.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o
 obj-$(CONFIG_BPF_JIT) += trampoline.o
-obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o rqspinlock.o
+obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o rqspinlock.o stream.o
 ifeq ($(CONFIG_MMU)$(CONFIG_64BIT),yy)
 obj-$(CONFIG_BPF_SYSCALL) += arena.o range_tree.o
 endif
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index e536a34a32c8..f0def24573ae 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -134,6 +134,10 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
 	mutex_init(&fp->aux->ext_mutex);
 	mutex_init(&fp->aux->dst_mutex);
 
+#ifdef CONFIG_BPF_SYSCALL
+	bpf_prog_stream_init(fp);
+#endif
+
 	return fp;
 }
 
@@ -2862,6 +2866,7 @@ static void bpf_prog_free_deferred(struct work_struct *work)
 	aux = container_of(work, struct bpf_prog_aux, work);
 #ifdef CONFIG_BPF_SYSCALL
 	bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab);
+	bpf_prog_stream_free(aux->prog);
 #endif
 #ifdef CONFIG_CGROUP_BPF
 	if (aux->cgroup_atype != CGROUP_BPF_ATTACH_TYPE_INVALID)
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index da66ce307e75..6bfcbcdf6588 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -3825,6 +3825,7 @@ BTF_ID_FLAGS(func, bpf_strnstr);
 #if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS)
 BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU)
 #endif
+BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_TRUSTED_ARGS)
 BTF_KFUNCS_END(common_btf_ids)
 
 static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c
new file mode 100644
index 000000000000..e434541358db
--- /dev/null
+++ b/kernel/bpf/stream.c
@@ -0,0 +1,478 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+
+#include <linux/bpf.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/percpu.h>
+#include <linux/refcount.h>
+#include <linux/gfp.h>
+#include <linux/memory.h>
+#include <linux/local_lock.h>
+#include <linux/mutex.h>
+
+/*
+ * Simple per-CPU NMI-safe bump allocation mechanism, backed by the NMI-safe
+ * alloc_pages_nolock()/free_pages_nolock() primitives. We allocate a page and
+ * stash it in a local per-CPU variable, and bump allocate from the page
+ * whenever items need to be printed to a stream. Each page holds a global
+ * atomic refcount in its first 4 bytes, and then records of variable length
+ * that describe the printed messages. Once the global refcount has dropped to
+ * zero, it is a signal to free the page back to the kernel's page allocator,
+ * given all the individual records in it have been consumed.
+ *
+ * It is possible the same page is used to serve allocations across different
+ * programs, which may be consumed at different times individually, hence
+ * maintaining a reference count per-page is critical for correct lifetime
+ * tracking.
+ *
+ * The bpf_stream_page code will be replaced to use kmalloc_nolock() once it
+ * lands.
+ */
+struct bpf_stream_page {
+	refcount_t ref;
+	u32 consumed;
+	char buf[];
+};
+
+/* Available room to add data to a refcounted page. */
+#define BPF_STREAM_PAGE_SZ (PAGE_SIZE - offsetofend(struct bpf_stream_page, consumed))
+
+static DEFINE_PER_CPU(local_trylock_t, stream_local_lock) = INIT_LOCAL_TRYLOCK(stream_local_lock);
+static DEFINE_PER_CPU(struct bpf_stream_page *, stream_pcpu_page);
+
+static bool bpf_stream_page_local_lock(unsigned long *flags)
+{
+	return local_trylock_irqsave(&stream_local_lock, *flags);
+}
+
+static void bpf_stream_page_local_unlock(unsigned long *flags)
+{
+	local_unlock_irqrestore(&stream_local_lock, *flags);
+}
+
+static void bpf_stream_page_free(struct bpf_stream_page *stream_page)
+{
+	struct page *p;
+
+	if (!stream_page)
+		return;
+	p = virt_to_page(stream_page);
+	free_pages_nolock(p, 0);
+}
+
+static void bpf_stream_page_get(struct bpf_stream_page *stream_page)
+{
+	refcount_inc(&stream_page->ref);
+}
+
+static void bpf_stream_page_put(struct bpf_stream_page *stream_page)
+{
+	if (refcount_dec_and_test(&stream_page->ref))
+		bpf_stream_page_free(stream_page);
+}
+
+static void bpf_stream_page_init(struct bpf_stream_page *stream_page)
+{
+	refcount_set(&stream_page->ref, 1);
+	stream_page->consumed = 0;
+}
+
+static struct bpf_stream_page *bpf_stream_page_replace(void)
+{
+	struct bpf_stream_page *stream_page, *old_stream_page;
+	struct page *page;
+
+	page = alloc_pages_nolock(NUMA_NO_NODE, 0);
+	if (!page)
+		return NULL;
+	stream_page = page_address(page);
+	bpf_stream_page_init(stream_page);
+
+	old_stream_page = this_cpu_read(stream_pcpu_page);
+	if (old_stream_page)
+		bpf_stream_page_put(old_stream_page);
+	this_cpu_write(stream_pcpu_page, stream_page);
+	return stream_page;
+}
+
+static int bpf_stream_page_check_room(struct bpf_stream_page *stream_page, int len)
+{
+	int min = offsetof(struct bpf_stream_elem, str[0]);
+	int consumed = stream_page->consumed;
+	int total = BPF_STREAM_PAGE_SZ;
+	int rem = max(0, total - consumed - min);
+
+	/* Let's give room of at least 8 bytes. */
+	WARN_ON_ONCE(rem % 8 != 0);
+	rem = rem < 8 ? 0 : rem;
+	return min(len, rem);
+}
+
+static void bpf_stream_elem_init(struct bpf_stream_elem *elem, int len)
+{
+	init_llist_node(&elem->node);
+	elem->total_len = len;
+	elem->consumed_len = 0;
+}
+
+static struct bpf_stream_page *bpf_stream_page_from_elem(struct bpf_stream_elem *elem)
+{
+	unsigned long addr = (unsigned long)elem;
+
+	return (struct bpf_stream_page *)PAGE_ALIGN_DOWN(addr);
+}
+
+static struct bpf_stream_elem *bpf_stream_page_push_elem(struct bpf_stream_page *stream_page, int len)
+{
+	u32 consumed = stream_page->consumed;
+
+	stream_page->consumed += round_up(offsetof(struct bpf_stream_elem, str[len]), 8);
+	return (struct bpf_stream_elem *)&stream_page->buf[consumed];
+}
+
+static struct bpf_stream_elem *bpf_stream_page_reserve_elem(int len)
+{
+	struct bpf_stream_elem *elem = NULL;
+	struct bpf_stream_page *page;
+	int room = 0;
+
+	page = this_cpu_read(stream_pcpu_page);
+	if (!page)
+		page = bpf_stream_page_replace();
+	if (!page)
+		return NULL;
+
+	room = bpf_stream_page_check_room(page, len);
+	if (room != len)
+		page = bpf_stream_page_replace();
+	if (!page)
+		return NULL;
+	bpf_stream_page_get(page);
+	room = bpf_stream_page_check_room(page, len);
+	WARN_ON_ONCE(room != len);
+
+	elem = bpf_stream_page_push_elem(page, room);
+	bpf_stream_elem_init(elem, room);
+	return elem;
+}
+
+static struct bpf_stream_elem *bpf_stream_elem_alloc(int len)
+{
+	const int max_len = ARRAY_SIZE((struct bpf_bprintf_buffers){}.buf);
+	struct bpf_stream_elem *elem;
+	unsigned long flags;
+
+	BUILD_BUG_ON(max_len > BPF_STREAM_PAGE_SZ);
+	/*
+	 * Length denotes the amount of data to be written as part of stream element,
+	 * thus includes '\0' byte. We're capped by how much bpf_bprintf_buffers can
+	 * accommodate, therefore deny allocations that won't fit into them.
+	 */
+	if (len < 0 || len > max_len)
+		return NULL;
+
+	if (!bpf_stream_page_local_lock(&flags))
+		return NULL;
+	elem = bpf_stream_page_reserve_elem(len);
+	bpf_stream_page_local_unlock(&flags);
+	return elem;
+}
+
+static int __bpf_stream_push_str(struct llist_head *log, const char *str, int len)
+{
+	struct bpf_stream_elem *elem = NULL;
+
+	/*
+	 * Allocate a bpf_stream_elem and push it to the bpf_stream
+	 * log, elements will be popped at once and reversed to print the log.
+	 */
+	elem = bpf_stream_elem_alloc(len);
+	if (!elem)
+		return -ENOMEM;
+
+	memcpy(elem->str, str, len);
+	llist_add(&elem->node, log);
+
+	return 0;
+}
+
+static int bpf_stream_consume_capacity(struct bpf_stream *stream, int len)
+{
+	if (atomic_read(&stream->capacity) >= BPF_STREAM_MAX_CAPACITY)
+		return -ENOSPC;
+	if (atomic_add_return(len, &stream->capacity) >= BPF_STREAM_MAX_CAPACITY) {
+		atomic_sub(len, &stream->capacity);
+		return -ENOSPC;
+	}
+	return 0;
+}
+
+static void bpf_stream_release_capacity(struct bpf_stream *stream, struct bpf_stream_elem *elem)
+{
+	int len = elem->total_len;
+
+	atomic_sub(len, &stream->capacity);
+}
+
+static int bpf_stream_push_str(struct bpf_stream *stream, const char *str, int len)
+{
+	int ret = bpf_stream_consume_capacity(stream, len);
+
+	return ret ?: __bpf_stream_push_str(&stream->log, str, len);
+}
+
+static struct bpf_stream *bpf_stream_get(enum bpf_stream_id stream_id, struct bpf_prog_aux *aux)
+{
+	if (stream_id != BPF_STDOUT && stream_id != BPF_STDERR)
+		return NULL;
+	return &aux->stream[stream_id - 1];
+}
+
+static void bpf_stream_free_elem(struct bpf_stream_elem *elem)
+{
+	struct bpf_stream_page *p;
+
+	p = bpf_stream_page_from_elem(elem);
+	bpf_stream_page_put(p);
+}
+
+static void bpf_stream_free_list(struct llist_node *list)
+{
+	struct bpf_stream_elem *elem, *tmp;
+
+	llist_for_each_entry_safe(elem, tmp, list, node)
+		bpf_stream_free_elem(elem);
+}
+
+static struct llist_node *bpf_stream_backlog_peek(struct bpf_stream *stream)
+{
+	return stream->backlog_head;
+}
+
+static struct llist_node *bpf_stream_backlog_pop(struct bpf_stream *stream)
+{
+	struct llist_node *node;
+
+	node = stream->backlog_head;
+	if (stream->backlog_head == stream->backlog_tail)
+		stream->backlog_head = stream->backlog_tail = NULL;
+	else
+		stream->backlog_head = node->next;
+	return node;
+}
+
+static void bpf_stream_backlog_fill(struct bpf_stream *stream)
+{
+	struct llist_node *head, *tail;
+
+	if (llist_empty(&stream->log))
+		return;
+	tail = llist_del_all(&stream->log);
+	if (!tail)
+		return;
+	head = llist_reverse_order(tail);
+
+	if (!stream->backlog_head) {
+		stream->backlog_head = head;
+		stream->backlog_tail = tail;
+	} else {
+		stream->backlog_tail->next = head;
+		stream->backlog_tail = tail;
+	}
+
+	return;
+}
+
+static bool bpf_stream_consume_elem(struct bpf_stream_elem *elem, int *len)
+{
+	int rem = elem->total_len - elem->consumed_len;
+	int used = min(rem, *len);
+
+	elem->consumed_len += used;
+	*len -= used;
+
+	return elem->consumed_len == elem->total_len;
+}
+
+static int bpf_stream_read(struct bpf_stream *stream, void __user *buf, int len)
+{
+	int rem_len = len, cons_len, ret = 0;
+	struct bpf_stream_elem *elem = NULL;
+	struct llist_node *node;
+
+	mutex_lock(&stream->lock);
+
+	while (rem_len) {
+		int pos = len - rem_len;
+		bool cont;
+
+		node = bpf_stream_backlog_peek(stream);
+		if (!node) {
+			bpf_stream_backlog_fill(stream);
+			node = bpf_stream_backlog_peek(stream);
+		}
+		if (!node)
+			break;
+		elem = container_of(node, typeof(*elem), node);
+
+		cons_len = elem->consumed_len;
+		cont = bpf_stream_consume_elem(elem, &rem_len) == false;
+
+		ret = copy_to_user(buf + pos, elem->str + cons_len,
+				   elem->consumed_len - cons_len);
+		/* Restore in case of error. */
+		if (ret) {
+			ret = -EFAULT;
+			elem->consumed_len = cons_len;
+			break;
+		}
+
+		if (cont)
+			continue;
+		bpf_stream_backlog_pop(stream);
+		bpf_stream_release_capacity(stream, elem);
+		bpf_stream_free_elem(elem);
+	}
+
+	mutex_unlock(&stream->lock);
+	return ret ? ret : len - rem_len;
+}
+
+int bpf_prog_stream_read(struct bpf_prog *prog, enum bpf_stream_id stream_id, void __user *buf, int len)
+{
+	struct bpf_stream *stream;
+
+	stream = bpf_stream_get(stream_id, prog->aux);
+	if (!stream)
+		return -ENOENT;
+	return bpf_stream_read(stream, buf, len);
+}
+
+__bpf_kfunc_start_defs();
+
+/*
+ * Avoid using enum bpf_stream_id so that kfunc users don't have to pull in the
+ * enum in headers.
+ */
+__bpf_kfunc int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args, u32 len__sz, void *aux__prog)
+{
+	struct bpf_bprintf_data data = {
+		.get_bin_args	= true,
+		.get_buf	= true,
+	};
+	struct bpf_prog_aux *aux = aux__prog;
+	u32 fmt_size = strlen(fmt__str) + 1;
+	struct bpf_stream *stream;
+	u32 data_len = len__sz;
+	int ret, num_args;
+
+	stream = bpf_stream_get(stream_id, aux);
+	if (!stream)
+		return -ENOENT;
+
+	if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 ||
+	    (data_len && !args))
+		return -EINVAL;
+	num_args = data_len / 8;
+
+	ret = bpf_bprintf_prepare(fmt__str, fmt_size, args, num_args, &data);
+	if (ret < 0)
+		return ret;
+
+	ret = bstr_printf(data.buf, MAX_BPRINTF_BUF, fmt__str, data.bin_args);
+	/* Exclude NULL byte during push. */
+	ret = bpf_stream_push_str(stream, data.buf, ret);
+	bpf_bprintf_cleanup(&data);
+
+	return ret;
+}
+
+__bpf_kfunc_end_defs();
+
+/* Added kfunc to common_btf_ids */
+
+void bpf_prog_stream_init(struct bpf_prog *prog)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(prog->aux->stream); i++) {
+		atomic_set(&prog->aux->stream[i].capacity, 0);
+		init_llist_head(&prog->aux->stream[i].log);
+		mutex_init(&prog->aux->stream[i].lock);
+		prog->aux->stream[i].backlog_head = NULL;
+		prog->aux->stream[i].backlog_tail = NULL;
+	}
+}
+
+void bpf_prog_stream_free(struct bpf_prog *prog)
+{
+	struct llist_node *list;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(prog->aux->stream); i++) {
+		list = llist_del_all(&prog->aux->stream[i].log);
+		bpf_stream_free_list(list);
+		bpf_stream_free_list(prog->aux->stream[i].backlog_head);
+	}
+}
+
+void bpf_stream_stage_init(struct bpf_stream_stage *ss)
+{
+	init_llist_head(&ss->log);
+	ss->len = 0;
+}
+
+void bpf_stream_stage_free(struct bpf_stream_stage *ss)
+{
+	struct llist_node *node;
+
+	node = llist_del_all(&ss->log);
+	bpf_stream_free_list(node);
+}
+
+int bpf_stream_stage_printk(struct bpf_stream_stage *ss, const char *fmt, ...)
+{
+	struct bpf_bprintf_buffers *buf;
+	va_list args;
+	int ret;
+
+	if (bpf_try_get_buffers(&buf))
+		return -EBUSY;
+
+	va_start(args, fmt);
+	ret = vsnprintf(buf->buf, ARRAY_SIZE(buf->buf), fmt, args);
+	va_end(args);
+	ss->len += ret;
+	/* Exclude NULL byte during push. */
+	ret = __bpf_stream_push_str(&ss->log, buf->buf, ret);
+	bpf_put_buffers();
+	return ret;
+}
+
+int bpf_stream_stage_commit(struct bpf_stream_stage *ss, struct bpf_prog *prog,
+			    enum bpf_stream_id stream_id)
+{
+	struct llist_node *list, *head, *tail;
+	struct bpf_stream *stream;
+	int ret;
+
+	stream = bpf_stream_get(stream_id, prog->aux);
+	if (!stream)
+		return -EINVAL;
+
+	ret = bpf_stream_consume_capacity(stream, ss->len);
+	if (ret)
+		return ret;
+
+	list = llist_del_all(&ss->log);
+	head = tail = list;
+
+	if (!list)
+		return 0;
+	while (llist_next(list)) {
+		tail = llist_next(list);
+		list = tail;
+	}
+	llist_add_batch(head, tail, &stream->log);
+	return 0;
+}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f1d9ee9717a1..7db7182a3057 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -5943,6 +5943,28 @@ static int token_create(union bpf_attr *attr)
 	return bpf_token_create(attr);
 }
 
+#define BPF_PROG_STREAM_READ_BY_FD_LAST_FIELD prog_stream_read.prog_fd
+
+static int prog_stream_read(union bpf_attr *attr)
+{
+	char __user *buf = u64_to_user_ptr(attr->prog_stream_read.stream_buf);
+	u32 len = attr->prog_stream_read.stream_buf_len;
+	struct bpf_prog *prog;
+	int ret;
+
+	if (CHECK_ATTR(BPF_PROG_STREAM_READ_BY_FD))
+		return -EINVAL;
+
+	prog = bpf_prog_get(attr->prog_stream_read.prog_fd);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	ret = bpf_prog_stream_read(prog, attr->prog_stream_read.stream_id, buf, len);
+	bpf_prog_put(prog);
+
+	return ret;
+}
+
 static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
 {
 	union bpf_attr attr;
@@ -6079,6 +6101,9 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
 	case BPF_TOKEN_CREATE:
 		err = token_create(&attr);
 		break;
+	case BPF_PROG_STREAM_READ_BY_FD:
+		err = prog_stream_read(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 52e36fd23f40..9f09dcd2eabb 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -46,6 +46,7 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
 
 enum bpf_features {
 	BPF_FEAT_RDONLY_CAST_TO_VOID = 0,
+	BPF_FEAT_STREAMS	     = 1,
 	__MAX_BPF_FEAT,
 };
 
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 719ba230032f..0670e15a6100 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -906,6 +906,17 @@ union bpf_iter_link_info {
  *		A new file descriptor (a nonnegative integer), or -1 if an
  *		error occurred (in which case, *errno* is set appropriately).
  *
+ * BPF_PROG_STREAM_READ_BY_FD
+ *	Description
+ *		Read data of a program's BPF stream. The program is identified
+ *		by *prog_fd*, and the stream is identified by the *stream_id*.
+ *		The data is copied to a buffer pointed to by *stream_buf*, and
+ *		filled less than or equal to *stream_buf_len* bytes.
+ *
+ *	Return
+ *		Number of bytes read from the stream on success, or -1 if an
+ *		error occurred (in which case, *errno* is set appropriately).
+ *
  * NOTES
  *	eBPF objects (maps and programs) can be shared between processes.
  *
@@ -961,6 +972,7 @@ enum bpf_cmd {
 	BPF_LINK_DETACH,
 	BPF_PROG_BIND_MAP,
 	BPF_TOKEN_CREATE,
+	BPF_PROG_STREAM_READ_BY_FD,
 	__MAX_BPF_CMD,
 };
 
@@ -1463,6 +1475,11 @@ struct bpf_stack_build_id {
 
 #define BPF_OBJ_NAME_LEN 16U
 
+enum {
+	BPF_STREAM_STDOUT = 1,
+	BPF_STREAM_STDERR = 2,
+};
+
 union bpf_attr {
 	struct { /* anonymous struct used by BPF_MAP_CREATE command */
 		__u32	map_type;	/* one of enum bpf_map_type */
@@ -1849,6 +1866,13 @@ union bpf_attr {
 		__u32		bpffs_fd;
 	} token_create;
 
+	struct {
+		__aligned_u64	stream_buf;
+		__u32		stream_buf_len;
+		__u32		stream_id;
+		__u32		prog_fd;
+	} prog_stream_read;
+
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
-- 
cgit v1.2.3


From 0e521efaf36350b8f783984541efa56f560c90b0 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Thu, 3 Jul 2025 13:48:09 -0700
Subject: bpf: Add function to extract program source info

Prepare a function for use in future patches that can extract the file
info, line info, and the source line number for a given BPF program
provided its program counter.

Only the basename of the file path is provided, given it can be
excessively long in some cases.

This will be used in later patches to print source info to the BPF
stream.

Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20250703204818.925464-4-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h |  3 +++
 kernel/bpf/core.c   | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f61aeccb23c3..c3802af11bac 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -3661,4 +3661,7 @@ static inline bool bpf_is_subprog(const struct bpf_prog *prog)
 	return prog->aux->func_idx != 0;
 }
 
+int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char **filep,
+			   const char **linep, int *nump);
+
 #endif /* _LINUX_BPF_H */
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index f0def24573ae..2dc5b846ae50 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -3213,3 +3213,50 @@ EXPORT_SYMBOL(bpf_stats_enabled_key);
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
 EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx);
+
+#ifdef CONFIG_BPF_SYSCALL
+
+int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char **filep,
+			   const char **linep, int *nump)
+{
+	int idx = -1, insn_start, insn_end, len;
+	struct bpf_line_info *linfo;
+	void **jited_linfo;
+	struct btf *btf;
+
+	btf = prog->aux->btf;
+	linfo = prog->aux->linfo;
+	jited_linfo = prog->aux->jited_linfo;
+
+	if (!btf || !linfo || !jited_linfo)
+		return -EINVAL;
+	len = prog->aux->func ? prog->aux->func[prog->aux->func_idx]->len : prog->len;
+
+	linfo = &prog->aux->linfo[prog->aux->linfo_idx];
+	jited_linfo = &prog->aux->jited_linfo[prog->aux->linfo_idx];
+
+	insn_start = linfo[0].insn_off;
+	insn_end = insn_start + len;
+
+	for (int i = 0; i < prog->aux->nr_linfo &&
+	     linfo[i].insn_off >= insn_start && linfo[i].insn_off < insn_end; i++) {
+		if (jited_linfo[i] >= (void *)ip)
+			break;
+		idx = i;
+	}
+
+	if (idx == -1)
+		return -ENOENT;
+
+	/* Get base component of the file path. */
+	*filep = btf_name_by_offset(btf, linfo[idx].file_name_off);
+	*filep = kbasename(*filep);
+	/* Obtain the source line, and strip whitespace in prefix. */
+	*linep = btf_name_by_offset(btf, linfo[idx].line_off);
+	while (isspace(**linep))
+		*linep += 1;
+	*nump = BPF_LINE_INFO_LINE_NUM(linfo[idx].line_col);
+	return 0;
+}
+
+#endif
-- 
cgit v1.2.3


From f0c53fd4a742f957da7077a691a85ef9775907dc Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Thu, 3 Jul 2025 13:48:11 -0700
Subject: bpf: Add function to find program from stack trace

In preparation of figuring out the closest program that led to the
current point in the kernel, implement a function that scans through the
stack trace and finds out the closest BPF program when walking down the
stack trace.

Special care needs to be taken to skip over kernel and BPF subprog
frames. We basically scan until we find a BPF main prog frame. The
assumption is that if a program calls into us transitively, we'll
hit it along the way. If not, we end up returning NULL.

Contextually the function will be used in places where we know the
program may have called into us.

Due to reliance on arch_bpf_stack_walk(), this function only works on
x86 with CONFIG_UNWINDER_ORC, arm64, and s390. Remove the warning from
arch_bpf_stack_walk as well since we call it outside bpf_throw()
context.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20250703204818.925464-6-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/bpf_jit_comp.c |  1 -
 include/linux/bpf.h         |  1 +
 kernel/bpf/core.c           | 33 +++++++++++++++++++++++++++++++++
 3 files changed, 34 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 15672cb926fc..40e1b3b9634f 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -3845,7 +3845,6 @@ void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp
 	}
 	return;
 #endif
-	WARN(1, "verification of programs using bpf_throw should have failed\n");
 }
 
 void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c3802af11bac..b267c378d884 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -3663,5 +3663,6 @@ static inline bool bpf_is_subprog(const struct bpf_prog *prog)
 
 int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char **filep,
 			   const char **linep, int *nump);
+struct bpf_prog *bpf_prog_find_from_stack(void);
 
 #endif /* _LINUX_BPF_H */
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 833442661742..037d67cf5fb1 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -3262,4 +3262,37 @@ int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char *
 	return 0;
 }
 
+struct walk_stack_ctx {
+	struct bpf_prog *prog;
+};
+
+static bool find_from_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp)
+{
+	struct walk_stack_ctx *ctxp = cookie;
+	struct bpf_prog *prog;
+
+	/*
+	 * The RCU read lock is held to safely traverse the latch tree, but we
+	 * don't need its protection when accessing the prog, since it has an
+	 * active stack frame on the current stack trace, and won't disappear.
+	 */
+	rcu_read_lock();
+	prog = bpf_prog_ksym_find(ip);
+	rcu_read_unlock();
+	if (!prog)
+		return true;
+	if (bpf_is_subprog(prog))
+		return true;
+	ctxp->prog = prog;
+	return false;
+}
+
+struct bpf_prog *bpf_prog_find_from_stack(void)
+{
+	struct walk_stack_ctx ctx = {};
+
+	arch_bpf_stack_walk(find_from_stack_cb, &ctx);
+	return ctx.prog;
+}
+
 #endif
-- 
cgit v1.2.3


From d7c431cafcb4917b0d87b5cd10637cd47b6c8d79 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Thu, 3 Jul 2025 13:48:12 -0700
Subject: bpf: Add dump_stack() analogue to print to BPF stderr

Introduce a kernel function which is the analogue of dump_stack()
printing some useful information and the stack trace. This is not
exposed to BPF programs yet, but can be made available in the future.

When we have a program counter for a BPF program in the stack trace,
additionally output the filename and line number to make the trace
helpful. The rest of the trace can be passed into ./decode_stacktrace.sh
to obtain the line numbers for kernel symbols.

Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20250703204818.925464-7-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h |  2 ++
 kernel/bpf/stream.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b267c378d884..34dd90ec7fad 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -3616,8 +3616,10 @@ __printf(2, 3)
 int bpf_stream_stage_printk(struct bpf_stream_stage *ss, const char *fmt, ...);
 int bpf_stream_stage_commit(struct bpf_stream_stage *ss, struct bpf_prog *prog,
 			    enum bpf_stream_id stream_id);
+int bpf_stream_stage_dump_stack(struct bpf_stream_stage *ss);
 
 #define bpf_stream_printk(ss, ...) bpf_stream_stage_printk(&ss, __VA_ARGS__)
+#define bpf_stream_dump_stack(ss) bpf_stream_stage_dump_stack(&ss)
 
 #define bpf_stream_stage(ss, prog, stream_id, expr)            \
 	({                                                     \
diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c
index e434541358db..8c842f845245 100644
--- a/kernel/bpf/stream.c
+++ b/kernel/bpf/stream.c
@@ -2,6 +2,7 @@
 /* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
 
 #include <linux/bpf.h>
+#include <linux/filter.h>
 #include <linux/bpf_mem_alloc.h>
 #include <linux/percpu.h>
 #include <linux/refcount.h>
@@ -476,3 +477,50 @@ int bpf_stream_stage_commit(struct bpf_stream_stage *ss, struct bpf_prog *prog,
 	llist_add_batch(head, tail, &stream->log);
 	return 0;
 }
+
+struct dump_stack_ctx {
+	struct bpf_stream_stage *ss;
+	int err;
+};
+
+static bool dump_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp)
+{
+	struct dump_stack_ctx *ctxp = cookie;
+	const char *file = "", *line = "";
+	struct bpf_prog *prog;
+	int num, ret;
+
+	rcu_read_lock();
+	prog = bpf_prog_ksym_find(ip);
+	rcu_read_unlock();
+	if (prog) {
+		ret = bpf_prog_get_file_line(prog, ip, &file, &line, &num);
+		if (ret < 0)
+			goto end;
+		ctxp->err = bpf_stream_stage_printk(ctxp->ss, "%pS\n  %s @ %s:%d\n",
+						    (void *)ip, line, file, num);
+		return !ctxp->err;
+	}
+end:
+	ctxp->err = bpf_stream_stage_printk(ctxp->ss, "%pS\n", (void *)ip);
+	return !ctxp->err;
+}
+
+int bpf_stream_stage_dump_stack(struct bpf_stream_stage *ss)
+{
+	struct dump_stack_ctx ctx = { .ss = ss };
+	int ret;
+
+	ret = bpf_stream_stage_printk(ss, "CPU: %d UID: %d PID: %d Comm: %s\n",
+				      raw_smp_processor_id(), __kuid_val(current_real_cred()->euid),
+				      current->pid, current->comm);
+	if (ret)
+		return ret;
+	ret = bpf_stream_stage_printk(ss, "Call trace:\n");
+	if (ret)
+		return ret;
+	arch_bpf_stack_walk(dump_stack_cb, &ctx);
+	if (ctx.err)
+		return ctx.err;
+	return bpf_stream_stage_printk(ss, "\n");
+}
-- 
cgit v1.2.3


From 82bc4abf28d8147dd5da9ba52f0aa1bac23c125e Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Thu, 3 Jul 2025 07:11:17 -0700
Subject: bpf: Avoid putting struct bpf_scc_callchain variables on the stack

Add a 'struct bpf_scc_callchain callchain_buf' field in bpf_verifier_env.
This way, the previous bpf_scc_callchain local variables can be
replaced by taking address of env->callchain_buf. This can reduce stack
usage and fix the following error:
    kernel/bpf/verifier.c:19921:12: error: stack frame size (1368) exceeds limit (1280) in 'do_check'
        [-Werror,-Wframe-larger-than]

Reported-by: Arnd Bergmann <arnd@kernel.org>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20250703141117.1485108-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  1 +
 kernel/bpf/verifier.c        | 36 ++++++++++++++++++------------------
 2 files changed, 19 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 7e459e839f8b..94defa405c85 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -841,6 +841,7 @@ struct bpf_verifier_env {
 	char tmp_str_buf[TMP_STR_BUF_LEN];
 	struct bpf_insn insn_buf[INSN_BUF_SIZE];
 	struct bpf_insn epilogue_buf[INSN_BUF_SIZE];
+	struct bpf_scc_callchain callchain_buf;
 	/* array of pointers to bpf_scc_info indexed by SCC id */
 	struct bpf_scc_info **scc_info;
 	u32 scc_cnt;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 92dba3c9664f..0f6cc2275695 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1914,19 +1914,19 @@ static char *format_callchain(struct bpf_verifier_env *env, struct bpf_scc_callc
  */
 static int maybe_enter_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
 {
-	struct bpf_scc_callchain callchain;
+	struct bpf_scc_callchain *callchain = &env->callchain_buf;
 	struct bpf_scc_visit *visit;
 
-	if (!compute_scc_callchain(env, st, &callchain))
+	if (!compute_scc_callchain(env, st, callchain))
 		return 0;
-	visit = scc_visit_lookup(env, &callchain);
-	visit = visit ?: scc_visit_alloc(env, &callchain);
+	visit = scc_visit_lookup(env, callchain);
+	visit = visit ?: scc_visit_alloc(env, callchain);
 	if (!visit)
 		return -ENOMEM;
 	if (!visit->entry_state) {
 		visit->entry_state = st;
 		if (env->log.level & BPF_LOG_LEVEL2)
-			verbose(env, "SCC enter %s\n", format_callchain(env, &callchain));
+			verbose(env, "SCC enter %s\n", format_callchain(env, callchain));
 	}
 	return 0;
 }
@@ -1939,21 +1939,21 @@ static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visi
  */
 static int maybe_exit_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
 {
-	struct bpf_scc_callchain callchain;
+	struct bpf_scc_callchain *callchain = &env->callchain_buf;
 	struct bpf_scc_visit *visit;
 
-	if (!compute_scc_callchain(env, st, &callchain))
+	if (!compute_scc_callchain(env, st, callchain))
 		return 0;
-	visit = scc_visit_lookup(env, &callchain);
+	visit = scc_visit_lookup(env, callchain);
 	if (!visit) {
 		verifier_bug(env, "scc exit: no visit info for call chain %s",
-			     format_callchain(env, &callchain));
+			     format_callchain(env, callchain));
 		return -EFAULT;
 	}
 	if (visit->entry_state != st)
 		return 0;
 	if (env->log.level & BPF_LOG_LEVEL2)
-		verbose(env, "SCC exit %s\n", format_callchain(env, &callchain));
+		verbose(env, "SCC exit %s\n", format_callchain(env, callchain));
 	visit->entry_state = NULL;
 	env->num_backedges -= visit->num_backedges;
 	visit->num_backedges = 0;
@@ -1968,22 +1968,22 @@ static int add_scc_backedge(struct bpf_verifier_env *env,
 			    struct bpf_verifier_state *st,
 			    struct bpf_scc_backedge *backedge)
 {
-	struct bpf_scc_callchain callchain;
+	struct bpf_scc_callchain *callchain = &env->callchain_buf;
 	struct bpf_scc_visit *visit;
 
-	if (!compute_scc_callchain(env, st, &callchain)) {
+	if (!compute_scc_callchain(env, st, callchain)) {
 		verifier_bug(env, "add backedge: no SCC in verification path, insn_idx %d",
 			     st->insn_idx);
 		return -EFAULT;
 	}
-	visit = scc_visit_lookup(env, &callchain);
+	visit = scc_visit_lookup(env, callchain);
 	if (!visit) {
 		verifier_bug(env, "add backedge: no visit info for call chain %s",
-			     format_callchain(env, &callchain));
+			     format_callchain(env, callchain));
 		return -EFAULT;
 	}
 	if (env->log.level & BPF_LOG_LEVEL2)
-		verbose(env, "SCC backedge %s\n", format_callchain(env, &callchain));
+		verbose(env, "SCC backedge %s\n", format_callchain(env, callchain));
 	backedge->next = visit->backedges;
 	visit->backedges = backedge;
 	visit->num_backedges++;
@@ -1999,12 +1999,12 @@ static int add_scc_backedge(struct bpf_verifier_env *env,
 static bool incomplete_read_marks(struct bpf_verifier_env *env,
 				  struct bpf_verifier_state *st)
 {
-	struct bpf_scc_callchain callchain;
+	struct bpf_scc_callchain *callchain = &env->callchain_buf;
 	struct bpf_scc_visit *visit;
 
-	if (!compute_scc_callchain(env, st, &callchain))
+	if (!compute_scc_callchain(env, st, callchain))
 		return false;
-	visit = scc_visit_lookup(env, &callchain);
+	visit = scc_visit_lookup(env, callchain);
 	if (!visit)
 		return false;
 	return !!visit->backedges;
-- 
cgit v1.2.3


From 2b9996417e4ec231c91818f9ea8107ae62ef75ad Mon Sep 17 00:00:00 2001
From: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Date: Fri, 4 Jul 2025 00:23:08 +0200
Subject: af_unix/scm: fix whitespace errors

Fix whitespace/formatting errors.

Cc: linux-kernel@vger.kernel.org
Cc: netdev@vger.kernel.org
Cc: David S. Miller <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Simon Horman <horms@kernel.org>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Kuniyuki Iwashima <kuniyu@google.com>
Cc: Lennart Poettering <mzxreary@0pointer.de>
Cc: Luca Boccassi <bluca@debian.org>
Cc: David Rheinsberg <david@readahead.eu>
Signed-off-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Link: https://lore.kernel.org/20250703222314.309967-5-aleksandr.mikhalitsyn@canonical.com
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/net/scm.h  | 4 ++--
 net/unix/af_unix.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/scm.h b/include/net/scm.h
index 84c4707e78a5..c52519669349 100644
--- a/include/net/scm.h
+++ b/include/net/scm.h
@@ -69,7 +69,7 @@ static __inline__ void unix_get_peersec_dgram(struct socket *sock, struct scm_co
 static __inline__ void scm_set_cred(struct scm_cookie *scm,
 				    struct pid *pid, kuid_t uid, kgid_t gid)
 {
-	scm->pid  = get_pid(pid);
+	scm->pid = get_pid(pid);
 	scm->creds.pid = pid_vnr(pid);
 	scm->creds.uid = uid;
 	scm->creds.gid = gid;
@@ -78,7 +78,7 @@ static __inline__ void scm_set_cred(struct scm_cookie *scm,
 static __inline__ void scm_destroy_cred(struct scm_cookie *scm)
 {
 	put_pid(scm->pid);
-	scm->pid  = NULL;
+	scm->pid = NULL;
 }
 
 static __inline__ void scm_destroy(struct scm_cookie *scm)
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index df2174d9904d..323e4fc85d4b 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1929,7 +1929,7 @@ static void unix_destruct_scm(struct sk_buff *skb)
 	struct scm_cookie scm;
 
 	memset(&scm, 0, sizeof(scm));
-	scm.pid  = UNIXCB(skb).pid;
+	scm.pid = UNIXCB(skb).pid;
 	if (UNIXCB(skb).fp)
 		unix_detach_fds(&scm, skb);
 
-- 
cgit v1.2.3


From 6d4405b16d37090a0c905079eab951cfb5044a65 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Thu, 3 Jul 2025 19:36:19 +0900
Subject: ata: libata-core: Cache the general purpose log directory

The function ata_log_supported() tests if a log page is supported by a
device using the General Purpose Log Directory log page, which lists the
size of all supported log pages. However, this log page is read from the
device using ata_read_log_page() every time ata_log_supported() is
called. That is not necessary.

Avoid reading the General Purpose Log Directory log page by caching its
content in the gp_log_dir buffer defined as part of struct ata_device.
The functions ata_read_log_directory() and ata_clear_log_directory() are
introduced to manage this buffer. ata_clear_log_directory() zero-fills
the gp_log_dir buffer every time ata_dev_configure() is called, that is,
when the device is first scanned and when it is being revalidated.
The function ata_log_supported() is modified to call
ata_read_log_directory() instead of ata_read_log_page().

The function ata_read_log_directory() calls ata_read_log_page() to read
the General Purpose Log Directory log page from the device only if the
first 16-bits word of the log is not equal to 0x0001, that is, it is not
equal to the ACS mandated value for the log version.

With this, the log page is read from the device only once for every
ata_dev_configure() call. For instance, with pr_debug enabled, a call
to ata_dev_configure() before this patch generates the following log
page accesses:

ata3.00: read log page - log 0x0, page 0x0
ata3.00: read log page - log 0x13, page 0x0
ata3.00: read log page - log 0x0, page 0x0
ata3.00: read log page - log 0x12, page 0x0
ata3.00: read log page - log 0x0, page 0x0
ata3.00: read log page - log 0x30, page 0x0
ata3.00: read log page - log 0x30, page 0x8
ata3.00: read log page - log 0x0, page 0x0
ata3.00: read log page - log 0x0, page 0x0
ata3.00: read log page - log 0x0, page 0x0
ata3.00: read log page - log 0x30, page 0x0
ata3.00: read log page - log 0x0, page 0x0
ata3.00: read log page - log 0x30, page 0x0
ata3.00: read log page - log 0x30, page 0x3
ata3.00: read log page - log 0x30, page 0x4
ata3.00: read log page - log 0x18, page 0x0

That is, the general purpose log directory page is read 7 times.
With this patch applied, the number of accesses to this log page is
reduced to one:

ata3.00: read log page - log 0x0, page 0x0
ata3.00: read log page - log 0x13, page 0x0
ata3.00: read log page - log 0x12, page 0x0
ata3.00: read log page - log 0x30, page 0x0
ata3.00: read log page - log 0x30, page 0x8
ata3.00: read log page - log 0x30, page 0x0
ata3.00: read log page - log 0x30, page 0x0
ata3.00: read log page - log 0x30, page 0x3
ata3.00: read log page - log 0x30, page 0x4
ata3.00: read log page - log 0x18, page 0x0

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Niklas Cassel <cassel@kernel.org>
Link: https://lore.kernel.org/r/20250703103622.291272-2-dlemoal@kernel.org
Signed-off-by: Niklas Cassel <cassel@kernel.org>
---
 drivers/ata/libata-core.c | 39 +++++++++++++++++++++++++++++++++++++--
 include/linux/libata.h    |  3 +++
 2 files changed, 40 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 7f6cebe61b33..30913bc6fe21 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -2154,14 +2154,46 @@ retry:
 	return err_mask;
 }
 
+static inline void ata_clear_log_directory(struct ata_device *dev)
+{
+	memset(dev->gp_log_dir, 0, ATA_SECT_SIZE);
+}
+
+static int ata_read_log_directory(struct ata_device *dev)
+{
+	u16 version;
+
+	/* If the log page is already cached, do nothing. */
+	version = get_unaligned_le16(&dev->gp_log_dir[0]);
+	if (version == 0x0001)
+		return 0;
+
+	if (ata_read_log_page(dev, ATA_LOG_DIRECTORY, 0, dev->gp_log_dir, 1)) {
+		ata_clear_log_directory(dev);
+		return -EIO;
+	}
+
+	version = get_unaligned_le16(&dev->gp_log_dir[0]);
+	if (version != 0x0001) {
+		ata_dev_err(dev, "Invalid log directory version 0x%04x\n",
+			    version);
+		ata_clear_log_directory(dev);
+		dev->quirks |= ATA_QUIRK_NO_LOG_DIR;
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int ata_log_supported(struct ata_device *dev, u8 log)
 {
 	if (dev->quirks & ATA_QUIRK_NO_LOG_DIR)
 		return 0;
 
-	if (ata_read_log_page(dev, ATA_LOG_DIRECTORY, 0, dev->sector_buf, 1))
+	if (ata_read_log_directory(dev))
 		return 0;
-	return get_unaligned_le16(&dev->sector_buf[log * 2]);
+
+	return get_unaligned_le16(&dev->gp_log_dir[log * 2]);
 }
 
 static bool ata_identify_page_supported(struct ata_device *dev, u8 page)
@@ -2890,6 +2922,9 @@ int ata_dev_configure(struct ata_device *dev)
 		return 0;
 	}
 
+	/* Clear the general purpose log directory cache. */
+	ata_clear_log_directory(dev);
+
 	/* Set quirks */
 	dev->quirks |= ata_dev_quirks(dev);
 	ata_force_quirks(dev);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 7462218312ad..78a4addc6659 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -761,6 +761,9 @@ struct ata_device {
 		u32		gscr[SATA_PMP_GSCR_DWORDS]; /* PMP GSCR block */
 	} ____cacheline_aligned;
 
+	/* General Purpose Log Directory log page */
+	u8			gp_log_dir[ATA_SECT_SIZE] ____cacheline_aligned;
+
 	/* DEVSLP Timing Variables from Identify Device Data Log */
 	u8			devslp_timing[ATA_LOG_DEVSLP_SIZE];
 
-- 
cgit v1.2.3


From 31921e87b2d2614e261096fdabedef1db7679611 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Thu, 3 Jul 2025 19:36:22 +0900
Subject: ata: libata-core: Rename ata_do_set_mode()

With the renaming of libata-eh ata_set_mode() function to
ata_eh_set_mode(), libata-core function ata_do_set_mode() can now be
renamed to the simpler ata_set_mode().

All the call sites of the former ata_do_set_mode() are updated to use
the new function name.

No functional changes.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Niklas Cassel <cassel@kernel.org>
Link: https://lore.kernel.org/r/20250703103622.291272-5-dlemoal@kernel.org
Signed-off-by: Niklas Cassel <cassel@kernel.org>
---
 drivers/ata/libata-core.c   | 6 +++---
 drivers/ata/libata-eh.c     | 2 +-
 drivers/ata/pata_optidma.c  | 4 +++-
 drivers/ata/pata_pcmcia.c   | 4 ++--
 drivers/ata/pata_pdc2027x.c | 2 +-
 drivers/ata/sata_sil.c      | 2 +-
 include/linux/libata.h      | 2 +-
 7 files changed, 12 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 72abd2996e9c..bbf1318a2b9a 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -3510,7 +3510,7 @@ static int ata_dev_set_mode(struct ata_device *dev)
 }
 
 /**
- *	ata_do_set_mode - Program timings and issue SET FEATURES - XFER
+ *	ata_set_mode - Program timings and issue SET FEATURES - XFER
  *	@link: link on which timings will be programmed
  *	@r_failed_dev: out parameter for failed device
  *
@@ -3526,7 +3526,7 @@ static int ata_dev_set_mode(struct ata_device *dev)
  *	0 on success, negative errno otherwise
  */
 
-int ata_do_set_mode(struct ata_link *link, struct ata_device **r_failed_dev)
+int ata_set_mode(struct ata_link *link, struct ata_device **r_failed_dev)
 {
 	struct ata_port *ap = link->ap;
 	struct ata_device *dev;
@@ -3607,7 +3607,7 @@ int ata_do_set_mode(struct ata_link *link, struct ata_device **r_failed_dev)
 		*r_failed_dev = dev;
 	return rc;
 }
-EXPORT_SYMBOL_GPL(ata_do_set_mode);
+EXPORT_SYMBOL_GPL(ata_set_mode);
 
 /**
  *	ata_wait_ready - wait for link to become ready
diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 8b2a0a56ffe1..e5fa61fb8a59 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -3444,7 +3444,7 @@ static int ata_eh_set_mode(struct ata_link *link,
 	if (ap->ops->set_mode)
 		rc = ap->ops->set_mode(link, r_failed_dev);
 	else
-		rc = ata_do_set_mode(link, r_failed_dev);
+		rc = ata_set_mode(link, r_failed_dev);
 
 	/* if transfer mode has changed, set DUBIOUS_XFER on device */
 	ata_for_each_dev(dev, link, ENABLED) {
diff --git a/drivers/ata/pata_optidma.c b/drivers/ata/pata_optidma.c
index dfc36b4ec9c6..cc876dc7a9d8 100644
--- a/drivers/ata/pata_optidma.c
+++ b/drivers/ata/pata_optidma.c
@@ -322,7 +322,9 @@ static int optidma_set_mode(struct ata_link *link, struct ata_device **r_failed)
 	u8 r;
 	int nybble = 4 * ap->port_no;
 	struct pci_dev *pdev = to_pci_dev(ap->host->dev);
-	int rc  = ata_do_set_mode(link, r_failed);
+	int rc;
+
+	rc = ata_set_mode(link, r_failed);
 	if (rc == 0) {
 		pci_read_config_byte(pdev, 0x43, &r);
 
diff --git a/drivers/ata/pata_pcmcia.c b/drivers/ata/pata_pcmcia.c
index 5b602206c522..cf3810933a27 100644
--- a/drivers/ata/pata_pcmcia.c
+++ b/drivers/ata/pata_pcmcia.c
@@ -46,7 +46,7 @@ static int pcmcia_set_mode(struct ata_link *link, struct ata_device **r_failed_d
 	struct ata_device *slave = &link->device[1];
 
 	if (!ata_dev_enabled(master) || !ata_dev_enabled(slave))
-		return ata_do_set_mode(link, r_failed_dev);
+		return ata_set_mode(link, r_failed_dev);
 
 	if (memcmp(master->id + ATA_ID_FW_REV,  slave->id + ATA_ID_FW_REV,
 			   ATA_ID_FW_REV_LEN + ATA_ID_PROD_LEN) == 0) {
@@ -58,7 +58,7 @@ static int pcmcia_set_mode(struct ata_link *link, struct ata_device **r_failed_d
 			ata_dev_disable(slave);
 		}
 	}
-	return ata_do_set_mode(link, r_failed_dev);
+	return ata_set_mode(link, r_failed_dev);
 }
 
 /**
diff --git a/drivers/ata/pata_pdc2027x.c b/drivers/ata/pata_pdc2027x.c
index 6820c5597b14..a4ee3b92c9aa 100644
--- a/drivers/ata/pata_pdc2027x.c
+++ b/drivers/ata/pata_pdc2027x.c
@@ -387,7 +387,7 @@ static int pdc2027x_set_mode(struct ata_link *link, struct ata_device **r_failed
 	struct ata_device *dev;
 	int rc;
 
-	rc = ata_do_set_mode(link, r_failed);
+	rc = ata_set_mode(link, r_failed);
 	if (rc < 0)
 		return rc;
 
diff --git a/drivers/ata/sata_sil.c b/drivers/ata/sata_sil.c
index 3a99f66198a9..1b6dc950a42a 100644
--- a/drivers/ata/sata_sil.c
+++ b/drivers/ata/sata_sil.c
@@ -351,7 +351,7 @@ static int sil_set_mode(struct ata_link *link, struct ata_device **r_failed)
 	u32 tmp, dev_mode[2] = { };
 	int rc;
 
-	rc = ata_do_set_mode(link, r_failed);
+	rc = ata_set_mode(link, r_failed);
 	if (rc)
 		return rc;
 
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 78a4addc6659..d092747be588 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1218,7 +1218,7 @@ extern int ata_ncq_prio_enabled(struct ata_port *ap, struct scsi_device *sdev,
 extern int ata_ncq_prio_enable(struct ata_port *ap, struct scsi_device *sdev,
 			       bool enable);
 extern struct ata_device *ata_dev_pair(struct ata_device *adev);
-extern int ata_do_set_mode(struct ata_link *link, struct ata_device **r_failed_dev);
+int ata_set_mode(struct ata_link *link, struct ata_device **r_failed_dev);
 extern void ata_scsi_port_error_handler(struct Scsi_Host *host, struct ata_port *ap);
 extern void ata_scsi_cmd_error_handler(struct Scsi_Host *host, struct ata_port *ap, struct list_head *eh_q);
 
-- 
cgit v1.2.3


From 9bc35edb9ac76b23285e7ceafa1576ce53da9b6c Mon Sep 17 00:00:00 2001
From: Etienne Carriere <etienne.carriere@foss.st.com>
Date: Tue, 3 Jun 2025 11:02:09 +0200
Subject: dt-bindings: regulator: Add STM32MP15 SCMI regulator identifiers

These bindings will be used for the SCMI voltage domain.

Signed-off-by: Etienne Carriere <etienne.carriere@foss.st.com>
Signed-off-by: Pascal Paillet <p.paillet@foss.st.com>
Signed-off-by: Amelie Delaunay <amelie.delaunay@foss.st.com>
Acked-by: Rob Herring (Arm) <robh@kernel.org>
Link: https://lore.kernel.org/r/20250603-stm32mp157f-dk2-v2-3-5be0854a9299@foss.st.com
Signed-off-by: Alexandre Torgue <alexandre.torgue@foss.st.com>
---
 .../dt-bindings/regulator/st,stm32mp15-regulator.h | 40 ++++++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 include/dt-bindings/regulator/st,stm32mp15-regulator.h

(limited to 'include')

diff --git a/include/dt-bindings/regulator/st,stm32mp15-regulator.h b/include/dt-bindings/regulator/st,stm32mp15-regulator.h
new file mode 100644
index 000000000000..7052507cb3e5
--- /dev/null
+++ b/include/dt-bindings/regulator/st,stm32mp15-regulator.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (C) 2025, STMicroelectronics - All Rights Reserved
+ */
+
+#ifndef __DT_BINDINGS_REGULATOR_ST_STM32MP15_REGULATOR_H
+#define __DT_BINDINGS_REGULATOR_ST_STM32MP15_REGULATOR_H
+
+/* SCMI voltage domain identifiers */
+
+/* SOC Internal regulators */
+#define VOLTD_SCMI_REG11		0
+#define VOLTD_SCMI_REG18		1
+#define VOLTD_SCMI_USB33		2
+
+/* STPMIC1 regulators */
+#define VOLTD_SCMI_STPMIC1_BUCK1	3
+#define VOLTD_SCMI_STPMIC1_BUCK2	4
+#define VOLTD_SCMI_STPMIC1_BUCK3	5
+#define VOLTD_SCMI_STPMIC1_BUCK4	6
+#define VOLTD_SCMI_STPMIC1_LDO1		7
+#define VOLTD_SCMI_STPMIC1_LDO2		8
+#define VOLTD_SCMI_STPMIC1_LDO3		9
+#define VOLTD_SCMI_STPMIC1_LDO4		10
+#define VOLTD_SCMI_STPMIC1_LDO5		11
+#define VOLTD_SCMI_STPMIC1_LDO6		12
+#define VOLTD_SCMI_STPMIC1_VREFDDR	13
+#define VOLTD_SCMI_STPMIC1_BOOST	14
+#define VOLTD_SCMI_STPMIC1_PWR_SW1	15
+#define VOLTD_SCMI_STPMIC1_PWR_SW2	16
+#define VOLTD_SCMI_VREFBUF		17
+
+/* External regulators */
+#define VOLTD_SCMI_REGU0		18
+#define VOLTD_SCMI_REGU1		19
+#define VOLTD_SCMI_REGU2		20
+#define VOLTD_SCMI_REGU3		21
+#define VOLTD_SCMI_REGU4		22
+
+#endif /*__DT_BINDINGS_REGULATOR_ST_STM32MP15_REGULATOR_H */
-- 
cgit v1.2.3


From 60bc47b5a0b164082d448815d7db3066266aa3ed Mon Sep 17 00:00:00 2001
From: Yicong Yang <yangyicong@hisilicon.com>
Date: Tue, 1 Jul 2025 19:02:13 +0800
Subject: watchdog/perf: Provide function for adjusting the event period

Architectures using perf events for hard lockup detection need to
convert the watchdog_thresh to the event's period. Some architectures,
for example arm64, perform this conversion using the CPU's maximum
frequency, which is acquired from cpufreq. However, by the time
the lockup detector is initialized the cpufreq driver may not be
initialized, so the watchdog is launched with an inaccurate period.
Provide a function hardlockup_detector_perf_adjust_period() to allow
adjusting the event period. The architecture can then update it with a
more accurate period once cpufreq is initialized.

Reviewed-by: Douglas Anderson <dianders@chromium.org>
Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
Link: https://lore.kernel.org/r/20250701110214.27242-2-yangyicong@huawei.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/nmi.h    |  2 ++
 kernel/watchdog_perf.c | 22 ++++++++++++++++++++++
 2 files changed, 24 insertions(+)

(limited to 'include')

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index e78fa535f61d..cf3c6ab408aa 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -103,10 +103,12 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs);
 extern void hardlockup_detector_perf_stop(void);
 extern void hardlockup_detector_perf_restart(void);
 extern void hardlockup_config_perf_event(const char *str);
+extern void hardlockup_detector_perf_adjust_period(u64 period);
 #else
 static inline void hardlockup_detector_perf_stop(void) { }
 static inline void hardlockup_detector_perf_restart(void) { }
 static inline void hardlockup_config_perf_event(const char *str) { }
+static inline void hardlockup_detector_perf_adjust_period(u64 period) { }
 #endif
 
 void watchdog_hardlockup_stop(void);
diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c
index 75af12ff774e..9c58f5b4381d 100644
--- a/kernel/watchdog_perf.c
+++ b/kernel/watchdog_perf.c
@@ -186,6 +186,28 @@ void watchdog_hardlockup_disable(unsigned int cpu)
 	}
 }
 
+/**
+ * hardlockup_detector_perf_adjust_period - Adjust the event period due
+ *                                          to current cpu frequency change
+ * @period: The target period to be set
+ */
+void hardlockup_detector_perf_adjust_period(u64 period)
+{
+	struct perf_event *event = this_cpu_read(watchdog_ev);
+
+	if (!(watchdog_enabled & WATCHDOG_HARDLOCKUP_ENABLED))
+		return;
+
+	if (!event)
+		return;
+
+	if (event->attr.sample_period == period)
+		return;
+
+	if (perf_event_period(event, period))
+		pr_err("failed to change period to %llu\n", period);
+}
+
 /**
  * hardlockup_detector_perf_stop - Globally stop watchdog events
  *
-- 
cgit v1.2.3


From ccfb15b8158c11a8304204aeac354c7b1cfb18a3 Mon Sep 17 00:00:00 2001
From: "Vodapalli, Ravi Kumar" <ravi.kumar.vodapalli@intel.com>
Date: Fri, 4 Jul 2025 16:05:27 +0530
Subject: drm/xe/bmg: Add one additional PCI ID

One additional PCI ID has been added to Bspec for BMG. Add it so that
the driver recognizes devices with this new ID.

Bspec: 68090
Cc: stable@vger.kernel.org # v6.12+
Signed-off-by: Vodapalli, Ravi Kumar <ravi.kumar.vodapalli@intel.com>
Reviewed-by: Shekhar Chauhan <shekhar.chauhan@intel.com>
Acked-by: Matthew Auld <matthew.auld@intel.com>
Signed-off-by: Matthew Auld <matthew.auld@intel.com>
Link: https://lore.kernel.org/r/20250704103527.100178-1-ravi.kumar.vodapalli@intel.com
---
 include/drm/intel/pciids.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/drm/intel/pciids.h b/include/drm/intel/pciids.h
index a0180d10e260..76f8d26f9cc9 100644
--- a/include/drm/intel/pciids.h
+++ b/include/drm/intel/pciids.h
@@ -846,6 +846,7 @@
 /* BMG */
 #define INTEL_BMG_IDS(MACRO__, ...) \
 	MACRO__(0xE202, ## __VA_ARGS__), \
+	MACRO__(0xE209, ## __VA_ARGS__), \
 	MACRO__(0xE20B, ## __VA_ARGS__), \
 	MACRO__(0xE20C, ## __VA_ARGS__), \
 	MACRO__(0xE20D, ## __VA_ARGS__), \
-- 
cgit v1.2.3


From ca115d7e754691c0219eec95ec94dbac7f87daef Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 3 Jul 2025 09:36:41 +0200
Subject: tree-wide: s/struct fileattr/struct file_kattr/g

Now that we expose struct file_attr as our uapi struct, rename the
internal struct fileattr to struct file_kattr to clearly communicate
that it is a kernel-internal struct. This is similar to struct
mount_{k}attr and others.

Link: https://lore.kernel.org/20250703-restlaufzeit-baurecht-9ed44552b481@brauner
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 Documentation/filesystems/locking.rst |  4 ++--
 Documentation/filesystems/vfs.rst     |  4 ++--
 fs/bcachefs/fs.c                      |  4 ++--
 fs/btrfs/ioctl.c                      |  4 ++--
 fs/btrfs/ioctl.h                      |  6 +++---
 fs/ecryptfs/inode.c                   |  4 ++--
 fs/efivarfs/inode.c                   |  4 ++--
 fs/ext2/ext2.h                        |  4 ++--
 fs/ext2/ioctl.c                       |  4 ++--
 fs/ext4/ext4.h                        |  4 ++--
 fs/ext4/ioctl.c                       |  4 ++--
 fs/f2fs/f2fs.h                        |  4 ++--
 fs/f2fs/file.c                        |  4 ++--
 fs/file_attr.c                        | 34 +++++++++++++++++-----------------
 fs/fuse/fuse_i.h                      |  4 ++--
 fs/fuse/ioctl.c                       |  4 ++--
 fs/gfs2/file.c                        |  4 ++--
 fs/gfs2/inode.h                       |  4 ++--
 fs/hfsplus/hfsplus_fs.h               |  4 ++--
 fs/hfsplus/inode.c                    |  4 ++--
 fs/jfs/ioctl.c                        |  4 ++--
 fs/jfs/jfs_inode.h                    |  4 ++--
 fs/nilfs2/ioctl.c                     |  4 ++--
 fs/nilfs2/nilfs.h                     |  4 ++--
 fs/ocfs2/ioctl.c                      |  4 ++--
 fs/ocfs2/ioctl.h                      |  4 ++--
 fs/orangefs/inode.c                   |  4 ++--
 fs/overlayfs/copy_up.c                |  4 ++--
 fs/overlayfs/inode.c                  | 12 ++++++------
 fs/overlayfs/overlayfs.h              | 10 +++++-----
 fs/overlayfs/util.c                   |  2 +-
 fs/ubifs/ioctl.c                      |  4 ++--
 fs/ubifs/ubifs.h                      |  4 ++--
 fs/xfs/xfs_ioctl.c                    | 18 +++++++++---------
 fs/xfs/xfs_ioctl.h                    |  4 ++--
 include/linux/fileattr.h              | 14 +++++++-------
 include/linux/fs.h                    |  6 +++---
 include/linux/lsm_hook_defs.h         |  4 ++--
 include/linux/security.h              |  8 ++++----
 include/uapi/linux/fs.h               |  2 +-
 mm/shmem.c                            |  4 ++--
 security/security.c                   |  4 ++--
 security/selinux/hooks.c              |  4 ++--
 43 files changed, 122 insertions(+), 122 deletions(-)

(limited to 'include')

diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index 2e567e341c3b..2ff02653d7cc 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -87,8 +87,8 @@ prototypes::
 	int (*tmpfile) (struct mnt_idmap *, struct inode *,
 			struct file *, umode_t);
 	int (*fileattr_set)(struct mnt_idmap *idmap,
-			    struct dentry *dentry, struct fileattr *fa);
-	int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa);
+			    struct dentry *dentry, struct file_kattr *fa);
+	int (*fileattr_get)(struct dentry *dentry, struct file_kattr *fa);
 	struct posix_acl * (*get_acl)(struct mnt_idmap *, struct dentry *, int);
 	struct offset_ctx *(*get_offset_ctx)(struct inode *inode);
 
diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
index fd32a9a17bfb..f2bbf4def123 100644
--- a/Documentation/filesystems/vfs.rst
+++ b/Documentation/filesystems/vfs.rst
@@ -515,8 +515,8 @@ As of kernel 2.6.22, the following members are defined:
 		struct posix_acl * (*get_acl)(struct mnt_idmap *, struct dentry *, int);
 	        int (*set_acl)(struct mnt_idmap *, struct dentry *, struct posix_acl *, int);
 		int (*fileattr_set)(struct mnt_idmap *idmap,
-				    struct dentry *dentry, struct fileattr *fa);
-		int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa);
+				    struct dentry *dentry, struct file_kattr *fa);
+		int (*fileattr_get)(struct dentry *dentry, struct file_kattr *fa);
 	        struct offset_ctx *(*get_offset_ctx)(struct inode *inode);
 	};
 
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 85d13f800165..7c4de887629c 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -1619,7 +1619,7 @@ static const __maybe_unused unsigned bch_flags_to_xflags[] = {
 };
 
 static int bch2_fileattr_get(struct dentry *dentry,
-			     struct fileattr *fa)
+			     struct file_kattr *fa)
 {
 	struct bch_inode_info *inode = to_bch_ei(d_inode(dentry));
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
@@ -1682,7 +1682,7 @@ static int fssetxattr_inode_update_fn(struct btree_trans *trans,
 
 static int bch2_fileattr_set(struct mnt_idmap *idmap,
 			     struct dentry *dentry,
-			     struct fileattr *fa)
+			     struct file_kattr *fa)
 {
 	struct bch_inode_info *inode = to_bch_ei(d_inode(dentry));
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 913acef3f0a9..ffb28bfba4fa 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -245,7 +245,7 @@ static int btrfs_check_ioctl_vol_args2_subvol_name(const struct btrfs_ioctl_vol_
  * Set flags/xflags from the internal inode flags. The remaining items of
  * fsxattr are zeroed.
  */
-int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int btrfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
 {
 	const struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
 
@@ -254,7 +254,7 @@ int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
 }
 
 int btrfs_fileattr_set(struct mnt_idmap *idmap,
-		       struct dentry *dentry, struct fileattr *fa)
+		       struct dentry *dentry, struct file_kattr *fa)
 {
 	struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
 	struct btrfs_root *root = inode->root;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index e08ea446cf48..ccf6bed9cc24 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -8,7 +8,7 @@
 struct file;
 struct dentry;
 struct mnt_idmap;
-struct fileattr;
+struct file_kattr;
 struct io_uring_cmd;
 struct btrfs_inode;
 struct btrfs_fs_info;
@@ -16,9 +16,9 @@ struct btrfs_ioctl_balance_args;
 
 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
-int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int btrfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
 int btrfs_fileattr_set(struct mnt_idmap *idmap,
-		       struct dentry *dentry, struct fileattr *fa);
+		       struct dentry *dentry, struct file_kattr *fa);
 int btrfs_ioctl_get_supported_features(void __user *arg);
 void btrfs_sync_inode_flags_to_i_flags(struct btrfs_inode *inode);
 void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 493d7f194956..d83416af17b4 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -1124,13 +1124,13 @@ out:
 	return rc;
 }
 
-static int ecryptfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+static int ecryptfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
 {
 	return vfs_fileattr_get(ecryptfs_dentry_to_lower(dentry), fa);
 }
 
 static int ecryptfs_fileattr_set(struct mnt_idmap *idmap,
-				 struct dentry *dentry, struct fileattr *fa)
+				 struct dentry *dentry, struct file_kattr *fa)
 {
 	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
 	int rc;
diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c
index 98a7299a9ee9..2891614abf8d 100644
--- a/fs/efivarfs/inode.c
+++ b/fs/efivarfs/inode.c
@@ -138,7 +138,7 @@ const struct inode_operations efivarfs_dir_inode_operations = {
 };
 
 static int
-efivarfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+efivarfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
 {
 	unsigned int i_flags;
 	unsigned int flags = 0;
@@ -154,7 +154,7 @@ efivarfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
 
 static int
 efivarfs_fileattr_set(struct mnt_idmap *idmap,
-		      struct dentry *dentry, struct fileattr *fa)
+		      struct dentry *dentry, struct file_kattr *fa)
 {
 	unsigned int i_flags = 0;
 
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 4025f875252a..cf97b76e9fd3 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -750,9 +750,9 @@ extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		       u64 start, u64 len);
 
 /* ioctl.c */
-extern int ext2_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+extern int ext2_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
 extern int ext2_fileattr_set(struct mnt_idmap *idmap,
-			     struct dentry *dentry, struct fileattr *fa);
+			     struct dentry *dentry, struct file_kattr *fa);
 extern long ext2_ioctl(struct file *, unsigned int, unsigned long);
 extern long ext2_compat_ioctl(struct file *, unsigned int, unsigned long);
 
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index 44e04484e570..c3fea55b8efa 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -18,7 +18,7 @@
 #include <linux/uaccess.h>
 #include <linux/fileattr.h>
 
-int ext2_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int ext2_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
 {
 	struct ext2_inode_info *ei = EXT2_I(d_inode(dentry));
 
@@ -28,7 +28,7 @@ int ext2_fileattr_get(struct dentry *dentry, struct fileattr *fa)
 }
 
 int ext2_fileattr_set(struct mnt_idmap *idmap,
-		      struct dentry *dentry, struct fileattr *fa)
+		      struct dentry *dentry, struct file_kattr *fa)
 {
 	struct inode *inode = d_inode(dentry);
 	struct ext2_inode_info *ei = EXT2_I(inode);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 18373de980f2..7d962e7f388a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3103,8 +3103,8 @@ extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
 extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
 extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
 int ext4_fileattr_set(struct mnt_idmap *idmap,
-		      struct dentry *dentry, struct fileattr *fa);
-int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+		      struct dentry *dentry, struct file_kattr *fa);
+int ext4_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
 extern void ext4_reset_inode_seed(struct inode *inode);
 int ext4_update_overhead(struct super_block *sb, bool force);
 int ext4_force_shutdown(struct super_block *sb, u32 flags);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 5668a17458ae..84e3c73952d7 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -980,7 +980,7 @@ group_add_out:
 	return err;
 }
 
-int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int ext4_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
 {
 	struct inode *inode = d_inode(dentry);
 	struct ext4_inode_info *ei = EXT4_I(inode);
@@ -997,7 +997,7 @@ int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa)
 }
 
 int ext4_fileattr_set(struct mnt_idmap *idmap,
-		      struct dentry *dentry, struct fileattr *fa)
+		      struct dentry *dentry, struct file_kattr *fa)
 {
 	struct inode *inode = d_inode(dentry);
 	u32 flags = fa->flags;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 9333a22b9a01..c78464792ceb 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -3615,9 +3615,9 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count);
 int f2fs_do_shutdown(struct f2fs_sb_info *sbi, unsigned int flag,
 						bool readonly, bool need_lock);
 int f2fs_precache_extents(struct inode *inode);
-int f2fs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int f2fs_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
 int f2fs_fileattr_set(struct mnt_idmap *idmap,
-		      struct dentry *dentry, struct fileattr *fa);
+		      struct dentry *dentry, struct file_kattr *fa);
 long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
 long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 int f2fs_transfer_project_quota(struct inode *inode, kprojid_t kprojid);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 6bd3de64f2a8..90180ca22abd 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -3356,7 +3356,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid)
 }
 #endif
 
-int f2fs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int f2fs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
 {
 	struct inode *inode = d_inode(dentry);
 	struct f2fs_inode_info *fi = F2FS_I(inode);
@@ -3380,7 +3380,7 @@ int f2fs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
 }
 
 int f2fs_fileattr_set(struct mnt_idmap *idmap,
-		      struct dentry *dentry, struct fileattr *fa)
+		      struct dentry *dentry, struct file_kattr *fa)
 {
 	struct inode *inode = d_inode(dentry);
 	u32 fsflags = fa->flags, mask = F2FS_SETTABLE_FS_FL;
diff --git a/fs/file_attr.c b/fs/file_attr.c
index 21d6a0607345..17745c89e2be 100644
--- a/fs/file_attr.c
+++ b/fs/file_attr.c
@@ -17,7 +17,7 @@
  * Set ->fsx_xflags, ->fsx_valid and ->flags (translated xflags).  All
  * other fields are zeroed.
  */
-void fileattr_fill_xflags(struct fileattr *fa, u32 xflags)
+void fileattr_fill_xflags(struct file_kattr *fa, u32 xflags)
 {
 	memset(fa, 0, sizeof(*fa));
 	fa->fsx_valid = true;
@@ -47,7 +47,7 @@ EXPORT_SYMBOL(fileattr_fill_xflags);
  * Set ->flags, ->flags_valid and ->fsx_xflags (translated flags).
  * All other fields are zeroed.
  */
-void fileattr_fill_flags(struct fileattr *fa, u32 flags)
+void fileattr_fill_flags(struct file_kattr *fa, u32 flags)
 {
 	memset(fa, 0, sizeof(*fa));
 	fa->flags_valid = true;
@@ -78,7 +78,7 @@ EXPORT_SYMBOL(fileattr_fill_flags);
  *
  * Return: 0 on success, or a negative error on failure.
  */
-int vfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int vfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
 {
 	struct inode *inode = d_inode(dentry);
 	int error;
@@ -94,7 +94,7 @@ int vfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
 }
 EXPORT_SYMBOL(vfs_fileattr_get);
 
-static void fileattr_to_file_attr(const struct fileattr *fa,
+static void fileattr_to_file_attr(const struct file_kattr *fa,
 				  struct file_attr *fattr)
 {
 	__u32 mask = FS_XFLAGS_MASK;
@@ -114,7 +114,7 @@ static void fileattr_to_file_attr(const struct fileattr *fa,
  *
  * Return: 0 on success, or -EFAULT on failure.
  */
-int copy_fsxattr_to_user(const struct fileattr *fa, struct fsxattr __user *ufa)
+int copy_fsxattr_to_user(const struct file_kattr *fa, struct fsxattr __user *ufa)
 {
 	struct fsxattr xfa;
 	__u32 mask = FS_XFLAGS_MASK;
@@ -134,7 +134,7 @@ int copy_fsxattr_to_user(const struct fileattr *fa, struct fsxattr __user *ufa)
 EXPORT_SYMBOL(copy_fsxattr_to_user);
 
 static int file_attr_to_fileattr(const struct file_attr *fattr,
-				 struct fileattr *fa)
+				 struct file_kattr *fa)
 {
 	__u32 mask = FS_XFLAGS_MASK;
 
@@ -150,7 +150,7 @@ static int file_attr_to_fileattr(const struct file_attr *fattr,
 	return 0;
 }
 
-static int copy_fsxattr_from_user(struct fileattr *fa,
+static int copy_fsxattr_from_user(struct file_kattr *fa,
 				  struct fsxattr __user *ufa)
 {
 	struct fsxattr xfa;
@@ -179,8 +179,8 @@ static int copy_fsxattr_from_user(struct fileattr *fa,
  * Note: must be called with inode lock held.
  */
 static int fileattr_set_prepare(struct inode *inode,
-			      const struct fileattr *old_ma,
-			      struct fileattr *fa)
+			      const struct file_kattr *old_ma,
+			      struct file_kattr *fa)
 {
 	int err;
 
@@ -263,10 +263,10 @@ static int fileattr_set_prepare(struct inode *inode,
  * Return: 0 on success, or a negative error on failure.
  */
 int vfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry,
-		     struct fileattr *fa)
+		     struct file_kattr *fa)
 {
 	struct inode *inode = d_inode(dentry);
-	struct fileattr old_ma = {};
+	struct file_kattr old_ma = {};
 	int err;
 
 	if (!inode->i_op->fileattr_set)
@@ -308,7 +308,7 @@ EXPORT_SYMBOL(vfs_fileattr_set);
 
 int ioctl_getflags(struct file *file, unsigned int __user *argp)
 {
-	struct fileattr fa = { .flags_valid = true }; /* hint only */
+	struct file_kattr fa = { .flags_valid = true }; /* hint only */
 	int err;
 
 	err = vfs_fileattr_get(file->f_path.dentry, &fa);
@@ -324,7 +324,7 @@ int ioctl_setflags(struct file *file, unsigned int __user *argp)
 {
 	struct mnt_idmap *idmap = file_mnt_idmap(file);
 	struct dentry *dentry = file->f_path.dentry;
-	struct fileattr fa;
+	struct file_kattr fa;
 	unsigned int flags;
 	int err;
 
@@ -345,7 +345,7 @@ EXPORT_SYMBOL(ioctl_setflags);
 
 int ioctl_fsgetxattr(struct file *file, void __user *argp)
 {
-	struct fileattr fa = { .fsx_valid = true }; /* hint only */
+	struct file_kattr fa = { .fsx_valid = true }; /* hint only */
 	int err;
 
 	err = vfs_fileattr_get(file->f_path.dentry, &fa);
@@ -362,7 +362,7 @@ int ioctl_fssetxattr(struct file *file, void __user *argp)
 {
 	struct mnt_idmap *idmap = file_mnt_idmap(file);
 	struct dentry *dentry = file->f_path.dentry;
-	struct fileattr fa;
+	struct file_kattr fa;
 	int err;
 
 	err = copy_fsxattr_from_user(&fa, argp);
@@ -387,7 +387,7 @@ SYSCALL_DEFINE5(file_getattr, int, dfd, const char __user *, filename,
 	struct filename *name __free(putname) = NULL;
 	unsigned int lookup_flags = 0;
 	struct file_attr fattr;
-	struct fileattr fa;
+	struct file_kattr fa;
 	int error;
 
 	BUILD_BUG_ON(sizeof(struct file_attr) < FILE_ATTR_SIZE_VER0);
@@ -442,7 +442,7 @@ SYSCALL_DEFINE5(file_setattr, int, dfd, const char __user *, filename,
 	struct filename *name __free(putname) = NULL;
 	unsigned int lookup_flags = 0;
 	struct file_attr fattr;
-	struct fileattr fa;
+	struct file_kattr fa;
 	int error;
 
 	BUILD_BUG_ON(sizeof(struct file_attr) < FILE_ATTR_SIZE_VER0);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index b54f4f57789f..501f64ceeab3 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1486,9 +1486,9 @@ void fuse_dax_cancel_work(struct fuse_conn *fc);
 long fuse_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
 			    unsigned long arg);
-int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int fuse_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
 int fuse_fileattr_set(struct mnt_idmap *idmap,
-		      struct dentry *dentry, struct fileattr *fa);
+		      struct dentry *dentry, struct file_kattr *fa);
 
 /* iomode.c */
 int fuse_file_cached_io_open(struct inode *inode, struct fuse_file *ff);
diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c
index f2692f7d5932..57032eadca6c 100644
--- a/fs/fuse/ioctl.c
+++ b/fs/fuse/ioctl.c
@@ -502,7 +502,7 @@ static void fuse_priv_ioctl_cleanup(struct inode *inode, struct fuse_file *ff)
 	fuse_file_release(inode, ff, O_RDONLY, NULL, S_ISDIR(inode->i_mode));
 }
 
-int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int fuse_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
 {
 	struct inode *inode = d_inode(dentry);
 	struct fuse_file *ff;
@@ -542,7 +542,7 @@ cleanup:
 }
 
 int fuse_fileattr_set(struct mnt_idmap *idmap,
-		      struct dentry *dentry, struct fileattr *fa)
+		      struct dentry *dentry, struct file_kattr *fa)
 {
 	struct inode *inode = d_inode(dentry);
 	struct fuse_file *ff;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index fd1147aa3891..65f4371f428c 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -155,7 +155,7 @@ static inline u32 gfs2_gfsflags_to_fsflags(struct inode *inode, u32 gfsflags)
 	return fsflags;
 }
 
-int gfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int gfs2_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
 {
 	struct inode *inode = d_inode(dentry);
 	struct gfs2_inode *ip = GFS2_I(inode);
@@ -276,7 +276,7 @@ out:
 }
 
 int gfs2_fileattr_set(struct mnt_idmap *idmap,
-		      struct dentry *dentry, struct fileattr *fa)
+		      struct dentry *dentry, struct file_kattr *fa)
 {
 	struct inode *inode = d_inode(dentry);
 	u32 fsflags = fa->flags, gfsflags = 0;
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index eafe123617e6..dd970e644fe0 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -107,9 +107,9 @@ loff_t gfs2_seek_hole(struct file *file, loff_t offset);
 extern const struct file_operations gfs2_file_fops_nolock;
 extern const struct file_operations gfs2_dir_fops_nolock;
 
-int gfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int gfs2_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
 int gfs2_fileattr_set(struct mnt_idmap *idmap,
-		      struct dentry *dentry, struct fileattr *fa);
+		      struct dentry *dentry, struct file_kattr *fa);
 void gfs2_set_inode_flags(struct inode *inode);
 
 #ifdef CONFIG_GFS2_FS_LOCKING_DLM
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 2f089bff0095..927db2b8b17c 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -489,9 +489,9 @@ int hfsplus_getattr(struct mnt_idmap *idmap, const struct path *path,
 		    unsigned int query_flags);
 int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
 		       int datasync);
-int hfsplus_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int hfsplus_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
 int hfsplus_fileattr_set(struct mnt_idmap *idmap,
-			 struct dentry *dentry, struct fileattr *fa);
+			 struct dentry *dentry, struct file_kattr *fa);
 
 /* ioctl.c */
 long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index f331e9574217..3ec0b33808c0 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -654,7 +654,7 @@ out:
 	return res;
 }
 
-int hfsplus_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int hfsplus_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
 {
 	struct inode *inode = d_inode(dentry);
 	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
@@ -673,7 +673,7 @@ int hfsplus_fileattr_get(struct dentry *dentry, struct fileattr *fa)
 }
 
 int hfsplus_fileattr_set(struct mnt_idmap *idmap,
-			 struct dentry *dentry, struct fileattr *fa)
+			 struct dentry *dentry, struct file_kattr *fa)
 {
 	struct inode *inode = d_inode(dentry);
 	struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index f7bd7e8f5be4..563f148be8af 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -57,7 +57,7 @@ static long jfs_map_ext2(unsigned long flags, int from)
 	return mapped;
 }
 
-int jfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int jfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
 {
 	struct jfs_inode_info *jfs_inode = JFS_IP(d_inode(dentry));
 	unsigned int flags = jfs_inode->mode2 & JFS_FL_USER_VISIBLE;
@@ -71,7 +71,7 @@ int jfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
 }
 
 int jfs_fileattr_set(struct mnt_idmap *idmap,
-		     struct dentry *dentry, struct fileattr *fa)
+		     struct dentry *dentry, struct file_kattr *fa)
 {
 	struct inode *inode = d_inode(dentry);
 	struct jfs_inode_info *jfs_inode = JFS_IP(inode);
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index ea80661597ac..2c6c81c8cb9f 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -9,9 +9,9 @@ struct fid;
 
 extern struct inode *ialloc(struct inode *, umode_t);
 extern int jfs_fsync(struct file *, loff_t, loff_t, int);
-extern int jfs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+extern int jfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
 extern int jfs_fileattr_set(struct mnt_idmap *idmap,
-			    struct dentry *dentry, struct fileattr *fa);
+			    struct dentry *dentry, struct file_kattr *fa);
 extern long jfs_ioctl(struct file *, unsigned int, unsigned long);
 extern struct inode *jfs_iget(struct super_block *, unsigned long);
 extern int jfs_commit_inode(struct inode *, int);
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index a66d62a51f77..3288c3b4be9e 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -118,7 +118,7 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
  *
  * Return: always 0 as success.
  */
-int nilfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int nilfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
 {
 	struct inode *inode = d_inode(dentry);
 
@@ -136,7 +136,7 @@ int nilfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
  * Return: 0 on success, or a negative error code on failure.
  */
 int nilfs_fileattr_set(struct mnt_idmap *idmap,
-		       struct dentry *dentry, struct fileattr *fa)
+		       struct dentry *dentry, struct file_kattr *fa)
 {
 	struct inode *inode = d_inode(dentry);
 	struct nilfs_transaction_info ti;
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index cb6ed54accd7..f466daa39440 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -268,9 +268,9 @@ int nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
 extern int nilfs_sync_file(struct file *, loff_t, loff_t, int);
 
 /* ioctl.c */
-int nilfs_fileattr_get(struct dentry *dentry, struct fileattr *m);
+int nilfs_fileattr_get(struct dentry *dentry, struct file_kattr *m);
 int nilfs_fileattr_set(struct mnt_idmap *idmap,
-		       struct dentry *dentry, struct fileattr *fa);
+		       struct dentry *dentry, struct file_kattr *fa);
 long nilfs_ioctl(struct file *, unsigned int, unsigned long);
 long nilfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, struct nilfs_argv *,
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 7ae96fb8807a..db14c92302a1 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -62,7 +62,7 @@ static inline int o2info_coherent(struct ocfs2_info_request *req)
 	return (!(req->ir_flags & OCFS2_INFO_FL_NON_COHERENT));
 }
 
-int ocfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int ocfs2_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
 {
 	struct inode *inode = d_inode(dentry);
 	unsigned int flags;
@@ -83,7 +83,7 @@ int ocfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa)
 }
 
 int ocfs2_fileattr_set(struct mnt_idmap *idmap,
-		       struct dentry *dentry, struct fileattr *fa)
+		       struct dentry *dentry, struct file_kattr *fa)
 {
 	struct inode *inode = d_inode(dentry);
 	unsigned int flags = fa->flags;
diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h
index 48a5fdfe87a1..4a1c2313b429 100644
--- a/fs/ocfs2/ioctl.h
+++ b/fs/ocfs2/ioctl.h
@@ -11,9 +11,9 @@
 #ifndef OCFS2_IOCTL_PROTO_H
 #define OCFS2_IOCTL_PROTO_H
 
-int ocfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int ocfs2_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
 int ocfs2_fileattr_set(struct mnt_idmap *idmap,
-		       struct dentry *dentry, struct fileattr *fa);
+		       struct dentry *dentry, struct file_kattr *fa);
 long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
 long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg);
 
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 08a6f372a352..926d1659902d 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -887,7 +887,7 @@ int orangefs_update_time(struct inode *inode, int flags)
 	return __orangefs_setattr(inode, &iattr);
 }
 
-static int orangefs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+static int orangefs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
 {
 	u64 val = 0;
 	int ret;
@@ -908,7 +908,7 @@ static int orangefs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
 }
 
 static int orangefs_fileattr_set(struct mnt_idmap *idmap,
-				 struct dentry *dentry, struct fileattr *fa)
+				 struct dentry *dentry, struct file_kattr *fa)
 {
 	u64 val = 0;
 
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 2c646b7076d0..74817e1ece19 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -171,8 +171,8 @@ out:
 static int ovl_copy_fileattr(struct inode *inode, const struct path *old,
 			     const struct path *new)
 {
-	struct fileattr oldfa = { .flags_valid = true };
-	struct fileattr newfa = { .flags_valid = true };
+	struct file_kattr oldfa = { .flags_valid = true };
+	struct file_kattr newfa = { .flags_valid = true };
 	int err;
 
 	err = ovl_real_fileattr_get(old, &oldfa);
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index cf3581dc1034..ecb9f2019395 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -610,7 +610,7 @@ static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
  * Introducing security_inode_fileattr_get/set() hooks would solve this issue
  * properly.
  */
-static int ovl_security_fileattr(const struct path *realpath, struct fileattr *fa,
+static int ovl_security_fileattr(const struct path *realpath, struct file_kattr *fa,
 				 bool set)
 {
 	struct file *file;
@@ -637,7 +637,7 @@ static int ovl_security_fileattr(const struct path *realpath, struct fileattr *f
 	return err;
 }
 
-int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa)
+int ovl_real_fileattr_set(const struct path *realpath, struct file_kattr *fa)
 {
 	int err;
 
@@ -649,7 +649,7 @@ int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa)
 }
 
 int ovl_fileattr_set(struct mnt_idmap *idmap,
-		     struct dentry *dentry, struct fileattr *fa)
+		     struct dentry *dentry, struct file_kattr *fa)
 {
 	struct inode *inode = d_inode(dentry);
 	struct path upperpath;
@@ -697,7 +697,7 @@ out:
 }
 
 /* Convert inode protection flags to fileattr flags */
-static void ovl_fileattr_prot_flags(struct inode *inode, struct fileattr *fa)
+static void ovl_fileattr_prot_flags(struct inode *inode, struct file_kattr *fa)
 {
 	BUILD_BUG_ON(OVL_PROT_FS_FLAGS_MASK & ~FS_COMMON_FL);
 	BUILD_BUG_ON(OVL_PROT_FSX_FLAGS_MASK & ~FS_XFLAG_COMMON);
@@ -712,7 +712,7 @@ static void ovl_fileattr_prot_flags(struct inode *inode, struct fileattr *fa)
 	}
 }
 
-int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa)
+int ovl_real_fileattr_get(const struct path *realpath, struct file_kattr *fa)
 {
 	int err;
 
@@ -723,7 +723,7 @@ int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa)
 	return vfs_fileattr_get(realpath->dentry, fa);
 }
 
-int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int ovl_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
 {
 	struct inode *inode = d_inode(dentry);
 	struct path realpath;
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 8baaba0a3fe5..e19d91f22186 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -815,7 +815,7 @@ void ovl_copyattr(struct inode *to);
 
 void ovl_check_protattr(struct inode *inode, struct dentry *upper);
 int ovl_set_protattr(struct inode *inode, struct dentry *upper,
-		      struct fileattr *fa);
+		      struct file_kattr *fa);
 
 static inline void ovl_copyflags(struct inode *from, struct inode *to)
 {
@@ -847,11 +847,11 @@ struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir,
 
 /* file.c */
 extern const struct file_operations ovl_file_operations;
-int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa);
-int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa);
-int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int ovl_real_fileattr_get(const struct path *realpath, struct file_kattr *fa);
+int ovl_real_fileattr_set(const struct path *realpath, struct file_kattr *fa);
+int ovl_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
 int ovl_fileattr_set(struct mnt_idmap *idmap,
-		     struct dentry *dentry, struct fileattr *fa);
+		     struct dentry *dentry, struct file_kattr *fa);
 struct ovl_file;
 struct ovl_file *ovl_file_alloc(struct file *realfile);
 void ovl_file_free(struct ovl_file *of);
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index dcccb4b4a66c..607860f199a8 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -959,7 +959,7 @@ void ovl_check_protattr(struct inode *inode, struct dentry *upper)
 }
 
 int ovl_set_protattr(struct inode *inode, struct dentry *upper,
-		      struct fileattr *fa)
+		      struct file_kattr *fa)
 {
 	struct ovl_fs *ofs = OVL_FS(inode->i_sb);
 	char buf[OVL_PROTATTR_MAX];
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 2c99349cf537..79536b2e3d7a 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -130,7 +130,7 @@ static int setflags(struct inode *inode, int flags)
 	return err;
 }
 
-int ubifs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int ubifs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
 {
 	struct inode *inode = d_inode(dentry);
 	int flags = ubifs2ioctl(ubifs_inode(inode)->flags);
@@ -145,7 +145,7 @@ int ubifs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
 }
 
 int ubifs_fileattr_set(struct mnt_idmap *idmap,
-		       struct dentry *dentry, struct fileattr *fa)
+		       struct dentry *dentry, struct file_kattr *fa)
 {
 	struct inode *inode = d_inode(dentry);
 	int flags = fa->flags;
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 256dbaeeb0de..5db45c9e26ee 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -2073,9 +2073,9 @@ int ubifs_recover_size(struct ubifs_info *c, bool in_place);
 void ubifs_destroy_size_tree(struct ubifs_info *c);
 
 /* ioctl.c */
-int ubifs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int ubifs_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
 int ubifs_fileattr_set(struct mnt_idmap *idmap,
-		       struct dentry *dentry, struct fileattr *fa);
+		       struct dentry *dentry, struct file_kattr *fa);
 long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 void ubifs_set_inode_flags(struct inode *inode);
 #ifdef CONFIG_COMPAT
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index d250f7f74e3b..911b68175ad8 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -444,7 +444,7 @@ static void
 xfs_fill_fsxattr(
 	struct xfs_inode	*ip,
 	int			whichfork,
-	struct fileattr		*fa)
+	struct file_kattr	*fa)
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
@@ -496,7 +496,7 @@ xfs_ioc_fsgetxattra(
 	xfs_inode_t		*ip,
 	void			__user *arg)
 {
-	struct fileattr		fa;
+	struct file_kattr	fa;
 
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
 	xfs_fill_fsxattr(ip, XFS_ATTR_FORK, &fa);
@@ -508,7 +508,7 @@ xfs_ioc_fsgetxattra(
 int
 xfs_fileattr_get(
 	struct dentry		*dentry,
-	struct fileattr		*fa)
+	struct file_kattr	*fa)
 {
 	struct xfs_inode	*ip = XFS_I(d_inode(dentry));
 
@@ -526,7 +526,7 @@ static int
 xfs_ioctl_setattr_xflags(
 	struct xfs_trans	*tp,
 	struct xfs_inode	*ip,
-	struct fileattr		*fa)
+	struct file_kattr	*fa)
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	bool			rtflag = (fa->fsx_xflags & FS_XFLAG_REALTIME);
@@ -582,7 +582,7 @@ xfs_ioctl_setattr_xflags(
 static void
 xfs_ioctl_setattr_prepare_dax(
 	struct xfs_inode	*ip,
-	struct fileattr		*fa)
+	struct file_kattr	*fa)
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	struct inode            *inode = VFS_I(ip);
@@ -642,7 +642,7 @@ out_error:
 static int
 xfs_ioctl_setattr_check_extsize(
 	struct xfs_inode	*ip,
-	struct fileattr		*fa)
+	struct file_kattr	*fa)
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	xfs_failaddr_t		failaddr;
@@ -684,7 +684,7 @@ xfs_ioctl_setattr_check_extsize(
 static int
 xfs_ioctl_setattr_check_cowextsize(
 	struct xfs_inode	*ip,
-	struct fileattr		*fa)
+	struct file_kattr	*fa)
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	xfs_failaddr_t		failaddr;
@@ -709,7 +709,7 @@ xfs_ioctl_setattr_check_cowextsize(
 static int
 xfs_ioctl_setattr_check_projid(
 	struct xfs_inode	*ip,
-	struct fileattr		*fa)
+	struct file_kattr	*fa)
 {
 	if (!fa->fsx_valid)
 		return 0;
@@ -725,7 +725,7 @@ int
 xfs_fileattr_set(
 	struct mnt_idmap	*idmap,
 	struct dentry		*dentry,
-	struct fileattr		*fa)
+	struct file_kattr	*fa)
 {
 	struct xfs_inode	*ip = XFS_I(d_inode(dentry));
 	struct xfs_mount	*mp = ip->i_mount;
diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
index 12124946f347..f5ed5cf9d3df 100644
--- a/fs/xfs/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
@@ -17,13 +17,13 @@ xfs_ioc_swapext(
 extern int
 xfs_fileattr_get(
 	struct dentry		*dentry,
-	struct fileattr		*fa);
+	struct file_kattr	*fa);
 
 extern int
 xfs_fileattr_set(
 	struct mnt_idmap	*idmap,
 	struct dentry		*dentry,
-	struct fileattr		*fa);
+	struct file_kattr	*fa);
 
 extern long
 xfs_file_ioctl(
diff --git a/include/linux/fileattr.h b/include/linux/fileattr.h
index e2a2f4ae242d..f89dcfad3f8f 100644
--- a/include/linux/fileattr.h
+++ b/include/linux/fileattr.h
@@ -40,7 +40,7 @@
  * is handled by the VFS helpers, so filesystems are free to implement just one
  * or both of these sub-interfaces.
  */
-struct fileattr {
+struct file_kattr {
 	u32	flags;		/* flags (FS_IOC_GETFLAGS/FS_IOC_SETFLAGS) */
 	/* struct fsxattr: */
 	u32	fsx_xflags;	/* xflags field value (get/set) */
@@ -53,10 +53,10 @@ struct fileattr {
 	bool	fsx_valid:1;
 };
 
-int copy_fsxattr_to_user(const struct fileattr *fa, struct fsxattr __user *ufa);
+int copy_fsxattr_to_user(const struct file_kattr *fa, struct fsxattr __user *ufa);
 
-void fileattr_fill_xflags(struct fileattr *fa, u32 xflags);
-void fileattr_fill_flags(struct fileattr *fa, u32 flags);
+void fileattr_fill_xflags(struct file_kattr *fa, u32 xflags);
+void fileattr_fill_flags(struct file_kattr *fa, u32 flags);
 
 /**
  * fileattr_has_fsx - check for extended flags/attributes
@@ -65,16 +65,16 @@ void fileattr_fill_flags(struct fileattr *fa, u32 flags);
  * Return: true if any attributes are present that are not represented in
  * ->flags.
  */
-static inline bool fileattr_has_fsx(const struct fileattr *fa)
+static inline bool fileattr_has_fsx(const struct file_kattr *fa)
 {
 	return fa->fsx_valid &&
 		((fa->fsx_xflags & ~FS_XFLAG_COMMON) || fa->fsx_extsize != 0 ||
 		 fa->fsx_projid != 0 ||	fa->fsx_cowextsize != 0);
 }
 
-int vfs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int vfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
 int vfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry,
-		     struct fileattr *fa);
+		     struct file_kattr *fa);
 int ioctl_getflags(struct file *file, unsigned int __user *argp);
 int ioctl_setflags(struct file *file, unsigned int __user *argp);
 int ioctl_fsgetxattr(struct file *file, void __user *argp);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 96c7925a6551..0c58617645ea 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -80,7 +80,7 @@ struct fsnotify_mark_connector;
 struct fsnotify_sb_info;
 struct fs_context;
 struct fs_parameter_spec;
-struct fileattr;
+struct file_kattr;
 struct iomap_ops;
 
 extern void __init inode_init(void);
@@ -2254,8 +2254,8 @@ struct inode_operations {
 	int (*set_acl)(struct mnt_idmap *, struct dentry *,
 		       struct posix_acl *, int);
 	int (*fileattr_set)(struct mnt_idmap *idmap,
-			    struct dentry *dentry, struct fileattr *fa);
-	int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa);
+			    struct dentry *dentry, struct file_kattr *fa);
+	int (*fileattr_get)(struct dentry *dentry, struct file_kattr *fa);
 	struct offset_ctx *(*get_offset_ctx)(struct inode *inode);
 } ____cacheline_aligned;
 
diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 9600a4350e79..fd11fffdd3c3 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -157,8 +157,8 @@ LSM_HOOK(int, 0, inode_removexattr, struct mnt_idmap *idmap,
 	 struct dentry *dentry, const char *name)
 LSM_HOOK(void, LSM_RET_VOID, inode_post_removexattr, struct dentry *dentry,
 	 const char *name)
-LSM_HOOK(int, 0, inode_file_setattr, struct dentry *dentry, struct fileattr *fa)
-LSM_HOOK(int, 0, inode_file_getattr, struct dentry *dentry, struct fileattr *fa)
+LSM_HOOK(int, 0, inode_file_setattr, struct dentry *dentry, struct file_kattr *fa)
+LSM_HOOK(int, 0, inode_file_getattr, struct dentry *dentry, struct file_kattr *fa)
 LSM_HOOK(int, 0, inode_set_acl, struct mnt_idmap *idmap,
 	 struct dentry *dentry, const char *acl_name, struct posix_acl *kacl)
 LSM_HOOK(void, LSM_RET_VOID, inode_post_set_acl, struct dentry *dentry,
diff --git a/include/linux/security.h b/include/linux/security.h
index 9ed0d0e0c81f..b95b5540c429 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -452,9 +452,9 @@ int security_inode_removexattr(struct mnt_idmap *idmap,
 			       struct dentry *dentry, const char *name);
 void security_inode_post_removexattr(struct dentry *dentry, const char *name);
 int security_inode_file_setattr(struct dentry *dentry,
-			      struct fileattr *fa);
+			      struct file_kattr *fa);
 int security_inode_file_getattr(struct dentry *dentry,
-			      struct fileattr *fa);
+			      struct file_kattr *fa);
 int security_inode_need_killpriv(struct dentry *dentry);
 int security_inode_killpriv(struct mnt_idmap *idmap, struct dentry *dentry);
 int security_inode_getsecurity(struct mnt_idmap *idmap,
@@ -1057,13 +1057,13 @@ static inline void security_inode_post_removexattr(struct dentry *dentry,
 { }
 
 static inline int security_inode_file_setattr(struct dentry *dentry,
-					      struct fileattr *fa)
+					      struct file_kattr *fa)
 {
 	return 0;
 }
 
 static inline int security_inode_file_getattr(struct dentry *dentry,
-					      struct fileattr *fa)
+					      struct file_kattr *fa)
 {
 	return 0;
 }
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 9663dbdda181..6e136c9c6a22 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -151,7 +151,7 @@ struct fsxattr {
 /*
  * Variable size structure for file_[sg]et_attr().
  *
- * Note. This is alternative to the structure 'struct fileattr'/'struct fsxattr'.
+ * Note. This is alternative to the structure 'struct file_kattr'/'struct fsxattr'.
  * As this structure is passed to/from userspace with its size, this can
  * be versioned based on the size.
  */
diff --git a/mm/shmem.c b/mm/shmem.c
index 0c5fb4ffa03a..6311fe35c577 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -4183,7 +4183,7 @@ static const char *shmem_get_link(struct dentry *dentry, struct inode *inode,
 
 #ifdef CONFIG_TMPFS_XATTR
 
-static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+static int shmem_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
 {
 	struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
 
@@ -4193,7 +4193,7 @@ static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa)
 }
 
 static int shmem_fileattr_set(struct mnt_idmap *idmap,
-			      struct dentry *dentry, struct fileattr *fa)
+			      struct dentry *dentry, struct file_kattr *fa)
 {
 	struct inode *inode = d_inode(dentry);
 	struct shmem_inode_info *info = SHMEM_I(inode);
diff --git a/security/security.c b/security/security.c
index 711b4de40b8d..a5766cbf6f7c 100644
--- a/security/security.c
+++ b/security/security.c
@@ -2632,7 +2632,7 @@ void security_inode_post_removexattr(struct dentry *dentry, const char *name)
  *
  * Return: Returns 0 if permission is granted.
  */
-int security_inode_file_setattr(struct dentry *dentry, struct fileattr *fa)
+int security_inode_file_setattr(struct dentry *dentry, struct file_kattr *fa)
 {
 	return call_int_hook(inode_file_setattr, dentry, fa);
 }
@@ -2647,7 +2647,7 @@ int security_inode_file_setattr(struct dentry *dentry, struct fileattr *fa)
  *
  * Return: Returns 0 if permission is granted.
  */
-int security_inode_file_getattr(struct dentry *dentry, struct fileattr *fa)
+int security_inode_file_getattr(struct dentry *dentry, struct file_kattr *fa)
 {
 	return call_int_hook(inode_file_getattr, dentry, fa);
 }
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index be7aca2269fa..0dadce2267c1 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3481,13 +3481,13 @@ static int selinux_inode_removexattr(struct mnt_idmap *idmap,
 }
 
 static int selinux_inode_file_setattr(struct dentry *dentry,
-				      struct fileattr *fa)
+				      struct file_kattr *fa)
 {
 	return dentry_has_perm(current_cred(), dentry, FILE__SETATTR);
 }
 
 static int selinux_inode_file_getattr(struct dentry *dentry,
-				      struct fileattr *fa)
+				      struct file_kattr *fa)
 {
 	return dentry_has_perm(current_cred(), dentry, FILE__GETATTR);
 }
-- 
cgit v1.2.3


From 3135d5be7c27841526d98150c245304ab312e9f4 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Mon, 30 Jun 2025 09:06:34 -0700
Subject: lib/crypto: sha256: Reorder some code

First, move the declarations of sha224_init/update/final to be just
above the corresponding SHA-256 code, matching the order that I used for
SHA-384 and SHA-512.  In sha2.h, the end result is that SHA-224,
SHA-256, SHA-384, and SHA-512 are all in the logical order.

Second, move sha224_block_init() and sha256_block_init() to be just
below crypto_sha256_state.  In later changes, these functions as well as
struct crypto_sha256_state will no longer be used by the library
functions.  They'll remain just for some legacy offload drivers.  This
gets them into a logical place in the file for that.

No code changes other than reordering.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250630160645.3198-4-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 include/crypto/sha2.h | 60 +++++++++++++++++++++++++--------------------------
 lib/crypto/sha256.c   | 12 +++++------
 2 files changed, 36 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/include/crypto/sha2.h b/include/crypto/sha2.h
index 296ce9d468bf..bb181b7996cd 100644
--- a/include/crypto/sha2.h
+++ b/include/crypto/sha2.h
@@ -71,6 +71,32 @@ struct crypto_sha256_state {
 	u64 count;
 };
 
+static inline void sha224_block_init(struct crypto_sha256_state *sctx)
+{
+	sctx->state[0] = SHA224_H0;
+	sctx->state[1] = SHA224_H1;
+	sctx->state[2] = SHA224_H2;
+	sctx->state[3] = SHA224_H3;
+	sctx->state[4] = SHA224_H4;
+	sctx->state[5] = SHA224_H5;
+	sctx->state[6] = SHA224_H6;
+	sctx->state[7] = SHA224_H7;
+	sctx->count = 0;
+}
+
+static inline void sha256_block_init(struct crypto_sha256_state *sctx)
+{
+	sctx->state[0] = SHA256_H0;
+	sctx->state[1] = SHA256_H1;
+	sctx->state[2] = SHA256_H2;
+	sctx->state[3] = SHA256_H3;
+	sctx->state[4] = SHA256_H4;
+	sctx->state[5] = SHA256_H5;
+	sctx->state[6] = SHA256_H6;
+	sctx->state[7] = SHA256_H7;
+	sctx->count = 0;
+}
+
 struct sha256_state {
 	union {
 		struct crypto_sha256_state ctx;
@@ -88,18 +114,12 @@ struct sha512_state {
 	u8 buf[SHA512_BLOCK_SIZE];
 };
 
-static inline void sha256_block_init(struct crypto_sha256_state *sctx)
+static inline void sha224_init(struct sha256_state *sctx)
 {
-	sctx->state[0] = SHA256_H0;
-	sctx->state[1] = SHA256_H1;
-	sctx->state[2] = SHA256_H2;
-	sctx->state[3] = SHA256_H3;
-	sctx->state[4] = SHA256_H4;
-	sctx->state[5] = SHA256_H5;
-	sctx->state[6] = SHA256_H6;
-	sctx->state[7] = SHA256_H7;
-	sctx->count = 0;
+	sha224_block_init(&sctx->ctx);
 }
+/* Simply use sha256_update as it is equivalent to sha224_update. */
+void sha224_final(struct sha256_state *sctx, u8 out[SHA224_DIGEST_SIZE]);
 
 static inline void sha256_init(struct sha256_state *sctx)
 {
@@ -109,26 +129,6 @@ void sha256_update(struct sha256_state *sctx, const u8 *data, size_t len);
 void sha256_final(struct sha256_state *sctx, u8 out[SHA256_DIGEST_SIZE]);
 void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]);
 
-static inline void sha224_block_init(struct crypto_sha256_state *sctx)
-{
-	sctx->state[0] = SHA224_H0;
-	sctx->state[1] = SHA224_H1;
-	sctx->state[2] = SHA224_H2;
-	sctx->state[3] = SHA224_H3;
-	sctx->state[4] = SHA224_H4;
-	sctx->state[5] = SHA224_H5;
-	sctx->state[6] = SHA224_H6;
-	sctx->state[7] = SHA224_H7;
-	sctx->count = 0;
-}
-
-static inline void sha224_init(struct sha256_state *sctx)
-{
-	sha224_block_init(&sctx->ctx);
-}
-/* Simply use sha256_update as it is equivalent to sha224_update. */
-void sha224_final(struct sha256_state *sctx, u8 out[SHA224_DIGEST_SIZE]);
-
 /* State for the SHA-512 (and SHA-384) compression function */
 struct sha512_block_state {
 	u64 h[8];
diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c
index 6bfa4ae8dfb5..573ccecbf48b 100644
--- a/lib/crypto/sha256.c
+++ b/lib/crypto/sha256.c
@@ -58,18 +58,18 @@ static inline void __sha256_final(struct sha256_state *sctx, u8 *out,
 	memzero_explicit(sctx, sizeof(*sctx));
 }
 
-void sha256_final(struct sha256_state *sctx, u8 out[SHA256_DIGEST_SIZE])
-{
-	__sha256_final(sctx, out, SHA256_DIGEST_SIZE);
-}
-EXPORT_SYMBOL(sha256_final);
-
 void sha224_final(struct sha256_state *sctx, u8 out[SHA224_DIGEST_SIZE])
 {
 	__sha256_final(sctx, out, SHA224_DIGEST_SIZE);
 }
 EXPORT_SYMBOL(sha224_final);
 
+void sha256_final(struct sha256_state *sctx, u8 out[SHA256_DIGEST_SIZE])
+{
+	__sha256_final(sctx, out, SHA256_DIGEST_SIZE);
+}
+EXPORT_SYMBOL(sha256_final);
+
 void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE])
 {
 	struct sha256_state sctx;
-- 
cgit v1.2.3


From 9f97707bdb1e479ea15e14e5525164f5f1128e97 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Mon, 30 Jun 2025 09:06:35 -0700
Subject: lib/crypto: sha256: Remove sha256_blocks_simd()

Instead of having both sha256_blocks_arch() and sha256_blocks_simd(),
have just sha256_blocks_arch(), which uses the most efficient
implementation that is available in the calling context.

This is simpler, as it reduces the API surface.  It's also safer, since
sha256_blocks_arch() just works in all contexts, including contexts
where the FPU/SIMD/vector registers cannot be used.  This doesn't mean
that SHA-256 computations *should* be done in such contexts, but rather
we should just do the right thing instead of corrupting a random task's
registers.  Eliminating this footgun and simplifying the code is well
worth the very small performance cost of doing the check.

Note: in the case of arm and arm64, what used to be sha256_blocks_arch()
is renamed back to its original name of sha256_block_data_order().
sha256_blocks_arch() is now used for the higher-level dispatch function.
This renaming also required an update to lib/crypto/arm64/sha512.h,
since sha2-armv8.pl is shared by both SHA-256 and SHA-512.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250630160645.3198-5-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 include/crypto/internal/sha2.h |  6 ------
 lib/crypto/Kconfig             |  8 --------
 lib/crypto/arm/Kconfig         |  1 -
 lib/crypto/arm/sha256-armv4.pl | 20 ++++++++++----------
 lib/crypto/arm/sha256.c        | 14 +++++++-------
 lib/crypto/arm64/Kconfig       |  1 -
 lib/crypto/arm64/sha2-armv8.pl |  2 +-
 lib/crypto/arm64/sha256.c      | 14 +++++++-------
 lib/crypto/arm64/sha512.h      |  6 +++---
 lib/crypto/riscv/Kconfig       |  1 -
 lib/crypto/riscv/sha256.c      | 12 +++---------
 lib/crypto/x86/Kconfig         |  1 -
 lib/crypto/x86/sha256.c        | 12 +++---------
 13 files changed, 34 insertions(+), 64 deletions(-)

(limited to 'include')

diff --git a/include/crypto/internal/sha2.h b/include/crypto/internal/sha2.h
index 21a27fd5e198..5a25ccc49388 100644
--- a/include/crypto/internal/sha2.h
+++ b/include/crypto/internal/sha2.h
@@ -3,7 +3,6 @@
 #ifndef _CRYPTO_INTERNAL_SHA2_H
 #define _CRYPTO_INTERNAL_SHA2_H
 
-#include <crypto/internal/simd.h>
 #include <crypto/sha2.h>
 #include <linux/compiler_attributes.h>
 #include <linux/string.h>
@@ -22,8 +21,6 @@ void sha256_blocks_generic(u32 state[SHA256_STATE_WORDS],
 			   const u8 *data, size_t nblocks);
 void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
 			const u8 *data, size_t nblocks);
-void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS],
-			const u8 *data, size_t nblocks);
 
 static __always_inline void sha256_choose_blocks(
 	u32 state[SHA256_STATE_WORDS], const u8 *data, size_t nblocks,
@@ -31,9 +28,6 @@ static __always_inline void sha256_choose_blocks(
 {
 	if (!IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_SHA256) || force_generic)
 		sha256_blocks_generic(state, data, nblocks);
-	else if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD) &&
-		 (force_simd || crypto_simd_usable()))
-		sha256_blocks_simd(state, data, nblocks);
 	else
 		sha256_blocks_arch(state, data, nblocks);
 }
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index 2460ddff967f..9bd740475a89 100644
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -150,14 +150,6 @@ config CRYPTO_ARCH_HAVE_LIB_SHA256
 	  Declares whether the architecture provides an arch-specific
 	  accelerated implementation of the SHA-256 library interface.
 
-config CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD
-	bool
-	help
-	  Declares whether the architecture provides an arch-specific
-	  accelerated implementation of the SHA-256 library interface
-	  that is SIMD-based and therefore not usable in hardirq
-	  context.
-
 config CRYPTO_LIB_SHA256_GENERIC
 	tristate
 	default CRYPTO_LIB_SHA256 if !CRYPTO_ARCH_HAVE_LIB_SHA256
diff --git a/lib/crypto/arm/Kconfig b/lib/crypto/arm/Kconfig
index d1ad664f0c67..9f3ff30f4032 100644
--- a/lib/crypto/arm/Kconfig
+++ b/lib/crypto/arm/Kconfig
@@ -28,4 +28,3 @@ config CRYPTO_SHA256_ARM
 	depends on !CPU_V7M
 	default CRYPTO_LIB_SHA256
 	select CRYPTO_ARCH_HAVE_LIB_SHA256
-	select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD
diff --git a/lib/crypto/arm/sha256-armv4.pl b/lib/crypto/arm/sha256-armv4.pl
index 8122db7fd599..f3a2b54efd4e 100644
--- a/lib/crypto/arm/sha256-armv4.pl
+++ b/lib/crypto/arm/sha256-armv4.pl
@@ -204,18 +204,18 @@ K256:
 .word	0				@ terminator
 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
 .LOPENSSL_armcap:
-.word	OPENSSL_armcap_P-sha256_blocks_arch
+.word	OPENSSL_armcap_P-sha256_block_data_order
 #endif
 .align	5
 
-.global	sha256_blocks_arch
-.type	sha256_blocks_arch,%function
-sha256_blocks_arch:
-.Lsha256_blocks_arch:
+.global	sha256_block_data_order
+.type	sha256_block_data_order,%function
+sha256_block_data_order:
+.Lsha256_block_data_order:
 #if __ARM_ARCH__<7
-	sub	r3,pc,#8		@ sha256_blocks_arch
+	sub	r3,pc,#8		@ sha256_block_data_order
 #else
-	adr	r3,.Lsha256_blocks_arch
+	adr	r3,.Lsha256_block_data_order
 #endif
 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
 	ldr	r12,.LOPENSSL_armcap
@@ -282,7 +282,7 @@ $code.=<<___;
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	bx	lr			@ interoperable with Thumb ISA:-)
 #endif
-.size	sha256_blocks_arch,.-sha256_blocks_arch
+.size	sha256_block_data_order,.-sha256_block_data_order
 ___
 ######################################################################
 # NEON stuff
@@ -470,8 +470,8 @@ sha256_block_data_order_neon:
 	stmdb	sp!,{r4-r12,lr}
 
 	sub	$H,sp,#16*4+16
-	adr	$Ktbl,.Lsha256_blocks_arch
-	sub	$Ktbl,$Ktbl,#.Lsha256_blocks_arch-K256
+	adr	$Ktbl,.Lsha256_block_data_order
+	sub	$Ktbl,$Ktbl,#.Lsha256_block_data_order-K256
 	bic	$H,$H,#15		@ align for 128-bit stores
 	mov	$t2,sp
 	mov	sp,$H			@ alloca
diff --git a/lib/crypto/arm/sha256.c b/lib/crypto/arm/sha256.c
index 109192e54b0f..2c9cfdaaa069 100644
--- a/lib/crypto/arm/sha256.c
+++ b/lib/crypto/arm/sha256.c
@@ -6,12 +6,12 @@
  */
 #include <asm/neon.h>
 #include <crypto/internal/sha2.h>
+#include <crypto/internal/simd.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 
-asmlinkage void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
-				   const u8 *data, size_t nblocks);
-EXPORT_SYMBOL_GPL(sha256_blocks_arch);
+asmlinkage void sha256_block_data_order(u32 state[SHA256_STATE_WORDS],
+					const u8 *data, size_t nblocks);
 asmlinkage void sha256_block_data_order_neon(u32 state[SHA256_STATE_WORDS],
 					     const u8 *data, size_t nblocks);
 asmlinkage void sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
@@ -20,11 +20,11 @@ asmlinkage void sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
 
-void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS],
+void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
 			const u8 *data, size_t nblocks)
 {
 	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
-	    static_branch_likely(&have_neon)) {
+	    static_branch_likely(&have_neon) && crypto_simd_usable()) {
 		kernel_neon_begin();
 		if (static_branch_likely(&have_ce))
 			sha256_ce_transform(state, data, nblocks);
@@ -32,10 +32,10 @@ void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS],
 			sha256_block_data_order_neon(state, data, nblocks);
 		kernel_neon_end();
 	} else {
-		sha256_blocks_arch(state, data, nblocks);
+		sha256_block_data_order(state, data, nblocks);
 	}
 }
-EXPORT_SYMBOL_GPL(sha256_blocks_simd);
+EXPORT_SYMBOL_GPL(sha256_blocks_arch);
 
 bool sha256_is_arch_optimized(void)
 {
diff --git a/lib/crypto/arm64/Kconfig b/lib/crypto/arm64/Kconfig
index 129a7685cb4c..49e57bfdb5b5 100644
--- a/lib/crypto/arm64/Kconfig
+++ b/lib/crypto/arm64/Kconfig
@@ -17,4 +17,3 @@ config CRYPTO_SHA256_ARM64
 	tristate
 	default CRYPTO_LIB_SHA256
 	select CRYPTO_ARCH_HAVE_LIB_SHA256
-	select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD
diff --git a/lib/crypto/arm64/sha2-armv8.pl b/lib/crypto/arm64/sha2-armv8.pl
index 4aebd20c498b..35ec9ae99fe1 100644
--- a/lib/crypto/arm64/sha2-armv8.pl
+++ b/lib/crypto/arm64/sha2-armv8.pl
@@ -95,7 +95,7 @@ if ($output =~ /512/) {
 	$reg_t="w";
 }
 
-$func="sha${BITS}_blocks_arch";
+$func="sha${BITS}_block_data_order";
 
 ($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));
 
diff --git a/lib/crypto/arm64/sha256.c b/lib/crypto/arm64/sha256.c
index bcf7a3adc0c4..fb9bff40357b 100644
--- a/lib/crypto/arm64/sha256.c
+++ b/lib/crypto/arm64/sha256.c
@@ -6,12 +6,12 @@
  */
 #include <asm/neon.h>
 #include <crypto/internal/sha2.h>
+#include <crypto/internal/simd.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 
-asmlinkage void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
-				   const u8 *data, size_t nblocks);
-EXPORT_SYMBOL_GPL(sha256_blocks_arch);
+asmlinkage void sha256_block_data_order(u32 state[SHA256_STATE_WORDS],
+					const u8 *data, size_t nblocks);
 asmlinkage void sha256_block_neon(u32 state[SHA256_STATE_WORDS],
 				  const u8 *data, size_t nblocks);
 asmlinkage size_t __sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
@@ -20,11 +20,11 @@ asmlinkage size_t __sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
 
-void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS],
+void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
 			const u8 *data, size_t nblocks)
 {
 	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
-	    static_branch_likely(&have_neon)) {
+	    static_branch_likely(&have_neon) && crypto_simd_usable()) {
 		if (static_branch_likely(&have_ce)) {
 			do {
 				size_t rem;
@@ -42,10 +42,10 @@ void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS],
 			kernel_neon_end();
 		}
 	} else {
-		sha256_blocks_arch(state, data, nblocks);
+		sha256_block_data_order(state, data, nblocks);
 	}
 }
-EXPORT_SYMBOL_GPL(sha256_blocks_simd);
+EXPORT_SYMBOL_GPL(sha256_blocks_arch);
 
 bool sha256_is_arch_optimized(void)
 {
diff --git a/lib/crypto/arm64/sha512.h b/lib/crypto/arm64/sha512.h
index eae14f9752e0..6abb40b467f2 100644
--- a/lib/crypto/arm64/sha512.h
+++ b/lib/crypto/arm64/sha512.h
@@ -11,8 +11,8 @@
 
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha512_insns);
 
-asmlinkage void sha512_blocks_arch(struct sha512_block_state *state,
-				   const u8 *data, size_t nblocks);
+asmlinkage void sha512_block_data_order(struct sha512_block_state *state,
+					const u8 *data, size_t nblocks);
 asmlinkage size_t __sha512_ce_transform(struct sha512_block_state *state,
 					const u8 *data, size_t nblocks);
 
@@ -32,7 +32,7 @@ static void sha512_blocks(struct sha512_block_state *state,
 			nblocks = rem;
 		} while (nblocks);
 	} else {
-		sha512_blocks_arch(state, data, nblocks);
+		sha512_block_data_order(state, data, nblocks);
 	}
 }
 
diff --git a/lib/crypto/riscv/Kconfig b/lib/crypto/riscv/Kconfig
index 47c99ea97ce2..c100571feb7e 100644
--- a/lib/crypto/riscv/Kconfig
+++ b/lib/crypto/riscv/Kconfig
@@ -12,5 +12,4 @@ config CRYPTO_SHA256_RISCV64
 	depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
 	default CRYPTO_LIB_SHA256
 	select CRYPTO_ARCH_HAVE_LIB_SHA256
-	select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD
 	select CRYPTO_LIB_SHA256_GENERIC
diff --git a/lib/crypto/riscv/sha256.c b/lib/crypto/riscv/sha256.c
index 71808397dff4..aa77349d08f3 100644
--- a/lib/crypto/riscv/sha256.c
+++ b/lib/crypto/riscv/sha256.c
@@ -11,6 +11,7 @@
 
 #include <asm/vector.h>
 #include <crypto/internal/sha2.h>
+#include <crypto/internal/simd.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 
@@ -19,10 +20,10 @@ asmlinkage void sha256_transform_zvknha_or_zvknhb_zvkb(
 
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_extensions);
 
-void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS],
+void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
 			const u8 *data, size_t nblocks)
 {
-	if (static_branch_likely(&have_extensions)) {
+	if (static_branch_likely(&have_extensions) && crypto_simd_usable()) {
 		kernel_vector_begin();
 		sha256_transform_zvknha_or_zvknhb_zvkb(state, data, nblocks);
 		kernel_vector_end();
@@ -30,13 +31,6 @@ void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS],
 		sha256_blocks_generic(state, data, nblocks);
 	}
 }
-EXPORT_SYMBOL_GPL(sha256_blocks_simd);
-
-void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
-			const u8 *data, size_t nblocks)
-{
-	sha256_blocks_generic(state, data, nblocks);
-}
 EXPORT_SYMBOL_GPL(sha256_blocks_arch);
 
 bool sha256_is_arch_optimized(void)
diff --git a/lib/crypto/x86/Kconfig b/lib/crypto/x86/Kconfig
index 5e94cdee492c..e344579db3d8 100644
--- a/lib/crypto/x86/Kconfig
+++ b/lib/crypto/x86/Kconfig
@@ -30,5 +30,4 @@ config CRYPTO_SHA256_X86_64
 	depends on 64BIT
 	default CRYPTO_LIB_SHA256
 	select CRYPTO_ARCH_HAVE_LIB_SHA256
-	select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD
 	select CRYPTO_LIB_SHA256_GENERIC
diff --git a/lib/crypto/x86/sha256.c b/lib/crypto/x86/sha256.c
index 80380f8fdcee..baba74d7d26f 100644
--- a/lib/crypto/x86/sha256.c
+++ b/lib/crypto/x86/sha256.c
@@ -6,6 +6,7 @@
  */
 #include <asm/fpu/api.h>
 #include <crypto/internal/sha2.h>
+#include <crypto/internal/simd.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/static_call.h>
@@ -23,10 +24,10 @@ static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha256_x86);
 
 DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_transform_ssse3);
 
-void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS],
+void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
 			const u8 *data, size_t nblocks)
 {
-	if (static_branch_likely(&have_sha256_x86)) {
+	if (static_branch_likely(&have_sha256_x86) && crypto_simd_usable()) {
 		kernel_fpu_begin();
 		static_call(sha256_blocks_x86)(state, data, nblocks);
 		kernel_fpu_end();
@@ -34,13 +35,6 @@ void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS],
 		sha256_blocks_generic(state, data, nblocks);
 	}
 }
-EXPORT_SYMBOL_GPL(sha256_blocks_simd);
-
-void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
-			const u8 *data, size_t nblocks)
-{
-	sha256_blocks_generic(state, data, nblocks);
-}
 EXPORT_SYMBOL_GPL(sha256_blocks_arch);
 
 bool sha256_is_arch_optimized(void)
-- 
cgit v1.2.3


From 6fa4b292204b15e0e269a9fd33bc99b5e36b6883 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Mon, 30 Jun 2025 09:06:36 -0700
Subject: lib/crypto: sha256: Add sha224() and sha224_update()

Add a one-shot SHA-224 computation function sha224(), for consistency
with sha256(), sha384(), and sha512(), which all already exist.

Similarly, add sha224_update().  While for now it's identical to
sha256_update(), omitting it makes the API harder to use since users
have to "know" which functions are the same between SHA-224 and SHA-256.
Also, this is a prerequisite for using different context types for each.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250630160645.3198-6-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 include/crypto/sha2.h | 10 ++++++++--
 lib/crypto/sha256.c   | 10 ++++++++++
 2 files changed, 18 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/crypto/sha2.h b/include/crypto/sha2.h
index bb181b7996cd..e31da0743a52 100644
--- a/include/crypto/sha2.h
+++ b/include/crypto/sha2.h
@@ -114,18 +114,24 @@ struct sha512_state {
 	u8 buf[SHA512_BLOCK_SIZE];
 };
 
+void sha256_update(struct sha256_state *sctx, const u8 *data, size_t len);
+
 static inline void sha224_init(struct sha256_state *sctx)
 {
 	sha224_block_init(&sctx->ctx);
 }
-/* Simply use sha256_update as it is equivalent to sha224_update. */
+static inline void sha224_update(struct sha256_state *sctx,
+				 const u8 *data, size_t len)
+{
+	sha256_update(sctx, data, len);
+}
 void sha224_final(struct sha256_state *sctx, u8 out[SHA224_DIGEST_SIZE]);
+void sha224(const u8 *data, size_t len, u8 out[SHA224_DIGEST_SIZE]);
 
 static inline void sha256_init(struct sha256_state *sctx)
 {
 	sha256_block_init(&sctx->ctx);
 }
-void sha256_update(struct sha256_state *sctx, const u8 *data, size_t len);
 void sha256_final(struct sha256_state *sctx, u8 out[SHA256_DIGEST_SIZE]);
 void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]);
 
diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c
index 573ccecbf48b..ccaae7088016 100644
--- a/lib/crypto/sha256.c
+++ b/lib/crypto/sha256.c
@@ -70,6 +70,16 @@ void sha256_final(struct sha256_state *sctx, u8 out[SHA256_DIGEST_SIZE])
 }
 EXPORT_SYMBOL(sha256_final);
 
+void sha224(const u8 *data, size_t len, u8 out[SHA224_DIGEST_SIZE])
+{
+	struct sha256_state sctx;
+
+	sha224_init(&sctx);
+	sha224_update(&sctx, data, len);
+	sha224_final(&sctx, out);
+}
+EXPORT_SYMBOL(sha224);
+
 void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE])
 {
 	struct sha256_state sctx;
-- 
cgit v1.2.3


From b86ced882b8e667758afddffd8d6354197842110 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Mon, 30 Jun 2025 09:06:37 -0700
Subject: lib/crypto: sha256: Make library API use strongly-typed contexts

Currently the SHA-224 and SHA-256 library functions can be mixed
arbitrarily, even in ways that are incorrect, for example using
sha224_init() and sha256_final().  This is because they operate on the
same structure, sha256_state.

Introduce stronger typing, as I did for SHA-384 and SHA-512.

Also as I did for SHA-384 and SHA-512, use the names *_ctx instead of
*_state.  The *_ctx names have the following small benefits:

- They're shorter.
- They avoid an ambiguity with the compression function state.
- They're consistent with the well-known OpenSSL API.
- Users usually name the variable 'sctx' anyway, which suggests that
  *_ctx would be the more natural name for the actual struct.

Therefore: update the SHA-224 and SHA-256 APIs, implementation, and
calling code accordingly.

In the new structs, also strongly-type the compression function state.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250630160645.3198-7-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 arch/riscv/purgatory/purgatory.c |   8 ++--
 arch/s390/purgatory/purgatory.c  |   2 +-
 arch/x86/purgatory/purgatory.c   |   2 +-
 crypto/sha256.c                  |  16 +++----
 drivers/char/tpm/tpm2-sessions.c |  12 ++---
 include/crypto/sha2.h            |  52 +++++++++++++++-----
 kernel/kexec_file.c              |  10 ++--
 lib/crypto/sha256.c              | 100 ++++++++++++++++++++++++++++-----------
 8 files changed, 139 insertions(+), 63 deletions(-)

(limited to 'include')

diff --git a/arch/riscv/purgatory/purgatory.c b/arch/riscv/purgatory/purgatory.c
index 80596ab5fb62..bbd5cfa4d741 100644
--- a/arch/riscv/purgatory/purgatory.c
+++ b/arch/riscv/purgatory/purgatory.c
@@ -20,14 +20,14 @@ struct kexec_sha_region purgatory_sha_regions[KEXEC_SEGMENT_MAX] __section(".kex
 static int verify_sha256_digest(void)
 {
 	struct kexec_sha_region *ptr, *end;
-	struct sha256_state ss;
+	struct sha256_ctx sctx;
 	u8 digest[SHA256_DIGEST_SIZE];
 
-	sha256_init(&ss);
+	sha256_init(&sctx);
 	end = purgatory_sha_regions + ARRAY_SIZE(purgatory_sha_regions);
 	for (ptr = purgatory_sha_regions; ptr < end; ptr++)
-		sha256_update(&ss, (uint8_t *)(ptr->start), ptr->len);
-	sha256_final(&ss, digest);
+		sha256_update(&sctx, (uint8_t *)(ptr->start), ptr->len);
+	sha256_final(&sctx, digest);
 	if (memcmp(digest, purgatory_sha256_digest, sizeof(digest)) != 0)
 		return 1;
 	return 0;
diff --git a/arch/s390/purgatory/purgatory.c b/arch/s390/purgatory/purgatory.c
index 030efda05dbe..ecb38102187c 100644
--- a/arch/s390/purgatory/purgatory.c
+++ b/arch/s390/purgatory/purgatory.c
@@ -16,7 +16,7 @@ int verify_sha256_digest(void)
 {
 	struct kexec_sha_region *ptr, *end;
 	u8 digest[SHA256_DIGEST_SIZE];
-	struct sha256_state sctx;
+	struct sha256_ctx sctx;
 
 	sha256_init(&sctx);
 	end = purgatory_sha_regions + ARRAY_SIZE(purgatory_sha_regions);
diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c
index aea47e793963..655139dd0532 100644
--- a/arch/x86/purgatory/purgatory.c
+++ b/arch/x86/purgatory/purgatory.c
@@ -25,7 +25,7 @@ static int verify_sha256_digest(void)
 {
 	struct kexec_sha_region *ptr, *end;
 	u8 digest[SHA256_DIGEST_SIZE];
-	struct sha256_state sctx;
+	struct sha256_ctx sctx;
 
 	sha256_init(&sctx);
 	end = purgatory_sha_regions + ARRAY_SIZE(purgatory_sha_regions);
diff --git a/crypto/sha256.c b/crypto/sha256.c
index 4aeb213bab11..15c57fba256b 100644
--- a/crypto/sha256.c
+++ b/crypto/sha256.c
@@ -137,24 +137,24 @@ static int crypto_sha224_final_lib(struct shash_desc *desc, u8 *out)
 
 static int crypto_sha256_import_lib(struct shash_desc *desc, const void *in)
 {
-	struct sha256_state *sctx = shash_desc_ctx(desc);
+	struct __sha256_ctx *sctx = shash_desc_ctx(desc);
 	const u8 *p = in;
 
 	memcpy(sctx, p, sizeof(*sctx));
 	p += sizeof(*sctx);
-	sctx->count += *p;
+	sctx->bytecount += *p;
 	return 0;
 }
 
 static int crypto_sha256_export_lib(struct shash_desc *desc, void *out)
 {
-	struct sha256_state *sctx0 = shash_desc_ctx(desc);
-	struct sha256_state sctx = *sctx0;
+	struct __sha256_ctx *sctx0 = shash_desc_ctx(desc);
+	struct __sha256_ctx sctx = *sctx0;
 	unsigned int partial;
 	u8 *p = out;
 
-	partial = sctx.count % SHA256_BLOCK_SIZE;
-	sctx.count -= partial;
+	partial = sctx.bytecount % SHA256_BLOCK_SIZE;
+	sctx.bytecount -= partial;
 	memcpy(p, &sctx, sizeof(sctx));
 	p += sizeof(sctx);
 	*p = partial;
@@ -201,7 +201,7 @@ static struct shash_alg algs[] = {
 		.update			= crypto_sha256_update_lib,
 		.final			= crypto_sha256_final_lib,
 		.digest			= crypto_sha256_digest_lib,
-		.descsize		= sizeof(struct sha256_state),
+		.descsize		= sizeof(struct sha256_ctx),
 		.statesize		= sizeof(struct crypto_sha256_state) +
 					  SHA256_BLOCK_SIZE + 1,
 		.import			= crypto_sha256_import_lib,
@@ -216,7 +216,7 @@ static struct shash_alg algs[] = {
 		.init			= crypto_sha224_init,
 		.update			= crypto_sha256_update_lib,
 		.final			= crypto_sha224_final_lib,
-		.descsize		= sizeof(struct sha256_state),
+		.descsize		= sizeof(struct sha224_ctx),
 		.statesize		= sizeof(struct crypto_sha256_state) +
 					  SHA256_BLOCK_SIZE + 1,
 		.import			= crypto_sha256_import_lib,
diff --git a/drivers/char/tpm/tpm2-sessions.c b/drivers/char/tpm/tpm2-sessions.c
index 7b5049b3d476..bdb119453dfb 100644
--- a/drivers/char/tpm/tpm2-sessions.c
+++ b/drivers/char/tpm/tpm2-sessions.c
@@ -390,7 +390,7 @@ static int tpm2_create_primary(struct tpm_chip *chip, u32 hierarchy,
  * on every operation, so we weld the hmac init and final functions in
  * here to give it the same usage characteristics as a regular hash
  */
-static void tpm2_hmac_init(struct sha256_state *sctx, u8 *key, u32 key_len)
+static void tpm2_hmac_init(struct sha256_ctx *sctx, u8 *key, u32 key_len)
 {
 	u8 pad[SHA256_BLOCK_SIZE];
 	int i;
@@ -406,7 +406,7 @@ static void tpm2_hmac_init(struct sha256_state *sctx, u8 *key, u32 key_len)
 	sha256_update(sctx, pad, sizeof(pad));
 }
 
-static void tpm2_hmac_final(struct sha256_state *sctx, u8 *key, u32 key_len,
+static void tpm2_hmac_final(struct sha256_ctx *sctx, u8 *key, u32 key_len,
 			    u8 *out)
 {
 	u8 pad[SHA256_BLOCK_SIZE];
@@ -440,7 +440,7 @@ static void tpm2_KDFa(u8 *key, u32 key_len, const char *label, u8 *u,
 	const __be32 bits = cpu_to_be32(bytes * 8);
 
 	while (bytes > 0) {
-		struct sha256_state sctx;
+		struct sha256_ctx sctx;
 		__be32 c = cpu_to_be32(counter);
 
 		tpm2_hmac_init(&sctx, key, key_len);
@@ -467,7 +467,7 @@ static void tpm2_KDFa(u8 *key, u32 key_len, const char *label, u8 *u,
 static void tpm2_KDFe(u8 z[EC_PT_SZ], const char *str, u8 *pt_u, u8 *pt_v,
 		      u8 *out)
 {
-	struct sha256_state sctx;
+	struct sha256_ctx sctx;
 	/*
 	 * this should be an iterative counter, but because we know
 	 *  we're only taking 32 bytes for the point using a sha256
@@ -592,7 +592,7 @@ void tpm_buf_fill_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf)
 	u8 *hmac = NULL;
 	u32 attrs;
 	u8 cphash[SHA256_DIGEST_SIZE];
-	struct sha256_state sctx;
+	struct sha256_ctx sctx;
 
 	if (!auth)
 		return;
@@ -750,7 +750,7 @@ int tpm_buf_check_hmac_response(struct tpm_chip *chip, struct tpm_buf *buf,
 	off_t offset_s, offset_p;
 	u8 rphash[SHA256_DIGEST_SIZE];
 	u32 attrs, cc;
-	struct sha256_state sctx;
+	struct sha256_ctx sctx;
 	u16 tag = be16_to_cpu(head->tag);
 	int parm_len, len, i, handles;
 
diff --git a/include/crypto/sha2.h b/include/crypto/sha2.h
index e31da0743a52..18e1eec841b7 100644
--- a/include/crypto/sha2.h
+++ b/include/crypto/sha2.h
@@ -114,25 +114,55 @@ struct sha512_state {
 	u8 buf[SHA512_BLOCK_SIZE];
 };
 
-void sha256_update(struct sha256_state *sctx, const u8 *data, size_t len);
+/* State for the SHA-256 (and SHA-224) compression function */
+struct sha256_block_state {
+	u32 h[SHA256_STATE_WORDS];
+};
 
-static inline void sha224_init(struct sha256_state *sctx)
-{
-	sha224_block_init(&sctx->ctx);
-}
-static inline void sha224_update(struct sha256_state *sctx,
+/*
+ * Context structure, shared by SHA-224 and SHA-256.  The sha224_ctx and
+ * sha256_ctx structs wrap this one so that the API has proper typing and
+ * doesn't allow mixing the SHA-224 and SHA-256 functions arbitrarily.
+ */
+struct __sha256_ctx {
+	struct sha256_block_state state;
+	u64 bytecount;
+	u8 buf[SHA256_BLOCK_SIZE] __aligned(__alignof__(__be64));
+};
+void __sha256_update(struct __sha256_ctx *ctx, const u8 *data, size_t len);
+
+/**
+ * struct sha224_ctx - Context for hashing a message with SHA-224
+ * @ctx: private
+ */
+struct sha224_ctx {
+	struct __sha256_ctx ctx;
+};
+
+void sha224_init(struct sha224_ctx *ctx);
+static inline void sha224_update(struct sha224_ctx *ctx,
 				 const u8 *data, size_t len)
 {
-	sha256_update(sctx, data, len);
+	__sha256_update(&ctx->ctx, data, len);
 }
-void sha224_final(struct sha256_state *sctx, u8 out[SHA224_DIGEST_SIZE]);
+void sha224_final(struct sha224_ctx *ctx, u8 out[SHA224_DIGEST_SIZE]);
 void sha224(const u8 *data, size_t len, u8 out[SHA224_DIGEST_SIZE]);
 
-static inline void sha256_init(struct sha256_state *sctx)
+/**
+ * struct sha256_ctx - Context for hashing a message with SHA-256
+ * @ctx: private
+ */
+struct sha256_ctx {
+	struct __sha256_ctx ctx;
+};
+
+void sha256_init(struct sha256_ctx *ctx);
+static inline void sha256_update(struct sha256_ctx *ctx,
+				 const u8 *data, size_t len)
 {
-	sha256_block_init(&sctx->ctx);
+	__sha256_update(&ctx->ctx, data, len);
 }
-void sha256_final(struct sha256_state *sctx, u8 out[SHA256_DIGEST_SIZE]);
+void sha256_final(struct sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]);
 void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]);
 
 /* State for the SHA-512 (and SHA-384) compression function */
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 69fe76fd9233..b835033c65eb 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -751,7 +751,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf)
 /* Calculate and store the digest of segments */
 static int kexec_calculate_store_digests(struct kimage *image)
 {
-	struct sha256_state state;
+	struct sha256_ctx sctx;
 	int ret = 0, i, j, zero_buf_sz, sha_region_sz;
 	size_t nullsz;
 	u8 digest[SHA256_DIGEST_SIZE];
@@ -770,7 +770,7 @@ static int kexec_calculate_store_digests(struct kimage *image)
 	if (!sha_regions)
 		return -ENOMEM;
 
-	sha256_init(&state);
+	sha256_init(&sctx);
 
 	for (j = i = 0; i < image->nr_segments; i++) {
 		struct kexec_segment *ksegment;
@@ -796,7 +796,7 @@ static int kexec_calculate_store_digests(struct kimage *image)
 		if (check_ima_segment_index(image, i))
 			continue;
 
-		sha256_update(&state, ksegment->kbuf, ksegment->bufsz);
+		sha256_update(&sctx, ksegment->kbuf, ksegment->bufsz);
 
 		/*
 		 * Assume rest of the buffer is filled with zero and
@@ -808,7 +808,7 @@ static int kexec_calculate_store_digests(struct kimage *image)
 
 			if (bytes > zero_buf_sz)
 				bytes = zero_buf_sz;
-			sha256_update(&state, zero_buf, bytes);
+			sha256_update(&sctx, zero_buf, bytes);
 			nullsz -= bytes;
 		}
 
@@ -817,7 +817,7 @@ static int kexec_calculate_store_digests(struct kimage *image)
 		j++;
 	}
 
-	sha256_final(&state, digest);
+	sha256_final(&sctx, digest);
 
 	ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha_regions",
 					     sha_regions, sha_region_sz, 0);
diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c
index ccaae7088016..3e7797a4489d 100644
--- a/lib/crypto/sha256.c
+++ b/lib/crypto/sha256.c
@@ -18,6 +18,20 @@
 #include <linux/module.h>
 #include <linux/string.h>
 
+static const struct sha256_block_state sha224_iv = {
+	.h = {
+		SHA224_H0, SHA224_H1, SHA224_H2, SHA224_H3,
+		SHA224_H4, SHA224_H5, SHA224_H6, SHA224_H7,
+	},
+};
+
+static const struct sha256_block_state sha256_iv = {
+	.h = {
+		SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3,
+		SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7,
+	},
+};
+
 /*
  * If __DISABLE_EXPORTS is defined, then this file is being compiled for a
  * pre-boot environment.  In that case, ignore the kconfig options, pull the
@@ -32,61 +46,93 @@ static inline bool sha256_purgatory(void)
 	return __is_defined(__DISABLE_EXPORTS);
 }
 
-static inline void sha256_blocks(u32 state[SHA256_STATE_WORDS], const u8 *data,
-				 size_t nblocks)
+static inline void sha256_blocks(struct sha256_block_state *state,
+				 const u8 *data, size_t nblocks)
+{
+	sha256_choose_blocks(state->h, data, nblocks, sha256_purgatory(), false);
+}
+
+static void __sha256_init(struct __sha256_ctx *ctx,
+			  const struct sha256_block_state *iv,
+			  u64 initial_bytecount)
+{
+	ctx->state = *iv;
+	ctx->bytecount = initial_bytecount;
+}
+
+void sha224_init(struct sha224_ctx *ctx)
+{
+	__sha256_init(&ctx->ctx, &sha224_iv, 0);
+}
+EXPORT_SYMBOL_GPL(sha224_init);
+
+void sha256_init(struct sha256_ctx *ctx)
 {
-	sha256_choose_blocks(state, data, nblocks, sha256_purgatory(), false);
+	__sha256_init(&ctx->ctx, &sha256_iv, 0);
 }
+EXPORT_SYMBOL_GPL(sha256_init);
 
-void sha256_update(struct sha256_state *sctx, const u8 *data, size_t len)
+void __sha256_update(struct __sha256_ctx *ctx, const u8 *data, size_t len)
 {
-	size_t partial = sctx->count % SHA256_BLOCK_SIZE;
+	size_t partial = ctx->bytecount % SHA256_BLOCK_SIZE;
 
-	sctx->count += len;
-	BLOCK_HASH_UPDATE_BLOCKS(sha256_blocks, sctx->ctx.state, data, len,
-				 SHA256_BLOCK_SIZE, sctx->buf, partial);
+	ctx->bytecount += len;
+	BLOCK_HASH_UPDATE_BLOCKS(sha256_blocks, &ctx->state, data, len,
+				 SHA256_BLOCK_SIZE, ctx->buf, partial);
 }
-EXPORT_SYMBOL(sha256_update);
+EXPORT_SYMBOL(__sha256_update);
 
-static inline void __sha256_final(struct sha256_state *sctx, u8 *out,
-				  size_t digest_size)
+static void __sha256_final(struct __sha256_ctx *ctx,
+			   u8 *out, size_t digest_size)
 {
-	size_t partial = sctx->count % SHA256_BLOCK_SIZE;
+	u64 bitcount = ctx->bytecount << 3;
+	size_t partial = ctx->bytecount % SHA256_BLOCK_SIZE;
+
+	ctx->buf[partial++] = 0x80;
+	if (partial > SHA256_BLOCK_SIZE - 8) {
+		memset(&ctx->buf[partial], 0, SHA256_BLOCK_SIZE - partial);
+		sha256_blocks(&ctx->state, ctx->buf, 1);
+		partial = 0;
+	}
+	memset(&ctx->buf[partial], 0, SHA256_BLOCK_SIZE - 8 - partial);
+	*(__be64 *)&ctx->buf[SHA256_BLOCK_SIZE - 8] = cpu_to_be64(bitcount);
+	sha256_blocks(&ctx->state, ctx->buf, 1);
 
-	sha256_finup(&sctx->ctx, sctx->buf, partial, out, digest_size,
-		     sha256_purgatory(), false);
-	memzero_explicit(sctx, sizeof(*sctx));
+	for (size_t i = 0; i < digest_size; i += 4)
+		put_unaligned_be32(ctx->state.h[i / 4], out + i);
 }
 
-void sha224_final(struct sha256_state *sctx, u8 out[SHA224_DIGEST_SIZE])
+void sha224_final(struct sha224_ctx *ctx, u8 out[SHA224_DIGEST_SIZE])
 {
-	__sha256_final(sctx, out, SHA224_DIGEST_SIZE);
+	__sha256_final(&ctx->ctx, out, SHA224_DIGEST_SIZE);
+	memzero_explicit(ctx, sizeof(*ctx));
 }
 EXPORT_SYMBOL(sha224_final);
 
-void sha256_final(struct sha256_state *sctx, u8 out[SHA256_DIGEST_SIZE])
+void sha256_final(struct sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE])
 {
-	__sha256_final(sctx, out, SHA256_DIGEST_SIZE);
+	__sha256_final(&ctx->ctx, out, SHA256_DIGEST_SIZE);
+	memzero_explicit(ctx, sizeof(*ctx));
 }
 EXPORT_SYMBOL(sha256_final);
 
 void sha224(const u8 *data, size_t len, u8 out[SHA224_DIGEST_SIZE])
 {
-	struct sha256_state sctx;
+	struct sha224_ctx ctx;
 
-	sha224_init(&sctx);
-	sha224_update(&sctx, data, len);
-	sha224_final(&sctx, out);
+	sha224_init(&ctx);
+	sha224_update(&ctx, data, len);
+	sha224_final(&ctx, out);
 }
 EXPORT_SYMBOL(sha224);
 
 void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE])
 {
-	struct sha256_state sctx;
+	struct sha256_ctx ctx;
 
-	sha256_init(&sctx);
-	sha256_update(&sctx, data, len);
-	sha256_final(&sctx, out);
+	sha256_init(&ctx);
+	sha256_update(&ctx, data, len);
+	sha256_final(&ctx, out);
 }
 EXPORT_SYMBOL(sha256);
 
-- 
cgit v1.2.3


From 4c855d5069ee2edbcf62fafc7f1a5d4cfea1bce1 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Mon, 30 Jun 2025 09:06:38 -0700
Subject: lib/crypto: sha256: Propagate sha256_block_state type to
 implementations

The previous commit made the SHA-256 compression function state
strongly typed, but the new type wasn't propagated all the way down to
the implementations of the compression function.  Do that now.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250630160645.3198-8-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 arch/mips/cavium-octeon/crypto/octeon-sha256.c     |  2 +-
 include/crypto/internal/sha2.h                     |  8 ++++----
 lib/crypto/arm/sha256-ce.S                         |  2 +-
 lib/crypto/arm/sha256.c                            |  8 ++++----
 lib/crypto/arm64/sha256-ce.S                       |  2 +-
 lib/crypto/arm64/sha256.c                          |  8 ++++----
 lib/crypto/powerpc/sha256.c                        |  5 +++--
 .../riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.S   |  2 +-
 lib/crypto/riscv/sha256.c                          |  7 ++++---
 lib/crypto/s390/sha256.c                           |  2 +-
 lib/crypto/sha256-generic.c                        | 24 ++++++++++++++++------
 lib/crypto/sparc/sha256.c                          |  4 ++--
 lib/crypto/x86/sha256-avx-asm.S                    |  2 +-
 lib/crypto/x86/sha256-avx2-asm.S                   |  2 +-
 lib/crypto/x86/sha256-ni-asm.S                     |  2 +-
 lib/crypto/x86/sha256-ssse3-asm.S                  |  2 +-
 lib/crypto/x86/sha256.c                            | 10 ++++-----
 17 files changed, 53 insertions(+), 39 deletions(-)

(limited to 'include')

diff --git a/arch/mips/cavium-octeon/crypto/octeon-sha256.c b/arch/mips/cavium-octeon/crypto/octeon-sha256.c
index c20038239cb6..f8664818d04e 100644
--- a/arch/mips/cavium-octeon/crypto/octeon-sha256.c
+++ b/arch/mips/cavium-octeon/crypto/octeon-sha256.c
@@ -22,7 +22,7 @@
  * We pass everything as 64-bit. OCTEON can handle misaligned data.
  */
 
-void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
+void sha256_blocks_arch(struct sha256_block_state *state,
 			const u8 *data, size_t nblocks)
 {
 	struct octeon_cop2_state cop2_state;
diff --git a/include/crypto/internal/sha2.h b/include/crypto/internal/sha2.h
index 5a25ccc49388..f0f455477bbd 100644
--- a/include/crypto/internal/sha2.h
+++ b/include/crypto/internal/sha2.h
@@ -17,9 +17,9 @@ static inline bool sha256_is_arch_optimized(void)
 	return false;
 }
 #endif
-void sha256_blocks_generic(u32 state[SHA256_STATE_WORDS],
+void sha256_blocks_generic(struct sha256_block_state *state,
 			   const u8 *data, size_t nblocks);
-void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
+void sha256_blocks_arch(struct sha256_block_state *state,
 			const u8 *data, size_t nblocks);
 
 static __always_inline void sha256_choose_blocks(
@@ -27,9 +27,9 @@ static __always_inline void sha256_choose_blocks(
 	bool force_generic, bool force_simd)
 {
 	if (!IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_SHA256) || force_generic)
-		sha256_blocks_generic(state, data, nblocks);
+		sha256_blocks_generic((struct sha256_block_state *)state, data, nblocks);
 	else
-		sha256_blocks_arch(state, data, nblocks);
+		sha256_blocks_arch((struct sha256_block_state *)state, data, nblocks);
 }
 
 static __always_inline void sha256_finup(
diff --git a/lib/crypto/arm/sha256-ce.S b/lib/crypto/arm/sha256-ce.S
index ac2c9b01b22d..7481ac8e6c0d 100644
--- a/lib/crypto/arm/sha256-ce.S
+++ b/lib/crypto/arm/sha256-ce.S
@@ -67,7 +67,7 @@
 	.word		0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
 
 	/*
-	 * void sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
+	 * void sha256_ce_transform(struct sha256_block_state *state,
 	 *			    const u8 *data, size_t nblocks);
 	 */
 ENTRY(sha256_ce_transform)
diff --git a/lib/crypto/arm/sha256.c b/lib/crypto/arm/sha256.c
index 2c9cfdaaa069..7d9082358695 100644
--- a/lib/crypto/arm/sha256.c
+++ b/lib/crypto/arm/sha256.c
@@ -10,17 +10,17 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 
-asmlinkage void sha256_block_data_order(u32 state[SHA256_STATE_WORDS],
+asmlinkage void sha256_block_data_order(struct sha256_block_state *state,
 					const u8 *data, size_t nblocks);
-asmlinkage void sha256_block_data_order_neon(u32 state[SHA256_STATE_WORDS],
+asmlinkage void sha256_block_data_order_neon(struct sha256_block_state *state,
 					     const u8 *data, size_t nblocks);
-asmlinkage void sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
+asmlinkage void sha256_ce_transform(struct sha256_block_state *state,
 				    const u8 *data, size_t nblocks);
 
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
 
-void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
+void sha256_blocks_arch(struct sha256_block_state *state,
 			const u8 *data, size_t nblocks)
 {
 	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
diff --git a/lib/crypto/arm64/sha256-ce.S b/lib/crypto/arm64/sha256-ce.S
index f3e21c6d87d2..b99d9589c421 100644
--- a/lib/crypto/arm64/sha256-ce.S
+++ b/lib/crypto/arm64/sha256-ce.S
@@ -71,7 +71,7 @@
 	.word		0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
 
 	/*
-	 * size_t __sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
+	 * size_t __sha256_ce_transform(struct sha256_block_state *state,
 	 *				const u8 *data, size_t nblocks);
 	 */
 	.text
diff --git a/lib/crypto/arm64/sha256.c b/lib/crypto/arm64/sha256.c
index fb9bff40357b..609ffb815198 100644
--- a/lib/crypto/arm64/sha256.c
+++ b/lib/crypto/arm64/sha256.c
@@ -10,17 +10,17 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 
-asmlinkage void sha256_block_data_order(u32 state[SHA256_STATE_WORDS],
+asmlinkage void sha256_block_data_order(struct sha256_block_state *state,
 					const u8 *data, size_t nblocks);
-asmlinkage void sha256_block_neon(u32 state[SHA256_STATE_WORDS],
+asmlinkage void sha256_block_neon(struct sha256_block_state *state,
 				  const u8 *data, size_t nblocks);
-asmlinkage size_t __sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
+asmlinkage size_t __sha256_ce_transform(struct sha256_block_state *state,
 					const u8 *data, size_t nblocks);
 
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
 
-void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
+void sha256_blocks_arch(struct sha256_block_state *state,
 			const u8 *data, size_t nblocks)
 {
 	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
diff --git a/lib/crypto/powerpc/sha256.c b/lib/crypto/powerpc/sha256.c
index 6b0f079587eb..55f42403d572 100644
--- a/lib/crypto/powerpc/sha256.c
+++ b/lib/crypto/powerpc/sha256.c
@@ -26,7 +26,8 @@
  */
 #define MAX_BYTES 1024
 
-extern void ppc_spe_sha256_transform(u32 *state, const u8 *src, u32 blocks);
+extern void ppc_spe_sha256_transform(struct sha256_block_state *state,
+				     const u8 *src, u32 blocks);
 
 static void spe_begin(void)
 {
@@ -42,7 +43,7 @@ static void spe_end(void)
 	preempt_enable();
 }
 
-void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
+void sha256_blocks_arch(struct sha256_block_state *state,
 			const u8 *data, size_t nblocks)
 {
 	do {
diff --git a/lib/crypto/riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.S b/lib/crypto/riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.S
index fad501ad0617..1618d1220a6e 100644
--- a/lib/crypto/riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.S
+++ b/lib/crypto/riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.S
@@ -106,7 +106,7 @@
 	sha256_4rounds	\last, \k3, W3, W0, W1, W2
 .endm
 
-// void sha256_transform_zvknha_or_zvknhb_zvkb(u32 state[SHA256_STATE_WORDS],
+// void sha256_transform_zvknha_or_zvknhb_zvkb(struct sha256_block_state *state,
 //					       const u8 *data, size_t nblocks);
 SYM_FUNC_START(sha256_transform_zvknha_or_zvknhb_zvkb)
 
diff --git a/lib/crypto/riscv/sha256.c b/lib/crypto/riscv/sha256.c
index aa77349d08f3..a2079aa3ae92 100644
--- a/lib/crypto/riscv/sha256.c
+++ b/lib/crypto/riscv/sha256.c
@@ -15,12 +15,13 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 
-asmlinkage void sha256_transform_zvknha_or_zvknhb_zvkb(
-	u32 state[SHA256_STATE_WORDS], const u8 *data, size_t nblocks);
+asmlinkage void
+sha256_transform_zvknha_or_zvknhb_zvkb(struct sha256_block_state *state,
+				       const u8 *data, size_t nblocks);
 
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_extensions);
 
-void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
+void sha256_blocks_arch(struct sha256_block_state *state,
 			const u8 *data, size_t nblocks)
 {
 	if (static_branch_likely(&have_extensions) && crypto_simd_usable()) {
diff --git a/lib/crypto/s390/sha256.c b/lib/crypto/s390/sha256.c
index 7dfe120fafab..fb565718f753 100644
--- a/lib/crypto/s390/sha256.c
+++ b/lib/crypto/s390/sha256.c
@@ -12,7 +12,7 @@
 
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_cpacf_sha256);
 
-void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
+void sha256_blocks_arch(struct sha256_block_state *state,
 			const u8 *data, size_t nblocks)
 {
 	if (static_branch_likely(&have_cpacf_sha256))
diff --git a/lib/crypto/sha256-generic.c b/lib/crypto/sha256-generic.c
index 2968d95d0403..99f904033c26 100644
--- a/lib/crypto/sha256-generic.c
+++ b/lib/crypto/sha256-generic.c
@@ -70,7 +70,7 @@ static inline void BLEND_OP(int I, u32 *W)
 	h = t1 + t2;						\
 } while (0)
 
-static void sha256_block_generic(u32 state[SHA256_STATE_WORDS],
+static void sha256_block_generic(struct sha256_block_state *state,
 				 const u8 *input, u32 W[64])
 {
 	u32 a, b, c, d, e, f, g, h;
@@ -101,8 +101,14 @@ static void sha256_block_generic(u32 state[SHA256_STATE_WORDS],
 	}
 
 	/* load the state into our registers */
-	a = state[0];  b = state[1];  c = state[2];  d = state[3];
-	e = state[4];  f = state[5];  g = state[6];  h = state[7];
+	a = state->h[0];
+	b = state->h[1];
+	c = state->h[2];
+	d = state->h[3];
+	e = state->h[4];
+	f = state->h[5];
+	g = state->h[6];
+	h = state->h[7];
 
 	/* now iterate */
 	for (i = 0; i < 64; i += 8) {
@@ -116,11 +122,17 @@ static void sha256_block_generic(u32 state[SHA256_STATE_WORDS],
 		SHA256_ROUND(i + 7, b, c, d, e, f, g, h, a);
 	}
 
-	state[0] += a; state[1] += b; state[2] += c; state[3] += d;
-	state[4] += e; state[5] += f; state[6] += g; state[7] += h;
+	state->h[0] += a;
+	state->h[1] += b;
+	state->h[2] += c;
+	state->h[3] += d;
+	state->h[4] += e;
+	state->h[5] += f;
+	state->h[6] += g;
+	state->h[7] += h;
 }
 
-void sha256_blocks_generic(u32 state[SHA256_STATE_WORDS],
+void sha256_blocks_generic(struct sha256_block_state *state,
 			   const u8 *data, size_t nblocks)
 {
 	u32 W[64];
diff --git a/lib/crypto/sparc/sha256.c b/lib/crypto/sparc/sha256.c
index 8bdec2db08b3..060664b88a6d 100644
--- a/lib/crypto/sparc/sha256.c
+++ b/lib/crypto/sparc/sha256.c
@@ -19,10 +19,10 @@
 
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha256_opcodes);
 
-asmlinkage void sha256_sparc64_transform(u32 state[SHA256_STATE_WORDS],
+asmlinkage void sha256_sparc64_transform(struct sha256_block_state *state,
 					 const u8 *data, size_t nblocks);
 
-void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
+void sha256_blocks_arch(struct sha256_block_state *state,
 			const u8 *data, size_t nblocks)
 {
 	if (static_branch_likely(&have_sha256_opcodes))
diff --git a/lib/crypto/x86/sha256-avx-asm.S b/lib/crypto/x86/sha256-avx-asm.S
index 0d7b2c3e45d9..73bcff2b548f 100644
--- a/lib/crypto/x86/sha256-avx-asm.S
+++ b/lib/crypto/x86/sha256-avx-asm.S
@@ -341,7 +341,7 @@ a = TMP_
 .endm
 
 ########################################################################
-## void sha256_transform_avx(u32 state[SHA256_STATE_WORDS],
+## void sha256_transform_avx(struct sha256_block_state *state,
 ##			     const u8 *data, size_t nblocks);
 ########################################################################
 .text
diff --git a/lib/crypto/x86/sha256-avx2-asm.S b/lib/crypto/x86/sha256-avx2-asm.S
index 25d3380321ec..45787570387f 100644
--- a/lib/crypto/x86/sha256-avx2-asm.S
+++ b/lib/crypto/x86/sha256-avx2-asm.S
@@ -518,7 +518,7 @@ STACK_SIZE	= _CTX      + _CTX_SIZE
 .endm
 
 ########################################################################
-## void sha256_transform_rorx(u32 state[SHA256_STATE_WORDS],
+## void sha256_transform_rorx(struct sha256_block_state *state,
 ##			      const u8 *data, size_t nblocks);
 ########################################################################
 .text
diff --git a/lib/crypto/x86/sha256-ni-asm.S b/lib/crypto/x86/sha256-ni-asm.S
index d3548206cf3d..4af7d22e29e4 100644
--- a/lib/crypto/x86/sha256-ni-asm.S
+++ b/lib/crypto/x86/sha256-ni-asm.S
@@ -106,7 +106,7 @@
  * only processes complete blocks.  State initialization, buffering of partial
  * blocks, and digest finalization is expected to be handled elsewhere.
  *
- * void sha256_ni_transform(u32 state[SHA256_STATE_WORDS],
+ * void sha256_ni_transform(struct sha256_block_state *state,
  *			    const u8 *data, size_t nblocks);
  */
 .text
diff --git a/lib/crypto/x86/sha256-ssse3-asm.S b/lib/crypto/x86/sha256-ssse3-asm.S
index 7f24a4cdcb25..407b30adcd37 100644
--- a/lib/crypto/x86/sha256-ssse3-asm.S
+++ b/lib/crypto/x86/sha256-ssse3-asm.S
@@ -348,7 +348,7 @@ a = TMP_
 .endm
 
 ########################################################################
-## void sha256_transform_ssse3(u32 state[SHA256_STATE_WORDS],
+## void sha256_transform_ssse3(struct sha256_block_state *state,
 ##			       const u8 *data, size_t nblocks);
 ########################################################################
 .text
diff --git a/lib/crypto/x86/sha256.c b/lib/crypto/x86/sha256.c
index baba74d7d26f..cbb45defbefa 100644
--- a/lib/crypto/x86/sha256.c
+++ b/lib/crypto/x86/sha256.c
@@ -11,20 +11,20 @@
 #include <linux/module.h>
 #include <linux/static_call.h>
 
-asmlinkage void sha256_transform_ssse3(u32 state[SHA256_STATE_WORDS],
+asmlinkage void sha256_transform_ssse3(struct sha256_block_state *state,
 				       const u8 *data, size_t nblocks);
-asmlinkage void sha256_transform_avx(u32 state[SHA256_STATE_WORDS],
+asmlinkage void sha256_transform_avx(struct sha256_block_state *state,
 				     const u8 *data, size_t nblocks);
-asmlinkage void sha256_transform_rorx(u32 state[SHA256_STATE_WORDS],
+asmlinkage void sha256_transform_rorx(struct sha256_block_state *state,
 				      const u8 *data, size_t nblocks);
-asmlinkage void sha256_ni_transform(u32 state[SHA256_STATE_WORDS],
+asmlinkage void sha256_ni_transform(struct sha256_block_state *state,
 				    const u8 *data, size_t nblocks);
 
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha256_x86);
 
 DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_transform_ssse3);
 
-void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
+void sha256_blocks_arch(struct sha256_block_state *state,
 			const u8 *data, size_t nblocks)
 {
 	if (static_branch_likely(&have_sha256_x86) && crypto_simd_usable()) {
-- 
cgit v1.2.3


From 077833cd600908359391bd22d5350c9106ea238c Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Mon, 30 Jun 2025 09:06:39 -0700
Subject: lib/crypto: sha256: Add HMAC-SHA224 and HMAC-SHA256 support

Since HMAC support is commonly needed and is fairly simple, include it
as a first-class citizen of the SHA-256 library.

The API supports both incremental and one-shot computation, and either
preparing the key ahead of time or just using a raw key.  The
implementation is much more streamlined than crypto/hmac.c.

I've kept it consistent with the HMAC-SHA384 and HMAC-SHA512 code as
much as possible.

Testing of these functions will be via sha224_kunit and sha256_kunit,
added by a later commit.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250630160645.3198-9-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 include/crypto/sha2.h | 222 ++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/crypto/sha256.c   | 147 +++++++++++++++++++++++++++++++--
 2 files changed, 364 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/crypto/sha2.h b/include/crypto/sha2.h
index 18e1eec841b7..2e3fc2cf4aa0 100644
--- a/include/crypto/sha2.h
+++ b/include/crypto/sha2.h
@@ -131,6 +131,22 @@ struct __sha256_ctx {
 };
 void __sha256_update(struct __sha256_ctx *ctx, const u8 *data, size_t len);
 
+/*
+ * HMAC key and message context structs, shared by HMAC-SHA224 and HMAC-SHA256.
+ * The hmac_sha224_* and hmac_sha256_* structs wrap this one so that the API has
+ * proper typing and doesn't allow mixing the functions arbitrarily.
+ */
+struct __hmac_sha256_key {
+	struct sha256_block_state istate;
+	struct sha256_block_state ostate;
+};
+struct __hmac_sha256_ctx {
+	struct __sha256_ctx sha_ctx;
+	struct sha256_block_state ostate;
+};
+void __hmac_sha256_init(struct __hmac_sha256_ctx *ctx,
+			const struct __hmac_sha256_key *key);
+
 /**
  * struct sha224_ctx - Context for hashing a message with SHA-224
  * @ctx: private
@@ -148,6 +164,109 @@ static inline void sha224_update(struct sha224_ctx *ctx,
 void sha224_final(struct sha224_ctx *ctx, u8 out[SHA224_DIGEST_SIZE]);
 void sha224(const u8 *data, size_t len, u8 out[SHA224_DIGEST_SIZE]);
 
+/**
+ * struct hmac_sha224_key - Prepared key for HMAC-SHA224
+ * @key: private
+ */
+struct hmac_sha224_key {
+	struct __hmac_sha256_key key;
+};
+
+/**
+ * struct hmac_sha224_ctx - Context for computing HMAC-SHA224 of a message
+ * @ctx: private
+ */
+struct hmac_sha224_ctx {
+	struct __hmac_sha256_ctx ctx;
+};
+
+/**
+ * hmac_sha224_preparekey() - Prepare a key for HMAC-SHA224
+ * @key: (output) the key structure to initialize
+ * @raw_key: the raw HMAC-SHA224 key
+ * @raw_key_len: the key length in bytes.  All key lengths are supported.
+ *
+ * Note: the caller is responsible for zeroizing both the struct hmac_sha224_key
+ * and the raw key once they are no longer needed.
+ *
+ * Context: Any context.
+ */
+void hmac_sha224_preparekey(struct hmac_sha224_key *key,
+			    const u8 *raw_key, size_t raw_key_len);
+
+/**
+ * hmac_sha224_init() - Initialize an HMAC-SHA224 context for a new message
+ * @ctx: (output) the HMAC context to initialize
+ * @key: the prepared HMAC key
+ *
+ * If you don't need incremental computation, consider hmac_sha224() instead.
+ *
+ * Context: Any context.
+ */
+static inline void hmac_sha224_init(struct hmac_sha224_ctx *ctx,
+				    const struct hmac_sha224_key *key)
+{
+	__hmac_sha256_init(&ctx->ctx, &key->key);
+}
+
+/**
+ * hmac_sha224_update() - Update an HMAC-SHA224 context with message data
+ * @ctx: the HMAC context to update; must have been initialized
+ * @data: the message data
+ * @data_len: the data length in bytes
+ *
+ * This can be called any number of times.
+ *
+ * Context: Any context.
+ */
+static inline void hmac_sha224_update(struct hmac_sha224_ctx *ctx,
+				      const u8 *data, size_t data_len)
+{
+	__sha256_update(&ctx->ctx.sha_ctx, data, data_len);
+}
+
+/**
+ * hmac_sha224_final() - Finish computing an HMAC-SHA224 value
+ * @ctx: the HMAC context to finalize; must have been initialized
+ * @out: (output) the resulting HMAC-SHA224 value
+ *
+ * After finishing, this zeroizes @ctx.  So the caller does not need to do it.
+ *
+ * Context: Any context.
+ */
+void hmac_sha224_final(struct hmac_sha224_ctx *ctx, u8 out[SHA224_DIGEST_SIZE]);
+
+/**
+ * hmac_sha224() - Compute HMAC-SHA224 in one shot, using a prepared key
+ * @key: the prepared HMAC key
+ * @data: the message data
+ * @data_len: the data length in bytes
+ * @out: (output) the resulting HMAC-SHA224 value
+ *
+ * If you're using the key only once, consider using hmac_sha224_usingrawkey().
+ *
+ * Context: Any context.
+ */
+void hmac_sha224(const struct hmac_sha224_key *key,
+		 const u8 *data, size_t data_len, u8 out[SHA224_DIGEST_SIZE]);
+
+/**
+ * hmac_sha224_usingrawkey() - Compute HMAC-SHA224 in one shot, using a raw key
+ * @raw_key: the raw HMAC-SHA224 key
+ * @raw_key_len: the key length in bytes.  All key lengths are supported.
+ * @data: the message data
+ * @data_len: the data length in bytes
+ * @out: (output) the resulting HMAC-SHA224 value
+ *
+ * If you're using the key multiple times, prefer to use
+ * hmac_sha224_preparekey() followed by multiple calls to hmac_sha224() instead.
+ *
+ * Context: Any context.
+ */
+void hmac_sha224_usingrawkey(const u8 *raw_key, size_t raw_key_len,
+			     const u8 *data, size_t data_len,
+			     u8 out[SHA224_DIGEST_SIZE]);
+
 /**
  * struct sha256_ctx - Context for hashing a message with SHA-256
  * @ctx: private
@@ -165,6 +284,109 @@ static inline void sha256_update(struct sha256_ctx *ctx,
 void sha256_final(struct sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]);
 void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]);
 
+/**
+ * struct hmac_sha256_key - Prepared key for HMAC-SHA256
+ * @key: private
+ */
+struct hmac_sha256_key {
+	struct __hmac_sha256_key key;
+};
+
+/**
+ * struct hmac_sha256_ctx - Context for computing HMAC-SHA256 of a message
+ * @ctx: private
+ */
+struct hmac_sha256_ctx {
+	struct __hmac_sha256_ctx ctx;
+};
+
+/**
+ * hmac_sha256_preparekey() - Prepare a key for HMAC-SHA256
+ * @key: (output) the key structure to initialize
+ * @raw_key: the raw HMAC-SHA256 key
+ * @raw_key_len: the key length in bytes.  All key lengths are supported.
+ *
+ * Note: the caller is responsible for zeroizing both the struct hmac_sha256_key
+ * and the raw key once they are no longer needed.
+ *
+ * Context: Any context.
+ */
+void hmac_sha256_preparekey(struct hmac_sha256_key *key,
+			    const u8 *raw_key, size_t raw_key_len);
+
+/**
+ * hmac_sha256_init() - Initialize an HMAC-SHA256 context for a new message
+ * @ctx: (output) the HMAC context to initialize
+ * @key: the prepared HMAC key
+ *
+ * If you don't need incremental computation, consider hmac_sha256() instead.
+ *
+ * Context: Any context.
+ */
+static inline void hmac_sha256_init(struct hmac_sha256_ctx *ctx,
+				    const struct hmac_sha256_key *key)
+{
+	__hmac_sha256_init(&ctx->ctx, &key->key);
+}
+
+/**
+ * hmac_sha256_update() - Update an HMAC-SHA256 context with message data
+ * @ctx: the HMAC context to update; must have been initialized
+ * @data: the message data
+ * @data_len: the data length in bytes
+ *
+ * This can be called any number of times.
+ *
+ * Context: Any context.
+ */
+static inline void hmac_sha256_update(struct hmac_sha256_ctx *ctx,
+				      const u8 *data, size_t data_len)
+{
+	__sha256_update(&ctx->ctx.sha_ctx, data, data_len);
+}
+
+/**
+ * hmac_sha256_final() - Finish computing an HMAC-SHA256 value
+ * @ctx: the HMAC context to finalize; must have been initialized
+ * @out: (output) the resulting HMAC-SHA256 value
+ *
+ * After finishing, this zeroizes @ctx.  So the caller does not need to do it.
+ *
+ * Context: Any context.
+ */
+void hmac_sha256_final(struct hmac_sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]);
+
+/**
+ * hmac_sha256() - Compute HMAC-SHA256 in one shot, using a prepared key
+ * @key: the prepared HMAC key
+ * @data: the message data
+ * @data_len: the data length in bytes
+ * @out: (output) the resulting HMAC-SHA256 value
+ *
+ * If you're using the key only once, consider using hmac_sha256_usingrawkey().
+ *
+ * Context: Any context.
+ */
+void hmac_sha256(const struct hmac_sha256_key *key,
+		 const u8 *data, size_t data_len, u8 out[SHA256_DIGEST_SIZE]);
+
+/**
+ * hmac_sha256_usingrawkey() - Compute HMAC-SHA256 in one shot, using a raw key
+ * @raw_key: the raw HMAC-SHA256 key
+ * @raw_key_len: the key length in bytes.  All key lengths are supported.
+ * @data: the message data
+ * @data_len: the data length in bytes
+ * @out: (output) the resulting HMAC-SHA256 value
+ *
+ * If you're using the key multiple times, prefer to use
+ * hmac_sha256_preparekey() followed by multiple calls to hmac_sha256() instead.
+ *
+ * Context: Any context.
+ */
+void hmac_sha256_usingrawkey(const u8 *raw_key, size_t raw_key_len,
+			     const u8 *data, size_t data_len,
+			     u8 out[SHA256_DIGEST_SIZE]);
+
 /* State for the SHA-512 (and SHA-384) compression function */
 struct sha512_block_state {
 	u64 h[8];
diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c
index 3e7797a4489d..12b4b59052c4 100644
--- a/lib/crypto/sha256.c
+++ b/lib/crypto/sha256.c
@@ -1,9 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * SHA-256, as specified in
- * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf
- *
- * SHA-256 code by Jean-Luc Cooke <jlcooke@certainkey.com>.
+ * SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256 library functions
  *
  * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
  * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
@@ -11,12 +8,14 @@
  * Copyright (c) 2014 Red Hat Inc.
  */
 
+#include <crypto/hmac.h>
 #include <crypto/internal/blockhash.h>
 #include <crypto/internal/sha2.h>
 #include <linux/export.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/string.h>
+#include <linux/wordpart.h>
 
 static const struct sha256_block_state sha224_iv = {
 	.h = {
@@ -136,5 +135,143 @@ void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE])
 }
 EXPORT_SYMBOL(sha256);
 
-MODULE_DESCRIPTION("SHA-256 Algorithm");
+/* pre-boot environment (as indicated by __DISABLE_EXPORTS) doesn't need HMAC */
+#ifndef __DISABLE_EXPORTS
+static void __hmac_sha256_preparekey(struct __hmac_sha256_key *key,
+				     const u8 *raw_key, size_t raw_key_len,
+				     const struct sha256_block_state *iv)
+{
+	union {
+		u8 b[SHA256_BLOCK_SIZE];
+		unsigned long w[SHA256_BLOCK_SIZE / sizeof(unsigned long)];
+	} derived_key = { 0 };
+
+	if (unlikely(raw_key_len > SHA256_BLOCK_SIZE)) {
+		if (iv == &sha224_iv)
+			sha224(raw_key, raw_key_len, derived_key.b);
+		else
+			sha256(raw_key, raw_key_len, derived_key.b);
+	} else {
+		memcpy(derived_key.b, raw_key, raw_key_len);
+	}
+
+	for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
+		derived_key.w[i] ^= REPEAT_BYTE(HMAC_IPAD_VALUE);
+	key->istate = *iv;
+	sha256_blocks(&key->istate, derived_key.b, 1);
+
+	for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
+		derived_key.w[i] ^= REPEAT_BYTE(HMAC_OPAD_VALUE ^
+						HMAC_IPAD_VALUE);
+	key->ostate = *iv;
+	sha256_blocks(&key->ostate, derived_key.b, 1);
+
+	memzero_explicit(&derived_key, sizeof(derived_key));
+}
+
+void hmac_sha224_preparekey(struct hmac_sha224_key *key,
+			    const u8 *raw_key, size_t raw_key_len)
+{
+	__hmac_sha256_preparekey(&key->key, raw_key, raw_key_len, &sha224_iv);
+}
+EXPORT_SYMBOL_GPL(hmac_sha224_preparekey);
+
+void hmac_sha256_preparekey(struct hmac_sha256_key *key,
+			    const u8 *raw_key, size_t raw_key_len)
+{
+	__hmac_sha256_preparekey(&key->key, raw_key, raw_key_len, &sha256_iv);
+}
+EXPORT_SYMBOL_GPL(hmac_sha256_preparekey);
+
+void __hmac_sha256_init(struct __hmac_sha256_ctx *ctx,
+			const struct __hmac_sha256_key *key)
+{
+	__sha256_init(&ctx->sha_ctx, &key->istate, SHA256_BLOCK_SIZE);
+	ctx->ostate = key->ostate;
+}
+EXPORT_SYMBOL_GPL(__hmac_sha256_init);
+
+static void __hmac_sha256_final(struct __hmac_sha256_ctx *ctx,
+				u8 *out, size_t digest_size)
+{
+	/* Generate the padded input for the outer hash in ctx->sha_ctx.buf. */
+	__sha256_final(&ctx->sha_ctx, ctx->sha_ctx.buf, digest_size);
+	memset(&ctx->sha_ctx.buf[digest_size], 0,
+	       SHA256_BLOCK_SIZE - digest_size);
+	ctx->sha_ctx.buf[digest_size] = 0x80;
+	*(__be32 *)&ctx->sha_ctx.buf[SHA256_BLOCK_SIZE - 4] =
+		cpu_to_be32(8 * (SHA256_BLOCK_SIZE + digest_size));
+
+	/* Compute the outer hash, which gives the HMAC value. */
+	sha256_blocks(&ctx->ostate, ctx->sha_ctx.buf, 1);
+	for (size_t i = 0; i < digest_size; i += 4)
+		put_unaligned_be32(ctx->ostate.h[i / 4], out + i);
+
+	memzero_explicit(ctx, sizeof(*ctx));
+}
+
+void hmac_sha224_final(struct hmac_sha224_ctx *ctx,
+		       u8 out[SHA224_DIGEST_SIZE])
+{
+	__hmac_sha256_final(&ctx->ctx, out, SHA224_DIGEST_SIZE);
+}
+EXPORT_SYMBOL_GPL(hmac_sha224_final);
+
+void hmac_sha256_final(struct hmac_sha256_ctx *ctx,
+		       u8 out[SHA256_DIGEST_SIZE])
+{
+	__hmac_sha256_final(&ctx->ctx, out, SHA256_DIGEST_SIZE);
+}
+EXPORT_SYMBOL_GPL(hmac_sha256_final);
+
+void hmac_sha224(const struct hmac_sha224_key *key,
+		 const u8 *data, size_t data_len, u8 out[SHA224_DIGEST_SIZE])
+{
+	struct hmac_sha224_ctx ctx;
+
+	hmac_sha224_init(&ctx, key);
+	hmac_sha224_update(&ctx, data, data_len);
+	hmac_sha224_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_sha224);
+
+void hmac_sha256(const struct hmac_sha256_key *key,
+		 const u8 *data, size_t data_len, u8 out[SHA256_DIGEST_SIZE])
+{
+	struct hmac_sha256_ctx ctx;
+
+	hmac_sha256_init(&ctx, key);
+	hmac_sha256_update(&ctx, data, data_len);
+	hmac_sha256_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_sha256);
+
+void hmac_sha224_usingrawkey(const u8 *raw_key, size_t raw_key_len,
+			     const u8 *data, size_t data_len,
+			     u8 out[SHA224_DIGEST_SIZE])
+{
+	struct hmac_sha224_key key;
+
+	hmac_sha224_preparekey(&key, raw_key, raw_key_len);
+	hmac_sha224(&key, data, data_len, out);
+
+	memzero_explicit(&key, sizeof(key));
+}
+EXPORT_SYMBOL_GPL(hmac_sha224_usingrawkey);
+
+void hmac_sha256_usingrawkey(const u8 *raw_key, size_t raw_key_len,
+			     const u8 *data, size_t data_len,
+			     u8 out[SHA256_DIGEST_SIZE])
+{
+	struct hmac_sha256_key key;
+
+	hmac_sha256_preparekey(&key, raw_key, raw_key_len);
+	hmac_sha256(&key, data, data_len, out);
+
+	memzero_explicit(&key, sizeof(key));
+}
+EXPORT_SYMBOL_GPL(hmac_sha256_usingrawkey);
+#endif /* !__DISABLE_EXPORTS */
+
+MODULE_DESCRIPTION("SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256 library functions");
 MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 9f9846a72eec406db9e1eadcad1dd5e90aa0f355 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Mon, 30 Jun 2025 09:06:42 -0700
Subject: lib/crypto: sha256: Remove sha256_is_arch_optimized()

Remove sha256_is_arch_optimized(), since it is no longer used.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250630160645.3198-12-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 arch/mips/cavium-octeon/crypto/octeon-sha256.c | 6 ------
 include/crypto/internal/sha2.h                 | 8 --------
 lib/crypto/arm/sha256.c                        | 7 -------
 lib/crypto/arm64/sha256.c                      | 7 -------
 lib/crypto/powerpc/sha256.c                    | 6 ------
 lib/crypto/riscv/sha256.c                      | 6 ------
 lib/crypto/s390/sha256.c                       | 6 ------
 lib/crypto/sparc/sha256.c                      | 6 ------
 lib/crypto/x86/sha256.c                        | 6 ------
 9 files changed, 58 deletions(-)

(limited to 'include')

diff --git a/arch/mips/cavium-octeon/crypto/octeon-sha256.c b/arch/mips/cavium-octeon/crypto/octeon-sha256.c
index f8664818d04e..c7c67bdc2bd0 100644
--- a/arch/mips/cavium-octeon/crypto/octeon-sha256.c
+++ b/arch/mips/cavium-octeon/crypto/octeon-sha256.c
@@ -61,12 +61,6 @@ void sha256_blocks_arch(struct sha256_block_state *state,
 }
 EXPORT_SYMBOL_GPL(sha256_blocks_arch);
 
-bool sha256_is_arch_optimized(void)
-{
-	return octeon_has_crypto();
-}
-EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
-
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("SHA-256 Secure Hash Algorithm (OCTEON)");
 MODULE_AUTHOR("Aaro Koskinen <aaro.koskinen@iki.fi>");
diff --git a/include/crypto/internal/sha2.h b/include/crypto/internal/sha2.h
index f0f455477bbd..7915a3a46bc8 100644
--- a/include/crypto/internal/sha2.h
+++ b/include/crypto/internal/sha2.h
@@ -9,14 +9,6 @@
 #include <linux/types.h>
 #include <linux/unaligned.h>
 
-#if IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_SHA256)
-bool sha256_is_arch_optimized(void);
-#else
-static inline bool sha256_is_arch_optimized(void)
-{
-	return false;
-}
-#endif
 void sha256_blocks_generic(struct sha256_block_state *state,
 			   const u8 *data, size_t nblocks);
 void sha256_blocks_arch(struct sha256_block_state *state,
diff --git a/lib/crypto/arm/sha256.c b/lib/crypto/arm/sha256.c
index 7d9082358695..27181be0aa92 100644
--- a/lib/crypto/arm/sha256.c
+++ b/lib/crypto/arm/sha256.c
@@ -37,13 +37,6 @@ void sha256_blocks_arch(struct sha256_block_state *state,
 }
 EXPORT_SYMBOL_GPL(sha256_blocks_arch);
 
-bool sha256_is_arch_optimized(void)
-{
-	/* We always can use at least the ARM scalar implementation. */
-	return true;
-}
-EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
-
 static int __init sha256_arm_mod_init(void)
 {
 	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
diff --git a/lib/crypto/arm64/sha256.c b/lib/crypto/arm64/sha256.c
index 609ffb815198..a5a498276708 100644
--- a/lib/crypto/arm64/sha256.c
+++ b/lib/crypto/arm64/sha256.c
@@ -47,13 +47,6 @@ void sha256_blocks_arch(struct sha256_block_state *state,
 }
 EXPORT_SYMBOL_GPL(sha256_blocks_arch);
 
-bool sha256_is_arch_optimized(void)
-{
-	/* We always can use at least the ARM64 scalar implementation. */
-	return true;
-}
-EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
-
 static int __init sha256_arm64_mod_init(void)
 {
 	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
diff --git a/lib/crypto/powerpc/sha256.c b/lib/crypto/powerpc/sha256.c
index 55f42403d572..14b8adcdcfc2 100644
--- a/lib/crypto/powerpc/sha256.c
+++ b/lib/crypto/powerpc/sha256.c
@@ -61,11 +61,5 @@ void sha256_blocks_arch(struct sha256_block_state *state,
 }
 EXPORT_SYMBOL_GPL(sha256_blocks_arch);
 
-bool sha256_is_arch_optimized(void)
-{
-	return true;
-}
-EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
-
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("SHA-256 Secure Hash Algorithm, SPE optimized");
diff --git a/lib/crypto/riscv/sha256.c b/lib/crypto/riscv/sha256.c
index a2079aa3ae92..01004cb9c6e9 100644
--- a/lib/crypto/riscv/sha256.c
+++ b/lib/crypto/riscv/sha256.c
@@ -34,12 +34,6 @@ void sha256_blocks_arch(struct sha256_block_state *state,
 }
 EXPORT_SYMBOL_GPL(sha256_blocks_arch);
 
-bool sha256_is_arch_optimized(void)
-{
-	return static_key_enabled(&have_extensions);
-}
-EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
-
 static int __init riscv64_sha256_mod_init(void)
 {
 	/* Both zvknha and zvknhb provide the SHA-256 instructions. */
diff --git a/lib/crypto/s390/sha256.c b/lib/crypto/s390/sha256.c
index fb565718f753..6ebfd35a5d44 100644
--- a/lib/crypto/s390/sha256.c
+++ b/lib/crypto/s390/sha256.c
@@ -23,12 +23,6 @@ void sha256_blocks_arch(struct sha256_block_state *state,
 }
 EXPORT_SYMBOL_GPL(sha256_blocks_arch);
 
-bool sha256_is_arch_optimized(void)
-{
-	return static_key_enabled(&have_cpacf_sha256);
-}
-EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
-
 static int __init sha256_s390_mod_init(void)
 {
 	if (cpu_have_feature(S390_CPU_FEATURE_MSA) &&
diff --git a/lib/crypto/sparc/sha256.c b/lib/crypto/sparc/sha256.c
index 060664b88a6d..f41c109c1c18 100644
--- a/lib/crypto/sparc/sha256.c
+++ b/lib/crypto/sparc/sha256.c
@@ -32,12 +32,6 @@ void sha256_blocks_arch(struct sha256_block_state *state,
 }
 EXPORT_SYMBOL_GPL(sha256_blocks_arch);
 
-bool sha256_is_arch_optimized(void)
-{
-	return static_key_enabled(&have_sha256_opcodes);
-}
-EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
-
 static int __init sha256_sparc64_mod_init(void)
 {
 	unsigned long cfr;
diff --git a/lib/crypto/x86/sha256.c b/lib/crypto/x86/sha256.c
index cbb45defbefa..9ee38d2b3d57 100644
--- a/lib/crypto/x86/sha256.c
+++ b/lib/crypto/x86/sha256.c
@@ -37,12 +37,6 @@ void sha256_blocks_arch(struct sha256_block_state *state,
 }
 EXPORT_SYMBOL_GPL(sha256_blocks_arch);
 
-bool sha256_is_arch_optimized(void)
-{
-	return static_key_enabled(&have_sha256_x86);
-}
-EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
-
 static int __init sha256_x86_mod_init(void)
 {
 	if (boot_cpu_has(X86_FEATURE_SHA_NI)) {
-- 
cgit v1.2.3


From e96cb9507f2d8ba150d417dcd283204564945831 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Mon, 30 Jun 2025 09:06:43 -0700
Subject: lib/crypto: sha256: Consolidate into single module

Consolidate the CPU-based SHA-256 code into a single module, following
what I did with SHA-512:

- Each arch now provides a header file lib/crypto/$(SRCARCH)/sha256.h,
  replacing lib/crypto/$(SRCARCH)/sha256.c.  The header defines
  sha256_blocks() and optionally sha256_mod_init_arch().  It is included
  by lib/crypto/sha256.c, and thus the code gets built into the single
  libsha256 module, with proper inlining and dead code elimination.

- sha256_blocks_generic() is moved from lib/crypto/sha256-generic.c into
  lib/crypto/sha256.c.  It's now a static function marked with
  __maybe_unused, so the compiler automatically eliminates it in any
  cases where it's not used.

- Whether arch-optimized SHA-256 is buildable is now controlled
  centrally by lib/crypto/Kconfig instead of by
  lib/crypto/$(SRCARCH)/Kconfig.  The conditions for enabling it remain
  the same as before, and it remains enabled by default.

- Any additional arch-specific translation units for the optimized
  SHA-256 code (such as assembly files) are now compiled by
  lib/crypto/Makefile instead of lib/crypto/$(SRCARCH)/Makefile.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250630160645.3198-13-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 arch/mips/cavium-octeon/Kconfig                |   6 -
 arch/mips/cavium-octeon/crypto/Makefile        |   1 -
 arch/mips/cavium-octeon/crypto/octeon-sha256.c |  66 -----------
 include/crypto/internal/sha2.h                 |  52 ---------
 lib/crypto/Kconfig                             |  26 ++---
 lib/crypto/Makefile                            |  39 ++++++-
 lib/crypto/arm/Kconfig                         |   6 -
 lib/crypto/arm/Makefile                        |   8 +-
 lib/crypto/arm/sha256.c                        |  57 ----------
 lib/crypto/arm/sha256.h                        |  46 ++++++++
 lib/crypto/arm64/Kconfig                       |   5 -
 lib/crypto/arm64/Makefile                      |   9 +-
 lib/crypto/arm64/sha256.c                      |  68 -----------
 lib/crypto/arm64/sha256.h                      |  57 ++++++++++
 lib/crypto/mips/sha256.h                       |  58 ++++++++++
 lib/crypto/powerpc/Kconfig                     |   6 -
 lib/crypto/powerpc/Makefile                    |   3 -
 lib/crypto/powerpc/sha256.c                    |  65 -----------
 lib/crypto/powerpc/sha256.h                    |  58 ++++++++++
 lib/crypto/riscv/Kconfig                       |   7 --
 lib/crypto/riscv/Makefile                      |   3 -
 lib/crypto/riscv/sha256.c                      |  56 ---------
 lib/crypto/riscv/sha256.h                      |  42 +++++++
 lib/crypto/s390/Kconfig                        |   6 -
 lib/crypto/s390/Makefile                       |   3 -
 lib/crypto/s390/sha256.c                       |  41 -------
 lib/crypto/s390/sha256.h                       |  28 +++++
 lib/crypto/sha256-generic.c                    | 150 -------------------------
 lib/crypto/sha256.c                            | 146 +++++++++++++++++++++---
 lib/crypto/sparc/Kconfig                       |   8 --
 lib/crypto/sparc/Makefile                      |   4 -
 lib/crypto/sparc/sha256.c                      |  58 ----------
 lib/crypto/sparc/sha256.h                      |  43 +++++++
 lib/crypto/x86/Kconfig                         |   7 --
 lib/crypto/x86/Makefile                        |   3 -
 lib/crypto/x86/sha256.c                        |  68 -----------
 lib/crypto/x86/sha256.h                        |  55 +++++++++
 37 files changed, 565 insertions(+), 799 deletions(-)
 delete mode 100644 arch/mips/cavium-octeon/crypto/octeon-sha256.c
 delete mode 100644 include/crypto/internal/sha2.h
 delete mode 100644 lib/crypto/arm/sha256.c
 create mode 100644 lib/crypto/arm/sha256.h
 delete mode 100644 lib/crypto/arm64/sha256.c
 create mode 100644 lib/crypto/arm64/sha256.h
 create mode 100644 lib/crypto/mips/sha256.h
 delete mode 100644 lib/crypto/powerpc/sha256.c
 create mode 100644 lib/crypto/powerpc/sha256.h
 delete mode 100644 lib/crypto/riscv/sha256.c
 create mode 100644 lib/crypto/riscv/sha256.h
 delete mode 100644 lib/crypto/s390/sha256.c
 create mode 100644 lib/crypto/s390/sha256.h
 delete mode 100644 lib/crypto/sha256-generic.c
 delete mode 100644 lib/crypto/sparc/Kconfig
 delete mode 100644 lib/crypto/sparc/Makefile
 delete mode 100644 lib/crypto/sparc/sha256.c
 create mode 100644 lib/crypto/sparc/sha256.h
 delete mode 100644 lib/crypto/x86/sha256.c
 create mode 100644 lib/crypto/x86/sha256.h

(limited to 'include')

diff --git a/arch/mips/cavium-octeon/Kconfig b/arch/mips/cavium-octeon/Kconfig
index 11f4aa6e80e9..450e979ef5d9 100644
--- a/arch/mips/cavium-octeon/Kconfig
+++ b/arch/mips/cavium-octeon/Kconfig
@@ -23,12 +23,6 @@ config CAVIUM_OCTEON_CVMSEG_SIZE
 	  legally range is from zero to 54 cache blocks (i.e. CVMSEG LM is
 	  between zero and 6192 bytes).
 
-config CRYPTO_SHA256_OCTEON
-	tristate
-	default CRYPTO_LIB_SHA256
-	select CRYPTO_ARCH_HAVE_LIB_SHA256
-	select CRYPTO_LIB_SHA256_GENERIC
-
 endif # CPU_CAVIUM_OCTEON
 
 if CAVIUM_OCTEON_SOC
diff --git a/arch/mips/cavium-octeon/crypto/Makefile b/arch/mips/cavium-octeon/crypto/Makefile
index 168b19ef7ce8..db428e4b30bc 100644
--- a/arch/mips/cavium-octeon/crypto/Makefile
+++ b/arch/mips/cavium-octeon/crypto/Makefile
@@ -7,4 +7,3 @@ obj-y += octeon-crypto.o
 
 obj-$(CONFIG_CRYPTO_MD5_OCTEON)		+= octeon-md5.o
 obj-$(CONFIG_CRYPTO_SHA1_OCTEON)	+= octeon-sha1.o
-obj-$(CONFIG_CRYPTO_SHA256_OCTEON)	+= octeon-sha256.o
diff --git a/arch/mips/cavium-octeon/crypto/octeon-sha256.c b/arch/mips/cavium-octeon/crypto/octeon-sha256.c
deleted file mode 100644
index c7c67bdc2bd0..000000000000
--- a/arch/mips/cavium-octeon/crypto/octeon-sha256.c
+++ /dev/null
@@ -1,66 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * SHA-256 Secure Hash Algorithm.
- *
- * Adapted for OCTEON by Aaro Koskinen <aaro.koskinen@iki.fi>.
- *
- * Based on crypto/sha256_generic.c, which is:
- *
- * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
- * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
- * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
- * SHA224 Support Copyright 2007 Intel Corporation <jonathan.lynch@intel.com>
- */
-
-#include <asm/octeon/crypto.h>
-#include <asm/octeon/octeon.h>
-#include <crypto/internal/sha2.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-/*
- * We pass everything as 64-bit. OCTEON can handle misaligned data.
- */
-
-void sha256_blocks_arch(struct sha256_block_state *state,
-			const u8 *data, size_t nblocks)
-{
-	struct octeon_cop2_state cop2_state;
-	u64 *state64 = (u64 *)state;
-	unsigned long flags;
-
-	if (!octeon_has_crypto())
-		return sha256_blocks_generic(state, data, nblocks);
-
-	flags = octeon_crypto_enable(&cop2_state);
-	write_octeon_64bit_hash_dword(state64[0], 0);
-	write_octeon_64bit_hash_dword(state64[1], 1);
-	write_octeon_64bit_hash_dword(state64[2], 2);
-	write_octeon_64bit_hash_dword(state64[3], 3);
-
-	do {
-		const u64 *block = (const u64 *)data;
-
-		write_octeon_64bit_block_dword(block[0], 0);
-		write_octeon_64bit_block_dword(block[1], 1);
-		write_octeon_64bit_block_dword(block[2], 2);
-		write_octeon_64bit_block_dword(block[3], 3);
-		write_octeon_64bit_block_dword(block[4], 4);
-		write_octeon_64bit_block_dword(block[5], 5);
-		write_octeon_64bit_block_dword(block[6], 6);
-		octeon_sha256_start(block[7]);
-
-		data += SHA256_BLOCK_SIZE;
-	} while (--nblocks);
-
-	state64[0] = read_octeon_64bit_hash_dword(0);
-	state64[1] = read_octeon_64bit_hash_dword(1);
-	state64[2] = read_octeon_64bit_hash_dword(2);
-	state64[3] = read_octeon_64bit_hash_dword(3);
-	octeon_crypto_disable(&cop2_state, flags);
-}
-EXPORT_SYMBOL_GPL(sha256_blocks_arch);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA-256 Secure Hash Algorithm (OCTEON)");
-MODULE_AUTHOR("Aaro Koskinen <aaro.koskinen@iki.fi>");
diff --git a/include/crypto/internal/sha2.h b/include/crypto/internal/sha2.h
deleted file mode 100644
index 7915a3a46bc8..000000000000
--- a/include/crypto/internal/sha2.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-
-#ifndef _CRYPTO_INTERNAL_SHA2_H
-#define _CRYPTO_INTERNAL_SHA2_H
-
-#include <crypto/sha2.h>
-#include <linux/compiler_attributes.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/unaligned.h>
-
-void sha256_blocks_generic(struct sha256_block_state *state,
-			   const u8 *data, size_t nblocks);
-void sha256_blocks_arch(struct sha256_block_state *state,
-			const u8 *data, size_t nblocks);
-
-static __always_inline void sha256_choose_blocks(
-	u32 state[SHA256_STATE_WORDS], const u8 *data, size_t nblocks,
-	bool force_generic, bool force_simd)
-{
-	if (!IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_SHA256) || force_generic)
-		sha256_blocks_generic((struct sha256_block_state *)state, data, nblocks);
-	else
-		sha256_blocks_arch((struct sha256_block_state *)state, data, nblocks);
-}
-
-static __always_inline void sha256_finup(
-	struct crypto_sha256_state *sctx, u8 buf[SHA256_BLOCK_SIZE],
-	size_t len, u8 out[SHA256_DIGEST_SIZE], size_t digest_size,
-	bool force_generic, bool force_simd)
-{
-	const size_t bit_offset = SHA256_BLOCK_SIZE - 8;
-	__be64 *bits = (__be64 *)&buf[bit_offset];
-	int i;
-
-	buf[len++] = 0x80;
-	if (len > bit_offset) {
-		memset(&buf[len], 0, SHA256_BLOCK_SIZE - len);
-		sha256_choose_blocks(sctx->state, buf, 1, force_generic,
-				     force_simd);
-		len = 0;
-	}
-
-	memset(&buf[len], 0, bit_offset - len);
-	*bits = cpu_to_be64(sctx->count << 3);
-	sha256_choose_blocks(sctx->state, buf, 1, force_generic, force_simd);
-
-	for (i = 0; i < digest_size; i += 4)
-		put_unaligned_be32(sctx->state[i / 4], out + i);
-}
-
-#endif /* _CRYPTO_INTERNAL_SHA2_H */
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index 9bd740475a89..3305c6908581 100644
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -144,20 +144,17 @@ config CRYPTO_LIB_SHA256
 	  by either the generic implementation or an arch-specific one, if one
 	  is available and enabled.
 
-config CRYPTO_ARCH_HAVE_LIB_SHA256
+config CRYPTO_LIB_SHA256_ARCH
 	bool
-	help
-	  Declares whether the architecture provides an arch-specific
-	  accelerated implementation of the SHA-256 library interface.
-
-config CRYPTO_LIB_SHA256_GENERIC
-	tristate
-	default CRYPTO_LIB_SHA256 if !CRYPTO_ARCH_HAVE_LIB_SHA256
-	help
-	  This symbol can be selected by arch implementations of the SHA-256
-	  library interface that require the generic code as a fallback, e.g.,
-	  for SIMD implementations. If no arch specific implementation is
-	  enabled, this implementation serves the users of CRYPTO_LIB_SHA256.
+	depends on CRYPTO_LIB_SHA256 && !UML
+	default y if ARM && !CPU_V7M
+	default y if ARM64
+	default y if MIPS && CPU_CAVIUM_OCTEON
+	default y if PPC && SPE
+	default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+	default y if S390
+	default y if SPARC64
+	default y if X86_64
 
 config CRYPTO_LIB_SHA512
 	tristate
@@ -199,9 +196,6 @@ endif
 if S390
 source "lib/crypto/s390/Kconfig"
 endif
-if SPARC
-source "lib/crypto/sparc/Kconfig"
-endif
 if X86
 source "lib/crypto/x86/Kconfig"
 endif
diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile
index 5823137fa5a8..a887bf103bf0 100644
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -66,11 +66,39 @@ libpoly1305-generic-y				+= poly1305-generic.o
 obj-$(CONFIG_CRYPTO_LIB_SHA1)			+= libsha1.o
 libsha1-y					:= sha1.o
 
-obj-$(CONFIG_CRYPTO_LIB_SHA256)			+= libsha256.o
-libsha256-y					:= sha256.o
+################################################################################
 
-obj-$(CONFIG_CRYPTO_LIB_SHA256_GENERIC)		+= libsha256-generic.o
-libsha256-generic-y				:= sha256-generic.o
+obj-$(CONFIG_CRYPTO_LIB_SHA256) += libsha256.o
+libsha256-y := sha256.o
+ifeq ($(CONFIG_CRYPTO_LIB_SHA256_ARCH),y)
+CFLAGS_sha256.o += -I$(src)/$(SRCARCH)
+
+ifeq ($(CONFIG_ARM),y)
+libsha256-y += arm/sha256-ce.o arm/sha256-core.o
+$(obj)/arm/sha256-core.S: $(src)/arm/sha256-armv4.pl
+	$(call cmd,perlasm)
+clean-files += arm/sha256-core.S
+AFLAGS_arm/sha256-core.o += $(aflags-thumb2-y)
+endif
+
+ifeq ($(CONFIG_ARM64),y)
+libsha256-y += arm64/sha256-core.o
+$(obj)/arm64/sha256-core.S: $(src)/arm64/sha2-armv8.pl
+	$(call cmd,perlasm_with_args)
+clean-files += arm64/sha256-core.S
+libsha256-$(CONFIG_KERNEL_MODE_NEON) += arm64/sha256-ce.o
+endif
+
+libsha256-$(CONFIG_PPC) += powerpc/sha256-spe-asm.o
+libsha256-$(CONFIG_RISCV) += riscv/sha256-riscv64-zvknha_or_zvknhb-zvkb.o
+libsha256-$(CONFIG_SPARC) += sparc/sha256_asm.o
+libsha256-$(CONFIG_X86) += x86/sha256-ssse3-asm.o \
+			   x86/sha256-avx-asm.o \
+			   x86/sha256-avx2-asm.o \
+			   x86/sha256-ni-asm.o
+endif # CONFIG_CRYPTO_LIB_SHA256_ARCH
+
+################################################################################
 
 obj-$(CONFIG_CRYPTO_LIB_SHA512) += libsha512.o
 libsha512-y := sha512.o
@@ -100,6 +128,8 @@ libsha512-$(CONFIG_X86) += x86/sha512-ssse3-asm.o \
 			   x86/sha512-avx2-asm.o
 endif # CONFIG_CRYPTO_LIB_SHA512_ARCH
 
+################################################################################
+
 obj-$(CONFIG_MPILIB) += mpi/
 
 obj-$(CONFIG_CRYPTO_SELFTESTS_FULL)		+= simd.o
@@ -113,5 +143,4 @@ obj-$(CONFIG_MIPS) += mips/
 obj-$(CONFIG_PPC) += powerpc/
 obj-$(CONFIG_RISCV) += riscv/
 obj-$(CONFIG_S390) += s390/
-obj-$(CONFIG_SPARC) += sparc/
 obj-$(CONFIG_X86) += x86/
diff --git a/lib/crypto/arm/Kconfig b/lib/crypto/arm/Kconfig
index 9f3ff30f4032..e8444fd0aae3 100644
--- a/lib/crypto/arm/Kconfig
+++ b/lib/crypto/arm/Kconfig
@@ -22,9 +22,3 @@ config CRYPTO_POLY1305_ARM
 	tristate
 	default CRYPTO_LIB_POLY1305
 	select CRYPTO_ARCH_HAVE_LIB_POLY1305
-
-config CRYPTO_SHA256_ARM
-	tristate
-	depends on !CPU_V7M
-	default CRYPTO_LIB_SHA256
-	select CRYPTO_ARCH_HAVE_LIB_SHA256
diff --git a/lib/crypto/arm/Makefile b/lib/crypto/arm/Makefile
index 431f77c3ff6f..4c042a4c77ed 100644
--- a/lib/crypto/arm/Makefile
+++ b/lib/crypto/arm/Makefile
@@ -10,17 +10,13 @@ chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o
 obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
 poly1305-arm-y := poly1305-core.o poly1305-glue.o
 
-obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
-sha256-arm-y := sha256.o sha256-core.o
-sha256-arm-$(CONFIG_KERNEL_MODE_NEON) += sha256-ce.o
-
 quiet_cmd_perl = PERL    $@
       cmd_perl = $(PERL) $(<) > $(@)
 
 $(obj)/%-core.S: $(src)/%-armv4.pl
 	$(call cmd,perl)
 
-clean-files += poly1305-core.S sha256-core.S
+clean-files += poly1305-core.S
 
 aflags-thumb2-$(CONFIG_THUMB2_KERNEL)  := -U__thumb2__ -D__thumb2__=1
 
@@ -28,5 +24,3 @@ aflags-thumb2-$(CONFIG_THUMB2_KERNEL)  := -U__thumb2__ -D__thumb2__=1
 poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5
 poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7
 AFLAGS_poly1305-core.o += $(poly1305-aflags-y) $(aflags-thumb2-y)
-
-AFLAGS_sha256-core.o += $(aflags-thumb2-y)
diff --git a/lib/crypto/arm/sha256.c b/lib/crypto/arm/sha256.c
deleted file mode 100644
index 27181be0aa92..000000000000
--- a/lib/crypto/arm/sha256.c
+++ /dev/null
@@ -1,57 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * SHA-256 optimized for ARM
- *
- * Copyright 2025 Google LLC
- */
-#include <asm/neon.h>
-#include <crypto/internal/sha2.h>
-#include <crypto/internal/simd.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-asmlinkage void sha256_block_data_order(struct sha256_block_state *state,
-					const u8 *data, size_t nblocks);
-asmlinkage void sha256_block_data_order_neon(struct sha256_block_state *state,
-					     const u8 *data, size_t nblocks);
-asmlinkage void sha256_ce_transform(struct sha256_block_state *state,
-				    const u8 *data, size_t nblocks);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
-
-void sha256_blocks_arch(struct sha256_block_state *state,
-			const u8 *data, size_t nblocks)
-{
-	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
-	    static_branch_likely(&have_neon) && crypto_simd_usable()) {
-		kernel_neon_begin();
-		if (static_branch_likely(&have_ce))
-			sha256_ce_transform(state, data, nblocks);
-		else
-			sha256_block_data_order_neon(state, data, nblocks);
-		kernel_neon_end();
-	} else {
-		sha256_block_data_order(state, data, nblocks);
-	}
-}
-EXPORT_SYMBOL_GPL(sha256_blocks_arch);
-
-static int __init sha256_arm_mod_init(void)
-{
-	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
-		static_branch_enable(&have_neon);
-		if (elf_hwcap2 & HWCAP2_SHA2)
-			static_branch_enable(&have_ce);
-	}
-	return 0;
-}
-subsys_initcall(sha256_arm_mod_init);
-
-static void __exit sha256_arm_mod_exit(void)
-{
-}
-module_exit(sha256_arm_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA-256 optimized for ARM");
diff --git a/lib/crypto/arm/sha256.h b/lib/crypto/arm/sha256.h
new file mode 100644
index 000000000000..da75cbdc51d4
--- /dev/null
+++ b/lib/crypto/arm/sha256.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-256 optimized for ARM
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/neon.h>
+#include <crypto/internal/simd.h>
+
+asmlinkage void sha256_block_data_order(struct sha256_block_state *state,
+					const u8 *data, size_t nblocks);
+asmlinkage void sha256_block_data_order_neon(struct sha256_block_state *state,
+					     const u8 *data, size_t nblocks);
+asmlinkage void sha256_ce_transform(struct sha256_block_state *state,
+				    const u8 *data, size_t nblocks);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
+
+static void sha256_blocks(struct sha256_block_state *state,
+			  const u8 *data, size_t nblocks)
+{	/* Hand (state, data, nblocks) to one of the asmlinkage routines above. */
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+	    static_branch_likely(&have_neon) && crypto_simd_usable()) {
+		kernel_neon_begin();	/* SIMD call is bracketed by begin/end */
+		if (static_branch_likely(&have_ce))
+			sha256_ce_transform(state, data, nblocks);
+		else
+			sha256_block_data_order_neon(state, data, nblocks);
+		kernel_neon_end();
+	} else {	/* NEON compiled out, have_neon off, or SIMD unusable now */
+		sha256_block_data_order(state, data, nblocks);
+	}
+}
+
+#ifdef CONFIG_KERNEL_MODE_NEON
+#define sha256_mod_init_arch sha256_mod_init_arch /* NOTE(review): presumably lets the includer #ifdef-test for this hook - confirm */
+static inline void sha256_mod_init_arch(void)
+{	/* Turn on the static keys that sha256_blocks() tests. */
+	if (elf_hwcap & HWCAP_NEON) {
+		static_branch_enable(&have_neon);
+		if (elf_hwcap2 & HWCAP2_SHA2)	/* have_ce is never set without have_neon */
+			static_branch_enable(&have_ce);
+	}
+}
+#endif /* CONFIG_KERNEL_MODE_NEON */
diff --git a/lib/crypto/arm64/Kconfig b/lib/crypto/arm64/Kconfig
index 49e57bfdb5b5..0b903ef524d8 100644
--- a/lib/crypto/arm64/Kconfig
+++ b/lib/crypto/arm64/Kconfig
@@ -12,8 +12,3 @@ config CRYPTO_POLY1305_NEON
 	depends on KERNEL_MODE_NEON
 	default CRYPTO_LIB_POLY1305
 	select CRYPTO_ARCH_HAVE_LIB_POLY1305
-
-config CRYPTO_SHA256_ARM64
-	tristate
-	default CRYPTO_LIB_SHA256
-	select CRYPTO_ARCH_HAVE_LIB_SHA256
diff --git a/lib/crypto/arm64/Makefile b/lib/crypto/arm64/Makefile
index 946c09903711..6207088397a7 100644
--- a/lib/crypto/arm64/Makefile
+++ b/lib/crypto/arm64/Makefile
@@ -8,17 +8,10 @@ poly1305-neon-y := poly1305-core.o poly1305-glue.o
 AFLAGS_poly1305-core.o += -Dpoly1305_init=poly1305_block_init_arch
 AFLAGS_poly1305-core.o += -Dpoly1305_emit=poly1305_emit_arch
 
-obj-$(CONFIG_CRYPTO_SHA256_ARM64) += sha256-arm64.o
-sha256-arm64-y := sha256.o sha256-core.o
-sha256-arm64-$(CONFIG_KERNEL_MODE_NEON) += sha256-ce.o
-
 quiet_cmd_perlasm = PERLASM $@
       cmd_perlasm = $(PERL) $(<) void $(@)
 
 $(obj)/%-core.S: $(src)/%-armv8.pl
 	$(call cmd,perlasm)
 
-$(obj)/sha256-core.S: $(src)/sha2-armv8.pl
-	$(call cmd,perlasm)
-
-clean-files += poly1305-core.S sha256-core.S
+clean-files += poly1305-core.S
diff --git a/lib/crypto/arm64/sha256.c b/lib/crypto/arm64/sha256.c
deleted file mode 100644
index a5a498276708..000000000000
--- a/lib/crypto/arm64/sha256.c
+++ /dev/null
@@ -1,68 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * SHA-256 optimized for ARM64
- *
- * Copyright 2025 Google LLC
- */
-#include <asm/neon.h>
-#include <crypto/internal/sha2.h>
-#include <crypto/internal/simd.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-asmlinkage void sha256_block_data_order(struct sha256_block_state *state,
-					const u8 *data, size_t nblocks);
-asmlinkage void sha256_block_neon(struct sha256_block_state *state,
-				  const u8 *data, size_t nblocks);
-asmlinkage size_t __sha256_ce_transform(struct sha256_block_state *state,
-					const u8 *data, size_t nblocks);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
-
-void sha256_blocks_arch(struct sha256_block_state *state,
-			const u8 *data, size_t nblocks)
-{
-	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
-	    static_branch_likely(&have_neon) && crypto_simd_usable()) {
-		if (static_branch_likely(&have_ce)) {
-			do {
-				size_t rem;
-
-				kernel_neon_begin();
-				rem = __sha256_ce_transform(state,
-							    data, nblocks);
-				kernel_neon_end();
-				data += (nblocks - rem) * SHA256_BLOCK_SIZE;
-				nblocks = rem;
-			} while (nblocks);
-		} else {
-			kernel_neon_begin();
-			sha256_block_neon(state, data, nblocks);
-			kernel_neon_end();
-		}
-	} else {
-		sha256_block_data_order(state, data, nblocks);
-	}
-}
-EXPORT_SYMBOL_GPL(sha256_blocks_arch);
-
-static int __init sha256_arm64_mod_init(void)
-{
-	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
-	    cpu_have_named_feature(ASIMD)) {
-		static_branch_enable(&have_neon);
-		if (cpu_have_named_feature(SHA2))
-			static_branch_enable(&have_ce);
-	}
-	return 0;
-}
-subsys_initcall(sha256_arm64_mod_init);
-
-static void __exit sha256_arm64_mod_exit(void)
-{
-}
-module_exit(sha256_arm64_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA-256 optimized for ARM64");
diff --git a/lib/crypto/arm64/sha256.h b/lib/crypto/arm64/sha256.h
new file mode 100644
index 000000000000..a211966c124a
--- /dev/null
+++ b/lib/crypto/arm64/sha256.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-256 optimized for ARM64
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/neon.h>
+#include <crypto/internal/simd.h>
+#include <linux/cpufeature.h>
+
+asmlinkage void sha256_block_data_order(struct sha256_block_state *state,
+					const u8 *data, size_t nblocks);
+asmlinkage void sha256_block_neon(struct sha256_block_state *state,
+				  const u8 *data, size_t nblocks);
+asmlinkage size_t __sha256_ce_transform(struct sha256_block_state *state,
+					const u8 *data, size_t nblocks);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
+
+static void sha256_blocks(struct sha256_block_state *state,
+			  const u8 *data, size_t nblocks)
+{	/* Hand (state, data, nblocks) to one of the asmlinkage routines above. */
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+	    static_branch_likely(&have_neon) && crypto_simd_usable()) {
+		if (static_branch_likely(&have_ce)) {
+			do {	/* one begin/end section per call, until nothing is left */
+				size_t rem;	/* return value, used as the count of blocks still to do */
+
+				kernel_neon_begin();
+				rem = __sha256_ce_transform(state,
+							    data, nblocks);
+				kernel_neon_end();
+				data += (nblocks - rem) * SHA256_BLOCK_SIZE;
+				nblocks = rem;	/* resume where the call stopped */
+			} while (nblocks);
+		} else {	/* NEON without have_ce: a single call covers all blocks */
+			kernel_neon_begin();
+			sha256_block_neon(state, data, nblocks);
+			kernel_neon_end();
+		}
+	} else {	/* NEON compiled out, have_neon off, or SIMD unusable now */
+		sha256_block_data_order(state, data, nblocks);
+	}
+}
+
+#ifdef CONFIG_KERNEL_MODE_NEON
+#define sha256_mod_init_arch sha256_mod_init_arch /* NOTE(review): presumably lets the includer #ifdef-test for this hook - confirm */
+static inline void sha256_mod_init_arch(void)
+{	/* Turn on the static keys that sha256_blocks() tests. */
+	if (cpu_have_named_feature(ASIMD)) {
+		static_branch_enable(&have_neon);
+		if (cpu_have_named_feature(SHA2))	/* have_ce is never set without have_neon */
+			static_branch_enable(&have_ce);
+	}
+}
+#endif /* CONFIG_KERNEL_MODE_NEON */
diff --git a/lib/crypto/mips/sha256.h b/lib/crypto/mips/sha256.h
new file mode 100644
index 000000000000..ccccfd131634
--- /dev/null
+++ b/lib/crypto/mips/sha256.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-256 Secure Hash Algorithm.
+ *
+ * Adapted for OCTEON by Aaro Koskinen <aaro.koskinen@iki.fi>.
+ *
+ * Based on crypto/sha256_generic.c, which is:
+ *
+ * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ * SHA224 Support Copyright 2007 Intel Corporation <jonathan.lynch@intel.com>
+ */
+
+#include <asm/octeon/crypto.h>
+#include <asm/octeon/octeon.h>
+
+/*
+ * We pass everything as 64-bit. OCTEON can handle misaligned data.
+ */
+
+static void sha256_blocks(struct sha256_block_state *state,
+			  const u8 *data, size_t nblocks)
+{
+	struct octeon_cop2_state cop2_state;
+	u64 *state64 = (u64 *)state;	/* state is moved as four 64-bit words */
+	unsigned long flags;
+
+	if (!octeon_has_crypto())	/* otherwise defer to the generic code */
+		return sha256_blocks_generic(state, data, nblocks);
+
+	flags = octeon_crypto_enable(&cop2_state);
+	write_octeon_64bit_hash_dword(state64[0], 0);	/* load the incoming state */
+	write_octeon_64bit_hash_dword(state64[1], 1);
+	write_octeon_64bit_hash_dword(state64[2], 2);
+	write_octeon_64bit_hash_dword(state64[3], 3);
+
+	do {
+		const u64 *block = (const u64 *)data;	/* one block, read as eight u64s */
+
+		write_octeon_64bit_block_dword(block[0], 0);
+		write_octeon_64bit_block_dword(block[1], 1);
+		write_octeon_64bit_block_dword(block[2], 2);
+		write_octeon_64bit_block_dword(block[3], 3);
+		write_octeon_64bit_block_dword(block[4], 4);
+		write_octeon_64bit_block_dword(block[5], 5);
+		write_octeon_64bit_block_dword(block[6], 6);
+		octeon_sha256_start(block[7]);	/* eighth dword goes through the start helper */
+
+		data += SHA256_BLOCK_SIZE;
+	} while (--nblocks);	/* body runs before the test: nblocks == 0 would wrap around */
+
+	state64[0] = read_octeon_64bit_hash_dword(0);	/* read the hash dwords back into state */
+	state64[1] = read_octeon_64bit_hash_dword(1);
+	state64[2] = read_octeon_64bit_hash_dword(2);
+	state64[3] = read_octeon_64bit_hash_dword(3);
+	octeon_crypto_disable(&cop2_state, flags);	/* flags came from octeon_crypto_enable() */
+}
diff --git a/lib/crypto/powerpc/Kconfig b/lib/crypto/powerpc/Kconfig
index 3f9e1bbd9905..2eaeb7665a6a 100644
--- a/lib/crypto/powerpc/Kconfig
+++ b/lib/crypto/powerpc/Kconfig
@@ -14,9 +14,3 @@ config CRYPTO_POLY1305_P10
 	default CRYPTO_LIB_POLY1305
 	select CRYPTO_ARCH_HAVE_LIB_POLY1305
 	select CRYPTO_LIB_POLY1305_GENERIC
-
-config CRYPTO_SHA256_PPC_SPE
-	tristate
-	depends on SPE
-	default CRYPTO_LIB_SHA256
-	select CRYPTO_ARCH_HAVE_LIB_SHA256
diff --git a/lib/crypto/powerpc/Makefile b/lib/crypto/powerpc/Makefile
index 27f231f8e334..5709ae14258a 100644
--- a/lib/crypto/powerpc/Makefile
+++ b/lib/crypto/powerpc/Makefile
@@ -5,6 +5,3 @@ chacha-p10-crypto-y := chacha-p10-glue.o chacha-p10le-8x.o
 
 obj-$(CONFIG_CRYPTO_POLY1305_P10) += poly1305-p10-crypto.o
 poly1305-p10-crypto-y := poly1305-p10-glue.o poly1305-p10le_64.o
-
-obj-$(CONFIG_CRYPTO_SHA256_PPC_SPE) += sha256-ppc-spe.o
-sha256-ppc-spe-y := sha256.o sha256-spe-asm.o
diff --git a/lib/crypto/powerpc/sha256.c b/lib/crypto/powerpc/sha256.c
deleted file mode 100644
index 14b8adcdcfc2..000000000000
--- a/lib/crypto/powerpc/sha256.c
+++ /dev/null
@@ -1,65 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * SHA-256 Secure Hash Algorithm, SPE optimized
- *
- * Based on generic implementation. The assembler module takes care
- * about the SPE registers so it can run from interrupt context.
- *
- * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
- */
-
-#include <asm/switch_to.h>
-#include <crypto/internal/sha2.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/preempt.h>
-
-/*
- * MAX_BYTES defines the number of bytes that are allowed to be processed
- * between preempt_disable() and preempt_enable(). SHA256 takes ~2,000
- * operations per 64 bytes. e500 cores can issue two arithmetic instructions
- * per clock cycle using one 32/64 bit unit (SU1) and one 32 bit unit (SU2).
- * Thus 1KB of input data will need an estimated maximum of 18,000 cycles.
- * Headroom for cache misses included. Even with the low end model clocked
- * at 667 MHz this equals to a critical time window of less than 27us.
- *
- */
-#define MAX_BYTES 1024
-
-extern void ppc_spe_sha256_transform(struct sha256_block_state *state,
-				     const u8 *src, u32 blocks);
-
-static void spe_begin(void)
-{
-	/* We just start SPE operations and will save SPE registers later. */
-	preempt_disable();
-	enable_kernel_spe();
-}
-
-static void spe_end(void)
-{
-	disable_kernel_spe();
-	/* reenable preemption */
-	preempt_enable();
-}
-
-void sha256_blocks_arch(struct sha256_block_state *state,
-			const u8 *data, size_t nblocks)
-{
-	do {
-		/* cut input data into smaller blocks */
-		u32 unit = min_t(size_t, nblocks,
-				 MAX_BYTES / SHA256_BLOCK_SIZE);
-
-		spe_begin();
-		ppc_spe_sha256_transform(state, data, unit);
-		spe_end();
-
-		data += unit * SHA256_BLOCK_SIZE;
-		nblocks -= unit;
-	} while (nblocks);
-}
-EXPORT_SYMBOL_GPL(sha256_blocks_arch);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA-256 Secure Hash Algorithm, SPE optimized");
diff --git a/lib/crypto/powerpc/sha256.h b/lib/crypto/powerpc/sha256.h
new file mode 100644
index 000000000000..50d355441c7e
--- /dev/null
+++ b/lib/crypto/powerpc/sha256.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-256 Secure Hash Algorithm, SPE optimized
+ *
+ * Based on the generic implementation. The assembler module takes care
+ * of the SPE registers so it can run from interrupt context.
+ *
+ * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
+ */
+
+#include <asm/switch_to.h>
+#include <linux/preempt.h>
+
+/*
+ * MAX_BYTES defines the number of bytes that are allowed to be processed
+ * between preempt_disable() and preempt_enable(). SHA256 takes ~2,000
+ * operations per 64 bytes. e500 cores can issue two arithmetic instructions
+ * per clock cycle using one 32/64 bit unit (SU1) and one 32 bit unit (SU2).
+ * Thus 1KB of input data will need an estimated maximum of 18,000 cycles.
+ * Headroom for cache misses is included. Even with the low-end model clocked
+ * at 667 MHz this amounts to a critical time window of less than 27us.
+ *
+ */
+#define MAX_BYTES 1024
+
+extern void ppc_spe_sha256_transform(struct sha256_block_state *state,
+				     const u8 *src, u32 blocks);
+
+static void spe_begin(void)
+{
+	/* We just start SPE operations and will save SPE registers later. */
+	preempt_disable();	/* re-enabled by spe_end() */
+	enable_kernel_spe();
+}
+
+static void spe_end(void)
+{
+	disable_kernel_spe();
+	/* re-enable the preemption that spe_begin() disabled */
+	preempt_enable();
+}
+
+static void sha256_blocks(struct sha256_block_state *state,
+			  const u8 *data, size_t nblocks)
+{
+	do {
+		/* at most MAX_BYTES of input per spe_begin()/spe_end() section */
+		u32 unit = min_t(size_t, nblocks,
+				 MAX_BYTES / SHA256_BLOCK_SIZE);
+
+		spe_begin();
+		ppc_spe_sha256_transform(state, data, unit);
+		spe_end();
+
+		data += unit * SHA256_BLOCK_SIZE;
+		nblocks -= unit;
+	} while (nblocks);	/* body runs once even when nblocks is 0 (unit is then 0) */
+}
diff --git a/lib/crypto/riscv/Kconfig b/lib/crypto/riscv/Kconfig
index c100571feb7e..bc7a43f33eb3 100644
--- a/lib/crypto/riscv/Kconfig
+++ b/lib/crypto/riscv/Kconfig
@@ -6,10 +6,3 @@ config CRYPTO_CHACHA_RISCV64
 	default CRYPTO_LIB_CHACHA
 	select CRYPTO_ARCH_HAVE_LIB_CHACHA
 	select CRYPTO_LIB_CHACHA_GENERIC
-
-config CRYPTO_SHA256_RISCV64
-	tristate
-	depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
-	default CRYPTO_LIB_SHA256
-	select CRYPTO_ARCH_HAVE_LIB_SHA256
-	select CRYPTO_LIB_SHA256_GENERIC
diff --git a/lib/crypto/riscv/Makefile b/lib/crypto/riscv/Makefile
index b7cb877a2c07..e27b78f317fc 100644
--- a/lib/crypto/riscv/Makefile
+++ b/lib/crypto/riscv/Makefile
@@ -2,6 +2,3 @@
 
 obj-$(CONFIG_CRYPTO_CHACHA_RISCV64) += chacha-riscv64.o
 chacha-riscv64-y := chacha-riscv64-glue.o chacha-riscv64-zvkb.o
-
-obj-$(CONFIG_CRYPTO_SHA256_RISCV64) += sha256-riscv64.o
-sha256-riscv64-y := sha256.o sha256-riscv64-zvknha_or_zvknhb-zvkb.o
diff --git a/lib/crypto/riscv/sha256.c b/lib/crypto/riscv/sha256.c
deleted file mode 100644
index 01004cb9c6e9..000000000000
--- a/lib/crypto/riscv/sha256.c
+++ /dev/null
@@ -1,56 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * SHA-256 (RISC-V accelerated)
- *
- * Copyright (C) 2022 VRULL GmbH
- * Author: Heiko Stuebner <heiko.stuebner@vrull.eu>
- *
- * Copyright (C) 2023 SiFive, Inc.
- * Author: Jerry Shih <jerry.shih@sifive.com>
- */
-
-#include <asm/vector.h>
-#include <crypto/internal/sha2.h>
-#include <crypto/internal/simd.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-asmlinkage void
-sha256_transform_zvknha_or_zvknhb_zvkb(struct sha256_block_state *state,
-				       const u8 *data, size_t nblocks);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_extensions);
-
-void sha256_blocks_arch(struct sha256_block_state *state,
-			const u8 *data, size_t nblocks)
-{
-	if (static_branch_likely(&have_extensions) && crypto_simd_usable()) {
-		kernel_vector_begin();
-		sha256_transform_zvknha_or_zvknhb_zvkb(state, data, nblocks);
-		kernel_vector_end();
-	} else {
-		sha256_blocks_generic(state, data, nblocks);
-	}
-}
-EXPORT_SYMBOL_GPL(sha256_blocks_arch);
-
-static int __init riscv64_sha256_mod_init(void)
-{
-	/* Both zvknha and zvknhb provide the SHA-256 instructions. */
-	if ((riscv_isa_extension_available(NULL, ZVKNHA) ||
-	     riscv_isa_extension_available(NULL, ZVKNHB)) &&
-	    riscv_isa_extension_available(NULL, ZVKB) &&
-	    riscv_vector_vlen() >= 128)
-		static_branch_enable(&have_extensions);
-	return 0;
-}
-subsys_initcall(riscv64_sha256_mod_init);
-
-static void __exit riscv64_sha256_mod_exit(void)
-{
-}
-module_exit(riscv64_sha256_mod_exit);
-
-MODULE_DESCRIPTION("SHA-256 (RISC-V accelerated)");
-MODULE_AUTHOR("Heiko Stuebner <heiko.stuebner@vrull.eu>");
-MODULE_LICENSE("GPL");
diff --git a/lib/crypto/riscv/sha256.h b/lib/crypto/riscv/sha256.h
new file mode 100644
index 000000000000..c0f79c18f119
--- /dev/null
+++ b/lib/crypto/riscv/sha256.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-256 (RISC-V accelerated)
+ *
+ * Copyright (C) 2022 VRULL GmbH
+ * Author: Heiko Stuebner <heiko.stuebner@vrull.eu>
+ *
+ * Copyright (C) 2023 SiFive, Inc.
+ * Author: Jerry Shih <jerry.shih@sifive.com>
+ */
+
+#include <asm/vector.h>
+#include <crypto/internal/simd.h>
+
+asmlinkage void
+sha256_transform_zvknha_or_zvknhb_zvkb(struct sha256_block_state *state,
+				       const u8 *data, size_t nblocks);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_extensions);
+
+static void sha256_blocks(struct sha256_block_state *state,
+			  const u8 *data, size_t nblocks)
+{	/* Vector routine when have_extensions is on and SIMD is usable, else generic. */
+	if (static_branch_likely(&have_extensions) && crypto_simd_usable()) {
+		kernel_vector_begin();	/* vector call is bracketed by begin/end */
+		sha256_transform_zvknha_or_zvknhb_zvkb(state, data, nblocks);
+		kernel_vector_end();
+	} else {
+		sha256_blocks_generic(state, data, nblocks);
+	}
+}
+
+#define sha256_mod_init_arch sha256_mod_init_arch /* NOTE(review): presumably lets the includer #ifdef-test for this hook - confirm */
+static inline void sha256_mod_init_arch(void)
+{	/* Turn on the static key that sha256_blocks() tests. */
+	/* Both zvknha and zvknhb provide the SHA-256 instructions. */
+	if ((riscv_isa_extension_available(NULL, ZVKNHA) ||
+	     riscv_isa_extension_available(NULL, ZVKNHB)) &&
+	    riscv_isa_extension_available(NULL, ZVKB) &&	/* ZVKB is required in either case */
+	    riscv_vector_vlen() >= 128)	/* and a vlen of at least 128 */
+		static_branch_enable(&have_extensions);
+}
diff --git a/lib/crypto/s390/Kconfig b/lib/crypto/s390/Kconfig
index e3f855ef4393..069b355fe51a 100644
--- a/lib/crypto/s390/Kconfig
+++ b/lib/crypto/s390/Kconfig
@@ -5,9 +5,3 @@ config CRYPTO_CHACHA_S390
 	default CRYPTO_LIB_CHACHA
 	select CRYPTO_LIB_CHACHA_GENERIC
 	select CRYPTO_ARCH_HAVE_LIB_CHACHA
-
-config CRYPTO_SHA256_S390
-	tristate
-	default CRYPTO_LIB_SHA256
-	select CRYPTO_ARCH_HAVE_LIB_SHA256
-	select CRYPTO_LIB_SHA256_GENERIC
diff --git a/lib/crypto/s390/Makefile b/lib/crypto/s390/Makefile
index 5df30f1e7930..06c2cf77178e 100644
--- a/lib/crypto/s390/Makefile
+++ b/lib/crypto/s390/Makefile
@@ -2,6 +2,3 @@
 
 obj-$(CONFIG_CRYPTO_CHACHA_S390) += chacha_s390.o
 chacha_s390-y := chacha-glue.o chacha-s390.o
-
-obj-$(CONFIG_CRYPTO_SHA256_S390) += sha256-s390.o
-sha256-s390-y := sha256.o
diff --git a/lib/crypto/s390/sha256.c b/lib/crypto/s390/sha256.c
deleted file mode 100644
index 6ebfd35a5d44..000000000000
--- a/lib/crypto/s390/sha256.c
+++ /dev/null
@@ -1,41 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * SHA-256 optimized using the CP Assist for Cryptographic Functions (CPACF)
- *
- * Copyright 2025 Google LLC
- */
-#include <asm/cpacf.h>
-#include <crypto/internal/sha2.h>
-#include <linux/cpufeature.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_cpacf_sha256);
-
-void sha256_blocks_arch(struct sha256_block_state *state,
-			const u8 *data, size_t nblocks)
-{
-	if (static_branch_likely(&have_cpacf_sha256))
-		cpacf_kimd(CPACF_KIMD_SHA_256, state, data,
-			   nblocks * SHA256_BLOCK_SIZE);
-	else
-		sha256_blocks_generic(state, data, nblocks);
-}
-EXPORT_SYMBOL_GPL(sha256_blocks_arch);
-
-static int __init sha256_s390_mod_init(void)
-{
-	if (cpu_have_feature(S390_CPU_FEATURE_MSA) &&
-	    cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_256))
-		static_branch_enable(&have_cpacf_sha256);
-	return 0;
-}
-subsys_initcall(sha256_s390_mod_init);
-
-static void __exit sha256_s390_mod_exit(void)
-{
-}
-module_exit(sha256_s390_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA-256 using the CP Assist for Cryptographic Functions (CPACF)");
diff --git a/lib/crypto/s390/sha256.h b/lib/crypto/s390/sha256.h
new file mode 100644
index 000000000000..70a81cbc06b2
--- /dev/null
+++ b/lib/crypto/s390/sha256.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-256 optimized using the CP Assist for Cryptographic Functions (CPACF)
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/cpacf.h>
+#include <linux/cpufeature.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_cpacf_sha256);
+
+static void sha256_blocks(struct sha256_block_state *state,
+			  const u8 *data, size_t nblocks)
+{	/* CPACF KIMD when have_cpacf_sha256 is on, generic code otherwise. */
+	if (static_branch_likely(&have_cpacf_sha256))
+		cpacf_kimd(CPACF_KIMD_SHA_256, state, data,
+			   nblocks * SHA256_BLOCK_SIZE);	/* length is scaled from blocks, not a block count */
+	else
+		sha256_blocks_generic(state, data, nblocks);
+}
+
+#define sha256_mod_init_arch sha256_mod_init_arch /* NOTE(review): presumably lets the includer #ifdef-test for this hook - confirm */
+static inline void sha256_mod_init_arch(void)
+{	/* Turn on the static key that sha256_blocks() tests. */
+	if (cpu_have_feature(S390_CPU_FEATURE_MSA) &&
+	    cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_256))	/* both checks must pass */
+		static_branch_enable(&have_cpacf_sha256);
+}
diff --git a/lib/crypto/sha256-generic.c b/lib/crypto/sha256-generic.c
deleted file mode 100644
index 99f904033c26..000000000000
--- a/lib/crypto/sha256-generic.c
+++ /dev/null
@@ -1,150 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * SHA-256, as specified in
- * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf
- *
- * SHA-256 code by Jean-Luc Cooke <jlcooke@certainkey.com>.
- *
- * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
- * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
- * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
- * Copyright (c) 2014 Red Hat Inc.
- */
-
-#include <crypto/internal/sha2.h>
-#include <linux/export.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/unaligned.h>
-
-static const u32 SHA256_K[] = {
-	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
-	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
-	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
-	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
-	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
-	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
-	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
-	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
-	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
-	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
-	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
-	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
-	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
-	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
-	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
-	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
-};
-
-static inline u32 Ch(u32 x, u32 y, u32 z)
-{
-	return z ^ (x & (y ^ z));
-}
-
-static inline u32 Maj(u32 x, u32 y, u32 z)
-{
-	return (x & y) | (z & (x | y));
-}
-
-#define e0(x)       (ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22))
-#define e1(x)       (ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25))
-#define s0(x)       (ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3))
-#define s1(x)       (ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10))
-
-static inline void LOAD_OP(int I, u32 *W, const u8 *input)
-{
-	W[I] = get_unaligned_be32((__u32 *)input + I);
-}
-
-static inline void BLEND_OP(int I, u32 *W)
-{
-	W[I] = s1(W[I-2]) + W[I-7] + s0(W[I-15]) + W[I-16];
-}
-
-#define SHA256_ROUND(i, a, b, c, d, e, f, g, h) do {		\
-	u32 t1, t2;						\
-	t1 = h + e1(e) + Ch(e, f, g) + SHA256_K[i] + W[i];	\
-	t2 = e0(a) + Maj(a, b, c);				\
-	d += t1;						\
-	h = t1 + t2;						\
-} while (0)
-
-static void sha256_block_generic(struct sha256_block_state *state,
-				 const u8 *input, u32 W[64])
-{
-	u32 a, b, c, d, e, f, g, h;
-	int i;
-
-	/* load the input */
-	for (i = 0; i < 16; i += 8) {
-		LOAD_OP(i + 0, W, input);
-		LOAD_OP(i + 1, W, input);
-		LOAD_OP(i + 2, W, input);
-		LOAD_OP(i + 3, W, input);
-		LOAD_OP(i + 4, W, input);
-		LOAD_OP(i + 5, W, input);
-		LOAD_OP(i + 6, W, input);
-		LOAD_OP(i + 7, W, input);
-	}
-
-	/* now blend */
-	for (i = 16; i < 64; i += 8) {
-		BLEND_OP(i + 0, W);
-		BLEND_OP(i + 1, W);
-		BLEND_OP(i + 2, W);
-		BLEND_OP(i + 3, W);
-		BLEND_OP(i + 4, W);
-		BLEND_OP(i + 5, W);
-		BLEND_OP(i + 6, W);
-		BLEND_OP(i + 7, W);
-	}
-
-	/* load the state into our registers */
-	a = state->h[0];
-	b = state->h[1];
-	c = state->h[2];
-	d = state->h[3];
-	e = state->h[4];
-	f = state->h[5];
-	g = state->h[6];
-	h = state->h[7];
-
-	/* now iterate */
-	for (i = 0; i < 64; i += 8) {
-		SHA256_ROUND(i + 0, a, b, c, d, e, f, g, h);
-		SHA256_ROUND(i + 1, h, a, b, c, d, e, f, g);
-		SHA256_ROUND(i + 2, g, h, a, b, c, d, e, f);
-		SHA256_ROUND(i + 3, f, g, h, a, b, c, d, e);
-		SHA256_ROUND(i + 4, e, f, g, h, a, b, c, d);
-		SHA256_ROUND(i + 5, d, e, f, g, h, a, b, c);
-		SHA256_ROUND(i + 6, c, d, e, f, g, h, a, b);
-		SHA256_ROUND(i + 7, b, c, d, e, f, g, h, a);
-	}
-
-	state->h[0] += a;
-	state->h[1] += b;
-	state->h[2] += c;
-	state->h[3] += d;
-	state->h[4] += e;
-	state->h[5] += f;
-	state->h[6] += g;
-	state->h[7] += h;
-}
-
-void sha256_blocks_generic(struct sha256_block_state *state,
-			   const u8 *data, size_t nblocks)
-{
-	u32 W[64];
-
-	do {
-		sha256_block_generic(state, data, W);
-		data += SHA256_BLOCK_SIZE;
-	} while (--nblocks);
-
-	memzero_explicit(W, sizeof(W));
-}
-EXPORT_SYMBOL_GPL(sha256_blocks_generic);
-
-MODULE_DESCRIPTION("SHA-256 Algorithm (generic implementation)");
-MODULE_LICENSE("GPL");
diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c
index 12b4b59052c4..68936d5cd774 100644
--- a/lib/crypto/sha256.c
+++ b/lib/crypto/sha256.c
@@ -6,15 +6,17 @@
  * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
  * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
  * Copyright (c) 2014 Red Hat Inc.
+ * Copyright 2025 Google LLC
  */
 
 #include <crypto/hmac.h>
 #include <crypto/internal/blockhash.h>
-#include <crypto/internal/sha2.h>
+#include <crypto/sha2.h>
 #include <linux/export.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/string.h>
+#include <linux/unaligned.h>
 #include <linux/wordpart.h>
 
 static const struct sha256_block_state sha224_iv = {
@@ -31,26 +33,128 @@ static const struct sha256_block_state sha256_iv = {
 	},
 };
 
-/*
- * If __DISABLE_EXPORTS is defined, then this file is being compiled for a
- * pre-boot environment.  In that case, ignore the kconfig options, pull the
- * generic code into the same translation unit, and use that only.
- */
-#ifdef __DISABLE_EXPORTS
-#include "sha256-generic.c"
-#endif
+static const u32 sha256_K[64] = {
+	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1,
+	0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786,
+	0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147,
+	0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b,
+	0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a,
+	0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
+};
+
+#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
+#define Maj(x, y, z) (((x) & (y)) | ((z) & ((x) | (y))))
+#define e0(x) (ror32((x), 2) ^ ror32((x), 13) ^ ror32((x), 22))
+#define e1(x) (ror32((x), 6) ^ ror32((x), 11) ^ ror32((x), 25))
+#define s0(x) (ror32((x), 7) ^ ror32((x), 18) ^ ((x) >> 3))
+#define s1(x) (ror32((x), 17) ^ ror32((x), 19) ^ ((x) >> 10))
+
+static inline void LOAD_OP(int I, u32 *W, const u8 *input)
+{
+	W[I] = get_unaligned_be32((__u32 *)input + I);
+}
+
+static inline void BLEND_OP(int I, u32 *W)
+{
+	W[I] = s1(W[I - 2]) + W[I - 7] + s0(W[I - 15]) + W[I - 16];
+}
 
-static inline bool sha256_purgatory(void)
+#define SHA256_ROUND(i, a, b, c, d, e, f, g, h)                    \
+	do {                                                       \
+		u32 t1, t2;                                        \
+		t1 = h + e1(e) + Ch(e, f, g) + sha256_K[i] + W[i]; \
+		t2 = e0(a) + Maj(a, b, c);                         \
+		d += t1;                                           \
+		h = t1 + t2;                                       \
+	} while (0)
+
+static void sha256_block_generic(struct sha256_block_state *state,
+				 const u8 *input, u32 W[64])
 {
-	return __is_defined(__DISABLE_EXPORTS);
+	u32 a, b, c, d, e, f, g, h;
+	int i;
+
+	/* load the input */
+	for (i = 0; i < 16; i += 8) {
+		LOAD_OP(i + 0, W, input);
+		LOAD_OP(i + 1, W, input);
+		LOAD_OP(i + 2, W, input);
+		LOAD_OP(i + 3, W, input);
+		LOAD_OP(i + 4, W, input);
+		LOAD_OP(i + 5, W, input);
+		LOAD_OP(i + 6, W, input);
+		LOAD_OP(i + 7, W, input);
+	}
+
+	/* now blend */
+	for (i = 16; i < 64; i += 8) {
+		BLEND_OP(i + 0, W);
+		BLEND_OP(i + 1, W);
+		BLEND_OP(i + 2, W);
+		BLEND_OP(i + 3, W);
+		BLEND_OP(i + 4, W);
+		BLEND_OP(i + 5, W);
+		BLEND_OP(i + 6, W);
+		BLEND_OP(i + 7, W);
+	}
+
+	/* load the state into our registers */
+	a = state->h[0];
+	b = state->h[1];
+	c = state->h[2];
+	d = state->h[3];
+	e = state->h[4];
+	f = state->h[5];
+	g = state->h[6];
+	h = state->h[7];
+
+	/* now iterate */
+	for (i = 0; i < 64; i += 8) {
+		SHA256_ROUND(i + 0, a, b, c, d, e, f, g, h);
+		SHA256_ROUND(i + 1, h, a, b, c, d, e, f, g);
+		SHA256_ROUND(i + 2, g, h, a, b, c, d, e, f);
+		SHA256_ROUND(i + 3, f, g, h, a, b, c, d, e);
+		SHA256_ROUND(i + 4, e, f, g, h, a, b, c, d);
+		SHA256_ROUND(i + 5, d, e, f, g, h, a, b, c);
+		SHA256_ROUND(i + 6, c, d, e, f, g, h, a, b);
+		SHA256_ROUND(i + 7, b, c, d, e, f, g, h, a);
+	}
+
+	state->h[0] += a;
+	state->h[1] += b;
+	state->h[2] += c;
+	state->h[3] += d;
+	state->h[4] += e;
+	state->h[5] += f;
+	state->h[6] += g;
+	state->h[7] += h;
 }
 
-static inline void sha256_blocks(struct sha256_block_state *state,
-				 const u8 *data, size_t nblocks)
+static void __maybe_unused
+sha256_blocks_generic(struct sha256_block_state *state,
+		      const u8 *data, size_t nblocks)
 {
-	sha256_choose_blocks(state->h, data, nblocks, sha256_purgatory(), false);
+	u32 W[64];
+
+	do {
+		sha256_block_generic(state, data, W);
+		data += SHA256_BLOCK_SIZE;
+	} while (--nblocks);
+
+	memzero_explicit(W, sizeof(W));
 }
 
+#if defined(CONFIG_CRYPTO_LIB_SHA256_ARCH) && !defined(__DISABLE_EXPORTS)
+#include "sha256.h" /* $(SRCARCH)/sha256.h */
+#else
+#define sha256_blocks sha256_blocks_generic
+#endif
+
 static void __sha256_init(struct __sha256_ctx *ctx,
 			  const struct sha256_block_state *iv,
 			  u64 initial_bytecount)
@@ -273,5 +377,19 @@ void hmac_sha256_usingrawkey(const u8 *raw_key, size_t raw_key_len,
 EXPORT_SYMBOL_GPL(hmac_sha256_usingrawkey);
 #endif /* !__DISABLE_EXPORTS */
 
+#ifdef sha256_mod_init_arch
+static int __init sha256_mod_init(void)
+{
+	sha256_mod_init_arch();
+	return 0;
+}
+subsys_initcall(sha256_mod_init);
+
+static void __exit sha256_mod_exit(void)
+{
+}
+module_exit(sha256_mod_exit);
+#endif
+
 MODULE_DESCRIPTION("SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256 library functions");
 MODULE_LICENSE("GPL");
diff --git a/lib/crypto/sparc/Kconfig b/lib/crypto/sparc/Kconfig
deleted file mode 100644
index e5c3e4d3dba6..000000000000
--- a/lib/crypto/sparc/Kconfig
+++ /dev/null
@@ -1,8 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-config CRYPTO_SHA256_SPARC64
-	tristate
-	depends on SPARC64
-	default CRYPTO_LIB_SHA256
-	select CRYPTO_ARCH_HAVE_LIB_SHA256
-	select CRYPTO_LIB_SHA256_GENERIC
diff --git a/lib/crypto/sparc/Makefile b/lib/crypto/sparc/Makefile
deleted file mode 100644
index 75ee244ad6f7..000000000000
--- a/lib/crypto/sparc/Makefile
+++ /dev/null
@@ -1,4 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-obj-$(CONFIG_CRYPTO_SHA256_SPARC64) += sha256-sparc64.o
-sha256-sparc64-y := sha256.o sha256_asm.o
diff --git a/lib/crypto/sparc/sha256.c b/lib/crypto/sparc/sha256.c
deleted file mode 100644
index f41c109c1c18..000000000000
--- a/lib/crypto/sparc/sha256.c
+++ /dev/null
@@ -1,58 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * SHA-256 accelerated using the sparc64 sha256 opcodes
- *
- * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
- * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
- * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
- * SHA224 Support Copyright 2007 Intel Corporation <jonathan.lynch@intel.com>
- */
-
-#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
-
-#include <asm/elf.h>
-#include <asm/opcodes.h>
-#include <asm/pstate.h>
-#include <crypto/internal/sha2.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha256_opcodes);
-
-asmlinkage void sha256_sparc64_transform(struct sha256_block_state *state,
-					 const u8 *data, size_t nblocks);
-
-void sha256_blocks_arch(struct sha256_block_state *state,
-			const u8 *data, size_t nblocks)
-{
-	if (static_branch_likely(&have_sha256_opcodes))
-		sha256_sparc64_transform(state, data, nblocks);
-	else
-		sha256_blocks_generic(state, data, nblocks);
-}
-EXPORT_SYMBOL_GPL(sha256_blocks_arch);
-
-static int __init sha256_sparc64_mod_init(void)
-{
-	unsigned long cfr;
-
-	if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
-		return 0;
-
-	__asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
-	if (!(cfr & CFR_SHA256))
-		return 0;
-
-	static_branch_enable(&have_sha256_opcodes);
-	pr_info("Using sparc64 sha256 opcode optimized SHA-256/SHA-224 implementation\n");
-	return 0;
-}
-subsys_initcall(sha256_sparc64_mod_init);
-
-static void __exit sha256_sparc64_mod_exit(void)
-{
-}
-module_exit(sha256_sparc64_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA-256 accelerated using the sparc64 sha256 opcodes");
diff --git a/lib/crypto/sparc/sha256.h b/lib/crypto/sparc/sha256.h
new file mode 100644
index 000000000000..1d10108eb195
--- /dev/null
+++ b/lib/crypto/sparc/sha256.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * SHA-256 accelerated using the sparc64 sha256 opcodes
+ *
+ * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ * SHA224 Support Copyright 2007 Intel Corporation <jonathan.lynch@intel.com>
+ */
+
+#include <asm/elf.h>
+#include <asm/opcodes.h>
+#include <asm/pstate.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha256_opcodes);
+
+asmlinkage void sha256_sparc64_transform(struct sha256_block_state *state,
+					 const u8 *data, size_t nblocks);
+
+static void sha256_blocks(struct sha256_block_state *state,
+			  const u8 *data, size_t nblocks)
+{
+	if (static_branch_likely(&have_sha256_opcodes))
+		sha256_sparc64_transform(state, data, nblocks);
+	else
+		sha256_blocks_generic(state, data, nblocks);
+}
+
+#define sha256_mod_init_arch sha256_mod_init_arch
+static inline void sha256_mod_init_arch(void)
+{
+	unsigned long cfr;
+
+	if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
+		return;
+
+	__asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
+	if (!(cfr & CFR_SHA256))
+		return;
+
+	static_branch_enable(&have_sha256_opcodes);
+	pr_info("Using sparc64 sha256 opcode optimized SHA-256/SHA-224 implementation\n");
+}
diff --git a/lib/crypto/x86/Kconfig b/lib/crypto/x86/Kconfig
index e344579db3d8..546fe2afe0b5 100644
--- a/lib/crypto/x86/Kconfig
+++ b/lib/crypto/x86/Kconfig
@@ -24,10 +24,3 @@ config CRYPTO_POLY1305_X86_64
 	depends on 64BIT
 	default CRYPTO_LIB_POLY1305
 	select CRYPTO_ARCH_HAVE_LIB_POLY1305
-
-config CRYPTO_SHA256_X86_64
-	tristate
-	depends on 64BIT
-	default CRYPTO_LIB_SHA256
-	select CRYPTO_ARCH_HAVE_LIB_SHA256
-	select CRYPTO_LIB_SHA256_GENERIC
diff --git a/lib/crypto/x86/Makefile b/lib/crypto/x86/Makefile
index abceca3d31c0..c2ff8c5f1046 100644
--- a/lib/crypto/x86/Makefile
+++ b/lib/crypto/x86/Makefile
@@ -10,9 +10,6 @@ obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o
 poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o
 targets += poly1305-x86_64-cryptogams.S
 
-obj-$(CONFIG_CRYPTO_SHA256_X86_64) += sha256-x86_64.o
-sha256-x86_64-y := sha256.o sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256-ni-asm.o
-
 quiet_cmd_perlasm = PERLASM $@
       cmd_perlasm = $(PERL) $< > $@
 
diff --git a/lib/crypto/x86/sha256.c b/lib/crypto/x86/sha256.c
deleted file mode 100644
index 9ee38d2b3d57..000000000000
--- a/lib/crypto/x86/sha256.c
+++ /dev/null
@@ -1,68 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * SHA-256 optimized for x86_64
- *
- * Copyright 2025 Google LLC
- */
-#include <asm/fpu/api.h>
-#include <crypto/internal/sha2.h>
-#include <crypto/internal/simd.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/static_call.h>
-
-asmlinkage void sha256_transform_ssse3(struct sha256_block_state *state,
-				       const u8 *data, size_t nblocks);
-asmlinkage void sha256_transform_avx(struct sha256_block_state *state,
-				     const u8 *data, size_t nblocks);
-asmlinkage void sha256_transform_rorx(struct sha256_block_state *state,
-				      const u8 *data, size_t nblocks);
-asmlinkage void sha256_ni_transform(struct sha256_block_state *state,
-				    const u8 *data, size_t nblocks);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha256_x86);
-
-DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_transform_ssse3);
-
-void sha256_blocks_arch(struct sha256_block_state *state,
-			const u8 *data, size_t nblocks)
-{
-	if (static_branch_likely(&have_sha256_x86) && crypto_simd_usable()) {
-		kernel_fpu_begin();
-		static_call(sha256_blocks_x86)(state, data, nblocks);
-		kernel_fpu_end();
-	} else {
-		sha256_blocks_generic(state, data, nblocks);
-	}
-}
-EXPORT_SYMBOL_GPL(sha256_blocks_arch);
-
-static int __init sha256_x86_mod_init(void)
-{
-	if (boot_cpu_has(X86_FEATURE_SHA_NI)) {
-		static_call_update(sha256_blocks_x86, sha256_ni_transform);
-	} else if (cpu_has_xfeatures(XFEATURE_MASK_SSE |
-				     XFEATURE_MASK_YMM, NULL) &&
-		   boot_cpu_has(X86_FEATURE_AVX)) {
-		if (boot_cpu_has(X86_FEATURE_AVX2) &&
-		    boot_cpu_has(X86_FEATURE_BMI2))
-			static_call_update(sha256_blocks_x86,
-					   sha256_transform_rorx);
-		else
-			static_call_update(sha256_blocks_x86,
-					   sha256_transform_avx);
-	} else if (!boot_cpu_has(X86_FEATURE_SSSE3)) {
-		return 0;
-	}
-	static_branch_enable(&have_sha256_x86);
-	return 0;
-}
-subsys_initcall(sha256_x86_mod_init);
-
-static void __exit sha256_x86_mod_exit(void)
-{
-}
-module_exit(sha256_x86_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA-256 optimized for x86_64");
diff --git a/lib/crypto/x86/sha256.h b/lib/crypto/x86/sha256.h
new file mode 100644
index 000000000000..3b5456c222ba
--- /dev/null
+++ b/lib/crypto/x86/sha256.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-256 optimized for x86_64
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/fpu/api.h>
+#include <crypto/internal/simd.h>
+#include <linux/static_call.h>
+
+asmlinkage void sha256_transform_ssse3(struct sha256_block_state *state,
+				       const u8 *data, size_t nblocks);
+asmlinkage void sha256_transform_avx(struct sha256_block_state *state,
+				     const u8 *data, size_t nblocks);
+asmlinkage void sha256_transform_rorx(struct sha256_block_state *state,
+				      const u8 *data, size_t nblocks);
+asmlinkage void sha256_ni_transform(struct sha256_block_state *state,
+				    const u8 *data, size_t nblocks);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha256_x86);
+
+DEFINE_STATIC_CALL(sha256_blocks_x86, sha256_transform_ssse3);
+
+static void sha256_blocks(struct sha256_block_state *state,
+			  const u8 *data, size_t nblocks)
+{
+	if (static_branch_likely(&have_sha256_x86) && crypto_simd_usable()) {
+		kernel_fpu_begin();
+		static_call(sha256_blocks_x86)(state, data, nblocks);
+		kernel_fpu_end();
+	} else {
+		sha256_blocks_generic(state, data, nblocks);
+	}
+}
+
+#define sha256_mod_init_arch sha256_mod_init_arch
+static inline void sha256_mod_init_arch(void)
+{
+	if (boot_cpu_has(X86_FEATURE_SHA_NI)) {
+		static_call_update(sha256_blocks_x86, sha256_ni_transform);
+	} else if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+				     NULL) &&
+		   boot_cpu_has(X86_FEATURE_AVX)) {
+		if (boot_cpu_has(X86_FEATURE_AVX2) &&
+		    boot_cpu_has(X86_FEATURE_BMI2))
+			static_call_update(sha256_blocks_x86,
+					   sha256_transform_rorx);
+		else
+			static_call_update(sha256_blocks_x86,
+					   sha256_transform_avx);
+	} else if (!boot_cpu_has(X86_FEATURE_SSSE3)) {
+		return;
+	}
+	static_branch_enable(&have_sha256_x86);
+}
-- 
cgit v1.2.3


From b34c9803aabd85189ffacc0d3cdb9ce4515c2b4d Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Mon, 30 Jun 2025 09:06:45 -0700
Subject: lib/crypto: sha256: Document the SHA-224 and SHA-256 API

Add kerneldoc comments, consistent with the kerneldoc comments of the
SHA-384 and SHA-512 API.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250630160645.3198-15-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 include/crypto/sha2.h | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)

(limited to 'include')

diff --git a/include/crypto/sha2.h b/include/crypto/sha2.h
index 2e3fc2cf4aa0..e0a08f6addd0 100644
--- a/include/crypto/sha2.h
+++ b/include/crypto/sha2.h
@@ -155,13 +155,51 @@ struct sha224_ctx {
 	struct __sha256_ctx ctx;
 };
 
+/**
+ * sha224_init() - Initialize a SHA-224 context for a new message
+ * @ctx: the context to initialize
+ *
+ * If you don't need incremental computation, consider sha224() instead.
+ *
+ * Context: Any context.
+ */
 void sha224_init(struct sha224_ctx *ctx);
+
+/**
+ * sha224_update() - Update a SHA-224 context with message data
+ * @ctx: the context to update; must have been initialized
+ * @data: the message data
+ * @len: the data length in bytes
+ *
+ * This can be called any number of times.
+ *
+ * Context: Any context.
+ */
 static inline void sha224_update(struct sha224_ctx *ctx,
 				 const u8 *data, size_t len)
 {
 	__sha256_update(&ctx->ctx, data, len);
 }
+
+/**
+ * sha224_final() - Finish computing a SHA-224 message digest
+ * @ctx: the context to finalize; must have been initialized
+ * @out: (output) the resulting SHA-224 message digest
+ *
+ * After finishing, this zeroizes @ctx.  So the caller does not need to do it.
+ *
+ * Context: Any context.
+ */
 void sha224_final(struct sha224_ctx *ctx, u8 out[SHA224_DIGEST_SIZE]);
+
+/**
+ * sha224() - Compute SHA-224 message digest in one shot
+ * @data: the message data
+ * @len: the data length in bytes
+ * @out: (output) the resulting SHA-224 message digest
+ *
+ * Context: Any context.
+ */
 void sha224(const u8 *data, size_t len, u8 out[SHA224_DIGEST_SIZE]);
 
 /**
@@ -275,13 +313,51 @@ struct sha256_ctx {
 	struct __sha256_ctx ctx;
 };
 
+/**
+ * sha256_init() - Initialize a SHA-256 context for a new message
+ * @ctx: the context to initialize
+ *
+ * If you don't need incremental computation, consider sha256() instead.
+ *
+ * Context: Any context.
+ */
 void sha256_init(struct sha256_ctx *ctx);
+
+/**
+ * sha256_update() - Update a SHA-256 context with message data
+ * @ctx: the context to update; must have been initialized
+ * @data: the message data
+ * @len: the data length in bytes
+ *
+ * This can be called any number of times.
+ *
+ * Context: Any context.
+ */
 static inline void sha256_update(struct sha256_ctx *ctx,
 				 const u8 *data, size_t len)
 {
 	__sha256_update(&ctx->ctx, data, len);
 }
+
+/**
+ * sha256_final() - Finish computing a SHA-256 message digest
+ * @ctx: the context to finalize; must have been initialized
+ * @out: (output) the resulting SHA-256 message digest
+ *
+ * After finishing, this zeroizes @ctx.  So the caller does not need to do it.
+ *
+ * Context: Any context.
+ */
 void sha256_final(struct sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]);
+
+/**
+ * sha256() - Compute SHA-256 message digest in one shot
+ * @data: the message data
+ * @len: the data length in bytes
+ * @out: (output) the resulting SHA-256 message digest
+ *
+ * Context: Any context.
+ */
 void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]);
 
 /**
-- 
cgit v1.2.3


From cdd73b1666079a73d061396f361df55d59fe96e6 Mon Sep 17 00:00:00 2001
From: Ariel Otilibili <ariel.otilibili-anieli@eurecom.fr>
Date: Wed, 2 Jul 2025 12:00:21 +0200
Subject: uapi: fix broken link in linux/capability.h

The link to the libcap library is outdated. Instead, use a link to the
libcap2 library.

As well, give the complete reference of the POSIX compliance.

Signed-off-by: Ariel Otilibili <ariel.otilibili-anieli@eurecom.fr>
Acked-by: Andrew G. Morgan <morgan@kernel.org>
Reviewed-by: Paul Moore <paul@paul-moore.com>
Signed-off-by: Serge Hallyn <sergeh@kernel.org>
---
 include/uapi/linux/capability.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
index 2e21b5594f81..ea5a0899ecf0 100644
--- a/include/uapi/linux/capability.h
+++ b/include/uapi/linux/capability.h
@@ -6,9 +6,10 @@
  * Alexander Kjeldaas <astor@guardian.no>
  * with help from Aleph1, Roland Buresund and Andrew Main.
  *
- * See here for the libcap library ("POSIX draft" compliance):
+ * See here for the libcap2 library (compliant with Section 25 of
+ * the withdrawn POSIX 1003.1e Draft 17):
  *
- * ftp://www.kernel.org/pub/linux/libs/security/linux-privs/kernel-2.6/
+ * https://www.kernel.org/pub/linux/libs/security/linux-privs/libcap2/
  */
 
 #ifndef _UAPI_LINUX_CAPABILITY_H
-- 
cgit v1.2.3


From 9c06f26ba5f5da14bcac405c7a652dcf578a785d Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@baylibre.com>
Date: Wed, 30 Apr 2025 13:56:01 +0200
Subject: pwm: Add support for pwmchip devices for faster and easier userspace
 access
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With this change each pwmchip defining the new-style waveform callbacks
can be accessed from userspace via a character device. Compared to the
sysfs-API this is faster and allows passing the whole configuration in a
single ioctl, allowing atomic application and thus reducing glitches.

On an STM32MP13 I see:

	root@DistroKit:~ time pwmtestperf
	real	0m 1.27s
	user	0m 0.02s
	sys	0m 1.21s
	root@DistroKit:~ rm /dev/pwmchip0
	root@DistroKit:~ time pwmtestperf
	real	0m 3.61s
	user	0m 0.27s
	sys	0m 3.26s

pwmtestperf does essentially:

	for i in 0 .. 50000:
		pwm_set_waveform(duty_length_ns=i, period_length_ns=50000, duty_offset_ns=0)

and in the presence of /dev/pwmchip0 it uses the ioctls introduced here,
without that device it uses /sys/class/pwm/pwmchip0.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@baylibre.com>
Link: https://lore.kernel.org/r/ad4a4e49ae3f8ea81e23cac1ac12b338c3bf5c5b.1746010245.git.u.kleine-koenig@baylibre.com
Signed-off-by: Uwe Kleine-König <ukleinek@kernel.org>
---
 drivers/pwm/core.c       | 322 ++++++++++++++++++++++++++++++++++++++++++++---
 include/linux/pwm.h      |   3 +
 include/uapi/linux/pwm.h |  53 ++++++++
 3 files changed, 363 insertions(+), 15 deletions(-)
 create mode 100644 include/uapi/linux/pwm.h

(limited to 'include')

diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index edf776b8ad53..50aa0528a265 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -23,9 +23,13 @@
 
 #include <dt-bindings/pwm/pwm.h>
 
+#include <uapi/linux/pwm.h>
+
 #define CREATE_TRACE_POINTS
 #include <trace/events/pwm.h>
 
+#define PWM_MINOR_COUNT 256
+
 /* protects access to pwm_chips */
 static DEFINE_MUTEX(pwm_lock);
 
@@ -2007,20 +2011,9 @@ struct pwm_device *pwm_get(struct device *dev, const char *con_id)
 }
 EXPORT_SYMBOL_GPL(pwm_get);
 
-/**
- * pwm_put() - release a PWM device
- * @pwm: PWM device
- */
-void pwm_put(struct pwm_device *pwm)
+static void __pwm_put(struct pwm_device *pwm)
 {
-	struct pwm_chip *chip;
-
-	if (!pwm)
-		return;
-
-	chip = pwm->chip;
-
-	guard(mutex)(&pwm_lock);
+	struct pwm_chip *chip = pwm->chip;
 
 	/*
 	 * Trigger a warning if a consumer called pwm_put() twice.
@@ -2041,6 +2034,20 @@ void pwm_put(struct pwm_device *pwm)
 
 	module_put(chip->owner);
 }
+
+/**
+ * pwm_put() - release a PWM device
+ * @pwm: PWM device
+ */
+void pwm_put(struct pwm_device *pwm)
+{
+	if (!pwm)
+		return;
+
+	guard(mutex)(&pwm_lock);
+
+	__pwm_put(pwm);
+}
 EXPORT_SYMBOL_GPL(pwm_put);
 
 static void devm_pwm_release(void *pwm)
@@ -2110,6 +2117,274 @@ struct pwm_device *devm_fwnode_pwm_get(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(devm_fwnode_pwm_get);
 
+struct pwm_cdev_data {
+	struct pwm_chip *chip;
+	struct pwm_device *pwm[];
+};
+
+static int pwm_cdev_open(struct inode *inode, struct file *file)
+{
+	struct pwm_chip *chip = container_of(inode->i_cdev, struct pwm_chip, cdev);
+	struct pwm_cdev_data *cdata;
+
+	guard(mutex)(&pwm_lock);
+
+	if (!chip->operational)
+		return -ENXIO;
+
+	cdata = kzalloc(struct_size(cdata, pwm, chip->npwm), GFP_KERNEL);
+	if (!cdata)
+		return -ENOMEM;
+
+	cdata->chip = chip;
+
+	file->private_data = cdata;
+
+	return nonseekable_open(inode, file);
+}
+
+static int pwm_cdev_release(struct inode *inode, struct file *file)
+{
+	struct pwm_cdev_data *cdata = file->private_data;
+	unsigned int i;
+
+	for (i = 0; i < cdata->chip->npwm; ++i) {
+		struct pwm_device *pwm = cdata->pwm[i];
+
+		if (pwm) {
+			const char *label = pwm->label;
+
+			pwm_put(cdata->pwm[i]);
+			kfree(label);
+		}
+	}
+	kfree(cdata);
+
+	return 0;
+}
+
+static int pwm_cdev_request(struct pwm_cdev_data *cdata, unsigned int hwpwm)
+{
+	struct pwm_chip *chip = cdata->chip;
+
+	if (hwpwm >= chip->npwm)
+		return -EINVAL;
+
+	if (!cdata->pwm[hwpwm]) {
+		struct pwm_device *pwm = &chip->pwms[hwpwm];
+		const char *label;
+		int ret;
+
+		label = kasprintf(GFP_KERNEL, "pwm-cdev (pid=%d)", current->pid);
+		if (!label)
+			return -ENOMEM;
+
+		ret = pwm_device_request(pwm, label);
+		if (ret < 0) {
+			kfree(label);
+			return ret;
+		}
+
+		cdata->pwm[hwpwm] = pwm;
+	}
+
+	return 0;
+}
+
+static int pwm_cdev_free(struct pwm_cdev_data *cdata, unsigned int hwpwm)
+{
+	struct pwm_chip *chip = cdata->chip;
+
+	if (hwpwm >= chip->npwm)
+		return -EINVAL;
+
+	if (cdata->pwm[hwpwm]) {
+		struct pwm_device *pwm = cdata->pwm[hwpwm];
+		const char *label = pwm->label;
+
+		__pwm_put(pwm);
+
+		kfree(label);
+
+		cdata->pwm[hwpwm] = NULL;
+	}
+
+	return 0;
+}
+
+static struct pwm_device *pwm_cdev_get_requested_pwm(struct pwm_cdev_data *cdata,
+						     u32 hwpwm)
+{
+	struct pwm_chip *chip = cdata->chip;
+
+	if (hwpwm >= chip->npwm)
+		return ERR_PTR(-EINVAL);
+
+	if (cdata->pwm[hwpwm])
+		return cdata->pwm[hwpwm];
+
+	return ERR_PTR(-EINVAL);
+}
+
+static long pwm_cdev_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	int ret = 0;
+	struct pwm_cdev_data *cdata = file->private_data;
+	struct pwm_chip *chip = cdata->chip;
+
+	guard(mutex)(&pwm_lock);
+
+	if (!chip->operational)
+		return -ENODEV;
+
+	switch (cmd) {
+	case PWM_IOCTL_REQUEST:
+		{
+			unsigned int hwpwm = arg;
+
+			return pwm_cdev_request(cdata, hwpwm);
+		}
+
+	case PWM_IOCTL_FREE:
+		{
+			unsigned int hwpwm = arg;
+
+			return pwm_cdev_free(cdata, hwpwm);
+		}
+
+	case PWM_IOCTL_ROUNDWF:
+		{
+			struct pwmchip_waveform cwf;
+			struct pwm_waveform wf;
+			struct pwm_device *pwm;
+
+			ret = copy_from_user(&cwf,
+					     (struct pwmchip_waveform __user *)arg,
+					     sizeof(cwf));
+			if (ret)
+				return -EFAULT;
+
+			if (cwf.__pad != 0)
+				return -EINVAL;
+
+			pwm = pwm_cdev_get_requested_pwm(cdata, cwf.hwpwm);
+			if (IS_ERR(pwm))
+				return PTR_ERR(pwm);
+
+			wf = (struct pwm_waveform) {
+				.period_length_ns = cwf.period_length_ns,
+				.duty_length_ns = cwf.duty_length_ns,
+				.duty_offset_ns = cwf.duty_offset_ns,
+			};
+
+			ret = pwm_round_waveform_might_sleep(pwm, &wf);
+			if (ret < 0)
+				return ret;
+
+			cwf = (struct pwmchip_waveform) {
+				.hwpwm = cwf.hwpwm,
+				.period_length_ns = wf.period_length_ns,
+				.duty_length_ns = wf.duty_length_ns,
+				.duty_offset_ns = wf.duty_offset_ns,
+			};
+
+			return copy_to_user((struct pwmchip_waveform __user *)arg,
+					    &cwf, sizeof(cwf)) ? -EFAULT : 0;
+		}
+
+	case PWM_IOCTL_GETWF:
+		{
+			struct pwmchip_waveform cwf;
+			struct pwm_waveform wf;
+			struct pwm_device *pwm;
+
+			ret = copy_from_user(&cwf,
+					     (struct pwmchip_waveform __user *)arg,
+					     sizeof(cwf));
+			if (ret)
+				return -EFAULT;
+
+			if (cwf.__pad != 0)
+				return -EINVAL;
+
+			pwm = pwm_cdev_get_requested_pwm(cdata, cwf.hwpwm);
+			if (IS_ERR(pwm))
+				return PTR_ERR(pwm);
+
+			ret = pwm_get_waveform_might_sleep(pwm, &wf);
+			if (ret)
+				return ret;
+
+			cwf = (struct pwmchip_waveform) {
+				.hwpwm = cwf.hwpwm,
+				.period_length_ns = wf.period_length_ns,
+				.duty_length_ns = wf.duty_length_ns,
+				.duty_offset_ns = wf.duty_offset_ns,
+			};
+
+			return copy_to_user((struct pwmchip_waveform __user *)arg,
+					    &cwf, sizeof(cwf)) ? -EFAULT : 0;
+		}
+
+	case PWM_IOCTL_SETROUNDEDWF:
+	case PWM_IOCTL_SETEXACTWF:
+		{
+			struct pwmchip_waveform cwf;
+			struct pwm_waveform wf;
+			struct pwm_device *pwm;
+
+			ret = copy_from_user(&cwf,
+					     (struct pwmchip_waveform __user *)arg,
+					     sizeof(cwf));
+			if (ret)
+				return -EFAULT;
+
+			if (cwf.__pad != 0)
+				return -EINVAL;
+
+			wf = (struct pwm_waveform){
+				.period_length_ns = cwf.period_length_ns,
+				.duty_length_ns = cwf.duty_length_ns,
+				.duty_offset_ns = cwf.duty_offset_ns,
+			};
+
+			if (!pwm_wf_valid(&wf))
+				return -EINVAL;
+
+			pwm = pwm_cdev_get_requested_pwm(cdata, cwf.hwpwm);
+			if (IS_ERR(pwm))
+				return PTR_ERR(pwm);
+
+			ret = pwm_set_waveform_might_sleep(pwm, &wf,
+							   cmd == PWM_IOCTL_SETEXACTWF);
+
+			/*
+			 * If userspace cares about rounding deviations it has
+			 * to check the values anyhow, so simplify handling for
+			 * them and don't signal uprounding. This matches the
+			 * behaviour of PWM_IOCTL_ROUNDWF which also returns 0
+			 * in that case.
+			 */
+			if (ret == 1)
+				ret = 0;
+
+			return ret;
+		}
+
+	default:
+		return -ENOTTY;
+	}
+}
+
+static const struct file_operations pwm_cdev_fileops = {
+	.open = pwm_cdev_open,
+	.release = pwm_cdev_release,
+	.owner = THIS_MODULE,
+	.unlocked_ioctl = pwm_cdev_ioctl,
+};
+
+static dev_t pwm_devt;
+
 /**
  * __pwmchip_add() - register a new PWM chip
  * @chip: the PWM chip to add
@@ -2162,7 +2437,17 @@ int __pwmchip_add(struct pwm_chip *chip, struct module *owner)
 	scoped_guard(pwmchip, chip)
 		chip->operational = true;
 
-	ret = device_add(&chip->dev);
+	if (chip->ops->write_waveform) {
+		if (chip->id < PWM_MINOR_COUNT)
+			chip->dev.devt = MKDEV(MAJOR(pwm_devt), chip->id);
+		else
+			dev_warn(&chip->dev, "chip id too high to create a chardev\n");
+	}
+
+	cdev_init(&chip->cdev, &pwm_cdev_fileops);
+	chip->cdev.owner = owner;
+
+	ret = cdev_device_add(&chip->cdev, &chip->dev);
 	if (ret)
 		goto err_device_add;
 
@@ -2213,7 +2498,7 @@ void pwmchip_remove(struct pwm_chip *chip)
 		idr_remove(&pwm_chips, chip->id);
 	}
 
-	device_del(&chip->dev);
+	cdev_device_del(&chip->cdev, &chip->dev);
 }
 EXPORT_SYMBOL_GPL(pwmchip_remove);
 
@@ -2357,9 +2642,16 @@ static int __init pwm_init(void)
 {
 	int ret;
 
+	ret = alloc_chrdev_region(&pwm_devt, 0, PWM_MINOR_COUNT, "pwm");
+	if (ret) {
+		pr_err("Failed to initialize chrdev region for PWM usage\n");
+		return ret;
+	}
+
 	ret = class_register(&pwm_class);
 	if (ret) {
 		pr_err("Failed to initialize PWM class (%pe)\n", ERR_PTR(ret));
+		unregister_chrdev_region(pwm_devt, PWM_MINOR_COUNT);
 		return ret;
 	}
 
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 63a17d2b4ec8..2492c91452f9 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -2,6 +2,7 @@
 #ifndef __LINUX_PWM_H
 #define __LINUX_PWM_H
 
+#include <linux/cdev.h>
 #include <linux/device.h>
 #include <linux/err.h>
 #include <linux/module.h>
@@ -311,6 +312,7 @@ struct pwm_ops {
 /**
  * struct pwm_chip - abstract a PWM controller
  * @dev: device providing the PWMs
+ * @cdev: &struct cdev for this device
  * @ops: callbacks for this PWM controller
  * @owner: module providing this chip
  * @id: unique number of this PWM chip
@@ -325,6 +327,7 @@ struct pwm_ops {
  */
 struct pwm_chip {
 	struct device dev;
+	struct cdev cdev;
 	const struct pwm_ops *ops;
 	struct module *owner;
 	unsigned int id;
diff --git a/include/uapi/linux/pwm.h b/include/uapi/linux/pwm.h
new file mode 100644
index 000000000000..182d59cc07ee
--- /dev/null
+++ b/include/uapi/linux/pwm.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */
+
+#ifndef _UAPI_PWM_H_
+#define _UAPI_PWM_H_
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+/**
+ * struct pwmchip_waveform - Describe a PWM waveform for a pwm_chip's PWM channel
+ * @hwpwm: per-chip relative index of the PWM device
+ * @__pad: padding, must be zero
+ * @period_length_ns: duration of the repeating period.
+ *    A value of 0 represents a disabled PWM.
+ * @duty_length_ns: duration of the active part in each period
+ * @duty_offset_ns: offset of the rising edge from a period's start
+ */
+struct pwmchip_waveform {
+	__u32 hwpwm;
+	__u32 __pad;
+	__u64 period_length_ns;
+	__u64 duty_length_ns;
+	__u64 duty_offset_ns;
+};
+
+/* Reserves the passed hwpwm for exclusive control. */
+#define PWM_IOCTL_REQUEST	_IO(0x75, 1)
+
+/* counterpart to PWM_IOCTL_REQUEST */
+#define PWM_IOCTL_FREE		_IO(0x75, 2)
+
+/*
+ * Modifies the passed wf according to hardware constraints. All parameters are
+ * rounded down to the next possible value, unless there is no such value, then
+ * values are rounded up. Note that zero isn't considered for rounding down
+ * period_length_ns.
+ */
+#define PWM_IOCTL_ROUNDWF	_IOWR(0x75, 3, struct pwmchip_waveform)
+
+/* Get the currently implemented waveform */
+#define PWM_IOCTL_GETWF		_IOWR(0x75, 4, struct pwmchip_waveform)
+
+/* Like PWM_IOCTL_ROUNDWF + PWM_IOCTL_SETEXACTWF in one go. */
+#define PWM_IOCTL_SETROUNDEDWF	_IOW(0x75, 5, struct pwmchip_waveform)
+
+/*
+ * Program the PWM to emit exactly the passed waveform, subject only to rounding
+ * down each value less than 1 ns. Returns 0 on success, -EDOM if the waveform
+ * cannot be implemented exactly, or other negative error codes.
+ */
+#define PWM_IOCTL_SETEXACTWF	_IOW(0x75, 6, struct pwmchip_waveform)
+
+#endif /* _UAPI_PWM_H_ */
-- 
cgit v1.2.3


From edd3bcb1801e1bb98f4f81485140e18c86406ced Mon Sep 17 00:00:00 2001
From: Michal Wilczynski <m.wilczynski@samsung.com>
Date: Wed, 2 Jul 2025 15:45:29 +0200
Subject: pwm: Expose PWM_WFHWSIZE in public header
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The WFHWSIZE constant defines the maximum size for the hardware-specific
waveform representation buffer. It is currently local to
drivers/pwm/core.c, which makes it inaccessible to external tools like
bindgen.

Move the constant to include/linux/pwm.h to make it part of the public
API. As part of this change, rename it to PWM_WFHWSIZE to follow
standard kernel conventions for namespacing macros in public headers.

This allows bindgen to automatically generate a corresponding constant
for the Rust PWM abstractions, ensuring the value remains synchronized
between the C core and Rust code and preventing future maintenance
issues.

Signed-off-by: Michal Wilczynski <m.wilczynski@samsung.com>
Link: https://lore.kernel.org/r/20250702-rust-next-pwm-working-fan-for-sending-v7-1-67ef39ff1d29@samsung.com
Signed-off-by: Uwe Kleine-König <ukleinek@kernel.org>
---
 drivers/pwm/core.c  | 26 ++++++++++++--------------
 include/linux/pwm.h |  2 ++
 2 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index 50aa0528a265..0d66376a83ec 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -210,8 +210,6 @@ static int __pwm_write_waveform(struct pwm_chip *chip, struct pwm_device *pwm, c
 	return ret;
 }
 
-#define WFHWSIZE 20
-
 /**
  * pwm_round_waveform_might_sleep - Query hardware capabilities
  * Cannot be used in atomic context.
@@ -248,10 +246,10 @@ int pwm_round_waveform_might_sleep(struct pwm_device *pwm, struct pwm_waveform *
 	struct pwm_chip *chip = pwm->chip;
 	const struct pwm_ops *ops = chip->ops;
 	struct pwm_waveform wf_req = *wf;
-	char wfhw[WFHWSIZE];
+	char wfhw[PWM_WFHWSIZE];
 	int ret_tohw, ret_fromhw;
 
-	BUG_ON(WFHWSIZE < ops->sizeof_wfhw);
+	BUG_ON(PWM_WFHWSIZE < ops->sizeof_wfhw);
 
 	if (!pwmchip_supports_waveform(chip))
 		return -EOPNOTSUPP;
@@ -306,10 +304,10 @@ int pwm_get_waveform_might_sleep(struct pwm_device *pwm, struct pwm_waveform *wf
 {
 	struct pwm_chip *chip = pwm->chip;
 	const struct pwm_ops *ops = chip->ops;
-	char wfhw[WFHWSIZE];
+	char wfhw[PWM_WFHWSIZE];
 	int err;
 
-	BUG_ON(WFHWSIZE < ops->sizeof_wfhw);
+	BUG_ON(PWM_WFHWSIZE < ops->sizeof_wfhw);
 
 	if (!pwmchip_supports_waveform(chip) || !ops->read_waveform)
 		return -EOPNOTSUPP;
@@ -334,11 +332,11 @@ static int __pwm_set_waveform(struct pwm_device *pwm,
 {
 	struct pwm_chip *chip = pwm->chip;
 	const struct pwm_ops *ops = chip->ops;
-	char wfhw[WFHWSIZE];
+	char wfhw[PWM_WFHWSIZE];
 	struct pwm_waveform wf_rounded;
 	int err, ret_tohw;
 
-	BUG_ON(WFHWSIZE < ops->sizeof_wfhw);
+	BUG_ON(PWM_WFHWSIZE < ops->sizeof_wfhw);
 
 	if (!pwmchip_supports_waveform(chip))
 		return -EOPNOTSUPP;
@@ -650,9 +648,9 @@ static int __pwm_apply(struct pwm_device *pwm, const struct pwm_state *state)
 
 	if (pwmchip_supports_waveform(chip)) {
 		struct pwm_waveform wf;
-		char wfhw[WFHWSIZE];
+		char wfhw[PWM_WFHWSIZE];
 
-		BUG_ON(WFHWSIZE < ops->sizeof_wfhw);
+		BUG_ON(PWM_WFHWSIZE < ops->sizeof_wfhw);
 
 		pwm_state2wf(state, &wf);
 
@@ -809,10 +807,10 @@ int pwm_get_state_hw(struct pwm_device *pwm, struct pwm_state *state)
 		return -ENODEV;
 
 	if (pwmchip_supports_waveform(chip) && ops->read_waveform) {
-		char wfhw[WFHWSIZE];
+		char wfhw[PWM_WFHWSIZE];
 		struct pwm_waveform wf;
 
-		BUG_ON(WFHWSIZE < ops->sizeof_wfhw);
+		BUG_ON(PWM_WFHWSIZE < ops->sizeof_wfhw);
 
 		ret = __pwm_read_waveform(chip, pwm, &wfhw);
 		if (ret)
@@ -1696,8 +1694,8 @@ static bool pwm_ops_check(const struct pwm_chip *chip)
 		    !ops->write_waveform)
 			return false;
 
-		if (WFHWSIZE < ops->sizeof_wfhw) {
-			dev_warn(pwmchip_parent(chip), "WFHWSIZE < %zu\n", ops->sizeof_wfhw);
+		if (PWM_WFHWSIZE < ops->sizeof_wfhw) {
+			dev_warn(pwmchip_parent(chip), "PWM_WFHWSIZE < %zu\n", ops->sizeof_wfhw);
 			return false;
 		}
 	} else {
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 2492c91452f9..8cafc483db53 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -274,6 +274,8 @@ struct pwm_capture {
 	unsigned int duty_cycle;
 };
 
+#define PWM_WFHWSIZE 20
+
 /**
  * struct pwm_ops - PWM controller operations
  * @request: optional hook for requesting a PWM
-- 
cgit v1.2.3


From 9bad4bec5daddbb296481af759f9d56c849ba96f Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Tue, 1 Jul 2025 13:49:40 +0200
Subject: gpio: mmio: remove struct bgpio_pdata

With no more users, we can now remove struct bgpio_pdata. Move the
relevant bits from bgpio_parse_fw() into bgpio_pdev_probe() while
maintaining the logical ordering (get flags before calling
bgpio_init()).

Link: https://lore.kernel.org/r/20250701-gpio-mmio-pdata-v2-6-ebf34d273497@linaro.org
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 drivers/gpio/gpio-mmio.c    | 73 ++++++++++++---------------------------------
 include/linux/gpio/driver.h |  6 ----
 2 files changed, 19 insertions(+), 60 deletions(-)

(limited to 'include')

diff --git a/drivers/gpio/gpio-mmio.c b/drivers/gpio/gpio-mmio.c
index 7d062f9a2c6a..a8103e26298c 100644
--- a/drivers/gpio/gpio-mmio.c
+++ b/drivers/gpio/gpio-mmio.c
@@ -712,43 +712,6 @@ static const struct of_device_id bgpio_of_match[] = {
 };
 MODULE_DEVICE_TABLE(of, bgpio_of_match);
 
-static struct bgpio_pdata *bgpio_parse_fw(struct device *dev, unsigned long *flags)
-{
-	struct bgpio_pdata *pdata;
-	const char *label;
-	unsigned int base;
-	int ret;
-
-	if (!dev_fwnode(dev))
-		return NULL;
-
-	pdata = devm_kzalloc(dev, sizeof(*pdata), GFP_KERNEL);
-	if (!pdata)
-		return ERR_PTR(-ENOMEM);
-
-	pdata->base = -1;
-
-	if (device_is_big_endian(dev))
-		*flags |= BGPIOF_BIG_ENDIAN_BYTE_ORDER;
-
-	if (device_property_read_bool(dev, "no-output"))
-		*flags |= BGPIOF_NO_OUTPUT;
-
-	ret = device_property_read_string(dev, "label", &label);
-	if (!ret)
-		pdata->label = label;
-
-	/*
-	 * This property *must not* be used in device-tree sources, it's only
-	 * meant to be passed to the driver from board files and MFD core.
-	 */
-	ret = device_property_read_u32(dev, "gpio-mmio,base", &base);
-	if (!ret && base <= INT_MAX)
-		pdata->base = base;
-
-	return pdata;
-}
-
 static int bgpio_pdev_probe(struct platform_device *pdev)
 {
 	struct device *dev = &pdev->dev;
@@ -760,18 +723,10 @@ static int bgpio_pdev_probe(struct platform_device *pdev)
 	void __iomem *dirin;
 	unsigned long sz;
 	unsigned long flags = 0;
+	unsigned int base;
 	int err;
 	struct gpio_chip *gc;
-	struct bgpio_pdata *pdata;
-
-	pdata = bgpio_parse_fw(dev, &flags);
-	if (IS_ERR(pdata))
-		return PTR_ERR(pdata);
-
-	if (!pdata) {
-		pdata = dev_get_platdata(dev);
-		flags = pdev->id_entry->driver_data;
-	}
+	const char *label;
 
 	r = platform_get_resource_byname(pdev, IORESOURCE_MEM, "dat");
 	if (!r)
@@ -803,17 +758,27 @@ static int bgpio_pdev_probe(struct platform_device *pdev)
 	if (!gc)
 		return -ENOMEM;
 
+	if (device_is_big_endian(dev))
+		flags |= BGPIOF_BIG_ENDIAN_BYTE_ORDER;
+
+	if (device_property_read_bool(dev, "no-output"))
+		flags |= BGPIOF_NO_OUTPUT;
+
 	err = bgpio_init(gc, dev, sz, dat, set, clr, dirout, dirin, flags);
 	if (err)
 		return err;
 
-	if (pdata) {
-		if (pdata->label)
-			gc->label = pdata->label;
-		gc->base = pdata->base;
-		if (pdata->ngpio > 0)
-			gc->ngpio = pdata->ngpio;
-	}
+	err = device_property_read_string(dev, "label", &label);
+	if (!err)
+		gc->label = label;
+
+	/*
+	 * This property *must not* be used in device-tree sources, it's only
+	 * meant to be passed to the driver from board files and MFD core.
+	 */
+	err = device_property_read_u32(dev, "gpio-mmio,base", &base);
+	if (!err && base <= INT_MAX)
+		gc->base = base;
 
 	platform_set_drvdata(pdev, gc);
 
diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index b53233051bee..602d4acd36b2 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -718,12 +718,6 @@ const unsigned long *gpiochip_query_valid_mask(const struct gpio_chip *gc);
 /* get driver data */
 void *gpiochip_get_data(struct gpio_chip *gc);
 
-struct bgpio_pdata {
-	const char *label;
-	int base;
-	int ngpio;
-};
-
 #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
 
 int gpiochip_populate_parent_fwspec_twocell(struct gpio_chip *gc,
-- 
cgit v1.2.3


From 8595375e4fded27de24b189c692c2c50051a7b3b Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Wed, 2 Jul 2025 11:22:08 +0200
Subject: gpio: generic: add new generic GPIO chip API

As the first step in removing the fields specific to the gpio-mmio
module from struct gpio_chip, we introduce a new set of generic GPIO
chip interfaces that are meant to replace the existing bgpio_ ones.

The new initialization function - gpio_generic_chip_init() - takes a
configuration structure as argument instead of 9 separate parameters.
This will allow easy extension if needed in the future. We hide the
locking details behind a set of helpers in order to be able to move the
raw spinlock out of struct gpio_chip without the users noticing.

For now, the new APIs just wrap the existing ones. Once all users have
been converted to the new interfaces, we'll pull them into gpio-mmio and
implement them in a backward-compatible way while also moving all fields
specific to the generic GPIO chip into struct gpio_generic_chip.

Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20250702-gpio-mmio-rework-v2-1-6b77aab684d8@linaro.org
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 include/linux/gpio/generic.h | 120 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 120 insertions(+)
 create mode 100644 include/linux/gpio/generic.h

(limited to 'include')

diff --git a/include/linux/gpio/generic.h b/include/linux/gpio/generic.h
new file mode 100644
index 000000000000..b511acd58ab0
--- /dev/null
+++ b/include/linux/gpio/generic.h
@@ -0,0 +1,120 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __LINUX_GPIO_GENERIC_H
+#define __LINUX_GPIO_GENERIC_H
+
+#include <linux/cleanup.h>
+#include <linux/gpio/driver.h>
+#include <linux/spinlock.h>
+
+struct device;
+
+/**
+ * struct gpio_generic_chip_config - Generic GPIO chip configuration data
+ * @dev: Parent device of the new GPIO chip (compulsory).
+ * @sz: Size (width) of the MMIO registers in bytes, typically 1, 2 or 4.
+ * @dat: MMIO address for the register to READ the value of the GPIO lines, it
+ *       is expected that a 1 in the corresponding bit in this register means
+ *       the line is asserted.
+ * @set: MMIO address for the register to SET the value of the GPIO lines, it
+ *       is expected that we write the line with 1 in this register to drive
+ *       the GPIO line high.
+ * @clr: MMIO address for the register to CLEAR the value of the GPIO lines,
+ *       it is expected that we write the line with 1 in this register to
+ *       drive the GPIO line low. It is allowed to leave this address as NULL,
+ *       in that case the SET register will be assumed to also clear the GPIO
+ *       lines, by actively writing the line with 0.
+ * @dirout: MMIO address for the register to set the line as OUTPUT. It is
+ *          assumed that setting a line to 1 in this register will turn that
+ *          line into an output line. Conversely, setting the line to 0 will
+ *          turn that line into an input.
+ * @dirin: MMIO address for the register to set this line as INPUT. It is
+ *         assumed that setting a line to 1 in this register will turn that
+ *         line into an input line. Conversely, setting the line to 0 will
+ *         turn that line into an output.
+ * @flags: Different flags that will affect the behaviour of the device, such
+ *         as endianness etc.
+ */
+struct gpio_generic_chip_config {
+	struct device *dev;
+	unsigned long sz;
+	void __iomem *dat;
+	void __iomem *set;
+	void __iomem *clr;
+	void __iomem *dirout;
+	void __iomem *dirin;
+	unsigned long flags;
+};
+
+/**
+ * struct gpio_generic_chip - Generic GPIO chip implementation.
+ * @gc: The underlying struct gpio_chip object, implementing low-level GPIO
+ *      chip routines.
+ */
+struct gpio_generic_chip {
+	struct gpio_chip gc;
+};
+
+/**
+ * gpio_generic_chip_init() - Initialize a generic GPIO chip.
+ * @chip: Generic GPIO chip to set up.
+ * @cfg: Generic GPIO chip configuration.
+ *
+ * Returns 0 on success, negative error number on failure.
+ */
+static inline int
+gpio_generic_chip_init(struct gpio_generic_chip *chip,
+		       const struct gpio_generic_chip_config *cfg)
+{
+	return bgpio_init(&chip->gc, cfg->dev, cfg->sz, cfg->dat, cfg->set,
+			  cfg->clr, cfg->dirout, cfg->dirin, cfg->flags);
+}
+
+/**
+ * gpio_generic_chip_set() - Set the GPIO line value of the generic GPIO chip.
+ * @chip: Generic GPIO chip to use.
+ * @offset: Hardware offset of the line to set.
+ * @value: New GPIO line value.
+ *
+ * Some modules using the generic GPIO chip need to set line values in their
+ * direction setters but they don't have access to the gpio-mmio symbols so
+ * they use the function pointer in struct gpio_chip directly. This is not
+ * optimal and can lead to crashes at run-time in some instances. This wrapper
+ * provides a safe interface for users.
+ *
+ * Returns: 0 on success, negative error number on failure.
+ */
+static inline int
+gpio_generic_chip_set(struct gpio_generic_chip *chip, unsigned int offset,
+		      int value)
+{
+	if (WARN_ON(!chip->gc.set_rv))
+		return -EOPNOTSUPP;
+
+	return chip->gc.set_rv(&chip->gc, offset, value);
+}
+
+#define gpio_generic_chip_lock(gen_gc) \
+	raw_spin_lock(&(gen_gc)->gc.bgpio_lock)
+
+#define gpio_generic_chip_unlock(gen_gc) \
+	raw_spin_unlock(&(gen_gc)->gc.bgpio_lock)
+
+#define gpio_generic_chip_lock_irqsave(gen_gc, flags) \
+	raw_spin_lock_irqsave(&(gen_gc)->gc.bgpio_lock, flags)
+
+#define gpio_generic_chip_unlock_irqrestore(gen_gc, flags) \
+	raw_spin_unlock_irqrestore(&(gen_gc)->gc.bgpio_lock, flags)
+
+DEFINE_LOCK_GUARD_1(gpio_generic_lock,
+		    struct gpio_generic_chip,
+		    gpio_generic_chip_lock(_T->lock),
+		    gpio_generic_chip_unlock(_T->lock))
+
+DEFINE_LOCK_GUARD_1(gpio_generic_lock_irqsave,
+		    struct gpio_generic_chip,
+		    gpio_generic_chip_lock_irqsave(_T->lock, _T->flags),
+		    gpio_generic_chip_unlock_irqrestore(_T->lock, _T->flags),
+		    unsigned long flags)
+
+#endif /* __LINUX_GPIO_GENERIC_H */
-- 
cgit v1.2.3


From c5fd399a24c8e2865524361f7dc4d4a6899be4f4 Mon Sep 17 00:00:00 2001
From: Lachlan Hodges <lachlan.hodges@morsemicro.com>
Date: Tue, 1 Jul 2025 17:55:41 +1000
Subject: wifi: mac80211: correctly identify S1G short beacon

mac80211 identifies a short beacon by the presence of the next
TBTT field, however the standard actually doesn't explicitly state that
the next TBTT can't be in a long beacon or even that it is required in
a short beacon - and as a result this validation does not work for all
vendor implementations.

The standard explicitly states that an S1G long beacon shall contain
the S1G beacon compatibility element as the first element in a beacon
transmitted at a TBTT that is not a TSBTT (Target Short Beacon
Transmission Time) as per IEEE80211-2024 11.1.3.10.1. This is validated
by 9.3.4.3 Table 9-76 which states that the S1G beacon compatibility
element is only allowed in the full set and is not allowed in the
minimum set of elements permitted for use within short beacons.

Correctly identify short beacons by the lack of an S1G beacon
compatibility element as the first element in an S1G beacon frame.

Fixes: 9eaffe5078ca ("cfg80211: convert S1G beacon to scan results")
Signed-off-by: Simon Wadsworth <simon@morsemicro.com>
Signed-off-by: Lachlan Hodges <lachlan.hodges@morsemicro.com>
Link: https://patch.msgid.link/20250701075541.162619-1-lachlan.hodges@morsemicro.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 45 +++++++++++++++++++++++++++++++++------------
 net/mac80211/mlme.c       |  7 +++++--
 2 files changed, 38 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 22f39e5e2ff1..996be3c2cff0 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -662,18 +662,6 @@ static inline bool ieee80211_s1g_has_cssid(__le16 fc)
 		(fc & cpu_to_le16(IEEE80211_S1G_BCN_CSSID));
 }
 
-/**
- * ieee80211_is_s1g_short_beacon - check if frame is an S1G short beacon
- * @fc: frame control bytes in little-endian byteorder
- * Return: whether or not the frame is an S1G short beacon,
- *	i.e. it is an S1G beacon with 'next TBTT' flag set
- */
-static inline bool ieee80211_is_s1g_short_beacon(__le16 fc)
-{
-	return ieee80211_is_s1g_beacon(fc) &&
-		(fc & cpu_to_le16(IEEE80211_S1G_BCN_NEXT_TBTT));
-}
-
 /**
  * ieee80211_is_atim - check if IEEE80211_FTYPE_MGMT && IEEE80211_STYPE_ATIM
  * @fc: frame control bytes in little-endian byteorder
@@ -4901,6 +4889,39 @@ static inline bool ieee80211_is_ftm(struct sk_buff *skb)
 	return false;
 }
 
+/**
+ * ieee80211_is_s1g_short_beacon - check if frame is an S1G short beacon
+ * @fc: frame control bytes in little-endian byteorder
+ * @variable: pointer to the beacon frame elements
+ * @variable_len: length of the frame elements
+ * Return: whether or not the frame is an S1G short beacon. As per
+ *	IEEE80211-2024 11.1.3.10.1, The S1G beacon compatibility element shall
+ *	always be present as the first element in beacon frames generated at a
+ *	TBTT (Target Beacon Transmission Time), so any frame not containing
+ *	this element must have been generated at a TSBTT (Target Short Beacon
+ *	Transmission Time) that is not a TBTT. Additionally, short beacons are
+ *	prohibited from containing the S1G beacon compatibility element as per
+ *	IEEE80211-2024 9.3.4.3 Table 9-76, so if we have an S1G beacon with
+ *	either no elements or the first element is not the beacon compatibility
+ *	element, we have a short beacon.
+ */
+static inline bool ieee80211_is_s1g_short_beacon(__le16 fc, const u8 *variable,
+						 size_t variable_len)
+{
+	if (!ieee80211_is_s1g_beacon(fc))
+		return false;
+
+	/*
+	 * If the frame does not contain at least 1 element (this is perfectly
+	 * valid in a short beacon) and is an S1G beacon, we have a short
+	 * beacon.
+	 */
+	if (variable_len < 2)
+		return true;
+
+	return variable[0] != WLAN_EID_S1G_BCN_COMPAT;
+}
+
 struct element {
 	u8 id;
 	u8 datalen;
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 2d46d4af60d7..7ddb8e77b4c7 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -7195,6 +7195,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link,
 	struct ieee80211_bss_conf *bss_conf = link->conf;
 	struct ieee80211_vif_cfg *vif_cfg = &sdata->vif.cfg;
 	struct ieee80211_mgmt *mgmt = (void *) hdr;
+	struct ieee80211_ext *ext = NULL;
 	size_t baselen;
 	struct ieee802_11_elems *elems;
 	struct ieee80211_local *local = sdata->local;
@@ -7220,7 +7221,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link,
 	/* Process beacon from the current BSS */
 	bssid = ieee80211_get_bssid(hdr, len, sdata->vif.type);
 	if (ieee80211_is_s1g_beacon(mgmt->frame_control)) {
-		struct ieee80211_ext *ext = (void *) mgmt;
+		ext = (void *)mgmt;
 		variable = ext->u.s1g_beacon.variable +
 			   ieee80211_s1g_optional_len(ext->frame_control);
 	}
@@ -7407,7 +7408,9 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link,
 	}
 
 	if ((ncrc == link->u.mgd.beacon_crc && link->u.mgd.beacon_crc_valid) ||
-	    ieee80211_is_s1g_short_beacon(mgmt->frame_control))
+	    (ext && ieee80211_is_s1g_short_beacon(ext->frame_control,
+						  parse_params.start,
+						  parse_params.len)))
 		goto free;
 	link->u.mgd.beacon_crc = ncrc;
 	link->u.mgd.beacon_crc_valid = true;
-- 
cgit v1.2.3


From 377d7860c960ac8e672881bc50353d867e2f94a4 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 12 Jun 2025 15:25:33 +0200
Subject: cred: add auto cleanup method

Add a simple auto cleanup method for struct cred.

Link: https://lore.kernel.org/20250612-work-coredump-massage-v1-19-315c0c34ba94@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/cred.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/cred.h b/include/linux/cred.h
index 5658a3bfe803..a102a10f833f 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -263,6 +263,8 @@ static inline void put_cred(const struct cred *cred)
 	put_cred_many(cred, 1);
 }
 
+DEFINE_FREE(put_cred, struct cred *, if (!IS_ERR_OR_NULL(_T)) put_cred(_T))
+
 /**
  * current_cred - Access the current task's subjective credentials
  *
-- 
cgit v1.2.3


From 3ebed2fddf6fac5729ffc8c471c87d111b641678 Mon Sep 17 00:00:00 2001
From: Armin Wolf <W_Armin@gmx.de>
Date: Fri, 27 Jun 2025 22:51:22 +0200
Subject: power: supply: core: Add power_supply_get/set_property_direct()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Power supply extensions might want to interact with the underlying
power supply to retrieve data like serial numbers, charging status
and more. However doing so causes psy->extensions_sem to be locked
twice, possibly causing a deadlock.

Provide special variants of power_supply_get/set_property() that
ignore any power supply extensions and thus do not touch the
associated psy->extensions_sem lock.

Suggested-by: Hans de Goede <hansg@kernel.org>
Signed-off-by: Armin Wolf <W_Armin@gmx.de>
Acked-by: Sebastian Reichel <sebastian.reichel@collabora.com>
Reviewed-by: Hans de Goede <hansg@kernel.org>
Link: https://lore.kernel.org/r/20250627205124.250433-1-W_Armin@gmx.de
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 drivers/power/supply/power_supply_core.c | 82 +++++++++++++++++++++++++++-----
 include/linux/power_supply.h             |  8 ++++
 2 files changed, 78 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/drivers/power/supply/power_supply_core.c b/drivers/power/supply/power_supply_core.c
index 33a5bfce4604..cfb0e3e0d4aa 100644
--- a/drivers/power/supply/power_supply_core.c
+++ b/drivers/power/supply/power_supply_core.c
@@ -1235,9 +1235,8 @@ bool power_supply_has_property(struct power_supply *psy,
 	return false;
 }
 
-int power_supply_get_property(struct power_supply *psy,
-			    enum power_supply_property psp,
-			    union power_supply_propval *val)
+static int __power_supply_get_property(struct power_supply *psy, enum power_supply_property psp,
+				       union power_supply_propval *val, bool use_extensions)
 {
 	struct power_supply_ext_registration *reg;
 
@@ -1247,10 +1246,14 @@ int power_supply_get_property(struct power_supply *psy,
 		return -ENODEV;
 	}
 
-	scoped_guard(rwsem_read, &psy->extensions_sem) {
-		power_supply_for_each_extension(reg, psy) {
-			if (power_supply_ext_has_property(reg->ext, psp))
+	if (use_extensions) {
+		scoped_guard(rwsem_read, &psy->extensions_sem) {
+			power_supply_for_each_extension(reg, psy) {
+				if (!power_supply_ext_has_property(reg->ext, psp))
+					continue;
+
 				return reg->ext->get_property(psy, reg->ext, reg->data, psp, val);
+			}
 		}
 	}
 
@@ -1261,20 +1264,49 @@ int power_supply_get_property(struct power_supply *psy,
 	else
 		return -EINVAL;
 }
+
+int power_supply_get_property(struct power_supply *psy, enum power_supply_property psp,
+			      union power_supply_propval *val)
+{
+	return __power_supply_get_property(psy, psp, val, true);
+}
 EXPORT_SYMBOL_GPL(power_supply_get_property);
 
-int power_supply_set_property(struct power_supply *psy,
-			    enum power_supply_property psp,
-			    const union power_supply_propval *val)
+/**
+ * power_supply_get_property_direct - Read a power supply property without checking for extensions
+ * @psy: The power supply
+ * @psp: The power supply property to read
+ * @val: The resulting value of the power supply property
+ *
+ * Read a power supply property without taking into account any power supply extensions registered
+ * on the given power supply. This is mostly useful for power supply extensions that want to access
+ * their own power supply as using power_supply_get_property() directly will result in a potential
+ * deadlock.
+ *
+ * Return: 0 on success or negative error code on failure.
+ */
+int power_supply_get_property_direct(struct power_supply *psy, enum power_supply_property psp,
+				     union power_supply_propval *val)
+{
+	return __power_supply_get_property(psy, psp, val, false);
+}
+EXPORT_SYMBOL_GPL(power_supply_get_property_direct);
+
+
+static int __power_supply_set_property(struct power_supply *psy, enum power_supply_property psp,
+				       const union power_supply_propval *val, bool use_extensions)
 {
 	struct power_supply_ext_registration *reg;
 
 	if (atomic_read(&psy->use_cnt) <= 0)
 		return -ENODEV;
 
-	scoped_guard(rwsem_read, &psy->extensions_sem) {
-		power_supply_for_each_extension(reg, psy) {
-			if (power_supply_ext_has_property(reg->ext, psp)) {
+	if (use_extensions) {
+		scoped_guard(rwsem_read, &psy->extensions_sem) {
+			power_supply_for_each_extension(reg, psy) {
+				if (!power_supply_ext_has_property(reg->ext, psp))
+					continue;
+
 				if (reg->ext->set_property)
 					return reg->ext->set_property(psy, reg->ext, reg->data,
 								      psp, val);
@@ -1289,8 +1321,34 @@ int power_supply_set_property(struct power_supply *psy,
 
 	return psy->desc->set_property(psy, psp, val);
 }
+
+int power_supply_set_property(struct power_supply *psy, enum power_supply_property psp,
+			      const union power_supply_propval *val)
+{
+	return __power_supply_set_property(psy, psp, val, true);
+}
 EXPORT_SYMBOL_GPL(power_supply_set_property);
 
+/**
+ * power_supply_set_property_direct - Write a power supply property without checking for extensions
+ * @psy: The power supply
+ * @psp: The power supply property to write
+ * @val: The value to write to the power supply property
+ *
+ * Write a power supply property without taking into account any power supply extensions registered
+ * on the given power supply. This is mostly useful for power supply extensions that want to access
+ * their own power supply as using power_supply_set_property() directly will result in a potential
+ * deadlock.
+ *
+ * Return: 0 on success or negative error code on failure.
+ */
+int power_supply_set_property_direct(struct power_supply *psy, enum power_supply_property psp,
+				     const union power_supply_propval *val)
+{
+	return __power_supply_set_property(psy, psp, val, false);
+}
+EXPORT_SYMBOL_GPL(power_supply_set_property_direct);
+
 int power_supply_property_is_writeable(struct power_supply *psy,
 					enum power_supply_property psp)
 {
diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index 7803edaa8ff8..0cca01b5607b 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -888,15 +888,23 @@ static inline int power_supply_is_system_supplied(void) { return -ENOSYS; }
 extern int power_supply_get_property(struct power_supply *psy,
 			    enum power_supply_property psp,
 			    union power_supply_propval *val);
+int power_supply_get_property_direct(struct power_supply *psy, enum power_supply_property psp,
+				     union power_supply_propval *val);
 #if IS_ENABLED(CONFIG_POWER_SUPPLY)
 extern int power_supply_set_property(struct power_supply *psy,
 			    enum power_supply_property psp,
 			    const union power_supply_propval *val);
+int power_supply_set_property_direct(struct power_supply *psy, enum power_supply_property psp,
+				     const union power_supply_propval *val);
 #else
 static inline int power_supply_set_property(struct power_supply *psy,
 			    enum power_supply_property psp,
 			    const union power_supply_propval *val)
 { return 0; }
+static inline int power_supply_set_property_direct(struct power_supply *psy,
+						   enum power_supply_property psp,
+						   const union power_supply_propval *val)
+{ return 0; }
 #endif
 extern void power_supply_external_power_changed(struct power_supply *psy);
 
-- 
cgit v1.2.3


From 1cea5180f2f812c444ceebdc40f5d001bedd030d Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 5 Jul 2025 14:19:50 -0600
Subject: block: remove pktcdvd driver

This driver has long outlived its utility, and it's broken and unloved.
The main use case for this was direct mount with UDF of cd-rw drives
that required 32kb packets. It would collect writes into that size and
write them out in multiples of that. That's not a common use case
anymore, the world has moved on from those kinds of media. To make
matters worse, it's actively breaking setups where it's not even
required or useful.

Link: https://lore.kernel.org/linux-block/fxg6dksau4jsk3u5xldlyo2m7qgiux6vtdrz5rywseotsouqdv@urcrwz6qtd3r/
Link: https://lore.kernel.org/linux-block/dcc4836e-6da9-4208-ad27-bbd44b3a2063@kernel.dk/
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 Documentation/ABI/testing/debugfs-pktcdvd          |   18 -
 Documentation/ABI/testing/sysfs-class-pktcdvd      |   97 -
 Documentation/cdrom/packet-writing.rst             |  139 -
 Documentation/userspace-api/ioctl/ioctl-number.rst |    1 -
 MAINTAINERS                                        |    7 -
 drivers/block/Kconfig                              |   43 -
 drivers/block/Makefile                             |    1 -
 drivers/block/pktcdvd.c                            | 2916 --------------------
 include/linux/pktcdvd.h                            |  198 --
 9 files changed, 3420 deletions(-)
 delete mode 100644 Documentation/ABI/testing/debugfs-pktcdvd
 delete mode 100644 Documentation/ABI/testing/sysfs-class-pktcdvd
 delete mode 100644 Documentation/cdrom/packet-writing.rst
 delete mode 100644 drivers/block/pktcdvd.c
 delete mode 100644 include/linux/pktcdvd.h

(limited to 'include')

diff --git a/Documentation/ABI/testing/debugfs-pktcdvd b/Documentation/ABI/testing/debugfs-pktcdvd
deleted file mode 100644
index f6f65a4faea0..000000000000
--- a/Documentation/ABI/testing/debugfs-pktcdvd
+++ /dev/null
@@ -1,18 +0,0 @@
-What:           /sys/kernel/debug/pktcdvd/pktcdvd[0-7]
-Date:           Oct. 2006
-KernelVersion:  2.6.20
-Contact:        Thomas Maier <balagi@justmail.de>
-Description:
-
-The pktcdvd module (packet writing driver) creates
-these files in debugfs:
-
-/sys/kernel/debug/pktcdvd/pktcdvd[0-7]/
-
-    ====            ====== ====================================
-    info            0444   Lots of driver statistics and infos.
-    ====            ====== ====================================
-
-Example::
-
-    cat /sys/kernel/debug/pktcdvd/pktcdvd0/info
diff --git a/Documentation/ABI/testing/sysfs-class-pktcdvd b/Documentation/ABI/testing/sysfs-class-pktcdvd
deleted file mode 100644
index ba1ce626591d..000000000000
--- a/Documentation/ABI/testing/sysfs-class-pktcdvd
+++ /dev/null
@@ -1,97 +0,0 @@
-sysfs interface
----------------
-The pktcdvd module (packet writing driver) creates the following files in the
-sysfs: (<devid> is in the format major:minor)
-
-What:		/sys/class/pktcdvd/add
-What:		/sys/class/pktcdvd/remove
-What:		/sys/class/pktcdvd/device_map
-Date:		Oct. 2006
-KernelVersion:	2.6.20
-Contact:	Thomas Maier <balagi@justmail.de>
-Description:
-
-		==========	==============================================
-		add		(WO) Write a block device id (major:minor) to
-				create a new pktcdvd device and map it to the
-				block device.
-
-		remove		(WO) Write the pktcdvd device id (major:minor)
-				to remove the pktcdvd device.
-
-		device_map	(RO) Shows the device mapping in format:
-				pktcdvd[0-7] <pktdevid> <blkdevid>
-		==========	==============================================
-
-
-What:		/sys/class/pktcdvd/pktcdvd[0-7]/dev
-What:		/sys/class/pktcdvd/pktcdvd[0-7]/uevent
-Date:		Oct. 2006
-KernelVersion:	2.6.20
-Contact:	Thomas Maier <balagi@justmail.de>
-Description:
-		dev:	(RO) Device id
-
-		uevent:	(WO) To send a uevent
-
-
-What:		/sys/class/pktcdvd/pktcdvd[0-7]/stat/packets_started
-What:		/sys/class/pktcdvd/pktcdvd[0-7]/stat/packets_finished
-What:		/sys/class/pktcdvd/pktcdvd[0-7]/stat/kb_written
-What:		/sys/class/pktcdvd/pktcdvd[0-7]/stat/kb_read
-What:		/sys/class/pktcdvd/pktcdvd[0-7]/stat/kb_read_gather
-What:		/sys/class/pktcdvd/pktcdvd[0-7]/stat/reset
-Date:		Oct. 2006
-KernelVersion:	2.6.20
-Contact:	Thomas Maier <balagi@justmail.de>
-Description:
-		packets_started:	(RO) Number of started packets.
-
-		packets_finished:	(RO) Number of finished packets.
-
-		kb_written:		(RO) kBytes written.
-
-		kb_read:		(RO) kBytes read.
-
-		kb_read_gather:		(RO) kBytes read to fill write packets.
-
-		reset:			(WO) Write any value to it to reset
-					pktcdvd device statistic values, like
-					bytes read/written.
-
-
-What:		/sys/class/pktcdvd/pktcdvd[0-7]/write_queue/size
-What:		/sys/class/pktcdvd/pktcdvd[0-7]/write_queue/congestion_off
-What:		/sys/class/pktcdvd/pktcdvd[0-7]/write_queue/congestion_on
-Date:		Oct. 2006
-KernelVersion:	2.6.20
-Contact:	Thomas Maier <balagi@justmail.de>
-Description:
-		==============	================================================
-		size		(RO) Contains the size of the bio write queue.
-
-		congestion_off	(RW) If bio write queue size is below this mark,
-				accept new bio requests from the block layer.
-
-		congestion_on	(RW) If bio write queue size is higher as this
-				mark, do no longer accept bio write requests
-				from the block layer and wait till the pktcdvd
-				device has processed enough bio's so that bio
-				write queue size is below congestion off mark.
-				A value of <= 0 disables congestion control.
-		==============	================================================
-
-
-Example:
---------
-To use the pktcdvd sysfs interface directly, you can do::
-
-    # create a new pktcdvd device mapped to /dev/hdc
-    echo "22:0" >/sys/class/pktcdvd/add
-    cat /sys/class/pktcdvd/device_map
-    # assuming device pktcdvd0 was created, look at stat's
-    cat /sys/class/pktcdvd/pktcdvd0/stat/kb_written
-    # print the device id of the mapped block device
-    fgrep pktcdvd0 /sys/class/pktcdvd/device_map
-    # remove device, using pktcdvd0 device id   253:0
-    echo "253:0" >/sys/class/pktcdvd/remove
diff --git a/Documentation/cdrom/packet-writing.rst b/Documentation/cdrom/packet-writing.rst
deleted file mode 100644
index 43db58c50d29..000000000000
--- a/Documentation/cdrom/packet-writing.rst
+++ /dev/null
@@ -1,139 +0,0 @@
-==============
-Packet writing
-==============
-
-Getting started quick
----------------------
-
-- Select packet support in the block device section and UDF support in
-  the file system section.
-
-- Compile and install kernel and modules, reboot.
-
-- You need the udftools package (pktsetup, mkudffs, cdrwtool).
-  Download from https://github.com/pali/udftools
-
-- Grab a new CD-RW disc and format it (assuming CD-RW is hdc, substitute
-  as appropriate)::
-
-	# cdrwtool -d /dev/hdc -q
-
-- Setup your writer::
-
-	# pktsetup dev_name /dev/hdc
-
-- Now you can mount /dev/pktcdvd/dev_name and copy files to it. Enjoy::
-
-	# mount /dev/pktcdvd/dev_name /cdrom -t udf -o rw,noatime
-
-
-Packet writing for DVD-RW media
--------------------------------
-
-DVD-RW discs can be written to much like CD-RW discs if they are in
-the so called "restricted overwrite" mode. To put a disc in restricted
-overwrite mode, run::
-
-	# dvd+rw-format /dev/hdc
-
-You can then use the disc the same way you would use a CD-RW disc::
-
-	# pktsetup dev_name /dev/hdc
-	# mount /dev/pktcdvd/dev_name /cdrom -t udf -o rw,noatime
-
-
-Packet writing for DVD+RW media
--------------------------------
-
-According to the DVD+RW specification, a drive supporting DVD+RW discs
-shall implement "true random writes with 2KB granularity", which means
-that it should be possible to put any filesystem with a block size >=
-2KB on such a disc. For example, it should be possible to do::
-
-	# dvd+rw-format /dev/hdc   (only needed if the disc has never
-	                            been formatted)
-	# mkudffs /dev/hdc
-	# mount /dev/hdc /cdrom -t udf -o rw,noatime
-
-However, some drives don't follow the specification and expect the
-host to perform aligned writes at 32KB boundaries. Other drives do
-follow the specification, but suffer bad performance problems if the
-writes are not 32KB aligned.
-
-Both problems can be solved by using the pktcdvd driver, which always
-generates aligned writes::
-
-	# dvd+rw-format /dev/hdc
-	# pktsetup dev_name /dev/hdc
-	# mkudffs /dev/pktcdvd/dev_name
-	# mount /dev/pktcdvd/dev_name /cdrom -t udf -o rw,noatime
-
-
-Packet writing for DVD-RAM media
---------------------------------
-
-DVD-RAM discs are random writable, so using the pktcdvd driver is not
-necessary. However, using the pktcdvd driver can improve performance
-in the same way it does for DVD+RW media.
-
-
-Notes
------
-
-- CD-RW media can usually not be overwritten more than about 1000
-  times, so to avoid unnecessary wear on the media, you should always
-  use the noatime mount option.
-
-- Defect management (ie automatic remapping of bad sectors) has not
-  been implemented yet, so you are likely to get at least some
-  filesystem corruption if the disc wears out.
-
-- Since the pktcdvd driver makes the disc appear as a regular block
-  device with a 2KB block size, you can put any filesystem you like on
-  the disc. For example, run::
-
-	# /sbin/mke2fs /dev/pktcdvd/dev_name
-
-  to create an ext2 filesystem on the disc.
-
-
-Using the pktcdvd sysfs interface
----------------------------------
-
-Since Linux 2.6.20, the pktcdvd module has a sysfs interface
-and can be controlled by it. For example the "pktcdvd" tool uses
-this interface. (see http://tom.ist-im-web.de/linux/software/pktcdvd )
-
-"pktcdvd" works similar to "pktsetup", e.g.::
-
-	# pktcdvd -a dev_name /dev/hdc
-	# mkudffs /dev/pktcdvd/dev_name
-	# mount -t udf -o rw,noatime /dev/pktcdvd/dev_name /dvdram
-	# cp files /dvdram
-	# umount /dvdram
-	# pktcdvd -r dev_name
-
-
-For a description of the sysfs interface look into the file:
-
-  Documentation/ABI/testing/sysfs-class-pktcdvd
-
-
-Using the pktcdvd debugfs interface
------------------------------------
-
-To read pktcdvd device infos in human readable form, do::
-
-	# cat /sys/kernel/debug/pktcdvd/pktcdvd[0-7]/info
-
-For a description of the debugfs interface look into the file:
-
-  Documentation/ABI/testing/debugfs-pktcdvd
-
-
-
-Links
------
-
-See http://fy.chalmers.se/~appro/linux/DVD+RW/ for more information
-about DVD writing.
diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index bc91756bde73..4f1532a251d2 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -220,7 +220,6 @@ Code  Seq#    Include File                                           Comments
              include/linux/falloc.h,
              linux/fs.h,
 'X'   all    fs/ocfs2/ocfs_fs.h                                      conflict!
-'X'   01     linux/pktcdvd.h                                         conflict!
 'Z'   14-15  drivers/message/fusion/mptctl.h
 '['   00-3F  linux/usb/tmc.h                                         USB Test and Measurement Devices
                                                                      <mailto:gregkh@linuxfoundation.org>
diff --git a/MAINTAINERS b/MAINTAINERS
index 4bac4ea21b64..560339f8acd3 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -19697,13 +19697,6 @@ S:	Supported
 F:	Documentation/devicetree/bindings/input/pine64,pinephone-keyboard.yaml
 F:	drivers/input/keyboard/pinephone-keyboard.c
 
-PKTCDVD DRIVER
-M:	linux-block@vger.kernel.org
-S:	Orphan
-F:	drivers/block/pktcdvd.c
-F:	include/linux/pktcdvd.h
-F:	include/uapi/linux/pktcdvd.h
-
 PLANTOWER PMS7003 AIR POLLUTION SENSOR DRIVER
 M:	Tomasz Duszynski <tduszyns@gmail.com>
 S:	Maintained
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 0f70e2374e7f..df38fb364904 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -256,49 +256,6 @@ config BLK_DEV_RAM_SIZE
 	  The default value is 4096 kilobytes. Only change this if you know
 	  what you are doing.
 
-config CDROM_PKTCDVD
-	tristate "Packet writing on CD/DVD media (DEPRECATED)"
-	depends on !UML
-	depends on SCSI
-	select CDROM
-	help
-	  Note: This driver is deprecated and will be removed from the
-	  kernel in the near future!
-
-	  If you have a CDROM/DVD drive that supports packet writing, say
-	  Y to include support. It should work with any MMC/Mt Fuji
-	  compliant ATAPI or SCSI drive, which is just about any newer
-	  DVD/CD writer.
-
-	  Currently only writing to CD-RW, DVD-RW, DVD+RW and DVDRAM discs
-	  is possible.
-	  DVD-RW disks must be in restricted overwrite mode.
-
-	  See the file <file:Documentation/cdrom/packet-writing.rst>
-	  for further information on the use of this driver.
-
-	  To compile this driver as a module, choose M here: the
-	  module will be called pktcdvd.
-
-config CDROM_PKTCDVD_BUFFERS
-	int "Free buffers for data gathering"
-	depends on CDROM_PKTCDVD
-	default "8"
-	help
-	  This controls the maximum number of active concurrent packets. More
-	  concurrent packets can increase write performance, but also require
-	  more memory. Each concurrent packet will require approximately 64Kb
-	  of non-swappable kernel memory, memory which will be allocated when
-	  a disc is opened for writing.
-
-config CDROM_PKTCDVD_WCACHE
-	bool "Enable write caching"
-	depends on CDROM_PKTCDVD
-	help
-	  If enabled, write caching will be set for the CD-R/W device. For now
-	  this option is dangerous unless the CD-RW media is known good, as we
-	  don't do deferred write error handling yet.
-
 config ATA_OVER_ETH
 	tristate "ATA over Ethernet support"
 	depends on NET
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 097707aca725..a695ce74ef22 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -23,7 +23,6 @@ obj-$(CONFIG_AMIGA_Z2RAM)	+= z2ram.o
 obj-$(CONFIG_N64CART)		+= n64cart.o
 obj-$(CONFIG_BLK_DEV_RAM)	+= brd.o
 obj-$(CONFIG_BLK_DEV_LOOP)	+= loop.o
-obj-$(CONFIG_CDROM_PKTCDVD)	+= pktcdvd.o
 obj-$(CONFIG_SUNVDC)		+= sunvdc.o
 
 obj-$(CONFIG_BLK_DEV_NBD)	+= nbd.o
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
deleted file mode 100644
index d5cc7bd2875c..000000000000
--- a/drivers/block/pktcdvd.c
+++ /dev/null
@@ -1,2916 +0,0 @@
-/*
- * Copyright (C) 2000 Jens Axboe <axboe@suse.de>
- * Copyright (C) 2001-2004 Peter Osterlund <petero2@telia.com>
- * Copyright (C) 2006 Thomas Maier <balagi@justmail.de>
- *
- * May be copied or modified under the terms of the GNU General Public
- * License.  See linux/COPYING for more information.
- *
- * Packet writing layer for ATAPI and SCSI CD-RW, DVD+RW, DVD-RW and
- * DVD-RAM devices.
- *
- * Theory of operation:
- *
- * At the lowest level, there is the standard driver for the CD/DVD device,
- * such as drivers/scsi/sr.c. This driver can handle read and write requests,
- * but it doesn't know anything about the special restrictions that apply to
- * packet writing. One restriction is that write requests must be aligned to
- * packet boundaries on the physical media, and the size of a write request
- * must be equal to the packet size. Another restriction is that a
- * GPCMD_FLUSH_CACHE command has to be issued to the drive before a read
- * command, if the previous command was a write.
- *
- * The purpose of the packet writing driver is to hide these restrictions from
- * higher layers, such as file systems, and present a block device that can be
- * randomly read and written using 2kB-sized blocks.
- *
- * The lowest layer in the packet writing driver is the packet I/O scheduler.
- * Its data is defined by the struct packet_iosched and includes two bio
- * queues with pending read and write requests. These queues are processed
- * by the pkt_iosched_process_queue() function. The write requests in this
- * queue are already properly aligned and sized. This layer is responsible for
- * issuing the flush cache commands and scheduling the I/O in a good order.
- *
- * The next layer transforms unaligned write requests to aligned writes. This
- * transformation requires reading missing pieces of data from the underlying
- * block device, assembling the pieces to full packets and queuing them to the
- * packet I/O scheduler.
- *
- * At the top layer there is a custom ->submit_bio function that forwards
- * read requests directly to the iosched queue and puts write requests in the
- * unaligned write queue. A kernel thread performs the necessary read
- * gathering to convert the unaligned writes to aligned writes and then feeds
- * them to the packet I/O scheduler.
- *
- *************************************************************************/
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/backing-dev.h>
-#include <linux/compat.h>
-#include <linux/debugfs.h>
-#include <linux/device.h>
-#include <linux/errno.h>
-#include <linux/file.h>
-#include <linux/freezer.h>
-#include <linux/kernel.h>
-#include <linux/kthread.h>
-#include <linux/miscdevice.h>
-#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/nospec.h>
-#include <linux/pktcdvd.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/types.h>
-#include <linux/uaccess.h>
-
-#include <scsi/scsi.h>
-#include <scsi/scsi_cmnd.h>
-#include <scsi/scsi_ioctl.h>
-
-#include <linux/unaligned.h>
-
-#define DRIVER_NAME	"pktcdvd"
-
-#define MAX_SPEED 0xffff
-
-static DEFINE_MUTEX(pktcdvd_mutex);
-static struct pktcdvd_device *pkt_devs[MAX_WRITERS];
-static struct proc_dir_entry *pkt_proc;
-static int pktdev_major;
-static int write_congestion_on  = PKT_WRITE_CONGESTION_ON;
-static int write_congestion_off = PKT_WRITE_CONGESTION_OFF;
-static struct mutex ctl_mutex;	/* Serialize open/close/setup/teardown */
-static mempool_t psd_pool;
-static struct bio_set pkt_bio_set;
-
-/* /sys/class/pktcdvd */
-static struct class	class_pktcdvd;
-static struct dentry	*pkt_debugfs_root = NULL; /* /sys/kernel/debug/pktcdvd */
-
-/* forward declaration */
-static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev);
-static int pkt_remove_dev(dev_t pkt_dev);
-
-static sector_t get_zone(sector_t sector, struct pktcdvd_device *pd)
-{
-	return (sector + pd->offset) & ~(sector_t)(pd->settings.size - 1);
-}
-
-/**********************************************************
- * sysfs interface for pktcdvd
- * by (C) 2006  Thomas Maier <balagi@justmail.de>
- 
-  /sys/class/pktcdvd/pktcdvd[0-7]/
-                     stat/reset
-                     stat/packets_started
-                     stat/packets_finished
-                     stat/kb_written
-                     stat/kb_read
-                     stat/kb_read_gather
-                     write_queue/size
-                     write_queue/congestion_off
-                     write_queue/congestion_on
- **********************************************************/
-
-static ssize_t packets_started_show(struct device *dev,
-				    struct device_attribute *attr, char *buf)
-{
-	struct pktcdvd_device *pd = dev_get_drvdata(dev);
-
-	return sysfs_emit(buf, "%lu\n", pd->stats.pkt_started);
-}
-static DEVICE_ATTR_RO(packets_started);
-
-static ssize_t packets_finished_show(struct device *dev,
-				     struct device_attribute *attr, char *buf)
-{
-	struct pktcdvd_device *pd = dev_get_drvdata(dev);
-
-	return sysfs_emit(buf, "%lu\n", pd->stats.pkt_ended);
-}
-static DEVICE_ATTR_RO(packets_finished);
-
-static ssize_t kb_written_show(struct device *dev,
-			       struct device_attribute *attr, char *buf)
-{
-	struct pktcdvd_device *pd = dev_get_drvdata(dev);
-
-	return sysfs_emit(buf, "%lu\n", pd->stats.secs_w >> 1);
-}
-static DEVICE_ATTR_RO(kb_written);
-
-static ssize_t kb_read_show(struct device *dev,
-			    struct device_attribute *attr, char *buf)
-{
-	struct pktcdvd_device *pd = dev_get_drvdata(dev);
-
-	return sysfs_emit(buf, "%lu\n", pd->stats.secs_r >> 1);
-}
-static DEVICE_ATTR_RO(kb_read);
-
-static ssize_t kb_read_gather_show(struct device *dev,
-				   struct device_attribute *attr, char *buf)
-{
-	struct pktcdvd_device *pd = dev_get_drvdata(dev);
-
-	return sysfs_emit(buf, "%lu\n", pd->stats.secs_rg >> 1);
-}
-static DEVICE_ATTR_RO(kb_read_gather);
-
-static ssize_t reset_store(struct device *dev, struct device_attribute *attr,
-			   const char *buf, size_t len)
-{
-	struct pktcdvd_device *pd = dev_get_drvdata(dev);
-
-	if (len > 0) {
-		pd->stats.pkt_started = 0;
-		pd->stats.pkt_ended = 0;
-		pd->stats.secs_w = 0;
-		pd->stats.secs_rg = 0;
-		pd->stats.secs_r = 0;
-	}
-	return len;
-}
-static DEVICE_ATTR_WO(reset);
-
-static struct attribute *pkt_stat_attrs[] = {
-	&dev_attr_packets_finished.attr,
-	&dev_attr_packets_started.attr,
-	&dev_attr_kb_read.attr,
-	&dev_attr_kb_written.attr,
-	&dev_attr_kb_read_gather.attr,
-	&dev_attr_reset.attr,
-	NULL,
-};
-
-static const struct attribute_group pkt_stat_group = {
-	.name = "stat",
-	.attrs = pkt_stat_attrs,
-};
-
-static ssize_t size_show(struct device *dev,
-			 struct device_attribute *attr, char *buf)
-{
-	struct pktcdvd_device *pd = dev_get_drvdata(dev);
-	int n;
-
-	spin_lock(&pd->lock);
-	n = sysfs_emit(buf, "%d\n", pd->bio_queue_size);
-	spin_unlock(&pd->lock);
-	return n;
-}
-static DEVICE_ATTR_RO(size);
-
-static void init_write_congestion_marks(int* lo, int* hi)
-{
-	if (*hi > 0) {
-		*hi = max(*hi, 500);
-		*hi = min(*hi, 1000000);
-		if (*lo <= 0)
-			*lo = *hi - 100;
-		else {
-			*lo = min(*lo, *hi - 100);
-			*lo = max(*lo, 100);
-		}
-	} else {
-		*hi = -1;
-		*lo = -1;
-	}
-}
-
-static ssize_t congestion_off_show(struct device *dev,
-				   struct device_attribute *attr, char *buf)
-{
-	struct pktcdvd_device *pd = dev_get_drvdata(dev);
-	int n;
-
-	spin_lock(&pd->lock);
-	n = sysfs_emit(buf, "%d\n", pd->write_congestion_off);
-	spin_unlock(&pd->lock);
-	return n;
-}
-
-static ssize_t congestion_off_store(struct device *dev,
-				    struct device_attribute *attr,
-				    const char *buf, size_t len)
-{
-	struct pktcdvd_device *pd = dev_get_drvdata(dev);
-	int val, ret;
-
-	ret = kstrtoint(buf, 10, &val);
-	if (ret)
-		return ret;
-
-	spin_lock(&pd->lock);
-	pd->write_congestion_off = val;
-	init_write_congestion_marks(&pd->write_congestion_off, &pd->write_congestion_on);
-	spin_unlock(&pd->lock);
-	return len;
-}
-static DEVICE_ATTR_RW(congestion_off);
-
-static ssize_t congestion_on_show(struct device *dev,
-				  struct device_attribute *attr, char *buf)
-{
-	struct pktcdvd_device *pd = dev_get_drvdata(dev);
-	int n;
-
-	spin_lock(&pd->lock);
-	n = sysfs_emit(buf, "%d\n", pd->write_congestion_on);
-	spin_unlock(&pd->lock);
-	return n;
-}
-
-static ssize_t congestion_on_store(struct device *dev,
-				   struct device_attribute *attr,
-				   const char *buf, size_t len)
-{
-	struct pktcdvd_device *pd = dev_get_drvdata(dev);
-	int val, ret;
-
-	ret = kstrtoint(buf, 10, &val);
-	if (ret)
-		return ret;
-
-	spin_lock(&pd->lock);
-	pd->write_congestion_on = val;
-	init_write_congestion_marks(&pd->write_congestion_off, &pd->write_congestion_on);
-	spin_unlock(&pd->lock);
-	return len;
-}
-static DEVICE_ATTR_RW(congestion_on);
-
-static struct attribute *pkt_wq_attrs[] = {
-	&dev_attr_congestion_on.attr,
-	&dev_attr_congestion_off.attr,
-	&dev_attr_size.attr,
-	NULL,
-};
-
-static const struct attribute_group pkt_wq_group = {
-	.name = "write_queue",
-	.attrs = pkt_wq_attrs,
-};
-
-static const struct attribute_group *pkt_groups[] = {
-	&pkt_stat_group,
-	&pkt_wq_group,
-	NULL,
-};
-
-static void pkt_sysfs_dev_new(struct pktcdvd_device *pd)
-{
-	if (class_is_registered(&class_pktcdvd)) {
-		pd->dev = device_create_with_groups(&class_pktcdvd, NULL,
-						    MKDEV(0, 0), pd, pkt_groups,
-						    "%s", pd->disk->disk_name);
-		if (IS_ERR(pd->dev))
-			pd->dev = NULL;
-	}
-}
-
-static void pkt_sysfs_dev_remove(struct pktcdvd_device *pd)
-{
-	if (class_is_registered(&class_pktcdvd))
-		device_unregister(pd->dev);
-}
-
-
-/********************************************************************
-  /sys/class/pktcdvd/
-                     add            map block device
-                     remove         unmap packet dev
-                     device_map     show mappings
- *******************************************************************/
-
-static ssize_t device_map_show(const struct class *c, const struct class_attribute *attr,
-			       char *data)
-{
-	int n = 0;
-	int idx;
-	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-	for (idx = 0; idx < MAX_WRITERS; idx++) {
-		struct pktcdvd_device *pd = pkt_devs[idx];
-		if (!pd)
-			continue;
-		n += sysfs_emit_at(data, n, "%s %u:%u %u:%u\n",
-			pd->disk->disk_name,
-			MAJOR(pd->pkt_dev), MINOR(pd->pkt_dev),
-			MAJOR(file_bdev(pd->bdev_file)->bd_dev),
-			MINOR(file_bdev(pd->bdev_file)->bd_dev));
-	}
-	mutex_unlock(&ctl_mutex);
-	return n;
-}
-static CLASS_ATTR_RO(device_map);
-
-static ssize_t add_store(const struct class *c, const struct class_attribute *attr,
-			 const char *buf, size_t count)
-{
-	unsigned int major, minor;
-
-	if (sscanf(buf, "%u:%u", &major, &minor) == 2) {
-		/* pkt_setup_dev() expects caller to hold reference to self */
-		if (!try_module_get(THIS_MODULE))
-			return -ENODEV;
-
-		pkt_setup_dev(MKDEV(major, minor), NULL);
-
-		module_put(THIS_MODULE);
-
-		return count;
-	}
-
-	return -EINVAL;
-}
-static CLASS_ATTR_WO(add);
-
-static ssize_t remove_store(const struct class *c, const struct class_attribute *attr,
-			    const char *buf, size_t count)
-{
-	unsigned int major, minor;
-	if (sscanf(buf, "%u:%u", &major, &minor) == 2) {
-		pkt_remove_dev(MKDEV(major, minor));
-		return count;
-	}
-	return -EINVAL;
-}
-static CLASS_ATTR_WO(remove);
-
-static struct attribute *class_pktcdvd_attrs[] = {
-	&class_attr_add.attr,
-	&class_attr_remove.attr,
-	&class_attr_device_map.attr,
-	NULL,
-};
-ATTRIBUTE_GROUPS(class_pktcdvd);
-
-static struct class class_pktcdvd = {
-	.name		= DRIVER_NAME,
-	.class_groups	= class_pktcdvd_groups,
-};
-
-static int pkt_sysfs_init(void)
-{
-	/*
-	 * create control files in sysfs
-	 * /sys/class/pktcdvd/...
-	 */
-	return class_register(&class_pktcdvd);
-}
-
-static void pkt_sysfs_cleanup(void)
-{
-	class_unregister(&class_pktcdvd);
-}
-
-/********************************************************************
-  entries in debugfs
-
-  /sys/kernel/debug/pktcdvd[0-7]/
-			info
-
- *******************************************************************/
-
-static void pkt_count_states(struct pktcdvd_device *pd, int *states)
-{
-	struct packet_data *pkt;
-	int i;
-
-	for (i = 0; i < PACKET_NUM_STATES; i++)
-		states[i] = 0;
-
-	spin_lock(&pd->cdrw.active_list_lock);
-	list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) {
-		states[pkt->state]++;
-	}
-	spin_unlock(&pd->cdrw.active_list_lock);
-}
-
-static int pkt_seq_show(struct seq_file *m, void *p)
-{
-	struct pktcdvd_device *pd = m->private;
-	char *msg;
-	int states[PACKET_NUM_STATES];
-
-	seq_printf(m, "Writer %s mapped to %pg:\n", pd->disk->disk_name,
-		   file_bdev(pd->bdev_file));
-
-	seq_printf(m, "\nSettings:\n");
-	seq_printf(m, "\tpacket size:\t\t%dkB\n", pd->settings.size / 2);
-
-	if (pd->settings.write_type == 0)
-		msg = "Packet";
-	else
-		msg = "Unknown";
-	seq_printf(m, "\twrite type:\t\t%s\n", msg);
-
-	seq_printf(m, "\tpacket type:\t\t%s\n", pd->settings.fp ? "Fixed" : "Variable");
-	seq_printf(m, "\tlink loss:\t\t%d\n", pd->settings.link_loss);
-
-	seq_printf(m, "\ttrack mode:\t\t%d\n", pd->settings.track_mode);
-
-	if (pd->settings.block_mode == PACKET_BLOCK_MODE1)
-		msg = "Mode 1";
-	else if (pd->settings.block_mode == PACKET_BLOCK_MODE2)
-		msg = "Mode 2";
-	else
-		msg = "Unknown";
-	seq_printf(m, "\tblock mode:\t\t%s\n", msg);
-
-	seq_printf(m, "\nStatistics:\n");
-	seq_printf(m, "\tpackets started:\t%lu\n", pd->stats.pkt_started);
-	seq_printf(m, "\tpackets ended:\t\t%lu\n", pd->stats.pkt_ended);
-	seq_printf(m, "\twritten:\t\t%lukB\n", pd->stats.secs_w >> 1);
-	seq_printf(m, "\tread gather:\t\t%lukB\n", pd->stats.secs_rg >> 1);
-	seq_printf(m, "\tread:\t\t\t%lukB\n", pd->stats.secs_r >> 1);
-
-	seq_printf(m, "\nMisc:\n");
-	seq_printf(m, "\treference count:\t%d\n", pd->refcnt);
-	seq_printf(m, "\tflags:\t\t\t0x%lx\n", pd->flags);
-	seq_printf(m, "\tread speed:\t\t%ukB/s\n", pd->read_speed);
-	seq_printf(m, "\twrite speed:\t\t%ukB/s\n", pd->write_speed);
-	seq_printf(m, "\tstart offset:\t\t%lu\n", pd->offset);
-	seq_printf(m, "\tmode page offset:\t%u\n", pd->mode_offset);
-
-	seq_printf(m, "\nQueue state:\n");
-	seq_printf(m, "\tbios queued:\t\t%d\n", pd->bio_queue_size);
-	seq_printf(m, "\tbios pending:\t\t%d\n", atomic_read(&pd->cdrw.pending_bios));
-	seq_printf(m, "\tcurrent sector:\t\t0x%llx\n", pd->current_sector);
-
-	pkt_count_states(pd, states);
-	seq_printf(m, "\tstate:\t\t\ti:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n",
-		   states[0], states[1], states[2], states[3], states[4], states[5]);
-
-	seq_printf(m, "\twrite congestion marks:\toff=%d on=%d\n",
-			pd->write_congestion_off,
-			pd->write_congestion_on);
-	return 0;
-}
-DEFINE_SHOW_ATTRIBUTE(pkt_seq);
-
-static void pkt_debugfs_dev_new(struct pktcdvd_device *pd)
-{
-	if (!pkt_debugfs_root)
-		return;
-	pd->dfs_d_root = debugfs_create_dir(pd->disk->disk_name, pkt_debugfs_root);
-
-	pd->dfs_f_info = debugfs_create_file("info", 0444, pd->dfs_d_root,
-					     pd, &pkt_seq_fops);
-}
-
-static void pkt_debugfs_dev_remove(struct pktcdvd_device *pd)
-{
-	if (!pkt_debugfs_root)
-		return;
-	debugfs_remove(pd->dfs_f_info);
-	debugfs_remove(pd->dfs_d_root);
-	pd->dfs_f_info = NULL;
-	pd->dfs_d_root = NULL;
-}
-
-static void pkt_debugfs_init(void)
-{
-	pkt_debugfs_root = debugfs_create_dir(DRIVER_NAME, NULL);
-}
-
-static void pkt_debugfs_cleanup(void)
-{
-	debugfs_remove(pkt_debugfs_root);
-	pkt_debugfs_root = NULL;
-}
-
-/* ----------------------------------------------------------*/
-
-
-static void pkt_bio_finished(struct pktcdvd_device *pd)
-{
-	struct device *ddev = disk_to_dev(pd->disk);
-
-	BUG_ON(atomic_read(&pd->cdrw.pending_bios) <= 0);
-	if (atomic_dec_and_test(&pd->cdrw.pending_bios)) {
-		dev_dbg(ddev, "queue empty\n");
-		atomic_set(&pd->iosched.attention, 1);
-		wake_up(&pd->wqueue);
-	}
-}
-
-/*
- * Allocate a packet_data struct
- */
-static struct packet_data *pkt_alloc_packet_data(int frames)
-{
-	int i;
-	struct packet_data *pkt;
-
-	pkt = kzalloc(sizeof(struct packet_data), GFP_KERNEL);
-	if (!pkt)
-		goto no_pkt;
-
-	pkt->frames = frames;
-	pkt->w_bio = bio_kmalloc(frames, GFP_KERNEL);
-	if (!pkt->w_bio)
-		goto no_bio;
-
-	for (i = 0; i < frames / FRAMES_PER_PAGE; i++) {
-		pkt->pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
-		if (!pkt->pages[i])
-			goto no_page;
-	}
-
-	spin_lock_init(&pkt->lock);
-	bio_list_init(&pkt->orig_bios);
-
-	for (i = 0; i < frames; i++) {
-		pkt->r_bios[i] = bio_kmalloc(1, GFP_KERNEL);
-		if (!pkt->r_bios[i])
-			goto no_rd_bio;
-	}
-
-	return pkt;
-
-no_rd_bio:
-	for (i = 0; i < frames; i++)
-		kfree(pkt->r_bios[i]);
-no_page:
-	for (i = 0; i < frames / FRAMES_PER_PAGE; i++)
-		if (pkt->pages[i])
-			__free_page(pkt->pages[i]);
-	kfree(pkt->w_bio);
-no_bio:
-	kfree(pkt);
-no_pkt:
-	return NULL;
-}
-
-/*
- * Free a packet_data struct
- */
-static void pkt_free_packet_data(struct packet_data *pkt)
-{
-	int i;
-
-	for (i = 0; i < pkt->frames; i++)
-		kfree(pkt->r_bios[i]);
-	for (i = 0; i < pkt->frames / FRAMES_PER_PAGE; i++)
-		__free_page(pkt->pages[i]);
-	kfree(pkt->w_bio);
-	kfree(pkt);
-}
-
-static void pkt_shrink_pktlist(struct pktcdvd_device *pd)
-{
-	struct packet_data *pkt, *next;
-
-	BUG_ON(!list_empty(&pd->cdrw.pkt_active_list));
-
-	list_for_each_entry_safe(pkt, next, &pd->cdrw.pkt_free_list, list) {
-		pkt_free_packet_data(pkt);
-	}
-	INIT_LIST_HEAD(&pd->cdrw.pkt_free_list);
-}
-
-static int pkt_grow_pktlist(struct pktcdvd_device *pd, int nr_packets)
-{
-	struct packet_data *pkt;
-
-	BUG_ON(!list_empty(&pd->cdrw.pkt_free_list));
-
-	while (nr_packets > 0) {
-		pkt = pkt_alloc_packet_data(pd->settings.size >> 2);
-		if (!pkt) {
-			pkt_shrink_pktlist(pd);
-			return 0;
-		}
-		pkt->id = nr_packets;
-		pkt->pd = pd;
-		list_add(&pkt->list, &pd->cdrw.pkt_free_list);
-		nr_packets--;
-	}
-	return 1;
-}
-
-static inline struct pkt_rb_node *pkt_rbtree_next(struct pkt_rb_node *node)
-{
-	struct rb_node *n = rb_next(&node->rb_node);
-	if (!n)
-		return NULL;
-	return rb_entry(n, struct pkt_rb_node, rb_node);
-}
-
-static void pkt_rbtree_erase(struct pktcdvd_device *pd, struct pkt_rb_node *node)
-{
-	rb_erase(&node->rb_node, &pd->bio_queue);
-	mempool_free(node, &pd->rb_pool);
-	pd->bio_queue_size--;
-	BUG_ON(pd->bio_queue_size < 0);
-}
-
-/*
- * Find the first node in the pd->bio_queue rb tree with a starting sector >= s.
- */
-static struct pkt_rb_node *pkt_rbtree_find(struct pktcdvd_device *pd, sector_t s)
-{
-	struct rb_node *n = pd->bio_queue.rb_node;
-	struct rb_node *next;
-	struct pkt_rb_node *tmp;
-
-	if (!n) {
-		BUG_ON(pd->bio_queue_size > 0);
-		return NULL;
-	}
-
-	for (;;) {
-		tmp = rb_entry(n, struct pkt_rb_node, rb_node);
-		if (s <= tmp->bio->bi_iter.bi_sector)
-			next = n->rb_left;
-		else
-			next = n->rb_right;
-		if (!next)
-			break;
-		n = next;
-	}
-
-	if (s > tmp->bio->bi_iter.bi_sector) {
-		tmp = pkt_rbtree_next(tmp);
-		if (!tmp)
-			return NULL;
-	}
-	BUG_ON(s > tmp->bio->bi_iter.bi_sector);
-	return tmp;
-}
-
-/*
- * Insert a node into the pd->bio_queue rb tree.
- */
-static void pkt_rbtree_insert(struct pktcdvd_device *pd, struct pkt_rb_node *node)
-{
-	struct rb_node **p = &pd->bio_queue.rb_node;
-	struct rb_node *parent = NULL;
-	sector_t s = node->bio->bi_iter.bi_sector;
-	struct pkt_rb_node *tmp;
-
-	while (*p) {
-		parent = *p;
-		tmp = rb_entry(parent, struct pkt_rb_node, rb_node);
-		if (s < tmp->bio->bi_iter.bi_sector)
-			p = &(*p)->rb_left;
-		else
-			p = &(*p)->rb_right;
-	}
-	rb_link_node(&node->rb_node, parent, p);
-	rb_insert_color(&node->rb_node, &pd->bio_queue);
-	pd->bio_queue_size++;
-}
-
-/*
- * Send a packet_command to the underlying block device and
- * wait for completion.
- */
-static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *cgc)
-{
-	struct request_queue *q = bdev_get_queue(file_bdev(pd->bdev_file));
-	struct scsi_cmnd *scmd;
-	struct request *rq;
-	int ret = 0;
-
-	rq = scsi_alloc_request(q, (cgc->data_direction == CGC_DATA_WRITE) ?
-			     REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
-	if (IS_ERR(rq))
-		return PTR_ERR(rq);
-	scmd = blk_mq_rq_to_pdu(rq);
-
-	if (cgc->buflen) {
-		ret = blk_rq_map_kern(rq, cgc->buffer, cgc->buflen,
-				      GFP_NOIO);
-		if (ret)
-			goto out;
-	}
-
-	scmd->cmd_len = COMMAND_SIZE(cgc->cmd[0]);
-	memcpy(scmd->cmnd, cgc->cmd, CDROM_PACKET_SIZE);
-
-	rq->timeout = 60*HZ;
-	if (cgc->quiet)
-		rq->rq_flags |= RQF_QUIET;
-
-	blk_execute_rq(rq, false);
-	if (scmd->result)
-		ret = -EIO;
-out:
-	blk_mq_free_request(rq);
-	return ret;
-}
-
-static const char *sense_key_string(__u8 index)
-{
-	static const char * const info[] = {
-		"No sense", "Recovered error", "Not ready",
-		"Medium error", "Hardware error", "Illegal request",
-		"Unit attention", "Data protect", "Blank check",
-	};
-
-	return index < ARRAY_SIZE(info) ? info[index] : "INVALID";
-}
-
-/*
- * A generic sense dump / resolve mechanism should be implemented across
- * all ATAPI + SCSI devices.
- */
-static void pkt_dump_sense(struct pktcdvd_device *pd,
-			   struct packet_command *cgc)
-{
-	struct device *ddev = disk_to_dev(pd->disk);
-	struct scsi_sense_hdr *sshdr = cgc->sshdr;
-
-	if (sshdr)
-		dev_err(ddev, "%*ph - sense %02x.%02x.%02x (%s)\n",
-			CDROM_PACKET_SIZE, cgc->cmd,
-			sshdr->sense_key, sshdr->asc, sshdr->ascq,
-			sense_key_string(sshdr->sense_key));
-	else
-		dev_err(ddev, "%*ph - no sense\n", CDROM_PACKET_SIZE, cgc->cmd);
-}
-
-/*
- * flush the drive cache to media
- */
-static int pkt_flush_cache(struct pktcdvd_device *pd)
-{
-	struct packet_command cgc;
-
-	init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
-	cgc.cmd[0] = GPCMD_FLUSH_CACHE;
-	cgc.quiet = 1;
-
-	/*
-	 * the IMMED bit -- we default to not setting it, although that
-	 * would allow a much faster close, this is safer
-	 */
-#if 0
-	cgc.cmd[1] = 1 << 1;
-#endif
-	return pkt_generic_packet(pd, &cgc);
-}
-
-/*
- * speed is given as the normal factor, e.g. 4 for 4x
- */
-static noinline_for_stack int pkt_set_speed(struct pktcdvd_device *pd,
-				unsigned write_speed, unsigned read_speed)
-{
-	struct packet_command cgc;
-	struct scsi_sense_hdr sshdr;
-	int ret;
-
-	init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
-	cgc.sshdr = &sshdr;
-	cgc.cmd[0] = GPCMD_SET_SPEED;
-	put_unaligned_be16(read_speed, &cgc.cmd[2]);
-	put_unaligned_be16(write_speed, &cgc.cmd[4]);
-
-	ret = pkt_generic_packet(pd, &cgc);
-	if (ret)
-		pkt_dump_sense(pd, &cgc);
-
-	return ret;
-}
-
-/*
- * Queue a bio for processing by the low-level CD device. Must be called
- * from process context.
- */
-static void pkt_queue_bio(struct pktcdvd_device *pd, struct bio *bio)
-{
-	/*
-	 * Some CDRW drives can not handle writes larger than one packet,
-	 * even if the size is a multiple of the packet size.
-	 */
-	bio->bi_opf |= REQ_NOMERGE;
-
-	spin_lock(&pd->iosched.lock);
-	if (bio_data_dir(bio) == READ)
-		bio_list_add(&pd->iosched.read_queue, bio);
-	else
-		bio_list_add(&pd->iosched.write_queue, bio);
-	spin_unlock(&pd->iosched.lock);
-
-	atomic_set(&pd->iosched.attention, 1);
-	wake_up(&pd->wqueue);
-}
-
-/*
- * Process the queued read/write requests. This function handles special
- * requirements for CDRW drives:
- * - A cache flush command must be inserted before a read request if the
- *   previous request was a write.
- * - Switching between reading and writing is slow, so don't do it more often
- *   than necessary.
- * - Optimize for throughput at the expense of latency. This means that streaming
- *   writes will never be interrupted by a read, but if the drive has to seek
- *   before the next write, switch to reading instead if there are any pending
- *   read requests.
- * - Set the read speed according to current usage pattern. When only reading
- *   from the device, it's best to use the highest possible read speed, but
- *   when switching often between reading and writing, it's better to have the
- *   same read and write speeds.
- */
-static void pkt_iosched_process_queue(struct pktcdvd_device *pd)
-{
-	struct device *ddev = disk_to_dev(pd->disk);
-
-	if (atomic_read(&pd->iosched.attention) == 0)
-		return;
-	atomic_set(&pd->iosched.attention, 0);
-
-	for (;;) {
-		struct bio *bio;
-		int reads_queued, writes_queued;
-
-		spin_lock(&pd->iosched.lock);
-		reads_queued = !bio_list_empty(&pd->iosched.read_queue);
-		writes_queued = !bio_list_empty(&pd->iosched.write_queue);
-		spin_unlock(&pd->iosched.lock);
-
-		if (!reads_queued && !writes_queued)
-			break;
-
-		if (pd->iosched.writing) {
-			int need_write_seek = 1;
-			spin_lock(&pd->iosched.lock);
-			bio = bio_list_peek(&pd->iosched.write_queue);
-			spin_unlock(&pd->iosched.lock);
-			if (bio && (bio->bi_iter.bi_sector ==
-				    pd->iosched.last_write))
-				need_write_seek = 0;
-			if (need_write_seek && reads_queued) {
-				if (atomic_read(&pd->cdrw.pending_bios) > 0) {
-					dev_dbg(ddev, "write, waiting\n");
-					break;
-				}
-				pkt_flush_cache(pd);
-				pd->iosched.writing = 0;
-			}
-		} else {
-			if (!reads_queued && writes_queued) {
-				if (atomic_read(&pd->cdrw.pending_bios) > 0) {
-					dev_dbg(ddev, "read, waiting\n");
-					break;
-				}
-				pd->iosched.writing = 1;
-			}
-		}
-
-		spin_lock(&pd->iosched.lock);
-		if (pd->iosched.writing)
-			bio = bio_list_pop(&pd->iosched.write_queue);
-		else
-			bio = bio_list_pop(&pd->iosched.read_queue);
-		spin_unlock(&pd->iosched.lock);
-
-		if (!bio)
-			continue;
-
-		if (bio_data_dir(bio) == READ)
-			pd->iosched.successive_reads +=
-				bio->bi_iter.bi_size >> 10;
-		else {
-			pd->iosched.successive_reads = 0;
-			pd->iosched.last_write = bio_end_sector(bio);
-		}
-		if (pd->iosched.successive_reads >= HI_SPEED_SWITCH) {
-			if (pd->read_speed == pd->write_speed) {
-				pd->read_speed = MAX_SPEED;
-				pkt_set_speed(pd, pd->write_speed, pd->read_speed);
-			}
-		} else {
-			if (pd->read_speed != pd->write_speed) {
-				pd->read_speed = pd->write_speed;
-				pkt_set_speed(pd, pd->write_speed, pd->read_speed);
-			}
-		}
-
-		atomic_inc(&pd->cdrw.pending_bios);
-		submit_bio_noacct(bio);
-	}
-}
-
-/*
- * Special care is needed if the underlying block device has a small
- * max_phys_segments value.
- */
-static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_queue *q)
-{
-	struct device *ddev = disk_to_dev(pd->disk);
-
-	if ((pd->settings.size << 9) / CD_FRAMESIZE <= queue_max_segments(q)) {
-		/*
-		 * The cdrom device can handle one segment/frame
-		 */
-		clear_bit(PACKET_MERGE_SEGS, &pd->flags);
-		return 0;
-	}
-
-	if ((pd->settings.size << 9) / PAGE_SIZE <= queue_max_segments(q)) {
-		/*
-		 * We can handle this case at the expense of some extra memory
-		 * copies during write operations
-		 */
-		set_bit(PACKET_MERGE_SEGS, &pd->flags);
-		return 0;
-	}
-
-	dev_err(ddev, "cdrom max_phys_segments too small\n");
-	return -EIO;
-}
-
-static void pkt_end_io_read(struct bio *bio)
-{
-	struct packet_data *pkt = bio->bi_private;
-	struct pktcdvd_device *pd = pkt->pd;
-	BUG_ON(!pd);
-
-	dev_dbg(disk_to_dev(pd->disk), "bio=%p sec0=%llx sec=%llx err=%d\n",
-		bio, pkt->sector, bio->bi_iter.bi_sector, bio->bi_status);
-
-	if (bio->bi_status)
-		atomic_inc(&pkt->io_errors);
-	bio_uninit(bio);
-	if (atomic_dec_and_test(&pkt->io_wait)) {
-		atomic_inc(&pkt->run_sm);
-		wake_up(&pd->wqueue);
-	}
-	pkt_bio_finished(pd);
-}
-
-static void pkt_end_io_packet_write(struct bio *bio)
-{
-	struct packet_data *pkt = bio->bi_private;
-	struct pktcdvd_device *pd = pkt->pd;
-	BUG_ON(!pd);
-
-	dev_dbg(disk_to_dev(pd->disk), "id=%d, err=%d\n", pkt->id, bio->bi_status);
-
-	pd->stats.pkt_ended++;
-
-	bio_uninit(bio);
-	pkt_bio_finished(pd);
-	atomic_dec(&pkt->io_wait);
-	atomic_inc(&pkt->run_sm);
-	wake_up(&pd->wqueue);
-}
-
-/*
- * Schedule reads for the holes in a packet
- */
-static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt)
-{
-	struct device *ddev = disk_to_dev(pd->disk);
-	int frames_read = 0;
-	struct bio *bio;
-	int f;
-	char written[PACKET_MAX_SIZE];
-
-	BUG_ON(bio_list_empty(&pkt->orig_bios));
-
-	atomic_set(&pkt->io_wait, 0);
-	atomic_set(&pkt->io_errors, 0);
-
-	/*
-	 * Figure out which frames we need to read before we can write.
-	 */
-	memset(written, 0, sizeof(written));
-	spin_lock(&pkt->lock);
-	bio_list_for_each(bio, &pkt->orig_bios) {
-		int first_frame = (bio->bi_iter.bi_sector - pkt->sector) /
-			(CD_FRAMESIZE >> 9);
-		int num_frames = bio->bi_iter.bi_size / CD_FRAMESIZE;
-		pd->stats.secs_w += num_frames * (CD_FRAMESIZE >> 9);
-		BUG_ON(first_frame < 0);
-		BUG_ON(first_frame + num_frames > pkt->frames);
-		for (f = first_frame; f < first_frame + num_frames; f++)
-			written[f] = 1;
-	}
-	spin_unlock(&pkt->lock);
-
-	if (pkt->cache_valid) {
-		dev_dbg(ddev, "zone %llx cached\n", pkt->sector);
-		goto out_account;
-	}
-
-	/*
-	 * Schedule reads for missing parts of the packet.
-	 */
-	for (f = 0; f < pkt->frames; f++) {
-		int p, offset;
-
-		if (written[f])
-			continue;
-
-		bio = pkt->r_bios[f];
-		bio_init(bio, file_bdev(pd->bdev_file), bio->bi_inline_vecs, 1,
-			 REQ_OP_READ);
-		bio->bi_iter.bi_sector = pkt->sector + f * (CD_FRAMESIZE >> 9);
-		bio->bi_end_io = pkt_end_io_read;
-		bio->bi_private = pkt;
-
-		p = (f * CD_FRAMESIZE) / PAGE_SIZE;
-		offset = (f * CD_FRAMESIZE) % PAGE_SIZE;
-		dev_dbg(ddev, "Adding frame %d, page:%p offs:%d\n", f,
-			pkt->pages[p], offset);
-		if (!bio_add_page(bio, pkt->pages[p], CD_FRAMESIZE, offset))
-			BUG();
-
-		atomic_inc(&pkt->io_wait);
-		pkt_queue_bio(pd, bio);
-		frames_read++;
-	}
-
-out_account:
-	dev_dbg(ddev, "need %d frames for zone %llx\n", frames_read, pkt->sector);
-	pd->stats.pkt_started++;
-	pd->stats.secs_rg += frames_read * (CD_FRAMESIZE >> 9);
-}
-
-/*
- * Find a packet matching zone, or the least recently used packet if
- * there is no match.
- */
-static struct packet_data *pkt_get_packet_data(struct pktcdvd_device *pd, int zone)
-{
-	struct packet_data *pkt;
-
-	list_for_each_entry(pkt, &pd->cdrw.pkt_free_list, list) {
-		if (pkt->sector == zone || pkt->list.next == &pd->cdrw.pkt_free_list) {
-			list_del_init(&pkt->list);
-			if (pkt->sector != zone)
-				pkt->cache_valid = 0;
-			return pkt;
-		}
-	}
-	BUG();
-	return NULL;
-}
-
-static void pkt_put_packet_data(struct pktcdvd_device *pd, struct packet_data *pkt)
-{
-	if (pkt->cache_valid) {
-		list_add(&pkt->list, &pd->cdrw.pkt_free_list);
-	} else {
-		list_add_tail(&pkt->list, &pd->cdrw.pkt_free_list);
-	}
-}
-
-static inline void pkt_set_state(struct device *ddev, struct packet_data *pkt,
-				 enum packet_data_state state)
-{
-	static const char *state_name[] = {
-		"IDLE", "WAITING", "READ_WAIT", "WRITE_WAIT", "RECOVERY", "FINISHED"
-	};
-	enum packet_data_state old_state = pkt->state;
-
-	dev_dbg(ddev, "pkt %2d : s=%6llx %s -> %s\n",
-		pkt->id, pkt->sector, state_name[old_state], state_name[state]);
-
-	pkt->state = state;
-}
-
-/*
- * Scan the work queue to see if we can start a new packet.
- * returns non-zero if any work was done.
- */
-static int pkt_handle_queue(struct pktcdvd_device *pd)
-{
-	struct device *ddev = disk_to_dev(pd->disk);
-	struct packet_data *pkt, *p;
-	struct bio *bio = NULL;
-	sector_t zone = 0; /* Suppress gcc warning */
-	struct pkt_rb_node *node, *first_node;
-	struct rb_node *n;
-
-	atomic_set(&pd->scan_queue, 0);
-
-	if (list_empty(&pd->cdrw.pkt_free_list)) {
-		dev_dbg(ddev, "no pkt\n");
-		return 0;
-	}
-
-	/*
-	 * Try to find a zone we are not already working on.
-	 */
-	spin_lock(&pd->lock);
-	first_node = pkt_rbtree_find(pd, pd->current_sector);
-	if (!first_node) {
-		n = rb_first(&pd->bio_queue);
-		if (n)
-			first_node = rb_entry(n, struct pkt_rb_node, rb_node);
-	}
-	node = first_node;
-	while (node) {
-		bio = node->bio;
-		zone = get_zone(bio->bi_iter.bi_sector, pd);
-		list_for_each_entry(p, &pd->cdrw.pkt_active_list, list) {
-			if (p->sector == zone) {
-				bio = NULL;
-				goto try_next_bio;
-			}
-		}
-		break;
-try_next_bio:
-		node = pkt_rbtree_next(node);
-		if (!node) {
-			n = rb_first(&pd->bio_queue);
-			if (n)
-				node = rb_entry(n, struct pkt_rb_node, rb_node);
-		}
-		if (node == first_node)
-			node = NULL;
-	}
-	spin_unlock(&pd->lock);
-	if (!bio) {
-		dev_dbg(ddev, "no bio\n");
-		return 0;
-	}
-
-	pkt = pkt_get_packet_data(pd, zone);
-
-	pd->current_sector = zone + pd->settings.size;
-	pkt->sector = zone;
-	BUG_ON(pkt->frames != pd->settings.size >> 2);
-	pkt->write_size = 0;
-
-	/*
-	 * Scan work queue for bios in the same zone and link them
-	 * to this packet.
-	 */
-	spin_lock(&pd->lock);
-	dev_dbg(ddev, "looking for zone %llx\n", zone);
-	while ((node = pkt_rbtree_find(pd, zone)) != NULL) {
-		sector_t tmp = get_zone(node->bio->bi_iter.bi_sector, pd);
-
-		bio = node->bio;
-		dev_dbg(ddev, "found zone=%llx\n", tmp);
-		if (tmp != zone)
-			break;
-		pkt_rbtree_erase(pd, node);
-		spin_lock(&pkt->lock);
-		bio_list_add(&pkt->orig_bios, bio);
-		pkt->write_size += bio->bi_iter.bi_size / CD_FRAMESIZE;
-		spin_unlock(&pkt->lock);
-	}
-	/* check write congestion marks, and if bio_queue_size is
-	 * below, wake up any waiters
-	 */
-	if (pd->congested &&
-	    pd->bio_queue_size <= pd->write_congestion_off) {
-		pd->congested = false;
-		wake_up_var(&pd->congested);
-	}
-	spin_unlock(&pd->lock);
-
-	pkt->sleep_time = max(PACKET_WAIT_TIME, 1);
-	pkt_set_state(ddev, pkt, PACKET_WAITING_STATE);
-	atomic_set(&pkt->run_sm, 1);
-
-	spin_lock(&pd->cdrw.active_list_lock);
-	list_add(&pkt->list, &pd->cdrw.pkt_active_list);
-	spin_unlock(&pd->cdrw.active_list_lock);
-
-	return 1;
-}
-
-/**
- * bio_list_copy_data - copy contents of data buffers from one chain of bios to
- * another
- * @src: source bio list
- * @dst: destination bio list
- *
- * Stops when it reaches the end of either the @src list or @dst list - that is,
- * copies min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of
- * bios).
- */
-static void bio_list_copy_data(struct bio *dst, struct bio *src)
-{
-	struct bvec_iter src_iter = src->bi_iter;
-	struct bvec_iter dst_iter = dst->bi_iter;
-
-	while (1) {
-		if (!src_iter.bi_size) {
-			src = src->bi_next;
-			if (!src)
-				break;
-
-			src_iter = src->bi_iter;
-		}
-
-		if (!dst_iter.bi_size) {
-			dst = dst->bi_next;
-			if (!dst)
-				break;
-
-			dst_iter = dst->bi_iter;
-		}
-
-		bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
-	}
-}
-
-/*
- * Assemble a bio to write one packet and queue the bio for processing
- * by the underlying block device.
- */
-static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
-{
-	struct device *ddev = disk_to_dev(pd->disk);
-	int f;
-
-	bio_init(pkt->w_bio, file_bdev(pd->bdev_file), pkt->w_bio->bi_inline_vecs,
-		 pkt->frames, REQ_OP_WRITE);
-	pkt->w_bio->bi_iter.bi_sector = pkt->sector;
-	pkt->w_bio->bi_end_io = pkt_end_io_packet_write;
-	pkt->w_bio->bi_private = pkt;
-
-	/* XXX: locking? */
-	for (f = 0; f < pkt->frames; f++) {
-		struct page *page = pkt->pages[(f * CD_FRAMESIZE) / PAGE_SIZE];
-		unsigned offset = (f * CD_FRAMESIZE) % PAGE_SIZE;
-
-		if (!bio_add_page(pkt->w_bio, page, CD_FRAMESIZE, offset))
-			BUG();
-	}
-	dev_dbg(ddev, "vcnt=%d\n", pkt->w_bio->bi_vcnt);
-
-	/*
-	 * Fill-in bvec with data from orig_bios.
-	 */
-	spin_lock(&pkt->lock);
-	bio_list_copy_data(pkt->w_bio, pkt->orig_bios.head);
-
-	pkt_set_state(ddev, pkt, PACKET_WRITE_WAIT_STATE);
-	spin_unlock(&pkt->lock);
-
-	dev_dbg(ddev, "Writing %d frames for zone %llx\n", pkt->write_size, pkt->sector);
-
-	if (test_bit(PACKET_MERGE_SEGS, &pd->flags) || (pkt->write_size < pkt->frames))
-		pkt->cache_valid = 1;
-	else
-		pkt->cache_valid = 0;
-
-	/* Start the write request */
-	atomic_set(&pkt->io_wait, 1);
-	pkt_queue_bio(pd, pkt->w_bio);
-}
-
-static void pkt_finish_packet(struct packet_data *pkt, blk_status_t status)
-{
-	struct bio *bio;
-
-	if (status)
-		pkt->cache_valid = 0;
-
-	/* Finish all bios corresponding to this packet */
-	while ((bio = bio_list_pop(&pkt->orig_bios))) {
-		bio->bi_status = status;
-		bio_endio(bio);
-	}
-}
-
-static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data *pkt)
-{
-	struct device *ddev = disk_to_dev(pd->disk);
-
-	dev_dbg(ddev, "pkt %d\n", pkt->id);
-
-	for (;;) {
-		switch (pkt->state) {
-		case PACKET_WAITING_STATE:
-			if ((pkt->write_size < pkt->frames) && (pkt->sleep_time > 0))
-				return;
-
-			pkt->sleep_time = 0;
-			pkt_gather_data(pd, pkt);
-			pkt_set_state(ddev, pkt, PACKET_READ_WAIT_STATE);
-			break;
-
-		case PACKET_READ_WAIT_STATE:
-			if (atomic_read(&pkt->io_wait) > 0)
-				return;
-
-			if (atomic_read(&pkt->io_errors) > 0) {
-				pkt_set_state(ddev, pkt, PACKET_RECOVERY_STATE);
-			} else {
-				pkt_start_write(pd, pkt);
-			}
-			break;
-
-		case PACKET_WRITE_WAIT_STATE:
-			if (atomic_read(&pkt->io_wait) > 0)
-				return;
-
-			if (!pkt->w_bio->bi_status) {
-				pkt_set_state(ddev, pkt, PACKET_FINISHED_STATE);
-			} else {
-				pkt_set_state(ddev, pkt, PACKET_RECOVERY_STATE);
-			}
-			break;
-
-		case PACKET_RECOVERY_STATE:
-			dev_dbg(ddev, "No recovery possible\n");
-			pkt_set_state(ddev, pkt, PACKET_FINISHED_STATE);
-			break;
-
-		case PACKET_FINISHED_STATE:
-			pkt_finish_packet(pkt, pkt->w_bio->bi_status);
-			return;
-
-		default:
-			BUG();
-			break;
-		}
-	}
-}
-
-static void pkt_handle_packets(struct pktcdvd_device *pd)
-{
-	struct device *ddev = disk_to_dev(pd->disk);
-	struct packet_data *pkt, *next;
-
-	/*
-	 * Run state machine for active packets
-	 */
-	list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) {
-		if (atomic_read(&pkt->run_sm) > 0) {
-			atomic_set(&pkt->run_sm, 0);
-			pkt_run_state_machine(pd, pkt);
-		}
-	}
-
-	/*
-	 * Move no longer active packets to the free list
-	 */
-	spin_lock(&pd->cdrw.active_list_lock);
-	list_for_each_entry_safe(pkt, next, &pd->cdrw.pkt_active_list, list) {
-		if (pkt->state == PACKET_FINISHED_STATE) {
-			list_del(&pkt->list);
-			pkt_put_packet_data(pd, pkt);
-			pkt_set_state(ddev, pkt, PACKET_IDLE_STATE);
-			atomic_set(&pd->scan_queue, 1);
-		}
-	}
-	spin_unlock(&pd->cdrw.active_list_lock);
-}
-
-/*
- * kcdrwd is woken up when writes have been queued for one of our
- * registered devices
- */
-static int kcdrwd(void *foobar)
-{
-	struct pktcdvd_device *pd = foobar;
-	struct device *ddev = disk_to_dev(pd->disk);
-	struct packet_data *pkt;
-	int states[PACKET_NUM_STATES];
-	long min_sleep_time, residue;
-
-	set_user_nice(current, MIN_NICE);
-	set_freezable();
-
-	for (;;) {
-		DECLARE_WAITQUEUE(wait, current);
-
-		/*
-		 * Wait until there is something to do
-		 */
-		add_wait_queue(&pd->wqueue, &wait);
-		for (;;) {
-			set_current_state(TASK_INTERRUPTIBLE);
-
-			/* Check if we need to run pkt_handle_queue */
-			if (atomic_read(&pd->scan_queue) > 0)
-				goto work_to_do;
-
-			/* Check if we need to run the state machine for some packet */
-			list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) {
-				if (atomic_read(&pkt->run_sm) > 0)
-					goto work_to_do;
-			}
-
-			/* Check if we need to process the iosched queues */
-			if (atomic_read(&pd->iosched.attention) != 0)
-				goto work_to_do;
-
-			/* Otherwise, go to sleep */
-			pkt_count_states(pd, states);
-			dev_dbg(ddev, "i:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n",
-				states[0], states[1], states[2], states[3], states[4], states[5]);
-
-			min_sleep_time = MAX_SCHEDULE_TIMEOUT;
-			list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) {
-				if (pkt->sleep_time && pkt->sleep_time < min_sleep_time)
-					min_sleep_time = pkt->sleep_time;
-			}
-
-			dev_dbg(ddev, "sleeping\n");
-			residue = schedule_timeout(min_sleep_time);
-			dev_dbg(ddev, "wake up\n");
-
-			/* make swsusp happy with our thread */
-			try_to_freeze();
-
-			list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) {
-				if (!pkt->sleep_time)
-					continue;
-				pkt->sleep_time -= min_sleep_time - residue;
-				if (pkt->sleep_time <= 0) {
-					pkt->sleep_time = 0;
-					atomic_inc(&pkt->run_sm);
-				}
-			}
-
-			if (kthread_should_stop())
-				break;
-		}
-work_to_do:
-		set_current_state(TASK_RUNNING);
-		remove_wait_queue(&pd->wqueue, &wait);
-
-		if (kthread_should_stop())
-			break;
-
-		/*
-		 * if pkt_handle_queue returns true, we can queue
-		 * another request.
-		 */
-		while (pkt_handle_queue(pd))
-			;
-
-		/*
-		 * Handle packet state machine
-		 */
-		pkt_handle_packets(pd);
-
-		/*
-		 * Handle iosched queues
-		 */
-		pkt_iosched_process_queue(pd);
-	}
-
-	return 0;
-}
-
-static void pkt_print_settings(struct pktcdvd_device *pd)
-{
-	dev_info(disk_to_dev(pd->disk), "%s packets, %u blocks, Mode-%c disc\n",
-		 pd->settings.fp ? "Fixed" : "Variable",
-		 pd->settings.size >> 2,
-		 pd->settings.block_mode == 8 ? '1' : '2');
-}
-
-static int pkt_mode_sense(struct pktcdvd_device *pd, struct packet_command *cgc, int page_code, int page_control)
-{
-	memset(cgc->cmd, 0, sizeof(cgc->cmd));
-
-	cgc->cmd[0] = GPCMD_MODE_SENSE_10;
-	cgc->cmd[2] = page_code | (page_control << 6);
-	put_unaligned_be16(cgc->buflen, &cgc->cmd[7]);
-	cgc->data_direction = CGC_DATA_READ;
-	return pkt_generic_packet(pd, cgc);
-}
-
-static int pkt_mode_select(struct pktcdvd_device *pd, struct packet_command *cgc)
-{
-	memset(cgc->cmd, 0, sizeof(cgc->cmd));
-	memset(cgc->buffer, 0, 2);
-	cgc->cmd[0] = GPCMD_MODE_SELECT_10;
-	cgc->cmd[1] = 0x10;		/* PF */
-	put_unaligned_be16(cgc->buflen, &cgc->cmd[7]);
-	cgc->data_direction = CGC_DATA_WRITE;
-	return pkt_generic_packet(pd, cgc);
-}
-
-static int pkt_get_disc_info(struct pktcdvd_device *pd, disc_information *di)
-{
-	struct packet_command cgc;
-	int ret;
-
-	/* set up command and get the disc info */
-	init_cdrom_command(&cgc, di, sizeof(*di), CGC_DATA_READ);
-	cgc.cmd[0] = GPCMD_READ_DISC_INFO;
-	cgc.cmd[8] = cgc.buflen = 2;
-	cgc.quiet = 1;
-
-	ret = pkt_generic_packet(pd, &cgc);
-	if (ret)
-		return ret;
-
-	/* not all drives have the same disc_info length, so requeue
-	 * packet with the length the drive tells us it can supply
-	 */
-	cgc.buflen = be16_to_cpu(di->disc_information_length) +
-		     sizeof(di->disc_information_length);
-
-	if (cgc.buflen > sizeof(disc_information))
-		cgc.buflen = sizeof(disc_information);
-
-	cgc.cmd[8] = cgc.buflen;
-	return pkt_generic_packet(pd, &cgc);
-}
-
-static int pkt_get_track_info(struct pktcdvd_device *pd, __u16 track, __u8 type, track_information *ti)
-{
-	struct packet_command cgc;
-	int ret;
-
-	init_cdrom_command(&cgc, ti, 8, CGC_DATA_READ);
-	cgc.cmd[0] = GPCMD_READ_TRACK_RZONE_INFO;
-	cgc.cmd[1] = type & 3;
-	put_unaligned_be16(track, &cgc.cmd[4]);
-	cgc.cmd[8] = 8;
-	cgc.quiet = 1;
-
-	ret = pkt_generic_packet(pd, &cgc);
-	if (ret)
-		return ret;
-
-	cgc.buflen = be16_to_cpu(ti->track_information_length) +
-		     sizeof(ti->track_information_length);
-
-	if (cgc.buflen > sizeof(track_information))
-		cgc.buflen = sizeof(track_information);
-
-	cgc.cmd[8] = cgc.buflen;
-	return pkt_generic_packet(pd, &cgc);
-}
-
-static noinline_for_stack int pkt_get_last_written(struct pktcdvd_device *pd,
-						long *last_written)
-{
-	disc_information di;
-	track_information ti;
-	__u32 last_track;
-	int ret;
-
-	ret = pkt_get_disc_info(pd, &di);
-	if (ret)
-		return ret;
-
-	last_track = (di.last_track_msb << 8) | di.last_track_lsb;
-	ret = pkt_get_track_info(pd, last_track, 1, &ti);
-	if (ret)
-		return ret;
-
-	/* if this track is blank, try the previous. */
-	if (ti.blank) {
-		last_track--;
-		ret = pkt_get_track_info(pd, last_track, 1, &ti);
-		if (ret)
-			return ret;
-	}
-
-	/* if last recorded field is valid, return it. */
-	if (ti.lra_v) {
-		*last_written = be32_to_cpu(ti.last_rec_address);
-	} else {
-		/* make it up instead */
-		*last_written = be32_to_cpu(ti.track_start) +
-				be32_to_cpu(ti.track_size);
-		if (ti.free_blocks)
-			*last_written -= (be32_to_cpu(ti.free_blocks) + 7);
-	}
-	return 0;
-}
-
-/*
- * write mode select package based on pd->settings
- */
-static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd)
-{
-	struct device *ddev = disk_to_dev(pd->disk);
-	struct packet_command cgc;
-	struct scsi_sense_hdr sshdr;
-	write_param_page *wp;
-	char buffer[128];
-	int ret, size;
-
-	/* doesn't apply to DVD+RW or DVD-RAM */
-	if ((pd->mmc3_profile == 0x1a) || (pd->mmc3_profile == 0x12))
-		return 0;
-
-	memset(buffer, 0, sizeof(buffer));
-	init_cdrom_command(&cgc, buffer, sizeof(*wp), CGC_DATA_READ);
-	cgc.sshdr = &sshdr;
-	ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0);
-	if (ret) {
-		pkt_dump_sense(pd, &cgc);
-		return ret;
-	}
-
-	size = 2 + get_unaligned_be16(&buffer[0]);
-	pd->mode_offset = get_unaligned_be16(&buffer[6]);
-	if (size > sizeof(buffer))
-		size = sizeof(buffer);
-
-	/*
-	 * now get it all
-	 */
-	init_cdrom_command(&cgc, buffer, size, CGC_DATA_READ);
-	cgc.sshdr = &sshdr;
-	ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0);
-	if (ret) {
-		pkt_dump_sense(pd, &cgc);
-		return ret;
-	}
-
-	/*
-	 * write page is offset header + block descriptor length
-	 */
-	wp = (write_param_page *) &buffer[sizeof(struct mode_page_header) + pd->mode_offset];
-
-	wp->fp = pd->settings.fp;
-	wp->track_mode = pd->settings.track_mode;
-	wp->write_type = pd->settings.write_type;
-	wp->data_block_type = pd->settings.block_mode;
-
-	wp->multi_session = 0;
-
-#ifdef PACKET_USE_LS
-	wp->link_size = 7;
-	wp->ls_v = 1;
-#endif
-
-	if (wp->data_block_type == PACKET_BLOCK_MODE1) {
-		wp->session_format = 0;
-		wp->subhdr2 = 0x20;
-	} else if (wp->data_block_type == PACKET_BLOCK_MODE2) {
-		wp->session_format = 0x20;
-		wp->subhdr2 = 8;
-#if 0
-		wp->mcn[0] = 0x80;
-		memcpy(&wp->mcn[1], PACKET_MCN, sizeof(wp->mcn) - 1);
-#endif
-	} else {
-		/*
-		 * paranoia
-		 */
-		dev_err(ddev, "write mode wrong %d\n", wp->data_block_type);
-		return 1;
-	}
-	wp->packet_size = cpu_to_be32(pd->settings.size >> 2);
-
-	cgc.buflen = cgc.cmd[8] = size;
-	ret = pkt_mode_select(pd, &cgc);
-	if (ret) {
-		pkt_dump_sense(pd, &cgc);
-		return ret;
-	}
-
-	pkt_print_settings(pd);
-	return 0;
-}
-
-/*
- * 1 -- we can write to this track, 0 -- we can't
- */
-static int pkt_writable_track(struct pktcdvd_device *pd, track_information *ti)
-{
-	struct device *ddev = disk_to_dev(pd->disk);
-
-	switch (pd->mmc3_profile) {
-		case 0x1a: /* DVD+RW */
-		case 0x12: /* DVD-RAM */
-			/* The track is always writable on DVD+RW/DVD-RAM */
-			return 1;
-		default:
-			break;
-	}
-
-	if (!ti->packet || !ti->fp)
-		return 0;
-
-	/*
-	 * "good" settings as per Mt Fuji.
-	 */
-	if (ti->rt == 0 && ti->blank == 0)
-		return 1;
-
-	if (ti->rt == 0 && ti->blank == 1)
-		return 1;
-
-	if (ti->rt == 1 && ti->blank == 0)
-		return 1;
-
-	dev_err(ddev, "bad state %d-%d-%d\n", ti->rt, ti->blank, ti->packet);
-	return 0;
-}
-
-/*
- * 1 -- we can write to this disc, 0 -- we can't
- */
-static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di)
-{
-	struct device *ddev = disk_to_dev(pd->disk);
-
-	switch (pd->mmc3_profile) {
-		case 0x0a: /* CD-RW */
-		case 0xffff: /* MMC3 not supported */
-			break;
-		case 0x1a: /* DVD+RW */
-		case 0x13: /* DVD-RW */
-		case 0x12: /* DVD-RAM */
-			return 1;
-		default:
-			dev_dbg(ddev, "Wrong disc profile (%x)\n", pd->mmc3_profile);
-			return 0;
-	}
-
-	/*
-	 * for disc type 0xff we should probably reserve a new track.
-	 * but i'm not sure, should we leave this to user apps? probably.
-	 */
-	if (di->disc_type == 0xff) {
-		dev_notice(ddev, "unknown disc - no track?\n");
-		return 0;
-	}
-
-	if (di->disc_type != 0x20 && di->disc_type != 0) {
-		dev_err(ddev, "wrong disc type (%x)\n", di->disc_type);
-		return 0;
-	}
-
-	if (di->erasable == 0) {
-		dev_err(ddev, "disc not erasable\n");
-		return 0;
-	}
-
-	if (di->border_status == PACKET_SESSION_RESERVED) {
-		dev_err(ddev, "can't write to last track (reserved)\n");
-		return 0;
-	}
-
-	return 1;
-}
-
-static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd)
-{
-	struct device *ddev = disk_to_dev(pd->disk);
-	struct packet_command cgc;
-	unsigned char buf[12];
-	disc_information di;
-	track_information ti;
-	int ret, track;
-
-	init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ);
-	cgc.cmd[0] = GPCMD_GET_CONFIGURATION;
-	cgc.cmd[8] = 8;
-	ret = pkt_generic_packet(pd, &cgc);
-	pd->mmc3_profile = ret ? 0xffff : get_unaligned_be16(&buf[6]);
-
-	memset(&di, 0, sizeof(disc_information));
-	memset(&ti, 0, sizeof(track_information));
-
-	ret = pkt_get_disc_info(pd, &di);
-	if (ret) {
-		dev_err(ddev, "failed get_disc\n");
-		return ret;
-	}
-
-	if (!pkt_writable_disc(pd, &di))
-		return -EROFS;
-
-	pd->type = di.erasable ? PACKET_CDRW : PACKET_CDR;
-
-	track = 1; /* (di.last_track_msb << 8) | di.last_track_lsb; */
-	ret = pkt_get_track_info(pd, track, 1, &ti);
-	if (ret) {
-		dev_err(ddev, "failed get_track\n");
-		return ret;
-	}
-
-	if (!pkt_writable_track(pd, &ti)) {
-		dev_err(ddev, "can't write to this track\n");
-		return -EROFS;
-	}
-
-	/*
-	 * we keep packet size in 512 byte units, makes it easier to
-	 * deal with request calculations.
-	 */
-	pd->settings.size = be32_to_cpu(ti.fixed_packet_size) << 2;
-	if (pd->settings.size == 0) {
-		dev_notice(ddev, "detected zero packet size!\n");
-		return -ENXIO;
-	}
-	if (pd->settings.size > PACKET_MAX_SECTORS) {
-		dev_err(ddev, "packet size is too big\n");
-		return -EROFS;
-	}
-	pd->settings.fp = ti.fp;
-	pd->offset = (be32_to_cpu(ti.track_start) << 2) & (pd->settings.size - 1);
-
-	if (ti.nwa_v) {
-		pd->nwa = be32_to_cpu(ti.next_writable);
-		set_bit(PACKET_NWA_VALID, &pd->flags);
-	}
-
-	/*
-	 * in theory we could use lra on -RW media as well and just zero
-	 * blocks that haven't been written yet, but in practice that
-	 * is just a no-go. we'll use that for -R, naturally.
-	 */
-	if (ti.lra_v) {
-		pd->lra = be32_to_cpu(ti.last_rec_address);
-		set_bit(PACKET_LRA_VALID, &pd->flags);
-	} else {
-		pd->lra = 0xffffffff;
-		set_bit(PACKET_LRA_VALID, &pd->flags);
-	}
-
-	/*
-	 * fine for now
-	 */
-	pd->settings.link_loss = 7;
-	pd->settings.write_type = 0;	/* packet */
-	pd->settings.track_mode = ti.track_mode;
-
-	/*
-	 * mode1 or mode2 disc
-	 */
-	switch (ti.data_mode) {
-		case PACKET_MODE1:
-			pd->settings.block_mode = PACKET_BLOCK_MODE1;
-			break;
-		case PACKET_MODE2:
-			pd->settings.block_mode = PACKET_BLOCK_MODE2;
-			break;
-		default:
-			dev_err(ddev, "unknown data mode\n");
-			return -EROFS;
-	}
-	return 0;
-}
-
-/*
- * enable/disable write caching on drive
- */
-static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd)
-{
-	struct device *ddev = disk_to_dev(pd->disk);
-	struct packet_command cgc;
-	struct scsi_sense_hdr sshdr;
-	unsigned char buf[64];
-	bool set = IS_ENABLED(CONFIG_CDROM_PKTCDVD_WCACHE);
-	int ret;
-
-	init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ);
-	cgc.sshdr = &sshdr;
-	cgc.buflen = pd->mode_offset + 12;
-
-	/*
-	 * caching mode page might not be there, so quiet this command
-	 */
-	cgc.quiet = 1;
-
-	ret = pkt_mode_sense(pd, &cgc, GPMODE_WCACHING_PAGE, 0);
-	if (ret)
-		return ret;
-
-	/*
-	 * use drive write caching -- we need deferred error handling to be
-	 * able to successfully recover with this option (drive will return good
-	 * status as soon as the cdb is validated).
-	 */
-	buf[pd->mode_offset + 10] |= (set << 2);
-
-	cgc.buflen = cgc.cmd[8] = 2 + get_unaligned_be16(&buf[0]);
-	ret = pkt_mode_select(pd, &cgc);
-	if (ret) {
-		dev_err(ddev, "write caching control failed\n");
-		pkt_dump_sense(pd, &cgc);
-	} else if (!ret && set)
-		dev_notice(ddev, "enabled write caching\n");
-	return ret;
-}
-
-static int pkt_lock_door(struct pktcdvd_device *pd, int lockflag)
-{
-	struct packet_command cgc;
-
-	init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
-	cgc.cmd[0] = GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL;
-	cgc.cmd[4] = lockflag ? 1 : 0;
-	return pkt_generic_packet(pd, &cgc);
-}
-
-/*
- * Returns drive maximum write speed
- */
-static noinline_for_stack int pkt_get_max_speed(struct pktcdvd_device *pd,
-						unsigned *write_speed)
-{
-	struct packet_command cgc;
-	struct scsi_sense_hdr sshdr;
-	unsigned char buf[256+18];
-	unsigned char *cap_buf;
-	int ret, offset;
-
-	cap_buf = &buf[sizeof(struct mode_page_header) + pd->mode_offset];
-	init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_UNKNOWN);
-	cgc.sshdr = &sshdr;
-
-	ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0);
-	if (ret) {
-		cgc.buflen = pd->mode_offset + cap_buf[1] + 2 +
-			     sizeof(struct mode_page_header);
-		ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0);
-		if (ret) {
-			pkt_dump_sense(pd, &cgc);
-			return ret;
-		}
-	}
-
-	offset = 20;			    /* Obsoleted field, used by older drives */
-	if (cap_buf[1] >= 28)
-		offset = 28;		    /* Current write speed selected */
-	if (cap_buf[1] >= 30) {
-		/* If the drive reports at least one "Logical Unit Write
-		 * Speed Performance Descriptor Block", use the information
-		 * in the first block. (contains the highest speed)
-		 */
-		int num_spdb = get_unaligned_be16(&cap_buf[30]);
-		if (num_spdb > 0)
-			offset = 34;
-	}
-
-	*write_speed = get_unaligned_be16(&cap_buf[offset]);
-	return 0;
-}
-
-/* These tables from cdrecord - I don't have orange book */
-/* standard speed CD-RW (1-4x) */
-static char clv_to_speed[16] = {
-	/* 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 */
-	   0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-};
-/* high speed CD-RW (-10x) */
-static char hs_clv_to_speed[16] = {
-	/* 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 */
-	   0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-};
-/* ultra high speed CD-RW */
-static char us_clv_to_speed[16] = {
-	/* 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 */
-	   0, 2, 4, 8, 0, 0,16, 0,24,32,40,48, 0, 0, 0, 0
-};
-
-/*
- * reads the maximum media speed from ATIP
- */
-static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd,
-						unsigned *speed)
-{
-	struct device *ddev = disk_to_dev(pd->disk);
-	struct packet_command cgc;
-	struct scsi_sense_hdr sshdr;
-	unsigned char buf[64];
-	unsigned int size, st, sp;
-	int ret;
-
-	init_cdrom_command(&cgc, buf, 2, CGC_DATA_READ);
-	cgc.sshdr = &sshdr;
-	cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP;
-	cgc.cmd[1] = 2;
-	cgc.cmd[2] = 4; /* READ ATIP */
-	cgc.cmd[8] = 2;
-	ret = pkt_generic_packet(pd, &cgc);
-	if (ret) {
-		pkt_dump_sense(pd, &cgc);
-		return ret;
-	}
-	size = 2 + get_unaligned_be16(&buf[0]);
-	if (size > sizeof(buf))
-		size = sizeof(buf);
-
-	init_cdrom_command(&cgc, buf, size, CGC_DATA_READ);
-	cgc.sshdr = &sshdr;
-	cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP;
-	cgc.cmd[1] = 2;
-	cgc.cmd[2] = 4;
-	cgc.cmd[8] = size;
-	ret = pkt_generic_packet(pd, &cgc);
-	if (ret) {
-		pkt_dump_sense(pd, &cgc);
-		return ret;
-	}
-
-	if (!(buf[6] & 0x40)) {
-		dev_notice(ddev, "disc type is not CD-RW\n");
-		return 1;
-	}
-	if (!(buf[6] & 0x4)) {
-		dev_notice(ddev, "A1 values on media are not valid, maybe not CDRW?\n");
-		return 1;
-	}
-
-	st = (buf[6] >> 3) & 0x7; /* disc sub-type */
-
-	sp = buf[16] & 0xf; /* max speed from ATIP A1 field */
-
-	/* Info from cdrecord */
-	switch (st) {
-		case 0: /* standard speed */
-			*speed = clv_to_speed[sp];
-			break;
-		case 1: /* high speed */
-			*speed = hs_clv_to_speed[sp];
-			break;
-		case 2: /* ultra high speed */
-			*speed = us_clv_to_speed[sp];
-			break;
-		default:
-			dev_notice(ddev, "unknown disc sub-type %d\n", st);
-			return 1;
-	}
-	if (*speed) {
-		dev_info(ddev, "maximum media speed: %d\n", *speed);
-		return 0;
-	} else {
-		dev_notice(ddev, "unknown speed %d for sub-type %d\n", sp, st);
-		return 1;
-	}
-}
-
-static noinline_for_stack int pkt_perform_opc(struct pktcdvd_device *pd)
-{
-	struct device *ddev = disk_to_dev(pd->disk);
-	struct packet_command cgc;
-	struct scsi_sense_hdr sshdr;
-	int ret;
-
-	dev_dbg(ddev, "Performing OPC\n");
-
-	init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
-	cgc.sshdr = &sshdr;
-	cgc.timeout = 60*HZ;
-	cgc.cmd[0] = GPCMD_SEND_OPC;
-	cgc.cmd[1] = 1;
-	ret = pkt_generic_packet(pd, &cgc);
-	if (ret)
-		pkt_dump_sense(pd, &cgc);
-	return ret;
-}
-
-static int pkt_open_write(struct pktcdvd_device *pd)
-{
-	struct device *ddev = disk_to_dev(pd->disk);
-	int ret;
-	unsigned int write_speed, media_write_speed, read_speed;
-
-	ret = pkt_probe_settings(pd);
-	if (ret) {
-		dev_dbg(ddev, "failed probe\n");
-		return ret;
-	}
-
-	ret = pkt_set_write_settings(pd);
-	if (ret) {
-		dev_notice(ddev, "failed saving write settings\n");
-		return -EIO;
-	}
-
-	pkt_write_caching(pd);
-
-	ret = pkt_get_max_speed(pd, &write_speed);
-	if (ret)
-		write_speed = 16 * 177;
-	switch (pd->mmc3_profile) {
-		case 0x13: /* DVD-RW */
-		case 0x1a: /* DVD+RW */
-		case 0x12: /* DVD-RAM */
-			dev_notice(ddev, "write speed %ukB/s\n", write_speed);
-			break;
-		default:
-			ret = pkt_media_speed(pd, &media_write_speed);
-			if (ret)
-				media_write_speed = 16;
-			write_speed = min(write_speed, media_write_speed * 177);
-			dev_notice(ddev, "write speed %ux\n", write_speed / 176);
-			break;
-	}
-	read_speed = write_speed;
-
-	ret = pkt_set_speed(pd, write_speed, read_speed);
-	if (ret) {
-		dev_notice(ddev, "couldn't set write speed\n");
-		return -EIO;
-	}
-	pd->write_speed = write_speed;
-	pd->read_speed = read_speed;
-
-	ret = pkt_perform_opc(pd);
-	if (ret)
-		dev_notice(ddev, "Optimum Power Calibration failed\n");
-
-	return 0;
-}
-
-/*
- * called at open time.
- */
-static int pkt_open_dev(struct pktcdvd_device *pd, bool write)
-{
-	struct device *ddev = disk_to_dev(pd->disk);
-	int ret;
-	long lba;
-	struct request_queue *q;
-	struct file *bdev_file;
-
-	/*
-	 * We need to re-open the cdrom device without O_NONBLOCK to be able
-	 * to read/write from/to it. It is already opened in O_NONBLOCK mode
-	 * so open should not fail.
-	 */
-	bdev_file = bdev_file_open_by_dev(file_bdev(pd->bdev_file)->bd_dev,
-				       BLK_OPEN_READ, pd, NULL);
-	if (IS_ERR(bdev_file)) {
-		ret = PTR_ERR(bdev_file);
-		goto out;
-	}
-	pd->f_open_bdev = bdev_file;
-
-	ret = pkt_get_last_written(pd, &lba);
-	if (ret) {
-		dev_err(ddev, "pkt_get_last_written failed\n");
-		goto out_putdev;
-	}
-
-	set_capacity(pd->disk, lba << 2);
-	set_capacity_and_notify(file_bdev(pd->bdev_file)->bd_disk, lba << 2);
-
-	q = bdev_get_queue(file_bdev(pd->bdev_file));
-	if (write) {
-		ret = pkt_open_write(pd);
-		if (ret)
-			goto out_putdev;
-		set_bit(PACKET_WRITABLE, &pd->flags);
-	} else {
-		pkt_set_speed(pd, MAX_SPEED, MAX_SPEED);
-		clear_bit(PACKET_WRITABLE, &pd->flags);
-	}
-
-	ret = pkt_set_segment_merging(pd, q);
-	if (ret)
-		goto out_putdev;
-
-	if (write) {
-		if (!pkt_grow_pktlist(pd, CONFIG_CDROM_PKTCDVD_BUFFERS)) {
-			dev_err(ddev, "not enough memory for buffers\n");
-			ret = -ENOMEM;
-			goto out_putdev;
-		}
-		dev_info(ddev, "%lukB available on disc\n", lba << 1);
-	}
-	set_blocksize(bdev_file, CD_FRAMESIZE);
-
-	return 0;
-
-out_putdev:
-	fput(bdev_file);
-out:
-	return ret;
-}
-
-/*
- * called when the device is closed. makes sure that the device flushes
- * the internal cache before we close.
- */
-static void pkt_release_dev(struct pktcdvd_device *pd, int flush)
-{
-	struct device *ddev = disk_to_dev(pd->disk);
-
-	if (flush && pkt_flush_cache(pd))
-		dev_notice(ddev, "not flushing cache\n");
-
-	pkt_lock_door(pd, 0);
-
-	pkt_set_speed(pd, MAX_SPEED, MAX_SPEED);
-	fput(pd->f_open_bdev);
-	pd->f_open_bdev = NULL;
-
-	pkt_shrink_pktlist(pd);
-}
-
-static struct pktcdvd_device *pkt_find_dev_from_minor(unsigned int dev_minor)
-{
-	if (dev_minor >= MAX_WRITERS)
-		return NULL;
-
-	dev_minor = array_index_nospec(dev_minor, MAX_WRITERS);
-	return pkt_devs[dev_minor];
-}
-
-static int pkt_open(struct gendisk *disk, blk_mode_t mode)
-{
-	struct pktcdvd_device *pd = NULL;
-	int ret;
-
-	mutex_lock(&pktcdvd_mutex);
-	mutex_lock(&ctl_mutex);
-	pd = pkt_find_dev_from_minor(disk->first_minor);
-	if (!pd) {
-		ret = -ENODEV;
-		goto out;
-	}
-	BUG_ON(pd->refcnt < 0);
-
-	pd->refcnt++;
-	if (pd->refcnt > 1) {
-		if ((mode & BLK_OPEN_WRITE) &&
-		    !test_bit(PACKET_WRITABLE, &pd->flags)) {
-			ret = -EBUSY;
-			goto out_dec;
-		}
-	} else {
-		ret = pkt_open_dev(pd, mode & BLK_OPEN_WRITE);
-		if (ret)
-			goto out_dec;
-	}
-	mutex_unlock(&ctl_mutex);
-	mutex_unlock(&pktcdvd_mutex);
-	return 0;
-
-out_dec:
-	pd->refcnt--;
-out:
-	mutex_unlock(&ctl_mutex);
-	mutex_unlock(&pktcdvd_mutex);
-	return ret;
-}
-
-static void pkt_release(struct gendisk *disk)
-{
-	struct pktcdvd_device *pd = disk->private_data;
-
-	mutex_lock(&pktcdvd_mutex);
-	mutex_lock(&ctl_mutex);
-	pd->refcnt--;
-	BUG_ON(pd->refcnt < 0);
-	if (pd->refcnt == 0) {
-		int flush = test_bit(PACKET_WRITABLE, &pd->flags);
-		pkt_release_dev(pd, flush);
-	}
-	mutex_unlock(&ctl_mutex);
-	mutex_unlock(&pktcdvd_mutex);
-}
-
-
-static void pkt_end_io_read_cloned(struct bio *bio)
-{
-	struct packet_stacked_data *psd = bio->bi_private;
-	struct pktcdvd_device *pd = psd->pd;
-
-	psd->bio->bi_status = bio->bi_status;
-	bio_put(bio);
-	bio_endio(psd->bio);
-	mempool_free(psd, &psd_pool);
-	pkt_bio_finished(pd);
-}
-
-static void pkt_make_request_read(struct pktcdvd_device *pd, struct bio *bio)
-{
-	struct bio *cloned_bio = bio_alloc_clone(file_bdev(pd->bdev_file), bio,
-		GFP_NOIO, &pkt_bio_set);
-	struct packet_stacked_data *psd = mempool_alloc(&psd_pool, GFP_NOIO);
-
-	psd->pd = pd;
-	psd->bio = bio;
-	cloned_bio->bi_private = psd;
-	cloned_bio->bi_end_io = pkt_end_io_read_cloned;
-	pd->stats.secs_r += bio_sectors(bio);
-	pkt_queue_bio(pd, cloned_bio);
-}
-
-static void pkt_make_request_write(struct bio *bio)
-{
-	struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->private_data;
-	sector_t zone;
-	struct packet_data *pkt;
-	int was_empty, blocked_bio;
-	struct pkt_rb_node *node;
-
-	zone = get_zone(bio->bi_iter.bi_sector, pd);
-
-	/*
-	 * If we find a matching packet in state WAITING or READ_WAIT, we can
-	 * just append this bio to that packet.
-	 */
-	spin_lock(&pd->cdrw.active_list_lock);
-	blocked_bio = 0;
-	list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) {
-		if (pkt->sector == zone) {
-			spin_lock(&pkt->lock);
-			if ((pkt->state == PACKET_WAITING_STATE) ||
-			    (pkt->state == PACKET_READ_WAIT_STATE)) {
-				bio_list_add(&pkt->orig_bios, bio);
-				pkt->write_size +=
-					bio->bi_iter.bi_size / CD_FRAMESIZE;
-				if ((pkt->write_size >= pkt->frames) &&
-				    (pkt->state == PACKET_WAITING_STATE)) {
-					atomic_inc(&pkt->run_sm);
-					wake_up(&pd->wqueue);
-				}
-				spin_unlock(&pkt->lock);
-				spin_unlock(&pd->cdrw.active_list_lock);
-				return;
-			} else {
-				blocked_bio = 1;
-			}
-			spin_unlock(&pkt->lock);
-		}
-	}
-	spin_unlock(&pd->cdrw.active_list_lock);
-
-	/*
-	 * Test if there is enough room left in the bio work queue
-	 * (queue size >= congestion on mark).
-	 * If not, wait till the work queue size is below the congestion off mark.
-	 */
-	spin_lock(&pd->lock);
-	if (pd->write_congestion_on > 0
-	    && pd->bio_queue_size >= pd->write_congestion_on) {
-		struct wait_bit_queue_entry wqe;
-
-		init_wait_var_entry(&wqe, &pd->congested, 0);
-		for (;;) {
-			prepare_to_wait_event(__var_waitqueue(&pd->congested),
-					      &wqe.wq_entry,
-					      TASK_UNINTERRUPTIBLE);
-			if (pd->bio_queue_size <= pd->write_congestion_off)
-				break;
-			pd->congested = true;
-			spin_unlock(&pd->lock);
-			schedule();
-			spin_lock(&pd->lock);
-		}
-	}
-	spin_unlock(&pd->lock);
-
-	/*
-	 * No matching packet found. Store the bio in the work queue.
-	 */
-	node = mempool_alloc(&pd->rb_pool, GFP_NOIO);
-	node->bio = bio;
-	spin_lock(&pd->lock);
-	BUG_ON(pd->bio_queue_size < 0);
-	was_empty = (pd->bio_queue_size == 0);
-	pkt_rbtree_insert(pd, node);
-	spin_unlock(&pd->lock);
-
-	/*
-	 * Wake up the worker thread.
-	 */
-	atomic_set(&pd->scan_queue, 1);
-	if (was_empty) {
-		/* This wake_up is required for correct operation */
-		wake_up(&pd->wqueue);
-	} else if (!list_empty(&pd->cdrw.pkt_free_list) && !blocked_bio) {
-		/*
-		 * This wake up is not required for correct operation,
-		 * but improves performance in some cases.
-		 */
-		wake_up(&pd->wqueue);
-	}
-}
-
-static void pkt_submit_bio(struct bio *bio)
-{
-	struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->private_data;
-	struct device *ddev = disk_to_dev(pd->disk);
-	struct bio *split;
-
-	bio = bio_split_to_limits(bio);
-	if (!bio)
-		return;
-
-	dev_dbg(ddev, "start = %6llx stop = %6llx\n",
-		bio->bi_iter.bi_sector, bio_end_sector(bio));
-
-	/*
-	 * Clone READ bios so we can have our own bi_end_io callback.
-	 */
-	if (bio_data_dir(bio) == READ) {
-		pkt_make_request_read(pd, bio);
-		return;
-	}
-
-	if (!test_bit(PACKET_WRITABLE, &pd->flags)) {
-		dev_notice(ddev, "WRITE for ro device (%llu)\n", bio->bi_iter.bi_sector);
-		goto end_io;
-	}
-
-	if (!bio->bi_iter.bi_size || (bio->bi_iter.bi_size % CD_FRAMESIZE)) {
-		dev_err(ddev, "wrong bio size\n");
-		goto end_io;
-	}
-
-	do {
-		sector_t zone = get_zone(bio->bi_iter.bi_sector, pd);
-		sector_t last_zone = get_zone(bio_end_sector(bio) - 1, pd);
-
-		if (last_zone != zone) {
-			BUG_ON(last_zone != zone + pd->settings.size);
-
-			split = bio_split(bio, last_zone -
-					  bio->bi_iter.bi_sector,
-					  GFP_NOIO, &pkt_bio_set);
-			bio_chain(split, bio);
-		} else {
-			split = bio;
-		}
-
-		pkt_make_request_write(split);
-	} while (split != bio);
-
-	return;
-end_io:
-	bio_io_error(bio);
-}
-
-static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
-{
-	struct device *ddev = disk_to_dev(pd->disk);
-	int i;
-	struct file *bdev_file;
-	struct scsi_device *sdev;
-
-	if (pd->pkt_dev == dev) {
-		dev_err(ddev, "recursive setup not allowed\n");
-		return -EBUSY;
-	}
-	for (i = 0; i < MAX_WRITERS; i++) {
-		struct pktcdvd_device *pd2 = pkt_devs[i];
-		if (!pd2)
-			continue;
-		if (file_bdev(pd2->bdev_file)->bd_dev == dev) {
-			dev_err(ddev, "%pg already setup\n",
-				file_bdev(pd2->bdev_file));
-			return -EBUSY;
-		}
-		if (pd2->pkt_dev == dev) {
-			dev_err(ddev, "can't chain pktcdvd devices\n");
-			return -EBUSY;
-		}
-	}
-
-	bdev_file = bdev_file_open_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_NDELAY,
-				       NULL, NULL);
-	if (IS_ERR(bdev_file))
-		return PTR_ERR(bdev_file);
-	sdev = scsi_device_from_queue(file_bdev(bdev_file)->bd_disk->queue);
-	if (!sdev) {
-		fput(bdev_file);
-		return -EINVAL;
-	}
-	put_device(&sdev->sdev_gendev);
-
-	/* This is safe, since we have a reference from open(). */
-	__module_get(THIS_MODULE);
-
-	pd->bdev_file = bdev_file;
-
-	atomic_set(&pd->cdrw.pending_bios, 0);
-	pd->cdrw.thread = kthread_run(kcdrwd, pd, "%s", pd->disk->disk_name);
-	if (IS_ERR(pd->cdrw.thread)) {
-		dev_err(ddev, "can't start kernel thread\n");
-		goto out_mem;
-	}
-
-	proc_create_single_data(pd->disk->disk_name, 0, pkt_proc, pkt_seq_show, pd);
-	dev_notice(ddev, "writer mapped to %pg\n", file_bdev(bdev_file));
-	return 0;
-
-out_mem:
-	fput(bdev_file);
-	/* This is safe: open() is still holding a reference. */
-	module_put(THIS_MODULE);
-	return -ENOMEM;
-}
-
-static int pkt_ioctl(struct block_device *bdev, blk_mode_t mode,
-		unsigned int cmd, unsigned long arg)
-{
-	struct pktcdvd_device *pd = bdev->bd_disk->private_data;
-	struct device *ddev = disk_to_dev(pd->disk);
-	int ret;
-
-	dev_dbg(ddev, "cmd %x, dev %d:%d\n", cmd, MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
-
-	mutex_lock(&pktcdvd_mutex);
-	switch (cmd) {
-	case CDROMEJECT:
-		/*
-		 * The door gets locked when the device is opened, so we
-		 * have to unlock it or else the eject command fails.
-		 */
-		if (pd->refcnt == 1)
-			pkt_lock_door(pd, 0);
-		fallthrough;
-	/*
-	 * forward selected CDROM ioctls to CD-ROM, for UDF
-	 */
-	case CDROMMULTISESSION:
-	case CDROMREADTOCENTRY:
-	case CDROM_LAST_WRITTEN:
-	case CDROM_SEND_PACKET:
-	case SCSI_IOCTL_SEND_COMMAND:
-		if (!bdev->bd_disk->fops->ioctl)
-			ret = -ENOTTY;
-		else
-			ret = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg);
-		break;
-	default:
-		dev_dbg(ddev, "Unknown ioctl (%x)\n", cmd);
-		ret = -ENOTTY;
-	}
-	mutex_unlock(&pktcdvd_mutex);
-
-	return ret;
-}
-
-static unsigned int pkt_check_events(struct gendisk *disk,
-				     unsigned int clearing)
-{
-	struct pktcdvd_device *pd = disk->private_data;
-	struct gendisk *attached_disk;
-
-	if (!pd)
-		return 0;
-	if (!pd->bdev_file)
-		return 0;
-	attached_disk = file_bdev(pd->bdev_file)->bd_disk;
-	if (!attached_disk || !attached_disk->fops->check_events)
-		return 0;
-	return attached_disk->fops->check_events(attached_disk, clearing);
-}
-
-static char *pkt_devnode(struct gendisk *disk, umode_t *mode)
-{
-	return kasprintf(GFP_KERNEL, "pktcdvd/%s", disk->disk_name);
-}
-
-static const struct block_device_operations pktcdvd_ops = {
-	.owner =		THIS_MODULE,
-	.submit_bio =		pkt_submit_bio,
-	.open =			pkt_open,
-	.release =		pkt_release,
-	.ioctl =		pkt_ioctl,
-	.compat_ioctl =		blkdev_compat_ptr_ioctl,
-	.check_events =		pkt_check_events,
-	.devnode =		pkt_devnode,
-};
-
-/*
- * Set up mapping from pktcdvd device to CD-ROM device.
- */
-static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
-{
-	struct queue_limits lim = {
-		.max_hw_sectors		= PACKET_MAX_SECTORS,
-		.logical_block_size	= CD_FRAMESIZE,
-		.features		= BLK_FEAT_ROTATIONAL,
-	};
-	int idx;
-	int ret = -ENOMEM;
-	struct pktcdvd_device *pd;
-	struct gendisk *disk;
-
-	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-
-	for (idx = 0; idx < MAX_WRITERS; idx++)
-		if (!pkt_devs[idx])
-			break;
-	if (idx == MAX_WRITERS) {
-		pr_err("max %d writers supported\n", MAX_WRITERS);
-		ret = -EBUSY;
-		goto out_mutex;
-	}
-
-	pd = kzalloc(sizeof(struct pktcdvd_device), GFP_KERNEL);
-	if (!pd)
-		goto out_mutex;
-
-	ret = mempool_init_kmalloc_pool(&pd->rb_pool, PKT_RB_POOL_SIZE,
-					sizeof(struct pkt_rb_node));
-	if (ret)
-		goto out_mem;
-
-	INIT_LIST_HEAD(&pd->cdrw.pkt_free_list);
-	INIT_LIST_HEAD(&pd->cdrw.pkt_active_list);
-	spin_lock_init(&pd->cdrw.active_list_lock);
-
-	spin_lock_init(&pd->lock);
-	spin_lock_init(&pd->iosched.lock);
-	bio_list_init(&pd->iosched.read_queue);
-	bio_list_init(&pd->iosched.write_queue);
-	init_waitqueue_head(&pd->wqueue);
-	pd->bio_queue = RB_ROOT;
-
-	pd->write_congestion_on  = write_congestion_on;
-	pd->write_congestion_off = write_congestion_off;
-
-	disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
-	if (IS_ERR(disk)) {
-		ret = PTR_ERR(disk);
-		goto out_mem;
-	}
-	pd->disk = disk;
-	disk->major = pktdev_major;
-	disk->first_minor = idx;
-	disk->minors = 1;
-	disk->fops = &pktcdvd_ops;
-	disk->flags = GENHD_FL_REMOVABLE | GENHD_FL_NO_PART;
-	snprintf(disk->disk_name, sizeof(disk->disk_name), DRIVER_NAME"%d", idx);
-	disk->private_data = pd;
-
-	pd->pkt_dev = MKDEV(pktdev_major, idx);
-	ret = pkt_new_dev(pd, dev);
-	if (ret)
-		goto out_mem2;
-
-	/* inherit events of the host device */
-	disk->events = file_bdev(pd->bdev_file)->bd_disk->events;
-
-	ret = add_disk(disk);
-	if (ret)
-		goto out_mem2;
-
-	pkt_sysfs_dev_new(pd);
-	pkt_debugfs_dev_new(pd);
-
-	pkt_devs[idx] = pd;
-	if (pkt_dev)
-		*pkt_dev = pd->pkt_dev;
-
-	mutex_unlock(&ctl_mutex);
-	return 0;
-
-out_mem2:
-	put_disk(disk);
-out_mem:
-	mempool_exit(&pd->rb_pool);
-	kfree(pd);
-out_mutex:
-	mutex_unlock(&ctl_mutex);
-	pr_err("setup of pktcdvd device failed\n");
-	return ret;
-}
-
-/*
- * Tear down mapping from pktcdvd device to CD-ROM device.
- */
-static int pkt_remove_dev(dev_t pkt_dev)
-{
-	struct pktcdvd_device *pd;
-	struct device *ddev;
-	int idx;
-	int ret = 0;
-
-	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-
-	for (idx = 0; idx < MAX_WRITERS; idx++) {
-		pd = pkt_devs[idx];
-		if (pd && (pd->pkt_dev == pkt_dev))
-			break;
-	}
-	if (idx == MAX_WRITERS) {
-		pr_debug("dev not setup\n");
-		ret = -ENXIO;
-		goto out;
-	}
-
-	if (pd->refcnt > 0) {
-		ret = -EBUSY;
-		goto out;
-	}
-
-	ddev = disk_to_dev(pd->disk);
-
-	if (!IS_ERR(pd->cdrw.thread))
-		kthread_stop(pd->cdrw.thread);
-
-	pkt_devs[idx] = NULL;
-
-	pkt_debugfs_dev_remove(pd);
-	pkt_sysfs_dev_remove(pd);
-
-	fput(pd->bdev_file);
-
-	remove_proc_entry(pd->disk->disk_name, pkt_proc);
-	dev_notice(ddev, "writer unmapped\n");
-
-	del_gendisk(pd->disk);
-	put_disk(pd->disk);
-
-	mempool_exit(&pd->rb_pool);
-	kfree(pd);
-
-	/* This is safe: open() is still holding a reference. */
-	module_put(THIS_MODULE);
-
-out:
-	mutex_unlock(&ctl_mutex);
-	return ret;
-}
-
-static void pkt_get_status(struct pkt_ctrl_command *ctrl_cmd)
-{
-	struct pktcdvd_device *pd;
-
-	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-
-	pd = pkt_find_dev_from_minor(ctrl_cmd->dev_index);
-	if (pd) {
-		ctrl_cmd->dev = new_encode_dev(file_bdev(pd->bdev_file)->bd_dev);
-		ctrl_cmd->pkt_dev = new_encode_dev(pd->pkt_dev);
-	} else {
-		ctrl_cmd->dev = 0;
-		ctrl_cmd->pkt_dev = 0;
-	}
-	ctrl_cmd->num_devices = MAX_WRITERS;
-
-	mutex_unlock(&ctl_mutex);
-}
-
-static long pkt_ctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
-{
-	void __user *argp = (void __user *)arg;
-	struct pkt_ctrl_command ctrl_cmd;
-	int ret = 0;
-	dev_t pkt_dev = 0;
-
-	if (cmd != PACKET_CTRL_CMD)
-		return -ENOTTY;
-
-	if (copy_from_user(&ctrl_cmd, argp, sizeof(struct pkt_ctrl_command)))
-		return -EFAULT;
-
-	switch (ctrl_cmd.command) {
-	case PKT_CTRL_CMD_SETUP:
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-		ret = pkt_setup_dev(new_decode_dev(ctrl_cmd.dev), &pkt_dev);
-		ctrl_cmd.pkt_dev = new_encode_dev(pkt_dev);
-		break;
-	case PKT_CTRL_CMD_TEARDOWN:
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-		ret = pkt_remove_dev(new_decode_dev(ctrl_cmd.pkt_dev));
-		break;
-	case PKT_CTRL_CMD_STATUS:
-		pkt_get_status(&ctrl_cmd);
-		break;
-	default:
-		return -ENOTTY;
-	}
-
-	if (copy_to_user(argp, &ctrl_cmd, sizeof(struct pkt_ctrl_command)))
-		return -EFAULT;
-	return ret;
-}
-
-#ifdef CONFIG_COMPAT
-static long pkt_ctl_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
-{
-	return pkt_ctl_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
-}
-#endif
-
-static const struct file_operations pkt_ctl_fops = {
-	.open		= nonseekable_open,
-	.unlocked_ioctl	= pkt_ctl_ioctl,
-#ifdef CONFIG_COMPAT
-	.compat_ioctl	= pkt_ctl_compat_ioctl,
-#endif
-	.owner		= THIS_MODULE,
-};
-
-static struct miscdevice pkt_misc = {
-	.minor 		= MISC_DYNAMIC_MINOR,
-	.name  		= DRIVER_NAME,
-	.nodename	= "pktcdvd/control",
-	.fops  		= &pkt_ctl_fops
-};
-
-static int __init pkt_init(void)
-{
-	int ret;
-
-	mutex_init(&ctl_mutex);
-
-	ret = mempool_init_kmalloc_pool(&psd_pool, PSD_POOL_SIZE,
-				    sizeof(struct packet_stacked_data));
-	if (ret)
-		return ret;
-	ret = bioset_init(&pkt_bio_set, BIO_POOL_SIZE, 0, 0);
-	if (ret) {
-		mempool_exit(&psd_pool);
-		return ret;
-	}
-
-	ret = register_blkdev(pktdev_major, DRIVER_NAME);
-	if (ret < 0) {
-		pr_err("unable to register block device\n");
-		goto out2;
-	}
-	if (!pktdev_major)
-		pktdev_major = ret;
-
-	ret = pkt_sysfs_init();
-	if (ret)
-		goto out;
-
-	pkt_debugfs_init();
-
-	ret = misc_register(&pkt_misc);
-	if (ret) {
-		pr_err("unable to register misc device\n");
-		goto out_misc;
-	}
-
-	pkt_proc = proc_mkdir("driver/"DRIVER_NAME, NULL);
-
-	return 0;
-
-out_misc:
-	pkt_debugfs_cleanup();
-	pkt_sysfs_cleanup();
-out:
-	unregister_blkdev(pktdev_major, DRIVER_NAME);
-out2:
-	mempool_exit(&psd_pool);
-	bioset_exit(&pkt_bio_set);
-	return ret;
-}
-
-static void __exit pkt_exit(void)
-{
-	remove_proc_entry("driver/"DRIVER_NAME, NULL);
-	misc_deregister(&pkt_misc);
-
-	pkt_debugfs_cleanup();
-	pkt_sysfs_cleanup();
-
-	unregister_blkdev(pktdev_major, DRIVER_NAME);
-	mempool_exit(&psd_pool);
-	bioset_exit(&pkt_bio_set);
-}
-
-MODULE_DESCRIPTION("Packet writing layer for CD/DVD drives");
-MODULE_AUTHOR("Jens Axboe <axboe@suse.de>");
-MODULE_LICENSE("GPL");
-
-module_init(pkt_init);
-module_exit(pkt_exit);
diff --git a/include/linux/pktcdvd.h b/include/linux/pktcdvd.h
deleted file mode 100644
index 2f1b952d596a..000000000000
--- a/include/linux/pktcdvd.h
+++ /dev/null
@@ -1,198 +0,0 @@
-/*
- * Copyright (C) 2000 Jens Axboe <axboe@suse.de>
- * Copyright (C) 2001-2004 Peter Osterlund <petero2@telia.com>
- *
- * May be copied or modified under the terms of the GNU General Public
- * License.  See linux/COPYING for more information.
- *
- * Packet writing layer for ATAPI and SCSI CD-R, CD-RW, DVD-R, and
- * DVD-RW devices.
- *
- */
-#ifndef __PKTCDVD_H
-#define __PKTCDVD_H
-
-#include <linux/blkdev.h>
-#include <linux/completion.h>
-#include <linux/cdrom.h>
-#include <linux/kobject.h>
-#include <linux/sysfs.h>
-#include <linux/mempool.h>
-#include <uapi/linux/pktcdvd.h>
-
-/* default bio write queue congestion marks */
-#define PKT_WRITE_CONGESTION_ON    10000
-#define PKT_WRITE_CONGESTION_OFF   9000
-
-
-struct packet_settings
-{
-	__u32			size;		/* packet size in (512 byte) sectors */
-	__u8			fp;		/* fixed packets */
-	__u8			link_loss;	/* the rest is specified
-						 * as per Mt Fuji */
-	__u8			write_type;
-	__u8			track_mode;
-	__u8			block_mode;
-};
-
-/*
- * Very crude stats for now
- */
-struct packet_stats
-{
-	unsigned long		pkt_started;
-	unsigned long		pkt_ended;
-	unsigned long		secs_w;
-	unsigned long		secs_rg;
-	unsigned long		secs_r;
-};
-
-struct packet_cdrw
-{
-	struct list_head	pkt_free_list;
-	struct list_head	pkt_active_list;
-	spinlock_t		active_list_lock; /* Serialize access to pkt_active_list */
-	struct task_struct	*thread;
-	atomic_t		pending_bios;
-};
-
-/*
- * Switch to high speed reading after reading this many kilobytes
- * with no interspersed writes.
- */
-#define HI_SPEED_SWITCH 512
-
-struct packet_iosched
-{
-	atomic_t		attention;	/* Set to non-zero when queue processing is needed */
-	int			writing;	/* Non-zero when writing, zero when reading */
-	spinlock_t		lock;		/* Protecting read/write queue manipulations */
-	struct bio_list		read_queue;
-	struct bio_list		write_queue;
-	sector_t		last_write;	/* The sector where the last write ended */
-	int			successive_reads;
-};
-
-/*
- * 32 buffers of 2048 bytes
- */
-#if (PAGE_SIZE % CD_FRAMESIZE) != 0
-#error "PAGE_SIZE must be a multiple of CD_FRAMESIZE"
-#endif
-#define PACKET_MAX_SIZE		128
-#define FRAMES_PER_PAGE		(PAGE_SIZE / CD_FRAMESIZE)
-#define PACKET_MAX_SECTORS	(PACKET_MAX_SIZE * CD_FRAMESIZE >> 9)
-
-enum packet_data_state {
-	PACKET_IDLE_STATE,			/* Not used at the moment */
-	PACKET_WAITING_STATE,			/* Waiting for more bios to arrive, so */
-						/* we don't have to do as much */
-						/* data gathering */
-	PACKET_READ_WAIT_STATE,			/* Waiting for reads to fill in holes */
-	PACKET_WRITE_WAIT_STATE,		/* Waiting for the write to complete */
-	PACKET_RECOVERY_STATE,			/* Recover after read/write errors */
-	PACKET_FINISHED_STATE,			/* After write has finished */
-
-	PACKET_NUM_STATES			/* Number of possible states */
-};
-
-/*
- * Information needed for writing a single packet
- */
-struct pktcdvd_device;
-
-struct packet_data
-{
-	struct list_head	list;
-
-	spinlock_t		lock;		/* Lock protecting state transitions and */
-						/* orig_bios list */
-
-	struct bio_list		orig_bios;	/* Original bios passed to pkt_make_request */
-						/* that will be handled by this packet */
-	int			write_size;	/* Total size of all bios in the orig_bios */
-						/* list, measured in number of frames */
-
-	struct bio		*w_bio;		/* The bio we will send to the real CD */
-						/* device once we have all data for the */
-						/* packet we are going to write */
-	sector_t		sector;		/* First sector in this packet */
-	int			frames;		/* Number of frames in this packet */
-
-	enum packet_data_state	state;		/* Current state */
-	atomic_t		run_sm;		/* Incremented whenever the state */
-						/* machine needs to be run */
-	long			sleep_time;	/* Set this to non-zero to make the state */
-						/* machine run after this many jiffies. */
-
-	atomic_t		io_wait;	/* Number of pending IO operations */
-	atomic_t		io_errors;	/* Number of read/write errors during IO */
-
-	struct bio		*r_bios[PACKET_MAX_SIZE]; /* bios to use during data gathering */
-	struct page		*pages[PACKET_MAX_SIZE / FRAMES_PER_PAGE];
-
-	int			cache_valid;	/* If non-zero, the data for the zone defined */
-						/* by the sector variable is completely cached */
-						/* in the pages[] vector. */
-
-	int			id;		/* ID number for debugging */
-	struct pktcdvd_device	*pd;
-};
-
-struct pkt_rb_node {
-	struct rb_node		rb_node;
-	struct bio		*bio;
-};
-
-struct packet_stacked_data
-{
-	struct bio		*bio;		/* Original read request bio */
-	struct pktcdvd_device	*pd;
-};
-#define PSD_POOL_SIZE		64
-
-struct pktcdvd_device
-{
-	struct file		*bdev_file;	/* dev attached */
-	/* handle acquired for bdev during pkt_open_dev() */
-	struct file		*f_open_bdev;
-	dev_t			pkt_dev;	/* our dev */
-	struct packet_settings	settings;
-	struct packet_stats	stats;
-	int			refcnt;		/* Open count */
-	int			write_speed;	/* current write speed, kB/s */
-	int			read_speed;	/* current read speed, kB/s */
-	unsigned long		offset;		/* start offset */
-	__u8			mode_offset;	/* 0 / 8 */
-	__u8			type;
-	unsigned long		flags;
-	__u16			mmc3_profile;
-	__u32			nwa;		/* next writable address */
-	__u32			lra;		/* last recorded address */
-	struct packet_cdrw	cdrw;
-	wait_queue_head_t	wqueue;
-
-	spinlock_t		lock;		/* Serialize access to bio_queue */
-	struct rb_root		bio_queue;	/* Work queue of bios we need to handle */
-	int			bio_queue_size;	/* Number of nodes in bio_queue */
-	bool			congested;	/* Someone is waiting for bio_queue_size
-						 * to drop. */
-	sector_t		current_sector;	/* Keep track of where the elevator is */
-	atomic_t		scan_queue;	/* Set to non-zero when pkt_handle_queue */
-						/* needs to be run. */
-	mempool_t		rb_pool;	/* mempool for pkt_rb_node allocations */
-
-	struct packet_iosched   iosched;
-	struct gendisk		*disk;
-
-	int			write_congestion_off;
-	int			write_congestion_on;
-
-	struct device		*dev;		/* sysfs pktcdvd[0-7] dev */
-
-	struct dentry		*dfs_d_root;	/* debugfs: devname directory */
-	struct dentry		*dfs_f_info;	/* debugfs: info file */
-};
-
-#endif /* __PKTCDVD_H */
-- 
cgit v1.2.3


From c479d7cf06c3d65532442fa368b058e05dbba1a2 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@riscstar.com>
Date: Wed, 2 Jul 2025 06:37:07 -0500
Subject: reset: spacemit: add support for SpacemiT CCU resets

Implement reset support for SpacemiT CCUs.  A SpacemiT reset controller
device is an auxiliary device associated with a clock controller (CCU).

This patch defines the reset controllers for the MPMU, APBC, and APMU
CCUs, which already define clock controllers.  It also adds RCPU, RCPU2,
and APBC2 CCUs, which only define resets.

Signed-off-by: Alex Elder <elder@riscstar.com>
Reviewed-by: Philipp Zabel <p.zabel@pengutronix.de>
Reviewed-by: Yixun Lan <dlan@gentoo.org>
Acked-by: Philipp Zabel <p.zabel@pengutronix.de>
Link: https://lore.kernel.org/r/20250702113709.291748-6-elder@riscstar.com
Signed-off-by: Yixun Lan <dlan@gentoo.org>
---
 drivers/reset/Kconfig            |   9 ++
 drivers/reset/Makefile           |   1 +
 drivers/reset/reset-spacemit.c   | 304 +++++++++++++++++++++++++++++++++++++++
 include/soc/spacemit/k1-syscon.h |  30 ++++
 4 files changed, 344 insertions(+)
 create mode 100644 drivers/reset/reset-spacemit.c

(limited to 'include')

diff --git a/drivers/reset/Kconfig b/drivers/reset/Kconfig
index d85be5899da6..49f5d914172e 100644
--- a/drivers/reset/Kconfig
+++ b/drivers/reset/Kconfig
@@ -270,6 +270,15 @@ config RESET_SOCFPGA
 	  This enables the reset driver for the SoCFPGA ARMv7 platforms. This
 	  driver gets initialized early during platform init calls.
 
+config RESET_SPACEMIT
+	tristate "SpacemiT reset driver"
+	depends on ARCH_SPACEMIT || COMPILE_TEST
+	select AUXILIARY_BUS
+	default ARCH_SPACEMIT
+	help
+	  This enables the reset controller driver for SpacemiT SoCs,
+	  including the K1.
+
 config RESET_SUNPLUS
 	bool "Sunplus SoCs Reset Driver" if COMPILE_TEST
 	default ARCH_SUNPLUS
diff --git a/drivers/reset/Makefile b/drivers/reset/Makefile
index 91e6348e3351..35344abe4a75 100644
--- a/drivers/reset/Makefile
+++ b/drivers/reset/Makefile
@@ -35,6 +35,7 @@ obj-$(CONFIG_RESET_RZV2H_USB2PHY) += reset-rzv2h-usb2phy.o
 obj-$(CONFIG_RESET_SCMI) += reset-scmi.o
 obj-$(CONFIG_RESET_SIMPLE) += reset-simple.o
 obj-$(CONFIG_RESET_SOCFPGA) += reset-socfpga.o
+obj-$(CONFIG_RESET_SPACEMIT) += reset-spacemit.o
 obj-$(CONFIG_RESET_SUNPLUS) += reset-sunplus.o
 obj-$(CONFIG_RESET_SUNXI) += reset-sunxi.o
 obj-$(CONFIG_RESET_TH1520) += reset-th1520.o
diff --git a/drivers/reset/reset-spacemit.c b/drivers/reset/reset-spacemit.c
new file mode 100644
index 000000000000..e1272aff28f7
--- /dev/null
+++ b/drivers/reset/reset-spacemit.c
@@ -0,0 +1,304 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/* SpacemiT reset controller driver */
+
+#include <linux/auxiliary_bus.h>
+#include <linux/container_of.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/regmap.h>
+#include <linux/reset-controller.h>
+#include <linux/types.h>
+
+#include <soc/spacemit/k1-syscon.h>
+#include <dt-bindings/clock/spacemit,k1-syscon.h>
+
+struct ccu_reset_data {
+	u32 offset;		/* register offset passed to regmap_update_bits() */
+	u32 assert_mask;	/* bits left set when the reset is asserted */
+	u32 deassert_mask;	/* bits left set when the reset is deasserted */
+};
+
+struct ccu_reset_controller_data {
+	const struct ccu_reset_data *reset_data;	/* array */
+	size_t count;
+};
+
+struct ccu_reset_controller {
+	struct reset_controller_dev rcdev;	/* embedded; ops use container_of() */
+	const struct ccu_reset_controller_data *data;	/* from id->driver_data */
+	struct regmap *regmap;	/* copied from the spacemit_ccu_adev at probe */
+};
+
+#define RESET_DATA(_offset, _assert_mask, _deassert_mask)	\
+	{							\
+		.offset		= (_offset),			\
+		.assert_mask	= (_assert_mask),		\
+		.deassert_mask	= (_deassert_mask),		\
+	}
+
+static const struct ccu_reset_data k1_mpmu_resets[] = {
+	[RESET_WDT]	= RESET_DATA(MPMU_WDTPCR,		BIT(2), 0),
+};
+
+static const struct ccu_reset_controller_data k1_mpmu_reset_data = {
+	.reset_data	= k1_mpmu_resets,
+	.count		= ARRAY_SIZE(k1_mpmu_resets),
+};
+
+static const struct ccu_reset_data k1_apbc_resets[] = {
+	[RESET_UART0]	= RESET_DATA(APBC_UART1_CLK_RST,	BIT(2),	0),
+	[RESET_UART2]	= RESET_DATA(APBC_UART2_CLK_RST,	BIT(2), 0),
+	[RESET_GPIO]	= RESET_DATA(APBC_GPIO_CLK_RST,		BIT(2), 0),
+	[RESET_PWM0]	= RESET_DATA(APBC_PWM0_CLK_RST,		BIT(2), BIT(0)),
+	[RESET_PWM1]	= RESET_DATA(APBC_PWM1_CLK_RST,		BIT(2), BIT(0)),
+	[RESET_PWM2]	= RESET_DATA(APBC_PWM2_CLK_RST,		BIT(2), BIT(0)),
+	[RESET_PWM3]	= RESET_DATA(APBC_PWM3_CLK_RST,		BIT(2), BIT(0)),
+	[RESET_PWM4]	= RESET_DATA(APBC_PWM4_CLK_RST,		BIT(2), BIT(0)),
+	[RESET_PWM5]	= RESET_DATA(APBC_PWM5_CLK_RST,		BIT(2), BIT(0)),
+	[RESET_PWM6]	= RESET_DATA(APBC_PWM6_CLK_RST,		BIT(2), BIT(0)),
+	[RESET_PWM7]	= RESET_DATA(APBC_PWM7_CLK_RST,		BIT(2), BIT(0)),
+	[RESET_PWM8]	= RESET_DATA(APBC_PWM8_CLK_RST,		BIT(2), BIT(0)),
+	[RESET_PWM9]	= RESET_DATA(APBC_PWM9_CLK_RST,		BIT(2), BIT(0)),
+	[RESET_PWM10]	= RESET_DATA(APBC_PWM10_CLK_RST,	BIT(2), BIT(0)),
+	[RESET_PWM11]	= RESET_DATA(APBC_PWM11_CLK_RST,	BIT(2), BIT(0)),
+	[RESET_PWM12]	= RESET_DATA(APBC_PWM12_CLK_RST,	BIT(2), BIT(0)),
+	[RESET_PWM13]	= RESET_DATA(APBC_PWM13_CLK_RST,	BIT(2), BIT(0)),
+	[RESET_PWM14]	= RESET_DATA(APBC_PWM14_CLK_RST,	BIT(2), BIT(0)),
+	[RESET_PWM15]	= RESET_DATA(APBC_PWM15_CLK_RST,	BIT(2), BIT(0)),
+	[RESET_PWM16]	= RESET_DATA(APBC_PWM16_CLK_RST,	BIT(2), BIT(0)),
+	[RESET_PWM17]	= RESET_DATA(APBC_PWM17_CLK_RST,	BIT(2), BIT(0)),
+	[RESET_PWM18]	= RESET_DATA(APBC_PWM18_CLK_RST,	BIT(2), BIT(0)),
+	[RESET_PWM19]	= RESET_DATA(APBC_PWM19_CLK_RST,	BIT(2), BIT(0)),
+	[RESET_SSP3]	= RESET_DATA(APBC_SSP3_CLK_RST,		BIT(2), 0),
+	[RESET_UART3]	= RESET_DATA(APBC_UART3_CLK_RST,	BIT(2), 0),
+	[RESET_RTC]	= RESET_DATA(APBC_RTC_CLK_RST,		BIT(2), 0),
+	[RESET_TWSI0]	= RESET_DATA(APBC_TWSI0_CLK_RST,	BIT(2), 0),
+	[RESET_TIMERS1]	= RESET_DATA(APBC_TIMERS1_CLK_RST,	BIT(2), 0),
+	[RESET_AIB]	= RESET_DATA(APBC_AIB_CLK_RST,		BIT(2), 0),
+	[RESET_TIMERS2]	= RESET_DATA(APBC_TIMERS2_CLK_RST,	BIT(2), 0),
+	[RESET_ONEWIRE]	= RESET_DATA(APBC_ONEWIRE_CLK_RST,	BIT(2), 0),
+	[RESET_SSPA0]	= RESET_DATA(APBC_SSPA0_CLK_RST,	BIT(2), 0),
+	[RESET_SSPA1]	= RESET_DATA(APBC_SSPA1_CLK_RST,	BIT(2), 0),
+	[RESET_DRO]	= RESET_DATA(APBC_DRO_CLK_RST,		BIT(2), 0),
+	[RESET_IR]	= RESET_DATA(APBC_IR_CLK_RST,		BIT(2), 0),
+	[RESET_TWSI1]	= RESET_DATA(APBC_TWSI1_CLK_RST,	BIT(2), 0),
+	[RESET_TSEN]	= RESET_DATA(APBC_TSEN_CLK_RST,		BIT(2), 0),
+	[RESET_TWSI2]	= RESET_DATA(APBC_TWSI2_CLK_RST,	BIT(2), 0),
+	[RESET_TWSI4]	= RESET_DATA(APBC_TWSI4_CLK_RST,	BIT(2), 0),
+	[RESET_TWSI5]	= RESET_DATA(APBC_TWSI5_CLK_RST,	BIT(2), 0),
+	[RESET_TWSI6]	= RESET_DATA(APBC_TWSI6_CLK_RST,	BIT(2), 0),
+	[RESET_TWSI7]	= RESET_DATA(APBC_TWSI7_CLK_RST,	BIT(2), 0),
+	[RESET_TWSI8]	= RESET_DATA(APBC_TWSI8_CLK_RST,	BIT(2), 0),
+	[RESET_IPC_AP2AUD] = RESET_DATA(APBC_IPC_AP2AUD_CLK_RST, BIT(2), 0),
+	[RESET_UART4]	= RESET_DATA(APBC_UART4_CLK_RST,	BIT(2), 0),
+	[RESET_UART5]	= RESET_DATA(APBC_UART5_CLK_RST,	BIT(2), 0),
+	[RESET_UART6]	= RESET_DATA(APBC_UART6_CLK_RST,	BIT(2), 0),
+	[RESET_UART7]	= RESET_DATA(APBC_UART7_CLK_RST,	BIT(2), 0),
+	[RESET_UART8]	= RESET_DATA(APBC_UART8_CLK_RST,	BIT(2), 0),
+	[RESET_UART9]	= RESET_DATA(APBC_UART9_CLK_RST,	BIT(2), 0),
+	[RESET_CAN0]	= RESET_DATA(APBC_CAN0_CLK_RST,		BIT(2), 0),
+};
+
+static const struct ccu_reset_controller_data k1_apbc_reset_data = {
+	.reset_data	= k1_apbc_resets,
+	.count		= ARRAY_SIZE(k1_apbc_resets),
+};
+
+static const struct ccu_reset_data k1_apmu_resets[] = {
+	[RESET_CCIC_4X]	= RESET_DATA(APMU_CCIC_CLK_RES_CTRL,	0, BIT(1)),
+	[RESET_CCIC1_PHY] = RESET_DATA(APMU_CCIC_CLK_RES_CTRL,	0, BIT(2)),
+	[RESET_SDH_AXI]	= RESET_DATA(APMU_SDH0_CLK_RES_CTRL,	0, BIT(0)),
+	[RESET_SDH0]	= RESET_DATA(APMU_SDH0_CLK_RES_CTRL,	0, BIT(1)),
+	[RESET_SDH1]	= RESET_DATA(APMU_SDH1_CLK_RES_CTRL,	0, BIT(1)),
+	[RESET_SDH2]	= RESET_DATA(APMU_SDH2_CLK_RES_CTRL,	0, BIT(1)),
+	[RESET_USBP1_AXI] = RESET_DATA(APMU_USB_CLK_RES_CTRL,	0, BIT(4)),
+	[RESET_USB_AXI]	= RESET_DATA(APMU_USB_CLK_RES_CTRL,	0, BIT(0)),
+	[RESET_USB30_AHB] = RESET_DATA(APMU_USB_CLK_RES_CTRL,	0, BIT(9)),
+	[RESET_USB30_VCC] = RESET_DATA(APMU_USB_CLK_RES_CTRL,	0, BIT(10)),
+	[RESET_USB30_PHY] = RESET_DATA(APMU_USB_CLK_RES_CTRL,	0, BIT(11)),
+	[RESET_QSPI]	= RESET_DATA(APMU_QSPI_CLK_RES_CTRL,	0, BIT(1)),
+	[RESET_QSPI_BUS] = RESET_DATA(APMU_QSPI_CLK_RES_CTRL,	0, BIT(0)),
+	[RESET_DMA]	= RESET_DATA(APMU_DMA_CLK_RES_CTRL,	0, BIT(0)),
+	[RESET_AES]	= RESET_DATA(APMU_AES_CLK_RES_CTRL,	0, BIT(4)),
+	[RESET_VPU]	= RESET_DATA(APMU_VPU_CLK_RES_CTRL,	0, BIT(0)),
+	[RESET_GPU]	= RESET_DATA(APMU_GPU_CLK_RES_CTRL,	0, BIT(1)),
+	[RESET_EMMC]	= RESET_DATA(APMU_PMUA_EM_CLK_RES_CTRL,	0, BIT(1)),
+	[RESET_EMMC_X]	= RESET_DATA(APMU_PMUA_EM_CLK_RES_CTRL,	0, BIT(0)),
+	[RESET_AUDIO_SYS] = RESET_DATA(APMU_AUDIO_CLK_RES_CTRL,	0, BIT(0)),
+	[RESET_AUDIO_MCU] = RESET_DATA(APMU_AUDIO_CLK_RES_CTRL, 0, BIT(2)),
+	[RESET_AUDIO_APMU] = RESET_DATA(APMU_AUDIO_CLK_RES_CTRL, 0, BIT(3)),
+	[RESET_HDMI]	= RESET_DATA(APMU_HDMI_CLK_RES_CTRL,	0, BIT(9)),
+	[RESET_PCIE0_DBI] = RESET_DATA(APMU_PCIE_CLK_RES_CTRL_0, 0, BIT(3)),
+	[RESET_PCIE0_SLAVE] = RESET_DATA(APMU_PCIE_CLK_RES_CTRL_0, 0, BIT(4)),
+	[RESET_PCIE0_MASTER] = RESET_DATA(APMU_PCIE_CLK_RES_CTRL_0, 0, BIT(5)),
+	[RESET_PCIE0_GLOBAL] = RESET_DATA(APMU_PCIE_CLK_RES_CTRL_0, BIT(8), 0),
+	[RESET_PCIE1_DBI] = RESET_DATA(APMU_PCIE_CLK_RES_CTRL_1, 0, BIT(3)),
+	[RESET_PCIE1_SLAVE] = RESET_DATA(APMU_PCIE_CLK_RES_CTRL_1, 0, BIT(4)),
+	[RESET_PCIE1_MASTER] = RESET_DATA(APMU_PCIE_CLK_RES_CTRL_1, 0, BIT(5)),
+	[RESET_PCIE1_GLOBAL] = RESET_DATA(APMU_PCIE_CLK_RES_CTRL_1, BIT(8), 0),
+	[RESET_PCIE2_DBI] = RESET_DATA(APMU_PCIE_CLK_RES_CTRL_2, 0, BIT(3)),
+	[RESET_PCIE2_SLAVE] = RESET_DATA(APMU_PCIE_CLK_RES_CTRL_2, 0, BIT(4)),
+	[RESET_PCIE2_MASTER] = RESET_DATA(APMU_PCIE_CLK_RES_CTRL_2, 0, BIT(5)),
+	[RESET_PCIE2_GLOBAL] = RESET_DATA(APMU_PCIE_CLK_RES_CTRL_2, BIT(8), 0),
+	[RESET_EMAC0]	= RESET_DATA(APMU_EMAC0_CLK_RES_CTRL,	0, BIT(1)),
+	[RESET_EMAC1]	= RESET_DATA(APMU_EMAC1_CLK_RES_CTRL,	0, BIT(1)),
+	[RESET_JPG]	= RESET_DATA(APMU_JPG_CLK_RES_CTRL,	0, BIT(0)),
+	[RESET_CCIC2PHY] = RESET_DATA(APMU_CSI_CCIC2_CLK_RES_CTRL, 0, BIT(2)),
+	[RESET_CCIC3PHY] = RESET_DATA(APMU_CSI_CCIC2_CLK_RES_CTRL, 0, BIT(29)),
+	[RESET_CSI]	= RESET_DATA(APMU_CSI_CCIC2_CLK_RES_CTRL, 0, BIT(1)),
+	[RESET_ISP]	= RESET_DATA(APMU_ISP_CLK_RES_CTRL,	0, BIT(0)),
+	[RESET_ISP_CPP]	= RESET_DATA(APMU_ISP_CLK_RES_CTRL,	0, BIT(27)),
+	[RESET_ISP_BUS]	= RESET_DATA(APMU_ISP_CLK_RES_CTRL,	0, BIT(3)),
+	[RESET_ISP_CI]	= RESET_DATA(APMU_ISP_CLK_RES_CTRL,	0, BIT(16)),
+	[RESET_DPU_MCLK] = RESET_DATA(APMU_LCD_CLK_RES_CTRL2,	0, BIT(9)),
+	[RESET_DPU_ESC]	= RESET_DATA(APMU_LCD_CLK_RES_CTRL1,	0, BIT(3)),
+	[RESET_DPU_HCLK] = RESET_DATA(APMU_LCD_CLK_RES_CTRL1,	0, BIT(4)),
+	[RESET_DPU_SPIBUS] = RESET_DATA(APMU_LCD_SPI_CLK_RES_CTRL, 0, BIT(4)),
+	[RESET_DPU_SPI_HBUS] = RESET_DATA(APMU_LCD_SPI_CLK_RES_CTRL, 0, BIT(2)),
+	[RESET_V2D]	= RESET_DATA(APMU_LCD_CLK_RES_CTRL1,	0, BIT(27)),
+	[RESET_MIPI]	= RESET_DATA(APMU_LCD_CLK_RES_CTRL1,	0, BIT(15)),
+	[RESET_MC]	= RESET_DATA(APMU_PMUA_MC_CTRL,		0, BIT(0)),
+};
+
+static const struct ccu_reset_controller_data k1_apmu_reset_data = {
+	.reset_data	= k1_apmu_resets,
+	.count		= ARRAY_SIZE(k1_apmu_resets),
+};
+
+static const struct ccu_reset_data k1_rcpu_resets[] = { /* NOTE(review): IR/CAN look swapped */
+	[RESET_RCPU_SSP0]	= RESET_DATA(RCPU_SSP0_CLK_RST,	0, BIT(0)),
+	[RESET_RCPU_I2C0]	= RESET_DATA(RCPU_I2C0_CLK_RST,	0, BIT(0)),
+	[RESET_RCPU_UART1]	= RESET_DATA(RCPU_UART1_CLK_RST, 0, BIT(0)),
+	[RESET_RCPU_IR]		= RESET_DATA(RCPU_CAN_CLK_RST,	0, BIT(0)), /* confirm */
+	[RESET_RCPU_CAN]	= RESET_DATA(RCPU_IR_CLK_RST,	0, BIT(0)), /* confirm */
+	[RESET_RCPU_UART0]	= RESET_DATA(RCPU_UART0_CLK_RST, 0, BIT(0)),
+	[RESET_RCPU_HDMI_AUDIO]	= RESET_DATA(AUDIO_HDMI_CLK_CTRL, 0, BIT(0)),
+};
+
+static const struct ccu_reset_controller_data k1_rcpu_reset_data = {
+	.reset_data	= k1_rcpu_resets,
+	.count		= ARRAY_SIZE(k1_rcpu_resets),
+};
+
+static const struct ccu_reset_data k1_rcpu2_resets[] = {	/* one register per PWM */
+	[RESET_RCPU2_PWM0]	= RESET_DATA(RCPU2_PWM0_CLK_RST, BIT(2), BIT(0)),
+	[RESET_RCPU2_PWM1]	= RESET_DATA(RCPU2_PWM1_CLK_RST, BIT(2), BIT(0)),
+	[RESET_RCPU2_PWM2]	= RESET_DATA(RCPU2_PWM2_CLK_RST, BIT(2), BIT(0)),
+	[RESET_RCPU2_PWM3]	= RESET_DATA(RCPU2_PWM3_CLK_RST, BIT(2), BIT(0)),
+	[RESET_RCPU2_PWM4]	= RESET_DATA(RCPU2_PWM4_CLK_RST, BIT(2), BIT(0)),
+	[RESET_RCPU2_PWM5]	= RESET_DATA(RCPU2_PWM5_CLK_RST, BIT(2), BIT(0)),
+	[RESET_RCPU2_PWM6]	= RESET_DATA(RCPU2_PWM6_CLK_RST, BIT(2), BIT(0)),
+	[RESET_RCPU2_PWM7]	= RESET_DATA(RCPU2_PWM7_CLK_RST, BIT(2), BIT(0)),
+	[RESET_RCPU2_PWM8]	= RESET_DATA(RCPU2_PWM8_CLK_RST, BIT(2), BIT(0)),
+	[RESET_RCPU2_PWM9]	= RESET_DATA(RCPU2_PWM9_CLK_RST, BIT(2), BIT(0)),
+};
+
+static const struct ccu_reset_controller_data k1_rcpu2_reset_data = {
+	.reset_data	= k1_rcpu2_resets,
+	.count		= ARRAY_SIZE(k1_rcpu2_resets),
+};
+
+static const struct ccu_reset_data k1_apbc2_resets[] = {
+	[RESET_APBC2_UART1]	= RESET_DATA(APBC2_UART1_CLK_RST, BIT(2), 0),
+	[RESET_APBC2_SSP2]	= RESET_DATA(APBC2_SSP2_CLK_RST, BIT(2), 0),
+	[RESET_APBC2_TWSI3]	= RESET_DATA(APBC2_TWSI3_CLK_RST, BIT(2), 0),
+	[RESET_APBC2_RTC]	= RESET_DATA(APBC2_RTC_CLK_RST,	BIT(2), 0),
+	[RESET_APBC2_TIMERS0]	= RESET_DATA(APBC2_TIMERS0_CLK_RST, BIT(2), 0),
+	[RESET_APBC2_KPC]	= RESET_DATA(APBC2_KPC_CLK_RST,	BIT(2), 0),
+	[RESET_APBC2_GPIO]	= RESET_DATA(APBC2_GPIO_CLK_RST, BIT(2), 0),
+};
+
+static const struct ccu_reset_controller_data k1_apbc2_reset_data = {
+	.reset_data	= k1_apbc2_resets,
+	.count		= ARRAY_SIZE(k1_apbc2_resets),
+};
+
+static int spacemit_reset_update(struct reset_controller_dev *rcdev,
+				 unsigned long id, bool assert)
+{
+	struct ccu_reset_controller *controller;
+	const struct ccu_reset_data *data;
+	u32 mask;	/* union of the assert and deassert bits */
+	u32 val;	/* bits of mask to leave set; the rest are cleared */
+
+	controller = container_of(rcdev, struct ccu_reset_controller, rcdev);
+	data = &controller->data->reset_data[id];
+	mask = data->assert_mask | data->deassert_mask;
+	val = assert ? data->assert_mask : data->deassert_mask;
+
+	return regmap_update_bits(controller->regmap, data->offset, mask, val);
+}
+
+static int spacemit_reset_assert(struct reset_controller_dev *rcdev,
+				 unsigned long id)
+{
+	return spacemit_reset_update(rcdev, id, true);
+}
+
+static int spacemit_reset_deassert(struct reset_controller_dev *rcdev,
+				   unsigned long id)
+{
+	return spacemit_reset_update(rcdev, id, false);
+}
+
+static const struct reset_control_ops spacemit_reset_control_ops = {
+	.assert		= spacemit_reset_assert,
+	.deassert	= spacemit_reset_deassert,
+};
+
+static int spacemit_reset_controller_register(struct device *dev,
+					      struct ccu_reset_controller *controller)
+{
+	struct reset_controller_dev *rcdev = &controller->rcdev;
+
+	rcdev->ops = &spacemit_reset_control_ops;
+	rcdev->owner = THIS_MODULE;
+	rcdev->of_node = dev->of_node;
+	rcdev->nr_resets = controller->data->count;
+
+	return devm_reset_controller_register(dev, &controller->rcdev);
+}
+
+static int spacemit_reset_probe(struct auxiliary_device *adev,
+				const struct auxiliary_device_id *id)
+{
+	struct spacemit_ccu_adev *rdev = to_spacemit_ccu_adev(adev);
+	struct ccu_reset_controller *controller;
+	struct device *dev = &adev->dev;
+
+	controller = devm_kzalloc(dev, sizeof(*controller), GFP_KERNEL);
+	if (!controller)
+		return -ENOMEM;
+	controller->data = (const struct ccu_reset_controller_data *)id->driver_data;
+	controller->regmap = rdev->regmap;
+
+	return spacemit_reset_controller_register(dev, controller);
+}
+
+#define K1_AUX_DEV_ID(_unit) \
+	{ \
+		.name = "spacemit_ccu_k1." #_unit "-reset", \
+		.driver_data = (kernel_ulong_t)&k1_ ## _unit ## _reset_data, \
+	}
+
+static const struct auxiliary_device_id spacemit_reset_ids[] = {
+	K1_AUX_DEV_ID(mpmu),
+	K1_AUX_DEV_ID(apbc),
+	K1_AUX_DEV_ID(apmu),
+	K1_AUX_DEV_ID(rcpu),
+	K1_AUX_DEV_ID(rcpu2),
+	K1_AUX_DEV_ID(apbc2),
+	{ },
+};
+MODULE_DEVICE_TABLE(auxiliary, spacemit_reset_ids);
+
+static struct auxiliary_driver spacemit_k1_reset_driver = {
+	.probe          = spacemit_reset_probe,
+	.id_table       = spacemit_reset_ids,
+};
+module_auxiliary_driver(spacemit_k1_reset_driver);
+
+MODULE_AUTHOR("Alex Elder <elder@kernel.org>");
+MODULE_DESCRIPTION("SpacemiT reset controller driver");
+MODULE_LICENSE("GPL");
diff --git a/include/soc/spacemit/k1-syscon.h b/include/soc/spacemit/k1-syscon.h
index 53eff7691f33..c59bd7a38e5b 100644
--- a/include/soc/spacemit/k1-syscon.h
+++ b/include/soc/spacemit/k1-syscon.h
@@ -127,4 +127,34 @@ to_spacemit_ccu_adev(struct auxiliary_device *adev)
 #define APMU_EMAC0_CLK_RES_CTRL		0x3e4
 #define APMU_EMAC1_CLK_RES_CTRL		0x3ec
 
+/* RCPU register offsets */
+#define RCPU_SSP0_CLK_RST		0x0028
+#define RCPU_I2C0_CLK_RST		0x0030
+#define RCPU_UART1_CLK_RST		0x003c
+#define RCPU_CAN_CLK_RST		0x0048
+#define RCPU_IR_CLK_RST			0x004c
+#define RCPU_UART0_CLK_RST		0x00d8
+#define AUDIO_HDMI_CLK_CTRL		0x2044
+
+/* RCPU2 register offsets */
+#define RCPU2_PWM0_CLK_RST		0x0000
+#define RCPU2_PWM1_CLK_RST		0x0004
+#define RCPU2_PWM2_CLK_RST		0x0008
+#define RCPU2_PWM3_CLK_RST		0x000c
+#define RCPU2_PWM4_CLK_RST		0x0010
+#define RCPU2_PWM5_CLK_RST		0x0014
+#define RCPU2_PWM6_CLK_RST		0x0018
+#define RCPU2_PWM7_CLK_RST		0x001c
+#define RCPU2_PWM8_CLK_RST		0x0020
+#define RCPU2_PWM9_CLK_RST		0x0024
+
+/* APBC2 register offsets */
+#define APBC2_UART1_CLK_RST		0x0000
+#define APBC2_SSP2_CLK_RST		0x0004
+#define APBC2_TWSI3_CLK_RST		0x0008
+#define APBC2_RTC_CLK_RST		0x000c
+#define APBC2_TIMERS0_CLK_RST		0x0010
+#define APBC2_KPC_CLK_RST		0x0014
+#define APBC2_GPIO_CLK_RST		0x001c
+
 #endif /* __SOC_K1_SYSCON_H__ */
-- 
cgit v1.2.3


From 2ec1067d1e5acb9e7aeb1fe6d5178424fc3107ab Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Date: Tue, 1 Jul 2025 00:10:33 +0000
Subject: ASoC: soc-dapm: remove snd_soc_dapm_nc_pin[_unlocked]()

snd_soc_dapm_nc_pin() was added in commit 5817b52a298a ("ALSA: ASoC: Allow
machine drivers to mark pins as not connected") in 2008.
It is identical to snd_soc_dapm_disable_pin[_unlocked](). It was expected
to be extended, but it has been sufficient as-is for these 17 years.

We might extend these later, but aliasing the function names with a #define
is enough for now. We can re-create them if needed in the future. Let's
remove them.

Signed-off-by: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Link: https://patch.msgid.link/87tt3whitj.wl-kuninori.morimoto.gx@renesas.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc-dapm.h | 12 ++++++++++--
 sound/soc/soc-dapm.c     | 51 ------------------------------------------------
 2 files changed, 10 insertions(+), 53 deletions(-)

(limited to 'include')

diff --git a/include/sound/soc-dapm.h b/include/sound/soc-dapm.h
index 400584474bc8..2a0250fff9ba 100644
--- a/include/sound/soc-dapm.h
+++ b/include/sound/soc-dapm.h
@@ -498,8 +498,6 @@ int snd_soc_dapm_enable_pin(struct snd_soc_dapm_context *dapm, const char *pin);
 int snd_soc_dapm_enable_pin_unlocked(struct snd_soc_dapm_context *dapm, const char *pin);
 int snd_soc_dapm_disable_pin(struct snd_soc_dapm_context *dapm, const char *pin);
 int snd_soc_dapm_disable_pin_unlocked(struct snd_soc_dapm_context *dapm, const char *pin);
-int snd_soc_dapm_nc_pin(struct snd_soc_dapm_context *dapm, const char *pin);
-int snd_soc_dapm_nc_pin_unlocked(struct snd_soc_dapm_context *dapm, const char *pin);
 int snd_soc_dapm_get_pin_status(struct snd_soc_dapm_context *dapm, const char *pin);
 int snd_soc_dapm_sync(struct snd_soc_dapm_context *dapm);
 int snd_soc_dapm_sync_unlocked(struct snd_soc_dapm_context *dapm);
@@ -509,6 +507,16 @@ int snd_soc_dapm_ignore_suspend(struct snd_soc_dapm_context *dapm, const char *p
 unsigned int dapm_kcontrol_get_value(const struct snd_kcontrol *kcontrol);
 void dapm_mark_endpoints_dirty(struct snd_soc_card *card);
 
+/*
+ * Marks the specified pin as being not connected, disabling it along
+ * any parent or child widgets.  At present this is identical to
+ * snd_soc_dapm_disable_pin[_unlocked]() but in future it will be extended to do
+ * additional things such as disabling controls which only affect
+ * paths through the pin.
+ */
+#define snd_soc_dapm_nc_pin		snd_soc_dapm_disable_pin
+#define snd_soc_dapm_nc_pin_unlocked	snd_soc_dapm_disable_pin_unlocked
+
 /* dapm path query */
 int snd_soc_dapm_dai_get_connected_widgets(struct snd_soc_dai *dai, int stream,
 	struct snd_soc_dapm_widget_list **list,
diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c
index 98b35600904d..e4c2feec6e39 100644
--- a/sound/soc/soc-dapm.c
+++ b/sound/soc/soc-dapm.c
@@ -4772,57 +4772,6 @@ int snd_soc_dapm_disable_pin(struct snd_soc_dapm_context *dapm,
 }
 EXPORT_SYMBOL_GPL(snd_soc_dapm_disable_pin);
 
-/**
- * snd_soc_dapm_nc_pin_unlocked - permanently disable pin.
- * @dapm: DAPM context
- * @pin: pin name
- *
- * Marks the specified pin as being not connected, disabling it along
- * any parent or child widgets.  At present this is identical to
- * snd_soc_dapm_disable_pin() but in future it will be extended to do
- * additional things such as disabling controls which only affect
- * paths through the pin.
- *
- * Requires external locking.
- *
- * NOTE: snd_soc_dapm_sync() needs to be called after this for DAPM to
- * do any widget power switching.
- */
-int snd_soc_dapm_nc_pin_unlocked(struct snd_soc_dapm_context *dapm,
-			       const char *pin)
-{
-	return snd_soc_dapm_set_pin(dapm, pin, 0);
-}
-EXPORT_SYMBOL_GPL(snd_soc_dapm_nc_pin_unlocked);
-
-/**
- * snd_soc_dapm_nc_pin - permanently disable pin.
- * @dapm: DAPM context
- * @pin: pin name
- *
- * Marks the specified pin as being not connected, disabling it along
- * any parent or child widgets.  At present this is identical to
- * snd_soc_dapm_disable_pin() but in future it will be extended to do
- * additional things such as disabling controls which only affect
- * paths through the pin.
- *
- * NOTE: snd_soc_dapm_sync() needs to be called after this for DAPM to
- * do any widget power switching.
- */
-int snd_soc_dapm_nc_pin(struct snd_soc_dapm_context *dapm, const char *pin)
-{
-	int ret;
-
-	snd_soc_dapm_mutex_lock(dapm);
-
-	ret = snd_soc_dapm_set_pin(dapm, pin, 0);
-
-	snd_soc_dapm_mutex_unlock(dapm);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(snd_soc_dapm_nc_pin);
-
 /**
  * snd_soc_dapm_get_pin_status - get audio pin status
  * @dapm: DAPM context
-- 
cgit v1.2.3


From d6f240031afbd780431ef289357373e2bbf2f793 Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Date: Tue, 1 Jul 2025 00:10:39 +0000
Subject: ASoC: soc-dapm: remove snd_soc_dapm_weak_routes()

No one is using snd_soc_dapm_weak_routes(), let's remove it.
Because snd_soc_dapm_weak_routes() was removed, path->weak is not
needed either. Remove it, too.

Signed-off-by: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Link: https://patch.msgid.link/87sejghitd.wl-kuninori.morimoto.gx@renesas.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc-dapm.h |  3 --
 sound/soc/soc-dapm.c     | 91 +++---------------------------------------------
 2 files changed, 4 insertions(+), 90 deletions(-)

(limited to 'include')

diff --git a/include/sound/soc-dapm.h b/include/sound/soc-dapm.h
index 2a0250fff9ba..6d854c727bca 100644
--- a/include/sound/soc-dapm.h
+++ b/include/sound/soc-dapm.h
@@ -473,8 +473,6 @@ int snd_soc_dapm_add_routes(struct snd_soc_dapm_context *dapm,
 			    const struct snd_soc_dapm_route *route, int num);
 int snd_soc_dapm_del_routes(struct snd_soc_dapm_context *dapm,
 			    const struct snd_soc_dapm_route *route, int num);
-int snd_soc_dapm_weak_routes(struct snd_soc_dapm_context *dapm,
-			     const struct snd_soc_dapm_route *route, int num);
 void snd_soc_dapm_free_widget(struct snd_soc_dapm_widget *w);
 
 /* dapm events */
@@ -611,7 +609,6 @@ struct snd_soc_dapm_path {
 	/* status */
 	u32 connect:1;		/* source and sink widgets are connected */
 	u32 walking:1;		/* path is in the process of being walked */
-	u32 weak:1;		/* path ignored for power management */
 	u32 is_supply:1;	/* At least one of the connected widgets is a supply */
 
 	int (*connected)(struct snd_soc_dapm_widget *source,
diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c
index e4c2feec6e39..79cd3c93a4fd 100644
--- a/sound/soc/soc-dapm.c
+++ b/sound/soc/soc-dapm.c
@@ -214,7 +214,7 @@ static __always_inline void dapm_widget_invalidate_paths(
 
 	list_for_each_entry(w, &list, work_list) {
 		snd_soc_dapm_widget_for_each_path(w, dir, p) {
-			if (p->is_supply || p->weak || !p->connect)
+			if (p->is_supply || !p->connect)
 				continue;
 			node = p->node[rdir];
 			if (node->endpoints[dir] != -1) {
@@ -276,7 +276,7 @@ static void dapm_path_invalidate(struct snd_soc_dapm_path *p)
 	 * Weak paths or supply paths do not influence the number of input or
 	 * output paths of their neighbors.
 	 */
-	if (p->weak || p->is_supply)
+	if (p->is_supply)
 		return;
 
 	/*
@@ -1162,7 +1162,7 @@ static void invalidate_paths_ep(struct snd_soc_dapm_widget *widget,
 	widget->endpoints[dir] = -1;
 
 	snd_soc_dapm_widget_for_each_path(widget, rdir, path) {
-		if (path->weak || path->is_supply)
+		if (path->is_supply)
 			continue;
 
 		if (path->walking)
@@ -1217,7 +1217,7 @@ static __always_inline int is_connected_ep(struct snd_soc_dapm_widget *widget,
 	snd_soc_dapm_widget_for_each_path(widget, rdir, path) {
 		DAPM_UPDATE_STAT(widget, neighbour_checks);
 
-		if (path->weak || path->is_supply)
+		if (path->is_supply)
 			continue;
 
 		if (path->walking)
@@ -1454,9 +1454,6 @@ static int dapm_supply_check_power(struct snd_soc_dapm_widget *w)
 	snd_soc_dapm_widget_for_each_sink_path(w, path) {
 		DAPM_UPDATE_STAT(w, neighbour_checks);
 
-		if (path->weak)
-			continue;
-
 		if (path->connected &&
 		    !path->connected(path->source, path->sink))
 			continue;
@@ -3202,86 +3199,6 @@ int snd_soc_dapm_del_routes(struct snd_soc_dapm_context *dapm,
 }
 EXPORT_SYMBOL_GPL(snd_soc_dapm_del_routes);
 
-static int snd_soc_dapm_weak_route(struct snd_soc_dapm_context *dapm,
-				   const struct snd_soc_dapm_route *route)
-{
-	struct snd_soc_dapm_widget *source = dapm_find_widget(dapm,
-							      route->source,
-							      true);
-	struct snd_soc_dapm_widget *sink = dapm_find_widget(dapm,
-							    route->sink,
-							    true);
-	struct snd_soc_dapm_path *path;
-	int count = 0;
-
-	if (!source) {
-		dev_err(dapm->dev, "ASoC: Unable to find source %s for weak route\n",
-			route->source);
-		return -ENODEV;
-	}
-
-	if (!sink) {
-		dev_err(dapm->dev, "ASoC: Unable to find sink %s for weak route\n",
-			route->sink);
-		return -ENODEV;
-	}
-
-	if (route->control || route->connected)
-		dev_warn(dapm->dev, "ASoC: Ignoring control for weak route %s->%s\n",
-			 route->source, route->sink);
-
-	snd_soc_dapm_widget_for_each_sink_path(source, path) {
-		if (path->sink == sink) {
-			path->weak = 1;
-			count++;
-		}
-	}
-
-	if (count == 0)
-		dev_err(dapm->dev, "ASoC: No path found for weak route %s->%s\n",
-			route->source, route->sink);
-	if (count > 1)
-		dev_warn(dapm->dev, "ASoC: %d paths found for weak route %s->%s\n",
-			 count, route->source, route->sink);
-
-	return 0;
-}
-
-/**
- * snd_soc_dapm_weak_routes - Mark routes between DAPM widgets as weak
- * @dapm: DAPM context
- * @route: audio routes
- * @num: number of routes
- *
- * Mark existing routes matching those specified in the passed array
- * as being weak, meaning that they are ignored for the purpose of
- * power decisions.  The main intended use case is for sidetone paths
- * which couple audio between other independent paths if they are both
- * active in order to make the combination work better at the user
- * level but which aren't intended to be "used".
- *
- * Note that CODEC drivers should not use this as sidetone type paths
- * can frequently also be used as bypass paths.
- */
-int snd_soc_dapm_weak_routes(struct snd_soc_dapm_context *dapm,
-			     const struct snd_soc_dapm_route *route, int num)
-{
-	int i;
-	int ret = 0;
-
-	snd_soc_dapm_mutex_lock_root(dapm);
-	for (i = 0; i < num; i++) {
-		int err = snd_soc_dapm_weak_route(dapm, route);
-		if (err)
-			ret = err;
-		route++;
-	}
-	snd_soc_dapm_mutex_unlock(dapm);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(snd_soc_dapm_weak_routes);
-
 /**
  * snd_soc_dapm_new_widgets - add new dapm widgets
  * @card: card to be checked for new dapm widgets
-- 
cgit v1.2.3


From f02ccc8c0b99382807bac0065918a90ba974e9ab Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Date: Tue, 1 Jul 2025 00:10:51 +0000
Subject: ASoC: soc-dapm: reordering header definitions

Because the header's definitions are in no particular order, soc-dapm.h
needs forward declarations at its top. They are not needed if the
definitions are placed in the correct order.

This patch is large in terms of changed lines, but it is just a
reordering; nothing changes in meaning.

Signed-off-by: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Link: https://patch.msgid.link/87plekhit0.wl-kuninori.morimoto.gx@renesas.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc-dapm.h | 235 ++++++++++++++++++++++-------------------------
 1 file changed, 110 insertions(+), 125 deletions(-)

(limited to 'include')

diff --git a/include/sound/soc-dapm.h b/include/sound/soc-dapm.h
index 6d854c727bca..5aeb0822ce0b 100644
--- a/include/sound/soc-dapm.h
+++ b/include/sound/soc-dapm.h
@@ -16,9 +16,10 @@
 #include <sound/asoc.h>
 
 struct device;
+struct regulator;
+struct soc_enum;
 struct snd_pcm_substream;
 struct snd_soc_pcm_runtime;
-struct soc_enum;
 
 /* widget has no PM register bit */
 #define SND_SOC_NOPM	-1
@@ -399,17 +400,6 @@ struct soc_enum;
 /* regulator widget flags */
 #define SND_SOC_DAPM_REGULATOR_BYPASS	0x1	/* bypass when disabled */
 
-struct snd_soc_dapm_widget;
-enum snd_soc_dapm_type;
-struct snd_soc_dapm_path;
-struct snd_soc_dapm_pin;
-struct snd_soc_dapm_route;
-struct snd_soc_dapm_context;
-struct regulator;
-struct snd_soc_dapm_widget_list;
-struct snd_soc_dapm_update;
-enum snd_soc_dapm_direction;
-
 /*
  * Bias levels
  *
@@ -428,104 +418,6 @@ enum snd_soc_bias_level {
 	SND_SOC_BIAS_ON = 3,
 };
 
-int dapm_regulator_event(struct snd_soc_dapm_widget *w, struct snd_kcontrol *kcontrol, int event);
-int dapm_clock_event(struct snd_soc_dapm_widget *w, struct snd_kcontrol *kcontrol, int event);
-int dapm_pinctrl_event(struct snd_soc_dapm_widget *w, struct snd_kcontrol *kcontrol, int event);
-
-/* dapm controls */
-int snd_soc_dapm_put_volsw(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol);
-int snd_soc_dapm_get_volsw(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol);
-int snd_soc_dapm_get_enum_double(struct snd_kcontrol *kcontrol,
-	struct snd_ctl_elem_value *ucontrol);
-int snd_soc_dapm_put_enum_double(struct snd_kcontrol *kcontrol,
-	struct snd_ctl_elem_value *ucontrol);
-int snd_soc_dapm_info_pin_switch(struct snd_kcontrol *kcontrol,
-	struct snd_ctl_elem_info *uinfo);
-int snd_soc_dapm_get_pin_switch(struct snd_kcontrol *kcontrol,
-	struct snd_ctl_elem_value *uncontrol);
-int snd_soc_dapm_put_pin_switch(struct snd_kcontrol *kcontrol,
-	struct snd_ctl_elem_value *uncontrol);
-int snd_soc_dapm_get_component_pin_switch(struct snd_kcontrol *kcontrol,
-					  struct snd_ctl_elem_value *uncontrol);
-int snd_soc_dapm_put_component_pin_switch(struct snd_kcontrol *kcontrol,
-					  struct snd_ctl_elem_value *uncontrol);
-int snd_soc_dapm_new_controls(struct snd_soc_dapm_context *dapm,
-	const struct snd_soc_dapm_widget *widget, unsigned int num);
-struct snd_soc_dapm_widget *snd_soc_dapm_new_control(struct snd_soc_dapm_context *dapm,
-		const struct snd_soc_dapm_widget *widget);
-struct snd_soc_dapm_widget *snd_soc_dapm_new_control_unlocked(struct snd_soc_dapm_context *dapm,
-		const struct snd_soc_dapm_widget *widget);
-int snd_soc_dapm_new_dai_widgets(struct snd_soc_dapm_context *dapm, struct snd_soc_dai *dai);
-void snd_soc_dapm_free_widget(struct snd_soc_dapm_widget *w);
-int snd_soc_dapm_link_dai_widgets(struct snd_soc_card *card);
-void snd_soc_dapm_connect_dai_link_widgets(struct snd_soc_card *card);
-
-int snd_soc_dapm_update_dai(struct snd_pcm_substream *substream,
-			    struct snd_pcm_hw_params *params, struct snd_soc_dai *dai);
-int snd_soc_dapm_widget_name_cmp(struct snd_soc_dapm_widget *widget, const char *s);
-
-/* dapm path setup */
-int snd_soc_dapm_new_widgets(struct snd_soc_card *card);
-void snd_soc_dapm_free(struct snd_soc_dapm_context *dapm);
-void snd_soc_dapm_init(struct snd_soc_dapm_context *dapm,
-		       struct snd_soc_card *card, struct snd_soc_component *component);
-int snd_soc_dapm_add_routes(struct snd_soc_dapm_context *dapm,
-			    const struct snd_soc_dapm_route *route, int num);
-int snd_soc_dapm_del_routes(struct snd_soc_dapm_context *dapm,
-			    const struct snd_soc_dapm_route *route, int num);
-void snd_soc_dapm_free_widget(struct snd_soc_dapm_widget *w);
-
-/* dapm events */
-void snd_soc_dapm_stream_event(struct snd_soc_pcm_runtime *rtd, int stream, int event);
-void snd_soc_dapm_stream_stop(struct snd_soc_pcm_runtime *rtd, int stream);
-void snd_soc_dapm_shutdown(struct snd_soc_card *card);
-
-/* external DAPM widget events */
-int snd_soc_dapm_mixer_update_power(struct snd_soc_dapm_context *dapm,
-		struct snd_kcontrol *kcontrol, int connect, struct snd_soc_dapm_update *update);
-int snd_soc_dapm_mux_update_power(struct snd_soc_dapm_context *dapm,
-		struct snd_kcontrol *kcontrol, int mux, struct soc_enum *e,
-		struct snd_soc_dapm_update *update);
-
-/* dapm sys fs - used by the core */
-extern struct attribute *soc_dapm_dev_attrs[];
-void snd_soc_dapm_debugfs_init(struct snd_soc_dapm_context *dapm, struct dentry *parent);
-
-/* dapm audio pin control and status */
-int snd_soc_dapm_enable_pin(struct snd_soc_dapm_context *dapm, const char *pin);
-int snd_soc_dapm_enable_pin_unlocked(struct snd_soc_dapm_context *dapm, const char *pin);
-int snd_soc_dapm_disable_pin(struct snd_soc_dapm_context *dapm, const char *pin);
-int snd_soc_dapm_disable_pin_unlocked(struct snd_soc_dapm_context *dapm, const char *pin);
-int snd_soc_dapm_get_pin_status(struct snd_soc_dapm_context *dapm, const char *pin);
-int snd_soc_dapm_sync(struct snd_soc_dapm_context *dapm);
-int snd_soc_dapm_sync_unlocked(struct snd_soc_dapm_context *dapm);
-int snd_soc_dapm_force_enable_pin(struct snd_soc_dapm_context *dapm, const char *pin);
-int snd_soc_dapm_force_enable_pin_unlocked(struct snd_soc_dapm_context *dapm, const char *pin);
-int snd_soc_dapm_ignore_suspend(struct snd_soc_dapm_context *dapm, const char *pin);
-unsigned int dapm_kcontrol_get_value(const struct snd_kcontrol *kcontrol);
-void dapm_mark_endpoints_dirty(struct snd_soc_card *card);
-
-/*
- * Marks the specified pin as being not connected, disabling it along
- * any parent or child widgets.  At present this is identical to
- * snd_soc_dapm_disable_pin[_unlocked]() but in future it will be extended to do
- * additional things such as disabling controls which only affect
- * paths through the pin.
- */
-#define snd_soc_dapm_nc_pin		snd_soc_dapm_disable_pin
-#define snd_soc_dapm_nc_pin_unlocked	snd_soc_dapm_disable_pin_unlocked
-
-/* dapm path query */
-int snd_soc_dapm_dai_get_connected_widgets(struct snd_soc_dai *dai, int stream,
-	struct snd_soc_dapm_widget_list **list,
-	bool (*custom_stop_condition)(struct snd_soc_dapm_widget *, enum snd_soc_dapm_direction));
-void snd_soc_dapm_dai_free_widgets(struct snd_soc_dapm_widget_list **list);
-
-struct snd_soc_dapm_context *snd_soc_dapm_kcontrol_dapm(struct snd_kcontrol *kcontrol);
-struct snd_soc_dapm_widget *snd_soc_dapm_kcontrol_widget(struct snd_kcontrol *kcontrol);
-
-int snd_soc_dapm_force_bias_level(struct snd_soc_dapm_context *dapm, enum snd_soc_bias_level level);
-
 /* dapm widget types */
 enum snd_soc_dapm_type {
 	snd_soc_dapm_input = 0,		/* input pin */
@@ -717,11 +609,6 @@ struct snd_soc_dapm_widget_list {
 	struct snd_soc_dapm_widget *widgets[] __counted_by(num_widgets);
 };
 
-#define for_each_dapm_widgets(list, i, widget)				\
-	for ((i) = 0;							\
-	     (i) < list->num_widgets && (widget = list->widgets[i]);	\
-	     (i)++)
-
 struct snd_soc_dapm_stats {
 	int power_checks;
 	int path_checks;
@@ -733,6 +620,114 @@ struct snd_soc_dapm_pinctrl_priv {
 	const char *sleep_state;
 };
 
+enum snd_soc_dapm_direction {
+	SND_SOC_DAPM_DIR_IN,
+	SND_SOC_DAPM_DIR_OUT
+};
+
+#define SND_SOC_DAPM_DIR_TO_EP(x) BIT(x)
+
+#define SND_SOC_DAPM_EP_SOURCE	SND_SOC_DAPM_DIR_TO_EP(SND_SOC_DAPM_DIR_IN)
+#define SND_SOC_DAPM_EP_SINK	SND_SOC_DAPM_DIR_TO_EP(SND_SOC_DAPM_DIR_OUT)
+
+int dapm_regulator_event(struct snd_soc_dapm_widget *w, struct snd_kcontrol *kcontrol, int event);
+int dapm_clock_event(struct snd_soc_dapm_widget *w, struct snd_kcontrol *kcontrol, int event);
+int dapm_pinctrl_event(struct snd_soc_dapm_widget *w, struct snd_kcontrol *kcontrol, int event);
+
+/* dapm controls */
+int snd_soc_dapm_put_volsw(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol);
+int snd_soc_dapm_get_volsw(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol);
+int snd_soc_dapm_get_enum_double(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol);
+int snd_soc_dapm_put_enum_double(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol);
+int snd_soc_dapm_info_pin_switch(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo);
+int snd_soc_dapm_get_pin_switch(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *uncontrol);
+int snd_soc_dapm_put_pin_switch(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *uncontrol);
+int snd_soc_dapm_get_component_pin_switch(struct snd_kcontrol *kcontrol,
+					  struct snd_ctl_elem_value *uncontrol);
+int snd_soc_dapm_put_component_pin_switch(struct snd_kcontrol *kcontrol,
+					  struct snd_ctl_elem_value *uncontrol);
+int snd_soc_dapm_new_controls(struct snd_soc_dapm_context *dapm,
+	const struct snd_soc_dapm_widget *widget, unsigned int num);
+struct snd_soc_dapm_widget *snd_soc_dapm_new_control(struct snd_soc_dapm_context *dapm,
+		const struct snd_soc_dapm_widget *widget);
+struct snd_soc_dapm_widget *snd_soc_dapm_new_control_unlocked(struct snd_soc_dapm_context *dapm,
+		const struct snd_soc_dapm_widget *widget);
+int snd_soc_dapm_new_dai_widgets(struct snd_soc_dapm_context *dapm, struct snd_soc_dai *dai);
+void snd_soc_dapm_free_widget(struct snd_soc_dapm_widget *w);
+int snd_soc_dapm_link_dai_widgets(struct snd_soc_card *card);
+void snd_soc_dapm_connect_dai_link_widgets(struct snd_soc_card *card);
+
+int snd_soc_dapm_update_dai(struct snd_pcm_substream *substream,
+			    struct snd_pcm_hw_params *params, struct snd_soc_dai *dai);
+int snd_soc_dapm_widget_name_cmp(struct snd_soc_dapm_widget *widget, const char *s);
+
+/* dapm path setup */
+int snd_soc_dapm_new_widgets(struct snd_soc_card *card);
+void snd_soc_dapm_free(struct snd_soc_dapm_context *dapm);
+void snd_soc_dapm_init(struct snd_soc_dapm_context *dapm,
+		       struct snd_soc_card *card, struct snd_soc_component *component);
+int snd_soc_dapm_add_routes(struct snd_soc_dapm_context *dapm,
+			    const struct snd_soc_dapm_route *route, int num);
+int snd_soc_dapm_del_routes(struct snd_soc_dapm_context *dapm,
+			    const struct snd_soc_dapm_route *route, int num);
+void snd_soc_dapm_free_widget(struct snd_soc_dapm_widget *w);
+
+/* dapm events */
+void snd_soc_dapm_stream_event(struct snd_soc_pcm_runtime *rtd, int stream, int event);
+void snd_soc_dapm_stream_stop(struct snd_soc_pcm_runtime *rtd, int stream);
+void snd_soc_dapm_shutdown(struct snd_soc_card *card);
+
+/* external DAPM widget events */
+int snd_soc_dapm_mixer_update_power(struct snd_soc_dapm_context *dapm,
+		struct snd_kcontrol *kcontrol, int connect, struct snd_soc_dapm_update *update);
+int snd_soc_dapm_mux_update_power(struct snd_soc_dapm_context *dapm,
+		struct snd_kcontrol *kcontrol, int mux, struct soc_enum *e,
+		struct snd_soc_dapm_update *update);
+
+/* dapm sys fs - used by the core */
+extern struct attribute *soc_dapm_dev_attrs[];
+void snd_soc_dapm_debugfs_init(struct snd_soc_dapm_context *dapm, struct dentry *parent);
+
+/* dapm audio pin control and status */
+int snd_soc_dapm_enable_pin(struct snd_soc_dapm_context *dapm, const char *pin);
+int snd_soc_dapm_enable_pin_unlocked(struct snd_soc_dapm_context *dapm, const char *pin);
+int snd_soc_dapm_disable_pin(struct snd_soc_dapm_context *dapm, const char *pin);
+int snd_soc_dapm_disable_pin_unlocked(struct snd_soc_dapm_context *dapm, const char *pin);
+int snd_soc_dapm_get_pin_status(struct snd_soc_dapm_context *dapm, const char *pin);
+int snd_soc_dapm_sync(struct snd_soc_dapm_context *dapm);
+int snd_soc_dapm_sync_unlocked(struct snd_soc_dapm_context *dapm);
+int snd_soc_dapm_force_enable_pin(struct snd_soc_dapm_context *dapm, const char *pin);
+int snd_soc_dapm_force_enable_pin_unlocked(struct snd_soc_dapm_context *dapm, const char *pin);
+int snd_soc_dapm_ignore_suspend(struct snd_soc_dapm_context *dapm, const char *pin);
+unsigned int dapm_kcontrol_get_value(const struct snd_kcontrol *kcontrol);
+void dapm_mark_endpoints_dirty(struct snd_soc_card *card);
+
+/*
+ * Marks the specified pin as being not connected, disabling it along
+ * any parent or child widgets.  At present this is identical to
+ * snd_soc_dapm_disable_pin[_unlocked]() but in future it will be extended to do
+ * additional things such as disabling controls which only affect
+ * paths through the pin.
+ */
+#define snd_soc_dapm_nc_pin		snd_soc_dapm_disable_pin
+#define snd_soc_dapm_nc_pin_unlocked	snd_soc_dapm_disable_pin_unlocked
+
+/* dapm path query */
+int snd_soc_dapm_dai_get_connected_widgets(struct snd_soc_dai *dai, int stream,
+	struct snd_soc_dapm_widget_list **list,
+	bool (*custom_stop_condition)(struct snd_soc_dapm_widget *, enum snd_soc_dapm_direction));
+void snd_soc_dapm_dai_free_widgets(struct snd_soc_dapm_widget_list **list);
+
+struct snd_soc_dapm_context *snd_soc_dapm_kcontrol_dapm(struct snd_kcontrol *kcontrol);
+struct snd_soc_dapm_widget *snd_soc_dapm_kcontrol_widget(struct snd_kcontrol *kcontrol);
+
+int snd_soc_dapm_force_bias_level(struct snd_soc_dapm_context *dapm, enum snd_soc_bias_level level);
+
+#define for_each_dapm_widgets(list, i, widget)				\
+	for ((i) = 0;							\
+	     (i) < list->num_widgets && (widget = list->widgets[i]);	\
+	     (i)++)
+
 /**
  * snd_soc_dapm_init_bias_level() - Initialize DAPM bias level
  * @dapm: The DAPM context to initialize
@@ -764,16 +759,6 @@ static inline enum snd_soc_bias_level snd_soc_dapm_get_bias_level(
 	return dapm->bias_level;
 }
 
-enum snd_soc_dapm_direction {
-	SND_SOC_DAPM_DIR_IN,
-	SND_SOC_DAPM_DIR_OUT
-};
-
-#define SND_SOC_DAPM_DIR_TO_EP(x) BIT(x)
-
-#define SND_SOC_DAPM_EP_SOURCE	SND_SOC_DAPM_DIR_TO_EP(SND_SOC_DAPM_DIR_IN)
-#define SND_SOC_DAPM_EP_SINK	SND_SOC_DAPM_DIR_TO_EP(SND_SOC_DAPM_DIR_OUT)
-
 /**
  * snd_soc_dapm_widget_for_each_path - Iterates over all paths in the
  *   specified direction of a widget
-- 
cgit v1.2.3


From 805c019fbb94795e391974f673c5b3e57b825f6d Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Date: Tue, 1 Jul 2025 00:11:08 +0000
Subject: ASoC: soc-dapm: add prefix on dapm_mark_endpoints_dirty()

dapm_mark_endpoints_dirty() is a global function. Let's add the snd_soc_
prefix.

Signed-off-by: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Link: https://patch.msgid.link/87ldp8hisj.wl-kuninori.morimoto.gx@renesas.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc-dapm.h | 2 +-
 sound/soc/soc-core.c     | 6 +++---
 sound/soc/soc-dapm.c     | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/sound/soc-dapm.h b/include/sound/soc-dapm.h
index 5aeb0822ce0b..ee77a80765c6 100644
--- a/include/sound/soc-dapm.h
+++ b/include/sound/soc-dapm.h
@@ -700,7 +700,7 @@ int snd_soc_dapm_force_enable_pin(struct snd_soc_dapm_context *dapm, const char
 int snd_soc_dapm_force_enable_pin_unlocked(struct snd_soc_dapm_context *dapm, const char *pin);
 int snd_soc_dapm_ignore_suspend(struct snd_soc_dapm_context *dapm, const char *pin);
 unsigned int dapm_kcontrol_get_value(const struct snd_kcontrol *kcontrol);
-void dapm_mark_endpoints_dirty(struct snd_soc_card *card);
+void snd_soc_dapm_mark_endpoints_dirty(struct snd_soc_card *card);
 
 /*
  * Marks the specified pin as being not connected, disabling it along
diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c
index 67bebc339148..b59f612de4e5 100644
--- a/sound/soc/soc-core.c
+++ b/sound/soc/soc-core.c
@@ -681,7 +681,7 @@ int snd_soc_suspend(struct device *dev)
 	soc_dapm_suspend_resume(card, SND_SOC_DAPM_STREAM_SUSPEND);
 
 	/* Recheck all endpoints too, their state is affected by suspend */
-	dapm_mark_endpoints_dirty(card);
+	snd_soc_dapm_mark_endpoints_dirty(card);
 	snd_soc_dapm_sync(&card->dapm);
 
 	/* suspend all COMPONENTs */
@@ -778,7 +778,7 @@ static void soc_resume_deferred(struct work_struct *work)
 	dev_dbg(card->dev, "ASoC: resume work completed\n");
 
 	/* Recheck all endpoints too, their state is affected by suspend */
-	dapm_mark_endpoints_dirty(card);
+	snd_soc_dapm_mark_endpoints_dirty(card);
 	snd_soc_dapm_sync(&card->dapm);
 
 	/* userspace can access us now we are back as we were before */
@@ -2286,7 +2286,7 @@ static int snd_soc_bind_card(struct snd_soc_card *card)
 	}
 
 	card->instantiated = 1;
-	dapm_mark_endpoints_dirty(card);
+	snd_soc_dapm_mark_endpoints_dirty(card);
 	snd_soc_dapm_sync(&card->dapm);
 
 	/* deactivate pins to sleep state */
diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c
index 16f8eff42835..d612dfbd245e 100644
--- a/sound/soc/soc-dapm.c
+++ b/sound/soc/soc-dapm.c
@@ -283,7 +283,7 @@ static void dapm_path_invalidate(struct snd_soc_dapm_path *p)
 		dapm_widget_invalidate_output_paths(p->source);
 }
 
-void dapm_mark_endpoints_dirty(struct snd_soc_card *card)
+void snd_soc_dapm_mark_endpoints_dirty(struct snd_soc_card *card)
 {
 	struct snd_soc_dapm_widget *w;
 
-- 
cgit v1.2.3


From 9d33f9ca4404e532453a0305ce11bf76a9945c5d Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Date: Tue, 1 Jul 2025 00:11:15 +0000
Subject: ASoC: soc-dapm: add prefix on dapm_xxx_event()

The dapm_xxx_event() functions are global. Let's add the snd_soc_ prefix.

Signed-off-by: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Link: https://patch.msgid.link/87jz4shisc.wl-kuninori.morimoto.gx@renesas.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc-dapm.h                  | 12 ++++++------
 sound/soc/mediatek/mt8188/mt8188-mt6359.c |  4 ++--
 sound/soc/soc-dapm.c                      | 20 ++++++++++----------
 3 files changed, 18 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/sound/soc-dapm.h b/include/sound/soc-dapm.h
index ee77a80765c6..be5ecc276562 100644
--- a/include/sound/soc-dapm.h
+++ b/include/sound/soc-dapm.h
@@ -295,7 +295,7 @@ struct snd_soc_pcm_runtime;
 #define SND_SOC_DAPM_CLOCK_SUPPLY(wname) \
 (struct snd_soc_dapm_widget) { \
 	.id = snd_soc_dapm_clock_supply, .name = wname, \
-	.reg = SND_SOC_NOPM, .event = dapm_clock_event, \
+	.reg = SND_SOC_NOPM, .event = snd_soc_dapm_clock_event, \
 	.event_flags = SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMD }
 
 /* generic widgets */
@@ -312,7 +312,7 @@ struct snd_soc_pcm_runtime;
 #define SND_SOC_DAPM_REGULATOR_SUPPLY(wname, wdelay, wflags)	    \
 (struct snd_soc_dapm_widget) { \
 	.id = snd_soc_dapm_regulator_supply, .name = wname, \
-	.reg = SND_SOC_NOPM, .shift = wdelay, .event = dapm_regulator_event, \
+	.reg = SND_SOC_NOPM, .shift = wdelay, .event = snd_soc_dapm_regulator_event, \
 	.event_flags = SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMD, \
 	.on_val = wflags}
 #define SND_SOC_DAPM_PINCTRL(wname, active, sleep) \
@@ -320,7 +320,7 @@ struct snd_soc_pcm_runtime;
 	.id = snd_soc_dapm_pinctrl, .name = wname, \
 	.priv = (&(struct snd_soc_dapm_pinctrl_priv) \
 		{ .active_state = active, .sleep_state = sleep,}), \
-	.reg = SND_SOC_NOPM, .event = dapm_pinctrl_event, \
+	.reg = SND_SOC_NOPM, .event = snd_soc_dapm_pinctrl_event, \
 	.event_flags = SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMD }
 
 
@@ -630,9 +630,9 @@ enum snd_soc_dapm_direction {
 #define SND_SOC_DAPM_EP_SOURCE	SND_SOC_DAPM_DIR_TO_EP(SND_SOC_DAPM_DIR_IN)
 #define SND_SOC_DAPM_EP_SINK	SND_SOC_DAPM_DIR_TO_EP(SND_SOC_DAPM_DIR_OUT)
 
-int dapm_regulator_event(struct snd_soc_dapm_widget *w, struct snd_kcontrol *kcontrol, int event);
-int dapm_clock_event(struct snd_soc_dapm_widget *w, struct snd_kcontrol *kcontrol, int event);
-int dapm_pinctrl_event(struct snd_soc_dapm_widget *w, struct snd_kcontrol *kcontrol, int event);
+int snd_soc_dapm_regulator_event(struct snd_soc_dapm_widget *w, struct snd_kcontrol *kcontrol, int event);
+int snd_soc_dapm_clock_event(struct snd_soc_dapm_widget *w, struct snd_kcontrol *kcontrol, int event);
+int snd_soc_dapm_pinctrl_event(struct snd_soc_dapm_widget *w, struct snd_kcontrol *kcontrol, int event);
 
 /* dapm controls */
 int snd_soc_dapm_put_volsw(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol);
diff --git a/sound/soc/mediatek/mt8188/mt8188-mt6359.c b/sound/soc/mediatek/mt8188/mt8188-mt6359.c
index a2a76b6df631..ea814a0f726d 100644
--- a/sound/soc/mediatek/mt8188/mt8188-mt6359.c
+++ b/sound/soc/mediatek/mt8188/mt8188-mt6359.c
@@ -408,7 +408,7 @@ static int mt8188_mt6359_mtkaif_calibration(struct snd_soc_pcm_runtime *rtd)
 	}
 
 	if (pin_w)
-		dapm_pinctrl_event(pin_w, NULL, SND_SOC_DAPM_PRE_PMU);
+		snd_soc_dapm_pinctrl_event(pin_w, NULL, SND_SOC_DAPM_PRE_PMU);
 	else
 		dev_dbg(afe->dev, "%s(), no pinmux widget, please check if default on\n", __func__);
 
@@ -510,7 +510,7 @@ static int mt8188_mt6359_mtkaif_calibration(struct snd_soc_pcm_runtime *rtd)
 		param->mtkaif_phase_cycle[i] = mtkaif_phase_cycle[i];
 
 	if (pin_w)
-		dapm_pinctrl_event(pin_w, NULL, SND_SOC_DAPM_POST_PMD);
+		snd_soc_dapm_pinctrl_event(pin_w, NULL, SND_SOC_DAPM_POST_PMD);
 
 	dev_dbg(afe->dev, "%s(), end, calibration ok %d\n",
 		__func__, param->mtkaif_calibration_ok);
diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c
index d612dfbd245e..e8bd83fb9ef4 100644
--- a/sound/soc/soc-dapm.c
+++ b/sound/soc/soc-dapm.c
@@ -1537,8 +1537,8 @@ EXPORT_SYMBOL_GPL(snd_soc_dapm_dai_free_widgets);
 /*
  * Handler for regulator supply widget.
  */
-int dapm_regulator_event(struct snd_soc_dapm_widget *w,
-		   struct snd_kcontrol *kcontrol, int event)
+int snd_soc_dapm_regulator_event(struct snd_soc_dapm_widget *w,
+				 struct snd_kcontrol *kcontrol, int event)
 {
 	int ret;
 
@@ -1566,13 +1566,13 @@ int dapm_regulator_event(struct snd_soc_dapm_widget *w,
 		return regulator_disable_deferred(w->regulator, w->shift);
 	}
 }
-EXPORT_SYMBOL_GPL(dapm_regulator_event);
+EXPORT_SYMBOL_GPL(snd_soc_dapm_regulator_event);
 
 /*
  * Handler for pinctrl widget.
  */
-int dapm_pinctrl_event(struct snd_soc_dapm_widget *w,
-		       struct snd_kcontrol *kcontrol, int event)
+int snd_soc_dapm_pinctrl_event(struct snd_soc_dapm_widget *w,
+			       struct snd_kcontrol *kcontrol, int event)
 {
 	struct snd_soc_dapm_pinctrl_priv *priv = w->priv;
 	struct pinctrl *p = w->pinctrl;
@@ -1591,13 +1591,13 @@ int dapm_pinctrl_event(struct snd_soc_dapm_widget *w,
 
 	return pinctrl_select_state(p, s);
 }
-EXPORT_SYMBOL_GPL(dapm_pinctrl_event);
+EXPORT_SYMBOL_GPL(snd_soc_dapm_pinctrl_event);
 
 /*
  * Handler for clock supply widget.
  */
-int dapm_clock_event(struct snd_soc_dapm_widget *w,
-		   struct snd_kcontrol *kcontrol, int event)
+int snd_soc_dapm_clock_event(struct snd_soc_dapm_widget *w,
+			     struct snd_kcontrol *kcontrol, int event)
 {
 	if (!w->clk)
 		return -EIO;
@@ -1613,7 +1613,7 @@ int dapm_clock_event(struct snd_soc_dapm_widget *w,
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(dapm_clock_event);
+EXPORT_SYMBOL_GPL(snd_soc_dapm_clock_event);
 
 static int dapm_widget_power_check(struct snd_soc_dapm_widget *w)
 {
@@ -3668,7 +3668,7 @@ snd_soc_dapm_new_control_unlocked(struct snd_soc_dapm_context *dapm,
 		}
 
 		/* set to sleep_state when initializing */
-		dapm_pinctrl_event(w, NULL, SND_SOC_DAPM_POST_PMD);
+		snd_soc_dapm_pinctrl_event(w, NULL, SND_SOC_DAPM_POST_PMD);
 		break;
 	case snd_soc_dapm_clock_supply:
 		w->clk = devm_clk_get(dapm->dev, widget->name);
-- 
cgit v1.2.3


From 08dc0f5cc26a203e8008c38d9b436c079e7dbb45 Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Date: Tue, 1 Jul 2025 00:11:23 +0000
Subject: ASoC: soc-dapm: add prefix on soc_dapm_dev_attrs

soc_dapm_dev_attrs is a global variable. Let's add the snd_soc_ prefix.

Signed-off-by: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Link: https://patch.msgid.link/87ikkchis6.wl-kuninori.morimoto.gx@renesas.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc-dapm.h | 2 +-
 sound/soc/soc-core.c     | 2 +-
 sound/soc/soc-dapm.c     | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/sound/soc-dapm.h b/include/sound/soc-dapm.h
index be5ecc276562..0b5c7e6a90c8 100644
--- a/include/sound/soc-dapm.h
+++ b/include/sound/soc-dapm.h
@@ -685,7 +685,7 @@ int snd_soc_dapm_mux_update_power(struct snd_soc_dapm_context *dapm,
 		struct snd_soc_dapm_update *update);
 
 /* dapm sys fs - used by the core */
-extern struct attribute *soc_dapm_dev_attrs[];
+extern struct attribute *snd_soc_dapm_dev_attrs[];
 void snd_soc_dapm_debugfs_init(struct snd_soc_dapm_context *dapm, struct dentry *parent);
 
 /* dapm audio pin control and status */
diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c
index b59f612de4e5..ebe94956df44 100644
--- a/sound/soc/soc-core.c
+++ b/sound/soc/soc-core.c
@@ -112,7 +112,7 @@ static umode_t soc_dev_attr_is_visible(struct kobject *kobj,
 }
 
 static const struct attribute_group soc_dapm_dev_group = {
-	.attrs = soc_dapm_dev_attrs,
+	.attrs = snd_soc_dapm_dev_attrs,
 	.is_visible = soc_dev_attr_is_visible,
 };
 
diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c
index e8bd83fb9ef4..b8a5875378c8 100644
--- a/sound/soc/soc-dapm.c
+++ b/sound/soc/soc-dapm.c
@@ -2715,7 +2715,7 @@ static ssize_t dapm_widget_show(struct device *dev,
 
 static DEVICE_ATTR_RO(dapm_widget);
 
-struct attribute *soc_dapm_dev_attrs[] = {
+struct attribute *snd_soc_dapm_dev_attrs[] = {
 	&dev_attr_dapm_widget.attr,
 	NULL
 };
-- 
cgit v1.2.3


From c4aa454c64ae022e5a9d55b3c31e9b8dd8a1544f Mon Sep 17 00:00:00 2001
From: Eduard Zingerman <eddyz87@gmail.com>
Date: Fri, 4 Jul 2025 16:03:53 -0700
Subject: bpf: support for void/primitive __arg_untrusted global func params

Allow specifying __arg_untrusted for void */char */int */long *
parameters. Treat such parameters as
PTR_TO_MEM|MEM_RDONLY|PTR_UNTRUSTED of size zero.
Intended usage is as follows:

  int memcmp(char *a __arg_untrusted, char *b __arg_untrusted, size_t n) {
    bpf_for(i, 0, n) {
      if (a[i] - b[i])      // load at any offset is allowed
        return a[i] - b[i];
    }
    return 0;
  }

Allocate register id for ARG_PTR_TO_MEM parameters only when
PTR_MAYBE_NULL is set. Register id for PTR_TO_MEM is used only to
propagate non-null status after conditionals.

Suggested-by: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20250704230354.1323244-8-eddyz87@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/btf.h   |  1 +
 kernel/bpf/btf.c      | 15 ++++++++++++++-
 kernel/bpf/verifier.c |  7 ++++---
 3 files changed, 19 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/btf.h b/include/linux/btf.h
index a40beb9cf160..9eda6b113f9b 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -223,6 +223,7 @@ u32 btf_nr_types(const struct btf *btf);
 struct btf *btf_base_btf(const struct btf *btf);
 bool btf_type_is_i32(const struct btf_type *t);
 bool btf_type_is_i64(const struct btf_type *t);
+bool btf_type_is_primitive(const struct btf_type *t);
 bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
 			   const struct btf_member *m,
 			   u32 expected_offset, u32 expected_size);
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index e0414d9f5e29..2dd13eea7b0e 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -891,6 +891,12 @@ bool btf_type_is_i64(const struct btf_type *t)
 	return btf_type_is_int(t) && __btf_type_int_is_regular(t, 8);
 }
 
+bool btf_type_is_primitive(const struct btf_type *t)
+{
+	return (btf_type_is_int(t) && btf_type_int_is_regular(t)) ||
+	       btf_is_any_enum(t);
+}
+
 /*
  * Check that given struct member is a regular int with expected
  * offset and size.
@@ -7830,6 +7836,13 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
 				return -EINVAL;
 			}
 
+			ref_t = btf_type_skip_modifiers(btf, t->type, NULL);
+			if (btf_type_is_void(ref_t) || btf_type_is_primitive(ref_t)) {
+				sub->args[i].arg_type = ARG_PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED;
+				sub->args[i].mem_size = 0;
+				continue;
+			}
+
 			kern_type_id = btf_get_ptr_to_btf_id(log, i, btf, t);
 			if (kern_type_id < 0)
 				return kern_type_id;
@@ -7838,7 +7851,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog)
 			ref_t = btf_type_by_id(vmlinux_btf, kern_type_id);
 			if (!btf_type_is_struct(ref_t)) {
 				tname = __btf_name_by_offset(vmlinux_btf, t->name_off);
-				bpf_log(log, "arg#%d has type %s '%s', but only struct types are allowed\n",
+				bpf_log(log, "arg#%d has type %s '%s', but only struct or primitive types are allowed\n",
 					i, btf_type_str(ref_t), tname);
 				return -EINVAL;
 			}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7af902c3ecc3..1e567fff6f23 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -23152,11 +23152,12 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
 				__mark_dynptr_reg(reg, BPF_DYNPTR_TYPE_LOCAL, true, ++env->id_gen);
 			} else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) {
 				reg->type = PTR_TO_MEM;
-				if (arg->arg_type & PTR_MAYBE_NULL)
-					reg->type |= PTR_MAYBE_NULL;
+				reg->type |= arg->arg_type &
+					     (PTR_MAYBE_NULL | PTR_UNTRUSTED | MEM_RDONLY);
 				mark_reg_known_zero(env, regs, i);
 				reg->mem_size = arg->mem_size;
-				reg->id = ++env->id_gen;
+				if (arg->arg_type & PTR_MAYBE_NULL)
+					reg->id = ++env->id_gen;
 			} else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) {
 				reg->type = PTR_TO_BTF_ID;
 				if (arg->arg_type & PTR_MAYBE_NULL)
-- 
cgit v1.2.3


From 4cdf1bdd45ac78a088773722f009883af30ad318 Mon Sep 17 00:00:00 2001
From: Pankaj Raghav <p.raghav@samsung.com>
Date: Fri, 4 Jul 2025 11:21:34 +0200
Subject: block: reject bs > ps block devices when THP is disabled

If THP is disabled and a block device with a logical block size larger
than the page size is present, the following null pointer dereference
panic happens during boot:

[   13.xxxxxx] KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007]
[   13.017749] RIP: 0010:create_empty_buffers+0x3b/0x380
<snip>
[   13.025448] Call Trace:
[   13.025692]  <TASK>
[   13.025895]  block_read_full_folio+0x610/0x780
[   13.026379]  ? __pfx_blkdev_get_block+0x10/0x10
[   13.027008]  ? __folio_batch_add_and_move+0x1fa/0x2b0
[   13.027548]  ? __pfx_blkdev_read_folio+0x10/0x10
[   13.028080]  filemap_read_folio+0x9b/0x200
[   13.028526]  ? __pfx_filemap_read_folio+0x10/0x10
[   13.029030]  ? __filemap_get_folio+0x43/0x620
[   13.029497]  do_read_cache_folio+0x155/0x3b0
[   13.029962]  ? __pfx_blkdev_read_folio+0x10/0x10
[   13.030381]  read_part_sector+0xb7/0x2a0
[   13.030805]  read_lba+0x174/0x2c0
<snip>
[   13.045348]  nvme_scan_ns+0x684/0x850 [nvme_core]
[   13.045858]  ? __pfx_nvme_scan_ns+0x10/0x10 [nvme_core]
[   13.046414]  ? _raw_spin_unlock+0x15/0x40
[   13.046843]  ? __switch_to+0x523/0x10a0
[   13.047253]  ? kvm_clock_get_cycles+0x14/0x30
[   13.047742]  ? __pfx_nvme_scan_ns_async+0x10/0x10 [nvme_core]
[   13.048353]  async_run_entry_fn+0x96/0x4f0
[   13.048787]  process_one_work+0x667/0x10a0
[   13.049219]  worker_thread+0x63c/0xf60

As large folio support depends on THP, only allow bs > ps block devices
if THP is enabled.

Fixes: 47dd67532303 ("block/bdev: lift block size restrictions to 64k")
Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
Link: https://lore.kernel.org/r/20250704092134.289491-1-p.raghav@samsung.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 332b56f323d9..369a8e63c865 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -269,11 +269,16 @@ static inline dev_t disk_devt(struct gendisk *disk)
 	return MKDEV(disk->major, disk->first_minor);
 }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 /*
  * We should strive for 1 << (PAGE_SHIFT + MAX_PAGECACHE_ORDER)
  * however we constrain this to what we can validate and test.
  */
 #define BLK_MAX_BLOCK_SIZE      SZ_64K
+#else
+#define BLK_MAX_BLOCK_SIZE      PAGE_SIZE
+#endif
+
 
 /* blk_validate_limits() validates bsize, so drivers don't usually need to */
 static inline int blk_validate_block_size(unsigned long bsize)
-- 
cgit v1.2.3


From d42c7c6fd66a6e2a78ae1da666c5df6c2fde8389 Mon Sep 17 00:00:00 2001
From: Claudiu Beznea <claudiu.beznea.uj@bp.renesas.com>
Date: Thu, 3 Jul 2025 14:27:06 +0300
Subject: PM: domains: Add flags to specify power on attach/detach

Calling dev_pm_domain_attach()/dev_pm_domain_detach() in bus driver
probe/remove functions can affect system behavior when the drivers
attached to the bus use devres-managed resources. Since devres actions
may need to access device registers, calling dev_pm_domain_detach() too
early, i.e., before these actions complete, can cause failures on some
systems. One such example is Renesas RZ/G3S SoC-based platforms.

If the device clocks are managed via PM domains, invoking
dev_pm_domain_detach() in the bus driver's remove function removes the
device's clocks from the PM domain, preventing any subsequent
pm_runtime_resume*() calls from enabling those clocks.

The second argument of dev_pm_domain_attach() specifies whether the PM
domain should be powered on during attachment. Likewise, the second
argument of dev_pm_domain_detach() indicates whether the domain should be
powered off during detachment.

Upcoming changes address the issue described above (initially for the
platform bus only) by deferring the call to dev_pm_domain_detach() until
after devres_release_all() in device_unbind_cleanup(). The detach_power_off
field in struct dev_pm_info stores the detach power off info from the
second argument of dev_pm_domain_attach().

Because there are cases where the device's PM domain power-on/off behavior
must be conditional (e.g., in i2c_device_probe()), the patch introduces
PD_FLAG_ATTACH_POWER_ON and PD_FLAG_DETACH_POWER_OFF flags to be passed
to dev_pm_domain_attach().

Finally, dev_pm_domain_attach() and its users are updated to use the newly
introduced PD_FLAG_ATTACH_POWER_ON and PD_FLAG_DETACH_POWER_OFF macros.

This change is preparatory.

Signed-off-by: Claudiu Beznea <claudiu.beznea.uj@bp.renesas.com>
Reviewed-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Acked-by: Wolfram Sang <wsa+renesas@sang-engineering.com> # I2C
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://patch.msgid.link/20250703112708.1621607-2-claudiu.beznea.uj@bp.renesas.com
[ rjw: Changelog adjustments ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/amba/bus.c                       |  4 ++--
 drivers/base/auxiliary.c                 |  2 +-
 drivers/base/platform.c                  |  2 +-
 drivers/base/power/common.c              |  6 +++---
 drivers/clk/qcom/apcs-sdx55.c            |  2 +-
 drivers/gpu/drm/display/drm_dp_aux_bus.c |  2 +-
 drivers/i2c/i2c-core-base.c              |  2 +-
 drivers/mmc/core/sdio_bus.c              |  2 +-
 drivers/rpmsg/rpmsg_core.c               |  2 +-
 drivers/soundwire/bus_type.c             |  2 +-
 drivers/spi/spi.c                        |  2 +-
 drivers/tty/serdev/core.c                |  2 +-
 include/linux/pm_domain.h                | 10 ++++++++--
 13 files changed, 23 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/drivers/amba/bus.c b/drivers/amba/bus.c
index 71482d639a6d..74e34a07ef72 100644
--- a/drivers/amba/bus.c
+++ b/drivers/amba/bus.c
@@ -138,7 +138,7 @@ static int amba_read_periphid(struct amba_device *dev)
 	void __iomem *tmp;
 	int i, ret;
 
-	ret = dev_pm_domain_attach(&dev->dev, true);
+	ret = dev_pm_domain_attach(&dev->dev, PD_FLAG_ATTACH_POWER_ON);
 	if (ret) {
 		dev_dbg(&dev->dev, "can't get PM domain: %d\n", ret);
 		goto err_out;
@@ -291,7 +291,7 @@ static int amba_probe(struct device *dev)
 		if (ret < 0)
 			break;
 
-		ret = dev_pm_domain_attach(dev, true);
+		ret = dev_pm_domain_attach(dev, PD_FLAG_ATTACH_POWER_ON);
 		if (ret)
 			break;
 
diff --git a/drivers/base/auxiliary.c b/drivers/base/auxiliary.c
index dba7c8e13a53..44cd3f85b659 100644
--- a/drivers/base/auxiliary.c
+++ b/drivers/base/auxiliary.c
@@ -217,7 +217,7 @@ static int auxiliary_bus_probe(struct device *dev)
 	struct auxiliary_device *auxdev = to_auxiliary_dev(dev);
 	int ret;
 
-	ret = dev_pm_domain_attach(dev, true);
+	ret = dev_pm_domain_attach(dev, PD_FLAG_ATTACH_POWER_ON);
 	if (ret) {
 		dev_warn(dev, "Failed to attach to PM Domain : %d\n", ret);
 		return ret;
diff --git a/drivers/base/platform.c b/drivers/base/platform.c
index 075ec1d1b73a..df1ec34fdf56 100644
--- a/drivers/base/platform.c
+++ b/drivers/base/platform.c
@@ -1396,7 +1396,7 @@ static int platform_probe(struct device *_dev)
 	if (ret < 0)
 		return ret;
 
-	ret = dev_pm_domain_attach(_dev, true);
+	ret = dev_pm_domain_attach(_dev, PD_FLAG_ATTACH_POWER_ON);
 	if (ret)
 		goto out;
 
diff --git a/drivers/base/power/common.c b/drivers/base/power/common.c
index 781968a128ff..fecb85fa85ac 100644
--- a/drivers/base/power/common.c
+++ b/drivers/base/power/common.c
@@ -83,7 +83,7 @@ EXPORT_SYMBOL_GPL(dev_pm_put_subsys_data);
 /**
  * dev_pm_domain_attach - Attach a device to its PM domain.
  * @dev: Device to attach.
- * @power_on: Used to indicate whether we should power on the device.
+ * @flags: indicate whether we should power on/off the device on attach/detach
  *
  * The @dev may only be attached to a single PM domain. By iterating through
  * the available alternatives we try to find a valid PM domain for the device.
@@ -100,14 +100,14 @@ EXPORT_SYMBOL_GPL(dev_pm_put_subsys_data);
  * Returns 0 on successfully attached PM domain, or when it is found that the
  * device doesn't need a PM domain, else a negative error code.
  */
-int dev_pm_domain_attach(struct device *dev, bool power_on)
+int dev_pm_domain_attach(struct device *dev, u32 flags)
 {
 	int ret;
 
 	if (dev->pm_domain)
 		return 0;
 
-	ret = acpi_dev_pm_attach(dev, power_on);
+	ret = acpi_dev_pm_attach(dev, !!(flags & PD_FLAG_ATTACH_POWER_ON));
 	if (!ret)
 		ret = genpd_dev_pm_attach(dev);
 
diff --git a/drivers/clk/qcom/apcs-sdx55.c b/drivers/clk/qcom/apcs-sdx55.c
index 3ba01622d8f0..90dd1f1855c2 100644
--- a/drivers/clk/qcom/apcs-sdx55.c
+++ b/drivers/clk/qcom/apcs-sdx55.c
@@ -111,7 +111,7 @@ static int qcom_apcs_sdx55_clk_probe(struct platform_device *pdev)
 	 * driver, there seems to be no better place to do this. So do it here!
 	 */
 	cpu_dev = get_cpu_device(0);
-	ret = dev_pm_domain_attach(cpu_dev, true);
+	ret = dev_pm_domain_attach(cpu_dev, PD_FLAG_ATTACH_POWER_ON);
 	if (ret) {
 		dev_err_probe(dev, ret, "can't get PM domain: %d\n", ret);
 		goto err;
diff --git a/drivers/gpu/drm/display/drm_dp_aux_bus.c b/drivers/gpu/drm/display/drm_dp_aux_bus.c
index ec7eac6b595f..718c9122bc3a 100644
--- a/drivers/gpu/drm/display/drm_dp_aux_bus.c
+++ b/drivers/gpu/drm/display/drm_dp_aux_bus.c
@@ -57,7 +57,7 @@ static int dp_aux_ep_probe(struct device *dev)
 		container_of(aux_ep, struct dp_aux_ep_device_with_data, aux_ep);
 	int ret;
 
-	ret = dev_pm_domain_attach(dev, true);
+	ret = dev_pm_domain_attach(dev, PD_FLAG_ATTACH_POWER_ON);
 	if (ret)
 		return dev_err_probe(dev, ret, "Failed to attach to PM Domain\n");
 
diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c
index 2ad2b1838f0f..38eabf1173da 100644
--- a/drivers/i2c/i2c-core-base.c
+++ b/drivers/i2c/i2c-core-base.c
@@ -573,7 +573,7 @@ static int i2c_device_probe(struct device *dev)
 		goto err_clear_wakeup_irq;
 
 	do_power_on = !i2c_acpi_waive_d0_probe(dev);
-	status = dev_pm_domain_attach(&client->dev, do_power_on);
+	status = dev_pm_domain_attach(&client->dev, do_power_on ? PD_FLAG_ATTACH_POWER_ON : 0);
 	if (status)
 		goto err_clear_wakeup_irq;
 
diff --git a/drivers/mmc/core/sdio_bus.c b/drivers/mmc/core/sdio_bus.c
index b66b637e2d57..656601754966 100644
--- a/drivers/mmc/core/sdio_bus.c
+++ b/drivers/mmc/core/sdio_bus.c
@@ -161,7 +161,7 @@ static int sdio_bus_probe(struct device *dev)
 	if (!id)
 		return -ENODEV;
 
-	ret = dev_pm_domain_attach(dev, false);
+	ret = dev_pm_domain_attach(dev, 0);
 	if (ret)
 		return ret;
 
diff --git a/drivers/rpmsg/rpmsg_core.c b/drivers/rpmsg/rpmsg_core.c
index 6ee36adcbdba..bece5e635ee9 100644
--- a/drivers/rpmsg/rpmsg_core.c
+++ b/drivers/rpmsg/rpmsg_core.c
@@ -479,7 +479,7 @@ static int rpmsg_dev_probe(struct device *dev)
 	struct rpmsg_endpoint *ept = NULL;
 	int err;
 
-	err = dev_pm_domain_attach(dev, true);
+	err = dev_pm_domain_attach(dev, PD_FLAG_ATTACH_POWER_ON);
 	if (err)
 		goto out;
 
diff --git a/drivers/soundwire/bus_type.c b/drivers/soundwire/bus_type.c
index 75d6f16efced..bc1e653080d9 100644
--- a/drivers/soundwire/bus_type.c
+++ b/drivers/soundwire/bus_type.c
@@ -101,7 +101,7 @@ static int sdw_drv_probe(struct device *dev)
 	/*
 	 * attach to power domain but don't turn on (last arg)
 	 */
-	ret = dev_pm_domain_attach(dev, false);
+	ret = dev_pm_domain_attach(dev, 0);
 	if (ret)
 		return ret;
 
diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 1bc0fdbb1bd7..8200b47b2295 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -427,7 +427,7 @@ static int spi_probe(struct device *dev)
 	if (spi->irq < 0)
 		spi->irq = 0;
 
-	ret = dev_pm_domain_attach(dev, true);
+	ret = dev_pm_domain_attach(dev, PD_FLAG_ATTACH_POWER_ON);
 	if (ret)
 		return ret;
 
diff --git a/drivers/tty/serdev/core.c b/drivers/tty/serdev/core.c
index 0213381fa358..d16c207a1a9b 100644
--- a/drivers/tty/serdev/core.c
+++ b/drivers/tty/serdev/core.c
@@ -399,7 +399,7 @@ static int serdev_drv_probe(struct device *dev)
 	const struct serdev_device_driver *sdrv = to_serdev_device_driver(dev->driver);
 	int ret;
 
-	ret = dev_pm_domain_attach(dev, true);
+	ret = dev_pm_domain_attach(dev, PD_FLAG_ATTACH_POWER_ON);
 	if (ret)
 		return ret;
 
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index 0b18160901a2..62a35a78ce9b 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -36,10 +36,16 @@
  *				isn't specified, the index just follows the
  *				index for the attached PM domain.
  *
+ * PD_FLAG_ATTACH_POWER_ON:	Power on the domain during attach.
+ *
+ * PD_FLAG_DETACH_POWER_OFF:	Power off the domain during detach.
+ *
  */
 #define PD_FLAG_NO_DEV_LINK		BIT(0)
 #define PD_FLAG_DEV_LINK_ON		BIT(1)
 #define PD_FLAG_REQUIRED_OPP		BIT(2)
+#define PD_FLAG_ATTACH_POWER_ON		BIT(3)
+#define PD_FLAG_DETACH_POWER_OFF	BIT(4)
 
 struct dev_pm_domain_attach_data {
 	const char * const *pd_names;
@@ -501,7 +507,7 @@ struct generic_pm_domain *of_genpd_remove_last(struct device_node *np)
 #endif /* CONFIG_PM_GENERIC_DOMAINS_OF */
 
 #ifdef CONFIG_PM
-int dev_pm_domain_attach(struct device *dev, bool power_on);
+int dev_pm_domain_attach(struct device *dev, u32 flags);
 struct device *dev_pm_domain_attach_by_id(struct device *dev,
 					  unsigned int index);
 struct device *dev_pm_domain_attach_by_name(struct device *dev,
@@ -518,7 +524,7 @@ int dev_pm_domain_start(struct device *dev);
 void dev_pm_domain_set(struct device *dev, struct dev_pm_domain *pd);
 int dev_pm_domain_set_performance_state(struct device *dev, unsigned int state);
 #else
-static inline int dev_pm_domain_attach(struct device *dev, bool power_on)
+static inline int dev_pm_domain_attach(struct device *dev, u32 flags)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From f99508074e78fea17f06d753d9ef453b174ec98e Mon Sep 17 00:00:00 2001
From: Claudiu Beznea <claudiu.beznea.uj@bp.renesas.com>
Date: Thu, 3 Jul 2025 14:27:07 +0300
Subject: PM: domains: Detach on device_unbind_cleanup()

The dev_pm_domain_attach() function is typically used in bus code
alongside dev_pm_domain_detach(), often following patterns like:

static int bus_probe(struct device *_dev)
{
    struct bus_driver *drv = to_bus_driver(_dev->driver);
    struct bus_device *dev = to_bus_device(_dev);
    int ret;

    // ...

    ret = dev_pm_domain_attach(_dev, PD_FLAG_ATTACH_POWER_ON);
    if (ret)
        return ret;

    if (drv->probe)
        ret = drv->probe(dev);

    // ...
}

static void bus_remove(struct device *_dev)
{
    struct bus_driver *drv = to_bus_driver(_dev->driver);
    struct bus_device *dev = to_bus_device(_dev);

    if (drv->remove)
        drv->remove(dev);
    dev_pm_domain_detach(_dev);
}

When the driver's probe function uses devres-managed resources that
depend on the power domain state, those resources are released later
during device_unbind_cleanup().

Releasing devres-managed resources that depend on the power domain state
after detaching the device from its PM domain can cause failures.

For example, if the driver uses devm_pm_runtime_enable() in its probe
function, and the device's clocks are managed by the PM domain, then
during removal the runtime PM is disabled in device_unbind_cleanup()
after the clocks have been removed from the PM domain. It may happen
that the devm_pm_runtime_enable() action causes the device to be runtime-
resumed. If the driver specific runtime PM APIs access registers directly,
this will lead to accessing device registers without clocks being enabled.
Similar issues may occur with other devres actions that access device
registers.

Add detach_power_off member to struct dev_pm_info, to be used
later in device_unbind_cleanup() as the power_off argument for
dev_pm_domain_detach(). This is a preparatory step toward removing
dev_pm_domain_detach() calls from bus remove functions. Since the current
PM domain detach functions (genpd_dev_pm_detach() and acpi_dev_pm_detach())
already set dev->pm_domain = NULL, there should be no issues with bus
drivers that still call dev_pm_domain_detach() in their remove functions.

Signed-off-by: Claudiu Beznea <claudiu.beznea.uj@bp.renesas.com>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://patch.msgid.link/20250703112708.1621607-3-claudiu.beznea.uj@bp.renesas.com
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/dd.c           | 2 ++
 drivers/base/power/common.c | 3 +++
 include/linux/pm.h          | 1 +
 3 files changed, 6 insertions(+)

(limited to 'include')

diff --git a/drivers/base/dd.c b/drivers/base/dd.c
index b526e0e0f52d..13ab98e033ea 100644
--- a/drivers/base/dd.c
+++ b/drivers/base/dd.c
@@ -25,6 +25,7 @@
 #include <linux/kthread.h>
 #include <linux/wait.h>
 #include <linux/async.h>
+#include <linux/pm_domain.h>
 #include <linux/pm_runtime.h>
 #include <linux/pinctrl/devinfo.h>
 #include <linux/slab.h>
@@ -552,6 +553,7 @@ static void device_unbind_cleanup(struct device *dev)
 	dev->dma_range_map = NULL;
 	device_set_driver(dev, NULL);
 	dev_set_drvdata(dev, NULL);
+	dev_pm_domain_detach(dev, dev->power.detach_power_off);
 	if (dev->pm_domain && dev->pm_domain->dismiss)
 		dev->pm_domain->dismiss(dev);
 	pm_runtime_reinit(dev);
diff --git a/drivers/base/power/common.c b/drivers/base/power/common.c
index fecb85fa85ac..6ecf9ce4a4e6 100644
--- a/drivers/base/power/common.c
+++ b/drivers/base/power/common.c
@@ -111,6 +111,9 @@ int dev_pm_domain_attach(struct device *dev, u32 flags)
 	if (!ret)
 		ret = genpd_dev_pm_attach(dev);
 
+	if (dev->pm_domain)
+		dev->power.detach_power_off = !!(flags & PD_FLAG_DETACH_POWER_OFF);
+
 	return ret < 0 ? ret : 0;
 }
 EXPORT_SYMBOL_GPL(dev_pm_domain_attach);
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 938b1b446a5d..14e8370887e3 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -721,6 +721,7 @@ struct dev_pm_info {
 	struct pm_subsys_data	*subsys_data;  /* Owned by the subsystem. */
 	void (*set_latency_tolerance)(struct device *, s32);
 	struct dev_pm_qos	*qos;
+	bool			detach_power_off:1;	/* Owned by the driver core */
 };
 
 extern int dev_pm_get_subsys_data(struct device *dev);
-- 
cgit v1.2.3


From 59f44c9ccc3bb68aa3b062b8e57ce0e1ee2fca75 Mon Sep 17 00:00:00 2001
From: Ilya Maximets <i.maximets@ovn.org>
Date: Wed, 2 Jul 2025 17:50:34 +0200
Subject: net: openvswitch: allow providing upcall pid for the 'execute'
 command

When a packet enters OVS datapath and there is no flow to handle it,
packet goes to userspace through a MISS upcall.  With per-CPU upcall
dispatch mechanism, we're using the current CPU id to select the
Netlink PID on which to send this packet.  This allows us to send
packets from the same traffic flow through the same handler.

The handler will process the packet, install required flow into the
kernel and re-inject the original packet via OVS_PACKET_CMD_EXECUTE.

While handling OVS_PACKET_CMD_EXECUTE, however, we may hit a
recirculation action that will pass the (likely modified) packet
through the flow lookup again.  And if the flow is not found, the
packet will be sent to userspace again through another MISS upcall.

However, the handler thread in userspace is likely running on a
different CPU core, and the OVS_PACKET_CMD_EXECUTE request is handled
in the syscall context of that thread.  So, when the time comes to
send the packet through another upcall, the per-CPU dispatch will
choose a different Netlink PID, and this packet will end up processed
by a different handler thread on a different CPU.

The process continues as long as there are new recirculations, each
time the packet goes to a different handler thread before it is sent
out of the OVS datapath to the destination port.  In real setups the
number of recirculations can go up to 4 or 5, sometimes more.

There is always a chance to re-order packets while processing upcalls,
because userspace will first install the flow and then re-inject the
original packet.  So, there is a race window when the flow is already
installed and the second packet can match it and be forwarded to the
destination before the first packet is re-injected.  But the fact that
packets are going through multiple upcalls handled by different
userspace threads makes the reordering noticeably more likely, because
we not only have a race between the kernel and a userspace handler
(which is hard to avoid), but also between multiple userspace handlers.

For example, let's assume that 10 packets got enqueued through a MISS
upcall for handler-1, it will start processing them, will install the
flow into the kernel and start re-injecting packets back, from where
they will go through another MISS to handler-2.  Handler-2 will install
the flow into the kernel and start re-injecting the packets, while
handler-1 continues to re-inject the last of the 10 packets, they will
hit the flow installed by handler-2 and be forwarded without going to
the handler-2, while handler-2 still re-injects the first of these 10
packets.  Given multiple recirculations and misses, these 10 packets
may end up completely mixed up on the output from the datapath.

Let's allow userspace to specify on which Netlink PID the packets
should be upcalled while processing OVS_PACKET_CMD_EXECUTE.
This makes it possible to ensure that all the packets are processed
by the same handler thread in the userspace even with them being
upcalled multiple times in the process.  Packets will remain in order
since they will be enqueued to the same socket and re-injected in the
same order.  This doesn't eliminate re-ordering as stated above, since
we still have a race between kernel and the userspace thread, but it
allows to eliminate races between multiple userspace threads.

Userspace knows the PID of the socket on which the original upcall is
received, so there is no need to send it up from the kernel.

Solution requires storing the value somewhere for the duration of the
packet processing.  There are two potential places for this: our skb
extension or the per-CPU storage.  It's not clear which is better,
so just follow the currently used scheme of storing this kind of thing
along with the skb.  We still have a decent amount of space in the cb.

Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Acked-by: Eelco Chaudron <echaudro@redhat.com>
Acked-by: Aaron Conole <aconole@redhat.com>
Link: https://patch.msgid.link/20250702155043.2331772-1-i.maximets@ovn.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/openvswitch.h | 6 ++++++
 net/openvswitch/actions.c        | 6 ++++--
 net/openvswitch/datapath.c       | 8 +++++++-
 net/openvswitch/datapath.h       | 3 +++
 net/openvswitch/vport.c          | 1 +
 5 files changed, 21 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 3a701bd1f31b..3092c2c6f1d2 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -186,6 +186,11 @@ enum ovs_packet_cmd {
  * %OVS_PACKET_ATTR_USERSPACE action specify the Maximum received fragment
  * size.
  * @OVS_PACKET_ATTR_HASH: Packet hash info (e.g. hash, sw_hash and l4_hash in skb).
+ * @OVS_PACKET_ATTR_UPCALL_PID: Netlink PID to use for upcalls while
+ * processing %OVS_PACKET_CMD_EXECUTE.  Takes precedence over all other ways
+ * to determine the Netlink PID including %OVS_USERSPACE_ATTR_PID,
+ * %OVS_DP_ATTR_UPCALL_PID, %OVS_DP_ATTR_PER_CPU_PIDS and the
+ * %OVS_VPORT_ATTR_UPCALL_PID.
  *
  * These attributes follow the &struct ovs_header within the Generic Netlink
  * payload for %OVS_PACKET_* commands.
@@ -205,6 +210,7 @@ enum ovs_packet_attr {
 	OVS_PACKET_ATTR_MRU,	    /* Maximum received IP fragment size. */
 	OVS_PACKET_ATTR_LEN,	    /* Packet size before truncation. */
 	OVS_PACKET_ATTR_HASH,	    /* Packet hash. */
+	OVS_PACKET_ATTR_UPCALL_PID, /* u32 Netlink PID. */
 	__OVS_PACKET_ATTR_MAX
 };
 
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 3add108340bf..2832e0794197 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -941,8 +941,10 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb,
 			break;
 
 		case OVS_USERSPACE_ATTR_PID:
-			if (dp->user_features &
-			    OVS_DP_F_DISPATCH_UPCALL_PER_CPU)
+			if (OVS_CB(skb)->upcall_pid)
+				upcall.portid = OVS_CB(skb)->upcall_pid;
+			else if (dp->user_features &
+				 OVS_DP_F_DISPATCH_UPCALL_PER_CPU)
 				upcall.portid =
 				  ovs_dp_get_upcall_portid(dp,
 							   smp_processor_id());
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index b990dc83504f..d5b6e2002bc1 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -267,7 +267,9 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
 		memset(&upcall, 0, sizeof(upcall));
 		upcall.cmd = OVS_PACKET_CMD_MISS;
 
-		if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU)
+		if (OVS_CB(skb)->upcall_pid)
+			upcall.portid = OVS_CB(skb)->upcall_pid;
+		else if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU)
 			upcall.portid =
 			    ovs_dp_get_upcall_portid(dp, smp_processor_id());
 		else
@@ -651,6 +653,9 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
 			       !!(hash & OVS_PACKET_HASH_L4_BIT));
 	}
 
+	OVS_CB(packet)->upcall_pid =
+		nla_get_u32_default(a[OVS_PACKET_ATTR_UPCALL_PID], 0);
+
 	/* Build an sw_flow for sending this packet. */
 	flow = ovs_flow_alloc();
 	err = PTR_ERR(flow);
@@ -719,6 +724,7 @@ static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = {
 	[OVS_PACKET_ATTR_PROBE] = { .type = NLA_FLAG },
 	[OVS_PACKET_ATTR_MRU] = { .type = NLA_U16 },
 	[OVS_PACKET_ATTR_HASH] = { .type = NLA_U64 },
+	[OVS_PACKET_ATTR_UPCALL_PID] = { .type = NLA_U32 },
 };
 
 static const struct genl_small_ops dp_packet_genl_ops[] = {
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index cfeb817a1889..db0c3e69d66c 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -121,6 +121,8 @@ struct datapath {
  * @cutlen: The number of bytes from the packet end to be removed.
  * @probability: The sampling probability that was applied to this skb; 0 means
  * no sampling has occurred; U32_MAX means 100% probability.
+ * @upcall_pid: Netlink socket PID to use for sending this packet to userspace;
+ * 0 means "not set" and default per-CPU or per-vport dispatch should be used.
  */
 struct ovs_skb_cb {
 	struct vport		*input_vport;
@@ -128,6 +130,7 @@ struct ovs_skb_cb {
 	u16			acts_origlen;
 	u32			cutlen;
 	u32			probability;
+	u32			upcall_pid;
 };
 #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb)
 
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 8732f6e51ae5..6bbbc16ab778 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -501,6 +501,7 @@ int ovs_vport_receive(struct vport *vport, struct sk_buff *skb,
 	OVS_CB(skb)->mru = 0;
 	OVS_CB(skb)->cutlen = 0;
 	OVS_CB(skb)->probability = 0;
+	OVS_CB(skb)->upcall_pid = 0;
 	if (unlikely(dev_net(skb->dev) != ovs_dp_get_net(vport->dp))) {
 		u32 mark;
 
-- 
cgit v1.2.3


From f55ce5a6cd33211c8cc5bce0554b6ac710a6a28b Mon Sep 17 00:00:00 2001
From: Ankit Agrawal <ankita@nvidia.com>
Date: Sat, 5 Jul 2025 07:17:17 +0000
Subject: KVM: arm64: Expose new KVM cap for cacheable PFNMAP

Introduce a new KVM capability to expose to the userspace whether
cacheable mapping of PFNMAP is supported.

The ability to safely do the cacheable mapping of PFNMAP is contingent
on S2FWB and ARM64_HAS_CACHE_DIC. S2FWB allows KVM to avoid flushing
the D cache, ARM64_HAS_CACHE_DIC allows KVM to avoid flushing the icache
and turns icache_inval_pou() into a NOP. The cap would be false if
those requirements are missing and is checked by making use of
kvm_supports_cacheable_pfnmap().

This capability would allow userspace to discover the support.
It could for instance be used by userspace to prevent live-migration
across FWB and non-FWB hosts.

CC: Catalin Marinas <catalin.marinas@arm.com>
CC: Jason Gunthorpe <jgg@nvidia.com>
CC: Oliver Upton <oliver.upton@linux.dev>
CC: David Hildenbrand <david@redhat.com>
Suggested-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Tested-by: Donald Dutile <ddutile@redhat.com>
Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Link: https://lore.kernel.org/r/20250705071717.5062-7-ankita@nvidia.com
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
---
 Documentation/virt/kvm/api.rst | 13 ++++++++++++-
 arch/arm64/kvm/arm.c           |  7 +++++++
 include/uapi/linux/kvm.h       |  1 +
 3 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 9abf93ee5f65..53e0179d5294 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -8585,7 +8585,7 @@ ENOSYS for the others.
 When enabled, KVM will exit to userspace with KVM_EXIT_SYSTEM_EVENT of
 type KVM_SYSTEM_EVENT_SUSPEND to process the guest suspend request.
 
-7.37 KVM_CAP_ARM_WRITABLE_IMP_ID_REGS
+7.42 KVM_CAP_ARM_WRITABLE_IMP_ID_REGS
 -------------------------------------
 
 :Architectures: arm64
@@ -8614,6 +8614,17 @@ given VM.
 When this capability is enabled, KVM resets the VCPU when setting
 MP_STATE_INIT_RECEIVED through IOCTL.  The original MP_STATE is preserved.
 
+7.43 KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED
+-------------------------------------------
+
+:Architectures: arm64
+:Target: VM
+:Parameters: None
+
+This capability indicate to the userspace whether a PFNMAP memory region
+can be safely mapped as cacheable. This relies on the presence of
+force write back (FWB) feature support on the hardware.
+
 8. Other capabilities.
 ======================
 
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 38a91bb5d4c7..25609aab032c 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -408,6 +408,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES:
 		r = BIT(0);
 		break;
+	case KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED:
+		if (!kvm)
+			r = -EINVAL;
+		else
+			r = kvm_supports_cacheable_pfnmap();
+		break;
+
 	default:
 		r = 0;
 	}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 37891580d05d..e4e566ff348b 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -956,6 +956,7 @@ struct kvm_enable_cap {
 #define KVM_CAP_ARM_EL2 240
 #define KVM_CAP_ARM_EL2_E2H0 241
 #define KVM_CAP_RISCV_MP_STATE_RESET 242
+#define KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED 243
 
 struct kvm_irq_routing_irqchip {
 	__u32 irqchip;
-- 
cgit v1.2.3


From 1e3b66e326015f77bc4b36976bebeedc2ac0f588 Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Thu, 3 Jul 2025 13:23:29 +0200
Subject: vsock: fix `vsock_proto` declaration

From commit 634f1a7110b4 ("vsock: support sockmap"), `struct proto
vsock_proto`, defined in af_vsock.c, is not static anymore, since it's
used by vsock_bpf.c.

If CONFIG_BPF_SYSCALL is not defined, `make C=2` will print a warning:
    $ make O=build C=2 W=1 net/vmw_vsock/
      ...
      CC [M]  net/vmw_vsock/af_vsock.o
      CHECK   ../net/vmw_vsock/af_vsock.c
    ../net/vmw_vsock/af_vsock.c:123:14: warning: symbol 'vsock_proto' was not declared. Should it be static?

Declare `vsock_proto` regardless of CONFIG_BPF_SYSCALL, since it's defined
in af_vsock.c, which is built regardless of CONFIG_BPF_SYSCALL.

Fixes: 634f1a7110b4 ("vsock: support sockmap")
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Link: https://patch.msgid.link/20250703112329.28365-1-sgarzare@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/af_vsock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index d56e6e135158..d40e978126e3 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -243,8 +243,8 @@ int __vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
 int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
 			size_t len, int flags);
 
-#ifdef CONFIG_BPF_SYSCALL
 extern struct proto vsock_proto;
+#ifdef CONFIG_BPF_SYSCALL
 int vsock_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
 void __init vsock_bpf_build_proto(void);
 #else
-- 
cgit v1.2.3


From 4369d40da2f28ae1d3caadd4eb5d7b7f49a3776f Mon Sep 17 00:00:00 2001
From: Byungchul Park <byungchul@sk.com>
Date: Wed, 2 Jul 2025 14:32:55 +0900
Subject: netmem: use _Generic to cover const casting for page_to_netmem()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The current page_to_netmem() doesn't cover const casting, so trying to
cast a const struct page * to a const netmem_ref fails.

To cover the case, change page_to_netmem() to use macro and _Generic.

Signed-off-by: Byungchul Park <byungchul@sk.com>
Reviewed-by: Mina Almasry <almasrymina@google.com>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://patch.msgid.link/20250702053256.4594-5-byungchul@sk.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/netmem.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/netmem.h b/include/net/netmem.h
index 7a1dafa3f080..de1d95f04076 100644
--- a/include/net/netmem.h
+++ b/include/net/netmem.h
@@ -139,10 +139,9 @@ static inline netmem_ref net_iov_to_netmem(struct net_iov *niov)
 	return (__force netmem_ref)((unsigned long)niov | NET_IOV);
 }
 
-static inline netmem_ref page_to_netmem(const struct page *page)
-{
-	return (__force netmem_ref)page;
-}
+#define page_to_netmem(p)	(_Generic((p),			\
+	const struct page * :	(__force const netmem_ref)(p),	\
+	struct page * :		(__force netmem_ref)(p)))
 
 /**
  * virt_to_netmem - convert virtual memory pointer to a netmem reference
-- 
cgit v1.2.3


From d8bf56a0ca10af7936de8bbdd510c33041dacecc Mon Sep 17 00:00:00 2001
From: Byungchul Park <byungchul@sk.com>
Date: Wed, 2 Jul 2025 14:32:56 +0900
Subject: page_pool: make page_pool_get_dma_addr() just wrap
 page_pool_get_dma_addr_netmem()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The page pool members in struct page cannot be removed as long as any of
them can still be accessed via struct page.

Do not access 'page->dma_addr' directly in page_pool_get_dma_addr() but
just wrap page_pool_get_dma_addr_netmem() safely.

Signed-off-by: Byungchul Park <byungchul@sk.com>
Reviewed-by: Mina Almasry <almasrymina@google.com>
Reviewed-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Acked-by: Jesper Dangaard Brouer <hawk@kernel.org>
Link: https://patch.msgid.link/20250702053256.4594-6-byungchul@sk.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/page_pool/helpers.h | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h
index 773fc65780b5..db180626be06 100644
--- a/include/net/page_pool/helpers.h
+++ b/include/net/page_pool/helpers.h
@@ -444,12 +444,7 @@ static inline dma_addr_t page_pool_get_dma_addr_netmem(netmem_ref netmem)
  */
 static inline dma_addr_t page_pool_get_dma_addr(const struct page *page)
 {
-	dma_addr_t ret = page->dma_addr;
-
-	if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA)
-		ret <<= PAGE_SHIFT;
-
-	return ret;
+	return page_pool_get_dma_addr_netmem(page_to_netmem(page));
 }
 
 static inline void __page_pool_dma_sync_for_cpu(const struct page_pool *pool,
-- 
cgit v1.2.3


From a683a5b2ba23598ad343e5ec10a4ef4077497fc9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 2 Jul 2025 06:34:37 +0100
Subject: fold fs_struct->{lock,seq} into a seqlock

	The combination of spinlock_t lock and seqcount_spinlock_t seq
in struct fs_struct is an open-coded seqlock_t (see linux/seqlock_types.h).
	Combine and switch to equivalent seqlock_t primitives.  AFAICS,
that does end up with the same sequence of underlying operations in all
cases.
	While we are at it, get_fs_pwd() is open-coded verbatim in
get_path_anchor(); rather than applying conversion to it, replace with
the call of get_fs_pwd() there.  Not worth splitting the commit for that,
IMO...

	A bit of historical background - conversion of seqlock_t to
use of seqcount_spinlock_t happened several months after the same
had been done to struct fs_struct; switching fs_struct to seqlock_t
could've been done immediately after that, but it looks like nobody
had gotten around to that until now.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Link: https://lore.kernel.org/20250702053437.GC1880847@ZenIV
Acked-by: Ahmed S. Darwish <darwi@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/d_path.c               |  8 ++++----
 fs/exec.c                 |  4 ++--
 fs/fhandle.c              |  6 +-----
 fs/fs_struct.c            | 36 ++++++++++++++----------------------
 fs/namei.c                |  8 ++++----
 include/linux/fs_struct.h | 11 +++++------
 kernel/fork.c             | 10 +++++-----
 7 files changed, 35 insertions(+), 48 deletions(-)

(limited to 'include')

diff --git a/fs/d_path.c b/fs/d_path.c
index 5f4da5c8d5db..bb365511066b 100644
--- a/fs/d_path.c
+++ b/fs/d_path.c
@@ -241,9 +241,9 @@ static void get_fs_root_rcu(struct fs_struct *fs, struct path *root)
 	unsigned seq;
 
 	do {
-		seq = read_seqcount_begin(&fs->seq);
+		seq = read_seqbegin(&fs->seq);
 		*root = fs->root;
-	} while (read_seqcount_retry(&fs->seq, seq));
+	} while (read_seqretry(&fs->seq, seq));
 }
 
 /**
@@ -385,10 +385,10 @@ static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root,
 	unsigned seq;
 
 	do {
-		seq = read_seqcount_begin(&fs->seq);
+		seq = read_seqbegin(&fs->seq);
 		*root = fs->root;
 		*pwd = fs->pwd;
-	} while (read_seqcount_retry(&fs->seq, seq));
+	} while (read_seqretry(&fs->seq, seq));
 }
 
 /*
diff --git a/fs/exec.c b/fs/exec.c
index 1f5fdd2e096e..871078ddb220 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1510,7 +1510,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
 	 * state is protected by cred_guard_mutex we hold.
 	 */
 	n_fs = 1;
-	spin_lock(&p->fs->lock);
+	read_seqlock_excl(&p->fs->seq);
 	rcu_read_lock();
 	for_other_threads(p, t) {
 		if (t->fs == p->fs)
@@ -1523,7 +1523,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
 		bprm->unsafe |= LSM_UNSAFE_SHARE;
 	else
 		p->fs->in_exec = 1;
-	spin_unlock(&p->fs->lock);
+	read_sequnlock_excl(&p->fs->seq);
 }
 
 static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file)
diff --git a/fs/fhandle.c b/fs/fhandle.c
index b1363ead6c5e..7c236f64cdea 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -180,11 +180,7 @@ static int get_path_anchor(int fd, struct path *root)
 	}
 
 	if (fd == AT_FDCWD) {
-		struct fs_struct *fs = current->fs;
-		spin_lock(&fs->lock);
-		*root = fs->pwd;
-		path_get(root);
-		spin_unlock(&fs->lock);
+		get_fs_pwd(current->fs, root);
 		return 0;
 	}
 
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 64c2d0814ed6..28be762ac1c6 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -17,12 +17,10 @@ void set_fs_root(struct fs_struct *fs, const struct path *path)
 	struct path old_root;
 
 	path_get(path);
-	spin_lock(&fs->lock);
-	write_seqcount_begin(&fs->seq);
+	write_seqlock(&fs->seq);
 	old_root = fs->root;
 	fs->root = *path;
-	write_seqcount_end(&fs->seq);
-	spin_unlock(&fs->lock);
+	write_sequnlock(&fs->seq);
 	if (old_root.dentry)
 		path_put(&old_root);
 }
@@ -36,12 +34,10 @@ void set_fs_pwd(struct fs_struct *fs, const struct path *path)
 	struct path old_pwd;
 
 	path_get(path);
-	spin_lock(&fs->lock);
-	write_seqcount_begin(&fs->seq);
+	write_seqlock(&fs->seq);
 	old_pwd = fs->pwd;
 	fs->pwd = *path;
-	write_seqcount_end(&fs->seq);
-	spin_unlock(&fs->lock);
+	write_sequnlock(&fs->seq);
 
 	if (old_pwd.dentry)
 		path_put(&old_pwd);
@@ -67,16 +63,14 @@ void chroot_fs_refs(const struct path *old_root, const struct path *new_root)
 		fs = p->fs;
 		if (fs) {
 			int hits = 0;
-			spin_lock(&fs->lock);
-			write_seqcount_begin(&fs->seq);
+			write_seqlock(&fs->seq);
 			hits += replace_path(&fs->root, old_root, new_root);
 			hits += replace_path(&fs->pwd, old_root, new_root);
-			write_seqcount_end(&fs->seq);
 			while (hits--) {
 				count++;
 				path_get(new_root);
 			}
-			spin_unlock(&fs->lock);
+			write_sequnlock(&fs->seq);
 		}
 		task_unlock(p);
 	}
@@ -99,10 +93,10 @@ void exit_fs(struct task_struct *tsk)
 	if (fs) {
 		int kill;
 		task_lock(tsk);
-		spin_lock(&fs->lock);
+		read_seqlock_excl(&fs->seq);
 		tsk->fs = NULL;
 		kill = !--fs->users;
-		spin_unlock(&fs->lock);
+		read_sequnlock_excl(&fs->seq);
 		task_unlock(tsk);
 		if (kill)
 			free_fs_struct(fs);
@@ -116,16 +110,15 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
 	if (fs) {
 		fs->users = 1;
 		fs->in_exec = 0;
-		spin_lock_init(&fs->lock);
-		seqcount_spinlock_init(&fs->seq, &fs->lock);
+		seqlock_init(&fs->seq);
 		fs->umask = old->umask;
 
-		spin_lock(&old->lock);
+		read_seqlock_excl(&old->seq);
 		fs->root = old->root;
 		path_get(&fs->root);
 		fs->pwd = old->pwd;
 		path_get(&fs->pwd);
-		spin_unlock(&old->lock);
+		read_sequnlock_excl(&old->seq);
 	}
 	return fs;
 }
@@ -140,10 +133,10 @@ int unshare_fs_struct(void)
 		return -ENOMEM;
 
 	task_lock(current);
-	spin_lock(&fs->lock);
+	read_seqlock_excl(&fs->seq);
 	kill = !--fs->users;
 	current->fs = new_fs;
-	spin_unlock(&fs->lock);
+	read_sequnlock_excl(&fs->seq);
 	task_unlock(current);
 
 	if (kill)
@@ -162,7 +155,6 @@ EXPORT_SYMBOL(current_umask);
 /* to be mentioned only in INIT_TASK */
 struct fs_struct init_fs = {
 	.users		= 1,
-	.lock		= __SPIN_LOCK_UNLOCKED(init_fs.lock),
-	.seq		= SEQCNT_SPINLOCK_ZERO(init_fs.seq, &init_fs.lock),
+	.seq		= __SEQLOCK_UNLOCKED(init_fs.seq),
 	.umask		= 0022,
 };
diff --git a/fs/namei.c b/fs/namei.c
index 4bb889fc980b..f2fcaf84e111 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1012,10 +1012,10 @@ static int set_root(struct nameidata *nd)
 		unsigned seq;
 
 		do {
-			seq = read_seqcount_begin(&fs->seq);
+			seq = read_seqbegin(&fs->seq);
 			nd->root = fs->root;
 			nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
-		} while (read_seqcount_retry(&fs->seq, seq));
+		} while (read_seqretry(&fs->seq, seq));
 	} else {
 		get_fs_root(fs, &nd->root);
 		nd->state |= ND_ROOT_GRABBED;
@@ -2580,11 +2580,11 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 			unsigned seq;
 
 			do {
-				seq = read_seqcount_begin(&fs->seq);
+				seq = read_seqbegin(&fs->seq);
 				nd->path = fs->pwd;
 				nd->inode = nd->path.dentry->d_inode;
 				nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
-			} while (read_seqcount_retry(&fs->seq, seq));
+			} while (read_seqretry(&fs->seq, seq));
 		} else {
 			get_fs_pwd(current->fs, &nd->path);
 			nd->inode = nd->path.dentry->d_inode;
diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h
index 783b48dedb72..baf200ab5c77 100644
--- a/include/linux/fs_struct.h
+++ b/include/linux/fs_struct.h
@@ -8,8 +8,7 @@
 
 struct fs_struct {
 	int users;
-	spinlock_t lock;
-	seqcount_spinlock_t seq;
+	seqlock_t seq;
 	int umask;
 	int in_exec;
 	struct path root, pwd;
@@ -26,18 +25,18 @@ extern int unshare_fs_struct(void);
 
 static inline void get_fs_root(struct fs_struct *fs, struct path *root)
 {
-	spin_lock(&fs->lock);
+	read_seqlock_excl(&fs->seq);
 	*root = fs->root;
 	path_get(root);
-	spin_unlock(&fs->lock);
+	read_sequnlock_excl(&fs->seq);
 }
 
 static inline void get_fs_pwd(struct fs_struct *fs, struct path *pwd)
 {
-	spin_lock(&fs->lock);
+	read_seqlock_excl(&fs->seq);
 	*pwd = fs->pwd;
 	path_get(pwd);
-	spin_unlock(&fs->lock);
+	read_sequnlock_excl(&fs->seq);
 }
 
 extern bool current_chrooted(void);
diff --git a/kernel/fork.c b/kernel/fork.c
index 1ee8eb11f38b..6318a25a16ba 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1542,14 +1542,14 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
 	struct fs_struct *fs = current->fs;
 	if (clone_flags & CLONE_FS) {
 		/* tsk->fs is already what we want */
-		spin_lock(&fs->lock);
+		read_seqlock_excl(&fs->seq);
 		/* "users" and "in_exec" locked for check_unsafe_exec() */
 		if (fs->in_exec) {
-			spin_unlock(&fs->lock);
+			read_sequnlock_excl(&fs->seq);
 			return -EAGAIN;
 		}
 		fs->users++;
-		spin_unlock(&fs->lock);
+		read_sequnlock_excl(&fs->seq);
 		return 0;
 	}
 	tsk->fs = copy_fs_struct(fs);
@@ -3149,13 +3149,13 @@ int ksys_unshare(unsigned long unshare_flags)
 
 		if (new_fs) {
 			fs = current->fs;
-			spin_lock(&fs->lock);
+			read_seqlock_excl(&fs->seq);
 			current->fs = new_fs;
 			if (--fs->users)
 				new_fs = NULL;
 			else
 				new_fs = fs;
-			spin_unlock(&fs->lock);
+			read_sequnlock_excl(&fs->seq);
 		}
 
 		if (new_fd)
-- 
cgit v1.2.3


From ce7a381697cb3958ffe0b45e5028ac69444e9288 Mon Sep 17 00:00:00 2001
From: Tonghao Zhang <tonghao@bamaicloud.com>
Date: Fri, 27 Jun 2025 21:49:28 +0800
Subject: net: bonding: add broadcast_neighbor option for 802.3ad

Stacking technology is a type of technology used to expand ports on
Ethernet switches. It is widely used as a common access method in
large-scale Internet data center architectures. Years of practice
have proved that stacking technology has advantages and disadvantages
in high-reliability network architecture scenarios. For instance,
in a stacked network architecture, conventional switch system upgrades
require multiple stacked devices to restart at the same time.
Therefore, it is inevitable that the business will be interrupted
for a while. It is for this reason that "no-stacking" in data centers
has become a trend. Additionally, when the stacking link connecting
the switches fails or is abnormal, the stack will split. Although it is
not common, it still happens in actual operation. The problem is that
after the split, it is equivalent to two switches with the same
configuration appearing in the network, causing network configuration
conflicts and ultimately interrupting the services carried by the
stacking system.

To improve network stability, "non-stacking" solutions have been
increasingly adopted, particularly by public cloud providers and
tech companies like Alibaba, Tencent, and Didi. "non-stacking" is
a method of mimicking switch stacking that convinces a LACP peer,
bonding in this case, connected to a set of "non-stacked" switches
that all of its ports are connected to a single switch
(i.e., LACP aggregator), as if those switches were stacked. This
enables the LACP peer's ports to aggregate together, and requires
(a) special switch configuration, described in the linked article,
and (b) modifications to the bonding 802.3ad (LACP) mode to send
all ARP/ND packets across all ports of the active aggregator.

Note that, with multiple aggregators, the current broadcast mode
logic will send packets only to the selected aggregator(s).

 +-----------+   +-----------+
 |  switch1  |   |  switch2  |
 +-----------+   +-----------+
         ^           ^
         |           |
      +-----------------+
      |   bond4 lacp    |
      +-----------------+
         |           |
         | NIC1      | NIC2
      +-----------------+
      |     server      |
      +-----------------+

- https://www.ruijie.com/fr-fr/support/tech-gallery/de-stack-data-center-network-architecture/

Cc: Jay Vosburgh <jv@jvosburgh.net>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Simon Horman <horms@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Andrew Lunn <andrew+netdev@lunn.ch>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: Tonghao Zhang <tonghao@bamaicloud.com>
Signed-off-by: Zengbing Tu <tuzengbing@didiglobal.com>
Link: https://patch.msgid.link/84d0a044514157bb856a10b6d03a1028c4883561.1751031306.git.tonghao@bamaicloud.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 Documentation/networking/bonding.rst |  6 ++++
 drivers/net/bonding/bond_main.c      | 66 ++++++++++++++++++++++++++++++++----
 drivers/net/bonding/bond_options.c   | 42 +++++++++++++++++++++++
 include/net/bond_options.h           |  1 +
 include/net/bonding.h                |  3 ++
 5 files changed, 112 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst
index a4c1291d2561..14f7593d888d 100644
--- a/Documentation/networking/bonding.rst
+++ b/Documentation/networking/bonding.rst
@@ -562,6 +562,12 @@ lacp_rate
 
 	The default is slow.
 
+broadcast_neighbor
+
+	Option specifying whether to broadcast ARP/ND packets to all
+	active slaves.  This option has no effect in modes other than
+	802.3ad mode.  The default is off (0).
+
 max_bonds
 
 	Specifies the number of bonding devices to create for this
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index c4d53e8e7c15..12046ef51569 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -212,6 +212,8 @@ atomic_t netpoll_block_tx = ATOMIC_INIT(0);
 
 unsigned int bond_net_id __read_mostly;
 
+DEFINE_STATIC_KEY_FALSE(bond_bcast_neigh_enabled);
+
 static const struct flow_dissector_key flow_keys_bonding_keys[] = {
 	{
 		.key_id = FLOW_DISSECTOR_KEY_CONTROL,
@@ -4456,6 +4458,9 @@ static int bond_open(struct net_device *bond_dev)
 
 		bond_for_each_slave(bond, slave, iter)
 			dev_mc_add(slave->dev, lacpdu_mcast_addr);
+
+		if (bond->params.broadcast_neighbor)
+			static_branch_inc(&bond_bcast_neigh_enabled);
 	}
 
 	if (bond_mode_can_use_xmit_hash(bond))
@@ -4475,6 +4480,10 @@ static int bond_close(struct net_device *bond_dev)
 		bond_alb_deinitialize(bond);
 	bond->recv_probe = NULL;
 
+	if (BOND_MODE(bond) == BOND_MODE_8023AD &&
+	    bond->params.broadcast_neighbor)
+		static_branch_dec(&bond_bcast_neigh_enabled);
+
 	if (bond_uses_primary(bond)) {
 		rcu_read_lock();
 		slave = rcu_dereference(bond->curr_active_slave);
@@ -5310,6 +5319,37 @@ static struct slave *bond_xdp_xmit_3ad_xor_slave_get(struct bonding *bond,
 	return slaves->arr[hash % count];
 }
 
+static bool bond_should_broadcast_neighbor(struct sk_buff *skb,
+					   struct net_device *dev)
+{
+	struct bonding *bond = netdev_priv(dev);
+	struct {
+		struct ipv6hdr ip6;
+		struct icmp6hdr icmp6;
+	} *combined, _combined;
+
+	if (!static_branch_unlikely(&bond_bcast_neigh_enabled))
+		return false;
+
+	if (!bond->params.broadcast_neighbor)
+		return false;
+
+	if (skb->protocol == htons(ETH_P_ARP))
+		return true;
+
+	if (skb->protocol == htons(ETH_P_IPV6)) {
+		combined = skb_header_pointer(skb, skb_mac_header_len(skb),
+					      sizeof(_combined),
+					      &_combined);
+		if (combined && combined->ip6.nexthdr == NEXTHDR_ICMP &&
+		    (combined->icmp6.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION ||
+		     combined->icmp6.icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT))
+			return true;
+	}
+
+	return false;
+}
+
 /* Use this Xmit function for 3AD as well as XOR modes. The current
  * usable slave array is formed in the control path. The xmit function
  * just calculates hash and sends the packet out.
@@ -5329,17 +5369,27 @@ static netdev_tx_t bond_3ad_xor_xmit(struct sk_buff *skb,
 	return bond_tx_drop(dev, skb);
 }
 
-/* in broadcast mode, we send everything to all usable interfaces. */
+/* in broadcast mode, we send everything to all or usable slave interfaces.
+ * under rcu_read_lock when this function is called.
+ */
 static netdev_tx_t bond_xmit_broadcast(struct sk_buff *skb,
-				       struct net_device *bond_dev)
+				       struct net_device *bond_dev,
+				       bool all_slaves)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
-	struct slave *slave = NULL;
-	struct list_head *iter;
+	struct bond_up_slave *slaves;
 	bool xmit_suc = false;
 	bool skb_used = false;
+	int slaves_count, i;
 
-	bond_for_each_slave_rcu(bond, slave, iter) {
+	if (all_slaves)
+		slaves = rcu_dereference(bond->all_slaves);
+	else
+		slaves = rcu_dereference(bond->usable_slaves);
+
+	slaves_count = slaves ? READ_ONCE(slaves->count) : 0;
+	for (i = 0; i < slaves_count; i++) {
+		struct slave *slave = slaves->arr[i];
 		struct sk_buff *skb2;
 
 		if (!(bond_slave_is_up(slave) && slave->link == BOND_LINK_UP))
@@ -5577,10 +5627,13 @@ static netdev_tx_t __bond_start_xmit(struct sk_buff *skb, struct net_device *dev
 	case BOND_MODE_ACTIVEBACKUP:
 		return bond_xmit_activebackup(skb, dev);
 	case BOND_MODE_8023AD:
+		if (bond_should_broadcast_neighbor(skb, dev))
+			return bond_xmit_broadcast(skb, dev, false);
+		fallthrough;
 	case BOND_MODE_XOR:
 		return bond_3ad_xor_xmit(skb, dev);
 	case BOND_MODE_BROADCAST:
-		return bond_xmit_broadcast(skb, dev);
+		return bond_xmit_broadcast(skb, dev, true);
 	case BOND_MODE_ALB:
 		return bond_alb_xmit(skb, dev);
 	case BOND_MODE_TLB:
@@ -6456,6 +6509,7 @@ static int __init bond_check_params(struct bond_params *params)
 	eth_zero_addr(params->ad_actor_system);
 	params->ad_user_port_key = ad_user_port_key;
 	params->coupled_control = 1;
+	params->broadcast_neighbor = 0;
 	if (packets_per_slave > 0) {
 		params->reciprocal_packets_per_slave =
 			reciprocal_value(packets_per_slave);
diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
index 91893c29b899..1d639a3be6ba 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -87,6 +87,8 @@ static int bond_option_missed_max_set(struct bonding *bond,
 				      const struct bond_opt_value *newval);
 static int bond_option_coupled_control_set(struct bonding *bond,
 					   const struct bond_opt_value *newval);
+static int bond_option_broadcast_neigh_set(struct bonding *bond,
+					   const struct bond_opt_value *newval);
 
 static const struct bond_opt_value bond_mode_tbl[] = {
 	{ "balance-rr",    BOND_MODE_ROUNDROBIN,   BOND_VALFLAG_DEFAULT},
@@ -240,6 +242,12 @@ static const struct bond_opt_value bond_coupled_control_tbl[] = {
 	{ NULL,  -1, 0},
 };
 
+static const struct bond_opt_value bond_broadcast_neigh_tbl[] = {
+	{ "off", 0, BOND_VALFLAG_DEFAULT},
+	{ "on",	 1, 0},
+	{ NULL,  -1, 0}
+};
+
 static const struct bond_option bond_opts[BOND_OPT_LAST] = {
 	[BOND_OPT_MODE] = {
 		.id = BOND_OPT_MODE,
@@ -513,6 +521,14 @@ static const struct bond_option bond_opts[BOND_OPT_LAST] = {
 		.flags = BOND_OPTFLAG_IFDOWN,
 		.values = bond_coupled_control_tbl,
 		.set = bond_option_coupled_control_set,
+	},
+	[BOND_OPT_BROADCAST_NEIGH] = {
+		.id = BOND_OPT_BROADCAST_NEIGH,
+		.name = "broadcast_neighbor",
+		.desc = "Broadcast neighbor packets to all active slaves",
+		.unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_8023AD)),
+		.values = bond_broadcast_neigh_tbl,
+		.set = bond_option_broadcast_neigh_set,
 	}
 };
 
@@ -894,6 +910,13 @@ static int bond_option_mode_set(struct bonding *bond,
 	bond->params.arp_validate = BOND_ARP_VALIDATE_NONE;
 	bond->params.mode = newval->value;
 
+	/* When changing mode, the bond device is down, we may reduce
+	 * the bond_bcast_neigh_enabled in bond_close() if broadcast_neighbor
+	 * enabled in 8023ad mode. Therefore, only clear broadcast_neighbor
+	 * to 0.
+	 */
+	bond->params.broadcast_neighbor = 0;
+
 	if (bond->dev->reg_state == NETREG_REGISTERED) {
 		bool update = false;
 
@@ -1840,3 +1863,22 @@ static int bond_option_coupled_control_set(struct bonding *bond,
 	bond->params.coupled_control = newval->value;
 	return 0;
 }
+
+static int bond_option_broadcast_neigh_set(struct bonding *bond,
+					   const struct bond_opt_value *newval)
+{
+	if (bond->params.broadcast_neighbor == newval->value)
+		return 0;
+
+	bond->params.broadcast_neighbor = newval->value;
+	if (bond->dev->flags & IFF_UP) {
+		if (bond->params.broadcast_neighbor)
+			static_branch_inc(&bond_bcast_neigh_enabled);
+		else
+			static_branch_dec(&bond_bcast_neigh_enabled);
+	}
+
+	netdev_dbg(bond->dev, "Setting broadcast_neighbor to %s (%llu)\n",
+		   newval->string, newval->value);
+	return 0;
+}
diff --git a/include/net/bond_options.h b/include/net/bond_options.h
index 18687ccf0638..022b122a9fb6 100644
--- a/include/net/bond_options.h
+++ b/include/net/bond_options.h
@@ -77,6 +77,7 @@ enum {
 	BOND_OPT_NS_TARGETS,
 	BOND_OPT_PRIO,
 	BOND_OPT_COUPLED_CONTROL,
+	BOND_OPT_BROADCAST_NEIGH,
 	BOND_OPT_LAST
 };
 
diff --git a/include/net/bonding.h b/include/net/bonding.h
index 95f67b308c19..e06f0d63b2c1 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -115,6 +115,8 @@ static inline int is_netpoll_tx_blocked(struct net_device *dev)
 #define is_netpoll_tx_blocked(dev) (0)
 #endif
 
+DECLARE_STATIC_KEY_FALSE(bond_bcast_neigh_enabled);
+
 struct bond_params {
 	int mode;
 	int xmit_policy;
@@ -149,6 +151,7 @@ struct bond_params {
 	struct in6_addr ns_targets[BOND_MAX_NS_TARGETS];
 #endif
 	int coupled_control;
+	int broadcast_neighbor;
 
 	/* 2 bytes of padding : see ether_addr_equal_64bits() */
 	u8 ad_actor_system[ETH_ALEN + 2];
-- 
cgit v1.2.3


From 3d98ee52659c3f1d3913ae5b97f7743c5247752c Mon Sep 17 00:00:00 2001
From: Tonghao Zhang <tonghao@bamaicloud.com>
Date: Fri, 27 Jun 2025 21:49:29 +0800
Subject: net: bonding: add broadcast_neighbor netlink option

Users can configure or display the bonding broadcast_neighbor option via
iproute2/netlink.

Cc: Jay Vosburgh <jv@jvosburgh.net>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Simon Horman <horms@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Andrew Lunn <andrew+netdev@lunn.ch>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: Tonghao Zhang <tonghao@bamaicloud.com>
Signed-off-by: Zengbing Tu <tuzengbing@didiglobal.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://patch.msgid.link/76b90700ba5b98027dfb51a2f3c5cfea0440a21b.1751031306.git.tonghao@bamaicloud.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/bonding/bond_netlink.c | 16 ++++++++++++++++
 include/uapi/linux/if_link.h       |  1 +
 2 files changed, 17 insertions(+)

(limited to 'include')

diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c
index ac5e402c34bc..57fff2421f1b 100644
--- a/drivers/net/bonding/bond_netlink.c
+++ b/drivers/net/bonding/bond_netlink.c
@@ -124,6 +124,7 @@ static const struct nla_policy bond_policy[IFLA_BOND_MAX + 1] = {
 	[IFLA_BOND_MISSED_MAX]		= { .type = NLA_U8 },
 	[IFLA_BOND_NS_IP6_TARGET]	= { .type = NLA_NESTED },
 	[IFLA_BOND_COUPLED_CONTROL]	= { .type = NLA_U8 },
+	[IFLA_BOND_BROADCAST_NEIGH]	= { .type = NLA_U8 },
 };
 
 static const struct nla_policy bond_slave_policy[IFLA_BOND_SLAVE_MAX + 1] = {
@@ -561,6 +562,16 @@ static int bond_changelink(struct net_device *bond_dev, struct nlattr *tb[],
 			return err;
 	}
 
+	if (data[IFLA_BOND_BROADCAST_NEIGH]) {
+		int broadcast_neigh = nla_get_u8(data[IFLA_BOND_BROADCAST_NEIGH]);
+
+		bond_opt_initval(&newval, broadcast_neigh);
+		err = __bond_opt_set(bond, BOND_OPT_BROADCAST_NEIGH, &newval,
+				     data[IFLA_BOND_BROADCAST_NEIGH], extack);
+		if (err)
+			return err;
+	}
+
 	return 0;
 }
 
@@ -630,6 +641,7 @@ static size_t bond_get_size(const struct net_device *bond_dev)
 		nla_total_size(sizeof(struct nlattr)) +
 		nla_total_size(sizeof(struct in6_addr)) * BOND_MAX_NS_TARGETS +
 		nla_total_size(sizeof(u8)) +	/* IFLA_BOND_COUPLED_CONTROL */
+		nla_total_size(sizeof(u8)) +	/* IFLA_BOND_BROADCAST_NEIGH */
 		0;
 }
 
@@ -793,6 +805,10 @@ static int bond_fill_info(struct sk_buff *skb,
 		       bond->params.coupled_control))
 		goto nla_put_failure;
 
+	if (nla_put_u8(skb, IFLA_BOND_BROADCAST_NEIGH,
+		       bond->params.broadcast_neighbor))
+		goto nla_put_failure;
+
 	if (BOND_MODE(bond) == BOND_MODE_8023AD) {
 		struct ad_info info;
 
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 873c285996fe..784ace3a519c 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -1535,6 +1535,7 @@ enum {
 	IFLA_BOND_MISSED_MAX,
 	IFLA_BOND_NS_IP6_TARGET,
 	IFLA_BOND_COUPLED_CONTROL,
+	IFLA_BOND_BROADCAST_NEIGH,
 	__IFLA_BOND_MAX,
 };
 
-- 
cgit v1.2.3


From 269936db5eb3962fe290b1dc4dbf1859cd5a04dd Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@codeconstruct.com.au>
Date: Wed, 2 Jul 2025 14:20:03 +0800
Subject: net: mctp: separate routing database from routing operations

This change adds a struct mctp_dst, representing the result of a routing
lookup. This decouples the struct mctp_route from the actual
implementation of a routing operation.

This will allow for future routing changes which may require more
involved lookup logic, such as gateway routing - which may require
multiple traversals of the routing table.

Since we only use the struct mctp_route at lookup time, we no longer
hold routes over a routing operation, as we only need it to populate the
dst. However, we do hold the dev while the dst is active.

This requires some changes to the route test infrastructure, as we no
longer have a mock route to handle the route output operation, and
transient dsts are created by the routing code, so we can't override
them as easily.

Instead, we use kunit->priv to stash a packet queue, and a custom
dst_output function queues into that packet queue, which we can use for
later expectations.

Signed-off-by: Jeremy Kerr <jk@codeconstruct.com.au>
Link: https://patch.msgid.link/20250702-dev-forwarding-v5-3-1468191da8a4@codeconstruct.com.au
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/mctp.h         |  35 +++++--
 net/mctp/af_mctp.c         |  62 ++++--------
 net/mctp/route.c           | 210 ++++++++++++++++++++------------------
 net/mctp/test/route-test.c | 245 ++++++++++++++++++++++++++++-----------------
 4 files changed, 313 insertions(+), 239 deletions(-)

(limited to 'include')

diff --git a/include/net/mctp.h b/include/net/mctp.h
index 07d458990113..6c9c5c48f59a 100644
--- a/include/net/mctp.h
+++ b/include/net/mctp.h
@@ -222,6 +222,8 @@ struct mctp_flow {
 	struct mctp_sk_key *key;
 };
 
+struct mctp_dst;
+
 /* Route definition.
  *
  * These are held in the pernet->mctp.routes list, with RCU protection for
@@ -229,8 +231,7 @@ struct mctp_flow {
  * dropped on NETDEV_UNREGISTER events.
  *
  * Updates to the route table are performed under rtnl; all reads under RCU,
- * so routes cannot be referenced over a RCU grace period. Specifically: A
- * caller cannot block between mctp_route_lookup and mctp_route_release()
+ * so routes cannot be referenced over a RCU grace period.
  */
 struct mctp_route {
 	mctp_eid_t		min, max;
@@ -238,7 +239,7 @@ struct mctp_route {
 	unsigned char		type;
 	unsigned int		mtu;
 	struct mctp_dev		*dev;
-	int			(*output)(struct mctp_route *route,
+	int			(*output)(struct mctp_dst *dst,
 					  struct sk_buff *skb);
 
 	struct list_head	list;
@@ -246,12 +247,34 @@ struct mctp_route {
 	struct rcu_head		rcu;
 };
 
+/* Route lookup result: dst. Represents the results of a routing decision,
+ * but is only held over the individual routing operation.
+ *
+ * Will typically be stored on the caller stack, and must be released after
+ * usage.
+ */
+struct mctp_dst {
+	struct mctp_dev *dev;
+	unsigned int mtu;
+
+	/* set for direct addressing */
+	unsigned char halen;
+	unsigned char haddr[MAX_ADDR_LEN];
+
+	int (*output)(struct mctp_dst *dst, struct sk_buff *skb);
+};
+
+int mctp_dst_from_extaddr(struct mctp_dst *dst, struct net *net, int ifindex,
+			  unsigned char halen, const unsigned char *haddr);
+
 /* route interfaces */
-struct mctp_route *mctp_route_lookup(struct net *net, unsigned int dnet,
-				     mctp_eid_t daddr);
+int mctp_route_lookup(struct net *net, unsigned int dnet,
+		      mctp_eid_t daddr, struct mctp_dst *dst);
+
+void mctp_dst_release(struct mctp_dst *dst);
 
 /* always takes ownership of skb */
-int mctp_local_output(struct sock *sk, struct mctp_route *rt,
+int mctp_local_output(struct sock *sk, struct mctp_dst *dst,
 		      struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag);
 
 void mctp_key_unref(struct mctp_sk_key *key);
diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c
index 9b12ca97f412..e2570d9755ea 100644
--- a/net/mctp/af_mctp.c
+++ b/net/mctp/af_mctp.c
@@ -97,8 +97,8 @@ static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
 	struct sock *sk = sock->sk;
 	struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk);
 	struct mctp_skb_cb *cb;
-	struct mctp_route *rt;
 	struct sk_buff *skb = NULL;
+	struct mctp_dst dst;
 	int hlen;
 
 	if (addr) {
@@ -133,34 +133,30 @@ static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
 	if (msk->addr_ext && addrlen >= sizeof(struct sockaddr_mctp_ext)) {
 		DECLARE_SOCKADDR(struct sockaddr_mctp_ext *,
 				 extaddr, msg->msg_name);
-		struct net_device *dev;
-
-		rc = -EINVAL;
-		rcu_read_lock();
-		dev = dev_get_by_index_rcu(sock_net(sk), extaddr->smctp_ifindex);
-		/* check for correct halen */
-		if (dev && extaddr->smctp_halen == dev->addr_len) {
-			hlen = LL_RESERVED_SPACE(dev) + sizeof(struct mctp_hdr);
-			rc = 0;
-		}
-		rcu_read_unlock();
+
+		if (!mctp_sockaddr_ext_is_ok(extaddr))
+			return -EINVAL;
+
+		rc = mctp_dst_from_extaddr(&dst, sock_net(sk),
+					   extaddr->smctp_ifindex,
+					   extaddr->smctp_halen,
+					   extaddr->smctp_haddr);
 		if (rc)
-			goto err_free;
-		rt = NULL;
+			return rc;
+
 	} else {
-		rt = mctp_route_lookup(sock_net(sk), addr->smctp_network,
-				       addr->smctp_addr.s_addr);
-		if (!rt) {
-			rc = -EHOSTUNREACH;
-			goto err_free;
-		}
-		hlen = LL_RESERVED_SPACE(rt->dev->dev) + sizeof(struct mctp_hdr);
+		rc = mctp_route_lookup(sock_net(sk), addr->smctp_network,
+				       addr->smctp_addr.s_addr, &dst);
+		if (rc)
+			return rc;
 	}
 
+	hlen = LL_RESERVED_SPACE(dst.dev->dev) + sizeof(struct mctp_hdr);
+
 	skb = sock_alloc_send_skb(sk, hlen + 1 + len,
 				  msg->msg_flags & MSG_DONTWAIT, &rc);
 	if (!skb)
-		return rc;
+		goto err_release_dst;
 
 	skb_reserve(skb, hlen);
 
@@ -175,30 +171,16 @@ static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
 	cb = __mctp_cb(skb);
 	cb->net = addr->smctp_network;
 
-	if (!rt) {
-		/* fill extended address in cb */
-		DECLARE_SOCKADDR(struct sockaddr_mctp_ext *,
-				 extaddr, msg->msg_name);
-
-		if (!mctp_sockaddr_ext_is_ok(extaddr) ||
-		    extaddr->smctp_halen > sizeof(cb->haddr)) {
-			rc = -EINVAL;
-			goto err_free;
-		}
-
-		cb->ifindex = extaddr->smctp_ifindex;
-		/* smctp_halen is checked above */
-		cb->halen = extaddr->smctp_halen;
-		memcpy(cb->haddr, extaddr->smctp_haddr, cb->halen);
-	}
-
-	rc = mctp_local_output(sk, rt, skb, addr->smctp_addr.s_addr,
+	rc = mctp_local_output(sk, &dst, skb, addr->smctp_addr.s_addr,
 			       addr->smctp_tag);
 
+	mctp_dst_release(&dst);
 	return rc ? : len;
 
 err_free:
 	kfree_skb(skb);
+err_release_dst:
+	mctp_dst_release(&dst);
 	return rc;
 }
 
diff --git a/net/mctp/route.c b/net/mctp/route.c
index 128ac46dda5e..3985388a6035 100644
--- a/net/mctp/route.c
+++ b/net/mctp/route.c
@@ -32,7 +32,7 @@ static const unsigned long mctp_key_lifetime = 6 * CONFIG_HZ;
 static void mctp_flow_prepare_output(struct sk_buff *skb, struct mctp_dev *dev);
 
 /* route output callbacks */
-static int mctp_route_discard(struct mctp_route *route, struct sk_buff *skb)
+static int mctp_dst_discard(struct mctp_dst *dst, struct sk_buff *skb)
 {
 	kfree_skb(skb);
 	return 0;
@@ -368,7 +368,7 @@ static int mctp_frag_queue(struct mctp_sk_key *key, struct sk_buff *skb)
 	return 0;
 }
 
-static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb)
+static int mctp_dst_input(struct mctp_dst *dst, struct sk_buff *skb)
 {
 	struct mctp_sk_key *key, *any_key = NULL;
 	struct net *net = dev_net(skb->dev);
@@ -559,24 +559,17 @@ out:
 	return rc;
 }
 
-static unsigned int mctp_route_mtu(struct mctp_route *rt)
-{
-	return rt->mtu ?: READ_ONCE(rt->dev->dev->mtu);
-}
-
-static int mctp_route_output(struct mctp_route *route, struct sk_buff *skb)
+static int mctp_dst_output(struct mctp_dst *dst, struct sk_buff *skb)
 {
 	struct mctp_skb_cb *cb = mctp_cb(skb);
 	struct mctp_hdr *hdr = mctp_hdr(skb);
 	char daddr_buf[MAX_ADDR_LEN];
 	char *daddr = NULL;
-	unsigned int mtu;
 	int rc;
 
 	skb->protocol = htons(ETH_P_MCTP);
 
-	mtu = READ_ONCE(skb->dev->mtu);
-	if (skb->len > mtu) {
+	if (skb->len > dst->mtu) {
 		kfree_skb(skb);
 		return -EMSGSIZE;
 	}
@@ -598,7 +591,7 @@ static int mctp_route_output(struct mctp_route *route, struct sk_buff *skb)
 	} else {
 		skb->pkt_type = PACKET_OUTGOING;
 		/* If lookup fails let the device handle daddr==NULL */
-		if (mctp_neigh_lookup(route->dev, hdr->dest, daddr_buf) == 0)
+		if (mctp_neigh_lookup(dst->dev, hdr->dest, daddr_buf) == 0)
 			daddr = daddr_buf;
 	}
 
@@ -609,7 +602,7 @@ static int mctp_route_output(struct mctp_route *route, struct sk_buff *skb)
 		return -EHOSTUNREACH;
 	}
 
-	mctp_flow_prepare_output(skb, route->dev);
+	mctp_flow_prepare_output(skb, dst->dev);
 
 	rc = dev_queue_xmit(skb);
 	if (rc)
@@ -638,7 +631,7 @@ static struct mctp_route *mctp_route_alloc(void)
 
 	INIT_LIST_HEAD(&rt->list);
 	refcount_set(&rt->refs, 1);
-	rt->output = mctp_route_discard;
+	rt->output = mctp_dst_discard;
 
 	return rt;
 }
@@ -828,49 +821,106 @@ static bool mctp_rt_compare_exact(struct mctp_route *rt1,
 		rt1->max == rt2->max;
 }
 
-struct mctp_route *mctp_route_lookup(struct net *net, unsigned int dnet,
-				     mctp_eid_t daddr)
+static void mctp_dst_from_route(struct mctp_dst *dst, struct mctp_route *route)
+{
+	mctp_dev_hold(route->dev);
+	dst->dev = route->dev;
+	dst->mtu = route->mtu ?: READ_ONCE(dst->dev->dev->mtu);
+	dst->halen = 0;
+	dst->output = route->output;
+}
+
+int mctp_dst_from_extaddr(struct mctp_dst *dst, struct net *net, int ifindex,
+			  unsigned char halen, const unsigned char *haddr)
 {
-	struct mctp_route *tmp, *rt = NULL;
+	struct net_device *netdev;
+	struct mctp_dev *dev;
+	int rc = -ENOENT;
+
+	if (halen > sizeof(dst->haddr))
+		return -EINVAL;
 
 	rcu_read_lock();
 
-	list_for_each_entry_rcu(tmp, &net->mctp.routes, list) {
+	netdev = dev_get_by_index_rcu(net, ifindex);
+	if (!netdev)
+		goto out_unlock;
+
+	if (netdev->addr_len != halen) {
+		rc = -EINVAL;
+		goto out_unlock;
+	}
+
+	dev = __mctp_dev_get(netdev);
+	if (!dev)
+		goto out_unlock;
+
+	dst->dev = dev;
+	dst->mtu = READ_ONCE(netdev->mtu);
+	dst->halen = halen;
+	dst->output = mctp_dst_output;
+	memcpy(dst->haddr, haddr, halen);
+
+	rc = 0;
+
+out_unlock:
+	rcu_read_unlock();
+	return rc;
+}
+
+void mctp_dst_release(struct mctp_dst *dst)
+{
+	mctp_dev_put(dst->dev);
+}
+
+/* populates *dst on successful lookup, if set */
+int mctp_route_lookup(struct net *net, unsigned int dnet,
+		      mctp_eid_t daddr, struct mctp_dst *dst)
+{
+	int rc = -EHOSTUNREACH;
+	struct mctp_route *rt;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(rt, &net->mctp.routes, list) {
 		/* TODO: add metrics */
-		if (mctp_rt_match_eid(tmp, dnet, daddr)) {
-			if (refcount_inc_not_zero(&tmp->refs)) {
-				rt = tmp;
-				break;
-			}
-		}
+		if (!mctp_rt_match_eid(rt, dnet, daddr))
+			continue;
+
+		if (dst)
+			mctp_dst_from_route(dst, rt);
+		rc = 0;
+		break;
 	}
 
 	rcu_read_unlock();
 
-	return rt;
+	return rc;
 }
 
-static struct mctp_route *mctp_route_lookup_null(struct net *net,
-						 struct net_device *dev)
+static int mctp_route_lookup_null(struct net *net, struct net_device *dev,
+				  struct mctp_dst *dst)
 {
-	struct mctp_route *tmp, *rt = NULL;
+	int rc = -EHOSTUNREACH;
+	struct mctp_route *rt;
 
 	rcu_read_lock();
 
-	list_for_each_entry_rcu(tmp, &net->mctp.routes, list) {
-		if (tmp->dev->dev == dev && tmp->type == RTN_LOCAL &&
-		    refcount_inc_not_zero(&tmp->refs)) {
-			rt = tmp;
-			break;
-		}
+	list_for_each_entry_rcu(rt, &net->mctp.routes, list) {
+		if (rt->dev->dev != dev || rt->type != RTN_LOCAL)
+			continue;
+
+		mctp_dst_from_route(dst, rt);
+		rc = 0;
+		break;
 	}
 
 	rcu_read_unlock();
 
-	return rt;
+	return rc;
 }
 
-static int mctp_do_fragment_route(struct mctp_route *rt, struct sk_buff *skb,
+static int mctp_do_fragment_route(struct mctp_dst *dst, struct sk_buff *skb,
 				  unsigned int mtu, u8 tag)
 {
 	const unsigned int hlen = sizeof(struct mctp_hdr);
@@ -943,7 +993,7 @@ static int mctp_do_fragment_route(struct mctp_route *rt, struct sk_buff *skb,
 		skb_ext_copy(skb2, skb);
 
 		/* do route */
-		rc = rt->output(rt, skb2);
+		rc = dst->output(dst, skb2);
 		if (rc)
 			break;
 
@@ -955,68 +1005,32 @@ static int mctp_do_fragment_route(struct mctp_route *rt, struct sk_buff *skb,
 	return rc;
 }
 
-int mctp_local_output(struct sock *sk, struct mctp_route *rt,
+int mctp_local_output(struct sock *sk, struct mctp_dst *dst,
 		      struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag)
 {
 	struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk);
 	struct mctp_skb_cb *cb = mctp_cb(skb);
-	struct mctp_route tmp_rt = {0};
 	struct mctp_sk_key *key;
 	struct mctp_hdr *hdr;
 	unsigned long flags;
 	unsigned int netid;
 	unsigned int mtu;
 	mctp_eid_t saddr;
-	bool ext_rt;
 	int rc;
 	u8 tag;
 
 	rc = -ENODEV;
 
-	if (rt) {
-		ext_rt = false;
-		if (WARN_ON(!rt->dev))
-			goto out_release;
-
-	} else if (cb->ifindex) {
-		struct net_device *dev;
-
-		ext_rt = true;
-		rt = &tmp_rt;
-
-		rcu_read_lock();
-		dev = dev_get_by_index_rcu(sock_net(sk), cb->ifindex);
-		if (!dev) {
-			rcu_read_unlock();
-			goto out_free;
-		}
-		rt->dev = __mctp_dev_get(dev);
-		rcu_read_unlock();
-
-		if (!rt->dev)
-			goto out_release;
-
-		/* establish temporary route - we set up enough to keep
-		 * mctp_route_output happy
-		 */
-		rt->output = mctp_route_output;
-		rt->mtu = 0;
-
-	} else {
-		rc = -EINVAL;
-		goto out_free;
-	}
-
-	spin_lock_irqsave(&rt->dev->addrs_lock, flags);
-	if (rt->dev->num_addrs == 0) {
+	spin_lock_irqsave(&dst->dev->addrs_lock, flags);
+	if (dst->dev->num_addrs == 0) {
 		rc = -EHOSTUNREACH;
 	} else {
 		/* use the outbound interface's first address as our source */
-		saddr = rt->dev->addrs[0];
+		saddr = dst->dev->addrs[0];
 		rc = 0;
 	}
-	spin_unlock_irqrestore(&rt->dev->addrs_lock, flags);
-	netid = READ_ONCE(rt->dev->net);
+	spin_unlock_irqrestore(&dst->dev->addrs_lock, flags);
+	netid = READ_ONCE(dst->dev->net);
 
 	if (rc)
 		goto out_release;
@@ -1048,7 +1062,7 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt,
 	skb_reset_transport_header(skb);
 	skb_push(skb, sizeof(struct mctp_hdr));
 	skb_reset_network_header(skb);
-	skb->dev = rt->dev->dev;
+	skb->dev = dst->dev->dev;
 
 	/* cb->net will have been set on initial ingress */
 	cb->src = saddr;
@@ -1059,26 +1073,20 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt,
 	hdr->dest = daddr;
 	hdr->src = saddr;
 
-	mtu = mctp_route_mtu(rt);
+	mtu = dst->mtu;
 
 	if (skb->len + sizeof(struct mctp_hdr) <= mtu) {
 		hdr->flags_seq_tag = MCTP_HDR_FLAG_SOM |
 			MCTP_HDR_FLAG_EOM | tag;
-		rc = rt->output(rt, skb);
+		rc = dst->output(dst, skb);
 	} else {
-		rc = mctp_do_fragment_route(rt, skb, mtu, tag);
+		rc = mctp_do_fragment_route(dst, skb, mtu, tag);
 	}
 
 	/* route output functions consume the skb, even on error */
 	skb = NULL;
 
 out_release:
-	if (!ext_rt)
-		mctp_route_release(rt);
-
-	mctp_dev_put(tmp_rt.dev);
-
-out_free:
 	kfree_skb(skb);
 	return rc;
 }
@@ -1088,7 +1096,7 @@ static int mctp_route_add(struct mctp_dev *mdev, mctp_eid_t daddr_start,
 			  unsigned int daddr_extent, unsigned int mtu,
 			  unsigned char type)
 {
-	int (*rtfn)(struct mctp_route *rt, struct sk_buff *skb);
+	int (*rtfn)(struct mctp_dst *dst, struct sk_buff *skb);
 	struct net *net = dev_net(mdev->dev);
 	struct mctp_route *rt, *ert;
 
@@ -1100,15 +1108,17 @@ static int mctp_route_add(struct mctp_dev *mdev, mctp_eid_t daddr_start,
 
 	switch (type) {
 	case RTN_LOCAL:
-		rtfn = mctp_route_input;
+		rtfn = mctp_dst_input;
 		break;
 	case RTN_UNICAST:
-		rtfn = mctp_route_output;
+		rtfn = mctp_dst_output;
 		break;
 	default:
 		return -EINVAL;
 	}
 
+	ASSERT_RTNL();
+
 	rt = mctp_route_alloc();
 	if (!rt)
 		return -ENOMEM;
@@ -1121,7 +1131,6 @@ static int mctp_route_add(struct mctp_dev *mdev, mctp_eid_t daddr_start,
 	rt->type = type;
 	rt->output = rtfn;
 
-	ASSERT_RTNL();
 	/* Prevent duplicate identical routes. */
 	list_for_each_entry(ert, &net->mctp.routes, list) {
 		if (mctp_rt_compare_exact(rt, ert)) {
@@ -1200,8 +1209,9 @@ static int mctp_pkttype_receive(struct sk_buff *skb, struct net_device *dev,
 	struct net *net = dev_net(dev);
 	struct mctp_dev *mdev;
 	struct mctp_skb_cb *cb;
-	struct mctp_route *rt;
+	struct mctp_dst dst;
 	struct mctp_hdr *mh;
+	int rc;
 
 	rcu_read_lock();
 	mdev = __mctp_dev_get(dev);
@@ -1243,17 +1253,17 @@ static int mctp_pkttype_receive(struct sk_buff *skb, struct net_device *dev,
 	cb->net = READ_ONCE(mdev->net);
 	cb->ifindex = dev->ifindex;
 
-	rt = mctp_route_lookup(net, cb->net, mh->dest);
+	rc = mctp_route_lookup(net, cb->net, mh->dest, &dst);
 
 	/* NULL EID, but addressed to our physical address */
-	if (!rt && mh->dest == MCTP_ADDR_NULL && skb->pkt_type == PACKET_HOST)
-		rt = mctp_route_lookup_null(net, dev);
+	if (rc && mh->dest == MCTP_ADDR_NULL && skb->pkt_type == PACKET_HOST)
+		rc = mctp_route_lookup_null(net, dev, &dst);
 
-	if (!rt)
+	if (rc)
 		goto err_drop;
 
-	rt->output(rt, skb);
-	mctp_route_release(rt);
+	dst.output(&dst, skb);
+	mctp_dst_release(&dst);
 	mctp_dev_put(mdev);
 
 	return NET_RX_SUCCESS;
diff --git a/net/mctp/test/route-test.c b/net/mctp/test/route-test.c
index 44ebc8e4e30c..7a1eba463fe7 100644
--- a/net/mctp/test/route-test.c
+++ b/net/mctp/test/route-test.c
@@ -2,18 +2,37 @@
 
 #include <kunit/test.h>
 
+/* keep clangd happy when compiled outside of the route.c include */
+#include <net/mctp.h>
+#include <net/mctpdevice.h>
+
 #include "utils.h"
 
 struct mctp_test_route {
 	struct mctp_route	rt;
-	struct sk_buff_head	pkts;
 };
 
-static int mctp_test_route_output(struct mctp_route *rt, struct sk_buff *skb)
+static const unsigned int test_pktqueue_magic = 0x5f713aef;
+
+struct mctp_test_pktqueue {
+	unsigned int magic;
+	struct sk_buff_head pkts;
+};
+
+static void mctp_test_pktqueue_init(struct mctp_test_pktqueue *tpq)
+{
+	tpq->magic = test_pktqueue_magic;
+	skb_queue_head_init(&tpq->pkts);
+}
+
+static int mctp_test_dst_output(struct mctp_dst *dst, struct sk_buff *skb)
 {
-	struct mctp_test_route *test_rt = container_of(rt, struct mctp_test_route, rt);
+	struct kunit *test = current->kunit_test;
+	struct mctp_test_pktqueue *tpq = test->priv;
+
+	KUNIT_ASSERT_EQ(test, tpq->magic, test_pktqueue_magic);
 
-	skb_queue_tail(&test_rt->pkts, skb);
+	skb_queue_tail(&tpq->pkts, skb);
 
 	return 0;
 }
@@ -29,9 +48,7 @@ static struct mctp_test_route *mctp_route_test_alloc(void)
 
 	INIT_LIST_HEAD(&rt->rt.list);
 	refcount_set(&rt->rt.refs, 1);
-	rt->rt.output = mctp_test_route_output;
-
-	skb_queue_head_init(&rt->pkts);
+	rt->rt.output = mctp_test_dst_output;
 
 	return rt;
 }
@@ -60,6 +77,32 @@ static struct mctp_test_route *mctp_test_create_route(struct net *net,
 	return rt;
 }
 
+/* Convenience function for our test dst; release with mctp_test_dst_release()
+ */
+static void mctp_test_dst_setup(struct kunit *test, struct mctp_dst *dst,
+				struct mctp_test_dev *dev,
+				struct mctp_test_pktqueue *tpq,
+				unsigned int mtu)
+{
+	KUNIT_EXPECT_NOT_ERR_OR_NULL(test, dev);
+
+	memset(dst, 0, sizeof(*dst));
+
+	dst->dev = dev->mdev;
+	__mctp_dev_get(dst->dev->dev);
+	dst->mtu = mtu;
+	dst->output = mctp_test_dst_output;
+	mctp_test_pktqueue_init(tpq);
+	test->priv = tpq;
+}
+
+static void mctp_test_dst_release(struct mctp_dst *dst,
+				  struct mctp_test_pktqueue *tpq)
+{
+	mctp_dst_release(dst);
+	skb_queue_purge(&tpq->pkts);
+}
+
 static void mctp_test_route_destroy(struct kunit *test,
 				    struct mctp_test_route *rt)
 {
@@ -69,7 +112,6 @@ static void mctp_test_route_destroy(struct kunit *test,
 	list_del_rcu(&rt->rt.list);
 	rtnl_unlock();
 
-	skb_queue_purge(&rt->pkts);
 	if (rt->rt.dev)
 		mctp_dev_put(rt->rt.dev);
 
@@ -141,8 +183,10 @@ struct mctp_frag_test {
 static void mctp_test_fragment(struct kunit *test)
 {
 	const struct mctp_frag_test *params;
+	struct mctp_test_pktqueue tpq;
 	int rc, i, n, mtu, msgsize;
-	struct mctp_test_route *rt;
+	struct mctp_test_dev *dev;
+	struct mctp_dst dst;
 	struct sk_buff *skb;
 	struct mctp_hdr hdr;
 	u8 seq;
@@ -159,13 +203,15 @@ static void mctp_test_fragment(struct kunit *test)
 	skb = mctp_test_create_skb(&hdr, msgsize);
 	KUNIT_ASSERT_TRUE(test, skb);
 
-	rt = mctp_test_create_route(&init_net, NULL, 10, mtu);
-	KUNIT_ASSERT_TRUE(test, rt);
+	dev = mctp_test_create_dev();
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev);
+
+	mctp_test_dst_setup(test, &dst, dev, &tpq, mtu);
 
-	rc = mctp_do_fragment_route(&rt->rt, skb, mtu, MCTP_TAG_OWNER);
+	rc = mctp_do_fragment_route(&dst, skb, mtu, MCTP_TAG_OWNER);
 	KUNIT_EXPECT_FALSE(test, rc);
 
-	n = rt->pkts.qlen;
+	n = tpq.pkts.qlen;
 
 	KUNIT_EXPECT_EQ(test, n, params->n_frags);
 
@@ -178,7 +224,7 @@ static void mctp_test_fragment(struct kunit *test)
 		first = i == 0;
 		last = i == (n - 1);
 
-		skb2 = skb_dequeue(&rt->pkts);
+		skb2 = skb_dequeue(&tpq.pkts);
 
 		if (!skb2)
 			break;
@@ -216,7 +262,8 @@ static void mctp_test_fragment(struct kunit *test)
 		kfree_skb(skb2);
 	}
 
-	mctp_test_route_destroy(test, rt);
+	mctp_test_dst_release(&dst, &tpq);
+	mctp_test_destroy_dev(dev);
 }
 
 static const struct mctp_frag_test mctp_frag_tests[] = {
@@ -246,11 +293,13 @@ struct mctp_rx_input_test {
 static void mctp_test_rx_input(struct kunit *test)
 {
 	const struct mctp_rx_input_test *params;
+	struct mctp_test_pktqueue tpq;
 	struct mctp_test_route *rt;
 	struct mctp_test_dev *dev;
 	struct sk_buff *skb;
 
 	params = test->param_value;
+	test->priv = &tpq;
 
 	dev = mctp_test_create_dev();
 	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev);
@@ -261,10 +310,13 @@ static void mctp_test_rx_input(struct kunit *test)
 	skb = mctp_test_create_skb(&params->hdr, 1);
 	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb);
 
+	mctp_test_pktqueue_init(&tpq);
+
 	mctp_pkttype_receive(skb, dev->ndev, &mctp_packet_type, NULL);
 
-	KUNIT_EXPECT_EQ(test, !!rt->pkts.qlen, params->input);
+	KUNIT_EXPECT_EQ(test, !!tpq.pkts.qlen, params->input);
 
+	skb_queue_purge(&tpq.pkts);
 	mctp_test_route_destroy(test, rt);
 	mctp_test_destroy_dev(dev);
 }
@@ -292,12 +344,12 @@ KUNIT_ARRAY_PARAM(mctp_rx_input, mctp_rx_input_tests,
 /* set up a local dev, route on EID 8, and a socket listening on type 0 */
 static void __mctp_route_test_init(struct kunit *test,
 				   struct mctp_test_dev **devp,
-				   struct mctp_test_route **rtp,
+				   struct mctp_dst *dst,
+				   struct mctp_test_pktqueue *tpq,
 				   struct socket **sockp,
 				   unsigned int netid)
 {
 	struct sockaddr_mctp addr = {0};
-	struct mctp_test_route *rt;
 	struct mctp_test_dev *dev;
 	struct socket *sock;
 	int rc;
@@ -307,8 +359,7 @@ static void __mctp_route_test_init(struct kunit *test,
 	if (netid != MCTP_NET_ANY)
 		WRITE_ONCE(dev->mdev->net, netid);
 
-	rt = mctp_test_create_route(&init_net, dev->mdev, 8, 68);
-	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, rt);
+	mctp_test_dst_setup(test, dst, dev, tpq, 68);
 
 	rc = sock_create_kern(&init_net, AF_MCTP, SOCK_DGRAM, 0, &sock);
 	KUNIT_ASSERT_EQ(test, rc, 0);
@@ -320,18 +371,18 @@ static void __mctp_route_test_init(struct kunit *test,
 	rc = kernel_bind(sock, (struct sockaddr *)&addr, sizeof(addr));
 	KUNIT_ASSERT_EQ(test, rc, 0);
 
-	*rtp = rt;
 	*devp = dev;
 	*sockp = sock;
 }
 
 static void __mctp_route_test_fini(struct kunit *test,
 				   struct mctp_test_dev *dev,
-				   struct mctp_test_route *rt,
+				   struct mctp_dst *dst,
+				   struct mctp_test_pktqueue *tpq,
 				   struct socket *sock)
 {
 	sock_release(sock);
-	mctp_test_route_destroy(test, rt);
+	mctp_test_dst_release(dst, tpq);
 	mctp_test_destroy_dev(dev);
 }
 
@@ -344,22 +395,24 @@ struct mctp_route_input_sk_test {
 static void mctp_test_route_input_sk(struct kunit *test)
 {
 	const struct mctp_route_input_sk_test *params;
+	struct mctp_test_pktqueue tpq;
 	struct sk_buff *skb, *skb2;
-	struct mctp_test_route *rt;
 	struct mctp_test_dev *dev;
+	struct mctp_dst dst;
 	struct socket *sock;
 	int rc;
 
 	params = test->param_value;
 
-	__mctp_route_test_init(test, &dev, &rt, &sock, MCTP_NET_ANY);
+	__mctp_route_test_init(test, &dev, &dst, &tpq, &sock, MCTP_NET_ANY);
 
 	skb = mctp_test_create_skb_data(&params->hdr, &params->type);
 	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb);
 
 	mctp_test_skb_set_dev(skb, dev);
+	mctp_test_pktqueue_init(&tpq);
 
-	rc = mctp_route_input(&rt->rt, skb);
+	rc = mctp_dst_input(&dst, skb);
 
 	if (params->deliver) {
 		KUNIT_EXPECT_EQ(test, rc, 0);
@@ -376,7 +429,7 @@ static void mctp_test_route_input_sk(struct kunit *test)
 		KUNIT_EXPECT_NULL(test, skb2);
 	}
 
-	__mctp_route_test_fini(test, dev, rt, sock);
+	__mctp_route_test_fini(test, dev, &dst, &tpq, sock);
 }
 
 #define FL_S	(MCTP_HDR_FLAG_SOM)
@@ -413,16 +466,17 @@ struct mctp_route_input_sk_reasm_test {
 static void mctp_test_route_input_sk_reasm(struct kunit *test)
 {
 	const struct mctp_route_input_sk_reasm_test *params;
+	struct mctp_test_pktqueue tpq;
 	struct sk_buff *skb, *skb2;
-	struct mctp_test_route *rt;
 	struct mctp_test_dev *dev;
+	struct mctp_dst dst;
 	struct socket *sock;
 	int i, rc;
 	u8 c;
 
 	params = test->param_value;
 
-	__mctp_route_test_init(test, &dev, &rt, &sock, MCTP_NET_ANY);
+	__mctp_route_test_init(test, &dev, &dst, &tpq, &sock, MCTP_NET_ANY);
 
 	for (i = 0; i < params->n_hdrs; i++) {
 		c = i;
@@ -431,7 +485,7 @@ static void mctp_test_route_input_sk_reasm(struct kunit *test)
 
 		mctp_test_skb_set_dev(skb, dev);
 
-		rc = mctp_route_input(&rt->rt, skb);
+		rc = mctp_dst_input(&dst, skb);
 	}
 
 	skb2 = skb_recv_datagram(sock->sk, MSG_DONTWAIT, &rc);
@@ -445,7 +499,7 @@ static void mctp_test_route_input_sk_reasm(struct kunit *test)
 		KUNIT_EXPECT_NULL(test, skb2);
 	}
 
-	__mctp_route_test_fini(test, dev, rt, sock);
+	__mctp_route_test_fini(test, dev, &dst, &tpq, sock);
 }
 
 #define RX_FRAG(f, s) RX_HDR(1, 10, 8, FL_TO | (f) | ((s) << MCTP_HDR_SEQ_SHIFT))
@@ -547,7 +601,7 @@ struct mctp_route_input_sk_keys_test {
 static void mctp_test_route_input_sk_keys(struct kunit *test)
 {
 	const struct mctp_route_input_sk_keys_test *params;
-	struct mctp_test_route *rt;
+	struct mctp_test_pktqueue tpq;
 	struct sk_buff *skb, *skb2;
 	struct mctp_test_dev *dev;
 	struct mctp_sk_key *key;
@@ -555,6 +609,7 @@ static void mctp_test_route_input_sk_keys(struct kunit *test)
 	struct mctp_sock *msk;
 	struct socket *sock;
 	unsigned long flags;
+	struct mctp_dst dst;
 	unsigned int net;
 	int rc;
 	u8 c;
@@ -565,8 +620,7 @@ static void mctp_test_route_input_sk_keys(struct kunit *test)
 	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev);
 	net = READ_ONCE(dev->mdev->net);
 
-	rt = mctp_test_create_route(&init_net, dev->mdev, 8, 68);
-	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, rt);
+	mctp_test_dst_setup(test, &dst, dev, &tpq, 68);
 
 	rc = sock_create_kern(&init_net, AF_MCTP, SOCK_DGRAM, 0, &sock);
 	KUNIT_ASSERT_EQ(test, rc, 0);
@@ -592,7 +646,7 @@ static void mctp_test_route_input_sk_keys(struct kunit *test)
 
 	mctp_test_skb_set_dev(skb, dev);
 
-	rc = mctp_route_input(&rt->rt, skb);
+	rc = mctp_dst_input(&dst, skb);
 
 	/* (potentially) receive message */
 	skb2 = skb_recv_datagram(sock->sk, MSG_DONTWAIT, &rc);
@@ -606,7 +660,7 @@ static void mctp_test_route_input_sk_keys(struct kunit *test)
 		skb_free_datagram(sock->sk, skb2);
 
 	mctp_key_unref(key);
-	__mctp_route_test_fini(test, dev, rt, sock);
+	__mctp_route_test_fini(test, dev, &dst, &tpq, sock);
 }
 
 static const struct mctp_route_input_sk_keys_test mctp_route_input_sk_keys_tests[] = {
@@ -681,7 +735,8 @@ KUNIT_ARRAY_PARAM(mctp_route_input_sk_keys, mctp_route_input_sk_keys_tests,
 struct test_net {
 	unsigned int netid;
 	struct mctp_test_dev *dev;
-	struct mctp_test_route *rt;
+	struct mctp_test_pktqueue tpq;
+	struct mctp_dst dst;
 	struct socket *sock;
 	struct sk_buff *skb;
 	struct mctp_sk_key *key;
@@ -699,18 +754,20 @@ mctp_test_route_input_multiple_nets_bind_init(struct kunit *test,
 
 	t->msg.data = t->netid;
 
-	__mctp_route_test_init(test, &t->dev, &t->rt, &t->sock, t->netid);
+	__mctp_route_test_init(test, &t->dev, &t->dst, &t->tpq, &t->sock,
+			       t->netid);
 
 	t->skb = mctp_test_create_skb_data(&hdr, &t->msg);
 	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, t->skb);
 	mctp_test_skb_set_dev(t->skb, t->dev);
+	mctp_test_pktqueue_init(&t->tpq);
 }
 
 static void
 mctp_test_route_input_multiple_nets_bind_fini(struct kunit *test,
 					      struct test_net *t)
 {
-	__mctp_route_test_fini(test, t->dev, t->rt, t->sock);
+	__mctp_route_test_fini(test, t->dev, &t->dst, &t->tpq, t->sock);
 }
 
 /* Test that skbs from different nets (otherwise identical) get routed to their
@@ -731,9 +788,9 @@ static void mctp_test_route_input_multiple_nets_bind(struct kunit *test)
 	mctp_test_route_input_multiple_nets_bind_init(test, &t1);
 	mctp_test_route_input_multiple_nets_bind_init(test, &t2);
 
-	rc = mctp_route_input(&t1.rt->rt, t1.skb);
+	rc = mctp_dst_input(&t1.dst, t1.skb);
 	KUNIT_ASSERT_EQ(test, rc, 0);
-	rc = mctp_route_input(&t2.rt->rt, t2.skb);
+	rc = mctp_dst_input(&t2.dst, t2.skb);
 	KUNIT_ASSERT_EQ(test, rc, 0);
 
 	rx_skb1 = skb_recv_datagram(t1.sock->sk, MSG_DONTWAIT, &rc);
@@ -767,7 +824,8 @@ mctp_test_route_input_multiple_nets_key_init(struct kunit *test,
 
 	t->msg.data = t->netid;
 
-	__mctp_route_test_init(test, &t->dev, &t->rt, &t->sock, t->netid);
+	__mctp_route_test_init(test, &t->dev, &t->dst, &t->tpq, &t->sock,
+			       t->netid);
 
 	msk = container_of(t->sock->sk, struct mctp_sock, sk);
 
@@ -790,7 +848,7 @@ mctp_test_route_input_multiple_nets_key_fini(struct kunit *test,
 					     struct test_net *t)
 {
 	mctp_key_unref(t->key);
-	__mctp_route_test_fini(test, t->dev, t->rt, t->sock);
+	__mctp_route_test_fini(test, t->dev, &t->dst, &t->tpq, t->sock);
 }
 
 /* test that skbs from different nets (otherwise identical) get routed to their
@@ -812,9 +870,9 @@ static void mctp_test_route_input_multiple_nets_key(struct kunit *test)
 	mctp_test_route_input_multiple_nets_key_init(test, &t1);
 	mctp_test_route_input_multiple_nets_key_init(test, &t2);
 
-	rc = mctp_route_input(&t1.rt->rt, t1.skb);
+	rc = mctp_dst_input(&t1.dst, t1.skb);
 	KUNIT_ASSERT_EQ(test, rc, 0);
-	rc = mctp_route_input(&t2.rt->rt, t2.skb);
+	rc = mctp_dst_input(&t2.dst, t2.skb);
 	KUNIT_ASSERT_EQ(test, rc, 0);
 
 	rx_skb1 = skb_recv_datagram(t1.sock->sk, MSG_DONTWAIT, &rc);
@@ -843,13 +901,14 @@ static void mctp_test_route_input_multiple_nets_key(struct kunit *test)
 static void mctp_test_route_input_sk_fail_single(struct kunit *test)
 {
 	const struct mctp_hdr hdr = RX_HDR(1, 10, 8, FL_S | FL_E | FL_TO);
-	struct mctp_test_route *rt;
+	struct mctp_test_pktqueue tpq;
 	struct mctp_test_dev *dev;
+	struct mctp_dst dst;
 	struct socket *sock;
 	struct sk_buff *skb;
 	int rc;
 
-	__mctp_route_test_init(test, &dev, &rt, &sock, MCTP_NET_ANY);
+	__mctp_route_test_init(test, &dev, &dst, &tpq, &sock, MCTP_NET_ANY);
 
 	/* No rcvbuf space, so delivery should fail. __sock_set_rcvbuf will
 	 * clamp the minimum to SOCK_MIN_RCVBUF, so we open-code this.
@@ -865,14 +924,14 @@ static void mctp_test_route_input_sk_fail_single(struct kunit *test)
 	mctp_test_skb_set_dev(skb, dev);
 
 	/* do route input, which should fail */
-	rc = mctp_route_input(&rt->rt, skb);
+	rc = mctp_dst_input(&dst, skb);
 	KUNIT_EXPECT_NE(test, rc, 0);
 
 	/* we should hold the only reference to skb */
 	KUNIT_EXPECT_EQ(test, refcount_read(&skb->users), 1);
 	kfree_skb(skb);
 
-	__mctp_route_test_fini(test, dev, rt, sock);
+	__mctp_route_test_fini(test, dev, &dst, &tpq, sock);
 }
 
 /* Input route to socket, using a fragmented message, where sock delivery fails.
@@ -880,14 +939,15 @@ static void mctp_test_route_input_sk_fail_single(struct kunit *test)
 static void mctp_test_route_input_sk_fail_frag(struct kunit *test)
 {
 	const struct mctp_hdr hdrs[2] = { RX_FRAG(FL_S, 0), RX_FRAG(FL_E, 1) };
-	struct mctp_test_route *rt;
+	struct mctp_test_pktqueue tpq;
 	struct mctp_test_dev *dev;
 	struct sk_buff *skbs[2];
+	struct mctp_dst dst;
 	struct socket *sock;
 	unsigned int i;
 	int rc;
 
-	__mctp_route_test_init(test, &dev, &rt, &sock, MCTP_NET_ANY);
+	__mctp_route_test_init(test, &dev, &dst, &tpq, &sock, MCTP_NET_ANY);
 
 	lock_sock(sock->sk);
 	WRITE_ONCE(sock->sk->sk_rcvbuf, 0);
@@ -904,11 +964,11 @@ static void mctp_test_route_input_sk_fail_frag(struct kunit *test)
 	/* first route input should succeed, we're only queueing to the
 	 * frag list
 	 */
-	rc = mctp_route_input(&rt->rt, skbs[0]);
+	rc = mctp_dst_input(&dst, skbs[0]);
 	KUNIT_EXPECT_EQ(test, rc, 0);
 
 	/* final route input should fail to deliver to the socket */
-	rc = mctp_route_input(&rt->rt, skbs[1]);
+	rc = mctp_dst_input(&dst, skbs[1]);
 	KUNIT_EXPECT_NE(test, rc, 0);
 
 	/* we should hold the only reference to both skbs */
@@ -918,7 +978,7 @@ static void mctp_test_route_input_sk_fail_frag(struct kunit *test)
 	KUNIT_EXPECT_EQ(test, refcount_read(&skbs[1]->users), 1);
 	kfree_skb(skbs[1]);
 
-	__mctp_route_test_fini(test, dev, rt, sock);
+	__mctp_route_test_fini(test, dev, &dst, &tpq, sock);
 }
 
 /* Input route to socket, using a fragmented message created from clones.
@@ -936,10 +996,11 @@ static void mctp_test_route_input_cloned_frag(struct kunit *test)
 	const size_t data_len = 3; /* arbitrary */
 	u8 compare[3 * ARRAY_SIZE(hdrs)];
 	u8 flat[3 * ARRAY_SIZE(hdrs)];
-	struct mctp_test_route *rt;
+	struct mctp_test_pktqueue tpq;
 	struct mctp_test_dev *dev;
 	struct sk_buff *skb[5];
 	struct sk_buff *rx_skb;
+	struct mctp_dst dst;
 	struct socket *sock;
 	size_t total;
 	void *p;
@@ -947,7 +1008,7 @@ static void mctp_test_route_input_cloned_frag(struct kunit *test)
 
 	total = data_len + sizeof(struct mctp_hdr);
 
-	__mctp_route_test_init(test, &dev, &rt, &sock, MCTP_NET_ANY);
+	__mctp_route_test_init(test, &dev, &dst, &tpq, &sock, MCTP_NET_ANY);
 
 	/* Create a single skb initially with concatenated packets */
 	skb[0] = mctp_test_create_skb(&hdrs[0], 5 * total);
@@ -986,7 +1047,7 @@ static void mctp_test_route_input_cloned_frag(struct kunit *test)
 
 	/* Feed the fragments into MCTP core */
 	for (int i = 0; i < 5; i++) {
-		rc = mctp_route_input(&rt->rt, skb[i]);
+		rc = mctp_dst_input(&dst, skb[i]);
 		KUNIT_EXPECT_EQ(test, rc, 0);
 	}
 
@@ -1024,29 +1085,29 @@ static void mctp_test_route_input_cloned_frag(struct kunit *test)
 		kfree_skb(skb[i]);
 	}
 
-	__mctp_route_test_fini(test, dev, rt, sock);
+	__mctp_route_test_fini(test, dev, &dst, &tpq, sock);
 }
 
 #if IS_ENABLED(CONFIG_MCTP_FLOWS)
 
 static void mctp_test_flow_init(struct kunit *test,
 				struct mctp_test_dev **devp,
-				struct mctp_test_route **rtp,
+				struct mctp_dst *dst,
+				struct mctp_test_pktqueue *tpq,
 				struct socket **sock,
 				struct sk_buff **skbp,
 				unsigned int len)
 {
-	struct mctp_test_route *rt;
 	struct mctp_test_dev *dev;
 	struct sk_buff *skb;
 
 	/* we have a slightly odd routing setup here; the test route
 	 * is for EID 8, which is our local EID. We don't do a routing
 	 * lookup, so that's fine - all we require is a path through
-	 * mctp_local_output, which will call rt->output on whatever
+	 * mctp_local_output, which will call dst->output on whatever
 	 * route we provide
 	 */
-	__mctp_route_test_init(test, &dev, &rt, sock, MCTP_NET_ANY);
+	__mctp_route_test_init(test, &dev, dst, tpq, sock, MCTP_NET_ANY);
 
 	/* Assign a single EID. ->addrs is freed on mctp netdev release */
 	dev->mdev->addrs = kmalloc(sizeof(u8), GFP_KERNEL);
@@ -1059,42 +1120,41 @@ static void mctp_test_flow_init(struct kunit *test,
 	skb_reserve(skb, sizeof(struct mctp_hdr) + 1);
 	memset(skb_put(skb, len), 0, len);
 
-	/* take a ref for the route, we'll decrement in local output */
-	refcount_inc(&rt->rt.refs);
 
 	*devp = dev;
-	*rtp = rt;
 	*skbp = skb;
 }
 
 static void mctp_test_flow_fini(struct kunit *test,
 				struct mctp_test_dev *dev,
-				struct mctp_test_route *rt,
+				struct mctp_dst *dst,
+				struct mctp_test_pktqueue *tpq,
 				struct socket *sock)
 {
-	__mctp_route_test_fini(test, dev, rt, sock);
+	__mctp_route_test_fini(test, dev, dst, tpq, sock);
 }
 
 /* test that an outgoing skb has the correct MCTP extension data set */
 static void mctp_test_packet_flow(struct kunit *test)
 {
+	struct mctp_test_pktqueue tpq;
 	struct sk_buff *skb, *skb2;
-	struct mctp_test_route *rt;
 	struct mctp_test_dev *dev;
+	struct mctp_dst dst;
 	struct mctp_flow *flow;
 	struct socket *sock;
-	u8 dst = 8;
+	u8 dst_eid = 8;
 	int n, rc;
 
-	mctp_test_flow_init(test, &dev, &rt, &sock, &skb, 30);
+	mctp_test_flow_init(test, &dev, &dst, &tpq, &sock, &skb, 30);
 
-	rc = mctp_local_output(sock->sk, &rt->rt, skb, dst, MCTP_TAG_OWNER);
+	rc = mctp_local_output(sock->sk, &dst, skb, dst_eid, MCTP_TAG_OWNER);
 	KUNIT_ASSERT_EQ(test, rc, 0);
 
-	n = rt->pkts.qlen;
+	n = tpq.pkts.qlen;
 	KUNIT_ASSERT_EQ(test, n, 1);
 
-	skb2 = skb_dequeue(&rt->pkts);
+	skb2 = skb_dequeue(&tpq.pkts);
 	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb2);
 
 	flow = skb_ext_find(skb2, SKB_EXT_MCTP);
@@ -1103,7 +1163,7 @@ static void mctp_test_packet_flow(struct kunit *test)
 	KUNIT_ASSERT_PTR_EQ(test, flow->key->sk, sock->sk);
 
 	kfree_skb(skb2);
-	mctp_test_flow_fini(test, dev, rt, sock);
+	mctp_test_flow_fini(test, dev, &dst, &tpq, sock);
 }
 
 /* test that outgoing skbs, after fragmentation, all have the correct MCTP
@@ -1111,26 +1171,27 @@ static void mctp_test_packet_flow(struct kunit *test)
  */
 static void mctp_test_fragment_flow(struct kunit *test)
 {
+	struct mctp_test_pktqueue tpq;
 	struct mctp_flow *flows[2];
 	struct sk_buff *tx_skbs[2];
-	struct mctp_test_route *rt;
 	struct mctp_test_dev *dev;
+	struct mctp_dst dst;
 	struct sk_buff *skb;
 	struct socket *sock;
-	u8 dst = 8;
+	u8 dst_eid = 8;
 	int n, rc;
 
-	mctp_test_flow_init(test, &dev, &rt, &sock, &skb, 100);
+	mctp_test_flow_init(test, &dev, &dst, &tpq, &sock, &skb, 100);
 
-	rc = mctp_local_output(sock->sk, &rt->rt, skb, dst, MCTP_TAG_OWNER);
+	rc = mctp_local_output(sock->sk, &dst, skb, dst_eid, MCTP_TAG_OWNER);
 	KUNIT_ASSERT_EQ(test, rc, 0);
 
-	n = rt->pkts.qlen;
+	n = tpq.pkts.qlen;
 	KUNIT_ASSERT_EQ(test, n, 2);
 
 	/* both resulting packets should have the same flow data */
-	tx_skbs[0] = skb_dequeue(&rt->pkts);
-	tx_skbs[1] = skb_dequeue(&rt->pkts);
+	tx_skbs[0] = skb_dequeue(&tpq.pkts);
+	tx_skbs[1] = skb_dequeue(&tpq.pkts);
 
 	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, tx_skbs[0]);
 	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, tx_skbs[1]);
@@ -1146,7 +1207,7 @@ static void mctp_test_fragment_flow(struct kunit *test)
 
 	kfree_skb(tx_skbs[0]);
 	kfree_skb(tx_skbs[1]);
-	mctp_test_flow_fini(test, dev, rt, sock);
+	mctp_test_flow_fini(test, dev, &dst, &tpq, sock);
 }
 
 #else
@@ -1164,15 +1225,16 @@ static void mctp_test_fragment_flow(struct kunit *test)
 /* Test that outgoing skbs cause a suitable tag to be created */
 static void mctp_test_route_output_key_create(struct kunit *test)
 {
+	const u8 dst_eid = 26, src_eid = 15;
+	struct mctp_test_pktqueue tpq;
 	const unsigned int netid = 50;
-	const u8 dst = 26, src = 15;
-	struct mctp_test_route *rt;
 	struct mctp_test_dev *dev;
 	struct mctp_sk_key *key;
 	struct netns_mctp *mns;
 	unsigned long flags;
 	struct socket *sock;
 	struct sk_buff *skb;
+	struct mctp_dst dst;
 	bool empty, single;
 	const int len = 2;
 	int rc;
@@ -1181,15 +1243,14 @@ static void mctp_test_route_output_key_create(struct kunit *test)
 	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev);
 	WRITE_ONCE(dev->mdev->net, netid);
 
-	rt = mctp_test_create_route(&init_net, dev->mdev, dst, 68);
-	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, rt);
+	mctp_test_dst_setup(test, &dst, dev, &tpq, 68);
 
 	rc = sock_create_kern(&init_net, AF_MCTP, SOCK_DGRAM, 0, &sock);
 	KUNIT_ASSERT_EQ(test, rc, 0);
 
 	dev->mdev->addrs = kmalloc(sizeof(u8), GFP_KERNEL);
 	dev->mdev->num_addrs = 1;
-	dev->mdev->addrs[0] = src;
+	dev->mdev->addrs[0] = src_eid;
 
 	skb = alloc_skb(sizeof(struct mctp_hdr) + 1 + len, GFP_KERNEL);
 	KUNIT_ASSERT_TRUE(test, skb);
@@ -1197,8 +1258,6 @@ static void mctp_test_route_output_key_create(struct kunit *test)
 	skb_reserve(skb, sizeof(struct mctp_hdr) + 1 + len);
 	memset(skb_put(skb, len), 0, len);
 
-	refcount_inc(&rt->rt.refs);
-
 	mns = &sock_net(sock->sk)->mctp;
 
 	/* We assume we're starting from an empty keys list, which requires
@@ -1209,7 +1268,7 @@ static void mctp_test_route_output_key_create(struct kunit *test)
 	spin_unlock_irqrestore(&mns->keys_lock, flags);
 	KUNIT_ASSERT_TRUE(test, empty);
 
-	rc = mctp_local_output(sock->sk, &rt->rt, skb, dst, MCTP_TAG_OWNER);
+	rc = mctp_local_output(sock->sk, &dst, skb, dst_eid, MCTP_TAG_OWNER);
 	KUNIT_ASSERT_EQ(test, rc, 0);
 
 	key = NULL;
@@ -1225,13 +1284,13 @@ static void mctp_test_route_output_key_create(struct kunit *test)
 	KUNIT_ASSERT_TRUE(test, single);
 
 	KUNIT_EXPECT_EQ(test, key->net, netid);
-	KUNIT_EXPECT_EQ(test, key->local_addr, src);
-	KUNIT_EXPECT_EQ(test, key->peer_addr, dst);
+	KUNIT_EXPECT_EQ(test, key->local_addr, src_eid);
+	KUNIT_EXPECT_EQ(test, key->peer_addr, dst_eid);
 	/* key has incoming tag, so inverse of what we sent */
 	KUNIT_EXPECT_FALSE(test, key->tag & MCTP_TAG_OWNER);
 
 	sock_release(sock);
-	mctp_test_route_destroy(test, rt);
+	mctp_test_dst_release(&dst, &tpq);
 	mctp_test_destroy_dev(dev);
 }
 
-- 
cgit v1.2.3


From 3007f90ec0385304ab5794e9585427b73f40e32f Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@codeconstruct.com.au>
Date: Wed, 2 Jul 2025 14:20:04 +0800
Subject: net: mctp: separate cb from direct-addressing routing

Now that we have the dst->haddr populated by sendmsg (when extended
addressing is in use), we no longer need to stash the link-layer address
in the skb->cb.

Instead, only use skb->cb for incoming lladdr data.

While we're at it: remove cb->src, as it was never used.

Signed-off-by: Jeremy Kerr <jk@codeconstruct.com.au>
Link: https://patch.msgid.link/20250702-dev-forwarding-v5-4-1468191da8a4@codeconstruct.com.au
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/mctp.h |  4 ++--
 net/mctp/route.c   | 21 +++++----------------
 2 files changed, 7 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/net/mctp.h b/include/net/mctp.h
index 6c9c5c48f59a..b3af0690f607 100644
--- a/include/net/mctp.h
+++ b/include/net/mctp.h
@@ -183,8 +183,8 @@ struct mctp_sk_key {
 struct mctp_skb_cb {
 	unsigned int	magic;
 	unsigned int	net;
-	int		ifindex; /* extended/direct addressing if set */
-	mctp_eid_t	src;
+	/* fields below provide extended addressing for ingress to recvmsg() */
+	int		ifindex;
 	unsigned char	halen;
 	unsigned char	haddr[MAX_ADDR_LEN];
 };
diff --git a/net/mctp/route.c b/net/mctp/route.c
index 3985388a6035..23f339b43643 100644
--- a/net/mctp/route.c
+++ b/net/mctp/route.c
@@ -561,35 +561,28 @@ out:
 
 static int mctp_dst_output(struct mctp_dst *dst, struct sk_buff *skb)
 {
-	struct mctp_skb_cb *cb = mctp_cb(skb);
 	struct mctp_hdr *hdr = mctp_hdr(skb);
 	char daddr_buf[MAX_ADDR_LEN];
 	char *daddr = NULL;
 	int rc;
 
 	skb->protocol = htons(ETH_P_MCTP);
+	skb->pkt_type = PACKET_OUTGOING;
 
 	if (skb->len > dst->mtu) {
 		kfree_skb(skb);
 		return -EMSGSIZE;
 	}
 
-	/* If we're forwarding, we don't want to use the input path's cb,
-	 * as it holds the *source* hardware addressing information.
-	 *
-	 * We will have a PACKET_HOST skb from the dev, or PACKET_OUTGOING
-	 * from a socket; only use cb in the latter case.
-	 */
-	if (skb->pkt_type == PACKET_OUTGOING && cb->ifindex) {
-		/* direct route; use the hwaddr we stashed in sendmsg */
-		if (cb->halen != skb->dev->addr_len) {
+	/* direct route; use the hwaddr we stashed in sendmsg */
+	if (dst->halen) {
+		if (dst->halen != skb->dev->addr_len) {
 			/* sanity check, sendmsg should have already caught this */
 			kfree_skb(skb);
 			return -EMSGSIZE;
 		}
-		daddr = cb->haddr;
+		daddr = dst->haddr;
 	} else {
-		skb->pkt_type = PACKET_OUTGOING;
 		/* If lookup fails let the device handle daddr==NULL */
 		if (mctp_neigh_lookup(dst->dev, hdr->dest, daddr_buf) == 0)
 			daddr = daddr_buf;
@@ -1009,7 +1002,6 @@ int mctp_local_output(struct sock *sk, struct mctp_dst *dst,
 		      struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag)
 {
 	struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk);
-	struct mctp_skb_cb *cb = mctp_cb(skb);
 	struct mctp_sk_key *key;
 	struct mctp_hdr *hdr;
 	unsigned long flags;
@@ -1064,9 +1056,6 @@ int mctp_local_output(struct sock *sk, struct mctp_dst *dst,
 	skb_reset_network_header(skb);
 	skb->dev = dst->dev->dev;
 
-	/* cb->net will have been set on initial ingress */
-	cb->src = saddr;
-
 	/* set up common header fields */
 	hdr = mctp_hdr(skb);
 	hdr->ver = 1;
-- 
cgit v1.2.3


From ad39c12fcee34b8980a80ad5c803bca9906fbd4e Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@codeconstruct.com.au>
Date: Wed, 2 Jul 2025 14:20:13 +0800
Subject: net: mctp: add gateway routing support

This change allows for gateway routing, where a route table entry
may reference a routable endpoint (by network and EID), instead of
routing directly to a netdevice.

We add support for a RTM_GATEWAY attribute for netlink route updates,
with an attribute format of:

    struct mctp_fq_addr {
        unsigned int net;
        mctp_eid_t eid;
    }

- we need the net here to uniquely identify the target EID, as we no
longer have the device reference directly (which would provide the net
id in the case of direct routes).

This makes route lookups recursive, as a route lookup that returns a
gateway route must be resolved into a direct route (ie, to a device)
eventually. We provide a limit to the route lookups, to prevent infinite
loop routing.

The route lookup populates a new 'nexthop' field in the dst structure,
which now specifies the key for the neighbour table lookup on device
output, rather than using the packet destination address directly.

Signed-off-by: Jeremy Kerr <jk@codeconstruct.com.au>
Link: https://patch.msgid.link/20250702-dev-forwarding-v5-13-1468191da8a4@codeconstruct.com.au
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/mctp.h        |  13 ++-
 include/uapi/linux/mctp.h |   8 ++
 net/mctp/route.c          | 206 +++++++++++++++++++++++++++++++++-------------
 net/mctp/test/utils.c     |   3 +-
 4 files changed, 173 insertions(+), 57 deletions(-)

(limited to 'include')

diff --git a/include/net/mctp.h b/include/net/mctp.h
index b3af0690f607..ac4f4ecdfc24 100644
--- a/include/net/mctp.h
+++ b/include/net/mctp.h
@@ -237,8 +237,18 @@ struct mctp_route {
 	mctp_eid_t		min, max;
 
 	unsigned char		type;
+
 	unsigned int		mtu;
-	struct mctp_dev		*dev;
+
+	enum {
+		MCTP_ROUTE_DIRECT,
+		MCTP_ROUTE_GATEWAY,
+	} dst_type;
+	union {
+		struct mctp_dev	*dev;
+		struct mctp_fq_addr gateway;
+	};
+
 	int			(*output)(struct mctp_dst *dst,
 					  struct sk_buff *skb);
 
@@ -256,6 +266,7 @@ struct mctp_route {
 struct mctp_dst {
 	struct mctp_dev *dev;
 	unsigned int mtu;
+	mctp_eid_t nexthop;
 
 	/* set for direct addressing */
 	unsigned char halen;
diff --git a/include/uapi/linux/mctp.h b/include/uapi/linux/mctp.h
index e1db65df9359..19ad12a0cd4b 100644
--- a/include/uapi/linux/mctp.h
+++ b/include/uapi/linux/mctp.h
@@ -37,6 +37,14 @@ struct sockaddr_mctp_ext {
 	__u8			smctp_haddr[MAX_ADDR_LEN];
 };
 
+/* A "fully qualified" MCTP address, which includes the system-local network ID,
+ * required to uniquely resolve a routable EID.
+ */
+struct mctp_fq_addr {
+	unsigned int	net;
+	mctp_eid_t	eid;
+};
+
 #define MCTP_NET_ANY		0x0
 
 #define MCTP_ADDR_NULL		0x00
diff --git a/net/mctp/route.c b/net/mctp/route.c
index 5eca3ce0ba3c..a20d6b11d418 100644
--- a/net/mctp/route.c
+++ b/net/mctp/route.c
@@ -563,7 +563,6 @@ out:
 
 static int mctp_dst_output(struct mctp_dst *dst, struct sk_buff *skb)
 {
-	struct mctp_hdr *hdr = mctp_hdr(skb);
 	char daddr_buf[MAX_ADDR_LEN];
 	char *daddr = NULL;
 	int rc;
@@ -586,7 +585,7 @@ static int mctp_dst_output(struct mctp_dst *dst, struct sk_buff *skb)
 		daddr = dst->haddr;
 	} else {
 		/* If lookup fails let the device handle daddr==NULL */
-		if (mctp_neigh_lookup(dst->dev, hdr->dest, daddr_buf) == 0)
+		if (mctp_neigh_lookup(dst->dev, dst->nexthop, daddr_buf) == 0)
 			daddr = daddr_buf;
 	}
 
@@ -610,7 +609,8 @@ static int mctp_dst_output(struct mctp_dst *dst, struct sk_buff *skb)
 static void mctp_route_release(struct mctp_route *rt)
 {
 	if (refcount_dec_and_test(&rt->refs)) {
-		mctp_dev_put(rt->dev);
+		if (rt->dst_type == MCTP_ROUTE_DIRECT)
+			mctp_dev_put(rt->dev);
 		kfree_rcu(rt, rcu);
 	}
 }
@@ -799,10 +799,16 @@ static struct mctp_sk_key *mctp_lookup_prealloc_tag(struct mctp_sock *msk,
 }
 
 /* routing lookups */
+static unsigned int mctp_route_netid(struct mctp_route *rt)
+{
+	return rt->dst_type == MCTP_ROUTE_DIRECT ?
+		READ_ONCE(rt->dev->net) : rt->gateway.net;
+}
+
 static bool mctp_rt_match_eid(struct mctp_route *rt,
 			      unsigned int net, mctp_eid_t eid)
 {
-	return READ_ONCE(rt->dev->net) == net &&
+	return mctp_route_netid(rt) == net &&
 		rt->min <= eid && rt->max >= eid;
 }
 
@@ -811,16 +817,21 @@ static bool mctp_rt_compare_exact(struct mctp_route *rt1,
 				  struct mctp_route *rt2)
 {
 	ASSERT_RTNL();
-	return rt1->dev->net == rt2->dev->net &&
+	return mctp_route_netid(rt1) == mctp_route_netid(rt2) &&
 		rt1->min == rt2->min &&
 		rt1->max == rt2->max;
 }
 
-static void mctp_dst_from_route(struct mctp_dst *dst, struct mctp_route *route)
+/* must only be called on a direct route, as the final output hop */
+static void mctp_dst_from_route(struct mctp_dst *dst, mctp_eid_t eid,
+				unsigned int mtu, struct mctp_route *route)
 {
 	mctp_dev_hold(route->dev);
+	dst->nexthop = eid;
 	dst->dev = route->dev;
-	dst->mtu = route->mtu ?: READ_ONCE(dst->dev->dev->mtu);
+	dst->mtu = READ_ONCE(dst->dev->dev->mtu);
+	if (mtu)
+		dst->mtu = min(dst->mtu, mtu);
 	dst->halen = 0;
 	dst->output = route->output;
 }
@@ -854,6 +865,7 @@ int mctp_dst_from_extaddr(struct mctp_dst *dst, struct net *net, int ifindex,
 	dst->mtu = READ_ONCE(netdev->mtu);
 	dst->halen = halen;
 	dst->output = mctp_dst_output;
+	dst->nexthop = 0;
 	memcpy(dst->haddr, haddr, halen);
 
 	rc = 0;
@@ -868,24 +880,54 @@ void mctp_dst_release(struct mctp_dst *dst)
 	mctp_dev_put(dst->dev);
 }
 
+static struct mctp_route *mctp_route_lookup_single(struct net *net,
+						   unsigned int dnet,
+						   mctp_eid_t daddr)
+{
+	struct mctp_route *rt;
+
+	list_for_each_entry_rcu(rt, &net->mctp.routes, list) {
+		if (mctp_rt_match_eid(rt, dnet, daddr))
+			return rt;
+	}
+
+	return NULL;
+}
+
 /* populates *dst on successful lookup, if set */
 int mctp_route_lookup(struct net *net, unsigned int dnet,
 		      mctp_eid_t daddr, struct mctp_dst *dst)
 {
+	const unsigned int max_depth = 32;
+	unsigned int depth, mtu = 0;
 	int rc = -EHOSTUNREACH;
-	struct mctp_route *rt;
 
 	rcu_read_lock();
 
-	list_for_each_entry_rcu(rt, &net->mctp.routes, list) {
-		/* TODO: add metrics */
-		if (!mctp_rt_match_eid(rt, dnet, daddr))
-			continue;
+	for (depth = 0; depth < max_depth; depth++) {
+		struct mctp_route *rt;
 
-		if (dst)
-			mctp_dst_from_route(dst, rt);
-		rc = 0;
-		break;
+		rt = mctp_route_lookup_single(net, dnet, daddr);
+		if (!rt)
+			break;
+
+		/* clamp mtu to the smallest in the path, allowing 0
+		 * to specify no restrictions
+		 */
+		if (mtu && rt->mtu)
+			mtu = min(mtu, rt->mtu);
+		else
+			mtu = mtu ?: rt->mtu;
+
+		if (rt->dst_type == MCTP_ROUTE_DIRECT) {
+			if (dst)
+				mctp_dst_from_route(dst, daddr, mtu, rt);
+			rc = 0;
+			break;
+
+		} else if (rt->dst_type == MCTP_ROUTE_GATEWAY) {
+			daddr = rt->gateway.eid;
+		}
 	}
 
 	rcu_read_unlock();
@@ -902,10 +944,13 @@ static int mctp_route_lookup_null(struct net *net, struct net_device *dev,
 	rcu_read_lock();
 
 	list_for_each_entry_rcu(rt, &net->mctp.routes, list) {
-		if (rt->dev->dev != dev || rt->type != RTN_LOCAL)
+		if (rt->dst_type != MCTP_ROUTE_DIRECT || rt->type != RTN_LOCAL)
+			continue;
+
+		if (rt->dev->dev != dev)
 			continue;
 
-		mctp_dst_from_route(dst, rt);
+		mctp_dst_from_route(dst, 0, 0, rt);
 		rc = 0;
 		break;
 	}
@@ -1085,11 +1130,6 @@ out_release:
 	return rc;
 }
 
-static unsigned int mctp_route_netid(struct mctp_route *rt)
-{
-	return rt->dev->net;
-}
-
 /* route management */
 
 /* mctp_route_add(): Add the provided route, previously allocated via
@@ -1097,9 +1137,9 @@ static unsigned int mctp_route_netid(struct mctp_route *rt)
  * hold on rt->dev for usage in the route table. On failure a caller will want
  * to mctp_route_release().
  *
- * We expect that the caller has set rt->type, rt->min, rt->max, rt->dev and
- * rt->mtu, and that the route holds a reference to rt->dev (via mctp_dev_hold).
- * Other fields will be populated.
+ * We expect that the caller has set rt->type, rt->dst_type, rt->min, rt->max,
+ * rt->mtu and either rt->dev (with a reference held appropriately) or
+ * rt->gateway. Other fields will be populated.
  */
 static int mctp_route_add(struct net *net, struct mctp_route *rt)
 {
@@ -1108,7 +1148,10 @@ static int mctp_route_add(struct net *net, struct mctp_route *rt)
 	if (!mctp_address_unicast(rt->min) || !mctp_address_unicast(rt->max))
 		return -EINVAL;
 
-	if (!rt->dev)
+	if (rt->dst_type == MCTP_ROUTE_DIRECT && !rt->dev)
+		return -EINVAL;
+
+	if (rt->dst_type == MCTP_ROUTE_GATEWAY && !rt->gateway.eid)
 		return -EINVAL;
 
 	switch (rt->type) {
@@ -1177,6 +1220,7 @@ int mctp_route_add_local(struct mctp_dev *mdev, mctp_eid_t addr)
 
 	rt->min = addr;
 	rt->max = addr;
+	rt->dst_type = MCTP_ROUTE_DIRECT;
 	rt->dev = mdev;
 	rt->type = RTN_LOCAL;
 
@@ -1203,7 +1247,7 @@ void mctp_route_remove_dev(struct mctp_dev *mdev)
 
 	ASSERT_RTNL();
 	list_for_each_entry_safe(rt, tmp, &net->mctp.routes, list) {
-		if (rt->dev == mdev) {
+		if (rt->dst_type == MCTP_ROUTE_DIRECT && rt->dev == mdev) {
 			list_del_rcu(&rt->list);
 			/* TODO: immediate RTM_DELROUTE */
 			mctp_route_release(rt);
@@ -1296,21 +1340,28 @@ static const struct nla_policy rta_mctp_policy[RTA_MAX + 1] = {
 	[RTA_DST]		= { .type = NLA_U8 },
 	[RTA_METRICS]		= { .type = NLA_NESTED },
 	[RTA_OIF]		= { .type = NLA_U32 },
+	[RTA_GATEWAY]		= NLA_POLICY_EXACT_LEN(sizeof(struct mctp_fq_addr)),
 };
 
 static const struct nla_policy rta_metrics_policy[RTAX_MAX + 1] = {
 	[RTAX_MTU]		= { .type = NLA_U32 },
 };
 
-/* base parsing; common to both _lookup and _populate variants */
+/* base parsing; common to both _lookup and _populate variants.
+ *
+ * For gateway routes (which have a RTA_GATEWAY, and no RTA_OIF), we populate
+ * *gatewayp. For direct routes (RTA_OIF, no RTA_GATEWAY), we populate *mdev.
+ */
 static int mctp_route_nlparse_common(struct net *net, struct nlmsghdr *nlh,
 				     struct netlink_ext_ack *extack,
 				     struct nlattr **tb, struct rtmsg **rtm,
 				     struct mctp_dev **mdev,
+				     struct mctp_fq_addr *gatewayp,
 				     mctp_eid_t *daddr_start)
 {
+	struct mctp_fq_addr *gateway = NULL;
+	unsigned int ifindex = 0;
 	struct net_device *dev;
-	unsigned int ifindex;
 	int rc;
 
 	rc = nlmsg_parse(nlh, sizeof(struct rtmsg), tb, RTA_MAX,
@@ -1326,11 +1377,44 @@ static int mctp_route_nlparse_common(struct net *net, struct nlmsghdr *nlh,
 	}
 	*daddr_start = nla_get_u8(tb[RTA_DST]);
 
-	if (!tb[RTA_OIF]) {
-		NL_SET_ERR_MSG(extack, "ifindex missing");
+	if (tb[RTA_OIF])
+		ifindex = nla_get_u32(tb[RTA_OIF]);
+
+	if (tb[RTA_GATEWAY])
+		gateway = nla_data(tb[RTA_GATEWAY]);
+
+	if (ifindex && gateway) {
+		NL_SET_ERR_MSG(extack,
+			       "cannot specify both ifindex and gateway");
+		return -EINVAL;
+
+	} else if (ifindex) {
+		dev = __dev_get_by_index(net, ifindex);
+		if (!dev) {
+			NL_SET_ERR_MSG(extack, "bad ifindex");
+			return -ENODEV;
+		}
+		*mdev = mctp_dev_get_rtnl(dev);
+		if (!*mdev)
+			return -ENODEV;
+		gatewayp->eid = 0;
+
+	} else if (gateway) {
+		if (!mctp_address_unicast(gateway->eid)) {
+			NL_SET_ERR_MSG(extack, "bad gateway");
+			return -EINVAL;
+		}
+
+		gatewayp->eid = gateway->eid;
+		gatewayp->net = gateway->net != MCTP_NET_ANY ?
+			gateway->net :
+			READ_ONCE(net->mctp.default_net);
+		*mdev = NULL;
+
+	} else {
+		NL_SET_ERR_MSG(extack, "no route output provided");
 		return -EINVAL;
 	}
-	ifindex = nla_get_u32(tb[RTA_OIF]);
 
 	*rtm = nlmsg_data(nlh);
 	if ((*rtm)->rtm_family != AF_MCTP) {
@@ -1343,16 +1427,6 @@ static int mctp_route_nlparse_common(struct net *net, struct nlmsghdr *nlh,
 		return -EINVAL;
 	}
 
-	dev = __dev_get_by_index(net, ifindex);
-	if (!dev) {
-		NL_SET_ERR_MSG(extack, "bad ifindex");
-		return -ENODEV;
-	}
-
-	*mdev = mctp_dev_get_rtnl(dev);
-	if (!*mdev)
-		return -ENODEV;
-
 	return 0;
 }
 
@@ -1366,24 +1440,34 @@ static int mctp_route_nlparse_lookup(struct net *net, struct nlmsghdr *nlh,
 				     unsigned int *daddr_extent)
 {
 	struct nlattr *tb[RTA_MAX + 1];
+	struct mctp_fq_addr gw;
 	struct mctp_dev *mdev;
 	struct rtmsg *rtm;
 	int rc;
 
 	rc = mctp_route_nlparse_common(net, nlh, extack, tb, &rtm,
-				       &mdev, daddr_start);
+				       &mdev, &gw, daddr_start);
 	if (rc)
 		return rc;
 
-	*netid = mdev->net;
+	if (mdev) {
+		*netid = mdev->net;
+	} else if (gw.eid) {
+		*netid = gw.net;
+	} else {
+		/* bug: _nlparse_common should not allow this */
+		return -1;
+	}
+
 	*type = rtm->rtm_type;
 	*daddr_extent = rtm->rtm_dst_len;
 
 	return 0;
 }
 
-/* Full route parse for RTM_NEWROUTE: populate @rt. On success, the route will
- * hold a reference to the dev.
+/* Full route parse for RTM_NEWROUTE: populate @rt. On success,
+ * MCTP_ROUTE_DIRECT routes (ie, those with a direct dev) will hold a reference
+ * to that dev.
  */
 static int mctp_route_nlparse_populate(struct net *net, struct nlmsghdr *nlh,
 				       struct netlink_ext_ack *extack,
@@ -1392,6 +1476,7 @@ static int mctp_route_nlparse_populate(struct net *net, struct nlmsghdr *nlh,
 	struct nlattr *tbx[RTAX_MAX + 1];
 	struct nlattr *tb[RTA_MAX + 1];
 	unsigned int daddr_extent;
+	struct mctp_fq_addr gw;
 	mctp_eid_t daddr_start;
 	struct mctp_dev *dev;
 	struct rtmsg *rtm;
@@ -1399,7 +1484,7 @@ static int mctp_route_nlparse_populate(struct net *net, struct nlmsghdr *nlh,
 	int rc;
 
 	rc = mctp_route_nlparse_common(net, nlh, extack, tb, &rtm,
-				       &dev, &daddr_start);
+				       &dev, &gw, &daddr_start);
 	if (rc)
 		return rc;
 
@@ -1425,8 +1510,15 @@ static int mctp_route_nlparse_populate(struct net *net, struct nlmsghdr *nlh,
 	rt->min = daddr_start;
 	rt->max = daddr_start + daddr_extent;
 	rt->mtu = mtu;
-	rt->dev = dev;
-	mctp_dev_hold(rt->dev);
+	if (gw.eid) {
+		rt->dst_type = MCTP_ROUTE_GATEWAY;
+		rt->gateway.eid = gw.eid;
+		rt->gateway.net = gw.net;
+	} else {
+		rt->dst_type = MCTP_ROUTE_DIRECT;
+		rt->dev = dev;
+		mctp_dev_hold(rt->dev);
+	}
 
 	return 0;
 }
@@ -1446,7 +1538,8 @@ static int mctp_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (rc < 0)
 		goto err_free;
 
-	if (rt->dev->dev->flags & IFF_LOOPBACK) {
+	if (rt->dst_type == MCTP_ROUTE_DIRECT &&
+	    rt->dev->dev->flags & IFF_LOOPBACK) {
 		NL_SET_ERR_MSG(extack, "no routes to loopback");
 		rc = -EINVAL;
 		goto err_free;
@@ -1505,7 +1598,6 @@ static int mctp_fill_rtinfo(struct sk_buff *skb, struct mctp_route *rt,
 	hdr->rtm_tos = 0;
 	hdr->rtm_table = RT_TABLE_DEFAULT;
 	hdr->rtm_protocol = RTPROT_STATIC; /* everything is user-defined */
-	hdr->rtm_scope = RT_SCOPE_LINK; /* TODO: scope in mctp_route? */
 	hdr->rtm_type = rt->type;
 
 	if (nla_put_u8(skb, RTA_DST, rt->min))
@@ -1522,13 +1614,17 @@ static int mctp_fill_rtinfo(struct sk_buff *skb, struct mctp_route *rt,
 
 	nla_nest_end(skb, metrics);
 
-	if (rt->dev) {
+	if (rt->dst_type == MCTP_ROUTE_DIRECT) {
+		hdr->rtm_scope = RT_SCOPE_LINK;
 		if (nla_put_u32(skb, RTA_OIF, rt->dev->dev->ifindex))
 			goto cancel;
+	} else if (rt->dst_type == MCTP_ROUTE_GATEWAY) {
+		hdr->rtm_scope = RT_SCOPE_UNIVERSE;
+		if (nla_put(skb, RTA_GATEWAY,
+			    sizeof(rt->gateway), &rt->gateway))
+			goto cancel;
 	}
 
-	/* TODO: conditional neighbour physaddr? */
-
 	nlmsg_end(skb, nlh);
 
 	return 0;
diff --git a/net/mctp/test/utils.c b/net/mctp/test/utils.c
index 6b4dc40d882c..97b05e340586 100644
--- a/net/mctp/test/utils.c
+++ b/net/mctp/test/utils.c
@@ -134,6 +134,7 @@ struct mctp_test_route *mctp_test_create_route(struct net *net,
 	rt->rt.max = eid;
 	rt->rt.mtu = mtu;
 	rt->rt.type = RTN_UNSPEC;
+	rt->rt.dst_type = MCTP_ROUTE_DIRECT;
 	if (dev)
 		mctp_dev_hold(dev);
 	rt->rt.dev = dev;
@@ -176,7 +177,7 @@ void mctp_test_route_destroy(struct kunit *test, struct mctp_test_route *rt)
 	list_del_rcu(&rt->rt.list);
 	rtnl_unlock();
 
-	if (rt->rt.dev)
+	if (rt->rt.dst_type == MCTP_ROUTE_DIRECT && rt->rt.dev)
 		mctp_dev_put(rt->rt.dev);
 
 	refs = refcount_read(&rt->rt.refs);
-- 
cgit v1.2.3


From 84a7d6797e6a03705e6b48c613fa424662049d87 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 2 Jul 2025 07:12:30 +0000
Subject: net/sched: act_api: no longer acquire RTNL in tc_action_net_exit()

tc_action_net_exit() got an rtnl exclusion in commit
a159d3c4b829 ("net_sched: acquire RTNL in tc_action_net_exit()")

Since then, commit 16af6067392c ("net: sched: implement reference
counted action release") made this RTNL exclusion obsolete for
most cases.

Only tcf_action_offload_del() might still require it.

Move the rtnl locking into tcf_idrinfo_destroy() when
an offload action is found.

Most netns do not have actions, yet deleting them is adding a lot
of pressure on RTNL, which is for many the most contended mutex
in the kernel.

We are moving to a per-netns 'rtnl', so tc_action_net_exit()
will not be able to grab 'rtnl' a single time for a batch of netns.

Before the patch:

perf probe -a rtnl_lock

perf record -e probe:rtnl_lock -a /bin/bash -c 'unshare -n "/bin/true"; sleep 1'
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.305 MB perf.data (25 samples) ]

After the patch:

perf record -e probe:rtnl_lock -a /bin/bash -c 'unshare -n "/bin/true"; sleep 1'
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.304 MB perf.data (9 samples) ]

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Vlad Buslov <vladbu@nvidia.com>
Cc: Jiri Pirko <jiri@resnulli.us>
Cc: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Link: https://patch.msgid.link/20250702071230.1892674-1-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/act_api.h | 2 --
 net/sched/act_api.c   | 9 ++++++++-
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 404df8557f6a..04781c92b43d 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -170,14 +170,12 @@ static inline void tc_action_net_exit(struct list_head *net_list,
 {
 	struct net *net;
 
-	rtnl_lock();
 	list_for_each_entry(net, net_list, exit_list) {
 		struct tc_action_net *tn = net_generic(net, id);
 
 		tcf_idrinfo_destroy(tn->ops, tn->idrinfo);
 		kfree(tn->idrinfo);
 	}
-	rtnl_unlock();
 }
 
 int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb,
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 057e20cef375..9e468e463467 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -933,18 +933,25 @@ void tcf_idrinfo_destroy(const struct tc_action_ops *ops,
 			 struct tcf_idrinfo *idrinfo)
 {
 	struct idr *idr = &idrinfo->action_idr;
+	bool mutex_taken = false;
 	struct tc_action *p;
-	int ret;
 	unsigned long id = 1;
 	unsigned long tmp;
+	int ret;
 
 	idr_for_each_entry_ul(idr, p, tmp, id) {
+		if (tc_act_in_hw(p) && !mutex_taken) {
+			rtnl_lock();
+			mutex_taken = true;
+		}
 		ret = __tcf_idr_release(p, false, true);
 		if (ret == ACT_P_DELETED)
 			module_put(ops->owner);
 		else if (ret < 0)
 			return;
 	}
+	if (mutex_taken)
+		rtnl_unlock();
 	idr_destroy(&idrinfo->action_idr);
 }
 EXPORT_SYMBOL(tcf_idrinfo_destroy);
-- 
cgit v1.2.3


From b441cf3f8c4b8576639d20c8eb4aa32917602ecd Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Fri, 4 Jul 2025 16:54:33 +0200
Subject: xfrm: delete x->tunnel as we delete x

The ipcomp fallback tunnels currently get deleted (from the various
lists and hashtables) as the last user state that needed that fallback
is destroyed (not deleted). If a reference to that user state still
exists, the fallback state will remain on the hashtables/lists,
triggering the WARN in xfrm_state_fini. Because of those remaining
references, the fix in commit f75a2804da39 ("xfrm: destroy xfrm_state
synchronously on net exit path") is not complete.

We recently fixed one such situation in TCP due to deferred freeing of
skbs (commit 9b6412e6979f ("tcp: drop secpath at the same time as we
currently drop dst")). This can also happen due to IP reassembly: skbs
with a secpath remain on the reassembly queue until netns
destruction. If we can't guarantee that the queues are flushed by the
time xfrm_state_fini runs, there may still be references to a (user)
xfrm_state, preventing the timely deletion of the corresponding
fallback state.

Instead of chasing each instance of skbs holding a secpath one by one,
this patch fixes the issue directly within xfrm, by deleting the
fallback state as soon as the last user state depending on it has been
deleted. Destruction will still happen when the final reference is
dropped.

A separate lockdep class for the fallback state is required since
we're going to lock x->tunnel while x is locked.

Fixes: 9d4139c76905 ("netns xfrm: per-netns xfrm_state_all list")
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h      |  1 -
 net/ipv4/ipcomp.c       |  2 ++
 net/ipv6/ipcomp6.c      |  2 ++
 net/ipv6/xfrm6_tunnel.c |  2 +-
 net/xfrm/xfrm_ipcomp.c  |  1 -
 net/xfrm/xfrm_state.c   | 19 ++++++++-----------
 6 files changed, 13 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index e45a275fca26..91d52a380e37 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -441,7 +441,6 @@ int xfrm_input_register_afinfo(const struct xfrm_input_afinfo *afinfo);
 int xfrm_input_unregister_afinfo(const struct xfrm_input_afinfo *afinfo);
 
 void xfrm_flush_gc(void);
-void xfrm_state_delete_tunnel(struct xfrm_state *x);
 
 struct xfrm_type {
 	struct module		*owner;
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 5a4fb2539b08..9a45aed508d1 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -54,6 +54,7 @@ static int ipcomp4_err(struct sk_buff *skb, u32 info)
 }
 
 /* We always hold one tunnel user reference to indicate a tunnel */
+static struct lock_class_key xfrm_state_lock_key;
 static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
 {
 	struct net *net = xs_net(x);
@@ -62,6 +63,7 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
 	t = xfrm_state_alloc(net);
 	if (!t)
 		goto out;
+	lockdep_set_class(&t->lock, &xfrm_state_lock_key);
 
 	t->id.proto = IPPROTO_IPIP;
 	t->id.spi = x->props.saddr.a4;
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 72d4858dec18..8607569de34f 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -71,6 +71,7 @@ static int ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	return 0;
 }
 
+static struct lock_class_key xfrm_state_lock_key;
 static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x)
 {
 	struct net *net = xs_net(x);
@@ -79,6 +80,7 @@ static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x)
 	t = xfrm_state_alloc(net);
 	if (!t)
 		goto out;
+	lockdep_set_class(&t->lock, &xfrm_state_lock_key);
 
 	t->id.proto = IPPROTO_IPV6;
 	t->id.spi = xfrm6_tunnel_alloc_spi(net, (xfrm_address_t *)&x->props.saddr);
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index bf140ef781c1..7fd8bc08e6eb 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -334,8 +334,8 @@ static void __net_exit xfrm6_tunnel_net_exit(struct net *net)
 	struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net);
 	unsigned int i;
 
-	xfrm_flush_gc();
 	xfrm_state_flush(net, 0, false, true);
+	xfrm_flush_gc();
 
 	for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++)
 		WARN_ON_ONCE(!hlist_empty(&xfrm6_tn->spi_byaddr[i]));
diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c
index a38545413b80..43fdc6ed8dd1 100644
--- a/net/xfrm/xfrm_ipcomp.c
+++ b/net/xfrm/xfrm_ipcomp.c
@@ -313,7 +313,6 @@ void ipcomp_destroy(struct xfrm_state *x)
 	struct ipcomp_data *ipcd = x->data;
 	if (!ipcd)
 		return;
-	xfrm_state_delete_tunnel(x);
 	ipcomp_free_data(ipcd);
 	kfree(ipcd);
 }
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index c7e6472c623d..f7110a658897 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -811,6 +811,7 @@ void __xfrm_state_destroy(struct xfrm_state *x, bool sync)
 }
 EXPORT_SYMBOL(__xfrm_state_destroy);
 
+static void xfrm_state_delete_tunnel(struct xfrm_state *x);
 int __xfrm_state_delete(struct xfrm_state *x)
 {
 	struct net *net = xs_net(x);
@@ -838,6 +839,8 @@ int __xfrm_state_delete(struct xfrm_state *x)
 
 		xfrm_dev_state_delete(x);
 
+		xfrm_state_delete_tunnel(x);
+
 		/* All xfrm_state objects are created by xfrm_state_alloc.
 		 * The xfrm_state_alloc call gives a reference, and that
 		 * is what we are dropping here.
@@ -941,10 +944,7 @@ restart:
 				err = xfrm_state_delete(x);
 				xfrm_audit_state_delete(x, err ? 0 : 1,
 							task_valid);
-				if (sync)
-					xfrm_state_put_sync(x);
-				else
-					xfrm_state_put(x);
+				xfrm_state_put(x);
 				if (!err)
 					cnt++;
 
@@ -3068,20 +3068,17 @@ void xfrm_flush_gc(void)
 }
 EXPORT_SYMBOL(xfrm_flush_gc);
 
-/* Temporarily located here until net/xfrm/xfrm_tunnel.c is created */
-void xfrm_state_delete_tunnel(struct xfrm_state *x)
+static void xfrm_state_delete_tunnel(struct xfrm_state *x)
 {
 	if (x->tunnel) {
 		struct xfrm_state *t = x->tunnel;
 
-		if (atomic_read(&t->tunnel_users) == 2)
+		if (atomic_dec_return(&t->tunnel_users) == 1)
 			xfrm_state_delete(t);
-		atomic_dec(&t->tunnel_users);
-		xfrm_state_put_sync(t);
+		xfrm_state_put(t);
 		x->tunnel = NULL;
 	}
 }
-EXPORT_SYMBOL(xfrm_state_delete_tunnel);
 
 u32 xfrm_state_mtu(struct xfrm_state *x, int mtu)
 {
@@ -3286,8 +3283,8 @@ void xfrm_state_fini(struct net *net)
 	unsigned int sz;
 
 	flush_work(&net->xfrm.state_hash_work);
-	flush_work(&xfrm_state_gc_work);
 	xfrm_state_flush(net, 0, false, true);
+	flush_work(&xfrm_state_gc_work);
 
 	WARN_ON(!list_empty(&net->xfrm.state_all));
 
-- 
cgit v1.2.3


From 2a198bbec6913ae1c90ec963750003c6213668c7 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Fri, 4 Jul 2025 16:54:34 +0200
Subject: Revert "xfrm: destroy xfrm_state synchronously on net exit path"

This reverts commit f75a2804da391571563c4b6b29e7797787332673.

With all states (whether user or kern) removed from the hashtables
during deletion, there's no need for synchronous destruction of
states. xfrm6_tunnel states still need to have been destroyed (which
will be the case when its last user is deleted (not destroyed)) so
that xfrm6_tunnel_free_spi removes it from the per-netns hashtable
before the netns is destroyed.

This has the benefit of skipping one synchronize_rcu per state (in
__xfrm_state_destroy(sync=true)) when we exit a netns.

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h      | 12 +++---------
 net/ipv6/xfrm6_tunnel.c |  2 +-
 net/key/af_key.c        |  2 +-
 net/xfrm/xfrm_state.c   | 23 +++++++++--------------
 net/xfrm/xfrm_user.c    |  2 +-
 5 files changed, 15 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 91d52a380e37..f3014e4f54fc 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -915,7 +915,7 @@ static inline void xfrm_pols_put(struct xfrm_policy **pols, int npols)
 		xfrm_pol_put(pols[i]);
 }
 
-void __xfrm_state_destroy(struct xfrm_state *, bool);
+void __xfrm_state_destroy(struct xfrm_state *);
 
 static inline void __xfrm_state_put(struct xfrm_state *x)
 {
@@ -925,13 +925,7 @@ static inline void __xfrm_state_put(struct xfrm_state *x)
 static inline void xfrm_state_put(struct xfrm_state *x)
 {
 	if (refcount_dec_and_test(&x->refcnt))
-		__xfrm_state_destroy(x, false);
-}
-
-static inline void xfrm_state_put_sync(struct xfrm_state *x)
-{
-	if (refcount_dec_and_test(&x->refcnt))
-		__xfrm_state_destroy(x, true);
+		__xfrm_state_destroy(x);
 }
 
 static inline void xfrm_state_hold(struct xfrm_state *x)
@@ -1769,7 +1763,7 @@ struct xfrmk_spdinfo {
 
 struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num);
 int xfrm_state_delete(struct xfrm_state *x);
-int xfrm_state_flush(struct net *net, u8 proto, bool task_valid, bool sync);
+int xfrm_state_flush(struct net *net, u8 proto, bool task_valid);
 int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid);
 int xfrm_dev_policy_flush(struct net *net, struct net_device *dev,
 			  bool task_valid);
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index 7fd8bc08e6eb..5120a763da0d 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -334,7 +334,7 @@ static void __net_exit xfrm6_tunnel_net_exit(struct net *net)
 	struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net);
 	unsigned int i;
 
-	xfrm_state_flush(net, 0, false, true);
+	xfrm_state_flush(net, IPSEC_PROTO_ANY, false);
 	xfrm_flush_gc();
 
 	for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++)
diff --git a/net/key/af_key.c b/net/key/af_key.c
index efc2a91f4c48..b5d761700776 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -1766,7 +1766,7 @@ static int pfkey_flush(struct sock *sk, struct sk_buff *skb, const struct sadb_m
 	if (proto == 0)
 		return -EINVAL;
 
-	err = xfrm_state_flush(net, proto, true, false);
+	err = xfrm_state_flush(net, proto, true);
 	err2 = unicast_flush_resp(sk, hdr);
 	if (err || err2) {
 		if (err == -ESRCH) /* empty table - go quietly */
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index f7110a658897..327a1a6f892c 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -592,7 +592,7 @@ void xfrm_state_free(struct xfrm_state *x)
 }
 EXPORT_SYMBOL(xfrm_state_free);
 
-static void ___xfrm_state_destroy(struct xfrm_state *x)
+static void xfrm_state_gc_destroy(struct xfrm_state *x)
 {
 	if (x->mode_cbs && x->mode_cbs->destroy_state)
 		x->mode_cbs->destroy_state(x);
@@ -631,7 +631,7 @@ static void xfrm_state_gc_task(struct work_struct *work)
 	synchronize_rcu();
 
 	hlist_for_each_entry_safe(x, tmp, &gc_list, gclist)
-		___xfrm_state_destroy(x);
+		xfrm_state_gc_destroy(x);
 }
 
 static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
@@ -795,19 +795,14 @@ void xfrm_dev_state_free(struct xfrm_state *x)
 }
 #endif
 
-void __xfrm_state_destroy(struct xfrm_state *x, bool sync)
+void __xfrm_state_destroy(struct xfrm_state *x)
 {
 	WARN_ON(x->km.state != XFRM_STATE_DEAD);
 
-	if (sync) {
-		synchronize_rcu();
-		___xfrm_state_destroy(x);
-	} else {
-		spin_lock_bh(&xfrm_state_gc_lock);
-		hlist_add_head(&x->gclist, &xfrm_state_gc_list);
-		spin_unlock_bh(&xfrm_state_gc_lock);
-		schedule_work(&xfrm_state_gc_work);
-	}
+	spin_lock_bh(&xfrm_state_gc_lock);
+	hlist_add_head(&x->gclist, &xfrm_state_gc_list);
+	spin_unlock_bh(&xfrm_state_gc_lock);
+	schedule_work(&xfrm_state_gc_work);
 }
 EXPORT_SYMBOL(__xfrm_state_destroy);
 
@@ -922,7 +917,7 @@ xfrm_dev_state_flush_secctx_check(struct net *net, struct net_device *dev, bool
 }
 #endif
 
-int xfrm_state_flush(struct net *net, u8 proto, bool task_valid, bool sync)
+int xfrm_state_flush(struct net *net, u8 proto, bool task_valid)
 {
 	int i, err = 0, cnt = 0;
 
@@ -3283,7 +3278,7 @@ void xfrm_state_fini(struct net *net)
 	unsigned int sz;
 
 	flush_work(&net->xfrm.state_hash_work);
-	xfrm_state_flush(net, 0, false, true);
+	xfrm_state_flush(net, IPSEC_PROTO_ANY, false);
 	flush_work(&xfrm_state_gc_work);
 
 	WARN_ON(!list_empty(&net->xfrm.state_all));
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 1db18f470f42..684239018bec 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -2635,7 +2635,7 @@ static int xfrm_flush_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
 	struct xfrm_usersa_flush *p = nlmsg_data(nlh);
 	int err;
 
-	err = xfrm_state_flush(net, p->proto, true, false);
+	err = xfrm_state_flush(net, p->proto, true);
 	if (err) {
 		if (err == -ESRCH) /* empty table */
 			return 0;
-- 
cgit v1.2.3


From e22da4685013922ade8e7b5e727ac6804fe5b51e Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@kernel.org>
Date: Tue, 1 Jul 2025 16:46:57 +0200
Subject: net/handshake: Add new parameter 'HANDSHAKE_A_ACCEPT_KEYRING'

Add a new netlink parameter 'HANDSHAKE_A_ACCEPT_KEYRING' to provide
the serial number of the keyring to use.

Signed-off-by: Hannes Reinecke <hare@kernel.org>
Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Link: https://patch.msgid.link/20250701144657.104401-1-hare@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 Documentation/netlink/specs/handshake.yaml | 4 ++++
 include/uapi/linux/handshake.h             | 1 +
 net/handshake/tlshd.c                      | 6 ++++++
 3 files changed, 11 insertions(+)

(limited to 'include')

diff --git a/Documentation/netlink/specs/handshake.yaml b/Documentation/netlink/specs/handshake.yaml
index 39ed1661c7f1..95c3fade7a8d 100644
--- a/Documentation/netlink/specs/handshake.yaml
+++ b/Documentation/netlink/specs/handshake.yaml
@@ -71,6 +71,9 @@ attribute-sets:
       -
         name: peername
         type: string
+      -
+        name: keyring
+        type: u32
   -
     name: done
     attributes:
@@ -109,6 +112,7 @@ operations:
             - peer-identity
             - certificate
             - peername
+            - keyring
     -
       name: done
       doc: Handler reports handshake completion
diff --git a/include/uapi/linux/handshake.h b/include/uapi/linux/handshake.h
index 3d7ea58778c9..662e7de46c54 100644
--- a/include/uapi/linux/handshake.h
+++ b/include/uapi/linux/handshake.h
@@ -45,6 +45,7 @@ enum {
 	HANDSHAKE_A_ACCEPT_PEER_IDENTITY,
 	HANDSHAKE_A_ACCEPT_CERTIFICATE,
 	HANDSHAKE_A_ACCEPT_PEERNAME,
+	HANDSHAKE_A_ACCEPT_KEYRING,
 
 	__HANDSHAKE_A_ACCEPT_MAX,
 	HANDSHAKE_A_ACCEPT_MAX = (__HANDSHAKE_A_ACCEPT_MAX - 1)
diff --git a/net/handshake/tlshd.c b/net/handshake/tlshd.c
index d6f52839827e..081093dfd553 100644
--- a/net/handshake/tlshd.c
+++ b/net/handshake/tlshd.c
@@ -230,6 +230,12 @@ static int tls_handshake_accept(struct handshake_req *req,
 		if (ret < 0)
 			goto out_cancel;
 	}
+	if (treq->th_keyring) {
+		ret = nla_put_u32(msg, HANDSHAKE_A_ACCEPT_KEYRING,
+				  treq->th_keyring);
+		if (ret < 0)
+			goto out_cancel;
+	}
 
 	ret = nla_put_u32(msg, HANDSHAKE_A_ACCEPT_AUTH_MODE,
 			  treq->th_auth_mode);
-- 
cgit v1.2.3


From 70b9c0c11e55167b9552ef395bc00f4920299177 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Date: Mon, 30 Jun 2025 15:02:18 +0200
Subject: uapi: bitops: use UAPI-safe variant of BITS_PER_LONG again (2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

BITS_PER_LONG does not exist in UAPI headers, so can't be used by the UAPI
__GENMASK(). Instead __BITS_PER_LONG needs to be used.

When __GENMASK() was introduced in commit 3c7a8e190bc5 ("uapi: introduce uapi-friendly macros for GENMASK"),
the code was fine. A broken revert in 1e7933a575ed ("uapi: Revert "bitops: avoid integer overflow in GENMASK(_ULL)"")
introduced the incorrect usage of BITS_PER_LONG.
That was fixed in commit 11fcf368506d ("uapi: bitops: use UAPI-safe variant of BITS_PER_LONG again").
But a broken sync of the kernel headers with the tools/ headers in
commit fc92099902fb ("tools headers: Synchronize linux/bits.h with the kernel sources")
undid the fix.

Reapply the fix and while at it also fix the tools header.

Fixes: fc92099902fb ("tools headers: Synchronize linux/bits.h with the kernel sources")
Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Acked-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
---
 include/uapi/linux/bits.h       | 4 ++--
 tools/include/uapi/linux/bits.h | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bits.h b/include/uapi/linux/bits.h
index 682b406e1067..a04afef9efca 100644
--- a/include/uapi/linux/bits.h
+++ b/include/uapi/linux/bits.h
@@ -4,9 +4,9 @@
 #ifndef _UAPI_LINUX_BITS_H
 #define _UAPI_LINUX_BITS_H
 
-#define __GENMASK(h, l) (((~_UL(0)) << (l)) & (~_UL(0) >> (BITS_PER_LONG - 1 - (h))))
+#define __GENMASK(h, l) (((~_UL(0)) << (l)) & (~_UL(0) >> (__BITS_PER_LONG - 1 - (h))))
 
-#define __GENMASK_ULL(h, l) (((~_ULL(0)) << (l)) & (~_ULL(0) >> (BITS_PER_LONG_LONG - 1 - (h))))
+#define __GENMASK_ULL(h, l) (((~_ULL(0)) << (l)) & (~_ULL(0) >> (__BITS_PER_LONG_LONG - 1 - (h))))
 
 #define __GENMASK_U128(h, l) \
 	((_BIT128((h)) << 1) - (_BIT128(l)))
diff --git a/tools/include/uapi/linux/bits.h b/tools/include/uapi/linux/bits.h
index 682b406e1067..a04afef9efca 100644
--- a/tools/include/uapi/linux/bits.h
+++ b/tools/include/uapi/linux/bits.h
@@ -4,9 +4,9 @@
 #ifndef _UAPI_LINUX_BITS_H
 #define _UAPI_LINUX_BITS_H
 
-#define __GENMASK(h, l) (((~_UL(0)) << (l)) & (~_UL(0) >> (BITS_PER_LONG - 1 - (h))))
+#define __GENMASK(h, l) (((~_UL(0)) << (l)) & (~_UL(0) >> (__BITS_PER_LONG - 1 - (h))))
 
-#define __GENMASK_ULL(h, l) (((~_ULL(0)) << (l)) & (~_ULL(0) >> (BITS_PER_LONG_LONG - 1 - (h))))
+#define __GENMASK_ULL(h, l) (((~_ULL(0)) << (l)) & (~_ULL(0) >> (__BITS_PER_LONG_LONG - 1 - (h))))
 
 #define __GENMASK_U128(h, l) \
 	((_BIT128((h)) << 1) - (_BIT128(l)))
-- 
cgit v1.2.3


From 25489a4f556414445d342951615178368ee45cde Mon Sep 17 00:00:00 2001
From: Michal Luczaj <mhal@rbox.co>
Date: Wed, 2 Jul 2025 15:38:08 +0200
Subject: net: splice: Drop unused @gfp

Since its introduction in commit 2e910b95329c ("net: Add a function to
splice pages into an skbuff for MSG_SPLICE_PAGES"), skb_splice_from_iter()
never used the @gfp argument. Remove it and adapt callers.

No functional change intended.

Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Michal Luczaj <mhal@rbox.co>
Link: https://patch.msgid.link/20250702-splice-drop-unused-v3-2-55f68b60d2b7@rbox.co
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c | 3 +--
 include/linux/skbuff.h                                      | 2 +-
 net/core/skbuff.c                                           | 3 +--
 net/ipv4/ip_output.c                                        | 3 +--
 net/ipv4/tcp.c                                              | 3 +--
 net/ipv6/ip6_output.c                                       | 3 +--
 net/kcm/kcmsock.c                                           | 3 +--
 net/unix/af_unix.c                                          | 3 +--
 8 files changed, 8 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c
index d567e42e1760..465fa8077964 100644
--- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c
+++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_io.c
@@ -1096,8 +1096,7 @@ new_buf:
 			copy = size;
 
 		if (msg->msg_flags & MSG_SPLICE_PAGES) {
-			err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
-						   sk->sk_allocation);
+			err = skb_splice_from_iter(skb, &msg->msg_iter, copy);
 			if (err < 0) {
 				if (err == -EMSGSIZE)
 					goto new_buf;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 4f6dcb37bae8..b8b06e71b73e 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -5265,7 +5265,7 @@ static inline void skb_mark_for_recycle(struct sk_buff *skb)
 }
 
 ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
-			     ssize_t maxsize, gfp_t gfp);
+			     ssize_t maxsize);
 
 #endif	/* __KERNEL__ */
 #endif	/* _LINUX_SKBUFF_H */
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index ae0f1aae3c91..a34fe37cf7a5 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -7230,7 +7230,6 @@ static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
  * @skb: The buffer to add pages to
  * @iter: Iterator representing the pages to be added
  * @maxsize: Maximum amount of pages to be added
- * @gfp: Allocation flags
  *
  * This is a common helper function for supporting MSG_SPLICE_PAGES.  It
  * extracts pages from an iterator and adds them to the socket buffer if
@@ -7241,7 +7240,7 @@ static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
  * insufficient space in the buffer to transfer anything.
  */
 ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
-			     ssize_t maxsize, gfp_t gfp)
+			     ssize_t maxsize)
 {
 	size_t frag_limit = READ_ONCE(net_hotdata.sysctl_max_skb_frags);
 	struct page *pages[8], **ppages = pages;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 414b47a0d513..10a1d182fd84 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1222,8 +1222,7 @@ alloc_new_skb:
 			if (WARN_ON_ONCE(copy > msg->msg_iter.count))
 				goto error;
 
-			err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
-						   sk->sk_allocation);
+			err = skb_splice_from_iter(skb, &msg->msg_iter, copy);
 			if (err < 0)
 				goto error;
 			copy = err;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 925b2c572ca2..860223c6f124 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1295,8 +1295,7 @@ new_segment:
 			if (!copy)
 				goto wait_for_space;
 
-			err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
-						   sk->sk_allocation);
+			err = skb_splice_from_iter(skb, &msg->msg_iter, copy);
 			if (err < 0) {
 				if (err == -EMSGSIZE) {
 					tcp_mark_push(tp, skb);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 877bee7ffee9..fcc20c7250eb 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1762,8 +1762,7 @@ alloc_new_skb:
 			if (WARN_ON_ONCE(copy > msg->msg_iter.count))
 				goto error;
 
-			err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
-						   sk->sk_allocation);
+			err = skb_splice_from_iter(skb, &msg->msg_iter, copy);
 			if (err < 0)
 				goto error;
 			copy = err;
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 24aec295a51c..a0be3896a934 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -835,8 +835,7 @@ start:
 			if (!sk_wmem_schedule(sk, copy))
 				goto wait_for_memory;
 
-			err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
-						   sk->sk_allocation);
+			err = skb_splice_from_iter(skb, &msg->msg_iter, copy);
 			if (err < 0) {
 				if (err == -EMSGSIZE)
 					goto wait_for_memory;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 564c970d97ff..cd0d582bc7d4 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2388,8 +2388,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
 
 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
 			skb->ip_summed = CHECKSUM_UNNECESSARY;
-			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
-						   sk->sk_allocation);
+			err = skb_splice_from_iter(skb, &msg->msg_iter, size);
 			if (err < 0)
 				goto out_free;
 
-- 
cgit v1.2.3


From c523058713abac66b0d83ae12a0574d76cd7df2b Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Thu, 3 Jul 2025 07:55:52 +0200
Subject: net: phy: declare package-related struct members only if
 CONFIG_PHY_PACKAGE is enabled

Now that we have a dedicated config symbol for the PHY package module,
we can use it to reduce the size of these structs if it isn't enabled.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://patch.msgid.link/f0daefa4-406a-4a06-a4f0-7e31309f82bc@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/phy.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 74c1bcf64b3c..543a94751a6b 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -409,8 +409,10 @@ struct mii_bus {
 	/** @shared_lock: protect access to the shared element */
 	struct mutex shared_lock;
 
+#if IS_ENABLED(CONFIG_PHY_PACKAGE)
 	/** @shared: shared state across different PHYs */
 	struct phy_package_shared *shared[PHY_MAX_ADDR];
+#endif
 };
 #define to_mii_bus(d) container_of(d, struct mii_bus, dev)
 
@@ -718,9 +720,11 @@ struct phy_device {
 	/* For use by PHYs to maintain extra state */
 	void *priv;
 
+#if IS_ENABLED(CONFIG_PHY_PACKAGE)
 	/* shared data pointer */
 	/* For use by PHYs inside the same package that need a shared state. */
 	struct phy_package_shared *shared;
+#endif
 
 	/* Reporting cable test results */
 	struct sk_buff *skb;
-- 
cgit v1.2.3


From e7d4c1c5a54648fd5b787a4a0f81521ec7260bcd Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 8 Jul 2025 17:54:49 +0200
Subject: virtio: introduce extended features

The virtio specification allows for up to 128 bits for the
device features. Soon we are going to use some of the 'extended'
feature bits (above 64) for the virtio_net driver.

Introduce extended features as a fixed size array of u64. To minimize
the diffstat, allow legacy drivers to access the low 64 bits via a
transparent union.

Introduce an extended get_extended_features configuration callback
that devices supporting the extended features range must implement in
place of the traditional one.

Note that legacy and transport features don't need any change, as
they are always in the low 64 bit range.

Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/virtio/virtio.c         | 43 ++++++++++++--------
 drivers/virtio/virtio_debug.c   | 27 +++++++------
 include/linux/virtio.h          |  9 +++--
 include/linux/virtio_config.h   | 43 ++++++++++----------
 include/linux/virtio_features.h | 88 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 156 insertions(+), 54 deletions(-)
 create mode 100644 include/linux/virtio_features.h

(limited to 'include')

diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
index 95d5d7993e5b..5c48788cdbec 100644
--- a/drivers/virtio/virtio.c
+++ b/drivers/virtio/virtio.c
@@ -53,7 +53,7 @@ static ssize_t features_show(struct device *_d,
 
 	/* We actually represent this as a bitstring, as it could be
 	 * arbitrary length in future. */
-	for (i = 0; i < sizeof(dev->features)*8; i++)
+	for (i = 0; i < VIRTIO_FEATURES_MAX; i++)
 		len += sysfs_emit_at(buf, len, "%c",
 			       __virtio_test_bit(dev, i) ? '1' : '0');
 	len += sysfs_emit_at(buf, len, "\n");
@@ -272,22 +272,22 @@ static int virtio_dev_probe(struct device *_d)
 	int err, i;
 	struct virtio_device *dev = dev_to_virtio(_d);
 	struct virtio_driver *drv = drv_to_virtio(dev->dev.driver);
-	u64 device_features;
-	u64 driver_features;
+	u64 device_features[VIRTIO_FEATURES_DWORDS];
+	u64 driver_features[VIRTIO_FEATURES_DWORDS];
 	u64 driver_features_legacy;
 
 	/* We have a driver! */
 	virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER);
 
 	/* Figure out what features the device supports. */
-	device_features = dev->config->get_features(dev);
+	virtio_get_features(dev, device_features);
 
 	/* Figure out what features the driver supports. */
-	driver_features = 0;
+	virtio_features_zero(driver_features);
 	for (i = 0; i < drv->feature_table_size; i++) {
 		unsigned int f = drv->feature_table[i];
-		BUG_ON(f >= 64);
-		driver_features |= (1ULL << f);
+		if (!WARN_ON_ONCE(f >= VIRTIO_FEATURES_MAX))
+			virtio_features_set_bit(driver_features, f);
 	}
 
 	/* Some drivers have a separate feature table for virtio v1.0 */
@@ -295,24 +295,29 @@ static int virtio_dev_probe(struct device *_d)
 		driver_features_legacy = 0;
 		for (i = 0; i < drv->feature_table_size_legacy; i++) {
 			unsigned int f = drv->feature_table_legacy[i];
-			BUG_ON(f >= 64);
-			driver_features_legacy |= (1ULL << f);
+			if (!WARN_ON_ONCE(f >= 64))
+				driver_features_legacy |= (1ULL << f);
 		}
 	} else {
-		driver_features_legacy = driver_features;
+		driver_features_legacy = driver_features[0];
 	}
 
-	if (device_features & (1ULL << VIRTIO_F_VERSION_1))
-		dev->features = driver_features & device_features;
-	else
-		dev->features = driver_features_legacy & device_features;
+	if (virtio_features_test_bit(device_features, VIRTIO_F_VERSION_1)) {
+		for (i = 0; i < VIRTIO_FEATURES_DWORDS; ++i)
+			dev->features_array[i] = driver_features[i] &
+						 device_features[i];
+	} else {
+		virtio_features_from_u64(dev->features_array,
+					 driver_features_legacy &
+					 device_features[0]);
+	}
 
 	/* When debugging, user may filter some features by hand. */
 	virtio_debug_device_filter_features(dev);
 
 	/* Transport features always preserved to pass to finalize_features. */
 	for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++)
-		if (device_features & (1ULL << i))
+		if (virtio_features_test_bit(device_features, i))
 			__virtio_set_bit(dev, i);
 
 	err = dev->config->finalize_features(dev);
@@ -320,14 +325,15 @@ static int virtio_dev_probe(struct device *_d)
 		goto err;
 
 	if (drv->validate) {
-		u64 features = dev->features;
+		u64 features[VIRTIO_FEATURES_DWORDS];
 
+		virtio_features_copy(features, dev->features_array);
 		err = drv->validate(dev);
 		if (err)
 			goto err;
 
 		/* Did validation change any features? Then write them again. */
-		if (features != dev->features) {
+		if (!virtio_features_equal(features, dev->features_array)) {
 			err = dev->config->finalize_features(dev);
 			if (err)
 				goto err;
@@ -701,6 +707,9 @@ EXPORT_SYMBOL_GPL(virtio_device_reset_done);
 
 static int virtio_init(void)
 {
+	BUILD_BUG_ON(offsetof(struct virtio_device, features) !=
+		     offsetof(struct virtio_device, features_array[0]));
+
 	if (bus_register(&virtio_bus) != 0)
 		panic("virtio bus registration failed");
 	virtio_debug_init();
diff --git a/drivers/virtio/virtio_debug.c b/drivers/virtio/virtio_debug.c
index 95c8fc7705bb..d58713ddf2e5 100644
--- a/drivers/virtio/virtio_debug.c
+++ b/drivers/virtio/virtio_debug.c
@@ -8,13 +8,13 @@ static struct dentry *virtio_debugfs_dir;
 
 static int virtio_debug_device_features_show(struct seq_file *s, void *data)
 {
+	u64 device_features[VIRTIO_FEATURES_DWORDS];
 	struct virtio_device *dev = s->private;
-	u64 device_features;
 	unsigned int i;
 
-	device_features = dev->config->get_features(dev);
-	for (i = 0; i < BITS_PER_LONG_LONG; i++) {
-		if (device_features & (1ULL << i))
+	virtio_get_features(dev, device_features);
+	for (i = 0; i < VIRTIO_FEATURES_MAX; i++) {
+		if (virtio_features_test_bit(device_features, i))
 			seq_printf(s, "%u\n", i);
 	}
 	return 0;
@@ -26,8 +26,8 @@ static int virtio_debug_filter_features_show(struct seq_file *s, void *data)
 	struct virtio_device *dev = s->private;
 	unsigned int i;
 
-	for (i = 0; i < BITS_PER_LONG_LONG; i++) {
-		if (dev->debugfs_filter_features & (1ULL << i))
+	for (i = 0; i < VIRTIO_FEATURES_MAX; i++) {
+		if (virtio_features_test_bit(dev->debugfs_filter_features, i))
 			seq_printf(s, "%u\n", i);
 	}
 	return 0;
@@ -39,7 +39,7 @@ static int virtio_debug_filter_features_clear(void *data, u64 val)
 	struct virtio_device *dev = data;
 
 	if (val == 1)
-		dev->debugfs_filter_features = 0;
+		virtio_features_zero(dev->debugfs_filter_features);
 	return 0;
 }
 
@@ -50,9 +50,10 @@ static int virtio_debug_filter_feature_add(void *data, u64 val)
 {
 	struct virtio_device *dev = data;
 
-	if (val >= BITS_PER_LONG_LONG)
+	if (val >= VIRTIO_FEATURES_MAX)
 		return -EINVAL;
-	dev->debugfs_filter_features |= BIT_ULL_MASK(val);
+
+	virtio_features_set_bit(dev->debugfs_filter_features, val);
 	return 0;
 }
 
@@ -63,9 +64,10 @@ static int virtio_debug_filter_feature_del(void *data, u64 val)
 {
 	struct virtio_device *dev = data;
 
-	if (val >= BITS_PER_LONG_LONG)
+	if (val >= VIRTIO_FEATURES_MAX)
 		return -EINVAL;
-	dev->debugfs_filter_features &= ~BIT_ULL_MASK(val);
+
+	virtio_features_clear_bit(dev->debugfs_filter_features, val);
 	return 0;
 }
 
@@ -91,7 +93,8 @@ EXPORT_SYMBOL_GPL(virtio_debug_device_init);
 
 void virtio_debug_device_filter_features(struct virtio_device *dev)
 {
-	dev->features &= ~dev->debugfs_filter_features;
+	virtio_features_andnot(dev->features_array, dev->features_array,
+			       dev->debugfs_filter_features);
 }
 EXPORT_SYMBOL_GPL(virtio_debug_device_filter_features);
 
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 64cb4b04be7a..04b90c88d164 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -11,6 +11,7 @@
 #include <linux/gfp.h>
 #include <linux/dma-mapping.h>
 #include <linux/completion.h>
+#include <linux/virtio_features.h>
 
 /**
  * struct virtqueue - a queue to register buffers for sending or receiving.
@@ -141,7 +142,9 @@ struct virtio_admin_cmd {
  * @config: the configuration ops for this device.
  * @vringh_config: configuration ops for host vrings.
  * @vqs: the list of virtqueues for this device.
- * @features: the features supported by both driver and device.
+ * @features: the 64 lower features supported by both driver and device.
+ * @features_array: the full features space supported by both driver and
+ *		    device.
  * @priv: private pointer for the driver's use.
  * @debugfs_dir: debugfs directory entry.
  * @debugfs_filter_features: features to be filtered set by debugfs.
@@ -159,11 +162,11 @@ struct virtio_device {
 	const struct virtio_config_ops *config;
 	const struct vringh_config_ops *vringh_config;
 	struct list_head vqs;
-	u64 features;
+	VIRTIO_DECLARE_FEATURES(features);
 	void *priv;
 #ifdef CONFIG_VIRTIO_DEBUG
 	struct dentry *debugfs_dir;
-	u64 debugfs_filter_features;
+	u64 debugfs_filter_features[VIRTIO_FEATURES_DWORDS];
 #endif
 };
 
diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index b3e1d30c765b..918cf25cd3c6 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -77,7 +77,11 @@ struct virtqueue_info {
  *      vdev: the virtio_device
  * @get_features: get the array of feature bits for this device.
  *	vdev: the virtio_device
- *	Returns the first 64 feature bits (all we currently need).
+ *	Returns the first 64 feature bits.
+ * @get_extended_features:
+ *      vdev: the virtio_device
+ *      Returns the first VIRTIO_FEATURES_MAX feature bits (all we currently
+ *      need).
  * @finalize_features: confirm what device features we'll be using.
  *	vdev: the virtio_device
  *	This sends the driver feature bits to the device: it can change
@@ -121,6 +125,8 @@ struct virtio_config_ops {
 	void (*del_vqs)(struct virtio_device *);
 	void (*synchronize_cbs)(struct virtio_device *);
 	u64 (*get_features)(struct virtio_device *vdev);
+	void (*get_extended_features)(struct virtio_device *vdev,
+				      u64 *features);
 	int (*finalize_features)(struct virtio_device *vdev);
 	const char *(*bus_name)(struct virtio_device *vdev);
 	int (*set_vq_affinity)(struct virtqueue *vq,
@@ -147,13 +153,7 @@ void virtio_check_driver_offered_feature(const struct virtio_device *vdev,
 static inline bool __virtio_test_bit(const struct virtio_device *vdev,
 				     unsigned int fbit)
 {
-	/* Did you forget to fix assumptions on max features? */
-	if (__builtin_constant_p(fbit))
-		BUILD_BUG_ON(fbit >= 64);
-	else
-		BUG_ON(fbit >= 64);
-
-	return vdev->features & BIT_ULL(fbit);
+	return virtio_features_test_bit(vdev->features_array, fbit);
 }
 
 /**
@@ -164,13 +164,7 @@ static inline bool __virtio_test_bit(const struct virtio_device *vdev,
 static inline void __virtio_set_bit(struct virtio_device *vdev,
 				    unsigned int fbit)
 {
-	/* Did you forget to fix assumptions on max features? */
-	if (__builtin_constant_p(fbit))
-		BUILD_BUG_ON(fbit >= 64);
-	else
-		BUG_ON(fbit >= 64);
-
-	vdev->features |= BIT_ULL(fbit);
+	virtio_features_set_bit(vdev->features_array, fbit);
 }
 
 /**
@@ -181,13 +175,7 @@ static inline void __virtio_set_bit(struct virtio_device *vdev,
 static inline void __virtio_clear_bit(struct virtio_device *vdev,
 				      unsigned int fbit)
 {
-	/* Did you forget to fix assumptions on max features? */
-	if (__builtin_constant_p(fbit))
-		BUILD_BUG_ON(fbit >= 64);
-	else
-		BUG_ON(fbit >= 64);
-
-	vdev->features &= ~BIT_ULL(fbit);
+	virtio_features_clear_bit(vdev->features_array, fbit);
 }
 
 /**
@@ -204,6 +192,17 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev,
 	return __virtio_test_bit(vdev, fbit);
 }
 
+static inline void virtio_get_features(struct virtio_device *vdev,
+				       u64 *features)
+{
+	if (vdev->config->get_extended_features) {
+		vdev->config->get_extended_features(vdev, features);
+		return;
+	}
+
+	virtio_features_from_u64(features, vdev->config->get_features(vdev));
+}
+
 /**
  * virtio_has_dma_quirk - determine whether this device has the DMA quirk
  * @vdev: the device
diff --git a/include/linux/virtio_features.h b/include/linux/virtio_features.h
new file mode 100644
index 000000000000..f748f2f87de8
--- /dev/null
+++ b/include/linux/virtio_features.h
@@ -0,0 +1,88 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_VIRTIO_FEATURES_H
+#define _LINUX_VIRTIO_FEATURES_H
+
+#include <linux/bits.h>
+
+#define VIRTIO_FEATURES_DWORDS	2
+#define VIRTIO_FEATURES_MAX	(VIRTIO_FEATURES_DWORDS * 64)
+#define VIRTIO_FEATURES_WORDS	(VIRTIO_FEATURES_DWORDS * 2)
+#define VIRTIO_BIT(b)		BIT_ULL((b) & 0x3f)
+#define VIRTIO_DWORD(b)		((b) >> 6)
+#define VIRTIO_DECLARE_FEATURES(name)			\
+	union {						\
+		u64 name;				\
+		u64 name##_array[VIRTIO_FEATURES_DWORDS];\
+	}
+
+static inline bool virtio_features_chk_bit(unsigned int bit)
+{
+	if (__builtin_constant_p(bit)) {
+		/*
+		 * Don't care returning the correct value: the build
+		 * will fail before any bad features access
+		 */
+		BUILD_BUG_ON(bit >= VIRTIO_FEATURES_MAX);
+	} else {
+		if (WARN_ON_ONCE(bit >= VIRTIO_FEATURES_MAX))
+			return false;
+	}
+	return true;
+}
+
+static inline bool virtio_features_test_bit(const u64 *features,
+					    unsigned int bit)
+{
+	return virtio_features_chk_bit(bit) &&
+	       !!(features[VIRTIO_DWORD(bit)] & VIRTIO_BIT(bit));
+}
+
+static inline void virtio_features_set_bit(u64 *features,
+					   unsigned int bit)
+{
+	if (virtio_features_chk_bit(bit))
+		features[VIRTIO_DWORD(bit)] |= VIRTIO_BIT(bit);
+}
+
+static inline void virtio_features_clear_bit(u64 *features,
+					     unsigned int bit)
+{
+	if (virtio_features_chk_bit(bit))
+		features[VIRTIO_DWORD(bit)] &= ~VIRTIO_BIT(bit);
+}
+
+static inline void virtio_features_zero(u64 *features)
+{
+	memset(features, 0, sizeof(features[0]) * VIRTIO_FEATURES_DWORDS);
+}
+
+static inline void virtio_features_from_u64(u64 *features, u64 from)
+{
+	virtio_features_zero(features);
+	features[0] = from;
+}
+
+static inline bool virtio_features_equal(const u64 *f1, const u64 *f2)
+{
+	int i;
+
+	for (i = 0; i < VIRTIO_FEATURES_DWORDS; ++i)
+		if (f1[i] != f2[i])
+			return false;
+	return true;
+}
+
+static inline void virtio_features_copy(u64 *to, const u64 *from)
+{
+	memcpy(to, from, sizeof(to[0]) * VIRTIO_FEATURES_DWORDS);
+}
+
+static inline void virtio_features_andnot(u64 *to, const u64 *f1, const u64 *f2)
+{
+	int i;
+
+	for (i = 0; i < VIRTIO_FEATURES_DWORDS; i++)
+		to[i] = f1[i] & ~f2[i];
+}
+
+#endif
-- 
cgit v1.2.3


From 69b9461512246599ed80cf13358e7e6aff7285f9 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 8 Jul 2025 17:54:52 +0200
Subject: virtio_pci_modern: allow configuring extended features

The virtio specification allows for up to 128 bits for the
device features. Soon we are going to use some of the 'extended'
feature bits (above 64) for the virtio_net driver.

Extend the virtio pci modern driver to support configuring the full
virtio features range, replacing the unrolled loops reading and
writing the features space with explicit ones bounded by the actual
features space size in words, and implementing the get_extended_features
callback.

Note that in vp_finalize_features() we only need to cache the lower
64 feature bits, to process the transport features.

Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/virtio/virtio_pci_modern.c     | 10 ++---
 drivers/virtio/virtio_pci_modern_dev.c | 69 ++++++++++++++++++++--------------
 include/linux/virtio_pci_modern.h      | 43 +++++++++++++++++++--
 3 files changed, 84 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c
index 7182f43ed055..dd0e65f71d41 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -22,11 +22,11 @@
 
 #define VIRTIO_AVQ_SGS_MAX	4
 
-static u64 vp_get_features(struct virtio_device *vdev)
+static void vp_get_features(struct virtio_device *vdev, u64 *features)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
 
-	return vp_modern_get_features(&vp_dev->mdev);
+	vp_modern_get_extended_features(&vp_dev->mdev, features);
 }
 
 static int vp_avq_index(struct virtio_device *vdev, u16 *index, u16 *num)
@@ -437,7 +437,7 @@ static int vp_finalize_features(struct virtio_device *vdev)
 	if (vp_check_common_size(vdev))
 		return -EINVAL;
 
-	vp_modern_set_features(&vp_dev->mdev, vdev->features);
+	vp_modern_set_extended_features(&vp_dev->mdev, vdev->features_array);
 
 	return 0;
 }
@@ -1234,7 +1234,7 @@ static const struct virtio_config_ops virtio_pci_config_nodev_ops = {
 	.find_vqs	= vp_modern_find_vqs,
 	.del_vqs	= vp_del_vqs,
 	.synchronize_cbs = vp_synchronize_vectors,
-	.get_features	= vp_get_features,
+	.get_extended_features = vp_get_features,
 	.finalize_features = vp_finalize_features,
 	.bus_name	= vp_bus_name,
 	.set_vq_affinity = vp_set_vq_affinity,
@@ -1254,7 +1254,7 @@ static const struct virtio_config_ops virtio_pci_config_ops = {
 	.find_vqs	= vp_modern_find_vqs,
 	.del_vqs	= vp_del_vqs,
 	.synchronize_cbs = vp_synchronize_vectors,
-	.get_features	= vp_get_features,
+	.get_extended_features = vp_get_features,
 	.finalize_features = vp_finalize_features,
 	.bus_name	= vp_bus_name,
 	.set_vq_affinity = vp_set_vq_affinity,
diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c
index 0d3dbfaf4b23..d665f8f73ea8 100644
--- a/drivers/virtio/virtio_pci_modern_dev.c
+++ b/drivers/virtio/virtio_pci_modern_dev.c
@@ -388,63 +388,74 @@ void vp_modern_remove(struct virtio_pci_modern_device *mdev)
 EXPORT_SYMBOL_GPL(vp_modern_remove);
 
 /*
- * vp_modern_get_features - get features from device
+ * vp_modern_get_extended_features - get features from device
  * @mdev: the modern virtio-pci device
+ * @features: the features array to be filled
  *
- * Returns the features read from the device
+ * Fill the specified features array with the features read from the device
  */
-u64 vp_modern_get_features(struct virtio_pci_modern_device *mdev)
+void vp_modern_get_extended_features(struct virtio_pci_modern_device *mdev,
+				     u64 *features)
 {
 	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
+	int i;
 
-	u64 features;
+	virtio_features_zero(features);
+	for (i = 0; i < VIRTIO_FEATURES_WORDS; i++) {
+		u64 cur;
 
-	vp_iowrite32(0, &cfg->device_feature_select);
-	features = vp_ioread32(&cfg->device_feature);
-	vp_iowrite32(1, &cfg->device_feature_select);
-	features |= ((u64)vp_ioread32(&cfg->device_feature) << 32);
-
-	return features;
+		vp_iowrite32(i, &cfg->device_feature_select);
+		cur = vp_ioread32(&cfg->device_feature);
+		features[i >> 1] |= cur << (32 * (i & 1));
+	}
 }
-EXPORT_SYMBOL_GPL(vp_modern_get_features);
+EXPORT_SYMBOL_GPL(vp_modern_get_extended_features);
 
 /*
  * vp_modern_get_driver_features - get driver features from device
  * @mdev: the modern virtio-pci device
+ * @features: the features array to be filled
  *
- * Returns the driver features read from the device
+ * Fill the specified features array with the driver features read from the
+ * device
  */
-u64 vp_modern_get_driver_features(struct virtio_pci_modern_device *mdev)
+void
+vp_modern_get_driver_extended_features(struct virtio_pci_modern_device *mdev,
+				       u64 *features)
 {
 	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
+	int i;
 
-	u64 features;
-
-	vp_iowrite32(0, &cfg->guest_feature_select);
-	features = vp_ioread32(&cfg->guest_feature);
-	vp_iowrite32(1, &cfg->guest_feature_select);
-	features |= ((u64)vp_ioread32(&cfg->guest_feature) << 32);
+	virtio_features_zero(features);
+	for (i = 0; i < VIRTIO_FEATURES_WORDS; i++) {
+		u64 cur;
 
-	return features;
+		vp_iowrite32(i, &cfg->guest_feature_select);
+		cur = vp_ioread32(&cfg->guest_feature);
+		features[i >> 1] |= cur << (32 * (i & 1));
+	}
 }
-EXPORT_SYMBOL_GPL(vp_modern_get_driver_features);
+EXPORT_SYMBOL_GPL(vp_modern_get_driver_extended_features);
 
 /*
- * vp_modern_set_features - set features to device
+ * vp_modern_set_extended_features - set features to device
  * @mdev: the modern virtio-pci device
  * @features: the features set to device
  */
-void vp_modern_set_features(struct virtio_pci_modern_device *mdev,
-			    u64 features)
+void vp_modern_set_extended_features(struct virtio_pci_modern_device *mdev,
+				     const u64 *features)
 {
 	struct virtio_pci_common_cfg __iomem *cfg = mdev->common;
+	int i;
+
+	for (i = 0; i < VIRTIO_FEATURES_WORDS; i++) {
+		u32 cur = features[i >> 1] >> (32 * (i & 1));
 
-	vp_iowrite32(0, &cfg->guest_feature_select);
-	vp_iowrite32((u32)features, &cfg->guest_feature);
-	vp_iowrite32(1, &cfg->guest_feature_select);
-	vp_iowrite32(features >> 32, &cfg->guest_feature);
+		vp_iowrite32(i, &cfg->guest_feature_select);
+		vp_iowrite32(cur, &cfg->guest_feature);
+	}
 }
-EXPORT_SYMBOL_GPL(vp_modern_set_features);
+EXPORT_SYMBOL_GPL(vp_modern_set_extended_features);
 
 /*
  * vp_modern_generation - get the device genreation
diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h
index c0b1b1ca1163..48bc12d1045b 100644
--- a/include/linux/virtio_pci_modern.h
+++ b/include/linux/virtio_pci_modern.h
@@ -3,6 +3,7 @@
 #define _LINUX_VIRTIO_PCI_MODERN_H
 
 #include <linux/pci.h>
+#include <linux/virtio_config.h>
 #include <linux/virtio_pci.h>
 
 /**
@@ -95,10 +96,44 @@ static inline void vp_iowrite64_twopart(u64 val,
 	vp_iowrite32(val >> 32, hi);
 }
 
-u64 vp_modern_get_features(struct virtio_pci_modern_device *mdev);
-u64 vp_modern_get_driver_features(struct virtio_pci_modern_device *mdev);
-void vp_modern_set_features(struct virtio_pci_modern_device *mdev,
-		     u64 features);
+void
+vp_modern_get_driver_extended_features(struct virtio_pci_modern_device *mdev,
+				       u64 *features);
+void vp_modern_get_extended_features(struct virtio_pci_modern_device *mdev,
+				     u64 *features);
+void vp_modern_set_extended_features(struct virtio_pci_modern_device *mdev,
+				     const u64 *features);
+
+static inline u64
+vp_modern_get_features(struct virtio_pci_modern_device *mdev)
+{
+	u64 features_array[VIRTIO_FEATURES_DWORDS];
+
+	vp_modern_get_extended_features(mdev, features_array);
+	return features_array[0];
+}
+
+static inline u64
+vp_modern_get_driver_features(struct virtio_pci_modern_device *mdev)
+{
+	u64 features_array[VIRTIO_FEATURES_DWORDS];
+	int i;
+
+	vp_modern_get_driver_extended_features(mdev, features_array);
+	for (i = 1; i < VIRTIO_FEATURES_DWORDS; ++i)
+		WARN_ON_ONCE(features_array[i]);
+	return features_array[0];
+}
+
+static inline void
+vp_modern_set_features(struct virtio_pci_modern_device *mdev, u64 features)
+{
+	u64 features_array[VIRTIO_FEATURES_DWORDS];
+
+	virtio_features_from_u64(features_array, features);
+	vp_modern_set_extended_features(mdev, features_array);
+}
+
 u32 vp_modern_generation(struct virtio_pci_modern_device *mdev);
 u8 vp_modern_get_status(struct virtio_pci_modern_device *mdev);
 void vp_modern_set_status(struct virtio_pci_modern_device *mdev,
-- 
cgit v1.2.3


From 333c515d189657c934470c9b0b8a8fedb016ce6f Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 8 Jul 2025 17:54:54 +0200
Subject: vhost-net: allow configuring extended features

Use the extended feature type for 'acked_features' and implement
two new ioctl operations allowing user-space to set/query an
unbounded number of features.

The actual number of processed features is limited by VIRTIO_FEATURES_MAX
and attempts to set features above such limit fail with
EOPNOTSUPP.

Note that the legacy ioctls implicitly truncate the negotiated
features to the lower 64-bit range, and the 'acked_backend_features'
field doesn't need conversion, as the only negotiated feature there
is in the low 64-bit range.

Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/vhost/net.c              | 87 +++++++++++++++++++++++++++++++---------
 drivers/vhost/vhost.c            |  2 +-
 drivers/vhost/vhost.h            |  4 +-
 include/uapi/linux/vhost.h       |  7 ++++
 include/uapi/linux/vhost_types.h |  5 +++
 5 files changed, 82 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 7cbfc7d718b3..67d011b0d4f7 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -69,12 +69,12 @@ MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;"
 
 #define VHOST_DMA_IS_DONE(len) ((__force u32)(len) >= (__force u32)VHOST_DMA_DONE_LEN)
 
-enum {
-	VHOST_NET_FEATURES = VHOST_FEATURES |
-			 (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
-			 (1ULL << VIRTIO_NET_F_MRG_RXBUF) |
-			 (1ULL << VIRTIO_F_ACCESS_PLATFORM) |
-			 (1ULL << VIRTIO_F_RING_RESET)
+static const u64 vhost_net_features[VIRTIO_FEATURES_DWORDS] = {
+	VHOST_FEATURES |
+	(1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
+	(1ULL << VIRTIO_NET_F_MRG_RXBUF) |
+	(1ULL << VIRTIO_F_ACCESS_PLATFORM) |
+	(1ULL << VIRTIO_F_RING_RESET),
 };
 
 enum {
@@ -1614,16 +1614,17 @@ done:
 	return err;
 }
 
-static int vhost_net_set_features(struct vhost_net *n, u64 features)
+static int vhost_net_set_features(struct vhost_net *n, const u64 *features)
 {
 	size_t vhost_hlen, sock_hlen, hdr_len;
 	int i;
 
-	hdr_len = (features & ((1ULL << VIRTIO_NET_F_MRG_RXBUF) |
-			       (1ULL << VIRTIO_F_VERSION_1))) ?
-			sizeof(struct virtio_net_hdr_mrg_rxbuf) :
-			sizeof(struct virtio_net_hdr);
-	if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) {
+	hdr_len = virtio_features_test_bit(features, VIRTIO_NET_F_MRG_RXBUF) ||
+		  virtio_features_test_bit(features, VIRTIO_F_VERSION_1) ?
+		  sizeof(struct virtio_net_hdr_mrg_rxbuf) :
+		  sizeof(struct virtio_net_hdr);
+
+	if (virtio_features_test_bit(features, VHOST_NET_F_VIRTIO_NET_HDR)) {
 		/* vhost provides vnet_hdr */
 		vhost_hlen = hdr_len;
 		sock_hlen = 0;
@@ -1633,18 +1634,19 @@ static int vhost_net_set_features(struct vhost_net *n, u64 features)
 		sock_hlen = hdr_len;
 	}
 	mutex_lock(&n->dev.mutex);
-	if ((features & (1 << VHOST_F_LOG_ALL)) &&
+	if (virtio_features_test_bit(features, VHOST_F_LOG_ALL) &&
 	    !vhost_log_access_ok(&n->dev))
 		goto out_unlock;
 
-	if ((features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) {
+	if (virtio_features_test_bit(features, VIRTIO_F_ACCESS_PLATFORM)) {
 		if (vhost_init_device_iotlb(&n->dev))
 			goto out_unlock;
 	}
 
 	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
 		mutex_lock(&n->vqs[i].vq.mutex);
-		n->vqs[i].vq.acked_features = features;
+		virtio_features_copy(n->vqs[i].vq.acked_features_array,
+				     features);
 		n->vqs[i].vhost_hlen = vhost_hlen;
 		n->vqs[i].sock_hlen = sock_hlen;
 		mutex_unlock(&n->vqs[i].vq.mutex);
@@ -1681,12 +1683,13 @@ out:
 static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
 			    unsigned long arg)
 {
+	u64 all_features[VIRTIO_FEATURES_DWORDS];
 	struct vhost_net *n = f->private_data;
 	void __user *argp = (void __user *)arg;
 	u64 __user *featurep = argp;
 	struct vhost_vring_file backend;
-	u64 features;
-	int r;
+	u64 features, count, copied;
+	int r, i;
 
 	switch (ioctl) {
 	case VHOST_NET_SET_BACKEND:
@@ -1694,16 +1697,60 @@ static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
 			return -EFAULT;
 		return vhost_net_set_backend(n, backend.index, backend.fd);
 	case VHOST_GET_FEATURES:
-		features = VHOST_NET_FEATURES;
+		features = vhost_net_features[0];
 		if (copy_to_user(featurep, &features, sizeof features))
 			return -EFAULT;
 		return 0;
 	case VHOST_SET_FEATURES:
 		if (copy_from_user(&features, featurep, sizeof features))
 			return -EFAULT;
-		if (features & ~VHOST_NET_FEATURES)
+		if (features & ~vhost_net_features[0])
 			return -EOPNOTSUPP;
-		return vhost_net_set_features(n, features);
+
+		virtio_features_from_u64(all_features, features);
+		return vhost_net_set_features(n, all_features);
+	case VHOST_GET_FEATURES_ARRAY:
+		if (copy_from_user(&count, featurep, sizeof(count)))
+			return -EFAULT;
+
+		/* Copy the net features, up to the user-provided buffer size */
+		argp += sizeof(u64);
+		copied = min(count, VIRTIO_FEATURES_DWORDS);
+		if (copy_to_user(argp, vhost_net_features,
+				 copied * sizeof(u64)))
+			return -EFAULT;
+		/* Zero the trailing user-space room past the copy, if any */
+		argp += copied * sizeof(u64);
+		if (clear_user(argp, size_mul(count - copied, sizeof(u64))))
+			return -EFAULT;
+		return 0;
+	case VHOST_SET_FEATURES_ARRAY:
+		if (copy_from_user(&count, featurep, sizeof(count)))
+			return -EFAULT;
+
+		virtio_features_zero(all_features);
+		argp += sizeof(u64);
+		copied = min(count, VIRTIO_FEATURES_DWORDS);
+		if (copy_from_user(all_features, argp, copied * sizeof(u64)))
+			return -EFAULT;
+
+		/*
+		 * Any feature specified by user-space above
+		 * VIRTIO_FEATURES_MAX is not supported by definition.
+		 */
+		for (i = copied; i < count; ++i) {
+			if (copy_from_user(&features, featurep + 1 + i,
+					   sizeof(features)))
+				return -EFAULT;
+			if (features)
+				return -EOPNOTSUPP;
+		}
+
+		for (i = 0; i < VIRTIO_FEATURES_DWORDS; i++)
+			if (all_features[i] & ~vhost_net_features[i])
+				return -EOPNOTSUPP;
+
+		return vhost_net_set_features(n, all_features);
 	case VHOST_GET_BACKEND_FEATURES:
 		features = VHOST_NET_BACKEND_FEATURES;
 		if (copy_to_user(featurep, &features, sizeof(features)))
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 3a5ebb973dba..1094256a943c 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -372,7 +372,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vq->log_used = false;
 	vq->log_addr = -1ull;
 	vq->private_data = NULL;
-	vq->acked_features = 0;
+	virtio_features_zero(vq->acked_features_array);
 	vq->acked_backend_features = 0;
 	vq->log_base = NULL;
 	vq->error_ctx = NULL;
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index bb75a292d50c..d1aed35c4b07 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -133,7 +133,7 @@ struct vhost_virtqueue {
 	struct vhost_iotlb *umem;
 	struct vhost_iotlb *iotlb;
 	void *private_data;
-	u64 acked_features;
+	VIRTIO_DECLARE_FEATURES(acked_features);
 	u64 acked_backend_features;
 	/* Log write descriptors */
 	void __user *log_base;
@@ -291,7 +291,7 @@ static inline void *vhost_vq_get_backend(struct vhost_virtqueue *vq)
 
 static inline bool vhost_has_feature(struct vhost_virtqueue *vq, int bit)
 {
-	return vq->acked_features & (1ULL << bit);
+	return virtio_features_test_bit(vq->acked_features_array, bit);
 }
 
 static inline bool vhost_backend_has_feature(struct vhost_virtqueue *vq, int bit)
diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
index d4b3e2ae1314..d6ad01fbb8d2 100644
--- a/include/uapi/linux/vhost.h
+++ b/include/uapi/linux/vhost.h
@@ -235,4 +235,11 @@
  */
 #define VHOST_VDPA_GET_VRING_SIZE	_IOWR(VHOST_VIRTIO, 0x82,	\
 					      struct vhost_vring_state)
+
+/* Extended features manipulation */
+#define VHOST_GET_FEATURES_ARRAY _IOR(VHOST_VIRTIO, 0x83, \
+				       struct vhost_features_array)
+#define VHOST_SET_FEATURES_ARRAY _IOW(VHOST_VIRTIO, 0x83, \
+				       struct vhost_features_array)
+
 #endif
diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h
index d7656908f730..1c39cc5f5a31 100644
--- a/include/uapi/linux/vhost_types.h
+++ b/include/uapi/linux/vhost_types.h
@@ -110,6 +110,11 @@ struct vhost_msg_v2 {
 	};
 };
 
+struct vhost_features_array {
+	__u64 count; /* number of entries present in features array */
+	__u64 features[] __counted_by(count);
+};
+
 struct vhost_memory_region {
 	__u64 guest_phys_addr;
 	__u64 memory_size; /* bytes */
-- 
cgit v1.2.3


From a2fb4bc4e2a6a031683910d85b278c1d25ae5420 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 8 Jul 2025 17:54:59 +0200
Subject: net: implement virtio helpers to handle UDP GSO tunneling.

The virtio specification is introducing support for GSO over UDP
tunnels.

This patch brings in the needed defines and the additional virtio hdr
parsing/building helpers.

The UDP tunnel support uses additional fields in the virtio hdr, and the
location of such fields can change depending on other negotiated features -
specifically VIRTIO_NET_F_HASH_REPORT.

Try to be as conservative as possible with the new field validation.

The existing implementation for plain GSO offloads allows invalid/
self-contradictory values of such fields. With GSO over UDP tunnel we can
be stricter, with no need to deal with legacy implementations.

Since the checksum-related field validation is asymmetric in the driver
and in the device, introduce a separate helper to implement the new checks
(to be used only on the driver side).

Note that while the feature space exceeds the 64-bit boundaries, the
guest offload space is fixed by the specification of the
VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET command to a 64-bit size.

Prior to the UDP tunnel GSO support, each guest offload bit corresponded
to the feature bit with the same value and vice versa.

Due to the limited 'guest offload' space, relevant features in the high
64 bits are 'mapped' to free bits in the lower range. That is simpler
than defining a new command (and associated features) to exchange an
extended guest offloads set.

As a consequence, the uAPIs also specify the mapped guest offload value
corresponding to the UDP tunnel GSO features.

Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
--
v4 -> v5:
  - avoid lines above 80 chars

v3 -> v4:
  - fixed offset for UDP GSO tunnel, update accordingly the helpers
  - tried to clarify vlan_hlen semantics
  - virtio_net_chk_data_valid() -> virtio_net_handle_csum_offload()

v2 -> v3:
  - add definitions for possible vnet hdr layouts with tunnel support

v1 -> v2:
  - 'relay' -> 'rely' typo
  - less unclear comment WRT enforced inner GSO checks
  - inner header fields are allowed only with 'modern' virtio,
    thus are always le
  - clarified in the commit message the need for 'mapped features'
    defines
  - assume little_endian is true when UDP GSO is enabled.
  - fix inner proto type value
---
 include/linux/virtio_net.h      | 197 ++++++++++++++++++++++++++++++++++++++--
 include/uapi/linux/virtio_net.h |  33 +++++++
 2 files changed, 222 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index 02a9f4dc594d..20e0584db1dd 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -47,9 +47,9 @@ static inline int virtio_net_hdr_set_proto(struct sk_buff *skb,
 	return 0;
 }
 
-static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
-					const struct virtio_net_hdr *hdr,
-					bool little_endian)
+static inline int __virtio_net_hdr_to_skb(struct sk_buff *skb,
+					  const struct virtio_net_hdr *hdr,
+					  bool little_endian, u8 hdr_gso_type)
 {
 	unsigned int nh_min_len = sizeof(struct iphdr);
 	unsigned int gso_type = 0;
@@ -57,8 +57,8 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
 	unsigned int p_off = 0;
 	unsigned int ip_proto;
 
-	if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
-		switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
+	if (hdr_gso_type != VIRTIO_NET_HDR_GSO_NONE) {
+		switch (hdr_gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
 		case VIRTIO_NET_HDR_GSO_TCPV4:
 			gso_type = SKB_GSO_TCPV4;
 			ip_proto = IPPROTO_TCP;
@@ -84,7 +84,7 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
 			return -EINVAL;
 		}
 
-		if (hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN)
+		if (hdr_gso_type & VIRTIO_NET_HDR_GSO_ECN)
 			gso_type |= SKB_GSO_TCP_ECN;
 
 		if (hdr->gso_size == 0)
@@ -122,7 +122,8 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
 
 				if (!protocol)
 					virtio_net_hdr_set_proto(skb, hdr);
-				else if (!virtio_net_hdr_match_proto(protocol, hdr->gso_type))
+				else if (!virtio_net_hdr_match_proto(protocol,
+								 hdr_gso_type))
 					return -EINVAL;
 				else
 					skb->protocol = protocol;
@@ -153,7 +154,7 @@ retry:
 		}
 	}
 
-	if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
+	if (hdr_gso_type != VIRTIO_NET_HDR_GSO_NONE) {
 		u16 gso_size = __virtio16_to_cpu(little_endian, hdr->gso_size);
 		unsigned int nh_off = p_off;
 		struct skb_shared_info *shinfo = skb_shinfo(skb);
@@ -199,6 +200,13 @@ retry:
 	return 0;
 }
 
+static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
+					const struct virtio_net_hdr *hdr,
+					bool little_endian)
+{
+	return __virtio_net_hdr_to_skb(skb, hdr, little_endian, hdr->gso_type);
+}
+
 static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb,
 					  struct virtio_net_hdr *hdr,
 					  bool little_endian,
@@ -242,4 +250,177 @@ static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb,
 	return 0;
 }
 
+static inline unsigned int virtio_l3min(bool is_ipv6)
+{
+	return is_ipv6 ? sizeof(struct ipv6hdr) : sizeof(struct iphdr);
+}
+
+static inline int
+virtio_net_hdr_tnl_to_skb(struct sk_buff *skb,
+			  const struct virtio_net_hdr_v1_hash_tunnel *vhdr,
+			  bool tnl_hdr_negotiated,
+			  bool tnl_csum_negotiated,
+			  bool little_endian)
+{
+	const struct virtio_net_hdr *hdr = (const struct virtio_net_hdr *)vhdr;
+	unsigned int inner_nh, outer_th, inner_th;
+	unsigned int inner_l3min, outer_l3min;
+	u8 gso_inner_type, gso_tunnel_type;
+	bool outer_isv6, inner_isv6;
+	int ret;
+
+	gso_tunnel_type = hdr->gso_type & VIRTIO_NET_HDR_GSO_UDP_TUNNEL;
+	if (!gso_tunnel_type)
+		return virtio_net_hdr_to_skb(skb, hdr, little_endian);
+
+	/* Tunnel not supported/negotiated, but the hdr asks for it. */
+	if (!tnl_hdr_negotiated)
+		return -EINVAL;
+
+	/* Either ipv4 or ipv6. */
+	if (gso_tunnel_type == VIRTIO_NET_HDR_GSO_UDP_TUNNEL)
+		return -EINVAL;
+
+	/* The UDP tunnel must carry a GSO packet, but no UFO. */
+	gso_inner_type = hdr->gso_type & ~(VIRTIO_NET_HDR_GSO_ECN |
+					   VIRTIO_NET_HDR_GSO_UDP_TUNNEL);
+	if (!gso_inner_type || gso_inner_type == VIRTIO_NET_HDR_GSO_UDP)
+		return -EINVAL;
+
+	/* Rely on csum being present. */
+	if (!(hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM))
+		return -EINVAL;
+
+	/* Validate offsets. */
+	outer_isv6 = gso_tunnel_type & VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV6;
+	inner_isv6 = gso_inner_type == VIRTIO_NET_HDR_GSO_TCPV6;
+	inner_l3min = virtio_l3min(inner_isv6);
+	outer_l3min = ETH_HLEN + virtio_l3min(outer_isv6);
+
+	inner_th = __virtio16_to_cpu(little_endian, hdr->csum_start);
+	inner_nh = le16_to_cpu(vhdr->inner_nh_offset);
+	outer_th = le16_to_cpu(vhdr->outer_th_offset);
+	if (outer_th < outer_l3min ||
+	    inner_nh < outer_th + sizeof(struct udphdr) ||
+	    inner_th < inner_nh + inner_l3min)
+		return -EINVAL;
+
+	/* Let the basic parsing deal with plain GSO features. */
+	ret = __virtio_net_hdr_to_skb(skb, hdr, true,
+				      hdr->gso_type & ~gso_tunnel_type);
+	if (ret)
+		return ret;
+
+	/* In case of USO, the inner protocol is still unknown and
+	 * `inner_isv6` is just a guess, additional parsing is needed.
+	 * The previous validation ensures that accessing an ipv4 inner
+	 * network header is safe.
+	 */
+	if (gso_inner_type == VIRTIO_NET_HDR_GSO_UDP_L4) {
+		struct iphdr *iphdr = (struct iphdr *)(skb->data + inner_nh);
+
+		inner_isv6 = iphdr->version == 6;
+		inner_l3min = virtio_l3min(inner_isv6);
+		if (inner_th < inner_nh + inner_l3min)
+			return -EINVAL;
+	}
+
+	skb_set_inner_protocol(skb, inner_isv6 ? htons(ETH_P_IPV6) :
+						 htons(ETH_P_IP));
+	if (hdr->flags & VIRTIO_NET_HDR_F_UDP_TUNNEL_CSUM) {
+		if (!tnl_csum_negotiated)
+			return -EINVAL;
+
+		skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
+	} else {
+		skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
+	}
+
+	skb->inner_transport_header = inner_th + skb_headroom(skb);
+	skb->inner_network_header = inner_nh + skb_headroom(skb);
+	skb->inner_mac_header = inner_nh + skb_headroom(skb);
+	skb->transport_header = outer_th + skb_headroom(skb);
+	skb->encapsulation = 1;
+	return 0;
+}
+
+/* Checksum-related fields validation for the driver */
+static inline int virtio_net_handle_csum_offload(struct sk_buff *skb,
+						 struct virtio_net_hdr *hdr,
+						 bool tnl_csum_negotiated)
+{
+	if (!(hdr->gso_type & VIRTIO_NET_HDR_GSO_UDP_TUNNEL)) {
+		if (!(hdr->flags & VIRTIO_NET_HDR_F_DATA_VALID))
+			return 0;
+
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+		if (!(hdr->flags & VIRTIO_NET_HDR_F_UDP_TUNNEL_CSUM))
+			return 0;
+
+		/* tunnel csum packets are invalid when the related
+		 * feature has not been negotiated
+		 */
+		if (!tnl_csum_negotiated)
+			return -EINVAL;
+		skb->csum_level = 1;
+		return 0;
+	}
+
+	/* DATA_VALID is mutually exclusive with NEEDS_CSUM, and GSO
+	 * over UDP tunnel requires the latter
+	 */
+	if (hdr->flags & VIRTIO_NET_HDR_F_DATA_VALID)
+		return -EINVAL;
+	return 0;
+}
+
+/*
+ * vlan_hlen always refers to the outermost MAC header. That also
+ * means it refers to the only MAC header, if the packet does not carry
+ * any encapsulation.
+ */
+static inline int
+virtio_net_hdr_tnl_from_skb(const struct sk_buff *skb,
+			    struct virtio_net_hdr_v1_hash_tunnel *vhdr,
+			    bool tnl_hdr_negotiated,
+			    bool little_endian,
+			    int vlan_hlen)
+{
+	struct virtio_net_hdr *hdr = (struct virtio_net_hdr *)vhdr;
+	unsigned int inner_nh, outer_th;
+	int tnl_gso_type;
+	int ret;
+
+	tnl_gso_type = skb_shinfo(skb)->gso_type & (SKB_GSO_UDP_TUNNEL |
+						    SKB_GSO_UDP_TUNNEL_CSUM);
+	if (!tnl_gso_type)
+		return virtio_net_hdr_from_skb(skb, hdr, little_endian, false,
+					       vlan_hlen);
+
+	/* Tunnel support not negotiated but skb asks for it. */
+	if (!tnl_hdr_negotiated)
+		return -EINVAL;
+
+	/* Let the basic parsing deal with plain GSO features. */
+	skb_shinfo(skb)->gso_type &= ~tnl_gso_type;
+	ret = virtio_net_hdr_from_skb(skb, hdr, true, false, vlan_hlen);
+	skb_shinfo(skb)->gso_type |= tnl_gso_type;
+	if (ret)
+		return ret;
+
+	if (skb->protocol == htons(ETH_P_IPV6))
+		hdr->gso_type |= VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV6;
+	else
+		hdr->gso_type |= VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV4;
+
+	if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM)
+		hdr->flags |= VIRTIO_NET_HDR_F_UDP_TUNNEL_CSUM;
+
+	inner_nh = skb->inner_network_header - skb_headroom(skb);
+	outer_th = skb->transport_header - skb_headroom(skb);
+	vhdr->inner_nh_offset = cpu_to_le16(inner_nh);
+	vhdr->outer_th_offset = cpu_to_le16(outer_th);
+	return 0;
+}
+
 #endif /* _LINUX_VIRTIO_NET_H */
diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h
index 963540deae66..8bf27ab8bcb4 100644
--- a/include/uapi/linux/virtio_net.h
+++ b/include/uapi/linux/virtio_net.h
@@ -70,6 +70,28 @@
 					 * with the same MAC.
 					 */
 #define VIRTIO_NET_F_SPEED_DUPLEX 63	/* Device set linkspeed and duplex */
+#define VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO 65 /* Driver can receive
+					      * GSO-over-UDP-tunnel packets
+					      */
+#define VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM 66 /* Driver handles
+						   * GSO-over-UDP-tunnel
+						   * packets with partial csum
+						   * for the outer header
+						   */
+#define VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO 67 /* Device can receive
+					     * GSO-over-UDP-tunnel packets
+					     */
+#define VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO_CSUM 68 /* Device handles
+						  * GSO-over-UDP-tunnel
+						  * packets with partial csum
+						  * for the outer header
+						  */
+
+/* Offload bits corresponding to VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO{,_CSUM}
+ * features
+ */
+#define VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_MAPPED	46
+#define VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM_MAPPED	47
 
 #ifndef VIRTIO_NET_NO_LEGACY
 #define VIRTIO_NET_F_GSO	6	/* Host handles pkts w/ any GSO type */
@@ -131,12 +153,17 @@ struct virtio_net_hdr_v1 {
 #define VIRTIO_NET_HDR_F_NEEDS_CSUM	1	/* Use csum_start, csum_offset */
 #define VIRTIO_NET_HDR_F_DATA_VALID	2	/* Csum is valid */
 #define VIRTIO_NET_HDR_F_RSC_INFO	4	/* rsc info in csum_ fields */
+#define VIRTIO_NET_HDR_F_UDP_TUNNEL_CSUM 8	/* UDP tunnel csum offload */
 	__u8 flags;
 #define VIRTIO_NET_HDR_GSO_NONE		0	/* Not a GSO frame */
 #define VIRTIO_NET_HDR_GSO_TCPV4	1	/* GSO frame, IPv4 TCP (TSO) */
 #define VIRTIO_NET_HDR_GSO_UDP		3	/* GSO frame, IPv4 UDP (UFO) */
 #define VIRTIO_NET_HDR_GSO_TCPV6	4	/* GSO frame, IPv6 TCP */
 #define VIRTIO_NET_HDR_GSO_UDP_L4	5	/* GSO frame, IPv4& IPv6 UDP (USO) */
+#define VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV4 0x20 /* UDPv4 tunnel present */
+#define VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV6 0x40 /* UDPv6 tunnel present */
+#define VIRTIO_NET_HDR_GSO_UDP_TUNNEL (VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV4 | \
+				       VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV6)
 #define VIRTIO_NET_HDR_GSO_ECN		0x80	/* TCP has ECN set */
 	__u8 gso_type;
 	__virtio16 hdr_len;	/* Ethernet + IP + tcp/udp hdrs */
@@ -181,6 +208,12 @@ struct virtio_net_hdr_v1_hash {
 	__le16 padding;
 };
 
+struct virtio_net_hdr_v1_hash_tunnel {
+	struct virtio_net_hdr_v1_hash hash_hdr;
+	__le16 outer_th_offset;
+	__le16 inner_nh_offset;
+};
+
 #ifndef VIRTIO_NET_NO_LEGACY
 /* This header comes first in the scatter-gather list.
  * For legacy virtio, if VIRTIO_F_ANY_LAYOUT is not negotiated, it must
-- 
cgit v1.2.3


From 288f30435132d2f9e7a29ec9b9745a4f9dc7fd37 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 8 Jul 2025 17:55:30 +0200
Subject: tun: enable gso over UDP tunnel support.

Add new tun features to represent the newly introduced virtio
GSO over UDP tunnel offload. Allow detection and selection of
such features via the existing TUNSETOFFLOAD ioctl, and compute
the expected virtio header size and tunnel header offset using
the current netdev features, so that we can plug in almost seamlessly
the newly introduced virtio helpers to serialize the extended
virtio header.

Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
v6 -> v7:
  - rebased

v4 -> v5:
  - encapsulate the guest feature guessing in a tun helper
  - dropped irrelevant check on xdp buff headroom
  - do not remove unrelated blank line
  - avoid line len > 80 char

v3 -> v4:
  - virtio tnl-related fields are at fixed offset, cleanup
    the code accordingly.
  - use netdev features instead of flags bit to check for
    the configured offload
  - drop packet in case of enabled features/configured hdr
    size mismatch

v2 -> v3:
  - cleaned-up uAPI comments
  - use explicit struct layout instead of raw buf.
---
 drivers/net/tun.c           |  58 +++++++++++++++++++++----
 drivers/net/tun_vnet.h      | 101 ++++++++++++++++++++++++++++++++++++++++----
 include/uapi/linux/if_tun.h |   9 ++++
 3 files changed, 150 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index f8c5e2fd04df..abc91f28dac4 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -186,7 +186,8 @@ struct tun_struct {
 	struct net_device	*dev;
 	netdev_features_t	set_features;
 #define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \
-			  NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4)
+			  NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4 | \
+			  NETIF_F_GSO_UDP_TUNNEL | NETIF_F_GSO_UDP_TUNNEL_CSUM)
 
 	int			align;
 	int			vnet_hdr_sz;
@@ -925,6 +926,7 @@ static int tun_net_init(struct net_device *dev)
 	dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
 			   TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX |
 			   NETIF_F_HW_VLAN_STAG_TX;
+	dev->hw_enc_features = dev->hw_features;
 	dev->features = dev->hw_features;
 	dev->vlan_features = dev->features &
 			     ~(NETIF_F_HW_VLAN_CTAG_TX |
@@ -1698,7 +1700,8 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	struct sk_buff *skb;
 	size_t total_len = iov_iter_count(from);
 	size_t len = total_len, align = tun->align, linear;
-	struct virtio_net_hdr gso = { 0 };
+	struct virtio_net_hdr_v1_hash_tunnel hdr;
+	struct virtio_net_hdr *gso;
 	int good_linear;
 	int copylen;
 	int hdr_len = 0;
@@ -1708,6 +1711,15 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	int skb_xdp = 1;
 	bool frags = tun_napi_frags_enabled(tfile);
 	enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
+	netdev_features_t features = 0;
+
+	/*
+	 * Keep it easy and always zero the whole buffer, even if the
+	 * tunnel-related field will be touched only when the feature
+	 * is enabled and the hdr size is compatible.
+	 */
+	memset(&hdr, 0, sizeof(hdr));
+	gso = (struct virtio_net_hdr *)&hdr;
 
 	if (!(tun->flags & IFF_NO_PI)) {
 		if (len < sizeof(pi))
@@ -1721,7 +1733,9 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	if (tun->flags & IFF_VNET_HDR) {
 		int vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
 
-		hdr_len = tun_vnet_hdr_get(vnet_hdr_sz, tun->flags, from, &gso);
+		features = tun_vnet_hdr_guest_features(vnet_hdr_sz);
+		hdr_len = __tun_vnet_hdr_get(vnet_hdr_sz, tun->flags,
+					     features, from, gso);
 		if (hdr_len < 0)
 			return hdr_len;
 
@@ -1755,7 +1769,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 		 * (e.g gso or jumbo packet), we will do it at after
 		 * skb was created with generic XDP routine.
 		 */
-		skb = tun_build_skb(tun, tfile, from, &gso, len, &skb_xdp);
+		skb = tun_build_skb(tun, tfile, from, gso, len, &skb_xdp);
 		err = PTR_ERR_OR_ZERO(skb);
 		if (err)
 			goto drop;
@@ -1799,7 +1813,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 		}
 	}
 
-	if (tun_vnet_hdr_to_skb(tun->flags, skb, &gso)) {
+	if (tun_vnet_hdr_tnl_to_skb(tun->flags, features, skb, &hdr)) {
 		atomic_long_inc(&tun->rx_frame_errors);
 		err = -EINVAL;
 		goto free_skb;
@@ -2050,13 +2064,21 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 	}
 
 	if (vnet_hdr_sz) {
-		struct virtio_net_hdr gso;
+		struct virtio_net_hdr_v1_hash_tunnel hdr;
+		struct virtio_net_hdr *gso;
 
-		ret = tun_vnet_hdr_from_skb(tun->flags, tun->dev, skb, &gso);
+		ret = tun_vnet_hdr_tnl_from_skb(tun->flags, tun->dev, skb,
+						&hdr);
 		if (ret)
 			return ret;
 
-		ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso);
+		/*
+		 * Drop the packet if the configured header size is too small
+		 * WRT the enabled offloads.
+		 */
+		gso = (struct virtio_net_hdr *)&hdr;
+		ret = __tun_vnet_hdr_put(vnet_hdr_sz, tun->dev->features,
+					 iter, gso);
 		if (ret)
 			return ret;
 	}
@@ -2357,10 +2379,12 @@ static int tun_xdp_one(struct tun_struct *tun,
 {
 	unsigned int datasize = xdp->data_end - xdp->data;
 	struct tun_xdp_hdr *hdr = xdp->data_hard_start;
+	struct virtio_net_hdr_v1_hash_tunnel *tnl_hdr;
 	struct virtio_net_hdr *gso = &hdr->gso;
 	struct bpf_prog *xdp_prog;
 	struct sk_buff *skb = NULL;
 	struct sk_buff_head *queue;
+	netdev_features_t features;
 	u32 rxhash = 0, act;
 	int buflen = hdr->buflen;
 	int metasize = 0;
@@ -2426,7 +2450,9 @@ build:
 	if (metasize > 0)
 		skb_metadata_set(skb, metasize);
 
-	if (tun_vnet_hdr_to_skb(tun->flags, skb, gso)) {
+	features = tun_vnet_hdr_guest_features(READ_ONCE(tun->vnet_hdr_sz));
+	tnl_hdr = (struct virtio_net_hdr_v1_hash_tunnel *)gso;
+	if (tun_vnet_hdr_tnl_to_skb(tun->flags, features, skb, tnl_hdr)) {
 		atomic_long_inc(&tun->rx_frame_errors);
 		kfree_skb(skb);
 		ret = -EINVAL;
@@ -2812,6 +2838,8 @@ static void tun_get_iff(struct tun_struct *tun, struct ifreq *ifr)
 
 }
 
+#define PLAIN_GSO (NETIF_F_GSO_UDP_L4 | NETIF_F_TSO | NETIF_F_TSO6)
+
 /* This is like a cut-down ethtool ops, except done via tun fd so no
  * privs required. */
 static int set_offload(struct tun_struct *tun, unsigned long arg)
@@ -2841,6 +2869,18 @@ static int set_offload(struct tun_struct *tun, unsigned long arg)
 			features |= NETIF_F_GSO_UDP_L4;
 			arg &= ~(TUN_F_USO4 | TUN_F_USO6);
 		}
+
+		/*
+		 * Tunnel offload is allowed only if some plain offload is
+		 * available, too.
+		 */
+		if (features & PLAIN_GSO && arg & TUN_F_UDP_TUNNEL_GSO) {
+			features |= NETIF_F_GSO_UDP_TUNNEL;
+			if (arg & TUN_F_UDP_TUNNEL_GSO_CSUM)
+				features |= NETIF_F_GSO_UDP_TUNNEL_CSUM;
+			arg &= ~(TUN_F_UDP_TUNNEL_GSO |
+				 TUN_F_UDP_TUNNEL_GSO_CSUM);
+		}
 	}
 
 	/* This gives the user a way to test for new features in future by
diff --git a/drivers/net/tun_vnet.h b/drivers/net/tun_vnet.h
index 58b9ac7a5fc4..81662328b2c7 100644
--- a/drivers/net/tun_vnet.h
+++ b/drivers/net/tun_vnet.h
@@ -6,6 +6,8 @@
 #define TUN_VNET_LE     0x80000000
 #define TUN_VNET_BE     0x40000000
 
+#define TUN_VNET_TNL_SIZE	sizeof(struct virtio_net_hdr_v1_hash_tunnel)
+
 static inline bool tun_vnet_legacy_is_little_endian(unsigned int flags)
 {
 	bool be = IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE) &&
@@ -107,16 +109,26 @@ static inline long tun_vnet_ioctl(int *vnet_hdr_sz, unsigned int *flags,
 	}
 }
 
-static inline int tun_vnet_hdr_get(int sz, unsigned int flags,
-				   struct iov_iter *from,
-				   struct virtio_net_hdr *hdr)
+static inline unsigned int tun_vnet_parse_size(netdev_features_t features)
+{
+	if (!(features & NETIF_F_GSO_UDP_TUNNEL))
+		return sizeof(struct virtio_net_hdr);
+
+	return TUN_VNET_TNL_SIZE;
+}
+
+static inline int __tun_vnet_hdr_get(int sz, unsigned int flags,
+				     netdev_features_t features,
+				     struct iov_iter *from,
+				     struct virtio_net_hdr *hdr)
 {
+	unsigned int parsed_size = tun_vnet_parse_size(features);
 	u16 hdr_len;
 
 	if (iov_iter_count(from) < sz)
 		return -EINVAL;
 
-	if (!copy_from_iter_full(hdr, sizeof(*hdr), from))
+	if (!copy_from_iter_full(hdr, parsed_size, from))
 		return -EFAULT;
 
 	hdr_len = tun_vnet16_to_cpu(flags, hdr->hdr_len);
@@ -129,32 +141,70 @@ static inline int tun_vnet_hdr_get(int sz, unsigned int flags,
 	if (hdr_len > iov_iter_count(from))
 		return -EINVAL;
 
-	iov_iter_advance(from, sz - sizeof(*hdr));
+	iov_iter_advance(from, sz - parsed_size);
 
 	return hdr_len;
 }
 
-static inline int tun_vnet_hdr_put(int sz, struct iov_iter *iter,
-				   const struct virtio_net_hdr *hdr)
+static inline int tun_vnet_hdr_get(int sz, unsigned int flags,
+				   struct iov_iter *from,
+				   struct virtio_net_hdr *hdr)
+{
+	return __tun_vnet_hdr_get(sz, flags, 0, from, hdr);
+}
+
+static inline int __tun_vnet_hdr_put(int sz, netdev_features_t features,
+				     struct iov_iter *iter,
+				     const struct virtio_net_hdr *hdr)
 {
+	unsigned int parsed_size = tun_vnet_parse_size(features);
+
 	if (unlikely(iov_iter_count(iter) < sz))
 		return -EINVAL;
 
-	if (unlikely(copy_to_iter(hdr, sizeof(*hdr), iter) != sizeof(*hdr)))
+	if (unlikely(copy_to_iter(hdr, parsed_size, iter) != parsed_size))
 		return -EFAULT;
 
-	if (iov_iter_zero(sz - sizeof(*hdr), iter) != sz - sizeof(*hdr))
+	if (iov_iter_zero(sz - parsed_size, iter) != sz - parsed_size)
 		return -EFAULT;
 
 	return 0;
 }
 
+static inline int tun_vnet_hdr_put(int sz, struct iov_iter *iter,
+				   const struct virtio_net_hdr *hdr)
+{
+	return __tun_vnet_hdr_put(sz, 0, iter, hdr);
+}
+
 static inline int tun_vnet_hdr_to_skb(unsigned int flags, struct sk_buff *skb,
 				      const struct virtio_net_hdr *hdr)
 {
 	return virtio_net_hdr_to_skb(skb, hdr, tun_vnet_is_little_endian(flags));
 }
 
+/*
+ * Tun is not aware of the negotiated guest features, guess them from the
+ * virtio net hdr size
+ */
+static inline netdev_features_t tun_vnet_hdr_guest_features(int vnet_hdr_sz)
+{
+	if (vnet_hdr_sz >= TUN_VNET_TNL_SIZE)
+		return NETIF_F_GSO_UDP_TUNNEL | NETIF_F_GSO_UDP_TUNNEL_CSUM;
+	return 0;
+}
+
+static inline int
+tun_vnet_hdr_tnl_to_skb(unsigned int flags, netdev_features_t features,
+			struct sk_buff *skb,
+			const struct virtio_net_hdr_v1_hash_tunnel *hdr)
+{
+	return virtio_net_hdr_tnl_to_skb(skb, hdr,
+				features & NETIF_F_GSO_UDP_TUNNEL,
+				features & NETIF_F_GSO_UDP_TUNNEL_CSUM,
+				tun_vnet_is_little_endian(flags));
+}
+
 static inline int tun_vnet_hdr_from_skb(unsigned int flags,
 					const struct net_device *dev,
 					const struct sk_buff *skb,
@@ -183,4 +233,37 @@ static inline int tun_vnet_hdr_from_skb(unsigned int flags,
 	return 0;
 }
 
+static inline int
+tun_vnet_hdr_tnl_from_skb(unsigned int flags,
+			  const struct net_device *dev,
+			  const struct sk_buff *skb,
+			  struct virtio_net_hdr_v1_hash_tunnel *tnl_hdr)
+{
+	bool has_tnl_offload = !!(dev->features & NETIF_F_GSO_UDP_TUNNEL);
+	int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0;
+
+	if (virtio_net_hdr_tnl_from_skb(skb, tnl_hdr, has_tnl_offload,
+					tun_vnet_is_little_endian(flags),
+					vlan_hlen)) {
+		struct virtio_net_hdr_v1 *hdr = &tnl_hdr->hash_hdr.hdr;
+		struct skb_shared_info *sinfo = skb_shinfo(skb);
+
+		if (net_ratelimit()) {
+			int hdr_len = tun_vnet16_to_cpu(flags, hdr->hdr_len);
+
+			netdev_err(dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n",
+				   sinfo->gso_type,
+				   tun_vnet16_to_cpu(flags, hdr->gso_size),
+				   tun_vnet16_to_cpu(flags, hdr->hdr_len));
+			print_hex_dump(KERN_ERR, "tun: ", DUMP_PREFIX_NONE,
+				       16, 1, skb->head, min(hdr_len, 64),
+				       true);
+		}
+		WARN_ON_ONCE(1);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 #endif /* TUN_VNET_H */
diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index 287cdc81c939..79d53c7a1ebd 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -93,6 +93,15 @@
 #define TUN_F_USO4	0x20	/* I can handle USO for IPv4 packets */
 #define TUN_F_USO6	0x40	/* I can handle USO for IPv6 packets */
 
+/* I can handle TSO/USO for UDP tunneled packets */
+#define TUN_F_UDP_TUNNEL_GSO		0x080
+
+/*
+ * I can handle TSO/USO for UDP tunneled packets requiring csum offload for
+ * the outer header
+ */
+#define TUN_F_UDP_TUNNEL_GSO_CSUM	0x100
+
 /* Protocol info prepended to the packets (when IFF_NO_PI is not set) */
 #define TUN_PKT_STRIP	0x0001
 struct tun_pi {
-- 
cgit v1.2.3


From 58074a0fce66c6c97b35ce8a28ed4e7b780f9a8f Mon Sep 17 00:00:00 2001
From: "Rob Herring (Arm)" <robh@kernel.org>
Date: Wed, 11 Jun 2025 13:01:14 -0500
Subject: perf: arm_pmuv3: Add support for the Branch Record Buffer Extension
 (BRBE)

The ARMv9.2 architecture introduces the optional Branch Record Buffer
Extension (BRBE), which records information about branches as they are
executed into a set of branch record registers. BRBE is similar to x86's
Last Branch Record (LBR) and PowerPC's Branch History Rolling Buffer
(BHRB).

BRBE supports filtering by exception level and can filter just the
source or target address if excluded to avoid leaking privileged
addresses. The h/w filter would be sufficient except when there are
multiple events with disjoint filtering requirements. In this case, BRBE
is configured with a union of all the events' desired branches, and then
the recorded branches are filtered based on each event's filter. For
example, with one event capturing kernel events and another event
capturing user events, BRBE will be configured to capture both kernel
and user branches. When handling event overflow, the branch records have
to be filtered by software to only include kernel or user branch
addresses for that event. In contrast, x86 simply configures LBR using
the last installed event which seems broken.

It is possible on x86 to configure branch filter such that no branches
are ever recorded (e.g. -j save_type). For BRBE, events with a
configuration that will result in no samples are rejected.

As on x86, recording branches in KVM guests is not supported. However,
perf on x86 allows requesting branch recording in guests. The guest
events are recorded, but the resulting branches are all from the host.
For BRBE, events with branch recording and "exclude_host" set are
rejected. Requiring "exclude_guest" to be set did not work. The default
for the perf tool does set "exclude_guest" if no exception level
options are specified. However, specifying kernel or user events
defaults to including both host and guest. In this case, only host
branches are recorded.

BRBE can support some additional exception branch types compared to
x86. On x86, all exceptions other than syscalls are recorded as IRQ.
With BRBE, it is possible to better categorize these exceptions. One
limitation relative to x86 is we cannot distinguish a syscall return
from other exception returns. So all exception returns are recorded as
ERET type. The FIQ branch type is omitted as the only FIQ user is Apple
platforms which don't support BRBE. The debug branch types are omitted
as there is no clear need for them.

BRBE records are invalidated whenever events are reconfigured, a new
task is scheduled in, or after recording is paused (and the records
have been recorded for the event). The architecture allows branch
records to be invalidated by the PE under implementation defined
conditions. It is expected that these conditions are rare.

Cc: Catalin Marinas <catalin.marinas@arm.com>
Co-developed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Co-developed-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: James Clark <james.clark@linaro.org>
Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
Tested-by: Adam Young <admiyo@os.amperecomputing.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Link: https://lore.kernel.org/r/20250611-arm-brbe-v19-v23-4-e7775563036e@kernel.org
[will: Fix sparse warnings about mixed declarations and code.
       Fix C99 comment syntax.]
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/Kconfig         |  11 +
 drivers/perf/Makefile        |   1 +
 drivers/perf/arm_brbe.c      | 805 +++++++++++++++++++++++++++++++++++++++++++
 drivers/perf/arm_brbe.h      |  47 +++
 drivers/perf/arm_pmu.c       |  16 +-
 drivers/perf/arm_pmuv3.c     | 107 +++++-
 include/linux/perf/arm_pmu.h |   8 +
 7 files changed, 990 insertions(+), 5 deletions(-)
 create mode 100644 drivers/perf/arm_brbe.c
 create mode 100644 drivers/perf/arm_brbe.h

(limited to 'include')

diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
index 278c929dc87a..a9188dec36fe 100644
--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -223,6 +223,17 @@ config ARM_SPE_PMU
 	  Extension, which provides periodic sampling of operations in
 	  the CPU pipeline and reports this via the perf AUX interface.
 
+config ARM64_BRBE
+	bool "Enable support for branch stack sampling using FEAT_BRBE"
+	depends on ARM_PMUV3 && ARM64
+	default y
+	help
+	  Enable perf support for Branch Record Buffer Extension (BRBE) which
+	  records all branches taken in an execution path. This supports some
+	  branch types and privilege based filtering. It captures additional
+	  relevant information such as cycle count, misprediction and branch
+	  type, branch privilege level etc.
+
 config ARM_DMC620_PMU
 	tristate "Enable PMU support for the ARM DMC-620 memory controller"
 	depends on (ARM64 && ACPI) || COMPILE_TEST
diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
index de71d2574857..192fc8b16204 100644
--- a/drivers/perf/Makefile
+++ b/drivers/perf/Makefile
@@ -23,6 +23,7 @@ obj-$(CONFIG_STARFIVE_STARLINK_PMU) += starfive_starlink_pmu.o
 obj-$(CONFIG_THUNDERX2_PMU) += thunderx2_pmu.o
 obj-$(CONFIG_XGENE_PMU) += xgene_pmu.o
 obj-$(CONFIG_ARM_SPE_PMU) += arm_spe_pmu.o
+obj-$(CONFIG_ARM64_BRBE) += arm_brbe.o
 obj-$(CONFIG_ARM_DMC620_PMU) += arm_dmc620_pmu.o
 obj-$(CONFIG_MARVELL_CN10K_TAD_PMU) += marvell_cn10k_tad_pmu.o
 obj-$(CONFIG_MARVELL_CN10K_DDR_PMU) += marvell_cn10k_ddr_pmu.o
diff --git a/drivers/perf/arm_brbe.c b/drivers/perf/arm_brbe.c
new file mode 100644
index 000000000000..ba554e0c846c
--- /dev/null
+++ b/drivers/perf/arm_brbe.c
@@ -0,0 +1,805 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Branch Record Buffer Extension Driver.
+ *
+ * Copyright (C) 2022-2025 ARM Limited
+ *
+ * Author: Anshuman Khandual <anshuman.khandual@arm.com>
+ */
+#include <linux/types.h>
+#include <linux/bitmap.h>
+#include <linux/perf/arm_pmu.h>
+#include "arm_brbe.h"
+
+#define BRBFCR_EL1_BRANCH_FILTERS (BRBFCR_EL1_DIRECT   | \
+				   BRBFCR_EL1_INDIRECT | \
+				   BRBFCR_EL1_RTN      | \
+				   BRBFCR_EL1_INDCALL  | \
+				   BRBFCR_EL1_DIRCALL  | \
+				   BRBFCR_EL1_CONDDIR)
+
+/*
+ * BRBTS_EL1 is currently not used for branch stack implementation
+ * purpose but BRBCR_ELx.TS needs to have a valid value from all
+ * available options. BRBCR_ELx_TS_VIRTUAL is selected for this.
+ */
+#define BRBCR_ELx_DEFAULT_TS      FIELD_PREP(BRBCR_ELx_TS_MASK, BRBCR_ELx_TS_VIRTUAL)
+
+/*
+ * BRBE Buffer Organization
+ *
+ * BRBE buffer is arranged as multiple banks of 32 branch record
+ * entries each. An individual branch record in a given bank can
+ * be accessed by selecting the bank in BRBFCR_EL1.BANK and then
+ * accessing the register set, i.e. [BRBSRC, BRBTGT, BRBINF], with
+ * indices [0..31].
+ *
+ * Bank 0
+ *
+ *	---------------------------------	------
+ *	| 00 | BRBSRC | BRBTGT | BRBINF |	| 00 |
+ *	---------------------------------	------
+ *	| 01 | BRBSRC | BRBTGT | BRBINF |	| 01 |
+ *	---------------------------------	------
+ *	| .. | BRBSRC | BRBTGT | BRBINF |	| .. |
+ *	---------------------------------	------
+ *	| 31 | BRBSRC | BRBTGT | BRBINF |	| 31 |
+ *	---------------------------------	------
+ *
+ * Bank 1
+ *
+ *	---------------------------------	------
+ *	| 32 | BRBSRC | BRBTGT | BRBINF |	| 00 |
+ *	---------------------------------	------
+ *	| 33 | BRBSRC | BRBTGT | BRBINF |	| 01 |
+ *	---------------------------------	------
+ *	| .. | BRBSRC | BRBTGT | BRBINF |	| .. |
+ *	---------------------------------	------
+ *	| 63 | BRBSRC | BRBTGT | BRBINF |	| 31 |
+ *	---------------------------------	------
+ */
+#define BRBE_BANK_MAX_ENTRIES	32
+
+struct brbe_regset {
+	u64 brbsrc;
+	u64 brbtgt;
+	u64 brbinf;
+};
+
+#define PERF_BR_ARM64_MAX (PERF_BR_MAX + PERF_BR_NEW_MAX)
+
+struct brbe_hw_attr {
+	int	brbe_version;
+	int	brbe_cc;
+	int	brbe_nr;
+	int	brbe_format;
+};
+
+#define BRBE_REGN_CASE(n, case_macro) \
+	case n: case_macro(n); break
+
+#define BRBE_REGN_SWITCH(x, case_macro)				\
+	do {							\
+		switch (x) {					\
+		BRBE_REGN_CASE(0, case_macro);			\
+		BRBE_REGN_CASE(1, case_macro);			\
+		BRBE_REGN_CASE(2, case_macro);			\
+		BRBE_REGN_CASE(3, case_macro);			\
+		BRBE_REGN_CASE(4, case_macro);			\
+		BRBE_REGN_CASE(5, case_macro);			\
+		BRBE_REGN_CASE(6, case_macro);			\
+		BRBE_REGN_CASE(7, case_macro);			\
+		BRBE_REGN_CASE(8, case_macro);			\
+		BRBE_REGN_CASE(9, case_macro);			\
+		BRBE_REGN_CASE(10, case_macro);			\
+		BRBE_REGN_CASE(11, case_macro);			\
+		BRBE_REGN_CASE(12, case_macro);			\
+		BRBE_REGN_CASE(13, case_macro);			\
+		BRBE_REGN_CASE(14, case_macro);			\
+		BRBE_REGN_CASE(15, case_macro);			\
+		BRBE_REGN_CASE(16, case_macro);			\
+		BRBE_REGN_CASE(17, case_macro);			\
+		BRBE_REGN_CASE(18, case_macro);			\
+		BRBE_REGN_CASE(19, case_macro);			\
+		BRBE_REGN_CASE(20, case_macro);			\
+		BRBE_REGN_CASE(21, case_macro);			\
+		BRBE_REGN_CASE(22, case_macro);			\
+		BRBE_REGN_CASE(23, case_macro);			\
+		BRBE_REGN_CASE(24, case_macro);			\
+		BRBE_REGN_CASE(25, case_macro);			\
+		BRBE_REGN_CASE(26, case_macro);			\
+		BRBE_REGN_CASE(27, case_macro);			\
+		BRBE_REGN_CASE(28, case_macro);			\
+		BRBE_REGN_CASE(29, case_macro);			\
+		BRBE_REGN_CASE(30, case_macro);			\
+		BRBE_REGN_CASE(31, case_macro);			\
+		default: WARN(1, "Invalid BRB* index %d\n", x);	\
+		}						\
+	} while (0)
+
+#define RETURN_READ_BRBSRCN(n) \
+	return read_sysreg_s(SYS_BRBSRC_EL1(n))
+static inline u64 get_brbsrc_reg(int idx)
+{
+	BRBE_REGN_SWITCH(idx, RETURN_READ_BRBSRCN);
+	return 0;
+}
+
+#define RETURN_READ_BRBTGTN(n) \
+	return read_sysreg_s(SYS_BRBTGT_EL1(n))
+static u64 get_brbtgt_reg(int idx)
+{
+	BRBE_REGN_SWITCH(idx, RETURN_READ_BRBTGTN);
+	return 0;
+}
+
+#define RETURN_READ_BRBINFN(n) \
+	return read_sysreg_s(SYS_BRBINF_EL1(n))
+static u64 get_brbinf_reg(int idx)
+{
+	BRBE_REGN_SWITCH(idx, RETURN_READ_BRBINFN);
+	return 0;
+}
+
+static u64 brbe_record_valid(u64 brbinf)
+{
+	return FIELD_GET(BRBINFx_EL1_VALID_MASK, brbinf);
+}
+
+static bool brbe_invalid(u64 brbinf)
+{
+	return brbe_record_valid(brbinf) == BRBINFx_EL1_VALID_NONE;
+}
+
+static bool brbe_record_is_complete(u64 brbinf)
+{
+	return brbe_record_valid(brbinf) == BRBINFx_EL1_VALID_FULL;
+}
+
+static bool brbe_record_is_source_only(u64 brbinf)
+{
+	return brbe_record_valid(brbinf) == BRBINFx_EL1_VALID_SOURCE;
+}
+
+static bool brbe_record_is_target_only(u64 brbinf)
+{
+	return brbe_record_valid(brbinf) == BRBINFx_EL1_VALID_TARGET;
+}
+
+static int brbinf_get_in_tx(u64 brbinf)
+{
+	return FIELD_GET(BRBINFx_EL1_T_MASK, brbinf);
+}
+
+static int brbinf_get_mispredict(u64 brbinf)
+{
+	return FIELD_GET(BRBINFx_EL1_MPRED_MASK, brbinf);
+}
+
+static int brbinf_get_lastfailed(u64 brbinf)
+{
+	return FIELD_GET(BRBINFx_EL1_LASTFAILED_MASK, brbinf);
+}
+
+static u16 brbinf_get_cycles(u64 brbinf)
+{
+	u32 exp, mant, cycles;
+	/*
+	 * Captured cycle count is unknown and hence
+	 * should not be passed on to userspace.
+	 */
+	if (brbinf & BRBINFx_EL1_CCU)
+		return 0;
+
+	exp = FIELD_GET(BRBINFx_EL1_CC_EXP_MASK, brbinf);
+	mant = FIELD_GET(BRBINFx_EL1_CC_MANT_MASK, brbinf);
+
+	if (!exp)
+		return mant;
+
+	cycles = (mant | 0x100) << (exp - 1);
+
+	return min(cycles, U16_MAX);
+}
+
+static int brbinf_get_type(u64 brbinf)
+{
+	return FIELD_GET(BRBINFx_EL1_TYPE_MASK, brbinf);
+}
+
+static int brbinf_get_el(u64 brbinf)
+{
+	return FIELD_GET(BRBINFx_EL1_EL_MASK, brbinf);
+}
+
+void brbe_invalidate(void)
+{
+	/* Ensure all branches before this point are recorded */
+	isb();
+	asm volatile(BRB_IALL_INSN);
+	/* Ensure all branch records are invalidated after this point */
+	isb();
+}
+
+static bool valid_brbe_nr(int brbe_nr)
+{
+	return brbe_nr == BRBIDR0_EL1_NUMREC_8 ||
+	       brbe_nr == BRBIDR0_EL1_NUMREC_16 ||
+	       brbe_nr == BRBIDR0_EL1_NUMREC_32 ||
+	       brbe_nr == BRBIDR0_EL1_NUMREC_64;
+}
+
+static bool valid_brbe_cc(int brbe_cc)
+{
+	return brbe_cc == BRBIDR0_EL1_CC_20_BIT;
+}
+
+static bool valid_brbe_format(int brbe_format)
+{
+	return brbe_format == BRBIDR0_EL1_FORMAT_FORMAT_0;
+}
+
+static bool valid_brbidr(u64 brbidr)
+{
+	int brbe_format, brbe_cc, brbe_nr;
+
+	brbe_format = FIELD_GET(BRBIDR0_EL1_FORMAT_MASK, brbidr);
+	brbe_cc = FIELD_GET(BRBIDR0_EL1_CC_MASK, brbidr);
+	brbe_nr = FIELD_GET(BRBIDR0_EL1_NUMREC_MASK, brbidr);
+
+	return valid_brbe_format(brbe_format) && valid_brbe_cc(brbe_cc) && valid_brbe_nr(brbe_nr);
+}
+
+static bool valid_brbe_version(int brbe_version)
+{
+	return brbe_version == ID_AA64DFR0_EL1_BRBE_IMP ||
+	       brbe_version == ID_AA64DFR0_EL1_BRBE_BRBE_V1P1;
+}
+
+static void select_brbe_bank(int bank)
+{
+	u64 brbfcr;
+
+	brbfcr = read_sysreg_s(SYS_BRBFCR_EL1);
+	brbfcr &= ~BRBFCR_EL1_BANK_MASK;
+	brbfcr |= SYS_FIELD_PREP(BRBFCR_EL1, BANK, bank);
+	write_sysreg_s(brbfcr, SYS_BRBFCR_EL1);
+	/*
+	 * Arm ARM (DDI 0487K.a) D.18.4 rule PPBZP requires explicit sync
+	 * between setting BANK and accessing branch records.
+	 */
+	isb();
+}
+
+static bool __read_brbe_regset(struct brbe_regset *entry, int idx)
+{
+	entry->brbinf = get_brbinf_reg(idx);
+
+	if (brbe_invalid(entry->brbinf))
+		return false;
+
+	entry->brbsrc = get_brbsrc_reg(idx);
+	entry->brbtgt = get_brbtgt_reg(idx);
+	return true;
+}
+
+/*
+ * Generic perf branch filters supported on BRBE
+ *
+ * New branch filters must be evaluated to determine whether they can be
+ * supported on BRBE. This ensures that such filters are not simply accepted
+ * only to fail silently. PERF_SAMPLE_BRANCH_HV is a special case that is
+ * selectively supported only on platforms where the kernel is in hyp mode.
+ */
+#define BRBE_EXCLUDE_BRANCH_FILTERS (PERF_SAMPLE_BRANCH_ABORT_TX	| \
+				     PERF_SAMPLE_BRANCH_IN_TX		| \
+				     PERF_SAMPLE_BRANCH_NO_TX		| \
+				     PERF_SAMPLE_BRANCH_CALL_STACK	| \
+				     PERF_SAMPLE_BRANCH_COUNTERS)
+
+#define BRBE_ALLOWED_BRANCH_TYPES   (PERF_SAMPLE_BRANCH_ANY		| \
+				     PERF_SAMPLE_BRANCH_ANY_CALL	| \
+				     PERF_SAMPLE_BRANCH_ANY_RETURN	| \
+				     PERF_SAMPLE_BRANCH_IND_CALL	| \
+				     PERF_SAMPLE_BRANCH_COND		| \
+				     PERF_SAMPLE_BRANCH_IND_JUMP	| \
+				     PERF_SAMPLE_BRANCH_CALL)
+
+
+#define BRBE_ALLOWED_BRANCH_FILTERS (PERF_SAMPLE_BRANCH_USER		| \
+				     PERF_SAMPLE_BRANCH_KERNEL		| \
+				     PERF_SAMPLE_BRANCH_HV		| \
+				     BRBE_ALLOWED_BRANCH_TYPES		| \
+				     PERF_SAMPLE_BRANCH_NO_FLAGS	| \
+				     PERF_SAMPLE_BRANCH_NO_CYCLES	| \
+				     PERF_SAMPLE_BRANCH_TYPE_SAVE	| \
+				     PERF_SAMPLE_BRANCH_HW_INDEX	| \
+				     PERF_SAMPLE_BRANCH_PRIV_SAVE)
+
+#define BRBE_PERF_BRANCH_FILTERS    (BRBE_ALLOWED_BRANCH_FILTERS	| \
+				     BRBE_EXCLUDE_BRANCH_FILTERS)
+
+/*
+ * BRBE supports the following functional branch type filters while
+ * generating branch records. These branch filters can be enabled,
+ * either individually or as a group i.e ORing multiple filters
+ * with each other.
+ *
+ * BRBFCR_EL1_CONDDIR  - Conditional direct branch
+ * BRBFCR_EL1_DIRCALL  - Direct call
+ * BRBFCR_EL1_INDCALL  - Indirect call
+ * BRBFCR_EL1_INDIRECT - Indirect branch
+ * BRBFCR_EL1_DIRECT   - Direct branch
+ * BRBFCR_EL1_RTN      - Subroutine return
+ */
+static u64 branch_type_to_brbfcr(int branch_type)
+{
+	u64 brbfcr = 0;
+
+	if (branch_type & PERF_SAMPLE_BRANCH_ANY) {
+		brbfcr |= BRBFCR_EL1_BRANCH_FILTERS;
+		return brbfcr;
+	}
+
+	if (branch_type & PERF_SAMPLE_BRANCH_ANY_CALL) {
+		brbfcr |= BRBFCR_EL1_INDCALL;
+		brbfcr |= BRBFCR_EL1_DIRCALL;
+	}
+
+	if (branch_type & PERF_SAMPLE_BRANCH_ANY_RETURN)
+		brbfcr |= BRBFCR_EL1_RTN;
+
+	if (branch_type & PERF_SAMPLE_BRANCH_IND_CALL)
+		brbfcr |= BRBFCR_EL1_INDCALL;
+
+	if (branch_type & PERF_SAMPLE_BRANCH_COND)
+		brbfcr |= BRBFCR_EL1_CONDDIR;
+
+	if (branch_type & PERF_SAMPLE_BRANCH_IND_JUMP)
+		brbfcr |= BRBFCR_EL1_INDIRECT;
+
+	if (branch_type & PERF_SAMPLE_BRANCH_CALL)
+		brbfcr |= BRBFCR_EL1_DIRCALL;
+
+	return brbfcr;
+}
+
+/*
+ * BRBE supports the following privilege mode filters while generating
+ * branch records.
+ *
+ * BRBCR_ELx_E0BRE - EL0 branch records
+ * BRBCR_ELx_ExBRE - EL1/EL2 branch records
+ *
+ * BRBE also supports the following additional functional branch type
+ * filters while generating branch records.
+ *
+ * BRBCR_ELx_EXCEPTION - Exception
+ * BRBCR_ELx_ERTN      - Exception return
+ */
+static u64 branch_type_to_brbcr(int branch_type)
+{
+	u64 brbcr = BRBCR_ELx_FZP | BRBCR_ELx_DEFAULT_TS;
+
+	if (branch_type & PERF_SAMPLE_BRANCH_USER)
+		brbcr |= BRBCR_ELx_E0BRE;
+
+	/*
+	 * When running in the hyp mode, writing into BRBCR_EL1
+	 * actually writes into BRBCR_EL2 instead. Field E2BRE
+	 * is also at the same position as E1BRE.
+	 */
+	if (branch_type & PERF_SAMPLE_BRANCH_KERNEL)
+		brbcr |= BRBCR_ELx_ExBRE;
+
+	if (branch_type & PERF_SAMPLE_BRANCH_HV) {
+		if (is_kernel_in_hyp_mode())
+			brbcr |= BRBCR_ELx_ExBRE;
+	}
+
+	if (!(branch_type & PERF_SAMPLE_BRANCH_NO_CYCLES))
+		brbcr |= BRBCR_ELx_CC;
+
+	if (!(branch_type & PERF_SAMPLE_BRANCH_NO_FLAGS))
+		brbcr |= BRBCR_ELx_MPRED;
+
+	/*
+	 * The exception and exception return branches could be
+	 * captured, irrespective of the perf event's privilege.
+	 * If the perf event does not have enough privilege for
+	 * a given exception level, then addresses which fall
+	 * under that exception level will be reported as zero
+	 * for the captured branch record, creating source only
+	 * or target only records.
+	 */
+	if (branch_type & PERF_SAMPLE_BRANCH_KERNEL) {
+		if (branch_type & PERF_SAMPLE_BRANCH_ANY) {
+			brbcr |= BRBCR_ELx_EXCEPTION;
+			brbcr |= BRBCR_ELx_ERTN;
+		}
+
+		if (branch_type & PERF_SAMPLE_BRANCH_ANY_CALL)
+			brbcr |= BRBCR_ELx_EXCEPTION;
+
+		if (branch_type & PERF_SAMPLE_BRANCH_ANY_RETURN)
+			brbcr |= BRBCR_ELx_ERTN;
+	}
+	return brbcr;
+}
+
+bool brbe_branch_attr_valid(struct perf_event *event)
+{
+	u64 branch_type = event->attr.branch_sample_type;
+
+	/*
+	 * Ensure both perf branch filter allowed and exclude
+	 * masks are always in sync with the generic perf ABI.
+	 */
+	BUILD_BUG_ON(BRBE_PERF_BRANCH_FILTERS != (PERF_SAMPLE_BRANCH_MAX - 1));
+
+	if (branch_type & BRBE_EXCLUDE_BRANCH_FILTERS) {
+		pr_debug("requested branch filter not supported 0x%llx\n", branch_type);
+		return false;
+	}
+
+	/* Ensure at least 1 branch type is enabled */
+	if (!(branch_type & BRBE_ALLOWED_BRANCH_TYPES)) {
+		pr_debug("no branch type enabled 0x%llx\n", branch_type);
+		return false;
+	}
+
+	/*
+	 * No branches are recorded in guests nor nVHE hypervisors, so
+	 * excluding the host or both kernel and user is invalid.
+	 *
+	 * Ideally we'd just require exclude_guest and exclude_hv, but setting
+	 * event filters with perf for kernel or user doesn't set exclude_guest.
+	 * So effectively, exclude_guest and exclude_hv are ignored.
+	 */
+	if (event->attr.exclude_host || (event->attr.exclude_user && event->attr.exclude_kernel)) {
+		pr_debug("branch filter in hypervisor or guest only not supported 0x%llx\n", branch_type);
+		return false;
+	}
+
+	event->hw.branch_reg.config = branch_type_to_brbfcr(event->attr.branch_sample_type);
+	event->hw.extra_reg.config = branch_type_to_brbcr(event->attr.branch_sample_type);
+
+	return true;
+}
+
+unsigned int brbe_num_branch_records(const struct arm_pmu *armpmu)
+{
+	return FIELD_GET(BRBIDR0_EL1_NUMREC_MASK, armpmu->reg_brbidr);
+}
+
+void brbe_probe(struct arm_pmu *armpmu)
+{
+	u64 brbidr, aa64dfr0 = read_sysreg_s(SYS_ID_AA64DFR0_EL1);
+	u32 brbe;
+
+	brbe = cpuid_feature_extract_unsigned_field(aa64dfr0, ID_AA64DFR0_EL1_BRBE_SHIFT);
+	if (!valid_brbe_version(brbe))
+		return;
+
+	brbidr = read_sysreg_s(SYS_BRBIDR0_EL1);
+	if (!valid_brbidr(brbidr))
+		return;
+
+	armpmu->reg_brbidr = brbidr;
+}
+
+/*
+ * BRBE is assumed to be disabled/paused on entry
+ */
+void brbe_enable(const struct arm_pmu *arm_pmu)
+{
+	struct pmu_hw_events *cpuc = this_cpu_ptr(arm_pmu->hw_events);
+	u64 brbfcr = 0, brbcr = 0;
+
+	/*
+	 * Discard existing records to avoid a discontinuity, e.g. records
+	 * missed during handling an overflow.
+	 */
+	brbe_invalidate();
+
+	/*
+	 * Merge the permitted branch filters of all events.
+	 */
+	for (int i = 0; i < ARMPMU_MAX_HWEVENTS; i++) {
+		struct perf_event *event = cpuc->events[i];
+
+		if (event && has_branch_stack(event)) {
+			brbfcr |= event->hw.branch_reg.config;
+			brbcr |= event->hw.extra_reg.config;
+		}
+	}
+
+	/*
+	 * In VHE mode with MDCR_EL2.HPMN equal to PMCR_EL0.N, BRBCR_EL1.FZP
+	 * controls freezing the branch records on counter overflow rather than
+	 * BRBCR_EL2.FZP (which writes to BRBCR_EL1 are redirected to).
+	 * The exception levels are enabled/disabled in BRBCR_EL2, so keep EL1
+	 * and EL0 recording disabled for guests.
+	 *
+	 * As BRBCR_EL1 CC and MPRED bits also need to match, use the same
+	 * value for both registers just masking the exception levels.
+	 */
+	if (is_kernel_in_hyp_mode())
+		write_sysreg_s(brbcr & ~(BRBCR_ELx_ExBRE | BRBCR_ELx_E0BRE), SYS_BRBCR_EL12);
+	write_sysreg_s(brbcr, SYS_BRBCR_EL1);
+	/* Ensure BRBCR_ELx settings take effect before unpausing */
+	isb();
+
+	/* Finally write SYS_BRBFCR_EL1 to unpause BRBE */
+	write_sysreg_s(brbfcr, SYS_BRBFCR_EL1);
+	/* Synchronization in PMCR write ensures ordering WRT PMU enabling */
+}
+
+void brbe_disable(void)
+{
+	/*
+	 * No need for synchronization here as synchronization in PMCR write
+	 * ensures ordering and in the interrupt handler this is a NOP as
+	 * we're already paused.
+	 */
+	write_sysreg_s(BRBFCR_EL1_PAUSED, SYS_BRBFCR_EL1);
+	write_sysreg_s(0, SYS_BRBCR_EL1);
+}
+
+static const int brbe_type_to_perf_type_map[BRBINFx_EL1_TYPE_DEBUG_EXIT + 1][2] = {
+	[BRBINFx_EL1_TYPE_DIRECT_UNCOND] = { PERF_BR_UNCOND, 0 },
+	[BRBINFx_EL1_TYPE_INDIRECT] = { PERF_BR_IND, 0 },
+	[BRBINFx_EL1_TYPE_DIRECT_LINK] = { PERF_BR_CALL, 0 },
+	[BRBINFx_EL1_TYPE_INDIRECT_LINK] = { PERF_BR_IND_CALL, 0 },
+	[BRBINFx_EL1_TYPE_RET] = { PERF_BR_RET, 0 },
+	[BRBINFx_EL1_TYPE_DIRECT_COND] = { PERF_BR_COND, 0 },
+	[BRBINFx_EL1_TYPE_CALL] = { PERF_BR_SYSCALL, 0 },
+	[BRBINFx_EL1_TYPE_ERET] = { PERF_BR_ERET, 0 },
+	[BRBINFx_EL1_TYPE_IRQ] = { PERF_BR_IRQ, 0 },
+	[BRBINFx_EL1_TYPE_TRAP] = { PERF_BR_IRQ, 0 },
+	[BRBINFx_EL1_TYPE_SERROR] = { PERF_BR_SERROR, 0 },
+	[BRBINFx_EL1_TYPE_ALIGN_FAULT] = { PERF_BR_EXTEND_ABI, PERF_BR_NEW_FAULT_ALGN },
+	[BRBINFx_EL1_TYPE_INSN_FAULT] = { PERF_BR_EXTEND_ABI, PERF_BR_NEW_FAULT_INST },
+	[BRBINFx_EL1_TYPE_DATA_FAULT] = { PERF_BR_EXTEND_ABI, PERF_BR_NEW_FAULT_DATA },
+};
+
+static void brbe_set_perf_entry_type(struct perf_branch_entry *entry, u64 brbinf)
+{
+	int brbe_type = brbinf_get_type(brbinf);
+
+	if (brbe_type <= BRBINFx_EL1_TYPE_DEBUG_EXIT) {
+		const int *br_type = brbe_type_to_perf_type_map[brbe_type];
+
+		entry->type = br_type[0];
+		entry->new_type = br_type[1];
+	}
+}
+
+static int brbinf_get_perf_priv(u64 brbinf)
+{
+	int brbe_el = brbinf_get_el(brbinf);
+
+	switch (brbe_el) {
+	case BRBINFx_EL1_EL_EL0:
+		return PERF_BR_PRIV_USER;
+	case BRBINFx_EL1_EL_EL1:
+		return PERF_BR_PRIV_KERNEL;
+	case BRBINFx_EL1_EL_EL2:
+		if (is_kernel_in_hyp_mode())
+			return PERF_BR_PRIV_KERNEL;
+		return PERF_BR_PRIV_HV;
+	default:
+		pr_warn_once("%d - unknown branch privilege captured\n", brbe_el);
+		return PERF_BR_PRIV_UNKNOWN;
+	}
+}
+
+static bool perf_entry_from_brbe_regset(int index, struct perf_branch_entry *entry,
+					const struct perf_event *event)
+{
+	struct brbe_regset bregs;
+	u64 brbinf;
+
+	if (!__read_brbe_regset(&bregs, index))
+		return false;
+
+	brbinf = bregs.brbinf;
+	perf_clear_branch_entry_bitfields(entry);
+	if (brbe_record_is_complete(brbinf)) {
+		entry->from = bregs.brbsrc;
+		entry->to = bregs.brbtgt;
+	} else if (brbe_record_is_source_only(brbinf)) {
+		entry->from = bregs.brbsrc;
+		entry->to = 0;
+	} else if (brbe_record_is_target_only(brbinf)) {
+		entry->from = 0;
+		entry->to = bregs.brbtgt;
+	}
+
+	brbe_set_perf_entry_type(entry, brbinf);
+
+	if (!branch_sample_no_cycles(event))
+		entry->cycles = brbinf_get_cycles(brbinf);
+
+	if (!branch_sample_no_flags(event)) {
+		/* Mispredict info is available for source only and complete branch records. */
+		if (!brbe_record_is_target_only(brbinf)) {
+			entry->mispred = brbinf_get_mispredict(brbinf);
+			entry->predicted = !entry->mispred;
+		}
+
+		/*
+		 * Currently the TME feature is neither implemented in any
+		 * hardware nor supported in the kernel. Just warn here once
+		 * if TME related information shows up rather unexpectedly.
+		 */
+		if (brbinf_get_lastfailed(brbinf) || brbinf_get_in_tx(brbinf))
+			pr_warn_once("Unknown transaction states\n");
+	}
+
+	/*
+	 * Branch privilege level is available for target only and complete
+	 * branch records.
+	 */
+	if (!brbe_record_is_source_only(brbinf))
+		entry->priv = brbinf_get_perf_priv(brbinf);
+
+	return true;
+}
+
+#define PERF_BR_ARM64_ALL (				\
+	BIT(PERF_BR_COND) |				\
+	BIT(PERF_BR_UNCOND) |				\
+	BIT(PERF_BR_IND) |				\
+	BIT(PERF_BR_CALL) |				\
+	BIT(PERF_BR_IND_CALL) |				\
+	BIT(PERF_BR_RET))
+
+#define PERF_BR_ARM64_ALL_KERNEL (			\
+	BIT(PERF_BR_SYSCALL) |				\
+	BIT(PERF_BR_IRQ) |				\
+	BIT(PERF_BR_SERROR) |				\
+	BIT(PERF_BR_MAX + PERF_BR_NEW_FAULT_ALGN) |	\
+	BIT(PERF_BR_MAX + PERF_BR_NEW_FAULT_DATA) |	\
+	BIT(PERF_BR_MAX + PERF_BR_NEW_FAULT_INST))
+
+static void prepare_event_branch_type_mask(u64 branch_sample,
+					   unsigned long *event_type_mask)
+{
+	if (branch_sample & PERF_SAMPLE_BRANCH_ANY) {
+		if (branch_sample & PERF_SAMPLE_BRANCH_KERNEL)
+			bitmap_from_u64(event_type_mask,
+				BIT(PERF_BR_ERET) | PERF_BR_ARM64_ALL |
+				PERF_BR_ARM64_ALL_KERNEL);
+		else
+			bitmap_from_u64(event_type_mask, PERF_BR_ARM64_ALL);
+		return;
+	}
+
+	bitmap_zero(event_type_mask, PERF_BR_ARM64_MAX);
+
+	if (branch_sample & PERF_SAMPLE_BRANCH_ANY_CALL) {
+		if (branch_sample & PERF_SAMPLE_BRANCH_KERNEL)
+			bitmap_from_u64(event_type_mask, PERF_BR_ARM64_ALL_KERNEL);
+
+		set_bit(PERF_BR_CALL, event_type_mask);
+		set_bit(PERF_BR_IND_CALL, event_type_mask);
+	}
+
+	if (branch_sample & PERF_SAMPLE_BRANCH_IND_JUMP)
+		set_bit(PERF_BR_IND, event_type_mask);
+
+	if (branch_sample & PERF_SAMPLE_BRANCH_COND)
+		set_bit(PERF_BR_COND, event_type_mask);
+
+	if (branch_sample & PERF_SAMPLE_BRANCH_CALL)
+		set_bit(PERF_BR_CALL, event_type_mask);
+
+	if (branch_sample & PERF_SAMPLE_BRANCH_IND_CALL)
+		set_bit(PERF_BR_IND_CALL, event_type_mask);
+
+	if (branch_sample & PERF_SAMPLE_BRANCH_ANY_RETURN) {
+		set_bit(PERF_BR_RET, event_type_mask);
+
+		if (branch_sample & PERF_SAMPLE_BRANCH_KERNEL)
+			set_bit(PERF_BR_ERET, event_type_mask);
+	}
+}
+
+/*
+ * BRBE is configured with an OR of permissions from all events, so there may
+ * be events which have to be dropped or events where just the source or target
+ * address has to be zeroed.
+ */
+static bool filter_branch_privilege(struct perf_branch_entry *entry, u64 branch_sample_type)
+{
+	bool from_user = access_ok((void __user *)(unsigned long)entry->from, 4);
+	bool to_user = access_ok((void __user *)(unsigned long)entry->to, 4);
+	bool exclude_kernel = !((branch_sample_type & PERF_SAMPLE_BRANCH_KERNEL) ||
+		(is_kernel_in_hyp_mode() && (branch_sample_type & PERF_SAMPLE_BRANCH_HV)));
+
+	/* We can only have a half record if permissions have not been expanded */
+	if (!entry->from || !entry->to)
+		return true;
+
+	/*
+	 * If record is within a single exception level, just need to either
+	 * drop or keep the entire record.
+	 */
+	if (from_user == to_user)
+		return ((entry->priv == PERF_BR_PRIV_KERNEL) && !exclude_kernel) ||
+			((entry->priv == PERF_BR_PRIV_USER) &&
+			 (branch_sample_type & PERF_SAMPLE_BRANCH_USER));
+
+	/*
+	 * Record is across exception levels, mask addresses for the exception
+	 * level we're not capturing.
+	 */
+	if (!(branch_sample_type & PERF_SAMPLE_BRANCH_USER)) {
+		if (from_user)
+			entry->from = 0;
+		if (to_user)
+			entry->to = 0;
+	}
+
+	if (exclude_kernel) {
+		if (!from_user)
+			entry->from = 0;
+		if (!to_user)
+			entry->to = 0;
+	}
+
+	return true;
+}
+
+static bool filter_branch_type(struct perf_branch_entry *entry,
+			       const unsigned long *event_type_mask)
+{
+	if (entry->type == PERF_BR_EXTEND_ABI)
+		return test_bit(PERF_BR_MAX + entry->new_type, event_type_mask);
+	else
+		return test_bit(entry->type, event_type_mask);
+}
+
+static bool filter_branch_record(struct perf_branch_entry *entry,
+				 u64 branch_sample,
+				 const unsigned long *event_type_mask)
+{
+	return filter_branch_type(entry, event_type_mask) &&
+		filter_branch_privilege(entry, branch_sample);
+}
+
+void brbe_read_filtered_entries(struct perf_branch_stack *branch_stack,
+				const struct perf_event *event)
+{
+	struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
+	int nr_hw = brbe_num_branch_records(cpu_pmu);
+	int nr_banks = DIV_ROUND_UP(nr_hw, BRBE_BANK_MAX_ENTRIES);
+	int nr_filtered = 0;
+	u64 branch_sample_type = event->attr.branch_sample_type;
+	DECLARE_BITMAP(event_type_mask, PERF_BR_ARM64_MAX);
+
+	prepare_event_branch_type_mask(branch_sample_type, event_type_mask);
+
+	for (int bank = 0; bank < nr_banks; bank++) {
+		int nr_remaining = nr_hw - (bank * BRBE_BANK_MAX_ENTRIES);
+		int nr_this_bank = min(nr_remaining, BRBE_BANK_MAX_ENTRIES);
+
+		select_brbe_bank(bank);
+
+		for (int i = 0; i < nr_this_bank; i++) {
+			struct perf_branch_entry *pbe = &branch_stack->entries[nr_filtered];
+
+			if (!perf_entry_from_brbe_regset(i, pbe, event))
+				goto done;
+
+			if (!filter_branch_record(pbe, branch_sample_type, event_type_mask))
+				continue;
+
+			nr_filtered++;
+		}
+	}
+
+done:
+	branch_stack->nr = nr_filtered;
+}
diff --git a/drivers/perf/arm_brbe.h b/drivers/perf/arm_brbe.h
new file mode 100644
index 000000000000..b7c7d8796c86
--- /dev/null
+++ b/drivers/perf/arm_brbe.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Branch Record Buffer Extension Helpers.
+ *
+ * Copyright (C) 2022-2025 ARM Limited
+ *
+ * Author: Anshuman Khandual <anshuman.khandual@arm.com>
+ */
+
+struct arm_pmu;
+struct perf_branch_stack;
+struct perf_event;
+
+#ifdef CONFIG_ARM64_BRBE
+void brbe_probe(struct arm_pmu *arm_pmu);
+unsigned int brbe_num_branch_records(const struct arm_pmu *armpmu);
+void brbe_invalidate(void);
+
+void brbe_enable(const struct arm_pmu *arm_pmu);
+void brbe_disable(void);
+
+bool brbe_branch_attr_valid(struct perf_event *event);
+void brbe_read_filtered_entries(struct perf_branch_stack *branch_stack,
+				const struct perf_event *event);
+#else
+static inline void brbe_probe(struct arm_pmu *arm_pmu) { }
+static inline unsigned int brbe_num_branch_records(const struct arm_pmu *armpmu)
+{
+	return 0;
+}
+
+static inline void brbe_invalidate(void) { }
+
+static inline void brbe_enable(const struct arm_pmu *arm_pmu) { }
+static inline void brbe_disable(void) { }
+
+static inline bool brbe_branch_attr_valid(struct perf_event *event)
+{
+	WARN_ON_ONCE(!has_branch_stack(event));
+	return false;
+}
+
+static inline void brbe_read_filtered_entries(struct perf_branch_stack *branch_stack,
+					      const struct perf_event *event)
+{
+}
+#endif
diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index 2f33e69a8caf..5c310e803dd7 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -99,7 +99,7 @@ static const struct pmu_irq_ops percpu_pmunmi_ops = {
 	.free_pmuirq = armpmu_free_percpu_pmunmi
 };
 
-static DEFINE_PER_CPU(struct arm_pmu *, cpu_armpmu);
+DEFINE_PER_CPU(struct arm_pmu *, cpu_armpmu);
 static DEFINE_PER_CPU(int, cpu_irq);
 static DEFINE_PER_CPU(const struct pmu_irq_ops *, cpu_irq_ops);
 
@@ -318,6 +318,12 @@ armpmu_del(struct perf_event *event, int flags)
 	int idx = hwc->idx;
 
 	armpmu_stop(event, PERF_EF_UPDATE);
+
+	if (has_branch_stack(event)) {
+		hw_events->branch_users--;
+		perf_sched_cb_dec(event->pmu);
+	}
+
 	hw_events->events[idx] = NULL;
 	armpmu->clear_event_idx(hw_events, event);
 	perf_event_update_userpage(event);
@@ -345,6 +351,11 @@ armpmu_add(struct perf_event *event, int flags)
 	/* The newly-allocated counter should be empty */
 	WARN_ON_ONCE(hw_events->events[idx]);
 
+	if (has_branch_stack(event)) {
+		hw_events->branch_users++;
+		perf_sched_cb_inc(event->pmu);
+	}
+
 	event->hw.idx = idx;
 	hw_events->events[idx] = event;
 
@@ -509,8 +520,7 @@ static int armpmu_event_init(struct perf_event *event)
 		!cpumask_test_cpu(event->cpu, &armpmu->supported_cpus))
 		return -ENOENT;
 
-	/* does not support taken branch sampling */
-	if (has_branch_stack(event))
+	if (has_branch_stack(event) && !armpmu->reg_brbidr)
 		return -EOPNOTSUPP;
 
 	return __hw_perf_event_init(event);
diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
index 3db9f4ed17e8..f6d7bab5d555 100644
--- a/drivers/perf/arm_pmuv3.c
+++ b/drivers/perf/arm_pmuv3.c
@@ -25,6 +25,8 @@
 #include <linux/smp.h>
 #include <linux/nmi.h>
 
+#include "arm_brbe.h"
+
 /* ARMv8 Cortex-A53 specific event types. */
 #define ARMV8_A53_PERFCTR_PREF_LINEFILL				0xC2
 
@@ -438,7 +440,19 @@ static ssize_t threshold_max_show(struct device *dev,
 
 static DEVICE_ATTR_RO(threshold_max);
 
+static ssize_t branches_show(struct device *dev,
+			     struct device_attribute *attr, char *page)
+{
+	struct pmu *pmu = dev_get_drvdata(dev);
+	struct arm_pmu *cpu_pmu = container_of(pmu, struct arm_pmu, pmu);
+
+	return sysfs_emit(page, "%u\n", brbe_num_branch_records(cpu_pmu));
+}
+
+static DEVICE_ATTR_RO(branches);
+
 static struct attribute *armv8_pmuv3_caps_attrs[] = {
+	&dev_attr_branches.attr,
 	&dev_attr_slots.attr,
 	&dev_attr_bus_slots.attr,
 	&dev_attr_bus_width.attr,
@@ -446,9 +460,22 @@ static struct attribute *armv8_pmuv3_caps_attrs[] = {
 	NULL,
 };
 
+static umode_t caps_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+	struct device *dev = kobj_to_dev(kobj);
+	struct pmu *pmu = dev_get_drvdata(dev);
+	struct arm_pmu *cpu_pmu = container_of(pmu, struct arm_pmu, pmu);
+
+	if (i == 0)
+		return brbe_num_branch_records(cpu_pmu) ? attr->mode : 0;
+
+	return attr->mode;
+}
+
 static const struct attribute_group armv8_pmuv3_caps_attr_group = {
 	.name = "caps",
 	.attrs = armv8_pmuv3_caps_attrs,
+	.is_visible = caps_is_visible,
 };
 
 /*
@@ -809,6 +836,7 @@ static void armv8pmu_disable_event(struct perf_event *event)
 static void armv8pmu_start(struct arm_pmu *cpu_pmu)
 {
 	struct perf_event_context *ctx;
+	struct pmu_hw_events *hw_events = this_cpu_ptr(cpu_pmu->hw_events);
 	int nr_user = 0;
 
 	ctx = perf_cpu_task_ctx();
@@ -822,16 +850,34 @@ static void armv8pmu_start(struct arm_pmu *cpu_pmu)
 
 	kvm_vcpu_pmu_resync_el0();
 
+	if (hw_events->branch_users)
+		brbe_enable(cpu_pmu);
+
 	/* Enable all counters */
 	armv8pmu_pmcr_write(armv8pmu_pmcr_read() | ARMV8_PMU_PMCR_E);
 }
 
 static void armv8pmu_stop(struct arm_pmu *cpu_pmu)
 {
+	struct pmu_hw_events *hw_events = this_cpu_ptr(cpu_pmu->hw_events);
+
+	if (hw_events->branch_users)
+		brbe_disable();
+
 	/* Disable all counters */
 	armv8pmu_pmcr_write(armv8pmu_pmcr_read() & ~ARMV8_PMU_PMCR_E);
 }
 
+static void read_branch_records(struct pmu_hw_events *cpuc,
+				struct perf_event *event,
+				struct perf_sample_data *data)
+{
+	struct perf_branch_stack *branch_stack = cpuc->branch_stack;
+
+	brbe_read_filtered_entries(branch_stack, event);
+	perf_sample_save_brstack(data, event, branch_stack, NULL);
+}
+
 static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
 {
 	u64 pmovsr;
@@ -882,6 +928,9 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
 		if (!armpmu_event_set_period(event))
 			continue;
 
+		if (has_branch_stack(event))
+			read_branch_records(cpuc, event, &data);
+
 		/*
 		 * Perf event overflow will queue the processing of the event as
 		 * an irq_work which will be taken care of in the handling of
@@ -938,7 +987,7 @@ static int armv8pmu_get_event_idx(struct pmu_hw_events *cpuc,
 
 	/* Always prefer to place a cycle counter into the cycle counter. */
 	if ((evtype == ARMV8_PMUV3_PERFCTR_CPU_CYCLES) &&
-	    !armv8pmu_event_get_threshold(&event->attr)) {
+	    !armv8pmu_event_get_threshold(&event->attr) && !has_branch_stack(event)) {
 		if (!test_and_set_bit(ARMV8_PMU_CYCLE_IDX, cpuc->used_mask))
 			return ARMV8_PMU_CYCLE_IDX;
 		else if (armv8pmu_event_is_64bit(event) &&
@@ -987,6 +1036,19 @@ static int armv8pmu_user_event_idx(struct perf_event *event)
 	return event->hw.idx + 1;
 }
 
+static void armv8pmu_sched_task(struct perf_event_pmu_context *pmu_ctx,
+				struct task_struct *task, bool sched_in)
+{
+	struct arm_pmu *armpmu = *this_cpu_ptr(&cpu_armpmu);
+	struct pmu_hw_events *hw_events = this_cpu_ptr(armpmu->hw_events);
+
+	if (!hw_events->branch_users)
+		return;
+
+	if (sched_in)
+		brbe_invalidate();
+}
+
 /*
  * Add an event filter to a given event.
  */
@@ -1004,6 +1066,13 @@ static int armv8pmu_set_event_filter(struct hw_perf_event *event,
 		return -EOPNOTSUPP;
 	}
 
+	if (has_branch_stack(perf_event)) {
+		if (!brbe_num_branch_records(cpu_pmu) || !brbe_branch_attr_valid(perf_event))
+			return -EOPNOTSUPP;
+
+		perf_event->attach_state |= PERF_ATTACH_SCHED_CB;
+	}
+
 	/*
 	 * If we're running in hyp mode, then we *are* the hypervisor.
 	 * Therefore we ignore exclude_hv in this configuration, since
@@ -1070,6 +1139,11 @@ static void armv8pmu_reset(void *info)
 	/* Clear the counters we flip at guest entry/exit */
 	kvm_clr_pmu_events(mask);
 
+	if (brbe_num_branch_records(cpu_pmu)) {
+		brbe_disable();
+		brbe_invalidate();
+	}
+
 	/*
 	 * Initialize & Reset PMNC. Request overflow interrupt for
 	 * 64 bit cycle counter but cheat in armv8pmu_write_counter().
@@ -1238,6 +1312,25 @@ static void __armv8pmu_probe_pmu(void *info)
 		cpu_pmu->reg_pmmir = read_pmmir();
 	else
 		cpu_pmu->reg_pmmir = 0;
+
+	brbe_probe(cpu_pmu);
+}
+
+static int branch_records_alloc(struct arm_pmu *armpmu)
+{
+	size_t size = struct_size_t(struct perf_branch_stack, entries,
+				    brbe_num_branch_records(armpmu));
+	int cpu;
+
+	for_each_cpu(cpu, &armpmu->supported_cpus) {
+		struct pmu_hw_events *events_cpu;
+
+		events_cpu = per_cpu_ptr(armpmu->hw_events, cpu);
+		events_cpu->branch_stack = kmalloc(size, GFP_KERNEL);
+		if (!events_cpu->branch_stack)
+			return -ENOMEM;
+	}
+	return 0;
 }
 
 static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
@@ -1254,7 +1347,15 @@ static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
 	if (ret)
 		return ret;
 
-	return probe.present ? 0 : -ENODEV;
+	if (!probe.present)
+		return -ENODEV;
+
+	if (brbe_num_branch_records(cpu_pmu)) {
+		ret = branch_records_alloc(cpu_pmu);
+		if (ret)
+			return ret;
+	}
+	return 0;
 }
 
 static void armv8pmu_disable_user_access_ipi(void *unused)
@@ -1313,6 +1414,8 @@ static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name,
 	cpu_pmu->set_event_filter	= armv8pmu_set_event_filter;
 
 	cpu_pmu->pmu.event_idx		= armv8pmu_user_event_idx;
+	if (brbe_num_branch_records(cpu_pmu))
+		cpu_pmu->pmu.sched_task		= armv8pmu_sched_task;
 
 	cpu_pmu->name			= name;
 	cpu_pmu->map_event		= map_event;
diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
index 6dc5e0cd76ca..93c9a26492fc 100644
--- a/include/linux/perf/arm_pmu.h
+++ b/include/linux/perf/arm_pmu.h
@@ -70,6 +70,11 @@ struct pmu_hw_events {
 	struct arm_pmu		*percpu_pmu;
 
 	int irq;
+
+	struct perf_branch_stack	*branch_stack;
+
+	/* Active events requesting branch records */
+	unsigned int		branch_users;
 };
 
 enum armpmu_attr_groups {
@@ -115,6 +120,7 @@ struct arm_pmu {
 	/* PMUv3 only */
 	int		pmuver;
 	u64		reg_pmmir;
+	u64		reg_brbidr;
 #define ARMV8_PMUV3_MAX_COMMON_EVENTS		0x40
 	DECLARE_BITMAP(pmceid_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS);
 #define ARMV8_PMUV3_EXT_COMMON_EVENT_BASE	0x4000
@@ -126,6 +132,8 @@ struct arm_pmu {
 
 #define to_arm_pmu(p) (container_of(p, struct arm_pmu, pmu))
 
+DECLARE_PER_CPU(struct arm_pmu *, cpu_armpmu);
+
 u64 armpmu_event_update(struct perf_event *event);
 
 int armpmu_event_set_period(struct perf_event *event);
-- 
cgit v1.2.3


From fc582cd26e888b0652bc1494f252329453fd3b23 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 8 Jul 2025 11:00:32 -0600
Subject: io_uring/msg_ring: ensure io_kiocb freeing is deferred for RCU

syzbot reports that defer/local task_work adding via msg_ring can hit
a request that has been freed:

CPU: 1 UID: 0 PID: 19356 Comm: iou-wrk-19354 Not tainted 6.16.0-rc4-syzkaller-00108-g17bbde2e1716 #0 PREEMPT(full)
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/07/2025
Call Trace:
 <TASK>
 dump_stack_lvl+0x189/0x250 lib/dump_stack.c:120
 print_address_description mm/kasan/report.c:408 [inline]
 print_report+0xd2/0x2b0 mm/kasan/report.c:521
 kasan_report+0x118/0x150 mm/kasan/report.c:634
 io_req_local_work_add io_uring/io_uring.c:1184 [inline]
 __io_req_task_work_add+0x589/0x950 io_uring/io_uring.c:1252
 io_msg_remote_post io_uring/msg_ring.c:103 [inline]
 io_msg_data_remote io_uring/msg_ring.c:133 [inline]
 __io_msg_ring_data+0x820/0xaa0 io_uring/msg_ring.c:151
 io_msg_ring_data io_uring/msg_ring.c:173 [inline]
 io_msg_ring+0x134/0xa00 io_uring/msg_ring.c:314
 __io_issue_sqe+0x17e/0x4b0 io_uring/io_uring.c:1739
 io_issue_sqe+0x165/0xfd0 io_uring/io_uring.c:1762
 io_wq_submit_work+0x6e9/0xb90 io_uring/io_uring.c:1874
 io_worker_handle_work+0x7cd/0x1180 io_uring/io-wq.c:642
 io_wq_worker+0x42f/0xeb0 io_uring/io-wq.c:696
 ret_from_fork+0x3fc/0x770 arch/x86/kernel/process.c:148
 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245
 </TASK>

which is supposed to be safe with how requests are allocated. But msg
ring requests alloc and free on their own, and hence must defer freeing
to a sane time.

Add an rcu_head and use kfree_rcu() in both spots where requests are
freed. Only the one in io_msg_tw_complete() is strictly required as it
has been visible on the other ring, but use it consistently in the other
spot as well.

This should not cause any other issues outside of KASAN rightfully
complaining about it.

Link: https://lore.kernel.org/io-uring/686cd2ea.a00a0220.338033.0007.GAE@google.com/
Reported-by: syzbot+54cbbfb4db9145d26fc2@syzkaller.appspotmail.com
Cc: stable@vger.kernel.org
Fixes: 0617bb500bfa ("io_uring/msg_ring: improve handling of target CQE posting")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h | 2 ++
 io_uring/msg_ring.c            | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 2922635986f5..a7efcec2e3d0 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -698,6 +698,8 @@ struct io_kiocb {
 		struct hlist_node	hash_node;
 		/* For IOPOLL setup queues, with hybrid polling */
 		u64                     iopoll_start;
+		/* for private io_kiocb freeing */
+		struct rcu_head		rcu_head;
 	};
 	/* internal polling, see IORING_FEAT_FAST_POLL */
 	struct async_poll		*apoll;
diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index 71400d6cefc8..4c2578f2efcb 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -82,7 +82,7 @@ static void io_msg_tw_complete(struct io_kiocb *req, io_tw_token_t tw)
 		spin_unlock(&ctx->msg_lock);
 	}
 	if (req)
-		kmem_cache_free(req_cachep, req);
+		kfree_rcu(req, rcu_head);
 	percpu_ref_put(&ctx->refs);
 }
 
@@ -90,7 +90,7 @@ static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req,
 			      int res, u32 cflags, u64 user_data)
 {
 	if (!READ_ONCE(ctx->submitter_task)) {
-		kmem_cache_free(req_cachep, req);
+		kfree_rcu(req, rcu_head);
 		return -EOWNERDEAD;
 	}
 	req->opcode = IORING_OP_NOP;
-- 
cgit v1.2.3


From 7ec80fb3f025825e860b433685fb801d6de34bf3 Mon Sep 17 00:00:00 2001
From: Lorenzo Pieralisi <lpieralisi@kernel.org>
Date: Thu, 3 Jul 2025 12:25:10 +0200
Subject: irqchip/gic-v5: Add GICv5 PPI support

The GICv5 CPU interface implements support for PE-Private Peripheral
Interrupts (PPI), that are handled (enabled/prioritized/delivered)
entirely within the CPU interface hardware.

To enable PPI interrupts, implement the baseline GICv5 host kernel
driver infrastructure required to handle interrupts on a GICv5 system.

Add the exception handling code path and definitions for GICv5
instructions.

Add GICv5 PPI handling code as a specific IRQ domain to:

- Set-up PPI priority
- Manage PPI configuration and state
- Manage IRQ flow handler
- IRQs allocation/free
- Hook-up a PPI specific IRQchip to provide the relevant methods

PPI IRQ priority is chosen as the minimum allowed priority by the
system design (after probing the number of priority bits implemented
by the CPU interface).

Co-developed-by: Sascha Bischoff <sascha.bischoff@arm.com>
Signed-off-by: Sascha Bischoff <sascha.bischoff@arm.com>
Co-developed-by: Timothy Hayes <timothy.hayes@arm.com>
Signed-off-by: Timothy Hayes <timothy.hayes@arm.com>
Signed-off-by: Lorenzo Pieralisi <lpieralisi@kernel.org>
Reviewed-by: Marc Zyngier <maz@kernel.org>
Cc: Will Deacon <will@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Marc Zyngier <maz@kernel.org>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Link: https://lore.kernel.org/r/20250703-gicv5-host-v7-20-12e71f1b3528@kernel.org
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 MAINTAINERS                        |   2 +
 arch/arm64/include/asm/sysreg.h    |  19 ++
 drivers/irqchip/Kconfig            |   5 +
 drivers/irqchip/Makefile           |   1 +
 drivers/irqchip/irq-gic-v5.c       | 464 +++++++++++++++++++++++++++++++++++++
 include/linux/irqchip/arm-gic-v5.h |  19 ++
 6 files changed, 510 insertions(+)
 create mode 100644 drivers/irqchip/irq-gic-v5.c
 create mode 100644 include/linux/irqchip/arm-gic-v5.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index c5452096c593..b1202987eef5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1970,6 +1970,8 @@ M:	Marc Zyngier <maz@kernel.org>
 L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:	Maintained
 F:	Documentation/devicetree/bindings/interrupt-controller/arm,gic-v5*.yaml
+F:	drivers/irqchip/irq-gic-v5*.[ch]
+F:	include/linux/irqchip/arm-gic-v5.h
 
 ARM HDLCD DRM DRIVER
 M:	Liviu Dudau <liviu.dudau@arm.com>
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 9b5fc6389715..36b82d74db37 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -1082,6 +1082,25 @@
 
 #define GCS_CAP(x)	((((unsigned long)x) & GCS_CAP_ADDR_MASK) | \
 					       GCS_CAP_VALID_TOKEN)
+/*
+ * Definitions for GICv5 instructions
+ */
+#define GICV5_OP_GIC_CDDI		sys_insn(1, 0, 12, 2, 0)
+#define GICV5_OP_GIC_CDEOI		sys_insn(1, 0, 12, 1, 7)
+#define GICV5_OP_GICR_CDIA		sys_insn(1, 0, 12, 3, 0)
+
+/* Definitions for GIC CDDI */
+#define GICV5_GIC_CDDI_TYPE_MASK	GENMASK_ULL(31, 29)
+#define GICV5_GIC_CDDI_ID_MASK		GENMASK_ULL(23, 0)
+
+/* Definitions for GICR CDIA */
+#define GICV5_GIC_CDIA_VALID_MASK	BIT_ULL(32)
+#define GICV5_GICR_CDIA_VALID(r)	FIELD_GET(GICV5_GIC_CDIA_VALID_MASK, r)
+#define GICV5_GIC_CDIA_TYPE_MASK	GENMASK_ULL(31, 29)
+#define GICV5_GIC_CDIA_ID_MASK		GENMASK_ULL(23, 0)
+
+#define gicr_insn(insn)			read_sysreg_s(GICV5_OP_GICR_##insn)
+#define gic_insn(v, insn)		write_sysreg_s(v, GICV5_OP_GIC_##insn)
 
 #define ARM64_FEATURE_FIELD_BITS	4
 
diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig
index 0d196e447142..3e4fb08b7a4d 100644
--- a/drivers/irqchip/Kconfig
+++ b/drivers/irqchip/Kconfig
@@ -54,6 +54,11 @@ config ARM_GIC_V3_ITS_FSL_MC
 	depends on FSL_MC_BUS
 	default ARM_GIC_V3_ITS
 
+config ARM_GIC_V5
+	bool
+	select IRQ_DOMAIN_HIERARCHY
+	select GENERIC_IRQ_EFFECTIVE_AFF_MASK
+
 config ARM_NVIC
 	bool
 	select IRQ_DOMAIN_HIERARCHY
diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile
index 23ca4959e6ce..3d75659d99eb 100644
--- a/drivers/irqchip/Makefile
+++ b/drivers/irqchip/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_ARM_GIC_V3)		+= irq-gic-v3.o irq-gic-v3-mbi.o irq-gic-common.o
 obj-$(CONFIG_ARM_GIC_V3_ITS)		+= irq-gic-v3-its.o irq-gic-v4.o irq-gic-v3-its-msi-parent.o
 obj-$(CONFIG_ARM_GIC_V3_ITS_FSL_MC)	+= irq-gic-v3-its-fsl-mc-msi.o
 obj-$(CONFIG_PARTITION_PERCPU)		+= irq-partition-percpu.o
+obj-$(CONFIG_ARM_GIC_V5)		+= irq-gic-v5.o
 obj-$(CONFIG_HISILICON_IRQ_MBIGEN)	+= irq-mbigen.o
 obj-$(CONFIG_ARM_NVIC)			+= irq-nvic.o
 obj-$(CONFIG_ARM_VIC)			+= irq-vic.o
diff --git a/drivers/irqchip/irq-gic-v5.c b/drivers/irqchip/irq-gic-v5.c
new file mode 100644
index 000000000000..0bb940212e20
--- /dev/null
+++ b/drivers/irqchip/irq-gic-v5.c
@@ -0,0 +1,464 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2024-2025 ARM Limited, All Rights Reserved.
+ */
+
+#define pr_fmt(fmt)	"GICv5: " fmt
+
+#include <linux/irqdomain.h>
+#include <linux/wordpart.h>
+
+#include <linux/irqchip.h>
+#include <linux/irqchip/arm-gic-v5.h>
+
+#include <asm/cpufeature.h>
+#include <asm/exception.h>
+
+static u8 pri_bits __ro_after_init = 5;
+
+#define GICV5_IRQ_PRI_MASK	0x1f
+#define GICV5_IRQ_PRI_MI	(GICV5_IRQ_PRI_MASK & GENMASK(4, 5 - pri_bits))
+
+#define PPI_NR	128
+
+static bool gicv5_cpuif_has_gcie(void)
+{
+	return this_cpu_has_cap(ARM64_HAS_GICV5_CPUIF);
+}
+
+struct gicv5_chip_data {
+	struct fwnode_handle	*fwnode;
+	struct irq_domain	*ppi_domain;
+};
+
+static struct gicv5_chip_data gicv5_global_data __read_mostly;
+
+static void gicv5_ppi_priority_init(void)
+{
+	write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_PRI_MI), SYS_ICC_PPI_PRIORITYR0_EL1);
+	write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_PRI_MI), SYS_ICC_PPI_PRIORITYR1_EL1);
+	write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_PRI_MI), SYS_ICC_PPI_PRIORITYR2_EL1);
+	write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_PRI_MI), SYS_ICC_PPI_PRIORITYR3_EL1);
+	write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_PRI_MI), SYS_ICC_PPI_PRIORITYR4_EL1);
+	write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_PRI_MI), SYS_ICC_PPI_PRIORITYR5_EL1);
+	write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_PRI_MI), SYS_ICC_PPI_PRIORITYR6_EL1);
+	write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_PRI_MI), SYS_ICC_PPI_PRIORITYR7_EL1);
+	write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_PRI_MI), SYS_ICC_PPI_PRIORITYR8_EL1);
+	write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_PRI_MI), SYS_ICC_PPI_PRIORITYR9_EL1);
+	write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_PRI_MI), SYS_ICC_PPI_PRIORITYR10_EL1);
+	write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_PRI_MI), SYS_ICC_PPI_PRIORITYR11_EL1);
+	write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_PRI_MI), SYS_ICC_PPI_PRIORITYR12_EL1);
+	write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_PRI_MI), SYS_ICC_PPI_PRIORITYR13_EL1);
+	write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_PRI_MI), SYS_ICC_PPI_PRIORITYR14_EL1);
+	write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_PRI_MI), SYS_ICC_PPI_PRIORITYR15_EL1);
+
+	/*
+	 * Context synchronization required to make sure system register writes
+	 * effects are synchronised.
+	 */
+	isb();
+}
+
+static void gicv5_ppi_irq_mask(struct irq_data *d)
+{
+	u64 hwirq_id_bit = BIT_ULL(d->hwirq % 64);
+
+	if (d->hwirq < 64)
+		sysreg_clear_set_s(SYS_ICC_PPI_ENABLER0_EL1, hwirq_id_bit, 0);
+	else
+		sysreg_clear_set_s(SYS_ICC_PPI_ENABLER1_EL1, hwirq_id_bit, 0);
+
+	/*
+	 * We must ensure that the disable takes effect immediately to
+	 * guarantee that the lazy-disabled IRQ mechanism works.
+	 * A context synchronization event is required to guarantee it.
+	 * Reference: I_ZLTKB/R_YRGMH GICv5 specification - section 2.9.1.
+	 */
+	isb();
+}
+
+static void gicv5_ppi_irq_unmask(struct irq_data *d)
+{
+	u64 hwirq_id_bit = BIT_ULL(d->hwirq % 64);
+
+	if (d->hwirq < 64)
+		sysreg_clear_set_s(SYS_ICC_PPI_ENABLER0_EL1, 0, hwirq_id_bit);
+	else
+		sysreg_clear_set_s(SYS_ICC_PPI_ENABLER1_EL1, 0, hwirq_id_bit);
+	/*
+	 * We must ensure that the enable takes effect in finite time - a
+	 * context synchronization event is required to guarantee it, we
+	 * can not take for granted that would happen (eg a core going straight
+	 * into idle after enabling a PPI).
+	 * Reference: I_ZLTKB/R_YRGMH GICv5 specification - section 2.9.1.
+	 */
+	isb();
+}
+
+static void gicv5_hwirq_eoi(u32 hwirq_id, u8 hwirq_type)
+{
+	u64 cddi;
+
+	cddi = FIELD_PREP(GICV5_GIC_CDDI_ID_MASK, hwirq_id)	|
+	       FIELD_PREP(GICV5_GIC_CDDI_TYPE_MASK, hwirq_type);
+
+	gic_insn(cddi, CDDI);
+
+	gic_insn(0, CDEOI);
+}
+
+static void gicv5_ppi_irq_eoi(struct irq_data *d)
+{
+	gicv5_hwirq_eoi(d->hwirq, GICV5_HWIRQ_TYPE_PPI);
+}
+
+enum ppi_reg {
+	PPI_PENDING,
+	PPI_ACTIVE,
+	PPI_HM
+};
+
+static __always_inline u64 read_ppi_sysreg_s(unsigned int irq,
+					     const enum ppi_reg which)
+{
+	switch (which) {
+	case PPI_PENDING:
+		return irq < 64	? read_sysreg_s(SYS_ICC_PPI_SPENDR0_EL1) :
+				  read_sysreg_s(SYS_ICC_PPI_SPENDR1_EL1);
+	case PPI_ACTIVE:
+		return irq < 64	? read_sysreg_s(SYS_ICC_PPI_SACTIVER0_EL1) :
+				  read_sysreg_s(SYS_ICC_PPI_SACTIVER1_EL1);
+	case PPI_HM:
+		return irq < 64	? read_sysreg_s(SYS_ICC_PPI_HMR0_EL1) :
+				  read_sysreg_s(SYS_ICC_PPI_HMR1_EL1);
+	default:
+		BUILD_BUG_ON(1);
+	}
+}
+
+static __always_inline void write_ppi_sysreg_s(unsigned int irq, bool set,
+					       const enum ppi_reg which)
+{
+	u64 bit = BIT_ULL(irq % 64);
+
+	switch (which) {
+	case PPI_PENDING:
+		if (set) {
+			if (irq < 64)
+				write_sysreg_s(bit, SYS_ICC_PPI_SPENDR0_EL1);
+			else
+				write_sysreg_s(bit, SYS_ICC_PPI_SPENDR1_EL1);
+		} else {
+			if (irq < 64)
+				write_sysreg_s(bit, SYS_ICC_PPI_CPENDR0_EL1);
+			else
+				write_sysreg_s(bit, SYS_ICC_PPI_CPENDR1_EL1);
+		}
+		return;
+	case PPI_ACTIVE:
+		if (set) {
+			if (irq < 64)
+				write_sysreg_s(bit, SYS_ICC_PPI_SACTIVER0_EL1);
+			else
+				write_sysreg_s(bit, SYS_ICC_PPI_SACTIVER1_EL1);
+		} else {
+			if (irq < 64)
+				write_sysreg_s(bit, SYS_ICC_PPI_CACTIVER0_EL1);
+			else
+				write_sysreg_s(bit, SYS_ICC_PPI_CACTIVER1_EL1);
+		}
+		return;
+	default:
+		BUILD_BUG_ON(1);
+	}
+}
+
+static int gicv5_ppi_irq_get_irqchip_state(struct irq_data *d,
+					   enum irqchip_irq_state which,
+					   bool *state)
+{
+	u64 hwirq_id_bit = BIT_ULL(d->hwirq % 64);
+
+	switch (which) {
+	case IRQCHIP_STATE_PENDING:
+		*state = !!(read_ppi_sysreg_s(d->hwirq, PPI_PENDING) & hwirq_id_bit);
+		return 0;
+	case IRQCHIP_STATE_ACTIVE:
+		*state = !!(read_ppi_sysreg_s(d->hwirq, PPI_ACTIVE) & hwirq_id_bit);
+		return 0;
+	default:
+		pr_debug("Unexpected PPI irqchip state\n");
+		return -EINVAL;
+	}
+}
+
+static int gicv5_ppi_irq_set_irqchip_state(struct irq_data *d,
+					   enum irqchip_irq_state which,
+					   bool state)
+{
+	switch (which) {
+	case IRQCHIP_STATE_PENDING:
+		write_ppi_sysreg_s(d->hwirq, state, PPI_PENDING);
+		return 0;
+	case IRQCHIP_STATE_ACTIVE:
+		write_ppi_sysreg_s(d->hwirq, state, PPI_ACTIVE);
+		return 0;
+	default:
+		pr_debug("Unexpected PPI irqchip state\n");
+		return -EINVAL;
+	}
+}
+
+static bool gicv5_ppi_irq_is_level(irq_hw_number_t hwirq)
+{
+	u64 bit = BIT_ULL(hwirq % 64);
+
+	return !!(read_ppi_sysreg_s(hwirq, PPI_HM) & bit);
+}
+
+static const struct irq_chip gicv5_ppi_irq_chip = {
+	.name			= "GICv5-PPI",
+	.irq_mask		= gicv5_ppi_irq_mask,
+	.irq_unmask		= gicv5_ppi_irq_unmask,
+	.irq_eoi		= gicv5_ppi_irq_eoi,
+	.irq_get_irqchip_state	= gicv5_ppi_irq_get_irqchip_state,
+	.irq_set_irqchip_state	= gicv5_ppi_irq_set_irqchip_state,
+	.flags			= IRQCHIP_SKIP_SET_WAKE	  |
+				  IRQCHIP_MASK_ON_SUSPEND,
+};
+
+static int gicv5_irq_ppi_domain_translate(struct irq_domain *d,
+					  struct irq_fwspec *fwspec,
+					  irq_hw_number_t *hwirq,
+					  unsigned int *type)
+{
+	if (!is_of_node(fwspec->fwnode))
+		return -EINVAL;
+
+	if (fwspec->param_count < 3)
+		return -EINVAL;
+
+	if (fwspec->param[0] != GICV5_HWIRQ_TYPE_PPI)
+		return -EINVAL;
+
+	*hwirq = fwspec->param[1];
+
+	/*
+	 * Handling mode is hardcoded for PPIs, set the type using
+	 * HW reported value.
+	 */
+	*type = gicv5_ppi_irq_is_level(*hwirq) ? IRQ_TYPE_LEVEL_LOW : IRQ_TYPE_EDGE_RISING;
+
+	return 0;
+}
+
+static int gicv5_irq_ppi_domain_alloc(struct irq_domain *domain, unsigned int virq,
+				      unsigned int nr_irqs, void *arg)
+{
+	unsigned int type = IRQ_TYPE_NONE;
+	struct irq_fwspec *fwspec = arg;
+	irq_hw_number_t hwirq;
+	int ret;
+
+	if (WARN_ON_ONCE(nr_irqs != 1))
+		return -EINVAL;
+
+	ret = gicv5_irq_ppi_domain_translate(domain, fwspec, &hwirq, &type);
+	if (ret)
+		return ret;
+
+	if (type & IRQ_TYPE_LEVEL_MASK)
+		irq_set_status_flags(virq, IRQ_LEVEL);
+
+	irq_set_percpu_devid(virq);
+	irq_domain_set_info(domain, virq, hwirq, &gicv5_ppi_irq_chip, NULL,
+			    handle_percpu_devid_irq, NULL, NULL);
+
+	return 0;
+}
+
+static void gicv5_irq_domain_free(struct irq_domain *domain, unsigned int virq,
+				  unsigned int nr_irqs)
+{
+	struct irq_data *d;
+
+	if (WARN_ON_ONCE(nr_irqs != 1))
+		return;
+
+	d = irq_domain_get_irq_data(domain, virq);
+
+	irq_set_handler(virq, NULL);
+	irq_domain_reset_irq_data(d);
+}
+
+static int gicv5_irq_ppi_domain_select(struct irq_domain *d, struct irq_fwspec *fwspec,
+				       enum irq_domain_bus_token bus_token)
+{
+	if (fwspec->fwnode != d->fwnode)
+		return 0;
+
+	if (fwspec->param[0] != GICV5_HWIRQ_TYPE_PPI)
+		return 0;
+
+	return (d == gicv5_global_data.ppi_domain);
+}
+
+static const struct irq_domain_ops gicv5_irq_ppi_domain_ops = {
+	.translate	= gicv5_irq_ppi_domain_translate,
+	.alloc		= gicv5_irq_ppi_domain_alloc,
+	.free		= gicv5_irq_domain_free,
+	.select		= gicv5_irq_ppi_domain_select
+};
+
+static void handle_irq_per_domain(u32 hwirq)
+{
+	u8 hwirq_type = FIELD_GET(GICV5_HWIRQ_TYPE, hwirq);
+	u32 hwirq_id = FIELD_GET(GICV5_HWIRQ_ID, hwirq);
+	struct irq_domain *domain;
+
+	switch (hwirq_type) {
+	case GICV5_HWIRQ_TYPE_PPI:
+		domain = gicv5_global_data.ppi_domain;
+		break;
+	default:
+		pr_err_once("Unknown IRQ type, bail out\n");
+		return;
+	}
+
+	if (generic_handle_domain_irq(domain, hwirq_id)) {
+		pr_err_once("Could not handle, hwirq = 0x%x\n", hwirq_id);
+		gicv5_hwirq_eoi(hwirq_id, hwirq_type);
+	}
+}
+
+static void __exception_irq_entry gicv5_handle_irq(struct pt_regs *regs)
+{
+	bool valid;
+	u32 hwirq;
+	u64 ia;
+
+	ia = gicr_insn(CDIA);
+	valid = GICV5_GICR_CDIA_VALID(ia);
+
+	if (!valid)
+		return;
+
+	/*
+	 * Ensure that the CDIA instruction effects (ie IRQ activation) are
+	 * completed before handling the interrupt.
+	 */
+	gsb_ack();
+
+	/*
+	 * Ensure instruction ordering between an acknowledgment and subsequent
+	 * instructions in the IRQ handler using an ISB.
+	 */
+	isb();
+
+	hwirq = FIELD_GET(GICV5_HWIRQ_INTID, ia);
+
+	handle_irq_per_domain(hwirq);
+}
+
+static void gicv5_cpu_disable_interrupts(void)
+{
+	u64 cr0;
+
+	cr0 = FIELD_PREP(ICC_CR0_EL1_EN, 0);
+	write_sysreg_s(cr0, SYS_ICC_CR0_EL1);
+}
+
+static void gicv5_cpu_enable_interrupts(void)
+{
+	u64 cr0, pcr;
+
+	write_sysreg_s(0, SYS_ICC_PPI_ENABLER0_EL1);
+	write_sysreg_s(0, SYS_ICC_PPI_ENABLER1_EL1);
+
+	gicv5_ppi_priority_init();
+
+	pcr = FIELD_PREP(ICC_PCR_EL1_PRIORITY, GICV5_IRQ_PRI_MI);
+	write_sysreg_s(pcr, SYS_ICC_PCR_EL1);
+
+	cr0 = FIELD_PREP(ICC_CR0_EL1_EN, 1);
+	write_sysreg_s(cr0, SYS_ICC_CR0_EL1);
+}
+
+static int gicv5_starting_cpu(unsigned int cpu)
+{
+	if (WARN(!gicv5_cpuif_has_gcie(),
+		 "GICv5 system components present but CPU does not have FEAT_GCIE"))
+		return -ENODEV;
+
+	gicv5_cpu_enable_interrupts();
+
+	return 0;
+}
+
+static void __init gicv5_free_domains(void)
+{
+	if (gicv5_global_data.ppi_domain)
+		irq_domain_remove(gicv5_global_data.ppi_domain);
+
+	gicv5_global_data.ppi_domain = NULL;
+}
+
+static int __init gicv5_init_domains(struct fwnode_handle *handle)
+{
+	struct irq_domain *d;
+
+	d = irq_domain_create_linear(handle, PPI_NR, &gicv5_irq_ppi_domain_ops, NULL);
+	if (!d)
+		return -ENOMEM;
+
+	irq_domain_update_bus_token(d, DOMAIN_BUS_WIRED);
+	gicv5_global_data.ppi_domain = d;
+	gicv5_global_data.fwnode = handle;
+
+	return 0;
+}
+
+static void gicv5_set_cpuif_pribits(void)
+{
+	u64 icc_idr0 = read_sysreg_s(SYS_ICC_IDR0_EL1);
+
+	switch (FIELD_GET(ICC_IDR0_EL1_PRI_BITS, icc_idr0)) {
+	case ICC_IDR0_EL1_PRI_BITS_4BITS:
+		pri_bits = 4;
+		break;
+	case ICC_IDR0_EL1_PRI_BITS_5BITS:
+		pri_bits = 5;
+		break;
+	default:
+		pr_err("Unexpected ICC_IDR0_EL1_PRI_BITS value, default to 4\n");
+		pri_bits = 4;
+		break;
+	}
+}
+
+static int __init gicv5_of_init(struct device_node *node, struct device_node *parent)
+{
+	int ret = gicv5_init_domains(of_fwnode_handle(node));
+	if (ret)
+		return ret;
+
+	gicv5_set_cpuif_pribits();
+
+	ret = gicv5_starting_cpu(smp_processor_id());
+	if (ret)
+		goto out_dom;
+
+	ret = set_handle_irq(gicv5_handle_irq);
+	if (ret)
+		goto out_int;
+
+	return 0;
+
+out_int:
+	gicv5_cpu_disable_interrupts();
+out_dom:
+	gicv5_free_domains();
+
+	return ret;
+}
+IRQCHIP_DECLARE(gic_v5, "arm,gic-v5", gicv5_of_init);
diff --git a/include/linux/irqchip/arm-gic-v5.h b/include/linux/irqchip/arm-gic-v5.h
new file mode 100644
index 000000000000..b08ec0308a9b
--- /dev/null
+++ b/include/linux/irqchip/arm-gic-v5.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2025 ARM Limited, All Rights Reserved.
+ */
+#ifndef __LINUX_IRQCHIP_ARM_GIC_V5_H
+#define __LINUX_IRQCHIP_ARM_GIC_V5_H
+
+#include <asm/sysreg.h>
+
+/*
+ * INTID handling
+ */
+#define GICV5_HWIRQ_ID			GENMASK(23, 0)
+#define GICV5_HWIRQ_TYPE		GENMASK(31, 29)
+#define GICV5_HWIRQ_INTID		GENMASK_ULL(31, 0)
+
+#define GICV5_HWIRQ_TYPE_PPI		UL(0x1)
+
+#endif
-- 
cgit v1.2.3


From 5cb1b6dab2def316671ea2565291a86ad58b884c Mon Sep 17 00:00:00 2001
From: Lorenzo Pieralisi <lpieralisi@kernel.org>
Date: Thu, 3 Jul 2025 12:25:11 +0200
Subject: irqchip/gic-v5: Add GICv5 IRS/SPI support

The GICv5 Interrupt Routing Service (IRS) component implements
interrupt management and routing in the GICv5 architecture.

A GICv5 system comprises one or more IRSes that together
handle the interrupt routing and state for the system.

An IRS supports Shared Peripheral Interrupts (SPIs), that are
interrupt sources directly connected to the IRS; they do not
rely on memory for storage. The number of supported SPIs is
fixed for a given implementation and can be probed through IRS
IDR registers.

SPI interrupt state and routing are managed through GICv5
instructions.

Each core (PE in GICv5 terms) in a GICv5 system is identified with
an Interrupt AFFinity ID (IAFFID).

An IRS manages a set of cores that are connected to it.

Firmware provides a topology description that the driver uses
to detect which IRS a CPU (i.e. an IAFFID) is associated with.

Use probeable information and firmware description to initialize
the IRSes and implement GICv5 IRS SPIs support through an
SPI-specific IRQ domain.

The GICv5 IRS driver:

- Probes IRSes in the system to detect SPI ranges
- Associates an IRS with a set of cores connected to it
- Adds an IRQchip structure for SPI handling

SPI priority is set to a value corresponding to the lowest
permissible priority in the system (taking into account the
implemented priority bits of the IRS and CPU interface).

Since all IRQs are set to the same priority value, the value
itself does not matter as long as it is a valid one.

Co-developed-by: Sascha Bischoff <sascha.bischoff@arm.com>
Signed-off-by: Sascha Bischoff <sascha.bischoff@arm.com>
Co-developed-by: Timothy Hayes <timothy.hayes@arm.com>
Signed-off-by: Timothy Hayes <timothy.hayes@arm.com>
Signed-off-by: Lorenzo Pieralisi <lpieralisi@kernel.org>
Reviewed-by: Marc Zyngier <maz@kernel.org>
Cc: Will Deacon <will@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Marc Zyngier <maz@kernel.org>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Link: https://lore.kernel.org/r/20250703-gicv5-host-v7-21-12e71f1b3528@kernel.org
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/include/asm/sysreg.h    |  36 +++
 drivers/irqchip/Makefile           |   2 +-
 drivers/irqchip/irq-gic-v5-irs.c   | 434 +++++++++++++++++++++++++++++++++++++
 drivers/irqchip/irq-gic-v5.c       | 347 +++++++++++++++++++++++++++--
 include/linux/irqchip/arm-gic-v5.h | 140 ++++++++++++
 5 files changed, 937 insertions(+), 22 deletions(-)
 create mode 100644 drivers/irqchip/irq-gic-v5-irs.c

(limited to 'include')

diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 36b82d74db37..efd2e7a1fbe2 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -1085,14 +1085,50 @@
 /*
  * Definitions for GICv5 instructions
  */
+#define GICV5_OP_GIC_CDAFF		sys_insn(1, 0, 12, 1, 3)
 #define GICV5_OP_GIC_CDDI		sys_insn(1, 0, 12, 2, 0)
+#define GICV5_OP_GIC_CDDIS		sys_insn(1, 0, 12, 1, 0)
+#define GICV5_OP_GIC_CDEN		sys_insn(1, 0, 12, 1, 1)
 #define GICV5_OP_GIC_CDEOI		sys_insn(1, 0, 12, 1, 7)
+#define GICV5_OP_GIC_CDPEND		sys_insn(1, 0, 12, 1, 4)
+#define GICV5_OP_GIC_CDPRI		sys_insn(1, 0, 12, 1, 2)
+#define GICV5_OP_GIC_CDRCFG		sys_insn(1, 0, 12, 1, 5)
 #define GICV5_OP_GICR_CDIA		sys_insn(1, 0, 12, 3, 0)
 
+/* Definitions for GIC CDAFF */
+#define GICV5_GIC_CDAFF_IAFFID_MASK	GENMASK_ULL(47, 32)
+#define GICV5_GIC_CDAFF_TYPE_MASK	GENMASK_ULL(31, 29)
+#define GICV5_GIC_CDAFF_IRM_MASK	BIT_ULL(28)
+#define GICV5_GIC_CDAFF_ID_MASK		GENMASK_ULL(23, 0)
+
 /* Definitions for GIC CDDI */
 #define GICV5_GIC_CDDI_TYPE_MASK	GENMASK_ULL(31, 29)
 #define GICV5_GIC_CDDI_ID_MASK		GENMASK_ULL(23, 0)
 
+/* Definitions for GIC CDDIS */
+#define GICV5_GIC_CDDIS_TYPE_MASK	GENMASK_ULL(31, 29)
+#define GICV5_GIC_CDDIS_TYPE(r)		FIELD_GET(GICV5_GIC_CDDIS_TYPE_MASK, r)
+#define GICV5_GIC_CDDIS_ID_MASK		GENMASK_ULL(23, 0)
+#define GICV5_GIC_CDDIS_ID(r)		FIELD_GET(GICV5_GIC_CDDIS_ID_MASK, r)
+
+/* Definitions for GIC CDEN */
+#define GICV5_GIC_CDEN_TYPE_MASK	GENMASK_ULL(31, 29)
+#define GICV5_GIC_CDEN_ID_MASK		GENMASK_ULL(23, 0)
+
+/* Definitions for GIC CDPEND */
+#define GICV5_GIC_CDPEND_PENDING_MASK	BIT_ULL(32)
+#define GICV5_GIC_CDPEND_TYPE_MASK	GENMASK_ULL(31, 29)
+#define GICV5_GIC_CDPEND_ID_MASK	GENMASK_ULL(23, 0)
+
+/* Definitions for GIC CDPRI */
+#define GICV5_GIC_CDPRI_PRIORITY_MASK	GENMASK_ULL(39, 35)
+#define GICV5_GIC_CDPRI_TYPE_MASK	GENMASK_ULL(31, 29)
+#define GICV5_GIC_CDPRI_ID_MASK		GENMASK_ULL(23, 0)
+
+/* Definitions for GIC CDRCFG */
+#define GICV5_GIC_CDRCFG_TYPE_MASK	GENMASK_ULL(31, 29)
+#define GICV5_GIC_CDRCFG_ID_MASK	GENMASK_ULL(23, 0)
+
 /* Definitions for GICR CDIA */
 #define GICV5_GIC_CDIA_VALID_MASK	BIT_ULL(32)
 #define GICV5_GICR_CDIA_VALID(r)	FIELD_GET(GICV5_GIC_CDIA_VALID_MASK, r)
diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile
index 3d75659d99eb..7a0e6cee09e1 100644
--- a/drivers/irqchip/Makefile
+++ b/drivers/irqchip/Makefile
@@ -36,7 +36,7 @@ obj-$(CONFIG_ARM_GIC_V3)		+= irq-gic-v3.o irq-gic-v3-mbi.o irq-gic-common.o
 obj-$(CONFIG_ARM_GIC_V3_ITS)		+= irq-gic-v3-its.o irq-gic-v4.o irq-gic-v3-its-msi-parent.o
 obj-$(CONFIG_ARM_GIC_V3_ITS_FSL_MC)	+= irq-gic-v3-its-fsl-mc-msi.o
 obj-$(CONFIG_PARTITION_PERCPU)		+= irq-partition-percpu.o
-obj-$(CONFIG_ARM_GIC_V5)		+= irq-gic-v5.o
+obj-$(CONFIG_ARM_GIC_V5)		+= irq-gic-v5.o irq-gic-v5-irs.o
 obj-$(CONFIG_HISILICON_IRQ_MBIGEN)	+= irq-mbigen.o
 obj-$(CONFIG_ARM_NVIC)			+= irq-nvic.o
 obj-$(CONFIG_ARM_VIC)			+= irq-vic.o
diff --git a/drivers/irqchip/irq-gic-v5-irs.c b/drivers/irqchip/irq-gic-v5-irs.c
new file mode 100644
index 000000000000..fba8efceb26e
--- /dev/null
+++ b/drivers/irqchip/irq-gic-v5-irs.c
@@ -0,0 +1,434 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2024-2025 ARM Limited, All Rights Reserved.
+ */
+
+#define pr_fmt(fmt)	"GICv5 IRS: " fmt
+
+#include <linux/of.h>
+#include <linux/of_address.h>
+
+#include <linux/irqchip.h>
+#include <linux/irqchip/arm-gic-v5.h>
+
+#define IRS_FLAGS_NON_COHERENT		BIT(0)
+
+static DEFINE_PER_CPU_READ_MOSTLY(struct gicv5_irs_chip_data *, per_cpu_irs_data);
+static LIST_HEAD(irs_nodes);
+
+static u32 irs_readl_relaxed(struct gicv5_irs_chip_data *irs_data,
+			     const u32 reg_offset)
+{
+	return readl_relaxed(irs_data->irs_base + reg_offset);
+}
+
+static void irs_writel_relaxed(struct gicv5_irs_chip_data *irs_data,
+			       const u32 val, const u32 reg_offset)
+{
+	writel_relaxed(val, irs_data->irs_base + reg_offset);
+}
+
+struct iaffid_entry {
+	u16	iaffid;
+	bool	valid;
+};
+
+static DEFINE_PER_CPU(struct iaffid_entry, cpu_iaffid);
+
+int gicv5_irs_cpu_to_iaffid(int cpuid, u16 *iaffid)
+{
+	if (!per_cpu(cpu_iaffid, cpuid).valid) {
+		pr_err("IAFFID for CPU %d has not been initialised\n", cpuid);
+		return -ENODEV;
+	}
+
+	*iaffid = per_cpu(cpu_iaffid, cpuid).iaffid;
+
+	return 0;
+}
+
+struct gicv5_irs_chip_data *gicv5_irs_lookup_by_spi_id(u32 spi_id)
+{
+	struct gicv5_irs_chip_data *irs_data;
+	u32 min, max;
+
+	list_for_each_entry(irs_data, &irs_nodes, entry) {
+		if (!irs_data->spi_range)
+			continue;
+
+		min = irs_data->spi_min;
+		max = irs_data->spi_min + irs_data->spi_range - 1;
+		if (spi_id >= min && spi_id <= max)
+			return irs_data;
+	}
+
+	return NULL;
+}
+
+static int gicv5_irs_wait_for_spi_op(struct gicv5_irs_chip_data *irs_data)
+{
+	u32 statusr;
+	int ret;
+
+	ret = gicv5_wait_for_op_atomic(irs_data->irs_base, GICV5_IRS_SPI_STATUSR,
+				       GICV5_IRS_SPI_STATUSR_IDLE, &statusr);
+	if (ret)
+		return ret;
+
+	return !!FIELD_GET(GICV5_IRS_SPI_STATUSR_V, statusr) ? 0 : -EIO;
+}
+
+static int gicv5_irs_wait_for_irs_pe(struct gicv5_irs_chip_data *irs_data,
+				     bool selr)
+{
+	bool valid = true;
+	u32 statusr;
+	int ret;
+
+	ret = gicv5_wait_for_op_atomic(irs_data->irs_base, GICV5_IRS_PE_STATUSR,
+				       GICV5_IRS_PE_STATUSR_IDLE, &statusr);
+	if (ret)
+		return ret;
+
+	if (selr)
+		valid = !!FIELD_GET(GICV5_IRS_PE_STATUSR_V, statusr);
+
+	return valid ? 0 : -EIO;
+}
+
+static int gicv5_irs_wait_for_pe_selr(struct gicv5_irs_chip_data *irs_data)
+{
+	return gicv5_irs_wait_for_irs_pe(irs_data, true);
+}
+
+static int gicv5_irs_wait_for_pe_cr0(struct gicv5_irs_chip_data *irs_data)
+{
+	return gicv5_irs_wait_for_irs_pe(irs_data, false);
+}
+
+int gicv5_spi_irq_set_type(struct irq_data *d, unsigned int type)
+{
+	struct gicv5_irs_chip_data *irs_data = d->chip_data;
+	u32 selr, cfgr;
+	bool level;
+	int ret;
+
+	/*
+	 * There is no distinction between HIGH/LOW for level IRQs
+	 * and RISING/FALLING for edge IRQs in the architecture,
+	 * hence consider them equivalent.
+	 */
+	switch (type) {
+	case IRQ_TYPE_EDGE_RISING:
+	case IRQ_TYPE_EDGE_FALLING:
+		level = false;
+		break;
+	case IRQ_TYPE_LEVEL_HIGH:
+	case IRQ_TYPE_LEVEL_LOW:
+		level = true;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	guard(raw_spinlock)(&irs_data->spi_config_lock);
+
+	selr = FIELD_PREP(GICV5_IRS_SPI_SELR_ID, d->hwirq);
+	irs_writel_relaxed(irs_data, selr, GICV5_IRS_SPI_SELR);
+	ret = gicv5_irs_wait_for_spi_op(irs_data);
+	if (ret)
+		return ret;
+
+	cfgr = FIELD_PREP(GICV5_IRS_SPI_CFGR_TM, level);
+	irs_writel_relaxed(irs_data, cfgr, GICV5_IRS_SPI_CFGR);
+
+	return gicv5_irs_wait_for_spi_op(irs_data);
+}
+
+static int gicv5_irs_wait_for_idle(struct gicv5_irs_chip_data *irs_data)
+{
+	return gicv5_wait_for_op_atomic(irs_data->irs_base, GICV5_IRS_CR0,
+					GICV5_IRS_CR0_IDLE, NULL);
+}
+
+/* Select the PE matching @cpuid in its IRS and program IRS_PE_CR0 for it. */
+int gicv5_irs_register_cpu(int cpuid)
+{
+	struct gicv5_irs_chip_data *irs_data;
+	u32 selr, cr0;
+	u16 iaffid;
+	int ret;
+
+	/* gicv5_irs_cpu_to_iaffid() logs the failure, do not report it twice */
+	ret = gicv5_irs_cpu_to_iaffid(cpuid, &iaffid);
+	if (ret)
+		return ret;
+
+	irs_data = per_cpu(per_cpu_irs_data, cpuid);
+	if (!irs_data) {
+		pr_err("No IRS associated with CPU %d\n", cpuid);
+		return -ENXIO;
+	}
+
+	selr = FIELD_PREP(GICV5_IRS_PE_SELR_IAFFID, iaffid);
+	irs_writel_relaxed(irs_data, selr, GICV5_IRS_PE_SELR);
+
+	ret = gicv5_irs_wait_for_pe_selr(irs_data);
+	if (ret) {
+		pr_err("IAFFID 0x%x used in IRS_PE_SELR is invalid\n", iaffid);
+		return -ENXIO;
+	}
+
+	cr0 = FIELD_PREP(GICV5_IRS_PE_CR0_DPS, 0x1);
+	irs_writel_relaxed(irs_data, cr0, GICV5_IRS_PE_CR0);
+
+	ret = gicv5_irs_wait_for_pe_cr0(irs_data);
+	if (ret)
+		return ret;
+
+	pr_debug("CPU %d enabled PE IAFFID 0x%x\n", cpuid, iaffid);
+
+	return 0;
+}
+
+static void __init gicv5_irs_init_bases(struct gicv5_irs_chip_data *irs_data,
+					void __iomem *irs_base,
+					struct fwnode_handle *handle)
+{
+	struct device_node *np = to_of_node(handle);
+	u32 cr0, cr1;
+
+	irs_data->fwnode = handle;
+	irs_data->irs_base = irs_base;
+
+	if (of_property_read_bool(np, "dma-noncoherent")) {
+		/*
+		 * A non-coherent IRS implies that some cache levels cannot be
+		 * used coherently by the cores and GIC. Our only option is to mark
+		 * memory attributes for the GIC as non-cacheable; by default,
+		 * non-cacheable memory attributes imply outer-shareable
+		 * shareability, the value written into IRS_CR1_SH is ignored.
+		 */
+		cr1 = FIELD_PREP(GICV5_IRS_CR1_VPED_WA, GICV5_NO_WRITE_ALLOC)	|
+			FIELD_PREP(GICV5_IRS_CR1_VPED_RA, GICV5_NO_READ_ALLOC)	|
+			FIELD_PREP(GICV5_IRS_CR1_VMD_WA, GICV5_NO_WRITE_ALLOC)	|
+			FIELD_PREP(GICV5_IRS_CR1_VMD_RA, GICV5_NO_READ_ALLOC)	|
+			FIELD_PREP(GICV5_IRS_CR1_VPET_RA, GICV5_NO_READ_ALLOC)	|
+			FIELD_PREP(GICV5_IRS_CR1_VMT_RA, GICV5_NO_READ_ALLOC)	|
+			FIELD_PREP(GICV5_IRS_CR1_IST_WA, GICV5_NO_WRITE_ALLOC)	|
+			FIELD_PREP(GICV5_IRS_CR1_IST_RA, GICV5_NO_READ_ALLOC)	|
+			FIELD_PREP(GICV5_IRS_CR1_IC, GICV5_NON_CACHE)		|
+			FIELD_PREP(GICV5_IRS_CR1_OC, GICV5_NON_CACHE);
+		irs_data->flags |= IRS_FLAGS_NON_COHERENT;
+	} else {
+		cr1 = FIELD_PREP(GICV5_IRS_CR1_VPED_WA, GICV5_WRITE_ALLOC)	|
+			FIELD_PREP(GICV5_IRS_CR1_VPED_RA, GICV5_READ_ALLOC)	|
+			FIELD_PREP(GICV5_IRS_CR1_VMD_WA, GICV5_WRITE_ALLOC)	|
+			FIELD_PREP(GICV5_IRS_CR1_VMD_RA, GICV5_READ_ALLOC)	|
+			FIELD_PREP(GICV5_IRS_CR1_VPET_RA, GICV5_READ_ALLOC)	|
+			FIELD_PREP(GICV5_IRS_CR1_VMT_RA, GICV5_READ_ALLOC)	|
+			FIELD_PREP(GICV5_IRS_CR1_IST_WA, GICV5_WRITE_ALLOC)	|
+			FIELD_PREP(GICV5_IRS_CR1_IST_RA, GICV5_READ_ALLOC)	|
+			FIELD_PREP(GICV5_IRS_CR1_IC, GICV5_WB_CACHE)		|
+			FIELD_PREP(GICV5_IRS_CR1_OC, GICV5_WB_CACHE)		|
+			FIELD_PREP(GICV5_IRS_CR1_SH, GICV5_INNER_SHARE);
+	}
+
+	irs_writel_relaxed(irs_data, cr1, GICV5_IRS_CR1);
+
+	cr0 = FIELD_PREP(GICV5_IRS_CR0_IRSEN, 0x1);
+	irs_writel_relaxed(irs_data, cr0, GICV5_IRS_CR0);
+	gicv5_irs_wait_for_idle(irs_data);
+}
+
+static int __init gicv5_irs_of_init_affinity(struct device_node *node,
+					     struct gicv5_irs_chip_data *irs_data,
+					     u8 iaffid_bits)
+{
+	/*
+	 * Detect IAFFID<->CPU mappings from the device tree and
+	 * record IRS<->CPU topology information.
+	 */
+	u16 iaffid_mask = GENMASK(iaffid_bits - 1, 0);
+	int ret, i, ncpus, niaffids;
+
+	ncpus = of_count_phandle_with_args(node, "cpus", NULL);
+	if (ncpus < 0)
+		return -EINVAL;
+
+	niaffids = of_property_count_elems_of_size(node, "arm,iaffids",
+						   sizeof(u16));
+	if (niaffids != ncpus)
+		return -EINVAL;
+
+	u16 *iaffids __free(kfree) = kcalloc(niaffids, sizeof(*iaffids), GFP_KERNEL);
+	if (!iaffids)
+		return -ENOMEM;
+
+	ret = of_property_read_u16_array(node, "arm,iaffids", iaffids, niaffids);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < ncpus; i++) {
+		struct device_node *cpu_node;
+		int cpu;
+
+		cpu_node = of_parse_phandle(node, "cpus", i);
+		if (WARN_ON(!cpu_node))
+			continue;
+
+		cpu = of_cpu_node_to_id(cpu_node);
+		of_node_put(cpu_node);
+		if (WARN_ON(cpu < 0))
+			continue;
+
+		if (iaffids[i] & ~iaffid_mask) {
+			pr_warn("CPU %d iaffid 0x%x exceeds IRS iaffid bits\n",
+				cpu, iaffids[i]);
+			continue;
+		}
+
+		per_cpu(cpu_iaffid, cpu).iaffid = iaffids[i];
+		per_cpu(cpu_iaffid, cpu).valid = true;
+
+		/* We also know that the CPU is connected to this IRS */
+		per_cpu(per_cpu_irs_data, cpu) = irs_data;
+	}
+
+	return ret;
+}
+
+static void irs_setup_pri_bits(u32 idr1)
+{
+	switch (FIELD_GET(GICV5_IRS_IDR1_PRIORITY_BITS, idr1)) {
+	case GICV5_IRS_IDR1_PRIORITY_BITS_1BITS:
+		gicv5_global_data.irs_pri_bits = 1;
+		break;
+	case GICV5_IRS_IDR1_PRIORITY_BITS_2BITS:
+		gicv5_global_data.irs_pri_bits = 2;
+		break;
+	case GICV5_IRS_IDR1_PRIORITY_BITS_3BITS:
+		gicv5_global_data.irs_pri_bits = 3;
+		break;
+	case GICV5_IRS_IDR1_PRIORITY_BITS_4BITS:
+		gicv5_global_data.irs_pri_bits = 4;
+		break;
+	case GICV5_IRS_IDR1_PRIORITY_BITS_5BITS:
+		gicv5_global_data.irs_pri_bits = 5;
+		break;
+	default:
+		pr_warn("Detected wrong IDR priority bits value 0x%lx\n",
+			FIELD_GET(GICV5_IRS_IDR1_PRIORITY_BITS, idr1));
+		gicv5_global_data.irs_pri_bits = 1;
+		break;
+	}
+}
+
+static int __init gicv5_irs_init(struct device_node *node)
+{
+	struct gicv5_irs_chip_data *irs_data;
+	void __iomem *irs_base;
+	u32 idr, spi_count;
+	u8 iaffid_bits;
+	int ret;
+
+	irs_data = kzalloc(sizeof(*irs_data), GFP_KERNEL);
+	if (!irs_data)
+		return -ENOMEM;
+
+	raw_spin_lock_init(&irs_data->spi_config_lock);
+
+	ret = of_property_match_string(node, "reg-names", "ns-config");
+	if (ret < 0) {
+		pr_err("%pOF: ns-config reg-name not present\n", node);
+		goto out_err;
+	}
+
+	irs_base = of_io_request_and_map(node, ret, of_node_full_name(node));
+	if (IS_ERR(irs_base)) {
+		pr_err("%pOF: unable to map GICv5 IRS registers\n", node);
+		ret = PTR_ERR(irs_base);
+		goto out_err;
+	}
+
+	gicv5_irs_init_bases(irs_data, irs_base, &node->fwnode);
+
+	idr = irs_readl_relaxed(irs_data, GICV5_IRS_IDR1);
+	iaffid_bits = FIELD_GET(GICV5_IRS_IDR1_IAFFID_BITS, idr) + 1;
+
+	ret = gicv5_irs_of_init_affinity(node, irs_data, iaffid_bits);
+	if (ret) {
+		pr_err("Failed to parse CPU IAFFIDs from the device tree!\n");
+		goto out_iomem;
+	}
+
+	idr = irs_readl_relaxed(irs_data, GICV5_IRS_IDR7);
+	irs_data->spi_min = FIELD_GET(GICV5_IRS_IDR7_SPI_BASE, idr);
+
+	idr = irs_readl_relaxed(irs_data, GICV5_IRS_IDR6);
+	irs_data->spi_range = FIELD_GET(GICV5_IRS_IDR6_SPI_IRS_RANGE, idr);
+
+	if (irs_data->spi_range) {
+		pr_info("%s detected SPI range [%u-%u]\n",
+						of_node_full_name(node),
+						irs_data->spi_min,
+						irs_data->spi_min +
+						irs_data->spi_range - 1);
+	}
+
+	/*
+	 * Do the global setting only on the first IRS.
+	 * Global properties (iaffid_bits, global spi count) are guaranteed to
+	 * be consistent across IRSes by the architecture.
+	 */
+	if (list_empty(&irs_nodes)) {
+
+		idr = irs_readl_relaxed(irs_data, GICV5_IRS_IDR1);
+		irs_setup_pri_bits(idr);
+
+		idr = irs_readl_relaxed(irs_data, GICV5_IRS_IDR5);
+
+		spi_count = FIELD_GET(GICV5_IRS_IDR5_SPI_RANGE, idr);
+		gicv5_global_data.global_spi_count = spi_count;
+
+		pr_debug("Detected %u SPIs globally\n", spi_count);
+	}
+
+	list_add_tail(&irs_data->entry, &irs_nodes);
+
+	return 0;
+
+out_iomem:
+	iounmap(irs_base);
+out_err:
+	kfree(irs_data);
+	return ret;
+}
+
+void __init gicv5_irs_remove(void)
+{
+	struct gicv5_irs_chip_data *irs_data, *tmp_data;
+
+	list_for_each_entry_safe(irs_data, tmp_data, &irs_nodes, entry) {
+		iounmap(irs_data->irs_base);
+		list_del(&irs_data->entry);
+		kfree(irs_data);
+	}
+}
+
+int __init gicv5_irs_of_probe(struct device_node *parent)
+{
+	struct device_node *np;
+	int ret;
+
+	for_each_available_child_of_node(parent, np) {
+		if (!of_device_is_compatible(np, "arm,gic-v5-irs"))
+			continue;
+
+		ret = gicv5_irs_init(np);
+		if (ret)
+			pr_err("Failed to init IRS %pOF\n", np);
+	}
+
+	return list_empty(&irs_nodes) ? -ENODEV : 0;
+}
diff --git a/drivers/irqchip/irq-gic-v5.c b/drivers/irqchip/irq-gic-v5.c
index 0bb940212e20..9c55ddcfa0df 100644
--- a/drivers/irqchip/irq-gic-v5.c
+++ b/drivers/irqchip/irq-gic-v5.c
@@ -26,12 +26,7 @@ static bool gicv5_cpuif_has_gcie(void)
 	return this_cpu_has_cap(ARM64_HAS_GICV5_CPUIF);
 }
 
-struct gicv5_chip_data {
-	struct fwnode_handle	*fwnode;
-	struct irq_domain	*ppi_domain;
-};
-
-static struct gicv5_chip_data gicv5_global_data __read_mostly;
+struct gicv5_chip_data gicv5_global_data __read_mostly;
 
 static void gicv5_ppi_priority_init(void)
 {
@@ -59,6 +54,30 @@ static void gicv5_ppi_priority_init(void)
 	isb();
 }
 
+static void gicv5_hwirq_init(irq_hw_number_t hwirq, u8 priority, u8 hwirq_type)
+{
+	u64 cdpri, cdaff;
+	u16 iaffid;
+	int ret;
+
+	if (hwirq_type == GICV5_HWIRQ_TYPE_SPI) {
+		cdpri = FIELD_PREP(GICV5_GIC_CDPRI_PRIORITY_MASK, priority)	|
+			FIELD_PREP(GICV5_GIC_CDPRI_TYPE_MASK, hwirq_type)	|
+			FIELD_PREP(GICV5_GIC_CDPRI_ID_MASK, hwirq);
+		gic_insn(cdpri, CDPRI);
+
+		ret = gicv5_irs_cpu_to_iaffid(smp_processor_id(), &iaffid);
+
+		if (WARN_ON_ONCE(ret))
+			return;
+
+		cdaff = FIELD_PREP(GICV5_GIC_CDAFF_IAFFID_MASK, iaffid)		|
+			FIELD_PREP(GICV5_GIC_CDAFF_TYPE_MASK, hwirq_type)	|
+			FIELD_PREP(GICV5_GIC_CDAFF_ID_MASK, hwirq);
+		gic_insn(cdaff, CDAFF);
+	}
+}
+
 static void gicv5_ppi_irq_mask(struct irq_data *d)
 {
 	u64 hwirq_id_bit = BIT_ULL(d->hwirq % 64);
@@ -77,6 +96,32 @@ static void gicv5_ppi_irq_mask(struct irq_data *d)
 	isb();
 }
 
+static void gicv5_iri_irq_mask(struct irq_data *d, u8 hwirq_type)
+{
+	u64 cddis;
+
+	cddis = FIELD_PREP(GICV5_GIC_CDDIS_ID_MASK, d->hwirq)	|
+		FIELD_PREP(GICV5_GIC_CDDIS_TYPE_MASK, hwirq_type);
+
+	gic_insn(cddis, CDDIS);
+	/*
+	 * The effects of the GIC CDDIS write must be propagated immediately,
+	 * so that the disable has taken effect and the lazy-disabled IRQ
+	 * mechanism works.
+	 * Rule R_XCLJC states that the effects of a GIC system instruction
+	 * complete in finite time.
+	 * The GSB ensures completion of the GIC instruction and prevents
+	 * loads, stores and GIC instructions from executing part of their
+	 * functionality before the GSB SYS.
+	 */
+	gsb_sys();
+}
+
+static void gicv5_spi_irq_mask(struct irq_data *d)
+{
+	gicv5_iri_irq_mask(d, GICV5_HWIRQ_TYPE_SPI);
+}
+
 static void gicv5_ppi_irq_unmask(struct irq_data *d)
 {
 	u64 hwirq_id_bit = BIT_ULL(d->hwirq % 64);
@@ -95,6 +140,25 @@ static void gicv5_ppi_irq_unmask(struct irq_data *d)
 	isb();
 }
 
+static void gicv5_iri_irq_unmask(struct irq_data *d, u8 hwirq_type)
+{
+	u64 cden;
+
+	cden = FIELD_PREP(GICV5_GIC_CDEN_ID_MASK, d->hwirq)	|
+	       FIELD_PREP(GICV5_GIC_CDEN_TYPE_MASK, hwirq_type);
+	/*
+	 * Rule R_XCLJC states that the effects of a GIC system instruction
+	 * complete in finite time and that's the only requirement when
+	 * unmasking an SPI IRQ.
+	 */
+	gic_insn(cden, CDEN);
+}
+
+static void gicv5_spi_irq_unmask(struct irq_data *d)
+{
+	gicv5_iri_irq_unmask(d, GICV5_HWIRQ_TYPE_SPI);
+}
+
 static void gicv5_hwirq_eoi(u32 hwirq_id, u8 hwirq_type)
 {
 	u64 cddi;
@@ -112,6 +176,46 @@ static void gicv5_ppi_irq_eoi(struct irq_data *d)
 	gicv5_hwirq_eoi(d->hwirq, GICV5_HWIRQ_TYPE_PPI);
 }
 
+static void gicv5_spi_irq_eoi(struct irq_data *d)
+{
+	gicv5_hwirq_eoi(d->hwirq, GICV5_HWIRQ_TYPE_SPI);
+}
+
+static int gicv5_iri_irq_set_affinity(struct irq_data *d,
+				      const struct cpumask *mask_val,
+				      bool force, u8 hwirq_type)
+{
+	int ret, cpuid;
+	u16 iaffid;
+	u64 cdaff;
+
+	cpuid = force ? cpumask_first(mask_val) :
+			cpumask_any_and(mask_val, cpu_online_mask);
+	/* Both helpers return >= nr_cpu_ids when no CPU matches */
+	if (cpuid >= nr_cpu_ids)
+		return -EINVAL;
+
+	ret = gicv5_irs_cpu_to_iaffid(cpuid, &iaffid);
+	if (ret)
+		return ret;
+
+	cdaff = FIELD_PREP(GICV5_GIC_CDAFF_IAFFID_MASK, iaffid)		|
+		FIELD_PREP(GICV5_GIC_CDAFF_TYPE_MASK, hwirq_type)	|
+		FIELD_PREP(GICV5_GIC_CDAFF_ID_MASK, d->hwirq);
+	gic_insn(cdaff, CDAFF);
+	irq_data_update_effective_affinity(d, cpumask_of(cpuid));
+
+	return IRQ_SET_MASK_OK_DONE;
+}
+
+static int gicv5_spi_irq_set_affinity(struct irq_data *d,
+				      const struct cpumask *mask_val,
+				      bool force)
+{
+	return gicv5_iri_irq_set_affinity(d, mask_val, force,
+					  GICV5_HWIRQ_TYPE_SPI);
+}
+
 enum ppi_reg {
 	PPI_PENDING,
 	PPI_ACTIVE,
@@ -192,6 +296,46 @@ static int gicv5_ppi_irq_get_irqchip_state(struct irq_data *d,
 	}
 }
 
+static int gicv5_iri_irq_get_irqchip_state(struct irq_data *d,
+					   enum irqchip_irq_state which,
+					   bool *state, u8 hwirq_type)
+{
+	u64 icsr, cdrcfg;
+
+	cdrcfg = FIELD_PREP(GICV5_GIC_CDRCFG_ID_MASK, d->hwirq)	|
+		 FIELD_PREP(GICV5_GIC_CDRCFG_TYPE_MASK, hwirq_type);
+	gic_insn(cdrcfg, CDRCFG);
+	isb();
+	icsr = read_sysreg_s(SYS_ICC_ICSR_EL1);
+
+	if (FIELD_GET(ICC_ICSR_EL1_F, icsr)) {
+		pr_err("ICSR_EL1 is invalid\n");
+		return -EINVAL;
+	}
+
+	switch (which) {
+	case IRQCHIP_STATE_PENDING:
+		*state = !!(FIELD_GET(ICC_ICSR_EL1_Pending, icsr));
+		return 0;
+
+	case IRQCHIP_STATE_ACTIVE:
+		*state = !!(FIELD_GET(ICC_ICSR_EL1_Active, icsr));
+		return 0;
+
+	default:
+		pr_debug("Unexpected irqchip_irq_state\n");
+		return -EINVAL;
+	}
+}
+
+static int gicv5_spi_irq_get_irqchip_state(struct irq_data *d,
+					   enum irqchip_irq_state which,
+					   bool *state)
+{
+	return gicv5_iri_irq_get_irqchip_state(d, which, state,
+					       GICV5_HWIRQ_TYPE_SPI);
+}
+
 static int gicv5_ppi_irq_set_irqchip_state(struct irq_data *d,
 					   enum irqchip_irq_state which,
 					   bool state)
@@ -209,6 +353,45 @@ static int gicv5_ppi_irq_set_irqchip_state(struct irq_data *d,
 	}
 }
 
+static void gicv5_iri_irq_write_pending_state(struct irq_data *d, bool state,
+					      u8 hwirq_type)
+{
+	u64 cdpend;
+
+	cdpend = FIELD_PREP(GICV5_GIC_CDPEND_TYPE_MASK, hwirq_type)	|
+		 FIELD_PREP(GICV5_GIC_CDPEND_ID_MASK, d->hwirq)		|
+		 FIELD_PREP(GICV5_GIC_CDPEND_PENDING_MASK, state);
+
+	gic_insn(cdpend, CDPEND);
+}
+
+static void gicv5_spi_irq_write_pending_state(struct irq_data *d, bool state)
+{
+	gicv5_iri_irq_write_pending_state(d, state, GICV5_HWIRQ_TYPE_SPI);
+}
+
+static int gicv5_spi_irq_set_irqchip_state(struct irq_data *d,
+					   enum irqchip_irq_state which,
+					   bool state)
+{
+	switch (which) {
+	case IRQCHIP_STATE_PENDING:
+		gicv5_spi_irq_write_pending_state(d, state);
+		break;
+	default:
+		pr_debug("Unexpected irqchip_irq_state\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int gicv5_spi_irq_retrigger(struct irq_data *data)
+{
+	return !gicv5_spi_irq_set_irqchip_state(data, IRQCHIP_STATE_PENDING,
+						true);
+}
+
 static bool gicv5_ppi_irq_is_level(irq_hw_number_t hwirq)
 {
 	u64 bit = BIT_ULL(hwirq % 64);
@@ -227,10 +410,26 @@ static const struct irq_chip gicv5_ppi_irq_chip = {
 				  IRQCHIP_MASK_ON_SUSPEND,
 };
 
-static int gicv5_irq_ppi_domain_translate(struct irq_domain *d,
-					  struct irq_fwspec *fwspec,
-					  irq_hw_number_t *hwirq,
-					  unsigned int *type)
+static const struct irq_chip gicv5_spi_irq_chip = {
+	.name			= "GICv5-SPI",
+	.irq_mask		= gicv5_spi_irq_mask,
+	.irq_unmask		= gicv5_spi_irq_unmask,
+	.irq_eoi		= gicv5_spi_irq_eoi,
+	.irq_set_type		= gicv5_spi_irq_set_type,
+	.irq_set_affinity	= gicv5_spi_irq_set_affinity,
+	.irq_retrigger		= gicv5_spi_irq_retrigger,
+	.irq_get_irqchip_state	= gicv5_spi_irq_get_irqchip_state,
+	.irq_set_irqchip_state	= gicv5_spi_irq_set_irqchip_state,
+	.flags			= IRQCHIP_SET_TYPE_MASKED |
+				  IRQCHIP_SKIP_SET_WAKE	  |
+				  IRQCHIP_MASK_ON_SUSPEND,
+};
+
+static __always_inline int gicv5_irq_domain_translate(struct irq_domain *d,
+						      struct irq_fwspec *fwspec,
+						      irq_hw_number_t *hwirq,
+						      unsigned int *type,
+						      const u8 hwirq_type)
 {
 	if (!is_of_node(fwspec->fwnode))
 		return -EINVAL;
@@ -238,20 +437,39 @@ static int gicv5_irq_ppi_domain_translate(struct irq_domain *d,
 	if (fwspec->param_count < 3)
 		return -EINVAL;
 
-	if (fwspec->param[0] != GICV5_HWIRQ_TYPE_PPI)
+	if (fwspec->param[0] != hwirq_type)
 		return -EINVAL;
 
 	*hwirq = fwspec->param[1];
 
-	/*
-	 * Handling mode is hardcoded for PPIs, set the type using
-	 * HW reported value.
-	 */
-	*type = gicv5_ppi_irq_is_level(*hwirq) ? IRQ_TYPE_LEVEL_LOW : IRQ_TYPE_EDGE_RISING;
+	switch (hwirq_type) {
+	case GICV5_HWIRQ_TYPE_PPI:
+		/*
+		 * Handling mode is hardcoded for PPIs, set the type using
+		 * HW reported value.
+		 */
+		*type = gicv5_ppi_irq_is_level(*hwirq) ? IRQ_TYPE_LEVEL_LOW :
+							 IRQ_TYPE_EDGE_RISING;
+		break;
+	case GICV5_HWIRQ_TYPE_SPI:
+		*type = fwspec->param[2] & IRQ_TYPE_SENSE_MASK;
+		break;
+	default:
+		BUILD_BUG_ON(1);
+	}
 
 	return 0;
 }
 
+static int gicv5_irq_ppi_domain_translate(struct irq_domain *d,
+					  struct irq_fwspec *fwspec,
+					  irq_hw_number_t *hwirq,
+					  unsigned int *type)
+{
+	return gicv5_irq_domain_translate(d, fwspec, hwirq, type,
+					  GICV5_HWIRQ_TYPE_PPI);
+}
+
 static int gicv5_irq_ppi_domain_alloc(struct irq_domain *domain, unsigned int virq,
 				      unsigned int nr_irqs, void *arg)
 {
@@ -310,6 +528,63 @@ static const struct irq_domain_ops gicv5_irq_ppi_domain_ops = {
 	.select		= gicv5_irq_ppi_domain_select
 };
 
+static int gicv5_irq_spi_domain_translate(struct irq_domain *d,
+					  struct irq_fwspec *fwspec,
+					  irq_hw_number_t *hwirq,
+					  unsigned int *type)
+{
+	return gicv5_irq_domain_translate(d, fwspec, hwirq, type,
+					  GICV5_HWIRQ_TYPE_SPI);
+}
+
+static int gicv5_irq_spi_domain_alloc(struct irq_domain *domain, unsigned int virq,
+				      unsigned int nr_irqs, void *arg)
+{
+	struct gicv5_irs_chip_data *chip_data;
+	unsigned int type = IRQ_TYPE_NONE;
+	struct irq_fwspec *fwspec = arg;
+	struct irq_data *irqd;
+	irq_hw_number_t hwirq;
+	int ret;
+
+	if (WARN_ON_ONCE(nr_irqs != 1))
+		return -EINVAL;
+
+	ret = gicv5_irq_spi_domain_translate(domain, fwspec, &hwirq, &type);
+	if (ret)
+		return ret;
+
+	irqd = irq_desc_get_irq_data(irq_to_desc(virq));
+	chip_data = gicv5_irs_lookup_by_spi_id(hwirq);
+
+	irq_domain_set_info(domain, virq, hwirq, &gicv5_spi_irq_chip, chip_data,
+			    handle_fasteoi_irq, NULL, NULL);
+	irq_set_probe(virq);
+	irqd_set_single_target(irqd);
+
+	gicv5_hwirq_init(hwirq, GICV5_IRQ_PRI_MI, GICV5_HWIRQ_TYPE_SPI);
+
+	return 0;
+}
+
+static int gicv5_irq_spi_domain_select(struct irq_domain *d, struct irq_fwspec *fwspec,
+				       enum irq_domain_bus_token bus_token)
+{
+	if (fwspec->fwnode != d->fwnode)
+		return 0;
+
+	if (fwspec->param[0] != GICV5_HWIRQ_TYPE_SPI)
+		return 0;
+
+	return (d == gicv5_global_data.spi_domain);
+}
+
+static const struct irq_domain_ops gicv5_irq_spi_domain_ops = {
+	.translate	= gicv5_irq_spi_domain_translate,
+	.alloc		= gicv5_irq_spi_domain_alloc,
+	.free		= gicv5_irq_domain_free,
+	.select		= gicv5_irq_spi_domain_select
+};
 static void handle_irq_per_domain(u32 hwirq)
 {
 	u8 hwirq_type = FIELD_GET(GICV5_HWIRQ_TYPE, hwirq);
@@ -320,6 +595,9 @@ static void handle_irq_per_domain(u32 hwirq)
 	case GICV5_HWIRQ_TYPE_PPI:
 		domain = gicv5_global_data.ppi_domain;
 		break;
+	case GICV5_HWIRQ_TYPE_SPI:
+		domain = gicv5_global_data.spi_domain;
+		break;
 	default:
 		pr_err_once("Unknown IRQ type, bail out\n");
 		return;
@@ -392,19 +670,23 @@ static int gicv5_starting_cpu(unsigned int cpu)
 
 	gicv5_cpu_enable_interrupts();
 
-	return 0;
+	return gicv5_irs_register_cpu(cpu);
 }
 
 static void __init gicv5_free_domains(void)
 {
 	if (gicv5_global_data.ppi_domain)
 		irq_domain_remove(gicv5_global_data.ppi_domain);
+	if (gicv5_global_data.spi_domain)
+		irq_domain_remove(gicv5_global_data.spi_domain);
 
 	gicv5_global_data.ppi_domain = NULL;
+	gicv5_global_data.spi_domain = NULL;
 }
 
 static int __init gicv5_init_domains(struct fwnode_handle *handle)
 {
+	u32 spi_count = gicv5_global_data.global_spi_count;
 	struct irq_domain *d;
 
 	d = irq_domain_create_linear(handle, PPI_NR, &gicv5_irq_ppi_domain_ops, NULL);
@@ -413,6 +695,20 @@ static int __init gicv5_init_domains(struct fwnode_handle *handle)
 
 	irq_domain_update_bus_token(d, DOMAIN_BUS_WIRED);
 	gicv5_global_data.ppi_domain = d;
+
+	if (spi_count) {
+		d = irq_domain_create_linear(handle, spi_count,
+					     &gicv5_irq_spi_domain_ops, NULL);
+
+		if (!d) {
+			gicv5_free_domains();
+			return -ENOMEM;
+		}
+
+		gicv5_global_data.spi_domain = d;
+		irq_domain_update_bus_token(d, DOMAIN_BUS_WIRED);
+	}
+
 	gicv5_global_data.fwnode = handle;
 
 	return 0;
@@ -424,26 +720,33 @@ static void gicv5_set_cpuif_pribits(void)
 
 	switch (FIELD_GET(ICC_IDR0_EL1_PRI_BITS, icc_idr0)) {
 	case ICC_IDR0_EL1_PRI_BITS_4BITS:
-		pri_bits = 4;
+		gicv5_global_data.cpuif_pri_bits = 4;
 		break;
 	case ICC_IDR0_EL1_PRI_BITS_5BITS:
-		pri_bits = 5;
+		gicv5_global_data.cpuif_pri_bits = 5;
 		break;
 	default:
 		pr_err("Unexpected ICC_IDR0_EL1_PRI_BITS value, default to 4");
-		pri_bits = 4;
+		gicv5_global_data.cpuif_pri_bits = 4;
 		break;
 	}
 }
 
 static int __init gicv5_of_init(struct device_node *node, struct device_node *parent)
 {
-	int ret = gicv5_init_domains(of_fwnode_handle(node));
+	int ret = gicv5_irs_of_probe(node);
 	if (ret)
 		return ret;
 
+	ret = gicv5_init_domains(of_fwnode_handle(node));
+	if (ret)
+		goto out_irs;
+
 	gicv5_set_cpuif_pribits();
 
+	pri_bits = min_not_zero(gicv5_global_data.cpuif_pri_bits,
+				gicv5_global_data.irs_pri_bits);
+
 	ret = gicv5_starting_cpu(smp_processor_id());
 	if (ret)
 		goto out_dom;
@@ -458,6 +761,8 @@ out_int:
 	gicv5_cpu_disable_interrupts();
 out_dom:
 	gicv5_free_domains();
+out_irs:
+	gicv5_irs_remove();
 
 	return ret;
 }
diff --git a/include/linux/irqchip/arm-gic-v5.h b/include/linux/irqchip/arm-gic-v5.h
index b08ec0308a9b..1064a69ab33f 100644
--- a/include/linux/irqchip/arm-gic-v5.h
+++ b/include/linux/irqchip/arm-gic-v5.h
@@ -5,6 +5,8 @@
 #ifndef __LINUX_IRQCHIP_ARM_GIC_V5_H
 #define __LINUX_IRQCHIP_ARM_GIC_V5_H
 
+#include <linux/iopoll.h>
+
 #include <asm/sysreg.h>
 
 /*
@@ -15,5 +17,143 @@
 #define GICV5_HWIRQ_INTID		GENMASK_ULL(31, 0)
 
 #define GICV5_HWIRQ_TYPE_PPI		UL(0x1)
+#define GICV5_HWIRQ_TYPE_SPI		UL(0x3)
+
+/*
+ * Tables attributes
+ */
+#define GICV5_NO_READ_ALLOC		0b0
+#define GICV5_READ_ALLOC		0b1
+#define GICV5_NO_WRITE_ALLOC		0b0
+#define GICV5_WRITE_ALLOC		0b1
+
+#define GICV5_NON_CACHE			0b00
+#define GICV5_WB_CACHE			0b01
+#define GICV5_WT_CACHE			0b10
+
+#define GICV5_NON_SHARE			0b00
+#define GICV5_OUTER_SHARE		0b10
+#define GICV5_INNER_SHARE		0b11
+
+/*
+ * IRS registers
+ */
+#define GICV5_IRS_IDR1			0x0004
+#define GICV5_IRS_IDR2			0x0008
+#define GICV5_IRS_IDR5			0x0014
+#define GICV5_IRS_IDR6			0x0018
+#define GICV5_IRS_IDR7			0x001c
+#define GICV5_IRS_CR0			0x0080
+#define GICV5_IRS_CR1			0x0084
+#define GICV5_IRS_SPI_SELR		0x0108
+#define GICV5_IRS_SPI_CFGR		0x0114
+#define GICV5_IRS_SPI_STATUSR		0x0118
+#define GICV5_IRS_PE_SELR		0x0140
+#define GICV5_IRS_PE_STATUSR		0x0144
+#define GICV5_IRS_PE_CR0		0x0148
+
+#define GICV5_IRS_IDR1_PRIORITY_BITS	GENMASK(22, 20)
+#define GICV5_IRS_IDR1_IAFFID_BITS	GENMASK(19, 16)
+
+#define GICV5_IRS_IDR1_PRIORITY_BITS_1BITS	0b000
+#define GICV5_IRS_IDR1_PRIORITY_BITS_2BITS	0b001
+#define GICV5_IRS_IDR1_PRIORITY_BITS_3BITS	0b010
+#define GICV5_IRS_IDR1_PRIORITY_BITS_4BITS	0b011
+#define GICV5_IRS_IDR1_PRIORITY_BITS_5BITS	0b100
+
+#define GICV5_IRS_IDR2_ISTMD_SZ		GENMASK(19, 15)
+#define GICV5_IRS_IDR2_ISTMD		BIT(14)
+#define GICV5_IRS_IDR2_IST_L2SZ		GENMASK(13, 11)
+#define GICV5_IRS_IDR2_IST_LEVELS	BIT(10)
+#define GICV5_IRS_IDR2_MIN_LPI_ID_BITS	GENMASK(9, 6)
+#define GICV5_IRS_IDR2_LPI		BIT(5)
+#define GICV5_IRS_IDR2_ID_BITS		GENMASK(4, 0)
+
+#define GICV5_IRS_IDR5_SPI_RANGE	GENMASK(24, 0)
+#define GICV5_IRS_IDR6_SPI_IRS_RANGE	GENMASK(24, 0)
+#define GICV5_IRS_IDR7_SPI_BASE		GENMASK(23, 0)
+#define GICV5_IRS_CR0_IDLE		BIT(1)
+#define GICV5_IRS_CR0_IRSEN		BIT(0)
+
+#define GICV5_IRS_CR1_VPED_WA		BIT(15)
+#define GICV5_IRS_CR1_VPED_RA		BIT(14)
+#define GICV5_IRS_CR1_VMD_WA		BIT(13)
+#define GICV5_IRS_CR1_VMD_RA		BIT(12)
+#define GICV5_IRS_CR1_VPET_WA		BIT(11)
+#define GICV5_IRS_CR1_VPET_RA		BIT(10)
+#define GICV5_IRS_CR1_VMT_WA		BIT(9)
+#define GICV5_IRS_CR1_VMT_RA		BIT(8)
+#define GICV5_IRS_CR1_IST_WA		BIT(7)
+#define GICV5_IRS_CR1_IST_RA		BIT(6)
+#define GICV5_IRS_CR1_IC		GENMASK(5, 4)
+#define GICV5_IRS_CR1_OC		GENMASK(3, 2)
+#define GICV5_IRS_CR1_SH		GENMASK(1, 0)
+
+#define GICV5_IRS_SPI_STATUSR_V		BIT(1)
+#define GICV5_IRS_SPI_STATUSR_IDLE	BIT(0)
+
+#define GICV5_IRS_SPI_SELR_ID		GENMASK(23, 0)
+
+#define GICV5_IRS_SPI_CFGR_TM		BIT(0)
+
+#define GICV5_IRS_PE_SELR_IAFFID	GENMASK(15, 0)
+
+#define GICV5_IRS_PE_STATUSR_V		BIT(1)
+#define GICV5_IRS_PE_STATUSR_IDLE	BIT(0)
+
+#define GICV5_IRS_PE_CR0_DPS		BIT(0)
+
+/*
+ * Global Data structures and functions
+ */
+struct gicv5_chip_data {
+	struct fwnode_handle	*fwnode;
+	struct irq_domain	*ppi_domain;
+	struct irq_domain	*spi_domain;
+	u32			global_spi_count;
+	u8			cpuif_pri_bits;
+	u8			irs_pri_bits;
+};
+
+extern struct gicv5_chip_data gicv5_global_data __read_mostly;
+
+struct gicv5_irs_chip_data {
+	struct list_head	entry;
+	struct fwnode_handle	*fwnode;
+	void __iomem		*irs_base;
+	u32			flags;
+	u32			spi_min;
+	u32			spi_range;
+	raw_spinlock_t		spi_config_lock;
+};
+
+static inline int gicv5_wait_for_op_s_atomic(void __iomem *addr, u32 offset,
+					     const char *reg_s, u32 mask,
+					     u32 *val)
+{
+	void __iomem *reg = addr + offset;
+	u32 tmp;
+	int ret;
+
+	ret = readl_poll_timeout_atomic(reg, tmp, tmp & mask, 1, 10 * USEC_PER_MSEC);
+	if (unlikely(ret == -ETIMEDOUT)) {
+		pr_err_ratelimited("%s timeout...\n", reg_s);
+		return ret;
+	}
+
+	if (val)
+		*val = tmp;
+
+	return 0;
+}
+
+#define gicv5_wait_for_op_atomic(base, reg, mask, val) \
+	gicv5_wait_for_op_s_atomic(base, reg, #reg, mask, val)
 
+int gicv5_irs_of_probe(struct device_node *parent);
+void gicv5_irs_remove(void);
+int gicv5_irs_register_cpu(int cpuid);
+int gicv5_irs_cpu_to_iaffid(int cpu_id, u16 *iaffid);
+struct gicv5_irs_chip_data *gicv5_irs_lookup_by_spi_id(u32 spi_id);
+int gicv5_spi_irq_set_type(struct irq_data *d, unsigned int type);
 #endif
-- 
cgit v1.2.3


From 0f0101325876859eb463792d4df3fd22b6539d29 Mon Sep 17 00:00:00 2001
From: Lorenzo Pieralisi <lpieralisi@kernel.org>
Date: Thu, 3 Jul 2025 12:25:12 +0200
Subject: irqchip/gic-v5: Add GICv5 LPI/IPI support

An IRS supports Logical Peripheral Interrupts (LPIs); implement
Linux IPIs on top of them.

LPIs are used for interrupt signals that are translated by a
GICv5 ITS (Interrupt Translation Service) but also for software
generated IRQs - namely interrupts that are not driven by a HW
signal, i.e. IPIs.

LPIs rely on memory storage for interrupt routing and state.

LPI state and routing information is kept in the Interrupt
State Table (IST).

IRSes provide support for 1- or 2-level IST tables configured
to support a maximum number of interrupts that depends on the
OS configuration and the HW capabilities.

On systems that provide 2-level IST support, always allow
the maximum number of LPIs; on systems with only 1-level
support, limit the number of LPIs to 2^12 to prevent
wasting memory (presumably a system that supports a 1-level
only IST is not expecting a large number of interrupts).

On a 2-level IST system, L2 entries are allocated on
demand.

The IST table memory is allocated using the kmalloc() interface;
the allocation required may be smaller than a page and must be
made up of contiguous physical pages if larger than a page.

On systems where the IRS is not cache-coherent with the CPUs,
cache maintenance operations are executed to clean and
invalidate the allocated memory to the point of coherency
making it visible to the IRS components.

On GICv5 systems, IPIs are implemented using LPIs.

Add an LPI IRQ domain and implement an IPI-specific IRQ domain created
as a child/subdomain of the LPI domain to allocate the required number
of LPIs needed to implement the IPIs.

IPIs are backed by LPIs; add LPI allocation/de-allocation
functions.

The LPI INTID namespace is managed using an IDA to alloc/free LPI INTIDs.

Associate an IPI irqchip with IPI IRQ descriptors to provide
core code with the irqchip.ipi_send_single() method required
to raise an IPI.

Co-developed-by: Sascha Bischoff <sascha.bischoff@arm.com>
Signed-off-by: Sascha Bischoff <sascha.bischoff@arm.com>
Co-developed-by: Timothy Hayes <timothy.hayes@arm.com>
Signed-off-by: Timothy Hayes <timothy.hayes@arm.com>
Signed-off-by: Lorenzo Pieralisi <lpieralisi@kernel.org>
Reviewed-by: Marc Zyngier <maz@kernel.org>
Cc: Will Deacon <will@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Marc Zyngier <maz@kernel.org>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Link: https://lore.kernel.org/r/20250703-gicv5-host-v7-22-12e71f1b3528@kernel.org
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/include/asm/smp.h       |  17 ++
 arch/arm64/include/asm/sysreg.h    |   6 +
 arch/arm64/kernel/smp.c            |  17 --
 drivers/irqchip/irq-gic-v5-irs.c   | 364 +++++++++++++++++++++++++++++++++++++
 drivers/irqchip/irq-gic-v5.c       | 299 +++++++++++++++++++++++++++++-
 include/linux/irqchip/arm-gic-v5.h |  63 ++++++-
 6 files changed, 746 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
index d6fd6efb66a6..d48ef6d5abcc 100644
--- a/arch/arm64/include/asm/smp.h
+++ b/arch/arm64/include/asm/smp.h
@@ -50,6 +50,23 @@ struct seq_file;
  */
 extern void smp_init_cpus(void);
 
+enum ipi_msg_type {
+	IPI_RESCHEDULE,
+	IPI_CALL_FUNC,
+	IPI_CPU_STOP,
+	IPI_CPU_STOP_NMI,
+	IPI_TIMER,
+	IPI_IRQ_WORK,
+	NR_IPI,
+	/*
+	 * Any enum >= NR_IPI and < MAX_IPI is special and not tracable
+	 * with trace_ipi_*
+	 */
+	IPI_CPU_BACKTRACE = NR_IPI,
+	IPI_KGDB_ROUNDUP,
+	MAX_IPI
+};
+
 /*
  * Register IPI interrupts with the arch SMP code
  */
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index efd2e7a1fbe2..948007cd3684 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -1088,6 +1088,7 @@
 #define GICV5_OP_GIC_CDAFF		sys_insn(1, 0, 12, 1, 3)
 #define GICV5_OP_GIC_CDDI		sys_insn(1, 0, 12, 2, 0)
 #define GICV5_OP_GIC_CDDIS		sys_insn(1, 0, 12, 1, 0)
+#define GICV5_OP_GIC_CDHM		sys_insn(1, 0, 12, 2, 1)
 #define GICV5_OP_GIC_CDEN		sys_insn(1, 0, 12, 1, 1)
 #define GICV5_OP_GIC_CDEOI		sys_insn(1, 0, 12, 1, 7)
 #define GICV5_OP_GIC_CDPEND		sys_insn(1, 0, 12, 1, 4)
@@ -1115,6 +1116,11 @@
 #define GICV5_GIC_CDEN_TYPE_MASK	GENMASK_ULL(31, 29)
 #define GICV5_GIC_CDEN_ID_MASK		GENMASK_ULL(23, 0)
 
+/* Definitions for GIC CDHM */
+#define GICV5_GIC_CDHM_HM_MASK		BIT_ULL(32)
+#define GICV5_GIC_CDHM_TYPE_MASK	GENMASK_ULL(31, 29)
+#define GICV5_GIC_CDHM_ID_MASK		GENMASK_ULL(23, 0)
+
 /* Definitions for GIC CDPEND */
 #define GICV5_GIC_CDPEND_PENDING_MASK	BIT_ULL(32)
 #define GICV5_GIC_CDPEND_TYPE_MASK	GENMASK_ULL(31, 29)
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 2c501e917d38..4797e2c70014 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -64,23 +64,6 @@ struct secondary_data secondary_data;
 /* Number of CPUs which aren't online, but looping in kernel text. */
 static int cpus_stuck_in_kernel;
 
-enum ipi_msg_type {
-	IPI_RESCHEDULE,
-	IPI_CALL_FUNC,
-	IPI_CPU_STOP,
-	IPI_CPU_STOP_NMI,
-	IPI_TIMER,
-	IPI_IRQ_WORK,
-	NR_IPI,
-	/*
-	 * Any enum >= NR_IPI and < MAX_IPI is special and not tracable
-	 * with trace_ipi_*
-	 */
-	IPI_CPU_BACKTRACE = NR_IPI,
-	IPI_KGDB_ROUNDUP,
-	MAX_IPI
-};
-
 static int ipi_irq_base __ro_after_init;
 static int nr_ipi __ro_after_init = NR_IPI;
 
diff --git a/drivers/irqchip/irq-gic-v5-irs.c b/drivers/irqchip/irq-gic-v5-irs.c
index fba8efceb26e..f00a4a6fece7 100644
--- a/drivers/irqchip/irq-gic-v5-irs.c
+++ b/drivers/irqchip/irq-gic-v5-irs.c
@@ -5,12 +5,20 @@
 
 #define pr_fmt(fmt)	"GICv5 IRS: " fmt
 
+#include <linux/log2.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
 
 #include <linux/irqchip.h>
 #include <linux/irqchip/arm-gic-v5.h>
 
+/*
+ * Hardcoded ID_BITS limit for systems supporting only a 1-level IST
+ * table. Systems supporting only a 1-level IST table aren't expected
+ * to require more than 2^12 LPIs. Tweak as required.
+ */
+#define LPI_ID_BITS_LINEAR		12
+
 #define IRS_FLAGS_NON_COHERENT		BIT(0)
 
 static DEFINE_PER_CPU_READ_MOSTLY(struct gicv5_irs_chip_data *, per_cpu_irs_data);
@@ -28,6 +36,331 @@ static void irs_writel_relaxed(struct gicv5_irs_chip_data *irs_data,
 	writel_relaxed(val, irs_data->irs_base + reg_offset);
 }
 
+static u64 irs_readq_relaxed(struct gicv5_irs_chip_data *irs_data,
+			     const u32 reg_offset)
+{
+	return readq_relaxed(irs_data->irs_base + reg_offset);
+}
+
+static void irs_writeq_relaxed(struct gicv5_irs_chip_data *irs_data,
+			       const u64 val, const u32 reg_offset)
+{
+	writeq_relaxed(val, irs_data->irs_base + reg_offset);
+}
+
+/*
+ * The polling wait (in gicv5_wait_for_op_s_atomic()) on a GIC register
+ * provides the memory barriers (through MMIO accessors)
+ * required to synchronize CPU and GIC access to IST memory.
+ */
+static int gicv5_irs_ist_synchronise(struct gicv5_irs_chip_data *irs_data)
+{
+	return gicv5_wait_for_op_atomic(irs_data->irs_base, GICV5_IRS_IST_STATUSR,
+					GICV5_IRS_IST_STATUSR_IDLE, NULL);
+}
+
+static int __init gicv5_irs_init_ist_linear(struct gicv5_irs_chip_data *irs_data,
+					    unsigned int lpi_id_bits,
+					    unsigned int istsz)
+{
+	size_t l2istsz;
+	u32 n, cfgr;
+	void *ist;
+	u64 baser;
+	int ret;
+
+	/* Taken from GICv5 specifications 10.2.1.13 IRS_IST_BASER */
+	n = max(5, lpi_id_bits + 1 + istsz);
+
+	l2istsz = BIT(n + 1);
+	/*
+	 * Check memory requirements. For a linear IST we cap the number
+	 * of ID bits, so this check is really belt and braces. Entries
+	 * are 2^(2 + istsz) bytes: limit lpi_id_bits so that the table
+	 * (2^lpi_id_bits entries) fits in KMALLOC_MAX_SIZE.
+	 */
+	if (l2istsz > KMALLOC_MAX_SIZE) {
+		u8 lpi_id_cap = ilog2(KMALLOC_MAX_SIZE) - 2 - istsz;
+
+		pr_warn("Limiting LPI ID bits from %u to %u\n",
+			lpi_id_bits, lpi_id_cap);
+		lpi_id_bits = lpi_id_cap;
+		l2istsz = KMALLOC_MAX_SIZE;
+	}
+
+	ist = kzalloc(l2istsz, GFP_KERNEL);
+	if (!ist)
+		return -ENOMEM;
+
+	if (irs_data->flags & IRS_FLAGS_NON_COHERENT)
+		dcache_clean_inval_poc((unsigned long)ist,
+				       (unsigned long)ist + l2istsz);
+	else
+		dsb(ishst);
+
+	cfgr = FIELD_PREP(GICV5_IRS_IST_CFGR_STRUCTURE,
+			  GICV5_IRS_IST_CFGR_STRUCTURE_LINEAR)	|
+	       FIELD_PREP(GICV5_IRS_IST_CFGR_ISTSZ, istsz)	|
+	       FIELD_PREP(GICV5_IRS_IST_CFGR_L2SZ,
+			  GICV5_IRS_IST_CFGR_L2SZ_4K)		|
+	       FIELD_PREP(GICV5_IRS_IST_CFGR_LPI_ID_BITS, lpi_id_bits);
+	irs_writel_relaxed(irs_data, cfgr, GICV5_IRS_IST_CFGR);
+
+	gicv5_global_data.ist.l2 = false;
+
+	baser = (virt_to_phys(ist) & GICV5_IRS_IST_BASER_ADDR_MASK) |
+		FIELD_PREP(GICV5_IRS_IST_BASER_VALID, 0x1);
+	irs_writeq_relaxed(irs_data, baser, GICV5_IRS_IST_BASER);
+
+	ret = gicv5_irs_ist_synchronise(irs_data);
+	if (ret) {
+		kfree(ist);
+		return ret;
+	}
+
+	return 0;
+}
+
+static int __init gicv5_irs_init_ist_two_level(struct gicv5_irs_chip_data *irs_data,
+					       unsigned int lpi_id_bits,
+					       unsigned int istsz,
+					       unsigned int l2sz)
+{
+	__le64 *l1ist;
+	u32 cfgr, n;
+	size_t l1sz;
+	u64 baser;
+	int ret;
+
+	/* Taken from GICv5 specifications 10.2.1.13 IRS_IST_BASER */
+	n = max(5, lpi_id_bits - ((10 - istsz) + (2 * l2sz)) + 2);
+
+	l1sz = BIT(n + 1);
+
+	l1ist = kzalloc(l1sz, GFP_KERNEL);
+	if (!l1ist)
+		return -ENOMEM;
+
+	if (irs_data->flags & IRS_FLAGS_NON_COHERENT)
+		dcache_clean_inval_poc((unsigned long)l1ist,
+				       (unsigned long)l1ist + l1sz);
+	else
+		dsb(ishst);
+
+	cfgr = FIELD_PREP(GICV5_IRS_IST_CFGR_STRUCTURE,
+			  GICV5_IRS_IST_CFGR_STRUCTURE_TWO_LEVEL)	|
+	       FIELD_PREP(GICV5_IRS_IST_CFGR_ISTSZ, istsz)		|
+	       FIELD_PREP(GICV5_IRS_IST_CFGR_L2SZ, l2sz)		|
+	       FIELD_PREP(GICV5_IRS_IST_CFGR_LPI_ID_BITS, lpi_id_bits);
+	irs_writel_relaxed(irs_data, cfgr, GICV5_IRS_IST_CFGR);
+
+	/*
+	 * The L2SZ determine bits required at L2 level. Number of bytes
+	 * required by metadata is reported through istsz - the number of bits
+	 * covered by L2 entries scales accordingly.
+	 */
+	gicv5_global_data.ist.l2_size = BIT(11 + (2 * l2sz) + 1);
+	gicv5_global_data.ist.l2_bits = (10 - istsz) + (2 * l2sz);
+	gicv5_global_data.ist.l1ist_addr = l1ist;
+	gicv5_global_data.ist.l2 = true;
+
+	baser = (virt_to_phys(l1ist) & GICV5_IRS_IST_BASER_ADDR_MASK) |
+		FIELD_PREP(GICV5_IRS_IST_BASER_VALID, 0x1);
+	irs_writeq_relaxed(irs_data, baser, GICV5_IRS_IST_BASER);
+
+	ret = gicv5_irs_ist_synchronise(irs_data);
+	if (ret) {
+		kfree(l1ist);
+		return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Alloc L2 IST entries on demand.
+ *
+ * Locking/serialization is guaranteed by irqdomain core code by
+ * taking the hierarchical domain struct irq_domain.root->mutex.
+ */
+int gicv5_irs_iste_alloc(const u32 lpi)
+{
+	struct gicv5_irs_chip_data *irs_data;
+	unsigned int index;
+	u32 l2istr, l2bits;
+	__le64 *l1ist;
+	size_t l2size;
+	void *l2ist;
+	int ret;
+
+	if (!gicv5_global_data.ist.l2)
+		return 0;
+
+	irs_data = per_cpu(per_cpu_irs_data, smp_processor_id());
+	if (!irs_data)
+		return -ENOENT;
+
+	l2size = gicv5_global_data.ist.l2_size;
+	l2bits = gicv5_global_data.ist.l2_bits;
+	l1ist = gicv5_global_data.ist.l1ist_addr;
+	index = lpi >> l2bits;
+
+	if (FIELD_GET(GICV5_ISTL1E_VALID, le64_to_cpu(l1ist[index])))
+		return 0;
+
+	l2ist = kzalloc(l2size, GFP_KERNEL);
+	if (!l2ist)
+		return -ENOMEM;
+
+	l1ist[index] = cpu_to_le64(virt_to_phys(l2ist) & GICV5_ISTL1E_L2_ADDR_MASK);
+
+	if (irs_data->flags & IRS_FLAGS_NON_COHERENT) {
+		dcache_clean_inval_poc((unsigned long)l2ist,
+				       (unsigned long)l2ist + l2size);
+		dcache_clean_poc((unsigned long)(l1ist + index),
+				 (unsigned long)(l1ist + index) + sizeof(*l1ist));
+	} else {
+		dsb(ishst);
+	}
+
+	l2istr = FIELD_PREP(GICV5_IRS_MAP_L2_ISTR_ID, lpi);
+	irs_writel_relaxed(irs_data, l2istr, GICV5_IRS_MAP_L2_ISTR);
+
+	ret = gicv5_irs_ist_synchronise(irs_data);
+	if (ret) {
+		l1ist[index] = 0;
+		kfree(l2ist);
+		return ret;
+	}
+
+	/*
+	 * Make sure we invalidate the cache line pulled before the IRS
+	 * had a chance to update the L1 entry and mark it valid.
+	 */
+	if (irs_data->flags & IRS_FLAGS_NON_COHERENT) {
+		/*
+		 * gicv5_irs_ist_synchronise() includes memory
+		 * barriers (MMIO accessors) required to guarantee that the
+		 * following dcache invalidation is not executed before the
+		 * IST mapping operation has completed.
+		 */
+		dcache_inval_poc((unsigned long)(l1ist + index),
+				 (unsigned long)(l1ist + index) + sizeof(*l1ist));
+	}
+
+	return 0;
+}
+
+/*
+ * Try to match the L2 IST size to the pagesize, and if this is not possible
+ * pick the smallest supported L2 size in order to minimise the requirement for
+ * physically contiguous blocks of memory as page-sized allocations are
+ * guaranteed to be physically contiguous, and are by definition the easiest to
+ * find.
+ *
+ * Fall back to the smallest supported size (in the event that the pagesize
+ * itself is not supported) again serves to make it easier to find physically
+ * contiguous blocks of memory.
+ */
+static unsigned int gicv5_irs_l2_sz(u32 idr2)
+{
+	switch (PAGE_SIZE) {
+	case SZ_64K:
+		if (GICV5_IRS_IST_L2SZ_SUPPORT_64KB(idr2))
+			return GICV5_IRS_IST_CFGR_L2SZ_64K;
+		fallthrough;
+	case SZ_4K:
+		if (GICV5_IRS_IST_L2SZ_SUPPORT_4KB(idr2))
+			return GICV5_IRS_IST_CFGR_L2SZ_4K;
+		fallthrough;
+	case SZ_16K:
+		if (GICV5_IRS_IST_L2SZ_SUPPORT_16KB(idr2))
+			return GICV5_IRS_IST_CFGR_L2SZ_16K;
+		break;
+	}
+
+	if (GICV5_IRS_IST_L2SZ_SUPPORT_4KB(idr2))
+		return GICV5_IRS_IST_CFGR_L2SZ_4K;
+
+	return GICV5_IRS_IST_CFGR_L2SZ_64K;
+}
+
+static int __init gicv5_irs_init_ist(struct gicv5_irs_chip_data *irs_data)
+{
+	u32 lpi_id_bits, idr2_id_bits, idr2_min_lpi_id_bits, l2_iste_sz, l2sz;
+	u32 l2_iste_sz_split, idr2;
+	bool two_levels, istmd;
+	u64 baser;
+	int ret;
+
+	baser = irs_readq_relaxed(irs_data, GICV5_IRS_IST_BASER);
+	if (FIELD_GET(GICV5_IRS_IST_BASER_VALID, baser)) {
+		pr_err("IST is marked as valid already; cannot allocate\n");
+		return -EPERM;
+	}
+
+	idr2 = irs_readl_relaxed(irs_data, GICV5_IRS_IDR2);
+	two_levels = !!FIELD_GET(GICV5_IRS_IDR2_IST_LEVELS, idr2);
+
+	idr2_id_bits = FIELD_GET(GICV5_IRS_IDR2_ID_BITS, idr2);
+	idr2_min_lpi_id_bits = FIELD_GET(GICV5_IRS_IDR2_MIN_LPI_ID_BITS, idr2);
+
+	/*
+	 * For two level tables we are always supporting the maximum allowed
+	 * number of IDs.
+	 *
+	 * For 1-level tables, we should support a number of bits that
+	 * is >= min_lpi_id_bits but cap it to LPI_ID_BITS_LINEAR lest
+	 * the level 1-table gets too large and its memory allocation
+	 * may fail.
+	 */
+	if (two_levels) {
+		lpi_id_bits = idr2_id_bits;
+	} else {
+		lpi_id_bits = max(LPI_ID_BITS_LINEAR, idr2_min_lpi_id_bits);
+		lpi_id_bits = min(lpi_id_bits, idr2_id_bits);
+	}
+
+	/*
+	 * Cap the ID bits according to the CPUIF supported ID bits
+	 */
+	lpi_id_bits = min(lpi_id_bits, gicv5_global_data.cpuif_id_bits);
+
+	if (two_levels)
+		l2sz = gicv5_irs_l2_sz(idr2);
+
+	istmd = !!FIELD_GET(GICV5_IRS_IDR2_ISTMD, idr2);
+
+	l2_iste_sz = GICV5_IRS_IST_CFGR_ISTSZ_4;
+
+	if (istmd) {
+		l2_iste_sz_split = FIELD_GET(GICV5_IRS_IDR2_ISTMD_SZ, idr2);
+
+		if (lpi_id_bits < l2_iste_sz_split)
+			l2_iste_sz = GICV5_IRS_IST_CFGR_ISTSZ_8;
+		else
+			l2_iste_sz = GICV5_IRS_IST_CFGR_ISTSZ_16;
+	}
+
+	/*
+	 * Follow GICv5 specification recommendation to opt in for two
+	 * level tables (ref: 10.2.1.14 IRS_IST_CFGR).
+	 */
+	if (two_levels && (lpi_id_bits > ((10 - l2_iste_sz) + (2 * l2sz)))) {
+		ret = gicv5_irs_init_ist_two_level(irs_data, lpi_id_bits,
+						   l2_iste_sz, l2sz);
+	} else {
+		ret = gicv5_irs_init_ist_linear(irs_data, lpi_id_bits,
+						l2_iste_sz);
+	}
+	if (ret)
+		return ret;
+
+	gicv5_init_lpis(BIT(lpi_id_bits));
+
+	return 0;
+}
+
 struct iaffid_entry {
 	u16	iaffid;
 	bool	valid;
@@ -362,6 +695,13 @@ static int __init gicv5_irs_init(struct device_node *node)
 		goto out_iomem;
 	}
 
+	idr = irs_readl_relaxed(irs_data, GICV5_IRS_IDR2);
+	if (WARN(!FIELD_GET(GICV5_IRS_IDR2_LPI, idr),
+		 "LPI support not available - no IPIs, can't proceed\n")) {
+		ret = -ENODEV;
+		goto out_iomem;
+	}
+
 	idr = irs_readl_relaxed(irs_data, GICV5_IRS_IDR7);
 	irs_data->spi_min = FIELD_GET(GICV5_IRS_IDR7_SPI_BASE, idr);
 
@@ -391,6 +731,8 @@ static int __init gicv5_irs_init(struct device_node *node)
 		spi_count = FIELD_GET(GICV5_IRS_IDR5_SPI_RANGE, idr);
 		gicv5_global_data.global_spi_count = spi_count;
 
+		gicv5_init_lpi_domain();
+
 		pr_debug("Detected %u SPIs globally\n", spi_count);
 	}
 
@@ -409,6 +751,9 @@ void __init gicv5_irs_remove(void)
 {
 	struct gicv5_irs_chip_data *irs_data, *tmp_data;
 
+	gicv5_free_lpi_domain();
+	gicv5_deinit_lpis();
+
 	list_for_each_entry_safe(irs_data, tmp_data, &irs_nodes, entry) {
 		iounmap(irs_data->irs_base);
 		list_del(&irs_data->entry);
@@ -416,6 +761,25 @@ void __init gicv5_irs_remove(void)
 	}
 }
 
+int __init gicv5_irs_enable(void)
+{
+	struct gicv5_irs_chip_data *irs_data;
+	int ret;
+
+	irs_data = list_first_entry_or_null(&irs_nodes,
+					    struct gicv5_irs_chip_data, entry);
+	if (!irs_data)
+		return -ENODEV;
+
+	ret = gicv5_irs_init_ist(irs_data);
+	if (ret) {
+		pr_err("Failed to init IST\n");
+		return ret;
+	}
+
+	return 0;
+}
+
 int __init gicv5_irs_of_probe(struct device_node *parent)
 {
 	struct device_node *np;
diff --git a/drivers/irqchip/irq-gic-v5.c b/drivers/irqchip/irq-gic-v5.c
index 9c55ddcfa0df..84ed13c4f2b1 100644
--- a/drivers/irqchip/irq-gic-v5.c
+++ b/drivers/irqchip/irq-gic-v5.c
@@ -5,7 +5,9 @@
 
 #define pr_fmt(fmt)	"GICv5: " fmt
 
+#include <linux/idr.h>
 #include <linux/irqdomain.h>
+#include <linux/slab.h>
 #include <linux/wordpart.h>
 
 #include <linux/irqchip.h>
@@ -28,6 +30,42 @@ static bool gicv5_cpuif_has_gcie(void)
 
 struct gicv5_chip_data gicv5_global_data __read_mostly;
 
+static DEFINE_IDA(lpi_ida);
+static u32 num_lpis __ro_after_init;
+
+void __init gicv5_init_lpis(u32 lpis)
+{
+	num_lpis = lpis;
+}
+
+void __init gicv5_deinit_lpis(void)
+{
+	num_lpis = 0;
+}
+
+static int alloc_lpi(void)
+{
+	if (!num_lpis)
+		return -ENOSPC;
+
+	return ida_alloc_max(&lpi_ida, num_lpis - 1, GFP_KERNEL);
+}
+
+static void release_lpi(u32 lpi)
+{
+	ida_free(&lpi_ida, lpi);
+}
+
+int gicv5_alloc_lpi(void)
+{
+	return alloc_lpi();
+}
+
+void gicv5_free_lpi(u32 lpi)
+{
+	release_lpi(lpi);
+}
+
 static void gicv5_ppi_priority_init(void)
 {
 	write_sysreg_s(REPEAT_BYTE(GICV5_IRQ_PRI_MI), SYS_ICC_PPI_PRIORITYR0_EL1);
@@ -60,7 +98,7 @@ static void gicv5_hwirq_init(irq_hw_number_t hwirq, u8 priority, u8 hwirq_type)
 	u16 iaffid;
 	int ret;
 
-	if (hwirq_type == GICV5_HWIRQ_TYPE_SPI) {
+	if (hwirq_type == GICV5_HWIRQ_TYPE_LPI || hwirq_type == GICV5_HWIRQ_TYPE_SPI) {
 		cdpri = FIELD_PREP(GICV5_GIC_CDPRI_PRIORITY_MASK, priority)	|
 			FIELD_PREP(GICV5_GIC_CDPRI_TYPE_MASK, hwirq_type)	|
 			FIELD_PREP(GICV5_GIC_CDPRI_ID_MASK, hwirq);
@@ -122,6 +160,11 @@ static void gicv5_spi_irq_mask(struct irq_data *d)
 	gicv5_iri_irq_mask(d, GICV5_HWIRQ_TYPE_SPI);
 }
 
+static void gicv5_lpi_irq_mask(struct irq_data *d)
+{
+	gicv5_iri_irq_mask(d, GICV5_HWIRQ_TYPE_LPI);
+}
+
 static void gicv5_ppi_irq_unmask(struct irq_data *d)
 {
 	u64 hwirq_id_bit = BIT_ULL(d->hwirq % 64);
@@ -149,7 +192,7 @@ static void gicv5_iri_irq_unmask(struct irq_data *d, u8 hwirq_type)
 	/*
 	 * Rule R_XCLJC states that the effects of a GIC system instruction
 	 * complete in finite time and that's the only requirement when
-	 * unmasking an SPI IRQ.
+	 * unmasking an SPI/LPI IRQ.
 	 */
 	gic_insn(cden, CDEN);
 }
@@ -159,6 +202,11 @@ static void gicv5_spi_irq_unmask(struct irq_data *d)
 	gicv5_iri_irq_unmask(d, GICV5_HWIRQ_TYPE_SPI);
 }
 
+static void gicv5_lpi_irq_unmask(struct irq_data *d)
+{
+	gicv5_iri_irq_unmask(d, GICV5_HWIRQ_TYPE_LPI);
+}
+
 static void gicv5_hwirq_eoi(u32 hwirq_id, u8 hwirq_type)
 {
 	u64 cddi;
@@ -181,6 +229,11 @@ static void gicv5_spi_irq_eoi(struct irq_data *d)
 	gicv5_hwirq_eoi(d->hwirq, GICV5_HWIRQ_TYPE_SPI);
 }
 
+static void gicv5_lpi_irq_eoi(struct irq_data *d)
+{
+	gicv5_hwirq_eoi(d->hwirq, GICV5_HWIRQ_TYPE_LPI);
+}
+
 static int gicv5_iri_irq_set_affinity(struct irq_data *d,
 				      const struct cpumask *mask_val,
 				      bool force, u8 hwirq_type)
@@ -216,6 +269,14 @@ static int gicv5_spi_irq_set_affinity(struct irq_data *d,
 					  GICV5_HWIRQ_TYPE_SPI);
 }
 
+static int gicv5_lpi_irq_set_affinity(struct irq_data *d,
+				      const struct cpumask *mask_val,
+				      bool force)
+{
+	return gicv5_iri_irq_set_affinity(d, mask_val, force,
+					  GICV5_HWIRQ_TYPE_LPI);
+}
+
 enum ppi_reg {
 	PPI_PENDING,
 	PPI_ACTIVE,
@@ -336,6 +397,14 @@ static int gicv5_spi_irq_get_irqchip_state(struct irq_data *d,
 					       GICV5_HWIRQ_TYPE_SPI);
 }
 
+static int gicv5_lpi_irq_get_irqchip_state(struct irq_data *d,
+					   enum irqchip_irq_state which,
+					   bool *state)
+{
+	return gicv5_iri_irq_get_irqchip_state(d, which, state,
+					       GICV5_HWIRQ_TYPE_LPI);
+}
+
 static int gicv5_ppi_irq_set_irqchip_state(struct irq_data *d,
 					   enum irqchip_irq_state which,
 					   bool state)
@@ -370,6 +439,11 @@ static void gicv5_spi_irq_write_pending_state(struct irq_data *d, bool state)
 	gicv5_iri_irq_write_pending_state(d, state, GICV5_HWIRQ_TYPE_SPI);
 }
 
+static void gicv5_lpi_irq_write_pending_state(struct irq_data *d, bool state)
+{
+	gicv5_iri_irq_write_pending_state(d, state, GICV5_HWIRQ_TYPE_LPI);
+}
+
 static int gicv5_spi_irq_set_irqchip_state(struct irq_data *d,
 					   enum irqchip_irq_state which,
 					   bool state)
@@ -386,12 +460,41 @@ static int gicv5_spi_irq_set_irqchip_state(struct irq_data *d,
 	return 0;
 }
 
+static int gicv5_lpi_irq_set_irqchip_state(struct irq_data *d,
+					   enum irqchip_irq_state which,
+					   bool state)
+{
+	switch (which) {
+	case IRQCHIP_STATE_PENDING:
+		gicv5_lpi_irq_write_pending_state(d, state);
+		break;
+
+	default:
+		pr_debug("Unexpected irqchip_irq_state\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int gicv5_spi_irq_retrigger(struct irq_data *data)
 {
 	return !gicv5_spi_irq_set_irqchip_state(data, IRQCHIP_STATE_PENDING,
 						true);
 }
 
+static int gicv5_lpi_irq_retrigger(struct irq_data *data)
+{
+	return !gicv5_lpi_irq_set_irqchip_state(data, IRQCHIP_STATE_PENDING,
+						true);
+}
+
+static void gicv5_ipi_send_single(struct irq_data *d, unsigned int cpu)
+{
+	/* Mark the LPI pending */
+	irq_chip_retrigger_hierarchy(d);
+}
+
 static bool gicv5_ppi_irq_is_level(irq_hw_number_t hwirq)
 {
 	u64 bit = BIT_ULL(hwirq % 64);
@@ -425,6 +528,32 @@ static const struct irq_chip gicv5_spi_irq_chip = {
 				  IRQCHIP_MASK_ON_SUSPEND,
 };
 
+static const struct irq_chip gicv5_lpi_irq_chip = {
+	.name			= "GICv5-LPI",
+	.irq_mask		= gicv5_lpi_irq_mask,
+	.irq_unmask		= gicv5_lpi_irq_unmask,
+	.irq_eoi		= gicv5_lpi_irq_eoi,
+	.irq_set_affinity	= gicv5_lpi_irq_set_affinity,
+	.irq_retrigger		= gicv5_lpi_irq_retrigger,
+	.irq_get_irqchip_state	= gicv5_lpi_irq_get_irqchip_state,
+	.irq_set_irqchip_state	= gicv5_lpi_irq_set_irqchip_state,
+	.flags			= IRQCHIP_SKIP_SET_WAKE	  |
+				  IRQCHIP_MASK_ON_SUSPEND,
+};
+
+static const struct irq_chip gicv5_ipi_irq_chip = {
+	.name			= "GICv5-IPI",
+	.irq_mask		= irq_chip_mask_parent,
+	.irq_unmask		= irq_chip_unmask_parent,
+	.irq_eoi		= irq_chip_eoi_parent,
+	.irq_set_affinity	= irq_chip_set_affinity_parent,
+	.irq_get_irqchip_state	= irq_chip_get_parent_state,
+	.irq_set_irqchip_state	= irq_chip_set_parent_state,
+	.ipi_send_single	= gicv5_ipi_send_single,
+	.flags			= IRQCHIP_SKIP_SET_WAKE	  |
+				  IRQCHIP_MASK_ON_SUSPEND,
+};
+
 static __always_inline int gicv5_irq_domain_translate(struct irq_domain *d,
 						      struct irq_fwspec *fwspec,
 						      irq_hw_number_t *hwirq,
@@ -585,6 +714,130 @@ static const struct irq_domain_ops gicv5_irq_spi_domain_ops = {
 	.free		= gicv5_irq_domain_free,
 	.select		= gicv5_irq_spi_domain_select
 };
+
+static void gicv5_lpi_config_reset(struct irq_data *d)
+{
+	u64 cdhm;
+
+	/*
+	 * Reset LPIs handling mode to edge by default and clear pending
+	 * state to make sure we start the LPI with a clean state from
+	 * previous incarnations.
+	 */
+	cdhm = FIELD_PREP(GICV5_GIC_CDHM_HM_MASK, 0)				|
+	       FIELD_PREP(GICV5_GIC_CDHM_TYPE_MASK, GICV5_HWIRQ_TYPE_LPI)	|
+	       FIELD_PREP(GICV5_GIC_CDHM_ID_MASK, d->hwirq);
+	gic_insn(cdhm, CDHM);
+
+	gicv5_lpi_irq_write_pending_state(d, false);
+}
+
+static int gicv5_irq_lpi_domain_alloc(struct irq_domain *domain, unsigned int virq,
+				      unsigned int nr_irqs, void *arg)
+{
+	irq_hw_number_t hwirq;
+	struct irq_data *irqd;
+	u32 *lpi = arg;
+	int ret;
+
+	if (WARN_ON_ONCE(nr_irqs != 1))
+		return -EINVAL;
+
+	hwirq = *lpi;
+
+	irqd = irq_domain_get_irq_data(domain, virq);
+
+	irq_domain_set_info(domain, virq, hwirq, &gicv5_lpi_irq_chip, NULL,
+			    handle_fasteoi_irq, NULL, NULL);
+	irqd_set_single_target(irqd);
+
+	ret = gicv5_irs_iste_alloc(hwirq);
+	if (ret < 0)
+		return ret;
+
+	gicv5_hwirq_init(hwirq, GICV5_IRQ_PRI_MI, GICV5_HWIRQ_TYPE_LPI);
+	gicv5_lpi_config_reset(irqd);
+
+	return 0;
+}
+
+static const struct irq_domain_ops gicv5_irq_lpi_domain_ops = {
+	.alloc	= gicv5_irq_lpi_domain_alloc,
+	.free	= gicv5_irq_domain_free,
+};
+
+void __init gicv5_init_lpi_domain(void)
+{
+	struct irq_domain *d;
+
+	d = irq_domain_create_tree(NULL, &gicv5_irq_lpi_domain_ops, NULL);
+	gicv5_global_data.lpi_domain = d;
+}
+
+void __init gicv5_free_lpi_domain(void)
+{
+	irq_domain_remove(gicv5_global_data.lpi_domain);
+	gicv5_global_data.lpi_domain = NULL;
+}
+
+static int gicv5_irq_ipi_domain_alloc(struct irq_domain *domain, unsigned int virq,
+				      unsigned int nr_irqs, void *arg)
+{
+	struct irq_data *irqd;
+	int ret, i;
+	u32 lpi;
+
+	for (i = 0; i < nr_irqs; i++) {
+		ret = gicv5_alloc_lpi();
+		if (ret < 0)
+			return ret;
+
+		lpi = ret;
+
+		ret = irq_domain_alloc_irqs_parent(domain, virq + i, 1, &lpi);
+		if (ret) {
+			gicv5_free_lpi(lpi);
+			return ret;
+		}
+
+		irqd = irq_domain_get_irq_data(domain, virq + i);
+
+		irq_domain_set_hwirq_and_chip(domain, virq + i, i,
+				&gicv5_ipi_irq_chip, NULL);
+
+		irqd_set_single_target(irqd);
+
+		irq_set_handler(virq + i, handle_percpu_irq);
+	}
+
+	return 0;
+}
+
+static void gicv5_irq_ipi_domain_free(struct irq_domain *domain, unsigned int virq,
+				      unsigned int nr_irqs)
+{
+	struct irq_data *d;
+	unsigned int i;
+
+	for (i = 0; i < nr_irqs; i++) {
+		d = irq_domain_get_irq_data(domain, virq + i);
+
+		if (!d)
+			return;
+
+		gicv5_free_lpi(d->parent_data->hwirq);
+
+		irq_set_handler(virq + i, NULL);
+		irq_domain_reset_irq_data(d);
+		irq_domain_free_irqs_parent(domain, virq + i, 1);
+	}
+}
+
+static const struct irq_domain_ops gicv5_irq_ipi_domain_ops = {
+	.alloc	= gicv5_irq_ipi_domain_alloc,
+	.free	= gicv5_irq_ipi_domain_free,
+};
+
 static void handle_irq_per_domain(u32 hwirq)
 {
 	u8 hwirq_type = FIELD_GET(GICV5_HWIRQ_TYPE, hwirq);
@@ -598,6 +851,9 @@ static void handle_irq_per_domain(u32 hwirq)
 	case GICV5_HWIRQ_TYPE_SPI:
 		domain = gicv5_global_data.spi_domain;
 		break;
+	case GICV5_HWIRQ_TYPE_LPI:
+		domain = gicv5_global_data.lpi_domain;
+		break;
 	default:
 		pr_err_once("Unknown IRQ type, bail out\n");
 		return;
@@ -679,9 +935,12 @@ static void __init gicv5_free_domains(void)
 		irq_domain_remove(gicv5_global_data.ppi_domain);
 	if (gicv5_global_data.spi_domain)
 		irq_domain_remove(gicv5_global_data.spi_domain);
+	if (gicv5_global_data.ipi_domain)
+		irq_domain_remove(gicv5_global_data.ipi_domain);
 
 	gicv5_global_data.ppi_domain = NULL;
 	gicv5_global_data.spi_domain = NULL;
+	gicv5_global_data.ipi_domain = NULL;
 }
 
 static int __init gicv5_init_domains(struct fwnode_handle *handle)
@@ -709,6 +968,19 @@ static int __init gicv5_init_domains(struct fwnode_handle *handle)
 		irq_domain_update_bus_token(d, DOMAIN_BUS_WIRED);
 	}
 
+	if (!WARN(!gicv5_global_data.lpi_domain,
+		  "LPI domain uninitialized, can't set up IPIs")) {
+		d = irq_domain_create_hierarchy(gicv5_global_data.lpi_domain,
+						0, GICV5_IPIS_PER_CPU * nr_cpu_ids,
+						NULL, &gicv5_irq_ipi_domain_ops,
+						NULL);
+
+		if (!d) {
+			gicv5_free_domains();
+			return -ENOMEM;
+		}
+		gicv5_global_data.ipi_domain = d;
+	}
 	gicv5_global_data.fwnode = handle;
 
 	return 0;
@@ -732,6 +1004,24 @@ static void gicv5_set_cpuif_pribits(void)
 	}
 }
 
+static void gicv5_set_cpuif_idbits(void)
+{
+	u32 icc_idr0 = read_sysreg_s(SYS_ICC_IDR0_EL1);
+
+	switch (FIELD_GET(ICC_IDR0_EL1_ID_BITS, icc_idr0)) {
+	case ICC_IDR0_EL1_ID_BITS_16BITS:
+		gicv5_global_data.cpuif_id_bits = 16;
+		break;
+	case ICC_IDR0_EL1_ID_BITS_24BITS:
+		gicv5_global_data.cpuif_id_bits = 24;
+		break;
+	default:
+		pr_err("Unexpected ICC_IDR0_EL1_ID_BITS value, default to 16\n");
+		gicv5_global_data.cpuif_id_bits = 16;
+		break;
+	}
+}
+
 static int __init gicv5_of_init(struct device_node *node, struct device_node *parent)
 {
 	int ret = gicv5_irs_of_probe(node);
@@ -743,6 +1033,7 @@ static int __init gicv5_of_init(struct device_node *node, struct device_node *pa
 		goto out_irs;
 
 	gicv5_set_cpuif_pribits();
+	gicv5_set_cpuif_idbits();
 
 	pri_bits = min_not_zero(gicv5_global_data.cpuif_pri_bits,
 				gicv5_global_data.irs_pri_bits);
@@ -755,6 +1046,10 @@ static int __init gicv5_of_init(struct device_node *node, struct device_node *pa
 	if (ret)
 		goto out_int;
 
+	ret = gicv5_irs_enable();
+	if (ret)
+		goto out_int;
+
 	return 0;
 
 out_int:
diff --git a/include/linux/irqchip/arm-gic-v5.h b/include/linux/irqchip/arm-gic-v5.h
index 1064a69ab33f..680eed794a35 100644
--- a/include/linux/irqchip/arm-gic-v5.h
+++ b/include/linux/irqchip/arm-gic-v5.h
@@ -7,8 +7,12 @@
 
 #include <linux/iopoll.h>
 
+#include <asm/cacheflush.h>
+#include <asm/smp.h>
 #include <asm/sysreg.h>
 
+#define GICV5_IPIS_PER_CPU		MAX_IPI
+
 /*
  * INTID handling
  */
@@ -17,6 +21,7 @@
 #define GICV5_HWIRQ_INTID		GENMASK_ULL(31, 0)
 
 #define GICV5_HWIRQ_TYPE_PPI		UL(0x1)
+#define GICV5_HWIRQ_TYPE_LPI		UL(0x2)
 #define GICV5_HWIRQ_TYPE_SPI		UL(0x3)
 
 /*
@@ -36,7 +41,7 @@
 #define GICV5_INNER_SHARE		0b11
 
 /*
- * IRS registers
+ * IRS registers and tables structures
  */
 #define GICV5_IRS_IDR1			0x0004
 #define GICV5_IRS_IDR2			0x0008
@@ -51,6 +56,10 @@
 #define GICV5_IRS_PE_SELR		0x0140
 #define GICV5_IRS_PE_STATUSR		0x0144
 #define GICV5_IRS_PE_CR0		0x0148
+#define GICV5_IRS_IST_BASER		0x0180
+#define GICV5_IRS_IST_CFGR		0x0190
+#define GICV5_IRS_IST_STATUSR		0x0194
+#define GICV5_IRS_MAP_L2_ISTR		0x01c0
 
 #define GICV5_IRS_IDR1_PRIORITY_BITS	GENMASK(22, 20)
 #define GICV5_IRS_IDR1_IAFFID_BITS	GENMASK(19, 16)
@@ -72,6 +81,11 @@
 #define GICV5_IRS_IDR5_SPI_RANGE	GENMASK(24, 0)
 #define GICV5_IRS_IDR6_SPI_IRS_RANGE	GENMASK(24, 0)
 #define GICV5_IRS_IDR7_SPI_BASE		GENMASK(23, 0)
+
+#define GICV5_IRS_IST_L2SZ_SUPPORT_4KB(r)	FIELD_GET(BIT(11), (r))
+#define GICV5_IRS_IST_L2SZ_SUPPORT_16KB(r)	FIELD_GET(BIT(12), (r))
+#define GICV5_IRS_IST_L2SZ_SUPPORT_64KB(r)	FIELD_GET(BIT(13), (r))
+
 #define GICV5_IRS_CR0_IDLE		BIT(1)
 #define GICV5_IRS_CR0_IRSEN		BIT(0)
 
@@ -103,6 +117,33 @@
 
 #define GICV5_IRS_PE_CR0_DPS		BIT(0)
 
+#define GICV5_IRS_IST_STATUSR_IDLE	BIT(0)
+
+#define GICV5_IRS_IST_CFGR_STRUCTURE	BIT(16)
+#define GICV5_IRS_IST_CFGR_ISTSZ	GENMASK(8, 7)
+#define GICV5_IRS_IST_CFGR_L2SZ		GENMASK(6, 5)
+#define GICV5_IRS_IST_CFGR_LPI_ID_BITS	GENMASK(4, 0)
+
+#define GICV5_IRS_IST_CFGR_STRUCTURE_LINEAR	0b0
+#define GICV5_IRS_IST_CFGR_STRUCTURE_TWO_LEVEL	0b1
+
+#define GICV5_IRS_IST_CFGR_ISTSZ_4	0b00
+#define GICV5_IRS_IST_CFGR_ISTSZ_8	0b01
+#define GICV5_IRS_IST_CFGR_ISTSZ_16	0b10
+
+#define GICV5_IRS_IST_CFGR_L2SZ_4K	0b00
+#define GICV5_IRS_IST_CFGR_L2SZ_16K	0b01
+#define GICV5_IRS_IST_CFGR_L2SZ_64K	0b10
+
+#define GICV5_IRS_IST_BASER_ADDR_MASK	GENMASK_ULL(55, 6)
+#define GICV5_IRS_IST_BASER_VALID	BIT_ULL(0)
+
+#define GICV5_IRS_MAP_L2_ISTR_ID	GENMASK(23, 0)
+
+#define GICV5_ISTL1E_VALID		BIT_ULL(0)
+
+#define GICV5_ISTL1E_L2_ADDR_MASK	GENMASK_ULL(55, 12)
+
 /*
  * Global Data structures and functions
  */
@@ -110,9 +151,18 @@ struct gicv5_chip_data {
 	struct fwnode_handle	*fwnode;
 	struct irq_domain	*ppi_domain;
 	struct irq_domain	*spi_domain;
+	struct irq_domain	*lpi_domain;
+	struct irq_domain	*ipi_domain;
 	u32			global_spi_count;
 	u8			cpuif_pri_bits;
+	u8			cpuif_id_bits;
 	u8			irs_pri_bits;
+	struct {
+		__le64 *l1ist_addr;
+		u32 l2_size;
+		u8 l2_bits;
+		bool l2;
+	} ist;
 };
 
 extern struct gicv5_chip_data gicv5_global_data __read_mostly;
@@ -150,10 +200,21 @@ static inline int gicv5_wait_for_op_s_atomic(void __iomem *addr, u32 offset,
 #define gicv5_wait_for_op_atomic(base, reg, mask, val) \
 	gicv5_wait_for_op_s_atomic(base, reg, #reg, mask, val)
 
+void __init gicv5_init_lpi_domain(void);
+void __init gicv5_free_lpi_domain(void);
+
 int gicv5_irs_of_probe(struct device_node *parent);
 void gicv5_irs_remove(void);
+int gicv5_irs_enable(void);
 int gicv5_irs_register_cpu(int cpuid);
 int gicv5_irs_cpu_to_iaffid(int cpu_id, u16 *iaffid);
 struct gicv5_irs_chip_data *gicv5_irs_lookup_by_spi_id(u32 spi_id);
 int gicv5_spi_irq_set_type(struct irq_data *d, unsigned int type);
+int gicv5_irs_iste_alloc(u32 lpi);
+
+void gicv5_init_lpis(u32 max);
+void gicv5_deinit_lpis(void);
+
+int gicv5_alloc_lpi(void);
+void gicv5_free_lpi(u32 lpi);
 #endif
-- 
cgit v1.2.3


From 31fd3becb920e0b31b99cf202ace637f75dd7e78 Mon Sep 17 00:00:00 2001
From: Lorenzo Pieralisi <lpieralisi@kernel.org>
Date: Thu, 3 Jul 2025 12:25:14 +0200
Subject: of/irq: Add of_msi_xlate() helper function

Add an of_msi_xlate() helper that maps a device ID and returns
the device node of the MSI controller the device ID is mapped to.

This is required by core functions that need an MSI controller device
node pointer at the same time as a mapped device ID; of_msi_map_id() is
not sufficient for that purpose.

Signed-off-by: Lorenzo Pieralisi <lpieralisi@kernel.org>
Reviewed-by: Marc Zyngier <maz@kernel.org>
Cc: Rob Herring <robh@kernel.org>
Cc: Marc Zyngier <maz@kernel.org>
Reviewed-by: Rob Herring (Arm) <robh@kernel.org>
Link: https://lore.kernel.org/r/20250703-gicv5-host-v7-24-12e71f1b3528@kernel.org
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 drivers/of/irq.c       | 22 +++++++++++++++++-----
 include/linux/of_irq.h |  5 +++++
 2 files changed, 22 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/of/irq.c b/drivers/of/irq.c
index f8ad79b9b1c9..74aaea61de13 100644
--- a/drivers/of/irq.c
+++ b/drivers/of/irq.c
@@ -670,8 +670,20 @@ err:
 	}
 }
 
-static u32 __of_msi_map_id(struct device *dev, struct device_node **np,
-			    u32 id_in)
+/**
+ * of_msi_xlate - map a MSI ID and find relevant MSI controller node
+ * @dev: device for which the mapping is to be done.
+ * @msi_np: Pointer to store the MSI controller node
+ * @id_in: Device ID.
+ *
+ * Walk up the device hierarchy looking for devices with a "msi-map"
+ * property. If found, apply the mapping to @id_in. @msi_np pointed
+ * value must be NULL on entry, if an MSI controller is found @msi_np is
+ * initialized to the MSI controller node with a reference held.
+ *
+ * Returns: The mapped MSI id.
+ */
+u32 of_msi_xlate(struct device *dev, struct device_node **msi_np, u32 id_in)
 {
 	struct device *parent_dev;
 	u32 id_out = id_in;
@@ -682,7 +694,7 @@ static u32 __of_msi_map_id(struct device *dev, struct device_node **np,
 	 */
 	for (parent_dev = dev; parent_dev; parent_dev = parent_dev->parent)
 		if (!of_map_id(parent_dev->of_node, id_in, "msi-map",
-				"msi-map-mask", np, &id_out))
+				"msi-map-mask", msi_np, &id_out))
 			break;
 	return id_out;
 }
@@ -700,7 +712,7 @@ static u32 __of_msi_map_id(struct device *dev, struct device_node **np,
  */
 u32 of_msi_map_id(struct device *dev, struct device_node *msi_np, u32 id_in)
 {
-	return __of_msi_map_id(dev, &msi_np, id_in);
+	return of_msi_xlate(dev, &msi_np, id_in);
 }
 
 /**
@@ -719,7 +731,7 @@ struct irq_domain *of_msi_map_get_device_domain(struct device *dev, u32 id,
 {
 	struct device_node *np = NULL;
 
-	__of_msi_map_id(dev, &np, id);
+	of_msi_xlate(dev, &np, id);
 	return irq_find_matching_host(np, bus_token);
 }
 
diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h
index 6337ad4e5fe8..a480063c9cb1 100644
--- a/include/linux/of_irq.h
+++ b/include/linux/of_irq.h
@@ -54,6 +54,7 @@ extern struct irq_domain *of_msi_map_get_device_domain(struct device *dev,
 							u32 id,
 							u32 bus_token);
 extern void of_msi_configure(struct device *dev, const struct device_node *np);
+extern u32 of_msi_xlate(struct device *dev, struct device_node **msi_np, u32 id_in);
 u32 of_msi_map_id(struct device *dev, struct device_node *msi_np, u32 id_in);
 #else
 static inline void of_irq_init(const struct of_device_id *matches)
@@ -100,6 +101,10 @@ static inline struct irq_domain *of_msi_map_get_device_domain(struct device *dev
 static inline void of_msi_configure(struct device *dev, struct device_node *np)
 {
 }
+static inline u32 of_msi_xlate(struct device *dev, struct device_node **msi_np, u32 id_in)
+{
+	return id_in;
+}
 static inline u32 of_msi_map_id(struct device *dev,
 				 struct device_node *msi_np, u32 id_in)
 {
-- 
cgit v1.2.3


From cd0ec59affb1f31347170bbafadb9c5cc716bca9 Mon Sep 17 00:00:00 2001
From: Lorenzo Pieralisi <lpieralisi@kernel.org>
Date: Thu, 3 Jul 2025 12:25:15 +0200
Subject: PCI/MSI: Add pci_msi_map_rid_ctlr_node() helper function

IRQchip drivers need a PCI/MSI function to map a RID to a MSI
controller deviceID namespace and at the same time retrieve the
struct device_node pointer of the MSI controller the RID is mapped
to.

Add pci_msi_map_rid_ctlr_node() to achieve this purpose.

Cc: Bjorn Helgaas <bhelgaas@google.com>

Signed-off-by: Lorenzo Pieralisi <lpieralisi@kernel.org>
Reviewed-by: Marc Zyngier <maz@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20250703-gicv5-host-v7-25-12e71f1b3528@kernel.org
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 drivers/pci/msi/irqdomain.c | 20 ++++++++++++++++++++
 include/linux/msi.h         |  1 +
 2 files changed, 21 insertions(+)

(limited to 'include')

diff --git a/drivers/pci/msi/irqdomain.c b/drivers/pci/msi/irqdomain.c
index c05152733993..8a6e80d3963a 100644
--- a/drivers/pci/msi/irqdomain.c
+++ b/drivers/pci/msi/irqdomain.c
@@ -427,6 +427,26 @@ u32 pci_msi_domain_get_msi_rid(struct irq_domain *domain, struct pci_dev *pdev)
 	return rid;
 }
 
+/**
+ * pci_msi_map_rid_ctlr_node - Get the MSI controller node and MSI requester id (RID)
+ * @pdev:	The PCI device
+ * @node:	Pointer to store the MSI controller device node
+ *
+ * Use the firmware data to find the MSI controller node for @pdev.
+ * If found map the RID and initialize @node with it. @node value must
+ * be set to NULL on entry.
+ *
+ * Returns: The RID.
+ */
+u32 pci_msi_map_rid_ctlr_node(struct pci_dev *pdev, struct device_node **node)
+{
+	u32 rid = pci_dev_id(pdev);
+
+	pci_for_each_dma_alias(pdev, get_msi_id_cb, &rid);
+
+	return of_msi_xlate(&pdev->dev, node, rid);
+}
+
 /**
  * pci_msi_get_device_domain - Get the MSI domain for a given PCI device
  * @pdev:	The PCI device
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 6863540f4b71..a418e2695b05 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -705,6 +705,7 @@ struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode,
 					     struct msi_domain_info *info,
 					     struct irq_domain *parent);
 u32 pci_msi_domain_get_msi_rid(struct irq_domain *domain, struct pci_dev *pdev);
+u32 pci_msi_map_rid_ctlr_node(struct pci_dev *pdev, struct device_node **node);
 struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev);
 #else /* CONFIG_PCI_MSI */
 static inline struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev)
-- 
cgit v1.2.3


From 8b65db1e93a227ad9cc1b67cb221b06869f0b35f Mon Sep 17 00:00:00 2001
From: Lorenzo Pieralisi <lpieralisi@kernel.org>
Date: Thu, 3 Jul 2025 12:25:17 +0200
Subject: irqchip/msi-lib: Add IRQ_DOMAIN_FLAG_FWNODE_PARENT handling

In some irqchip implementations the fwnode representing the IRQdomain
and the MSI controller fwnode do not match; in particular the IRQdomain
fwnode is the MSI controller fwnode parent.

To support selecting such IRQ domains, add a flag in core IRQ domain
code that explicitly tells the MSI lib to use the parent fwnode while
carrying out IRQ domain selection.

Update the msi-lib select callback with the resulting logic.

Signed-off-by: Lorenzo Pieralisi <lpieralisi@kernel.org>
Reviewed-by: Marc Zyngier <maz@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20250703-gicv5-host-v7-27-12e71f1b3528@kernel.org
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 drivers/irqchip/irq-msi-lib.c | 5 ++++-
 include/linux/irqdomain.h     | 3 +++
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/irqchip/irq-msi-lib.c b/drivers/irqchip/irq-msi-lib.c
index 246c30205af4..454c7f16dd4d 100644
--- a/drivers/irqchip/irq-msi-lib.c
+++ b/drivers/irqchip/irq-msi-lib.c
@@ -133,11 +133,14 @@ int msi_lib_irq_domain_select(struct irq_domain *d, struct irq_fwspec *fwspec,
 {
 	const struct msi_parent_ops *ops = d->msi_parent_ops;
 	u32 busmask = BIT(bus_token);
+	struct fwnode_handle *fwh;
 
 	if (!ops)
 		return 0;
 
-	if (fwspec->fwnode != d->fwnode || fwspec->param_count != 0)
+	fwh = d->flags & IRQ_DOMAIN_FLAG_FWNODE_PARENT ? fwnode_get_parent(fwspec->fwnode)
+						       : fwspec->fwnode;
+	if (fwh != d->fwnode || fwspec->param_count != 0)
 		return 0;
 
 	/* Handle pure domain searches */
diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 7387d183029b..25c7cbeed393 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -212,6 +212,9 @@ enum {
 	/* Address and data pair is mutable when irq_set_affinity() */
 	IRQ_DOMAIN_FLAG_MSI_IMMUTABLE	= (1 << 11),
 
+	/* IRQ domain requires parent fwnode matching */
+	IRQ_DOMAIN_FLAG_FWNODE_PARENT	= (1 << 12),
+
 	/*
 	 * Flags starting from IRQ_DOMAIN_FLAG_NONCORE are reserved
 	 * for implementation specific purposes and ignored by the
-- 
cgit v1.2.3


From 57d72196dfc8502b7e376ecdffb11c4f8766f26d Mon Sep 17 00:00:00 2001
From: Lorenzo Pieralisi <lpieralisi@kernel.org>
Date: Thu, 3 Jul 2025 12:25:18 +0200
Subject: irqchip/gic-v5: Add GICv5 ITS support

The GICv5 architecture implements Interrupt Translation Service
(ITS) components in order to translate events coming from peripherals
into interrupt events delivered to the connected IRSes.

Events (i.e. MSI memory writes to the ITS translate frame) are translated
by the ITS using tables kept in memory.

ITS translation tables for peripherals are kept in memory storage
(device table [DT] and Interrupt Translation Table [ITT]) that
is allocated by the driver on boot.

Both tables can be 1- or 2-level; the structure is chosen by the
driver after probing the ITS HW parameters and checking the
allowed table splits and supported {device/event}_IDbits.

DT table entries are allocated on demand (i.e. when a device is
probed); the DT table is sized using the number of supported
deviceID bits, since that is a system design decision (i.e. the
number of deviceID bits implemented should reflect the number
of devices expected in a system); therefore it makes sense to
allocate a DT table that can cater for the maximum number of
devices.

DT and ITT tables are allocated using the kmalloc interface;
the allocation size may be smaller than a page or larger,
and must provide contiguous memory pages.

LPI INTIDs backing the device events are allocated one-by-one
and only upon Linux IRQ allocation; this is to avoid preallocating
a large number of LPIs to cover the HW device MSI vector
size when only a few MSI entries are actually enabled by a device.

ITS cacheability/shareability attributes are programmed
according to the provided firmware ITS description.

The GICv5 partially reuses the GICv3 ITS MSI parent infrastructure
and adds functions required to retrieve the ITS translate frame
addresses out of msi-map and msi-parent properties to implement
the GICv5 ITS MSI parent callbacks.

Co-developed-by: Sascha Bischoff <sascha.bischoff@arm.com>
Signed-off-by: Sascha Bischoff <sascha.bischoff@arm.com>
Co-developed-by: Timothy Hayes <timothy.hayes@arm.com>
Signed-off-by: Timothy Hayes <timothy.hayes@arm.com>
Signed-off-by: Lorenzo Pieralisi <lpieralisi@kernel.org>
Reviewed-by: Marc Zyngier <maz@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20250703-gicv5-host-v7-28-12e71f1b3528@kernel.org
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 MAINTAINERS                              |    1 +
 drivers/irqchip/Kconfig                  |    3 +
 drivers/irqchip/Makefile                 |    2 +-
 drivers/irqchip/irq-gic-its-msi-parent.c |  166 ++++
 drivers/irqchip/irq-gic-its-msi-parent.h |    1 +
 drivers/irqchip/irq-gic-v5-irs.c         |   24 +
 drivers/irqchip/irq-gic-v5-its.c         | 1206 ++++++++++++++++++++++++++++++
 drivers/irqchip/irq-gic-v5.c             |    2 +
 include/linux/irqchip/arm-gic-v5.h       |  157 ++++
 9 files changed, 1561 insertions(+), 1 deletion(-)
 create mode 100644 drivers/irqchip/irq-gic-v5-its.c

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index b1202987eef5..b035802ea41d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1970,6 +1970,7 @@ M:	Marc Zyngier <maz@kernel.org>
 L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:	Maintained
 F:	Documentation/devicetree/bindings/interrupt-controller/arm,gic-v5*.yaml
+F:	drivers/irqchip/irq-gic-its-msi-parent.[ch]
 F:	drivers/irqchip/irq-gic-v5*.[ch]
 F:	include/linux/irqchip/arm-gic-v5.h
 
diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig
index f9eae1a645c9..67d79d33407a 100644
--- a/drivers/irqchip/Kconfig
+++ b/drivers/irqchip/Kconfig
@@ -62,6 +62,9 @@ config ARM_GIC_V5
 	bool
 	select IRQ_DOMAIN_HIERARCHY
 	select GENERIC_IRQ_EFFECTIVE_AFF_MASK
+	select GENERIC_MSI_IRQ
+	select IRQ_MSI_LIB
+	select ARM_GIC_ITS_PARENT
 
 config ARM_NVIC
 	bool
diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile
index 3ce6ea9a371b..5459f66e597f 100644
--- a/drivers/irqchip/Makefile
+++ b/drivers/irqchip/Makefile
@@ -37,7 +37,7 @@ obj-$(CONFIG_ARM_GIC_ITS_PARENT)	+= irq-gic-its-msi-parent.o
 obj-$(CONFIG_ARM_GIC_V3_ITS)		+= irq-gic-v3-its.o irq-gic-v4.o
 obj-$(CONFIG_ARM_GIC_V3_ITS_FSL_MC)	+= irq-gic-v3-its-fsl-mc-msi.o
 obj-$(CONFIG_PARTITION_PERCPU)		+= irq-partition-percpu.o
-obj-$(CONFIG_ARM_GIC_V5)		+= irq-gic-v5.o irq-gic-v5-irs.o
+obj-$(CONFIG_ARM_GIC_V5)		+= irq-gic-v5.o irq-gic-v5-irs.o irq-gic-v5-its.o
 obj-$(CONFIG_HISILICON_IRQ_MBIGEN)	+= irq-mbigen.o
 obj-$(CONFIG_ARM_NVIC)			+= irq-nvic.o
 obj-$(CONFIG_ARM_VIC)			+= irq-vic.o
diff --git a/drivers/irqchip/irq-gic-its-msi-parent.c b/drivers/irqchip/irq-gic-its-msi-parent.c
index 8beecfed2b84..eb1473f1448a 100644
--- a/drivers/irqchip/irq-gic-its-msi-parent.c
+++ b/drivers/irqchip/irq-gic-its-msi-parent.c
@@ -5,6 +5,7 @@
 // Copyright (C) 2022 Intel
 
 #include <linux/acpi_iort.h>
+#include <linux/of_address.h>
 #include <linux/pci.h>
 
 #include "irq-gic-its-msi-parent.h"
@@ -18,6 +19,23 @@
 				 MSI_FLAG_PCI_MSIX      |	\
 				 MSI_FLAG_MULTI_PCI_MSI)
 
+static int its_translate_frame_address(struct device_node *msi_node, phys_addr_t *pa)
+{
+	struct resource res;
+	int ret;
+
+	ret = of_property_match_string(msi_node, "reg-names", "ns-translate");
+	if (ret < 0)
+		return ret;
+
+	ret = of_address_to_resource(msi_node, ret, &res);
+	if (ret)
+		return ret;
+
+	*pa = res.start;
+	return 0;
+}
+
 #ifdef CONFIG_PCI_MSI
 static int its_pci_msi_vec_count(struct pci_dev *pdev, void *data)
 {
@@ -82,8 +100,46 @@ static int its_pci_msi_prepare(struct irq_domain *domain, struct device *dev,
 	msi_info = msi_get_domain_info(domain->parent);
 	return msi_info->ops->msi_prepare(domain->parent, dev, nvec, info);
 }
+
+static int its_v5_pci_msi_prepare(struct irq_domain *domain, struct device *dev,
+				  int nvec, msi_alloc_info_t *info)
+{
+	struct device_node *msi_node = NULL;
+	struct msi_domain_info *msi_info;
+	struct pci_dev *pdev;
+	phys_addr_t pa;
+	u32 rid;
+	int ret;
+
+	if (!dev_is_pci(dev))
+		return -EINVAL;
+
+	pdev = to_pci_dev(dev);
+
+	rid = pci_msi_map_rid_ctlr_node(pdev, &msi_node);
+	if (!msi_node)
+		return -ENODEV;
+
+	ret = its_translate_frame_address(msi_node, &pa);
+	/* msi_node holds a reference: drop it on success and failure alike */
+	of_node_put(msi_node);
+	if (ret)
+		return -ENODEV;
+
+	/* ITS specific DeviceID */
+	info->scratchpad[0].ul = rid;
+	/* ITS translate frame physical address */
+	info->scratchpad[1].ul = pa;
+
+	/* Always allocate power of two vectors */
+	nvec = roundup_pow_of_two(nvec);
+
+	msi_info = msi_get_domain_info(domain->parent);
+	return msi_info->ops->msi_prepare(domain->parent, dev, nvec, info);
+}
 #else /* CONFIG_PCI_MSI */
 #define its_pci_msi_prepare	NULL
+#define its_v5_pci_msi_prepare	NULL
 #endif /* !CONFIG_PCI_MSI */
 
 static int of_pmsi_get_dev_id(struct irq_domain *domain, struct device *dev,
@@ -118,6 +174,53 @@ static int of_pmsi_get_dev_id(struct irq_domain *domain, struct device *dev,
 	return ret;
 }
 
+static int of_v5_pmsi_get_msi_info(struct irq_domain *domain, struct device *dev,
+				   u32 *dev_id, phys_addr_t *pa)
+{
+	struct device_node *np = NULL;
+	int ret, index = 0;
+	/*
+	 * Retrieve the DeviceID and the ITS translate frame node pointer
+	 * out of the msi-parent property.
+	 */
+	do {
+		struct of_phandle_args args;
+
+		ret = of_parse_phandle_with_args(dev->of_node,
+						 "msi-parent", "#msi-cells",
+						 index, &args);
+		if (ret)
+			break;
+		/*
+		 * The IRQ domain fwnode is the msi controller parent in
+		 * GICv5 (msi controller nodes are the ITS translate frames).
+		 * args.np is returned with a reference held: drop it below.
+		 */
+		if (args.np->parent == irq_domain_get_of_node(domain)) {
+			if (WARN_ON(args.args_count != 1)) {
+				ret = -EINVAL;
+			} else {
+				*dev_id = args.args[0];
+				if (its_translate_frame_address(args.np, pa))
+					ret = -ENODEV;
+			}
+			of_node_put(args.np);
+			return ret;
+		}
+		of_node_put(args.np);
+		index++;
+	} while (!ret);
+
+	/* No matching msi-parent entry was found, fall back to msi-map */
+	ret = of_map_id(dev->of_node, dev->id, "msi-map", "msi-map-mask", &np, dev_id);
+	if (np) {
+		ret = its_translate_frame_address(np, pa);
+		of_node_put(np);
+	}
+
+	return ret;
+}
+
 int __weak iort_pmsi_get_dev_id(struct device *dev, u32 *dev_id)
 {
 	return -1;
@@ -148,6 +251,33 @@ static int its_pmsi_prepare(struct irq_domain *domain, struct device *dev,
 					  dev, nvec, info);
 }
 
+static int its_v5_pmsi_prepare(struct irq_domain *domain, struct device *dev,
+			       int nvec, msi_alloc_info_t *info)
+{
+	struct msi_domain_info *msi_info;
+	phys_addr_t pa;
+	u32 dev_id;
+	int ret;
+
+	if (!dev->of_node)
+		return -ENODEV;
+
+	ret = of_v5_pmsi_get_msi_info(domain->parent, dev, &dev_id, &pa);
+	if (ret)
+		return ret;
+
+	/* ITS specific DeviceID */
+	info->scratchpad[0].ul = dev_id;
+	/* ITS translate frame physical address */
+	info->scratchpad[1].ul = pa;
+
+	/* Allocate always as a power of 2 */
+	nvec = roundup_pow_of_two(nvec);
+
+	msi_info = msi_get_domain_info(domain->parent);
+	return msi_info->ops->msi_prepare(domain->parent, dev, nvec, info);
+}
+
 static void its_msi_teardown(struct irq_domain *domain, msi_alloc_info_t *info)
 {
 	struct msi_domain_info *msi_info;
@@ -199,6 +329,32 @@ static bool its_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
 	return true;
 }
 
+static bool its_v5_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
+				     struct irq_domain *real_parent, struct msi_domain_info *info)
+{
+	if (!msi_lib_init_dev_msi_info(dev, domain, real_parent, info))
+		return false;
+
+	switch (info->bus_token) {
+	case DOMAIN_BUS_PCI_DEVICE_MSI:
+	case DOMAIN_BUS_PCI_DEVICE_MSIX:
+		info->ops->msi_prepare = its_v5_pci_msi_prepare;
+		info->ops->msi_teardown = its_msi_teardown;
+		break;
+	case DOMAIN_BUS_DEVICE_MSI:
+	case DOMAIN_BUS_WIRED_TO_MSI:
+		info->ops->msi_prepare = its_v5_pmsi_prepare;
+		info->ops->msi_teardown = its_msi_teardown;
+		break;
+	default:
+		/* Confused. How did the lib return true? */
+		WARN_ON_ONCE(1);
+		return false;
+	}
+
+	return true;
+}
+
 const struct msi_parent_ops gic_v3_its_msi_parent_ops = {
 	.supported_flags	= ITS_MSI_FLAGS_SUPPORTED,
 	.required_flags		= ITS_MSI_FLAGS_REQUIRED,
@@ -208,3 +364,13 @@ const struct msi_parent_ops gic_v3_its_msi_parent_ops = {
 	.prefix			= "ITS-",
 	.init_dev_msi_info	= its_init_dev_msi_info,
 };
+
+const struct msi_parent_ops gic_v5_its_msi_parent_ops = {
+	.supported_flags	= ITS_MSI_FLAGS_SUPPORTED,
+	.required_flags		= ITS_MSI_FLAGS_REQUIRED,
+	.chip_flags		= MSI_CHIP_FLAG_SET_EOI,
+	.bus_select_token	= DOMAIN_BUS_NEXUS,
+	.bus_select_mask	= MATCH_PCI_MSI | MATCH_PLATFORM_MSI,
+	.prefix			= "ITS-v5-",
+	.init_dev_msi_info	= its_v5_init_dev_msi_info,
+};
diff --git a/drivers/irqchip/irq-gic-its-msi-parent.h b/drivers/irqchip/irq-gic-its-msi-parent.h
index 75e223e673ce..df016f347337 100644
--- a/drivers/irqchip/irq-gic-its-msi-parent.h
+++ b/drivers/irqchip/irq-gic-its-msi-parent.h
@@ -7,5 +7,6 @@
 #define _IRQ_GIC_ITS_MSI_PARENT_H
 
 extern const struct msi_parent_ops gic_v3_its_msi_parent_ops;
+extern const struct msi_parent_ops gic_v5_its_msi_parent_ops;
 
 #endif /* _IRQ_GIC_ITS_MSI_PARENT_H */
diff --git a/drivers/irqchip/irq-gic-v5-irs.c b/drivers/irqchip/irq-gic-v5-irs.c
index f00a4a6fece7..f845415f9143 100644
--- a/drivers/irqchip/irq-gic-v5-irs.c
+++ b/drivers/irqchip/irq-gic-v5-irs.c
@@ -484,6 +484,22 @@ static int gicv5_irs_wait_for_idle(struct gicv5_irs_chip_data *irs_data)
 					GICV5_IRS_CR0_IDLE, NULL);
 }
 
+void gicv5_irs_syncr(void)
+{
+	struct gicv5_irs_chip_data *irs_data;
+	u32 syncr;
+
+	irs_data = list_first_entry_or_null(&irs_nodes, struct gicv5_irs_chip_data, entry);
+	if (WARN_ON_ONCE(!irs_data))
+		return;
+
+	syncr = FIELD_PREP(GICV5_IRS_SYNCR_SYNC, 1);
+	irs_writel_relaxed(irs_data, syncr, GICV5_IRS_SYNCR);
+
+	gicv5_wait_for_op(irs_data->irs_base, GICV5_IRS_SYNC_STATUSR,
+			  GICV5_IRS_SYNC_STATUSR_IDLE);
+}
+
 int gicv5_irs_register_cpu(int cpuid)
 {
 	struct gicv5_irs_chip_data *irs_data;
@@ -780,6 +796,14 @@ int __init gicv5_irs_enable(void)
 	return 0;
 }
 
+void __init gicv5_irs_its_probe(void)
+{
+	struct gicv5_irs_chip_data *irs_data;
+
+	list_for_each_entry(irs_data, &irs_nodes, entry)
+		gicv5_its_of_probe(to_of_node(irs_data->fwnode));
+}
+
 int __init gicv5_irs_of_probe(struct device_node *parent)
 {
 	struct device_node *np;
diff --git a/drivers/irqchip/irq-gic-v5-its.c b/drivers/irqchip/irq-gic-v5-its.c
new file mode 100644
index 000000000000..700df6d0687e
--- /dev/null
+++ b/drivers/irqchip/irq-gic-v5-its.c
@@ -0,0 +1,1206 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2024-2025 ARM Limited, All Rights Reserved.
+ */
+
+#define pr_fmt(fmt)	"GICv5 ITS: " fmt
+
+#include <linux/bitmap.h>
+#include <linux/iommu.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/msi.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
+#include <linux/slab.h>
+
+#include <linux/irqchip.h>
+#include <linux/irqchip/arm-gic-v5.h>
+#include <linux/irqchip/irq-msi-lib.h>
+
+#include "irq-gic-its-msi-parent.h"
+
+#define ITS_FLAGS_NON_COHERENT		BIT(0)
+
+struct gicv5_its_chip_data {
+	struct	xarray			its_devices;
+	struct	mutex			dev_alloc_lock;
+	struct	fwnode_handle		*fwnode;
+	struct gicv5_its_devtab_cfg	devtab_cfgr;
+	void	__iomem			*its_base;
+	u32				flags;
+	unsigned int			msi_domain_flags;
+};
+
+struct gicv5_its_dev {
+	struct gicv5_its_chip_data	*its_node;
+	struct gicv5_its_itt_cfg	itt_cfg;
+	unsigned long			*event_map;
+	u32				device_id;
+	u32				num_events;
+	phys_addr_t			its_trans_phys_base;
+};
+
+static u32 its_readl_relaxed(struct gicv5_its_chip_data *its_node, const u32 reg_offset)
+{
+	return readl_relaxed(its_node->its_base + reg_offset);
+}
+
+static void its_writel_relaxed(struct gicv5_its_chip_data *its_node, const u32 val,
+			       const u32 reg_offset)
+{
+	writel_relaxed(val, its_node->its_base + reg_offset);
+}
+
+static void its_writeq_relaxed(struct gicv5_its_chip_data *its_node, const u64 val,
+			       const u32 reg_offset)
+{
+	writeq_relaxed(val, its_node->its_base + reg_offset);
+}
+
+static void gicv5_its_dcache_clean(struct gicv5_its_chip_data *its, void *start,
+				   size_t sz)
+{
+	void *end = start + sz;
+
+	if (its->flags & ITS_FLAGS_NON_COHERENT)
+		dcache_clean_inval_poc((unsigned long)start, (unsigned long)end);
+	else
+		dsb(ishst);
+}
+
+static void its_write_table_entry(struct gicv5_its_chip_data *its, __le64 *entry,
+				  u64 val)
+{
+	WRITE_ONCE(*entry, cpu_to_le64(val));
+	gicv5_its_dcache_clean(its, entry, sizeof(*entry));
+}
+
+#define devtab_cfgr_field(its, f)	\
+	FIELD_GET(GICV5_ITS_DT_CFGR_##f, (its)->devtab_cfgr.cfgr)
+
+static int gicv5_its_cache_sync(struct gicv5_its_chip_data *its)
+{
+	return gicv5_wait_for_op_atomic(its->its_base, GICV5_ITS_STATUSR,
+					GICV5_ITS_STATUSR_IDLE, NULL);
+}
+
+static void gicv5_its_syncr(struct gicv5_its_chip_data *its,
+			    struct gicv5_its_dev *its_dev)
+{
+	u64 syncr;
+
+	syncr = FIELD_PREP(GICV5_ITS_SYNCR_SYNC, 1) |
+		FIELD_PREP(GICV5_ITS_SYNCR_DEVICEID, its_dev->device_id);
+
+	its_writeq_relaxed(its, syncr, GICV5_ITS_SYNCR);
+
+	gicv5_wait_for_op(its->its_base, GICV5_ITS_SYNC_STATUSR, GICV5_ITS_SYNC_STATUSR_IDLE);
+}
+
+/* Number of bits required for each L2 {device/interrupt translation} table size */
+#define ITS_L2SZ_64K_L2_BITS	13
+#define ITS_L2SZ_16K_L2_BITS	11
+#define ITS_L2SZ_4K_L2_BITS	9
+
+static unsigned int gicv5_its_l2sz_to_l2_bits(unsigned int sz)
+{
+	switch (sz) {
+	case GICV5_ITS_DT_ITT_CFGR_L2SZ_64k:
+		return ITS_L2SZ_64K_L2_BITS;
+	case GICV5_ITS_DT_ITT_CFGR_L2SZ_16k:
+		return ITS_L2SZ_16K_L2_BITS;
+	case GICV5_ITS_DT_ITT_CFGR_L2SZ_4k:
+	default:
+		return ITS_L2SZ_4K_L2_BITS;
+	}
+}
+
+static int gicv5_its_itt_cache_inv(struct gicv5_its_chip_data *its, u32 device_id,
+				   u16 event_id)
+{
+	u32 eventr, eidr;
+	u64 didr;
+
+	didr = FIELD_PREP(GICV5_ITS_DIDR_DEVICEID, device_id);
+	eidr = FIELD_PREP(GICV5_ITS_EIDR_EVENTID, event_id);
+	eventr = FIELD_PREP(GICV5_ITS_INV_EVENTR_I, 0x1);
+
+	its_writeq_relaxed(its, didr, GICV5_ITS_DIDR);
+	its_writel_relaxed(its, eidr, GICV5_ITS_EIDR);
+	its_writel_relaxed(its, eventr, GICV5_ITS_INV_EVENTR);
+
+	return gicv5_its_cache_sync(its);
+}
+
+static void gicv5_its_free_itt_linear(struct gicv5_its_dev *its_dev)
+{
+	kfree(its_dev->itt_cfg.linear.itt);
+}
+
+static void gicv5_its_free_itt_two_level(struct gicv5_its_dev *its_dev)
+{
+	unsigned int i, num_ents = its_dev->itt_cfg.l2.num_l1_ents;
+
+	for (i = 0; i < num_ents; i++)
+		kfree(its_dev->itt_cfg.l2.l2ptrs[i]);
+
+	kfree(its_dev->itt_cfg.l2.l2ptrs);
+	kfree(its_dev->itt_cfg.l2.l1itt);
+}
+
+static void gicv5_its_free_itt(struct gicv5_its_dev *its_dev)
+{
+	if (!its_dev->itt_cfg.l2itt)
+		gicv5_its_free_itt_linear(its_dev);
+	else
+		gicv5_its_free_itt_two_level(its_dev);
+}
+
+static int gicv5_its_create_itt_linear(struct gicv5_its_chip_data *its,
+				       struct gicv5_its_dev *its_dev,
+				       unsigned int event_id_bits)
+{
+	unsigned int num_ents = BIT(event_id_bits);
+	__le64 *itt;
+
+	itt = kcalloc(num_ents, sizeof(*itt), GFP_KERNEL);
+	if (!itt)
+		return -ENOMEM;
+
+	its_dev->itt_cfg.linear.itt = itt;
+	its_dev->itt_cfg.linear.num_ents = num_ents;
+	its_dev->itt_cfg.l2itt = false;
+	its_dev->itt_cfg.event_id_bits = event_id_bits;
+
+	gicv5_its_dcache_clean(its, itt, num_ents * sizeof(*itt));
+
+	return 0;
+}
+
+/*
+ * Allocate a two-level ITT. All ITT entries are allocated in one go, unlike
+ * with the device table. Span may be used to limit the second level table
+ * size, where possible.
+ */
+static int gicv5_its_create_itt_two_level(struct gicv5_its_chip_data *its,
+					  struct gicv5_its_dev *its_dev,
+					  unsigned int event_id_bits,
+					  unsigned int itt_l2sz,
+					  unsigned int num_events)
+{
+	unsigned int l1_bits, l2_bits, span, events_per_l2_table;
+	unsigned int i, complete_tables, final_span, num_ents;
+	__le64 *itt_l1, *itt_l2, **l2ptrs;
+	int ret;
+	u64 val;
+
+	ret = gicv5_its_l2sz_to_l2_bits(itt_l2sz);
+	if (ret >= event_id_bits) {
+		pr_debug("Incorrect l2sz (0x%x) for %u EventID bits. Cannot allocate ITT\n",
+			 itt_l2sz, event_id_bits);
+		return -EINVAL;
+	}
+
+	l2_bits = ret;
+
+	l1_bits = event_id_bits - l2_bits;
+
+	num_ents = BIT(l1_bits);
+
+	itt_l1 = kcalloc(num_ents, sizeof(*itt_l1), GFP_KERNEL);
+	if (!itt_l1)
+		return -ENOMEM;
+
+	l2ptrs = kcalloc(num_ents, sizeof(*l2ptrs), GFP_KERNEL);
+	if (!l2ptrs) {
+		kfree(itt_l1);
+		return -ENOMEM;
+	}
+
+	its_dev->itt_cfg.l2.l2ptrs = l2ptrs;
+
+	its_dev->itt_cfg.l2.l2sz = itt_l2sz;
+	its_dev->itt_cfg.l2.l1itt = itt_l1;
+	its_dev->itt_cfg.l2.num_l1_ents = num_ents;
+	its_dev->itt_cfg.l2itt = true;
+	its_dev->itt_cfg.event_id_bits = event_id_bits;
+
+	/*
+	 * Need to determine how many entries there are per L2 - this is based
+	 * on the number of bits in the table.
+	 */
+	events_per_l2_table = BIT(l2_bits);
+	complete_tables = num_events / events_per_l2_table;
+	final_span = order_base_2(num_events % events_per_l2_table);
+
+	for (i = 0; i < num_ents; i++) {
+		size_t l2sz;
+
+		span = i == complete_tables ? final_span : l2_bits;
+
+		itt_l2 = kcalloc(BIT(span), sizeof(*itt_l2), GFP_KERNEL);
+		if (!itt_l2) {
+			ret = -ENOMEM;
+			goto out_free;
+		}
+
+		its_dev->itt_cfg.l2.l2ptrs[i] = itt_l2;
+
+		l2sz = BIT(span) * sizeof(*itt_l2);
+
+		gicv5_its_dcache_clean(its, itt_l2, l2sz);
+
+		val = (virt_to_phys(itt_l2) & GICV5_ITTL1E_L2_ADDR_MASK) |
+		       FIELD_PREP(GICV5_ITTL1E_SPAN, span)		 |
+		       FIELD_PREP(GICV5_ITTL1E_VALID, 0x1);
+
+		WRITE_ONCE(itt_l1[i], cpu_to_le64(val));
+	}
+
+	gicv5_its_dcache_clean(its, itt_l1, num_ents * sizeof(*itt_l1));
+
+	return 0;
+
+out_free:
+	/* i is unsigned: "i >= 0" is always true, so count down with i--. */
+	while (i--)
+		kfree(its_dev->itt_cfg.l2.l2ptrs[i]);
+	kfree(its_dev->itt_cfg.l2.l2ptrs);
+	kfree(itt_l1);
+	return ret;
+}
+
+/*
+ * Function to check whether the device table or ITT table support
+ * a two-level table and if so depending on the number of id_bits
+ * requested, determine whether a two-level table is required.
+ *
+ * Return the 2-level size value if a two level table is deemed
+ * necessary.
+ */
+static bool gicv5_its_l2sz_two_level(bool devtab, u32 its_idr1, u8 id_bits, u8 *sz)
+{
+	unsigned int l2_bits, l2_sz;
+
+	if (devtab && !FIELD_GET(GICV5_ITS_IDR1_DT_LEVELS, its_idr1))
+		return false;
+
+	if (!devtab && !FIELD_GET(GICV5_ITS_IDR1_ITT_LEVELS, its_idr1))
+		return false;
+
+	/*
+	 * Pick an L2 size that matches the pagesize; if a match
+	 * is not found, go for the smallest supported l2 size granule.
+	 *
+	 * This ensures that we will always be able to allocate
+	 * contiguous memory at L2.
+	 */
+	switch (PAGE_SIZE) {
+	case SZ_64K:
+		if (GICV5_ITS_IDR1_L2SZ_SUPPORT_64KB(its_idr1)) {
+			l2_sz = GICV5_ITS_DT_ITT_CFGR_L2SZ_64k;
+			break;
+		}
+		fallthrough;
+	case SZ_4K:
+		if (GICV5_ITS_IDR1_L2SZ_SUPPORT_4KB(its_idr1)) {
+			l2_sz = GICV5_ITS_DT_ITT_CFGR_L2SZ_4k;
+			break;
+		}
+		fallthrough;
+	case SZ_16K:
+		if (GICV5_ITS_IDR1_L2SZ_SUPPORT_16KB(its_idr1)) {
+			l2_sz = GICV5_ITS_DT_ITT_CFGR_L2SZ_16k;
+			break;
+		}
+		if (GICV5_ITS_IDR1_L2SZ_SUPPORT_4KB(its_idr1)) {
+			l2_sz = GICV5_ITS_DT_ITT_CFGR_L2SZ_4k;
+			break;
+		}
+		if (GICV5_ITS_IDR1_L2SZ_SUPPORT_64KB(its_idr1)) {
+			l2_sz = GICV5_ITS_DT_ITT_CFGR_L2SZ_64k;
+			break;
+		}
+
+		l2_sz = GICV5_ITS_DT_ITT_CFGR_L2SZ_4k;
+		break;
+	}
+
+	l2_bits = gicv5_its_l2sz_to_l2_bits(l2_sz);
+
+	if (l2_bits > id_bits)
+		return false;
+
+	*sz = l2_sz;
+
+	return true;
+}
+
+static __le64 *gicv5_its_device_get_itte_ref(struct gicv5_its_dev *its_dev,
+					     u16 event_id)
+{
+	unsigned int l1_idx, l2_idx, l2_bits;
+	__le64 *l2_itt;
+
+	if (!its_dev->itt_cfg.l2itt) {
+		__le64 *itt = its_dev->itt_cfg.linear.itt;
+
+		return &itt[event_id];
+	}
+
+	l2_bits = gicv5_its_l2sz_to_l2_bits(its_dev->itt_cfg.l2.l2sz);
+	l1_idx = event_id >> l2_bits;
+	l2_idx = event_id & GENMASK(l2_bits - 1, 0);
+	l2_itt = its_dev->itt_cfg.l2.l2ptrs[l1_idx];
+
+	return &l2_itt[l2_idx];
+}
+
+static int gicv5_its_device_cache_inv(struct gicv5_its_chip_data *its,
+				      struct gicv5_its_dev *its_dev)
+{
+	u32 devicer;
+	u64 didr;
+
+	didr = FIELD_PREP(GICV5_ITS_DIDR_DEVICEID, its_dev->device_id);
+	devicer = FIELD_PREP(GICV5_ITS_INV_DEVICER_I, 0x1)	|
+		  FIELD_PREP(GICV5_ITS_INV_DEVICER_EVENTID_BITS,
+			     its_dev->itt_cfg.event_id_bits)	|
+		  FIELD_PREP(GICV5_ITS_INV_DEVICER_L1, 0x0);
+	its_writeq_relaxed(its, didr, GICV5_ITS_DIDR);
+	its_writel_relaxed(its, devicer, GICV5_ITS_INV_DEVICER);
+
+	return gicv5_its_cache_sync(its);
+}
+
+/*
+ * Allocate a level 2 device table entry, update L1 parent to reference it.
+ * Only used for 2-level device tables, and it is called on demand.
+ */
+static int gicv5_its_alloc_l2_devtab(struct gicv5_its_chip_data *its,
+				     unsigned int l1_index)
+{
+	__le64 *l2devtab, *l1devtab = its->devtab_cfgr.l2.l1devtab;
+	u8 span, l2sz, l2_bits;
+	u64 l1dte;
+
+	if (FIELD_GET(GICV5_DTL1E_VALID, le64_to_cpu(l1devtab[l1_index])))
+		return 0;
+
+	span = FIELD_GET(GICV5_DTL1E_SPAN, le64_to_cpu(l1devtab[l1_index]));
+	l2sz = devtab_cfgr_field(its, L2SZ);
+
+	l2_bits = gicv5_its_l2sz_to_l2_bits(l2sz);
+
+	/*
+	 * Span allows us to create a smaller L2 device table.
+	 * If it is too large, use the number of allowed L2 bits.
+	 */
+	if (span > l2_bits)
+		span = l2_bits;
+
+	l2devtab = kcalloc(BIT(span), sizeof(*l2devtab), GFP_KERNEL);
+	if (!l2devtab)
+		return -ENOMEM;
+
+	its->devtab_cfgr.l2.l2ptrs[l1_index] = l2devtab;
+
+	l1dte = FIELD_PREP(GICV5_DTL1E_SPAN, span)			|
+		(virt_to_phys(l2devtab) & GICV5_DTL1E_L2_ADDR_MASK)	|
+		FIELD_PREP(GICV5_DTL1E_VALID, 0x1);
+	its_write_table_entry(its, &l1devtab[l1_index], l1dte);
+
+	return 0;
+}
+
+static __le64 *gicv5_its_devtab_get_dte_ref(struct gicv5_its_chip_data *its,
+					    u32 device_id, bool alloc)
+{
+	u8 str = devtab_cfgr_field(its, STRUCTURE);
+	unsigned int l2sz, l2_bits, l1_idx, l2_idx;
+	__le64 *l2devtab;
+	int ret;
+
+	if (str == GICV5_ITS_DT_ITT_CFGR_STRUCTURE_LINEAR) {
+		l2devtab = its->devtab_cfgr.linear.devtab;
+		return &l2devtab[device_id];
+	}
+
+	l2sz = devtab_cfgr_field(its, L2SZ);
+	l2_bits = gicv5_its_l2sz_to_l2_bits(l2sz);
+	l1_idx = device_id >> l2_bits;
+	l2_idx = device_id & GENMASK(l2_bits - 1, 0);
+
+	if (alloc) {
+		/*
+		 * Allocate a new L2 device table here before
+		 * continuing. We make the assumption that the span in
+		 * the L1 table has been set correctly, and blindly use
+		 * that value.
+		 */
+		ret = gicv5_its_alloc_l2_devtab(its, l1_idx);
+		if (ret)
+			return NULL;
+	}
+
+	l2devtab = its->devtab_cfgr.l2.l2ptrs[l1_idx];
+	return &l2devtab[l2_idx];
+}
+
+/*
+ * Register a new device in the device table. Allocate an ITT and
+ * program the L2DTE entry according to the ITT structure that
+ * was chosen.
+ */
+static int gicv5_its_device_register(struct gicv5_its_chip_data *its,
+				     struct gicv5_its_dev *its_dev)
+{
+	u8 event_id_bits, device_id_bits, itt_struct, itt_l2sz;
+	phys_addr_t itt_phys_base;
+	bool two_level_itt;
+	u32 idr1, idr2;
+	__le64 *dte;
+	u64 val;
+	int ret;
+
+	device_id_bits = devtab_cfgr_field(its, DEVICEID_BITS);
+
+	if (its_dev->device_id >= BIT(device_id_bits)) {
+		pr_err("Supplied DeviceID (%u) outside of Device Table range (%u)!\n",
+		       its_dev->device_id, (u32)GENMASK(device_id_bits - 1, 0));
+		return -EINVAL;
+	}
+
+	dte = gicv5_its_devtab_get_dte_ref(its, its_dev->device_id, true);
+	if (!dte)
+		return -ENOMEM;
+
+	if (FIELD_GET(GICV5_DTL2E_VALID, le64_to_cpu(*dte)))
+		return -EBUSY;
+
+	/*
+	 * Determine how many bits we need, validate those against the max.
+	 * Based on these, determine if we should go for a 1- or 2-level ITT.
+	 */
+	event_id_bits = order_base_2(its_dev->num_events);
+
+	idr2 = its_readl_relaxed(its, GICV5_ITS_IDR2);
+
+	if (event_id_bits > FIELD_GET(GICV5_ITS_IDR2_EVENTID_BITS, idr2)) {
+		pr_err("Required EventID bits (%u) larger than supported bits (%u)!\n",
+		       event_id_bits,
+		       (u8)FIELD_GET(GICV5_ITS_IDR2_EVENTID_BITS, idr2));
+		return -EINVAL;
+	}
+
+	idr1 = its_readl_relaxed(its, GICV5_ITS_IDR1);
+
+	/*
+	 * L2 ITT size is programmed into the L2DTE regardless of
+	 * whether a two-level or linear ITT is built, init it.
+	 */
+	itt_l2sz = 0;
+
+	two_level_itt = gicv5_its_l2sz_two_level(false, idr1, event_id_bits,
+						  &itt_l2sz);
+	if (two_level_itt)
+		ret = gicv5_its_create_itt_two_level(its, its_dev, event_id_bits,
+						     itt_l2sz,
+						     its_dev->num_events);
+	else
+		ret = gicv5_its_create_itt_linear(its, its_dev, event_id_bits);
+	if (ret)
+		return ret;
+
+	itt_phys_base = two_level_itt ? virt_to_phys(its_dev->itt_cfg.l2.l1itt) :
+					virt_to_phys(its_dev->itt_cfg.linear.itt);
+
+	itt_struct = two_level_itt ? GICV5_ITS_DT_ITT_CFGR_STRUCTURE_TWO_LEVEL :
+				     GICV5_ITS_DT_ITT_CFGR_STRUCTURE_LINEAR;
+
+	val = FIELD_PREP(GICV5_DTL2E_EVENT_ID_BITS, event_id_bits)	|
+	      FIELD_PREP(GICV5_DTL2E_ITT_STRUCTURE, itt_struct)		|
+	      (itt_phys_base & GICV5_DTL2E_ITT_ADDR_MASK)		|
+	      FIELD_PREP(GICV5_DTL2E_ITT_L2SZ, itt_l2sz)		|
+	      FIELD_PREP(GICV5_DTL2E_VALID, 0x1);
+
+	its_write_table_entry(its, dte, val);
+
+	ret = gicv5_its_device_cache_inv(its, its_dev);
+	if (ret) {
+		its_write_table_entry(its, dte, 0);
+		gicv5_its_free_itt(its_dev);
+		return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Unregister a device in the device table. Lookup the device by ID, free the
+ * corresponding ITT, mark the device as invalid in the device table.
+ */
+static int gicv5_its_device_unregister(struct gicv5_its_chip_data *its,
+				       struct gicv5_its_dev *its_dev)
+{
+	__le64 *dte;
+
+	dte = gicv5_its_devtab_get_dte_ref(its, its_dev->device_id, false);
+
+	if (!FIELD_GET(GICV5_DTL2E_VALID, le64_to_cpu(*dte))) {
+		pr_debug("Device table entry for DeviceID 0x%x is not valid. Nothing to clean up!\n",
+			 its_dev->device_id);
+		return -EINVAL;
+	}
+
+	/* Zero everything - make it clear that this is an invalid entry */
+	its_write_table_entry(its, dte, 0);
+
+	gicv5_its_free_itt(its_dev);
+
+	return gicv5_its_device_cache_inv(its, its_dev);
+}
+
+/*
+ * Allocate a 1-level device table. All entries are allocated, but marked
+ * invalid.
+ */
+static int gicv5_its_alloc_devtab_linear(struct gicv5_its_chip_data *its,
+					u8 device_id_bits)
+{
+	__le64 *devtab;
+	size_t sz;
+	u64 baser;
+	u32 cfgr;
+
+	/*
+	 * We expect a GICv5 implementation requiring a large number of
+	 * deviceID bits to support a 2-level device table. If that's not
+	 * the case, cap the number of deviceIDs supported according to the
+	 * kmalloc limits so that the system can chug along with a linear
+	 * device table.
+	 */
+	if (BIT_ULL(device_id_bits) * sizeof(*devtab) > KMALLOC_MAX_SIZE) {
+		u8 device_id_cap = ilog2(KMALLOC_MAX_SIZE/sizeof(*devtab));
+
+		pr_warn("Limiting device ID bits from %u to %u\n",
+			device_id_bits, device_id_cap);
+		device_id_bits = device_id_cap;
+	}
+	sz = BIT_ULL(device_id_bits) * sizeof(*devtab); /* size after capping */
+
+	devtab = kcalloc(BIT(device_id_bits), sizeof(*devtab), GFP_KERNEL);
+	if (!devtab)
+		return -ENOMEM;
+
+	gicv5_its_dcache_clean(its, devtab, sz);
+
+	cfgr = FIELD_PREP(GICV5_ITS_DT_CFGR_STRUCTURE,
+			  GICV5_ITS_DT_ITT_CFGR_STRUCTURE_LINEAR)	|
+	       FIELD_PREP(GICV5_ITS_DT_CFGR_L2SZ, 0)			|
+	       FIELD_PREP(GICV5_ITS_DT_CFGR_DEVICEID_BITS, device_id_bits);
+	its_writel_relaxed(its, cfgr, GICV5_ITS_DT_CFGR);
+
+	baser = virt_to_phys(devtab) & GICV5_ITS_DT_BASER_ADDR_MASK;
+	its_writeq_relaxed(its, baser, GICV5_ITS_DT_BASER);
+
+	its->devtab_cfgr.cfgr = cfgr;
+	its->devtab_cfgr.linear.devtab = devtab;
+
+	return 0;
+}
+
+/*
+ * Allocate a 2-level device table. L2 entries are not allocated,
+ * they are allocated on-demand.
+ */
+static int gicv5_its_alloc_devtab_two_level(struct gicv5_its_chip_data *its,
+					    u8 device_id_bits,
+					    u8 devtab_l2sz)
+{
+	unsigned int l1_bits, l2_bits, i;
+	__le64 *l1devtab, **l2ptrs;
+	size_t l1_sz;
+	u64 baser;
+	u32 cfgr;
+
+	l2_bits = gicv5_its_l2sz_to_l2_bits(devtab_l2sz);
+
+	l1_bits = device_id_bits - l2_bits;
+	l1_sz = BIT(l1_bits) * sizeof(*l1devtab);
+	/*
+	 * With 2-level device table support it is highly unlikely
+	 * that we are not able to allocate the required amount of
+	 * device table memory to cover deviceID space; cap the
+	 * deviceID space if we encounter such set-up.
+	 * If this ever becomes a problem we could revisit the policy
+	 * behind level 2 size selection to reduce level-1 deviceID bits.
+	 */
+	if (l1_sz > KMALLOC_MAX_SIZE) {
+		l1_bits = ilog2(KMALLOC_MAX_SIZE/sizeof(*l1devtab));
+
+		pr_warn("Limiting device ID bits from %u to %u\n",
+			device_id_bits, l1_bits + l2_bits);
+		device_id_bits = l1_bits + l2_bits;
+		l1_sz = KMALLOC_MAX_SIZE;
+	}
+
+	l1devtab = kcalloc(BIT(l1_bits), sizeof(*l1devtab), GFP_KERNEL);
+	if (!l1devtab)
+		return -ENOMEM;
+
+	l2ptrs = kcalloc(BIT(l1_bits), sizeof(*l2ptrs), GFP_KERNEL);
+	if (!l2ptrs) {
+		kfree(l1devtab);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < BIT(l1_bits); i++)
+		l1devtab[i] = cpu_to_le64(FIELD_PREP(GICV5_DTL1E_SPAN, l2_bits));
+
+	gicv5_its_dcache_clean(its, l1devtab, l1_sz);
+
+	cfgr = FIELD_PREP(GICV5_ITS_DT_CFGR_STRUCTURE,
+			  GICV5_ITS_DT_ITT_CFGR_STRUCTURE_TWO_LEVEL)	|
+	       FIELD_PREP(GICV5_ITS_DT_CFGR_L2SZ, devtab_l2sz)		|
+	       FIELD_PREP(GICV5_ITS_DT_CFGR_DEVICEID_BITS, device_id_bits);
+	its_writel_relaxed(its, cfgr, GICV5_ITS_DT_CFGR);
+
+	baser = virt_to_phys(l1devtab) & GICV5_ITS_DT_BASER_ADDR_MASK;
+	its_writeq_relaxed(its, baser, GICV5_ITS_DT_BASER);
+
+	its->devtab_cfgr.cfgr = cfgr;
+	its->devtab_cfgr.l2.l1devtab = l1devtab;
+	its->devtab_cfgr.l2.l2ptrs = l2ptrs;
+
+	return 0;
+}
+
+/*
+ * Initialise the device table as either 1- or 2-level depending on what is
+ * supported by the hardware.
+ */
+static int gicv5_its_init_devtab(struct gicv5_its_chip_data *its)
+{
+	u8 device_id_bits, devtab_l2sz;
+	bool two_level_devtab;
+	u32 idr1;
+
+	idr1 = its_readl_relaxed(its, GICV5_ITS_IDR1);
+
+	device_id_bits = FIELD_GET(GICV5_ITS_IDR1_DEVICEID_BITS, idr1);
+	two_level_devtab = gicv5_its_l2sz_two_level(true, idr1, device_id_bits,
+						    &devtab_l2sz);
+	if (two_level_devtab)
+		return gicv5_its_alloc_devtab_two_level(its, device_id_bits,
+						       devtab_l2sz);
+	else
+		return gicv5_its_alloc_devtab_linear(its, device_id_bits);
+}
+
+static void gicv5_its_deinit_devtab(struct gicv5_its_chip_data *its)
+{
+	u8 str = devtab_cfgr_field(its, STRUCTURE);
+
+	if (str == GICV5_ITS_DT_ITT_CFGR_STRUCTURE_LINEAR) {
+		kfree(its->devtab_cfgr.linear.devtab);
+	} else {
+		kfree(its->devtab_cfgr.l2.l1devtab);
+		kfree(its->devtab_cfgr.l2.l2ptrs);
+	}
+}
+
+static void gicv5_its_compose_msi_msg(struct irq_data *d, struct msi_msg *msg)
+{
+	struct gicv5_its_dev *its_dev = irq_data_get_irq_chip_data(d);
+	u64 addr = its_dev->its_trans_phys_base;
+
+	msg->data = FIELD_GET(GICV5_ITS_HWIRQ_EVENT_ID, d->hwirq);
+	msi_msg_set_addr(irq_data_get_msi_desc(d), msg, addr);
+}
+
+static const struct irq_chip gicv5_its_irq_chip = {
+	.name			= "GICv5-ITS-MSI",
+	.irq_mask		= irq_chip_mask_parent,
+	.irq_unmask		= irq_chip_unmask_parent,
+	.irq_eoi		= irq_chip_eoi_parent,
+	.irq_set_affinity	= irq_chip_set_affinity_parent,
+	.irq_get_irqchip_state	= irq_chip_get_parent_state,
+	.irq_set_irqchip_state	= irq_chip_set_parent_state,
+	.irq_compose_msi_msg	= gicv5_its_compose_msi_msg,
+};
+
+static struct gicv5_its_dev *gicv5_its_find_device(struct gicv5_its_chip_data *its,
+						   u32 device_id)
+{
+	struct gicv5_its_dev *dev = xa_load(&its->its_devices, device_id);
+
+	return dev ? dev : ERR_PTR(-ENODEV);
+}
+
+static struct gicv5_its_dev *gicv5_its_alloc_device(struct gicv5_its_chip_data *its, int nvec,
+						    u32 dev_id)
+{
+	struct gicv5_its_dev *its_dev;
+	void *entry;
+	int ret;
+
+	its_dev = gicv5_its_find_device(its, dev_id);
+	if (!IS_ERR(its_dev)) {
+		pr_err("A device with this DeviceID (0x%x) has already been registered.\n",
+		       dev_id);
+
+		return ERR_PTR(-EBUSY);
+	}
+
+	its_dev = kzalloc(sizeof(*its_dev), GFP_KERNEL);
+	if (!its_dev)
+		return ERR_PTR(-ENOMEM);
+
+	its_dev->device_id = dev_id;
+	its_dev->num_events = nvec;
+
+	ret = gicv5_its_device_register(its, its_dev);
+	if (ret) {
+		pr_err("Failed to register the device\n");
+		goto out_dev_free;
+	}
+
+	gicv5_its_device_cache_inv(its, its_dev);
+
+	its_dev->its_node = its;
+
+	its_dev->event_map = (unsigned long *)bitmap_zalloc(its_dev->num_events, GFP_KERNEL);
+	if (!its_dev->event_map) {
+		ret = -ENOMEM;
+		goto out_unregister;
+	}
+
+	entry = xa_store(&its->its_devices, dev_id, its_dev, GFP_KERNEL);
+	if (xa_is_err(entry)) {
+		ret = xa_err(entry);
+		goto out_bitmap_free;
+	}
+
+	return its_dev;
+
+out_bitmap_free:
+	bitmap_free(its_dev->event_map);
+out_unregister:
+	gicv5_its_device_unregister(its, its_dev);
+out_dev_free:
+	kfree(its_dev);
+	return ERR_PTR(ret);
+}
+
+static int gicv5_its_msi_prepare(struct irq_domain *domain, struct device *dev,
+				 int nvec, msi_alloc_info_t *info)
+{
+	u32 dev_id = info->scratchpad[0].ul;
+	struct msi_domain_info *msi_info;
+	struct gicv5_its_chip_data *its;
+	struct gicv5_its_dev *its_dev;
+
+	msi_info = msi_get_domain_info(domain);
+	its = msi_info->data;
+
+	guard(mutex)(&its->dev_alloc_lock);
+
+	its_dev = gicv5_its_alloc_device(its, nvec, dev_id);
+	if (IS_ERR(its_dev))
+		return PTR_ERR(its_dev);
+
+	its_dev->its_trans_phys_base = info->scratchpad[1].ul;
+	info->scratchpad[0].ptr = its_dev;
+
+	return 0;
+}
+
+static void gicv5_its_msi_teardown(struct irq_domain *domain, msi_alloc_info_t *info)
+{
+	struct gicv5_its_dev *its_dev = info->scratchpad[0].ptr;
+	struct msi_domain_info *msi_info;
+	struct gicv5_its_chip_data *its;
+
+	msi_info = msi_get_domain_info(domain);
+	its = msi_info->data;
+
+	guard(mutex)(&its->dev_alloc_lock);
+
+	if (WARN_ON_ONCE(!bitmap_empty(its_dev->event_map, its_dev->num_events)))
+		return;
+
+	xa_erase(&its->its_devices, its_dev->device_id);
+	bitmap_free(its_dev->event_map);
+	gicv5_its_device_unregister(its, its_dev);
+	kfree(its_dev);
+}
+
+static struct msi_domain_ops gicv5_its_msi_domain_ops = {
+	.msi_prepare	= gicv5_its_msi_prepare,
+	.msi_teardown	= gicv5_its_msi_teardown,
+};
+
+static int gicv5_its_map_event(struct gicv5_its_dev *its_dev, u16 event_id, u32 lpi)
+{
+	struct gicv5_its_chip_data *its = its_dev->its_node;
+	u64 itt_entry;
+	__le64 *itte;
+
+	itte = gicv5_its_device_get_itte_ref(its_dev, event_id);
+	/* ITT entries are stored as __le64: convert before testing a field. */
+	if (FIELD_GET(GICV5_ITTL2E_VALID, le64_to_cpu(*itte)))
+		return -EEXIST;
+
+	itt_entry = FIELD_PREP(GICV5_ITTL2E_LPI_ID, lpi) |
+		    FIELD_PREP(GICV5_ITTL2E_VALID, 0x1);
+
+	its_write_table_entry(its, itte, itt_entry);
+
+	gicv5_its_itt_cache_inv(its, its_dev->device_id, event_id);
+
+	return 0;
+}
+
+static void gicv5_its_unmap_event(struct gicv5_its_dev *its_dev, u16 event_id)
+{
+	struct gicv5_its_chip_data *its = its_dev->its_node;
+	u64 itte_val;
+	__le64 *itte;
+
+	itte = gicv5_its_device_get_itte_ref(its_dev, event_id);
+
+	itte_val = le64_to_cpu(*itte);
+	itte_val &= ~GICV5_ITTL2E_VALID;
+
+	its_write_table_entry(its, itte, itte_val);
+
+	gicv5_its_itt_cache_inv(its, its_dev->device_id, event_id);
+}
+
+static int gicv5_its_alloc_eventid(struct gicv5_its_dev *its_dev,
+				   unsigned int nr_irqs, u32 *eventid)
+{
+	int ret;
+
+	ret = bitmap_find_free_region(its_dev->event_map,
+				      its_dev->num_events,
+				      get_count_order(nr_irqs));
+
+	if (ret < 0)
+		return ret;
+
+	*eventid = ret;
+
+	return 0;
+}
+
+static void gicv5_its_free_eventid(struct gicv5_its_dev *its_dev, u32 event_id_base,
+				   unsigned int nr_irqs)
+{
+	bitmap_release_region(its_dev->event_map, event_id_base,
+			      get_count_order(nr_irqs));
+}
+
+static int gicv5_its_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
+				      unsigned int nr_irqs, void *arg)
+{
+	msi_alloc_info_t *info = arg;
+	struct gicv5_its_dev *its_dev = info->scratchpad[0].ptr;
+	u32 event_id_base, lpi;
+	irq_hw_number_t hwirq;
+	struct irq_data *irqd;
+	int ret, i;
+
+	ret = gicv5_its_alloc_eventid(its_dev, nr_irqs, &event_id_base);
+	if (ret)
+		return ret;
+
+	ret = iommu_dma_prepare_msi(info->desc, its_dev->its_trans_phys_base);
+	if (ret)
+		goto out_eventid;
+
+	for (i = 0; i < nr_irqs; i++) {
+		/* lpi is unsigned: test the allocator's signed return value. */
+		ret = gicv5_alloc_lpi();
+		if (ret < 0) {
+			pr_debug("Failed to find free LPI!\n");
+			goto out_free_irqs;
+		}
+		lpi = ret;
+
+		ret = irq_domain_alloc_irqs_parent(domain, virq + i, 1, &lpi);
+		if (ret) {
+			gicv5_free_lpi(lpi);
+			goto out_free_irqs;
+		}
+
+		/* Store both IDs: hwirq = event_id << 32 | device_id */
+		hwirq = FIELD_PREP(GICV5_ITS_HWIRQ_DEVICE_ID, its_dev->device_id) |
+			FIELD_PREP(GICV5_ITS_HWIRQ_EVENT_ID, (u64)event_id_base + i);
+		irq_domain_set_info(domain, virq + i, hwirq, &gicv5_its_irq_chip,
+				    its_dev, handle_fasteoi_irq, NULL, NULL);
+		irqd = irq_get_irq_data(virq + i);
+		irqd_set_single_target(irqd);
+		irqd_set_affinity_on_activate(irqd);
+		irqd_set_resend_when_in_progress(irqd);
+	}
+
+	return 0;
+
+out_free_irqs:
+	/* Undo the vectors that were fully set up before the failure. */
+	while (--i >= 0) {
+		irqd = irq_domain_get_irq_data(domain, virq + i);
+		gicv5_free_lpi(irqd->parent_data->hwirq);
+		irq_domain_reset_irq_data(irqd);
+		irq_domain_free_irqs_parent(domain, virq + i, 1);
+	}
+out_eventid:
+	gicv5_its_free_eventid(its_dev, event_id_base, nr_irqs);
+	return ret;
+}
+
+static void gicv5_its_irq_domain_free(struct irq_domain *domain, unsigned int virq,
+				      unsigned int nr_irqs)
+{
+	struct irq_data *d = irq_domain_get_irq_data(domain, virq);
+	struct gicv5_its_chip_data *its;
+	struct gicv5_its_dev *its_dev;
+	u16 event_id_base;
+	unsigned int i;
+
+	its_dev = irq_data_get_irq_chip_data(d);
+	its = its_dev->its_node;
+
+	event_id_base = FIELD_GET(GICV5_ITS_HWIRQ_EVENT_ID, d->hwirq);
+
+	bitmap_release_region(its_dev->event_map, event_id_base,
+			      get_count_order(nr_irqs));
+
+	/*  Hierarchically free irq data */
+	for (i = 0; i < nr_irqs; i++) {
+		d = irq_domain_get_irq_data(domain, virq + i);
+
+		gicv5_free_lpi(d->parent_data->hwirq);
+		irq_domain_reset_irq_data(d);
+		irq_domain_free_irqs_parent(domain, virq + i, 1);
+	}
+
+	gicv5_its_syncr(its, its_dev);
+	gicv5_irs_syncr();
+}
+
+static int gicv5_its_irq_domain_activate(struct irq_domain *domain, struct irq_data *d,
+					 bool reserve)
+{
+	struct gicv5_its_dev *its_dev = irq_data_get_irq_chip_data(d);
+	u16 event_id;
+	u32 lpi;
+
+	event_id = FIELD_GET(GICV5_ITS_HWIRQ_EVENT_ID, d->hwirq);
+	lpi = d->parent_data->hwirq;
+
+	return gicv5_its_map_event(its_dev, event_id, lpi);
+}
+
+static void gicv5_its_irq_domain_deactivate(struct irq_domain *domain,
+					    struct irq_data *d)
+{
+	struct gicv5_its_dev *its_dev = irq_data_get_irq_chip_data(d);
+	u16 event_id;
+
+	event_id = FIELD_GET(GICV5_ITS_HWIRQ_EVENT_ID, d->hwirq);
+
+	gicv5_its_unmap_event(its_dev, event_id);
+}
+
+static const struct irq_domain_ops gicv5_its_irq_domain_ops = {
+	.alloc		= gicv5_its_irq_domain_alloc,
+	.free		= gicv5_its_irq_domain_free,
+	.activate	= gicv5_its_irq_domain_activate,
+	.deactivate	= gicv5_its_irq_domain_deactivate,
+	.select		= msi_lib_irq_domain_select,
+};
+
+static int gicv5_its_write_cr0(struct gicv5_its_chip_data *its, bool enable)
+{
+	u32 cr0 = FIELD_PREP(GICV5_ITS_CR0_ITSEN, enable);
+
+	its_writel_relaxed(its, cr0, GICV5_ITS_CR0);
+	return gicv5_wait_for_op_atomic(its->its_base, GICV5_ITS_CR0,
+					GICV5_ITS_CR0_IDLE, NULL);
+}
+
+static int gicv5_its_enable(struct gicv5_its_chip_data *its)
+{
+	return gicv5_its_write_cr0(its, true);
+}
+
+static int gicv5_its_disable(struct gicv5_its_chip_data *its)
+{
+	return gicv5_its_write_cr0(its, false);
+}
+
+static void gicv5_its_print_info(struct gicv5_its_chip_data *its_node)
+{
+	bool devtab_linear;
+	u8 device_id_bits;
+	u8 str;
+
+	device_id_bits = devtab_cfgr_field(its_node, DEVICEID_BITS);
+
+	str = devtab_cfgr_field(its_node, STRUCTURE);
+	devtab_linear = (str == GICV5_ITS_DT_ITT_CFGR_STRUCTURE_LINEAR);
+
+	pr_info("ITS %s enabled using %s device table device_id_bits %u\n",
+		fwnode_get_name(its_node->fwnode),
+		devtab_linear ? "linear" : "2-level",
+		device_id_bits);
+}
+
+static int gicv5_its_init_domain(struct gicv5_its_chip_data *its, struct irq_domain *parent)
+{
+	struct irq_domain_info dom_info = {
+		.fwnode		= its->fwnode,
+		.ops		= &gicv5_its_irq_domain_ops,
+		.domain_flags	= its->msi_domain_flags,
+		.parent		= parent,
+	};
+	struct msi_domain_info *info;
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return -ENOMEM;
+
+	info->ops = &gicv5_its_msi_domain_ops;
+	info->data = its;
+	dom_info.host_data = info;
+
+	if (!msi_create_parent_irq_domain(&dom_info, &gic_v5_its_msi_parent_ops)) {
+		kfree(info);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int __init gicv5_its_init_bases(void __iomem *its_base, struct fwnode_handle *handle,
+				       struct irq_domain *parent_domain)
+{
+	struct device_node *np = to_of_node(handle);
+	struct gicv5_its_chip_data *its_node;
+	u32 cr0, cr1;
+	bool enabled;
+	int ret;
+
+	its_node = kzalloc(sizeof(*its_node), GFP_KERNEL);
+	if (!its_node)
+		return -ENOMEM;
+
+	mutex_init(&its_node->dev_alloc_lock);
+	xa_init(&its_node->its_devices);
+	its_node->fwnode = handle;
+	its_node->its_base = its_base;
+	its_node->msi_domain_flags = IRQ_DOMAIN_FLAG_ISOLATED_MSI |
+				     IRQ_DOMAIN_FLAG_FWNODE_PARENT;
+
+	cr0 = its_readl_relaxed(its_node, GICV5_ITS_CR0);
+	enabled = FIELD_GET(GICV5_ITS_CR0_ITSEN, cr0);
+	if (WARN(enabled, "ITS %s enabled, disabling it before proceeding\n", np->full_name)) {
+		ret = gicv5_its_disable(its_node);
+		if (ret)
+			goto out_free_node;
+	}
+
+	if (of_property_read_bool(np, "dma-noncoherent")) {
+		/*
+		 * A non-coherent ITS implies that some cache levels cannot be
+		 * used coherently by the cores and GIC. Our only option is to mark
+		 * memory attributes for the GIC as non-cacheable; by default,
+		 * non-cacheable memory attributes imply outer-shareable
+		 * shareability, the value written into ITS_CR1_SH is ignored.
+		 */
+		cr1 = FIELD_PREP(GICV5_ITS_CR1_ITT_RA, GICV5_NO_READ_ALLOC)	|
+		      FIELD_PREP(GICV5_ITS_CR1_DT_RA, GICV5_NO_READ_ALLOC)	|
+		      FIELD_PREP(GICV5_ITS_CR1_IC, GICV5_NON_CACHE)		|
+		      FIELD_PREP(GICV5_ITS_CR1_OC, GICV5_NON_CACHE);
+		its_node->flags |= ITS_FLAGS_NON_COHERENT;
+	} else {
+		cr1 = FIELD_PREP(GICV5_ITS_CR1_ITT_RA, GICV5_READ_ALLOC)	|
+		      FIELD_PREP(GICV5_ITS_CR1_DT_RA, GICV5_READ_ALLOC)		|
+		      FIELD_PREP(GICV5_ITS_CR1_IC, GICV5_WB_CACHE)		|
+		      FIELD_PREP(GICV5_ITS_CR1_OC, GICV5_WB_CACHE)		|
+		      FIELD_PREP(GICV5_ITS_CR1_SH, GICV5_INNER_SHARE);
+	}
+
+	its_writel_relaxed(its_node, cr1, GICV5_ITS_CR1);
+
+	ret = gicv5_its_init_devtab(its_node);
+	if (ret)
+		goto out_free_node;
+
+	ret = gicv5_its_enable(its_node);
+	if (ret)
+		goto out_free_devtab;
+
+	ret = gicv5_its_init_domain(its_node, parent_domain);
+	if (ret)
+		goto out_disable_its;
+
+	gicv5_its_print_info(its_node);
+
+	return 0;
+
+out_disable_its:
+	gicv5_its_disable(its_node);
+out_free_devtab:
+	gicv5_its_deinit_devtab(its_node);
+out_free_node:
+	kfree(its_node);
+	return ret;
+}
+
+static int __init gicv5_its_init(struct device_node *node)
+{
+	void __iomem *its_base;
+	int ret, idx;
+
+	idx = of_property_match_string(node, "reg-names", "ns-config");
+	if (idx < 0) {
+		pr_err("%pOF: ns-config reg-name not present\n", node);
+		return -ENODEV;
+	}
+
+	its_base = of_io_request_and_map(node, idx, of_node_full_name(node));
+	if (IS_ERR(its_base)) {
+		pr_err("%pOF: unable to map GICv5 ITS_CONFIG_FRAME\n", node);
+		return PTR_ERR(its_base);
+	}
+
+	ret = gicv5_its_init_bases(its_base, of_fwnode_handle(node),
+				   gicv5_global_data.lpi_domain);
+	if (ret)
+		goto out_unmap;
+
+	return 0;
+
+out_unmap:
+	iounmap(its_base);
+	return ret;
+}
+
+void __init gicv5_its_of_probe(struct device_node *parent)
+{
+	struct device_node *np;
+
+	for_each_available_child_of_node(parent, np) {
+		if (!of_device_is_compatible(np, "arm,gic-v5-its"))
+			continue;
+
+		if (gicv5_its_init(np))
+			pr_err("Failed to init ITS %s\n", np->full_name);
+	}
+}
diff --git a/drivers/irqchip/irq-gic-v5.c b/drivers/irqchip/irq-gic-v5.c
index 97ff935d79bd..a9e3e75adb96 100644
--- a/drivers/irqchip/irq-gic-v5.c
+++ b/drivers/irqchip/irq-gic-v5.c
@@ -1071,6 +1071,8 @@ static int __init gicv5_of_init(struct device_node *node, struct device_node *pa
 
 	gicv5_smp_init();
 
+	gicv5_irs_its_probe();
+
 	return 0;
 
 out_int:
diff --git a/include/linux/irqchip/arm-gic-v5.h b/include/linux/irqchip/arm-gic-v5.h
index 680eed794a35..07b952549bfa 100644
--- a/include/linux/irqchip/arm-gic-v5.h
+++ b/include/linux/irqchip/arm-gic-v5.h
@@ -50,6 +50,8 @@
 #define GICV5_IRS_IDR7			0x001c
 #define GICV5_IRS_CR0			0x0080
 #define GICV5_IRS_CR1			0x0084
+#define GICV5_IRS_SYNCR			0x00c0
+#define GICV5_IRS_SYNC_STATUSR		0x00c4
 #define GICV5_IRS_SPI_SELR		0x0108
 #define GICV5_IRS_SPI_CFGR		0x0114
 #define GICV5_IRS_SPI_STATUSR		0x0118
@@ -103,6 +105,10 @@
 #define GICV5_IRS_CR1_OC		GENMASK(3, 2)
 #define GICV5_IRS_CR1_SH		GENMASK(1, 0)
 
+#define GICV5_IRS_SYNCR_SYNC		BIT(31)
+
+#define GICV5_IRS_SYNC_STATUSR_IDLE	BIT(0)
+
 #define GICV5_IRS_SPI_STATUSR_V		BIT(1)
 #define GICV5_IRS_SPI_STATUSR_IDLE	BIT(0)
 
@@ -144,6 +150,104 @@
 
 #define GICV5_ISTL1E_L2_ADDR_MASK	GENMASK_ULL(55, 12)
 
+/*
+ * ITS registers and tables structures
+ */
+#define GICV5_ITS_IDR1		0x0004
+#define GICV5_ITS_IDR2		0x0008
+#define GICV5_ITS_CR0		0x0080
+#define GICV5_ITS_CR1		0x0084
+#define GICV5_ITS_DT_BASER	0x00c0
+#define GICV5_ITS_DT_CFGR	0x00d0
+#define GICV5_ITS_DIDR		0x0100
+#define GICV5_ITS_EIDR		0x0108
+#define GICV5_ITS_INV_EVENTR	0x010c
+#define GICV5_ITS_INV_DEVICER	0x0110
+#define GICV5_ITS_STATUSR	0x0120
+#define GICV5_ITS_SYNCR		0x0140
+#define GICV5_ITS_SYNC_STATUSR	0x0148
+
+#define GICV5_ITS_IDR1_L2SZ			GENMASK(10, 8)
+#define GICV5_ITS_IDR1_ITT_LEVELS		BIT(7)
+#define GICV5_ITS_IDR1_DT_LEVELS		BIT(6)
+#define GICV5_ITS_IDR1_DEVICEID_BITS		GENMASK(5, 0)
+
+#define GICV5_ITS_IDR1_L2SZ_SUPPORT_4KB(r)	FIELD_GET(BIT(8), (r))
+#define GICV5_ITS_IDR1_L2SZ_SUPPORT_16KB(r)	FIELD_GET(BIT(9), (r))
+#define GICV5_ITS_IDR1_L2SZ_SUPPORT_64KB(r)	FIELD_GET(BIT(10), (r))
+
+#define GICV5_ITS_IDR2_XDMN_EVENTs		GENMASK(6, 5)
+#define GICV5_ITS_IDR2_EVENTID_BITS		GENMASK(4, 0)
+
+#define GICV5_ITS_CR0_IDLE			BIT(1)
+#define GICV5_ITS_CR0_ITSEN			BIT(0)
+
+#define GICV5_ITS_CR1_ITT_RA			BIT(7)
+#define GICV5_ITS_CR1_DT_RA			BIT(6)
+#define GICV5_ITS_CR1_IC			GENMASK(5, 4)
+#define GICV5_ITS_CR1_OC			GENMASK(3, 2)
+#define GICV5_ITS_CR1_SH			GENMASK(1, 0)
+
+#define GICV5_ITS_DT_CFGR_STRUCTURE		BIT(16)
+#define GICV5_ITS_DT_CFGR_L2SZ			GENMASK(7, 6)
+#define GICV5_ITS_DT_CFGR_DEVICEID_BITS		GENMASK(5, 0)
+
+#define GICV5_ITS_DT_BASER_ADDR_MASK		GENMASK_ULL(55, 3)
+
+#define GICV5_ITS_INV_DEVICER_I			BIT(31)
+#define GICV5_ITS_INV_DEVICER_EVENTID_BITS	GENMASK(5, 1)
+#define GICV5_ITS_INV_DEVICER_L1		BIT(0)
+
+#define GICV5_ITS_DIDR_DEVICEID			GENMASK_ULL(31, 0)
+
+#define GICV5_ITS_EIDR_EVENTID			GENMASK(15, 0)
+
+#define GICV5_ITS_INV_EVENTR_I			BIT(31)
+#define GICV5_ITS_INV_EVENTR_ITT_L2SZ		GENMASK(2, 1)
+#define GICV5_ITS_INV_EVENTR_L1			BIT(0)
+
+#define GICV5_ITS_STATUSR_IDLE			BIT(0)
+
+#define GICV5_ITS_SYNCR_SYNC			BIT_ULL(63)
+#define GICV5_ITS_SYNCR_SYNCALL			BIT_ULL(32)
+#define GICV5_ITS_SYNCR_DEVICEID		GENMASK_ULL(31, 0)
+
+#define GICV5_ITS_SYNC_STATUSR_IDLE		BIT(0)
+
+#define GICV5_DTL1E_VALID			BIT_ULL(0)
+/* Note that there is no shift for the address by design */
+#define GICV5_DTL1E_L2_ADDR_MASK		GENMASK_ULL(55, 3)
+#define GICV5_DTL1E_SPAN			GENMASK_ULL(63, 60)
+
+#define GICV5_DTL2E_VALID			BIT_ULL(0)
+#define GICV5_DTL2E_ITT_L2SZ			GENMASK_ULL(2, 1)
+/* Note that there is no shift for the address by design */
+#define GICV5_DTL2E_ITT_ADDR_MASK		GENMASK_ULL(55, 3)
+#define GICV5_DTL2E_ITT_DSWE			BIT_ULL(57)
+#define GICV5_DTL2E_ITT_STRUCTURE		BIT_ULL(58)
+#define GICV5_DTL2E_EVENT_ID_BITS		GENMASK_ULL(63, 59)
+
+#define GICV5_ITTL1E_VALID			BIT_ULL(0)
+/* Note that there is no shift for the address by design */
+#define GICV5_ITTL1E_L2_ADDR_MASK		GENMASK_ULL(55, 3)
+#define GICV5_ITTL1E_SPAN			GENMASK_ULL(63, 60)
+
+#define GICV5_ITTL2E_LPI_ID			GENMASK_ULL(23, 0)
+#define GICV5_ITTL2E_DAC			GENMASK_ULL(29, 28)
+#define GICV5_ITTL2E_VIRTUAL			BIT_ULL(30)
+#define GICV5_ITTL2E_VALID			BIT_ULL(31)
+#define GICV5_ITTL2E_VM_ID			GENMASK_ULL(47, 32)
+
+#define GICV5_ITS_DT_ITT_CFGR_L2SZ_4k		0b00
+#define GICV5_ITS_DT_ITT_CFGR_L2SZ_16k		0b01
+#define GICV5_ITS_DT_ITT_CFGR_L2SZ_64k		0b10
+
+#define GICV5_ITS_DT_ITT_CFGR_STRUCTURE_LINEAR		0
+#define GICV5_ITS_DT_ITT_CFGR_STRUCTURE_TWO_LEVEL	1
+
+#define GICV5_ITS_HWIRQ_DEVICE_ID		GENMASK_ULL(31, 0)
+#define GICV5_ITS_HWIRQ_EVENT_ID		GENMASK_ULL(63, 32)
+
 /*
  * Global Data structures and functions
  */
@@ -197,24 +301,77 @@ static inline int gicv5_wait_for_op_s_atomic(void __iomem *addr, u32 offset,
 	return 0;
 }
 
+static inline int gicv5_wait_for_op_s(void __iomem *addr, u32 offset,
+				      const char *reg_s, u32 mask)
+{
+	void __iomem *reg = addr + offset;
+	u32 val;
+	int ret;
+
+	ret = readl_poll_timeout(reg, val, val & mask, 1, 10 * USEC_PER_MSEC);
+	if (unlikely(ret == -ETIMEDOUT)) {
+		pr_err_ratelimited("%s timeout...\n", reg_s);
+		return ret;
+	}
+
+	return 0;
+}
+
 #define gicv5_wait_for_op_atomic(base, reg, mask, val) \
 	gicv5_wait_for_op_s_atomic(base, reg, #reg, mask, val)
 
+#define gicv5_wait_for_op(base, reg, mask) \
+	gicv5_wait_for_op_s(base, reg, #reg, mask)
+
 void __init gicv5_init_lpi_domain(void);
 void __init gicv5_free_lpi_domain(void);
 
 int gicv5_irs_of_probe(struct device_node *parent);
 void gicv5_irs_remove(void);
 int gicv5_irs_enable(void);
+void gicv5_irs_its_probe(void);
 int gicv5_irs_register_cpu(int cpuid);
 int gicv5_irs_cpu_to_iaffid(int cpu_id, u16 *iaffid);
 struct gicv5_irs_chip_data *gicv5_irs_lookup_by_spi_id(u32 spi_id);
 int gicv5_spi_irq_set_type(struct irq_data *d, unsigned int type);
 int gicv5_irs_iste_alloc(u32 lpi);
+void gicv5_irs_syncr(void);
+
+struct gicv5_its_devtab_cfg {
+	union {
+		struct {
+			__le64	*devtab;
+		} linear;
+		struct {
+			__le64	*l1devtab;
+			__le64	**l2ptrs;
+		} l2;
+	};
+	u32	cfgr;
+};
+
+struct gicv5_its_itt_cfg {
+	union {
+		struct {
+			__le64		*itt;
+			unsigned int	num_ents;
+		} linear;
+		struct {
+			__le64		*l1itt;
+			__le64		**l2ptrs;
+			unsigned int	num_l1_ents;
+			u8		l2sz;
+		} l2;
+	};
+	u8	event_id_bits;
+	bool	l2itt;
+};
 
 void gicv5_init_lpis(u32 max);
 void gicv5_deinit_lpis(void);
 
 int gicv5_alloc_lpi(void);
 void gicv5_free_lpi(u32 lpi);
+
+void __init gicv5_its_of_probe(struct device_node *parent);
 #endif
-- 
cgit v1.2.3


From 695949d8b16f11f2f172d8d0c7ccc1ae09ed6cb7 Mon Sep 17 00:00:00 2001
From: Lorenzo Pieralisi <lpieralisi@kernel.org>
Date: Thu, 3 Jul 2025 12:25:19 +0200
Subject: irqchip/gic-v5: Add GICv5 IWB support

The GICv5 architecture implements the Interrupt Wire Bridge (IWB) in
order to support wired interrupts that cannot be connected directly
to an IRS and instead use the ITS to translate a wire event into
an IRQ signal.

Add the wired-to-MSI IWB driver to manage IWB wired interrupts.

An IWB is connected to an ITS and it has its own deviceID for all
interrupt wires that it manages; the IWB input wire number must be
exposed to the ITS as an eventID with a 1:1 mapping.

This eventID is not programmable and therefore requires a new
msi_alloc_info_t flag to make sure the ITS driver does not allocate
an eventid for the wire but rather it uses the msi_alloc_info_t.hwirq
number to gather the ITS eventID.

Co-developed-by: Sascha Bischoff <sascha.bischoff@arm.com>
Signed-off-by: Sascha Bischoff <sascha.bischoff@arm.com>
Co-developed-by: Timothy Hayes <timothy.hayes@arm.com>
Signed-off-by: Timothy Hayes <timothy.hayes@arm.com>
Signed-off-by: Lorenzo Pieralisi <lpieralisi@kernel.org>
Reviewed-by: Marc Zyngier <maz@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20250703-gicv5-host-v7-29-12e71f1b3528@kernel.org
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 drivers/irqchip/Makefile           |   3 +-
 drivers/irqchip/irq-gic-v5-its.c   |  40 ++++--
 drivers/irqchip/irq-gic-v5-iwb.c   | 284 +++++++++++++++++++++++++++++++++++++
 include/asm-generic/msi.h          |   1 +
 include/linux/irqchip/arm-gic-v5.h |  17 +++
 5 files changed, 335 insertions(+), 10 deletions(-)
 create mode 100644 drivers/irqchip/irq-gic-v5-iwb.c

(limited to 'include')

diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile
index 5459f66e597f..e83dad932ac0 100644
--- a/drivers/irqchip/Makefile
+++ b/drivers/irqchip/Makefile
@@ -37,7 +37,8 @@ obj-$(CONFIG_ARM_GIC_ITS_PARENT)	+= irq-gic-its-msi-parent.o
 obj-$(CONFIG_ARM_GIC_V3_ITS)		+= irq-gic-v3-its.o irq-gic-v4.o
 obj-$(CONFIG_ARM_GIC_V3_ITS_FSL_MC)	+= irq-gic-v3-its-fsl-mc-msi.o
 obj-$(CONFIG_PARTITION_PERCPU)		+= irq-partition-percpu.o
-obj-$(CONFIG_ARM_GIC_V5)		+= irq-gic-v5.o irq-gic-v5-irs.o irq-gic-v5-its.o
+obj-$(CONFIG_ARM_GIC_V5)		+= irq-gic-v5.o irq-gic-v5-irs.o irq-gic-v5-its.o \
+					   irq-gic-v5-iwb.o
 obj-$(CONFIG_HISILICON_IRQ_MBIGEN)	+= irq-mbigen.o
 obj-$(CONFIG_ARM_NVIC)			+= irq-nvic.o
 obj-$(CONFIG_ARM_VIC)			+= irq-vic.o
diff --git a/drivers/irqchip/irq-gic-v5-its.c b/drivers/irqchip/irq-gic-v5-its.c
index 700df6d0687e..340640fdbdf6 100644
--- a/drivers/irqchip/irq-gic-v5-its.c
+++ b/drivers/irqchip/irq-gic-v5-its.c
@@ -880,19 +880,41 @@ static void gicv5_its_unmap_event(struct gicv5_its_dev *its_dev, u16 event_id)
 	gicv5_its_itt_cache_inv(its, its_dev->device_id, event_id);
 }
 
-static int gicv5_its_alloc_eventid(struct gicv5_its_dev *its_dev,
+static int gicv5_its_alloc_eventid(struct gicv5_its_dev *its_dev, msi_alloc_info_t *info,
 				   unsigned int nr_irqs, u32 *eventid)
 {
-	int ret;
+	int event_id_base;
+
+	if (!(info->flags & MSI_ALLOC_FLAGS_FIXED_MSG_DATA)) {
+		event_id_base = bitmap_find_free_region(its_dev->event_map,
+							its_dev->num_events,
+							get_count_order(nr_irqs));
+		if (event_id_base < 0)
+			return event_id_base;
+	} else {
+		/*
+		 * We want to have a fixed EventID mapped for hardcoded
+		 * message data allocations.
+		 */
+		if (WARN_ON_ONCE(nr_irqs != 1))
+			return -EINVAL;
 
-	ret = bitmap_find_free_region(its_dev->event_map,
-				      its_dev->num_events,
-				      get_count_order(nr_irqs));
+		event_id_base = info->hwirq;
 
-	if (ret < 0)
-		return ret;
+		/* The fixed EventID must be below its_dev->num_events. */
+		if (event_id_base >= its_dev->num_events) {
+			pr_err("EventID outside of ITT range; cannot allocate an ITT entry!\n");
+			return -EINVAL;
+		}
+
+		/* Refuse an EventID whose bit is already set in event_map. */
+		if (test_and_set_bit(event_id_base, its_dev->event_map)) {
+			pr_warn("Can't reserve event_id bitmap\n");
+			return -EINVAL;
+		}
+	}
 
-	*eventid = ret;
+	*eventid = event_id_base;
 
 	return 0;
 }
@@ -916,7 +938,7 @@ static int gicv5_its_irq_domain_alloc(struct irq_domain *domain, unsigned int vi
 
 	its_dev = info->scratchpad[0].ptr;
 
-	ret = gicv5_its_alloc_eventid(its_dev, nr_irqs, &event_id_base);
+	ret = gicv5_its_alloc_eventid(its_dev, info, nr_irqs, &event_id_base);
 	if (ret)
 		return ret;
 
diff --git a/drivers/irqchip/irq-gic-v5-iwb.c b/drivers/irqchip/irq-gic-v5-iwb.c
new file mode 100644
index 000000000000..ed72fbdd4900
--- /dev/null
+++ b/drivers/irqchip/irq-gic-v5-iwb.c
@@ -0,0 +1,284 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2024-2025 ARM Limited, All Rights Reserved.
+ */
+#define pr_fmt(fmt)	"GICv5 IWB: " fmt
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/msi.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_platform.h>
+
+#include <linux/irqchip.h>
+#include <linux/irqchip/arm-gic-v5.h>
+
+struct gicv5_iwb_chip_data {
+	void __iomem	*iwb_base;
+	u16		nr_regs;
+};
+
+static u32 iwb_readl_relaxed(struct gicv5_iwb_chip_data *iwb_node, const u32 reg_offset)
+{
+	return readl_relaxed(iwb_node->iwb_base + reg_offset);
+}
+
+static void iwb_writel_relaxed(struct gicv5_iwb_chip_data *iwb_node, const u32 val,
+			       const u32 reg_offset)
+{
+	writel_relaxed(val, iwb_node->iwb_base + reg_offset);
+}
+
+static int gicv5_iwb_wait_for_wenabler(struct gicv5_iwb_chip_data *iwb_node)
+{
+	return gicv5_wait_for_op_atomic(iwb_node->iwb_base, GICV5_IWB_WENABLE_STATUSR,
+					GICV5_IWB_WENABLE_STATUSR_IDLE, NULL);
+}
+
+static int __gicv5_iwb_set_wire_enable(struct gicv5_iwb_chip_data *iwb_node,
+				       u32 iwb_wire, bool enable)
+{
+	u32 n = iwb_wire / 32;
+	u8 i = iwb_wire % 32;
+	u32 val;
+
+	if (n >= iwb_node->nr_regs) {
+		pr_err("IWB_WENABLER<n> is invalid for n=%u\n", n);
+		return -EINVAL;
+	}
+
+	/*
+	 * Enable IWB wire/pin at this point
+	 * Note: This is not the same as enabling the interrupt
+	 */
+	val = iwb_readl_relaxed(iwb_node, GICV5_IWB_WENABLER + (4 * n));
+	if (enable)
+		val |= BIT(i);
+	else
+		val &= ~BIT(i);
+	iwb_writel_relaxed(iwb_node, val, GICV5_IWB_WENABLER + (4 * n));
+
+	return gicv5_iwb_wait_for_wenabler(iwb_node);
+}
+
+static int gicv5_iwb_enable_wire(struct gicv5_iwb_chip_data *iwb_node,
+				 u32 iwb_wire)
+{
+	return __gicv5_iwb_set_wire_enable(iwb_node, iwb_wire, true);
+}
+
+static int gicv5_iwb_disable_wire(struct gicv5_iwb_chip_data *iwb_node,
+				  u32 iwb_wire)
+{
+	return __gicv5_iwb_set_wire_enable(iwb_node, iwb_wire, false);
+}
+
+static void gicv5_iwb_irq_disable(struct irq_data *d)
+{
+	struct gicv5_iwb_chip_data *iwb_node = irq_data_get_irq_chip_data(d);
+
+	gicv5_iwb_disable_wire(iwb_node, d->hwirq);
+	irq_chip_disable_parent(d);
+}
+
+static void gicv5_iwb_irq_enable(struct irq_data *d)
+{
+	struct gicv5_iwb_chip_data *iwb_node = irq_data_get_irq_chip_data(d);
+
+	gicv5_iwb_enable_wire(iwb_node, d->hwirq);
+	irq_chip_enable_parent(d);
+}
+
+/* Set the trigger mode bit of wire d->hwirq in IWB_WTMR<n>: 1 = level, 0 = edge. */
+static int gicv5_iwb_set_type(struct irq_data *d, unsigned int type)
+{
+	struct gicv5_iwb_chip_data *iwb_node = irq_data_get_irq_chip_data(d);
+	u32 iwb_wire = d->hwirq;
+	u32 n = iwb_wire / 32;
+	u8 i = iwb_wire % 32;
+	u32 wtmr;
+
+	if (n >= iwb_node->nr_regs) {
+		pr_err_once("reg %u out of range\n", n);
+		return -EINVAL;
+	}
+
+	wtmr = iwb_readl_relaxed(iwb_node, GICV5_IWB_WTMR + (4 * n));
+
+	/* Both level polarities set the bit, both edge polarities clear it. */
+	switch (type) {
+	case IRQ_TYPE_LEVEL_HIGH:
+	case IRQ_TYPE_LEVEL_LOW:
+		wtmr |= BIT(i);
+		break;
+	case IRQ_TYPE_EDGE_RISING:
+	case IRQ_TYPE_EDGE_FALLING:
+		wtmr &= ~BIT(i);
+		break;
+	default:
+		pr_debug("unexpected wire trigger mode\n");
+		return -EINVAL;
+	}
+
+	iwb_writel_relaxed(iwb_node, wtmr, GICV5_IWB_WTMR + (4 * n));
+
+	return 0;
+}
+
+static void gicv5_iwb_domain_set_desc(msi_alloc_info_t *alloc_info, struct msi_desc *desc)
+{
+	alloc_info->desc = desc;
+	alloc_info->hwirq = (u32)desc->data.icookie.value;
+}
+
+static int gicv5_iwb_irq_domain_translate(struct irq_domain *d, struct irq_fwspec *fwspec,
+					  irq_hw_number_t *hwirq,
+					  unsigned int *type)
+{
+	if (!is_of_node(fwspec->fwnode))
+		return -EINVAL;
+
+	if (fwspec->param_count < 2)
+		return -EINVAL;
+
+	/*
+	 * param[0] is the IWB wire number, returned as the hwirq
+	 * param[1] is the interrupt type, masked to its sense bits
+	 */
+	*hwirq = fwspec->param[0];
+	*type = fwspec->param[1] & IRQ_TYPE_SENSE_MASK;
+
+	return 0;
+}
+
+static void gicv5_iwb_write_msi_msg(struct irq_data *d, struct msi_msg *msg) {}
+
+static const struct msi_domain_template iwb_msi_template = {
+	.chip = {
+		.name			= "GICv5-IWB",
+		.irq_mask		= irq_chip_mask_parent,
+		.irq_unmask		= irq_chip_unmask_parent,
+		.irq_enable		= gicv5_iwb_irq_enable,
+		.irq_disable		= gicv5_iwb_irq_disable,
+		.irq_eoi		= irq_chip_eoi_parent,
+		.irq_set_type		= gicv5_iwb_set_type,
+		.irq_write_msi_msg	= gicv5_iwb_write_msi_msg,
+		.irq_set_affinity	= irq_chip_set_affinity_parent,
+		.irq_get_irqchip_state	= irq_chip_get_parent_state,
+		.irq_set_irqchip_state	= irq_chip_set_parent_state,
+		.flags			= IRQCHIP_SET_TYPE_MASKED |
+					  IRQCHIP_SKIP_SET_WAKE |
+					  IRQCHIP_MASK_ON_SUSPEND,
+	},
+
+	.ops = {
+		.set_desc		= gicv5_iwb_domain_set_desc,
+		.msi_translate		= gicv5_iwb_irq_domain_translate,
+	},
+
+	.info = {
+		.bus_token		= DOMAIN_BUS_WIRED_TO_MSI,
+		.flags			= MSI_FLAG_USE_DEV_FWNODE,
+	},
+
+	.alloc_info = {
+		.flags			= MSI_ALLOC_FLAGS_FIXED_MSG_DATA,
+	},
+};
+
+static bool gicv5_iwb_create_device_domain(struct device *dev, unsigned int size,
+				     struct gicv5_iwb_chip_data *iwb_node)
+{
+	if (WARN_ON_ONCE(!dev->msi.domain))
+		return false;
+
+	return msi_create_device_irq_domain(dev, MSI_DEFAULT_DOMAIN,
+					    &iwb_msi_template, size,
+					    NULL, iwb_node);
+}
+
+static struct gicv5_iwb_chip_data *
+gicv5_iwb_init_bases(void __iomem *iwb_base, struct platform_device *pdev)
+{
+	u32 nr_wires, idr0, cr0;
+	unsigned int n;
+	int ret;
+
+	struct gicv5_iwb_chip_data *iwb_node __free(kfree) = kzalloc(sizeof(*iwb_node),
+								     GFP_KERNEL);
+	if (!iwb_node)
+		return ERR_PTR(-ENOMEM);
+
+	iwb_node->iwb_base = iwb_base;
+
+	idr0 = iwb_readl_relaxed(iwb_node, GICV5_IWB_IDR0);
+	nr_wires = (FIELD_GET(GICV5_IWB_IDR0_IW_RANGE, idr0) + 1) * 32;
+
+	cr0 = iwb_readl_relaxed(iwb_node, GICV5_IWB_CR0);
+	if (!FIELD_GET(GICV5_IWB_CR0_IWBEN, cr0)) {
+		dev_err(&pdev->dev, "IWB must be enabled in firmware\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	iwb_node->nr_regs = FIELD_GET(GICV5_IWB_IDR0_IW_RANGE, idr0) + 1;
+
+	for (n = 0; n < iwb_node->nr_regs; n++)
+		iwb_writel_relaxed(iwb_node, 0, GICV5_IWB_WENABLER + (sizeof(u32) * n));
+
+	ret = gicv5_iwb_wait_for_wenabler(iwb_node);
+	if (ret)
+		return ERR_PTR(ret);
+
+	if (!gicv5_iwb_create_device_domain(&pdev->dev, nr_wires, iwb_node))
+		return ERR_PTR(-ENOMEM);
+
+	return_ptr(iwb_node);
+}
+
+/*
+ * Map the IWB register frame and initialise the IWB.
+ *
+ * The mapping is created with devm_ioremap(), so it is released by the
+ * driver core when probe fails; it must not be unmapped by hand with
+ * iounmap() on the error path.
+ */
+static int gicv5_iwb_device_probe(struct platform_device *pdev)
+{
+	struct gicv5_iwb_chip_data *iwb_node;
+	void __iomem *iwb_base;
+	struct resource *res;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res)
+		return -EINVAL;
+
+	iwb_base = devm_ioremap(&pdev->dev, res->start, resource_size(res));
+	if (!iwb_base) {
+		dev_err(&pdev->dev, "failed to ioremap %pR\n", res);
+		return -ENOMEM;
+	}
+
+	iwb_node = gicv5_iwb_init_bases(iwb_base, pdev);
+	if (IS_ERR(iwb_node))
+		return PTR_ERR(iwb_node);
+
+	return 0;
+}
+
+static const struct of_device_id gicv5_iwb_of_match[] = {
+	{ .compatible = "arm,gic-v5-iwb" },
+	{ /* END */ }
+};
+MODULE_DEVICE_TABLE(of, gicv5_iwb_of_match);
+
+static struct platform_driver gicv5_iwb_platform_driver = {
+	.driver = {
+		.name			= "GICv5 IWB",
+		.of_match_table		= gicv5_iwb_of_match,
+		.suppress_bind_attrs	= true,
+	},
+	.probe				= gicv5_iwb_device_probe,
+};
+
+module_platform_driver(gicv5_iwb_platform_driver);
diff --git a/include/asm-generic/msi.h b/include/asm-generic/msi.h
index 124c734ca5d9..92cca4b23f13 100644
--- a/include/asm-generic/msi.h
+++ b/include/asm-generic/msi.h
@@ -33,6 +33,7 @@ typedef struct msi_alloc_info {
 
 /* Device generating MSIs is proxying for another device */
 #define MSI_ALLOC_FLAGS_PROXY_DEVICE	(1UL << 0)
+#define MSI_ALLOC_FLAGS_FIXED_MSG_DATA	(1UL << 1)
 
 #define GENERIC_MSI_DOMAIN_OPS		1
 
diff --git a/include/linux/irqchip/arm-gic-v5.h b/include/linux/irqchip/arm-gic-v5.h
index 07b952549bfa..68ddcdb1cec5 100644
--- a/include/linux/irqchip/arm-gic-v5.h
+++ b/include/linux/irqchip/arm-gic-v5.h
@@ -248,6 +248,23 @@
 #define GICV5_ITS_HWIRQ_DEVICE_ID		GENMASK_ULL(31, 0)
 #define GICV5_ITS_HWIRQ_EVENT_ID		GENMASK_ULL(63, 32)
 
+/*
+ * IWB registers
+ */
+#define GICV5_IWB_IDR0				0x0000
+#define GICV5_IWB_CR0				0x0080
+#define GICV5_IWB_WENABLE_STATUSR		0x00c0
+#define GICV5_IWB_WENABLER			0x2000
+#define GICV5_IWB_WTMR				0x4000
+
+#define GICV5_IWB_IDR0_INT_DOMS			GENMASK(14, 11)
+#define GICV5_IWB_IDR0_IW_RANGE			GENMASK(10, 0)
+
+#define GICV5_IWB_CR0_IDLE			BIT(1)
+#define GICV5_IWB_CR0_IWBEN			BIT(0)
+
+#define GICV5_IWB_WENABLE_STATUSR_IDLE		BIT(0)
+
 /*
  * Global Data structures and functions
  */
-- 
cgit v1.2.3


From 4e655028c29fbc455fdbbd3ca074443361adfa44 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 7 Jul 2025 11:41:14 -0700
Subject: net: ethtool: remove the compat code for _rxfh_context ops

All drivers are now converted to dedicated _rxfh_context ops.
Remove the use of ->set_rxfh() to manage additional contexts.

Reviewed-by: Gal Pressman <gal@nvidia.com>
Reviewed-by: Edward Cree <ecree.xilinx@gmail.com>
Link: https://patch.msgid.link/20250707184115.2285277-5-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/ethtool.h |  4 ---
 net/core/dev.c          | 15 +-----------
 net/ethtool/ioctl.c     | 65 +++++++++++--------------------------------------
 net/ethtool/rss.c       |  3 +--
 4 files changed, 16 insertions(+), 71 deletions(-)

(limited to 'include')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 59877fd2a1d3..de5bd76a400c 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -865,9 +865,6 @@ struct kernel_ethtool_ts_info {
  * @supported_input_xfrm: supported types of input xfrm from %RXH_XFRM_*.
  * @cap_link_lanes_supported: indicates if the driver supports lanes
  *	parameter.
- * @cap_rss_ctx_supported: indicates if the driver supports RSS
- *	contexts via legacy API, drivers implementing @create_rxfh_context
- *	do not have to set this bit.
  * @rxfh_per_ctx_fields: device supports selecting different header fields
  *	for Rx hash calculation and RSS for each additional context.
  * @rxfh_per_ctx_key: device supports setting different RSS key for each
@@ -1100,7 +1097,6 @@ struct kernel_ethtool_ts_info {
 struct ethtool_ops {
 	u32     supported_input_xfrm:8;
 	u32     cap_link_lanes_supported:1;
-	u32     cap_rss_ctx_supported:1;
 	u32	rxfh_per_ctx_fields:1;
 	u32	rxfh_per_ctx_key:1;
 	u32	cap_rss_rxnfc_adds:1;
diff --git a/net/core/dev.c b/net/core/dev.c
index ea129aa08317..fe677ccec5b0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -11979,21 +11979,8 @@ static void netdev_rss_contexts_free(struct net_device *dev)
 
 	mutex_lock(&dev->ethtool->rss_lock);
 	xa_for_each(&dev->ethtool->rss_ctx, context, ctx) {
-		struct ethtool_rxfh_param rxfh;
-
-		rxfh.indir = ethtool_rxfh_context_indir(ctx);
-		rxfh.key = ethtool_rxfh_context_key(ctx);
-		rxfh.hfunc = ctx->hfunc;
-		rxfh.input_xfrm = ctx->input_xfrm;
-		rxfh.rss_context = context;
-		rxfh.rss_delete = true;
-
 		xa_erase(&dev->ethtool->rss_ctx, context);
-		if (dev->ethtool_ops->create_rxfh_context)
-			dev->ethtool_ops->remove_rxfh_context(dev, ctx,
-							      context, NULL);
-		else
-			dev->ethtool_ops->set_rxfh(dev, &rxfh, NULL);
+		dev->ethtool_ops->remove_rxfh_context(dev, ctx, context, NULL);
 		kfree(ctx);
 	}
 	xa_destroy(&dev->ethtool->rss_ctx);
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index b6d96e562c9a..d8a17350d3e8 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -1391,8 +1391,7 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
 	if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd32)
 		return -EINVAL;
 	/* Most drivers don't handle rss_context, check it's 0 as well */
-	if (rxfh.rss_context && !(ops->cap_rss_ctx_supported ||
-				  ops->create_rxfh_context))
+	if (rxfh.rss_context && !ops->create_rxfh_context)
 		return -EOPNOTSUPP;
 
 	rxfh.indir_size = rxfh_dev.indir_size;
@@ -1534,8 +1533,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
 	if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd32)
 		return -EINVAL;
 	/* Most drivers don't handle rss_context, check it's 0 as well */
-	if (rxfh.rss_context && !(ops->cap_rss_ctx_supported ||
-				  ops->create_rxfh_context))
+	if (rxfh.rss_context && !ops->create_rxfh_context)
 		return -EOPNOTSUPP;
 	/* Check input data transformation capabilities */
 	if (rxfh.input_xfrm && rxfh.input_xfrm != RXH_XFRM_SYM_XOR &&
@@ -1634,6 +1632,8 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
 	}
 
 	if (create) {
+		u32 limit, ctx_id;
+
 		if (rxfh_dev.rss_delete) {
 			ret = -EINVAL;
 			goto out_unlock;
@@ -1644,21 +1644,15 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
 			goto out_unlock;
 		}
 
-		if (ops->create_rxfh_context) {
-			u32 limit = ops->rxfh_max_num_contexts ?: U32_MAX;
-			u32 ctx_id;
-
-			/* driver uses new API, core allocates ID */
-			ret = xa_alloc(&dev->ethtool->rss_ctx, &ctx_id, ctx,
-				       XA_LIMIT(1, limit - 1),
-				       GFP_KERNEL_ACCOUNT);
-			if (ret < 0) {
-				kfree(ctx);
-				goto out_unlock;
-			}
-			WARN_ON(!ctx_id); /* can't happen */
-			rxfh.rss_context = ctx_id;
+		limit = ops->rxfh_max_num_contexts ?: U32_MAX;
+		ret = xa_alloc(&dev->ethtool->rss_ctx, &ctx_id, ctx,
+			       XA_LIMIT(1, limit - 1), GFP_KERNEL_ACCOUNT);
+		if (ret < 0) {
+			kfree(ctx);
+			goto out_unlock;
 		}
+		WARN_ON(!ctx_id); /* can't happen */
+		rxfh.rss_context = ctx_id;
 	} else if (rxfh.rss_context) {
 		ctx = xa_load(&dev->ethtool->rss_ctx, rxfh.rss_context);
 		if (!ctx) {
@@ -1670,7 +1664,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
 	rxfh_dev.rss_context = rxfh.rss_context;
 	rxfh_dev.input_xfrm = rxfh.input_xfrm;
 
-	if (rxfh.rss_context && ops->create_rxfh_context) {
+	if (rxfh.rss_context) {
 		if (create) {
 			ret = ops->create_rxfh_context(dev, ctx, &rxfh_dev,
 						       extack);
@@ -1693,8 +1687,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
 	if (ret) {
 		if (create) {
 			/* failed to create, free our new tracking entry */
-			if (ops->create_rxfh_context)
-				xa_erase(&dev->ethtool->rss_ctx, rxfh.rss_context);
+			xa_erase(&dev->ethtool->rss_ctx, rxfh.rss_context);
 			kfree(ctx);
 		}
 		goto out_unlock;
@@ -1713,36 +1706,6 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
 			dev->priv_flags |= IFF_RXFH_CONFIGURED;
 	}
 	/* Update rss_ctx tracking */
-	if (create && !ops->create_rxfh_context) {
-		/* driver uses old API, it chose context ID */
-		if (WARN_ON(xa_load(&dev->ethtool->rss_ctx, rxfh_dev.rss_context))) {
-			/* context ID reused, our tracking is screwed */
-			kfree(ctx);
-			goto out_unlock;
-		}
-		/* Allocate the exact ID the driver gave us */
-		if (xa_is_err(xa_store(&dev->ethtool->rss_ctx, rxfh_dev.rss_context,
-				       ctx, GFP_KERNEL))) {
-			kfree(ctx);
-			goto out_unlock;
-		}
-
-		/* Fetch the defaults for the old API, in the new API drivers
-		 * should write defaults into ctx themselves.
-		 */
-		rxfh_dev.indir = (u32 *)rss_config;
-		rxfh_dev.indir_size = dev_indir_size;
-
-		rxfh_dev.key = rss_config + indir_bytes;
-		rxfh_dev.key_size = dev_key_size;
-
-		ret = ops->get_rxfh(dev, &rxfh_dev);
-		if (WARN_ON(ret)) {
-			xa_erase(&dev->ethtool->rss_ctx, rxfh.rss_context);
-			kfree(ctx);
-			goto out_unlock;
-		}
-	}
 	if (rxfh_dev.rss_delete) {
 		WARN_ON(xa_erase(&dev->ethtool->rss_ctx, rxfh.rss_context) != ctx);
 		kfree(ctx);
diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c
index e717f23cbc10..4e8ca2c38175 100644
--- a/net/ethtool/rss.c
+++ b/net/ethtool/rss.c
@@ -163,8 +163,7 @@ rss_prepare_data(const struct ethnl_req_info *req_base,
 		return -EOPNOTSUPP;
 
 	/* Some drivers don't handle rss_context */
-	if (request->rss_context &&
-	    !ops->cap_rss_ctx_supported && !ops->create_rxfh_context)
+	if (request->rss_context && !ops->create_rxfh_context)
 		return -EOPNOTSUPP;
 
 	mutex_lock(&dev->ethtool->rss_lock);
-- 
cgit v1.2.3


From 1ec38ce3d024bebdff2a9ffb526e4d198605204d Mon Sep 17 00:00:00 2001
From: Sascha Bischoff <Sascha.Bischoff@arm.com>
Date: Fri, 27 Jun 2025 10:09:01 +0000
Subject: irqchip/gic-v5: Populate struct gic_kvm_info

Populate the gic_kvm_info struct based on support for
FEAT_GCIE_LEGACY.  The struct is used by KVM to probe for a compatible
GIC.

Co-authored-by: Timothy Hayes <timothy.hayes@arm.com>
Signed-off-by: Timothy Hayes <timothy.hayes@arm.com>
Signed-off-by: Sascha Bischoff <sascha.bischoff@arm.com>
Reviewed-by: Lorenzo Pieralisi <lpieralisi@kernel.org>
Link: https://lore.kernel.org/r/20250627100847.1022515-3-sascha.bischoff@arm.com
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
---
 drivers/irqchip/irq-gic-v5.c          | 33 +++++++++++++++++++++++++++++++++
 include/linux/irqchip/arm-vgic-info.h |  4 ++++
 2 files changed, 37 insertions(+)

(limited to 'include')

diff --git a/drivers/irqchip/irq-gic-v5.c b/drivers/irqchip/irq-gic-v5.c
index 0e4789818131..4bd224f359a7 100644
--- a/drivers/irqchip/irq-gic-v5.c
+++ b/drivers/irqchip/irq-gic-v5.c
@@ -13,6 +13,7 @@
 
 #include <linux/irqchip.h>
 #include <linux/irqchip/arm-gic-v5.h>
+#include <linux/irqchip/arm-vgic-info.h>
 
 #include <asm/cpufeature.h>
 #include <asm/exception.h>
@@ -1058,6 +1059,36 @@ static void gicv5_set_cpuif_idbits(void)
 	}
 }
 
+#ifdef CONFIG_KVM
+static struct gic_kvm_info gic_v5_kvm_info __initdata;
+
+static bool __init gicv5_cpuif_has_gcie_legacy(void)
+{
+	u64 idr0 = read_sysreg_s(SYS_ICC_IDR0_EL1);
+	return !!FIELD_GET(ICC_IDR0_EL1_GCIE_LEGACY, idr0);
+}
+
+static void __init gic_of_setup_kvm_info(struct device_node *node)
+{
+	gic_v5_kvm_info.type = GIC_V5;
+	gic_v5_kvm_info.has_gcie_v3_compat = gicv5_cpuif_has_gcie_legacy();
+
+	/* GIC Virtual CPU interface maintenance interrupt */
+	gic_v5_kvm_info.no_maint_irq_mask = false;
+	gic_v5_kvm_info.maint_irq = irq_of_parse_and_map(node, 0);
+	if (!gic_v5_kvm_info.maint_irq) {
+		pr_warn("cannot find GICv5 virtual CPU interface maintenance interrupt\n");
+		return;
+	}
+
+	vgic_set_kvm_info(&gic_v5_kvm_info);
+}
+#else
+static inline void __init gic_of_setup_kvm_info(struct device_node *node)
+{
+}
+#endif // CONFIG_KVM
+
 static int __init gicv5_of_init(struct device_node *node, struct device_node *parent)
 {
 	int ret = gicv5_irs_of_probe(node);
@@ -1090,6 +1121,8 @@ static int __init gicv5_of_init(struct device_node *node, struct device_node *pa
 
 	gicv5_irs_its_probe();
 
+	gic_of_setup_kvm_info(node);
+
 	return 0;
 
 out_int:
diff --git a/include/linux/irqchip/arm-vgic-info.h b/include/linux/irqchip/arm-vgic-info.h
index a75b2c7de69d..ca1713fac6e3 100644
--- a/include/linux/irqchip/arm-vgic-info.h
+++ b/include/linux/irqchip/arm-vgic-info.h
@@ -15,6 +15,8 @@ enum gic_type {
 	GIC_V2,
 	/* Full GICv3, optionally with v2 compat */
 	GIC_V3,
+	/* Full GICv5, optionally with v3 compat */
+	GIC_V5,
 };
 
 struct gic_kvm_info {
@@ -34,6 +36,8 @@ struct gic_kvm_info {
 	bool		has_v4_1;
 	/* Deactivation impared, subpar stuff */
 	bool		no_hw_deactivation;
+	/* v3 compat support (GICv5 hosts, only) */
+	bool		has_gcie_v3_compat;
 };
 
 #ifdef CONFIG_KVM
-- 
cgit v1.2.3


From c017e49ed1381001ba7a6521daae8f968b11cf09 Mon Sep 17 00:00:00 2001
From: Sascha Bischoff <Sascha.Bischoff@arm.com>
Date: Fri, 27 Jun 2025 10:09:02 +0000
Subject: KVM: arm64: gic-v5: Support GICv3 compat

Add support for GICv3 compat mode (FEAT_GCIE_LEGACY) which allows a
GICv5 host to run GICv3-based VMs. This change enables the
VHE/nVHE/hVHE/protected modes, but does not support nested
virtualization.

A lazy-disable approach is taken for compat mode; it is enabled on the
vgic_v3_load path but not disabled on the vgic_v3_put path. A
non-GICv3 VM, i.e., one based on GICv5, is responsible for disabling
compat mode on the corresponding vgic_v5_load path. Currently, GICv5
is not supported, and hence compat mode is not disabled again once it
is enabled, and this function is intentionally omitted from the code.

Co-authored-by: Timothy Hayes <timothy.hayes@arm.com>
Signed-off-by: Timothy Hayes <timothy.hayes@arm.com>
Signed-off-by: Sascha Bischoff <sascha.bischoff@arm.com>
Link: https://lore.kernel.org/r/20250627100847.1022515-5-sascha.bischoff@arm.com
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
---
 arch/arm64/kvm/hyp/vgic-v3-sr.c | 51 ++++++++++++++++++++++++++++++++++-------
 arch/arm64/kvm/sys_regs.c       | 10 +++++++-
 arch/arm64/kvm/vgic/vgic-init.c |  6 +++--
 arch/arm64/kvm/vgic/vgic.h      | 11 +++++++++
 include/kvm/arm_vgic.h          |  6 ++++-
 5 files changed, 72 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c
index f162b0df5cae..6ce88e56ccb8 100644
--- a/arch/arm64/kvm/hyp/vgic-v3-sr.c
+++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c
@@ -296,12 +296,19 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if)
 	}
 
 	/*
-	 * Prevent the guest from touching the ICC_SRE_EL1 system
-	 * register. Note that this may not have any effect, as
-	 * ICC_SRE_EL2.Enable being RAO/WI is a valid implementation.
+	 * GICv5 BET0 FEAT_GCIE_LEGACY doesn't include ICC_SRE_EL2. This is due
+	 * to be relaxed in a future spec release, at which point this
+	 * condition can be dropped.
 	 */
-	write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE,
-		     ICC_SRE_EL2);
+	if (!cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF)) {
+		/*
+		 * Prevent the guest from touching the ICC_SRE_EL1 system
+		 * register. Note that this may not have any effect, as
+		 * ICC_SRE_EL2.Enable being RAO/WI is a valid implementation.
+		 */
+		write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE,
+			     ICC_SRE_EL2);
+	}
 
 	/*
 	 * If we need to trap system registers, we must write
@@ -322,8 +329,14 @@ void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if)
 		cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2);
 	}
 
-	val = read_gicreg(ICC_SRE_EL2);
-	write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2);
+	/*
+	 * Can be dropped in the future when GICv5 spec is relaxed. See comment
+	 * above.
+	 */
+	if (!cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF)) {
+		val = read_gicreg(ICC_SRE_EL2);
+		write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2);
+	}
 
 	if (!cpu_if->vgic_sre) {
 		/* Make sure ENABLE is set at EL2 before setting SRE at EL1 */
@@ -423,9 +436,19 @@ void __vgic_v3_init_lrs(void)
  */
 u64 __vgic_v3_get_gic_config(void)
 {
-	u64 val, sre = read_gicreg(ICC_SRE_EL1);
+	u64 val, sre;
 	unsigned long flags = 0;
 
+	/*
+	 * In compat mode, we cannot access ICC_SRE_EL1 at any EL
+	 * other than EL1 itself; just return the
+	 * ICH_VTR_EL2. ICC_IDR0_EL1 is only implemented on a GICv5
+	 * system, so we first check if we have GICv5 support.
+	 */
+	if (cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF))
+		return read_gicreg(ICH_VTR_EL2);
+
+	sre = read_gicreg(ICC_SRE_EL1);
 	/*
 	 * To check whether we have a MMIO-based (GICv2 compatible)
 	 * CPU interface, we need to disable the system register
@@ -471,6 +494,16 @@ u64 __vgic_v3_get_gic_config(void)
 	return val;
 }
 
+static void __vgic_v3_compat_mode_enable(void)
+{
+	if (!cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF))
+		return;
+
+	sysreg_clear_set_s(SYS_ICH_VCTLR_EL2, 0, ICH_VCTLR_EL2_V3);
+	/* Wait for V3 to become enabled */
+	isb();
+}
+
 static u64 __vgic_v3_read_vmcr(void)
 {
 	return read_gicreg(ICH_VMCR_EL2);
@@ -490,6 +523,8 @@ void __vgic_v3_save_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if)
 
 void __vgic_v3_restore_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if)
 {
+	__vgic_v3_compat_mode_enable();
+
 	/*
 	 * If dealing with a GICv2 emulation on GICv3, VMCR_EL2.VFIQen
 	 * is dependent on ICC_SRE_EL1.SRE, and we have to perform the
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 76c2f0da821f..f01953c7c2a9 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1811,7 +1811,7 @@ static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val)
 		val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, CSV3, IMP);
 	}
 
-	if (kvm_vgic_global_state.type == VGIC_V3) {
+	if (vgic_is_v3(vcpu->kvm)) {
 		val &= ~ID_AA64PFR0_EL1_GIC_MASK;
 		val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, GIC, IMP);
 	}
@@ -1953,6 +1953,14 @@ static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu,
 	    (vcpu_has_nv(vcpu) && !FIELD_GET(ID_AA64PFR0_EL1_EL2, user_val)))
 		return -EINVAL;
 
+	/*
+	 * If we are running on a GICv5 host and support FEAT_GCIE_LEGACY, then
+	 * we support GICv3. Fail attempts to do anything but set that to IMP.
+	 */
+	if (vgic_is_v3_compat(vcpu->kvm) &&
+	    FIELD_GET(ID_AA64PFR0_EL1_GIC_MASK, user_val) != ID_AA64PFR0_EL1_GIC_IMP)
+		return -EINVAL;
+
 	return set_id_reg(vcpu, rd, user_val);
 }
 
diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index eb1205654ac8..1f1f0c9ce64f 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -674,10 +674,12 @@ void kvm_vgic_init_cpu_hardware(void)
 	 * We want to make sure the list registers start out clear so that we
 	 * only have the program the used registers.
 	 */
-	if (kvm_vgic_global_state.type == VGIC_V2)
+	if (kvm_vgic_global_state.type == VGIC_V2) {
 		vgic_v2_init_lrs();
-	else
+	} else if (kvm_vgic_global_state.type == VGIC_V3 ||
+		   kvm_vgic_global_state.has_gcie_v3_compat) {
 		kvm_call_hyp(__vgic_v3_init_lrs);
+	}
 }
 
 /**
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
index 4349084cb9a6..23d393998085 100644
--- a/arch/arm64/kvm/vgic/vgic.h
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -389,6 +389,17 @@ void vgic_v3_put_nested(struct kvm_vcpu *vcpu);
 void vgic_v3_handle_nested_maint_irq(struct kvm_vcpu *vcpu);
 void vgic_v3_nested_update_mi(struct kvm_vcpu *vcpu);
 
+static inline bool vgic_is_v3_compat(struct kvm *kvm)
+{
+	return cpus_have_final_cap(ARM64_HAS_GICV5_CPUIF) &&
+		kvm_vgic_global_state.has_gcie_v3_compat;
+}
+
+static inline bool vgic_is_v3(struct kvm *kvm)
+{
+	return kvm_vgic_global_state.type == VGIC_V3 || vgic_is_v3_compat(kvm);
+}
+
 int vgic_its_debug_init(struct kvm_device *dev);
 void vgic_its_debug_destroy(struct kvm_device *dev);
 
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 4a34f7f0a864..5c293e0ff5c1 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -38,6 +38,7 @@
 enum vgic_type {
 	VGIC_V2,		/* Good ol' GICv2 */
 	VGIC_V3,		/* New fancy GICv3 */
+	VGIC_V5,		/* Newer, fancier GICv5 */
 };
 
 /* same for all guests, as depending only on the _host's_ GIC model */
@@ -77,9 +78,12 @@ struct vgic_global {
 	/* Pseudo GICv3 from outer space */
 	bool			no_hw_deactivation;
 
-	/* GIC system register CPU interface */
+	/* GICv3 system register CPU interface */
 	struct static_key_false gicv3_cpuif;
 
+	/* GICv3 compat mode on a GICv5 host */
+	bool			has_gcie_v3_compat;
+
 	u32			ich_vtr_el2;
 };
 
-- 
cgit v1.2.3


From c56f97c5c71f17d781461d44acb777cd21521b81 Mon Sep 17 00:00:00 2001
From: "Yury Norov [NVIDIA]" <yury.norov@gmail.com>
Date: Thu, 19 Jun 2025 14:26:23 -0400
Subject: bitmap: generalize node_random()

Generalize node_random() and make it available to general bitmaps and
cpumasks users.

Notice, find_first_bit() is generally faster than find_nth_bit(), and we
employ it when there's a single set bit in the bitmap.

See commit 3e061d924fe9c7b4 ("lib/nodemask: optimize node_random for
nodemask with single NUMA node").

CC: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: "Yury Norov [NVIDIA]" <yury.norov@gmail.com>
---
 include/linux/find.h     |  2 ++
 include/linux/nodemask.h | 18 +++---------------
 lib/find_bit.c           | 24 ++++++++++++++++++++++++
 3 files changed, 29 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/find.h b/include/linux/find.h
index 5a2c267ea7f9..98c61838002c 100644
--- a/include/linux/find.h
+++ b/include/linux/find.h
@@ -44,6 +44,8 @@ unsigned long _find_next_bit_le(const unsigned long *addr, unsigned
 				long size, unsigned long offset);
 #endif
 
+unsigned long find_random_bit(const unsigned long *addr, unsigned long size);
+
 #ifndef find_next_bit
 /**
  * find_next_bit - find the next set bit in a memory region
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index f08ae71585fa..7ad1f5c7407e 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -492,21 +492,9 @@ static __always_inline int num_node_state(enum node_states state)
 static __always_inline int node_random(const nodemask_t *maskp)
 {
 #if defined(CONFIG_NUMA) && (MAX_NUMNODES > 1)
-	int w, bit;
-
-	w = nodes_weight(*maskp);
-	switch (w) {
-	case 0:
-		bit = NUMA_NO_NODE;
-		break;
-	case 1:
-		bit = first_node(*maskp);
-		break;
-	default:
-		bit = find_nth_bit(maskp->bits, MAX_NUMNODES, get_random_u32_below(w));
-		break;
-	}
-	return bit;
+	int node = find_random_bit(maskp->bits, MAX_NUMNODES);
+
+	return node < MAX_NUMNODES ? node : NUMA_NO_NODE;
 #else
 	return 0;
 #endif
diff --git a/lib/find_bit.c b/lib/find_bit.c
index 06b6342aa3ae..d4b5a29e3e72 100644
--- a/lib/find_bit.c
+++ b/lib/find_bit.c
@@ -18,6 +18,7 @@
 #include <linux/math.h>
 #include <linux/minmax.h>
 #include <linux/swab.h>
+#include <linux/random.h>
 
 /*
  * Common helper for find_bit() function family
@@ -291,3 +292,26 @@ EXPORT_SYMBOL(_find_next_bit_le);
 #endif
 
 #endif /* __BIG_ENDIAN */
+
+/**
+ * find_random_bit - find a set bit at random position
+ * @addr: The address to base the search on
+ * @size: The bitmap size in bits
+ *
+ * Returns: the position of a random set bit, or >= @size if no bit is set
+ */
+unsigned long find_random_bit(const unsigned long *addr, unsigned long size)
+{
+	int w = bitmap_weight(addr, size);
+
+	switch (w) {
+	case 0:
+		return size;
+	case 1:
+		/* Performance trick for single-bit bitmaps */
+		return find_first_bit(addr, size);
+	default:
+		return find_nth_bit(addr, size, get_random_u32_below(w));
+	}
+}
+EXPORT_SYMBOL(find_random_bit);
-- 
cgit v1.2.3


From 012b1043420c0bc62e52902499de40b66f37fd6a Mon Sep 17 00:00:00 2001
From: "Yury Norov [NVIDIA]" <yury.norov@gmail.com>
Date: Thu, 19 Jun 2025 14:26:24 -0400
Subject: cpumask: introduce cpumask_random()

Propagate find_random_bit() to cpumask API.

CC: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: "Yury Norov [NVIDIA]" <yury.norov@gmail.com>
---
 include/linux/cpumask.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 7ae80a7ca81e..39b71b662da3 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -354,6 +354,18 @@ unsigned int cpumask_next_wrap(int n, const struct cpumask *src)
 	return find_next_bit_wrap(cpumask_bits(src), small_cpumask_bits, n + 1);
 }
 
+/**
+ * cpumask_random - get random cpu in *src.
+ * @src: cpumask pointer
+ *
+ * Return: random set bit, or >= nr_cpu_ids if @src is empty.
+ */
+static __always_inline
+unsigned int cpumask_random(const struct cpumask *src)
+{
+	return find_random_bit(cpumask_bits(src), nr_cpu_ids);
+}
+
 /**
  * for_each_cpu - iterate over every cpu in a mask
  * @cpu: the (optionally unsigned) integer iterator
-- 
cgit v1.2.3


From f4e1fb04c12384fb1b69a95c33527b515a652a74 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@google.com>
Date: Wed, 2 Jul 2025 22:35:16 +0000
Subject: af_unix: Use cached value for SOCK_STREAM in unix_inq_len().

Compared to TCP, ioctl(SIOCINQ) for AF_UNIX SOCK_STREAM socket is more
expensive, as unix_inq_len() requires iterating through the receive queue
and accumulating skb->len.

Let's cache the value for SOCK_STREAM to a new field during sendmsg()
and recvmsg().

The field is protected by the receive queue lock.

Note that ioctl(SIOCINQ) for SOCK_DGRAM returns the length of the first
skb in the queue.

SOCK_SEQPACKET still requires iterating through the queue because we do
not touch functions shared with unix_dgram_ops.  But, if really needed,
we can support it by switching __skb_try_recv_datagram() to a custom
version.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20250702223606.1054680-5-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/af_unix.h |  1 +
 net/unix/af_unix.c    | 38 ++++++++++++++++++++++++++++----------
 2 files changed, 29 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 1af1841b7601..603f8cd026e5 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -47,6 +47,7 @@ struct unix_sock {
 #define peer_wait		peer_wq.wait
 	wait_queue_entry_t	peer_wake;
 	struct scm_stat		scm_stat;
+	int			inq_len;
 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
 	struct sk_buff		*oob_skb;
 #endif
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 94596d6c37e9..d9e604295a71 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2297,6 +2297,7 @@ static int queue_oob(struct sock *sk, struct msghdr *msg, struct sock *other,
 
 	spin_lock(&other->sk_receive_queue.lock);
 	WRITE_ONCE(ousk->oob_skb, skb);
+	WRITE_ONCE(ousk->inq_len, ousk->inq_len + 1);
 	__skb_queue_tail(&other->sk_receive_queue, skb);
 	spin_unlock(&other->sk_receive_queue.lock);
 
@@ -2319,6 +2320,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
 	struct sock *sk = sock->sk;
 	struct sk_buff *skb = NULL;
 	struct sock *other = NULL;
+	struct unix_sock *otheru;
 	struct scm_cookie scm;
 	bool fds_sent = false;
 	int err, sent = 0;
@@ -2342,14 +2344,16 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
 	if (msg->msg_namelen) {
 		err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
 		goto out_err;
-	} else {
-		other = unix_peer(sk);
-		if (!other) {
-			err = -ENOTCONN;
-			goto out_err;
-		}
 	}
 
+	other = unix_peer(sk);
+	if (!other) {
+		err = -ENOTCONN;
+		goto out_err;
+	}
+
+	otheru = unix_sk(other);
+
 	if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
 		goto out_pipe;
 
@@ -2417,7 +2421,12 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
 
 		unix_maybe_add_creds(skb, sk, other);
 		scm_stat_add(other, skb);
-		skb_queue_tail(&other->sk_receive_queue, skb);
+
+		spin_lock(&other->sk_receive_queue.lock);
+		WRITE_ONCE(otheru->inq_len, otheru->inq_len + skb->len);
+		__skb_queue_tail(&other->sk_receive_queue, skb);
+		spin_unlock(&other->sk_receive_queue.lock);
+
 		unix_state_unlock(other);
 		other->sk_data_ready(other);
 		sent += size;
@@ -2704,6 +2713,7 @@ static int unix_stream_recv_urg(struct unix_stream_read_state *state)
 
 	if (!(state->flags & MSG_PEEK)) {
 		WRITE_ONCE(u->oob_skb, NULL);
+		WRITE_ONCE(u->inq_len, u->inq_len - 1);
 
 		if (oob_skb->prev != (struct sk_buff *)&sk->sk_receive_queue &&
 		    !unix_skb_len(oob_skb->prev)) {
@@ -2808,6 +2818,8 @@ static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
 		return -EAGAIN;
 	}
 
+	WRITE_ONCE(u->inq_len, u->inq_len - skb->len);
+
 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
 	if (skb == u->oob_skb) {
 		WRITE_ONCE(u->oob_skb, NULL);
@@ -2988,7 +3000,11 @@ unlock:
 			if (unix_skb_len(skb))
 				break;
 
-			skb_unlink(skb, &sk->sk_receive_queue);
+			spin_lock(&sk->sk_receive_queue.lock);
+			WRITE_ONCE(u->inq_len, u->inq_len - skb->len);
+			__skb_unlink(skb, &sk->sk_receive_queue);
+			spin_unlock(&sk->sk_receive_queue.lock);
+
 			consume_skb(skb);
 
 			if (scm.fp)
@@ -3159,9 +3175,11 @@ long unix_inq_len(struct sock *sk)
 	if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
 		return -EINVAL;
 
+	if (sk->sk_type == SOCK_STREAM)
+		return READ_ONCE(unix_sk(sk)->inq_len);
+
 	spin_lock(&sk->sk_receive_queue.lock);
-	if (sk->sk_type == SOCK_STREAM ||
-	    sk->sk_type == SOCK_SEQPACKET) {
+	if (sk->sk_type == SOCK_SEQPACKET) {
 		skb_queue_walk(&sk->sk_receive_queue, skb)
 			amount += unix_skb_len(skb);
 	} else {
-- 
cgit v1.2.3


From df30285b3670bf52e1e5512e4d4482bec5e93c16 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@google.com>
Date: Wed, 2 Jul 2025 22:35:18 +0000
Subject: af_unix: Introduce SO_INQ.

We have an application that uses almost the same code for TCP and
AF_UNIX (SOCK_STREAM).

TCP can use TCP_INQ, but AF_UNIX doesn't have it and requires an
extra syscall, ioctl(SIOCINQ) or getsockopt(SO_MEMINFO) as an
alternative.

Let's introduce the generic version of TCP_INQ.

If SO_INQ is enabled, recvmsg() will put a cmsg of SCM_INQ that
contains the exact value of ioctl(SIOCINQ).  The cmsg is also
included when msg->msg_get_inq is non-zero to make sockets
io_uring-friendly.

Note that SOCK_CUSTOM_SOCKOPT is flagged only for SOCK_STREAM to
override setsockopt() for SOL_SOCKET.

By having the flag in struct unix_sock, instead of struct sock, we
can later add SO_INQ support for TCP and reuse tcp_sk(sk)->recvmsg_inq.

Note also that supporting custom getsockopt() for SOL_SOCKET will need
preparation for other SOCK_CUSTOM_SOCKOPT users (UDP, vsock, MPTCP).

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20250702223606.1054680-7-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 arch/alpha/include/uapi/asm/socket.h  |  3 ++
 arch/mips/include/uapi/asm/socket.h   |  3 ++
 arch/parisc/include/uapi/asm/socket.h |  3 ++
 arch/sparc/include/uapi/asm/socket.h  |  3 ++
 include/net/af_unix.h                 |  1 +
 include/uapi/asm-generic/socket.h     |  3 ++
 net/unix/af_unix.c                    | 62 +++++++++++++++++++++++++++++++++--
 7 files changed, 76 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index 8f1f18adcdb5..5ef57f88df6b 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -152,6 +152,9 @@
 
 #define SO_PASSRIGHTS		83
 
+#define SO_INQ			84
+#define SCM_INQ			SO_INQ
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index 31ac655b7837..72fb1b006da9 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -163,6 +163,9 @@
 
 #define SO_PASSRIGHTS		83
 
+#define SO_INQ			84
+#define SCM_INQ			SO_INQ
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index 1f2d5b7a7f5d..c16ec36dfee6 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -144,6 +144,9 @@
 
 #define SO_PASSRIGHTS		0x4051
 
+#define SO_INQ			0x4052
+#define SCM_INQ			SO_INQ
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index adcba7329386..71befa109e1c 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -145,6 +145,9 @@
 
 #define SO_PASSRIGHTS            0x005c
 
+#define SO_INQ                   0x005d
+#define SCM_INQ                  SO_INQ
+
 #if !defined(__KERNEL__)
 
 
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 603f8cd026e5..34f53dde65ce 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -48,6 +48,7 @@ struct unix_sock {
 	wait_queue_entry_t	peer_wake;
 	struct scm_stat		scm_stat;
 	int			inq_len;
+	bool			recvmsg_inq;
 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
 	struct sk_buff		*oob_skb;
 #endif
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index f333a0ac4ee4..53b5a8c002b1 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -147,6 +147,9 @@
 
 #define SO_PASSRIGHTS		83
 
+#define SO_INQ			84
+#define SCM_INQ			SO_INQ
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index c3dd41596d89..7a92733706fe 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -934,6 +934,52 @@ static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
 #define unix_show_fdinfo NULL
 #endif
 
+static bool unix_custom_sockopt(int optname)
+{
+	switch (optname) {
+	case SO_INQ:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static int unix_setsockopt(struct socket *sock, int level, int optname,
+			   sockptr_t optval, unsigned int optlen)
+{
+	struct unix_sock *u = unix_sk(sock->sk);
+	struct sock *sk = sock->sk;
+	int val;
+
+	if (level != SOL_SOCKET)
+		return -EOPNOTSUPP;
+
+	if (!unix_custom_sockopt(optname))
+		return sock_setsockopt(sock, level, optname, optval, optlen);
+
+	if (optlen != sizeof(int))
+		return -EINVAL;
+
+	if (copy_from_sockptr(&val, optval, sizeof(val)))
+		return -EFAULT;
+
+	switch (optname) {
+	case SO_INQ:
+		if (sk->sk_type != SOCK_STREAM)
+			return -EINVAL;
+
+		if (val > 1 || val < 0)
+			return -EINVAL;
+
+		WRITE_ONCE(u->recvmsg_inq, val);
+		break;
+	default:
+		return -ENOPROTOOPT;
+	}
+
+	return 0;
+}
+
 static const struct proto_ops unix_stream_ops = {
 	.family =	PF_UNIX,
 	.owner =	THIS_MODULE,
@@ -950,6 +996,7 @@ static const struct proto_ops unix_stream_ops = {
 #endif
 	.listen =	unix_listen,
 	.shutdown =	unix_shutdown,
+	.setsockopt =	unix_setsockopt,
 	.sendmsg =	unix_stream_sendmsg,
 	.recvmsg =	unix_stream_recvmsg,
 	.read_skb =	unix_stream_read_skb,
@@ -1116,6 +1163,7 @@ static int unix_create(struct net *net, struct socket *sock, int protocol,
 
 	switch (sock->type) {
 	case SOCK_STREAM:
+		set_bit(SOCK_CUSTOM_SOCKOPT, &sock->flags);
 		sock->ops = &unix_stream_ops;
 		break;
 		/*
@@ -1847,6 +1895,9 @@ static int unix_accept(struct socket *sock, struct socket *newsock,
 	skb_free_datagram(sk, skb);
 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
 
+	if (tsk->sk_type == SOCK_STREAM)
+		set_bit(SOCK_CUSTOM_SOCKOPT, &newsock->flags);
+
 	/* attach accepted sock to socket */
 	unix_state_lock(tsk);
 	unix_update_edges(unix_sk(tsk));
@@ -3034,10 +3085,17 @@ unlock:
 	} while (size);
 
 	mutex_unlock(&u->iolock);
-	if (msg)
+	if (msg) {
 		scm_recv_unix(sock, msg, &scm, flags);
-	else
+
+		if (READ_ONCE(u->recvmsg_inq) || msg->msg_get_inq) {
+			msg->msg_inq = READ_ONCE(u->inq_len);
+			put_cmsg(msg, SOL_SOCKET, SCM_INQ,
+				 sizeof(msg->msg_inq), &msg->msg_inq);
+		}
+	} else {
 		scm_destroy(&scm);
+	}
 out:
 	return copied ? : err;
 }
-- 
cgit v1.2.3


From eb1ac9ff6c4a5720b1a1476233be374c5dc44bff Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@google.com>
Date: Wed, 2 Jul 2025 16:01:31 -0700
Subject: ipv6: anycast: Don't hold RTNL for IPV6_JOIN_ANYCAST.

inet6_sk(sk)->ipv6_ac_list is protected by lock_sock().

In ipv6_sock_ac_join(), only __dev_get_by_index(), __dev_get_by_flags(),
and __in6_dev_get() require RTNL.

__dev_get_by_flags() is only used by ipv6_sock_ac_join() and can be
converted to RCU version.

Let's use the RCU version of the helper and drop RTNL from
IPV6_JOIN_ANYCAST.

setsockopt_needs_rtnl() will be removed in the next patch.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250702230210.3115355-15-kuni1840@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h |  4 ++--
 net/core/dev.c            | 38 ++++++++++++++++++--------------------
 net/ipv6/anycast.c        | 22 ++++++++++++++--------
 net/ipv6/ipv6_sockglue.c  |  4 ----
 4 files changed, 34 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5847c20994d3..a80d21a14612 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3332,8 +3332,8 @@ int dev_get_iflink(const struct net_device *dev);
 int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb);
 int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
 			  struct net_device_path_stack *stack);
-struct net_device *__dev_get_by_flags(struct net *net, unsigned short flags,
-				      unsigned short mask);
+struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short flags,
+					unsigned short mask);
 struct net_device *dev_get_by_name(struct net *net, const char *name);
 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name);
 struct net_device *__dev_get_by_name(struct net *net, const char *name);
diff --git a/net/core/dev.c b/net/core/dev.c
index fe677ccec5b0..e365b099484e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1267,33 +1267,31 @@ struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 
 /**
- *	__dev_get_by_flags - find any device with given flags
- *	@net: the applicable net namespace
- *	@if_flags: IFF_* values
- *	@mask: bitmask of bits in if_flags to check
+ * dev_get_by_flags_rcu - find any device with given flags
+ * @net: the applicable net namespace
+ * @if_flags: IFF_* values
+ * @mask: bitmask of bits in if_flags to check
  *
- *	Search for any interface with the given flags. Returns NULL if a device
- *	is not found or a pointer to the device. Must be called inside
- *	rtnl_lock(), and result refcount is unchanged.
+ * Search for any interface with the given flags.
+ *
+ * Context: rcu_read_lock() must be held.
+ * Returns: NULL if a device is not found or a pointer to the device.
  */
-
-struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
-				      unsigned short mask)
+struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
+					unsigned short mask)
 {
-	struct net_device *dev, *ret;
-
-	ASSERT_RTNL();
+	struct net_device *dev;
 
-	ret = NULL;
-	for_each_netdev(net, dev) {
-		if (((dev->flags ^ if_flags) & mask) == 0) {
-			ret = dev;
-			break;
+	for_each_netdev_rcu(net, dev) {
+		if (((READ_ONCE(dev->flags) ^ if_flags) & mask) == 0) {
+			dev_hold(dev);
+			return dev;
 		}
 	}
-	return ret;
+
+	return NULL;
 }
-EXPORT_SYMBOL(__dev_get_by_flags);
+EXPORT_IPV6_MOD(dev_get_by_flags_rcu);
 
 /**
  *	dev_valid_name - check if name is okay for network device
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index fd3d104c6c05..53cf68e0242b 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -73,15 +73,13 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
 	struct inet6_dev *idev;
 	int err = 0, ishost;
 
-	ASSERT_RTNL();
-
 	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 	if (ipv6_addr_is_multicast(addr))
 		return -EINVAL;
 
 	if (ifindex)
-		dev = __dev_get_by_index(net, ifindex);
+		dev = dev_get_by_index(net, ifindex);
 
 	if (ipv6_chk_addr_and_flags(net, addr, dev, true, 0, IFA_F_TENTATIVE)) {
 		err = -EINVAL;
@@ -102,18 +100,22 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
 	if (ifindex == 0) {
 		struct rt6_info *rt;
 
+		rcu_read_lock();
 		rt = rt6_lookup(net, addr, NULL, 0, NULL, 0);
 		if (rt) {
-			dev = rt->dst.dev;
+			dev = dst_dev(&rt->dst);
+			dev_hold(dev);
 			ip6_rt_put(rt);
 		} else if (ishost) {
+			rcu_read_unlock();
 			err = -EADDRNOTAVAIL;
 			goto error;
 		} else {
 			/* router, no matching interface: just pick one */
-			dev = __dev_get_by_flags(net, IFF_UP,
-						 IFF_UP | IFF_LOOPBACK);
+			dev = dev_get_by_flags_rcu(net, IFF_UP,
+						   IFF_UP | IFF_LOOPBACK);
 		}
+		rcu_read_unlock();
 	}
 
 	if (!dev) {
@@ -121,7 +123,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
 		goto error;
 	}
 
-	idev = __in6_dev_get(dev);
+	idev = in6_dev_get(dev);
 	if (!idev) {
 		if (ifindex)
 			err = -ENODEV;
@@ -144,7 +146,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
 		if (ishost)
 			err = -EADDRNOTAVAIL;
 		if (err)
-			goto error;
+			goto error_idev;
 	}
 
 	err = __ipv6_dev_ac_inc(idev, addr);
@@ -154,7 +156,11 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
 		pac = NULL;
 	}
 
+error_idev:
+	in6_dev_put(idev);
 error:
+	dev_put(dev);
+
 	if (pac)
 		sock_kfree_s(sk, pac, sizeof(*pac));
 	return err;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 3d891aa6e7f5..702dc33e50ad 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -119,10 +119,6 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk,
 
 static bool setsockopt_needs_rtnl(int optname)
 {
-	switch (optname) {
-	case IPV6_JOIN_ANYCAST:
-		return true;
-	}
 	return false;
 }
 
-- 
cgit v1.2.3


From 98269398c02ab20eb9ed6d77416023a2627049d8 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Tue, 8 Jul 2025 12:31:39 +0300
Subject: RDMA/uverbs: Add empty rdma_uattrs_has_raw_cap() declaration

The call to rdma_uattrs_has_raw_cap() is placed in the mlx5 fs.c file,
which is compiled regardless of CONFIG_INFINIBAND_USER_ACCESS.

Although the check is used only in flows with
CONFIG_INFINIBAND_USER_ACCESS=y|m, builds with
CONFIG_INFINIBAND_USER_ACCESS=n fail with the following modpost error:

>> ERROR: modpost: "rdma_uattrs_has_raw_cap" [drivers/infiniband/hw/mlx5/mlx5_ib.ko] undefined!

Fixes: f458ccd2aa2c ("RDMA/uverbs: Check CAP_NET_RAW in user namespace for flow create")
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202507080725.bh7xrhpg-lkp@intel.com/
Link: https://patch.msgid.link/72dee6b379bd709255a5d8e8010b576d50e47170.1751967071.git.leon@kernel.org
Reviewed-by: Zhu Yanjun <yanjun.zhu@linux.dev>
Reviewed-by: Parav Pandit <parav@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 include/rdma/ib_verbs.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 010594dc755b..1d123812a1f9 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -4810,15 +4810,19 @@ struct ib_ucontext *ib_uverbs_get_ucontext_file(struct ib_uverbs_file *ufile);
 
 #if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)
 int uverbs_destroy_def_handler(struct uverbs_attr_bundle *attrs);
+bool rdma_uattrs_has_raw_cap(const struct uverbs_attr_bundle *attrs);
 #else
 static inline int uverbs_destroy_def_handler(struct uverbs_attr_bundle *attrs)
 {
 	return 0;
 }
+static inline bool
+rdma_uattrs_has_raw_cap(const struct uverbs_attr_bundle *attrs)
+{
+	return false;
+}
 #endif
 
-bool rdma_uattrs_has_raw_cap(const struct uverbs_attr_bundle *attrs);
-
 struct net_device *rdma_alloc_netdev(struct ib_device *device, u32 port_num,
 				     enum rdma_netdev_t type, const char *name,
 				     unsigned char name_assign_type,
-- 
cgit v1.2.3


From be1ba9ed221ffb95a8bb15f4c83d0694225ba808 Mon Sep 17 00:00:00 2001
From: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Date: Mon, 9 Jun 2025 21:35:13 +0300
Subject: wifi: mac80211: avoid weird state in error path

If we get to the error path of ieee80211_prep_connection, for example
because of a FW issue, then ieee80211_vif_set_links is called
with 0.
But the call to drv_change_vif_links from ieee80211_vif_update_links
will probably fail as well, for the same reason.
In this case, the valid_links and active_links bitmaps will be reverted
to the value of the failing connection.
Then, in the next connection, due to the logic of
ieee80211_set_vif_links_bitmaps, valid_links will be set to the ID of
the new connection assoc link, but the active_links will remain with the
ID of the old connection's assoc link.
If those IDs are different, we get into a weird state of valid_links and
active_links being different. One of the consequences of this state is
to call drv_change_vif_links with new_links as 0, since the & operation
between the bitmaps will be 0.

Since a removal of a link should always succeed, ignore the return value
of drv_change_vif_links if it was called to only remove links, which is
the case for the ieee80211_prep_connection's error path.
That way, the bitmaps will not be reverted to have the value from the
failing connection and will have 0, so the next connection will have a
good state.

Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Reviewed-by: Johannes Berg <johannes.berg@intel.com>
Link: https://patch.msgid.link/20250609213231.ba2011fb435f.Id87ff6dab5e1cf757b54094ac2d714c656165059@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 2 ++
 net/mac80211/link.c    | 9 ++++++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index dcd5969bb559..a61ffdbf99be 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -4477,6 +4477,8 @@ struct ieee80211_prep_tx_info {
  *	new links bitmaps may be 0 if going from/to a non-MLO situation.
  *	The @old array contains pointers to the old bss_conf structures
  *	that were already removed, in case they're needed.
+ *	Note that removal of link should always succeed, so the return value
+ *	will be ignored in a removal only case.
  *	This callback can sleep.
  * @change_sta_links: Change the valid links of a station, similar to
  *	@change_vif_links. This callback can sleep.
diff --git a/net/mac80211/link.c b/net/mac80211/link.c
index 4f7b7d0f64f2..d71eabe5abf8 100644
--- a/net/mac80211/link.c
+++ b/net/mac80211/link.c
@@ -2,7 +2,7 @@
 /*
  * MLO link handling
  *
- * Copyright (C) 2022-2024 Intel Corporation
+ * Copyright (C) 2022-2025 Intel Corporation
  */
 #include <linux/slab.h>
 #include <linux/kernel.h>
@@ -368,6 +368,13 @@ static int ieee80211_vif_update_links(struct ieee80211_sub_if_data *sdata,
 			ieee80211_update_apvlan_links(sdata);
 	}
 
+	/*
+	 * Ignore errors if we are only removing links as removal should
+	 * always succeed
+	 */
+	if (!new_links)
+		ret = 0;
+
 	if (ret) {
 		/* restore config */
 		memcpy(sdata->link, old_data, sizeof(old_data));
-- 
cgit v1.2.3


From 76164ca0d113e6a9f3033f948c739586fc606ed1 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Date: Tue, 1 Jul 2025 10:57:57 +0200
Subject: vdso/vsyscall: Split up __arch_update_vsyscall() into
 __arch_update_vdso_clock()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The upcoming auxiliary clocks need this hook, too.
To separate the architecture hooks from the timekeeper internals, refactor
the hook to only operate on a single vDSO clock.

While at it, use a more robust #define for the hook override.

Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/20250701-vdso-auxclock-v1-3-df7d9f87b9b8@linutronix.de
---
 arch/arm64/include/asm/vdso/vsyscall.h | 7 +++----
 include/asm-generic/vdso/vsyscall.h    | 6 +++---
 kernel/time/vsyscall.c                 | 3 ++-
 3 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/include/asm/vdso/vsyscall.h b/arch/arm64/include/asm/vdso/vsyscall.h
index de58951b8df6..417aae5763a8 100644
--- a/arch/arm64/include/asm/vdso/vsyscall.h
+++ b/arch/arm64/include/asm/vdso/vsyscall.h
@@ -13,12 +13,11 @@
  * Update the vDSO data page to keep in sync with kernel timekeeping.
  */
 static __always_inline
-void __arm64_update_vsyscall(struct vdso_time_data *vdata)
+void __arch_update_vdso_clock(struct vdso_clock *vc)
 {
-	vdata->clock_data[CS_HRES_COARSE].mask	= VDSO_PRECISION_MASK;
-	vdata->clock_data[CS_RAW].mask		= VDSO_PRECISION_MASK;
+	vc->mask	= VDSO_PRECISION_MASK;
 }
-#define __arch_update_vsyscall __arm64_update_vsyscall
+#define __arch_update_vdso_clock __arch_update_vdso_clock
 
 /* The asm-generic header needs to be included after the definitions above */
 #include <asm-generic/vdso/vsyscall.h>
diff --git a/include/asm-generic/vdso/vsyscall.h b/include/asm-generic/vdso/vsyscall.h
index b550afa15ecd..7fc0b560007d 100644
--- a/include/asm-generic/vdso/vsyscall.h
+++ b/include/asm-generic/vdso/vsyscall.h
@@ -22,11 +22,11 @@ static __always_inline const struct vdso_rng_data *__arch_get_vdso_u_rng_data(vo
 
 #endif /* CONFIG_GENERIC_VDSO_DATA_STORE */
 
-#ifndef __arch_update_vsyscall
-static __always_inline void __arch_update_vsyscall(struct vdso_time_data *vdata)
+#ifndef __arch_update_vdso_clock
+static __always_inline void __arch_update_vdso_clock(struct vdso_clock *vc)
 {
 }
-#endif /* __arch_update_vsyscall */
+#endif /* __arch_update_vdso_clock */
 
 #ifndef __arch_sync_vdso_time_data
 static __always_inline void __arch_sync_vdso_time_data(struct vdso_time_data *vdata)
diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c
index d655df259733..df6bada2d58e 100644
--- a/kernel/time/vsyscall.c
+++ b/kernel/time/vsyscall.c
@@ -118,7 +118,8 @@ void update_vsyscall(struct timekeeper *tk)
 	if (clock_mode != VDSO_CLOCKMODE_NONE)
 		update_vdso_time_data(vdata, tk);
 
-	__arch_update_vsyscall(vdata);
+	__arch_update_vdso_clock(&vc[CS_HRES_COARSE]);
+	__arch_update_vdso_clock(&vc[CS_RAW]);
 
 	vdso_write_end(vdata);
 
-- 
cgit v1.2.3


From ad64d71d7409a0602b50ee71c7f9663a3385c286 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Date: Tue, 1 Jul 2025 10:57:58 +0200
Subject: vdso/helpers: Add helpers for seqlocks of single vdso_clock
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Auxiliary clocks will have their vDSO data in a dedicated 'struct vdso_clock',
which needs to be synchronized independently.

Add a helper to synchronize a single vDSO clock.

[ tglx: Move the SMP memory barriers to the call sites and get rid of the
  	confusing first/last arguments and conditional barriers ]

Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/20250701-vdso-auxclock-v1-4-df7d9f87b9b8@linutronix.de
---
 include/vdso/helpers.h | 50 ++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 38 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/vdso/helpers.h b/include/vdso/helpers.h
index 0a98fed550ba..1a5ee9d9052c 100644
--- a/include/vdso/helpers.h
+++ b/include/vdso/helpers.h
@@ -28,17 +28,47 @@ static __always_inline u32 vdso_read_retry(const struct vdso_clock *vc,
 	return seq != start;
 }
 
-static __always_inline void vdso_write_begin(struct vdso_time_data *vd)
+static __always_inline void vdso_write_seq_begin(struct vdso_clock *vc)
 {
-	struct vdso_clock *vc = vd->clock_data;
+	/*
+	 * WRITE_ONCE() is required otherwise the compiler can validly tear
+	 * updates to vc->seq and it is possible that the value seen by the
+	 * reader is inconsistent.
+	 */
+	WRITE_ONCE(vc->seq, vc->seq + 1);
+}
 
+static __always_inline void vdso_write_seq_end(struct vdso_clock *vc)
+{
 	/*
 	 * WRITE_ONCE() is required otherwise the compiler can validly tear
-	 * updates to vd[x].seq and it is possible that the value seen by the
+	 * updates to vc->seq and it is possible that the value seen by the
 	 * reader is inconsistent.
 	 */
-	WRITE_ONCE(vc[CS_HRES_COARSE].seq, vc[CS_HRES_COARSE].seq + 1);
-	WRITE_ONCE(vc[CS_RAW].seq, vc[CS_RAW].seq + 1);
+	WRITE_ONCE(vc->seq, vc->seq + 1);
+}
+
+static __always_inline void vdso_write_begin_clock(struct vdso_clock *vc)
+{
+	vdso_write_seq_begin(vc);
+	/* Ensure the sequence invalidation is visible before data is modified */
+	smp_wmb();
+}
+
+static __always_inline void vdso_write_end_clock(struct vdso_clock *vc)
+{
+	/* Ensure the data update is visible before the sequence is set valid again */
+	smp_wmb();
+	vdso_write_seq_end(vc);
+}
+
+static __always_inline void vdso_write_begin(struct vdso_time_data *vd)
+{
+	struct vdso_clock *vc = vd->clock_data;
+
+	vdso_write_seq_begin(&vc[CS_HRES_COARSE]);
+	vdso_write_seq_begin(&vc[CS_RAW]);
+	/* Ensure the sequence invalidation is visible before data is modified */
 	smp_wmb();
 }
 
@@ -46,14 +76,10 @@ static __always_inline void vdso_write_end(struct vdso_time_data *vd)
 {
 	struct vdso_clock *vc = vd->clock_data;
 
+	/* Ensure the data update is visible before the sequence is set valid again */
 	smp_wmb();
-	/*
-	 * WRITE_ONCE() is required otherwise the compiler can validly tear
-	 * updates to vd[x].seq and it is possible that the value seen by the
-	 * reader is inconsistent.
-	 */
-	WRITE_ONCE(vc[CS_HRES_COARSE].seq, vc[CS_HRES_COARSE].seq + 1);
-	WRITE_ONCE(vc[CS_RAW].seq, vc[CS_RAW].seq + 1);
+	vdso_write_seq_end(&vc[CS_HRES_COARSE]);
+	vdso_write_seq_end(&vc[CS_RAW]);
 }
 
 #endif /* !__ASSEMBLY__ */
-- 
cgit v1.2.3


From f0df91b6a7120d85c873f5e77bc183fb6eccda16 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 9 Jun 2025 21:35:19 +0300
Subject: wifi: cfg80211: hide scan internals

Hide the internal scan fields from mac80211 and drivers: the
'notified' variable is for internal tracking, and the 'info'
is output that's passed to cfg80211_scan_done() and stored
only for delayed userspace notification.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Reviewed-by: Benjamin Berg <benjamin.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20250609213231.6a62e41858e2.I004f66e9c087cc6e6ae4a24951cf470961ee9466@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h  |   6 --
 net/wireless/core.c     |   4 +-
 net/wireless/core.h     |  11 ++-
 net/wireless/nl80211.c  |  97 +++++++++++++------------
 net/wireless/rdev-ops.h |   6 +-
 net/wireless/scan.c     | 188 +++++++++++++++++++++++++-----------------------
 net/wireless/sme.c      |  40 +++++------
 net/wireless/trace.h    |  23 +++---
 8 files changed, 196 insertions(+), 179 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 4a092da3a9de..5d5ad7926877 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -2748,8 +2748,6 @@ struct cfg80211_scan_6ghz_params {
  * @wiphy: the wiphy this was for
  * @scan_start: time (in jiffies) when the scan started
  * @wdev: the wireless device to scan for
- * @info: (internal) information about completed scan
- * @notified: (internal) scan request was notified as done or aborted
  * @no_cck: used to send probe requests at non CCK rate in 2GHz band
  * @mac_addr: MAC address used with randomisation
  * @mac_addr_mask: MAC address mask used with randomisation, bits that
@@ -2780,12 +2778,8 @@ struct cfg80211_scan_request {
 	u8 mac_addr[ETH_ALEN] __aligned(2);
 	u8 mac_addr_mask[ETH_ALEN] __aligned(2);
 	u8 bssid[ETH_ALEN] __aligned(2);
-
-	/* internal */
 	struct wiphy *wiphy;
 	unsigned long scan_start;
-	struct cfg80211_scan_info info;
-	bool notified;
 	bool no_cck;
 	bool scan_6ghz;
 	u32 n_6ghz_params;
diff --git a/net/wireless/core.c b/net/wireless/core.c
index f3cd70757ef2..a7e2931ffb2e 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -239,7 +239,7 @@ void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev,
 
 	rdev->opencount--;
 
-	if (rdev->scan_req && rdev->scan_req->wdev == wdev) {
+	if (rdev->scan_req && rdev->scan_req->req.wdev == wdev) {
 		if (WARN_ON(!rdev->scan_req->notified &&
 			    (!rdev->int_scan_req ||
 			     !rdev->int_scan_req->notified)))
@@ -1574,7 +1574,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
 	case NETDEV_DOWN:
 		wiphy_lock(&rdev->wiphy);
 		cfg80211_update_iface_num(rdev, wdev->iftype, -1);
-		if (rdev->scan_req && rdev->scan_req->wdev == wdev) {
+		if (rdev->scan_req && rdev->scan_req->req.wdev == wdev) {
 			if (WARN_ON(!rdev->scan_req->notified &&
 				    (!rdev->int_scan_req ||
 				     !rdev->int_scan_req->notified)))
diff --git a/net/wireless/core.h b/net/wireless/core.h
index c56a35040caa..b6bd7f4d6385 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -21,6 +21,13 @@
 
 #define WIPHY_IDX_INVALID	-1
 
+struct cfg80211_scan_request_int {
+	struct cfg80211_scan_info info;
+	bool notified;
+	/* must be last - variable members */
+	struct cfg80211_scan_request req;
+};
+
 struct cfg80211_registered_device {
 	const struct cfg80211_ops *ops;
 	struct list_head list;
@@ -70,8 +77,8 @@ struct cfg80211_registered_device {
 	struct rb_root bss_tree;
 	u32 bss_generation;
 	u32 bss_entries;
-	struct cfg80211_scan_request *scan_req; /* protected by RTNL */
-	struct cfg80211_scan_request *int_scan_req;
+	struct cfg80211_scan_request_int *scan_req; /* protected by RTNL */
+	struct cfg80211_scan_request_int *int_scan_req;
 	struct sk_buff *scan_msg;
 	struct list_head sched_scan_req_list;
 	time64_t suspend_at;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 70ca74a75f22..18f27f193772 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -9831,12 +9831,12 @@ nl80211_check_scan_flags(struct wiphy *wiphy, struct wireless_dev *wdev,
 		mac_addr = req->mac_addr;
 		mac_addr_mask = req->mac_addr_mask;
 	} else {
-		struct cfg80211_scan_request *req = request;
+		struct cfg80211_scan_request_int *req = request;
 
 		randomness_flag = NL80211_FEATURE_SCAN_RANDOM_MAC_ADDR;
-		flags = &req->flags;
-		mac_addr = req->mac_addr;
-		mac_addr_mask = req->mac_addr_mask;
+		flags = &req->req.flags;
+		mac_addr = req->req.mac_addr;
+		mac_addr_mask = req->req.mac_addr_mask;
 	}
 
 	*flags = nla_get_u32(attrs[NL80211_ATTR_SCAN_FLAGS]);
@@ -9891,7 +9891,7 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
 {
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	struct wireless_dev *wdev = info->user_ptr[1];
-	struct cfg80211_scan_request *request;
+	struct cfg80211_scan_request_int *request;
 	struct nlattr *scan_freqs = NULL;
 	bool scan_freqs_khz = false;
 	struct nlattr *attr;
@@ -9943,21 +9943,21 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
 	if (ie_len > wiphy->max_scan_ie_len)
 		return -EINVAL;
 
-	size = struct_size(request, channels, n_channels);
+	size = struct_size(request, req.channels, n_channels);
 	ssids_offset = size;
-	size = size_add(size, array_size(sizeof(*request->ssids), n_ssids));
+	size = size_add(size, array_size(sizeof(*request->req.ssids), n_ssids));
 	ie_offset = size;
 	size = size_add(size, ie_len);
 	request = kzalloc(size, GFP_KERNEL);
 	if (!request)
 		return -ENOMEM;
-	request->n_channels = n_channels;
+	request->req.n_channels = n_channels;
 
 	if (n_ssids)
-		request->ssids = (void *)request + ssids_offset;
-	request->n_ssids = n_ssids;
+		request->req.ssids = (void *)request + ssids_offset;
+	request->req.n_ssids = n_ssids;
 	if (ie_len)
-		request->ie = (void *)request + ie_offset;
+		request->req.ie = (void *)request + ie_offset;
 
 	i = 0;
 	if (scan_freqs) {
@@ -9980,7 +9980,7 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
 			    !cfg80211_wdev_channel_allowed(wdev, chan))
 				continue;
 
-			request->channels[i] = chan;
+			request->req.channels[i] = chan;
 			i++;
 		}
 	} else {
@@ -10001,7 +10001,7 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
 				    !cfg80211_wdev_channel_allowed(wdev, chan))
 					continue;
 
-				request->channels[i] = chan;
+				request->req.channels[i] = chan;
 				i++;
 			}
 		}
@@ -10012,10 +10012,10 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
 		goto out_free;
 	}
 
-	request->n_channels = i;
+	request->req.n_channels = i;
 
-	for (i = 0; i < request->n_channels; i++) {
-		struct ieee80211_channel *chan = request->channels[i];
+	for (i = 0; i < request->req.n_channels; i++) {
+		struct ieee80211_channel *chan = request->req.channels[i];
 
 		/* if we can go off-channel to the target channel we're good */
 		if (cfg80211_off_channel_oper_allowed(wdev, chan))
@@ -10034,22 +10034,23 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
 				err = -EINVAL;
 				goto out_free;
 			}
-			request->ssids[i].ssid_len = nla_len(attr);
-			memcpy(request->ssids[i].ssid, nla_data(attr), nla_len(attr));
+			request->req.ssids[i].ssid_len = nla_len(attr);
+			memcpy(request->req.ssids[i].ssid,
+			       nla_data(attr), nla_len(attr));
 			i++;
 		}
 	}
 
 	if (info->attrs[NL80211_ATTR_IE]) {
-		request->ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
-		memcpy((void *)request->ie,
+		request->req.ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
+		memcpy((void *)request->req.ie,
 		       nla_data(info->attrs[NL80211_ATTR_IE]),
-		       request->ie_len);
+		       request->req.ie_len);
 	}
 
 	for (i = 0; i < NUM_NL80211_BANDS; i++)
 		if (wiphy->bands[i])
-			request->rates[i] =
+			request->req.rates[i] =
 				(1 << wiphy->bands[i]->n_bitrates) - 1;
 
 	if (info->attrs[NL80211_ATTR_SCAN_SUPP_RATES]) {
@@ -10069,16 +10070,16 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
 			err = ieee80211_get_ratemask(wiphy->bands[band],
 						     nla_data(attr),
 						     nla_len(attr),
-						     &request->rates[band]);
+						     &request->req.rates[band]);
 			if (err)
 				goto out_free;
 		}
 	}
 
 	if (info->attrs[NL80211_ATTR_MEASUREMENT_DURATION]) {
-		request->duration =
+		request->req.duration =
 			nla_get_u16(info->attrs[NL80211_ATTR_MEASUREMENT_DURATION]);
-		request->duration_mandatory =
+		request->req.duration_mandatory =
 			nla_get_flag(info->attrs[NL80211_ATTR_MEASUREMENT_DURATION_MANDATORY]);
 	}
 
@@ -10087,7 +10088,7 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
 	if (err)
 		goto out_free;
 
-	request->no_cck =
+	request->req.no_cck =
 		nla_get_flag(info->attrs[NL80211_ATTR_TX_NO_CCK_RATE]);
 
 	/* Initial implementation used NL80211_ATTR_MAC to set the specific
@@ -10100,19 +10101,21 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
 	 * (NL80211_ATTR_SCAN_FLAGS is used to enable random MAC address use).
 	 */
 	if (info->attrs[NL80211_ATTR_BSSID])
-		memcpy(request->bssid,
+		memcpy(request->req.bssid,
 		       nla_data(info->attrs[NL80211_ATTR_BSSID]), ETH_ALEN);
-	else if (!(request->flags & NL80211_SCAN_FLAG_RANDOM_ADDR) &&
+	else if (!(request->req.flags & NL80211_SCAN_FLAG_RANDOM_ADDR) &&
 		 info->attrs[NL80211_ATTR_MAC])
-		memcpy(request->bssid, nla_data(info->attrs[NL80211_ATTR_MAC]),
+		memcpy(request->req.bssid,
+		       nla_data(info->attrs[NL80211_ATTR_MAC]),
 		       ETH_ALEN);
 	else
-		eth_broadcast_addr(request->bssid);
+		eth_broadcast_addr(request->req.bssid);
 
-	request->tsf_report_link_id = nl80211_link_id_or_invalid(info->attrs);
-	request->wdev = wdev;
-	request->wiphy = &rdev->wiphy;
-	request->scan_start = jiffies;
+	request->req.tsf_report_link_id =
+		nl80211_link_id_or_invalid(info->attrs);
+	request->req.wdev = wdev;
+	request->req.wiphy = &rdev->wiphy;
+	request->req.scan_start = jiffies;
 
 	rdev->scan_req = request;
 	err = cfg80211_scan(rdev);
@@ -18414,7 +18417,7 @@ void nl80211_notify_iface(struct cfg80211_registered_device *rdev,
 static int nl80211_add_scan_req(struct sk_buff *msg,
 				struct cfg80211_registered_device *rdev)
 {
-	struct cfg80211_scan_request *req = rdev->scan_req;
+	struct cfg80211_scan_request_int *req = rdev->scan_req;
 	struct nlattr *nest;
 	int i;
 	struct cfg80211_scan_info *info;
@@ -18425,19 +18428,20 @@ static int nl80211_add_scan_req(struct sk_buff *msg,
 	nest = nla_nest_start_noflag(msg, NL80211_ATTR_SCAN_SSIDS);
 	if (!nest)
 		goto nla_put_failure;
-	for (i = 0; i < req->n_ssids; i++) {
-		if (nla_put(msg, i, req->ssids[i].ssid_len, req->ssids[i].ssid))
+	for (i = 0; i < req->req.n_ssids; i++) {
+		if (nla_put(msg, i, req->req.ssids[i].ssid_len,
+			    req->req.ssids[i].ssid))
 			goto nla_put_failure;
 	}
 	nla_nest_end(msg, nest);
 
-	if (req->flags & NL80211_SCAN_FLAG_FREQ_KHZ) {
+	if (req->req.flags & NL80211_SCAN_FLAG_FREQ_KHZ) {
 		nest = nla_nest_start(msg, NL80211_ATTR_SCAN_FREQ_KHZ);
 		if (!nest)
 			goto nla_put_failure;
-		for (i = 0; i < req->n_channels; i++) {
+		for (i = 0; i < req->req.n_channels; i++) {
 			if (nla_put_u32(msg, i,
-				   ieee80211_channel_to_khz(req->channels[i])))
+					ieee80211_channel_to_khz(req->req.channels[i])))
 				goto nla_put_failure;
 		}
 		nla_nest_end(msg, nest);
@@ -18446,19 +18450,20 @@ static int nl80211_add_scan_req(struct sk_buff *msg,
 					     NL80211_ATTR_SCAN_FREQUENCIES);
 		if (!nest)
 			goto nla_put_failure;
-		for (i = 0; i < req->n_channels; i++) {
-			if (nla_put_u32(msg, i, req->channels[i]->center_freq))
+		for (i = 0; i < req->req.n_channels; i++) {
+			if (nla_put_u32(msg, i,
+					req->req.channels[i]->center_freq))
 				goto nla_put_failure;
 		}
 		nla_nest_end(msg, nest);
 	}
 
-	if (req->ie &&
-	    nla_put(msg, NL80211_ATTR_IE, req->ie_len, req->ie))
+	if (req->req.ie &&
+	    nla_put(msg, NL80211_ATTR_IE, req->req.ie_len, req->req.ie))
 		goto nla_put_failure;
 
-	if (req->flags &&
-	    nla_put_u32(msg, NL80211_ATTR_SCAN_FLAGS, req->flags))
+	if (req->req.flags &&
+	    nla_put_u32(msg, NL80211_ATTR_SCAN_FLAGS, req->req.flags))
 		goto nla_put_failure;
 
 	info = rdev->int_scan_req ? &rdev->int_scan_req->info :
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index 803b39c26587..ac6884bacf3f 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -456,15 +456,15 @@ rdev_set_monitor_channel(struct cfg80211_registered_device *rdev,
 }
 
 static inline int rdev_scan(struct cfg80211_registered_device *rdev,
-			    struct cfg80211_scan_request *request)
+			    struct cfg80211_scan_request_int *request)
 {
 	int ret;
 
-	if (WARN_ON_ONCE(!request->n_ssids && request->ssids))
+	if (WARN_ON_ONCE(!request->req.n_ssids && request->req.ssids))
 		return -EINVAL;
 
 	trace_rdev_scan(&rdev->wiphy, request);
-	ret = rdev->ops->scan(&rdev->wiphy, request);
+	ret = rdev->ops->scan(&rdev->wiphy, &request->req);
 	trace_rdev_return_int(&rdev->wiphy, ret);
 	return ret;
 }
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index e8a4fe44ec2d..a75cecc47d78 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -782,9 +782,9 @@ cfg80211_parse_colocated_ap(const struct cfg80211_bss_ies *ies,
 }
 EXPORT_SYMBOL_IF_CFG80211_KUNIT(cfg80211_parse_colocated_ap);
 
-static  void cfg80211_scan_req_add_chan(struct cfg80211_scan_request *request,
-					struct ieee80211_channel *chan,
-					bool add_to_6ghz)
+static void cfg80211_scan_req_add_chan(struct cfg80211_scan_request *request,
+				       struct ieee80211_channel *chan,
+				       bool add_to_6ghz)
 {
 	int i;
 	u32 n_channels = request->n_channels;
@@ -843,25 +843,25 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev)
 	u8 i;
 	struct cfg80211_colocated_ap *ap;
 	int n_channels, count = 0, err;
-	struct cfg80211_scan_request *request, *rdev_req = rdev->scan_req;
+	struct cfg80211_scan_request_int *request, *rdev_req = rdev->scan_req;
 	LIST_HEAD(coloc_ap_list);
 	bool need_scan_psc = true;
 	const struct ieee80211_sband_iftype_data *iftd;
 	size_t size, offs_ssids, offs_6ghz_params, offs_ies;
 
-	rdev_req->scan_6ghz = true;
+	rdev_req->req.scan_6ghz = true;
 
 	if (!rdev->wiphy.bands[NL80211_BAND_6GHZ])
 		return -EOPNOTSUPP;
 
 	iftd = ieee80211_get_sband_iftype_data(rdev->wiphy.bands[NL80211_BAND_6GHZ],
-					       rdev_req->wdev->iftype);
+					       rdev_req->req.wdev->iftype);
 	if (!iftd || !iftd->he_cap.has_he)
 		return -EOPNOTSUPP;
 
 	n_channels = rdev->wiphy.bands[NL80211_BAND_6GHZ]->n_channels;
 
-	if (rdev_req->flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ) {
+	if (rdev_req->req.flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ) {
 		struct cfg80211_internal_bss *intbss;
 
 		spin_lock_bh(&rdev->bss_lock);
@@ -883,8 +883,8 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev)
 			 * This is relevant for ML probe requests when the lower
 			 * band APs have not been discovered.
 			 */
-			if (is_broadcast_ether_addr(rdev_req->bssid) ||
-			    !ether_addr_equal(rdev_req->bssid, res->bssid) ||
+			if (is_broadcast_ether_addr(rdev_req->req.bssid) ||
+			    !ether_addr_equal(rdev_req->req.bssid, res->bssid) ||
 			    res->channel->band != NL80211_BAND_6GHZ)
 				continue;
 
@@ -911,13 +911,13 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev)
 		spin_unlock_bh(&rdev->bss_lock);
 	}
 
-	size = struct_size(request, channels, n_channels);
+	size = struct_size(request, req.channels, n_channels);
 	offs_ssids = size;
-	size += sizeof(*request->ssids) * rdev_req->n_ssids;
+	size += sizeof(*request->req.ssids) * rdev_req->req.n_ssids;
 	offs_6ghz_params = size;
-	size += sizeof(*request->scan_6ghz_params) * count;
+	size += sizeof(*request->req.scan_6ghz_params) * count;
 	offs_ies = size;
-	size += rdev_req->ie_len;
+	size += rdev_req->req.ie_len;
 
 	request = kzalloc(size, GFP_KERNEL);
 	if (!request) {
@@ -926,26 +926,26 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev)
 	}
 
 	*request = *rdev_req;
-	request->n_channels = 0;
-	request->n_6ghz_params = 0;
-	if (rdev_req->n_ssids) {
+	request->req.n_channels = 0;
+	request->req.n_6ghz_params = 0;
+	if (rdev_req->req.n_ssids) {
 		/*
 		 * Add the ssids from the parent scan request to the new
 		 * scan request, so the driver would be able to use them
 		 * in its probe requests to discover hidden APs on PSC
 		 * channels.
 		 */
-		request->ssids = (void *)request + offs_ssids;
-		memcpy(request->ssids, rdev_req->ssids,
-		       sizeof(*request->ssids) * request->n_ssids);
+		request->req.ssids = (void *)request + offs_ssids;
+		memcpy(request->req.ssids, rdev_req->req.ssids,
+		       sizeof(*request->req.ssids) * request->req.n_ssids);
 	}
-	request->scan_6ghz_params = (void *)request + offs_6ghz_params;
+	request->req.scan_6ghz_params = (void *)request + offs_6ghz_params;
 
-	if (rdev_req->ie_len) {
+	if (rdev_req->req.ie_len) {
 		void *ie = (void *)request + offs_ies;
 
-		memcpy(ie, rdev_req->ie, rdev_req->ie_len);
-		request->ie = ie;
+		memcpy(ie, rdev_req->req.ie, rdev_req->req.ie_len);
+		request->req.ie = ie;
 	}
 
 	/*
@@ -953,10 +953,12 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev)
 	 * and at least one of the reported co-located APs with same SSID
 	 * indicating that all APs in the same ESS are co-located
 	 */
-	if (count && request->n_ssids == 1 && request->ssids[0].ssid_len) {
+	if (count &&
+	    request->req.n_ssids == 1 &&
+	    request->req.ssids[0].ssid_len) {
 		list_for_each_entry(ap, &coloc_ap_list, list) {
 			if (ap->colocated_ess &&
-			    cfg80211_find_ssid_match(ap, request)) {
+			    cfg80211_find_ssid_match(ap, &request->req)) {
 				need_scan_psc = false;
 				break;
 			}
@@ -968,51 +970,52 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev)
 	 * regardless of the collocated APs (PSC channels or all channels
 	 * in case that NL80211_SCAN_FLAG_COLOCATED_6GHZ is not set)
 	 */
-	for (i = 0; i < rdev_req->n_channels; i++) {
-		if (rdev_req->channels[i]->band == NL80211_BAND_6GHZ &&
+	for (i = 0; i < rdev_req->req.n_channels; i++) {
+		if (rdev_req->req.channels[i]->band == NL80211_BAND_6GHZ &&
 		    ((need_scan_psc &&
-		      cfg80211_channel_is_psc(rdev_req->channels[i])) ||
-		     !(rdev_req->flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ))) {
-			cfg80211_scan_req_add_chan(request,
-						   rdev_req->channels[i],
+		      cfg80211_channel_is_psc(rdev_req->req.channels[i])) ||
+		     !(rdev_req->req.flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ))) {
+			cfg80211_scan_req_add_chan(&request->req,
+						   rdev_req->req.channels[i],
 						   false);
 		}
 	}
 
-	if (!(rdev_req->flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ))
+	if (!(rdev_req->req.flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ))
 		goto skip;
 
 	list_for_each_entry(ap, &coloc_ap_list, list) {
 		bool found = false;
 		struct cfg80211_scan_6ghz_params *scan_6ghz_params =
-			&request->scan_6ghz_params[request->n_6ghz_params];
+			&request->req.scan_6ghz_params[request->req.n_6ghz_params];
 		struct ieee80211_channel *chan =
 			ieee80211_get_channel(&rdev->wiphy, ap->center_freq);
 
 		if (!chan || chan->flags & IEEE80211_CHAN_DISABLED ||
-		    !cfg80211_wdev_channel_allowed(rdev_req->wdev, chan))
+		    !cfg80211_wdev_channel_allowed(rdev_req->req.wdev, chan))
 			continue;
 
-		for (i = 0; i < rdev_req->n_channels; i++) {
-			if (rdev_req->channels[i] == chan)
+		for (i = 0; i < rdev_req->req.n_channels; i++) {
+			if (rdev_req->req.channels[i] == chan)
 				found = true;
 		}
 
 		if (!found)
 			continue;
 
-		if (request->n_ssids > 0 &&
-		    !cfg80211_find_ssid_match(ap, request))
+		if (request->req.n_ssids > 0 &&
+		    !cfg80211_find_ssid_match(ap, &request->req))
 			continue;
 
-		if (!is_broadcast_ether_addr(request->bssid) &&
-		    !ether_addr_equal(request->bssid, ap->bssid))
+		if (!is_broadcast_ether_addr(request->req.bssid) &&
+		    !ether_addr_equal(request->req.bssid, ap->bssid))
 			continue;
 
-		if (!request->n_ssids && ap->multi_bss && !ap->transmitted_bssid)
+		if (!request->req.n_ssids && ap->multi_bss &&
+		    !ap->transmitted_bssid)
 			continue;
 
-		cfg80211_scan_req_add_chan(request, chan, true);
+		cfg80211_scan_req_add_chan(&request->req, chan, true);
 		memcpy(scan_6ghz_params->bssid, ap->bssid, ETH_ALEN);
 		scan_6ghz_params->short_ssid = ap->short_ssid;
 		scan_6ghz_params->short_ssid_valid = ap->short_ssid_valid;
@@ -1028,14 +1031,14 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev)
 		if (cfg80211_channel_is_psc(chan) && !need_scan_psc)
 			scan_6ghz_params->psc_no_listen = true;
 
-		request->n_6ghz_params++;
+		request->req.n_6ghz_params++;
 	}
 
 skip:
 	cfg80211_free_coloc_ap_list(&coloc_ap_list);
 
-	if (request->n_channels) {
-		struct cfg80211_scan_request *old = rdev->int_scan_req;
+	if (request->req.n_channels) {
+		struct cfg80211_scan_request_int *old = rdev->int_scan_req;
 
 		rdev->int_scan_req = request;
 
@@ -1063,35 +1066,36 @@ skip:
 
 int cfg80211_scan(struct cfg80211_registered_device *rdev)
 {
-	struct cfg80211_scan_request *request;
-	struct cfg80211_scan_request *rdev_req = rdev->scan_req;
+	struct cfg80211_scan_request_int *request;
+	struct cfg80211_scan_request_int *rdev_req = rdev->scan_req;
 	u32 n_channels = 0, idx, i;
 
 	if (!(rdev->wiphy.flags & WIPHY_FLAG_SPLIT_SCAN_6GHZ))
 		return rdev_scan(rdev, rdev_req);
 
-	for (i = 0; i < rdev_req->n_channels; i++) {
-		if (rdev_req->channels[i]->band != NL80211_BAND_6GHZ)
+	for (i = 0; i < rdev_req->req.n_channels; i++) {
+		if (rdev_req->req.channels[i]->band != NL80211_BAND_6GHZ)
 			n_channels++;
 	}
 
 	if (!n_channels)
 		return cfg80211_scan_6ghz(rdev);
 
-	request = kzalloc(struct_size(request, channels, n_channels),
+	request = kzalloc(struct_size(request, req.channels, n_channels),
 			  GFP_KERNEL);
 	if (!request)
 		return -ENOMEM;
 
 	*request = *rdev_req;
-	request->n_channels = n_channels;
+	request->req.n_channels = n_channels;
 
-	for (i = idx = 0; i < rdev_req->n_channels; i++) {
-		if (rdev_req->channels[i]->band != NL80211_BAND_6GHZ)
-			request->channels[idx++] = rdev_req->channels[i];
+	for (i = idx = 0; i < rdev_req->req.n_channels; i++) {
+		if (rdev_req->req.channels[i]->band != NL80211_BAND_6GHZ)
+			request->req.channels[idx++] =
+				rdev_req->req.channels[i];
 	}
 
-	rdev_req->scan_6ghz = false;
+	rdev_req->req.scan_6ghz = false;
 	rdev->int_scan_req = request;
 	return rdev_scan(rdev, request);
 }
@@ -1099,7 +1103,7 @@ int cfg80211_scan(struct cfg80211_registered_device *rdev)
 void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev,
 			   bool send_message)
 {
-	struct cfg80211_scan_request *request, *rdev_req;
+	struct cfg80211_scan_request_int *request, *rdev_req;
 	struct wireless_dev *wdev;
 	struct sk_buff *msg;
 #ifdef CONFIG_CFG80211_WEXT
@@ -1118,12 +1122,12 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev,
 	if (!rdev_req)
 		return;
 
-	wdev = rdev_req->wdev;
+	wdev = rdev_req->req.wdev;
 	request = rdev->int_scan_req ? rdev->int_scan_req : rdev_req;
 
 	if (wdev_running(wdev) &&
 	    (rdev->wiphy.flags & WIPHY_FLAG_SPLIT_SCAN_6GHZ) &&
-	    !rdev_req->scan_6ghz && !request->info.aborted &&
+	    !rdev_req->req.scan_6ghz && !request->info.aborted &&
 	    !cfg80211_scan_6ghz(rdev))
 		return;
 
@@ -1136,10 +1140,10 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev,
 		cfg80211_sme_scan_done(wdev->netdev);
 
 	if (!request->info.aborted &&
-	    request->flags & NL80211_SCAN_FLAG_FLUSH) {
+	    request->req.flags & NL80211_SCAN_FLAG_FLUSH) {
 		/* flush entries from previous scans */
 		spin_lock_bh(&rdev->bss_lock);
-		__cfg80211_bss_expire(rdev, request->scan_start);
+		__cfg80211_bss_expire(rdev, request->req.scan_start);
 		spin_unlock_bh(&rdev->bss_lock);
 	}
 
@@ -1175,13 +1179,16 @@ void __cfg80211_scan_done(struct wiphy *wiphy, struct wiphy_work *wk)
 void cfg80211_scan_done(struct cfg80211_scan_request *request,
 			struct cfg80211_scan_info *info)
 {
-	struct cfg80211_scan_info old_info = request->info;
+	struct cfg80211_scan_request_int *intreq =
+		container_of(request, struct cfg80211_scan_request_int, req);
+	struct cfg80211_registered_device *rdev = wiphy_to_rdev(request->wiphy);
+	struct cfg80211_scan_info old_info = intreq->info;
 
-	trace_cfg80211_scan_done(request, info);
-	WARN_ON(request != wiphy_to_rdev(request->wiphy)->scan_req &&
-		request != wiphy_to_rdev(request->wiphy)->int_scan_req);
+	trace_cfg80211_scan_done(intreq, info);
+	WARN_ON(intreq != rdev->scan_req &&
+		intreq != rdev->int_scan_req);
 
-	request->info = *info;
+	intreq->info = *info;
 
 	/*
 	 * In case the scan is split, the scan_start_tsf and tsf_bssid should
@@ -1189,14 +1196,13 @@ void cfg80211_scan_done(struct cfg80211_scan_request *request,
 	 * be non zero.
 	 */
 	if (request->scan_6ghz && old_info.scan_start_tsf) {
-		request->info.scan_start_tsf = old_info.scan_start_tsf;
-		memcpy(request->info.tsf_bssid, old_info.tsf_bssid,
-		       sizeof(request->info.tsf_bssid));
+		intreq->info.scan_start_tsf = old_info.scan_start_tsf;
+		memcpy(intreq->info.tsf_bssid, old_info.tsf_bssid,
+		       sizeof(intreq->info.tsf_bssid));
 	}
 
-	request->notified = true;
-	wiphy_work_queue(request->wiphy,
-			 &wiphy_to_rdev(request->wiphy)->scan_done_wk);
+	intreq->notified = true;
+	wiphy_work_queue(request->wiphy, &rdev->scan_done_wk);
 }
 EXPORT_SYMBOL(cfg80211_scan_done);
 
@@ -3496,7 +3502,7 @@ int cfg80211_wext_siwscan(struct net_device *dev,
 	struct cfg80211_registered_device *rdev;
 	struct wiphy *wiphy;
 	struct iw_scan_req *wreq = NULL;
-	struct cfg80211_scan_request *creq;
+	struct cfg80211_scan_request_int *creq;
 	int i, err, n_channels = 0;
 	enum nl80211_band band;
 
@@ -3526,19 +3532,20 @@ int cfg80211_wext_siwscan(struct net_device *dev,
 		n_channels = ieee80211_get_num_supported_channels(wiphy);
 	}
 
-	creq = kzalloc(struct_size(creq, channels, n_channels) +
+	creq = kzalloc(struct_size(creq, req.channels, n_channels) +
 		       sizeof(struct cfg80211_ssid),
 		       GFP_ATOMIC);
 	if (!creq)
 		return -ENOMEM;
 
-	creq->wiphy = wiphy;
-	creq->wdev = dev->ieee80211_ptr;
+	creq->req.wiphy = wiphy;
+	creq->req.wdev = dev->ieee80211_ptr;
 	/* SSIDs come after channels */
-	creq->ssids = (void *)creq + struct_size(creq, channels, n_channels);
-	creq->n_channels = n_channels;
-	creq->n_ssids = 1;
-	creq->scan_start = jiffies;
+	creq->req.ssids = (void *)creq +
+			  struct_size(creq, req.channels, n_channels);
+	creq->req.n_channels = n_channels;
+	creq->req.n_ssids = 1;
+	creq->req.scan_start = jiffies;
 
 	/* translate "Scan on frequencies" request */
 	i = 0;
@@ -3554,7 +3561,7 @@ int cfg80211_wext_siwscan(struct net_device *dev,
 			/* ignore disabled channels */
 			chan = &wiphy->bands[band]->channels[j];
 			if (chan->flags & IEEE80211_CHAN_DISABLED ||
-			    !cfg80211_wdev_channel_allowed(creq->wdev, chan))
+			    !cfg80211_wdev_channel_allowed(creq->req.wdev, chan))
 				continue;
 
 			/* If we have a wireless request structure and the
@@ -3577,7 +3584,8 @@ int cfg80211_wext_siwscan(struct net_device *dev,
 			}
 
 		wext_freq_found:
-			creq->channels[i] = &wiphy->bands[band]->channels[j];
+			creq->req.channels[i] =
+				&wiphy->bands[band]->channels[j];
 			i++;
 		wext_freq_not_found: ;
 		}
@@ -3588,28 +3596,30 @@ int cfg80211_wext_siwscan(struct net_device *dev,
 		goto out;
 	}
 
-	/* Set real number of channels specified in creq->channels[] */
-	creq->n_channels = i;
+	/* Set real number of channels specified in creq->req.channels[] */
+	creq->req.n_channels = i;
 
 	/* translate "Scan for SSID" request */
 	if (wreq) {
 		if (wrqu->data.flags & IW_SCAN_THIS_ESSID) {
 			if (wreq->essid_len > IEEE80211_MAX_SSID_LEN)
 				return -EINVAL;
-			memcpy(creq->ssids[0].ssid, wreq->essid, wreq->essid_len);
-			creq->ssids[0].ssid_len = wreq->essid_len;
+			memcpy(creq->req.ssids[0].ssid, wreq->essid,
+			       wreq->essid_len);
+			creq->req.ssids[0].ssid_len = wreq->essid_len;
 		}
 		if (wreq->scan_type == IW_SCAN_TYPE_PASSIVE) {
-			creq->ssids = NULL;
-			creq->n_ssids = 0;
+			creq->req.ssids = NULL;
+			creq->req.n_ssids = 0;
 		}
 	}
 
 	for (i = 0; i < NUM_NL80211_BANDS; i++)
 		if (wiphy->bands[i])
-			creq->rates[i] = (1 << wiphy->bands[i]->n_bitrates) - 1;
+			creq->req.rates[i] =
+				(1 << wiphy->bands[i]->n_bitrates) - 1;
 
-	eth_broadcast_addr(creq->bssid);
+	eth_broadcast_addr(creq->req.bssid);
 
 	scoped_guard(wiphy, &rdev->wiphy) {
 		rdev->scan_req = creq;
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index cf998500a965..6d7a7e7f0fc2 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -5,7 +5,7 @@
  * (for nl80211's connect() and wext)
  *
  * Copyright 2009	Johannes Berg <johannes@sipsolutions.net>
- * Copyright (C) 2009, 2020, 2022-2024 Intel Corporation. All rights reserved.
+ * Copyright (C) 2009, 2020, 2022-2025 Intel Corporation. All rights reserved.
  * Copyright 2017	Intel Deutschland GmbH
  */
 
@@ -64,7 +64,7 @@ static void cfg80211_sme_free(struct wireless_dev *wdev)
 static int cfg80211_conn_scan(struct wireless_dev *wdev)
 {
 	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
-	struct cfg80211_scan_request *request;
+	struct cfg80211_scan_request_int *request;
 	int n_channels, err;
 
 	lockdep_assert_wiphy(wdev->wiphy);
@@ -77,13 +77,13 @@ static int cfg80211_conn_scan(struct wireless_dev *wdev)
 	else
 		n_channels = ieee80211_get_num_supported_channels(wdev->wiphy);
 
-	request = kzalloc(sizeof(*request) + sizeof(request->ssids[0]) +
-			  sizeof(request->channels[0]) * n_channels,
+	request = kzalloc(sizeof(*request) + sizeof(request->req.ssids[0]) +
+			  sizeof(request->req.channels[0]) * n_channels,
 			  GFP_KERNEL);
 	if (!request)
 		return -ENOMEM;
 
-	request->n_channels = n_channels;
+	request->req.n_channels = n_channels;
 	if (wdev->conn->params.channel) {
 		enum nl80211_band band = wdev->conn->params.channel->band;
 		struct ieee80211_supported_band *sband =
@@ -93,8 +93,8 @@ static int cfg80211_conn_scan(struct wireless_dev *wdev)
 			kfree(request);
 			return -EINVAL;
 		}
-		request->channels[0] = wdev->conn->params.channel;
-		request->rates[band] = (1 << sband->n_bitrates) - 1;
+		request->req.channels[0] = wdev->conn->params.channel;
+		request->req.rates[band] = (1 << sband->n_bitrates) - 1;
 	} else {
 		int i = 0, j;
 		enum nl80211_band band;
@@ -109,26 +109,26 @@ static int cfg80211_conn_scan(struct wireless_dev *wdev)
 				channel = &bands->channels[j];
 				if (channel->flags & IEEE80211_CHAN_DISABLED)
 					continue;
-				request->channels[i++] = channel;
+				request->req.channels[i++] = channel;
 			}
-			request->rates[band] = (1 << bands->n_bitrates) - 1;
+			request->req.rates[band] = (1 << bands->n_bitrates) - 1;
 		}
 		n_channels = i;
 	}
-	request->n_channels = n_channels;
-	request->ssids = (void *)request +
-		struct_size(request, channels, n_channels);
-	request->n_ssids = 1;
+	request->req.n_channels = n_channels;
+	request->req.ssids = (void *)request +
+			     struct_size(request, req.channels, n_channels);
+	request->req.n_ssids = 1;
 
-	memcpy(request->ssids[0].ssid, wdev->conn->params.ssid,
-		wdev->conn->params.ssid_len);
-	request->ssids[0].ssid_len = wdev->conn->params.ssid_len;
+	memcpy(request->req.ssids[0].ssid, wdev->conn->params.ssid,
+	       wdev->conn->params.ssid_len);
+	request->req.ssids[0].ssid_len = wdev->conn->params.ssid_len;
 
-	eth_broadcast_addr(request->bssid);
+	eth_broadcast_addr(request->req.bssid);
 
-	request->wdev = wdev;
-	request->wiphy = &rdev->wiphy;
-	request->scan_start = jiffies;
+	request->req.wdev = wdev;
+	request->req.wiphy = &rdev->wiphy;
+	request->req.scan_start = jiffies;
 
 	rdev->scan_req = request;
 
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 7e43ab9de923..a07d88d61bec 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -373,7 +373,8 @@ TRACE_EVENT(rdev_return_int,
 );
 
 TRACE_EVENT(rdev_scan,
-	TP_PROTO(struct wiphy *wiphy, struct cfg80211_scan_request *request),
+	TP_PROTO(struct wiphy *wiphy,
+		 struct cfg80211_scan_request_int *request),
 	TP_ARGS(wiphy, request),
 	TP_STRUCT__entry(
 		WIPHY_ENTRY
@@ -3716,12 +3717,12 @@ TRACE_EVENT(cfg80211_tdls_oper_request,
 	);
 
 TRACE_EVENT(cfg80211_scan_done,
-	TP_PROTO(struct cfg80211_scan_request *request,
+	TP_PROTO(struct cfg80211_scan_request_int *request,
 		 struct cfg80211_scan_info *info),
 	TP_ARGS(request, info),
 	TP_STRUCT__entry(
 		__field(u32, n_channels)
-		__dynamic_array(u8, ie, request ? request->ie_len : 0)
+		__dynamic_array(u8, ie, request ? request->req.ie_len : 0)
 		__array(u32, rates, NUM_NL80211_BANDS)
 		__field(u32, wdev_id)
 		MAC_ENTRY(wiphy_mac)
@@ -3732,16 +3733,16 @@ TRACE_EVENT(cfg80211_scan_done,
 	),
 	TP_fast_assign(
 		if (request) {
-			memcpy(__get_dynamic_array(ie), request->ie,
-			       request->ie_len);
-			memcpy(__entry->rates, request->rates,
+			memcpy(__get_dynamic_array(ie), request->req.ie,
+			       request->req.ie_len);
+			memcpy(__entry->rates, request->req.rates,
 			       NUM_NL80211_BANDS);
-			__entry->wdev_id = request->wdev ?
-					request->wdev->identifier : 0;
-			if (request->wiphy)
+			__entry->wdev_id = request->req.wdev ?
+					request->req.wdev->identifier : 0;
+			if (request->req.wiphy)
 				MAC_ASSIGN(wiphy_mac,
-					   request->wiphy->perm_addr);
-			__entry->no_cck = request->no_cck;
+					   request->req.wiphy->perm_addr);
+			__entry->no_cck = request->req.no_cck;
 		}
 		if (info) {
 			__entry->aborted = info->aborted;
-- 
cgit v1.2.3


From afebe192ebfef7f6e0be0a070325995771bcd7e8 Mon Sep 17 00:00:00 2001
From: Benjamin Berg <benjamin.berg@intel.com>
Date: Mon, 9 Jun 2025 21:35:21 +0300
Subject: wifi: cfg80211: only verify part of Extended MLD Capabilities

We verify that the Extended MLD Capabilities are matching between links.
However, some bits are reserved and in particular the Recommended Max
Links subfield may not necessarily match. So only verify the known
subfields that can reliably be expected to be the same. More information
can be found in Table 9-417o, in IEEE P802.11be/D7.0.

Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
Reviewed-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20250609213231.a2fad48dd3e6.Iae1740cd2ac833bc4a64fd2af718e1485158fd42@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h |  9 ++++++++-
 net/wireless/mlme.c       | 21 +++++++++++++++++++--
 2 files changed, 27 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 120de474a8bf..e05219a912f9 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -9,7 +9,7 @@
  * Copyright (c) 2006, Michael Wu <flamingice@sourmilk.net>
  * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH
  * Copyright (c) 2016 - 2017 Intel Deutschland GmbH
- * Copyright (c) 2018 - 2024 Intel Corporation
+ * Copyright (c) 2018 - 2025 Intel Corporation
  */
 
 #ifndef LINUX_IEEE80211_H
@@ -5333,6 +5333,13 @@ static inline u16 ieee80211_mle_get_mld_capa_op(const u8 *data)
 	return get_unaligned_le16(common);
 }
 
+/* Defined in Figure 9-1074t in P802.11be_D7.0 */
+#define IEEE80211_EHT_ML_EXT_MLD_CAPA_OP_PARAM_UPDATE           0x0001
+#define IEEE80211_EHT_ML_EXT_MLD_CAPA_OP_RECO_MAX_LINKS_MASK    0x001e
+#define IEEE80211_EHT_ML_EXT_MLD_CAPA_NSTR_UPDATE               0x0020
+#define IEEE80211_EHT_ML_EXT_MLD_CAPA_EMLSR_ENA_ON_ONE_LINK     0x0040
+#define IEEE80211_EHT_ML_EXT_MLD_CAPA_BTM_MLD_RECO_MULTI_AP     0x0080
+
 /**
  * ieee80211_mle_get_ext_mld_capa_op - returns the extended MLD capabilities
  *	and operations.
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index 29e1ce8aff42..bb5bc6ff09d4 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -352,8 +352,25 @@ cfg80211_mlme_check_mlo_compat(const struct ieee80211_multi_link_elem *mle_a,
 		return -EINVAL;
 	}
 
-	if (ieee80211_mle_get_ext_mld_capa_op((const u8 *)mle_a) !=
-	    ieee80211_mle_get_ext_mld_capa_op((const u8 *)mle_b)) {
+	/*
+	 * Only verify the values in Extended MLD Capabilities that are
+	 * not reserved when transmitted by an AP (and expected to remain the
+	 * same over time).
+	 * The Recommended Max Simultaneous Links subfield in particular is
+	 * reserved when included in a unicast Probe Response frame and may
+	 * also change when the AP adds/removes links. The BTM MLD
+	 * Recommendation For Multiple APs Support subfield is reserved when
+	 * transmitted by an AP. All other bits are currently reserved.
+	 * See IEEE P802.11be/D7.0, Table 9-417o.
+	 */
+	if ((ieee80211_mle_get_ext_mld_capa_op((const u8 *)mle_a) &
+	     (IEEE80211_EHT_ML_EXT_MLD_CAPA_OP_PARAM_UPDATE |
+	      IEEE80211_EHT_ML_EXT_MLD_CAPA_NSTR_UPDATE |
+	      IEEE80211_EHT_ML_EXT_MLD_CAPA_EMLSR_ENA_ON_ONE_LINK)) !=
+	    (ieee80211_mle_get_ext_mld_capa_op((const u8 *)mle_b) &
+	     (IEEE80211_EHT_ML_EXT_MLD_CAPA_OP_PARAM_UPDATE |
+	      IEEE80211_EHT_ML_EXT_MLD_CAPA_NSTR_UPDATE |
+	      IEEE80211_EHT_ML_EXT_MLD_CAPA_EMLSR_ENA_ON_ONE_LINK))) {
 		NL_SET_ERR_MSG(extack,
 			       "extended link MLD capabilities/ops mismatch");
 		return -EINVAL;
-- 
cgit v1.2.3


From 984462751d57047828ff4a799cc7d4670a2cfeb2 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 9 Jun 2025 21:35:22 +0300
Subject: wifi: mac80211: remove DISALLOW_PUNCTURING_5GHZ code

Since iwlwifi was the only driver using this and no
longer does, we can remove all this code.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20250609213231.4dff5fb8890f.Ie531f912b252a0042c18c0734db50c3afe1adfb5@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 3 ---
 net/mac80211/debugfs.c | 3 +--
 net/mac80211/mlme.c    | 4 ----
 3 files changed, 1 insertion(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index a61ffdbf99be..14a6bd120f25 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -2852,8 +2852,6 @@ struct ieee80211_txq {
  *
  * @IEEE80211_HW_DISALLOW_PUNCTURING: HW requires disabling puncturing in EHT
  *	and connecting with a lower bandwidth instead
- * @IEEE80211_HW_DISALLOW_PUNCTURING_5GHZ: HW requires disabling puncturing in
- *	EHT in 5 GHz and connecting with a lower bandwidth instead
  *
  * @IEEE80211_HW_HANDLES_QUIET_CSA: HW/driver handles quieting for CSA, so
  *	no need to stop queues. This really should be set by a driver that
@@ -2923,7 +2921,6 @@ enum ieee80211_hw_flags {
 	IEEE80211_HW_DETECTS_COLOR_COLLISION,
 	IEEE80211_HW_MLO_MCAST_MULTI_LINK_TX,
 	IEEE80211_HW_DISALLOW_PUNCTURING,
-	IEEE80211_HW_DISALLOW_PUNCTURING_5GHZ,
 	IEEE80211_HW_HANDLES_QUIET_CSA,
 	IEEE80211_HW_STRICT,
 
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index 69e03630f64c..e8b78ec682da 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -4,7 +4,7 @@
  *
  * Copyright 2007	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
- * Copyright (C) 2018 - 2019, 2021-2024 Intel Corporation
+ * Copyright (C) 2018 - 2019, 2021-2025 Intel Corporation
  */
 
 #include <linux/debugfs.h>
@@ -490,7 +490,6 @@ static const char *hw_flag_names[] = {
 	FLAG(DETECTS_COLOR_COLLISION),
 	FLAG(MLO_MCAST_MULTI_LINK_TX),
 	FLAG(DISALLOW_PUNCTURING),
-	FLAG(DISALLOW_PUNCTURING_5GHZ),
 	FLAG(HANDLES_QUIET_CSA),
 	FLAG(STRICT),
 #undef FLAG
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 2c700d12eef6..75dfbb06dff2 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -776,10 +776,6 @@ static bool ieee80211_chandef_usable(struct ieee80211_sub_if_data *sdata,
 	    ieee80211_hw_check(&sdata->local->hw, DISALLOW_PUNCTURING))
 		return false;
 
-	if (chandef->punctured && chandef->chan->band == NL80211_BAND_5GHZ &&
-	    ieee80211_hw_check(&sdata->local->hw, DISALLOW_PUNCTURING_5GHZ))
-		return false;
-
 	return true;
 }
 
-- 
cgit v1.2.3


From 62c57ebb3107842482bc5e3568a0202295a8db0d Mon Sep 17 00:00:00 2001
From: Benjamin Berg <benjamin.berg@intel.com>
Date: Mon, 9 Jun 2025 21:35:23 +0300
Subject: wifi: cfg80211: add a flag for the first part of a scan

When there are no non-6 GHz channels, then the 6 GHz scan is the first
part of a split scan. Add a boolean denoting whether the scan is the
first part of a scan as it might be useful to drivers for internal
bookkeeping. This flag is also set if the scan is not split.

Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
Reviewed-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20250609213231.07e5a8a452ec.Ibf18f513e507422078fb31b28947e582a20df87a@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h |  6 +++++-
 net/wireless/scan.c    | 15 ++++++++++-----
 2 files changed, 15 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 5d5ad7926877..6ec9a8865b8b 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -2754,7 +2754,10 @@ struct cfg80211_scan_6ghz_params {
  *	are 0 in the mask should be randomised, bits that are 1 should
  *	be taken from the @mac_addr
  * @scan_6ghz: relevant for split scan request only,
- *	true if this is the second scan request
+ *	true if this is a 6 GHz scan request
+ * @first_part: %true if this is the first part of a split scan request or a
+ *	scan that was not split. May be %true for a @scan_6ghz scan if no other
+ *	channels were requested
  * @n_6ghz_params: number of 6 GHz params
  * @scan_6ghz_params: 6 GHz params
  * @bssid: BSSID to scan for (most commonly, the wildcard BSSID)
@@ -2782,6 +2785,7 @@ struct cfg80211_scan_request {
 	unsigned long scan_start;
 	bool no_cck;
 	bool scan_6ghz;
+	bool first_part;
 	u32 n_6ghz_params;
 	struct cfg80211_scan_6ghz_params *scan_6ghz_params;
 	s8 tsf_report_link_id;
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index a75cecc47d78..b963ca5c606e 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -838,7 +838,8 @@ static bool cfg80211_find_ssid_match(struct cfg80211_colocated_ap *ap,
 	return false;
 }
 
-static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev)
+static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev,
+			      bool first_part)
 {
 	u8 i;
 	struct cfg80211_colocated_ap *ap;
@@ -850,6 +851,7 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev)
 	size_t size, offs_ssids, offs_6ghz_params, offs_ies;
 
 	rdev_req->req.scan_6ghz = true;
+	rdev_req->req.first_part = first_part;
 
 	if (!rdev->wiphy.bands[NL80211_BAND_6GHZ])
 		return -EOPNOTSUPP;
@@ -1046,7 +1048,7 @@ skip:
 		 * If this scan follows a previous scan, save the scan start
 		 * info from the first part of the scan
 		 */
-		if (old)
+		if (!first_part && !WARN_ON(!old))
 			rdev->int_scan_req->info = old->info;
 
 		err = rdev_scan(rdev, request);
@@ -1070,8 +1072,10 @@ int cfg80211_scan(struct cfg80211_registered_device *rdev)
 	struct cfg80211_scan_request_int *rdev_req = rdev->scan_req;
 	u32 n_channels = 0, idx, i;
 
-	if (!(rdev->wiphy.flags & WIPHY_FLAG_SPLIT_SCAN_6GHZ))
+	if (!(rdev->wiphy.flags & WIPHY_FLAG_SPLIT_SCAN_6GHZ)) {
+		rdev_req->req.first_part = true;
 		return rdev_scan(rdev, rdev_req);
+	}
 
 	for (i = 0; i < rdev_req->req.n_channels; i++) {
 		if (rdev_req->req.channels[i]->band != NL80211_BAND_6GHZ)
@@ -1079,7 +1083,7 @@ int cfg80211_scan(struct cfg80211_registered_device *rdev)
 	}
 
 	if (!n_channels)
-		return cfg80211_scan_6ghz(rdev);
+		return cfg80211_scan_6ghz(rdev, true);
 
 	request = kzalloc(struct_size(request, req.channels, n_channels),
 			  GFP_KERNEL);
@@ -1096,6 +1100,7 @@ int cfg80211_scan(struct cfg80211_registered_device *rdev)
 	}
 
 	rdev_req->req.scan_6ghz = false;
+	rdev_req->req.first_part = true;
 	rdev->int_scan_req = request;
 	return rdev_scan(rdev, request);
 }
@@ -1128,7 +1133,7 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev,
 	if (wdev_running(wdev) &&
 	    (rdev->wiphy.flags & WIPHY_FLAG_SPLIT_SCAN_6GHZ) &&
 	    !rdev_req->req.scan_6ghz && !request->info.aborted &&
-	    !cfg80211_scan_6ghz(rdev))
+	    !cfg80211_scan_6ghz(rdev, false))
 		return;
 
 	/*
-- 
cgit v1.2.3


From a11ec0dc920b2a23da9d88063b5f15d7eada6128 Mon Sep 17 00:00:00 2001
From: Somashekhar Puttagangaiah <somashekhar.puttagangaiah@intel.com>
Date: Mon, 9 Jun 2025 21:35:26 +0300
Subject: wifi: cfg80211/mac80211: implement dot11ExtendedRegInfoSupport

Implement dot11ExtendedRegInfoSupport to advertise the non-AP station's
regulatory power capability as part of the regulatory connectivity
element in (Re)Association Request frames, so that the AP can achieve
maximum client connectivity. The control field, which was previously
interpreted using the 3-bit value in B5 to B3, now uses the 4-bit value
in B6 to B3 to indicate the type of AP. Hence update
IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO to parse the 4-bit value. If an
older AP still sets only the 3-bit value of the control field, the
station can still interpret the value as per section E.2.7 of
IEEE 802.11 REVme D7.0 and support the appropriate AP type.

Also update IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP, as the value for a
standard power AP is changed from 4 to 8 so that an AP can support both
LPI AP and SP AP to maximize connectivity with stations. For backward
compatibility, keep value 4 for old APs, limiting it to SP AP only.

Signed-off-by: Somashekhar Puttagangaiah <somashekhar.puttagangaiah@intel.com>
Reviewed-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20250609213232.90cdef116aad.I85da390fbee59355e3855691933e6a5e55c47ac4@changeid
[fix kernel-doc]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h  | 34 ++++++++++++++++++++++++++++------
 net/mac80211/ieee80211_i.h |  2 ++
 net/mac80211/mlme.c        | 41 +++++++++++++++++++++++++++++++++++++++++
 net/mac80211/util.c        | 17 +++++++++++++++++
 net/wireless/scan.c        |  1 +
 5 files changed, 89 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index e05219a912f9..ea95c4a60fa6 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -2837,11 +2837,12 @@ static inline bool ieee80211_he_capa_size_ok(const u8 *data, u8 len)
 #define IEEE80211_HE_OPERATION_PARTIAL_BSS_COLOR		0x40000000
 #define IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED		0x80000000
 
-#define IEEE80211_6GHZ_CTRL_REG_LPI_AP		0
-#define IEEE80211_6GHZ_CTRL_REG_SP_AP		1
-#define IEEE80211_6GHZ_CTRL_REG_VLP_AP		2
-#define IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP	3
-#define IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP	4
+#define IEEE80211_6GHZ_CTRL_REG_LPI_AP			0
+#define IEEE80211_6GHZ_CTRL_REG_SP_AP			1
+#define IEEE80211_6GHZ_CTRL_REG_VLP_AP			2
+#define IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP		3
+#define IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP_OLD	4
+#define IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP		8
 
 /**
  * struct ieee80211_he_6ghz_oper - HE 6 GHz operation Information field
@@ -2859,13 +2860,31 @@ struct ieee80211_he_6ghz_oper {
 #define		IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ	2
 #define		IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ	3
 #define IEEE80211_HE_6GHZ_OPER_CTRL_DUP_BEACON	0x4
-#define IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO	0x38
+#define IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO	0x78
 	u8 control;
 	u8 ccfs0;
 	u8 ccfs1;
 	u8 minrate;
 } __packed;
 
+/**
+ * enum ieee80211_reg_conn_bits - represents Regulatory connectivity field bits.
+ *
+ * This enumeration defines bit flags used to represent regulatory connectivity
+ * field bits.
+ *
+ * @IEEE80211_REG_CONN_LPI_VALID: Indicates whether the LPI bit is valid.
+ * @IEEE80211_REG_CONN_LPI_VALUE: Represents the value of the LPI bit.
+ * @IEEE80211_REG_CONN_SP_VALID: Indicates whether the SP bit is valid.
+ * @IEEE80211_REG_CONN_SP_VALUE: Represents the value of the SP bit.
+ */
+enum ieee80211_reg_conn_bits {
+	IEEE80211_REG_CONN_LPI_VALID = BIT(0),
+	IEEE80211_REG_CONN_LPI_VALUE = BIT(1),
+	IEEE80211_REG_CONN_SP_VALID = BIT(2),
+	IEEE80211_REG_CONN_SP_VALUE = BIT(3),
+};
+
 /* transmit power interpretation type of transmit power envelope element */
 enum ieee80211_tx_power_intrpt_type {
 	IEEE80211_TPE_LOCAL_EIRP,
@@ -3847,6 +3866,7 @@ enum ieee80211_eid_ext {
 	WLAN_EID_EXT_FILS_PUBLIC_KEY = 12,
 	WLAN_EID_EXT_FILS_NONCE = 13,
 	WLAN_EID_EXT_FUTURE_CHAN_GUIDANCE = 14,
+	WLAN_EID_EXT_DH_PARAMETER = 32,
 	WLAN_EID_EXT_HE_CAPABILITY = 35,
 	WLAN_EID_EXT_HE_OPERATION = 36,
 	WLAN_EID_EXT_UORA = 37,
@@ -3870,6 +3890,8 @@ enum ieee80211_eid_ext {
 	WLAN_EID_EXT_EHT_CAPABILITY = 108,
 	WLAN_EID_EXT_TID_TO_LINK_MAPPING = 109,
 	WLAN_EID_EXT_BANDWIDTH_INDICATION = 135,
+	WLAN_EID_EXT_KNOWN_STA_IDENTIFCATION = 136,
+	WLAN_EID_EXT_NON_AP_STA_REG_CON = 137,
 };
 
 /* Action category code */
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 4ef7b3656aca..ec68204fddc9 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -2642,6 +2642,8 @@ int ieee80211_put_eht_cap(struct sk_buff *skb,
 			  struct ieee80211_sub_if_data *sdata,
 			  const struct ieee80211_supported_band *sband,
 			  const struct ieee80211_conn_settings *conn);
+int ieee80211_put_reg_conn(struct sk_buff *skb,
+			   enum ieee80211_channel_flags flags);
 
 /* channel management */
 bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper,
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 1bedd8d6e891..aaff7e9c3eb7 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1641,6 +1641,30 @@ static size_t ieee80211_add_before_he_elems(struct sk_buff *skb,
 	return noffset;
 }
 
+static size_t ieee80211_add_before_reg_conn(struct sk_buff *skb,
+					    const u8 *elems, size_t elems_len,
+					    size_t offset)
+{
+	static const u8 before_reg_conn[] = {
+		/*
+		 * no need to list the ones split off before HE
+		 * or generated here
+		 */
+		WLAN_EID_EXTENSION, WLAN_EID_EXT_DH_PARAMETER,
+		WLAN_EID_EXTENSION, WLAN_EID_EXT_KNOWN_STA_IDENTIFCATION,
+	};
+	size_t noffset;
+
+	if (!elems_len)
+		return offset;
+
+	noffset = ieee80211_ie_split(elems, elems_len, before_reg_conn,
+				     ARRAY_SIZE(before_reg_conn), offset);
+	skb_put_data(skb, elems + offset, noffset - offset);
+
+	return noffset;
+}
+
 #define PRESENT_ELEMS_MAX	8
 #define PRESENT_ELEM_EXT_OFFS	0x100
 
@@ -1801,6 +1825,22 @@ ieee80211_add_link_elems(struct ieee80211_sub_if_data *sdata,
 		ieee80211_put_he_6ghz_cap(skb, sdata, smps_mode);
 	}
 
+	/*
+	 * if present, add any custom IEs that go before regulatory
+	 * connectivity element
+	 */
+	offset = ieee80211_add_before_reg_conn(skb, extra_elems,
+					       extra_elems_len, offset);
+
+	if (sband->band == NL80211_BAND_6GHZ) {
+		/*
+		 * as per Section E.2.7 of IEEE 802.11 REVme D7.0, non-AP STA
+		 * capable of operating on the 6 GHz band shall transmit
+		 * regulatory connectivity element.
+		 */
+		ieee80211_put_reg_conn(skb, chan->flags);
+	}
+
 	/*
 	 * careful - need to know about all the present elems before
 	 * calling ieee80211_assoc_add_ml_elem(), so add this one if
@@ -5931,6 +5971,7 @@ ieee80211_ap_power_type(u8 control)
 		return IEEE80211_REG_LPI_AP;
 	case IEEE80211_6GHZ_CTRL_REG_SP_AP:
 	case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP:
+	case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP_OLD:
 		return IEEE80211_REG_SP_AP;
 	case IEEE80211_6GHZ_CTRL_REG_VLP_AP:
 		return IEEE80211_REG_VLP_AP;
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index ff6c5d5e631d..a1cb63222b6d 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -2556,6 +2556,23 @@ end:
 	return 0;
 }
 
+int ieee80211_put_reg_conn(struct sk_buff *skb,
+			   enum ieee80211_channel_flags flags)
+{
+	u8 reg_conn = IEEE80211_REG_CONN_LPI_VALID |
+		      IEEE80211_REG_CONN_LPI_VALUE |
+		      IEEE80211_REG_CONN_SP_VALID;
+
+	if (!(flags & IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT))
+		reg_conn |= IEEE80211_REG_CONN_SP_VALUE;
+
+	skb_put_u8(skb, WLAN_EID_EXTENSION);
+	skb_put_u8(skb, 1 + sizeof(reg_conn));
+	skb_put_u8(skb, WLAN_EID_EXT_NON_AP_STA_REG_CON);
+	skb_put_u8(skb, reg_conn);
+	return 0;
+}
+
 int ieee80211_put_he_6ghz_cap(struct sk_buff *skb,
 			      struct ieee80211_sub_if_data *sdata,
 			      enum ieee80211_smps_mode smps_mode)
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index b963ca5c606e..a8339ed52404 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -2231,6 +2231,7 @@ cfg80211_get_6ghz_power_type(const u8 *elems, size_t elems_len)
 		return IEEE80211_REG_LPI_AP;
 	case IEEE80211_6GHZ_CTRL_REG_SP_AP:
 	case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP:
+	case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP_OLD:
 		return IEEE80211_REG_SP_AP;
 	case IEEE80211_6GHZ_CTRL_REG_VLP_AP:
 		return IEEE80211_REG_VLP_AP;
-- 
cgit v1.2.3


From 6b04716cdcac37bdbacde34def08bc6fdb5fc4e2 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 9 Jun 2025 21:35:27 +0300
Subject: wifi: mac80211: don't complete management TX on SAE commit

When SAE commit is sent and received in response, there's no
ordering for the SAE confirm messages. As such, don't call
drivers to stop listening on the channel when the confirm
message is still expected.

This fixes an issue if the local confirm is transmitted later
than the AP's confirm, for iwlwifi (and possibly mt76) the
AP's confirm would then get lost since the device isn't on
the channel at the time the AP transmits the confirm.

For iwlwifi at least, this also improves the overall timing
of the authentication handshake (by about 15ms according to
the report), likely since the session protection won't be
aborted and rescheduled.

Note that even before this, mgd_complete_tx() wasn't always
called for each call to mgd_prepare_tx() (e.g. in the case
of WEP key shared authentication), and the current drivers
that have the complete callback don't seem to mind. Document
this as well though.

Reported-by: Jan Hendrik Farr <kernel@jfarr.cc>
Closes: https://lore.kernel.org/all/aB30Ea2kRG24LINR@archlinux/
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20250609213232.12691580e140.I3f1d3127acabcd58348a110ab11044213cf147d3@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 2 ++
 net/mac80211/mlme.c    | 9 ++++++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 14a6bd120f25..577fd6a8c372 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -4310,6 +4310,8 @@ struct ieee80211_prep_tx_info {
  * @mgd_complete_tx: Notify the driver that the response frame for a previously
  *	transmitted frame announced with @mgd_prepare_tx was received, the data
  *	is filled similarly to @mgd_prepare_tx though the duration is not used.
+ *	Note that this isn't always called for each mgd_prepare_tx() call, for
+ *	example for SAE the 'confirm' messages can be on the air in any order.
  *
  * @mgd_protect_tdls_discover: Protect a TDLS discovery session. After sending
  *	a TDLS discovery-request, we expect a reply to arrive on the AP's
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index aaff7e9c3eb7..8b9c132cce3d 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -4780,6 +4780,7 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata,
 	struct ieee80211_prep_tx_info info = {
 		.subtype = IEEE80211_STYPE_AUTH,
 	};
+	bool sae_need_confirm = false;
 
 	lockdep_assert_wiphy(sdata->local->hw.wiphy);
 
@@ -4825,6 +4826,8 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata,
 				jiffies + IEEE80211_AUTH_WAIT_SAE_RETRY;
 			ifmgd->auth_data->timeout_started = true;
 			run_again(sdata, ifmgd->auth_data->timeout);
+			if (auth_transaction == 1)
+				sae_need_confirm = true;
 			goto notify_driver;
 		}
 
@@ -4867,6 +4870,9 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata,
 	     ifmgd->auth_data->expected_transaction == 2)) {
 		if (!ieee80211_mark_sta_auth(sdata))
 			return; /* ignore frame -- wait for timeout */
+	} else if (ifmgd->auth_data->algorithm == WLAN_AUTH_SAE &&
+		   auth_transaction == 1) {
+		sae_need_confirm = true;
 	} else if (ifmgd->auth_data->algorithm == WLAN_AUTH_SAE &&
 		   auth_transaction == 2) {
 		sdata_info(sdata, "SAE peer confirmed\n");
@@ -4875,7 +4881,8 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata,
 
 	cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len);
 notify_driver:
-	drv_mgd_complete_tx(sdata->local, sdata, &info);
+	if (!sae_need_confirm)
+		drv_mgd_complete_tx(sdata->local, sdata, &info);
 }
 
 #define case_WLAN(type) \
-- 
cgit v1.2.3


From 488e6eaab88cfa4b6fd2e2bb72fac9cfdc8c403b Mon Sep 17 00:00:00 2001
From: Xu Yang <xu.yang_2@nxp.com>
Date: Fri, 4 Jul 2025 17:57:49 +0800
Subject: usb: core: add dma-noncoherent buffer alloc and free API

This will add the usb_alloc_noncoherent() and usb_free_noncoherent()
functions to support allocating and freeing buffers in a dma-noncoherent
way.

To explicitly manage the memory ownership between the kernel and the
device, this will also add usb_dma_noncoherent_sync_for_cpu/device()
functions and call them at the proper time.  This management requires
the user to save the sg_table returned by usb_alloc_noncoherent() to
urb->sgt.

Signed-off-by: Xu Yang <xu.yang_2@nxp.com>
Reviewed-by: Alan Stern <stern@rowland.harvard.edu>
Link: https://lore.kernel.org/r/20250704095751.73765-2-xu.yang_2@nxp.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/hcd.c | 29 +++++++++++++-----
 drivers/usb/core/usb.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/usb.h    | 11 +++++++
 3 files changed, 112 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
index c22de97432a0..03771bbc6c01 100644
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@@ -1342,29 +1342,35 @@ void usb_hcd_unmap_urb_for_dma(struct usb_hcd *hcd, struct urb *urb)
 
 	dir = usb_urb_dir_in(urb) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
 	if (IS_ENABLED(CONFIG_HAS_DMA) &&
-	    (urb->transfer_flags & URB_DMA_MAP_SG))
+	    (urb->transfer_flags & URB_DMA_MAP_SG)) {
 		dma_unmap_sg(hcd->self.sysdev,
 				urb->sg,
 				urb->num_sgs,
 				dir);
-	else if (IS_ENABLED(CONFIG_HAS_DMA) &&
-		 (urb->transfer_flags & URB_DMA_MAP_PAGE))
+	} else if (IS_ENABLED(CONFIG_HAS_DMA) &&
+		 (urb->transfer_flags & URB_DMA_MAP_PAGE)) {
 		dma_unmap_page(hcd->self.sysdev,
 				urb->transfer_dma,
 				urb->transfer_buffer_length,
 				dir);
-	else if (IS_ENABLED(CONFIG_HAS_DMA) &&
-		 (urb->transfer_flags & URB_DMA_MAP_SINGLE))
+	} else if (IS_ENABLED(CONFIG_HAS_DMA) &&
+		 (urb->transfer_flags & URB_DMA_MAP_SINGLE)) {
 		dma_unmap_single(hcd->self.sysdev,
 				urb->transfer_dma,
 				urb->transfer_buffer_length,
 				dir);
-	else if (urb->transfer_flags & URB_MAP_LOCAL)
+	} else if (urb->transfer_flags & URB_MAP_LOCAL) {
 		hcd_free_coherent(urb->dev->bus,
 				&urb->transfer_dma,
 				&urb->transfer_buffer,
 				urb->transfer_buffer_length,
 				dir);
+	} else if ((urb->transfer_flags & URB_NO_TRANSFER_DMA_MAP) && urb->sgt) {
+		dma_sync_sgtable_for_cpu(hcd->self.sysdev, urb->sgt, dir);
+		if (dir == DMA_FROM_DEVICE)
+			invalidate_kernel_vmap_range(urb->transfer_buffer,
+						     urb->transfer_buffer_length);
+	}
 
 	/* Make it safe to call this routine more than once */
 	urb->transfer_flags &= ~(URB_DMA_MAP_SG | URB_DMA_MAP_PAGE |
@@ -1425,8 +1431,15 @@ int usb_hcd_map_urb_for_dma(struct usb_hcd *hcd, struct urb *urb,
 	}
 
 	dir = usb_urb_dir_in(urb) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
-	if (urb->transfer_buffer_length != 0
-	    && !(urb->transfer_flags & URB_NO_TRANSFER_DMA_MAP)) {
+	if (urb->transfer_flags & URB_NO_TRANSFER_DMA_MAP) {
+		if (!urb->sgt)
+			return 0;
+
+		if (dir == DMA_TO_DEVICE)
+			flush_kernel_vmap_range(urb->transfer_buffer,
+						urb->transfer_buffer_length);
+		dma_sync_sgtable_for_device(hcd->self.sysdev, urb->sgt, dir);
+	} else if (urb->transfer_buffer_length != 0) {
 		if (hcd->localmem_pool) {
 			ret = hcd_alloc_coherent(
 					urb->dev->bus, mem_flags,
diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c
index 118fa4c93a79..fca7735fc660 100644
--- a/drivers/usb/core/usb.c
+++ b/drivers/usb/core/usb.c
@@ -1030,6 +1030,86 @@ void usb_free_coherent(struct usb_device *dev, size_t size, void *addr,
 }
 EXPORT_SYMBOL_GPL(usb_free_coherent);
 
+/**
+ * usb_alloc_noncoherent - allocate dma-noncoherent buffer for URB_NO_xxx_DMA_MAP
+ * @dev: device the buffer will be used with
+ * @size: requested buffer size
+ * @mem_flags: affect whether allocation may block
+ * @dma: used to return DMA address of buffer
+ * @dir: DMA transfer direction
+ * @table: used to return sg_table of allocated memory
+ *
+ * To let USB core explicitly manage memory ownership between the kernel and
+ * the device, the user needs to save the sg_table to urb->sgt. USB core will
+ * then do the DMA sync for CPU and device properly.
+ *
+ * When the buffer is no longer used, free it with usb_free_noncoherent().
+ *
+ * Return: Either null (indicating no buffer could be allocated), or the
+ * cpu-space pointer to a buffer that may be used to perform DMA to the
+ * specified device.  Such cpu-space buffers are returned along with the DMA
+ * address (through the pointer provided).
+ */
+void *usb_alloc_noncoherent(struct usb_device *dev, size_t size,
+			    gfp_t mem_flags, dma_addr_t *dma,
+			    enum dma_data_direction dir,
+			    struct sg_table **table)
+{
+	struct device *dmadev;
+	struct sg_table *sgt;
+	void *buffer;
+
+	if (!dev || !dev->bus)
+		return NULL;
+
+	dmadev = bus_to_hcd(dev->bus)->self.sysdev;
+
+	sgt = dma_alloc_noncontiguous(dmadev, size, dir, mem_flags, 0);
+	if (!sgt)
+		return NULL;
+
+	buffer = dma_vmap_noncontiguous(dmadev, size, sgt);
+	if (!buffer) {
+		dma_free_noncontiguous(dmadev, size, sgt, dir);
+		return NULL;
+	}
+
+	*table = sgt;
+	*dma = sg_dma_address(sgt->sgl);
+
+	return buffer;
+}
+EXPORT_SYMBOL_GPL(usb_alloc_noncoherent);
+
+/**
+ * usb_free_noncoherent - free memory allocated with usb_alloc_noncoherent()
+ * @dev: device the buffer was used with
+ * @size: requested buffer size
+ * @addr: CPU address of buffer
+ * @dir: DMA transfer direction
+ * @table: describes the allocated and DMA mapped memory
+ *
+ * This reclaims an I/O buffer, letting it be reused.  The memory must have
+ * been allocated using usb_alloc_noncoherent(), and the parameters must match
+ * those provided in that allocation request.
+ */
+void usb_free_noncoherent(struct usb_device *dev, size_t size,
+			  void *addr, enum dma_data_direction dir,
+			  struct sg_table *table)
+{
+	struct device *dmadev;
+
+	if (!dev || !dev->bus)
+		return;
+	if (!addr)
+		return;
+
+	dmadev = bus_to_hcd(dev->bus)->self.sysdev;
+	dma_vunmap_noncontiguous(dmadev, addr);
+	dma_free_noncontiguous(dmadev, size, table, dir);
+}
+EXPORT_SYMBOL_GPL(usb_free_noncoherent);
+
 /*
  * Notifications of device and interface registration
  */
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 68166718ab30..535ac37198a1 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -1621,6 +1621,7 @@ struct urb {
 	void *transfer_buffer;		/* (in) associated data buffer */
 	dma_addr_t transfer_dma;	/* (in) dma addr for transfer_buffer */
 	struct scatterlist *sg;		/* (in) scatter gather buffer list */
+	struct sg_table *sgt;		/* (in) scatter gather table for noncoherent buffer */
 	int num_mapped_sgs;		/* (internal) mapped sg entries */
 	int num_sgs;			/* (in) number of entries in the sg list */
 	u32 transfer_buffer_length;	/* (in) data buffer length */
@@ -1826,6 +1827,16 @@ void *usb_alloc_coherent(struct usb_device *dev, size_t size,
 void usb_free_coherent(struct usb_device *dev, size_t size,
 	void *addr, dma_addr_t dma);
 
+enum dma_data_direction;
+
+void *usb_alloc_noncoherent(struct usb_device *dev, size_t size,
+			    gfp_t mem_flags, dma_addr_t *dma,
+			    enum dma_data_direction dir,
+			    struct sg_table **table);
+void usb_free_noncoherent(struct usb_device *dev, size_t size,
+			  void *addr, enum dma_data_direction dir,
+			  struct sg_table *table);
+
 /*-------------------------------------------------------------------*
  *                         SYNCHRONOUS CALL SUPPORT                  *
  *-------------------------------------------------------------------*/
-- 
cgit v1.2.3


From f393a761763c542761abcf978252d431269366d6 Mon Sep 17 00:00:00 2001
From: Gerd Hoffmann <kraxel@redhat.com>
Date: Tue, 8 Jul 2025 14:56:23 +0200
Subject: efi: add ovmf debug log driver

Recent OVMF versions (edk2-stable202508 + newer) can write their debug
log to a memory buffer.  This driver exposes the log content via sysfs
(/sys/firmware/efi/ovmf_debug_log).

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 drivers/firmware/efi/Kconfig          |   8 +++
 drivers/firmware/efi/Makefile         |   1 +
 drivers/firmware/efi/efi.c            |   8 +++
 drivers/firmware/efi/ovmf-debug-log.c | 111 ++++++++++++++++++++++++++++++++++
 include/linux/efi.h                   |   4 ++
 5 files changed, 132 insertions(+)
 create mode 100644 drivers/firmware/efi/ovmf-debug-log.c

(limited to 'include')

diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig
index db8c5c03d3a2..eb1bff6968a5 100644
--- a/drivers/firmware/efi/Kconfig
+++ b/drivers/firmware/efi/Kconfig
@@ -263,6 +263,14 @@ config EFI_COCO_SECRET
 	  virt/coco/efi_secret module to access the secrets, which in turn
 	  allows userspace programs to access the injected secrets.
 
+config OVMF_DEBUG_LOG
+	bool "Expose OVMF firmware debug log via sysfs"
+	depends on EFI
+	help
+	  Recent OVMF versions (edk2-stable202508 + newer) can write
+	  their debug log to a memory buffer.  This driver exposes the
+	  log content via sysfs (/sys/firmware/efi/ovmf_debug_log).
+
 config UNACCEPTED_MEMORY
 	bool
 	depends on EFI_STUB
diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile
index a2d0009560d0..8efbcf699e4f 100644
--- a/drivers/firmware/efi/Makefile
+++ b/drivers/firmware/efi/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_APPLE_PROPERTIES)		+= apple-properties.o
 obj-$(CONFIG_EFI_RCI2_TABLE)		+= rci2-table.o
 obj-$(CONFIG_EFI_EMBEDDED_FIRMWARE)	+= embedded-firmware.o
 obj-$(CONFIG_LOAD_UEFI_KEYS)		+= mokvar-table.o
+obj-$(CONFIG_OVMF_DEBUG_LOG)		+= ovmf-debug-log.o
 
 obj-$(CONFIG_SYSFB)			+= sysfb_efi.o
 
diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index e57bff702b5f..1ce428e2ac8a 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -45,6 +45,7 @@ struct efi __read_mostly efi = {
 	.esrt			= EFI_INVALID_TABLE_ADDR,
 	.tpm_log		= EFI_INVALID_TABLE_ADDR,
 	.tpm_final_log		= EFI_INVALID_TABLE_ADDR,
+	.ovmf_debug_log         = EFI_INVALID_TABLE_ADDR,
 #ifdef CONFIG_LOAD_UEFI_KEYS
 	.mokvar_table		= EFI_INVALID_TABLE_ADDR,
 #endif
@@ -473,6 +474,10 @@ static int __init efisubsys_init(void)
 		platform_device_register_simple("efi_secret", 0, NULL, 0);
 #endif
 
+	if (IS_ENABLED(CONFIG_OVMF_DEBUG_LOG) &&
+	    efi.ovmf_debug_log != EFI_INVALID_TABLE_ADDR)
+		ovmf_log_probe(efi.ovmf_debug_log);
+
 	return 0;
 
 err_remove_group:
@@ -617,6 +622,9 @@ static const efi_config_table_type_t common_tables[] __initconst = {
 	{LINUX_EFI_MEMRESERVE_TABLE_GUID,	&mem_reserve,		"MEMRESERVE"	},
 	{LINUX_EFI_INITRD_MEDIA_GUID,		&initrd,		"INITRD"	},
 	{EFI_RT_PROPERTIES_TABLE_GUID,		&rt_prop,		"RTPROP"	},
+#ifdef CONFIG_OVMF_DEBUG_LOG
+	{OVMF_MEMORY_LOG_TABLE_GUID,		&efi.ovmf_debug_log,	"OvmfDebugLog"	},
+#endif
 #ifdef CONFIG_EFI_RCI2_TABLE
 	{DELLEMC_EFI_RCI2_TABLE_GUID,		&rci2_table_phys			},
 #endif
diff --git a/drivers/firmware/efi/ovmf-debug-log.c b/drivers/firmware/efi/ovmf-debug-log.c
new file mode 100644
index 000000000000..5b2471ffaeed
--- /dev/null
+++ b/drivers/firmware/efi/ovmf-debug-log.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/efi.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/kobject.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/sysfs.h>
+
+#define OVMF_DEBUG_LOG_MAGIC1  0x3167646d666d766f  // "ovmfmdg1"
+#define OVMF_DEBUG_LOG_MAGIC2  0x3267646d666d766f  // "ovmfmdg2"
+
+struct ovmf_debug_log_header {
+	u64    magic1;
+	u64    magic2;
+	u64    hdr_size;
+	u64    log_size;
+	u64    lock; // edk2 spinlock
+	u64    head_off;
+	u64    tail_off;
+	u64    truncated;
+	u8     fw_version[128];
+};
+
+static struct ovmf_debug_log_header *hdr;
+static u8 *logbuf;
+static u64 logbufsize;
+
+static ssize_t ovmf_log_read(struct file *filp, struct kobject *kobj,
+			     const struct bin_attribute *attr, char *buf,
+			     loff_t offset, size_t count)
+{
+	u64 start, end;
+
+	start = hdr->head_off + offset;
+	if (hdr->head_off > hdr->tail_off && start >= hdr->log_size)
+		start -= hdr->log_size;
+
+	end = start + count;
+	if (start > hdr->tail_off) {
+		if (end > hdr->log_size)
+			end = hdr->log_size;
+	} else {
+		if (end > hdr->tail_off)
+			end = hdr->tail_off;
+	}
+
+	if (start > logbufsize || end > logbufsize)
+		return 0;
+	if (start >= end)
+		return 0;
+
+	memcpy(buf, logbuf + start, end - start);
+	return end - start;
+}
+
+static struct bin_attribute ovmf_log_bin_attr = {
+	.attr = {
+		.name = "ovmf_debug_log",
+		.mode = 0444,
+	},
+	.read = ovmf_log_read,
+};
+
+int __init ovmf_log_probe(unsigned long ovmf_debug_log_table)
+{
+	int ret = -EINVAL;
+	u64 size;
+
+	/* map + verify header */
+	hdr = memremap(ovmf_debug_log_table, sizeof(*hdr), MEMREMAP_WB);
+	if (!hdr) {
+		pr_err("OVMF debug log: header map failed\n");
+		return -EINVAL;
+	}
+
+	if (hdr->magic1 != OVMF_DEBUG_LOG_MAGIC1 ||
+	    hdr->magic2 != OVMF_DEBUG_LOG_MAGIC2) {
+		printk(KERN_ERR "OVMF debug log: magic mismatch\n");
+		goto err_unmap;
+	}
+
+	size = hdr->hdr_size + hdr->log_size;
+	pr_info("OVMF debug log: firmware version: \"%s\"\n", hdr->fw_version);
+	pr_info("OVMF debug log: buffer size: %lluk\n", size / 1024);
+
+	/* map complete log buffer */
+	memunmap(hdr);
+	hdr = memremap(ovmf_debug_log_table, size, MEMREMAP_WB);
+	if (!hdr) {
+		pr_err("OVMF debug log: buffer map failed\n");
+		return -EINVAL;
+	}
+	logbuf = (void *)hdr + hdr->hdr_size;
+	logbufsize = hdr->log_size;
+
+	ovmf_log_bin_attr.size = size;
+	ret = sysfs_create_bin_file(efi_kobj, &ovmf_log_bin_attr);
+	if (ret != 0) {
+		pr_err("OVMF debug log: sysfs register failed\n");
+		goto err_unmap;
+	}
+
+	return 0;
+
+err_unmap:
+	memunmap(hdr);
+	return ret;
+}
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 7d63d1d75f22..50db7df0efab 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -439,6 +439,7 @@ void efi_native_runtime_setup(void);
 
 /* OVMF protocol GUIDs */
 #define OVMF_SEV_MEMORY_ACCEPTANCE_PROTOCOL_GUID	EFI_GUID(0xc5a010fe, 0x38a7, 0x4531,  0x8a, 0x4a, 0x05, 0x00, 0xd2, 0xfd, 0x16, 0x49)
+#define OVMF_MEMORY_LOG_TABLE_GUID		EFI_GUID(0x95305139, 0xb20f, 0x4723,  0x84, 0x25, 0x62, 0x7c, 0x88, 0x8f, 0xf1, 0x21)
 
 typedef struct {
 	efi_guid_t guid;
@@ -642,6 +643,7 @@ extern struct efi {
 	unsigned long			esrt;			/* ESRT table */
 	unsigned long			tpm_log;		/* TPM2 Event Log table */
 	unsigned long			tpm_final_log;		/* TPM2 Final Events Log table */
+	unsigned long                   ovmf_debug_log;
 	unsigned long			mokvar_table;		/* MOK variable config table */
 	unsigned long			coco_secret;		/* Confidential computing secret table */
 	unsigned long			unaccepted;		/* Unaccepted memory table */
@@ -1344,6 +1346,8 @@ bool efi_config_table_is_usable(const efi_guid_t *guid, unsigned long table)
 
 umode_t efi_attr_is_visible(struct kobject *kobj, struct attribute *attr, int n);
 
+int ovmf_log_probe(unsigned long ovmf_debug_log_table);
+
 /*
  * efivar ops event type
  */
-- 
cgit v1.2.3


From 6c3b746fd536b7612b23e5c2041365014b85082e Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Tue, 1 Jul 2025 13:47:09 +0200
Subject: pmdomain: core: Export a common ->sync_state() helper for genpd
 providers

In some cases the typical platform driver that acts as a genpd provider
may need its own ->sync_state() callback to manage various things. In
this regard, the provider most likely wants to allow its corresponding
genpds to be powered-off.

For this reason, let's introduce a new genpd helper function,
of_genpd_sync_state() that helps genpd provider drivers to achieve this.

Suggested-by: Saravana Kannan <saravanak@google.com>
Reviewed-by: Abel Vesa <abel.vesa@linaro.org>
Tested-by: Hiago De Franco <hiago.franco@toradex.com> # Colibri iMX8X
Tested-by: Tomi Valkeinen <tomi.valkeinen@ideasonboard.com> # TI AM62A,Xilinx ZynqMP ZCU106
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://lore.kernel.org/r/20250701114733.636510-8-ulf.hansson@linaro.org
---
 drivers/pmdomain/core.c   | 27 +++++++++++++++++++++++++++
 include/linux/pm_domain.h |  3 +++
 2 files changed, 30 insertions(+)

(limited to 'include')

diff --git a/drivers/pmdomain/core.c b/drivers/pmdomain/core.c
index 79dc0bf406f0..0a6593a1b1c8 100644
--- a/drivers/pmdomain/core.c
+++ b/drivers/pmdomain/core.c
@@ -3396,6 +3396,33 @@ int of_genpd_parse_idle_states(struct device_node *dn,
 }
 EXPORT_SYMBOL_GPL(of_genpd_parse_idle_states);
 
+/**
+ * of_genpd_sync_state() - A common sync_state function for genpd providers
+ * @np: The device node the genpd provider is associated with.
+ *
+ * The @np that corresponds to a genpd provider may provide one or multiple
+ * genpds. This function makes use of @np to find the genpds that belong to
+ * the provider. For each genpd we try a power-off.
+ */
+void of_genpd_sync_state(struct device_node *np)
+{
+	struct generic_pm_domain *genpd;
+
+	if (!np)
+		return;
+
+	mutex_lock(&gpd_list_lock);
+	list_for_each_entry(genpd, &gpd_list, gpd_list_node) {
+		if (genpd->provider == of_fwnode_handle(np)) {
+			genpd_lock(genpd);
+			genpd_power_off(genpd, false, 0);
+			genpd_unlock(genpd);
+		}
+	}
+	mutex_unlock(&gpd_list_lock);
+}
+EXPORT_SYMBOL_GPL(of_genpd_sync_state);
+
 static int genpd_provider_probe(struct device *dev)
 {
 	return 0;
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index 0b18160901a2..3578196e6626 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -431,6 +431,7 @@ int of_genpd_remove_subdomain(const struct of_phandle_args *parent_spec,
 struct generic_pm_domain *of_genpd_remove_last(struct device_node *np);
 int of_genpd_parse_idle_states(struct device_node *dn,
 			       struct genpd_power_state **states, int *n);
+void of_genpd_sync_state(struct device_node *np);
 
 int genpd_dev_pm_attach(struct device *dev);
 struct device *genpd_dev_pm_attach_by_id(struct device *dev,
@@ -476,6 +477,8 @@ static inline int of_genpd_parse_idle_states(struct device_node *dn,
 	return -ENODEV;
 }
 
+static inline void of_genpd_sync_state(struct device_node *np) {}
+
 static inline int genpd_dev_pm_attach(struct device *dev)
 {
 	return 0;
-- 
cgit v1.2.3


From c8c196220ce5eba9b7d4aca37a7fd4bbb965d2ed Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Tue, 1 Jul 2025 13:47:10 +0200
Subject: pmdomain: core: Prepare to add the common ->sync_state() support

Before we can implement the common ->sync_state() support in genpd, we need
to allow a few specific genpd providers to opt out from the new behaviour.

Let's introduce GENPD_FLAG_NO_SYNC_STATE as a new genpd config option, to
allow providers to opt out.

Suggested-by: Saravana Kannan <saravanak@google.com>
Tested-by: Hiago De Franco <hiago.franco@toradex.com> # Colibri iMX8X
Tested-by: Tomi Valkeinen <tomi.valkeinen@ideasonboard.com> # TI AM62A,Xilinx ZynqMP ZCU106
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://lore.kernel.org/r/20250701114733.636510-9-ulf.hansson@linaro.org
---
 include/linux/pm_domain.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index 3578196e6626..9329554b9c4a 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -104,6 +104,11 @@ struct dev_pm_domain_list {
  * GENPD_FLAG_DEV_NAME_FW:	Instructs genpd to generate an unique device name
  *				using ida. It is used by genpd providers which
  *				get their genpd-names directly from FW.
+ *
+ * GENPD_FLAG_NO_SYNC_STATE:	The ->sync_state() support is implemented in a
+ *				genpd provider specific way, likely through a
+ *				parent device node. This flag makes genpd
+ *				skip its internal support for this.
  */
 #define GENPD_FLAG_PM_CLK	 (1U << 0)
 #define GENPD_FLAG_IRQ_SAFE	 (1U << 1)
@@ -114,6 +119,7 @@ struct dev_pm_domain_list {
 #define GENPD_FLAG_MIN_RESIDENCY (1U << 6)
 #define GENPD_FLAG_OPP_TABLE_FW	 (1U << 7)
 #define GENPD_FLAG_DEV_NAME_FW	 (1U << 8)
+#define GENPD_FLAG_NO_SYNC_STATE (1U << 9)
 
 enum gpd_status {
 	GENPD_STATE_ON = 0,	/* PM domain is on */
-- 
cgit v1.2.3


From 10086a4f391f4b9c969a21e785e7fa0fa6c023e5 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Tue, 1 Jul 2025 13:47:17 +0200
Subject: firmware: xilinx: Don't share zynqmp_pm_init_finalize()

As there are no longer any users of zynqmp_pm_init_finalize() outside the
zynqmp firmware driver, let's turn it into a local static function.

Cc: Michal Simek <michal.simek@amd.com>
Tested-by: Hiago De Franco <hiago.franco@toradex.com> # Colibri iMX8X
Tested-by: Tomi Valkeinen <tomi.valkeinen@ideasonboard.com> # TI AM62A,Xilinx ZynqMP ZCU106
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://lore.kernel.org/r/20250701114733.636510-16-ulf.hansson@linaro.org
---
 drivers/firmware/xilinx/zynqmp.c     | 3 +--
 include/linux/firmware/xlnx-zynqmp.h | 6 ------
 2 files changed, 1 insertion(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c
index a91a0191c689..87ddbb7d11c2 100644
--- a/drivers/firmware/xilinx/zynqmp.c
+++ b/drivers/firmware/xilinx/zynqmp.c
@@ -1299,11 +1299,10 @@ EXPORT_SYMBOL_GPL(zynqmp_pm_bootmode_write);
  * This API function is to be used for notify the power management controller
  * about the completed power management initialization.
  */
-int zynqmp_pm_init_finalize(void)
+static int zynqmp_pm_init_finalize(void)
 {
 	return zynqmp_pm_invoke_fn(PM_PM_INIT_FINALIZE, NULL, 0);
 }
-EXPORT_SYMBOL_GPL(zynqmp_pm_init_finalize);
 
 /**
  * zynqmp_pm_set_suspend_mode()	- Set system suspend mode
diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h
index 6d4dbc196b93..ae48d619c4e0 100644
--- a/include/linux/firmware/xlnx-zynqmp.h
+++ b/include/linux/firmware/xlnx-zynqmp.h
@@ -585,7 +585,6 @@ int zynqmp_pm_reset_assert(const u32 reset,
 int zynqmp_pm_reset_get_status(const u32 reset, u32 *status);
 unsigned int zynqmp_pm_bootmode_read(u32 *ps_mode);
 int zynqmp_pm_bootmode_write(u32 ps_mode);
-int zynqmp_pm_init_finalize(void);
 int zynqmp_pm_set_suspend_mode(u32 mode);
 int zynqmp_pm_request_node(const u32 node, const u32 capabilities,
 			   const u32 qos, const enum zynqmp_pm_request_ack ack);
@@ -746,11 +745,6 @@ static inline int zynqmp_pm_bootmode_write(u32 ps_mode)
 	return -ENODEV;
 }
 
-static inline int zynqmp_pm_init_finalize(void)
-{
-	return -ENODEV;
-}
-
 static inline int zynqmp_pm_set_suspend_mode(u32 mode)
 {
 	return -ENODEV;
-- 
cgit v1.2.3


From 9a4681a485ee1203ac968065490a8eeaa6615503 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Tue, 1 Jul 2025 13:47:19 +0200
Subject: driver core: Export get_dev_from_fwnode()

It has turned out that get_dev_from_fwnode() is useful in a few other
places outside of the driver core, as in gpiolib.c for example. Therefore
let's make it available as a common helper function.

Suggested-by: Saravana Kannan <saravanak@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Tested-by: Hiago De Franco <hiago.franco@toradex.com> # Colibri iMX8X
Tested-by: Tomi Valkeinen <tomi.valkeinen@ideasonboard.com> # TI AM62A,Xilinx ZynqMP ZCU106
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://lore.kernel.org/r/20250701114733.636510-18-ulf.hansson@linaro.org
---
 drivers/base/core.c    | 8 ++++++--
 include/linux/device.h | 1 +
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index cbc0099d8ef2..6f91ece7c06a 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -1881,8 +1881,6 @@ static void fw_devlink_unblock_consumers(struct device *dev)
 	device_links_write_unlock();
 }
 
-#define get_dev_from_fwnode(fwnode)	get_device((fwnode)->dev)
-
 static bool fwnode_init_without_drv(struct fwnode_handle *fwnode)
 {
 	struct device *dev;
@@ -5281,6 +5279,12 @@ void device_set_node(struct device *dev, struct fwnode_handle *fwnode)
 }
 EXPORT_SYMBOL_GPL(device_set_node);
 
+struct device *get_dev_from_fwnode(struct fwnode_handle *fwnode)
+{
+	return get_device((fwnode)->dev);
+}
+EXPORT_SYMBOL_GPL(get_dev_from_fwnode);
+
 int device_match_name(struct device *dev, const void *name)
 {
 	return sysfs_streq(dev_name(dev), name);
diff --git a/include/linux/device.h b/include/linux/device.h
index 4940db137fff..315b00171335 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1048,6 +1048,7 @@ void device_set_node(struct device *dev, struct fwnode_handle *fwnode);
 int device_add_of_node(struct device *dev, struct device_node *of_node);
 void device_remove_of_node(struct device *dev);
 void device_set_of_node_from_dev(struct device *dev, const struct device *dev2);
+struct device *get_dev_from_fwnode(struct fwnode_handle *fwnode);
 
 static inline struct device_node *dev_of_node(struct device *dev)
 {
-- 
cgit v1.2.3


From 3b7b8acacf372945b4855a136634775c064a57f8 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Tue, 1 Jul 2025 13:47:20 +0200
Subject: pmdomain: core: Add common ->sync_state() support for genpd providers

If the genpd provider's fwnode doesn't have an associated struct device
with it, we can make use of the generic genpd->dev and its corresponding
driver internally in genpd to manage ->sync_state().

More precisely, while adding a genpd OF provider let's check if the fwnode
has a device and if not, make the preparation to handle ->sync_state()
internally through the genpd_provider_driver and the genpd_provider_bus.

Note that genpd providers may opt out from this behaviour by setting the
GENPD_FLAG_NO_SYNC_STATE config option for the genpds in question.

Suggested-by: Saravana Kannan <saravanak@google.com>
Tested-by: Hiago De Franco <hiago.franco@toradex.com> # Colibri iMX8X
Tested-by: Tomi Valkeinen <tomi.valkeinen@ideasonboard.com> # TI AM62A,Xilinx ZynqMP ZCU106
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://lore.kernel.org/r/20250701114733.636510-19-ulf.hansson@linaro.org
---
 drivers/pmdomain/core.c   | 52 +++++++++++++++++++++++++++++++++++++++++++++--
 include/linux/pm_domain.h |  7 +++++++
 2 files changed, 57 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/pmdomain/core.c b/drivers/pmdomain/core.c
index 0a6593a1b1c8..ca47f91b9e91 100644
--- a/drivers/pmdomain/core.c
+++ b/drivers/pmdomain/core.c
@@ -186,6 +186,7 @@ static const struct genpd_lock_ops genpd_raw_spin_ops = {
 #define genpd_is_rpm_always_on(genpd)	(genpd->flags & GENPD_FLAG_RPM_ALWAYS_ON)
 #define genpd_is_opp_table_fw(genpd)	(genpd->flags & GENPD_FLAG_OPP_TABLE_FW)
 #define genpd_is_dev_name_fw(genpd)	(genpd->flags & GENPD_FLAG_DEV_NAME_FW)
+#define genpd_is_no_sync_state(genpd)	(genpd->flags & GENPD_FLAG_NO_SYNC_STATE)
 
 static inline bool irq_safe_dev_in_sleep_domain(struct device *dev,
 		const struct generic_pm_domain *genpd)
@@ -2351,6 +2352,7 @@ int pm_genpd_init(struct generic_pm_domain *genpd,
 	INIT_WORK(&genpd->power_off_work, genpd_power_off_work_fn);
 	atomic_set(&genpd->sd_count, 0);
 	genpd->status = is_off ? GENPD_STATE_OFF : GENPD_STATE_ON;
+	genpd->sync_state = GENPD_SYNC_STATE_OFF;
 	genpd->device_count = 0;
 	genpd->provider = NULL;
 	genpd->device_id = -ENXIO;
@@ -2606,6 +2608,8 @@ static bool genpd_present(const struct generic_pm_domain *genpd)
 int of_genpd_add_provider_simple(struct device_node *np,
 				 struct generic_pm_domain *genpd)
 {
+	struct fwnode_handle *fwnode;
+	struct device *dev;
 	int ret;
 
 	if (!np || !genpd)
@@ -2619,6 +2623,15 @@ int of_genpd_add_provider_simple(struct device_node *np,
 
 	genpd->dev.of_node = np;
 
+	fwnode = of_fwnode_handle(np);
+	dev = get_dev_from_fwnode(fwnode);
+	if (!dev && !genpd_is_no_sync_state(genpd)) {
+		genpd->sync_state = GENPD_SYNC_STATE_SIMPLE;
+		device_set_node(&genpd->dev, fwnode);
+	}
+
+	put_device(dev);
+
 	ret = device_add(&genpd->dev);
 	if (ret)
 		return ret;
@@ -2643,7 +2656,7 @@ int of_genpd_add_provider_simple(struct device_node *np,
 	if (ret)
 		goto err_opp;
 
-	genpd->provider = &np->fwnode;
+	genpd->provider = fwnode;
 	genpd->has_provider = true;
 
 	return 0;
@@ -2668,8 +2681,11 @@ int of_genpd_add_provider_onecell(struct device_node *np,
 				  struct genpd_onecell_data *data)
 {
 	struct generic_pm_domain *genpd;
+	struct fwnode_handle *fwnode;
+	struct device *dev;
 	unsigned int i;
 	int ret = -EINVAL;
+	bool sync_state = false;
 
 	if (!np || !data)
 		return -EINVAL;
@@ -2680,6 +2696,13 @@ int of_genpd_add_provider_onecell(struct device_node *np,
 	if (!data->xlate)
 		data->xlate = genpd_xlate_onecell;
 
+	fwnode = of_fwnode_handle(np);
+	dev = get_dev_from_fwnode(fwnode);
+	if (!dev)
+		sync_state = true;
+
+	put_device(dev);
+
 	for (i = 0; i < data->num_domains; i++) {
 		genpd = data->domains[i];
 
@@ -2690,6 +2713,12 @@ int of_genpd_add_provider_onecell(struct device_node *np,
 
 		genpd->dev.of_node = np;
 
+		if (sync_state && !genpd_is_no_sync_state(genpd)) {
+			genpd->sync_state = GENPD_SYNC_STATE_ONECELL;
+			device_set_node(&genpd->dev, fwnode);
+			sync_state = false;
+		}
+
 		ret = device_add(&genpd->dev);
 		if (ret)
 			goto error;
@@ -2712,7 +2741,7 @@ int of_genpd_add_provider_onecell(struct device_node *np,
 			WARN_ON(IS_ERR(genpd->opp_table));
 		}
 
-		genpd->provider = &np->fwnode;
+		genpd->provider = fwnode;
 		genpd->has_provider = true;
 	}
 
@@ -3430,6 +3459,25 @@ static int genpd_provider_probe(struct device *dev)
 
 static void genpd_provider_sync_state(struct device *dev)
 {
+	struct generic_pm_domain *genpd = container_of(dev, struct generic_pm_domain, dev);
+
+	switch (genpd->sync_state) {
+	case GENPD_SYNC_STATE_OFF:
+		break;
+
+	case GENPD_SYNC_STATE_ONECELL:
+		of_genpd_sync_state(dev->of_node);
+		break;
+
+	case GENPD_SYNC_STATE_SIMPLE:
+		genpd_lock(genpd);
+		genpd_power_off(genpd, false, 0);
+		genpd_unlock(genpd);
+		break;
+
+	default:
+		break;
+	}
 }
 
 static struct device_driver genpd_provider_drv = {
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index 9329554b9c4a..d68e07dadc99 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -133,6 +133,12 @@ enum genpd_notication {
 	GENPD_NOTIFY_ON,
 };
 
+enum genpd_sync_state {
+	GENPD_SYNC_STATE_OFF = 0,
+	GENPD_SYNC_STATE_SIMPLE,
+	GENPD_SYNC_STATE_ONECELL,
+};
+
 struct dev_power_governor {
 	bool (*power_down_ok)(struct dev_pm_domain *domain);
 	bool (*suspend_ok)(struct device *dev);
@@ -193,6 +199,7 @@ struct generic_pm_domain {
 	unsigned int performance_state;	/* Aggregated max performance state */
 	cpumask_var_t cpus;		/* A cpumask of the attached CPUs */
 	bool synced_poweroff;		/* A consumer needs a synced poweroff */
+	enum genpd_sync_state sync_state; /* How sync_state is managed. */
 	int (*power_off)(struct generic_pm_domain *domain);
 	int (*power_on)(struct generic_pm_domain *domain);
 	struct raw_notifier_head power_notifiers; /* Power on/off notifiers */
-- 
cgit v1.2.3


From 2b5630e9886f9121535d421ebfb240a9535f3f1e Mon Sep 17 00:00:00 2001
From: Saravana Kannan <saravanak@google.com>
Date: Tue, 1 Jul 2025 13:47:21 +0200
Subject: driver core: Add dev_set_drv_sync_state()

This can be used by frameworks to set the sync_state() helper functions
for drivers that don't already have them set.

Signed-off-by: Saravana Kannan <saravanak@google.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Abel Vesa <abel.vesa@linaro.org>
Tested-by: Hiago De Franco <hiago.franco@toradex.com> # Colibri iMX8X
Tested-by: Tomi Valkeinen <tomi.valkeinen@ideasonboard.com> # TI AM62A,Xilinx ZynqMP ZCU106
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://lore.kernel.org/r/20250701114733.636510-20-ulf.hansson@linaro.org
---
 include/linux/device.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include')

diff --git a/include/linux/device.h b/include/linux/device.h
index 315b00171335..686f2a578fbd 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -917,6 +917,18 @@ static inline bool dev_has_sync_state(struct device *dev)
 	return false;
 }
 
+static inline int dev_set_drv_sync_state(struct device *dev,
+					 void (*fn)(struct device *dev))
+{
+	if (!dev || !dev->driver)
+		return 0;
+	if (dev->driver->sync_state && dev->driver->sync_state != fn)
+		return -EBUSY;
+	if (!dev->driver->sync_state)
+		dev->driver->sync_state = fn;
+	return 0;
+}
+
 static inline void dev_set_removable(struct device *dev,
 				     enum device_removable removable)
 {
-- 
cgit v1.2.3


From 13a4b7fb62600e1c0738fdb0b7176555ff05aadf Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Tue, 1 Jul 2025 13:47:23 +0200
Subject: pmdomain: core: Leave powered-on genpds on until late_initcall_sync

Powering-off a genpd that was on during boot, before all of its consumer
devices have been probed, is certainly prone to problems.

As a step to improve this situation, let's prevent these genpds from being
powered-off until genpd_power_off_unused() gets called, which is a
late_initcall_sync().

Note that this still doesn't guarantee that all the consumer devices have
been probed before we allow the genpds to be powered off. Yet, this should
be a step in the right direction.

Suggested-by: Saravana Kannan <saravanak@google.com>
Tested-by: Hiago De Franco <hiago.franco@toradex.com> # Colibri iMX8X
Tested-by: Tomi Valkeinen <tomi.valkeinen@ideasonboard.com> # TI AM62A,Xilinx ZynqMP ZCU106
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://lore.kernel.org/r/20250701114733.636510-22-ulf.hansson@linaro.org
---
 drivers/pmdomain/core.c   | 10 ++++++++--
 include/linux/pm_domain.h |  1 +
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/pmdomain/core.c b/drivers/pmdomain/core.c
index 5cef6de60c72..18951ed6295d 100644
--- a/drivers/pmdomain/core.c
+++ b/drivers/pmdomain/core.c
@@ -931,11 +931,12 @@ static void genpd_power_off(struct generic_pm_domain *genpd, bool one_dev_on,
 	 * The domain is already in the "power off" state.
 	 * System suspend is in progress.
 	 * The domain is configured as always on.
+	 * The domain was on at boot and still need to stay on.
 	 * The domain has a subdomain being powered on.
 	 */
 	if (!genpd_status_on(genpd) || genpd->prepared_count > 0 ||
 	    genpd_is_always_on(genpd) || genpd_is_rpm_always_on(genpd) ||
-	    atomic_read(&genpd->sd_count) > 0)
+	    genpd->stay_on || atomic_read(&genpd->sd_count) > 0)
 		return;
 
 	/*
@@ -1346,8 +1347,12 @@ static int __init genpd_power_off_unused(void)
 	pr_info("genpd: Disabling unused power domains\n");
 	mutex_lock(&gpd_list_lock);
 
-	list_for_each_entry(genpd, &gpd_list, gpd_list_node)
+	list_for_each_entry(genpd, &gpd_list, gpd_list_node) {
+		genpd_lock(genpd);
+		genpd->stay_on = false;
+		genpd_unlock(genpd);
 		genpd_queue_power_off_work(genpd);
+	}
 
 	mutex_unlock(&gpd_list_lock);
 
@@ -2352,6 +2357,7 @@ int pm_genpd_init(struct generic_pm_domain *genpd,
 	INIT_WORK(&genpd->power_off_work, genpd_power_off_work_fn);
 	atomic_set(&genpd->sd_count, 0);
 	genpd->status = is_off ? GENPD_STATE_OFF : GENPD_STATE_ON;
+	genpd->stay_on = !is_off;
 	genpd->sync_state = GENPD_SYNC_STATE_OFF;
 	genpd->device_count = 0;
 	genpd->provider = NULL;
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index d68e07dadc99..99556589f45e 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -199,6 +199,7 @@ struct generic_pm_domain {
 	unsigned int performance_state;	/* Aggregated max performance state */
 	cpumask_var_t cpus;		/* A cpumask of the attached CPUs */
 	bool synced_poweroff;		/* A consumer needs a synced poweroff */
+	bool stay_on;			/* Stay powered-on during boot. */
 	enum genpd_sync_state sync_state; /* How sync_state is managed. */
 	int (*power_off)(struct generic_pm_domain *domain);
 	int (*power_on)(struct generic_pm_domain *domain);
-- 
cgit v1.2.3


From d9bc88aa54d6aa22ff1e850a86be7a37f0503889 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 2 Jul 2025 22:24:19 +0100
Subject: debugfs: split short and full proxy wrappers, kill
 debugfs_real_fops()

All users outside of fs/debugfs/file.c are gone, in there we can just
fully split the wrappers for full and short cases and be done with that.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Link: https://lore.kernel.org/r/20250702212419.GG3406663@ZenIV
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/debugfs/file.c       | 87 +++++++++++++++++++++----------------------------
 include/linux/debugfs.h |  2 --
 2 files changed, 38 insertions(+), 51 deletions(-)

(limited to 'include')

diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 69e9ddcb113d..77784091a10f 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -53,23 +53,6 @@ const void *debugfs_get_aux(const struct file *file)
 }
 EXPORT_SYMBOL_GPL(debugfs_get_aux);
 
-const struct file_operations *debugfs_real_fops(const struct file *filp)
-{
-	struct debugfs_fsdata *fsd = F_DENTRY(filp)->d_fsdata;
-
-	if (!fsd) {
-		/*
-		 * Urgh, we've been called w/o a protecting
-		 * debugfs_file_get().
-		 */
-		WARN_ON(1);
-		return NULL;
-	}
-
-	return fsd->real_fops;
-}
-EXPORT_SYMBOL_GPL(debugfs_real_fops);
-
 enum dbgfs_get_mode {
 	DBGFS_GET_ALREADY,
 	DBGFS_GET_REGULAR,
@@ -302,15 +285,13 @@ static int debugfs_locked_down(struct inode *inode,
 static int open_proxy_open(struct inode *inode, struct file *filp)
 {
 	struct dentry *dentry = F_DENTRY(filp);
-	const struct file_operations *real_fops = NULL;
+	const struct file_operations *real_fops = DEBUGFS_I(inode)->real_fops;
 	int r;
 
 	r = __debugfs_file_get(dentry, DBGFS_GET_REGULAR);
 	if (r)
 		return r == -EIO ? -ENOENT : r;
 
-	real_fops = debugfs_real_fops(filp);
-
 	r = debugfs_locked_down(inode, filp, real_fops);
 	if (r)
 		goto out;
@@ -352,7 +333,6 @@ static ret_type full_proxy_ ## name(proto)				\
 {									\
 	struct dentry *dentry = F_DENTRY(filp);				\
 	struct debugfs_fsdata *fsd = dentry->d_fsdata;			\
-	const struct file_operations *real_fops;			\
 	ret_type r;							\
 									\
 	if (!(fsd->methods & bit))					\
@@ -360,14 +340,13 @@ static ret_type full_proxy_ ## name(proto)				\
 	r = debugfs_file_get(dentry);					\
 	if (unlikely(r))						\
 		return r;						\
-	real_fops = debugfs_real_fops(filp);				\
-	r = real_fops->name(args);					\
+	r = fsd->real_fops->name(args);					\
 	debugfs_file_put(dentry);					\
 	return r;							\
 }
 
-#define FULL_PROXY_FUNC_BOTH(name, ret_type, filp, proto, args, bit, ret)	\
-static ret_type full_proxy_ ## name(proto)				\
+#define SHORT_PROXY_FUNC(name, ret_type, filp, proto, args, bit, ret)	\
+static ret_type short_proxy_ ## name(proto)				\
 {									\
 	struct dentry *dentry = F_DENTRY(filp);				\
 	struct debugfs_fsdata *fsd = dentry->d_fsdata;			\
@@ -378,27 +357,38 @@ static ret_type full_proxy_ ## name(proto)				\
 	r = debugfs_file_get(dentry);					\
 	if (unlikely(r))						\
 		return r;						\
-	if (fsd->real_fops)						\
-		r = fsd->real_fops->name(args);				\
-	else								\
-		r = fsd->short_fops->name(args);			\
+	r = fsd->short_fops->name(args);				\
 	debugfs_file_put(dentry);					\
 	return r;							\
 }
 
-FULL_PROXY_FUNC_BOTH(llseek, loff_t, filp,
-		     PROTO(struct file *filp, loff_t offset, int whence),
-		     ARGS(filp, offset, whence), HAS_LSEEK, -ESPIPE);
+SHORT_PROXY_FUNC(llseek, loff_t, filp,
+		PROTO(struct file *filp, loff_t offset, int whence),
+		ARGS(filp, offset, whence), HAS_LSEEK, -ESPIPE);
 
-FULL_PROXY_FUNC_BOTH(read, ssize_t, filp,
-		     PROTO(struct file *filp, char __user *buf, size_t size,
-			   loff_t *ppos),
-		     ARGS(filp, buf, size, ppos), HAS_READ, -EINVAL);
+FULL_PROXY_FUNC(llseek, loff_t, filp,
+		PROTO(struct file *filp, loff_t offset, int whence),
+		ARGS(filp, offset, whence), HAS_LSEEK, -ESPIPE);
 
-FULL_PROXY_FUNC_BOTH(write, ssize_t, filp,
-		     PROTO(struct file *filp, const char __user *buf,
-			   size_t size, loff_t *ppos),
-		     ARGS(filp, buf, size, ppos), HAS_WRITE, -EINVAL);
+SHORT_PROXY_FUNC(read, ssize_t, filp,
+		PROTO(struct file *filp, char __user *buf, size_t size,
+			loff_t *ppos),
+		ARGS(filp, buf, size, ppos), HAS_READ, -EINVAL);
+
+FULL_PROXY_FUNC(read, ssize_t, filp,
+		PROTO(struct file *filp, char __user *buf, size_t size,
+			loff_t *ppos),
+		ARGS(filp, buf, size, ppos), HAS_READ, -EINVAL);
+
+SHORT_PROXY_FUNC(write, ssize_t, filp,
+		PROTO(struct file *filp, const char __user *buf,
+			size_t size, loff_t *ppos),
+		ARGS(filp, buf, size, ppos), HAS_WRITE, -EINVAL);
+
+FULL_PROXY_FUNC(write, ssize_t, filp,
+		PROTO(struct file *filp, const char __user *buf,
+			size_t size, loff_t *ppos),
+		ARGS(filp, buf, size, ppos), HAS_WRITE, -EINVAL);
 
 FULL_PROXY_FUNC(unlocked_ioctl, long, filp,
 		PROTO(struct file *filp, unsigned int cmd, unsigned long arg),
@@ -410,22 +400,21 @@ static __poll_t full_proxy_poll(struct file *filp,
 	struct dentry *dentry = F_DENTRY(filp);
 	struct debugfs_fsdata *fsd = dentry->d_fsdata;
 	__poll_t r = 0;
-	const struct file_operations *real_fops;
 
 	if (!(fsd->methods & HAS_POLL))
 		return DEFAULT_POLLMASK;
 	if (debugfs_file_get(dentry))
 		return EPOLLHUP;
 
-	real_fops = debugfs_real_fops(filp);
-	r = real_fops->poll(filp, wait);
+	r = fsd->real_fops->poll(filp, wait);
 	debugfs_file_put(dentry);
 	return r;
 }
 
-static int full_proxy_release(struct inode *inode, struct file *filp)
+static int full_proxy_release(struct inode *inode, struct file *file)
 {
-	const struct file_operations *real_fops = debugfs_real_fops(filp);
+	struct debugfs_fsdata *fsd = F_DENTRY(file)->d_fsdata;
+	const struct file_operations *real_fops = fsd->real_fops;
 	int r = 0;
 
 	/*
@@ -435,7 +424,7 @@ static int full_proxy_release(struct inode *inode, struct file *filp)
 	 * ->i_private is still being meaningful here.
 	 */
 	if (real_fops->release)
-		r = real_fops->release(inode, filp);
+		r = real_fops->release(inode, file);
 
 	fops_put(real_fops);
 	return r;
@@ -517,9 +506,9 @@ static int full_proxy_open_short(struct inode *inode, struct file *filp)
 
 const struct file_operations debugfs_full_short_proxy_file_operations = {
 	.open = full_proxy_open_short,
-	.llseek = full_proxy_llseek,
-	.read = full_proxy_read,
-	.write = full_proxy_write,
+	.llseek = short_proxy_llseek,
+	.read = short_proxy_read,
+	.write = short_proxy_write,
 };
 
 ssize_t debugfs_attr_read(struct file *file, char __user *buf,
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index fa2568b4380d..a420152105d0 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -162,7 +162,6 @@ void debugfs_remove(struct dentry *dentry);
 
 void debugfs_lookup_and_remove(const char *name, struct dentry *parent);
 
-const struct file_operations *debugfs_real_fops(const struct file *filp);
 const void *debugfs_get_aux(const struct file *file);
 
 int debugfs_file_get(struct dentry *dentry);
@@ -329,7 +328,6 @@ static inline void debugfs_lookup_and_remove(const char *name,
 					     struct dentry *parent)
 { }
 
-const struct file_operations *debugfs_real_fops(const struct file *filp);
 void *debugfs_get_aux(const struct file *file);
 
 static inline int debugfs_file_get(struct dentry *dentry)
-- 
cgit v1.2.3


From 9d3b96be2ee81a7d6ad08cb5094753f06382db1b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 2 Jul 2025 22:26:16 +0100
Subject: debugfs_get_aux(): allow storing non-const void *

typechecking is up to users, anyway...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Link: https://lore.kernel.org/r/20250702212616.GI3406663@ZenIV
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/staging/greybus/camera.c | 2 +-
 fs/debugfs/file.c                | 2 +-
 fs/debugfs/inode.c               | 2 +-
 fs/debugfs/internal.h            | 2 +-
 include/linux/debugfs.h          | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/staging/greybus/camera.c b/drivers/staging/greybus/camera.c
index ec9fddfc0b14..5ac19c0055d9 100644
--- a/drivers/staging/greybus/camera.c
+++ b/drivers/staging/greybus/camera.c
@@ -1128,7 +1128,7 @@ done:
 
 static int gb_camera_debugfs_open(struct inode *inode, struct file *file)
 {
-	file->private_data = (void *)debugfs_get_aux(file);
+	file->private_data = debugfs_get_aux(file);
 	return 0;
 }
 
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 77784091a10f..3ec3324c2060 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -47,7 +47,7 @@ const struct file_operations debugfs_noop_file_operations = {
 
 #define F_DENTRY(filp) ((filp)->f_path.dentry)
 
-const void *debugfs_get_aux(const struct file *file)
+void *debugfs_get_aux(const struct file *file)
 {
 	return DEBUGFS_I(file_inode(file))->aux;
 }
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 30c4944e1862..43e5d1bf1f32 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -459,7 +459,7 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode,
 		proxy_fops = &debugfs_noop_file_operations;
 	inode->i_fop = proxy_fops;
 	DEBUGFS_I(inode)->raw = real_fops;
-	DEBUGFS_I(inode)->aux = aux;
+	DEBUGFS_I(inode)->aux = (void *)aux;
 
 	d_instantiate(dentry, inode);
 	fsnotify_create(d_inode(dentry->d_parent), dentry);
diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h
index 93483fe84425..427987f81571 100644
--- a/fs/debugfs/internal.h
+++ b/fs/debugfs/internal.h
@@ -19,7 +19,7 @@ struct debugfs_inode_info {
 		const struct debugfs_short_fops *short_fops;
 		debugfs_automount_t automount;
 	};
-	const void *aux;
+	void *aux;
 };
 
 static inline struct debugfs_inode_info *DEBUGFS_I(struct inode *inode)
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index a420152105d0..7cecda29447e 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -162,7 +162,7 @@ void debugfs_remove(struct dentry *dentry);
 
 void debugfs_lookup_and_remove(const char *name, struct dentry *parent);
 
-const void *debugfs_get_aux(const struct file *file);
+void *debugfs_get_aux(const struct file *file);
 
 int debugfs_file_get(struct dentry *dentry);
 void debugfs_file_put(struct dentry *dentry);
-- 
cgit v1.2.3


From 570c8efd5eb79c3725ba439ce105ed1bedc5acd9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 23 May 2025 17:28:00 +0200
Subject: sched/psi: Optimize psi_group_change() cpu_clock() usage

Dietmar reported that commit 3840cbe24cf0 ("sched: psi: fix bogus
pressure spikes from aggregation race") caused a regression for him on
a high context switch rate benchmark (schbench) due to the now
repeating cpu_clock() calls.

In particular the problem is that get_recent_times() will extrapolate
the current state to 'now'. But if an update uses a timestamp from
before the start of the update, it is possible to get two reads
with inconsistent results. It is effectively back-dating an update.

(note that this all hard-relies on the clock being synchronized across
CPUs -- if this is not the case, all bets are off).

Combine this problem with the fact that there are per-group-per-cpu
seqcounts, the commit in question pushed the clock read into the group
iteration, causing tree-depth cpu_clock() calls. On architectures
where cpu_clock() has appreciable overhead, this hurts.

Instead move to a per-cpu seqcount, which allows us to have a single
clock read for all group updates, increasing internal consistency and
lowering update overhead. This comes at the cost of a longer update
side (proportional to the tree depth) which can cause the read side to
retry more often.

Fixes: 3840cbe24cf0 ("sched: psi: fix bogus pressure spikes from aggregation race")
Reported-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Link: https://lkml.kernel.org/20250522084844.GC31726@noisy.programming.kicks-ass.net
---
 include/linux/psi_types.h |   6 +--
 kernel/sched/psi.c        | 121 +++++++++++++++++++++++++---------------------
 2 files changed, 68 insertions(+), 59 deletions(-)

(limited to 'include')

diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index f1fd3a8044e0..dd10c22299ab 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -84,11 +84,9 @@ enum psi_aggregators {
 struct psi_group_cpu {
 	/* 1st cacheline updated by the scheduler */
 
-	/* Aggregator needs to know of concurrent changes */
-	seqcount_t seq ____cacheline_aligned_in_smp;
-
 	/* States of the tasks belonging to this group */
-	unsigned int tasks[NR_PSI_TASK_COUNTS];
+	unsigned int tasks[NR_PSI_TASK_COUNTS]
+			____cacheline_aligned_in_smp;
 
 	/* Aggregate pressure state derived from the tasks */
 	u32 state_mask;
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 5837cd8e7b97..2024c1d36402 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -176,6 +176,28 @@ struct psi_group psi_system = {
 	.pcpu = &system_group_pcpu,
 };
 
+static DEFINE_PER_CPU(seqcount_t, psi_seq);
+
+static inline void psi_write_begin(int cpu)
+{
+	write_seqcount_begin(per_cpu_ptr(&psi_seq, cpu));
+}
+
+static inline void psi_write_end(int cpu)
+{
+	write_seqcount_end(per_cpu_ptr(&psi_seq, cpu));
+}
+
+static inline u32 psi_read_begin(int cpu)
+{
+	return read_seqcount_begin(per_cpu_ptr(&psi_seq, cpu));
+}
+
+static inline bool psi_read_retry(int cpu, u32 seq)
+{
+	return read_seqcount_retry(per_cpu_ptr(&psi_seq, cpu), seq);
+}
+
 static void psi_avgs_work(struct work_struct *work);
 
 static void poll_timer_fn(struct timer_list *t);
@@ -186,7 +208,7 @@ static void group_init(struct psi_group *group)
 
 	group->enabled = true;
 	for_each_possible_cpu(cpu)
-		seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
+		seqcount_init(per_cpu_ptr(&psi_seq, cpu));
 	group->avg_last_update = sched_clock();
 	group->avg_next_update = group->avg_last_update + psi_period;
 	mutex_init(&group->avgs_lock);
@@ -266,14 +288,14 @@ static void get_recent_times(struct psi_group *group, int cpu,
 
 	/* Snapshot a coherent view of the CPU state */
 	do {
-		seq = read_seqcount_begin(&groupc->seq);
+		seq = psi_read_begin(cpu);
 		now = cpu_clock(cpu);
 		memcpy(times, groupc->times, sizeof(groupc->times));
 		state_mask = groupc->state_mask;
 		state_start = groupc->state_start;
 		if (cpu == current_cpu)
 			memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
-	} while (read_seqcount_retry(&groupc->seq, seq));
+	} while (psi_read_retry(cpu, seq));
 
 	/* Calculate state time deltas against the previous snapshot */
 	for (s = 0; s < NR_PSI_STATES; s++) {
@@ -772,30 +794,20 @@ static void record_times(struct psi_group_cpu *groupc, u64 now)
 		groupc->times[PSI_NONIDLE] += delta;
 }
 
+#define for_each_group(iter, group) \
+	for (typeof(group) iter = group; iter; iter = iter->parent)
+
 static void psi_group_change(struct psi_group *group, int cpu,
 			     unsigned int clear, unsigned int set,
-			     bool wake_clock)
+			     u64 now, bool wake_clock)
 {
 	struct psi_group_cpu *groupc;
 	unsigned int t, m;
 	u32 state_mask;
-	u64 now;
 
 	lockdep_assert_rq_held(cpu_rq(cpu));
 	groupc = per_cpu_ptr(group->pcpu, cpu);
 
-	/*
-	 * First we update the task counts according to the state
-	 * change requested through the @clear and @set bits.
-	 *
-	 * Then if the cgroup PSI stats accounting enabled, we
-	 * assess the aggregate resource states this CPU's tasks
-	 * have been in since the last change, and account any
-	 * SOME and FULL time these may have resulted in.
-	 */
-	write_seqcount_begin(&groupc->seq);
-	now = cpu_clock(cpu);
-
 	/*
 	 * Start with TSK_ONCPU, which doesn't have a corresponding
 	 * task count - it's just a boolean flag directly encoded in
@@ -847,7 +859,6 @@ static void psi_group_change(struct psi_group *group, int cpu,
 
 		groupc->state_mask = state_mask;
 
-		write_seqcount_end(&groupc->seq);
 		return;
 	}
 
@@ -868,8 +879,6 @@ static void psi_group_change(struct psi_group *group, int cpu,
 
 	groupc->state_mask = state_mask;
 
-	write_seqcount_end(&groupc->seq);
-
 	if (state_mask & group->rtpoll_states)
 		psi_schedule_rtpoll_work(group, 1, false);
 
@@ -904,24 +913,29 @@ static void psi_flags_change(struct task_struct *task, int clear, int set)
 void psi_task_change(struct task_struct *task, int clear, int set)
 {
 	int cpu = task_cpu(task);
-	struct psi_group *group;
+	u64 now;
 
 	if (!task->pid)
 		return;
 
 	psi_flags_change(task, clear, set);
 
-	group = task_psi_group(task);
-	do {
-		psi_group_change(group, cpu, clear, set, true);
-	} while ((group = group->parent));
+	psi_write_begin(cpu);
+	now = cpu_clock(cpu);
+	for_each_group(group, task_psi_group(task))
+		psi_group_change(group, cpu, clear, set, now, true);
+	psi_write_end(cpu);
 }
 
 void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 		     bool sleep)
 {
-	struct psi_group *group, *common = NULL;
+	struct psi_group *common = NULL;
 	int cpu = task_cpu(prev);
+	u64 now;
+
+	psi_write_begin(cpu);
+	now = cpu_clock(cpu);
 
 	if (next->pid) {
 		psi_flags_change(next, 0, TSK_ONCPU);
@@ -930,16 +944,15 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 		 * ancestors with @prev, those will already have @prev's
 		 * TSK_ONCPU bit set, and we can stop the iteration there.
 		 */
-		group = task_psi_group(next);
-		do {
-			if (per_cpu_ptr(group->pcpu, cpu)->state_mask &
-			    PSI_ONCPU) {
+		for_each_group(group, task_psi_group(next)) {
+			struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
+
+			if (groupc->state_mask & PSI_ONCPU) {
 				common = group;
 				break;
 			}
-
-			psi_group_change(group, cpu, 0, TSK_ONCPU, true);
-		} while ((group = group->parent));
+			psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
+		}
 	}
 
 	if (prev->pid) {
@@ -972,12 +985,11 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 
 		psi_flags_change(prev, clear, set);
 
-		group = task_psi_group(prev);
-		do {
+		for_each_group(group, task_psi_group(prev)) {
 			if (group == common)
 				break;
-			psi_group_change(group, cpu, clear, set, wake_clock);
-		} while ((group = group->parent));
+			psi_group_change(group, cpu, clear, set, now, wake_clock);
+		}
 
 		/*
 		 * TSK_ONCPU is handled up to the common ancestor. If there are
@@ -987,20 +999,21 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 		 */
 		if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) {
 			clear &= ~TSK_ONCPU;
-			for (; group; group = group->parent)
-				psi_group_change(group, cpu, clear, set, wake_clock);
+			for_each_group(group, common)
+				psi_group_change(group, cpu, clear, set, now, wake_clock);
 		}
 	}
+	psi_write_end(cpu);
 }
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev)
 {
 	int cpu = task_cpu(curr);
-	struct psi_group *group;
 	struct psi_group_cpu *groupc;
 	s64 delta;
 	u64 irq;
+	u64 now;
 
 	if (static_branch_likely(&psi_disabled) || !irqtime_enabled())
 		return;
@@ -1009,8 +1022,7 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st
 		return;
 
 	lockdep_assert_rq_held(rq);
-	group = task_psi_group(curr);
-	if (prev && task_psi_group(prev) == group)
+	if (prev && task_psi_group(prev) == task_psi_group(curr))
 		return;
 
 	irq = irq_time_read(cpu);
@@ -1019,25 +1031,22 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st
 		return;
 	rq->psi_irq_time = irq;
 
-	do {
-		u64 now;
+	psi_write_begin(cpu);
+	now = cpu_clock(cpu);
 
+	for_each_group(group, task_psi_group(curr)) {
 		if (!group->enabled)
 			continue;
 
 		groupc = per_cpu_ptr(group->pcpu, cpu);
 
-		write_seqcount_begin(&groupc->seq);
-		now = cpu_clock(cpu);
-
 		record_times(groupc, now);
 		groupc->times[PSI_IRQ_FULL] += delta;
 
-		write_seqcount_end(&groupc->seq);
-
 		if (group->rtpoll_states & (1 << PSI_IRQ_FULL))
 			psi_schedule_rtpoll_work(group, 1, false);
-	} while ((group = group->parent));
+	}
+	psi_write_end(cpu);
 }
 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
 
@@ -1225,12 +1234,14 @@ void psi_cgroup_restart(struct psi_group *group)
 		return;
 
 	for_each_possible_cpu(cpu) {
-		struct rq *rq = cpu_rq(cpu);
-		struct rq_flags rf;
+		u64 now;
 
-		rq_lock_irq(rq, &rf);
-		psi_group_change(group, cpu, 0, 0, true);
-		rq_unlock_irq(rq, &rf);
+		guard(rq_lock_irq)(cpu_rq(cpu));
+
+		psi_write_begin(cpu);
+		now = cpu_clock(cpu);
+		psi_group_change(group, cpu, 0, 0, now, true);
+		psi_write_end(cpu);
 	}
 }
 #endif /* CONFIG_CGROUPS */
-- 
cgit v1.2.3


From cccb45d7c4295bbfeba616582d0249f2d21e6df5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 20 May 2025 11:19:30 +0200
Subject: sched/deadline: Less aggressive dl_server handling

Chris reported that commit 5f6bd380c7bd ("sched/rt: Remove default
bandwidth control") caused a significant dip in his favourite
benchmark of the day. Simply disabling dl_server cured things.

His workload hammers the 0->1, 1->0 transitions, and the
dl_server_{start,stop}() overhead kills it -- fairly obviously a bad
idea in hindsight and all that.

Change things around to only disable the dl_server when there has not
been a fair task around for a whole period. Since the default period
is 1 second, this ensures the benchmark never trips this, overhead
gone.

Fixes: 557a6bfc662c ("sched/fair: Add trivial fair server")
Reported-by: Chris Mason <clm@meta.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
Acked-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://lkml.kernel.org/r/20250702121158.465086194@infradead.org
---
 include/linux/sched.h   |  1 +
 kernel/sched/deadline.c | 25 ++++++++++++++++++++++---
 kernel/sched/fair.c     |  9 ---------
 3 files changed, 23 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index eec6b225e9d1..4802fcf738cd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -698,6 +698,7 @@ struct sched_dl_entity {
 	unsigned int			dl_defer	  : 1;
 	unsigned int			dl_defer_armed	  : 1;
 	unsigned int			dl_defer_running  : 1;
+	unsigned int			dl_server_idle    : 1;
 
 	/*
 	 * Bandwidth enforcement timer. Each -deadline task has its
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 0f30697ad795..23668fc60bd3 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1150,6 +1150,8 @@ static void __push_dl_task(struct rq *rq, struct rq_flags *rf)
 /* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */
 static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC;
 
+static bool dl_server_stopped(struct sched_dl_entity *dl_se);
+
 static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se)
 {
 	struct rq *rq = rq_of_dl_se(dl_se);
@@ -1169,6 +1171,7 @@ static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_
 
 		if (!dl_se->server_has_tasks(dl_se)) {
 			replenish_dl_entity(dl_se);
+			dl_server_stopped(dl_se);
 			return HRTIMER_NORESTART;
 		}
 
@@ -1572,8 +1575,10 @@ void dl_server_update_idle_time(struct rq *rq, struct task_struct *p)
 void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
 {
 	/* 0 runtime = fair server disabled */
-	if (dl_se->dl_runtime)
+	if (dl_se->dl_runtime) {
+		dl_se->dl_server_idle = 0;
 		update_curr_dl_se(dl_se->rq, dl_se, delta_exec);
+	}
 }
 
 void dl_server_start(struct sched_dl_entity *dl_se)
@@ -1596,7 +1601,7 @@ void dl_server_start(struct sched_dl_entity *dl_se)
 		setup_new_dl_entity(dl_se);
 	}
 
-	if (!dl_se->dl_runtime)
+	if (!dl_se->dl_runtime || dl_se->dl_server_active)
 		return;
 
 	dl_se->dl_server_active = 1;
@@ -1617,6 +1622,20 @@ void dl_server_stop(struct sched_dl_entity *dl_se)
 	dl_se->dl_server_active = 0;
 }
 
+static bool dl_server_stopped(struct sched_dl_entity *dl_se)
+{
+	if (!dl_se->dl_server_active)
+		return false;
+
+	if (dl_se->dl_server_idle) {
+		dl_server_stop(dl_se);
+		return true;
+	}
+
+	dl_se->dl_server_idle = 1;
+	return false;
+}
+
 void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
 		    dl_server_has_tasks_f has_tasks,
 		    dl_server_pick_f pick_task)
@@ -2354,7 +2373,7 @@ again:
 	if (dl_server(dl_se)) {
 		p = dl_se->server_pick_task(dl_se);
 		if (!p) {
-			if (dl_server_active(dl_se)) {
+			if (!dl_server_stopped(dl_se)) {
 				dl_se->dl_yielded = 1;
 				update_curr_dl_se(rq, dl_se, 0);
 			}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ab0822cc51c2..a1350c513a87 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5802,7 +5802,6 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
 	struct sched_entity *se;
 	long queued_delta, runnable_delta, idle_delta, dequeue = 1;
-	long rq_h_nr_queued = rq->cfs.h_nr_queued;
 
 	raw_spin_lock(&cfs_b->lock);
 	/* This will start the period timer if necessary */
@@ -5886,10 +5885,6 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 
 	/* At this point se is NULL and we are at root level*/
 	sub_nr_running(rq, queued_delta);
-
-	/* Stop the fair server if throttling resulted in no runnable tasks */
-	if (rq_h_nr_queued && !rq->cfs.h_nr_queued)
-		dl_server_stop(&rq->fair_server);
 done:
 	/*
 	 * Note: distribution will already see us throttled via the
@@ -6966,7 +6961,6 @@ static void set_next_buddy(struct sched_entity *se);
 static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 {
 	bool was_sched_idle = sched_idle_rq(rq);
-	int rq_h_nr_queued = rq->cfs.h_nr_queued;
 	bool task_sleep = flags & DEQUEUE_SLEEP;
 	bool task_delayed = flags & DEQUEUE_DELAYED;
 	struct task_struct *p = NULL;
@@ -7050,9 +7044,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 
 	sub_nr_running(rq, h_nr_queued);
 
-	if (rq_h_nr_queued && !rq->cfs.h_nr_queued)
-		dl_server_stop(&rq->fair_server);
-
 	/* balance early to pull high priority tasks */
 	if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
 		rq->next_balance = jiffies;
-- 
cgit v1.2.3


From 74eec63661d46a7153d04c2e0249eeb76cc76d44 Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Tue, 8 Jul 2025 18:56:26 +0200
Subject: sched/fair: Fix NO_RUN_TO_PARITY case

EEVDF expects the scheduler to allocate a time quantum to the selected
entity and then pick a new entity for next quantum.
Although this notion of time quantum is not strictly doable in our case,
we can ensure a minimum runtime for each task most of the time and pick a
new entity after a minimum time has elapsed.
Reuse the slice protection of run to parity to ensure such runtime
quantum.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20250708165630.1948751-3-vincent.guittot@linaro.org
---
 include/linux/sched.h | 10 +++++++++-
 kernel/sched/fair.c   | 31 ++++++++++++++++++++-----------
 2 files changed, 29 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4802fcf738cd..55921385927d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -583,7 +583,15 @@ struct sched_entity {
 	u64				sum_exec_runtime;
 	u64				prev_sum_exec_runtime;
 	u64				vruntime;
-	s64				vlag;
+	union {
+		/*
+		 * When !@on_rq this field is vlag.
+		 * When cfs_rq->curr == se (which implies @on_rq)
+		 * this field is vprot. See protect_slice().
+		 */
+		s64                     vlag;
+		u64                     vprot;
+	};
 	u64				slice;
 
 	u64				nr_migrations;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 43fe5c831dd5..8d288df79a6b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -882,23 +882,35 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
 }
 
 /*
- * HACK, stash a copy of deadline at the point of pick in vlag,
- * which isn't used until dequeue.
+ * Set the vruntime up to which an entity can run before looking
+ * for another entity to pick.
+ * In case of run to parity, we protect the entity up to its deadline.
+ * When run to parity is disabled, we give a minimum quantum to the running
+ * entity to ensure progress.
  */
 static inline void set_protect_slice(struct sched_entity *se)
 {
-	se->vlag = se->deadline;
+	u64 slice = se->slice;
+	u64 vprot = se->deadline;
+
+	if (!sched_feat(RUN_TO_PARITY))
+		slice = min(slice, normalized_sysctl_sched_base_slice);
+
+	if (slice != se->slice)
+		vprot = min_vruntime(vprot, se->vruntime + calc_delta_fair(slice, se));
+
+	se->vprot = vprot;
 }
 
 static inline bool protect_slice(struct sched_entity *se)
 {
-	return se->vlag == se->deadline;
+	return ((s64)(se->vprot - se->vruntime) > 0);
 }
 
 static inline void cancel_protect_slice(struct sched_entity *se)
 {
 	if (protect_slice(se))
-		se->vlag = se->deadline + 1;
+		se->vprot = se->vruntime;
 }
 
 /*
@@ -937,7 +949,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
 	if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
 		curr = NULL;
 
-	if (sched_feat(RUN_TO_PARITY) && curr && protect_slice(curr))
+	if (curr && protect_slice(curr))
 		return curr;
 
 	/* Pick the leftmost entity if it's eligible */
@@ -1156,11 +1168,8 @@ static inline void update_curr_task(struct task_struct *p, s64 delta_exec)
 	cgroup_account_cputime(p, delta_exec);
 }
 
-static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr)
+static inline bool resched_next_slice(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 {
-	if (!sched_feat(PREEMPT_SHORT))
-		return false;
-
 	if (protect_slice(curr))
 		return false;
 
@@ -1248,7 +1257,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
 	if (cfs_rq->nr_queued == 1)
 		return;
 
-	if (resched || did_preempt_short(cfs_rq, curr)) {
+	if (resched || resched_next_slice(cfs_rq, curr)) {
 		resched_curr_lazy(rq);
 		clear_buddies(cfs_rq, curr);
 	}
-- 
cgit v1.2.3


From f6bfc9afc7510cb5e6fbe0a17c507917b0120280 Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Mon, 7 Jul 2025 15:11:55 +0200
Subject: drm/framebuffer: Acquire internal references on GEM handles
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Acquire GEM handles in drm_framebuffer_init() and release them in
the corresponding drm_framebuffer_cleanup(). Ties the handle's
lifetime to the framebuffer. Not all GEM buffer objects have GEM
handles. If not set, no refcounting takes place. This is the case
for some fbdev emulation. This is not a problem as these GEM objects
do not use dma-bufs and drivers will not release them while fbdev
emulation is running. Framebuffer flags keep a bit per color plane
of which the framebuffer holds a GEM handle reference.

As all drivers use drm_framebuffer_init(), they will now all hold
dma-buf references as fixed in commit 5307dce878d4 ("drm/gem: Acquire
references on GEM handles for framebuffers").

In the GEM framebuffer helpers, restore the original ref counting
on buffer objects. As the helpers for handle refcounting are now
no longer called from outside the DRM core, unexport the symbols.

v3:
- don't mix internal flags with mode flags (Christian)
v2:
- track framebuffer handle refs by flag
- drop gma500 cleanup (Christian)

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Fixes: 5307dce878d4 ("drm/gem: Acquire references on GEM handles for framebuffers")
Reported-by: Bert Karwatzki <spasswolf@web.de>
Closes: https://lore.kernel.org/dri-devel/20250703115915.3096-1-spasswolf@web.de/
Tested-by: Bert Karwatzki <spasswolf@web.de>
Tested-by: Mario Limonciello <superm1@kernel.org>
Tested-by: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Thomas Zimmermann <tzimmermann@suse.de>
Cc: Anusha Srivatsa <asrivats@redhat.com>
Cc: Christian König <christian.koenig@amd.com>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Maxime Ripard <mripard@kernel.org>
Cc: Sumit Semwal <sumit.semwal@linaro.org>
Cc: "Christian König" <christian.koenig@amd.com>
Cc: linux-media@vger.kernel.org
Cc: dri-devel@lists.freedesktop.org
Cc: linaro-mm-sig@lists.linaro.org
Cc: <stable@vger.kernel.org>
Reviewed-by: Christian König <christian.koenig@amd.com>
Link: https://lore.kernel.org/r/20250707131224.249496-1-tzimmermann@suse.de
---
 drivers/gpu/drm/drm_framebuffer.c            | 31 +++++++++++++++++++++--
 drivers/gpu/drm/drm_gem.c                    | 38 ++++++++++++++++++----------
 drivers/gpu/drm/drm_gem_framebuffer_helper.c | 16 +++++-------
 drivers/gpu/drm/drm_internal.h               |  2 +-
 include/drm/drm_framebuffer.h                |  7 +++++
 5 files changed, 68 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_framebuffer.c b/drivers/gpu/drm/drm_framebuffer.c
index b781601946db..63a70f285cce 100644
--- a/drivers/gpu/drm/drm_framebuffer.c
+++ b/drivers/gpu/drm/drm_framebuffer.c
@@ -862,11 +862,23 @@ EXPORT_SYMBOL_FOR_TESTS_ONLY(drm_framebuffer_free);
 int drm_framebuffer_init(struct drm_device *dev, struct drm_framebuffer *fb,
 			 const struct drm_framebuffer_funcs *funcs)
 {
+	unsigned int i;
 	int ret;
+	bool exists;
 
 	if (WARN_ON_ONCE(fb->dev != dev || !fb->format))
 		return -EINVAL;
 
+	for (i = 0; i < fb->format->num_planes; i++) {
+		if (drm_WARN_ON_ONCE(dev, fb->internal_flags & DRM_FRAMEBUFFER_HAS_HANDLE_REF(i)))
+			fb->internal_flags &= ~DRM_FRAMEBUFFER_HAS_HANDLE_REF(i);
+		if (fb->obj[i]) {
+			exists = drm_gem_object_handle_get_if_exists_unlocked(fb->obj[i]);
+			if (exists)
+				fb->internal_flags |= DRM_FRAMEBUFFER_HAS_HANDLE_REF(i);
+		}
+	}
+
 	INIT_LIST_HEAD(&fb->filp_head);
 
 	fb->funcs = funcs;
@@ -875,7 +887,7 @@ int drm_framebuffer_init(struct drm_device *dev, struct drm_framebuffer *fb,
 	ret = __drm_mode_object_add(dev, &fb->base, DRM_MODE_OBJECT_FB,
 				    false, drm_framebuffer_free);
 	if (ret)
-		goto out;
+		goto err;
 
 	mutex_lock(&dev->mode_config.fb_lock);
 	dev->mode_config.num_fb++;
@@ -883,7 +895,16 @@ int drm_framebuffer_init(struct drm_device *dev, struct drm_framebuffer *fb,
 	mutex_unlock(&dev->mode_config.fb_lock);
 
 	drm_mode_object_register(dev, &fb->base);
-out:
+
+	return 0;
+
+err:
+	for (i = 0; i < fb->format->num_planes; i++) {
+		if (fb->internal_flags & DRM_FRAMEBUFFER_HAS_HANDLE_REF(i)) {
+			drm_gem_object_handle_put_unlocked(fb->obj[i]);
+			fb->internal_flags &= ~DRM_FRAMEBUFFER_HAS_HANDLE_REF(i);
+		}
+	}
 	return ret;
 }
 EXPORT_SYMBOL(drm_framebuffer_init);
@@ -960,6 +981,12 @@ EXPORT_SYMBOL(drm_framebuffer_unregister_private);
 void drm_framebuffer_cleanup(struct drm_framebuffer *fb)
 {
 	struct drm_device *dev = fb->dev;
+	unsigned int i;
+
+	for (i = 0; i < fb->format->num_planes; i++) {
+		if (fb->internal_flags & DRM_FRAMEBUFFER_HAS_HANDLE_REF(i))
+			drm_gem_object_handle_put_unlocked(fb->obj[i]);
+	}
 
 	mutex_lock(&dev->mode_config.fb_lock);
 	list_del(&fb->head);
diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c
index 4bf0a76bb35e..aad6ac9748cc 100644
--- a/drivers/gpu/drm/drm_gem.c
+++ b/drivers/gpu/drm/drm_gem.c
@@ -223,23 +223,34 @@ static void drm_gem_object_handle_get(struct drm_gem_object *obj)
 }
 
 /**
- * drm_gem_object_handle_get_unlocked - acquire reference on user-space handles
+ * drm_gem_object_handle_get_if_exists_unlocked - acquire reference on user-space handle, if any
  * @obj: GEM object
  *
- * Acquires a reference on the GEM buffer object's handle. Required
- * to keep the GEM object alive. Call drm_gem_object_handle_put_unlocked()
- * to release the reference.
+ * Acquires a reference on the GEM buffer object's handle. Required to keep
+ * the GEM object alive. Call drm_gem_object_handle_put_unlocked()
+ * to release the reference. Does nothing if the buffer object has no handle.
+ *
+ * Returns:
+ * True if a handle exists, or false otherwise
  */
-void drm_gem_object_handle_get_unlocked(struct drm_gem_object *obj)
+bool drm_gem_object_handle_get_if_exists_unlocked(struct drm_gem_object *obj)
 {
 	struct drm_device *dev = obj->dev;
 
 	guard(mutex)(&dev->object_name_lock);
 
-	drm_WARN_ON(dev, !obj->handle_count); /* first ref taken in create-tail helper */
+	/*
+	 * First ref taken during GEM object creation, if any. Some
+	 * drivers set up internal framebuffers with GEM objects that
+	 * do not have a GEM handle. Hence, this counter can be zero.
+	 */
+	if (!obj->handle_count)
+		return false;
+
 	drm_gem_object_handle_get(obj);
+
+	return true;
 }
-EXPORT_SYMBOL(drm_gem_object_handle_get_unlocked);
 
 /**
  * drm_gem_object_handle_free - release resources bound to userspace handles
@@ -272,7 +283,7 @@ static void drm_gem_object_exported_dma_buf_free(struct drm_gem_object *obj)
 }
 
 /**
- * drm_gem_object_handle_put_unlocked - releases reference on user-space handles
+ * drm_gem_object_handle_put_unlocked - releases reference on user-space handle
  * @obj: GEM object
  *
  * Releases a reference on the GEM buffer object's handle. Possibly releases
@@ -283,14 +294,14 @@ void drm_gem_object_handle_put_unlocked(struct drm_gem_object *obj)
 	struct drm_device *dev = obj->dev;
 	bool final = false;
 
-	if (WARN_ON(READ_ONCE(obj->handle_count) == 0))
+	if (drm_WARN_ON(dev, READ_ONCE(obj->handle_count) == 0))
 		return;
 
 	/*
-	* Must bump handle count first as this may be the last
-	* ref, in which case the object would disappear before we
-	* checked for a name
-	*/
+	 * Must bump handle count first as this may be the last
+	 * ref, in which case the object would disappear before
+	 * we checked for a name.
+	 */
 
 	mutex_lock(&dev->object_name_lock);
 	if (--obj->handle_count == 0) {
@@ -303,7 +314,6 @@ void drm_gem_object_handle_put_unlocked(struct drm_gem_object *obj)
 	if (final)
 		drm_gem_object_put(obj);
 }
-EXPORT_SYMBOL(drm_gem_object_handle_put_unlocked);
 
 /*
  * Called at device or object close to release the file's
diff --git a/drivers/gpu/drm/drm_gem_framebuffer_helper.c b/drivers/gpu/drm/drm_gem_framebuffer_helper.c
index 14a87788695d..6f72e7a0f427 100644
--- a/drivers/gpu/drm/drm_gem_framebuffer_helper.c
+++ b/drivers/gpu/drm/drm_gem_framebuffer_helper.c
@@ -99,7 +99,7 @@ void drm_gem_fb_destroy(struct drm_framebuffer *fb)
 	unsigned int i;
 
 	for (i = 0; i < fb->format->num_planes; i++)
-		drm_gem_object_handle_put_unlocked(fb->obj[i]);
+		drm_gem_object_put(fb->obj[i]);
 
 	drm_framebuffer_cleanup(fb);
 	kfree(fb);
@@ -182,10 +182,8 @@ int drm_gem_fb_init_with_funcs(struct drm_device *dev,
 		if (!objs[i]) {
 			drm_dbg_kms(dev, "Failed to lookup GEM object\n");
 			ret = -ENOENT;
-			goto err_gem_object_handle_put_unlocked;
+			goto err_gem_object_put;
 		}
-		drm_gem_object_handle_get_unlocked(objs[i]);
-		drm_gem_object_put(objs[i]);
 
 		min_size = (height - 1) * mode_cmd->pitches[i]
 			 + drm_format_info_min_pitch(info, i, width)
@@ -195,22 +193,22 @@ int drm_gem_fb_init_with_funcs(struct drm_device *dev,
 			drm_dbg_kms(dev,
 				    "GEM object size (%zu) smaller than minimum size (%u) for plane %d\n",
 				    objs[i]->size, min_size, i);
-			drm_gem_object_handle_put_unlocked(objs[i]);
+			drm_gem_object_put(objs[i]);
 			ret = -EINVAL;
-			goto err_gem_object_handle_put_unlocked;
+			goto err_gem_object_put;
 		}
 	}
 
 	ret = drm_gem_fb_init(dev, fb, mode_cmd, objs, i, funcs);
 	if (ret)
-		goto err_gem_object_handle_put_unlocked;
+		goto err_gem_object_put;
 
 	return 0;
 
-err_gem_object_handle_put_unlocked:
+err_gem_object_put:
 	while (i > 0) {
 		--i;
-		drm_gem_object_handle_put_unlocked(objs[i]);
+		drm_gem_object_put(objs[i]);
 	}
 	return ret;
 }
diff --git a/drivers/gpu/drm/drm_internal.h b/drivers/gpu/drm/drm_internal.h
index be77d61a16ce..60c282881958 100644
--- a/drivers/gpu/drm/drm_internal.h
+++ b/drivers/gpu/drm/drm_internal.h
@@ -161,7 +161,7 @@ void drm_sysfs_lease_event(struct drm_device *dev);
 
 /* drm_gem.c */
 int drm_gem_init(struct drm_device *dev);
-void drm_gem_object_handle_get_unlocked(struct drm_gem_object *obj);
+bool drm_gem_object_handle_get_if_exists_unlocked(struct drm_gem_object *obj);
 void drm_gem_object_handle_put_unlocked(struct drm_gem_object *obj);
 int drm_gem_handle_create_tail(struct drm_file *file_priv,
 			       struct drm_gem_object *obj,
diff --git a/include/drm/drm_framebuffer.h b/include/drm/drm_framebuffer.h
index 668077009fce..38b24fc8978d 100644
--- a/include/drm/drm_framebuffer.h
+++ b/include/drm/drm_framebuffer.h
@@ -23,6 +23,7 @@
 #ifndef __DRM_FRAMEBUFFER_H__
 #define __DRM_FRAMEBUFFER_H__
 
+#include <linux/bits.h>
 #include <linux/ctype.h>
 #include <linux/list.h>
 #include <linux/sched.h>
@@ -100,6 +101,8 @@ struct drm_framebuffer_funcs {
 		     unsigned num_clips);
 };
 
+#define DRM_FRAMEBUFFER_HAS_HANDLE_REF(_i)	BIT(0u + (_i))
+
 /**
  * struct drm_framebuffer - frame buffer object
  *
@@ -188,6 +191,10 @@ struct drm_framebuffer {
 	 * DRM_MODE_FB_MODIFIERS.
 	 */
 	int flags;
+	/**
+	 * @internal_flags: Framebuffer flags like DRM_FRAMEBUFFER_HAS_HANDLE_REF.
+	 */
+	unsigned int internal_flags;
 	/**
 	 * @filp_head: Placed on &drm_file.fbs, protected by &drm_file.fbs_lock.
 	 */
-- 
cgit v1.2.3


From bd46cece51a36ef088f22ef0416ac13b0a46d5b0 Mon Sep 17 00:00:00 2001
From: Simona Vetter <simona.vetter@ffwll.ch>
Date: Mon, 7 Jul 2025 17:18:13 +0200
Subject: drm/gem: Fix race in drm_gem_handle_create_tail()

Object creation is a careful dance where we must guarantee that the
object is fully constructed before it is visible to other threads, and
GEM buffer objects are no different.

Final publishing happens by calling drm_gem_handle_create(). After
that the only allowed thing to do is call drm_gem_object_put() because
a concurrent call to the GEM_CLOSE ioctl with a correctly guessed id
(which is trivial since we have a linear allocator) can already tear
down the object again.

Luckily most drivers get this right, the very few exceptions I've
pinged the relevant maintainers for. Unfortunately we also need
drm_gem_handle_create() when creating additional handles for an
already existing object (e.g. GETFB ioctl or the various bo import
ioctl), and hence we cannot have a drm_gem_handle_create_and_put() as
the only exported function to stop these issues from happening.

Now unfortunately the implementation of drm_gem_handle_create() isn't
living up to standards: It does correctly finish object
initialization at the global level, and hence is safe against a
concurrent tear down. But it also sets up the file-private aspects of
the handle, and that part goes wrong: We fully register the object in
the drm_file.object_idr before calling drm_vma_node_allow() or
obj->funcs->open, which opens up races against concurrent removal of
that handle in drm_gem_handle_delete().

Fix this with the usual two-stage approach of first reserving the
handle id, and then only registering the object after we've completed
the file-private setup.

Jacek reported this with a testcase of concurrently calling GEM_CLOSE
on a freshly-created object (which also destroys the object), but it
should be possible to hit this with just additional handles created
through import or GETFB without completely destroying the underlying
object with the concurrent GEM_CLOSE ioctl calls.

Note that the close-side of this race was fixed in f6cd7daecff5 ("drm:
Release driver references to handle before making it available
again"), which means a cool 9 years have passed until someone noticed
that we need to make this symmetric or there are still gaps left :-/
Without the 2-stage close approach we'd still have a race, therefore
that's an integral part of this bugfix.

More importantly, this means we can have NULL pointers behind
allocated id in our drm_file.object_idr. We need to check for that
now:

- drm_gem_handle_delete() checks for ERR_OR_NULL already

- drm_gem.c:object_lookup() also checks for NULL

- drm_gem_release() should never be called if there's another thread
  still existing that could call into an IOCTL that creates a new
  handle, so cannot race. For paranoia I added a NULL check to
  drm_gem_object_release_handle() though.

- most drivers (etnaviv, i915, msm) are fine because they use
  idr_find(), which maps both ENOENT and NULL to NULL.

- drivers using idr_for_each_entry() should also be fine, because
  idr_get_next does filter out NULL entries and continues the
  iteration.

- The same holds for drm_show_memory_stats().

v2: Use drm_WARN_ON (Thomas)

Reported-by: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
Tested-by: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
Reviewed-by: Thomas Zimmermann <tzimmermann@suse.de>
Cc: stable@vger.kernel.org
Cc: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Maxime Ripard <mripard@kernel.org>
Cc: Thomas Zimmermann <tzimmermann@suse.de>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Signed-off-by: Simona Vetter <simona.vetter@intel.com>
Signed-off-by: Simona Vetter <simona.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20250707151814.603897-1-simona.vetter@ffwll.ch
---
 drivers/gpu/drm/drm_gem.c | 10 +++++++++-
 include/drm/drm_file.h    |  3 +++
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c
index aad6ac9748cc..ac0524595bd6 100644
--- a/drivers/gpu/drm/drm_gem.c
+++ b/drivers/gpu/drm/drm_gem.c
@@ -325,6 +325,9 @@ drm_gem_object_release_handle(int id, void *ptr, void *data)
 	struct drm_file *file_priv = data;
 	struct drm_gem_object *obj = ptr;
 
+	if (drm_WARN_ON(obj->dev, !data))
+		return 0;
+
 	if (obj->funcs->close)
 		obj->funcs->close(obj, file_priv);
 
@@ -445,7 +448,7 @@ drm_gem_handle_create_tail(struct drm_file *file_priv,
 	idr_preload(GFP_KERNEL);
 	spin_lock(&file_priv->table_lock);
 
-	ret = idr_alloc(&file_priv->object_idr, obj, 1, 0, GFP_NOWAIT);
+	ret = idr_alloc(&file_priv->object_idr, NULL, 1, 0, GFP_NOWAIT);
 
 	spin_unlock(&file_priv->table_lock);
 	idr_preload_end();
@@ -466,6 +469,11 @@ drm_gem_handle_create_tail(struct drm_file *file_priv,
 			goto err_revoke;
 	}
 
+	/* mirrors drm_gem_handle_delete to avoid races */
+	spin_lock(&file_priv->table_lock);
+	obj = idr_replace(&file_priv->object_idr, obj, handle);
+	WARN_ON(obj != NULL);
+	spin_unlock(&file_priv->table_lock);
 	*handlep = handle;
 	return 0;
 
diff --git a/include/drm/drm_file.h b/include/drm/drm_file.h
index 5c3b2aa3e69d..d344d41e6cfe 100644
--- a/include/drm/drm_file.h
+++ b/include/drm/drm_file.h
@@ -300,6 +300,9 @@ struct drm_file {
 	 *
 	 * Mapping of mm object handles to object pointers. Used by the GEM
 	 * subsystem. Protected by @table_lock.
+	 *
+	 * Note that allocated entries might be NULL as a transient state when
+	 * creating or deleting a handle.
 	 */
 	struct idr object_idr;
 
-- 
cgit v1.2.3


From a7cec20845a67ff4f3c924255519341f37d993f9 Mon Sep 17 00:00:00 2001
From: Jim Mattson <jmattson@google.com>
Date: Wed, 25 Jun 2025 17:12:22 -0700
Subject: KVM: x86: Provide a capability to disable APERF/MPERF read intercepts

Allow a guest to read the physical IA32_APERF and IA32_MPERF MSRs
without interception.

The IA32_APERF and IA32_MPERF MSRs are not virtualized. Writes are not
handled at all. The MSR values are not zeroed on vCPU creation, saved
on suspend, or restored on resume. No accommodation is made for
processor migration or for sharing a logical processor with other
tasks. No adjustments are made for non-unit TSC multipliers. The MSRs
do not account for time the same way as the comparable PMU events,
whether the PMU is virtualized by the traditional emulation method or
the new mediated pass-through approach.

Nonetheless, in a properly constrained environment, this capability
can be combined with a guest CPUID table that advertises support for
CPUID.6:ECX.APERFMPERF[bit 0] to induce a Linux guest to report the
effective physical CPU frequency in /proc/cpuinfo. Moreover, there is
no performance cost for this capability.

Signed-off-by: Jim Mattson <jmattson@google.com>
Link: https://lore.kernel.org/r/20250530185239.2335185-3-jmattson@google.com
Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>
Link: https://lore.kernel.org/r/20250626001225.744268-3-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 Documentation/virt/kvm/api.rst | 23 +++++++++++++++++++++++
 arch/x86/kvm/svm/nested.c      |  4 +++-
 arch/x86/kvm/svm/svm.c         |  5 +++++
 arch/x86/kvm/vmx/nested.c      |  6 ++++++
 arch/x86/kvm/vmx/vmx.c         |  4 ++++
 arch/x86/kvm/x86.c             |  6 +++++-
 arch/x86/kvm/x86.h             |  5 +++++
 include/uapi/linux/kvm.h       |  1 +
 tools/include/uapi/linux/kvm.h |  1 +
 9 files changed, 53 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index f0d961436d0f..6be1ddedec49 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -7844,6 +7844,7 @@ Valid bits in args[0] are::
   #define KVM_X86_DISABLE_EXITS_HLT              (1 << 1)
   #define KVM_X86_DISABLE_EXITS_PAUSE            (1 << 2)
   #define KVM_X86_DISABLE_EXITS_CSTATE           (1 << 3)
+  #define KVM_X86_DISABLE_EXITS_APERFMPERF       (1 << 4)
 
 Enabling this capability on a VM provides userspace with a way to no
 longer intercept some instructions for improved latency in some
@@ -7854,6 +7855,28 @@ all such vmexits.
 
 Do not enable KVM_FEATURE_PV_UNHALT if you disable HLT exits.
 
+Virtualizing the ``IA32_APERF`` and ``IA32_MPERF`` MSRs requires more
+than just disabling APERF/MPERF exits. While both Intel and AMD
+document strict usage conditions for these MSRs--emphasizing that only
+the ratio of their deltas over a time interval (T0 to T1) is
+architecturally defined--simply passing through the MSRs can still
+produce an incorrect ratio.
+
+This erroneous ratio can occur if, between T0 and T1:
+
+1. The vCPU thread migrates between logical processors.
+2. Live migration or suspend/resume operations take place.
+3. Another task shares the vCPU's logical processor.
+4. C-states lower than C0 are emulated (e.g., via HLT interception).
+5. The guest TSC frequency doesn't match the host TSC frequency.
+
+Due to these complexities, KVM does not automatically associate this
+passthrough capability with the guest CPUID bit,
+``CPUID.6:ECX.APERFMPERF[bit 0]``. Userspace VMMs that deem this
+mechanism adequate for virtualizing the ``IA32_APERF`` and
+``IA32_MPERF`` MSRs must set the guest CPUID bit explicitly.
+
+
 7.14 KVM_CAP_S390_HPAGE_1M
 --------------------------
 
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 749f7b866ac8..b7fd2e869998 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -194,7 +194,7 @@ void recalc_intercepts(struct vcpu_svm *svm)
  * Hardcode the capacity of the array based on the maximum number of _offsets_.
  * MSRs are batched together, so there are fewer offsets than MSRs.
  */
-static int nested_svm_msrpm_merge_offsets[6] __ro_after_init;
+static int nested_svm_msrpm_merge_offsets[7] __ro_after_init;
 static int nested_svm_nr_msrpm_merge_offsets __ro_after_init;
 typedef unsigned long nsvm_msrpm_merge_t;
 
@@ -216,6 +216,8 @@ int __init nested_svm_init_msrpm_merge_offsets(void)
 		MSR_IA32_SPEC_CTRL,
 		MSR_IA32_PRED_CMD,
 		MSR_IA32_FLUSH_CMD,
+		MSR_IA32_APERF,
+		MSR_IA32_MPERF,
 		MSR_IA32_LASTBRANCHFROMIP,
 		MSR_IA32_LASTBRANCHTOIP,
 		MSR_IA32_LASTINTFROMIP,
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 1261447ffcdd..fedf68c63318 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -838,6 +838,11 @@ static void svm_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
 	svm_set_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW,
 				  guest_cpuid_is_intel_compatible(vcpu));
 
+	if (kvm_aperfmperf_in_guest(vcpu->kvm)) {
+		svm_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R);
+		svm_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R);
+	}
+
 	if (sev_es_guest(vcpu->kvm))
 		sev_es_recalc_msr_intercepts(vcpu);
 
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index c69df3aba8d1..b8ea1969113d 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -715,6 +715,12 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
 					 MSR_IA32_FLUSH_CMD, MSR_TYPE_W);
 
+	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+					 MSR_IA32_APERF, MSR_TYPE_R);
+
+	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+					 MSR_IA32_MPERF, MSR_TYPE_R);
+
 	kvm_vcpu_unmap(vcpu, &map);
 
 	vmx->nested.force_msr_bitmap_recalc = false;
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index b064e50c6e64..77bbb2b93418 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4084,6 +4084,10 @@ void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu)
 		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
 		vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
 	}
+	if (kvm_aperfmperf_in_guest(vcpu->kvm)) {
+		vmx_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R);
+		vmx_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R);
+	}
 
 	/* PT MSRs can be passed through iff PT is exposed to the guest. */
 	if (vmx_pt_mode_is_host_guest())
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6dda7bf4c44c..912260e3725d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4577,6 +4577,9 @@ static u64 kvm_get_allowed_disable_exits(void)
 {
 	u64 r = KVM_X86_DISABLE_EXITS_PAUSE;
 
+	if (boot_cpu_has(X86_FEATURE_APERFMPERF))
+		r |= KVM_X86_DISABLE_EXITS_APERFMPERF;
+
 	if (!mitigate_smt_rsb) {
 		r |= KVM_X86_DISABLE_EXITS_HLT |
 			KVM_X86_DISABLE_EXITS_CSTATE;
@@ -6613,7 +6616,8 @@ split_irqchip_unlock:
 
 		if (!mitigate_smt_rsb && boot_cpu_has_bug(X86_BUG_SMT_RSB) &&
 		    cpu_smt_possible() &&
-		    (cap->args[0] & ~KVM_X86_DISABLE_EXITS_PAUSE))
+		    (cap->args[0] & ~(KVM_X86_DISABLE_EXITS_PAUSE |
+				      KVM_X86_DISABLE_EXITS_APERFMPERF)))
 			pr_warn_once(SMT_RSB_MSG);
 
 		kvm_disable_exits(kvm, cap->args[0]);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 17ec8436e565..e77281b6e2b2 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -524,6 +524,11 @@ static inline bool kvm_cstate_in_guest(struct kvm *kvm)
 	return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_CSTATE;
 }
 
+static inline bool kvm_aperfmperf_in_guest(struct kvm *kvm)
+{
+	return kvm->arch.disabled_exits & KVM_X86_DISABLE_EXITS_APERFMPERF;
+}
+
 static inline bool kvm_notify_vmexit_enabled(struct kvm *kvm)
 {
 	return kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_ENABLED;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 7a4c35ff03fe..aeb2ca10b190 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -644,6 +644,7 @@ struct kvm_ioeventfd {
 #define KVM_X86_DISABLE_EXITS_HLT            (1 << 1)
 #define KVM_X86_DISABLE_EXITS_PAUSE          (1 << 2)
 #define KVM_X86_DISABLE_EXITS_CSTATE         (1 << 3)
+#define KVM_X86_DISABLE_EXITS_APERFMPERF     (1 << 4)
 
 /* for KVM_ENABLE_CAP */
 struct kvm_enable_cap {
diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
index b6ae8ad8934b..eef57c117140 100644
--- a/tools/include/uapi/linux/kvm.h
+++ b/tools/include/uapi/linux/kvm.h
@@ -617,6 +617,7 @@ struct kvm_ioeventfd {
 #define KVM_X86_DISABLE_EXITS_HLT            (1 << 1)
 #define KVM_X86_DISABLE_EXITS_PAUSE          (1 << 2)
 #define KVM_X86_DISABLE_EXITS_CSTATE         (1 << 3)
+#define KVM_X86_DISABLE_EXITS_APERFMPERF     (1 << 4)
 
 /* for KVM_ENABLE_CAP */
 struct kvm_enable_cap {
-- 
cgit v1.2.3


From 0af3ecdde58676f6c42eeec07d6816d5bf87ff88 Mon Sep 17 00:00:00 2001
From: Nam Cao <namcao@linutronix.de>
Date: Wed, 9 Jul 2025 21:21:13 +0200
Subject: printk: Make vprintk_deferred() public

vprintk_deferred() is useful for implementing runtime verification
reactors. Make it public.

Signed-off-by: Nam Cao <namcao@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/printk.h   | 7 +++++++
 kernel/printk/internal.h | 1 -
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/printk.h b/include/linux/printk.h
index 5b462029d03c..5d22b803f51e 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -154,6 +154,8 @@ int vprintk_emit(int facility, int level,
 
 asmlinkage __printf(1, 0)
 int vprintk(const char *fmt, va_list args);
+__printf(1, 0)
+int vprintk_deferred(const char *fmt, va_list args);
 
 asmlinkage __printf(1, 2) __cold
 int _printk(const char *fmt, ...);
@@ -214,6 +216,11 @@ int vprintk(const char *s, va_list args)
 {
 	return 0;
 }
+static inline __printf(1, 0)
+int vprintk_deferred(const char *fmt, va_list args)
+{
+	return 0;
+}
 static inline __printf(1, 2) __cold
 int _printk(const char *s, ...)
 {
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 48a24e7b309d..bbed41ad29cf 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -72,7 +72,6 @@ int vprintk_store(int facility, int level,
 		  const char *fmt, va_list args);
 
 __printf(1, 0) int vprintk_default(const char *fmt, va_list args);
-__printf(1, 0) int vprintk_deferred(const char *fmt, va_list args);
 
 void __printk_safe_enter(void);
 void __printk_safe_exit(void);
-- 
cgit v1.2.3


From 3f045de7f557850ca6b3632c6d45c2cdaf948694 Mon Sep 17 00:00:00 2001
From: Nam Cao <namcao@linutronix.de>
Date: Wed, 9 Jul 2025 21:21:14 +0200
Subject: panic: Add vpanic()

vpanic() is useful for implementing runtime verification reactors. Add it.

Signed-off-by: Nam Cao <namcao@linutronix.de>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/panic.h |  3 +++
 kernel/panic.c        | 16 ++++++++++++----
 2 files changed, 15 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/panic.h b/include/linux/panic.h
index 4adc65766935..0332c6d6771f 100644
--- a/include/linux/panic.h
+++ b/include/linux/panic.h
@@ -3,6 +3,7 @@
 #define _LINUX_PANIC_H
 
 #include <linux/compiler_attributes.h>
+#include <linux/stdarg.h>
 #include <linux/types.h>
 
 struct pt_regs;
@@ -10,6 +11,8 @@ struct pt_regs;
 extern long (*panic_blink)(int state);
 __printf(1, 2)
 void panic(const char *fmt, ...) __noreturn __cold;
+__printf(1, 0)
+void vpanic(const char *fmt, va_list args) __noreturn __cold;
 void nmi_panic(struct pt_regs *regs, const char *msg);
 void check_panic_on_warn(const char *origin);
 extern void oops_enter(void);
diff --git a/kernel/panic.c b/kernel/panic.c
index b0b9a8bf4560..6a1823c383d0 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -309,13 +309,13 @@ static void panic_other_cpus_shutdown(bool crash_kexec)
 /**
  * panic - halt the system
  * @fmt: The text string to print
+ * @args: Arguments for the format string
  *
  * Display a message, then perform cleanups. This function never returns.
  */
-void panic(const char *fmt, ...)
+void vpanic(const char *fmt, va_list args)
 {
 	static char buf[1024];
-	va_list args;
 	long i, i_next = 0, len;
 	int state = 0;
 	int old_cpu, this_cpu;
@@ -366,9 +366,7 @@ void panic(const char *fmt, ...)
 
 	console_verbose();
 	bust_spinlocks(1);
-	va_start(args, fmt);
 	len = vscnprintf(buf, sizeof(buf), fmt, args);
-	va_end(args);
 
 	if (len && buf[len - 1] == '\n')
 		buf[len - 1] = '\0';
@@ -505,7 +503,17 @@ void panic(const char *fmt, ...)
 		mdelay(PANIC_TIMER_STEP);
 	}
 }
+EXPORT_SYMBOL(vpanic);
 
+/* Identical to vpanic(), except it takes variadic arguments instead of va_list */
+void panic(const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	vpanic(fmt, args);
+	va_end(args);
+}
 EXPORT_SYMBOL(panic);
 
 #define TAINT_FLAG(taint, _c_true, _c_false, _module)			\
-- 
cgit v1.2.3


From ff4e233d8ab70fe6ae460ecc8c0e5b24dd0fedb0 Mon Sep 17 00:00:00 2001
From: Nam Cao <namcao@linutronix.de>
Date: Wed, 9 Jul 2025 21:21:15 +0200
Subject: rv: Let the reactors take care of buffers

Each RV monitor has one static buffer to send to the reactors. If multiple
errors are detected simultaneously, the one buffer could be overwritten.

Instead, leave it to the reactors to handle buffering.

Reviewed-by: Gabriele Monaco <gmonaco@redhat.com>
Signed-off-by: Nam Cao <namcao@linutronix.de>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/rv.h               |  9 ++++++--
 include/rv/da_monitor.h          | 45 +++++++++-------------------------------
 kernel/trace/rv/reactor_panic.c  |  8 +++++--
 kernel/trace/rv/reactor_printk.c |  8 +++++--
 kernel/trace/rv/rv_reactors.c    |  2 +-
 5 files changed, 30 insertions(+), 42 deletions(-)

(limited to 'include')

diff --git a/include/linux/rv.h b/include/linux/rv.h
index 3452b5e4b29e..9428e62eb8e9 100644
--- a/include/linux/rv.h
+++ b/include/linux/rv.h
@@ -38,7 +38,7 @@ union rv_task_monitor {
 struct rv_reactor {
 	const char		*name;
 	const char		*description;
-	void			(*react)(char *msg);
+	__printf(1, 2) void	(*react)(const char *msg, ...);
 };
 #endif
 
@@ -50,7 +50,7 @@ struct rv_monitor {
 	void			(*disable)(void);
 	void			(*reset)(void);
 #ifdef CONFIG_RV_REACTORS
-	void			(*react)(char *msg);
+	__printf(1, 2) void	(*react)(const char *msg, ...);
 #endif
 };
 
@@ -64,6 +64,11 @@ void rv_put_task_monitor_slot(int slot);
 bool rv_reacting_on(void);
 int rv_unregister_reactor(struct rv_reactor *reactor);
 int rv_register_reactor(struct rv_reactor *reactor);
+#else
+static inline bool rv_reacting_on(void)
+{
+	return false;
+}
 #endif /* CONFIG_RV_REACTORS */
 
 #endif /* CONFIG_RV */
diff --git a/include/rv/da_monitor.h b/include/rv/da_monitor.h
index 510c88bfabd4..15f9ed4e4bb6 100644
--- a/include/rv/da_monitor.h
+++ b/include/rv/da_monitor.h
@@ -19,45 +19,22 @@
 #ifdef CONFIG_RV_REACTORS
 
 #define DECLARE_RV_REACTING_HELPERS(name, type)							\
-static char REACT_MSG_##name[1024];								\
-												\
-static inline char *format_react_msg_##name(type curr_state, type event)			\
-{												\
-	snprintf(REACT_MSG_##name, 1024,							\
-		 "rv: monitor %s does not allow event %s on state %s\n",			\
-		 #name,										\
-		 model_get_event_name_##name(event),						\
-		 model_get_state_name_##name(curr_state));					\
-	return REACT_MSG_##name;								\
-}												\
-												\
-static void cond_react_##name(char *msg)							\
+static void cond_react_##name(type curr_state, type event)					\
 {												\
-	if (rv_##name.react)									\
-		rv_##name.react(msg);								\
-}												\
-												\
-static bool rv_reacting_on_##name(void)								\
-{												\
-	return rv_reacting_on();								\
+	if (!rv_reacting_on() || !rv_##name.react)						\
+		return;										\
+	rv_##name.react("rv: monitor %s does not allow event %s on state %s\n",			\
+			#name,									\
+			model_get_event_name_##name(event),					\
+			model_get_state_name_##name(curr_state));				\
 }
 
 #else /* CONFIG_RV_REACTOR */
 
 #define DECLARE_RV_REACTING_HELPERS(name, type)							\
-static inline char *format_react_msg_##name(type curr_state, type event)			\
-{												\
-	return NULL;										\
-}												\
-												\
-static void cond_react_##name(char *msg)							\
+static void cond_react_##name(type curr_state, type event)					\
 {												\
 	return;											\
-}												\
-												\
-static bool rv_reacting_on_##name(void)								\
-{												\
-	return 0;										\
 }
 #endif
 
@@ -170,8 +147,7 @@ da_event_##name(struct da_monitor *da_mon, enum events_##name event)				\
 		return true;									\
 	}											\
 												\
-	if (rv_reacting_on_##name())								\
-		cond_react_##name(format_react_msg_##name(curr_state, event));			\
+	cond_react_##name(curr_state, event);							\
 												\
 	trace_error_##name(model_get_state_name_##name(curr_state),				\
 			   model_get_event_name_##name(event));					\
@@ -202,8 +178,7 @@ static inline bool da_event_##name(struct da_monitor *da_mon, struct task_struct
 		return true;									\
 	}											\
 												\
-	if (rv_reacting_on_##name())								\
-		cond_react_##name(format_react_msg_##name(curr_state, event));			\
+	cond_react_##name(curr_state, event);							\
 												\
 	trace_error_##name(tsk->pid,								\
 			   model_get_state_name_##name(curr_state),				\
diff --git a/kernel/trace/rv/reactor_panic.c b/kernel/trace/rv/reactor_panic.c
index 0186ff4cbd0b..74c6bcc2c749 100644
--- a/kernel/trace/rv/reactor_panic.c
+++ b/kernel/trace/rv/reactor_panic.c
@@ -13,9 +13,13 @@
 #include <linux/init.h>
 #include <linux/rv.h>
 
-static void rv_panic_reaction(char *msg)
+__printf(1, 2) static void rv_panic_reaction(const char *msg, ...)
 {
-	panic(msg);
+	va_list args;
+
+	va_start(args, msg);
+	vpanic(msg, args);
+	va_end(args);
 }
 
 static struct rv_reactor rv_panic = {
diff --git a/kernel/trace/rv/reactor_printk.c b/kernel/trace/rv/reactor_printk.c
index 178759dbf89f..2dae2916c05f 100644
--- a/kernel/trace/rv/reactor_printk.c
+++ b/kernel/trace/rv/reactor_printk.c
@@ -12,9 +12,13 @@
 #include <linux/init.h>
 #include <linux/rv.h>
 
-static void rv_printk_reaction(char *msg)
+__printf(1, 2) static void rv_printk_reaction(const char *msg, ...)
 {
-	printk_deferred(msg);
+	va_list args;
+
+	va_start(args, msg);
+	vprintk_deferred(msg, args);
+	va_end(args);
 }
 
 static struct rv_reactor rv_printk = {
diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c
index 9501ca886d83..740603670dd1 100644
--- a/kernel/trace/rv/rv_reactors.c
+++ b/kernel/trace/rv/rv_reactors.c
@@ -490,7 +490,7 @@ void reactor_cleanup_monitor(struct rv_monitor_def *mdef)
 /*
  * Nop reactor register
  */
-static void rv_nop_reaction(char *msg)
+__printf(1, 2) static void rv_nop_reaction(const char *msg, ...)
 {
 }
 
-- 
cgit v1.2.3


From a9769a5b987838f03f3dd57b097794cd4c691098 Mon Sep 17 00:00:00 2001
From: Nam Cao <namcao@linutronix.de>
Date: Wed, 9 Jul 2025 21:21:17 +0200
Subject: rv: Add support for LTL monitors

While attempting to implement DA monitors for some complex specifications,
the deterministic automaton was found to be inappropriate as the
specification language. The automaton is complicated, hard to understand,
and error-prone.

For these cases, linear temporal logic is more suitable as the
specification language.

Add support for linear temporal logic runtime verification monitors.

Cc: John Ogness <john.ogness@linutronix.de>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Gabriele Monaco <gmonaco@redhat.com>
Link: https://lore.kernel.org/d366c1fed60ed4e8f6451f3c15a99755f2740b5f.1752088709.git.namcao@linutronix.de
Signed-off-by: Nam Cao <namcao@linutronix.de>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/rv.h         |  63 +++++++++++++++-
 include/rv/ltl_monitor.h   | 184 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/fork.c              |   5 +-
 kernel/trace/rv/Kconfig    |   7 ++
 kernel/trace/rv/rv_trace.h |  47 ++++++++++++
 5 files changed, 298 insertions(+), 8 deletions(-)
 create mode 100644 include/rv/ltl_monitor.h

(limited to 'include')

diff --git a/include/linux/rv.h b/include/linux/rv.h
index 9428e62eb8e9..1d5579f9b75a 100644
--- a/include/linux/rv.h
+++ b/include/linux/rv.h
@@ -10,6 +10,10 @@
 #define MAX_DA_NAME_LEN	32
 
 #ifdef CONFIG_RV
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/array_size.h>
+
 /*
  * Deterministic automaton per-object variables.
  */
@@ -18,6 +22,59 @@ struct da_monitor {
 	unsigned int	curr_state;
 };
 
+#ifdef CONFIG_RV_LTL_MONITOR
+
+/*
+ * In the future, if the number of atomic propositions or the size of Buchi
+ * automaton is larger, we can switch to dynamic allocation. For now, the code
+ * is simpler this way.
+ */
+#define RV_MAX_LTL_ATOM 32
+#define RV_MAX_BA_STATES 32
+
+/**
+ * struct ltl_monitor - A linear temporal logic runtime verification monitor
+ * @states:	States in the Buchi automaton. As Buchi automaton is a
+ *		non-deterministic state machine, the monitor can be in multiple
+ *		states simultaneously. This is a bitmask of all possible states.
+ *		If this is zero, that means either:
+ *		    - The monitor has not started yet (e.g. because not all
+ *		      atomic propositions are known).
+ *		    - There is no possible state to be in. In other words, a
+ *		      violation of the LTL property is detected.
+ * @atoms:	The values of atomic propositions.
+ * @unknown_atoms: Atomic propositions which are still unknown.
+ */
+struct ltl_monitor {
+	DECLARE_BITMAP(states, RV_MAX_BA_STATES);
+	DECLARE_BITMAP(atoms, RV_MAX_LTL_ATOM);
+	DECLARE_BITMAP(unknown_atoms, RV_MAX_LTL_ATOM);
+};
+
+static inline bool rv_ltl_valid_state(struct ltl_monitor *mon)
+{
+	for (int i = 0; i < ARRAY_SIZE(mon->states); ++i) {
+		if (mon->states[i])
+			return true;
+	}
+	return false;
+}
+
+static inline bool rv_ltl_all_atoms_known(struct ltl_monitor *mon)
+{
+	for (int i = 0; i < ARRAY_SIZE(mon->unknown_atoms); ++i) {
+		if (mon->unknown_atoms[i])
+			return false;
+	}
+	return true;
+}
+
+#else
+
+struct ltl_monitor {};
+
+#endif /* CONFIG_RV_LTL_MONITOR */
+
 /*
  * Per-task RV monitors count. Nowadays fixed in RV_PER_TASK_MONITORS.
  * If we find justification for more monitors, we can think about
@@ -27,11 +84,9 @@ struct da_monitor {
 #define RV_PER_TASK_MONITORS		1
 #define RV_PER_TASK_MONITOR_INIT	(RV_PER_TASK_MONITORS)
 
-/*
- * Futher monitor types are expected, so make this a union.
- */
 union rv_task_monitor {
-	struct da_monitor da_mon;
+	struct da_monitor	da_mon;
+	struct ltl_monitor	ltl_mon;
 };
 
 #ifdef CONFIG_RV_REACTORS
diff --git a/include/rv/ltl_monitor.h b/include/rv/ltl_monitor.h
new file mode 100644
index 000000000000..9a583125b566
--- /dev/null
+++ b/include/rv/ltl_monitor.h
@@ -0,0 +1,184 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/**
+ * This file must be combined with the $(MODEL_NAME).h file generated by
+ * tools/verification/rvgen.
+ */
+
+#include <linux/args.h>
+#include <linux/rv.h>
+#include <linux/stringify.h>
+#include <linux/seq_buf.h>
+#include <rv/instrumentation.h>
+#include <trace/events/task.h>
+#include <trace/events/sched.h>
+
+#ifndef MONITOR_NAME
+#error "Please include $(MODEL_NAME).h generated by rvgen"
+#endif
+
+#ifdef CONFIG_RV_REACTORS
+#define RV_MONITOR_NAME CONCATENATE(rv_, MONITOR_NAME)
+static struct rv_monitor RV_MONITOR_NAME;
+
+static void rv_cond_react(struct task_struct *task)
+{
+	if (!rv_reacting_on() || !RV_MONITOR_NAME.react)
+		return;
+	RV_MONITOR_NAME.react("rv: "__stringify(MONITOR_NAME)": %s[%d]: violation detected\n",
+			      task->comm, task->pid);
+}
+#else
+static void rv_cond_react(struct task_struct *task)
+{
+}
+#endif
+
+static int ltl_monitor_slot = RV_PER_TASK_MONITOR_INIT;
+
+static void ltl_atoms_fetch(struct task_struct *task, struct ltl_monitor *mon);
+static void ltl_atoms_init(struct task_struct *task, struct ltl_monitor *mon, bool task_creation);
+
+static struct ltl_monitor *ltl_get_monitor(struct task_struct *task)
+{
+	return &task->rv[ltl_monitor_slot].ltl_mon;
+}
+
+static void ltl_task_init(struct task_struct *task, bool task_creation)
+{
+	struct ltl_monitor *mon = ltl_get_monitor(task);
+
+	memset(&mon->states, 0, sizeof(mon->states));
+
+	for (int i = 0; i < LTL_NUM_ATOM; ++i)
+		__set_bit(i, mon->unknown_atoms);
+
+	ltl_atoms_init(task, mon, task_creation);
+	ltl_atoms_fetch(task, mon);
+}
+
+static void handle_task_newtask(void *data, struct task_struct *task, unsigned long flags)
+{
+	ltl_task_init(task, true);
+}
+
+static int ltl_monitor_init(void)
+{
+	struct task_struct *g, *p;
+	int ret, cpu;
+
+	ret = rv_get_task_monitor_slot();
+	if (ret < 0)
+		return ret;
+
+	ltl_monitor_slot = ret;
+
+	rv_attach_trace_probe(name, task_newtask, handle_task_newtask);
+
+	read_lock(&tasklist_lock);
+
+	for_each_process_thread(g, p)
+		ltl_task_init(p, false);
+
+	for_each_present_cpu(cpu)
+		ltl_task_init(idle_task(cpu), false);
+
+	read_unlock(&tasklist_lock);
+
+	return 0;
+}
+
+static void ltl_monitor_destroy(void)
+{
+	rv_detach_trace_probe(name, task_newtask, handle_task_newtask);
+
+	rv_put_task_monitor_slot(ltl_monitor_slot);
+	ltl_monitor_slot = RV_PER_TASK_MONITOR_INIT;
+}
+
+static void ltl_illegal_state(struct task_struct *task, struct ltl_monitor *mon)
+{
+	CONCATENATE(trace_error_, MONITOR_NAME)(task);
+	rv_cond_react(task);
+}
+
+static void ltl_attempt_start(struct task_struct *task, struct ltl_monitor *mon)
+{
+	if (rv_ltl_all_atoms_known(mon))
+		ltl_start(task, mon);
+}
+
+static inline void ltl_atom_set(struct ltl_monitor *mon, enum ltl_atom atom, bool value)
+{
+	__clear_bit(atom, mon->unknown_atoms);
+	if (value)
+		__set_bit(atom, mon->atoms);
+	else
+		__clear_bit(atom, mon->atoms);
+}
+
+static void
+ltl_trace_event(struct task_struct *task, struct ltl_monitor *mon, unsigned long *next_state)
+{
+	const char *format_str = "%s";
+	DECLARE_SEQ_BUF(atoms, 64);
+	char states[32], next[32];
+	int i;
+
+	if (!CONCATENATE(CONCATENATE(trace_event_, MONITOR_NAME), _enabled)())
+		return;
+
+	snprintf(states, sizeof(states), "%*pbl", RV_MAX_BA_STATES, mon->states);
+	snprintf(next, sizeof(next), "%*pbl", RV_MAX_BA_STATES, next_state);
+
+	for (i = 0; i < LTL_NUM_ATOM; ++i) {
+		if (test_bit(i, mon->atoms)) {
+			seq_buf_printf(&atoms, format_str, ltl_atom_str(i));
+			format_str = ",%s";
+		}
+	}
+
+	CONCATENATE(trace_event_, MONITOR_NAME)(task, states, atoms.buffer, next);
+}
+
+static void ltl_validate(struct task_struct *task, struct ltl_monitor *mon)
+{
+	DECLARE_BITMAP(next_states, RV_MAX_BA_STATES) = {0};
+
+	if (!rv_ltl_valid_state(mon))
+		return;
+
+	for (unsigned int i = 0; i < RV_NUM_BA_STATES; ++i) {
+		if (test_bit(i, mon->states))
+			ltl_possible_next_states(mon, i, next_states);
+	}
+
+	ltl_trace_event(task, mon, next_states);
+
+	memcpy(mon->states, next_states, sizeof(next_states));
+
+	if (!rv_ltl_valid_state(mon))
+		ltl_illegal_state(task, mon);
+}
+
+static void ltl_atom_update(struct task_struct *task, enum ltl_atom atom, bool value)
+{
+	struct ltl_monitor *mon = ltl_get_monitor(task);
+
+	ltl_atom_set(mon, atom, value);
+	ltl_atoms_fetch(task, mon);
+
+	if (!rv_ltl_valid_state(mon))
+		ltl_attempt_start(task, mon);
+
+	ltl_validate(task, mon);
+}
+
+static void __maybe_unused ltl_atom_pulse(struct task_struct *task, enum ltl_atom atom, bool value)
+{
+	struct ltl_monitor *mon = ltl_get_monitor(task);
+
+	ltl_atom_update(task, atom, value);
+
+	ltl_atom_set(mon, atom, !value);
+	ltl_validate(task, mon);
+}
diff --git a/kernel/fork.c b/kernel/fork.c
index 1ee8eb11f38b..1f06559d17bf 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1886,10 +1886,7 @@ static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk)
 #ifdef CONFIG_RV
 static void rv_task_fork(struct task_struct *p)
 {
-	int i;
-
-	for (i = 0; i < RV_PER_TASK_MONITORS; i++)
-		p->rv[i].da_mon.monitoring = false;
+	memset(&p->rv, 0, sizeof(p->rv));
 }
 #else
 #define rv_task_fork(p) do {} while (0)
diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig
index 6cdffc04b73c..6e157f964991 100644
--- a/kernel/trace/rv/Kconfig
+++ b/kernel/trace/rv/Kconfig
@@ -11,6 +11,13 @@ config DA_MON_EVENTS_ID
 	select RV_MON_EVENTS
 	bool
 
+config LTL_MON_EVENTS_ID
+	select RV_MON_EVENTS
+	bool
+
+config RV_LTL_MONITOR
+	bool
+
 menuconfig RV
 	bool "Runtime Verification"
 	depends on TRACING
diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h
index 99c3801616d4..fd3111ad1d51 100644
--- a/kernel/trace/rv/rv_trace.h
+++ b/kernel/trace/rv/rv_trace.h
@@ -127,6 +127,53 @@ DECLARE_EVENT_CLASS(error_da_monitor_id,
 // Add new monitors based on CONFIG_DA_MON_EVENTS_ID here
 
 #endif /* CONFIG_DA_MON_EVENTS_ID */
+#ifdef CONFIG_LTL_MON_EVENTS_ID
+DECLARE_EVENT_CLASS(event_ltl_monitor_id,
+
+	TP_PROTO(struct task_struct *task, char *states, char *atoms, char *next),
+
+	TP_ARGS(task, states, atoms, next),
+
+	TP_STRUCT__entry(
+		__string(comm, task->comm)
+		__field(pid_t, pid)
+		__string(states, states)
+		__string(atoms, atoms)
+		__string(next, next)
+	),
+
+	TP_fast_assign(
+		__assign_str(comm);
+		__entry->pid = task->pid;
+		__assign_str(states);
+		__assign_str(atoms);
+		__assign_str(next);
+	),
+
+	TP_printk("%s[%d]: (%s) x (%s) -> (%s)", __get_str(comm), __entry->pid,
+		  __get_str(states), __get_str(atoms), __get_str(next))
+);
+
+DECLARE_EVENT_CLASS(error_ltl_monitor_id,
+
+	TP_PROTO(struct task_struct *task),
+
+	TP_ARGS(task),
+
+	TP_STRUCT__entry(
+		__string(comm, task->comm)
+		__field(pid_t, pid)
+	),
+
+	TP_fast_assign(
+		__assign_str(comm);
+		__entry->pid = task->pid;
+	),
+
+	TP_printk("%s[%d]: violation detected", __get_str(comm), __entry->pid)
+);
+// Add new monitors based on CONFIG_LTL_MON_EVENTS_ID here
+#endif /* CONFIG_LTL_MON_EVENTS_ID */
 #endif /* _TRACE_RV_H */
 
 /* This part must be outside protection */
-- 
cgit v1.2.3


From fac5493251a680cb74343895d0e76843624a90d8 Mon Sep 17 00:00:00 2001
From: Nam Cao <namcao@linutronix.de>
Date: Wed, 9 Jul 2025 21:21:23 +0200
Subject: rv: Allow to configure the number of per-task monitor

Now that there are 2 monitors for real-time applications, users may want to
enable both of them simultaneously. Make the number of per-task monitors
configurable. Default it to 2 for now.

Cc: John Ogness <john.ogness@linutronix.de>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://lore.kernel.org/93e83313fc4ba7f6e66f4abe80ca5f5494d658d0.1752088709.git.namcao@linutronix.de
Reviewed-by: Gabriele Monaco <gmonaco@redhat.com>
Signed-off-by: Nam Cao <namcao@linutronix.de>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/rv.h                     | 9 +--------
 include/linux/sched.h                  | 8 +++-----
 kernel/trace/rv/Kconfig                | 9 +++++++++
 kernel/trace/rv/monitors/rtapp/Kconfig | 1 +
 kernel/trace/rv/rv.c                   | 8 ++++----
 5 files changed, 18 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/linux/rv.h b/include/linux/rv.h
index 1d5579f9b75a..97baf58d88b2 100644
--- a/include/linux/rv.h
+++ b/include/linux/rv.h
@@ -75,14 +75,7 @@ struct ltl_monitor {};
 
 #endif /* CONFIG_RV_LTL_MONITOR */
 
-/*
- * Per-task RV monitors count. Nowadays fixed in RV_PER_TASK_MONITORS.
- * If we find justification for more monitors, we can think about
- * adding more or developing a dynamic method. So far, none of
- * these are justified.
- */
-#define RV_PER_TASK_MONITORS		1
-#define RV_PER_TASK_MONITOR_INIT	(RV_PER_TASK_MONITORS)
+#define RV_PER_TASK_MONITOR_INIT	(CONFIG_RV_PER_TASK_MONITORS)
 
 union rv_task_monitor {
 	struct da_monitor	da_mon;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4f78a64beb52..fabd7fe1a07a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1642,12 +1642,10 @@ struct task_struct {
 
 #ifdef CONFIG_RV
 	/*
-	 * Per-task RV monitor. Nowadays fixed in RV_PER_TASK_MONITORS.
-	 * If we find justification for more monitors, we can think
-	 * about adding more or developing a dynamic method. So far,
-	 * none of these are justified.
+	 * Per-task RV monitor, fixed in CONFIG_RV_PER_TASK_MONITORS.
+	 * If memory becomes a concern, we can think about a dynamic method.
 	 */
-	union rv_task_monitor		rv[RV_PER_TASK_MONITORS];
+	union rv_task_monitor		rv[CONFIG_RV_PER_TASK_MONITORS];
 #endif
 
 #ifdef CONFIG_USER_EVENTS
diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig
index 942d57575e67..c11bf7e61ebf 100644
--- a/kernel/trace/rv/Kconfig
+++ b/kernel/trace/rv/Kconfig
@@ -32,6 +32,15 @@ menuconfig RV
 	  For further information, see:
 	    Documentation/trace/rv/runtime-verification.rst
 
+config RV_PER_TASK_MONITORS
+	int "Maximum number of per-task monitor"
+	depends on RV
+	range 1 8
+	default 2
+	help
+	  This option configures the maximum number of per-task RV monitors that can run
+	  simultaneously.
+
 source "kernel/trace/rv/monitors/wip/Kconfig"
 source "kernel/trace/rv/monitors/wwnr/Kconfig"
 source "kernel/trace/rv/monitors/sched/Kconfig"
diff --git a/kernel/trace/rv/monitors/rtapp/Kconfig b/kernel/trace/rv/monitors/rtapp/Kconfig
index b7415c3570bb..1ce9370a9ba8 100644
--- a/kernel/trace/rv/monitors/rtapp/Kconfig
+++ b/kernel/trace/rv/monitors/rtapp/Kconfig
@@ -1,5 +1,6 @@
 config RV_MON_RTAPP
 	depends on RV
+	depends on RV_PER_TASK_MONITORS >= 2
 	bool "rtapp monitor"
 	help
 	  Collection of monitors to check for common problems with real-time
diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
index e25d65fe432a..108429d16ec1 100644
--- a/kernel/trace/rv/rv.c
+++ b/kernel/trace/rv/rv.c
@@ -165,7 +165,7 @@ struct dentry *get_monitors_root(void)
 LIST_HEAD(rv_monitors_list);
 
 static int task_monitor_count;
-static bool task_monitor_slots[RV_PER_TASK_MONITORS];
+static bool task_monitor_slots[CONFIG_RV_PER_TASK_MONITORS];
 
 int rv_get_task_monitor_slot(void)
 {
@@ -173,12 +173,12 @@ int rv_get_task_monitor_slot(void)
 
 	lockdep_assert_held(&rv_interface_lock);
 
-	if (task_monitor_count == RV_PER_TASK_MONITORS)
+	if (task_monitor_count == CONFIG_RV_PER_TASK_MONITORS)
 		return -EBUSY;
 
 	task_monitor_count++;
 
-	for (i = 0; i < RV_PER_TASK_MONITORS; i++) {
+	for (i = 0; i < CONFIG_RV_PER_TASK_MONITORS; i++) {
 		if (task_monitor_slots[i] == false) {
 			task_monitor_slots[i] = true;
 			return i;
@@ -194,7 +194,7 @@ void rv_put_task_monitor_slot(int slot)
 {
 	lockdep_assert_held(&rv_interface_lock);
 
-	if (slot < 0 || slot >= RV_PER_TASK_MONITORS) {
+	if (slot < 0 || slot >= CONFIG_RV_PER_TASK_MONITORS) {
 		WARN_ONCE(1, "RV releasing an invalid slot!: %d\n", slot);
 		return;
 	}
-- 
cgit v1.2.3


From 878e1e94a8aafb2f93a333a1aaed5e1c5f17e339 Mon Sep 17 00:00:00 2001
From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Date: Fri, 20 Jun 2025 17:30:20 -0700
Subject: tracing/sched: Remove obsolete comment on suffixes

Commit ac01fa73f530 ("tracepoint: Have tracepoints created with
DECLARE_TRACE() have _tp suffix") makes it unnecessary to manually add a
suffix.

Remove a now obsolete comment.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Ricardo Neri <ricardo.neri@intel.com>
Link: https://lore.kernel.org/20250620-rneri-tp-comment-fix-v1-1-e0f6495ac33c@linux.intel.com
Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/trace/events/sched.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 4e6b2910cec3..f24c373bcc44 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -829,8 +829,6 @@ TRACE_EVENT(sched_wake_idle_without_ipi,
 /*
  * Following tracepoints are not exported in tracefs and provide hooking
  * mechanisms only for testing and debugging purposes.
- *
- * Postfixed with _tp to make them easily identifiable in the code.
  */
 DECLARE_TRACE(pelt_cfs,
 	TP_PROTO(struct cfs_rq *cfs_rq),
-- 
cgit v1.2.3


From c0ef1446959101d23fdf1b1bdefc6613a83dba03 Mon Sep 17 00:00:00 2001
From: Ivan Vecera <ivecera@redhat.com>
Date: Fri, 4 Jul 2025 20:21:53 +0200
Subject: devlink: Add support for u64 parameters

Only 8-, 16- and 32-bit integers are supported for numeric devlink
parameters. The subsequent patch adds support for a DPLL clock ID,
which is defined as a 64-bit number. Add support for the u64 parameter
type.

Signed-off-by: Ivan Vecera <ivecera@redhat.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://patch.msgid.link/20250704182202.1641943-4-ivecera@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/devlink.h |  2 ++
 net/devlink/param.c   | 10 ++++++++++
 2 files changed, 12 insertions(+)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index d0ce5a7e984c..4a5896b846a4 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -425,6 +425,7 @@ enum devlink_param_type {
 	DEVLINK_PARAM_TYPE_U8 = DEVLINK_VAR_ATTR_TYPE_U8,
 	DEVLINK_PARAM_TYPE_U16 = DEVLINK_VAR_ATTR_TYPE_U16,
 	DEVLINK_PARAM_TYPE_U32 = DEVLINK_VAR_ATTR_TYPE_U32,
+	DEVLINK_PARAM_TYPE_U64 = DEVLINK_VAR_ATTR_TYPE_U64,
 	DEVLINK_PARAM_TYPE_STRING = DEVLINK_VAR_ATTR_TYPE_STRING,
 	DEVLINK_PARAM_TYPE_BOOL = DEVLINK_VAR_ATTR_TYPE_FLAG,
 };
@@ -433,6 +434,7 @@ union devlink_param_value {
 	u8 vu8;
 	u16 vu16;
 	u32 vu32;
+	u64 vu64;
 	char vstr[__DEVLINK_PARAM_MAX_STRING_VALUE];
 	bool vbool;
 };
diff --git a/net/devlink/param.c b/net/devlink/param.c
index 396b8a7f6013..9709b41664aa 100644
--- a/net/devlink/param.c
+++ b/net/devlink/param.c
@@ -200,6 +200,11 @@ devlink_nl_param_value_fill_one(struct sk_buff *msg,
 		if (nla_put_u32(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu32))
 			goto value_nest_cancel;
 		break;
+	case DEVLINK_PARAM_TYPE_U64:
+		if (devlink_nl_put_u64(msg, DEVLINK_ATTR_PARAM_VALUE_DATA,
+				       val.vu64))
+			goto value_nest_cancel;
+		break;
 	case DEVLINK_PARAM_TYPE_STRING:
 		if (nla_put_string(msg, DEVLINK_ATTR_PARAM_VALUE_DATA,
 				   val.vstr))
@@ -434,6 +439,11 @@ devlink_param_value_get_from_info(const struct devlink_param *param,
 			return -EINVAL;
 		value->vu32 = nla_get_u32(param_data);
 		break;
+	case DEVLINK_PARAM_TYPE_U64:
+		if (nla_len(param_data) != sizeof(u64))
+			return -EINVAL;
+		value->vu64 = nla_get_u64(param_data);
+		break;
 	case DEVLINK_PARAM_TYPE_STRING:
 		len = strnlen(nla_data(param_data), nla_len(param_data));
 		if (len == nla_len(param_data) ||
-- 
cgit v1.2.3


From de9ccf2296ac323a571e442b5730ca9cc259fbf0 Mon Sep 17 00:00:00 2001
From: Ivan Vecera <ivecera@redhat.com>
Date: Fri, 4 Jul 2025 20:21:54 +0200
Subject: devlink: Add new "clock_id" generic device param

Add a new device generic parameter to specify clock ID that should
be used by the device for registering DPLL devices and pins.

Signed-off-by: Ivan Vecera <ivecera@redhat.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://patch.msgid.link/20250704182202.1641943-5-ivecera@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/devlink/devlink-params.rst | 3 +++
 include/net/devlink.h                               | 4 ++++
 net/devlink/param.c                                 | 5 +++++
 3 files changed, 12 insertions(+)

(limited to 'include')

diff --git a/Documentation/networking/devlink/devlink-params.rst b/Documentation/networking/devlink/devlink-params.rst
index 3da8f4ef2417..211b58177e12 100644
--- a/Documentation/networking/devlink/devlink-params.rst
+++ b/Documentation/networking/devlink/devlink-params.rst
@@ -140,3 +140,6 @@ own name.
    * - ``enable_phc``
      - Boolean
      - Enable PHC (PTP Hardware Clock) functionality in the device.
+   * - ``clock_id``
+     - u64
+     - Clock ID used by the device for registering DPLL devices and pins.
diff --git a/include/net/devlink.h b/include/net/devlink.h
index 4a5896b846a4..93640a29427c 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -525,6 +525,7 @@ enum devlink_param_generic_id {
 	DEVLINK_PARAM_GENERIC_ID_IO_EQ_SIZE,
 	DEVLINK_PARAM_GENERIC_ID_EVENT_EQ_SIZE,
 	DEVLINK_PARAM_GENERIC_ID_ENABLE_PHC,
+	DEVLINK_PARAM_GENERIC_ID_CLOCK_ID,
 
 	/* add new param generic ids above here*/
 	__DEVLINK_PARAM_GENERIC_ID_MAX,
@@ -586,6 +587,9 @@ enum devlink_param_generic_id {
 #define DEVLINK_PARAM_GENERIC_ENABLE_PHC_NAME "enable_phc"
 #define DEVLINK_PARAM_GENERIC_ENABLE_PHC_TYPE DEVLINK_PARAM_TYPE_BOOL
 
+#define DEVLINK_PARAM_GENERIC_CLOCK_ID_NAME "clock_id"
+#define DEVLINK_PARAM_GENERIC_CLOCK_ID_TYPE DEVLINK_PARAM_TYPE_U64
+
 #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate)	\
 {									\
 	.id = DEVLINK_PARAM_GENERIC_ID_##_id,				\
diff --git a/net/devlink/param.c b/net/devlink/param.c
index 9709b41664aa..41dcc86cfd94 100644
--- a/net/devlink/param.c
+++ b/net/devlink/param.c
@@ -97,6 +97,11 @@ static const struct devlink_param devlink_param_generic[] = {
 		.name = DEVLINK_PARAM_GENERIC_ENABLE_PHC_NAME,
 		.type = DEVLINK_PARAM_GENERIC_ENABLE_PHC_TYPE,
 	},
+	{
+		.id = DEVLINK_PARAM_GENERIC_ID_CLOCK_ID,
+		.name = DEVLINK_PARAM_GENERIC_CLOCK_ID_NAME,
+		.type = DEVLINK_PARAM_GENERIC_CLOCK_ID_TYPE,
+	},
 };
 
 static int devlink_param_generic_verify(const struct devlink_param *param)
-- 
cgit v1.2.3


From 67c0170566b55b1f6ee3567c94ff679104277e2d Mon Sep 17 00:00:00 2001
From: Kamil Horák - 2N <kamilh@axis.com>
Date: Tue, 8 Jul 2025 11:01:37 +0200
Subject: net: phy: MII-Lite PHY interface mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some Broadcom PHYs are capable of operating in simplified MII mode,
without TXER, RXER, CRS and COL signals as defined for the MII.
The MII-Lite mode can be used on most Ethernet controllers with full
MII interface by just leaving the input signals (RXER, CRS, COL)
inactive. The absence of COL signal makes half-duplex link modes
impossible but does not interfere with BroadR-Reach link modes on
Broadcom PHYs, because they are all full-duplex only.

Add MII-Lite interface mode, especially for Broadcom two-wire PHYs.

Signed-off-by: Kamil Horák - 2N <kamilh@axis.com>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/20250708090140.61355-2-kamilh@axis.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/phy.rst | 7 +++++++
 drivers/net/phy/phy-core.c       | 1 +
 drivers/net/phy/phy_caps.c       | 4 ++++
 drivers/net/phy/phylink.c        | 1 +
 include/linux/phy.h              | 4 ++++
 5 files changed, 17 insertions(+)

(limited to 'include')

diff --git a/Documentation/networking/phy.rst b/Documentation/networking/phy.rst
index f64641417c54..7f159043ad5a 100644
--- a/Documentation/networking/phy.rst
+++ b/Documentation/networking/phy.rst
@@ -333,6 +333,13 @@ Some of the interface modes are described below:
     SerDes lane, each port having speeds of 2.5G / 1G / 100M / 10M achieved
     through symbol replication. The PCS expects the standard USXGMII code word.
 
+``PHY_INTERFACE_MODE_MIILITE``
+    Non-standard, simplified MII mode, without TXER, RXER, CRS and COL signals
+    as defined for the MII. The absence of COL signal makes half-duplex link
+    modes impossible but does not interfere with BroadR-Reach link modes on
+    Broadcom (and other two-wire Ethernet) PHYs, because they are full-duplex
+    only.
+
 Pause frames / flow control
 ===========================
 
diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c
index c480bb40fa73..605ca20ae192 100644
--- a/drivers/net/phy/phy-core.c
+++ b/drivers/net/phy/phy-core.c
@@ -115,6 +115,7 @@ int phy_interface_num_ports(phy_interface_t interface)
 		return 0;
 	case PHY_INTERFACE_MODE_INTERNAL:
 	case PHY_INTERFACE_MODE_MII:
+	case PHY_INTERFACE_MODE_MIILITE:
 	case PHY_INTERFACE_MODE_GMII:
 	case PHY_INTERFACE_MODE_TBI:
 	case PHY_INTERFACE_MODE_REVMII:
diff --git a/drivers/net/phy/phy_caps.c b/drivers/net/phy/phy_caps.c
index d11ce1c7e712..2cc9ee97e867 100644
--- a/drivers/net/phy/phy_caps.c
+++ b/drivers/net/phy/phy_caps.c
@@ -316,6 +316,10 @@ unsigned long phy_caps_from_interface(phy_interface_t interface)
 		link_caps |= BIT(LINK_CAPA_100HD) | BIT(LINK_CAPA_100FD);
 		break;
 
+	case PHY_INTERFACE_MODE_MIILITE:
+		link_caps |= BIT(LINK_CAPA_10FD) | BIT(LINK_CAPA_100FD);
+		break;
+
 	case PHY_INTERFACE_MODE_TBI:
 	case PHY_INTERFACE_MODE_MOCA:
 	case PHY_INTERFACE_MODE_RTBI:
diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index f5473510b762..c7f867b361dd 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -237,6 +237,7 @@ static int phylink_interface_max_speed(phy_interface_t interface)
 	case PHY_INTERFACE_MODE_SMII:
 	case PHY_INTERFACE_MODE_REVMII:
 	case PHY_INTERFACE_MODE_MII:
+	case PHY_INTERFACE_MODE_MIILITE:
 		return SPEED_100;
 
 	case PHY_INTERFACE_MODE_TBI:
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 543a94751a6b..4c2b8b6e7187 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -106,6 +106,7 @@ extern const int phy_basic_ports_array[3];
  * @PHY_INTERFACE_MODE_50GBASER: 50GBase-R - with Clause 134 FEC
  * @PHY_INTERFACE_MODE_LAUI: 50 Gigabit Attachment Unit Interface
  * @PHY_INTERFACE_MODE_100GBASEP: 100GBase-P - with Clause 134 FEC
+ * @PHY_INTERFACE_MODE_MIILITE: MII-Lite - MII without RXER TXER CRS COL
  * @PHY_INTERFACE_MODE_MAX: Book keeping
  *
  * Describes the interface between the MAC and PHY.
@@ -150,6 +151,7 @@ typedef enum {
 	PHY_INTERFACE_MODE_50GBASER,
 	PHY_INTERFACE_MODE_LAUI,
 	PHY_INTERFACE_MODE_100GBASEP,
+	PHY_INTERFACE_MODE_MIILITE,
 	PHY_INTERFACE_MODE_MAX,
 } phy_interface_t;
 
@@ -272,6 +274,8 @@ static inline const char *phy_modes(phy_interface_t interface)
 		return "laui";
 	case PHY_INTERFACE_MODE_100GBASEP:
 		return "100gbase-p";
+	case PHY_INTERFACE_MODE_MIILITE:
+		return "mii-lite";
 	default:
 		return "unknown";
 	}
-- 
cgit v1.2.3


From 34bf222824f6c9ac03620ee18aaf93d3b6138db3 Mon Sep 17 00:00:00 2001
From: Kamil Horák - 2N <kamilh@axis.com>
Date: Tue, 8 Jul 2025 11:01:39 +0200
Subject: net: phy: bcm5481x: MII-Lite activation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Broadcom PHYs featuring the BroadR-Reach two-wire link mode are usually
capable of operating in simplified MII mode, without TXER, RXER, CRS and
COL signals as defined for the MII. The absence of COL signal makes
half-duplex link modes impossible, however, the BroadR-Reach modes are
all full-duplex only.
Depending on the IC encapsulation, there exist MII-Lite-only PHYs such
as bcm54811 in MLP. The PHY itself is hardware-strapped to select among
multiple RGMII and MII-Lite modes, but the MII-Lite mode must be also
activated by software.

Add MII-Lite activation for bcm5481x PHYs.

Signed-off-by: Kamil Horák - 2N <kamilh@axis.com>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://patch.msgid.link/20250708090140.61355-4-kamilh@axis.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/broadcom.c | 14 +++++++++++++-
 include/linux/brcmphy.h    |  6 ++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c
index 9b1de54fd483..8547983bd72f 100644
--- a/drivers/net/phy/broadcom.c
+++ b/drivers/net/phy/broadcom.c
@@ -407,7 +407,7 @@ static int bcm5481x_set_brrmode(struct phy_device *phydev, bool on)
 static int bcm54811_config_init(struct phy_device *phydev)
 {
 	struct bcm54xx_phy_priv *priv = phydev->priv;
-	int err, reg;
+	int err, reg, exp_sync_ethernet;
 
 	/* Enable CLK125 MUX on LED4 if ref clock is enabled. */
 	if (!(phydev->dev_flags & PHY_BRCM_RX_REFCLK_UNUSED)) {
@@ -424,6 +424,18 @@ static int bcm54811_config_init(struct phy_device *phydev)
 	if (priv->brr_mode)
 		phydev->autoneg = 0;
 
+	/* Enable MII Lite (No TXER, RXER, CRS, COL) if configured */
+	if (phydev->interface == PHY_INTERFACE_MODE_MIILITE)
+		exp_sync_ethernet = BCM_EXP_SYNC_ETHERNET_MII_LITE;
+	else
+		exp_sync_ethernet = 0;
+
+	err = bcm_phy_modify_exp(phydev, BCM_EXP_SYNC_ETHERNET,
+				 BCM_EXP_SYNC_ETHERNET_MII_LITE,
+				 exp_sync_ethernet);
+	if (err < 0)
+		return err;
+
 	return bcm5481x_set_brrmode(phydev, priv->brr_mode);
 }
 
diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index 028b3e00378e..15c35655f482 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -182,6 +182,12 @@
 #define BCM_LED_MULTICOLOR_ACT		0x9
 #define BCM_LED_MULTICOLOR_PROGRAM	0xa
 
+/*
+ * Broadcom Synchronous Ethernet Controls (expansion register 0x0E)
+ */
+#define BCM_EXP_SYNC_ETHERNET		(MII_BCM54XX_EXP_SEL_ER + 0x0E)
+#define BCM_EXP_SYNC_ETHERNET_MII_LITE	BIT(11)
+
 /*
  * BCM5482: Shadow registers
  * Shadow values go into bits [14:10] of register 0x1c to select a shadow
-- 
cgit v1.2.3


From 82241a83cd15aaaf28200a40ad1a8b480012edaf Mon Sep 17 00:00:00 2001
From: Baolin Wang <baolin.wang@linux.alibaba.com>
Date: Thu, 5 Jun 2025 20:58:29 +0800
Subject: mm: fix the inaccurate memory statistics issue for users

On some large machines with a high number of CPUs running a 64K pagesize
kernel, we found that the 'RES' field displayed by the top command is
always 0 for some processes, which causes a lot of confusion for users.

    PID USER      PR  NI    VIRT    RES    SHR S  %CPU  %MEM     TIME+ COMMAND
 875525 root      20   0   12480      0      0 R   0.3   0.0   0:00.08 top
      1 root      20   0  172800      0      0 S   0.0   0.0   0:04.52 systemd

The main reason is that the batch size of the percpu counter is quite
large on these machines, caching a significant percpu value, since
converting mm's rss stats into percpu_counter by commit f1a7941243c1 ("mm:
convert mm's rss stats into percpu_counter").  Intuitively, the batch
number should be optimized, but on some paths, performance may take
precedence over statistical accuracy.  Therefore, introduce a new
interface that sums the percpu statistical count and displays it to users,
which removes the confusion.  In addition, this change is not expected
to be on a performance-critical path, so the modification should be
acceptable.

In addition, the 'mm->rss_stat' is updated by using add_mm_counter() and
dec/inc_mm_counter(), which are all wrappers around
percpu_counter_add_batch().  In percpu_counter_add_batch(), there is
percpu batch caching to avoid 'fbc->lock' contention.  This patch changes
task_mem() and task_statm() to get the accurate mm counters under the
'fbc->lock', but this should not exacerbate kernel 'mm->rss_stat' lock
contention due to the percpu batch caching of the mm counters.  The
following test also confirms the theoretical analysis.

I ran stress-ng stressing anon page faults in 32 threads on my 32-core
machine, while simultaneously running a script that starts 32
threads to busy-loop pread each stress-ng thread's /proc/pid/status
interface.  From the following data, I did not observe any obvious impact
of this patch on the stress-ng tests.

w/o patch:
stress-ng: info:  [6848]          4,399,219,085,152 CPU Cycles          67.327 B/sec
stress-ng: info:  [6848]          1,616,524,844,832 Instructions          24.740 B/sec (0.367 instr. per cycle)
stress-ng: info:  [6848]          39,529,792 Page Faults Total           0.605 M/sec
stress-ng: info:  [6848]          39,529,792 Page Faults Minor           0.605 M/sec

w/patch:
stress-ng: info:  [2485]          4,462,440,381,856 CPU Cycles          68.382 B/sec
stress-ng: info:  [2485]          1,615,101,503,296 Instructions          24.750 B/sec (0.362 instr. per cycle)
stress-ng: info:  [2485]          39,439,232 Page Faults Total           0.604 M/sec
stress-ng: info:  [2485]          39,439,232 Page Faults Minor           0.604 M/sec

On comparing a very simple app which just allocates & touches some
memory against v6.1 (which doesn't have f1a7941243c1) and latest Linus
tree (4c06e63b9203) I can see that on latest Linus tree the values for
VmRSS, RssAnon and RssFile from /proc/self/status are all zeroes while
they do report values on v6.1 and a Linus tree with this patch.

Link: https://lkml.kernel.org/r/f4586b17f66f97c174f7fd1f8647374fdb53de1c.1749119050.git.baolin.wang@linux.alibaba.com
Fixes: f1a7941243c1 ("mm: convert mm's rss stats into percpu_counter")
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Aboorva Devarajan <aboorvad@linux.ibm.com>
Tested-by: Aboorva Devarajan <aboorvad@linux.ibm.com>
Tested-by: Donet Tom <donettom@linux.ibm.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: SeongJae Park <sj@kernel.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/task_mmu.c | 14 +++++++-------
 include/linux/mm.h |  5 +++++
 2 files changed, 12 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 4be91eb6ea5c..751479eb128f 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -36,9 +36,9 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 	unsigned long text, lib, swap, anon, file, shmem;
 	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
 
-	anon = get_mm_counter(mm, MM_ANONPAGES);
-	file = get_mm_counter(mm, MM_FILEPAGES);
-	shmem = get_mm_counter(mm, MM_SHMEMPAGES);
+	anon = get_mm_counter_sum(mm, MM_ANONPAGES);
+	file = get_mm_counter_sum(mm, MM_FILEPAGES);
+	shmem = get_mm_counter_sum(mm, MM_SHMEMPAGES);
 
 	/*
 	 * Note: to minimize their overhead, mm maintains hiwater_vm and
@@ -59,7 +59,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 	text = min(text, mm->exec_vm << PAGE_SHIFT);
 	lib = (mm->exec_vm << PAGE_SHIFT) - text;
 
-	swap = get_mm_counter(mm, MM_SWAPENTS);
+	swap = get_mm_counter_sum(mm, MM_SWAPENTS);
 	SEQ_PUT_DEC("VmPeak:\t", hiwater_vm);
 	SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm);
 	SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm);
@@ -92,12 +92,12 @@ unsigned long task_statm(struct mm_struct *mm,
 			 unsigned long *shared, unsigned long *text,
 			 unsigned long *data, unsigned long *resident)
 {
-	*shared = get_mm_counter(mm, MM_FILEPAGES) +
-			get_mm_counter(mm, MM_SHMEMPAGES);
+	*shared = get_mm_counter_sum(mm, MM_FILEPAGES) +
+			get_mm_counter_sum(mm, MM_SHMEMPAGES);
 	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
 								>> PAGE_SHIFT;
 	*data = mm->data_vm + mm->stack_vm;
-	*resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
+	*resident = *shared + get_mm_counter_sum(mm, MM_ANONPAGES);
 	return mm->total_vm;
 }
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0ef2ba0c667a..fa538feaa8d9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2568,6 +2568,11 @@ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
 	return percpu_counter_read_positive(&mm->rss_stat[member]);
 }
 
+static inline unsigned long get_mm_counter_sum(struct mm_struct *mm, int member)
+{
+	return percpu_counter_sum_positive(&mm->rss_stat[member]);
+}
+
 void mm_trace_rss_stat(struct mm_struct *mm, int member);
 
 static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
-- 
cgit v1.2.3


From db6cc3f4ac2e6cdc898fc9cbc8b32ae1bf56bdad Mon Sep 17 00:00:00 2001
From: Chen Yu <yu.c.chen@intel.com>
Date: Fri, 4 Jul 2025 21:56:20 +0800
Subject: Revert "sched/numa: add statistics of numa balance task"

This reverts commit ad6b26b6a0a79166b53209df2ca1cf8636296382.

This commit introduces per-memcg/task NUMA balance statistics, but
unfortunately it introduced a NULL pointer exception due to the following
race condition: After a swap task candidate was chosen, its mm_struct
pointer was set to NULL due to task exit.  Later, when performing the
actual task swapping, the p->mm caused the problem.

CPU0                                   CPU1
:
...
task_numa_migrate
     task_numa_find_cpu
      task_numa_compare
        # a normal task p is chosen
        env->best_task = p

                                          # p exit:
                                          exit_signals(p);
                                             p->flags |= PF_EXITING
                                          exit_mm
                                             p->mm = NULL;

      migrate_swap_stop
        __migrate_swap_task((arg->src_task, arg->dst_cpu)
         count_memcg_event_mm(p->mm, NUMA_TASK_SWAP)# p->mm is NULL

task_lock() should be held and the PF_EXITING flag needs to be checked to
prevent this from happening.  After discussion, the conclusion was that
adding a lock is not worthwhile for some statistics calculations.  Revert
the change and rely on the tracepoint for this purpose.

Link: https://lkml.kernel.org/r/20250704135620.685752-1-yu.c.chen@intel.com
Link: https://lkml.kernel.org/r/20250708064917.BBD13C4CEED@smtp.kernel.org
Fixes: ad6b26b6a0a7 ("sched/numa: add statistics of numa balance task")
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Reported-by: Jirka Hladky <jhladky@redhat.com>
Closes: https://lore.kernel.org/all/CAE4VaGBLJxpd=NeRJXpSCuw=REhC5LWJpC29kDy-Zh2ZDyzQZA@mail.gmail.com/
Reported-by: Srikanth Aithal <Srikanth.Aithal@amd.com>
Reported-by: Suneeth D <Suneeth.D@amd.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Hladky <jhladky@redhat.com>
Cc: Libo Chen <libo.chen@oracle.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/cgroup-v2.rst | 6 ------
 include/linux/sched.h                   | 4 ----
 include/linux/vm_event_item.h           | 2 --
 kernel/sched/core.c                     | 9 ++-------
 kernel/sched/debug.c                    | 4 ----
 mm/memcontrol.c                         | 2 --
 mm/vmstat.c                             | 2 --
 7 files changed, 2 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 0cc35a14afbe..bd98ea3175ec 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1732,12 +1732,6 @@ The following nested keys are defined.
 	  numa_hint_faults (npn)
 		Number of NUMA hinting faults.
 
-	  numa_task_migrated (npn)
-		Number of task migration by NUMA balancing.
-
-	  numa_task_swapped (npn)
-		Number of task swap by NUMA balancing.
-
 	  pgdemote_kswapd
 		Number of pages demoted by kswapd.
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4f78a64beb52..aa9c5be7a632 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -548,10 +548,6 @@ struct sched_statistics {
 	u64				nr_failed_migrations_running;
 	u64				nr_failed_migrations_hot;
 	u64				nr_forced_migrations;
-#ifdef CONFIG_NUMA_BALANCING
-	u64				numa_task_migrated;
-	u64				numa_task_swapped;
-#endif
 
 	u64				nr_wakeups;
 	u64				nr_wakeups_sync;
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 91a3ce9a2687..9e15a088ba38 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -66,8 +66,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		NUMA_HINT_FAULTS,
 		NUMA_HINT_FAULTS_LOCAL,
 		NUMA_PAGE_MIGRATE,
-		NUMA_TASK_MIGRATE,
-		NUMA_TASK_SWAP,
 #endif
 #ifdef CONFIG_MIGRATION
 		PGMIGRATE_SUCCESS, PGMIGRATE_FAIL,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ec68fc686bd7..81c6df746df1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3362,10 +3362,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 #ifdef CONFIG_NUMA_BALANCING
 static void __migrate_swap_task(struct task_struct *p, int cpu)
 {
-	__schedstat_inc(p->stats.numa_task_swapped);
-	count_vm_numa_event(NUMA_TASK_SWAP);
-	count_memcg_event_mm(p->mm, NUMA_TASK_SWAP);
-
 	if (task_on_rq_queued(p)) {
 		struct rq *src_rq, *dst_rq;
 		struct rq_flags srf, drf;
@@ -7939,9 +7935,8 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
 	if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
 		return -EINVAL;
 
-	__schedstat_inc(p->stats.numa_task_migrated);
-	count_vm_numa_event(NUMA_TASK_MIGRATE);
-	count_memcg_event_mm(p->mm, NUMA_TASK_MIGRATE);
+	/* TODO: This is not properly updating schedstats */
+
 	trace_sched_move_numa(p, curr_cpu, target_cpu);
 	return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
 }
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 9d71baf08075..557246880a7e 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1210,10 +1210,6 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
 		P_SCHEDSTAT(nr_failed_migrations_running);
 		P_SCHEDSTAT(nr_failed_migrations_hot);
 		P_SCHEDSTAT(nr_forced_migrations);
-#ifdef CONFIG_NUMA_BALANCING
-		P_SCHEDSTAT(numa_task_migrated);
-		P_SCHEDSTAT(numa_task_swapped);
-#endif
 		P_SCHEDSTAT(nr_wakeups);
 		P_SCHEDSTAT(nr_wakeups_sync);
 		P_SCHEDSTAT(nr_wakeups_migrate);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 902da8a9c643..70fdeda1120b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -474,8 +474,6 @@ static const unsigned int memcg_vm_event_stat[] = {
 	NUMA_PAGE_MIGRATE,
 	NUMA_PTE_UPDATES,
 	NUMA_HINT_FAULTS,
-	NUMA_TASK_MIGRATE,
-	NUMA_TASK_SWAP,
 #endif
 };
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 429ae5339bfe..a78d70ddeacd 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1346,8 +1346,6 @@ const char * const vmstat_text[] = {
 	"numa_hint_faults",
 	"numa_hint_faults_local",
 	"numa_pages_migrated",
-	"numa_task_migrated",
-	"numa_task_swapped",
 #endif
 #ifdef CONFIG_MIGRATION
 	"pgmigrate_success",
-- 
cgit v1.2.3


From de195c67bfcbc770bb6327bbfd3010f1c371216a Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Thu, 29 May 2025 18:15:45 +0100
Subject: mm: ksm: have KSM VMA checks not require a VMA pointer

Patch series "mm: ksm: prevent KSM from breaking merging of new VMAs", v3.

When KSM-by-default is established using prctl(PR_SET_MEMORY_MERGE), this
defaults all newly mapped VMAs to having VM_MERGEABLE set, and thus makes
them available to KSM for samepage merging.  It also sets VM_MERGEABLE in
all existing VMAs.

However this causes an issue upon mapping of new VMAs - the initial flags
will never have VM_MERGEABLE set when attempting a merge with adjacent
VMAs (this is set later in the mmap() logic), and adjacent VMAs will
ALWAYS have VM_MERGEABLE set.

This renders all newly mapped VMAs unmergeable.

To avoid this, this series performs the check for PR_SET_MEMORY_MERGE far
earlier in the mmap() logic, prior to the merge being attempted.

However we run into complexity with the deprecated .mmap() callback - if
a driver hooks this, it might change flags which adjust KSM merge
eligibility.

We have to worry about this because, while KSM is only applicable to
private mappings, this includes both anonymous and MAP_PRIVATE-mapped
file-backed mappings.

This isn't a problem for brk(), where the VMA must be anonymous.  However
in mmap() we must be conservative - if the VMA is anonymous then we can
always proceed, however if not, we permit only shmem mappings (whose .mmap
hook does not affect KSM eligibility) and drivers which implement
.mmap_prepare() (invoked prior to the KSM eligibility check).

If we can't be sure of the driver changing things, then we maintain the
same behaviour of performing the KSM check later in the mmap() logic (and
thus losing new VMA mergeability).

A great many use-cases for this logic will use anonymous mappings at any
rate, so this change should already cover the majority of actual KSM
use-cases.


This patch (of 4):

In subsequent commits we are going to determine KSM eligibility prior to a
VMA being constructed, at which point we will of course not yet have
access to a VMA pointer.

It is trivial to boil down the check logic to be parameterised on
mm_struct, file and VMA flags, so do so.

As a part of this change, additionally expose and use file_is_dax() to
determine whether a file is being mapped under a DAX inode.

Link: https://lkml.kernel.org/r/cover.1748537921.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/36ad13eb50cdbd8aac6dcfba22c65d5031667295.1748537921.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Xu Xin <xu.xin16@zte.com.cn>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Stefan Roesch <shr@devkernel.io>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/fs.h |  7 ++++++-
 mm/ksm.c           | 32 ++++++++++++++++++++------------
 2 files changed, 26 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 040c0036320f..62634af97da6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3726,9 +3726,14 @@ void setattr_copy(struct mnt_idmap *, struct inode *inode,
 
 extern int file_update_time(struct file *file);
 
+static inline bool file_is_dax(const struct file *file)
+{
+	return file && IS_DAX(file->f_mapping->host);
+}
+
 static inline bool vma_is_dax(const struct vm_area_struct *vma)
 {
-	return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host);
+	return file_is_dax(vma->vm_file);
 }
 
 static inline bool vma_is_fsdax(struct vm_area_struct *vma)
diff --git a/mm/ksm.c b/mm/ksm.c
index 8583fb91ef13..08d486f188ff 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -677,28 +677,33 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_v
 	return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
 }
 
-static bool vma_ksm_compatible(struct vm_area_struct *vma)
+static bool ksm_compatible(const struct file *file, vm_flags_t vm_flags)
 {
-	if (vma->vm_flags & (VM_SHARED  | VM_MAYSHARE   | VM_PFNMAP  |
-			     VM_IO      | VM_DONTEXPAND | VM_HUGETLB |
-			     VM_MIXEDMAP| VM_DROPPABLE))
+	if (vm_flags & (VM_SHARED   | VM_MAYSHARE   | VM_PFNMAP  |
+			VM_IO       | VM_DONTEXPAND | VM_HUGETLB |
+			VM_MIXEDMAP | VM_DROPPABLE))
 		return false;		/* just ignore the advice */
 
-	if (vma_is_dax(vma))
+	if (file_is_dax(file))
 		return false;
 
 #ifdef VM_SAO
-	if (vma->vm_flags & VM_SAO)
+	if (vm_flags & VM_SAO)
 		return false;
 #endif
 #ifdef VM_SPARC_ADI
-	if (vma->vm_flags & VM_SPARC_ADI)
+	if (vm_flags & VM_SPARC_ADI)
 		return false;
 #endif
 
 	return true;
 }
 
+static bool vma_ksm_compatible(struct vm_area_struct *vma)
+{
+	return ksm_compatible(vma->vm_file, vma->vm_flags);
+}
+
 static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
 		unsigned long addr)
 {
@@ -2696,14 +2701,17 @@ static int ksm_scan_thread(void *nothing)
 	return 0;
 }
 
-static void __ksm_add_vma(struct vm_area_struct *vma)
+static bool __ksm_should_add_vma(const struct file *file, vm_flags_t vm_flags)
 {
-	unsigned long vm_flags = vma->vm_flags;
-
 	if (vm_flags & VM_MERGEABLE)
-		return;
+		return false;
+
+	return ksm_compatible(file, vm_flags);
+}
 
-	if (vma_ksm_compatible(vma))
+static void __ksm_add_vma(struct vm_area_struct *vma)
+{
+	if (__ksm_should_add_vma(vma->vm_file, vma->vm_flags))
 		vm_flags_set(vma, VM_MERGEABLE);
 }
 
-- 
cgit v1.2.3


From cf7e7a3503df0b71afd68ee84e9a09d4514cc2dd Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Thu, 29 May 2025 18:15:47 +0100
Subject: mm: prevent KSM from breaking VMA merging for new VMAs

If a user wishes to enable KSM mergeability for an entire process and all
fork/exec'd processes that come after it, they use the prctl()
PR_SET_MEMORY_MERGE operation.

This defaults all newly mapped VMAs to have the VM_MERGEABLE VMA flag set
(in order to indicate they are KSM mergeable), as well as setting this
flag for all existing VMAs and propagating this across fork/exec.

However it also breaks VMA merging for new VMAs, both in the process and
all forked (and fork/exec'd) child processes.

This is because when a new mapping is proposed, the flags specified will
never have VM_MERGEABLE set.  However all adjacent VMAs will already have
VM_MERGEABLE set, rendering VMAs unmergeable by default.

To work around this, we try to set the VM_MERGEABLE flag prior to
attempting a merge.  In the case of brk() this can always be done.

However on mmap() things are more complicated - while KSM is not supported
for MAP_SHARED file-backed mappings, it is supported for MAP_PRIVATE
file-backed mappings.

These mappings may have deprecated .mmap() callbacks specified which
could, in theory, adjust flags and thus KSM eligibility.

So we check to determine whether this is possible.  If not, we set
VM_MERGEABLE prior to the merge attempt on mmap(), otherwise we retain the
previous behaviour.

This fixes VMA merging for all new anonymous mappings, which covers the
majority of real-world cases, so we should see a significant improvement
in VMA mergeability.

For MAP_PRIVATE file-backed mappings, those which implement the
.mmap_prepare() hook and shmem are both known to be safe, so we allow
these, disallowing all other cases.

Also add stubs for newly introduced function invocations to VMA userland
testing.

[lorenzo.stoakes@oracle.com: correctly invoke late KSM check after mmap hook]
  Link: https://lkml.kernel.org/r/5861f8f6-cf5a-4d82-a062-139fb3f9cddb@lucifer.local
Link: https://lkml.kernel.org/r/3ba660af716d87a18ca5b4e635f2101edeb56340.1748537921.git.lorenzo.stoakes@oracle.com
Fixes: d7597f59d1d3 ("mm: add new api to enable ksm per process") # please no backport!
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Xu Xin <xu.xin16@zte.com.cn>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Stefan Roesch <shr@devkernel.io>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/ksm.h              |  8 ++++---
 mm/ksm.c                         | 18 +++++++++-----
 mm/vma.c                         | 51 ++++++++++++++++++++++++++++++++++++++--
 tools/testing/vma/vma_internal.h | 11 +++++++++
 4 files changed, 77 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index d73095b5cd96..51787f0b0208 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -17,8 +17,8 @@
 #ifdef CONFIG_KSM
 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 		unsigned long end, int advice, unsigned long *vm_flags);
-
-void ksm_add_vma(struct vm_area_struct *vma);
+vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file,
+			 vm_flags_t vm_flags);
 int ksm_enable_merge_any(struct mm_struct *mm);
 int ksm_disable_merge_any(struct mm_struct *mm);
 int ksm_disable(struct mm_struct *mm);
@@ -97,8 +97,10 @@ bool ksm_process_mergeable(struct mm_struct *mm);
 
 #else  /* !CONFIG_KSM */
 
-static inline void ksm_add_vma(struct vm_area_struct *vma)
+static inline vm_flags_t ksm_vma_flags(const struct mm_struct *mm,
+		const struct file *file, vm_flags_t vm_flags)
 {
+	return vm_flags;
 }
 
 static inline int ksm_disable(struct mm_struct *mm)
diff --git a/mm/ksm.c b/mm/ksm.c
index d0c763abd499..18b3690bb69a 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2731,16 +2731,22 @@ static int __ksm_del_vma(struct vm_area_struct *vma)
 	return 0;
 }
 /**
- * ksm_add_vma - Mark vma as mergeable if compatible
+ * ksm_vma_flags - Update VMA flags to mark as mergeable if compatible
  *
- * @vma:  Pointer to vma
+ * @mm:       Proposed VMA's mm_struct
+ * @file:     Proposed VMA's file-backed mapping, if any.
+ * @vm_flags: Proposed VMA's flags.
+ *
+ * Returns: @vm_flags possibly updated to mark mergeable.
  */
-void ksm_add_vma(struct vm_area_struct *vma)
+vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file,
+			 vm_flags_t vm_flags)
 {
-	struct mm_struct *mm = vma->vm_mm;
+	if (test_bit(MMF_VM_MERGE_ANY, &mm->flags) &&
+	    __ksm_should_add_vma(file, vm_flags))
+		vm_flags |= VM_MERGEABLE;
 
-	if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
-		__ksm_add_vma(vma);
+	return vm_flags;
 }
 
 static void ksm_add_vmas(struct mm_struct *mm)
diff --git a/mm/vma.c b/mm/vma.c
index fef67a66a095..079540ebfb72 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -32,6 +32,9 @@ struct mmap_state {
 	struct vma_munmap_struct vms;
 	struct ma_state mas_detach;
 	struct maple_tree mt_detach;
+
+	/* Determine if we can check KSM flags early in mmap() logic. */
+	bool check_ksm_early;
 };
 
 #define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, flags_, file_) \
@@ -2320,6 +2323,11 @@ static void vms_abort_munmap_vmas(struct vma_munmap_struct *vms,
 	vms_complete_munmap_vmas(vms, mas_detach);
 }
 
+static void update_ksm_flags(struct mmap_state *map)
+{
+	map->flags = ksm_vma_flags(map->mm, map->file, map->flags);
+}
+
 /*
  * __mmap_prepare() - Prepare to gather any overlapping VMAs that need to be
  * unmapped once the map operation is completed, check limits, account mapping
@@ -2424,6 +2432,7 @@ static int __mmap_new_file_vma(struct mmap_state *map,
 			!(map->flags & VM_MAYWRITE) &&
 			(vma->vm_flags & VM_MAYWRITE));
 
+	map->file = vma->vm_file;
 	map->flags = vma->vm_flags;
 
 	return 0;
@@ -2473,6 +2482,11 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
 	if (error)
 		goto free_iter_vma;
 
+	if (!map->check_ksm_early) {
+		update_ksm_flags(map);
+		vm_flags_init(vma, map->flags);
+	}
+
 #ifdef CONFIG_SPARC64
 	/* TODO: Fix SPARC ADI! */
 	WARN_ON_ONCE(!arch_validate_flags(map->flags));
@@ -2490,7 +2504,6 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
 	 */
 	if (!vma_is_anonymous(vma))
 		khugepaged_enter_vma(vma, map->flags);
-	ksm_add_vma(vma);
 	*vmap = vma;
 	return 0;
 
@@ -2593,6 +2606,35 @@ static void set_vma_user_defined_fields(struct vm_area_struct *vma,
 	vma->vm_private_data = map->vm_private_data;
 }
 
+/*
+ * Are we guaranteed no driver can change state such as to preclude KSM merging?
+ * If so, let's set the KSM mergeable flag early so we don't break VMA merging.
+ */
+static bool can_set_ksm_flags_early(struct mmap_state *map)
+{
+	struct file *file = map->file;
+
+	/* Anonymous mappings have no driver which can change them. */
+	if (!file)
+		return true;
+
+	/*
+	 * If .mmap_prepare() is specified, then the driver will have already
+	 * manipulated state prior to updating KSM flags. So no need to worry
+	 * about mmap callbacks modifying VMA flags after the KSM flag has been
+	 * updated here, which could otherwise affect KSM eligibility.
+	 */
+	if (file->f_op->mmap_prepare)
+		return true;
+
+	/* shmem is safe. */
+	if (shmem_file(file))
+		return true;
+
+	/* Any other .mmap callback is not safe. */
+	return false;
+}
+
 static unsigned long __mmap_region(struct file *file, unsigned long addr,
 		unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
 		struct list_head *uf)
@@ -2604,12 +2646,17 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
 	VMA_ITERATOR(vmi, mm, addr);
 	MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file);
 
+	map.check_ksm_early = can_set_ksm_flags_early(&map);
+
 	error = __mmap_prepare(&map, uf);
 	if (!error && have_mmap_prepare)
 		error = call_mmap_prepare(&map);
 	if (error)
 		goto abort_munmap;
 
+	if (map.check_ksm_early)
+		update_ksm_flags(&map);
+
 	/* Attempt to merge with adjacent VMAs... */
 	if (map.prev || map.next) {
 		VMG_MMAP_STATE(vmg, &map, /* vma = */ NULL);
@@ -2721,6 +2768,7 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
 	 * Note: This happens *after* clearing old mappings in some code paths.
 	 */
 	flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
+	flags = ksm_vma_flags(mm, NULL, flags);
 	if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
 		return -ENOMEM;
 
@@ -2764,7 +2812,6 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
 
 	mm->map_count++;
 	validate_mm(mm);
-	ksm_add_vma(vma);
 out:
 	perf_event_mmap(vma);
 	mm->total_vm += len >> PAGE_SHIFT;
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 14718ca23a05..0f013784da89 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -1484,4 +1484,15 @@ static inline void vma_set_file(struct vm_area_struct *vma, struct file *file)
 	fput(file);
 }
 
+static inline bool shmem_file(struct file *)
+{
+	return false;
+}
+
+static inline vm_flags_t ksm_vma_flags(const struct mm_struct *, const struct file *,
+			 vm_flags_t vm_flags)
+{
+	return vm_flags;
+}
+
 #endif	/* __MM_VMA_INTERNAL_H */
-- 
cgit v1.2.3


From e399a07a8a527dd023ca720a338fc508bedd364f Mon Sep 17 00:00:00 2001
From: Caleb Sander Mateos <csander@purestorage.com>
Date: Fri, 11 Apr 2025 10:17:45 -0600
Subject: mm: remove unused mmap tracepoints

The vma_mas_szero and vma_store tracepoints are unused since commit
fbcc3104b843 ("mmap: convert __vma_adjust() to use vma iterator").  Remove
them so they are no longer listed as available tracepoints.

Link: https://lkml.kernel.org/r/20250411161746.1043239-1-csander@purestorage.com
Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Reported-by: Eric Mueller <emueller@purestorage.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Jann Horn <jannh@google.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/trace/events/mmap.h | 52 ---------------------------------------------
 1 file changed, 52 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/mmap.h b/include/trace/events/mmap.h
index f8d61485de16..ee2843a5daef 100644
--- a/include/trace/events/mmap.h
+++ b/include/trace/events/mmap.h
@@ -43,58 +43,6 @@ TRACE_EVENT(vm_unmapped_area,
 		__entry->align_offset)
 );
 
-TRACE_EVENT(vma_mas_szero,
-	TP_PROTO(struct maple_tree *mt, unsigned long start,
-		 unsigned long end),
-
-	TP_ARGS(mt, start, end),
-
-	TP_STRUCT__entry(
-			__field(struct maple_tree *, mt)
-			__field(unsigned long, start)
-			__field(unsigned long, end)
-	),
-
-	TP_fast_assign(
-			__entry->mt		= mt;
-			__entry->start		= start;
-			__entry->end		= end;
-	),
-
-	TP_printk("mt_mod %p, (NULL), SNULL, %lu, %lu,",
-		  __entry->mt,
-		  (unsigned long) __entry->start,
-		  (unsigned long) __entry->end
-	)
-);
-
-TRACE_EVENT(vma_store,
-	TP_PROTO(struct maple_tree *mt, struct vm_area_struct *vma),
-
-	TP_ARGS(mt, vma),
-
-	TP_STRUCT__entry(
-			__field(struct maple_tree *, mt)
-			__field(struct vm_area_struct *, vma)
-			__field(unsigned long, vm_start)
-			__field(unsigned long, vm_end)
-	),
-
-	TP_fast_assign(
-			__entry->mt		= mt;
-			__entry->vma		= vma;
-			__entry->vm_start	= vma->vm_start;
-			__entry->vm_end		= vma->vm_end - 1;
-	),
-
-	TP_printk("mt_mod %p, (%p), STORE, %lu, %lu,",
-		  __entry->mt, __entry->vma,
-		  (unsigned long) __entry->vm_start,
-		  (unsigned long) __entry->vm_end
-	)
-);
-
-
 TRACE_EVENT(exit_mmap,
 	TP_PROTO(struct mm_struct *mm),
 
-- 
cgit v1.2.3


From 792b429db7e0217faf7bce9fe46e7708135cf83c Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 4 Jun 2025 16:05:44 +0200
Subject: mm/gup: remove (VM_)BUG_ONs

Especially once we hit one of the assertions in
sanity_check_pinned_pages(), observing follow-up assertions failing in
other code can give good clues about what went wrong, so use
VM_WARN_ON_ONCE instead.

While at it, let's just convert all VM_BUG_ON to VM_WARN_ON_ONCE as well.
Add one comment for the pfn_valid() check.

We have to introduce VM_WARN_ON_ONCE_VMA() to make that fly.

Drop the BUG_ON after mmap_read_lock_killable(); if that ever returns
something > 0 we're in bigger trouble.  Convert the other BUG_ONs into
VM_WARN_ON_ONCE as well: they are in a similar "should never happen"
domain, but are more reasonable to check for during early testing.

[david@redhat.com: use the _FOLIO variant where possible, per Lorenzo]
  Link: https://lkml.kernel.org/r/844bd929-a551-48e3-a12e-285cd65ba580@redhat.com
Link: https://lkml.kernel.org/r/20250604140544.688711-1-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: SeongJae Park <sj@kernel.org>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Peter Xu <peterx@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmdebug.h | 12 ++++++++++++
 mm/gup.c                | 41 +++++++++++++++++++----------------------
 2 files changed, 31 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h
index a0a3894900ed..14a45979cccc 100644
--- a/include/linux/mmdebug.h
+++ b/include/linux/mmdebug.h
@@ -89,6 +89,17 @@ void vma_iter_dump_tree(const struct vma_iterator *vmi);
 	}								\
 	unlikely(__ret_warn_once);					\
 })
+#define VM_WARN_ON_ONCE_VMA(cond, vma)		({			\
+	static bool __section(".data..once") __warned;			\
+	int __ret_warn_once = !!(cond);					\
+									\
+	if (unlikely(__ret_warn_once && !__warned)) {			\
+		dump_vma(vma);						\
+		__warned = true;					\
+		WARN_ON(1);						\
+	}								\
+	unlikely(__ret_warn_once);					\
+})
 #define VM_WARN_ON_VMG(cond, vmg)		({			\
 	int __ret_warn = !!(cond);					\
 									\
@@ -115,6 +126,7 @@ void vma_iter_dump_tree(const struct vma_iterator *vmi);
 #define VM_WARN_ON_FOLIO(cond, folio)  BUILD_BUG_ON_INVALID(cond)
 #define VM_WARN_ON_ONCE_FOLIO(cond, folio)  BUILD_BUG_ON_INVALID(cond)
 #define VM_WARN_ON_ONCE_MM(cond, mm)  BUILD_BUG_ON_INVALID(cond)
+#define VM_WARN_ON_ONCE_VMA(cond, vma)  BUILD_BUG_ON_INVALID(cond)
 #define VM_WARN_ON_VMG(cond, vmg)  BUILD_BUG_ON_INVALID(cond)
 #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond)
 #define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond)
diff --git a/mm/gup.c b/mm/gup.c
index 3c39cbbeebef..7f2644a433a0 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -64,11 +64,11 @@ static inline void sanity_check_pinned_pages(struct page **pages,
 		    !folio_test_anon(folio))
 			continue;
 		if (!folio_test_large(folio) || folio_test_hugetlb(folio))
-			VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page), page);
+			VM_WARN_ON_ONCE_FOLIO(!PageAnonExclusive(&folio->page), folio);
 		else
 			/* Either a PTE-mapped or a PMD-mapped THP. */
-			VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page) &&
-				       !PageAnonExclusive(page), page);
+			VM_WARN_ON_ONCE_PAGE(!PageAnonExclusive(&folio->page) &&
+					     !PageAnonExclusive(page), page);
 	}
 }
 
@@ -760,8 +760,8 @@ static struct page *follow_huge_pmd(struct vm_area_struct *vma,
 	if (!pmd_write(pmdval) && gup_must_unshare(vma, flags, page))
 		return ERR_PTR(-EMLINK);
 
-	VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
-			!PageAnonExclusive(page), page);
+	VM_WARN_ON_ONCE_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
+			     !PageAnonExclusive(page), page);
 
 	ret = try_grab_folio(page_folio(page), 1, flags);
 	if (ret)
@@ -899,8 +899,8 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
 		goto out;
 	}
 
-	VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
-		       !PageAnonExclusive(page), page);
+	VM_WARN_ON_ONCE_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
+			     !PageAnonExclusive(page), page);
 
 	/* try_grab_folio() does nothing unless FOLL_GET or FOLL_PIN is set. */
 	ret = try_grab_folio(folio, 1, flags);
@@ -1180,7 +1180,7 @@ static int faultin_page(struct vm_area_struct *vma,
 	if (unshare) {
 		fault_flags |= FAULT_FLAG_UNSHARE;
 		/* FAULT_FLAG_WRITE and FAULT_FLAG_UNSHARE are incompatible */
-		VM_BUG_ON(fault_flags & FAULT_FLAG_WRITE);
+		VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_WRITE);
 	}
 
 	ret = handle_mm_fault(vma, address, fault_flags, NULL);
@@ -1760,10 +1760,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
 		}
 
 		/* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */
-		if (!*locked) {
-			BUG_ON(ret < 0);
-			BUG_ON(ret >= nr_pages);
-		}
+		VM_WARN_ON_ONCE(!*locked && (ret < 0 || ret >= nr_pages));
 
 		if (ret > 0) {
 			nr_pages -= ret;
@@ -1808,7 +1805,6 @@ retry:
 
 		ret = mmap_read_lock_killable(mm);
 		if (ret) {
-			BUG_ON(ret > 0);
 			if (!pages_done)
 				pages_done = ret;
 			break;
@@ -1819,11 +1815,11 @@ retry:
 				       pages, locked);
 		if (!*locked) {
 			/* Continue to retry until we succeeded */
-			BUG_ON(ret != 0);
+			VM_WARN_ON_ONCE(ret != 0);
 			goto retry;
 		}
 		if (ret != 1) {
-			BUG_ON(ret > 1);
+			VM_WARN_ON_ONCE(ret > 1);
 			if (!pages_done)
 				pages_done = ret;
 			break;
@@ -1885,10 +1881,10 @@ long populate_vma_page_range(struct vm_area_struct *vma,
 	int gup_flags;
 	long ret;
 
-	VM_BUG_ON(!PAGE_ALIGNED(start));
-	VM_BUG_ON(!PAGE_ALIGNED(end));
-	VM_BUG_ON_VMA(start < vma->vm_start, vma);
-	VM_BUG_ON_VMA(end   > vma->vm_end, vma);
+	VM_WARN_ON_ONCE(!PAGE_ALIGNED(start));
+	VM_WARN_ON_ONCE(!PAGE_ALIGNED(end));
+	VM_WARN_ON_ONCE_VMA(start < vma->vm_start, vma);
+	VM_WARN_ON_ONCE_VMA(end   > vma->vm_end, vma);
 	mmap_assert_locked(mm);
 
 	/*
@@ -1957,8 +1953,8 @@ long faultin_page_range(struct mm_struct *mm, unsigned long start,
 	int gup_flags;
 	long ret;
 
-	VM_BUG_ON(!PAGE_ALIGNED(start));
-	VM_BUG_ON(!PAGE_ALIGNED(end));
+	VM_WARN_ON_ONCE(!PAGE_ALIGNED(start));
+	VM_WARN_ON_ONCE(!PAGE_ALIGNED(end));
 	mmap_assert_locked(mm);
 
 	/*
@@ -2914,7 +2910,8 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
 		} else if (pte_special(pte))
 			goto pte_unmap;
 
-		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+		/* If it's not marked as special it must have a valid memmap. */
+		VM_WARN_ON_ONCE(!pfn_valid(pte_pfn(pte)));
 		page = pte_page(pte);
 
 		folio = try_grab_folio_fast(page, 1, flags);
-- 
cgit v1.2.3


From 3800d55250976b7a4bd42c255b267bc242669709 Mon Sep 17 00:00:00 2001
From: Zi Yan <ziy@nvidia.com>
Date: Wed, 4 Jun 2025 17:14:27 -0400
Subject: mm: rename CONFIG_PAGE_BLOCK_ORDER to CONFIG_PAGE_BLOCK_MAX_ORDER

The config is in fact an additional upper limit of pageblock_order, so
rename it to avoid confusion.

Link: https://lkml.kernel.org/r/20250604211427.1590859-1-ziy@nvidia.com
Signed-off-by: Zi Yan <ziy@nvidia.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Acked-by: Juan Yescas <jyescas@google.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: "Isaac J. Manjarres" <isaacmanjarres@google.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: T.J. Mercier <tjmercier@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h          | 14 +++++++-------
 include/linux/pageblock-flags.h |  8 ++++----
 mm/Kconfig                      | 15 ++++++++-------
 mm/mm_init.c                    |  2 +-
 4 files changed, 20 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 283913d42d7b..5bec8b1d0e66 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -38,19 +38,19 @@
 #define NR_PAGE_ORDERS (MAX_PAGE_ORDER + 1)
 
 /* Defines the order for the number of pages that have a migrate type. */
-#ifndef CONFIG_PAGE_BLOCK_ORDER
-#define PAGE_BLOCK_ORDER MAX_PAGE_ORDER
+#ifndef CONFIG_PAGE_BLOCK_MAX_ORDER
+#define PAGE_BLOCK_MAX_ORDER MAX_PAGE_ORDER
 #else
-#define PAGE_BLOCK_ORDER CONFIG_PAGE_BLOCK_ORDER
-#endif /* CONFIG_PAGE_BLOCK_ORDER */
+#define PAGE_BLOCK_MAX_ORDER CONFIG_PAGE_BLOCK_MAX_ORDER
+#endif /* CONFIG_PAGE_BLOCK_MAX_ORDER */
 
 /*
  * The MAX_PAGE_ORDER, which defines the max order of pages to be allocated
- * by the buddy allocator, has to be larger or equal to the PAGE_BLOCK_ORDER,
+ * by the buddy allocator, has to be larger or equal to the PAGE_BLOCK_MAX_ORDER,
  * which defines the order for the number of pages that can have a migrate type
  */
-#if (PAGE_BLOCK_ORDER > MAX_PAGE_ORDER)
-#error MAX_PAGE_ORDER must be >= PAGE_BLOCK_ORDER
+#if (PAGE_BLOCK_MAX_ORDER > MAX_PAGE_ORDER)
+#error MAX_PAGE_ORDER must be >= PAGE_BLOCK_MAX_ORDER
 #endif
 
 /*
diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index e73a4292ef02..6297c6343c55 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -41,18 +41,18 @@ extern unsigned int pageblock_order;
  * Huge pages are a constant size, but don't exceed the maximum allocation
  * granularity.
  */
-#define pageblock_order		MIN_T(unsigned int, HUGETLB_PAGE_ORDER, PAGE_BLOCK_ORDER)
+#define pageblock_order		MIN_T(unsigned int, HUGETLB_PAGE_ORDER, PAGE_BLOCK_MAX_ORDER)
 
 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
 
 #elif defined(CONFIG_TRANSPARENT_HUGEPAGE)
 
-#define pageblock_order		MIN_T(unsigned int, HPAGE_PMD_ORDER, PAGE_BLOCK_ORDER)
+#define pageblock_order		MIN_T(unsigned int, HPAGE_PMD_ORDER, PAGE_BLOCK_MAX_ORDER)
 
 #else /* CONFIG_TRANSPARENT_HUGEPAGE */
 
-/* If huge pages are not used, group by PAGE_BLOCK_ORDER */
-#define pageblock_order		PAGE_BLOCK_ORDER
+/* If huge pages are not used, group by PAGE_BLOCK_MAX_ORDER */
+#define pageblock_order		PAGE_BLOCK_MAX_ORDER
 
 #endif /* CONFIG_HUGETLB_PAGE */
 
diff --git a/mm/Kconfig b/mm/Kconfig
index 15716b1a5e21..3b2060a61c05 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1005,8 +1005,8 @@ config ARCH_FORCE_MAX_ORDER
 # the default page block order is MAX_PAGE_ORDER (10) as per
 # include/linux/mmzone.h.
 #
-config PAGE_BLOCK_ORDER
-	int "Page Block Order"
+config PAGE_BLOCK_MAX_ORDER
+	int "Page Block Order Upper Limit"
 	range 1 10 if ARCH_FORCE_MAX_ORDER = 0
 	default 10 if ARCH_FORCE_MAX_ORDER = 0
 	range 1 ARCH_FORCE_MAX_ORDER if ARCH_FORCE_MAX_ORDER != 0
@@ -1014,12 +1014,13 @@ config PAGE_BLOCK_ORDER
 	help
 	  The page block order refers to the power of two number of pages that
 	  are physically contiguous and can have a migrate type associated to
-	  them. The maximum size of the page block order is limited by
-	  ARCH_FORCE_MAX_ORDER.
+	  them. The maximum size of the page block order is at least limited by
+	  ARCH_FORCE_MAX_ORDER/MAX_PAGE_ORDER.
 
-	  This config allows overriding the default page block order when the
-	  page block order is required to be smaller than ARCH_FORCE_MAX_ORDER
-	  or MAX_PAGE_ORDER.
+	  This config adds a new upper limit of default page block
+	  order when the page block order is required to be smaller than
+	  ARCH_FORCE_MAX_ORDER/MAX_PAGE_ORDER or other limits
+	  (see include/linux/pageblock-flags.h for details).
 
 	  Reducing pageblock order can negatively impact THP generation
 	  success rate. If your workloads use THP heavily, please use this
diff --git a/mm/mm_init.c b/mm/mm_init.c
index f2944748f526..02f41e2bdf60 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1509,7 +1509,7 @@ static inline void setup_usemap(struct zone *zone) {}
 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
 void __init set_pageblock_order(void)
 {
-	unsigned int order = PAGE_BLOCK_ORDER;
+	unsigned int order = PAGE_BLOCK_MAX_ORDER;
 
 	/* Check that pageblock_nr_pages has not already been setup */
 	if (pageblock_order)
-- 
cgit v1.2.3


From 44b1b073eb36143ec65a918c0fbaa582f3ec2aa1 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 10 Jun 2025 07:49:38 +0200
Subject: mm: stop passing a writeback_control structure to shmem_writeout

shmem_writeout only needs the swap_iocb cookie and the split folio list.
Pass those explicitly and remove the now unused list member from struct
writeback_control.

Link: https://lkml.kernel.org/r/20250610054959.2057526-3-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Nhat Pham <nphamcs@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/gpu/drm/i915/gem/i915_gem_shmem.c |  2 +-
 drivers/gpu/drm/ttm/ttm_backup.c          |  9 +--------
 include/linux/shmem_fs.h                  |  5 ++++-
 include/linux/writeback.h                 |  3 ---
 mm/shmem.c                                | 26 +++++++++++++++-----------
 mm/vmscan.c                               | 12 +++++-------
 6 files changed, 26 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c
index 19a3eb82dc6a..24d8daa4fdb3 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c
@@ -317,7 +317,7 @@ void __shmem_writeback(size_t size, struct address_space *mapping)
 		if (folio_mapped(folio))
 			folio_redirty_for_writepage(&wbc, folio);
 		else
-			error = shmem_writeout(folio, &wbc);
+			error = shmem_writeout(folio, NULL, NULL);
 	}
 }
 
diff --git a/drivers/gpu/drm/ttm/ttm_backup.c b/drivers/gpu/drm/ttm/ttm_backup.c
index ffaab68bd5dd..6f2e58be4f3e 100644
--- a/drivers/gpu/drm/ttm/ttm_backup.c
+++ b/drivers/gpu/drm/ttm/ttm_backup.c
@@ -112,15 +112,8 @@ ttm_backup_backup_page(struct file *backup, struct page *page,
 
 	if (writeback && !folio_mapped(to_folio) &&
 	    folio_clear_dirty_for_io(to_folio)) {
-		struct writeback_control wbc = {
-			.sync_mode = WB_SYNC_NONE,
-			.nr_to_write = SWAP_CLUSTER_MAX,
-			.range_start = 0,
-			.range_end = LLONG_MAX,
-			.for_reclaim = 1,
-		};
 		folio_set_reclaim(to_folio);
-		ret = shmem_writeout(to_folio, &wbc);
+		ret = shmem_writeout(to_folio, NULL, NULL);
 		if (!folio_test_writeback(to_folio))
 			folio_clear_reclaim(to_folio);
 		/*
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 5f03a39a26f7..6d0f9c599ff7 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -11,6 +11,8 @@
 #include <linux/fs_parser.h>
 #include <linux/userfaultfd_k.h>
 
+struct swap_iocb;
+
 /* inode in-kernel data */
 
 #ifdef CONFIG_TMPFS_QUOTA
@@ -107,7 +109,8 @@ static inline bool shmem_mapping(struct address_space *mapping)
 void shmem_unlock_mapping(struct address_space *mapping);
 struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
 					pgoff_t index, gfp_t gfp_mask);
-int shmem_writeout(struct folio *folio, struct writeback_control *wbc);
+int shmem_writeout(struct folio *folio, struct swap_iocb **plug,
+		struct list_head *folio_list);
 void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
 int shmem_unuse(unsigned int type);
 
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index eda4b62511f7..82f217970092 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -79,9 +79,6 @@ struct writeback_control {
 	 */
 	struct swap_iocb **swap_plug;
 
-	/* Target list for splitting a large folio */
-	struct list_head *list;
-
 	/* internal fields used by the ->writepages implementation: */
 	struct folio_batch fbatch;
 	pgoff_t index;
diff --git a/mm/shmem.c b/mm/shmem.c
index 3a5a65b1f41a..ad8db487e721 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1540,11 +1540,13 @@ start_over:
 /**
  * shmem_writeout - Write the folio to swap
  * @folio: The folio to write
- * @wbc: How writeback is to be done
+ * @plug: swap plug
+ * @folio_list: list to put back folios on split
  *
  * Move the folio from the page cache to the swap cache.
  */
-int shmem_writeout(struct folio *folio, struct writeback_control *wbc)
+int shmem_writeout(struct folio *folio, struct swap_iocb **plug,
+		struct list_head *folio_list)
 {
 	struct address_space *mapping = folio->mapping;
 	struct inode *inode = mapping->host;
@@ -1554,9 +1556,6 @@ int shmem_writeout(struct folio *folio, struct writeback_control *wbc)
 	int nr_pages;
 	bool split = false;
 
-	if (WARN_ON_ONCE(!wbc->for_reclaim))
-		goto redirty;
-
 	if ((info->flags & VM_LOCKED) || sbinfo->noswap)
 		goto redirty;
 
@@ -1583,7 +1582,7 @@ int shmem_writeout(struct folio *folio, struct writeback_control *wbc)
 try_split:
 		/* Ensure the subpages are still dirty */
 		folio_test_set_dirty(folio);
-		if (split_folio_to_list(folio, wbc->list))
+		if (split_folio_to_list(folio, folio_list))
 			goto redirty;
 		folio_clear_dirty(folio);
 	}
@@ -1636,13 +1635,21 @@ try_split:
 		list_add(&info->swaplist, &shmem_swaplist);
 
 	if (!folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN)) {
+		struct writeback_control wbc = {
+			.sync_mode	= WB_SYNC_NONE,
+			.nr_to_write	= SWAP_CLUSTER_MAX,
+			.range_start	= 0,
+			.range_end	= LLONG_MAX,
+			.for_reclaim	= 1,
+			.swap_plug	= plug,
+		};
 		shmem_recalc_inode(inode, 0, nr_pages);
 		swap_shmem_alloc(folio->swap, nr_pages);
 		shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap));
 
 		mutex_unlock(&shmem_swaplist_mutex);
 		BUG_ON(folio_mapped(folio));
-		return swap_writeout(folio, wbc);
+		return swap_writeout(folio, &wbc);
 	}
 	if (!info->swapped)
 		list_del_init(&info->swaplist);
@@ -1651,10 +1658,7 @@ try_split:
 		goto try_split;
 redirty:
 	folio_mark_dirty(folio);
-	if (wbc->for_reclaim)
-		return AOP_WRITEPAGE_ACTIVATE;	/* Return with folio locked */
-	folio_unlock(folio);
-	return 0;
+	return AOP_WRITEPAGE_ACTIVATE;	/* Return with folio locked */
 }
 EXPORT_SYMBOL_GPL(shmem_writeout);
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b6dd1708fe82..3cceb619a853 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -669,15 +669,13 @@ static pageout_t writeout(struct folio *folio, struct address_space *mapping,
 
 	/*
 	 * The large shmem folio can be split if CONFIG_THP_SWAP is not enabled
-	 * or we failed to allocate contiguous swap entries.
+	 * or we failed to allocate contiguous swap entries, in which case
+	 * the split out folios get added back to folio_list.
 	 */
-	if (shmem_mapping(mapping)) {
-		if (folio_test_large(folio))
-			wbc.list = folio_list;
-		res = shmem_writeout(folio, &wbc);
-	} else {
+	if (shmem_mapping(mapping))
+		res = shmem_writeout(folio, plug, folio_list);
+	else
 		res = swap_writeout(folio, &wbc);
-	}
 
 	if (res < 0)
 		handle_write_error(mapping, folio, res);
-- 
cgit v1.2.3


From 624043dbd5be03cc5a2b9175c3934e6fb0ef7c70 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 10 Jun 2025 07:49:41 +0200
Subject: mm: stop passing a writeback_control structure to swap_writeout

swap_writeout only needs the swap_iocb cookie from the writeback_control
structure, so pass it explicitly.

Link: https://lkml.kernel.org/r/20250610054959.2057526-6-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Nhat Pham <nphamcs@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/writeback.h |  7 -------
 mm/page_io.c              |  4 ++--
 mm/shmem.c                | 10 +---------
 mm/swap.h                 |  7 +++++--
 mm/vmscan.c               | 10 +---------
 5 files changed, 9 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 82f217970092..9e960f2faf79 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -72,13 +72,6 @@ struct writeback_control {
 	 */
 	unsigned no_cgroup_owner:1;
 
-	/* To enable batching of swap writes to non-block-device backends,
-	 * "plug" can be set point to a 'struct swap_iocb *'.  When all swap
-	 * writes have been submitted, if with swap_iocb is not NULL,
-	 * swap_write_unplug() should be called.
-	 */
-	struct swap_iocb **swap_plug;
-
 	/* internal fields used by the ->writepages implementation: */
 	struct folio_batch fbatch;
 	pgoff_t index;
diff --git a/mm/page_io.c b/mm/page_io.c
index fb52bedcc966..a2056a5ecb13 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -237,7 +237,7 @@ static void swap_zeromap_folio_clear(struct folio *folio)
  * We may have stale swap cache pages in memory: notice
  * them here and get rid of the unnecessary final write.
  */
-int swap_writeout(struct folio *folio, struct writeback_control *wbc)
+int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug)
 {
 	int ret = 0;
 
@@ -281,7 +281,7 @@ int swap_writeout(struct folio *folio, struct writeback_control *wbc)
 		return AOP_WRITEPAGE_ACTIVATE;
 	}
 
-	__swap_writepage(folio, wbc->swap_plug);
+	__swap_writepage(folio, swap_plug);
 	return 0;
 out_unlock:
 	folio_unlock(folio);
diff --git a/mm/shmem.c b/mm/shmem.c
index ad8db487e721..eda35be2a8d9 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1635,21 +1635,13 @@ try_split:
 		list_add(&info->swaplist, &shmem_swaplist);
 
 	if (!folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN)) {
-		struct writeback_control wbc = {
-			.sync_mode	= WB_SYNC_NONE,
-			.nr_to_write	= SWAP_CLUSTER_MAX,
-			.range_start	= 0,
-			.range_end	= LLONG_MAX,
-			.for_reclaim	= 1,
-			.swap_plug	= plug,
-		};
 		shmem_recalc_inode(inode, 0, nr_pages);
 		swap_shmem_alloc(folio->swap, nr_pages);
 		shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap));
 
 		mutex_unlock(&shmem_swaplist_mutex);
 		BUG_ON(folio_mapped(folio));
-		return swap_writeout(folio, &wbc);
+		return swap_writeout(folio, plug);
 	}
 	if (!info->swapped)
 		list_del_init(&info->swaplist);
diff --git a/mm/swap.h b/mm/swap.h
index 911817f051c2..911ad5ff0f89 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -3,6 +3,8 @@
 #define _MM_SWAP_H
 
 struct mempolicy;
+struct swap_iocb;
+
 extern int page_cluster;
 
 #ifdef CONFIG_SWAP
@@ -20,7 +22,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
 		__swap_read_unplug(plug);
 }
 void swap_write_unplug(struct swap_iocb *sio);
-int swap_writeout(struct folio *folio, struct writeback_control *wbc);
+int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug);
 void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
 
 /* linux/mm/swap_state.c */
@@ -160,7 +162,8 @@ static inline struct folio *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
 	return NULL;
 }
 
-static inline int swap_writeout(struct folio *f, struct writeback_control *wbc)
+static inline int swap_writeout(struct folio *folio,
+		struct swap_iocb **swap_plug)
 {
 	return 0;
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3cceb619a853..a93a1ba9009e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -655,14 +655,6 @@ typedef enum {
 static pageout_t writeout(struct folio *folio, struct address_space *mapping,
 		struct swap_iocb **plug, struct list_head *folio_list)
 {
-	struct writeback_control wbc = {
-		.sync_mode = WB_SYNC_NONE,
-		.nr_to_write = SWAP_CLUSTER_MAX,
-		.range_start = 0,
-		.range_end = LLONG_MAX,
-		.for_reclaim = 1,
-		.swap_plug = plug,
-	};
 	int res;
 
 	folio_set_reclaim(folio);
@@ -675,7 +667,7 @@ static pageout_t writeout(struct folio *folio, struct address_space *mapping,
 	if (shmem_mapping(mapping))
 		res = shmem_writeout(folio, plug, folio_list);
 	else
-		res = swap_writeout(folio, &wbc);
+		res = swap_writeout(folio, plug);
 
 	if (res < 0)
 		handle_write_error(mapping, folio, res);
-- 
cgit v1.2.3


From a8fb49c6abbbe5c71e1a8a888ef2c4b3e341d169 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 10 Jun 2025 07:49:42 +0200
Subject: mm: remove the for_reclaim field from struct writeback_control

This field is now only set to one in the i915 gem code that only calls
writeback_iter on it, which ignores the flag.  All other checks are thus
dead code and the field can be removed.

Link: https://lkml.kernel.org/r/20250610054959.2057526-7-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Nhat Pham <nphamcs@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/gpu/drm/i915/gem/i915_gem_shmem.c |  1 -
 fs/fuse/file.c                            | 11 -----------
 fs/nfs/write.c                            |  2 +-
 include/linux/writeback.h                 |  1 -
 include/trace/events/btrfs.h              |  7 ++-----
 include/trace/events/writeback.h          |  8 ++------
 6 files changed, 5 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c
index 24d8daa4fdb3..f263615f6ece 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c
@@ -302,7 +302,6 @@ void __shmem_writeback(size_t size, struct address_space *mapping)
 		.nr_to_write = SWAP_CLUSTER_MAX,
 		.range_start = 0,
 		.range_end = LLONG_MAX,
-		.for_reclaim = 1,
 	};
 	struct folio *folio = NULL;
 	int error = 0;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 47006d0753f1..95a657a57786 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1927,17 +1927,6 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
 	struct fuse_file *ff;
 	int err;
 
-	/*
-	 * Inode is always written before the last reference is dropped and
-	 * hence this should not be reached from reclaim.
-	 *
-	 * Writing back the inode from reclaim can deadlock if the request
-	 * processing itself needs an allocation.  Allocations triggering
-	 * reclaim while serving a request can't be prevented, because it can
-	 * involve any number of unrelated userspace processes.
-	 */
-	WARN_ON(wbc->for_reclaim);
-
 	ff = __fuse_write_file_get(fi);
 	err = fuse_flush_times(inode, ff);
 	if (ff)
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 374fc6b34c79..cf1d720b8251 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -720,7 +720,7 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
 	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
 
 	if (!(mntflags & NFS_MOUNT_WRITE_EAGER) || wbc->for_kupdate ||
-	    wbc->for_background || wbc->for_sync || wbc->for_reclaim) {
+	    wbc->for_background || wbc->for_sync) {
 		ioc = nfs_io_completion_alloc(GFP_KERNEL);
 		if (ioc)
 			nfs_io_completion_init(ioc, nfs_io_completion_commit,
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 9e960f2faf79..a2848d731a46 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -59,7 +59,6 @@ struct writeback_control {
 	unsigned for_kupdate:1;		/* A kupdate writeback */
 	unsigned for_background:1;	/* A background writeback */
 	unsigned tagged_writepages:1;	/* tag-and-write to avoid livelock */
-	unsigned for_reclaim:1;		/* Invoked from the page allocator */
 	unsigned range_cyclic:1;	/* range_start is cyclic */
 	unsigned for_sync:1;		/* sync(2) WB_SYNC_ALL writeback */
 	unsigned unpinned_netfs_wb:1;	/* Cleared I_PINNING_NETFS_WB */
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index bebc252db865..0adc40f5e72b 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -686,7 +686,6 @@ DECLARE_EVENT_CLASS(btrfs__writepage,
 		__field(	loff_t, range_start		)
 		__field(	loff_t, range_end		)
 		__field(	char,   for_kupdate		)
-		__field(	char,   for_reclaim		)
 		__field(	char,   range_cyclic		)
 		__field(	unsigned long,  writeback_index	)
 		__field(	u64,    root_objectid		)
@@ -700,7 +699,6 @@ DECLARE_EVENT_CLASS(btrfs__writepage,
 		__entry->range_start	= wbc->range_start;
 		__entry->range_end	= wbc->range_end;
 		__entry->for_kupdate	= wbc->for_kupdate;
-		__entry->for_reclaim	= wbc->for_reclaim;
 		__entry->range_cyclic	= wbc->range_cyclic;
 		__entry->writeback_index = inode->i_mapping->writeback_index;
 		__entry->root_objectid	= btrfs_root_id(BTRFS_I(inode)->root);
@@ -709,13 +707,12 @@ DECLARE_EVENT_CLASS(btrfs__writepage,
 	TP_printk_btrfs("root=%llu(%s) ino=%llu page_index=%lu "
 		  "nr_to_write=%ld pages_skipped=%ld range_start=%llu "
 		  "range_end=%llu for_kupdate=%d "
-		  "for_reclaim=%d range_cyclic=%d writeback_index=%lu",
+		  "range_cyclic=%d writeback_index=%lu",
 		  show_root_type(__entry->root_objectid),
 		  __entry->ino, __entry->index,
 		  __entry->nr_to_write, __entry->pages_skipped,
 		  __entry->range_start, __entry->range_end,
-		  __entry->for_kupdate,
-		  __entry->for_reclaim, __entry->range_cyclic,
+		  __entry->for_kupdate, __entry->range_cyclic,
 		  __entry->writeback_index)
 );
 
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 0ff388131fc9..1e23919c0da9 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -459,7 +459,6 @@ DECLARE_EVENT_CLASS(wbc_class,
 		__field(int, sync_mode)
 		__field(int, for_kupdate)
 		__field(int, for_background)
-		__field(int, for_reclaim)
 		__field(int, range_cyclic)
 		__field(long, range_start)
 		__field(long, range_end)
@@ -473,23 +472,20 @@ DECLARE_EVENT_CLASS(wbc_class,
 		__entry->sync_mode	= wbc->sync_mode;
 		__entry->for_kupdate	= wbc->for_kupdate;
 		__entry->for_background	= wbc->for_background;
-		__entry->for_reclaim	= wbc->for_reclaim;
 		__entry->range_cyclic	= wbc->range_cyclic;
 		__entry->range_start	= (long)wbc->range_start;
 		__entry->range_end	= (long)wbc->range_end;
 		__entry->cgroup_ino	= __trace_wbc_assign_cgroup(wbc);
 	),
 
-	TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d "
-		"bgrd=%d reclm=%d cyclic=%d "
-		"start=0x%lx end=0x%lx cgroup_ino=%lu",
+	TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d bgrd=%d "
+		"cyclic=%d start=0x%lx end=0x%lx cgroup_ino=%lu",
 		__entry->name,
 		__entry->nr_to_write,
 		__entry->pages_skipped,
 		__entry->sync_mode,
 		__entry->for_kupdate,
 		__entry->for_background,
-		__entry->for_reclaim,
 		__entry->range_cyclic,
 		__entry->range_start,
 		__entry->range_end,
-- 
cgit v1.2.3


From 4f745def815d86eece3614aa0cd10a25cf94fed4 Mon Sep 17 00:00:00 2001
From: Donet Tom <donettom@linux.ibm.com>
Date: Wed, 28 May 2025 12:18:00 -0500
Subject: drivers/base/node: optimize memory block registration to reduce boot
 time
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "drivers/base/node.c: optimization and cleanups", v7.


This patch (of 7)

During node device initialization, `memory blocks` are registered under
each NUMA node.  The `memory blocks` to be registered are identified using
the node's start and end PFNs, which are obtained from the node's pg_data.

However, not all PFNs within this range necessarily belong to the same
node—some may belong to other nodes.  Additionally, due to the
discontiguous nature of physical memory, certain sections within a `memory
block` may be absent.

As a result, `memory blocks` that fall between a node's start and end PFNs
may span across multiple nodes, and some sections within those blocks may
be missing.  `Memory blocks` have a fixed size, which is architecture
dependent.

Due to these considerations, the memory block registration is currently
performed as follows:

for_each_online_node(nid):
    start_pfn = pgdat->node_start_pfn;
    end_pfn = pgdat->node_start_pfn + node_spanned_pages;
    for_each_memory_block_between(PFN_PHYS(start_pfn), PFN_PHYS(end_pfn))
        mem_blk = memory_block_id(pfn_to_section_nr(pfn));
        pfn_mb_start=section_nr_to_pfn(mem_blk->start_section_nr)
        pfn_mb_end = pfn_mb_start + memory_block_pfns - 1
        for (pfn = pfn_mb_start; pfn < pfn_mb_end; pfn++):
            if (get_nid_for_pfn(pfn) != nid):
                continue;
            else
                do_register_memory_block_under_node(nid, mem_blk,
                                                        MEMINIT_EARLY);

Here, we derive the start and end PFNs from the node's pg_data, then
determine the memory blocks that may belong to the node.  For each `memory
block` in this range, we inspect all PFNs it contains and check their
associated NUMA node ID.  If a PFN within the block matches the current
node, the memory block is registered under that node.

If CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled, get_nid_for_pfn() performs
a binary search in the `memblock regions` to determine the NUMA node ID
for a given PFN.  If it is not enabled, the node ID is retrieved directly
from the struct page.

On large systems, this process can become time-consuming, especially since
we iterate over each `memory block` and all PFNs within it until a match
is found.  When CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled, the
additional overhead of the binary search increases the execution time
significantly, potentially leading to soft lockups during boot.

In this patch, we iterate over `memblock region` to identify the `memory
blocks` that belong to the current NUMA node.  `memblock regions` are
contiguous memory ranges, each associated with a single NUMA node, and
they do not span across multiple nodes.

for_each_memory_region(r): // r => region
  if (!node_online(r->nid)):
    continue;
  else
    for_each_memory_block_between(r->base, r->base + r->size - 1):
      do_register_memory_block_under_node(r->nid, mem_blk, MEMINIT_EARLY);

We iterate over all memblock regions, and if the node associated with the
region is online, we calculate the start and end memory blocks based on
the region's start and end PFNs.  We then register all the memory blocks
within that range under the region node.

Test Results on My system with 32TB RAM
=======================================
1. Boot time with CONFIG_DEFERRED_STRUCT_PAGE_INIT enabled.

Without this patch
------------------
Startup finished in 1min 16.528s (kernel)

With this patch
---------------
Startup finished in 17.236s (kernel) - 78% Improvement

2. Boot time with CONFIG_DEFERRED_STRUCT_PAGE_INIT disabled.

Without this patch
------------------
Startup finished in 28.320s (kernel)

With this patch
---------------
Startup finished in 15.621s (kernel) - 46% Improvement

[donettom@linux.ibm.com: restore removed extra line]
  Link: https://lkml.kernel.org/r/20250609140354.467908-1-donettom@linux.ibm.com
Link: https://lkml.kernel.org/r/2a0a05c2dffc62a742bf1dd030098be4ce99be28.1748452241.git.donettom@linux.ibm.com
Link: https://lkml.kernel.org/r/2a0a05c2dffc62a742bf1dd030098be4ce99be28.1748452241.git.donettom@linux.ibm.com
Signed-off-by: Donet Tom <donettom@linux.ibm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Oscar Salvador <osalvador@suse.de>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Acked-by: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/base/memory.c  | 21 ++++-----------------
 drivers/base/node.c    | 35 +++++++++++++++++++++++++++++++++--
 include/linux/memory.h | 18 ++++++++++++++++++
 include/linux/node.h   |  3 +++
 4 files changed, 58 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index ed3e69dc785c..5c6c1d6bb59f 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -22,6 +22,7 @@
 #include <linux/stat.h>
 #include <linux/slab.h>
 #include <linux/xarray.h>
+#include <linux/export.h>
 
 #include <linux/atomic.h>
 #include <linux/uaccess.h>
@@ -48,22 +49,8 @@ int mhp_online_type_from_str(const char *str)
 
 #define to_memory_block(dev) container_of(dev, struct memory_block, dev)
 
-static int sections_per_block;
-
-static inline unsigned long memory_block_id(unsigned long section_nr)
-{
-	return section_nr / sections_per_block;
-}
-
-static inline unsigned long pfn_to_block_id(unsigned long pfn)
-{
-	return memory_block_id(pfn_to_section_nr(pfn));
-}
-
-static inline unsigned long phys_to_block_id(unsigned long phys)
-{
-	return pfn_to_block_id(PFN_DOWN(phys));
-}
+int sections_per_block;
+EXPORT_SYMBOL(sections_per_block);
 
 static int memory_subsys_online(struct device *dev);
 static int memory_subsys_offline(struct device *dev);
@@ -683,7 +670,7 @@ int __weak arch_get_memory_phys_device(unsigned long start_pfn)
  *
  * Called under device_hotplug_lock.
  */
-static struct memory_block *find_memory_block_by_id(unsigned long block_id)
+struct memory_block *find_memory_block_by_id(unsigned long block_id)
 {
 	struct memory_block *mem;
 
diff --git a/drivers/base/node.c b/drivers/base/node.c
index c19094481630..a854d04cea5d 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -21,6 +21,7 @@
 #include <linux/pm_runtime.h>
 #include <linux/swap.h>
 #include <linux/slab.h>
+#include <linux/memblock.h>
 
 static const struct bus_type node_subsys = {
 	.name = "node",
@@ -859,6 +860,34 @@ void unregister_memory_block_under_nodes(struct memory_block *mem_blk)
 			  kobject_name(&node_devices[mem_blk->nid]->dev.kobj));
 }
 
+/* register all memory blocks under the corresponding nodes */
+static void register_memory_blocks_under_nodes(void)
+{
+	struct memblock_region *r;
+
+	for_each_mem_region(r) {
+		const unsigned long start_block_id = phys_to_block_id(r->base);
+		const unsigned long end_block_id = phys_to_block_id(r->base + r->size - 1);
+		const int nid = memblock_get_region_node(r);
+		unsigned long block_id;
+
+		if (!node_online(nid))
+			continue;
+
+		for (block_id = start_block_id; block_id <= end_block_id; block_id++) {
+			struct memory_block *mem;
+
+			mem = find_memory_block_by_id(block_id);
+			if (!mem)
+				continue;
+
+			do_register_memory_block_under_node(nid, mem, MEMINIT_EARLY);
+			put_device(&mem->dev);
+		}
+
+	}
+}
+
 void register_memory_blocks_under_node(int nid, unsigned long start_pfn,
 				       unsigned long end_pfn,
 				       enum meminit_context context)
@@ -980,11 +1009,13 @@ void __init node_dev_init(void)
 
 	/*
 	 * Create all node devices, which will properly link the node
-	 * to applicable memory block devices and already created cpu devices.
+	 * to already created cpu devices.
 	 */
 	for_each_online_node(i) {
-		ret = register_one_node(i);
+		ret =  __register_one_node(i);
 		if (ret)
 			panic("%s() failed to add node: %d\n", __func__, ret);
 	}
+
+	register_memory_blocks_under_nodes();
 }
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 5ec4e6d209b9..bd4440bc4a57 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -179,12 +179,30 @@ struct memory_group *memory_group_find_by_id(int mgid);
 typedef int (*walk_memory_groups_func_t)(struct memory_group *, void *);
 int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func,
 			       struct memory_group *excluded, void *arg);
+struct memory_block *find_memory_block_by_id(unsigned long block_id);
 #define hotplug_memory_notifier(fn, pri) ({		\
 	static __meminitdata struct notifier_block fn##_mem_nb =\
 		{ .notifier_call = fn, .priority = pri };\
 	register_memory_notifier(&fn##_mem_nb);			\
 })
 
+extern int sections_per_block;
+
+static inline unsigned long memory_block_id(unsigned long section_nr)
+{
+	return section_nr / sections_per_block;
+}
+
+static inline unsigned long pfn_to_block_id(unsigned long pfn)
+{
+	return memory_block_id(pfn_to_section_nr(pfn));
+}
+
+static inline unsigned long phys_to_block_id(unsigned long phys)
+{
+	return pfn_to_block_id(PFN_DOWN(phys));
+}
+
 #ifdef CONFIG_NUMA
 void memory_block_add_nid(struct memory_block *mem, int nid,
 			  enum meminit_context context);
diff --git a/include/linux/node.h b/include/linux/node.h
index 2b7517892230..485370f3bc17 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -120,6 +120,9 @@ static inline void register_memory_blocks_under_node(int nid, unsigned long star
 						     enum meminit_context context)
 {
 }
+static inline void register_memory_blocks_under_nodes(void)
+{
+}
 #endif
 
 extern void unregister_node(struct node *node);
-- 
cgit v1.2.3


From ac24f6cd87d88150fc6c1fef904794571f62dc5e Mon Sep 17 00:00:00 2001
From: Donet Tom <donettom@linux.ibm.com>
Date: Wed, 28 May 2025 12:18:02 -0500
Subject: drivers/base/node: remove register_memory_blocks_under_node()
 function call from register_one_node
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

register_one_node() is now only called via cpu_up() →
__try_online_node() during CPU hotplug operations to online a node.

At this stage, the node has not yet had any memory added.  As a result,
there are no memory blocks to walk or register, so calling
register_memory_blocks_under_node() is unnecessary.

Therefore, the call to register_memory_blocks_under_node() has been
removed from register_one_node().

Link: https://lkml.kernel.org/r/ecf07075b1a41015fcf58823997d5c2ed7b8c18f.1748452242.git.donettom@linux.ibm.com
Signed-off-by: Donet Tom <donettom@linux.ibm.com>
Acked-by: Oscar Salvador <osalvador@suse.de>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/node.h | 16 +---------------
 1 file changed, 1 insertion(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/node.h b/include/linux/node.h
index 485370f3bc17..b15de78e0408 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -134,21 +134,7 @@ extern int __register_one_node(int nid);
 /* Registers an online node */
 static inline int register_one_node(int nid)
 {
-	int error = 0;
-
-	if (node_online(nid)) {
-		struct pglist_data *pgdat = NODE_DATA(nid);
-		unsigned long start_pfn = pgdat->node_start_pfn;
-		unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
-
-		error = __register_one_node(nid);
-		if (error)
-			return error;
-		register_memory_blocks_under_node(nid, start_pfn, end_pfn,
-						  MEMINIT_EARLY);
-	}
-
-	return error;
+	return __register_one_node(nid);
 }
 
 extern void unregister_one_node(int nid);
-- 
cgit v1.2.3


From 10f09d82f8b7c25bbd0b6d5142ff6df6e634132d Mon Sep 17 00:00:00 2001
From: Donet Tom <donettom@linux.ibm.com>
Date: Wed, 28 May 2025 12:18:03 -0500
Subject: drivers/base/node: rename register_memory_blocks_under_node() and
 remove context argument

The function register_memory_blocks_under_node() is now only called from
the memory hotplug path, as register_memory_blocks_under_node_early()
handles registration during early boot.  Therefore, the context argument
used to differentiate between early boot and hotplug is no longer needed
and was removed.

Since the function is only called from the hotplug path, we renamed
register_memory_blocks_under_node() to
register_memory_blocks_under_node_hotplug()

Link: https://lkml.kernel.org/r/907c22292b0ee4975107876efc875c75c11badd9.1748452242.git.donettom@linux.ibm.com
Signed-off-by: Donet Tom <donettom@linux.ibm.com>
Acked-by: Oscar Salvador <osalvador@suse.de>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/base/node.c  |  5 ++---
 include/linux/node.h | 11 +++++------
 mm/memory_hotplug.c  |  5 ++---
 3 files changed, 9 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 24c32cdbe97d..302cb99853dd 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -839,9 +839,8 @@ static void register_memory_blocks_under_nodes(void)
 	}
 }
 
-void register_memory_blocks_under_node(int nid, unsigned long start_pfn,
-				       unsigned long end_pfn,
-				       enum meminit_context context)
+void register_memory_blocks_under_node_hotplug(int nid, unsigned long start_pfn,
+					       unsigned long end_pfn)
 {
 	walk_memory_blocks(PFN_PHYS(start_pfn), PFN_PHYS(end_pfn - start_pfn),
 			   (void *)&nid, register_mem_block_under_node_hotplug);
diff --git a/include/linux/node.h b/include/linux/node.h
index b15de78e0408..75b036a100d2 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -111,13 +111,12 @@ struct memory_block;
 extern struct node *node_devices[];
 
 #if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_NUMA)
-void register_memory_blocks_under_node(int nid, unsigned long start_pfn,
-				       unsigned long end_pfn,
-				       enum meminit_context context);
+void register_memory_blocks_under_node_hotplug(int nid, unsigned long start_pfn,
+					       unsigned long end_pfn);
 #else
-static inline void register_memory_blocks_under_node(int nid, unsigned long start_pfn,
-						     unsigned long end_pfn,
-						     enum meminit_context context)
+static inline void register_memory_blocks_under_node_hotplug(int nid,
+							     unsigned long start_pfn,
+							     unsigned long end_pfn)
 {
 }
 static inline void register_memory_blocks_under_nodes(void)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b1caedbade5b..c6ce1aba64e6 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1575,9 +1575,8 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 		BUG_ON(ret);
 	}
 
-	register_memory_blocks_under_node(nid, PFN_DOWN(start),
-					  PFN_UP(start + size - 1),
-					  MEMINIT_HOTPLUG);
+	register_memory_blocks_under_node_hotplug(nid, PFN_DOWN(start),
+					  PFN_UP(start + size - 1));
 
 	/* create new memmap entry */
 	if (!strcmp(res->name, "System RAM"))
-- 
cgit v1.2.3


From a5352f8a40a8d1b385abeca0b2cff5d2468e31a1 Mon Sep 17 00:00:00 2001
From: Donet Tom <donettom@linux.ibm.com>
Date: Wed, 28 May 2025 12:18:04 -0500
Subject: drivers/base/node: rename __register_one_node() to
 register_one_node()

The register_one_node() function was a simple wrapper around
__register_one_node().  To simplify the code, register_one_node() has been
removed, and __register_one_node() has been renamed to
register_one_node().

Link: https://lkml.kernel.org/r/8262cd0f44eeb048a1fcd3ac8382760d7f7dea60.1748452242.git.donettom@linux.ibm.com
Signed-off-by: Donet Tom <donettom@linux.ibm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/powerpc/platforms/pseries/pci_dlpar.c |  2 +-
 drivers/base/node.c                        |  4 ++--
 include/linux/node.h                       | 13 +------------
 mm/memory_hotplug.c                        |  2 +-
 4 files changed, 5 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c
index 52e2623a741d..aeb8633a3d00 100644
--- a/arch/powerpc/platforms/pseries/pci_dlpar.c
+++ b/arch/powerpc/platforms/pseries/pci_dlpar.c
@@ -29,7 +29,7 @@ struct pci_controller *init_phb_dynamic(struct device_node *dn)
 	nid = of_node_to_nid(dn);
 	if (likely((nid) >= 0)) {
 		if (!node_online(nid)) {
-			if (__register_one_node(nid)) {
+			if (register_one_node(nid)) {
 				pr_err("PCI: Failed to register node %d\n", nid);
 			} else {
 				update_numa_distance(dn);
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 302cb99853dd..6cbeca45c451 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -848,7 +848,7 @@ void register_memory_blocks_under_node_hotplug(int nid, unsigned long start_pfn,
 }
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
-int __register_one_node(int nid)
+int register_one_node(int nid)
 {
 	int error;
 	int cpu;
@@ -955,7 +955,7 @@ void __init node_dev_init(void)
 	 * to already created cpu devices.
 	 */
 	for_each_online_node(i) {
-		ret =  __register_one_node(i);
+		ret =  register_one_node(i);
 		if (ret)
 			panic("%s() failed to add node: %d\n", __func__, ret);
 	}
diff --git a/include/linux/node.h b/include/linux/node.h
index 75b036a100d2..88bceebcbfa5 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -128,14 +128,7 @@ extern void unregister_node(struct node *node);
 #ifdef CONFIG_NUMA
 extern void node_dev_init(void);
 /* Core of the node registration - only memory hotplug should use this */
-extern int __register_one_node(int nid);
-
-/* Registers an online node */
-static inline int register_one_node(int nid)
-{
-	return __register_one_node(nid);
-}
-
+extern int register_one_node(int nid);
 extern void unregister_one_node(int nid);
 extern int register_cpu_under_node(unsigned int cpu, unsigned int nid);
 extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid);
@@ -148,10 +141,6 @@ extern int register_memory_node_under_compute_node(unsigned int mem_nid,
 static inline void node_dev_init(void)
 {
 }
-static inline int __register_one_node(int nid)
-{
-	return 0;
-}
 static inline int register_one_node(int nid)
 {
 	return 0;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c6ce1aba64e6..bec20a91e757 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1571,7 +1571,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 		 * We online node here. We can't roll back from here.
 		 */
 		node_set_online(nid);
-		ret = __register_one_node(nid);
+		ret = register_one_node(nid);
 		BUG_ON(ret);
 	}
 
-- 
cgit v1.2.3


From 7208cc6497c2615ed5a334b52c92ae98bda91198 Mon Sep 17 00:00:00 2001
From: Tal Zussman <tz2294@columbia.edu>
Date: Thu, 19 Jun 2025 21:24:23 -0400
Subject: userfaultfd: correctly prevent registering VM_DROPPABLE regions

Patch series "mm: userfaultfd: assorted fixes and cleanups", v3.

Two fixes and two cleanups for userfaultfd.

Note that the third patch yields a small change in the ABI, but we seem to
have concluded that it is acceptable in this case.


This patch (of 4):

vma_can_userfault() masks off non-userfaultfd VM flags from vm_flags.  The
vm_flags & VM_DROPPABLE test will then always be false, incorrectly
allowing VM_DROPPABLE regions to be registered with userfaultfd.

Additionally, vm_flags is not guaranteed to correspond to the actual VMA's
flags.  Fix this test by checking the VMA's flags directly.

Link: https://lkml.kernel.org/r/20250619-uffd-fixes-v3-0-a7274d3bd5e4@columbia.edu
Link: https://lore.kernel.org/linux-mm/5a875a3a-2243-4eab-856f-bc53ccfec3ea@redhat.com/
Link: https://lkml.kernel.org/r/20250619-uffd-fixes-v3-1-a7274d3bd5e4@columbia.edu
Fixes: 9651fcedf7b9 ("mm: add MAP_DROPPABLE for designating always lazily freeable mappings")
Signed-off-by: Tal Zussman <tz2294@columbia.edu>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Peter Xu <peterx@redhat.com>
Acked-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/userfaultfd_k.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 75342022d144..f3b3d2c9dd5e 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -218,7 +218,7 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
 {
 	vm_flags &= __VM_UFFD_FLAGS;
 
-	if (vm_flags & VM_DROPPABLE)
+	if (vma->vm_flags & VM_DROPPABLE)
 		return false;
 
 	if ((vm_flags & VM_UFFD_MINOR) &&
-- 
cgit v1.2.3


From 5e00e31867d16e235bb693b900c85e86dc2c3464 Mon Sep 17 00:00:00 2001
From: Tal Zussman <tz2294@columbia.edu>
Date: Thu, 19 Jun 2025 21:24:26 -0400
Subject: userfaultfd: remove UFFD_CLOEXEC, UFFD_NONBLOCK, and UFFD_FLAGS_SET

UFFD_CLOEXEC, UFFD_NONBLOCK, and UFFD_FLAGS_SET have been unused since
they were added in commit 932b18e0aec6 ("userfaultfd:
linux/userfaultfd_k.h").  Remove them and the associated BUILD_BUG_ON()
checks.

Link: https://lkml.kernel.org/r/20250619-uffd-fixes-v3-4-a7274d3bd5e4@columbia.edu
Signed-off-by: Tal Zussman <tz2294@columbia.edu>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Peter Xu <peterx@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/userfaultfd.c              | 2 --
 include/linux/userfaultfd_k.h | 4 ----
 2 files changed, 6 deletions(-)

(limited to 'include')

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 771e81ea4ef6..a2928b0aec6f 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -2114,8 +2114,6 @@ static int new_userfaultfd(int flags)
 
 	/* Check the UFFD_* constants for consistency.  */
 	BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS);
-	BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
-	BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);
 
 	if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY))
 		return -EINVAL;
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index f3b3d2c9dd5e..ccad58602846 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -30,11 +30,7 @@
  * from userfaultfd, in order to leave a free define-space for
  * shared O_* flags.
  */
-#define UFFD_CLOEXEC O_CLOEXEC
-#define UFFD_NONBLOCK O_NONBLOCK
-
 #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
-#define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS)
 
 /*
  * Start with fault_pending_wqh and fault_wqh so they're more likely
-- 
cgit v1.2.3


From ff7ec8dc1b646296f8d94c39339e8d3833d16c05 Mon Sep 17 00:00:00 2001
From: wangzijie <wangzijie1@honor.com>
Date: Sat, 7 Jun 2025 10:13:53 +0800
Subject: proc: use the same treatment to check proc_lseek as ones for
 proc_read_iter et.al

Checking pde->proc_ops->proc_lseek directly may cause a UAF in the rmmod
scenario.  It's a gap in proc_reg_open() after commit 654b33ada4ab ("proc:
fix UAF in proc_get_inode()").  Following Al Viro's suggestion, fix it in
the same manner.

Link: https://lkml.kernel.org/r/20250607021353.1127963-1-wangzijie1@honor.com
Fixes: 3f61631d47f1 ("take care to handle NULL ->proc_lseek()")
Signed-off-by: wangzijie <wangzijie1@honor.com>
Reviewed-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: "Edgecombe, Rick P" <rick.p.edgecombe@intel.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/generic.c       | 2 ++
 fs/proc/inode.c         | 2 +-
 fs/proc/internal.h      | 5 +++++
 include/linux/proc_fs.h | 1 +
 4 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index a3e22803cddf..e0e50914ab25 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -569,6 +569,8 @@ static void pde_set_flags(struct proc_dir_entry *pde)
 	if (pde->proc_ops->proc_compat_ioctl)
 		pde->flags |= PROC_ENTRY_proc_compat_ioctl;
 #endif
+	if (pde->proc_ops->proc_lseek)
+		pde->flags |= PROC_ENTRY_proc_lseek;
 }
 
 struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 3604b616311c..129490151be1 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -473,7 +473,7 @@ static int proc_reg_open(struct inode *inode, struct file *file)
 	typeof_member(struct proc_ops, proc_open) open;
 	struct pde_opener *pdeo;
 
-	if (!pde->proc_ops->proc_lseek)
+	if (!pde_has_proc_lseek(pde))
 		file->f_mode &= ~FMODE_LSEEK;
 
 	if (pde_is_permanent(pde)) {
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 96122e91c645..3d48ffe72583 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -99,6 +99,11 @@ static inline bool pde_has_proc_compat_ioctl(const struct proc_dir_entry *pde)
 #endif
 }
 
+static inline bool pde_has_proc_lseek(const struct proc_dir_entry *pde)
+{
+	return pde->flags & PROC_ENTRY_proc_lseek;
+}
+
 extern struct kmem_cache *proc_dir_entry_cache;
 void pde_free(struct proc_dir_entry *pde);
 
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index ea62201c74c4..703d0c76cc9a 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -27,6 +27,7 @@ enum {
 
 	PROC_ENTRY_proc_read_iter	= 1U << 1,
 	PROC_ENTRY_proc_compat_ioctl	= 1U << 2,
+	PROC_ENTRY_proc_lseek		= 1U << 3,
 };
 
 struct proc_ops {
-- 
cgit v1.2.3


From f5e8b140cd1324cf2c9c17487b8f444098624797 Mon Sep 17 00:00:00 2001
From: Ryan Roberts <ryan.roberts@arm.com>
Date: Mon, 9 Jun 2025 10:27:25 +0100
Subject: mm/readahead: make space in struct file_ra_state

We need to be able to store the preferred folio order associated with a
readahead request in the struct file_ra_state so that we can more
accurately increase the order across subsequent readahead requests.  But
struct file_ra_state is per-struct file, so we don't really want to
increase its size.

mmap_miss is currently 32 bits but it is only counted up to 10 *
MMAP_LOTSAMISS, which is currently defined as 1000.  So 16 bits should be
plenty.  Redefine it to unsigned short, making room for order as unsigned
short in follow up commit.

Link: https://lkml.kernel.org/r/20250609092729.274960-4-ryan.roberts@arm.com
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Chaitanya S Prakash <chaitanyas.prakash@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/fs.h |  2 +-
 mm/filemap.c       | 11 ++++++-----
 2 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 62634af97da6..ef819b232d66 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1054,7 +1054,7 @@ struct file_ra_state {
 	unsigned int size;
 	unsigned int async_size;
 	unsigned int ra_pages;
-	unsigned int mmap_miss;
+	unsigned short mmap_miss;
 	loff_t prev_pos;
 };
 
diff --git a/mm/filemap.c b/mm/filemap.c
index a6459874bb2a..7bb4ffca8487 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3217,7 +3217,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
 	DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);
 	struct file *fpin = NULL;
 	unsigned long vm_flags = vmf->vma->vm_flags;
-	unsigned int mmap_miss;
+	unsigned short mmap_miss;
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	/* Use the readahead code, even if readahead is disabled */
@@ -3285,7 +3285,7 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
 	struct file_ra_state *ra = &file->f_ra;
 	DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff);
 	struct file *fpin = NULL;
-	unsigned int mmap_miss;
+	unsigned short mmap_miss;
 
 	/* If we don't want any read-ahead, don't bother */
 	if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
@@ -3605,7 +3605,7 @@ skip:
 static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
 			struct folio *folio, unsigned long start,
 			unsigned long addr, unsigned int nr_pages,
-			unsigned long *rss, unsigned int *mmap_miss)
+			unsigned long *rss, unsigned short *mmap_miss)
 {
 	vm_fault_t ret = 0;
 	struct page *page = folio_page(folio, start);
@@ -3667,7 +3667,7 @@ skip:
 
 static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf,
 		struct folio *folio, unsigned long addr,
-		unsigned long *rss, unsigned int *mmap_miss)
+		unsigned long *rss, unsigned short *mmap_miss)
 {
 	vm_fault_t ret = 0;
 	struct page *page = &folio->page;
@@ -3709,7 +3709,8 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 	struct folio *folio;
 	vm_fault_t ret = 0;
 	unsigned long rss = 0;
-	unsigned int nr_pages = 0, mmap_miss = 0, mmap_miss_saved, folio_type;
+	unsigned int nr_pages = 0, folio_type;
+	unsigned short mmap_miss = 0, mmap_miss_saved;
 
 	rcu_read_lock();
 	folio = next_uptodate_folio(&xas, mapping, end_pgoff);
-- 
cgit v1.2.3


From c4602f9fa77fc6bb956ca51a23e7a39439e75cb6 Mon Sep 17 00:00:00 2001
From: Ryan Roberts <ryan.roberts@arm.com>
Date: Mon, 9 Jun 2025 10:27:26 +0100
Subject: mm/readahead: store folio order in struct file_ra_state

Previously the folio order of the previous readahead request was inferred
from the folio whose readahead marker was hit.  But due to the way we have
to round to non-natural boundaries sometimes, this first folio in the
readahead block is often smaller than the preferred order for that
request.  This means that for cases where the initial sync readahead is
poorly aligned, the folio order will ramp up much more slowly.

So instead, let's store the order in struct file_ra_state so we are not
affected by any required alignment.  We previously made enough room in the
struct for a 16-bit order field.  This should be plenty big enough since we
are limited to MAX_PAGECACHE_ORDER anyway, which is certainly never larger
than ~20.

Since we now pass order in struct file_ra_state, page_cache_ra_order() no
longer needs its new_order parameter, so let's remove that.

Worked example:

Here we are touching pages 17-256 sequentially just as we did in the
previous commit, but now that we are remembering the preferred order
explicitly, we no longer have the slow ramp up problem.  Note specifically
that we no longer have 2 rounds (2x ~128K) of order-2 folios:

TYPE    STARTOFFS     ENDOFFS        SIZE  STARTPG    ENDPG   NRPG  ORDER  RA
-----  ----------  ----------  ----------  -------  -------  -----  -----  --
HOLE   0x00000000  0x00001000        4096        0        1      1
FOLIO  0x00001000  0x00002000        4096        1        2      1      0
FOLIO  0x00002000  0x00003000        4096        2        3      1      0
FOLIO  0x00003000  0x00004000        4096        3        4      1      0
FOLIO  0x00004000  0x00005000        4096        4        5      1      0
FOLIO  0x00005000  0x00006000        4096        5        6      1      0
FOLIO  0x00006000  0x00007000        4096        6        7      1      0
FOLIO  0x00007000  0x00008000        4096        7        8      1      0
FOLIO  0x00008000  0x00009000        4096        8        9      1      0
FOLIO  0x00009000  0x0000a000        4096        9       10      1      0
FOLIO  0x0000a000  0x0000b000        4096       10       11      1      0
FOLIO  0x0000b000  0x0000c000        4096       11       12      1      0
FOLIO  0x0000c000  0x0000d000        4096       12       13      1      0
FOLIO  0x0000d000  0x0000e000        4096       13       14      1      0
FOLIO  0x0000e000  0x0000f000        4096       14       15      1      0
FOLIO  0x0000f000  0x00010000        4096       15       16      1      0
FOLIO  0x00010000  0x00011000        4096       16       17      1      0
FOLIO  0x00011000  0x00012000        4096       17       18      1      0
FOLIO  0x00012000  0x00013000        4096       18       19      1      0
FOLIO  0x00013000  0x00014000        4096       19       20      1      0
FOLIO  0x00014000  0x00015000        4096       20       21      1      0
FOLIO  0x00015000  0x00016000        4096       21       22      1      0
FOLIO  0x00016000  0x00017000        4096       22       23      1      0
FOLIO  0x00017000  0x00018000        4096       23       24      1      0
FOLIO  0x00018000  0x00019000        4096       24       25      1      0
FOLIO  0x00019000  0x0001a000        4096       25       26      1      0
FOLIO  0x0001a000  0x0001b000        4096       26       27      1      0
FOLIO  0x0001b000  0x0001c000        4096       27       28      1      0
FOLIO  0x0001c000  0x0001d000        4096       28       29      1      0
FOLIO  0x0001d000  0x0001e000        4096       29       30      1      0
FOLIO  0x0001e000  0x0001f000        4096       30       31      1      0
FOLIO  0x0001f000  0x00020000        4096       31       32      1      0
FOLIO  0x00020000  0x00021000        4096       32       33      1      0
FOLIO  0x00021000  0x00022000        4096       33       34      1      0
FOLIO  0x00022000  0x00024000        8192       34       36      2      1
FOLIO  0x00024000  0x00028000       16384       36       40      4      2
FOLIO  0x00028000  0x0002c000       16384       40       44      4      2
FOLIO  0x0002c000  0x00030000       16384       44       48      4      2
FOLIO  0x00030000  0x00034000       16384       48       52      4      2
FOLIO  0x00034000  0x00038000       16384       52       56      4      2
FOLIO  0x00038000  0x0003c000       16384       56       60      4      2
FOLIO  0x0003c000  0x00040000       16384       60       64      4      2
FOLIO  0x00040000  0x00050000       65536       64       80     16      4
FOLIO  0x00050000  0x00060000       65536       80       96     16      4
FOLIO  0x00060000  0x00080000      131072       96      128     32      5
FOLIO  0x00080000  0x000a0000      131072      128      160     32      5
FOLIO  0x000a0000  0x000c0000      131072      160      192     32      5
FOLIO  0x000c0000  0x000e0000      131072      192      224     32      5
FOLIO  0x000e0000  0x00100000      131072      224      256     32      5
FOLIO  0x00100000  0x00120000      131072      256      288     32      5
FOLIO  0x00120000  0x00140000      131072      288      320     32      5  Y
HOLE   0x00140000  0x00800000     7077888      320     2048   1728

Link: https://lkml.kernel.org/r/20250609092729.274960-5-ryan.roberts@arm.com
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Chaitanya S Prakash <chaitanyas.prakash@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/fs.h |  2 ++
 mm/filemap.c       |  6 ++++--
 mm/internal.h      |  3 +--
 mm/readahead.c     | 21 +++++++++++++--------
 4 files changed, 20 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index ef819b232d66..e14e9d11ca0f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1043,6 +1043,7 @@ struct fown_struct {
  *      and so were/are genuinely "ahead".  Start next readahead when
  *      the first of these pages is accessed.
  * @ra_pages: Maximum size of a readahead request, copied from the bdi.
+ * @order: Preferred folio order used for most recent readahead.
  * @mmap_miss: How many mmap accesses missed in the page cache.
  * @prev_pos: The last byte in the most recent read request.
  *
@@ -1054,6 +1055,7 @@ struct file_ra_state {
 	unsigned int size;
 	unsigned int async_size;
 	unsigned int ra_pages;
+	unsigned short order;
 	unsigned short mmap_miss;
 	loff_t prev_pos;
 };
diff --git a/mm/filemap.c b/mm/filemap.c
index 7bb4ffca8487..4b5c8d69f04c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3232,7 +3232,8 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
 		if (!(vm_flags & VM_RAND_READ))
 			ra->size *= 2;
 		ra->async_size = HPAGE_PMD_NR;
-		page_cache_ra_order(&ractl, ra, HPAGE_PMD_ORDER);
+		ra->order = HPAGE_PMD_ORDER;
+		page_cache_ra_order(&ractl, ra);
 		return fpin;
 	}
 #endif
@@ -3268,8 +3269,9 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
 	ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
 	ra->size = ra->ra_pages;
 	ra->async_size = ra->ra_pages / 4;
+	ra->order = 0;
 	ractl._index = ra->start;
-	page_cache_ra_order(&ractl, ra, 0);
+	page_cache_ra_order(&ractl, ra);
 	return fpin;
 }
 
diff --git a/mm/internal.h b/mm/internal.h
index 6b8ed2017743..f91688e2894f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -436,8 +436,7 @@ void zap_page_range_single_batched(struct mmu_gather *tlb,
 int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio,
 			   gfp_t gfp);
 
-void page_cache_ra_order(struct readahead_control *, struct file_ra_state *,
-		unsigned int order);
+void page_cache_ra_order(struct readahead_control *, struct file_ra_state *);
 void force_page_cache_ra(struct readahead_control *, unsigned long nr);
 static inline void force_page_cache_readahead(struct address_space *mapping,
 		struct file *file, pgoff_t index, unsigned long nr_to_read)
diff --git a/mm/readahead.c b/mm/readahead.c
index 87be20ae00d0..95a24f12d1e7 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -457,7 +457,7 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
 }
 
 void page_cache_ra_order(struct readahead_control *ractl,
-		struct file_ra_state *ra, unsigned int new_order)
+		struct file_ra_state *ra)
 {
 	struct address_space *mapping = ractl->mapping;
 	pgoff_t start = readahead_index(ractl);
@@ -468,9 +468,12 @@ void page_cache_ra_order(struct readahead_control *ractl,
 	unsigned int nofs;
 	int err = 0;
 	gfp_t gfp = readahead_gfp_mask(mapping);
+	unsigned int new_order = ra->order;
 
-	if (!mapping_large_folio_support(mapping))
+	if (!mapping_large_folio_support(mapping)) {
+		ra->order = 0;
 		goto fallback;
+	}
 
 	limit = min(limit, index + ra->size - 1);
 
@@ -478,6 +481,8 @@ void page_cache_ra_order(struct readahead_control *ractl,
 	new_order = min_t(unsigned int, new_order, ilog2(ra->size));
 	new_order = max(new_order, min_order);
 
+	ra->order = new_order;
+
 	/* See comment in page_cache_ra_unbounded() */
 	nofs = memalloc_nofs_save();
 	filemap_invalidate_lock_shared(mapping);
@@ -609,8 +614,9 @@ void page_cache_sync_ra(struct readahead_control *ractl,
 	ra->size = min(contig_count + req_count, max_pages);
 	ra->async_size = 1;
 readit:
+	ra->order = 0;
 	ractl->_index = ra->start;
-	page_cache_ra_order(ractl, ra, 0);
+	page_cache_ra_order(ractl, ra);
 }
 EXPORT_SYMBOL_GPL(page_cache_sync_ra);
 
@@ -621,7 +627,6 @@ void page_cache_async_ra(struct readahead_control *ractl,
 	struct file_ra_state *ra = ractl->ra;
 	pgoff_t index = readahead_index(ractl);
 	pgoff_t expected, start, end, aligned_end, align;
-	unsigned int order = folio_order(folio);
 
 	/* no readahead */
 	if (!ra->ra_pages)
@@ -644,7 +649,7 @@ void page_cache_async_ra(struct readahead_control *ractl,
 	 * Ramp up sizes, and push forward the readahead window.
 	 */
 	expected = round_down(ra->start + ra->size - ra->async_size,
-			1UL << order);
+			1UL << folio_order(folio));
 	if (index == expected) {
 		ra->start += ra->size;
 		/*
@@ -673,15 +678,15 @@ void page_cache_async_ra(struct readahead_control *ractl,
 	ra->size += req_count;
 	ra->size = get_next_ra_size(ra, max_pages);
 readit:
-	order += 2;
-	align = 1UL << min(order, ffs(max_pages) - 1);
+	ra->order += 2;
+	align = 1UL << min(ra->order, ffs(max_pages) - 1);
 	end = ra->start + ra->size;
 	aligned_end = round_down(end, align);
 	if (aligned_end > ra->start)
 		ra->size -= end - aligned_end;
 	ra->async_size = ra->size;
 	ractl->_index = ra->start;
-	page_cache_ra_order(ractl, ra, order);
+	page_cache_ra_order(ractl, ra);
 }
 EXPORT_SYMBOL_GPL(page_cache_async_ra);
 
-- 
cgit v1.2.3


From 38b0ece6d76374b989928021b5d310be11b99b5c Mon Sep 17 00:00:00 2001
From: Ryan Roberts <ryan.roberts@arm.com>
Date: Mon, 9 Jun 2025 10:27:27 +0100
Subject: mm/filemap: allow arch to request folio size for exec memory

Change the readahead config so that if it is being requested for an
executable mapping, do a synchronous read into a set of folios with an
arch-specified order and in a naturally aligned manner.  We no longer
center the read on the faulting page but simply align it down to the
previous natural boundary.  Additionally, we don't bother with an
asynchronous part.

On arm64 if memory is physically contiguous and naturally aligned to the
"contpte" size, we can use contpte mappings, which improves utilization of
the TLB.  When paired with the "multi-size THP" feature, this works well
to reduce dTLB pressure.  However iTLB pressure is still high due to
executable mappings having a low likelihood of being in the required folio
size and mapping alignment, even when the filesystem supports readahead
into large folios (e.g.  XFS).

The reason for the low likelihood is that the current readahead algorithm
starts with an order-0 folio and increases the folio order by 2 every time
the readahead mark is hit.  But most executable memory tends to be
accessed randomly and so the readahead mark is rarely hit and most
executable folios remain order-0.

So let's special-case the read(ahead) logic for executable mappings.  The
trade-off is performance improvement (due to more efficient storage of the
translations in iTLB) vs potential for making reclaim more difficult (due
to the folios being larger so if a part of the folio is hot the whole
thing is considered hot).  But executable memory is a small portion of the
overall system memory so I doubt this will even register from a reclaim
perspective.

I've chosen 64K folio size for arm64 which benefits both the 4K and 16K
base page size configs.  Crucially the same amount of data is still read
(usually 128K) so I'm not expecting any read amplification issues.  I
don't anticipate any write amplification because text is always RO.

Note that the text region of an ELF file could be populated into the page
cache for other reasons than taking a fault in a mmapped area.  The most
common case is due to the loader read()ing the header which can be shared
with the beginning of text.  So some text will still remain in small
folios, but this simple, best effort change provides good performance
improvements as is.

Confine this special-case approach to the bounds of the VMA.  This
prevents wasting memory for any padding that might exist in the file
between sections.  Previously the padding would have been contained in
order-0 folios and would be easy to reclaim.  But now it would be part of
a larger folio so more difficult to reclaim.  Solve this by simply not
reading it into memory in the first place.

Benchmarking
============

The below shows pgbench and redis benchmarks on Graviton3 arm64 system.

First, confirmation that this patch causes more text to be contained in
64K folios:

+----------------------+---------------+---------------+---------------+
| File-backed folios by|  system boot  |    pgbench    |     redis     |
| size as percentage of+-------+-------+-------+-------+-------+-------+
| all mapped text mem  |before | after |before | after |before | after |
+======================+=======+=======+=======+=======+=======+=======+
| base-page-4kB        |   78% |   30% |   78% |   11% |   73% |   14% |
| thp-aligned-8kB      |    1% |    0% |    0% |    0% |    1% |    0% |
| thp-aligned-16kB     |   17% |    4% |   17% |    3% |   20% |    4% |
| thp-aligned-32kB     |    1% |    1% |    1% |    2% |    1% |    1% |
| thp-aligned-64kB     |    3% |   63% |    3% |   81% |    4% |   77% |
| thp-aligned-128kB    |    0% |    1% |    1% |    1% |    1% |    2% |
| thp-unaligned-64kB   |    0% |    0% |    0% |    1% |    0% |    1% |
| thp-unaligned-128kB  |    0% |    1% |    0% |    0% |    0% |    0% |
| thp-partial          |    0% |    0% |    0% |    1% |    0% |    1% |
+----------------------+-------+-------+-------+-------+-------+-------+
| cont-aligned-64kB    |    4% |   65% |    4% |   83% |    6% |   79% |
+----------------------+-------+-------+-------+-------+-------+-------+

The above shows that for both workloads (each isolated with cgroups) as
well as the general system state after boot, the amount of text backed by
4K and 16K folios reduces and the amount backed by 64K folios increases
significantly.  And the amount of text that is contpte-mapped
significantly increases (see last row).

And this is reflected in performance improvement.  "(I)" indicates a
statistically significant improvement.  Note TPS and Reqs/sec are rates so
bigger is better, ms is time so smaller is better:

+-------------+-------------------------------------------+------------+
| Benchmark   | Result Class                              | Improvemnt |
+=============+===========================================+============+
| pts/pgbench | Scale: 1 Clients: 1 RO (TPS)              |  (I) 3.47% |
|             | Scale: 1 Clients: 1 RO - Latency (ms)     |     -2.88% |
|             | Scale: 1 Clients: 250 RO (TPS)            |  (I) 5.02% |
|             | Scale: 1 Clients: 250 RO - Latency (ms)   | (I) -4.79% |
|             | Scale: 1 Clients: 1000 RO (TPS)           |  (I) 6.16% |
|             | Scale: 1 Clients: 1000 RO - Latency (ms)  | (I) -5.82% |
|             | Scale: 100 Clients: 1 RO (TPS)            |      2.51% |
|             | Scale: 100 Clients: 1 RO - Latency (ms)   |     -3.51% |
|             | Scale: 100 Clients: 250 RO (TPS)          |  (I) 4.75% |
|             | Scale: 100 Clients: 250 RO - Latency (ms) | (I) -4.44% |
|             | Scale: 100 Clients: 1000 RO (TPS)         |  (I) 6.34% |
|             | Scale: 100 Clients: 1000 RO - Latency (ms)| (I) -5.95% |
+-------------+-------------------------------------------+------------+
| pts/redis   | Test: GET Connections: 50 (Reqs/sec)      |  (I) 3.20% |
|             | Test: GET Connections: 1000 (Reqs/sec)    |  (I) 2.55% |
|             | Test: LPOP Connections: 50 (Reqs/sec)     |  (I) 4.59% |
|             | Test: LPOP Connections: 1000 (Reqs/sec)   |  (I) 4.81% |
|             | Test: LPUSH Connections: 50 (Reqs/sec)    |  (I) 5.31% |
|             | Test: LPUSH Connections: 1000 (Reqs/sec)  |  (I) 4.36% |
|             | Test: SADD Connections: 50 (Reqs/sec)     |  (I) 2.64% |
|             | Test: SADD Connections: 1000 (Reqs/sec)   |  (I) 4.15% |
|             | Test: SET Connections: 50 (Reqs/sec)      |  (I) 3.11% |
|             | Test: SET Connections: 1000 (Reqs/sec)    |  (I) 3.36% |
+-------------+-------------------------------------------+------------+

[ryan.roberts@arm.com: fix use-after-free]
  Link: https://lkml.kernel.org/r/ea7f9da7-9a9f-4b85-9d0a-35b320f5ed25@arm.com
[ryan.roberts@arm.com: use the vma_pages() helper instead of open-coding]
  Link: https://lkml.kernel.org/r/0e0f674b-3b7e-494f-ae7a-fc9dbb98dad4@arm.com
Link: https://lkml.kernel.org/r/20250609092729.274960-6-ryan.roberts@arm.com
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Acked-by: Will Deacon <will@kernel.org>
Cc: Chaitanya S Prakash <chaitanyas.prakash@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm64/include/asm/pgtable.h |  8 +++++++
 include/linux/pgtable.h          | 11 +++++++++
 mm/filemap.c                     | 48 ++++++++++++++++++++++++++++++++--------
 3 files changed, 58 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 192d86e1cc76..e511f909f63c 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1643,6 +1643,14 @@ static inline void update_mmu_cache_range(struct vm_fault *vmf,
  */
 #define arch_wants_old_prefaulted_pte	cpu_has_hw_af
 
+/*
+ * Request exec memory is read into pagecache in at least 64K folios. This size
+ * can be contpte-mapped when 4K base pages are in use (16 pages into 1 iTLB
+ * entry), and HPA can coalesce it (4 pages into 1 TLB entry) when 16K base
+ * pages are in use.
+ */
+#define exec_folio_order() ilog2(SZ_64K >> PAGE_SHIFT)
+
 static inline bool pud_sect_supported(void)
 {
 	return PAGE_SIZE == SZ_4K;
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 0b6e1f781d86..e4a3895c043b 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -456,6 +456,17 @@ static inline bool arch_has_hw_pte_young(void)
 }
 #endif
 
+#ifndef exec_folio_order
+/*
+ * Returns preferred minimum folio order for executable file-backed memory. Must
+ * be in range [0, PMD_ORDER). Default to order-0.
+ */
+static inline unsigned int exec_folio_order(void)
+{
+	return 0;
+}
+#endif
+
 #ifndef arch_check_zapped_pte
 static inline void arch_check_zapped_pte(struct vm_area_struct *vma,
 					 pte_t pte)
diff --git a/mm/filemap.c b/mm/filemap.c
index 4b5c8d69f04c..3cf955740148 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3238,8 +3238,11 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
 	}
 #endif
 
-	/* If we don't want any read-ahead, don't bother */
-	if (vm_flags & VM_RAND_READ)
+	/*
+	 * If we don't want any read-ahead, don't bother. VM_EXEC case below is
+	 * already intended for random access.
+	 */
+	if ((vm_flags & (VM_RAND_READ | VM_EXEC)) == VM_RAND_READ)
 		return fpin;
 	if (!ra->ra_pages)
 		return fpin;
@@ -3262,14 +3265,41 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
 	if (mmap_miss > MMAP_LOTSAMISS)
 		return fpin;
 
-	/*
-	 * mmap read-around
-	 */
+	if (vm_flags & VM_EXEC) {
+		/*
+		 * Allow arch to request a preferred minimum folio order for
+		 * executable memory. This can often be beneficial to
+		 * performance if (e.g.) arm64 can contpte-map the folio.
+		 * Executable memory rarely benefits from readahead, due to its
+		 * random access nature, so set async_size to 0.
+		 *
+		 * Limit to the boundaries of the VMA to avoid reading in any
+		 * pad that might exist between sections, which would be a waste
+		 * of memory.
+		 */
+		struct vm_area_struct *vma = vmf->vma;
+		unsigned long start = vma->vm_pgoff;
+		unsigned long end = start + vma_pages(vma);
+		unsigned long ra_end;
+
+		ra->order = exec_folio_order();
+		ra->start = round_down(vmf->pgoff, 1UL << ra->order);
+		ra->start = max(ra->start, start);
+		ra_end = round_up(ra->start + ra->ra_pages, 1UL << ra->order);
+		ra_end = min(ra_end, end);
+		ra->size = ra_end - ra->start;
+		ra->async_size = 0;
+	} else {
+		/*
+		 * mmap read-around
+		 */
+		ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
+		ra->size = ra->ra_pages;
+		ra->async_size = ra->ra_pages / 4;
+		ra->order = 0;
+	}
+
 	fpin = maybe_unlock_mmap_for_io(vmf, fpin);
-	ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
-	ra->size = ra->ra_pages;
-	ra->async_size = ra->ra_pages / 4;
-	ra->order = 0;
 	ractl._index = ra->start;
 	page_cache_ra_order(&ractl, ra);
 	return fpin;
-- 
cgit v1.2.3


From 7e43195c609f0499e46c6bfa9472e39c76af445b Mon Sep 17 00:00:00 2001
From: Casey Chen <cachen@purestorage.com>
Date: Tue, 10 Jun 2025 10:22:58 -0600
Subject: alloc_tag: remove empty module tag section

The empty MOD_CODETAG_SECTIONS() macro added an incomplete .data section
in module linker script, which caused symbol lookup tools like gdb to
misinterpret symbol addresses e.g., __ib_process_cq incorrectly mapping to
unrelated functions like below.

  (gdb) disas __ib_process_cq
  Dump of assembler code for function trace_event_fields_cq_schedule:

Removing the empty section restores proper symbol resolution and layout,
ensuring .data placement behaves as expected.

Link: https://lkml.kernel.org/r/20250610162258.324645-1-cachen@purestorage.com
Fixes: 0db6f8d7820a ("alloc_tag: load module tags into separate contiguous memory")
       22d407b164ff ("lib: add allocation tagging support for memory allocation profiling")
Signed-off-by: Casey Chen <cachen@purestorage.com>
Reviewed-by: Yuanyuan Zhong <yzhong@purestorage.com>
Acked-by: Suren Baghdasaryan <surenb@google.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Luis Chamberalin <mcgrof@kernel.org>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/asm-generic/codetag.lds.h | 6 ------
 scripts/module.lds.S              | 5 -----
 2 files changed, 11 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/codetag.lds.h b/include/asm-generic/codetag.lds.h
index 372c320c5043..a45fe3d141a1 100644
--- a/include/asm-generic/codetag.lds.h
+++ b/include/asm-generic/codetag.lds.h
@@ -11,12 +11,6 @@
 #define CODETAG_SECTIONS()		\
 	SECTION_WITH_BOUNDARIES(alloc_tags)
 
-/*
- * Module codetags which aren't used after module unload, therefore have the
- * same lifespan as the module and can be safely unloaded with the module.
- */
-#define MOD_CODETAG_SECTIONS()
-
 #define MOD_SEPARATE_CODETAG_SECTION(_name)	\
 	.codetag.##_name : {			\
 		SECTION_WITH_BOUNDARIES(_name)	\
diff --git a/scripts/module.lds.S b/scripts/module.lds.S
index 450f1088d5fd..ee79c41059f3 100644
--- a/scripts/module.lds.S
+++ b/scripts/module.lds.S
@@ -52,17 +52,12 @@ SECTIONS {
 	.data : {
 		*(.data .data.[0-9a-zA-Z_]*)
 		*(.data..L*)
-		MOD_CODETAG_SECTIONS()
 	}
 
 	.rodata : {
 		*(.rodata .rodata.[0-9a-zA-Z_]*)
 		*(.rodata..L*)
 	}
-#else
-	.data : {
-		MOD_CODETAG_SECTIONS()
-	}
 #endif
 	MOD_SEPARATE_CODETAG_SECTIONS()
 }
-- 
cgit v1.2.3


From 96d81e4766f9e88b66a0502b5a7f34a4c20ac754 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Thu, 5 Jun 2025 14:51:04 +0100
Subject: mm/pagewalk: split walk_page_range_novma() into kernel/user parts

walk_page_range_novma() is rather confusing - it supports two modes, one
used often, the other used only for debugging.

The first mode is the common case of traversal of kernel page tables,
which is what nearly all callers use this for.

Secondly it provides an unusual debugging interface that allows for the
traversal of page tables in a userland range of memory even for that
memory which is not described by a VMA.

It is far from certain that such page tables should even exist, but
perhaps this is precisely why it is useful as a debugging mechanism.

As a result, this is utilised by ptdump only.  Historically, things were
reversed - ptdump was the only user, and other parts of the kernel evolved
to use the kernel page table walking here.

Since we have some complicated and confusing locking rules for the novma
case, it makes sense to separate the two usages into their own functions.

Doing this also provides self-documentation as to the intent of the caller
- are they doing something rather unusual or are they simply doing a
standard kernel page table walk?

We therefore establish two separate functions - walk_page_range_debug()
for this single usage, and walk_kernel_page_table_range() for general
kernel page table walking.

The walk_page_range_debug() function is currently used to traverse both
userland and kernel mappings, so we maintain this and in the case of
kernel mappings being traversed, we have walk_page_range_debug() invoke
walk_kernel_page_table_range() internally.

We additionally make walk_page_range_debug() internal to mm.

Link: https://lkml.kernel.org/r/20250605135104.90720-1-lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Acked-by: Qi Zheng <zhengqi.arch@bytedance.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Barry Song <baohua@kernel.org>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Jann Horn <jannh@google.com>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Stafford Horne <shorne@gmail.com>
Cc: Stefan Kristiansson <stefan.kristiansson@saunalahti.fi>
Cc: WANG Xuerui <kernel@xen0n.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/loongarch/mm/pageattr.c |  2 +-
 arch/openrisc/kernel/dma.c   |  4 +--
 arch/riscv/mm/pageattr.c     |  8 ++---
 include/linux/pagewalk.h     |  7 ++--
 mm/hugetlb_vmemmap.c         |  2 +-
 mm/internal.h                |  3 ++
 mm/pagewalk.c                | 77 +++++++++++++++++++++++++++++++-------------
 mm/ptdump.c                  |  3 +-
 8 files changed, 71 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/arch/loongarch/mm/pageattr.c b/arch/loongarch/mm/pageattr.c
index 99165903908a..f5e910b68229 100644
--- a/arch/loongarch/mm/pageattr.c
+++ b/arch/loongarch/mm/pageattr.c
@@ -118,7 +118,7 @@ static int __set_memory(unsigned long addr, int numpages, pgprot_t set_mask, pgp
 		return 0;
 
 	mmap_write_lock(&init_mm);
-	ret = walk_page_range_novma(&init_mm, start, end, &pageattr_ops, NULL, &masks);
+	ret = walk_kernel_page_table_range(start, end, &pageattr_ops, NULL, &masks);
 	mmap_write_unlock(&init_mm);
 
 	flush_tlb_kernel_range(start, end);
diff --git a/arch/openrisc/kernel/dma.c b/arch/openrisc/kernel/dma.c
index 3a7b5baaa450..af932a4ad306 100644
--- a/arch/openrisc/kernel/dma.c
+++ b/arch/openrisc/kernel/dma.c
@@ -72,7 +72,7 @@ void *arch_dma_set_uncached(void *cpu_addr, size_t size)
 	 * them and setting the cache-inhibit bit.
 	 */
 	mmap_write_lock(&init_mm);
-	error = walk_page_range_novma(&init_mm, va, va + size,
+	error = walk_kernel_page_table_range(va, va + size,
 			&set_nocache_walk_ops, NULL, NULL);
 	mmap_write_unlock(&init_mm);
 
@@ -87,7 +87,7 @@ void arch_dma_clear_uncached(void *cpu_addr, size_t size)
 
 	mmap_write_lock(&init_mm);
 	/* walk_page_range shouldn't be able to fail here */
-	WARN_ON(walk_page_range_novma(&init_mm, va, va + size,
+	WARN_ON(walk_kernel_page_table_range(va, va + size,
 			&clear_nocache_walk_ops, NULL, NULL));
 	mmap_write_unlock(&init_mm);
 }
diff --git a/arch/riscv/mm/pageattr.c b/arch/riscv/mm/pageattr.c
index d815448758a1..3f76db3d2769 100644
--- a/arch/riscv/mm/pageattr.c
+++ b/arch/riscv/mm/pageattr.c
@@ -299,7 +299,7 @@ static int __set_memory(unsigned long addr, int numpages, pgprot_t set_mask,
 			if (ret)
 				goto unlock;
 
-			ret = walk_page_range_novma(&init_mm, lm_start, lm_end,
+			ret = walk_kernel_page_table_range(lm_start, lm_end,
 						    &pageattr_ops, NULL, &masks);
 			if (ret)
 				goto unlock;
@@ -317,13 +317,13 @@ static int __set_memory(unsigned long addr, int numpages, pgprot_t set_mask,
 		if (ret)
 			goto unlock;
 
-		ret = walk_page_range_novma(&init_mm, lm_start, lm_end,
+		ret = walk_kernel_page_table_range(lm_start, lm_end,
 					    &pageattr_ops, NULL, &masks);
 		if (ret)
 			goto unlock;
 	}
 
-	ret =  walk_page_range_novma(&init_mm, start, end, &pageattr_ops, NULL,
+	ret =  walk_kernel_page_table_range(start, end, &pageattr_ops, NULL,
 				     &masks);
 
 unlock:
@@ -335,7 +335,7 @@ unlock:
 	 */
 	flush_tlb_all();
 #else
-	ret =  walk_page_range_novma(&init_mm, start, end, &pageattr_ops, NULL,
+	ret =  walk_kernel_page_table_range(start, end, &pageattr_ops, NULL,
 				     &masks);
 
 	mmap_write_unlock(&init_mm);
diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index 9700a29f8afb..8ac2f6d6d2a3 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -129,10 +129,9 @@ struct mm_walk {
 int walk_page_range(struct mm_struct *mm, unsigned long start,
 		unsigned long end, const struct mm_walk_ops *ops,
 		void *private);
-int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
-			  unsigned long end, const struct mm_walk_ops *ops,
-			  pgd_t *pgd,
-			  void *private);
+int walk_kernel_page_table_range(unsigned long start,
+		unsigned long end, const struct mm_walk_ops *ops,
+		pgd_t *pgd, void *private);
 int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
 			unsigned long end, const struct mm_walk_ops *ops,
 			void *private);
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 27245e86df25..ba0fb1b6a5a8 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -166,7 +166,7 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end,
 	VM_BUG_ON(!PAGE_ALIGNED(start | end));
 
 	mmap_read_lock(&init_mm);
-	ret = walk_page_range_novma(&init_mm, start, end, &vmemmap_remap_ops,
+	ret = walk_kernel_page_table_range(start, end, &vmemmap_remap_ops,
 				    NULL, walk);
 	mmap_read_unlock(&init_mm);
 	if (ret)
diff --git a/mm/internal.h b/mm/internal.h
index f91688e2894f..2c0d9f197d81 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1604,6 +1604,9 @@ static inline void accept_page(struct page *page)
 int walk_page_range_mm(struct mm_struct *mm, unsigned long start,
 		unsigned long end, const struct mm_walk_ops *ops,
 		void *private);
+int walk_page_range_debug(struct mm_struct *mm, unsigned long start,
+			  unsigned long end, const struct mm_walk_ops *ops,
+			  pgd_t *pgd, void *private);
 
 /* pt_reclaim.c */
 bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval);
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index e478777c86e1..ff5299eca687 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -585,8 +585,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
 }
 
 /**
- * walk_page_range_novma - walk a range of pagetables not backed by a vma
- * @mm:		mm_struct representing the target process of page table walk
+ * walk_kernel_page_table_range - walk a range of kernel pagetables.
  * @start:	start address of the virtual address range
  * @end:	end address of the virtual address range
  * @ops:	operation to call during the walk
@@ -596,17 +595,61 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
  * Similar to walk_page_range() but can walk any page tables even if they are
  * not backed by VMAs. Because 'unusual' entries may be walked this function
  * will also not lock the PTEs for the pte_entry() callback. This is useful for
- * walking the kernel pages tables or page tables for firmware.
+ * walking kernel pages tables or page tables for firmware.
  *
  * Note: Be careful to walk the kernel pages tables, the caller may be need to
  * take other effective approaches (mmap lock may be insufficient) to prevent
  * the intermediate kernel page tables belonging to the specified address range
  * from being freed (e.g. memory hot-remove).
  */
-int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
+int walk_kernel_page_table_range(unsigned long start, unsigned long end,
+		const struct mm_walk_ops *ops, pgd_t *pgd, void *private)
+{
+	struct mm_struct *mm = &init_mm;
+	struct mm_walk walk = {
+		.ops		= ops,
+		.mm		= mm,
+		.pgd		= pgd,
+		.private	= private,
+		.no_vma		= true
+	};
+
+	if (start >= end)
+		return -EINVAL;
+	if (!check_ops_valid(ops))
+		return -EINVAL;
+
+	/*
+	 * Kernel intermediate page tables are usually not freed, so the mmap
+	 * read lock is sufficient. But there are some exceptions.
+	 * E.g. memory hot-remove. In which case, the mmap lock is insufficient
+	 * to prevent the intermediate kernel pages tables belonging to the
+	 * specified address range from being freed. The caller should take
+	 * other actions to prevent this race.
+	 */
+	mmap_assert_locked(mm);
+
+	return walk_pgd_range(start, end, &walk);
+}
+
+/**
+ * walk_page_range_debug - walk a range of pagetables not backed by a vma
+ * @mm:		mm_struct representing the target process of page table walk
+ * @start:	start address of the virtual address range
+ * @end:	end address of the virtual address range
+ * @ops:	operation to call during the walk
+ * @pgd:	pgd to walk if different from mm->pgd
+ * @private:	private data for callbacks' usage
+ *
+ * Similar to walk_page_range() but can walk any page tables even if they are
+ * not backed by VMAs. Because 'unusual' entries may be walked this function
+ * will also not lock the PTEs for the pte_entry() callback.
+ *
+ * This is for debugging purposes ONLY.
+ */
+int walk_page_range_debug(struct mm_struct *mm, unsigned long start,
 			  unsigned long end, const struct mm_walk_ops *ops,
-			  pgd_t *pgd,
-			  void *private)
+			  pgd_t *pgd, void *private)
 {
 	struct mm_walk walk = {
 		.ops		= ops,
@@ -616,34 +659,24 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
 		.no_vma		= true
 	};
 
+	/* For convenience, we allow traversal of kernel mappings. */
+	if (mm == &init_mm)
+		return walk_kernel_page_table_range(start, end, ops,
+						    pgd, private);
 	if (start >= end || !walk.mm)
 		return -EINVAL;
 	if (!check_ops_valid(ops))
 		return -EINVAL;
 
 	/*
-	 * 1) For walking the user virtual address space:
-	 *
 	 * The mmap lock protects the page walker from changes to the page
 	 * tables during the walk.  However a read lock is insufficient to
 	 * protect those areas which don't have a VMA as munmap() detaches
 	 * the VMAs before downgrading to a read lock and actually tearing
 	 * down PTEs/page tables. In which case, the mmap write lock should
-	 * be hold.
-	 *
-	 * 2) For walking the kernel virtual address space:
-	 *
-	 * The kernel intermediate page tables usually do not be freed, so
-	 * the mmap map read lock is sufficient. But there are some exceptions.
-	 * E.g. memory hot-remove. In which case, the mmap lock is insufficient
-	 * to prevent the intermediate kernel pages tables belonging to the
-	 * specified address range from being freed. The caller should take
-	 * other actions to prevent this race.
+	 * be held.
 	 */
-	if (mm == &init_mm)
-		mmap_assert_locked(walk.mm);
-	else
-		mmap_assert_write_locked(walk.mm);
+	mmap_assert_write_locked(mm);
 
 	return walk_pgd_range(start, end, &walk);
 }
diff --git a/mm/ptdump.c b/mm/ptdump.c
index 9374f29cdc6f..61a352aa12ed 100644
--- a/mm/ptdump.c
+++ b/mm/ptdump.c
@@ -4,6 +4,7 @@
 #include <linux/debugfs.h>
 #include <linux/ptdump.h>
 #include <linux/kasan.h>
+#include "internal.h"
 
 #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
 /*
@@ -177,7 +178,7 @@ void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd)
 
 	mmap_write_lock(mm);
 	while (range->start != range->end) {
-		walk_page_range_novma(mm, range->start, range->end,
+		walk_page_range_debug(mm, range->start, range->end,
 				      &ptdump_ops, pgd, st);
 		range++;
 	}
-- 
cgit v1.2.3


From b0da7709c28c35e0a51d4b1b350c9028358dfb14 Mon Sep 17 00:00:00 2001
From: David Wang <00107082@163.com>
Date: Mon, 9 Jun 2025 14:42:00 +0800
Subject: alloc_tag: add sequence number for module and iterator

The codetag iterator uses an <id,address> pair to guarantee its validity.
But both the id and the address can be reused: there is a theoretical
possibility that, when a module is inserted right after another module is
removed, kmalloc returns the same address that was kfree'd for the previous
module and the IDR reuses the key that was recently removed.

Add a sequence number to codetag_module and codetag_iterator; the sequence
number is strictly incremented whenever a module is loaded.  An iterator
is valid if and only if its sequence number matches codetag_module's.

Link: https://lkml.kernel.org/r/20250609064200.112639-1-00107082@163.com
Signed-off-by: David Wang <00107082@163.com>
Acked-by: Suren Baghdasaryan <surenb@google.com>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/codetag.h |  1 +
 lib/codetag.c           | 17 ++++++++++++++---
 2 files changed, 15 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/codetag.h b/include/linux/codetag.h
index 5f2b9a1f722c..457ed8fd3214 100644
--- a/include/linux/codetag.h
+++ b/include/linux/codetag.h
@@ -54,6 +54,7 @@ struct codetag_iterator {
 	struct codetag_module *cmod;
 	unsigned long mod_id;
 	struct codetag *ct;
+	unsigned long mod_seq;
 };
 
 #ifdef MODULE
diff --git a/lib/codetag.c b/lib/codetag.c
index 650d54d7e14d..545911cebd25 100644
--- a/lib/codetag.c
+++ b/lib/codetag.c
@@ -11,8 +11,14 @@ struct codetag_type {
 	struct list_head link;
 	unsigned int count;
 	struct idr mod_idr;
-	struct rw_semaphore mod_lock; /* protects mod_idr */
+	/*
+	 * protects mod_idr, next_mod_seq,
+	 * iter->mod_seq and cmod->mod_seq
+	 */
+	struct rw_semaphore mod_lock;
 	struct codetag_type_desc desc;
+	/* generates unique sequence number for module load */
+	unsigned long next_mod_seq;
 };
 
 struct codetag_range {
@@ -23,6 +29,7 @@ struct codetag_range {
 struct codetag_module {
 	struct module *mod;
 	struct codetag_range range;
+	unsigned long mod_seq;
 };
 
 static DEFINE_MUTEX(codetag_lock);
@@ -48,6 +55,7 @@ struct codetag_iterator codetag_get_ct_iter(struct codetag_type *cttype)
 		.cmod = NULL,
 		.mod_id = 0,
 		.ct = NULL,
+		.mod_seq = 0,
 	};
 
 	return iter;
@@ -91,11 +99,13 @@ struct codetag *codetag_next_ct(struct codetag_iterator *iter)
 		if (!cmod)
 			break;
 
-		if (cmod != iter->cmod) {
+		if (!iter->cmod || iter->mod_seq != cmod->mod_seq) {
 			iter->cmod = cmod;
+			iter->mod_seq = cmod->mod_seq;
 			ct = get_first_module_ct(cmod);
-		} else
+		} else {
 			ct = get_next_module_ct(iter);
+		}
 
 		if (ct)
 			break;
@@ -191,6 +201,7 @@ static int codetag_module_init(struct codetag_type *cttype, struct module *mod)
 	cmod->range = range;
 
 	down_write(&cttype->mod_lock);
+	cmod->mod_seq = ++cttype->next_mod_seq;
 	mod_id = idr_alloc(&cttype->mod_idr, cmod, 0, 0, GFP_KERNEL);
 	if (mod_id >= 0) {
 		if (cttype->desc.module_load) {
-- 
cgit v1.2.3


From cce35103135c7ffc7bebc32ebfc74fe1f2c3cb5d Mon Sep 17 00:00:00 2001
From: Li Zhijian <lizhijian@fujitsu.com>
Date: Tue, 10 Jun 2025 14:27:51 +0800
Subject: mm/memory-tier: fix abstract distance calculation overflow

In mt_perf_to_adistance(), the calculation of abstract distance (adist)
involves multiplying several int values including
MEMTIER_ADISTANCE_DRAM.

*adist = MEMTIER_ADISTANCE_DRAM *
		(perf->read_latency + perf->write_latency) /
		(default_dram_perf.read_latency + default_dram_perf.write_latency) *
		(default_dram_perf.read_bandwidth + default_dram_perf.write_bandwidth) /
		(perf->read_bandwidth + perf->write_bandwidth);

Since these values can be large, the multiplication may exceed the
maximum value of an int (INT_MAX) and overflow (Our platform did),
leading to an incorrect adist.

User-visible impact:
The memory tiering subsystem will misinterpret slow memory (like CXL)
as faster than DRAM, causing inappropriate demotion of pages from
CXL (slow memory) to DRAM (fast memory).

For example, we will see the following demotion chains from the dmesg, where
Node0,1 are DRAM, and Node2,3 are CXL node:
 Demotion targets for Node 0: null
 Demotion targets for Node 1: null
 Demotion targets for Node 2: preferred: 0-1, fallback: 0-1
 Demotion targets for Node 3: preferred: 0-1, fallback: 0-1

Change MEMTIER_ADISTANCE_DRAM to be a long constant by writing it with
the 'L' suffix.  This prevents the overflow because the multiplication
will then be done in the long type which has a larger range.

Link: https://lkml.kernel.org/r/20250611023439.2845785-1-lizhijian@fujitsu.com
Link: https://lkml.kernel.org/r/20250610062751.2365436-1-lizhijian@fujitsu.com
Fixes: 3718c02dbd4c ("acpi, hmat: calculate abstract distance with HMAT")
Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
Reviewed-by: Huang Ying <ying.huang@linux.alibaba.com>
Acked-by: Balbir Singh <balbirs@nvidia.com>
Reviewed-by: Donet Tom <donettom@linux.ibm.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memory-tiers.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index 0dc0cf2863e2..7a805796fcfd 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -18,7 +18,7 @@
  * adistance value (slightly faster) than default DRAM adistance to be part of
  * the same memory tier.
  */
-#define MEMTIER_ADISTANCE_DRAM	((4 * MEMTIER_CHUNK_SIZE) + (MEMTIER_CHUNK_SIZE >> 1))
+#define MEMTIER_ADISTANCE_DRAM	((4L * MEMTIER_CHUNK_SIZE) + (MEMTIER_CHUNK_SIZE >> 1))
 
 struct memory_tier;
 struct memory_dev_type {
-- 
cgit v1.2.3


From 4f8ba33bbdfc475fdc1ac5de6a93f5de93203ed5 Mon Sep 17 00:00:00 2001
From: Barry Song <v-songbaohua@oppo.com>
Date: Wed, 11 Jun 2025 22:47:45 +1200
Subject: mm: madvise: use per_vma lock for MADV_FREE
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

MADV_FREE is another option, besides MADV_DONTNEED, for dynamic memory
freeing in user-space native or Java heap memory management.  For example,
jemalloc can be configured to use MADV_FREE, and recent versions of the
Android Java heap have also increasingly adopted MADV_FREE.  Supporting
per-VMA locking for MADV_FREE thus appears increasingly necessary.

We have replaced walk_page_range() with walk_page_range_vma().  Along with
the proposed madvise_lock_mode by Lorenzo, the necessary infrastructure is
now in place to begin exploring per-VMA locking support for MADV_FREE and
potentially other madvise using walk_page_range_vma().

This patch adds support for the PGWALK_VMA_RDLOCK_VERIFY walk_lock mode in
walk_page_range_vma(), and leverages madvise_lock_mode from madv_behavior
to select the appropriate walk_lock—either mmap_lock or per-VMA
lock—based on the context.

Because we now dynamically update the walk_ops->walk_lock field, we must
ensure this is thread-safe.  The madvise_free_walk_ops is now defined as a
stack variable instead of a global constant.

Link: https://lkml.kernel.org/r/20250611104745.57405-1-21cnbao@gmail.com
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: SeongJae Park <sj@kernel.org>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Lokesh Gidra <lokeshgidra@google.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Tangquan Zheng <zhengtangquan@oppo.com>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/pagewalk.h |  2 ++
 mm/madvise.c             | 25 +++++++++++++++++++------
 mm/pagewalk.c            |  5 ++++-
 3 files changed, 25 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index 8ac2f6d6d2a3..682472c15495 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -14,6 +14,8 @@ enum page_walk_lock {
 	PGWALK_WRLOCK = 1,
 	/* vma is expected to be already write-locked during the walk */
 	PGWALK_WRLOCK_VERIFY = 2,
+	/* vma is expected to be already read-locked during the walk */
+	PGWALK_VMA_RDLOCK_VERIFY = 3,
 };
 
 /**
diff --git a/mm/madvise.c b/mm/madvise.c
index 7d78d4b5fb18..d451438af999 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -777,10 +777,19 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 	return 0;
 }
 
-static const struct mm_walk_ops madvise_free_walk_ops = {
-	.pmd_entry		= madvise_free_pte_range,
-	.walk_lock		= PGWALK_RDLOCK,
-};
+static inline enum page_walk_lock get_walk_lock(enum madvise_lock_mode mode)
+{
+	switch (mode) {
+	case MADVISE_VMA_READ_LOCK:
+		return PGWALK_VMA_RDLOCK_VERIFY;
+	case MADVISE_MMAP_READ_LOCK:
+		return PGWALK_RDLOCK;
+	default:
+		/* Other modes don't require fixing up the walk_lock */
+		WARN_ON_ONCE(1);
+		return PGWALK_RDLOCK;
+	}
+}
 
 static int madvise_free_single_vma(struct madvise_behavior *madv_behavior,
 			struct vm_area_struct *vma,
@@ -789,6 +798,9 @@ static int madvise_free_single_vma(struct madvise_behavior *madv_behavior,
 	struct mm_struct *mm = vma->vm_mm;
 	struct mmu_notifier_range range;
 	struct mmu_gather *tlb = madv_behavior->tlb;
+	struct mm_walk_ops walk_ops = {
+		.pmd_entry		= madvise_free_pte_range,
+	};
 
 	/* MADV_FREE works for only anon vma at the moment */
 	if (!vma_is_anonymous(vma))
@@ -808,8 +820,9 @@ static int madvise_free_single_vma(struct madvise_behavior *madv_behavior,
 
 	mmu_notifier_invalidate_range_start(&range);
 	tlb_start_vma(tlb, vma);
+	walk_ops.walk_lock = get_walk_lock(madv_behavior->lock_mode);
 	walk_page_range_vma(vma, range.start, range.end,
-			&madvise_free_walk_ops, tlb);
+			&walk_ops, tlb);
 	tlb_end_vma(tlb, vma);
 	mmu_notifier_invalidate_range_end(&range);
 	return 0;
@@ -1658,7 +1671,6 @@ static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavi
 	case MADV_WILLNEED:
 	case MADV_COLD:
 	case MADV_PAGEOUT:
-	case MADV_FREE:
 	case MADV_POPULATE_READ:
 	case MADV_POPULATE_WRITE:
 	case MADV_COLLAPSE:
@@ -1667,6 +1679,7 @@ static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavi
 		return MADVISE_MMAP_READ_LOCK;
 	case MADV_DONTNEED:
 	case MADV_DONTNEED_LOCKED:
+	case MADV_FREE:
 		return MADVISE_VMA_READ_LOCK;
 	default:
 		return MADVISE_MMAP_WRITE_LOCK;
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index ff5299eca687..a214a2b40ab9 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -422,7 +422,7 @@ static inline void process_mm_walk_lock(struct mm_struct *mm,
 {
 	if (walk_lock == PGWALK_RDLOCK)
 		mmap_assert_locked(mm);
-	else
+	else if (walk_lock != PGWALK_VMA_RDLOCK_VERIFY)
 		mmap_assert_write_locked(mm);
 }
 
@@ -437,6 +437,9 @@ static inline void process_vma_walk_lock(struct vm_area_struct *vma,
 	case PGWALK_WRLOCK_VERIFY:
 		vma_assert_write_locked(vma);
 		break;
+	case PGWALK_VMA_RDLOCK_VERIFY:
+		vma_assert_locked(vma);
+		break;
 	case PGWALK_RDLOCK:
 		/* PGWALK_RDLOCK is handled by process_mm_walk_lock */
 		break;
-- 
cgit v1.2.3


From 234dda7a49ff94154b527784687f549b9f1417c1 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 12 Jun 2025 15:34:41 +0100
Subject: mm: remove zero_user()

All users have now been converted to either memzero_page() or
folio_zero_range().

Link: https://lkml.kernel.org/r/20250612143443.2848197-6-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Alex Markuze <amarkuze@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Ilya Dryomov <idryomov@gmail.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Xiubo Li <xiubli@redhat.com>
Cc: Dan Carpenter <dan.carpenter@linaro.org>
Cc: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/highmem.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index e48d7f27b0b9..a30526cc53a7 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -292,12 +292,6 @@ static inline void zero_user_segment(struct page *page,
 	zero_user_segments(page, start, end, 0, 0);
 }
 
-static inline void zero_user(struct page *page,
-	unsigned start, unsigned size)
-{
-	zero_user_segments(page, start, start + size, 0, 0);
-}
-
 #ifndef __HAVE_ARCH_COPY_USER_HIGHPAGE
 
 static inline void copy_user_highpage(struct page *to, struct page *from,
-- 
cgit v1.2.3


From 02825c0925fbd53c085e88a1b7603eee8c9c6751 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 13 Jun 2025 11:27:02 +0200
Subject: mm/huge_memory: don't mark refcounted folios special in
 vmf_insert_folio_pud()

Marking PUDs that map "normal" refcounted folios as special is against
our rules documented for vm_normal_page(): normal (refcounted) folios
shall never have the page table mapping marked as special.

Fortunately, there are not that many pud_special() checks that can be
misled, and they are right now rather harmless: e.g., none so far bases
the decision whether to grab a folio reference on that check.

Well, and GUP-fast will fallback to GUP-slow.  All in all, so far no big
implications as it seems.

Getting this right will get more important as we introduce
folio_normal_page_pud() and start using it in more places where we
currently special-case based on other VMA flags.

Fix it just like we fixed vmf_insert_folio_pmd().

Add folio_mk_pud() to mimic what we do with folio_mk_pmd().

Link: https://lkml.kernel.org/r/20250613092702.1943533-4-david@redhat.com
Fixes: dbe54153296d ("mm/huge_memory: add vmf_insert_folio_pud()")
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Tested-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mariano Pache <npache@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 19 ++++++++++++++++++-
 mm/huge_memory.c   | 52 +++++++++++++++++++++++++++++-----------------------
 2 files changed, 47 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0ef2ba0c667a..b7e2abd8ce0d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1816,7 +1816,24 @@ static inline pmd_t folio_mk_pmd(struct folio *folio, pgprot_t pgprot)
 {
 	return pmd_mkhuge(pfn_pmd(folio_pfn(folio), pgprot));
 }
-#endif
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+/**
+ * folio_mk_pud - Create a PUD for this folio
+ * @folio: The folio to create a PUD for
+ * @pgprot: The page protection bits to use
+ *
+ * Create a page table entry for the first page of this folio.
+ * This is suitable for passing to set_pud_at().
+ *
+ * Return: A page table entry suitable for mapping this folio.
+ */
+static inline pud_t folio_mk_pud(struct folio *folio, pgprot_t pgprot)
+{
+	return pud_mkhuge(pfn_pud(folio_pfn(folio), pgprot));
+}
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif /* CONFIG_MMU */
 
 static inline bool folio_has_pincount(const struct folio *folio)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d1e3e253c714..bbc1dab98f2f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1536,15 +1536,18 @@ static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
 	return pud;
 }
 
-static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
-		pud_t *pud, pfn_t pfn, pgprot_t prot, bool write)
+static void insert_pud(struct vm_area_struct *vma, unsigned long addr,
+		pud_t *pud, struct folio_or_pfn fop, pgprot_t prot, bool write)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	pud_t entry;
 
 	if (!pud_none(*pud)) {
+		const unsigned long pfn = fop.is_folio ? folio_pfn(fop.folio) :
+					  pfn_t_to_pfn(fop.pfn);
+
 		if (write) {
-			if (WARN_ON_ONCE(pud_pfn(*pud) != pfn_t_to_pfn(pfn)))
+			if (WARN_ON_ONCE(pud_pfn(*pud) != pfn))
 				return;
 			entry = pud_mkyoung(*pud);
 			entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
@@ -1554,11 +1557,20 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
 		return;
 	}
 
-	entry = pud_mkhuge(pfn_t_pud(pfn, prot));
-	if (pfn_t_devmap(pfn))
-		entry = pud_mkdevmap(entry);
-	else
-		entry = pud_mkspecial(entry);
+	if (fop.is_folio) {
+		entry = folio_mk_pud(fop.folio, vma->vm_page_prot);
+
+		folio_get(fop.folio);
+		folio_add_file_rmap_pud(fop.folio, &fop.folio->page, vma);
+		add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PUD_NR);
+	} else {
+		entry = pud_mkhuge(pfn_t_pud(fop.pfn, prot));
+
+		if (pfn_t_devmap(fop.pfn))
+			entry = pud_mkdevmap(entry);
+		else
+			entry = pud_mkspecial(entry);
+	}
 	if (write) {
 		entry = pud_mkyoung(pud_mkdirty(entry));
 		entry = maybe_pud_mkwrite(entry, vma);
@@ -1582,6 +1594,9 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
 	unsigned long addr = vmf->address & PUD_MASK;
 	struct vm_area_struct *vma = vmf->vma;
 	pgprot_t pgprot = vma->vm_page_prot;
+	struct folio_or_pfn fop = {
+		.pfn = pfn,
+	};
 	spinlock_t *ptl;
 
 	/*
@@ -1601,7 +1616,7 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
 	pfnmap_setup_cachemode_pfn(pfn_t_to_pfn(pfn), &pgprot);
 
 	ptl = pud_lock(vma->vm_mm, vmf->pud);
-	insert_pfn_pud(vma, addr, vmf->pud, pfn, pgprot, write);
+	insert_pud(vma, addr, vmf->pud, fop, pgprot, write);
 	spin_unlock(ptl);
 
 	return VM_FAULT_NOPAGE;
@@ -1623,6 +1638,10 @@ vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio,
 	unsigned long addr = vmf->address & PUD_MASK;
 	pud_t *pud = vmf->pud;
 	struct mm_struct *mm = vma->vm_mm;
+	struct folio_or_pfn fop = {
+		.folio = folio,
+		.is_folio = true,
+	};
 	spinlock_t *ptl;
 
 	if (addr < vma->vm_start || addr >= vma->vm_end)
@@ -1632,20 +1651,7 @@ vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio,
 		return VM_FAULT_SIGBUS;
 
 	ptl = pud_lock(mm, pud);
-
-	/*
-	 * If there is already an entry present we assume the folio is
-	 * already mapped, hence no need to take another reference. We
-	 * still call insert_pfn_pud() though in case the mapping needs
-	 * upgrading to writeable.
-	 */
-	if (pud_none(*vmf->pud)) {
-		folio_get(folio);
-		folio_add_file_rmap_pud(folio, &folio->page, vma);
-		add_mm_counter(mm, mm_counter_file(folio), HPAGE_PUD_NR);
-	}
-	insert_pfn_pud(vma, addr, vmf->pud, pfn_to_pfn_t(folio_pfn(folio)),
-		       vma->vm_page_prot, write);
+	insert_pud(vma, addr, vmf->pud, fop, vma->vm_page_prot, write);
 	spin_unlock(ptl);
 
 	return VM_FAULT_NOPAGE;
-- 
cgit v1.2.3


From 9e82db9c0cdafa068969373b4bad140f72988e32 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 13 Jun 2025 20:48:23 +0100
Subject: highmem: remove a use of folio->page

Call folio_address() instead of page_address().

Link: https://lkml.kernel.org/r/20250613194825.3175276-1-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/highmem-internal.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h
index 9a7683d79a4b..36053c3d6d64 100644
--- a/include/linux/highmem-internal.h
+++ b/include/linux/highmem-internal.h
@@ -195,7 +195,7 @@ static inline void *kmap_local_page_try_from_panic(struct page *page)
 
 static inline void *kmap_local_folio(struct folio *folio, size_t offset)
 {
-	return page_address(&folio->page) + offset;
+	return folio_address(folio) + offset;
 }
 
 static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot)
-- 
cgit v1.2.3


From 78ddaa358ec4cdd60bd0e243ced1c83a52c30241 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Wed, 18 Jun 2025 20:42:52 +0100
Subject: mm: change vm_get_page_prot() to accept vm_flags_t argument

Patch series "use vm_flags_t consistently".

The VMA flags field vma->vm_flags is of type vm_flags_t.  Right now this
is exactly equivalent to unsigned long, but it should not be assumed to
be.

Much code that references vma->vm_flags already correctly uses vm_flags_t,
but a fairly large chunk of code simply uses unsigned long and assumes
that the two are equivalent.

This series corrects that and has us use vm_flags_t consistently.

This series is motivated by the desire to, in a future series, adjust
vm_flags_t to be a u64 regardless of whether the kernel is 32-bit or
64-bit in order to deal with the VMA flag exhaustion issue and avoid all
the various problems that arise from it (being unable to use certain
features in 32-bit, being unable to add new flags except for 64-bit, etc.)

This is therefore a critical first step towards that goal.  At any rate,
using the correct type is of value regardless.

We additionally take the opportunity to refer to VMA flags as vm_flags
where possible to make clear what we're referring to.

Overall, this series does not introduce any functional change.


This patch (of 3):

We abstract the type of the VMA flags to vm_flags_t, however in many places
it is simply assumed this is unsigned long, which is simply incorrect.

At the moment this is simply an incongruity, however in future we plan to
change this type and therefore this change is a critical requirement for
doing so.

Overall, this patch does not introduce any functional change.

[lorenzo.stoakes@oracle.com: add missing vm_get_page_prot() instance, remove include]
  Link: https://lkml.kernel.org/r/552f88e1-2df8-4e95-92b8-812f7c8db829@lucifer.local
Link: https://lkml.kernel.org/r/cover.1750274467.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/a12769720a2743f235643b158c4f4f0a9911daf0.1750274467.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Acked-by: Christian Brauner <brauner@kernel.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>	[arm64]
Acked-by: Zi Yan <ziy@nvidia.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Jann Horn <jannh@google.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm64/mm/mmap.c                       | 2 +-
 arch/powerpc/include/asm/book3s/64/pkeys.h | 2 +-
 arch/powerpc/mm/book3s64/pgtable.c         | 2 +-
 arch/sparc/mm/init_64.c                    | 2 +-
 arch/x86/mm/pgprot.c                       | 2 +-
 include/linux/mm.h                         | 4 ++--
 include/linux/pgtable.h                    | 2 +-
 tools/testing/vma/vma_internal.h           | 2 +-
 8 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c
index c86c348857c4..08ee177432c2 100644
--- a/arch/arm64/mm/mmap.c
+++ b/arch/arm64/mm/mmap.c
@@ -81,7 +81,7 @@ static int __init adjust_protection_map(void)
 }
 arch_initcall(adjust_protection_map);
 
-pgprot_t vm_get_page_prot(unsigned long vm_flags)
+pgprot_t vm_get_page_prot(vm_flags_t vm_flags)
 {
 	ptdesc_t prot;
 
diff --git a/arch/powerpc/include/asm/book3s/64/pkeys.h b/arch/powerpc/include/asm/book3s/64/pkeys.h
index 5b178139f3c0..ff911b4251d9 100644
--- a/arch/powerpc/include/asm/book3s/64/pkeys.h
+++ b/arch/powerpc/include/asm/book3s/64/pkeys.h
@@ -5,7 +5,7 @@
 
 #include <asm/book3s/64/hash-pkey.h>
 
-static inline u64 vmflag_to_pte_pkey_bits(u64 vm_flags)
+static inline u64 vmflag_to_pte_pkey_bits(vm_flags_t vm_flags)
 {
 	if (!mmu_has_feature(MMU_FTR_PKEY))
 		return 0x0UL;
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index 0db01e10a3f8..a89ef89101fc 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -644,7 +644,7 @@ unsigned long memremap_compat_align(void)
 EXPORT_SYMBOL_GPL(memremap_compat_align);
 #endif
 
-pgprot_t vm_get_page_prot(unsigned long vm_flags)
+pgprot_t vm_get_page_prot(vm_flags_t vm_flags)
 {
 	unsigned long prot;
 
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 25ae4c897aae..7ed58bf3aaca 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -3201,7 +3201,7 @@ void copy_highpage(struct page *to, struct page *from)
 }
 EXPORT_SYMBOL(copy_highpage);
 
-pgprot_t vm_get_page_prot(unsigned long vm_flags)
+pgprot_t vm_get_page_prot(vm_flags_t vm_flags)
 {
 	unsigned long prot = pgprot_val(protection_map[vm_flags &
 					(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]);
diff --git a/arch/x86/mm/pgprot.c b/arch/x86/mm/pgprot.c
index c84bd9540b16..dc1afd5c839d 100644
--- a/arch/x86/mm/pgprot.c
+++ b/arch/x86/mm/pgprot.c
@@ -32,7 +32,7 @@ void add_encrypt_protection_map(void)
 		protection_map[i] = pgprot_encrypted(protection_map[i]);
 }
 
-pgprot_t vm_get_page_prot(unsigned long vm_flags)
+pgprot_t vm_get_page_prot(vm_flags_t vm_flags)
 {
 	unsigned long val = pgprot_val(protection_map[vm_flags &
 				      (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b7e2abd8ce0d..78bb177ba55f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3489,10 +3489,10 @@ static inline bool range_in_vma(struct vm_area_struct *vma,
 }
 
 #ifdef CONFIG_MMU
-pgprot_t vm_get_page_prot(unsigned long vm_flags);
+pgprot_t vm_get_page_prot(vm_flags_t vm_flags);
 void vma_set_page_prot(struct vm_area_struct *vma);
 #else
-static inline pgprot_t vm_get_page_prot(unsigned long vm_flags)
+static inline pgprot_t vm_get_page_prot(vm_flags_t vm_flags)
 {
 	return __pgprot(0);
 }
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index e4a3895c043b..d05e35a0facf 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -2016,7 +2016,7 @@ typedef unsigned int pgtbl_mod_mask;
  *								x: (yes) yes
  */
 #define DECLARE_VM_GET_PAGE_PROT					\
-pgprot_t vm_get_page_prot(unsigned long vm_flags)			\
+pgprot_t vm_get_page_prot(vm_flags_t vm_flags)				\
 {									\
 		return protection_map[vm_flags &			\
 			(VM_READ | VM_WRITE | VM_EXEC | VM_SHARED)];	\
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 0f013784da89..3b1b45256d56 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -576,7 +576,7 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
 	return __pgprot(pgprot_val(oldprot) | pgprot_val(newprot));
 }
 
-static inline pgprot_t vm_get_page_prot(unsigned long vm_flags)
+static inline pgprot_t vm_get_page_prot(vm_flags_t vm_flags)
 {
 	return __pgprot(vm_flags);
 }
-- 
cgit v1.2.3


From bfbe71109fa40e8cc05a0f99e6734b7d76ee00b0 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Wed, 18 Jun 2025 20:42:53 +0100
Subject: mm: update core kernel code to use vm_flags_t consistently

The core kernel code is currently very inconsistent in its use of
vm_flags_t vs.  unsigned long.  This prevents us from changing the type of
vm_flags_t in the future and is simply not correct, so correct this.

While this results in rather a lot of churn, it is a critical
pre-requisite for a future planned change to the VMA flag type.

Additionally, update VMA userland tests to account for the changes.

To make review easier and to break things into smaller parts, driver and
architecture-specific changes are left for a subsequent commit.

The code has been adjusted to cascade the changes across all calling code
as far as is needed.

We will adjust architecture-specific and driver code in a subsequent patch.

Overall, this patch does not introduce any functional change.

Link: https://lkml.kernel.org/r/d1588e7bb96d1ea3fe7b9df2c699d5b4592d901d.1750274467.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Kees Cook <kees@kernel.org>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Acked-by: Jan Kara <jack@suse.cz>
Acked-by: Christian Brauner <brauner@kernel.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Acked-by: Zi Yan <ziy@nvidia.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Jann Horn <jannh@google.com>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/exec.c                        |   2 +-
 fs/userfaultfd.c                 |   2 +-
 include/linux/coredump.h         |   2 +-
 include/linux/huge_mm.h          |  12 +-
 include/linux/khugepaged.h       |   4 +-
 include/linux/ksm.h              |   4 +-
 include/linux/memfd.h            |   4 +-
 include/linux/mm.h               |   6 +-
 include/linux/mm_types.h         |   2 +-
 include/linux/mman.h             |   4 +-
 include/linux/rmap.h             |   4 +-
 include/linux/userfaultfd_k.h    |   4 +-
 include/trace/events/fs_dax.h    |   6 +-
 mm/debug.c                       |   2 +-
 mm/execmem.c                     |   8 +-
 mm/filemap.c                     |   2 +-
 mm/gup.c                         |   2 +-
 mm/huge_memory.c                 |   2 +-
 mm/hugetlb.c                     |   4 +-
 mm/internal.h                    |   4 +-
 mm/khugepaged.c                  |   4 +-
 mm/ksm.c                         |   2 +-
 mm/madvise.c                     |   4 +-
 mm/mapping_dirty_helpers.c       |   2 +-
 mm/memfd.c                       |   8 +-
 mm/memory.c                      |   4 +-
 mm/mmap.c                        |  16 +--
 mm/mprotect.c                    |   8 +-
 mm/mremap.c                      |   2 +-
 mm/nommu.c                       |  12 +-
 mm/rmap.c                        |   4 +-
 mm/shmem.c                       |   6 +-
 mm/userfaultfd.c                 |  14 +--
 mm/vma.c                         |  80 ++++++------
 mm/vma.h                         |  16 +--
 mm/vmscan.c                      |   4 +-
 tools/testing/vma/vma.c          | 266 +++++++++++++++++++--------------------
 tools/testing/vma/vma_internal.h |   8 +-
 38 files changed, 270 insertions(+), 270 deletions(-)

(limited to 'include')

diff --git a/fs/exec.c b/fs/exec.c
index ba400aafd640..9faf9052bed9 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -604,7 +604,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma = bprm->vma;
 	struct vm_area_struct *prev = NULL;
-	unsigned long vm_flags;
+	vm_flags_t vm_flags;
 	unsigned long stack_base;
 	unsigned long stack_size;
 	unsigned long stack_expand;
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index a2928b0aec6f..48e82e19d831 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1242,7 +1242,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 	int ret;
 	struct uffdio_register uffdio_register;
 	struct uffdio_register __user *user_uffdio_register;
-	unsigned long vm_flags;
+	vm_flags_t vm_flags;
 	bool found;
 	bool basic_ioctls;
 	unsigned long start, end;
diff --git a/include/linux/coredump.h b/include/linux/coredump.h
index 76e41805b92d..c504b0faecc2 100644
--- a/include/linux/coredump.h
+++ b/include/linux/coredump.h
@@ -10,7 +10,7 @@
 #ifdef CONFIG_COREDUMP
 struct core_vma_metadata {
 	unsigned long start, end;
-	unsigned long flags;
+	vm_flags_t flags;
 	unsigned long dump_size;
 	unsigned long pgoff;
 	struct file   *file;
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 2f190c90192d..7753daac49f7 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -261,7 +261,7 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma,
 }
 
 unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
-					 unsigned long vm_flags,
+					 vm_flags_t vm_flags,
 					 unsigned long tva_flags,
 					 unsigned long orders);
 
@@ -282,7 +282,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
  */
 static inline
 unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
-				       unsigned long vm_flags,
+				       vm_flags_t vm_flags,
 				       unsigned long tva_flags,
 				       unsigned long orders)
 {
@@ -317,7 +317,7 @@ struct thpsize {
 	 (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
 
 static inline bool vma_thp_disabled(struct vm_area_struct *vma,
-		unsigned long vm_flags)
+		vm_flags_t vm_flags)
 {
 	/*
 	 * Explicitly disabled through madvise or prctl, or some
@@ -431,7 +431,7 @@ change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
 			__split_huge_pud(__vma, __pud, __address);	\
 	}  while (0)
 
-int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags,
+int hugepage_madvise(struct vm_area_struct *vma, vm_flags_t *vm_flags,
 		     int advice);
 int madvise_collapse(struct vm_area_struct *vma,
 		     struct vm_area_struct **prev,
@@ -524,7 +524,7 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma,
 }
 
 static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
-					unsigned long vm_flags,
+					vm_flags_t vm_flags,
 					unsigned long tva_flags,
 					unsigned long orders)
 {
@@ -593,7 +593,7 @@ static inline bool unmap_huge_pmd_locked(struct vm_area_struct *vma,
 	do { } while (0)
 
 static inline int hugepage_madvise(struct vm_area_struct *vma,
-				   unsigned long *vm_flags, int advice)
+				   vm_flags_t *vm_flags, int advice)
 {
 	return -EINVAL;
 }
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index b8d69cfbb58b..ff6120463745 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -12,7 +12,7 @@ extern int start_stop_khugepaged(void);
 extern void __khugepaged_enter(struct mm_struct *mm);
 extern void __khugepaged_exit(struct mm_struct *mm);
 extern void khugepaged_enter_vma(struct vm_area_struct *vma,
-				 unsigned long vm_flags);
+				 vm_flags_t vm_flags);
 extern void khugepaged_min_free_kbytes_update(void);
 extern bool current_is_khugepaged(void);
 extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
@@ -37,7 +37,7 @@ static inline void khugepaged_exit(struct mm_struct *mm)
 {
 }
 static inline void khugepaged_enter_vma(struct vm_area_struct *vma,
-					unsigned long vm_flags)
+					vm_flags_t vm_flags)
 {
 }
 static inline int collapse_pte_mapped_thp(struct mm_struct *mm,
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 51787f0b0208..c17b955e7b0b 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -16,7 +16,7 @@
 
 #ifdef CONFIG_KSM
 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
-		unsigned long end, int advice, unsigned long *vm_flags);
+		unsigned long end, int advice, vm_flags_t *vm_flags);
 vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file,
 			 vm_flags_t vm_flags);
 int ksm_enable_merge_any(struct mm_struct *mm);
@@ -133,7 +133,7 @@ static inline void collect_procs_ksm(const struct folio *folio,
 
 #ifdef CONFIG_MMU
 static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
-		unsigned long end, int advice, unsigned long *vm_flags)
+		unsigned long end, int advice, vm_flags_t *vm_flags)
 {
 	return 0;
 }
diff --git a/include/linux/memfd.h b/include/linux/memfd.h
index 246daadbfde8..6f606d9573c3 100644
--- a/include/linux/memfd.h
+++ b/include/linux/memfd.h
@@ -14,7 +14,7 @@ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx);
  * We also update VMA flags if appropriate by manipulating the VMA flags pointed
  * to by vm_flags_ptr.
  */
-int memfd_check_seals_mmap(struct file *file, unsigned long *vm_flags_ptr);
+int memfd_check_seals_mmap(struct file *file, vm_flags_t *vm_flags_ptr);
 #else
 static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned int a)
 {
@@ -25,7 +25,7 @@ static inline struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx)
 	return ERR_PTR(-EINVAL);
 }
 static inline int memfd_check_seals_mmap(struct file *file,
-					 unsigned long *vm_flags_ptr)
+					 vm_flags_t *vm_flags_ptr)
 {
 	return 0;
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 78bb177ba55f..fc365420dfa8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2564,7 +2564,7 @@ extern long change_protection(struct mmu_gather *tlb,
 			      unsigned long end, unsigned long cp_flags);
 extern int mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
 	  struct vm_area_struct *vma, struct vm_area_struct **pprev,
-	  unsigned long start, unsigned long end, unsigned long newflags);
+	  unsigned long start, unsigned long end, vm_flags_t newflags);
 
 /*
  * doesn't attempt to fault and will return short.
@@ -3323,9 +3323,9 @@ extern void vm_stat_account(struct mm_struct *, vm_flags_t, long npages);
 
 extern bool vma_is_special_mapping(const struct vm_area_struct *vma,
 				   const struct vm_special_mapping *sm);
-extern struct vm_area_struct *_install_special_mapping(struct mm_struct *mm,
+struct vm_area_struct *_install_special_mapping(struct mm_struct *mm,
 				   unsigned long addr, unsigned long len,
-				   unsigned long flags,
+				   vm_flags_t vm_flags,
 				   const struct vm_special_mapping *spec);
 
 unsigned long randomize_stack_top(unsigned long stack_top);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index d6b91e8a66d6..804d269a4f5e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1081,7 +1081,7 @@ struct mm_struct {
 		unsigned long data_vm;	   /* VM_WRITE & ~VM_SHARED & ~VM_STACK */
 		unsigned long exec_vm;	   /* VM_EXEC & ~VM_WRITE & ~VM_STACK */
 		unsigned long stack_vm;	   /* VM_STACK */
-		unsigned long def_flags;
+		vm_flags_t def_flags;
 
 		/**
 		 * @write_protect_seq: Locked when any thread is write
diff --git a/include/linux/mman.h b/include/linux/mman.h
index f4c6346a8fcd..de9e8e6229a4 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -137,7 +137,7 @@ static inline bool arch_validate_flags(unsigned long flags)
 /*
  * Combine the mmap "prot" argument into "vm_flags" used internally.
  */
-static inline unsigned long
+static inline vm_flags_t
 calc_vm_prot_bits(unsigned long prot, unsigned long pkey)
 {
 	return _calc_vm_trans(prot, PROT_READ,  VM_READ ) |
@@ -149,7 +149,7 @@ calc_vm_prot_bits(unsigned long prot, unsigned long pkey)
 /*
  * Combine the mmap "flags" argument into "vm_flags" used internally.
  */
-static inline unsigned long
+static inline vm_flags_t
 calc_vm_flag_bits(struct file *file, unsigned long flags)
 {
 	return _calc_vm_trans(flags, MAP_GROWSDOWN,  VM_GROWSDOWN ) |
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index c4f4903b1088..20803fcb49a7 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -893,7 +893,7 @@ static inline int folio_try_share_anon_rmap_pmd(struct folio *folio,
  * Called from mm/vmscan.c to handle paging out
  */
 int folio_referenced(struct folio *, int is_locked,
-			struct mem_cgroup *memcg, unsigned long *vm_flags);
+			struct mem_cgroup *memcg, vm_flags_t *vm_flags);
 
 void try_to_migrate(struct folio *folio, enum ttu_flags flags);
 void try_to_unmap(struct folio *, enum ttu_flags flags);
@@ -1025,7 +1025,7 @@ struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio,
 
 static inline int folio_referenced(struct folio *folio, int is_locked,
 				  struct mem_cgroup *memcg,
-				  unsigned long *vm_flags)
+				  vm_flags_t *vm_flags)
 {
 	*vm_flags = 0;
 	return 0;
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index ccad58602846..df85330bcfa6 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -209,7 +209,7 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
 }
 
 static inline bool vma_can_userfault(struct vm_area_struct *vma,
-				     unsigned long vm_flags,
+				     vm_flags_t vm_flags,
 				     bool wp_async)
 {
 	vm_flags &= __VM_UFFD_FLAGS;
@@ -281,7 +281,7 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
 
 int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
 			       struct vm_area_struct *vma,
-			       unsigned long vm_flags,
+			       vm_flags_t vm_flags,
 			       unsigned long start, unsigned long end,
 			       bool wp_async);
 
diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h
index 76b56f78abb0..50ebc1290ab0 100644
--- a/include/trace/events/fs_dax.h
+++ b/include/trace/events/fs_dax.h
@@ -15,7 +15,7 @@ DECLARE_EVENT_CLASS(dax_pmd_fault_class,
 		__field(unsigned long, ino)
 		__field(unsigned long, vm_start)
 		__field(unsigned long, vm_end)
-		__field(unsigned long, vm_flags)
+		__field(vm_flags_t, vm_flags)
 		__field(unsigned long, address)
 		__field(pgoff_t, pgoff)
 		__field(pgoff_t, max_pgoff)
@@ -67,7 +67,7 @@ DECLARE_EVENT_CLASS(dax_pmd_load_hole_class,
 	TP_ARGS(inode, vmf, zero_folio, radix_entry),
 	TP_STRUCT__entry(
 		__field(unsigned long, ino)
-		__field(unsigned long, vm_flags)
+		__field(vm_flags_t, vm_flags)
 		__field(unsigned long, address)
 		__field(struct folio *, zero_folio)
 		__field(void *, radix_entry)
@@ -107,7 +107,7 @@ DECLARE_EVENT_CLASS(dax_pte_fault_class,
 	TP_ARGS(inode, vmf, result),
 	TP_STRUCT__entry(
 		__field(unsigned long, ino)
-		__field(unsigned long, vm_flags)
+		__field(vm_flags_t, vm_flags)
 		__field(unsigned long, address)
 		__field(pgoff_t, pgoff)
 		__field(dev_t, dev)
diff --git a/mm/debug.c b/mm/debug.c
index 907382257062..e2973e1b3812 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -290,7 +290,7 @@ void dump_vmg(const struct vma_merge_struct *vmg, const char *reason)
 		vmg->vmi, vmg->vmi ? vma_iter_addr(vmg->vmi) : 0,
 		vmg->vmi ? vma_iter_end(vmg->vmi) : 0,
 		vmg->prev, vmg->middle, vmg->next, vmg->target,
-		vmg->start, vmg->end, vmg->flags,
+		vmg->start, vmg->end, vmg->vm_flags,
 		vmg->file, vmg->anon_vma, vmg->policy,
 #ifdef CONFIG_USERFAULTFD
 		vmg->uffd_ctx.ctx,
diff --git a/mm/execmem.c b/mm/execmem.c
index 2b683e7d864d..627e6cf64f4f 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -26,7 +26,7 @@ static struct execmem_info default_execmem_info __ro_after_init;
 
 #ifdef CONFIG_MMU
 static void *execmem_vmalloc(struct execmem_range *range, size_t size,
-			     pgprot_t pgprot, unsigned long vm_flags)
+			     pgprot_t pgprot, vm_flags_t vm_flags)
 {
 	bool kasan = range->flags & EXECMEM_KASAN_SHADOW;
 	gfp_t gfp_flags = GFP_KERNEL | __GFP_NOWARN;
@@ -82,7 +82,7 @@ struct vm_struct *execmem_vmap(size_t size)
 }
 #else
 static void *execmem_vmalloc(struct execmem_range *range, size_t size,
-			     pgprot_t pgprot, unsigned long vm_flags)
+			     pgprot_t pgprot, vm_flags_t vm_flags)
 {
 	return vmalloc(size);
 }
@@ -256,7 +256,7 @@ out_unlock:
 
 static int execmem_cache_populate(struct execmem_range *range, size_t size)
 {
-	unsigned long vm_flags = VM_ALLOW_HUGE_VMAP;
+	vm_flags_t vm_flags = VM_ALLOW_HUGE_VMAP;
 	struct vm_struct *vm;
 	size_t alloc_size;
 	int err = -ENOMEM;
@@ -373,7 +373,7 @@ void *execmem_alloc(enum execmem_type type, size_t size)
 {
 	struct execmem_range *range = &execmem_info->ranges[type];
 	bool use_cache = range->flags & EXECMEM_ROX_CACHE;
-	unsigned long vm_flags = VM_FLUSH_RESET_PERMS;
+	vm_flags_t vm_flags = VM_FLUSH_RESET_PERMS;
 	pgprot_t pgprot = range->pgprot;
 	void *p;
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 3cf955740148..0d0369fb5fa1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3216,7 +3216,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
 	struct address_space *mapping = file->f_mapping;
 	DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);
 	struct file *fpin = NULL;
-	unsigned long vm_flags = vmf->vma->vm_flags;
+	vm_flags_t vm_flags = vmf->vma->vm_flags;
 	unsigned short mmap_miss;
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
diff --git a/mm/gup.c b/mm/gup.c
index cbe8e4b9845b..c08b97e0d344 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2044,7 +2044,7 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
 {
 	struct vm_area_struct *vma;
 	bool must_unlock = false;
-	unsigned long vm_flags;
+	vm_flags_t vm_flags;
 	long i;
 
 	if (!nr_pages)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1b31985cef11..6411f3107af1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -99,7 +99,7 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma)
 }
 
 unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
-					 unsigned long vm_flags,
+					 vm_flags_t vm_flags,
 					 unsigned long tva_flags,
 					 unsigned long orders)
 {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7a7df0b2a561..c7ba95030241 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7465,8 +7465,8 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
 	unsigned long s_end = sbase + PUD_SIZE;
 
 	/* Allow segments to share if only one is marked locked */
-	unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED_MASK;
-	unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED_MASK;
+	vm_flags_t vm_flags = vma->vm_flags & ~VM_LOCKED_MASK;
+	vm_flags_t svm_flags = svma->vm_flags & ~VM_LOCKED_MASK;
 
 	/*
 	 * match the virtual addresses, permission and the alignment of the
diff --git a/mm/internal.h b/mm/internal.h
index 3eb51c31a041..fe83dfca3c72 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -928,7 +928,7 @@ extern long populate_vma_page_range(struct vm_area_struct *vma,
 		unsigned long start, unsigned long end, int *locked);
 extern long faultin_page_range(struct mm_struct *mm, unsigned long start,
 		unsigned long end, bool write, int *locked);
-extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
+extern bool mlock_future_ok(struct mm_struct *mm, vm_flags_t vm_flags,
 			       unsigned long bytes);
 
 /*
@@ -1358,7 +1358,7 @@ int migrate_device_coherent_folio(struct folio *folio);
 
 struct vm_struct *__get_vm_area_node(unsigned long size,
 				     unsigned long align, unsigned long shift,
-				     unsigned long flags, unsigned long start,
+				     vm_flags_t vm_flags, unsigned long start,
 				     unsigned long end, int node, gfp_t gfp_mask,
 				     const void *caller);
 
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 15203ea7d007..6b09b09c8f82 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -347,7 +347,7 @@ struct attribute_group khugepaged_attr_group = {
 #endif /* CONFIG_SYSFS */
 
 int hugepage_madvise(struct vm_area_struct *vma,
-		     unsigned long *vm_flags, int advice)
+		     vm_flags_t *vm_flags, int advice)
 {
 	switch (advice) {
 	case MADV_HUGEPAGE:
@@ -470,7 +470,7 @@ void __khugepaged_enter(struct mm_struct *mm)
 }
 
 void khugepaged_enter_vma(struct vm_area_struct *vma,
-			  unsigned long vm_flags)
+			  vm_flags_t vm_flags)
 {
 	if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
 	    hugepage_pmd_enabled()) {
diff --git a/mm/ksm.c b/mm/ksm.c
index 18b3690bb69a..ef73b25fd65a 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2840,7 +2840,7 @@ int ksm_disable(struct mm_struct *mm)
 }
 
 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
-		unsigned long end, int advice, unsigned long *vm_flags)
+		unsigned long end, int advice, vm_flags_t *vm_flags)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	int err;
diff --git a/mm/madvise.c b/mm/madvise.c
index d451438af999..92f427b1b330 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -130,7 +130,7 @@ static int replace_anon_vma_name(struct vm_area_struct *vma,
  */
 static int madvise_update_vma(struct vm_area_struct *vma,
 			      struct vm_area_struct **prev, unsigned long start,
-			      unsigned long end, unsigned long new_flags,
+			      unsigned long end, vm_flags_t new_flags,
 			      struct anon_vma_name *anon_name)
 {
 	struct mm_struct *mm = vma->vm_mm;
@@ -1258,7 +1258,7 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
 	int behavior = arg->behavior;
 	int error;
 	struct anon_vma_name *anon_name;
-	unsigned long new_flags = vma->vm_flags;
+	vm_flags_t new_flags = vma->vm_flags;
 
 	if (unlikely(!can_modify_vma_madv(vma, behavior)))
 		return -EPERM;
diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c
index 2f8829b3541a..dc1692ff9e58 100644
--- a/mm/mapping_dirty_helpers.c
+++ b/mm/mapping_dirty_helpers.c
@@ -218,7 +218,7 @@ static void wp_clean_post_vma(struct mm_walk *walk)
 static int wp_clean_test_walk(unsigned long start, unsigned long end,
 			      struct mm_walk *walk)
 {
-	unsigned long vm_flags = READ_ONCE(walk->vma->vm_flags);
+	vm_flags_t vm_flags = READ_ONCE(walk->vma->vm_flags);
 
 	/* Skip non-applicable VMAs */
 	if ((vm_flags & (VM_SHARED | VM_MAYWRITE | VM_HUGETLB)) !=
diff --git a/mm/memfd.c b/mm/memfd.c
index 65a107f72e39..b558c4c3bd27 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -332,10 +332,10 @@ static inline bool is_write_sealed(unsigned int seals)
 	return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE);
 }
 
-static int check_write_seal(unsigned long *vm_flags_ptr)
+static int check_write_seal(vm_flags_t *vm_flags_ptr)
 {
-	unsigned long vm_flags = *vm_flags_ptr;
-	unsigned long mask = vm_flags & (VM_SHARED | VM_WRITE);
+	vm_flags_t vm_flags = *vm_flags_ptr;
+	vm_flags_t mask = vm_flags & (VM_SHARED | VM_WRITE);
 
 	/* If a private mapping then writability is irrelevant. */
 	if (!(mask & VM_SHARED))
@@ -357,7 +357,7 @@ static int check_write_seal(unsigned long *vm_flags_ptr)
 	return 0;
 }
 
-int memfd_check_seals_mmap(struct file *file, unsigned long *vm_flags_ptr)
+int memfd_check_seals_mmap(struct file *file, vm_flags_t *vm_flags_ptr)
 {
 	int err = 0;
 	unsigned int *seals_ptr = memfd_file_seals_ptr(file);
diff --git a/mm/memory.c b/mm/memory.c
index b0cda5aab398..833426fa5fe0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -797,7 +797,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma,
 		struct vm_area_struct *src_vma, unsigned long addr, int *rss)
 {
-	unsigned long vm_flags = dst_vma->vm_flags;
+	vm_flags_t vm_flags = dst_vma->vm_flags;
 	pte_t orig_pte = ptep_get(src_pte);
 	pte_t pte = orig_pte;
 	struct folio *folio;
@@ -6128,7 +6128,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
 		.gfp_mask = __get_fault_gfp_mask(vma),
 	};
 	struct mm_struct *mm = vma->vm_mm;
-	unsigned long vm_flags = vma->vm_flags;
+	vm_flags_t vm_flags = vma->vm_flags;
 	pgd_t *pgd;
 	p4d_t *p4d;
 	vm_fault_t ret;
diff --git a/mm/mmap.c b/mm/mmap.c
index 09c563c95112..8f92cf10b656 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -80,7 +80,7 @@ core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
 /* Update vma->vm_page_prot to reflect vma->vm_flags. */
 void vma_set_page_prot(struct vm_area_struct *vma)
 {
-	unsigned long vm_flags = vma->vm_flags;
+	vm_flags_t vm_flags = vma->vm_flags;
 	pgprot_t vm_page_prot;
 
 	vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags);
@@ -228,12 +228,12 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
 	return hint;
 }
 
-bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
+bool mlock_future_ok(struct mm_struct *mm, vm_flags_t vm_flags,
 			unsigned long bytes)
 {
 	unsigned long locked_pages, limit_pages;
 
-	if (!(flags & VM_LOCKED) || capable(CAP_IPC_LOCK))
+	if (!(vm_flags & VM_LOCKED) || capable(CAP_IPC_LOCK))
 		return true;
 
 	locked_pages = bytes >> PAGE_SHIFT;
@@ -1207,7 +1207,7 @@ out:
 	return ret;
 }
 
-int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
+int vm_brk_flags(unsigned long addr, unsigned long request, vm_flags_t vm_flags)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma = NULL;
@@ -1224,7 +1224,7 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
 		return 0;
 
 	/* Until we need other flags, refuse anything except VM_EXEC. */
-	if ((flags & (~VM_EXEC)) != 0)
+	if ((vm_flags & (~VM_EXEC)) != 0)
 		return -EINVAL;
 
 	if (mmap_write_lock_killable(mm))
@@ -1239,7 +1239,7 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
 		goto munmap_failed;
 
 	vma = vma_prev(&vmi);
-	ret = do_brk_flags(&vmi, vma, addr, len, flags);
+	ret = do_brk_flags(&vmi, vma, addr, len, vm_flags);
 	populate = ((mm->def_flags & VM_LOCKED) != 0);
 	mmap_write_unlock(mm);
 	userfaultfd_unmap_complete(mm, &uf);
@@ -1444,7 +1444,7 @@ static vm_fault_t special_mapping_fault(struct vm_fault *vmf)
 static struct vm_area_struct *__install_special_mapping(
 	struct mm_struct *mm,
 	unsigned long addr, unsigned long len,
-	unsigned long vm_flags, void *priv,
+	vm_flags_t vm_flags, void *priv,
 	const struct vm_operations_struct *ops)
 {
 	int ret;
@@ -1496,7 +1496,7 @@ bool vma_is_special_mapping(const struct vm_area_struct *vma,
 struct vm_area_struct *_install_special_mapping(
 	struct mm_struct *mm,
 	unsigned long addr, unsigned long len,
-	unsigned long vm_flags, const struct vm_special_mapping *spec)
+	vm_flags_t vm_flags, const struct vm_special_mapping *spec)
 {
 	return __install_special_mapping(mm, addr, len, vm_flags, (void *)spec,
 					&special_mapping_vmops);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 88608d0dc2c2..b873b98ab705 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -596,10 +596,10 @@ static const struct mm_walk_ops prot_none_walk_ops = {
 int
 mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
 	       struct vm_area_struct *vma, struct vm_area_struct **pprev,
-	       unsigned long start, unsigned long end, unsigned long newflags)
+	       unsigned long start, unsigned long end, vm_flags_t newflags)
 {
 	struct mm_struct *mm = vma->vm_mm;
-	unsigned long oldflags = READ_ONCE(vma->vm_flags);
+	vm_flags_t oldflags = READ_ONCE(vma->vm_flags);
 	long nrpages = (end - start) >> PAGE_SHIFT;
 	unsigned int mm_cp_flags = 0;
 	unsigned long charged = 0;
@@ -774,8 +774,8 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
 	nstart = start;
 	tmp = vma->vm_start;
 	for_each_vma_range(vmi, vma, end) {
-		unsigned long mask_off_old_flags;
-		unsigned long newflags;
+		vm_flags_t mask_off_old_flags;
+		vm_flags_t newflags;
 		int new_vma_pkey;
 
 		if (vma->vm_start != tmp) {
diff --git a/mm/mremap.c b/mm/mremap.c
index 18b215521ada..7e93d3344828 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -1025,7 +1025,7 @@ static unsigned long prep_move_vma(struct vma_remap_struct *vrm)
 	struct vm_area_struct *vma = vrm->vma;
 	unsigned long old_addr = vrm->addr;
 	unsigned long old_len = vrm->old_len;
-	unsigned long dummy = vma->vm_flags;
+	vm_flags_t dummy = vma->vm_flags;
 
 	/*
 	 * We'd prefer to avoid failure later on in do_munmap:
diff --git a/mm/nommu.c b/mm/nommu.c
index b624acec6d2e..87e1acab0d64 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -126,7 +126,7 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
 
 void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
 		unsigned long start, unsigned long end, gfp_t gfp_mask,
-		pgprot_t prot, unsigned long vm_flags, int node,
+		pgprot_t prot, vm_flags_t vm_flags, int node,
 		const void *caller)
 {
 	return __vmalloc_noprof(size, gfp_mask);
@@ -844,12 +844,12 @@ static int validate_mmap_request(struct file *file,
  * we've determined that we can make the mapping, now translate what we
  * now know into VMA flags
  */
-static unsigned long determine_vm_flags(struct file *file,
-					unsigned long prot,
-					unsigned long flags,
-					unsigned long capabilities)
+static vm_flags_t determine_vm_flags(struct file *file,
+		unsigned long prot,
+		unsigned long flags,
+		unsigned long capabilities)
 {
-	unsigned long vm_flags;
+	vm_flags_t vm_flags;
 
 	vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(file, flags);
 
diff --git a/mm/rmap.c b/mm/rmap.c
index fb63d9256f09..a312cae16bb5 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -839,7 +839,7 @@ out:
 struct folio_referenced_arg {
 	int mapcount;
 	int referenced;
-	unsigned long vm_flags;
+	vm_flags_t vm_flags;
 	struct mem_cgroup *memcg;
 };
 
@@ -984,7 +984,7 @@ static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg)
  * the function bailed out due to rmap lock contention.
  */
 int folio_referenced(struct folio *folio, int is_locked,
-		     struct mem_cgroup *memcg, unsigned long *vm_flags)
+		     struct mem_cgroup *memcg, vm_flags_t *vm_flags)
 {
 	bool we_locked = false;
 	struct folio_referenced_arg pra = {
diff --git a/mm/shmem.c b/mm/shmem.c
index eda35be2a8d9..334b7b4a61a0 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -615,7 +615,7 @@ static unsigned int shmem_get_orders_within_size(struct inode *inode,
 static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
 					      loff_t write_end, bool shmem_huge_force,
 					      struct vm_area_struct *vma,
-					      unsigned long vm_flags)
+					      vm_flags_t vm_flags)
 {
 	unsigned int maybe_pmd_order = HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER ?
 		0 : BIT(HPAGE_PMD_ORDER);
@@ -862,7 +862,7 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
 static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
 					      loff_t write_end, bool shmem_huge_force,
 					      struct vm_area_struct *vma,
-					      unsigned long vm_flags)
+					      vm_flags_t vm_flags)
 {
 	return 0;
 }
@@ -1753,7 +1753,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode,
 {
 	unsigned long mask = READ_ONCE(huge_shmem_orders_always);
 	unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size);
-	unsigned long vm_flags = vma ? vma->vm_flags : 0;
+	vm_flags_t vm_flags = vma ? vma->vm_flags : 0;
 	unsigned int global_orders;
 
 	if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags)))
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 9ff970980496..95dd8dea6ee4 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -1901,11 +1901,11 @@ out:
 }
 
 static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
-				     vm_flags_t flags)
+				     vm_flags_t vm_flags)
 {
-	const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP;
+	const bool uffd_wp_changed = (vma->vm_flags ^ vm_flags) & VM_UFFD_WP;
 
-	vm_flags_reset(vma, flags);
+	vm_flags_reset(vma, vm_flags);
 	/*
 	 * For shared mappings, we want to enable writenotify while
 	 * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
@@ -1917,12 +1917,12 @@ static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
 
 static void userfaultfd_set_ctx(struct vm_area_struct *vma,
 				struct userfaultfd_ctx *ctx,
-				unsigned long flags)
+				vm_flags_t vm_flags)
 {
 	vma_start_write(vma);
 	vma->vm_userfaultfd_ctx = (struct vm_userfaultfd_ctx){ctx};
 	userfaultfd_set_vm_flags(vma,
-				 (vma->vm_flags & ~__VM_UFFD_FLAGS) | flags);
+				 (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags);
 }
 
 void userfaultfd_reset_ctx(struct vm_area_struct *vma)
@@ -1968,14 +1968,14 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
 /* Assumes mmap write lock taken, and mm_struct pinned. */
 int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
 			       struct vm_area_struct *vma,
-			       unsigned long vm_flags,
+			       vm_flags_t vm_flags,
 			       unsigned long start, unsigned long end,
 			       bool wp_async)
 {
 	VMA_ITERATOR(vmi, ctx->mm, start);
 	struct vm_area_struct *prev = vma_prev(&vmi);
 	unsigned long vma_end;
-	unsigned long new_flags;
+	vm_flags_t new_flags;
 
 	if (vma->vm_start < start)
 		prev = vma;
diff --git a/mm/vma.c b/mm/vma.c
index 4b6d0be9ba39..b3d880652359 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -15,7 +15,7 @@ struct mmap_state {
 	unsigned long end;
 	pgoff_t pgoff;
 	unsigned long pglen;
-	unsigned long flags;
+	vm_flags_t vm_flags;
 	struct file *file;
 	pgprot_t page_prot;
 
@@ -37,7 +37,7 @@ struct mmap_state {
 	bool check_ksm_early;
 };
 
-#define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, flags_, file_) \
+#define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, vm_flags_, file_) \
 	struct mmap_state name = {					\
 		.mm = mm_,						\
 		.vmi = vmi_,						\
@@ -45,9 +45,9 @@ struct mmap_state {
 		.end = (addr_) + (len_),				\
 		.pgoff = pgoff_,					\
 		.pglen = PHYS_PFN(len_),				\
-		.flags = flags_,					\
+		.vm_flags = vm_flags_,					\
 		.file = file_,						\
-		.page_prot = vm_get_page_prot(flags_),			\
+		.page_prot = vm_get_page_prot(vm_flags_),		\
 	}
 
 #define VMG_MMAP_STATE(name, map_, vma_)				\
@@ -56,7 +56,7 @@ struct mmap_state {
 		.vmi = (map_)->vmi,					\
 		.start = (map_)->addr,					\
 		.end = (map_)->end,					\
-		.flags = (map_)->flags,					\
+		.vm_flags = (map_)->vm_flags,				\
 		.pgoff = (map_)->pgoff,					\
 		.file = (map_)->file,					\
 		.prev = (map_)->prev,					\
@@ -95,7 +95,7 @@ static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_nex
 	 * the kernel to generate new VMAs when old one could be
 	 * extended instead.
 	 */
-	if ((vma->vm_flags ^ vmg->flags) & ~VM_SOFTDIRTY)
+	if ((vma->vm_flags ^ vmg->vm_flags) & ~VM_SOFTDIRTY)
 		return false;
 	if (vma->vm_file != vmg->file)
 		return false;
@@ -843,7 +843,7 @@ static __must_check struct vm_area_struct *vma_merge_existing_range(
 	 * furthermost left or right side of the VMA, then we have no chance of
 	 * merging and should abort.
 	 */
-	if (vmg->flags & VM_SPECIAL || (!left_side && !right_side))
+	if (vmg->vm_flags & VM_SPECIAL || (!left_side && !right_side))
 		return NULL;
 
 	if (left_side)
@@ -973,7 +973,7 @@ static __must_check struct vm_area_struct *vma_merge_existing_range(
 	if (err || commit_merge(vmg))
 		goto abort;
 
-	khugepaged_enter_vma(vmg->target, vmg->flags);
+	khugepaged_enter_vma(vmg->target, vmg->vm_flags);
 	vmg->state = VMA_MERGE_SUCCESS;
 	return vmg->target;
 
@@ -1055,7 +1055,7 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
 	vmg->state = VMA_MERGE_NOMERGE;
 
 	/* Special VMAs are unmergeable, also if no prev/next. */
-	if ((vmg->flags & VM_SPECIAL) || (!prev && !next))
+	if ((vmg->vm_flags & VM_SPECIAL) || (!prev && !next))
 		return NULL;
 
 	can_merge_left = can_vma_merge_left(vmg);
@@ -1093,7 +1093,7 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
 	 * following VMA if we have VMAs on both sides.
 	 */
 	if (vmg->target && !vma_expand(vmg)) {
-		khugepaged_enter_vma(vmg->target, vmg->flags);
+		khugepaged_enter_vma(vmg->target, vmg->vm_flags);
 		vmg->state = VMA_MERGE_SUCCESS;
 		return vmg->target;
 	}
@@ -1640,11 +1640,11 @@ static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg)
 struct vm_area_struct *vma_modify_flags(
 	struct vma_iterator *vmi, struct vm_area_struct *prev,
 	struct vm_area_struct *vma, unsigned long start, unsigned long end,
-	unsigned long new_flags)
+	vm_flags_t vm_flags)
 {
 	VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
 
-	vmg.flags = new_flags;
+	vmg.vm_flags = vm_flags;
 
 	return vma_modify(&vmg);
 }
@@ -1655,12 +1655,12 @@ struct vm_area_struct
 		       struct vm_area_struct *vma,
 		       unsigned long start,
 		       unsigned long end,
-		       unsigned long new_flags,
+		       vm_flags_t vm_flags,
 		       struct anon_vma_name *new_name)
 {
 	VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
 
-	vmg.flags = new_flags;
+	vmg.vm_flags = vm_flags;
 	vmg.anon_name = new_name;
 
 	return vma_modify(&vmg);
@@ -1685,13 +1685,13 @@ struct vm_area_struct
 		       struct vm_area_struct *prev,
 		       struct vm_area_struct *vma,
 		       unsigned long start, unsigned long end,
-		       unsigned long new_flags,
+		       vm_flags_t vm_flags,
 		       struct vm_userfaultfd_ctx new_ctx,
 		       bool give_up_on_oom)
 {
 	VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
 
-	vmg.flags = new_flags;
+	vmg.vm_flags = vm_flags;
 	vmg.uffd_ctx = new_ctx;
 	if (give_up_on_oom)
 		vmg.give_up_on_oom = true;
@@ -2327,7 +2327,7 @@ static void vms_abort_munmap_vmas(struct vma_munmap_struct *vms,
 
 static void update_ksm_flags(struct mmap_state *map)
 {
-	map->flags = ksm_vma_flags(map->mm, map->file, map->flags);
+	map->vm_flags = ksm_vma_flags(map->mm, map->file, map->vm_flags);
 }
 
 /*
@@ -2372,11 +2372,11 @@ static int __mmap_prepare(struct mmap_state *map, struct list_head *uf)
 	}
 
 	/* Check against address space limit. */
-	if (!may_expand_vm(map->mm, map->flags, map->pglen - vms->nr_pages))
+	if (!may_expand_vm(map->mm, map->vm_flags, map->pglen - vms->nr_pages))
 		return -ENOMEM;
 
 	/* Private writable mapping: check memory availability. */
-	if (accountable_mapping(map->file, map->flags)) {
+	if (accountable_mapping(map->file, map->vm_flags)) {
 		map->charged = map->pglen;
 		map->charged -= vms->nr_accounted;
 		if (map->charged) {
@@ -2386,7 +2386,7 @@ static int __mmap_prepare(struct mmap_state *map, struct list_head *uf)
 		}
 
 		vms->nr_accounted = 0;
-		map->flags |= VM_ACCOUNT;
+		map->vm_flags |= VM_ACCOUNT;
 	}
 
 	/*
@@ -2430,12 +2430,12 @@ static int __mmap_new_file_vma(struct mmap_state *map,
 	 * Drivers should not permit writability when previously it was
 	 * disallowed.
 	 */
-	VM_WARN_ON_ONCE(map->flags != vma->vm_flags &&
-			!(map->flags & VM_MAYWRITE) &&
+	VM_WARN_ON_ONCE(map->vm_flags != vma->vm_flags &&
+			!(map->vm_flags & VM_MAYWRITE) &&
 			(vma->vm_flags & VM_MAYWRITE));
 
 	map->file = vma->vm_file;
-	map->flags = vma->vm_flags;
+	map->vm_flags = vma->vm_flags;
 
 	return 0;
 }
@@ -2466,7 +2466,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
 
 	vma_iter_config(vmi, map->addr, map->end);
 	vma_set_range(vma, map->addr, map->end, map->pgoff);
-	vm_flags_init(vma, map->flags);
+	vm_flags_init(vma, map->vm_flags);
 	vma->vm_page_prot = map->page_prot;
 
 	if (vma_iter_prealloc(vmi, vma)) {
@@ -2476,7 +2476,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
 
 	if (map->file)
 		error = __mmap_new_file_vma(map, vma);
-	else if (map->flags & VM_SHARED)
+	else if (map->vm_flags & VM_SHARED)
 		error = shmem_zero_setup(vma);
 	else
 		vma_set_anonymous(vma);
@@ -2486,12 +2486,12 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
 
 	if (!map->check_ksm_early) {
 		update_ksm_flags(map);
-		vm_flags_init(vma, map->flags);
+		vm_flags_init(vma, map->vm_flags);
 	}
 
 #ifdef CONFIG_SPARC64
 	/* TODO: Fix SPARC ADI! */
-	WARN_ON_ONCE(!arch_validate_flags(map->flags));
+	WARN_ON_ONCE(!arch_validate_flags(map->vm_flags));
 #endif
 
 	/* Lock the VMA since it is modified after insertion into VMA tree */
@@ -2505,7 +2505,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
 	 * call covers the non-merge case.
 	 */
 	if (!vma_is_anonymous(vma))
-		khugepaged_enter_vma(vma, map->flags);
+		khugepaged_enter_vma(vma, map->vm_flags);
 	*vmap = vma;
 	return 0;
 
@@ -2526,7 +2526,7 @@ free_vma:
 static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma)
 {
 	struct mm_struct *mm = map->mm;
-	unsigned long vm_flags = vma->vm_flags;
+	vm_flags_t vm_flags = vma->vm_flags;
 
 	perf_event_mmap(vma);
 
@@ -2579,7 +2579,7 @@ static int call_mmap_prepare(struct mmap_state *map)
 
 		.pgoff = map->pgoff,
 		.file = map->file,
-		.vm_flags = map->flags,
+		.vm_flags = map->vm_flags,
 		.page_prot = map->page_prot,
 	};
 
@@ -2591,7 +2591,7 @@ static int call_mmap_prepare(struct mmap_state *map)
 	/* Update fields permitted to be changed. */
 	map->pgoff = desc.pgoff;
 	map->file = desc.file;
-	map->flags = desc.vm_flags;
+	map->vm_flags = desc.vm_flags;
 	map->page_prot = desc.page_prot;
 	/* User-defined fields. */
 	map->vm_ops = desc.vm_ops;
@@ -2754,14 +2754,14 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
  * @addr: The start address
  * @len: The length of the increase
  * @vma: The vma,
- * @flags: The VMA Flags
+ * @vm_flags: The VMA Flags
  *
  * Extend the brk VMA from addr to addr + len.  If the VMA is NULL or the flags
  * do not match then create a new anonymous VMA.  Eventually we may be able to
  * do some brk-specific accounting here.
  */
 int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
-		 unsigned long addr, unsigned long len, unsigned long flags)
+		 unsigned long addr, unsigned long len, vm_flags_t vm_flags)
 {
 	struct mm_struct *mm = current->mm;
 
@@ -2769,9 +2769,9 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
 	 * Check against address space limits by the changed size
 	 * Note: This happens *after* clearing old mappings in some code paths.
 	 */
-	flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
-	flags = ksm_vma_flags(mm, NULL, flags);
-	if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
+	vm_flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
+	vm_flags = ksm_vma_flags(mm, NULL, vm_flags);
+	if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT))
 		return -ENOMEM;
 
 	if (mm->map_count > sysctl_max_map_count)
@@ -2785,7 +2785,7 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
 	 * occur after forking, so the expand will only happen on new VMAs.
 	 */
 	if (vma && vma->vm_end == addr) {
-		VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr));
+		VMG_STATE(vmg, mm, vmi, addr, addr + len, vm_flags, PHYS_PFN(addr));
 
 		vmg.prev = vma;
 		/* vmi is positioned at prev, which this mode expects. */
@@ -2806,8 +2806,8 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
 
 	vma_set_anonymous(vma);
 	vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT);
-	vm_flags_init(vma, flags);
-	vma->vm_page_prot = vm_get_page_prot(flags);
+	vm_flags_init(vma, vm_flags);
+	vma->vm_page_prot = vm_get_page_prot(vm_flags);
 	vma_start_write(vma);
 	if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL))
 		goto mas_store_fail;
@@ -2818,7 +2818,7 @@ out:
 	perf_event_mmap(vma);
 	mm->total_vm += len >> PAGE_SHIFT;
 	mm->data_vm += len >> PAGE_SHIFT;
-	if (flags & VM_LOCKED)
+	if (vm_flags & VM_LOCKED)
 		mm->locked_vm += (len >> PAGE_SHIFT);
 	vm_flags_set(vma, VM_SOFTDIRTY);
 	return 0;
diff --git a/mm/vma.h b/mm/vma.h
index f47112a352db..cf6e3a6371b6 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -98,7 +98,7 @@ struct vma_merge_struct {
 	unsigned long end;
 	pgoff_t pgoff;
 
-	unsigned long flags;
+	vm_flags_t vm_flags;
 	struct file *file;
 	struct anon_vma *anon_vma;
 	struct mempolicy *policy;
@@ -164,13 +164,13 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma,
 	return vma->vm_pgoff + PHYS_PFN(addr - vma->vm_start);
 }
 
-#define VMG_STATE(name, mm_, vmi_, start_, end_, flags_, pgoff_)	\
+#define VMG_STATE(name, mm_, vmi_, start_, end_, vm_flags_, pgoff_)	\
 	struct vma_merge_struct name = {				\
 		.mm = mm_,						\
 		.vmi = vmi_,						\
 		.start = start_,					\
 		.end = end_,						\
-		.flags = flags_,					\
+		.vm_flags = vm_flags_,					\
 		.pgoff = pgoff_,					\
 		.state = VMA_MERGE_START,				\
 	}
@@ -184,7 +184,7 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma,
 		.next = NULL,					\
 		.start = start_,				\
 		.end = end_,					\
-		.flags = vma_->vm_flags,			\
+		.vm_flags = vma_->vm_flags,			\
 		.pgoff = vma_pgoff_offset(vma_, start_),	\
 		.file = vma_->vm_file,				\
 		.anon_vma = vma_->anon_vma,			\
@@ -288,7 +288,7 @@ __must_check struct vm_area_struct
 *vma_modify_flags(struct vma_iterator *vmi,
 		struct vm_area_struct *prev, struct vm_area_struct *vma,
 		unsigned long start, unsigned long end,
-		unsigned long new_flags);
+		vm_flags_t vm_flags);
 
 /* We are about to modify the VMA's flags and/or anon_name. */
 __must_check struct vm_area_struct
@@ -297,7 +297,7 @@ __must_check struct vm_area_struct
 		       struct vm_area_struct *vma,
 		       unsigned long start,
 		       unsigned long end,
-		       unsigned long new_flags,
+		       vm_flags_t vm_flags,
 		       struct anon_vma_name *new_name);
 
 /* We are about to modify the VMA's memory policy. */
@@ -314,7 +314,7 @@ __must_check struct vm_area_struct
 		       struct vm_area_struct *prev,
 		       struct vm_area_struct *vma,
 		       unsigned long start, unsigned long end,
-		       unsigned long new_flags,
+		       vm_flags_t vm_flags,
 		       struct vm_userfaultfd_ctx new_ctx,
 		       bool give_up_on_oom);
 
@@ -375,7 +375,7 @@ static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma
 }
 
 #ifdef CONFIG_MMU
-static inline pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
+static inline pgprot_t vm_pgprot_modify(pgprot_t oldprot, vm_flags_t vm_flags)
 {
 	return pgprot_modify(oldprot, vm_get_page_prot(vm_flags));
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 620dce753b64..56d540b8a1d0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -907,7 +907,7 @@ static enum folio_references folio_check_references(struct folio *folio,
 						  struct scan_control *sc)
 {
 	int referenced_ptes, referenced_folio;
-	unsigned long vm_flags;
+	vm_flags_t vm_flags;
 
 	referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup,
 					   &vm_flags);
@@ -2120,7 +2120,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
 {
 	unsigned long nr_taken;
 	unsigned long nr_scanned;
-	unsigned long vm_flags;
+	vm_flags_t vm_flags;
 	LIST_HEAD(l_hold);	/* The folios which were snipped off */
 	LIST_HEAD(l_active);
 	LIST_HEAD(l_inactive);
diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c
index 7fec5b3de83f..656e1c75b711 100644
--- a/tools/testing/vma/vma.c
+++ b/tools/testing/vma/vma.c
@@ -65,7 +65,7 @@ static struct vm_area_struct *alloc_vma(struct mm_struct *mm,
 					unsigned long start,
 					unsigned long end,
 					pgoff_t pgoff,
-					vm_flags_t flags)
+					vm_flags_t vm_flags)
 {
 	struct vm_area_struct *ret = vm_area_alloc(mm);
 
@@ -75,7 +75,7 @@ static struct vm_area_struct *alloc_vma(struct mm_struct *mm,
 	ret->vm_start = start;
 	ret->vm_end = end;
 	ret->vm_pgoff = pgoff;
-	ret->__vm_flags = flags;
+	ret->__vm_flags = vm_flags;
 	vma_assert_detached(ret);
 
 	return ret;
@@ -103,9 +103,9 @@ static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm,
 						 unsigned long start,
 						 unsigned long end,
 						 pgoff_t pgoff,
-						 vm_flags_t flags)
+						 vm_flags_t vm_flags)
 {
-	struct vm_area_struct *vma = alloc_vma(mm, start, end, pgoff, flags);
+	struct vm_area_struct *vma = alloc_vma(mm, start, end, pgoff, vm_flags);
 
 	if (vma == NULL)
 		return NULL;
@@ -172,7 +172,7 @@ static int expand_existing(struct vma_merge_struct *vmg)
  * specified new range.
  */
 static void vmg_set_range(struct vma_merge_struct *vmg, unsigned long start,
-			  unsigned long end, pgoff_t pgoff, vm_flags_t flags)
+			  unsigned long end, pgoff_t pgoff, vm_flags_t vm_flags)
 {
 	vma_iter_set(vmg->vmi, start);
 
@@ -184,7 +184,7 @@ static void vmg_set_range(struct vma_merge_struct *vmg, unsigned long start,
 	vmg->start = start;
 	vmg->end = end;
 	vmg->pgoff = pgoff;
-	vmg->flags = flags;
+	vmg->vm_flags = vm_flags;
 
 	vmg->just_expand = false;
 	vmg->__remove_middle = false;
@@ -195,10 +195,10 @@ static void vmg_set_range(struct vma_merge_struct *vmg, unsigned long start,
 
 /* Helper function to set both the VMG range and its anon_vma. */
 static void vmg_set_range_anon_vma(struct vma_merge_struct *vmg, unsigned long start,
-				   unsigned long end, pgoff_t pgoff, vm_flags_t flags,
+				   unsigned long end, pgoff_t pgoff, vm_flags_t vm_flags,
 				   struct anon_vma *anon_vma)
 {
-	vmg_set_range(vmg, start, end, pgoff, flags);
+	vmg_set_range(vmg, start, end, pgoff, vm_flags);
 	vmg->anon_vma = anon_vma;
 }
 
@@ -211,12 +211,12 @@ static void vmg_set_range_anon_vma(struct vma_merge_struct *vmg, unsigned long s
 static struct vm_area_struct *try_merge_new_vma(struct mm_struct *mm,
 						struct vma_merge_struct *vmg,
 						unsigned long start, unsigned long end,
-						pgoff_t pgoff, vm_flags_t flags,
+						pgoff_t pgoff, vm_flags_t vm_flags,
 						bool *was_merged)
 {
 	struct vm_area_struct *merged;
 
-	vmg_set_range(vmg, start, end, pgoff, flags);
+	vmg_set_range(vmg, start, end, pgoff, vm_flags);
 
 	merged = merge_new(vmg);
 	if (merged) {
@@ -229,7 +229,7 @@ static struct vm_area_struct *try_merge_new_vma(struct mm_struct *mm,
 
 	ASSERT_EQ(vmg->state, VMA_MERGE_NOMERGE);
 
-	return alloc_and_link_vma(mm, start, end, pgoff, flags);
+	return alloc_and_link_vma(mm, start, end, pgoff, vm_flags);
 }
 
 /*
@@ -301,17 +301,17 @@ static void vma_set_dummy_anon_vma(struct vm_area_struct *vma,
 static bool test_simple_merge(void)
 {
 	struct vm_area_struct *vma;
-	unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
 	struct mm_struct mm = {};
-	struct vm_area_struct *vma_left = alloc_vma(&mm, 0, 0x1000, 0, flags);
-	struct vm_area_struct *vma_right = alloc_vma(&mm, 0x2000, 0x3000, 2, flags);
+	struct vm_area_struct *vma_left = alloc_vma(&mm, 0, 0x1000, 0, vm_flags);
+	struct vm_area_struct *vma_right = alloc_vma(&mm, 0x2000, 0x3000, 2, vm_flags);
 	VMA_ITERATOR(vmi, &mm, 0x1000);
 	struct vma_merge_struct vmg = {
 		.mm = &mm,
 		.vmi = &vmi,
 		.start = 0x1000,
 		.end = 0x2000,
-		.flags = flags,
+		.vm_flags = vm_flags,
 		.pgoff = 1,
 	};
 
@@ -324,7 +324,7 @@ static bool test_simple_merge(void)
 	ASSERT_EQ(vma->vm_start, 0);
 	ASSERT_EQ(vma->vm_end, 0x3000);
 	ASSERT_EQ(vma->vm_pgoff, 0);
-	ASSERT_EQ(vma->vm_flags, flags);
+	ASSERT_EQ(vma->vm_flags, vm_flags);
 
 	detach_free_vma(vma);
 	mtree_destroy(&mm.mm_mt);
@@ -335,9 +335,9 @@ static bool test_simple_merge(void)
 static bool test_simple_modify(void)
 {
 	struct vm_area_struct *vma;
-	unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
 	struct mm_struct mm = {};
-	struct vm_area_struct *init_vma = alloc_vma(&mm, 0, 0x3000, 0, flags);
+	struct vm_area_struct *init_vma = alloc_vma(&mm, 0, 0x3000, 0, vm_flags);
 	VMA_ITERATOR(vmi, &mm, 0x1000);
 
 	ASSERT_FALSE(attach_vma(&mm, init_vma));
@@ -394,9 +394,9 @@ static bool test_simple_modify(void)
 
 static bool test_simple_expand(void)
 {
-	unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
 	struct mm_struct mm = {};
-	struct vm_area_struct *vma = alloc_vma(&mm, 0, 0x1000, 0, flags);
+	struct vm_area_struct *vma = alloc_vma(&mm, 0, 0x1000, 0, vm_flags);
 	VMA_ITERATOR(vmi, &mm, 0);
 	struct vma_merge_struct vmg = {
 		.vmi = &vmi,
@@ -422,9 +422,9 @@ static bool test_simple_expand(void)
 
 static bool test_simple_shrink(void)
 {
-	unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
 	struct mm_struct mm = {};
-	struct vm_area_struct *vma = alloc_vma(&mm, 0, 0x3000, 0, flags);
+	struct vm_area_struct *vma = alloc_vma(&mm, 0, 0x3000, 0, vm_flags);
 	VMA_ITERATOR(vmi, &mm, 0);
 
 	ASSERT_FALSE(attach_vma(&mm, vma));
@@ -443,7 +443,7 @@ static bool test_simple_shrink(void)
 
 static bool test_merge_new(void)
 {
-	unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
 	struct mm_struct mm = {};
 	VMA_ITERATOR(vmi, &mm, 0);
 	struct vma_merge_struct vmg = {
@@ -473,18 +473,18 @@ static bool test_merge_new(void)
 	 * 0123456789abc
 	 * AA B       CC
 	 */
-	vma_a = alloc_and_link_vma(&mm, 0, 0x2000, 0, flags);
+	vma_a = alloc_and_link_vma(&mm, 0, 0x2000, 0, vm_flags);
 	ASSERT_NE(vma_a, NULL);
 	/* We give each VMA a single avc so we can test anon_vma duplication. */
 	INIT_LIST_HEAD(&vma_a->anon_vma_chain);
 	list_add(&dummy_anon_vma_chain_a.same_vma, &vma_a->anon_vma_chain);
 
-	vma_b = alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, flags);
+	vma_b = alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, vm_flags);
 	ASSERT_NE(vma_b, NULL);
 	INIT_LIST_HEAD(&vma_b->anon_vma_chain);
 	list_add(&dummy_anon_vma_chain_b.same_vma, &vma_b->anon_vma_chain);
 
-	vma_c = alloc_and_link_vma(&mm, 0xb000, 0xc000, 0xb, flags);
+	vma_c = alloc_and_link_vma(&mm, 0xb000, 0xc000, 0xb, vm_flags);
 	ASSERT_NE(vma_c, NULL);
 	INIT_LIST_HEAD(&vma_c->anon_vma_chain);
 	list_add(&dummy_anon_vma_chain_c.same_vma, &vma_c->anon_vma_chain);
@@ -495,7 +495,7 @@ static bool test_merge_new(void)
 	 * 0123456789abc
 	 * AA B   **  CC
 	 */
-	vma_d = try_merge_new_vma(&mm, &vmg, 0x7000, 0x9000, 7, flags, &merged);
+	vma_d = try_merge_new_vma(&mm, &vmg, 0x7000, 0x9000, 7, vm_flags, &merged);
 	ASSERT_NE(vma_d, NULL);
 	INIT_LIST_HEAD(&vma_d->anon_vma_chain);
 	list_add(&dummy_anon_vma_chain_d.same_vma, &vma_d->anon_vma_chain);
@@ -510,7 +510,7 @@ static bool test_merge_new(void)
 	 */
 	vma_a->vm_ops = &vm_ops; /* This should have no impact. */
 	vma_b->anon_vma = &dummy_anon_vma;
-	vma = try_merge_new_vma(&mm, &vmg, 0x2000, 0x3000, 2, flags, &merged);
+	vma = try_merge_new_vma(&mm, &vmg, 0x2000, 0x3000, 2, vm_flags, &merged);
 	ASSERT_EQ(vma, vma_a);
 	/* Merge with A, delete B. */
 	ASSERT_TRUE(merged);
@@ -527,7 +527,7 @@ static bool test_merge_new(void)
 	 * 0123456789abc
 	 * AAAA*  DD  CC
 	 */
-	vma = try_merge_new_vma(&mm, &vmg, 0x4000, 0x5000, 4, flags, &merged);
+	vma = try_merge_new_vma(&mm, &vmg, 0x4000, 0x5000, 4, vm_flags, &merged);
 	ASSERT_EQ(vma, vma_a);
 	/* Extend A. */
 	ASSERT_TRUE(merged);
@@ -546,7 +546,7 @@ static bool test_merge_new(void)
 	 */
 	vma_d->anon_vma = &dummy_anon_vma;
 	vma_d->vm_ops = &vm_ops; /* This should have no impact. */
-	vma = try_merge_new_vma(&mm, &vmg, 0x6000, 0x7000, 6, flags, &merged);
+	vma = try_merge_new_vma(&mm, &vmg, 0x6000, 0x7000, 6, vm_flags, &merged);
 	ASSERT_EQ(vma, vma_d);
 	/* Prepend. */
 	ASSERT_TRUE(merged);
@@ -564,7 +564,7 @@ static bool test_merge_new(void)
 	 * AAAAA*DDD  CC
 	 */
 	vma_d->vm_ops = NULL; /* This would otherwise degrade the merge. */
-	vma = try_merge_new_vma(&mm, &vmg, 0x5000, 0x6000, 5, flags, &merged);
+	vma = try_merge_new_vma(&mm, &vmg, 0x5000, 0x6000, 5, vm_flags, &merged);
 	ASSERT_EQ(vma, vma_a);
 	/* Merge with A, delete D. */
 	ASSERT_TRUE(merged);
@@ -582,7 +582,7 @@ static bool test_merge_new(void)
 	 * AAAAAAAAA *CC
 	 */
 	vma_c->anon_vma = &dummy_anon_vma;
-	vma = try_merge_new_vma(&mm, &vmg, 0xa000, 0xb000, 0xa, flags, &merged);
+	vma = try_merge_new_vma(&mm, &vmg, 0xa000, 0xb000, 0xa, vm_flags, &merged);
 	ASSERT_EQ(vma, vma_c);
 	/* Prepend C. */
 	ASSERT_TRUE(merged);
@@ -599,7 +599,7 @@ static bool test_merge_new(void)
 	 * 0123456789abc
 	 * AAAAAAAAA*CCC
 	 */
-	vma = try_merge_new_vma(&mm, &vmg, 0x9000, 0xa000, 0x9, flags, &merged);
+	vma = try_merge_new_vma(&mm, &vmg, 0x9000, 0xa000, 0x9, vm_flags, &merged);
 	ASSERT_EQ(vma, vma_a);
 	/* Extend A and delete C. */
 	ASSERT_TRUE(merged);
@@ -639,7 +639,7 @@ static bool test_merge_new(void)
 
 static bool test_vma_merge_special_flags(void)
 {
-	unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
 	struct mm_struct mm = {};
 	VMA_ITERATOR(vmi, &mm, 0);
 	struct vma_merge_struct vmg = {
@@ -661,7 +661,7 @@ static bool test_vma_merge_special_flags(void)
 	 * 01234
 	 * AAA
 	 */
-	vma_left = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
+	vma_left = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
 	ASSERT_NE(vma_left, NULL);
 
 	/* 1. Set up new VMA with special flag that would otherwise merge. */
@@ -672,12 +672,12 @@ static bool test_vma_merge_special_flags(void)
 	 *
 	 * This should merge if not for the VM_SPECIAL flag.
 	 */
-	vmg_set_range(&vmg, 0x3000, 0x4000, 3, flags);
+	vmg_set_range(&vmg, 0x3000, 0x4000, 3, vm_flags);
 	for (i = 0; i < ARRAY_SIZE(special_flags); i++) {
 		vm_flags_t special_flag = special_flags[i];
 
-		vma_left->__vm_flags = flags | special_flag;
-		vmg.flags = flags | special_flag;
+		vma_left->__vm_flags = vm_flags | special_flag;
+		vmg.vm_flags = vm_flags | special_flag;
 		vma = merge_new(&vmg);
 		ASSERT_EQ(vma, NULL);
 		ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
@@ -691,15 +691,15 @@ static bool test_vma_merge_special_flags(void)
 	 *
 	 * Create a VMA to modify.
 	 */
-	vma = alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, flags);
+	vma = alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, vm_flags);
 	ASSERT_NE(vma, NULL);
 	vmg.middle = vma;
 
 	for (i = 0; i < ARRAY_SIZE(special_flags); i++) {
 		vm_flags_t special_flag = special_flags[i];
 
-		vma_left->__vm_flags = flags | special_flag;
-		vmg.flags = flags | special_flag;
+		vma_left->__vm_flags = vm_flags | special_flag;
+		vmg.vm_flags = vm_flags | special_flag;
 		vma = merge_existing(&vmg);
 		ASSERT_EQ(vma, NULL);
 		ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
@@ -711,7 +711,7 @@ static bool test_vma_merge_special_flags(void)
 
 static bool test_vma_merge_with_close(void)
 {
-	unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
 	struct mm_struct mm = {};
 	VMA_ITERATOR(vmi, &mm, 0);
 	struct vma_merge_struct vmg = {
@@ -791,11 +791,11 @@ static bool test_vma_merge_with_close(void)
 	 * PPPPPPNNN
 	 */
 
-	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
-	vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, flags);
+	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
+	vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, vm_flags);
 	vma_next->vm_ops = &vm_ops;
 
-	vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags);
+	vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags);
 	ASSERT_EQ(merge_new(&vmg), vma_prev);
 	ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
 	ASSERT_EQ(vma_prev->vm_start, 0);
@@ -816,11 +816,11 @@ static bool test_vma_merge_with_close(void)
 	 * proceed.
 	 */
 
-	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
-	vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags);
+	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
+	vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
 	vma->vm_ops = &vm_ops;
 
-	vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags);
+	vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags);
 	vmg.prev = vma_prev;
 	vmg.middle = vma;
 
@@ -844,11 +844,11 @@ static bool test_vma_merge_with_close(void)
 	 * proceed.
 	 */
 
-	vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags);
-	vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, flags);
+	vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
+	vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, vm_flags);
 	vma->vm_ops = &vm_ops;
 
-	vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags);
+	vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags);
 	vmg.middle = vma;
 	ASSERT_EQ(merge_existing(&vmg), NULL);
 	/*
@@ -872,12 +872,12 @@ static bool test_vma_merge_with_close(void)
 	 * PPPVVNNNN
 	 */
 
-	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
-	vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags);
-	vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, flags);
+	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
+	vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
+	vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, vm_flags);
 	vma->vm_ops = &vm_ops;
 
-	vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags);
+	vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags);
 	vmg.prev = vma_prev;
 	vmg.middle = vma;
 
@@ -898,12 +898,12 @@ static bool test_vma_merge_with_close(void)
 	 * PPPPPNNNN
 	 */
 
-	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
-	vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags);
-	vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, flags);
+	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
+	vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
+	vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, vm_flags);
 	vma_next->vm_ops = &vm_ops;
 
-	vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags);
+	vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags);
 	vmg.prev = vma_prev;
 	vmg.middle = vma;
 
@@ -920,15 +920,15 @@ static bool test_vma_merge_with_close(void)
 
 static bool test_vma_merge_new_with_close(void)
 {
-	unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
 	struct mm_struct mm = {};
 	VMA_ITERATOR(vmi, &mm, 0);
 	struct vma_merge_struct vmg = {
 		.mm = &mm,
 		.vmi = &vmi,
 	};
-	struct vm_area_struct *vma_prev = alloc_and_link_vma(&mm, 0, 0x2000, 0, flags);
-	struct vm_area_struct *vma_next = alloc_and_link_vma(&mm, 0x5000, 0x7000, 5, flags);
+	struct vm_area_struct *vma_prev = alloc_and_link_vma(&mm, 0, 0x2000, 0, vm_flags);
+	struct vm_area_struct *vma_next = alloc_and_link_vma(&mm, 0x5000, 0x7000, 5, vm_flags);
 	const struct vm_operations_struct vm_ops = {
 		.close = dummy_close,
 	};
@@ -958,7 +958,7 @@ static bool test_vma_merge_new_with_close(void)
 	vma_prev->vm_ops = &vm_ops;
 	vma_next->vm_ops = &vm_ops;
 
-	vmg_set_range(&vmg, 0x2000, 0x5000, 2, flags);
+	vmg_set_range(&vmg, 0x2000, 0x5000, 2, vm_flags);
 	vma = merge_new(&vmg);
 	ASSERT_NE(vma, NULL);
 	ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS);
@@ -975,7 +975,7 @@ static bool test_vma_merge_new_with_close(void)
 
 static bool test_merge_existing(void)
 {
-	unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
 	struct mm_struct mm = {};
 	VMA_ITERATOR(vmi, &mm, 0);
 	struct vm_area_struct *vma, *vma_prev, *vma_next;
@@ -998,11 +998,11 @@ static bool test_merge_existing(void)
 	 * 0123456789
 	 *   VNNNNNN
 	 */
-	vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, flags);
+	vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, vm_flags);
 	vma->vm_ops = &vm_ops; /* This should have no impact. */
-	vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, flags);
+	vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, vm_flags);
 	vma_next->vm_ops = &vm_ops; /* This should have no impact. */
-	vmg_set_range_anon_vma(&vmg, 0x3000, 0x6000, 3, flags, &dummy_anon_vma);
+	vmg_set_range_anon_vma(&vmg, 0x3000, 0x6000, 3, vm_flags, &dummy_anon_vma);
 	vmg.middle = vma;
 	vmg.prev = vma;
 	vma_set_dummy_anon_vma(vma, &avc);
@@ -1032,10 +1032,10 @@ static bool test_merge_existing(void)
 	 * 0123456789
 	 *   NNNNNNN
 	 */
-	vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, flags);
-	vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, flags);
+	vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, vm_flags);
+	vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, vm_flags);
 	vma_next->vm_ops = &vm_ops; /* This should have no impact. */
-	vmg_set_range_anon_vma(&vmg, 0x2000, 0x6000, 2, flags, &dummy_anon_vma);
+	vmg_set_range_anon_vma(&vmg, 0x2000, 0x6000, 2, vm_flags, &dummy_anon_vma);
 	vmg.middle = vma;
 	vma_set_dummy_anon_vma(vma, &avc);
 	ASSERT_EQ(merge_existing(&vmg), vma_next);
@@ -1060,11 +1060,11 @@ static bool test_merge_existing(void)
 	 * 0123456789
 	 * PPPPPPV
 	 */
-	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
+	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
 	vma_prev->vm_ops = &vm_ops; /* This should have no impact. */
-	vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags);
+	vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vm_flags);
 	vma->vm_ops = &vm_ops; /* This should have no impact. */
-	vmg_set_range_anon_vma(&vmg, 0x3000, 0x6000, 3, flags, &dummy_anon_vma);
+	vmg_set_range_anon_vma(&vmg, 0x3000, 0x6000, 3, vm_flags, &dummy_anon_vma);
 	vmg.prev = vma_prev;
 	vmg.middle = vma;
 	vma_set_dummy_anon_vma(vma, &avc);
@@ -1094,10 +1094,10 @@ static bool test_merge_existing(void)
 	 * 0123456789
 	 * PPPPPPP
 	 */
-	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
+	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
 	vma_prev->vm_ops = &vm_ops; /* This should have no impact. */
-	vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags);
-	vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, flags, &dummy_anon_vma);
+	vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vm_flags);
+	vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, vm_flags, &dummy_anon_vma);
 	vmg.prev = vma_prev;
 	vmg.middle = vma;
 	vma_set_dummy_anon_vma(vma, &avc);
@@ -1123,11 +1123,11 @@ static bool test_merge_existing(void)
 	 * 0123456789
 	 * PPPPPPPPPP
 	 */
-	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
+	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
 	vma_prev->vm_ops = &vm_ops; /* This should have no impact. */
-	vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags);
-	vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, flags);
-	vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, flags, &dummy_anon_vma);
+	vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vm_flags);
+	vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, vm_flags);
+	vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, vm_flags, &dummy_anon_vma);
 	vmg.prev = vma_prev;
 	vmg.middle = vma;
 	vma_set_dummy_anon_vma(vma, &avc);
@@ -1158,41 +1158,41 @@ static bool test_merge_existing(void)
 	 * PPPVVVVVNNN
 	 */
 
-	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
-	vma = alloc_and_link_vma(&mm, 0x3000, 0x8000, 3, flags);
-	vma_next = alloc_and_link_vma(&mm, 0x8000, 0xa000, 8, flags);
+	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
+	vma = alloc_and_link_vma(&mm, 0x3000, 0x8000, 3, vm_flags);
+	vma_next = alloc_and_link_vma(&mm, 0x8000, 0xa000, 8, vm_flags);
 
-	vmg_set_range(&vmg, 0x4000, 0x5000, 4, flags);
+	vmg_set_range(&vmg, 0x4000, 0x5000, 4, vm_flags);
 	vmg.prev = vma;
 	vmg.middle = vma;
 	ASSERT_EQ(merge_existing(&vmg), NULL);
 	ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
 
-	vmg_set_range(&vmg, 0x5000, 0x6000, 5, flags);
+	vmg_set_range(&vmg, 0x5000, 0x6000, 5, vm_flags);
 	vmg.prev = vma;
 	vmg.middle = vma;
 	ASSERT_EQ(merge_existing(&vmg), NULL);
 	ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
 
-	vmg_set_range(&vmg, 0x6000, 0x7000, 6, flags);
+	vmg_set_range(&vmg, 0x6000, 0x7000, 6, vm_flags);
 	vmg.prev = vma;
 	vmg.middle = vma;
 	ASSERT_EQ(merge_existing(&vmg), NULL);
 	ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
 
-	vmg_set_range(&vmg, 0x4000, 0x7000, 4, flags);
+	vmg_set_range(&vmg, 0x4000, 0x7000, 4, vm_flags);
 	vmg.prev = vma;
 	vmg.middle = vma;
 	ASSERT_EQ(merge_existing(&vmg), NULL);
 	ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
 
-	vmg_set_range(&vmg, 0x4000, 0x6000, 4, flags);
+	vmg_set_range(&vmg, 0x4000, 0x6000, 4, vm_flags);
 	vmg.prev = vma;
 	vmg.middle = vma;
 	ASSERT_EQ(merge_existing(&vmg), NULL);
 	ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE);
 
-	vmg_set_range(&vmg, 0x5000, 0x6000, 5, flags);
+	vmg_set_range(&vmg, 0x5000, 0x6000, 5, vm_flags);
 	vmg.prev = vma;
 	vmg.middle = vma;
 	ASSERT_EQ(merge_existing(&vmg), NULL);
@@ -1205,7 +1205,7 @@ static bool test_merge_existing(void)
 
 static bool test_anon_vma_non_mergeable(void)
 {
-	unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
 	struct mm_struct mm = {};
 	VMA_ITERATOR(vmi, &mm, 0);
 	struct vm_area_struct *vma, *vma_prev, *vma_next;
@@ -1229,9 +1229,9 @@ static bool test_anon_vma_non_mergeable(void)
 	 * 0123456789
 	 * PPPPPPPNNN
 	 */
-	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
-	vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags);
-	vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, flags);
+	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
+	vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vm_flags);
+	vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, vm_flags);
 
 	/*
 	 * Give both prev and next single anon_vma_chain fields, so they will
@@ -1239,7 +1239,7 @@ static bool test_anon_vma_non_mergeable(void)
 	 *
 	 * However, when prev is compared to next, the merge should fail.
 	 */
-	vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, flags, NULL);
+	vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, vm_flags, NULL);
 	vmg.prev = vma_prev;
 	vmg.middle = vma;
 	vma_set_dummy_anon_vma(vma_prev, &dummy_anon_vma_chain_1);
@@ -1267,10 +1267,10 @@ static bool test_anon_vma_non_mergeable(void)
 	 * 0123456789
 	 * PPPPPPPNNN
 	 */
-	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
-	vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, flags);
+	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
+	vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, vm_flags);
 
-	vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, flags, NULL);
+	vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, vm_flags, NULL);
 	vmg.prev = vma_prev;
 	vma_set_dummy_anon_vma(vma_prev, &dummy_anon_vma_chain_1);
 	__vma_set_dummy_anon_vma(vma_next, &dummy_anon_vma_chain_2, &dummy_anon_vma_2);
@@ -1292,7 +1292,7 @@ static bool test_anon_vma_non_mergeable(void)
 
 static bool test_dup_anon_vma(void)
 {
-	unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
 	struct mm_struct mm = {};
 	VMA_ITERATOR(vmi, &mm, 0);
 	struct vma_merge_struct vmg = {
@@ -1313,11 +1313,11 @@ static bool test_dup_anon_vma(void)
 	 * This covers new VMA merging, as these operations amount to a VMA
 	 * expand.
 	 */
-	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
-	vma_next = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags);
+	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
+	vma_next = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
 	vma_next->anon_vma = &dummy_anon_vma;
 
-	vmg_set_range(&vmg, 0, 0x5000, 0, flags);
+	vmg_set_range(&vmg, 0, 0x5000, 0, vm_flags);
 	vmg.target = vma_prev;
 	vmg.next = vma_next;
 
@@ -1339,16 +1339,16 @@ static bool test_dup_anon_vma(void)
 	 *  extend   delete  delete
 	 */
 
-	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
-	vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags);
-	vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, flags);
+	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
+	vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
+	vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, vm_flags);
 
 	/* Initialise avc so mergeability check passes. */
 	INIT_LIST_HEAD(&vma_next->anon_vma_chain);
 	list_add(&dummy_anon_vma_chain.same_vma, &vma_next->anon_vma_chain);
 
 	vma_next->anon_vma = &dummy_anon_vma;
-	vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags);
+	vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags);
 	vmg.prev = vma_prev;
 	vmg.middle = vma;
 
@@ -1372,12 +1372,12 @@ static bool test_dup_anon_vma(void)
 	 *  extend   delete  delete
 	 */
 
-	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
-	vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags);
-	vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, flags);
+	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
+	vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
+	vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, vm_flags);
 	vmg.anon_vma = &dummy_anon_vma;
 	vma_set_dummy_anon_vma(vma, &dummy_anon_vma_chain);
-	vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags);
+	vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags);
 	vmg.prev = vma_prev;
 	vmg.middle = vma;
 
@@ -1401,11 +1401,11 @@ static bool test_dup_anon_vma(void)
 	 *  extend shrink/delete
 	 */
 
-	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
-	vma = alloc_and_link_vma(&mm, 0x3000, 0x8000, 3, flags);
+	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
+	vma = alloc_and_link_vma(&mm, 0x3000, 0x8000, 3, vm_flags);
 
 	vma_set_dummy_anon_vma(vma, &dummy_anon_vma_chain);
-	vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags);
+	vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags);
 	vmg.prev = vma_prev;
 	vmg.middle = vma;
 
@@ -1429,11 +1429,11 @@ static bool test_dup_anon_vma(void)
 	 * shrink/delete extend
 	 */
 
-	vma = alloc_and_link_vma(&mm, 0, 0x5000, 0, flags);
-	vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, flags);
+	vma = alloc_and_link_vma(&mm, 0, 0x5000, 0, vm_flags);
+	vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, vm_flags);
 
 	vma_set_dummy_anon_vma(vma, &dummy_anon_vma_chain);
-	vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags);
+	vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags);
 	vmg.prev = vma;
 	vmg.middle = vma;
 
@@ -1452,7 +1452,7 @@ static bool test_dup_anon_vma(void)
 
 static bool test_vmi_prealloc_fail(void)
 {
-	unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
 	struct mm_struct mm = {};
 	VMA_ITERATOR(vmi, &mm, 0);
 	struct vma_merge_struct vmg = {
@@ -1468,11 +1468,11 @@ static bool test_vmi_prealloc_fail(void)
 	 * the duplicated anon_vma is unlinked.
 	 */
 
-	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
-	vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags);
+	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
+	vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
 	vma->anon_vma = &dummy_anon_vma;
 
-	vmg_set_range_anon_vma(&vmg, 0x3000, 0x5000, 3, flags, &dummy_anon_vma);
+	vmg_set_range_anon_vma(&vmg, 0x3000, 0x5000, 3, vm_flags, &dummy_anon_vma);
 	vmg.prev = vma_prev;
 	vmg.middle = vma;
 	vma_set_dummy_anon_vma(vma, &avc);
@@ -1496,11 +1496,11 @@ static bool test_vmi_prealloc_fail(void)
 	 * performed in this case too.
 	 */
 
-	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags);
-	vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags);
+	vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags);
+	vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
 	vma->anon_vma = &dummy_anon_vma;
 
-	vmg_set_range(&vmg, 0, 0x5000, 3, flags);
+	vmg_set_range(&vmg, 0, 0x5000, 3, vm_flags);
 	vmg.target = vma_prev;
 	vmg.next = vma;
 
@@ -1518,13 +1518,13 @@ static bool test_vmi_prealloc_fail(void)
 
 static bool test_merge_extend(void)
 {
-	unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
 	struct mm_struct mm = {};
 	VMA_ITERATOR(vmi, &mm, 0x1000);
 	struct vm_area_struct *vma;
 
-	vma = alloc_and_link_vma(&mm, 0, 0x1000, 0, flags);
-	alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, flags);
+	vma = alloc_and_link_vma(&mm, 0, 0x1000, 0, vm_flags);
+	alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, vm_flags);
 
 	/*
 	 * Extend a VMA into the gap between itself and the following VMA.
@@ -1548,7 +1548,7 @@ static bool test_merge_extend(void)
 
 static bool test_copy_vma(void)
 {
-	unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
 	struct mm_struct mm = {};
 	bool need_locks = false;
 	VMA_ITERATOR(vmi, &mm, 0);
@@ -1556,7 +1556,7 @@ static bool test_copy_vma(void)
 
 	/* Move backwards and do not merge. */
 
-	vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags);
+	vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
 	vma_new = copy_vma(&vma, 0, 0x2000, 0, &need_locks);
 	ASSERT_NE(vma_new, vma);
 	ASSERT_EQ(vma_new->vm_start, 0);
@@ -1568,8 +1568,8 @@ static bool test_copy_vma(void)
 
 	/* Move a VMA into position next to another and merge the two. */
 
-	vma = alloc_and_link_vma(&mm, 0, 0x2000, 0, flags);
-	vma_next = alloc_and_link_vma(&mm, 0x6000, 0x8000, 6, flags);
+	vma = alloc_and_link_vma(&mm, 0, 0x2000, 0, vm_flags);
+	vma_next = alloc_and_link_vma(&mm, 0x6000, 0x8000, 6, vm_flags);
 	vma_new = copy_vma(&vma, 0x4000, 0x2000, 4, &need_locks);
 	vma_assert_attached(vma_new);
 
@@ -1581,11 +1581,11 @@ static bool test_copy_vma(void)
 
 static bool test_expand_only_mode(void)
 {
-	unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
+	vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
 	struct mm_struct mm = {};
 	VMA_ITERATOR(vmi, &mm, 0);
 	struct vm_area_struct *vma_prev, *vma;
-	VMG_STATE(vmg, &mm, &vmi, 0x5000, 0x9000, flags, 5);
+	VMG_STATE(vmg, &mm, &vmi, 0x5000, 0x9000, vm_flags, 5);
 
 	/*
 	 * Place a VMA prior to the one we're expanding so we assert that we do
@@ -1593,14 +1593,14 @@ static bool test_expand_only_mode(void)
 	 * have, through the use of the just_expand flag, indicated we do not
 	 * need to do so.
 	 */
-	alloc_and_link_vma(&mm, 0, 0x2000, 0, flags);
+	alloc_and_link_vma(&mm, 0, 0x2000, 0, vm_flags);
 
 	/*
 	 * We will be positioned at the prev VMA, but looking to expand to
 	 * 0x9000.
 	 */
 	vma_iter_set(&vmi, 0x3000);
-	vma_prev = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags);
+	vma_prev = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags);
 	vmg.prev = vma_prev;
 	vmg.just_expand = true;
 
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 3b1b45256d56..f684649b1008 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -1084,7 +1084,7 @@ static inline bool mpol_equal(struct mempolicy *, struct mempolicy *)
 }
 
 static inline void khugepaged_enter_vma(struct vm_area_struct *vma,
-			  unsigned long vm_flags)
+			  vm_flags_t vm_flags)
 {
 	(void)vma;
 	(void)vm_flags;
@@ -1200,7 +1200,7 @@ bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
 /* Update vma->vm_page_prot to reflect vma->vm_flags. */
 static inline void vma_set_page_prot(struct vm_area_struct *vma)
 {
-	unsigned long vm_flags = vma->vm_flags;
+	vm_flags_t vm_flags = vma->vm_flags;
 	pgprot_t vm_page_prot;
 
 	/* testing: we inline vm_pgprot_modify() to avoid clash with vma.h. */
@@ -1280,12 +1280,12 @@ static inline bool capable(int cap)
 	return true;
 }
 
-static inline bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
+static inline bool mlock_future_ok(struct mm_struct *mm, vm_flags_t vm_flags,
 			unsigned long bytes)
 {
 	unsigned long locked_pages, limit_pages;
 
-	if (!(flags & VM_LOCKED) || capable(CAP_IPC_LOCK))
+	if (!(vm_flags & VM_LOCKED) || capable(CAP_IPC_LOCK))
 		return true;
 
 	locked_pages = bytes >> PAGE_SHIFT;
-- 
cgit v1.2.3


From f9550e1fcf3bef802c18fadc5c65485b66b28a63 Mon Sep 17 00:00:00 2001
From: Nathan Gao <zcgao@amazon.com>
Date: Wed, 18 Jun 2025 09:33:31 -0700
Subject: mm/damon: fix minor typos in damon header

Fix typos in include/linux/damon.h.

Link: https://lkml.kernel.org/r/20250618163331.54910-1-sj@kernel.org
Signed-off-by: Nathan Gao <zcgao@amazon.com>
Reviewed-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index a4011726cb3b..bb58e36f019e 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -450,7 +450,7 @@ struct damos_access_pattern {
 /**
  * struct damos - Represents a Data Access Monitoring-based Operation Scheme.
  * @pattern:		Access pattern of target regions.
- * @action:		&damo_action to be applied to the target regions.
+ * @action:		&damos_action to be applied to the target regions.
  * @apply_interval_us:	The time between applying the @action.
  * @quota:		Control the aggressiveness of this scheme.
  * @wmarks:		Watermarks for automated (in)activation of this scheme.
@@ -656,7 +656,7 @@ struct damon_call_control {
  * struct damon_intervals_goal - Monitoring intervals auto-tuning goal.
  *
  * @access_bp:		Access events observation ratio to achieve in bp.
- * @aggrs:		Number of aggregations to acheive @access_bp within.
+ * @aggrs:		Number of aggregations to achieve @access_bp within.
  * @min_sample_us:	Minimum resulting sampling interval in microseconds.
  * @max_sample_us:	Maximum resulting sampling interval in microseconds.
  *
-- 
cgit v1.2.3


From d29d64afa2b20d9bd210c01bfff78545675b5135 Mon Sep 17 00:00:00 2001
From: Petr Pavlu <petr.pavlu@suse.com>
Date: Wed, 18 Jun 2025 14:50:35 +0200
Subject: codetag: avoid unused alloc_tags sections/symbols

With CONFIG_MEM_ALLOC_PROFILING=n, vmlinux and all modules unnecessarily
contain the symbols __start_alloc_tags and __stop_alloc_tags, which define
an empty range.  In the case of modules, the presence of these symbols
also forces the linker to create an empty .codetag.alloc_tags section.

Update codetag.lds.h to make the data conditional on
CONFIG_MEM_ALLOC_PROFILING.

Link: https://lkml.kernel.org/r/20250618125037.53182-1-petr.pavlu@suse.com
Signed-off-by: Petr Pavlu <petr.pavlu@suse.com>
Reviewed-by: Kent Overstreet <kent.overstreet@linux.dev>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Casey Chen <cachen@purestorage.com>
Cc: Petr Pavlu <petr.pavlu@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/asm-generic/codetag.lds.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/codetag.lds.h b/include/asm-generic/codetag.lds.h
index a45fe3d141a1..a14f4bdafdda 100644
--- a/include/asm-generic/codetag.lds.h
+++ b/include/asm-generic/codetag.lds.h
@@ -2,6 +2,12 @@
 #ifndef __ASM_GENERIC_CODETAG_LDS_H
 #define __ASM_GENERIC_CODETAG_LDS_H
 
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+#define IF_MEM_ALLOC_PROFILING(...) __VA_ARGS__
+#else
+#define IF_MEM_ALLOC_PROFILING(...)
+#endif
+
 #define SECTION_WITH_BOUNDARIES(_name)	\
 	. = ALIGN(8);			\
 	__start_##_name = .;		\
@@ -9,7 +15,7 @@
 	__stop_##_name = .;
 
 #define CODETAG_SECTIONS()		\
-	SECTION_WITH_BOUNDARIES(alloc_tags)
+	IF_MEM_ALLOC_PROFILING(SECTION_WITH_BOUNDARIES(alloc_tags))
 
 #define MOD_SEPARATE_CODETAG_SECTION(_name)	\
 	.codetag.##_name : {			\
@@ -22,6 +28,6 @@
  * unload them individually once unused.
  */
 #define MOD_SEPARATE_CODETAG_SECTIONS()		\
-	MOD_SEPARATE_CODETAG_SECTION(alloc_tags)
+	IF_MEM_ALLOC_PROFILING(MOD_SEPARATE_CODETAG_SECTION(alloc_tags))
 
 #endif /* __ASM_GENERIC_CODETAG_LDS_H */
-- 
cgit v1.2.3


From 986f5f2b4be3b7eab9ecd85c472d03a2191d6fc0 Mon Sep 17 00:00:00 2001
From: Vivek Kasireddy <vivek.kasireddy@intel.com>
Date: Tue, 17 Jun 2025 22:30:53 -0700
Subject: mm/hugetlb: make hugetlb_reserve_pages() return nr of entries updated

Patch series "mm/memfd: Reserve hugetlb folios before allocation", v4.

There are cases when we try to pin a folio but discover that it has not
been faulted-in.  So, we try to allocate it in memfd_alloc_folio() but the
allocation request may not succeed if there are no active reservations in
the system at that instant.

Therefore, making a reservation (by calling hugetlb_reserve_pages())
associated with the allocation will ensure that our request would not fail
due to lack of reservations.  This will also ensure that proper
region/subpool accounting is done with our allocation.


This patch (of 3):

Currently, hugetlb_reserve_pages() returns a bool to indicate whether the
reservation map update for the range [from, to] was successful or not.
This is not sufficient for the case where the caller needs to determine
how many entries were updated for the range.

Therefore, have hugetlb_reserve_pages() return the number of entries
updated in the reservation map associated with the range [from, to].
Also, update the callers of hugetlb_reserve_pages() to handle the new
return value.

Link: https://lkml.kernel.org/r/20250618053415.1036185-1-vivek.kasireddy@intel.com
Link: https://lkml.kernel.org/r/20250618053415.1036185-2-vivek.kasireddy@intel.com
Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
Cc: Steve Sistare <steven.sistare@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: David Hildenbrand <david@redhat.com>
Cc: Gerd Hoffmann <kraxel@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/hugetlbfs/inode.c    |  8 ++++----
 include/linux/hugetlb.h |  2 +-
 mm/hugetlb.c            | 19 +++++++++++++------
 3 files changed, 18 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index e4de5425838d..00b2d1a032fd 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -150,10 +150,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	if (inode->i_flags & S_PRIVATE)
 		vm_flags |= VM_NORESERVE;
 
-	if (!hugetlb_reserve_pages(inode,
+	if (hugetlb_reserve_pages(inode,
 				vma->vm_pgoff >> huge_page_order(h),
 				len >> huge_page_shift(h), vma,
-				vm_flags))
+				vm_flags) < 0)
 		goto out;
 
 	ret = 0;
@@ -1561,9 +1561,9 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
 	inode->i_size = size;
 	clear_nlink(inode);
 
-	if (!hugetlb_reserve_pages(inode, 0,
+	if (hugetlb_reserve_pages(inode, 0,
 			size >> huge_page_shift(hstate_inode(inode)), NULL,
-			acctflag))
+			acctflag) < 0)
 		file = ERR_PTR(-ENOMEM);
 	else
 		file = alloc_file_pseudo(inode, mnt, name, O_RDWR,
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 42f374e828a2..d8310b0f36dd 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -149,7 +149,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
 			     uffd_flags_t flags,
 			     struct folio **foliop);
 #endif /* CONFIG_USERFAULTFD */
-bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
+long hugetlb_reserve_pages(struct inode *inode, long from, long to,
 						struct vm_area_struct *vma,
 						vm_flags_t vm_flags);
 long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c7ba95030241..e1570735012a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7244,8 +7244,15 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
 	return pages > 0 ? (pages << h->order) : pages;
 }
 
-/* Return true if reservation was successful, false otherwise.  */
-bool hugetlb_reserve_pages(struct inode *inode,
+/*
+ * Update the reservation map for the range [from, to].
+ *
+ * Returns the number of entries that would be added to the reservation map
+ * associated with the range [from, to].  This number is greater or equal to
+ * zero. -EINVAL or -ENOMEM is returned in case of any errors.
+ */
+
+long hugetlb_reserve_pages(struct inode *inode,
 					long from, long to,
 					struct vm_area_struct *vma,
 					vm_flags_t vm_flags)
@@ -7260,7 +7267,7 @@ bool hugetlb_reserve_pages(struct inode *inode,
 	/* This should never happen */
 	if (from > to) {
 		VM_WARN(1, "%s called with a negative range\n", __func__);
-		return false;
+		return -EINVAL;
 	}
 
 	/*
@@ -7275,7 +7282,7 @@ bool hugetlb_reserve_pages(struct inode *inode,
 	 * without using reserves
 	 */
 	if (vm_flags & VM_NORESERVE)
-		return true;
+		return 0;
 
 	/*
 	 * Shared mappings base their reservation on the number of pages that
@@ -7382,7 +7389,7 @@ bool hugetlb_reserve_pages(struct inode *inode,
 			hugetlb_cgroup_put_rsvd_cgroup(h_cg);
 		}
 	}
-	return true;
+	return chg;
 
 out_put_pages:
 	spool_resv = chg - gbl_reserve;
@@ -7410,7 +7417,7 @@ out_err:
 		kref_put(&resv_map->refs, resv_map_release);
 		set_vma_resv_map(vma, NULL);
 	}
-	return false;
+	return chg < 0 ? chg : add < 0 ? add : -EINVAL;
 }
 
 long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
-- 
cgit v1.2.3


From 717cf9357325055ab6b41c4e0581f4d67601eb58 Mon Sep 17 00:00:00 2001
From: Vivek Kasireddy <vivek.kasireddy@intel.com>
Date: Tue, 17 Jun 2025 22:30:54 -0700
Subject: mm/memfd: reserve hugetlb folios before allocation

When we try to allocate a folio via alloc_hugetlb_folio_reserve(), we need
to ensure that there is an active reservation associated with the
allocation.  Otherwise, our allocation request would fail if there are no
active reservations made at that moment against any other allocations.
This is because alloc_hugetlb_folio_reserve() checks h->resv_huge_pages
before proceeding with the allocation.

Therefore, to address this issue, we just need to make a reservation (by
calling hugetlb_reserve_pages()) before we try to allocate the folio.
This will also ensure that proper region/subpool accounting is done
associated with our allocation.

Link: https://lkml.kernel.org/r/20250618053415.1036185-3-vivek.kasireddy@intel.com
Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
Cc: Steve Sistare <steven.sistare@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: David Hildenbrand <david@redhat.com>
Cc: Gerd Hoffmann <kraxel@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h |  5 +++++
 mm/hugetlb.c            |  5 -----
 mm/memfd.c              | 17 ++++++++++++++---
 3 files changed, 19 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index d8310b0f36dd..c6c87eae4a8d 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -740,6 +740,11 @@ extern unsigned int default_hstate_idx;
 
 #define default_hstate (hstates[default_hstate_idx])
 
+static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
+{
+	return HUGETLBFS_SB(inode->i_sb)->spool;
+}
+
 static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio)
 {
 	return folio->_hugetlb_subpool;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e1570735012a..da9f10710c27 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -284,11 +284,6 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
 	return ret;
 }
 
-static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
-{
-	return HUGETLBFS_SB(inode->i_sb)->spool;
-}
-
 static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
 {
 	return subpool_inode(file_inode(vma->vm_file));
diff --git a/mm/memfd.c b/mm/memfd.c
index b558c4c3bd27..32fa6bfe57d1 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -70,7 +70,6 @@ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx)
 #ifdef CONFIG_HUGETLB_PAGE
 	struct folio *folio;
 	gfp_t gfp_mask;
-	int err;
 
 	if (is_file_hugepages(memfd)) {
 		/*
@@ -79,12 +78,19 @@ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx)
 		 * alloc from. Also, the folio will be pinned for an indefinite
 		 * amount of time, so it is not expected to be migrated away.
 		 */
+		struct inode *inode = file_inode(memfd);
 		struct hstate *h = hstate_file(memfd);
+		int err = -ENOMEM;
+		long nr_resv;
 
 		gfp_mask = htlb_alloc_mask(h);
 		gfp_mask &= ~(__GFP_HIGHMEM | __GFP_MOVABLE);
 		idx >>= huge_page_order(h);
 
+		nr_resv = hugetlb_reserve_pages(inode, idx, idx + 1, NULL, 0);
+		if (nr_resv < 0)
+			return ERR_PTR(nr_resv);
+
 		folio = alloc_hugetlb_folio_reserve(h,
 						    numa_node_id(),
 						    NULL,
@@ -95,12 +101,17 @@ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx)
 							idx);
 			if (err) {
 				folio_put(folio);
-				return ERR_PTR(err);
+				goto err_unresv;
 			}
+
+			hugetlb_set_folio_subpool(folio, subpool_inode(inode));
 			folio_unlock(folio);
 			return folio;
 		}
-		return ERR_PTR(-ENOMEM);
+err_unresv:
+		if (nr_resv > 0)
+			hugetlb_unreserve_pages(inode, idx, idx + 1, 0);
+		return ERR_PTR(err);
 	}
 #endif
 	return shmem_read_folio(memfd->f_mapping, idx);
-- 
cgit v1.2.3


From 59b5ed409d03bc8b7bb153d78afcd7cea9d7bbfa Mon Sep 17 00:00:00 2001
From: Hao Ge <gehao@kylinos.cn>
Date: Wed, 18 Jun 2025 09:58:09 +0800
Subject: mm/percpu: conditionally define _shared_alloc_tag via
 CONFIG_ARCH_MODULE_NEEDS_WEAK_PER_CPU

Recently discovered this entry while checking kallsyms on ARM64:
ffff800083e509c0 D _shared_alloc_tag

If ARCH_NEEDS_WEAK_PER_CPU is not defined (it is only defined for the s390
and alpha architectures), there's no need to statically define the percpu
variable _shared_alloc_tag.

Therefore, the definition needs to be isolated so that it is only built
when it is actually required.

When building the core kernel code for s390 or alpha architectures,
ARCH_NEEDS_WEAK_PER_CPU remains undefined (as it is gated by #if
defined(MODULE)).  However, when building modules for these architectures,
the macro is explicitly defined.

Therefore, we remove all instances of ARCH_NEEDS_WEAK_PER_CPU from the
code and introduce CONFIG_ARCH_MODULE_NEEDS_WEAK_PER_CPU to replace the
relevant logic.  We can now conditionally define the percpu variable
_shared_alloc_tag based on CONFIG_ARCH_MODULE_NEEDS_WEAK_PER_CPU.  This
allows architectures (such as s390/alpha) that require weak definitions
for percpu variables in modules to include the definition, while others
can omit it via compile-time exclusion.

Link: https://lkml.kernel.org/r/20250618015809.1235761-1-hao.ge@linux.dev
Signed-off-by: Hao Ge <gehao@kylinos.cn>
Suggested-by: Suren Baghdasaryan <surenb@google.com>
Acked-by: Alexander Gordeev <agordeev@linux.ibm.com>	[s390]
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Chistoph Lameter <cl@linux.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Richard Henderson <richard.henderson@linaro.org>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/alpha/Kconfig              | 1 +
 arch/alpha/include/asm/percpu.h | 5 ++---
 arch/s390/Kconfig               | 1 +
 arch/s390/include/asm/percpu.h  | 5 ++---
 include/linux/alloc_tag.h       | 6 +++---
 include/linux/percpu-defs.h     | 7 ++++---
 lib/alloc_tag.c                 | 2 ++
 mm/Kconfig                      | 7 +++++++
 8 files changed, 22 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 109a4cddcd13..80367f2cf821 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -7,6 +7,7 @@ config ALPHA
 	select ARCH_HAS_DMA_OPS if PCI
 	select ARCH_MIGHT_HAVE_PC_PARPORT
 	select ARCH_MIGHT_HAVE_PC_SERIO
+	select ARCH_MODULE_NEEDS_WEAK_PER_CPU if SMP
 	select ARCH_NO_PREEMPT
 	select ARCH_NO_SG_CHAIN
 	select ARCH_USE_CMPXCHG_LOCKREF
diff --git a/arch/alpha/include/asm/percpu.h b/arch/alpha/include/asm/percpu.h
index 6923249f2d49..4383d66341dc 100644
--- a/arch/alpha/include/asm/percpu.h
+++ b/arch/alpha/include/asm/percpu.h
@@ -9,10 +9,9 @@
  * way above 4G.
  *
  * Always use weak definitions for percpu variables in modules.
+ * Therefore, we have enabled CONFIG_ARCH_MODULE_NEEDS_WEAK_PER_CPU
+ * in the Kconfig.
  */
-#if defined(MODULE) && defined(CONFIG_SMP)
-#define ARCH_NEEDS_WEAK_PER_CPU
-#endif
 
 #include <asm-generic/percpu.h>
 
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 0c16dc443e2f..b652cb952f31 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -132,6 +132,7 @@ config S390
 	select ARCH_INLINE_WRITE_UNLOCK_IRQ
 	select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
 	select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
+	select ARCH_MODULE_NEEDS_WEAK_PER_CPU
 	select ARCH_STACKWALK
 	select ARCH_SUPPORTS_ATOMIC_RMW
 	select ARCH_SUPPORTS_DEBUG_PAGEALLOC
diff --git a/arch/s390/include/asm/percpu.h b/arch/s390/include/asm/percpu.h
index 84f6b8357b45..96af7d964014 100644
--- a/arch/s390/include/asm/percpu.h
+++ b/arch/s390/include/asm/percpu.h
@@ -16,10 +16,9 @@
  * For 64 bit module code, the module may be more than 4G above the
  * per cpu area, use weak definitions to force the compiler to
  * generate external references.
+ * Therefore, we have enabled CONFIG_ARCH_MODULE_NEEDS_WEAK_PER_CPU
+ * in the Kconfig.
  */
-#if defined(MODULE)
-#define ARCH_NEEDS_WEAK_PER_CPU
-#endif
 
 /*
  * We use a compare-and-swap loop since that uses less cpu cycles than
diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
index 8f7931eb7d16..9ef2633e2c08 100644
--- a/include/linux/alloc_tag.h
+++ b/include/linux/alloc_tag.h
@@ -88,7 +88,7 @@ static inline struct alloc_tag *ct_to_alloc_tag(struct codetag *ct)
 	return container_of(ct, struct alloc_tag, ct);
 }
 
-#ifdef ARCH_NEEDS_WEAK_PER_CPU
+#if defined(CONFIG_ARCH_MODULE_NEEDS_WEAK_PER_CPU) && defined(MODULE)
 /*
  * When percpu variables are required to be defined as weak, static percpu
  * variables can't be used inside a function (see comments for DECLARE_PER_CPU_SECTION).
@@ -102,7 +102,7 @@ DECLARE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag);
 		.ct = CODE_TAG_INIT,						\
 		.counters = &_shared_alloc_tag };
 
-#else /* ARCH_NEEDS_WEAK_PER_CPU */
+#else /* CONFIG_ARCH_MODULE_NEEDS_WEAK_PER_CPU && MODULE */
 
 #ifdef MODULE
 
@@ -123,7 +123,7 @@ DECLARE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag);
 
 #endif /* MODULE */
 
-#endif /* ARCH_NEEDS_WEAK_PER_CPU */
+#endif /* CONFIG_ARCH_MODULE_NEEDS_WEAK_PER_CPU && MODULE */
 
 DECLARE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
 			mem_alloc_profiling_key);
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index c16cdeaa505e..12d90360f6db 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -63,14 +63,15 @@
  * 1. The symbol must be globally unique, even the static ones.
  * 2. Static percpu variables cannot be defined inside a function.
  *
- * Archs which need weak percpu definitions should define
- * ARCH_NEEDS_WEAK_PER_CPU in asm/percpu.h when necessary.
+ * Archs which need weak percpu definitions should set
+ * CONFIG_ARCH_MODULE_NEEDS_WEAK_PER_CPU when necessary.
  *
  * To ensure that the generic code observes the above two
  * restrictions, if CONFIG_DEBUG_FORCE_WEAK_PER_CPU is set weak
  * definition is used for all cases.
  */
-#if defined(ARCH_NEEDS_WEAK_PER_CPU) || defined(CONFIG_DEBUG_FORCE_WEAK_PER_CPU)
+#if (defined(CONFIG_ARCH_MODULE_NEEDS_WEAK_PER_CPU) && defined(MODULE)) || \
+	defined(CONFIG_DEBUG_FORCE_WEAK_PER_CPU)
 /*
  * __pcpu_scope_* dummy variable is used to enforce scope.  It
  * receives the static modifier when it's used in front of
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
index 36f07dc95069..41ccfb035b7b 100644
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -25,8 +25,10 @@ static bool mem_profiling_support;
 
 static struct codetag_type *alloc_tag_cttype;
 
+#ifdef CONFIG_ARCH_MODULE_NEEDS_WEAK_PER_CPU
 DEFINE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag);
 EXPORT_SYMBOL(_shared_alloc_tag);
+#endif
 
 DEFINE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
 			mem_alloc_profiling_key);
diff --git a/mm/Kconfig b/mm/Kconfig
index 3b2060a61c05..065b1f19dd99 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -933,6 +933,13 @@ config ARCH_SUPPORTS_PUD_PFNMAP
 	def_bool y
 	depends on ARCH_SUPPORTS_HUGE_PFNMAP && HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
 
+#
+# Architectures that always use weak definitions for percpu
+# variables in modules should set this.
+#
+config ARCH_MODULE_NEEDS_WEAK_PER_CPU
+       bool
+
 #
 # UP and nommu archs use km based percpu allocator
 #
-- 
cgit v1.2.3


From fd2825b0760a10a9626986cca64f5664302ffdfc Mon Sep 17 00:00:00 2001
From: Alistair Popple <apopple@nvidia.com>
Date: Thu, 19 Jun 2025 18:57:57 +1000
Subject: mm/gup: remove pXX_devmap usage from get_user_pages()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

GUP uses pXX_devmap() calls to see if it needs to get a reference on the
associated pgmap data structure to ensure the pages won't go away.
However, it's the driver's responsibility to ensure that if pages are
mapped (i.e. discoverable by GUP) they are not offlined or removed from
the memmap, so there is no need to hold a reference on the pgmap data
structure to ensure this.

Furthermore, mappings with PFN_DEV are no longer created, hence this is
effectively dead code anyway and can be removed.

Link: https://lkml.kernel.org/r/708b2be76876659ec5261fe5d059b07268b98b36.1750323463.git-series.apopple@nvidia.com
Signed-off-by: Alistair Popple <apopple@nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Cc: Balbir Singh <balbirs@nvidia.com>
Cc: Björn Töpel <bjorn@kernel.org>
Cc: Björn Töpel <bjorn@rivosinc.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Chunyan Zhang <zhang.lyra@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Deepak Gupta <debug@rivosinc.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Inki Dae <m.szyprowski@samsung.com>
Cc: John Groves <john@groves.net>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h |   3 -
 mm/gup.c                | 160 ++----------------------------------------------
 mm/huge_memory.c        |  40 ------------
 3 files changed, 5 insertions(+), 198 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 7753daac49f7..a2df2308cb2c 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -473,9 +473,6 @@ static inline bool folio_test_pmd_mappable(struct folio *folio)
 	return folio_order(folio) >= HPAGE_PMD_ORDER;
 }
 
-struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
-		pmd_t *pmd, int flags, struct dev_pagemap **pgmap);
-
 vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf);
 
 extern struct folio *huge_zero_folio;
diff --git a/mm/gup.c b/mm/gup.c
index c08b97e0d344..30d320719fa2 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -679,31 +679,9 @@ static struct page *follow_huge_pud(struct vm_area_struct *vma,
 		return NULL;
 
 	pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
-
-	if (IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
-	    pud_devmap(pud)) {
-		/*
-		 * device mapped pages can only be returned if the caller
-		 * will manage the page reference count.
-		 *
-		 * At least one of FOLL_GET | FOLL_PIN must be set, so
-		 * assert that here:
-		 */
-		if (!(flags & (FOLL_GET | FOLL_PIN)))
-			return ERR_PTR(-EEXIST);
-
-		if (flags & FOLL_TOUCH)
-			touch_pud(vma, addr, pudp, flags & FOLL_WRITE);
-
-		ctx->pgmap = get_dev_pagemap(pfn, ctx->pgmap);
-		if (!ctx->pgmap)
-			return ERR_PTR(-EFAULT);
-	}
-
 	page = pfn_to_page(pfn);
 
-	if (!pud_devmap(pud) && !pud_write(pud) &&
-	    gup_must_unshare(vma, flags, page))
+	if (!pud_write(pud) && gup_must_unshare(vma, flags, page))
 		return ERR_PTR(-EMLINK);
 
 	ret = try_grab_folio(page_folio(page), 1, flags);
@@ -857,8 +835,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
 	page = vm_normal_page(vma, address, pte);
 
 	/*
-	 * We only care about anon pages in can_follow_write_pte() and don't
-	 * have to worry about pte_devmap() because they are never anon.
+	 * We only care about anon pages in can_follow_write_pte().
 	 */
 	if ((flags & FOLL_WRITE) &&
 	    !can_follow_write_pte(pte, page, vma, flags)) {
@@ -866,18 +843,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
 		goto out;
 	}
 
-	if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
-		/*
-		 * Only return device mapping pages in the FOLL_GET or FOLL_PIN
-		 * case since they are only valid while holding the pgmap
-		 * reference.
-		 */
-		*pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
-		if (*pgmap)
-			page = pte_page(pte);
-		else
-			goto no_page;
-	} else if (unlikely(!page)) {
+	if (unlikely(!page)) {
 		if (flags & FOLL_DUMP) {
 			/* Avoid special (like zero) pages in core dumps */
 			page = ERR_PTR(-EFAULT);
@@ -959,14 +925,6 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
 		return no_page_table(vma, flags, address);
 	if (!pmd_present(pmdval))
 		return no_page_table(vma, flags, address);
-	if (pmd_devmap(pmdval)) {
-		ptl = pmd_lock(mm, pmd);
-		page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
-		spin_unlock(ptl);
-		if (page)
-			return page;
-		return no_page_table(vma, flags, address);
-	}
 	if (likely(!pmd_leaf(pmdval)))
 		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
 
@@ -2896,7 +2854,7 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
 		int *nr)
 {
 	struct dev_pagemap *pgmap = NULL;
-	int nr_start = *nr, ret = 0;
+	int ret = 0;
 	pte_t *ptep, *ptem;
 
 	ptem = ptep = pte_offset_map(&pmd, addr);
@@ -2920,16 +2878,7 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
 		if (!pte_access_permitted(pte, flags & FOLL_WRITE))
 			goto pte_unmap;
 
-		if (pte_devmap(pte)) {
-			if (unlikely(flags & FOLL_LONGTERM))
-				goto pte_unmap;
-
-			pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
-			if (unlikely(!pgmap)) {
-				gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
-				goto pte_unmap;
-			}
-		} else if (pte_special(pte))
+		if (pte_special(pte))
 			goto pte_unmap;
 
 		/* If it's not marked as special it must have a valid memmap. */
@@ -3001,91 +2950,6 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
 }
 #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
 
-#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
-static int gup_fast_devmap_leaf(unsigned long pfn, unsigned long addr,
-	unsigned long end, unsigned int flags, struct page **pages, int *nr)
-{
-	int nr_start = *nr;
-	struct dev_pagemap *pgmap = NULL;
-
-	do {
-		struct folio *folio;
-		struct page *page = pfn_to_page(pfn);
-
-		pgmap = get_dev_pagemap(pfn, pgmap);
-		if (unlikely(!pgmap)) {
-			gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
-			break;
-		}
-
-		folio = try_grab_folio_fast(page, 1, flags);
-		if (!folio) {
-			gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
-			break;
-		}
-		folio_set_referenced(folio);
-		pages[*nr] = page;
-		(*nr)++;
-		pfn++;
-	} while (addr += PAGE_SIZE, addr != end);
-
-	put_dev_pagemap(pgmap);
-	return addr == end;
-}
-
-static int gup_fast_devmap_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
-		unsigned long end, unsigned int flags, struct page **pages,
-		int *nr)
-{
-	unsigned long fault_pfn;
-	int nr_start = *nr;
-
-	fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
-	if (!gup_fast_devmap_leaf(fault_pfn, addr, end, flags, pages, nr))
-		return 0;
-
-	if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
-		gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
-		return 0;
-	}
-	return 1;
-}
-
-static int gup_fast_devmap_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr,
-		unsigned long end, unsigned int flags, struct page **pages,
-		int *nr)
-{
-	unsigned long fault_pfn;
-	int nr_start = *nr;
-
-	fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
-	if (!gup_fast_devmap_leaf(fault_pfn, addr, end, flags, pages, nr))
-		return 0;
-
-	if (unlikely(pud_val(orig) != pud_val(*pudp))) {
-		gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
-		return 0;
-	}
-	return 1;
-}
-#else
-static int gup_fast_devmap_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
-		unsigned long end, unsigned int flags, struct page **pages,
-		int *nr)
-{
-	BUILD_BUG();
-	return 0;
-}
-
-static int gup_fast_devmap_pud_leaf(pud_t pud, pud_t *pudp, unsigned long addr,
-		unsigned long end, unsigned int flags, struct page **pages,
-		int *nr)
-{
-	BUILD_BUG();
-	return 0;
-}
-#endif
-
 static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
 		unsigned long end, unsigned int flags, struct page **pages,
 		int *nr)
@@ -3100,13 +2964,6 @@ static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
 	if (pmd_special(orig))
 		return 0;
 
-	if (pmd_devmap(orig)) {
-		if (unlikely(flags & FOLL_LONGTERM))
-			return 0;
-		return gup_fast_devmap_pmd_leaf(orig, pmdp, addr, end, flags,
-					        pages, nr);
-	}
-
 	page = pmd_page(orig);
 	refs = record_subpages(page, PMD_SIZE, addr, end, pages + *nr);
 
@@ -3147,13 +3004,6 @@ static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr,
 	if (pud_special(orig))
 		return 0;
 
-	if (pud_devmap(orig)) {
-		if (unlikely(flags & FOLL_LONGTERM))
-			return 0;
-		return gup_fast_devmap_pud_leaf(orig, pudp, addr, end, flags,
-					        pages, nr);
-	}
-
 	page = pud_page(orig);
 	refs = record_subpages(page, PUD_SIZE, addr, end, pages + *nr);
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6411f3107af1..7434d177b97c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1672,46 +1672,6 @@ void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
 		update_mmu_cache_pmd(vma, addr, pmd);
 }
 
-struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
-		pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
-{
-	unsigned long pfn = pmd_pfn(*pmd);
-	struct mm_struct *mm = vma->vm_mm;
-	struct page *page;
-	int ret;
-
-	assert_spin_locked(pmd_lockptr(mm, pmd));
-
-	if (flags & FOLL_WRITE && !pmd_write(*pmd))
-		return NULL;
-
-	if (pmd_present(*pmd) && pmd_devmap(*pmd))
-		/* pass */;
-	else
-		return NULL;
-
-	if (flags & FOLL_TOUCH)
-		touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
-
-	/*
-	 * device mapped pages can only be returned if the
-	 * caller will manage the page reference count.
-	 */
-	if (!(flags & (FOLL_GET | FOLL_PIN)))
-		return ERR_PTR(-EEXIST);
-
-	pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
-	*pgmap = get_dev_pagemap(pfn, *pgmap);
-	if (!*pgmap)
-		return ERR_PTR(-EFAULT);
-	page = pfn_to_page(pfn);
-	ret = try_grab_folio(page_folio(page), 1, flags);
-	if (ret)
-		page = ERR_PTR(ret);
-
-	return page;
-}
-
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
 		  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
-- 
cgit v1.2.3


From 8a6a984c2e0ea406459b445a3910a454bece3aa1 Mon Sep 17 00:00:00 2001
From: Alistair Popple <apopple@nvidia.com>
Date: Thu, 19 Jun 2025 18:57:59 +1000
Subject: mm: remove redundant pXd_devmap calls
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

DAX was the only thing that created pmd_devmap and pud_devmap entries;
however, it no longer does, as DAX pages are now refcounted normally and
pXd_trans_huge() returns true for those.  Therefore, checking both
pXd_devmap() and pXd_trans_huge() is redundant, and the former can be
removed without changing behaviour as it will always be false.

Link: https://lkml.kernel.org/r/d58f089dc16b7feb7c6728164f37dea65d64a0d3.1750323463.git-series.apopple@nvidia.com
Signed-off-by: Alistair Popple <apopple@nvidia.com>
Cc: Balbir Singh <balbirs@nvidia.com>
Cc: Björn Töpel <bjorn@kernel.org>
Cc: Björn Töpel <bjorn@rivosinc.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Chunyan Zhang <zhang.lyra@gmail.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Deepak Gupta <debug@rivosinc.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Inki Dae <m.szyprowski@samsung.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: John Groves <john@groves.net>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/dax.c                   |  5 ++---
 include/linux/huge_mm.h    | 10 ++++------
 include/linux/pgtable.h    |  2 +-
 mm/hmm.c                   |  4 ++--
 mm/huge_memory.c           | 23 +++++++++--------------
 mm/mapping_dirty_helpers.c |  4 ++--
 mm/memory.c                | 15 ++++++---------
 mm/migrate_device.c        |  2 +-
 mm/mprotect.c              |  2 +-
 mm/mremap.c                |  5 ++---
 mm/page_vma_mapped.c       |  5 ++---
 mm/pagewalk.c              |  8 +++-----
 mm/pgtable-generic.c       |  7 +++----
 mm/userfaultfd.c           |  4 ++--
 mm/vmscan.c                |  3 ---
 15 files changed, 40 insertions(+), 59 deletions(-)

(limited to 'include')

diff --git a/fs/dax.c b/fs/dax.c
index ea0c35794bf9..7d4ecb9d23af 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1937,7 +1937,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	 * the PTE we need to set up.  If so just return and the fault will be
 	 * retried.
 	 */
-	if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) {
+	if (pmd_trans_huge(*vmf->pmd)) {
 		ret = VM_FAULT_NOPAGE;
 		goto unlock_entry;
 	}
@@ -2060,8 +2060,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 	 * the PMD we need to set up.  If so just return and the fault will be
 	 * retried.
 	 */
-	if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) &&
-			!pmd_devmap(*vmf->pmd)) {
+	if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd)) {
 		ret = 0;
 		goto unlock_entry;
 	}
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index a2df2308cb2c..26607f2c65fb 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -400,8 +400,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 #define split_huge_pmd(__vma, __pmd, __address)				\
 	do {								\
 		pmd_t *____pmd = (__pmd);				\
-		if (is_swap_pmd(*____pmd) || pmd_trans_huge(*____pmd)	\
-					|| pmd_devmap(*____pmd))	\
+		if (is_swap_pmd(*____pmd) || pmd_trans_huge(*____pmd))	\
 			__split_huge_pmd(__vma, __pmd, __address,	\
 					 false);			\
 	}  while (0)
@@ -426,8 +425,7 @@ change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
 #define split_huge_pud(__vma, __pud, __address)				\
 	do {								\
 		pud_t *____pud = (__pud);				\
-		if (pud_trans_huge(*____pud)				\
-					|| pud_devmap(*____pud))	\
+		if (pud_trans_huge(*____pud))				\
 			__split_huge_pud(__vma, __pud, __address);	\
 	}  while (0)
 
@@ -450,7 +448,7 @@ static inline int is_swap_pmd(pmd_t pmd)
 static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
 		struct vm_area_struct *vma)
 {
-	if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd))
+	if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd))
 		return __pmd_trans_huge_lock(pmd, vma);
 	else
 		return NULL;
@@ -458,7 +456,7 @@ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
 static inline spinlock_t *pud_trans_huge_lock(pud_t *pud,
 		struct vm_area_struct *vma)
 {
-	if (pud_trans_huge(*pud) || pud_devmap(*pud))
+	if (pud_trans_huge(*pud))
 		return __pud_trans_huge_lock(pud, vma);
 	else
 		return NULL;
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index d05e35a0facf..ffcd966cf2d4 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1672,7 +1672,7 @@ static inline int pud_trans_unstable(pud_t *pud)
 	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
 	pud_t pudval = READ_ONCE(*pud);
 
-	if (pud_none(pudval) || pud_trans_huge(pudval) || pud_devmap(pudval))
+	if (pud_none(pudval) || pud_trans_huge(pudval))
 		return 1;
 	if (unlikely(pud_bad(pudval))) {
 		pud_clear_bad(pud);
diff --git a/mm/hmm.c b/mm/hmm.c
index 14914da98416..62d3082dc55c 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -360,7 +360,7 @@ again:
 		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
 	}
 
-	if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
+	if (pmd_trans_huge(pmd)) {
 		/*
 		 * No need to take pmd_lock here, even if some other thread
 		 * is splitting the huge pmd we will get that event through
@@ -371,7 +371,7 @@ again:
 		 * values.
 		 */
 		pmd = pmdp_get_lockless(pmdp);
-		if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
+		if (!pmd_trans_huge(pmd))
 			goto again;
 
 		return hmm_vma_handle_pmd(walk, addr, end, hmm_pfns, pmd);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 54b5c37d9515..cf808b2eea29 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1459,8 +1459,7 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
 	 * but we need to be consistent with PTEs and architectures that
 	 * can't support a 'special' bit.
 	 */
-	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
-			!pfn_t_devmap(pfn));
+	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
 	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
 						(VM_PFNMAP|VM_MIXEDMAP));
 	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
@@ -1596,8 +1595,7 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
 	 * but we need to be consistent with PTEs and architectures that
 	 * can't support a 'special' bit.
 	 */
-	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
-			!pfn_t_devmap(pfn));
+	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
 	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
 						(VM_PFNMAP|VM_MIXEDMAP));
 	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
@@ -1815,7 +1813,7 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 
 	ret = -EAGAIN;
 	pud = *src_pud;
-	if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
+	if (unlikely(!pud_trans_huge(pud)))
 		goto out_unlock;
 
 	/*
@@ -2677,8 +2675,7 @@ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
 {
 	spinlock_t *ptl;
 	ptl = pmd_lock(vma->vm_mm, pmd);
-	if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) ||
-			pmd_devmap(*pmd)))
+	if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd)))
 		return ptl;
 	spin_unlock(ptl);
 	return NULL;
@@ -2695,7 +2692,7 @@ spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
 	spinlock_t *ptl;
 
 	ptl = pud_lock(vma->vm_mm, pud);
-	if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
+	if (likely(pud_trans_huge(*pud)))
 		return ptl;
 	spin_unlock(ptl);
 	return NULL;
@@ -2747,7 +2744,7 @@ static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
 	VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
 	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
 	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
-	VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));
+	VM_BUG_ON(!pud_trans_huge(*pud));
 
 	count_vm_event(THP_SPLIT_PUD);
 
@@ -2780,7 +2777,7 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
 				(address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
 	mmu_notifier_invalidate_range_start(&range);
 	ptl = pud_lock(vma->vm_mm, pud);
-	if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
+	if (unlikely(!pud_trans_huge(*pud)))
 		goto out;
 	__split_huge_pud_locked(vma, pud, range.start);
 
@@ -2853,8 +2850,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 	VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
 	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
 	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
-	VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
-				&& !pmd_devmap(*pmd));
+	VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd));
 
 	count_vm_event(THP_SPLIT_PMD);
 
@@ -3062,8 +3058,7 @@ void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
 			   pmd_t *pmd, bool freeze)
 {
 	VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
-	if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
-	    is_pmd_migration_entry(*pmd))
+	if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd))
 		__split_huge_pmd_locked(vma, pmd, address, freeze);
 }
 
diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c
index dc1692ff9e58..c193de6cb23a 100644
--- a/mm/mapping_dirty_helpers.c
+++ b/mm/mapping_dirty_helpers.c
@@ -129,7 +129,7 @@ static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end,
 	pmd_t pmdval = pmdp_get_lockless(pmd);
 
 	/* Do not split a huge pmd, present or migrated */
-	if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval)) {
+	if (pmd_trans_huge(pmdval)) {
 		WARN_ON(pmd_write(pmdval) || pmd_dirty(pmdval));
 		walk->action = ACTION_CONTINUE;
 	}
@@ -152,7 +152,7 @@ static int wp_clean_pud_entry(pud_t *pud, unsigned long addr, unsigned long end,
 	pud_t pudval = READ_ONCE(*pud);
 
 	/* Do not split a huge pud */
-	if (pud_trans_huge(pudval) || pud_devmap(pudval)) {
+	if (pud_trans_huge(pudval)) {
 		WARN_ON(pud_write(pudval) || pud_dirty(pudval));
 		walk->action = ACTION_CONTINUE;
 	}
diff --git a/mm/memory.c b/mm/memory.c
index 01d51bd95197..150bb62855b1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -675,8 +675,6 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
 		}
 	}
 
-	if (pmd_devmap(pmd))
-		return NULL;
 	if (is_huge_zero_pmd(pmd))
 		return NULL;
 	if (unlikely(pfn > highest_memmap_pfn))
@@ -1240,8 +1238,7 @@ copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 	src_pmd = pmd_offset(src_pud, addr);
 	do {
 		next = pmd_addr_end(addr, end);
-		if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
-			|| pmd_devmap(*src_pmd)) {
+		if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)) {
 			int err;
 			VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
 			err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
@@ -1277,7 +1274,7 @@ copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 	src_pud = pud_offset(src_p4d, addr);
 	do {
 		next = pud_addr_end(addr, end);
-		if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
+		if (pud_trans_huge(*src_pud)) {
 			int err;
 
 			VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma);
@@ -1791,7 +1788,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
 	pmd = pmd_offset(pud, addr);
 	do {
 		next = pmd_addr_end(addr, end);
-		if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
+		if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd)) {
 			if (next - addr != HPAGE_PMD_SIZE)
 				__split_huge_pmd(vma, pmd, addr, false);
 			else if (zap_huge_pmd(tlb, vma, pmd, addr)) {
@@ -1833,7 +1830,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
 	pud = pud_offset(p4d, addr);
 	do {
 		next = pud_addr_end(addr, end);
-		if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
+		if (pud_trans_huge(*pud)) {
 			if (next - addr != HPAGE_PUD_SIZE) {
 				mmap_assert_locked(tlb->mm);
 				split_huge_pud(vma, pud, addr);
@@ -6136,7 +6133,7 @@ retry_pud:
 		pud_t orig_pud = *vmf.pud;
 
 		barrier();
-		if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
+		if (pud_trans_huge(orig_pud)) {
 
 			/*
 			 * TODO once we support anonymous PUDs: NUMA case and
@@ -6177,7 +6174,7 @@ retry_pud:
 				pmd_migration_entry_wait(mm, vmf.pmd);
 			return 0;
 		}
-		if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) {
+		if (pmd_trans_huge(vmf.orig_pmd)) {
 			if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
 				return do_huge_pmd_numa_page(&vmf);
 
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 3158afe7eb23..e05e14d6eacd 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -615,7 +615,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
 	pmdp = pmd_alloc(mm, pudp, addr);
 	if (!pmdp)
 		goto abort;
-	if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp))
+	if (pmd_trans_huge(*pmdp))
 		goto abort;
 	if (pte_alloc(mm, pmdp))
 		goto abort;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index b873b98ab705..88709c01177b 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -376,7 +376,7 @@ again:
 			goto next;
 
 		_pmd = pmdp_get_lockless(pmd);
-		if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd) || pmd_devmap(_pmd)) {
+		if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd)) {
 			if ((next - addr != HPAGE_PMD_SIZE) ||
 			    pgtable_split_needed(vma, cp_flags)) {
 				__split_huge_pmd(vma, pmd, addr, false);
diff --git a/mm/mremap.c b/mm/mremap.c
index 7e93d3344828..36585041c760 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -820,7 +820,7 @@ unsigned long move_page_tables(struct pagetable_move_control *pmc)
 		new_pud = alloc_new_pud(mm, pmc->new_addr);
 		if (!new_pud)
 			break;
-		if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) {
+		if (pud_trans_huge(*old_pud)) {
 			if (extent == HPAGE_PUD_SIZE) {
 				move_pgt_entry(pmc, HPAGE_PUD, old_pud, new_pud);
 				/* We ignore and continue on error? */
@@ -839,8 +839,7 @@ unsigned long move_page_tables(struct pagetable_move_control *pmc)
 		if (!new_pmd)
 			break;
 again:
-		if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) ||
-		    pmd_devmap(*old_pmd)) {
+		if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd)) {
 			if (extent == HPAGE_PMD_SIZE &&
 			    move_pgt_entry(pmc, HPAGE_PMD, old_pmd, new_pmd))
 				continue;
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index e463c3be934a..e981a1a292d2 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -246,8 +246,7 @@ restart:
 		 */
 		pmde = pmdp_get_lockless(pvmw->pmd);
 
-		if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde) ||
-		    (pmd_present(pmde) && pmd_devmap(pmde))) {
+		if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) {
 			pvmw->ptl = pmd_lock(mm, pvmw->pmd);
 			pmde = *pvmw->pmd;
 			if (!pmd_present(pmde)) {
@@ -262,7 +261,7 @@ restart:
 					return not_found(pvmw);
 				return true;
 			}
-			if (likely(pmd_trans_huge(pmde) || pmd_devmap(pmde))) {
+			if (likely(pmd_trans_huge(pmde))) {
 				if (pvmw->flags & PVMW_MIGRATION)
 					return not_found(pvmw);
 				if (!check_pmd(pmd_pfn(pmde), pvmw))
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index a214a2b40ab9..648038247a8d 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -143,8 +143,7 @@ again:
 			 * We are ONLY installing, so avoid unnecessarily
 			 * splitting a present huge page.
 			 */
-			if (pmd_present(*pmd) &&
-			    (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
+			if (pmd_present(*pmd) && pmd_trans_huge(*pmd))
 				continue;
 		}
 
@@ -210,8 +209,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
 			 * We are ONLY installing, so avoid unnecessarily
 			 * splitting a present huge page.
 			 */
-			if (pud_present(*pud) &&
-			    (pud_trans_huge(*pud) || pud_devmap(*pud)))
+			if (pud_present(*pud) && pud_trans_huge(*pud))
 				continue;
 		}
 
@@ -908,7 +906,7 @@ struct folio *folio_walk_start(struct folio_walk *fw,
 		 * TODO: FW_MIGRATION support for PUD migration entries
 		 * once there are relevant users.
 		 */
-		if (!pud_present(pud) || pud_devmap(pud) || pud_special(pud)) {
+		if (!pud_present(pud) || pud_special(pud)) {
 			spin_unlock(ptl);
 			goto not_found;
 		} else if (!pud_leaf(pud)) {
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 5a882f2b10f9..567e2d084071 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -139,8 +139,7 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
 {
 	pmd_t pmd;
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-	VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
-			   !pmd_devmap(*pmdp));
+	VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp));
 	pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
 	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
 	return pmd;
@@ -153,7 +152,7 @@ pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
 	pud_t pud;
 
 	VM_BUG_ON(address & ~HPAGE_PUD_MASK);
-	VM_BUG_ON(!pud_trans_huge(*pudp) && !pud_devmap(*pudp));
+	VM_BUG_ON(!pud_trans_huge(*pudp));
 	pud = pudp_huge_get_and_clear(vma->vm_mm, address, pudp);
 	flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE);
 	return pud;
@@ -293,7 +292,7 @@ pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
 		*pmdvalp = pmdval;
 	if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval)))
 		goto nomap;
-	if (unlikely(pmd_trans_huge(pmdval) || pmd_devmap(pmdval)))
+	if (unlikely(pmd_trans_huge(pmdval)))
 		goto nomap;
 	if (unlikely(pmd_bad(pmdval))) {
 		pmd_clear_bad(pmd);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index dd2a25fafb82..cbed91b09640 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -795,8 +795,8 @@ retry:
 		 * (This includes the case where the PMD used to be THP and
 		 * changed back to none after __pte_alloc().)
 		 */
-		if (unlikely(!pmd_present(dst_pmdval) || pmd_trans_huge(dst_pmdval) ||
-			     pmd_devmap(dst_pmdval))) {
+		if (unlikely(!pmd_present(dst_pmdval) ||
+				pmd_trans_huge(dst_pmdval))) {
 			err = -EEXIST;
 			break;
 		}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6698fadf5d04..c86a2495138a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3450,9 +3450,6 @@ static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned
 	if (!pmd_present(pmd) || is_huge_zero_pmd(pmd))
 		return -1;
 
-	if (WARN_ON_ONCE(pmd_devmap(pmd)))
-		return -1;
-
 	if (!pmd_young(pmd) && !mm_has_notifiers(vma->vm_mm))
 		return -1;
 
-- 
cgit v1.2.3


From d438d273417055241ebaaf1ba3be23459fc27cba Mon Sep 17 00:00:00 2001
From: Alistair Popple <apopple@nvidia.com>
Date: Thu, 19 Jun 2025 18:58:03 +1000
Subject: mm: remove devmap related functions and page table bits
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now that DAX and all other reference counts to ZONE_DEVICE pages are
managed normally, there is no need for the special devmap PTE/PMD/PUD
page table bits.  So drop all references to these, freeing up a
software-defined page table bit on architectures supporting it.

Link: https://lkml.kernel.org/r/6389398c32cc9daa3dfcaa9f79c7972525d310ce.1750323463.git-series.apopple@nvidia.com
Signed-off-by: Alistair Popple <apopple@nvidia.com>
Acked-by: Will Deacon <will@kernel.org> # arm64
Acked-by: David Hildenbrand <david@redhat.com>
Suggested-by: Chunyan Zhang <zhang.lyra@gmail.com>
Reviewed-by: Björn Töpel <bjorn@rivosinc.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Cc: Balbir Singh <balbirs@nvidia.com>
Cc: Björn Töpel <bjorn@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Deepak Gupta <debug@rivosinc.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Inki Dae <m.szyprowski@samsung.com>
Cc: John Groves <john@groves.net>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/mm/arch_pgtable_helpers.rst     |  6 ---
 arch/arm64/Kconfig                            |  1 -
 arch/arm64/include/asm/pgtable-prot.h         |  1 -
 arch/arm64/include/asm/pgtable.h              | 24 -----------
 arch/loongarch/Kconfig                        |  1 -
 arch/loongarch/include/asm/pgtable-bits.h     |  6 +--
 arch/loongarch/include/asm/pgtable.h          | 19 ---------
 arch/powerpc/Kconfig                          |  1 -
 arch/powerpc/include/asm/book3s/64/hash-4k.h  |  6 ---
 arch/powerpc/include/asm/book3s/64/hash-64k.h |  7 +---
 arch/powerpc/include/asm/book3s/64/pgtable.h  | 53 +-----------------------
 arch/powerpc/include/asm/book3s/64/radix.h    | 14 +------
 arch/riscv/Kconfig                            |  1 -
 arch/riscv/include/asm/pgtable-64.h           | 16 --------
 arch/riscv/include/asm/pgtable-bits.h         |  1 -
 arch/riscv/include/asm/pgtable.h              | 22 ----------
 arch/x86/Kconfig                              |  1 -
 arch/x86/include/asm/pgtable.h                | 51 +----------------------
 arch/x86/include/asm/pgtable_types.h          |  5 +--
 include/linux/mm.h                            |  7 ----
 include/linux/pgtable.h                       | 19 +--------
 mm/Kconfig                                    |  4 --
 mm/debug_vm_pgtable.c                         | 59 ---------------------------
 mm/hmm.c                                      |  3 +-
 mm/madvise.c                                  |  8 ++--
 25 files changed, 17 insertions(+), 319 deletions(-)

(limited to 'include')

diff --git a/Documentation/mm/arch_pgtable_helpers.rst b/Documentation/mm/arch_pgtable_helpers.rst
index af245161d8e7..c88c7fa665d6 100644
--- a/Documentation/mm/arch_pgtable_helpers.rst
+++ b/Documentation/mm/arch_pgtable_helpers.rst
@@ -30,8 +30,6 @@ PTE Page Table Helpers
 +---------------------------+--------------------------------------------------+
 | pte_protnone              | Tests a PROT_NONE PTE                            |
 +---------------------------+--------------------------------------------------+
-| pte_devmap                | Tests a ZONE_DEVICE mapped PTE                   |
-+---------------------------+--------------------------------------------------+
 | pte_soft_dirty            | Tests a soft dirty PTE                           |
 +---------------------------+--------------------------------------------------+
 | pte_swp_soft_dirty        | Tests a soft dirty swapped PTE                   |
@@ -104,8 +102,6 @@ PMD Page Table Helpers
 +---------------------------+--------------------------------------------------+
 | pmd_protnone              | Tests a PROT_NONE PMD                            |
 +---------------------------+--------------------------------------------------+
-| pmd_devmap                | Tests a ZONE_DEVICE mapped PMD                   |
-+---------------------------+--------------------------------------------------+
 | pmd_soft_dirty            | Tests a soft dirty PMD                           |
 +---------------------------+--------------------------------------------------+
 | pmd_swp_soft_dirty        | Tests a soft dirty swapped PMD                   |
@@ -177,8 +173,6 @@ PUD Page Table Helpers
 +---------------------------+--------------------------------------------------+
 | pud_write                 | Tests a writable PUD                             |
 +---------------------------+--------------------------------------------------+
-| pud_devmap                | Tests a ZONE_DEVICE mapped PUD                   |
-+---------------------------+--------------------------------------------------+
 | pud_mkyoung               | Creates a young PUD                              |
 +---------------------------+--------------------------------------------------+
 | pud_mkold                 | Creates an old PUD                               |
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 55fc331af337..94b48b1dae71 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -44,7 +44,6 @@ config ARM64
 	select ARCH_HAS_NONLEAF_PMD_YOUNG if ARM64_HAFT
 	select ARCH_HAS_PREEMPT_LAZY
 	select ARCH_HAS_PTDUMP
-	select ARCH_HAS_PTE_DEVMAP
 	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_HW_PTE_YOUNG
 	select ARCH_HAS_SETUP_DMA_OPS
diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h
index 7830d031742e..85dceb1c66f4 100644
--- a/arch/arm64/include/asm/pgtable-prot.h
+++ b/arch/arm64/include/asm/pgtable-prot.h
@@ -17,7 +17,6 @@
 #define PTE_SWP_EXCLUSIVE	(_AT(pteval_t, 1) << 2)	 /* only for swp ptes */
 #define PTE_DIRTY		(_AT(pteval_t, 1) << 55)
 #define PTE_SPECIAL		(_AT(pteval_t, 1) << 56)
-#define PTE_DEVMAP		(_AT(pteval_t, 1) << 57)
 
 /*
  * PTE_PRESENT_INVALID=1 & PTE_VALID=0 indicates that the pte's fields should be
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index e511f909f63c..ba63c8736666 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -190,7 +190,6 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
 #define pte_user(pte)		(!!(pte_val(pte) & PTE_USER))
 #define pte_user_exec(pte)	(!(pte_val(pte) & PTE_UXN))
 #define pte_cont(pte)		(!!(pte_val(pte) & PTE_CONT))
-#define pte_devmap(pte)		(!!(pte_val(pte) & PTE_DEVMAP))
 #define pte_tagged(pte)		((pte_val(pte) & PTE_ATTRINDX_MASK) == \
 				 PTE_ATTRINDX(MT_NORMAL_TAGGED))
 
@@ -372,11 +371,6 @@ static inline pmd_t pmd_mkcont(pmd_t pmd)
 	return __pmd(pmd_val(pmd) | PMD_SECT_CONT);
 }
 
-static inline pte_t pte_mkdevmap(pte_t pte)
-{
-	return set_pte_bit(pte, __pgprot(PTE_DEVMAP | PTE_SPECIAL));
-}
-
 #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
 static inline int pte_uffd_wp(pte_t pte)
 {
@@ -653,14 +647,6 @@ static inline pmd_t pmd_mkhuge(pmd_t pmd)
 	return __pmd((pmd_val(pmd) & ~mask) | val);
 }
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define pmd_devmap(pmd)		pte_devmap(pmd_pte(pmd))
-#endif
-static inline pmd_t pmd_mkdevmap(pmd_t pmd)
-{
-	return pte_pmd(set_pte_bit(pmd_pte(pmd), __pgprot(PTE_DEVMAP)));
-}
-
 #ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
 #define pmd_special(pte)	(!!((pmd_val(pte) & PTE_SPECIAL)))
 static inline pmd_t pmd_mkspecial(pmd_t pmd)
@@ -1302,16 +1288,6 @@ static inline int pmdp_set_access_flags(struct vm_area_struct *vma,
 	return __ptep_set_access_flags(vma, address, (pte_t *)pmdp,
 							pmd_pte(entry), dirty);
 }
-
-static inline int pud_devmap(pud_t pud)
-{
-	return 0;
-}
-
-static inline int pgd_devmap(pgd_t pgd)
-{
-	return 0;
-}
 #endif
 
 #ifdef CONFIG_PAGE_TABLE_CHECK
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 4b19f93379a1..edb3db230bac 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -25,7 +25,6 @@ config LOONGARCH
 	select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS
 	select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
 	select ARCH_HAS_PREEMPT_LAZY
-	select ARCH_HAS_PTE_DEVMAP
 	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_SET_MEMORY
 	select ARCH_HAS_SET_DIRECT_MAP
diff --git a/arch/loongarch/include/asm/pgtable-bits.h b/arch/loongarch/include/asm/pgtable-bits.h
index 7bbfb04a54cc..2fc3789220ac 100644
--- a/arch/loongarch/include/asm/pgtable-bits.h
+++ b/arch/loongarch/include/asm/pgtable-bits.h
@@ -22,7 +22,6 @@
 #define	_PAGE_PFN_SHIFT		12
 #define	_PAGE_SWP_EXCLUSIVE_SHIFT 23
 #define	_PAGE_PFN_END_SHIFT	48
-#define	_PAGE_DEVMAP_SHIFT	59
 #define	_PAGE_PRESENT_INVALID_SHIFT 60
 #define	_PAGE_NO_READ_SHIFT	61
 #define	_PAGE_NO_EXEC_SHIFT	62
@@ -36,7 +35,6 @@
 #define _PAGE_MODIFIED		(_ULCAST_(1) << _PAGE_MODIFIED_SHIFT)
 #define _PAGE_PROTNONE		(_ULCAST_(1) << _PAGE_PROTNONE_SHIFT)
 #define _PAGE_SPECIAL		(_ULCAST_(1) << _PAGE_SPECIAL_SHIFT)
-#define _PAGE_DEVMAP		(_ULCAST_(1) << _PAGE_DEVMAP_SHIFT)
 
 /* We borrow bit 23 to store the exclusive marker in swap PTEs. */
 #define _PAGE_SWP_EXCLUSIVE	(_ULCAST_(1) << _PAGE_SWP_EXCLUSIVE_SHIFT)
@@ -76,8 +74,8 @@
 #define __READABLE	(_PAGE_VALID)
 #define __WRITEABLE	(_PAGE_DIRTY | _PAGE_WRITE)
 
-#define _PAGE_CHG_MASK	(_PAGE_MODIFIED | _PAGE_SPECIAL | _PAGE_DEVMAP | _PFN_MASK | _CACHE_MASK | _PAGE_PLV)
-#define _HPAGE_CHG_MASK	(_PAGE_MODIFIED | _PAGE_SPECIAL | _PAGE_DEVMAP | _PFN_MASK | _CACHE_MASK | _PAGE_PLV | _PAGE_HUGE)
+#define _PAGE_CHG_MASK	(_PAGE_MODIFIED | _PAGE_SPECIAL | _PFN_MASK | _CACHE_MASK | _PAGE_PLV)
+#define _HPAGE_CHG_MASK	(_PAGE_MODIFIED | _PAGE_SPECIAL | _PFN_MASK | _CACHE_MASK | _PAGE_PLV | _PAGE_HUGE)
 
 #define PAGE_NONE	__pgprot(_PAGE_PROTNONE | _PAGE_NO_READ | \
 				 _PAGE_USER | _CACHE_CC)
diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h
index f2aeff544cee..bd128696e96d 100644
--- a/arch/loongarch/include/asm/pgtable.h
+++ b/arch/loongarch/include/asm/pgtable.h
@@ -409,9 +409,6 @@ static inline int pte_special(pte_t pte)	{ return pte_val(pte) & _PAGE_SPECIAL;
 static inline pte_t pte_mkspecial(pte_t pte)	{ pte_val(pte) |= _PAGE_SPECIAL; return pte; }
 #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
 
-static inline int pte_devmap(pte_t pte)		{ return !!(pte_val(pte) & _PAGE_DEVMAP); }
-static inline pte_t pte_mkdevmap(pte_t pte)	{ pte_val(pte) |= _PAGE_DEVMAP; return pte; }
-
 #define pte_accessible pte_accessible
 static inline unsigned long pte_accessible(struct mm_struct *mm, pte_t a)
 {
@@ -540,17 +537,6 @@ static inline pmd_t pmd_mkyoung(pmd_t pmd)
 	return pmd;
 }
 
-static inline int pmd_devmap(pmd_t pmd)
-{
-	return !!(pmd_val(pmd) & _PAGE_DEVMAP);
-}
-
-static inline pmd_t pmd_mkdevmap(pmd_t pmd)
-{
-	pmd_val(pmd) |= _PAGE_DEVMAP;
-	return pmd;
-}
-
 static inline struct page *pmd_page(pmd_t pmd)
 {
 	if (pmd_trans_huge(pmd))
@@ -606,11 +592,6 @@ static inline long pmd_protnone(pmd_t pmd)
 #define pmd_leaf(pmd)		((pmd_val(pmd) & _PAGE_HUGE) != 0)
 #define pud_leaf(pud)		((pud_val(pud) & _PAGE_HUGE) != 0)
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define pud_devmap(pud)		(0)
-#define pgd_devmap(pgd)		(0)
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-
 /*
  * We provide our own get_unmapped area to cope with the virtual aliasing
  * constraints placed on us by the cache architecture.
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index c3e0cc83f120..7a555c1d3171 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -149,7 +149,6 @@ config PPC
 	select ARCH_HAS_PMEM_API
 	select ARCH_HAS_PREEMPT_LAZY
 	select ARCH_HAS_PTDUMP
-	select ARCH_HAS_PTE_DEVMAP		if PPC_BOOK3S_64
 	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_SCALED_CPUTIME		if VIRT_CPU_ACCOUNTING_NATIVE && PPC_BOOK3S_64
 	select ARCH_HAS_SET_MEMORY
diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h
index aa90a048f319..7132392fa7cd 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
@@ -168,12 +168,6 @@ extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
 extern int hash__has_transparent_hugepage(void);
 #endif
 
-static inline pmd_t hash__pmd_mkdevmap(pmd_t pmd)
-{
-	BUG();
-	return pmd;
-}
-
 #endif /* !__ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_BOOK3S_64_HASH_4K_H */
diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index 0bf6fd0bf42a..0fb5b7da9478 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -259,7 +259,7 @@ static inline void mark_hpte_slot_valid(unsigned char *hpte_slot_array,
  */
 static inline int hash__pmd_trans_huge(pmd_t pmd)
 {
-	return !!((pmd_val(pmd) & (_PAGE_PTE | H_PAGE_THP_HUGE | _PAGE_DEVMAP)) ==
+	return !!((pmd_val(pmd) & (_PAGE_PTE | H_PAGE_THP_HUGE)) ==
 		  (_PAGE_PTE | H_PAGE_THP_HUGE));
 }
 
@@ -281,11 +281,6 @@ extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
 extern int hash__has_transparent_hugepage(void);
 #endif /*  CONFIG_TRANSPARENT_HUGEPAGE */
 
-static inline pmd_t hash__pmd_mkdevmap(pmd_t pmd)
-{
-	return __pmd(pmd_val(pmd) | (_PAGE_PTE | H_PAGE_THP_HUGE | _PAGE_DEVMAP));
-}
-
 #endif	/* __ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_BOOK3S_64_HASH_64K_H */
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index a2ddcbb3fcb9..c19800365315 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -88,7 +88,6 @@
 
 #define _PAGE_SOFT_DIRTY	_RPAGE_SW3 /* software: software dirty tracking */
 #define _PAGE_SPECIAL		_RPAGE_SW2 /* software: special page */
-#define _PAGE_DEVMAP		_RPAGE_SW1 /* software: ZONE_DEVICE page */
 
 /*
  * Drivers request for cache inhibited pte mapping using _PAGE_NO_CACHE
@@ -109,7 +108,7 @@
  */
 #define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \
 			 _PAGE_ACCESSED | H_PAGE_THP_HUGE | _PAGE_PTE | \
-			 _PAGE_SOFT_DIRTY | _PAGE_DEVMAP)
+			 _PAGE_SOFT_DIRTY)
 /*
  * user access blocked by key
  */
@@ -123,7 +122,7 @@
  */
 #define _PAGE_CHG_MASK	(PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \
 			 _PAGE_ACCESSED | _PAGE_SPECIAL | _PAGE_PTE |	\
-			 _PAGE_SOFT_DIRTY | _PAGE_DEVMAP)
+			 _PAGE_SOFT_DIRTY)
 
 /*
  * We define 2 sets of base prot bits, one for basic pages (ie,
@@ -609,24 +608,6 @@ static inline pte_t pte_mkhuge(pte_t pte)
 	return pte;
 }
 
-static inline pte_t pte_mkdevmap(pte_t pte)
-{
-	return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_SPECIAL | _PAGE_DEVMAP));
-}
-
-/*
- * This is potentially called with a pmd as the argument, in which case it's not
- * safe to check _PAGE_DEVMAP unless we also confirm that _PAGE_PTE is set.
- * That's because the bit we use for _PAGE_DEVMAP is not reserved for software
- * use in page directory entries (ie. non-ptes).
- */
-static inline int pte_devmap(pte_t pte)
-{
-	__be64 mask = cpu_to_be64(_PAGE_DEVMAP | _PAGE_PTE);
-
-	return (pte_raw(pte) & mask) == mask;
-}
-
 static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 {
 	/* FIXME!! check whether this need to be a conditional */
@@ -1379,36 +1360,6 @@ static inline bool arch_needs_pgtable_deposit(void)
 }
 extern void serialize_against_pte_lookup(struct mm_struct *mm);
 
-
-static inline pmd_t pmd_mkdevmap(pmd_t pmd)
-{
-	if (radix_enabled())
-		return radix__pmd_mkdevmap(pmd);
-	return hash__pmd_mkdevmap(pmd);
-}
-
-static inline pud_t pud_mkdevmap(pud_t pud)
-{
-	if (radix_enabled())
-		return radix__pud_mkdevmap(pud);
-	BUG();
-	return pud;
-}
-
-static inline int pmd_devmap(pmd_t pmd)
-{
-	return pte_devmap(pmd_pte(pmd));
-}
-
-static inline int pud_devmap(pud_t pud)
-{
-	return pte_devmap(pud_pte(pud));
-}
-
-static inline int pgd_devmap(pgd_t pgd)
-{
-	return 0;
-}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index 8f55ff74bb68..df23a8267e4d 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -264,7 +264,7 @@ static inline int radix__p4d_bad(p4d_t p4d)
 
 static inline int radix__pmd_trans_huge(pmd_t pmd)
 {
-	return (pmd_val(pmd) & (_PAGE_PTE | _PAGE_DEVMAP)) == _PAGE_PTE;
+	return (pmd_val(pmd) & _PAGE_PTE) == _PAGE_PTE;
 }
 
 static inline pmd_t radix__pmd_mkhuge(pmd_t pmd)
@@ -274,7 +274,7 @@ static inline pmd_t radix__pmd_mkhuge(pmd_t pmd)
 
 static inline int radix__pud_trans_huge(pud_t pud)
 {
-	return (pud_val(pud) & (_PAGE_PTE | _PAGE_DEVMAP)) == _PAGE_PTE;
+	return (pud_val(pud) & _PAGE_PTE) == _PAGE_PTE;
 }
 
 static inline pud_t radix__pud_mkhuge(pud_t pud)
@@ -315,16 +315,6 @@ static inline int radix__has_transparent_pud_hugepage(void)
 }
 #endif
 
-static inline pmd_t radix__pmd_mkdevmap(pmd_t pmd)
-{
-	return __pmd(pmd_val(pmd) | (_PAGE_PTE | _PAGE_DEVMAP));
-}
-
-static inline pud_t radix__pud_mkdevmap(pud_t pud)
-{
-	return __pud(pud_val(pud) | (_PAGE_PTE | _PAGE_DEVMAP));
-}
-
 struct vmem_altmap;
 struct dev_pagemap;
 extern int __meminit radix__vmemmap_create_mapping(unsigned long start,
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index d71ea0f4466f..23df26f39472 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -46,7 +46,6 @@ config RISCV
 	select ARCH_HAS_PREEMPT_LAZY
 	select ARCH_HAS_PREPARE_SYNC_CORE_CMD
 	select ARCH_HAS_PTDUMP if MMU
-	select ARCH_HAS_PTE_DEVMAP if 64BIT && MMU
 	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_SET_DIRECT_MAP if MMU
 	select ARCH_HAS_SET_MEMORY if MMU
diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h
index 7de05db7d3bd..1018d2216901 100644
--- a/arch/riscv/include/asm/pgtable-64.h
+++ b/arch/riscv/include/asm/pgtable-64.h
@@ -397,24 +397,8 @@ static inline struct page *pgd_page(pgd_t pgd)
 p4d_t *p4d_offset(pgd_t *pgd, unsigned long address);
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static inline int pte_devmap(pte_t pte);
 static inline pte_t pmd_pte(pmd_t pmd);
 static inline pte_t pud_pte(pud_t pud);
-
-static inline int pmd_devmap(pmd_t pmd)
-{
-	return pte_devmap(pmd_pte(pmd));
-}
-
-static inline int pud_devmap(pud_t pud)
-{
-	return pte_devmap(pud_pte(pud));
-}
-
-static inline int pgd_devmap(pgd_t pgd)
-{
-	return 0;
-}
 #endif
 
 #endif /* _ASM_RISCV_PGTABLE_64_H */
diff --git a/arch/riscv/include/asm/pgtable-bits.h b/arch/riscv/include/asm/pgtable-bits.h
index a8f5205cea54..179bd4afece4 100644
--- a/arch/riscv/include/asm/pgtable-bits.h
+++ b/arch/riscv/include/asm/pgtable-bits.h
@@ -19,7 +19,6 @@
 #define _PAGE_SOFT      (3 << 8)    /* Reserved for software */
 
 #define _PAGE_SPECIAL   (1 << 8)    /* RSW: 0x1 */
-#define _PAGE_DEVMAP    (1 << 9)    /* RSW, devmap */
 #define _PAGE_TABLE     _PAGE_PRESENT
 
 /*
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 5bd5aae60d53..91697fbf1f90 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -409,13 +409,6 @@ static inline int pte_special(pte_t pte)
 	return pte_val(pte) & _PAGE_SPECIAL;
 }
 
-#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
-static inline int pte_devmap(pte_t pte)
-{
-	return pte_val(pte) & _PAGE_DEVMAP;
-}
-#endif
-
 /* static inline pte_t pte_rdprotect(pte_t pte) */
 
 static inline pte_t pte_wrprotect(pte_t pte)
@@ -457,11 +450,6 @@ static inline pte_t pte_mkspecial(pte_t pte)
 	return __pte(pte_val(pte) | _PAGE_SPECIAL);
 }
 
-static inline pte_t pte_mkdevmap(pte_t pte)
-{
-	return __pte(pte_val(pte) | _PAGE_DEVMAP);
-}
-
 static inline pte_t pte_mkhuge(pte_t pte)
 {
 	return pte;
@@ -790,11 +778,6 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd)
 	return pte_pmd(pte_mkdirty(pmd_pte(pmd)));
 }
 
-static inline pmd_t pmd_mkdevmap(pmd_t pmd)
-{
-	return pte_pmd(pte_mkdevmap(pmd_pte(pmd)));
-}
-
 #ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
 static inline bool pmd_special(pmd_t pmd)
 {
@@ -946,11 +929,6 @@ static inline pud_t pud_mkhuge(pud_t pud)
 	return pud;
 }
 
-static inline pud_t pud_mkdevmap(pud_t pud)
-{
-	return pte_pud(pte_mkdevmap(pud_pte(pud)));
-}
-
 static inline int pudp_set_access_flags(struct vm_area_struct *vma,
 					unsigned long address, pud_t *pudp,
 					pud_t entry, int dirty)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 71019b3b54ea..bb9b63d76a19 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -101,7 +101,6 @@ config X86
 	select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
 	select ARCH_HAS_PMEM_API		if X86_64
 	select ARCH_HAS_PREEMPT_LAZY
-	select ARCH_HAS_PTE_DEVMAP		if X86_64
 	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_HW_PTE_YOUNG
 	select ARCH_HAS_NONLEAF_PMD_YOUNG	if PGTABLE_LEVELS > 2
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 97954c936c54..e33df3da6980 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -301,16 +301,15 @@ static inline bool pmd_leaf(pmd_t pte)
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-/* NOTE: when predicate huge page, consider also pmd_devmap, or use pmd_leaf */
 static inline int pmd_trans_huge(pmd_t pmd)
 {
-	return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE;
+	return (pmd_val(pmd) & _PAGE_PSE) == _PAGE_PSE;
 }
 
 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
 static inline int pud_trans_huge(pud_t pud)
 {
-	return (pud_val(pud) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE;
+	return (pud_val(pud) & _PAGE_PSE) == _PAGE_PSE;
 }
 #endif
 
@@ -320,24 +319,6 @@ static inline int has_transparent_hugepage(void)
 	return boot_cpu_has(X86_FEATURE_PSE);
 }
 
-#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
-static inline int pmd_devmap(pmd_t pmd)
-{
-	return !!(pmd_val(pmd) & _PAGE_DEVMAP);
-}
-
-#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
-static inline int pud_devmap(pud_t pud)
-{
-	return !!(pud_val(pud) & _PAGE_DEVMAP);
-}
-#else
-static inline int pud_devmap(pud_t pud)
-{
-	return 0;
-}
-#endif
-
 #ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
 static inline bool pmd_special(pmd_t pmd)
 {
@@ -361,12 +342,6 @@ static inline pud_t pud_mkspecial(pud_t pud)
 	return pud_set_flags(pud, _PAGE_SPECIAL);
 }
 #endif	/* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */
-
-static inline int pgd_devmap(pgd_t pgd)
-{
-	return 0;
-}
-#endif
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
@@ -527,11 +502,6 @@ static inline pte_t pte_mkspecial(pte_t pte)
 	return pte_set_flags(pte, _PAGE_SPECIAL);
 }
 
-static inline pte_t pte_mkdevmap(pte_t pte)
-{
-	return pte_set_flags(pte, _PAGE_SPECIAL|_PAGE_DEVMAP);
-}
-
 /* See comments above mksaveddirty_shift() */
 static inline pmd_t pmd_mksaveddirty(pmd_t pmd)
 {
@@ -603,11 +573,6 @@ static inline pmd_t pmd_mkwrite_shstk(pmd_t pmd)
 	return pmd_set_flags(pmd, _PAGE_DIRTY);
 }
 
-static inline pmd_t pmd_mkdevmap(pmd_t pmd)
-{
-	return pmd_set_flags(pmd, _PAGE_DEVMAP);
-}
-
 static inline pmd_t pmd_mkhuge(pmd_t pmd)
 {
 	return pmd_set_flags(pmd, _PAGE_PSE);
@@ -673,11 +638,6 @@ static inline pud_t pud_mkdirty(pud_t pud)
 	return pud_mksaveddirty(pud);
 }
 
-static inline pud_t pud_mkdevmap(pud_t pud)
-{
-	return pud_set_flags(pud, _PAGE_DEVMAP);
-}
-
 static inline pud_t pud_mkhuge(pud_t pud)
 {
 	return pud_set_flags(pud, _PAGE_PSE);
@@ -1008,13 +968,6 @@ static inline int pte_present(pte_t a)
 	return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
 }
 
-#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
-static inline int pte_devmap(pte_t a)
-{
-	return (pte_flags(a) & _PAGE_DEVMAP) == _PAGE_DEVMAP;
-}
-#endif
-
 #define pte_accessible pte_accessible
 static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
 {
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index b74ec5c3643b..f63ae8d0aac8 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -34,7 +34,6 @@
 #define _PAGE_BIT_UFFD_WP	_PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */
 #define _PAGE_BIT_SOFT_DIRTY	_PAGE_BIT_SOFTW3 /* software dirty tracking */
 #define _PAGE_BIT_KERNEL_4K	_PAGE_BIT_SOFTW3 /* page must not be converted to large */
-#define _PAGE_BIT_DEVMAP	_PAGE_BIT_SOFTW4
 
 #ifdef CONFIG_X86_64
 #define _PAGE_BIT_SAVED_DIRTY	_PAGE_BIT_SOFTW5 /* Saved Dirty bit (leaf) */
@@ -121,11 +120,9 @@
 
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 #define _PAGE_NX	(_AT(pteval_t, 1) << _PAGE_BIT_NX)
-#define _PAGE_DEVMAP	(_AT(u64, 1) << _PAGE_BIT_DEVMAP)
 #define _PAGE_SOFTW4	(_AT(pteval_t, 1) << _PAGE_BIT_SOFTW4)
 #else
 #define _PAGE_NX	(_AT(pteval_t, 0))
-#define _PAGE_DEVMAP	(_AT(pteval_t, 0))
 #define _PAGE_SOFTW4	(_AT(pteval_t, 0))
 #endif
 
@@ -154,7 +151,7 @@
 #define _COMMON_PAGE_CHG_MASK	(PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT |	\
 				 _PAGE_SPECIAL | _PAGE_ACCESSED |	\
 				 _PAGE_DIRTY_BITS | _PAGE_SOFT_DIRTY |	\
-				 _PAGE_DEVMAP | _PAGE_CC | _PAGE_UFFD_WP)
+				 _PAGE_CC | _PAGE_UFFD_WP)
 #define _PAGE_CHG_MASK	(_COMMON_PAGE_CHG_MASK | _PAGE_PAT)
 #define _HPAGE_CHG_MASK (_COMMON_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_PAT_LARGE)
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index fc365420dfa8..4d833f159988 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2704,13 +2704,6 @@ static inline pud_t pud_mkspecial(pud_t pud)
 }
 #endif	/* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */
 
-#ifndef CONFIG_ARCH_HAS_PTE_DEVMAP
-static inline int pte_devmap(pte_t pte)
-{
-	return 0;
-}
-#endif
-
 extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
 			       spinlock_t **ptl);
 static inline pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index ffcd966cf2d4..cf1515c163e2 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1643,21 +1643,6 @@ static inline int pud_write(pud_t pud)
 }
 #endif /* pud_write */
 
-#if !defined(CONFIG_ARCH_HAS_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE)
-static inline int pmd_devmap(pmd_t pmd)
-{
-	return 0;
-}
-static inline int pud_devmap(pud_t pud)
-{
-	return 0;
-}
-static inline int pgd_devmap(pgd_t pgd)
-{
-	return 0;
-}
-#endif
-
 #if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \
 	!defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
 static inline int pud_trans_huge(pud_t pud)
@@ -1912,8 +1897,8 @@ typedef unsigned int pgtbl_mod_mask;
  * - It should contain a huge PFN, which points to a huge page larger than
  *   PAGE_SIZE of the platform.  The PFN format isn't important here.
  *
- * - It should cover all kinds of huge mappings (e.g., pXd_trans_huge(),
- *   pXd_devmap(), or hugetlb mappings).
+ * - It should cover all kinds of huge mappings (i.e. pXd_trans_huge()
+ *   or hugetlb mappings).
  */
 #ifndef pgd_leaf
 #define pgd_leaf(x)	false
diff --git a/mm/Kconfig b/mm/Kconfig
index 065b1f19dd99..d5d4eca947a6 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1117,9 +1117,6 @@ config ARCH_HAS_CURRENT_STACK_POINTER
 	  register alias named "current_stack_pointer", this config can be
 	  selected.
 
-config ARCH_HAS_PTE_DEVMAP
-	bool
-
 config ARCH_HAS_ZONE_DMA_SET
 	bool
 
@@ -1137,7 +1134,6 @@ config ZONE_DEVICE
 	depends on MEMORY_HOTPLUG
 	depends on MEMORY_HOTREMOVE
 	depends on SPARSEMEM_VMEMMAP
-	depends on ARCH_HAS_PTE_DEVMAP
 	select XARRAY_MULTI
 
 	help
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 7731b238b534..d84d0c49012f 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -348,12 +348,6 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args)
 	vaddr &= HPAGE_PUD_MASK;
 
 	pud = pfn_pud(args->pud_pfn, args->page_prot);
-	/*
-	 * Some architectures have debug checks to make sure
-	 * huge pud mapping are only found with devmap entries
-	 * For now test with only devmap entries.
-	 */
-	pud = pud_mkdevmap(pud);
 	set_pud_at(args->mm, vaddr, args->pudp, pud);
 	flush_dcache_page(page);
 	pudp_set_wrprotect(args->mm, vaddr, args->pudp);
@@ -366,7 +360,6 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args)
 	WARN_ON(!pud_none(pud));
 #endif /* __PAGETABLE_PMD_FOLDED */
 	pud = pfn_pud(args->pud_pfn, args->page_prot);
-	pud = pud_mkdevmap(pud);
 	pud = pud_wrprotect(pud);
 	pud = pud_mkclean(pud);
 	set_pud_at(args->mm, vaddr, args->pudp, pud);
@@ -384,7 +377,6 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args)
 #endif /* __PAGETABLE_PMD_FOLDED */
 
 	pud = pfn_pud(args->pud_pfn, args->page_prot);
-	pud = pud_mkdevmap(pud);
 	pud = pud_mkyoung(pud);
 	set_pud_at(args->mm, vaddr, args->pudp, pud);
 	flush_dcache_page(page);
@@ -693,53 +685,6 @@ static void __init pmd_protnone_tests(struct pgtable_debug_args *args)
 static void __init pmd_protnone_tests(struct pgtable_debug_args *args) { }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
-#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
-static void __init pte_devmap_tests(struct pgtable_debug_args *args)
-{
-	pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
-
-	pr_debug("Validating PTE devmap\n");
-	WARN_ON(!pte_devmap(pte_mkdevmap(pte)));
-}
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void __init pmd_devmap_tests(struct pgtable_debug_args *args)
-{
-	pmd_t pmd;
-
-	if (!has_transparent_hugepage())
-		return;
-
-	pr_debug("Validating PMD devmap\n");
-	pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot);
-	WARN_ON(!pmd_devmap(pmd_mkdevmap(pmd)));
-}
-
-#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
-static void __init pud_devmap_tests(struct pgtable_debug_args *args)
-{
-	pud_t pud;
-
-	if (!has_transparent_pud_hugepage())
-		return;
-
-	pr_debug("Validating PUD devmap\n");
-	pud = pfn_pud(args->fixed_pud_pfn, args->page_prot);
-	WARN_ON(!pud_devmap(pud_mkdevmap(pud)));
-}
-#else  /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
-static void __init pud_devmap_tests(struct pgtable_debug_args *args) { }
-#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
-#else  /* CONFIG_TRANSPARENT_HUGEPAGE */
-static void __init pmd_devmap_tests(struct pgtable_debug_args *args) { }
-static void __init pud_devmap_tests(struct pgtable_debug_args *args) { }
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-#else
-static void __init pte_devmap_tests(struct pgtable_debug_args *args) { }
-static void __init pmd_devmap_tests(struct pgtable_debug_args *args) { }
-static void __init pud_devmap_tests(struct pgtable_debug_args *args) { }
-#endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */
-
 static void __init pte_soft_dirty_tests(struct pgtable_debug_args *args)
 {
 	pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
@@ -1333,10 +1278,6 @@ static int __init debug_vm_pgtable(void)
 	pte_protnone_tests(&args);
 	pmd_protnone_tests(&args);
 
-	pte_devmap_tests(&args);
-	pmd_devmap_tests(&args);
-	pud_devmap_tests(&args);
-
 	pte_soft_dirty_tests(&args);
 	pmd_soft_dirty_tests(&args);
 	pte_swap_soft_dirty_tests(&args);
diff --git a/mm/hmm.c b/mm/hmm.c
index 62d3082dc55c..f2415b4b2cdd 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -405,8 +405,7 @@ again:
 	return 0;
 }
 
-#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \
-    defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+#if defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
 static inline unsigned long pud_to_hmm_pfn_flags(struct hmm_range *range,
 						 pud_t pud)
 {
diff --git a/mm/madvise.c b/mm/madvise.c
index 92f427b1b330..070132f9842b 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1069,7 +1069,7 @@ static int guard_install_pud_entry(pud_t *pud, unsigned long addr,
 	pud_t pudval = pudp_get(pud);
 
 	/* If huge return >0 so we abort the operation + zap. */
-	return pud_trans_huge(pudval) || pud_devmap(pudval);
+	return pud_trans_huge(pudval);
 }
 
 static int guard_install_pmd_entry(pmd_t *pmd, unsigned long addr,
@@ -1078,7 +1078,7 @@ static int guard_install_pmd_entry(pmd_t *pmd, unsigned long addr,
 	pmd_t pmdval = pmdp_get(pmd);
 
 	/* If huge return >0 so we abort the operation + zap. */
-	return pmd_trans_huge(pmdval) || pmd_devmap(pmdval);
+	return pmd_trans_huge(pmdval);
 }
 
 static int guard_install_pte_entry(pte_t *pte, unsigned long addr,
@@ -1189,7 +1189,7 @@ static int guard_remove_pud_entry(pud_t *pud, unsigned long addr,
 	pud_t pudval = pudp_get(pud);
 
 	/* If huge, cannot have guard pages present, so no-op - skip. */
-	if (pud_trans_huge(pudval) || pud_devmap(pudval))
+	if (pud_trans_huge(pudval))
 		walk->action = ACTION_CONTINUE;
 
 	return 0;
@@ -1201,7 +1201,7 @@ static int guard_remove_pmd_entry(pmd_t *pmd, unsigned long addr,
 	pmd_t pmdval = pmdp_get(pmd);
 
 	/* If huge, cannot have guard pages present, so no-op - skip. */
-	if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval))
+	if (pmd_trans_huge(pmdval))
 		walk->action = ACTION_CONTINUE;
 
 	return 0;
-- 
cgit v1.2.3


From 984921edea68bf24bcc87e1317bfc90451ff46c6 Mon Sep 17 00:00:00 2001
From: Alistair Popple <apopple@nvidia.com>
Date: Thu, 19 Jun 2025 18:58:04 +1000
Subject: mm: remove PFN_DEV, PFN_MAP, PFN_SPECIAL, PFN_SG_CHAIN and
 PFN_SG_LAST
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The PFN_MAP flag is no longer used for anything, so remove it.  The
PFN_SG_CHAIN and PFN_SG_LAST flags never appear to have been used so also
remove them.  The last user of PFN_SPECIAL was removed by 653d7825c149
("dcssblk: mark DAX broken, remove FS_DAX_LIMITED support").

Users of PFN_DEV were removed earlier in this series by "mm: Remove
remaining uses of PFN_DEV".

Link: https://lkml.kernel.org/r/670b3950d70b4d97b905bb597dadfd3633de4314.1750323463.git-series.apopple@nvidia.com
Signed-off-by: Alistair Popple <apopple@nvidia.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Cc: Balbir Singh <balbirs@nvidia.com>
Cc: Björn Töpel <bjorn@kernel.org>
Cc: Björn Töpel <bjorn@rivosinc.com>
Cc: Chunyan Zhang <zhang.lyra@gmail.com>
Cc: Deepak Gupta <debug@rivosinc.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Inki Dae <m.szyprowski@samsung.com>
Cc: John Groves <john@groves.net>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/pfn_t.h             | 50 ++-------------------------------------
 mm/memory.c                       |  2 --
 tools/testing/nvdimm/test/iomap.c |  4 ----
 3 files changed, 2 insertions(+), 54 deletions(-)

(limited to 'include')

diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h
index 2d9148221e9a..2c002936a0f6 100644
--- a/include/linux/pfn_t.h
+++ b/include/linux/pfn_t.h
@@ -5,26 +5,11 @@
 
 /*
  * PFN_FLAGS_MASK - mask of all the possible valid pfn_t flags
- * PFN_SG_CHAIN - pfn is a pointer to the next scatterlist entry
- * PFN_SG_LAST - pfn references a page and is the last scatterlist entry
- * PFN_DEV - pfn is not covered by system memmap by default
- * PFN_MAP - pfn has a dynamic page mapping established by a device driver
- * PFN_SPECIAL - for CONFIG_FS_DAX_LIMITED builds to allow XIP, but not
- *		 get_user_pages
  */
 #define PFN_FLAGS_MASK (((u64) (~PAGE_MASK)) << (BITS_PER_LONG_LONG - PAGE_SHIFT))
-#define PFN_SG_CHAIN (1ULL << (BITS_PER_LONG_LONG - 1))
-#define PFN_SG_LAST (1ULL << (BITS_PER_LONG_LONG - 2))
-#define PFN_DEV (1ULL << (BITS_PER_LONG_LONG - 3))
-#define PFN_MAP (1ULL << (BITS_PER_LONG_LONG - 4))
-#define PFN_SPECIAL (1ULL << (BITS_PER_LONG_LONG - 5))
 
 #define PFN_FLAGS_TRACE \
-	{ PFN_SPECIAL,	"SPECIAL" }, \
-	{ PFN_SG_CHAIN,	"SG_CHAIN" }, \
-	{ PFN_SG_LAST,	"SG_LAST" }, \
-	{ PFN_DEV,	"DEV" }, \
-	{ PFN_MAP,	"MAP" }
+	{ }
 
 static inline pfn_t __pfn_to_pfn_t(unsigned long pfn, u64 flags)
 {
@@ -46,7 +31,7 @@ static inline pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags)
 
 static inline bool pfn_t_has_page(pfn_t pfn)
 {
-	return (pfn.val & PFN_MAP) == PFN_MAP || (pfn.val & PFN_DEV) == 0;
+	return true;
 }
 
 static inline unsigned long pfn_t_to_pfn(pfn_t pfn)
@@ -97,35 +82,4 @@ static inline pud_t pfn_t_pud(pfn_t pfn, pgprot_t pgprot)
 #endif
 #endif
 
-#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
-static inline bool pfn_t_devmap(pfn_t pfn)
-{
-	const u64 flags = PFN_DEV|PFN_MAP;
-
-	return (pfn.val & flags) == flags;
-}
-#else
-static inline bool pfn_t_devmap(pfn_t pfn)
-{
-	return false;
-}
-pte_t pte_mkdevmap(pte_t pte);
-pmd_t pmd_mkdevmap(pmd_t pmd);
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
-	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
-pud_t pud_mkdevmap(pud_t pud);
-#endif
-#endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */
-
-#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
-static inline bool pfn_t_special(pfn_t pfn)
-{
-	return (pfn.val & PFN_SPECIAL) == PFN_SPECIAL;
-}
-#else
-static inline bool pfn_t_special(pfn_t pfn)
-{
-	return false;
-}
-#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
 #endif /* _LINUX_PFN_T_H_ */
diff --git a/mm/memory.c b/mm/memory.c
index 150bb62855b1..e932a007af4c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2581,8 +2581,6 @@ static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn, bool mkwrite)
 	/* these checks mirror the abort conditions in vm_normal_page */
 	if (vma->vm_flags & VM_MIXEDMAP)
 		return true;
-	if (pfn_t_special(pfn))
-		return true;
 	if (is_zero_pfn(pfn_t_to_pfn(pfn)))
 		return true;
 	return false;
diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c
index e4313726fae3..ddceb04b4a9a 100644
--- a/tools/testing/nvdimm/test/iomap.c
+++ b/tools/testing/nvdimm/test/iomap.c
@@ -137,10 +137,6 @@ EXPORT_SYMBOL_GPL(__wrap_devm_memremap_pages);
 
 pfn_t __wrap_phys_to_pfn_t(phys_addr_t addr, unsigned long flags)
 {
-	struct nfit_test_resource *nfit_res = get_nfit_res(addr);
-
-	if (nfit_res)
-		flags &= ~PFN_MAP;
         return phys_to_pfn_t(addr, flags);
 }
 EXPORT_SYMBOL(__wrap_phys_to_pfn_t);
-- 
cgit v1.2.3


From 21aa65bf82a78c1e70447a45a85e533689b7f1a7 Mon Sep 17 00:00:00 2001
From: Alistair Popple <apopple@nvidia.com>
Date: Thu, 19 Jun 2025 18:58:05 +1000
Subject: mm: remove callers of pfn_t functionality
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

All PFN_* pfn_t flags have been removed.  Therefore there is no longer a
need for the pfn_t type and all uses can be replaced with normal pfns.

Link: https://lkml.kernel.org/r/bbedfa576c9822f8032494efbe43544628698b1f.1750323463.git-series.apopple@nvidia.com
Signed-off-by: Alistair Popple <apopple@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Balbir Singh <balbirs@nvidia.com>
Cc: Björn Töpel <bjorn@kernel.org>
Cc: Björn Töpel <bjorn@rivosinc.com>
Cc: Chunyan Zhang <zhang.lyra@gmail.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Deepak Gupta <debug@rivosinc.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Inki Dae <m.szyprowski@samsung.com>
Cc: John Groves <john@groves.net>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/x86/mm/pat/memtype.c                |  1 -
 drivers/dax/device.c                     | 23 +++++----
 drivers/dax/hmem/hmem.c                  |  1 -
 drivers/dax/kmem.c                       |  1 -
 drivers/dax/pmem.c                       |  1 -
 drivers/dax/super.c                      |  3 +-
 drivers/gpu/drm/exynos/exynos_drm_gem.c  |  1 -
 drivers/gpu/drm/gma500/fbdev.c           |  3 +-
 drivers/gpu/drm/i915/gem/i915_gem_mman.c |  1 -
 drivers/gpu/drm/msm/msm_gem.c            |  1 -
 drivers/gpu/drm/omapdrm/omap_gem.c       |  6 +--
 drivers/gpu/drm/v3d/v3d_bo.c             |  1 -
 drivers/hwtracing/intel_th/msu.c         |  3 +-
 drivers/md/dm-linear.c                   |  2 +-
 drivers/md/dm-log-writes.c               |  2 +-
 drivers/md/dm-stripe.c                   |  2 +-
 drivers/md/dm-target.c                   |  2 +-
 drivers/md/dm-writecache.c               | 11 ++---
 drivers/md/dm.c                          |  2 +-
 drivers/nvdimm/pmem.c                    |  8 ++-
 drivers/nvdimm/pmem.h                    |  4 +-
 drivers/s390/block/dcssblk.c             |  9 ++--
 drivers/vfio/pci/vfio_pci_core.c         |  5 +-
 fs/cramfs/inode.c                        |  5 +-
 fs/dax.c                                 | 50 +++++++++----------
 fs/ext4/file.c                           |  2 +-
 fs/fuse/dax.c                            |  3 +-
 fs/fuse/virtio_fs.c                      |  5 +-
 fs/xfs/xfs_file.c                        |  2 +-
 include/linux/dax.h                      |  9 ++--
 include/linux/device-mapper.h            |  2 +-
 include/linux/huge_mm.h                  |  6 ++-
 include/linux/mm.h                       |  4 +-
 include/linux/pfn.h                      |  9 ----
 include/linux/pfn_t.h                    | 85 --------------------------------
 mm/debug_vm_pgtable.c                    |  1 -
 mm/huge_memory.c                         | 21 ++++----
 mm/memory.c                              | 31 ++++++------
 mm/memremap.c                            |  1 -
 mm/migrate.c                             |  1 -
 tools/testing/nvdimm/pmem-dax.c          |  6 +--
 tools/testing/nvdimm/test/iomap.c        |  7 ---
 tools/testing/nvdimm/test/nfit_test.h    |  1 -
 43 files changed, 109 insertions(+), 235 deletions(-)
 delete mode 100644 include/linux/pfn_t.h

(limited to 'include')

diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index 2e7923844afe..c09284302dd3 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -36,7 +36,6 @@
 #include <linux/debugfs.h>
 #include <linux/ioport.h>
 #include <linux/kernel.h>
-#include <linux/pfn_t.h>
 #include <linux/slab.h>
 #include <linux/io.h>
 #include <linux/mm.h>
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 328231cfb028..2bb40a6060af 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -4,7 +4,6 @@
 #include <linux/pagemap.h>
 #include <linux/module.h>
 #include <linux/device.h>
-#include <linux/pfn_t.h>
 #include <linux/cdev.h>
 #include <linux/slab.h>
 #include <linux/dax.h>
@@ -73,7 +72,7 @@ __weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff,
 	return -1;
 }
 
-static void dax_set_mapping(struct vm_fault *vmf, pfn_t pfn,
+static void dax_set_mapping(struct vm_fault *vmf, unsigned long pfn,
 			      unsigned long fault_size)
 {
 	unsigned long i, nr_pages = fault_size / PAGE_SIZE;
@@ -89,7 +88,7 @@ static void dax_set_mapping(struct vm_fault *vmf, pfn_t pfn,
 			ALIGN_DOWN(vmf->address, fault_size));
 
 	for (i = 0; i < nr_pages; i++) {
-		struct folio *folio = pfn_folio(pfn_t_to_pfn(pfn) + i);
+		struct folio *folio = pfn_folio(pfn + i);
 
 		if (folio->mapping)
 			continue;
@@ -104,7 +103,7 @@ static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax,
 {
 	struct device *dev = &dev_dax->dev;
 	phys_addr_t phys;
-	pfn_t pfn;
+	unsigned long pfn;
 	unsigned int fault_size = PAGE_SIZE;
 
 	if (check_vma(dev_dax, vmf->vma, __func__))
@@ -125,11 +124,11 @@ static vm_fault_t __dev_dax_pte_fault(struct dev_dax *dev_dax,
 		return VM_FAULT_SIGBUS;
 	}
 
-	pfn = phys_to_pfn_t(phys, 0);
+	pfn = PHYS_PFN(phys);
 
 	dax_set_mapping(vmf, pfn, fault_size);
 
-	return vmf_insert_page_mkwrite(vmf, pfn_t_to_page(pfn),
+	return vmf_insert_page_mkwrite(vmf, pfn_to_page(pfn),
 					vmf->flags & FAULT_FLAG_WRITE);
 }
 
@@ -140,7 +139,7 @@ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax,
 	struct device *dev = &dev_dax->dev;
 	phys_addr_t phys;
 	pgoff_t pgoff;
-	pfn_t pfn;
+	unsigned long pfn;
 	unsigned int fault_size = PMD_SIZE;
 
 	if (check_vma(dev_dax, vmf->vma, __func__))
@@ -169,11 +168,11 @@ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax,
 		return VM_FAULT_SIGBUS;
 	}
 
-	pfn = phys_to_pfn_t(phys, 0);
+	pfn = PHYS_PFN(phys);
 
 	dax_set_mapping(vmf, pfn, fault_size);
 
-	return vmf_insert_folio_pmd(vmf, page_folio(pfn_t_to_page(pfn)),
+	return vmf_insert_folio_pmd(vmf, page_folio(pfn_to_page(pfn)),
 				vmf->flags & FAULT_FLAG_WRITE);
 }
 
@@ -185,7 +184,7 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
 	struct device *dev = &dev_dax->dev;
 	phys_addr_t phys;
 	pgoff_t pgoff;
-	pfn_t pfn;
+	unsigned long pfn;
 	unsigned int fault_size = PUD_SIZE;
 
 
@@ -215,11 +214,11 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
 		return VM_FAULT_SIGBUS;
 	}
 
-	pfn = phys_to_pfn_t(phys, 0);
+	pfn = PHYS_PFN(phys);
 
 	dax_set_mapping(vmf, pfn, fault_size);
 
-	return vmf_insert_folio_pud(vmf, page_folio(pfn_t_to_page(pfn)),
+	return vmf_insert_folio_pud(vmf, page_folio(pfn_to_page(pfn)),
 				vmf->flags & FAULT_FLAG_WRITE);
 }
 #else
diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c
index 5e7c53f18491..c18451a37e4f 100644
--- a/drivers/dax/hmem/hmem.c
+++ b/drivers/dax/hmem/hmem.c
@@ -2,7 +2,6 @@
 #include <linux/platform_device.h>
 #include <linux/memregion.h>
 #include <linux/module.h>
-#include <linux/pfn_t.h>
 #include <linux/dax.h>
 #include "../bus.h"
 
diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
index 584c70a34b52..c036e4d0b610 100644
--- a/drivers/dax/kmem.c
+++ b/drivers/dax/kmem.c
@@ -5,7 +5,6 @@
 #include <linux/memory.h>
 #include <linux/module.h>
 #include <linux/device.h>
-#include <linux/pfn_t.h>
 #include <linux/slab.h>
 #include <linux/dax.h>
 #include <linux/fs.h>
diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c
index c8ebf4e281f2..bee93066a849 100644
--- a/drivers/dax/pmem.c
+++ b/drivers/dax/pmem.c
@@ -2,7 +2,6 @@
 /* Copyright(c) 2016 - 2018 Intel Corporation. All rights reserved. */
 #include <linux/memremap.h>
 #include <linux/module.h>
-#include <linux/pfn_t.h>
 #include "../nvdimm/pfn.h"
 #include "../nvdimm/nd.h"
 #include "bus.h"
diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index e16d1d40d773..54c480e874cb 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -7,7 +7,6 @@
 #include <linux/mount.h>
 #include <linux/pseudo_fs.h>
 #include <linux/magic.h>
-#include <linux/pfn_t.h>
 #include <linux/cdev.h>
 #include <linux/slab.h>
 #include <linux/uio.h>
@@ -148,7 +147,7 @@ enum dax_device_flags {
  * pages accessible at the device relative @pgoff.
  */
 long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
-		enum dax_access_mode mode, void **kaddr, pfn_t *pfn)
+		enum dax_access_mode mode, void **kaddr, unsigned long *pfn)
 {
 	long avail;
 
diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.c b/drivers/gpu/drm/exynos/exynos_drm_gem.c
index d44401a695e2..e3fbb45f37a2 100644
--- a/drivers/gpu/drm/exynos/exynos_drm_gem.c
+++ b/drivers/gpu/drm/exynos/exynos_drm_gem.c
@@ -7,7 +7,6 @@
 
 
 #include <linux/dma-buf.h>
-#include <linux/pfn_t.h>
 #include <linux/shmem_fs.h>
 #include <linux/module.h>
 
diff --git a/drivers/gpu/drm/gma500/fbdev.c b/drivers/gpu/drm/gma500/fbdev.c
index 109efdc96ac5..68b825fc056e 100644
--- a/drivers/gpu/drm/gma500/fbdev.c
+++ b/drivers/gpu/drm/gma500/fbdev.c
@@ -6,7 +6,6 @@
  **************************************************************************/
 
 #include <linux/fb.h>
-#include <linux/pfn_t.h>
 
 #include <drm/drm_crtc_helper.h>
 #include <drm/drm_drv.h>
@@ -33,7 +32,7 @@ static vm_fault_t psb_fbdev_vm_fault(struct vm_fault *vmf)
 	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 
 	for (i = 0; i < page_num; ++i) {
-		err = vmf_insert_mixed(vma, address, __pfn_to_pfn_t(pfn, 0));
+		err = vmf_insert_mixed(vma, address, pfn);
 		if (unlikely(err & VM_FAULT_ERROR))
 			break;
 		address += PAGE_SIZE;
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
index f6d37dff320d..75f5b0e871ef 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
@@ -5,7 +5,6 @@
 
 #include <linux/anon_inodes.h>
 #include <linux/mman.h>
-#include <linux/pfn_t.h>
 #include <linux/sizes.h>
 
 #include <drm/drm_cache.h>
diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c
index 2995e80fec3b..20bf31fe799b 100644
--- a/drivers/gpu/drm/msm/msm_gem.c
+++ b/drivers/gpu/drm/msm/msm_gem.c
@@ -9,7 +9,6 @@
 #include <linux/spinlock.h>
 #include <linux/shmem_fs.h>
 #include <linux/dma-buf.h>
-#include <linux/pfn_t.h>
 
 #include <drm/drm_prime.h>
 #include <drm/drm_file.h>
diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c
index 9df05b2b7ba0..381552bfb409 100644
--- a/drivers/gpu/drm/omapdrm/omap_gem.c
+++ b/drivers/gpu/drm/omapdrm/omap_gem.c
@@ -8,7 +8,6 @@
 #include <linux/seq_file.h>
 #include <linux/shmem_fs.h>
 #include <linux/spinlock.h>
-#include <linux/pfn_t.h>
 #include <linux/vmalloc.h>
 
 #include <drm/drm_prime.h>
@@ -371,7 +370,7 @@ static vm_fault_t omap_gem_fault_1d(struct drm_gem_object *obj,
 	VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address,
 			pfn, pfn << PAGE_SHIFT);
 
-	return vmf_insert_mixed(vma, vmf->address, __pfn_to_pfn_t(pfn, 0));
+	return vmf_insert_mixed(vma, vmf->address, pfn);
 }
 
 /* Special handling for the case of faulting in 2d tiled buffers */
@@ -466,8 +465,7 @@ static vm_fault_t omap_gem_fault_2d(struct drm_gem_object *obj,
 			pfn, pfn << PAGE_SHIFT);
 
 	for (i = n; i > 0; i--) {
-		ret = vmf_insert_mixed(vma,
-			vaddr, __pfn_to_pfn_t(pfn, 0));
+		ret = vmf_insert_mixed(vma, vaddr, pfn);
 		if (ret & VM_FAULT_ERROR)
 			break;
 		pfn += priv->usergart[fmt].stride_pfn;
diff --git a/drivers/gpu/drm/v3d/v3d_bo.c b/drivers/gpu/drm/v3d/v3d_bo.c
index bb7815599435..c41476ddde68 100644
--- a/drivers/gpu/drm/v3d/v3d_bo.c
+++ b/drivers/gpu/drm/v3d/v3d_bo.c
@@ -16,7 +16,6 @@
  */
 
 #include <linux/dma-buf.h>
-#include <linux/pfn_t.h>
 #include <linux/vmalloc.h>
 
 #include "v3d_drv.h"
diff --git a/drivers/hwtracing/intel_th/msu.c b/drivers/hwtracing/intel_th/msu.c
index 7163950eb371..f3a13b300835 100644
--- a/drivers/hwtracing/intel_th/msu.c
+++ b/drivers/hwtracing/intel_th/msu.c
@@ -19,7 +19,6 @@
 #include <linux/io.h>
 #include <linux/workqueue.h>
 #include <linux/dma-mapping.h>
-#include <linux/pfn_t.h>
 
 #ifdef CONFIG_X86
 #include <asm/set_memory.h>
@@ -1618,7 +1617,7 @@ static vm_fault_t msc_mmap_fault(struct vm_fault *vmf)
 		return VM_FAULT_SIGBUS;
 
 	get_page(page);
-	return vmf_insert_mixed(vmf->vma, vmf->address, page_to_pfn_t(page));
+	return vmf_insert_mixed(vmf->vma, vmf->address, page_to_pfn(page));
 }
 
 static const struct vm_operations_struct msc_mmap_ops = {
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 15538ec58f8e..73bf290af181 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -170,7 +170,7 @@ static struct dax_device *linear_dax_pgoff(struct dm_target *ti, pgoff_t *pgoff)
 
 static long linear_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
 		long nr_pages, enum dax_access_mode mode, void **kaddr,
-		pfn_t *pfn)
+		unsigned long *pfn)
 {
 	struct dax_device *dax_dev = linear_dax_pgoff(ti, &pgoff);
 
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index d484e8e1d48a..679b07dee229 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -893,7 +893,7 @@ static struct dax_device *log_writes_dax_pgoff(struct dm_target *ti,
 
 static long log_writes_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
 		long nr_pages, enum dax_access_mode mode, void **kaddr,
-		pfn_t *pfn)
+		unsigned long *pfn)
 {
 	struct dax_device *dax_dev = log_writes_dax_pgoff(ti, &pgoff);
 
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index a7dc04bd55e5..366f46159785 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -316,7 +316,7 @@ static struct dax_device *stripe_dax_pgoff(struct dm_target *ti, pgoff_t *pgoff)
 
 static long stripe_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
 		long nr_pages, enum dax_access_mode mode, void **kaddr,
-		pfn_t *pfn)
+		unsigned long *pfn)
 {
 	struct dax_device *dax_dev = stripe_dax_pgoff(ti, &pgoff);
 
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 652627aea11b..2af5a9514c05 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -255,7 +255,7 @@ static void io_err_io_hints(struct dm_target *ti, struct queue_limits *limits)
 
 static long io_err_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
 		long nr_pages, enum dax_access_mode mode, void **kaddr,
-		pfn_t *pfn)
+		unsigned long *pfn)
 {
 	return -EIO;
 }
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index a428e1cacf07..d8de4a3076a1 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -13,7 +13,6 @@
 #include <linux/dm-io.h>
 #include <linux/dm-kcopyd.h>
 #include <linux/dax.h>
-#include <linux/pfn_t.h>
 #include <linux/libnvdimm.h>
 #include <linux/delay.h>
 #include "dm-io-tracker.h"
@@ -256,7 +255,7 @@ static int persistent_memory_claim(struct dm_writecache *wc)
 	int r;
 	loff_t s;
 	long p, da;
-	pfn_t pfn;
+	unsigned long pfn;
 	int id;
 	struct page **pages;
 	sector_t offset;
@@ -290,7 +289,7 @@ static int persistent_memory_claim(struct dm_writecache *wc)
 		r = da;
 		goto err2;
 	}
-	if (!pfn_t_has_page(pfn)) {
+	if (!pfn_valid(pfn)) {
 		wc->memory_map = NULL;
 		r = -EOPNOTSUPP;
 		goto err2;
@@ -314,13 +313,13 @@ static int persistent_memory_claim(struct dm_writecache *wc)
 				r = daa ? daa : -EINVAL;
 				goto err3;
 			}
-			if (!pfn_t_has_page(pfn)) {
+			if (!pfn_valid(pfn)) {
 				r = -EOPNOTSUPP;
 				goto err3;
 			}
 			while (daa-- && i < p) {
-				pages[i++] = pfn_t_to_page(pfn);
-				pfn.val++;
+				pages[i++] = pfn_to_page(pfn);
+				pfn++;
 				if (!(i & 15))
 					cond_resched();
 			}
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 1726f0f828cc..4b9415f718e3 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1218,7 +1218,7 @@ static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
 
 static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
 		long nr_pages, enum dax_access_mode mode, void **kaddr,
-		pfn_t *pfn)
+		unsigned long *pfn)
 {
 	struct mapped_device *md = dax_get_private(dax_dev);
 	sector_t sector = pgoff * PAGE_SECTORS;
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index aa50006b7616..05785ff21a8b 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -20,7 +20,6 @@
 #include <linux/kstrtox.h>
 #include <linux/vmalloc.h>
 #include <linux/blk-mq.h>
-#include <linux/pfn_t.h>
 #include <linux/slab.h>
 #include <linux/uio.h>
 #include <linux/dax.h>
@@ -242,7 +241,7 @@ static void pmem_submit_bio(struct bio *bio)
 /* see "strong" declaration in tools/testing/nvdimm/pmem-dax.c */
 __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
 		long nr_pages, enum dax_access_mode mode, void **kaddr,
-		pfn_t *pfn)
+		unsigned long *pfn)
 {
 	resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset;
 	sector_t sector = PFN_PHYS(pgoff) >> SECTOR_SHIFT;
@@ -254,7 +253,7 @@ __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
 	if (kaddr)
 		*kaddr = pmem->virt_addr + offset;
 	if (pfn)
-		*pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
+		*pfn = PHYS_PFN(pmem->phys_addr + offset);
 
 	if (bb->count &&
 	    badblocks_check(bb, sector, num, &first_bad, &num_bad)) {
@@ -303,7 +302,7 @@ static int pmem_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
 
 static long pmem_dax_direct_access(struct dax_device *dax_dev,
 		pgoff_t pgoff, long nr_pages, enum dax_access_mode mode,
-		void **kaddr, pfn_t *pfn)
+		void **kaddr, unsigned long *pfn)
 {
 	struct pmem_device *pmem = dax_get_private(dax_dev);
 
@@ -513,7 +512,6 @@ static int pmem_attach_disk(struct device *dev,
 
 	pmem->disk = disk;
 	pmem->pgmap.owner = pmem;
-	pmem->pfn_flags = 0;
 	if (is_nd_pfn(dev)) {
 		pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
 		pmem->pgmap.ops = &fsdax_pagemap_ops;
diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h
index 392b0b38acb9..a48509f90196 100644
--- a/drivers/nvdimm/pmem.h
+++ b/drivers/nvdimm/pmem.h
@@ -5,7 +5,6 @@
 #include <linux/badblocks.h>
 #include <linux/memremap.h>
 #include <linux/types.h>
-#include <linux/pfn_t.h>
 #include <linux/fs.h>
 
 enum dax_access_mode;
@@ -16,7 +15,6 @@ struct pmem_device {
 	phys_addr_t		phys_addr;
 	/* when non-zero this device is hosting a 'pfn' instance */
 	phys_addr_t		data_offset;
-	u64			pfn_flags;
 	void			*virt_addr;
 	/* immutable base size of the namespace */
 	size_t			size;
@@ -31,7 +29,7 @@ struct pmem_device {
 
 long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
 		long nr_pages, enum dax_access_mode mode, void **kaddr,
-		pfn_t *pfn);
+		unsigned long *pfn);
 
 #ifdef CONFIG_MEMORY_FAILURE
 static inline bool test_and_clear_pmem_poison(struct page *page)
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 249ae403f698..94fa5edecadd 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -17,7 +17,6 @@
 #include <linux/blkdev.h>
 #include <linux/completion.h>
 #include <linux/interrupt.h>
-#include <linux/pfn_t.h>
 #include <linux/uio.h>
 #include <linux/dax.h>
 #include <linux/io.h>
@@ -33,7 +32,7 @@ static void dcssblk_release(struct gendisk *disk);
 static void dcssblk_submit_bio(struct bio *bio);
 static long dcssblk_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
 		long nr_pages, enum dax_access_mode mode, void **kaddr,
-		pfn_t *pfn);
+		unsigned long *pfn);
 
 static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0";
 
@@ -914,7 +913,7 @@ fail:
 
 static long
 __dcssblk_direct_access(struct dcssblk_dev_info *dev_info, pgoff_t pgoff,
-		long nr_pages, void **kaddr, pfn_t *pfn)
+		long nr_pages, void **kaddr, unsigned long *pfn)
 {
 	resource_size_t offset = pgoff * PAGE_SIZE;
 	unsigned long dev_sz;
@@ -923,7 +922,7 @@ __dcssblk_direct_access(struct dcssblk_dev_info *dev_info, pgoff_t pgoff,
 	if (kaddr)
 		*kaddr = __va(dev_info->start + offset);
 	if (pfn)
-		*pfn = __pfn_to_pfn_t(PFN_DOWN(dev_info->start + offset), 0);
+		*pfn = PFN_DOWN(dev_info->start + offset);
 
 	return (dev_sz - offset) / PAGE_SIZE;
 }
@@ -931,7 +930,7 @@ __dcssblk_direct_access(struct dcssblk_dev_info *dev_info, pgoff_t pgoff,
 static long
 dcssblk_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
 		long nr_pages, enum dax_access_mode mode, void **kaddr,
-		pfn_t *pfn)
+		unsigned long *pfn)
 {
 	struct dcssblk_dev_info *dev_info = dax_get_private(dax_dev);
 
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 3f2ad5fb4c17..31bdb9110cc0 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -20,7 +20,6 @@
 #include <linux/mutex.h>
 #include <linux/notifier.h>
 #include <linux/pci.h>
-#include <linux/pfn_t.h>
 #include <linux/pm_runtime.h>
 #include <linux/slab.h>
 #include <linux/types.h>
@@ -1669,12 +1668,12 @@ static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf,
 		break;
 #ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
 	case PMD_ORDER:
-		ret = vmf_insert_pfn_pmd(vmf, __pfn_to_pfn_t(pfn, 0), false);
+		ret = vmf_insert_pfn_pmd(vmf, pfn, false);
 		break;
 #endif
 #ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP
 	case PUD_ORDER:
-		ret = vmf_insert_pfn_pud(vmf, __pfn_to_pfn_t(pfn, 0), false);
+		ret = vmf_insert_pfn_pud(vmf, pfn, false);
 		break;
 #endif
 	default:
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 820a664cfec7..b002e9b734f9 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -17,7 +17,6 @@
 #include <linux/fs.h>
 #include <linux/file.h>
 #include <linux/pagemap.h>
-#include <linux/pfn_t.h>
 #include <linux/ramfs.h>
 #include <linux/init.h>
 #include <linux/string.h>
@@ -412,8 +411,8 @@ static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma)
 		for (i = 0; i < pages && !ret; i++) {
 			vm_fault_t vmf;
 			unsigned long off = i * PAGE_SIZE;
-			pfn_t pfn = phys_to_pfn_t(address + off, 0);
-			vmf = vmf_insert_mixed(vma, vma->vm_start + off, pfn);
+			vmf = vmf_insert_mixed(vma, vma->vm_start + off,
+					PHYS_PFN(address + off));
 			if (vmf & VM_FAULT_ERROR)
 				ret = vm_fault_to_errno(vmf, 0);
 		}
diff --git a/fs/dax.c b/fs/dax.c
index f4ffb6982270..4229513806be 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -20,7 +20,6 @@
 #include <linux/sched/signal.h>
 #include <linux/uio.h>
 #include <linux/vmstat.h>
-#include <linux/pfn_t.h>
 #include <linux/sizes.h>
 #include <linux/mmu_notifier.h>
 #include <linux/iomap.h>
@@ -76,9 +75,9 @@ static struct folio *dax_to_folio(void *entry)
 	return page_folio(pfn_to_page(dax_to_pfn(entry)));
 }
 
-static void *dax_make_entry(pfn_t pfn, unsigned long flags)
+static void *dax_make_entry(unsigned long pfn, unsigned long flags)
 {
-	return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
+	return xa_mk_value(flags | (pfn << DAX_SHIFT));
 }
 
 static bool dax_is_locked(void *entry)
@@ -713,7 +712,7 @@ retry:
 
 		if (order > 0)
 			flags |= DAX_PMD;
-		entry = dax_make_entry(pfn_to_pfn_t(0), flags);
+		entry = dax_make_entry(0, flags);
 		dax_lock_entry(xas, entry);
 		if (xas_error(xas))
 			goto out_unlock;
@@ -1041,7 +1040,7 @@ static bool dax_fault_is_synchronous(const struct iomap_iter *iter,
  * appropriate.
  */
 static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
-		const struct iomap_iter *iter, void *entry, pfn_t pfn,
+		const struct iomap_iter *iter, void *entry, unsigned long pfn,
 		unsigned long flags)
 {
 	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
@@ -1239,7 +1238,7 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
 
 static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
-		size_t size, void **kaddr, pfn_t *pfnp)
+		size_t size, void **kaddr, unsigned long *pfnp)
 {
 	pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
 	int id, rc = 0;
@@ -1257,7 +1256,7 @@ static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
 	rc = -EINVAL;
 	if (PFN_PHYS(length) < size)
 		goto out;
-	if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1))
+	if (*pfnp & (PHYS_PFN(size)-1))
 		goto out;
 
 	rc = 0;
@@ -1361,12 +1360,12 @@ static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf,
 {
 	struct inode *inode = iter->inode;
 	unsigned long vaddr = vmf->address;
-	pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
+	unsigned long pfn = my_zero_pfn(vaddr);
 	vm_fault_t ret;
 
 	*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE);
 
-	ret = vmf_insert_page_mkwrite(vmf, pfn_t_to_page(pfn), false);
+	ret = vmf_insert_page_mkwrite(vmf, pfn_to_page(pfn), false);
 	trace_dax_load_hole(inode, vmf, ret);
 	return ret;
 }
@@ -1383,14 +1382,14 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
 	struct folio *zero_folio;
 	spinlock_t *ptl;
 	pmd_t pmd_entry;
-	pfn_t pfn;
+	unsigned long pfn;
 
 	zero_folio = mm_get_huge_zero_folio(vmf->vma->vm_mm);
 
 	if (unlikely(!zero_folio))
 		goto fallback;
 
-	pfn = page_to_pfn_t(&zero_folio->page);
+	pfn = page_to_pfn(&zero_folio->page);
 	*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn,
 				  DAX_PMD | DAX_ZERO_PAGE);
 
@@ -1779,7 +1778,8 @@ static vm_fault_t dax_fault_return(int error)
  * insertion for now and return the pfn so that caller can insert it after the
  * fsync is done.
  */
-static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn)
+static vm_fault_t dax_fault_synchronous_pfnp(unsigned long *pfnp,
+					unsigned long pfn)
 {
 	if (WARN_ON_ONCE(!pfnp))
 		return VM_FAULT_SIGBUS;
@@ -1827,7 +1827,7 @@ static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf,
  * @pmd:	distinguish whether it is a pmd fault
  */
 static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
-		const struct iomap_iter *iter, pfn_t *pfnp,
+		const struct iomap_iter *iter, unsigned long *pfnp,
 		struct xa_state *xas, void **entry, bool pmd)
 {
 	const struct iomap *iomap = &iter->iomap;
@@ -1838,7 +1838,7 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
 	unsigned long entry_flags = pmd ? DAX_PMD : 0;
 	struct folio *folio;
 	int ret, err = 0;
-	pfn_t pfn;
+	unsigned long pfn;
 	void *kaddr;
 
 	if (!pmd && vmf->cow_page)
@@ -1875,16 +1875,15 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
 
 	folio_ref_inc(folio);
 	if (pmd)
-		ret = vmf_insert_folio_pmd(vmf, pfn_folio(pfn_t_to_pfn(pfn)),
-					write);
+		ret = vmf_insert_folio_pmd(vmf, pfn_folio(pfn), write);
 	else
-		ret = vmf_insert_page_mkwrite(vmf, pfn_t_to_page(pfn), write);
+		ret = vmf_insert_page_mkwrite(vmf, pfn_to_page(pfn), write);
 	folio_put(folio);
 
 	return ret;
 }
 
-static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
+static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, unsigned long *pfnp,
 			       int *iomap_errp, const struct iomap_ops *ops)
 {
 	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
@@ -1996,7 +1995,7 @@ static bool dax_fault_check_fallback(struct vm_fault *vmf, struct xa_state *xas,
 	return false;
 }
 
-static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
+static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, unsigned long *pfnp,
 			       const struct iomap_ops *ops)
 {
 	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
@@ -2077,7 +2076,7 @@ out:
 	return ret;
 }
 #else
-static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
+static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, unsigned long *pfnp,
 			       const struct iomap_ops *ops)
 {
 	return VM_FAULT_FALLBACK;
@@ -2098,7 +2097,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
  * successfully.
  */
 vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order,
-		    pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops)
+			unsigned long *pfnp, int *iomap_errp,
+			const struct iomap_ops *ops)
 {
 	if (order == 0)
 		return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops);
@@ -2118,8 +2118,8 @@ EXPORT_SYMBOL_GPL(dax_iomap_fault);
  * This function inserts a writeable PTE or PMD entry into the page tables
  * for an mmaped DAX file.  It also marks the page cache entry as dirty.
  */
-static vm_fault_t
-dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
+static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf,
+					unsigned long pfn, unsigned int order)
 {
 	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
 	XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
@@ -2141,7 +2141,7 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
 	xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
 	dax_lock_entry(&xas, entry);
 	xas_unlock_irq(&xas);
-	folio = pfn_folio(pfn_t_to_pfn(pfn));
+	folio = pfn_folio(pfn);
 	folio_ref_inc(folio);
 	if (order == 0)
 		ret = vmf_insert_page_mkwrite(vmf, &folio->page, true);
@@ -2168,7 +2168,7 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
  * table entry.
  */
 vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order,
-		pfn_t pfn)
+		unsigned long pfn)
 {
 	int err;
 	loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 21df81347147..e6e962982319 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -747,7 +747,7 @@ static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, unsigned int order)
 	bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
 		(vmf->vma->vm_flags & VM_SHARED);
 	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
-	pfn_t pfn;
+	unsigned long pfn;
 
 	if (write) {
 		sb_start_pagefault(sb);
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index 0502bf3cdf6a..ac6d4c1064cc 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -10,7 +10,6 @@
 #include <linux/dax.h>
 #include <linux/uio.h>
 #include <linux/pagemap.h>
-#include <linux/pfn_t.h>
 #include <linux/iomap.h>
 #include <linux/interval_tree.h>
 
@@ -757,7 +756,7 @@ static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf, unsigned int order,
 	vm_fault_t ret;
 	struct inode *inode = file_inode(vmf->vma->vm_file);
 	struct super_block *sb = inode->i_sb;
-	pfn_t pfn;
+	unsigned long pfn;
 	int error = 0;
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_conn_dax *fcd = fc->dax;
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 53c2626e90e7..aac914b2cd50 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -9,7 +9,6 @@
 #include <linux/pci.h>
 #include <linux/interrupt.h>
 #include <linux/group_cpus.h>
-#include <linux/pfn_t.h>
 #include <linux/memremap.h>
 #include <linux/module.h>
 #include <linux/virtio.h>
@@ -1008,7 +1007,7 @@ static void virtio_fs_cleanup_vqs(struct virtio_device *vdev)
  */
 static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
 				    long nr_pages, enum dax_access_mode mode,
-				    void **kaddr, pfn_t *pfn)
+				    void **kaddr, unsigned long *pfn)
 {
 	struct virtio_fs *fs = dax_get_private(dax_dev);
 	phys_addr_t offset = PFN_PHYS(pgoff);
@@ -1017,7 +1016,7 @@ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
 	if (kaddr)
 		*kaddr = fs->window_kaddr + offset;
 	if (pfn)
-		*pfn = phys_to_pfn_t(fs->window_phys_addr + offset, 0);
+		*pfn = PHYS_PFN(fs->window_phys_addr + offset);
 	return nr_pages > max_nr_pages ? max_nr_pages : nr_pages;
 }
 
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 0b41b18debf3..314a9d9dd7db 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1730,7 +1730,7 @@ xfs_dax_fault_locked(
 	bool			write_fault)
 {
 	vm_fault_t		ret;
-	pfn_t			pfn;
+	unsigned long		pfn;
 
 	if (!IS_ENABLED(CONFIG_FS_DAX)) {
 		ASSERT(0);
diff --git a/include/linux/dax.h b/include/linux/dax.h
index dcc9fcdf14e4..29eec755430b 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -26,7 +26,7 @@ struct dax_operations {
 	 * number of pages available for DAX at that pfn.
 	 */
 	long (*direct_access)(struct dax_device *, pgoff_t, long,
-			enum dax_access_mode, void **, pfn_t *);
+			enum dax_access_mode, void **, unsigned long *);
 	/* zero_page_range: required operation. Zero page range   */
 	int (*zero_page_range)(struct dax_device *, pgoff_t, size_t);
 	/*
@@ -241,7 +241,7 @@ static inline void dax_break_layout_final(struct inode *inode)
 bool dax_alive(struct dax_device *dax_dev);
 void *dax_get_private(struct dax_device *dax_dev);
 long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
-		enum dax_access_mode mode, void **kaddr, pfn_t *pfn);
+		enum dax_access_mode mode, void **kaddr, unsigned long *pfn);
 size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
 		size_t bytes, struct iov_iter *i);
 size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
@@ -255,9 +255,10 @@ void dax_flush(struct dax_device *dax_dev, void *addr, size_t size);
 ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops);
 vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order,
-		    pfn_t *pfnp, int *errp, const struct iomap_ops *ops);
+			unsigned long *pfnp, int *errp,
+			const struct iomap_ops *ops);
 vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
-		unsigned int order, pfn_t pfn);
+		unsigned int order, unsigned long pfn);
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
 void dax_delete_mapping_range(struct address_space *mapping,
 				loff_t start, loff_t end);
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index cb95951547ab..84fdc3a6a19a 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -156,7 +156,7 @@ typedef int (*dm_busy_fn) (struct dm_target *ti);
  */
 typedef long (*dm_dax_direct_access_fn) (struct dm_target *ti, pgoff_t pgoff,
 		long nr_pages, enum dax_access_mode node, void **kaddr,
-		pfn_t *pfn);
+		unsigned long *pfn);
 typedef int (*dm_dax_zero_page_range_fn)(struct dm_target *ti, pgoff_t pgoff,
 		size_t nr_pages);
 
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 26607f2c65fb..8f1b15213f61 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -37,8 +37,10 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		    pmd_t *pmd, unsigned long addr, pgprot_t newprot,
 		    unsigned long cp_flags);
 
-vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write);
-vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write);
+vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, unsigned long pfn,
+			      bool write);
+vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, unsigned long pfn,
+			      bool write);
 vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio,
 				bool write);
 vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4d833f159988..dccebf0abf06 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3522,9 +3522,9 @@ vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
 			unsigned long pfn, pgprot_t pgprot);
 vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
-			pfn_t pfn);
+			unsigned long pfn);
 vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
-		unsigned long addr, pfn_t pfn);
+		unsigned long addr, unsigned long pfn);
 int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);
 
 static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma,
diff --git a/include/linux/pfn.h b/include/linux/pfn.h
index 14bc053c53d8..b90ca0b6c331 100644
--- a/include/linux/pfn.h
+++ b/include/linux/pfn.h
@@ -4,15 +4,6 @@
 
 #ifndef __ASSEMBLY__
 #include <linux/types.h>
-
-/*
- * pfn_t: encapsulates a page-frame number that is optionally backed
- * by memmap (struct page).  Whether a pfn_t has a 'struct page'
- * backing is indicated by flags in the high bits of the value.
- */
-typedef struct {
-	u64 val;
-} pfn_t;
 #endif
 
 #define PFN_ALIGN(x)	(((unsigned long)(x) + (PAGE_SIZE - 1)) & PAGE_MASK)
diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h
deleted file mode 100644
index 2c002936a0f6..000000000000
--- a/include/linux/pfn_t.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_PFN_T_H_
-#define _LINUX_PFN_T_H_
-#include <linux/mm.h>
-
-/*
- * PFN_FLAGS_MASK - mask of all the possible valid pfn_t flags
- */
-#define PFN_FLAGS_MASK (((u64) (~PAGE_MASK)) << (BITS_PER_LONG_LONG - PAGE_SHIFT))
-
-#define PFN_FLAGS_TRACE \
-	{ }
-
-static inline pfn_t __pfn_to_pfn_t(unsigned long pfn, u64 flags)
-{
-	pfn_t pfn_t = { .val = pfn | (flags & PFN_FLAGS_MASK), };
-
-	return pfn_t;
-}
-
-/* a default pfn to pfn_t conversion assumes that @pfn is pfn_valid() */
-static inline pfn_t pfn_to_pfn_t(unsigned long pfn)
-{
-	return __pfn_to_pfn_t(pfn, 0);
-}
-
-static inline pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags)
-{
-	return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags);
-}
-
-static inline bool pfn_t_has_page(pfn_t pfn)
-{
-	return true;
-}
-
-static inline unsigned long pfn_t_to_pfn(pfn_t pfn)
-{
-	return pfn.val & ~PFN_FLAGS_MASK;
-}
-
-static inline struct page *pfn_t_to_page(pfn_t pfn)
-{
-	if (pfn_t_has_page(pfn))
-		return pfn_to_page(pfn_t_to_pfn(pfn));
-	return NULL;
-}
-
-static inline phys_addr_t pfn_t_to_phys(pfn_t pfn)
-{
-	return PFN_PHYS(pfn_t_to_pfn(pfn));
-}
-
-static inline pfn_t page_to_pfn_t(struct page *page)
-{
-	return pfn_to_pfn_t(page_to_pfn(page));
-}
-
-static inline int pfn_t_valid(pfn_t pfn)
-{
-	return pfn_valid(pfn_t_to_pfn(pfn));
-}
-
-#ifdef CONFIG_MMU
-static inline pte_t pfn_t_pte(pfn_t pfn, pgprot_t pgprot)
-{
-	return pfn_pte(pfn_t_to_pfn(pfn), pgprot);
-}
-#endif
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static inline pmd_t pfn_t_pmd(pfn_t pfn, pgprot_t pgprot)
-{
-	return pfn_pmd(pfn_t_to_pfn(pfn), pgprot);
-}
-
-#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
-static inline pud_t pfn_t_pud(pfn_t pfn, pgprot_t pgprot)
-{
-	return pfn_pud(pfn_t_to_pfn(pfn), pgprot);
-}
-#endif
-#endif
-
-#endif /* _LINUX_PFN_T_H_ */
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index d84d0c49012f..bd8f9317b025 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -20,7 +20,6 @@
 #include <linux/mman.h>
 #include <linux/mm_types.h>
 #include <linux/module.h>
-#include <linux/pfn_t.h>
 #include <linux/printk.h>
 #include <linux/pgtable.h>
 #include <linux/random.h>
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index cf808b2eea29..ce130225a8e5 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -22,7 +22,6 @@
 #include <linux/mm_types.h>
 #include <linux/khugepaged.h>
 #include <linux/freezer.h>
-#include <linux/pfn_t.h>
 #include <linux/mman.h>
 #include <linux/memremap.h>
 #include <linux/pagemap.h>
@@ -1375,7 +1374,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 struct folio_or_pfn {
 	union {
 		struct folio *folio;
-		pfn_t pfn;
+		unsigned long pfn;
 	};
 	bool is_folio;
 };
@@ -1391,7 +1390,7 @@ static int insert_pmd(struct vm_area_struct *vma, unsigned long addr,
 
 	if (!pmd_none(*pmd)) {
 		const unsigned long pfn = fop.is_folio ? folio_pfn(fop.folio) :
-					  pfn_t_to_pfn(fop.pfn);
+					  fop.pfn;
 
 		if (write) {
 			if (pmd_pfn(*pmd) != pfn) {
@@ -1414,7 +1413,7 @@ static int insert_pmd(struct vm_area_struct *vma, unsigned long addr,
 		folio_add_file_rmap_pmd(fop.folio, &fop.folio->page, vma);
 		add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PMD_NR);
 	} else {
-		entry = pmd_mkhuge(pfn_t_pmd(fop.pfn, prot));
+		entry = pmd_mkhuge(pfn_pmd(fop.pfn, prot));
 		entry = pmd_mkspecial(entry);
 	}
 	if (write) {
@@ -1442,7 +1441,8 @@ static int insert_pmd(struct vm_area_struct *vma, unsigned long addr,
  *
  * Return: vm_fault_t value.
  */
-vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
+vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, unsigned long pfn,
+			      bool write)
 {
 	unsigned long addr = vmf->address & PMD_MASK;
 	struct vm_area_struct *vma = vmf->vma;
@@ -1473,7 +1473,7 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
 			return VM_FAULT_OOM;
 	}
 
-	pfnmap_setup_cachemode_pfn(pfn_t_to_pfn(pfn), &pgprot);
+	pfnmap_setup_cachemode_pfn(pfn, &pgprot);
 
 	ptl = pmd_lock(vma->vm_mm, vmf->pmd);
 	error = insert_pmd(vma, addr, vmf->pmd, fop, pgprot, write,
@@ -1539,7 +1539,7 @@ static void insert_pud(struct vm_area_struct *vma, unsigned long addr,
 
 	if (!pud_none(*pud)) {
 		const unsigned long pfn = fop.is_folio ? folio_pfn(fop.folio) :
-					  pfn_t_to_pfn(fop.pfn);
+					  fop.pfn;
 
 		if (write) {
 			if (WARN_ON_ONCE(pud_pfn(*pud) != pfn))
@@ -1559,7 +1559,7 @@ static void insert_pud(struct vm_area_struct *vma, unsigned long addr,
 		folio_add_file_rmap_pud(fop.folio, &fop.folio->page, vma);
 		add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PUD_NR);
 	} else {
-		entry = pud_mkhuge(pfn_t_pud(fop.pfn, prot));
+		entry = pud_mkhuge(pfn_pud(fop.pfn, prot));
 		entry = pud_mkspecial(entry);
 	}
 	if (write) {
@@ -1580,7 +1580,8 @@ static void insert_pud(struct vm_area_struct *vma, unsigned long addr,
  *
  * Return: vm_fault_t value.
  */
-vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
+vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, unsigned long pfn,
+			      bool write)
 {
 	unsigned long addr = vmf->address & PUD_MASK;
 	struct vm_area_struct *vma = vmf->vma;
@@ -1603,7 +1604,7 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
 	if (addr < vma->vm_start || addr >= vma->vm_end)
 		return VM_FAULT_SIGBUS;
 
-	pfnmap_setup_cachemode_pfn(pfn_t_to_pfn(pfn), &pgprot);
+	pfnmap_setup_cachemode_pfn(pfn, &pgprot);
 
 	ptl = pud_lock(vma->vm_mm, vmf->pud);
 	insert_pud(vma, addr, vmf->pud, fop, pgprot, write);
diff --git a/mm/memory.c b/mm/memory.c
index e932a007af4c..0f9b32a20e5b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -57,7 +57,6 @@
 #include <linux/export.h>
 #include <linux/delayacct.h>
 #include <linux/init.h>
-#include <linux/pfn_t.h>
 #include <linux/writeback.h>
 #include <linux/memcontrol.h>
 #include <linux/mmu_notifier.h>
@@ -2435,7 +2434,7 @@ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
 EXPORT_SYMBOL(vm_map_pages_zero);
 
 static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
-			pfn_t pfn, pgprot_t prot, bool mkwrite)
+			unsigned long pfn, pgprot_t prot, bool mkwrite)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	pte_t *pte, entry;
@@ -2457,7 +2456,7 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 			 * allocation and mapping invalidation so just skip the
 			 * update.
 			 */
-			if (pte_pfn(entry) != pfn_t_to_pfn(pfn)) {
+			if (pte_pfn(entry) != pfn) {
 				WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry)));
 				goto out_unlock;
 			}
@@ -2470,7 +2469,7 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 	}
 
 	/* Ok, finally just insert the thing.. */
-	entry = pte_mkspecial(pfn_t_pte(pfn, prot));
+	entry = pte_mkspecial(pfn_pte(pfn, prot));
 
 	if (mkwrite) {
 		entry = pte_mkyoung(entry);
@@ -2541,8 +2540,7 @@ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
 
 	pfnmap_setup_cachemode_pfn(pfn, &pgprot);
 
-	return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, 0), pgprot,
-			false);
+	return insert_pfn(vma, addr, pfn, pgprot, false);
 }
 EXPORT_SYMBOL(vmf_insert_pfn_prot);
 
@@ -2573,21 +2571,22 @@ vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 }
 EXPORT_SYMBOL(vmf_insert_pfn);
 
-static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn, bool mkwrite)
+static bool vm_mixed_ok(struct vm_area_struct *vma, unsigned long pfn,
+			bool mkwrite)
 {
-	if (unlikely(is_zero_pfn(pfn_t_to_pfn(pfn))) &&
+	if (unlikely(is_zero_pfn(pfn)) &&
 	    (mkwrite || !vm_mixed_zeropage_allowed(vma)))
 		return false;
 	/* these checks mirror the abort conditions in vm_normal_page */
 	if (vma->vm_flags & VM_MIXEDMAP)
 		return true;
-	if (is_zero_pfn(pfn_t_to_pfn(pfn)))
+	if (is_zero_pfn(pfn))
 		return true;
 	return false;
 }
 
 static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
-		unsigned long addr, pfn_t pfn, bool mkwrite)
+		unsigned long addr, unsigned long pfn, bool mkwrite)
 {
 	pgprot_t pgprot = vma->vm_page_prot;
 	int err;
@@ -2598,9 +2597,9 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
 	if (addr < vma->vm_start || addr >= vma->vm_end)
 		return VM_FAULT_SIGBUS;
 
-	pfnmap_setup_cachemode_pfn(pfn_t_to_pfn(pfn), &pgprot);
+	pfnmap_setup_cachemode_pfn(pfn, &pgprot);
 
-	if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
+	if (!pfn_modify_allowed(pfn, pgprot))
 		return VM_FAULT_SIGBUS;
 
 	/*
@@ -2610,7 +2609,7 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
 	 * than insert_pfn).  If a zero_pfn were inserted into a VM_MIXEDMAP
 	 * without pte special, it would there be refcounted as a normal page.
 	 */
-	if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pfn_t_valid(pfn)) {
+	if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pfn_valid(pfn)) {
 		struct page *page;
 
 		/*
@@ -2618,7 +2617,7 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
 		 * regardless of whether the caller specified flags that
 		 * result in pfn_t_has_page() == false.
 		 */
-		page = pfn_to_page(pfn_t_to_pfn(pfn));
+		page = pfn_to_page(pfn);
 		err = insert_page(vma, addr, page, pgprot, mkwrite);
 	} else {
 		return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
@@ -2653,7 +2652,7 @@ vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page,
 EXPORT_SYMBOL_GPL(vmf_insert_page_mkwrite);
 
 vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
-		pfn_t pfn)
+		unsigned long pfn)
 {
 	return __vm_insert_mixed(vma, addr, pfn, false);
 }
@@ -2665,7 +2664,7 @@ EXPORT_SYMBOL(vmf_insert_mixed);
  *  the same entry was actually inserted.
  */
 vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
-		unsigned long addr, pfn_t pfn)
+		unsigned long addr, unsigned long pfn)
 {
 	return __vm_insert_mixed(vma, addr, pfn, true);
 }
diff --git a/mm/memremap.c b/mm/memremap.c
index c17e0a69cace..044a4550671a 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -5,7 +5,6 @@
 #include <linux/kasan.h>
 #include <linux/memory_hotplug.h>
 #include <linux/memremap.h>
-#include <linux/pfn_t.h>
 #include <linux/swap.h>
 #include <linux/mm.h>
 #include <linux/mmzone.h>
diff --git a/mm/migrate.c b/mm/migrate.c
index 8cf0f9c9599d..ea8c74d99659 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -35,7 +35,6 @@
 #include <linux/compat.h>
 #include <linux/hugetlb.h>
 #include <linux/gfp.h>
-#include <linux/pfn_t.h>
 #include <linux/page_idle.h>
 #include <linux/page_owner.h>
 #include <linux/sched/mm.h>
diff --git a/tools/testing/nvdimm/pmem-dax.c b/tools/testing/nvdimm/pmem-dax.c
index c1ec099a3b1d..05e763aab104 100644
--- a/tools/testing/nvdimm/pmem-dax.c
+++ b/tools/testing/nvdimm/pmem-dax.c
@@ -10,7 +10,7 @@
 
 long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
 		long nr_pages, enum dax_access_mode mode, void **kaddr,
-		pfn_t *pfn)
+		unsigned long *pfn)
 {
 	resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset;
 
@@ -29,7 +29,7 @@ long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
 			*kaddr = pmem->virt_addr + offset;
 		page = vmalloc_to_page(pmem->virt_addr + offset);
 		if (pfn)
-			*pfn = page_to_pfn_t(page);
+			*pfn = page_to_pfn(page);
 		pr_debug_ratelimited("%s: pmem: %p pgoff: %#lx pfn: %#lx\n",
 				__func__, pmem, pgoff, page_to_pfn(page));
 
@@ -39,7 +39,7 @@ long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
 	if (kaddr)
 		*kaddr = pmem->virt_addr + offset;
 	if (pfn)
-		*pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
+		*pfn = PHYS_PFN(pmem->phys_addr + offset);
 
 	/*
 	 * If badblocks are present, limit known good range to the
diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c
index ddceb04b4a9a..f7e7bfe9bb85 100644
--- a/tools/testing/nvdimm/test/iomap.c
+++ b/tools/testing/nvdimm/test/iomap.c
@@ -8,7 +8,6 @@
 #include <linux/ioport.h>
 #include <linux/module.h>
 #include <linux/types.h>
-#include <linux/pfn_t.h>
 #include <linux/acpi.h>
 #include <linux/io.h>
 #include <linux/mm.h>
@@ -135,12 +134,6 @@ void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 }
 EXPORT_SYMBOL_GPL(__wrap_devm_memremap_pages);
 
-pfn_t __wrap_phys_to_pfn_t(phys_addr_t addr, unsigned long flags)
-{
-        return phys_to_pfn_t(addr, flags);
-}
-EXPORT_SYMBOL(__wrap_phys_to_pfn_t);
-
 void *__wrap_memremap(resource_size_t offset, size_t size,
 		unsigned long flags)
 {
diff --git a/tools/testing/nvdimm/test/nfit_test.h b/tools/testing/nvdimm/test/nfit_test.h
index b00583d1eace..b9047fb8ea4a 100644
--- a/tools/testing/nvdimm/test/nfit_test.h
+++ b/tools/testing/nvdimm/test/nfit_test.h
@@ -212,7 +212,6 @@ void __iomem *__wrap_devm_ioremap(struct device *dev,
 void *__wrap_devm_memremap(struct device *dev, resource_size_t offset,
 		size_t size, unsigned long flags);
 void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
-pfn_t __wrap_phys_to_pfn_t(phys_addr_t addr, unsigned long flags);
 void *__wrap_memremap(resource_size_t offset, size_t size,
 		unsigned long flags);
 void __wrap_devm_memunmap(struct device *dev, void *addr);
-- 
cgit v1.2.3


From 50b4233a22b1ee9ccd0e847597de66ce21329ddb Mon Sep 17 00:00:00 2001
From: Julian Vetter <julian@outer-limits.org>
Date: Tue, 3 Jun 2025 15:21:21 +0200
Subject: include/linux/jhash.h: replace __get_unaligned_cpu32 in jhash
 function

__get_unaligned_cpu32() is deprecated.  So, replace it with the more
generic get_unaligned() and just cast the input parameter.

Link: https://lkml.kernel.org/r/20250603132121.3674066-1-julian@outer-limits.org
Signed-off-by: Julian Vetter <julian@outer-limits.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Wei-Hsin Yeh <weihsinyeh168@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/jhash.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/jhash.h b/include/linux/jhash.h
index fa26a2dd3b52..7c1c1821c694 100644
--- a/include/linux/jhash.h
+++ b/include/linux/jhash.h
@@ -24,7 +24,7 @@
  * Jozsef
  */
 #include <linux/bitops.h>
-#include <linux/unaligned/packed_struct.h>
+#include <linux/unaligned.h>
 
 /* Best hash sizes are of power of two */
 #define jhash_size(n)   ((u32)1<<(n))
@@ -77,9 +77,9 @@ static inline u32 jhash(const void *key, u32 length, u32 initval)
 
 	/* All but the last block: affect some 32 bits of (a,b,c) */
 	while (length > 12) {
-		a += __get_unaligned_cpu32(k);
-		b += __get_unaligned_cpu32(k + 4);
-		c += __get_unaligned_cpu32(k + 8);
+		a += get_unaligned((u32 *)k);
+		b += get_unaligned((u32 *)(k + 4));
+		c += get_unaligned((u32 *)(k + 8));
 		__jhash_mix(a, b, c);
 		length -= 12;
 		k += 12;
-- 
cgit v1.2.3


From 2489e958129ff7cbf26a34ee33cdc9ccbd68fe3c Mon Sep 17 00:00:00 2001
From: Jason Xing <kernelxing@tencent.com>
Date: Thu, 12 Jun 2025 14:11:57 +0800
Subject: relayfs: abolish prev_padding

Patch series "relayfs: misc changes", v5.

The series mostly focuses on the error counters, which help every user
debug their own kernel module.


This patch (of 5):

prev_padding represents the unused space of a certain subbuffer.  If the
content of a relay_write() call exceeds the space remaining in this
subbuffer, relay skips the remaining space and records the start point as
buf->prev_padding in relay_switch_subbuf().  Since the buf is a big
per-cpu buffer, keeping prev_padding as a single value for the whole
buffer, rather than per subbuffer (whose padding info is stored in
buf->padding[]), seems meaningless for real use cases, so we don't bother
to record it any more.

Link: https://lkml.kernel.org/r/20250612061201.34272-1-kerneljasonxing@gmail.com
Link: https://lkml.kernel.org/r/20250612061201.34272-2-kerneljasonxing@gmail.com
Signed-off-by: Jason Xing <kernelxing@tencent.com>
Reviewed-by: Yushan Zhou <katrinzhou@tencent.com>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/gpu/drm/i915/gt/uc/intel_guc_log.c |  3 +--
 drivers/net/wwan/iosm/iosm_ipc_trace.c     |  3 +--
 drivers/net/wwan/t7xx/t7xx_port_trace.c    |  2 +-
 include/linux/relay.h                      |  5 +----
 kernel/relay.c                             | 14 ++++++++------
 kernel/trace/blktrace.c                    |  2 +-
 6 files changed, 13 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
index e8a04e476c57..09a64f224c49 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
@@ -220,8 +220,7 @@ static int guc_action_control_log(struct intel_guc *guc, bool enable,
  */
 static int subbuf_start_callback(struct rchan_buf *buf,
 				 void *subbuf,
-				 void *prev_subbuf,
-				 size_t prev_padding)
+				 void *prev_subbuf)
 {
 	/*
 	 * Use no-overwrite mode by default, where relay will stop accepting
diff --git a/drivers/net/wwan/iosm/iosm_ipc_trace.c b/drivers/net/wwan/iosm/iosm_ipc_trace.c
index eeecfa3d10c5..9656254c1c6c 100644
--- a/drivers/net/wwan/iosm/iosm_ipc_trace.c
+++ b/drivers/net/wwan/iosm/iosm_ipc_trace.c
@@ -51,8 +51,7 @@ static int ipc_trace_remove_buf_file_handler(struct dentry *dentry)
 }
 
 static int ipc_trace_subbuf_start_handler(struct rchan_buf *buf, void *subbuf,
-					  void *prev_subbuf,
-					  size_t prev_padding)
+					  void *prev_subbuf)
 {
 	if (relay_buf_full(buf)) {
 		pr_err_ratelimited("Relay_buf full dropping traces");
diff --git a/drivers/net/wwan/t7xx/t7xx_port_trace.c b/drivers/net/wwan/t7xx/t7xx_port_trace.c
index 4ed8b4e29bf1..f16d3b01302c 100644
--- a/drivers/net/wwan/t7xx/t7xx_port_trace.c
+++ b/drivers/net/wwan/t7xx/t7xx_port_trace.c
@@ -33,7 +33,7 @@ static int t7xx_trace_remove_buf_file_handler(struct dentry *dentry)
 }
 
 static int t7xx_trace_subbuf_start_handler(struct rchan_buf *buf, void *subbuf,
-					   void *prev_subbuf, size_t prev_padding)
+					   void *prev_subbuf)
 {
 	if (relay_buf_full(buf)) {
 		pr_err_ratelimited("Relay_buf full dropping traces");
diff --git a/include/linux/relay.h b/include/linux/relay.h
index b3224111d074..e10a0fdf4325 100644
--- a/include/linux/relay.h
+++ b/include/linux/relay.h
@@ -47,7 +47,6 @@ struct rchan_buf
 	unsigned int page_count;	/* number of current buffer pages */
 	unsigned int finalized;		/* buffer has been finalized */
 	size_t *padding;		/* padding counts per sub-buffer */
-	size_t prev_padding;		/* temporary variable */
 	size_t bytes_consumed;		/* bytes consumed in cur read subbuf */
 	size_t early_bytes;		/* bytes consumed before VFS inited */
 	unsigned int cpu;		/* this buf's cpu */
@@ -84,7 +83,6 @@ struct rchan_callbacks
 	 * @buf: the channel buffer containing the new sub-buffer
 	 * @subbuf: the start of the new sub-buffer
 	 * @prev_subbuf: the start of the previous sub-buffer
-	 * @prev_padding: unused space at the end of previous sub-buffer
 	 *
 	 * The client should return 1 to continue logging, 0 to stop
 	 * logging.
@@ -100,8 +98,7 @@ struct rchan_callbacks
 	 */
 	int (*subbuf_start) (struct rchan_buf *buf,
 			     void *subbuf,
-			     void *prev_subbuf,
-			     size_t prev_padding);
+			     void *prev_subbuf);
 
 	/*
 	 * create_buf_file - create file to represent a relay channel buffer
diff --git a/kernel/relay.c b/kernel/relay.c
index 3ee5b038d0d9..fc6ad76b789d 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -249,13 +249,13 @@ EXPORT_SYMBOL_GPL(relay_buf_full);
  */
 
 static int relay_subbuf_start(struct rchan_buf *buf, void *subbuf,
-			      void *prev_subbuf, size_t prev_padding)
+			      void *prev_subbuf)
 {
 	if (!buf->chan->cb->subbuf_start)
 		return !relay_buf_full(buf);
 
 	return buf->chan->cb->subbuf_start(buf, subbuf,
-					   prev_subbuf, prev_padding);
+					   prev_subbuf);
 }
 
 /**
@@ -301,7 +301,7 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)
 	for (i = 0; i < buf->chan->n_subbufs; i++)
 		buf->padding[i] = 0;
 
-	relay_subbuf_start(buf, buf->data, NULL, 0);
+	relay_subbuf_start(buf, buf->data, NULL);
 }
 
 /**
@@ -554,9 +554,11 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
 		goto toobig;
 
 	if (buf->offset != buf->chan->subbuf_size + 1) {
-		buf->prev_padding = buf->chan->subbuf_size - buf->offset;
+		size_t prev_padding;
+
+		prev_padding = buf->chan->subbuf_size - buf->offset;
 		old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
-		buf->padding[old_subbuf] = buf->prev_padding;
+		buf->padding[old_subbuf] = prev_padding;
 		buf->subbufs_produced++;
 		if (buf->dentry)
 			d_inode(buf->dentry)->i_size +=
@@ -581,7 +583,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
 	new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
 	new = buf->start + new_subbuf * buf->chan->subbuf_size;
 	buf->offset = 0;
-	if (!relay_subbuf_start(buf, new, old, buf->prev_padding)) {
+	if (!relay_subbuf_start(buf, new, old)) {
 		buf->offset = buf->chan->subbuf_size + 1;
 		return 0;
 	}
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 3f6a7bdc6edf..d3083c88474e 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -461,7 +461,7 @@ static const struct file_operations blk_msg_fops = {
  * the user space app in telling how many lost events there were.
  */
 static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
-				     void *prev_subbuf, size_t prev_padding)
+				     void *prev_subbuf)
 {
 	struct blk_trace *bt;
 
-- 
cgit v1.2.3


From ca01a90ae7bf9bb22137e719366bdc0f387675c2 Mon Sep 17 00:00:00 2001
From: Jason Xing <kernelxing@tencent.com>
Date: Thu, 12 Jun 2025 14:11:58 +0800
Subject: relayfs: support a counter tracking if per-cpu buffers is full

When using the relay mechanism, we often encounter the case where new data
are lost or old unconsumed data are overwritten because of a slow reader.

Add 'full' field in per-cpu buffer structure to detect if the above case
is happening.  Relay has two modes: 1) non-overwrite mode, 2) overwrite
mode.  So buffer being full here respectively means: 1) relayfs doesn't
intend to accept new data and then simply drop them, or 2) relayfs is
going to start over again and overwrite old unread data with new data.

Note: for performance reasons, this counter is not protected by any
explicit lock against concurrent modification by different threads.
Writers calling __relay_write/relay_write are expected to provide their
own locking and to perform the write under that lock, so it is not
necessary to add a new small lock here.

Link: https://lkml.kernel.org/r/20250612061201.34272-3-kerneljasonxing@gmail.com
Signed-off-by: Jason Xing <kernelxing@tencent.com>
Reviewed-by: Yushan Zhou <katrinzhou@tencent.com>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/relay.h | 9 +++++++++
 kernel/relay.c        | 8 +++++++-
 2 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/relay.h b/include/linux/relay.h
index e10a0fdf4325..cd77eb285a48 100644
--- a/include/linux/relay.h
+++ b/include/linux/relay.h
@@ -28,6 +28,14 @@
  */
 #define RELAYFS_CHANNEL_VERSION		7
 
+/*
+ * Relay buffer statistics
+ */
+struct rchan_buf_stats
+{
+	unsigned int full_count;	/* counter for buffer full */
+};
+
 /*
  * Per-cpu relay channel buffer
  */
@@ -43,6 +51,7 @@ struct rchan_buf
 	struct irq_work wakeup_work;	/* reader wakeup */
 	struct dentry *dentry;		/* channel file dentry */
 	struct kref kref;		/* channel buffer refcount */
+	struct rchan_buf_stats stats;	/* buffer stats */
 	struct page **page_array;	/* array of current buffer pages */
 	unsigned int page_count;	/* number of current buffer pages */
 	unsigned int finalized;		/* buffer has been finalized */
diff --git a/kernel/relay.c b/kernel/relay.c
index fc6ad76b789d..4b07efddc2cf 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -251,8 +251,13 @@ EXPORT_SYMBOL_GPL(relay_buf_full);
 static int relay_subbuf_start(struct rchan_buf *buf, void *subbuf,
 			      void *prev_subbuf)
 {
+	int full = relay_buf_full(buf);
+
+	if (full)
+		buf->stats.full_count++;
+
 	if (!buf->chan->cb->subbuf_start)
-		return !relay_buf_full(buf);
+		return !full;
 
 	return buf->chan->cb->subbuf_start(buf, subbuf,
 					   prev_subbuf);
@@ -297,6 +302,7 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)
 	buf->finalized = 0;
 	buf->data = buf->start;
 	buf->offset = 0;
+	buf->stats.full_count = 0;
 
 	for (i = 0; i < buf->chan->n_subbufs; i++)
 		buf->padding[i] = 0;
-- 
cgit v1.2.3


From a53202ce7fbafd24f854865b02eff891e246c550 Mon Sep 17 00:00:00 2001
From: Jason Xing <kernelxing@tencent.com>
Date: Thu, 12 Jun 2025 14:11:59 +0800
Subject: relayfs: introduce getting relayfs statistics function

In this version, only getting the buffer-full counter is supported, and
the framework of how it works is implemented.

Users can pass a flag to select which field/statistic they want to
fetch.  Each call returns only one result, so do not pass multiple
flags.

Link: https://lkml.kernel.org/r/20250612061201.34272-4-kerneljasonxing@gmail.com
Signed-off-by: Jason Xing <kernelxing@tencent.com>
Reviewed-by: Yushan Zhou <katrinzhou@tencent.com>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/relay.h |  7 +++++++
 kernel/relay.c        | 30 ++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+)

(limited to 'include')

diff --git a/include/linux/relay.h b/include/linux/relay.h
index cd77eb285a48..5310967f9d74 100644
--- a/include/linux/relay.h
+++ b/include/linux/relay.h
@@ -31,6 +31,12 @@
 /*
  * Relay buffer statistics
  */
+enum {
+	RELAY_STATS_BUF_FULL = (1 << 0),
+
+	RELAY_STATS_LAST = RELAY_STATS_BUF_FULL,
+};
+
 struct rchan_buf_stats
 {
 	unsigned int full_count;	/* counter for buffer full */
@@ -167,6 +173,7 @@ struct rchan *relay_open(const char *base_filename,
 			 void *private_data);
 extern void relay_close(struct rchan *chan);
 extern void relay_flush(struct rchan *chan);
+size_t relay_stats(struct rchan *chan, int flags);
 extern void relay_subbufs_consumed(struct rchan *chan,
 				   unsigned int cpu,
 				   size_t consumed);
diff --git a/kernel/relay.c b/kernel/relay.c
index 4b07efddc2cf..2fc27c0e771e 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -700,6 +700,36 @@ void relay_flush(struct rchan *chan)
 }
 EXPORT_SYMBOL_GPL(relay_flush);
 
+/**
+ *	relay_stats - get channel buffer statistics
+ *	@chan: the channel
+ *	@flags: select particular information to get
+ *
+ *	Returns the count of certain field that caller specifies.
+ */
+size_t relay_stats(struct rchan *chan, int flags)
+{
+	unsigned int i, count = 0;
+	struct rchan_buf *rbuf;
+
+	if (!chan || flags > RELAY_STATS_LAST)
+		return 0;
+
+	if (chan->is_global) {
+		rbuf = *per_cpu_ptr(chan->buf, 0);
+		if (flags & RELAY_STATS_BUF_FULL)
+			count = rbuf->stats.full_count;
+	} else {
+		for_each_online_cpu(i) {
+			rbuf = *per_cpu_ptr(chan->buf, i);
+			if (rbuf && flags & RELAY_STATS_BUF_FULL)
+				count += rbuf->stats.full_count;
+		}
+	}
+
+	return count;
+}
+
 /**
  *	relay_file_open - open file op for relay files
  *	@inode: the inode
-- 
cgit v1.2.3


From 19f3cb64a25b80db667a00182785577fae465b3e Mon Sep 17 00:00:00 2001
From: Jason Xing <kernelxing@tencent.com>
Date: Thu, 12 Jun 2025 14:12:01 +0800
Subject: relayfs: support a counter tracking if data is too big to write

It really doesn't matter if the user/admin knows what the last too big
value is.  Recording how many times this case is triggered would be
helpful.

Solve the existing issue where relay_reset() doesn't restore the value.

Store the counter in the per-cpu buffer structure instead of the global
buffer structure.  It also solves the race condition which is likely to
happen when several per-cpu buffers encounter the too big data case and
then access the global field last_toobig without lock protection.

Remove the printk in relay_close() since kernel module can directly call
relay_stats() as they want.

Link: https://lkml.kernel.org/r/20250612061201.34272-6-kerneljasonxing@gmail.com
Signed-off-by: Jason Xing <kernelxing@tencent.com>
Reviewed-by: Yushan Zhou <katrinzhou@tencent.com>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/relay.h |  5 +++--
 kernel/relay.c        | 18 ++++++++++--------
 2 files changed, 13 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/relay.h b/include/linux/relay.h
index 5310967f9d74..6772a7075840 100644
--- a/include/linux/relay.h
+++ b/include/linux/relay.h
@@ -33,13 +33,15 @@
  */
 enum {
 	RELAY_STATS_BUF_FULL = (1 << 0),
+	RELAY_STATS_WRT_BIG = (1 << 1),
 
-	RELAY_STATS_LAST = RELAY_STATS_BUF_FULL,
+	RELAY_STATS_LAST = RELAY_STATS_WRT_BIG,
 };
 
 struct rchan_buf_stats
 {
 	unsigned int full_count;	/* counter for buffer full */
+	unsigned int big_count;		/* counter for too big to write */
 };
 
 /*
@@ -79,7 +81,6 @@ struct rchan
 	const struct rchan_callbacks *cb; /* client callbacks */
 	struct kref kref;		/* channel refcount */
 	void *private_data;		/* for user-defined data */
-	size_t last_toobig;		/* tried to log event > subbuf size */
 	struct rchan_buf * __percpu *buf; /* per-cpu channel buffers */
 	int is_global;			/* One global buffer ? */
 	struct list_head list;		/* for channel list */
diff --git a/kernel/relay.c b/kernel/relay.c
index 2fc27c0e771e..8d915fe98198 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -303,6 +303,7 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)
 	buf->data = buf->start;
 	buf->offset = 0;
 	buf->stats.full_count = 0;
+	buf->stats.big_count = 0;
 
 	for (i = 0; i < buf->chan->n_subbufs; i++)
 		buf->padding[i] = 0;
@@ -602,7 +603,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
 	return length;
 
 toobig:
-	buf->chan->last_toobig = length;
+	buf->stats.big_count++;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(relay_switch_subbuf);
@@ -662,11 +663,6 @@ void relay_close(struct rchan *chan)
 			if ((buf = *per_cpu_ptr(chan->buf, i)))
 				relay_close_buf(buf);
 
-	if (chan->last_toobig)
-		printk(KERN_WARNING "relay: one or more items not logged "
-		       "[item size (%zd) > sub-buffer size (%zd)]\n",
-		       chan->last_toobig, chan->subbuf_size);
-
 	list_del(&chan->list);
 	kref_put(&chan->kref, relay_destroy_channel);
 	mutex_unlock(&relay_channels_mutex);
@@ -719,11 +715,17 @@ size_t relay_stats(struct rchan *chan, int flags)
 		rbuf = *per_cpu_ptr(chan->buf, 0);
 		if (flags & RELAY_STATS_BUF_FULL)
 			count = rbuf->stats.full_count;
+		else if (flags & RELAY_STATS_WRT_BIG)
+			count = rbuf->stats.big_count;
 	} else {
 		for_each_online_cpu(i) {
 			rbuf = *per_cpu_ptr(chan->buf, i);
-			if (rbuf && flags & RELAY_STATS_BUF_FULL)
-				count += rbuf->stats.full_count;
+			if (rbuf) {
+				if (flags & RELAY_STATS_BUF_FULL)
+					count += rbuf->stats.full_count;
+				else if (flags & RELAY_STATS_WRT_BIG)
+					count += rbuf->stats.big_count;
+			}
 		}
 	}
 
-- 
cgit v1.2.3


From 1857fcc847443b0238cb64584b43d8c3a9049a0a Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 17 Mar 2025 17:02:28 +0800
Subject: lib/raid6: replace custom zero page with ZERO_PAGE

Use the system-wide zero page instead of a custom zero page.

[herbert@gondor.apana.org.au: update lib/raid6/recov_rvv.c, per Klara]
  Link: https://lkml.kernel.org/r/aFkUnXWtxcgOTVkw@gondor.apana.org.au
Link: https://lkml.kernel.org/r/Z9flJNkWQICx0PXk@gondor.apana.org.au
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Song Liu <song@kernel.org>
Cc: Yu Kuai <yukuai3@huawei.com>
Cc: Klara Modin <klarasmodin@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 crypto/async_tx/async_pq.c          |  2 +-
 crypto/async_tx/async_raid6_recov.c |  4 ++--
 include/linux/raid/pq.h             | 12 +++++++++++-
 lib/raid6/algos.c                   |  3 ---
 lib/raid6/recov.c                   |  6 +++---
 lib/raid6/recov_avx2.c              |  6 +++---
 lib/raid6/recov_avx512.c            |  6 +++---
 lib/raid6/recov_loongarch_simd.c    | 12 ++++++------
 lib/raid6/recov_neon.c              |  6 +++---
 lib/raid6/recov_s390xc.c            |  6 +++---
 lib/raid6/recov_ssse3.c             |  6 +++---
 11 files changed, 38 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/crypto/async_tx/async_pq.c b/crypto/async_tx/async_pq.c
index 5e2b2680d7db..9e4bb7fbde25 100644
--- a/crypto/async_tx/async_pq.c
+++ b/crypto/async_tx/async_pq.c
@@ -119,7 +119,7 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int *offsets, int disks,
 	for (i = 0; i < disks; i++) {
 		if (blocks[i] == NULL) {
 			BUG_ON(i > disks - 3); /* P or Q can't be zero */
-			srcs[i] = (void*)raid6_empty_zero_page;
+			srcs[i] = raid6_get_zero_page();
 		} else {
 			srcs[i] = page_address(blocks[i]) + offsets[i];
 
diff --git a/crypto/async_tx/async_raid6_recov.c b/crypto/async_tx/async_raid6_recov.c
index 354b8cd5537f..539ea5b378dc 100644
--- a/crypto/async_tx/async_raid6_recov.c
+++ b/crypto/async_tx/async_raid6_recov.c
@@ -414,7 +414,7 @@ async_raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
 		async_tx_quiesce(&submit->depend_tx);
 		for (i = 0; i < disks; i++)
 			if (blocks[i] == NULL)
-				ptrs[i] = (void *) raid6_empty_zero_page;
+				ptrs[i] = raid6_get_zero_page();
 			else
 				ptrs[i] = page_address(blocks[i]) + offs[i];
 
@@ -497,7 +497,7 @@ async_raid6_datap_recov(int disks, size_t bytes, int faila,
 		async_tx_quiesce(&submit->depend_tx);
 		for (i = 0; i < disks; i++)
 			if (blocks[i] == NULL)
-				ptrs[i] = (void*)raid6_empty_zero_page;
+				ptrs[i] = raid6_get_zero_page();
 			else
 				ptrs[i] = page_address(blocks[i]) + offs[i];
 
diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index 72ff44cca864..2467b3be15c9 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -11,8 +11,13 @@
 #ifdef __KERNEL__
 
 #include <linux/blkdev.h>
+#include <linux/mm.h>
 
-extern const char raid6_empty_zero_page[PAGE_SIZE];
+/* This should be const but the raid6 code is too convoluted for that. */
+static inline void *raid6_get_zero_page(void)
+{
+	return page_address(ZERO_PAGE(0));
+}
 
 #else /* ! __KERNEL__ */
 /* Used for testing in user space */
@@ -191,6 +196,11 @@ static inline uint32_t raid6_jiffies(void)
 	return tv.tv_sec*1000 + tv.tv_usec/1000;
 }
 
+static inline void *raid6_get_zero_page(void)
+{
+	return raid6_empty_zero_page;
+}
+
 #endif /* ! __KERNEL__ */
 
 #endif /* LINUX_RAID_RAID6_H */
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index 75ce3e134b7c..799e0e5eac26 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -18,9 +18,6 @@
 #else
 #include <linux/module.h>
 #include <linux/gfp.h>
-/* In .bss so it's zeroed */
-const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
-EXPORT_SYMBOL(raid6_empty_zero_page);
 #endif
 
 struct raid6_calls raid6_call;
diff --git a/lib/raid6/recov.c b/lib/raid6/recov.c
index a7c1b2bbe40d..b5e47c008b41 100644
--- a/lib/raid6/recov.c
+++ b/lib/raid6/recov.c
@@ -31,10 +31,10 @@ static void raid6_2data_recov_intx1(int disks, size_t bytes, int faila,
 	   Use the dead data pages as temporary storage for
 	   delta p and delta q */
 	dp = (u8 *)ptrs[faila];
-	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[faila] = raid6_get_zero_page();
 	ptrs[disks-2] = dp;
 	dq = (u8 *)ptrs[failb];
-	ptrs[failb] = (void *)raid6_empty_zero_page;
+	ptrs[failb] = raid6_get_zero_page();
 	ptrs[disks-1] = dq;
 
 	raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -72,7 +72,7 @@ static void raid6_datap_recov_intx1(int disks, size_t bytes, int faila,
 	/* Compute syndrome with zero for the missing data page
 	   Use the dead data page as temporary storage for delta q */
 	dq = (u8 *)ptrs[faila];
-	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[faila] = raid6_get_zero_page();
 	ptrs[disks-1] = dq;
 
 	raid6_call.gen_syndrome(disks, bytes, ptrs);
diff --git a/lib/raid6/recov_avx2.c b/lib/raid6/recov_avx2.c
index 4e8095403ee2..97d598d2535c 100644
--- a/lib/raid6/recov_avx2.c
+++ b/lib/raid6/recov_avx2.c
@@ -28,10 +28,10 @@ static void raid6_2data_recov_avx2(int disks, size_t bytes, int faila,
 	   Use the dead data pages as temporary storage for
 	   delta p and delta q */
 	dp = (u8 *)ptrs[faila];
-	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[faila] = raid6_get_zero_page();
 	ptrs[disks-2] = dp;
 	dq = (u8 *)ptrs[failb];
-	ptrs[failb] = (void *)raid6_empty_zero_page;
+	ptrs[failb] = raid6_get_zero_page();
 	ptrs[disks-1] = dq;
 
 	raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -196,7 +196,7 @@ static void raid6_datap_recov_avx2(int disks, size_t bytes, int faila,
 	/* Compute syndrome with zero for the missing data page
 	   Use the dead data page as temporary storage for delta q */
 	dq = (u8 *)ptrs[faila];
-	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[faila] = raid6_get_zero_page();
 	ptrs[disks-1] = dq;
 
 	raid6_call.gen_syndrome(disks, bytes, ptrs);
diff --git a/lib/raid6/recov_avx512.c b/lib/raid6/recov_avx512.c
index 310c715db313..7986120ca444 100644
--- a/lib/raid6/recov_avx512.c
+++ b/lib/raid6/recov_avx512.c
@@ -37,10 +37,10 @@ static void raid6_2data_recov_avx512(int disks, size_t bytes, int faila,
 	 */
 
 	dp = (u8 *)ptrs[faila];
-	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[faila] = raid6_get_zero_page();
 	ptrs[disks-2] = dp;
 	dq = (u8 *)ptrs[failb];
-	ptrs[failb] = (void *)raid6_empty_zero_page;
+	ptrs[failb] = raid6_get_zero_page();
 	ptrs[disks-1] = dq;
 
 	raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -238,7 +238,7 @@ static void raid6_datap_recov_avx512(int disks, size_t bytes, int faila,
 	 */
 
 	dq = (u8 *)ptrs[faila];
-	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[faila] = raid6_get_zero_page();
 	ptrs[disks-1] = dq;
 
 	raid6_call.gen_syndrome(disks, bytes, ptrs);
diff --git a/lib/raid6/recov_loongarch_simd.c b/lib/raid6/recov_loongarch_simd.c
index 94aeac85e6f7..93dc515997a1 100644
--- a/lib/raid6/recov_loongarch_simd.c
+++ b/lib/raid6/recov_loongarch_simd.c
@@ -42,10 +42,10 @@ static void raid6_2data_recov_lsx(int disks, size_t bytes, int faila,
 	 * delta p and delta q
 	 */
 	dp = (u8 *)ptrs[faila];
-	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[faila] = raid6_get_zero_page();
 	ptrs[disks - 2] = dp;
 	dq = (u8 *)ptrs[failb];
-	ptrs[failb] = (void *)raid6_empty_zero_page;
+	ptrs[failb] = raid6_get_zero_page();
 	ptrs[disks - 1] = dq;
 
 	raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -197,7 +197,7 @@ static void raid6_datap_recov_lsx(int disks, size_t bytes, int faila,
 	 * Use the dead data page as temporary storage for delta q
 	 */
 	dq = (u8 *)ptrs[faila];
-	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[faila] = raid6_get_zero_page();
 	ptrs[disks - 1] = dq;
 
 	raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -316,10 +316,10 @@ static void raid6_2data_recov_lasx(int disks, size_t bytes, int faila,
 	 * delta p and delta q
 	 */
 	dp = (u8 *)ptrs[faila];
-	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[faila] = raid6_get_zero_page();
 	ptrs[disks - 2] = dp;
 	dq = (u8 *)ptrs[failb];
-	ptrs[failb] = (void *)raid6_empty_zero_page;
+	ptrs[failb] = raid6_get_zero_page();
 	ptrs[disks - 1] = dq;
 
 	raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -436,7 +436,7 @@ static void raid6_datap_recov_lasx(int disks, size_t bytes, int faila,
 	 * Use the dead data page as temporary storage for delta q
 	 */
 	dq = (u8 *)ptrs[faila];
-	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[faila] = raid6_get_zero_page();
 	ptrs[disks - 1] = dq;
 
 	raid6_call.gen_syndrome(disks, bytes, ptrs);
diff --git a/lib/raid6/recov_neon.c b/lib/raid6/recov_neon.c
index 1bfc14174d4d..70e1404c1512 100644
--- a/lib/raid6/recov_neon.c
+++ b/lib/raid6/recov_neon.c
@@ -36,10 +36,10 @@ static void raid6_2data_recov_neon(int disks, size_t bytes, int faila,
 	 * delta p and delta q
 	 */
 	dp = (u8 *)ptrs[faila];
-	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[faila] = raid6_get_zero_page();
 	ptrs[disks - 2] = dp;
 	dq = (u8 *)ptrs[failb];
-	ptrs[failb] = (void *)raid6_empty_zero_page;
+	ptrs[failb] = raid6_get_zero_page();
 	ptrs[disks - 1] = dq;
 
 	raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -74,7 +74,7 @@ static void raid6_datap_recov_neon(int disks, size_t bytes, int faila,
 	 * Use the dead data page as temporary storage for delta q
 	 */
 	dq = (u8 *)ptrs[faila];
-	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[faila] = raid6_get_zero_page();
 	ptrs[disks - 1] = dq;
 
 	raid6_call.gen_syndrome(disks, bytes, ptrs);
diff --git a/lib/raid6/recov_s390xc.c b/lib/raid6/recov_s390xc.c
index 179eec900cea..1d32c01261be 100644
--- a/lib/raid6/recov_s390xc.c
+++ b/lib/raid6/recov_s390xc.c
@@ -35,10 +35,10 @@ static void raid6_2data_recov_s390xc(int disks, size_t bytes, int faila,
 	   Use the dead data pages as temporary storage for
 	   delta p and delta q */
 	dp = (u8 *)ptrs[faila];
-	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[faila] = raid6_get_zero_page();
 	ptrs[disks-2] = dp;
 	dq = (u8 *)ptrs[failb];
-	ptrs[failb] = (void *)raid6_empty_zero_page;
+	ptrs[failb] = raid6_get_zero_page();
 	ptrs[disks-1] = dq;
 
 	raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -82,7 +82,7 @@ static void raid6_datap_recov_s390xc(int disks, size_t bytes, int faila,
 	/* Compute syndrome with zero for the missing data page
 	   Use the dead data page as temporary storage for delta q */
 	dq = (u8 *)ptrs[faila];
-	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[faila] = raid6_get_zero_page();
 	ptrs[disks-1] = dq;
 
 	raid6_call.gen_syndrome(disks, bytes, ptrs);
diff --git a/lib/raid6/recov_ssse3.c b/lib/raid6/recov_ssse3.c
index 4bfa3c6b60de..2e849185c32b 100644
--- a/lib/raid6/recov_ssse3.c
+++ b/lib/raid6/recov_ssse3.c
@@ -30,10 +30,10 @@ static void raid6_2data_recov_ssse3(int disks, size_t bytes, int faila,
 	   Use the dead data pages as temporary storage for
 	   delta p and delta q */
 	dp = (u8 *)ptrs[faila];
-	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[faila] = raid6_get_zero_page();
 	ptrs[disks-2] = dp;
 	dq = (u8 *)ptrs[failb];
-	ptrs[failb] = (void *)raid6_empty_zero_page;
+	ptrs[failb] = raid6_get_zero_page();
 	ptrs[disks-1] = dq;
 
 	raid6_call.gen_syndrome(disks, bytes, ptrs);
@@ -203,7 +203,7 @@ static void raid6_datap_recov_ssse3(int disks, size_t bytes, int faila,
 	/* Compute syndrome with zero for the missing data page
 	   Use the dead data page as temporary storage for delta q */
 	dq = (u8 *)ptrs[faila];
-	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[faila] = raid6_get_zero_page();
 	ptrs[disks-1] = dq;
 
 	raid6_call.gen_syndrome(disks, bytes, ptrs);
-- 
cgit v1.2.3


From 76fdb7eb4e1c91086ce9c3db6972c2ed48c96afb Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Tue, 8 Jul 2025 23:21:51 +1000
Subject: uapi: export PROCFS_ROOT_INO

The root inode of /proc having a fixed inode number has been part of the
core kernel ABI since its inception, and recently some userspace
programs (mainly container runtimes) have started to explicitly depend
on this behaviour.

The main reason this is useful to userspace is that by checking that a
suspect /proc handle has fstype PROC_SUPER_MAGIC and is PROCFS_ROOT_INO,
they can then use openat2(RESOLVE_{NO_{XDEV,MAGICLINKS},BENEATH}) to
ensure that there isn't a bind-mount that replaces some procfs file with
a different one. This kind of attack has led to security issues in
container runtimes in the past (such as CVE-2019-19921) and libraries
like libpathrs[1] use this feature of procfs to provide safe procfs
handling functions.

There was also some trailing whitespace in the "struct proc_dir_entry"
initialiser, so fix that up as well.

[1]: https://github.com/openSUSE/libpathrs

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
Link: https://lore.kernel.org/20250708-uapi-procfs-root-ino-v1-1-6ae61e97c79b@cyphar.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/proc/root.c          | 10 +++++-----
 include/linux/proc_ns.h |  1 -
 include/uapi/linux/fs.h | 11 +++++++++++
 3 files changed, 16 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/fs/proc/root.c b/fs/proc/root.c
index 06a297a27ba3..ed86ac710384 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -363,12 +363,12 @@ static const struct inode_operations proc_root_inode_operations = {
  * This is the root "inode" in the /proc tree..
  */
 struct proc_dir_entry proc_root = {
-	.low_ino	= PROC_ROOT_INO, 
-	.namelen	= 5, 
-	.mode		= S_IFDIR | S_IRUGO | S_IXUGO, 
-	.nlink		= 2, 
+	.low_ino	= PROCFS_ROOT_INO,
+	.namelen	= 5,
+	.mode		= S_IFDIR | S_IRUGO | S_IXUGO,
+	.nlink		= 2,
 	.refcnt		= REFCOUNT_INIT(1),
-	.proc_iops	= &proc_root_inode_operations, 
+	.proc_iops	= &proc_root_inode_operations,
 	.proc_dir_ops	= &proc_root_operations,
 	.parent		= &proc_root,
 	.subdir		= RB_ROOT,
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index 6258455e49a4..4b20375f3783 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -40,7 +40,6 @@ extern const struct proc_ns_operations timens_for_children_operations;
  * We always define these enumerators
  */
 enum {
-	PROC_ROOT_INO		= 1,
 	PROC_IPC_INIT_INO	= IPC_NS_INIT_INO,
 	PROC_UTS_INIT_INO	= UTS_NS_INIT_INO,
 	PROC_USER_INIT_INO	= USER_NS_INIT_INO,
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 0098b0ce8ccb..28238a3edbc1 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -60,6 +60,17 @@
 #define RENAME_EXCHANGE		(1 << 1)	/* Exchange source and dest */
 #define RENAME_WHITEOUT		(1 << 2)	/* Whiteout source */
 
+/*
+ * The root inode of procfs is guaranteed to always have the same inode number.
+ * For programs that make heavy use of procfs, verifying that the root is a
+ * real procfs root and using openat2(RESOLVE_{NO_{XDEV,MAGICLINKS},BENEATH})
+ * will allow you to make sure you are never tricked into operating on the
+ * wrong procfs file.
+ */
+enum procfs_ino {
+	PROCFS_ROOT_INO = 1,
+};
+
 struct file_clone_range {
 	__s64 src_fd;
 	__u64 src_offset;
-- 
cgit v1.2.3


From 626bb0a45584d544d84eab909795ccb355062bcc Mon Sep 17 00:00:00 2001
From: Michael Walle <mwalle@kernel.org>
Date: Fri, 13 Jun 2025 13:45:12 +0200
Subject: mfd: tps6594: Add TI TPS652G1 support

The TPS652G1 is a stripped down version of the TPS65224. From a software
point of view, it lacks any voltage monitoring, the watchdog, the ESM
and the ADC.

Signed-off-by: Michael Walle <mwalle@kernel.org>
Link: https://lore.kernel.org/r/20250613114518.1772109-2-mwalle@kernel.org
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/mfd/tps6594-core.c  | 88 +++++++++++++++++++++++++++++++++++++++++----
 drivers/mfd/tps6594-i2c.c   | 10 ++++--
 drivers/mfd/tps6594-spi.c   | 10 ++++--
 include/linux/mfd/tps6594.h |  1 +
 4 files changed, 99 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/mfd/tps6594-core.c b/drivers/mfd/tps6594-core.c
index a7223e873cd1..c16c37e36617 100644
--- a/drivers/mfd/tps6594-core.c
+++ b/drivers/mfd/tps6594-core.c
@@ -1,6 +1,11 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * Core functions for TI TPS65224/TPS6594/TPS6593/LP8764 PMICs
+ * Core functions for following TI PMICs:
+ *  - LP8764
+ *  - TPS65224
+ *  - TPS652G1
+ *  - TPS6593
+ *  - TPS6594
  *
  * Copyright (C) 2023 BayLibre Incorporated - https://www.baylibre.com/
  */
@@ -414,6 +419,61 @@ static const unsigned int tps65224_irq_reg[] = {
 	TPS6594_REG_INT_FSM_ERR,
 };
 
+/* TPS652G1 Resources */
+
+static const struct mfd_cell tps652g1_common_cells[] = {
+	MFD_CELL_RES("tps6594-pfsm", tps65224_pfsm_resources),
+	MFD_CELL_RES("tps6594-pinctrl", tps65224_pinctrl_resources),
+	MFD_CELL_NAME("tps6594-regulator"),
+};
+
+static const struct regmap_irq tps652g1_irqs[] = {
+	/* INT_GPIO register */
+	REGMAP_IRQ_REG(TPS65224_IRQ_GPIO1, 2, TPS65224_BIT_GPIO1_INT),
+	REGMAP_IRQ_REG(TPS65224_IRQ_GPIO2, 2, TPS65224_BIT_GPIO2_INT),
+	REGMAP_IRQ_REG(TPS65224_IRQ_GPIO3, 2, TPS65224_BIT_GPIO3_INT),
+	REGMAP_IRQ_REG(TPS65224_IRQ_GPIO4, 2, TPS65224_BIT_GPIO4_INT),
+	REGMAP_IRQ_REG(TPS65224_IRQ_GPIO5, 2, TPS65224_BIT_GPIO5_INT),
+	REGMAP_IRQ_REG(TPS65224_IRQ_GPIO6, 2, TPS65224_BIT_GPIO6_INT),
+
+	/* INT_STARTUP register */
+	REGMAP_IRQ_REG(TPS65224_IRQ_VSENSE, 3, TPS65224_BIT_VSENSE_INT),
+	REGMAP_IRQ_REG(TPS65224_IRQ_ENABLE, 3, TPS6594_BIT_ENABLE_INT),
+	REGMAP_IRQ_REG(TPS65224_IRQ_PB_SHORT, 3, TPS65224_BIT_PB_SHORT_INT),
+	REGMAP_IRQ_REG(TPS65224_IRQ_FSD, 3, TPS6594_BIT_FSD_INT),
+	REGMAP_IRQ_REG(TPS65224_IRQ_SOFT_REBOOT, 3, TPS6594_BIT_SOFT_REBOOT_INT),
+
+	/* INT_MISC register */
+	REGMAP_IRQ_REG(TPS65224_IRQ_BIST_PASS, 4, TPS6594_BIT_BIST_PASS_INT),
+	REGMAP_IRQ_REG(TPS65224_IRQ_EXT_CLK, 4, TPS6594_BIT_EXT_CLK_INT),
+	REGMAP_IRQ_REG(TPS65224_IRQ_REG_UNLOCK, 4, TPS65224_BIT_REG_UNLOCK_INT),
+	REGMAP_IRQ_REG(TPS65224_IRQ_TWARN, 4, TPS6594_BIT_TWARN_INT),
+	REGMAP_IRQ_REG(TPS65224_IRQ_PB_LONG, 4, TPS65224_BIT_PB_LONG_INT),
+	REGMAP_IRQ_REG(TPS65224_IRQ_PB_FALL, 4, TPS65224_BIT_PB_FALL_INT),
+	REGMAP_IRQ_REG(TPS65224_IRQ_PB_RISE, 4, TPS65224_BIT_PB_RISE_INT),
+	REGMAP_IRQ_REG(TPS65224_IRQ_ADC_CONV_READY, 4, TPS65224_BIT_ADC_CONV_READY_INT),
+
+	/* INT_MODERATE_ERR register */
+	REGMAP_IRQ_REG(TPS65224_IRQ_TSD_ORD, 5, TPS6594_BIT_TSD_ORD_INT),
+	REGMAP_IRQ_REG(TPS65224_IRQ_BIST_FAIL, 5, TPS6594_BIT_BIST_FAIL_INT),
+	REGMAP_IRQ_REG(TPS65224_IRQ_REG_CRC_ERR, 5, TPS6594_BIT_REG_CRC_ERR_INT),
+	REGMAP_IRQ_REG(TPS65224_IRQ_RECOV_CNT, 5, TPS6594_BIT_RECOV_CNT_INT),
+
+	/* INT_SEVERE_ERR register */
+	REGMAP_IRQ_REG(TPS65224_IRQ_TSD_IMM, 6, TPS6594_BIT_TSD_IMM_INT),
+	REGMAP_IRQ_REG(TPS65224_IRQ_VCCA_OVP, 6, TPS6594_BIT_VCCA_OVP_INT),
+	REGMAP_IRQ_REG(TPS65224_IRQ_PFSM_ERR, 6, TPS6594_BIT_PFSM_ERR_INT),
+	REGMAP_IRQ_REG(TPS65224_IRQ_BG_XMON, 6, TPS65224_BIT_BG_XMON_INT),
+
+	/* INT_FSM_ERR register */
+	REGMAP_IRQ_REG(TPS65224_IRQ_IMM_SHUTDOWN, 7, TPS6594_BIT_IMM_SHUTDOWN_INT),
+	REGMAP_IRQ_REG(TPS65224_IRQ_ORD_SHUTDOWN, 7, TPS6594_BIT_ORD_SHUTDOWN_INT),
+	REGMAP_IRQ_REG(TPS65224_IRQ_MCU_PWR_ERR, 7, TPS6594_BIT_MCU_PWR_ERR_INT),
+	REGMAP_IRQ_REG(TPS65224_IRQ_SOC_PWR_ERR, 7, TPS6594_BIT_SOC_PWR_ERR_INT),
+	REGMAP_IRQ_REG(TPS65224_IRQ_COMM_ERR, 7, TPS6594_BIT_COMM_ERR_INT),
+	REGMAP_IRQ_REG(TPS65224_IRQ_I2C2_ERR, 7, TPS65224_BIT_I2C2_ERR_INT),
+};
+
 static inline unsigned int tps6594_get_irq_reg(struct regmap_irq_chip_data *data,
 					       unsigned int base, int index)
 {
@@ -443,7 +503,7 @@ static int tps6594_handle_post_irq(void *irq_drv_data)
 	 * a new interrupt.
 	 */
 	if (tps->use_crc) {
-		if (tps->chip_id == TPS65224) {
+		if (tps->chip_id == TPS65224 || tps->chip_id == TPS652G1) {
 			regmap_reg = TPS6594_REG_INT_FSM_ERR;
 			mask_val = TPS6594_BIT_COMM_ERR_INT;
 		} else {
@@ -481,6 +541,18 @@ static struct regmap_irq_chip tps65224_irq_chip = {
 	.handle_post_irq = tps6594_handle_post_irq,
 };
 
+static struct regmap_irq_chip tps652g1_irq_chip = {
+	.ack_base = TPS6594_REG_INT_BUCK,
+	.ack_invert = 1,
+	.clear_ack = 1,
+	.init_ack_masked = 1,
+	.num_regs = ARRAY_SIZE(tps65224_irq_reg),
+	.irqs = tps652g1_irqs,
+	.num_irqs = ARRAY_SIZE(tps652g1_irqs),
+	.get_irq_reg = tps65224_get_irq_reg,
+	.handle_post_irq = tps6594_handle_post_irq,
+};
+
 static const struct regmap_range tps6594_volatile_ranges[] = {
 	regmap_reg_range(TPS6594_REG_INT_TOP, TPS6594_REG_STAT_READBACK_ERR),
 	regmap_reg_range(TPS6594_REG_RTC_STATUS, TPS6594_REG_RTC_STATUS),
@@ -507,7 +579,7 @@ static int tps6594_check_crc_mode(struct tps6594 *tps, bool primary_pmic)
 	int ret;
 	unsigned int regmap_reg, mask_val;
 
-	if (tps->chip_id == TPS65224) {
+	if (tps->chip_id == TPS65224 || tps->chip_id == TPS652G1) {
 		regmap_reg = TPS6594_REG_CONFIG_2;
 		mask_val = TPS65224_BIT_I2C1_SPI_CRC_EN;
 	} else {
@@ -537,7 +609,7 @@ static int tps6594_set_crc_feature(struct tps6594 *tps)
 	int ret;
 	unsigned int regmap_reg, mask_val;
 
-	if (tps->chip_id == TPS65224) {
+	if (tps->chip_id == TPS65224 || tps->chip_id == TPS652G1) {
 		regmap_reg = TPS6594_REG_CONFIG_2;
 		mask_val = TPS65224_BIT_I2C1_SPI_CRC_EN;
 	} else {
@@ -628,6 +700,10 @@ int tps6594_device_init(struct tps6594 *tps, bool enable_crc)
 		irq_chip = &tps65224_irq_chip;
 		n_cells = ARRAY_SIZE(tps65224_common_cells);
 		cells = tps65224_common_cells;
+	} else if (tps->chip_id == TPS652G1) {
+		irq_chip = &tps652g1_irq_chip;
+		n_cells = ARRAY_SIZE(tps652g1_common_cells);
+		cells = tps652g1_common_cells;
 	} else {
 		irq_chip = &tps6594_irq_chip;
 		n_cells = ARRAY_SIZE(tps6594_common_cells);
@@ -651,8 +727,8 @@ int tps6594_device_init(struct tps6594 *tps, bool enable_crc)
 	if (ret)
 		return dev_err_probe(dev, ret, "Failed to add common child devices\n");
 
-	/* No RTC for LP8764 and TPS65224 */
-	if (tps->chip_id != LP8764 && tps->chip_id != TPS65224) {
+	/* No RTC for LP8764, TPS65224 and TPS652G1 */
+	if (tps->chip_id != LP8764 && tps->chip_id != TPS65224 && tps->chip_id != TPS652G1) {
 		ret = devm_mfd_add_devices(dev, PLATFORM_DEVID_AUTO, tps6594_rtc_cells,
 					   ARRAY_SIZE(tps6594_rtc_cells), NULL, 0,
 					   regmap_irq_get_domain(tps->irq_data));
diff --git a/drivers/mfd/tps6594-i2c.c b/drivers/mfd/tps6594-i2c.c
index 4ab91c34d9fb..7ff7516286fd 100644
--- a/drivers/mfd/tps6594-i2c.c
+++ b/drivers/mfd/tps6594-i2c.c
@@ -1,6 +1,11 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * I2C access driver for TI TPS65224/TPS6594/TPS6593/LP8764 PMICs
+ * I2C access driver for the following TI PMICs:
+ *  - LP8764
+ *  - TPS65224
+ *  - TPS652G1
+ *  - TPS6593
+ *  - TPS6594
  *
  * Copyright (C) 2023 BayLibre Incorporated - https://www.baylibre.com/
  */
@@ -197,6 +202,7 @@ static const struct of_device_id tps6594_i2c_of_match_table[] = {
 	{ .compatible = "ti,tps6593-q1", .data = (void *)TPS6593, },
 	{ .compatible = "ti,lp8764-q1",  .data = (void *)LP8764,  },
 	{ .compatible = "ti,tps65224-q1", .data = (void *)TPS65224, },
+	{ .compatible = "ti,tps652g1", .data = (void *)TPS652G1, },
 	{}
 };
 MODULE_DEVICE_TABLE(of, tps6594_i2c_of_match_table);
@@ -222,7 +228,7 @@ static int tps6594_i2c_probe(struct i2c_client *client)
 		return dev_err_probe(dev, -EINVAL, "Failed to find matching chip ID\n");
 	tps->chip_id = (unsigned long)match->data;
 
-	if (tps->chip_id == TPS65224)
+	if (tps->chip_id == TPS65224 || tps->chip_id == TPS652G1)
 		tps6594_i2c_regmap_config.volatile_table = &tps65224_volatile_table;
 
 	tps->regmap = devm_regmap_init(dev, NULL, client, &tps6594_i2c_regmap_config);
diff --git a/drivers/mfd/tps6594-spi.c b/drivers/mfd/tps6594-spi.c
index 6ebccb79f0cc..944b7313a1d9 100644
--- a/drivers/mfd/tps6594-spi.c
+++ b/drivers/mfd/tps6594-spi.c
@@ -1,6 +1,11 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * SPI access driver for TI TPS65224/TPS6594/TPS6593/LP8764 PMICs
+ * SPI access driver for the following TI PMICs:
+ *  - LP8764
+ *  - TPS65224
+ *  - TPS652G1
+ *  - TPS6593
+ *  - TPS6594
  *
  * Copyright (C) 2023 BayLibre Incorporated - https://www.baylibre.com/
  */
@@ -82,6 +87,7 @@ static const struct of_device_id tps6594_spi_of_match_table[] = {
 	{ .compatible = "ti,tps6593-q1", .data = (void *)TPS6593, },
 	{ .compatible = "ti,lp8764-q1",  .data = (void *)LP8764,  },
 	{ .compatible = "ti,tps65224-q1", .data = (void *)TPS65224, },
+	{ .compatible = "ti,tps652g1", .data = (void *)TPS652G1, },
 	{}
 };
 MODULE_DEVICE_TABLE(of, tps6594_spi_of_match_table);
@@ -107,7 +113,7 @@ static int tps6594_spi_probe(struct spi_device *spi)
 		return dev_err_probe(dev, -EINVAL, "Failed to find matching chip ID\n");
 	tps->chip_id = (unsigned long)match->data;
 
-	if (tps->chip_id == TPS65224)
+	if (tps->chip_id == TPS65224 || tps->chip_id == TPS652G1)
 		tps6594_spi_regmap_config.volatile_table = &tps65224_volatile_table;
 
 	tps->regmap = devm_regmap_init(dev, NULL, spi, &tps6594_spi_regmap_config);
diff --git a/include/linux/mfd/tps6594.h b/include/linux/mfd/tps6594.h
index 16543fd4d83e..021db8875963 100644
--- a/include/linux/mfd/tps6594.h
+++ b/include/linux/mfd/tps6594.h
@@ -19,6 +19,7 @@ enum pmic_id {
 	TPS6593,
 	LP8764,
 	TPS65224,
+	TPS652G1,
 };
 
 /* Macro to get page index from register address */
-- 
cgit v1.2.3


From dd831ac8221e691e9e918585b1003c7071df0379 Mon Sep 17 00:00:00 2001
From: Xiang Mei <xmei5@asu.edu>
Date: Sat, 5 Jul 2025 14:21:43 -0700
Subject: net/sched: sch_qfq: Fix null-deref in agg_dequeue

To prevent a potential crash in agg_dequeue (net/sched/sch_qfq.c)
when cl->qdisc->ops->peek(cl->qdisc) returns NULL, we check the return
value before using it, similar to the existing approach in sch_hfsc.c.

To avoid code duplication, the following changes are made:

1. Changed qdisc_warn_nonwc(include/net/pkt_sched.h) into a static
inline function.

2. Moved qdisc_peek_len from net/sched/sch_hfsc.c to
include/net/pkt_sched.h so that sch_qfq can reuse it.

3. Applied qdisc_peek_len in agg_dequeue to avoid crashing.

Signed-off-by: Xiang Mei <xmei5@asu.edu>
Reviewed-by: Cong Wang <xiyou.wangcong@gmail.com>
Link: https://patch.msgid.link/20250705212143.3982664-1-xmei5@asu.edu
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/pkt_sched.h | 25 ++++++++++++++++++++++++-
 net/sched/sch_api.c     | 10 ----------
 net/sched/sch_hfsc.c    | 16 ----------------
 net/sched/sch_qfq.c     |  2 +-
 4 files changed, 25 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index d7b7b6cd4aa1..8a75c73fc555 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -114,7 +114,6 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
 					struct netlink_ext_ack *extack);
 void qdisc_put_rtab(struct qdisc_rate_table *tab);
 void qdisc_put_stab(struct qdisc_size_table *tab);
-void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc);
 bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 		     struct net_device *dev, struct netdev_queue *txq,
 		     spinlock_t *root_lock, bool validate);
@@ -290,4 +289,28 @@ static inline bool tc_qdisc_stats_dump(struct Qdisc *sch,
 	return true;
 }
 
+static inline void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
+{
+	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
+		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
+			txt, qdisc->ops->id, qdisc->handle >> 16);
+		qdisc->flags |= TCQ_F_WARN_NONWC;
+	}
+}
+
+static inline unsigned int qdisc_peek_len(struct Qdisc *sch)
+{
+	struct sk_buff *skb;
+	unsigned int len;
+
+	skb = sch->ops->peek(sch);
+	if (unlikely(skb == NULL)) {
+		qdisc_warn_nonwc("qdisc_peek_len", sch);
+		return 0;
+	}
+	len = qdisc_pkt_len(skb);
+
+	return len;
+}
+
 #endif
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 241e86cec9c5..d7c767b861a4 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -601,16 +601,6 @@ out:
 	qdisc_skb_cb(skb)->pkt_len = pkt_len;
 }
 
-void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
-{
-	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
-		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
-			txt, qdisc->ops->id, qdisc->handle >> 16);
-		qdisc->flags |= TCQ_F_WARN_NONWC;
-	}
-}
-EXPORT_SYMBOL(qdisc_warn_nonwc);
-
 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
 {
 	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 5a7745170e84..d8fd35da32a7 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -835,22 +835,6 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time)
 	}
 }
 
-static unsigned int
-qdisc_peek_len(struct Qdisc *sch)
-{
-	struct sk_buff *skb;
-	unsigned int len;
-
-	skb = sch->ops->peek(sch);
-	if (unlikely(skb == NULL)) {
-		qdisc_warn_nonwc("qdisc_peek_len", sch);
-		return 0;
-	}
-	len = qdisc_pkt_len(skb);
-
-	return len;
-}
-
 static void
 hfsc_adjust_levels(struct hfsc_class *cl)
 {
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index bf1282cb22eb..bcce36608871 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -989,7 +989,7 @@ static struct sk_buff *agg_dequeue(struct qfq_aggregate *agg,
 
 	if (cl->qdisc->q.qlen == 0) /* no more packets, remove from list */
 		list_del_init(&cl->alist);
-	else if (cl->deficit < qdisc_pkt_len(cl->qdisc->ops->peek(cl->qdisc))) {
+	else if (cl->deficit < qdisc_peek_len(cl->qdisc)) {
 		cl->deficit += agg->lmax;
 		list_move_tail(&cl->alist, &agg->active);
 	}
-- 
cgit v1.2.3


From 7a8212f3bcfab753602599da644fb5bdeaf8ff2d Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Wed, 11 Jun 2025 15:40:42 +0300
Subject: media: v4l: Make media_entity_to_video_device() NULL-safe

Make media_entity_to_video_device(NULL) return NULL, instead of an invalid
pointer value.

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil@xs4all.nl>
---
 include/media/v4l2-dev.h | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/media/v4l2-dev.h b/include/media/v4l2-dev.h
index 1b6222fab24e..a69801274800 100644
--- a/include/media/v4l2-dev.h
+++ b/include/media/v4l2-dev.h
@@ -313,10 +313,16 @@ struct video_device {
  * media_entity_to_video_device - Returns a &struct video_device from
  *	the &struct media_entity embedded on it.
  *
- * @__entity: pointer to &struct media_entity
- */
-#define media_entity_to_video_device(__entity) \
-	container_of(__entity, struct video_device, entity)
+ * @__entity: pointer to &struct media_entity, may be NULL
+ */
+#define media_entity_to_video_device(__entity)				\
+({									\
+	typeof(__entity) __me_vdev_ent = __entity;			\
+									\
+	__me_vdev_ent ?							\
+		container_of(__me_vdev_ent,  struct video_device, entity) : \
+		NULL;							\
+})
 
 /**
  * to_video_device - Returns a &struct video_device from the
-- 
cgit v1.2.3


From 1fff2ee377e1c2230054e65092def460dd40b587 Mon Sep 17 00:00:00 2001
From: Mehdi Djait <mehdi.djait@linux.intel.com>
Date: Mon, 23 Jun 2025 15:51:15 +0200
Subject: media: uapi: videodev2: Fix comment for 12-bit packed Bayer formats

For 12-bit packed Bayer formats: every two consecutive samples are
packed into three bytes. Fix the corresponding comment.

Signed-off-by: Mehdi Djait <mehdi.djait@linux.intel.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Hans Verkuil <hverkuil@xs4all.nl>
---
 include/uapi/linux/videodev2.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index 6f7bd38dd5aa..1bb1979f6c18 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -726,7 +726,7 @@ struct v4l2_pix_format {
 #define V4L2_PIX_FMT_SGBRG12 v4l2_fourcc('G', 'B', '1', '2') /* 12  GBGB.. RGRG.. */
 #define V4L2_PIX_FMT_SGRBG12 v4l2_fourcc('B', 'A', '1', '2') /* 12  GRGR.. BGBG.. */
 #define V4L2_PIX_FMT_SRGGB12 v4l2_fourcc('R', 'G', '1', '2') /* 12  RGRG.. GBGB.. */
-	/* 12bit raw bayer packed, 6 bytes for every 4 pixels */
+	/* 12bit raw bayer packed, 3 bytes for every 2 pixels */
 #define V4L2_PIX_FMT_SBGGR12P v4l2_fourcc('p', 'B', 'C', 'C')
 #define V4L2_PIX_FMT_SGBRG12P v4l2_fourcc('p', 'G', 'C', 'C')
 #define V4L2_PIX_FMT_SGRBG12P v4l2_fourcc('p', 'g', 'C', 'C')
-- 
cgit v1.2.3


From c9596e882032827a5c7f99a5ca481cd46251a473 Mon Sep 17 00:00:00 2001
From: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Date: Sat, 5 Jul 2025 10:37:41 +0200
Subject: media: v4l2-subdev: Remove g_pixelaspect operation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are no consumers or implementations left in tree for the subdevice
operation g_pixelaspect, delete it.

Signed-off-by: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Reviewed-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Link: https://lore.kernel.org/r/20250705083741.77517-4-niklas.soderlund+renesas@ragnatech.se
Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil@xs4all.nl>
---
 include/media/v4l2-subdev.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include')

diff --git a/include/media/v4l2-subdev.h b/include/media/v4l2-subdev.h
index 57f2bcb4eb16..5dcf4065708f 100644
--- a/include/media/v4l2-subdev.h
+++ b/include/media/v4l2-subdev.h
@@ -460,8 +460,6 @@ enum v4l2_subdev_pre_streamon_flags {
  *	but use the v4l2_subdev_enable_streams() and
  *	v4l2_subdev_disable_streams() helpers.
  *
- * @g_pixelaspect: callback to return the pixelaspect ratio.
- *
  * @s_rx_buffer: set a host allocated memory buffer for the subdev. The subdev
  *	can adjust @size to a lower value and must not write more data to the
  *	buffer starting at @data than the original value of @size.
@@ -491,7 +489,6 @@ struct v4l2_subdev_video_ops {
 	int (*g_tvnorms_output)(struct v4l2_subdev *sd, v4l2_std_id *std);
 	int (*g_input_status)(struct v4l2_subdev *sd, u32 *status);
 	int (*s_stream)(struct v4l2_subdev *sd, int enable);
-	int (*g_pixelaspect)(struct v4l2_subdev *sd, struct v4l2_fract *aspect);
 	int (*s_rx_buffer)(struct v4l2_subdev *sd, void *buf,
 			   unsigned int *size);
 	int (*pre_streamon)(struct v4l2_subdev *sd, u32 flags);
-- 
cgit v1.2.3


From 45e359be1ce88fb22e61fa3aa23b2e450a6cae03 Mon Sep 17 00:00:00 2001
From: Jason Xing <kernelxing@tencent.com>
Date: Sat, 5 Jul 2025 00:01:38 +0800
Subject: net: xsk: introduce XDP_MAX_TX_SKB_BUDGET setsockopt

This patch provides a setsockopt method that lets applications adjust the
maximum number of descs handled in one send syscall. It mitigates the
situation where the default value (32) is too small and leads to a higher
frequency of send syscalls.

Considering the prosperity/complexity the applications have, there is no
absolutely ideal suggestion fitting all cases. So keep 32 as its default
value like before.

The patch does the following things:
- Add XDP_MAX_TX_SKB_BUDGET socket option.
- Set max_tx_budget to 32 by default in the initialization phase as a
  per-socket granular control.
- Set the range of max_tx_budget as [32, xs->tx->nentries].

The idea behind this comes out of real workloads in production. We use a
user-level stack with xsk support to accelerate sending packets and
minimize triggering syscalls. When the packets are aggregated, it's not
hard to hit the upper bound (namely, 32). The moment user-space stack
fetches the -EAGAIN error number passed from sendto(), it will loop to try
again until all the expected descs from tx ring are sent out to the driver.
Enlarging the XDP_MAX_TX_SKB_BUDGET value contributes to less frequency of
sendto() and higher throughput/PPS.

Here is what I did in production, along with some numbers as follows:
For one application I saw lately, I suggested using 128 as max_tx_budget
because I saw two limitations without changing any default configuration:
1) XDP_MAX_TX_SKB_BUDGET, 2) socket sndbuf which is 212992 decided by
net.core.wmem_default. As to XDP_MAX_TX_SKB_BUDGET, the scenario behind
this was I counted how many descs are transmitted to the driver at one
time of sendto() based on [1] patch and then I calculated the
possibility of hitting the upper bound. Finally I chose 128 as a
suitable value because 1) it covers most of the cases, 2) a higher
number would not bring evident results. After tweaking the parameters,
a stable improvement of around 4% for both PPS and throughput and lower
resource consumption were observed with strace -c -p xxx:
1) %time was decreased by 7.8%
2) error counter was decreased from 18367 to 572

[1]: https://lore.kernel.org/all/20250619093641.70700-1-kerneljasonxing@gmail.com/

Signed-off-by: Jason Xing <kernelxing@tencent.com>
Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Link: https://patch.msgid.link/20250704160138.48677-1-kerneljasonxing@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 Documentation/networking/af_xdp.rst |  9 +++++++++
 include/net/xdp_sock.h              |  1 +
 include/uapi/linux/if_xdp.h         |  1 +
 net/xdp/xsk.c                       | 21 +++++++++++++++++++--
 tools/include/uapi/linux/if_xdp.h   |  1 +
 5 files changed, 31 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/af_xdp.rst b/Documentation/networking/af_xdp.rst
index d486014bb31d..50d92084a49c 100644
--- a/Documentation/networking/af_xdp.rst
+++ b/Documentation/networking/af_xdp.rst
@@ -438,6 +438,15 @@ is created by a privileged process and passed to a non-privileged one.
 Once the option is set, kernel will refuse attempts to bind that socket
 to a different interface.  Updating the value requires CAP_NET_RAW.
 
+XDP_MAX_TX_SKB_BUDGET setsockopt
+--------------------------------
+
+This setsockopt sets the maximum number of descriptors that can be handled
+and passed to the driver in one send syscall. It applies in copy mode and
+allows an application to tune the per-socket maximum number of iterations
+for better throughput and fewer send syscalls.
+Allowed range is [32, xs->tx->nentries].
+
 XDP_STATISTICS getsockopt
 -------------------------
 
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index e8bd6ddb7b12..ce587a225661 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -84,6 +84,7 @@ struct xdp_sock {
 	struct list_head map_list;
 	/* Protects map_list */
 	spinlock_t map_list_lock;
+	u32 max_tx_budget;
 	/* Protects multiple processes in the control path */
 	struct mutex mutex;
 	struct xsk_queue *fq_tmp; /* Only as tmp storage before bind */
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index 44f2bb93e7e6..23a062781468 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -79,6 +79,7 @@ struct xdp_mmap_offsets {
 #define XDP_UMEM_COMPLETION_RING	6
 #define XDP_STATISTICS			7
 #define XDP_OPTIONS			8
+#define XDP_MAX_TX_SKB_BUDGET		9
 
 struct xdp_umem_reg {
 	__u64 addr; /* Start of packet data area */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index bd61b0bc9c24..9c3acecc14b1 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -34,7 +34,7 @@
 #include "xsk.h"
 
 #define TX_BATCH_SIZE 32
-#define MAX_PER_SOCKET_BUDGET (TX_BATCH_SIZE)
+#define MAX_PER_SOCKET_BUDGET 32
 
 void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
 {
@@ -783,10 +783,10 @@ free_err:
 static int __xsk_generic_xmit(struct sock *sk)
 {
 	struct xdp_sock *xs = xdp_sk(sk);
-	u32 max_batch = TX_BATCH_SIZE;
 	bool sent_frame = false;
 	struct xdp_desc desc;
 	struct sk_buff *skb;
+	u32 max_batch;
 	int err = 0;
 
 	mutex_lock(&xs->mutex);
@@ -800,6 +800,7 @@ static int __xsk_generic_xmit(struct sock *sk)
 	if (xs->queue_id >= xs->dev->real_num_tx_queues)
 		goto out;
 
+	max_batch = READ_ONCE(xs->max_tx_budget);
 	while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
 		if (max_batch-- == 0) {
 			err = -EAGAIN;
@@ -1440,6 +1441,21 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 		mutex_unlock(&xs->mutex);
 		return err;
 	}
+	case XDP_MAX_TX_SKB_BUDGET:
+	{
+		unsigned int budget;
+
+		if (optlen != sizeof(budget))
+			return -EINVAL;
+		if (copy_from_sockptr(&budget, optval, sizeof(budget)))
+			return -EFAULT;
+		if (!xs->tx ||
+		    budget < TX_BATCH_SIZE || budget > xs->tx->nentries)
+			return -EACCES;
+
+		WRITE_ONCE(xs->max_tx_budget, budget);
+		return 0;
+	}
 	default:
 		break;
 	}
@@ -1737,6 +1753,7 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol,
 
 	xs = xdp_sk(sk);
 	xs->state = XSK_READY;
+	xs->max_tx_budget = TX_BATCH_SIZE;
 	mutex_init(&xs->mutex);
 
 	INIT_LIST_HEAD(&xs->map_list);
diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h
index 44f2bb93e7e6..23a062781468 100644
--- a/tools/include/uapi/linux/if_xdp.h
+++ b/tools/include/uapi/linux/if_xdp.h
@@ -79,6 +79,7 @@ struct xdp_mmap_offsets {
 #define XDP_UMEM_COMPLETION_RING	6
 #define XDP_STATISTICS			7
 #define XDP_OPTIONS			8
+#define XDP_MAX_TX_SKB_BUDGET		9
 
 struct xdp_umem_reg {
 	__u64 addr; /* Start of packet data area */
-- 
cgit v1.2.3


From fca02263f27eee093379844ac0fb280bf70e6aed Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Wed, 9 Jul 2025 22:58:54 -0700
Subject: iommufd: Correct virt_id kdoc at struct iommu_vdevice_alloc

The userspace-api iommufd.rst already describes it correctly, but the uAPI
doc remained uncorrected. Thus, fix it.

Link: https://patch.msgid.link/r/2cdcecaf2babee16fda7545ccad4e5bed7a5032d.1752126748.git.nicolinc@nvidia.com
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 include/uapi/linux/iommufd.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index f29b6c44655e..4bc05e4621c1 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -995,7 +995,7 @@ struct iommu_viommu_alloc {
  * @dev_id: The physical device to allocate a virtual instance on the vIOMMU
  * @out_vdevice_id: Object handle for the vDevice. Pass to IOMMU_DESTORY
  * @virt_id: Virtual device ID per vIOMMU, e.g. vSID of ARM SMMUv3, vDeviceID
- *           of AMD IOMMU, and vRID of a nested Intel VT-d to a Context Table
+ *           of AMD IOMMU, and vRID of Intel VT-d
  *
  * Allocate a virtual device instance (for a physical device) against a vIOMMU.
  * This instance holds the device's information (related to its vIOMMU) in a VM.
-- 
cgit v1.2.3


From 4b57c057f9e6668ae442b19902dab8a73fe7b209 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Wed, 9 Jul 2025 22:58:56 -0700
Subject: iommu: Use enum iommu_hw_info_type for type in hw_info op

Replace u32 to make it clear. No functional changes.

Also simplify the kdoc since the type itself is clear enough.

Link: https://patch.msgid.link/r/651c50dee8ab900f691202ef0204cd5a43fdd6a2.1752126748.git.nicolinc@nvidia.com
Reviewed-by: Pranjal Shrivastava <praan@google.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c | 3 ++-
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h         | 3 ++-
 drivers/iommu/intel/iommu.c                         | 3 ++-
 drivers/iommu/iommufd/selftest.c                    | 3 ++-
 include/linux/iommu.h                               | 6 +++---
 5 files changed, 11 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
index 9f59c95a254c..69bbe39e28de 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
@@ -7,7 +7,8 @@
 
 #include "arm-smmu-v3.h"
 
-void *arm_smmu_hw_info(struct device *dev, u32 *length, u32 *type)
+void *arm_smmu_hw_info(struct device *dev, u32 *length,
+		       enum iommu_hw_info_type *type)
 {
 	struct arm_smmu_master *master = dev_iommu_priv_get(dev);
 	struct iommu_hw_info_arm_smmuv3 *info;
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index bb39af84e6b0..04463a4aaa26 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -1033,7 +1033,8 @@ struct arm_vsmmu {
 };
 
 #if IS_ENABLED(CONFIG_ARM_SMMU_V3_IOMMUFD)
-void *arm_smmu_hw_info(struct device *dev, u32 *length, u32 *type);
+void *arm_smmu_hw_info(struct device *dev, u32 *length,
+		       enum iommu_hw_info_type *type);
 size_t arm_smmu_get_viommu_size(struct device *dev,
 				enum iommu_viommu_type viommu_type);
 int arm_vsmmu_init(struct iommufd_viommu *viommu,
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 7aa3932251b2..850f1a6f548c 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -4091,7 +4091,8 @@ out_remove_dev_pasid:
 	return ret;
 }
 
-static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
+static void *intel_iommu_hw_info(struct device *dev, u32 *length,
+				 enum iommu_hw_info_type *type)
 {
 	struct device_domain_info *info = dev_iommu_priv_get(dev);
 	struct intel_iommu *iommu = info->iommu;
diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c
index 74ca955a766e..7a9abe3f47d5 100644
--- a/drivers/iommu/iommufd/selftest.c
+++ b/drivers/iommu/iommufd/selftest.c
@@ -287,7 +287,8 @@ static struct iommu_domain mock_blocking_domain = {
 	.ops = &mock_blocking_ops,
 };
 
-static void *mock_domain_hw_info(struct device *dev, u32 *length, u32 *type)
+static void *mock_domain_hw_info(struct device *dev, u32 *length,
+				 enum iommu_hw_info_type *type)
 {
 	struct iommu_test_hw_info *info;
 
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 04548b18df28..b87c2841e6bc 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -563,8 +563,7 @@ iommu_copy_struct_from_full_user_array(void *kdst, size_t kdst_entry_size,
  * @capable: check capability
  * @hw_info: report iommu hardware information. The data buffer returned by this
  *           op is allocated in the iommu driver and freed by the caller after
- *           use. The information type is one of enum iommu_hw_info_type defined
- *           in include/uapi/linux/iommufd.h.
+ *           use.
  * @domain_alloc: Do not use in new drivers
  * @domain_alloc_identity: allocate an IDENTITY domain. Drivers should prefer to
  *                         use identity_domain instead. This should only be used
@@ -623,7 +622,8 @@ iommu_copy_struct_from_full_user_array(void *kdst, size_t kdst_entry_size,
  */
 struct iommu_ops {
 	bool (*capable)(struct device *dev, enum iommu_cap);
-	void *(*hw_info)(struct device *dev, u32 *length, u32 *type);
+	void *(*hw_info)(struct device *dev, u32 *length,
+			 enum iommu_hw_info_type *type);
 
 	/* Domain allocation and freeing by the iommu driver */
 #if IS_ENABLED(CONFIG_FSL_PAMU)
-- 
cgit v1.2.3


From 3fcf56a2393b399f289a473181ce6b19f716b59d Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Wed, 9 Jul 2025 22:58:57 -0700
Subject: iommu: Add iommu_copy_struct_to_user helper

Similar to the iommu_copy_struct_from_user helper receiving data from the
user space, add an iommu_copy_struct_to_user helper to report output data
back to the user space data pointer.

Link: https://patch.msgid.link/r/fa292c2a730aadd77085ec3a8272360c96eabb9c.1752126748.git.nicolinc@nvidia.com
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Pranjal Shrivastava <praan@google.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 include/linux/iommu.h | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

(limited to 'include')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index b87c2841e6bc..fd7319706684 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -558,6 +558,46 @@ iommu_copy_struct_from_full_user_array(void *kdst, size_t kdst_entry_size,
 	return 0;
 }
 
+/**
+ * __iommu_copy_struct_to_user - Report iommu driver specific user space data
+ * @dst_data: Pointer to a struct iommu_user_data for user space data location
+ * @src_data: Pointer to an iommu driver specific user data that is defined in
+ *            include/uapi/linux/iommufd.h
+ * @data_type: The data type of the @src_data. Must match with @dst_data.type
+ * @data_len: Length of current user data structure, i.e. sizeof(struct _src)
+ * @min_len: Initial length of user data structure for backward compatibility.
+ *           This should be offsetofend using the last member in the user data
+ *           struct that was initially added to include/uapi/linux/iommufd.h
+ */
+static inline int
+__iommu_copy_struct_to_user(const struct iommu_user_data *dst_data,
+			    void *src_data, unsigned int data_type,
+			    size_t data_len, size_t min_len)
+{
+	if (WARN_ON(!dst_data || !src_data))
+		return -EINVAL;
+	if (dst_data->type != data_type)
+		return -EINVAL;
+	if (dst_data->len < min_len || data_len < dst_data->len)
+		return -EINVAL;
+	return copy_struct_to_user(dst_data->uptr, dst_data->len, src_data,
+				   data_len, NULL);
+}
+
+/**
+ * iommu_copy_struct_to_user - Report iommu driver specific user space data
+ * @user_data: Pointer to a struct iommu_user_data for user space data location
+ * @ksrc: Pointer to an iommu driver specific user data that is defined in
+ *        include/uapi/linux/iommufd.h
+ * @data_type: The data type of the @ksrc. Must match with @user_data->type
+ * @min_last: The last member of the data structure @ksrc points in the initial
+ *            version.
+ * Return 0 for success, otherwise -error.
+ */
+#define iommu_copy_struct_to_user(user_data, ksrc, data_type, min_last)        \
+	__iommu_copy_struct_to_user(user_data, ksrc, data_type, sizeof(*ksrc), \
+				    offsetofend(typeof(*ksrc), min_last))
+
 /**
  * struct iommu_ops - iommu ops and capabilities
  * @capable: check capability
-- 
cgit v1.2.3


From c3436d42f812faffac94f8fb3fb246ab43ffdffe Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Wed, 9 Jul 2025 22:58:58 -0700
Subject: iommu: Pass in a driver-level user data structure to viommu_init op

The new type of vIOMMU for tegra241-cmdqv allows user space VM to use one
of its virtual command queue HW resources exclusively. This requires user
space to mmap the corresponding MMIO page from kernel space for direct HW
control.

To forward the mmap info (offset and length), iommufd should add a
driver-specific data structure to the IOMMUFD_CMD_VIOMMU_ALLOC ioctl, so
that the driver can report the info back to user space during vIOMMU
initialization.

Similar to the existing ioctls and their IOMMU handlers, add a user_data
to viommu_init op to bridge between iommufd and drivers.

Link: https://patch.msgid.link/r/90bd5637dab7f5507c7a64d2c4826e70431e45a4.1752126748.git.nicolinc@nvidia.com
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Pranjal Shrivastava <praan@google.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Vasant Hegde <vasant.hegde@amd.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c | 3 ++-
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h         | 3 ++-
 drivers/iommu/iommufd/selftest.c                    | 3 ++-
 drivers/iommu/iommufd/viommu.c                      | 2 +-
 include/linux/iommu.h                               | 3 ++-
 5 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
index 69bbe39e28de..170d69162848 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
@@ -419,7 +419,8 @@ size_t arm_smmu_get_viommu_size(struct device *dev,
 }
 
 int arm_vsmmu_init(struct iommufd_viommu *viommu,
-		   struct iommu_domain *parent_domain)
+		   struct iommu_domain *parent_domain,
+		   const struct iommu_user_data *user_data)
 {
 	struct arm_vsmmu *vsmmu = container_of(viommu, struct arm_vsmmu, core);
 	struct arm_smmu_device *smmu =
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 04463a4aaa26..c1ced4d4b6d1 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -1038,7 +1038,8 @@ void *arm_smmu_hw_info(struct device *dev, u32 *length,
 size_t arm_smmu_get_viommu_size(struct device *dev,
 				enum iommu_viommu_type viommu_type);
 int arm_vsmmu_init(struct iommufd_viommu *viommu,
-		   struct iommu_domain *parent_domain);
+		   struct iommu_domain *parent_domain,
+		   const struct iommu_user_data *user_data);
 int arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state,
 				    struct arm_smmu_nested_domain *nested_domain);
 void arm_smmu_attach_commit_vmaster(struct arm_smmu_attach_state *state);
diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c
index 7a9abe3f47d5..0d896a89ace7 100644
--- a/drivers/iommu/iommufd/selftest.c
+++ b/drivers/iommu/iommufd/selftest.c
@@ -779,7 +779,8 @@ static size_t mock_get_viommu_size(struct device *dev,
 }
 
 static int mock_viommu_init(struct iommufd_viommu *viommu,
-			    struct iommu_domain *parent_domain)
+			    struct iommu_domain *parent_domain,
+			    const struct iommu_user_data *user_data)
 {
 	struct mock_iommu_device *mock_iommu = container_of(
 		viommu->iommu_dev, struct mock_iommu_device, iommu_dev);
diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
index bc8796e6684e..2009a421efae 100644
--- a/drivers/iommu/iommufd/viommu.c
+++ b/drivers/iommu/iommufd/viommu.c
@@ -84,7 +84,7 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd)
 	 */
 	viommu->iommu_dev = __iommu_get_iommu_dev(idev->dev);
 
-	rc = ops->viommu_init(viommu, hwpt_paging->common.domain);
+	rc = ops->viommu_init(viommu, hwpt_paging->common.domain, NULL);
 	if (rc)
 		goto out_put_hwpt;
 
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index fd7319706684..e06a0fbe4bc7 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -700,7 +700,8 @@ struct iommu_ops {
 	size_t (*get_viommu_size)(struct device *dev,
 				  enum iommu_viommu_type viommu_type);
 	int (*viommu_init)(struct iommufd_viommu *viommu,
-			   struct iommu_domain *parent_domain);
+			   struct iommu_domain *parent_domain,
+			   const struct iommu_user_data *user_data);
 
 	const struct iommu_domain_ops *default_domain_ops;
 	unsigned long pgsize_bitmap;
-- 
cgit v1.2.3


From 1976cdf61ce9b6f97b5212676a3b9f74c68f6073 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Wed, 9 Jul 2025 22:58:59 -0700
Subject: iommufd/viommu: Allow driver-specific user data for a vIOMMU object

The new type of vIOMMU for the tegra241-cmdqv driver needs driver-specific
user data. So, add data_len/data_uptr to the iommu_viommu_alloc uAPI and
pass the data in via the viommu_init iommu op.

Link: https://patch.msgid.link/r/2315b0e164b355746387e960745ac9154caec124.1752126748.git.nicolinc@nvidia.com
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Acked-by: Pranjal Shrivastava <praan@google.com>
Acked-by: Alok Tiwari <alok.a.tiwari@oracle.com>
Reviewed-by: Vasant Hegde <vasant.hegde@amd.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/iommufd/viommu.c | 8 +++++++-
 include/uapi/linux/iommufd.h   | 6 ++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
index 2009a421efae..c0365849f849 100644
--- a/drivers/iommu/iommufd/viommu.c
+++ b/drivers/iommu/iommufd/viommu.c
@@ -17,6 +17,11 @@ void iommufd_viommu_destroy(struct iommufd_object *obj)
 int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd)
 {
 	struct iommu_viommu_alloc *cmd = ucmd->cmd;
+	const struct iommu_user_data user_data = {
+		.type = cmd->type,
+		.uptr = u64_to_user_ptr(cmd->data_uptr),
+		.len = cmd->data_len,
+	};
 	struct iommufd_hwpt_paging *hwpt_paging;
 	struct iommufd_viommu *viommu;
 	struct iommufd_device *idev;
@@ -84,7 +89,8 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd)
 	 */
 	viommu->iommu_dev = __iommu_get_iommu_dev(idev->dev);
 
-	rc = ops->viommu_init(viommu, hwpt_paging->common.domain, NULL);
+	rc = ops->viommu_init(viommu, hwpt_paging->common.domain,
+			      user_data.len ? &user_data : NULL);
 	if (rc)
 		goto out_put_hwpt;
 
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 4bc05e4621c1..04eee77335cf 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -965,6 +965,9 @@ enum iommu_viommu_type {
  * @dev_id: The device's physical IOMMU will be used to back the virtual IOMMU
  * @hwpt_id: ID of a nesting parent HWPT to associate to
  * @out_viommu_id: Output virtual IOMMU ID for the allocated object
+ * @data_len: Length of the type specific data
+ * @__reserved: Must be 0
+ * @data_uptr: User pointer to a driver-specific virtual IOMMU data
  *
  * Allocate a virtual IOMMU object, representing the underlying physical IOMMU's
  * virtualization support that is a security-isolated slice of the real IOMMU HW
@@ -985,6 +988,9 @@ struct iommu_viommu_alloc {
 	__u32 dev_id;
 	__u32 hwpt_id;
 	__u32 out_viommu_id;
+	__u32 data_len;
+	__u32 __reserved;
+	__aligned_u64 data_uptr;
 };
 #define IOMMU_VIOMMU_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VIOMMU_ALLOC)
 
-- 
cgit v1.2.3


From 47462586f91358499897fddb20f6bb9cec5f4213 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Wed, 9 Jul 2025 23:07:52 -0700
Subject: fscrypt: Remove gfp_t argument from fscrypt_encrypt_block_inplace()

This argument is no longer used, so remove it.

Reviewed-by: Alex Markuze <amarkuze@redhat.com>
Link: https://lore.kernel.org/r/20250710060754.637098-6-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 fs/ceph/crypto.c        | 3 +--
 fs/crypto/crypto.c      | 3 +--
 fs/ubifs/crypto.c       | 2 +-
 include/linux/fscrypt.h | 5 ++---
 4 files changed, 5 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
index 3b3c4d8d401e..6d04d528ed03 100644
--- a/fs/ceph/crypto.c
+++ b/fs/ceph/crypto.c
@@ -523,8 +523,7 @@ int ceph_fscrypt_encrypt_block_inplace(const struct inode *inode,
 
 	doutc(cl, "%p %llx.%llx len %u offs %u blk %llu\n", inode,
 	      ceph_vinop(inode), len, offs, lblk_num);
-	return fscrypt_encrypt_block_inplace(inode, page, len, offs, lblk_num,
-					     gfp_flags);
+	return fscrypt_encrypt_block_inplace(inode, page, len, offs, lblk_num);
 }
 
 /**
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index bab0aacd4da3..b6ccab524fde 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -217,7 +217,6 @@ EXPORT_SYMBOL(fscrypt_encrypt_pagecache_blocks);
  * @offs:      Byte offset within @page at which the block to encrypt begins
  * @lblk_num:  Filesystem logical block number of the block, i.e. the 0-based
  *		number of the block within the file
- * @gfp_flags: Memory allocation flags
  *
  * Encrypt a possibly-compressed filesystem block that is located in an
  * arbitrary page, not necessarily in the original pagecache page.  The @inode
@@ -229,7 +228,7 @@ EXPORT_SYMBOL(fscrypt_encrypt_pagecache_blocks);
  */
 int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page,
 				  unsigned int len, unsigned int offs,
-				  u64 lblk_num, gfp_t gfp_flags)
+				  u64 lblk_num)
 {
 	if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units))
 		return -EOPNOTSUPP;
diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c
index 921f9033d0d2..fb5ac358077b 100644
--- a/fs/ubifs/crypto.c
+++ b/fs/ubifs/crypto.c
@@ -51,7 +51,7 @@ int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn,
 		memset(p + in_len, 0, pad_len - in_len);
 
 	err = fscrypt_encrypt_block_inplace(inode, virt_to_page(p), pad_len,
-					    offset_in_page(p), block, GFP_NOFS);
+					    offset_in_page(p), block);
 	if (err) {
 		ubifs_err(c, "fscrypt_encrypt_block_inplace() failed: %d", err);
 		return err;
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index 56fad33043d5..8d0e3ad89b94 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -314,7 +314,7 @@ struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio,
 		size_t len, size_t offs, gfp_t gfp_flags);
 int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page,
 				  unsigned int len, unsigned int offs,
-				  u64 lblk_num, gfp_t gfp_flags);
+				  u64 lblk_num);
 
 int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len,
 				     size_t offs);
@@ -487,8 +487,7 @@ static inline struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio,
 static inline int fscrypt_encrypt_block_inplace(const struct inode *inode,
 						struct page *page,
 						unsigned int len,
-						unsigned int offs, u64 lblk_num,
-						gfp_t gfp_flags)
+						unsigned int offs, u64 lblk_num)
 {
 	return -EOPNOTSUPP;
 }
-- 
cgit v1.2.3


From a8b289f0f2dcbadd8c207ad8f33cf7ba2b4eb088 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 10 Jul 2025 10:00:12 +0200
Subject: irqchip/irq-msi-lib: Fix build with PCI disabled

The armada-370-xp irqchip fails in some randconfig builds because
of a missing declaration:

In file included from drivers/irqchip/irq-armada-370-xp.c:23:
include/linux/irqchip/irq-msi-lib.h:25:39: error: 'struct msi_domain_info' declared inside parameter list will not be visible outside of this definition or declaration [-Werror]

Add a forward declaration for the msi_domain_info structure.

[ tglx: Fixed up the subsystem prefix. Is it really that hard to get right? ]

Fixes: e51b27438a10 ("irqchip: Make irq-msi-lib.h globally available")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/all/20250710080021.2303640-1-arnd@kernel.org
---
 include/linux/irqchip/irq-msi-lib.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/irqchip/irq-msi-lib.h b/include/linux/irqchip/irq-msi-lib.h
index dd8d1d138544..224ac28e88d7 100644
--- a/include/linux/irqchip/irq-msi-lib.h
+++ b/include/linux/irqchip/irq-msi-lib.h
@@ -17,6 +17,7 @@
 
 #define MATCH_PLATFORM_MSI	BIT(DOMAIN_BUS_PLATFORM_MSI)
 
+struct msi_domain_info;
 int msi_lib_irq_domain_select(struct irq_domain *d, struct irq_fwspec *fwspec,
 			      enum irq_domain_bus_token bus_token);
 
-- 
cgit v1.2.3


From 96698d1898bc79c783990ac7d5458b7c8f8e0b69 Mon Sep 17 00:00:00 2001
From: Wang Liang <wangliang74@huawei.com>
Date: Tue, 8 Jul 2025 11:33:42 +0800
Subject: net: replace ND_PRINTK with dynamic debug

ND_PRINTK with val > 1 only prints when ND_DEBUG is raised at compile
time. Replace it with dynamic debug. Convert ND_PRINTK with val <= 1 to
net_{err,warn}_ratelimited, and convert the rest to net_dbg_ratelimited.

Suggested-by: Ido Schimmel <idosch@idosch.org>
Signed-off-by: Wang Liang <wangliang74@huawei.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Link: https://patch.msgid.link/20250708033342.1627636-1-wangliang74@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/ndisc.h |   9 ---
 net/6lowpan/ndisc.c |  16 ++----
 net/ipv6/ndisc.c    | 157 ++++++++++++++++++++--------------------------------
 3 files changed, 67 insertions(+), 115 deletions(-)

(limited to 'include')

diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index 3c88d5bc5eed..d38783a2ce57 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -60,15 +60,6 @@ enum {
 
 #include <net/neighbour.h>
 
-/* Set to 3 to get tracing... */
-#define ND_DEBUG 1
-
-#define ND_PRINTK(val, level, fmt, ...)				\
-do {								\
-	if (val <= ND_DEBUG)					\
-		net_##level##_ratelimited(fmt, ##__VA_ARGS__);	\
-} while (0)
-
 struct ctl_table;
 struct inet6_dev;
 struct net_device;
diff --git a/net/6lowpan/ndisc.c b/net/6lowpan/ndisc.c
index c40b98f7743c..868d28583c0a 100644
--- a/net/6lowpan/ndisc.c
+++ b/net/6lowpan/ndisc.c
@@ -20,9 +20,8 @@ static int lowpan_ndisc_parse_802154_options(const struct net_device *dev,
 	switch (nd_opt->nd_opt_len) {
 	case NDISC_802154_SHORT_ADDR_LENGTH:
 		if (ndopts->nd_802154_opt_array[nd_opt->nd_opt_type])
-			ND_PRINTK(2, warn,
-				  "%s: duplicated short addr ND6 option found: type=%d\n",
-				  __func__, nd_opt->nd_opt_type);
+			net_dbg_ratelimited("%s: duplicated short addr ND6 option found: type=%d\n",
+					    __func__, nd_opt->nd_opt_type);
 		else
 			ndopts->nd_802154_opt_array[nd_opt->nd_opt_type] = nd_opt;
 		return 1;
@@ -63,8 +62,7 @@ static void lowpan_ndisc_802154_update(struct neighbour *n, u32 flags,
 			lladdr_short = __ndisc_opt_addr_data(ndopts->nd_802154_opts_src_lladdr,
 							     IEEE802154_SHORT_ADDR_LEN, 0);
 			if (!lladdr_short) {
-				ND_PRINTK(2, warn,
-					  "NA: invalid short link-layer address length\n");
+				net_dbg_ratelimited("NA: invalid short link-layer address length\n");
 				return;
 			}
 		}
@@ -75,8 +73,7 @@ static void lowpan_ndisc_802154_update(struct neighbour *n, u32 flags,
 			lladdr_short = __ndisc_opt_addr_data(ndopts->nd_802154_opts_tgt_lladdr,
 							     IEEE802154_SHORT_ADDR_LEN, 0);
 			if (!lladdr_short) {
-				ND_PRINTK(2, warn,
-					  "NA: invalid short link-layer address length\n");
+				net_dbg_ratelimited("NA: invalid short link-layer address length\n");
 				return;
 			}
 		}
@@ -209,9 +206,8 @@ static void lowpan_ndisc_prefix_rcv_add_addr(struct net *net,
 						   sllao, tokenized, valid_lft,
 						   prefered_lft);
 		if (err)
-			ND_PRINTK(2, warn,
-				  "RA: could not add a short address based address for prefix: %pI6c\n",
-				  &pinfo->prefix);
+			net_dbg_ratelimited("RA: could not add a short address based address for prefix: %pI6c\n",
+					    &pinfo->prefix);
 	}
 }
 #endif
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 28f35cbb6577..d4c5876e1771 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -243,9 +243,8 @@ struct ndisc_options *ndisc_parse_options(const struct net_device *dev,
 		case ND_OPT_NONCE:
 		case ND_OPT_REDIRECT_HDR:
 			if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) {
-				ND_PRINTK(2, warn,
-					  "%s: duplicated ND6 option found: type=%d\n",
-					  __func__, nd_opt->nd_opt_type);
+				net_dbg_ratelimited("%s: duplicated ND6 option found: type=%d\n",
+						    __func__, nd_opt->nd_opt_type);
 			} else {
 				ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt;
 			}
@@ -275,11 +274,8 @@ struct ndisc_options *ndisc_parse_options(const struct net_device *dev,
 			 * to accommodate future extension to the
 			 * protocol.
 			 */
-			ND_PRINTK(2, notice,
-				  "%s: ignored unsupported option; type=%d, len=%d\n",
-				  __func__,
-				  nd_opt->nd_opt_type,
-				  nd_opt->nd_opt_len);
+			net_dbg_ratelimited("%s: ignored unsupported option; type=%d, len=%d\n",
+					    __func__, nd_opt->nd_opt_type, nd_opt->nd_opt_len);
 		}
 next_opt:
 		opt_len -= l;
@@ -754,9 +750,8 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb)
 	probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES);
 	if (probes < 0) {
 		if (!(READ_ONCE(neigh->nud_state) & NUD_VALID)) {
-			ND_PRINTK(1, dbg,
-				  "%s: trying to ucast probe in NUD_INVALID: %pI6\n",
-				  __func__, target);
+			net_dbg_ratelimited("%s: trying to ucast probe in NUD_INVALID: %pI6\n",
+					    __func__, target);
 		}
 		ndisc_send_ns(dev, target, target, saddr, 0);
 	} else if ((probes -= NEIGH_VAR(neigh->parms, APP_PROBES)) < 0) {
@@ -814,7 +809,7 @@ static enum skb_drop_reason ndisc_recv_ns(struct sk_buff *skb)
 		return SKB_DROP_REASON_PKT_TOO_SMALL;
 
 	if (ipv6_addr_is_multicast(&msg->target)) {
-		ND_PRINTK(2, warn, "NS: multicast target address\n");
+		net_dbg_ratelimited("NS: multicast target address\n");
 		return reason;
 	}
 
@@ -823,7 +818,7 @@ static enum skb_drop_reason ndisc_recv_ns(struct sk_buff *skb)
 	 * DAD has to be destined for solicited node multicast address.
 	 */
 	if (dad && !ipv6_addr_is_solict_mult(daddr)) {
-		ND_PRINTK(2, warn, "NS: bad DAD packet (wrong destination)\n");
+		net_dbg_ratelimited("NS: bad DAD packet (wrong destination)\n");
 		return reason;
 	}
 
@@ -833,8 +828,7 @@ static enum skb_drop_reason ndisc_recv_ns(struct sk_buff *skb)
 	if (ndopts.nd_opts_src_lladdr) {
 		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, dev);
 		if (!lladdr) {
-			ND_PRINTK(2, warn,
-				  "NS: invalid link-layer address length\n");
+			net_dbg_ratelimited("NS: invalid link-layer address length\n");
 			return reason;
 		}
 
@@ -844,8 +838,7 @@ static enum skb_drop_reason ndisc_recv_ns(struct sk_buff *skb)
 		 *	in the message.
 		 */
 		if (dad) {
-			ND_PRINTK(2, warn,
-				  "NS: bad DAD packet (link-layer address option)\n");
+			net_dbg_ratelimited("NS: bad DAD packet (link-layer address option)\n");
 			return reason;
 		}
 	}
@@ -862,10 +855,8 @@ have_ifp:
 				if (nonce != 0 && ifp->dad_nonce == nonce) {
 					u8 *np = (u8 *)&nonce;
 					/* Matching nonce if looped back */
-					ND_PRINTK(2, notice,
-						  "%s: IPv6 DAD loopback for address %pI6c nonce %pM ignored\n",
-						  ifp->idev->dev->name,
-						  &ifp->addr, np);
+					net_dbg_ratelimited("%s: IPv6 DAD loopback for address %pI6c nonce %pM ignored\n",
+							    ifp->idev->dev->name, &ifp->addr, np);
 					goto out;
 				}
 				/*
@@ -1016,13 +1007,13 @@ static enum skb_drop_reason ndisc_recv_na(struct sk_buff *skb)
 		return SKB_DROP_REASON_PKT_TOO_SMALL;
 
 	if (ipv6_addr_is_multicast(&msg->target)) {
-		ND_PRINTK(2, warn, "NA: target address is multicast\n");
+		net_dbg_ratelimited("NA: target address is multicast\n");
 		return reason;
 	}
 
 	if (ipv6_addr_is_multicast(daddr) &&
 	    msg->icmph.icmp6_solicited) {
-		ND_PRINTK(2, warn, "NA: solicited NA is multicasted\n");
+		net_dbg_ratelimited("NA: solicited NA is multicasted\n");
 		return reason;
 	}
 
@@ -1041,8 +1032,7 @@ static enum skb_drop_reason ndisc_recv_na(struct sk_buff *skb)
 	if (ndopts.nd_opts_tgt_lladdr) {
 		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, dev);
 		if (!lladdr) {
-			ND_PRINTK(2, warn,
-				  "NA: invalid link-layer address length\n");
+			net_dbg_ratelimited("NA: invalid link-layer address length\n");
 			return reason;
 		}
 	}
@@ -1063,9 +1053,9 @@ static enum skb_drop_reason ndisc_recv_na(struct sk_buff *skb)
 		   unsolicited advertisement.
 		 */
 		if (skb->pkt_type != PACKET_LOOPBACK)
-			ND_PRINTK(1, warn,
-				  "NA: %pM advertised our address %pI6c on %s!\n",
-				  eth_hdr(skb)->h_source, &ifp->addr, ifp->idev->dev->name);
+			net_warn_ratelimited("NA: %pM advertised our address %pI6c on %s!\n",
+					     eth_hdr(skb)->h_source, &ifp->addr,
+					     ifp->idev->dev->name);
 		in6_ifa_put(ifp);
 		return reason;
 	}
@@ -1152,7 +1142,7 @@ static enum skb_drop_reason ndisc_recv_rs(struct sk_buff *skb)
 
 	idev = __in6_dev_get(skb->dev);
 	if (!idev) {
-		ND_PRINTK(1, err, "RS: can't find in6 device\n");
+		net_err_ratelimited("RS: can't find in6 device\n");
 		return reason;
 	}
 
@@ -1260,11 +1250,9 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb)
 	optlen = (skb_tail_pointer(skb) - skb_transport_header(skb)) -
 		sizeof(struct ra_msg);
 
-	ND_PRINTK(2, info,
-		  "RA: %s, dev: %s\n",
-		  __func__, skb->dev->name);
+	net_dbg_ratelimited("RA: %s, dev: %s\n", __func__, skb->dev->name);
 	if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) {
-		ND_PRINTK(2, warn, "RA: source address is not link-local\n");
+		net_dbg_ratelimited("RA: source address is not link-local\n");
 		return reason;
 	}
 	if (optlen < 0)
@@ -1272,15 +1260,14 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb)
 
 #ifdef CONFIG_IPV6_NDISC_NODETYPE
 	if (skb->ndisc_nodetype == NDISC_NODETYPE_HOST) {
-		ND_PRINTK(2, warn, "RA: from host or unauthorized router\n");
+		net_dbg_ratelimited("RA: from host or unauthorized router\n");
 		return reason;
 	}
 #endif
 
 	in6_dev = __in6_dev_get(skb->dev);
 	if (!in6_dev) {
-		ND_PRINTK(0, err, "RA: can't find inet6 device for %s\n",
-			  skb->dev->name);
+		net_err_ratelimited("RA: can't find inet6 device for %s\n", skb->dev->name);
 		return reason;
 	}
 
@@ -1288,18 +1275,16 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb)
 		return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS;
 
 	if (!ipv6_accept_ra(in6_dev)) {
-		ND_PRINTK(2, info,
-			  "RA: %s, did not accept ra for dev: %s\n",
-			  __func__, skb->dev->name);
+		net_dbg_ratelimited("RA: %s, did not accept ra for dev: %s\n", __func__,
+				    skb->dev->name);
 		goto skip_linkparms;
 	}
 
 #ifdef CONFIG_IPV6_NDISC_NODETYPE
 	/* skip link-specific parameters from interior routers */
 	if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) {
-		ND_PRINTK(2, info,
-			  "RA: %s, nodetype is NODEFAULT, dev: %s\n",
-			  __func__, skb->dev->name);
+		net_dbg_ratelimited("RA: %s, nodetype is NODEFAULT, dev: %s\n", __func__,
+				    skb->dev->name);
 		goto skip_linkparms;
 	}
 #endif
@@ -1328,18 +1313,16 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb)
 		send_ifinfo_notify = true;
 
 	if (!READ_ONCE(in6_dev->cnf.accept_ra_defrtr)) {
-		ND_PRINTK(2, info,
-			  "RA: %s, defrtr is false for dev: %s\n",
-			  __func__, skb->dev->name);
+		net_dbg_ratelimited("RA: %s, defrtr is false for dev: %s\n", __func__,
+				    skb->dev->name);
 		goto skip_defrtr;
 	}
 
 	lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime);
 	if (lifetime != 0 &&
 	    lifetime < READ_ONCE(in6_dev->cnf.accept_ra_min_lft)) {
-		ND_PRINTK(2, info,
-			  "RA: router lifetime (%ds) is too short: %s\n",
-			  lifetime, skb->dev->name);
+		net_dbg_ratelimited("RA: router lifetime (%ds) is too short: %s\n", lifetime,
+				    skb->dev->name);
 		goto skip_defrtr;
 	}
 
@@ -1349,9 +1332,8 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb)
 	net = dev_net(in6_dev->dev);
 	if (!READ_ONCE(in6_dev->cnf.accept_ra_from_local) &&
 	    ipv6_chk_addr(net, &ipv6_hdr(skb)->saddr, in6_dev->dev, 0)) {
-		ND_PRINTK(2, info,
-			  "RA from local address detected on dev: %s: default router ignored\n",
-			  skb->dev->name);
+		net_dbg_ratelimited("RA from local address detected on dev: %s: default router ignored\n",
+				    skb->dev->name);
 		goto skip_defrtr;
 	}
 
@@ -1369,9 +1351,8 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb)
 					 rt->fib6_nh->fib_nh_dev, NULL,
 					  &ipv6_hdr(skb)->saddr);
 		if (!neigh) {
-			ND_PRINTK(0, err,
-				  "RA: %s got default router without neighbour\n",
-				  __func__);
+			net_err_ratelimited("RA: %s got default router without neighbour\n",
+					    __func__);
 			fib6_info_release(rt);
 			return reason;
 		}
@@ -1384,10 +1365,10 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb)
 		rt = NULL;
 	}
 
-	ND_PRINTK(3, info, "RA: rt: %p  lifetime: %d, metric: %d, for dev: %s\n",
-		  rt, lifetime, defrtr_usr_metric, skb->dev->name);
+	net_dbg_ratelimited("RA: rt: %p  lifetime: %d, metric: %d, for dev: %s\n", rt, lifetime,
+			    defrtr_usr_metric, skb->dev->name);
 	if (!rt && lifetime) {
-		ND_PRINTK(3, info, "RA: adding default router\n");
+		net_dbg_ratelimited("RA: adding default router\n");
 
 		if (neigh)
 			neigh_release(neigh);
@@ -1396,9 +1377,7 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb)
 					 skb->dev, pref, defrtr_usr_metric,
 					 lifetime);
 		if (!rt) {
-			ND_PRINTK(0, err,
-				  "RA: %s failed to add default route\n",
-				  __func__);
+			net_err_ratelimited("RA: %s failed to add default route\n", __func__);
 			return reason;
 		}
 
@@ -1406,9 +1385,8 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb)
 					 rt->fib6_nh->fib_nh_dev, NULL,
 					  &ipv6_hdr(skb)->saddr);
 		if (!neigh) {
-			ND_PRINTK(0, err,
-				  "RA: %s got default router without neighbour\n",
-				  __func__);
+			net_err_ratelimited("RA: %s got default router without neighbour\n",
+					    __func__);
 			fib6_info_release(rt);
 			return reason;
 		}
@@ -1439,7 +1417,7 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb)
 			fib6_metric_set(rt, RTAX_HOPLIMIT,
 					ra_msg->icmph.icmp6_hop_limit);
 		} else {
-			ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than minimum\n");
+			net_dbg_ratelimited("RA: Got route advertisement with lower hop_limit than minimum\n");
 		}
 	}
 
@@ -1495,8 +1473,7 @@ skip_linkparms:
 			lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr,
 						     skb->dev);
 			if (!lladdr) {
-				ND_PRINTK(2, warn,
-					  "RA: invalid link-layer address length\n");
+				net_dbg_ratelimited("RA: invalid link-layer address length\n");
 				goto out;
 			}
 		}
@@ -1510,9 +1487,8 @@ skip_linkparms:
 	}
 
 	if (!ipv6_accept_ra(in6_dev)) {
-		ND_PRINTK(2, info,
-			  "RA: %s, accept_ra is false for dev: %s\n",
-			  __func__, skb->dev->name);
+		net_dbg_ratelimited("RA: %s, accept_ra is false for dev: %s\n", __func__,
+				    skb->dev->name);
 		goto out;
 	}
 
@@ -1520,9 +1496,8 @@ skip_linkparms:
 	if (!READ_ONCE(in6_dev->cnf.accept_ra_from_local) &&
 	    ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr,
 			  in6_dev->dev, 0)) {
-		ND_PRINTK(2, info,
-			  "RA from local address detected on dev: %s: router info ignored.\n",
-			  skb->dev->name);
+		net_dbg_ratelimited("RA from local address detected on dev: %s: router info ignored.\n",
+				    skb->dev->name);
 		goto skip_routeinfo;
 	}
 
@@ -1558,9 +1533,8 @@ skip_routeinfo:
 #ifdef CONFIG_IPV6_NDISC_NODETYPE
 	/* skip link-specific ndopts from interior routers */
 	if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) {
-		ND_PRINTK(2, info,
-			  "RA: %s, nodetype is NODEFAULT (interior routes), dev: %s\n",
-			  __func__, skb->dev->name);
+		net_dbg_ratelimited("RA: %s, nodetype is NODEFAULT (interior routes), dev: %s\n",
+				    __func__, skb->dev->name);
 		goto out;
 	}
 #endif
@@ -1589,7 +1563,7 @@ skip_routeinfo:
 		}
 
 		if (mtu < IPV6_MIN_MTU || mtu > skb->dev->mtu) {
-			ND_PRINTK(2, warn, "RA: invalid mtu: %d\n", mtu);
+			net_dbg_ratelimited("RA: invalid mtu: %d\n", mtu);
 		} else if (READ_ONCE(in6_dev->cnf.mtu6) != mtu) {
 			WRITE_ONCE(in6_dev->cnf.mtu6, mtu);
 			fib6_metric_set(rt, RTAX_MTU, mtu);
@@ -1608,7 +1582,7 @@ skip_routeinfo:
 	}
 
 	if (ndopts.nd_opts_tgt_lladdr || ndopts.nd_opts_rh) {
-		ND_PRINTK(2, warn, "RA: invalid RA options\n");
+		net_dbg_ratelimited("RA: invalid RA options\n");
 	}
 out:
 	/* Send a notify if RA changed managed/otherconf flags or
@@ -1636,15 +1610,13 @@ static enum skb_drop_reason ndisc_redirect_rcv(struct sk_buff *skb)
 	switch (skb->ndisc_nodetype) {
 	case NDISC_NODETYPE_HOST:
 	case NDISC_NODETYPE_NODEFAULT:
-		ND_PRINTK(2, warn,
-			  "Redirect: from host or unauthorized router\n");
+		net_dbg_ratelimited("Redirect: from host or unauthorized router\n");
 		return reason;
 	}
 #endif
 
 	if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) {
-		ND_PRINTK(2, warn,
-			  "Redirect: source address is not link-local\n");
+		net_dbg_ratelimited("Redirect: source address is not link-local\n");
 		return reason;
 	}
 
@@ -1705,15 +1677,13 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
 	}
 
 	if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) {
-		ND_PRINTK(2, warn, "Redirect: no link-local address on %s\n",
-			  dev->name);
+		net_dbg_ratelimited("Redirect: no link-local address on %s\n", dev->name);
 		return;
 	}
 
 	if (!ipv6_addr_equal(&ipv6_hdr(skb)->daddr, target) &&
 	    ipv6_addr_type(target) != (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
-		ND_PRINTK(2, warn,
-			  "Redirect: target address is not link-local unicast\n");
+		net_dbg_ratelimited("Redirect: target address is not link-local unicast\n");
 		return;
 	}
 
@@ -1732,8 +1702,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
 	rt = dst_rt6_info(dst);
 
 	if (rt->rt6i_flags & RTF_GATEWAY) {
-		ND_PRINTK(2, warn,
-			  "Redirect: destination is not a neighbour\n");
+		net_dbg_ratelimited("Redirect: destination is not a neighbour\n");
 		goto release;
 	}
 
@@ -1746,8 +1715,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
 	if (dev->addr_len) {
 		struct neighbour *neigh = dst_neigh_lookup(skb_dst(skb), target);
 		if (!neigh) {
-			ND_PRINTK(2, warn,
-				  "Redirect: no neigh for target address\n");
+			net_dbg_ratelimited("Redirect: no neigh for target address\n");
 			goto release;
 		}
 
@@ -1848,14 +1816,12 @@ enum skb_drop_reason ndisc_rcv(struct sk_buff *skb)
 	__skb_push(skb, skb->data - skb_transport_header(skb));
 
 	if (ipv6_hdr(skb)->hop_limit != 255) {
-		ND_PRINTK(2, warn, "NDISC: invalid hop-limit: %d\n",
-			  ipv6_hdr(skb)->hop_limit);
+		net_dbg_ratelimited("NDISC: invalid hop-limit: %d\n", ipv6_hdr(skb)->hop_limit);
 		return SKB_DROP_REASON_IPV6_NDISC_HOP_LIMIT;
 	}
 
 	if (msg->icmph.icmp6_code != 0) {
-		ND_PRINTK(2, warn, "NDISC: invalid ICMPv6 code: %d\n",
-			  msg->icmph.icmp6_code);
+		net_dbg_ratelimited("NDISC: invalid ICMPv6 code: %d\n", msg->icmph.icmp6_code);
 		return SKB_DROP_REASON_IPV6_NDISC_BAD_CODE;
 	}
 
@@ -2006,9 +1972,8 @@ static int __net_init ndisc_net_init(struct net *net)
 	err = inet_ctl_sock_create(&sk, PF_INET6,
 				   SOCK_RAW, IPPROTO_ICMPV6, net);
 	if (err < 0) {
-		ND_PRINTK(0, err,
-			  "NDISC: Failed to initialize the control socket (err %d)\n",
-			  err);
+		net_err_ratelimited("NDISC: Failed to initialize the control socket (err %d)\n",
+				    err);
 		return err;
 	}
 
-- 
cgit v1.2.3


From 18cdb3d982da8976b28d57691eb256ec5688fad2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 7 Jul 2025 12:45:17 +0000
Subject: netfilter: flowtable: account for Ethernet header in
 nf_flow_pppoe_proto()

syzbot found a potential access to uninit-value in nf_flow_pppoe_proto()

Blamed commit forgot the Ethernet header.

BUG: KMSAN: uninit-value in nf_flow_offload_inet_hook+0x7e4/0x940 net/netfilter/nf_flow_table_inet.c:27
  nf_flow_offload_inet_hook+0x7e4/0x940 net/netfilter/nf_flow_table_inet.c:27
  nf_hook_entry_hookfn include/linux/netfilter.h:157 [inline]
  nf_hook_slow+0xe1/0x3d0 net/netfilter/core.c:623
  nf_hook_ingress include/linux/netfilter_netdev.h:34 [inline]
  nf_ingress net/core/dev.c:5742 [inline]
  __netif_receive_skb_core+0x4aff/0x70c0 net/core/dev.c:5837
  __netif_receive_skb_one_core net/core/dev.c:5975 [inline]
  __netif_receive_skb+0xcc/0xac0 net/core/dev.c:6090
  netif_receive_skb_internal net/core/dev.c:6176 [inline]
  netif_receive_skb+0x57/0x630 net/core/dev.c:6235
  tun_rx_batched+0x1df/0x980 drivers/net/tun.c:1485
  tun_get_user+0x4ee0/0x6b40 drivers/net/tun.c:1938
  tun_chr_write_iter+0x3e9/0x5c0 drivers/net/tun.c:1984
  new_sync_write fs/read_write.c:593 [inline]
  vfs_write+0xb4b/0x1580 fs/read_write.c:686
  ksys_write fs/read_write.c:738 [inline]
  __do_sys_write fs/read_write.c:749 [inline]

Reported-by: syzbot+bf6ed459397e307c3ad2@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/netdev/686bc073.a00a0220.c7b3.0086.GAE@google.com/T/#u
Fixes: 87b3593bed18 ("netfilter: flowtable: validate pppoe header")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Pablo Neira Ayuso <pablo@netfilter.org>
Link: https://patch.msgid.link/20250707124517.614489-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/netfilter/nf_flow_table.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index d711642e78b5..c003cd194fa2 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -370,7 +370,7 @@ static inline __be16 __nf_flow_pppoe_proto(const struct sk_buff *skb)
 
 static inline bool nf_flow_pppoe_proto(struct sk_buff *skb, __be16 *inner_proto)
 {
-	if (!pskb_may_pull(skb, PPPOE_SES_HLEN))
+	if (!pskb_may_pull(skb, ETH_HLEN + PPPOE_SES_HLEN))
 		return false;
 
 	*inner_proto = __nf_flow_pppoe_proto(skb);
-- 
cgit v1.2.3


From d7974697de4d6fa1a1ed9ca43616a8500046f25a Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 8 Jul 2025 15:06:38 -0700
Subject: ethtool: mark ETHER_FLOW as usable for Rx hash

Looks like some drivers (ena, enetc, fbnic.. there's probably more)
consider ETHER_FLOW to be a legitimate target for flow hashing.
I'm not sure how intentional that is from the uAPI perspective
vs just an effect of ethtool IOCTL doing minimal input validation.
But Netlink will do strict validation, so we need to decide whether
we allow this use case or not. I don't see a strong reason against
it, and rejecting it would potentially regress a number of drivers.
So update the comments and flow_type_hashable().

Link: https://patch.msgid.link/20250708220640.2738464-4-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/ethtool.h | 4 ++--
 net/ethtool/ioctl.c          | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 707c1844010c..9e9afdd1238a 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -2314,7 +2314,7 @@ enum {
 	IPV6_USER_FLOW	= 0x0e, /* spec only (usr_ip6_spec; nfc only) */
 	IPV4_FLOW	= 0x10, /* hash only */
 	IPV6_FLOW	= 0x11, /* hash only */
-	ETHER_FLOW	= 0x12, /* spec only (ether_spec) */
+	ETHER_FLOW	= 0x12, /* hash or spec (ether_spec) */
 
 	/* Used for GTP-U IPv4 and IPv6.
 	 * The format of GTP packets only includes
@@ -2371,7 +2371,7 @@ enum {
 /* Flag to enable RSS spreading of traffic matching rule (nfc only) */
 #define	FLOW_RSS	0x20000000
 
-/* L3-L4 network traffic flow hash options */
+/* L2-L4 network traffic flow hash options */
 #define	RXH_L2DA	(1 << 1)
 #define	RXH_VLAN	(1 << 2)
 #define	RXH_L3_PROTO	(1 << 3)
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index 139f95620cdd..67f6d900a4ee 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -981,6 +981,7 @@ static int ethtool_rxnfc_copy_to_user(void __user *useraddr,
 static bool flow_type_hashable(u32 flow_type)
 {
 	switch (flow_type) {
+	case ETHER_FLOW:
 	case TCP_V4_FLOW:
 	case UDP_V4_FLOW:
 	case SCTP_V4_FLOW:
-- 
cgit v1.2.3


From 178331743ca860561f60d04a7797a2fce13f0784 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 8 Jul 2025 15:06:39 -0700
Subject: ethtool: rss: report which fields are configured for hashing

Implement ETHTOOL_GRXFH over Netlink. The number of flow types is
reasonable (around 20) so report all of them at once for simplicity.

Do not maintain the flow ID mapping with ioctl at the uAPI level.
This gives us a chance to clean up the confusion that comes from
RxNFC vs RxFH (flow direction vs hashing) in the ioctl.
Try to align with the names used in ethtool CLI, they seem to have
stood the test of time just fine. One annoyance is that we still
call L4 ports the weird names, but I guess they also apply to IPSec
(where they cover the SPI) so it is what it is.

 $ ynl --family ethtool --dump rss-get
 {
    "header": {
	"dev-index": 1,
	"dev-name": "enp1s0"
    },
    "hfunc": 1,
    "hkey": b"...",
    "indir": [0, 1, ...],
    "flow-hash": {
        "ether": {"l2da"},
	"ah-esp4": {"ip-src", "ip-dst"},
        "ah-esp6": {"ip-src", "ip-dst"},
        "ah4": {"ip-src", "ip-dst"},
        "ah6": {"ip-src", "ip-dst"},
        "esp4": {"ip-src", "ip-dst"},
        "esp6": {"ip-src", "ip-dst"},
        "ip4": {"ip-src", "ip-dst"},
        "ip6": {"ip-src", "ip-dst"},
        "sctp4": {"ip-src", "ip-dst"},
        "sctp6": {"ip-src", "ip-dst"},
        "udp4": {"ip-src", "ip-dst"},
        "udp6": {"ip-src", "ip-dst"}
        "tcp4": {"l4-b-0-1", "l4-b-2-3", "ip-src", "ip-dst"},
        "tcp6": {"l4-b-0-1", "l4-b-2-3", "ip-src", "ip-dst"},
    },
 }

Link: https://patch.msgid.link/20250708220640.2738464-5-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/ethtool.yaml       | 151 +++++++++++++++++++++++++
 Documentation/networking/ethtool-netlink.rst   |   9 +-
 include/uapi/linux/ethtool_netlink_generated.h |  34 ++++++
 net/ethtool/ioctl.c                            |   6 +-
 net/ethtool/rss.c                              | 105 +++++++++++++++--
 5 files changed, 291 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml
index 49e782a33eb6..c38c03c624f0 100644
--- a/Documentation/netlink/specs/ethtool.yaml
+++ b/Documentation/netlink/specs/ethtool.yaml
@@ -158,6 +158,35 @@ definitions:
       -
         name: pse-event-sw-pw-control-error
         doc: PSE faced an error managing the power control from software
+  -
+    name: rxfh-fields
+    name-prefix: rxh-
+    enum-name:
+    header: linux/ethtool.h
+    type: flags
+    entries:
+      -
+        name: l2da
+        value: 1
+      -
+        name: vlan
+      -
+        name: l3-proto
+      -
+        name: ip-src
+      -
+        name: ip-dst
+      -
+        name: l4-b-0-1
+        doc: src port in case of TCP/UDP/SCTP
+      -
+        name: l4-b-2-3
+        doc: dst port in case of TCP/UDP/SCTP
+      -
+        name: gtp-teid
+      -
+        name: discard
+        value: 31
 
 attribute-sets:
   -
@@ -1447,6 +1476,123 @@ attribute-sets:
         name: pse-prio
         type: u32
         name-prefix: ethtool-a-
+  -
+    name: flow
+    attr-cnt-name: --ethtool-a-flow-cnt
+    doc: |
+      Flow types, corresponding to those defined in the old
+      ethtool header for RXFH and RXNFC as ${PROTO}_FLOW.
+      The values are not matching the old ones to avoid carrying
+      into Netlink the IP_USER_FLOW vs IPV4_FLOW vs IPV4_USER_FLOW confusion.
+    attributes:
+      -
+        name: ether
+        type: uint
+        enum: rxfh-fields
+      -
+        name: ip4
+        type: uint
+        enum: rxfh-fields
+      -
+        name: ip6
+        type: uint
+        enum: rxfh-fields
+      -
+        name: tcp4
+        type: uint
+        enum: rxfh-fields
+      -
+        name: tcp6
+        type: uint
+        enum: rxfh-fields
+      -
+        name: udp4
+        type: uint
+        enum: rxfh-fields
+      -
+        name: udp6
+        type: uint
+        enum: rxfh-fields
+      -
+        name: sctp4
+        type: uint
+        enum: rxfh-fields
+      -
+        name: sctp6
+        type: uint
+        enum: rxfh-fields
+      -
+        name: ah4
+        type: uint
+        enum: rxfh-fields
+      -
+        name: ah6
+        type: uint
+        enum: rxfh-fields
+      -
+        name: esp4
+        type: uint
+        enum: rxfh-fields
+      -
+        name: esp6
+        type: uint
+        enum: rxfh-fields
+      -
+        name: ah-esp4
+        type: uint
+        enum: rxfh-fields
+      -
+        name: ah-esp6
+        type: uint
+        enum: rxfh-fields
+      -
+        name: gtpu4
+        type: uint
+        enum: rxfh-fields
+      -
+        name: gtpu6
+        type: uint
+        enum: rxfh-fields
+      -
+        name: gtpc4
+        type: uint
+        enum: rxfh-fields
+      -
+        name: gtpc6
+        type: uint
+        enum: rxfh-fields
+      -
+        name: gtpc-teid4
+        type: uint
+        enum: rxfh-fields
+      -
+        name: gtpc-teid6
+        type: uint
+        enum: rxfh-fields
+      -
+        name: gtpu-eh4
+        type: uint
+        enum: rxfh-fields
+      -
+        name: gtpu-eh6
+        type: uint
+        enum: rxfh-fields
+      -
+        name: gtpu-ul4
+        type: uint
+        enum: rxfh-fields
+      -
+        name: gtpu-ul6
+        type: uint
+        enum: rxfh-fields
+      -
+        name: gtpu-dl4
+        type: uint
+        enum: rxfh-fields
+      -
+        name: gtpu-dl6
+        type: uint
+        enum: rxfh-fields
   -
     name: rss
     attr-cnt-name: __ethtool-a-rss-cnt
@@ -1478,6 +1624,10 @@ attribute-sets:
       -
         name: start-context
         type: u32
+      -
+        name: flow-hash
+        type: nest
+        nested-attributes: flow
   -
     name: plca
     attr-cnt-name: __ethtool-a-plca-cnt
@@ -2307,6 +2457,7 @@ operations:
             - indir
             - hkey
             - input-xfrm
+            - flow-hash
       dump:
         request:
           attributes:
diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst
index 07e9808ebd2c..248bc3d93da9 100644
--- a/Documentation/networking/ethtool-netlink.rst
+++ b/Documentation/networking/ethtool-netlink.rst
@@ -1969,14 +1969,15 @@ used to ignore context 0s and only dump additional contexts).
 
 Kernel response contents:
 
-=====================================  ======  ==========================
+=====================================  ======  ===============================
   ``ETHTOOL_A_RSS_HEADER``             nested  reply header
   ``ETHTOOL_A_RSS_CONTEXT``            u32     context number
   ``ETHTOOL_A_RSS_HFUNC``              u32     RSS hash func
   ``ETHTOOL_A_RSS_INDIR``              binary  Indir table bytes
   ``ETHTOOL_A_RSS_HKEY``               binary  Hash key bytes
   ``ETHTOOL_A_RSS_INPUT_XFRM``         u32     RSS input data transformation
-=====================================  ======  ==========================
+  ``ETHTOOL_A_RSS_FLOW_HASH``          nested  Header fields included in hash
+=====================================  ======  ===============================
 
 ETHTOOL_A_RSS_HFUNC attribute is bitmap indicating the hash function
 being used. Current supported options are toeplitz, xor or crc32.
@@ -1985,6 +1986,8 @@ indicates queue number.
 ETHTOOL_A_RSS_INPUT_XFRM attribute is a bitmap indicating the type of
 transformation applied to the input protocol fields before given to the RSS
 hfunc. Current supported options are symmetric-xor and symmetric-or-xor.
+ETHTOOL_A_RSS_FLOW_HASH carries a per-flow-type bitmask of which header
+fields are included in the hash calculation.
 
 PLCA_GET_CFG
 ============
@@ -2436,7 +2439,7 @@ are netlink only.
   ``ETHTOOL_SFLAGS``                  ``ETHTOOL_MSG_FEATURES_SET``
   ``ETHTOOL_GPFLAGS``                 ``ETHTOOL_MSG_PRIVFLAGS_GET``
   ``ETHTOOL_SPFLAGS``                 ``ETHTOOL_MSG_PRIVFLAGS_SET``
-  ``ETHTOOL_GRXFH``                   n/a
+  ``ETHTOOL_GRXFH``                   ``ETHTOOL_MSG_RSS_GET``
   ``ETHTOOL_SRXFH``                   n/a
   ``ETHTOOL_GGRO``                    ``ETHTOOL_MSG_FEATURES_GET``
   ``ETHTOOL_SGRO``                    ``ETHTOOL_MSG_FEATURES_SET``
diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h
index 8f30ffa1cd14..96027e26ffba 100644
--- a/include/uapi/linux/ethtool_netlink_generated.h
+++ b/include/uapi/linux/ethtool_netlink_generated.h
@@ -678,6 +678,39 @@ enum {
 	ETHTOOL_A_PSE_MAX = (__ETHTOOL_A_PSE_CNT - 1)
 };
 
+enum {
+	ETHTOOL_A_FLOW_ETHER = 1,
+	ETHTOOL_A_FLOW_IP4,
+	ETHTOOL_A_FLOW_IP6,
+	ETHTOOL_A_FLOW_TCP4,
+	ETHTOOL_A_FLOW_TCP6,
+	ETHTOOL_A_FLOW_UDP4,
+	ETHTOOL_A_FLOW_UDP6,
+	ETHTOOL_A_FLOW_SCTP4,
+	ETHTOOL_A_FLOW_SCTP6,
+	ETHTOOL_A_FLOW_AH4,
+	ETHTOOL_A_FLOW_AH6,
+	ETHTOOL_A_FLOW_ESP4,
+	ETHTOOL_A_FLOW_ESP6,
+	ETHTOOL_A_FLOW_AH_ESP4,
+	ETHTOOL_A_FLOW_AH_ESP6,
+	ETHTOOL_A_FLOW_GTPU4,
+	ETHTOOL_A_FLOW_GTPU6,
+	ETHTOOL_A_FLOW_GTPC4,
+	ETHTOOL_A_FLOW_GTPC6,
+	ETHTOOL_A_FLOW_GTPC_TEID4,
+	ETHTOOL_A_FLOW_GTPC_TEID6,
+	ETHTOOL_A_FLOW_GTPU_EH4,
+	ETHTOOL_A_FLOW_GTPU_EH6,
+	ETHTOOL_A_FLOW_GTPU_UL4,
+	ETHTOOL_A_FLOW_GTPU_UL6,
+	ETHTOOL_A_FLOW_GTPU_DL4,
+	ETHTOOL_A_FLOW_GTPU_DL6,
+
+	__ETHTOOL_A_FLOW_CNT,
+	ETHTOOL_A_FLOW_MAX = (__ETHTOOL_A_FLOW_CNT - 1)
+};
+
 enum {
 	ETHTOOL_A_RSS_UNSPEC,
 	ETHTOOL_A_RSS_HEADER,
@@ -687,6 +720,7 @@ enum {
 	ETHTOOL_A_RSS_HKEY,
 	ETHTOOL_A_RSS_INPUT_XFRM,
 	ETHTOOL_A_RSS_START_CONTEXT,
+	ETHTOOL_A_RSS_FLOW_HASH,
 
 	__ETHTOOL_A_RSS_CNT,
 	ETHTOOL_A_RSS_MAX = (__ETHTOOL_A_RSS_CNT - 1)
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index 67f6d900a4ee..cccb4694f5e1 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -1101,7 +1101,11 @@ ethtool_set_rxfh_fields(struct net_device *dev, u32 cmd, void __user *useraddr)
 	rc = ops->set_rxfh_fields(dev, &fields, NULL);
 exit_unlock:
 	mutex_unlock(&dev->ethtool->rss_lock);
-	return rc;
+	if (rc)
+		return rc;
+
+	ethtool_rss_notify(dev, fields.rss_context);
+	return 0;
 }
 
 static noinline_for_stack int
diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c
index 37a7b20fcd07..41ab9fc67652 100644
--- a/net/ethtool/rss.c
+++ b/net/ethtool/rss.c
@@ -12,6 +12,7 @@ struct rss_req_info {
 
 struct rss_reply_data {
 	struct ethnl_reply_data		base;
+	bool				has_flow_hash;
 	bool				no_key_fields;
 	u32				indir_size;
 	u32				hkey_size;
@@ -19,6 +20,37 @@ struct rss_reply_data {
 	u32				input_xfrm;
 	u32				*indir_table;
 	u8				*hkey;
+	int				flow_hash[__ETHTOOL_A_FLOW_CNT];
+};
+
+static const u8 ethtool_rxfh_ft_nl2ioctl[] = {
+	[ETHTOOL_A_FLOW_ETHER]		= ETHER_FLOW,
+	[ETHTOOL_A_FLOW_IP4]		= IPV4_FLOW,
+	[ETHTOOL_A_FLOW_IP6]		= IPV6_FLOW,
+	[ETHTOOL_A_FLOW_TCP4]		= TCP_V4_FLOW,
+	[ETHTOOL_A_FLOW_UDP4]		= UDP_V4_FLOW,
+	[ETHTOOL_A_FLOW_SCTP4]		= SCTP_V4_FLOW,
+	[ETHTOOL_A_FLOW_AH_ESP4]	= AH_ESP_V4_FLOW,
+	[ETHTOOL_A_FLOW_TCP6]		= TCP_V6_FLOW,
+	[ETHTOOL_A_FLOW_UDP6]		= UDP_V6_FLOW,
+	[ETHTOOL_A_FLOW_SCTP6]		= SCTP_V6_FLOW,
+	[ETHTOOL_A_FLOW_AH_ESP6]	= AH_ESP_V6_FLOW,
+	[ETHTOOL_A_FLOW_AH4]		= AH_V4_FLOW,
+	[ETHTOOL_A_FLOW_ESP4]		= ESP_V4_FLOW,
+	[ETHTOOL_A_FLOW_AH6]		= AH_V6_FLOW,
+	[ETHTOOL_A_FLOW_ESP6]		= ESP_V6_FLOW,
+	[ETHTOOL_A_FLOW_GTPU4]		= GTPU_V4_FLOW,
+	[ETHTOOL_A_FLOW_GTPU6]		= GTPU_V6_FLOW,
+	[ETHTOOL_A_FLOW_GTPC4]		= GTPC_V4_FLOW,
+	[ETHTOOL_A_FLOW_GTPC6]		= GTPC_V6_FLOW,
+	[ETHTOOL_A_FLOW_GTPC_TEID4]	= GTPC_TEID_V4_FLOW,
+	[ETHTOOL_A_FLOW_GTPC_TEID6]	= GTPC_TEID_V6_FLOW,
+	[ETHTOOL_A_FLOW_GTPU_EH4]	= GTPU_EH_V4_FLOW,
+	[ETHTOOL_A_FLOW_GTPU_EH6]	= GTPU_EH_V6_FLOW,
+	[ETHTOOL_A_FLOW_GTPU_UL4]	= GTPU_UL_V4_FLOW,
+	[ETHTOOL_A_FLOW_GTPU_UL6]	= GTPU_UL_V6_FLOW,
+	[ETHTOOL_A_FLOW_GTPU_DL4]	= GTPU_DL_V4_FLOW,
+	[ETHTOOL_A_FLOW_GTPU_DL6]	= GTPU_DL_V6_FLOW,
 };
 
 #define RSS_REQINFO(__req_base) \
@@ -49,6 +81,37 @@ rss_parse_request(struct ethnl_req_info *req_info, struct nlattr **tb,
 	return 0;
 }
 
+static void
+rss_prepare_flow_hash(const struct rss_req_info *req, struct net_device *dev,
+		      struct rss_reply_data *data, const struct genl_info *info)
+{
+	int i;
+
+	data->has_flow_hash = false;
+
+	if (!dev->ethtool_ops->get_rxfh_fields)
+		return;
+	if (req->rss_context && !dev->ethtool_ops->rxfh_per_ctx_fields)
+		return;
+
+	mutex_lock(&dev->ethtool->rss_lock);
+	for (i = 1; i < __ETHTOOL_A_FLOW_CNT; i++) {
+		struct ethtool_rxfh_fields fields = {
+			.flow_type	= ethtool_rxfh_ft_nl2ioctl[i],
+			.rss_context	= req->rss_context,
+		};
+
+		if (dev->ethtool_ops->get_rxfh_fields(dev, &fields)) {
+			data->flow_hash[i] = -1; /* Unsupported */
+			continue;
+		}
+
+		data->flow_hash[i] = fields.data;
+		data->has_flow_hash = true;
+	}
+	mutex_unlock(&dev->ethtool->rss_lock);
+}
+
 static int
 rss_prepare_get(const struct rss_req_info *request, struct net_device *dev,
 		struct rss_reply_data *data, const struct genl_info *info)
@@ -153,6 +216,8 @@ static int
 rss_prepare(const struct rss_req_info *request, struct net_device *dev,
 	    struct rss_reply_data *data, const struct genl_info *info)
 {
+	rss_prepare_flow_hash(request, dev, data, info);
+
 	if (request->rss_context)
 		return rss_prepare_ctx(request, dev, data, info);
 	return rss_prepare_get(request, dev, data, info);
@@ -190,7 +255,10 @@ rss_reply_size(const struct ethnl_req_info *req_base,
 	      nla_total_size(sizeof(u32)) +	/* _RSS_HFUNC */
 	      nla_total_size(sizeof(u32)) +	/* _RSS_INPUT_XFRM */
 	      nla_total_size(sizeof(u32) * data->indir_size) + /* _RSS_INDIR */
-	      nla_total_size(data->hkey_size);	/* _RSS_HKEY */
+	      nla_total_size(data->hkey_size) + /* _RSS_HKEY */
+	      nla_total_size(0) +		/* _RSS_FLOW_HASH */
+		nla_total_size(sizeof(u32)) * ETHTOOL_A_FLOW_MAX +
+	      0;
 
 	return len;
 }
@@ -211,17 +279,34 @@ rss_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base,
 		     sizeof(u32) * data->indir_size, data->indir_table)))
 		return -EMSGSIZE;
 
-	if (data->no_key_fields)
-		return 0;
-
-	if ((data->hfunc &&
-	     nla_put_u32(skb, ETHTOOL_A_RSS_HFUNC, data->hfunc)) ||
-	    (data->input_xfrm &&
-	     nla_put_u32(skb, ETHTOOL_A_RSS_INPUT_XFRM, data->input_xfrm)) ||
-	    (data->hkey_size &&
-	     nla_put(skb, ETHTOOL_A_RSS_HKEY, data->hkey_size, data->hkey)))
+	if (!data->no_key_fields &&
+	    ((data->hfunc &&
+	      nla_put_u32(skb, ETHTOOL_A_RSS_HFUNC, data->hfunc)) ||
+	     (data->input_xfrm &&
+	      nla_put_u32(skb, ETHTOOL_A_RSS_INPUT_XFRM, data->input_xfrm)) ||
+	     (data->hkey_size &&
+	      nla_put(skb, ETHTOOL_A_RSS_HKEY, data->hkey_size, data->hkey))))
 		return -EMSGSIZE;
 
+	if (data->has_flow_hash) {
+		struct nlattr *nest;
+		int i;
+
+		nest = nla_nest_start(skb, ETHTOOL_A_RSS_FLOW_HASH);
+		if (!nest)
+			return -EMSGSIZE;
+
+		for (i = 1; i < __ETHTOOL_A_FLOW_CNT; i++) {
+			if (data->flow_hash[i] >= 0 &&
+			    nla_put_uint(skb, i, data->flow_hash[i])) {
+				nla_nest_cancel(skb, nest);
+				return -EMSGSIZE;
+			}
+		}
+
+		nla_nest_end(skb, nest);
+	}
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 4dc5f7b2c0ccf233d16c5f3090208d70954f1e2a Mon Sep 17 00:00:00 2001
From: Jacky Chou <jacky_chou@aspeedtech.com>
Date: Wed, 9 Jul 2025 15:08:07 +0800
Subject: dt-bindings: clock: ast2600: Add reset definitions for MAC1 and MAC2

Add ASPEED_RESET_MAC1 and ASPEED_RESET_MAC2 reset definitions to
the ast2600-clock binding header. These are required for proper
reset control of the MAC1 and MAC2 ethernet controllers on the
AST2600 SoC.

Signed-off-by: Jacky Chou <jacky_chou@aspeedtech.com>
Acked-by: Conor Dooley <conor.dooley@microchip.com>
Acked-by: Stephen Boyd <sboyd@kernel.org>
Link: https://patch.msgid.link/20250709070809.2560688-3-jacky_chou@aspeedtech.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/dt-bindings/clock/ast2600-clock.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/ast2600-clock.h b/include/dt-bindings/clock/ast2600-clock.h
index 7ae96c7bd72f..f60fff261130 100644
--- a/include/dt-bindings/clock/ast2600-clock.h
+++ b/include/dt-bindings/clock/ast2600-clock.h
@@ -122,6 +122,8 @@
 #define ASPEED_RESET_PCIE_DEV_OEN	20
 #define ASPEED_RESET_PCIE_RC_O		19
 #define ASPEED_RESET_PCIE_RC_OEN	18
+#define ASPEED_RESET_MAC2		12
+#define ASPEED_RESET_MAC1		11
 #define ASPEED_RESET_PCI_DP		5
 #define ASPEED_RESET_HACE		4
 #define ASPEED_RESET_AHB		1
-- 
cgit v1.2.3


From ed677858d4fe8d165952c1794898d6fc0b65ddfe Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Wed, 9 Jul 2025 18:04:04 +0200
Subject: ALSA: hda: Move widget capability macros into hdaudio.h

The get_wcaps() and co are used not only by the HD-audio core but also by
other driver code, hence it'd be better to put them into the common header
instead of local.h.

OTOH, there are macros of the same name like get_wcaps() that are
still used in sound/pci/hda/* locally, and those conflict with each
other.  So we need to rename get_wcaps() (to be moved from hda-core)
with the proper snd_hdac prefix to avoid name conflicts, and
define them in the common hdaudio.h.

Signed-off-by: Takashi Iwai <tiwai@suse.de>
Link: https://patch.msgid.link/20250709160434.1859-2-tiwai@suse.de
---
 include/sound/hdaudio.h      | 24 ++++++++++++++++++++++++
 sound/hda/hdac_device.c      |  8 ++++----
 sound/hda/hdac_sysfs.c       | 14 +++++++-------
 sound/hda/local.h            | 21 ---------------------
 sound/soc/codecs/hdac_hdmi.c | 21 ++++++++++-----------
 5 files changed, 45 insertions(+), 43 deletions(-)

(limited to 'include')

diff --git a/include/sound/hdaudio.h b/include/sound/hdaudio.h
index 25668eee65cf..d38234f8fe44 100644
--- a/include/sound/hdaudio.h
+++ b/include/sound/hdaudio.h
@@ -680,6 +680,30 @@ static inline void snd_hdac_dsp_cleanup(struct hdac_stream *azx_dev,
 }
 #endif /* CONFIG_SND_HDA_DSP_LOADER */
 
+/*
+ * Easy macros for widget capabilities
+ */
+#define snd_hdac_get_wcaps(codec, nid) \
+	snd_hdac_read_parm(codec, nid, AC_PAR_AUDIO_WIDGET_CAP)
+
+/* get the widget type from widget capability bits */
+static inline int snd_hdac_get_wcaps_type(unsigned int wcaps)
+{
+	if (!wcaps)
+		return -1; /* invalid type */
+	return (wcaps & AC_WCAP_TYPE) >> AC_WCAP_TYPE_SHIFT;
+}
+
+/* get the number of supported channels */
+static inline unsigned int snd_hdac_get_wcaps_channels(u32 wcaps)
+{
+	unsigned int chans;
+
+	chans = (wcaps & AC_WCAP_CHAN_CNT_EXT) >> 13;
+	chans = (chans + 1) * 2;
+
+	return chans;
+}
 
 /*
  * generic array helpers
diff --git a/sound/hda/hdac_device.c b/sound/hda/hdac_device.c
index a02dce5f6a88..018f9e176b1b 100644
--- a/sound/hda/hdac_device.c
+++ b/sound/hda/hdac_device.c
@@ -436,11 +436,11 @@ EXPORT_SYMBOL_GPL(snd_hdac_refresh_widgets);
 /* return CONNLIST_LEN parameter of the given widget */
 static unsigned int get_num_conns(struct hdac_device *codec, hda_nid_t nid)
 {
-	unsigned int wcaps = get_wcaps(codec, nid);
+	unsigned int wcaps = snd_hdac_get_wcaps(codec, nid);
 	unsigned int parm;
 
 	if (!(wcaps & AC_WCAP_CONN_LIST) &&
-	    get_wcaps_type(wcaps) != AC_WID_VOL_KNB)
+	    snd_hdac_get_wcaps_type(wcaps) != AC_WID_VOL_KNB)
 		return 0;
 
 	parm = snd_hdac_read_parm(codec, nid, AC_PAR_CONNLIST_LEN);
@@ -854,7 +854,7 @@ static unsigned int query_pcm_param(struct hdac_device *codec, hda_nid_t nid)
 	unsigned int val = 0;
 
 	if (nid != codec->afg &&
-	    (get_wcaps(codec, nid) & AC_WCAP_FORMAT_OVRD))
+	    (snd_hdac_get_wcaps(codec, nid) & AC_WCAP_FORMAT_OVRD))
 		val = snd_hdac_read_parm(codec, nid, AC_PAR_PCM);
 	if (!val || val == -1)
 		val = snd_hdac_read_parm(codec, codec->afg, AC_PAR_PCM);
@@ -894,7 +894,7 @@ int snd_hdac_query_supported_pcm(struct hdac_device *codec, hda_nid_t nid,
 {
 	unsigned int i, val, wcaps;
 
-	wcaps = get_wcaps(codec, nid);
+	wcaps = snd_hdac_get_wcaps(codec, nid);
 	val = query_pcm_param(codec, nid);
 
 	if (ratesp) {
diff --git a/sound/hda/hdac_sysfs.c b/sound/hda/hdac_sysfs.c
index 60b0a70428d5..bffe52859dba 100644
--- a/sound/hda/hdac_sysfs.c
+++ b/sound/hda/hdac_sysfs.c
@@ -161,13 +161,13 @@ static const struct kobj_type widget_ktype = {
 static ssize_t caps_show(struct hdac_device *codec, hda_nid_t nid,
 			struct widget_attribute *attr, char *buf)
 {
-	return sysfs_emit(buf, "0x%08x\n", get_wcaps(codec, nid));
+	return sysfs_emit(buf, "0x%08x\n", snd_hdac_get_wcaps(codec, nid));
 }
 
 static ssize_t pin_caps_show(struct hdac_device *codec, hda_nid_t nid,
 			     struct widget_attribute *attr, char *buf)
 {
-	if (get_wcaps_type(get_wcaps(codec, nid)) != AC_WID_PIN)
+	if (snd_hdac_get_wcaps_type(snd_hdac_get_wcaps(codec, nid)) != AC_WID_PIN)
 		return 0;
 	return sysfs_emit(buf, "0x%08x\n",
 			  snd_hdac_read_parm(codec, nid, AC_PAR_PIN_CAP));
@@ -178,7 +178,7 @@ static ssize_t pin_cfg_show(struct hdac_device *codec, hda_nid_t nid,
 {
 	unsigned int val;
 
-	if (get_wcaps_type(get_wcaps(codec, nid)) != AC_WID_PIN)
+	if (snd_hdac_get_wcaps_type(snd_hdac_get_wcaps(codec, nid)) != AC_WID_PIN)
 		return 0;
 	if (snd_hdac_read(codec, nid, AC_VERB_GET_CONFIG_DEFAULT, 0, &val))
 		return 0;
@@ -189,7 +189,7 @@ static bool has_pcm_cap(struct hdac_device *codec, hda_nid_t nid)
 {
 	if (nid == codec->afg || nid == codec->mfg)
 		return true;
-	switch (get_wcaps_type(get_wcaps(codec, nid))) {
+	switch (snd_hdac_get_wcaps_type(snd_hdac_get_wcaps(codec, nid))) {
 	case AC_WID_AUD_OUT:
 	case AC_WID_AUD_IN:
 		return true;
@@ -219,7 +219,7 @@ static ssize_t pcm_formats_show(struct hdac_device *codec, hda_nid_t nid,
 static ssize_t amp_in_caps_show(struct hdac_device *codec, hda_nid_t nid,
 				struct widget_attribute *attr, char *buf)
 {
-	if (nid != codec->afg && !(get_wcaps(codec, nid) & AC_WCAP_IN_AMP))
+	if (nid != codec->afg && !(snd_hdac_get_wcaps(codec, nid) & AC_WCAP_IN_AMP))
 		return 0;
 	return sysfs_emit(buf, "0x%08x\n",
 			  snd_hdac_read_parm(codec, nid, AC_PAR_AMP_IN_CAP));
@@ -228,7 +228,7 @@ static ssize_t amp_in_caps_show(struct hdac_device *codec, hda_nid_t nid,
 static ssize_t amp_out_caps_show(struct hdac_device *codec, hda_nid_t nid,
 				 struct widget_attribute *attr, char *buf)
 {
-	if (nid != codec->afg && !(get_wcaps(codec, nid) & AC_WCAP_OUT_AMP))
+	if (nid != codec->afg && !(snd_hdac_get_wcaps(codec, nid) & AC_WCAP_OUT_AMP))
 		return 0;
 	return sysfs_emit(buf, "0x%08x\n",
 			  snd_hdac_read_parm(codec, nid, AC_PAR_AMP_OUT_CAP));
@@ -237,7 +237,7 @@ static ssize_t amp_out_caps_show(struct hdac_device *codec, hda_nid_t nid,
 static ssize_t power_caps_show(struct hdac_device *codec, hda_nid_t nid,
 			       struct widget_attribute *attr, char *buf)
 {
-	if (nid != codec->afg && !(get_wcaps(codec, nid) & AC_WCAP_POWER))
+	if (nid != codec->afg && !(snd_hdac_get_wcaps(codec, nid) & AC_WCAP_POWER))
 		return 0;
 	return sysfs_emit(buf, "0x%08x\n",
 			  snd_hdac_read_parm(codec, nid, AC_PAR_POWER_STATE));
diff --git a/sound/hda/local.h b/sound/hda/local.h
index 896ba142e8bc..5f03b203c416 100644
--- a/sound/hda/local.h
+++ b/sound/hda/local.h
@@ -6,27 +6,6 @@
 #ifndef __HDAC_LOCAL_H
 #define __HDAC_LOCAL_H
 
-#define get_wcaps(codec, nid) \
-	snd_hdac_read_parm(codec, nid, AC_PAR_AUDIO_WIDGET_CAP)
-
-/* get the widget type from widget capability bits */
-static inline int get_wcaps_type(unsigned int wcaps)
-{
-	if (!wcaps)
-		return -1; /* invalid type */
-	return (wcaps & AC_WCAP_TYPE) >> AC_WCAP_TYPE_SHIFT;
-}
-
-static inline unsigned int get_wcaps_channels(u32 wcaps)
-{
-	unsigned int chans;
-
-	chans = (wcaps & AC_WCAP_CHAN_CNT_EXT) >> 13;
-	chans = (chans + 1) * 2;
-
-	return chans;
-}
-
 extern const struct attribute_group *hdac_dev_attr_groups[];
 int hda_widget_sysfs_init(struct hdac_device *codec);
 int hda_widget_sysfs_reinit(struct hdac_device *codec, hda_nid_t start_nid,
diff --git a/sound/soc/codecs/hdac_hdmi.c b/sound/soc/codecs/hdac_hdmi.c
index 1139a2754ca3..b33cd5178008 100644
--- a/sound/soc/codecs/hdac_hdmi.c
+++ b/sound/soc/codecs/hdac_hdmi.c
@@ -24,7 +24,6 @@
 #include <sound/hda_i915.h>
 #include <sound/pcm_drm_eld.h>
 #include <sound/hda_chmap.h>
-#include "../../hda/local.h"
 #include "hdac_hdmi.h"
 
 #define NAME_SIZE	32
@@ -221,8 +220,8 @@ static int hdac_hdmi_get_port_len(struct hdac_device *hdev, hda_nid_t nid)
 	unsigned int caps;
 	unsigned int type, param;
 
-	caps = get_wcaps(hdev, nid);
-	type = get_wcaps_type(caps);
+	caps = snd_hdac_get_wcaps(hdev, nid);
+	type = snd_hdac_get_wcaps_type(caps);
 
 	if (!(caps & AC_WCAP_DIGITAL) || (type != AC_WID_PIN))
 		return 0;
@@ -492,10 +491,10 @@ static int hdac_hdmi_query_port_connlist(struct hdac_device *hdev,
 					struct hdac_hdmi_pin *pin,
 					struct hdac_hdmi_port *port)
 {
-	if (!(get_wcaps(hdev, pin->nid) & AC_WCAP_CONN_LIST)) {
+	if (!(snd_hdac_get_wcaps(hdev, pin->nid) & AC_WCAP_CONN_LIST)) {
 		dev_warn(&hdev->dev,
 			"HDMI: pin %d wcaps %#x does not support connection list\n",
-			pin->nid, get_wcaps(hdev, pin->nid));
+			pin->nid, snd_hdac_get_wcaps(hdev, pin->nid));
 		return -EINVAL;
 	}
 
@@ -660,8 +659,8 @@ hdac_hdmi_query_cvt_params(struct hdac_device *hdev, struct hdac_hdmi_cvt *cvt)
 	struct hdac_hdmi_priv *hdmi = hdev_to_hdmi_priv(hdev);
 	int err;
 
-	chans = get_wcaps(hdev, cvt->nid);
-	chans = get_wcaps_channels(chans);
+	chans = snd_hdac_get_wcaps(hdev, cvt->nid);
+	chans = snd_hdac_get_wcaps_channels(chans);
 
 	cvt->params.channels_min = 2;
 
@@ -743,7 +742,7 @@ static void hdac_hdmi_set_power_state(struct hdac_device *hdev,
 	int count;
 	unsigned int state;
 
-	if (get_wcaps(hdev, nid) & AC_WCAP_POWER) {
+	if (snd_hdac_get_wcaps(hdev, nid) & AC_WCAP_POWER) {
 		if (!snd_hdac_check_power_state(hdev, nid, pwr_state)) {
 			for (count = 0; count < 10; count++) {
 				snd_hdac_codec_read(hdev, nid, 0,
@@ -761,7 +760,7 @@ static void hdac_hdmi_set_power_state(struct hdac_device *hdev,
 static void hdac_hdmi_set_amp(struct hdac_device *hdev,
 				   hda_nid_t nid, int val)
 {
-	if (get_wcaps(hdev, nid) & AC_WCAP_OUT_AMP)
+	if (snd_hdac_get_wcaps(hdev, nid) & AC_WCAP_OUT_AMP)
 		snd_hdac_codec_write(hdev, nid, 0,
 					AC_VERB_SET_AMP_GAIN_MUTE, val);
 }
@@ -1648,8 +1647,8 @@ static int hdac_hdmi_parse_and_map_nid(struct hdac_device *hdev,
 		unsigned int caps;
 		unsigned int type;
 
-		caps = get_wcaps(hdev, nid);
-		type = get_wcaps_type(caps);
+		caps = snd_hdac_get_wcaps(hdev, nid);
+		type = snd_hdac_get_wcaps_type(caps);
 
 		if (!(caps & AC_WCAP_DIGITAL))
 			continue;
-- 
cgit v1.2.3


From 6bf917e9aacc360e76b74249209a4290f2e8404b Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Wed, 9 Jul 2025 18:04:12 +0200
Subject: ALSA: hda: Introduce hda_codec_driver ops

Until now, we have used "patch_ops" embedded in the hda_codec object to
define the callbacks that are used in various places to manage the
HD-audio codec.  But from the device driver POV, these should rather
have been driver ops instead of callbacks in the codec object.

This patch defines the driver ops for the HD-audio codec driver as the
replacement.  We reuse the same struct hda_codec_ops, and it is placed
in hda_codec_driver.ops.  When the driver->ops callbacks are defined,
they are called in preference to the codec->patch_ops callbacks.

With converting to the driver ops, there is no need to pass the ugly
patch_ops handling in hda_device_id tables.  That is, driver_data
field of hda_device_id becomes really optional and it can be used for
passing the codec-specific data (e.g. specifying a model).

The codec entries after the conversion should be with HDA_CODEC_ID()
and co, instead of the former HDA_CODEC_ENTRY().

Once all codec drivers have been converted to use driver ops, we can
get rid of the codec patch_ops.

Signed-off-by: Takashi Iwai <tiwai@suse.de>
Link: https://patch.msgid.link/20250709160434.1859-10-tiwai@suse.de
---
 include/sound/hda_codec.h    | 26 ++++++++++++++++++--
 sound/hda/common/bind.c      | 27 ++++++++++++++++-----
 sound/hda/common/codec.c     | 56 ++++++++++++++++++++++++++++++++------------
 sound/hda/common/hda_local.h | 11 +++++++++
 sound/soc/codecs/hda.c       | 27 ++++++++++++++-------
 sound/soc/codecs/hdac_hda.c  | 28 ++++++++++++++++------
 6 files changed, 137 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/include/sound/hda_codec.h b/include/sound/hda_codec.h
index c1fe6290d04d..a725ac48c20c 100644
--- a/include/sound/hda_codec.h
+++ b/include/sound/hda_codec.h
@@ -27,6 +27,7 @@ struct hda_beep;
 struct hda_codec;
 struct hda_pcm;
 struct hda_pcm_stream;
+struct hda_codec_ops;
 
 /*
  * codec bus
@@ -79,6 +80,17 @@ typedef int (*hda_codec_patch_t)(struct hda_codec *);
 #define HDA_CODEC_ID_GENERIC_HDMI	0x00000101
 #define HDA_CODEC_ID_GENERIC		0x00000201
 
+#define HDA_CODEC_ID_REV_MODEL(_vid, _rev, _name, _model)	  \
+	{ .vendor_id = (_vid), .rev_id = (_rev), .name = (_name), \
+	  .api_version = HDA_DEV_LEGACY, .driver_data = (_model) }
+#define HDA_CODEC_ID_MODEL(_vid, _name, _model)	  \
+	HDA_CODEC_ID_REV_MODEL(_vid, 0, _name, _model)
+#define HDA_CODEC_ID_REV(_vid, _rev, _name) \
+	HDA_CODEC_ID_REV_MODEL(_vid, _rev, _name, 0)
+#define HDA_CODEC_ID(_vid, _name) \
+	HDA_CODEC_ID_REV(_vid, 0, _name)
+
+/* old macros for patch_ops -- to be deprecated */
 #define HDA_CODEC_REV_ENTRY(_vid, _rev, _name, _patch) \
 	{ .vendor_id = (_vid), .rev_id = (_rev), .name = (_name), \
 	  .api_version = HDA_DEV_LEGACY, \
@@ -89,8 +101,12 @@ typedef int (*hda_codec_patch_t)(struct hda_codec *);
 struct hda_codec_driver {
 	struct hdac_driver core;
 	const struct hda_device_id *id;
+	const struct hda_codec_ops *ops;
 };
 
+#define hda_codec_to_driver(codec) \
+	container_of((codec)->core.dev.driver, struct hda_codec_driver, core.driver)
+
 int __hda_codec_driver_register(struct hda_codec_driver *drv, const char *name,
 			       struct module *owner);
 #define hda_codec_driver_register(drv) \
@@ -102,6 +118,8 @@ void hda_codec_driver_unregister(struct hda_codec_driver *drv);
 
 /* ops set by the preset patch */
 struct hda_codec_ops {
+	int (*probe)(struct hda_codec *codec, const struct hda_device_id *id);
+	void (*remove)(struct hda_codec *codec);
 	int (*build_controls)(struct hda_codec *codec);
 	int (*build_pcms)(struct hda_codec *codec);
 	int (*init)(struct hda_codec *codec);
@@ -184,7 +202,7 @@ struct hda_codec {
 	/* set by patch */
 	struct hda_codec_ops patch_ops;
 
-	/* PCM to create, set by patch_ops.build_pcms callback */
+	/* PCM to create, set by hda_codec_ops.build_pcms callback */
 	struct list_head pcm_list_head;
 	refcount_t pcm_ref;
 	wait_queue_head_t remove_sleep;
@@ -478,7 +496,11 @@ extern const struct dev_pm_ops hda_codec_driver_pm;
 static inline
 int hda_call_check_power_status(struct hda_codec *codec, hda_nid_t nid)
 {
-	if (codec->patch_ops.check_power_status)
+	struct hda_codec_driver *driver = hda_codec_to_driver(codec);
+
+	if (driver->ops && driver->ops->check_power_status)
+		return driver->ops->check_power_status(codec, nid);
+	else if (codec->patch_ops.check_power_status)
 		return codec->patch_ops.check_power_status(codec, nid);
 	return 0;
 }
diff --git a/sound/hda/common/bind.c b/sound/hda/common/bind.c
index df8f88beddd0..56975178f533 100644
--- a/sound/hda/common/bind.c
+++ b/sound/hda/common/bind.c
@@ -42,6 +42,7 @@ static int hda_codec_match(struct hdac_device *dev, const struct hdac_driver *dr
 static void hda_codec_unsol_event(struct hdac_device *dev, unsigned int ev)
 {
 	struct hda_codec *codec = container_of(dev, struct hda_codec, core);
+	struct hda_codec_driver *driver = hda_codec_to_driver(codec);
 
 	/* ignore unsol events during shutdown */
 	if (codec->card->shutdown || codec->bus->shutdown)
@@ -51,7 +52,9 @@ static void hda_codec_unsol_event(struct hdac_device *dev, unsigned int ev)
 	if (codec->core.dev.power.power_state.event != PM_EVENT_ON)
 		return;
 
-	if (codec->patch_ops.unsol_event)
+	if (driver->ops && driver->ops->unsol_event)
+		driver->ops->unsol_event(codec, ev);
+	else if (codec->patch_ops.unsol_event)
 		codec->patch_ops.unsol_event(codec, ev);
 }
 
@@ -87,6 +90,7 @@ static int hda_codec_driver_probe(struct device *dev)
 {
 	struct hda_codec *codec = dev_to_hda_codec(dev);
 	struct module *owner = dev->driver->owner;
+	struct hda_codec_driver *driver = hda_codec_to_driver(codec);
 	hda_codec_patch_t patch;
 	int err;
 
@@ -111,11 +115,17 @@ static int hda_codec_driver_probe(struct device *dev)
 		goto error;
 	}
 
-	patch = (hda_codec_patch_t)codec->preset->driver_data;
-	if (patch) {
-		err = patch(codec);
+	if (driver->ops && driver->ops->probe) {
+		err = driver->ops->probe(codec, codec->preset);
 		if (err < 0)
 			goto error_module_put;
+	} else {
+		patch = (hda_codec_patch_t)codec->preset->driver_data;
+		if (patch) {
+			err = patch(codec);
+			if (err < 0)
+				goto error_module_put;
+		}
 	}
 
 	err = snd_hda_codec_build_pcms(codec);
@@ -136,7 +146,9 @@ static int hda_codec_driver_probe(struct device *dev)
 	return 0;
 
  error_module:
-	if (codec->patch_ops.free)
+	if (driver->ops && driver->ops->remove)
+		driver->ops->remove(codec);
+	else if (codec->patch_ops.free)
 		codec->patch_ops.free(codec);
  error_module_put:
 	module_put(owner);
@@ -150,6 +162,7 @@ static int hda_codec_driver_probe(struct device *dev)
 static int hda_codec_driver_remove(struct device *dev)
 {
 	struct hda_codec *codec = dev_to_hda_codec(dev);
+	struct hda_codec_driver *driver = hda_codec_to_driver(codec);
 
 	if (codec->bus->core.ext_ops) {
 		if (WARN_ON(!codec->bus->core.ext_ops->hdev_detach))
@@ -163,7 +176,9 @@ static int hda_codec_driver_remove(struct device *dev)
 		wait_event(codec->remove_sleep, !refcount_read(&codec->pcm_ref));
 	snd_power_sync_ref(codec->bus->card);
 
-	if (codec->patch_ops.free)
+	if (driver->ops && driver->ops->remove)
+		driver->ops->remove(codec);
+	else if (codec->patch_ops.free)
 		codec->patch_ops.free(codec);
 	snd_hda_codec_cleanup_for_unbind(codec);
 	codec->preset = NULL;
diff --git a/sound/hda/common/codec.c b/sound/hda/common/codec.c
index cb72e9655c8a..8899be764d68 100644
--- a/sound/hda/common/codec.c
+++ b/sound/hda/common/codec.c
@@ -1114,6 +1114,7 @@ void snd_hda_codec_setup_stream(struct hda_codec *codec, hda_nid_t nid,
 				u32 stream_tag,
 				int channel_id, int format)
 {
+	struct hda_codec_driver *driver = hda_codec_to_driver(codec);
 	struct hda_codec *c;
 	struct hda_cvt_setup *p;
 	int type;
@@ -1129,7 +1130,9 @@ void snd_hda_codec_setup_stream(struct hda_codec *codec, hda_nid_t nid,
 	if (!p)
 		return;
 
-	if (codec->patch_ops.stream_pm)
+	if (driver->ops && driver->ops->stream_pm)
+		driver->ops->stream_pm(codec, nid, true);
+	else if (codec->patch_ops.stream_pm)
 		codec->patch_ops.stream_pm(codec, nid, true);
 	if (codec->pcm_format_first)
 		update_pcm_format(codec, p, nid, format);
@@ -1190,7 +1193,9 @@ EXPORT_SYMBOL_GPL(__snd_hda_codec_cleanup_stream);
 static void really_cleanup_stream(struct hda_codec *codec,
 				  struct hda_cvt_setup *q)
 {
+	struct hda_codec_driver *driver = hda_codec_to_driver(codec);
 	hda_nid_t nid = q->nid;
+
 	if (q->stream_tag || q->channel_id)
 		snd_hda_codec_write(codec, nid, 0, AC_VERB_SET_CHANNEL_STREAMID, 0);
 	if (q->format_id)
@@ -1198,7 +1203,9 @@ static void really_cleanup_stream(struct hda_codec *codec,
 );
 	memset(q, 0, sizeof(*q));
 	q->nid = nid;
-	if (codec->patch_ops.stream_pm)
+	if (driver->ops && driver->ops->stream_pm)
+		driver->ops->stream_pm(codec, nid, false);
+	else if (codec->patch_ops.stream_pm)
 		codec->patch_ops.stream_pm(codec, nid, false);
 }
 
@@ -2746,6 +2753,7 @@ EXPORT_SYMBOL_GPL(snd_hda_codec_eapd_power_filter);
 static unsigned int hda_set_power_state(struct hda_codec *codec,
 					unsigned int power_state)
 {
+	struct hda_codec_driver *driver = hda_codec_to_driver(codec);
 	hda_nid_t fg = codec->core.afg ? codec->core.afg : codec->core.mfg;
 	int count;
 	unsigned int state;
@@ -2762,7 +2770,10 @@ static unsigned int hda_set_power_state(struct hda_codec *codec,
 
 	/* repeat power states setting at most 10 times*/
 	for (count = 0; count < 10; count++) {
-		if (codec->patch_ops.set_power_state)
+		/* might be called before binding to driver, too */
+		if (driver && driver->ops && driver->ops->set_power_state)
+			driver->ops->set_power_state(codec, fg, power_state);
+		else if (codec->patch_ops.set_power_state)
 			codec->patch_ops.set_power_state(codec, fg,
 							 power_state);
 		else {
@@ -2842,10 +2853,13 @@ void snd_hda_update_power_acct(struct hda_codec *codec)
  */
 static unsigned int hda_call_codec_suspend(struct hda_codec *codec)
 {
+	struct hda_codec_driver *driver = hda_codec_to_driver(codec);
 	unsigned int state;
 
 	snd_hdac_enter_pm(&codec->core);
-	if (codec->patch_ops.suspend)
+	if (driver->ops && driver->ops->suspend)
+		driver->ops->suspend(codec);
+	else if (codec->patch_ops.suspend)
 		codec->patch_ops.suspend(codec);
 	if (!codec->no_stream_clean_at_suspend)
 		hda_cleanup_all_streams(codec);
@@ -2860,6 +2874,8 @@ static unsigned int hda_call_codec_suspend(struct hda_codec *codec)
  */
 static void hda_call_codec_resume(struct hda_codec *codec)
 {
+	struct hda_codec_driver *driver = hda_codec_to_driver(codec);
+
 	snd_hdac_enter_pm(&codec->core);
 	if (codec->core.regmap)
 		regcache_mark_dirty(codec->core.regmap);
@@ -2870,11 +2886,12 @@ static void hda_call_codec_resume(struct hda_codec *codec)
 	restore_shutup_pins(codec);
 	hda_exec_init_verbs(codec);
 	snd_hda_jack_set_dirty_all(codec);
-	if (codec->patch_ops.resume)
+	if (driver->ops && driver->ops->resume)
+		driver->ops->resume(codec);
+	else if (codec->patch_ops.resume)
 		codec->patch_ops.resume(codec);
 	else {
-		if (codec->patch_ops.init)
-			codec->patch_ops.init(codec);
+		snd_hda_codec_init(codec);
 		snd_hda_regmap_sync(codec);
 	}
 
@@ -3059,15 +3076,20 @@ EXPORT_SYMBOL_GPL(snd_pcm_2_1_chmaps);
 
 int snd_hda_codec_build_controls(struct hda_codec *codec)
 {
+	struct hda_codec_driver *driver = hda_codec_to_driver(codec);
 	int err = 0;
+
 	hda_exec_init_verbs(codec);
 	/* continue to initialize... */
-	if (codec->patch_ops.init)
-		err = codec->patch_ops.init(codec);
-	if (!err && codec->patch_ops.build_controls)
-		err = codec->patch_ops.build_controls(codec);
-	if (err < 0)
-		return err;
+	err = snd_hda_codec_init(codec);
+	if (!err) {
+		if (driver->ops && driver->ops->build_controls)
+			err = driver->ops->build_controls(codec);
+		else if (codec->patch_ops.build_controls)
+			err = codec->patch_ops.build_controls(codec);
+		if (err < 0)
+			return err;
+	}
 
 	/* we create chmaps here instead of build_pcms */
 	err = add_std_chmaps(codec);
@@ -3253,16 +3275,20 @@ static int get_empty_pcm_device(struct hda_bus *bus, unsigned int type)
 /* call build_pcms ops of the given codec and set up the default parameters */
 int snd_hda_codec_parse_pcms(struct hda_codec *codec)
 {
+	struct hda_codec_driver *driver = hda_codec_to_driver(codec);
 	struct hda_pcm *cpcm;
 	int err;
 
 	if (!list_empty(&codec->pcm_list_head))
 		return 0; /* already parsed */
 
-	if (!codec->patch_ops.build_pcms)
+	if (driver->ops && driver->ops->build_pcms)
+		err = driver->ops->build_pcms(codec);
+	else if (codec->patch_ops.build_pcms)
+		err = codec->patch_ops.build_pcms(codec);
+	else
 		return 0;
 
-	err = codec->patch_ops.build_pcms(codec);
 	if (err < 0) {
 		codec_err(codec, "cannot build PCMs for #%d (error %d)\n",
 			  codec->core.addr, err);
diff --git a/sound/hda/common/hda_local.h b/sound/hda/common/hda_local.h
index 428aa5a06ead..654fe1156d56 100644
--- a/sound/hda/common/hda_local.h
+++ b/sound/hda/common/hda_local.h
@@ -652,6 +652,17 @@ unsigned int snd_hda_codec_eapd_power_filter(struct hda_codec *codec,
 
 void snd_hda_codec_shutdown(struct hda_codec *codec);
 
+static inline int snd_hda_codec_init(struct hda_codec *codec)
+{
+	struct hda_codec_driver *driver = hda_codec_to_driver(codec);
+
+	if (driver->ops && driver->ops->init)
+		return driver->ops->init(codec);
+	else if (codec->patch_ops.init)
+		return codec->patch_ops.init(codec);
+	return 0;
+}
+
 /*
  * AMP control callbacks
  */
diff --git a/sound/soc/codecs/hda.c b/sound/soc/codecs/hda.c
index dc7794c9ac44..ddb31001657e 100644
--- a/sound/soc/codecs/hda.c
+++ b/sound/soc/codecs/hda.c
@@ -173,6 +173,7 @@ EXPORT_SYMBOL_GPL(hda_codec_probe_complete);
 static int hda_codec_probe(struct snd_soc_component *component)
 {
 	struct hda_codec *codec = dev_to_hda_codec(component->dev);
+	struct hda_codec_driver *driver = hda_codec_to_driver(codec);
 	struct hdac_device *hdev = &codec->core;
 	struct hdac_bus *bus = hdev->bus;
 	struct hdac_ext_link *hlink;
@@ -214,14 +215,19 @@ static int hda_codec_probe(struct snd_soc_component *component)
 		goto err;
 	}
 
-	patch = (hda_codec_patch_t)codec->preset->driver_data;
-	if (!patch) {
-		dev_err(&hdev->dev, "no patch specified\n");
-		ret = -EINVAL;
-		goto err;
+	if (driver->ops && driver->ops->probe) {
+		ret = driver->ops->probe(codec, codec->preset);
+	} else {
+		patch = (hda_codec_patch_t)codec->preset->driver_data;
+		if (!patch) {
+			dev_err(&hdev->dev, "no patch specified\n");
+			ret = -EINVAL;
+			goto err;
+		}
+
+		ret = patch(codec);
 	}
 
-	ret = patch(codec);
 	if (ret < 0) {
 		dev_err(&hdev->dev, "codec init failed: %d\n", ret);
 		goto err;
@@ -252,7 +258,9 @@ static int hda_codec_probe(struct snd_soc_component *component)
 complete_err:
 	hda_codec_unregister_dais(codec, component);
 parse_pcms_err:
-	if (codec->patch_ops.free)
+	if (driver->ops && driver->ops->remove)
+		driver->ops->remove(codec);
+	else if (codec->patch_ops.free)
 		codec->patch_ops.free(codec);
 err:
 	snd_hda_codec_cleanup_for_unbind(codec);
@@ -271,6 +279,7 @@ device_new_err:
 static void hda_codec_remove(struct snd_soc_component *component)
 {
 	struct hda_codec *codec = dev_to_hda_codec(component->dev);
+	struct hda_codec_driver *driver = hda_codec_to_driver(codec);
 	struct hdac_device *hdev = &codec->core;
 	struct hdac_bus *bus = hdev->bus;
 	struct hdac_ext_link *hlink;
@@ -281,7 +290,9 @@ static void hda_codec_remove(struct snd_soc_component *component)
 
 	hda_codec_unregister_dais(codec, component);
 
-	if (codec->patch_ops.free)
+	if (driver->ops && driver->ops->remove)
+		driver->ops->remove(codec);
+	else if (codec->patch_ops.free)
 		codec->patch_ops.free(codec);
 
 	snd_hda_codec_cleanup_for_unbind(codec);
diff --git a/sound/soc/codecs/hdac_hda.c b/sound/soc/codecs/hdac_hda.c
index 29c88de5508b..7bb7845d5e43 100644
--- a/sound/soc/codecs/hdac_hda.c
+++ b/sound/soc/codecs/hdac_hda.c
@@ -409,6 +409,7 @@ static int hdac_hda_codec_probe(struct snd_soc_component *component)
 			snd_soc_component_get_dapm(component);
 	struct hdac_device *hdev = &hda_pvt->codec->core;
 	struct hda_codec *hcodec = hda_pvt->codec;
+	struct hda_codec_driver *driver = hda_codec_to_driver(hcodec);
 	struct hdac_ext_link *hlink;
 	hda_codec_patch_t patch;
 	int ret;
@@ -484,15 +485,23 @@ static int hdac_hda_codec_probe(struct snd_soc_component *component)
 		goto error_pm;
 	}
 
-	patch = (hda_codec_patch_t)hcodec->preset->driver_data;
-	if (patch) {
-		ret = patch(hcodec);
+	if (driver->ops && driver->ops->probe) {
+		ret = driver->ops->probe(hcodec, hcodec->preset);
 		if (ret < 0) {
-			dev_err(&hdev->dev, "%s: patch failed %d\n", __func__, ret);
+			dev_err(&hdev->dev, "%s: probe failed %d\n", __func__, ret);
 			goto error_regmap;
 		}
 	} else {
-		dev_dbg(&hdev->dev, "%s: no patch file found\n", __func__);
+		patch = (hda_codec_patch_t)hcodec->preset->driver_data;
+		if (patch) {
+			ret = patch(hcodec);
+			if (ret < 0) {
+				dev_err(&hdev->dev, "%s: patch failed %d\n", __func__, ret);
+				goto error_regmap;
+			}
+		} else {
+			dev_dbg(&hdev->dev, "%s: no patch file found\n", __func__);
+		}
 	}
 
 	ret = snd_hda_codec_parse_pcms(hcodec);
@@ -531,7 +540,9 @@ static int hdac_hda_codec_probe(struct snd_soc_component *component)
 	return 0;
 
 error_patch:
-	if (hcodec->patch_ops.free)
+	if (driver->ops && driver->ops->remove)
+		driver->ops->remove(hcodec);
+	else if (hcodec->patch_ops.free)
 		hcodec->patch_ops.free(hcodec);
 error_regmap:
 	snd_hdac_regmap_exit(hdev);
@@ -548,6 +559,7 @@ static void hdac_hda_codec_remove(struct snd_soc_component *component)
 		      snd_soc_component_get_drvdata(component);
 	struct hdac_device *hdev = &hda_pvt->codec->core;
 	struct hda_codec *codec = hda_pvt->codec;
+	struct hda_codec_driver *driver = hda_codec_to_driver(codec);
 	struct hdac_ext_link *hlink = NULL;
 
 	hlink = snd_hdac_ext_bus_get_hlink_by_name(hdev->bus, dev_name(&hdev->dev));
@@ -559,7 +571,9 @@ static void hdac_hda_codec_remove(struct snd_soc_component *component)
 	pm_runtime_disable(&hdev->dev);
 	snd_hdac_ext_bus_link_put(hdev->bus, hlink);
 
-	if (codec->patch_ops.free)
+	if (driver->ops && driver->ops->remove)
+		driver->ops->remove(codec);
+	else if (codec->patch_ops.free)
 		codec->patch_ops.free(codec);
 
 	snd_hda_codec_cleanup_for_unbind(codec);
-- 
cgit v1.2.3


From cabaf5908e586156d1d59f3fb8c51d2b169bc636 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Wed, 9 Jul 2025 18:04:27 +0200
Subject: ALSA: hda: Drop old codec binding method

Now that all patch_ops usages have been converted to the new
hda_codec_ops probe, we can drop patch_ops from the hda_codec,
together with the calls to the patch_ops callbacks.

The hda_codec_ops.free callback is removed as all have been replaced
with the new remove callback.

Also, correct comments mentioning "patch"; it's replaced with "codec
driver".

Signed-off-by: Takashi Iwai <tiwai@suse.de>
Link: https://patch.msgid.link/20250709160434.1859-25-tiwai@suse.de
---
 include/sound/hda_codec.h          | 22 ++--------------------
 sound/hda/Makefile                 |  2 +-
 sound/hda/codecs/generic.c         | 12 ------------
 sound/hda/codecs/generic.h         |  1 -
 sound/hda/codecs/hdmi/hdmi_local.h |  2 +-
 sound/hda/common/bind.c            | 26 +++++++-------------------
 sound/hda/common/codec.c           | 20 ++------------------
 sound/hda/common/hda_local.h       |  2 --
 sound/soc/codecs/hda.c             | 20 ++++----------------
 sound/soc/codecs/hdac_hda.c        | 31 +++++++++----------------------
 10 files changed, 26 insertions(+), 112 deletions(-)

(limited to 'include')

diff --git a/include/sound/hda_codec.h b/include/sound/hda_codec.h
index a725ac48c20c..ddc9c392f93f 100644
--- a/include/sound/hda_codec.h
+++ b/include/sound/hda_codec.h
@@ -70,12 +70,8 @@ struct hda_bus {
 
 /*
  * codec preset
- *
- * Known codecs have the patch to build and set up the controls/PCMs
- * better than the generic parser.
  */
-typedef int (*hda_codec_patch_t)(struct hda_codec *);
-	
+
 #define HDA_CODEC_ID_SKIP_PROBE		0x00000001
 #define HDA_CODEC_ID_GENERIC_HDMI	0x00000101
 #define HDA_CODEC_ID_GENERIC		0x00000201
@@ -90,14 +86,6 @@ typedef int (*hda_codec_patch_t)(struct hda_codec *);
 #define HDA_CODEC_ID(_vid, _name) \
 	HDA_CODEC_ID_REV(_vid, 0, _name)
 
-/* old macros for patch_ops -- to be deprecated */
-#define HDA_CODEC_REV_ENTRY(_vid, _rev, _name, _patch) \
-	{ .vendor_id = (_vid), .rev_id = (_rev), .name = (_name), \
-	  .api_version = HDA_DEV_LEGACY, \
-	  .driver_data = (unsigned long)(_patch) }
-#define HDA_CODEC_ENTRY(_vid, _name, _patch) \
-	HDA_CODEC_REV_ENTRY(_vid, 0, _name, _patch)
-
 struct hda_codec_driver {
 	struct hdac_driver core;
 	const struct hda_device_id *id;
@@ -116,14 +104,13 @@ void hda_codec_driver_unregister(struct hda_codec_driver *drv);
 	module_driver(drv, hda_codec_driver_register, \
 		      hda_codec_driver_unregister)
 
-/* ops set by the preset patch */
+/* ops for hda codec driver */
 struct hda_codec_ops {
 	int (*probe)(struct hda_codec *codec, const struct hda_device_id *id);
 	void (*remove)(struct hda_codec *codec);
 	int (*build_controls)(struct hda_codec *codec);
 	int (*build_pcms)(struct hda_codec *codec);
 	int (*init)(struct hda_codec *codec);
-	void (*free)(struct hda_codec *codec);
 	void (*unsol_event)(struct hda_codec *codec, unsigned int res);
 	void (*set_power_state)(struct hda_codec *codec, hda_nid_t fg,
 				unsigned int power_state);
@@ -199,9 +186,6 @@ struct hda_codec {
 	const struct hda_device_id *preset;
 	const char *modelname;	/* model name for preset */
 
-	/* set by patch */
-	struct hda_codec_ops patch_ops;
-
 	/* PCM to create, set by hda_codec_ops.build_pcms callback */
 	struct list_head pcm_list_head;
 	refcount_t pcm_ref;
@@ -500,8 +484,6 @@ int hda_call_check_power_status(struct hda_codec *codec, hda_nid_t nid)
 
 	if (driver->ops && driver->ops->check_power_status)
 		return driver->ops->check_power_status(codec, nid);
-	else if (codec->patch_ops.check_power_status)
-		return codec->patch_ops.check_power_status(codec, nid);
 	return 0;
 }
 
diff --git a/sound/hda/Makefile b/sound/hda/Makefile
index 31b9fbedaa77..d9a6def582ef 100644
--- a/sound/hda/Makefile
+++ b/sound/hda/Makefile
@@ -3,6 +3,6 @@ obj-y += core/
 obj-$(CONFIG_SND_HDA) += common/
 obj-$(CONFIG_SND_HDA) += codecs/
 # this must be the last entry after codec drivers;
-# otherwise the codec patches won't be hooked before the PCI probe
+# otherwise the codec drivers won't be hooked before the PCI probe
 # when built in kernel
 obj-$(CONFIG_SND_HDA) += controllers/
diff --git a/sound/hda/codecs/generic.c b/sound/hda/codecs/generic.c
index 82c31b93424c..044f1d0aeaea 100644
--- a/sound/hda/codecs/generic.c
+++ b/sound/hda/codecs/generic.c
@@ -5190,8 +5190,6 @@ int snd_hda_gen_parse_auto_config(struct hda_codec *codec,
 	if (spec->power_down_unused || codec->power_save_node) {
 		if (!codec->power_filter)
 			codec->power_filter = snd_hda_gen_path_power_filter;
-		if (!codec->patch_ops.stream_pm)
-			codec->patch_ops.stream_pm = snd_hda_gen_stream_pm;
 	}
 
 	if (!spec->no_analog && spec->beep_nid) {
@@ -6103,15 +6101,6 @@ EXPORT_SYMBOL_GPL(snd_hda_gen_check_power_status);
  * the generic codec support
  */
 
-static const struct hda_codec_ops generic_patch_ops = {
-	.build_controls = snd_hda_gen_build_controls,
-	.build_pcms = snd_hda_gen_build_pcms,
-	.init = snd_hda_gen_init,
-	.free = snd_hda_gen_free,
-	.unsol_event = snd_hda_jack_unsol_event,
-	.check_power_status = snd_hda_gen_check_power_status,
-};
-
 static int snd_hda_gen_probe(struct hda_codec *codec,
 			     const struct hda_device_id *id)
 {
@@ -6132,7 +6121,6 @@ static int snd_hda_gen_probe(struct hda_codec *codec,
 	if (err < 0)
 		goto error;
 
-	codec->patch_ops = generic_patch_ops;
 	return 0;
 
 error:
diff --git a/sound/hda/codecs/generic.h b/sound/hda/codecs/generic.h
index 00a92fc55846..524591821f8c 100644
--- a/sound/hda/codecs/generic.h
+++ b/sound/hda/codecs/generic.h
@@ -312,7 +312,6 @@ int snd_hda_gen_spec_init(struct hda_gen_spec *spec);
 
 int snd_hda_gen_init(struct hda_codec *codec);
 void snd_hda_gen_remove(struct hda_codec *codec);
-#define snd_hda_gen_free	snd_hda_gen_remove
 
 int snd_hda_get_path_idx(struct hda_codec *codec, struct nid_path *path);
 struct nid_path *snd_hda_get_path_from_idx(struct hda_codec *codec, int idx);
diff --git a/sound/hda/codecs/hdmi/hdmi_local.h b/sound/hda/codecs/hdmi/hdmi_local.h
index 0654013f1fda..548241ad3fa9 100644
--- a/sound/hda/codecs/hdmi/hdmi_local.h
+++ b/sound/hda/codecs/hdmi/hdmi_local.h
@@ -57,7 +57,7 @@ struct hdmi_spec_per_pin {
 #endif
 };
 
-/* operations used by generic code that can be overridden by patches */
+/* operations used by generic code that can be overridden by codec drivers */
 struct hdmi_ops {
 	int (*pin_get_eld)(struct hda_codec *codec, hda_nid_t pin_nid,
 			   int dev_id, unsigned char *buf, int *eld_size);
diff --git a/sound/hda/common/bind.c b/sound/hda/common/bind.c
index 56975178f533..f85c640dd54f 100644
--- a/sound/hda/common/bind.c
+++ b/sound/hda/common/bind.c
@@ -54,8 +54,6 @@ static void hda_codec_unsol_event(struct hdac_device *dev, unsigned int ev)
 
 	if (driver->ops && driver->ops->unsol_event)
 		driver->ops->unsol_event(codec, ev);
-	else if (codec->patch_ops.unsol_event)
-		codec->patch_ops.unsol_event(codec, ev);
 }
 
 /**
@@ -91,7 +89,6 @@ static int hda_codec_driver_probe(struct device *dev)
 	struct hda_codec *codec = dev_to_hda_codec(dev);
 	struct module *owner = dev->driver->owner;
 	struct hda_codec_driver *driver = hda_codec_to_driver(codec);
-	hda_codec_patch_t patch;
 	int err;
 
 	if (codec->bus->core.ext_ops) {
@@ -115,19 +112,14 @@ static int hda_codec_driver_probe(struct device *dev)
 		goto error;
 	}
 
-	if (driver->ops && driver->ops->probe) {
-		err = driver->ops->probe(codec, codec->preset);
-		if (err < 0)
-			goto error_module_put;
-	} else {
-		patch = (hda_codec_patch_t)codec->preset->driver_data;
-		if (patch) {
-			err = patch(codec);
-			if (err < 0)
-				goto error_module_put;
-		}
+	if (WARN_ON(!(driver->ops && driver->ops->probe))) {
+		err = -EINVAL;
+		goto error_module_put;
 	}
 
+	err = driver->ops->probe(codec, codec->preset);
+	if (err < 0)
+		goto error_module_put;
 	err = snd_hda_codec_build_pcms(codec);
 	if (err < 0)
 		goto error_module;
@@ -148,8 +140,6 @@ static int hda_codec_driver_probe(struct device *dev)
  error_module:
 	if (driver->ops && driver->ops->remove)
 		driver->ops->remove(codec);
-	else if (codec->patch_ops.free)
-		codec->patch_ops.free(codec);
  error_module_put:
 	module_put(owner);
 
@@ -178,8 +168,6 @@ static int hda_codec_driver_remove(struct device *dev)
 
 	if (driver->ops && driver->ops->remove)
 		driver->ops->remove(codec);
-	else if (codec->patch_ops.free)
-		codec->patch_ops.free(codec);
 	snd_hda_codec_cleanup_for_unbind(codec);
 	codec->preset = NULL;
 	module_put(dev->driver->owner);
@@ -320,7 +308,7 @@ static int codec_bind_generic(struct hda_codec *codec)
  * @codec: the HDA codec
  *
  * Start parsing of the given codec tree and (re-)initialize the whole
- * patch instance.
+ * codec driver binding.
  *
  * Returns 0 if successful or a negative error code.
  */
diff --git a/sound/hda/common/codec.c b/sound/hda/common/codec.c
index 8899be764d68..33121d002087 100644
--- a/sound/hda/common/codec.c
+++ b/sound/hda/common/codec.c
@@ -766,7 +766,6 @@ void snd_hda_codec_cleanup_for_unbind(struct hda_codec *codec)
 		snd_hda_ctls_clear(codec);
 	codec_release_pcms(codec);
 	snd_hda_detach_beep_device(codec);
-	memset(&codec->patch_ops, 0, sizeof(codec->patch_ops));
 	snd_hda_jack_tbl_clear(codec);
 	codec->proc_widget_hook = NULL;
 	codec->spec = NULL;
@@ -1132,8 +1131,6 @@ void snd_hda_codec_setup_stream(struct hda_codec *codec, hda_nid_t nid,
 
 	if (driver->ops && driver->ops->stream_pm)
 		driver->ops->stream_pm(codec, nid, true);
-	else if (codec->patch_ops.stream_pm)
-		codec->patch_ops.stream_pm(codec, nid, true);
 	if (codec->pcm_format_first)
 		update_pcm_format(codec, p, nid, format);
 	update_pcm_stream_id(codec, p, nid, stream_tag, channel_id);
@@ -1205,8 +1202,6 @@ static void really_cleanup_stream(struct hda_codec *codec,
 	q->nid = nid;
 	if (driver->ops && driver->ops->stream_pm)
 		driver->ops->stream_pm(codec, nid, false);
-	else if (codec->patch_ops.stream_pm)
-		codec->patch_ops.stream_pm(codec, nid, false);
 }
 
 /* clean up the all conflicting obsolete streams */
@@ -2397,7 +2392,7 @@ static const struct snd_kcontrol_new dig_mixes[] = {
  * @cvt_nid: converter NID
  * @type: HDA_PCM_TYPE_*
  * Creates controls related with the digital output.
- * Called from each patch supporting the digital out.
+ * Called from each codec driver supporting the digital out.
  *
  * Returns 0 if successful, or a negative error code.
  */
@@ -2656,7 +2651,7 @@ static const struct snd_kcontrol_new dig_in_ctls[] = {
  * @nid: audio in widget NID
  *
  * Creates controls related with the SPDIF input.
- * Called from each patch supporting the SPDIF in.
+ * Called from each codec driver supporting the SPDIF in.
  *
  * Returns 0 if successful, or a negative error code.
  */
@@ -2773,9 +2768,6 @@ static unsigned int hda_set_power_state(struct hda_codec *codec,
 		/* might be called before binding to driver, too */
 		if (driver && driver->ops && driver->ops->set_power_state)
 			driver->ops->set_power_state(codec, fg, power_state);
-		else if (codec->patch_ops.set_power_state)
-			codec->patch_ops.set_power_state(codec, fg,
-							 power_state);
 		else {
 			state = power_state;
 			if (codec->power_filter)
@@ -2859,8 +2851,6 @@ static unsigned int hda_call_codec_suspend(struct hda_codec *codec)
 	snd_hdac_enter_pm(&codec->core);
 	if (driver->ops && driver->ops->suspend)
 		driver->ops->suspend(codec);
-	else if (codec->patch_ops.suspend)
-		codec->patch_ops.suspend(codec);
 	if (!codec->no_stream_clean_at_suspend)
 		hda_cleanup_all_streams(codec);
 	state = hda_set_power_state(codec, AC_PWRST_D3);
@@ -2888,8 +2878,6 @@ static void hda_call_codec_resume(struct hda_codec *codec)
 	snd_hda_jack_set_dirty_all(codec);
 	if (driver->ops && driver->ops->resume)
 		driver->ops->resume(codec);
-	else if (codec->patch_ops.resume)
-		codec->patch_ops.resume(codec);
 	else {
 		snd_hda_codec_init(codec);
 		snd_hda_regmap_sync(codec);
@@ -3085,8 +3073,6 @@ int snd_hda_codec_build_controls(struct hda_codec *codec)
 	if (!err) {
 		if (driver->ops && driver->ops->build_controls)
 			err = driver->ops->build_controls(codec);
-		else if (codec->patch_ops.build_controls)
-			err = codec->patch_ops.build_controls(codec);
 		if (err < 0)
 			return err;
 	}
@@ -3284,8 +3270,6 @@ int snd_hda_codec_parse_pcms(struct hda_codec *codec)
 
 	if (driver->ops && driver->ops->build_pcms)
 		err = driver->ops->build_pcms(codec);
-	else if (codec->patch_ops.build_pcms)
-		err = codec->patch_ops.build_pcms(codec);
 	else
 		return 0;
 
diff --git a/sound/hda/common/hda_local.h b/sound/hda/common/hda_local.h
index 654fe1156d56..e56bea4c9357 100644
--- a/sound/hda/common/hda_local.h
+++ b/sound/hda/common/hda_local.h
@@ -658,8 +658,6 @@ static inline int snd_hda_codec_init(struct hda_codec *codec)
 
 	if (driver->ops && driver->ops->init)
 		return driver->ops->init(codec);
-	else if (codec->patch_ops.init)
-		return codec->patch_ops.init(codec);
 	return 0;
 }
 
diff --git a/sound/soc/codecs/hda.c b/sound/soc/codecs/hda.c
index ddb31001657e..126270ffd418 100644
--- a/sound/soc/codecs/hda.c
+++ b/sound/soc/codecs/hda.c
@@ -177,7 +177,6 @@ static int hda_codec_probe(struct snd_soc_component *component)
 	struct hdac_device *hdev = &codec->core;
 	struct hdac_bus *bus = hdev->bus;
 	struct hdac_ext_link *hlink;
-	hda_codec_patch_t patch;
 	int ret;
 
 #ifdef CONFIG_PM
@@ -215,19 +214,12 @@ static int hda_codec_probe(struct snd_soc_component *component)
 		goto err;
 	}
 
-	if (driver->ops && driver->ops->probe) {
-		ret = driver->ops->probe(codec, codec->preset);
-	} else {
-		patch = (hda_codec_patch_t)codec->preset->driver_data;
-		if (!patch) {
-			dev_err(&hdev->dev, "no patch specified\n");
-			ret = -EINVAL;
-			goto err;
-		}
-
-		ret = patch(codec);
+	if (WARN_ON(!(driver->ops && driver->ops->probe))) {
+		ret = -EINVAL;
+		goto err;
 	}
 
+	ret = driver->ops->probe(codec, codec->preset);
 	if (ret < 0) {
 		dev_err(&hdev->dev, "codec init failed: %d\n", ret);
 		goto err;
@@ -260,8 +252,6 @@ complete_err:
 parse_pcms_err:
 	if (driver->ops && driver->ops->remove)
 		driver->ops->remove(codec);
-	else if (codec->patch_ops.free)
-		codec->patch_ops.free(codec);
 err:
 	snd_hda_codec_cleanup_for_unbind(codec);
 device_new_err:
@@ -292,8 +282,6 @@ static void hda_codec_remove(struct snd_soc_component *component)
 
 	if (driver->ops && driver->ops->remove)
 		driver->ops->remove(codec);
-	else if (codec->patch_ops.free)
-		codec->patch_ops.free(codec);
 
 	snd_hda_codec_cleanup_for_unbind(codec);
 	pm_runtime_put_noidle(&hdev->dev);
diff --git a/sound/soc/codecs/hdac_hda.c b/sound/soc/codecs/hdac_hda.c
index 7bb7845d5e43..191cb8427664 100644
--- a/sound/soc/codecs/hdac_hda.c
+++ b/sound/soc/codecs/hdac_hda.c
@@ -411,7 +411,6 @@ static int hdac_hda_codec_probe(struct snd_soc_component *component)
 	struct hda_codec *hcodec = hda_pvt->codec;
 	struct hda_codec_driver *driver = hda_codec_to_driver(hcodec);
 	struct hdac_ext_link *hlink;
-	hda_codec_patch_t patch;
 	int ret;
 
 	hlink = snd_hdac_ext_bus_get_hlink_by_name(hdev->bus, dev_name(&hdev->dev));
@@ -485,23 +484,15 @@ static int hdac_hda_codec_probe(struct snd_soc_component *component)
 		goto error_pm;
 	}
 
-	if (driver->ops && driver->ops->probe) {
-		ret = driver->ops->probe(hcodec, hcodec->preset);
-		if (ret < 0) {
-			dev_err(&hdev->dev, "%s: probe failed %d\n", __func__, ret);
-			goto error_regmap;
-		}
-	} else {
-		patch = (hda_codec_patch_t)hcodec->preset->driver_data;
-		if (patch) {
-			ret = patch(hcodec);
-			if (ret < 0) {
-				dev_err(&hdev->dev, "%s: patch failed %d\n", __func__, ret);
-				goto error_regmap;
-			}
-		} else {
-			dev_dbg(&hdev->dev, "%s: no patch file found\n", __func__);
-		}
+	if (WARN_ON(!(driver->ops && driver->ops->probe))) {
+		ret = -EINVAL;
+		goto error_regmap;
+	}
+
+	ret = driver->ops->probe(hcodec, hcodec->preset);
+	if (ret < 0) {
+		dev_err(&hdev->dev, "%s: probe failed %d\n", __func__, ret);
+		goto error_regmap;
 	}
 
 	ret = snd_hda_codec_parse_pcms(hcodec);
@@ -542,8 +533,6 @@ static int hdac_hda_codec_probe(struct snd_soc_component *component)
 error_patch:
 	if (driver->ops && driver->ops->remove)
 		driver->ops->remove(hcodec);
-	else if (hcodec->patch_ops.free)
-		hcodec->patch_ops.free(hcodec);
 error_regmap:
 	snd_hdac_regmap_exit(hdev);
 error_pm:
@@ -573,8 +562,6 @@ static void hdac_hda_codec_remove(struct snd_soc_component *component)
 
 	if (driver->ops && driver->ops->remove)
 		driver->ops->remove(codec);
-	else if (codec->patch_ops.free)
-		codec->patch_ops.free(codec);
 
 	snd_hda_codec_cleanup_for_unbind(codec);
 }
-- 
cgit v1.2.3


From 5c21c5f22d0701ac6c1cafc0e8de4bf42e5c53e5 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 11 Jul 2025 15:47:48 +0200
Subject: cleanup: add a scoped version of CLASS()

This will make it possible to use:

scoped_class() {
}

constructs to limit variables to certain scopes and still perform
auto-cleanup.

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/cleanup.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h
index 7093e1d08af0..bee606bebaca 100644
--- a/include/linux/cleanup.h
+++ b/include/linux/cleanup.h
@@ -277,6 +277,14 @@ static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \
 	class_##_name##_t var __cleanup(class_##_name##_destructor) =	\
 		class_##_name##_constructor
 
+#define scoped_class(_name, var, args)                          \
+	for (CLASS(_name, var)(args);                           \
+	     __guard_ptr(_name)(&var) || !__is_cond_ptr(_name); \
+	     ({ goto _label; }))                                \
+		if (0) {                                        \
+_label:                                                         \
+			break;                                  \
+		} else
 
 /*
  * DEFINE_GUARD(name, type, lock, unlock):
-- 
cgit v1.2.3


From 56180dd20c19e5b0fa34822997a9ac66b517e7b3 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 10 Jul 2025 13:00:07 +0200
Subject: futex: Use RCU-based per-CPU reference counting instead of rcuref_t

The use of rcuref_t for reference counting introduces a performance bottleneck
when accessed concurrently by multiple threads during futex operations.

Replace rcuref_t with specially crafted per-CPU reference counters. The
lifetime logic remains the same.

The newly allocated private hash starts in FR_PERCPU state. In this state, each
futex operation that requires the private hash uses a per-CPU counter (an
unsigned int) for incrementing or decrementing the reference count.

When the private hash is about to be replaced, the per-CPU counters are
migrated to an atomic_t counter mm_struct::futex_atomic.
The migration process:
- Waiting for one RCU grace period to ensure all users observe the
  current private hash. This can be skipped if a grace period elapsed
  since the private hash was assigned.

- futex_private_hash::state is set to FR_ATOMIC, forcing all users to
  use mm_struct::futex_atomic for reference counting.

- After an RCU grace period, all users are guaranteed to be using the
  atomic counter. The per-CPU counters can now be summed up and added to
  the atomic_t counter. If the resulting count is zero, the hash can be
  safely replaced. Otherwise, active users still hold a valid reference.

- Once the atomic reference count drops to zero, the next futex
  operation will switch to the new private hash.

call_rcu_hurry() is used to speed up the transition, which otherwise might
be delayed with RCU_LAZY. There is nothing wrong with using call_rcu(); the
side effects would be that on auto scaling the new hash is used later
and the SET_SLOTS prctl() will block longer.

[bigeasy: commit description + mm get/ put_async]

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20250710110011.384614-3-bigeasy@linutronix.de
---
 include/linux/futex.h    |  16 +---
 include/linux/mm_types.h |   5 +
 include/linux/sched/mm.h |   2 +-
 init/Kconfig             |   4 -
 kernel/fork.c            |   8 +-
 kernel/futex/core.c      | 243 +++++++++++++++++++++++++++++++++++++++++++----
 6 files changed, 243 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/include/linux/futex.h b/include/linux/futex.h
index b37193653e6b..9e9750f04980 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -85,18 +85,12 @@ int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4)
 #ifdef CONFIG_FUTEX_PRIVATE_HASH
 int futex_hash_allocate_default(void);
 void futex_hash_free(struct mm_struct *mm);
-
-static inline void futex_mm_init(struct mm_struct *mm)
-{
-	RCU_INIT_POINTER(mm->futex_phash, NULL);
-	mm->futex_phash_new = NULL;
-	mutex_init(&mm->futex_hash_lock);
-}
+int futex_mm_init(struct mm_struct *mm);
 
 #else /* !CONFIG_FUTEX_PRIVATE_HASH */
 static inline int futex_hash_allocate_default(void) { return 0; }
-static inline void futex_hash_free(struct mm_struct *mm) { }
-static inline void futex_mm_init(struct mm_struct *mm) { }
+static inline int futex_hash_free(struct mm_struct *mm) { return 0; }
+static inline int futex_mm_init(struct mm_struct *mm) { return 0; }
 #endif /* CONFIG_FUTEX_PRIVATE_HASH */
 
 #else /* !CONFIG_FUTEX */
@@ -118,8 +112,8 @@ static inline int futex_hash_allocate_default(void)
 {
 	return 0;
 }
-static inline void futex_hash_free(struct mm_struct *mm) { }
-static inline void futex_mm_init(struct mm_struct *mm) { }
+static inline int futex_hash_free(struct mm_struct *mm) { return 0; }
+static inline int futex_mm_init(struct mm_struct *mm) { return 0; }
 
 #endif
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index d6b91e8a66d6..0f0662157066 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1070,6 +1070,11 @@ struct mm_struct {
 		struct mutex			futex_hash_lock;
 		struct futex_private_hash	__rcu *futex_phash;
 		struct futex_private_hash	*futex_phash_new;
+		/* futex-ref */
+		unsigned long			futex_batches;
+		struct rcu_head			futex_rcu;
+		atomic_long_t			futex_atomic;
+		unsigned int			__percpu *futex_ref;
 #endif
 
 		unsigned long hiwater_rss; /* High-watermark of RSS usage */
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index b13474825130..2201da0afecc 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -140,7 +140,7 @@ static inline bool mmget_not_zero(struct mm_struct *mm)
 
 /* mmput gets rid of the mappings and all user-space */
 extern void mmput(struct mm_struct *);
-#ifdef CONFIG_MMU
+#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH)
 /* same as above but performs the slow path from the async context. Can
  * be called from the atomic context as well
  */
diff --git a/init/Kconfig b/init/Kconfig
index 666783eb50ab..af4c2f085455 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1716,13 +1716,9 @@ config FUTEX_PI
 	depends on FUTEX && RT_MUTEXES
 	default y
 
-#
-# marked broken for performance reasons; gives us one more cycle to sort things out.
-#
 config FUTEX_PRIVATE_HASH
 	bool
 	depends on FUTEX && !BASE_SMALL && MMU
-	depends on BROKEN
 	default y
 
 config FUTEX_MPOL
diff --git a/kernel/fork.c b/kernel/fork.c
index 1ee8eb11f38b..0b885dcbde9a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1046,7 +1046,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	RCU_INIT_POINTER(mm->exe_file, NULL);
 	mmu_notifier_subscriptions_init(mm);
 	init_tlb_flush_pending(mm);
-	futex_mm_init(mm);
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS)
 	mm->pmd_huge_pte = NULL;
 #endif
@@ -1061,6 +1060,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 		mm->def_flags = 0;
 	}
 
+	if (futex_mm_init(mm))
+		goto fail_mm_init;
+
 	if (mm_alloc_pgd(mm))
 		goto fail_nopgd;
 
@@ -1090,6 +1092,8 @@ fail_nocontext:
 fail_noid:
 	mm_free_pgd(mm);
 fail_nopgd:
+	futex_hash_free(mm);
+fail_mm_init:
 	free_mm(mm);
 	return NULL;
 }
@@ -1145,7 +1149,7 @@ void mmput(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(mmput);
 
-#ifdef CONFIG_MMU
+#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH)
 static void mmput_async_fn(struct work_struct *work)
 {
 	struct mm_struct *mm = container_of(work, struct mm_struct,
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 90d53fb0ee9e..1dcb4c8a2585 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -42,7 +42,6 @@
 #include <linux/fault-inject.h>
 #include <linux/slab.h>
 #include <linux/prctl.h>
-#include <linux/rcuref.h>
 #include <linux/mempolicy.h>
 #include <linux/mmap_lock.h>
 
@@ -65,7 +64,7 @@ static struct {
 #define futex_queues	(__futex_data.queues)
 
 struct futex_private_hash {
-	rcuref_t	users;
+	int		state;
 	unsigned int	hash_mask;
 	struct rcu_head	rcu;
 	void		*mm;
@@ -129,6 +128,12 @@ static struct futex_hash_bucket *
 __futex_hash(union futex_key *key, struct futex_private_hash *fph);
 
 #ifdef CONFIG_FUTEX_PRIVATE_HASH
+static bool futex_ref_get(struct futex_private_hash *fph);
+static bool futex_ref_put(struct futex_private_hash *fph);
+static bool futex_ref_is_dead(struct futex_private_hash *fph);
+
+enum { FR_PERCPU = 0, FR_ATOMIC };
+
 static inline bool futex_key_is_private(union futex_key *key)
 {
 	/*
@@ -142,15 +147,14 @@ bool futex_private_hash_get(struct futex_private_hash *fph)
 {
 	if (fph->immutable)
 		return true;
-	return rcuref_get(&fph->users);
+	return futex_ref_get(fph);
 }
 
 void futex_private_hash_put(struct futex_private_hash *fph)
 {
-	/* Ignore return value, last put is verified via rcuref_is_dead() */
 	if (fph->immutable)
 		return;
-	if (rcuref_put(&fph->users))
+	if (futex_ref_put(fph))
 		wake_up_var(fph->mm);
 }
 
@@ -243,14 +247,18 @@ static bool __futex_pivot_hash(struct mm_struct *mm,
 	fph = rcu_dereference_protected(mm->futex_phash,
 					lockdep_is_held(&mm->futex_hash_lock));
 	if (fph) {
-		if (!rcuref_is_dead(&fph->users)) {
+		if (!futex_ref_is_dead(fph)) {
 			mm->futex_phash_new = new;
 			return false;
 		}
 
 		futex_rehash_private(fph, new);
 	}
-	rcu_assign_pointer(mm->futex_phash, new);
+	new->state = FR_PERCPU;
+	scoped_guard(rcu) {
+		mm->futex_batches = get_state_synchronize_rcu();
+		rcu_assign_pointer(mm->futex_phash, new);
+	}
 	kvfree_rcu(fph, rcu);
 	return true;
 }
@@ -289,9 +297,7 @@ again:
 		if (!fph)
 			return NULL;
 
-		if (fph->immutable)
-			return fph;
-		if (rcuref_get(&fph->users))
+		if (futex_private_hash_get(fph))
 			return fph;
 	}
 	futex_pivot_hash(mm);
@@ -1527,16 +1533,219 @@ static void futex_hash_bucket_init(struct futex_hash_bucket *fhb,
 #define FH_IMMUTABLE	0x02
 
 #ifdef CONFIG_FUTEX_PRIVATE_HASH
+
+/*
+ * futex-ref
+ *
+ * Heavily inspired by percpu-rwsem/percpu-refcount; not reusing any of that
+ * code because it just doesn't fit right.
+ *
+ * Dual counter, per-cpu / atomic approach like percpu-refcount, except it
+ * re-initializes the state automatically, such that the fph swizzle is also a
+ * transition back to per-cpu.
+ */
+
+static void futex_ref_rcu(struct rcu_head *head);
+
+static void __futex_ref_atomic_begin(struct futex_private_hash *fph)
+{
+	struct mm_struct *mm = fph->mm;
+
+	/*
+	 * The counter we're about to switch to must have fully switched;
+	 * otherwise it would be impossible for it to have reported success
+	 * from futex_ref_is_dead().
+	 */
+	WARN_ON_ONCE(atomic_long_read(&mm->futex_atomic) != 0);
+
+	/*
+	 * Set the atomic to the bias value such that futex_ref_{get,put}()
+	 * will never observe 0. Will be fixed up in __futex_ref_atomic_end()
+	 * when folding in the percpu count.
+	 */
+	atomic_long_set(&mm->futex_atomic, LONG_MAX);
+	smp_store_release(&fph->state, FR_ATOMIC);
+
+	call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu);
+}
+
+static void __futex_ref_atomic_end(struct futex_private_hash *fph)
+{
+	struct mm_struct *mm = fph->mm;
+	unsigned int count = 0;
+	long ret;
+	int cpu;
+
+	/*
+	 * Per __futex_ref_atomic_begin() the state of the fph must be ATOMIC
+	 * and per this RCU callback, everybody must now observe this state and
+	 * use the atomic variable.
+	 */
+	WARN_ON_ONCE(fph->state != FR_ATOMIC);
+
+	/*
+	 * Therefore the per-cpu counter is now stable, sum and reset.
+	 */
+	for_each_possible_cpu(cpu) {
+		unsigned int *ptr = per_cpu_ptr(mm->futex_ref, cpu);
+		count += *ptr;
+		*ptr = 0;
+	}
+
+	/*
+	 * Re-init for the next cycle.
+	 */
+	this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */
+
+	/*
+	 * Add actual count, subtract bias and initial refcount.
+	 *
+	 * The moment this atomic operation happens, futex_ref_is_dead() can
+	 * become true.
+	 */
+	ret = atomic_long_add_return(count - LONG_MAX - 1, &mm->futex_atomic);
+	if (!ret)
+		wake_up_var(mm);
+
+	WARN_ON_ONCE(ret < 0);
+	mmput_async(mm);
+}
+
+static void futex_ref_rcu(struct rcu_head *head)
+{
+	struct mm_struct *mm = container_of(head, struct mm_struct, futex_rcu);
+	struct futex_private_hash *fph = rcu_dereference_raw(mm->futex_phash);
+
+	if (fph->state == FR_PERCPU) {
+		/*
+		 * Per this extra grace-period, everybody must now observe
+		 * fph as the current fph and no previously observed fph's
+		 * are in-flight.
+		 *
+		 * Notably, nobody will now rely on the atomic
+		 * futex_ref_is_dead() state anymore so we can begin the
+		 * migration of the per-cpu counter into the atomic.
+		 */
+		__futex_ref_atomic_begin(fph);
+		return;
+	}
+
+	__futex_ref_atomic_end(fph);
+}
+
+/*
+ * Drop the initial refcount and transition to atomics.
+ */
+static void futex_ref_drop(struct futex_private_hash *fph)
+{
+	struct mm_struct *mm = fph->mm;
+
+	/*
+	 * Can only transition the current fph;
+	 */
+	WARN_ON_ONCE(rcu_dereference_raw(mm->futex_phash) != fph);
+	/*
+	 * We enqueue at least one RCU callback. Ensure mm stays if the task
+	 * exits before the transition is completed.
+	 */
+	mmget(mm);
+
+	/*
+	 * In order to avoid the following scenario:
+	 *
+	 * futex_hash()			__futex_pivot_hash()
+	 *   guard(rcu);		  guard(mm->futex_hash_lock);
+	 *   fph = mm->futex_phash;
+	 *				  rcu_assign_pointer(&mm->futex_phash, new);
+	 *				futex_hash_allocate()
+	 *				  futex_ref_drop()
+	 *				    fph->state = FR_ATOMIC;
+	 *				    atomic_set(, BIAS);
+	 *
+	 *   futex_private_hash_get(fph); // OOPS
+	 *
+	 * Where an old fph (which is FR_ATOMIC) and should fail on
+	 * inc_not_zero, will succeed because a new transition is started and
+	 * the atomic is bias'ed away from 0.
+	 *
+	 * There must be at least one full grace-period between publishing a
+	 * new fph and trying to replace it.
+	 */
+	if (poll_state_synchronize_rcu(mm->futex_batches)) {
+		/*
+		 * There was a grace-period, we can begin now.
+		 */
+		__futex_ref_atomic_begin(fph);
+		return;
+	}
+
+	call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu);
+}
+
+static bool futex_ref_get(struct futex_private_hash *fph)
+{
+	struct mm_struct *mm = fph->mm;
+
+	guard(rcu)();
+
+	if (smp_load_acquire(&fph->state) == FR_PERCPU) {
+		this_cpu_inc(*mm->futex_ref);
+		return true;
+	}
+
+	return atomic_long_inc_not_zero(&mm->futex_atomic);
+}
+
+static bool futex_ref_put(struct futex_private_hash *fph)
+{
+	struct mm_struct *mm = fph->mm;
+
+	guard(rcu)();
+
+	if (smp_load_acquire(&fph->state) == FR_PERCPU) {
+		this_cpu_dec(*mm->futex_ref);
+		return false;
+	}
+
+	return atomic_long_dec_and_test(&mm->futex_atomic);
+}
+
+static bool futex_ref_is_dead(struct futex_private_hash *fph)
+{
+	struct mm_struct *mm = fph->mm;
+
+	guard(rcu)();
+
+	if (smp_load_acquire(&fph->state) == FR_PERCPU)
+		return false;
+
+	return atomic_long_read(&mm->futex_atomic) == 0;
+}
+
+int futex_mm_init(struct mm_struct *mm)
+{
+	mutex_init(&mm->futex_hash_lock);
+	RCU_INIT_POINTER(mm->futex_phash, NULL);
+	mm->futex_phash_new = NULL;
+	/* futex-ref */
+	atomic_long_set(&mm->futex_atomic, 0);
+	mm->futex_batches = get_state_synchronize_rcu();
+	mm->futex_ref = alloc_percpu(unsigned int);
+	if (!mm->futex_ref)
+		return -ENOMEM;
+	this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */
+	return 0;
+}
+
 void futex_hash_free(struct mm_struct *mm)
 {
 	struct futex_private_hash *fph;
 
+	free_percpu(mm->futex_ref);
 	kvfree(mm->futex_phash_new);
 	fph = rcu_dereference_raw(mm->futex_phash);
-	if (fph) {
-		WARN_ON_ONCE(rcuref_read(&fph->users) > 1);
+	if (fph)
 		kvfree(fph);
-	}
 }
 
 static bool futex_pivot_pending(struct mm_struct *mm)
@@ -1549,7 +1758,7 @@ static bool futex_pivot_pending(struct mm_struct *mm)
 		return true;
 
 	fph = rcu_dereference(mm->futex_phash);
-	return rcuref_is_dead(&fph->users);
+	return futex_ref_is_dead(fph);
 }
 
 static bool futex_hash_less(struct futex_private_hash *a,
@@ -1598,11 +1807,11 @@ static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags)
 		}
 	}
 
-	fph = kvzalloc(struct_size(fph, queues, hash_slots), GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+	fph = kvzalloc(struct_size(fph, queues, hash_slots),
+		       GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
 	if (!fph)
 		return -ENOMEM;
 
-	rcuref_init(&fph->users, 1);
 	fph->hash_mask = hash_slots ? hash_slots - 1 : 0;
 	fph->custom = custom;
 	fph->immutable = !!(flags & FH_IMMUTABLE);
@@ -1645,7 +1854,7 @@ again:
 				 * allocated a replacement hash, drop the initial
 				 * reference on the existing hash.
 				 */
-				futex_private_hash_put(cur);
+				futex_ref_drop(cur);
 			}
 
 			if (new) {
-- 
cgit v1.2.3


From 760e6f7befbab9a84c54457a8ee45313b7b91ee5 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 10 Jul 2025 13:00:09 +0200
Subject: futex: Remove support for IMMUTABLE

The FH_FLAG_IMMUTABLE flag was meant to avoid the reference counting on
the private hash and so to avoid the performance regression on big
machines.
With the switch to per-CPU counter this is no longer needed. That flag
was never useable on any released kernel.

Remove all support for IMMUTABLE while preserving the flags argument and
enforcing that it is zero.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20250710110011.384614-5-bigeasy@linutronix.de
---
 include/uapi/linux/prctl.h |  2 --
 kernel/futex/core.c        | 36 +++---------------------------------
 2 files changed, 3 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 43dec6eed559..3b93fb906e3c 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -367,8 +367,6 @@ struct prctl_mm_map {
 /* FUTEX hash management */
 #define PR_FUTEX_HASH			78
 # define PR_FUTEX_HASH_SET_SLOTS	1
-# define FH_FLAG_IMMUTABLE		(1ULL << 0)
 # define PR_FUTEX_HASH_GET_SLOTS	2
-# define PR_FUTEX_HASH_GET_IMMUTABLE	3
 
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 1981574a459d..d9bb5567af0c 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -69,7 +69,6 @@ struct futex_private_hash {
 	struct rcu_head	rcu;
 	void		*mm;
 	bool		custom;
-	bool		immutable;
 	struct futex_hash_bucket queues[];
 };
 
@@ -145,15 +144,11 @@ static inline bool futex_key_is_private(union futex_key *key)
 
 static bool futex_private_hash_get(struct futex_private_hash *fph)
 {
-	if (fph->immutable)
-		return true;
 	return futex_ref_get(fph);
 }
 
 void futex_private_hash_put(struct futex_private_hash *fph)
 {
-	if (fph->immutable)
-		return;
 	if (futex_ref_put(fph))
 		wake_up_var(fph->mm);
 }
@@ -1530,7 +1525,6 @@ static void futex_hash_bucket_init(struct futex_hash_bucket *fhb,
 }
 
 #define FH_CUSTOM	0x01
-#define FH_IMMUTABLE	0x02
 
 #ifdef CONFIG_FUTEX_PRIVATE_HASH
 
@@ -1800,7 +1794,7 @@ static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags)
 	 */
 	scoped_guard(rcu) {
 		fph = rcu_dereference(mm->futex_phash);
-		if (fph && (!fph->hash_mask || fph->immutable)) {
+		if (fph && !fph->hash_mask) {
 			if (custom)
 				return -EBUSY;
 			return 0;
@@ -1814,7 +1808,6 @@ static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags)
 
 	fph->hash_mask = hash_slots ? hash_slots - 1 : 0;
 	fph->custom = custom;
-	fph->immutable = !!(flags & FH_IMMUTABLE);
 	fph->mm = mm;
 
 	for (i = 0; i < hash_slots; i++)
@@ -1838,7 +1831,7 @@ again:
 		mm->futex_phash_new = NULL;
 
 		if (fph) {
-			if (cur && (!cur->hash_mask || cur->immutable)) {
+			if (cur && !cur->hash_mask) {
 				/*
 				 * If two threads simultaneously request the global
 				 * hash then the first one performs the switch,
@@ -1931,19 +1924,6 @@ static int futex_hash_get_slots(void)
 	return 0;
 }
 
-static int futex_hash_get_immutable(void)
-{
-	struct futex_private_hash *fph;
-
-	guard(rcu)();
-	fph = rcu_dereference(current->mm->futex_phash);
-	if (fph && fph->immutable)
-		return 1;
-	if (fph && !fph->hash_mask)
-		return 1;
-	return 0;
-}
-
 #else
 
 static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags)
@@ -1956,10 +1936,6 @@ static int futex_hash_get_slots(void)
 	return 0;
 }
 
-static int futex_hash_get_immutable(void)
-{
-	return 0;
-}
 #endif
 
 int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4)
@@ -1969,10 +1945,8 @@ int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4)
 
 	switch (arg2) {
 	case PR_FUTEX_HASH_SET_SLOTS:
-		if (arg4 & ~FH_FLAG_IMMUTABLE)
+		if (arg4)
 			return -EINVAL;
-		if (arg4 & FH_FLAG_IMMUTABLE)
-			flags |= FH_IMMUTABLE;
 		ret = futex_hash_allocate(arg3, flags);
 		break;
 
@@ -1980,10 +1954,6 @@ int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4)
 		ret = futex_hash_get_slots();
 		break;
 
-	case PR_FUTEX_HASH_GET_IMMUTABLE:
-		ret = futex_hash_get_immutable();
-		break;
-
 	default:
 		ret = -EINVAL;
 		break;
-- 
cgit v1.2.3


From ed42eee797ff3dc889ade63c1dd7c4f430699e23 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Wed, 9 Jul 2025 22:59:04 -0700
Subject: iommufd/viommu: Add driver-defined vDEVICE support

NVIDIA VCMDQ driver will have a driver-defined vDEVICE structure and do
some HW configurations with that.

To allow IOMMU drivers to define their own vDEVICE structures, move the
struct iommufd_vdevice to the public header and provide a pair of viommu
ops, similar to get_viommu_size and viommu_init.

Doing this, however, creates a new window between the vDEVICE allocation
and its driver-level initialization, during which an abort could happen
but it can't invoke a driver destroy function from the struct viommu_ops
since the driver structure isn't initialized yet. vIOMMU object doesn't
have this problem, since its destroy op is set via the viommu_ops by the
driver viommu_init function. Thus, vDEVICE should do something similar:
add a destroy function pointer inside the struct iommufd_vdevice instead
of the struct iommufd_viommu_ops.

Note that a use case for a type-dependent vDEVICE is unlikely, so
a static vdevice_size is probably enough for the near term instead of a
get_vdevice_size function op.

Link: https://patch.msgid.link/r/1e751c01da7863c669314d8e27fdb89eabcf5605.1752126748.git.nicolinc@nvidia.com
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Pranjal Shrivastava <praan@google.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Vasant Hegde <vasant.hegde@amd.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/iommufd/iommufd_private.h | 12 ------------
 drivers/iommu/iommufd/viommu.c          | 26 +++++++++++++++++++++++++-
 include/linux/iommufd.h                 | 31 +++++++++++++++++++++++++++++++
 3 files changed, 56 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 9fdbf5f21f2e..06b8c2e2d9e6 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -653,18 +653,6 @@ void iommufd_viommu_destroy(struct iommufd_object *obj);
 int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd);
 void iommufd_vdevice_destroy(struct iommufd_object *obj);
 
-struct iommufd_vdevice {
-	struct iommufd_object obj;
-	struct iommufd_viommu *viommu;
-	struct device *dev;
-
-	/*
-	 * Virtual device ID per vIOMMU, e.g. vSID of ARM SMMUv3, vDeviceID of
-	 * AMD IOMMU, and vRID of Intel VT-d
-	 */
-	u64 virt_id;
-};
-
 #ifdef CONFIG_IOMMUFD_TEST
 int iommufd_test(struct iommufd_ucmd *ucmd);
 void iommufd_selftest_destroy(struct iommufd_object *obj);
diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
index c0365849f849..081ee6697a11 100644
--- a/drivers/iommu/iommufd/viommu.c
+++ b/drivers/iommu/iommufd/viommu.c
@@ -116,6 +116,8 @@ void iommufd_vdevice_destroy(struct iommufd_object *obj)
 		container_of(obj, struct iommufd_vdevice, obj);
 	struct iommufd_viommu *viommu = vdev->viommu;
 
+	if (vdev->destroy)
+		vdev->destroy(vdev);
 	/* xa_cmpxchg is okay to fail if alloc failed xa_cmpxchg previously */
 	xa_cmpxchg(&viommu->vdevs, vdev->virt_id, vdev, NULL, GFP_KERNEL);
 	refcount_dec(&viommu->obj.users);
@@ -126,6 +128,7 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
 {
 	struct iommu_vdevice_alloc *cmd = ucmd->cmd;
 	struct iommufd_vdevice *vdev, *curr;
+	size_t vdev_size = sizeof(*vdev);
 	struct iommufd_viommu *viommu;
 	struct iommufd_device *idev;
 	u64 virt_id = cmd->virt_id;
@@ -150,7 +153,22 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
 		goto out_put_idev;
 	}
 
-	vdev = iommufd_object_alloc_ucmd(ucmd, vdev, IOMMUFD_OBJ_VDEVICE);
+	if (viommu->ops && viommu->ops->vdevice_size) {
+		/*
+		 * It is a driver bug for:
+		 * - ops->vdevice_size smaller than the core structure size
+		 * - not implementing a pairing ops->vdevice_init op
+		 */
+		if (WARN_ON_ONCE(viommu->ops->vdevice_size < vdev_size ||
+				 !viommu->ops->vdevice_init)) {
+			rc = -EOPNOTSUPP;
+			goto out_put_idev;
+		}
+		vdev_size = viommu->ops->vdevice_size;
+	}
+
+	vdev = (struct iommufd_vdevice *)_iommufd_object_alloc_ucmd(
+		ucmd, vdev_size, IOMMUFD_OBJ_VDEVICE);
 	if (IS_ERR(vdev)) {
 		rc = PTR_ERR(vdev);
 		goto out_put_idev;
@@ -168,6 +186,12 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
 		goto out_put_idev;
 	}
 
+	if (viommu->ops && viommu->ops->vdevice_init) {
+		rc = viommu->ops->vdevice_init(vdev);
+		if (rc)
+			goto out_put_idev;
+	}
+
 	cmd->out_vdevice_id = vdev->obj.id;
 	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
 
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index 2d1bf2f97ee3..bdd10a85eeef 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -104,6 +104,21 @@ struct iommufd_viommu {
 	enum iommu_viommu_type type;
 };
 
+struct iommufd_vdevice {
+	struct iommufd_object obj;
+	struct iommufd_viommu *viommu;
+	struct device *dev;
+
+	/*
+	 * Virtual device ID per vIOMMU, e.g. vSID of ARM SMMUv3, vDeviceID of
+	 * AMD IOMMU, and vRID of Intel VT-d
+	 */
+	u64 virt_id;
+
+	/* Clean up all driver-specific parts of an iommufd_vdevice */
+	void (*destroy)(struct iommufd_vdevice *vdev);
+};
+
 /**
  * struct iommufd_viommu_ops - vIOMMU specific operations
  * @destroy: Clean up all driver-specific parts of an iommufd_viommu. The memory
@@ -120,6 +135,14 @@ struct iommufd_viommu {
  *                    array->entry_num to report the number of handled requests.
  *                    The data structure of the array entry must be defined in
  *                    include/uapi/linux/iommufd.h
+ * @vdevice_size: Size of the driver-defined vDEVICE structure per this vIOMMU
+ * @vdevice_init: Initialize the driver-level structure of a vDEVICE object, or
+ *                related HW procedure. @vdev is already initialized by iommufd
+ *                core: vdev->dev and vdev->viommu pointers; vdev->virt_id is a
+ *                per-vIOMMU virtual ID (refer to struct iommu_vdevice_alloc in
+ *                include/uapi/linux/iommufd.h)
+ *                If driver has a deinit function to revert what vdevice_init op
+ *                does, it should set it to the @vdev->destroy function pointer
  */
 struct iommufd_viommu_ops {
 	void (*destroy)(struct iommufd_viommu *viommu);
@@ -128,6 +151,8 @@ struct iommufd_viommu_ops {
 		const struct iommu_user_data *user_data);
 	int (*cache_invalidate)(struct iommufd_viommu *viommu,
 				struct iommu_user_data_array *array);
+	const size_t vdevice_size;
+	int (*vdevice_init)(struct iommufd_vdevice *vdev);
 };
 
 #if IS_ENABLED(CONFIG_IOMMUFD)
@@ -224,4 +249,10 @@ static inline int iommufd_viommu_report_event(struct iommufd_viommu *viommu,
 	 BUILD_BUG_ON_ZERO(offsetof(drv_struct, member)) +                     \
 	 BUILD_BUG_ON_ZERO(!__same_type(struct iommufd_viommu,                 \
 					((drv_struct *)NULL)->member)))
+
+#define VDEVICE_STRUCT_SIZE(drv_struct, member)                                \
+	(sizeof(drv_struct) +                                                  \
+	 BUILD_BUG_ON_ZERO(offsetof(drv_struct, member)) +                     \
+	 BUILD_BUG_ON_ZERO(!__same_type(struct iommufd_vdevice,                \
+					((drv_struct *)NULL)->member)))
 #endif
-- 
cgit v1.2.3


From e2e9360022585c21dc30d2b19f5866c252f40806 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Wed, 9 Jul 2025 22:59:05 -0700
Subject: iommufd/viommu: Introduce IOMMUFD_OBJ_HW_QUEUE and its related struct

Add IOMMUFD_OBJ_HW_QUEUE with an iommufd_hw_queue structure, representing
a HW-accelerated queue type of IOMMU's physical queue that can be passed
through to a user space VM for direct hardware control, such as:
 - NVIDIA's Virtual Command Queue
 - AMD vIOMMU's Command Buffer, Event Log Buffers, and PPR Log Buffers

Add new viommu ops for iommufd to communicate with IOMMU drivers to fetch
supported HW queue structure size and to forward user space ioctls to the
IOMMU drivers for initialization/destroy.

As for the existing HWs, NVIDIA's VCMDQs access the guest memory via physical
addresses, while AMD's Buffers access the guest memory via guest physical
addresses (i.e. iova of the nesting parent HWPT). Separate two mutually
exclusive hw_queue_init and hw_queue_init_phys ops to indicate whether a
vIOMMU HW accesses the guest queue in the guest physical space (via iova)
or the host physical space (via pa).

In the latter case, the iommufd core will validate the physical pages of a
given guest queue, to ensure the underlying physical pages are contiguous
and pinned.

Since this is introduced with NVIDIA's VCMDQs, add hw_queue_init_phys for
now, and leave some notes for hw_queue_init in the near future (for AMD).

Either NVIDIA's or AMD's HW is a multi-queue model: NVIDIA's will be only
one type in enum iommu_hw_queue_type, while AMD's will be three different
types (two of which will have multi queues). Compared to letting the core
manage multiple queues with three types per vIOMMU object, it'd be easier
for the driver to manage that by having three different driver-structure
arrays per vIOMMU object. Thus, pass in the index to the init op.

Link: https://patch.msgid.link/r/6939b73699e278e60ce167e911b3d9be68882bad.1752126748.git.nicolinc@nvidia.com
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Pranjal Shrivastava <praan@google.com>
Reviewed-by: Vasant Hegde <vasant.hegde@amd.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 include/linux/iommufd.h      | 42 ++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/iommufd.h |  9 +++++++++
 2 files changed, 51 insertions(+)

(limited to 'include')

diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index bdd10a85eeef..f13f3ca6adb5 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -37,6 +37,7 @@ enum iommufd_object_type {
 	IOMMUFD_OBJ_VIOMMU,
 	IOMMUFD_OBJ_VDEVICE,
 	IOMMUFD_OBJ_VEVENTQ,
+	IOMMUFD_OBJ_HW_QUEUE,
 #ifdef CONFIG_IOMMUFD_TEST
 	IOMMUFD_OBJ_SELFTEST,
 #endif
@@ -119,6 +120,19 @@ struct iommufd_vdevice {
 	void (*destroy)(struct iommufd_vdevice *vdev);
 };
 
+struct iommufd_hw_queue {
+	struct iommufd_object obj;
+	struct iommufd_viommu *viommu;
+
+	u64 base_addr; /* in guest physical address space */
+	size_t length;
+
+	enum iommu_hw_queue_type type;
+
+	/* Clean up all driver-specific parts of an iommufd_hw_queue */
+	void (*destroy)(struct iommufd_hw_queue *hw_queue);
+};
+
 /**
  * struct iommufd_viommu_ops - vIOMMU specific operations
  * @destroy: Clean up all driver-specific parts of an iommufd_viommu. The memory
@@ -143,6 +157,22 @@ struct iommufd_vdevice {
  *                include/uapi/linux/iommufd.h)
  *                If driver has a deinit function to revert what vdevice_init op
  *                does, it should set it to the @vdev->destroy function pointer
+ * @get_hw_queue_size: Get the size of a driver-defined HW queue structure for a
+ *                     given @viommu corresponding to @queue_type. Driver should
+ *                     return 0 if HW queue isn't supported accordingly. It is
+ *                     required for driver to use the HW_QUEUE_STRUCT_SIZE macro
+ *                     to sanitize the driver-level HW queue structure related
+ *                     to the core one
+ * @hw_queue_init_phys: Initialize the driver-level structure of a HW queue that
+ *                      is initialized with its core-level structure that holds
+ *                      all the info about a guest queue memory.
+ *                      Driver providing this op indicates that HW accesses the
+ *                      guest queue memory via physical addresses.
+ *                      @index carries the logical HW QUEUE ID per vIOMMU in a
+ *                      guest VM, for a multi-queue model. @base_addr_pa carries
+ *                      the physical location of the guest queue
+ *                      If driver has a deinit function to revert what this op
+ *                      does, it should set it to the @hw_queue->destroy pointer
  */
 struct iommufd_viommu_ops {
 	void (*destroy)(struct iommufd_viommu *viommu);
@@ -153,6 +183,11 @@ struct iommufd_viommu_ops {
 				struct iommu_user_data_array *array);
 	const size_t vdevice_size;
 	int (*vdevice_init)(struct iommufd_vdevice *vdev);
+	size_t (*get_hw_queue_size)(struct iommufd_viommu *viommu,
+				    enum iommu_hw_queue_type queue_type);
+	/* AMD's HW will add hw_queue_init simply using @hw_queue->base_addr */
+	int (*hw_queue_init_phys)(struct iommufd_hw_queue *hw_queue, u32 index,
+				  phys_addr_t base_addr_pa);
 };
 
 #if IS_ENABLED(CONFIG_IOMMUFD)
@@ -255,4 +290,11 @@ static inline int iommufd_viommu_report_event(struct iommufd_viommu *viommu,
 	 BUILD_BUG_ON_ZERO(offsetof(drv_struct, member)) +                     \
 	 BUILD_BUG_ON_ZERO(!__same_type(struct iommufd_vdevice,                \
 					((drv_struct *)NULL)->member)))
+
+#define HW_QUEUE_STRUCT_SIZE(drv_struct, member)                               \
+	(sizeof(drv_struct) +                                                  \
+	 BUILD_BUG_ON_ZERO(offsetof(drv_struct, member)) +                     \
+	 BUILD_BUG_ON_ZERO(!__same_type(struct iommufd_hw_queue,               \
+					((drv_struct *)NULL)->member)))
+
 #endif
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 04eee77335cf..640a8b5147c2 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -1147,4 +1147,13 @@ struct iommu_veventq_alloc {
 	__u32 __reserved;
 };
 #define IOMMU_VEVENTQ_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VEVENTQ_ALLOC)
+
+/**
+ * enum iommu_hw_queue_type - HW Queue Type
+ * @IOMMU_HW_QUEUE_TYPE_DEFAULT: Reserved for future use
+ */
+enum iommu_hw_queue_type {
+	IOMMU_HW_QUEUE_TYPE_DEFAULT = 0,
+};
+
 #endif
-- 
cgit v1.2.3


From 2238ddc2b0560734c2dabb1c1fb4b342b5193625 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Wed, 9 Jul 2025 22:59:06 -0700
Subject: iommufd/viommu: Add IOMMUFD_CMD_HW_QUEUE_ALLOC ioctl

Introduce a new IOMMUFD_CMD_HW_QUEUE_ALLOC ioctl for user space to allocate
a HW QUEUE object for a vIOMMU specific HW-accelerated queue, e.g.:
 - NVIDIA's Virtual Command Queue
 - AMD vIOMMU's Command Buffer, Event Log Buffers, and PPR Log Buffers

Since this is introduced with NVIDIA's VCMDQs that access the guest memory
in the physical address space, add an iommufd_hw_queue_alloc_phys() helper
that will create an access object to the queue memory in the IOAS, to avoid
the mappings of the guest memory from being unmapped, during the life cycle
of the HW queue object.

AMD's HW will need an hw_queue_init op that is mutually exclusive with the
hw_queue_init_phys op, and their case will bypass the access part, i.e. no
iommufd_hw_queue_alloc_phys() call.

Link: https://patch.msgid.link/r/dab4ace747deb46c1fe70a5c663307f46990ae56.1752126748.git.nicolinc@nvidia.com
Reviewed-by: Pranjal Shrivastava <praan@google.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/iommufd/iommufd_private.h |   2 +
 drivers/iommu/iommufd/main.c            |   6 ++
 drivers/iommu/iommufd/viommu.c          | 180 ++++++++++++++++++++++++++++++++
 include/linux/iommufd.h                 |   1 +
 include/uapi/linux/iommufd.h            |  34 ++++++
 5 files changed, 223 insertions(+)

(limited to 'include')

diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 06b8c2e2d9e6..dcd609573244 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -652,6 +652,8 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd);
 void iommufd_viommu_destroy(struct iommufd_object *obj);
 int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd);
 void iommufd_vdevice_destroy(struct iommufd_object *obj);
+int iommufd_hw_queue_alloc_ioctl(struct iommufd_ucmd *ucmd);
+void iommufd_hw_queue_destroy(struct iommufd_object *obj);
 
 #ifdef CONFIG_IOMMUFD_TEST
 int iommufd_test(struct iommufd_ucmd *ucmd);
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 778694d7c207..4e8dbbfac890 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -354,6 +354,7 @@ union ucmd_buffer {
 	struct iommu_destroy destroy;
 	struct iommu_fault_alloc fault;
 	struct iommu_hw_info info;
+	struct iommu_hw_queue_alloc hw_queue;
 	struct iommu_hwpt_alloc hwpt;
 	struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap;
 	struct iommu_hwpt_invalidate cache;
@@ -396,6 +397,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
 		 struct iommu_fault_alloc, out_fault_fd),
 	IOCTL_OP(IOMMU_GET_HW_INFO, iommufd_get_hw_info, struct iommu_hw_info,
 		 __reserved),
+	IOCTL_OP(IOMMU_HW_QUEUE_ALLOC, iommufd_hw_queue_alloc_ioctl,
+		 struct iommu_hw_queue_alloc, length),
 	IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc,
 		 __reserved),
 	IOCTL_OP(IOMMU_HWPT_GET_DIRTY_BITMAP, iommufd_hwpt_get_dirty_bitmap,
@@ -559,6 +562,9 @@ static const struct iommufd_object_ops iommufd_object_ops[] = {
 	[IOMMUFD_OBJ_FAULT] = {
 		.destroy = iommufd_fault_destroy,
 	},
+	[IOMMUFD_OBJ_HW_QUEUE] = {
+		.destroy = iommufd_hw_queue_destroy,
+	},
 	[IOMMUFD_OBJ_HWPT_PAGING] = {
 		.destroy = iommufd_hwpt_paging_destroy,
 		.abort = iommufd_hwpt_paging_abort,
diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
index 081ee6697a11..91339f799916 100644
--- a/drivers/iommu/iommufd/viommu.c
+++ b/drivers/iommu/iommufd/viommu.c
@@ -201,3 +201,183 @@ out_put_viommu:
 	iommufd_put_object(ucmd->ictx, &viommu->obj);
 	return rc;
 }
+
+static void iommufd_hw_queue_destroy_access(struct iommufd_ctx *ictx,
+					    struct iommufd_access *access,
+					    u64 base_iova, size_t length)
+{
+	u64 aligned_iova = PAGE_ALIGN_DOWN(base_iova);
+	u64 offset = base_iova - aligned_iova;
+
+	iommufd_access_unpin_pages(access, aligned_iova,
+				   PAGE_ALIGN(length + offset));
+	iommufd_access_detach_internal(access);
+	iommufd_access_destroy_internal(ictx, access);
+}
+
+void iommufd_hw_queue_destroy(struct iommufd_object *obj)
+{
+	struct iommufd_hw_queue *hw_queue =
+		container_of(obj, struct iommufd_hw_queue, obj);
+
+	if (hw_queue->destroy)
+		hw_queue->destroy(hw_queue);
+	if (hw_queue->access)
+		iommufd_hw_queue_destroy_access(hw_queue->viommu->ictx,
+						hw_queue->access,
+						hw_queue->base_addr,
+						hw_queue->length);
+	if (hw_queue->viommu)
+		refcount_dec(&hw_queue->viommu->obj.users);
+}
+
+/*
+ * When the HW accesses the guest queue via physical addresses, its underlying
+ * physical pages must be contiguous. Also, IOMMUFD_CMD_IOAS_UNMAP could remove
+ * the queue's mappings from the nesting parent iopt while the HW still accesses
+ * the memory physically, so create an access that pins the pages to prevent it.
+ * On success, return the access (pages left pinned) and store the physical base
+ * address in @base_pa; on failure, undo everything and return an ERR_PTR().
+ */
+static struct iommufd_access *
+iommufd_hw_queue_alloc_phys(struct iommu_hw_queue_alloc *cmd,
+			    struct iommufd_viommu *viommu, phys_addr_t *base_pa)
+{
+	u64 aligned_iova = PAGE_ALIGN_DOWN(cmd->nesting_parent_iova);
+	u64 offset = cmd->nesting_parent_iova - aligned_iova;
+	struct iommufd_access *access;
+	struct page **pages;
+	size_t max_npages;
+	size_t length;
+	size_t i;
+	int rc;
+
+	/* max_npages = DIV_ROUND_UP(offset + cmd->length, PAGE_SIZE) */
+	if (check_add_overflow(offset, cmd->length, &length))
+		return ERR_PTR(-ERANGE);
+	if (check_add_overflow(length, PAGE_SIZE - 1, &length))
+		return ERR_PTR(-ERANGE);
+	max_npages = length / PAGE_SIZE;
+	/* length needs to be page aligned too */
+	length = max_npages * PAGE_SIZE;
+
+	/*
+	 * Use kvcalloc() to avoid memory fragmentation for a large page array.
+	 * Set __GFP_NOWARN to avoid syzkaller blowups
+	 */
+	pages = kvcalloc(max_npages, sizeof(*pages), GFP_KERNEL | __GFP_NOWARN);
+	if (!pages)
+		return ERR_PTR(-ENOMEM);
+
+	access = iommufd_access_create_internal(viommu->ictx);
+	if (IS_ERR(access)) {
+		rc = PTR_ERR(access);
+		goto out_free;
+	}
+
+	rc = iommufd_access_attach_internal(access, viommu->hwpt->ioas);
+	if (rc)
+		goto out_destroy;
+
+	rc = iommufd_access_pin_pages(access, aligned_iova, length, pages, 0);
+	if (rc)
+		goto out_detach;
+
+	/* Validate if the underlying physical pages are contiguous */
+	for (i = 1; i < max_npages; i++) {
+		if (page_to_pfn(pages[i]) == page_to_pfn(pages[i - 1]) + 1)
+			continue;
+		rc = -EFAULT;
+		goto out_unpin;
+	}
+
+	*base_pa = (page_to_pfn(pages[0]) << PAGE_SHIFT) + offset;
+	kvfree(pages); /* kvcalloc() memory may be vmalloc-backed */
+	return access;
+
+out_unpin:
+	iommufd_access_unpin_pages(access, aligned_iova, length);
+out_detach:
+	iommufd_access_detach_internal(access);
+out_destroy:
+	iommufd_access_destroy_internal(viommu->ictx, access);
+out_free:
+	kvfree(pages); /* pairs with kvcalloc() above */
+	return ERR_PTR(rc);
+}
+
+int iommufd_hw_queue_alloc_ioctl(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_hw_queue_alloc *cmd = ucmd->cmd;
+	struct iommufd_hw_queue *hw_queue;
+	struct iommufd_viommu *viommu;
+	struct iommufd_access *access;
+	size_t hw_queue_size;
+	phys_addr_t base_pa;
+	u64 last;
+	int rc;
+
+	if (cmd->flags || cmd->type == IOMMU_HW_QUEUE_TYPE_DEFAULT)
+		return -EOPNOTSUPP;
+	if (!cmd->length)
+		return -EINVAL;
+	if (check_add_overflow(cmd->nesting_parent_iova, cmd->length - 1,
+			       &last))
+		return -EOVERFLOW;
+
+	viommu = iommufd_get_viommu(ucmd, cmd->viommu_id);
+	if (IS_ERR(viommu))
+		return PTR_ERR(viommu);
+
+	if (!viommu->ops || !viommu->ops->get_hw_queue_size ||
+	    !viommu->ops->hw_queue_init_phys) {
+		rc = -EOPNOTSUPP;
+		goto out_put_viommu;
+	}
+
+	hw_queue_size = viommu->ops->get_hw_queue_size(viommu, cmd->type);
+	if (!hw_queue_size) {
+		rc = -EOPNOTSUPP;
+		goto out_put_viommu;
+	}
+
+	/*
+	 * It is a driver bug for providing a hw_queue_size smaller than the
+	 * core HW queue structure size
+	 */
+	if (WARN_ON_ONCE(hw_queue_size < sizeof(*hw_queue))) {
+		rc = -EOPNOTSUPP;
+		goto out_put_viommu;
+	}
+
+	hw_queue = (struct iommufd_hw_queue *)_iommufd_object_alloc_ucmd(
+		ucmd, hw_queue_size, IOMMUFD_OBJ_HW_QUEUE);
+	if (IS_ERR(hw_queue)) {
+		rc = PTR_ERR(hw_queue);
+		goto out_put_viommu;
+	}
+
+	access = iommufd_hw_queue_alloc_phys(cmd, viommu, &base_pa);
+	if (IS_ERR(access)) {
+		rc = PTR_ERR(access);
+		goto out_put_viommu;
+	}
+
+	hw_queue->viommu = viommu;
+	refcount_inc(&viommu->obj.users);
+	hw_queue->access = access;
+	hw_queue->type = cmd->type;
+	hw_queue->length = cmd->length;
+	hw_queue->base_addr = cmd->nesting_parent_iova;
+
+	rc = viommu->ops->hw_queue_init_phys(hw_queue, cmd->index, base_pa);
+	if (rc)
+		goto out_put_viommu;
+
+	cmd->out_hw_queue_id = hw_queue->obj.id;
+	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+
+out_put_viommu:
+	iommufd_put_object(ucmd->ictx, &viommu->obj);
+	return rc;
+}
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index f13f3ca6adb5..ce4011a2fc27 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -123,6 +123,7 @@ struct iommufd_vdevice {
 struct iommufd_hw_queue {
 	struct iommufd_object obj;
 	struct iommufd_viommu *viommu;
+	struct iommufd_access *access;
 
 	u64 base_addr; /* in guest physical address space */
 	size_t length;
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 640a8b5147c2..b928c1ed2395 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -56,6 +56,7 @@ enum {
 	IOMMUFD_CMD_VDEVICE_ALLOC = 0x91,
 	IOMMUFD_CMD_IOAS_CHANGE_PROCESS = 0x92,
 	IOMMUFD_CMD_VEVENTQ_ALLOC = 0x93,
+	IOMMUFD_CMD_HW_QUEUE_ALLOC = 0x94,
 };
 
 /**
@@ -1156,4 +1157,37 @@ enum iommu_hw_queue_type {
 	IOMMU_HW_QUEUE_TYPE_DEFAULT = 0,
 };
 
+/**
+ * struct iommu_hw_queue_alloc - ioctl(IOMMU_HW_QUEUE_ALLOC)
+ * @size: sizeof(struct iommu_hw_queue_alloc)
+ * @flags: Must be 0
+ * @viommu_id: Virtual IOMMU ID to associate the HW queue with
+ * @type: One of enum iommu_hw_queue_type
+ * @index: The logical index to the HW queue per virtual IOMMU for a multi-queue
+ *         model
+ * @out_hw_queue_id: The ID of the new HW queue
+ * @nesting_parent_iova: Base address of the queue memory in the guest physical
+ *                       address space
+ * @length: Length of the queue memory
+ *
+ * Allocate a HW queue object for a vIOMMU-specific HW-accelerated queue, which
+ * allows HW to access a guest queue memory described using @nesting_parent_iova
+ * and @length.
+ *
+ * A vIOMMU can allocate multiple queues, but it must use a different @index per
+ * type to separate each allocation, e.g::
+ *
+ *     Type1 HW queue0, Type1 HW queue1, Type2 HW queue0, ...
+ */
+struct iommu_hw_queue_alloc {
+	__u32 size;
+	__u32 flags;
+	__u32 viommu_id;
+	__u32 type;
+	__u32 index;
+	__u32 out_hw_queue_id;
+	__aligned_u64 nesting_parent_iova;
+	__aligned_u64 length;
+};
+#define IOMMU_HW_QUEUE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HW_QUEUE_ALLOC)
 #endif
-- 
cgit v1.2.3


From 0b37d892d0425811618a737037b0212884cc25ae Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Wed, 9 Jul 2025 22:59:07 -0700
Subject: iommufd/driver: Add iommufd_hw_queue_depend/undepend() helpers

NVIDIA Virtual Command Queue is one of the iommufd users exposing vIOMMU
features to user space VMs. Its hardware has a strict rule when mapping
and unmapping multiple global CMDQVs to/from a VM-owned VINTF, requiring
mappings in ascending order and unmappings in descending order.

The tegra241-cmdqv driver can apply the rule for a mapping in the LVCMDQ
allocation handler. However, it can't do the same for an unmapping since
user space could start random destroy calls breaking the rule, while the
destroy op in the driver level can't reject a destroy call as it returns
void.

Add iommufd_hw_queue_depend/undepend for-driver helpers, allowing LVCMDQ
allocator to refcount_inc() a sibling LVCMDQ object and LVCMDQ destroyer
to refcount_dec(), so that iommufd core will help block a random destroy
call that breaks the rule.

This is a bit of a compromise, because a driver might end up abusing the
API in a way that deadlocks the objects. So restrict the API to a
dependency between two driver-allocated objects of the same type, as
iommufd is unlikely to build any core-level dependency in this case. Also,
encourage use of the macro version, which currently supports HW QUEUE
objects only.

Link: https://patch.msgid.link/r/2735c32e759c82f2e6c87cb32134eaf09b7589b5.1752126748.git.nicolinc@nvidia.com
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Pranjal Shrivastava <praan@google.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/iommufd/driver.c | 28 +++++++++++++++++++++++++++
 include/linux/iommufd.h        | 44 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+)

(limited to 'include')

diff --git a/drivers/iommu/iommufd/driver.c b/drivers/iommu/iommufd/driver.c
index 887719016804..e578ef32d30c 100644
--- a/drivers/iommu/iommufd/driver.c
+++ b/drivers/iommu/iommufd/driver.c
@@ -3,6 +3,34 @@
  */
 #include "iommufd_private.h"
 
+/* Driver should use a per-structure helper in include/linux/iommufd.h */
+int _iommufd_object_depend(struct iommufd_object *obj_dependent,
+			   struct iommufd_object *obj_depended)
+{
+	/* Reject self dependency that deadlocks */
+	if (obj_dependent == obj_depended)
+		return -EINVAL;
+	/* Only support dependency between two objects of the same type */
+	if (obj_dependent->type != obj_depended->type)
+		return -EINVAL;
+
+	refcount_inc(&obj_depended->users);
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(_iommufd_object_depend, "IOMMUFD");
+
+/* Driver should use a per-structure helper in include/linux/iommufd.h */
+void _iommufd_object_undepend(struct iommufd_object *obj_dependent,
+			      struct iommufd_object *obj_depended)
+{
+	if (WARN_ON_ONCE(obj_dependent == obj_depended ||
+			 obj_dependent->type != obj_depended->type))
+		return;
+
+	refcount_dec(&obj_depended->users);
+}
+EXPORT_SYMBOL_NS_GPL(_iommufd_object_undepend, "IOMMUFD");
+
 /* Caller should xa_lock(&viommu->vdevs) to protect the return value */
 struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu,
 				       unsigned long vdev_id)
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index ce4011a2fc27..fa23439fa483 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -251,6 +251,10 @@ static inline int iommufd_vfio_compat_set_no_iommu(struct iommufd_ctx *ictx)
 #endif /* CONFIG_IOMMUFD */
 
 #if IS_ENABLED(CONFIG_IOMMUFD_DRIVER_CORE)
+int _iommufd_object_depend(struct iommufd_object *obj_dependent,
+			   struct iommufd_object *obj_depended);
+void _iommufd_object_undepend(struct iommufd_object *obj_dependent,
+			      struct iommufd_object *obj_depended);
 struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu,
 				       unsigned long vdev_id);
 int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu,
@@ -259,6 +263,18 @@ int iommufd_viommu_report_event(struct iommufd_viommu *viommu,
 				enum iommu_veventq_type type, void *event_data,
 				size_t data_len);
 #else /* !CONFIG_IOMMUFD_DRIVER_CORE */
+static inline int _iommufd_object_depend(struct iommufd_object *obj_dependent,
+					 struct iommufd_object *obj_depended)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void
+_iommufd_object_undepend(struct iommufd_object *obj_dependent,
+			 struct iommufd_object *obj_depended)
+{
+}
+
 static inline struct device *
 iommufd_viommu_find_dev(struct iommufd_viommu *viommu, unsigned long vdev_id)
 {
@@ -298,4 +314,32 @@ static inline int iommufd_viommu_report_event(struct iommufd_viommu *viommu,
 	 BUILD_BUG_ON_ZERO(!__same_type(struct iommufd_hw_queue,               \
 					((drv_struct *)NULL)->member)))
 
+/*
+ * Driver helpers to build/destroy a dependency of @dependent on @depended, two
+ * HW queues of the same driver structure type that belong to the same vIOMMU
+ */
+#define iommufd_hw_queue_depend(dependent, depended, member)                   \
+	({                                                                     \
+		int ret = -EINVAL;                                             \
+									       \
+		static_assert(__same_type(struct iommufd_hw_queue,             \
+					  (dependent)->member));               \
+		static_assert(__same_type(typeof(*(dependent)), *(depended))); \
+		if (!WARN_ON_ONCE((dependent)->member.viommu !=                \
+				  (depended)->member.viommu))                  \
+			ret = _iommufd_object_depend(&(dependent)->member.obj, \
+						     &(depended)->member.obj); \
+		ret;                                                           \
+	})
+
+#define iommufd_hw_queue_undepend(dependent, depended, member)                 \
+	({                                                                     \
+		static_assert(__same_type(struct iommufd_hw_queue,             \
+					  (dependent)->member));               \
+		static_assert(__same_type(typeof(*(dependent)), *(depended))); \
+		WARN_ON_ONCE((dependent)->member.viommu !=                     \
+			     (depended)->member.viommu);                       \
+		_iommufd_object_undepend(&(dependent)->member.obj,             \
+					 &(depended)->member.obj);             \
+	})
 #endif
-- 
cgit v1.2.3


From 0b226380d4cce2e6ee50800d861934d474a30121 Mon Sep 17 00:00:00 2001
From: Sumit Gupta <sumitg@nvidia.com>
Date: Thu, 10 Jul 2025 00:21:46 +0200
Subject: dt-bindings: memory: tegra: Add Tegra264 support

Add bindings for the Memory Controller (MC) and External Memory
Controller (EMC) found on the Tegra264 SoC. Tegra264 SoC has a different
number of interrupt lines for MC sub-units: UCF_SOC, hub, hub common,
syncpoint and MC channel. The total number of interrupt lines is eight.
Update maxItems for MC interrupts accordingly.

This also adds a header containing the memory client ID definitions that
are used by the interconnects property in DT and the tegra_mc_client
table in the MC driver. These IDs are defined by the hardware, so the
numbering doesn't start at 0 and contains holes. Also added are the
stream IDs for various hardware blocks found on Tegra264. These are
allocated as blocks of 256 IDs and each block can be subdivided for
additional fine-grained isolation if needed.

Signed-off-by: Sumit Gupta <sumitg@nvidia.com>
[treding@nvidia.com: add SMMU stream IDs, squash patches]
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20250709222147.3758356-2-thierry.reding@gmail.com
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 .../memory-controllers/nvidia,tegra186-mc.yaml     |  84 ++++++++++++-
 include/dt-bindings/memory/nvidia,tegra264.h       | 136 +++++++++++++++++++++
 2 files changed, 218 insertions(+), 2 deletions(-)
 create mode 100644 include/dt-bindings/memory/nvidia,tegra264.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/memory-controllers/nvidia,tegra186-mc.yaml b/Documentation/devicetree/bindings/memory-controllers/nvidia,tegra186-mc.yaml
index 935d63d181d9..b901f1b3e0fc 100644
--- a/Documentation/devicetree/bindings/memory-controllers/nvidia,tegra186-mc.yaml
+++ b/Documentation/devicetree/bindings/memory-controllers/nvidia,tegra186-mc.yaml
@@ -32,6 +32,7 @@ properties:
           - nvidia,tegra186-mc
           - nvidia,tegra194-mc
           - nvidia,tegra234-mc
+          - nvidia,tegra264-mc
 
   reg:
     minItems: 6
@@ -42,8 +43,12 @@ properties:
     maxItems: 18
 
   interrupts:
-    items:
-      - description: MC general interrupt
+    minItems: 1
+    maxItems: 8
+
+  interrupt-names:
+    minItems: 1
+    maxItems: 8
 
   "#address-cells":
     const: 2
@@ -74,6 +79,7 @@ patternProperties:
               - nvidia,tegra186-emc
               - nvidia,tegra194-emc
               - nvidia,tegra234-emc
+              - nvidia,tegra264-emc
 
       reg:
         minItems: 1
@@ -127,6 +133,15 @@ patternProperties:
             reg:
               minItems: 2
 
+      - if:
+          properties:
+            compatible:
+              const: nvidia,tegra264-emc
+        then:
+          properties:
+            reg:
+              minItems: 2
+
     additionalProperties: false
 
     required:
@@ -158,6 +173,12 @@ allOf:
             - const: ch2
             - const: ch3
 
+        interrupts:
+          items:
+            - description: MC general interrupt
+
+        interrupt-names: false
+
   - if:
       properties:
         compatible:
@@ -189,6 +210,12 @@ allOf:
             - const: ch14
             - const: ch15
 
+        interrupts:
+          items:
+            - description: MC general interrupt
+
+        interrupt-names: false
+
   - if:
       properties:
         compatible:
@@ -220,6 +247,59 @@ allOf:
             - const: ch14
             - const: ch15
 
+        interrupts:
+          items:
+            - description: MC general interrupt
+
+        interrupt-names: false
+
+  - if:
+      properties:
+        compatible:
+          const: nvidia,tegra264-mc
+    then:
+      properties:
+        reg:
+          minItems: 17
+          maxItems: 17
+          description: 17 memory controller channels
+
+        reg-names:
+          items:
+            - const: broadcast
+            - const: ch0
+            - const: ch1
+            - const: ch2
+            - const: ch3
+            - const: ch4
+            - const: ch5
+            - const: ch6
+            - const: ch7
+            - const: ch8
+            - const: ch9
+            - const: ch10
+            - const: ch11
+            - const: ch12
+            - const: ch13
+            - const: ch14
+            - const: ch15
+
+        interrupts:
+          minItems: 8
+          maxItems: 8
+          description: One interrupt line for each MC component
+
+        interrupt-names:
+          items:
+            - const: mcf
+            - const: hub1
+            - const: hub2
+            - const: hub3
+            - const: hub4
+            - const: hub5
+            - const: sbs
+            - const: channel
+
 additionalProperties: false
 
 required:
diff --git a/include/dt-bindings/memory/nvidia,tegra264.h b/include/dt-bindings/memory/nvidia,tegra264.h
new file mode 100644
index 000000000000..521405c01f84
--- /dev/null
+++ b/include/dt-bindings/memory/nvidia,tegra264.h
@@ -0,0 +1,136 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. */
+
+#ifndef DT_BINDINGS_MEMORY_NVIDIA_TEGRA264_H
+#define DT_BINDINGS_MEMORY_NVIDIA_TEGRA264_H
+
+#define TEGRA264_SID(x) ((x) << 8)
+
+/*
+ * SMMU stream IDs
+ */
+
+#define TEGRA264_SID_AON       TEGRA264_SID(0x01)
+#define TEGRA264_SID_APE       TEGRA264_SID(0x02)
+#define TEGRA264_SID_ETR       TEGRA264_SID(0x03)
+#define TEGRA264_SID_BPMP      TEGRA264_SID(0x04)
+#define TEGRA264_SID_DCE       TEGRA264_SID(0x05)
+#define TEGRA264_SID_EQOS      TEGRA264_SID(0x06)
+#define TEGRA264_SID_GPCDMA    TEGRA264_SID(0x08)
+#define TEGRA264_SID_DISP      TEGRA264_SID(0x09)
+#define TEGRA264_SID_HDA       TEGRA264_SID(0x0a)
+#define TEGRA264_SID_HOST1X    TEGRA264_SID(0x0b)
+#define TEGRA264_SID_ISP0      TEGRA264_SID(0x0c)
+#define TEGRA264_SID_ISP1      TEGRA264_SID(0x0d)
+#define TEGRA264_SID_PMA0      TEGRA264_SID(0x0e)
+#define TEGRA264_SID_FSI0      TEGRA264_SID(0x0f)
+#define TEGRA264_SID_FSI1      TEGRA264_SID(0x10)
+#define TEGRA264_SID_PVA       TEGRA264_SID(0x11)
+#define TEGRA264_SID_SDMMC0    TEGRA264_SID(0x12)
+#define TEGRA264_SID_MGBE0     TEGRA264_SID(0x13)
+#define TEGRA264_SID_MGBE1     TEGRA264_SID(0x14)
+#define TEGRA264_SID_MGBE2     TEGRA264_SID(0x15)
+#define TEGRA264_SID_MGBE3     TEGRA264_SID(0x16)
+#define TEGRA264_SID_MSSSEQ    TEGRA264_SID(0x17)
+#define TEGRA264_SID_SE        TEGRA264_SID(0x18)
+#define TEGRA264_SID_SEU1      TEGRA264_SID(0x19)
+#define TEGRA264_SID_SEU2      TEGRA264_SID(0x1a)
+#define TEGRA264_SID_SEU3      TEGRA264_SID(0x1b)
+#define TEGRA264_SID_PSC       TEGRA264_SID(0x1c)
+#define TEGRA264_SID_OESP      TEGRA264_SID(0x23)
+#define TEGRA264_SID_SB        TEGRA264_SID(0x24)
+#define TEGRA264_SID_XSPI0     TEGRA264_SID(0x25)
+#define TEGRA264_SID_TSEC      TEGRA264_SID(0x29)
+#define TEGRA264_SID_UFS       TEGRA264_SID(0x2a)
+#define TEGRA264_SID_RCE       TEGRA264_SID(0x2b)
+#define TEGRA264_SID_RCE1      TEGRA264_SID(0x2c)
+#define TEGRA264_SID_VI        TEGRA264_SID(0x2e)
+#define TEGRA264_SID_VI1       TEGRA264_SID(0x2f)
+#define TEGRA264_SID_VIC       TEGRA264_SID(0x30)
+#define TEGRA264_SID_XUSB_DEV  TEGRA264_SID(0x32)
+#define TEGRA264_SID_XUSB_DEV1 TEGRA264_SID(0x33)
+#define TEGRA264_SID_XUSB_DEV2 TEGRA264_SID(0x34)
+#define TEGRA264_SID_XUSB_DEV3 TEGRA264_SID(0x35)
+#define TEGRA264_SID_XUSB_DEV4 TEGRA264_SID(0x36)
+#define TEGRA264_SID_XUSB_DEV5 TEGRA264_SID(0x37)
+
+/*
+ * memory client IDs
+ */
+
+/* HOST1X read client */
+#define TEGRA264_MEMORY_CLIENT_HOST1XR		0x16
+/* VIC read client */
+#define TEGRA264_MEMORY_CLIENT_VICR		0x6c
+/* VIC Write client */
+#define TEGRA264_MEMORY_CLIENT_VICW		0x6d
+/* VI R5 Write client */
+#define TEGRA264_MEMORY_CLIENT_VIW		0x72
+#define TEGRA264_MEMORY_CLIENT_NVDECSRD2MC	0x78
+#define TEGRA264_MEMORY_CLIENT_NVDECSWR2MC	0x79
+/* Audio processor(APE) Read client */
+#define TEGRA264_MEMORY_CLIENT_APER		0x7a
+/* Audio processor(APE) Write client */
+#define TEGRA264_MEMORY_CLIENT_APEW		0x7b
+/* Audio DMA Read client */
+#define TEGRA264_MEMORY_CLIENT_APEDMAR		0x9f
+/* Audio DMA Write client */
+#define TEGRA264_MEMORY_CLIENT_APEDMAW		0xa0
+#define TEGRA264_MEMORY_CLIENT_GPUR02MC		0xb6
+#define TEGRA264_MEMORY_CLIENT_GPUW02MC		0xb7
+/* VI Falcon Read client */
+#define TEGRA264_MEMORY_CLIENT_VIFALCONR	0xbc
+/* VI Falcon Write client */
+#define TEGRA264_MEMORY_CLIENT_VIFALCONW	0xbd
+/* Read Client of RCE */
+#define TEGRA264_MEMORY_CLIENT_RCER		0xd2
+/* Write client of RCE */
+#define TEGRA264_MEMORY_CLIENT_RCEW		0xd3
+/* PCIE0/MSI Write clients */
+#define TEGRA264_MEMORY_CLIENT_PCIE0W		0xd9
+/* PCIE1/RPX4 Read clients */
+#define TEGRA264_MEMORY_CLIENT_PCIE1R		0xda
+/* PCIE1/RPX4 Write clients */
+#define TEGRA264_MEMORY_CLIENT_PCIE1W		0xdb
+/* PCIE2/DMX4 Read clients */
+#define TEGRA264_MEMORY_CLIENT_PCIE2AR		0xdc
+/* PCIE2/DMX4 Write clients */
+#define TEGRA264_MEMORY_CLIENT_PCIE2AW		0xdd
+/* PCIE3/RPX4 Read clients */
+#define TEGRA264_MEMORY_CLIENT_PCIE3R		0xde
+/* PCIE3/RPX4 Write clients */
+#define TEGRA264_MEMORY_CLIENT_PCIE3W		0xdf
+/* PCIE4/DMX8 Read clients */
+#define TEGRA264_MEMORY_CLIENT_PCIE4R		0xe0
+/* PCIE4/DMX8 Write clients */
+#define TEGRA264_MEMORY_CLIENT_PCIE4W		0xe1
+/* PCIE5/DMX4 Read clients */
+#define TEGRA264_MEMORY_CLIENT_PCIE5R		0xe2
+/* PCIE5/DMX4 Write clients */
+#define TEGRA264_MEMORY_CLIENT_PCIE5W		0xe3
+/* UFS Read client */
+#define TEGRA264_MEMORY_CLIENT_UFSR		0x15c
+/* UFS write client */
+#define TEGRA264_MEMORY_CLIENT_UFSW		0x15d
+/* HDA Read client */
+#define TEGRA264_MEMORY_CLIENT_HDAR		0x17c
+/* HDA Write client */
+#define TEGRA264_MEMORY_CLIENT_HDAW		0x17d
+/* Disp ISO Read Client */
+#define TEGRA264_MEMORY_CLIENT_DISPR		0x182
+/* MGBE0 Read mccif */
+#define TEGRA264_MEMORY_CLIENT_MGBE0R		0x1a2
+/* MGBE0 Write mccif */
+#define TEGRA264_MEMORY_CLIENT_MGBE0W		0x1a3
+/* MGBE1 Read mccif */
+#define TEGRA264_MEMORY_CLIENT_MGBE1R		0x1a4
+/* MGBE1 Write mccif */
+#define TEGRA264_MEMORY_CLIENT_MGBE1W		0x1a5
+/* VI1 R5 Write client */
+#define TEGRA264_MEMORY_CLIENT_VI1W		0x1a6
+/* SDMMC0 Read mccif */
+#define TEGRA264_MEMORY_CLIENT_SDMMC0R		0x1c2
+/* SDMMC0 Write mccif */
+#define TEGRA264_MEMORY_CLIENT_SDMMC0W		0x1c3
+
+#endif /* DT_BINDINGS_MEMORY_NVIDIA_TEGRA264_H */
-- 
cgit v1.2.3


From 319cc06db42ac0cd6256a26b4ea21b52a6fe6308 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Tue, 8 Jul 2025 10:28:11 +0200
Subject: dt-bindings: Add Tegra264 clock and reset definitions

The BPMP firmware on Tegra264 defines a set of IDs for clock and reset
resources. These are not enumerations but provided by hardware, and 0 is
a reserved value, hence the numbering starts at 1.

Acked-by: Rob Herring (Arm) <robh@kernel.org>
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 include/dt-bindings/clock/nvidia,tegra264.h | 466 ++++++++++++++++++++++++++++
 include/dt-bindings/reset/nvidia,tegra264.h |  92 ++++++
 2 files changed, 558 insertions(+)
 create mode 100644 include/dt-bindings/clock/nvidia,tegra264.h
 create mode 100644 include/dt-bindings/reset/nvidia,tegra264.h

(limited to 'include')

diff --git a/include/dt-bindings/clock/nvidia,tegra264.h b/include/dt-bindings/clock/nvidia,tegra264.h
new file mode 100644
index 000000000000..0fc2ad5e6cef
--- /dev/null
+++ b/include/dt-bindings/clock/nvidia,tegra264.h
@@ -0,0 +1,466 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/* Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved. */
+
+#ifndef DT_BINDINGS_CLOCK_NVIDIA_TEGRA264_H
+#define DT_BINDINGS_CLOCK_NVIDIA_TEGRA264_H
+
+#define TEGRA264_CLK_OSC				1
+#define TEGRA264_CLK_CLK_S				2
+#define TEGRA264_CLK_JTAG_REG				3
+#define TEGRA264_CLK_SPLL				4
+#define TEGRA264_CLK_SPLL_OUT0				5
+#define TEGRA264_CLK_SPLL_OUT1				6
+#define TEGRA264_CLK_SPLL_OUT2				7
+#define TEGRA264_CLK_SPLL_OUT3				8
+#define TEGRA264_CLK_SPLL_OUT4				9
+#define TEGRA264_CLK_SPLL_OUT5				10
+#define TEGRA264_CLK_SPLL_OUT6				11
+#define TEGRA264_CLK_SPLL_OUT7				12
+#define TEGRA264_CLK_AON_I2C				13
+#define TEGRA264_CLK_HOST1X				14
+#define TEGRA264_CLK_ISP				15
+#define TEGRA264_CLK_ISP1				16
+#define TEGRA264_CLK_ISP_ROOT				17
+#define TEGRA264_CLK_NAFLL_PVA0_CORE			18
+#define TEGRA264_CLK_NAFLL_PVA0_VPS			19
+#define TEGRA264_CLK_NVCSI				20
+#define TEGRA264_CLK_NVCSILP				21
+#define TEGRA264_CLK_PLLP_OUT0				22
+#define TEGRA264_CLK_PVA0_CPU_AXI			23
+#define TEGRA264_CLK_PVA0_VPS				24
+#define TEGRA264_CLK_PWM10				25
+#define TEGRA264_CLK_PWM2				26
+#define TEGRA264_CLK_PWM3				27
+#define TEGRA264_CLK_PWM4				28
+#define TEGRA264_CLK_PWM5				29
+#define TEGRA264_CLK_PWM9				30
+#define TEGRA264_CLK_QSPI0				31
+#define TEGRA264_CLK_QSPI0_2X_PM			32
+#define TEGRA264_CLK_RCE1_CPU				33
+#define TEGRA264_CLK_RCE1_NIC				34
+#define TEGRA264_CLK_RCE_CPU				35
+#define TEGRA264_CLK_RCE_NIC				36
+#define TEGRA264_CLK_SE					37
+#define TEGRA264_CLK_SEU1				38
+#define TEGRA264_CLK_SEU2				39
+#define TEGRA264_CLK_SEU3				40
+#define TEGRA264_CLK_SE_ROOT				41
+#define TEGRA264_CLK_SPI1				42
+#define TEGRA264_CLK_SPI2				43
+#define TEGRA264_CLK_SPI3				44
+#define TEGRA264_CLK_SPI4				45
+#define TEGRA264_CLK_SPI5				46
+#define TEGRA264_CLK_TOP_I2C				47
+#define TEGRA264_CLK_TSEC				48
+#define TEGRA264_CLK_TSEC_PKA				49
+#define TEGRA264_CLK_UART0				50
+#define TEGRA264_CLK_UART10				51
+#define TEGRA264_CLK_UART11				52
+#define TEGRA264_CLK_UART4				53
+#define TEGRA264_CLK_UART5				54
+#define TEGRA264_CLK_UART8				55
+#define TEGRA264_CLK_UART9				56
+#define TEGRA264_CLK_VI					57
+#define TEGRA264_CLK_VI1				58
+#define TEGRA264_CLK_VIC				59
+#define TEGRA264_CLK_VI_ROOT				60
+#define TEGRA264_CLK_DISPPLL				61
+#define TEGRA264_CLK_SPPLL0				62
+#define TEGRA264_CLK_SPPLL0_CLKOUT1A			63
+#define TEGRA264_CLK_SPPLL0_CLKOUT2A			64
+#define TEGRA264_CLK_SPPLL1				65
+#define TEGRA264_CLK_VPLL0				66
+#define TEGRA264_CLK_VPLL1				67
+#define TEGRA264_CLK_VPLL2				68
+#define TEGRA264_CLK_VPLL3				69
+#define TEGRA264_CLK_VPLL4				70
+#define TEGRA264_CLK_VPLL5				71
+#define TEGRA264_CLK_VPLL6				72
+#define TEGRA264_CLK_VPLL7				73
+#define TEGRA264_CLK_RG0_DIV				74
+#define TEGRA264_CLK_RG1_DIV				75
+#define TEGRA264_CLK_RG2_DIV				76
+#define TEGRA264_CLK_RG3_DIV				77
+#define TEGRA264_CLK_RG4_DIV				78
+#define TEGRA264_CLK_RG5_DIV				79
+#define TEGRA264_CLK_RG6_DIV				80
+#define TEGRA264_CLK_RG7_DIV				81
+#define TEGRA264_CLK_RG0				82
+#define TEGRA264_CLK_RG1				83
+#define TEGRA264_CLK_RG2				84
+#define TEGRA264_CLK_RG3				85
+#define TEGRA264_CLK_RG4				86
+#define TEGRA264_CLK_RG5				87
+#define TEGRA264_CLK_RG6				88
+#define TEGRA264_CLK_RG7				89
+#define TEGRA264_CLK_DISP				90
+#define TEGRA264_CLK_DSC				91
+#define TEGRA264_CLK_DSC_ROOT				92
+#define TEGRA264_CLK_HUB				93
+#define TEGRA264_CLK_VPLLX_SOR0_MUXED			94
+#define TEGRA264_CLK_VPLLX_SOR1_MUXED			95
+#define TEGRA264_CLK_VPLLX_SOR2_MUXED			96
+#define TEGRA264_CLK_VPLLX_SOR3_MUXED			97
+#define TEGRA264_CLK_LINKA_SYM				98
+#define TEGRA264_CLK_LINKB_SYM				99
+#define TEGRA264_CLK_LINKC_SYM				100
+#define TEGRA264_CLK_LINKD_SYM				101
+#define TEGRA264_CLK_PRE_SOR0				102
+#define TEGRA264_CLK_PRE_SOR1				103
+#define TEGRA264_CLK_PRE_SOR2				104
+#define TEGRA264_CLK_PRE_SOR3				105
+#define TEGRA264_CLK_SOR0_PLL_REF			106
+#define TEGRA264_CLK_SOR1_PLL_REF			107
+#define TEGRA264_CLK_SOR2_PLL_REF			108
+#define TEGRA264_CLK_SOR3_PLL_REF			109
+#define TEGRA264_CLK_SOR0_PAD				110
+#define TEGRA264_CLK_SOR1_PAD				111
+#define TEGRA264_CLK_SOR2_PAD				112
+#define TEGRA264_CLK_SOR3_PAD				113
+#define TEGRA264_CLK_SOR0_REF				114
+#define TEGRA264_CLK_SOR1_REF				115
+#define TEGRA264_CLK_SOR2_REF				116
+#define TEGRA264_CLK_SOR3_REF				117
+#define TEGRA264_CLK_SOR0_DIV				118
+#define TEGRA264_CLK_SOR1_DIV				119
+#define TEGRA264_CLK_SOR2_DIV				120
+#define TEGRA264_CLK_SOR3_DIV				121
+#define TEGRA264_CLK_SOR0				122
+#define TEGRA264_CLK_SOR1				123
+#define TEGRA264_CLK_SOR2				124
+#define TEGRA264_CLK_SOR3				125
+#define TEGRA264_CLK_SF0_SOR				126
+#define TEGRA264_CLK_SF1_SOR				127
+#define TEGRA264_CLK_SF2_SOR				128
+#define TEGRA264_CLK_SF3_SOR				129
+#define TEGRA264_CLK_SF4_SOR				130
+#define TEGRA264_CLK_SF5_SOR				131
+#define TEGRA264_CLK_SF6_SOR				132
+#define TEGRA264_CLK_SF7_SOR				133
+#define TEGRA264_CLK_SF0				134
+#define TEGRA264_CLK_SF1				135
+#define TEGRA264_CLK_SF2				136
+#define TEGRA264_CLK_SF3				137
+#define TEGRA264_CLK_SF4				138
+#define TEGRA264_CLK_SF5				139
+#define TEGRA264_CLK_SF6				140
+#define TEGRA264_CLK_SF7				141
+#define TEGRA264_CLK_MAUD				142
+#define TEGRA264_CLK_AZA_2XBIT				143
+#define TEGRA264_CLK_DCE_CPU				144
+#define TEGRA264_CLK_DCE_NIC				145
+#define TEGRA264_CLK_PLLC4				146
+#define TEGRA264_CLK_PLLC4_OUT0				147
+#define TEGRA264_CLK_PLLC4_OUT1				148
+#define TEGRA264_CLK_PLLC4_MUXED			149
+#define TEGRA264_CLK_SDMMC1				150
+#define TEGRA264_CLK_SDMMC_LEGACY_TM			151
+#define TEGRA264_CLK_PLLC0				152
+#define TEGRA264_CLK_NAFLL_BPMP				153
+#define TEGRA264_CLK_PLLP_OUT_PDIV			154
+#define TEGRA264_CLK_DISP_ROOT				155
+#define TEGRA264_CLK_ADSP				156
+#define TEGRA264_CLK_PLLA				157
+#define TEGRA264_CLK_PLLA1				158
+#define TEGRA264_CLK_PLLA1_OUT1				159
+#define TEGRA264_CLK_PLLAON				160
+#define TEGRA264_CLK_PLLAON_APE				161
+#define TEGRA264_CLK_PLLA_OUT0				162
+#define TEGRA264_CLK_AHUB				163
+#define TEGRA264_CLK_APE				164
+#define TEGRA264_CLK_I2S1_SCLK_IN			165
+#define TEGRA264_CLK_I2S2_SCLK_IN			166
+#define TEGRA264_CLK_I2S3_SCLK_IN			167
+#define TEGRA264_CLK_I2S4_SCLK_IN			168
+#define TEGRA264_CLK_I2S5_SCLK_IN			169
+#define TEGRA264_CLK_I2S6_SCLK_IN			170
+#define TEGRA264_CLK_I2S7_SCLK_IN			171
+#define TEGRA264_CLK_I2S8_SCLK_IN			172
+#define TEGRA264_CLK_I2S9_SCLK_IN			173
+#define TEGRA264_CLK_I2S1_AUDIO_SYNC			174
+#define TEGRA264_CLK_I2S2_AUDIO_SYNC			175
+#define TEGRA264_CLK_I2S3_AUDIO_SYNC			176
+#define TEGRA264_CLK_I2S4_AUDIO_SYNC			177
+#define TEGRA264_CLK_I2S5_AUDIO_SYNC			178
+#define TEGRA264_CLK_I2S6_AUDIO_SYNC			179
+#define TEGRA264_CLK_I2S7_AUDIO_SYNC			180
+#define TEGRA264_CLK_I2S8_AUDIO_SYNC			181
+#define TEGRA264_CLK_DMIC1_AUDIO_SYNC			182
+#define TEGRA264_CLK_DSPK1_AUDIO_SYNC			183
+#define TEGRA264_CLK_I2S1				184
+#define TEGRA264_CLK_I2S2				185
+#define TEGRA264_CLK_I2S3				186
+#define TEGRA264_CLK_I2S4				187
+#define TEGRA264_CLK_I2S5				188
+#define TEGRA264_CLK_I2S6				189
+#define TEGRA264_CLK_I2S7				190
+#define TEGRA264_CLK_I2S8				191
+#define TEGRA264_CLK_I2S9				192
+#define TEGRA264_CLK_DMIC1				193
+#define TEGRA264_CLK_DMIC5				194
+#define TEGRA264_CLK_DSPK1				195
+#define TEGRA264_CLK_AON_CPU				196
+#define TEGRA264_CLK_AON_NIC				197
+#define TEGRA264_CLK_BPMP				198
+#define TEGRA264_CLK_AXI_CBB				199
+#define TEGRA264_CLK_FUSE				200
+#define TEGRA264_CLK_TSENSE				201
+#define TEGRA264_CLK_CSITE				202
+#define TEGRA264_CLK_HCSITE				203
+#define TEGRA264_CLK_DBGAPB				204
+#define TEGRA264_CLK_LA					205
+#define TEGRA264_CLK_PLLREFGP				206
+#define TEGRA264_CLK_PLLE0				207
+#define TEGRA264_CLK_UPHY0_PLL0_XDIG			208
+#define TEGRA264_CLK_EQOS_APP				209
+#define TEGRA264_CLK_EQOS_MAC				210
+#define TEGRA264_CLK_EQOS_MACSEC			211
+#define TEGRA264_CLK_EQOS_TX_PCS			212
+#define TEGRA264_CLK_MGBES_PTP_REF			213
+#define TEGRA264_CLK_MGBE0_UPHY1_PLL_XDIG		214
+#define TEGRA264_CLK_MGBE0_TX_PCS			215
+#define TEGRA264_CLK_MGBE0_MAC				216
+#define TEGRA264_CLK_MGBE0_MACSEC			217
+#define TEGRA264_CLK_MGBE0_APP				218
+#define TEGRA264_CLK_MGBE1_UPHY1_PLL_XDIG		219
+#define TEGRA264_CLK_MGBE1_TX_PCS			220
+#define TEGRA264_CLK_MGBE1_MAC				221
+#define TEGRA264_CLK_MGBE1_MACSEC			222
+#define TEGRA264_CLK_MGBE1_APP				223
+#define TEGRA264_CLK_MGBE2_UPHY1_PLL_XDIG		224
+#define TEGRA264_CLK_MGBE2_TX_PCS			225
+#define TEGRA264_CLK_MGBE2_MAC				226
+#define TEGRA264_CLK_MGBE2_MACSEC			227
+#define TEGRA264_CLK_MGBE2_APP				228
+#define TEGRA264_CLK_MGBE3_UPHY1_PLL_XDIG		229
+#define TEGRA264_CLK_MGBE3_TX_PCS			230
+#define TEGRA264_CLK_MGBE3_MAC				231
+#define TEGRA264_CLK_MGBE3_MACSEC			232
+#define TEGRA264_CLK_MGBE3_APP				233
+#define TEGRA264_CLK_PLLREFUFS				234
+#define TEGRA264_CLK_PLLREFUFS_CLKOUT624		235
+#define TEGRA264_CLK_PLLREFUFS_REFCLKOUT		236
+#define TEGRA264_CLK_PLLREFUFS_UFSDEV_REFCLKOUT		237
+#define TEGRA264_CLK_UFSHC_CG_SYS			238
+#define TEGRA264_CLK_MPHY_L0_RX_LS_BIT_DIV		239
+#define TEGRA264_CLK_MPHY_L0_RX_LS_BIT			240
+#define TEGRA264_CLK_MPHY_L0_RX_LS_SYMB_DIV		241
+#define TEGRA264_CLK_MPHY_L0_RX_HS_SYMB_DIV		242
+#define TEGRA264_CLK_MPHY_L0_RX_SYMB			243
+#define TEGRA264_CLK_MPHY_L0_UPHY_TX_FIFO		244
+#define TEGRA264_CLK_MPHY_L0_TX_LS_3XBIT_DIV		245
+#define TEGRA264_CLK_MPHY_L0_TX_LS_SYMB_DIV		246
+#define TEGRA264_CLK_UPHY0_PLL4_XDIG			247
+#define TEGRA264_CLK_MPHY_L0_TX_HS_SYMB_DIV		248
+#define TEGRA264_CLK_MPHY_L0_TX_SYMB			249
+#define TEGRA264_CLK_MPHY_L0_TX_LS_3XBIT		250
+#define TEGRA264_CLK_MPHY_L0_RX_ANA			251
+#define TEGRA264_CLK_MPHY_L1_RX_ANA			252
+#define TEGRA264_CLK_MPHY_TX_1MHZ_REF			253
+#define TEGRA264_CLK_MPHY_CORE_PLL_FIXED		254
+#define TEGRA264_CLK_MPHY_IOBIST			255
+#define TEGRA264_CLK_UFSHC_CG_SYS_DIV			256
+#define TEGRA264_CLK_XUSB1_CORE				257
+#define TEGRA264_CLK_XUSB1_FALCON			258
+#define TEGRA264_CLK_XUSB1_FS				259
+#define TEGRA264_CLK_XUSB1_SS				260
+#define TEGRA264_CLK_UPHY0_USB_P0_RX_CORE		261
+#define TEGRA264_CLK_UPHY0_USB_P1_RX_CORE		262
+#define TEGRA264_CLK_UPHY0_USB_P2_RX_CORE		263
+#define TEGRA264_CLK_UPHY0_USB_P3_RX_CORE		264
+#define TEGRA264_CLK_XUSB1_CLK480M_NVWRAP_CORE		265
+#define TEGRA264_CLK_XUSB1_CORE_HOST			266
+#define TEGRA264_CLK_XUSB1_CORE_DEV			267
+#define TEGRA264_CLK_XUSB1_CORE_SUPERSPEED		268
+#define TEGRA264_CLK_XUSB1_FALCON_HOST			269
+#define TEGRA264_CLK_XUSB1_FALCON_SUPERSPEED		270
+#define TEGRA264_CLK_XUSB1_FS_HOST			271
+#define TEGRA264_CLK_XUSB1_FS_DEV			272
+#define TEGRA264_CLK_XUSB1_HS_HSICP			273
+#define TEGRA264_CLK_XUSB1_SS_DEV			274
+#define TEGRA264_CLK_XUSB1_SS_SUPERSPEED		275
+#define TEGRA264_CLK_AON_TOUCH				276
+#define TEGRA264_CLK_AUD_MCLK				277
+#define TEGRA264_CLK_EXTPERIPH1				278
+#define TEGRA264_CLK_EXTPERIPH2				279
+#define TEGRA264_CLK_EXTPERIPH3				280
+#define TEGRA264_CLK_EXTPERIPH4				281
+#define TEGRA264_CLK_JTAG_REG_UNGATED			282
+#define TEGRA264_CLK_IST_BUS				283
+#define TEGRA264_CLK_IST_BUS_RIST_MCC			284
+#define TEGRA264_CLK_MATHS_SEC_RIST			285
+#define TEGRA264_CLK_NAFLL_IST				286
+#define TEGRA264_CLK_RIST_ROOT				287
+#define TEGRA264_CLK_IST_CONTROLLER_RIST		288
+#define TEGRA264_CLK_MSS_ENCRYPT			289
+#define TEGRA264_CLK_EMC				290
+#define TEGRA264_CLK_SPPLL0_CLKOUT100			291
+#define TEGRA264_CLK_SPPLL0_CLKOUT270			292
+#define TEGRA264_CLK_SPPLL1_CLKOUT100			293
+#define TEGRA264_CLK_SPPLL1_CLKOUT270			294
+#define TEGRA264_CLK_DP_LINKA_REF			295
+#define TEGRA264_CLK_DP_LINKB_REF			296
+#define TEGRA264_CLK_DP_LINKC_REF			297
+#define TEGRA264_CLK_DP_LINKD_REF			298
+#define TEGRA264_CLK_PLLNVCSI				299
+#define TEGRA264_CLK_PLLBPMPCAM				300
+#define TEGRA264_CLK_UTMI_PLL1				301
+#define TEGRA264_CLK_UTMI_PLL1_CLKOUT48			302
+#define TEGRA264_CLK_UTMI_PLL1_CLKOUT60			303
+#define TEGRA264_CLK_UTMI_PLL1_CLKOUT480		304
+#define TEGRA264_CLK_NAFLL_ISP				305
+#define TEGRA264_CLK_NAFLL_RCE				306
+#define TEGRA264_CLK_NAFLL_RCE1				307
+#define TEGRA264_CLK_NAFLL_SE				308
+#define TEGRA264_CLK_NAFLL_VI				309
+#define TEGRA264_CLK_NAFLL_VIC				310
+#define TEGRA264_CLK_NAFLL_DCE				311
+#define TEGRA264_CLK_NAFLL_TSEC				312
+#define TEGRA264_CLK_NAFLL_CPAIR0			313
+#define TEGRA264_CLK_NAFLL_CPAIR1			314
+#define TEGRA264_CLK_NAFLL_CPAIR2			315
+#define TEGRA264_CLK_NAFLL_CPAIR3			316
+#define TEGRA264_CLK_NAFLL_CPAIR4			317
+#define TEGRA264_CLK_NAFLL_CPAIR5			318
+#define TEGRA264_CLK_NAFLL_CPAIR6			319
+#define TEGRA264_CLK_NAFLL_GPU_SYS			320
+#define TEGRA264_CLK_NAFLL_GPU_NVD			321
+#define TEGRA264_CLK_NAFLL_GPU_UPROC			322
+#define TEGRA264_CLK_NAFLL_GPU_GPC0			323
+#define TEGRA264_CLK_NAFLL_GPU_GPC1			324
+#define TEGRA264_CLK_NAFLL_GPU_GPC2			325
+#define TEGRA264_CLK_SOR_LINKA_INPUT			326
+#define TEGRA264_CLK_SOR_LINKB_INPUT			327
+#define TEGRA264_CLK_SOR_LINKC_INPUT			328
+#define TEGRA264_CLK_SOR_LINKD_INPUT			329
+#define TEGRA264_CLK_SOR_LINKA_AFIFO			330
+#define TEGRA264_CLK_SOR_LINKB_AFIFO			331
+#define TEGRA264_CLK_SOR_LINKC_AFIFO			332
+#define TEGRA264_CLK_SOR_LINKD_AFIFO			333
+#define TEGRA264_CLK_I2S1_PAD_M				334
+#define TEGRA264_CLK_I2S2_PAD_M				335
+#define TEGRA264_CLK_I2S3_PAD_M				336
+#define TEGRA264_CLK_I2S4_PAD_M				337
+#define TEGRA264_CLK_I2S5_PAD_M				338
+#define TEGRA264_CLK_I2S6_PAD_M				339
+#define TEGRA264_CLK_I2S7_PAD_M				340
+#define TEGRA264_CLK_I2S8_PAD_M				341
+#define TEGRA264_CLK_I2S9_PAD_M				342
+#define TEGRA264_CLK_BPMP_NIC				343
+#define TEGRA264_CLK_CLK1M				344
+#define TEGRA264_CLK_RDET				345
+#define TEGRA264_CLK_ADC_SOC_REF			346
+#define TEGRA264_CLK_UPHY0_PLL0_TXREF			347
+#define TEGRA264_CLK_EQOS_TX				348
+#define TEGRA264_CLK_EQOS_TX_M				349
+#define TEGRA264_CLK_EQOS_RX_PCS_IN			350
+#define TEGRA264_CLK_EQOS_RX_PCS_M			351
+#define TEGRA264_CLK_EQOS_RX_IN				352
+#define TEGRA264_CLK_EQOS_RX				353
+#define TEGRA264_CLK_EQOS_RX_M				354
+#define TEGRA264_CLK_MGBE0_UPHY1_PLL_TXREF		355
+#define TEGRA264_CLK_MGBE0_TX				356
+#define TEGRA264_CLK_MGBE0_TX_M				357
+#define TEGRA264_CLK_MGBE0_RX_PCS_IN			358
+#define TEGRA264_CLK_MGBE0_RX_PCS_M			359
+#define TEGRA264_CLK_MGBE0_RX_IN			360
+#define TEGRA264_CLK_MGBE0_RX_M				361
+#define TEGRA264_CLK_MGBE1_UPHY1_PLL_TXREF		362
+#define TEGRA264_CLK_MGBE1_TX				363
+#define TEGRA264_CLK_MGBE1_TX_M				364
+#define TEGRA264_CLK_MGBE1_RX_PCS_IN			365
+#define TEGRA264_CLK_MGBE1_RX_PCS_M			366
+#define TEGRA264_CLK_MGBE1_RX_IN			367
+#define TEGRA264_CLK_MGBE1_RX_M				368
+#define TEGRA264_CLK_MGBE2_UPHY1_PLL_TXREF		369
+#define TEGRA264_CLK_MGBE2_TX				370
+#define TEGRA264_CLK_MGBE2_TX_M				371
+#define TEGRA264_CLK_MGBE2_RX_PCS_IN			372
+#define TEGRA264_CLK_MGBE2_RX_PCS_M			373
+#define TEGRA264_CLK_MGBE2_RX_IN			374
+#define TEGRA264_CLK_MGBE2_RX_M				375
+#define TEGRA264_CLK_MGBE3_UPHY1_PLL_TXREF		376
+#define TEGRA264_CLK_MGBE3_TX				377
+#define TEGRA264_CLK_MGBE3_TX_M				378
+#define TEGRA264_CLK_MGBE3_RX_PCS_IN			379
+#define TEGRA264_CLK_MGBE3_RX_PCS_M			380
+#define TEGRA264_CLK_MGBE3_RX_IN			381
+#define TEGRA264_CLK_MGBE3_RX_M				382
+#define TEGRA264_CLK_UPHY0_USB_P0_TX_CORE		383
+#define TEGRA264_CLK_UPHY0_USB_P1_TX_CORE		384
+#define TEGRA264_CLK_UPHY0_USB_P2_TX_CORE		385
+#define TEGRA264_CLK_UPHY0_USB_P3_TX_CORE		386
+#define TEGRA264_CLK_UPHY0_USB_P0_TX			387
+#define TEGRA264_CLK_UPHY0_USB_P1_TX			388
+#define TEGRA264_CLK_UPHY0_USB_P2_TX			389
+#define TEGRA264_CLK_UPHY0_USB_P3_TX			390
+#define TEGRA264_CLK_UPHY0_USB_P0_RX_IN			391
+#define TEGRA264_CLK_UPHY0_USB_P1_RX_IN			392
+#define TEGRA264_CLK_UPHY0_USB_P2_RX_IN			393
+#define TEGRA264_CLK_UPHY0_USB_P3_RX_IN			394
+#define TEGRA264_CLK_UPHY0_USB_P0_RX_M			395
+#define TEGRA264_CLK_UPHY0_USB_P1_RX_M			396
+#define TEGRA264_CLK_UPHY0_USB_P2_RX_M			397
+#define TEGRA264_CLK_UPHY0_USB_P3_RX_M			398
+#define TEGRA264_CLK_UPHY0_LANE0_TX_M			399
+#define TEGRA264_CLK_PCIE_C1_XCLK_NOBG_M		400
+#define TEGRA264_CLK_PCIE_C2_XCLK_NOBG_M		401
+#define TEGRA264_CLK_PCIE_C3_XCLK_NOBG_M		402
+#define TEGRA264_CLK_PCIE_C4_XCLK_NOBG_M		403
+#define TEGRA264_CLK_PCIE_C5_XCLK_NOBG_M		404
+#define TEGRA264_CLK_PCIE_C1_L0_RX_M			405
+#define TEGRA264_CLK_PCIE_C1_L1_RX_M			406
+#define TEGRA264_CLK_PCIE_C1_L2_RX_M			407
+#define TEGRA264_CLK_PCIE_C1_L3_RX_M			408
+#define TEGRA264_CLK_PCIE_C2_L0_RX_M			409
+#define TEGRA264_CLK_PCIE_C2_L1_RX_M			410
+#define TEGRA264_CLK_PCIE_C2_L2_RX_M			411
+#define TEGRA264_CLK_PCIE_C2_L3_RX_M			412
+#define TEGRA264_CLK_PCIE_C3_L0_RX_M			413
+#define TEGRA264_CLK_PCIE_C3_L1_RX_M			414
+#define TEGRA264_CLK_PCIE_C4_L0_RX_M			415
+#define TEGRA264_CLK_PCIE_C4_L1_RX_M			416
+#define TEGRA264_CLK_PCIE_C4_L2_RX_M			417
+#define TEGRA264_CLK_PCIE_C4_L3_RX_M			418
+#define TEGRA264_CLK_PCIE_C4_L4_RX_M			419
+#define TEGRA264_CLK_PCIE_C4_L5_RX_M			420
+#define TEGRA264_CLK_PCIE_C4_L6_RX_M			421
+#define TEGRA264_CLK_PCIE_C4_L7_RX_M			422
+#define TEGRA264_CLK_PCIE_C5_L0_RX_M			423
+#define TEGRA264_CLK_PCIE_C5_L1_RX_M			424
+#define TEGRA264_CLK_PCIE_C5_L2_RX_M			425
+#define TEGRA264_CLK_PCIE_C5_L3_RX_M			426
+#define TEGRA264_CLK_MPHY_L0_RX_PWM_BIT_M		427
+#define TEGRA264_CLK_MPHY_L1_RX_PWM_BIT_M		428
+#define TEGRA264_CLK_DBB_UPHY0				429
+#define TEGRA264_CLK_UPHY0_UXL_CORE			430
+#define TEGRA264_CLK_ISC_CPU_ROOT			431
+#define TEGRA264_CLK_ISC_NIC				432
+#define TEGRA264_CLK_CTC_TXCLK0_M			433
+#define TEGRA264_CLK_CTC_TXCLK1_M			434
+#define TEGRA264_CLK_CTC_RXCLK0_M			435
+#define TEGRA264_CLK_CTC_RXCLK1_M			436
+#define TEGRA264_CLK_PLLREFGP_OUT			437
+#define TEGRA264_CLK_PLLREFGP_OUT1			438
+#define TEGRA264_CLK_GPU_SYS				439
+#define TEGRA264_CLK_GPU_NVD				440
+#define TEGRA264_CLK_GPU_UPROC				441
+#define TEGRA264_CLK_GPU_GPC0				442
+#define TEGRA264_CLK_GPU_GPC1				443
+#define TEGRA264_CLK_GPU_GPC2				444
+#define TEGRA264_CLK_PLLX				445
+#define TEGRA264_CLK_APE_SOUNDWIRE_MSRC0		446
+#define TEGRA264_CLK_APE_SOUNDWIRE_DATA_EN_SHAPER	447
+#define TEGRA264_CLK_AO_SOUNDWIRE_MSRC0			448
+#define TEGRA264_CLK_AO_SOUNDWIRE_DATA_EN_SHAPER	449
+#define TEGRA264_CLK_MGBE0_TX_SER			459
+#define TEGRA264_CLK_MGBE1_TX_SER			460
+#define TEGRA264_CLK_MGBE2_TX_SER			461
+#define TEGRA264_CLK_MGBE3_TX_SER			462
+#define TEGRA264_CLK_MGBE0_RX_SER			463
+#define TEGRA264_CLK_MGBE1_RX_SER			464
+#define TEGRA264_CLK_MGBE2_RX_SER			465
+#define TEGRA264_CLK_MGBE3_RX_SER			466
+#define TEGRA264_CLK_DPAUX				467
+
+#endif /* DT_BINDINGS_CLOCK_NVIDIA_TEGRA264_H */
diff --git a/include/dt-bindings/reset/nvidia,tegra264.h b/include/dt-bindings/reset/nvidia,tegra264.h
new file mode 100644
index 000000000000..a61a56bb232b
--- /dev/null
+++ b/include/dt-bindings/reset/nvidia,tegra264.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/* Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved. */
+
+#ifndef DT_BINDINGS_RESET_NVIDIA_TEGRA264_H
+#define DT_BINDINGS_RESET_NVIDIA_TEGRA264_H
+
+#define TEGRA264_RESET_APE_TKE			1
+#define TEGRA264_RESET_CEC			2
+#define TEGRA264_RESET_ADSP_ALL			3
+#define TEGRA264_RESET_RCE_ALL			4
+#define TEGRA264_RESET_UFSHC			5
+#define TEGRA264_RESET_UFSHC_AXI_M		6
+#define TEGRA264_RESET_UFSHC_LP_SEQ		7
+#define TEGRA264_RESET_DPAUX			8
+#define TEGRA264_RESET_EQOS_PCS			9
+#define TEGRA264_RESET_HWPM			10
+#define TEGRA264_RESET_I2C1			11
+#define TEGRA264_RESET_I2C2			12
+#define TEGRA264_RESET_I2C3			13
+#define TEGRA264_RESET_I2C4			14
+#define TEGRA264_RESET_I2C6			15
+#define TEGRA264_RESET_I2C7			16
+#define TEGRA264_RESET_I2C8			17
+#define TEGRA264_RESET_I2C9			18
+#define TEGRA264_RESET_ISP			19
+#define TEGRA264_RESET_LA			20
+#define TEGRA264_RESET_NVCSI			21
+#define TEGRA264_RESET_EQOS_MAC			22
+#define TEGRA264_RESET_PWM10			23
+#define TEGRA264_RESET_PWM2			24
+#define TEGRA264_RESET_PWM3			25
+#define TEGRA264_RESET_PWM4			26
+#define TEGRA264_RESET_PWM5			27
+#define TEGRA264_RESET_PWM9			28
+#define TEGRA264_RESET_QSPI0			29
+#define TEGRA264_RESET_HDA			30
+#define TEGRA264_RESET_HDACODEC			31
+#define TEGRA264_RESET_I2C0			32
+#define TEGRA264_RESET_I2C10			33
+#define TEGRA264_RESET_SDMMC1			34
+#define TEGRA264_RESET_MIPI_CAL			35
+#define TEGRA264_RESET_SPI1			36
+#define TEGRA264_RESET_SPI2			37
+#define TEGRA264_RESET_SPI3			38
+#define TEGRA264_RESET_SPI4			39
+#define TEGRA264_RESET_SPI5			40
+#define TEGRA264_RESET_SPI7			41
+#define TEGRA264_RESET_SPI8			42
+#define TEGRA264_RESET_SPI9			43
+#define TEGRA264_RESET_TACH0			44
+#define TEGRA264_RESET_TSEC			45
+#define TEGRA264_RESET_VI			46
+#define TEGRA264_RESET_VI1			47
+#define TEGRA264_RESET_PVA0_ALL			48
+#define TEGRA264_RESET_VIC			49
+#define TEGRA264_RESET_MPHY_CLK_CTL		50
+#define TEGRA264_RESET_MPHY_L0_RX		51
+#define TEGRA264_RESET_MPHY_L0_TX		52
+#define TEGRA264_RESET_MPHY_L1_RX		53
+#define TEGRA264_RESET_MPHY_L1_TX		54
+#define TEGRA264_RESET_ISP1			55
+#define TEGRA264_RESET_I2C11			56
+#define TEGRA264_RESET_I2C12			57
+#define TEGRA264_RESET_I2C14			58
+#define TEGRA264_RESET_I2C15			59
+#define TEGRA264_RESET_I2C16			60
+#define TEGRA264_RESET_EQOS_MACSEC		61
+#define TEGRA264_RESET_MGBE0_PCS		62
+#define TEGRA264_RESET_MGBE0_MAC		63
+#define TEGRA264_RESET_MGBE0_MACSEC		64
+#define TEGRA264_RESET_MGBE1_PCS		65
+#define TEGRA264_RESET_MGBE1_MAC		66
+#define TEGRA264_RESET_MGBE1_MACSEC		67
+#define TEGRA264_RESET_MGBE2_PCS		68
+#define TEGRA264_RESET_MGBE2_MAC		69
+#define TEGRA264_RESET_MGBE2_MACSEC		70
+#define TEGRA264_RESET_MGBE3_PCS		71
+#define TEGRA264_RESET_MGBE3_MAC		72
+#define TEGRA264_RESET_MGBE3_MACSEC		73
+#define TEGRA264_RESET_ADSP_CORE0		74
+#define TEGRA264_RESET_ADSP_CORE1		75
+#define TEGRA264_RESET_APE			76
+#define TEGRA264_RESET_XUSB1_PADCTL		77
+#define TEGRA264_RESET_AON_CPU_ALL		78
+#define TEGRA264_RESET_AON_HSP			79
+#define TEGRA264_RESET_UART4			80
+#define TEGRA264_RESET_UART5			81
+#define TEGRA264_RESET_UART9			82
+#define TEGRA264_RESET_UART10			83
+#define TEGRA264_RESET_UART8			84
+
+#endif /* DT_BINDINGS_RESET_NVIDIA_TEGRA264_H */
-- 
cgit v1.2.3


From 19c24f7ee39af503b9731067b91add627b70ecb6 Mon Sep 17 00:00:00 2001
From: David Kaplan <david.kaplan@amd.com>
Date: Mon, 7 Jul 2025 13:32:57 -0500
Subject: cpu: Define attack vectors

Define 4 new attack vectors that are used for controlling CPU speculation
mitigations.  These may be individually disabled as part of the
mitigations= command line.  Attack vector controls are combined with
global options such as 'auto' or 'auto,nosmt', for example
'mitigations=auto,no_user_kernel'.  The global options come first in the
mitigations= string.

Cross-thread mitigations can remain fully enabled, including potentially
disabling SMT ('auto,nosmt'), remain enabled except for disabling SMT
('auto'), or be entirely disabled through the new 'no_cross_thread'
attack vector option.

The default settings for these attack vectors are consistent with existing
kernel defaults, other than the automatic disabling of VM-based attack
vectors if KVM support is not present.

Signed-off-by: David Kaplan <david.kaplan@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/20250707183316.1349127-3-david.kaplan@amd.com
---
 include/linux/cpu.h |  21 +++++++++
 kernel/cpu.c        | 130 +++++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 140 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 6378370a952f..1fb143ee1ffa 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -198,9 +198,25 @@ void cpuhp_report_idle_dead(void);
 static inline void cpuhp_report_idle_dead(void) { }
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
 
+enum cpu_attack_vectors {
+	CPU_MITIGATE_USER_KERNEL,
+	CPU_MITIGATE_USER_USER,
+	CPU_MITIGATE_GUEST_HOST,
+	CPU_MITIGATE_GUEST_GUEST,
+	NR_CPU_ATTACK_VECTORS,
+};
+
+enum smt_mitigations {
+	SMT_MITIGATIONS_OFF,
+	SMT_MITIGATIONS_AUTO,
+	SMT_MITIGATIONS_ON,
+};
+
 #ifdef CONFIG_CPU_MITIGATIONS
 extern bool cpu_mitigations_off(void);
 extern bool cpu_mitigations_auto_nosmt(void);
+extern bool cpu_attack_vector_mitigated(enum cpu_attack_vectors v);
+extern enum smt_mitigations smt_mitigations;
 #else
 static inline bool cpu_mitigations_off(void)
 {
@@ -210,6 +226,11 @@ static inline bool cpu_mitigations_auto_nosmt(void)
 {
 	return false;
 }
+static inline bool cpu_attack_vector_mitigated(enum cpu_attack_vectors v)
+{
+	return false;
+}
+#define smt_mitigations SMT_MITIGATIONS_OFF
 #endif
 
 #endif /* _LINUX_CPU_H_ */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a59e009e0be4..faf0f23fc5d8 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -37,6 +37,7 @@
 #include <linux/cpuset.h>
 #include <linux/random.h>
 #include <linux/cc_platform.h>
+#include <linux/parser.h>
 
 #include <trace/events/power.h>
 #define CREATE_TRACE_POINTS
@@ -3174,8 +3175,38 @@ void __init boot_cpu_hotplug_init(void)
 
 #ifdef CONFIG_CPU_MITIGATIONS
 /*
- * These are used for a global "mitigations=" cmdline option for toggling
- * optional CPU mitigations.
+ * All except the cross-thread attack vector are mitigated by default.
+ * Cross-thread mitigation often requires disabling SMT which is expensive
+ * so cross-thread mitigations are only partially enabled by default.
+ *
+ * Guest-to-Host and Guest-to-Guest vectors are only needed if KVM support is
+ * present.
+ */
+static bool attack_vectors[NR_CPU_ATTACK_VECTORS] __ro_after_init = {
+	[CPU_MITIGATE_USER_KERNEL] = true,
+	[CPU_MITIGATE_USER_USER] = true,
+	[CPU_MITIGATE_GUEST_HOST] = IS_ENABLED(CONFIG_KVM),
+	[CPU_MITIGATE_GUEST_GUEST] = IS_ENABLED(CONFIG_KVM),
+};
+
+bool cpu_attack_vector_mitigated(enum cpu_attack_vectors v)
+{
+	if (v < NR_CPU_ATTACK_VECTORS)
+		return attack_vectors[v];
+
+	WARN_ONCE(1, "Invalid attack vector %d\n", v);
+	return false;
+}
+
+/*
+ * There are 3 global options, 'off', 'auto', 'auto,nosmt'. These may optionally
+ * be combined with attack-vector disables which follow them.
+ *
+ * Examples:
+ *   mitigations=auto,no_user_kernel,no_user_user,no_cross_thread
+ *   mitigations=auto,nosmt,no_guest_host,no_guest_guest
+ *
+ * mitigations=off is equivalent to disabling all attack vectors.
  */
 enum cpu_mitigations {
 	CPU_MITIGATIONS_OFF,
@@ -3183,19 +3214,96 @@ enum cpu_mitigations {
 	CPU_MITIGATIONS_AUTO_NOSMT,
 };
 
+enum {
+	NO_USER_KERNEL,
+	NO_USER_USER,
+	NO_GUEST_HOST,
+	NO_GUEST_GUEST,
+	NO_CROSS_THREAD,
+	NR_VECTOR_PARAMS,
+};
+
+enum smt_mitigations smt_mitigations __ro_after_init = SMT_MITIGATIONS_AUTO;
 static enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO;
 
+static const match_table_t global_mitigations = {
+	{ CPU_MITIGATIONS_AUTO_NOSMT,	"auto,nosmt"},
+	{ CPU_MITIGATIONS_AUTO,		"auto"},
+	{ CPU_MITIGATIONS_OFF,		"off"},
+};
+
+static const match_table_t vector_mitigations = {
+	{ NO_USER_KERNEL,	"no_user_kernel"},
+	{ NO_USER_USER,		"no_user_user"},
+	{ NO_GUEST_HOST,	"no_guest_host"},
+	{ NO_GUEST_GUEST,	"no_guest_guest"},
+	{ NO_CROSS_THREAD,	"no_cross_thread"},
+	{ NR_VECTOR_PARAMS,	NULL},
+};
+
+static int __init mitigations_parse_global_opt(char *arg)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(global_mitigations); i++) {
+		const char *pattern = global_mitigations[i].pattern;
+
+		if (!strncmp(arg, pattern, strlen(pattern))) {
+			cpu_mitigations = global_mitigations[i].token;
+			return strlen(pattern);
+		}
+	}
+
+	return 0;
+}
+
 static int __init mitigations_parse_cmdline(char *arg)
 {
-	if (!strcmp(arg, "off"))
-		cpu_mitigations = CPU_MITIGATIONS_OFF;
-	else if (!strcmp(arg, "auto"))
-		cpu_mitigations = CPU_MITIGATIONS_AUTO;
-	else if (!strcmp(arg, "auto,nosmt"))
-		cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT;
-	else
-		pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n",
-			arg);
+	char *s, *p;
+	int len;
+
+	len = mitigations_parse_global_opt(arg);
+
+	if (cpu_mitigations_off()) {
+		memset(attack_vectors, 0, sizeof(attack_vectors));
+		smt_mitigations = SMT_MITIGATIONS_OFF;
+	} else if (cpu_mitigations_auto_nosmt()) {
+		smt_mitigations = SMT_MITIGATIONS_ON;
+	}
+
+	p = arg + len;
+
+	if (!*p)
+		return 0;
+
+	/* Attack vector controls may come after the ',' */
+	if (*p++ != ',' || !IS_ENABLED(CONFIG_ARCH_HAS_CPU_ATTACK_VECTORS)) {
+		pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n",	arg);
+		return 0;
+	}
+
+	while ((s = strsep(&p, ",")) != NULL) {
+		switch (match_token(s, vector_mitigations, NULL)) {
+		case NO_USER_KERNEL:
+			attack_vectors[CPU_MITIGATE_USER_KERNEL] = false;
+			break;
+		case NO_USER_USER:
+			attack_vectors[CPU_MITIGATE_USER_USER] = false;
+			break;
+		case NO_GUEST_HOST:
+			attack_vectors[CPU_MITIGATE_GUEST_HOST] = false;
+			break;
+		case NO_GUEST_GUEST:
+			attack_vectors[CPU_MITIGATE_GUEST_GUEST] = false;
+			break;
+		case NO_CROSS_THREAD:
+			smt_mitigations = SMT_MITIGATIONS_OFF;
+			break;
+		default:
+			pr_crit("Unsupported mitigations options %s\n",	s);
+			return 0;
+		}
+	}
 
 	return 0;
 }
-- 
cgit v1.2.3


From 1657624a69fcfd3f27ba6223e1c8fb6a16815568 Mon Sep 17 00:00:00 2001
From: Ricardo Ribalda <ribalda@chromium.org>
Date: Tue, 1 Jul 2025 06:45:18 +0000
Subject: media: core: export v4l2_translate_cmd

video_translate_cmd() can be useful for drivers to convert between the
VIDIOC_*32 and VIDIOC_ defines. Let's export it.

Now that the function is exported, use this opportunity to rename the
function with the v4l2_ prefix, which is less ambiguous than video_.

The VIDIOC_*32 defines are not accessible by the drivers, they live in
v4l2-compat-ioctl32.c.

Reviewed-by: Hans Verkuil <hverkuil@xs4all.nl>
Signed-off-by: Ricardo Ribalda <ribalda@chromium.org>
Link: https://lore.kernel.org/r/20250701-uvc-grannular-invert-v4-6-8003b9b89f68@chromium.org
Signed-off-by: Hans de Goede <hansg@kernel.org>
Signed-off-by: Hans Verkuil <hverkuil@xs4all.nl>
---
 drivers/media/v4l2-core/v4l2-ioctl.c | 5 +++--
 include/media/v4l2-ioctl.h           | 1 +
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index be94a79b976e..0db05f9c7117 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -3249,7 +3249,7 @@ static int check_array_args(unsigned int cmd, void *parg, size_t *array_size,
 	return ret;
 }
 
-static unsigned int video_translate_cmd(unsigned int cmd)
+unsigned int v4l2_translate_cmd(unsigned int cmd)
 {
 #if !defined(CONFIG_64BIT) && defined(CONFIG_COMPAT_32BIT_TIME)
 	switch (cmd) {
@@ -3270,6 +3270,7 @@ static unsigned int video_translate_cmd(unsigned int cmd)
 
 	return cmd;
 }
+EXPORT_SYMBOL_GPL(v4l2_translate_cmd);
 
 static int video_get_user(void __user *arg, void *parg,
 			  unsigned int real_cmd, unsigned int cmd,
@@ -3430,7 +3431,7 @@ video_usercopy(struct file *file, unsigned int orig_cmd, unsigned long arg,
 	size_t  array_size = 0;
 	void __user *user_ptr = NULL;
 	void	**kernel_ptr = NULL;
-	unsigned int cmd = video_translate_cmd(orig_cmd);
+	unsigned int cmd = v4l2_translate_cmd(orig_cmd);
 	const size_t ioc_size = _IOC_SIZE(cmd);
 
 	/*  Copy arguments into temp kernel buffer  */
diff --git a/include/media/v4l2-ioctl.h b/include/media/v4l2-ioctl.h
index c6ec87e88dfe..82695c3a300a 100644
--- a/include/media/v4l2-ioctl.h
+++ b/include/media/v4l2-ioctl.h
@@ -679,6 +679,7 @@ long int v4l2_compat_ioctl32(struct file *file, unsigned int cmd,
 #endif
 
 unsigned int v4l2_compat_translate_cmd(unsigned int cmd);
+unsigned int v4l2_translate_cmd(unsigned int cmd);
 int v4l2_compat_get_user(void __user *arg, void *parg, unsigned int cmd);
 int v4l2_compat_put_user(void __user *arg, void *parg, unsigned int cmd);
 int v4l2_compat_get_array_args(struct file *file, void *mbuf,
-- 
cgit v1.2.3


From 2ab4019aa34dc2aec4a0824fbf1e49300884fbbf Mon Sep 17 00:00:00 2001
From: Ricardo Ribalda <ribalda@chromium.org>
Date: Mon, 7 Jul 2025 18:34:04 +0000
Subject: media: uvcvideo: Introduce V4L2_META_FMT_UVC_MSXU_1_5

The UVC driver provides two metadata types V4L2_META_FMT_UVC, and
V4L2_META_FMT_D4XX. The only difference between the two of them is that
V4L2_META_FMT_UVC only copies PTS, SCR, size and flags, and
V4L2_META_FMT_D4XX copies the whole metadata section.

Now we only enable V4L2_META_FMT_D4XX for the Intel D4xx family of
devices, but it is useful to have the whole metadata payload for any
device where vendors include other metadata, such as the one described by
Microsoft:
https://learn.microsoft.com/en-us/windows-hardware/drivers/stream/mf-capture-metadata

This patch introduces a new format V4L2_META_FMT_UVC_MSXU_1_5, that is
identical to V4L2_META_FMT_D4XX.

Let the user enable this format with a quirk for now. This way they can
test if their devices provide useful metadata without rebuilding the
kernel. They can later contribute patches to auto-quirk their devices.
We will also work on methods to auto-detect devices compatible with this
new metadata format.

Suggested-by: Hans de Goede <hdegoede@redhat.com>
Reviewed-by: Hans de Goede <hansg@kernel.org>
Signed-off-by: Ricardo Ribalda <ribalda@chromium.org>
Link: https://lore.kernel.org/r/20250707-uvc-meta-v8-4-ed17f8b1218b@chromium.org
Signed-off-by: Hans de Goede <hansg@kernel.org>
Signed-off-by: Hans Verkuil <hverkuil@xs4all.nl>
---
 .../userspace-api/media/v4l/meta-formats.rst       |  1 +
 .../media/v4l/metafmt-uvc-msxu-1-5.rst             | 23 ++++++++++++++++++++++
 MAINTAINERS                                        |  1 +
 drivers/media/usb/uvc/uvc_metadata.c               |  4 ++++
 drivers/media/usb/uvc/uvcvideo.h                   |  1 +
 drivers/media/v4l2-core/v4l2-ioctl.c               |  1 +
 include/uapi/linux/videodev2.h                     |  1 +
 7 files changed, 32 insertions(+)
 create mode 100644 Documentation/userspace-api/media/v4l/metafmt-uvc-msxu-1-5.rst

(limited to 'include')

diff --git a/Documentation/userspace-api/media/v4l/meta-formats.rst b/Documentation/userspace-api/media/v4l/meta-formats.rst
index bb6876cfc271..0de80328c36b 100644
--- a/Documentation/userspace-api/media/v4l/meta-formats.rst
+++ b/Documentation/userspace-api/media/v4l/meta-formats.rst
@@ -20,6 +20,7 @@ These formats are used for the :ref:`metadata` interface only.
     metafmt-pisp-fe
     metafmt-rkisp1
     metafmt-uvc
+    metafmt-uvc-msxu-1-5
     metafmt-vivid
     metafmt-vsp1-hgo
     metafmt-vsp1-hgt
diff --git a/Documentation/userspace-api/media/v4l/metafmt-uvc-msxu-1-5.rst b/Documentation/userspace-api/media/v4l/metafmt-uvc-msxu-1-5.rst
new file mode 100644
index 000000000000..dd1c3076df24
--- /dev/null
+++ b/Documentation/userspace-api/media/v4l/metafmt-uvc-msxu-1-5.rst
@@ -0,0 +1,23 @@
+.. SPDX-License-Identifier: GFDL-1.1-no-invariants-or-later
+
+.. _v4l2-meta-fmt-uvc-msxu-1-5:
+
+***********************************
+V4L2_META_FMT_UVC_MSXU_1_5 ('UVCM')
+***********************************
+
+Microsoft(R)'s UVC Payload Metadata.
+
+
+Description
+===========
+
+V4L2_META_FMT_UVC_MSXU_1_5 buffers follow the metadata buffer layout of
+V4L2_META_FMT_UVC with the only difference that it includes all the UVC
+metadata in the `buffer[]` field, not just the first 2-12 bytes.
+
+The metadata format follows the specification from Microsoft(R) [1].
+
+.. _1:
+
+[1] https://docs.microsoft.com/en-us/windows-hardware/drivers/stream/uvc-extensions-1-5
diff --git a/MAINTAINERS b/MAINTAINERS
index 1ef99240a57e..5f69c82d53ee 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -25837,6 +25837,7 @@ S:	Maintained
 W:	http://www.ideasonboard.org/uvc/
 T:	git git://linuxtv.org/media.git
 F:	Documentation/userspace-api/media/drivers/uvcvideo.rst
+F:	Documentation/userspace-api/media/v4l/metafmt-uvc-msxu-1-5.rst
 F:	Documentation/userspace-api/media/v4l/metafmt-uvc.rst
 F:	drivers/media/common/uvc.c
 F:	drivers/media/usb/uvc/
diff --git a/drivers/media/usb/uvc/uvc_metadata.c b/drivers/media/usb/uvc/uvc_metadata.c
index 58691df60dd3..d0ee139dbda7 100644
--- a/drivers/media/usb/uvc/uvc_metadata.c
+++ b/drivers/media/usb/uvc/uvc_metadata.c
@@ -189,6 +189,10 @@ void uvc_meta_init(struct uvc_device *dev)
 	    !WARN_ON(dev->info->meta_format == V4L2_META_FMT_UVC))
 		dev->meta_formats[i++] = dev->info->meta_format;
 
+	if (dev->quirks & UVC_QUIRK_MSXU_META &&
+	    !WARN_ON(dev->info->meta_format == V4L2_META_FMT_UVC_MSXU_1_5))
+		dev->meta_formats[i++] = V4L2_META_FMT_UVC_MSXU_1_5;
+
 	 /* IMPORTANT: for new meta-formats update UVC_MAX_META_DATA_FORMATS. */
 	dev->meta_formats[i++] = 0;
 }
diff --git a/drivers/media/usb/uvc/uvcvideo.h b/drivers/media/usb/uvc/uvcvideo.h
index 81ec171fdfde..eb164d063199 100644
--- a/drivers/media/usb/uvc/uvcvideo.h
+++ b/drivers/media/usb/uvc/uvcvideo.h
@@ -77,6 +77,7 @@
 #define UVC_QUIRK_DISABLE_AUTOSUSPEND	0x00008000
 #define UVC_QUIRK_INVALID_DEVICE_SOF	0x00010000
 #define UVC_QUIRK_MJPEG_NO_EOF		0x00020000
+#define UVC_QUIRK_MSXU_META		0x00040000
 
 /* Format flags */
 #define UVC_FMT_FLAG_COMPRESSED		0x00000001
diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index 0db05f9c7117..46da373066f4 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -1463,6 +1463,7 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt)
 	case V4L2_META_FMT_VSP1_HGO:	descr = "R-Car VSP1 1-D Histogram"; break;
 	case V4L2_META_FMT_VSP1_HGT:	descr = "R-Car VSP1 2-D Histogram"; break;
 	case V4L2_META_FMT_UVC:		descr = "UVC Payload Header Metadata"; break;
+	case V4L2_META_FMT_UVC_MSXU_1_5:	descr = "UVC MSXU Metadata"; break;
 	case V4L2_META_FMT_D4XX:	descr = "Intel D4xx UVC Metadata"; break;
 	case V4L2_META_FMT_VIVID:       descr = "Vivid Metadata"; break;
 	case V4L2_META_FMT_RK_ISP1_PARAMS:	descr = "Rockchip ISP1 3A Parameters"; break;
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index 1bb1979f6c18..3dd9fa45dde1 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -867,6 +867,7 @@ struct v4l2_pix_format {
 #define V4L2_META_FMT_VSP1_HGT    v4l2_fourcc('V', 'S', 'P', 'T') /* R-Car VSP1 2-D Histogram */
 #define V4L2_META_FMT_UVC         v4l2_fourcc('U', 'V', 'C', 'H') /* UVC Payload Header metadata */
 #define V4L2_META_FMT_D4XX        v4l2_fourcc('D', '4', 'X', 'X') /* D4XX Payload Header metadata */
+#define V4L2_META_FMT_UVC_MSXU_1_5  v4l2_fourcc('U', 'V', 'C', 'M') /* UVC MSXU metadata */
 #define V4L2_META_FMT_VIVID	  v4l2_fourcc('V', 'I', 'V', 'D') /* Vivid Metadata */
 
 /* Vendor specific - used for RK_ISP1 camera sub-system */
-- 
cgit v1.2.3


From 6cb786f040ad35b23b4a7bff8b9d772f22909d48 Mon Sep 17 00:00:00 2001
From: Ricardo Ribalda <ribalda@chromium.org>
Date: Mon, 7 Jul 2025 18:34:05 +0000
Subject: media: uvcvideo: Auto-set UVC_QUIRK_MSXU_META

If the camera supports the MSXU_CONTROL_METADATA control, auto set the
MSXU_META quirk.

Reviewed-by: Hans de Goede <hansg@kernel.org>
Signed-off-by: Ricardo Ribalda <ribalda@chromium.org>
Link: https://lore.kernel.org/r/20250707-uvc-meta-v8-5-ed17f8b1218b@chromium.org
Signed-off-by: Hans de Goede <hansg@kernel.org>
Signed-off-by: Hans Verkuil <hverkuil@xs4all.nl>
---
 drivers/media/usb/uvc/uvc_driver.c   |  7 +++-
 drivers/media/usb/uvc/uvc_metadata.c | 75 +++++++++++++++++++++++++++++++++++-
 drivers/media/usb/uvc/uvcvideo.h     |  2 +-
 include/linux/usb/uvc.h              |  3 ++
 4 files changed, 84 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/media/usb/uvc/uvc_driver.c b/drivers/media/usb/uvc/uvc_driver.c
index b12c95fe8b0a..775bede0d93d 100644
--- a/drivers/media/usb/uvc/uvc_driver.c
+++ b/drivers/media/usb/uvc/uvc_driver.c
@@ -2293,7 +2293,12 @@ static int uvc_probe(struct usb_interface *intf,
 		goto error;
 	}
 
-	uvc_meta_init(dev);
+	ret = uvc_meta_init(dev);
+	if (ret < 0) {
+		dev_err(&dev->udev->dev,
+			"Error initializing the metadata formats (%d)\n", ret);
+		goto error;
+	}
 
 	if (dev->quirks & UVC_QUIRK_NO_RESET_RESUME)
 		udev->quirks &= ~USB_QUIRK_RESET_RESUME;
diff --git a/drivers/media/usb/uvc/uvc_metadata.c b/drivers/media/usb/uvc/uvc_metadata.c
index d0ee139dbda7..12972527ab8d 100644
--- a/drivers/media/usb/uvc/uvc_metadata.c
+++ b/drivers/media/usb/uvc/uvc_metadata.c
@@ -10,6 +10,7 @@
 #include <linux/list.h>
 #include <linux/module.h>
 #include <linux/usb.h>
+#include <linux/usb/uvc.h>
 #include <linux/videodev2.h>
 
 #include <media/v4l2-ioctl.h>
@@ -166,6 +167,71 @@ static const struct v4l2_file_operations uvc_meta_fops = {
 	.mmap = vb2_fop_mmap,
 };
 
+static struct uvc_entity *uvc_meta_find_msxu(struct uvc_device *dev)
+{
+	static const u8 uvc_msxu_guid[16] = UVC_GUID_MSXU_1_5;
+	struct uvc_entity *entity;
+
+	list_for_each_entry(entity, &dev->entities, list) {
+		if (!memcmp(entity->guid, uvc_msxu_guid, sizeof(entity->guid)))
+			return entity;
+	}
+
+	return NULL;
+}
+
+#define MSXU_CONTROL_METADATA 0x9
+static int uvc_meta_detect_msxu(struct uvc_device *dev)
+{
+	u32 *data __free(kfree) = NULL;
+	struct uvc_entity *entity;
+	int ret;
+
+	entity = uvc_meta_find_msxu(dev);
+	if (!entity)
+		return 0;
+
+	/*
+	 * USB requires buffers aligned in a special way, simplest way is to
+	 * make sure that query_ctrl will work is to kmalloc() them.
+	 */
+	data = kmalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	/* Check if the metadata is already enabled. */
+	ret = uvc_query_ctrl(dev, UVC_GET_CUR, entity->id, dev->intfnum,
+			     MSXU_CONTROL_METADATA, data, sizeof(*data));
+	if (ret)
+		return 0;
+
+	if (*data) {
+		dev->quirks |= UVC_QUIRK_MSXU_META;
+		return 0;
+	}
+
+	/*
+	 * We have seen devices that require 1 to enable the metadata, others
+	 * requiring a value != 1 and others requiring a value >1. Luckily for
+	 * us, the value from GET_MAX seems to work all the time.
+	 */
+	ret = uvc_query_ctrl(dev, UVC_GET_MAX, entity->id, dev->intfnum,
+			     MSXU_CONTROL_METADATA, data, sizeof(*data));
+	if (ret || !*data)
+		return 0;
+
+	/*
+	 * If we can set MSXU_CONTROL_METADATA, the device will report
+	 * metadata.
+	 */
+	ret = uvc_query_ctrl(dev, UVC_SET_CUR, entity->id, dev->intfnum,
+			     MSXU_CONTROL_METADATA, data, sizeof(*data));
+	if (!ret)
+		dev->quirks |= UVC_QUIRK_MSXU_META;
+
+	return 0;
+}
+
 int uvc_meta_register(struct uvc_streaming *stream)
 {
 	struct uvc_device *dev = stream->dev;
@@ -179,9 +245,14 @@ int uvc_meta_register(struct uvc_streaming *stream)
 					 &uvc_meta_fops, &uvc_meta_ioctl_ops);
 }
 
-void uvc_meta_init(struct uvc_device *dev)
+int uvc_meta_init(struct uvc_device *dev)
 {
 	unsigned int i = 0;
+	int ret;
+
+	ret = uvc_meta_detect_msxu(dev);
+	if (ret)
+		return ret;
 
 	dev->meta_formats[i++] = V4L2_META_FMT_UVC;
 
@@ -195,4 +266,6 @@ void uvc_meta_init(struct uvc_device *dev)
 
 	 /* IMPORTANT: for new meta-formats update UVC_MAX_META_DATA_FORMATS. */
 	dev->meta_formats[i++] = 0;
+
+	return 0;
 }
diff --git a/drivers/media/usb/uvc/uvcvideo.h b/drivers/media/usb/uvc/uvcvideo.h
index eb164d063199..b34c1914ff39 100644
--- a/drivers/media/usb/uvc/uvcvideo.h
+++ b/drivers/media/usb/uvc/uvcvideo.h
@@ -728,7 +728,7 @@ int uvc_query_ctrl(struct uvc_device *dev, u8 query, u8 unit,
 void uvc_video_clock_update(struct uvc_streaming *stream,
 			    struct vb2_v4l2_buffer *vbuf,
 			    struct uvc_buffer *buf);
-void uvc_meta_init(struct uvc_device *dev);
+int uvc_meta_init(struct uvc_device *dev);
 int uvc_meta_register(struct uvc_streaming *stream);
 
 int uvc_register_video_device(struct uvc_device *dev,
diff --git a/include/linux/usb/uvc.h b/include/linux/usb/uvc.h
index bce95153e5a6..ee19e9f915b8 100644
--- a/include/linux/usb/uvc.h
+++ b/include/linux/usb/uvc.h
@@ -29,6 +29,9 @@
 #define UVC_GUID_EXT_GPIO_CONTROLLER \
 	{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
 	 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x03}
+#define UVC_GUID_MSXU_1_5 \
+	{0xdc, 0x95, 0x3f, 0x0f, 0x32, 0x26, 0x4e, 0x4c, \
+	 0x92, 0xc9, 0xa0, 0x47, 0x82, 0xf4, 0x3b, 0xc8}
 
 #define UVC_GUID_FORMAT_MJPEG \
 	{ 'M',  'J',  'P',  'G', 0x00, 0x00, 0x10, 0x00, \
-- 
cgit v1.2.3


From 56e9a0d8e53f56f313d332888a32a44a71f3a9ab Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Wed, 9 Jul 2025 22:59:09 -0700
Subject: iommufd: Add mmap interface

For vIOMMU passing through HW resources to user space (VMs), allowing a VM
to control the passed through HW directly by accessing hardware registers,
add an mmap infrastructure to map the physical MMIO pages to user space.

Maintain a maple tree per ictx as a translation table managing mmappable
regions, from an allocated for-user mmap offset to an iommufd_mmap struct,
where it stores the real physical address range for io_remap_pfn_range().

Keep track of the lifecycle of the mmappable region by taking a refcount on
its owner, so as to force user space to unmap the region before it can
destroy its owner object.

To allow an IOMMU driver to add and delete mmappable regions onto/from the
maple tree, add iommufd_viommu_alloc/destroy_mmap helpers.

Link: https://patch.msgid.link/r/9a888a326b12aa5fe940083eae1156304e210fe0.1752126748.git.nicolinc@nvidia.com
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Pranjal Shrivastava <praan@google.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/iommufd/driver.c          | 52 +++++++++++++++++++++++++++
 drivers/iommu/iommufd/iommufd_private.h | 14 ++++++++
 drivers/iommu/iommufd/main.c            | 63 +++++++++++++++++++++++++++++++++
 include/linux/iommufd.h                 | 42 ++++++++++++++++++++++
 4 files changed, 171 insertions(+)

(limited to 'include')

diff --git a/drivers/iommu/iommufd/driver.c b/drivers/iommu/iommufd/driver.c
index e578ef32d30c..e4eae20bcd4e 100644
--- a/drivers/iommu/iommufd/driver.c
+++ b/drivers/iommu/iommufd/driver.c
@@ -31,6 +31,58 @@ void _iommufd_object_undepend(struct iommufd_object *obj_dependent,
 }
 EXPORT_SYMBOL_NS_GPL(_iommufd_object_undepend, "IOMMUFD");
 
+/*
+ * Allocate an @offset to return to user space to use for an mmap() syscall
+ *
+ * Driver should use a per-structure helper in include/linux/iommufd.h
+ */
+int _iommufd_alloc_mmap(struct iommufd_ctx *ictx, struct iommufd_object *owner,
+			phys_addr_t mmio_addr, size_t length,
+			unsigned long *offset)
+{
+	struct iommufd_mmap *immap;
+	unsigned long startp;
+	int rc;
+
+	if (!PAGE_ALIGNED(mmio_addr))
+		return -EINVAL;
+	if (!length || !PAGE_ALIGNED(length))
+		return -EINVAL;
+
+	immap = kzalloc(sizeof(*immap), GFP_KERNEL);
+	if (!immap)
+		return -ENOMEM;
+	immap->owner = owner;
+	immap->length = length;
+	immap->mmio_addr = mmio_addr;
+
+	/* Skip the first page to ease caller identifying the returned offset */
+	rc = mtree_alloc_range(&ictx->mt_mmap, &startp, immap, immap->length,
+			       PAGE_SIZE, ULONG_MAX, GFP_KERNEL);
+	if (rc < 0) {
+		kfree(immap);
+		return rc;
+	}
+
+	/* mmap() syscall will right-shift the offset in vma->vm_pgoff too */
+	immap->vm_pgoff = startp >> PAGE_SHIFT;
+	*offset = startp;
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(_iommufd_alloc_mmap, "IOMMUFD");
+
+/* Driver should use a per-structure helper in include/linux/iommufd.h */
+void _iommufd_destroy_mmap(struct iommufd_ctx *ictx,
+			   struct iommufd_object *owner, unsigned long offset)
+{
+	struct iommufd_mmap *immap;
+
+	immap = mtree_erase(&ictx->mt_mmap, offset);
+	WARN_ON_ONCE(!immap || immap->owner != owner);
+	kfree(immap);
+}
+EXPORT_SYMBOL_NS_GPL(_iommufd_destroy_mmap, "IOMMUFD");
+
 /* Caller should xa_lock(&viommu->vdevs) to protect the return value */
 struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu,
 				       unsigned long vdev_id)
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index dcd609573244..cd14163abdd1 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -7,6 +7,7 @@
 #include <linux/iommu.h>
 #include <linux/iommufd.h>
 #include <linux/iova_bitmap.h>
+#include <linux/maple_tree.h>
 #include <linux/rwsem.h>
 #include <linux/uaccess.h>
 #include <linux/xarray.h>
@@ -44,6 +45,7 @@ struct iommufd_ctx {
 	struct xarray groups;
 	wait_queue_head_t destroy_wait;
 	struct rw_semaphore ioas_creation_lock;
+	struct maple_tree mt_mmap;
 
 	struct mutex sw_msi_lock;
 	struct list_head sw_msi_list;
@@ -55,6 +57,18 @@ struct iommufd_ctx {
 	struct iommufd_ioas *vfio_ioas;
 };
 
+/* Entry for iommufd_ctx::mt_mmap */
+struct iommufd_mmap {
+	struct iommufd_object *owner;
+
+	/* Page-shifted start position in mt_mmap to validate vma->vm_pgoff */
+	unsigned long vm_pgoff;
+
+	/* Physical range for io_remap_pfn_range() */
+	phys_addr_t mmio_addr;
+	size_t length;
+};
+
 /*
  * The IOVA to PFN map. The map automatically copies the PFNs into multiple
  * domains and permits sharing of PFNs between io_pagetable instances. This
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 4e8dbbfac890..0fb81a905cb1 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -275,6 +275,7 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp)
 	xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT);
 	xa_init(&ictx->groups);
 	ictx->file = filp;
+	mt_init_flags(&ictx->mt_mmap, MT_FLAGS_ALLOC_RANGE);
 	init_waitqueue_head(&ictx->destroy_wait);
 	mutex_init(&ictx->sw_msi_lock);
 	INIT_LIST_HEAD(&ictx->sw_msi_list);
@@ -479,11 +480,73 @@ static long iommufd_fops_ioctl(struct file *filp, unsigned int cmd,
 	return ret;
 }
 
+static void iommufd_fops_vma_open(struct vm_area_struct *vma)
+{
+	struct iommufd_mmap *immap = vma->vm_private_data;
+
+	refcount_inc(&immap->owner->users);
+}
+
+static void iommufd_fops_vma_close(struct vm_area_struct *vma)
+{
+	struct iommufd_mmap *immap = vma->vm_private_data;
+
+	refcount_dec(&immap->owner->users);
+}
+
+static const struct vm_operations_struct iommufd_vma_ops = {
+	.open = iommufd_fops_vma_open,
+	.close = iommufd_fops_vma_close,
+};
+
+/* The vm_pgoff must be pre-allocated from mt_mmap, and given to user space */
+static int iommufd_fops_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct iommufd_ctx *ictx = filp->private_data;
+	size_t length = vma->vm_end - vma->vm_start;
+	struct iommufd_mmap *immap;
+	int rc;
+
+	if (!PAGE_ALIGNED(length))
+		return -EINVAL;
+	if (!(vma->vm_flags & VM_SHARED))
+		return -EINVAL;
+	if (vma->vm_flags & VM_EXEC)
+		return -EPERM;
+
+	/* vma->vm_pgoff carries a page-shifted start position to an immap */
+	immap = mtree_load(&ictx->mt_mmap, vma->vm_pgoff << PAGE_SHIFT);
+	if (!immap)
+		return -ENXIO;
+	/*
+	 * mtree_load() returns the immap for any contained mmio_addr, so only
+	 * allow the exact immap thing to be mapped
+	 */
+	if (vma->vm_pgoff != immap->vm_pgoff || length != immap->length)
+		return -ENXIO;
+
+	vma->vm_pgoff = 0;
+	vma->vm_private_data = immap;
+	vma->vm_ops = &iommufd_vma_ops;
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+	rc = io_remap_pfn_range(vma, vma->vm_start,
+				immap->mmio_addr >> PAGE_SHIFT, length,
+				vma->vm_page_prot);
+	if (rc)
+		return rc;
+
+	/* vm_ops.open won't be called for mmap itself. */
+	refcount_inc(&immap->owner->users);
+	return rc;
+}
+
 static const struct file_operations iommufd_fops = {
 	.owner = THIS_MODULE,
 	.open = iommufd_fops_open,
 	.release = iommufd_fops_release,
 	.unlocked_ioctl = iommufd_fops_ioctl,
+	.mmap = iommufd_fops_mmap,
 };
 
 /**
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index fa23439fa483..e3a0cd47384d 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -255,6 +255,11 @@ int _iommufd_object_depend(struct iommufd_object *obj_dependent,
 			   struct iommufd_object *obj_depended);
 void _iommufd_object_undepend(struct iommufd_object *obj_dependent,
 			      struct iommufd_object *obj_depended);
+int _iommufd_alloc_mmap(struct iommufd_ctx *ictx, struct iommufd_object *owner,
+			phys_addr_t mmio_addr, size_t length,
+			unsigned long *offset);
+void _iommufd_destroy_mmap(struct iommufd_ctx *ictx,
+			   struct iommufd_object *owner, unsigned long offset);
 struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu,
 				       unsigned long vdev_id);
 int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu,
@@ -275,6 +280,20 @@ _iommufd_object_undepend(struct iommufd_object *obj_dependent,
 {
 }
 
+static inline int _iommufd_alloc_mmap(struct iommufd_ctx *ictx,
+				      struct iommufd_object *owner,
+				      phys_addr_t mmio_addr, size_t length,
+				      unsigned long *offset)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void _iommufd_destroy_mmap(struct iommufd_ctx *ictx,
+					 struct iommufd_object *owner,
+					 unsigned long offset)
+{
+}
+
 static inline struct device *
 iommufd_viommu_find_dev(struct iommufd_viommu *viommu, unsigned long vdev_id)
 {
@@ -342,4 +361,27 @@ static inline int iommufd_viommu_report_event(struct iommufd_viommu *viommu,
 		_iommufd_object_undepend(&dependent->member.obj,               \
 					 &depended->member.obj);               \
 	})
+
+/*
+ * Helpers for IOMMU driver to alloc/destroy an mmapable area for a structure.
+ *
+ * To support an mmappable MMIO region, kernel driver must first register it to
+ * iommufd core to allocate an @offset, during a driver-structure initialization
+ * (e.g. viommu_init op). Then, it should report to user space this @offset and
+ * the @length of the MMIO region for mmap syscall.
+ */
+static inline int iommufd_viommu_alloc_mmap(struct iommufd_viommu *viommu,
+					    phys_addr_t mmio_addr,
+					    size_t length,
+					    unsigned long *offset)
+{
+	return _iommufd_alloc_mmap(viommu->ictx, &viommu->obj, mmio_addr,
+				   length, offset);
+}
+
+static inline void iommufd_viommu_destroy_mmap(struct iommufd_viommu *viommu,
+					       unsigned long offset)
+{
+	_iommufd_destroy_mmap(viommu->ictx, &viommu->obj, offset);
+}
 #endif
-- 
cgit v1.2.3


From 62622a8753fa6af3c104f9552863e6473b92fb31 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Wed, 9 Jul 2025 22:59:12 -0700
Subject: iommu: Allow an input type in hw_info op

The hw_info uAPI will support a bidirectional data_type field that can be
used as an input field for user space to request a specific type of info data.

To prepare for the uAPI update, change the iommu layer first:
 - Add a new IOMMU_HW_INFO_TYPE_DEFAULT as an input, for which a driver can
   output its only (or first) supported type
 - Update the kdoc accordingly
 - Roll out the type validation in the existing drivers

Link: https://patch.msgid.link/r/00f4a2d3d930721f61367014717b3ba2d1e82a81.1752126748.git.nicolinc@nvidia.com
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Pranjal Shrivastava <praan@google.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c | 4 ++++
 drivers/iommu/intel/iommu.c                         | 4 ++++
 drivers/iommu/iommufd/device.c                      | 3 +++
 drivers/iommu/iommufd/selftest.c                    | 4 ++++
 include/linux/iommu.h                               | 3 ++-
 include/uapi/linux/iommufd.h                        | 4 +++-
 6 files changed, 20 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
index 170d69162848..eb9fe1f6311a 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
@@ -15,6 +15,10 @@ void *arm_smmu_hw_info(struct device *dev, u32 *length,
 	u32 __iomem *base_idr;
 	unsigned int i;
 
+	if (*type != IOMMU_HW_INFO_TYPE_DEFAULT &&
+	    *type != IOMMU_HW_INFO_TYPE_ARM_SMMUV3)
+		return ERR_PTR(-EOPNOTSUPP);
+
 	info = kzalloc(sizeof(*info), GFP_KERNEL);
 	if (!info)
 		return ERR_PTR(-ENOMEM);
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 850f1a6f548c..5f75faffca15 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -4098,6 +4098,10 @@ static void *intel_iommu_hw_info(struct device *dev, u32 *length,
 	struct intel_iommu *iommu = info->iommu;
 	struct iommu_hw_info_vtd *vtd;
 
+	if (*type != IOMMU_HW_INFO_TYPE_DEFAULT &&
+	    *type != IOMMU_HW_INFO_TYPE_INTEL_VTD)
+		return ERR_PTR(-EOPNOTSUPP);
+
 	vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
 	if (!vtd)
 		return ERR_PTR(-ENOMEM);
diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index 0567faff5680..14955dc43892 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -1512,6 +1512,9 @@ int iommufd_get_hw_info(struct iommufd_ucmd *ucmd)
 	    cmd->__reserved[2])
 		return -EOPNOTSUPP;
 
+	/* Clear the type field since drivers don't support a random input */
+	cmd->out_data_type = IOMMU_HW_INFO_TYPE_DEFAULT;
+
 	idev = iommufd_get_device(ucmd, cmd->dev_id);
 	if (IS_ERR(idev))
 		return PTR_ERR(idev);
diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c
index 8b2c44b32530..a5dc36219a90 100644
--- a/drivers/iommu/iommufd/selftest.c
+++ b/drivers/iommu/iommufd/selftest.c
@@ -310,6 +310,10 @@ static void *mock_domain_hw_info(struct device *dev, u32 *length,
 {
 	struct iommu_test_hw_info *info;
 
+	if (*type != IOMMU_HW_INFO_TYPE_DEFAULT &&
+	    *type != IOMMU_HW_INFO_TYPE_SELFTEST)
+		return ERR_PTR(-EOPNOTSUPP);
+
 	info = kzalloc(sizeof(*info), GFP_KERNEL);
 	if (!info)
 		return ERR_PTR(-ENOMEM);
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index e06a0fbe4bc7..e8b59ef54e48 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -603,7 +603,8 @@ __iommu_copy_struct_to_user(const struct iommu_user_data *dst_data,
  * @capable: check capability
  * @hw_info: report iommu hardware information. The data buffer returned by this
  *           op is allocated in the iommu driver and freed by the caller after
- *           use.
+ *           use. @type can input a requested type and output a supported type.
+ *           Driver should reject an unsupported data @type input
  * @domain_alloc: Do not use in new drivers
  * @domain_alloc_identity: allocate an IDENTITY domain. Drivers should prefer to
  *                         use identity_domain instead. This should only be used
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index b928c1ed2395..9c8c304b5de2 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -593,13 +593,15 @@ struct iommu_hw_info_arm_smmuv3 {
 
 /**
  * enum iommu_hw_info_type - IOMMU Hardware Info Types
- * @IOMMU_HW_INFO_TYPE_NONE: Used by the drivers that do not report hardware
+ * @IOMMU_HW_INFO_TYPE_NONE: Output by the drivers that do not report hardware
  *                           info
+ * @IOMMU_HW_INFO_TYPE_DEFAULT: Input to request for a default type
  * @IOMMU_HW_INFO_TYPE_INTEL_VTD: Intel VT-d iommu info type
  * @IOMMU_HW_INFO_TYPE_ARM_SMMUV3: ARM SMMUv3 iommu info type
  */
 enum iommu_hw_info_type {
 	IOMMU_HW_INFO_TYPE_NONE = 0,
+	IOMMU_HW_INFO_TYPE_DEFAULT = 0,
 	IOMMU_HW_INFO_TYPE_INTEL_VTD = 1,
 	IOMMU_HW_INFO_TYPE_ARM_SMMUV3 = 2,
 };
-- 
cgit v1.2.3


From a9f10bab2e5084d6746391fccd7bef6ac87620b8 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Wed, 9 Jul 2025 22:59:13 -0700
Subject: iommufd: Allow an input data_type via iommu_hw_info

The iommu_hw_info can output via the out_data_type field the vendor data
type from a driver, but this only allows a driver to report one data type.

Now, with SMMUv3 having a Tegra241 CMDQV implementation, it has two sets
of types and data structs to report.

One way to support that is to use the same type field bidirectionally.

Reuse the same field by adding an "in_data_type", allowing user space to
request a specific type and to get the corresponding data.

For backward compatibility, since the ioctl handler has never checked an
input value, add an IOMMU_HW_INFO_FLAG_INPUT_TYPE to switch between the
old output-only field and the new bidirectional field.

Link: https://patch.msgid.link/r/887378a7167e1786d9d13cde0c36263ed61823d7.1752126748.git.nicolinc@nvidia.com
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Pranjal Shrivastava <praan@google.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/iommufd/device.c |  9 ++++++---
 include/uapi/linux/iommufd.h   | 20 +++++++++++++++++++-
 2 files changed, 25 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index 14955dc43892..e2ba21c43ad2 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -1499,6 +1499,7 @@ EXPORT_SYMBOL_NS_GPL(iommufd_access_rw, "IOMMUFD");
 
 int iommufd_get_hw_info(struct iommufd_ucmd *ucmd)
 {
+	const u32 SUPPORTED_FLAGS = IOMMU_HW_INFO_FLAG_INPUT_TYPE;
 	struct iommu_hw_info *cmd = ucmd->cmd;
 	void __user *user_ptr = u64_to_user_ptr(cmd->data_uptr);
 	const struct iommu_ops *ops;
@@ -1508,12 +1509,14 @@ int iommufd_get_hw_info(struct iommufd_ucmd *ucmd)
 	void *data;
 	int rc;
 
-	if (cmd->flags || cmd->__reserved[0] || cmd->__reserved[1] ||
-	    cmd->__reserved[2])
+	if (cmd->flags & ~SUPPORTED_FLAGS)
+		return -EOPNOTSUPP;
+	if (cmd->__reserved[0] || cmd->__reserved[1] || cmd->__reserved[2])
 		return -EOPNOTSUPP;
 
 	/* Clear the type field since drivers don't support a random input */
-	cmd->out_data_type = IOMMU_HW_INFO_TYPE_DEFAULT;
+	if (!(cmd->flags & IOMMU_HW_INFO_FLAG_INPUT_TYPE))
+		cmd->in_data_type = IOMMU_HW_INFO_TYPE_DEFAULT;
 
 	idev = iommufd_get_device(ucmd, cmd->dev_id);
 	if (IS_ERR(idev))
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 9c8c304b5de2..32ee02380912 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -628,6 +628,15 @@ enum iommufd_hw_capabilities {
 	IOMMU_HW_CAP_PCI_PASID_PRIV = 1 << 2,
 };
 
+/**
+ * enum iommufd_hw_info_flags - Flags for iommu_hw_info
+ * @IOMMU_HW_INFO_FLAG_INPUT_TYPE: If set, @in_data_type carries an input type
+ *                                 for user space to request for a specific info
+ */
+enum iommufd_hw_info_flags {
+	IOMMU_HW_INFO_FLAG_INPUT_TYPE = 1 << 0,
+};
+
 /**
  * struct iommu_hw_info - ioctl(IOMMU_GET_HW_INFO)
  * @size: sizeof(struct iommu_hw_info)
@@ -637,6 +646,12 @@ enum iommufd_hw_capabilities {
  *            data that kernel supports
  * @data_uptr: User pointer to a user-space buffer used by the kernel to fill
  *             the iommu type specific hardware information data
+ * @in_data_type: This shares the same field with @out_data_type, making it be
+ *                a bidirectional field. When IOMMU_HW_INFO_FLAG_INPUT_TYPE is
+ *                set, an input type carried via this @in_data_type field will
+ *                be valid, requesting for the info data to the given type. If
+ *                IOMMU_HW_INFO_FLAG_INPUT_TYPE is unset, any input value will
+ *                be seen as IOMMU_HW_INFO_TYPE_DEFAULT
  * @out_data_type: Output the iommu hardware info type as defined in the enum
  *                 iommu_hw_info_type.
  * @out_capabilities: Output the generic iommu capability info type as defined
@@ -666,7 +681,10 @@ struct iommu_hw_info {
 	__u32 dev_id;
 	__u32 data_len;
 	__aligned_u64 data_uptr;
-	__u32 out_data_type;
+	union {
+		__u32 in_data_type;
+		__u32 out_data_type;
+	};
 	__u8 out_max_pasid_log2;
 	__u8 __reserved[3];
 	__aligned_u64 out_capabilities;
-- 
cgit v1.2.3


From 4dc0d12474f9d4833c3dd96b73d61e406d3f5dc7 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Wed, 9 Jul 2025 22:59:20 -0700
Subject: iommu/tegra241-cmdqv: Add user-space use support

The CMDQV HW supports a user-space use for virtualization cases. It allows
the VM to issue guest-level TLBI or ATC_INV commands directly to the queue
and executes them without a VMEXIT, as HW will replace the VMID field in a
TLBI command and the SID field in an ATC_INV command with the preset VMID
and SID.

This is built upon the vIOMMU infrastructure by allowing VMM to allocate a
VINTF (as a vIOMMU object) and assign VCMDQs (HW QUEUE objs) to the VINTF.

So firstly, replace the standard vSMMU model with the VINTF implementation
but reuse the standard cache_invalidate op (for unsupported commands) and
the standard alloc_domain_nested op (for standard nested STE).

Each VINTF has two 64KB MMIO pages (128B per logical VCMDQ):
 - Page0 (directly accessed by guest) has all the control and status bits.
 - Page1 (trapped by VMM) has guest-owned queue memory location/size info.

VMM should trap the emulated VINTF0's page1 of the guest VM for the guest-
level VCMDQ location/size info and forward that to the kernel to translate
to a physical memory location to program the VCMDQ HW during an allocation
call. Then, it should mmap the assigned VINTF's page0 to the VINTF0 page0
of the guest VM. This allows the guest OS to read and write the guest-owned
VINTF's page0 for direct control of the VCMDQ HW.

For ATC invalidation commands that hold an SID, it requires all devices to
register their virtual SIDs to the SID_MATCH registers and their physical
SIDs to the pairing SID_REPLACE registers, so that HW can use those as a
lookup table to replace those virtual SIDs with the correct physical SIDs.
Thus, implement the driver-allocated vDEVICE op with a tegra241_vintf_sid
structure to allocate SID_REPLACE and to program the SIDs accordingly.

This enables the HW accelerated feature for NVIDIA Grace CPU. Compared to
the standard SMMUv3 operating in the nested translation mode trapping CMDQ
for TLBI and ATC_INV commands, this gives a huge performance improvement:
70% to 90% reductions of invalidation time were measured by various DMA
unmap tests running in a guest OS.

Link: https://patch.msgid.link/r/fb0eab83f529440b6aa181798912a6f0afa21eb0.1752126748.git.nicolinc@nvidia.com
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Pranjal Shrivastava <praan@google.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 .../iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c    |   6 +-
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h        |   7 +
 drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c     | 400 ++++++++++++++++++++-
 include/uapi/linux/iommufd.h                       |  59 +++
 4 files changed, 466 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
index 1cf9646e776f..d9bea8f1f636 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
@@ -225,7 +225,7 @@ static int arm_smmu_validate_vste(struct iommu_hwpt_arm_smmuv3 *arg,
 	return 0;
 }
 
-static struct iommu_domain *
+struct iommu_domain *
 arm_vsmmu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
 			      const struct iommu_user_data *user_data)
 {
@@ -336,8 +336,8 @@ static int arm_vsmmu_convert_user_cmd(struct arm_vsmmu *vsmmu,
 	return 0;
 }
 
-static int arm_vsmmu_cache_invalidate(struct iommufd_viommu *viommu,
-				      struct iommu_user_data_array *array)
+int arm_vsmmu_cache_invalidate(struct iommufd_viommu *viommu,
+			       struct iommu_user_data_array *array)
 {
 	struct arm_vsmmu *vsmmu = container_of(viommu, struct arm_vsmmu, core);
 	struct arm_smmu_device *smmu = vsmmu->smmu;
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index b7c7fdef531e..3fa02c51df9f 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -1057,10 +1057,17 @@ int arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state,
 void arm_smmu_attach_commit_vmaster(struct arm_smmu_attach_state *state);
 void arm_smmu_master_clear_vmaster(struct arm_smmu_master *master);
 int arm_vmaster_report_event(struct arm_smmu_vmaster *vmaster, u64 *evt);
+struct iommu_domain *
+arm_vsmmu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
+			      const struct iommu_user_data *user_data);
+int arm_vsmmu_cache_invalidate(struct iommufd_viommu *viommu,
+			       struct iommu_user_data_array *array);
 #else
 #define arm_smmu_get_viommu_size NULL
 #define arm_smmu_hw_info NULL
 #define arm_vsmmu_init NULL
+#define arm_vsmmu_alloc_domain_nested NULL
+#define arm_vsmmu_cache_invalidate NULL
 
 static inline int
 arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state,
diff --git a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
index 869c90b660c1..3eeb8444fadf 100644
--- a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
+++ b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
@@ -8,7 +8,9 @@
 #include <linux/dma-mapping.h>
 #include <linux/interrupt.h>
 #include <linux/iommu.h>
+#include <linux/iommufd.h>
 #include <linux/iopoll.h>
+#include <uapi/linux/iommufd.h>
 
 #include <acpi/acpixf.h>
 
@@ -26,8 +28,10 @@
 #define  CMDQV_EN			BIT(0)
 
 #define TEGRA241_CMDQV_PARAM		0x0004
+#define  CMDQV_NUM_SID_PER_VM_LOG2	GENMASK(15, 12)
 #define  CMDQV_NUM_VINTF_LOG2		GENMASK(11, 8)
 #define  CMDQV_NUM_VCMDQ_LOG2		GENMASK(7, 4)
+#define  CMDQV_VER			GENMASK(3, 0)
 
 #define TEGRA241_CMDQV_STATUS		0x0008
 #define  CMDQV_ENABLED			BIT(0)
@@ -53,6 +57,9 @@
 #define  VINTF_STATUS			GENMASK(3, 1)
 #define  VINTF_ENABLED			BIT(0)
 
+#define TEGRA241_VINTF_SID_MATCH(s)	(0x0040 + 0x4*(s))
+#define TEGRA241_VINTF_SID_REPLACE(s)	(0x0080 + 0x4*(s))
+
 #define TEGRA241_VINTF_LVCMDQ_ERR_MAP_64(m) \
 					(0x00C0 + 0x8*(m))
 #define  LVCMDQ_ERR_MAP_NUM_64		2
@@ -114,16 +121,20 @@ MODULE_PARM_DESC(bypass_vcmdq,
 
 /**
  * struct tegra241_vcmdq - Virtual Command Queue
+ * @core: Embedded iommufd_hw_queue structure
  * @idx: Global index in the CMDQV
  * @lidx: Local index in the VINTF
  * @enabled: Enable status
  * @cmdqv: Parent CMDQV pointer
  * @vintf: Parent VINTF pointer
+ * @prev: Previous LVCMDQ to depend on
  * @cmdq: Command Queue struct
  * @page0: MMIO Page0 base address
  * @page1: MMIO Page1 base address
  */
 struct tegra241_vcmdq {
+	struct iommufd_hw_queue core;
+
 	u16 idx;
 	u16 lidx;
 
@@ -131,22 +142,30 @@ struct tegra241_vcmdq {
 
 	struct tegra241_cmdqv *cmdqv;
 	struct tegra241_vintf *vintf;
+	struct tegra241_vcmdq *prev;
 	struct arm_smmu_cmdq cmdq;
 
 	void __iomem *page0;
 	void __iomem *page1;
 };
+#define hw_queue_to_vcmdq(v) container_of(v, struct tegra241_vcmdq, core)
 
 /**
  * struct tegra241_vintf - Virtual Interface
+ * @vsmmu: Embedded arm_vsmmu structure
  * @idx: Global index in the CMDQV
  * @enabled: Enable status
  * @hyp_own: Owned by hypervisor (in-kernel)
  * @cmdqv: Parent CMDQV pointer
  * @lvcmdqs: List of logical VCMDQ pointers
+ * @lvcmdq_mutex: Lock to serialize user-allocated lvcmdqs
  * @base: MMIO base address
+ * @mmap_offset: Offset argument for mmap() syscall
+ * @sids: Stream ID mapping resources
  */
 struct tegra241_vintf {
+	struct arm_vsmmu vsmmu;
+
 	u16 idx;
 
 	bool enabled;
@@ -154,19 +173,41 @@ struct tegra241_vintf {
 
 	struct tegra241_cmdqv *cmdqv;
 	struct tegra241_vcmdq **lvcmdqs;
+	struct mutex lvcmdq_mutex; /* user space race */
 
 	void __iomem *base;
+	unsigned long mmap_offset;
+
+	struct ida sids;
+};
+#define viommu_to_vintf(v) container_of(v, struct tegra241_vintf, vsmmu.core)
+
+/**
+ * struct tegra241_vintf_sid - Virtual Interface Stream ID Mapping
+ * @core: Embedded iommufd_vdevice structure, holding virtual Stream ID
+ * @vintf: Parent VINTF pointer
+ * @sid: Physical Stream ID
+ * @idx: Mapping index in the VINTF
+ */
+struct tegra241_vintf_sid {
+	struct iommufd_vdevice core;
+	struct tegra241_vintf *vintf;
+	u32 sid;
+	u8 idx;
 };
+#define vdev_to_vsid(v) container_of(v, struct tegra241_vintf_sid, core)
 
 /**
  * struct tegra241_cmdqv - CMDQ-V for SMMUv3
  * @smmu: SMMUv3 device
  * @dev: CMDQV device
  * @base: MMIO base address
+ * @base_phys: MMIO physical base address, for mmap
  * @irq: IRQ number
  * @num_vintfs: Total number of VINTFs
  * @num_vcmdqs: Total number of VCMDQs
  * @num_lvcmdqs_per_vintf: Number of logical VCMDQs per VINTF
+ * @num_sids_per_vintf: Total number of SID mappings per VINTF
  * @vintf_ids: VINTF id allocator
  * @vintfs: List of VINTFs
  */
@@ -175,12 +216,14 @@ struct tegra241_cmdqv {
 	struct device *dev;
 
 	void __iomem *base;
+	phys_addr_t base_phys;
 	int irq;
 
 	/* CMDQV Hardware Params */
 	u16 num_vintfs;
 	u16 num_vcmdqs;
 	u16 num_lvcmdqs_per_vintf;
+	u16 num_sids_per_vintf;
 
 	struct ida vintf_ids;
 
@@ -351,6 +394,29 @@ tegra241_cmdqv_get_cmdq(struct arm_smmu_device *smmu,
 
 /* HW Reset Functions */
 
+/*
+ * When a guest-owned VCMDQ is disabled, if the guest did not enqueue a CMD_SYNC
+ * following an ATC_INV command at the end of the guest queue while this ATC_INV
+ * is timed out, the TIMEOUT will not be reported until this VCMDQ gets assigned
+ * to the next VM, which will be a false alarm potentially causing some unwanted
+ * behavior in the new VM. Thus, a guest-owned VCMDQ must flush the TIMEOUT when
+ * it gets disabled. This can be done by just issuing a CMD_SYNC to SMMU CMDQ.
+ */
+static void tegra241_vcmdq_hw_flush_timeout(struct tegra241_vcmdq *vcmdq)
+{
+	struct arm_smmu_device *smmu = &vcmdq->cmdqv->smmu;
+	u64 cmd_sync[CMDQ_ENT_DWORDS] = {};
+
+	cmd_sync[0] = FIELD_PREP(CMDQ_0_OP, CMDQ_OP_CMD_SYNC) |
+		      FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_NONE);
+
+	/*
+	 * It does not hurt to insert another CMD_SYNC, taking advantage of the
+	 * arm_smmu_cmdq_issue_cmdlist() that waits for the CMD_SYNC completion.
+	 */
+	arm_smmu_cmdq_issue_cmdlist(smmu, &smmu->cmdq, cmd_sync, 1, true);
+}
+
 /* This function is for LVCMDQ, so @vcmdq must not be unmapped yet */
 static void tegra241_vcmdq_hw_deinit(struct tegra241_vcmdq *vcmdq)
 {
@@ -364,6 +430,8 @@ static void tegra241_vcmdq_hw_deinit(struct tegra241_vcmdq *vcmdq)
 			readl_relaxed(REG_VCMDQ_PAGE0(vcmdq, GERROR)),
 			readl_relaxed(REG_VCMDQ_PAGE0(vcmdq, CONS)));
 	}
+	tegra241_vcmdq_hw_flush_timeout(vcmdq);
+
 	writel_relaxed(0, REG_VCMDQ_PAGE0(vcmdq, PROD));
 	writel_relaxed(0, REG_VCMDQ_PAGE0(vcmdq, CONS));
 	writeq_relaxed(0, REG_VCMDQ_PAGE1(vcmdq, BASE));
@@ -420,6 +488,7 @@ static void tegra241_vcmdq_unmap_lvcmdq(struct tegra241_vcmdq *vcmdq)
 static void tegra241_vintf_hw_deinit(struct tegra241_vintf *vintf)
 {
 	u16 lidx = vintf->cmdqv->num_lvcmdqs_per_vintf;
+	int sidx;
 
 	/* HW requires to unmap LVCMDQs in descending order */
 	while (lidx--) {
@@ -429,6 +498,10 @@ static void tegra241_vintf_hw_deinit(struct tegra241_vintf *vintf)
 		}
 	}
 	vintf_write_config(vintf, 0);
+	for (sidx = 0; sidx < vintf->cmdqv->num_sids_per_vintf; sidx++) {
+		writel(0, REG_VINTF(vintf, SID_MATCH(sidx)));
+		writel(0, REG_VINTF(vintf, SID_REPLACE(sidx)));
+	}
 }
 
 /* Map a global VCMDQ to the pre-assigned LVCMDQ */
@@ -457,7 +530,8 @@ static int tegra241_vintf_hw_init(struct tegra241_vintf *vintf, bool hyp_own)
 	 * whether enabling it here or not, as !HYP_OWN cmdq HWs only support a
 	 * restricted set of supported commands.
 	 */
-	regval = FIELD_PREP(VINTF_HYP_OWN, hyp_own);
+	regval = FIELD_PREP(VINTF_HYP_OWN, hyp_own) |
+		 FIELD_PREP(VINTF_VMID, vintf->vsmmu.vmid);
 	writel(regval, REG_VINTF(vintf, CONFIG));
 
 	ret = vintf_write_config(vintf, regval | VINTF_EN);
@@ -584,7 +658,9 @@ static void tegra241_vintf_free_lvcmdq(struct tegra241_vintf *vintf, u16 lidx)
 
 	dev_dbg(vintf->cmdqv->dev,
 		"%sdeallocated\n", lvcmdq_error_header(vcmdq, header, 64));
-	kfree(vcmdq);
+	/* Guest-owned VCMDQ is free-ed with hw_queue by iommufd core */
+	if (vcmdq->vintf->hyp_own)
+		kfree(vcmdq);
 }
 
 static struct tegra241_vcmdq *
@@ -671,7 +747,13 @@ static void tegra241_cmdqv_remove_vintf(struct tegra241_cmdqv *cmdqv, u16 idx)
 
 	dev_dbg(cmdqv->dev, "VINTF%u: deallocated\n", vintf->idx);
 	tegra241_cmdqv_deinit_vintf(cmdqv, idx);
-	kfree(vintf);
+	if (!vintf->hyp_own) {
+		mutex_destroy(&vintf->lvcmdq_mutex);
+		ida_destroy(&vintf->sids);
+		/* Guest-owned VINTF is free-ed with viommu by iommufd core */
+	} else {
+		kfree(vintf);
+	}
 }
 
 static void tegra241_cmdqv_remove(struct arm_smmu_device *smmu)
@@ -699,10 +781,45 @@ static void tegra241_cmdqv_remove(struct arm_smmu_device *smmu)
 	put_device(cmdqv->dev); /* smmu->impl_dev */
 }
 
+static int
+tegra241_cmdqv_init_vintf_user(struct arm_vsmmu *vsmmu,
+			       const struct iommu_user_data *user_data);
+
+static void *tegra241_cmdqv_hw_info(struct arm_smmu_device *smmu, u32 *length,
+				    enum iommu_hw_info_type *type)
+{
+	struct tegra241_cmdqv *cmdqv =
+		container_of(smmu, struct tegra241_cmdqv, smmu);
+	struct iommu_hw_info_tegra241_cmdqv *info;
+	u32 regval;
+
+	if (*type != IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return ERR_PTR(-ENOMEM);
+
+	regval = readl_relaxed(REG_CMDQV(cmdqv, PARAM));
+	info->log2vcmdqs = ilog2(cmdqv->num_lvcmdqs_per_vintf);
+	info->log2vsids = ilog2(cmdqv->num_sids_per_vintf);
+	info->version = FIELD_GET(CMDQV_VER, regval);
+
+	*length = sizeof(*info);
+	*type = IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV;
+	return info;
+}
+
 static struct arm_smmu_impl_ops tegra241_cmdqv_impl_ops = {
+	/* For in-kernel use */
 	.get_secondary_cmdq = tegra241_cmdqv_get_cmdq,
 	.device_reset = tegra241_cmdqv_hw_reset,
 	.device_remove = tegra241_cmdqv_remove,
+	/* For user-space use */
+	.hw_info = tegra241_cmdqv_hw_info,
+	.vsmmu_size = VIOMMU_STRUCT_SIZE(struct tegra241_vintf, vsmmu.core),
+	.vsmmu_type = IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV,
+	.vsmmu_init = tegra241_cmdqv_init_vintf_user,
 };
 
 /* Probe Functions */
@@ -844,6 +961,7 @@ __tegra241_cmdqv_probe(struct arm_smmu_device *smmu, struct resource *res,
 	cmdqv->irq = irq;
 	cmdqv->base = base;
 	cmdqv->dev = smmu->impl_dev;
+	cmdqv->base_phys = res->start;
 
 	if (cmdqv->irq > 0) {
 		ret = request_threaded_irq(irq, NULL, tegra241_cmdqv_isr,
@@ -860,6 +978,8 @@ __tegra241_cmdqv_probe(struct arm_smmu_device *smmu, struct resource *res,
 	cmdqv->num_vintfs = 1 << FIELD_GET(CMDQV_NUM_VINTF_LOG2, regval);
 	cmdqv->num_vcmdqs = 1 << FIELD_GET(CMDQV_NUM_VCMDQ_LOG2, regval);
 	cmdqv->num_lvcmdqs_per_vintf = cmdqv->num_vcmdqs / cmdqv->num_vintfs;
+	cmdqv->num_sids_per_vintf =
+		1 << FIELD_GET(CMDQV_NUM_SID_PER_VM_LOG2, regval);
 
 	cmdqv->vintfs =
 		kcalloc(cmdqv->num_vintfs, sizeof(*cmdqv->vintfs), GFP_KERNEL);
@@ -913,3 +1033,277 @@ out_fallback:
 	put_device(smmu->impl_dev);
 	return ERR_PTR(-ENODEV);
 }
+
+/* User space VINTF and VCMDQ Functions */
+
+static size_t tegra241_vintf_get_vcmdq_size(struct iommufd_viommu *viommu,
+					    enum iommu_hw_queue_type queue_type)
+{
+	if (queue_type != IOMMU_HW_QUEUE_TYPE_TEGRA241_CMDQV)
+		return 0;
+	return HW_QUEUE_STRUCT_SIZE(struct tegra241_vcmdq, core);
+}
+
+static int tegra241_vcmdq_hw_init_user(struct tegra241_vcmdq *vcmdq)
+{
+	char header[64];
+
+	/* Configure the vcmdq only; User space does the enabling */
+	writeq_relaxed(vcmdq->cmdq.q.q_base, REG_VCMDQ_PAGE1(vcmdq, BASE));
+
+	dev_dbg(vcmdq->cmdqv->dev, "%sinited at host PA 0x%llx size 0x%lx\n",
+		lvcmdq_error_header(vcmdq, header, 64),
+		vcmdq->cmdq.q.q_base & VCMDQ_ADDR,
+		1UL << (vcmdq->cmdq.q.q_base & VCMDQ_LOG2SIZE));
+	return 0;
+}
+
+static void
+tegra241_vintf_destroy_lvcmdq_user(struct iommufd_hw_queue *hw_queue)
+{
+	struct tegra241_vcmdq *vcmdq = hw_queue_to_vcmdq(hw_queue);
+
+	mutex_lock(&vcmdq->vintf->lvcmdq_mutex);
+	tegra241_vcmdq_hw_deinit(vcmdq);
+	tegra241_vcmdq_unmap_lvcmdq(vcmdq);
+	tegra241_vintf_free_lvcmdq(vcmdq->vintf, vcmdq->lidx);
+	if (vcmdq->prev)
+		iommufd_hw_queue_undepend(vcmdq, vcmdq->prev, core);
+	mutex_unlock(&vcmdq->vintf->lvcmdq_mutex);
+}
+
+static int tegra241_vintf_alloc_lvcmdq_user(struct iommufd_hw_queue *hw_queue,
+					    u32 lidx, phys_addr_t base_addr_pa)
+{
+	struct tegra241_vintf *vintf = viommu_to_vintf(hw_queue->viommu);
+	struct tegra241_vcmdq *vcmdq = hw_queue_to_vcmdq(hw_queue);
+	struct tegra241_cmdqv *cmdqv = vintf->cmdqv;
+	struct arm_smmu_device *smmu = &cmdqv->smmu;
+	struct tegra241_vcmdq *prev = NULL;
+	u32 log2size, max_n_shift;
+	char header[64];
+	int ret;
+
+	if (hw_queue->type != IOMMU_HW_QUEUE_TYPE_TEGRA241_CMDQV)
+		return -EOPNOTSUPP;
+	if (lidx >= cmdqv->num_lvcmdqs_per_vintf)
+		return -EINVAL;
+
+	mutex_lock(&vintf->lvcmdq_mutex);
+
+	if (vintf->lvcmdqs[lidx]) {
+		ret = -EEXIST;
+		goto unlock;
+	}
+
+	/*
+	 * HW requires to map LVCMDQs in ascending order, so reject if the
+	 * previous lvcmdqs is not allocated yet.
+	 */
+	if (lidx) {
+		prev = vintf->lvcmdqs[lidx - 1];
+		if (!prev) {
+			ret = -EIO;
+			goto unlock;
+		}
+	}
+
+	/*
+	 * hw_queue->length must be a power of 2, in range of
+	 *   [ 32, 2 ^ (idr[1].CMDQS + CMDQ_ENT_SZ_SHIFT) ]
+	 */
+	max_n_shift = FIELD_GET(IDR1_CMDQS,
+				readl_relaxed(smmu->base + ARM_SMMU_IDR1));
+	if (!is_power_of_2(hw_queue->length) || hw_queue->length < 32 ||
+	    hw_queue->length > (1 << (max_n_shift + CMDQ_ENT_SZ_SHIFT))) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+	log2size = ilog2(hw_queue->length) - CMDQ_ENT_SZ_SHIFT;
+
+	/* base_addr_pa must be aligned to hw_queue->length */
+	if (base_addr_pa & ~VCMDQ_ADDR ||
+	    base_addr_pa & (hw_queue->length - 1)) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	/*
+	 * HW requires to unmap LVCMDQs in descending order, so destroy() must
+	 * follow this rule. Set a dependency on its previous LVCMDQ so iommufd
+	 * core will help enforce it.
+	 */
+	if (prev) {
+		ret = iommufd_hw_queue_depend(vcmdq, prev, core);
+		if (ret)
+			goto unlock;
+	}
+	vcmdq->prev = prev;
+
+	ret = tegra241_vintf_init_lvcmdq(vintf, lidx, vcmdq);
+	if (ret)
+		goto undepend_vcmdq;
+
+	dev_dbg(cmdqv->dev, "%sallocated\n",
+		lvcmdq_error_header(vcmdq, header, 64));
+
+	tegra241_vcmdq_map_lvcmdq(vcmdq);
+
+	vcmdq->cmdq.q.q_base = base_addr_pa & VCMDQ_ADDR;
+	vcmdq->cmdq.q.q_base |= log2size;
+
+	ret = tegra241_vcmdq_hw_init_user(vcmdq);
+	if (ret)
+		goto unmap_lvcmdq;
+
+	hw_queue->destroy = &tegra241_vintf_destroy_lvcmdq_user;
+	mutex_unlock(&vintf->lvcmdq_mutex);
+	return 0;
+
+unmap_lvcmdq:
+	tegra241_vcmdq_unmap_lvcmdq(vcmdq);
+	tegra241_vintf_deinit_lvcmdq(vintf, lidx);
+undepend_vcmdq:
+	if (vcmdq->prev)
+		iommufd_hw_queue_undepend(vcmdq, vcmdq->prev, core);
+unlock:
+	mutex_unlock(&vintf->lvcmdq_mutex);
+	return ret;
+}
+
+static void tegra241_cmdqv_destroy_vintf_user(struct iommufd_viommu *viommu)
+{
+	struct tegra241_vintf *vintf = viommu_to_vintf(viommu);
+
+	if (vintf->mmap_offset)
+		iommufd_viommu_destroy_mmap(&vintf->vsmmu.core,
+					    vintf->mmap_offset);
+	tegra241_cmdqv_remove_vintf(vintf->cmdqv, vintf->idx);
+}
+
+static void tegra241_vintf_destroy_vsid(struct iommufd_vdevice *vdev)
+{
+	struct tegra241_vintf_sid *vsid = vdev_to_vsid(vdev);
+	struct tegra241_vintf *vintf = vsid->vintf;
+
+	writel(0, REG_VINTF(vintf, SID_MATCH(vsid->idx)));
+	writel(0, REG_VINTF(vintf, SID_REPLACE(vsid->idx)));
+	ida_free(&vintf->sids, vsid->idx);
+	dev_dbg(vintf->cmdqv->dev,
+		"VINTF%u: deallocated SID_REPLACE%d for pSID=%x\n", vintf->idx,
+		vsid->idx, vsid->sid);
+}
+
+static int tegra241_vintf_init_vsid(struct iommufd_vdevice *vdev)
+{
+	struct arm_smmu_master *master = dev_iommu_priv_get(vdev->dev);
+	struct tegra241_vintf *vintf = viommu_to_vintf(vdev->viommu);
+	struct tegra241_vintf_sid *vsid = vdev_to_vsid(vdev);
+	struct arm_smmu_stream *stream = &master->streams[0];
+	u64 virt_sid = vdev->virt_id;
+	int sidx;
+
+	if (virt_sid > UINT_MAX)
+		return -EINVAL;
+
+	WARN_ON_ONCE(master->num_streams != 1);
+
+	/* Find an empty pair of SID_REPLACE and SID_MATCH */
+	sidx = ida_alloc_max(&vintf->sids, vintf->cmdqv->num_sids_per_vintf - 1,
+			     GFP_KERNEL);
+	if (sidx < 0)
+		return sidx;
+
+	writel(stream->id, REG_VINTF(vintf, SID_REPLACE(sidx)));
+	writel(virt_sid << 1 | 0x1, REG_VINTF(vintf, SID_MATCH(sidx)));
+	dev_dbg(vintf->cmdqv->dev,
+		"VINTF%u: allocated SID_REPLACE%d for pSID=%x, vSID=%x\n",
+		vintf->idx, sidx, stream->id, (u32)virt_sid);
+
+	vsid->idx = sidx;
+	vsid->vintf = vintf;
+	vsid->sid = stream->id;
+
+	vdev->destroy = &tegra241_vintf_destroy_vsid;
+	return 0;
+}
+
+static struct iommufd_viommu_ops tegra241_cmdqv_viommu_ops = {
+	.destroy = tegra241_cmdqv_destroy_vintf_user,
+	.alloc_domain_nested = arm_vsmmu_alloc_domain_nested,
+	/* Non-accelerated commands will be still handled by the kernel */
+	.cache_invalidate = arm_vsmmu_cache_invalidate,
+	.vdevice_size = VDEVICE_STRUCT_SIZE(struct tegra241_vintf_sid, core),
+	.vdevice_init = tegra241_vintf_init_vsid,
+	.get_hw_queue_size = tegra241_vintf_get_vcmdq_size,
+	.hw_queue_init_phys = tegra241_vintf_alloc_lvcmdq_user,
+};
+
+static int
+tegra241_cmdqv_init_vintf_user(struct arm_vsmmu *vsmmu,
+			       const struct iommu_user_data *user_data)
+{
+	struct tegra241_cmdqv *cmdqv =
+		container_of(vsmmu->smmu, struct tegra241_cmdqv, smmu);
+	struct tegra241_vintf *vintf = viommu_to_vintf(&vsmmu->core);
+	struct iommu_viommu_tegra241_cmdqv data;
+	phys_addr_t page0_base;
+	int ret;
+
+	if (!user_data)
+		return -EINVAL;
+
+	ret = iommu_copy_struct_from_user(&data, user_data,
+					  IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV,
+					  out_vintf_mmap_length);
+	if (ret)
+		return ret;
+
+	ret = tegra241_cmdqv_init_vintf(cmdqv, cmdqv->num_vintfs - 1, vintf);
+	if (ret < 0) {
+		dev_err(cmdqv->dev, "no more available vintf\n");
+		return ret;
+	}
+
+	/*
+	 * Initialize the user-owned VINTF without a LVCMDQ, as it cannot pre-
+	 * allocate a LVCMDQ until user space wants one, for security reasons.
+	 * It is different than the kernel-owned VINTF0, which had pre-assigned
+	 * and pre-allocated global VCMDQs that would be mapped to the LVCMDQs
+	 * by the tegra241_vintf_hw_init() call.
+	 */
+	ret = tegra241_vintf_hw_init(vintf, false);
+	if (ret)
+		goto deinit_vintf;
+
+	page0_base = cmdqv->base_phys + TEGRA241_VINTFi_PAGE0(vintf->idx);
+	ret = iommufd_viommu_alloc_mmap(&vintf->vsmmu.core, page0_base, SZ_64K,
+					&vintf->mmap_offset);
+	if (ret)
+		goto hw_deinit_vintf;
+
+	data.out_vintf_mmap_length = SZ_64K;
+	data.out_vintf_mmap_offset = vintf->mmap_offset;
+	ret = iommu_copy_struct_to_user(user_data, &data,
+					IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV,
+					out_vintf_mmap_length);
+	if (ret)
+		goto free_mmap;
+
+	ida_init(&vintf->sids);
+	mutex_init(&vintf->lvcmdq_mutex);
+
+	dev_dbg(cmdqv->dev, "VINTF%u: allocated with vmid (%d)\n", vintf->idx,
+		vintf->vsmmu.vmid);
+
+	vsmmu->core.ops = &tegra241_cmdqv_viommu_ops;
+	return 0;
+
+free_mmap:
+	iommufd_viommu_destroy_mmap(&vintf->vsmmu.core, vintf->mmap_offset);
+hw_deinit_vintf:
+	tegra241_vintf_hw_deinit(vintf);
+deinit_vintf:
+	tegra241_cmdqv_deinit_vintf(cmdqv, vintf->idx);
+	return ret;
+}
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 32ee02380912..2fecea1973bc 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -591,6 +591,28 @@ struct iommu_hw_info_arm_smmuv3 {
 	__u32 aidr;
 };
 
+/**
+ * struct iommu_hw_info_tegra241_cmdqv - NVIDIA Tegra241 CMDQV Hardware
+ *         Information (IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV)
+ *
+ * @flags: Must be 0
+ * @version: Version number for the CMDQ-V HW for PARAM bits[03:00]
+ * @log2vcmdqs: Log2 of the total number of VCMDQs for PARAM bits[07:04]
+ * @log2vsids: Log2 of the total number of SID replacements for PARAM bits[15:12]
+ * @__reserved: Must be 0
+ *
+ * VMM can use these fields directly in its emulated global PARAM register. Note
+ * that only one Virtual Interface (VINTF) should be exposed to a VM, i.e. PARAM
+ * bits[11:08] should be set to 0 for log2 of the total number of VINTFs.
+ */
+struct iommu_hw_info_tegra241_cmdqv {
+	__u32 flags;
+	__u8 version;
+	__u8 log2vcmdqs;
+	__u8 log2vsids;
+	__u8 __reserved;
+};
+
 /**
  * enum iommu_hw_info_type - IOMMU Hardware Info Types
  * @IOMMU_HW_INFO_TYPE_NONE: Output by the drivers that do not report hardware
@@ -598,12 +620,15 @@ struct iommu_hw_info_arm_smmuv3 {
  * @IOMMU_HW_INFO_TYPE_DEFAULT: Input to request for a default type
  * @IOMMU_HW_INFO_TYPE_INTEL_VTD: Intel VT-d iommu info type
  * @IOMMU_HW_INFO_TYPE_ARM_SMMUV3: ARM SMMUv3 iommu info type
+ * @IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV (extension for ARM
+ *                                     SMMUv3) info type
  */
 enum iommu_hw_info_type {
 	IOMMU_HW_INFO_TYPE_NONE = 0,
 	IOMMU_HW_INFO_TYPE_DEFAULT = 0,
 	IOMMU_HW_INFO_TYPE_INTEL_VTD = 1,
 	IOMMU_HW_INFO_TYPE_ARM_SMMUV3 = 2,
+	IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV = 3,
 };
 
 /**
@@ -972,10 +997,29 @@ struct iommu_fault_alloc {
  * enum iommu_viommu_type - Virtual IOMMU Type
  * @IOMMU_VIOMMU_TYPE_DEFAULT: Reserved for future use
  * @IOMMU_VIOMMU_TYPE_ARM_SMMUV3: ARM SMMUv3 driver specific type
+ * @IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV (extension for ARM
+ *                                    SMMUv3) enabled ARM SMMUv3 type
  */
 enum iommu_viommu_type {
 	IOMMU_VIOMMU_TYPE_DEFAULT = 0,
 	IOMMU_VIOMMU_TYPE_ARM_SMMUV3 = 1,
+	IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV = 2,
+};
+
+/**
+ * struct iommu_viommu_tegra241_cmdqv - NVIDIA Tegra241 CMDQV Virtual Interface
+ *                                      (IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV)
+ * @out_vintf_mmap_offset: mmap offset argument for VINTF's page0
+ * @out_vintf_mmap_length: mmap length argument for VINTF's page0
+ *
+ * Both @out_vintf_mmap_offset and @out_vintf_mmap_length are reported by kernel
+ * for user space to mmap the VINTF page0 from the host physical address space
+ * to the guest physical address space so that a guest kernel can directly R/W
+ * access to the VINTF page0 in order to control its virtual command queues.
+ */
+struct iommu_viommu_tegra241_cmdqv {
+	__aligned_u64 out_vintf_mmap_offset;
+	__aligned_u64 out_vintf_mmap_length;
 };
 
 /**
@@ -1172,9 +1216,24 @@ struct iommu_veventq_alloc {
 /**
  * enum iommu_hw_queue_type - HW Queue Type
  * @IOMMU_HW_QUEUE_TYPE_DEFAULT: Reserved for future use
+ * @IOMMU_HW_QUEUE_TYPE_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV (extension for ARM
+ *                                      SMMUv3) Virtual Command Queue (VCMDQ)
  */
 enum iommu_hw_queue_type {
 	IOMMU_HW_QUEUE_TYPE_DEFAULT = 0,
+	/*
+	 * TEGRA241_CMDQV requirements (otherwise, allocation will fail)
+	 * - alloc starts from the lowest @index=0 in ascending order
+	 * - destroy starts from the last allocated @index in descending order
+	 * - @base_addr must be aligned to @length in bytes and mapped in IOAS
+	 * - @length must be a power of 2, with a minimum 32 bytes and a maximum
+	 *   2 ^ idr[1].CMDQS * 16 bytes (use GET_HW_INFO call to read idr[1]
+	 *   from struct iommu_hw_info_arm_smmuv3)
+	 * - suggest to back the queue memory with contiguous physical pages or
+	 *   a single huge page with alignment of the queue size, and limit the
+	 *   emulated vSMMU's IDR1.CMDQS to log2(huge page size / 16 bytes)
+	 */
+	IOMMU_HW_QUEUE_TYPE_TEGRA241_CMDQV = 1,
 };
 
 /**
-- 
cgit v1.2.3


From 32b2d3a57e26804ca96d82a222667ac0fa226cb7 Mon Sep 17 00:00:00 2001
From: Nicolin Chen <nicolinc@nvidia.com>
Date: Wed, 9 Jul 2025 22:59:21 -0700
Subject: iommu/tegra241-cmdqv: Add IOMMU_VEVENTQ_TYPE_TEGRA241_CMDQV support

Add a new vEVENTQ type for VINTFs that are assigned to the user space.
Simply report the two 64-bit LVCMDQ_ERR_MAPs register values.

Link: https://patch.msgid.link/r/68161a980da41fa5022841209638aeff258557b5.1752126748.git.nicolinc@nvidia.com
Reviewed-by: Alok Tiwari <alok.a.tiwari@oracle.com>
Reviewed-by: Pranjal Shrivastava <praan@google.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c | 22 ++++++++++++++++++++++
 include/uapi/linux/iommufd.h                   | 15 +++++++++++++++
 2 files changed, 37 insertions(+)

(limited to 'include')

diff --git a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
index 3eeb8444fadf..d5d43a1c7708 100644
--- a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
+++ b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
@@ -295,6 +295,20 @@ static inline int vcmdq_write_config(struct tegra241_vcmdq *vcmdq, u32 regval)
 
 /* ISR Functions */
 
+static void tegra241_vintf_user_handle_error(struct tegra241_vintf *vintf)
+{
+	struct iommufd_viommu *viommu = &vintf->vsmmu.core;
+	struct iommu_vevent_tegra241_cmdqv vevent_data;
+	int i;
+
+	for (i = 0; i < LVCMDQ_ERR_MAP_NUM_64; i++)
+		vevent_data.lvcmdq_err_map[i] =
+			readq_relaxed(REG_VINTF(vintf, LVCMDQ_ERR_MAP_64(i)));
+
+	iommufd_viommu_report_event(viommu, IOMMU_VEVENTQ_TYPE_TEGRA241_CMDQV,
+				    &vevent_data, sizeof(vevent_data));
+}
+
 static void tegra241_vintf0_handle_error(struct tegra241_vintf *vintf)
 {
 	int i;
@@ -340,6 +354,14 @@ static irqreturn_t tegra241_cmdqv_isr(int irq, void *devid)
 		vintf_map &= ~BIT_ULL(0);
 	}
 
+	/* Handle other user VINTFs and their LVCMDQs */
+	while (vintf_map) {
+		unsigned long idx = __ffs64(vintf_map);
+
+		tegra241_vintf_user_handle_error(cmdqv->vintfs[idx]);
+		vintf_map &= ~BIT_ULL(idx);
+	}
+
 	return IRQ_HANDLED;
 }
 
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 2fecea1973bc..554aacf89ea7 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -1146,10 +1146,12 @@ struct iommufd_vevent_header {
  * enum iommu_veventq_type - Virtual Event Queue Type
  * @IOMMU_VEVENTQ_TYPE_DEFAULT: Reserved for future use
  * @IOMMU_VEVENTQ_TYPE_ARM_SMMUV3: ARM SMMUv3 Virtual Event Queue
+ * @IOMMU_VEVENTQ_TYPE_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV Extension IRQ
  */
 enum iommu_veventq_type {
 	IOMMU_VEVENTQ_TYPE_DEFAULT = 0,
 	IOMMU_VEVENTQ_TYPE_ARM_SMMUV3 = 1,
+	IOMMU_VEVENTQ_TYPE_TEGRA241_CMDQV = 2,
 };
 
 /**
@@ -1173,6 +1175,19 @@ struct iommu_vevent_arm_smmuv3 {
 	__aligned_le64 evt[4];
 };
 
+/**
+ * struct iommu_vevent_tegra241_cmdqv - Tegra241 CMDQV IRQ
+ *                                      (IOMMU_VEVENTQ_TYPE_TEGRA241_CMDQV)
+ * @lvcmdq_err_map: 128-bit logical vcmdq error map, little-endian.
+ *                  (Refer to register LVCMDQ_ERR_MAPs per VINTF )
+ *
+ * The 128-bit register value from HW exclusively reflect the error bits for a
+ * Virtual Interface represented by a vIOMMU object. Read and report directly.
+ */
+struct iommu_vevent_tegra241_cmdqv {
+	__aligned_le64 lvcmdq_err_map[2];
+};
+
 /**
  * struct iommu_veventq_alloc - ioctl(IOMMU_VEVENTQ_ALLOC)
  * @size: sizeof(struct iommu_veventq_alloc)
-- 
cgit v1.2.3


From b725441f02c2b31c04a95d0e9ca5420fa029a767 Mon Sep 17 00:00:00 2001
From: Tao Chen <chen.dylane@linux.dev>
Date: Thu, 10 Jul 2025 11:20:32 +0800
Subject: bpf: Add attach_type field to bpf_link

Attach_type will be set when a link is created by user. It is better to
record attach_type in bpf_link generically and have it available
universally for all link types. So add the attach_type field in bpf_link
and move the sleepable field to avoid unnecessary gap padding.

Signed-off-by: Tao Chen <chen.dylane@linux.dev>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/bpf/20250710032038.888700-2-chen.dylane@linux.dev
---
 drivers/net/netkit.c           |  2 +-
 include/linux/bpf.h            | 28 +++++++++++++++++-----------
 kernel/bpf/bpf_iter.c          |  3 ++-
 kernel/bpf/bpf_struct_ops.c    |  5 +++--
 kernel/bpf/cgroup.c            |  4 ++--
 kernel/bpf/net_namespace.c     |  2 +-
 kernel/bpf/syscall.c           | 35 ++++++++++++++++++++++-------------
 kernel/bpf/tcx.c               |  3 ++-
 kernel/bpf/trampoline.c        | 10 ++++++----
 kernel/trace/bpf_trace.c       |  4 ++--
 net/bpf/bpf_dummy_struct_ops.c |  3 ++-
 net/core/dev.c                 |  3 ++-
 net/core/sock_map.c            |  3 ++-
 net/netfilter/nf_bpf_link.c    |  3 ++-
 14 files changed, 66 insertions(+), 42 deletions(-)

(limited to 'include')

diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c
index d072a7968f56..5928c99eac73 100644
--- a/drivers/net/netkit.c
+++ b/drivers/net/netkit.c
@@ -775,7 +775,7 @@ static int netkit_link_init(struct netkit_link *nkl,
 			    struct bpf_prog *prog)
 {
 	bpf_link_init(&nkl->link, BPF_LINK_TYPE_NETKIT,
-		      &netkit_link_lops, prog);
+		      &netkit_link_lops, prog, attr->link_create.attach_type);
 	nkl->location = attr->link_create.attach_type;
 	nkl->dev = dev;
 	return bpf_link_prime(&nkl->link, link_primer);
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 34dd90ec7fad..a9ee9c14b486 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1729,12 +1729,10 @@ struct bpf_link {
 	enum bpf_link_type type;
 	const struct bpf_link_ops *ops;
 	struct bpf_prog *prog;
-	/* whether BPF link itself has "sleepable" semantics, which can differ
-	 * from underlying BPF program having a "sleepable" semantics, as BPF
-	 * link's semantics is determined by target attach hook
-	 */
-	bool sleepable;
+
 	u32 flags;
+	enum bpf_attach_type attach_type;
+
 	/* rcu is used before freeing, work can be used to schedule that
 	 * RCU-based freeing before that, so they never overlap
 	 */
@@ -1742,6 +1740,11 @@ struct bpf_link {
 		struct rcu_head rcu;
 		struct work_struct work;
 	};
+	/* whether BPF link itself has "sleepable" semantics, which can differ
+	 * from underlying BPF program having a "sleepable" semantics, as BPF
+	 * link's semantics is determined by target attach hook
+	 */
+	bool sleepable;
 };
 
 struct bpf_link_ops {
@@ -2034,11 +2037,13 @@ int bpf_prog_ctx_arg_info_init(struct bpf_prog *prog,
 
 #if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM)
 int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog,
-				    int cgroup_atype);
+				    int cgroup_atype,
+				    enum bpf_attach_type attach_type);
 void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog);
 #else
 static inline int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog,
-						  int cgroup_atype)
+						  int cgroup_atype,
+						  enum bpf_attach_type attach_type)
 {
 	return -EOPNOTSUPP;
 }
@@ -2528,10 +2533,11 @@ int bpf_map_new_fd(struct bpf_map *map, int flags);
 int bpf_prog_new_fd(struct bpf_prog *prog);
 
 void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
-		   const struct bpf_link_ops *ops, struct bpf_prog *prog);
+		   const struct bpf_link_ops *ops, struct bpf_prog *prog,
+		   enum bpf_attach_type attach_type);
 void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type,
 			     const struct bpf_link_ops *ops, struct bpf_prog *prog,
-			     bool sleepable);
+			     enum bpf_attach_type attach_type, bool sleepable);
 int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer);
 int bpf_link_settle(struct bpf_link_primer *primer);
 void bpf_link_cleanup(struct bpf_link_primer *primer);
@@ -2883,13 +2889,13 @@ bpf_prog_inc_not_zero(struct bpf_prog *prog)
 
 static inline void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
 				 const struct bpf_link_ops *ops,
-				 struct bpf_prog *prog)
+				 struct bpf_prog *prog, enum bpf_attach_type attach_type)
 {
 }
 
 static inline void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type,
 					   const struct bpf_link_ops *ops, struct bpf_prog *prog,
-					   bool sleepable)
+					   enum bpf_attach_type attach_type, bool sleepable)
 {
 }
 
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 303ab1f42d3a..0cbcae727079 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -552,7 +552,8 @@ int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
 	if (!link)
 		return -ENOMEM;
 
-	bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog);
+	bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog,
+		      attr->link_create.attach_type);
 	link->tinfo = tinfo;
 
 	err = bpf_link_prime(&link->link, &link_primer);
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 96113633e391..687a3e9c76f5 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -808,7 +808,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 			goto reset_unlock;
 		}
 		bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS,
-			      &bpf_struct_ops_link_lops, prog);
+			      &bpf_struct_ops_link_lops, prog, prog->expected_attach_type);
 		*plink++ = &link->link;
 
 		ksym = kzalloc(sizeof(*ksym), GFP_USER);
@@ -1351,7 +1351,8 @@ int bpf_struct_ops_link_create(union bpf_attr *attr)
 		err = -ENOMEM;
 		goto err_out;
 	}
-	bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL);
+	bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL,
+		      attr->link_create.attach_type);
 
 	err = bpf_link_prime(&link->link, &link_primer);
 	if (err)
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index cd220e861d67..bacdd0ca7419 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -867,7 +867,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
 	cgrp->bpf.flags[atype] = saved_flags;
 
 	if (type == BPF_LSM_CGROUP) {
-		err = bpf_trampoline_link_cgroup_shim(new_prog, atype);
+		err = bpf_trampoline_link_cgroup_shim(new_prog, atype, type);
 		if (err)
 			goto cleanup;
 	}
@@ -1495,7 +1495,7 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 		goto out_put_cgroup;
 	}
 	bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops,
-		      prog);
+		      prog, attr->link_create.attach_type);
 	link->cgroup = cgrp;
 	link->type = attr->link_create.attach_type;
 
diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c
index 868cc2c43899..63702c862757 100644
--- a/kernel/bpf/net_namespace.c
+++ b/kernel/bpf/net_namespace.c
@@ -501,7 +501,7 @@ int netns_bpf_link_create(const union bpf_attr *attr, struct bpf_prog *prog)
 		goto out_put_net;
 	}
 	bpf_link_init(&net_link->link, BPF_LINK_TYPE_NETNS,
-		      &bpf_netns_link_ops, prog);
+		      &bpf_netns_link_ops, prog, type);
 	net_link->net = net;
 	net_link->type = type;
 	net_link->netns_type = netns_type;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3f36bfe13266..cd7321fe0ba3 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3069,7 +3069,7 @@ static int bpf_obj_get(const union bpf_attr *attr)
  */
 void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type,
 			     const struct bpf_link_ops *ops, struct bpf_prog *prog,
-			     bool sleepable)
+			     enum bpf_attach_type attach_type, bool sleepable)
 {
 	WARN_ON(ops->dealloc && ops->dealloc_deferred);
 	atomic64_set(&link->refcnt, 1);
@@ -3078,12 +3078,14 @@ void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type,
 	link->id = 0;
 	link->ops = ops;
 	link->prog = prog;
+	link->attach_type = attach_type;
 }
 
 void bpf_link_init(struct bpf_link *link, enum bpf_link_type type,
-		   const struct bpf_link_ops *ops, struct bpf_prog *prog)
+		   const struct bpf_link_ops *ops, struct bpf_prog *prog,
+		   enum bpf_attach_type attach_type)
 {
-	bpf_link_init_sleepable(link, type, ops, prog, false);
+	bpf_link_init_sleepable(link, type, ops, prog, attach_type, false);
 }
 
 static void bpf_link_free_id(int id)
@@ -3443,7 +3445,8 @@ static const struct bpf_link_ops bpf_tracing_link_lops = {
 static int bpf_tracing_prog_attach(struct bpf_prog *prog,
 				   int tgt_prog_fd,
 				   u32 btf_id,
-				   u64 bpf_cookie)
+				   u64 bpf_cookie,
+				   enum bpf_attach_type attach_type)
 {
 	struct bpf_link_primer link_primer;
 	struct bpf_prog *tgt_prog = NULL;
@@ -3511,7 +3514,8 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
 		goto out_put_prog;
 	}
 	bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING,
-		      &bpf_tracing_link_lops, prog);
+		      &bpf_tracing_link_lops, prog, attach_type);
+
 	link->attach_type = prog->expected_attach_type;
 	link->link.cookie = bpf_cookie;
 
@@ -4049,7 +4053,8 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro
 		err = -ENOMEM;
 		goto out_put_file;
 	}
-	bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog);
+	bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog,
+		      attr->link_create.attach_type);
 	link->perf_file = perf_file;
 
 	err = bpf_link_prime(&link->link, &link_primer);
@@ -4081,7 +4086,8 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro
 #endif /* CONFIG_PERF_EVENTS */
 
 static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
-				  const char __user *user_tp_name, u64 cookie)
+				  const char __user *user_tp_name, u64 cookie,
+				  enum bpf_attach_type attach_type)
 {
 	struct bpf_link_primer link_primer;
 	struct bpf_raw_tp_link *link;
@@ -4104,7 +4110,7 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
 			tp_name = prog->aux->attach_func_name;
 			break;
 		}
-		return bpf_tracing_prog_attach(prog, 0, 0, 0);
+		return bpf_tracing_prog_attach(prog, 0, 0, 0, attach_type);
 	case BPF_PROG_TYPE_RAW_TRACEPOINT:
 	case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
 		if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0)
@@ -4126,7 +4132,7 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
 		goto out_put_btp;
 	}
 	bpf_link_init_sleepable(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT,
-				&bpf_raw_tp_link_lops, prog,
+				&bpf_raw_tp_link_lops, prog, attach_type,
 				tracepoint_is_faultable(btp->tp));
 	link->btp = btp;
 	link->cookie = cookie;
@@ -4168,7 +4174,7 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
 
 	tp_name = u64_to_user_ptr(attr->raw_tracepoint.name);
 	cookie = attr->raw_tracepoint.cookie;
-	fd = bpf_raw_tp_link_attach(prog, tp_name, cookie);
+	fd = bpf_raw_tp_link_attach(prog, tp_name, cookie, prog->expected_attach_type);
 	if (fd < 0)
 		bpf_prog_put(prog);
 	return fd;
@@ -5525,7 +5531,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 		ret = bpf_tracing_prog_attach(prog,
 					      attr->link_create.target_fd,
 					      attr->link_create.target_btf_id,
-					      attr->link_create.tracing.cookie);
+					      attr->link_create.tracing.cookie,
+					      attr->link_create.attach_type);
 		break;
 	case BPF_PROG_TYPE_LSM:
 	case BPF_PROG_TYPE_TRACING:
@@ -5534,7 +5541,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 			goto out;
 		}
 		if (prog->expected_attach_type == BPF_TRACE_RAW_TP)
-			ret = bpf_raw_tp_link_attach(prog, NULL, attr->link_create.tracing.cookie);
+			ret = bpf_raw_tp_link_attach(prog, NULL, attr->link_create.tracing.cookie,
+						     attr->link_create.attach_type);
 		else if (prog->expected_attach_type == BPF_TRACE_ITER)
 			ret = bpf_iter_link_attach(attr, uattr, prog);
 		else if (prog->expected_attach_type == BPF_LSM_CGROUP)
@@ -5543,7 +5551,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 			ret = bpf_tracing_prog_attach(prog,
 						      attr->link_create.target_fd,
 						      attr->link_create.target_btf_id,
-						      attr->link_create.tracing.cookie);
+						      attr->link_create.tracing.cookie,
+						      attr->link_create.attach_type);
 		break;
 	case BPF_PROG_TYPE_FLOW_DISSECTOR:
 	case BPF_PROG_TYPE_SK_LOOKUP:
diff --git a/kernel/bpf/tcx.c b/kernel/bpf/tcx.c
index 2e4885e7781f..e6a14f408d94 100644
--- a/kernel/bpf/tcx.c
+++ b/kernel/bpf/tcx.c
@@ -301,7 +301,8 @@ static int tcx_link_init(struct tcx_link *tcx,
 			 struct net_device *dev,
 			 struct bpf_prog *prog)
 {
-	bpf_link_init(&tcx->link, BPF_LINK_TYPE_TCX, &tcx_link_lops, prog);
+	bpf_link_init(&tcx->link, BPF_LINK_TYPE_TCX, &tcx_link_lops, prog,
+		      attr->link_create.attach_type);
 	tcx->location = attr->link_create.attach_type;
 	tcx->dev = dev;
 	return bpf_link_prime(&tcx->link, link_primer);
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index b1e358c16eeb..0e364614c3a2 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -674,7 +674,8 @@ static const struct bpf_link_ops bpf_shim_tramp_link_lops = {
 
 static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog,
 						     bpf_func_t bpf_func,
-						     int cgroup_atype)
+						     int cgroup_atype,
+						     enum bpf_attach_type attach_type)
 {
 	struct bpf_shim_tramp_link *shim_link = NULL;
 	struct bpf_prog *p;
@@ -701,7 +702,7 @@ static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog
 	p->expected_attach_type = BPF_LSM_MAC;
 	bpf_prog_inc(p);
 	bpf_link_init(&shim_link->link.link, BPF_LINK_TYPE_UNSPEC,
-		      &bpf_shim_tramp_link_lops, p);
+		      &bpf_shim_tramp_link_lops, p, attach_type);
 	bpf_cgroup_atype_get(p->aux->attach_btf_id, cgroup_atype);
 
 	return shim_link;
@@ -726,7 +727,8 @@ static struct bpf_shim_tramp_link *cgroup_shim_find(struct bpf_trampoline *tr,
 }
 
 int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog,
-				    int cgroup_atype)
+				    int cgroup_atype,
+				    enum bpf_attach_type attach_type)
 {
 	struct bpf_shim_tramp_link *shim_link = NULL;
 	struct bpf_attach_target_info tgt_info = {};
@@ -763,7 +765,7 @@ int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog,
 
 	/* Allocate and install new shim. */
 
-	shim_link = cgroup_shim_alloc(prog, bpf_func, cgroup_atype);
+	shim_link = cgroup_shim_alloc(prog, bpf_func, cgroup_atype, attach_type);
 	if (!shim_link) {
 		err = -ENOMEM;
 		goto err;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index e7f97a9a8bbd..ffdde840abb8 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2986,7 +2986,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 	}
 
 	bpf_link_init(&link->link, BPF_LINK_TYPE_KPROBE_MULTI,
-		      &bpf_kprobe_multi_link_lops, prog);
+		      &bpf_kprobe_multi_link_lops, prog, attr->link_create.attach_type);
 
 	err = bpf_link_prime(&link->link, &link_primer);
 	if (err)
@@ -3441,7 +3441,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 	link->link.flags = flags;
 
 	bpf_link_init(&link->link, BPF_LINK_TYPE_UPROBE_MULTI,
-		      &bpf_uprobe_multi_link_lops, prog);
+		      &bpf_uprobe_multi_link_lops, prog, attr->link_create.attach_type);
 
 	for (i = 0; i < cnt; i++) {
 		uprobes[i].uprobe = uprobe_register(d_real_inode(link->path.dentry),
diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c
index f71f67c6896b..812457819b5a 100644
--- a/net/bpf/bpf_dummy_struct_ops.c
+++ b/net/bpf/bpf_dummy_struct_ops.c
@@ -171,7 +171,8 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
 	}
 	/* prog doesn't take the ownership of the reference from caller */
 	bpf_prog_inc(prog);
-	bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_link_lops, prog);
+	bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_link_lops, prog,
+		      prog->expected_attach_type);
 
 	op_idx = prog->expected_attach_type;
 	err = bpf_struct_ops_prepare_trampoline(tlinks, link,
diff --git a/net/core/dev.c b/net/core/dev.c
index be97c440ecd5..7969fddc94e3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -10364,7 +10364,8 @@ int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 		goto unlock;
 	}
 
-	bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog);
+	bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog,
+		      attr->link_create.attach_type);
 	link->dev = dev;
 	link->flags = attr->link_create.flags;
 
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 82a14f131d00..fbe9a33ddf18 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -1866,7 +1866,8 @@ int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog)
 	}
 
 	attach_type = attr->link_create.attach_type;
-	bpf_link_init(&sockmap_link->link, BPF_LINK_TYPE_SOCKMAP, &sock_map_link_ops, prog);
+	bpf_link_init(&sockmap_link->link, BPF_LINK_TYPE_SOCKMAP, &sock_map_link_ops, prog,
+		      attach_type);
 	sockmap_link->map = map;
 	sockmap_link->attach_type = attach_type;
 
diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c
index 06b084844700..a054d3b216d8 100644
--- a/net/netfilter/nf_bpf_link.c
+++ b/net/netfilter/nf_bpf_link.c
@@ -225,7 +225,8 @@ int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 	if (!link)
 		return -ENOMEM;
 
-	bpf_link_init(&link->link, BPF_LINK_TYPE_NETFILTER, &bpf_nf_link_lops, prog);
+	bpf_link_init(&link->link, BPF_LINK_TYPE_NETFILTER, &bpf_nf_link_lops, prog,
+		      attr->link_create.attach_type);
 
 	link->hook_ops.hook = nf_hook_run_bpf;
 	link->hook_ops.hook_ops_type = NF_HOOK_OP_BPF;
-- 
cgit v1.2.3


From 9b8d543dc2bbf5d3a1e2d60049df94ae2bc68b28 Mon Sep 17 00:00:00 2001
From: Tao Chen <chen.dylane@linux.dev>
Date: Thu, 10 Jul 2025 11:20:33 +0800
Subject: bpf: Remove attach_type in bpf_cgroup_link

Use attach_type in bpf_link, and remove it in bpf_cgroup_link.

Signed-off-by: Tao Chen <chen.dylane@linux.dev>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/bpf/20250710032038.888700-3-chen.dylane@linux.dev
---
 include/linux/bpf-cgroup.h |  1 -
 kernel/bpf/cgroup.c        | 13 ++++++-------
 2 files changed, 6 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 70c8b94e797a..082ccd8ad96b 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -103,7 +103,6 @@ struct bpf_cgroup_storage {
 struct bpf_cgroup_link {
 	struct bpf_link link;
 	struct cgroup *cgroup;
-	enum bpf_attach_type type;
 };
 
 struct bpf_prog_list {
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index bacdd0ca7419..72c8b50dca0a 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -984,7 +984,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp,
 	struct hlist_head *progs;
 	bool found = false;
 
-	atype = bpf_cgroup_atype_find(link->type, new_prog->aux->attach_btf_id);
+	atype = bpf_cgroup_atype_find(link->link.attach_type, new_prog->aux->attach_btf_id);
 	if (atype < 0)
 		return -EINVAL;
 
@@ -1396,8 +1396,8 @@ static void bpf_cgroup_link_release(struct bpf_link *link)
 	}
 
 	WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link,
-				    cg_link->type, 0));
-	if (cg_link->type == BPF_LSM_CGROUP)
+				    link->attach_type, 0));
+	if (link->attach_type == BPF_LSM_CGROUP)
 		bpf_trampoline_unlink_cgroup_shim(cg_link->link.prog);
 
 	cg = cg_link->cgroup;
@@ -1439,7 +1439,7 @@ static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link,
 		   "cgroup_id:\t%llu\n"
 		   "attach_type:\t%d\n",
 		   cg_id,
-		   cg_link->type);
+		   link->attach_type);
 }
 
 static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link,
@@ -1455,7 +1455,7 @@ static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link,
 	cgroup_unlock();
 
 	info->cgroup.cgroup_id = cg_id;
-	info->cgroup.attach_type = cg_link->type;
+	info->cgroup.attach_type = link->attach_type;
 	return 0;
 }
 
@@ -1497,7 +1497,6 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 	bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops,
 		      prog, attr->link_create.attach_type);
 	link->cgroup = cgrp;
-	link->type = attr->link_create.attach_type;
 
 	err = bpf_link_prime(&link->link, &link_primer);
 	if (err) {
@@ -1506,7 +1505,7 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 	}
 
 	err = cgroup_bpf_attach(cgrp, NULL, NULL, link,
-				link->type, BPF_F_ALLOW_MULTI | attr->link_create.flags,
+				link->link.attach_type, BPF_F_ALLOW_MULTI | attr->link_create.flags,
 				attr->link_create.cgroup.relative_fd,
 				attr->link_create.cgroup.expected_revision);
 	if (err) {
-- 
cgit v1.2.3


From 6e816e1c052b453a93aeb8b57ede9acde58c458d Mon Sep 17 00:00:00 2001
From: Tao Chen <chen.dylane@linux.dev>
Date: Thu, 10 Jul 2025 11:20:35 +0800
Subject: bpf: Remove location field in tcx_link

Use attach_type in bpf_link to replace the location field, and
remove location field in tcx_link.

Signed-off-by: Tao Chen <chen.dylane@linux.dev>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20250710032038.888700-5-chen.dylane@linux.dev
---
 include/net/tcx.h |  1 -
 kernel/bpf/tcx.c  | 13 ++++++-------
 2 files changed, 6 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/tcx.h b/include/net/tcx.h
index 5ce0ce9e0c02..23a61af13547 100644
--- a/include/net/tcx.h
+++ b/include/net/tcx.h
@@ -20,7 +20,6 @@ struct tcx_entry {
 struct tcx_link {
 	struct bpf_link link;
 	struct net_device *dev;
-	u32 location;
 };
 
 static inline void tcx_set_ingress(struct sk_buff *skb, bool ingress)
diff --git a/kernel/bpf/tcx.c b/kernel/bpf/tcx.c
index e6a14f408d94..efd987ea6872 100644
--- a/kernel/bpf/tcx.c
+++ b/kernel/bpf/tcx.c
@@ -142,7 +142,7 @@ static int tcx_link_prog_attach(struct bpf_link *link, u32 flags, u32 id_or_fd,
 				u64 revision)
 {
 	struct tcx_link *tcx = tcx_link(link);
-	bool created, ingress = tcx->location == BPF_TCX_INGRESS;
+	bool created, ingress = link->attach_type == BPF_TCX_INGRESS;
 	struct bpf_mprog_entry *entry, *entry_new;
 	struct net_device *dev = tcx->dev;
 	int ret;
@@ -169,7 +169,7 @@ static int tcx_link_prog_attach(struct bpf_link *link, u32 flags, u32 id_or_fd,
 static void tcx_link_release(struct bpf_link *link)
 {
 	struct tcx_link *tcx = tcx_link(link);
-	bool ingress = tcx->location == BPF_TCX_INGRESS;
+	bool ingress = link->attach_type == BPF_TCX_INGRESS;
 	struct bpf_mprog_entry *entry, *entry_new;
 	struct net_device *dev;
 	int ret = 0;
@@ -204,7 +204,7 @@ static int tcx_link_update(struct bpf_link *link, struct bpf_prog *nprog,
 			   struct bpf_prog *oprog)
 {
 	struct tcx_link *tcx = tcx_link(link);
-	bool ingress = tcx->location == BPF_TCX_INGRESS;
+	bool ingress = link->attach_type == BPF_TCX_INGRESS;
 	struct bpf_mprog_entry *entry, *entry_new;
 	struct net_device *dev;
 	int ret = 0;
@@ -260,8 +260,8 @@ static void tcx_link_fdinfo(const struct bpf_link *link, struct seq_file *seq)
 
 	seq_printf(seq, "ifindex:\t%u\n", ifindex);
 	seq_printf(seq, "attach_type:\t%u (%s)\n",
-		   tcx->location,
-		   tcx->location == BPF_TCX_INGRESS ? "ingress" : "egress");
+		   link->attach_type,
+		   link->attach_type == BPF_TCX_INGRESS ? "ingress" : "egress");
 }
 
 static int tcx_link_fill_info(const struct bpf_link *link,
@@ -276,7 +276,7 @@ static int tcx_link_fill_info(const struct bpf_link *link,
 	rtnl_unlock();
 
 	info->tcx.ifindex = ifindex;
-	info->tcx.attach_type = tcx->location;
+	info->tcx.attach_type = link->attach_type;
 	return 0;
 }
 
@@ -303,7 +303,6 @@ static int tcx_link_init(struct tcx_link *tcx,
 {
 	bpf_link_init(&tcx->link, BPF_LINK_TYPE_TCX, &tcx_link_lops, prog,
 		      attr->link_create.attach_type);
-	tcx->location = attr->link_create.attach_type;
 	tcx->dev = dev;
 	return bpf_link_prime(&tcx->link, link_primer);
 }
-- 
cgit v1.2.3


From 0eeeebdcc5feeec48118f7a3df2ac818e694ccc7 Mon Sep 17 00:00:00 2001
From: Tao Chen <chen.dylane@linux.dev>
Date: Thu, 10 Jul 2025 11:20:37 +0800
Subject: bpf: Remove attach_type in bpf_tracing_link

Use attach_type in bpf_link, and remove it in bpf_tracing_link.

Signed-off-by: Tao Chen <chen.dylane@linux.dev>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/bpf/20250710032038.888700-7-chen.dylane@linux.dev
---
 include/linux/bpf.h  | 1 -
 kernel/bpf/syscall.c | 5 ++---
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a9ee9c14b486..bc887831eaa5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1784,7 +1784,6 @@ struct bpf_shim_tramp_link {
 
 struct bpf_tracing_link {
 	struct bpf_tramp_link link;
-	enum bpf_attach_type attach_type;
 	struct bpf_trampoline *trampoline;
 	struct bpf_prog *tgt_prog;
 };
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cd7321fe0ba3..1a26d17536be 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3414,7 +3414,7 @@ static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link,
 		   "target_obj_id:\t%u\n"
 		   "target_btf_id:\t%u\n"
 		   "cookie:\t%llu\n",
-		   tr_link->attach_type,
+		   link->attach_type,
 		   target_obj_id,
 		   target_btf_id,
 		   tr_link->link.cookie);
@@ -3426,7 +3426,7 @@ static int bpf_tracing_link_fill_link_info(const struct bpf_link *link,
 	struct bpf_tracing_link *tr_link =
 		container_of(link, struct bpf_tracing_link, link.link);
 
-	info->tracing.attach_type = tr_link->attach_type;
+	info->tracing.attach_type = link->attach_type;
 	info->tracing.cookie = tr_link->link.cookie;
 	bpf_trampoline_unpack_key(tr_link->trampoline->key,
 				  &info->tracing.target_obj_id,
@@ -3516,7 +3516,6 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
 	bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING,
 		      &bpf_tracing_link_lops, prog, attach_type);
 
-	link->attach_type = prog->expected_attach_type;
 	link->link.cookie = bpf_cookie;
 
 	mutex_lock(&prog->aux->dst_mutex);
-- 
cgit v1.2.3


From daec29dcc8731b7596690ab9f647839e4584a86d Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Tue, 17 Jun 2025 19:08:14 +0200
Subject: locking/mutex: Mark devm_mutex_init() as __must_check
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

devm_mutex_init() can fail. With CONFIG_DEBUG_MUTEXES=y the mutex will be
marked as unusable and trigger errors on usage.
Have the compiler enforce that all callers check the return value.

As devm_mutex_init() itself is a macro, it cannot be annotated
directly. Annotate __devm_mutex_init() instead.
Unfortunately __must_check/warn_unused_result don't propagate through
statement expressions. So move the statement expression into the argument
list of the call to __devm_mutex_init() through a helper macro.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Link: https://lore.kernel.org/r/20250617-must_check-devm_mutex_init-v7-3-d9e449f4d224@weissschuh.net
---
 include/linux/mutex.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index a039fa8c1780..00afd341d293 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -126,11 +126,11 @@ do {							\
 
 #ifdef CONFIG_DEBUG_MUTEXES
 
-int __devm_mutex_init(struct device *dev, struct mutex *lock);
+int __must_check __devm_mutex_init(struct device *dev, struct mutex *lock);
 
 #else
 
-static inline int __devm_mutex_init(struct device *dev, struct mutex *lock)
+static inline int __must_check __devm_mutex_init(struct device *dev, struct mutex *lock)
 {
 	/*
 	 * When CONFIG_DEBUG_MUTEXES is off mutex_destroy() is just a nop so
@@ -141,14 +141,17 @@ static inline int __devm_mutex_init(struct device *dev, struct mutex *lock)
 
 #endif
 
-#define devm_mutex_init(dev, mutex)			\
+#define __mutex_init_ret(mutex)				\
 ({							\
 	typeof(mutex) mutex_ = (mutex);			\
 							\
 	mutex_init(mutex_);				\
-	__devm_mutex_init(dev, mutex_);			\
+	mutex_;						\
 })
 
+#define devm_mutex_init(dev, mutex) \
+	__devm_mutex_init(dev, __mutex_init_ret(mutex))
+
 /*
  * See kernel/locking/mutex.c for detailed documentation of these APIs.
  * Also see Documentation/locking/mutex-design.rst.
-- 
cgit v1.2.3


From 30dbb2d0e16fce445581049ebcd9043837a843ac Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 9 Jul 2025 09:01:53 +0000
Subject: net_sched: act: annotate data-races in tcf_lastuse_update() and
 tcf_tm_dump()

tcf_tm_dump() reads fields that can be changed concurrently,
and tcf_lastuse_update() might race against itself.

Add READ_ONCE() and WRITE_ONCE() annotations.

Fetch jiffies once in tcf_tm_dump().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250709090204.797558-2-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/act_api.h | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 04781c92b43d..2894cfff2da3 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -76,19 +76,24 @@ static inline void tcf_lastuse_update(struct tcf_t *tm)
 {
 	unsigned long now = jiffies;
 
-	if (tm->lastuse != now)
-		tm->lastuse = now;
-	if (unlikely(!tm->firstuse))
-		tm->firstuse = now;
+	if (READ_ONCE(tm->lastuse) != now)
+		WRITE_ONCE(tm->lastuse, now);
+	if (unlikely(!READ_ONCE(tm->firstuse)))
+		WRITE_ONCE(tm->firstuse, now);
 }
 
 static inline void tcf_tm_dump(struct tcf_t *dtm, const struct tcf_t *stm)
 {
-	dtm->install = jiffies_to_clock_t(jiffies - stm->install);
-	dtm->lastuse = jiffies_to_clock_t(jiffies - stm->lastuse);
-	dtm->firstuse = stm->firstuse ?
-		jiffies_to_clock_t(jiffies - stm->firstuse) : 0;
-	dtm->expires = jiffies_to_clock_t(stm->expires);
+	unsigned long firstuse, now = jiffies;
+
+	dtm->install = jiffies_to_clock_t(now - READ_ONCE(stm->install));
+	dtm->lastuse = jiffies_to_clock_t(now - READ_ONCE(stm->lastuse));
+
+	firstuse = READ_ONCE(stm->firstuse);
+	dtm->firstuse = firstuse ?
+		jiffies_to_clock_t(now - firstuse) : 0;
+
+	dtm->expires = jiffies_to_clock_t(READ_ONCE(stm->expires));
 }
 
 static inline enum flow_action_hw_stats tc_act_hw_stats(u8 hw_stats)
-- 
cgit v1.2.3


From 0d752877705c0252ef2726e4c63c5573f048951c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 9 Jul 2025 09:01:54 +0000
Subject: net_sched: act_connmark: use RCU in tcf_connmark_dump()

Also storing tcf_action into struct tcf_connmark_parms
makes sure there is no discrepancy in tcf_connmark_act().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250709090204.797558-3-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tc_act/tc_connmark.h |  1 +
 net/sched/act_connmark.c         | 18 ++++++++++--------
 2 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/tc_act/tc_connmark.h b/include/net/tc_act/tc_connmark.h
index e8dd77a96748..a5ce83f3eea4 100644
--- a/include/net/tc_act/tc_connmark.h
+++ b/include/net/tc_act/tc_connmark.h
@@ -7,6 +7,7 @@
 struct tcf_connmark_parms {
 	struct net *net;
 	u16 zone;
+	int action;
 	struct rcu_head rcu;
 };
 
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index 0fce631e7c91..3e89927d7116 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -88,7 +88,7 @@ count:
 	/* using overlimits stats to count how many packets marked */
 	tcf_action_inc_overlimit_qstats(&ca->common);
 out:
-	return READ_ONCE(ca->tcf_action);
+	return parms->action;
 }
 
 static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = {
@@ -167,6 +167,8 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
 	if (err < 0)
 		goto release_idr;
 
+	nparms->action = parm->action;
+
 	spin_lock_bh(&ci->tcf_lock);
 	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
 	oparms = rcu_replace_pointer(ci->parms, nparms, lockdep_is_held(&ci->tcf_lock));
@@ -190,20 +192,20 @@ out_free:
 static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a,
 				    int bind, int ref)
 {
+	const struct tcf_connmark_info *ci = to_connmark(a);
 	unsigned char *b = skb_tail_pointer(skb);
-	struct tcf_connmark_info *ci = to_connmark(a);
+	const struct tcf_connmark_parms *parms;
 	struct tc_connmark opt = {
 		.index   = ci->tcf_index,
 		.refcnt  = refcount_read(&ci->tcf_refcnt) - ref,
 		.bindcnt = atomic_read(&ci->tcf_bindcnt) - bind,
 	};
-	struct tcf_connmark_parms *parms;
 	struct tcf_t t;
 
-	spin_lock_bh(&ci->tcf_lock);
-	parms = rcu_dereference_protected(ci->parms, lockdep_is_held(&ci->tcf_lock));
+	rcu_read_lock();
+	parms = rcu_dereference(ci->parms);
 
-	opt.action = ci->tcf_action;
+	opt.action = parms->action;
 	opt.zone = parms->zone;
 	if (nla_put(skb, TCA_CONNMARK_PARMS, sizeof(opt), &opt))
 		goto nla_put_failure;
@@ -212,12 +214,12 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a,
 	if (nla_put_64bit(skb, TCA_CONNMARK_TM, sizeof(t), &t,
 			  TCA_CONNMARK_PAD))
 		goto nla_put_failure;
-	spin_unlock_bh(&ci->tcf_lock);
+	rcu_read_unlock();
 
 	return skb->len;
 
 nla_put_failure:
-	spin_unlock_bh(&ci->tcf_lock);
+	rcu_read_unlock();
 	nlmsg_trim(skb, b);
 	return -1;
 }
-- 
cgit v1.2.3


From ba9dc9c14038b5f721e193f9e69ab73fd2f7bdd2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 9 Jul 2025 09:01:55 +0000
Subject: net_sched: act_csum: use RCU in tcf_csum_dump()

Also storing tcf_action into struct tcf_csum_params
makes sure there is no discrepancy in tcf_csum_act().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250709090204.797558-4-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tc_act/tc_csum.h |  1 +
 net/sched/act_csum.c         | 18 +++++++++---------
 2 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/tc_act/tc_csum.h b/include/net/tc_act/tc_csum.h
index 2515da0142a6..8d0c7a9f9345 100644
--- a/include/net/tc_act/tc_csum.h
+++ b/include/net/tc_act/tc_csum.h
@@ -8,6 +8,7 @@
 
 struct tcf_csum_params {
 	u32 update_flags;
+	int action;
 	struct rcu_head rcu;
 };
 
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 5cc8e407e791..0939e6b2ba4d 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -99,6 +99,7 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla,
 		goto put_chain;
 	}
 	params_new->update_flags = parm->update_flags;
+	params_new->action = parm->action;
 
 	spin_lock_bh(&p->tcf_lock);
 	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
@@ -580,7 +581,7 @@ TC_INDIRECT_SCOPE int tcf_csum_act(struct sk_buff *skb,
 	tcf_lastuse_update(&p->tcf_tm);
 	tcf_action_update_bstats(&p->common, skb);
 
-	action = READ_ONCE(p->tcf_action);
+	action = params->action;
 	if (unlikely(action == TC_ACT_SHOT))
 		goto drop;
 
@@ -631,9 +632,9 @@ drop:
 static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind,
 			 int ref)
 {
+	const struct tcf_csum *p = to_tcf_csum(a);
 	unsigned char *b = skb_tail_pointer(skb);
-	struct tcf_csum *p = to_tcf_csum(a);
-	struct tcf_csum_params *params;
+	const struct tcf_csum_params *params;
 	struct tc_csum opt = {
 		.index   = p->tcf_index,
 		.refcnt  = refcount_read(&p->tcf_refcnt) - ref,
@@ -641,10 +642,9 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind,
 	};
 	struct tcf_t t;
 
-	spin_lock_bh(&p->tcf_lock);
-	params = rcu_dereference_protected(p->params,
-					   lockdep_is_held(&p->tcf_lock));
-	opt.action = p->tcf_action;
+	rcu_read_lock();
+	params = rcu_dereference(p->params);
+	opt.action = params->action;
 	opt.update_flags = params->update_flags;
 
 	if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt))
@@ -653,12 +653,12 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind,
 	tcf_tm_dump(&t, &p->tcf_tm);
 	if (nla_put_64bit(skb, TCA_CSUM_TM, sizeof(t), &t, TCA_CSUM_PAD))
 		goto nla_put_failure;
-	spin_unlock_bh(&p->tcf_lock);
+	rcu_read_unlock();
 
 	return skb->len;
 
 nla_put_failure:
-	spin_unlock_bh(&p->tcf_lock);
+	rcu_read_unlock();
 	nlmsg_trim(skb, b);
 	return -1;
 }
-- 
cgit v1.2.3


From 554e66bad84ce4181ad91a2ae9cc74c7c440e836 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 9 Jul 2025 09:01:56 +0000
Subject: net_sched: act_ct: use RCU in tcf_ct_dump()

Also storing tcf_action into struct tcf_ct_params
makes sure there is no discrepancy in tcf_ct_act().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250709090204.797558-5-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tc_act/tc_ct.h |  2 +-
 net/sched/act_ct.c         | 30 +++++++++++++++---------------
 2 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/net/tc_act/tc_ct.h b/include/net/tc_act/tc_ct.h
index e6b45cb27ebf..8b90c86c0b0d 100644
--- a/include/net/tc_act/tc_ct.h
+++ b/include/net/tc_act/tc_ct.h
@@ -13,7 +13,7 @@ struct tcf_ct_params {
 	struct nf_conntrack_helper *helper;
 	struct nf_conn *tmpl;
 	u16 zone;
-
+	int action;
 	u32 mark;
 	u32 mark_mask;
 
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index c02f39efc6ef..6749a4a9a9cd 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -977,7 +977,7 @@ TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
 
 	p = rcu_dereference_bh(c->params);
 
-	retval = READ_ONCE(c->tcf_action);
+	retval = p->action;
 	commit = p->ct_action & TCA_CT_ACT_COMMIT;
 	clear = p->ct_action & TCA_CT_ACT_CLEAR;
 	tmpl = p->tmpl;
@@ -1409,6 +1409,7 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla,
 	if (err)
 		goto cleanup;
 
+	params->action = parm->action;
 	spin_lock_bh(&c->tcf_lock);
 	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
 	params = rcu_replace_pointer(c->params, params,
@@ -1442,8 +1443,8 @@ static void tcf_ct_cleanup(struct tc_action *a)
 }
 
 static int tcf_ct_dump_key_val(struct sk_buff *skb,
-			       void *val, int val_type,
-			       void *mask, int mask_type,
+			       const void *val, int val_type,
+			       const void *mask, int mask_type,
 			       int len)
 {
 	int err;
@@ -1464,9 +1465,9 @@ static int tcf_ct_dump_key_val(struct sk_buff *skb,
 	return 0;
 }
 
-static int tcf_ct_dump_nat(struct sk_buff *skb, struct tcf_ct_params *p)
+static int tcf_ct_dump_nat(struct sk_buff *skb, const struct tcf_ct_params *p)
 {
-	struct nf_nat_range2 *range = &p->range;
+	const struct nf_nat_range2 *range = &p->range;
 
 	if (!(p->ct_action & TCA_CT_ACT_NAT))
 		return 0;
@@ -1504,7 +1505,8 @@ static int tcf_ct_dump_nat(struct sk_buff *skb, struct tcf_ct_params *p)
 	return 0;
 }
 
-static int tcf_ct_dump_helper(struct sk_buff *skb, struct nf_conntrack_helper *helper)
+static int tcf_ct_dump_helper(struct sk_buff *skb,
+			      const struct nf_conntrack_helper *helper)
 {
 	if (!helper)
 		return 0;
@@ -1521,9 +1523,8 @@ static inline int tcf_ct_dump(struct sk_buff *skb, struct tc_action *a,
 			      int bind, int ref)
 {
 	unsigned char *b = skb_tail_pointer(skb);
-	struct tcf_ct *c = to_ct(a);
-	struct tcf_ct_params *p;
-
+	const struct tcf_ct *c = to_ct(a);
+	const struct tcf_ct_params *p;
 	struct tc_ct opt = {
 		.index   = c->tcf_index,
 		.refcnt  = refcount_read(&c->tcf_refcnt) - ref,
@@ -1531,10 +1532,9 @@ static inline int tcf_ct_dump(struct sk_buff *skb, struct tc_action *a,
 	};
 	struct tcf_t t;
 
-	spin_lock_bh(&c->tcf_lock);
-	p = rcu_dereference_protected(c->params,
-				      lockdep_is_held(&c->tcf_lock));
-	opt.action = c->tcf_action;
+	rcu_read_lock();
+	p = rcu_dereference(c->params);
+	opt.action = p->action;
 
 	if (tcf_ct_dump_key_val(skb,
 				&p->ct_action, TCA_CT_ACTION,
@@ -1579,11 +1579,11 @@ skip_dump:
 	tcf_tm_dump(&t, &c->tcf_tm);
 	if (nla_put_64bit(skb, TCA_CT_TM, sizeof(t), &t, TCA_CT_PAD))
 		goto nla_put_failure;
-	spin_unlock_bh(&c->tcf_lock);
+	rcu_read_unlock();
 
 	return skb->len;
 nla_put_failure:
-	spin_unlock_bh(&c->tcf_lock);
+	rcu_read_unlock();
 	nlmsg_trim(skb, b);
 	return -1;
 }
-- 
cgit v1.2.3


From d300335b4e18672913dd792ff9f49e6cccf41d26 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 9 Jul 2025 09:01:57 +0000
Subject: net_sched: act_ctinfo: use atomic64_t for three counters

Commit 21c167aa0ba9 ("net/sched: act_ctinfo: use percpu stats")
missed that stats_dscp_set, stats_dscp_error and stats_cpmark_set
might be written (and read) locklessly.

Use atomic64_t for these three fields; I doubt act_ctinfo is used
heavily on big SMP hosts anyway.

Fixes: 24ec483cec98 ("net: sched: Introduce act_ctinfo action")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Pedro Tammela <pctammela@mojatatu.com>
Link: https://patch.msgid.link/20250709090204.797558-6-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tc_act/tc_ctinfo.h |  6 +++---
 net/sched/act_ctinfo.c         | 19 +++++++++++--------
 2 files changed, 14 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/net/tc_act/tc_ctinfo.h b/include/net/tc_act/tc_ctinfo.h
index f071c1d70a25..a04bcac7adf4 100644
--- a/include/net/tc_act/tc_ctinfo.h
+++ b/include/net/tc_act/tc_ctinfo.h
@@ -18,9 +18,9 @@ struct tcf_ctinfo_params {
 struct tcf_ctinfo {
 	struct tc_action common;
 	struct tcf_ctinfo_params __rcu *params;
-	u64 stats_dscp_set;
-	u64 stats_dscp_error;
-	u64 stats_cpmark_set;
+	atomic64_t stats_dscp_set;
+	atomic64_t stats_dscp_error;
+	atomic64_t stats_cpmark_set;
 };
 
 enum {
diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c
index 5b1241ddc758..93ab3bcd6d31 100644
--- a/net/sched/act_ctinfo.c
+++ b/net/sched/act_ctinfo.c
@@ -44,9 +44,9 @@ static void tcf_ctinfo_dscp_set(struct nf_conn *ct, struct tcf_ctinfo *ca,
 				ipv4_change_dsfield(ip_hdr(skb),
 						    INET_ECN_MASK,
 						    newdscp);
-				ca->stats_dscp_set++;
+				atomic64_inc(&ca->stats_dscp_set);
 			} else {
-				ca->stats_dscp_error++;
+				atomic64_inc(&ca->stats_dscp_error);
 			}
 		}
 		break;
@@ -57,9 +57,9 @@ static void tcf_ctinfo_dscp_set(struct nf_conn *ct, struct tcf_ctinfo *ca,
 				ipv6_change_dsfield(ipv6_hdr(skb),
 						    INET_ECN_MASK,
 						    newdscp);
-				ca->stats_dscp_set++;
+				atomic64_inc(&ca->stats_dscp_set);
 			} else {
-				ca->stats_dscp_error++;
+				atomic64_inc(&ca->stats_dscp_error);
 			}
 		}
 		break;
@@ -72,7 +72,7 @@ static void tcf_ctinfo_cpmark_set(struct nf_conn *ct, struct tcf_ctinfo *ca,
 				  struct tcf_ctinfo_params *cp,
 				  struct sk_buff *skb)
 {
-	ca->stats_cpmark_set++;
+	atomic64_inc(&ca->stats_cpmark_set);
 	skb->mark = READ_ONCE(ct->mark) & cp->cpmarkmask;
 }
 
@@ -323,15 +323,18 @@ static int tcf_ctinfo_dump(struct sk_buff *skb, struct tc_action *a,
 	}
 
 	if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_SET,
-			      ci->stats_dscp_set, TCA_CTINFO_PAD))
+			      atomic64_read(&ci->stats_dscp_set),
+			      TCA_CTINFO_PAD))
 		goto nla_put_failure;
 
 	if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_ERROR,
-			      ci->stats_dscp_error, TCA_CTINFO_PAD))
+			      atomic64_read(&ci->stats_dscp_error),
+			      TCA_CTINFO_PAD))
 		goto nla_put_failure;
 
 	if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_CPMARK_SET,
-			      ci->stats_cpmark_set, TCA_CTINFO_PAD))
+			      atomic64_read(&ci->stats_cpmark_set),
+			      TCA_CTINFO_PAD))
 		goto nla_put_failure;
 
 	spin_unlock_bh(&ci->tcf_lock);
-- 
cgit v1.2.3


From 799c94178cf9c9e80575b05b7479396de8b42b61 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 9 Jul 2025 09:01:58 +0000
Subject: net_sched: act_ctinfo: use RCU in tcf_ctinfo_dump()

Also storing tcf_action into struct tcf_ctinfo_params
makes sure there is no discrepancy in tcf_ctinfo_act().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250709090204.797558-7-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tc_act/tc_ctinfo.h |  1 +
 net/sched/act_ctinfo.c         | 23 +++++++++++------------
 2 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/net/tc_act/tc_ctinfo.h b/include/net/tc_act/tc_ctinfo.h
index a04bcac7adf4..7fe01ab236da 100644
--- a/include/net/tc_act/tc_ctinfo.h
+++ b/include/net/tc_act/tc_ctinfo.h
@@ -7,6 +7,7 @@
 struct tcf_ctinfo_params {
 	struct rcu_head rcu;
 	struct net *net;
+	int action;
 	u32 dscpmask;
 	u32 dscpstatemask;
 	u32 cpmarkmask;
diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c
index 93ab3bcd6d31..71efe04d00b5 100644
--- a/net/sched/act_ctinfo.c
+++ b/net/sched/act_ctinfo.c
@@ -88,13 +88,11 @@ TC_INDIRECT_SCOPE int tcf_ctinfo_act(struct sk_buff *skb,
 	struct tcf_ctinfo_params *cp;
 	struct nf_conn *ct;
 	int proto, wlen;
-	int action;
 
 	cp = rcu_dereference_bh(ca->params);
 
 	tcf_lastuse_update(&ca->tcf_tm);
 	tcf_action_update_bstats(&ca->common, skb);
-	action = READ_ONCE(ca->tcf_action);
 
 	wlen = skb_network_offset(skb);
 	switch (skb_protocol(skb, true)) {
@@ -141,7 +139,7 @@ TC_INDIRECT_SCOPE int tcf_ctinfo_act(struct sk_buff *skb,
 	if (thash)
 		nf_ct_put(ct);
 out:
-	return action;
+	return cp->action;
 }
 
 static const struct nla_policy ctinfo_policy[TCA_CTINFO_MAX + 1] = {
@@ -258,6 +256,8 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla,
 		cp_new->mode |= CTINFO_MODE_CPMARK;
 	}
 
+	cp_new->action = actparm->action;
+
 	spin_lock_bh(&ci->tcf_lock);
 	goto_ch = tcf_action_set_ctrlact(*a, actparm->action, goto_ch);
 	cp_new = rcu_replace_pointer(ci->params, cp_new,
@@ -282,25 +282,24 @@ release_idr:
 static int tcf_ctinfo_dump(struct sk_buff *skb, struct tc_action *a,
 			   int bind, int ref)
 {
-	struct tcf_ctinfo *ci = to_ctinfo(a);
+	const struct tcf_ctinfo *ci = to_ctinfo(a);
+	unsigned char *b = skb_tail_pointer(skb);
+	const struct tcf_ctinfo_params *cp;
 	struct tc_ctinfo opt = {
 		.index   = ci->tcf_index,
 		.refcnt  = refcount_read(&ci->tcf_refcnt) - ref,
 		.bindcnt = atomic_read(&ci->tcf_bindcnt) - bind,
 	};
-	unsigned char *b = skb_tail_pointer(skb);
-	struct tcf_ctinfo_params *cp;
 	struct tcf_t t;
 
-	spin_lock_bh(&ci->tcf_lock);
-	cp = rcu_dereference_protected(ci->params,
-				       lockdep_is_held(&ci->tcf_lock));
+	rcu_read_lock();
+	cp = rcu_dereference(ci->params);
 
 	tcf_tm_dump(&t, &ci->tcf_tm);
 	if (nla_put_64bit(skb, TCA_CTINFO_TM, sizeof(t), &t, TCA_CTINFO_PAD))
 		goto nla_put_failure;
 
-	opt.action = ci->tcf_action;
+	opt.action = cp->action;
 	if (nla_put(skb, TCA_CTINFO_ACT, sizeof(opt), &opt))
 		goto nla_put_failure;
 
@@ -337,11 +336,11 @@ static int tcf_ctinfo_dump(struct sk_buff *skb, struct tc_action *a,
 			      TCA_CTINFO_PAD))
 		goto nla_put_failure;
 
-	spin_unlock_bh(&ci->tcf_lock);
+	rcu_read_unlock();
 	return skb->len;
 
 nla_put_failure:
-	spin_unlock_bh(&ci->tcf_lock);
+	rcu_read_unlock();
 	nlmsg_trim(skb, b);
 	return -1;
 }
-- 
cgit v1.2.3


From 8151684e339996ffe6d65968c5eea154366539f4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 9 Jul 2025 09:01:59 +0000
Subject: net_sched: act_mpls: use RCU in tcf_mpls_dump()

Also storing tcf_action into struct tcf_mpls_params
makes sure there is no discrepancy in tcf_mpls_act().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250709090204.797558-8-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tc_act/tc_mpls.h |  1 +
 net/sched/act_mpls.c         | 21 ++++++++++-----------
 2 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/net/tc_act/tc_mpls.h b/include/net/tc_act/tc_mpls.h
index d452e5e94fd0..dd067bd4018d 100644
--- a/include/net/tc_act/tc_mpls.h
+++ b/include/net/tc_act/tc_mpls.h
@@ -10,6 +10,7 @@
 struct tcf_mpls_params {
 	int tcfm_action;
 	u32 tcfm_label;
+	int action; /* tcf_action */
 	u8 tcfm_tc;
 	u8 tcfm_ttl;
 	u8 tcfm_bos;
diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c
index 9f86f4e666d3..6654011dcd2b 100644
--- a/net/sched/act_mpls.c
+++ b/net/sched/act_mpls.c
@@ -57,7 +57,7 @@ TC_INDIRECT_SCOPE int tcf_mpls_act(struct sk_buff *skb,
 	struct tcf_mpls *m = to_mpls(a);
 	struct tcf_mpls_params *p;
 	__be32 new_lse;
-	int ret, mac_len;
+	int mac_len;
 
 	tcf_lastuse_update(&m->tcf_tm);
 	bstats_update(this_cpu_ptr(m->common.cpu_bstats), skb);
@@ -72,8 +72,6 @@ TC_INDIRECT_SCOPE int tcf_mpls_act(struct sk_buff *skb,
 		mac_len = skb_network_offset(skb);
 	}
 
-	ret = READ_ONCE(m->tcf_action);
-
 	p = rcu_dereference_bh(m->mpls_p);
 
 	switch (p->tcfm_action) {
@@ -122,7 +120,7 @@ TC_INDIRECT_SCOPE int tcf_mpls_act(struct sk_buff *skb,
 	if (skb_at_tc_ingress(skb))
 		skb_pull_rcsum(skb, skb->mac_len);
 
-	return ret;
+	return p->action;
 
 drop:
 	qstats_drop_inc(this_cpu_ptr(m->common.cpu_qstats));
@@ -296,6 +294,7 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla,
 					 ACT_MPLS_BOS_NOT_SET);
 	p->tcfm_proto = nla_get_be16_default(tb[TCA_MPLS_PROTO],
 					     htons(ETH_P_MPLS_UC));
+	p->action = parm->action;
 
 	spin_lock_bh(&m->tcf_lock);
 	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
@@ -330,8 +329,8 @@ static int tcf_mpls_dump(struct sk_buff *skb, struct tc_action *a,
 			 int bind, int ref)
 {
 	unsigned char *b = skb_tail_pointer(skb);
-	struct tcf_mpls *m = to_mpls(a);
-	struct tcf_mpls_params *p;
+	const struct tcf_mpls *m = to_mpls(a);
+	const struct tcf_mpls_params *p;
 	struct tc_mpls opt = {
 		.index    = m->tcf_index,
 		.refcnt   = refcount_read(&m->tcf_refcnt) - ref,
@@ -339,10 +338,10 @@ static int tcf_mpls_dump(struct sk_buff *skb, struct tc_action *a,
 	};
 	struct tcf_t t;
 
-	spin_lock_bh(&m->tcf_lock);
-	opt.action = m->tcf_action;
-	p = rcu_dereference_protected(m->mpls_p, lockdep_is_held(&m->tcf_lock));
+	rcu_read_lock();
+	p = rcu_dereference(m->mpls_p);
 	opt.m_action = p->tcfm_action;
+	opt.action = p->action;
 
 	if (nla_put(skb, TCA_MPLS_PARMS, sizeof(opt), &opt))
 		goto nla_put_failure;
@@ -370,12 +369,12 @@ static int tcf_mpls_dump(struct sk_buff *skb, struct tc_action *a,
 	if (nla_put_64bit(skb, TCA_MPLS_TM, sizeof(t), &t, TCA_MPLS_PAD))
 		goto nla_put_failure;
 
-	spin_unlock_bh(&m->tcf_lock);
+	rcu_read_unlock();
 
 	return skb->len;
 
 nla_put_failure:
-	spin_unlock_bh(&m->tcf_lock);
+	rcu_read_unlock();
 	nlmsg_trim(skb, b);
 	return -EMSGSIZE;
 }
-- 
cgit v1.2.3


From 5d28928668a2ef6182401ddca7ab4064bf349e3e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 9 Jul 2025 09:02:00 +0000
Subject: net_sched: act_nat: use RCU in tcf_nat_dump()

Also storing tcf_action into struct tcf_nat_params
makes sure there is no discrepancy in tcf_nat_act().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250709090204.797558-9-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tc_act/tc_nat.h |  1 +
 net/sched/act_nat.c         | 25 ++++++++++++-------------
 2 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/net/tc_act/tc_nat.h b/include/net/tc_act/tc_nat.h
index c869274ac529..ae35f4009445 100644
--- a/include/net/tc_act/tc_nat.h
+++ b/include/net/tc_act/tc_nat.h
@@ -6,6 +6,7 @@
 #include <net/act_api.h>
 
 struct tcf_nat_parms {
+	int action;
 	__be32 old_addr;
 	__be32 new_addr;
 	__be32 mask;
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index d541f553805f..26241d80ebe0 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -91,6 +91,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
 	nparm->new_addr = parm->new_addr;
 	nparm->mask = parm->mask;
 	nparm->flags = parm->flags;
+	nparm->action = parm->action;
 
 	p = to_tcf_nat(*a);
 
@@ -130,17 +131,16 @@ TC_INDIRECT_SCOPE int tcf_nat_act(struct sk_buff *skb,
 	tcf_lastuse_update(&p->tcf_tm);
 	tcf_action_update_bstats(&p->common, skb);
 
-	action = READ_ONCE(p->tcf_action);
-
 	parms = rcu_dereference_bh(p->parms);
+	action = parms->action;
+	if (unlikely(action == TC_ACT_SHOT))
+		goto drop;
+
 	old_addr = parms->old_addr;
 	new_addr = parms->new_addr;
 	mask = parms->mask;
 	egress = parms->flags & TCA_NAT_FLAG_EGRESS;
 
-	if (unlikely(action == TC_ACT_SHOT))
-		goto drop;
-
 	noff = skb_network_offset(skb);
 	if (!pskb_may_pull(skb, sizeof(*iph) + noff))
 		goto drop;
@@ -268,21 +268,20 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a,
 			int bind, int ref)
 {
 	unsigned char *b = skb_tail_pointer(skb);
-	struct tcf_nat *p = to_tcf_nat(a);
+	const struct tcf_nat *p = to_tcf_nat(a);
+	const struct tcf_nat_parms *parms;
 	struct tc_nat opt = {
 		.index    = p->tcf_index,
 		.refcnt   = refcount_read(&p->tcf_refcnt) - ref,
 		.bindcnt  = atomic_read(&p->tcf_bindcnt) - bind,
 	};
-	struct tcf_nat_parms *parms;
 	struct tcf_t t;
 
-	spin_lock_bh(&p->tcf_lock);
-
-	opt.action = p->tcf_action;
+	rcu_read_lock();
 
-	parms = rcu_dereference_protected(p->parms, lockdep_is_held(&p->tcf_lock));
+	parms = rcu_dereference(p->parms);
 
+	opt.action = parms->action;
 	opt.old_addr = parms->old_addr;
 	opt.new_addr = parms->new_addr;
 	opt.mask = parms->mask;
@@ -294,12 +293,12 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a,
 	tcf_tm_dump(&t, &p->tcf_tm);
 	if (nla_put_64bit(skb, TCA_NAT_TM, sizeof(t), &t, TCA_NAT_PAD))
 		goto nla_put_failure;
-	spin_unlock_bh(&p->tcf_lock);
+	rcu_read_unlock();
 
 	return skb->len;
 
 nla_put_failure:
-	spin_unlock_bh(&p->tcf_lock);
+	rcu_read_unlock();
 	nlmsg_trim(skb, b);
 	return -1;
 }
-- 
cgit v1.2.3


From 9d096746572616a50cac4906f528a1959c0ee1c2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 9 Jul 2025 09:02:01 +0000
Subject: net_sched: act_pedit: use RCU in tcf_pedit_dump()

Also storing tcf_action into struct tcf_pedit_params
makes sure there is no discrepancy in tcf_pedit_act().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250709090204.797558-10-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tc_act/tc_pedit.h |  1 +
 net/sched/act_pedit.c         | 20 ++++++++++----------
 2 files changed, 11 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/net/tc_act/tc_pedit.h b/include/net/tc_act/tc_pedit.h
index 83fe39931781..f58ee15cd858 100644
--- a/include/net/tc_act/tc_pedit.h
+++ b/include/net/tc_act/tc_pedit.h
@@ -14,6 +14,7 @@ struct tcf_pedit_key_ex {
 struct tcf_pedit_parms {
 	struct tc_pedit_key	*tcfp_keys;
 	struct tcf_pedit_key_ex	*tcfp_keys_ex;
+	int action;
 	u32 tcfp_off_max_hint;
 	unsigned char tcfp_nkeys;
 	unsigned char tcfp_flags;
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index fc0a35a7b62a..4b65901397a8 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -279,7 +279,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 	}
 
 	p = to_pedit(*a);
-
+	nparms->action = parm->action;
 	spin_lock_bh(&p->tcf_lock);
 	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
 	oparms = rcu_replace_pointer(p->parms, nparms, 1);
@@ -483,7 +483,7 @@ TC_INDIRECT_SCOPE int tcf_pedit_act(struct sk_buff *skb,
 bad:
 	tcf_action_inc_overlimit_qstats(&p->common);
 done:
-	return p->tcf_action;
+	return parms->action;
 }
 
 static void tcf_pedit_stats_update(struct tc_action *a, u64 bytes, u64 packets,
@@ -500,19 +500,19 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
 			  int bind, int ref)
 {
 	unsigned char *b = skb_tail_pointer(skb);
-	struct tcf_pedit *p = to_pedit(a);
-	struct tcf_pedit_parms *parms;
+	const struct tcf_pedit *p = to_pedit(a);
+	const struct tcf_pedit_parms *parms;
 	struct tc_pedit *opt;
 	struct tcf_t t;
 	int s;
 
-	spin_lock_bh(&p->tcf_lock);
-	parms = rcu_dereference_protected(p->parms, 1);
+	rcu_read_lock();
+	parms = rcu_dereference(p->parms);
 	s = struct_size(opt, keys, parms->tcfp_nkeys);
 
 	opt = kzalloc(s, GFP_ATOMIC);
 	if (unlikely(!opt)) {
-		spin_unlock_bh(&p->tcf_lock);
+		rcu_read_unlock();
 		return -ENOBUFS;
 	}
 	opt->nkeys = parms->tcfp_nkeys;
@@ -521,7 +521,7 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
 	       flex_array_size(opt, keys, parms->tcfp_nkeys));
 	opt->index = p->tcf_index;
 	opt->flags = parms->tcfp_flags;
-	opt->action = p->tcf_action;
+	opt->action = parms->action;
 	opt->refcnt = refcount_read(&p->tcf_refcnt) - ref;
 	opt->bindcnt = atomic_read(&p->tcf_bindcnt) - bind;
 
@@ -540,13 +540,13 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
 	tcf_tm_dump(&t, &p->tcf_tm);
 	if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD))
 		goto nla_put_failure;
-	spin_unlock_bh(&p->tcf_lock);
+	rcu_read_unlock();
 
 	kfree(opt);
 	return skb->len;
 
 nla_put_failure:
-	spin_unlock_bh(&p->tcf_lock);
+	rcu_read_unlock();
 	nlmsg_trim(skb, b);
 	kfree(opt);
 	return -1;
-- 
cgit v1.2.3


From cec7a5c6c695ba2226b6120dc330e3bea3ea96f8 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 9 Jul 2025 09:02:02 +0000
Subject: net_sched: act_police: use RCU in tcf_police_dump()

Also storing tcf_action into struct tcf_police_params
makes sure there is no discrepancy in tcf_police_act().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250709090204.797558-11-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tc_act/tc_police.h |  3 ++-
 net/sched/act_police.c         | 18 +++++++++---------
 2 files changed, 11 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/net/tc_act/tc_police.h b/include/net/tc_act/tc_police.h
index 490d88cb5233..a89fc8e68b1e 100644
--- a/include/net/tc_act/tc_police.h
+++ b/include/net/tc_act/tc_police.h
@@ -5,10 +5,11 @@
 #include <net/act_api.h>
 
 struct tcf_police_params {
+	int			action;
 	int			tcfp_result;
 	u32			tcfp_ewma_rate;
-	s64			tcfp_burst;
 	u32			tcfp_mtu;
+	s64			tcfp_burst;
 	s64			tcfp_mtu_ptoks;
 	s64			tcfp_pkt_burst;
 	struct psched_ratecfg	rate;
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index a214ed681142..0e1c61183379 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -198,6 +198,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
 		psched_ppscfg_precompute(&new->ppsrate, pps);
 	}
 
+	new->action = parm->action;
 	spin_lock_bh(&police->tcf_lock);
 	spin_lock_bh(&police->tcfp_lock);
 	police->tcfp_t_c = ktime_get_ns();
@@ -254,8 +255,8 @@ TC_INDIRECT_SCOPE int tcf_police_act(struct sk_buff *skb,
 	tcf_lastuse_update(&police->tcf_tm);
 	bstats_update(this_cpu_ptr(police->common.cpu_bstats), skb);
 
-	ret = READ_ONCE(police->tcf_action);
 	p = rcu_dereference_bh(police->params);
+	ret = p->action;
 
 	if (p->tcfp_ewma_rate) {
 		struct gnet_stats_rate_est64 sample;
@@ -338,9 +339,9 @@ static void tcf_police_stats_update(struct tc_action *a,
 static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a,
 			       int bind, int ref)
 {
+	const struct tcf_police *police = to_police(a);
 	unsigned char *b = skb_tail_pointer(skb);
-	struct tcf_police *police = to_police(a);
-	struct tcf_police_params *p;
+	const struct tcf_police_params *p;
 	struct tc_police opt = {
 		.index = police->tcf_index,
 		.refcnt = refcount_read(&police->tcf_refcnt) - ref,
@@ -348,10 +349,9 @@ static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a,
 	};
 	struct tcf_t t;
 
-	spin_lock_bh(&police->tcf_lock);
-	opt.action = police->tcf_action;
-	p = rcu_dereference_protected(police->params,
-				      lockdep_is_held(&police->tcf_lock));
+	rcu_read_lock();
+	p = rcu_dereference(police->params);
+	opt.action = p->action;
 	opt.mtu = p->tcfp_mtu;
 	opt.burst = PSCHED_NS2TICKS(p->tcfp_burst);
 	if (p->rate_present) {
@@ -392,12 +392,12 @@ static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a,
 	tcf_tm_dump(&t, &police->tcf_tm);
 	if (nla_put_64bit(skb, TCA_POLICE_TM, sizeof(t), &t, TCA_POLICE_PAD))
 		goto nla_put_failure;
-	spin_unlock_bh(&police->tcf_lock);
+	rcu_read_unlock();
 
 	return skb->len;
 
 nla_put_failure:
-	spin_unlock_bh(&police->tcf_lock);
+	rcu_read_unlock();
 	nlmsg_trim(skb, b);
 	return -1;
 }
-- 
cgit v1.2.3


From 1f376373bd225c90381b745e38fa65a9386f7f8e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 9 Jul 2025 09:02:03 +0000
Subject: net_sched: act_skbedit: use RCU in tcf_skbedit_dump()

Also storing tcf_action into struct tcf_skbedit_params
makes sure there is no discrepancy in tcf_skbedit_act().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250709090204.797558-12-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tc_act/tc_skbedit.h |  1 +
 net/sched/act_skbedit.c         | 20 +++++++++-----------
 2 files changed, 10 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/net/tc_act/tc_skbedit.h b/include/net/tc_act/tc_skbedit.h
index 9649600fb3dc..31b2cd0bebb5 100644
--- a/include/net/tc_act/tc_skbedit.h
+++ b/include/net/tc_act/tc_skbedit.h
@@ -12,6 +12,7 @@
 #include <linux/tc_act/tc_skbedit.h>
 
 struct tcf_skbedit_params {
+	int action;
 	u32 flags;
 	u32 priority;
 	u32 mark;
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 1f1d9ce3e968..8c1d1554f657 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -43,13 +43,11 @@ TC_INDIRECT_SCOPE int tcf_skbedit_act(struct sk_buff *skb,
 {
 	struct tcf_skbedit *d = to_skbedit(a);
 	struct tcf_skbedit_params *params;
-	int action;
 
 	tcf_lastuse_update(&d->tcf_tm);
 	bstats_update(this_cpu_ptr(d->common.cpu_bstats), skb);
 
 	params = rcu_dereference_bh(d->params);
-	action = READ_ONCE(d->tcf_action);
 
 	if (params->flags & SKBEDIT_F_PRIORITY)
 		skb->priority = params->priority;
@@ -85,7 +83,7 @@ TC_INDIRECT_SCOPE int tcf_skbedit_act(struct sk_buff *skb,
 	}
 	if (params->flags & SKBEDIT_F_PTYPE)
 		skb->pkt_type = params->ptype;
-	return action;
+	return params->action;
 
 err:
 	qstats_drop_inc(this_cpu_ptr(d->common.cpu_qstats));
@@ -262,6 +260,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
 	if (flags & SKBEDIT_F_MASK)
 		params_new->mask = *mask;
 
+	params_new->action = parm->action;
 	spin_lock_bh(&d->tcf_lock);
 	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
 	params_new = rcu_replace_pointer(d->params, params_new,
@@ -284,9 +283,9 @@ release_idr:
 static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
 			    int bind, int ref)
 {
+	const struct tcf_skbedit *d = to_skbedit(a);
 	unsigned char *b = skb_tail_pointer(skb);
-	struct tcf_skbedit *d = to_skbedit(a);
-	struct tcf_skbedit_params *params;
+	const struct tcf_skbedit_params *params;
 	struct tc_skbedit opt = {
 		.index   = d->tcf_index,
 		.refcnt  = refcount_read(&d->tcf_refcnt) - ref,
@@ -295,10 +294,9 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
 	u64 pure_flags = 0;
 	struct tcf_t t;
 
-	spin_lock_bh(&d->tcf_lock);
-	params = rcu_dereference_protected(d->params,
-					   lockdep_is_held(&d->tcf_lock));
-	opt.action = d->tcf_action;
+	rcu_read_lock();
+	params = rcu_dereference(d->params);
+	opt.action = params->action;
 
 	if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt))
 		goto nla_put_failure;
@@ -333,12 +331,12 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
 	tcf_tm_dump(&t, &d->tcf_tm);
 	if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD))
 		goto nla_put_failure;
-	spin_unlock_bh(&d->tcf_lock);
+	rcu_read_unlock();
 
 	return skb->len;
 
 nla_put_failure:
-	spin_unlock_bh(&d->tcf_lock);
+	rcu_read_unlock();
 	nlmsg_trim(skb, b);
 	return -1;
 }
-- 
cgit v1.2.3


From 8feaf9832be52be16e588029366e27940f6b88ea Mon Sep 17 00:00:00 2001
From: Michael Guralnik <michaelgur@nvidia.com>
Date: Wed, 9 Jul 2025 09:42:08 +0300
Subject: net/mlx5: Expose HCA capability bits for mkey max page size

Expose the HCA capability for maximal page size that can be configured
for an mkey. Used for enforcing capabilities when working with highly
contiguous memory and using large page sizes.

Signed-off-by: Michael Guralnik <michaelgur@nvidia.com>
Link: https://patch.msgid.link/3e4d3fda37934430f65f72601519e22bf396fd05.1751979184.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/mlx5_ifc.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 0e93f342be09..a1bd92ed8f3a 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -2171,7 +2171,9 @@ struct mlx5_ifc_cmd_hca_cap_2_bits {
 	u8	   min_mkey_log_entity_size_fixed_buffer[0x5];
 	u8	   ec_vf_vport_base[0x10];
 
-	u8	   reserved_at_3a0[0xa];
+	u8	   reserved_at_3a0[0x2];
+	u8	   max_mkey_log_entity_size_fixed_buffer[0x6];
+	u8	   reserved_at_3a8[0x2];
 	u8	   max_mkey_log_entity_size_mtt[0x6];
 	u8	   max_rqt_vhca_id[0x10];
 
-- 
cgit v1.2.3


From c4f96972c3c206ac8f6770b5ecd5320b561d0058 Mon Sep 17 00:00:00 2001
From: Edward Srouji <edwards@nvidia.com>
Date: Wed, 9 Jul 2025 09:42:09 +0300
Subject: RDMA/mlx5: Fix UMR modifying of mkey page size

When changing the page size on an mkey, the driver needs to set the
appropriate bits in the mkey mask to indicate which fields are being
modified.
The 6th bit of the page size in the mlx5 driver is considered an extension,
and this bit has dedicated capability and mask bits.

Previously, the driver was not setting this mask in the mkey mask when
performing page size changes, regardless of its hardware support,
potentially leading to incorrect page size updates.

This fixes the issue by setting the relevant bit in the mkey mask when
performing page size changes on an mkey and the 6th bit of this field is
supported by the hardware.

Fixes: cef7dde8836a ("net/mlx5: Expand mkey page size to support 6 bits")
Signed-off-by: Edward Srouji <edwards@nvidia.com>
Reviewed-by: Michael Guralnik <michaelgur@nvidia.com>
Link: https://patch.msgid.link/9f43a9c73bf2db6085a99dc836f7137e76579f09.1751979184.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 drivers/infiniband/hw/mlx5/umr.c | 6 ++++--
 include/linux/mlx5/device.h      | 1 +
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c
index 5be4426a2884..25601dea9e30 100644
--- a/drivers/infiniband/hw/mlx5/umr.c
+++ b/drivers/infiniband/hw/mlx5/umr.c
@@ -32,13 +32,15 @@ static __be64 get_umr_disable_mr_mask(void)
 	return cpu_to_be64(result);
 }
 
-static __be64 get_umr_update_translation_mask(void)
+static __be64 get_umr_update_translation_mask(struct mlx5_ib_dev *dev)
 {
 	u64 result;
 
 	result = MLX5_MKEY_MASK_LEN |
 		 MLX5_MKEY_MASK_PAGE_SIZE |
 		 MLX5_MKEY_MASK_START_ADDR;
+	if (MLX5_CAP_GEN_2(dev->mdev, umr_log_entity_size_5))
+		result |= MLX5_MKEY_MASK_PAGE_SIZE_5;
 
 	return cpu_to_be64(result);
 }
@@ -654,7 +656,7 @@ static void mlx5r_umr_final_update_xlt(struct mlx5_ib_dev *dev,
 		flags & MLX5_IB_UPD_XLT_ENABLE || flags & MLX5_IB_UPD_XLT_ADDR;
 
 	if (update_translation) {
-		wqe->ctrl_seg.mkey_mask |= get_umr_update_translation_mask();
+		wqe->ctrl_seg.mkey_mask |= get_umr_update_translation_mask(dev);
 		if (!mr->ibmr.length)
 			MLX5_SET(mkc, &wqe->mkey_seg, length64, 1);
 	}
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 6822cfa5f4ad..9d2467f982ad 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -280,6 +280,7 @@ enum {
 	MLX5_MKEY_MASK_SMALL_FENCE	= 1ull << 23,
 	MLX5_MKEY_MASK_RELAXED_ORDERING_WRITE	= 1ull << 25,
 	MLX5_MKEY_MASK_FREE			= 1ull << 29,
+	MLX5_MKEY_MASK_PAGE_SIZE_5		= 1ull << 42,
 	MLX5_MKEY_MASK_RELAXED_ORDERING_READ	= 1ull << 47,
 };
 
-- 
cgit v1.2.3


From cbe080f931f48bc7b054008fc2567d1c8c247a89 Mon Sep 17 00:00:00 2001
From: Carolina Jubran <cjubran@nvidia.com>
Date: Wed, 9 Jul 2025 15:41:06 +0300
Subject: net/mlx5: Expose disciplined_fr_counter through HCA capabilities in
 mlx5_ifc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce the `disciplined_fr_counter` capability bit to indicate that
the device’s free-running cycle counter is disciplined to real-time.

Signed-off-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Dragos Tatulea <dtatulea@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1752064867-16874-2-git-send-email-tariqt@nvidia.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/mlx5_ifc.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index a1bd92ed8f3a..d7684bb28a3a 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1846,7 +1846,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 
 	u8         log_bf_reg_size[0x5];
 
-	u8         reserved_at_270[0x3];
+	u8         disciplined_fr_counter[0x1];
+	u8         reserved_at_271[0x2];
 	u8	   qp_error_syndrome[0x1];
 	u8	   reserved_at_274[0x2];
 	u8         lag_dct[0x2];
-- 
cgit v1.2.3


From cd1746cb6555a2238c4aae9f9d60b637a61bf177 Mon Sep 17 00:00:00 2001
From: Daniel Jurgens <danielj@nvidia.com>
Date: Wed, 9 Jul 2025 15:41:07 +0300
Subject: net/mlx5: IFC updates for disabled host PF

The port 2 host PF can be disabled; this bit reflects that setting.

Signed-off-by: Daniel Jurgens <danielj@nvidia.com>
Reviewed-by: William Tu <witu@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1752064867-16874-3-git-send-email-tariqt@nvidia.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/mlx5_ifc.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index d7684bb28a3a..639dd0b56655 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -12383,7 +12383,9 @@ struct mlx5_ifc_mtrc_ctrl_bits {
 
 struct mlx5_ifc_host_params_context_bits {
 	u8         host_number[0x8];
-	u8         reserved_at_8[0x7];
+	u8         reserved_at_8[0x5];
+	u8         host_pf_not_exist[0x1];
+	u8         reserved_at_14[0x1];
 	u8         host_pf_disabled[0x1];
 	u8         host_num_of_vfs[0x10];
 
-- 
cgit v1.2.3


From 1a40c362ae265ca4004f7373b34c22af6810f6cb Mon Sep 17 00:00:00 2001
From: Michael Margolin <mrgolin@amazon.com>
Date: Tue, 8 Jul 2025 20:23:06 +0000
Subject: RDMA/uverbs: Add a common way to create CQ with umem

Add ioctl command attributes and common handling for the option to
create CQs with memory buffers passed from userspace. When required
attributes are supplied, create umem and provide it for driver's use.
The extension enables creation of CQs on top of preallocated CPU
virtual or device memory buffers, by supplying VA or dmabuf fd, in a
common way.
Drivers can support this flow by initializing a new create_cq_umem fp
field in their ops struct, with a function that can handle the new
parameter.

Signed-off-by: Michael Margolin <mrgolin@amazon.com>
Link: https://patch.msgid.link/20250708202308.24783-2-mrgolin@amazon.com
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 drivers/infiniband/core/device.c              |  1 +
 drivers/infiniband/core/uverbs_std_types_cq.c | 87 +++++++++++++++++++++++++--
 include/rdma/ib_verbs.h                       |  4 ++
 include/uapi/rdma/ib_user_ioctl_cmds.h        |  4 ++
 4 files changed, 90 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 1ca6a9b7ba1a..f301cdce1728 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -2728,6 +2728,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
 	SET_DEVICE_OP(dev_ops, create_ah);
 	SET_DEVICE_OP(dev_ops, create_counters);
 	SET_DEVICE_OP(dev_ops, create_cq);
+	SET_DEVICE_OP(dev_ops, create_cq_umem);
 	SET_DEVICE_OP(dev_ops, create_flow);
 	SET_DEVICE_OP(dev_ops, create_qp);
 	SET_DEVICE_OP(dev_ops, create_rwq_ind_table);
diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c
index 432054f0a8a4..37cd37556510 100644
--- a/drivers/infiniband/core/uverbs_std_types_cq.c
+++ b/drivers/infiniband/core/uverbs_std_types_cq.c
@@ -64,15 +64,21 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
 	struct ib_ucq_object *obj = container_of(
 		uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_CQ_HANDLE),
 		typeof(*obj), uevent.uobject);
+	struct ib_uverbs_completion_event_file *ev_file = NULL;
 	struct ib_device *ib_dev = attrs->context->device;
-	int ret;
-	u64 user_handle;
+	struct ib_umem_dmabuf *umem_dmabuf;
 	struct ib_cq_init_attr attr = {};
-	struct ib_cq                   *cq;
-	struct ib_uverbs_completion_event_file    *ev_file = NULL;
 	struct ib_uobject *ev_file_uobj;
+	struct ib_umem *umem = NULL;
+	u64 buffer_length;
+	u64 buffer_offset;
+	struct ib_cq *cq;
+	u64 user_handle;
+	u64 buffer_va;
+	int buffer_fd;
+	int ret;
 
-	if (!ib_dev->ops.create_cq || !ib_dev->ops.destroy_cq)
+	if ((!ib_dev->ops.create_cq && !ib_dev->ops.create_cq_umem) || !ib_dev->ops.destroy_cq)
 		return -EOPNOTSUPP;
 
 	ret = uverbs_copy_from(&attr.comp_vector, attrs,
@@ -112,9 +118,66 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
 	INIT_LIST_HEAD(&obj->comp_list);
 	INIT_LIST_HEAD(&obj->uevent.event_list);
 
+	if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_VA)) {
+
+		ret = uverbs_copy_from(&buffer_va, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_VA);
+		if (ret)
+			goto err_event_file;
+
+		ret = uverbs_copy_from(&buffer_length, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_LENGTH);
+		if (ret)
+			goto err_event_file;
+
+		if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_FD) ||
+		    uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_OFFSET) ||
+		    !ib_dev->ops.create_cq_umem) {
+			ret = -EINVAL;
+			goto err_event_file;
+		}
+
+		umem = ib_umem_get(ib_dev, buffer_va, buffer_length, IB_ACCESS_LOCAL_WRITE);
+		if (IS_ERR(umem)) {
+			ret = PTR_ERR(umem);
+			goto err_event_file;
+		}
+	} else if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_FD)) {
+
+		ret = uverbs_get_raw_fd(&buffer_fd, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_FD);
+		if (ret)
+			goto err_event_file;
+
+		ret = uverbs_copy_from(&buffer_offset, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_OFFSET);
+		if (ret)
+			goto err_event_file;
+
+		ret = uverbs_copy_from(&buffer_length, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_LENGTH);
+		if (ret)
+			goto err_event_file;
+
+		if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_VA) ||
+		    !ib_dev->ops.create_cq_umem) {
+			ret = -EINVAL;
+			goto err_event_file;
+		}
+
+		umem_dmabuf = ib_umem_dmabuf_get_pinned(ib_dev, buffer_offset, buffer_length,
+							buffer_fd, IB_ACCESS_LOCAL_WRITE);
+		if (IS_ERR(umem_dmabuf)) {
+			ret = PTR_ERR(umem_dmabuf);
+			goto err_event_file;
+		}
+		umem = &umem_dmabuf->umem;
+	} else if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_OFFSET) ||
+		   uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_LENGTH) ||
+		   !ib_dev->ops.create_cq) {
+		ret = -EINVAL;
+		goto err_event_file;
+	}
+
 	cq = rdma_zalloc_drv_obj(ib_dev, ib_cq);
 	if (!cq) {
 		ret = -ENOMEM;
+		ib_umem_release(umem);
 		goto err_event_file;
 	}
 
@@ -128,7 +191,8 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
 	rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ);
 	rdma_restrack_set_name(&cq->res, NULL);
 
-	ret = ib_dev->ops.create_cq(cq, &attr, attrs);
+	ret = umem ? ib_dev->ops.create_cq_umem(cq, &attr, umem, attrs) :
+		ib_dev->ops.create_cq(cq, &attr, attrs);
 	if (ret)
 		goto err_free;
 
@@ -180,6 +244,17 @@ DECLARE_UVERBS_NAMED_METHOD(
 		       UVERBS_OBJECT_ASYNC_EVENT,
 		       UVERBS_ACCESS_READ,
 		       UA_OPTIONAL),
+	UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_BUFFER_VA,
+			   UVERBS_ATTR_TYPE(u64),
+			   UA_OPTIONAL),
+	UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_BUFFER_LENGTH,
+			   UVERBS_ATTR_TYPE(u64),
+			   UA_OPTIONAL),
+	UVERBS_ATTR_RAW_FD(UVERBS_ATTR_CREATE_CQ_BUFFER_FD,
+			   UA_OPTIONAL),
+	UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_BUFFER_OFFSET,
+			   UVERBS_ATTR_TYPE(u64),
+			   UA_OPTIONAL),
 	UVERBS_ATTR_UHW());
 
 static int UVERBS_HANDLER(UVERBS_METHOD_CQ_DESTROY)(
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 1d123812a1f9..3fb1c963eeb0 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2486,6 +2486,10 @@ struct ib_device_ops {
 	int (*destroy_qp)(struct ib_qp *qp, struct ib_udata *udata);
 	int (*create_cq)(struct ib_cq *cq, const struct ib_cq_init_attr *attr,
 			 struct uverbs_attr_bundle *attrs);
+	int (*create_cq_umem)(struct ib_cq *cq,
+			      const struct ib_cq_init_attr *attr,
+			      struct ib_umem *umem,
+			      struct uverbs_attr_bundle *attrs);
 	int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period);
 	int (*destroy_cq)(struct ib_cq *cq, struct ib_udata *udata);
 	int (*resize_cq)(struct ib_cq *cq, int cqe, struct ib_udata *udata);
diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h
index ac7b162611ed..5f3e5bee51b2 100644
--- a/include/uapi/rdma/ib_user_ioctl_cmds.h
+++ b/include/uapi/rdma/ib_user_ioctl_cmds.h
@@ -105,6 +105,10 @@ enum uverbs_attrs_create_cq_cmd_attr_ids {
 	UVERBS_ATTR_CREATE_CQ_FLAGS,
 	UVERBS_ATTR_CREATE_CQ_RESP_CQE,
 	UVERBS_ATTR_CREATE_CQ_EVENT_FD,
+	UVERBS_ATTR_CREATE_CQ_BUFFER_VA,
+	UVERBS_ATTR_CREATE_CQ_BUFFER_LENGTH,
+	UVERBS_ATTR_CREATE_CQ_BUFFER_FD,
+	UVERBS_ATTR_CREATE_CQ_BUFFER_OFFSET,
 };
 
 enum uverbs_attrs_destroy_cq_cmd_attr_ids {
-- 
cgit v1.2.3


From c897c2c8b8e82981df10df546c753ac857612937 Mon Sep 17 00:00:00 2001
From: Michael Margolin <mrgolin@amazon.com>
Date: Tue, 8 Jul 2025 20:23:07 +0000
Subject: RDMA/core: Add umem "is_contiguous" and "start_dma_addr" helpers

In some cases drivers may need to check if a given umem is contiguous.
Add a helper function in core code so that drivers don't need to deal
with the umem or scatter-gather list structure.
Additionally add a helper for getting umem's start DMA address and use
it in other helper functions that open code it.

Signed-off-by: Michael Margolin <mrgolin@amazon.com>
Link: https://patch.msgid.link/20250708202308.24783-3-mrgolin@amazon.com
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/rdma/ib_umem.h | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index 7dc7b1cc71b5..0a8e092c0ea8 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -52,11 +52,15 @@ static inline int ib_umem_offset(struct ib_umem *umem)
 	return umem->address & ~PAGE_MASK;
 }
 
+static inline dma_addr_t ib_umem_start_dma_addr(struct ib_umem *umem)
+{
+	return sg_dma_address(umem->sgt_append.sgt.sgl) + ib_umem_offset(umem);
+}
+
 static inline unsigned long ib_umem_dma_offset(struct ib_umem *umem,
 					       unsigned long pgsz)
 {
-	return (sg_dma_address(umem->sgt_append.sgt.sgl) + ib_umem_offset(umem)) &
-	       (pgsz - 1);
+	return ib_umem_start_dma_addr(umem) & (pgsz - 1);
 }
 
 static inline size_t ib_umem_num_dma_blocks(struct ib_umem *umem,
@@ -135,14 +139,27 @@ static inline unsigned long ib_umem_find_best_pgoff(struct ib_umem *umem,
 						    unsigned long pgsz_bitmap,
 						    u64 pgoff_bitmask)
 {
-	struct scatterlist *sg = umem->sgt_append.sgt.sgl;
 	dma_addr_t dma_addr;
 
-	dma_addr = sg_dma_address(sg) + (umem->address & ~PAGE_MASK);
+	dma_addr = ib_umem_start_dma_addr(umem);
 	return ib_umem_find_best_pgsz(umem, pgsz_bitmap,
 				      dma_addr & pgoff_bitmask);
 }
 
+static inline bool ib_umem_is_contiguous(struct ib_umem *umem)
+{
+	dma_addr_t dma_addr;
+	unsigned long pgsz;
+
+	/*
+	 * Select the smallest aligned page that can contain the whole umem if
+	 * it was contiguous.
+	 */
+	dma_addr = ib_umem_start_dma_addr(umem);
+	pgsz = roundup_pow_of_two((dma_addr ^ (umem->length - 1 + dma_addr)) + 1);
+	return !!ib_umem_find_best_pgoff(umem, pgsz, U64_MAX);
+}
+
 struct ib_umem_dmabuf *ib_umem_dmabuf_get(struct ib_device *device,
 					  unsigned long offset, size_t size,
 					  int fd, int access,
-- 
cgit v1.2.3


From 9fb3dd85197f5e5901a81b104a0f8b513148d138 Mon Sep 17 00:00:00 2001
From: Michael Margolin <mrgolin@amazon.com>
Date: Tue, 8 Jul 2025 20:23:08 +0000
Subject: RDMA/efa: Add CQ with external memory support

Add an option to create CQ using external memory instead of allocating
in the driver. The memory can be passed from userspace by dmabuf fd and
an offset or a VA. One of the possible usages is creating CQs that
reside in accelerator memory, allowing low latency asynchronous direct
polling from the accelerator device. Add a capability bit to reflect
support for the feature.

Reviewed-by: Daniel Kranzdorf <dkkranzd@amazon.com>
Reviewed-by: Yonatan Nachum <ynachum@amazon.com>
Signed-off-by: Michael Margolin <mrgolin@amazon.com>
Link: https://patch.msgid.link/20250708202308.24783-4-mrgolin@amazon.com
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 drivers/infiniband/hw/efa/efa.h       |  3 ++
 drivers/infiniband/hw/efa/efa_main.c  |  1 +
 drivers/infiniband/hw/efa/efa_verbs.c | 61 +++++++++++++++++++++++++++--------
 include/uapi/rdma/efa-abi.h           |  3 +-
 4 files changed, 53 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/efa/efa.h b/drivers/infiniband/hw/efa/efa.h
index 838182d0409c..3d49c1db928e 100644
--- a/drivers/infiniband/hw/efa/efa.h
+++ b/drivers/infiniband/hw/efa/efa.h
@@ -107,6 +107,7 @@ struct efa_cq {
 	u16 cq_idx;
 	/* NULL when no interrupts requested */
 	struct efa_eq *eq;
+	struct ib_umem *umem;
 };
 
 struct efa_qp {
@@ -162,6 +163,8 @@ int efa_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr,
 int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
 int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 		  struct uverbs_attr_bundle *attrs);
+int efa_create_cq_umem(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		       struct ib_umem *umem, struct uverbs_attr_bundle *attrs);
 struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
 			 u64 virt_addr, int access_flags,
 			 struct ib_udata *udata);
diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c
index 4f03c0ec819f..6c415b9adb5f 100644
--- a/drivers/infiniband/hw/efa/efa_main.c
+++ b/drivers/infiniband/hw/efa/efa_main.c
@@ -372,6 +372,7 @@ static const struct ib_device_ops efa_dev_ops = {
 	.alloc_pd = efa_alloc_pd,
 	.alloc_ucontext = efa_alloc_ucontext,
 	.create_cq = efa_create_cq,
+	.create_cq_umem = efa_create_cq_umem,
 	.create_qp = efa_create_qp,
 	.create_user_ah = efa_create_ah,
 	.dealloc_pd = efa_dealloc_pd,
diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c
index 7c708029b4b4..0f68aec12883 100644
--- a/drivers/infiniband/hw/efa/efa_verbs.c
+++ b/drivers/infiniband/hw/efa/efa_verbs.c
@@ -254,6 +254,7 @@ int efa_query_device(struct ib_device *ibdev,
 		resp.max_rdma_size = dev_attr->max_rdma_size;
 
 		resp.device_caps |= EFA_QUERY_DEVICE_CAPS_CQ_WITH_SGID;
+		resp.device_caps |= EFA_QUERY_DEVICE_CAPS_CQ_WITH_EXT_MEM;
 		if (EFA_DEV_CAP(dev, RDMA_READ))
 			resp.device_caps |= EFA_QUERY_DEVICE_CAPS_RDMA_READ;
 
@@ -1087,8 +1088,11 @@ int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 		xa_erase(&dev->cqs_xa, cq->cq_idx);
 		synchronize_irq(cq->eq->irq.irqn);
 	}
-	efa_free_mapped(dev, cq->cpu_addr, cq->dma_addr, cq->size,
-			DMA_FROM_DEVICE);
+
+	if (cq->umem)
+		ib_umem_release(cq->umem);
+	else
+		efa_free_mapped(dev, cq->cpu_addr, cq->dma_addr, cq->size, DMA_FROM_DEVICE);
 	return 0;
 }
 
@@ -1127,8 +1131,8 @@ static int cq_mmap_entries_setup(struct efa_dev *dev, struct efa_cq *cq,
 	return 0;
 }
 
-int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
-		  struct uverbs_attr_bundle *attrs)
+int efa_create_cq_umem(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		       struct ib_umem *umem, struct uverbs_attr_bundle *attrs)
 {
 	struct ib_udata *udata = &attrs->driver_udata;
 	struct efa_ucontext *ucontext = rdma_udata_to_drv_context(
@@ -1207,11 +1211,30 @@ int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 
 	cq->ucontext = ucontext;
 	cq->size = PAGE_ALIGN(cmd.cq_entry_size * entries * cmd.num_sub_cqs);
-	cq->cpu_addr = efa_zalloc_mapped(dev, &cq->dma_addr, cq->size,
-					 DMA_FROM_DEVICE);
-	if (!cq->cpu_addr) {
-		err = -ENOMEM;
-		goto err_out;
+
+	if (umem) {
+		if (umem->length < cq->size) {
+			ibdev_dbg(&dev->ibdev, "External memory too small\n");
+			err = -EINVAL;
+			goto err_free_mem;
+		}
+
+		if (!ib_umem_is_contiguous(umem)) {
+			ibdev_dbg(&dev->ibdev, "Non contiguous CQ unsupported\n");
+			err = -EINVAL;
+			goto err_free_mem;
+		}
+
+		cq->cpu_addr = NULL;
+		cq->dma_addr = ib_umem_start_dma_addr(umem);
+		cq->umem = umem;
+	} else {
+		cq->cpu_addr = efa_zalloc_mapped(dev, &cq->dma_addr, cq->size,
+						 DMA_FROM_DEVICE);
+		if (!cq->cpu_addr) {
+			err = -ENOMEM;
+			goto err_out;
+		}
 	}
 
 	params.uarn = cq->ucontext->uarn;
@@ -1228,7 +1251,7 @@ int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 
 	err = efa_com_create_cq(&dev->edev, &params, &result);
 	if (err)
-		goto err_free_mapped;
+		goto err_free_mem;
 
 	resp.db_off = result.db_off;
 	resp.cq_idx = result.cq_idx;
@@ -1236,7 +1259,9 @@ int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 	cq->ibcq.cqe = result.actual_depth;
 	WARN_ON_ONCE(entries != result.actual_depth);
 
-	err = cq_mmap_entries_setup(dev, cq, &resp, result.db_valid);
+	if (!umem)
+		err = cq_mmap_entries_setup(dev, cq, &resp, result.db_valid);
+
 	if (err) {
 		ibdev_dbg(ibdev, "Could not setup cq[%u] mmap entries\n",
 			  cq->cq_idx);
@@ -1274,15 +1299,23 @@ err_remove_mmap:
 	efa_cq_user_mmap_entries_remove(cq);
 err_destroy_cq:
 	efa_destroy_cq_idx(dev, cq->cq_idx);
-err_free_mapped:
-	efa_free_mapped(dev, cq->cpu_addr, cq->dma_addr, cq->size,
-			DMA_FROM_DEVICE);
+err_free_mem:
+	if (umem)
+		ib_umem_release(umem);
+	else
+		efa_free_mapped(dev, cq->cpu_addr, cq->dma_addr, cq->size, DMA_FROM_DEVICE);
 
 err_out:
 	atomic64_inc(&dev->stats.create_cq_err);
 	return err;
 }
 
+int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+		  struct uverbs_attr_bundle *attrs)
+{
+	return efa_create_cq_umem(ibcq, attr, NULL, attrs);
+}
+
 static int umem_to_page_list(struct efa_dev *dev,
 			     struct ib_umem *umem,
 			     u64 *page_list,
diff --git a/include/uapi/rdma/efa-abi.h b/include/uapi/rdma/efa-abi.h
index 11b94b0b035b..98b71b9979f8 100644
--- a/include/uapi/rdma/efa-abi.h
+++ b/include/uapi/rdma/efa-abi.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */
 /*
- * Copyright 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright 2018-2025 Amazon.com, Inc. or its affiliates. All rights reserved.
  */
 
 #ifndef EFA_ABI_USER_H
@@ -131,6 +131,7 @@ enum {
 	EFA_QUERY_DEVICE_CAPS_DATA_POLLING_128 = 1 << 4,
 	EFA_QUERY_DEVICE_CAPS_RDMA_WRITE = 1 << 5,
 	EFA_QUERY_DEVICE_CAPS_UNSOLICITED_WRITE_RECV = 1 << 6,
+	EFA_QUERY_DEVICE_CAPS_CQ_WITH_EXT_MEM = 1 << 7,
 };
 
 struct efa_ibv_ex_query_device_resp {
-- 
cgit v1.2.3


From 11d58620dfd0c52b0c49b04d28707c7a5a2d00ae Mon Sep 17 00:00:00 2001
From: David Lechner <dlechner@baylibre.com>
Date: Tue, 1 Jul 2025 16:37:51 -0500
Subject: iio: adc: ad_sigma_delta: use u8 instead of uint8_t
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace uint8_t with u8 in the ad_sigma_delta driver.

Technically, uint8_t comes from the C standard library, while u8 is a
Linux kernel type. Since we don't use the C standard library in the
kernel, we should use the kernel types instead.

There is also one instance where int64_t is replaced with s64.

Signed-off-by: David Lechner <dlechner@baylibre.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Nuno Sá <nuno.sa@analog.com>
Link: https://patch.msgid.link/20250701-iio-adc-ad7173-add-spi-offload-support-v3-3-42abb83e3dac@baylibre.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/adc/ad_sigma_delta.c       | 17 +++++++++--------
 include/linux/iio/adc/ad_sigma_delta.h | 10 +++++-----
 2 files changed, 14 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/iio/adc/ad_sigma_delta.c b/drivers/iio/adc/ad_sigma_delta.c
index 5cdd73160c6d..5362157966d8 100644
--- a/drivers/iio/adc/ad_sigma_delta.c
+++ b/drivers/iio/adc/ad_sigma_delta.c
@@ -14,6 +14,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/spi/spi.h>
+#include <linux/types.h>
 #include <linux/unaligned.h>
 
 #include <linux/iio/adc/ad_sigma_delta.h>
@@ -38,7 +39,7 @@
  * @sigma_delta: The sigma delta device
  * @comm: New value for the communications register
  */
-void ad_sd_set_comm(struct ad_sigma_delta *sigma_delta, uint8_t comm)
+void ad_sd_set_comm(struct ad_sigma_delta *sigma_delta, u8 comm)
 {
 	/* Some variants use the lower two bits of the communications register
 	 * to select the channel */
@@ -59,7 +60,7 @@ EXPORT_SYMBOL_NS_GPL(ad_sd_set_comm, "IIO_AD_SIGMA_DELTA");
 int ad_sd_write_reg(struct ad_sigma_delta *sigma_delta, unsigned int reg,
 	unsigned int size, unsigned int val)
 {
-	uint8_t *data = sigma_delta->tx_buf;
+	u8 *data = sigma_delta->tx_buf;
 	struct spi_transfer t = {
 		.tx_buf		= data,
 		.len		= size + 1,
@@ -99,9 +100,9 @@ int ad_sd_write_reg(struct ad_sigma_delta *sigma_delta, unsigned int reg,
 EXPORT_SYMBOL_NS_GPL(ad_sd_write_reg, "IIO_AD_SIGMA_DELTA");
 
 static int ad_sd_read_reg_raw(struct ad_sigma_delta *sigma_delta,
-	unsigned int reg, unsigned int size, uint8_t *val)
+			      unsigned int reg, unsigned int size, u8 *val)
 {
-	uint8_t *data = sigma_delta->tx_buf;
+	u8 *data = sigma_delta->tx_buf;
 	int ret;
 	struct spi_transfer t[] = {
 		{
@@ -185,8 +186,8 @@ EXPORT_SYMBOL_NS_GPL(ad_sd_read_reg, "IIO_AD_SIGMA_DELTA");
 int ad_sd_reset(struct ad_sigma_delta *sigma_delta)
 {
 	unsigned int reset_length = sigma_delta->info->num_resetclks;
-	uint8_t *buf;
 	unsigned int size;
+	u8 *buf;
 	int ret;
 
 	size = DIV_ROUND_UP(reset_length, 8);
@@ -454,7 +455,7 @@ static int ad_sd_buffer_postenable(struct iio_dev *indio_dev)
 	struct ad_sigma_delta *sigma_delta = iio_device_get_drvdata(indio_dev);
 	unsigned int i, slot, samples_buf_size;
 	unsigned int channel;
-	uint8_t *samples_buf;
+	u8 *samples_buf;
 	int ret;
 
 	if (sigma_delta->num_slots == 1) {
@@ -488,7 +489,7 @@ static int ad_sd_buffer_postenable(struct iio_dev *indio_dev)
 	}
 
 	samples_buf_size = ALIGN(slot * indio_dev->channels[0].scan_type.storagebits / 8, 8);
-	samples_buf_size += sizeof(int64_t);
+	samples_buf_size += sizeof(s64);
 	samples_buf = devm_krealloc(&sigma_delta->spi->dev, sigma_delta->samples_buf,
 				    samples_buf_size, GFP_KERNEL);
 	if (!samples_buf)
@@ -543,7 +544,7 @@ static irqreturn_t ad_sd_trigger_handler(int irq, void *p)
 	struct iio_poll_func *pf = p;
 	struct iio_dev *indio_dev = pf->indio_dev;
 	struct ad_sigma_delta *sigma_delta = iio_device_get_drvdata(indio_dev);
-	uint8_t *data = sigma_delta->rx_buf;
+	u8 *data = sigma_delta->rx_buf;
 	unsigned int transfer_size;
 	unsigned int sample_size;
 	unsigned int sample_pos;
diff --git a/include/linux/iio/adc/ad_sigma_delta.h b/include/linux/iio/adc/ad_sigma_delta.h
index f242b285081b..5056677c9941 100644
--- a/include/linux/iio/adc/ad_sigma_delta.h
+++ b/include/linux/iio/adc/ad_sigma_delta.h
@@ -94,7 +94,7 @@ struct ad_sigma_delta {
 	bool			bus_locked;
 	bool			keep_cs_asserted;
 
-	uint8_t			comm;
+	u8			comm;
 
 	const struct ad_sigma_delta_info *info;
 	unsigned int		active_slots;
@@ -105,7 +105,7 @@ struct ad_sigma_delta {
 	bool			status_appended;
 	/* map slots to channels in order to know what to expect from devices */
 	unsigned int		*slots;
-	uint8_t			*samples_buf;
+	u8			*samples_buf;
 
 	/*
 	 * DMA (thus cache coherency maintenance) requires the
@@ -114,8 +114,8 @@ struct ad_sigma_delta {
 	 * 'rx_buf' is up to 32 bits per sample + 64 bit timestamp,
 	 * rounded to 16 bytes to take into account padding.
 	 */
-	uint8_t				tx_buf[4] __aligned(IIO_DMA_MINALIGN);
-	uint8_t				rx_buf[16] __aligned(8);
+	u8				tx_buf[4] __aligned(IIO_DMA_MINALIGN);
+	u8				rx_buf[16] __aligned(8);
 };
 
 static inline int ad_sigma_delta_set_channel(struct ad_sigma_delta *sd,
@@ -177,7 +177,7 @@ static inline int ad_sigma_delta_postprocess_sample(struct ad_sigma_delta *sd,
 	return 0;
 }
 
-void ad_sd_set_comm(struct ad_sigma_delta *sigma_delta, uint8_t comm);
+void ad_sd_set_comm(struct ad_sigma_delta *sigma_delta, u8 comm);
 int ad_sd_write_reg(struct ad_sigma_delta *sigma_delta, unsigned int reg,
 	unsigned int size, unsigned int val);
 int ad_sd_read_reg(struct ad_sigma_delta *sigma_delta, unsigned int reg,
-- 
cgit v1.2.3


From db63e45a7da0678652c69f7cbed2cbf2a9922b39 Mon Sep 17 00:00:00 2001
From: David Lechner <dlechner@baylibre.com>
Date: Tue, 1 Jul 2025 16:37:56 -0500
Subject: iio: adc: ad_sigma_delta: use spi_optimize_message()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use spi_optimize_message() to improve the performance of buffered reads.

By setting up the SPI message and pre-optimizing it in the buffer
postenable callback, we can reduce overhead during each sample read.

A rough estimate shows that this reduced the CPU usage of the interrupt
handler thread from 22% to 16% using an EVAL-AD4112ARDZ board on a
DE10-Nano (measuring a single channel at the default 6.2 kHz sample
rate).

Signed-off-by: David Lechner <dlechner@baylibre.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Nuno Sá <nuno.sa@analog.com>
Link: https://patch.msgid.link/20250701-iio-adc-ad7173-add-spi-offload-support-v3-8-42abb83e3dac@baylibre.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/adc/ad_sigma_delta.c       | 72 ++++++++++++++++------------------
 include/linux/iio/adc/ad_sigma_delta.h |  3 ++
 2 files changed, 37 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/drivers/iio/adc/ad_sigma_delta.c b/drivers/iio/adc/ad_sigma_delta.c
index ce549775ac3d..124c42e19f2e 100644
--- a/drivers/iio/adc/ad_sigma_delta.c
+++ b/drivers/iio/adc/ad_sigma_delta.c
@@ -466,8 +466,9 @@ static int ad_sd_buffer_postenable(struct iio_dev *indio_dev)
 {
 	struct ad_sigma_delta *sigma_delta = iio_device_get_drvdata(indio_dev);
 	const struct iio_scan_type *scan_type = &indio_dev->channels[0].scan_type;
+	struct spi_transfer *xfer = sigma_delta->sample_xfer;
 	unsigned int i, slot, samples_buf_size;
-	unsigned int channel;
+	unsigned int channel, scan_size;
 	u8 *samples_buf;
 	int ret;
 
@@ -510,6 +511,28 @@ static int ad_sd_buffer_postenable(struct iio_dev *indio_dev)
 		return -ENOMEM;
 
 	sigma_delta->samples_buf = samples_buf;
+	scan_size = BITS_TO_BYTES(scan_type->realbits + scan_type->shift);
+	/* For 24-bit data, there is an extra byte of padding. */
+	xfer[1].rx_buf = &sigma_delta->rx_buf[scan_size == 3 ? 1 : 0];
+	xfer[1].len = scan_size + (sigma_delta->status_appended ? 1 : 0);
+	xfer[1].cs_change = 1;
+
+	if (sigma_delta->info->has_registers) {
+		xfer[0].tx_buf = &sigma_delta->sample_addr;
+		xfer[0].len = 1;
+
+		ad_sd_set_read_reg_addr(sigma_delta,
+					sigma_delta->info->data_reg ?: AD_SD_REG_DATA,
+					&sigma_delta->sample_addr);
+		spi_message_init_with_transfers(&sigma_delta->sample_msg, xfer, 2);
+	} else {
+		spi_message_init_with_transfers(&sigma_delta->sample_msg,
+						&xfer[1], 1);
+	}
+
+	ret = spi_optimize_message(sigma_delta->spi, &sigma_delta->sample_msg);
+	if (ret)
+		return ret;
 
 	spi_bus_lock(sigma_delta->spi->controller);
 	sigma_delta->bus_locked = true;
@@ -529,6 +552,7 @@ static int ad_sd_buffer_postenable(struct iio_dev *indio_dev)
 
 err_unlock:
 	spi_bus_unlock(sigma_delta->spi->controller);
+	spi_unoptimize_message(&sigma_delta->sample_msg);
 
 	return ret;
 }
@@ -550,7 +574,10 @@ static int ad_sd_buffer_postdisable(struct iio_dev *indio_dev)
 
 	ad_sigma_delta_disable_all(sigma_delta);
 	sigma_delta->bus_locked = false;
-	return spi_bus_unlock(sigma_delta->spi->controller);
+	spi_bus_unlock(sigma_delta->spi->controller);
+	spi_unoptimize_message(&sigma_delta->sample_msg);
+
+	return 0;
 }
 
 static irqreturn_t ad_sd_trigger_handler(int irq, void *p)
@@ -560,50 +587,19 @@ static irqreturn_t ad_sd_trigger_handler(int irq, void *p)
 	const struct iio_scan_type *scan_type = &indio_dev->channels[0].scan_type;
 	struct ad_sigma_delta *sigma_delta = iio_device_get_drvdata(indio_dev);
 	u8 *data = sigma_delta->rx_buf;
-	unsigned int transfer_size;
 	unsigned int sample_size;
 	unsigned int sample_pos;
 	unsigned int status_pos;
 	unsigned int reg_size;
-	unsigned int data_reg;
+	int ret;
 
 	reg_size = BITS_TO_BYTES(scan_type->realbits + scan_type->shift);
+	/* For 24-bit data, there is an extra byte of padding. */
+	status_pos = reg_size + (reg_size == 3 ? 1 : 0);
 
-	if (sigma_delta->info->data_reg != 0)
-		data_reg = sigma_delta->info->data_reg;
-	else
-		data_reg = AD_SD_REG_DATA;
-
-	/* Status word will be appended to the sample during transfer */
-	if (sigma_delta->status_appended)
-		transfer_size = reg_size + 1;
-	else
-		transfer_size = reg_size;
-
-	switch (reg_size) {
-	case 4:
-	case 2:
-	case 1:
-		status_pos = reg_size;
-		ad_sd_read_reg_raw(sigma_delta, data_reg, transfer_size, &data[0]);
-		break;
-	case 3:
-		/*
-		 * Data array after transfer will look like (if status is appended):
-		 * data[] = { [0][sample][sample][sample][status] }
-		 * Keeping the first byte 0 shifts the status position by 1 byte to the right.
-		 */
-		status_pos = reg_size + 1;
-
-		/* We store 24 bit samples in a 32 bit word. Keep the upper
-		 * byte set to zero. */
-		ad_sd_read_reg_raw(sigma_delta, data_reg, transfer_size, &data[1]);
-		break;
-
-	default:
-		dev_err_ratelimited(&indio_dev->dev, "Unsupported reg_size: %u\n", reg_size);
+	ret = spi_sync_locked(sigma_delta->spi, &sigma_delta->sample_msg);
+	if (ret)
 		goto irq_handled;
-	}
 
 	/*
 	 * For devices sampling only one channel at
diff --git a/include/linux/iio/adc/ad_sigma_delta.h b/include/linux/iio/adc/ad_sigma_delta.h
index 5056677c9941..2037bb68b441 100644
--- a/include/linux/iio/adc/ad_sigma_delta.h
+++ b/include/linux/iio/adc/ad_sigma_delta.h
@@ -105,6 +105,8 @@ struct ad_sigma_delta {
 	bool			status_appended;
 	/* map slots to channels in order to know what to expect from devices */
 	unsigned int		*slots;
+	struct spi_message	sample_msg;
+	struct spi_transfer	sample_xfer[2];
 	u8			*samples_buf;
 
 	/*
@@ -116,6 +118,7 @@ struct ad_sigma_delta {
 	 */
 	u8				tx_buf[4] __aligned(IIO_DMA_MINALIGN);
 	u8				rx_buf[16] __aligned(8);
+	u8				sample_addr;
 };
 
 static inline int ad_sigma_delta_set_channel(struct ad_sigma_delta *sd,
-- 
cgit v1.2.3


From 219da3ea842a156e5808176e11db256db9798f6c Mon Sep 17 00:00:00 2001
From: David Lechner <dlechner@baylibre.com>
Date: Tue, 1 Jul 2025 16:37:59 -0500
Subject: iio: adc: ad_sigma_delta: add SPI offload support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add SPI offload support to the ad_sigma_delta module.

When the SPI controller has SPI offload capabilities, the module will
now use that for buffered reads instead of the RDY interrupt trigger.

Drivers that use the ad_sigma_delta module will have to opt into this
by setting supports_spi_offload since each driver will likely need
additional changes before SPI offload can be used. This will allow us
to gradually enable SPI offload support for each driver.

Signed-off-by: David Lechner <dlechner@baylibre.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Nuno Sá <nuno.sa@analog.com>
Link: https://patch.msgid.link/20250701-iio-adc-ad7173-add-spi-offload-support-v3-11-42abb83e3dac@baylibre.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/adc/ad_sigma_delta.c       | 165 +++++++++++++++++++++++----------
 include/linux/iio/adc/ad_sigma_delta.h |  14 +++
 2 files changed, 132 insertions(+), 47 deletions(-)

(limited to 'include')

diff --git a/drivers/iio/adc/ad_sigma_delta.c b/drivers/iio/adc/ad_sigma_delta.c
index 124c42e19f2e..9d2dba0a0ee6 100644
--- a/drivers/iio/adc/ad_sigma_delta.c
+++ b/drivers/iio/adc/ad_sigma_delta.c
@@ -20,6 +20,7 @@
 #include <linux/module.h>
 #include <linux/property.h>
 #include <linux/slab.h>
+#include <linux/spi/offload/consumer.h>
 #include <linux/spi/spi.h>
 #include <linux/spinlock.h>
 #include <linux/string.h>
@@ -27,6 +28,7 @@
 #include <linux/unaligned.h>
 
 #include <linux/iio/adc/ad_sigma_delta.h>
+#include <linux/iio/buffer-dmaengine.h>
 #include <linux/iio/buffer.h>
 #include <linux/iio/iio.h>
 #include <linux/iio/trigger_consumer.h>
@@ -467,8 +469,7 @@ static int ad_sd_buffer_postenable(struct iio_dev *indio_dev)
 	struct ad_sigma_delta *sigma_delta = iio_device_get_drvdata(indio_dev);
 	const struct iio_scan_type *scan_type = &indio_dev->channels[0].scan_type;
 	struct spi_transfer *xfer = sigma_delta->sample_xfer;
-	unsigned int i, slot, samples_buf_size;
-	unsigned int channel, scan_size;
+	unsigned int i, slot, channel;
 	u8 *samples_buf;
 	int ret;
 
@@ -496,25 +497,35 @@ static int ad_sd_buffer_postenable(struct iio_dev *indio_dev)
 	sigma_delta->active_slots = slot;
 	sigma_delta->current_slot = 0;
 
-	if (sigma_delta->active_slots > 1) {
-		ret = ad_sigma_delta_append_status(sigma_delta, true);
-		if (ret)
-			return ret;
-	}
+	if (ad_sigma_delta_has_spi_offload(sigma_delta)) {
+		xfer[1].offload_flags = SPI_OFFLOAD_XFER_RX_STREAM;
+		xfer[1].bits_per_word = scan_type->realbits;
+		xfer[1].len = spi_bpw_to_bytes(scan_type->realbits);
+	} else {
+		unsigned int samples_buf_size, scan_size;
 
-	samples_buf_size = ALIGN(slot * BITS_TO_BYTES(scan_type->storagebits),
-				 sizeof(s64));
-	samples_buf_size += sizeof(s64);
-	samples_buf = devm_krealloc(&sigma_delta->spi->dev, sigma_delta->samples_buf,
-				    samples_buf_size, GFP_KERNEL);
-	if (!samples_buf)
-		return -ENOMEM;
+		if (sigma_delta->active_slots > 1) {
+			ret = ad_sigma_delta_append_status(sigma_delta, true);
+			if (ret)
+				return ret;
+		}
 
-	sigma_delta->samples_buf = samples_buf;
-	scan_size = BITS_TO_BYTES(scan_type->realbits + scan_type->shift);
-	/* For 24-bit data, there is an extra byte of padding. */
-	xfer[1].rx_buf = &sigma_delta->rx_buf[scan_size == 3 ? 1 : 0];
-	xfer[1].len = scan_size + (sigma_delta->status_appended ? 1 : 0);
+		samples_buf_size =
+			ALIGN(slot * BITS_TO_BYTES(scan_type->storagebits),
+			      sizeof(s64));
+		samples_buf_size += sizeof(s64);
+		samples_buf = devm_krealloc(&sigma_delta->spi->dev,
+					    sigma_delta->samples_buf,
+					    samples_buf_size, GFP_KERNEL);
+		if (!samples_buf)
+			return -ENOMEM;
+
+		sigma_delta->samples_buf = samples_buf;
+		scan_size = BITS_TO_BYTES(scan_type->realbits + scan_type->shift);
+		/* For 24-bit data, there is an extra byte of padding. */
+		xfer[1].rx_buf = &sigma_delta->rx_buf[scan_size == 3 ? 1 : 0];
+		xfer[1].len = scan_size + (sigma_delta->status_appended ? 1 : 0);
+	}
 	xfer[1].cs_change = 1;
 
 	if (sigma_delta->info->has_registers) {
@@ -530,6 +541,8 @@ static int ad_sd_buffer_postenable(struct iio_dev *indio_dev)
 						&xfer[1], 1);
 	}
 
+	sigma_delta->sample_msg.offload = sigma_delta->offload;
+
 	ret = spi_optimize_message(sigma_delta->spi, &sigma_delta->sample_msg);
 	if (ret)
 		return ret;
@@ -546,7 +559,19 @@ static int ad_sd_buffer_postenable(struct iio_dev *indio_dev)
 	if (ret)
 		goto err_unlock;
 
-	ad_sd_enable_irq(sigma_delta);
+	if (ad_sigma_delta_has_spi_offload(sigma_delta)) {
+		struct spi_offload_trigger_config config = {
+			.type = SPI_OFFLOAD_TRIGGER_DATA_READY,
+		};
+
+		ret = spi_offload_trigger_enable(sigma_delta->offload,
+						 sigma_delta->offload_trigger,
+						 &config);
+		if (ret)
+			goto err_unlock;
+	} else {
+		ad_sd_enable_irq(sigma_delta);
+	}
 
 	return 0;
 
@@ -561,10 +586,15 @@ static int ad_sd_buffer_postdisable(struct iio_dev *indio_dev)
 {
 	struct ad_sigma_delta *sigma_delta = iio_device_get_drvdata(indio_dev);
 
-	reinit_completion(&sigma_delta->completion);
-	wait_for_completion_timeout(&sigma_delta->completion, HZ);
+	if (ad_sigma_delta_has_spi_offload(sigma_delta)) {
+		spi_offload_trigger_disable(sigma_delta->offload,
+					    sigma_delta->offload_trigger);
+	} else {
+		reinit_completion(&sigma_delta->completion);
+		wait_for_completion_timeout(&sigma_delta->completion, HZ);
 
-	ad_sd_disable_irq(sigma_delta);
+		ad_sd_disable_irq(sigma_delta);
+	}
 
 	sigma_delta->keep_cs_asserted = false;
 	ad_sigma_delta_set_mode(sigma_delta, AD_SD_MODE_IDLE);
@@ -679,7 +709,8 @@ static irqreturn_t ad_sd_data_rdy_trig_poll(int irq, void *private)
 	if ((!sigma_delta->rdy_gpiod || gpiod_get_value(sigma_delta->rdy_gpiod)) &&
 	    ad_sd_disable_irq(sigma_delta)) {
 		complete(&sigma_delta->completion);
-		iio_trigger_poll(sigma_delta->trig);
+		if (sigma_delta->trig)
+			iio_trigger_poll(sigma_delta->trig);
 
 		return IRQ_HANDLED;
 	}
@@ -712,17 +743,6 @@ static int devm_ad_sd_probe_trigger(struct device *dev, struct iio_dev *indio_de
 	unsigned long irq_flags = irq_get_trigger_type(sigma_delta->irq_line);
 	int ret;
 
-	if (dev != &sigma_delta->spi->dev) {
-		dev_err(dev, "Trigger parent should be '%s', got '%s'\n",
-			dev_name(dev), dev_name(&sigma_delta->spi->dev));
-		return -EFAULT;
-	}
-
-	sigma_delta->trig = devm_iio_trigger_alloc(dev, "%s-dev%d", indio_dev->name,
-						   iio_device_id(indio_dev));
-	if (sigma_delta->trig == NULL)
-		return -ENOMEM;
-
 	init_completion(&sigma_delta->completion);
 
 	sigma_delta->irq_dis = true;
@@ -742,14 +762,33 @@ static int devm_ad_sd_probe_trigger(struct device *dev, struct iio_dev *indio_de
 	if (ret)
 		return ret;
 
-	iio_trigger_set_drvdata(sigma_delta->trig, sigma_delta);
+	if (ad_sigma_delta_has_spi_offload(sigma_delta)) {
+		sigma_delta->offload_trigger =
+			devm_spi_offload_trigger_get(dev, sigma_delta->offload,
+						     SPI_OFFLOAD_TRIGGER_DATA_READY);
+		if (IS_ERR(sigma_delta->offload_trigger))
+			return dev_err_probe(dev, PTR_ERR(sigma_delta->offload_trigger),
+					     "Failed to get SPI offload trigger\n");
+	} else {
+		if (dev != &sigma_delta->spi->dev)
+			return dev_err_probe(dev, -EFAULT,
+				"Trigger parent should be '%s', got '%s'\n",
+				dev_name(dev), dev_name(&sigma_delta->spi->dev));
 
-	ret = devm_iio_trigger_register(dev, sigma_delta->trig);
-	if (ret)
-		return ret;
+		sigma_delta->trig = devm_iio_trigger_alloc(dev, "%s-dev%d",
+			indio_dev->name, iio_device_id(indio_dev));
+		if (!sigma_delta->trig)
+			return -ENOMEM;
 
-	/* select default trigger */
-	indio_dev->trig = iio_trigger_get(sigma_delta->trig);
+		iio_trigger_set_drvdata(sigma_delta->trig, sigma_delta);
+
+		ret = devm_iio_trigger_register(dev, sigma_delta->trig);
+		if (ret)
+			return ret;
+
+		/* select default trigger */
+		indio_dev->trig = iio_trigger_get(sigma_delta->trig);
+	}
 
 	return 0;
 }
@@ -769,12 +808,29 @@ int devm_ad_sd_setup_buffer_and_trigger(struct device *dev, struct iio_dev *indi
 	if (!sigma_delta->slots)
 		return -ENOMEM;
 
-	ret = devm_iio_triggered_buffer_setup(dev, indio_dev,
-					      &iio_pollfunc_store_time,
-					      &ad_sd_trigger_handler,
-					      &ad_sd_buffer_setup_ops);
-	if (ret)
-		return ret;
+	if (ad_sigma_delta_has_spi_offload(sigma_delta)) {
+		struct dma_chan *rx_dma;
+
+		rx_dma = devm_spi_offload_rx_stream_request_dma_chan(dev,
+			sigma_delta->offload);
+		if (IS_ERR(rx_dma))
+			return dev_err_probe(dev, PTR_ERR(rx_dma),
+					     "Failed to get RX DMA channel\n");
+
+		ret = devm_iio_dmaengine_buffer_setup_with_handle(dev, indio_dev,
+			rx_dma, IIO_BUFFER_DIRECTION_IN);
+		if (ret)
+			return dev_err_probe(dev, ret, "Cannot setup DMA buffer\n");
+
+		indio_dev->setup_ops = &ad_sd_buffer_setup_ops;
+	} else {
+		ret = devm_iio_triggered_buffer_setup(dev, indio_dev,
+						      &iio_pollfunc_store_time,
+						      &ad_sd_trigger_handler,
+						      &ad_sd_buffer_setup_ops);
+		if (ret)
+			return ret;
+	}
 
 	return devm_ad_sd_probe_trigger(dev, indio_dev);
 }
@@ -837,6 +893,20 @@ int ad_sd_init(struct ad_sigma_delta *sigma_delta, struct iio_dev *indio_dev,
 			return sigma_delta->irq_line;
 	}
 
+	if (info->supports_spi_offload) {
+		struct spi_offload_config offload_config = {
+			.capability_flags = SPI_OFFLOAD_CAP_TRIGGER |
+					    SPI_OFFLOAD_CAP_RX_STREAM_DMA,
+		};
+		int ret;
+
+		sigma_delta->offload = devm_spi_offload_get(&spi->dev, spi,
+							    &offload_config);
+		ret = PTR_ERR_OR_ZERO(sigma_delta->offload);
+		if (ret && ret != -ENODEV)
+			return dev_err_probe(&spi->dev, ret, "Failed to get SPI offload\n");
+	}
+
 	iio_device_set_drvdata(indio_dev, sigma_delta);
 
 	return 0;
@@ -846,3 +916,4 @@ EXPORT_SYMBOL_NS_GPL(ad_sd_init, "IIO_AD_SIGMA_DELTA");
 MODULE_AUTHOR("Lars-Peter Clausen <lars@metafoo.de>");
 MODULE_DESCRIPTION("Analog Devices Sigma-Delta ADCs");
 MODULE_LICENSE("GPL v2");
+MODULE_IMPORT_NS("IIO_DMAENGINE_BUFFER");
diff --git a/include/linux/iio/adc/ad_sigma_delta.h b/include/linux/iio/adc/ad_sigma_delta.h
index 2037bb68b441..6e70a412e218 100644
--- a/include/linux/iio/adc/ad_sigma_delta.h
+++ b/include/linux/iio/adc/ad_sigma_delta.h
@@ -31,6 +31,8 @@ struct ad_sigma_delta;
 struct device;
 struct gpio_desc;
 struct iio_dev;
+struct spi_offload;
+struct spi_offload_trigger;
 
 /**
  * struct ad_sigma_delta_info - Sigma Delta driver specific callbacks and options
@@ -47,6 +49,10 @@ struct iio_dev;
  * @has_registers: true if the device has writable and readable registers, false
  *		if there is just one read-only sample data shift register.
  * @has_named_irqs: Set to true if there is more than one IRQ line.
+ * @supports_spi_offload: Set to true if the driver supports SPI offload. Often
+ *		special considerations are needed for scan_type and other channel
+ *		info, so individual drivers have to set this to let the core
+ *		code know that it can use SPI offload if it is available.
  * @addr_shift: Shift of the register address in the communications register.
  * @read_mask: Mask for the communications register having the read bit set.
  * @status_ch_mask: Mask for the channel number stored in status register.
@@ -65,6 +71,7 @@ struct ad_sigma_delta_info {
 	int (*postprocess_sample)(struct ad_sigma_delta *, unsigned int raw_sample);
 	bool has_registers;
 	bool has_named_irqs;
+	bool supports_spi_offload;
 	unsigned int addr_shift;
 	unsigned int read_mask;
 	unsigned int status_ch_mask;
@@ -108,6 +115,8 @@ struct ad_sigma_delta {
 	struct spi_message	sample_msg;
 	struct spi_transfer	sample_xfer[2];
 	u8			*samples_buf;
+	struct spi_offload	*offload;
+	struct spi_offload_trigger *offload_trigger;
 
 	/*
 	 * DMA (thus cache coherency maintenance) requires the
@@ -121,6 +130,11 @@ struct ad_sigma_delta {
 	u8				sample_addr;
 };
 
+static inline bool ad_sigma_delta_has_spi_offload(struct ad_sigma_delta *sd)
+{
+	return sd->offload != NULL;
+}
+
 static inline int ad_sigma_delta_set_channel(struct ad_sigma_delta *sd,
 	unsigned int channel)
 {
-- 
cgit v1.2.3


From 3df2817d5a949646a17bc87677520b358644ee70 Mon Sep 17 00:00:00 2001
From: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Date: Thu, 3 Jul 2025 16:11:41 +0200
Subject: dt-bindings: iio: adc: mt6359: Add MT6363 PMIC AuxADC
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a compatible and channel bindings for MediaTek's MT6363 PMIC,
featuring an Auxiliary ADC IP with 15 ADC channels used for both
internal temperatures and voltages and for external voltage inputs.

Acked-by: Rob Herring (Arm) <robh@kernel.org>
Reviewed-by: Nícolas F. R. A. Prado <nfraprado@collabora.com>
Signed-off-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Reviewed-by: Nuno Sá <nuno.sa@analog.com>
Link: https://patch.msgid.link/20250703141146.171431-2-angelogioacchino.delregno@collabora.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 .../bindings/iio/adc/mediatek,mt6359-auxadc.yaml   |  1 +
 .../dt-bindings/iio/adc/mediatek,mt6363-auxadc.h   | 24 ++++++++++++++++++++++
 2 files changed, 25 insertions(+)
 create mode 100644 include/dt-bindings/iio/adc/mediatek,mt6363-auxadc.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/iio/adc/mediatek,mt6359-auxadc.yaml b/Documentation/devicetree/bindings/iio/adc/mediatek,mt6359-auxadc.yaml
index 6497c416094d..a94429477e46 100644
--- a/Documentation/devicetree/bindings/iio/adc/mediatek,mt6359-auxadc.yaml
+++ b/Documentation/devicetree/bindings/iio/adc/mediatek,mt6359-auxadc.yaml
@@ -22,6 +22,7 @@ properties:
       - mediatek,mt6357-auxadc
       - mediatek,mt6358-auxadc
       - mediatek,mt6359-auxadc
+      - mediatek,mt6363-auxadc
 
   "#io-channel-cells":
     const: 1
diff --git a/include/dt-bindings/iio/adc/mediatek,mt6363-auxadc.h b/include/dt-bindings/iio/adc/mediatek,mt6363-auxadc.h
new file mode 100644
index 000000000000..92d135477d0e
--- /dev/null
+++ b/include/dt-bindings/iio/adc/mediatek,mt6363-auxadc.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause */
+
+#ifndef _DT_BINDINGS_MEDIATEK_MT6363_AUXADC_H
+#define _DT_BINDINGS_MEDIATEK_MT6363_AUXADC_H
+
+/* ADC Channel Index */
+#define MT6363_AUXADC_BATADC		0
+#define MT6363_AUXADC_VCDT		1
+#define MT6363_AUXADC_BAT_TEMP		2
+#define MT6363_AUXADC_CHIP_TEMP		3
+#define MT6363_AUXADC_VSYSSNS		4
+#define MT6363_AUXADC_VTREF		5
+#define MT6363_AUXADC_VCORE_TEMP	6
+#define MT6363_AUXADC_VPROC_TEMP	7
+#define MT6363_AUXADC_VGPU_TEMP		8
+#define MT6363_AUXADC_VIN1		9
+#define MT6363_AUXADC_VIN2		10
+#define MT6363_AUXADC_VIN3		11
+#define MT6363_AUXADC_VIN4		12
+#define MT6363_AUXADC_VIN5		13
+#define MT6363_AUXADC_VIN6		14
+#define MT6363_AUXADC_VIN7		15
+
+#endif
-- 
cgit v1.2.3


From 00da77d1d226b9d8298262fd61b6281abd89a9d4 Mon Sep 17 00:00:00 2001
From: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Date: Thu, 3 Jul 2025 16:11:42 +0200
Subject: dt-bindings: iio: adc: mt6359: Add MT6373 PMIC AuxADC
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a compatible and channel bindings for MediaTek's MT6373 PMIC,
featuring an Auxiliary ADC IP with 15 ADC channels for external
(SoC) temperatures and external voltage inputs.

Acked-by: Rob Herring (Arm) <robh@kernel.org>
Reviewed-by: Nícolas F. R. A. Prado <nfraprado@collabora.com>
Signed-off-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Reviewed-by: Nuno Sá <nuno.sa@analog.com>
Link: https://patch.msgid.link/20250703141146.171431-3-angelogioacchino.delregno@collabora.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 .../bindings/iio/adc/mediatek,mt6359-auxadc.yaml      |  1 +
 include/dt-bindings/iio/adc/mediatek,mt6373-auxadc.h  | 19 +++++++++++++++++++
 2 files changed, 20 insertions(+)
 create mode 100644 include/dt-bindings/iio/adc/mediatek,mt6373-auxadc.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/iio/adc/mediatek,mt6359-auxadc.yaml b/Documentation/devicetree/bindings/iio/adc/mediatek,mt6359-auxadc.yaml
index a94429477e46..5d4ab701f51a 100644
--- a/Documentation/devicetree/bindings/iio/adc/mediatek,mt6359-auxadc.yaml
+++ b/Documentation/devicetree/bindings/iio/adc/mediatek,mt6359-auxadc.yaml
@@ -23,6 +23,7 @@ properties:
       - mediatek,mt6358-auxadc
       - mediatek,mt6359-auxadc
       - mediatek,mt6363-auxadc
+      - mediatek,mt6373-auxadc
 
   "#io-channel-cells":
     const: 1
diff --git a/include/dt-bindings/iio/adc/mediatek,mt6373-auxadc.h b/include/dt-bindings/iio/adc/mediatek,mt6373-auxadc.h
new file mode 100644
index 000000000000..17cab86d355e
--- /dev/null
+++ b/include/dt-bindings/iio/adc/mediatek,mt6373-auxadc.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause */
+
+#ifndef _DT_BINDINGS_MEDIATEK_MT6373_AUXADC_H
+#define _DT_BINDINGS_MEDIATEK_MT6373_AUXADC_H
+
+/* ADC Channel Index */
+#define MT6373_AUXADC_CHIP_TEMP		0
+#define MT6373_AUXADC_VCORE_TEMP	1
+#define MT6373_AUXADC_VPROC_TEMP	2
+#define MT6373_AUXADC_VGPU_TEMP		3
+#define MT6373_AUXADC_VIN1		4
+#define MT6373_AUXADC_VIN2		5
+#define MT6373_AUXADC_VIN3		6
+#define MT6373_AUXADC_VIN4		7
+#define MT6373_AUXADC_VIN5		8
+#define MT6373_AUXADC_VIN6		9
+#define MT6373_AUXADC_VIN7		10
+
+#endif
-- 
cgit v1.2.3


From 69d5b62c4bded309332add0fac6760239ff47a68 Mon Sep 17 00:00:00 2001
From: Peng Fan <peng.fan@nxp.com>
Date: Thu, 10 Jul 2025 20:40:01 +0800
Subject: ASoC: codec: tlv320aic32x4: Drop aic32x4_pdata usage

There is no machine using aic32x4_pdata as platform_data, so
remove the dead code.

Cc: Markus Niebel <Markus.Niebel@ew.tq-group.com>
Cc: Alexander Stein <alexander.stein@ew.tq-group.com>
Reviewed-by: Alexander Stein <alexander.stein@ew.tq-group.com>
Signed-off-by: Peng Fan <peng.fan@nxp.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://patch.msgid.link/20250710-asoc-gpio-1-v2-1-2233b272a1a6@nxp.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/tlv320aic32x4.h    | 9 ---------
 sound/soc/codecs/tlv320aic32x4.c | 9 +--------
 2 files changed, 1 insertion(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/sound/tlv320aic32x4.h b/include/sound/tlv320aic32x4.h
index 0abf74d7edbd..b779d671a995 100644
--- a/include/sound/tlv320aic32x4.h
+++ b/include/sound/tlv320aic32x4.h
@@ -40,13 +40,4 @@
 struct aic32x4_setup_data {
 	unsigned int gpio_func[5];
 };
-
-struct aic32x4_pdata {
-	struct aic32x4_setup_data *setup;
-	u32 power_cfg;
-	u32 micpga_routing;
-	bool swapdacs;
-	int rstn_gpio;
-};
-
 #endif
diff --git a/sound/soc/codecs/tlv320aic32x4.c b/sound/soc/codecs/tlv320aic32x4.c
index 54ea4bc58c27..7dbcf7f7130b 100644
--- a/sound/soc/codecs/tlv320aic32x4.c
+++ b/sound/soc/codecs/tlv320aic32x4.c
@@ -1346,7 +1346,6 @@ int aic32x4_probe(struct device *dev, struct regmap *regmap,
 		  enum aic32x4_type type)
 {
 	struct aic32x4_priv *aic32x4;
-	struct aic32x4_pdata *pdata = dev->platform_data;
 	struct device_node *np = dev->of_node;
 	int ret;
 
@@ -1363,13 +1362,7 @@ int aic32x4_probe(struct device *dev, struct regmap *regmap,
 
 	dev_set_drvdata(dev, aic32x4);
 
-	if (pdata) {
-		aic32x4->power_cfg = pdata->power_cfg;
-		aic32x4->swapdacs = pdata->swapdacs;
-		aic32x4->micpga_routing = pdata->micpga_routing;
-		aic32x4->rstn_gpio = pdata->rstn_gpio;
-		aic32x4->mclk_name = "mclk";
-	} else if (np) {
+	if (np) {
 		ret = aic32x4_parse_dt(aic32x4, np);
 		if (ret) {
 			dev_err(dev, "Failed to parse DT node\n");
-- 
cgit v1.2.3


From e24d552a17e92714d4f62e112d536babd6428acb Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Fri, 20 Jun 2025 16:33:05 +0100
Subject: mm/madvise: eliminate very confusing manipulation of prev VMA

The madvise code has for the longest time had very confusing code around
the 'prev' VMA pointer passed around various functions which, in all cases
except madvise_update_vma(), is unused and instead simply updated as soon
as the function is invoked.

To compound the confusion, the prev pointer is also used to indicate to
the caller that the mmap lock has been dropped and that we can therefore
not safely access the end of the current VMA (which might have been
updated by madvise_update_vma()).

Clear up this confusion by not setting prev = vma anywhere except in
madvise_walk_vmas(), updating all references to prev which will always be
equal to vma after madvise_vma_behavior() is invoked, and adding a flag to
indicate that the lock has been dropped to make this explicit.

Additionally, drop a redundant BUG_ON() from madvise_collapse(), which is
simply reiterating the BUG_ON(mmap_locked) above it (note that BUG_ON() is
not appropriate here, but we leave existing code as-is).

We finally adjust the madvise_walk_vmas() logic to be a little clearer -
delaying the assignment of the end of the range to the start of the new
range until the last moment and handling the lock being dropped scenario
immediately.

Additionally add some explanatory comments.

[lorenzo.stoakes@oracle.com: fix very subtle bug]
  Link: https://lkml.kernel.org/r/dca94cde-8afb-4eab-8e57-3f508624d670@lucifer.local
Link: https://lkml.kernel.org/r/63d281c5df2e64225ab5b4bda398b45e22818701.1750433500.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: SeongJae Park <sj@kernel.org>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Jann Horn <jannh@google.com>
Cc: Lance Yang <ioworker0@gmail.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mariano Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h |  9 +++---
 mm/khugepaged.c         |  9 ++----
 mm/madvise.c            | 77 +++++++++++++++++++++++++------------------------
 3 files changed, 47 insertions(+), 48 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 8f1b15213f61..4d5bb67dc4ec 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -433,9 +433,8 @@ change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
 
 int hugepage_madvise(struct vm_area_struct *vma, vm_flags_t *vm_flags,
 		     int advice);
-int madvise_collapse(struct vm_area_struct *vma,
-		     struct vm_area_struct **prev,
-		     unsigned long start, unsigned long end);
+int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
+		     unsigned long end, bool *lock_dropped);
 void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start,
 			   unsigned long end, struct vm_area_struct *next);
 spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma);
@@ -596,8 +595,8 @@ static inline int hugepage_madvise(struct vm_area_struct *vma,
 }
 
 static inline int madvise_collapse(struct vm_area_struct *vma,
-				   struct vm_area_struct **prev,
-				   unsigned long start, unsigned long end)
+				   unsigned long start,
+				   unsigned long end, bool *lock_dropped)
 {
 	return -EINVAL;
 }
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 3495a20cef5e..1aa7ca67c756 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -2727,8 +2727,8 @@ static int madvise_collapse_errno(enum scan_result r)
 	}
 }
 
-int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
-		     unsigned long start, unsigned long end)
+int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
+		     unsigned long end, bool *lock_dropped)
 {
 	struct collapse_control *cc;
 	struct mm_struct *mm = vma->vm_mm;
@@ -2739,8 +2739,6 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
 	BUG_ON(vma->vm_start > start);
 	BUG_ON(vma->vm_end < end);
 
-	*prev = vma;
-
 	if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
 		return -EINVAL;
 
@@ -2788,7 +2786,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
 							 &mmap_locked, cc);
 		}
 		if (!mmap_locked)
-			*prev = NULL;  /* Tell caller we dropped mmap_lock */
+			*lock_dropped = true;
 
 handle_result:
 		switch (result) {
@@ -2798,7 +2796,6 @@ handle_result:
 			break;
 		case SCAN_PTE_MAPPED_HUGEPAGE:
 			BUG_ON(mmap_locked);
-			BUG_ON(*prev);
 			mmap_read_lock(mm);
 			result = collapse_pte_mapped_thp(mm, addr, true);
 			mmap_read_unlock(mm);
diff --git a/mm/madvise.c b/mm/madvise.c
index f04b8165e2ab..c467ee42596f 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -75,7 +75,9 @@ struct madvise_behavior {
 	 */
 	struct madvise_behavior_range range;
 	/* The VMA and VMA preceding it (if applicable) currently targeted. */
-	struct vm_area_struct *prev, *vma;
+	struct vm_area_struct *prev;
+	struct vm_area_struct *vma;
+	bool lock_dropped;
 };
 
 #ifdef CONFIG_ANON_VMA_NAME
@@ -188,10 +190,8 @@ static int madvise_update_vma(vm_flags_t new_flags,
 	struct anon_vma_name *anon_name = madv_behavior->anon_name;
 	VMA_ITERATOR(vmi, madv_behavior->mm, range->start);
 
-	if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
-		madv_behavior->prev = vma;
+	if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name))
 		return 0;
-	}
 
 	vma = vma_modify_flags_name(&vmi, madv_behavior->prev, vma,
 			range->start, range->end, new_flags, anon_name);
@@ -199,7 +199,6 @@ static int madvise_update_vma(vm_flags_t new_flags,
 		return PTR_ERR(vma);
 
 	madv_behavior->vma = vma;
-	madv_behavior->prev = vma;
 
 	/* vm_flags is protected by the mmap_lock held in write mode. */
 	vma_start_write(vma);
@@ -301,6 +300,12 @@ static void shmem_swapin_range(struct vm_area_struct *vma,
 }
 #endif		/* CONFIG_SWAP */
 
+static void mark_mmap_lock_dropped(struct madvise_behavior *madv_behavior)
+{
+	VM_WARN_ON_ONCE(madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK);
+	madv_behavior->lock_dropped = true;
+}
+
 /*
  * Schedule all required I/O operations.  Do not wait for completion.
  */
@@ -313,7 +318,6 @@ static long madvise_willneed(struct madvise_behavior *madv_behavior)
 	unsigned long end = madv_behavior->range.end;
 	loff_t offset;
 
-	madv_behavior->prev = vma;
 #ifdef CONFIG_SWAP
 	if (!file) {
 		walk_page_range_vma(vma, start, end, &swapin_walk_ops, vma);
@@ -342,7 +346,7 @@ static long madvise_willneed(struct madvise_behavior *madv_behavior)
 	 * vma's reference to the file) can go away as soon as we drop
 	 * mmap_lock.
 	 */
-	madv_behavior->prev = NULL;	/* tell sys_madvise we drop mmap_lock */
+	mark_mmap_lock_dropped(madv_behavior);
 	get_file(file);
 	offset = (loff_t)(start - vma->vm_start)
 			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
@@ -633,7 +637,6 @@ static long madvise_cold(struct madvise_behavior *madv_behavior)
 	struct vm_area_struct *vma = madv_behavior->vma;
 	struct mmu_gather tlb;
 
-	madv_behavior->prev = vma;
 	if (!can_madv_lru_vma(vma))
 		return -EINVAL;
 
@@ -665,7 +668,6 @@ static long madvise_pageout(struct madvise_behavior *madv_behavior)
 	struct mmu_gather tlb;
 	struct vm_area_struct *vma = madv_behavior->vma;
 
-	madv_behavior->prev = vma;
 	if (!can_madv_lru_vma(vma))
 		return -EINVAL;
 
@@ -954,7 +956,6 @@ static long madvise_dontneed_free(struct madvise_behavior *madv_behavior)
 	struct madvise_behavior_range *range = &madv_behavior->range;
 	int behavior = madv_behavior->behavior;
 
-	madv_behavior->prev = madv_behavior->vma;
 	if (!madvise_dontneed_free_valid_vma(madv_behavior))
 		return -EINVAL;
 
@@ -964,8 +965,7 @@ static long madvise_dontneed_free(struct madvise_behavior *madv_behavior)
 	if (!userfaultfd_remove(madv_behavior->vma, range->start, range->end)) {
 		struct vm_area_struct *vma;
 
-		madv_behavior->prev = NULL; /* mmap_lock has been dropped, prev is stale */
-
+		mark_mmap_lock_dropped(madv_behavior);
 		mmap_read_lock(mm);
 		madv_behavior->vma = vma = vma_lookup(mm, range->start);
 		if (!vma)
@@ -1064,7 +1064,7 @@ static long madvise_remove(struct madvise_behavior *madv_behavior)
 	unsigned long start = madv_behavior->range.start;
 	unsigned long end = madv_behavior->range.end;
 
-	madv_behavior->prev = NULL; /* tell sys_madvise we drop mmap_lock */
+	mark_mmap_lock_dropped(madv_behavior);
 
 	if (vma->vm_flags & VM_LOCKED)
 		return -EINVAL;
@@ -1183,7 +1183,6 @@ static long madvise_guard_install(struct madvise_behavior *madv_behavior)
 	long err;
 	int i;
 
-	madv_behavior->prev = vma;
 	if (!is_valid_guard_vma(vma, /* allow_locked = */false))
 		return -EINVAL;
 
@@ -1293,7 +1292,6 @@ static long madvise_guard_remove(struct madvise_behavior *madv_behavior)
 	struct vm_area_struct *vma = madv_behavior->vma;
 	struct madvise_behavior_range *range = &madv_behavior->range;
 
-	madv_behavior->prev = vma;
 	/*
 	 * We're ok with removing guards in mlock()'d ranges, as this is a
 	 * non-destructive action.
@@ -1336,8 +1334,8 @@ static int madvise_vma_behavior(struct madvise_behavior *madv_behavior)
 	case MADV_DONTNEED_LOCKED:
 		return madvise_dontneed_free(madv_behavior);
 	case MADV_COLLAPSE:
-		return madvise_collapse(vma, &madv_behavior->prev,
-					range->start, range->end);
+		return madvise_collapse(vma, range->start, range->end,
+			&madv_behavior->lock_dropped);
 	case MADV_GUARD_INSTALL:
 		return madvise_guard_install(madv_behavior);
 	case MADV_GUARD_REMOVE:
@@ -1589,7 +1587,6 @@ static bool try_vma_read_lock(struct madvise_behavior *madv_behavior)
 		vma_end_read(vma);
 		goto take_mmap_read_lock;
 	}
-	madv_behavior->prev = vma; /* Not currently required. */
 	madv_behavior->vma = vma;
 	return true;
 
@@ -1617,7 +1614,7 @@ int madvise_walk_vmas(struct madvise_behavior *madv_behavior)
 	unsigned long last_end = range->end;
 	int unmapped_error = 0;
 	int error;
-	struct vm_area_struct *vma;
+	struct vm_area_struct *prev, *vma;
 
 	/*
 	 * If VMA read lock is supported, apply madvise to a single VMA
@@ -1630,24 +1627,23 @@ int madvise_walk_vmas(struct madvise_behavior *madv_behavior)
 		return error;
 	}
 
-	/*
-	 * If the interval [start,end) covers some unmapped address
-	 * ranges, just ignore them, but return -ENOMEM at the end.
-	 * - different from the way of handling in mlock etc.
-	 */
-	vma = find_vma_prev(mm, range->start, &madv_behavior->prev);
+	vma = find_vma_prev(mm, range->start, &prev);
 	if (vma && range->start > vma->vm_start)
-		madv_behavior->prev = vma;
+		prev = vma;
 
 	for (;;) {
-		struct vm_area_struct *prev;
-
 		/* Still start < end. */
 		if (!vma)
 			return -ENOMEM;
 
 		/* Here start < (last_end|vma->vm_end). */
 		if (range->start < vma->vm_start) {
+			/*
+			 * This indicates a gap between VMAs in the input
+			 * range. This does not cause the operation to abort,
+			 * rather we simply return -ENOMEM to indicate that this
+			 * has happened, but carry on.
+			 */
 			unmapped_error = -ENOMEM;
 			range->start = vma->vm_start;
 			if (range->start >= last_end)
@@ -1658,21 +1654,28 @@ int madvise_walk_vmas(struct madvise_behavior *madv_behavior)
 		range->end = min(vma->vm_end, last_end);
 
 		/* Here vma->vm_start <= range->start < range->end <= (last_end|vma->vm_end). */
+		madv_behavior->prev = prev;
 		madv_behavior->vma = vma;
 		error = madvise_vma_behavior(madv_behavior);
 		if (error)
 			return error;
-		prev = madv_behavior->prev;
+		if (madv_behavior->lock_dropped) {
+			/* We dropped the mmap lock, we can't ref the VMA. */
+			prev = NULL;
+			vma = NULL;
+			madv_behavior->lock_dropped = false;
+		} else {
+			vma = madv_behavior->vma;
+			prev = vma;
+		}
 
-		range->start = range->end;
-		if (prev && range->start < prev->vm_end)
-			range->start = prev->vm_end;
-		if (range->start >= last_end)
+		if (vma && range->end < vma->vm_end)
+			range->end = vma->vm_end;
+		if (range->end >= last_end)
 			break;
-		if (prev)
-			vma = find_vma(mm, prev->vm_end);
-		else	/* madvise_remove dropped mmap_lock */
-			vma = find_vma(mm, range->start);
+
+		vma = find_vma(mm, vma ? vma->vm_end : range->end);
+		range->start = range->end;
 	}
 
 	return unmapped_error;
-- 
cgit v1.2.3


From 6b233784b198e0d6dbfd526341b6ec51ffd30020 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Tue, 24 Jun 2025 15:03:46 +0200
Subject: mm, madvise: extract mm code from prctl_set_vma() to mm/madvise.c

Setting anon_name is done via madvise_set_anon_name() and behaves a lot
like other madvise operations.  However, apparently because madvise() has
lacked the 4th argument and prctl() not, the userspace entry point has
been implemented via prctl(PR_SET_VMA, ...) and handled first by
prctl_set_vma().

Currently prctl_set_vma() lives in kernel/sys.c but setting the
vma->anon_name is mm-specific code so extract it to a new
set_anon_vma_name() function under mm.  mm/madvise.c seems to be the most
straightforward place as that's where madvise_set_anon_name() lives.  Stop
declaring the latter in mm.h and instead declare set_anon_vma_name().

Link: https://lkml.kernel.org/r/20250624-anon_name_cleanup-v2-2-600075462a11@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: David Hildenbrand <david@redhat.com>
Tested-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Colin Cross <ccross@google.com>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 14 +++++++-------
 kernel/sys.c       | 50 +-------------------------------------------------
 mm/madvise.c       | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 58 insertions(+), 58 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0e0549f3d681..ef40f68c1183 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4059,14 +4059,14 @@ unsigned long wp_shared_mapping_range(struct address_space *mapping,
 #endif
 
 #ifdef CONFIG_ANON_VMA_NAME
-int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
-			  unsigned long len_in,
-			  struct anon_vma_name *anon_name);
+int set_anon_vma_name(unsigned long addr, unsigned long size,
+		      const char __user *uname);
 #else
-static inline int
-madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
-		      unsigned long len_in, struct anon_vma_name *anon_name) {
-	return 0;
+static inline
+int set_anon_vma_name(unsigned long addr, unsigned long size,
+		      const char __user *uname)
+{
+	return -EINVAL;
 }
 #endif
 
diff --git a/kernel/sys.c b/kernel/sys.c
index adc0de0aa364..b153fb345ada 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2343,54 +2343,14 @@ int __weak arch_lock_shadow_stack_status(struct task_struct *t, unsigned long st
 
 #define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE)
 
-#ifdef CONFIG_ANON_VMA_NAME
-
-#define ANON_VMA_NAME_MAX_LEN		80
-#define ANON_VMA_NAME_INVALID_CHARS	"\\`$[]"
-
-static inline bool is_valid_name_char(char ch)
-{
-	/* printable ascii characters, excluding ANON_VMA_NAME_INVALID_CHARS */
-	return ch > 0x1f && ch < 0x7f &&
-		!strchr(ANON_VMA_NAME_INVALID_CHARS, ch);
-}
-
 static int prctl_set_vma(unsigned long opt, unsigned long addr,
 			 unsigned long size, unsigned long arg)
 {
-	struct mm_struct *mm = current->mm;
-	const char __user *uname;
-	struct anon_vma_name *anon_name = NULL;
 	int error;
 
 	switch (opt) {
 	case PR_SET_VMA_ANON_NAME:
-		uname = (const char __user *)arg;
-		if (uname) {
-			char *name, *pch;
-
-			name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN);
-			if (IS_ERR(name))
-				return PTR_ERR(name);
-
-			for (pch = name; *pch != '\0'; pch++) {
-				if (!is_valid_name_char(*pch)) {
-					kfree(name);
-					return -EINVAL;
-				}
-			}
-			/* anon_vma has its own copy */
-			anon_name = anon_vma_name_alloc(name);
-			kfree(name);
-			if (!anon_name)
-				return -ENOMEM;
-
-		}
-
-		mmap_write_lock(mm);
-		error = madvise_set_anon_name(mm, addr, size, anon_name);
-		mmap_write_unlock(mm);
-		anon_vma_name_put(anon_name);
+		error = set_anon_vma_name(addr, size, (const char __user *)arg);
 		break;
 	default:
 		error = -EINVAL;
@@ -2399,14 +2359,6 @@ static int prctl_set_vma(unsigned long opt, unsigned long addr,
 	return error;
 }
 
-#else /* CONFIG_ANON_VMA_NAME */
-static int prctl_set_vma(unsigned long opt, unsigned long start,
-			 unsigned long size, unsigned long arg)
-{
-	return -EINVAL;
-}
-#endif /* CONFIG_ANON_VMA_NAME */
-
 static inline unsigned long get_current_mdwe(void)
 {
 	unsigned long ret = 0;
diff --git a/mm/madvise.c b/mm/madvise.c
index 0ca405017b37..a2294bc1cc7b 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -134,8 +134,8 @@ static int replace_anon_vma_name(struct vm_area_struct *vma,
 	return 0;
 }
 
-int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
-			  unsigned long len_in, struct anon_vma_name *anon_name)
+static int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
+		unsigned long len_in, struct anon_vma_name *anon_name)
 {
 	unsigned long end;
 	unsigned long len;
@@ -2096,3 +2096,51 @@ free_iov:
 out:
 	return ret;
 }
+
+#ifdef CONFIG_ANON_VMA_NAME
+
+#define ANON_VMA_NAME_MAX_LEN		80
+#define ANON_VMA_NAME_INVALID_CHARS	"\\`$[]"
+
+static inline bool is_valid_name_char(char ch)
+{
+	/* printable ascii characters, excluding ANON_VMA_NAME_INVALID_CHARS */
+	return ch > 0x1f && ch < 0x7f &&
+		!strchr(ANON_VMA_NAME_INVALID_CHARS, ch);
+}
+
+int set_anon_vma_name(unsigned long addr, unsigned long size,
+		      const char __user *uname)
+{
+	struct anon_vma_name *anon_name = NULL;
+	struct mm_struct *mm = current->mm;
+	int error;
+
+	if (uname) {
+		char *name, *pch;
+
+		name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN);
+		if (IS_ERR(name))
+			return PTR_ERR(name);
+
+		for (pch = name; *pch != '\0'; pch++) {
+			if (!is_valid_name_char(*pch)) {
+				kfree(name);
+				return -EINVAL;
+			}
+		}
+		/* anon_vma has its own copy */
+		anon_name = anon_vma_name_alloc(name);
+		kfree(name);
+		if (!anon_name)
+			return -ENOMEM;
+	}
+
+	mmap_write_lock(mm);
+	error = madvise_set_anon_name(mm, addr, size, anon_name);
+	mmap_write_unlock(mm);
+	anon_vma_name_put(anon_name);
+
+	return error;
+}
+#endif
-- 
cgit v1.2.3


From 8d2882a8edb8621d37fd8931e0686070cc6cc189 Mon Sep 17 00:00:00 2001
From: Oscar Salvador <osalvador@suse.de>
Date: Mon, 16 Jun 2025 15:51:45 +0200
Subject: mm,memory_hotplug: remove status_change_nid_normal and update
 documentation

Now that the last user of status_change_nid_normal is gone, we can remove
it.  Update documentation accordingly.

Link: https://lkml.kernel.org/r/20250616135158.450136-3-osalvador@suse.de
Signed-off-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Harry Yoo <harry.yoo@oracle.com>
Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/core-api/memory-hotplug.rst                    |  3 ---
 Documentation/translations/zh_CN/core-api/memory-hotplug.rst |  3 ---
 include/linux/memory.h                                       |  1 -
 mm/memory_hotplug.c                                          | 12 ------------
 4 files changed, 19 deletions(-)

(limited to 'include')

diff --git a/Documentation/core-api/memory-hotplug.rst b/Documentation/core-api/memory-hotplug.rst
index 682259ee633a..d1b8eb9add8a 100644
--- a/Documentation/core-api/memory-hotplug.rst
+++ b/Documentation/core-api/memory-hotplug.rst
@@ -56,14 +56,11 @@ The third argument (arg) passes a pointer of struct memory_notify::
 	struct memory_notify {
 		unsigned long start_pfn;
 		unsigned long nr_pages;
-		int status_change_nid_normal;
 		int status_change_nid;
 	}
 
 - start_pfn is start_pfn of online/offline memory.
 - nr_pages is # of pages of online/offline memory.
-- status_change_nid_normal is set node id when N_NORMAL_MEMORY of nodemask
-  is (will be) set/clear, if this is -1, then nodemask status is not changed.
 - status_change_nid is set node id when N_MEMORY of nodemask is (will be)
   set/clear. It means a new(memoryless) node gets new memory by online and a
   node loses all memory. If this is -1, then nodemask status is not changed.
diff --git a/Documentation/translations/zh_CN/core-api/memory-hotplug.rst b/Documentation/translations/zh_CN/core-api/memory-hotplug.rst
index 9b2841fb9a5f..c2a4122ae221 100644
--- a/Documentation/translations/zh_CN/core-api/memory-hotplug.rst
+++ b/Documentation/translations/zh_CN/core-api/memory-hotplug.rst
@@ -62,7 +62,6 @@ memory_notify结构体的指针::
 	struct memory_notify {
 		unsigned long start_pfn;
 		unsigned long nr_pages;
-		int status_change_nid_normal;
 		int status_change_nid;
 	}
 
@@ -70,8 +69,6 @@ memory_notify结构体的指针::
 
 - nr_pages是在线/离线内存的页数。
 
-- status_change_nid_normal是当nodemask的N_NORMAL_MEMORY被设置/清除时设置节
-  点id，如果是-1，则nodemask状态不改变。
 
 - status_change_nid是当nodemask的N_MEMORY被（将）设置/清除时设置的节点id。这
   意味着一个新的（没上线的）节点通过联机获得新的内存，而一个节点失去了所有的内
diff --git a/include/linux/memory.h b/include/linux/memory.h
index bd4440bc4a57..a8284d41e452 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -109,7 +109,6 @@ struct memory_notify {
 	unsigned long altmap_nr_pages;
 	unsigned long start_pfn;
 	unsigned long nr_pages;
-	int status_change_nid_normal;
 	int status_change_nid;
 };
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index bec20a91e757..d11278c3840d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -706,19 +706,13 @@ static void node_states_check_changes_online(unsigned long nr_pages,
 	int nid = zone_to_nid(zone);
 
 	arg->status_change_nid = NUMA_NO_NODE;
-	arg->status_change_nid_normal = NUMA_NO_NODE;
 
 	if (!node_state(nid, N_MEMORY))
 		arg->status_change_nid = nid;
-	if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY))
-		arg->status_change_nid_normal = nid;
 }
 
 static void node_states_set_node(int node, struct memory_notify *arg)
 {
-	if (arg->status_change_nid_normal >= 0)
-		node_set_state(node, N_NORMAL_MEMORY);
-
 	if (arg->status_change_nid >= 0)
 		node_set_state(node, N_MEMORY);
 }
@@ -1894,7 +1888,6 @@ static void node_states_check_changes_offline(unsigned long nr_pages,
 	enum zone_type zt;
 
 	arg->status_change_nid = NUMA_NO_NODE;
-	arg->status_change_nid_normal = NUMA_NO_NODE;
 
 	/*
 	 * Check whether node_states[N_NORMAL_MEMORY] will be changed.
@@ -1906,8 +1899,6 @@ static void node_states_check_changes_offline(unsigned long nr_pages,
 	 */
 	for (zt = 0; zt <= ZONE_NORMAL; zt++)
 		present_pages += pgdat->node_zones[zt].present_pages;
-	if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages)
-		arg->status_change_nid_normal = zone_to_nid(zone);
 
 	/*
 	 * We have accounted the pages from [0..ZONE_NORMAL); ZONE_HIGHMEM
@@ -1926,9 +1917,6 @@ static void node_states_check_changes_offline(unsigned long nr_pages,
 
 static void node_states_clear_node(int node, struct memory_notify *arg)
 {
-	if (arg->status_change_nid_normal >= 0)
-		node_clear_state(node, N_NORMAL_MEMORY);
-
 	if (arg->status_change_nid >= 0)
 		node_clear_state(node, N_MEMORY);
 }
-- 
cgit v1.2.3


From 67929de108479dbb78496b61af5c24072fc16d8d Mon Sep 17 00:00:00 2001
From: Oscar Salvador <osalvador@suse.de>
Date: Mon, 16 Jun 2025 15:51:46 +0200
Subject: mm,memory_hotplug: implement numa node notifier

There are at least six consumers of hotplug_memory_notifier that are really
only interested in whether any numa node changed its state, e.g. going from
having memory to not having memory and vice versa.

Implement a specific notifier for numa nodes when their state gets
changed, which will later be used by those consumers that are only
interested in numa node state changes.

Add documentation as well.

[dan.carpenter@linaro.org: set failure reason in offline_pages()]
  Link: https://lkml.kernel.org/r/be4fd31b-7d09-46b0-8329-6d0464ffa7a5@sabinyo.mountain
Link: https://lkml.kernel.org/r/20250616135158.450136-4-osalvador@suse.de
Signed-off-by: Oscar Salvador <osalvador@suse.de>
Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/core-api/memory-hotplug.rst |  83 +++++++++++++++++
 drivers/base/node.c                       |  21 +++++
 include/linux/node.h                      |  40 ++++++++
 mm/memory_hotplug.c                       | 146 ++++++++++++++----------------
 4 files changed, 210 insertions(+), 80 deletions(-)

(limited to 'include')

diff --git a/Documentation/core-api/memory-hotplug.rst b/Documentation/core-api/memory-hotplug.rst
index d1b8eb9add8a..fb84e78968b2 100644
--- a/Documentation/core-api/memory-hotplug.rst
+++ b/Documentation/core-api/memory-hotplug.rst
@@ -9,6 +9,9 @@ Memory hotplug event notifier
 
 Hotplugging events are sent to a notification queue.
 
+Memory notifier
+----------------
+
 There are six types of notification defined in ``include/linux/memory.h``:
 
 MEM_GOING_ONLINE
@@ -68,6 +71,14 @@ The third argument (arg) passes a pointer of struct memory_notify::
   If status_changed_nid* >= 0, callback should create/discard structures for the
   node if necessary.
 
+It is possible to get notified for MEM_CANCEL_ONLINE without having been notified
+for MEM_GOING_ONLINE, and the same applies to MEM_CANCEL_OFFLINE and
+MEM_GOING_OFFLINE.
+This can happen when a consumer fails, meaning we break the callchain and we
+stop calling the remaining consumers of the notifier.
+It is then important that users of memory_notify make no assumptions and get
+prepared to handle such cases.
+
 The callback routine shall return one of the values
 NOTIFY_DONE, NOTIFY_OK, NOTIFY_BAD, NOTIFY_STOP
 defined in ``include/linux/notifier.h``
@@ -80,6 +91,78 @@ further processing of the notification queue.
 
 NOTIFY_STOP stops further processing of the notification queue.
 
+Numa node notifier
+------------------
+
+There are six types of notification defined in ``include/linux/node.h``:
+
+NODE_ADDING_FIRST_MEMORY
+ Generated before memory becomes available to this node for the first time.
+
+NODE_CANCEL_ADDING_FIRST_MEMORY
+ Generated if NODE_ADDING_FIRST_MEMORY fails.
+
+NODE_ADDED_FIRST_MEMORY
+ Generated when memory has become available for this node for the first time.
+
+NODE_REMOVING_LAST_MEMORY
+ Generated when the last memory available to this node is about to be offlined.
+
+NODE_CANCEL_REMOVING_LAST_MEMORY
+ Generated when NODE_REMOVING_LAST_MEMORY fails.
+
+NODE_REMOVED_LAST_MEMORY
+ Generated when the last memory available to this node has been offlined.
+
+A callback routine can be registered by calling::
+
+  hotplug_node_notifier(callback_func, priority)
+
+Callback functions with higher values of priority are called before callback
+functions with lower values.
+
+A callback function must have the following prototype::
+
+  int callback_func(
+
+    struct notifier_block *self, unsigned long action, void *arg);
+
+The first argument of the callback function (self) is a pointer to the block
+of the notifier chain that points to the callback function itself.
+The second argument (action) is one of the event types described above.
+The third argument (arg) passes a pointer of struct node_notify::
+
+        struct node_notify {
+                int nid;
+        }
+
+- nid is the node we are adding memory to or removing memory from.
+
+It is possible to get notified for NODE_CANCEL_ADDING_FIRST_MEMORY without
+having been notified for NODE_ADDING_FIRST_MEMORY, and the same applies to
+NODE_CANCEL_REMOVING_LAST_MEMORY and NODE_REMOVING_LAST_MEMORY.
+This can happen when a consumer fails, meaning we break the callchain and we
+stop calling the remaining consumers of the notifier.
+It is then important that users of node_notify make no assumptions and get
+prepared to handle such cases.
+
+The callback routine shall return one of the values
+NOTIFY_DONE, NOTIFY_OK, NOTIFY_BAD, NOTIFY_STOP
+defined in ``include/linux/notifier.h``
+
+NOTIFY_DONE and NOTIFY_OK have no effect on the further processing.
+
+NOTIFY_BAD is used as response to the NODE_ADDING_FIRST_MEMORY,
+NODE_REMOVING_LAST_MEMORY, NODE_ADDED_FIRST_MEMORY or
+NODE_REMOVED_LAST_MEMORY action to cancel hotplugging.
+It stops further processing of the notification queue.
+
+NOTIFY_STOP stops further processing of the notification queue.
+
+Please note that we should not fail for NODE_ADDED_FIRST_MEMORY /
+NODE_REMOVED_LAST_MEMORY, as memory_hotplug code cannot rollback at that
+point anymore.
+
 Locking Internals
 =================
 
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 6cbeca45c451..6d66382dae65 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -112,6 +112,27 @@ static const struct attribute_group *node_access_node_groups[] = {
 	NULL,
 };
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+static BLOCKING_NOTIFIER_HEAD(node_chain);
+
+int register_node_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_register(&node_chain, nb);
+}
+EXPORT_SYMBOL(register_node_notifier);
+
+void unregister_node_notifier(struct notifier_block *nb)
+{
+	blocking_notifier_chain_unregister(&node_chain, nb);
+}
+EXPORT_SYMBOL(unregister_node_notifier);
+
+int node_notify(unsigned long val, void *v)
+{
+	return blocking_notifier_call_chain(&node_chain, val, v);
+}
+#endif
+
 static void node_remove_accesses(struct node *node)
 {
 	struct node_access_nodes *c, *cnext;
diff --git a/include/linux/node.h b/include/linux/node.h
index 88bceebcbfa5..2c7529335b21 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -125,6 +125,46 @@ static inline void register_memory_blocks_under_nodes(void)
 #endif
 
 extern void unregister_node(struct node *node);
+
+struct node_notify {
+	int nid;
+};
+
+#define NODE_ADDING_FIRST_MEMORY                (1<<0)
+#define NODE_ADDED_FIRST_MEMORY                 (1<<1)
+#define NODE_CANCEL_ADDING_FIRST_MEMORY         (1<<2)
+#define NODE_REMOVING_LAST_MEMORY               (1<<3)
+#define NODE_REMOVED_LAST_MEMORY                (1<<4)
+#define NODE_CANCEL_REMOVING_LAST_MEMORY        (1<<5)
+
+#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_NUMA)
+extern int register_node_notifier(struct notifier_block *nb);
+extern void unregister_node_notifier(struct notifier_block *nb);
+extern int node_notify(unsigned long val, void *v);
+
+#define hotplug_node_notifier(fn, pri) ({		\
+	static __meminitdata struct notifier_block fn##_node_nb =\
+		{ .notifier_call = fn, .priority = pri };\
+	register_node_notifier(&fn##_node_nb);			\
+})
+#else
+static inline int register_node_notifier(struct notifier_block *nb)
+{
+	return 0;
+}
+static inline void unregister_node_notifier(struct notifier_block *nb)
+{
+}
+static inline int node_notify(unsigned long val, void *v)
+{
+	return 0;
+}
+static inline int hotplug_node_notifier(notifier_fn_t fn, int pri)
+{
+	return 0;
+}
+#endif
+
 #ifdef CONFIG_NUMA
 extern void node_dev_init(void);
 /* Core of the node registration - only memory hotplug should use this */
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index d11278c3840d..0d54a01985ba 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -35,6 +35,7 @@
 #include <linux/compaction.h>
 #include <linux/rmap.h>
 #include <linux/module.h>
+#include <linux/node.h>
 
 #include <asm/tlbflush.h>
 
@@ -699,24 +700,6 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
 	online_mem_sections(start_pfn, end_pfn);
 }
 
-/* check which state of node_states will be changed when online memory */
-static void node_states_check_changes_online(unsigned long nr_pages,
-	struct zone *zone, struct memory_notify *arg)
-{
-	int nid = zone_to_nid(zone);
-
-	arg->status_change_nid = NUMA_NO_NODE;
-
-	if (!node_state(nid, N_MEMORY))
-		arg->status_change_nid = nid;
-}
-
-static void node_states_set_node(int node, struct memory_notify *arg)
-{
-	if (arg->status_change_nid >= 0)
-		node_set_state(node, N_MEMORY);
-}
-
 static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
 		unsigned long nr_pages)
 {
@@ -1167,11 +1150,18 @@ void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages)
 int online_pages(unsigned long pfn, unsigned long nr_pages,
 		       struct zone *zone, struct memory_group *group)
 {
-	unsigned long flags;
-	int need_zonelists_rebuild = 0;
+	struct memory_notify mem_arg = {
+		.start_pfn = pfn,
+		.nr_pages = nr_pages,
+		.status_change_nid = NUMA_NO_NODE,
+	};
+	struct node_notify node_arg = {
+		.nid = NUMA_NO_NODE,
+	};
 	const int nid = zone_to_nid(zone);
+	int need_zonelists_rebuild = 0;
+	unsigned long flags;
 	int ret;
-	struct memory_notify arg;
 
 	/*
 	 * {on,off}lining is constrained to full memory sections (or more
@@ -1188,11 +1178,17 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
 	/* associate pfn range with the zone */
 	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);
 
-	arg.start_pfn = pfn;
-	arg.nr_pages = nr_pages;
-	node_states_check_changes_online(nr_pages, zone, &arg);
+	if (!node_state(nid, N_MEMORY)) {
+		/* Adding memory to the node for the first time */
+		node_arg.nid = nid;
+		mem_arg.status_change_nid = nid;
+		ret = node_notify(NODE_ADDING_FIRST_MEMORY, &node_arg);
+		ret = notifier_to_errno(ret);
+		if (ret)
+			goto failed_addition;
+	}
 
-	ret = memory_notify(MEM_GOING_ONLINE, &arg);
+	ret = memory_notify(MEM_GOING_ONLINE, &mem_arg);
 	ret = notifier_to_errno(ret);
 	if (ret)
 		goto failed_addition;
@@ -1218,7 +1214,8 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
 	online_pages_range(pfn, nr_pages);
 	adjust_present_page_count(pfn_to_page(pfn), group, nr_pages);
 
-	node_states_set_node(nid, &arg);
+	if (node_arg.nid >= 0)
+		node_set_state(nid, N_MEMORY);
 	if (need_zonelists_rebuild)
 		build_all_zonelists(NULL);
 
@@ -1239,16 +1236,22 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
 	kswapd_run(nid);
 	kcompactd_run(nid);
 
+	if (node_arg.nid >= 0)
+		/* First memory added successfully. Notify consumers. */
+		node_notify(NODE_ADDED_FIRST_MEMORY, &node_arg);
+
 	writeback_set_ratelimit();
 
-	memory_notify(MEM_ONLINE, &arg);
+	memory_notify(MEM_ONLINE, &mem_arg);
 	return 0;
 
 failed_addition:
 	pr_debug("online_pages [mem %#010llx-%#010llx] failed\n",
 		 (unsigned long long) pfn << PAGE_SHIFT,
 		 (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
-	memory_notify(MEM_CANCEL_ONLINE, &arg);
+	memory_notify(MEM_CANCEL_ONLINE, &mem_arg);
+	if (node_arg.nid != NUMA_NO_NODE)
+		node_notify(NODE_CANCEL_ADDING_FIRST_MEMORY, &node_arg);
 	remove_pfn_range_from_zone(zone, pfn, nr_pages);
 	return ret;
 }
@@ -1879,48 +1882,6 @@ static int __init cmdline_parse_movable_node(char *p)
 }
 early_param("movable_node", cmdline_parse_movable_node);
 
-/* check which state of node_states will be changed when offline memory */
-static void node_states_check_changes_offline(unsigned long nr_pages,
-		struct zone *zone, struct memory_notify *arg)
-{
-	struct pglist_data *pgdat = zone->zone_pgdat;
-	unsigned long present_pages = 0;
-	enum zone_type zt;
-
-	arg->status_change_nid = NUMA_NO_NODE;
-
-	/*
-	 * Check whether node_states[N_NORMAL_MEMORY] will be changed.
-	 * If the memory to be offline is within the range
-	 * [0..ZONE_NORMAL], and it is the last present memory there,
-	 * the zones in that range will become empty after the offlining,
-	 * thus we can determine that we need to clear the node from
-	 * node_states[N_NORMAL_MEMORY].
-	 */
-	for (zt = 0; zt <= ZONE_NORMAL; zt++)
-		present_pages += pgdat->node_zones[zt].present_pages;
-
-	/*
-	 * We have accounted the pages from [0..ZONE_NORMAL); ZONE_HIGHMEM
-	 * does not apply as we don't support 32bit.
-	 * Here we count the possible pages from ZONE_MOVABLE.
-	 * If after having accounted all the pages, we see that the nr_pages
-	 * to be offlined is over or equal to the accounted pages,
-	 * we know that the node will become empty, and so, we can clear
-	 * it for N_MEMORY as well.
-	 */
-	present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages;
-
-	if (nr_pages >= present_pages)
-		arg->status_change_nid = zone_to_nid(zone);
-}
-
-static void node_states_clear_node(int node, struct memory_notify *arg)
-{
-	if (arg->status_change_nid >= 0)
-		node_clear_state(node, N_MEMORY);
-}
-
 static int count_system_ram_pages_cb(unsigned long start_pfn,
 				     unsigned long nr_pages, void *data)
 {
@@ -1936,11 +1897,19 @@ static int count_system_ram_pages_cb(unsigned long start_pfn,
 int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 			struct zone *zone, struct memory_group *group)
 {
-	const unsigned long end_pfn = start_pfn + nr_pages;
 	unsigned long pfn, managed_pages, system_ram_pages = 0;
+	const unsigned long end_pfn = start_pfn + nr_pages;
+	struct pglist_data *pgdat = zone->zone_pgdat;
 	const int node = zone_to_nid(zone);
+	struct memory_notify mem_arg = {
+		.start_pfn = start_pfn,
+		.nr_pages = nr_pages,
+		.status_change_nid = NUMA_NO_NODE,
+	};
+	struct node_notify node_arg = {
+		.nid = NUMA_NO_NODE,
+	};
 	unsigned long flags;
-	struct memory_notify arg;
 	char *reason;
 	int ret;
 
@@ -1999,11 +1968,23 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 		goto failed_removal_pcplists_disabled;
 	}
 
-	arg.start_pfn = start_pfn;
-	arg.nr_pages = nr_pages;
-	node_states_check_changes_offline(nr_pages, zone, &arg);
+	/*
+	 * Check whether the node will have no present pages after we offline
+	 * 'nr_pages' more. If so, we know that the node will become empty, and
+	 * so we will clear N_MEMORY for it.
+	 */
+	if (nr_pages >= pgdat->node_present_pages) {
+		node_arg.nid = node;
+		mem_arg.status_change_nid = node;
+		ret = node_notify(NODE_REMOVING_LAST_MEMORY, &node_arg);
+		ret = notifier_to_errno(ret);
+		if (ret) {
+			reason = "node notifier failure";
+			goto failed_removal_isolated;
+		}
+	}
 
-	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
+	ret = memory_notify(MEM_GOING_OFFLINE, &mem_arg);
 	ret = notifier_to_errno(ret);
 	if (ret) {
 		reason = "notifier failure";
@@ -2083,27 +2064,32 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 	 * Make sure to mark the node as memory-less before rebuilding the zone
 	 * list. Otherwise this node would still appear in the fallback lists.
 	 */
-	node_states_clear_node(node, &arg);
+	if (node_arg.nid >= 0)
+		node_clear_state(node, N_MEMORY);
 	if (!populated_zone(zone)) {
 		zone_pcp_reset(zone);
 		build_all_zonelists(NULL);
 	}
 
-	if (arg.status_change_nid >= 0) {
+	if (node_arg.nid >= 0) {
 		kcompactd_stop(node);
 		kswapd_stop(node);
+		/* Node went memoryless. Notify consumers */
+		node_notify(NODE_REMOVED_LAST_MEMORY, &node_arg);
 	}
 
 	writeback_set_ratelimit();
 
-	memory_notify(MEM_OFFLINE, &arg);
+	memory_notify(MEM_OFFLINE, &mem_arg);
 	remove_pfn_range_from_zone(zone, start_pfn, nr_pages);
 	return 0;
 
 failed_removal_isolated:
 	/* pushback to free area */
 	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
-	memory_notify(MEM_CANCEL_OFFLINE, &arg);
+	memory_notify(MEM_CANCEL_OFFLINE, &mem_arg);
+	if (node_arg.nid != NUMA_NO_NODE)
+		node_notify(NODE_CANCEL_REMOVING_LAST_MEMORY, &node_arg);
 failed_removal_pcplists_disabled:
 	lru_cache_enable();
 	zone_pcp_enable(zone);
-- 
cgit v1.2.3


From d2a9721d807de405b198291badcc807700746781 Mon Sep 17 00:00:00 2001
From: Oscar Salvador <osalvador@suse.de>
Date: Mon, 16 Jun 2025 15:51:54 +0200
Subject: mm,memory_hotplug: drop status_change_nid parameter from
 memory_notify

There are no users left of status_change_nid, so drop it from the
memory_notify struct.

Link: https://lkml.kernel.org/r/20250616135158.450136-12-osalvador@suse.de
Signed-off-by: Oscar Salvador <osalvador@suse.de>
Suggested-by: David Hildenbrand <david@redhat.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Harry Yoo <harry.yoo@oracle.com>
Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/core-api/memory-hotplug.rst | 7 -------
 include/linux/memory.h                    | 1 -
 mm/memory_hotplug.c                       | 4 ----
 3 files changed, 12 deletions(-)

(limited to 'include')

diff --git a/Documentation/core-api/memory-hotplug.rst b/Documentation/core-api/memory-hotplug.rst
index fb84e78968b2..8fc97c2379de 100644
--- a/Documentation/core-api/memory-hotplug.rst
+++ b/Documentation/core-api/memory-hotplug.rst
@@ -59,17 +59,10 @@ The third argument (arg) passes a pointer of struct memory_notify::
 	struct memory_notify {
 		unsigned long start_pfn;
 		unsigned long nr_pages;
-		int status_change_nid;
 	}
 
 - start_pfn is start_pfn of online/offline memory.
 - nr_pages is # of pages of online/offline memory.
-- status_change_nid is set node id when N_MEMORY of nodemask is (will be)
-  set/clear. It means a new(memoryless) node gets new memory by online and a
-  node loses all memory. If this is -1, then nodemask status is not changed.
-
-  If status_changed_nid* >= 0, callback should create/discard structures for the
-  node if necessary.
 
 It is possible to get notified for MEM_CANCEL_ONLINE without having been notified
 for MEM_GOING_ONLINE, and the same applies to MEM_CANCEL_OFFLINE and
diff --git a/include/linux/memory.h b/include/linux/memory.h
index a8284d41e452..40eb70ccb09d 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -109,7 +109,6 @@ struct memory_notify {
 	unsigned long altmap_nr_pages;
 	unsigned long start_pfn;
 	unsigned long nr_pages;
-	int status_change_nid;
 };
 
 struct notifier_block;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0d54a01985ba..a371d59cc718 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1153,7 +1153,6 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
 	struct memory_notify mem_arg = {
 		.start_pfn = pfn,
 		.nr_pages = nr_pages,
-		.status_change_nid = NUMA_NO_NODE,
 	};
 	struct node_notify node_arg = {
 		.nid = NUMA_NO_NODE,
@@ -1181,7 +1180,6 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
 	if (!node_state(nid, N_MEMORY)) {
 		/* Adding memory to the node for the first time */
 		node_arg.nid = nid;
-		mem_arg.status_change_nid = nid;
 		ret = node_notify(NODE_ADDING_FIRST_MEMORY, &node_arg);
 		ret = notifier_to_errno(ret);
 		if (ret)
@@ -1904,7 +1902,6 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 	struct memory_notify mem_arg = {
 		.start_pfn = start_pfn,
 		.nr_pages = nr_pages,
-		.status_change_nid = NUMA_NO_NODE,
 	};
 	struct node_notify node_arg = {
 		.nid = NUMA_NO_NODE,
@@ -1975,7 +1972,6 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 	 */
 	if (nr_pages >= pgdat->node_present_pages) {
 		node_arg.nid = node;
-		mem_arg.status_change_nid = node;
 		ret = node_notify(NODE_REMOVING_LAST_MEMORY, &node_arg);
 		ret = notifier_to_errno(ret);
 		if (ret) {
-- 
cgit v1.2.3


From 42f46ed99ac6c07adf7f3bcbe9040b0c52d62d0f Mon Sep 17 00:00:00 2001
From: Zi Yan <ziy@nvidia.com>
Date: Mon, 16 Jun 2025 22:11:09 -0400
Subject: mm/page_alloc: pageblock flags functions clean up

Patch series "Make MIGRATE_ISOLATE a standalone bit", v10.

This patchset moves MIGRATE_ISOLATE to a standalone bit so that a pageblock's
original migratetype is not overwritten during pageblock isolation.  Currently,
MIGRATE_ISOLATE is part of enum migratetype (in include/linux/mmzone.h),
thus, setting a pageblock to MIGRATE_ISOLATE overwrites its original
migratetype.  This causes pageblock migratetype loss during
alloc_contig_range() and memory offline, especially when the process fails
due to a failed pageblock isolation and the code tries to undo the
finished pageblock isolations.

In terms of performance for changing pageblock types, no performance
change is observed:

1. I used perf to collect stats of offlining and onlining all memory
   of a 40GB VM 10 times and see that get_pfnblock_flags_mask() and
   set_pfnblock_flags_mask() take about 0.12% and 0.02% of the whole
   process respectively with and without this patchset across 3 runs.

2. I used perf to collect stats of dd from /dev/random to a 40GB tmpfs
   file and find get_pfnblock_flags_mask() takes about 0.05% of the
   process with and without this patchset across 3 runs.


This patch (of 6):

No functional change is intended.

1. Add __NR_PAGEBLOCK_BITS for the number of pageblock flag bits and use
   roundup_pow_of_two(__NR_PAGEBLOCK_BITS) as NR_PAGEBLOCK_BITS to take
   the right amount of bits for pageblock flags.
2. Rename PB_migrate_skip to PB_compact_skip.
3. Add {get,set,clear}_pfnblock_bit() to operate on a standalone bit,
   like PB_compact_skip.
4. Make {get,set}_pfnblock_flags_mask() internal functions and use
   {get,set}_pfnblock_migratetype() for pageblock migratetype operations.
5. Move pageblock flags common code to get_pfnblock_bitmap_bitidx().
6. Use MIGRATETYPE_MASK to get the migratetype of a pageblock from its
   flags.
7. Use PB_migrate_end in the definition of MIGRATETYPE_MASK instead of
   PB_migratetype_bits.
8. Add a comment on is_migrate_cma_folio() to prevent one from changing it
   to use get_pageblock_migratetype() and causing issues.

Link: https://lkml.kernel.org/r/20250617021115.2331563-1-ziy@nvidia.com
Link: https://lkml.kernel.org/r/20250617021115.2331563-2-ziy@nvidia.com
Signed-off-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Richard Chang <richardycc@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/mm/physical_memory.rst |   2 +-
 include/linux/mmzone.h               |  18 ++--
 include/linux/page-isolation.h       |   2 +-
 include/linux/pageblock-flags.h      |  34 +++----
 mm/memory_hotplug.c                  |   2 +-
 mm/page_alloc.c                      | 171 +++++++++++++++++++++++++++--------
 6 files changed, 162 insertions(+), 67 deletions(-)

(limited to 'include')

diff --git a/Documentation/mm/physical_memory.rst b/Documentation/mm/physical_memory.rst
index d3ac106e6b14..9af11b5bd145 100644
--- a/Documentation/mm/physical_memory.rst
+++ b/Documentation/mm/physical_memory.rst
@@ -584,7 +584,7 @@ Compaction control
 
 ``compact_blockskip_flush``
   Set to true when compaction migration scanner and free scanner meet, which
-  means the ``PB_migrate_skip`` bits should be cleared.
+  means the ``PB_compact_skip`` bits should be cleared.
 
 ``contiguous``
   Set to true when the zone is contiguous (in other words, no hole).
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 5bec8b1d0e66..76d66c07b673 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -92,8 +92,12 @@ extern const char * const migratetype_names[MIGRATE_TYPES];
 #ifdef CONFIG_CMA
 #  define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA)
 #  define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA)
-#  define is_migrate_cma_folio(folio, pfn)	(MIGRATE_CMA ==		\
-	get_pfnblock_flags_mask(&folio->page, pfn, MIGRATETYPE_MASK))
+/*
+ * __dump_folio() in mm/debug.c passes a folio pointer to on-stack struct folio,
+ * so folio_pfn() cannot be used and pfn is needed.
+ */
+#  define is_migrate_cma_folio(folio, pfn) \
+	(get_pfnblock_migratetype(&folio->page, pfn) == MIGRATE_CMA)
 #else
 #  define is_migrate_cma(migratetype) false
 #  define is_migrate_cma_page(_page) false
@@ -122,14 +126,12 @@ static inline bool migratetype_is_mergeable(int mt)
 
 extern int page_group_by_mobility_disabled;
 
-#define MIGRATETYPE_MASK ((1UL << PB_migratetype_bits) - 1)
+#define get_pageblock_migratetype(page) \
+	get_pfnblock_migratetype(page, page_to_pfn(page))
 
-#define get_pageblock_migratetype(page)					\
-	get_pfnblock_flags_mask(page, page_to_pfn(page), MIGRATETYPE_MASK)
+#define folio_migratetype(folio) \
+	get_pageblock_migratetype(&folio->page)
 
-#define folio_migratetype(folio)				\
-	get_pfnblock_flags_mask(&folio->page, folio_pfn(folio),		\
-			MIGRATETYPE_MASK)
 struct free_area {
 	struct list_head	free_list[MIGRATE_TYPES];
 	unsigned long		nr_free;
diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index 898bb788243b..277d8d92980c 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -25,7 +25,7 @@ static inline bool is_migrate_isolate(int migratetype)
 #define MEMORY_OFFLINE	0x1
 #define REPORT_FAILURE	0x2
 
-void set_pageblock_migratetype(struct page *page, int migratetype);
+void set_pageblock_migratetype(struct page *page, enum migratetype migratetype);
 
 bool move_freepages_block_isolate(struct zone *zone, struct page *page,
 				  int migratetype);
diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index 6297c6343c55..c240c7a1fb03 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -19,15 +19,19 @@ enum pageblock_bits {
 	PB_migrate,
 	PB_migrate_end = PB_migrate + PB_migratetype_bits - 1,
 			/* 3 bits required for migrate types */
-	PB_migrate_skip,/* If set the block is skipped by compaction */
+	PB_compact_skip,/* If set the block is skipped by compaction */
 
 	/*
 	 * Assume the bits will always align on a word. If this assumption
 	 * changes then get/set pageblock needs updating.
 	 */
-	NR_PAGEBLOCK_BITS
+	__NR_PAGEBLOCK_BITS
 };
 
+#define NR_PAGEBLOCK_BITS (roundup_pow_of_two(__NR_PAGEBLOCK_BITS))
+
+#define MIGRATETYPE_MASK ((1UL << (PB_migrate_end + 1)) - 1)
+
 #if defined(CONFIG_HUGETLB_PAGE)
 
 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
@@ -65,27 +69,23 @@ extern unsigned int pageblock_order;
 /* Forward declaration */
 struct page;
 
-unsigned long get_pfnblock_flags_mask(const struct page *page,
-				unsigned long pfn,
-				unsigned long mask);
-
-void set_pfnblock_flags_mask(struct page *page,
-				unsigned long flags,
-				unsigned long pfn,
-				unsigned long mask);
+enum migratetype get_pfnblock_migratetype(const struct page *page,
+					  unsigned long pfn);
+bool get_pfnblock_bit(const struct page *page, unsigned long pfn,
+		      enum pageblock_bits pb_bit);
+void set_pfnblock_bit(const struct page *page, unsigned long pfn,
+		      enum pageblock_bits pb_bit);
+void clear_pfnblock_bit(const struct page *page, unsigned long pfn,
+			enum pageblock_bits pb_bit);
 
 /* Declarations for getting and setting flags. See mm/page_alloc.c */
 #ifdef CONFIG_COMPACTION
 #define get_pageblock_skip(page) \
-	get_pfnblock_flags_mask(page, page_to_pfn(page),	\
-			(1 << (PB_migrate_skip)))
+	get_pfnblock_bit(page, page_to_pfn(page), PB_compact_skip)
 #define clear_pageblock_skip(page) \
-	set_pfnblock_flags_mask(page, 0, page_to_pfn(page),	\
-			(1 << PB_migrate_skip))
+	clear_pfnblock_bit(page, page_to_pfn(page), PB_compact_skip)
 #define set_pageblock_skip(page) \
-	set_pfnblock_flags_mask(page, (1 << PB_migrate_skip),	\
-			page_to_pfn(page),			\
-			(1 << PB_migrate_skip))
+	set_pfnblock_bit(page, page_to_pfn(page), PB_compact_skip)
 #else
 static inline bool get_pageblock_skip(struct page *page)
 {
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a371d59cc718..403221982c2e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -774,7 +774,7 @@ void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
 
 	/*
 	 * TODO now we have a visible range of pages which are not associated
-	 * with their zone properly. Not nice but set_pfnblock_flags_mask
+	 * with their zone properly. Not nice but set_pfnblock_migratetype()
 	 * expects the zone spans the pfn range. All the pages in the range
 	 * are reserved so nobody should be touching them so we should be safe
 	 */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 31041e2aa33a..43a4c1532721 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -353,81 +353,174 @@ static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn)
 	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
 }
 
+static __always_inline bool is_standalone_pb_bit(enum pageblock_bits pb_bit)
+{
+	return pb_bit > PB_migrate_end && pb_bit < __NR_PAGEBLOCK_BITS;
+}
+
+static __always_inline void
+get_pfnblock_bitmap_bitidx(const struct page *page, unsigned long pfn,
+			   unsigned long **bitmap_word, unsigned long *bitidx)
+{
+	unsigned long *bitmap;
+	unsigned long word_bitidx;
+
+	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
+	BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
+	VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
+
+	bitmap = get_pageblock_bitmap(page, pfn);
+	*bitidx = pfn_to_bitidx(page, pfn);
+	word_bitidx = *bitidx / BITS_PER_LONG;
+	*bitidx &= (BITS_PER_LONG - 1);
+	*bitmap_word = &bitmap[word_bitidx];
+}
+
+
 /**
- * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
+ * __get_pfnblock_flags_mask - Return the requested group of flags for
+ * a pageblock_nr_pages block of pages
  * @page: The page within the block of interest
  * @pfn: The target page frame number
  * @mask: mask of bits that the caller is interested in
  *
  * Return: pageblock_bits flags
  */
-unsigned long get_pfnblock_flags_mask(const struct page *page,
-					unsigned long pfn, unsigned long mask)
+static unsigned long __get_pfnblock_flags_mask(const struct page *page,
+					       unsigned long pfn,
+					       unsigned long mask)
 {
-	unsigned long *bitmap;
-	unsigned long bitidx, word_bitidx;
+	unsigned long *bitmap_word;
+	unsigned long bitidx;
 	unsigned long word;
 
-	bitmap = get_pageblock_bitmap(page, pfn);
-	bitidx = pfn_to_bitidx(page, pfn);
-	word_bitidx = bitidx / BITS_PER_LONG;
-	bitidx &= (BITS_PER_LONG-1);
+	get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
 	/*
-	 * This races, without locks, with set_pfnblock_flags_mask(). Ensure
+	 * This races, without locks, with set_pfnblock_migratetype(). Ensure
 	 * a consistent read of the memory array, so that results, even though
 	 * racy, are not corrupted.
 	 */
-	word = READ_ONCE(bitmap[word_bitidx]);
+	word = READ_ONCE(*bitmap_word);
 	return (word >> bitidx) & mask;
 }
 
-static __always_inline int get_pfnblock_migratetype(const struct page *page,
-					unsigned long pfn)
+/**
+ * get_pfnblock_bit - Check if a standalone bit of a pageblock is set
+ * @page: The page within the block of interest
+ * @pfn: The target page frame number
+ * @pb_bit: pageblock bit to check
+ *
+ * Return: true if the bit is set, otherwise false
+ */
+bool get_pfnblock_bit(const struct page *page, unsigned long pfn,
+		      enum pageblock_bits pb_bit)
 {
-	return get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
+	unsigned long *bitmap_word;
+	unsigned long bitidx;
+
+	if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit)))
+		return false;
+
+	get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
+
+	return test_bit(bitidx + pb_bit, bitmap_word);
 }
 
 /**
- * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
+ * get_pfnblock_migratetype - Return the migratetype of a pageblock
+ * @page: The page within the block of interest
+ * @pfn: The target page frame number
+ *
+ * Return: The migratetype of the pageblock
+ *
+ * Use get_pfnblock_migratetype() if caller already has both @page and @pfn
+ * to save a call to page_to_pfn().
+ */
+__always_inline enum migratetype
+get_pfnblock_migratetype(const struct page *page, unsigned long pfn)
+{
+	return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
+}
+
+/**
+ * __set_pfnblock_flags_mask - Set the requested group of flags for
+ * a pageblock_nr_pages block of pages
  * @page: The page within the block of interest
- * @flags: The flags to set
  * @pfn: The target page frame number
+ * @flags: The flags to set
  * @mask: mask of bits that the caller is interested in
  */
-void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
-					unsigned long pfn,
-					unsigned long mask)
+static void __set_pfnblock_flags_mask(struct page *page, unsigned long pfn,
+				      unsigned long flags, unsigned long mask)
 {
-	unsigned long *bitmap;
-	unsigned long bitidx, word_bitidx;
+	unsigned long *bitmap_word;
+	unsigned long bitidx;
 	unsigned long word;
 
-	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
-	BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
-
-	bitmap = get_pageblock_bitmap(page, pfn);
-	bitidx = pfn_to_bitidx(page, pfn);
-	word_bitidx = bitidx / BITS_PER_LONG;
-	bitidx &= (BITS_PER_LONG-1);
-
-	VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
+	get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
 
 	mask <<= bitidx;
 	flags <<= bitidx;
 
-	word = READ_ONCE(bitmap[word_bitidx]);
+	word = READ_ONCE(*bitmap_word);
 	do {
-	} while (!try_cmpxchg(&bitmap[word_bitidx], &word, (word & ~mask) | flags));
+	} while (!try_cmpxchg(bitmap_word, &word, (word & ~mask) | flags));
+}
+
+/**
+ * set_pfnblock_bit - Set a standalone bit of a pageblock
+ * @page: The page within the block of interest
+ * @pfn: The target page frame number
+ * @pb_bit: pageblock bit to set
+ */
+void set_pfnblock_bit(const struct page *page, unsigned long pfn,
+		      enum pageblock_bits pb_bit)
+{
+	unsigned long *bitmap_word;
+	unsigned long bitidx;
+
+	if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit)))
+		return;
+
+	get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
+
+	set_bit(bitidx + pb_bit, bitmap_word);
 }
 
-void set_pageblock_migratetype(struct page *page, int migratetype)
+/**
+ * clear_pfnblock_bit - Clear a standalone bit of a pageblock
+ * @page: The page within the block of interest
+ * @pfn: The target page frame number
+ * @pb_bit: pageblock bit to clear
+ */
+void clear_pfnblock_bit(const struct page *page, unsigned long pfn,
+			enum pageblock_bits pb_bit)
+{
+	unsigned long *bitmap_word;
+	unsigned long bitidx;
+
+	if (WARN_ON_ONCE(!is_standalone_pb_bit(pb_bit)))
+		return;
+
+	get_pfnblock_bitmap_bitidx(page, pfn, &bitmap_word, &bitidx);
+
+	clear_bit(bitidx + pb_bit, bitmap_word);
+}
+
+/**
+ * set_pageblock_migratetype - Set the migratetype of a pageblock
+ * @page: The page within the block of interest
+ * @migratetype: migratetype to set
+ */
+__always_inline void set_pageblock_migratetype(struct page *page,
+					       enum migratetype migratetype)
 {
 	if (unlikely(page_group_by_mobility_disabled &&
 		     migratetype < MIGRATE_PCPTYPES))
 		migratetype = MIGRATE_UNMOVABLE;
 
-	set_pfnblock_flags_mask(page, (unsigned long)migratetype,
-				page_to_pfn(page), MIGRATETYPE_MASK);
+	__set_pfnblock_flags_mask(page, page_to_pfn(page),
+				  (unsigned long)migratetype, MIGRATETYPE_MASK);
 }
 
 #ifdef CONFIG_DEBUG_VM
@@ -667,7 +760,7 @@ static inline void __add_to_free_list(struct page *page, struct zone *zone,
 	int nr_pages = 1 << order;
 
 	VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype,
-		     "page type is %lu, passed migratetype is %d (nr=%d)\n",
+		     "page type is %d, passed migratetype is %d (nr=%d)\n",
 		     get_pageblock_migratetype(page), migratetype, nr_pages);
 
 	if (tail)
@@ -693,7 +786,7 @@ static inline void move_to_free_list(struct page *page, struct zone *zone,
 
 	/* Free page moving can fail, so it happens before the type update */
 	VM_WARN_ONCE(get_pageblock_migratetype(page) != old_mt,
-		     "page type is %lu, passed migratetype is %d (nr=%d)\n",
+		     "page type is %d, passed migratetype is %d (nr=%d)\n",
 		     get_pageblock_migratetype(page), old_mt, nr_pages);
 
 	list_move_tail(&page->buddy_list, &area->free_list[new_mt]);
@@ -715,7 +808,7 @@ static inline void __del_page_from_free_list(struct page *page, struct zone *zon
 	int nr_pages = 1 << order;
 
         VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype,
-		     "page type is %lu, passed migratetype is %d (nr=%d)\n",
+		     "page type is %d, passed migratetype is %d (nr=%d)\n",
 		     get_pageblock_migratetype(page), migratetype, nr_pages);
 
 	/* clear reported state and update reported page count */
@@ -3123,7 +3216,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 
 /*
  * Do not instrument rmqueue() with KMSAN. This function may call
- * __msan_poison_alloca() through a call to set_pfnblock_flags_mask().
+ * __msan_poison_alloca() through a call to set_pfnblock_migratetype().
  * If __msan_poison_alloca() attempts to allocate pages for the stack depot, it
  * may call rmqueue() again, which will result in a deadlock.
  */
-- 
cgit v1.2.3


From e904bce2d9d43e0f370e238457a13847d161570b Mon Sep 17 00:00:00 2001
From: Zi Yan <ziy@nvidia.com>
Date: Mon, 16 Jun 2025 22:11:10 -0400
Subject: mm/page_isolation: make page isolation a standalone bit

During page isolation, the original migratetype is overwritten, since
MIGRATE_* are enums and stored in pageblock bitmaps.  Change
MIGRATE_ISOLATE to be stored as a standalone bit, PB_migrate_isolate, like
PB_compact_skip, so that migratetype is not lost during pageblock
isolation.

Link: https://lkml.kernel.org/r/20250617021115.2331563-3-ziy@nvidia.com
Signed-off-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Richard Chang <richardycc@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmzone.h          |  3 +++
 include/linux/page-isolation.h  | 16 ++++++++++++++++
 include/linux/pageblock-flags.h | 14 ++++++++++++++
 mm/page_alloc.c                 | 27 ++++++++++++++++++++++++---
 4 files changed, 57 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 76d66c07b673..1d1bb2b7f40d 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -79,6 +79,9 @@ enum migratetype {
 	 * __free_pageblock_cma() function.
 	 */
 	MIGRATE_CMA,
+	__MIGRATE_TYPE_END = MIGRATE_CMA,
+#else
+	__MIGRATE_TYPE_END = MIGRATE_HIGHATOMIC,
 #endif
 #ifdef CONFIG_MEMORY_ISOLATION
 	MIGRATE_ISOLATE,	/* can't allocate from here */
diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index 277d8d92980c..fc021d3f95ca 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -11,6 +11,12 @@ static inline bool is_migrate_isolate(int migratetype)
 {
 	return migratetype == MIGRATE_ISOLATE;
 }
+#define get_pageblock_isolate(page) \
+	get_pfnblock_bit(page, page_to_pfn(page), PB_migrate_isolate)
+#define clear_pageblock_isolate(page) \
+	clear_pfnblock_bit(page, page_to_pfn(page), PB_migrate_isolate)
+#define set_pageblock_isolate(page) \
+	set_pfnblock_bit(page, page_to_pfn(page), PB_migrate_isolate)
 #else
 static inline bool is_migrate_isolate_page(struct page *page)
 {
@@ -20,6 +26,16 @@ static inline bool is_migrate_isolate(int migratetype)
 {
 	return false;
 }
+static inline bool get_pageblock_isolate(struct page *page)
+{
+	return false;
+}
+static inline void clear_pageblock_isolate(struct page *page)
+{
+}
+static inline void set_pageblock_isolate(struct page *page)
+{
+}
 #endif
 
 #define MEMORY_OFFLINE	0x1
diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index c240c7a1fb03..6a44be0f39f4 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -21,6 +21,13 @@ enum pageblock_bits {
 			/* 3 bits required for migrate types */
 	PB_compact_skip,/* If set the block is skipped by compaction */
 
+#ifdef CONFIG_MEMORY_ISOLATION
+	/*
+	 * Pageblock isolation is represented with a separate bit, so that
+	 * the migratetype of a block is not overwritten by isolation.
+	 */
+	PB_migrate_isolate, /* If set the block is isolated */
+#endif
 	/*
 	 * Assume the bits will always align on a word. If this assumption
 	 * changes then get/set pageblock needs updating.
@@ -32,6 +39,13 @@ enum pageblock_bits {
 
 #define MIGRATETYPE_MASK ((1UL << (PB_migrate_end + 1)) - 1)
 
+#ifdef CONFIG_MEMORY_ISOLATION
+#define MIGRATETYPE_AND_ISO_MASK \
+	(((1UL << (PB_migrate_end + 1)) - 1) | BIT(PB_migrate_isolate))
+#else
+#define MIGRATETYPE_AND_ISO_MASK MIGRATETYPE_MASK
+#endif
+
 #if defined(CONFIG_HUGETLB_PAGE)
 
 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 43a4c1532721..61dd34102c14 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -365,8 +365,12 @@ get_pfnblock_bitmap_bitidx(const struct page *page, unsigned long pfn,
 	unsigned long *bitmap;
 	unsigned long word_bitidx;
 
+#ifdef CONFIG_MEMORY_ISOLATION
+	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 8);
+#else
 	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
-	BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
+#endif
+	BUILD_BUG_ON(__MIGRATE_TYPE_END >= (1 << PB_migratetype_bits));
 	VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
 
 	bitmap = get_pageblock_bitmap(page, pfn);
@@ -439,7 +443,16 @@ bool get_pfnblock_bit(const struct page *page, unsigned long pfn,
 __always_inline enum migratetype
 get_pfnblock_migratetype(const struct page *page, unsigned long pfn)
 {
-	return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
+	unsigned long mask = MIGRATETYPE_AND_ISO_MASK;
+	unsigned long flags;
+
+	flags = __get_pfnblock_flags_mask(page, pfn, mask);
+
+#ifdef CONFIG_MEMORY_ISOLATION
+	if (flags & BIT(PB_migrate_isolate))
+		return MIGRATE_ISOLATE;
+#endif
+	return flags & MIGRATETYPE_MASK;
 }
 
 /**
@@ -519,8 +532,16 @@ __always_inline void set_pageblock_migratetype(struct page *page,
 		     migratetype < MIGRATE_PCPTYPES))
 		migratetype = MIGRATE_UNMOVABLE;
 
+#ifdef CONFIG_MEMORY_ISOLATION
+	if (migratetype == MIGRATE_ISOLATE) {
+		set_pfnblock_bit(page, page_to_pfn(page), PB_migrate_isolate);
+		return;
+	}
+	/* MIGRATETYPE_AND_ISO_MASK clears PB_migrate_isolate if it is set */
+#endif
 	__set_pfnblock_flags_mask(page, page_to_pfn(page),
-				  (unsigned long)migratetype, MIGRATETYPE_MASK);
+				  (unsigned long)migratetype,
+				  MIGRATETYPE_AND_ISO_MASK);
 }
 
 #ifdef CONFIG_DEBUG_VM
-- 
cgit v1.2.3


From 1bc3587a88d291a37dab12d6c14aa7da53304251 Mon Sep 17 00:00:00 2001
From: Zi Yan <ziy@nvidia.com>
Date: Mon, 16 Jun 2025 22:11:11 -0400
Subject: mm/page_alloc: add support for initializing pageblock as isolated

MIGRATE_ISOLATE is a standalone bit, so a pageblock cannot be initialized
to just MIGRATE_ISOLATE.  Add init_pageblock_migratetype() to allow
initializing a pageblock with a migratetype and, optionally, as isolated.

Link: https://lkml.kernel.org/r/20250617021115.2331563-4-ziy@nvidia.com
Signed-off-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Richard Chang <richardycc@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memory_hotplug.h |  3 ++-
 include/linux/page-isolation.h |  3 +++
 kernel/kexec_handover.c        |  4 ++--
 mm/hugetlb.c                   |  4 ++--
 mm/internal.h                  |  3 ++-
 mm/memory_hotplug.c            | 12 ++++++++----
 mm/memremap.c                  |  2 +-
 mm/mm_init.c                   | 24 +++++++++++++++---------
 mm/page_alloc.c                | 26 ++++++++++++++++++++++++++
 9 files changed, 61 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index eaac5ae8c05c..23f038a16231 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -314,7 +314,8 @@ extern int add_memory_driver_managed(int nid, u64 start, u64 size,
 				     mhp_t mhp_flags);
 extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
 				   unsigned long nr_pages,
-				   struct vmem_altmap *altmap, int migratetype);
+				   struct vmem_altmap *altmap, int migratetype,
+				   bool isolate_pageblock);
 extern void remove_pfn_range_from_zone(struct zone *zone,
 				       unsigned long start_pfn,
 				       unsigned long nr_pages);
diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index fc021d3f95ca..14c6a5f691c2 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -41,6 +41,9 @@ static inline void set_pageblock_isolate(struct page *page)
 #define MEMORY_OFFLINE	0x1
 #define REPORT_FAILURE	0x2
 
+void __meminit init_pageblock_migratetype(struct page *page,
+					  enum migratetype migratetype,
+					  bool isolate);
 void set_pageblock_migratetype(struct page *page, enum migratetype migratetype);
 
 bool move_freepages_block_isolate(struct zone *zone, struct page *page,
diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c
index 5a21dbe17950..49634cc3fb43 100644
--- a/kernel/kexec_handover.c
+++ b/kernel/kexec_handover.c
@@ -1100,8 +1100,8 @@ static void __init kho_release_scratch(void)
 		ulong pfn;
 
 		for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages)
-			set_pageblock_migratetype(pfn_to_page(pfn),
-						  MIGRATE_CMA);
+			init_pageblock_migratetype(pfn_to_page(pfn),
+						   MIGRATE_CMA, false);
 	}
 }
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c03896375749..11d5668ff6e7 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3297,8 +3297,8 @@ static void __init hugetlb_bootmem_init_migratetype(struct folio *folio,
 		if (folio_test_hugetlb_cma(folio))
 			init_cma_pageblock(folio_page(folio, i));
 		else
-			set_pageblock_migratetype(folio_page(folio, i),
-					  MIGRATE_MOVABLE);
+			init_pageblock_migratetype(folio_page(folio, i),
+					  MIGRATE_MOVABLE, false);
 	}
 }
 
diff --git a/mm/internal.h b/mm/internal.h
index fe83dfca3c72..22a95a2b7fa1 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -820,7 +820,8 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
 			  int nid, bool exact_nid);
 
 void memmap_init_range(unsigned long, int, unsigned long, unsigned long,
-		unsigned long, enum meminit_context, struct vmem_altmap *, int);
+		unsigned long, enum meminit_context, struct vmem_altmap *, int,
+		bool);
 
 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 403221982c2e..a3c2b0784070 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -747,7 +747,8 @@ static inline void section_taint_zone_device(unsigned long pfn)
  */
 void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
 				  unsigned long nr_pages,
-				  struct vmem_altmap *altmap, int migratetype)
+				  struct vmem_altmap *altmap, int migratetype,
+				  bool isolate_pageblock)
 {
 	struct pglist_data *pgdat = zone->zone_pgdat;
 	int nid = pgdat->node_id;
@@ -779,7 +780,8 @@ void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
 	 * are reserved so nobody should be touching them so we should be safe
 	 */
 	memmap_init_range(nr_pages, nid, zone_idx(zone), start_pfn, 0,
-			 MEMINIT_HOTPLUG, altmap, migratetype);
+			 MEMINIT_HOTPLUG, altmap, migratetype,
+			 isolate_pageblock);
 
 	set_zone_contiguous(zone);
 }
@@ -1104,7 +1106,8 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
 	if (mhp_off_inaccessible)
 		page_init_poison(pfn_to_page(pfn), sizeof(struct page) * nr_pages);
 
-	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
+	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE,
+			       false);
 
 	for (i = 0; i < nr_pages; i++) {
 		struct page *page = pfn_to_page(pfn + i);
@@ -1175,7 +1178,8 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
 
 
 	/* associate pfn range with the zone */
-	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);
+	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_MOVABLE,
+			       true);
 
 	if (!node_state(nid, N_MEMORY)) {
 		/* Adding memory to the node for the first time */
diff --git a/mm/memremap.c b/mm/memremap.c
index f75078c14839..b0ce0d8254bd 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -228,7 +228,7 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
 		zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
 		move_pfn_range_to_zone(zone, PHYS_PFN(range->start),
 				PHYS_PFN(range_len(range)), params->altmap,
-				MIGRATE_MOVABLE);
+				MIGRATE_MOVABLE, false);
 	}
 
 	mem_hotplug_done();
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 02f41e2bdf60..5c21b3af216b 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -685,7 +685,8 @@ void __meminit __init_page_from_nid(unsigned long pfn, int nid)
 	__init_single_page(pfn_to_page(pfn), pfn, zid, nid);
 
 	if (pageblock_aligned(pfn))
-		set_pageblock_migratetype(pfn_to_page(pfn), MIGRATE_MOVABLE);
+		init_pageblock_migratetype(pfn_to_page(pfn), MIGRATE_MOVABLE,
+				false);
 }
 
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
@@ -874,7 +875,8 @@ static void __init init_unavailable_range(unsigned long spfn,
 void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone,
 		unsigned long start_pfn, unsigned long zone_end_pfn,
 		enum meminit_context context,
-		struct vmem_altmap *altmap, int migratetype)
+		struct vmem_altmap *altmap, int migratetype,
+		bool isolate_pageblock)
 {
 	unsigned long pfn, end_pfn = start_pfn + size;
 	struct page *page;
@@ -931,7 +933,8 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone
 		 * over the place during system boot.
 		 */
 		if (pageblock_aligned(pfn)) {
-			set_pageblock_migratetype(page, migratetype);
+			init_pageblock_migratetype(page, migratetype,
+					isolate_pageblock);
 			cond_resched();
 		}
 		pfn++;
@@ -954,7 +957,8 @@ static void __init memmap_init_zone_range(struct zone *zone,
 		return;
 
 	memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn,
-			  zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
+			  zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE,
+			  false);
 
 	if (*hole_pfn < start_pfn)
 		init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid);
@@ -1035,7 +1039,7 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
 	 * because this is done early in section_activate()
 	 */
 	if (pageblock_aligned(pfn)) {
-		set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+		init_pageblock_migratetype(page, MIGRATE_MOVABLE, false);
 		cond_resched();
 	}
 
@@ -1996,7 +2000,8 @@ static void __init deferred_free_pages(unsigned long pfn,
 	/* Free a large naturally-aligned chunk if possible */
 	if (nr_pages == MAX_ORDER_NR_PAGES && IS_MAX_ORDER_ALIGNED(pfn)) {
 		for (i = 0; i < nr_pages; i += pageblock_nr_pages)
-			set_pageblock_migratetype(page + i, MIGRATE_MOVABLE);
+			init_pageblock_migratetype(page + i, MIGRATE_MOVABLE,
+					false);
 		__free_pages_core(page, MAX_PAGE_ORDER, MEMINIT_EARLY);
 		return;
 	}
@@ -2006,7 +2011,8 @@ static void __init deferred_free_pages(unsigned long pfn,
 
 	for (i = 0; i < nr_pages; i++, page++, pfn++) {
 		if (pageblock_aligned(pfn))
-			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+			init_pageblock_migratetype(page, MIGRATE_MOVABLE,
+					false);
 		__free_pages_core(page, 0, MEMINIT_EARLY);
 	}
 }
@@ -2305,7 +2311,7 @@ void __init init_cma_reserved_pageblock(struct page *page)
 		set_page_count(p, 0);
 	} while (++p, --i);
 
-	set_pageblock_migratetype(page, MIGRATE_CMA);
+	init_pageblock_migratetype(page, MIGRATE_CMA, false);
 	set_page_refcounted(page);
 	/* pages were reserved and not allocated */
 	clear_page_tag_ref(page);
@@ -2319,7 +2325,7 @@ void __init init_cma_reserved_pageblock(struct page *page)
  */
 void __init init_cma_pageblock(struct page *page)
 {
-	set_pageblock_migratetype(page, MIGRATE_CMA);
+	init_pageblock_migratetype(page, MIGRATE_CMA, false);
 	adjust_managed_page_count(page, pageblock_nr_pages);
 	page_zone(page)->cma_pages += pageblock_nr_pages;
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 61dd34102c14..c7730264bf5f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -544,6 +544,32 @@ __always_inline void set_pageblock_migratetype(struct page *page,
 				  MIGRATETYPE_AND_ISO_MASK);
 }
 
+void __meminit init_pageblock_migratetype(struct page *page,
+					  enum migratetype migratetype,
+					  bool isolate)
+{
+	unsigned long flags;
+
+	if (unlikely(page_group_by_mobility_disabled &&
+		     migratetype < MIGRATE_PCPTYPES))
+		migratetype = MIGRATE_UNMOVABLE;
+
+	flags = migratetype;
+
+#ifdef CONFIG_MEMORY_ISOLATION
+	if (migratetype == MIGRATE_ISOLATE) {
+		VM_WARN_ONCE(
+			1,
+			"Set isolate=true to isolate pageblock with a migratetype");
+		return;
+	}
+	if (isolate)
+		flags |= BIT(PB_migrate_isolate);
+#endif
+	__set_pfnblock_flags_mask(page, page_to_pfn(page), flags,
+				  MIGRATETYPE_AND_ISO_MASK);
+}
+
 #ifdef CONFIG_DEBUG_VM
 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
 {
-- 
cgit v1.2.3


From b1df9c5713dc41229667aa44eaea2399e8de9470 Mon Sep 17 00:00:00 2001
From: Zi Yan <ziy@nvidia.com>
Date: Mon, 16 Jun 2025 22:11:12 -0400
Subject: mm/page_isolation: remove migratetype from
 move_freepages_block_isolate()

Since migratetype is no longer overwritten during pageblock isolation,
moving a pageblock out of MIGRATE_ISOLATE no longer needs a new
migratetype.

Add pageblock_isolate_and_move_free_pages() and
pageblock_unisolate_and_move_free_pages() to be explicit about the page
isolation operations.  Both share the common code in
__move_freepages_block_isolate(), which is renamed from
move_freepages_block_isolate().

Add toggle_pageblock_isolate() to flip pageblock isolation bit in
__move_freepages_block_isolate().

Make set_pageblock_migratetype() accept only non-MIGRATE_ISOLATE types, so
that set_pageblock_isolate() must be used to isolate pageblocks.  As a
result, move the pageblock migratetype code out of __move_freepages_block().

Link: https://lkml.kernel.org/r/20250617021115.2331563-5-ziy@nvidia.com
Signed-off-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Richard Chang <richardycc@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-isolation.h |  5 ++-
 mm/page_alloc.c                | 80 +++++++++++++++++++++++++++++++++---------
 mm/page_isolation.c            | 21 ++++++-----
 3 files changed, 75 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index 14c6a5f691c2..7241a6719618 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -44,10 +44,9 @@ static inline void set_pageblock_isolate(struct page *page)
 void __meminit init_pageblock_migratetype(struct page *page,
 					  enum migratetype migratetype,
 					  bool isolate);
-void set_pageblock_migratetype(struct page *page, enum migratetype migratetype);
 
-bool move_freepages_block_isolate(struct zone *zone, struct page *page,
-				  int migratetype);
+bool pageblock_isolate_and_move_free_pages(struct zone *zone, struct page *page);
+bool pageblock_unisolate_and_move_free_pages(struct zone *zone, struct page *page);
 
 int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 			     int migratetype, int flags);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c7730264bf5f..938b01bed1f6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -525,8 +525,8 @@ void clear_pfnblock_bit(const struct page *page, unsigned long pfn,
  * @page: The page within the block of interest
  * @migratetype: migratetype to set
  */
-__always_inline void set_pageblock_migratetype(struct page *page,
-					       enum migratetype migratetype)
+static void set_pageblock_migratetype(struct page *page,
+				      enum migratetype migratetype)
 {
 	if (unlikely(page_group_by_mobility_disabled &&
 		     migratetype < MIGRATE_PCPTYPES))
@@ -534,9 +534,13 @@ __always_inline void set_pageblock_migratetype(struct page *page,
 
 #ifdef CONFIG_MEMORY_ISOLATION
 	if (migratetype == MIGRATE_ISOLATE) {
-		set_pfnblock_bit(page, page_to_pfn(page), PB_migrate_isolate);
+		VM_WARN_ONCE(1,
+			"Use set_pageblock_isolate() for pageblock isolation");
 		return;
 	}
+	VM_WARN_ONCE(get_pfnblock_bit(page, page_to_pfn(page),
+				      PB_migrate_isolate),
+		     "Use clear_pageblock_isolate() to unisolate pageblock");
 	/* MIGRATETYPE_AND_ISO_MASK clears PB_migrate_isolate if it is set */
 #endif
 	__set_pfnblock_flags_mask(page, page_to_pfn(page),
@@ -1921,8 +1925,8 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
 #endif
 
 /*
- * Change the type of a block and move all its free pages to that
- * type's freelist.
+ * Move all free pages of a block to new type's freelist. Caller needs to
+ * change the block type.
  */
 static int __move_freepages_block(struct zone *zone, unsigned long start_pfn,
 				  int old_mt, int new_mt)
@@ -1954,8 +1958,6 @@ static int __move_freepages_block(struct zone *zone, unsigned long start_pfn,
 		pages_moved += 1 << order;
 	}
 
-	set_pageblock_migratetype(pfn_to_page(start_pfn), new_mt);
-
 	return pages_moved;
 }
 
@@ -2013,11 +2015,16 @@ static int move_freepages_block(struct zone *zone, struct page *page,
 				int old_mt, int new_mt)
 {
 	unsigned long start_pfn;
+	int res;
 
 	if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL))
 		return -1;
 
-	return __move_freepages_block(zone, start_pfn, old_mt, new_mt);
+	res = __move_freepages_block(zone, start_pfn, old_mt, new_mt);
+	set_pageblock_migratetype(pfn_to_page(start_pfn), new_mt);
+
+	return res;
+
 }
 
 #ifdef CONFIG_MEMORY_ISOLATION
@@ -2045,11 +2052,19 @@ static unsigned long find_large_buddy(unsigned long start_pfn)
 	return start_pfn;
 }
 
+static inline void toggle_pageblock_isolate(struct page *page, bool isolate)
+{
+	if (isolate)
+		set_pfnblock_bit(page, page_to_pfn(page), PB_migrate_isolate);
+	else
+		clear_pfnblock_bit(page, page_to_pfn(page), PB_migrate_isolate);
+}
+
 /**
- * move_freepages_block_isolate - move free pages in block for page isolation
+ * __move_freepages_block_isolate - move free pages in block for page isolation
  * @zone: the zone
  * @page: the pageblock page
- * @migratetype: migratetype to set on the pageblock
+ * @isolate: to isolate the given pageblock or unisolate it
  *
  * This is similar to move_freepages_block(), but handles the special
  * case encountered in page isolation, where the block of interest
@@ -2064,10 +2079,18 @@ static unsigned long find_large_buddy(unsigned long start_pfn)
  *
  * Returns %true if pages could be moved, %false otherwise.
  */
-bool move_freepages_block_isolate(struct zone *zone, struct page *page,
-				  int migratetype)
+static bool __move_freepages_block_isolate(struct zone *zone,
+		struct page *page, bool isolate)
 {
 	unsigned long start_pfn, pfn;
+	int from_mt;
+	int to_mt;
+
+	if (isolate == get_pageblock_isolate(page)) {
+		VM_WARN_ONCE(1, "%s a pageblock that is already in that state",
+			     isolate ? "Isolate" : "Unisolate");
+		return false;
+	}
 
 	if (!prep_move_freepages_block(zone, page, &start_pfn, NULL, NULL))
 		return false;
@@ -2084,7 +2107,7 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page,
 
 		del_page_from_free_list(buddy, zone, order,
 					get_pfnblock_migratetype(buddy, pfn));
-		set_pageblock_migratetype(page, migratetype);
+		toggle_pageblock_isolate(page, isolate);
 		split_large_buddy(zone, buddy, pfn, order, FPI_NONE);
 		return true;
 	}
@@ -2095,16 +2118,38 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page,
 
 		del_page_from_free_list(page, zone, order,
 					get_pfnblock_migratetype(page, pfn));
-		set_pageblock_migratetype(page, migratetype);
+		toggle_pageblock_isolate(page, isolate);
 		split_large_buddy(zone, page, pfn, order, FPI_NONE);
 		return true;
 	}
 move:
-	__move_freepages_block(zone, start_pfn,
-			       get_pfnblock_migratetype(page, start_pfn),
-			       migratetype);
+	/* Use MIGRATETYPE_MASK to get non-isolate migratetype */
+	if (isolate) {
+		from_mt = __get_pfnblock_flags_mask(page, page_to_pfn(page),
+						    MIGRATETYPE_MASK);
+		to_mt = MIGRATE_ISOLATE;
+	} else {
+		from_mt = MIGRATE_ISOLATE;
+		to_mt = __get_pfnblock_flags_mask(page, page_to_pfn(page),
+						  MIGRATETYPE_MASK);
+	}
+
+	__move_freepages_block(zone, start_pfn, from_mt, to_mt);
+	toggle_pageblock_isolate(pfn_to_page(start_pfn), isolate);
+
 	return true;
 }
+
+bool pageblock_isolate_and_move_free_pages(struct zone *zone, struct page *page)
+{
+	return __move_freepages_block_isolate(zone, page, true);
+}
+
+bool pageblock_unisolate_and_move_free_pages(struct zone *zone, struct page *page)
+{
+	return __move_freepages_block_isolate(zone, page, false);
+}
+
 #endif /* CONFIG_MEMORY_ISOLATION */
 
 static void change_pageblock_range(struct page *pageblock_page,
@@ -2296,6 +2341,7 @@ try_to_claim_block(struct zone *zone, struct page *page,
 	if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
 			page_group_by_mobility_disabled) {
 		__move_freepages_block(zone, start_pfn, block_type, start_type);
+		set_pageblock_migratetype(pfn_to_page(start_pfn), start_type);
 		return __rmqueue_smallest(zone, order, start_type);
 	}
 
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index b2fc5266e3d2..08f627a5032f 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -188,7 +188,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
 	unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end,
 			migratetype, isol_flags);
 	if (!unmovable) {
-		if (!move_freepages_block_isolate(zone, page, MIGRATE_ISOLATE)) {
+		if (!pageblock_isolate_and_move_free_pages(zone, page)) {
 			spin_unlock_irqrestore(&zone->lock, flags);
 			return -EBUSY;
 		}
@@ -209,7 +209,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
 	return -EBUSY;
 }
 
-static void unset_migratetype_isolate(struct page *page, int migratetype)
+static void unset_migratetype_isolate(struct page *page)
 {
 	struct zone *zone;
 	unsigned long flags;
@@ -262,10 +262,10 @@ static void unset_migratetype_isolate(struct page *page, int migratetype)
 		 * Isolating this block already succeeded, so this
 		 * should not fail on zone boundaries.
 		 */
-		WARN_ON_ONCE(!move_freepages_block_isolate(zone, page, migratetype));
+		WARN_ON_ONCE(!pageblock_unisolate_and_move_free_pages(zone, page));
 	} else {
-		set_pageblock_migratetype(page, migratetype);
-		__putback_isolated_page(page, order, migratetype);
+		clear_pageblock_isolate(page);
+		__putback_isolated_page(page, order, get_pageblock_migratetype(page));
 	}
 	zone->nr_isolate_pageblock--;
 out:
@@ -383,7 +383,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
 		if (PageBuddy(page)) {
 			int order = buddy_order(page);
 
-			/* move_freepages_block_isolate() handled this */
+			/* pageblock_isolate_and_move_free_pages() handled this */
 			VM_WARN_ON_ONCE(pfn + (1 << order) > boundary_pfn);
 
 			pfn += 1UL << order;
@@ -433,7 +433,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
 failed:
 	/* restore the original migratetype */
 	if (!skip_isolation)
-		unset_migratetype_isolate(pfn_to_page(isolate_pageblock), migratetype);
+		unset_migratetype_isolate(pfn_to_page(isolate_pageblock));
 	return -EBUSY;
 }
 
@@ -504,7 +504,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 	ret = isolate_single_pageblock(isolate_end, flags, true,
 			skip_isolation, migratetype);
 	if (ret) {
-		unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype);
+		unset_migratetype_isolate(pfn_to_page(isolate_start));
 		return ret;
 	}
 
@@ -517,8 +517,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 					start_pfn, end_pfn)) {
 			undo_isolate_page_range(isolate_start, pfn, migratetype);
 			unset_migratetype_isolate(
-				pfn_to_page(isolate_end - pageblock_nr_pages),
-				migratetype);
+				pfn_to_page(isolate_end - pageblock_nr_pages));
 			return -EBUSY;
 		}
 	}
@@ -548,7 +547,7 @@ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 		page = __first_valid_page(pfn, pageblock_nr_pages);
 		if (!page || !is_migrate_isolate_page(page))
 			continue;
-		unset_migratetype_isolate(page, migratetype);
+		unset_migratetype_isolate(page);
 	}
 }
 /*
-- 
cgit v1.2.3


From 7a3324eb66f616408fdaaff8a1289c0a9b333748 Mon Sep 17 00:00:00 2001
From: Zi Yan <ziy@nvidia.com>
Date: Mon, 16 Jun 2025 22:11:13 -0400
Subject: mm/page_isolation: remove migratetype from undo_isolate_page_range()

Since migratetype is no longer overwritten during pageblock isolation,
undoing pageblock isolation no longer needs to know which migratetype to
restore.

Link: https://lkml.kernel.org/r/20250617021115.2331563-6-ziy@nvidia.com
Signed-off-by: Zi Yan <ziy@nvidia.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kirill A. Shuemov <kirill.shutemov@linux.intel.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Richard Chang <richardycc@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-isolation.h | 3 +--
 mm/memory_hotplug.c            | 4 ++--
 mm/page_alloc.c                | 2 +-
 mm/page_isolation.c            | 9 +++------
 4 files changed, 7 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index 7241a6719618..7a681a49e73c 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -51,8 +51,7 @@ bool pageblock_unisolate_and_move_free_pages(struct zone *zone, struct page *pag
 int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 			     int migratetype, int flags);
 
-void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
-			     int migratetype);
+void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn);
 
 int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
 			int isol_flags);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a3c2b0784070..6e3380ad1bf5 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1222,7 +1222,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
 		build_all_zonelists(NULL);
 
 	/* Basic onlining is complete, allow allocation of onlined pages. */
-	undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE);
+	undo_isolate_page_range(pfn, pfn + nr_pages);
 
 	/*
 	 * Freshly onlined pages aren't shuffled (e.g., all pages are placed to
@@ -2086,7 +2086,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 
 failed_removal_isolated:
 	/* pushback to free area */
-	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
+	undo_isolate_page_range(start_pfn, end_pfn);
 	memory_notify(MEM_CANCEL_OFFLINE, &mem_arg);
 	if (node_arg.nid != NUMA_NO_NODE)
 		node_notify(NODE_CANCEL_REMOVING_LAST_MEMORY, &node_arg);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 938b01bed1f6..8d4b3f12b9a6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6984,7 +6984,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
 		     start, end, outer_start, outer_end);
 	}
 done:
-	undo_isolate_page_range(start, end, migratetype);
+	undo_isolate_page_range(start, end);
 	return ret;
 }
 EXPORT_SYMBOL(alloc_contig_range_noprof);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 08f627a5032f..1edfef408faf 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -515,7 +515,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 		page = __first_valid_page(pfn, pageblock_nr_pages);
 		if (page && set_migratetype_isolate(page, migratetype, flags,
 					start_pfn, end_pfn)) {
-			undo_isolate_page_range(isolate_start, pfn, migratetype);
+			undo_isolate_page_range(isolate_start, pfn);
 			unset_migratetype_isolate(
 				pfn_to_page(isolate_end - pageblock_nr_pages));
 			return -EBUSY;
@@ -528,13 +528,10 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
  * undo_isolate_page_range - undo effects of start_isolate_page_range()
  * @start_pfn:		The first PFN of the isolated range
  * @end_pfn:		The last PFN of the isolated range
- * @migratetype:	New migrate type to set on the range
  *
- * This finds every MIGRATE_ISOLATE page block in the given range
- * and switches it to @migratetype.
+ * This finds and unsets every MIGRATE_ISOLATE page block in the given range
  */
-void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
-			    int migratetype)
+void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn)
 {
 	unsigned long pfn;
 	struct page *page;
-- 
cgit v1.2.3


From d1554fb6302093d353c8bf4601f9bf994b836904 Mon Sep 17 00:00:00 2001
From: Zi Yan <ziy@nvidia.com>
Date: Mon, 16 Jun 2025 22:11:14 -0400
Subject: mm/page_isolation: remove migratetype parameter from more functions

Since migratetype is no longer overwritten during pageblock isolation,
start_isolate_page_range(), has_unmovable_pages(), and
set_migratetype_isolate() no longer need to know which migratetype to
restore on isolation failure.

has_unmovable_pages() still needs to know whether the isolation is for a
CMA allocation, so add PB_ISOLATE_MODE_CMA_ALLOC to provide that information.
At the same time change isolation flags to enum pb_isolate_mode
(PB_ISOLATE_MODE_MEM_OFFLINE, PB_ISOLATE_MODE_CMA_ALLOC,
PB_ISOLATE_MODE_OTHER).  Remove REPORT_FAILURE and check
PB_ISOLATE_MODE_MEM_OFFLINE, since only PB_ISOLATE_MODE_MEM_OFFLINE
reports isolation failures.

alloc_contig_range() no longer needs migratetype.  Replace it with a newly
defined acr_flags_t to tell if an allocation is for CMA.  The same change
is made to __alloc_contig_migrate_range().  Add ACR_FLAGS_NONE (set to 0)
to indicate ordinary allocations.

Link: https://lkml.kernel.org/r/20250617021115.2331563-7-ziy@nvidia.com
Signed-off-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kirill A. Shuemov <kirill.shutemov@linux.intel.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Richard Chang <richardycc@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/virtio/virtio_mem.c    |  2 +-
 include/linux/gfp.h            |  7 ++++-
 include/linux/page-isolation.h | 20 +++++++++---
 include/trace/events/kmem.h    | 14 +++++----
 mm/cma.c                       |  2 +-
 mm/memory_hotplug.c            |  6 ++--
 mm/page_alloc.c                | 27 ++++++++--------
 mm/page_isolation.c            | 70 +++++++++++++++++++-----------------------
 8 files changed, 80 insertions(+), 68 deletions(-)

(limited to 'include')

diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
index 56d0dbe62163..1688ecd69a04 100644
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@ -1243,7 +1243,7 @@ static int virtio_mem_fake_offline(struct virtio_mem *vm, unsigned long pfn,
 		if (atomic_read(&vm->config_changed))
 			return -EAGAIN;
 
-		rc = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE,
+		rc = alloc_contig_range(pfn, pfn + nr_pages, ACR_FLAGS_NONE,
 					GFP_KERNEL);
 		if (rc == -ENOMEM)
 			/* whoops, out of memory */
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index be160e8d8bcb..5ebf26fcdcfa 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -423,9 +423,14 @@ static inline bool gfp_compaction_allowed(gfp_t gfp_mask)
 extern gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma);
 
 #ifdef CONFIG_CONTIG_ALLOC
+
+typedef unsigned int __bitwise acr_flags_t;
+#define ACR_FLAGS_NONE ((__force acr_flags_t)0) // ordinary allocation request
+#define ACR_FLAGS_CMA ((__force acr_flags_t)BIT(0)) // allocate for CMA
+
 /* The below functions must be run on a range from a single zone. */
 extern int alloc_contig_range_noprof(unsigned long start, unsigned long end,
-			      unsigned migratetype, gfp_t gfp_mask);
+				     acr_flags_t alloc_flags, gfp_t gfp_mask);
 #define alloc_contig_range(...)			alloc_hooks(alloc_contig_range_noprof(__VA_ARGS__))
 
 extern struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask,
diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index 7a681a49e73c..3e2f960e166c 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -38,8 +38,20 @@ static inline void set_pageblock_isolate(struct page *page)
 }
 #endif
 
-#define MEMORY_OFFLINE	0x1
-#define REPORT_FAILURE	0x2
+/*
+ * Pageblock isolation modes:
+ * PB_ISOLATE_MODE_MEM_OFFLINE - isolate to offline (!allocate) memory
+ *				 e.g., skip over PageHWPoison() pages and
+ *				 PageOffline() pages. Unmovable pages will be
+ *				 reported in this mode.
+ * PB_ISOLATE_MODE_CMA_ALLOC   - isolate for CMA allocations
+ * PB_ISOLATE_MODE_OTHER       - isolate for other purposes
+ */
+enum pb_isolate_mode {
+	PB_ISOLATE_MODE_MEM_OFFLINE,
+	PB_ISOLATE_MODE_CMA_ALLOC,
+	PB_ISOLATE_MODE_OTHER,
+};
 
 void __meminit init_pageblock_migratetype(struct page *page,
 					  enum migratetype migratetype,
@@ -49,10 +61,10 @@ bool pageblock_isolate_and_move_free_pages(struct zone *zone, struct page *page)
 bool pageblock_unisolate_and_move_free_pages(struct zone *zone, struct page *page);
 
 int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
-			     int migratetype, int flags);
+			     enum pb_isolate_mode mode);
 
 void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn);
 
 int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
-			int isol_flags);
+			enum pb_isolate_mode mode);
 #endif
diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index f74925a6cf69..efffcf578217 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -304,6 +304,7 @@ TRACE_EVENT(mm_page_alloc_extfrag,
 		__entry->change_ownership)
 );
 
+#ifdef CONFIG_CONTIG_ALLOC
 TRACE_EVENT(mm_alloc_contig_migrate_range_info,
 
 	TP_PROTO(unsigned long start,
@@ -311,9 +312,9 @@ TRACE_EVENT(mm_alloc_contig_migrate_range_info,
 		 unsigned long nr_migrated,
 		 unsigned long nr_reclaimed,
 		 unsigned long nr_mapped,
-		 int migratetype),
+		 acr_flags_t alloc_flags),
 
-	TP_ARGS(start, end, nr_migrated, nr_reclaimed, nr_mapped, migratetype),
+	TP_ARGS(start, end, nr_migrated, nr_reclaimed, nr_mapped, alloc_flags),
 
 	TP_STRUCT__entry(
 		__field(unsigned long, start)
@@ -321,7 +322,7 @@ TRACE_EVENT(mm_alloc_contig_migrate_range_info,
 		__field(unsigned long, nr_migrated)
 		__field(unsigned long, nr_reclaimed)
 		__field(unsigned long, nr_mapped)
-		__field(int, migratetype)
+		__field(acr_flags_t, alloc_flags)
 	),
 
 	TP_fast_assign(
@@ -330,17 +331,18 @@ TRACE_EVENT(mm_alloc_contig_migrate_range_info,
 		__entry->nr_migrated = nr_migrated;
 		__entry->nr_reclaimed = nr_reclaimed;
 		__entry->nr_mapped = nr_mapped;
-		__entry->migratetype = migratetype;
+		__entry->alloc_flags = alloc_flags;
 	),
 
-	TP_printk("start=0x%lx end=0x%lx migratetype=%d nr_migrated=%lu nr_reclaimed=%lu nr_mapped=%lu",
+	TP_printk("start=0x%lx end=0x%lx alloc_flags=%d nr_migrated=%lu nr_reclaimed=%lu nr_mapped=%lu",
 		  __entry->start,
 		  __entry->end,
-		  __entry->migratetype,
+		  __entry->alloc_flags,
 		  __entry->nr_migrated,
 		  __entry->nr_reclaimed,
 		  __entry->nr_mapped)
 );
+#endif
 
 TRACE_EVENT(mm_setup_per_zone_wmarks,
 
diff --git a/mm/cma.c b/mm/cma.c
index bd3772773736..c0b2630a1b81 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -822,7 +822,7 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr,
 
 		pfn = cmr->base_pfn + (bitmap_no << cma->order_per_bit);
 		mutex_lock(&cma->alloc_mutex);
-		ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA, gfp);
+		ret = alloc_contig_range(pfn, pfn + count, ACR_FLAGS_CMA, gfp);
 		mutex_unlock(&cma->alloc_mutex);
 		if (ret == 0) {
 			page = pfn_to_page(pfn);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6e3380ad1bf5..e4009a44f883 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1962,8 +1962,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 
 	/* set above range as isolated */
 	ret = start_isolate_page_range(start_pfn, end_pfn,
-				       MIGRATE_MOVABLE,
-				       MEMORY_OFFLINE | REPORT_FAILURE);
+				       PB_ISOLATE_MODE_MEM_OFFLINE);
 	if (ret) {
 		reason = "failure to isolate range";
 		goto failed_removal_pcplists_disabled;
@@ -2033,7 +2032,8 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 			goto failed_removal_isolated;
 		}
 
-		ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE);
+		ret = test_pages_isolated(start_pfn, end_pfn,
+					  PB_ISOLATE_MODE_MEM_OFFLINE);
 
 	} while (ret);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8d4b3f12b9a6..4f55f8ed65c7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6693,11 +6693,12 @@ static void alloc_contig_dump_pages(struct list_head *page_list)
 
 /*
  * [start, end) must belong to a single zone.
- * @migratetype: using migratetype to filter the type of migration in
+ * @alloc_flags: using acr_flags_t to filter the type of migration in
  *		trace_mm_alloc_contig_migrate_range_info.
  */
 static int __alloc_contig_migrate_range(struct compact_control *cc,
-		unsigned long start, unsigned long end, int migratetype)
+					unsigned long start, unsigned long end,
+					acr_flags_t alloc_flags)
 {
 	/* This function is based on compact_zone() from compaction.c. */
 	unsigned int nr_reclaimed;
@@ -6769,7 +6770,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 		putback_movable_pages(&cc->migratepages);
 	}
 
-	trace_mm_alloc_contig_migrate_range_info(start, end, migratetype,
+	trace_mm_alloc_contig_migrate_range_info(start, end, alloc_flags,
 						 total_migrated,
 						 total_reclaimed,
 						 total_mapped);
@@ -6840,10 +6841,7 @@ static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask)
  * alloc_contig_range() -- tries to allocate given range of pages
  * @start:	start PFN to allocate
  * @end:	one-past-the-last PFN to allocate
- * @migratetype:	migratetype of the underlying pageblocks (either
- *			#MIGRATE_MOVABLE or #MIGRATE_CMA).  All pageblocks
- *			in range must have the same migratetype and it must
- *			be either of the two.
+ * @alloc_flags:	allocation information
  * @gfp_mask:	GFP mask. Node/zone/placement hints are ignored; only some
  *		action and reclaim modifiers are supported. Reclaim modifiers
  *		control allocation behavior during compaction/migration/reclaim.
@@ -6860,7 +6858,7 @@ static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask)
  * need to be freed with free_contig_range().
  */
 int alloc_contig_range_noprof(unsigned long start, unsigned long end,
-		       unsigned migratetype, gfp_t gfp_mask)
+			      acr_flags_t alloc_flags, gfp_t gfp_mask)
 {
 	unsigned long outer_start, outer_end;
 	int ret = 0;
@@ -6875,6 +6873,9 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
 		.alloc_contig = true,
 	};
 	INIT_LIST_HEAD(&cc.migratepages);
+	enum pb_isolate_mode mode = (alloc_flags & ACR_FLAGS_CMA) ?
+					    PB_ISOLATE_MODE_CMA_ALLOC :
+					    PB_ISOLATE_MODE_OTHER;
 
 	gfp_mask = current_gfp_context(gfp_mask);
 	if (__alloc_contig_verify_gfp_mask(gfp_mask, (gfp_t *)&cc.gfp_mask))
@@ -6901,7 +6902,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
 	 * put back to page allocator so that buddy can use them.
 	 */
 
-	ret = start_isolate_page_range(start, end, migratetype, 0);
+	ret = start_isolate_page_range(start, end, mode);
 	if (ret)
 		goto done;
 
@@ -6917,7 +6918,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
 	 * allocated.  So, if we fall through be sure to clear ret so that
 	 * -EBUSY is not accidentally used or returned to caller.
 	 */
-	ret = __alloc_contig_migrate_range(&cc, start, end, migratetype);
+	ret = __alloc_contig_migrate_range(&cc, start, end, alloc_flags);
 	if (ret && ret != -EBUSY)
 		goto done;
 
@@ -6951,7 +6952,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
 	outer_start = find_large_buddy(start);
 
 	/* Make sure the range is really isolated. */
-	if (test_pages_isolated(outer_start, end, 0)) {
+	if (test_pages_isolated(outer_start, end, mode)) {
 		ret = -EBUSY;
 		goto done;
 	}
@@ -6994,8 +6995,8 @@ static int __alloc_contig_pages(unsigned long start_pfn,
 {
 	unsigned long end_pfn = start_pfn + nr_pages;
 
-	return alloc_contig_range_noprof(start_pfn, end_pfn, MIGRATE_MOVABLE,
-				   gfp_mask);
+	return alloc_contig_range_noprof(start_pfn, end_pfn, ACR_FLAGS_NONE,
+					 gfp_mask);
 }
 
 static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 1edfef408faf..ece3bfc56bcd 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -31,7 +31,7 @@
  *
  */
 static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long end_pfn,
-				int migratetype, int flags)
+				enum pb_isolate_mode mode)
 {
 	struct page *page = pfn_to_page(start_pfn);
 	struct zone *zone = page_zone(page);
@@ -46,7 +46,7 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e
 		 * isolate CMA pageblocks even when they are not movable in fact
 		 * so consider them movable here.
 		 */
-		if (is_migrate_cma(migratetype))
+		if (mode == PB_ISOLATE_MODE_CMA_ALLOC)
 			return NULL;
 
 		return page;
@@ -117,7 +117,7 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e
 		 * The HWPoisoned page may be not in buddy system, and
 		 * page_count() is not 0.
 		 */
-		if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
+		if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) && PageHWPoison(page))
 			continue;
 
 		/*
@@ -130,7 +130,7 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e
 		 * move these pages that still have a reference count > 0.
 		 * (false negatives in this function only)
 		 */
-		if ((flags & MEMORY_OFFLINE) && PageOffline(page))
+		if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) && PageOffline(page))
 			continue;
 
 		if (__PageMovable(page) || PageLRU(page))
@@ -151,7 +151,7 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e
  * present in [start_pfn, end_pfn). The pageblock must intersect with
  * [start_pfn, end_pfn).
  */
-static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags,
+static int set_migratetype_isolate(struct page *page, enum pb_isolate_mode mode,
 			unsigned long start_pfn, unsigned long end_pfn)
 {
 	struct zone *zone = page_zone(page);
@@ -186,7 +186,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
 				  end_pfn);
 
 	unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end,
-			migratetype, isol_flags);
+			mode);
 	if (!unmovable) {
 		if (!pageblock_isolate_and_move_free_pages(zone, page)) {
 			spin_unlock_irqrestore(&zone->lock, flags);
@@ -198,7 +198,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
 	}
 
 	spin_unlock_irqrestore(&zone->lock, flags);
-	if (isol_flags & REPORT_FAILURE) {
+	if (mode == PB_ISOLATE_MODE_MEM_OFFLINE) {
 		/*
 		 * printk() with zone->lock held will likely trigger a
 		 * lockdep splat, so defer it here.
@@ -292,11 +292,10 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
  * isolate_single_pageblock() -- tries to isolate a pageblock that might be
  * within a free or in-use page.
  * @boundary_pfn:		pageblock-aligned pfn that a page might cross
- * @flags:			isolation flags
+ * @mode:			isolation mode
  * @isolate_before:	isolate the pageblock before the boundary_pfn
  * @skip_isolation:	the flag to skip the pageblock isolation in second
  *			isolate_single_pageblock()
- * @migratetype:	migrate type to set in error recovery.
  *
  * Free and in-use pages can be as big as MAX_PAGE_ORDER and contain more than one
  * pageblock. When not all pageblocks within a page are isolated at the same
@@ -311,8 +310,9 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
  * either. The function handles this by splitting the free page or migrating
  * the in-use page then splitting the free page.
  */
-static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
-		bool isolate_before, bool skip_isolation, int migratetype)
+static int isolate_single_pageblock(unsigned long boundary_pfn,
+			enum pb_isolate_mode mode, bool isolate_before,
+			bool skip_isolation)
 {
 	unsigned long start_pfn;
 	unsigned long isolate_pageblock;
@@ -338,12 +338,11 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
 				      zone->zone_start_pfn);
 
 	if (skip_isolation) {
-		int mt __maybe_unused = get_pageblock_migratetype(pfn_to_page(isolate_pageblock));
-
-		VM_BUG_ON(!is_migrate_isolate(mt));
+		VM_BUG_ON(!get_pageblock_isolate(pfn_to_page(isolate_pageblock)));
 	} else {
-		ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), migratetype,
-				flags, isolate_pageblock, isolate_pageblock + pageblock_nr_pages);
+		ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock),
+				mode, isolate_pageblock,
+				isolate_pageblock + pageblock_nr_pages);
 
 		if (ret)
 			return ret;
@@ -441,14 +440,7 @@ failed:
  * start_isolate_page_range() - mark page range MIGRATE_ISOLATE
  * @start_pfn:		The first PFN of the range to be isolated.
  * @end_pfn:		The last PFN of the range to be isolated.
- * @migratetype:	Migrate type to set in error recovery.
- * @flags:		The following flags are allowed (they can be combined in
- *			a bit mask)
- *			MEMORY_OFFLINE - isolate to offline (!allocate) memory
- *					 e.g., skip over PageHWPoison() pages
- *					 and PageOffline() pages.
- *			REPORT_FAILURE - report details about the failure to
- *			isolate the range
+ * @mode:		isolation mode
  *
  * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in
  * the range will never be allocated. Any free pages and pages freed in the
@@ -481,7 +473,7 @@ failed:
  * Return: 0 on success and -EBUSY if any part of range cannot be isolated.
  */
 int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
-			     int migratetype, int flags)
+			     enum pb_isolate_mode mode)
 {
 	unsigned long pfn;
 	struct page *page;
@@ -492,8 +484,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 	bool skip_isolation = false;
 
 	/* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */
-	ret = isolate_single_pageblock(isolate_start, flags, false,
-			skip_isolation, migratetype);
+	ret = isolate_single_pageblock(isolate_start, mode, false,
+			skip_isolation);
 	if (ret)
 		return ret;
 
@@ -501,8 +493,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 		skip_isolation = true;
 
 	/* isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock */
-	ret = isolate_single_pageblock(isolate_end, flags, true,
-			skip_isolation, migratetype);
+	ret = isolate_single_pageblock(isolate_end, mode, true, skip_isolation);
 	if (ret) {
 		unset_migratetype_isolate(pfn_to_page(isolate_start));
 		return ret;
@@ -513,8 +504,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 	     pfn < isolate_end - pageblock_nr_pages;
 	     pfn += pageblock_nr_pages) {
 		page = __first_valid_page(pfn, pageblock_nr_pages);
-		if (page && set_migratetype_isolate(page, migratetype, flags,
-					start_pfn, end_pfn)) {
+		if (page && set_migratetype_isolate(page, mode, start_pfn,
+					end_pfn)) {
 			undo_isolate_page_range(isolate_start, pfn);
 			unset_migratetype_isolate(
 				pfn_to_page(isolate_end - pageblock_nr_pages));
@@ -556,7 +547,7 @@ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn)
  */
 static unsigned long
 __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
-				  int flags)
+				  enum pb_isolate_mode mode)
 {
 	struct page *page;
 
@@ -569,11 +560,12 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
 			 * simple way to verify that as VM_BUG_ON(), though.
 			 */
 			pfn += 1 << buddy_order(page);
-		else if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
+		else if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) &&
+			 PageHWPoison(page))
 			/* A HWPoisoned page cannot be also PageBuddy */
 			pfn++;
-		else if ((flags & MEMORY_OFFLINE) && PageOffline(page) &&
-			 !page_count(page))
+		else if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) &&
+			 PageOffline(page) && !page_count(page))
 			/*
 			 * The responsible driver agreed to skip PageOffline()
 			 * pages when offlining memory by dropping its
@@ -591,11 +583,11 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
  * test_pages_isolated - check if pageblocks in range are isolated
  * @start_pfn:		The first PFN of the isolated range
  * @end_pfn:		The first PFN *after* the isolated range
- * @isol_flags:		Testing mode flags
+ * @mode:		Testing mode
  *
  * This tests if all in the specified range are free.
  *
- * If %MEMORY_OFFLINE is specified in @flags, it will consider
+ * If %PB_ISOLATE_MODE_MEM_OFFLINE specified in @mode, it will consider
  * poisoned and offlined pages free as well.
  *
  * Caller must ensure the requested range doesn't span zones.
@@ -603,7 +595,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
  * Returns 0 if true, -EBUSY if one or more pages are in use.
  */
 int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
-			int isol_flags)
+			enum pb_isolate_mode mode)
 {
 	unsigned long pfn, flags;
 	struct page *page;
@@ -639,7 +631,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
 	/* Check all pages are free or marked as ISOLATED */
 	zone = page_zone(page);
 	spin_lock_irqsave(&zone->lock, flags);
-	pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn, isol_flags);
+	pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn, mode);
 	spin_unlock_irqrestore(&zone->lock, flags);
 
 	ret = pfn < end_pfn ? -EBUSY : 0;
-- 
cgit v1.2.3


From eff41389d8249a1a5a67faa440255ed8e526803a Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Fri, 27 Jun 2025 12:07:07 -0400
Subject: mm/hugetlb: remove prepare_hugepage_range()

Only mips and loongarch implemented this API; however, all it did was
check against stack overflow for either len or addr.  That's already
done in the arch's arch_get_unmapped_area*() functions, even though the
checks may not be 100% identical.

For example, for both of the architectures, there will be a trivial
difference in how the stack top is defined.  The old code uses STACK_TOP,
which may be slightly smaller than TASK_SIZE on either of them, but the
hope is that this shouldn't be a problem.

This means the whole API is pretty much obsolete by now, so remove it
completely.

Link: https://lkml.kernel.org/r/20250627160707.2124580-1-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Jann Horn <jannh@google.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Pedro Falcato <pfalcato@suse.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/loongarch/include/asm/hugetlb.h | 14 --------------
 arch/mips/include/asm/hugetlb.h      | 14 --------------
 fs/hugetlbfs/inode.c                 |  8 ++------
 include/asm-generic/hugetlb.h        |  8 --------
 include/linux/hugetlb.h              |  6 ------
 5 files changed, 2 insertions(+), 48 deletions(-)

(limited to 'include')

diff --git a/arch/loongarch/include/asm/hugetlb.h b/arch/loongarch/include/asm/hugetlb.h
index 4dc4b3e04225..ab68b594f889 100644
--- a/arch/loongarch/include/asm/hugetlb.h
+++ b/arch/loongarch/include/asm/hugetlb.h
@@ -10,20 +10,6 @@
 
 uint64_t pmd_to_entrylo(unsigned long pmd_val);
 
-#define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE
-static inline int prepare_hugepage_range(struct file *file,
-					 unsigned long addr,
-					 unsigned long len)
-{
-	unsigned long task_size = STACK_TOP;
-
-	if (len > task_size)
-		return -ENOMEM;
-	if (task_size - len < addr)
-		return -EINVAL;
-	return 0;
-}
-
 #define __HAVE_ARCH_HUGE_PTE_CLEAR
 static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
 				  pte_t *ptep, unsigned long sz)
diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h
index fbc71ddcf0f6..8c460ce01ffe 100644
--- a/arch/mips/include/asm/hugetlb.h
+++ b/arch/mips/include/asm/hugetlb.h
@@ -11,20 +11,6 @@
 
 #include <asm/page.h>
 
-#define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE
-static inline int prepare_hugepage_range(struct file *file,
-					 unsigned long addr,
-					 unsigned long len)
-{
-	unsigned long task_size = STACK_TOP;
-
-	if (len > task_size)
-		return -ENOMEM;
-	if (task_size - len < addr)
-		return -EINVAL;
-	return 0;
-}
-
 #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR
 static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
 					    unsigned long addr, pte_t *ptep,
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 00b2d1a032fd..81a6acddd690 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -179,12 +179,8 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 
 	if (len & ~huge_page_mask(h))
 		return -EINVAL;
-	if (flags & MAP_FIXED) {
-		if (addr & ~huge_page_mask(h))
-			return -EINVAL;
-		if (prepare_hugepage_range(file, addr, len))
-			return -EINVAL;
-	}
+	if ((flags & MAP_FIXED) && (addr & ~huge_page_mask(h)))
+		return -EINVAL;
 	if (addr)
 		addr0 = ALIGN(addr, huge_page_size(h));
 
diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h
index 3e0a8fe9b108..4bce4f07f44f 100644
--- a/include/asm-generic/hugetlb.h
+++ b/include/asm-generic/hugetlb.h
@@ -114,14 +114,6 @@ static inline int huge_pte_none_mostly(pte_t pte)
 }
 #endif
 
-#ifndef __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE
-static inline int prepare_hugepage_range(struct file *file,
-		unsigned long addr, unsigned long len)
-{
-	return 0;
-}
-#endif
-
 #ifndef __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
 static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
 		unsigned long addr, pte_t *ptep)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index c6c87eae4a8d..474de8e2a8f2 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -359,12 +359,6 @@ static inline void hugetlb_show_meminfo_node(int nid)
 {
 }
 
-static inline int prepare_hugepage_range(struct file *file,
-				unsigned long addr, unsigned long len)
-{
-	return -EINVAL;
-}
-
 static inline void hugetlb_vma_lock_read(struct vm_area_struct *vma)
 {
 }
-- 
cgit v1.2.3


From 5bd3b163e374462c05c055ff091582d757929d3f Mon Sep 17 00:00:00 2001
From: Xuanye Liu <liuqiye2025@163.com>
Date: Wed, 2 Jul 2025 15:12:35 +0800
Subject: mm: fix spelling issue in swap.h

recliam -> reclaim

Link: https://lkml.kernel.org/r/20250702071235.212794-1-liuqiye2025@163.com
Signed-off-by: Xuanye Liu <liuqiye2025@163.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/swap.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index bc0e1c275fc0..a49be950c485 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -415,7 +415,7 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 #define MIN_SWAPPINESS 0
 #define MAX_SWAPPINESS 200
 
-/* Just recliam from anon folios in proactive memory reclaim */
+/* Just reclaim from anon folios in proactive memory reclaim */
 #define SWAPPINESS_ANON_ONLY (MAX_SWAPPINESS + 1)
 
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
-- 
cgit v1.2.3


From 526f36f3f47b9ad29ffb1bf668b7f295287ee11b Mon Sep 17 00:00:00 2001
From: Dev Jain <dev.jain@arm.com>
Date: Thu, 3 Jul 2025 12:03:38 +0530
Subject: maple tree: add some comments

Add comments explaining the fields for maple_metadata, since "end" is
ambiguous and "gap" can be confused as the largest gap, whereas it is
actually the offset of the largest gap.

Add a comment for mas_ascend() to explain whose min and max we are trying
to find.  Explain that, for example, if we are already on offset zero,
then the parent min is mas->min; otherwise we need to walk up to find the
implied pivot min.

Link: https://lkml.kernel.org/r/20250703063338.51509-1-dev.jain@arm.com
Signed-off-by: Dev Jain <dev.jain@arm.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/maple_tree.h | 4 ++--
 lib/maple_tree.c           | 8 +++++++-
 2 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index 9ef129038224..bafe143b1f78 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -75,8 +75,8 @@
  * searching for gaps or any other code that needs to find the end of the data.
  */
 struct maple_metadata {
-	unsigned char end;
-	unsigned char gap;
+	unsigned char end;	/* end of data */
+	unsigned char gap;	/* offset of largest gap */
 };
 
 /*
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index 0e85e92c5375..b4ee2d29d7a9 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -1053,7 +1053,7 @@ static inline void mte_set_gap(const struct maple_enode *mn,
  * mas_ascend() - Walk up a level of the tree.
  * @mas: The maple state
  *
- * Sets the @mas->max and @mas->min to the correct values when walking up.  This
+ * Sets the @mas->max and @mas->min for the parent node of mas->node.  This
  * may cause several levels of walking up to find the correct min and max.
  * May find a dead node which will cause a premature return.
  * Return: 1 on dead node, 0 otherwise
@@ -1098,6 +1098,12 @@ static int mas_ascend(struct ma_state *mas)
 
 	min = 0;
 	max = ULONG_MAX;
+
+	/*
+	 * !mas->offset implies that parent node min == mas->min.
+	 * mas->offset > 0 implies that we need to walk up to find the
+	 * implied pivot min.
+	 */
 	if (!mas->offset) {
 		min = mas->min;
 		set_min = true;
-- 
cgit v1.2.3


From 15504b1163007bbfbd9a63460d5c14737c16e96d Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 4 Jul 2025 12:24:56 +0200
Subject: mm/balloon_compaction: convert balloon_page_delete() to
 balloon_page_finalize()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Let's move the removal of the page from the balloon list into the single
caller, to remove the dependency on the PG_isolated flag and clarify
locking requirements.

Note that for now, balloon_page_delete() was used on two paths:

(1) Removing a page from the balloon for deflation through
    balloon_page_list_dequeue()
(2) Removing an isolated page from the balloon for migration in the
    per-driver migration handlers. Isolated pages were already removed from
    the balloon list during isolation.

So instead of relying on the flag, we can just distinguish both cases
directly and handle it accordingly in the caller.

We'll shuffle the operations a bit such that they logically make more
sense (e.g., remove from the list before clearing flags).

In balloon migration functions we can now move the balloon_page_finalize()
out of the balloon lock and perform the finalization just before dropping
the balloon reference.

Document that the page lock is currently required when modifying the
movability aspects of a page; hopefully we can soon decouple this from the
page lock.

Link: https://lkml.kernel.org/r/20250704102524.326966-3-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Eugenio Pérez <eperezma@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Gregory Price <gourry@gourry.net>
Cc: Harry Yoo <harry.yoo@oracle.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jerrin Shaji George <jerrin.shaji-george@broadcom.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/powerpc/platforms/pseries/cmm.c |  2 +-
 drivers/misc/vmw_balloon.c           |  3 +--
 drivers/virtio/virtio_balloon.c      |  4 +---
 include/linux/balloon_compaction.h   | 43 ++++++++++++++----------------------
 mm/balloon_compaction.c              |  3 ++-
 5 files changed, 21 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c
index 5f4037c1d7fe..5e0a718d1be7 100644
--- a/arch/powerpc/platforms/pseries/cmm.c
+++ b/arch/powerpc/platforms/pseries/cmm.c
@@ -532,7 +532,6 @@ static int cmm_migratepage(struct balloon_dev_info *b_dev_info,
 
 	spin_lock_irqsave(&b_dev_info->pages_lock, flags);
 	balloon_page_insert(b_dev_info, newpage);
-	balloon_page_delete(page);
 	b_dev_info->isolated_pages--;
 	spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
 
@@ -542,6 +541,7 @@ static int cmm_migratepage(struct balloon_dev_info *b_dev_info,
 	 */
 	plpar_page_set_active(page);
 
+	balloon_page_finalize(page);
 	/* balloon page list reference */
 	put_page(page);
 
diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c
index c817d8c21641..6653fc53c951 100644
--- a/drivers/misc/vmw_balloon.c
+++ b/drivers/misc/vmw_balloon.c
@@ -1778,8 +1778,7 @@ static int vmballoon_migratepage(struct balloon_dev_info *b_dev_info,
 	 * @pages_lock . We keep holding @comm_lock since we will need it in a
 	 * second.
 	 */
-	balloon_page_delete(page);
-
+	balloon_page_finalize(page);
 	put_page(page);
 
 	/* Inflate */
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 89da052f4f68..e299e18346a3 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -866,15 +866,13 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info,
 	tell_host(vb, vb->inflate_vq);
 
 	/* balloon's page migration 2nd step -- deflate "page" */
-	spin_lock_irqsave(&vb_dev_info->pages_lock, flags);
-	balloon_page_delete(page);
-	spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags);
 	vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE;
 	set_page_pfns(vb, vb->pfns, page);
 	tell_host(vb, vb->deflate_vq);
 
 	mutex_unlock(&vb->balloon_lock);
 
+	balloon_page_finalize(page);
 	put_page(page); /* balloon reference */
 
 	return MIGRATEPAGE_SUCCESS;
diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
index 5ca2d5699620..b9f19da37b08 100644
--- a/include/linux/balloon_compaction.h
+++ b/include/linux/balloon_compaction.h
@@ -97,27 +97,6 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon,
 	list_add(&page->lru, &balloon->pages);
 }
 
-/*
- * balloon_page_delete - delete a page from balloon's page list and clear
- *			 the page->private assignement accordingly.
- * @page    : page to be released from balloon's page list
- *
- * Caller must ensure the page is locked and the spin_lock protecting balloon
- * pages list is held before deleting a page from the balloon device.
- */
-static inline void balloon_page_delete(struct page *page)
-{
-	__ClearPageOffline(page);
-	__ClearPageMovable(page);
-	set_page_private(page, 0);
-	/*
-	 * No touch page.lru field once @page has been isolated
-	 * because VM is using the field.
-	 */
-	if (!PageIsolated(page))
-		list_del(&page->lru);
-}
-
 /*
  * balloon_page_device - get the b_dev_info descriptor for the balloon device
  *			 that enqueues the given page.
@@ -141,12 +120,6 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon,
 	list_add(&page->lru, &balloon->pages);
 }
 
-static inline void balloon_page_delete(struct page *page)
-{
-	__ClearPageOffline(page);
-	list_del(&page->lru);
-}
-
 static inline gfp_t balloon_mapping_gfp_mask(void)
 {
 	return GFP_HIGHUSER;
@@ -154,6 +127,22 @@ static inline gfp_t balloon_mapping_gfp_mask(void)
 
 #endif /* CONFIG_BALLOON_COMPACTION */
 
+/*
+ * balloon_page_finalize - prepare a balloon page that was removed from the
+ *			   balloon list for release to the page allocator
+ * @page: page to be released to the page allocator
+ *
+ * Caller must ensure that the page is locked.
+ */
+static inline void balloon_page_finalize(struct page *page)
+{
+	if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) {
+		__ClearPageMovable(page);
+		set_page_private(page, 0);
+	}
+	__ClearPageOffline(page);
+}
+
 /*
  * balloon_page_push - insert a page into a page list.
  * @head : pointer to list
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index fcb60233aa35..ec176bdb8a78 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -94,7 +94,8 @@ size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info,
 		if (!trylock_page(page))
 			continue;
 
-		balloon_page_delete(page);
+		list_del(&page->lru);
+		balloon_page_finalize(page);
 		__count_vm_event(BALLOON_DEFLATE);
 		list_add(&page->lru, pages);
 		unlock_page(page);
-- 
cgit v1.2.3


From 65aabd88dffda68639808e0827cfef624a1cd55f Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 4 Jul 2025 12:24:59 +0200
Subject: mm/balloon_compaction: make PageOffline sticky until the page is
 freed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Let the page freeing code handle clearing the page type.  Being able to
identify balloon pages until actually freed is a requirement for upcoming
movable_ops migration changes.

Link: https://lkml.kernel.org/r/20250704102524.326966-6-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Acked-by: Zi Yan <ziy@nvidia.com>
Acked-by: Harry Yoo <harry.yoo@oracle.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Eugenio Pérez <eperezma@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Gregory Price <gourry@gourry.net>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jerrin Shaji George <jerrin.shaji-george@broadcom.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/balloon_compaction.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
index b9f19da37b08..bfc6e50bd004 100644
--- a/include/linux/balloon_compaction.h
+++ b/include/linux/balloon_compaction.h
@@ -140,7 +140,7 @@ static inline void balloon_page_finalize(struct page *page)
 		__ClearPageMovable(page);
 		set_page_private(page, 0);
 	}
-	__ClearPageOffline(page);
+	/* PageOffline is sticky until the page is freed to the buddy. */
 }
 
 /*
-- 
cgit v1.2.3


From 6ef0c1976b8fab938e732c2fb751fa8965153b2e Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 4 Jul 2025 12:25:01 +0200
Subject: mm/migrate: rename isolate_movable_page() to
 isolate_movable_ops_page()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

...  and start moving back to per-page things that will absolutely not be
folio things in the future.  Add documentation and a comment that the
remaining folio stuff (lock, refcount) will have to be reworked as well.

While at it, convert the VM_BUG_ON() into a WARN_ON_ONCE() and handle it
gracefully (relevant with further changes), and convert a WARN_ON_ONCE()
into a VM_WARN_ON_ONCE_PAGE().

Note that we will leave anything that needs a rework (lock, refcount,
->lru) to be using folios for now: that perfectly highlights the
problematic bits.

Link: https://lkml.kernel.org/r/20250704102524.326966-8-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Eugenio Pérez <eperezma@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Gregory Price <gourry@gourry.net>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jerrin Shaji George <jerrin.shaji-george@broadcom.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/migrate.h |  4 ++--
 mm/compaction.c         |  2 +-
 mm/migrate.c            | 39 +++++++++++++++++++++++++++++----------
 3 files changed, 32 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index aaa2114498d6..c0ec7422837b 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -69,7 +69,7 @@ int migrate_pages(struct list_head *l, new_folio_t new, free_folio_t free,
 		  unsigned long private, enum migrate_mode mode, int reason,
 		  unsigned int *ret_succeeded);
 struct folio *alloc_migration_target(struct folio *src, unsigned long private);
-bool isolate_movable_page(struct page *page, isolate_mode_t mode);
+bool isolate_movable_ops_page(struct page *page, isolate_mode_t mode);
 bool isolate_folio_to_list(struct folio *folio, struct list_head *list);
 
 int migrate_huge_page_move_mapping(struct address_space *mapping,
@@ -90,7 +90,7 @@ static inline int migrate_pages(struct list_head *l, new_folio_t new,
 static inline struct folio *alloc_migration_target(struct folio *src,
 		unsigned long private)
 	{ return NULL; }
-static inline bool isolate_movable_page(struct page *page, isolate_mode_t mode)
+static inline bool isolate_movable_ops_page(struct page *page, isolate_mode_t mode)
 	{ return false; }
 static inline bool isolate_folio_to_list(struct folio *folio, struct list_head *list)
 	{ return false; }
diff --git a/mm/compaction.c b/mm/compaction.c
index 3925cb61dbb8..17455c5a4be0 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1093,7 +1093,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 					locked = NULL;
 				}
 
-				if (isolate_movable_page(page, mode)) {
+				if (isolate_movable_ops_page(page, mode)) {
 					folio = page_folio(page);
 					goto isolate_success;
 				}
diff --git a/mm/migrate.c b/mm/migrate.c
index 208d2d4a2f8d..2e648d75248e 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -51,8 +51,26 @@
 #include "internal.h"
 #include "swap.h"
 
-bool isolate_movable_page(struct page *page, isolate_mode_t mode)
+/**
+ * isolate_movable_ops_page - isolate a movable_ops page for migration
+ * @page: The page.
+ * @mode: The isolation mode.
+ *
+ * Try to isolate a movable_ops page for migration. Will fail if the page is
+ * not a movable_ops page, if the page is already isolated for migration
+ * or if the page was just released by its owner.
+ *
+ * Once isolated, the page cannot get freed until it is either putback
+ * or migrated.
+ *
+ * Returns true if isolation succeeded, otherwise false.
+ */
+bool isolate_movable_ops_page(struct page *page, isolate_mode_t mode)
 {
+	/*
+	 * TODO: these pages will not be folios in the future. All
+	 * folio dependencies will have to be removed.
+	 */
 	struct folio *folio = folio_get_nontail_page(page);
 	const struct movable_operations *mops;
 
@@ -73,7 +91,7 @@ bool isolate_movable_page(struct page *page, isolate_mode_t mode)
 	 * we use non-atomic bitops on newly allocated page flags so
 	 * unconditionally grabbing the lock ruins page's owner side.
 	 */
-	if (unlikely(!__folio_test_movable(folio)))
+	if (unlikely(!__PageMovable(page)))
 		goto out_putfolio;
 
 	/*
@@ -90,18 +108,19 @@ bool isolate_movable_page(struct page *page, isolate_mode_t mode)
 	if (unlikely(!folio_trylock(folio)))
 		goto out_putfolio;
 
-	if (!folio_test_movable(folio) || folio_test_isolated(folio))
+	if (!PageMovable(page) || PageIsolated(page))
 		goto out_no_isolated;
 
-	mops = folio_movable_ops(folio);
-	VM_BUG_ON_FOLIO(!mops, folio);
+	mops = page_movable_ops(page);
+	if (WARN_ON_ONCE(!mops))
+		goto out_no_isolated;
 
-	if (!mops->isolate_page(&folio->page, mode))
+	if (!mops->isolate_page(page, mode))
 		goto out_no_isolated;
 
 	/* Driver shouldn't use the isolated flag */
-	WARN_ON_ONCE(folio_test_isolated(folio));
-	folio_set_isolated(folio);
+	VM_WARN_ON_ONCE_PAGE(PageIsolated(page), page);
+	SetPageIsolated(page);
 	folio_unlock(folio);
 
 	return true;
@@ -175,8 +194,8 @@ bool isolate_folio_to_list(struct folio *folio, struct list_head *list)
 	if (lru)
 		isolated = folio_isolate_lru(folio);
 	else
-		isolated = isolate_movable_page(&folio->page,
-						ISOLATE_UNEVICTABLE);
+		isolated = isolate_movable_ops_page(&folio->page,
+						    ISOLATE_UNEVICTABLE);
 
 	if (!isolated)
 		return false;
-- 
cgit v1.2.3


From 07e5355eeead3a715bb48ffe13499dd3d0178e52 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 4 Jul 2025 12:25:04 +0200
Subject: mm/migrate: remove folio_test_movable() and folio_movable_ops()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Folios will have nothing to do with movable_ops page migration.  These
functions are now unused, so let's remove them.

Note that __folio_test_movable() and friends will be removed separately
next, after more rework.

Link: https://lkml.kernel.org/r/20250704102524.326966-11-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Eugenio Pérez <eperezma@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Gregory Price <gourry@gourry.net>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jerrin Shaji George <jerrin.shaji-george@broadcom.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/migrate.h | 14 --------------
 1 file changed, 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index c0ec7422837b..c99a00d4ca27 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -118,20 +118,6 @@ static inline void __ClearPageMovable(struct page *page)
 }
 #endif
 
-static inline bool folio_test_movable(struct folio *folio)
-{
-	return PageMovable(&folio->page);
-}
-
-static inline
-const struct movable_operations *folio_movable_ops(struct folio *folio)
-{
-	VM_BUG_ON(!__folio_test_movable(folio));
-
-	return (const struct movable_operations *)
-		((unsigned long)folio->mapping - PAGE_MAPPING_MOVABLE);
-}
-
 static inline
 const struct movable_operations *page_movable_ops(struct page *page)
 {
-- 
cgit v1.2.3


From 3544c4faccb8f0867bc65f8007ee70bfb5054305 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 4 Jul 2025 12:25:07 +0200
Subject: mm/balloon_compaction: stop using __ClearPageMovable()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We can just look at the balloon device (stored in page->private), to see
if the page is still part of the balloon.

As isolated balloon pages cannot get released (they are taken off the
balloon list while isolated), we don't have to worry about this case in
the putback and migration callback.  Add a WARN_ON_ONCE for now.

Link: https://lkml.kernel.org/r/20250704102524.326966-14-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Eugenio Pérez <eperezma@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Gregory Price <gourry@gourry.net>
Cc: Harry Yoo <harry.yoo@oracle.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jerrin Shaji George <jerrin.shaji-george@broadcom.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/balloon_compaction.h |  4 +---
 mm/balloon_compaction.c            | 11 +++++++++++
 2 files changed, 12 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
index bfc6e50bd004..9bce8e9f5018 100644
--- a/include/linux/balloon_compaction.h
+++ b/include/linux/balloon_compaction.h
@@ -136,10 +136,8 @@ static inline gfp_t balloon_mapping_gfp_mask(void)
  */
 static inline void balloon_page_finalize(struct page *page)
 {
-	if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) {
-		__ClearPageMovable(page);
+	if (IS_ENABLED(CONFIG_BALLOON_COMPACTION))
 		set_page_private(page, 0);
-	}
 	/* PageOffline is sticky until the page is freed to the buddy. */
 }
 
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index ec176bdb8a78..e4f1a122d786 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -206,6 +206,9 @@ static bool balloon_page_isolate(struct page *page, isolate_mode_t mode)
 	struct balloon_dev_info *b_dev_info = balloon_page_device(page);
 	unsigned long flags;
 
+	if (!b_dev_info)
+		return false;
+
 	spin_lock_irqsave(&b_dev_info->pages_lock, flags);
 	list_del(&page->lru);
 	b_dev_info->isolated_pages++;
@@ -219,6 +222,10 @@ static void balloon_page_putback(struct page *page)
 	struct balloon_dev_info *b_dev_info = balloon_page_device(page);
 	unsigned long flags;
 
+	/* Isolated balloon pages cannot get deflated. */
+	if (WARN_ON_ONCE(!b_dev_info))
+		return;
+
 	spin_lock_irqsave(&b_dev_info->pages_lock, flags);
 	list_add(&page->lru, &b_dev_info->pages);
 	b_dev_info->isolated_pages--;
@@ -234,6 +241,10 @@ static int balloon_page_migrate(struct page *newpage, struct page *page,
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
 
+	/* Isolated balloon pages cannot get deflated. */
+	if (WARN_ON_ONCE(!balloon))
+		return -EAGAIN;
+
 	return balloon->migratepage(balloon, newpage, page, mode);
 }
 
-- 
cgit v1.2.3


From 34727dee04994c8ceb1bb8a927af0a88e52e103c Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 4 Jul 2025 12:25:08 +0200
Subject: mm/migrate: remove __ClearPageMovable()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Unused, let's remove it.

The Chinese docs in Documentation/translations/zh_CN/mm/page_migration.rst
still mention it, but that whole document is bound to become outdated anyway
and will have to be updated by somebody who actually speaks that language.

Link: https://lkml.kernel.org/r/20250704102524.326966-15-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Eugenio Pérez <eperezma@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Gregory Price <gourry@gourry.net>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jerrin Shaji George <jerrin.shaji-george@broadcom.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/migrate.h |  8 ++------
 mm/compaction.c         | 11 -----------
 2 files changed, 2 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index c99a00d4ca27..6eeda8eb1e0d 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -35,8 +35,8 @@ struct migration_target_control;
  * @src page.  The driver should copy the contents of the
  * @src page to the @dst page and set up the fields of @dst page.
  * Both pages are locked.
- * If page migration is successful, the driver should call
- * __ClearPageMovable(@src) and return MIGRATEPAGE_SUCCESS.
+ * If page migration is successful, the driver should
+ * return MIGRATEPAGE_SUCCESS.
  * If the driver cannot migrate the page at the moment, it can return
  * -EAGAIN.  The VM interprets this as a temporary migration failure and
  * will retry it later.  Any other error value is a permanent migration
@@ -106,16 +106,12 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
 #ifdef CONFIG_COMPACTION
 bool PageMovable(struct page *page);
 void __SetPageMovable(struct page *page, const struct movable_operations *ops);
-void __ClearPageMovable(struct page *page);
 #else
 static inline bool PageMovable(struct page *page) { return false; }
 static inline void __SetPageMovable(struct page *page,
 		const struct movable_operations *ops)
 {
 }
-static inline void __ClearPageMovable(struct page *page)
-{
-}
 #endif
 
 static inline
diff --git a/mm/compaction.c b/mm/compaction.c
index 17455c5a4be0..889ec696ba96 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -137,17 +137,6 @@ void __SetPageMovable(struct page *page, const struct movable_operations *mops)
 }
 EXPORT_SYMBOL(__SetPageMovable);
 
-void __ClearPageMovable(struct page *page)
-{
-	VM_BUG_ON_PAGE(!PageMovable(page), page);
-	/*
-	 * This page still has the type of a movable page, but it's
-	 * actually not movable any more.
-	 */
-	page->mapping = (void *)PAGE_MAPPING_MOVABLE;
-}
-EXPORT_SYMBOL(__ClearPageMovable);
-
 /* Do not skip compaction more than 64 times */
 #define COMPACT_MAX_DEFER_SHIFT 6
 
-- 
cgit v1.2.3


From 22d103aef090dc688a88881fb955376dec1228d5 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 4 Jul 2025 12:25:09 +0200
Subject: mm/migration: remove PageMovable()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, if __ClearPageMovable() were invoked on a page, the page would
keep its movable type, so __PageMovable() would still have returned true,
but because the movable_ops pointer was cleared, PageMovable() would have
returned false.

With __ClearPageMovable() gone, the two are exactly equivalent.

So we can replace PageMovable() checks by __PageMovable().  In fact,
__PageMovable() cannot change until a page is freed, so we can turn some
PageMovable() into sanity checks for __PageMovable().

Link: https://lkml.kernel.org/r/20250704102524.326966-16-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Eugenio Pérez <eperezma@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Gregory Price <gourry@gourry.net>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jerrin Shaji George <jerrin.shaji-george@broadcom.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/migrate.h |  2 --
 mm/compaction.c         | 15 ---------------
 mm/migrate.c            | 18 ++++++++++--------
 3 files changed, 10 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 6eeda8eb1e0d..25659a685e2a 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -104,10 +104,8 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
 #endif /* CONFIG_MIGRATION */
 
 #ifdef CONFIG_COMPACTION
-bool PageMovable(struct page *page);
 void __SetPageMovable(struct page *page, const struct movable_operations *ops);
 #else
-static inline bool PageMovable(struct page *page) { return false; }
 static inline void __SetPageMovable(struct page *page,
 		const struct movable_operations *ops)
 {
diff --git a/mm/compaction.c b/mm/compaction.c
index 889ec696ba96..5c3737301701 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -114,21 +114,6 @@ static unsigned long release_free_list(struct list_head *freepages)
 }
 
 #ifdef CONFIG_COMPACTION
-bool PageMovable(struct page *page)
-{
-	const struct movable_operations *mops;
-
-	VM_BUG_ON_PAGE(!PageLocked(page), page);
-	if (!__PageMovable(page))
-		return false;
-
-	mops = page_movable_ops(page);
-	if (mops)
-		return true;
-
-	return false;
-}
-
 void __SetPageMovable(struct page *page, const struct movable_operations *mops)
 {
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
diff --git a/mm/migrate.c b/mm/migrate.c
index 61e98ed46f13..1f07c8f1fb74 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -87,9 +87,12 @@ bool isolate_movable_ops_page(struct page *page, isolate_mode_t mode)
 		goto out;
 
 	/*
-	 * Check movable flag before taking the page lock because
+	 * Check for movable_ops pages before taking the page lock because
 	 * we use non-atomic bitops on newly allocated page flags so
 	 * unconditionally grabbing the lock ruins page's owner side.
+	 *
+	 * Note that once a page has movable_ops, it will stay that way
+	 * until the page is freed.
 	 */
 	if (unlikely(!__PageMovable(page)))
 		goto out_putfolio;
@@ -108,7 +111,8 @@ bool isolate_movable_ops_page(struct page *page, isolate_mode_t mode)
 	if (unlikely(!folio_trylock(folio)))
 		goto out_putfolio;
 
-	if (!PageMovable(page) || PageIsolated(page))
+	VM_WARN_ON_ONCE_PAGE(!__PageMovable(page), page);
+	if (PageIsolated(page))
 		goto out_no_isolated;
 
 	mops = page_movable_ops(page);
@@ -149,11 +153,10 @@ static void putback_movable_ops_page(struct page *page)
 	 */
 	struct folio *folio = page_folio(page);
 
+	VM_WARN_ON_ONCE_PAGE(!__PageMovable(page), page);
 	VM_WARN_ON_ONCE_PAGE(!PageIsolated(page), page);
 	folio_lock(folio);
-	/* If the page was released by it's owner, there is nothing to do. */
-	if (PageMovable(page))
-		page_movable_ops(page)->putback_page(page);
+	page_movable_ops(page)->putback_page(page);
 	ClearPageIsolated(page);
 	folio_unlock(folio);
 	folio_put(folio);
@@ -191,10 +194,9 @@ static int migrate_movable_ops_page(struct page *dst, struct page *src,
 {
 	int rc = MIGRATEPAGE_SUCCESS;
 
+	VM_WARN_ON_ONCE_PAGE(!__PageMovable(src), src);
 	VM_WARN_ON_ONCE_PAGE(!PageIsolated(src), src);
-	/* If the page was released by it's owner, there is nothing to do. */
-	if (PageMovable(src))
-		rc = page_movable_ops(src)->migrate_page(dst, src, mode);
+	rc = page_movable_ops(src)->migrate_page(dst, src, mode);
 	if (rc == MIGRATEPAGE_SUCCESS)
 		ClearPageIsolated(src);
 	return rc;
-- 
cgit v1.2.3


From d4fb4587bd73b6b773397f5fed52a5e4bd4dec8b Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 4 Jul 2025 12:25:10 +0200
Subject: mm: rename __PageMovable() to page_has_movable_ops()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Let's make it clearer that we are talking about movable_ops pages.

While at it, convert a VM_BUG_ON to a VM_WARN_ON_ONCE_PAGE.

Link: https://lkml.kernel.org/r/20250704102524.326966-17-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Eugenio Pérez <eperezma@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Gregory Price <gourry@gourry.net>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jerrin Shaji George <jerrin.shaji-george@broadcom.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/migrate.h    |  2 +-
 include/linux/page-flags.h |  2 +-
 mm/compaction.c            |  7 ++-----
 mm/memory-failure.c        |  4 ++--
 mm/memory_hotplug.c        | 10 ++++------
 mm/migrate.c               |  8 ++++----
 mm/page_alloc.c            |  2 +-
 mm/page_isolation.c        | 10 +++++-----
 8 files changed, 20 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 25659a685e2a..e04035f70e36 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -115,7 +115,7 @@ static inline void __SetPageMovable(struct page *page,
 static inline
 const struct movable_operations *page_movable_ops(struct page *page)
 {
-	VM_BUG_ON(!__PageMovable(page));
+	VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(page), page);
 
 	return (const struct movable_operations *)
 		((unsigned long)page->mapping - PAGE_MAPPING_MOVABLE);
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 4fe5ee67535b..c67163b73c5e 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -750,7 +750,7 @@ static __always_inline bool __folio_test_movable(const struct folio *folio)
 			PAGE_MAPPING_MOVABLE;
 }
 
-static __always_inline bool __PageMovable(const struct page *page)
+static __always_inline bool page_has_movable_ops(const struct page *page)
 {
 	return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
 				PAGE_MAPPING_MOVABLE;
diff --git a/mm/compaction.c b/mm/compaction.c
index 5c3737301701..41fd6a1fe9a3 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1056,11 +1056,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 		 * Skip any other type of page
 		 */
 		if (!PageLRU(page)) {
-			/*
-			 * __PageMovable can return false positive so we need
-			 * to verify it under page_lock.
-			 */
-			if (unlikely(__PageMovable(page)) &&
+			/* Isolation code will deal with any races. */
+			if (unlikely(page_has_movable_ops(page)) &&
 					!PageIsolated(page)) {
 				if (locked) {
 					unlock_page_lruvec_irqrestore(locked, flags);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index b91a33fb6c69..9e2cff199934 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1388,8 +1388,8 @@ static inline bool HWPoisonHandlable(struct page *page, unsigned long flags)
 	if (PageSlab(page))
 		return false;
 
-	/* Soft offline could migrate non-LRU movable pages */
-	if ((flags & MF_SOFT_OFFLINE) && __PageMovable(page))
+	/* Soft offline could migrate movable_ops pages */
+	if ((flags & MF_SOFT_OFFLINE) && page_has_movable_ops(page))
 		return true;
 
 	return PageLRU(page) || is_free_buddy_page(page);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e4009a44f883..1f15af712bc3 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1739,8 +1739,8 @@ bool mhp_range_allowed(u64 start, u64 size, bool need_mapping)
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
 /*
- * Scan pfn range [start,end) to find movable/migratable pages (LRU pages,
- * non-lru movable pages and hugepages). Will skip over most unmovable
+ * Scan pfn range [start,end) to find movable/migratable pages (LRU and
+ * hugetlb folio, movable_ops pages). Will skip over most unmovable
  * pages (esp., pages that can be skipped when offlining), but bail out on
  * definitely unmovable pages.
  *
@@ -1759,13 +1759,11 @@ static int scan_movable_pages(unsigned long start, unsigned long end,
 		struct folio *folio;
 
 		page = pfn_to_page(pfn);
-		if (PageLRU(page))
-			goto found;
-		if (__PageMovable(page))
+		if (PageLRU(page) || page_has_movable_ops(page))
 			goto found;
 
 		/*
-		 * PageOffline() pages that are not marked __PageMovable() and
+		 * PageOffline() pages that do not have movable_ops and
 		 * have a reference count > 0 (after MEM_GOING_OFFLINE) are
 		 * definitely unmovable. If their reference count would be 0,
 		 * they could at least be skipped when offlining memory.
diff --git a/mm/migrate.c b/mm/migrate.c
index 1f07c8f1fb74..bf9cfdafc54c 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -94,7 +94,7 @@ bool isolate_movable_ops_page(struct page *page, isolate_mode_t mode)
 	 * Note that once a page has movable_ops, it will stay that way
 	 * until the page was freed.
 	 */
-	if (unlikely(!__PageMovable(page)))
+	if (unlikely(!page_has_movable_ops(page)))
 		goto out_putfolio;
 
 	/*
@@ -111,7 +111,7 @@ bool isolate_movable_ops_page(struct page *page, isolate_mode_t mode)
 	if (unlikely(!folio_trylock(folio)))
 		goto out_putfolio;
 
-	VM_WARN_ON_ONCE_PAGE(!__PageMovable(page), page);
+	VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(page), page);
 	if (PageIsolated(page))
 		goto out_no_isolated;
 
@@ -153,7 +153,7 @@ static void putback_movable_ops_page(struct page *page)
 	 */
 	struct folio *folio = page_folio(page);
 
-	VM_WARN_ON_ONCE_PAGE(!__PageMovable(page), page);
+	VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(page), page);
 	VM_WARN_ON_ONCE_PAGE(!PageIsolated(page), page);
 	folio_lock(folio);
 	page_movable_ops(page)->putback_page(page);
@@ -194,7 +194,7 @@ static int migrate_movable_ops_page(struct page *dst, struct page *src,
 {
 	int rc = MIGRATEPAGE_SUCCESS;
 
-	VM_WARN_ON_ONCE_PAGE(!__PageMovable(src), src);
+	VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(src), src);
 	VM_WARN_ON_ONCE_PAGE(!PageIsolated(src), src);
 	rc = page_movable_ops(src)->migrate_page(dst, src, mode);
 	if (rc == MIGRATEPAGE_SUCCESS)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6318c85d678e..036d9b7b01c0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2006,7 +2006,7 @@ static bool prep_move_freepages_block(struct zone *zone, struct page *page,
 			 * migration are movable. But we don't actually try
 			 * isolating, as that would be expensive.
 			 */
-			if (PageLRU(page) || __PageMovable(page))
+			if (PageLRU(page) || page_has_movable_ops(page))
 				(*num_movable)++;
 			pfn++;
 		}
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index ece3bfc56bcd..b97b965b3ed0 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -21,9 +21,9 @@
  * consequently belong to a single zone.
  *
  * PageLRU check without isolation or lru_lock could race so that
- * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
- * check without lock_page also may miss some movable non-lru pages at
- * race condition. So you can't expect this function should be exact.
+ * MIGRATE_MOVABLE block might include unmovable pages. Similarly, pages
+ * with movable_ops can only be identified some time after they were
+ * allocated. So you can't expect this function should be exact.
  *
  * Returns a page without holding a reference. If the caller wants to
  * dereference that page (e.g., dumping), it has to make sure that it
@@ -133,7 +133,7 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e
 		if ((mode == PB_ISOLATE_MODE_MEM_OFFLINE) && PageOffline(page))
 			continue;
 
-		if (__PageMovable(page) || PageLRU(page))
+		if (PageLRU(page) || page_has_movable_ops(page))
 			continue;
 
 		/*
@@ -421,7 +421,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn,
 			 * proper free and split handling for them.
 			 */
 			VM_WARN_ON_ONCE_PAGE(PageLRU(page), page);
-			VM_WARN_ON_ONCE_PAGE(__PageMovable(page), page);
+			VM_WARN_ON_ONCE_PAGE(page_has_movable_ops(page), page);
 
 			goto failed;
 		}
-- 
cgit v1.2.3


From 457d7b3adb11576ce5f3ae0d9a4987ace213bed2 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 4 Jul 2025 12:25:12 +0200
Subject: mm: remove __folio_test_movable()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Convert to page_has_movable_ops().  While at it, cleanup relevant code a
bit.

The data_race() in migrate_folio_unmap() is questionable: we already hold
a page reference, and concurrent modifications can no longer happen (iow:
__ClearPageMovable() no longer exists).  Drop it for now, we'll rework
page_has_movable_ops() soon either way to no longer rely on page->mapping.

Wherever we cast from folio to page now is a clear sign that this code has
to be decoupled.

Link: https://lkml.kernel.org/r/20250704102524.326966-19-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Eugenio Pérez <eperezma@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Gregory Price <gourry@gourry.net>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jerrin Shaji George <jerrin.shaji-george@broadcom.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h |  6 ------
 mm/migrate.c               | 43 +++++++++++++------------------------------
 mm/vmscan.c                |  6 ++++--
 3 files changed, 17 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index c67163b73c5e..4c27ebb689e3 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -744,12 +744,6 @@ static __always_inline bool PageAnon(const struct page *page)
 	return folio_test_anon(page_folio(page));
 }
 
-static __always_inline bool __folio_test_movable(const struct folio *folio)
-{
-	return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) ==
-			PAGE_MAPPING_MOVABLE;
-}
-
 static __always_inline bool page_has_movable_ops(const struct page *page)
 {
 	return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
diff --git a/mm/migrate.c b/mm/migrate.c
index bf9cfdafc54c..aec0774d3da3 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -221,12 +221,7 @@ void putback_movable_pages(struct list_head *l)
 			continue;
 		}
 		list_del(&folio->lru);
-		/*
-		 * We isolated non-lru movable folio so here we can use
-		 * __folio_test_movable because LRU folio's mapping cannot
-		 * have PAGE_MAPPING_MOVABLE.
-		 */
-		if (unlikely(__folio_test_movable(folio))) {
+		if (unlikely(page_has_movable_ops(&folio->page))) {
 			putback_movable_ops_page(&folio->page);
 		} else {
 			node_stat_mod_folio(folio, NR_ISOLATED_ANON +
@@ -239,26 +234,20 @@ void putback_movable_pages(struct list_head *l)
 /* Must be called with an elevated refcount on the non-hugetlb folio */
 bool isolate_folio_to_list(struct folio *folio, struct list_head *list)
 {
-	bool isolated, lru;
-
 	if (folio_test_hugetlb(folio))
 		return folio_isolate_hugetlb(folio, list);
 
-	lru = !__folio_test_movable(folio);
-	if (lru)
-		isolated = folio_isolate_lru(folio);
-	else
-		isolated = isolate_movable_ops_page(&folio->page,
-						    ISOLATE_UNEVICTABLE);
-
-	if (!isolated)
-		return false;
-
-	list_add(&folio->lru, list);
-	if (lru)
+	if (page_has_movable_ops(&folio->page)) {
+		if (!isolate_movable_ops_page(&folio->page,
+					      ISOLATE_UNEVICTABLE))
+			return false;
+	} else {
+		if (!folio_isolate_lru(folio))
+			return false;
 		node_stat_add_folio(folio, NR_ISOLATED_ANON +
 				    folio_is_file_lru(folio));
-
+	}
+	list_add(&folio->lru, list);
 	return true;
 }
 
@@ -1142,12 +1131,7 @@ static void migrate_folio_undo_dst(struct folio *dst, bool locked,
 static void migrate_folio_done(struct folio *src,
 			       enum migrate_reason reason)
 {
-	/*
-	 * Compaction can migrate also non-LRU pages which are
-	 * not accounted to NR_ISOLATED_*. They can be recognized
-	 * as __folio_test_movable
-	 */
-	if (likely(!__folio_test_movable(src)) && reason != MR_DEMOTION)
+	if (likely(!page_has_movable_ops(&src->page)) && reason != MR_DEMOTION)
 		mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON +
 				    folio_is_file_lru(src), -folio_nr_pages(src));
 
@@ -1166,7 +1150,6 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
 	int rc = -EAGAIN;
 	int old_page_state = 0;
 	struct anon_vma *anon_vma = NULL;
-	bool is_lru = data_race(!__folio_test_movable(src));
 	bool locked = false;
 	bool dst_locked = false;
 
@@ -1267,7 +1250,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio,
 		goto out;
 	dst_locked = true;
 
-	if (unlikely(!is_lru)) {
+	if (unlikely(page_has_movable_ops(&src->page))) {
 		__migrate_folio_record(dst, old_page_state, anon_vma);
 		return MIGRATEPAGE_UNMAP;
 	}
@@ -1332,7 +1315,7 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
 	prev = dst->lru.prev;
 	list_del(&dst->lru);
 
-	if (unlikely(__folio_test_movable(src))) {
+	if (unlikely(page_has_movable_ops(&src->page))) {
 		rc = migrate_movable_ops_page(&dst->page, &src->page, mode);
 		if (rc)
 			goto out;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c86a2495138a..b1b999734ee4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1651,9 +1651,11 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
 	unsigned int noreclaim_flag;
 
 	list_for_each_entry_safe(folio, next, folio_list, lru) {
+		/* TODO: these pages should not even appear in this list. */
+		if (page_has_movable_ops(&folio->page))
+			continue;
 		if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) &&
-		    !folio_test_dirty(folio) && !__folio_test_movable(folio) &&
-		    !folio_test_unevictable(folio)) {
+		    !folio_test_dirty(folio) && !folio_test_unevictable(folio)) {
 			folio_clear_active(folio);
 			list_move(&folio->lru, &clean_folios);
 		}
-- 
cgit v1.2.3


From 84caf98838a3e5f4bdb344c13679e1067ffbf094 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 4 Jul 2025 12:25:13 +0200
Subject: mm: stop storing migration_ops in page->mapping
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

...  instead, look them up statically based on the page type.  Maybe in
the future we want a registration interface?  At least for now, it can be
easily handled using the two page types that actually support page
migration.

The remaining usage of page->mapping is to flag such pages as actually
being movable (having movable_ops), which we will change next.

Link: https://lkml.kernel.org/r/20250704102524.326966-20-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Eugenio Pérez <eperezma@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Gregory Price <gourry@gourry.net>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jerrin Shaji George <jerrin.shaji-george@broadcom.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/balloon_compaction.h |  2 +-
 include/linux/migrate.h            | 14 ++------------
 include/linux/zsmalloc.h           |  2 ++
 mm/balloon_compaction.c            |  1 -
 mm/compaction.c                    |  5 ++---
 mm/migrate.c                       | 23 +++++++++++++++++++++++
 mm/zpdesc.h                        |  5 ++---
 mm/zsmalloc.c                      |  8 +++-----
 8 files changed, 35 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
index 9bce8e9f5018..a8a1706cc56f 100644
--- a/include/linux/balloon_compaction.h
+++ b/include/linux/balloon_compaction.h
@@ -92,7 +92,7 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon,
 				       struct page *page)
 {
 	__SetPageOffline(page);
-	__SetPageMovable(page, &balloon_mops);
+	__SetPageMovable(page);
 	set_page_private(page, (unsigned long)balloon);
 	list_add(&page->lru, &balloon->pages);
 }
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index e04035f70e36..6aece3f3c8be 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -104,23 +104,13 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
 #endif /* CONFIG_MIGRATION */
 
 #ifdef CONFIG_COMPACTION
-void __SetPageMovable(struct page *page, const struct movable_operations *ops);
+void __SetPageMovable(struct page *page);
 #else
-static inline void __SetPageMovable(struct page *page,
-		const struct movable_operations *ops)
+static inline void __SetPageMovable(struct page *page)
 {
 }
 #endif
 
-static inline
-const struct movable_operations *page_movable_ops(struct page *page)
-{
-	VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(page), page);
-
-	return (const struct movable_operations *)
-		((unsigned long)page->mapping - PAGE_MAPPING_MOVABLE);
-}
-
 #ifdef CONFIG_NUMA_BALANCING
 int migrate_misplaced_folio_prepare(struct folio *folio,
 		struct vm_area_struct *vma, int node);
diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
index 13e9cc5490f7..f3ccff2d966c 100644
--- a/include/linux/zsmalloc.h
+++ b/include/linux/zsmalloc.h
@@ -46,4 +46,6 @@ void zs_obj_read_end(struct zs_pool *pool, unsigned long handle,
 void zs_obj_write(struct zs_pool *pool, unsigned long handle,
 		  void *handle_mem, size_t mem_len);
 
+extern const struct movable_operations zsmalloc_mops;
+
 #endif
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index e4f1a122d786..2a4a649805c1 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -253,6 +253,5 @@ const struct movable_operations balloon_mops = {
 	.isolate_page = balloon_page_isolate,
 	.putback_page = balloon_page_putback,
 };
-EXPORT_SYMBOL_GPL(balloon_mops);
 
 #endif /* CONFIG_BALLOON_COMPACTION */
diff --git a/mm/compaction.c b/mm/compaction.c
index 41fd6a1fe9a3..348eb754cb22 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -114,11 +114,10 @@ static unsigned long release_free_list(struct list_head *freepages)
 }
 
 #ifdef CONFIG_COMPACTION
-void __SetPageMovable(struct page *page, const struct movable_operations *mops)
+void __SetPageMovable(struct page *page)
 {
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
-	VM_BUG_ON_PAGE((unsigned long)mops & PAGE_MAPPING_MOVABLE, page);
-	page->mapping = (void *)((unsigned long)mops | PAGE_MAPPING_MOVABLE);
+	page->mapping = (void *)(PAGE_MAPPING_MOVABLE);
 }
 EXPORT_SYMBOL(__SetPageMovable);
 
diff --git a/mm/migrate.c b/mm/migrate.c
index aec0774d3da3..90ddf906d706 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -43,6 +43,8 @@
 #include <linux/sched/sysctl.h>
 #include <linux/memory-tiers.h>
 #include <linux/pagewalk.h>
+#include <linux/balloon_compaction.h>
+#include <linux/zsmalloc.h>
 
 #include <asm/tlbflush.h>
 
@@ -51,6 +53,27 @@
 #include "internal.h"
 #include "swap.h"
 
+static const struct movable_operations *page_movable_ops(struct page *page)
+{
+	VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(page), page);
+
+	/*
+	 * If we enable page migration for a page of a certain type by marking
+	 * it as movable, the page type must be sticky until the page gets freed
+	 * back to the buddy.
+	 */
+#ifdef CONFIG_BALLOON_COMPACTION
+	if (PageOffline(page))
+		/* Only balloon compaction sets PageOffline pages movable. */
+		return &balloon_mops;
+#endif /* CONFIG_BALLOON_COMPACTION */
+#if defined(CONFIG_ZSMALLOC) && defined(CONFIG_COMPACTION)
+	if (PageZsmalloc(page))
+		return &zsmalloc_mops;
+#endif /* defined(CONFIG_ZSMALLOC) && defined(CONFIG_COMPACTION) */
+	return NULL;
+}
+
 /**
  * isolate_movable_ops_page - isolate a movable_ops page for migration
  * @page: The page.
diff --git a/mm/zpdesc.h b/mm/zpdesc.h
index 5763f3603973..6855d9e2732d 100644
--- a/mm/zpdesc.h
+++ b/mm/zpdesc.h
@@ -152,10 +152,9 @@ static inline struct zpdesc *pfn_zpdesc(unsigned long pfn)
 	return page_zpdesc(pfn_to_page(pfn));
 }
 
-static inline void __zpdesc_set_movable(struct zpdesc *zpdesc,
-					const struct movable_operations *mops)
+static inline void __zpdesc_set_movable(struct zpdesc *zpdesc)
 {
-	__SetPageMovable(zpdesc_page(zpdesc), mops);
+	__SetPageMovable(zpdesc_page(zpdesc));
 }
 
 static inline void __zpdesc_set_zsmalloc(struct zpdesc *zpdesc)
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index b12250e219bb..4aaff7c26ea9 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1685,8 +1685,6 @@ static void lock_zspage(struct zspage *zspage)
 
 #ifdef CONFIG_COMPACTION
 
-static const struct movable_operations zsmalloc_mops;
-
 static void replace_sub_page(struct size_class *class, struct zspage *zspage,
 				struct zpdesc *newzpdesc, struct zpdesc *oldzpdesc)
 {
@@ -1709,7 +1707,7 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage,
 	set_first_obj_offset(newzpdesc, first_obj_offset);
 	if (unlikely(ZsHugePage(zspage)))
 		newzpdesc->handle = oldzpdesc->handle;
-	__zpdesc_set_movable(newzpdesc, &zsmalloc_mops);
+	__zpdesc_set_movable(newzpdesc);
 }
 
 static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
@@ -1819,7 +1817,7 @@ static void zs_page_putback(struct page *page)
 {
 }
 
-static const struct movable_operations zsmalloc_mops = {
+const struct movable_operations zsmalloc_mops = {
 	.isolate_page = zs_page_isolate,
 	.migrate_page = zs_page_migrate,
 	.putback_page = zs_page_putback,
@@ -1882,7 +1880,7 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage)
 
 	do {
 		WARN_ON(!zpdesc_trylock(zpdesc));
-		__zpdesc_set_movable(zpdesc, &zsmalloc_mops);
+		__zpdesc_set_movable(zpdesc);
 		zpdesc_unlock(zpdesc);
 	} while ((zpdesc = get_next_zpdesc(zpdesc)) != NULL);
 }
-- 
cgit v1.2.3


From 3d388584d59985e95f5bfb9dbd9776fa1bb1ec8a Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 4 Jul 2025 12:25:14 +0200
Subject: mm: convert "movable" flag in page->mapping to a page flag
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Instead, let's use a page flag.  As the page flag can result in
false-positives, glue it to the page types for which we support/implement
movable_ops page migration.

We are reusing PG_uptodate, that is for example used to track file system
state and does not apply to movable_ops pages.  So warning in case it is
set in page_has_movable_ops() on other page types could result in
false-positive warnings.

Likely we could set the bit using a non-atomic update: in contrast to
page->mapping, we could have others trying to update the flags
concurrently when trying to lock the folio.  In
isolate_movable_ops_page(), we already take care of that by checking if
the page has movable_ops before locking it.  Let's start with the atomic
variant, we could later switch to the non-atomic variant once we are sure
other cases are similarly fine.  Once we perform the switch, we'll have to
introduce __SETPAGEFLAG_NOOP().

[david@redhat.com: add missing `:' in kerneldoc]
  Link: https://lkml.kernel.org/r/d96e2916-2c43-462c-b6a1-2375ef397d8b@redhat.com
Link: https://lkml.kernel.org/r/20250704102524.326966-21-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Eugenio Pérez <eperezma@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Gregory Price <gourry@gourry.net>
Cc: Harry Yoo <harry.yoo@oracle.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jerrin Shaji George <jerrin.shaji-george@broadcom.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/balloon_compaction.h |  2 +-
 include/linux/migrate.h            |  8 ------
 include/linux/page-flags.h         | 54 +++++++++++++++++++++++++++++++-------
 mm/compaction.c                    |  6 -----
 mm/zpdesc.h                        |  2 +-
 5 files changed, 46 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
index a8a1706cc56f..b222b0737c46 100644
--- a/include/linux/balloon_compaction.h
+++ b/include/linux/balloon_compaction.h
@@ -92,7 +92,7 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon,
 				       struct page *page)
 {
 	__SetPageOffline(page);
-	__SetPageMovable(page);
+	SetPageMovableOps(page);
 	set_page_private(page, (unsigned long)balloon);
 	list_add(&page->lru, &balloon->pages);
 }
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 6aece3f3c8be..acadd41e0b5c 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -103,14 +103,6 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
 
 #endif /* CONFIG_MIGRATION */
 
-#ifdef CONFIG_COMPACTION
-void __SetPageMovable(struct page *page);
-#else
-static inline void __SetPageMovable(struct page *page)
-{
-}
-#endif
-
 #ifdef CONFIG_NUMA_BALANCING
 int migrate_misplaced_folio_prepare(struct folio *folio,
 		struct vm_area_struct *vma, int node);
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 4c27ebb689e3..a02a12e51915 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -170,6 +170,11 @@ enum pageflags {
 	/* non-lru isolated movable page */
 	PG_isolated = PG_reclaim,
 
+#ifdef CONFIG_MIGRATION
+	/* this is a movable_ops page (for selected typed pages only) */
+	PG_movable_ops = PG_uptodate,
+#endif
+
 	/* Only valid for buddy pages. Used to track pages that are reported */
 	PG_reported = PG_uptodate,
 
@@ -698,9 +703,6 @@ PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted)
  * bit; and then folio->mapping points, not to an anon_vma, but to a private
  * structure which KSM associates with that merged page.  See ksm.h.
  *
- * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is used for non-lru movable
- * page and then folio->mapping points to a struct movable_operations.
- *
  * Please note that, confusingly, "folio_mapping" refers to the inode
  * address_space which maps the folio from disk; whereas "folio_mapped"
  * refers to user virtual address space into which the folio is mapped.
@@ -743,13 +745,6 @@ static __always_inline bool PageAnon(const struct page *page)
 {
 	return folio_test_anon(page_folio(page));
 }
-
-static __always_inline bool page_has_movable_ops(const struct page *page)
-{
-	return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
-				PAGE_MAPPING_MOVABLE;
-}
-
 #ifdef CONFIG_KSM
 /*
  * A KSM page is one of those write-protected "shared pages" or "merged pages"
@@ -1133,6 +1128,45 @@ bool is_free_buddy_page(const struct page *page);
 
 PAGEFLAG(Isolated, isolated, PF_ANY);
 
+#ifdef CONFIG_MIGRATION
+/*
+ * This page is migratable through movable_ops (for selected typed pages
+ * only).
+ *
+ * Page migration of such pages might fail, for example, if the page is
+ * already isolated by somebody else, or if the page is about to get freed.
+ *
+ * While a subsystem might set selected typed pages that support page migration
+ * as being movable through movable_ops, it must never clear this flag.
+ *
+ * This flag is only cleared when the page is freed back to the buddy.
+ *
+ * Only selected page types support this flag (see page_movable_ops()) and
+ * the flag might be used in other context for other pages. Always use
+ * page_has_movable_ops() instead.
+ */
+TESTPAGEFLAG(MovableOps, movable_ops, PF_NO_TAIL);
+SETPAGEFLAG(MovableOps, movable_ops, PF_NO_TAIL);
+#else /* !CONFIG_MIGRATION */
+TESTPAGEFLAG_FALSE(MovableOps, movable_ops);
+SETPAGEFLAG_NOOP(MovableOps, movable_ops);
+#endif /* CONFIG_MIGRATION */
+
+/**
+ * page_has_movable_ops - test for a movable_ops page
+ * @page: The page to test.
+ *
+ * Test whether this is a movable_ops page. Such pages will stay that
+ * way until freed.
+ *
+ * Returns true if this is a movable_ops page, otherwise false.
+ */
+static inline bool page_has_movable_ops(const struct page *page)
+{
+	return PageMovableOps(page) &&
+	       (PageOffline(page) || PageZsmalloc(page));
+}
+
 static __always_inline int PageAnonExclusive(const struct page *page)
 {
 	VM_BUG_ON_PGFLAGS(!PageAnon(page), page);
diff --git a/mm/compaction.c b/mm/compaction.c
index 348eb754cb22..349f4ea0ec3e 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -114,12 +114,6 @@ static unsigned long release_free_list(struct list_head *freepages)
 }
 
 #ifdef CONFIG_COMPACTION
-void __SetPageMovable(struct page *page)
-{
-	VM_BUG_ON_PAGE(!PageLocked(page), page);
-	page->mapping = (void *)(PAGE_MAPPING_MOVABLE);
-}
-EXPORT_SYMBOL(__SetPageMovable);
 
 /* Do not skip compaction more than 64 times */
 #define COMPACT_MAX_DEFER_SHIFT 6
diff --git a/mm/zpdesc.h b/mm/zpdesc.h
index 6855d9e2732d..25bf5ea0beb8 100644
--- a/mm/zpdesc.h
+++ b/mm/zpdesc.h
@@ -154,7 +154,7 @@ static inline struct zpdesc *pfn_zpdesc(unsigned long pfn)
 
 static inline void __zpdesc_set_movable(struct zpdesc *zpdesc)
 {
-	__SetPageMovable(zpdesc_page(zpdesc));
+	SetPageMovableOps(zpdesc_page(zpdesc));
 }
 
 static inline void __zpdesc_set_zsmalloc(struct zpdesc *zpdesc)
-- 
cgit v1.2.3


From 92f091769fde42509ca7685e67c9951f2350ceb7 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 4 Jul 2025 12:25:15 +0200
Subject: mm: rename PG_isolated to PG_movable_ops_isolated
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Let's rename the flag to make it clearer where it applies (not folios
...).

While at it, define the flag only with CONFIG_MIGRATION.

Link: https://lkml.kernel.org/r/20250704102524.326966-22-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Eugenio Pérez <eperezma@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Gregory Price <gourry@gourry.net>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jerrin Shaji George <jerrin.shaji-george@broadcom.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h | 16 +++++++++++-----
 mm/compaction.c            |  2 +-
 mm/migrate.c               | 14 +++++++-------
 3 files changed, 19 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index a02a12e51915..92f7152a445e 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -167,10 +167,9 @@ enum pageflags {
 	/* Remapped by swiotlb-xen. */
 	PG_xen_remapped = PG_owner_priv_1,
 
-	/* non-lru isolated movable page */
-	PG_isolated = PG_reclaim,
-
 #ifdef CONFIG_MIGRATION
+	/* movable_ops page that is isolated for migration */
+	PG_movable_ops_isolated = PG_reclaim,
 	/* this is a movable_ops page (for selected typed pages only) */
 	PG_movable_ops = PG_uptodate,
 #endif
@@ -1126,8 +1125,6 @@ static inline bool folio_contain_hwpoisoned_page(struct folio *folio)
 
 bool is_free_buddy_page(const struct page *page);
 
-PAGEFLAG(Isolated, isolated, PF_ANY);
-
 #ifdef CONFIG_MIGRATION
 /*
  * This page is migratable through movable_ops (for selected typed pages
@@ -1147,9 +1144,18 @@ PAGEFLAG(Isolated, isolated, PF_ANY);
  */
 TESTPAGEFLAG(MovableOps, movable_ops, PF_NO_TAIL);
 SETPAGEFLAG(MovableOps, movable_ops, PF_NO_TAIL);
+/*
+ * A movable_ops page has this flag set while it is isolated for migration.
+ * This flag primarily protects against concurrent migration attempts.
+ *
+ * Once migration ended (success or failure), the flag is cleared. The
+ * flag is managed by the migration core.
+ */
+PAGEFLAG(MovableOpsIsolated, movable_ops_isolated, PF_NO_TAIL);
 #else /* !CONFIG_MIGRATION */
 TESTPAGEFLAG_FALSE(MovableOps, movable_ops);
 SETPAGEFLAG_NOOP(MovableOps, movable_ops);
+PAGEFLAG_FALSE(MovableOpsIsolated, movable_ops_isolated);
 #endif /* CONFIG_MIGRATION */
 
 /**
diff --git a/mm/compaction.c b/mm/compaction.c
index 349f4ea0ec3e..bf021b31c7ec 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1051,7 +1051,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 		if (!PageLRU(page)) {
 			/* Isolation code will deal with any races. */
 			if (unlikely(page_has_movable_ops(page)) &&
-					!PageIsolated(page)) {
+			    !PageMovableOpsIsolated(page)) {
 				if (locked) {
 					unlock_page_lruvec_irqrestore(locked, flags);
 					locked = NULL;
diff --git a/mm/migrate.c b/mm/migrate.c
index 90ddf906d706..e4e05a98c84e 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -135,7 +135,7 @@ bool isolate_movable_ops_page(struct page *page, isolate_mode_t mode)
 		goto out_putfolio;
 
 	VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(page), page);
-	if (PageIsolated(page))
+	if (PageMovableOpsIsolated(page))
 		goto out_no_isolated;
 
 	mops = page_movable_ops(page);
@@ -146,8 +146,8 @@ bool isolate_movable_ops_page(struct page *page, isolate_mode_t mode)
 		goto out_no_isolated;
 
 	/* Driver shouldn't use the isolated flag */
-	VM_WARN_ON_ONCE_PAGE(PageIsolated(page), page);
-	SetPageIsolated(page);
+	VM_WARN_ON_ONCE_PAGE(PageMovableOpsIsolated(page), page);
+	SetPageMovableOpsIsolated(page);
 	folio_unlock(folio);
 
 	return true;
@@ -177,10 +177,10 @@ static void putback_movable_ops_page(struct page *page)
 	struct folio *folio = page_folio(page);
 
 	VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(page), page);
-	VM_WARN_ON_ONCE_PAGE(!PageIsolated(page), page);
+	VM_WARN_ON_ONCE_PAGE(!PageMovableOpsIsolated(page), page);
 	folio_lock(folio);
 	page_movable_ops(page)->putback_page(page);
-	ClearPageIsolated(page);
+	ClearPageMovableOpsIsolated(page);
 	folio_unlock(folio);
 	folio_put(folio);
 }
@@ -218,10 +218,10 @@ static int migrate_movable_ops_page(struct page *dst, struct page *src,
 	int rc = MIGRATEPAGE_SUCCESS;
 
 	VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(src), src);
-	VM_WARN_ON_ONCE_PAGE(!PageIsolated(src), src);
+	VM_WARN_ON_ONCE_PAGE(!PageMovableOpsIsolated(src), src);
 	rc = page_movable_ops(src)->migrate_page(dst, src, mode);
 	if (rc == MIGRATEPAGE_SUCCESS)
-		ClearPageIsolated(src);
+		ClearPageMovableOpsIsolated(src);
 	return rc;
 }
 
-- 
cgit v1.2.3


From bd56d30242039826160d95a65cb14c1cd65e6488 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 4 Jul 2025 12:25:16 +0200
Subject: mm/page-flags: rename PAGE_MAPPING_MOVABLE to PAGE_MAPPING_ANON_KSM
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

KSM is the only remaining user, so let's rename the flag.  While at it,
convert the remaining "page" wording in the doc to "folio".

Link: https://lkml.kernel.org/r/20250704102524.326966-23-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Eugenio Pérez <eperezma@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Gregory Price <gourry@gourry.net>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jerrin Shaji George <jerrin.shaji-george@broadcom.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 92f7152a445e..22767499fb21 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -697,10 +697,10 @@ PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted)
  * folio->mapping points to its anon_vma, not to a struct address_space;
  * with the PAGE_MAPPING_ANON bit set to distinguish it.  See rmap.h.
  *
- * On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled,
- * the PAGE_MAPPING_MOVABLE bit may be set along with the PAGE_MAPPING_ANON
+ * On an anonymous folio in a VM_MERGEABLE area, if CONFIG_KSM is enabled,
+ * the PAGE_MAPPING_ANON_KSM bit may be set along with the PAGE_MAPPING_ANON
  * bit; and then folio->mapping points, not to an anon_vma, but to a private
- * structure which KSM associates with that merged page.  See ksm.h.
+ * structure which KSM associates with that merged folio.  See ksm.h.
  *
  * Please note that, confusingly, "folio_mapping" refers to the inode
  * address_space which maps the folio from disk; whereas "folio_mapped"
@@ -714,9 +714,9 @@ PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted)
  * See mm/slab.h.
  */
 #define PAGE_MAPPING_ANON	0x1
-#define PAGE_MAPPING_MOVABLE	0x2
-#define PAGE_MAPPING_KSM	(PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
-#define PAGE_MAPPING_FLAGS	(PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
+#define PAGE_MAPPING_ANON_KSM	0x2
+#define PAGE_MAPPING_KSM	(PAGE_MAPPING_ANON | PAGE_MAPPING_ANON_KSM)
+#define PAGE_MAPPING_FLAGS	(PAGE_MAPPING_ANON | PAGE_MAPPING_ANON_KSM)
 
 static __always_inline bool folio_mapping_flags(const struct folio *folio)
 {
-- 
cgit v1.2.3


From 5799c0ed0aff65989b04ca3f517401e4ee94da10 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 4 Jul 2025 12:25:17 +0200
Subject: mm/page-alloc: remove PageMappingFlags()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As PageMappingFlags() now only indicates anon (incl.  KSM) folios, we can
now simply check for PageAnon() and remove PageMappingFlags().

...  and while at it, use the folio instead and operate on folio->mapping.

Link: https://lkml.kernel.org/r/20250704102524.326966-24-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Eugenio Pérez <eperezma@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Gregory Price <gourry@gourry.net>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jerrin Shaji George <jerrin.shaji-george@broadcom.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h | 5 -----
 mm/page_alloc.c            | 7 +++----
 2 files changed, 3 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 22767499fb21..a2151f6d96ae 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -723,11 +723,6 @@ static __always_inline bool folio_mapping_flags(const struct folio *folio)
 	return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) != 0;
 }
 
-static __always_inline bool PageMappingFlags(const struct page *page)
-{
-	return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) != 0;
-}
-
 static __always_inline bool folio_test_anon(const struct folio *folio)
 {
 	return ((unsigned long)folio->mapping & PAGE_MAPPING_ANON) != 0;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 036d9b7b01c0..fa09154a799c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1375,10 +1375,9 @@ __always_inline bool free_pages_prepare(struct page *page,
 			(page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
 		}
 	}
-	if (PageMappingFlags(page)) {
-		if (PageAnon(page))
-			mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
-		page->mapping = NULL;
+	if (folio_test_anon(folio)) {
+		mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
+		folio->mapping = NULL;
 	}
 	if (unlikely(page_has_type(page)))
 		/* Reset the page_type (which overlays _mapcount) */
-- 
cgit v1.2.3


From beb2cdeed673150cdfad653dd2e7c9999c230f57 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 4 Jul 2025 12:25:18 +0200
Subject: mm/page-flags: remove folio_mapping_flags()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It's unused and the page counterpart is gone, so let's remove it.

Link: https://lkml.kernel.org/r/20250704102524.326966-25-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Eugenio Pérez <eperezma@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Gregory Price <gourry@gourry.net>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jerrin Shaji George <jerrin.shaji-george@broadcom.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index a2151f6d96ae..ae2b80fcea6a 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -718,11 +718,6 @@ PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted)
 #define PAGE_MAPPING_KSM	(PAGE_MAPPING_ANON | PAGE_MAPPING_ANON_KSM)
 #define PAGE_MAPPING_FLAGS	(PAGE_MAPPING_ANON | PAGE_MAPPING_ANON_KSM)
 
-static __always_inline bool folio_mapping_flags(const struct folio *folio)
-{
-	return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) != 0;
-}
-
 static __always_inline bool folio_test_anon(const struct folio *folio)
 {
 	return ((unsigned long)folio->mapping & PAGE_MAPPING_ANON) != 0;
-- 
cgit v1.2.3


From 78cb1a13c42a6d843e21389f74d1edb90ed07288 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 4 Jul 2025 12:25:19 +0200
Subject: mm: simplify folio_expected_ref_count()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now that PAGE_MAPPING_MOVABLE is gone, we can simplify and rely on the
folio_test_anon() test only.

... but staring at the users, this function should never even have been
called on movable_ops pages. E.g.,
* __buffer_migrate_folio() does not make sense for them
* folio_migrate_mapping() does not make sense for them
* migrate_huge_page_move_mapping() does not make sense for them
* __migrate_folio() does not make sense for them
* ... and khugepaged should never stumble over them

Let's simply refuse typed pages (which includes slab) except hugetlb, and
WARN.

Link: https://lkml.kernel.org/r/20250704102524.326966-26-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Eugenio Pérez <eperezma@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Gregory Price <gourry@gourry.net>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jerrin Shaji George <jerrin.shaji-george@broadcom.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index ef40f68c1183..805108d7bbc3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2167,13 +2167,13 @@ static inline int folio_expected_ref_count(const struct folio *folio)
 	const int order = folio_order(folio);
 	int ref_count = 0;
 
-	if (WARN_ON_ONCE(folio_test_slab(folio)))
+	if (WARN_ON_ONCE(page_has_type(&folio->page) && !folio_test_hugetlb(folio)))
 		return 0;
 
 	if (folio_test_anon(folio)) {
 		/* One reference per page from the swapcache. */
 		ref_count += folio_test_swapcache(folio) << order;
-	} else if (!((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS)) {
+	} else {
 		/* One reference per page from the pagecache. */
 		ref_count += !!folio->mapping << order;
 		/* One reference from PG_private. */
-- 
cgit v1.2.3


From df25569d401e36327b339c3f5b3265d74eae90f2 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 4 Jul 2025 12:25:20 +0200
Subject: mm: rename PAGE_MAPPING_* to FOLIO_MAPPING_*
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now that the mapping flags are only used for folios, let's rename the
defines.

Link: https://lkml.kernel.org/r/20250704102524.326966-27-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Eugenio Pérez <eperezma@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Gregory Price <gourry@gourry.net>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jerrin Shaji George <jerrin.shaji-george@broadcom.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/page.c             |  4 ++--
 include/linux/fs.h         |  2 +-
 include/linux/mm_types.h   |  1 -
 include/linux/page-flags.h | 20 ++++++++++----------
 include/linux/pagemap.h    |  2 +-
 mm/gup.c                   |  4 ++--
 mm/internal.h              |  2 +-
 mm/ksm.c                   |  4 ++--
 mm/rmap.c                  | 16 ++++++++--------
 mm/util.c                  |  6 +++---
 10 files changed, 30 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/fs/proc/page.c b/fs/proc/page.c
index 999af26c7298..0cdc78c0d23f 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -149,7 +149,7 @@ u64 stable_page_flags(const struct page *page)
 
 	k = folio->flags;
 	mapping = (unsigned long)folio->mapping;
-	is_anon = mapping & PAGE_MAPPING_ANON;
+	is_anon = mapping & FOLIO_MAPPING_ANON;
 
 	/*
 	 * pseudo flags for the well known (anonymous) memory mapped pages
@@ -158,7 +158,7 @@ u64 stable_page_flags(const struct page *page)
 		u |= 1 << KPF_MMAP;
 	if (is_anon) {
 		u |= 1 << KPF_ANON;
-		if (mapping & PAGE_MAPPING_KSM)
+		if (mapping & FOLIO_MAPPING_KSM)
 			u |= 1 << KPF_KSM;
 	}
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e14e9d11ca0f..d3e7ad6941a8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -526,7 +526,7 @@ struct address_space {
 	/*
 	 * On most architectures that alignment is already the case; but
 	 * must be enforced here for CRIS, to let the least significant bit
-	 * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON.
+	 * of struct folio's "mapping" pointer be used for FOLIO_MAPPING_ANON.
 	 */
 
 /* XArray tags, for tagging dirty and writeback pages in the pagecache. */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 804d269a4f5e..1ec273b06691 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -105,7 +105,6 @@ struct page {
 					unsigned int order;
 				};
 			};
-			/* See page-flags.h for PAGE_MAPPING_FLAGS */
 			struct address_space *mapping;
 			union {
 				pgoff_t __folio_index;		/* Our offset within mapping. */
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index ae2b80fcea6a..8e4d6eda8a8d 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -695,10 +695,10 @@ PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted)
 /*
  * On an anonymous folio mapped into a user virtual memory area,
  * folio->mapping points to its anon_vma, not to a struct address_space;
- * with the PAGE_MAPPING_ANON bit set to distinguish it.  See rmap.h.
+ * with the FOLIO_MAPPING_ANON bit set to distinguish it.  See rmap.h.
  *
  * On an anonymous folio in a VM_MERGEABLE area, if CONFIG_KSM is enabled,
- * the PAGE_MAPPING_ANON_KSM bit may be set along with the PAGE_MAPPING_ANON
+ * the FOLIO_MAPPING_ANON_KSM bit may be set along with the FOLIO_MAPPING_ANON
  * bit; and then folio->mapping points, not to an anon_vma, but to a private
  * structure which KSM associates with that merged folio.  See ksm.h.
  *
@@ -713,21 +713,21 @@ PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted)
  * false before calling the following functions (e.g., folio_test_anon).
  * See mm/slab.h.
  */
-#define PAGE_MAPPING_ANON	0x1
-#define PAGE_MAPPING_ANON_KSM	0x2
-#define PAGE_MAPPING_KSM	(PAGE_MAPPING_ANON | PAGE_MAPPING_ANON_KSM)
-#define PAGE_MAPPING_FLAGS	(PAGE_MAPPING_ANON | PAGE_MAPPING_ANON_KSM)
+#define FOLIO_MAPPING_ANON	0x1
+#define FOLIO_MAPPING_ANON_KSM	0x2
+#define FOLIO_MAPPING_KSM	(FOLIO_MAPPING_ANON | FOLIO_MAPPING_ANON_KSM)
+#define FOLIO_MAPPING_FLAGS	(FOLIO_MAPPING_ANON | FOLIO_MAPPING_ANON_KSM)
 
 static __always_inline bool folio_test_anon(const struct folio *folio)
 {
-	return ((unsigned long)folio->mapping & PAGE_MAPPING_ANON) != 0;
+	return ((unsigned long)folio->mapping & FOLIO_MAPPING_ANON) != 0;
 }
 
 static __always_inline bool PageAnonNotKsm(const struct page *page)
 {
 	unsigned long flags = (unsigned long)page_folio(page)->mapping;
 
-	return (flags & PAGE_MAPPING_FLAGS) == PAGE_MAPPING_ANON;
+	return (flags & FOLIO_MAPPING_FLAGS) == FOLIO_MAPPING_ANON;
 }
 
 static __always_inline bool PageAnon(const struct page *page)
@@ -743,8 +743,8 @@ static __always_inline bool PageAnon(const struct page *page)
  */
 static __always_inline bool folio_test_ksm(const struct folio *folio)
 {
-	return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) ==
-				PAGE_MAPPING_KSM;
+	return ((unsigned long)folio->mapping & FOLIO_MAPPING_FLAGS) ==
+				FOLIO_MAPPING_KSM;
 }
 #else
 FOLIO_TEST_FLAG_FALSE(ksm)
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index e63fbfbd5b0f..10a222e68b85 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -502,7 +502,7 @@ static inline pgoff_t mapping_align_index(struct address_space *mapping,
 static inline bool mapping_large_folio_support(struct address_space *mapping)
 {
 	/* AS_FOLIO_ORDER is only reasonable for pagecache folios */
-	VM_WARN_ONCE((unsigned long)mapping & PAGE_MAPPING_ANON,
+	VM_WARN_ONCE((unsigned long)mapping & FOLIO_MAPPING_ANON,
 			"Anonymous mapping always supports large folio");
 
 	return mapping_max_folio_order(mapping) > 0;
diff --git a/mm/gup.c b/mm/gup.c
index 30d320719fa2..adffe663594d 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2804,9 +2804,9 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
 		return false;
 
 	/* Anonymous folios pose no problem. */
-	mapping_flags = (unsigned long)mapping & PAGE_MAPPING_FLAGS;
+	mapping_flags = (unsigned long)mapping & FOLIO_MAPPING_FLAGS;
 	if (mapping_flags)
-		return mapping_flags & PAGE_MAPPING_ANON;
+		return mapping_flags & FOLIO_MAPPING_ANON;
 
 	/*
 	 * At this point, we know the mapping is non-null and points to an
diff --git a/mm/internal.h b/mm/internal.h
index 22a95a2b7fa1..2e235740128a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -149,7 +149,7 @@ static inline void *folio_raw_mapping(const struct folio *folio)
 {
 	unsigned long mapping = (unsigned long)folio->mapping;
 
-	return (void *)(mapping & ~PAGE_MAPPING_FLAGS);
+	return (void *)(mapping & ~FOLIO_MAPPING_FLAGS);
 }
 
 /*
diff --git a/mm/ksm.c b/mm/ksm.c
index ef73b25fd65a..2b0210d41c55 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -893,7 +893,7 @@ static struct folio *ksm_get_folio(struct ksm_stable_node *stable_node,
 	unsigned long kpfn;
 
 	expected_mapping = (void *)((unsigned long)stable_node |
-					PAGE_MAPPING_KSM);
+					FOLIO_MAPPING_KSM);
 again:
 	kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */
 	folio = pfn_folio(kpfn);
@@ -1070,7 +1070,7 @@ static inline void folio_set_stable_node(struct folio *folio,
 					 struct ksm_stable_node *stable_node)
 {
 	VM_WARN_ON_FOLIO(folio_test_anon(folio) && PageAnonExclusive(&folio->page), folio);
-	folio->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
+	folio->mapping = (void *)((unsigned long)stable_node | FOLIO_MAPPING_KSM);
 }
 
 #ifdef CONFIG_SYSFS
diff --git a/mm/rmap.c b/mm/rmap.c
index bd83724d14b6..4b1a2a33e39f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -503,12 +503,12 @@ struct anon_vma *folio_get_anon_vma(const struct folio *folio)
 
 	rcu_read_lock();
 	anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
-	if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
+	if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON)
 		goto out;
 	if (!folio_mapped(folio))
 		goto out;
 
-	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
+	anon_vma = (struct anon_vma *) (anon_mapping - FOLIO_MAPPING_ANON);
 	if (!atomic_inc_not_zero(&anon_vma->refcount)) {
 		anon_vma = NULL;
 		goto out;
@@ -550,12 +550,12 @@ struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio,
 retry:
 	rcu_read_lock();
 	anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
-	if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
+	if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON)
 		goto out;
 	if (!folio_mapped(folio))
 		goto out;
 
-	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
+	anon_vma = (struct anon_vma *) (anon_mapping - FOLIO_MAPPING_ANON);
 	root_anon_vma = READ_ONCE(anon_vma->root);
 	if (down_read_trylock(&root_anon_vma->rwsem)) {
 		/*
@@ -1334,9 +1334,9 @@ void folio_move_anon_rmap(struct folio *folio, struct vm_area_struct *vma)
 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
 	VM_BUG_ON_VMA(!anon_vma, vma);
 
-	anon_vma += PAGE_MAPPING_ANON;
+	anon_vma += FOLIO_MAPPING_ANON;
 	/*
-	 * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written
+	 * Ensure that anon_vma and the FOLIO_MAPPING_ANON bit are written
 	 * simultaneously, so a concurrent reader (eg folio_referenced()'s
 	 * folio_test_anon()) will not see one without the other.
 	 */
@@ -1367,10 +1367,10 @@ static void __folio_set_anon(struct folio *folio, struct vm_area_struct *vma,
 	/*
 	 * page_idle does a lockless/optimistic rmap scan on folio->mapping.
 	 * Make sure the compiler doesn't split the stores of anon_vma and
-	 * the PAGE_MAPPING_ANON type identifier, otherwise the rmap code
+	 * the FOLIO_MAPPING_ANON type identifier, otherwise the rmap code
 	 * could mistake the mapping for a struct address_space and crash.
 	 */
-	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
+	anon_vma = (void *) anon_vma + FOLIO_MAPPING_ANON;
 	WRITE_ONCE(folio->mapping, (struct address_space *) anon_vma);
 	folio->index = linear_page_index(vma, address);
 }
diff --git a/mm/util.c b/mm/util.c
index 0b270c43d7d1..20bbfe4ce1b8 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -670,9 +670,9 @@ struct anon_vma *folio_anon_vma(const struct folio *folio)
 {
 	unsigned long mapping = (unsigned long)folio->mapping;
 
-	if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
+	if ((mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON)
 		return NULL;
-	return (void *)(mapping - PAGE_MAPPING_ANON);
+	return (void *)(mapping - FOLIO_MAPPING_ANON);
 }
 
 /**
@@ -699,7 +699,7 @@ struct address_space *folio_mapping(struct folio *folio)
 		return swap_address_space(folio->swap);
 
 	mapping = folio->mapping;
-	if ((unsigned long)mapping & PAGE_MAPPING_FLAGS)
+	if ((unsigned long)mapping & FOLIO_MAPPING_FLAGS)
 		return NULL;
 
 	return mapping;
-- 
cgit v1.2.3


From f5e43012b86aa6a0b0ad5881cb68bfb872826c22 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 4 Jul 2025 12:25:22 +0200
Subject: mm/balloon_compaction: "movable_ops" doc updates
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Let's bring the docs up to date.  Setting PG_movable_ops + page->private
very likely still needs to be performed under the documented locks: it's
complicated.

We will rework this in the future, as we will try avoiding using the page
lock.

Link: https://lkml.kernel.org/r/20250704102524.326966-29-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Eugenio Pérez <eperezma@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Gregory Price <gourry@gourry.net>
Cc: Harry Yoo <harry.yoo@oracle.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jerrin Shaji George <jerrin.shaji-george@broadcom.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/balloon_compaction.h | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
index b222b0737c46..2fecfead91d2 100644
--- a/include/linux/balloon_compaction.h
+++ b/include/linux/balloon_compaction.h
@@ -4,12 +4,13 @@
  *
  * Common interface definitions for making balloon pages movable by compaction.
  *
- * Balloon page migration makes use of the general non-lru movable page
+ * Balloon page migration makes use of the general "movable_ops page migration"
  * feature.
  *
  * page->private is used to reference the responsible balloon device.
- * page->mapping is used in context of non-lru page migration to reference
- * the address space operations for page isolation/migration/compaction.
+ * That these pages have movable_ops, and which movable_ops apply,
+ * is derived from the page type (PageOffline()) combined with the
+ * PG_movable_ops flag (PageMovableOps()).
  *
  * As the page isolation scanning step a compaction thread does is a lockless
  * procedure (from a page standpoint), it might bring some racy situations while
@@ -17,12 +18,10 @@
  * and safely perform balloon's page compaction and migration we must, always,
  * ensure following these simple rules:
  *
- *   i. when updating a balloon's page ->mapping element, strictly do it under
- *      the following lock order, independently of the far superior
- *      locking scheme (lru_lock, balloon_lock):
+ *   i. Setting the PG_movable_ops flag and page->private with the following
+ *	lock order
  *	    +-page_lock(page);
  *	      +--spin_lock_irq(&b_dev_info->pages_lock);
- *	            ... page->mapping updates here ...
  *
  *  ii. isolation or dequeueing procedure must remove the page from balloon
  *      device page list under b_dev_info->pages_lock.
-- 
cgit v1.2.3


From 9640b17a89a86f40df47bfc831a8afeff0c7eabc Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 4 Jul 2025 12:25:23 +0200
Subject: mm/balloon_compaction: provide single balloon_page_insert() and
 balloon_mapping_gfp_mask()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Let's just special-case based on IS_ENABLED(CONFIG_BALLOON_COMPACTION)
like we did for balloon_page_finalize().

Link: https://lkml.kernel.org/r/20250704102524.326966-30-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Eugenio Pérez <eperezma@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Gregory Price <gourry@gourry.net>
Cc: Harry Yoo <harry.yoo@oracle.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Jerrin Shaji George <jerrin.shaji-george@broadcom.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/balloon_compaction.h | 42 ++++++++++++++------------------------
 1 file changed, 15 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
index 2fecfead91d2..7cfe48769239 100644
--- a/include/linux/balloon_compaction.h
+++ b/include/linux/balloon_compaction.h
@@ -77,6 +77,15 @@ static inline void balloon_devinfo_init(struct balloon_dev_info *balloon)
 
 #ifdef CONFIG_BALLOON_COMPACTION
 extern const struct movable_operations balloon_mops;
+/*
+ * balloon_page_device - get the b_dev_info descriptor for the balloon device
+ *			 that enqueues the given page.
+ */
+static inline struct balloon_dev_info *balloon_page_device(struct page *page)
+{
+	return (struct balloon_dev_info *)page_private(page);
+}
+#endif /* CONFIG_BALLOON_COMPACTION */
 
 /*
  * balloon_page_insert - insert a page into the balloon's page list and make
@@ -91,41 +100,20 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon,
 				       struct page *page)
 {
 	__SetPageOffline(page);
-	SetPageMovableOps(page);
-	set_page_private(page, (unsigned long)balloon);
-	list_add(&page->lru, &balloon->pages);
-}
-
-/*
- * balloon_page_device - get the b_dev_info descriptor for the balloon device
- *			 that enqueues the given page.
- */
-static inline struct balloon_dev_info *balloon_page_device(struct page *page)
-{
-	return (struct balloon_dev_info *)page_private(page);
-}
-
-static inline gfp_t balloon_mapping_gfp_mask(void)
-{
-	return GFP_HIGHUSER_MOVABLE;
-}
-
-#else /* !CONFIG_BALLOON_COMPACTION */
-
-static inline void balloon_page_insert(struct balloon_dev_info *balloon,
-				       struct page *page)
-{
-	__SetPageOffline(page);
+	if (IS_ENABLED(CONFIG_BALLOON_COMPACTION)) {
+		SetPageMovableOps(page);
+		set_page_private(page, (unsigned long)balloon);
+	}
 	list_add(&page->lru, &balloon->pages);
 }
 
 static inline gfp_t balloon_mapping_gfp_mask(void)
 {
+	if (IS_ENABLED(CONFIG_BALLOON_COMPACTION))
+		return GFP_HIGHUSER_MOVABLE;
 	return GFP_HIGHUSER;
 }
 
-#endif /* CONFIG_BALLOON_COMPACTION */
-
 /*
  * balloon_page_finalize - prepare a balloon page that was removed from the
  *			   balloon list for release to the page allocator
-- 
cgit v1.2.3


From 214db70287277096e77d804284066ce1c07297dd Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 4 Jul 2025 15:14:07 -0700
Subject: mm/damon: add trace event for auto-tuned monitoring intervals

Patch series "mm/damon: add trace events for auto-tuned monitoring
intervals and DAMOS quota".

The aim-oriented auto-tuning features for monitoring intervals and DAMOS
quota are important and recommended.  Add tracepoints for observability
of those tuned values and of the tuning itself.


This patch (of 2):

Aim-oriented monitoring intervals auto-tuning is an important and
recommended feature for DAMON users.  Add a trace event for the
observability of the tuned intervals and tuning itself.

Link: https://lkml.kernel.org/r/20250704221408.38510-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20250704221408.38510-2-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: kernel test robot <lkp@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/trace/events/damon.h | 17 +++++++++++++++++
 mm/damon/core.c              |  1 +
 2 files changed, 18 insertions(+)

(limited to 'include')

diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h
index da4bd9fd1162..32c611076023 100644
--- a/include/trace/events/damon.h
+++ b/include/trace/events/damon.h
@@ -48,6 +48,23 @@ TRACE_EVENT_CONDITION(damos_before_apply,
 			__entry->nr_accesses, __entry->age)
 );
 
+TRACE_EVENT(damon_monitor_intervals_tune,
+
+	TP_PROTO(unsigned long sample_us),
+
+	TP_ARGS(sample_us),
+
+	TP_STRUCT__entry(
+		__field(unsigned long, sample_us)
+	),
+
+	TP_fast_assign(
+		__entry->sample_us = sample_us;
+	),
+
+	TP_printk("sample_us=%lu", __entry->sample_us)
+);
+
 TRACE_EVENT(damon_aggregated,
 
 	TP_PROTO(unsigned int target_id, struct damon_region *r,
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 979b29e16ef4..57a1ace4d10d 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -1490,6 +1490,7 @@ static void kdamond_tune_intervals(struct damon_ctx *c)
 			new_attrs.sample_interval);
 	new_attrs.aggr_interval = new_attrs.sample_interval *
 		c->attrs.aggr_samples;
+	trace_damon_monitor_intervals_tune(new_attrs.sample_interval);
 	damon_set_attrs(c, &new_attrs);
 }
 
-- 
cgit v1.2.3


From a86d695193bfab3f130f9275c275e4e143dcd2e3 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 4 Jul 2025 15:14:08 -0700
Subject: mm/damon: add trace event for effective size quota

Aim-oriented DAMOS quota auto-tuning is an important and recommended
feature for DAMOS users.  Add a trace event for the observability of the
tuned quota and tuning itself.

[sj@kernel.org: initialize sidx in damos_trace_esz()]
  Link: https://lkml.kernel.org/r/20250705172003.52324-1-sj@kernel.org
[sj@kernel.org: make damos_esz unconditional trace event]
  Link: https://lkml.kernel.org/r/20250709182843.35812-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20250704221408.38510-3-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: kernel test robot <lkp@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/trace/events/damon.h | 24 ++++++++++++++++++++++++
 mm/damon/core.c              | 20 +++++++++++++++++++-
 2 files changed, 43 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h
index 32c611076023..852d725afea2 100644
--- a/include/trace/events/damon.h
+++ b/include/trace/events/damon.h
@@ -9,6 +9,30 @@
 #include <linux/types.h>
 #include <linux/tracepoint.h>
 
+TRACE_EVENT(damos_esz,
+
+	TP_PROTO(unsigned int context_idx, unsigned int scheme_idx,
+		unsigned long esz),
+
+	TP_ARGS(context_idx, scheme_idx, esz),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, context_idx)
+		__field(unsigned int, scheme_idx)
+		__field(unsigned long, esz)
+	),
+
+	TP_fast_assign(
+		__entry->context_idx = context_idx;
+		__entry->scheme_idx = scheme_idx;
+		__entry->esz = esz;
+	),
+
+	TP_printk("ctx_idx=%u scheme_idx=%u esz=%lu",
+			__entry->context_idx, __entry->scheme_idx,
+			__entry->esz)
+);
+
 TRACE_EVENT_CONDITION(damos_before_apply,
 
 	TP_PROTO(unsigned int context_idx, unsigned int scheme_idx,
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 57a1ace4d10d..e8036254cc98 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -2011,12 +2011,26 @@ static void damos_set_effective_quota(struct damos_quota *quota)
 	quota->esz = esz;
 }
 
+static void damos_trace_esz(struct damon_ctx *c, struct damos *s,
+		struct damos_quota *quota)
+{
+	unsigned int cidx = 0, sidx = 0;
+	struct damos *siter;
+
+	damon_for_each_scheme(siter, c) {
+		if (siter == s)
+			break;
+		sidx++;
+	}
+	trace_damos_esz(cidx, sidx, quota->esz);
+}
+
 static void damos_adjust_quota(struct damon_ctx *c, struct damos *s)
 {
 	struct damos_quota *quota = &s->quota;
 	struct damon_target *t;
 	struct damon_region *r;
-	unsigned long cumulated_sz;
+	unsigned long cumulated_sz, cached_esz;
 	unsigned int score, max_score = 0;
 
 	if (!quota->ms && !quota->sz && list_empty(&quota->goals))
@@ -2030,7 +2044,11 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s)
 		quota->total_charged_sz += quota->charged_sz;
 		quota->charged_from = jiffies;
 		quota->charged_sz = 0;
+		if (trace_damos_esz_enabled())
+			cached_esz = quota->esz;
 		damos_set_effective_quota(quota);
+		if (trace_damos_esz_enabled() && quota->esz != cached_esz)
+			damos_trace_esz(c, s, quota);
 	}
 
 	if (!c->ops.get_scheme_score)
-- 
cgit v1.2.3


From 1bfe6354e0975fe89c3d25e81b6546d205556a4b Mon Sep 17 00:00:00 2001
From: Zhang Yi <yi.zhang@huawei.com>
Date: Mon, 7 Jul 2025 22:08:04 +0800
Subject: ext4: process folios writeback in bytes

Since ext4 supports large folios, processing writebacks in pages is no
longer appropriate.  Process writebacks in bytes instead.

Suggested-by: Jan Kara <jack@suse.cz>
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Link: https://patch.msgid.link/20250707140814.542883-2-yi.zhang@huaweicloud.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/inode.c             | 70 +++++++++++++++++++++++----------------------
 include/trace/events/ext4.h | 13 ++++-----
 2 files changed, 42 insertions(+), 41 deletions(-)

(limited to 'include')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 35e7f34ee188..4cb499f28c12 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1665,11 +1665,12 @@ struct mpage_da_data {
 	unsigned int can_map:1;	/* Can writepages call map blocks? */
 
 	/* These are internal state of ext4_do_writepages() */
-	pgoff_t first_page;	/* The first page to write */
-	pgoff_t next_page;	/* Current page to examine */
-	pgoff_t last_page;	/* Last page to examine */
+	loff_t start_pos;	/* The start pos to write */
+	loff_t next_pos;	/* Current pos to examine */
+	loff_t end_pos;		/* Last pos to examine */
+
 	/*
-	 * Extent to map - this can be after first_page because that can be
+	 * Extent to map - this can be after start_pos because that can be
 	 * fully mapped. We somewhat abuse m_flags to store whether the extent
 	 * is delalloc or unwritten.
 	 */
@@ -1689,38 +1690,38 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
 	struct inode *inode = mpd->inode;
 	struct address_space *mapping = inode->i_mapping;
 
-	/* This is necessary when next_page == 0. */
-	if (mpd->first_page >= mpd->next_page)
+	/* This is necessary when next_pos == 0. */
+	if (mpd->start_pos >= mpd->next_pos)
 		return;
 
 	mpd->scanned_until_end = 0;
-	index = mpd->first_page;
-	end   = mpd->next_page - 1;
 	if (invalidate) {
 		ext4_lblk_t start, last;
-		start = index << (PAGE_SHIFT - inode->i_blkbits);
-		last = end << (PAGE_SHIFT - inode->i_blkbits);
+		start = EXT4_B_TO_LBLK(inode, mpd->start_pos);
+		last = mpd->next_pos >> inode->i_blkbits;
 
 		/*
 		 * avoid racing with extent status tree scans made by
 		 * ext4_insert_delayed_block()
 		 */
 		down_write(&EXT4_I(inode)->i_data_sem);
-		ext4_es_remove_extent(inode, start, last - start + 1);
+		ext4_es_remove_extent(inode, start, last - start);
 		up_write(&EXT4_I(inode)->i_data_sem);
 	}
 
 	folio_batch_init(&fbatch);
-	while (index <= end) {
-		nr = filemap_get_folios(mapping, &index, end, &fbatch);
+	index = mpd->start_pos >> PAGE_SHIFT;
+	end = mpd->next_pos >> PAGE_SHIFT;
+	while (index < end) {
+		nr = filemap_get_folios(mapping, &index, end - 1, &fbatch);
 		if (nr == 0)
 			break;
 		for (i = 0; i < nr; i++) {
 			struct folio *folio = fbatch.folios[i];
 
-			if (folio->index < mpd->first_page)
+			if (folio_pos(folio) < mpd->start_pos)
 				continue;
-			if (folio_next_index(folio) - 1 > end)
+			if (folio_next_index(folio) > end)
 				continue;
 			BUG_ON(!folio_test_locked(folio));
 			BUG_ON(folio_test_writeback(folio));
@@ -2022,7 +2023,7 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 
 static void mpage_folio_done(struct mpage_da_data *mpd, struct folio *folio)
 {
-	mpd->first_page += folio_nr_pages(folio);
+	mpd->start_pos += folio_size(folio);
 	folio_unlock(folio);
 }
 
@@ -2032,7 +2033,7 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio)
 	loff_t size;
 	int err;
 
-	BUG_ON(folio->index != mpd->first_page);
+	WARN_ON_ONCE(folio_pos(folio) != mpd->start_pos);
 	folio_clear_dirty_for_io(folio);
 	/*
 	 * We have to be very careful here!  Nothing protects writeback path
@@ -2444,7 +2445,7 @@ update_disksize:
 	 * Update on-disk size after IO is submitted.  Races with
 	 * truncate are avoided by checking i_size under i_data_sem.
 	 */
-	disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT;
+	disksize = mpd->start_pos;
 	if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) {
 		int err2;
 		loff_t i_size;
@@ -2547,8 +2548,8 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
 	struct address_space *mapping = mpd->inode->i_mapping;
 	struct folio_batch fbatch;
 	unsigned int nr_folios;
-	pgoff_t index = mpd->first_page;
-	pgoff_t end = mpd->last_page;
+	pgoff_t index = mpd->start_pos >> PAGE_SHIFT;
+	pgoff_t end = mpd->end_pos >> PAGE_SHIFT;
 	xa_mark_t tag;
 	int i, err = 0;
 	int blkbits = mpd->inode->i_blkbits;
@@ -2563,7 +2564,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
 		tag = PAGECACHE_TAG_DIRTY;
 
 	mpd->map.m_len = 0;
-	mpd->next_page = index;
+	mpd->next_pos = mpd->start_pos;
 	if (ext4_should_journal_data(mpd->inode)) {
 		handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE,
 					    bpp);
@@ -2594,7 +2595,8 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
 				goto out;
 
 			/* If we can't merge this page, we are done. */
-			if (mpd->map.m_len > 0 && mpd->next_page != folio->index)
+			if (mpd->map.m_len > 0 &&
+			    mpd->next_pos != folio_pos(folio))
 				goto out;
 
 			if (handle) {
@@ -2640,8 +2642,8 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
 			}
 
 			if (mpd->map.m_len == 0)
-				mpd->first_page = folio->index;
-			mpd->next_page = folio_next_index(folio);
+				mpd->start_pos = folio_pos(folio);
+			mpd->next_pos = folio_pos(folio) + folio_size(folio);
 			/*
 			 * Writeout when we cannot modify metadata is simple.
 			 * Just submit the page. For data=journal mode we
@@ -2784,18 +2786,18 @@ static int ext4_do_writepages(struct mpage_da_data *mpd)
 		writeback_index = mapping->writeback_index;
 		if (writeback_index)
 			cycled = 0;
-		mpd->first_page = writeback_index;
-		mpd->last_page = -1;
+		mpd->start_pos = writeback_index << PAGE_SHIFT;
+		mpd->end_pos = LLONG_MAX;
 	} else {
-		mpd->first_page = wbc->range_start >> PAGE_SHIFT;
-		mpd->last_page = wbc->range_end >> PAGE_SHIFT;
+		mpd->start_pos = wbc->range_start;
+		mpd->end_pos = wbc->range_end;
 	}
 
 	ext4_io_submit_init(&mpd->io_submit, wbc);
 retry:
 	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
-		tag_pages_for_writeback(mapping, mpd->first_page,
-					mpd->last_page);
+		tag_pages_for_writeback(mapping, mpd->start_pos >> PAGE_SHIFT,
+					mpd->end_pos >> PAGE_SHIFT);
 	blk_start_plug(&plug);
 
 	/*
@@ -2855,7 +2857,7 @@ retry:
 		}
 		mpd->do_map = 1;
 
-		trace_ext4_da_write_pages(inode, mpd->first_page, wbc);
+		trace_ext4_da_write_pages(inode, mpd->start_pos, wbc);
 		ret = mpage_prepare_extent_to_map(mpd);
 		if (!ret && mpd->map.m_len)
 			ret = mpage_map_and_submit_extent(handle, mpd,
@@ -2912,8 +2914,8 @@ unplug:
 	blk_finish_plug(&plug);
 	if (!ret && !cycled && wbc->nr_to_write > 0) {
 		cycled = 1;
-		mpd->last_page = writeback_index - 1;
-		mpd->first_page = 0;
+		mpd->end_pos = (writeback_index << PAGE_SHIFT) - 1;
+		mpd->start_pos = 0;
 		goto retry;
 	}
 
@@ -2923,7 +2925,7 @@ unplug:
 		 * Set the writeback_index so that range_cyclic
 		 * mode will write it back later
 		 */
-		mapping->writeback_index = mpd->first_page;
+		mapping->writeback_index = mpd->start_pos >> PAGE_SHIFT;
 
 out_writepages:
 	trace_ext4_writepages_result(inode, wbc, ret,
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 156908641e68..62d52997b5c6 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -483,15 +483,15 @@ TRACE_EVENT(ext4_writepages,
 );
 
 TRACE_EVENT(ext4_da_write_pages,
-	TP_PROTO(struct inode *inode, pgoff_t first_page,
+	TP_PROTO(struct inode *inode, loff_t start_pos,
 		 struct writeback_control *wbc),
 
-	TP_ARGS(inode, first_page, wbc),
+	TP_ARGS(inode, start_pos, wbc),
 
 	TP_STRUCT__entry(
 		__field(	dev_t,	dev			)
 		__field(	ino_t,	ino			)
-		__field(      pgoff_t,	first_page		)
+		__field(       loff_t,	start_pos		)
 		__field(	 long,	nr_to_write		)
 		__field(	  int,	sync_mode		)
 	),
@@ -499,15 +499,14 @@ TRACE_EVENT(ext4_da_write_pages,
 	TP_fast_assign(
 		__entry->dev		= inode->i_sb->s_dev;
 		__entry->ino		= inode->i_ino;
-		__entry->first_page	= first_page;
+		__entry->start_pos	= start_pos;
 		__entry->nr_to_write	= wbc->nr_to_write;
 		__entry->sync_mode	= wbc->sync_mode;
 	),
 
-	TP_printk("dev %d,%d ino %lu first_page %lu nr_to_write %ld "
-		  "sync_mode %d",
+	TP_printk("dev %d,%d ino %lu start_pos 0x%llx nr_to_write %ld sync_mode %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  (unsigned long) __entry->ino, __entry->first_page,
+		  (unsigned long) __entry->ino, __entry->start_pos,
 		  __entry->nr_to_write, __entry->sync_mode)
 );
 
-- 
cgit v1.2.3


From 6b132759b0fe78e518abafb62190c294100db6d6 Mon Sep 17 00:00:00 2001
From: Zhang Yi <yi.zhang@huawei.com>
Date: Mon, 7 Jul 2025 22:08:09 +0800
Subject: ext4: enhance tracepoints during the folios writeback

Now that mpage_map_and_submit_extent() supports restarting the handle
when credits are insufficient during block allocation, it is more likely
that we exit the current mapping iteration and then continue processing
the same partially mapped folio again. The existing tracepoints are
not sufficient to track this situation, so enhance the tracepoints to
track the writeback position and the return value before and after
submitting the folios.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Link: https://patch.msgid.link/20250707140814.542883-7-yi.zhang@huaweicloud.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/inode.c             |  5 ++++-
 include/trace/events/ext4.h | 42 +++++++++++++++++++++++++++++++++++++-----
 2 files changed, 41 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 74b7ba2afb67..2b1d158b3f18 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2933,7 +2933,8 @@ retry:
 		}
 		mpd->do_map = 1;
 
-		trace_ext4_da_write_pages(inode, mpd->start_pos, wbc);
+		trace_ext4_da_write_folios_start(inode, mpd->start_pos,
+				mpd->next_pos, wbc);
 		ret = mpage_prepare_extent_to_map(mpd);
 		if (!ret && mpd->map.m_len)
 			ret = mpage_map_and_submit_extent(handle, mpd,
@@ -2971,6 +2972,8 @@ retry:
 		} else
 			ext4_put_io_end(mpd->io_submit.io_end);
 		mpd->io_submit.io_end = NULL;
+		trace_ext4_da_write_folios_end(inode, mpd->start_pos,
+				mpd->next_pos, wbc, ret);
 
 		if (ret == -ENOSPC && sbi->s_journal) {
 			/*
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 62d52997b5c6..845451077c41 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -482,16 +482,17 @@ TRACE_EVENT(ext4_writepages,
 		  (unsigned long) __entry->writeback_index)
 );
 
-TRACE_EVENT(ext4_da_write_pages,
-	TP_PROTO(struct inode *inode, loff_t start_pos,
+TRACE_EVENT(ext4_da_write_folios_start,
+	TP_PROTO(struct inode *inode, loff_t start_pos, loff_t next_pos,
 		 struct writeback_control *wbc),
 
-	TP_ARGS(inode, start_pos, wbc),
+	TP_ARGS(inode, start_pos, next_pos, wbc),
 
 	TP_STRUCT__entry(
 		__field(	dev_t,	dev			)
 		__field(	ino_t,	ino			)
 		__field(       loff_t,	start_pos		)
+		__field(       loff_t,	next_pos		)
 		__field(	 long,	nr_to_write		)
 		__field(	  int,	sync_mode		)
 	),
@@ -500,16 +501,47 @@ TRACE_EVENT(ext4_da_write_pages,
 		__entry->dev		= inode->i_sb->s_dev;
 		__entry->ino		= inode->i_ino;
 		__entry->start_pos	= start_pos;
+		__entry->next_pos	= next_pos;
 		__entry->nr_to_write	= wbc->nr_to_write;
 		__entry->sync_mode	= wbc->sync_mode;
 	),
 
-	TP_printk("dev %d,%d ino %lu start_pos 0x%llx nr_to_write %ld sync_mode %d",
+	TP_printk("dev %d,%d ino %lu start_pos 0x%llx next_pos 0x%llx nr_to_write %ld sync_mode %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  (unsigned long) __entry->ino, __entry->start_pos,
+		  (unsigned long) __entry->ino, __entry->start_pos, __entry->next_pos,
 		  __entry->nr_to_write, __entry->sync_mode)
 );
 
+TRACE_EVENT(ext4_da_write_folios_end,
+	TP_PROTO(struct inode *inode, loff_t start_pos, loff_t next_pos,
+		 struct writeback_control *wbc, int ret),
+
+	TP_ARGS(inode, start_pos, next_pos, wbc, ret),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,	dev			)
+		__field(	ino_t,	ino			)
+		__field(       loff_t,	start_pos		)
+		__field(       loff_t,	next_pos		)
+		__field(	 long,	nr_to_write		)
+		__field(	  int,	ret			)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= inode->i_sb->s_dev;
+		__entry->ino		= inode->i_ino;
+		__entry->start_pos	= start_pos;
+		__entry->next_pos	= next_pos;
+		__entry->nr_to_write	= wbc->nr_to_write;
+		__entry->ret		= ret;
+	),
+
+	TP_printk("dev %d,%d ino %lu start_pos 0x%llx next_pos 0x%llx nr_to_write %ld ret %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long) __entry->ino, __entry->start_pos, __entry->next_pos,
+		  __entry->nr_to_write, __entry->ret)
+);
+
 TRACE_EVENT(ext4_da_write_pages_extent,
 	TP_PROTO(struct inode *inode, struct ext4_map_blocks *map),
 
-- 
cgit v1.2.3


From c3ff7f06c7876bc292cac1c7d4df3d0bfd74f3b7 Mon Sep 17 00:00:00 2001
From: I Viswanath <viswanathiyyappan@gmail.com>
Date: Wed, 9 Jul 2025 20:37:18 +0530
Subject: i2c: Clarify behavior of I2C_M_RD flag

Update the description of I2C_M_RD to clarify that not setting it
signals a write transaction

Signed-off-by: I Viswanath <viswanathiyyappan@gmail.com>
Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
---
 include/uapi/linux/i2c.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/i2c.h b/include/uapi/linux/i2c.h
index 92326ebde350..a2db2a56c8b0 100644
--- a/include/uapi/linux/i2c.h
+++ b/include/uapi/linux/i2c.h
@@ -21,7 +21,8 @@
  *
  * @flags:
  *   Supported by all adapters:
- *   %I2C_M_RD: read data (from slave to master). Guaranteed to be 0x0001!
+ *   %I2C_M_RD: read data (from slave to master). Guaranteed to be 0x0001! If
+ *   not set, the transaction is interpreted as write.
  *
  *   Optional:
  *   %I2C_M_DMA_SAFE: the buffer of this message is DMA safe. Makes only sense
-- 
cgit v1.2.3


From 0df11950099887520cc8bf22a790a5535be30e8d Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Sun, 13 Jul 2025 14:26:25 +0100
Subject: xen: Remove some deadcode (x)

Remove three uncalled functions:

  xenbus_mkdir() was added in 2007 by
commit 4bac07c993d0 ("xen: add the Xenbus sysfs and virtual device hotplug
driver")
but has remained unused.

  xen_get_runstate_snapshot() last use was removed in 2016 by
commit 6ba286ad8457 ("xen: support runqueue steal time on xen")
which replaces the use by the _cpu version.

  xen_resume_notifier_unregister() last use was removed in 2017 by
commit 1914f0cd203c ("xen/acpi: upload PM state from init-domain to Xen")

Remove them.

Signed-off-by: "Dr. David Alan Gilbert" <linux@treblig.org>
Reviewed-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
Message-ID: <20250713132625.164728-1-linux@treblig.org>
---
 drivers/xen/manage.c           |  6 ------
 drivers/xen/time.c             |  8 --------
 drivers/xen/xenbus/xenbus_xs.c | 17 -----------------
 include/xen/xen-ops.h          |  2 --
 include/xen/xenbus.h           |  2 --
 5 files changed, 35 deletions(-)

(limited to 'include')

diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index b4b4ebed68da..841afa4933c7 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -52,12 +52,6 @@ void xen_resume_notifier_register(struct notifier_block *nb)
 }
 EXPORT_SYMBOL_GPL(xen_resume_notifier_register);
 
-void xen_resume_notifier_unregister(struct notifier_block *nb)
-{
-	raw_notifier_chain_unregister(&xen_resume_notifier, nb);
-}
-EXPORT_SYMBOL_GPL(xen_resume_notifier_unregister);
-
 #ifdef CONFIG_HIBERNATE_CALLBACKS
 static int xen_suspend(void *data)
 {
diff --git a/drivers/xen/time.c b/drivers/xen/time.c
index 152dd33bb223..5683383d2305 100644
--- a/drivers/xen/time.c
+++ b/drivers/xen/time.c
@@ -136,14 +136,6 @@ void xen_manage_runstate_time(int action)
 	}
 }
 
-/*
- * Runstate accounting
- */
-void xen_get_runstate_snapshot(struct vcpu_runstate_info *res)
-{
-	xen_get_runstate_snapshot_cpu(res, smp_processor_id());
-}
-
 /* return true when a vcpu could run but has no real cpu to run on */
 bool xen_vcpu_stolen(int vcpu)
 {
diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c
index dcf9182c8451..3c9da446b85d 100644
--- a/drivers/xen/xenbus/xenbus_xs.c
+++ b/drivers/xen/xenbus/xenbus_xs.c
@@ -512,23 +512,6 @@ int xenbus_write(struct xenbus_transaction t,
 }
 EXPORT_SYMBOL_GPL(xenbus_write);
 
-/* Create a new directory. */
-int xenbus_mkdir(struct xenbus_transaction t,
-		 const char *dir, const char *node)
-{
-	char *path;
-	int ret;
-
-	path = join(dir, node);
-	if (IS_ERR(path))
-		return PTR_ERR(path);
-
-	ret = xs_error(xs_single(t, XS_MKDIR, path, NULL));
-	kfree(path);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(xenbus_mkdir);
-
 /* Destroy a file or directory (directories must be empty). */
 int xenbus_rm(struct xenbus_transaction t, const char *dir, const char *node)
 {
diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
index 47f11bec5e90..9e2a769b0d96 100644
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
@@ -30,13 +30,11 @@ void xen_arch_suspend(void);
 void xen_reboot(int reason);
 
 void xen_resume_notifier_register(struct notifier_block *nb);
-void xen_resume_notifier_unregister(struct notifier_block *nb);
 
 bool xen_vcpu_stolen(int vcpu);
 void xen_setup_runstate_info(int cpu);
 void xen_time_setup_guest(void);
 void xen_manage_runstate_time(int action);
-void xen_get_runstate_snapshot(struct vcpu_runstate_info *res);
 u64 xen_steal_clock(int cpu);
 
 int xen_setup_shutdown_event(void);
diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h
index 3f90bdd387b6..1c23e6387f13 100644
--- a/include/xen/xenbus.h
+++ b/include/xen/xenbus.h
@@ -154,8 +154,6 @@ void *xenbus_read(struct xenbus_transaction t,
 		  const char *dir, const char *node, unsigned int *len);
 int xenbus_write(struct xenbus_transaction t,
 		 const char *dir, const char *node, const char *string);
-int xenbus_mkdir(struct xenbus_transaction t,
-		 const char *dir, const char *node);
 int xenbus_exists(struct xenbus_transaction t,
 		  const char *dir, const char *node);
 int xenbus_rm(struct xenbus_transaction t, const char *dir, const char *node);
-- 
cgit v1.2.3


From 67fd9615a782b11cd0c62823d722a815c9e1eb75 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 10 Jul 2025 15:33:26 +0200
Subject: iomap: pass more arguments using the iomap writeback context

Add inode and wpc fields to pass the inode and writeback context that
are needed in the entire writeback call chain, and let the callers
initialize all fields in the writeback context before calling
iomap_writepages to simplify the argument passing.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/20250710133343.399917-3-hch@lst.de
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Joanne Koong <joannelkoong@gmail.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 block/fops.c           |  8 ++++++--
 fs/gfs2/aops.c         |  8 ++++++--
 fs/iomap/buffered-io.c | 52 ++++++++++++++++++++++----------------------------
 fs/xfs/xfs_aops.c      | 24 ++++++++++++++++-------
 fs/zonefs/file.c       |  8 ++++++--
 include/linux/iomap.h  |  6 +++---
 6 files changed, 61 insertions(+), 45 deletions(-)

(limited to 'include')

diff --git a/block/fops.c b/block/fops.c
index 1309861d4c2c..3394263d942b 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -558,9 +558,13 @@ static const struct iomap_writeback_ops blkdev_writeback_ops = {
 static int blkdev_writepages(struct address_space *mapping,
 		struct writeback_control *wbc)
 {
-	struct iomap_writepage_ctx wpc = { };
+	struct iomap_writepage_ctx wpc = {
+		.inode		= mapping->host,
+		.wbc		= wbc,
+		.ops		= &blkdev_writeback_ops
+	};
 
-	return iomap_writepages(mapping, wbc, &wpc, &blkdev_writeback_ops);
+	return iomap_writepages(&wpc);
 }
 
 const struct address_space_operations def_blk_aops = {
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 14f204cd5a82..47d74afd63ac 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -159,7 +159,11 @@ static int gfs2_writepages(struct address_space *mapping,
 			   struct writeback_control *wbc)
 {
 	struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
-	struct iomap_writepage_ctx wpc = { };
+	struct iomap_writepage_ctx wpc = {
+		.inode		= mapping->host,
+		.wbc		= wbc,
+		.ops		= &gfs2_writeback_ops,
+	};
 	int ret;
 
 	/*
@@ -168,7 +172,7 @@ static int gfs2_writepages(struct address_space *mapping,
 	 * want balance_dirty_pages() to loop indefinitely trying to write out
 	 * pages held in the ail that it can't find.
 	 */
-	ret = iomap_writepages(mapping, wbc, &wpc, &gfs2_writeback_ops);
+	ret = iomap_writepages(&wpc);
 	if (ret == 0 && wbc->nr_to_write > 0)
 		set_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags);
 	return ret;
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index bbd722365404..77d44b691b81 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1618,20 +1618,19 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
 }
 
 static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
-		struct writeback_control *wbc, struct inode *inode, loff_t pos,
-		u16 ioend_flags)
+		loff_t pos, u16 ioend_flags)
 {
 	struct bio *bio;
 
 	bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
-			       REQ_OP_WRITE | wbc_to_write_flags(wbc),
+			       REQ_OP_WRITE | wbc_to_write_flags(wpc->wbc),
 			       GFP_NOFS, &iomap_ioend_bioset);
 	bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
 	bio->bi_end_io = iomap_writepage_end_bio;
-	bio->bi_write_hint = inode->i_write_hint;
-	wbc_init_bio(wbc, bio);
+	bio->bi_write_hint = wpc->inode->i_write_hint;
+	wbc_init_bio(wpc->wbc, bio);
 	wpc->nr_folios = 0;
-	return iomap_init_ioend(inode, bio, pos, ioend_flags);
+	return iomap_init_ioend(wpc->inode, bio, pos, ioend_flags);
 }
 
 static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos,
@@ -1670,9 +1669,7 @@ static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos,
  * writepage context that the caller will need to submit.
  */
 static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
-		struct writeback_control *wbc, struct folio *folio,
-		struct inode *inode, loff_t pos, loff_t end_pos,
-		unsigned len)
+		struct folio *folio, loff_t pos, loff_t end_pos, unsigned len)
 {
 	struct iomap_folio_state *ifs = folio->private;
 	size_t poff = offset_in_folio(folio, pos);
@@ -1693,8 +1690,7 @@ new_ioend:
 		error = iomap_submit_ioend(wpc, 0);
 		if (error)
 			return error;
-		wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos,
-				ioend_flags);
+		wpc->ioend = iomap_alloc_ioend(wpc, pos, ioend_flags);
 	}
 
 	if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff))
@@ -1748,24 +1744,24 @@ new_ioend:
 	if (wpc->ioend->io_offset + wpc->ioend->io_size > end_pos)
 		wpc->ioend->io_size = end_pos - wpc->ioend->io_offset;
 
-	wbc_account_cgroup_owner(wbc, folio, len);
+	wbc_account_cgroup_owner(wpc->wbc, folio, len);
 	return 0;
 }
 
 static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
-		struct writeback_control *wbc, struct folio *folio,
-		struct inode *inode, u64 pos, u64 end_pos,
-		unsigned dirty_len, unsigned *count)
+		struct folio *folio, u64 pos, u64 end_pos, unsigned dirty_len,
+		unsigned *count)
 {
 	int error;
 
 	do {
 		unsigned map_len;
 
-		error = wpc->ops->map_blocks(wpc, inode, pos, dirty_len);
+		error = wpc->ops->map_blocks(wpc, wpc->inode, pos, dirty_len);
 		if (error)
 			break;
-		trace_iomap_writepage_map(inode, pos, dirty_len, &wpc->iomap);
+		trace_iomap_writepage_map(wpc->inode, pos, dirty_len,
+				&wpc->iomap);
 
 		map_len = min_t(u64, dirty_len,
 			wpc->iomap.offset + wpc->iomap.length - pos);
@@ -1779,8 +1775,8 @@ static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
 		case IOMAP_HOLE:
 			break;
 		default:
-			error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos,
-					end_pos, map_len);
+			error = iomap_add_to_ioend(wpc, folio, pos, end_pos,
+					map_len);
 			if (!error)
 				(*count)++;
 			break;
@@ -1862,10 +1858,10 @@ static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode,
 }
 
 static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
-		struct writeback_control *wbc, struct folio *folio)
+		struct folio *folio)
 {
 	struct iomap_folio_state *ifs = folio->private;
-	struct inode *inode = folio->mapping->host;
+	struct inode *inode = wpc->inode;
 	u64 pos = folio_pos(folio);
 	u64 end_pos = pos + folio_size(folio);
 	u64 end_aligned = 0;
@@ -1912,8 +1908,8 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 	 */
 	end_aligned = round_up(end_pos, i_blocksize(inode));
 	while ((rlen = iomap_find_dirty_range(folio, &pos, end_aligned))) {
-		error = iomap_writepage_map_blocks(wpc, wbc, folio, inode,
-				pos, end_pos, rlen, &count);
+		error = iomap_writepage_map_blocks(wpc, folio, pos, end_pos,
+				rlen, &count);
 		if (error)
 			break;
 		pos += rlen;
@@ -1949,10 +1945,9 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 }
 
 int
-iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
-		struct iomap_writepage_ctx *wpc,
-		const struct iomap_writeback_ops *ops)
+iomap_writepages(struct iomap_writepage_ctx *wpc)
 {
+	struct address_space *mapping = wpc->inode->i_mapping;
 	struct folio *folio = NULL;
 	int error;
 
@@ -1964,9 +1959,8 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
 			PF_MEMALLOC))
 		return -EIO;
 
-	wpc->ops = ops;
-	while ((folio = writeback_iter(mapping, wbc, folio, &error)))
-		error = iomap_writepage_map(wpc, wbc, folio);
+	while ((folio = writeback_iter(mapping, wpc->wbc, folio, &error)))
+		error = iomap_writepage_map(wpc, folio);
 	return iomap_submit_ioend(wpc, error);
 }
 EXPORT_SYMBOL_GPL(iomap_writepages);
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 63151feb9c3f..65485a52df3b 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -636,19 +636,29 @@ xfs_vm_writepages(
 	xfs_iflags_clear(ip, XFS_ITRUNCATED);
 
 	if (xfs_is_zoned_inode(ip)) {
-		struct xfs_zoned_writepage_ctx	xc = { };
+		struct xfs_zoned_writepage_ctx	xc = {
+			.ctx = {
+				.inode	= mapping->host,
+				.wbc	= wbc,
+				.ops	= &xfs_zoned_writeback_ops
+			},
+		};
 		int				error;
 
-		error = iomap_writepages(mapping, wbc, &xc.ctx,
-					 &xfs_zoned_writeback_ops);
+		error = iomap_writepages(&xc.ctx);
 		if (xc.open_zone)
 			xfs_open_zone_put(xc.open_zone);
 		return error;
 	} else {
-		struct xfs_writepage_ctx	wpc = { };
-
-		return iomap_writepages(mapping, wbc, &wpc.ctx,
-				&xfs_writeback_ops);
+		struct xfs_writepage_ctx	wpc = {
+			.ctx = {
+				.inode	= mapping->host,
+				.wbc	= wbc,
+				.ops	= &xfs_writeback_ops
+			},
+		};
+
+		return iomap_writepages(&wpc.ctx);
 	}
 }
 
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index 42e2c0065bb3..edca4bbe4b72 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -152,9 +152,13 @@ static const struct iomap_writeback_ops zonefs_writeback_ops = {
 static int zonefs_writepages(struct address_space *mapping,
 			     struct writeback_control *wbc)
 {
-	struct iomap_writepage_ctx wpc = { };
+	struct iomap_writepage_ctx wpc = {
+		.inode		= mapping->host,
+		.wbc		= wbc,
+		.ops		= &zonefs_writeback_ops,
+	};
 
-	return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops);
+	return iomap_writepages(&wpc);
 }
 
 static int zonefs_swap_activate(struct swap_info_struct *sis,
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 522644d62f30..00179c9387c5 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -448,6 +448,8 @@ struct iomap_writeback_ops {
 
 struct iomap_writepage_ctx {
 	struct iomap		iomap;
+	struct inode		*inode;
+	struct writeback_control *wbc;
 	struct iomap_ioend	*ioend;
 	const struct iomap_writeback_ops *ops;
 	u32			nr_folios;	/* folios added to the ioend */
@@ -461,9 +463,7 @@ void iomap_finish_ioends(struct iomap_ioend *ioend, int error);
 void iomap_ioend_try_merge(struct iomap_ioend *ioend,
 		struct list_head *more_ioends);
 void iomap_sort_ioends(struct list_head *ioend_list);
-int iomap_writepages(struct address_space *mapping,
-		struct writeback_control *wbc, struct iomap_writepage_ctx *wpc,
-		const struct iomap_writeback_ops *ops);
+int iomap_writepages(struct iomap_writepage_ctx *wpc);
 
 /*
  * Flags for direct I/O ->end_io:
-- 
cgit v1.2.3


From fb7399cf2d0b33825b8039f95c45395c7deba25c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 10 Jul 2025 15:33:28 +0200
Subject: iomap: refactor the writeback interface

Replace ->map_blocks with a new ->writeback_range, which differs in the
following ways:

 - it must also queue up the I/O for writeback, i.e. call into the
   slightly refactored and extended-in-scope iomap_add_to_ioend for
   each region
 - can handle only a part of the requested region, that is the retry
   loop for partial mappings moves to the caller
 - handles cleanup on failures as well, and thus also replaces the
   discard_folio method only implemented by XFS.

This will allow the iomap writeback code to also be used by file systems
that are not block based, like fuse.

Co-developed-by: Joanne Koong <joannelkoong@gmail.com>
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/20250710133343.399917-5-hch@lst.de
Acked-by: Damien Le Moal <dlemoal@kernel.org>	# zonefs
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 Documentation/filesystems/iomap/operations.rst |  32 +++----
 block/fops.c                                   |  25 +++--
 fs/gfs2/bmap.c                                 |  26 ++---
 fs/iomap/buffered-io.c                         |  96 +++++++++----------
 fs/iomap/trace.h                               |   2 +-
 fs/xfs/xfs_aops.c                              | 128 ++++++++++++++++---------
 fs/zonefs/file.c                               |  28 +++---
 include/linux/iomap.h                          |  21 ++--
 8 files changed, 197 insertions(+), 161 deletions(-)

(limited to 'include')

diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst
index 3b628e370d88..f07c8fdb2046 100644
--- a/Documentation/filesystems/iomap/operations.rst
+++ b/Documentation/filesystems/iomap/operations.rst
@@ -271,7 +271,7 @@ writeback.
 It does not lock ``i_rwsem`` or ``invalidate_lock``.
 
 The dirty bit will be cleared for all folios run through the
-``->map_blocks`` machinery described below even if the writeback fails.
+``->writeback_range`` machinery described below even if the writeback fails.
 This is to prevent dirty folio clots when storage devices fail; an
 ``-EIO`` is recorded for userspace to collect via ``fsync``.
 
@@ -283,15 +283,14 @@ The ``ops`` structure must be specified and is as follows:
 .. code-block:: c
 
  struct iomap_writeback_ops {
-     int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode,
-                       loff_t offset, unsigned len);
-     int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status);
-     void (*discard_folio)(struct folio *folio, loff_t pos);
+    int (*writeback_range)(struct iomap_writepage_ctx *wpc,
+         struct folio *folio, u64 pos, unsigned int len, u64 end_pos);
+    int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status);
  };
 
 The fields are as follows:
 
-  - ``map_blocks``: Sets ``wpc->iomap`` to the space mapping of the file
+  - ``writeback_range``: Sets ``wpc->iomap`` to the space mapping of the file
     range (in bytes) given by ``offset`` and ``len``.
     iomap calls this function for each dirty fs block in each dirty folio,
     though it will `reuse mappings
@@ -306,6 +305,15 @@ The fields are as follows:
     This revalidation must be open-coded by the filesystem; it is
     unclear if ``iomap::validity_cookie`` can be reused for this
     purpose.
+
+    If this method fails to schedule I/O for any part of a dirty folio, it
+    should throw away any reservations that may have been made for the write.
+    The folio will be marked clean and an ``-EIO`` recorded in the
+    pagecache.
+    Filesystems can use this callback to `remove
+    <https://lore.kernel.org/all/20201029163313.1766967-1-bfoster@redhat.com/>`_
+    delalloc reservations to avoid having delalloc reservations for
+    clean pagecache.
     This function must be supplied by the filesystem.
 
   - ``submit_ioend``: Allows the file systems to hook into writeback bio
@@ -316,18 +324,6 @@ The fields are as follows:
     transactions from process context before submitting the bio.
     This function is optional.
 
-  - ``discard_folio``: iomap calls this function after ``->map_blocks``
-    fails to schedule I/O for any part of a dirty folio.
-    The function should throw away any reservations that may have been
-    made for the write.
-    The folio will be marked clean and an ``-EIO`` recorded in the
-    pagecache.
-    Filesystems can use this callback to `remove
-    <https://lore.kernel.org/all/20201029163313.1766967-1-bfoster@redhat.com/>`_
-    delalloc reservations to avoid having delalloc reservations for
-    clean pagecache.
-    This function is optional.
-
 Pagecache Writeback Completion
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/block/fops.c b/block/fops.c
index 3394263d942b..b500ff8f55dd 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -537,22 +537,29 @@ static void blkdev_readahead(struct readahead_control *rac)
 	iomap_readahead(rac, &blkdev_iomap_ops);
 }
 
-static int blkdev_map_blocks(struct iomap_writepage_ctx *wpc,
-		struct inode *inode, loff_t offset, unsigned int len)
+static ssize_t blkdev_writeback_range(struct iomap_writepage_ctx *wpc,
+		struct folio *folio, u64 offset, unsigned int len, u64 end_pos)
 {
-	loff_t isize = i_size_read(inode);
+	loff_t isize = i_size_read(wpc->inode);
 
 	if (WARN_ON_ONCE(offset >= isize))
 		return -EIO;
-	if (offset >= wpc->iomap.offset &&
-	    offset < wpc->iomap.offset + wpc->iomap.length)
-		return 0;
-	return blkdev_iomap_begin(inode, offset, isize - offset,
-				  IOMAP_WRITE, &wpc->iomap, NULL);
+
+	if (offset < wpc->iomap.offset ||
+	    offset >= wpc->iomap.offset + wpc->iomap.length) {
+		int error;
+
+		error = blkdev_iomap_begin(wpc->inode, offset, isize - offset,
+				IOMAP_WRITE, &wpc->iomap, NULL);
+		if (error)
+			return error;
+	}
+
+	return iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
 }
 
 static const struct iomap_writeback_ops blkdev_writeback_ops = {
-	.map_blocks		= blkdev_map_blocks,
+	.writeback_range	= blkdev_writeback_range,
 };
 
 static int blkdev_writepages(struct address_space *mapping,
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 7703d0471139..0cc41de54aba 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -2469,23 +2469,25 @@ out:
 	return error;
 }
 
-static int gfs2_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode,
-		loff_t offset, unsigned int len)
+static ssize_t gfs2_writeback_range(struct iomap_writepage_ctx *wpc,
+		struct folio *folio, u64 offset, unsigned int len, u64 end_pos)
 {
-	int ret;
-
-	if (WARN_ON_ONCE(gfs2_is_stuffed(GFS2_I(inode))))
+	if (WARN_ON_ONCE(gfs2_is_stuffed(GFS2_I(wpc->inode))))
 		return -EIO;
 
-	if (offset >= wpc->iomap.offset &&
-	    offset < wpc->iomap.offset + wpc->iomap.length)
-		return 0;
+	if (offset < wpc->iomap.offset ||
+	    offset >= wpc->iomap.offset + wpc->iomap.length) {
+		int ret;
 
-	memset(&wpc->iomap, 0, sizeof(wpc->iomap));
-	ret = gfs2_iomap_get(inode, offset, INT_MAX, &wpc->iomap);
-	return ret;
+		memset(&wpc->iomap, 0, sizeof(wpc->iomap));
+		ret = gfs2_iomap_get(wpc->inode, offset, INT_MAX, &wpc->iomap);
+		if (ret)
+			return ret;
+	}
+
+	return iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
 }
 
 const struct iomap_writeback_ops gfs2_writeback_ops = {
-	.map_blocks		= gfs2_map_blocks,
+	.writeback_range	= gfs2_writeback_range,
 };
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 93b2a90e6867..c558ac15bc87 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1668,14 +1668,30 @@ static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos,
  * At the end of a writeback pass, there will be a cached ioend remaining on the
  * writepage context that the caller will need to submit.
  */
-static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
-		struct folio *folio, loff_t pos, loff_t end_pos, unsigned len)
+ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
+		loff_t pos, loff_t end_pos, unsigned int dirty_len)
 {
 	struct iomap_folio_state *ifs = folio->private;
 	size_t poff = offset_in_folio(folio, pos);
 	unsigned int ioend_flags = 0;
+	unsigned int map_len = min_t(u64, dirty_len,
+		wpc->iomap.offset + wpc->iomap.length - pos);
 	int error;
 
+	trace_iomap_add_to_ioend(wpc->inode, pos, dirty_len, &wpc->iomap);
+
+	WARN_ON_ONCE(!folio->private && map_len < dirty_len);
+
+	switch (wpc->iomap.type) {
+	case IOMAP_INLINE:
+		WARN_ON_ONCE(1);
+		return -EIO;
+	case IOMAP_HOLE:
+		return map_len;
+	default:
+		break;
+	}
+
 	if (wpc->iomap.type == IOMAP_UNWRITTEN)
 		ioend_flags |= IOMAP_IOEND_UNWRITTEN;
 	if (wpc->iomap.flags & IOMAP_F_SHARED)
@@ -1693,11 +1709,11 @@ new_ioend:
 		wpc->ioend = iomap_alloc_ioend(wpc, pos, ioend_flags);
 	}
 
-	if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff))
+	if (!bio_add_folio(&wpc->ioend->io_bio, folio, map_len, poff))
 		goto new_ioend;
 
 	if (ifs)
-		atomic_add(len, &ifs->write_bytes_pending);
+		atomic_add(map_len, &ifs->write_bytes_pending);
 
 	/*
 	 * Clamp io_offset and io_size to the incore EOF so that ondisk
@@ -1740,63 +1756,39 @@ new_ioend:
 	 * Note that this defeats the ability to chain the ioends of
 	 * appending writes.
 	 */
-	wpc->ioend->io_size += len;
+	wpc->ioend->io_size += map_len;
 	if (wpc->ioend->io_offset + wpc->ioend->io_size > end_pos)
 		wpc->ioend->io_size = end_pos - wpc->ioend->io_offset;
 
-	wbc_account_cgroup_owner(wpc->wbc, folio, len);
-	return 0;
+	wbc_account_cgroup_owner(wpc->wbc, folio, map_len);
+	return map_len;
 }
+EXPORT_SYMBOL_GPL(iomap_add_to_ioend);
 
-static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
-		struct folio *folio, u64 pos, u64 end_pos, unsigned dirty_len,
+static int iomap_writeback_range(struct iomap_writepage_ctx *wpc,
+		struct folio *folio, u64 pos, u32 rlen, u64 end_pos,
 		bool *wb_pending)
 {
-	int error;
-
 	do {
-		unsigned map_len;
-
-		error = wpc->ops->map_blocks(wpc, wpc->inode, pos, dirty_len);
-		if (error)
-			break;
-		trace_iomap_writepage_map(wpc->inode, pos, dirty_len,
-				&wpc->iomap);
+		ssize_t ret;
 
-		map_len = min_t(u64, dirty_len,
-			wpc->iomap.offset + wpc->iomap.length - pos);
-		WARN_ON_ONCE(!folio->private && map_len < dirty_len);
+		ret = wpc->ops->writeback_range(wpc, folio, pos, rlen, end_pos);
+		if (WARN_ON_ONCE(ret == 0 || ret > rlen))
+			return -EIO;
+		if (ret < 0)
+			return ret;
+		rlen -= ret;
+		pos += ret;
 
-		switch (wpc->iomap.type) {
-		case IOMAP_INLINE:
-			WARN_ON_ONCE(1);
-			error = -EIO;
-			break;
-		case IOMAP_HOLE:
-			break;
-		default:
-			error = iomap_add_to_ioend(wpc, folio, pos, end_pos,
-					map_len);
-			if (!error)
-				*wb_pending = true;
-			break;
-		}
-		dirty_len -= map_len;
-		pos += map_len;
-	} while (dirty_len && !error);
+		/*
+		 * Holes are not written back by ->writeback_range, so track
+		 * if we did handle anything that is not a hole here.
+		 */
+		if (wpc->iomap.type != IOMAP_HOLE)
+			*wb_pending = true;
+	} while (rlen);
 
-	/*
-	 * We cannot cancel the ioend directly here on error.  We may have
-	 * already set other pages under writeback and hence we have to run I/O
-	 * completion to mark the error state of the pages under writeback
-	 * appropriately.
-	 *
-	 * Just let the file system know what portion of the folio failed to
-	 * map.
-	 */
-	if (error && wpc->ops->discard_folio)
-		wpc->ops->discard_folio(folio, pos);
-	return error;
+	return 0;
 }
 
 /*
@@ -1908,8 +1900,8 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 	 */
 	end_aligned = round_up(end_pos, i_blocksize(inode));
 	while ((rlen = iomap_find_dirty_range(folio, &pos, end_aligned))) {
-		error = iomap_writepage_map_blocks(wpc, folio, pos, end_pos,
-				rlen, &wb_pending);
+		error = iomap_writeback_range(wpc, folio, pos, rlen, end_pos,
+				&wb_pending);
 		if (error)
 			break;
 		pos += rlen;
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index 455cc6f90be0..aaea02c9560a 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -169,7 +169,7 @@ DEFINE_EVENT(iomap_class, name,	\
 DEFINE_IOMAP_EVENT(iomap_iter_dstmap);
 DEFINE_IOMAP_EVENT(iomap_iter_srcmap);
 
-TRACE_EVENT(iomap_writepage_map,
+TRACE_EVENT(iomap_add_to_ioend,
 	TP_PROTO(struct inode *inode, u64 pos, unsigned int dirty_len,
 		 struct iomap *iomap),
 	TP_ARGS(inode, pos, dirty_len, iomap),
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 65485a52df3b..f6d44ab78442 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -233,6 +233,47 @@ xfs_end_bio(
 	spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
 }
 
+/*
+ * We cannot cancel the ioend directly on error.  We may have already set other
+ * pages under writeback and hence we have to run I/O completion to mark the
+ * error state of the pages under writeback appropriately.
+ *
+ * If the folio has delalloc blocks on it, the caller is asking us to punch them
+ * out. If we don't, we can leave a stale delalloc mapping covered by a clean
+ * page that needs to be dirtied again before the delalloc mapping can be
+ * converted. This stale delalloc mapping can trip up a later direct I/O read
+ * operation on the same region.
+ *
+ * We prevent this by truncating away the delalloc regions on the folio. Because
+ * they are delalloc, we can do this without needing a transaction. Indeed - if
+ * we get ENOSPC errors, we have to be able to do this truncation without a
+ * transaction as there is no space left for block reservation (typically why
+ * we see a ENOSPC in writeback).
+ */
+static void
+xfs_discard_folio(
+	struct folio		*folio,
+	loff_t			pos)
+{
+	struct xfs_inode	*ip = XFS_I(folio->mapping->host);
+	struct xfs_mount	*mp = ip->i_mount;
+
+	if (xfs_is_shutdown(mp))
+		return;
+
+	xfs_alert_ratelimited(mp,
+		"page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
+			folio, ip->i_ino, pos);
+
+	/*
+	 * The end of the punch range is always the offset of the first
+	 * byte of the next folio. Hence the end offset is only dependent on the
+	 * folio itself and not the start offset that is passed in.
+	 */
+	xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
+				folio_pos(folio) + folio_size(folio), NULL);
+}
+
 /*
  * Fast revalidation of the cached writeback mapping. Return true if the current
  * mapping is valid, false otherwise.
@@ -278,13 +319,12 @@ xfs_imap_valid(
 static int
 xfs_map_blocks(
 	struct iomap_writepage_ctx *wpc,
-	struct inode		*inode,
 	loff_t			offset,
 	unsigned int		len)
 {
-	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_inode	*ip = XFS_I(wpc->inode);
 	struct xfs_mount	*mp = ip->i_mount;
-	ssize_t			count = i_blocksize(inode);
+	ssize_t			count = i_blocksize(wpc->inode);
 	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
 	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + count);
 	xfs_fileoff_t		cow_fsb;
@@ -436,6 +476,24 @@ allocate_blocks:
 	return 0;
 }
 
+static ssize_t
+xfs_writeback_range(
+	struct iomap_writepage_ctx *wpc,
+	struct folio		*folio,
+	u64			offset,
+	unsigned int		len,
+	u64			end_pos)
+{
+	ssize_t			ret;
+
+	ret = xfs_map_blocks(wpc, offset, len);
+	if (!ret)
+		ret = iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
+	if (ret < 0)
+		xfs_discard_folio(folio, offset);
+	return ret;
+}
+
 static bool
 xfs_ioend_needs_wq_completion(
 	struct iomap_ioend	*ioend)
@@ -488,47 +546,9 @@ xfs_submit_ioend(
 	return 0;
 }
 
-/*
- * If the folio has delalloc blocks on it, the caller is asking us to punch them
- * out. If we don't, we can leave a stale delalloc mapping covered by a clean
- * page that needs to be dirtied again before the delalloc mapping can be
- * converted. This stale delalloc mapping can trip up a later direct I/O read
- * operation on the same region.
- *
- * We prevent this by truncating away the delalloc regions on the folio. Because
- * they are delalloc, we can do this without needing a transaction. Indeed - if
- * we get ENOSPC errors, we have to be able to do this truncation without a
- * transaction as there is no space left for block reservation (typically why
- * we see a ENOSPC in writeback).
- */
-static void
-xfs_discard_folio(
-	struct folio		*folio,
-	loff_t			pos)
-{
-	struct xfs_inode	*ip = XFS_I(folio->mapping->host);
-	struct xfs_mount	*mp = ip->i_mount;
-
-	if (xfs_is_shutdown(mp))
-		return;
-
-	xfs_alert_ratelimited(mp,
-		"page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
-			folio, ip->i_ino, pos);
-
-	/*
-	 * The end of the punch range is always the offset of the first
-	 * byte of the next folio. Hence the end offset is only dependent on the
-	 * folio itself and not the start offset that is passed in.
-	 */
-	xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
-				folio_pos(folio) + folio_size(folio), NULL);
-}
-
 static const struct iomap_writeback_ops xfs_writeback_ops = {
-	.map_blocks		= xfs_map_blocks,
+	.writeback_range	= xfs_writeback_range,
 	.submit_ioend		= xfs_submit_ioend,
-	.discard_folio		= xfs_discard_folio,
 };
 
 struct xfs_zoned_writepage_ctx {
@@ -545,11 +565,10 @@ XFS_ZWPC(struct iomap_writepage_ctx *ctx)
 static int
 xfs_zoned_map_blocks(
 	struct iomap_writepage_ctx *wpc,
-	struct inode		*inode,
 	loff_t			offset,
 	unsigned int		len)
 {
-	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_inode	*ip = XFS_I(wpc->inode);
 	struct xfs_mount	*mp = ip->i_mount;
 	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
 	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + len);
@@ -608,6 +627,24 @@ xfs_zoned_map_blocks(
 	return 0;
 }
 
+static ssize_t
+xfs_zoned_writeback_range(
+	struct iomap_writepage_ctx *wpc,
+	struct folio		*folio,
+	u64			offset,
+	unsigned int		len,
+	u64			end_pos)
+{
+	ssize_t			ret;
+
+	ret = xfs_zoned_map_blocks(wpc, offset, len);
+	if (!ret)
+		ret = iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
+	if (ret < 0)
+		xfs_discard_folio(folio, offset);
+	return ret;
+}
+
 static int
 xfs_zoned_submit_ioend(
 	struct iomap_writepage_ctx *wpc,
@@ -621,9 +658,8 @@ xfs_zoned_submit_ioend(
 }
 
 static const struct iomap_writeback_ops xfs_zoned_writeback_ops = {
-	.map_blocks		= xfs_zoned_map_blocks,
+	.writeback_range	= xfs_zoned_writeback_range,
 	.submit_ioend		= xfs_zoned_submit_ioend,
-	.discard_folio		= xfs_discard_folio,
 };
 
 STATIC int
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index edca4bbe4b72..c88e2c851753 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -124,29 +124,33 @@ static void zonefs_readahead(struct readahead_control *rac)
  * Map blocks for page writeback. This is used only on conventional zone files,
  * which implies that the page range can only be within the fixed inode size.
  */
-static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc,
-				   struct inode *inode, loff_t offset,
-				   unsigned int len)
+static ssize_t zonefs_writeback_range(struct iomap_writepage_ctx *wpc,
+		struct folio *folio, u64 offset, unsigned len, u64 end_pos)
 {
-	struct zonefs_zone *z = zonefs_inode_zone(inode);
+	struct zonefs_zone *z = zonefs_inode_zone(wpc->inode);
 
 	if (WARN_ON_ONCE(zonefs_zone_is_seq(z)))
 		return -EIO;
-	if (WARN_ON_ONCE(offset >= i_size_read(inode)))
+	if (WARN_ON_ONCE(offset >= i_size_read(wpc->inode)))
 		return -EIO;
 
 	/* If the mapping is already OK, nothing needs to be done */
-	if (offset >= wpc->iomap.offset &&
-	    offset < wpc->iomap.offset + wpc->iomap.length)
-		return 0;
+	if (offset < wpc->iomap.offset ||
+	    offset >= wpc->iomap.offset + wpc->iomap.length) {
+		int error;
+
+		error = zonefs_write_iomap_begin(wpc->inode, offset,
+				z->z_capacity - offset, IOMAP_WRITE,
+				&wpc->iomap, NULL);
+		if (error)
+			return error;
+	}
 
-	return zonefs_write_iomap_begin(inode, offset,
-					z->z_capacity - offset,
-					IOMAP_WRITE, &wpc->iomap, NULL);
+	return iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
 }
 
 static const struct iomap_writeback_ops zonefs_writeback_ops = {
-	.map_blocks		= zonefs_write_map_blocks,
+	.writeback_range	= zonefs_writeback_range,
 };
 
 static int zonefs_writepages(struct address_space *mapping,
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 00179c9387c5..625d7911a2b5 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -416,18 +416,20 @@ static inline struct iomap_ioend *iomap_ioend_from_bio(struct bio *bio)
 
 struct iomap_writeback_ops {
 	/*
-	 * Required, maps the blocks so that writeback can be performed on
-	 * the range starting at offset.
+	 * Required, performs writeback on the passed in range
 	 *
-	 * Can return arbitrarily large regions, but we need to call into it at
+	 * Can map arbitrarily large regions, but we need to call into it at
 	 * least once per folio to allow the file systems to synchronize with
 	 * the write path that could be invalidating mappings.
 	 *
 	 * An existing mapping from a previous call to this method can be reused
 	 * by the file system if it is still valid.
+	 *
+	 * Returns the number of bytes processed or a negative errno.
 	 */
-	int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode,
-			  loff_t offset, unsigned len);
+	ssize_t (*writeback_range)(struct iomap_writepage_ctx *wpc,
+			struct folio *folio, u64 pos, unsigned int len,
+			u64 end_pos);
 
 	/*
 	 * Optional, allows the file systems to hook into bio submission,
@@ -438,12 +440,6 @@ struct iomap_writeback_ops {
 	 * the bio could not be submitted.
 	 */
 	int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status);
-
-	/*
-	 * Optional, allows the file system to discard state on a page where
-	 * we failed to submit any I/O.
-	 */
-	void (*discard_folio)(struct folio *folio, loff_t pos);
 };
 
 struct iomap_writepage_ctx {
@@ -463,6 +459,9 @@ void iomap_finish_ioends(struct iomap_ioend *ioend, int error);
 void iomap_ioend_try_merge(struct iomap_ioend *ioend,
 		struct list_head *more_ioends);
 void iomap_sort_ioends(struct list_head *ioend_list);
+ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
+		loff_t pos, loff_t end_pos, unsigned int dirty_len);
+
 int iomap_writepages(struct iomap_writepage_ctx *wpc);
 
 /*
-- 
cgit v1.2.3


From f4fa7981fa26c664cc540cbce9bcb7ffe02a8912 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 10 Jul 2025 15:33:29 +0200
Subject: iomap: hide ioends from the generic writeback code

Replace the ioend pointer in iomap_writepage_ctx with a void *wb_ctx
one to facilitate non-block, non-ioend writeback for fuse.  Rename
the submit_ioend method to writeback_submit and make it mandatory so
that the generic writeback code stops seeing ioends and bios.

Co-developed-by: Joanne Koong <joannelkoong@gmail.com>
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/20250710133343.399917-6-hch@lst.de
Acked-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 Documentation/filesystems/iomap/operations.rst | 17 ++---
 block/fops.c                                   |  1 +
 fs/gfs2/bmap.c                                 |  1 +
 fs/iomap/buffered-io.c                         | 91 +++++++++++++-------------
 fs/xfs/xfs_aops.c                              | 60 +++++++++--------
 fs/zonefs/file.c                               |  1 +
 include/linux/iomap.h                          | 19 +++---
 7 files changed, 100 insertions(+), 90 deletions(-)

(limited to 'include')

diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst
index f07c8fdb2046..4b93c5f7841a 100644
--- a/Documentation/filesystems/iomap/operations.rst
+++ b/Documentation/filesystems/iomap/operations.rst
@@ -284,8 +284,8 @@ The ``ops`` structure must be specified and is as follows:
 
  struct iomap_writeback_ops {
     int (*writeback_range)(struct iomap_writepage_ctx *wpc,
-         struct folio *folio, u64 pos, unsigned int len, u64 end_pos);
-    int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status);
+        struct folio *folio, u64 pos, unsigned int len, u64 end_pos);
+    int (*writeback_submit)(struct iomap_writepage_ctx *wpc, int error);
  };
 
 The fields are as follows:
@@ -316,13 +316,15 @@ The fields are as follows:
     clean pagecache.
     This function must be supplied by the filesystem.
 
-  - ``submit_ioend``: Allows the file systems to hook into writeback bio
-    submission.
+  - ``writeback_submit``: Submit the previously built writeback context.
+    Block based file systems should use the iomap_ioend_writeback_submit
+    helper; other file systems can implement their own.
+    File systems can optionally hook into writeback bio submission.
     This might include pre-write space accounting updates, or installing
     a custom ``->bi_end_io`` function for internal purposes, such as
     deferring the ioend completion to a workqueue to run metadata update
     transactions from process context before submitting the bio.
-    This function is optional.
+    This function must be supplied by the filesystem.
 
 Pagecache Writeback Completion
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -336,10 +338,9 @@ If the write failed, it will also set the error bits on the folios and
 the address space.
 This can happen in interrupt or process context, depending on the
 storage device.
-
 Filesystems that need to update internal bookkeeping (e.g. unwritten
-extent conversions) should provide a ``->submit_ioend`` function to
-set ``struct iomap_end::bio::bi_end_io`` to its own function.
+extent conversions) should set their own bi_end_io on the bios
+submitted by ``->writeback_submit``.
 This function should call ``iomap_finish_ioends`` after finishing its
 own work (e.g. unwritten extent conversion).
 
diff --git a/block/fops.c b/block/fops.c
index b500ff8f55dd..0845737c0320 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -560,6 +560,7 @@ static ssize_t blkdev_writeback_range(struct iomap_writepage_ctx *wpc,
 
 static const struct iomap_writeback_ops blkdev_writeback_ops = {
 	.writeback_range	= blkdev_writeback_range,
+	.writeback_submit	= iomap_ioend_writeback_submit,
 };
 
 static int blkdev_writepages(struct address_space *mapping,
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 0cc41de54aba..86045d3577b7 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -2490,4 +2490,5 @@ static ssize_t gfs2_writeback_range(struct iomap_writepage_ctx *wpc,
 
 const struct iomap_writeback_ops gfs2_writeback_ops = {
 	.writeback_range	= gfs2_writeback_range,
+	.writeback_submit	= iomap_ioend_writeback_submit,
 };
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index c558ac15bc87..650a67b7d223 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1571,7 +1571,7 @@ u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend)
 	return folio_count;
 }
 
-static void iomap_writepage_end_bio(struct bio *bio)
+static void ioend_writeback_end_bio(struct bio *bio)
 {
 	struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
 
@@ -1580,42 +1580,30 @@ static void iomap_writepage_end_bio(struct bio *bio)
 }
 
 /*
- * Submit an ioend.
- *
- * If @error is non-zero, it means that we have a situation where some part of
- * the submission process has failed after we've marked pages for writeback.
- * We cannot cancel ioend directly in that case, so call the bio end I/O handler
- * with the error status here to run the normal I/O completion handler to clear
- * the writeback bit and let the file system proess the errors.
+ * We cannot cancel the ioend directly in case of an error, so call the bio end
+ * I/O handler with the error status here to run the normal I/O completion
+ * handler.
  */
-static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
+int iomap_ioend_writeback_submit(struct iomap_writepage_ctx *wpc, int error)
 {
-	if (!wpc->ioend)
-		return error;
+	struct iomap_ioend *ioend = wpc->wb_ctx;
 
-	/*
-	 * Let the file systems prepare the I/O submission and hook in an I/O
-	 * comletion handler.  This also needs to happen in case after a
-	 * failure happened so that the file system end I/O handler gets called
-	 * to clean up.
-	 */
-	if (wpc->ops->submit_ioend) {
-		error = wpc->ops->submit_ioend(wpc, error);
-	} else {
-		if (WARN_ON_ONCE(wpc->iomap.flags & IOMAP_F_ANON_WRITE))
-			error = -EIO;
-		if (!error)
-			submit_bio(&wpc->ioend->io_bio);
-	}
+	if (!ioend->io_bio.bi_end_io)
+		ioend->io_bio.bi_end_io = ioend_writeback_end_bio;
+
+	if (WARN_ON_ONCE(wpc->iomap.flags & IOMAP_F_ANON_WRITE))
+		error = -EIO;
 
 	if (error) {
-		wpc->ioend->io_bio.bi_status = errno_to_blk_status(error);
-		bio_endio(&wpc->ioend->io_bio);
+		ioend->io_bio.bi_status = errno_to_blk_status(error);
+		bio_endio(&ioend->io_bio);
+		return error;
 	}
 
-	wpc->ioend = NULL;
-	return error;
+	submit_bio(&ioend->io_bio);
+	return 0;
 }
+EXPORT_SYMBOL_GPL(iomap_ioend_writeback_submit);
 
 static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
 		loff_t pos, u16 ioend_flags)
@@ -1626,7 +1614,6 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
 			       REQ_OP_WRITE | wbc_to_write_flags(wpc->wbc),
 			       GFP_NOFS, &iomap_ioend_bioset);
 	bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
-	bio->bi_end_io = iomap_writepage_end_bio;
 	bio->bi_write_hint = wpc->inode->i_write_hint;
 	wbc_init_bio(wpc->wbc, bio);
 	wpc->nr_folios = 0;
@@ -1636,16 +1623,17 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
 static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos,
 		u16 ioend_flags)
 {
+	struct iomap_ioend *ioend = wpc->wb_ctx;
+
 	if (ioend_flags & IOMAP_IOEND_BOUNDARY)
 		return false;
 	if ((ioend_flags & IOMAP_IOEND_NOMERGE_FLAGS) !=
-	    (wpc->ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS))
+	    (ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS))
 		return false;
-	if (pos != wpc->ioend->io_offset + wpc->ioend->io_size)
+	if (pos != ioend->io_offset + ioend->io_size)
 		return false;
 	if (!(wpc->iomap.flags & IOMAP_F_ANON_WRITE) &&
-	    iomap_sector(&wpc->iomap, pos) !=
-	    bio_end_sector(&wpc->ioend->io_bio))
+	    iomap_sector(&wpc->iomap, pos) != bio_end_sector(&ioend->io_bio))
 		return false;
 	/*
 	 * Limit ioend bio chain lengths to minimise IO completion latency. This
@@ -1671,6 +1659,7 @@ static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos,
 ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
 		loff_t pos, loff_t end_pos, unsigned int dirty_len)
 {
+	struct iomap_ioend *ioend = wpc->wb_ctx;
 	struct iomap_folio_state *ifs = folio->private;
 	size_t poff = offset_in_folio(folio, pos);
 	unsigned int ioend_flags = 0;
@@ -1701,15 +1690,17 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
 	if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
 		ioend_flags |= IOMAP_IOEND_BOUNDARY;
 
-	if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) {
+	if (!ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) {
 new_ioend:
-		error = iomap_submit_ioend(wpc, 0);
-		if (error)
-			return error;
-		wpc->ioend = iomap_alloc_ioend(wpc, pos, ioend_flags);
+		if (ioend) {
+			error = wpc->ops->writeback_submit(wpc, 0);
+			if (error)
+				return error;
+		}
+		wpc->wb_ctx = ioend = iomap_alloc_ioend(wpc, pos, ioend_flags);
 	}
 
-	if (!bio_add_folio(&wpc->ioend->io_bio, folio, map_len, poff))
+	if (!bio_add_folio(&ioend->io_bio, folio, map_len, poff))
 		goto new_ioend;
 
 	if (ifs)
@@ -1756,9 +1747,9 @@ new_ioend:
 	 * Note that this defeats the ability to chain the ioends of
 	 * appending writes.
 	 */
-	wpc->ioend->io_size += map_len;
-	if (wpc->ioend->io_offset + wpc->ioend->io_size > end_pos)
-		wpc->ioend->io_size = end_pos - wpc->ioend->io_offset;
+	ioend->io_size += map_len;
+	if (ioend->io_offset + ioend->io_size > end_pos)
+		ioend->io_size = end_pos - ioend->io_offset;
 
 	wbc_account_cgroup_owner(wpc->wbc, folio, map_len);
 	return map_len;
@@ -1953,6 +1944,18 @@ iomap_writepages(struct iomap_writepage_ctx *wpc)
 
 	while ((folio = writeback_iter(mapping, wpc->wbc, folio, &error)))
 		error = iomap_writepage_map(wpc, folio);
-	return iomap_submit_ioend(wpc, error);
+
+	/*
+	 * If @error is non-zero, it means that we have a situation where some
+	 * part of the submission process has failed after we've marked pages
+	 * for writeback.
+	 *
+	 * We cannot cancel the writeback directly in that case, so always call
+	 * ->writeback_submit to run the I/O completion handler to clear the
+	 * writeback bit and let the file system process the errors.
+	 */
+	if (wpc->wb_ctx)
+		return wpc->ops->writeback_submit(wpc, error);
+	return error;
 }
 EXPORT_SYMBOL_GPL(iomap_writepages);
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index f6d44ab78442..1ee4f835ac3c 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -514,41 +514,40 @@ xfs_ioend_needs_wq_completion(
 }
 
 static int
-xfs_submit_ioend(
-	struct iomap_writepage_ctx *wpc,
-	int			status)
+xfs_writeback_submit(
+	struct iomap_writepage_ctx	*wpc,
+	int				error)
 {
-	struct iomap_ioend	*ioend = wpc->ioend;
-	unsigned int		nofs_flag;
+	struct iomap_ioend		*ioend = wpc->wb_ctx;
 
 	/*
-	 * We can allocate memory here while doing writeback on behalf of
-	 * memory reclaim.  To avoid memory allocation deadlocks set the
-	 * task-wide nofs context for the following operations.
+	 * Convert CoW extents to regular.
+	 *
+	 * We can allocate memory here while doing writeback on behalf of memory
+	 * reclaim.  To avoid memory allocation deadlocks, set the task-wide
+	 * nofs context.
 	 */
-	nofs_flag = memalloc_nofs_save();
+	if (!error && (ioend->io_flags & IOMAP_IOEND_SHARED)) {
+		unsigned int		nofs_flag;
 
-	/* Convert CoW extents to regular */
-	if (!status && (ioend->io_flags & IOMAP_IOEND_SHARED)) {
-		status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
+		nofs_flag = memalloc_nofs_save();
+		error = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
 				ioend->io_offset, ioend->io_size);
+		memalloc_nofs_restore(nofs_flag);
 	}
 
-	memalloc_nofs_restore(nofs_flag);
-
-	/* send ioends that might require a transaction to the completion wq */
+	/*
+	 * Send ioends that might require a transaction to the completion wq.
+	 */
 	if (xfs_ioend_needs_wq_completion(ioend))
 		ioend->io_bio.bi_end_io = xfs_end_bio;
 
-	if (status)
-		return status;
-	submit_bio(&ioend->io_bio);
-	return 0;
+	return iomap_ioend_writeback_submit(wpc, error);
 }
 
 static const struct iomap_writeback_ops xfs_writeback_ops = {
 	.writeback_range	= xfs_writeback_range,
-	.submit_ioend		= xfs_submit_ioend,
+	.writeback_submit	= xfs_writeback_submit,
 };
 
 struct xfs_zoned_writepage_ctx {
@@ -646,20 +645,25 @@ xfs_zoned_writeback_range(
 }
 
 static int
-xfs_zoned_submit_ioend(
-	struct iomap_writepage_ctx *wpc,
-	int			status)
+xfs_zoned_writeback_submit(
+	struct iomap_writepage_ctx	*wpc,
+	int				error)
 {
-	wpc->ioend->io_bio.bi_end_io = xfs_end_bio;
-	if (status)
-		return status;
-	xfs_zone_alloc_and_submit(wpc->ioend, &XFS_ZWPC(wpc)->open_zone);
+	struct iomap_ioend		*ioend = wpc->wb_ctx;
+
+	ioend->io_bio.bi_end_io = xfs_end_bio;
+	if (error) {
+		ioend->io_bio.bi_status = errno_to_blk_status(error);
+		bio_endio(&ioend->io_bio);
+		return error;
+	}
+	xfs_zone_alloc_and_submit(ioend, &XFS_ZWPC(wpc)->open_zone);
 	return 0;
 }
 
 static const struct iomap_writeback_ops xfs_zoned_writeback_ops = {
 	.writeback_range	= xfs_zoned_writeback_range,
-	.submit_ioend		= xfs_zoned_submit_ioend,
+	.writeback_submit	= xfs_zoned_writeback_submit,
 };
 
 STATIC int
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index c88e2c851753..fee9403ad49b 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -151,6 +151,7 @@ static ssize_t zonefs_writeback_range(struct iomap_writepage_ctx *wpc,
 
 static const struct iomap_writeback_ops zonefs_writeback_ops = {
 	.writeback_range	= zonefs_writeback_range,
+	.writeback_submit	= iomap_ioend_writeback_submit,
 };
 
 static int zonefs_writepages(struct address_space *mapping,
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 625d7911a2b5..9f32dd8dc075 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -391,8 +391,7 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno,
 /*
  * Structure for writeback I/O completions.
  *
- * File systems implementing ->submit_ioend (for buffered I/O) or ->submit_io
- * for direct I/O) can split a bio generated by iomap.  In that case the parent
+ * File systems can split a bio generated by iomap.  In that case the parent
  * ioend it was split from is recorded in ioend->io_parent.
  */
 struct iomap_ioend {
@@ -416,7 +415,7 @@ static inline struct iomap_ioend *iomap_ioend_from_bio(struct bio *bio)
 
 struct iomap_writeback_ops {
 	/*
-	 * Required, performs writeback on the passed in range
+	 * Performs writeback on the passed in range
 	 *
 	 * Can map arbitrarily large regions, but we need to call into it at
 	 * least once per folio to allow the file systems to synchronize with
@@ -432,23 +431,22 @@ struct iomap_writeback_ops {
 			u64 end_pos);
 
 	/*
-	 * Optional, allows the file systems to hook into bio submission,
-	 * including overriding the bi_end_io handler.
+	 * Submit a writeback context previously built up by ->writeback_range.
 	 *
-	 * Returns 0 if the bio was successfully submitted, or a negative
-	 * error code if status was non-zero or another error happened and
-	 * the bio could not be submitted.
+	 * Returns 0 if the context was successfully submitted, or a negative
+	 * error code if not.  If @error is non-zero a failure occurred, and
+	 * the writeback context should be completed with an error.
 	 */
-	int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status);
+	int (*writeback_submit)(struct iomap_writepage_ctx *wpc, int error);
 };
 
 struct iomap_writepage_ctx {
 	struct iomap		iomap;
 	struct inode		*inode;
 	struct writeback_control *wbc;
-	struct iomap_ioend	*ioend;
 	const struct iomap_writeback_ops *ops;
 	u32			nr_folios;	/* folios added to the ioend */
+	void			*wb_ctx;	/* pending writeback context */
 };
 
 struct iomap_ioend *iomap_init_ioend(struct inode *inode, struct bio *bio,
@@ -461,6 +459,7 @@ void iomap_ioend_try_merge(struct iomap_ioend *ioend,
 void iomap_sort_ioends(struct list_head *ioend_list);
 ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
 		loff_t pos, loff_t end_pos, unsigned int dirty_len);
+int iomap_ioend_writeback_submit(struct iomap_writepage_ctx *wpc, int error);
 
 int iomap_writepages(struct iomap_writepage_ctx *wpc);
 
-- 
cgit v1.2.3


From 9caf1ea80cedf7d35d9371c44fbe5f84b0da667a Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Thu, 10 Jul 2025 15:33:30 +0200
Subject: iomap: add public helpers for uptodate state manipulation

Add a new iomap_start_folio_write helper to abstract away the
write_bytes_pending handling, and export it and the existing
iomap_finish_folio_write for non-iomap writeback in fuse.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
[hch: split from a larger patch]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/20250710133343.399917-7-hch@lst.de
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/iomap/buffered-io.c | 20 +++++++++++++++-----
 include/linux/iomap.h  |  5 +++++
 2 files changed, 20 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 650a67b7d223..060bfcc353da 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1527,7 +1527,18 @@ out_unlock:
 }
 EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
 
-static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
+void iomap_start_folio_write(struct inode *inode, struct folio *folio,
+		size_t len)
+{
+	struct iomap_folio_state *ifs = folio->private;
+
+	WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
+	if (ifs)
+		atomic_add(len, &ifs->write_bytes_pending);
+}
+EXPORT_SYMBOL_GPL(iomap_start_folio_write);
+
+void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
 		size_t len)
 {
 	struct iomap_folio_state *ifs = folio->private;
@@ -1538,6 +1549,7 @@ static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
 	if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending))
 		folio_end_writeback(folio);
 }
+EXPORT_SYMBOL_GPL(iomap_finish_folio_write);
 
 /*
  * We're now finished for good with this ioend structure.  Update the page
@@ -1660,7 +1672,6 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
 		loff_t pos, loff_t end_pos, unsigned int dirty_len)
 {
 	struct iomap_ioend *ioend = wpc->wb_ctx;
-	struct iomap_folio_state *ifs = folio->private;
 	size_t poff = offset_in_folio(folio, pos);
 	unsigned int ioend_flags = 0;
 	unsigned int map_len = min_t(u64, dirty_len,
@@ -1703,8 +1714,7 @@ new_ioend:
 	if (!bio_add_folio(&ioend->io_bio, folio, map_len, poff))
 		goto new_ioend;
 
-	if (ifs)
-		atomic_add(map_len, &ifs->write_bytes_pending);
+	iomap_start_folio_write(wpc->inode, folio, map_len);
 
 	/*
 	 * Clamp io_offset and io_size to the incore EOF so that ondisk
@@ -1877,7 +1887,7 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 		 * all blocks.
 		 */
 		WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0);
-		atomic_inc(&ifs->write_bytes_pending);
+		iomap_start_folio_write(inode, folio, 1);
 	}
 
 	/*
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 9f32dd8dc075..cbf9d299a616 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -461,6 +461,11 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
 		loff_t pos, loff_t end_pos, unsigned int dirty_len);
 int iomap_ioend_writeback_submit(struct iomap_writepage_ctx *wpc, int error);
 
+void iomap_start_folio_write(struct inode *inode, struct folio *folio,
+		size_t len);
+void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
+		size_t len);
+
 int iomap_writepages(struct iomap_writepage_ctx *wpc);
 
 /*
-- 
cgit v1.2.3


From 8b217cf779cba2b10112f6845dcbbb7e6f4b3d75 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 10 Jul 2025 15:33:34 +0200
Subject: iomap: export iomap_writeback_folio

Allow fuse to use iomap_writeback_folio for folio laundering.  Note
that the caller needs to manually submit the pending writeback context.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/20250710133343.399917-11-hch@lst.de
Reviewed-by: Joanne Koong <joannelkoong@gmail.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/iomap/buffered-io.c | 4 ++--
 include/linux/iomap.h  | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index ca45a6d1cb68..b4b398e8bd54 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1634,8 +1634,7 @@ static bool iomap_writeback_handle_eof(struct folio *folio, struct inode *inode,
 	return true;
 }
 
-static int iomap_writeback_folio(struct iomap_writepage_ctx *wpc,
-		struct folio *folio)
+int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio)
 {
 	struct iomap_folio_state *ifs = folio->private;
 	struct inode *inode = wpc->inode;
@@ -1717,6 +1716,7 @@ static int iomap_writeback_folio(struct iomap_writepage_ctx *wpc,
 	mapping_set_error(inode->i_mapping, error);
 	return error;
 }
+EXPORT_SYMBOL_GPL(iomap_writeback_folio);
 
 int
 iomap_writepages(struct iomap_writepage_ctx *wpc)
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index cbf9d299a616..b65d3f063bb0 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -466,6 +466,7 @@ void iomap_start_folio_write(struct inode *inode, struct folio *folio,
 void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
 		size_t len);
 
+int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio);
 int iomap_writepages(struct iomap_writepage_ctx *wpc);
 
 /*
-- 
cgit v1.2.3


From 2a5574fc57d13031f869c409181bdeadd75770e1 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 10 Jul 2025 15:33:35 +0200
Subject: iomap: replace iomap_folio_ops with iomap_write_ops

The iomap_folio_ops are only used for buffered writes, including the zero
and unshare variants.  Rename them to iomap_write_ops to better describe
the usage, and pass them through the call chain like the other operation
specific methods instead of through the iomap.

xfs_iomap_valid grows an IOMAP_HOLE check to keep the existing behavior
that never attached the folio_ops to an iomap representing a hole.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/20250710133343.399917-12-hch@lst.de
Acked-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 Documentation/filesystems/iomap/design.rst     |  3 -
 Documentation/filesystems/iomap/operations.rst |  8 +--
 block/fops.c                                   |  3 +-
 fs/gfs2/bmap.c                                 | 21 ++++---
 fs/gfs2/bmap.h                                 |  1 +
 fs/gfs2/file.c                                 |  3 +-
 fs/iomap/buffered-io.c                         | 79 ++++++++++++++------------
 fs/xfs/xfs_file.c                              |  6 +-
 fs/xfs/xfs_iomap.c                             | 12 ++--
 fs/xfs/xfs_iomap.h                             |  1 +
 fs/xfs/xfs_reflink.c                           |  3 +-
 fs/zonefs/file.c                               |  3 +-
 include/linux/iomap.h                          | 22 +++----
 13 files changed, 89 insertions(+), 76 deletions(-)

(limited to 'include')

diff --git a/Documentation/filesystems/iomap/design.rst b/Documentation/filesystems/iomap/design.rst
index f2df9b6df988..0f7672676c0b 100644
--- a/Documentation/filesystems/iomap/design.rst
+++ b/Documentation/filesystems/iomap/design.rst
@@ -167,7 +167,6 @@ structure below:
      struct dax_device   *dax_dev;
      void                *inline_data;
      void                *private;
-     const struct iomap_folio_ops *folio_ops;
      u64                 validity_cookie;
  };
 
@@ -292,8 +291,6 @@ The fields are as follows:
    <https://lore.kernel.org/all/20180619164137.13720-7-hch@lst.de/>`_.
    This value will be passed unchanged to ``->iomap_end``.
 
- * ``folio_ops`` will be covered in the section on pagecache operations.
-
  * ``validity_cookie`` is a magic freshness value set by the filesystem
    that should be used to detect stale mappings.
    For pagecache operations this is critical for correct operation
diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst
index 4b93c5f7841a..a9b48ce4af92 100644
--- a/Documentation/filesystems/iomap/operations.rst
+++ b/Documentation/filesystems/iomap/operations.rst
@@ -57,16 +57,12 @@ The following address space operations can be wrapped easily:
  * ``bmap``
  * ``swap_activate``
 
-``struct iomap_folio_ops``
+``struct iomap_write_ops``
 --------------------------
 
-The ``->iomap_begin`` function for pagecache operations may set the
-``struct iomap::folio_ops`` field to an ops structure to override
-default behaviors of iomap:
-
 .. code-block:: c
 
- struct iomap_folio_ops {
+ struct iomap_write_ops {
      struct folio *(*get_folio)(struct iomap_iter *iter, loff_t pos,
                                 unsigned len);
      void (*put_folio)(struct inode *inode, loff_t pos, unsigned copied,
diff --git a/block/fops.c b/block/fops.c
index 0845737c0320..0c2c010ff303 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -723,7 +723,8 @@ blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from)
 
 static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from)
 {
-	return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops, NULL);
+	return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops, NULL,
+			NULL);
 }
 
 /*
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 86045d3577b7..131091520de6 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -963,12 +963,16 @@ static struct folio *
 gfs2_iomap_get_folio(struct iomap_iter *iter, loff_t pos, unsigned len)
 {
 	struct inode *inode = iter->inode;
+	struct gfs2_inode *ip = GFS2_I(inode);
 	unsigned int blockmask = i_blocksize(inode) - 1;
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	unsigned int blocks;
 	struct folio *folio;
 	int status;
 
+	if (!gfs2_is_jdata(ip) && !gfs2_is_stuffed(ip))
+		return iomap_get_folio(iter, pos, len);
+
 	blocks = ((pos & blockmask) + len + blockmask) >> inode->i_blkbits;
 	status = gfs2_trans_begin(sdp, RES_DINODE + blocks, 0);
 	if (status)
@@ -987,7 +991,7 @@ static void gfs2_iomap_put_folio(struct inode *inode, loff_t pos,
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 
-	if (!gfs2_is_stuffed(ip))
+	if (gfs2_is_jdata(ip) && !gfs2_is_stuffed(ip))
 		gfs2_trans_add_databufs(ip->i_gl, folio,
 					offset_in_folio(folio, pos),
 					copied);
@@ -995,13 +999,14 @@ static void gfs2_iomap_put_folio(struct inode *inode, loff_t pos,
 	folio_unlock(folio);
 	folio_put(folio);
 
-	if (tr->tr_num_buf_new)
-		__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
-
-	gfs2_trans_end(sdp);
+	if (gfs2_is_jdata(ip) || gfs2_is_stuffed(ip)) {
+		if (tr->tr_num_buf_new)
+			__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+		gfs2_trans_end(sdp);
+	}
 }
 
-static const struct iomap_folio_ops gfs2_iomap_folio_ops = {
+const struct iomap_write_ops gfs2_iomap_write_ops = {
 	.get_folio = gfs2_iomap_get_folio,
 	.put_folio = gfs2_iomap_put_folio,
 };
@@ -1078,8 +1083,6 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
 		gfs2_trans_end(sdp);
 	}
 
-	if (gfs2_is_stuffed(ip) || gfs2_is_jdata(ip))
-		iomap->folio_ops = &gfs2_iomap_folio_ops;
 	return 0;
 
 out_trans_end:
@@ -1304,7 +1307,7 @@ static int gfs2_block_zero_range(struct inode *inode, loff_t from, loff_t length
 		return 0;
 	length = min(length, inode->i_size - from);
 	return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops,
-			NULL);
+			&gfs2_iomap_write_ops, NULL);
 }
 
 #define GFS2_JTRUNC_REVOKES 8192
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index 4e8b1e8ebdf3..6cdc72dd55a3 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -44,6 +44,7 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip,
 }
 
 extern const struct iomap_ops gfs2_iomap_ops;
+extern const struct iomap_write_ops gfs2_iomap_write_ops;
 extern const struct iomap_writeback_ops gfs2_writeback_ops;
 
 int gfs2_unstuff_dinode(struct gfs2_inode *ip);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index fd1147aa3891..2908f5bee21d 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1058,7 +1058,8 @@ retry:
 	}
 
 	pagefault_disable();
-	ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops, NULL);
+	ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops,
+			&gfs2_iomap_write_ops, NULL);
 	pagefault_enable();
 	if (ret > 0)
 		written += ret;
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index b4b398e8bd54..9f2cc5dd7e80 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -733,28 +733,27 @@ static int __iomap_write_begin(const struct iomap_iter *iter, size_t len,
 	return 0;
 }
 
-static struct folio *__iomap_get_folio(struct iomap_iter *iter, size_t len)
+static struct folio *__iomap_get_folio(struct iomap_iter *iter,
+		const struct iomap_write_ops *write_ops, size_t len)
 {
-	const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
 	loff_t pos = iter->pos;
 
 	if (!mapping_large_folio_support(iter->inode->i_mapping))
 		len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos));
 
-	if (folio_ops && folio_ops->get_folio)
-		return folio_ops->get_folio(iter, pos, len);
-	else
-		return iomap_get_folio(iter, pos, len);
+	if (write_ops && write_ops->get_folio)
+		return write_ops->get_folio(iter, pos, len);
+	return iomap_get_folio(iter, pos, len);
 }
 
-static void __iomap_put_folio(struct iomap_iter *iter, size_t ret,
+static void __iomap_put_folio(struct iomap_iter *iter,
+		const struct iomap_write_ops *write_ops, size_t ret,
 		struct folio *folio)
 {
-	const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
 	loff_t pos = iter->pos;
 
-	if (folio_ops && folio_ops->put_folio) {
-		folio_ops->put_folio(iter->inode, pos, ret, folio);
+	if (write_ops && write_ops->put_folio) {
+		write_ops->put_folio(iter->inode, pos, ret, folio);
 	} else {
 		folio_unlock(folio);
 		folio_put(folio);
@@ -791,10 +790,10 @@ static int iomap_write_begin_inline(const struct iomap_iter *iter,
  * offset, and length. Callers can optionally pass a max length *plen,
  * otherwise init to zero.
  */
-static int iomap_write_begin(struct iomap_iter *iter, struct folio **foliop,
+static int iomap_write_begin(struct iomap_iter *iter,
+		const struct iomap_write_ops *write_ops, struct folio **foliop,
 		size_t *poffset, u64 *plen)
 {
-	const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
 	const struct iomap *srcmap = iomap_iter_srcmap(iter);
 	loff_t pos = iter->pos;
 	u64 len = min_t(u64, SIZE_MAX, iomap_length(iter));
@@ -809,7 +808,7 @@ static int iomap_write_begin(struct iomap_iter *iter, struct folio **foliop,
 	if (fatal_signal_pending(current))
 		return -EINTR;
 
-	folio = __iomap_get_folio(iter, len);
+	folio = __iomap_get_folio(iter, write_ops, len);
 	if (IS_ERR(folio))
 		return PTR_ERR(folio);
 
@@ -823,8 +822,8 @@ static int iomap_write_begin(struct iomap_iter *iter, struct folio **foliop,
 	 * could do the wrong thing here (zero a page range incorrectly or fail
 	 * to zero) and corrupt data.
 	 */
-	if (folio_ops && folio_ops->iomap_valid) {
-		bool iomap_valid = folio_ops->iomap_valid(iter->inode,
+	if (write_ops && write_ops->iomap_valid) {
+		bool iomap_valid = write_ops->iomap_valid(iter->inode,
 							 &iter->iomap);
 		if (!iomap_valid) {
 			iter->iomap.flags |= IOMAP_F_STALE;
@@ -850,8 +849,7 @@ static int iomap_write_begin(struct iomap_iter *iter, struct folio **foliop,
 	return 0;
 
 out_unlock:
-	__iomap_put_folio(iter, 0, folio);
-
+	__iomap_put_folio(iter, write_ops, 0, folio);
 	return status;
 }
 
@@ -923,7 +921,8 @@ static bool iomap_write_end(struct iomap_iter *iter, size_t len, size_t copied,
 	return __iomap_write_end(iter->inode, pos, len, copied, folio);
 }
 
-static int iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
+static int iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i,
+		const struct iomap_write_ops *write_ops)
 {
 	ssize_t total_written = 0;
 	int status = 0;
@@ -967,7 +966,8 @@ retry:
 			break;
 		}
 
-		status = iomap_write_begin(iter, &folio, &offset, &bytes);
+		status = iomap_write_begin(iter, write_ops, &folio, &offset,
+				&bytes);
 		if (unlikely(status)) {
 			iomap_write_failed(iter->inode, iter->pos, bytes);
 			break;
@@ -996,7 +996,7 @@ retry:
 			i_size_write(iter->inode, pos + written);
 			iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
 		}
-		__iomap_put_folio(iter, written, folio);
+		__iomap_put_folio(iter, write_ops, written, folio);
 
 		if (old_size < pos)
 			pagecache_isize_extended(iter->inode, old_size, pos);
@@ -1029,7 +1029,8 @@ retry:
 
 ssize_t
 iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
-		const struct iomap_ops *ops, void *private)
+		const struct iomap_ops *ops,
+		const struct iomap_write_ops *write_ops, void *private)
 {
 	struct iomap_iter iter = {
 		.inode		= iocb->ki_filp->f_mapping->host,
@@ -1046,7 +1047,7 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
 		iter.flags |= IOMAP_DONTCACHE;
 
 	while ((ret = iomap_iter(&iter, ops)) > 0)
-		iter.status = iomap_write_iter(&iter, i);
+		iter.status = iomap_write_iter(&iter, i, write_ops);
 
 	if (unlikely(iter.pos == iocb->ki_pos))
 		return ret;
@@ -1280,7 +1281,8 @@ void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
 }
 EXPORT_SYMBOL_GPL(iomap_write_delalloc_release);
 
-static int iomap_unshare_iter(struct iomap_iter *iter)
+static int iomap_unshare_iter(struct iomap_iter *iter,
+		const struct iomap_write_ops *write_ops)
 {
 	struct iomap *iomap = &iter->iomap;
 	u64 bytes = iomap_length(iter);
@@ -1295,14 +1297,15 @@ static int iomap_unshare_iter(struct iomap_iter *iter)
 		bool ret;
 
 		bytes = min_t(u64, SIZE_MAX, bytes);
-		status = iomap_write_begin(iter, &folio, &offset, &bytes);
+		status = iomap_write_begin(iter, write_ops, &folio, &offset,
+				&bytes);
 		if (unlikely(status))
 			return status;
 		if (iomap->flags & IOMAP_F_STALE)
 			break;
 
 		ret = iomap_write_end(iter, bytes, bytes, folio);
-		__iomap_put_folio(iter, bytes, folio);
+		__iomap_put_folio(iter, write_ops, bytes, folio);
 		if (WARN_ON_ONCE(!ret))
 			return -EIO;
 
@@ -1320,7 +1323,8 @@ static int iomap_unshare_iter(struct iomap_iter *iter)
 
 int
 iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
-		const struct iomap_ops *ops)
+		const struct iomap_ops *ops,
+		const struct iomap_write_ops *write_ops)
 {
 	struct iomap_iter iter = {
 		.inode		= inode,
@@ -1335,7 +1339,7 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
 
 	iter.len = min(len, size - pos);
 	while ((ret = iomap_iter(&iter, ops)) > 0)
-		iter.status = iomap_unshare_iter(&iter);
+		iter.status = iomap_unshare_iter(&iter, write_ops);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(iomap_file_unshare);
@@ -1354,7 +1358,8 @@ static inline int iomap_zero_iter_flush_and_stale(struct iomap_iter *i)
 	return filemap_write_and_wait_range(mapping, i->pos, end);
 }
 
-static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
+static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero,
+		const struct iomap_write_ops *write_ops)
 {
 	u64 bytes = iomap_length(iter);
 	int status;
@@ -1365,7 +1370,8 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
 		bool ret;
 
 		bytes = min_t(u64, SIZE_MAX, bytes);
-		status = iomap_write_begin(iter, &folio, &offset, &bytes);
+		status = iomap_write_begin(iter, write_ops, &folio, &offset,
+				&bytes);
 		if (status)
 			return status;
 		if (iter->iomap.flags & IOMAP_F_STALE)
@@ -1378,7 +1384,7 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
 		folio_mark_accessed(folio);
 
 		ret = iomap_write_end(iter, bytes, bytes, folio);
-		__iomap_put_folio(iter, bytes, folio);
+		__iomap_put_folio(iter, write_ops, bytes, folio);
 		if (WARN_ON_ONCE(!ret))
 			return -EIO;
 
@@ -1394,7 +1400,8 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
 
 int
 iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
-		const struct iomap_ops *ops, void *private)
+		const struct iomap_ops *ops,
+		const struct iomap_write_ops *write_ops, void *private)
 {
 	struct iomap_iter iter = {
 		.inode		= inode,
@@ -1424,7 +1431,8 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
 	    filemap_range_needs_writeback(mapping, pos, pos + plen - 1)) {
 		iter.len = plen;
 		while ((ret = iomap_iter(&iter, ops)) > 0)
-			iter.status = iomap_zero_iter(&iter, did_zero);
+			iter.status = iomap_zero_iter(&iter, did_zero,
+					write_ops);
 
 		iter.len = len - (iter.pos - pos);
 		if (ret || !iter.len)
@@ -1455,7 +1463,7 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
 			continue;
 		}
 
-		iter.status = iomap_zero_iter(&iter, did_zero);
+		iter.status = iomap_zero_iter(&iter, did_zero, write_ops);
 	}
 	return ret;
 }
@@ -1463,7 +1471,8 @@ EXPORT_SYMBOL_GPL(iomap_zero_range);
 
 int
 iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
-		const struct iomap_ops *ops, void *private)
+		const struct iomap_ops *ops,
+		const struct iomap_write_ops *write_ops, void *private)
 {
 	unsigned int blocksize = i_blocksize(inode);
 	unsigned int off = pos & (blocksize - 1);
@@ -1472,7 +1481,7 @@ iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
 	if (!off)
 		return 0;
 	return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops,
-			private);
+			write_ops, private);
 }
 EXPORT_SYMBOL_GPL(iomap_truncate_page);
 
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 48254a72071b..6e0970f24df5 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -979,7 +979,8 @@ write_retry:
 
 	trace_xfs_file_buffered_write(iocb, from);
 	ret = iomap_file_buffered_write(iocb, from,
-			&xfs_buffered_write_iomap_ops, NULL);
+			&xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
+			NULL);
 
 	/*
 	 * If we hit a space limit, try to free up some lingering preallocated
@@ -1059,7 +1060,8 @@ xfs_file_buffered_write_zoned(
 retry:
 	trace_xfs_file_buffered_write(iocb, from);
 	ret = iomap_file_buffered_write(iocb, from,
-			&xfs_buffered_write_iomap_ops, &ac);
+			&xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
+			&ac);
 	if (ret == -ENOSPC && !cleared_space) {
 		/*
 		 * Kick off writeback to convert delalloc space and release the
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index ff05e6b1b0bb..2e94a9435002 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -79,6 +79,9 @@ xfs_iomap_valid(
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 
+	if (iomap->type == IOMAP_HOLE)
+		return true;
+
 	if (iomap->validity_cookie !=
 			xfs_iomap_inode_sequence(ip, iomap->flags)) {
 		trace_xfs_iomap_invalid(ip, iomap);
@@ -89,7 +92,7 @@ xfs_iomap_valid(
 	return true;
 }
 
-static const struct iomap_folio_ops xfs_iomap_folio_ops = {
+const struct iomap_write_ops xfs_iomap_write_ops = {
 	.iomap_valid		= xfs_iomap_valid,
 };
 
@@ -151,7 +154,6 @@ xfs_bmbt_to_iomap(
 		iomap->flags |= IOMAP_F_DIRTY;
 
 	iomap->validity_cookie = sequence_cookie;
-	iomap->folio_ops = &xfs_iomap_folio_ops;
 	return 0;
 }
 
@@ -2198,7 +2200,8 @@ xfs_zero_range(
 		return dax_zero_range(inode, pos, len, did_zero,
 				      &xfs_dax_write_iomap_ops);
 	return iomap_zero_range(inode, pos, len, did_zero,
-				&xfs_buffered_write_iomap_ops, ac);
+			&xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
+			ac);
 }
 
 int
@@ -2214,5 +2217,6 @@ xfs_truncate_page(
 		return dax_truncate_page(inode, pos, did_zero,
 					&xfs_dax_write_iomap_ops);
 	return iomap_truncate_page(inode, pos, did_zero,
-				   &xfs_buffered_write_iomap_ops, ac);
+			&xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
+			ac);
 }
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 674f8ac1b9bd..ebcce7d49446 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -57,5 +57,6 @@ extern const struct iomap_ops xfs_seek_iomap_ops;
 extern const struct iomap_ops xfs_xattr_iomap_ops;
 extern const struct iomap_ops xfs_dax_write_iomap_ops;
 extern const struct iomap_ops xfs_atomic_write_cow_iomap_ops;
+extern const struct iomap_write_ops xfs_iomap_write_ops;
 
 #endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index ad3bcb76d805..3f177b4ec131 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1881,7 +1881,8 @@ xfs_reflink_unshare(
 				&xfs_dax_write_iomap_ops);
 	else
 		error = iomap_file_unshare(inode, offset, len,
-				&xfs_buffered_write_iomap_ops);
+				&xfs_buffered_write_iomap_ops,
+				&xfs_iomap_write_ops);
 	if (error)
 		goto out;
 
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index fee9403ad49b..24c29c10e27f 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -572,7 +572,8 @@ static ssize_t zonefs_file_buffered_write(struct kiocb *iocb,
 	if (ret <= 0)
 		goto inode_unlock;
 
-	ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops, NULL);
+	ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops,
+			NULL, NULL);
 	if (ret == -EIO)
 		zonefs_io_error(inode, true);
 
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index b65d3f063bb0..80f543cc4fe8 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -101,8 +101,6 @@ struct vm_fault;
  */
 #define IOMAP_NULL_ADDR -1ULL	/* addr is not valid */
 
-struct iomap_folio_ops;
-
 struct iomap {
 	u64			addr; /* disk offset of mapping, bytes */
 	loff_t			offset;	/* file offset of mapping, bytes */
@@ -113,7 +111,6 @@ struct iomap {
 	struct dax_device	*dax_dev; /* dax_dev for dax operations */
 	void			*inline_data;
 	void			*private; /* filesystem private */
-	const struct iomap_folio_ops *folio_ops;
 	u64			validity_cookie; /* used with .iomap_valid() */
 };
 
@@ -143,16 +140,11 @@ static inline bool iomap_inline_data_valid(const struct iomap *iomap)
 }
 
 /*
- * When a filesystem sets folio_ops in an iomap mapping it returns, get_folio
- * and put_folio will be called for each folio written to.  This only applies
- * to buffered writes as unbuffered writes will not typically have folios
- * associated with them.
- *
  * When get_folio succeeds, put_folio will always be called to do any
  * cleanup work necessary.  put_folio is responsible for unlocking and putting
  * @folio.
  */
-struct iomap_folio_ops {
+struct iomap_write_ops {
 	struct folio *(*get_folio)(struct iomap_iter *iter, loff_t pos,
 			unsigned len);
 	void (*put_folio)(struct inode *inode, loff_t pos, unsigned copied,
@@ -335,7 +327,8 @@ static inline bool iomap_want_unshare_iter(const struct iomap_iter *iter)
 }
 
 ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
-		const struct iomap_ops *ops, void *private);
+		const struct iomap_ops *ops,
+		const struct iomap_write_ops *write_ops, void *private);
 int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops);
 void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops);
 bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
@@ -344,11 +337,14 @@ bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
 void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len);
 bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio);
 int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
-		const struct iomap_ops *ops);
+		const struct iomap_ops *ops,
+		const struct iomap_write_ops *write_ops);
 int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
-		bool *did_zero, const struct iomap_ops *ops, void *private);
+		bool *did_zero, const struct iomap_ops *ops,
+		const struct iomap_write_ops *write_ops, void *private);
 int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
-		const struct iomap_ops *ops, void *private);
+		const struct iomap_ops *ops,
+		const struct iomap_write_ops *write_ops, void *private);
 vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops,
 		void *private);
 typedef void (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length,
-- 
cgit v1.2.3


From c5690dd0197809bc5305f474a71b2e71e7eac0ff Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 10 Jul 2025 15:33:37 +0200
Subject: iomap: add read_folio_range() handler for buffered writes

Add a read_folio_range() handler for buffered writes that filesystems
may pass in if they wish to provide a custom handler for synchronously
reading in the contents of a folio.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
[hch: renamed to read_folio_range, pass fewer arguments]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/20250710133343.399917-14-hch@lst.de
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 Documentation/filesystems/iomap/operations.rst |  6 ++++++
 fs/iomap/buffered-io.c                         | 13 +++++++++----
 include/linux/iomap.h                          | 10 ++++++++++
 3 files changed, 25 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst
index a9b48ce4af92..067ed8e14ef3 100644
--- a/Documentation/filesystems/iomap/operations.rst
+++ b/Documentation/filesystems/iomap/operations.rst
@@ -68,6 +68,8 @@ The following address space operations can be wrapped easily:
      void (*put_folio)(struct inode *inode, loff_t pos, unsigned copied,
                        struct folio *folio);
      bool (*iomap_valid)(struct inode *inode, const struct iomap *iomap);
+     int (*read_folio_range)(const struct iomap_iter *iter,
+     			struct folio *folio, loff_t pos, size_t len);
  };
 
 iomap calls these functions:
@@ -123,6 +125,10 @@ iomap calls these functions:
     ``->iomap_valid``, then the iomap should considered stale and the
     validation failed.
 
+  - ``read_folio_range``: Called to synchronously read in the range that will
+    be written to. If this function is not provided, iomap will default to
+    submitting a bio read request.
+
 These ``struct kiocb`` flags are significant for buffered I/O with iomap:
 
  * ``IOCB_NOWAIT``: Turns on ``IOMAP_NOWAIT``.
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 8a44f56a3a80..aed4fc30a849 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -671,7 +671,8 @@ static int iomap_read_folio_range(const struct iomap_iter *iter,
 	return submit_bio_wait(&bio);
 }
 
-static int __iomap_write_begin(const struct iomap_iter *iter, size_t len,
+static int __iomap_write_begin(const struct iomap_iter *iter,
+		const struct iomap_write_ops *write_ops, size_t len,
 		struct folio *folio)
 {
 	struct iomap_folio_state *ifs;
@@ -722,8 +723,12 @@ static int __iomap_write_begin(const struct iomap_iter *iter, size_t len,
 			if (iter->flags & IOMAP_NOWAIT)
 				return -EAGAIN;
 
-			status = iomap_read_folio_range(iter, folio,
-					block_start, plen);
+			if (write_ops && write_ops->read_folio_range)
+				status = write_ops->read_folio_range(iter,
+						folio, block_start, plen);
+			else
+				status = iomap_read_folio_range(iter,
+						folio, block_start, plen);
 			if (status)
 				return status;
 		}
@@ -839,7 +844,7 @@ static int iomap_write_begin(struct iomap_iter *iter,
 	else if (srcmap->flags & IOMAP_F_BUFFER_HEAD)
 		status = __block_write_begin_int(folio, pos, len, NULL, srcmap);
 	else
-		status = __iomap_write_begin(iter, len, folio);
+		status = __iomap_write_begin(iter, write_ops, len, folio);
 
 	if (unlikely(status))
 		goto out_unlock;
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 80f543cc4fe8..73dceabc21c8 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -166,6 +166,16 @@ struct iomap_write_ops {
 	 * locked by the iomap code.
 	 */
 	bool (*iomap_valid)(struct inode *inode, const struct iomap *iomap);
+
+	/*
+	 * Optional if the filesystem wishes to provide a custom handler for
+	 * reading in the contents of a folio, otherwise iomap will default to
+	 * submitting a bio read request.
+	 *
+	 * The read must be done synchronously.
+	 */
+	int (*read_folio_range)(const struct iomap_iter *iter,
+			struct folio *folio, loff_t pos, size_t len);
 };
 
 /*
-- 
cgit v1.2.3


From e075f4360931263f5ec006ea5dadc065e5e98eb8 Mon Sep 17 00:00:00 2001
From: Li Chen <chenl311@chinatelecom.cn>
Date: Thu, 10 Jul 2025 18:57:07 +0800
Subject: smpboot: introduce SDTL_INIT() helper to tidy sched topology setup

Define a small SDTL_INIT(maskfn, flagsfn, name) macro and use it to build the
sched_domain_topology_level array. Purely a cleanup; behaviour is unchanged.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Li Chen <chenl311@chinatelecom.cn>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: K Prateek Nayak <kprateek.nayak@amd.com>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://lore.kernel.org/r/20250710105715.66594-2-me@linux.beauty
---
 arch/powerpc/kernel/smp.c      | 25 ++++++++++---------------
 arch/s390/kernel/topology.c    | 10 +++++-----
 arch/x86/kernel/smpboot.c      | 21 ++++++---------------
 include/linux/sched/topology.h |  4 ++--
 kernel/sched/topology.c        | 24 ++++++++----------------
 5 files changed, 31 insertions(+), 53 deletions(-)

(limited to 'include')

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 5ac7084eebc0..f59e4b9cc207 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1700,28 +1700,23 @@ static void __init build_sched_topology(void)
 #ifdef CONFIG_SCHED_SMT
 	if (has_big_cores) {
 		pr_info("Big cores detected but using small core scheduling\n");
-		powerpc_topology[i++] = (struct sched_domain_topology_level){
-			smallcore_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT)
-		};
+		powerpc_topology[i++] =
+			SDTL_INIT(smallcore_smt_mask, powerpc_smt_flags, SMT);
 	} else {
-		powerpc_topology[i++] = (struct sched_domain_topology_level){
-			cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT)
-		};
+		powerpc_topology[i++] = SDTL_INIT(cpu_smt_mask, powerpc_smt_flags, SMT);
 	}
 #endif
 	if (shared_caches) {
-		powerpc_topology[i++] = (struct sched_domain_topology_level){
-			shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE)
-		};
+		powerpc_topology[i++] =
+			SDTL_INIT(shared_cache_mask, powerpc_shared_cache_flags, CACHE);
 	}
+
 	if (has_coregroup_support()) {
-		powerpc_topology[i++] = (struct sched_domain_topology_level){
-			cpu_mc_mask, powerpc_shared_proc_flags, SD_INIT_NAME(MC)
-		};
+		powerpc_topology[i++] =
+			SDTL_INIT(cpu_mc_mask, powerpc_shared_proc_flags, MC);
 	}
-	powerpc_topology[i++] = (struct sched_domain_topology_level){
-		cpu_cpu_mask, powerpc_shared_proc_flags, SD_INIT_NAME(PKG)
-	};
+
+	powerpc_topology[i++] = SDTL_INIT(cpu_cpu_mask, powerpc_shared_proc_flags, PKG);
 
 	/* There must be one trailing NULL entry left.  */
 	BUG_ON(i >= ARRAY_SIZE(powerpc_topology) - 1);
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index 3df048e190b1..46569b8e47dd 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -531,11 +531,11 @@ static const struct cpumask *cpu_drawer_mask(int cpu)
 }
 
 static struct sched_domain_topology_level s390_topology[] = {
-	{ cpu_thread_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
-	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
-	{ cpu_book_mask, SD_INIT_NAME(BOOK) },
-	{ cpu_drawer_mask, SD_INIT_NAME(DRAWER) },
-	{ cpu_cpu_mask, SD_INIT_NAME(PKG) },
+	SDTL_INIT(cpu_thread_mask, cpu_smt_flags, SMT),
+	SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC),
+	SDTL_INIT(cpu_book_mask, NULL, BOOK),
+	SDTL_INIT(cpu_drawer_mask, NULL, DRAWER),
+	SDTL_INIT(cpu_cpu_mask, NULL, PKG),
 	{ NULL, },
 };
 
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index fc78c2325fd2..e0adf75f617a 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -485,35 +485,26 @@ static void __init build_sched_topology(void)
 	int i = 0;
 
 #ifdef CONFIG_SCHED_SMT
-	x86_topology[i++] = (struct sched_domain_topology_level){
-		cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT)
-	};
+	x86_topology[i++] = SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT);
 #endif
 #ifdef CONFIG_SCHED_CLUSTER
-	x86_topology[i++] = (struct sched_domain_topology_level){
-		cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS)
-	};
+	x86_topology[i++] = SDTL_INIT(cpu_clustergroup_mask, x86_cluster_flags, CLS);
 #endif
 #ifdef CONFIG_SCHED_MC
-	x86_topology[i++] = (struct sched_domain_topology_level){
-		cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC)
-	};
+	x86_topology[i++] = SDTL_INIT(cpu_coregroup_mask, x86_core_flags, MC);
 #endif
 	/*
 	 * When there is NUMA topology inside the package skip the PKG domain
 	 * since the NUMA domains will auto-magically create the right spanning
 	 * domains based on the SLIT.
 	 */
-	if (!x86_has_numa_in_package) {
-		x86_topology[i++] = (struct sched_domain_topology_level){
-			cpu_cpu_mask, x86_sched_itmt_flags, SD_INIT_NAME(PKG)
-		};
-	}
+	if (!x86_has_numa_in_package)
+		x86_topology[i++] = SDTL_INIT(cpu_cpu_mask, x86_sched_itmt_flags, PKG);
 
 	/*
 	 * There must be one trailing NULL entry left.
 	 */
-	BUG_ON(i >= ARRAY_SIZE(x86_topology)-1);
+	BUG_ON(i >= ARRAY_SIZE(x86_topology) - 1);
 
 	set_sched_topology(x86_topology);
 }
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index e54e7fa76ba6..0d5daaa277b7 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -196,8 +196,8 @@ struct sched_domain_topology_level {
 extern void __init set_sched_topology(struct sched_domain_topology_level *tl);
 extern void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio);
 
-
-# define SD_INIT_NAME(type)		.name = #type
+#define SDTL_INIT(maskfn, flagsfn, dname) ((struct sched_domain_topology_level) \
+	    { .mask = maskfn, .sd_flags = flagsfn, .name = #dname })
 
 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
 extern void rebuild_sched_domains_energy(void);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 8e06b1d22e91..d01f5a49f2e7 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1737,17 +1737,17 @@ sd_init(struct sched_domain_topology_level *tl,
  */
 static struct sched_domain_topology_level default_topology[] = {
 #ifdef CONFIG_SCHED_SMT
-	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+	SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT),
 #endif
 
 #ifdef CONFIG_SCHED_CLUSTER
-	{ cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) },
+	SDTL_INIT(cpu_clustergroup_mask, cpu_cluster_flags, CLS),
 #endif
 
 #ifdef CONFIG_SCHED_MC
-	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+	SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC),
 #endif
-	{ cpu_cpu_mask, SD_INIT_NAME(PKG) },
+	SDTL_INIT(cpu_cpu_mask, NULL, PKG),
 	{ NULL, },
 };
 
@@ -2008,23 +2008,15 @@ void sched_init_numa(int offline_node)
 	/*
 	 * Add the NUMA identity distance, aka single NODE.
 	 */
-	tl[i++] = (struct sched_domain_topology_level){
-		.mask = sd_numa_mask,
-		.numa_level = 0,
-		SD_INIT_NAME(NODE)
-	};
+	tl[i++] = SDTL_INIT(sd_numa_mask, NULL, NODE);
 
 	/*
 	 * .. and append 'j' levels of NUMA goodness.
 	 */
 	for (j = 1; j < nr_levels; i++, j++) {
-		tl[i] = (struct sched_domain_topology_level){
-			.mask = sd_numa_mask,
-			.sd_flags = cpu_numa_flags,
-			.flags = SDTL_OVERLAP,
-			.numa_level = j,
-			SD_INIT_NAME(NUMA)
-		};
+		tl[i] = SDTL_INIT(sd_numa_mask, cpu_numa_flags, NUMA);
+		tl[i].numa_level = j;
+		tl[i].flags = SDTL_OVERLAP;
 	}
 
 	sched_domain_topology_saved = sched_domain_topology;
-- 
cgit v1.2.3


From 1eec89a671413ce38df9fe9e70f5130a9eb79a59 Mon Sep 17 00:00:00 2001
From: K Prateek Nayak <kprateek.nayak@amd.com>
Date: Fri, 11 Jul 2025 11:20:30 +0530
Subject: sched/topology: Remove sched_domain_topology_level::flags

Support for overlapping domains added in commit e3589f6c81e4 ("sched:
Allow for overlapping sched_domain spans") also allowed forcefully
setting SD_OVERLAP for !NUMA domains via FORCE_SD_OVERLAP sched_feat().

Since NUMA domains had to be presumed overlapping to ensure correct
behavior, "sched_domain_topology_level::flags" was introduced. NUMA
domains set the SDTL_OVERLAP flag, which ensured that SD_OVERLAP was
always added during build_sched_domains() for these domains, even when
FORCE_SD_OVERLAP was off.

Condition for adding the SD_OVERLAP flag at the aforementioned commit
was as follows:

    if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
            sd->flags |= SD_OVERLAP;

The FORCE_SD_OVERLAP debug feature was removed in commit af85596c74de
("sched/topology: Remove FORCE_SD_OVERLAP") which left the NUMA domains
as the exclusive users of SDTL_OVERLAP, SD_OVERLAP, and SD_NUMA flags.

Get rid of SDTL_OVERLAP and SD_OVERLAP as they have become redundant
and instead rely on SD_NUMA to detect the only overlapping domain
currently supported. Since SDTL_OVERLAP was the only user of
"tl->flags", get rid of "sched_domain_topology_level::flags" too.

Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/ba4dbdf8-bc37-493d-b2e0-2efb00ea3e19@amd.com
---
 include/linux/sched/sd_flags.h |  8 --------
 include/linux/sched/topology.h |  3 ---
 kernel/sched/fair.c            |  6 +++---
 kernel/sched/topology.c        | 19 ++++++++++---------
 4 files changed, 13 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
index b04a5d04dee9..42839cfa2778 100644
--- a/include/linux/sched/sd_flags.h
+++ b/include/linux/sched/sd_flags.h
@@ -153,14 +153,6 @@ SD_FLAG(SD_ASYM_PACKING, SDF_NEEDS_GROUPS)
  */
 SD_FLAG(SD_PREFER_SIBLING, SDF_NEEDS_GROUPS)
 
-/*
- * sched_groups of this level overlap
- *
- * SHARED_PARENT: Set for all NUMA levels above NODE.
- * NEEDS_GROUPS: Overlaps can only exist with more than one group.
- */
-SD_FLAG(SD_OVERLAP, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
-
 /*
  * Cross-node balancing
  *
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 0d5daaa277b7..5263746b63e8 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -175,8 +175,6 @@ bool cpus_share_resources(int this_cpu, int that_cpu);
 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
 typedef int (*sched_domain_flags_f)(void);
 
-#define SDTL_OVERLAP	0x01
-
 struct sd_data {
 	struct sched_domain *__percpu *sd;
 	struct sched_domain_shared *__percpu *sds;
@@ -187,7 +185,6 @@ struct sd_data {
 struct sched_domain_topology_level {
 	sched_domain_mask_f mask;
 	sched_domain_flags_f sd_flags;
-	int		    flags;
 	int		    numa_level;
 	struct sd_data      data;
 	char                *name;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 20a845697c1d..b9b4bbbf0af6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9926,9 +9926,9 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
 	min_capacity = ULONG_MAX;
 	max_capacity = 0;
 
-	if (child->flags & SD_OVERLAP) {
+	if (child->flags & SD_NUMA) {
 		/*
-		 * SD_OVERLAP domains cannot assume that child groups
+		 * SD_NUMA domains cannot assume that child groups
 		 * span the current group.
 		 */
 
@@ -9941,7 +9941,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
 		}
 	} else  {
 		/*
-		 * !SD_OVERLAP domains can assume that child groups
+		 * !SD_NUMA domains can assume that child groups
 		 * span the current group.
 		 */
 
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index d01f5a49f2e7..977e133bb8a4 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -89,7 +89,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 			break;
 		}
 
-		if (!(sd->flags & SD_OVERLAP) &&
+		if (!(sd->flags & SD_NUMA) &&
 		    cpumask_intersects(groupmask, sched_group_span(group))) {
 			printk(KERN_CONT "\n");
 			printk(KERN_ERR "ERROR: repeated CPUs\n");
@@ -102,7 +102,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 				group->sgc->id,
 				cpumask_pr_args(sched_group_span(group)));
 
-		if ((sd->flags & SD_OVERLAP) &&
+		if ((sd->flags & SD_NUMA) &&
 		    !cpumask_equal(group_balance_mask(group), sched_group_span(group))) {
 			printk(KERN_CONT " mask=%*pbl",
 				cpumask_pr_args(group_balance_mask(group)));
@@ -1344,7 +1344,7 @@ void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio)
 		 * "sg->asym_prefer_cpu" to "sg->sgc->asym_prefer_cpu"
 		 * which is shared by all the overlapping groups.
 		 */
-		WARN_ON_ONCE(sd->flags & SD_OVERLAP);
+		WARN_ON_ONCE(sd->flags & SD_NUMA);
 
 		sg = sd->groups;
 		if (cpu != sg->asym_prefer_cpu) {
@@ -2016,7 +2016,6 @@ void sched_init_numa(int offline_node)
 	for (j = 1; j < nr_levels; i++, j++) {
 		tl[i] = SDTL_INIT(sd_numa_mask, cpu_numa_flags, NUMA);
 		tl[i].numa_level = j;
-		tl[i].flags = SDTL_OVERLAP;
 	}
 
 	sched_domain_topology_saved = sched_domain_topology;
@@ -2327,7 +2326,7 @@ static void __sdt_free(const struct cpumask *cpu_map)
 
 			if (sdd->sd) {
 				sd = *per_cpu_ptr(sdd->sd, j);
-				if (sd && (sd->flags & SD_OVERLAP))
+				if (sd && (sd->flags & SD_NUMA))
 					free_sched_groups(sd->groups, 0);
 				kfree(*per_cpu_ptr(sdd->sd, j));
 			}
@@ -2393,9 +2392,13 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
 	id_seen = sched_domains_tmpmask2;
 
 	for_each_sd_topology(tl) {
+		int tl_common_flags = 0;
+
+		if (tl->sd_flags)
+			tl_common_flags = (*tl->sd_flags)();
 
 		/* NUMA levels are allowed to overlap */
-		if (tl->flags & SDTL_OVERLAP)
+		if (tl_common_flags & SD_NUMA)
 			continue;
 
 		cpumask_clear(covered);
@@ -2466,8 +2469,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 
 			if (tl == sched_domain_topology)
 				*per_cpu_ptr(d.sd, i) = sd;
-			if (tl->flags & SDTL_OVERLAP)
-				sd->flags |= SD_OVERLAP;
 			if (cpumask_equal(cpu_map, sched_domain_span(sd)))
 				break;
 		}
@@ -2480,7 +2481,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 	for_each_cpu(i, cpu_map) {
 		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
 			sd->span_weight = cpumask_weight(sched_domain_span(sd));
-			if (sd->flags & SD_OVERLAP) {
+			if (sd->flags & SD_NUMA) {
 				if (build_overlap_sched_groups(sd, i))
 					goto error;
 			} else {
-- 
cgit v1.2.3


From 89635eae076cd8eaa5cb752f66538c9dc6c9fdc3 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 11 Jul 2025 16:10:01 +0100
Subject: netfs: Fix race between cache write completion and ALL_QUEUED being
 set

When netfslib is issuing subrequests, the subrequests start processing
immediately and may complete before we reach the end of the issuing
function.  At the end of the issuing function we set NETFS_RREQ_ALL_QUEUED
to indicate to the collector that we aren't going to issue any more subreqs
and that it can do the final notifications and cleanup.

Now, this isn't a problem if the request is synchronous
(NETFS_RREQ_OFFLOAD_COLLECTION is unset) as the result collection will be
done in-thread and we're guaranteed an opportunity to run the collector.

However, if the request is asynchronous, collection is primarily triggered
by the termination of subrequests queuing it on a workqueue.  Now, a race
can occur here if the app thread sets ALL_QUEUED after the last subrequest
terminates.

This can happen most easily with the copy2cache code (as used by Ceph)
where, in the collection routine of a read request, an asynchronous write
request is spawned to copy data to the cache.  Folios are added to the
write request as they're unlocked, but there may be a delay before
ALL_QUEUED is set as the write subrequests may complete before we get
there.

If all the write subreqs have finished by the ALL_QUEUED point, no further
events happen and the collection never happens, leaving the request
hanging.

Fix this by queuing the collector after setting ALL_QUEUED.  This is a bit
heavy-handed and it may be sufficient to do it only if there are no extant
subreqs.

Also add a tracepoint to cross-reference both requests in a copy-to-cache
operation and add a trace to the netfs_rreq tracepoint to indicate the
setting of ALL_QUEUED.

Fixes: e2d46f2ec332 ("netfs: Change the read result collector to only use one work item")
Reported-by: Max Kellermann <max.kellermann@ionos.com>
Link: https://lore.kernel.org/r/CAKPOu+8z_ijTLHdiCYGU_Uk7yYD=shxyGLwfe-L7AV3DhebS3w@mail.gmail.com/
Signed-off-by: David Howells <dhowells@redhat.com>
Link: https://lore.kernel.org/20250711151005.2956810-3-dhowells@redhat.com
Reviewed-by: Paulo Alcantara (Red Hat) <pc@manguebit.org>
cc: Paulo Alcantara <pc@manguebit.org>
cc: Viacheslav Dubeyko <slava@dubeyko.com>
cc: Alex Markuze <amarkuze@redhat.com>
cc: Ilya Dryomov <idryomov@gmail.com>
cc: netfs@lists.linux.dev
cc: ceph-devel@vger.kernel.org
cc: linux-fsdevel@vger.kernel.org
cc: stable@vger.kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/netfs/read_pgpriv2.c      |  4 ++++
 include/trace/events/netfs.h | 30 ++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+)

(limited to 'include')

diff --git a/fs/netfs/read_pgpriv2.c b/fs/netfs/read_pgpriv2.c
index 080d2a6a51d9..8097bc069c1d 100644
--- a/fs/netfs/read_pgpriv2.c
+++ b/fs/netfs/read_pgpriv2.c
@@ -111,6 +111,7 @@ static struct netfs_io_request *netfs_pgpriv2_begin_copy_to_cache(
 		goto cancel_put;
 
 	__set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &creq->flags);
+	trace_netfs_copy2cache(rreq, creq);
 	trace_netfs_write(creq, netfs_write_trace_copy_to_cache);
 	netfs_stat(&netfs_n_wh_copy_to_cache);
 	rreq->copy_to_cache = creq;
@@ -155,6 +156,9 @@ void netfs_pgpriv2_end_copy_to_cache(struct netfs_io_request *rreq)
 	netfs_issue_write(creq, &creq->io_streams[1]);
 	smp_wmb(); /* Write lists before ALL_QUEUED. */
 	set_bit(NETFS_RREQ_ALL_QUEUED, &creq->flags);
+	trace_netfs_rreq(rreq, netfs_rreq_trace_end_copy_to_cache);
+	if (list_empty_careful(&creq->io_streams[1].subrequests))
+		netfs_wake_collector(creq);
 
 	netfs_put_request(creq, netfs_rreq_trace_put_return);
 	creq->copy_to_cache = NULL;
diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h
index 73e96ccbe830..64a382fbc31a 100644
--- a/include/trace/events/netfs.h
+++ b/include/trace/events/netfs.h
@@ -55,6 +55,7 @@
 	EM(netfs_rreq_trace_copy,		"COPY   ")	\
 	EM(netfs_rreq_trace_dirty,		"DIRTY  ")	\
 	EM(netfs_rreq_trace_done,		"DONE   ")	\
+	EM(netfs_rreq_trace_end_copy_to_cache,	"END-C2C")	\
 	EM(netfs_rreq_trace_free,		"FREE   ")	\
 	EM(netfs_rreq_trace_ki_complete,	"KI-CMPL")	\
 	EM(netfs_rreq_trace_recollect,		"RECLLCT")	\
@@ -559,6 +560,35 @@ TRACE_EVENT(netfs_write,
 		      __entry->start, __entry->start + __entry->len - 1)
 	    );
 
+TRACE_EVENT(netfs_copy2cache,
+	    TP_PROTO(const struct netfs_io_request *rreq,
+		     const struct netfs_io_request *creq),
+
+	    TP_ARGS(rreq, creq),
+
+	    TP_STRUCT__entry(
+		    __field(unsigned int,		rreq)
+		    __field(unsigned int,		creq)
+		    __field(unsigned int,		cookie)
+		    __field(unsigned int,		ino)
+			     ),
+
+	    TP_fast_assign(
+		    struct netfs_inode *__ctx = netfs_inode(rreq->inode);
+		    struct fscache_cookie *__cookie = netfs_i_cookie(__ctx);
+		    __entry->rreq	= rreq->debug_id;
+		    __entry->creq	= creq->debug_id;
+		    __entry->cookie	= __cookie ? __cookie->debug_id : 0;
+		    __entry->ino	= rreq->inode->i_ino;
+			   ),
+
+	    TP_printk("R=%08x CR=%08x c=%08x i=%x ",
+		      __entry->rreq,
+		      __entry->creq,
+		      __entry->cookie,
+		      __entry->ino)
+	    );
+
 TRACE_EVENT(netfs_collect,
 	    TP_PROTO(const struct netfs_io_request *wreq),
 
-- 
cgit v1.2.3


From 45b9d1da6ca0d0285140f8779793b537e4560d45 Mon Sep 17 00:00:00 2001
From: Jie Zhan <zhanjie9@hisilicon.com>
Date: Mon, 23 Jun 2025 22:34:00 +0800
Subject: PM / devfreq: Allow devfreq driver to add custom sysfs ABIs

Extend the devfreq_dev_profile to allow drivers to optionally create
device-specific sysfs ABIs together with other common devfreq ABIs under
the devfreq device path.

Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Huisong Li <lihuisong@huawei.com>
Signed-off-by: Jie Zhan <zhanjie9@hisilicon.com>
Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
Link: https://patchwork.kernel.org/project/linux-pm/patch/20250623143401.4095045-2-zhanjie9@hisilicon.com/
---
 drivers/devfreq/devfreq.c | 1 +
 include/linux/devfreq.h   | 4 ++++
 2 files changed, 5 insertions(+)

(limited to 'include')

diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c
index c5f5960e643b..2e8d01d47f69 100644
--- a/drivers/devfreq/devfreq.c
+++ b/drivers/devfreq/devfreq.c
@@ -831,6 +831,7 @@ struct devfreq *devfreq_add_device(struct device *dev,
 	mutex_lock(&devfreq->lock);
 	devfreq->dev.parent = dev;
 	devfreq->dev.class = devfreq_class;
+	devfreq->dev.groups = profile->dev_groups;
 	devfreq->dev.release = devfreq_dev_release;
 	INIT_LIST_HEAD(&devfreq->node);
 	devfreq->profile = profile;
diff --git a/include/linux/devfreq.h b/include/linux/devfreq.h
index d312ffbac4dd..dc1075dc3446 100644
--- a/include/linux/devfreq.h
+++ b/include/linux/devfreq.h
@@ -103,6 +103,8 @@ struct devfreq_dev_status {
  *
  * @is_cooling_device: A self-explanatory boolean giving the device a
  *                     cooling effect property.
+ * @dev_groups:		Optional device-specific sysfs attribute groups to be
+ *			attached to the devfreq device.
  */
 struct devfreq_dev_profile {
 	unsigned long initial_freq;
@@ -119,6 +121,8 @@ struct devfreq_dev_profile {
 	unsigned int max_state;
 
 	bool is_cooling_device;
+
+	const struct attribute_group **dev_groups;
 };
 
 /**
-- 
cgit v1.2.3


From 36a686c0784fcccdaa4f38b498a9ef0d42ea7cb8 Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Thu, 10 Jul 2025 18:43:42 +0200
Subject: Revert "netfilter: nf_tables: Add notifications for hook changes"

This reverts commit 465b9ee0ee7bc268d7f261356afd6c4262e48d82.

Such notifications fit better into core or nfnetlink_hook code,
following the NFNL_MSG_HOOK_GET message format.

Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h        |  5 ---
 include/uapi/linux/netfilter/nf_tables.h | 10 ------
 include/uapi/linux/netfilter/nfnetlink.h |  2 --
 net/netfilter/nf_tables_api.c            | 59 --------------------------------
 net/netfilter/nfnetlink.c                |  1 -
 net/netfilter/nft_chain_filter.c         |  2 --
 6 files changed, 79 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index e4d8e451e935..5e49619ae49c 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1142,11 +1142,6 @@ int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set);
 int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain);
 void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain);
 
-struct nft_hook;
-void nf_tables_chain_device_notify(const struct nft_chain *chain,
-				   const struct nft_hook *hook,
-				   const struct net_device *dev, int event);
-
 enum nft_chain_types {
 	NFT_CHAIN_T_DEFAULT = 0,
 	NFT_CHAIN_T_ROUTE,
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 518ba144544c..2beb30be2c5f 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -142,8 +142,6 @@ enum nf_tables_msg_types {
 	NFT_MSG_DESTROYOBJ,
 	NFT_MSG_DESTROYFLOWTABLE,
 	NFT_MSG_GETSETELEM_RESET,
-	NFT_MSG_NEWDEV,
-	NFT_MSG_DELDEV,
 	NFT_MSG_MAX,
 };
 
@@ -1786,18 +1784,10 @@ enum nft_synproxy_attributes {
  * enum nft_device_attributes - nf_tables device netlink attributes
  *
  * @NFTA_DEVICE_NAME: name of this device (NLA_STRING)
- * @NFTA_DEVICE_TABLE: table containing the flowtable or chain hooking into the device (NLA_STRING)
- * @NFTA_DEVICE_FLOWTABLE: flowtable hooking into the device (NLA_STRING)
- * @NFTA_DEVICE_CHAIN: chain hooking into the device (NLA_STRING)
- * @NFTA_DEVICE_SPEC: hook spec matching the device (NLA_STRING)
  */
 enum nft_devices_attributes {
 	NFTA_DEVICE_UNSPEC,
 	NFTA_DEVICE_NAME,
-	NFTA_DEVICE_TABLE,
-	NFTA_DEVICE_FLOWTABLE,
-	NFTA_DEVICE_CHAIN,
-	NFTA_DEVICE_SPEC,
 	__NFTA_DEVICE_MAX
 };
 #define NFTA_DEVICE_MAX		(__NFTA_DEVICE_MAX - 1)
diff --git a/include/uapi/linux/netfilter/nfnetlink.h b/include/uapi/linux/netfilter/nfnetlink.h
index 50d807af2649..6cd58cd2a6f0 100644
--- a/include/uapi/linux/netfilter/nfnetlink.h
+++ b/include/uapi/linux/netfilter/nfnetlink.h
@@ -25,8 +25,6 @@ enum nfnetlink_groups {
 #define NFNLGRP_ACCT_QUOTA		NFNLGRP_ACCT_QUOTA
 	NFNLGRP_NFTRACE,
 #define NFNLGRP_NFTRACE			NFNLGRP_NFTRACE
-	NFNLGRP_NFT_DEV,
-#define NFNLGRP_NFT_DEV			NFNLGRP_NFT_DEV
 	__NFNLGRP_MAX,
 };
 #define NFNLGRP_MAX	(__NFNLGRP_MAX - 1)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 24c71ecb2179..a7240736f98e 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -9686,64 +9686,6 @@ struct nf_hook_ops *nft_hook_find_ops_rcu(const struct nft_hook *hook,
 }
 EXPORT_SYMBOL_GPL(nft_hook_find_ops_rcu);
 
-static void
-nf_tables_device_notify(const struct nft_table *table, int attr,
-			const char *name, const struct nft_hook *hook,
-			const struct net_device *dev, int event)
-{
-	struct net *net = dev_net(dev);
-	struct nlmsghdr *nlh;
-	struct sk_buff *skb;
-	u16 flags = 0;
-
-	if (!nfnetlink_has_listeners(net, NFNLGRP_NFT_DEV))
-		return;
-
-	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
-	if (!skb)
-		goto err;
-
-	event = event == NETDEV_REGISTER ? NFT_MSG_NEWDEV : NFT_MSG_DELDEV;
-	event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
-	nlh = nfnl_msg_put(skb, 0, 0, event, flags, table->family,
-			   NFNETLINK_V0, nft_base_seq(net));
-	if (!nlh)
-		goto err;
-
-	if (nla_put_string(skb, NFTA_DEVICE_TABLE, table->name) ||
-	    nla_put_string(skb, attr, name) ||
-	    nla_put(skb, NFTA_DEVICE_SPEC, hook->ifnamelen, hook->ifname) ||
-	    nla_put_string(skb, NFTA_DEVICE_NAME, dev->name))
-		goto err;
-
-	nlmsg_end(skb, nlh);
-	nfnetlink_send(skb, net, 0, NFNLGRP_NFT_DEV,
-		       nlmsg_report(nlh), GFP_KERNEL);
-	return;
-err:
-	if (skb)
-		kfree_skb(skb);
-	nfnetlink_set_err(net, 0, NFNLGRP_NFT_DEV, -ENOBUFS);
-}
-
-void
-nf_tables_chain_device_notify(const struct nft_chain *chain,
-			      const struct nft_hook *hook,
-			      const struct net_device *dev, int event)
-{
-	nf_tables_device_notify(chain->table, NFTA_DEVICE_CHAIN,
-				chain->name, hook, dev, event);
-}
-
-static void
-nf_tables_flowtable_device_notify(const struct nft_flowtable *ft,
-				  const struct nft_hook *hook,
-				  const struct net_device *dev, int event)
-{
-	nf_tables_device_notify(ft->table, NFTA_DEVICE_FLOWTABLE,
-				ft->name, hook, dev, event);
-}
-
 static int nft_flowtable_event(unsigned long event, struct net_device *dev,
 			       struct nft_flowtable *flowtable, bool changename)
 {
@@ -9791,7 +9733,6 @@ static int nft_flowtable_event(unsigned long event, struct net_device *dev,
 			list_add_tail_rcu(&ops->list, &hook->ops_list);
 			break;
 		}
-		nf_tables_flowtable_device_notify(flowtable, hook, dev, event);
 		break;
 	}
 	return 0;
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index ac77fc21632d..e598a2a252b0 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -86,7 +86,6 @@ static const int nfnl_group2type[NFNLGRP_MAX+1] = {
 	[NFNLGRP_NFTABLES]		= NFNL_SUBSYS_NFTABLES,
 	[NFNLGRP_ACCT_QUOTA]		= NFNL_SUBSYS_ACCT,
 	[NFNLGRP_NFTRACE]		= NFNL_SUBSYS_NFTABLES,
-	[NFNLGRP_NFT_DEV]		= NFNL_SUBSYS_NFTABLES,
 };
 
 static struct nfnl_net *nfnl_pernet(struct net *net)
diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c
index 846d48ba8965..b16185e9a6dd 100644
--- a/net/netfilter/nft_chain_filter.c
+++ b/net/netfilter/nft_chain_filter.c
@@ -363,8 +363,6 @@ static int nft_netdev_event(unsigned long event, struct net_device *dev,
 			list_add_tail_rcu(&ops->list, &hook->ops_list);
 			break;
 		}
-		nf_tables_chain_device_notify(&basechain->chain,
-					      hook, dev, event);
 		break;
 	}
 	return 0;
-- 
cgit v1.2.3


From 25c411fce735dda29de26f58d3fce52d4824380c Mon Sep 17 00:00:00 2001
From: John Stultz <jstultz@google.com>
Date: Sat, 12 Jul 2025 03:33:42 +0000
Subject: sched: Add CONFIG_SCHED_PROXY_EXEC & boot argument to enable/disable

Add a CONFIG_SCHED_PROXY_EXEC option, along with a boot argument
sched_proxy_exec= that can be used to disable the feature at boot
time if CONFIG_SCHED_PROXY_EXEC was enabled.

Also use this option to allow rq->donor to be different from
rq->curr.

Signed-off-by: John Stultz <jstultz@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://lkml.kernel.org/r/20250712033407.2383110-2-jstultz@google.com
---
 Documentation/admin-guide/kernel-parameters.txt |  5 +++++
 include/linux/sched.h                           | 13 +++++++++++
 init/Kconfig                                    | 12 ++++++++++
 kernel/sched/core.c                             | 29 +++++++++++++++++++++++++
 kernel/sched/sched.h                            | 12 ++++++++++
 5 files changed, 71 insertions(+)

(limited to 'include')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 07e22ba5bfe3..00b835717860 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -6387,6 +6387,11 @@
 	sa1100ir	[NET]
 			See drivers/net/irda/sa1100_ir.c.
 
+	sched_proxy_exec= [KNL]
+			Enables or disables "proxy execution" style
+			solution to mutex-based priority inversion.
+			Format: <bool>
+
 	sched_verbose	[KNL,EARLY] Enables verbose scheduler debug messages.
 
 	schedstats=	[KNL,X86] Enable or disable scheduled statistics.
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 54a91261e99b..f225b6b1baa3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1656,6 +1656,19 @@ struct task_struct {
 	randomized_struct_fields_end
 } __attribute__ ((aligned (64)));
 
+#ifdef CONFIG_SCHED_PROXY_EXEC
+DECLARE_STATIC_KEY_TRUE(__sched_proxy_exec);
+static inline bool sched_proxy_exec(void)
+{
+	return static_branch_likely(&__sched_proxy_exec);
+}
+#else
+static inline bool sched_proxy_exec(void)
+{
+	return false;
+}
+#endif
+
 #define TASK_REPORT_IDLE	(TASK_REPORT + 1)
 #define TASK_REPORT_MAX		(TASK_REPORT_IDLE << 1)
 
diff --git a/init/Kconfig b/init/Kconfig
index 965699c0a6d9..24dd42d3808d 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -878,6 +878,18 @@ config UCLAMP_BUCKETS_COUNT
 
 	  If in doubt, use the default value.
 
+config SCHED_PROXY_EXEC
+	bool "Proxy Execution"
+	# Avoid some build failures w/ PREEMPT_RT until it can be fixed
+	depends on !PREEMPT_RT
+	# Need to investigate how to inform sched_ext of split contexts
+	depends on !SCHED_CLASS_EXT
+	# Not particularly useful until we get to multi-rq proxying
+	depends on EXPERT
+	help
+	  This option enables proxy execution, a mechanism for mutex-owning
+	  tasks to inherit the scheduling context of higher priority waiters.
+
 endmenu
 
 #
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e9c8bda84d80..dd9f5c08b563 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -119,6 +119,35 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
+#ifdef CONFIG_SCHED_PROXY_EXEC
+DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec);
+static int __init setup_proxy_exec(char *str)
+{
+	bool proxy_enable = true;
+
+	if (*str && kstrtobool(str + 1, &proxy_enable)) {
+		pr_warn("Unable to parse sched_proxy_exec=\n");
+		return 0;
+	}
+
+	if (proxy_enable) {
+		pr_info("sched_proxy_exec enabled via boot arg\n");
+		static_branch_enable(&__sched_proxy_exec);
+	} else {
+		pr_info("sched_proxy_exec disabled via boot arg\n");
+		static_branch_disable(&__sched_proxy_exec);
+	}
+	return 1;
+}
+#else
+static int __init setup_proxy_exec(char *str)
+{
+	pr_warn("CONFIG_SCHED_PROXY_EXEC=n, so it cannot be enabled or disabled at boot time\n");
+	return 0;
+}
+#endif
+__setup("sched_proxy_exec", setup_proxy_exec);
+
 /*
  * Debugging: various feature bits
  *
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ac953fad8c21..e53d0b87f780 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1142,10 +1142,15 @@ struct rq {
 	 */
 	unsigned long 		nr_uninterruptible;
 
+#ifdef CONFIG_SCHED_PROXY_EXEC
+	struct task_struct __rcu	*donor;  /* Scheduling context */
+	struct task_struct __rcu	*curr;   /* Execution context */
+#else
 	union {
 		struct task_struct __rcu *donor; /* Scheduler context */
 		struct task_struct __rcu *curr;  /* Execution context */
 	};
+#endif
 	struct sched_dl_entity	*dl_server;
 	struct task_struct	*idle;
 	struct task_struct	*stop;
@@ -1326,10 +1331,17 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 #define raw_rq()		raw_cpu_ptr(&runqueues)
 
+#ifdef CONFIG_SCHED_PROXY_EXEC
+static inline void rq_set_donor(struct rq *rq, struct task_struct *t)
+{
+	rcu_assign_pointer(rq->donor, t);
+}
+#else
 static inline void rq_set_donor(struct rq *rq, struct task_struct *t)
 {
 	/* Do nothing */
 }
+#endif
 
 #ifdef CONFIG_SCHED_CORE
 static inline struct cpumask *sched_group_span(struct sched_group *sg);
-- 
cgit v1.2.3


From 44e4e0297c3c01987399bb9973f4d22a096a62c2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sat, 12 Jul 2025 03:33:43 +0000
Subject: locking/mutex: Rework task_struct::blocked_on

Track the blocked-on relation for mutexes, to allow following this
relation at schedule time.

   task
     | blocked-on
     v
   mutex
     | owner
     v
   task

This will all be used for tracking blocked-task/mutex chains
with the proxy-execution patch, in a similar fashion to how
priority inheritance is done with rt_mutexes.

For serialization, blocked-on is only set by the task itself
(current). Both setting and clearing (the latter potentially by
others) are done while holding the mutex::wait_lock.

[minor changes while rebasing]
[jstultz: Fix blocked_on tracking in __mutex_lock_common in error paths]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Juri Lelli <juri.lelli@redhat.com>
Signed-off-by: Connor O'Brien <connoro@google.com>
Signed-off-by: John Stultz <jstultz@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://lkml.kernel.org/r/20250712033407.2383110-3-jstultz@google.com
---
 include/linux/sched.h        |  5 +----
 kernel/fork.c                |  3 +--
 kernel/locking/mutex-debug.c |  9 +++++----
 kernel/locking/mutex.c       | 22 ++++++++++++++++++++++
 kernel/locking/ww_mutex.h    | 18 ++++++++++++++++--
 5 files changed, 45 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index f225b6b1baa3..33ad240ec900 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1230,10 +1230,7 @@ struct task_struct {
 	struct rt_mutex_waiter		*pi_blocked_on;
 #endif
 
-#ifdef CONFIG_DEBUG_MUTEXES
-	/* Mutex deadlock detection: */
-	struct mutex_waiter		*blocked_on;
-#endif
+	struct mutex			*blocked_on;	/* lock we're blocked on */
 
 #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
 	/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 1ee8eb11f38b..5f87f05aff4a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2123,9 +2123,8 @@ __latent_entropy struct task_struct *copy_process(
 	lockdep_init_task(p);
 #endif
 
-#ifdef CONFIG_DEBUG_MUTEXES
 	p->blocked_on = NULL; /* not blocked yet */
-#endif
+
 #ifdef CONFIG_BCACHE
 	p->sequential_io	= 0;
 	p->sequential_io_avg	= 0;
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 6e6f6071cfa2..758b7a6792b0 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -53,17 +53,18 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
 {
 	lockdep_assert_held(&lock->wait_lock);
 
-	/* Mark the current thread as blocked on the lock: */
-	task->blocked_on = waiter;
+	/* Current thread can't be already blocked (since it's executing!) */
+	DEBUG_LOCKS_WARN_ON(task->blocked_on);
 }
 
 void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
 			 struct task_struct *task)
 {
+	struct mutex *blocked_on = READ_ONCE(task->blocked_on);
+
 	DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
 	DEBUG_LOCKS_WARN_ON(waiter->task != task);
-	DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter);
-	task->blocked_on = NULL;
+	DEBUG_LOCKS_WARN_ON(blocked_on && blocked_on != lock);
 
 	INIT_LIST_HEAD(&waiter->list);
 	waiter->task = NULL;
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index a39ecccbd106..e2f59863a866 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -644,6 +644,8 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
 			goto err_early_kill;
 	}
 
+	WARN_ON(current->blocked_on);
+	current->blocked_on = lock;
 	set_current_state(state);
 	trace_contention_begin(lock, LCB_F_MUTEX);
 	for (;;) {
@@ -680,6 +682,12 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
 
 		first = __mutex_waiter_is_first(lock, &waiter);
 
+		/*
+		 * As we likely have been woken up by task
+		 * that has cleared our blocked_on state, re-set
+		 * it to the lock we are trying to aquire.
+		 */
+		current->blocked_on = lock;
 		set_current_state(state);
 		/*
 		 * Here we order against unlock; we must either see it change
@@ -691,8 +699,11 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
 
 		if (first) {
 			trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
+			/* clear blocked_on as mutex_optimistic_spin may schedule() */
+			current->blocked_on = NULL;
 			if (mutex_optimistic_spin(lock, ww_ctx, &waiter))
 				break;
+			current->blocked_on = lock;
 			trace_contention_begin(lock, LCB_F_MUTEX);
 		}
 
@@ -700,6 +711,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
 	}
 	raw_spin_lock_irqsave(&lock->wait_lock, flags);
 acquired:
+	current->blocked_on = NULL;
 	__set_current_state(TASK_RUNNING);
 
 	if (ww_ctx) {
@@ -729,9 +741,11 @@ skip_wait:
 	return 0;
 
 err:
+	current->blocked_on = NULL;
 	__set_current_state(TASK_RUNNING);
 	__mutex_remove_waiter(lock, &waiter);
 err_early_kill:
+	WARN_ON(current->blocked_on);
 	trace_contention_end(lock, ret);
 	raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
 	debug_mutex_free_waiter(&waiter);
@@ -942,6 +956,14 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
 		next = waiter->task;
 
 		debug_mutex_wake_waiter(lock, waiter);
+		/*
+		 * Unlock wakeups can be happening in parallel
+		 * (when optimistic spinners steal and release
+		 * the lock), so blocked_on may already be
+		 * cleared here.
+		 */
+		WARN_ON(next->blocked_on && next->blocked_on != lock);
+		next->blocked_on = NULL;
 		wake_q_add(&wake_q, next);
 	}
 
diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h
index 37f025a096c9..45fe05e51db1 100644
--- a/kernel/locking/ww_mutex.h
+++ b/kernel/locking/ww_mutex.h
@@ -283,7 +283,15 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter,
 	if (waiter->ww_ctx->acquired > 0 && __ww_ctx_less(waiter->ww_ctx, ww_ctx)) {
 #ifndef WW_RT
 		debug_mutex_wake_waiter(lock, waiter);
+		/*
+		 * When waking up the task to die, be sure to clear the
+		 * blocked_on pointer. Otherwise we can see circular
+		 * blocked_on relationships that can't resolve.
+		 */
+		WARN_ON(waiter->task->blocked_on &&
+			waiter->task->blocked_on != lock);
 #endif
+		waiter->task->blocked_on = NULL;
 		wake_q_add(wake_q, waiter->task);
 	}
 
@@ -331,9 +339,15 @@ static bool __ww_mutex_wound(struct MUTEX *lock,
 		 * it's wounded in __ww_mutex_check_kill() or has a
 		 * wakeup pending to re-read the wounded state.
 		 */
-		if (owner != current)
+		if (owner != current) {
+			/*
+			 * When waking up the task to wound, be sure to clear the
+			 * blocked_on pointer. Otherwise we can see circular
+			 * blocked_on relationships that can't resolve.
+			 */
+			owner->blocked_on = NULL;
 			wake_q_add(wake_q, owner);
-
+		}
 		return true;
 	}
 
-- 
cgit v1.2.3


From a4f0b6fef4b08e9928449206390133e48ac185a7 Mon Sep 17 00:00:00 2001
From: Valentin Schneider <valentin.schneider@arm.com>
Date: Sat, 12 Jul 2025 03:33:44 +0000
Subject: locking/mutex: Add p->blocked_on wrappers for correctness checks

This lets us assert that mutex::wait_lock is held whenever we access
p->blocked_on, and warns us about unexpected state changes.

[fix conflicts, call in more places]
[jstultz: tweaked commit subject, reworked a good bit]
Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Connor O'Brien <connoro@google.com>
Signed-off-by: John Stultz <jstultz@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://lkml.kernel.org/r/20250712033407.2383110-4-jstultz@google.com
---
 include/linux/sched.h        | 64 ++++++++++++++++++++++++++++++++++++++++++--
 kernel/locking/mutex-debug.c |  4 +--
 kernel/locking/mutex.c       | 32 ++++++++++------------
 kernel/locking/ww_mutex.h    |  8 +++---
 4 files changed, 81 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 33ad240ec900..5b4e1cd52e27 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -34,6 +34,7 @@
 #include <linux/sched/prio.h>
 #include <linux/sched/types.h>
 #include <linux/signal_types.h>
+#include <linux/spinlock.h>
 #include <linux/syscall_user_dispatch_types.h>
 #include <linux/mm_types_task.h>
 #include <linux/netdevice_xmit.h>
@@ -2129,6 +2130,67 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock);
 	__cond_resched_rwlock_write(lock);					\
 })
 
+#ifndef CONFIG_PREEMPT_RT
+static inline struct mutex *__get_task_blocked_on(struct task_struct *p)
+{
+	struct mutex *m = p->blocked_on;
+
+	if (m)
+		lockdep_assert_held_once(&m->wait_lock);
+	return m;
+}
+
+static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m)
+{
+	WARN_ON_ONCE(!m);
+	/* The task should only be setting itself as blocked */
+	WARN_ON_ONCE(p != current);
+	/* Currently we serialize blocked_on under the mutex::wait_lock */
+	lockdep_assert_held_once(&m->wait_lock);
+	/*
+	 * Check ensure we don't overwrite existing mutex value
+	 * with a different mutex. Note, setting it to the same
+	 * lock repeatedly is ok.
+	 */
+	WARN_ON_ONCE(p->blocked_on && p->blocked_on != m);
+	p->blocked_on = m;
+}
+
+static inline void set_task_blocked_on(struct task_struct *p, struct mutex *m)
+{
+	guard(raw_spinlock_irqsave)(&m->wait_lock);
+	__set_task_blocked_on(p, m);
+}
+
+static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *m)
+{
+	WARN_ON_ONCE(!m);
+	/* Currently we serialize blocked_on under the mutex::wait_lock */
+	lockdep_assert_held_once(&m->wait_lock);
+	/*
+	 * There may be cases where we re-clear already cleared
+	 * blocked_on relationships, but make sure we are not
+	 * clearing the relationship with a different lock.
+	 */
+	WARN_ON_ONCE(m && p->blocked_on && p->blocked_on != m);
+	p->blocked_on = NULL;
+}
+
+static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m)
+{
+	guard(raw_spinlock_irqsave)(&m->wait_lock);
+	__clear_task_blocked_on(p, m);
+}
+#else
+static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m)
+{
+}
+
+static inline void clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m)
+{
+}
+#endif /* !CONFIG_PREEMPT_RT */
+
 static __always_inline bool need_resched(void)
 {
 	return unlikely(tif_need_resched());
@@ -2168,8 +2230,6 @@ extern bool sched_task_on_rq(struct task_struct *p);
 extern unsigned long get_wchan(struct task_struct *p);
 extern struct task_struct *cpu_curr_snapshot(int cpu);
 
-#include <linux/spinlock.h>
-
 /*
  * In order to reduce various lock holder preemption latencies provide an
  * interface to see if a vCPU is currently running or not.
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 758b7a6792b0..949103fd8e9b 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -54,13 +54,13 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
 	lockdep_assert_held(&lock->wait_lock);
 
 	/* Current thread can't be already blocked (since it's executing!) */
-	DEBUG_LOCKS_WARN_ON(task->blocked_on);
+	DEBUG_LOCKS_WARN_ON(__get_task_blocked_on(task));
 }
 
 void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
 			 struct task_struct *task)
 {
-	struct mutex *blocked_on = READ_ONCE(task->blocked_on);
+	struct mutex *blocked_on = __get_task_blocked_on(task);
 
 	DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
 	DEBUG_LOCKS_WARN_ON(waiter->task != task);
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index e2f59863a866..80d778fedd60 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -644,8 +644,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
 			goto err_early_kill;
 	}
 
-	WARN_ON(current->blocked_on);
-	current->blocked_on = lock;
+	__set_task_blocked_on(current, lock);
 	set_current_state(state);
 	trace_contention_begin(lock, LCB_F_MUTEX);
 	for (;;) {
@@ -685,9 +684,9 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
 		/*
 		 * As we likely have been woken up by task
 		 * that has cleared our blocked_on state, re-set
-		 * it to the lock we are trying to aquire.
+		 * it to the lock we are trying to acquire.
 		 */
-		current->blocked_on = lock;
+		set_task_blocked_on(current, lock);
 		set_current_state(state);
 		/*
 		 * Here we order against unlock; we must either see it change
@@ -699,11 +698,15 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
 
 		if (first) {
 			trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
-			/* clear blocked_on as mutex_optimistic_spin may schedule() */
-			current->blocked_on = NULL;
+			/*
+			 * mutex_optimistic_spin() can call schedule(), so
+			 * clear blocked on so we don't become unselectable
+			 * to run.
+			 */
+			clear_task_blocked_on(current, lock);
 			if (mutex_optimistic_spin(lock, ww_ctx, &waiter))
 				break;
-			current->blocked_on = lock;
+			set_task_blocked_on(current, lock);
 			trace_contention_begin(lock, LCB_F_MUTEX);
 		}
 
@@ -711,7 +714,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
 	}
 	raw_spin_lock_irqsave(&lock->wait_lock, flags);
 acquired:
-	current->blocked_on = NULL;
+	__clear_task_blocked_on(current, lock);
 	__set_current_state(TASK_RUNNING);
 
 	if (ww_ctx) {
@@ -741,11 +744,11 @@ skip_wait:
 	return 0;
 
 err:
-	current->blocked_on = NULL;
+	__clear_task_blocked_on(current, lock);
 	__set_current_state(TASK_RUNNING);
 	__mutex_remove_waiter(lock, &waiter);
 err_early_kill:
-	WARN_ON(current->blocked_on);
+	WARN_ON(__get_task_blocked_on(current));
 	trace_contention_end(lock, ret);
 	raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
 	debug_mutex_free_waiter(&waiter);
@@ -956,14 +959,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
 		next = waiter->task;
 
 		debug_mutex_wake_waiter(lock, waiter);
-		/*
-		 * Unlock wakeups can be happening in parallel
-		 * (when optimistic spinners steal and release
-		 * the lock), so blocked_on may already be
-		 * cleared here.
-		 */
-		WARN_ON(next->blocked_on && next->blocked_on != lock);
-		next->blocked_on = NULL;
+		__clear_task_blocked_on(next, lock);
 		wake_q_add(&wake_q, next);
 	}
 
diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h
index 45fe05e51db1..086fd5487ca7 100644
--- a/kernel/locking/ww_mutex.h
+++ b/kernel/locking/ww_mutex.h
@@ -283,15 +283,13 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter,
 	if (waiter->ww_ctx->acquired > 0 && __ww_ctx_less(waiter->ww_ctx, ww_ctx)) {
 #ifndef WW_RT
 		debug_mutex_wake_waiter(lock, waiter);
+#endif
 		/*
 		 * When waking up the task to die, be sure to clear the
 		 * blocked_on pointer. Otherwise we can see circular
 		 * blocked_on relationships that can't resolve.
 		 */
-		WARN_ON(waiter->task->blocked_on &&
-			waiter->task->blocked_on != lock);
-#endif
-		waiter->task->blocked_on = NULL;
+		__clear_task_blocked_on(waiter->task, lock);
 		wake_q_add(wake_q, waiter->task);
 	}
 
@@ -345,7 +343,7 @@ static bool __ww_mutex_wound(struct MUTEX *lock,
 			 * blocked_on pointer. Otherwise we can see circular
 			 * blocked_on relationships that can't resolve.
 			 */
-			owner->blocked_on = NULL;
+			__clear_task_blocked_on(owner, lock);
 			wake_q_add(wake_q, owner);
 		}
 		return true;
-- 
cgit v1.2.3


From 8671bad873ebeb082afcf7b4501395c374da6023 Mon Sep 17 00:00:00 2001
From: "Luis Claudio R. Goncalves" <lgoncalv@redhat.com>
Date: Mon, 7 Jul 2025 11:03:59 -0300
Subject: sched: Do not call __put_task_struct() on rt if pi_blocked_on is set

With PREEMPT_RT enabled, some of the calls to put_task_struct() coming
from rt_mutex_adjust_prio_chain() could happen in preemptible context and
with a mutex enqueued. That could lead to this sequence:

        rt_mutex_adjust_prio_chain()
          put_task_struct()
            __put_task_struct()
              sched_ext_free()
                spin_lock_irqsave()
                  rtlock_lock() --->  TRIGGERS
                                      lockdep_assert(!current->pi_blocked_on);

This is not a SCHED_EXT bug. The first cleanup function called by
__put_task_struct() is sched_ext_free() and it happens to take a
(RT) spin_lock, which in the scenario described above, would trigger
the lockdep assertion of "!current->pi_blocked_on".

Crystal Wood was able to identify the problem as __put_task_struct()
being called during rt_mutex_adjust_prio_chain(), in the context of
a process with a mutex enqueued.

Instead of adding more complex conditions to decide when to directly
call __put_task_struct() and when to defer the call, unconditionally
resort to the deferred call on PREEMPT_RT to simplify the code.

Fixes: 893cdaaa3977 ("sched: avoid false lockdep splat in put_task_struct()")
Suggested-by: Crystal Wood <crwood@redhat.com>
Signed-off-by: Luis Claudio R. Goncalves <lgoncalv@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Wander Lairson Costa <wander@redhat.com>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: https://lore.kernel.org/r/aGvTz5VaPFyj0pBV@uudg.org
---
 include/linux/sched/task.h | 27 ++++++++++-----------------
 1 file changed, 10 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index c517dbc242f7..ea41795a352b 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -131,24 +131,17 @@ static inline void put_task_struct(struct task_struct *t)
 		return;
 
 	/*
-	 * In !RT, it is always safe to call __put_task_struct().
-	 * Under RT, we can only call it in preemptible context.
-	 */
-	if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) {
-		static DEFINE_WAIT_OVERRIDE_MAP(put_task_map, LD_WAIT_SLEEP);
-
-		lock_map_acquire_try(&put_task_map);
-		__put_task_struct(t);
-		lock_map_release(&put_task_map);
-		return;
-	}
-
-	/*
-	 * under PREEMPT_RT, we can't call put_task_struct
+	 * Under PREEMPT_RT, we can't call __put_task_struct
 	 * in atomic context because it will indirectly
-	 * acquire sleeping locks.
+	 * acquire sleeping locks. The same is true if the
+	 * current process has a mutex enqueued (blocked on
+	 * a PI chain).
+	 *
+	 * In !RT, it is always safe to call __put_task_struct().
+	 * Though, in order to simplify the code, resort to the
+	 * deferred call too.
 	 *
-	 * call_rcu() will schedule delayed_put_task_struct_rcu()
+	 * call_rcu() will schedule __put_task_struct_rcu_cb()
 	 * to be called in process context.
 	 *
 	 * __put_task_struct() is called when
@@ -161,7 +154,7 @@ static inline void put_task_struct(struct task_struct *t)
 	 *
 	 * delayed_free_task() also uses ->rcu, but it is only called
 	 * when it fails to fork a process. Therefore, there is no
-	 * way it can conflict with put_task_struct().
+	 * way it can conflict with __put_task_struct().
 	 */
 	call_rcu(&t->rcu, __put_task_struct_rcu_cb);
 }
-- 
cgit v1.2.3


From 7941ad696506917fa6228f44be2df0c2f0909a62 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Fri, 11 Jul 2025 14:58:43 -0700
Subject: lib/crypto: sha2: Add hmac_sha*_init_usingrawkey()

While the HMAC library functions support both incremental and one-shot
computation and both prepared and raw keys, the combination of raw key
+ incremental was missing.  It turns out that several potential users of
the HMAC library functions (tpm2-sessions.c, smb2transport.c,
trusted_tpm1.c) want exactly that.

Therefore, add the missing functions hmac_sha*_init_usingrawkey().

Implement them in an optimized way that directly initializes the HMAC
context without a separate key preparation step.

Reimplement the one-shot raw key functions hmac_sha*_usingrawkey() on
top of the new functions, which makes them a bit more efficient.

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250711215844.41715-1-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 include/crypto/sha2.h | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/crypto/sha256.c   | 53 ++++++++++++++++++++++++++++++---------------
 lib/crypto/sha512.c   | 55 +++++++++++++++++++++++++++++++---------------
 3 files changed, 134 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/include/crypto/sha2.h b/include/crypto/sha2.h
index e0a08f6addd0..15e461e568cc 100644
--- a/include/crypto/sha2.h
+++ b/include/crypto/sha2.h
@@ -247,6 +247,21 @@ static inline void hmac_sha224_init(struct hmac_sha224_ctx *ctx,
 	__hmac_sha256_init(&ctx->ctx, &key->key);
 }
 
+/**
+ * hmac_sha224_init_usingrawkey() - Initialize an HMAC-SHA224 context for a new
+ *				    message, using a raw key
+ * @ctx: (output) the HMAC context to initialize
+ * @raw_key: the raw HMAC-SHA224 key
+ * @raw_key_len: the key length in bytes.  All key lengths are supported.
+ *
+ * If you don't need incremental computation, consider hmac_sha224_usingrawkey()
+ * instead.
+ *
+ * Context: Any context.
+ */
+void hmac_sha224_init_usingrawkey(struct hmac_sha224_ctx *ctx,
+				  const u8 *raw_key, size_t raw_key_len);
+
 /**
  * hmac_sha224_update() - Update an HMAC-SHA224 context with message data
  * @ctx: the HMAC context to update; must have been initialized
@@ -405,6 +420,21 @@ static inline void hmac_sha256_init(struct hmac_sha256_ctx *ctx,
 	__hmac_sha256_init(&ctx->ctx, &key->key);
 }
 
+/**
+ * hmac_sha256_init_usingrawkey() - Initialize an HMAC-SHA256 context for a new
+ *				    message, using a raw key
+ * @ctx: (output) the HMAC context to initialize
+ * @raw_key: the raw HMAC-SHA256 key
+ * @raw_key_len: the key length in bytes.  All key lengths are supported.
+ *
+ * If you don't need incremental computation, consider hmac_sha256_usingrawkey()
+ * instead.
+ *
+ * Context: Any context.
+ */
+void hmac_sha256_init_usingrawkey(struct hmac_sha256_ctx *ctx,
+				  const u8 *raw_key, size_t raw_key_len);
+
 /**
  * hmac_sha256_update() - Update an HMAC-SHA256 context with message data
  * @ctx: the HMAC context to update; must have been initialized
@@ -597,6 +627,21 @@ static inline void hmac_sha384_init(struct hmac_sha384_ctx *ctx,
 	__hmac_sha512_init(&ctx->ctx, &key->key);
 }
 
+/**
+ * hmac_sha384_init_usingrawkey() - Initialize an HMAC-SHA384 context for a new
+ *				    message, using a raw key
+ * @ctx: (output) the HMAC context to initialize
+ * @raw_key: the raw HMAC-SHA384 key
+ * @raw_key_len: the key length in bytes.  All key lengths are supported.
+ *
+ * If you don't need incremental computation, consider hmac_sha384_usingrawkey()
+ * instead.
+ *
+ * Context: Any context.
+ */
+void hmac_sha384_init_usingrawkey(struct hmac_sha384_ctx *ctx,
+				  const u8 *raw_key, size_t raw_key_len);
+
 /**
  * hmac_sha384_update() - Update an HMAC-SHA384 context with message data
  * @ctx: the HMAC context to update; must have been initialized
@@ -755,6 +800,21 @@ static inline void hmac_sha512_init(struct hmac_sha512_ctx *ctx,
 	__hmac_sha512_init(&ctx->ctx, &key->key);
 }
 
+/**
+ * hmac_sha512_init_usingrawkey() - Initialize an HMAC-SHA512 context for a new
+ *				    message, using a raw key
+ * @ctx: (output) the HMAC context to initialize
+ * @raw_key: the raw HMAC-SHA512 key
+ * @raw_key_len: the key length in bytes.  All key lengths are supported.
+ *
+ * If you don't need incremental computation, consider hmac_sha512_usingrawkey()
+ * instead.
+ *
+ * Context: Any context.
+ */
+void hmac_sha512_init_usingrawkey(struct hmac_sha512_ctx *ctx,
+				  const u8 *raw_key, size_t raw_key_len);
+
 /**
  * hmac_sha512_update() - Update an HMAC-SHA512 context with message data
  * @ctx: the HMAC context to update; must have been initialized
diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c
index 808438d4f427..8fa15165d23e 100644
--- a/lib/crypto/sha256.c
+++ b/lib/crypto/sha256.c
@@ -263,7 +263,8 @@ EXPORT_SYMBOL(sha256);
 
 /* pre-boot environment (as indicated by __DISABLE_EXPORTS) doesn't need HMAC */
 #ifndef __DISABLE_EXPORTS
-static void __hmac_sha256_preparekey(struct __hmac_sha256_key *key,
+static void __hmac_sha256_preparekey(struct sha256_block_state *istate,
+				     struct sha256_block_state *ostate,
 				     const u8 *raw_key, size_t raw_key_len,
 				     const struct sha256_block_state *iv)
 {
@@ -283,14 +284,14 @@ static void __hmac_sha256_preparekey(struct __hmac_sha256_key *key,
 
 	for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
 		derived_key.w[i] ^= REPEAT_BYTE(HMAC_IPAD_VALUE);
-	key->istate = *iv;
-	sha256_blocks(&key->istate, derived_key.b, 1);
+	*istate = *iv;
+	sha256_blocks(istate, derived_key.b, 1);
 
 	for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
 		derived_key.w[i] ^= REPEAT_BYTE(HMAC_OPAD_VALUE ^
 						HMAC_IPAD_VALUE);
-	key->ostate = *iv;
-	sha256_blocks(&key->ostate, derived_key.b, 1);
+	*ostate = *iv;
+	sha256_blocks(ostate, derived_key.b, 1);
 
 	memzero_explicit(&derived_key, sizeof(derived_key));
 }
@@ -298,14 +299,16 @@ static void __hmac_sha256_preparekey(struct __hmac_sha256_key *key,
 void hmac_sha224_preparekey(struct hmac_sha224_key *key,
 			    const u8 *raw_key, size_t raw_key_len)
 {
-	__hmac_sha256_preparekey(&key->key, raw_key, raw_key_len, &sha224_iv);
+	__hmac_sha256_preparekey(&key->key.istate, &key->key.ostate,
+				 raw_key, raw_key_len, &sha224_iv);
 }
 EXPORT_SYMBOL_GPL(hmac_sha224_preparekey);
 
 void hmac_sha256_preparekey(struct hmac_sha256_key *key,
 			    const u8 *raw_key, size_t raw_key_len)
 {
-	__hmac_sha256_preparekey(&key->key, raw_key, raw_key_len, &sha256_iv);
+	__hmac_sha256_preparekey(&key->key.istate, &key->key.ostate,
+				 raw_key, raw_key_len, &sha256_iv);
 }
 EXPORT_SYMBOL_GPL(hmac_sha256_preparekey);
 
@@ -317,6 +320,24 @@ void __hmac_sha256_init(struct __hmac_sha256_ctx *ctx,
 }
 EXPORT_SYMBOL_GPL(__hmac_sha256_init);
 
+void hmac_sha224_init_usingrawkey(struct hmac_sha224_ctx *ctx,
+				  const u8 *raw_key, size_t raw_key_len)
+{
+	__hmac_sha256_preparekey(&ctx->ctx.sha_ctx.state, &ctx->ctx.ostate,
+				 raw_key, raw_key_len, &sha224_iv);
+	ctx->ctx.sha_ctx.bytecount = SHA256_BLOCK_SIZE;
+}
+EXPORT_SYMBOL_GPL(hmac_sha224_init_usingrawkey);
+
+void hmac_sha256_init_usingrawkey(struct hmac_sha256_ctx *ctx,
+				  const u8 *raw_key, size_t raw_key_len)
+{
+	__hmac_sha256_preparekey(&ctx->ctx.sha_ctx.state, &ctx->ctx.ostate,
+				 raw_key, raw_key_len, &sha256_iv);
+	ctx->ctx.sha_ctx.bytecount = SHA256_BLOCK_SIZE;
+}
+EXPORT_SYMBOL_GPL(hmac_sha256_init_usingrawkey);
+
 static void __hmac_sha256_final(struct __hmac_sha256_ctx *ctx,
 				u8 *out, size_t digest_size)
 {
@@ -376,12 +397,11 @@ void hmac_sha224_usingrawkey(const u8 *raw_key, size_t raw_key_len,
 			     const u8 *data, size_t data_len,
 			     u8 out[SHA224_DIGEST_SIZE])
 {
-	struct hmac_sha224_key key;
-
-	hmac_sha224_preparekey(&key, raw_key, raw_key_len);
-	hmac_sha224(&key, data, data_len, out);
+	struct hmac_sha224_ctx ctx;
 
-	memzero_explicit(&key, sizeof(key));
+	hmac_sha224_init_usingrawkey(&ctx, raw_key, raw_key_len);
+	hmac_sha224_update(&ctx, data, data_len);
+	hmac_sha224_final(&ctx, out);
 }
 EXPORT_SYMBOL_GPL(hmac_sha224_usingrawkey);
 
@@ -389,12 +409,11 @@ void hmac_sha256_usingrawkey(const u8 *raw_key, size_t raw_key_len,
 			     const u8 *data, size_t data_len,
 			     u8 out[SHA256_DIGEST_SIZE])
 {
-	struct hmac_sha256_key key;
-
-	hmac_sha256_preparekey(&key, raw_key, raw_key_len);
-	hmac_sha256(&key, data, data_len, out);
+	struct hmac_sha256_ctx ctx;
 
-	memzero_explicit(&key, sizeof(key));
+	hmac_sha256_init_usingrawkey(&ctx, raw_key, raw_key_len);
+	hmac_sha256_update(&ctx, data, data_len);
+	hmac_sha256_final(&ctx, out);
 }
 EXPORT_SYMBOL_GPL(hmac_sha256_usingrawkey);
 #endif /* !__DISABLE_EXPORTS */
diff --git a/lib/crypto/sha512.c b/lib/crypto/sha512.c
index d514721491ca..d8062188be98 100644
--- a/lib/crypto/sha512.c
+++ b/lib/crypto/sha512.c
@@ -249,7 +249,8 @@ void sha512(const u8 *data, size_t len, u8 out[SHA512_DIGEST_SIZE])
 }
 EXPORT_SYMBOL_GPL(sha512);
 
-static void __hmac_sha512_preparekey(struct __hmac_sha512_key *key,
+static void __hmac_sha512_preparekey(struct sha512_block_state *istate,
+				     struct sha512_block_state *ostate,
 				     const u8 *raw_key, size_t raw_key_len,
 				     const struct sha512_block_state *iv)
 {
@@ -269,14 +270,14 @@ static void __hmac_sha512_preparekey(struct __hmac_sha512_key *key,
 
 	for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
 		derived_key.w[i] ^= REPEAT_BYTE(HMAC_IPAD_VALUE);
-	key->istate = *iv;
-	sha512_blocks(&key->istate, derived_key.b, 1);
+	*istate = *iv;
+	sha512_blocks(istate, derived_key.b, 1);
 
 	for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
 		derived_key.w[i] ^= REPEAT_BYTE(HMAC_OPAD_VALUE ^
 						HMAC_IPAD_VALUE);
-	key->ostate = *iv;
-	sha512_blocks(&key->ostate, derived_key.b, 1);
+	*ostate = *iv;
+	sha512_blocks(ostate, derived_key.b, 1);
 
 	memzero_explicit(&derived_key, sizeof(derived_key));
 }
@@ -284,14 +285,16 @@ static void __hmac_sha512_preparekey(struct __hmac_sha512_key *key,
 void hmac_sha384_preparekey(struct hmac_sha384_key *key,
 			    const u8 *raw_key, size_t raw_key_len)
 {
-	__hmac_sha512_preparekey(&key->key, raw_key, raw_key_len, &sha384_iv);
+	__hmac_sha512_preparekey(&key->key.istate, &key->key.ostate,
+				 raw_key, raw_key_len, &sha384_iv);
 }
 EXPORT_SYMBOL_GPL(hmac_sha384_preparekey);
 
 void hmac_sha512_preparekey(struct hmac_sha512_key *key,
 			    const u8 *raw_key, size_t raw_key_len)
 {
-	__hmac_sha512_preparekey(&key->key, raw_key, raw_key_len, &sha512_iv);
+	__hmac_sha512_preparekey(&key->key.istate, &key->key.ostate,
+				 raw_key, raw_key_len, &sha512_iv);
 }
 EXPORT_SYMBOL_GPL(hmac_sha512_preparekey);
 
@@ -303,6 +306,26 @@ void __hmac_sha512_init(struct __hmac_sha512_ctx *ctx,
 }
 EXPORT_SYMBOL_GPL(__hmac_sha512_init);
 
+void hmac_sha384_init_usingrawkey(struct hmac_sha384_ctx *ctx,
+				  const u8 *raw_key, size_t raw_key_len)
+{
+	__hmac_sha512_preparekey(&ctx->ctx.sha_ctx.state, &ctx->ctx.ostate,
+				 raw_key, raw_key_len, &sha384_iv);
+	ctx->ctx.sha_ctx.bytecount_lo = SHA512_BLOCK_SIZE;
+	ctx->ctx.sha_ctx.bytecount_hi = 0;
+}
+EXPORT_SYMBOL_GPL(hmac_sha384_init_usingrawkey);
+
+void hmac_sha512_init_usingrawkey(struct hmac_sha512_ctx *ctx,
+				  const u8 *raw_key, size_t raw_key_len)
+{
+	__hmac_sha512_preparekey(&ctx->ctx.sha_ctx.state, &ctx->ctx.ostate,
+				 raw_key, raw_key_len, &sha512_iv);
+	ctx->ctx.sha_ctx.bytecount_lo = SHA512_BLOCK_SIZE;
+	ctx->ctx.sha_ctx.bytecount_hi = 0;
+}
+EXPORT_SYMBOL_GPL(hmac_sha512_init_usingrawkey);
+
 static void __hmac_sha512_final(struct __hmac_sha512_ctx *ctx,
 				u8 *out, size_t digest_size)
 {
@@ -362,12 +385,11 @@ void hmac_sha384_usingrawkey(const u8 *raw_key, size_t raw_key_len,
 			     const u8 *data, size_t data_len,
 			     u8 out[SHA384_DIGEST_SIZE])
 {
-	struct hmac_sha384_key key;
-
-	hmac_sha384_preparekey(&key, raw_key, raw_key_len);
-	hmac_sha384(&key, data, data_len, out);
+	struct hmac_sha384_ctx ctx;
 
-	memzero_explicit(&key, sizeof(key));
+	hmac_sha384_init_usingrawkey(&ctx, raw_key, raw_key_len);
+	hmac_sha384_update(&ctx, data, data_len);
+	hmac_sha384_final(&ctx, out);
 }
 EXPORT_SYMBOL_GPL(hmac_sha384_usingrawkey);
 
@@ -375,12 +397,11 @@ void hmac_sha512_usingrawkey(const u8 *raw_key, size_t raw_key_len,
 			     const u8 *data, size_t data_len,
 			     u8 out[SHA512_DIGEST_SIZE])
 {
-	struct hmac_sha512_key key;
-
-	hmac_sha512_preparekey(&key, raw_key, raw_key_len);
-	hmac_sha512(&key, data, data_len, out);
+	struct hmac_sha512_ctx ctx;
 
-	memzero_explicit(&key, sizeof(key));
+	hmac_sha512_init_usingrawkey(&ctx, raw_key, raw_key_len);
+	hmac_sha512_update(&ctx, data, data_len);
+	hmac_sha512_final(&ctx, out);
 }
 EXPORT_SYMBOL_GPL(hmac_sha512_usingrawkey);
 
-- 
cgit v1.2.3


From 9503ca2ccafec51ee9e533d6f3aef14a589fc106 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Sat, 12 Jul 2025 16:22:53 -0700
Subject: lib/crypto: sha1: Rename sha1_init() to sha1_init_raw()

Rename the existing sha1_init() to sha1_init_raw(), since it conflicts
with the upcoming library function.  This will later be removed, but
this keeps the kernel building for the introduction of the library.

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250712232329.818226-3-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 include/crypto/sha1.h | 2 +-
 kernel/bpf/core.c     | 2 +-
 lib/crypto/sha1.c     | 6 +++---
 net/ipv6/addrconf.c   | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/crypto/sha1.h b/include/crypto/sha1.h
index f48230b1413c..d853d3b93169 100644
--- a/include/crypto/sha1.h
+++ b/include/crypto/sha1.h
@@ -33,7 +33,7 @@ struct sha1_state {
  */
 #define SHA1_DIGEST_WORDS	(SHA1_DIGEST_SIZE / 4)
 #define SHA1_WORKSPACE_WORDS	16
-void sha1_init(__u32 *buf);
+void sha1_init_raw(__u32 *buf);
 void sha1_transform(__u32 *digest, const char *data, __u32 *W);
 
 #endif /* _CRYPTO_SHA1_H */
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index c20babbf998f..dae281a1286d 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -304,7 +304,7 @@ int bpf_prog_calc_tag(struct bpf_prog *fp)
 	if (!raw)
 		return -ENOMEM;
 
-	sha1_init(digest);
+	sha1_init_raw(digest);
 	memset(ws, 0, sizeof(ws));
 
 	/* We need to take out the map fd for the digest calculation
diff --git a/lib/crypto/sha1.c b/lib/crypto/sha1.c
index 6d809c3088be..813ad96daa25 100644
--- a/lib/crypto/sha1.c
+++ b/lib/crypto/sha1.c
@@ -124,10 +124,10 @@ void sha1_transform(__u32 *digest, const char *data, __u32 *array)
 EXPORT_SYMBOL(sha1_transform);
 
 /**
- * sha1_init - initialize the vectors for a SHA1 digest
+ * sha1_init_raw - initialize the vectors for a SHA1 digest
  * @buf: vector to initialize
  */
-void sha1_init(__u32 *buf)
+void sha1_init_raw(__u32 *buf)
 {
 	buf[0] = 0x67452301;
 	buf[1] = 0xefcdab89;
@@ -135,7 +135,7 @@ void sha1_init(__u32 *buf)
 	buf[3] = 0x10325476;
 	buf[4] = 0xc3d2e1f0;
 }
-EXPORT_SYMBOL(sha1_init);
+EXPORT_SYMBOL(sha1_init_raw);
 
 MODULE_DESCRIPTION("SHA-1 Algorithm");
 MODULE_LICENSE("GPL");
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index ba2ec7c870cc..d0e5b94c10af 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3367,7 +3367,7 @@ static int ipv6_generate_stable_address(struct in6_addr *address,
 retry:
 	spin_lock_bh(&lock);
 
-	sha1_init(digest);
+	sha1_init_raw(digest);
 	memset(&data, 0, sizeof(data));
 	memset(workspace, 0, sizeof(workspace));
 	memcpy(data.hwaddr, idev->dev->perm_addr, idev->dev->addr_len);
-- 
cgit v1.2.3


From 90860aef630c5c9e58d05044f2866fcbfa7aa4d9 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Sat, 12 Jul 2025 16:22:54 -0700
Subject: lib/crypto: sha1: Add SHA-1 library functions

Add a library interface for SHA-1, following the SHA-2 one.  As was the
case with SHA-2, this will be useful for various in-kernel users.  The
crypto_shash interface will be reimplemented on top of it as well.

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250712232329.818226-4-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 include/crypto/sha1.h |  60 ++++++++++++++++++++++++++
 lib/crypto/Kconfig    |   7 +++
 lib/crypto/Makefile   |   9 +++-
 lib/crypto/sha1.c     | 115 +++++++++++++++++++++++++++++++++++++++++++++++---
 4 files changed, 184 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/crypto/sha1.h b/include/crypto/sha1.h
index d853d3b93169..387f6123a05e 100644
--- a/include/crypto/sha1.h
+++ b/include/crypto/sha1.h
@@ -36,4 +36,64 @@ struct sha1_state {
 void sha1_init_raw(__u32 *buf);
 void sha1_transform(__u32 *digest, const char *data, __u32 *W);
 
+/* State for the SHA-1 compression function */
+struct sha1_block_state {
+	u32 h[SHA1_DIGEST_SIZE / 4];
+};
+
+/**
+ * struct sha1_ctx - Context for hashing a message with SHA-1
+ * @state: the compression function state
+ * @bytecount: number of bytes processed so far
+ * @buf: partial block buffer; bytecount % SHA1_BLOCK_SIZE bytes are valid
+ */
+struct sha1_ctx {
+	struct sha1_block_state state;
+	u64 bytecount;
+	u8 buf[SHA1_BLOCK_SIZE];
+};
+
+/**
+ * sha1_init() - Initialize a SHA-1 context for a new message
+ * @ctx: the context to initialize
+ *
+ * If you don't need incremental computation, consider sha1() instead.
+ *
+ * Context: Any context.
+ */
+void sha1_init(struct sha1_ctx *ctx);
+
+/**
+ * sha1_update() - Update a SHA-1 context with message data
+ * @ctx: the context to update; must have been initialized
+ * @data: the message data
+ * @len: the data length in bytes
+ *
+ * This can be called any number of times.
+ *
+ * Context: Any context.
+ */
+void sha1_update(struct sha1_ctx *ctx, const u8 *data, size_t len);
+
+/**
+ * sha1_final() - Finish computing a SHA-1 message digest
+ * @ctx: the context to finalize; must have been initialized
+ * @out: (output) the resulting SHA-1 message digest
+ *
+ * After finishing, this zeroizes @ctx.  So the caller does not need to do it.
+ *
+ * Context: Any context.
+ */
+void sha1_final(struct sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE]);
+
+/**
+ * sha1() - Compute SHA-1 message digest in one shot
+ * @data: the message data
+ * @len: the data length in bytes
+ * @out: (output) the resulting SHA-1 message digest
+ *
+ * Context: Any context.
+ */
+void sha1(const u8 *data, size_t len, u8 out[SHA1_DIGEST_SIZE]);
+
 #endif /* _CRYPTO_SHA1_H */
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index ee7e1e27ea6a..5aaf484fc9de 100644
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -139,6 +139,13 @@ config CRYPTO_LIB_CHACHA20POLY1305
 
 config CRYPTO_LIB_SHA1
 	tristate
+	help
+	  The SHA-1 library functions.  Select this if your module uses any of
+	  the functions from <crypto/sha1.h>.
+
+config CRYPTO_LIB_SHA1_ARCH
+	bool
+	depends on CRYPTO_LIB_SHA1 && !UML
 
 config CRYPTO_LIB_SHA256
 	tristate
diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile
index 533bb1533e19..0eb0906d693f 100644
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -65,8 +65,13 @@ libpoly1305-generic-y				:= poly1305-donna32.o
 libpoly1305-generic-$(CONFIG_ARCH_SUPPORTS_INT128) := poly1305-donna64.o
 libpoly1305-generic-y				+= poly1305-generic.o
 
-obj-$(CONFIG_CRYPTO_LIB_SHA1)			+= libsha1.o
-libsha1-y					:= sha1.o
+################################################################################
+
+obj-$(CONFIG_CRYPTO_LIB_SHA1) += libsha1.o
+libsha1-y := sha1.o
+ifeq ($(CONFIG_CRYPTO_LIB_SHA1_ARCH),y)
+CFLAGS_sha1.o += -I$(src)/$(SRCARCH)
+endif # CONFIG_CRYPTO_LIB_SHA1_ARCH
 
 ################################################################################
 
diff --git a/lib/crypto/sha1.c b/lib/crypto/sha1.c
index 813ad96daa25..0fe9ca8d0653 100644
--- a/lib/crypto/sha1.c
+++ b/lib/crypto/sha1.c
@@ -1,9 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * SHA1 routine optimized to do word accesses rather than byte accesses,
- * and to avoid unnecessary copies into the context array.
- *
- * This was based on the git SHA1 implementation.
+ * SHA-1 library functions
  */
 
 #include <crypto/sha1.h>
@@ -14,6 +11,10 @@
 #include <linux/string.h>
 #include <linux/unaligned.h>
 
+static const struct sha1_block_state sha1_iv = {
+	.h = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
+};
+
 /*
  * If you have 32 registers or more, the compiler can (and should)
  * try to change the array[] accesses into registers. However, on
@@ -137,5 +138,109 @@ void sha1_init_raw(__u32 *buf)
 }
 EXPORT_SYMBOL(sha1_init_raw);
 
-MODULE_DESCRIPTION("SHA-1 Algorithm");
+static void __maybe_unused sha1_blocks_generic(struct sha1_block_state *state,
+					       const u8 *data, size_t nblocks)
+{
+	u32 workspace[SHA1_WORKSPACE_WORDS];
+
+	do {
+		sha1_transform(state->h, data, workspace);
+		data += SHA1_BLOCK_SIZE;
+	} while (--nblocks);
+
+	memzero_explicit(workspace, sizeof(workspace));
+}
+
+#ifdef CONFIG_CRYPTO_LIB_SHA1_ARCH
+#include "sha1.h" /* $(SRCARCH)/sha1.h */
+#else
+#define sha1_blocks sha1_blocks_generic
+#endif
+
+void sha1_init(struct sha1_ctx *ctx)
+{
+	ctx->state = sha1_iv;
+	ctx->bytecount = 0;
+}
+EXPORT_SYMBOL_GPL(sha1_init);
+
+void sha1_update(struct sha1_ctx *ctx, const u8 *data, size_t len)
+{
+	size_t partial = ctx->bytecount % SHA1_BLOCK_SIZE;
+
+	ctx->bytecount += len;
+
+	if (partial + len >= SHA1_BLOCK_SIZE) {
+		size_t nblocks;
+
+		if (partial) {
+			size_t l = SHA1_BLOCK_SIZE - partial;
+
+			memcpy(&ctx->buf[partial], data, l);
+			data += l;
+			len -= l;
+
+			sha1_blocks(&ctx->state, ctx->buf, 1);
+		}
+
+		nblocks = len / SHA1_BLOCK_SIZE;
+		len %= SHA1_BLOCK_SIZE;
+
+		if (nblocks) {
+			sha1_blocks(&ctx->state, data, nblocks);
+			data += nblocks * SHA1_BLOCK_SIZE;
+		}
+		partial = 0;
+	}
+	if (len)
+		memcpy(&ctx->buf[partial], data, len);
+}
+EXPORT_SYMBOL_GPL(sha1_update);
+
+void sha1_final(struct sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE])
+{
+	u64 bitcount = ctx->bytecount << 3;
+	size_t partial = ctx->bytecount % SHA1_BLOCK_SIZE;
+
+	ctx->buf[partial++] = 0x80;
+	if (partial > SHA1_BLOCK_SIZE - 8) {
+		memset(&ctx->buf[partial], 0, SHA1_BLOCK_SIZE - partial);
+		sha1_blocks(&ctx->state, ctx->buf, 1);
+		partial = 0;
+	}
+	memset(&ctx->buf[partial], 0, SHA1_BLOCK_SIZE - 8 - partial);
+	*(__be64 *)&ctx->buf[SHA1_BLOCK_SIZE - 8] = cpu_to_be64(bitcount);
+	sha1_blocks(&ctx->state, ctx->buf, 1);
+
+	for (size_t i = 0; i < SHA1_DIGEST_SIZE; i += 4)
+		put_unaligned_be32(ctx->state.h[i / 4], out + i);
+	memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL_GPL(sha1_final);
+
+void sha1(const u8 *data, size_t len, u8 out[SHA1_DIGEST_SIZE])
+{
+	struct sha1_ctx ctx;
+
+	sha1_init(&ctx);
+	sha1_update(&ctx, data, len);
+	sha1_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(sha1);
+
+#ifdef sha1_mod_init_arch
+static int __init sha1_mod_init(void)
+{
+	sha1_mod_init_arch();
+	return 0;
+}
+subsys_initcall(sha1_mod_init);
+
+static void __exit sha1_mod_exit(void)
+{
+}
+module_exit(sha1_mod_exit);
+#endif
+
+MODULE_DESCRIPTION("SHA-1 library functions");
 MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 4cbc84471bb606ddfaf424709dd8d56b56d7ae7b Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Sat, 12 Jul 2025 16:22:55 -0700
Subject: lib/crypto: sha1: Add HMAC support

Add HMAC support to the SHA-1 library, again following what was done for
SHA-2.  Besides providing the basis for a more streamlined "hmac(sha1)"
shash, this will also be useful for multiple in-kernel users such as
net/sctp/auth.c, net/ipv6/seg6_hmac.c, and
security/keys/trusted-keys/trusted_tpm1.c.  Those are currently using
crypto_shash, but using the library functions would be much simpler.

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250712232329.818226-5-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 include/crypto/sha1.h | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/crypto/sha1.c     | 108 +++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 223 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/crypto/sha1.h b/include/crypto/sha1.h
index 387f6123a05e..162a529ec841 100644
--- a/include/crypto/sha1.h
+++ b/include/crypto/sha1.h
@@ -96,4 +96,122 @@ void sha1_final(struct sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE]);
  */
 void sha1(const u8 *data, size_t len, u8 out[SHA1_DIGEST_SIZE]);
 
+/**
+ * struct hmac_sha1_key - Prepared key for HMAC-SHA1
+ * @istate: private
+ * @ostate: private
+ */
+struct hmac_sha1_key {
+	struct sha1_block_state istate;
+	struct sha1_block_state ostate;
+};
+
+/**
+ * struct hmac_sha1_ctx - Context for computing HMAC-SHA1 of a message
+ * @sha_ctx: private
+ * @ostate: private
+ */
+struct hmac_sha1_ctx {
+	struct sha1_ctx sha_ctx;
+	struct sha1_block_state ostate;
+};
+
+/**
+ * hmac_sha1_preparekey() - Prepare a key for HMAC-SHA1
+ * @key: (output) the key structure to initialize
+ * @raw_key: the raw HMAC-SHA1 key
+ * @raw_key_len: the key length in bytes.  All key lengths are supported.
+ *
+ * Note: the caller is responsible for zeroizing both the struct hmac_sha1_key
+ * and the raw key once they are no longer needed.
+ *
+ * Context: Any context.
+ */
+void hmac_sha1_preparekey(struct hmac_sha1_key *key,
+			  const u8 *raw_key, size_t raw_key_len);
+
+/**
+ * hmac_sha1_init() - Initialize an HMAC-SHA1 context for a new message
+ * @ctx: (output) the HMAC context to initialize
+ * @key: the prepared HMAC key
+ *
+ * If you don't need incremental computation, consider hmac_sha1() instead.
+ *
+ * Context: Any context.
+ */
+void hmac_sha1_init(struct hmac_sha1_ctx *ctx, const struct hmac_sha1_key *key);
+
+/**
+ * hmac_sha1_init_usingrawkey() - Initialize an HMAC-SHA1 context for a new
+ *				  message, using a raw key
+ * @ctx: (output) the HMAC context to initialize
+ * @raw_key: the raw HMAC-SHA1 key
+ * @raw_key_len: the key length in bytes.  All key lengths are supported.
+ *
+ * If you don't need incremental computation, consider hmac_sha1_usingrawkey()
+ * instead.
+ *
+ * Context: Any context.
+ */
+void hmac_sha1_init_usingrawkey(struct hmac_sha1_ctx *ctx,
+				const u8 *raw_key, size_t raw_key_len);
+
+/**
+ * hmac_sha1_update() - Update an HMAC-SHA1 context with message data
+ * @ctx: the HMAC context to update; must have been initialized
+ * @data: the message data
+ * @data_len: the data length in bytes
+ *
+ * This can be called any number of times.
+ *
+ * Context: Any context.
+ */
+static inline void hmac_sha1_update(struct hmac_sha1_ctx *ctx,
+				    const u8 *data, size_t data_len)
+{
+	sha1_update(&ctx->sha_ctx, data, data_len);
+}
+
+/**
+ * hmac_sha1_final() - Finish computing an HMAC-SHA1 value
+ * @ctx: the HMAC context to finalize; must have been initialized
+ * @out: (output) the resulting HMAC-SHA1 value
+ *
+ * After finishing, this zeroizes @ctx.  So the caller does not need to do it.
+ *
+ * Context: Any context.
+ */
+void hmac_sha1_final(struct hmac_sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE]);
+
+/**
+ * hmac_sha1() - Compute HMAC-SHA1 in one shot, using a prepared key
+ * @key: the prepared HMAC key
+ * @data: the message data
+ * @data_len: the data length in bytes
+ * @out: (output) the resulting HMAC-SHA1 value
+ *
+ * If you're using the key only once, consider using hmac_sha1_usingrawkey().
+ *
+ * Context: Any context.
+ */
+void hmac_sha1(const struct hmac_sha1_key *key,
+	       const u8 *data, size_t data_len, u8 out[SHA1_DIGEST_SIZE]);
+
+/**
+ * hmac_sha1_usingrawkey() - Compute HMAC-SHA1 in one shot, using a raw key
+ * @raw_key: the raw HMAC-SHA1 key
+ * @raw_key_len: the key length in bytes.  All key lengths are supported.
+ * @data: the message data
+ * @data_len: the data length in bytes
+ * @out: (output) the resulting HMAC-SHA1 value
+ *
+ * If you're using the key multiple times, prefer to use hmac_sha1_preparekey()
+ * followed by multiple calls to hmac_sha1() instead.
+ *
+ * Context: Any context.
+ */
+void hmac_sha1_usingrawkey(const u8 *raw_key, size_t raw_key_len,
+			   const u8 *data, size_t data_len,
+			   u8 out[SHA1_DIGEST_SIZE]);
+
 #endif /* _CRYPTO_SHA1_H */
diff --git a/lib/crypto/sha1.c b/lib/crypto/sha1.c
index 0fe9ca8d0653..5904e4ae85d2 100644
--- a/lib/crypto/sha1.c
+++ b/lib/crypto/sha1.c
@@ -1,8 +1,9 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * SHA-1 library functions
+ * SHA-1 and HMAC-SHA1 library functions
  */
 
+#include <crypto/hmac.h>
 #include <crypto/sha1.h>
 #include <linux/bitops.h>
 #include <linux/export.h>
@@ -10,6 +11,7 @@
 #include <linux/module.h>
 #include <linux/string.h>
 #include <linux/unaligned.h>
+#include <linux/wordpart.h>
 
 static const struct sha1_block_state sha1_iv = {
 	.h = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
@@ -197,7 +199,7 @@ void sha1_update(struct sha1_ctx *ctx, const u8 *data, size_t len)
 }
 EXPORT_SYMBOL_GPL(sha1_update);
 
-void sha1_final(struct sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE])
+static void __sha1_final(struct sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE])
 {
 	u64 bitcount = ctx->bytecount << 3;
 	size_t partial = ctx->bytecount % SHA1_BLOCK_SIZE;
@@ -214,6 +216,11 @@ void sha1_final(struct sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE])
 
 	for (size_t i = 0; i < SHA1_DIGEST_SIZE; i += 4)
 		put_unaligned_be32(ctx->state.h[i / 4], out + i);
+}
+
+void sha1_final(struct sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE])
+{
+	__sha1_final(ctx, out);
 	memzero_explicit(ctx, sizeof(*ctx));
 }
 EXPORT_SYMBOL_GPL(sha1_final);
@@ -228,6 +235,101 @@ void sha1(const u8 *data, size_t len, u8 out[SHA1_DIGEST_SIZE])
 }
 EXPORT_SYMBOL_GPL(sha1);
 
+static void __hmac_sha1_preparekey(struct sha1_block_state *istate,
+				   struct sha1_block_state *ostate,
+				   const u8 *raw_key, size_t raw_key_len)
+{
+	union {
+		u8 b[SHA1_BLOCK_SIZE];
+		unsigned long w[SHA1_BLOCK_SIZE / sizeof(unsigned long)];
+	} derived_key = { 0 };
+
+	if (unlikely(raw_key_len > SHA1_BLOCK_SIZE))
+		sha1(raw_key, raw_key_len, derived_key.b);
+	else
+		memcpy(derived_key.b, raw_key, raw_key_len);
+
+	for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
+		derived_key.w[i] ^= REPEAT_BYTE(HMAC_IPAD_VALUE);
+	*istate = sha1_iv;
+	sha1_blocks(istate, derived_key.b, 1);
+
+	for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++)
+		derived_key.w[i] ^= REPEAT_BYTE(HMAC_OPAD_VALUE ^
+						HMAC_IPAD_VALUE);
+	*ostate = sha1_iv;
+	sha1_blocks(ostate, derived_key.b, 1);
+
+	memzero_explicit(&derived_key, sizeof(derived_key));
+}
+
+void hmac_sha1_preparekey(struct hmac_sha1_key *key,
+			  const u8 *raw_key, size_t raw_key_len)
+{
+	__hmac_sha1_preparekey(&key->istate, &key->ostate,
+			       raw_key, raw_key_len);
+}
+EXPORT_SYMBOL_GPL(hmac_sha1_preparekey);
+
+void hmac_sha1_init(struct hmac_sha1_ctx *ctx, const struct hmac_sha1_key *key)
+{
+	ctx->sha_ctx.state = key->istate;
+	ctx->sha_ctx.bytecount = SHA1_BLOCK_SIZE;
+	ctx->ostate = key->ostate;
+}
+EXPORT_SYMBOL_GPL(hmac_sha1_init);
+
+void hmac_sha1_init_usingrawkey(struct hmac_sha1_ctx *ctx,
+				const u8 *raw_key, size_t raw_key_len)
+{
+	__hmac_sha1_preparekey(&ctx->sha_ctx.state, &ctx->ostate,
+			       raw_key, raw_key_len);
+	ctx->sha_ctx.bytecount = SHA1_BLOCK_SIZE;
+}
+EXPORT_SYMBOL_GPL(hmac_sha1_init_usingrawkey);
+
+void hmac_sha1_final(struct hmac_sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE])
+{
+	/* Generate the padded input for the outer hash in ctx->sha_ctx.buf. */
+	__sha1_final(&ctx->sha_ctx, ctx->sha_ctx.buf);
+	memset(&ctx->sha_ctx.buf[SHA1_DIGEST_SIZE], 0,
+	       SHA1_BLOCK_SIZE - SHA1_DIGEST_SIZE);
+	ctx->sha_ctx.buf[SHA1_DIGEST_SIZE] = 0x80;
+	*(__be32 *)&ctx->sha_ctx.buf[SHA1_BLOCK_SIZE - 4] =
+		cpu_to_be32(8 * (SHA1_BLOCK_SIZE + SHA1_DIGEST_SIZE));
+
+	/* Compute the outer hash, which gives the HMAC value. */
+	sha1_blocks(&ctx->ostate, ctx->sha_ctx.buf, 1);
+	for (size_t i = 0; i < SHA1_DIGEST_SIZE; i += 4)
+		put_unaligned_be32(ctx->ostate.h[i / 4], out + i);
+
+	memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL_GPL(hmac_sha1_final);
+
+void hmac_sha1(const struct hmac_sha1_key *key,
+	       const u8 *data, size_t data_len, u8 out[SHA1_DIGEST_SIZE])
+{
+	struct hmac_sha1_ctx ctx;
+
+	hmac_sha1_init(&ctx, key);
+	hmac_sha1_update(&ctx, data, data_len);
+	hmac_sha1_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_sha1);
+
+void hmac_sha1_usingrawkey(const u8 *raw_key, size_t raw_key_len,
+			   const u8 *data, size_t data_len,
+			   u8 out[SHA1_DIGEST_SIZE])
+{
+	struct hmac_sha1_ctx ctx;
+
+	hmac_sha1_init_usingrawkey(&ctx, raw_key, raw_key_len);
+	hmac_sha1_update(&ctx, data, data_len);
+	hmac_sha1_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(hmac_sha1_usingrawkey);
+
 #ifdef sha1_mod_init_arch
 static int __init sha1_mod_init(void)
 {
@@ -242,5 +344,5 @@ static void __exit sha1_mod_exit(void)
 module_exit(sha1_mod_exit);
 #endif
 
-MODULE_DESCRIPTION("SHA-1 library functions");
+MODULE_DESCRIPTION("SHA-1 and HMAC-SHA1 library functions");
 MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 8d43417e93073699b521f603286140415b24968b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 15 May 2025 13:48:45 +0200
Subject: sunrpc: simplify xdr_init_encode_pages

The rqst argument to xdr_init_encode_pages is set to NULL by all callers,
and pages is always set to buf->pages.  Remove the two arguments and
hardcode the assignments.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs3proc.c         |  2 +-
 fs/nfsd/nfsproc.c          |  2 +-
 include/linux/sunrpc/xdr.h |  3 +--
 net/sunrpc/xdr.c           | 11 ++++-------
 4 files changed, 7 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index a817d8485d21..b6d03e1ef5f7 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -561,7 +561,7 @@ static void nfsd3_init_dirlist_pages(struct svc_rqst *rqstp,
 	buf->pages = rqstp->rq_next_page;
 	rqstp->rq_next_page += (buf->buflen + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
-	xdr_init_encode_pages(xdr, buf, buf->pages,  NULL);
+	xdr_init_encode_pages(xdr, buf);
 }
 
 /*
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index c10fa8128a8a..8f71f5748c75 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -575,7 +575,7 @@ static void nfsd_init_dirlist_pages(struct svc_rqst *rqstp,
 	buf->pages = rqstp->rq_next_page;
 	rqstp->rq_next_page++;
 
-	xdr_init_encode_pages(xdr, buf, buf->pages,  NULL);
+	xdr_init_encode_pages(xdr, buf);
 }
 
 /*
diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index a2ab813a9800..29d3a7659727 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -242,8 +242,7 @@ typedef int	(*kxdrdproc_t)(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 
 extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf,
 			    __be32 *p, struct rpc_rqst *rqst);
-extern void xdr_init_encode_pages(struct xdr_stream *xdr, struct xdr_buf *buf,
-			   struct page **pages, struct rpc_rqst *rqst);
+void xdr_init_encode_pages(struct xdr_stream *xdr, struct xdr_buf *buf);
 extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes);
 extern int xdr_reserve_space_vec(struct xdr_stream *xdr, size_t nbytes);
 extern void __xdr_commit_encode(struct xdr_stream *xdr);
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 2ea00e354ba6..1346fdf33835 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -993,21 +993,18 @@ EXPORT_SYMBOL_GPL(xdr_init_encode);
  * xdr_init_encode_pages - Initialize an xdr_stream for encoding into pages
  * @xdr: pointer to xdr_stream struct
  * @buf: pointer to XDR buffer into which to encode data
- * @pages: list of pages to decode into
- * @rqst: pointer to controlling rpc_rqst, for debugging
  *
  */
-void xdr_init_encode_pages(struct xdr_stream *xdr, struct xdr_buf *buf,
-			   struct page **pages, struct rpc_rqst *rqst)
+void xdr_init_encode_pages(struct xdr_stream *xdr, struct xdr_buf *buf)
 {
 	xdr_reset_scratch_buffer(xdr);
 
 	xdr->buf = buf;
-	xdr->page_ptr = pages;
+	xdr->page_ptr = buf->pages;
 	xdr->iov = NULL;
-	xdr->p = page_address(*pages);
+	xdr->p = page_address(*xdr->page_ptr);
 	xdr->end = (void *)xdr->p + min_t(u32, buf->buflen, PAGE_SIZE);
-	xdr->rqst = rqst;
+	xdr->rqst = NULL;
 }
 EXPORT_SYMBOL_GPL(xdr_init_encode_pages);
 
-- 
cgit v1.2.3


From f26c93053074bba9342b74632c195a043a825ac5 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Tue, 27 May 2025 20:12:48 -0400
Subject: sunrpc: new tracepoints around svc thread wakeups

Convert the svc_wake_up tracepoint into svc_pool_thread_event class.
Have it also record the pool id, and add new tracepoints for when the
thread is already running and for when there are no idle threads.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/trace/events/sunrpc.h | 23 ++++++++++++++++++-----
 net/sunrpc/svc.c              |  6 ++++--
 2 files changed, 22 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index aad697da1580..ff11fa07cbe3 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -2123,22 +2123,35 @@ TRACE_EVENT(svc_xprt_accept,
 	)
 );
 
-TRACE_EVENT(svc_wake_up,
-	TP_PROTO(int pid),
+DECLARE_EVENT_CLASS(svc_pool_thread_event,
+	TP_PROTO(const struct svc_pool *pool, pid_t pid),
 
-	TP_ARGS(pid),
+	TP_ARGS(pool, pid),
 
 	TP_STRUCT__entry(
-		__field(int, pid)
+		__field(unsigned int, pool_id)
+		__field(pid_t, pid)
 	),
 
 	TP_fast_assign(
+		__entry->pool_id = pool->sp_id;
 		__entry->pid = pid;
 	),
 
-	TP_printk("pid=%d", __entry->pid)
+	TP_printk("pool=%u pid=%d", __entry->pool_id, __entry->pid)
 );
 
+#define DEFINE_SVC_POOL_THREAD_EVENT(name) \
+	DEFINE_EVENT(svc_pool_thread_event, svc_pool_thread_##name, \
+			TP_PROTO( \
+				const struct svc_pool *pool, pid_t pid \
+			), \
+			TP_ARGS(pool, pid))
+
+DEFINE_SVC_POOL_THREAD_EVENT(wake);
+DEFINE_SVC_POOL_THREAD_EVENT(running);
+DEFINE_SVC_POOL_THREAD_EVENT(noidle);
+
 TRACE_EVENT(svc_alloc_arg_err,
 	TP_PROTO(
 		unsigned int requested,
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 9c93b854e809..9abdbcbf2473 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -751,14 +751,16 @@ void svc_pool_wake_idle_thread(struct svc_pool *pool)
 		WRITE_ONCE(rqstp->rq_qtime, ktime_get());
 		if (!task_is_running(rqstp->rq_task)) {
 			wake_up_process(rqstp->rq_task);
-			trace_svc_wake_up(rqstp->rq_task->pid);
+			trace_svc_pool_thread_wake(pool, rqstp->rq_task->pid);
 			percpu_counter_inc(&pool->sp_threads_woken);
+		} else {
+			trace_svc_pool_thread_running(pool, rqstp->rq_task->pid);
 		}
 		rcu_read_unlock();
 		return;
 	}
 	rcu_read_unlock();
-
+	trace_svc_pool_thread_noidle(pool, 0);
 }
 EXPORT_SYMBOL_GPL(svc_pool_wake_idle_thread);
 
-- 
cgit v1.2.3


From d49afc90a3ba3af4507049fb43cb128d9a9c66d5 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Fri, 20 Jun 2025 08:16:01 -0400
Subject: sunrpc: fix handling of unknown auth status codes

In the case of an unknown error code from svc_authenticate or
pg_authenticate, return AUTH_ERROR with a status of AUTH_FAILED. Also
add the other auth_stat value from RFC 5531, and document all the status
codes.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/msg_prot.h | 18 ++++++++++--------
 include/linux/sunrpc/xdr.h      |  2 ++
 net/sunrpc/svc.c                |  3 ++-
 3 files changed, 14 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/msg_prot.h b/include/linux/sunrpc/msg_prot.h
index c4b0eb2b2f04..ada17b57ca44 100644
--- a/include/linux/sunrpc/msg_prot.h
+++ b/include/linux/sunrpc/msg_prot.h
@@ -69,15 +69,17 @@ enum rpc_reject_stat {
 };
 
 enum rpc_auth_stat {
-	RPC_AUTH_OK = 0,
-	RPC_AUTH_BADCRED = 1,
-	RPC_AUTH_REJECTEDCRED = 2,
-	RPC_AUTH_BADVERF = 3,
-	RPC_AUTH_REJECTEDVERF = 4,
-	RPC_AUTH_TOOWEAK = 5,
+	RPC_AUTH_OK = 0,		/* success */
+	RPC_AUTH_BADCRED = 1,		/* bad credential (seal broken) */
+	RPC_AUTH_REJECTEDCRED = 2,	/* client must begin new session */
+	RPC_AUTH_BADVERF = 3,		/* bad verifier (seal broken) */
+	RPC_AUTH_REJECTEDVERF = 4,	/* verifier expired or replayed */
+	RPC_AUTH_TOOWEAK = 5,		/* rejected for security reasons */
+	RPC_AUTH_INVALIDRESP = 6,	/* bogus response verifier */
+	RPC_AUTH_FAILED = 7,		/* reason unknown */
 	/* RPCSEC_GSS errors */
-	RPCSEC_GSS_CREDPROBLEM = 13,
-	RPCSEC_GSS_CTXPROBLEM = 14
+	RPCSEC_GSS_CREDPROBLEM = 13,	/* no credentials for user */
+	RPCSEC_GSS_CTXPROBLEM = 14	/* problem with context */
 };
 
 #define RPC_MAXNETNAMELEN	256
diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 29d3a7659727..e3358c630ba1 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -119,6 +119,8 @@ xdr_buf_init(struct xdr_buf *buf, void *start, size_t len)
 #define	rpc_autherr_badverf	cpu_to_be32(RPC_AUTH_BADVERF)
 #define	rpc_autherr_rejectedverf cpu_to_be32(RPC_AUTH_REJECTEDVERF)
 #define	rpc_autherr_tooweak	cpu_to_be32(RPC_AUTH_TOOWEAK)
+#define	rpc_autherr_invalidresp	cpu_to_be32(RPC_AUTH_INVALIDRESP)
+#define	rpc_autherr_failed	cpu_to_be32(RPC_AUTH_FAILED)
 #define	rpcsec_gsserr_credproblem	cpu_to_be32(RPCSEC_GSS_CREDPROBLEM)
 #define	rpcsec_gsserr_ctxproblem	cpu_to_be32(RPCSEC_GSS_CTXPROBLEM)
 
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 9abdbcbf2473..195fb0bea841 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1387,7 +1387,8 @@ svc_process_common(struct svc_rqst *rqstp)
 		goto sendit;
 	default:
 		pr_warn_once("Unexpected svc_auth_status (%d)\n", auth_res);
-		goto err_system_err;
+		rqstp->rq_auth_stat = rpc_autherr_failed;
+		goto err_bad_auth;
 	}
 
 	if (progp == NULL)
-- 
cgit v1.2.3


From 6f0e26243b02f440938ab7a3782eb730f2247fb1 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Fri, 20 Jun 2025 08:16:02 -0400
Subject: sunrpc: remove SVC_SYSERR

Nothing returns this error code.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svcauth.h | 1 -
 include/trace/events/sunrpc.h  | 2 --
 net/sunrpc/svc.c               | 8 --------
 3 files changed, 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/svcauth.h b/include/linux/sunrpc/svcauth.h
index 2e111153f7cd..4b92fec23a49 100644
--- a/include/linux/sunrpc/svcauth.h
+++ b/include/linux/sunrpc/svcauth.h
@@ -86,7 +86,6 @@ struct auth_domain {
 
 enum svc_auth_status {
 	SVC_GARBAGE = 1,
-	SVC_SYSERR,
 	SVC_VALID,
 	SVC_NEGATIVE,
 	SVC_OK,
diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index ff11fa07cbe3..750ecce56930 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -1691,7 +1691,6 @@ SVC_RQST_FLAG_LIST
 		__print_flags(flags, "|", SVC_RQST_FLAG_LIST)
 
 TRACE_DEFINE_ENUM(SVC_GARBAGE);
-TRACE_DEFINE_ENUM(SVC_SYSERR);
 TRACE_DEFINE_ENUM(SVC_VALID);
 TRACE_DEFINE_ENUM(SVC_NEGATIVE);
 TRACE_DEFINE_ENUM(SVC_OK);
@@ -1704,7 +1703,6 @@ TRACE_DEFINE_ENUM(SVC_COMPLETE);
 #define show_svc_auth_status(status)			\
 	__print_symbolic(status,			\
 		{ SVC_GARBAGE,	"SVC_GARBAGE" },	\
-		{ SVC_SYSERR,	"SVC_SYSERR" },		\
 		{ SVC_VALID,	"SVC_VALID" },		\
 		{ SVC_NEGATIVE,	"SVC_NEGATIVE" },	\
 		{ SVC_OK,	"SVC_OK" },		\
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 195fb0bea841..c6ceacedae28 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1375,8 +1375,6 @@ svc_process_common(struct svc_rqst *rqstp)
 	case SVC_GARBAGE:
 		rqstp->rq_auth_stat = rpc_autherr_badcred;
 		goto err_bad_auth;
-	case SVC_SYSERR:
-		goto err_system_err;
 	case SVC_DENIED:
 		goto err_bad_auth;
 	case SVC_CLOSE:
@@ -1515,12 +1513,6 @@ err_bad_proc:
 		serv->sv_stats->rpcbadfmt++;
 	*rqstp->rq_accept_statp = rpc_proc_unavail;
 	goto sendit;
-
-err_system_err:
-	if (serv->sv_stats)
-		serv->sv_stats->rpcbadfmt++;
-	*rqstp->rq_accept_statp = rpc_system_err;
-	goto sendit;
 }
 
 /*
-- 
cgit v1.2.3


From 2bac9a4c5f452d42a78ce07596ef88f75978b536 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Fri, 20 Jun 2025 08:16:05 -0400
Subject: sunrpc: rearrange struct svc_rqst for fewer cachelines

This shrinks the struct by 4 bytes, but also takes it from 19 to 18
cachelines on x86_64.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 48666b83fe68..40cbe81360ed 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -245,10 +245,10 @@ struct svc_rqst {
 						 * initialisation success.
 						 */
 
-	unsigned long	bc_to_initval;
-	unsigned int	bc_to_retries;
-	void **			rq_lease_breaker; /* The v4 client breaking a lease */
+	unsigned long		bc_to_initval;
+	unsigned int		bc_to_retries;
 	unsigned int		rq_status_counter; /* RPC processing counter */
+	void			**rq_lease_breaker; /* The v4 client breaking a lease */
 };
 
 /* bits for rq_flags */
-- 
cgit v1.2.3


From f4312e6bfa2a98e94dacc75f96f916b76bdf4259 Mon Sep 17 00:00:00 2001
From: Joshua Hay <joshua.a.hay@intel.com>
Date: Tue, 8 Jul 2025 16:05:50 -0500
Subject: idpf: implement core RDMA auxiliary dev create, init, and destroy

Add the initial idpf_idc.c file with the functions to kick off the IDC
initialization, create and initialize a core RDMA auxiliary device, and
destroy said device.

The RDMA core has a dependency on the vports being created by the
control plane before it can be initialized. Therefore, once all the
vports are up after a hard reset (either during driver load or a function
level reset), the core RDMA device info will be created. It is populated
with the function type (as distinguished by the IDC initialization
function pointer), the core idc_ops function points (just stubs for
now), the reserved RDMA MSIX table, and various other info the core RDMA
auxiliary driver will need. It is then plugged on to the bus.

During a function level reset or driver unload, the device will be
unplugged from the bus and destroyed.

Reviewed-by: Madhu Chittim <madhu.chittim@intel.com>
Signed-off-by: Joshua Hay <joshua.a.hay@intel.com>
Signed-off-by: Tatyana Nikolova <tatyana.e.nikolova@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/idpf/Makefile        |   1 +
 drivers/net/ethernet/intel/idpf/idpf.h          |  12 ++
 drivers/net/ethernet/intel/idpf/idpf_dev.c      |  13 ++
 drivers/net/ethernet/intel/idpf/idpf_idc.c      | 227 ++++++++++++++++++++++++
 drivers/net/ethernet/intel/idpf/idpf_lib.c      |   4 +
 drivers/net/ethernet/intel/idpf/idpf_vf_dev.c   |  13 ++
 drivers/net/ethernet/intel/idpf/idpf_virtchnl.c |  21 +++
 drivers/net/ethernet/intel/idpf/idpf_virtchnl.h |   3 +
 include/linux/net/intel/iidc_rdma_idpf.h        |  28 +++
 9 files changed, 322 insertions(+)
 create mode 100644 drivers/net/ethernet/intel/idpf/idpf_idc.c
 create mode 100644 include/linux/net/intel/iidc_rdma_idpf.h

(limited to 'include')

diff --git a/drivers/net/ethernet/intel/idpf/Makefile b/drivers/net/ethernet/intel/idpf/Makefile
index 83ac5e296382..4ef4b2b5e37a 100644
--- a/drivers/net/ethernet/intel/idpf/Makefile
+++ b/drivers/net/ethernet/intel/idpf/Makefile
@@ -10,6 +10,7 @@ idpf-y := \
 	idpf_controlq_setup.o	\
 	idpf_dev.o		\
 	idpf_ethtool.o		\
+	idpf_idc.o		\
 	idpf_lib.o		\
 	idpf_main.o		\
 	idpf_txrx.o		\
diff --git a/drivers/net/ethernet/intel/idpf/idpf.h b/drivers/net/ethernet/intel/idpf/idpf.h
index d9f06764aba0..dd2aa515a31b 100644
--- a/drivers/net/ethernet/intel/idpf/idpf.h
+++ b/drivers/net/ethernet/intel/idpf/idpf.h
@@ -18,6 +18,9 @@ struct idpf_vport_max_q;
 #include <linux/ethtool_netlink.h>
 #include <net/gro.h>
 
+#include <linux/net/intel/iidc_rdma.h>
+#include <linux/net/intel/iidc_rdma_idpf.h>
+
 #include "virtchnl2.h"
 #include "idpf_txrx.h"
 #include "idpf_controlq.h"
@@ -206,9 +209,12 @@ struct idpf_reg_ops {
 /**
  * struct idpf_dev_ops - Device specific operations
  * @reg_ops: Register operations
+ * @idc_init: IDC initialization
  */
 struct idpf_dev_ops {
 	struct idpf_reg_ops reg_ops;
+
+	int (*idc_init)(struct idpf_adapter *adapter);
 };
 
 /**
@@ -540,6 +546,7 @@ struct idpf_vc_xn_manager;
  * @caps: Negotiated capabilities with device
  * @vcxn_mngr: Virtchnl transaction manager
  * @dev_ops: See idpf_dev_ops
+ * @cdev_info: IDC core device info pointer
  * @num_vfs: Number of allocated VFs through sysfs. PF does not directly talk
  *	     to VFs but is used to initialize them
  * @crc_enable: Enable CRC insertion offload
@@ -599,6 +606,7 @@ struct idpf_adapter {
 	struct idpf_vc_xn_manager *vcxn_mngr;
 
 	struct idpf_dev_ops dev_ops;
+	struct iidc_rdma_core_dev_info *cdev_info;
 	int num_vfs;
 	bool crc_enable;
 	bool req_tx_splitq;
@@ -877,5 +885,9 @@ int idpf_sriov_configure(struct pci_dev *pdev, int num_vfs);
 
 u8 idpf_vport_get_hsplit(const struct idpf_vport *vport);
 bool idpf_vport_set_hsplit(const struct idpf_vport *vport, u8 val);
+int idpf_idc_init(struct idpf_adapter *adapter);
+int idpf_idc_init_aux_core_dev(struct idpf_adapter *adapter,
+			       enum iidc_function_type ftype);
+void idpf_idc_deinit_core_aux_device(struct iidc_rdma_core_dev_info *cdev_info);
 
 #endif /* !_IDPF_H_ */
diff --git a/drivers/net/ethernet/intel/idpf/idpf_dev.c b/drivers/net/ethernet/intel/idpf/idpf_dev.c
index 3fae81f1f988..dd227a4368fb 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_dev.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_dev.c
@@ -161,6 +161,17 @@ static void idpf_ptp_reg_init(const struct idpf_adapter *adapter)
 	adapter->ptp->cmd.exec_cmd_mask = PF_GLTSYN_CMD_SYNC_EXEC_CMD_M;
 }
 
+/**
+ * idpf_idc_register - register for IDC callbacks
+ * @adapter: Driver specific private structure
+ *
+ * Return: 0 on success or error code on failure.
+ */
+static int idpf_idc_register(struct idpf_adapter *adapter)
+{
+	return idpf_idc_init_aux_core_dev(adapter, IIDC_FUNCTION_TYPE_PF);
+}
+
 /**
  * idpf_reg_ops_init - Initialize register API function pointers
  * @adapter: Driver specific private structure
@@ -182,4 +193,6 @@ static void idpf_reg_ops_init(struct idpf_adapter *adapter)
 void idpf_dev_ops_init(struct idpf_adapter *adapter)
 {
 	idpf_reg_ops_init(adapter);
+
+	adapter->dev_ops.idc_init = idpf_idc_register;
 }
diff --git a/drivers/net/ethernet/intel/idpf/idpf_idc.c b/drivers/net/ethernet/intel/idpf/idpf_idc.c
new file mode 100644
index 000000000000..bc90699f22c5
--- /dev/null
+++ b/drivers/net/ethernet/intel/idpf/idpf_idc.c
@@ -0,0 +1,227 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2025 Intel Corporation */
+
+#include <linux/export.h>
+
+#include "idpf.h"
+#include "idpf_virtchnl.h"
+
+static DEFINE_IDA(idpf_idc_ida);
+
+#define IDPF_IDC_MAX_ADEV_NAME_LEN	15
+
+/**
+ * idpf_idc_init - Called to initialize IDC
+ * @adapter: driver private data structure
+ *
+ * Return: 0 on success or cap not enabled, error code on failure.
+ */
+int idpf_idc_init(struct idpf_adapter *adapter)
+{
+	int err;
+
+	if (!idpf_is_rdma_cap_ena(adapter) ||
+	    !adapter->dev_ops.idc_init)
+		return 0;
+
+	err = adapter->dev_ops.idc_init(adapter);
+	if (err)
+		dev_err(&adapter->pdev->dev, "failed to initialize idc: %d\n",
+			err);
+
+	return err;
+}
+
+/**
+ * idpf_core_adev_release - function to be mapped to aux dev's release op
+ * @dev: pointer to device to free
+ */
+static void idpf_core_adev_release(struct device *dev)
+{
+	struct iidc_rdma_core_auxiliary_dev *iadev;
+
+	iadev = container_of(dev, struct iidc_rdma_core_auxiliary_dev, adev.dev);
+	kfree(iadev);
+	iadev = NULL;
+}
+
+/**
+ * idpf_plug_core_aux_dev - allocate and register an Auxiliary device
+ * @cdev_info: IDC core device info pointer
+ *
+ * Return: 0 on success or error code on failure.
+ */
+static int idpf_plug_core_aux_dev(struct iidc_rdma_core_dev_info *cdev_info)
+{
+	struct iidc_rdma_core_auxiliary_dev *iadev;
+	char name[IDPF_IDC_MAX_ADEV_NAME_LEN];
+	struct auxiliary_device *adev;
+	int ret;
+
+	iadev = kzalloc(sizeof(*iadev), GFP_KERNEL);
+	if (!iadev)
+		return -ENOMEM;
+
+	adev = &iadev->adev;
+	cdev_info->adev = adev;
+	iadev->cdev_info = cdev_info;
+
+	ret = ida_alloc(&idpf_idc_ida, GFP_KERNEL);
+	if (ret < 0) {
+		pr_err("failed to allocate unique device ID for Auxiliary driver\n");
+		goto err_ida_alloc;
+	}
+	adev->id = ret;
+	adev->dev.release = idpf_core_adev_release;
+	adev->dev.parent = &cdev_info->pdev->dev;
+	sprintf(name, "%04x.rdma.core", cdev_info->pdev->vendor);
+	adev->name = name;
+
+	ret = auxiliary_device_init(adev);
+	if (ret)
+		goto err_aux_dev_init;
+
+	ret = auxiliary_device_add(adev);
+	if (ret)
+		goto err_aux_dev_add;
+
+	return 0;
+
+err_aux_dev_add:
+	auxiliary_device_uninit(adev);
+err_aux_dev_init:
+	ida_free(&idpf_idc_ida, adev->id);
+err_ida_alloc:
+	cdev_info->adev = NULL;
+	kfree(iadev);
+
+	return ret;
+}
+
+/**
+ * idpf_unplug_aux_dev - unregister and free an Auxiliary device
+ * @adev: auxiliary device struct
+ */
+static void idpf_unplug_aux_dev(struct auxiliary_device *adev)
+{
+	auxiliary_device_delete(adev);
+	auxiliary_device_uninit(adev);
+
+	ida_free(&idpf_idc_ida, adev->id);
+}
+
+/**
+ * idpf_idc_vport_dev_ctrl - Called by an Auxiliary Driver
+ * @cdev_info: IDC core device info pointer
+ * @up: RDMA core driver status
+ *
+ * This callback function is accessed by an Auxiliary Driver to indicate
+ * whether core driver is ready to support vport driver load or if vport
+ * drivers need to be taken down.
+ *
+ * Return: 0 on success or error code on failure.
+ */
+int idpf_idc_vport_dev_ctrl(struct iidc_rdma_core_dev_info *cdev_info, bool up)
+{
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(idpf_idc_vport_dev_ctrl);
+
+/**
+ * idpf_idc_request_reset - Called by an Auxiliary Driver
+ * @cdev_info: IDC core device info pointer
+ * @reset_type: function, core or other
+ *
+ * This callback function is accessed by an Auxiliary Driver to request a reset
+ * on the Auxiliary Device.
+ *
+ * Return: 0 on success or error code on failure.
+ */
+int idpf_idc_request_reset(struct iidc_rdma_core_dev_info *cdev_info,
+			   enum iidc_rdma_reset_type __always_unused reset_type)
+{
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(idpf_idc_request_reset);
+
+/**
+ * idpf_idc_init_msix_data - initialize MSIX data for the cdev_info structure
+ * @adapter: driver private data structure
+ */
+static void
+idpf_idc_init_msix_data(struct idpf_adapter *adapter)
+{
+	struct iidc_rdma_core_dev_info *cdev_info;
+	struct iidc_rdma_priv_dev_info *privd;
+
+	if (!adapter->rdma_msix_entries)
+		return;
+
+	cdev_info = adapter->cdev_info;
+	privd = cdev_info->iidc_priv;
+
+	privd->msix_entries = adapter->rdma_msix_entries;
+	privd->msix_count = adapter->num_rdma_msix_entries;
+}
+
+/**
+ * idpf_idc_init_aux_core_dev - initialize Auxiliary Device(s)
+ * @adapter: driver private data structure
+ * @ftype: PF or VF
+ *
+ * Return: 0 on success or error code on failure.
+ */
+int idpf_idc_init_aux_core_dev(struct idpf_adapter *adapter,
+			       enum iidc_function_type ftype)
+{
+	struct iidc_rdma_core_dev_info *cdev_info;
+	struct iidc_rdma_priv_dev_info *privd;
+	int err;
+
+	adapter->cdev_info = kzalloc(sizeof(*cdev_info), GFP_KERNEL);
+	if (!adapter->cdev_info)
+		return -ENOMEM;
+	cdev_info = adapter->cdev_info;
+
+	privd = kzalloc(sizeof(*privd), GFP_KERNEL);
+	if (!privd) {
+		err = -ENOMEM;
+		goto err_privd_alloc;
+	}
+
+	cdev_info->iidc_priv = privd;
+	cdev_info->pdev = adapter->pdev;
+	cdev_info->rdma_protocol = IIDC_RDMA_PROTOCOL_ROCEV2;
+	privd->ftype = ftype;
+
+	idpf_idc_init_msix_data(adapter);
+
+	err = idpf_plug_core_aux_dev(cdev_info);
+	if (err)
+		goto err_plug_aux_dev;
+
+	return 0;
+
+err_plug_aux_dev:
+	kfree(privd);
+err_privd_alloc:
+	kfree(cdev_info);
+	adapter->cdev_info = NULL;
+
+	return err;
+}
+
+/**
+ * idpf_idc_deinit_core_aux_device - de-initialize Auxiliary Device(s)
+ * @cdev_info: IDC core device info pointer
+ */
+void idpf_idc_deinit_core_aux_device(struct iidc_rdma_core_dev_info *cdev_info)
+{
+	if (!cdev_info)
+		return;
+
+	idpf_unplug_aux_dev(cdev_info->adev);
+
+	kfree(cdev_info->iidc_priv);
+	kfree(cdev_info);
+}
diff --git a/drivers/net/ethernet/intel/idpf/idpf_lib.c b/drivers/net/ethernet/intel/idpf/idpf_lib.c
index 7dcb3a7bbc35..b9e04ea2cbd4 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_lib.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_lib.c
@@ -1834,6 +1834,10 @@ static int idpf_init_hard_reset(struct idpf_adapter *adapter)
 unlock_mutex:
 	mutex_unlock(&adapter->vport_ctrl_lock);
 
+	/* Wait until all vports are created to init RDMA CORE AUX */
+	if (!err)
+		err = idpf_idc_init(adapter);
+
 	return err;
 }
 
diff --git a/drivers/net/ethernet/intel/idpf/idpf_vf_dev.c b/drivers/net/ethernet/intel/idpf/idpf_vf_dev.c
index aba828abcb17..2f84bd596ae4 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_vf_dev.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_vf_dev.c
@@ -147,6 +147,17 @@ static void idpf_vf_trigger_reset(struct idpf_adapter *adapter,
 		idpf_send_mb_msg(adapter, VIRTCHNL2_OP_RESET_VF, 0, NULL, 0);
 }
 
+/**
+ * idpf_idc_vf_register - register for IDC callbacks
+ * @adapter: Driver specific private structure
+ *
+ * Return: 0 on success or error code on failure.
+ */
+static int idpf_idc_vf_register(struct idpf_adapter *adapter)
+{
+	return idpf_idc_init_aux_core_dev(adapter, IIDC_FUNCTION_TYPE_VF);
+}
+
 /**
  * idpf_vf_reg_ops_init - Initialize register API function pointers
  * @adapter: Driver specific private structure
@@ -167,4 +178,6 @@ static void idpf_vf_reg_ops_init(struct idpf_adapter *adapter)
 void idpf_vf_dev_ops_init(struct idpf_adapter *adapter)
 {
 	idpf_vf_reg_ops_init(adapter);
+
+	adapter->dev_ops.idc_init = idpf_idc_vf_register;
 }
diff --git a/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c b/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c
index 24febaaa8fbb..f7e105c67baf 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /* Copyright (C) 2023 Intel Corporation */
 
+#include <linux/export.h>
 #include <net/libeth/rx.h>
 
 #include "idpf.h"
@@ -868,6 +869,7 @@ static int idpf_send_get_caps_msg(struct idpf_adapter *adapter)
 
 	caps.other_caps =
 		cpu_to_le64(VIRTCHNL2_CAP_SRIOV			|
+			    VIRTCHNL2_CAP_RDMA                  |
 			    VIRTCHNL2_CAP_MACFILTER		|
 			    VIRTCHNL2_CAP_SPLITQ_QSCHED		|
 			    VIRTCHNL2_CAP_PROMISC		|
@@ -3070,6 +3072,7 @@ void idpf_vc_core_deinit(struct idpf_adapter *adapter)
 
 	idpf_ptp_release(adapter);
 	idpf_deinit_task(adapter);
+	idpf_idc_deinit_core_aux_device(adapter->cdev_info);
 	idpf_intr_rel(adapter);
 
 	if (remove_in_prog)
@@ -3728,3 +3731,21 @@ int idpf_set_promiscuous(struct idpf_adapter *adapter,
 
 	return reply_sz < 0 ? reply_sz : 0;
 }
+
+/**
+ * idpf_idc_rdma_vc_send_sync - virtchnl send callback for IDC registered drivers
+ * @cdev_info: IDC core device info pointer
+ * @send_msg: message to send
+ * @msg_size: size of message to send
+ * @recv_msg: message to populate on reception of response
+ * @recv_len: length of message copied into recv_msg or 0 on error
+ *
+ * Return: 0 on success or error code on failure.
+ */
+int idpf_idc_rdma_vc_send_sync(struct iidc_rdma_core_dev_info *cdev_info,
+			       u8 *send_msg, u16 msg_size,
+			       u8 *recv_msg, u16 *recv_len)
+{
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(idpf_idc_rdma_vc_send_sync);
diff --git a/drivers/net/ethernet/intel/idpf/idpf_virtchnl.h b/drivers/net/ethernet/intel/idpf/idpf_virtchnl.h
index 77578206bada..7bae09483aed 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_virtchnl.h
+++ b/drivers/net/ethernet/intel/idpf/idpf_virtchnl.h
@@ -151,5 +151,8 @@ int idpf_send_set_sriov_vfs_msg(struct idpf_adapter *adapter, u16 num_vfs);
 int idpf_send_get_set_rss_key_msg(struct idpf_vport *vport, bool get);
 int idpf_send_get_set_rss_lut_msg(struct idpf_vport *vport, bool get);
 void idpf_vc_xn_shutdown(struct idpf_vc_xn_manager *vcxn_mngr);
+int idpf_idc_rdma_vc_send_sync(struct iidc_rdma_core_dev_info *cdev_info,
+			       u8 *send_msg, u16 msg_size,
+			       u8 *recv_msg, u16 *recv_len);
 
 #endif /* _IDPF_VIRTCHNL_H_ */
diff --git a/include/linux/net/intel/iidc_rdma_idpf.h b/include/linux/net/intel/iidc_rdma_idpf.h
new file mode 100644
index 000000000000..f2fe1844f660
--- /dev/null
+++ b/include/linux/net/intel/iidc_rdma_idpf.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2025 Intel Corporation. */
+
+#ifndef _IIDC_RDMA_IDPF_H_
+#define _IIDC_RDMA_IDPF_H_
+
+#include <linux/auxiliary_bus.h>
+
+/* struct to be populated by core LAN PCI driver */
+enum iidc_function_type {
+	IIDC_FUNCTION_TYPE_PF,
+	IIDC_FUNCTION_TYPE_VF,
+};
+
+struct iidc_rdma_priv_dev_info {
+	struct msix_entry *msix_entries;
+	u16 msix_count; /* How many vectors are reserved for this device */
+	enum iidc_function_type ftype;
+};
+
+int idpf_idc_vport_dev_ctrl(struct iidc_rdma_core_dev_info *cdev_info, bool up);
+int idpf_idc_request_reset(struct iidc_rdma_core_dev_info *cdev_info,
+			   enum iidc_rdma_reset_type __always_unused reset_type);
+int idpf_idc_rdma_vc_send_sync(struct iidc_rdma_core_dev_info *cdev_info,
+			       u8 *send_msg, u16 msg_size,
+			       u8 *recv_msg, u16 *recv_len);
+
+#endif /* _IIDC_RDMA_IDPF_H_ */
-- 
cgit v1.2.3


From be91128c579c86d295da4325f6ac4710e4e6d2b4 Mon Sep 17 00:00:00 2001
From: Joshua Hay <joshua.a.hay@intel.com>
Date: Tue, 8 Jul 2025 16:05:51 -0500
Subject: idpf: implement RDMA vport auxiliary dev create, init, and destroy

Implement the functions to create, initialize, and destroy an RDMA vport
auxiliary device. The vport aux dev creation depends on the core aux
device calling idpf_idc_vport_dev_ctrl() to signal that it is ready for
vport aux devices. Implement that core callback to either create and
initialize the vport aux devs or deinitialize them.

RDMA vport aux dev creation is also dependent on the control plane to
tell us the vport is RDMA enabled. Add a flag in the create vport
message to signal individual vport RDMA capabilities.

Reviewed-by: Madhu Chittim <madhu.chittim@intel.com>
Signed-off-by: Joshua Hay <joshua.a.hay@intel.com>
Signed-off-by: Tatyana Nikolova <tatyana.e.nikolova@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/idpf/idpf.h      |   4 +
 drivers/net/ethernet/intel/idpf/idpf_idc.c  | 180 +++++++++++++++++++++++++++-
 drivers/net/ethernet/intel/idpf/idpf_lib.c  |   2 +
 drivers/net/ethernet/intel/idpf/virtchnl2.h |   3 +
 include/linux/net/intel/iidc_rdma_idpf.h    |  19 +++
 5 files changed, 207 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/intel/idpf/idpf.h b/drivers/net/ethernet/intel/idpf/idpf.h
index dd2aa515a31b..7103cf551bb8 100644
--- a/drivers/net/ethernet/intel/idpf/idpf.h
+++ b/drivers/net/ethernet/intel/idpf/idpf.h
@@ -281,6 +281,7 @@ struct idpf_port_stats {
  *	      group will yield total number of RX queues.
  * @rxq_model: Splitq queue or single queue queuing model
  * @rx_ptype_lkup: Lookup table for ptypes on RX
+ * @vdev_info: IDC vport device info pointer
  * @adapter: back pointer to associated adapter
  * @netdev: Associated net_device. Each vport should have one and only one
  *	    associated netdev.
@@ -326,6 +327,8 @@ struct idpf_vport {
 	u32 rxq_model;
 	struct libeth_rx_pt *rx_ptype_lkup;
 
+	struct iidc_rdma_vport_dev_info *vdev_info;
+
 	struct idpf_adapter *adapter;
 	struct net_device *netdev;
 	DECLARE_BITMAP(flags, IDPF_VPORT_FLAGS_NBITS);
@@ -889,5 +892,6 @@ int idpf_idc_init(struct idpf_adapter *adapter);
 int idpf_idc_init_aux_core_dev(struct idpf_adapter *adapter,
 			       enum iidc_function_type ftype);
 void idpf_idc_deinit_core_aux_device(struct iidc_rdma_core_dev_info *cdev_info);
+void idpf_idc_deinit_vport_aux_device(struct iidc_rdma_vport_dev_info *vdev_info);
 
 #endif /* !_IDPF_H_ */
diff --git a/drivers/net/ethernet/intel/idpf/idpf_idc.c b/drivers/net/ethernet/intel/idpf/idpf_idc.c
index bc90699f22c5..237dfe1ac06d 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_idc.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_idc.c
@@ -32,6 +32,115 @@ int idpf_idc_init(struct idpf_adapter *adapter)
 	return err;
 }
 
+/**
+ * idpf_vport_adev_release - function to be mapped to aux dev's release op
+ * @dev: pointer to device to free
+ */
+static void idpf_vport_adev_release(struct device *dev)
+{
+	struct iidc_rdma_vport_auxiliary_dev *iadev;
+
+	iadev = container_of(dev, struct iidc_rdma_vport_auxiliary_dev, adev.dev);
+	kfree(iadev);
+	iadev = NULL;
+}
+
+/**
+ * idpf_plug_vport_aux_dev - allocate and register a vport Auxiliary device
+ * @cdev_info: IDC core device info pointer
+ * @vdev_info: IDC vport device info pointer
+ *
+ * Return: 0 on success or error code on failure.
+ */
+static int idpf_plug_vport_aux_dev(struct iidc_rdma_core_dev_info *cdev_info,
+				   struct iidc_rdma_vport_dev_info *vdev_info)
+{
+	struct iidc_rdma_vport_auxiliary_dev *iadev;
+	char name[IDPF_IDC_MAX_ADEV_NAME_LEN];
+	struct auxiliary_device *adev;
+	int ret;
+
+	iadev = kzalloc(sizeof(*iadev), GFP_KERNEL);
+	if (!iadev)
+		return -ENOMEM;
+
+	adev = &iadev->adev;
+	vdev_info->adev = &iadev->adev;
+	iadev->vdev_info = vdev_info;
+
+	ret = ida_alloc(&idpf_idc_ida, GFP_KERNEL);
+	if (ret < 0) {
+		pr_err("failed to allocate unique device ID for Auxiliary driver\n");
+		goto err_ida_alloc;
+	}
+	adev->id = ret;
+	adev->dev.release = idpf_vport_adev_release;
+	adev->dev.parent = &cdev_info->pdev->dev;
+	sprintf(name, "%04x.rdma.vdev", cdev_info->pdev->vendor);
+	adev->name = name;
+
+	ret = auxiliary_device_init(adev);
+	if (ret)
+		goto err_aux_dev_init;
+
+	ret = auxiliary_device_add(adev);
+	if (ret)
+		goto err_aux_dev_add;
+
+	return 0;
+
+err_aux_dev_add:
+	auxiliary_device_uninit(adev);
+err_aux_dev_init:
+	ida_free(&idpf_idc_ida, adev->id);
+err_ida_alloc:
+	vdev_info->adev = NULL;
+	kfree(iadev);
+
+	return ret;
+}
+
+/**
+ * idpf_idc_init_aux_vport_dev - initialize vport Auxiliary Device(s)
+ * @vport: virtual port data struct
+ *
+ * Return: 0 on success or error code on failure.
+ */
+static int idpf_idc_init_aux_vport_dev(struct idpf_vport *vport)
+{
+	struct idpf_adapter *adapter = vport->adapter;
+	struct iidc_rdma_vport_dev_info *vdev_info;
+	struct iidc_rdma_core_dev_info *cdev_info;
+	struct virtchnl2_create_vport *vport_msg;
+	int err;
+
+	vport_msg = (struct virtchnl2_create_vport *)
+			adapter->vport_params_recvd[vport->idx];
+
+	if (!(le16_to_cpu(vport_msg->vport_flags) & VIRTCHNL2_VPORT_ENABLE_RDMA))
+		return 0;
+
+	vport->vdev_info = kzalloc(sizeof(*vdev_info), GFP_KERNEL);
+	if (!vport->vdev_info)
+		return -ENOMEM;
+
+	cdev_info = vport->adapter->cdev_info;
+
+	vdev_info = vport->vdev_info;
+	vdev_info->vport_id = vport->vport_id;
+	vdev_info->netdev = vport->netdev;
+	vdev_info->core_adev = cdev_info->adev;
+
+	err = idpf_plug_vport_aux_dev(cdev_info, vdev_info);
+	if (err) {
+		vport->vdev_info = NULL;
+		kfree(vdev_info);
+		return err;
+	}
+
+	return 0;
+}
+
 /**
  * idpf_core_adev_release - function to be mapped to aux dev's release op
  * @dev: pointer to device to free
@@ -104,12 +213,60 @@ err_ida_alloc:
  */
 static void idpf_unplug_aux_dev(struct auxiliary_device *adev)
 {
+	if (!adev)
+		return;
+
 	auxiliary_device_delete(adev);
 	auxiliary_device_uninit(adev);
 
 	ida_free(&idpf_idc_ida, adev->id);
 }
 
+/**
+ * idpf_idc_vport_dev_up - called when CORE is ready for vport aux devs
+ * @adapter: private data struct
+ *
+ * Return: 0 on success or error code on failure.
+ */
+static int idpf_idc_vport_dev_up(struct idpf_adapter *adapter)
+{
+	int i, err = 0;
+
+	for (i = 0; i < adapter->num_alloc_vports; i++) {
+		struct idpf_vport *vport = adapter->vports[i];
+
+		if (!vport)
+			continue;
+
+		if (!vport->vdev_info)
+			err = idpf_idc_init_aux_vport_dev(vport);
+		else
+			err = idpf_plug_vport_aux_dev(vport->adapter->cdev_info,
+						      vport->vdev_info);
+	}
+
+	return err;
+}
+
+/**
+ * idpf_idc_vport_dev_down - called CORE is leaving vport aux dev support state
+ * @adapter: private data struct
+ */
+static void idpf_idc_vport_dev_down(struct idpf_adapter *adapter)
+{
+	int i;
+
+	for (i = 0; i < adapter->num_alloc_vports; i++) {
+		struct idpf_vport *vport = adapter->vports[i];
+
+		if (!vport)
+			continue;
+
+		idpf_unplug_aux_dev(vport->vdev_info->adev);
+		vport->vdev_info->adev = NULL;
+	}
+}
+
 /**
  * idpf_idc_vport_dev_ctrl - Called by an Auxiliary Driver
  * @cdev_info: IDC core device info pointer
@@ -123,7 +280,14 @@ static void idpf_unplug_aux_dev(struct auxiliary_device *adev)
  */
 int idpf_idc_vport_dev_ctrl(struct iidc_rdma_core_dev_info *cdev_info, bool up)
 {
-	return -EOPNOTSUPP;
+	struct idpf_adapter *adapter = pci_get_drvdata(cdev_info->pdev);
+
+	if (up)
+		return idpf_idc_vport_dev_up(adapter);
+
+	idpf_idc_vport_dev_down(adapter);
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(idpf_idc_vport_dev_ctrl);
 
@@ -225,3 +389,17 @@ void idpf_idc_deinit_core_aux_device(struct iidc_rdma_core_dev_info *cdev_info)
 	kfree(cdev_info->iidc_priv);
 	kfree(cdev_info);
 }
+
+/**
+ * idpf_idc_deinit_vport_aux_device - de-initialize Auxiliary Device(s)
+ * @vdev_info: IDC vport device info pointer
+ */
+void idpf_idc_deinit_vport_aux_device(struct iidc_rdma_vport_dev_info *vdev_info)
+{
+	if (!vdev_info)
+		return;
+
+	idpf_unplug_aux_dev(vdev_info->adev);
+
+	kfree(vdev_info);
+}
diff --git a/drivers/net/ethernet/intel/idpf/idpf_lib.c b/drivers/net/ethernet/intel/idpf/idpf_lib.c
index b9e04ea2cbd4..30a7beb23155 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_lib.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_lib.c
@@ -1021,6 +1021,8 @@ static void idpf_vport_dealloc(struct idpf_vport *vport)
 	struct idpf_adapter *adapter = vport->adapter;
 	unsigned int i = vport->idx;
 
+	idpf_idc_deinit_vport_aux_device(vport->vdev_info);
+
 	idpf_deinit_mac_addr(vport);
 	idpf_vport_stop(vport);
 
diff --git a/drivers/net/ethernet/intel/idpf/virtchnl2.h b/drivers/net/ethernet/intel/idpf/virtchnl2.h
index a2881979c7f8..82a3c307307e 100644
--- a/drivers/net/ethernet/intel/idpf/virtchnl2.h
+++ b/drivers/net/ethernet/intel/idpf/virtchnl2.h
@@ -575,9 +575,12 @@ VIRTCHNL2_CHECK_STRUCT_LEN(8, virtchnl2_queue_reg_chunks);
 /**
  * enum virtchnl2_vport_flags - Vport flags that indicate vport capabilities.
  * @VIRTCHNL2_VPORT_UPLINK_PORT: Representatives of underlying physical ports
+ * @VIRTCHNL2_VPORT_ENABLE_RDMA: RDMA is enabled for this vport
  */
 enum virtchnl2_vport_flags {
 	VIRTCHNL2_VPORT_UPLINK_PORT	= BIT(0),
+	/* VIRTCHNL2_VPORT_* bits [1:3] rsvd */
+	VIRTCHNL2_VPORT_ENABLE_RDMA             = BIT(4),
 };
 
 /**
diff --git a/include/linux/net/intel/iidc_rdma_idpf.h b/include/linux/net/intel/iidc_rdma_idpf.h
index f2fe1844f660..16c970dd4c6e 100644
--- a/include/linux/net/intel/iidc_rdma_idpf.h
+++ b/include/linux/net/intel/iidc_rdma_idpf.h
@@ -6,6 +6,25 @@
 
 #include <linux/auxiliary_bus.h>
 
+/* struct to be populated by core LAN PCI driver */
+struct iidc_rdma_vport_dev_info {
+	struct auxiliary_device *adev;
+	struct auxiliary_device *core_adev;
+	struct net_device *netdev;
+	u16 vport_id;
+};
+
+struct iidc_rdma_vport_auxiliary_dev {
+	struct auxiliary_device adev;
+	struct iidc_rdma_vport_dev_info *vdev_info;
+};
+
+struct iidc_rdma_vport_auxiliary_drv {
+	struct auxiliary_driver adrv;
+	void (*event_handler)(struct iidc_rdma_vport_dev_info *vdev,
+			      struct iidc_rdma_event *event);
+};
+
 /* struct to be populated by core LAN PCI driver */
 enum iidc_function_type {
 	IIDC_FUNCTION_TYPE_PF,
-- 
cgit v1.2.3


From 6aa53e861c1a0c042690c9b7c5c153088ae61079 Mon Sep 17 00:00:00 2001
From: Joshua Hay <joshua.a.hay@intel.com>
Date: Tue, 8 Jul 2025 16:05:54 -0500
Subject: idpf: implement get LAN MMIO memory regions

The RDMA driver needs to map its own MMIO regions for the sake of
performance, meaning the IDPF needs to avoid mapping portions of the BAR
space. However, to be HW agnostic, the IDPF cannot assume where
these are and must avoid mapping hard coded regions as much as possible.

The IDPF maps the bare minimum to load and communicate with the
control plane, i.e., the mailbox registers and the reset state
registers. Because of how and when mailbox register offsets are
initialized, it is easier to adjust the existing defines to be relative
to the mailbox region starting address. Use a specific mailbox register
write function that uses these relative offsets. The reset state
register addresses are calculated the same way as for other registers,
described below.

The IDPF then calls a new virtchnl op to fetch a list of MMIO regions
that it should map. The addresses for the registers in these regions are
calculated by determining what region the register resides in, adjusting
the offset to be relative to that region, and then adding the
register's offset to that region's mapped address.

If the new virtchnl op is not supported, the IDPF will fall back to
mapping the whole BAR. However, it will still map the space outside the
mailbox and reset state registers as separate regions. This way we can
use the same logic in both cases to access the MMIO space.

Reviewed-by: Madhu Chittim <madhu.chittim@intel.com>
Signed-off-by: Joshua Hay <joshua.a.hay@intel.com>
Signed-off-by: Tatyana Nikolova <tatyana.e.nikolova@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/idpf/idpf.h          |  69 ++++++++++-
 drivers/net/ethernet/intel/idpf/idpf_controlq.c |  14 +--
 drivers/net/ethernet/intel/idpf/idpf_controlq.h |  18 ++-
 drivers/net/ethernet/intel/idpf/idpf_dev.c      |  36 +++---
 drivers/net/ethernet/intel/idpf/idpf_idc.c      |  32 ++++-
 drivers/net/ethernet/intel/idpf/idpf_main.c     |  32 ++++-
 drivers/net/ethernet/intel/idpf/idpf_mem.h      |   8 +-
 drivers/net/ethernet/intel/idpf/idpf_vf_dev.c   |  32 +++--
 drivers/net/ethernet/intel/idpf/idpf_virtchnl.c | 149 +++++++++++++++++++++++-
 drivers/net/ethernet/intel/idpf/virtchnl2.h     |  30 ++++-
 include/linux/net/intel/iidc_rdma_idpf.h        |   8 ++
 11 files changed, 376 insertions(+), 52 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/intel/idpf/idpf.h b/drivers/net/ethernet/intel/idpf/idpf.h
index 79379d6db015..0cf9120d1f97 100644
--- a/drivers/net/ethernet/intel/idpf/idpf.h
+++ b/drivers/net/ethernet/intel/idpf/idpf.h
@@ -12,6 +12,7 @@ struct idpf_vport_max_q;
 #include <net/pkt_sched.h>
 #include <linux/aer.h>
 #include <linux/etherdevice.h>
+#include <linux/ioport.h>
 #include <linux/pci.h>
 #include <linux/bitfield.h>
 #include <linux/sctp.h>
@@ -197,7 +198,8 @@ struct idpf_vport_max_q {
  * @ptp_reg_init: PTP register initialization
  */
 struct idpf_reg_ops {
-	void (*ctlq_reg_init)(struct idpf_ctlq_create_info *cq);
+	void (*ctlq_reg_init)(struct idpf_adapter *adapter,
+			      struct idpf_ctlq_create_info *cq);
 	int (*intr_reg_init)(struct idpf_vport *vport);
 	void (*mb_intr_reg_init)(struct idpf_adapter *adapter);
 	void (*reset_reg_init)(struct idpf_adapter *adapter);
@@ -206,15 +208,25 @@ struct idpf_reg_ops {
 	void (*ptp_reg_init)(const struct idpf_adapter *adapter);
 };
 
+#define IDPF_MMIO_REG_NUM_STATIC	2
+#define IDPF_PF_MBX_REGION_SZ		4096
+#define IDPF_PF_RSTAT_REGION_SZ		2048
+#define IDPF_VF_MBX_REGION_SZ		10240
+#define IDPF_VF_RSTAT_REGION_SZ		2048
+
 /**
  * struct idpf_dev_ops - Device specific operations
  * @reg_ops: Register operations
  * @idc_init: IDC initialization
+ * @static_reg_info: array of mailbox and rstat register info
  */
 struct idpf_dev_ops {
 	struct idpf_reg_ops reg_ops;
 
 	int (*idc_init)(struct idpf_adapter *adapter);
+
+	/* static_reg_info[0] is mailbox region, static_reg_info[1] is rstat */
+	struct resource static_reg_info[IDPF_MMIO_REG_NUM_STATIC];
 };
 
 /**
@@ -755,6 +767,34 @@ static inline u8 idpf_get_min_tx_pkt_len(struct idpf_adapter *adapter)
 	return pkt_len ? pkt_len : IDPF_TX_MIN_PKT_LEN;
 }
 
+/**
+ * idpf_get_mbx_reg_addr - Get BAR0 mailbox register address
+ * @adapter: private data struct
+ * @reg_offset: register offset value
+ *
+ * Return: BAR0 mailbox register address based on register offset.
+ */
+static inline void __iomem *idpf_get_mbx_reg_addr(struct idpf_adapter *adapter,
+						  resource_size_t reg_offset)
+{
+	return adapter->hw.mbx.vaddr + reg_offset;
+}
+
+/**
+ * idpf_get_rstat_reg_addr - Get BAR0 rstat register address
+ * @adapter: private data struct
+ * @reg_offset: register offset value
+ *
+ * Return: BAR0 rstat register address based on register offset.
+ */
+static inline void __iomem *idpf_get_rstat_reg_addr(struct idpf_adapter *adapter,
+						    resource_size_t reg_offset)
+{
+	reg_offset -= adapter->dev_ops.static_reg_info[1].start;
+
+	return adapter->hw.rstat.vaddr + reg_offset;
+}
+
 /**
  * idpf_get_reg_addr - Get BAR0 register address
  * @adapter: private data struct
@@ -765,7 +805,30 @@ static inline u8 idpf_get_min_tx_pkt_len(struct idpf_adapter *adapter)
 static inline void __iomem *idpf_get_reg_addr(struct idpf_adapter *adapter,
 					      resource_size_t reg_offset)
 {
-	return (void __iomem *)(adapter->hw.hw_addr + reg_offset);
+	struct idpf_hw *hw = &adapter->hw;
+
+	for (int i = 0; i < hw->num_lan_regs; i++) {
+		struct idpf_mmio_reg *region = &hw->lan_regs[i];
+
+		if (reg_offset >= region->addr_start &&
+		    reg_offset < (region->addr_start + region->addr_len)) {
+			/* Convert the offset so that it is relative to the
+			 * start of the region.  Then add the base address of
+			 * the region to get the final address.
+			 */
+			reg_offset -= region->addr_start;
+
+			return region->vaddr + reg_offset;
+		}
+	}
+
+	/* It's impossible to hit this case with offsets from the CP. But if we
+	 * do for any other reason, the kernel will panic on that register
+	 * access. Might as well do it here to make it clear what's happening.
+	 */
+	BUG();
+
+	return NULL;
 }
 
 /**
@@ -779,7 +842,7 @@ static inline bool idpf_is_reset_detected(struct idpf_adapter *adapter)
 	if (!adapter->hw.arq)
 		return true;
 
-	return !(readl(idpf_get_reg_addr(adapter, adapter->hw.arq->reg.len)) &
+	return !(readl(idpf_get_mbx_reg_addr(adapter, adapter->hw.arq->reg.len)) &
 		 adapter->hw.arq->reg.len_mask);
 }
 
diff --git a/drivers/net/ethernet/intel/idpf/idpf_controlq.c b/drivers/net/ethernet/intel/idpf/idpf_controlq.c
index b28991dd1870..9c5c628eb469 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_controlq.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_controlq.c
@@ -36,19 +36,19 @@ static void idpf_ctlq_init_regs(struct idpf_hw *hw, struct idpf_ctlq_info *cq,
 {
 	/* Update tail to post pre-allocated buffers for rx queues */
 	if (is_rxq)
-		wr32(hw, cq->reg.tail, (u32)(cq->ring_size - 1));
+		idpf_mbx_wr32(hw, cq->reg.tail, (u32)(cq->ring_size - 1));
 
 	/* For non-Mailbox control queues only TAIL need to be set */
 	if (cq->q_id != -1)
 		return;
 
 	/* Clear Head for both send or receive */
-	wr32(hw, cq->reg.head, 0);
+	idpf_mbx_wr32(hw, cq->reg.head, 0);
 
 	/* set starting point */
-	wr32(hw, cq->reg.bal, lower_32_bits(cq->desc_ring.pa));
-	wr32(hw, cq->reg.bah, upper_32_bits(cq->desc_ring.pa));
-	wr32(hw, cq->reg.len, (cq->ring_size | cq->reg.len_ena_mask));
+	idpf_mbx_wr32(hw, cq->reg.bal, lower_32_bits(cq->desc_ring.pa));
+	idpf_mbx_wr32(hw, cq->reg.bah, upper_32_bits(cq->desc_ring.pa));
+	idpf_mbx_wr32(hw, cq->reg.len, (cq->ring_size | cq->reg.len_ena_mask));
 }
 
 /**
@@ -329,7 +329,7 @@ int idpf_ctlq_send(struct idpf_hw *hw, struct idpf_ctlq_info *cq,
 	 */
 	dma_wmb();
 
-	wr32(hw, cq->reg.tail, cq->next_to_use);
+	idpf_mbx_wr32(hw, cq->reg.tail, cq->next_to_use);
 
 err_unlock:
 	mutex_unlock(&cq->cq_lock);
@@ -521,7 +521,7 @@ post_buffs_out:
 
 		dma_wmb();
 
-		wr32(hw, cq->reg.tail, cq->next_to_post);
+		idpf_mbx_wr32(hw, cq->reg.tail, cq->next_to_post);
 	}
 
 	mutex_unlock(&cq->cq_lock);
diff --git a/drivers/net/ethernet/intel/idpf/idpf_controlq.h b/drivers/net/ethernet/intel/idpf/idpf_controlq.h
index c1aba09e9856..de4ece40c2ff 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_controlq.h
+++ b/drivers/net/ethernet/intel/idpf/idpf_controlq.h
@@ -94,12 +94,26 @@ struct idpf_mbxq_desc {
 	u32 pf_vf_id;		/* used by CP when sending to PF */
 };
 
+/* Max number of MMIO regions not including the mailbox and rstat regions in
+ * the fallback case when the whole BAR is mapped.
+ */
+#define IDPF_MMIO_MAP_FALLBACK_MAX_REMAINING		3
+
+struct idpf_mmio_reg {
+	void __iomem *vaddr;
+	resource_size_t addr_start;
+	resource_size_t addr_len;
+};
+
 /* Define the driver hardware struct to replace other control structs as needed
  * Align to ctlq_hw_info
  */
 struct idpf_hw {
-	void __iomem *hw_addr;
-	resource_size_t hw_addr_len;
+	struct idpf_mmio_reg mbx;
+	struct idpf_mmio_reg rstat;
+	/* Array of remaining LAN BAR regions */
+	int num_lan_regs;
+	struct idpf_mmio_reg *lan_regs;
 
 	struct idpf_adapter *back;
 
diff --git a/drivers/net/ethernet/intel/idpf/idpf_dev.c b/drivers/net/ethernet/intel/idpf/idpf_dev.c
index dd227a4368fb..bfa60f7d43de 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_dev.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_dev.c
@@ -10,10 +10,13 @@
 
 /**
  * idpf_ctlq_reg_init - initialize default mailbox registers
+ * @adapter: adapter structure
  * @cq: pointer to the array of create control queues
  */
-static void idpf_ctlq_reg_init(struct idpf_ctlq_create_info *cq)
+static void idpf_ctlq_reg_init(struct idpf_adapter *adapter,
+			       struct idpf_ctlq_create_info *cq)
 {
+	resource_size_t mbx_start = adapter->dev_ops.static_reg_info[0].start;
 	int i;
 
 	for (i = 0; i < IDPF_NUM_DFLT_MBX_Q; i++) {
@@ -22,22 +25,22 @@ static void idpf_ctlq_reg_init(struct idpf_ctlq_create_info *cq)
 		switch (ccq->type) {
 		case IDPF_CTLQ_TYPE_MAILBOX_TX:
 			/* set head and tail registers in our local struct */
-			ccq->reg.head = PF_FW_ATQH;
-			ccq->reg.tail = PF_FW_ATQT;
-			ccq->reg.len = PF_FW_ATQLEN;
-			ccq->reg.bah = PF_FW_ATQBAH;
-			ccq->reg.bal = PF_FW_ATQBAL;
+			ccq->reg.head = PF_FW_ATQH - mbx_start;
+			ccq->reg.tail = PF_FW_ATQT - mbx_start;
+			ccq->reg.len = PF_FW_ATQLEN - mbx_start;
+			ccq->reg.bah = PF_FW_ATQBAH - mbx_start;
+			ccq->reg.bal = PF_FW_ATQBAL - mbx_start;
 			ccq->reg.len_mask = PF_FW_ATQLEN_ATQLEN_M;
 			ccq->reg.len_ena_mask = PF_FW_ATQLEN_ATQENABLE_M;
 			ccq->reg.head_mask = PF_FW_ATQH_ATQH_M;
 			break;
 		case IDPF_CTLQ_TYPE_MAILBOX_RX:
 			/* set head and tail registers in our local struct */
-			ccq->reg.head = PF_FW_ARQH;
-			ccq->reg.tail = PF_FW_ARQT;
-			ccq->reg.len = PF_FW_ARQLEN;
-			ccq->reg.bah = PF_FW_ARQBAH;
-			ccq->reg.bal = PF_FW_ARQBAL;
+			ccq->reg.head = PF_FW_ARQH - mbx_start;
+			ccq->reg.tail = PF_FW_ARQT - mbx_start;
+			ccq->reg.len = PF_FW_ARQLEN - mbx_start;
+			ccq->reg.bah = PF_FW_ARQBAH - mbx_start;
+			ccq->reg.bal = PF_FW_ARQBAL - mbx_start;
 			ccq->reg.len_mask = PF_FW_ARQLEN_ARQLEN_M;
 			ccq->reg.len_ena_mask = PF_FW_ARQLEN_ARQENABLE_M;
 			ccq->reg.head_mask = PF_FW_ARQH_ARQH_M;
@@ -130,7 +133,7 @@ free_reg_vals:
  */
 static void idpf_reset_reg_init(struct idpf_adapter *adapter)
 {
-	adapter->reset_reg.rstat = idpf_get_reg_addr(adapter, PFGEN_RSTAT);
+	adapter->reset_reg.rstat = idpf_get_rstat_reg_addr(adapter, PFGEN_RSTAT);
 	adapter->reset_reg.rstat_m = PFGEN_RSTAT_PFR_STATE_M;
 }
 
@@ -144,9 +147,9 @@ static void idpf_trigger_reset(struct idpf_adapter *adapter,
 {
 	u32 reset_reg;
 
-	reset_reg = readl(idpf_get_reg_addr(adapter, PFGEN_CTRL));
+	reset_reg = readl(idpf_get_rstat_reg_addr(adapter, PFGEN_CTRL));
 	writel(reset_reg | PFGEN_CTRL_PFSWR,
-	       idpf_get_reg_addr(adapter, PFGEN_CTRL));
+	       idpf_get_rstat_reg_addr(adapter, PFGEN_CTRL));
 }
 
 /**
@@ -195,4 +198,9 @@ void idpf_dev_ops_init(struct idpf_adapter *adapter)
 	idpf_reg_ops_init(adapter);
 
 	adapter->dev_ops.idc_init = idpf_idc_register;
+
+	resource_set_range(&adapter->dev_ops.static_reg_info[0],
+			   PF_FW_BASE, IDPF_PF_MBX_REGION_SZ);
+	resource_set_range(&adapter->dev_ops.static_reg_info[1],
+			   PFGEN_RTRIG, IDPF_PF_RSTAT_REGION_SZ);
 }
diff --git a/drivers/net/ethernet/intel/idpf/idpf_idc.c b/drivers/net/ethernet/intel/idpf/idpf_idc.c
index 2443337c83de..4d2905103215 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_idc.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_idc.c
@@ -412,7 +412,7 @@ int idpf_idc_init_aux_core_dev(struct idpf_adapter *adapter,
 {
 	struct iidc_rdma_core_dev_info *cdev_info;
 	struct iidc_rdma_priv_dev_info *privd;
-	int err;
+	int err, i;
 
 	adapter->cdev_info = kzalloc(sizeof(*cdev_info), GFP_KERNEL);
 	if (!adapter->cdev_info)
@@ -430,14 +430,36 @@ int idpf_idc_init_aux_core_dev(struct idpf_adapter *adapter,
 	cdev_info->rdma_protocol = IIDC_RDMA_PROTOCOL_ROCEV2;
 	privd->ftype = ftype;
 
+	privd->mapped_mem_regions =
+		kcalloc(adapter->hw.num_lan_regs,
+			sizeof(struct iidc_rdma_lan_mapped_mem_region),
+			GFP_KERNEL);
+	if (!privd->mapped_mem_regions) {
+		err = -ENOMEM;
+		goto err_plug_aux_dev;
+	}
+
+	privd->num_memory_regions = cpu_to_le16(adapter->hw.num_lan_regs);
+	for (i = 0; i < adapter->hw.num_lan_regs; i++) {
+		privd->mapped_mem_regions[i].region_addr =
+			adapter->hw.lan_regs[i].vaddr;
+		privd->mapped_mem_regions[i].size =
+			cpu_to_le64(adapter->hw.lan_regs[i].addr_len);
+		privd->mapped_mem_regions[i].start_offset =
+			cpu_to_le64(adapter->hw.lan_regs[i].addr_start);
+	}
+
 	idpf_idc_init_msix_data(adapter);
 
 	err = idpf_plug_core_aux_dev(cdev_info);
 	if (err)
-		goto err_plug_aux_dev;
+		goto err_free_mem_regions;
 
 	return 0;
 
+err_free_mem_regions:
+	kfree(privd->mapped_mem_regions);
+	privd->mapped_mem_regions = NULL;
 err_plug_aux_dev:
 	kfree(privd);
 err_privd_alloc:
@@ -453,12 +475,16 @@ err_privd_alloc:
  */
 void idpf_idc_deinit_core_aux_device(struct iidc_rdma_core_dev_info *cdev_info)
 {
+	struct iidc_rdma_priv_dev_info *privd;
+
 	if (!cdev_info)
 		return;
 
 	idpf_unplug_aux_dev(cdev_info->adev);
 
-	kfree(cdev_info->iidc_priv);
+	privd = cdev_info->iidc_priv;
+	kfree(privd->mapped_mem_regions);
+	kfree(privd);
 	kfree(cdev_info);
 }
 
diff --git a/drivers/net/ethernet/intel/idpf/idpf_main.c b/drivers/net/ethernet/intel/idpf/idpf_main.c
index 0efd9c0c7a90..b7422be3e967 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_main.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_main.c
@@ -106,15 +106,37 @@ static void idpf_shutdown(struct pci_dev *pdev)
  */
 static int idpf_cfg_hw(struct idpf_adapter *adapter)
 {
+	resource_size_t res_start, mbx_start, rstat_start;
 	struct pci_dev *pdev = adapter->pdev;
 	struct idpf_hw *hw = &adapter->hw;
+	struct device *dev = &pdev->dev;
+	long len;
+
+	res_start = pci_resource_start(pdev, 0);
+
+	/* Map mailbox space for virtchnl communication */
+	mbx_start = res_start + adapter->dev_ops.static_reg_info[0].start;
+	len = resource_size(&adapter->dev_ops.static_reg_info[0]);
+	hw->mbx.vaddr = devm_ioremap(dev, mbx_start, len);
+	if (!hw->mbx.vaddr) {
+		pci_err(pdev, "failed to allocate BAR0 mbx region\n");
+
+		return -ENOMEM;
+	}
+	hw->mbx.addr_start = adapter->dev_ops.static_reg_info[0].start;
+	hw->mbx.addr_len = len;
 
-	hw->hw_addr = pcim_iomap_table(pdev)[0];
-	if (!hw->hw_addr) {
-		pci_err(pdev, "failed to allocate PCI iomap table\n");
+	/* Map rstat space for resets */
+	rstat_start = res_start + adapter->dev_ops.static_reg_info[1].start;
+	len = resource_size(&adapter->dev_ops.static_reg_info[1]);
+	hw->rstat.vaddr = devm_ioremap(dev, rstat_start, len);
+	if (!hw->rstat.vaddr) {
+		pci_err(pdev, "failed to allocate BAR0 rstat region\n");
 
 		return -ENOMEM;
 	}
+	hw->rstat.addr_start = adapter->dev_ops.static_reg_info[1].start;
+	hw->rstat.addr_len = len;
 
 	hw->back = adapter;
 
@@ -161,9 +183,9 @@ static int idpf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (err)
 		goto err_free;
 
-	err = pcim_iomap_regions(pdev, BIT(0), pci_name(pdev));
+	err = pcim_request_region(pdev, 0, pci_name(pdev));
 	if (err) {
-		pci_err(pdev, "pcim_iomap_regions failed %pe\n", ERR_PTR(err));
+		pci_err(pdev, "pcim_request_region failed %pe\n", ERR_PTR(err));
 
 		goto err_free;
 	}
diff --git a/drivers/net/ethernet/intel/idpf/idpf_mem.h b/drivers/net/ethernet/intel/idpf/idpf_mem.h
index b21a04fccf0f..2aaabdc02dd2 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_mem.h
+++ b/drivers/net/ethernet/intel/idpf/idpf_mem.h
@@ -12,9 +12,9 @@ struct idpf_dma_mem {
 	size_t size;
 };
 
-#define wr32(a, reg, value)	writel((value), ((a)->hw_addr + (reg)))
-#define rd32(a, reg)		readl((a)->hw_addr + (reg))
-#define wr64(a, reg, value)	writeq((value), ((a)->hw_addr + (reg)))
-#define rd64(a, reg)		readq((a)->hw_addr + (reg))
+#define idpf_mbx_wr32(a, reg, value)	writel((value), ((a)->mbx.vaddr + (reg)))
+#define idpf_mbx_rd32(a, reg)		readl((a)->mbx.vaddr + (reg))
+#define idpf_mbx_wr64(a, reg, value)	writeq((value), ((a)->mbx.vaddr + (reg)))
+#define idpf_mbx_rd64(a, reg)		readq((a)->mbx.vaddr + (reg))
 
 #endif /* _IDPF_MEM_H_ */
diff --git a/drivers/net/ethernet/intel/idpf/idpf_vf_dev.c b/drivers/net/ethernet/intel/idpf/idpf_vf_dev.c
index 2f84bd596ae4..259d50fded67 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_vf_dev.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_vf_dev.c
@@ -9,10 +9,13 @@
 
 /**
  * idpf_vf_ctlq_reg_init - initialize default mailbox registers
+ * @adapter: adapter structure
  * @cq: pointer to the array of create control queues
  */
-static void idpf_vf_ctlq_reg_init(struct idpf_ctlq_create_info *cq)
+static void idpf_vf_ctlq_reg_init(struct idpf_adapter *adapter,
+				  struct idpf_ctlq_create_info *cq)
 {
+	resource_size_t mbx_start = adapter->dev_ops.static_reg_info[0].start;
 	int i;
 
 	for (i = 0; i < IDPF_NUM_DFLT_MBX_Q; i++) {
@@ -21,22 +24,22 @@ static void idpf_vf_ctlq_reg_init(struct idpf_ctlq_create_info *cq)
 		switch (ccq->type) {
 		case IDPF_CTLQ_TYPE_MAILBOX_TX:
 			/* set head and tail registers in our local struct */
-			ccq->reg.head = VF_ATQH;
-			ccq->reg.tail = VF_ATQT;
-			ccq->reg.len = VF_ATQLEN;
-			ccq->reg.bah = VF_ATQBAH;
-			ccq->reg.bal = VF_ATQBAL;
+			ccq->reg.head = VF_ATQH - mbx_start;
+			ccq->reg.tail = VF_ATQT - mbx_start;
+			ccq->reg.len = VF_ATQLEN - mbx_start;
+			ccq->reg.bah = VF_ATQBAH - mbx_start;
+			ccq->reg.bal = VF_ATQBAL - mbx_start;
 			ccq->reg.len_mask = VF_ATQLEN_ATQLEN_M;
 			ccq->reg.len_ena_mask = VF_ATQLEN_ATQENABLE_M;
 			ccq->reg.head_mask = VF_ATQH_ATQH_M;
 			break;
 		case IDPF_CTLQ_TYPE_MAILBOX_RX:
 			/* set head and tail registers in our local struct */
-			ccq->reg.head = VF_ARQH;
-			ccq->reg.tail = VF_ARQT;
-			ccq->reg.len = VF_ARQLEN;
-			ccq->reg.bah = VF_ARQBAH;
-			ccq->reg.bal = VF_ARQBAL;
+			ccq->reg.head = VF_ARQH - mbx_start;
+			ccq->reg.tail = VF_ARQT - mbx_start;
+			ccq->reg.len = VF_ARQLEN - mbx_start;
+			ccq->reg.bah = VF_ARQBAH - mbx_start;
+			ccq->reg.bal = VF_ARQBAL - mbx_start;
 			ccq->reg.len_mask = VF_ARQLEN_ARQLEN_M;
 			ccq->reg.len_ena_mask = VF_ARQLEN_ARQENABLE_M;
 			ccq->reg.head_mask = VF_ARQH_ARQH_M;
@@ -129,7 +132,7 @@ free_reg_vals:
  */
 static void idpf_vf_reset_reg_init(struct idpf_adapter *adapter)
 {
-	adapter->reset_reg.rstat = idpf_get_reg_addr(adapter, VFGEN_RSTAT);
+	adapter->reset_reg.rstat = idpf_get_rstat_reg_addr(adapter, VFGEN_RSTAT);
 	adapter->reset_reg.rstat_m = VFGEN_RSTAT_VFR_STATE_M;
 }
 
@@ -180,4 +183,9 @@ void idpf_vf_dev_ops_init(struct idpf_adapter *adapter)
 	idpf_vf_reg_ops_init(adapter);
 
 	adapter->dev_ops.idc_init = idpf_idc_vf_register;
+
+	resource_set_range(&adapter->dev_ops.static_reg_info[0],
+			   VF_BASE, IDPF_VF_MBX_REGION_SZ);
+	resource_set_range(&adapter->dev_ops.static_reg_info[1],
+			   VFGEN_RSTAT, IDPF_VF_RSTAT_REGION_SZ);
 }
diff --git a/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c b/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c
index 957b3b77700a..0d2199ac5c3e 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c
@@ -870,6 +870,7 @@ static int idpf_send_get_caps_msg(struct idpf_adapter *adapter)
 	caps.other_caps =
 		cpu_to_le64(VIRTCHNL2_CAP_SRIOV			|
 			    VIRTCHNL2_CAP_RDMA                  |
+			    VIRTCHNL2_CAP_LAN_MEMORY_REGIONS	|
 			    VIRTCHNL2_CAP_MACFILTER		|
 			    VIRTCHNL2_CAP_SPLITQ_QSCHED		|
 			    VIRTCHNL2_CAP_PROMISC		|
@@ -892,6 +893,128 @@ static int idpf_send_get_caps_msg(struct idpf_adapter *adapter)
 	return 0;
 }
 
+/**
+ * idpf_send_get_lan_memory_regions - Send virtchnl get LAN memory regions msg
+ * @adapter: Driver specific private struct
+ *
+ * Return: 0 on success or error code on failure.
+ */
+static int idpf_send_get_lan_memory_regions(struct idpf_adapter *adapter)
+{
+	struct virtchnl2_get_lan_memory_regions *rcvd_regions __free(kfree) = NULL;
+	struct idpf_vc_xn_params xn_params = {
+		.vc_op = VIRTCHNL2_OP_GET_LAN_MEMORY_REGIONS,
+		.recv_buf.iov_len = IDPF_CTLQ_MAX_BUF_LEN,
+		.timeout_ms = IDPF_VC_XN_DEFAULT_TIMEOUT_MSEC,
+	};
+	int num_regions, size;
+	struct idpf_hw *hw;
+	ssize_t reply_sz;
+
+	rcvd_regions = kzalloc(IDPF_CTLQ_MAX_BUF_LEN, GFP_KERNEL);
+	if (!rcvd_regions)
+		return -ENOMEM;
+
+	xn_params.recv_buf.iov_base = rcvd_regions;
+	reply_sz = idpf_vc_xn_exec(adapter, &xn_params);
+	if (reply_sz < 0)
+		return reply_sz;
+
+	/* The reply must be long enough to hold every region it advertises */
+	num_regions = le16_to_cpu(rcvd_regions->num_memory_regions);
+	size = struct_size(rcvd_regions, mem_reg, num_regions);
+	if (reply_sz < size)
+		return -EIO;
+
+	if (size > IDPF_CTLQ_MAX_BUF_LEN)
+		return -EINVAL;
+
+	hw = &adapter->hw;
+	hw->lan_regs = kcalloc(num_regions, sizeof(*hw->lan_regs), GFP_KERNEL);
+	if (!hw->lan_regs)
+		return -ENOMEM;
+
+	for (int i = 0; i < num_regions; i++) {
+		hw->lan_regs[i].addr_len =
+			le64_to_cpu(rcvd_regions->mem_reg[i].size);
+		hw->lan_regs[i].addr_start =
+			le64_to_cpu(rcvd_regions->mem_reg[i].start_offset);
+	}
+	hw->num_lan_regs = num_regions;
+
+	return 0;
+}
+
+/**
+ * idpf_calc_remaining_mmio_regs - calculate MMIO regions outside mbx and rstat
+ * @adapter: Driver specific private structure
+ *
+ * Called when idpf_send_get_lan_memory_regions is not supported. This will
+ * calculate the offsets and sizes for the regions before, in between, and
+ * after the mailbox and rstat MMIO mappings.
+ *
+ * Return: 0 on success or error code on failure.
+ */
+static int idpf_calc_remaining_mmio_regs(struct idpf_adapter *adapter)
+{
+	struct resource *rstat_reg = &adapter->dev_ops.static_reg_info[1];
+	struct resource *mbx_reg = &adapter->dev_ops.static_reg_info[0];
+	struct idpf_hw *hw = &adapter->hw;
+
+	hw->num_lan_regs = IDPF_MMIO_MAP_FALLBACK_MAX_REMAINING;
+	hw->lan_regs = kcalloc(hw->num_lan_regs, sizeof(*hw->lan_regs),
+			       GFP_KERNEL);
+	if (!hw->lan_regs)
+		return -ENOMEM;
+
+	/* Region preceding mailbox */
+	hw->lan_regs[0].addr_start = 0;
+	hw->lan_regs[0].addr_len = mbx_reg->start;
+	/* Region between mailbox and rstat */
+	hw->lan_regs[1].addr_start = mbx_reg->end + 1;
+	hw->lan_regs[1].addr_len = rstat_reg->start -
+					hw->lan_regs[1].addr_start;
+	/* Region after rstat */
+	hw->lan_regs[2].addr_start = rstat_reg->end + 1;
+	hw->lan_regs[2].addr_len = pci_resource_len(adapter->pdev, 0) -
+					hw->lan_regs[2].addr_start;
+
+	return 0;
+}
+
+/**
+ * idpf_map_lan_mmio_regs - map remaining LAN BAR regions
+ * @adapter: Driver specific private structure
+ *
+ * Return: 0 on success or error code on failure.
+ */
+static int idpf_map_lan_mmio_regs(struct idpf_adapter *adapter)
+{
+	struct pci_dev *pdev = adapter->pdev;
+	struct idpf_hw *hw = &adapter->hw;
+	resource_size_t res_start;
+
+	res_start = pci_resource_start(pdev, 0);
+
+	for (int i = 0; i < hw->num_lan_regs; i++) {
+		resource_size_t start, len;
+
+		len = hw->lan_regs[i].addr_len;
+		/* A zero-length region has nothing to map; vaddr stays unset */
+		if (!len)
+			continue;
+		start = hw->lan_regs[i].addr_start + res_start;
+
+		hw->lan_regs[i].vaddr = devm_ioremap(&pdev->dev, start, len);
+		if (!hw->lan_regs[i].vaddr) {
+			pci_err(pdev, "failed to allocate BAR0 region\n");
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+}
+
 /**
  * idpf_vport_alloc_max_qs - Allocate max queues for a vport
  * @adapter: Driver specific private structure
@@ -2803,7 +2926,7 @@ int idpf_init_dflt_mbx(struct idpf_adapter *adapter)
 	struct idpf_hw *hw = &adapter->hw;
 	int err;
 
-	adapter->dev_ops.reg_ops.ctlq_reg_init(ctlq_info);
+	adapter->dev_ops.reg_ops.ctlq_reg_init(adapter, ctlq_info);
 
 	err = idpf_ctlq_init(hw, IDPF_NUM_DFLT_MBX_Q, ctlq_info);
 	if (err)
@@ -2963,6 +3086,30 @@ restart:
 		msleep(task_delay);
 	}
 
+	if (idpf_is_cap_ena(adapter, IDPF_OTHER_CAPS, VIRTCHNL2_CAP_LAN_MEMORY_REGIONS)) {
+		err = idpf_send_get_lan_memory_regions(adapter);
+		if (err) {
+			dev_err(&adapter->pdev->dev, "Failed to get LAN memory regions: %d\n",
+				err);
+			return -EINVAL;
+		}
+	} else {
+		/* Fallback to mapping the remaining regions of the entire BAR */
+		err = idpf_calc_remaining_mmio_regs(adapter);
+		if (err) {
+			dev_err(&adapter->pdev->dev, "Failed to allocate BAR0 region(s): %d\n",
+				err);
+			return -ENOMEM;
+		}
+	}
+
+	err = idpf_map_lan_mmio_regs(adapter);
+	if (err) {
+		dev_err(&adapter->pdev->dev, "Failed to map BAR0 region(s): %d\n",
+			err);
+		return -ENOMEM;
+	}
+
 	pci_sriov_set_totalvfs(adapter->pdev, idpf_get_max_vfs(adapter));
 	num_max_vports = idpf_get_max_vports(adapter);
 	adapter->max_vports = num_max_vports;
diff --git a/drivers/net/ethernet/intel/idpf/virtchnl2.h b/drivers/net/ethernet/intel/idpf/virtchnl2.h
index b82218d20909..48d3cc9236a4 100644
--- a/drivers/net/ethernet/intel/idpf/virtchnl2.h
+++ b/drivers/net/ethernet/intel/idpf/virtchnl2.h
@@ -79,6 +79,7 @@ enum virtchnl2_op {
 	VIRTCHNL2_OP_PTP_ADJ_DEV_CLK_FINE		= 546,
 	VIRTCHNL2_OP_PTP_ADJ_DEV_CLK_TIME		= 547,
 	VIRTCHNL2_OP_PTP_GET_VPORT_TX_TSTAMP_CAPS	= 548,
+	VIRTCHNL2_OP_GET_LAN_MEMORY_REGIONS		= 549,
 };
 
 /**
@@ -212,7 +213,8 @@ enum virtchnl2_cap_other {
 	VIRTCHNL2_CAP_RX_FLEX_DESC		= BIT_ULL(17),
 	VIRTCHNL2_CAP_PTYPE			= BIT_ULL(18),
 	VIRTCHNL2_CAP_LOOPBACK			= BIT_ULL(19),
-	/* Other capability 20 is reserved */
+	/* Other capabilities 20-21 are reserved */
+	VIRTCHNL2_CAP_LAN_MEMORY_REGIONS	= BIT_ULL(22),
 
 	/* this must be the last capability */
 	VIRTCHNL2_CAP_OEM			= BIT_ULL(63),
@@ -1587,4 +1589,30 @@ struct virtchnl2_ptp_adj_dev_clk_time {
 };
 VIRTCHNL2_CHECK_STRUCT_LEN(8, virtchnl2_ptp_adj_dev_clk_time);
 
+/**
+ * struct virtchnl2_mem_region - MMIO memory region
+ * @start_offset: starting offset of the MMIO memory region
+ * @size: size of the MMIO memory region
+ */
+struct virtchnl2_mem_region {
+	__le64 start_offset;
+	__le64 size;
+};
+VIRTCHNL2_CHECK_STRUCT_LEN(16, virtchnl2_mem_region);
+
+/**
+ * struct virtchnl2_get_lan_memory_regions - List of LAN MMIO memory regions
+ * @num_memory_regions: number of memory regions
+ * @pad: Padding
+ * @mem_reg: List with memory region info
+ *
+ * PF/VF sends this message to learn what LAN MMIO memory regions it should map.
+ */
+struct virtchnl2_get_lan_memory_regions {
+	__le16 num_memory_regions;
+	u8 pad[6];
+	struct virtchnl2_mem_region mem_reg[];
+};
+VIRTCHNL2_CHECK_STRUCT_LEN(8, virtchnl2_get_lan_memory_regions);
+
 #endif /* _VIRTCHNL_2_H_ */
diff --git a/include/linux/net/intel/iidc_rdma_idpf.h b/include/linux/net/intel/iidc_rdma_idpf.h
index 16c970dd4c6e..bab697e18fd6 100644
--- a/include/linux/net/intel/iidc_rdma_idpf.h
+++ b/include/linux/net/intel/iidc_rdma_idpf.h
@@ -31,10 +31,18 @@ enum iidc_function_type {
 	IIDC_FUNCTION_TYPE_VF,
 };
 
+struct iidc_rdma_lan_mapped_mem_region {
+	u8 __iomem *region_addr;
+	__le64 size;
+	__le64 start_offset;
+};
+
 struct iidc_rdma_priv_dev_info {
 	struct msix_entry *msix_entries;
 	u16 msix_count; /* How many vectors are reserved for this device */
 	enum iidc_function_type ftype;
+	__le16 num_memory_regions;
+	struct iidc_rdma_lan_mapped_mem_region *mapped_mem_regions;
 };
 
 int idpf_idc_vport_dev_ctrl(struct iidc_rdma_core_dev_info *cdev_info, bool up);
-- 
cgit v1.2.3


From c76ed8790b3018fe36647d9aae96e0373f321184 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Sat, 12 Jul 2025 16:23:05 -0700
Subject: crypto: sha1 - Remove sha1_base.h

sha1_base.h is no longer used, so remove it.

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250712232329.818226-15-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 include/crypto/sha1_base.h | 82 ----------------------------------------------
 1 file changed, 82 deletions(-)
 delete mode 100644 include/crypto/sha1_base.h

(limited to 'include')

diff --git a/include/crypto/sha1_base.h b/include/crypto/sha1_base.h
deleted file mode 100644
index 62701d136c79..000000000000
--- a/include/crypto/sha1_base.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * sha1_base.h - core logic for SHA-1 implementations
- *
- * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#ifndef _CRYPTO_SHA1_BASE_H
-#define _CRYPTO_SHA1_BASE_H
-
-#include <crypto/internal/hash.h>
-#include <crypto/sha1.h>
-#include <linux/math.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/unaligned.h>
-
-typedef void (sha1_block_fn)(struct sha1_state *sst, u8 const *src, int blocks);
-
-static inline int sha1_base_init(struct shash_desc *desc)
-{
-	struct sha1_state *sctx = shash_desc_ctx(desc);
-
-	sctx->state[0] = SHA1_H0;
-	sctx->state[1] = SHA1_H1;
-	sctx->state[2] = SHA1_H2;
-	sctx->state[3] = SHA1_H3;
-	sctx->state[4] = SHA1_H4;
-	sctx->count = 0;
-
-	return 0;
-}
-
-static inline int sha1_base_do_update_blocks(struct shash_desc *desc,
-					     const u8 *data,
-					     unsigned int len,
-					     sha1_block_fn *block_fn)
-{
-	unsigned int remain = len - round_down(len, SHA1_BLOCK_SIZE);
-	struct sha1_state *sctx = shash_desc_ctx(desc);
-
-	sctx->count += len - remain;
-	block_fn(sctx, data, len / SHA1_BLOCK_SIZE);
-	return remain;
-}
-
-static inline int sha1_base_do_finup(struct shash_desc *desc,
-				     const u8 *src, unsigned int len,
-				     sha1_block_fn *block_fn)
-{
-	unsigned int bit_offset = SHA1_BLOCK_SIZE / 8 - 1;
-	struct sha1_state *sctx = shash_desc_ctx(desc);
-	union {
-		__be64 b64[SHA1_BLOCK_SIZE / 4];
-		u8 u8[SHA1_BLOCK_SIZE * 2];
-	} block = {};
-
-	if (len >= bit_offset * 8)
-		bit_offset += SHA1_BLOCK_SIZE / 8;
-	memcpy(&block, src, len);
-	block.u8[len] = 0x80;
-	sctx->count += len;
-	block.b64[bit_offset] = cpu_to_be64(sctx->count << 3);
-	block_fn(sctx, block.u8, (bit_offset + 1) * 8 / SHA1_BLOCK_SIZE);
-	memzero_explicit(&block, sizeof(block));
-
-	return 0;
-}
-
-static inline int sha1_base_finish(struct shash_desc *desc, u8 *out)
-{
-	struct sha1_state *sctx = shash_desc_ctx(desc);
-	__be32 *digest = (__be32 *)out;
-	int i;
-
-	for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++)
-		put_unaligned_be32(sctx->state[i], digest++);
-
-	return 0;
-}
-
-#endif /* _CRYPTO_SHA1_BASE_H */
-- 
cgit v1.2.3


From 5a8f77e24a30bbce2fa57926f3dede84894fd10a Mon Sep 17 00:00:00 2001
From: Michał Winiarski <michal.winiarski@intel.com>
Date: Wed, 2 Jul 2025 11:35:18 +0200
Subject: PCI/IOV: Restore VF resizable BAR state after reset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Similar to regular resizable BARs, VF BARs can also be resized, e.g. by the
system firmware or the PCI subsystem itself.

The capability layout is the same as PCI_EXT_CAP_ID_REBAR.

Add the capability ID and restore it as a part of IOV state.

See PCIe r6.2, sec 7.8.7.

Signed-off-by: Michał Winiarski <michal.winiarski@intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Link: https://patch.msgid.link/20250702093522.518099-2-michal.winiarski@intel.com
---
 drivers/pci/iov.c             | 30 +++++++++++++++++++++++++++++-
 drivers/pci/pci.h             | 12 ++++++++++++
 include/uapi/linux/pci_regs.h |  9 +++++++++
 3 files changed, 50 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 10693b5d7eb6..10ccef8afe14 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -7,6 +7,7 @@
  * Copyright (C) 2009 Intel Corporation, Yu Zhao <yu.zhao@intel.com>
  */
 
+#include <linux/bitfield.h>
 #include <linux/pci.h>
 #include <linux/slab.h>
 #include <linux/export.h>
@@ -850,6 +851,7 @@ found:
 	pci_read_config_byte(dev, pos + PCI_SRIOV_FUNC_LINK, &iov->link);
 	if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_END)
 		iov->link = PCI_DEVFN(PCI_SLOT(dev->devfn), iov->link);
+	iov->vf_rebar_cap = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_VF_REBAR);
 
 	if (pdev)
 		iov->dev = pci_dev_get(pdev);
@@ -888,6 +890,30 @@ static void sriov_release(struct pci_dev *dev)
 	dev->sriov = NULL;
 }
 
+static void sriov_restore_vf_rebar_state(struct pci_dev *dev)
+{
+	unsigned int pos, nbars, i;
+	u32 ctrl;
+
+	pos = pci_iov_vf_rebar_cap(dev);
+	if (!pos)
+		return;
+
+	pci_read_config_dword(dev, pos + PCI_VF_REBAR_CTRL, &ctrl);
+	nbars = FIELD_GET(PCI_VF_REBAR_CTRL_NBAR_MASK, ctrl);
+
+	for (i = 0; i < nbars; i++, pos += 8) {
+		int bar_idx, size;
+
+		pci_read_config_dword(dev, pos + PCI_VF_REBAR_CTRL, &ctrl);
+		bar_idx = FIELD_GET(PCI_VF_REBAR_CTRL_BAR_IDX, ctrl);
+		size = pci_rebar_bytes_to_size(dev->sriov->barsz[bar_idx]);
+		ctrl &= ~PCI_VF_REBAR_CTRL_BAR_SIZE;
+		ctrl |= FIELD_PREP(PCI_VF_REBAR_CTRL_BAR_SIZE, size);
+		pci_write_config_dword(dev, pos + PCI_VF_REBAR_CTRL, ctrl);
+	}
+}
+
 static void sriov_restore_state(struct pci_dev *dev)
 {
 	int i;
@@ -1047,8 +1073,10 @@ resource_size_t pci_sriov_resource_alignment(struct pci_dev *dev, int resno)
  */
 void pci_restore_iov_state(struct pci_dev *dev)
 {
-	if (dev->is_physfn)
+	if (dev->is_physfn) {
+		sriov_restore_vf_rebar_state(dev);
 		sriov_restore_state(dev);
+	}
 }
 
 /**
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 12215ee72afb..69258c445b28 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -486,6 +486,7 @@ struct pci_sriov {
 	u16		subsystem_vendor; /* VF subsystem vendor */
 	u16		subsystem_device; /* VF subsystem device */
 	resource_size_t	barsz[PCI_SRIOV_NUM_BARS];	/* VF BAR size */
+	u16		vf_rebar_cap;	/* VF Resizable BAR capability offset */
 	bool		drivers_autoprobe; /* Auto probing of VFs by driver */
 };
 
@@ -710,6 +711,13 @@ void pci_iov_update_resource(struct pci_dev *dev, int resno);
 resource_size_t pci_sriov_resource_alignment(struct pci_dev *dev, int resno);
 void pci_restore_iov_state(struct pci_dev *dev);
 int pci_iov_bus_range(struct pci_bus *bus);
+static inline u16 pci_iov_vf_rebar_cap(struct pci_dev *dev)
+{
+	if (!dev->is_physfn)
+		return 0;
+
+	return dev->sriov->vf_rebar_cap;
+}
 static inline bool pci_resource_is_iov(int resno)
 {
 	return resno >= PCI_IOV_RESOURCES && resno <= PCI_IOV_RESOURCE_END;
@@ -734,6 +742,10 @@ static inline int pci_iov_bus_range(struct pci_bus *bus)
 {
 	return 0;
 }
+static inline u16 pci_iov_vf_rebar_cap(struct pci_dev *dev)
+{
+	return 0;
+}
 static inline bool pci_resource_is_iov(int resno)
 {
 	return false;
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index a3a3e942dedf..f5b17745de60 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -745,6 +745,7 @@
 #define PCI_EXT_CAP_ID_L1SS	0x1E	/* L1 PM Substates */
 #define PCI_EXT_CAP_ID_PTM	0x1F	/* Precision Time Measurement */
 #define PCI_EXT_CAP_ID_DVSEC	0x23	/* Designated Vendor-Specific */
+#define PCI_EXT_CAP_ID_VF_REBAR 0x24	/* VF Resizable BAR */
 #define PCI_EXT_CAP_ID_DLF	0x25	/* Data Link Feature */
 #define PCI_EXT_CAP_ID_PL_16GT	0x26	/* Physical Layer 16.0 GT/s */
 #define PCI_EXT_CAP_ID_NPEM	0x29	/* Native PCIe Enclosure Management */
@@ -1141,6 +1142,14 @@
 #define PCI_DVSEC_HEADER2		0x8 /* Designated Vendor-Specific Header2 */
 #define  PCI_DVSEC_HEADER2_ID(x)		((x) & 0xffff)
 
+/* VF Resizable BARs, same layout as PCI_REBAR */
+#define PCI_VF_REBAR_CAP	PCI_REBAR_CAP
+#define  PCI_VF_REBAR_CAP_SIZES		PCI_REBAR_CAP_SIZES
+#define PCI_VF_REBAR_CTRL	PCI_REBAR_CTRL
+#define  PCI_VF_REBAR_CTRL_BAR_IDX	PCI_REBAR_CTRL_BAR_IDX
+#define  PCI_VF_REBAR_CTRL_NBAR_MASK	PCI_REBAR_CTRL_NBAR_MASK
+#define  PCI_VF_REBAR_CTRL_BAR_SIZE	PCI_REBAR_CTRL_BAR_SIZE
+
 /* Data Link Feature */
 #define PCI_DLF_CAP		0x04	/* Capabilities Register */
 #define  PCI_DLF_EXCHANGE_ENABLE	0x80000000  /* Data Link Feature Exchange Enable */
-- 
cgit v1.2.3


From 84f890414a12b8d1480045b92a5e4e6ac4ab3419 Mon Sep 17 00:00:00 2001
From: Michał Winiarski <michal.winiarski@intel.com>
Date: Wed, 2 Jul 2025 11:35:22 +0200
Subject: PCI/IOV: Allow drivers to control VF BAR size
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Drivers could leverage the fact that the VF BAR MMIO reservation is created
for the total number of VFs supported by the device by resizing the BAR to a
larger size when a smaller number of VFs is enabled.

Add pci_iov_vf_bar_set_size() to control the size and a
pci_iov_vf_bar_get_sizes() helper to get the VF BAR sizes that will allow
up to num_vfs to be successfully enabled with the current underlying
reservation size.

Signed-off-by: Michał Winiarski <michal.winiarski@intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://patch.msgid.link/20250702093522.518099-6-michal.winiarski@intel.com
---
 drivers/pci/iov.c   | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/pci.h |  6 +++++
 2 files changed, 79 insertions(+)

(limited to 'include')

diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index f34173c70b32..ac4375954c94 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -8,11 +8,15 @@
  */
 
 #include <linux/bitfield.h>
+#include <linux/bits.h>
+#include <linux/log2.h>
 #include <linux/pci.h>
+#include <linux/sizes.h>
 #include <linux/slab.h>
 #include <linux/export.h>
 #include <linux/string.h>
 #include <linux/delay.h>
+#include <asm/div64.h>
 #include "pci.h"
 
 #define VIRTFN_ID_LEN	17	/* "virtfn%u\0" for 2^32 - 1 */
@@ -1313,3 +1317,72 @@ int pci_sriov_configure_simple(struct pci_dev *dev, int nr_virtfn)
 	return nr_virtfn;
 }
 EXPORT_SYMBOL_GPL(pci_sriov_configure_simple);
+
+/**
+ * pci_iov_vf_bar_set_size - set a new size for a VF BAR
+ * @dev: the PCI device
+ * @resno: the resource number
+ * @size: new size as defined in the spec (0=1MB, 27=128TB)
+ *
+ * Set the new size of a VF BAR that supports VF resizable BAR capability.
+ * Unlike pci_resize_resource(), this does not cause the resource that
+ * reserves the MMIO space (originally up to total_VFs) to be resized, which
+ * means that following calls to pci_enable_sriov() can fail if the resources
+ * no longer fit.
+ *
+ * Return: 0 on success, or negative on failure.
+ */
+int pci_iov_vf_bar_set_size(struct pci_dev *dev, int resno, int size)
+{
+	u32 sizes;
+	int ret;
+
+	if (!pci_resource_is_iov(resno))
+		return -EINVAL;
+
+	if (pci_iov_is_memory_decoding_enabled(dev))
+		return -EBUSY;
+
+	sizes = pci_rebar_get_possible_sizes(dev, resno);
+	if (!sizes)
+		return -ENOTSUPP;
+
+	if (!(sizes & BIT(size)))
+		return -EINVAL;
+
+	ret = pci_rebar_set_size(dev, resno, size);
+	if (ret)
+		return ret;
+
+	pci_iov_resource_set_size(dev, resno, pci_rebar_size_to_bytes(size));
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pci_iov_vf_bar_set_size);
+
+/**
+ * pci_iov_vf_bar_get_sizes - get VF BAR sizes allowing to create up to num_vfs
+ * @dev: the PCI device
+ * @resno: the resource number
+ * @num_vfs: number of VFs
+ *
+ * Get the sizes of a VF resizable BAR that can accommodate @num_vfs within
+ * the currently assigned size of the resource @resno.
+ *
+ * Return: A bitmask of sizes in the format defined in the spec (bit 0=1MB,
+ * bit 27=128TB).
+ */
+u32 pci_iov_vf_bar_get_sizes(struct pci_dev *dev, int resno, int num_vfs)
+{
+	u64 vf_len = pci_resource_len(dev, resno);
+	u32 sizes;
+
+	if (!num_vfs)
+		return 0;
+
+	do_div(vf_len, num_vfs);
+	sizes = (roundup_pow_of_two(vf_len + 1) - 1) >> ilog2(SZ_1M);
+
+	return sizes & pci_rebar_get_possible_sizes(dev, resno);
+}
+EXPORT_SYMBOL_GPL(pci_iov_vf_bar_get_sizes);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 05e68f35f392..28f06045ab20 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -2438,6 +2438,8 @@ int pci_sriov_set_totalvfs(struct pci_dev *dev, u16 numvfs);
 int pci_sriov_get_totalvfs(struct pci_dev *dev);
 int pci_sriov_configure_simple(struct pci_dev *dev, int nr_virtfn);
 resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno);
+int pci_iov_vf_bar_set_size(struct pci_dev *dev, int resno, int size);
+u32 pci_iov_vf_bar_get_sizes(struct pci_dev *dev, int resno, int num_vfs);
 void pci_vf_drivers_autoprobe(struct pci_dev *dev, bool probe);
 
 /* Arch may override these (weak) */
@@ -2490,6 +2492,10 @@ static inline int pci_sriov_get_totalvfs(struct pci_dev *dev)
 #define pci_sriov_configure_simple	NULL
 static inline resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno)
 { return 0; }
+static inline int pci_iov_vf_bar_set_size(struct pci_dev *dev, int resno, int size)
+{ return -ENODEV; }
+static inline u32 pci_iov_vf_bar_get_sizes(struct pci_dev *dev, int resno, int num_vfs)
+{ return 0; }
 static inline void pci_vf_drivers_autoprobe(struct pci_dev *dev, bool probe) { }
 #endif
 
-- 
cgit v1.2.3


From ce60ab3964782df9ba34f0a64c0bc766dd508bde Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Thu, 29 May 2025 06:45:45 -0400
Subject: Expand the type of nfs_fattr->valid

We need to be able to track more than 32 attributes per inode.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Signed-off-by: Lance Shelton <lance.shelton@hammerspace.com>
Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Link: https://lore.kernel.org/r/1e3405fca54efd0be7c91c1da77917b94f5dfcc4.1748515333.git.bcodding@redhat.com
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/inode.c            |  2 +-
 include/linux/nfs_fs_sb.h |  2 +-
 include/linux/nfs_xdr.h   | 54 +++++++++++++++++++++++------------------------
 3 files changed, 29 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index a2fa6bc4d74e..17f5dcda2a00 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -2209,7 +2209,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	bool attr_changed = false;
 	bool have_delegation;
 
-	dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n",
+	dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%llx)\n",
 			__func__, inode->i_sb->s_id, inode->i_ino,
 			nfs_display_fhandle_hash(NFS_FH(inode)),
 			atomic_read(&inode->i_count), fattr->valid);
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 63141320c2a8..d7895eeccea3 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -172,8 +172,8 @@ struct nfs_server {
 #define NFS_MOUNT_FORCE_RDIRPLUS	0x20000000
 #define NFS_MOUNT_NETUNREACH_FATAL	0x40000000
 
-	unsigned int		fattr_valid;	/* Valid attributes */
 	unsigned int		caps;		/* server capabilities */
+	__u64			fattr_valid;	/* Valid attributes */
 	unsigned int		rsize;		/* read size */
 	unsigned int		rpages;		/* read size (in pages) */
 	unsigned int		wsize;		/* write size */
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 67f6632f723b..9cacbbd14787 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -45,7 +45,7 @@ struct nfs4_threshold {
 };
 
 struct nfs_fattr {
-	unsigned int		valid;		/* which fields are valid */
+	__u64			valid;		/* which fields are valid */
 	umode_t			mode;
 	__u32			nlink;
 	kuid_t			uid;
@@ -80,32 +80,32 @@ struct nfs_fattr {
 	struct nfs4_label	*label;
 };
 
-#define NFS_ATTR_FATTR_TYPE		(1U << 0)
-#define NFS_ATTR_FATTR_MODE		(1U << 1)
-#define NFS_ATTR_FATTR_NLINK		(1U << 2)
-#define NFS_ATTR_FATTR_OWNER		(1U << 3)
-#define NFS_ATTR_FATTR_GROUP		(1U << 4)
-#define NFS_ATTR_FATTR_RDEV		(1U << 5)
-#define NFS_ATTR_FATTR_SIZE		(1U << 6)
-#define NFS_ATTR_FATTR_PRESIZE		(1U << 7)
-#define NFS_ATTR_FATTR_BLOCKS_USED	(1U << 8)
-#define NFS_ATTR_FATTR_SPACE_USED	(1U << 9)
-#define NFS_ATTR_FATTR_FSID		(1U << 10)
-#define NFS_ATTR_FATTR_FILEID		(1U << 11)
-#define NFS_ATTR_FATTR_ATIME		(1U << 12)
-#define NFS_ATTR_FATTR_MTIME		(1U << 13)
-#define NFS_ATTR_FATTR_CTIME		(1U << 14)
-#define NFS_ATTR_FATTR_PREMTIME		(1U << 15)
-#define NFS_ATTR_FATTR_PRECTIME		(1U << 16)
-#define NFS_ATTR_FATTR_CHANGE		(1U << 17)
-#define NFS_ATTR_FATTR_PRECHANGE	(1U << 18)
-#define NFS_ATTR_FATTR_V4_LOCATIONS	(1U << 19)
-#define NFS_ATTR_FATTR_V4_REFERRAL	(1U << 20)
-#define NFS_ATTR_FATTR_MOUNTPOINT	(1U << 21)
-#define NFS_ATTR_FATTR_MOUNTED_ON_FILEID (1U << 22)
-#define NFS_ATTR_FATTR_OWNER_NAME	(1U << 23)
-#define NFS_ATTR_FATTR_GROUP_NAME	(1U << 24)
-#define NFS_ATTR_FATTR_V4_SECURITY_LABEL (1U << 25)
+#define NFS_ATTR_FATTR_TYPE		BIT_ULL(0)
+#define NFS_ATTR_FATTR_MODE		BIT_ULL(1)
+#define NFS_ATTR_FATTR_NLINK		BIT_ULL(2)
+#define NFS_ATTR_FATTR_OWNER		BIT_ULL(3)
+#define NFS_ATTR_FATTR_GROUP		BIT_ULL(4)
+#define NFS_ATTR_FATTR_RDEV		BIT_ULL(5)
+#define NFS_ATTR_FATTR_SIZE		BIT_ULL(6)
+#define NFS_ATTR_FATTR_PRESIZE		BIT_ULL(7)
+#define NFS_ATTR_FATTR_BLOCKS_USED	BIT_ULL(8)
+#define NFS_ATTR_FATTR_SPACE_USED	BIT_ULL(9)
+#define NFS_ATTR_FATTR_FSID		BIT_ULL(10)
+#define NFS_ATTR_FATTR_FILEID		BIT_ULL(11)
+#define NFS_ATTR_FATTR_ATIME		BIT_ULL(12)
+#define NFS_ATTR_FATTR_MTIME		BIT_ULL(13)
+#define NFS_ATTR_FATTR_CTIME		BIT_ULL(14)
+#define NFS_ATTR_FATTR_PREMTIME		BIT_ULL(15)
+#define NFS_ATTR_FATTR_PRECTIME		BIT_ULL(16)
+#define NFS_ATTR_FATTR_CHANGE		BIT_ULL(17)
+#define NFS_ATTR_FATTR_PRECHANGE	BIT_ULL(18)
+#define NFS_ATTR_FATTR_V4_LOCATIONS	BIT_ULL(19)
+#define NFS_ATTR_FATTR_V4_REFERRAL	BIT_ULL(20)
+#define NFS_ATTR_FATTR_MOUNTPOINT	BIT_ULL(21)
+#define NFS_ATTR_FATTR_MOUNTED_ON_FILEID BIT_ULL(22)
+#define NFS_ATTR_FATTR_OWNER_NAME	BIT_ULL(23)
+#define NFS_ATTR_FATTR_GROUP_NAME	BIT_ULL(24)
+#define NFS_ATTR_FATTR_V4_SECURITY_LABEL BIT_ULL(25)
 
 #define NFS_ATTR_FATTR (NFS_ATTR_FATTR_TYPE \
 		| NFS_ATTR_FATTR_MODE \
-- 
cgit v1.2.3


From 1c7ae2dd3f0e6d07ec0a5a348f2561f2171b9c81 Mon Sep 17 00:00:00 2001
From: Anne Marie Merritt <annemarie.merritt@primarydata.com>
Date: Thu, 29 May 2025 06:45:46 -0400
Subject: nfs: Add timecreate to nfs inode

Add tracking of the create time (a.k.a. btime) along with corresponding
bitfields, request, and decode xdr routines.

Signed-off-by: Anne Marie Merritt <annemarie.merritt@primarydata.com>
Signed-off-by: Lance Shelton <lance.shelton@hammerspace.com>
Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Link: https://lore.kernel.org/r/1e3677b0655fa2bbaba0817b41d111d94a06e5ee.1748515333.git.bcodding@redhat.com
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/inode.c          | 17 +++++++++++++++--
 fs/nfs/nfs4proc.c       | 14 +++++++++++++-
 fs/nfs/nfs4xdr.c        | 24 ++++++++++++++++++++++++
 fs/nfs/nfstrace.h       |  3 ++-
 include/linux/nfs_fs.h  |  8 ++++++++
 include/linux/nfs_xdr.h |  3 +++
 6 files changed, 65 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 17f5dcda2a00..c5462aed6bf5 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -197,6 +197,7 @@ void nfs_set_cache_invalid(struct inode *inode, unsigned long flags)
 		if (!(flags & NFS_INO_REVAL_FORCED))
 			flags &= ~(NFS_INO_INVALID_MODE |
 				   NFS_INO_INVALID_OTHER |
+				   NFS_INO_INVALID_BTIME |
 				   NFS_INO_INVALID_XATTR);
 		flags &= ~(NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE);
 	}
@@ -522,6 +523,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 		inode_set_atime(inode, 0, 0);
 		inode_set_mtime(inode, 0, 0);
 		inode_set_ctime(inode, 0, 0);
+		memset(&nfsi->btime, 0, sizeof(nfsi->btime));
 		inode_set_iversion_raw(inode, 0);
 		inode->i_size = 0;
 		clear_nlink(inode);
@@ -545,6 +547,10 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 			inode_set_ctime_to_ts(inode, fattr->ctime);
 		else if (fattr_supported & NFS_ATTR_FATTR_CTIME)
 			nfs_set_cache_invalid(inode, NFS_INO_INVALID_CTIME);
+		if (fattr->valid & NFS_ATTR_FATTR_BTIME)
+			nfsi->btime = fattr->btime;
+		else if (fattr_supported & NFS_ATTR_FATTR_BTIME)
+			nfs_set_cache_invalid(inode, NFS_INO_INVALID_BTIME);
 		if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
 			inode_set_iversion_raw(inode, fattr->change_attr);
 		else
@@ -1943,7 +1949,7 @@ static int nfs_inode_finish_partial_attr_update(const struct nfs_fattr *fattr,
 		NFS_INO_INVALID_ATIME | NFS_INO_INVALID_CTIME |
 		NFS_INO_INVALID_MTIME | NFS_INO_INVALID_SIZE |
 		NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_OTHER |
-		NFS_INO_INVALID_NLINK;
+		NFS_INO_INVALID_NLINK | NFS_INO_INVALID_BTIME;
 	unsigned long cache_validity = NFS_I(inode)->cache_validity;
 	enum nfs4_change_attr_type ctype = NFS_SERVER(inode)->change_attr_type;
 
@@ -2304,7 +2310,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 					| NFS_INO_INVALID_BLOCKS
 					| NFS_INO_INVALID_NLINK
 					| NFS_INO_INVALID_MODE
-					| NFS_INO_INVALID_OTHER;
+					| NFS_INO_INVALID_OTHER
+					| NFS_INO_INVALID_BTIME;
 				if (S_ISDIR(inode->i_mode))
 					nfs_force_lookup_revalidate(inode);
 				attr_changed = true;
@@ -2338,6 +2345,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		nfsi->cache_validity |=
 			save_cache_validity & NFS_INO_INVALID_CTIME;
 
+	if (fattr->valid & NFS_ATTR_FATTR_BTIME)
+		nfsi->btime = fattr->btime;
+	else if (fattr_supported & NFS_ATTR_FATTR_BTIME)
+		nfsi->cache_validity |=
+			save_cache_validity & NFS_INO_INVALID_BTIME;
+
 	/* Check if our cached file size is stale */
 	if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
 		new_isize = nfs_size_to_loff_t(fattr->size);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 341740fa293d..92f1b2601b67 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -222,6 +222,7 @@ const u32 nfs4_fattr_bitmap[3] = {
 	| FATTR4_WORD1_RAWDEV
 	| FATTR4_WORD1_SPACE_USED
 	| FATTR4_WORD1_TIME_ACCESS
+	| FATTR4_WORD1_TIME_CREATE
 	| FATTR4_WORD1_TIME_METADATA
 	| FATTR4_WORD1_TIME_MODIFY
 	| FATTR4_WORD1_MOUNTED_ON_FILEID,
@@ -243,6 +244,7 @@ static const u32 nfs4_pnfs_open_bitmap[3] = {
 	| FATTR4_WORD1_RAWDEV
 	| FATTR4_WORD1_SPACE_USED
 	| FATTR4_WORD1_TIME_ACCESS
+	| FATTR4_WORD1_TIME_CREATE
 	| FATTR4_WORD1_TIME_METADATA
 	| FATTR4_WORD1_TIME_MODIFY,
 	FATTR4_WORD2_MDSTHRESHOLD
@@ -323,6 +325,9 @@ static void nfs4_bitmap_copy_adjust(__u32 *dst, const __u32 *src,
 	if (!(cache_validity & NFS_INO_INVALID_OTHER))
 		dst[1] &= ~(FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP);
 
+	if (!(cache_validity & NFS_INO_INVALID_BTIME))
+		dst[1] &= ~FATTR4_WORD1_TIME_CREATE;
+
 	if (nfs_have_delegated_mtime(inode)) {
 		if (!(cache_validity & NFS_INO_INVALID_ATIME))
 			dst[1] &= ~(FATTR4_WORD1_TIME_ACCESS|FATTR4_WORD1_TIME_ACCESS_SET);
@@ -1307,7 +1312,8 @@ nfs4_update_changeattr_locked(struct inode *inode,
 				NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL |
 				NFS_INO_INVALID_SIZE | NFS_INO_INVALID_OTHER |
 				NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_NLINK |
-				NFS_INO_INVALID_MODE | NFS_INO_INVALID_XATTR;
+				NFS_INO_INVALID_MODE | NFS_INO_INVALID_BTIME |
+				NFS_INO_INVALID_XATTR;
 		nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
 	}
 	nfsi->attrtimeo_timestamp = jiffies;
@@ -4047,6 +4053,10 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
 			server->fattr_valid &= ~NFS_ATTR_FATTR_CTIME;
 		if (!(res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY))
 			server->fattr_valid &= ~NFS_ATTR_FATTR_MTIME;
+		if (!(res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY))
+			server->fattr_valid &= ~NFS_ATTR_FATTR_MTIME;
+		if (!(res.attr_bitmask[1] & FATTR4_WORD1_TIME_CREATE))
+			server->fattr_valid &= ~NFS_ATTR_FATTR_BTIME;
 		memcpy(server->attr_bitmask_nl, res.attr_bitmask,
 				sizeof(server->attr_bitmask));
 		server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
@@ -5781,6 +5791,8 @@ void nfs4_bitmask_set(__u32 bitmask[], const __u32 src[],
 		bitmask[1] |= FATTR4_WORD1_TIME_MODIFY;
 	if (cache_validity & NFS_INO_INVALID_BLOCKS)
 		bitmask[1] |= FATTR4_WORD1_SPACE_USED;
+	if (cache_validity & NFS_INO_INVALID_BTIME)
+		bitmask[1] |= FATTR4_WORD1_TIME_CREATE;
 
 	if (cache_validity & NFS_INO_INVALID_SIZE)
 		bitmask[0] |= FATTR4_WORD0_SIZE;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 318afde38057..49ff98571fa5 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1623,6 +1623,7 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
 			| FATTR4_WORD1_RAWDEV
 			| FATTR4_WORD1_SPACE_USED
 			| FATTR4_WORD1_TIME_ACCESS
+			| FATTR4_WORD1_TIME_CREATE
 			| FATTR4_WORD1_TIME_METADATA
 			| FATTR4_WORD1_TIME_MODIFY;
 		attrs[2] |= FATTR4_WORD2_SECURITY_LABEL;
@@ -4207,6 +4208,24 @@ static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, str
 	return status;
 }
 
+static int decode_attr_time_create(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec64 *time)
+{
+	int status = 0;
+
+	time->tv_sec = 0;
+	time->tv_nsec = 0;
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_CREATE - 1U)))
+		return -EIO;
+	if (likely(bitmap[1] & FATTR4_WORD1_TIME_CREATE)) {
+		status = decode_attr_time(xdr, time);
+		if (status == 0)
+			status = NFS_ATTR_FATTR_BTIME;
+		bitmap[1] &= ~FATTR4_WORD1_TIME_CREATE;
+	}
+	dprintk("%s: btime=%lld\n", __func__, time->tv_sec);
+	return status;
+}
+
 static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec64 *time)
 {
 	int status = 0;
@@ -4781,6 +4800,11 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
 		goto xdr_error;
 	fattr->valid |= status;
 
+	status = decode_attr_time_create(xdr, bitmap, &fattr->btime);
+	if (status < 0)
+		goto xdr_error;
+	fattr->valid |= status;
+
 	status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime);
 	if (status < 0)
 		goto xdr_error;
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 7a058bd8c566..f49f064c5ee5 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -32,7 +32,8 @@
 			{ NFS_INO_INVALID_BLOCKS, "INVALID_BLOCKS" }, \
 			{ NFS_INO_INVALID_XATTR, "INVALID_XATTR" }, \
 			{ NFS_INO_INVALID_NLINK, "INVALID_NLINK" }, \
-			{ NFS_INO_INVALID_MODE, "INVALID_MODE" })
+			{ NFS_INO_INVALID_MODE, "INVALID_MODE" }, \
+			{ NFS_INO_INVALID_BTIME, "INVALID_BTIME" })
 
 #define nfs_show_nfsi_flags(v) \
 	__print_flags(v, "|", \
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 67ae2c3f41d2..c585939b6cd6 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -160,6 +160,12 @@ struct nfs_inode {
 	unsigned long		flags;			/* atomic bit ops */
 	unsigned long		cache_validity;		/* bit mask */
 
+	/*
+	 * NFS Attributes not included in struct inode
+	 */
+
+	struct timespec64	btime;
+
 	/*
 	 * read_cache_jiffies is when we started read-caching this inode.
 	 * attrtimeo is for how long the cached information is assumed
@@ -316,10 +322,12 @@ struct nfs4_copy_state {
 #define NFS_INO_INVALID_XATTR	BIT(15)		/* xattrs are invalid */
 #define NFS_INO_INVALID_NLINK	BIT(16)		/* cached nlinks is invalid */
 #define NFS_INO_INVALID_MODE	BIT(17)		/* cached mode is invalid */
+#define NFS_INO_INVALID_BTIME	BIT(18)		/* cached btime is invalid */
 
 #define NFS_INO_INVALID_ATTR	(NFS_INO_INVALID_CHANGE \
 		| NFS_INO_INVALID_CTIME \
 		| NFS_INO_INVALID_MTIME \
+		| NFS_INO_INVALID_BTIME \
 		| NFS_INO_INVALID_SIZE \
 		| NFS_INO_INVALID_NLINK \
 		| NFS_INO_INVALID_MODE \
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 9cacbbd14787..ac4bff6e9913 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -67,6 +67,7 @@ struct nfs_fattr {
 	struct timespec64	atime;
 	struct timespec64	mtime;
 	struct timespec64	ctime;
+	struct timespec64	btime;
 	__u64			change_attr;	/* NFSv4 change attribute */
 	__u64			pre_change_attr;/* pre-op NFSv4 change attribute */
 	__u64			pre_size;	/* pre_op_attr.size	  */
@@ -106,6 +107,7 @@ struct nfs_fattr {
 #define NFS_ATTR_FATTR_OWNER_NAME	BIT_ULL(23)
 #define NFS_ATTR_FATTR_GROUP_NAME	BIT_ULL(24)
 #define NFS_ATTR_FATTR_V4_SECURITY_LABEL BIT_ULL(25)
+#define NFS_ATTR_FATTR_BTIME		BIT_ULL(26)
 
 #define NFS_ATTR_FATTR (NFS_ATTR_FATTR_TYPE \
 		| NFS_ATTR_FATTR_MODE \
@@ -126,6 +128,7 @@ struct nfs_fattr {
 		| NFS_ATTR_FATTR_SPACE_USED)
 #define NFS_ATTR_FATTR_V4 (NFS_ATTR_FATTR \
 		| NFS_ATTR_FATTR_SPACE_USED \
+		| NFS_ATTR_FATTR_BTIME \
 		| NFS_ATTR_FATTR_V4_SECURITY_LABEL)
 
 /*
-- 
cgit v1.2.3


From 0715a72ee9a38461eac4b34388b772914f269119 Mon Sep 17 00:00:00 2001
From: Anthony Iliopoulos <ailiop@suse.com>
Date: Fri, 13 Jun 2025 11:44:37 +0200
Subject: NFS: remove unused wpages field from struct nfs_server

The wpages field is not serving any purpose since commit c63c7b051395
("NFS: Fix a race when doing NFS write coalescing") which was merged in
v2.6.22-rc1. Remove it completely.

Signed-off-by: Anthony Iliopoulos <ailiop@suse.com>
Link: https://lore.kernel.org/r/20250613094439.82338-2-ailiop@suse.com
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/client.c           | 1 -
 include/linux/nfs_fs_sb.h | 1 -
 2 files changed, 2 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index cf35ad3f818a..23dafc590476 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -814,7 +814,6 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
 		server->wsize = max_rpc_payload;
 	if (server->wsize > NFS_MAX_FILE_IO_SIZE)
 		server->wsize = NFS_MAX_FILE_IO_SIZE;
-	server->wpages = (server->wsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
 	server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
 
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index d7895eeccea3..7048f9b867ab 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -177,7 +177,6 @@ struct nfs_server {
 	unsigned int		rsize;		/* read size */
 	unsigned int		rpages;		/* read size (in pages) */
 	unsigned int		wsize;		/* write size */
-	unsigned int		wpages;		/* write size (in pages) */
 	unsigned int		wtmult;		/* server disk block size */
 	unsigned int		dtsize;		/* readdir size */
 	unsigned short		port;		/* "port=" setting */
-- 
cgit v1.2.3


From 74a33326cfe8e62ebe0a65ba01ea8a8bceb532f8 Mon Sep 17 00:00:00 2001
From: Anthony Iliopoulos <ailiop@suse.com>
Date: Fri, 13 Jun 2025 11:44:38 +0200
Subject: NFS: remove unused time_delta field from struct nfs_server

The last code that was using this was removed via commit ca0daa277aca
("NFS: Cache aggressively when file is open for writing") which was
merged in v4.8-rc1, so it can be removed completely.

Signed-off-by: Anthony Iliopoulos <ailiop@suse.com>
Link: https://lore.kernel.org/r/20250613094439.82338-3-ailiop@suse.com
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/client.c           | 1 -
 include/linux/nfs_fs_sb.h | 1 -
 2 files changed, 2 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 23dafc590476..47258dc3af70 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -830,7 +830,6 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
 
 	server->maxfilesize = fsinfo->maxfilesize;
 
-	server->time_delta = fsinfo->time_delta;
 	server->change_attr_type = fsinfo->change_attr_type;
 
 	server->clone_blksize = fsinfo->clone_blksize;
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 7048f9b867ab..e1b2cf57e765 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -202,7 +202,6 @@ struct nfs_server {
 	struct nfs_fsid		fsid;
 	int			s_sysfs_id;	/* sysfs dentry index */
 	__u64			maxfilesize;	/* maximum file size */
-	struct timespec64	time_delta;	/* smallest time granularity */
 	unsigned long		mount_time;	/* when this fs was mounted */
 	struct super_block	*super;		/* VFS super block */
 	dev_t			s_dev;		/* superblock dev numbers */
-- 
cgit v1.2.3


From 2c665d91c2a2d8b5bdf1374d1253b3c89fca4ede Mon Sep 17 00:00:00 2001
From: Anthony Iliopoulos <ailiop@suse.com>
Date: Fri, 13 Jun 2025 11:44:39 +0200
Subject: NFS: remove unused pnfs_ld_data field from struct nfs_server

The last code that was using this was removed via commit 20d655d6197d
("pnfs/blocklayout: use the device id cache") which was merged in
v3.18-rc1, so it can be removed completely.

Signed-off-by: Anthony Iliopoulos <ailiop@suse.com>
Link: https://lore.kernel.org/r/20250613094439.82338-4-ailiop@suse.com
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/nfs_fs_sb.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index e1b2cf57e765..d2d36711a119 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -246,7 +246,6 @@ struct nfs_server {
 						   filesystem */
 	struct pnfs_layoutdriver_type  *pnfs_curr_ld; /* Active layout driver */
 	struct rpc_wait_queue	roc_rpcwaitq;
-	void			*pnfs_ld_data;	/* per mount point data */
 
 	/* the following fields are protected by nfs_client->cl_lock */
 	struct rb_root		state_owners;
-- 
cgit v1.2.3


From 48693d119b2114f8eaf8b8f972b29e05ae581ad4 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Sun, 13 Jul 2025 00:30:06 +0100
Subject: SUNRPC: Remove unused xdr functions

Remove a bunch of unused xdr_*decode* functions:
  The last use of xdr_decode_netobj() was removed in 2021 by:
commit 7cf96b6d0104 ("lockd: Update the NLMv4 SHARE arguments decoder to
use struct xdr_stream")
  The last use of xdr_decode_string_inplace() was removed in 2021 by:
commit 3049e974a7c7 ("lockd: Update the NLMv4 FREE_ALL arguments decoder
to use struct xdr_stream")
  The last use of xdr_stream_decode_opaque() was removed in 2024 by:
commit fed8a17c61ff ("xdrgen: typedefs should use the built-in string and
opaque functions")

  The functions xdr_stream_decode_string() and
xdr_stream_decode_opaque_dup() were both added in 2018 by the
commit 0e779aa70308 ("SUNRPC: Add helpers for decoding opaque and string
types")
but never used.

Remove them.

Signed-off-by: Dr. David Alan Gilbert <linux@treblig.org>
Link: https://lore.kernel.org/r/20250712233006.403226-1-linux@treblig.org
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xdr.h |   9 ----
 net/sunrpc/xdr.c           | 110 ---------------------------------------------
 2 files changed, 119 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index a2ab813a9800..e370886632b0 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -128,10 +128,7 @@ xdr_buf_init(struct xdr_buf *buf, void *start, size_t len)
 __be32 *xdr_encode_opaque_fixed(__be32 *p, const void *ptr, unsigned int len);
 __be32 *xdr_encode_opaque(__be32 *p, const void *ptr, unsigned int len);
 __be32 *xdr_encode_string(__be32 *p, const char *s);
-__be32 *xdr_decode_string_inplace(__be32 *p, char **sp, unsigned int *lenp,
-			unsigned int maxlen);
 __be32 *xdr_encode_netobj(__be32 *p, const struct xdr_netobj *);
-__be32 *xdr_decode_netobj(__be32 *p, struct xdr_netobj *);
 
 void	xdr_inline_pages(struct xdr_buf *, unsigned int,
 			 struct page **, unsigned int, unsigned int);
@@ -341,12 +338,6 @@ xdr_stream_remaining(const struct xdr_stream *xdr)
 	return xdr->nwords << 2;
 }
 
-ssize_t xdr_stream_decode_opaque(struct xdr_stream *xdr, void *ptr,
-		size_t size);
-ssize_t xdr_stream_decode_opaque_dup(struct xdr_stream *xdr, void **ptr,
-		size_t maxlen, gfp_t gfp_flags);
-ssize_t xdr_stream_decode_string(struct xdr_stream *xdr, char *str,
-		size_t size);
 ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str,
 		size_t maxlen, gfp_t gfp_flags);
 ssize_t xdr_stream_decode_opaque_auth(struct xdr_stream *xdr, u32 *flavor,
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 2ea00e354ba6..a0aae1144212 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -37,19 +37,6 @@ xdr_encode_netobj(__be32 *p, const struct xdr_netobj *obj)
 }
 EXPORT_SYMBOL_GPL(xdr_encode_netobj);
 
-__be32 *
-xdr_decode_netobj(__be32 *p, struct xdr_netobj *obj)
-{
-	unsigned int	len;
-
-	if ((len = be32_to_cpu(*p++)) > XDR_MAX_NETOBJ)
-		return NULL;
-	obj->len  = len;
-	obj->data = (u8 *) p;
-	return p + XDR_QUADLEN(len);
-}
-EXPORT_SYMBOL_GPL(xdr_decode_netobj);
-
 /**
  * xdr_encode_opaque_fixed - Encode fixed length opaque data
  * @p: pointer to current position in XDR buffer.
@@ -102,21 +89,6 @@ xdr_encode_string(__be32 *p, const char *string)
 }
 EXPORT_SYMBOL_GPL(xdr_encode_string);
 
-__be32 *
-xdr_decode_string_inplace(__be32 *p, char **sp,
-			  unsigned int *lenp, unsigned int maxlen)
-{
-	u32 len;
-
-	len = be32_to_cpu(*p++);
-	if (len > maxlen)
-		return NULL;
-	*lenp = len;
-	*sp = (char *) p;
-	return p + XDR_QUADLEN(len);
-}
-EXPORT_SYMBOL_GPL(xdr_decode_string_inplace);
-
 /**
  * xdr_terminate_string - '\0'-terminate a string residing in an xdr_buf
  * @buf: XDR buffer where string resides
@@ -2247,88 +2219,6 @@ out:
 }
 EXPORT_SYMBOL_GPL(xdr_process_buf);
 
-/**
- * xdr_stream_decode_opaque - Decode variable length opaque
- * @xdr: pointer to xdr_stream
- * @ptr: location to store opaque data
- * @size: size of storage buffer @ptr
- *
- * Return values:
- *   On success, returns size of object stored in *@ptr
- *   %-EBADMSG on XDR buffer overflow
- *   %-EMSGSIZE on overflow of storage buffer @ptr
- */
-ssize_t xdr_stream_decode_opaque(struct xdr_stream *xdr, void *ptr, size_t size)
-{
-	ssize_t ret;
-	void *p;
-
-	ret = xdr_stream_decode_opaque_inline(xdr, &p, size);
-	if (ret <= 0)
-		return ret;
-	memcpy(ptr, p, ret);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(xdr_stream_decode_opaque);
-
-/**
- * xdr_stream_decode_opaque_dup - Decode and duplicate variable length opaque
- * @xdr: pointer to xdr_stream
- * @ptr: location to store pointer to opaque data
- * @maxlen: maximum acceptable object size
- * @gfp_flags: GFP mask to use
- *
- * Return values:
- *   On success, returns size of object stored in *@ptr
- *   %-EBADMSG on XDR buffer overflow
- *   %-EMSGSIZE if the size of the object would exceed @maxlen
- *   %-ENOMEM on memory allocation failure
- */
-ssize_t xdr_stream_decode_opaque_dup(struct xdr_stream *xdr, void **ptr,
-		size_t maxlen, gfp_t gfp_flags)
-{
-	ssize_t ret;
-	void *p;
-
-	ret = xdr_stream_decode_opaque_inline(xdr, &p, maxlen);
-	if (ret > 0) {
-		*ptr = kmemdup(p, ret, gfp_flags);
-		if (*ptr != NULL)
-			return ret;
-		ret = -ENOMEM;
-	}
-	*ptr = NULL;
-	return ret;
-}
-EXPORT_SYMBOL_GPL(xdr_stream_decode_opaque_dup);
-
-/**
- * xdr_stream_decode_string - Decode variable length string
- * @xdr: pointer to xdr_stream
- * @str: location to store string
- * @size: size of storage buffer @str
- *
- * Return values:
- *   On success, returns length of NUL-terminated string stored in *@str
- *   %-EBADMSG on XDR buffer overflow
- *   %-EMSGSIZE on overflow of storage buffer @str
- */
-ssize_t xdr_stream_decode_string(struct xdr_stream *xdr, char *str, size_t size)
-{
-	ssize_t ret;
-	void *p;
-
-	ret = xdr_stream_decode_opaque_inline(xdr, &p, size);
-	if (ret > 0) {
-		memcpy(str, p, ret);
-		str[ret] = '\0';
-		return strlen(str);
-	}
-	*str = '\0';
-	return ret;
-}
-EXPORT_SYMBOL_GPL(xdr_stream_decode_string);
-
 /**
  * xdr_stream_decode_string_dup - Decode and duplicate variable length string
  * @xdr: pointer to xdr_stream
-- 
cgit v1.2.3


From c49601642f95c8c6787acb07881f2495bc8aeb27 Mon Sep 17 00:00:00 2001
From: Nitin Rawat <quic_nitirawa@quicinc.com>
Date: Mon, 14 Jul 2025 13:23:35 +0530
Subject: scsi: ufs: core: Add ufshcd_dme_rmw() to modify DME attributes

Introduce ufshcd_dme_rmw() API to read, modify, and write DME attributes
in UFS host controllers using a mask and value.

Signed-off-by: Nitin Rawat <quic_nitirawa@quicinc.com>
Link: https://lore.kernel.org/r/20250714075336.2133-3-quic_nitirawa@quicinc.com
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/ufs/core/ufshcd.c | 24 ++++++++++++++++++++++++
 include/ufs/ufshcd.h      |  1 +
 2 files changed, 25 insertions(+)

(limited to 'include')

diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index 4410e7d93b7d..a69aa5560932 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -4253,6 +4253,30 @@ out:
 }
 EXPORT_SYMBOL_GPL(ufshcd_dme_get_attr);
 
+/**
+ * ufshcd_dme_rmw - read, modify, and write a DME attribute
+ * @hba: per adapter instance
+ * @mask: indicates which bits to clear from the value that has been read
+ * @val: actual value to write
+ * @attr: dme attribute
+ */
+int ufshcd_dme_rmw(struct ufs_hba *hba, u32 mask,
+		   u32 val, u32 attr)
+{
+	u32 cfg = 0;
+	int err;
+
+	err = ufshcd_dme_get(hba, UIC_ARG_MIB(attr), &cfg);
+	if (err)
+		return err;
+
+	cfg &= ~mask;
+	cfg |= (val & mask);
+
+	return ufshcd_dme_set(hba, UIC_ARG_MIB(attr), cfg);
+}
+EXPORT_SYMBOL_GPL(ufshcd_dme_rmw);
+
 /**
  * ufshcd_uic_pwr_ctrl - executes UIC commands (which affects the link power
  * state) and waits for it to take effect.
diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h
index 9b3515cee711..1d3943777584 100644
--- a/include/ufs/ufshcd.h
+++ b/include/ufs/ufshcd.h
@@ -1480,6 +1480,7 @@ void ufshcd_resume_complete(struct device *dev);
 bool ufshcd_is_hba_active(struct ufs_hba *hba);
 void ufshcd_pm_qos_init(struct ufs_hba *hba);
 void ufshcd_pm_qos_exit(struct ufs_hba *hba);
+int ufshcd_dme_rmw(struct ufs_hba *hba, u32 mask, u32 val, u32 attr);
 
 /* Wrapper functions for safely calling variant operations */
 static inline int ufshcd_vops_init(struct ufs_hba *hba)
-- 
cgit v1.2.3


From 2a683d005286018c6f47ef0e432829655a6a21a3 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@google.com>
Date: Fri, 11 Jul 2025 05:10:59 +0000
Subject: dev: Pass netdevice_tracker to dev_get_by_flags_rcu().

This is a follow-up for commit eb1ac9ff6c4a5 ("ipv6: anycast: Don't
hold RTNL for IPV6_JOIN_ANYCAST.").

We should not add a new device lookup API without netdevice_tracker.

Let's pass netdevice_tracker to dev_get_by_flags_rcu() and rename it
with netdev_ prefix to match other newer APIs.

Note that we always use GFP_ATOMIC for netdev_hold() as it's expected
to be called under RCU.

Suggested-by: Jakub Kicinski <kuba@kernel.org>
Link: https://lore.kernel.org/netdev/20250708184053.102109f6@kernel.org/
Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20250711051120.2866855-1-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h |  4 ++--
 net/core/dev.c            | 11 ++++++-----
 net/ipv6/anycast.c        | 11 ++++++-----
 3 files changed, 14 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a80d21a14612..ec23cee5245d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3332,8 +3332,6 @@ int dev_get_iflink(const struct net_device *dev);
 int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb);
 int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
 			  struct net_device_path_stack *stack);
-struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short flags,
-					unsigned short mask);
 struct net_device *dev_get_by_name(struct net *net, const char *name);
 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name);
 struct net_device *__dev_get_by_name(struct net *net, const char *name);
@@ -3396,6 +3394,8 @@ struct net_device *netdev_get_by_index(struct net *net, int ifindex,
 				       netdevice_tracker *tracker, gfp_t gfp);
 struct net_device *netdev_get_by_name(struct net *net, const char *name,
 				      netdevice_tracker *tracker, gfp_t gfp);
+struct net_device *netdev_get_by_flags_rcu(struct net *net, netdevice_tracker *tracker,
+					   unsigned short flags, unsigned short mask);
 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
 void netdev_copy_name(struct net_device *dev, char *name);
 
diff --git a/net/core/dev.c b/net/core/dev.c
index e365b099484e..19ddc3e6990a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1267,8 +1267,9 @@ struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 
 /**
- * dev_get_by_flags_rcu - find any device with given flags
+ * netdev_get_by_flags_rcu - find any device with given flags
  * @net: the applicable net namespace
+ * @tracker: tracking object for the acquired reference
  * @if_flags: IFF_* values
  * @mask: bitmask of bits in if_flags to check
  *
@@ -1277,21 +1278,21 @@ EXPORT_SYMBOL(dev_getfirstbyhwtype);
  * Context: rcu_read_lock() must be held.
  * Returns: NULL if a device is not found or a pointer to the device.
  */
-struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
-					unsigned short mask)
+struct net_device *netdev_get_by_flags_rcu(struct net *net, netdevice_tracker *tracker,
+					   unsigned short if_flags, unsigned short mask)
 {
 	struct net_device *dev;
 
 	for_each_netdev_rcu(net, dev) {
 		if (((READ_ONCE(dev->flags) ^ if_flags) & mask) == 0) {
-			dev_hold(dev);
+			netdev_hold(dev, tracker, GFP_ATOMIC);
 			return dev;
 		}
 	}
 
 	return NULL;
 }
-EXPORT_IPV6_MOD(dev_get_by_flags_rcu);
+EXPORT_IPV6_MOD(netdev_get_by_flags_rcu);
 
 /**
  *	dev_valid_name - check if name is okay for network device
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 53cf68e0242b..f8a8e46286b8 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -69,6 +69,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct ipv6_ac_socklist *pac = NULL;
 	struct net *net = sock_net(sk);
+	netdevice_tracker dev_tracker;
 	struct net_device *dev = NULL;
 	struct inet6_dev *idev;
 	int err = 0, ishost;
@@ -79,7 +80,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
 		return -EINVAL;
 
 	if (ifindex)
-		dev = dev_get_by_index(net, ifindex);
+		dev = netdev_get_by_index(net, ifindex, &dev_tracker, GFP_KERNEL);
 
 	if (ipv6_chk_addr_and_flags(net, addr, dev, true, 0, IFA_F_TENTATIVE)) {
 		err = -EINVAL;
@@ -104,7 +105,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
 		rt = rt6_lookup(net, addr, NULL, 0, NULL, 0);
 		if (rt) {
 			dev = dst_dev(&rt->dst);
-			dev_hold(dev);
+			netdev_hold(dev, &dev_tracker, GFP_ATOMIC);
 			ip6_rt_put(rt);
 		} else if (ishost) {
 			rcu_read_unlock();
@@ -112,8 +113,8 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
 			goto error;
 		} else {
 			/* router, no matching interface: just pick one */
-			dev = dev_get_by_flags_rcu(net, IFF_UP,
-						   IFF_UP | IFF_LOOPBACK);
+			dev = netdev_get_by_flags_rcu(net, &dev_tracker, IFF_UP,
+						      IFF_UP | IFF_LOOPBACK);
 		}
 		rcu_read_unlock();
 	}
@@ -159,7 +160,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
 error_idev:
 	in6_dev_put(idev);
 error:
-	dev_put(dev);
+	netdev_put(dev, &dev_tracker);
 
 	if (pac)
 		sock_kfree_s(sk, pac, sizeof(*pac));
-- 
cgit v1.2.3


From 08a305b2a5b8e125120bcf670ffe775c86cf1f59 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Sat, 12 Jul 2025 21:57:59 +0100
Subject: net/x25: Remove unused x25_terminate_link()

x25_terminate_link() has been unused since the last use was removed
in 2020 by:
commit 7eed751b3b2a ("net/x25: handle additional netdev events")

Remove it.

Signed-off-by: Dr. David Alan Gilbert <linux@treblig.org>
Acked-by: Martin Schiller <ms@dev.tdt.de>
Link: https://patch.msgid.link/20250712205759.278777-1-linux@treblig.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/x25.h |  1 -
 net/x25/x25_dev.c | 22 ----------------------
 2 files changed, 23 deletions(-)

(limited to 'include')

diff --git a/include/net/x25.h b/include/net/x25.h
index 5e833cfc864e..414f3fd99345 100644
--- a/include/net/x25.h
+++ b/include/net/x25.h
@@ -203,7 +203,6 @@ void x25_send_frame(struct sk_buff *, struct x25_neigh *);
 int x25_lapb_receive_frame(struct sk_buff *, struct net_device *,
 			   struct packet_type *, struct net_device *);
 void x25_establish_link(struct x25_neigh *);
-void x25_terminate_link(struct x25_neigh *);
 
 /* x25_facilities.c */
 int x25_parse_facilities(struct sk_buff *, struct x25_facilities *,
diff --git a/net/x25/x25_dev.c b/net/x25/x25_dev.c
index 748d8630ab58..fb8ac1aa5826 100644
--- a/net/x25/x25_dev.c
+++ b/net/x25/x25_dev.c
@@ -170,28 +170,6 @@ void x25_establish_link(struct x25_neigh *nb)
 	dev_queue_xmit(skb);
 }
 
-void x25_terminate_link(struct x25_neigh *nb)
-{
-	struct sk_buff *skb;
-	unsigned char *ptr;
-
-	if (nb->dev->type != ARPHRD_X25)
-		return;
-
-	skb = alloc_skb(1, GFP_ATOMIC);
-	if (!skb) {
-		pr_err("x25_dev: out of memory\n");
-		return;
-	}
-
-	ptr  = skb_put(skb, 1);
-	*ptr = X25_IFACE_DISCONNECT;
-
-	skb->protocol = htons(ETH_P_X25);
-	skb->dev      = nb->dev;
-	dev_queue_xmit(skb);
-}
-
 void x25_send_frame(struct sk_buff *skb, struct x25_neigh *nb)
 {
 	unsigned char *dptr;
-- 
cgit v1.2.3


From 25236d4844ad8631a3ff12f1b33aaa27ac74172d Mon Sep 17 00:00:00 2001
From: "Ewan D. Milne" <emilne@redhat.com>
Date: Mon, 7 Jul 2025 16:22:25 -0400
Subject: scsi: scsi_transport_fc: Change to use per-rport devloss_work_q

Configurations with large numbers of FC rports per host instance are
taking a very long time to complete all devloss work.  Increase potential
parallelism by using a per-rport devloss_work_q for dev_loss_work and
fast_io_fail_work.

Signed-off-by: Ewan D. Milne <emilne@redhat.com>
Link: https://lore.kernel.org/r/20250707202225.1203189-1-emilne@redhat.com
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/scsi_transport_fc.c | 70 +++++++++++++++++++++++-----------------
 include/scsi/scsi_transport_fc.h |  5 ++-
 2 files changed, 42 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
index 6b165a3ec6de..82d091d627c0 100644
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -446,13 +446,6 @@ static int fc_host_setup(struct transport_container *tc, struct device *dev,
 		return -ENOMEM;
 
 	fc_host->dev_loss_tmo = fc_dev_loss_tmo;
-	fc_host->devloss_work_q = alloc_workqueue("fc_dl_%d", 0, 0,
-					shost->host_no);
-	if (!fc_host->devloss_work_q) {
-		destroy_workqueue(fc_host->work_q);
-		fc_host->work_q = NULL;
-		return -ENOMEM;
-	}
 
 	fc_bsg_hostadd(shost, fc_host);
 	/* ignore any bsg add error - we just can't do sgio */
@@ -2821,10 +2814,10 @@ fc_flush_work(struct Scsi_Host *shost)
  * 	1 on success / 0 already queued / < 0 for error
  */
 static int
-fc_queue_devloss_work(struct Scsi_Host *shost, struct delayed_work *work,
-				unsigned long delay)
+fc_queue_devloss_work(struct Scsi_Host *shost, struct fc_rport *rport,
+		      struct delayed_work *work, unsigned long delay)
 {
-	if (unlikely(!fc_host_devloss_work_q(shost))) {
+	if (unlikely(!rport->devloss_work_q)) {
 		printk(KERN_ERR
 			"ERROR: FC host '%s' attempted to queue work, "
 			"when no workqueue created.\n", shost->hostt->name);
@@ -2833,7 +2826,7 @@ fc_queue_devloss_work(struct Scsi_Host *shost, struct delayed_work *work,
 		return -EINVAL;
 	}
 
-	return queue_delayed_work(fc_host_devloss_work_q(shost), work, delay);
+	return queue_delayed_work(rport->devloss_work_q, work, delay);
 }
 
 /**
@@ -2841,9 +2834,9 @@ fc_queue_devloss_work(struct Scsi_Host *shost, struct delayed_work *work,
  * @shost:	Pointer to Scsi_Host bound to fc_host.
  */
 static void
-fc_flush_devloss(struct Scsi_Host *shost)
+fc_flush_devloss(struct Scsi_Host *shost, struct fc_rport *rport)
 {
-	if (!fc_host_devloss_work_q(shost)) {
+	if (unlikely(!rport->devloss_work_q)) {
 		printk(KERN_ERR
 			"ERROR: FC host '%s' attempted to flush work, "
 			"when no workqueue created.\n", shost->hostt->name);
@@ -2851,7 +2844,7 @@ fc_flush_devloss(struct Scsi_Host *shost)
 		return;
 	}
 
-	flush_workqueue(fc_host_devloss_work_q(shost));
+	flush_workqueue(rport->devloss_work_q);
 }
 
 
@@ -2913,13 +2906,6 @@ fc_remove_host(struct Scsi_Host *shost)
 		fc_host->work_q = NULL;
 		destroy_workqueue(work_q);
 	}
-
-	/* flush all devloss work items, then kill it  */
-	if (fc_host->devloss_work_q) {
-		work_q = fc_host->devloss_work_q;
-		fc_host->devloss_work_q = NULL;
-		destroy_workqueue(work_q);
-	}
 }
 EXPORT_SYMBOL(fc_remove_host);
 
@@ -2967,6 +2953,7 @@ fc_rport_final_delete(struct work_struct *work)
 	struct device *dev = &rport->dev;
 	struct Scsi_Host *shost = rport_to_shost(rport);
 	struct fc_internal *i = to_fc_internal(shost->transportt);
+	struct workqueue_struct *work_q;
 	unsigned long flags;
 	int do_callback = 0;
 
@@ -2988,9 +2975,9 @@ fc_rport_final_delete(struct work_struct *work)
 	if (rport->flags & FC_RPORT_DEVLOSS_PENDING) {
 		spin_unlock_irqrestore(shost->host_lock, flags);
 		if (!cancel_delayed_work(&rport->fail_io_work))
-			fc_flush_devloss(shost);
+			fc_flush_devloss(shost, rport);
 		if (!cancel_delayed_work(&rport->dev_loss_work))
-			fc_flush_devloss(shost);
+			fc_flush_devloss(shost, rport);
 		cancel_work_sync(&rport->scan_work);
 		spin_lock_irqsave(shost->host_lock, flags);
 		rport->flags &= ~FC_RPORT_DEVLOSS_PENDING;
@@ -3021,6 +3008,12 @@ fc_rport_final_delete(struct work_struct *work)
 
 	fc_bsg_remove(rport->rqst_q);
 
+	if (rport->devloss_work_q) {
+		work_q = rport->devloss_work_q;
+		rport->devloss_work_q = NULL;
+		destroy_workqueue(work_q);
+	}
+
 	transport_remove_device(dev);
 	device_del(dev);
 	transport_destroy_device(dev);
@@ -3093,6 +3086,22 @@ fc_remote_port_create(struct Scsi_Host *shost, int channel,
 
 	spin_unlock_irqrestore(shost->host_lock, flags);
 
+	rport->devloss_work_q = alloc_workqueue("fc_dl_%d_%d", 0, 0,
+						shost->host_no, rport->number);
+	if (!rport->devloss_work_q) {
+		printk(KERN_ERR "FC Remote Port alloc_workqueue failed\n");
+/*
+ * Note that we have not yet called device_initialize() / get_device().
+ * The incremented rport->number cannot be reclaimed: host_lock was released.
+ */
+		spin_lock_irqsave(shost->host_lock, flags);
+		list_del(&rport->peers);
+		scsi_host_put(shost);			/* for fc_host->rport list */
+		spin_unlock_irqrestore(shost->host_lock, flags);
+		kfree(rport);
+		return NULL;
+	}
+
 	dev = &rport->dev;
 	device_initialize(dev);			/* takes self reference */
 	dev->parent = get_device(&shost->shost_gendev); /* parent reference */
@@ -3255,9 +3264,9 @@ fc_remote_port_add(struct Scsi_Host *shost, int channel,
 				 * be checked and will NOOP the function.
 				 */
 				if (!cancel_delayed_work(&rport->fail_io_work))
-					fc_flush_devloss(shost);
+					fc_flush_devloss(shost, rport);
 				if (!cancel_delayed_work(&rport->dev_loss_work))
-					fc_flush_devloss(shost);
+					fc_flush_devloss(shost, rport);
 
 				spin_lock_irqsave(shost->host_lock, flags);
 
@@ -3451,11 +3460,12 @@ fc_remote_port_delete(struct fc_rport  *rport)
 	/* see if we need to kill io faster than waiting for device loss */
 	if ((rport->fast_io_fail_tmo != -1) &&
 	    (rport->fast_io_fail_tmo < timeout))
-		fc_queue_devloss_work(shost, &rport->fail_io_work,
-					rport->fast_io_fail_tmo * HZ);
+		fc_queue_devloss_work(shost, rport, &rport->fail_io_work,
+				      rport->fast_io_fail_tmo * HZ);
 
 	/* cap the length the devices can be blocked until they are deleted */
-	fc_queue_devloss_work(shost, &rport->dev_loss_work, timeout * HZ);
+	fc_queue_devloss_work(shost, rport, &rport->dev_loss_work,
+			      timeout * HZ);
 }
 EXPORT_SYMBOL(fc_remote_port_delete);
 
@@ -3514,9 +3524,9 @@ fc_remote_port_rolechg(struct fc_rport  *rport, u32 roles)
 		 * transaction.
 		 */
 		if (!cancel_delayed_work(&rport->fail_io_work))
-			fc_flush_devloss(shost);
+			fc_flush_devloss(shost, rport);
 		if (!cancel_delayed_work(&rport->dev_loss_work))
-			fc_flush_devloss(shost);
+			fc_flush_devloss(shost, rport);
 
 		spin_lock_irqsave(shost->host_lock, flags);
 		rport->flags &= ~(FC_RPORT_FAST_FAIL_TIMEDOUT |
diff --git a/include/scsi/scsi_transport_fc.h b/include/scsi/scsi_transport_fc.h
index d02b55261307..b908aacfef48 100644
--- a/include/scsi/scsi_transport_fc.h
+++ b/include/scsi/scsi_transport_fc.h
@@ -383,6 +383,8 @@ struct fc_rport {	/* aka fc_starget_attrs */
  	struct work_struct stgt_delete_work;
 	struct work_struct rport_delete_work;
 	struct request_queue *rqst_q;	/* bsg support */
+
+	struct workqueue_struct *devloss_work_q;
 } __attribute__((aligned(sizeof(unsigned long))));
 
 /* bit field values for struct fc_rport "flags" field: */
@@ -576,7 +578,6 @@ struct fc_host_attrs {
 
 	/* work queues for rport state manipulation */
 	struct workqueue_struct *work_q;
-	struct workqueue_struct *devloss_work_q;
 
 	/* bsg support */
 	struct request_queue *rqst_q;
@@ -654,8 +655,6 @@ struct fc_host_attrs {
 	(((struct fc_host_attrs *)(x)->shost_data)->npiv_vports_inuse)
 #define fc_host_work_q(x) \
 	(((struct fc_host_attrs *)(x)->shost_data)->work_q)
-#define fc_host_devloss_work_q(x) \
-	(((struct fc_host_attrs *)(x)->shost_data)->devloss_work_q)
 #define fc_host_dev_loss_tmo(x) \
 	(((struct fc_host_attrs *)(x)->shost_data)->dev_loss_tmo)
 #define fc_host_max_ct_payload(x)  \
-- 
cgit v1.2.3


From 2677010e7793451c20d895c477c4dc76f6e6a10e Mon Sep 17 00:00:00 2001
From: Samiullah Khawaja <skhawaja@google.com>
Date: Thu, 10 Jul 2025 21:12:03 +0000
Subject: Add support to set NAPI threaded for individual NAPI

A net device has a threaded sysfs attribute that can be used to enable
threaded NAPI polling on all of the NAPI contexts under that device. Allow
enabling threaded NAPI polling at the individual NAPI level using netlink.

Extend the netlink operation `napi-set` and allow setting the threaded
attribute of a NAPI. This will enable the threaded polling on a NAPI
context.

Add a test in `nl_netdev.py` that verifies various cases of threaded
NAPI being set at NAPI and at device level.

Tested
 ./tools/testing/selftests/net/nl_netdev.py
 TAP version 13
 1..7
 ok 1 nl_netdev.empty_check
 ok 2 nl_netdev.lo_check
 ok 3 nl_netdev.page_pool_check
 ok 4 nl_netdev.napi_list_check
 ok 5 nl_netdev.dev_set_threaded
 ok 6 nl_netdev.napi_set_threaded
 ok 7 nl_netdev.nsim_rxq_reset_down
 # Totals: pass:7 fail:0 xfail:0 xpass:0 skip:0 error:0

Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20250710211203.3979655-1-skhawaja@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/netdev.yaml  | 10 ++++
 Documentation/networking/napi.rst        |  9 +++-
 include/linux/netdevice.h                |  1 +
 include/uapi/linux/netdev.h              |  1 +
 net/core/dev.c                           | 30 +++++++++--
 net/core/dev.h                           |  7 +++
 net/core/netdev-genl-gen.c               |  5 +-
 net/core/netdev-genl.c                   | 14 +++++
 tools/include/uapi/linux/netdev.h        |  1 +
 tools/testing/selftests/net/nl_netdev.py | 91 +++++++++++++++++++++++++++++++-
 10 files changed, 162 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
index ce4cfec82100..85d0ea6ac426 100644
--- a/Documentation/netlink/specs/netdev.yaml
+++ b/Documentation/netlink/specs/netdev.yaml
@@ -283,6 +283,14 @@ attribute-sets:
         doc: The timeout, in nanoseconds, of how long to suspend irq
              processing, if event polling finds events
         type: uint
+      -
+        name: threaded
+        doc: Whether the NAPI is configured to operate in threaded polling
+             mode. If this is set to 1 then the NAPI context operates in
+             threaded polling mode.
+        type: uint
+        checks:
+          max: 1
   -
     name: xsk-info
     attributes: []
@@ -694,6 +702,7 @@ operations:
             - defer-hard-irqs
             - gro-flush-timeout
             - irq-suspend-timeout
+            - threaded
       dump:
         request:
           attributes:
@@ -746,6 +755,7 @@ operations:
             - defer-hard-irqs
             - gro-flush-timeout
             - irq-suspend-timeout
+            - threaded
     -
       name: bind-tx
       doc: Bind dmabuf to netdev for TX
diff --git a/Documentation/networking/napi.rst b/Documentation/networking/napi.rst
index d0e3953cae6a..a15754adb041 100644
--- a/Documentation/networking/napi.rst
+++ b/Documentation/networking/napi.rst
@@ -444,7 +444,14 @@ dependent). The NAPI instance IDs will be assigned in the opposite
 order than the process IDs of the kernel threads.
 
 Threaded NAPI is controlled by writing 0/1 to the ``threaded`` file in
-netdev's sysfs directory.
+netdev's sysfs directory. It can also be enabled for a specific NAPI using
+the netlink interface.
+
+For example, using the script:
+
+.. code-block:: bash
+
+  $ ynl --family netdev --do napi-set --json='{"id": 66, "threaded": 1}'
 
 .. rubric:: Footnotes
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ec23cee5245d..e49d8c98d284 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -369,6 +369,7 @@ struct napi_config {
 	u64 irq_suspend_timeout;
 	u32 defer_hard_irqs;
 	cpumask_t affinity_mask;
+	bool threaded;
 	unsigned int napi_id;
 };
 
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index 7eb9571786b8..1f3719a9a0eb 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -134,6 +134,7 @@ enum {
 	NETDEV_A_NAPI_DEFER_HARD_IRQS,
 	NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT,
 	NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT,
+	NETDEV_A_NAPI_THREADED,
 
 	__NETDEV_A_NAPI_MAX,
 	NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1)
diff --git a/net/core/dev.c b/net/core/dev.c
index 19ddc3e6990a..621a639aeba1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6961,6 +6961,31 @@ static void napi_stop_kthread(struct napi_struct *napi)
 	napi->thread = NULL;
 }
 
+int napi_set_threaded(struct napi_struct *napi, bool threaded)
+{
+	if (threaded) {
+		if (!napi->thread) {
+			int err = napi_kthread_create(napi);
+
+			if (err)
+				return err;
+		}
+	}
+
+	if (napi->config)
+		napi->config->threaded = threaded;
+
+	if (!threaded && napi->thread) {
+		napi_stop_kthread(napi);
+	} else {
+		/* Make sure kthread is created before THREADED bit is set. */
+		smp_mb__before_atomic();
+		assign_bit(NAPI_STATE_THREADED, &napi->state, threaded);
+	}
+
+	return 0;
+}
+
 int dev_set_threaded(struct net_device *dev, bool threaded)
 {
 	struct napi_struct *napi;
@@ -6968,9 +6993,6 @@ int dev_set_threaded(struct net_device *dev, bool threaded)
 
 	netdev_assert_locked_or_invisible(dev);
 
-	if (dev->threaded == threaded)
-		return 0;
-
 	if (threaded) {
 		list_for_each_entry(napi, &dev->napi_list, dev_list) {
 			if (!napi->thread) {
@@ -7221,6 +7243,8 @@ static void napi_restore_config(struct napi_struct *n)
 		napi_hash_add(n);
 		n->config->napi_id = n->napi_id;
 	}
+
+	WARN_ON_ONCE(napi_set_threaded(n, n->config->threaded));
 }
 
 static void napi_save_config(struct napi_struct *n)
diff --git a/net/core/dev.h b/net/core/dev.h
index e93f36b7ddf3..a603387fb566 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -315,6 +315,13 @@ static inline void napi_set_irq_suspend_timeout(struct napi_struct *n,
 	WRITE_ONCE(n->irq_suspend_timeout, timeout);
 }
 
+static inline bool napi_get_threaded(struct napi_struct *n)
+{
+	return test_bit(NAPI_STATE_THREADED, &n->state);
+}
+
+int napi_set_threaded(struct napi_struct *n, bool threaded);
+
 int rps_cpumask_housekeeping(struct cpumask *mask);
 
 #if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL)
diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c
index 4fc44587f493..0994bd68a7e6 100644
--- a/net/core/netdev-genl-gen.c
+++ b/net/core/netdev-genl-gen.c
@@ -92,11 +92,12 @@ static const struct nla_policy netdev_bind_rx_nl_policy[NETDEV_A_DMABUF_FD + 1]
 };
 
 /* NETDEV_CMD_NAPI_SET - do */
-static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT + 1] = {
+static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_THREADED + 1] = {
 	[NETDEV_A_NAPI_ID] = { .type = NLA_U32, },
 	[NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range),
 	[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, },
 	[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, },
+	[NETDEV_A_NAPI_THREADED] = NLA_POLICY_MAX(NLA_UINT, 1),
 };
 
 /* NETDEV_CMD_BIND_TX - do */
@@ -193,7 +194,7 @@ static const struct genl_split_ops netdev_nl_ops[] = {
 		.cmd		= NETDEV_CMD_NAPI_SET,
 		.doit		= netdev_nl_napi_set_doit,
 		.policy		= netdev_napi_set_nl_policy,
-		.maxattr	= NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT,
+		.maxattr	= NETDEV_A_NAPI_THREADED,
 		.flags		= GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
 	},
 	{
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index 2afa7b2141aa..5875df372415 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -184,6 +184,10 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi,
 	if (napi->irq >= 0 && nla_put_u32(rsp, NETDEV_A_NAPI_IRQ, napi->irq))
 		goto nla_put_failure;
 
+	if (nla_put_uint(rsp, NETDEV_A_NAPI_THREADED,
+			 napi_get_threaded(napi)))
+		goto nla_put_failure;
+
 	if (napi->thread) {
 		pid = task_pid_nr(napi->thread);
 		if (nla_put_u32(rsp, NETDEV_A_NAPI_PID, pid))
@@ -322,8 +326,18 @@ netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info)
 {
 	u64 irq_suspend_timeout = 0;
 	u64 gro_flush_timeout = 0;
+	u8 threaded = 0;
 	u32 defer = 0;
 
+	if (info->attrs[NETDEV_A_NAPI_THREADED]) {
+		int ret;
+
+		threaded = nla_get_uint(info->attrs[NETDEV_A_NAPI_THREADED]);
+		ret = napi_set_threaded(napi, !!threaded);
+		if (ret)
+			return ret;
+	}
+
 	if (info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]) {
 		defer = nla_get_u32(info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]);
 		napi_set_defer_hard_irqs(napi, defer);
diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
index 7eb9571786b8..1f3719a9a0eb 100644
--- a/tools/include/uapi/linux/netdev.h
+++ b/tools/include/uapi/linux/netdev.h
@@ -134,6 +134,7 @@ enum {
 	NETDEV_A_NAPI_DEFER_HARD_IRQS,
 	NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT,
 	NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT,
+	NETDEV_A_NAPI_THREADED,
 
 	__NETDEV_A_NAPI_MAX,
 	NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1)
diff --git a/tools/testing/selftests/net/nl_netdev.py b/tools/testing/selftests/net/nl_netdev.py
index c9109627a741..c8ffade79a52 100755
--- a/tools/testing/selftests/net/nl_netdev.py
+++ b/tools/testing/selftests/net/nl_netdev.py
@@ -35,6 +35,91 @@ def napi_list_check(nf) -> None:
                 ksft_eq(len(napis), 100,
                         comment=f"queue count after reset queue {q} mode {i}")
 
+def napi_set_threaded(nf) -> None:
+    """
+    Test that verifies various cases of napi threaded
+    set and unset at napi and device level.
+    """
+    with NetdevSimDev(queue_count=2) as nsimdev:
+        nsim = nsimdev.nsims[0]
+
+        ip(f"link set dev {nsim.ifname} up")
+
+        napis = nf.napi_get({'ifindex': nsim.ifindex}, dump=True)
+        ksft_eq(len(napis), 2)
+
+        napi0_id = napis[0]['id']
+        napi1_id = napis[1]['id']
+
+        # set napi threaded and verify
+        nf.napi_set({'id': napi0_id, 'threaded': 1})
+        napi0 = nf.napi_get({'id': napi0_id})
+        ksft_eq(napi0['threaded'], 1)
+        ksft_ne(napi0.get('pid'), None)
+
+        # check it is not set for napi1
+        napi1 = nf.napi_get({'id': napi1_id})
+        ksft_eq(napi1['threaded'], 0)
+        ksft_eq(napi1.get('pid'), None)
+
+        ip(f"link set dev {nsim.ifname} down")
+        ip(f"link set dev {nsim.ifname} up")
+
+        # verify if napi threaded is still set
+        napi0 = nf.napi_get({'id': napi0_id})
+        ksft_eq(napi0['threaded'], 1)
+        ksft_ne(napi0.get('pid'), None)
+
+        # check it is still not set for napi1
+        napi1 = nf.napi_get({'id': napi1_id})
+        ksft_eq(napi1['threaded'], 0)
+        ksft_eq(napi1.get('pid'), None)
+
+        # unset napi threaded and verify
+        nf.napi_set({'id': napi0_id, 'threaded': 0})
+        napi0 = nf.napi_get({'id': napi0_id})
+        ksft_eq(napi0['threaded'], 0)
+        ksft_eq(napi0.get('pid'), None)
+
+        # set threaded at device level
+        system(f"echo 1 > /sys/class/net/{nsim.ifname}/threaded")
+
+        # check napi threaded is set for both napis
+        napi0 = nf.napi_get({'id': napi0_id})
+        ksft_eq(napi0['threaded'], 1)
+        ksft_ne(napi0.get('pid'), None)
+        napi1 = nf.napi_get({'id': napi1_id})
+        ksft_eq(napi1['threaded'], 1)
+        ksft_ne(napi1.get('pid'), None)
+
+        # unset threaded at device level
+        system(f"echo 0 > /sys/class/net/{nsim.ifname}/threaded")
+
+        # check napi threaded is unset for both napis
+        napi0 = nf.napi_get({'id': napi0_id})
+        ksft_eq(napi0['threaded'], 0)
+        ksft_eq(napi0.get('pid'), None)
+        napi1 = nf.napi_get({'id': napi1_id})
+        ksft_eq(napi1['threaded'], 0)
+        ksft_eq(napi1.get('pid'), None)
+
+        # set napi threaded for napi0
+        nf.napi_set({'id': napi0_id, 'threaded': 1})
+        napi0 = nf.napi_get({'id': napi0_id})
+        ksft_eq(napi0['threaded'], 1)
+        ksft_ne(napi0.get('pid'), None)
+
+        # unset threaded at device level
+        system(f"echo 0 > /sys/class/net/{nsim.ifname}/threaded")
+
+        # check napi threaded is unset for both napis
+        napi0 = nf.napi_get({'id': napi0_id})
+        ksft_eq(napi0['threaded'], 0)
+        ksft_eq(napi0.get('pid'), None)
+        napi1 = nf.napi_get({'id': napi1_id})
+        ksft_eq(napi1['threaded'], 0)
+        ksft_eq(napi1.get('pid'), None)
+
 def dev_set_threaded(nf) -> None:
     """
     Test that verifies various cases of napi threaded
@@ -56,8 +141,10 @@ def dev_set_threaded(nf) -> None:
 
         # check napi threaded is set for both napis
         napi0 = nf.napi_get({'id': napi0_id})
+        ksft_eq(napi0['threaded'], 1)
         ksft_ne(napi0.get('pid'), None)
         napi1 = nf.napi_get({'id': napi1_id})
+        ksft_eq(napi1['threaded'], 1)
         ksft_ne(napi1.get('pid'), None)
 
         # unset threaded
@@ -65,8 +152,10 @@ def dev_set_threaded(nf) -> None:
 
         # check napi threaded is unset for both napis
         napi0 = nf.napi_get({'id': napi0_id})
+        ksft_eq(napi0['threaded'], 0)
         ksft_eq(napi0.get('pid'), None)
         napi1 = nf.napi_get({'id': napi1_id})
+        ksft_eq(napi1['threaded'], 0)
         ksft_eq(napi1.get('pid'), None)
 
 def nsim_rxq_reset_down(nf) -> None:
@@ -156,7 +245,7 @@ def page_pool_check(nf) -> None:
 def main() -> None:
     nf = NetdevFamily()
     ksft_run([empty_check, lo_check, page_pool_check, napi_list_check,
-              dev_set_threaded, nsim_rxq_reset_down],
+              dev_set_threaded, napi_set_threaded, nsim_rxq_reset_down],
              args=(nf, ))
     ksft_exit()
 
-- 
cgit v1.2.3


From 9ca48d616ed76b284f946667a3cb7961205c8ee3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 11 Jul 2025 11:39:59 +0000
Subject: tcp: do not accept packets beyond window

Currently, TCP accepts incoming packets which might go beyond the
offered RWIN.

Add to tcp_sequence() the validation of packet end sequence.

Add the corresponding check in the fast path.

We relax this new constraint if the receive queue is empty,
to not freeze flows from buggy peers.

Add a new drop reason: SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20250711114006.480026-2-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dropreason-core.h |  8 +++++++-
 net/ipv4/tcp_input.c          | 22 +++++++++++++++++-----
 2 files changed, 24 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
index b9e78290269e..beb134d55747 100644
--- a/include/net/dropreason-core.h
+++ b/include/net/dropreason-core.h
@@ -45,6 +45,7 @@
 	FN(TCP_LISTEN_OVERFLOW)		\
 	FN(TCP_OLD_SEQUENCE)		\
 	FN(TCP_INVALID_SEQUENCE)	\
+	FN(TCP_INVALID_END_SEQUENCE)	\
 	FN(TCP_INVALID_ACK_SEQUENCE)	\
 	FN(TCP_RESET)			\
 	FN(TCP_INVALID_SYN)		\
@@ -303,8 +304,13 @@ enum skb_drop_reason {
 	SKB_DROP_REASON_TCP_LISTEN_OVERFLOW,
 	/** @SKB_DROP_REASON_TCP_OLD_SEQUENCE: Old SEQ field (duplicate packet) */
 	SKB_DROP_REASON_TCP_OLD_SEQUENCE,
-	/** @SKB_DROP_REASON_TCP_INVALID_SEQUENCE: Not acceptable SEQ field */
+	/** @SKB_DROP_REASON_TCP_INVALID_SEQUENCE: Not acceptable SEQ field. */
 	SKB_DROP_REASON_TCP_INVALID_SEQUENCE,
+	/**
+	 * @SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE:
+	 * Not acceptable END_SEQ field.
+	 */
+	SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE,
 	/**
 	 * @SKB_DROP_REASON_TCP_INVALID_ACK_SEQUENCE: Not acceptable ACK SEQ
 	 * field because ack sequence is not in the window between snd_una
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9b03c44c12b8..f0f9c78654b4 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4391,14 +4391,22 @@ static enum skb_drop_reason tcp_disordered_ack_check(const struct sock *sk,
  * (borrowed from freebsd)
  */
 
-static enum skb_drop_reason tcp_sequence(const struct tcp_sock *tp,
+static enum skb_drop_reason tcp_sequence(const struct sock *sk,
 					 u32 seq, u32 end_seq)
 {
+	const struct tcp_sock *tp = tcp_sk(sk);
+
 	if (before(end_seq, tp->rcv_wup))
 		return SKB_DROP_REASON_TCP_OLD_SEQUENCE;
 
-	if (after(seq, tp->rcv_nxt + tcp_receive_window(tp)))
-		return SKB_DROP_REASON_TCP_INVALID_SEQUENCE;
+	if (after(end_seq, tp->rcv_nxt + tcp_receive_window(tp))) {
+		if (after(seq, tp->rcv_nxt + tcp_receive_window(tp)))
+			return SKB_DROP_REASON_TCP_INVALID_SEQUENCE;
+
+		/* Only accept this packet if receive queue is empty. */
+		if (skb_queue_len(&sk->sk_receive_queue))
+			return SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE;
+	}
 
 	return SKB_NOT_DROPPED_YET;
 }
@@ -5881,7 +5889,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
 
 step1:
 	/* Step 1: check sequence number */
-	reason = tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
+	reason = tcp_sequence(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
 	if (reason) {
 		/* RFC793, page 37: "In all states except SYN-SENT, all reset
 		 * (RST) segments are validated by checking their SEQ-fields."
@@ -6110,6 +6118,10 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
 			if (tcp_checksum_complete(skb))
 				goto csum_error;
 
+			if (after(TCP_SKB_CB(skb)->end_seq,
+				  tp->rcv_nxt + tcp_receive_window(tp)))
+				goto validate;
+
 			if ((int)skb->truesize > sk->sk_forward_alloc)
 				goto step5;
 
@@ -6165,7 +6177,7 @@ slow_path:
 	/*
 	 *	Standard slow path.
 	 */
-
+validate:
 	if (!tcp_validate_incoming(sk, skb, th, 1))
 		return;
 
-- 
cgit v1.2.3


From 6c758062c64dfbd61862801fbde4e0702f4f3a23 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 11 Jul 2025 11:40:00 +0000
Subject: tcp: add LINUX_MIB_BEYOND_WINDOW

Add a new SNMP MIB : LINUX_MIB_BEYOND_WINDOW

Incremented when an incoming packet is received beyond the
receiver window.

nstat -az | grep TcpExtBeyondWindow

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20250711114006.480026-3-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/net_cachelines/snmp.rst | 1 +
 include/net/dropreason-core.h                    | 1 +
 include/uapi/linux/snmp.h                        | 1 +
 net/ipv4/proc.c                                  | 1 +
 net/ipv4/tcp_input.c                             | 1 +
 5 files changed, 5 insertions(+)

(limited to 'include')

diff --git a/Documentation/networking/net_cachelines/snmp.rst b/Documentation/networking/net_cachelines/snmp.rst
index bd44b3eebbef..bce4eb35ec48 100644
--- a/Documentation/networking/net_cachelines/snmp.rst
+++ b/Documentation/networking/net_cachelines/snmp.rst
@@ -36,6 +36,7 @@ unsigned_long  LINUX_MIB_TIMEWAITRECYCLED
 unsigned_long  LINUX_MIB_TIMEWAITKILLED
 unsigned_long  LINUX_MIB_PAWSACTIVEREJECTED
 unsigned_long  LINUX_MIB_PAWSESTABREJECTED
+unsigned_long  LINUX_MIB_BEYOND_WINDOW
 unsigned_long  LINUX_MIB_TSECR_REJECTED
 unsigned_long  LINUX_MIB_PAWS_OLD_ACK
 unsigned_long  LINUX_MIB_PAWS_TW_REJECTED
diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
index beb134d55747..229bb1826f2a 100644
--- a/include/net/dropreason-core.h
+++ b/include/net/dropreason-core.h
@@ -309,6 +309,7 @@ enum skb_drop_reason {
 	/**
 	 * @SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE:
 	 * Not acceptable END_SEQ field.
+	 * Corresponds to LINUX_MIB_BEYOND_WINDOW.
 	 */
 	SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE,
 	/**
diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index 1d234d7e1892..49f5640092a0 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -186,6 +186,7 @@ enum
 	LINUX_MIB_TIMEWAITKILLED,		/* TimeWaitKilled */
 	LINUX_MIB_PAWSACTIVEREJECTED,		/* PAWSActiveRejected */
 	LINUX_MIB_PAWSESTABREJECTED,		/* PAWSEstabRejected */
+	LINUX_MIB_BEYOND_WINDOW,		/* BeyondWindow */
 	LINUX_MIB_TSECRREJECTED,		/* TSEcrRejected */
 	LINUX_MIB_PAWS_OLD_ACK,			/* PAWSOldAck */
 	LINUX_MIB_PAWS_TW_REJECTED,		/* PAWSTimewait */
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index ea2f01584379..65b0d0ab0084 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -189,6 +189,7 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TWKilled", LINUX_MIB_TIMEWAITKILLED),
 	SNMP_MIB_ITEM("PAWSActive", LINUX_MIB_PAWSACTIVEREJECTED),
 	SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED),
+	SNMP_MIB_ITEM("BeyondWindow", LINUX_MIB_BEYOND_WINDOW),
 	SNMP_MIB_ITEM("TSEcrRejected", LINUX_MIB_TSECRREJECTED),
 	SNMP_MIB_ITEM("PAWSOldAck", LINUX_MIB_PAWS_OLD_ACK),
 	SNMP_MIB_ITEM("PAWSTimewait", LINUX_MIB_PAWS_TW_REJECTED),
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index f0f9c78654b4..5e2d82c273e2 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5900,6 +5900,7 @@ step1:
 		if (!th->rst) {
 			if (th->syn)
 				goto syn_challenge;
+			NET_INC_STATS(sock_net(sk), LINUX_MIB_BEYOND_WINDOW);
 			if (!tcp_oow_rate_limited(sock_net(sk), skb,
 						  LINUX_MIB_TCPACKSKIPPEDSEQ,
 						  &tp->last_oow_ack_time))
-- 
cgit v1.2.3


From 75dff0584cce79203ee9968c66c7589150fed591 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 11 Jul 2025 11:40:04 +0000
Subject: tcp: add const to tcp_try_rmem_schedule() and sk_rmem_schedule() skb

These functions do not modify the skb; add a const qualifier.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20250711114006.480026-7-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sock.h   | 2 +-
 net/ipv4/tcp_input.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 0f2443d4ec58..c8a4b283df6f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1553,7 +1553,7 @@ __sk_rmem_schedule(struct sock *sk, int size, bool pfmemalloc)
 }
 
 static inline bool
-sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size)
+sk_rmem_schedule(struct sock *sk, const struct sk_buff *skb, int size)
 {
 	return __sk_rmem_schedule(sk, size, skb_pfmemalloc(skb));
 }
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 78da05933078..39de55ff898e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4888,7 +4888,7 @@ static void tcp_ofo_queue(struct sock *sk)
 static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb);
 static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb);
 
-static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
+static int tcp_try_rmem_schedule(struct sock *sk, const struct sk_buff *skb,
 				 unsigned int size)
 {
 	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
-- 
cgit v1.2.3


From d7c36d6350b5a4b27256eaeeea3b72621a819c9a Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 10 Jun 2025 11:29:21 +0200
Subject: locking/lockdep: Avoid struct return in lock_stats()

Returning a large structure from the lock_stats() function causes clang
to have multiple copies of it on the stack and copy between them, which
can end up exceeding the frame size warning limit:

kernel/locking/lockdep.c:300:25: error: stack frame size (1464) exceeds limit (1280) in 'lock_stats' [-Werror,-Wframe-larger-than]
  300 | struct lock_class_stats lock_stats(struct lock_class *class)

Change the calling conventions to directly operate on the caller's copy,
which apparently is what gcc does already.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Link: https://lore.kernel.org/r/20250610092941.2642847-1-arnd@kernel.org
---
 include/linux/lockdep_types.h |  2 +-
 kernel/locking/lockdep.c      | 27 ++++++++++++---------------
 kernel/locking/lockdep_proc.c |  2 +-
 3 files changed, 14 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h
index 9f361d3ab9d9..eae115a26488 100644
--- a/include/linux/lockdep_types.h
+++ b/include/linux/lockdep_types.h
@@ -175,7 +175,7 @@ struct lock_class_stats {
 	unsigned long			bounces[nr_bounce_types];
 };
 
-struct lock_class_stats lock_stats(struct lock_class *class);
+void lock_stats(struct lock_class *class, struct lock_class_stats *stats);
 void clear_lock_stats(struct lock_class *class);
 #endif
 
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index dd2bbf73718b..0c941418a215 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -297,33 +297,30 @@ static inline void lock_time_add(struct lock_time *src, struct lock_time *dst)
 	dst->nr += src->nr;
 }
 
-struct lock_class_stats lock_stats(struct lock_class *class)
+void lock_stats(struct lock_class *class, struct lock_class_stats *stats)
 {
-	struct lock_class_stats stats;
 	int cpu, i;
 
-	memset(&stats, 0, sizeof(struct lock_class_stats));
+	memset(stats, 0, sizeof(struct lock_class_stats));
 	for_each_possible_cpu(cpu) {
 		struct lock_class_stats *pcs =
 			&per_cpu(cpu_lock_stats, cpu)[class - lock_classes];
 
-		for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
-			stats.contention_point[i] += pcs->contention_point[i];
+		for (i = 0; i < ARRAY_SIZE(stats->contention_point); i++)
+			stats->contention_point[i] += pcs->contention_point[i];
 
-		for (i = 0; i < ARRAY_SIZE(stats.contending_point); i++)
-			stats.contending_point[i] += pcs->contending_point[i];
+		for (i = 0; i < ARRAY_SIZE(stats->contending_point); i++)
+			stats->contending_point[i] += pcs->contending_point[i];
 
-		lock_time_add(&pcs->read_waittime, &stats.read_waittime);
-		lock_time_add(&pcs->write_waittime, &stats.write_waittime);
+		lock_time_add(&pcs->read_waittime, &stats->read_waittime);
+		lock_time_add(&pcs->write_waittime, &stats->write_waittime);
 
-		lock_time_add(&pcs->read_holdtime, &stats.read_holdtime);
-		lock_time_add(&pcs->write_holdtime, &stats.write_holdtime);
+		lock_time_add(&pcs->read_holdtime, &stats->read_holdtime);
+		lock_time_add(&pcs->write_holdtime, &stats->write_holdtime);
 
-		for (i = 0; i < ARRAY_SIZE(stats.bounces); i++)
-			stats.bounces[i] += pcs->bounces[i];
+		for (i = 0; i < ARRAY_SIZE(stats->bounces); i++)
+			stats->bounces[i] += pcs->bounces[i];
 	}
-
-	return stats;
 }
 
 void clear_lock_stats(struct lock_class *class)
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index b52c07c4707c..1916db9aa46b 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -657,7 +657,7 @@ static int lock_stat_open(struct inode *inode, struct file *file)
 			if (!test_bit(idx, lock_classes_in_use))
 				continue;
 			iter->class = class;
-			iter->stats = lock_stats(class);
+			lock_stats(class, &iter->stats);
 			iter++;
 		}
 
-- 
cgit v1.2.3


From 6fd9e1aa078490ed6e79307465269629fcb43018 Mon Sep 17 00:00:00 2001
From: Dave Martin <Dave.Martin@arm.com>
Date: Tue, 1 Jul 2025 14:55:54 +0100
Subject: regset: Fix kerneldoc for struct regset_get() in user_regset

Commit 7717cb9bdd04 ("regset: new method and helpers for it") added a
new interface ->regset_get() for struct user_regset, and commit
1e6986c9db21 ("regset: kill ->get()") got rid of the old interface.

The kerneldoc comment block was never updated to take account of this
change, though.

Update it.

No functional change.

Signed-off-by: Dave Martin <Dave.Martin@arm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Akihiko Odaki <akihiko.odaki@daynix.com>
Reviewed-by: Akihiko Odaki <odaki@rsg.ci.i.u-tokyo.ac.jp>
Link: https://lore.kernel.org/r/20250701135616.29630-2-Dave.Martin@arm.com
Signed-off-by: Kees Cook <kees@kernel.org>
---
 include/linux/regset.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/regset.h b/include/linux/regset.h
index 9061266dd8de..02417e934845 100644
--- a/include/linux/regset.h
+++ b/include/linux/regset.h
@@ -151,7 +151,7 @@ typedef int user_regset_writeback_fn(struct task_struct *target,
  * @align:		Required alignment, in bytes.
  * @bias:		Bias from natural indexing.
  * @core_note_type:	ELF note @n_type value used in core dumps.
- * @get:		Function to fetch values.
+ * @regset_get:		Function to fetch values.
  * @set:		Function to store values.
  * @active:		Function to report if regset is active, or %NULL.
  * @writeback:		Function to write data back to user memory, or %NULL.
-- 
cgit v1.2.3


From 85a7f9cbf8a83cfe0aca04053a832206c4ad1272 Mon Sep 17 00:00:00 2001
From: Dave Martin <Dave.Martin@arm.com>
Date: Tue, 1 Jul 2025 14:55:55 +0100
Subject: regset: Add explicit core note name in struct user_regset

There is currently hard-coded logic spread around the tree for
determining the note name for regset notes emitted in coredumps.

Now that the names are declared explicitly in <uapi/elf.h>, this can be
simplified.

In preparation for getting rid of the special-case logic, add an
explicit core_note_name field in struct user_regset for specifying the
note name explicitly.  To help avoid mistakes, a convenience macro
USER_REGSET_NOTE_TYPE() is provided to set .core_note_type and
.core_note_name based on the note type.

When dumping core, use the new field to set the note name, if the
regset specifies it.

Signed-off-by: Dave Martin <Dave.Martin@arm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Akihiko Odaki <akihiko.odaki@daynix.com>
Acked-by: Alexander Gordeev <agordeev@linux.ibm.com> # s390
Reviewed-by: Akihiko Odaki <odaki@rsg.ci.i.u-tokyo.ac.jp>
Link: https://lore.kernel.org/r/20250701135616.29630-3-Dave.Martin@arm.com
Signed-off-by: Kees Cook <kees@kernel.org>
---
 fs/binfmt_elf.c        |  8 ++++++--
 include/linux/regset.h | 10 ++++++++++
 2 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index a43363d593e5..f1069103ca24 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1727,6 +1727,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
 	for (view_iter = 1; view_iter < view->n; ++view_iter) {
 		const struct user_regset *regset = &view->regsets[view_iter];
 		int note_type = regset->core_note_type;
+		const char *note_name = regset->core_note_name;
 		bool is_fpreg = note_type == NT_PRFPREG;
 		void *data;
 		int ret;
@@ -1747,8 +1748,11 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
 		if (is_fpreg)
 			SET_PR_FPVALID(&t->prstatus);
 
-		fill_note(&t->notes[note_iter], is_fpreg ? NN_PRFPREG : "LINUX",
-			  note_type, ret, data);
+		if (!note_name)
+			note_name = is_fpreg ? NN_PRFPREG : "LINUX";
+
+		fill_note(&t->notes[note_iter], note_name, note_type,
+			  ret, data);
 
 		info->size += notesize(&t->notes[note_iter]);
 		note_iter++;
diff --git a/include/linux/regset.h b/include/linux/regset.h
index 02417e934845..ad1ca6fe04f4 100644
--- a/include/linux/regset.h
+++ b/include/linux/regset.h
@@ -151,6 +151,7 @@ typedef int user_regset_writeback_fn(struct task_struct *target,
  * @align:		Required alignment, in bytes.
  * @bias:		Bias from natural indexing.
  * @core_note_type:	ELF note @n_type value used in core dumps.
+ * @core_note_name:	ELF note name to qualify the note type.
  * @regset_get:		Function to fetch values.
  * @set:		Function to store values.
  * @active:		Function to report if regset is active, or %NULL.
@@ -190,6 +191,10 @@ typedef int user_regset_writeback_fn(struct task_struct *target,
  *
  * If nonzero, @core_note_type gives the n_type field (NT_* value)
  * of the core file note in which this regset's data appears.
+ * @core_note_name specifies the note name.  The preferred way to
+ * specify these two fields is to use the @USER_REGSET_NOTE_TYPE()
+ * macro.
+ *
  * NT_PRSTATUS is a special case in that the regset data starts at
  * offsetof(struct elf_prstatus, pr_reg) into the note data; that is
  * part of the per-machine ELF formats userland knows about.  In
@@ -207,8 +212,13 @@ struct user_regset {
 	unsigned int 			align;
 	unsigned int 			bias;
 	unsigned int 			core_note_type;
+	const char			*core_note_name;
 };
 
+#define USER_REGSET_NOTE_TYPE(type) \
+	.core_note_type			= (NT_ ## type), \
+	.core_note_name			= (NN_ ## type)
+
 /**
  * struct user_regset_view - available regsets
  * @name:	Identifier, e.g. UTS_MACHINE string.
-- 
cgit v1.2.3


From 444020f4bf06fb86805ee7e7ceec0375485fd94d Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 14 Jul 2025 14:21:30 +0200
Subject: wifi: cfg80211: remove scan request n_channels counted_by

This reverts commit e3eac9f32ec0 ("wifi: cfg80211: Annotate struct
cfg80211_scan_request with __counted_by").

This really has been a completely failed experiment. There were
no actual bugs found, and yet at this point we already have four
"fixes" to it, with nothing to show for but code churn, and it
never even made the code any safer.

In all of the cases that ended up getting "fixed", the structure
is also internally inconsistent after the n_channels setting as
the channel list isn't actually filled yet. You cannot scan with
such a structure, that's just wrong. In mac80211, the struct is
also reused multiple times, so initializing it once is no good.

Some previous "fixes" (e.g. one in brcm80211) are also just setting
n_channels before accessing the array, under the assumption that the
code is correct and the array can be accessed, further showing that
the whole thing is just pointless when the allocation count and use
count are not separate.

If we really wanted to fix it, we'd need to separately track the
number of channels allocated and the number of channels currently
used, but given that no bugs were found despite the numerous syzbot
reports, that'd just be a waste of time.

Remove the __counted_by() annotation. We really should also remove
a number of the n_channels settings that are setting up a structure
that's inconsistent, but that can wait.

Reported-by: syzbot+e834e757bd9b3d3e1251@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=e834e757bd9b3d3e1251
Fixes: e3eac9f32ec0 ("wifi: cfg80211: Annotate struct cfg80211_scan_request with __counted_by")
Link: https://patch.msgid.link/20250714142130.9b0bbb7e1f07.I09112ccde72d445e11348fc2bef68942cb2ffc94@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index d1848dc8ec99..10248d527616 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -2690,7 +2690,7 @@ struct cfg80211_scan_request {
 	s8 tsf_report_link_id;
 
 	/* keep last */
-	struct ieee80211_channel *channels[] __counted_by(n_channels);
+	struct ieee80211_channel *channels[];
 };
 
 static inline void get_random_mask_addr(u8 *buf, const u8 *addr, const u8 *mask)
-- 
cgit v1.2.3


From 14450be2332a49445106403492a367412b8c23f4 Mon Sep 17 00:00:00 2001
From: Ilan Peer <ilan.peer@intel.com>
Date: Wed, 9 Jul 2025 23:37:55 +0300
Subject: wifi: cfg80211: Fix interface type validation

Fix a condition that verified valid values of interface types.

Signed-off-by: Ilan Peer <ilan.peer@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20250709233537.7ad199ca5939.I0ac1ff74798bf59a87a57f2e18f2153c308b119b@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 6ec9a8865b8b..f67424ec1085 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -633,7 +633,7 @@ ieee80211_get_sband_iftype_data(const struct ieee80211_supported_band *sband,
 	const struct ieee80211_sband_iftype_data *data;
 	int i;
 
-	if (WARN_ON(iftype >= NL80211_IFTYPE_MAX))
+	if (WARN_ON(iftype >= NUM_NL80211_IFTYPES))
 		return NULL;
 
 	if (iftype == NL80211_IFTYPE_AP_VLAN)
-- 
cgit v1.2.3


From c932be7262323011ae8caa050811300b85347050 Mon Sep 17 00:00:00 2001
From: Yuvarani V <quic_yuvarani@quicinc.com>
Date: Thu, 10 Jul 2025 11:04:27 +0530
Subject: wifi: cfg80211: parse attribute to update unsolicited probe response
 template

At present, the updated unsolicited broadcast probe response template is
not processed during userspace commands such as channel switch or color
change. This leads to an issue where older incorrect unsolicited probe
response is still used during these events.

Add support to parse the netlink attribute and store it so that
mac80211/drivers can use it to set the BSS_CHANGED_UNSOL_BCAST_PROBE_RESP
flag in order to send the updated unsolicited broadcast probe response
templates during these events.

Signed-off-by: Yuvarani V <quic_yuvarani@quicinc.com>
Signed-off-by: Aditya Kumar Singh <aditya.kumar.singh@oss.qualcomm.com>
Link: https://patch.msgid.link/20250710-update_unsol_bcast_probe_resp-v2-1-31aca39d3b30@oss.qualcomm.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h |  4 ++++
 net/wireless/nl80211.c | 18 ++++++++++++++++++
 2 files changed, 22 insertions(+)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index f67424ec1085..77bc17d6e96d 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1526,6 +1526,7 @@ struct cfg80211_ap_update {
  * @n_counter_offsets_beacon: number of csa counters the beacon (tail)
  * @n_counter_offsets_presp: number of csa counters in the probe response
  * @beacon_after: beacon data to be used on the new channel
+ * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters
  * @radar_required: whether radar detection is required on the new channel
  * @block_tx: whether transmissions should be blocked while changing
  * @count: number of beacons until switch
@@ -1540,6 +1541,7 @@ struct cfg80211_csa_settings {
 	unsigned int n_counter_offsets_beacon;
 	unsigned int n_counter_offsets_presp;
 	struct cfg80211_beacon_data beacon_after;
+	struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp;
 	bool radar_required;
 	bool block_tx;
 	u8 count;
@@ -1555,6 +1557,7 @@ struct cfg80211_csa_settings {
  * @counter_offset_beacon: offsets of the counters within the beacon (tail)
  * @counter_offset_presp: offsets of the counters within the probe response
  * @beacon_next: beacon data to be used after the color change
+ * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters
  * @count: number of beacons until the color change
  * @color: the color used after the change
  * @link_id: defines the link on which color change is expected during MLO.
@@ -1565,6 +1568,7 @@ struct cfg80211_color_change_settings {
 	u16 counter_offset_beacon;
 	u16 counter_offset_presp;
 	struct cfg80211_beacon_data beacon_next;
+	struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp;
 	u8 count;
 	u8 color;
 	u8 link_id;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 4e6c0a4e2a82..1ee14592828d 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -10995,6 +10995,16 @@ skip_beacons:
 	if (info->attrs[NL80211_ATTR_CH_SWITCH_BLOCK_TX])
 		params.block_tx = true;
 
+	if ((wdev->iftype == NL80211_IFTYPE_AP ||
+	     wdev->iftype == NL80211_IFTYPE_P2P_GO) &&
+	    info->attrs[NL80211_ATTR_UNSOL_BCAST_PROBE_RESP]) {
+		err = nl80211_parse_unsol_bcast_probe_resp(
+			rdev, info->attrs[NL80211_ATTR_UNSOL_BCAST_PROBE_RESP],
+			&params.unsol_bcast_probe_resp);
+		if (err)
+			goto free;
+	}
+
 	params.link_id = link_id;
 	err = rdev_channel_switch(rdev, dev, &params);
 
@@ -16797,6 +16807,14 @@ static int nl80211_color_change(struct sk_buff *skb, struct genl_info *info)
 		params.counter_offset_presp = offset;
 	}
 
+	if (info->attrs[NL80211_ATTR_UNSOL_BCAST_PROBE_RESP]) {
+		err = nl80211_parse_unsol_bcast_probe_resp(
+			rdev, info->attrs[NL80211_ATTR_UNSOL_BCAST_PROBE_RESP],
+			&params.unsol_bcast_probe_resp);
+		if (err)
+			goto out;
+	}
+
 	params.link_id = nl80211_link_id(info->attrs);
 	err = rdev_color_change(rdev, dev, &params);
 
-- 
cgit v1.2.3


From 1aeed732f4f885ad36280ca4afb331fa42bf7263 Mon Sep 17 00:00:00 2001
From: Matt Johnston <matt@codeconstruct.com.au>
Date: Thu, 10 Jul 2025 16:55:58 +0800
Subject: net: mctp: Use hashtable for binds

Ensure that a specific EID (remote or local) bind will match in
preference to a MCTP_ADDR_ANY bind.

This adds infrastructure for binding a socket to receive messages from a
specific remote peer address; a future commit will expose an API for
this.

Signed-off-by: Matt Johnston <matt@codeconstruct.com.au>
Link: https://patch.msgid.link/20250710-mctp-bind-v4-5-8ec2f6460c56@codeconstruct.com.au
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/netns/mctp.h | 20 ++++++++++---
 net/mctp/af_mctp.c       | 11 ++++---
 net/mctp/route.c         | 75 +++++++++++++++++++++++++++++++++++++++---------
 3 files changed, 84 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/net/netns/mctp.h b/include/net/netns/mctp.h
index 1db8f9aaddb4..89555f90b97b 100644
--- a/include/net/netns/mctp.h
+++ b/include/net/netns/mctp.h
@@ -6,19 +6,25 @@
 #ifndef __NETNS_MCTP_H__
 #define __NETNS_MCTP_H__
 
+#include <linux/hash.h>
+#include <linux/hashtable.h>
 #include <linux/mutex.h>
 #include <linux/types.h>
 
+#define MCTP_BINDS_BITS 7
+
 struct netns_mctp {
 	/* Only updated under RTNL, entries freed via RCU */
 	struct list_head routes;
 
-	/* Bound sockets: list of sockets bound by type.
-	 * This list is updated from non-atomic contexts (under bind_lock),
-	 * and read (under rcu) in packet rx
+	/* Bound sockets: hash table of sockets, keyed by
+	 * (type, src_eid, dest_eid).
+	 * Specific src_eid/dest_eid entries also have an entry for
+	 * MCTP_ADDR_ANY. This list is updated from non-atomic contexts
+	 * (under bind_lock), and read (under rcu) in packet rx.
 	 */
 	struct mutex bind_lock;
-	struct hlist_head binds;
+	DECLARE_HASHTABLE(binds, MCTP_BINDS_BITS);
 
 	/* tag allocations. This list is read and updated from atomic contexts,
 	 * but elements are free()ed after a RCU grace-period
@@ -34,4 +40,10 @@ struct netns_mctp {
 	struct list_head neighbours;
 };
 
+static inline u32 mctp_bind_hash(u8 type, u8 local_addr, u8 peer_addr)
+{
+	return hash_32(type | (u32)local_addr << 8 | (u32)peer_addr << 16,
+		       MCTP_BINDS_BITS);
+}
+
 #endif /* __NETNS_MCTP_H__ */
diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c
index 20edaf840a60..16341de5cf28 100644
--- a/net/mctp/af_mctp.c
+++ b/net/mctp/af_mctp.c
@@ -626,17 +626,17 @@ static int mctp_sk_hash(struct sock *sk)
 	struct net *net = sock_net(sk);
 	struct sock *existing;
 	struct mctp_sock *msk;
+	u32 hash;
 	int rc;
 
 	msk = container_of(sk, struct mctp_sock, sk);
 
-	/* Bind lookup runs under RCU, remain live during that. */
-	sock_set_flag(sk, SOCK_RCU_FREE);
+	hash = mctp_bind_hash(msk->bind_type, msk->bind_addr, MCTP_ADDR_ANY);
 
 	mutex_lock(&net->mctp.bind_lock);
 
 	/* Prevent duplicate binds. */
-	sk_for_each(existing, &net->mctp.binds) {
+	sk_for_each(existing, &net->mctp.binds[hash]) {
 		struct mctp_sock *mex =
 			container_of(existing, struct mctp_sock, sk);
 
@@ -648,7 +648,10 @@ static int mctp_sk_hash(struct sock *sk)
 		}
 	}
 
-	sk_add_node_rcu(sk, &net->mctp.binds);
+	/* Bind lookup runs under RCU, remain live during that. */
+	sock_set_flag(sk, SOCK_RCU_FREE);
+
+	sk_add_node_rcu(sk, &net->mctp.binds[hash]);
 	rc = 0;
 
 out:
diff --git a/net/mctp/route.c b/net/mctp/route.c
index a20d6b11d418..69cfb0e6c545 100644
--- a/net/mctp/route.c
+++ b/net/mctp/route.c
@@ -40,33 +40,32 @@ static int mctp_dst_discard(struct mctp_dst *dst, struct sk_buff *skb)
 	return 0;
 }
 
-static struct mctp_sock *mctp_lookup_bind(struct net *net, struct sk_buff *skb)
+static struct mctp_sock *mctp_lookup_bind_details(struct net *net,
+						  struct sk_buff *skb,
+						  u8 type, u8 dest,
+						  u8 src, bool allow_net_any)
 {
 	struct mctp_skb_cb *cb = mctp_cb(skb);
-	struct mctp_hdr *mh;
 	struct sock *sk;
-	u8 type;
-
-	WARN_ON(!rcu_read_lock_held());
-
-	/* TODO: look up in skb->cb? */
-	mh = mctp_hdr(skb);
+	u8 hash;
 
-	if (!skb_headlen(skb))
-		return NULL;
+	WARN_ON_ONCE(!rcu_read_lock_held());
 
-	type = (*(u8 *)skb->data) & 0x7f;
+	hash = mctp_bind_hash(type, dest, src);
 
-	sk_for_each_rcu(sk, &net->mctp.binds) {
+	sk_for_each_rcu(sk, &net->mctp.binds[hash]) {
 		struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk);
 
+		if (!allow_net_any && msk->bind_net == MCTP_NET_ANY)
+			continue;
+
 		if (msk->bind_net != MCTP_NET_ANY && msk->bind_net != cb->net)
 			continue;
 
 		if (msk->bind_type != type)
 			continue;
 
-		if (!mctp_address_matches(msk->bind_addr, mh->dest))
+		if (!mctp_address_matches(msk->bind_addr, dest))
 			continue;
 
 		return msk;
@@ -75,6 +74,54 @@ static struct mctp_sock *mctp_lookup_bind(struct net *net, struct sk_buff *skb)
 	return NULL;
 }
 
+static struct mctp_sock *mctp_lookup_bind(struct net *net, struct sk_buff *skb)
+{
+	struct mctp_sock *msk;
+	struct mctp_hdr *mh;
+	u8 type;
+
+	/* TODO: look up in skb->cb? */
+	mh = mctp_hdr(skb);
+
+	if (!skb_headlen(skb))
+		return NULL;
+
+	type = (*(u8 *)skb->data) & 0x7f;
+
+	/* Look for binds in order of widening scope. A given destination or
+	 * source address also implies matching on a particular network.
+	 *
+	 * - Matching destination and source
+	 * - Matching destination
+	 * - Matching source
+	 * - Matching network, any address
+	 * - Any network or address
+	 */
+
+	msk = mctp_lookup_bind_details(net, skb, type, mh->dest, mh->src,
+				       false);
+	if (msk)
+		return msk;
+	msk = mctp_lookup_bind_details(net, skb, type, MCTP_ADDR_ANY, mh->src,
+				       false);
+	if (msk)
+		return msk;
+	msk = mctp_lookup_bind_details(net, skb, type, mh->dest, MCTP_ADDR_ANY,
+				       false);
+	if (msk)
+		return msk;
+	msk = mctp_lookup_bind_details(net, skb, type, MCTP_ADDR_ANY,
+				       MCTP_ADDR_ANY, false);
+	if (msk)
+		return msk;
+	msk = mctp_lookup_bind_details(net, skb, type, MCTP_ADDR_ANY,
+				       MCTP_ADDR_ANY, true);
+	if (msk)
+		return msk;
+
+	return NULL;
+}
+
 /* A note on the key allocations.
  *
  * struct net->mctp.keys contains our set of currently-allocated keys for
@@ -1671,7 +1718,7 @@ static int __net_init mctp_routes_net_init(struct net *net)
 	struct netns_mctp *ns = &net->mctp;
 
 	INIT_LIST_HEAD(&ns->routes);
-	INIT_HLIST_HEAD(&ns->binds);
+	hash_init(ns->binds);
 	mutex_init(&ns->bind_lock);
 	INIT_HLIST_HEAD(&ns->keys);
 	spin_lock_init(&ns->keys_lock);
-- 
cgit v1.2.3


From 3549eb08e5505823857838b5cf5f08567702d054 Mon Sep 17 00:00:00 2001
From: Matt Johnston <matt@codeconstruct.com.au>
Date: Thu, 10 Jul 2025 16:55:59 +0800
Subject: net: mctp: Allow limiting binds to a peer address

Prior to calling bind() a program may call connect() on a socket to
restrict to a remote peer address.

Using connect() is the normal mechanism to specify a remote network
peer, so we use that here. In MCTP connect() is only used for bound
sockets - send() is not available for MCTP since a tag must be provided
for each message.

The smctp_type must match between connect() and bind() calls.

Signed-off-by: Matt Johnston <matt@codeconstruct.com.au>
Link: https://patch.msgid.link/20250710-mctp-bind-v4-6-8ec2f6460c56@codeconstruct.com.au
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/mctp.h |   5 ++-
 net/mctp/af_mctp.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 net/mctp/route.c   |   6 ++-
 3 files changed, 108 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/mctp.h b/include/net/mctp.h
index ac4f4ecdfc24..c3207ce98f07 100644
--- a/include/net/mctp.h
+++ b/include/net/mctp.h
@@ -69,7 +69,10 @@ struct mctp_sock {
 
 	/* bind() params */
 	unsigned int	bind_net;
-	mctp_eid_t	bind_addr;
+	mctp_eid_t	bind_local_addr;
+	mctp_eid_t	bind_peer_addr;
+	unsigned int	bind_peer_net;
+	bool		bind_peer_set;
 	__u8		bind_type;
 
 	/* sendmsg()/recvmsg() uses struct sockaddr_mctp_ext */
diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c
index 16341de5cf28..df4e8cf33899 100644
--- a/net/mctp/af_mctp.c
+++ b/net/mctp/af_mctp.c
@@ -79,7 +79,7 @@ static int mctp_bind(struct socket *sock, struct sockaddr *addr, int addrlen)
 		goto out_release;
 	}
 
-	msk->bind_addr = smctp->smctp_addr.s_addr;
+	msk->bind_local_addr = smctp->smctp_addr.s_addr;
 
 	/* MCTP_NET_ANY with a specific EID is resolved to the default net
 	 * at bind() time.
@@ -87,13 +87,35 @@ static int mctp_bind(struct socket *sock, struct sockaddr *addr, int addrlen)
 	 * lookup time.
 	 */
 	if (smctp->smctp_network == MCTP_NET_ANY &&
-	    msk->bind_addr != MCTP_ADDR_ANY) {
+	    msk->bind_local_addr != MCTP_ADDR_ANY) {
 		msk->bind_net = mctp_default_net(net);
 	} else {
 		msk->bind_net = smctp->smctp_network;
 	}
 
-	msk->bind_type = smctp->smctp_type & 0x7f; /* ignore the IC bit */
+	/* ignore the IC bit */
+	smctp->smctp_type &= 0x7f;
+
+	if (msk->bind_peer_set) {
+		if (msk->bind_type != smctp->smctp_type) {
+			/* Prior connect() had a different type */
+			rc = -EINVAL;
+			goto out_release;
+		}
+
+		if (msk->bind_net == MCTP_NET_ANY) {
+			/* Restrict to the network passed to connect() */
+			msk->bind_net = msk->bind_peer_net;
+		}
+
+		if (msk->bind_net != msk->bind_peer_net) {
+			/* connect() had a different net to bind() */
+			rc = -EINVAL;
+			goto out_release;
+		}
+	} else {
+		msk->bind_type = smctp->smctp_type;
+	}
 
 	rc = sk->sk_prot->hash(sk);
 
@@ -103,6 +125,67 @@ out_release:
 	return rc;
 }
 
+/* Used to set a specific peer prior to bind. Not used for outbound
+ * connections (Tag Owner set) since MCTP is a datagram protocol.
+ */
+static int mctp_connect(struct socket *sock, struct sockaddr *addr,
+			int addrlen, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk);
+	struct net *net = sock_net(&msk->sk);
+	struct sockaddr_mctp *smctp;
+	int rc;
+
+	if (addrlen != sizeof(*smctp))
+		return -EINVAL;
+
+	if (addr->sa_family != AF_MCTP)
+		return -EAFNOSUPPORT;
+
+	/* It's a valid sockaddr for MCTP, cast and do protocol checks */
+	smctp = (struct sockaddr_mctp *)addr;
+
+	if (!mctp_sockaddr_is_ok(smctp))
+		return -EINVAL;
+
+	/* Can't bind by tag */
+	if (smctp->smctp_tag)
+		return -EINVAL;
+
+	/* IC bit must be unset */
+	if (smctp->smctp_type & 0x80)
+		return -EINVAL;
+
+	lock_sock(sk);
+
+	if (sk_hashed(sk)) {
+		/* bind() already */
+		rc = -EADDRINUSE;
+		goto out_release;
+	}
+
+	if (msk->bind_peer_set) {
+		/* connect() already */
+		rc = -EADDRINUSE;
+		goto out_release;
+	}
+
+	msk->bind_peer_set = true;
+	msk->bind_peer_addr = smctp->smctp_addr.s_addr;
+	msk->bind_type = smctp->smctp_type;
+	if (smctp->smctp_network == MCTP_NET_ANY)
+		msk->bind_peer_net = mctp_default_net(net);
+	else
+		msk->bind_peer_net = smctp->smctp_network;
+
+	rc = 0;
+
+out_release:
+	release_sock(sk);
+	return rc;
+}
+
 static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
 {
 	DECLARE_SOCKADDR(struct sockaddr_mctp *, addr, msg->msg_name);
@@ -546,7 +629,7 @@ static const struct proto_ops mctp_dgram_ops = {
 	.family		= PF_MCTP,
 	.release	= mctp_release,
 	.bind		= mctp_bind,
-	.connect	= sock_no_connect,
+	.connect	= mctp_connect,
 	.socketpair	= sock_no_socketpair,
 	.accept		= sock_no_accept,
 	.getname	= sock_no_getname,
@@ -613,6 +696,7 @@ static int mctp_sk_init(struct sock *sk)
 
 	INIT_HLIST_HEAD(&msk->keys);
 	timer_setup(&msk->key_expiry, mctp_sk_expire_keys, 0);
+	msk->bind_peer_set = false;
 	return 0;
 }
 
@@ -626,12 +710,17 @@ static int mctp_sk_hash(struct sock *sk)
 	struct net *net = sock_net(sk);
 	struct sock *existing;
 	struct mctp_sock *msk;
+	mctp_eid_t remote;
 	u32 hash;
 	int rc;
 
 	msk = container_of(sk, struct mctp_sock, sk);
 
-	hash = mctp_bind_hash(msk->bind_type, msk->bind_addr, MCTP_ADDR_ANY);
+	if (msk->bind_peer_set)
+		remote = msk->bind_peer_addr;
+	else
+		remote = MCTP_ADDR_ANY;
+	hash = mctp_bind_hash(msk->bind_type, msk->bind_local_addr, remote);
 
 	mutex_lock(&net->mctp.bind_lock);
 
@@ -640,8 +729,12 @@ static int mctp_sk_hash(struct sock *sk)
 		struct mctp_sock *mex =
 			container_of(existing, struct mctp_sock, sk);
 
+		bool same_peer = (mex->bind_peer_set && msk->bind_peer_set &&
+				  mex->bind_peer_addr == msk->bind_peer_addr) ||
+				 (!mex->bind_peer_set && !msk->bind_peer_set);
+
 		if (mex->bind_type == msk->bind_type &&
-		    mex->bind_addr == msk->bind_addr &&
+		    mex->bind_local_addr == msk->bind_local_addr && same_peer &&
 		    mex->bind_net == msk->bind_net) {
 			rc = -EADDRINUSE;
 			goto out;
diff --git a/net/mctp/route.c b/net/mctp/route.c
index 69cfb0e6c545..2b2b958ef6a3 100644
--- a/net/mctp/route.c
+++ b/net/mctp/route.c
@@ -65,7 +65,11 @@ static struct mctp_sock *mctp_lookup_bind_details(struct net *net,
 		if (msk->bind_type != type)
 			continue;
 
-		if (!mctp_address_matches(msk->bind_addr, dest))
+		if (msk->bind_peer_set &&
+		    !mctp_address_matches(msk->bind_peer_addr, src))
+			continue;
+
+		if (!mctp_address_matches(msk->bind_local_addr, dest))
 			continue;
 
 		return msk;
-- 
cgit v1.2.3


From d9c37a4904ec21ef7d45880fe023c11341869c28 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Mon, 14 Jul 2025 14:55:57 +0930
Subject: fs: add a new remove_bdev() callback

Currently all filesystems which implement super_operations::shutdown()
can not afford losing a device.

Thus fs_bdev_mark_dead() will just call the ->shutdown() callback for the
involved filesystem.

But it will no longer be the case, as multi-device filesystems like
btrfs and bcachefs can handle certain device loss without the need to
shutdown the whole filesystem.

To allow those multi-device filesystems to be integrated to use
fs_holder_ops:

- Add a new super_operations::remove_bdev() callback

- Try ->remove_bdev() callback first inside fs_bdev_mark_dead()
  If the callback returned 0, meaning the fs can handle the device
  loss, then exit without doing anything else.

  If there is no such callback or the callback returned non-zero value,
  continue to shutdown the filesystem as usual.

This means the new remove_bdev() should only do the check on whether the
operation can continue, and if so do the fs specific handling.
The shutdown handling should still be handled by the existing
->shutdown() callback.

For all existing filesystems with shutdown callback, there is no change
to the code nor behavior.

Btrfs is going to implement both the ->remove_bdev() and ->shutdown()
callbacks soon.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Link: https://lore.kernel.org/09909fcff7f2763cc037fec97ac2482bdc0a12cb.1752470276.git.wqu@suse.com
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/super.c         | 11 +++++++++++
 include/linux/fs.h |  9 +++++++++
 2 files changed, 20 insertions(+)

(limited to 'include')

diff --git a/fs/super.c b/fs/super.c
index 21799e213fd7..53e135d84867 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1457,6 +1457,17 @@ static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
 	if (!sb)
 		return;
 
+	if (sb->s_op->remove_bdev) {
+		int ret;
+
+		ret = sb->s_op->remove_bdev(sb, bdev);
+		if (!ret) {
+			super_unlock_shared(sb);
+			return;
+		}
+		/* Fallback to shutdown. */
+	}
+
 	if (!surprise)
 		sync_filesystem(sb);
 	shrink_dcache_sb(sb);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 96c7925a6551..5150db41109a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2363,6 +2363,15 @@ struct super_operations {
 				  struct shrink_control *);
 	long (*free_cached_objects)(struct super_block *,
 				    struct shrink_control *);
+	/*
+	 * If a filesystem can support graceful removal of a device and
+	 * continue read-write operations, implement this callback.
+	 *
+	 * Return 0 if the filesystem can continue read-write.
+	 * Non-zero return value or no such callback means the fs will be shutdown
+	 * as usual.
+	 */
+	int (*remove_bdev)(struct super_block *sb, struct block_device *bdev);
 	void (*shutdown)(struct super_block *sb);
 };
 
-- 
cgit v1.2.3


From f99d4fccd2185176baf4ecac9a49d280fc62b953 Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Sat, 12 Jul 2025 15:40:17 +0800
Subject: dt-bindings: power: Add A523 PPU and PCK600 power controllers

The A523 PPU is likely the same kind of hardware seen on previous SoCs.

The A523 PCK600, as the name suggests, is likely a customized version
of ARM's PCK-600 power controller. Comparing the BSP driver against
ARM's PPU datasheet shows that the basic registers line up, but
Allwinner's hardware has some additional delay controls in the reserved
register range. As such it is likely not fully compatible with the
standard ARM version.

Document A523 PPU and PCK600 compatibles.

Also reorder the compatible string entries so they are grouped and
ordered by family first, then by SoC model.

Reviewed-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Rob Herring (Arm) <robh@kernel.org>
Signed-off-by: Chen-Yu Tsai <wens@csie.org>
Link: https://lore.kernel.org/r/20250712074021.805953-2-wens@kernel.org
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 .../bindings/power/allwinner,sun20i-d1-ppu.yaml           |  4 +++-
 include/dt-bindings/power/allwinner,sun55i-a523-pck-600.h | 15 +++++++++++++++
 include/dt-bindings/power/allwinner,sun55i-a523-ppu.h     | 12 ++++++++++++
 3 files changed, 30 insertions(+), 1 deletion(-)
 create mode 100644 include/dt-bindings/power/allwinner,sun55i-a523-pck-600.h
 create mode 100644 include/dt-bindings/power/allwinner,sun55i-a523-ppu.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/power/allwinner,sun20i-d1-ppu.yaml b/Documentation/devicetree/bindings/power/allwinner,sun20i-d1-ppu.yaml
index f578be6a3bc8..a28e75a9cb6a 100644
--- a/Documentation/devicetree/bindings/power/allwinner,sun20i-d1-ppu.yaml
+++ b/Documentation/devicetree/bindings/power/allwinner,sun20i-d1-ppu.yaml
@@ -16,8 +16,10 @@ description:
 properties:
   compatible:
     enum:
-      - allwinner,sun20i-d1-ppu
       - allwinner,sun8i-v853-ppu
+      - allwinner,sun20i-d1-ppu
+      - allwinner,sun55i-a523-pck-600
+      - allwinner,sun55i-a523-ppu
 
   reg:
     maxItems: 1
diff --git a/include/dt-bindings/power/allwinner,sun55i-a523-pck-600.h b/include/dt-bindings/power/allwinner,sun55i-a523-pck-600.h
new file mode 100644
index 000000000000..6b3d8ea7bb69
--- /dev/null
+++ b/include/dt-bindings/power/allwinner,sun55i-a523-pck-600.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+
+#ifndef _DT_BINDINGS_POWER_SUN55I_A523_PCK600_H_
+#define _DT_BINDINGS_POWER_SUN55I_A523_PCK600_H_
+
+#define PD_VE			0
+#define PD_GPU			1
+#define PD_VI			2
+#define PD_VO0			3
+#define PD_VO1			4
+#define PD_DE			5
+#define PD_NAND			6
+#define PD_PCIE			7
+
+#endif /* _DT_BINDINGS_POWER_SUN55I_A523_PCK600_H_ */
diff --git a/include/dt-bindings/power/allwinner,sun55i-a523-ppu.h b/include/dt-bindings/power/allwinner,sun55i-a523-ppu.h
new file mode 100644
index 000000000000..bc9aba73c19a
--- /dev/null
+++ b/include/dt-bindings/power/allwinner,sun55i-a523-ppu.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+
+#ifndef _DT_BINDINGS_POWER_SUN55I_A523_PPU_H_
+#define _DT_BINDINGS_POWER_SUN55I_A523_PPU_H_
+
+#define PD_DSP			0
+#define PD_NPU			1
+#define PD_AUDIO		2
+#define PD_SRAM			3
+#define PD_RISCV		4
+
+#endif /* _DT_BINDINGS_POWER_SUN55I_A523_PPU_H_ */
-- 
cgit v1.2.3


From bd116214d53c66dc7f863822af171b20c06b4784 Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Date: Tue, 15 Jul 2025 13:53:20 +0200
Subject: blktrace: add zoned block commands to blk_fill_rwbs

Add zoned block commands to blk_fill_rwbs:

- ZONE APPEND will be decoded as 'ZA'
- ZONE RESET will be decoded as 'ZR'
- ZONE RESET ALL will be decoded as 'ZRA'
- ZONE FINISH will be decoded as 'ZF'
- ZONE OPEN will be decoded as 'ZO'
- ZONE CLOSE will be decoded as 'ZC'

Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20250715115324.53308-2-johannes.thumshirn@wdc.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/trace/events/block.h |  2 +-
 kernel/trace/blktrace.c      | 25 +++++++++++++++++++++++++
 2 files changed, 26 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 14a924c0e303..d88669b3ce02 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -11,7 +11,7 @@
 #include <linux/tracepoint.h>
 #include <uapi/linux/ioprio.h>
 
-#define RWBS_LEN	9
+#define RWBS_LEN	10
 
 #define IOPRIO_CLASS_STRINGS \
 	{ IOPRIO_CLASS_NONE,	"none" }, \
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 3f6a7bdc6edf..47168d2afbf1 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1875,6 +1875,29 @@ void blk_fill_rwbs(char *rwbs, blk_opf_t opf)
 	case REQ_OP_READ:
 		rwbs[i++] = 'R';
 		break;
+	case REQ_OP_ZONE_APPEND:
+		rwbs[i++] = 'Z';
+		rwbs[i++] = 'A';
+		break;
+	case REQ_OP_ZONE_RESET:
+	case REQ_OP_ZONE_RESET_ALL:
+		rwbs[i++] = 'Z';
+		rwbs[i++] = 'R';
+		if ((opf & REQ_OP_MASK) == REQ_OP_ZONE_RESET_ALL)
+			rwbs[i++] = 'A';
+		break;
+	case REQ_OP_ZONE_FINISH:
+		rwbs[i++] = 'Z';
+		rwbs[i++] = 'F';
+		break;
+	case REQ_OP_ZONE_OPEN:
+		rwbs[i++] = 'Z';
+		rwbs[i++] = 'O';
+		break;
+	case REQ_OP_ZONE_CLOSE:
+		rwbs[i++] = 'Z';
+		rwbs[i++] = 'C';
+		break;
 	default:
 		rwbs[i++] = 'N';
 	}
@@ -1890,6 +1913,8 @@ void blk_fill_rwbs(char *rwbs, blk_opf_t opf)
 	if (opf & REQ_ATOMIC)
 		rwbs[i++] = 'U';
 
+	WARN_ON_ONCE(i >= RWBS_LEN);
+
 	rwbs[i] = '\0';
 }
 EXPORT_SYMBOL_GPL(blk_fill_rwbs);
-- 
cgit v1.2.3


From 4cc21a00762b5bd4dcd743317a56c2dba500fd89 Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Date: Tue, 15 Jul 2025 13:53:22 +0200
Subject: block: add tracepoint for blk_zone_update_request_bio

Add a tracepoint in blk_zone_append_update_request_bio() to trace the bio
sector update on ZONE APPEND completions.

An example for this tracepoint is as follows:

<idle>-0 [001] d.h1.  381.746444: blk_zone_update_request_bio: 259,5 ZAS 131072 () 1048832 + 256 none,0,0 [swapper/1]

Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20250715115324.53308-4-johannes.thumshirn@wdc.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-zoned.c            |  3 +++
 include/trace/events/block.h | 11 +++++++++++
 2 files changed, 14 insertions(+)

(limited to 'include')

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 88deb751b621..69af3caa3b1c 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -17,6 +17,8 @@
 #include <linux/refcount.h>
 #include <linux/mempool.h>
 
+#include <trace/events/block.h>
+
 #include "blk.h"
 #include "blk-mq-sched.h"
 #include "blk-mq-debugfs.h"
@@ -1198,6 +1200,7 @@ void blk_zone_append_update_request_bio(struct request *rq, struct bio *bio)
 	 * lookup the zone write plug.
 	 */
 	bio->bi_iter.bi_sector = rq->__sector;
+	trace_blk_zone_append_update_request_bio(rq);
 }
 
 void blk_zone_write_plug_bio_endio(struct bio *bio)
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index d88669b3ce02..4855abdf9880 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -404,6 +404,17 @@ DEFINE_EVENT(block_bio, block_getrq,
 	TP_ARGS(bio)
 );
 
+/**
+ * blk_zone_append_update_request_bio - update bio sector after a zone append
+ * @rq: the completed zone append request
+ *
+ * Update the bio's bi_sector after a zone append command has been completed.
+ */
+DEFINE_EVENT(block_rq, blk_zone_append_update_request_bio,
+	     TP_PROTO(struct request *rq),
+	     TP_ARGS(rq)
+);
+
 /**
  * block_plug - keep operations requests in request queue
  * @q: request queue to plug
-- 
cgit v1.2.3


From 4020d22f0d08ccfc0d00a254a90250ff07333607 Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Date: Tue, 15 Jul 2025 13:53:23 +0200
Subject: block: add tracepoint for blkdev_zone_mgmt

Add a tracepoint for blkdev_zone_mgmt to trace zone management commands
submitted by higher layers like file systems or user space.

An example output for this tracepoint is as follows:

  mkfs.btrfs-203  [001] .....  42.877493: blkdev_zone_mgmt: 8,0 ZRS 5242880 + 0

This example output shows a REQ_OP_ZONE_RESET operation submitted by
mkfs.btrfs.

Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20250715115324.53308-5-johannes.thumshirn@wdc.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-zoned.c            |  2 ++
 include/trace/events/block.h | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+)

(limited to 'include')

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 69af3caa3b1c..70003a4bc7db 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -179,6 +179,7 @@ static int blkdev_zone_reset_all(struct block_device *bdev)
 	struct bio bio;
 
 	bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC);
+	trace_blkdev_zone_mgmt(&bio, 0);
 	return submit_bio_wait(&bio);
 }
 
@@ -242,6 +243,7 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
 		cond_resched();
 	}
 
+	trace_blkdev_zone_mgmt(bio, nr_sectors);
 	ret = submit_bio_wait(bio);
 	bio_put(bio);
 
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 4855abdf9880..ff7698efdfde 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -599,6 +599,40 @@ TRACE_EVENT(block_rq_remap,
 		  (unsigned long long)__entry->old_sector, __entry->nr_bios)
 );
 
+/**
+ * blkdev_zone_mgmt - Execute a zone management operation on a range of zones
+ * @bio: The block IO operation sent down to the device
+ * @nr_sectors: The number of sectors affected by this operation
+ *
+ * Execute a zone management operation on a specified range of zones. This
+ * range is encoded in %nr_sectors, which has to be a multiple of the zone
+ * size.
+ */
+TRACE_EVENT(blkdev_zone_mgmt,
+
+	TP_PROTO(struct bio *bio, sector_t nr_sectors),
+
+	TP_ARGS(bio, nr_sectors),
+
+	TP_STRUCT__entry(
+	    __field(  dev_t,	dev		)
+	    __field(  sector_t,	sector		)
+	    __field(  sector_t, nr_sectors	)
+	    __array(  char,	rwbs,	RWBS_LEN)
+	),
+
+	TP_fast_assign(
+	    __entry->dev	= bio_dev(bio);
+	    __entry->sector	= bio->bi_iter.bi_sector;
+	    __entry->nr_sectors	= bio_sectors(bio);
+	    blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
+        ),
+
+	TP_printk("%d,%d %s %llu + %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sectors)
+);
 #endif /* _TRACE_BLOCK_H */
 
 /* This part must be outside protection */
-- 
cgit v1.2.3


From 2e92ac61c9012ae4bcde2838c5f57f85e4b2623c Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Date: Tue, 15 Jul 2025 13:53:24 +0200
Subject: block: add trace messages to zone write plugging

Add tracepoints to zone write plugging plug and unplug events.

Examples for these events are:

  kworker/u10:4-393  [001] d..1. 282.991660: disk_zone_wplug_add_bio: 8,0 zone 16, BIO 8388608 + 128
  kworker/0:1H-58    [000] d..1. 283.083294: blk_zone_wplug_bio: 8,0 zone 15, BIO 7864320 + 128

Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20250715115324.53308-6-johannes.thumshirn@wdc.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-zoned.c            |  5 +++++
 include/trace/events/block.h | 44 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+)

(limited to 'include')

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 70003a4bc7db..ef43aaca49f4 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -822,6 +822,8 @@ static inline void disk_zone_wplug_add_bio(struct gendisk *disk,
 	 * at the tail of the list to preserve the sequential write order.
 	 */
 	bio_list_add(&zwplug->bio_list, bio);
+	trace_disk_zone_wplug_add_bio(zwplug->disk->queue, zwplug->zone_no,
+				      bio->bi_iter.bi_sector, bio_sectors(bio));
 
 	zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
 
@@ -1299,6 +1301,9 @@ again:
 		goto put_zwplug;
 	}
 
+	trace_blk_zone_wplug_bio(zwplug->disk->queue, zwplug->zone_no,
+				 bio->bi_iter.bi_sector, bio_sectors(bio));
+
 	if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
 		blk_zone_wplug_bio_io_error(zwplug, bio);
 		goto again;
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index ff7698efdfde..3e582d5e3a57 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -633,6 +633,50 @@ TRACE_EVENT(blkdev_zone_mgmt,
 		  (unsigned long long)__entry->sector,
 		  __entry->nr_sectors)
 );
+
+DECLARE_EVENT_CLASS(block_zwplug,
+
+	TP_PROTO(struct request_queue *q, unsigned int zno, sector_t sector,
+		 unsigned int nr_sectors),
+
+	TP_ARGS(q, zno, sector, nr_sectors),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev		)
+		__field( unsigned int,	zno		)
+		__field( sector_t,	sector		)
+		__field( unsigned int,	nr_sectors	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= disk_devt(q->disk);
+		__entry->zno		= zno;
+		__entry->sector		= sector;
+		__entry->nr_sectors	= nr_sectors;
+	),
+
+	TP_printk("%d,%d zone %u, BIO %llu + %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->zno,
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sectors)
+);
+
+DEFINE_EVENT(block_zwplug, disk_zone_wplug_add_bio,
+
+	TP_PROTO(struct request_queue *q, unsigned int zno, sector_t sector,
+		 unsigned int nr_sectors),
+
+	TP_ARGS(q, zno, sector, nr_sectors)
+);
+
+DEFINE_EVENT(block_zwplug, blk_zone_wplug_bio,
+
+	TP_PROTO(struct request_queue *q, unsigned int zno, sector_t sector,
+		 unsigned int nr_sectors),
+
+	TP_ARGS(q, zno, sector, nr_sectors)
+);
+
 #endif /* _TRACE_BLOCK_H */
 
 /* This part must be outside protection */
-- 
cgit v1.2.3


From 12f33ef6c2aaa410b7ccf039289fe2b04ab2252f Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Sun, 13 Jul 2025 11:36:12 -0400
Subject: HID: core: Improve the kerneldoc for hid_report_len()

The kerneldoc for hid_report_len() needs to be improved.  The
description of the @report argument is ungrammatical, and the
documentation does not explain under what circumstances the report
length will include the byte reserved for the report ID.

Let's fix up the kerneldoc.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Link: https://patch.msgid.link/1c8416cb-7347-4a06-b00a-20518069d263@rowland.harvard.edu
Signed-off-by: Benjamin Tissoires <bentiss@kernel.org>
---
 include/linux/hid.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/hid.h b/include/linux/hid.h
index 7f260e0e2049..2cc4f1e4ea96 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -1216,7 +1216,11 @@ static inline void hid_hw_wait(struct hid_device *hdev)
 /**
  * hid_report_len - calculate the report length
  *
- * @report: the report we want to know the length
+ * @report: the report whose length we want to know
+ *
+ * The length counts the report ID byte, but only if the ID is nonzero
+ * and therefore is included in the report.  Reports whose ID is zero
+ * never include an ID byte.
  */
 static inline u32 hid_report_len(struct hid_report *report)
 {
-- 
cgit v1.2.3


From c57ad862462f064c0bd943a5828f5e0eca469ca5 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Mon, 7 Jul 2025 13:41:51 +0100
Subject: ASoC: SDCA: Move SDCA search functions and export

The ASoC code for SDCA contains several helper functions that search for
controls/ranges/etc. As the code evolves these helpers are likely to be
useful to anything interacting with the stored DisCo data. Move the
helpers into sdca_function.c and export them so other modules can also
use them.

Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.dev>
Link: https://patch.msgid.link/20250707124155.2596744-4-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sdca_function.h   | 11 +++++++
 sound/soc/sdca/sdca_asoc.c      | 73 ++++++++---------------------------------
 sound/soc/sdca/sdca_functions.c | 50 ++++++++++++++++++++++++++++
 3 files changed, 74 insertions(+), 60 deletions(-)

(limited to 'include')

diff --git a/include/sound/sdca_function.h b/include/sound/sdca_function.h
index b4a97ff08729..543c09e99ab1 100644
--- a/include/sound/sdca_function.h
+++ b/include/sound/sdca_function.h
@@ -1316,4 +1316,15 @@ int sdca_parse_function(struct device *dev,
 			struct sdca_function_desc *desc,
 			struct sdca_function_data *function);
 
+struct sdca_control *sdca_selector_find_control(struct device *dev,
+						struct sdca_entity *entity,
+						const int sel);
+struct sdca_control_range *sdca_control_find_range(struct device *dev,
+						   struct sdca_entity *entity,
+						   struct sdca_control *control,
+						   int cols, int rows);
+struct sdca_control_range *sdca_selector_find_range(struct device *dev,
+						    struct sdca_entity *entity,
+						    int sel, int cols, int rows);
+
 #endif
diff --git a/sound/soc/sdca/sdca_asoc.c b/sound/soc/sdca/sdca_asoc.c
index dd7b19083c85..11c9b3b935fc 100644
--- a/sound/soc/sdca/sdca_asoc.c
+++ b/sound/soc/sdca/sdca_asoc.c
@@ -26,53 +26,6 @@
 #include <sound/soc-dapm.h>
 #include <sound/tlv.h>
 
-static struct sdca_control *selector_find_control(struct device *dev,
-						  struct sdca_entity *entity,
-						  const int sel)
-{
-	int i;
-
-	for (i = 0; i < entity->num_controls; i++) {
-		struct sdca_control *control = &entity->controls[i];
-
-		if (control->sel == sel)
-			return control;
-	}
-
-	dev_err(dev, "%s: control %#x: missing\n", entity->label, sel);
-	return NULL;
-}
-
-static struct sdca_control_range *control_find_range(struct device *dev,
-						     struct sdca_entity *entity,
-						     struct sdca_control *control,
-						     int cols, int rows)
-{
-	struct sdca_control_range *range = &control->range;
-
-	if ((cols && range->cols != cols) || (rows && range->rows != rows) ||
-	    !range->data) {
-		dev_err(dev, "%s: control %#x: ranges invalid (%d,%d)\n",
-			entity->label, control->sel, range->cols, range->rows);
-		return NULL;
-	}
-
-	return range;
-}
-
-static struct sdca_control_range *selector_find_range(struct device *dev,
-						      struct sdca_entity *entity,
-						      int sel, int cols, int rows)
-{
-	struct sdca_control *control;
-
-	control = selector_find_control(dev, entity, sel);
-	if (!control)
-		return NULL;
-
-	return control_find_range(dev, entity, control, cols, rows);
-}
-
 static bool exported_control(struct sdca_entity *entity, struct sdca_control *control)
 {
 	switch (SDCA_CTL_TYPE(entity->type, control->sel)) {
@@ -213,7 +166,7 @@ static int entity_early_parse_ge(struct device *dev,
 	const char **texts;
 	int i;
 
-	control = selector_find_control(dev, entity, SDCA_CTL_GE_SELECTED_MODE);
+	control = sdca_selector_find_control(dev, entity, SDCA_CTL_GE_SELECTED_MODE);
 	if (!control)
 		return -EINVAL;
 
@@ -221,7 +174,7 @@ static int entity_early_parse_ge(struct device *dev,
 		dev_warn(dev, "%s: unexpected access layer: %x\n",
 			 entity->label, control->layers);
 
-	range = control_find_range(dev, entity, control, SDCA_SELECTED_MODE_NCOLS, 0);
+	range = sdca_control_find_range(dev, entity, control, SDCA_SELECTED_MODE_NCOLS, 0);
 	if (!range)
 		return -EINVAL;
 
@@ -443,7 +396,7 @@ static int entity_parse_pde(struct device *dev,
 	unsigned int mask = 0;
 	int i;
 
-	control = selector_find_control(dev, entity, SDCA_CTL_PDE_REQUESTED_PS);
+	control = sdca_selector_find_control(dev, entity, SDCA_CTL_PDE_REQUESTED_PS);
 	if (!control)
 		return -EINVAL;
 
@@ -452,7 +405,7 @@ static int entity_parse_pde(struct device *dev,
 		dev_warn(dev, "%s: unexpected access layer: %x\n",
 			 entity->label, control->layers);
 
-	range = control_find_range(dev, entity, control, SDCA_REQUESTED_PS_NCOLS, 0);
+	range = sdca_control_find_range(dev, entity, control, SDCA_REQUESTED_PS_NCOLS, 0);
 	if (!range)
 		return -EINVAL;
 
@@ -499,8 +452,8 @@ static int entity_parse_su_device(struct device *dev,
 		return -EINVAL;
 	}
 
-	range = selector_find_range(dev, entity->group, SDCA_CTL_GE_SELECTED_MODE,
-				    SDCA_SELECTED_MODE_NCOLS, 0);
+	range = sdca_selector_find_range(dev, entity->group, SDCA_CTL_GE_SELECTED_MODE,
+					 SDCA_SELECTED_MODE_NCOLS, 0);
 	if (!range)
 		return -EINVAL;
 
@@ -613,7 +566,7 @@ static int entity_parse_su(struct device *dev,
 		return -EINVAL;
 	}
 
-	control = selector_find_control(dev, entity, SDCA_CTL_SU_SELECTOR);
+	control = sdca_selector_find_control(dev, entity, SDCA_CTL_SU_SELECTOR);
 	if (!control)
 		return -EINVAL;
 
@@ -643,7 +596,7 @@ static int entity_parse_mu(struct device *dev,
 		return -EINVAL;
 	}
 
-	control = selector_find_control(dev, entity, SDCA_CTL_MU_MIXER);
+	control = sdca_selector_find_control(dev, entity, SDCA_CTL_MU_MIXER);
 	if (!control)
 		return -EINVAL;
 
@@ -853,7 +806,7 @@ static int control_limit_kctl(struct device *dev,
 	/*
 	 * FIXME: For now only handle the simple case of a single linear range
 	 */
-	range = control_find_range(dev, entity, control, SDCA_VOLUME_LINEAR_NCOLS, 1);
+	range = sdca_control_find_range(dev, entity, control, SDCA_VOLUME_LINEAR_NCOLS, 1);
 	if (!range)
 		return -EINVAL;
 
@@ -1140,9 +1093,9 @@ static int populate_rate_format(struct device *dev,
 	}
 
 	if (entity->iot.clock) {
-		range = selector_find_range(dev, entity->iot.clock,
-					    SDCA_CTL_CS_SAMPLERATEINDEX,
-					    SDCA_SAMPLERATEINDEX_NCOLS, 0);
+		range = sdca_selector_find_range(dev, entity->iot.clock,
+						 SDCA_CTL_CS_SAMPLERATEINDEX,
+						 SDCA_SAMPLERATEINDEX_NCOLS, 0);
 		if (!range)
 			return -EINVAL;
 
@@ -1154,7 +1107,7 @@ static int populate_rate_format(struct device *dev,
 		clock_rates = UINT_MAX;
 	}
 
-	range = selector_find_range(dev, entity, sel, SDCA_USAGE_NCOLS, 0);
+	range = sdca_selector_find_range(dev, entity, sel, SDCA_USAGE_NCOLS, 0);
 	if (!range)
 		return -EINVAL;
 
diff --git a/sound/soc/sdca/sdca_functions.c b/sound/soc/sdca/sdca_functions.c
index e567e364dc31..d2e2c8d10b92 100644
--- a/sound/soc/sdca/sdca_functions.c
+++ b/sound/soc/sdca/sdca_functions.c
@@ -1941,5 +1941,55 @@ int sdca_parse_function(struct device *dev,
 }
 EXPORT_SYMBOL_NS(sdca_parse_function, "SND_SOC_SDCA");
 
+struct sdca_control *sdca_selector_find_control(struct device *dev,
+						struct sdca_entity *entity,
+						const int sel)
+{
+	int i;
+
+	for (i = 0; i < entity->num_controls; i++) {
+		struct sdca_control *control = &entity->controls[i];
+
+		if (control->sel == sel)
+			return control;
+	}
+
+	dev_err(dev, "%s: control %#x: missing\n", entity->label, sel);
+	return NULL;
+}
+EXPORT_SYMBOL_NS(sdca_selector_find_control, "SND_SOC_SDCA");
+
+struct sdca_control_range *sdca_control_find_range(struct device *dev,
+						   struct sdca_entity *entity,
+						   struct sdca_control *control,
+						   int cols, int rows)
+{
+	struct sdca_control_range *range = &control->range;
+
+	if ((cols && range->cols != cols) || (rows && range->rows != rows) ||
+	    !range->data) {
+		dev_err(dev, "%s: control %#x: ranges invalid (%d,%d)\n",
+			entity->label, control->sel, range->cols, range->rows);
+		return NULL;
+	}
+
+	return range;
+}
+EXPORT_SYMBOL_NS(sdca_control_find_range, "SND_SOC_SDCA");
+
+struct sdca_control_range *sdca_selector_find_range(struct device *dev,
+						    struct sdca_entity *entity,
+						    int sel, int cols, int rows)
+{
+	struct sdca_control *control;
+
+	control = sdca_selector_find_control(dev, entity, sel);
+	if (!control)
+		return NULL;
+
+	return sdca_control_find_range(dev, entity, control, cols, rows);
+}
+EXPORT_SYMBOL_NS(sdca_selector_find_range, "SND_SOC_SDCA");
+
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_DESCRIPTION("SDCA library");
-- 
cgit v1.2.3


From 5f86d41d0410b072b5f4875ef5d38bf8d18eed55 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Mon, 7 Jul 2025 13:41:52 +0100
Subject: ASoC: soc-dai: Add private data to snd_soc_dai

Add a private data pointer that can be used to store context along
with the DAI. This will be useful to allow the SDCA class library to
store data separately from the CODEC driver itself.

Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.dev>
Link: https://patch.msgid.link/20250707124155.2596744-5-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc-dai.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/sound/soc-dai.h b/include/sound/soc-dai.h
index d19ab5572d2b..166c29557e9d 100644
--- a/include/sound/soc-dai.h
+++ b/include/sound/soc-dai.h
@@ -463,6 +463,9 @@ struct snd_soc_dai {
 
 	/* bit field */
 	unsigned int probed:1;
+
+	/* DAI private data */
+	void *priv;
 };
 
 static inline const struct snd_soc_pcm_stream *
-- 
cgit v1.2.3


From 7b0d60dbb468fa82e9053292cdc8a5436400bfaf Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Mon, 7 Jul 2025 13:41:53 +0100
Subject: ASoC: SDCA: Add helper to add DAI constraints

Currently the core SDCA code simply creates a placeholder range of
available channels from 1 to SDCA_MAX_CHANNEL_COUNT. Add a helper function
that will constrain the number of channels based on the actual
available SDCA Clusters in DisCo. Currently this code only handles
Input Terminal Entities as they directly specify the Cluster. More
work will be required later for Output Terminals which inherit their
Cluster. Typically this new helper would be called from the DAI's
startup callback.

Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.dev>
Link: https://patch.msgid.link/20250707124155.2596744-6-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sdca_asoc.h       | 10 +++++
 include/sound/sdca_function.h   | 12 +++++
 sound/soc/sdca/sdca_asoc.c      | 99 +++++++++++++++++++++++++++++++++++++++++
 sound/soc/sdca/sdca_functions.c | 18 ++++++++
 4 files changed, 139 insertions(+)

(limited to 'include')

diff --git a/include/sound/sdca_asoc.h b/include/sound/sdca_asoc.h
index 9121531f0826..bbf146e4fcea 100644
--- a/include/sound/sdca_asoc.h
+++ b/include/sound/sdca_asoc.h
@@ -11,9 +11,12 @@
 #define __SDCA_ASOC_H__
 
 struct device;
+struct regmap;
 struct sdca_function_data;
 struct snd_kcontrol_new;
+struct snd_pcm_substream;
 struct snd_soc_component_driver;
+struct snd_soc_dai;
 struct snd_soc_dai_driver;
 struct snd_soc_dai_ops;
 struct snd_soc_dapm_route;
@@ -39,4 +42,11 @@ int sdca_asoc_populate_component(struct device *dev,
 				 struct snd_soc_dai_driver **dai_drv, int *num_dai_drv,
 				 const struct snd_soc_dai_ops *ops);
 
+int sdca_asoc_set_constraints(struct device *dev, struct regmap *regmap,
+			      struct sdca_function_data *function,
+			      struct snd_pcm_substream *substream,
+			      struct snd_soc_dai *dai);
+void sdca_asoc_free_constraints(struct snd_pcm_substream *substream,
+				struct snd_soc_dai *dai);
+
 #endif // __SDCA_ASOC_H__
diff --git a/include/sound/sdca_function.h b/include/sound/sdca_function.h
index 543c09e99ab1..3bde07409bf3 100644
--- a/include/sound/sdca_function.h
+++ b/include/sound/sdca_function.h
@@ -1268,6 +1268,15 @@ struct sdca_cluster {
 	struct sdca_channel *channels;
 };
 
+/**
+ * enum sdca_cluster_range - SDCA Range column definitions for ClusterIndex
+ */
+enum sdca_cluster_range {
+	SDCA_CLUSTER_BYTEINDEX				= 0,
+	SDCA_CLUSTER_CLUSTERID				= 1,
+	SDCA_CLUSTER_NCOLS				= 2,
+};
+
 /**
  * struct sdca_function_data - top-level information for one SDCA function
  * @desc: Pointer to short descriptor from initial parsing.
@@ -1326,5 +1335,8 @@ struct sdca_control_range *sdca_control_find_range(struct device *dev,
 struct sdca_control_range *sdca_selector_find_range(struct device *dev,
 						    struct sdca_entity *entity,
 						    int sel, int cols, int rows);
+struct sdca_cluster *sdca_id_find_cluster(struct device *dev,
+					  struct sdca_function_data *function,
+					  const int id);
 
 #endif
diff --git a/sound/soc/sdca/sdca_asoc.c b/sound/soc/sdca/sdca_asoc.c
index 11c9b3b935fc..1a0149287584 100644
--- a/sound/soc/sdca/sdca_asoc.c
+++ b/sound/soc/sdca/sdca_asoc.c
@@ -7,16 +7,20 @@
  * https://www.mipi.org/mipi-sdca-v1-0-download
  */
 
+#include <linux/bits.h>
 #include <linux/bitmap.h>
+#include <linux/build_bug.h>
 #include <linux/delay.h>
 #include <linux/dev_printk.h>
 #include <linux/device.h>
 #include <linux/minmax.h>
 #include <linux/module.h>
 #include <linux/overflow.h>
+#include <linux/regmap.h>
 #include <linux/soundwire/sdw_registers.h>
 #include <linux/string_helpers.h>
 #include <sound/control.h>
+#include <sound/pcm.h>
 #include <sound/sdca.h>
 #include <sound/sdca_asoc.h>
 #include <sound/sdca_function.h>
@@ -1269,3 +1273,98 @@ int sdca_asoc_populate_component(struct device *dev,
 	return 0;
 }
 EXPORT_SYMBOL_NS(sdca_asoc_populate_component, "SND_SOC_SDCA");
+
+/**
+ * sdca_asoc_set_constraints - constrain channels available on a DAI
+ * @dev: Pointer to the device, used for error messages.
+ * @regmap: Pointer to the Function register map.
+ * @function: Pointer to the Function information.
+ * @substream: Pointer to the PCM substream.
+ * @dai: Pointer to the ASoC DAI.
+ *
+ * Typically called from startup().
+ *
+ * Return: Returns zero on success, and a negative error code on failure.
+ */
+int sdca_asoc_set_constraints(struct device *dev, struct regmap *regmap,
+			      struct sdca_function_data *function,
+			      struct snd_pcm_substream *substream,
+			      struct snd_soc_dai *dai)
+{
+	static const unsigned int channel_list[] = {
+		 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
+		17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+	};
+	struct sdca_entity *entity = &function->entities[dai->id];
+	struct snd_pcm_hw_constraint_list *constraint;
+	struct sdca_control_range *range;
+	struct sdca_control *control;
+	unsigned int channel_mask = 0;
+	int i, ret;
+
+	static_assert(ARRAY_SIZE(channel_list) == SDCA_MAX_CHANNEL_COUNT);
+	static_assert(sizeof(channel_mask) * BITS_PER_BYTE >= SDCA_MAX_CHANNEL_COUNT);
+
+	if (entity->type != SDCA_ENTITY_TYPE_IT)
+		return 0;
+
+	control = sdca_selector_find_control(dev, entity, SDCA_CTL_IT_CLUSTERINDEX);
+	if (!control)
+		return -EINVAL;
+
+	range = sdca_control_find_range(dev, entity, control, SDCA_CLUSTER_NCOLS, 0);
+	if (!range)
+		return -EINVAL;
+
+	for (i = 0; i < range->rows; i++) {
+		int clusterid = sdca_range(range, SDCA_CLUSTER_CLUSTERID, i);
+		struct sdca_cluster *cluster;
+
+		cluster = sdca_id_find_cluster(dev, function, clusterid);
+		if (!cluster)
+			return -ENODEV;
+
+		channel_mask |= (1 << (cluster->num_channels - 1));
+	}
+
+	dev_dbg(dev, "%s: set channel constraint mask: %#x\n",
+		entity->label, channel_mask);
+
+	constraint = kzalloc(sizeof(*constraint), GFP_KERNEL);
+	if (!constraint)
+		return -ENOMEM;
+
+	constraint->count = ARRAY_SIZE(channel_list);
+	constraint->list = channel_list;
+	constraint->mask = channel_mask;
+
+	ret = snd_pcm_hw_constraint_list(substream->runtime, 0,
+					 SNDRV_PCM_HW_PARAM_CHANNELS,
+					 constraint);
+	if (ret) {
+		dev_err(dev, "%s: failed to add constraint: %d\n", entity->label, ret);
+		kfree(constraint);
+		return ret;
+	}
+
+	dai->priv = constraint;
+
+	return 0;
+}
+EXPORT_SYMBOL_NS(sdca_asoc_set_constraints, "SND_SOC_SDCA");
+
+/**
+ * sdca_asoc_free_constraints - free constraint allocations
+ * @substream: Pointer to the PCM substream.
+ * @dai: Pointer to the ASoC DAI.
+ *
+ * Typically called from shutdown().
+ */
+void sdca_asoc_free_constraints(struct snd_pcm_substream *substream,
+				struct snd_soc_dai *dai)
+{
+	struct snd_pcm_hw_constraint_list *constraint = dai->priv;
+
+	kfree(constraint);
+}
+EXPORT_SYMBOL_NS(sdca_asoc_free_constraints, "SND_SOC_SDCA");
diff --git a/sound/soc/sdca/sdca_functions.c b/sound/soc/sdca/sdca_functions.c
index d2e2c8d10b92..4b6da587c4ac 100644
--- a/sound/soc/sdca/sdca_functions.c
+++ b/sound/soc/sdca/sdca_functions.c
@@ -1991,5 +1991,23 @@ struct sdca_control_range *sdca_selector_find_range(struct device *dev,
 }
 EXPORT_SYMBOL_NS(sdca_selector_find_range, "SND_SOC_SDCA");
 
+struct sdca_cluster *sdca_id_find_cluster(struct device *dev,
+					  struct sdca_function_data *function,
+					  const int id)
+{
+	int i;
+
+	for (i = 0; i < function->num_clusters; i++) {
+		struct sdca_cluster *cluster = &function->clusters[i];
+
+		if (cluster->id == id)
+			return cluster;
+	}
+
+	dev_err(dev, "%s: cluster %#x: missing\n", function->desc->name, id);
+	return NULL;
+}
+EXPORT_SYMBOL_NS(sdca_id_find_cluster, "SND_SOC_SDCA");
+
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_DESCRIPTION("SDCA library");
-- 
cgit v1.2.3


From 264d3d776fb1a428706b0ca0f679bbed876fe7c9 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Mon, 7 Jul 2025 13:41:54 +0100
Subject: ASoC: SDCA: Add a helper to get the SoundWire port number

Add a helper function to extract the SoundWire hardware port number
from the SDCA DataPort Selector Control. Typically this would be
called from hw_params() and used to call sdw_stream_add_slave().

Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.dev>
Link: https://patch.msgid.link/20250707124155.2596744-7-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sdca_asoc.h     |  3 ++
 include/sound/sdca_function.h |  8 +++++
 sound/soc/sdca/sdca_asoc.c    | 75 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 86 insertions(+)

(limited to 'include')

diff --git a/include/sound/sdca_asoc.h b/include/sound/sdca_asoc.h
index bbf146e4fcea..800a26adcd8e 100644
--- a/include/sound/sdca_asoc.h
+++ b/include/sound/sdca_asoc.h
@@ -48,5 +48,8 @@ int sdca_asoc_set_constraints(struct device *dev, struct regmap *regmap,
 			      struct snd_soc_dai *dai);
 void sdca_asoc_free_constraints(struct snd_pcm_substream *substream,
 				struct snd_soc_dai *dai);
+int sdca_asoc_get_port(struct device *dev, struct regmap *regmap,
+		       struct sdca_function_data *function,
+		       struct snd_soc_dai *dai);
 
 #endif // __SDCA_ASOC_H__
diff --git a/include/sound/sdca_function.h b/include/sound/sdca_function.h
index 3bde07409bf3..90d77fc46416 100644
--- a/include/sound/sdca_function.h
+++ b/include/sound/sdca_function.h
@@ -185,6 +185,14 @@ enum sdca_usage_range {
 	SDCA_USAGE_NCOLS				= 7,
 };
 
+/**
+ * enum sdca_dataport_selector_range - Column definitions for DataPort_Selector
+ */
+enum sdca_dataport_selector_range {
+	SDCA_DATAPORT_SELECTOR_NCOLS			= 16,
+	SDCA_DATAPORT_SELECTOR_NROWS			= 4,
+};
+
 /**
  * enum sdca_mu_controls - SDCA Controls for Mixer Unit
  *
diff --git a/sound/soc/sdca/sdca_asoc.c b/sound/soc/sdca/sdca_asoc.c
index 1a0149287584..03c663413cc9 100644
--- a/sound/soc/sdca/sdca_asoc.c
+++ b/sound/soc/sdca/sdca_asoc.c
@@ -19,6 +19,7 @@
 #include <linux/regmap.h>
 #include <linux/soundwire/sdw_registers.h>
 #include <linux/string_helpers.h>
+#include <linux/types.h>
 #include <sound/control.h>
 #include <sound/pcm.h>
 #include <sound/sdca.h>
@@ -1368,3 +1369,77 @@ void sdca_asoc_free_constraints(struct snd_pcm_substream *substream,
 	kfree(constraint);
 }
 EXPORT_SYMBOL_NS(sdca_asoc_free_constraints, "SND_SOC_SDCA");
+
+/**
+ * sdca_asoc_get_port - return SoundWire port for a DAI
+ * @dev: Pointer to the device, used for error messages.
+ * @regmap: Pointer to the Function register map.
+ * @function: Pointer to the Function information.
+ * @dai: Pointer to the ASoC DAI.
+ *
+ * Typically called from hw_params().
+ *
+ * Return: Returns a positive port number on success, and a negative error
+ * code on failure.
+ */
+int sdca_asoc_get_port(struct device *dev, struct regmap *regmap,
+		       struct sdca_function_data *function,
+		       struct snd_soc_dai *dai)
+{
+	struct sdca_entity *entity = &function->entities[dai->id];
+	struct sdca_control_range *range;
+	unsigned int reg, val;
+	int sel = -EINVAL;
+	int i, ret;
+
+	switch (entity->type) {
+	case SDCA_ENTITY_TYPE_IT:
+		sel = SDCA_CTL_IT_DATAPORT_SELECTOR;
+		break;
+	case SDCA_ENTITY_TYPE_OT:
+		sel = SDCA_CTL_OT_DATAPORT_SELECTOR;
+		break;
+	default:
+		break;
+	}
+
+	if (sel < 0 || !entity->iot.is_dataport) {
+		dev_err(dev, "%s: port number only available for dataports\n",
+			entity->label);
+		return -EINVAL;
+	}
+
+	range = sdca_selector_find_range(dev, entity, sel, SDCA_DATAPORT_SELECTOR_NCOLS,
+					 SDCA_DATAPORT_SELECTOR_NROWS);
+	if (!range)
+		return -EINVAL;
+
+	reg = SDW_SDCA_CTL(function->desc->adr, entity->id, sel, 0);
+
+	ret = regmap_read(regmap, reg, &val);
+	if (ret) {
+		dev_err(dev, "%s: failed to read dataport selector: %d\n",
+			entity->label, ret);
+		return ret;
+	}
+
+	for (i = 0; i < range->rows; i++) {
+		static const u8 port_mask = 0xF;
+
+		sel = sdca_range(range, val & port_mask, i);
+
+		/*
+		 * FIXME: Currently only a single dataport is supported, so
+		 * return the first one found, technically up to 4 dataports
+		 * could be linked, but this is not yet supported.
+		 */
+		if (sel != 0xFF)
+			return sel;
+
+		val >>= hweight8(port_mask);
+	}
+
+	dev_err(dev, "%s: no dataport found\n", entity->label);
+	return -ENODEV;
+}
+EXPORT_SYMBOL_NS(sdca_asoc_get_port, "SND_SOC_SDCA");
-- 
cgit v1.2.3


From 4ed357f72a0e0a691304e5f14a3323811c8ce862 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Mon, 7 Jul 2025 13:41:55 +0100
Subject: ASoC: SDCA: Add hw_params() helper function

Add a helper function that can be called from hw_params() in the DAI ops
to configure the SDCA Cluster, Clock and Usage controls. These set up the
channels, sample rate, and bit depths that will be used by the Terminal.

Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.dev>
Link: https://patch.msgid.link/20250707124155.2596744-8-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sdca_asoc.h  |   6 ++
 sound/soc/sdca/sdca_asoc.c | 180 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 186 insertions(+)

(limited to 'include')

diff --git a/include/sound/sdca_asoc.h b/include/sound/sdca_asoc.h
index 800a26adcd8e..aa9124f93218 100644
--- a/include/sound/sdca_asoc.h
+++ b/include/sound/sdca_asoc.h
@@ -14,6 +14,7 @@ struct device;
 struct regmap;
 struct sdca_function_data;
 struct snd_kcontrol_new;
+struct snd_pcm_hw_params;
 struct snd_pcm_substream;
 struct snd_soc_component_driver;
 struct snd_soc_dai;
@@ -51,5 +52,10 @@ void sdca_asoc_free_constraints(struct snd_pcm_substream *substream,
 int sdca_asoc_get_port(struct device *dev, struct regmap *regmap,
 		       struct sdca_function_data *function,
 		       struct snd_soc_dai *dai);
+int sdca_asoc_hw_params(struct device *dev, struct regmap *regmap,
+			struct sdca_function_data *function,
+			struct snd_pcm_substream *substream,
+			struct snd_pcm_hw_params *params,
+			struct snd_soc_dai *dai);
 
 #endif // __SDCA_ASOC_H__
diff --git a/sound/soc/sdca/sdca_asoc.c b/sound/soc/sdca/sdca_asoc.c
index 03c663413cc9..252d72377091 100644
--- a/sound/soc/sdca/sdca_asoc.c
+++ b/sound/soc/sdca/sdca_asoc.c
@@ -22,6 +22,7 @@
 #include <linux/types.h>
 #include <sound/control.h>
 #include <sound/pcm.h>
+#include <sound/pcm_params.h>
 #include <sound/sdca.h>
 #include <sound/sdca_asoc.h>
 #include <sound/sdca_function.h>
@@ -1443,3 +1444,182 @@ int sdca_asoc_get_port(struct device *dev, struct regmap *regmap,
 	return -ENODEV;
 }
 EXPORT_SYMBOL_NS(sdca_asoc_get_port, "SND_SOC_SDCA");
+
+static int set_cluster(struct device *dev, struct regmap *regmap,
+		       struct sdca_function_data *function,
+		       struct sdca_entity *entity, unsigned int channels)
+{
+	int sel = SDCA_CTL_IT_CLUSTERINDEX;
+	struct sdca_control_range *range;
+	int i, ret;
+
+	range = sdca_selector_find_range(dev, entity, sel, SDCA_CLUSTER_NCOLS, 0);
+	if (!range)
+		return -EINVAL;
+
+	for (i = 0; i < range->rows; i++) {
+		int cluster_id = sdca_range(range, SDCA_CLUSTER_CLUSTERID, i);
+		struct sdca_cluster *cluster;
+
+		cluster = sdca_id_find_cluster(dev, function, cluster_id);
+		if (!cluster)
+			return -ENODEV;
+
+		if (cluster->num_channels == channels) {
+			int index = sdca_range(range, SDCA_CLUSTER_BYTEINDEX, i);
+			unsigned int reg = SDW_SDCA_CTL(function->desc->adr,
+							entity->id, sel, 0);
+
+			ret = regmap_update_bits(regmap, reg, 0xFF, index);
+			if (ret) {
+				dev_err(dev, "%s: failed to write cluster index: %d\n",
+					entity->label, ret);
+				return ret;
+			}
+
+			dev_dbg(dev, "%s: set cluster to %d (%d channels)\n",
+				entity->label, index, channels);
+
+			return 0;
+		}
+	}
+
+	dev_err(dev, "%s: no cluster for %d channels\n", entity->label, channels);
+	return -EINVAL;
+}
+
+static int set_clock(struct device *dev, struct regmap *regmap,
+		     struct sdca_function_data *function,
+		     struct sdca_entity *entity, int target_rate)
+{
+	int sel = SDCA_CTL_CS_SAMPLERATEINDEX;
+	struct sdca_control_range *range;
+	int i, ret;
+
+	range = sdca_selector_find_range(dev, entity, sel, SDCA_SAMPLERATEINDEX_NCOLS, 0);
+	if (!range)
+		return -EINVAL;
+
+	for (i = 0; i < range->rows; i++) {
+		unsigned int rate = sdca_range(range, SDCA_SAMPLERATEINDEX_RATE, i);
+
+		if (rate == target_rate) {
+			unsigned int index = sdca_range(range,
+							SDCA_SAMPLERATEINDEX_INDEX,
+							i);
+			unsigned int reg = SDW_SDCA_CTL(function->desc->adr,
+							entity->id, sel, 0);
+
+			ret = regmap_update_bits(regmap, reg, 0xFF, index);
+			if (ret) {
+				dev_err(dev, "%s: failed to write clock rate: %d\n",
+					entity->label, ret);
+				return ret;
+			}
+
+			dev_dbg(dev, "%s: set clock rate to %d (%dHz)\n",
+				entity->label, index, rate);
+
+			return 0;
+		}
+	}
+
+	dev_err(dev, "%s: no clock rate for %dHz\n", entity->label, target_rate);
+	return -EINVAL;
+}
+
+static int set_usage(struct device *dev, struct regmap *regmap,
+		     struct sdca_function_data *function,
+		     struct sdca_entity *entity, int sel,
+		     int target_rate, int target_width)
+{
+	struct sdca_control_range *range;
+	int i, ret;
+
+	range = sdca_selector_find_range(dev, entity, sel, SDCA_USAGE_NCOLS, 0);
+	if (!range)
+		return -EINVAL;
+
+	for (i = 0; i < range->rows; i++) {
+		unsigned int rate = sdca_range(range, SDCA_USAGE_SAMPLE_RATE, i);
+		unsigned int width = sdca_range(range, SDCA_USAGE_SAMPLE_WIDTH, i);
+
+		if ((!rate || rate == target_rate) && width == target_width) {
+			unsigned int usage = sdca_range(range, SDCA_USAGE_NUMBER, i);
+			unsigned int reg = SDW_SDCA_CTL(function->desc->adr,
+							entity->id, sel, 0);
+
+			ret = regmap_update_bits(regmap, reg, 0xFF, usage);
+			if (ret) {
+				dev_err(dev, "%s: failed to write usage: %d\n",
+					entity->label, ret);
+				return ret;
+			}
+
+			dev_dbg(dev, "%s: set usage to %#x (%dHz, %d bits)\n",
+				entity->label, usage, target_rate, target_width);
+
+			return 0;
+		}
+	}
+
+	dev_err(dev, "%s: no usage for %dHz, %dbits\n",
+		entity->label, target_rate, target_width);
+	return -EINVAL;
+}
+
+/**
+ * sdca_asoc_hw_params - set SDCA channels, sample rate and bit depth
+ * @dev: Pointer to the device, used for error messages.
+ * @regmap: Pointer to the Function register map.
+ * @function: Pointer to the Function information.
+ * @substream: Pointer to the PCM substream.
+ * @params: Pointer to the hardware parameters.
+ * @dai: Pointer to the ASoC DAI.
+ *
+ * Typically called from hw_params().
+ *
+ * Return: Returns zero on success, and a negative error code on failure.
+ */
+int sdca_asoc_hw_params(struct device *dev, struct regmap *regmap,
+			struct sdca_function_data *function,
+			struct snd_pcm_substream *substream,
+			struct snd_pcm_hw_params *params,
+			struct snd_soc_dai *dai)
+{
+	struct sdca_entity *entity = &function->entities[dai->id];
+	int channels = params_channels(params);
+	int width = params_width(params);
+	int rate = params_rate(params);
+	int usage_sel;
+	int ret;
+
+	switch (entity->type) {
+	case SDCA_ENTITY_TYPE_IT:
+		ret = set_cluster(dev, regmap, function, entity, channels);
+		if (ret)
+			return ret;
+
+		usage_sel = SDCA_CTL_IT_USAGE;
+		break;
+	case SDCA_ENTITY_TYPE_OT:
+		usage_sel = SDCA_CTL_OT_USAGE;
+		break;
+	default:
+		dev_err(dev, "%s: hw_params on non-terminal entity\n", entity->label);
+		return -EINVAL;
+	}
+
+	if (entity->iot.clock) {
+		ret = set_clock(dev, regmap, function, entity->iot.clock, rate);
+		if (ret)
+			return ret;
+	}
+
+	ret = set_usage(dev, regmap, function, entity, usage_sel, rate, width);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+EXPORT_SYMBOL_NS(sdca_asoc_hw_params, "SND_SOC_SDCA");
-- 
cgit v1.2.3


From c34632dbb29ba7016f1cd2e629ac9dd07f84ce50 Mon Sep 17 00:00:00 2001
From: Andy Gospodarek <andrew.gospodarek@broadcom.com>
Date: Mon, 14 Jul 2025 13:02:02 -0400
Subject: bnxt: move bnxt_hsi.h to include/linux/bnxt/hsi.h

This moves bnxt_hsi.h contents to a common location so it can be
properly referenced by bnxt_en, bnxt_re, and bnge.

Signed-off-by: Andy Gospodarek <gospo@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: Pavan Chebbi <pavan.chebbi@broadcom.com>
Reviewed-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://patch.msgid.link/20250714170202.39688-1-gospo@broadcom.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/infiniband/hw/bnxt_re/roce_hsi.h           |     4 +-
 drivers/net/ethernet/broadcom/bnge/bnge.h          |     2 +-
 drivers/net/ethernet/broadcom/bnge/bnge_hwrm.h     |     2 +-
 drivers/net/ethernet/broadcom/bnge/bnge_hwrm_lib.c |     2 +-
 drivers/net/ethernet/broadcom/bnge/bnge_netdev.h   |     2 +-
 drivers/net/ethernet/broadcom/bnge/bnge_rmem.c     |     2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt.c          |     2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.c |     2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c      |     2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.c  |     2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.h  |     2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c  |     2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c      |     2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c  |     2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h      | 10914 -------------------
 drivers/net/ethernet/broadcom/bnxt/bnxt_hwmon.c    |     2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.c     |     2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.h     |     2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c      |     2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c    |     2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c       |     2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c      |     2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c      |     2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c      |     2 +-
 include/linux/bnxt/hsi.h                           | 10914 +++++++++++++++++++
 25 files changed, 10938 insertions(+), 10938 deletions(-)
 delete mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h
 create mode 100644 include/linux/bnxt/hsi.h

(limited to 'include')

diff --git a/drivers/infiniband/hw/bnxt_re/roce_hsi.h b/drivers/infiniband/hw/bnxt_re/roce_hsi.h
index 7eceb3e9f4ce..024845f945ff 100644
--- a/drivers/infiniband/hw/bnxt_re/roce_hsi.h
+++ b/drivers/infiniband/hw/bnxt_re/roce_hsi.h
@@ -39,8 +39,8 @@
 #ifndef __BNXT_RE_HSI_H__
 #define __BNXT_RE_HSI_H__
 
-/* include bnxt_hsi.h from bnxt_en driver */
-#include "bnxt_hsi.h"
+/* include linux/bnxt/hsi.h */
+#include <linux/bnxt/hsi.h>
 
 /* tx_doorbell (size:32b/4B) */
 struct tx_doorbell {
diff --git a/drivers/net/ethernet/broadcom/bnge/bnge.h b/drivers/net/ethernet/broadcom/bnge/bnge.h
index a1795302c15a..6fb3683b6b04 100644
--- a/drivers/net/ethernet/broadcom/bnge/bnge.h
+++ b/drivers/net/ethernet/broadcom/bnge/bnge.h
@@ -8,7 +8,7 @@
 #define DRV_SUMMARY	"Broadcom 800G Ethernet Linux Driver"
 
 #include <linux/etherdevice.h>
-#include "../bnxt/bnxt_hsi.h"
+#include <linux/bnxt/hsi.h>
 #include "bnge_rmem.h"
 #include "bnge_resc.h"
 
diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_hwrm.h b/drivers/net/ethernet/broadcom/bnge/bnge_hwrm.h
index 012aa4fa5aa9..83794a12cc81 100644
--- a/drivers/net/ethernet/broadcom/bnge/bnge_hwrm.h
+++ b/drivers/net/ethernet/broadcom/bnge/bnge_hwrm.h
@@ -4,7 +4,7 @@
 #ifndef _BNGE_HWRM_H_
 #define _BNGE_HWRM_H_
 
-#include "../bnxt/bnxt_hsi.h"
+#include <linux/bnxt/hsi.h>
 
 enum bnge_hwrm_ctx_flags {
 	BNGE_HWRM_INTERNAL_CTX_OWNED	= BIT(0),
diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_hwrm_lib.c b/drivers/net/ethernet/broadcom/bnge/bnge_hwrm_lib.c
index 19091318cfdd..5c178fade065 100644
--- a/drivers/net/ethernet/broadcom/bnge/bnge_hwrm_lib.c
+++ b/drivers/net/ethernet/broadcom/bnge/bnge_hwrm_lib.c
@@ -5,9 +5,9 @@
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/pci.h>
+#include <linux/bnxt/hsi.h>
 
 #include "bnge.h"
-#include "../bnxt/bnxt_hsi.h"
 #include "bnge_hwrm.h"
 #include "bnge_hwrm_lib.h"
 #include "bnge_rmem.h"
diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_netdev.h b/drivers/net/ethernet/broadcom/bnge/bnge_netdev.h
index 96b77e44b552..a650d71a58db 100644
--- a/drivers/net/ethernet/broadcom/bnge/bnge_netdev.h
+++ b/drivers/net/ethernet/broadcom/bnge/bnge_netdev.h
@@ -4,7 +4,7 @@
 #ifndef _BNGE_NETDEV_H_
 #define _BNGE_NETDEV_H_
 
-#include "../bnxt/bnxt_hsi.h"
+#include <linux/bnxt/hsi.h>
 
 struct tx_bd {
 	__le32 tx_bd_len_flags_type;
diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_rmem.c b/drivers/net/ethernet/broadcom/bnge/bnge_rmem.c
index 0e935cc46da6..52ada65943a0 100644
--- a/drivers/net/ethernet/broadcom/bnge/bnge_rmem.c
+++ b/drivers/net/ethernet/broadcom/bnge/bnge_rmem.c
@@ -9,9 +9,9 @@
 #include <linux/dma-mapping.h>
 #include <linux/vmalloc.h>
 #include <linux/crash_dump.h>
+#include <linux/bnxt/hsi.h>
 
 #include "bnge.h"
-#include "../bnxt/bnxt_hsi.h"
 #include "bnge_hwrm_lib.h"
 #include "bnge_rmem.h"
 
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 6bbe875132b0..de8080df69a8 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -58,8 +58,8 @@
 #include <net/netdev_queues.h>
 #include <net/netdev_rx_queue.h>
 #include <linux/pci-tph.h>
+#include <linux/bnxt/hsi.h>
 
-#include "bnxt_hsi.h"
 #include "bnxt.h"
 #include "bnxt_hwrm.h"
 #include "bnxt_ulp.h"
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.c
index 67e70d3d0980..18d6c94d5cb8 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.c
@@ -10,7 +10,7 @@
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/pci.h>
-#include "bnxt_hsi.h"
+#include <linux/bnxt/hsi.h>
 #include "bnxt.h"
 #include "bnxt_hwrm.h"
 #include "bnxt_coredump.h"
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c
index 71e14be2507e..a00b67334f9b 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_dcb.c
@@ -16,7 +16,7 @@
 #include <linux/pci.h>
 #include <linux/etherdevice.h>
 #include <rdma/ib_verbs.h>
-#include "bnxt_hsi.h"
+#include <linux/bnxt/hsi.h>
 #include "bnxt.h"
 #include "bnxt_hwrm.h"
 #include "bnxt_dcb.h"
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.c
index 127b7015f676..3324afbb3bec 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.c
@@ -10,7 +10,7 @@
 #include <linux/debugfs.h>
 #include <linux/module.h>
 #include <linux/pci.h>
-#include "bnxt_hsi.h"
+#include <linux/bnxt/hsi.h>
 #include <linux/dim.h>
 #include "bnxt.h"
 #include "bnxt_debugfs.h"
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.h
index d0bb4887acd0..a0a8d687dd99 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_debugfs.h
@@ -7,7 +7,7 @@
  * the Free Software Foundation.
  */
 
-#include "bnxt_hsi.h"
+#include <linux/bnxt/hsi.h>
 #include "bnxt.h"
 
 #ifdef CONFIG_DEBUG_FS
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
index 777880594a04..4c4581b0342e 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
@@ -12,7 +12,7 @@
 #include <linux/vmalloc.h>
 #include <net/devlink.h>
 #include <net/netdev_lock.h>
-#include "bnxt_hsi.h"
+#include <linux/bnxt/hsi.h>
 #include "bnxt.h"
 #include "bnxt_hwrm.h"
 #include "bnxt_vfr.h"
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c
index 6f6576dc417a..53a3bcb0efe0 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_dim.c
@@ -8,7 +8,7 @@
  */
 
 #include <linux/dim.h>
-#include "bnxt_hsi.h"
+#include <linux/bnxt/hsi.h>
 #include "bnxt.h"
 
 void bnxt_dim_work(struct work_struct *work)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index 4c10373abffd..1b37612b1c01 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -26,7 +26,7 @@
 #include <linux/timecounter.h>
 #include <net/netdev_queues.h>
 #include <net/netlink.h>
-#include "bnxt_hsi.h"
+#include <linux/bnxt/hsi.h>
 #include "bnxt.h"
 #include "bnxt_hwrm.h"
 #include "bnxt_ulp.h"
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h
deleted file mode 100644
index 549231703bce..000000000000
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h
+++ /dev/null
@@ -1,10914 +0,0 @@
-/* Broadcom NetXtreme-C/E network driver.
- *
- * Copyright (c) 2014-2016 Broadcom Corporation
- * Copyright (c) 2014-2018 Broadcom Limited
- * Copyright (c) 2018-2025 Broadcom Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation.
- *
- * DO NOT MODIFY!!! This file is automatically generated.
- */
-
-#ifndef _BNXT_HSI_H_
-#define _BNXT_HSI_H_
-
-/* hwrm_cmd_hdr (size:128b/16B) */
-struct hwrm_cmd_hdr {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-};
-
-/* hwrm_resp_hdr (size:64b/8B) */
-struct hwrm_resp_hdr {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-};
-
-#define CMD_DISCR_TLV_ENCAP 0x8000UL
-#define CMD_DISCR_LAST     CMD_DISCR_TLV_ENCAP
-
-
-#define TLV_TYPE_HWRM_REQUEST                    0x1UL
-#define TLV_TYPE_HWRM_RESPONSE                   0x2UL
-#define TLV_TYPE_ROCE_SP_COMMAND                 0x3UL
-#define TLV_TYPE_QUERY_ROCE_CC_GEN1              0x4UL
-#define TLV_TYPE_MODIFY_ROCE_CC_GEN1             0x5UL
-#define TLV_TYPE_QUERY_ROCE_CC_GEN2              0x6UL
-#define TLV_TYPE_MODIFY_ROCE_CC_GEN2             0x7UL
-#define TLV_TYPE_QUERY_ROCE_CC_GEN1_EXT          0x8UL
-#define TLV_TYPE_MODIFY_ROCE_CC_GEN1_EXT         0x9UL
-#define TLV_TYPE_QUERY_ROCE_CC_GEN2_EXT          0xaUL
-#define TLV_TYPE_MODIFY_ROCE_CC_GEN2_EXT         0xbUL
-#define TLV_TYPE_ENGINE_CKV_ALIAS_ECC_PUBLIC_KEY 0x8001UL
-#define TLV_TYPE_ENGINE_CKV_IV                   0x8003UL
-#define TLV_TYPE_ENGINE_CKV_AUTH_TAG             0x8004UL
-#define TLV_TYPE_ENGINE_CKV_CIPHERTEXT           0x8005UL
-#define TLV_TYPE_ENGINE_CKV_HOST_ALGORITHMS      0x8006UL
-#define TLV_TYPE_ENGINE_CKV_HOST_ECC_PUBLIC_KEY  0x8007UL
-#define TLV_TYPE_ENGINE_CKV_ECDSA_SIGNATURE      0x8008UL
-#define TLV_TYPE_ENGINE_CKV_FW_ECC_PUBLIC_KEY    0x8009UL
-#define TLV_TYPE_ENGINE_CKV_FW_ALGORITHMS        0x800aUL
-#define TLV_TYPE_LAST                           TLV_TYPE_ENGINE_CKV_FW_ALGORITHMS
-
-
-/* tlv (size:64b/8B) */
-struct tlv {
-	__le16	cmd_discr;
-	u8	reserved_8b;
-	u8	flags;
-	#define TLV_FLAGS_MORE         0x1UL
-	#define TLV_FLAGS_MORE_LAST      0x0UL
-	#define TLV_FLAGS_MORE_NOT_LAST  0x1UL
-	#define TLV_FLAGS_REQUIRED     0x2UL
-	#define TLV_FLAGS_REQUIRED_NO    (0x0UL << 1)
-	#define TLV_FLAGS_REQUIRED_YES   (0x1UL << 1)
-	#define TLV_FLAGS_REQUIRED_LAST TLV_FLAGS_REQUIRED_YES
-	__le16	tlv_type;
-	__le16	length;
-};
-
-/* input (size:128b/16B) */
-struct input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-};
-
-/* output (size:64b/8B) */
-struct output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-};
-
-/* hwrm_short_input (size:128b/16B) */
-struct hwrm_short_input {
-	__le16	req_type;
-	__le16	signature;
-	#define SHORT_REQ_SIGNATURE_SHORT_CMD 0x4321UL
-	#define SHORT_REQ_SIGNATURE_LAST     SHORT_REQ_SIGNATURE_SHORT_CMD
-	__le16	target_id;
-	#define SHORT_REQ_TARGET_ID_DEFAULT 0x0UL
-	#define SHORT_REQ_TARGET_ID_TOOLS   0xfffdUL
-	#define SHORT_REQ_TARGET_ID_LAST   SHORT_REQ_TARGET_ID_TOOLS
-	__le16	size;
-	__le64	req_addr;
-};
-
-/* cmd_nums (size:64b/8B) */
-struct cmd_nums {
-	__le16	req_type;
-	#define HWRM_VER_GET                              0x0UL
-	#define HWRM_FUNC_ECHO_RESPONSE                   0xbUL
-	#define HWRM_ERROR_RECOVERY_QCFG                  0xcUL
-	#define HWRM_FUNC_DRV_IF_CHANGE                   0xdUL
-	#define HWRM_FUNC_BUF_UNRGTR                      0xeUL
-	#define HWRM_FUNC_VF_CFG                          0xfUL
-	#define HWRM_RESERVED1                            0x10UL
-	#define HWRM_FUNC_RESET                           0x11UL
-	#define HWRM_FUNC_GETFID                          0x12UL
-	#define HWRM_FUNC_VF_ALLOC                        0x13UL
-	#define HWRM_FUNC_VF_FREE                         0x14UL
-	#define HWRM_FUNC_QCAPS                           0x15UL
-	#define HWRM_FUNC_QCFG                            0x16UL
-	#define HWRM_FUNC_CFG                             0x17UL
-	#define HWRM_FUNC_QSTATS                          0x18UL
-	#define HWRM_FUNC_CLR_STATS                       0x19UL
-	#define HWRM_FUNC_DRV_UNRGTR                      0x1aUL
-	#define HWRM_FUNC_VF_RESC_FREE                    0x1bUL
-	#define HWRM_FUNC_VF_VNIC_IDS_QUERY               0x1cUL
-	#define HWRM_FUNC_DRV_RGTR                        0x1dUL
-	#define HWRM_FUNC_DRV_QVER                        0x1eUL
-	#define HWRM_FUNC_BUF_RGTR                        0x1fUL
-	#define HWRM_PORT_PHY_CFG                         0x20UL
-	#define HWRM_PORT_MAC_CFG                         0x21UL
-	#define HWRM_PORT_TS_QUERY                        0x22UL
-	#define HWRM_PORT_QSTATS                          0x23UL
-	#define HWRM_PORT_LPBK_QSTATS                     0x24UL
-	#define HWRM_PORT_CLR_STATS                       0x25UL
-	#define HWRM_PORT_LPBK_CLR_STATS                  0x26UL
-	#define HWRM_PORT_PHY_QCFG                        0x27UL
-	#define HWRM_PORT_MAC_QCFG                        0x28UL
-	#define HWRM_PORT_MAC_PTP_QCFG                    0x29UL
-	#define HWRM_PORT_PHY_QCAPS                       0x2aUL
-	#define HWRM_PORT_PHY_I2C_WRITE                   0x2bUL
-	#define HWRM_PORT_PHY_I2C_READ                    0x2cUL
-	#define HWRM_PORT_LED_CFG                         0x2dUL
-	#define HWRM_PORT_LED_QCFG                        0x2eUL
-	#define HWRM_PORT_LED_QCAPS                       0x2fUL
-	#define HWRM_QUEUE_QPORTCFG                       0x30UL
-	#define HWRM_QUEUE_QCFG                           0x31UL
-	#define HWRM_QUEUE_CFG                            0x32UL
-	#define HWRM_FUNC_VLAN_CFG                        0x33UL
-	#define HWRM_FUNC_VLAN_QCFG                       0x34UL
-	#define HWRM_QUEUE_PFCENABLE_QCFG                 0x35UL
-	#define HWRM_QUEUE_PFCENABLE_CFG                  0x36UL
-	#define HWRM_QUEUE_PRI2COS_QCFG                   0x37UL
-	#define HWRM_QUEUE_PRI2COS_CFG                    0x38UL
-	#define HWRM_QUEUE_COS2BW_QCFG                    0x39UL
-	#define HWRM_QUEUE_COS2BW_CFG                     0x3aUL
-	#define HWRM_QUEUE_DSCP_QCAPS                     0x3bUL
-	#define HWRM_QUEUE_DSCP2PRI_QCFG                  0x3cUL
-	#define HWRM_QUEUE_DSCP2PRI_CFG                   0x3dUL
-	#define HWRM_VNIC_ALLOC                           0x40UL
-	#define HWRM_VNIC_FREE                            0x41UL
-	#define HWRM_VNIC_CFG                             0x42UL
-	#define HWRM_VNIC_QCFG                            0x43UL
-	#define HWRM_VNIC_TPA_CFG                         0x44UL
-	#define HWRM_VNIC_TPA_QCFG                        0x45UL
-	#define HWRM_VNIC_RSS_CFG                         0x46UL
-	#define HWRM_VNIC_RSS_QCFG                        0x47UL
-	#define HWRM_VNIC_PLCMODES_CFG                    0x48UL
-	#define HWRM_VNIC_PLCMODES_QCFG                   0x49UL
-	#define HWRM_VNIC_QCAPS                           0x4aUL
-	#define HWRM_VNIC_UPDATE                          0x4bUL
-	#define HWRM_RING_ALLOC                           0x50UL
-	#define HWRM_RING_FREE                            0x51UL
-	#define HWRM_RING_CMPL_RING_QAGGINT_PARAMS        0x52UL
-	#define HWRM_RING_CMPL_RING_CFG_AGGINT_PARAMS     0x53UL
-	#define HWRM_RING_AGGINT_QCAPS                    0x54UL
-	#define HWRM_RING_SCHQ_ALLOC                      0x55UL
-	#define HWRM_RING_SCHQ_CFG                        0x56UL
-	#define HWRM_RING_SCHQ_FREE                       0x57UL
-	#define HWRM_RING_RESET                           0x5eUL
-	#define HWRM_RING_GRP_ALLOC                       0x60UL
-	#define HWRM_RING_GRP_FREE                        0x61UL
-	#define HWRM_RING_CFG                             0x62UL
-	#define HWRM_RING_QCFG                            0x63UL
-	#define HWRM_RESERVED5                            0x64UL
-	#define HWRM_RESERVED6                            0x65UL
-	#define HWRM_VNIC_RSS_COS_LB_CTX_ALLOC            0x70UL
-	#define HWRM_VNIC_RSS_COS_LB_CTX_FREE             0x71UL
-	#define HWRM_QUEUE_MPLS_QCAPS                     0x80UL
-	#define HWRM_QUEUE_MPLSTC2PRI_QCFG                0x81UL
-	#define HWRM_QUEUE_MPLSTC2PRI_CFG                 0x82UL
-	#define HWRM_QUEUE_VLANPRI_QCAPS                  0x83UL
-	#define HWRM_QUEUE_VLANPRI2PRI_QCFG               0x84UL
-	#define HWRM_QUEUE_VLANPRI2PRI_CFG                0x85UL
-	#define HWRM_QUEUE_GLOBAL_CFG                     0x86UL
-	#define HWRM_QUEUE_GLOBAL_QCFG                    0x87UL
-	#define HWRM_QUEUE_ADPTV_QOS_RX_FEATURE_QCFG      0x88UL
-	#define HWRM_QUEUE_ADPTV_QOS_RX_FEATURE_CFG       0x89UL
-	#define HWRM_QUEUE_ADPTV_QOS_TX_FEATURE_QCFG      0x8aUL
-	#define HWRM_QUEUE_ADPTV_QOS_TX_FEATURE_CFG       0x8bUL
-	#define HWRM_QUEUE_QCAPS                          0x8cUL
-	#define HWRM_QUEUE_ADPTV_QOS_RX_TUNING_QCFG       0x8dUL
-	#define HWRM_QUEUE_ADPTV_QOS_RX_TUNING_CFG        0x8eUL
-	#define HWRM_QUEUE_ADPTV_QOS_TX_TUNING_QCFG       0x8fUL
-	#define HWRM_CFA_L2_FILTER_ALLOC                  0x90UL
-	#define HWRM_CFA_L2_FILTER_FREE                   0x91UL
-	#define HWRM_CFA_L2_FILTER_CFG                    0x92UL
-	#define HWRM_CFA_L2_SET_RX_MASK                   0x93UL
-	#define HWRM_CFA_VLAN_ANTISPOOF_CFG               0x94UL
-	#define HWRM_CFA_TUNNEL_FILTER_ALLOC              0x95UL
-	#define HWRM_CFA_TUNNEL_FILTER_FREE               0x96UL
-	#define HWRM_CFA_ENCAP_RECORD_ALLOC               0x97UL
-	#define HWRM_CFA_ENCAP_RECORD_FREE                0x98UL
-	#define HWRM_CFA_NTUPLE_FILTER_ALLOC              0x99UL
-	#define HWRM_CFA_NTUPLE_FILTER_FREE               0x9aUL
-	#define HWRM_CFA_NTUPLE_FILTER_CFG                0x9bUL
-	#define HWRM_CFA_EM_FLOW_ALLOC                    0x9cUL
-	#define HWRM_CFA_EM_FLOW_FREE                     0x9dUL
-	#define HWRM_CFA_EM_FLOW_CFG                      0x9eUL
-	#define HWRM_TUNNEL_DST_PORT_QUERY                0xa0UL
-	#define HWRM_TUNNEL_DST_PORT_ALLOC                0xa1UL
-	#define HWRM_TUNNEL_DST_PORT_FREE                 0xa2UL
-	#define HWRM_QUEUE_ADPTV_QOS_TX_TUNING_CFG        0xa3UL
-	#define HWRM_STAT_CTX_ENG_QUERY                   0xafUL
-	#define HWRM_STAT_CTX_ALLOC                       0xb0UL
-	#define HWRM_STAT_CTX_FREE                        0xb1UL
-	#define HWRM_STAT_CTX_QUERY                       0xb2UL
-	#define HWRM_STAT_CTX_CLR_STATS                   0xb3UL
-	#define HWRM_PORT_QSTATS_EXT                      0xb4UL
-	#define HWRM_PORT_PHY_MDIO_WRITE                  0xb5UL
-	#define HWRM_PORT_PHY_MDIO_READ                   0xb6UL
-	#define HWRM_PORT_PHY_MDIO_BUS_ACQUIRE            0xb7UL
-	#define HWRM_PORT_PHY_MDIO_BUS_RELEASE            0xb8UL
-	#define HWRM_PORT_QSTATS_EXT_PFC_WD               0xb9UL
-	#define HWRM_RESERVED7                            0xbaUL
-	#define HWRM_PORT_TX_FIR_CFG                      0xbbUL
-	#define HWRM_PORT_TX_FIR_QCFG                     0xbcUL
-	#define HWRM_PORT_ECN_QSTATS                      0xbdUL
-	#define HWRM_FW_LIVEPATCH_QUERY                   0xbeUL
-	#define HWRM_FW_LIVEPATCH                         0xbfUL
-	#define HWRM_FW_RESET                             0xc0UL
-	#define HWRM_FW_QSTATUS                           0xc1UL
-	#define HWRM_FW_HEALTH_CHECK                      0xc2UL
-	#define HWRM_FW_SYNC                              0xc3UL
-	#define HWRM_FW_STATE_QCAPS                       0xc4UL
-	#define HWRM_FW_STATE_QUIESCE                     0xc5UL
-	#define HWRM_FW_STATE_BACKUP                      0xc6UL
-	#define HWRM_FW_STATE_RESTORE                     0xc7UL
-	#define HWRM_FW_SET_TIME                          0xc8UL
-	#define HWRM_FW_GET_TIME                          0xc9UL
-	#define HWRM_FW_SET_STRUCTURED_DATA               0xcaUL
-	#define HWRM_FW_GET_STRUCTURED_DATA               0xcbUL
-	#define HWRM_FW_IPC_MAILBOX                       0xccUL
-	#define HWRM_FW_ECN_CFG                           0xcdUL
-	#define HWRM_FW_ECN_QCFG                          0xceUL
-	#define HWRM_FW_SECURE_CFG                        0xcfUL
-	#define HWRM_EXEC_FWD_RESP                        0xd0UL
-	#define HWRM_REJECT_FWD_RESP                      0xd1UL
-	#define HWRM_FWD_RESP                             0xd2UL
-	#define HWRM_FWD_ASYNC_EVENT_CMPL                 0xd3UL
-	#define HWRM_OEM_CMD                              0xd4UL
-	#define HWRM_PORT_PRBS_TEST                       0xd5UL
-	#define HWRM_PORT_SFP_SIDEBAND_CFG                0xd6UL
-	#define HWRM_PORT_SFP_SIDEBAND_QCFG               0xd7UL
-	#define HWRM_FW_STATE_UNQUIESCE                   0xd8UL
-	#define HWRM_PORT_DSC_DUMP                        0xd9UL
-	#define HWRM_PORT_EP_TX_QCFG                      0xdaUL
-	#define HWRM_PORT_EP_TX_CFG                       0xdbUL
-	#define HWRM_PORT_CFG                             0xdcUL
-	#define HWRM_PORT_QCFG                            0xddUL
-	#define HWRM_PORT_MAC_QCAPS                       0xdfUL
-	#define HWRM_TEMP_MONITOR_QUERY                   0xe0UL
-	#define HWRM_REG_POWER_QUERY                      0xe1UL
-	#define HWRM_CORE_FREQUENCY_QUERY                 0xe2UL
-	#define HWRM_REG_POWER_HISTOGRAM                  0xe3UL
-	#define HWRM_WOL_FILTER_ALLOC                     0xf0UL
-	#define HWRM_WOL_FILTER_FREE                      0xf1UL
-	#define HWRM_WOL_FILTER_QCFG                      0xf2UL
-	#define HWRM_WOL_REASON_QCFG                      0xf3UL
-	#define HWRM_CFA_METER_QCAPS                      0xf4UL
-	#define HWRM_CFA_METER_PROFILE_ALLOC              0xf5UL
-	#define HWRM_CFA_METER_PROFILE_FREE               0xf6UL
-	#define HWRM_CFA_METER_PROFILE_CFG                0xf7UL
-	#define HWRM_CFA_METER_INSTANCE_ALLOC             0xf8UL
-	#define HWRM_CFA_METER_INSTANCE_FREE              0xf9UL
-	#define HWRM_CFA_METER_INSTANCE_CFG               0xfaUL
-	#define HWRM_CFA_VFR_ALLOC                        0xfdUL
-	#define HWRM_CFA_VFR_FREE                         0xfeUL
-	#define HWRM_CFA_VF_PAIR_ALLOC                    0x100UL
-	#define HWRM_CFA_VF_PAIR_FREE                     0x101UL
-	#define HWRM_CFA_VF_PAIR_INFO                     0x102UL
-	#define HWRM_CFA_FLOW_ALLOC                       0x103UL
-	#define HWRM_CFA_FLOW_FREE                        0x104UL
-	#define HWRM_CFA_FLOW_FLUSH                       0x105UL
-	#define HWRM_CFA_FLOW_STATS                       0x106UL
-	#define HWRM_CFA_FLOW_INFO                        0x107UL
-	#define HWRM_CFA_DECAP_FILTER_ALLOC               0x108UL
-	#define HWRM_CFA_DECAP_FILTER_FREE                0x109UL
-	#define HWRM_CFA_VLAN_ANTISPOOF_QCFG              0x10aUL
-	#define HWRM_CFA_REDIRECT_TUNNEL_TYPE_ALLOC       0x10bUL
-	#define HWRM_CFA_REDIRECT_TUNNEL_TYPE_FREE        0x10cUL
-	#define HWRM_CFA_PAIR_ALLOC                       0x10dUL
-	#define HWRM_CFA_PAIR_FREE                        0x10eUL
-	#define HWRM_CFA_PAIR_INFO                        0x10fUL
-	#define HWRM_FW_IPC_MSG                           0x110UL
-	#define HWRM_CFA_REDIRECT_TUNNEL_TYPE_INFO        0x111UL
-	#define HWRM_CFA_REDIRECT_QUERY_TUNNEL_TYPE       0x112UL
-	#define HWRM_CFA_FLOW_AGING_TIMER_RESET           0x113UL
-	#define HWRM_CFA_FLOW_AGING_CFG                   0x114UL
-	#define HWRM_CFA_FLOW_AGING_QCFG                  0x115UL
-	#define HWRM_CFA_FLOW_AGING_QCAPS                 0x116UL
-	#define HWRM_CFA_CTX_MEM_RGTR                     0x117UL
-	#define HWRM_CFA_CTX_MEM_UNRGTR                   0x118UL
-	#define HWRM_CFA_CTX_MEM_QCTX                     0x119UL
-	#define HWRM_CFA_CTX_MEM_QCAPS                    0x11aUL
-	#define HWRM_CFA_COUNTER_QCAPS                    0x11bUL
-	#define HWRM_CFA_COUNTER_CFG                      0x11cUL
-	#define HWRM_CFA_COUNTER_QCFG                     0x11dUL
-	#define HWRM_CFA_COUNTER_QSTATS                   0x11eUL
-	#define HWRM_CFA_TCP_FLAG_PROCESS_QCFG            0x11fUL
-	#define HWRM_CFA_EEM_QCAPS                        0x120UL
-	#define HWRM_CFA_EEM_CFG                          0x121UL
-	#define HWRM_CFA_EEM_QCFG                         0x122UL
-	#define HWRM_CFA_EEM_OP                           0x123UL
-	#define HWRM_CFA_ADV_FLOW_MGNT_QCAPS              0x124UL
-	#define HWRM_CFA_TFLIB                            0x125UL
-	#define HWRM_CFA_LAG_GROUP_MEMBER_RGTR            0x126UL
-	#define HWRM_CFA_LAG_GROUP_MEMBER_UNRGTR          0x127UL
-	#define HWRM_CFA_TLS_FILTER_ALLOC                 0x128UL
-	#define HWRM_CFA_TLS_FILTER_FREE                  0x129UL
-	#define HWRM_CFA_RELEASE_AFM_FUNC                 0x12aUL
-	#define HWRM_ENGINE_CKV_STATUS                    0x12eUL
-	#define HWRM_ENGINE_CKV_CKEK_ADD                  0x12fUL
-	#define HWRM_ENGINE_CKV_CKEK_DELETE               0x130UL
-	#define HWRM_ENGINE_CKV_KEY_ADD                   0x131UL
-	#define HWRM_ENGINE_CKV_KEY_DELETE                0x132UL
-	#define HWRM_ENGINE_CKV_FLUSH                     0x133UL
-	#define HWRM_ENGINE_CKV_RNG_GET                   0x134UL
-	#define HWRM_ENGINE_CKV_KEY_GEN                   0x135UL
-	#define HWRM_ENGINE_CKV_KEY_LABEL_CFG             0x136UL
-	#define HWRM_ENGINE_CKV_KEY_LABEL_QCFG            0x137UL
-	#define HWRM_ENGINE_QG_CONFIG_QUERY               0x13cUL
-	#define HWRM_ENGINE_QG_QUERY                      0x13dUL
-	#define HWRM_ENGINE_QG_METER_PROFILE_CONFIG_QUERY 0x13eUL
-	#define HWRM_ENGINE_QG_METER_PROFILE_QUERY        0x13fUL
-	#define HWRM_ENGINE_QG_METER_PROFILE_ALLOC        0x140UL
-	#define HWRM_ENGINE_QG_METER_PROFILE_FREE         0x141UL
-	#define HWRM_ENGINE_QG_METER_QUERY                0x142UL
-	#define HWRM_ENGINE_QG_METER_BIND                 0x143UL
-	#define HWRM_ENGINE_QG_METER_UNBIND               0x144UL
-	#define HWRM_ENGINE_QG_FUNC_BIND                  0x145UL
-	#define HWRM_ENGINE_SG_CONFIG_QUERY               0x146UL
-	#define HWRM_ENGINE_SG_QUERY                      0x147UL
-	#define HWRM_ENGINE_SG_METER_QUERY                0x148UL
-	#define HWRM_ENGINE_SG_METER_CONFIG               0x149UL
-	#define HWRM_ENGINE_SG_QG_BIND                    0x14aUL
-	#define HWRM_ENGINE_QG_SG_UNBIND                  0x14bUL
-	#define HWRM_ENGINE_CONFIG_QUERY                  0x154UL
-	#define HWRM_ENGINE_STATS_CONFIG                  0x155UL
-	#define HWRM_ENGINE_STATS_CLEAR                   0x156UL
-	#define HWRM_ENGINE_STATS_QUERY                   0x157UL
-	#define HWRM_ENGINE_STATS_QUERY_CONTINUOUS_ERROR  0x158UL
-	#define HWRM_ENGINE_RQ_ALLOC                      0x15eUL
-	#define HWRM_ENGINE_RQ_FREE                       0x15fUL
-	#define HWRM_ENGINE_CQ_ALLOC                      0x160UL
-	#define HWRM_ENGINE_CQ_FREE                       0x161UL
-	#define HWRM_ENGINE_NQ_ALLOC                      0x162UL
-	#define HWRM_ENGINE_NQ_FREE                       0x163UL
-	#define HWRM_ENGINE_ON_DIE_RQE_CREDITS            0x164UL
-	#define HWRM_ENGINE_FUNC_QCFG                     0x165UL
-	#define HWRM_FUNC_RESOURCE_QCAPS                  0x190UL
-	#define HWRM_FUNC_VF_RESOURCE_CFG                 0x191UL
-	#define HWRM_FUNC_BACKING_STORE_QCAPS             0x192UL
-	#define HWRM_FUNC_BACKING_STORE_CFG               0x193UL
-	#define HWRM_FUNC_BACKING_STORE_QCFG              0x194UL
-	#define HWRM_FUNC_VF_BW_CFG                       0x195UL
-	#define HWRM_FUNC_VF_BW_QCFG                      0x196UL
-	#define HWRM_FUNC_HOST_PF_IDS_QUERY               0x197UL
-	#define HWRM_FUNC_QSTATS_EXT                      0x198UL
-	#define HWRM_STAT_EXT_CTX_QUERY                   0x199UL
-	#define HWRM_FUNC_SPD_CFG                         0x19aUL
-	#define HWRM_FUNC_SPD_QCFG                        0x19bUL
-	#define HWRM_FUNC_PTP_PIN_QCFG                    0x19cUL
-	#define HWRM_FUNC_PTP_PIN_CFG                     0x19dUL
-	#define HWRM_FUNC_PTP_CFG                         0x19eUL
-	#define HWRM_FUNC_PTP_TS_QUERY                    0x19fUL
-	#define HWRM_FUNC_PTP_EXT_CFG                     0x1a0UL
-	#define HWRM_FUNC_PTP_EXT_QCFG                    0x1a1UL
-	#define HWRM_FUNC_KEY_CTX_ALLOC                   0x1a2UL
-	#define HWRM_FUNC_BACKING_STORE_CFG_V2            0x1a3UL
-	#define HWRM_FUNC_BACKING_STORE_QCFG_V2           0x1a4UL
-	#define HWRM_FUNC_DBR_PACING_CFG                  0x1a5UL
-	#define HWRM_FUNC_DBR_PACING_QCFG                 0x1a6UL
-	#define HWRM_FUNC_DBR_PACING_BROADCAST_EVENT      0x1a7UL
-	#define HWRM_FUNC_BACKING_STORE_QCAPS_V2          0x1a8UL
-	#define HWRM_FUNC_DBR_PACING_NQLIST_QUERY         0x1a9UL
-	#define HWRM_FUNC_DBR_RECOVERY_COMPLETED          0x1aaUL
-	#define HWRM_FUNC_SYNCE_CFG                       0x1abUL
-	#define HWRM_FUNC_SYNCE_QCFG                      0x1acUL
-	#define HWRM_FUNC_KEY_CTX_FREE                    0x1adUL
-	#define HWRM_FUNC_LAG_MODE_CFG                    0x1aeUL
-	#define HWRM_FUNC_LAG_MODE_QCFG                   0x1afUL
-	#define HWRM_FUNC_LAG_CREATE                      0x1b0UL
-	#define HWRM_FUNC_LAG_UPDATE                      0x1b1UL
-	#define HWRM_FUNC_LAG_FREE                        0x1b2UL
-	#define HWRM_FUNC_LAG_QCFG                        0x1b3UL
-	#define HWRM_FUNC_TIMEDTX_PACING_RATE_ADD         0x1c2UL
-	#define HWRM_FUNC_TIMEDTX_PACING_RATE_DELETE      0x1c3UL
-	#define HWRM_FUNC_TIMEDTX_PACING_RATE_QUERY       0x1c4UL
-	#define HWRM_SELFTEST_QLIST                       0x200UL
-	#define HWRM_SELFTEST_EXEC                        0x201UL
-	#define HWRM_SELFTEST_IRQ                         0x202UL
-	#define HWRM_SELFTEST_RETRIEVE_SERDES_DATA        0x203UL
-	#define HWRM_PCIE_QSTATS                          0x204UL
-	#define HWRM_MFG_FRU_WRITE_CONTROL                0x205UL
-	#define HWRM_MFG_TIMERS_QUERY                     0x206UL
-	#define HWRM_MFG_OTP_CFG                          0x207UL
-	#define HWRM_MFG_OTP_QCFG                         0x208UL
-	#define HWRM_MFG_HDMA_TEST                        0x209UL
-	#define HWRM_MFG_FRU_EEPROM_WRITE                 0x20aUL
-	#define HWRM_MFG_FRU_EEPROM_READ                  0x20bUL
-	#define HWRM_MFG_SOC_IMAGE                        0x20cUL
-	#define HWRM_MFG_SOC_QSTATUS                      0x20dUL
-	#define HWRM_MFG_PARAM_CRITICAL_DATA_FINALIZE     0x20eUL
-	#define HWRM_MFG_PARAM_CRITICAL_DATA_READ         0x20fUL
-	#define HWRM_MFG_PARAM_CRITICAL_DATA_HEALTH       0x210UL
-	#define HWRM_MFG_PRVSN_EXPORT_CSR                 0x211UL
-	#define HWRM_MFG_PRVSN_IMPORT_CERT                0x212UL
-	#define HWRM_MFG_PRVSN_GET_STATE                  0x213UL
-	#define HWRM_MFG_GET_NVM_MEASUREMENT              0x214UL
-	#define HWRM_MFG_PSOC_QSTATUS                     0x215UL
-	#define HWRM_MFG_SELFTEST_QLIST                   0x216UL
-	#define HWRM_MFG_SELFTEST_EXEC                    0x217UL
-	#define HWRM_STAT_GENERIC_QSTATS                  0x218UL
-	#define HWRM_MFG_PRVSN_EXPORT_CERT                0x219UL
-	#define HWRM_STAT_DB_ERROR_QSTATS                 0x21aUL
-	#define HWRM_MFG_TESTS                            0x21bUL
-	#define HWRM_MFG_WRITE_CERT_NVM                   0x21cUL
-	#define HWRM_PORT_POE_CFG                         0x230UL
-	#define HWRM_PORT_POE_QCFG                        0x231UL
-	#define HWRM_UDCC_QCAPS                           0x258UL
-	#define HWRM_UDCC_CFG                             0x259UL
-	#define HWRM_UDCC_QCFG                            0x25aUL
-	#define HWRM_UDCC_SESSION_CFG                     0x25bUL
-	#define HWRM_UDCC_SESSION_QCFG                    0x25cUL
-	#define HWRM_UDCC_SESSION_QUERY                   0x25dUL
-	#define HWRM_UDCC_COMP_CFG                        0x25eUL
-	#define HWRM_UDCC_COMP_QCFG                       0x25fUL
-	#define HWRM_UDCC_COMP_QUERY                      0x260UL
-	#define HWRM_QUEUE_PFCWD_TIMEOUT_QCAPS            0x261UL
-	#define HWRM_QUEUE_PFCWD_TIMEOUT_CFG              0x262UL
-	#define HWRM_QUEUE_PFCWD_TIMEOUT_QCFG             0x263UL
-	#define HWRM_TF                                   0x2bcUL
-	#define HWRM_TF_VERSION_GET                       0x2bdUL
-	#define HWRM_TF_SESSION_OPEN                      0x2c6UL
-	#define HWRM_TF_SESSION_REGISTER                  0x2c8UL
-	#define HWRM_TF_SESSION_UNREGISTER                0x2c9UL
-	#define HWRM_TF_SESSION_CLOSE                     0x2caUL
-	#define HWRM_TF_SESSION_QCFG                      0x2cbUL
-	#define HWRM_TF_SESSION_RESC_QCAPS                0x2ccUL
-	#define HWRM_TF_SESSION_RESC_ALLOC                0x2cdUL
-	#define HWRM_TF_SESSION_RESC_FREE                 0x2ceUL
-	#define HWRM_TF_SESSION_RESC_FLUSH                0x2cfUL
-	#define HWRM_TF_SESSION_RESC_INFO                 0x2d0UL
-	#define HWRM_TF_SESSION_HOTUP_STATE_SET           0x2d1UL
-	#define HWRM_TF_SESSION_HOTUP_STATE_GET           0x2d2UL
-	#define HWRM_TF_TBL_TYPE_GET                      0x2daUL
-	#define HWRM_TF_TBL_TYPE_SET                      0x2dbUL
-	#define HWRM_TF_TBL_TYPE_BULK_GET                 0x2dcUL
-	#define HWRM_TF_EM_INSERT                         0x2eaUL
-	#define HWRM_TF_EM_DELETE                         0x2ebUL
-	#define HWRM_TF_EM_HASH_INSERT                    0x2ecUL
-	#define HWRM_TF_EM_MOVE                           0x2edUL
-	#define HWRM_TF_TCAM_SET                          0x2f8UL
-	#define HWRM_TF_TCAM_GET                          0x2f9UL
-	#define HWRM_TF_TCAM_MOVE                         0x2faUL
-	#define HWRM_TF_TCAM_FREE                         0x2fbUL
-	#define HWRM_TF_GLOBAL_CFG_SET                    0x2fcUL
-	#define HWRM_TF_GLOBAL_CFG_GET                    0x2fdUL
-	#define HWRM_TF_IF_TBL_SET                        0x2feUL
-	#define HWRM_TF_IF_TBL_GET                        0x2ffUL
-	#define HWRM_TF_RESC_USAGE_SET                    0x300UL
-	#define HWRM_TF_RESC_USAGE_QUERY                  0x301UL
-	#define HWRM_TF_TBL_TYPE_ALLOC                    0x302UL
-	#define HWRM_TF_TBL_TYPE_FREE                     0x303UL
-	#define HWRM_TFC_TBL_SCOPE_QCAPS                  0x380UL
-	#define HWRM_TFC_TBL_SCOPE_ID_ALLOC               0x381UL
-	#define HWRM_TFC_TBL_SCOPE_CONFIG                 0x382UL
-	#define HWRM_TFC_TBL_SCOPE_DECONFIG               0x383UL
-	#define HWRM_TFC_TBL_SCOPE_FID_ADD                0x384UL
-	#define HWRM_TFC_TBL_SCOPE_FID_REM                0x385UL
-	#define HWRM_TFC_TBL_SCOPE_POOL_ALLOC             0x386UL
-	#define HWRM_TFC_TBL_SCOPE_POOL_FREE              0x387UL
-	#define HWRM_TFC_SESSION_ID_ALLOC                 0x388UL
-	#define HWRM_TFC_SESSION_FID_ADD                  0x389UL
-	#define HWRM_TFC_SESSION_FID_REM                  0x38aUL
-	#define HWRM_TFC_IDENT_ALLOC                      0x38bUL
-	#define HWRM_TFC_IDENT_FREE                       0x38cUL
-	#define HWRM_TFC_IDX_TBL_ALLOC                    0x38dUL
-	#define HWRM_TFC_IDX_TBL_ALLOC_SET                0x38eUL
-	#define HWRM_TFC_IDX_TBL_SET                      0x38fUL
-	#define HWRM_TFC_IDX_TBL_GET                      0x390UL
-	#define HWRM_TFC_IDX_TBL_FREE                     0x391UL
-	#define HWRM_TFC_GLOBAL_ID_ALLOC                  0x392UL
-	#define HWRM_TFC_TCAM_SET                         0x393UL
-	#define HWRM_TFC_TCAM_GET                         0x394UL
-	#define HWRM_TFC_TCAM_ALLOC                       0x395UL
-	#define HWRM_TFC_TCAM_ALLOC_SET                   0x396UL
-	#define HWRM_TFC_TCAM_FREE                        0x397UL
-	#define HWRM_TFC_IF_TBL_SET                       0x398UL
-	#define HWRM_TFC_IF_TBL_GET                       0x399UL
-	#define HWRM_TFC_TBL_SCOPE_CONFIG_GET             0x39aUL
-	#define HWRM_TFC_RESC_USAGE_QUERY                 0x39bUL
-	#define HWRM_TFC_GLOBAL_ID_FREE                   0x39cUL
-	#define HWRM_TFC_TCAM_PRI_UPDATE                  0x39dUL
-	#define HWRM_TFC_HOT_UPGRADE_PROCESS              0x3a0UL
-	#define HWRM_SV                                   0x400UL
-	#define HWRM_DBG_SERDES_TEST                      0xff0eUL
-	#define HWRM_DBG_LOG_BUFFER_FLUSH                 0xff0fUL
-	#define HWRM_DBG_READ_DIRECT                      0xff10UL
-	#define HWRM_DBG_READ_INDIRECT                    0xff11UL
-	#define HWRM_DBG_WRITE_DIRECT                     0xff12UL
-	#define HWRM_DBG_WRITE_INDIRECT                   0xff13UL
-	#define HWRM_DBG_DUMP                             0xff14UL
-	#define HWRM_DBG_ERASE_NVM                        0xff15UL
-	#define HWRM_DBG_CFG                              0xff16UL
-	#define HWRM_DBG_COREDUMP_LIST                    0xff17UL
-	#define HWRM_DBG_COREDUMP_INITIATE                0xff18UL
-	#define HWRM_DBG_COREDUMP_RETRIEVE                0xff19UL
-	#define HWRM_DBG_FW_CLI                           0xff1aUL
-	#define HWRM_DBG_I2C_CMD                          0xff1bUL
-	#define HWRM_DBG_RING_INFO_GET                    0xff1cUL
-	#define HWRM_DBG_CRASHDUMP_HEADER                 0xff1dUL
-	#define HWRM_DBG_CRASHDUMP_ERASE                  0xff1eUL
-	#define HWRM_DBG_DRV_TRACE                        0xff1fUL
-	#define HWRM_DBG_QCAPS                            0xff20UL
-	#define HWRM_DBG_QCFG                             0xff21UL
-	#define HWRM_DBG_CRASHDUMP_MEDIUM_CFG             0xff22UL
-	#define HWRM_DBG_USEQ_ALLOC                       0xff23UL
-	#define HWRM_DBG_USEQ_FREE                        0xff24UL
-	#define HWRM_DBG_USEQ_FLUSH                       0xff25UL
-	#define HWRM_DBG_USEQ_QCAPS                       0xff26UL
-	#define HWRM_DBG_USEQ_CW_CFG                      0xff27UL
-	#define HWRM_DBG_USEQ_SCHED_CFG                   0xff28UL
-	#define HWRM_DBG_USEQ_RUN                         0xff29UL
-	#define HWRM_DBG_USEQ_DELIVERY_REQ                0xff2aUL
-	#define HWRM_DBG_USEQ_RESP_HDR                    0xff2bUL
-	#define HWRM_DBG_COREDUMP_CAPTURE                 0xff2cUL
-	#define HWRM_DBG_PTRACE                           0xff2dUL
-	#define HWRM_DBG_SIM_CABLE_STATE                  0xff2eUL
-	#define HWRM_NVM_GET_VPD_FIELD_INFO               0xffeaUL
-	#define HWRM_NVM_SET_VPD_FIELD_INFO               0xffebUL
-	#define HWRM_NVM_DEFRAG                           0xffecUL
-	#define HWRM_NVM_REQ_ARBITRATION                  0xffedUL
-	#define HWRM_NVM_FACTORY_DEFAULTS                 0xffeeUL
-	#define HWRM_NVM_VALIDATE_OPTION                  0xffefUL
-	#define HWRM_NVM_FLUSH                            0xfff0UL
-	#define HWRM_NVM_GET_VARIABLE                     0xfff1UL
-	#define HWRM_NVM_SET_VARIABLE                     0xfff2UL
-	#define HWRM_NVM_INSTALL_UPDATE                   0xfff3UL
-	#define HWRM_NVM_MODIFY                           0xfff4UL
-	#define HWRM_NVM_VERIFY_UPDATE                    0xfff5UL
-	#define HWRM_NVM_GET_DEV_INFO                     0xfff6UL
-	#define HWRM_NVM_ERASE_DIR_ENTRY                  0xfff7UL
-	#define HWRM_NVM_MOD_DIR_ENTRY                    0xfff8UL
-	#define HWRM_NVM_FIND_DIR_ENTRY                   0xfff9UL
-	#define HWRM_NVM_GET_DIR_ENTRIES                  0xfffaUL
-	#define HWRM_NVM_GET_DIR_INFO                     0xfffbUL
-	#define HWRM_NVM_RAW_DUMP                         0xfffcUL
-	#define HWRM_NVM_READ                             0xfffdUL
-	#define HWRM_NVM_WRITE                            0xfffeUL
-	#define HWRM_NVM_RAW_WRITE_BLK                    0xffffUL
-	#define HWRM_LAST                                HWRM_NVM_RAW_WRITE_BLK
-	__le16	unused_0[3];
-};
-
-/* ret_codes (size:64b/8B) */
-struct ret_codes {
-	__le16	error_code;
-	#define HWRM_ERR_CODE_SUCCESS                      0x0UL
-	#define HWRM_ERR_CODE_FAIL                         0x1UL
-	#define HWRM_ERR_CODE_INVALID_PARAMS               0x2UL
-	#define HWRM_ERR_CODE_RESOURCE_ACCESS_DENIED       0x3UL
-	#define HWRM_ERR_CODE_RESOURCE_ALLOC_ERROR         0x4UL
-	#define HWRM_ERR_CODE_INVALID_FLAGS                0x5UL
-	#define HWRM_ERR_CODE_INVALID_ENABLES              0x6UL
-	#define HWRM_ERR_CODE_UNSUPPORTED_TLV              0x7UL
-	#define HWRM_ERR_CODE_NO_BUFFER                    0x8UL
-	#define HWRM_ERR_CODE_UNSUPPORTED_OPTION_ERR       0x9UL
-	#define HWRM_ERR_CODE_HOT_RESET_PROGRESS           0xaUL
-	#define HWRM_ERR_CODE_HOT_RESET_FAIL               0xbUL
-	#define HWRM_ERR_CODE_NO_FLOW_COUNTER_DURING_ALLOC 0xcUL
-	#define HWRM_ERR_CODE_KEY_HASH_COLLISION           0xdUL
-	#define HWRM_ERR_CODE_KEY_ALREADY_EXISTS           0xeUL
-	#define HWRM_ERR_CODE_HWRM_ERROR                   0xfUL
-	#define HWRM_ERR_CODE_BUSY                         0x10UL
-	#define HWRM_ERR_CODE_RESOURCE_LOCKED              0x11UL
-	#define HWRM_ERR_CODE_PF_UNAVAILABLE               0x12UL
-	#define HWRM_ERR_CODE_ENTITY_NOT_PRESENT           0x13UL
-	#define HWRM_ERR_CODE_SECURE_SOC_ERROR             0x14UL
-	#define HWRM_ERR_CODE_TLV_ENCAPSULATED_RESPONSE    0x8000UL
-	#define HWRM_ERR_CODE_UNKNOWN_ERR                  0xfffeUL
-	#define HWRM_ERR_CODE_CMD_NOT_SUPPORTED            0xffffUL
-	#define HWRM_ERR_CODE_LAST                        HWRM_ERR_CODE_CMD_NOT_SUPPORTED
-	__le16	unused_0[3];
-};
-
-/* hwrm_err_output (size:128b/16B) */
-struct hwrm_err_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le32	opaque_0;
-	__le16	opaque_1;
-	u8	cmd_err;
-	u8	valid;
-};
-#define HWRM_NA_SIGNATURE ((__le32)(-1))
-#define HWRM_MAX_REQ_LEN 128
-#define HWRM_MAX_RESP_LEN 704
-#define HW_HASH_INDEX_SIZE 0x80
-#define HW_HASH_KEY_SIZE 40
-#define HWRM_RESP_VALID_KEY 1
-#define HWRM_TARGET_ID_BONO 0xFFF8
-#define HWRM_TARGET_ID_KONG 0xFFF9
-#define HWRM_TARGET_ID_APE 0xFFFA
-#define HWRM_TARGET_ID_TOOLS 0xFFFD
-#define HWRM_VERSION_MAJOR 1
-#define HWRM_VERSION_MINOR 10
-#define HWRM_VERSION_UPDATE 3
-#define HWRM_VERSION_RSVD 97
-#define HWRM_VERSION_STR "1.10.3.97"
-
-/* hwrm_ver_get_input (size:192b/24B) */
-struct hwrm_ver_get_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	u8	hwrm_intf_maj;
-	u8	hwrm_intf_min;
-	u8	hwrm_intf_upd;
-	u8	unused_0[5];
-};
-
-/* hwrm_ver_get_output (size:1408b/176B) */
-struct hwrm_ver_get_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	hwrm_intf_maj_8b;
-	u8	hwrm_intf_min_8b;
-	u8	hwrm_intf_upd_8b;
-	u8	hwrm_intf_rsvd_8b;
-	u8	hwrm_fw_maj_8b;
-	u8	hwrm_fw_min_8b;
-	u8	hwrm_fw_bld_8b;
-	u8	hwrm_fw_rsvd_8b;
-	u8	mgmt_fw_maj_8b;
-	u8	mgmt_fw_min_8b;
-	u8	mgmt_fw_bld_8b;
-	u8	mgmt_fw_rsvd_8b;
-	u8	netctrl_fw_maj_8b;
-	u8	netctrl_fw_min_8b;
-	u8	netctrl_fw_bld_8b;
-	u8	netctrl_fw_rsvd_8b;
-	__le32	dev_caps_cfg;
-	#define VER_GET_RESP_DEV_CAPS_CFG_SECURE_FW_UPD_SUPPORTED                  0x1UL
-	#define VER_GET_RESP_DEV_CAPS_CFG_FW_DCBX_AGENT_SUPPORTED                  0x2UL
-	#define VER_GET_RESP_DEV_CAPS_CFG_SHORT_CMD_SUPPORTED                      0x4UL
-	#define VER_GET_RESP_DEV_CAPS_CFG_SHORT_CMD_REQUIRED                       0x8UL
-	#define VER_GET_RESP_DEV_CAPS_CFG_KONG_MB_CHNL_SUPPORTED                   0x10UL
-	#define VER_GET_RESP_DEV_CAPS_CFG_FLOW_HANDLE_64BIT_SUPPORTED              0x20UL
-	#define VER_GET_RESP_DEV_CAPS_CFG_L2_FILTER_TYPES_ROCE_OR_L2_SUPPORTED     0x40UL
-	#define VER_GET_RESP_DEV_CAPS_CFG_VIRTIO_VSWITCH_OFFLOAD_SUPPORTED         0x80UL
-	#define VER_GET_RESP_DEV_CAPS_CFG_TRUSTED_VF_SUPPORTED                     0x100UL
-	#define VER_GET_RESP_DEV_CAPS_CFG_FLOW_AGING_SUPPORTED                     0x200UL
-	#define VER_GET_RESP_DEV_CAPS_CFG_ADV_FLOW_COUNTERS_SUPPORTED              0x400UL
-	#define VER_GET_RESP_DEV_CAPS_CFG_CFA_EEM_SUPPORTED                        0x800UL
-	#define VER_GET_RESP_DEV_CAPS_CFG_CFA_ADV_FLOW_MGNT_SUPPORTED              0x1000UL
-	#define VER_GET_RESP_DEV_CAPS_CFG_CFA_TFLIB_SUPPORTED                      0x2000UL
-	#define VER_GET_RESP_DEV_CAPS_CFG_CFA_TRUFLOW_SUPPORTED                    0x4000UL
-	#define VER_GET_RESP_DEV_CAPS_CFG_SECURE_BOOT_CAPABLE                      0x8000UL
-	#define VER_GET_RESP_DEV_CAPS_CFG_SECURE_SOC_CAPABLE                       0x10000UL
-	u8	roce_fw_maj_8b;
-	u8	roce_fw_min_8b;
-	u8	roce_fw_bld_8b;
-	u8	roce_fw_rsvd_8b;
-	char	hwrm_fw_name[16];
-	char	mgmt_fw_name[16];
-	char	netctrl_fw_name[16];
-	char	active_pkg_name[16];
-	char	roce_fw_name[16];
-	__le16	chip_num;
-	u8	chip_rev;
-	u8	chip_metal;
-	u8	chip_bond_id;
-	u8	chip_platform_type;
-	#define VER_GET_RESP_CHIP_PLATFORM_TYPE_ASIC      0x0UL
-	#define VER_GET_RESP_CHIP_PLATFORM_TYPE_FPGA      0x1UL
-	#define VER_GET_RESP_CHIP_PLATFORM_TYPE_PALLADIUM 0x2UL
-	#define VER_GET_RESP_CHIP_PLATFORM_TYPE_LAST     VER_GET_RESP_CHIP_PLATFORM_TYPE_PALLADIUM
-	__le16	max_req_win_len;
-	__le16	max_resp_len;
-	__le16	def_req_timeout;
-	u8	flags;
-	#define VER_GET_RESP_FLAGS_DEV_NOT_RDY                   0x1UL
-	#define VER_GET_RESP_FLAGS_EXT_VER_AVAIL                 0x2UL
-	#define VER_GET_RESP_FLAGS_DEV_NOT_RDY_BACKING_STORE     0x4UL
-	u8	unused_0[2];
-	u8	always_1;
-	__le16	hwrm_intf_major;
-	__le16	hwrm_intf_minor;
-	__le16	hwrm_intf_build;
-	__le16	hwrm_intf_patch;
-	__le16	hwrm_fw_major;
-	__le16	hwrm_fw_minor;
-	__le16	hwrm_fw_build;
-	__le16	hwrm_fw_patch;
-	__le16	mgmt_fw_major;
-	__le16	mgmt_fw_minor;
-	__le16	mgmt_fw_build;
-	__le16	mgmt_fw_patch;
-	__le16	netctrl_fw_major;
-	__le16	netctrl_fw_minor;
-	__le16	netctrl_fw_build;
-	__le16	netctrl_fw_patch;
-	__le16	roce_fw_major;
-	__le16	roce_fw_minor;
-	__le16	roce_fw_build;
-	__le16	roce_fw_patch;
-	__le16	max_ext_req_len;
-	__le16	max_req_timeout;
-	u8	unused_1[3];
-	u8	valid;
-};
-
-/* eject_cmpl (size:128b/16B) */
-struct eject_cmpl {
-	__le16	type;
-	#define EJECT_CMPL_TYPE_MASK       0x3fUL
-	#define EJECT_CMPL_TYPE_SFT        0
-	#define EJECT_CMPL_TYPE_STAT_EJECT   0x1aUL
-	#define EJECT_CMPL_TYPE_LAST        EJECT_CMPL_TYPE_STAT_EJECT
-	#define EJECT_CMPL_FLAGS_MASK      0xffc0UL
-	#define EJECT_CMPL_FLAGS_SFT       6
-	#define EJECT_CMPL_FLAGS_ERROR      0x40UL
-	__le16	len;
-	__le32	opaque;
-	__le16	v;
-	#define EJECT_CMPL_V                              0x1UL
-	#define EJECT_CMPL_ERRORS_MASK                    0xfffeUL
-	#define EJECT_CMPL_ERRORS_SFT                     1
-	#define EJECT_CMPL_ERRORS_BUFFER_ERROR_MASK        0xeUL
-	#define EJECT_CMPL_ERRORS_BUFFER_ERROR_SFT         1
-	#define EJECT_CMPL_ERRORS_BUFFER_ERROR_NO_BUFFER     (0x0UL << 1)
-	#define EJECT_CMPL_ERRORS_BUFFER_ERROR_DID_NOT_FIT   (0x1UL << 1)
-	#define EJECT_CMPL_ERRORS_BUFFER_ERROR_BAD_FORMAT    (0x3UL << 1)
-	#define EJECT_CMPL_ERRORS_BUFFER_ERROR_FLUSH         (0x5UL << 1)
-	#define EJECT_CMPL_ERRORS_BUFFER_ERROR_LAST         EJECT_CMPL_ERRORS_BUFFER_ERROR_FLUSH
-	__le16	reserved16;
-	__le32	unused_2;
-};
-
-/* hwrm_cmpl (size:128b/16B) */
-struct hwrm_cmpl {
-	__le16	type;
-	#define CMPL_TYPE_MASK     0x3fUL
-	#define CMPL_TYPE_SFT      0
-	#define CMPL_TYPE_HWRM_DONE  0x20UL
-	#define CMPL_TYPE_LAST      CMPL_TYPE_HWRM_DONE
-	__le16	sequence_id;
-	__le32	unused_1;
-	__le32	v;
-	#define CMPL_V     0x1UL
-	__le32	unused_3;
-};
-
-/* hwrm_fwd_req_cmpl (size:128b/16B) */
-struct hwrm_fwd_req_cmpl {
-	__le16	req_len_type;
-	#define FWD_REQ_CMPL_TYPE_MASK        0x3fUL
-	#define FWD_REQ_CMPL_TYPE_SFT         0
-	#define FWD_REQ_CMPL_TYPE_HWRM_FWD_REQ  0x22UL
-	#define FWD_REQ_CMPL_TYPE_LAST         FWD_REQ_CMPL_TYPE_HWRM_FWD_REQ
-	#define FWD_REQ_CMPL_REQ_LEN_MASK     0xffc0UL
-	#define FWD_REQ_CMPL_REQ_LEN_SFT      6
-	__le16	source_id;
-	__le32	unused0;
-	__le32	req_buf_addr_v[2];
-	#define FWD_REQ_CMPL_V                0x1UL
-	#define FWD_REQ_CMPL_REQ_BUF_ADDR_MASK 0xfffffffeUL
-	#define FWD_REQ_CMPL_REQ_BUF_ADDR_SFT 1
-};
-
-/* hwrm_fwd_resp_cmpl (size:128b/16B) */
-struct hwrm_fwd_resp_cmpl {
-	__le16	type;
-	#define FWD_RESP_CMPL_TYPE_MASK         0x3fUL
-	#define FWD_RESP_CMPL_TYPE_SFT          0
-	#define FWD_RESP_CMPL_TYPE_HWRM_FWD_RESP  0x24UL
-	#define FWD_RESP_CMPL_TYPE_LAST          FWD_RESP_CMPL_TYPE_HWRM_FWD_RESP
-	__le16	source_id;
-	__le16	resp_len;
-	__le16	unused_1;
-	__le32	resp_buf_addr_v[2];
-	#define FWD_RESP_CMPL_V                 0x1UL
-	#define FWD_RESP_CMPL_RESP_BUF_ADDR_MASK 0xfffffffeUL
-	#define FWD_RESP_CMPL_RESP_BUF_ADDR_SFT 1
-};
-
-/* hwrm_async_event_cmpl (size:128b/16B) */
-struct hwrm_async_event_cmpl {
-	__le16	type;
-	#define ASYNC_EVENT_CMPL_TYPE_MASK            0x3fUL
-	#define ASYNC_EVENT_CMPL_TYPE_SFT             0
-	#define ASYNC_EVENT_CMPL_TYPE_HWRM_ASYNC_EVENT  0x2eUL
-	#define ASYNC_EVENT_CMPL_TYPE_LAST             ASYNC_EVENT_CMPL_TYPE_HWRM_ASYNC_EVENT
-	__le16	event_id;
-	#define ASYNC_EVENT_CMPL_EVENT_ID_LINK_STATUS_CHANGE              0x0UL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_LINK_MTU_CHANGE                 0x1UL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_LINK_SPEED_CHANGE               0x2UL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_DCB_CONFIG_CHANGE               0x3UL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_PORT_CONN_NOT_ALLOWED           0x4UL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_LINK_SPEED_CFG_NOT_ALLOWED      0x5UL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_LINK_SPEED_CFG_CHANGE           0x6UL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_PORT_PHY_CFG_CHANGE             0x7UL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_RESET_NOTIFY                    0x8UL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_ERROR_RECOVERY                  0x9UL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_RING_MONITOR_MSG                0xaUL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_FUNC_DRVR_UNLOAD                0x10UL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_FUNC_DRVR_LOAD                  0x11UL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_FUNC_FLR_PROC_CMPLT             0x12UL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_PF_DRVR_UNLOAD                  0x20UL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_PF_DRVR_LOAD                    0x21UL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_VF_FLR                          0x30UL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_VF_MAC_ADDR_CHANGE              0x31UL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_PF_VF_COMM_STATUS_CHANGE        0x32UL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_VF_CFG_CHANGE                   0x33UL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_LLFC_PFC_CHANGE                 0x34UL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_DEFAULT_VNIC_CHANGE             0x35UL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_HW_FLOW_AGED                    0x36UL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_DEBUG_NOTIFICATION              0x37UL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_EEM_CACHE_FLUSH_REQ             0x38UL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_EEM_CACHE_FLUSH_DONE            0x39UL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_TCP_FLAG_ACTION_CHANGE          0x3aUL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_EEM_FLOW_ACTIVE                 0x3bUL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_EEM_CFG_CHANGE                  0x3cUL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_TFLIB_DEFAULT_VNIC_CHANGE       0x3dUL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_TFLIB_LINK_STATUS_CHANGE        0x3eUL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_QUIESCE_DONE                    0x3fUL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_DEFERRED_RESPONSE               0x40UL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_PFC_WATCHDOG_CFG_CHANGE         0x41UL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_ECHO_REQUEST                    0x42UL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_PHC_UPDATE                      0x43UL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_PPS_TIMESTAMP                   0x44UL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_ERROR_REPORT                    0x45UL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_DOORBELL_PACING_THRESHOLD       0x46UL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_RSS_CHANGE                      0x47UL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_DOORBELL_PACING_NQ_UPDATE       0x48UL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_HW_DOORBELL_RECOVERY_READ_ERROR 0x49UL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_CTX_ERROR                       0x4aUL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_UDCC_SESSION_CHANGE             0x4bUL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_DBG_BUF_PRODUCER                0x4cUL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_PEER_MMAP_CHANGE                0x4dUL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_REPRESENTOR_PAIR_CHANGE         0x4eUL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_VF_STAT_CHANGE                  0x4fUL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_HOST_COREDUMP                   0x50UL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_MAX_RGTR_EVENT_ID               0x51UL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_FW_TRACE_MSG                    0xfeUL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_HWRM_ERROR                      0xffUL
-	#define ASYNC_EVENT_CMPL_EVENT_ID_LAST                           ASYNC_EVENT_CMPL_EVENT_ID_HWRM_ERROR
-	__le32	event_data2;
-	u8	opaque_v;
-	#define ASYNC_EVENT_CMPL_V          0x1UL
-	#define ASYNC_EVENT_CMPL_OPAQUE_MASK 0xfeUL
-	#define ASYNC_EVENT_CMPL_OPAQUE_SFT 1
-	u8	timestamp_lo;
-	__le16	timestamp_hi;
-	__le32	event_data1;
-};
-
-/* hwrm_async_event_cmpl_link_status_change (size:128b/16B) */
-struct hwrm_async_event_cmpl_link_status_change {
-	__le16	type;
-	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_TYPE_MASK            0x3fUL
-	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_TYPE_SFT             0
-	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_TYPE_HWRM_ASYNC_EVENT  0x2eUL
-	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_TYPE_LAST             ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_TYPE_HWRM_ASYNC_EVENT
-	__le16	event_id;
-	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_ID_LINK_STATUS_CHANGE 0x0UL
-	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_ID_LAST              ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_ID_LINK_STATUS_CHANGE
-	__le32	event_data2;
-	u8	opaque_v;
-	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_V          0x1UL
-	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_OPAQUE_MASK 0xfeUL
-	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_OPAQUE_SFT 1
-	u8	timestamp_lo;
-	__le16	timestamp_hi;
-	__le32	event_data1;
-	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_DATA1_LINK_CHANGE     0x1UL
-	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_DATA1_LINK_CHANGE_DOWN  0x0UL
-	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_DATA1_LINK_CHANGE_UP    0x1UL
-	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_DATA1_LINK_CHANGE_LAST ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_DATA1_LINK_CHANGE_UP
-	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_DATA1_PORT_MASK       0xeUL
-	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_DATA1_PORT_SFT        1
-	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_DATA1_PORT_ID_MASK    0xffff0UL
-	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_DATA1_PORT_ID_SFT     4
-	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_DATA1_PF_ID_MASK      0xff00000UL
-	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_DATA1_PF_ID_SFT       20
-};
-
-/* hwrm_async_event_cmpl_port_conn_not_allowed (size:128b/16B) */
-struct hwrm_async_event_cmpl_port_conn_not_allowed {
-	__le16	type;
-	#define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_TYPE_MASK            0x3fUL
-	#define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_TYPE_SFT             0
-	#define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_TYPE_HWRM_ASYNC_EVENT  0x2eUL
-	#define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_TYPE_LAST             ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_TYPE_HWRM_ASYNC_EVENT
-	__le16	event_id;
-	#define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_ID_PORT_CONN_NOT_ALLOWED 0x4UL
-	#define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_ID_LAST                 ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_ID_PORT_CONN_NOT_ALLOWED
-	__le32	event_data2;
-	u8	opaque_v;
-	#define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_V          0x1UL
-	#define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_OPAQUE_MASK 0xfeUL
-	#define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_OPAQUE_SFT 1
-	u8	timestamp_lo;
-	__le16	timestamp_hi;
-	__le32	event_data1;
-	#define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_DATA1_PORT_ID_MASK                 0xffffUL
-	#define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_DATA1_PORT_ID_SFT                  0
-	#define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_DATA1_ENFORCEMENT_POLICY_MASK      0xff0000UL
-	#define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_DATA1_ENFORCEMENT_POLICY_SFT       16
-	#define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_DATA1_ENFORCEMENT_POLICY_NONE        (0x0UL << 16)
-	#define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_DATA1_ENFORCEMENT_POLICY_DISABLETX   (0x1UL << 16)
-	#define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_DATA1_ENFORCEMENT_POLICY_WARNINGMSG  (0x2UL << 16)
-	#define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_DATA1_ENFORCEMENT_POLICY_PWRDOWN     (0x3UL << 16)
-	#define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_DATA1_ENFORCEMENT_POLICY_LAST       ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_DATA1_ENFORCEMENT_POLICY_PWRDOWN
-};
-
-/* hwrm_async_event_cmpl_link_speed_cfg_change (size:128b/16B) */
-struct hwrm_async_event_cmpl_link_speed_cfg_change {
-	__le16	type;
-	#define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_TYPE_MASK            0x3fUL
-	#define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_TYPE_SFT             0
-	#define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_TYPE_HWRM_ASYNC_EVENT  0x2eUL
-	#define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_TYPE_LAST             ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_TYPE_HWRM_ASYNC_EVENT
-	__le16	event_id;
-	#define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_EVENT_ID_LINK_SPEED_CFG_CHANGE 0x6UL
-	#define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_EVENT_ID_LAST                 ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_EVENT_ID_LINK_SPEED_CFG_CHANGE
-	__le32	event_data2;
-	u8	opaque_v;
-	#define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_V          0x1UL
-	#define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_OPAQUE_MASK 0xfeUL
-	#define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_OPAQUE_SFT 1
-	u8	timestamp_lo;
-	__le16	timestamp_hi;
-	__le32	event_data1;
-	#define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_EVENT_DATA1_PORT_ID_MASK                     0xffffUL
-	#define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_EVENT_DATA1_PORT_ID_SFT                      0
-	#define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_EVENT_DATA1_SUPPORTED_LINK_SPEEDS_CHANGE     0x10000UL
-	#define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_EVENT_DATA1_ILLEGAL_LINK_SPEED_CFG           0x20000UL
-};
-
-/* hwrm_async_event_cmpl_reset_notify (size:128b/16B) */
-struct hwrm_async_event_cmpl_reset_notify {
-	__le16	type;
-	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_TYPE_MASK            0x3fUL
-	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_TYPE_SFT             0
-	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_TYPE_HWRM_ASYNC_EVENT  0x2eUL
-	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_TYPE_LAST             ASYNC_EVENT_CMPL_RESET_NOTIFY_TYPE_HWRM_ASYNC_EVENT
-	__le16	event_id;
-	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_ID_RESET_NOTIFY 0x8UL
-	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_ID_LAST        ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_ID_RESET_NOTIFY
-	__le32	event_data2;
-	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA2_FW_STATUS_CODE_MASK 0xffffUL
-	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA2_FW_STATUS_CODE_SFT 0
-	u8	opaque_v;
-	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_V          0x1UL
-	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_OPAQUE_MASK 0xfeUL
-	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_OPAQUE_SFT 1
-	u8	timestamp_lo;
-	__le16	timestamp_hi;
-	__le32	event_data1;
-	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_DRIVER_ACTION_MASK                  0xffUL
-	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_DRIVER_ACTION_SFT                   0
-	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_DRIVER_ACTION_DRIVER_STOP_TX_QUEUE    0x1UL
-	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_DRIVER_ACTION_DRIVER_IFDOWN           0x2UL
-	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_DRIVER_ACTION_LAST                   ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_DRIVER_ACTION_DRIVER_IFDOWN
-	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_REASON_CODE_MASK                    0xff00UL
-	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_REASON_CODE_SFT                     8
-	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_REASON_CODE_MANAGEMENT_RESET_REQUEST  (0x1UL << 8)
-	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_REASON_CODE_FW_EXCEPTION_FATAL        (0x2UL << 8)
-	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_REASON_CODE_FW_EXCEPTION_NON_FATAL    (0x3UL << 8)
-	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_REASON_CODE_FAST_RESET                (0x4UL << 8)
-	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_REASON_CODE_FW_ACTIVATION             (0x5UL << 8)
-	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_REASON_CODE_LAST                     ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_REASON_CODE_FW_ACTIVATION
-	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_DELAY_IN_100MS_TICKS_MASK           0xffff0000UL
-	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_DELAY_IN_100MS_TICKS_SFT            16
-};
-
-/* hwrm_async_event_cmpl_error_recovery (size:128b/16B) */
-struct hwrm_async_event_cmpl_error_recovery {
-	__le16	type;
-	#define ASYNC_EVENT_CMPL_ERROR_RECOVERY_TYPE_MASK            0x3fUL
-	#define ASYNC_EVENT_CMPL_ERROR_RECOVERY_TYPE_SFT             0
-	#define ASYNC_EVENT_CMPL_ERROR_RECOVERY_TYPE_HWRM_ASYNC_EVENT  0x2eUL
-	#define ASYNC_EVENT_CMPL_ERROR_RECOVERY_TYPE_LAST             ASYNC_EVENT_CMPL_ERROR_RECOVERY_TYPE_HWRM_ASYNC_EVENT
-	__le16	event_id;
-	#define ASYNC_EVENT_CMPL_ERROR_RECOVERY_EVENT_ID_ERROR_RECOVERY 0x9UL
-	#define ASYNC_EVENT_CMPL_ERROR_RECOVERY_EVENT_ID_LAST          ASYNC_EVENT_CMPL_ERROR_RECOVERY_EVENT_ID_ERROR_RECOVERY
-	__le32	event_data2;
-	u8	opaque_v;
-	#define ASYNC_EVENT_CMPL_ERROR_RECOVERY_V          0x1UL
-	#define ASYNC_EVENT_CMPL_ERROR_RECOVERY_OPAQUE_MASK 0xfeUL
-	#define ASYNC_EVENT_CMPL_ERROR_RECOVERY_OPAQUE_SFT 1
-	u8	timestamp_lo;
-	__le16	timestamp_hi;
-	__le32	event_data1;
-	#define ASYNC_EVENT_CMPL_ERROR_RECOVERY_EVENT_DATA1_FLAGS_MASK                 0xffUL
-	#define ASYNC_EVENT_CMPL_ERROR_RECOVERY_EVENT_DATA1_FLAGS_SFT                  0
-	#define ASYNC_EVENT_CMPL_ERROR_RECOVERY_EVENT_DATA1_FLAGS_MASTER_FUNC           0x1UL
-	#define ASYNC_EVENT_CMPL_ERROR_RECOVERY_EVENT_DATA1_FLAGS_RECOVERY_ENABLED      0x2UL
-};
-
-/* hwrm_async_event_cmpl_ring_monitor_msg (size:128b/16B) */
-struct hwrm_async_event_cmpl_ring_monitor_msg {
-	__le16	type;
-	#define ASYNC_EVENT_CMPL_RING_MONITOR_MSG_TYPE_MASK            0x3fUL
-	#define ASYNC_EVENT_CMPL_RING_MONITOR_MSG_TYPE_SFT             0
-	#define ASYNC_EVENT_CMPL_RING_MONITOR_MSG_TYPE_HWRM_ASYNC_EVENT  0x2eUL
-	#define ASYNC_EVENT_CMPL_RING_MONITOR_MSG_TYPE_LAST             ASYNC_EVENT_CMPL_RING_MONITOR_MSG_TYPE_HWRM_ASYNC_EVENT
-	__le16	event_id;
-	#define ASYNC_EVENT_CMPL_RING_MONITOR_MSG_EVENT_ID_RING_MONITOR_MSG 0xaUL
-	#define ASYNC_EVENT_CMPL_RING_MONITOR_MSG_EVENT_ID_LAST            ASYNC_EVENT_CMPL_RING_MONITOR_MSG_EVENT_ID_RING_MONITOR_MSG
-	__le32	event_data2;
-	#define ASYNC_EVENT_CMPL_RING_MONITOR_MSG_EVENT_DATA2_DISABLE_RING_TYPE_MASK 0xffUL
-	#define ASYNC_EVENT_CMPL_RING_MONITOR_MSG_EVENT_DATA2_DISABLE_RING_TYPE_SFT 0
-	#define ASYNC_EVENT_CMPL_RING_MONITOR_MSG_EVENT_DATA2_DISABLE_RING_TYPE_TX    0x0UL
-	#define ASYNC_EVENT_CMPL_RING_MONITOR_MSG_EVENT_DATA2_DISABLE_RING_TYPE_RX    0x1UL
-	#define ASYNC_EVENT_CMPL_RING_MONITOR_MSG_EVENT_DATA2_DISABLE_RING_TYPE_CMPL  0x2UL
-	#define ASYNC_EVENT_CMPL_RING_MONITOR_MSG_EVENT_DATA2_DISABLE_RING_TYPE_LAST ASYNC_EVENT_CMPL_RING_MONITOR_MSG_EVENT_DATA2_DISABLE_RING_TYPE_CMPL
-	u8	opaque_v;
-	#define ASYNC_EVENT_CMPL_RING_MONITOR_MSG_V          0x1UL
-	#define ASYNC_EVENT_CMPL_RING_MONITOR_MSG_OPAQUE_MASK 0xfeUL
-	#define ASYNC_EVENT_CMPL_RING_MONITOR_MSG_OPAQUE_SFT 1
-	u8	timestamp_lo;
-	__le16	timestamp_hi;
-	__le32	event_data1;
-};
-
-/* hwrm_async_event_cmpl_vf_cfg_change (size:128b/16B) */
-struct hwrm_async_event_cmpl_vf_cfg_change {
-	__le16	type;
-	#define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_TYPE_MASK            0x3fUL
-	#define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_TYPE_SFT             0
-	#define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_TYPE_HWRM_ASYNC_EVENT  0x2eUL
-	#define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_TYPE_LAST             ASYNC_EVENT_CMPL_VF_CFG_CHANGE_TYPE_HWRM_ASYNC_EVENT
-	__le16	event_id;
-	#define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_EVENT_ID_VF_CFG_CHANGE 0x33UL
-	#define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_EVENT_ID_LAST         ASYNC_EVENT_CMPL_VF_CFG_CHANGE_EVENT_ID_VF_CFG_CHANGE
-	__le32	event_data2;
-	#define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_EVENT_DATA2_VF_ID_MASK 0xffffUL
-	#define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_EVENT_DATA2_VF_ID_SFT 0
-	u8	opaque_v;
-	#define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_V          0x1UL
-	#define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_OPAQUE_MASK 0xfeUL
-	#define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_OPAQUE_SFT 1
-	u8	timestamp_lo;
-	__le16	timestamp_hi;
-	__le32	event_data1;
-	#define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_EVENT_DATA1_MTU_CHANGE                0x1UL
-	#define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_EVENT_DATA1_MRU_CHANGE                0x2UL
-	#define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_EVENT_DATA1_DFLT_MAC_ADDR_CHANGE      0x4UL
-	#define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_EVENT_DATA1_DFLT_VLAN_CHANGE          0x8UL
-	#define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_EVENT_DATA1_TRUSTED_VF_CFG_CHANGE     0x10UL
-	#define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_EVENT_DATA1_TF_OWNERSHIP_RELEASE      0x20UL
-};
-
-/* hwrm_async_event_cmpl_default_vnic_change (size:128b/16B) */
-struct hwrm_async_event_cmpl_default_vnic_change {
-	__le16	type;
-	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_TYPE_MASK            0x3fUL
-	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_TYPE_SFT             0
-	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_TYPE_HWRM_ASYNC_EVENT  0x2eUL
-	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_TYPE_LAST             ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_TYPE_HWRM_ASYNC_EVENT
-	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_UNUSED1_MASK         0xffc0UL
-	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_UNUSED1_SFT          6
-	__le16	event_id;
-	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_EVENT_ID_ALLOC_FREE_NOTIFICATION 0x35UL
-	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_EVENT_ID_LAST                   ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_EVENT_ID_ALLOC_FREE_NOTIFICATION
-	__le32	event_data2;
-	u8	opaque_v;
-	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_V          0x1UL
-	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_OPAQUE_MASK 0xfeUL
-	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_OPAQUE_SFT 1
-	u8	timestamp_lo;
-	__le16	timestamp_hi;
-	__le32	event_data1;
-	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_EVENT_DATA1_DEF_VNIC_STATE_MASK          0x3UL
-	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_EVENT_DATA1_DEF_VNIC_STATE_SFT           0
-	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_EVENT_DATA1_DEF_VNIC_STATE_DEF_VNIC_ALLOC  0x1UL
-	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_EVENT_DATA1_DEF_VNIC_STATE_DEF_VNIC_FREE   0x2UL
-	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_EVENT_DATA1_DEF_VNIC_STATE_LAST           ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_EVENT_DATA1_DEF_VNIC_STATE_DEF_VNIC_FREE
-	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_EVENT_DATA1_PF_ID_MASK                   0x3fcUL
-	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_EVENT_DATA1_PF_ID_SFT                    2
-	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_EVENT_DATA1_VF_ID_MASK                   0x3fffc00UL
-	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_EVENT_DATA1_VF_ID_SFT                    10
-};
-
-/* hwrm_async_event_cmpl_hw_flow_aged (size:128b/16B) */
-struct hwrm_async_event_cmpl_hw_flow_aged {
-	__le16	type;
-	#define ASYNC_EVENT_CMPL_HW_FLOW_AGED_TYPE_MASK            0x3fUL
-	#define ASYNC_EVENT_CMPL_HW_FLOW_AGED_TYPE_SFT             0
-	#define ASYNC_EVENT_CMPL_HW_FLOW_AGED_TYPE_HWRM_ASYNC_EVENT  0x2eUL
-	#define ASYNC_EVENT_CMPL_HW_FLOW_AGED_TYPE_LAST             ASYNC_EVENT_CMPL_HW_FLOW_AGED_TYPE_HWRM_ASYNC_EVENT
-	__le16	event_id;
-	#define ASYNC_EVENT_CMPL_HW_FLOW_AGED_EVENT_ID_HW_FLOW_AGED 0x36UL
-	#define ASYNC_EVENT_CMPL_HW_FLOW_AGED_EVENT_ID_LAST        ASYNC_EVENT_CMPL_HW_FLOW_AGED_EVENT_ID_HW_FLOW_AGED
-	__le32	event_data2;
-	u8	opaque_v;
-	#define ASYNC_EVENT_CMPL_HW_FLOW_AGED_V          0x1UL
-	#define ASYNC_EVENT_CMPL_HW_FLOW_AGED_OPAQUE_MASK 0xfeUL
-	#define ASYNC_EVENT_CMPL_HW_FLOW_AGED_OPAQUE_SFT 1
-	u8	timestamp_lo;
-	__le16	timestamp_hi;
-	__le32	event_data1;
-	#define ASYNC_EVENT_CMPL_HW_FLOW_AGED_EVENT_DATA1_FLOW_ID_MASK       0x7fffffffUL
-	#define ASYNC_EVENT_CMPL_HW_FLOW_AGED_EVENT_DATA1_FLOW_ID_SFT        0
-	#define ASYNC_EVENT_CMPL_HW_FLOW_AGED_EVENT_DATA1_FLOW_DIRECTION     0x80000000UL
-	#define ASYNC_EVENT_CMPL_HW_FLOW_AGED_EVENT_DATA1_FLOW_DIRECTION_RX    (0x0UL << 31)
-	#define ASYNC_EVENT_CMPL_HW_FLOW_AGED_EVENT_DATA1_FLOW_DIRECTION_TX    (0x1UL << 31)
-	#define ASYNC_EVENT_CMPL_HW_FLOW_AGED_EVENT_DATA1_FLOW_DIRECTION_LAST ASYNC_EVENT_CMPL_HW_FLOW_AGED_EVENT_DATA1_FLOW_DIRECTION_TX
-};
-
-/* hwrm_async_event_cmpl_eem_cache_flush_req (size:128b/16B) */
-struct hwrm_async_event_cmpl_eem_cache_flush_req {
-	__le16	type;
-	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_REQ_TYPE_MASK            0x3fUL
-	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_REQ_TYPE_SFT             0
-	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_REQ_TYPE_HWRM_ASYNC_EVENT  0x2eUL
-	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_REQ_TYPE_LAST             ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_REQ_TYPE_HWRM_ASYNC_EVENT
-	__le16	event_id;
-	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_REQ_EVENT_ID_EEM_CACHE_FLUSH_REQ 0x38UL
-	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_REQ_EVENT_ID_LAST               ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_REQ_EVENT_ID_EEM_CACHE_FLUSH_REQ
-	__le32	event_data2;
-	u8	opaque_v;
-	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_REQ_V          0x1UL
-	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_REQ_OPAQUE_MASK 0xfeUL
-	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_REQ_OPAQUE_SFT 1
-	u8	timestamp_lo;
-	__le16	timestamp_hi;
-	__le32	event_data1;
-};
-
-/* hwrm_async_event_cmpl_eem_cache_flush_done (size:128b/16B) */
-struct hwrm_async_event_cmpl_eem_cache_flush_done {
-	__le16	type;
-	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_DONE_TYPE_MASK            0x3fUL
-	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_DONE_TYPE_SFT             0
-	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_DONE_TYPE_HWRM_ASYNC_EVENT  0x2eUL
-	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_DONE_TYPE_LAST             ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_DONE_TYPE_HWRM_ASYNC_EVENT
-	__le16	event_id;
-	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_DONE_EVENT_ID_EEM_CACHE_FLUSH_DONE 0x39UL
-	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_DONE_EVENT_ID_LAST                ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_DONE_EVENT_ID_EEM_CACHE_FLUSH_DONE
-	__le32	event_data2;
-	u8	opaque_v;
-	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_DONE_V          0x1UL
-	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_DONE_OPAQUE_MASK 0xfeUL
-	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_DONE_OPAQUE_SFT 1
-	u8	timestamp_lo;
-	__le16	timestamp_hi;
-	__le32	event_data1;
-	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_DONE_EVENT_DATA1_FID_MASK 0xffffUL
-	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_DONE_EVENT_DATA1_FID_SFT 0
-};
-
-/* hwrm_async_event_cmpl_deferred_response (size:128b/16B) */
-struct hwrm_async_event_cmpl_deferred_response {
-	__le16	type;
-	#define ASYNC_EVENT_CMPL_DEFERRED_RESPONSE_TYPE_MASK            0x3fUL
-	#define ASYNC_EVENT_CMPL_DEFERRED_RESPONSE_TYPE_SFT             0
-	#define ASYNC_EVENT_CMPL_DEFERRED_RESPONSE_TYPE_HWRM_ASYNC_EVENT  0x2eUL
-	#define ASYNC_EVENT_CMPL_DEFERRED_RESPONSE_TYPE_LAST             ASYNC_EVENT_CMPL_DEFERRED_RESPONSE_TYPE_HWRM_ASYNC_EVENT
-	__le16	event_id;
-	#define ASYNC_EVENT_CMPL_DEFERRED_RESPONSE_EVENT_ID_DEFERRED_RESPONSE 0x40UL
-	#define ASYNC_EVENT_CMPL_DEFERRED_RESPONSE_EVENT_ID_LAST             ASYNC_EVENT_CMPL_DEFERRED_RESPONSE_EVENT_ID_DEFERRED_RESPONSE
-	__le32	event_data2;
-	#define ASYNC_EVENT_CMPL_DEFERRED_RESPONSE_EVENT_DATA2_SEQ_ID_MASK 0xffffUL
-	#define ASYNC_EVENT_CMPL_DEFERRED_RESPONSE_EVENT_DATA2_SEQ_ID_SFT 0
-	u8	opaque_v;
-	#define ASYNC_EVENT_CMPL_DEFERRED_RESPONSE_V          0x1UL
-	#define ASYNC_EVENT_CMPL_DEFERRED_RESPONSE_OPAQUE_MASK 0xfeUL
-	#define ASYNC_EVENT_CMPL_DEFERRED_RESPONSE_OPAQUE_SFT 1
-	u8	timestamp_lo;
-	__le16	timestamp_hi;
-	__le32	event_data1;
-};
-
-/* hwrm_async_event_cmpl_echo_request (size:128b/16B) */
-struct hwrm_async_event_cmpl_echo_request {
-	__le16	type;
-	#define ASYNC_EVENT_CMPL_ECHO_REQUEST_TYPE_MASK            0x3fUL
-	#define ASYNC_EVENT_CMPL_ECHO_REQUEST_TYPE_SFT             0
-	#define ASYNC_EVENT_CMPL_ECHO_REQUEST_TYPE_HWRM_ASYNC_EVENT  0x2eUL
-	#define ASYNC_EVENT_CMPL_ECHO_REQUEST_TYPE_LAST             ASYNC_EVENT_CMPL_ECHO_REQUEST_TYPE_HWRM_ASYNC_EVENT
-	__le16	event_id;
-	#define ASYNC_EVENT_CMPL_ECHO_REQUEST_EVENT_ID_ECHO_REQUEST 0x42UL
-	#define ASYNC_EVENT_CMPL_ECHO_REQUEST_EVENT_ID_LAST        ASYNC_EVENT_CMPL_ECHO_REQUEST_EVENT_ID_ECHO_REQUEST
-	__le32	event_data2;
-	u8	opaque_v;
-	#define ASYNC_EVENT_CMPL_ECHO_REQUEST_V          0x1UL
-	#define ASYNC_EVENT_CMPL_ECHO_REQUEST_OPAQUE_MASK 0xfeUL
-	#define ASYNC_EVENT_CMPL_ECHO_REQUEST_OPAQUE_SFT 1
-	u8	timestamp_lo;
-	__le16	timestamp_hi;
-	__le32	event_data1;
-};
-
-/* hwrm_async_event_cmpl_phc_update (size:128b/16B) */
-struct hwrm_async_event_cmpl_phc_update {
-	__le16	type;
-	#define ASYNC_EVENT_CMPL_PHC_UPDATE_TYPE_MASK            0x3fUL
-	#define ASYNC_EVENT_CMPL_PHC_UPDATE_TYPE_SFT             0
-	#define ASYNC_EVENT_CMPL_PHC_UPDATE_TYPE_HWRM_ASYNC_EVENT  0x2eUL
-	#define ASYNC_EVENT_CMPL_PHC_UPDATE_TYPE_LAST             ASYNC_EVENT_CMPL_PHC_UPDATE_TYPE_HWRM_ASYNC_EVENT
-	__le16	event_id;
-	#define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_ID_PHC_UPDATE 0x43UL
-	#define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_ID_LAST      ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_ID_PHC_UPDATE
-	__le32	event_data2;
-	#define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA2_PHC_MASTER_FID_MASK 0xffffUL
-	#define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA2_PHC_MASTER_FID_SFT 0
-	#define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA2_PHC_SEC_FID_MASK   0xffff0000UL
-	#define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA2_PHC_SEC_FID_SFT    16
-	u8	opaque_v;
-	#define ASYNC_EVENT_CMPL_PHC_UPDATE_V          0x1UL
-	#define ASYNC_EVENT_CMPL_PHC_UPDATE_OPAQUE_MASK 0xfeUL
-	#define ASYNC_EVENT_CMPL_PHC_UPDATE_OPAQUE_SFT 1
-	u8	timestamp_lo;
-	__le16	timestamp_hi;
-	__le32	event_data1;
-	#define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_FLAGS_MASK          0xfUL
-	#define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_FLAGS_SFT           0
-	#define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_FLAGS_PHC_MASTER      0x1UL
-	#define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_FLAGS_PHC_SECONDARY   0x2UL
-	#define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_FLAGS_PHC_FAILOVER    0x3UL
-	#define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_FLAGS_PHC_RTC_UPDATE  0x4UL
-	#define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_FLAGS_LAST           ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_FLAGS_PHC_RTC_UPDATE
-	#define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_PHC_TIME_MSB_MASK   0xffff0UL
-	#define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_PHC_TIME_MSB_SFT    4
-};
-
-/* hwrm_async_event_cmpl_pps_timestamp (size:128b/16B) */
-struct hwrm_async_event_cmpl_pps_timestamp {
-	__le16	type;
-	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_TYPE_MASK            0x3fUL
-	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_TYPE_SFT             0
-	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_TYPE_HWRM_ASYNC_EVENT  0x2eUL
-	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_TYPE_LAST             ASYNC_EVENT_CMPL_PPS_TIMESTAMP_TYPE_HWRM_ASYNC_EVENT
-	__le16	event_id;
-	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_EVENT_ID_PPS_TIMESTAMP 0x44UL
-	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_EVENT_ID_LAST         ASYNC_EVENT_CMPL_PPS_TIMESTAMP_EVENT_ID_PPS_TIMESTAMP
-	__le32	event_data2;
-	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_EVENT_DATA2_EVENT_TYPE              0x1UL
-	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_EVENT_DATA2_EVENT_TYPE_INTERNAL       0x0UL
-	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_EVENT_DATA2_EVENT_TYPE_EXTERNAL       0x1UL
-	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_EVENT_DATA2_EVENT_TYPE_LAST          ASYNC_EVENT_CMPL_PPS_TIMESTAMP_EVENT_DATA2_EVENT_TYPE_EXTERNAL
-	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_EVENT_DATA2_PIN_NUMBER_MASK         0xeUL
-	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_EVENT_DATA2_PIN_NUMBER_SFT          1
-	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_EVENT_DATA2_PPS_TIMESTAMP_UPPER_MASK 0xffff0UL
-	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_EVENT_DATA2_PPS_TIMESTAMP_UPPER_SFT 4
-	u8	opaque_v;
-	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_V          0x1UL
-	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_OPAQUE_MASK 0xfeUL
-	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_OPAQUE_SFT 1
-	u8	timestamp_lo;
-	__le16	timestamp_hi;
-	__le32	event_data1;
-	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_EVENT_DATA1_PPS_TIMESTAMP_LOWER_MASK 0xffffffffUL
-	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_EVENT_DATA1_PPS_TIMESTAMP_LOWER_SFT 0
-};
-
-/* hwrm_async_event_cmpl_error_report (size:128b/16B) */
-struct hwrm_async_event_cmpl_error_report {
-	__le16	type;
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_TYPE_MASK            0x3fUL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_TYPE_SFT             0
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_TYPE_HWRM_ASYNC_EVENT  0x2eUL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_TYPE_LAST             ASYNC_EVENT_CMPL_ERROR_REPORT_TYPE_HWRM_ASYNC_EVENT
-	__le16	event_id;
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_EVENT_ID_ERROR_REPORT 0x45UL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_EVENT_ID_LAST        ASYNC_EVENT_CMPL_ERROR_REPORT_EVENT_ID_ERROR_REPORT
-	__le32	event_data2;
-	u8	opaque_v;
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_V          0x1UL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_OPAQUE_MASK 0xfeUL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_OPAQUE_SFT 1
-	u8	timestamp_lo;
-	__le16	timestamp_hi;
-	__le32	event_data1;
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_EVENT_DATA1_ERROR_TYPE_MASK 0xffUL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_EVENT_DATA1_ERROR_TYPE_SFT 0
-};
-
-/* hwrm_async_event_cmpl_dbg_buf_producer (size:128b/16B) */
-struct hwrm_async_event_cmpl_dbg_buf_producer {
-	__le16	type;
-	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_TYPE_MASK            0x3fUL
-	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_TYPE_SFT             0
-	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_TYPE_HWRM_ASYNC_EVENT  0x2eUL
-	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_TYPE_LAST             ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_TYPE_HWRM_ASYNC_EVENT
-	__le16	event_id;
-	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_ID_DBG_BUF_PRODUCER 0x4cUL
-	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_ID_LAST            ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_ID_DBG_BUF_PRODUCER
-	__le32	event_data2;
-	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA2_CURR_OFF_MASK 0xffffffffUL
-	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA2_CURR_OFF_SFT 0
-	u8	opaque_v;
-	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_V          0x1UL
-	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_OPAQUE_MASK 0xfeUL
-	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_OPAQUE_SFT 1
-	u8	timestamp_lo;
-	__le16	timestamp_hi;
-	__le32	event_data1;
-	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_MASK               0xffffUL
-	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_SFT                0
-	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_SRT_TRACE            0x0UL
-	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_SRT2_TRACE           0x1UL
-	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_CRT_TRACE            0x2UL
-	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_CRT2_TRACE           0x3UL
-	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_RIGP0_TRACE          0x4UL
-	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_L2_HWRM_TRACE        0x5UL
-	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_ROCE_HWRM_TRACE      0x6UL
-	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_CA0_TRACE            0x7UL
-	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_CA1_TRACE            0x8UL
-	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_CA2_TRACE            0x9UL
-	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_RIGP1_TRACE          0xaUL
-	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_AFM_KONG_HWRM_TRACE  0xbUL
-	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_LAST                ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_AFM_KONG_HWRM_TRACE
-};
-
-/* hwrm_async_event_cmpl_hwrm_error (size:128b/16B) */
-struct hwrm_async_event_cmpl_hwrm_error {
-	__le16	type;
-	#define ASYNC_EVENT_CMPL_HWRM_ERROR_TYPE_MASK            0x3fUL
-	#define ASYNC_EVENT_CMPL_HWRM_ERROR_TYPE_SFT             0
-	#define ASYNC_EVENT_CMPL_HWRM_ERROR_TYPE_HWRM_ASYNC_EVENT  0x2eUL
-	#define ASYNC_EVENT_CMPL_HWRM_ERROR_TYPE_LAST             ASYNC_EVENT_CMPL_HWRM_ERROR_TYPE_HWRM_ASYNC_EVENT
-	__le16	event_id;
-	#define ASYNC_EVENT_CMPL_HWRM_ERROR_EVENT_ID_HWRM_ERROR 0xffUL
-	#define ASYNC_EVENT_CMPL_HWRM_ERROR_EVENT_ID_LAST      ASYNC_EVENT_CMPL_HWRM_ERROR_EVENT_ID_HWRM_ERROR
-	__le32	event_data2;
-	#define ASYNC_EVENT_CMPL_HWRM_ERROR_EVENT_DATA2_SEVERITY_MASK    0xffUL
-	#define ASYNC_EVENT_CMPL_HWRM_ERROR_EVENT_DATA2_SEVERITY_SFT     0
-	#define ASYNC_EVENT_CMPL_HWRM_ERROR_EVENT_DATA2_SEVERITY_WARNING   0x0UL
-	#define ASYNC_EVENT_CMPL_HWRM_ERROR_EVENT_DATA2_SEVERITY_NONFATAL  0x1UL
-	#define ASYNC_EVENT_CMPL_HWRM_ERROR_EVENT_DATA2_SEVERITY_FATAL     0x2UL
-	#define ASYNC_EVENT_CMPL_HWRM_ERROR_EVENT_DATA2_SEVERITY_LAST     ASYNC_EVENT_CMPL_HWRM_ERROR_EVENT_DATA2_SEVERITY_FATAL
-	u8	opaque_v;
-	#define ASYNC_EVENT_CMPL_HWRM_ERROR_V          0x1UL
-	#define ASYNC_EVENT_CMPL_HWRM_ERROR_OPAQUE_MASK 0xfeUL
-	#define ASYNC_EVENT_CMPL_HWRM_ERROR_OPAQUE_SFT 1
-	u8	timestamp_lo;
-	__le16	timestamp_hi;
-	__le32	event_data1;
-	#define ASYNC_EVENT_CMPL_HWRM_ERROR_EVENT_DATA1_TIMESTAMP     0x1UL
-};
-
-/* hwrm_async_event_cmpl_error_report_base (size:128b/16B) */
-struct hwrm_async_event_cmpl_error_report_base {
-	__le16	type;
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_TYPE_MASK            0x3fUL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_TYPE_SFT             0
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_TYPE_HWRM_ASYNC_EVENT  0x2eUL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_TYPE_LAST             ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_TYPE_HWRM_ASYNC_EVENT
-	__le16	event_id;
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_ID_ERROR_REPORT 0x45UL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_ID_LAST        ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_ID_ERROR_REPORT
-	__le32	event_data2;
-	u8	opaque_v;
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_V          0x1UL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_OPAQUE_MASK 0xfeUL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_OPAQUE_SFT 1
-	u8	timestamp_lo;
-	__le16	timestamp_hi;
-	__le32	event_data1;
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_MASK                        0xffUL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_SFT                         0
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_RESERVED                      0x0UL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_PAUSE_STORM                   0x1UL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_INVALID_SIGNAL                0x2UL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_NVM                           0x3UL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_DOORBELL_DROP_THRESHOLD       0x4UL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_THERMAL_THRESHOLD             0x5UL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_DUAL_DATA_RATE_NOT_SUPPORTED  0x6UL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_LAST                         ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_DUAL_DATA_RATE_NOT_SUPPORTED
-};
-
-/* hwrm_async_event_cmpl_error_report_pause_storm (size:128b/16B) */
-struct hwrm_async_event_cmpl_error_report_pause_storm {
-	__le16	type;
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_PAUSE_STORM_TYPE_MASK            0x3fUL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_PAUSE_STORM_TYPE_SFT             0
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_PAUSE_STORM_TYPE_HWRM_ASYNC_EVENT  0x2eUL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_PAUSE_STORM_TYPE_LAST             ASYNC_EVENT_CMPL_ERROR_REPORT_PAUSE_STORM_TYPE_HWRM_ASYNC_EVENT
-	__le16	event_id;
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_PAUSE_STORM_EVENT_ID_ERROR_REPORT 0x45UL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_PAUSE_STORM_EVENT_ID_LAST        ASYNC_EVENT_CMPL_ERROR_REPORT_PAUSE_STORM_EVENT_ID_ERROR_REPORT
-	__le32	event_data2;
-	u8	opaque_v;
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_PAUSE_STORM_V          0x1UL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_PAUSE_STORM_OPAQUE_MASK 0xfeUL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_PAUSE_STORM_OPAQUE_SFT 1
-	u8	timestamp_lo;
-	__le16	timestamp_hi;
-	__le32	event_data1;
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_PAUSE_STORM_EVENT_DATA1_ERROR_TYPE_MASK       0xffUL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_PAUSE_STORM_EVENT_DATA1_ERROR_TYPE_SFT        0
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_PAUSE_STORM_EVENT_DATA1_ERROR_TYPE_PAUSE_STORM  0x1UL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_PAUSE_STORM_EVENT_DATA1_ERROR_TYPE_LAST        ASYNC_EVENT_CMPL_ERROR_REPORT_PAUSE_STORM_EVENT_DATA1_ERROR_TYPE_PAUSE_STORM
-};
-
-/* hwrm_async_event_cmpl_error_report_invalid_signal (size:128b/16B) */
-struct hwrm_async_event_cmpl_error_report_invalid_signal {
-	__le16	type;
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_TYPE_MASK            0x3fUL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_TYPE_SFT             0
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_TYPE_HWRM_ASYNC_EVENT  0x2eUL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_TYPE_LAST             ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_TYPE_HWRM_ASYNC_EVENT
-	__le16	event_id;
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_EVENT_ID_ERROR_REPORT 0x45UL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_EVENT_ID_LAST        ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_EVENT_ID_ERROR_REPORT
-	__le32	event_data2;
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_EVENT_DATA2_PIN_ID_MASK 0xffUL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_EVENT_DATA2_PIN_ID_SFT 0
-	u8	opaque_v;
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_V          0x1UL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_OPAQUE_MASK 0xfeUL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_OPAQUE_SFT 1
-	u8	timestamp_lo;
-	__le16	timestamp_hi;
-	__le32	event_data1;
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_EVENT_DATA1_ERROR_TYPE_MASK          0xffUL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_EVENT_DATA1_ERROR_TYPE_SFT           0
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_EVENT_DATA1_ERROR_TYPE_INVALID_SIGNAL  0x2UL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_EVENT_DATA1_ERROR_TYPE_LAST           ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_EVENT_DATA1_ERROR_TYPE_INVALID_SIGNAL
-};
-
-/* hwrm_async_event_cmpl_error_report_nvm (size:128b/16B) */
-struct hwrm_async_event_cmpl_error_report_nvm {
-	__le16	type;
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_TYPE_MASK            0x3fUL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_TYPE_SFT             0
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_TYPE_HWRM_ASYNC_EVENT  0x2eUL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_TYPE_LAST             ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_TYPE_HWRM_ASYNC_EVENT
-	__le16	event_id;
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_ID_ERROR_REPORT 0x45UL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_ID_LAST        ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_ID_ERROR_REPORT
-	__le32	event_data2;
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA2_ERR_ADDR_MASK 0xffffffffUL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA2_ERR_ADDR_SFT 0
-	u8	opaque_v;
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_V          0x1UL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_OPAQUE_MASK 0xfeUL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_OPAQUE_SFT 1
-	u8	timestamp_lo;
-	__le16	timestamp_hi;
-	__le32	event_data1;
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_ERROR_TYPE_MASK     0xffUL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_ERROR_TYPE_SFT      0
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_ERROR_TYPE_NVM_ERROR  0x3UL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_ERROR_TYPE_LAST      ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_ERROR_TYPE_NVM_ERROR
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_NVM_ERR_TYPE_MASK   0xff00UL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_NVM_ERR_TYPE_SFT    8
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_NVM_ERR_TYPE_WRITE    (0x1UL << 8)
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_NVM_ERR_TYPE_ERASE    (0x2UL << 8)
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_NVM_ERR_TYPE_LAST    ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_NVM_ERR_TYPE_ERASE
-};
-
-/* hwrm_async_event_cmpl_error_report_doorbell_drop_threshold (size:128b/16B) */
-struct hwrm_async_event_cmpl_error_report_doorbell_drop_threshold {
-	__le16	type;
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_TYPE_MASK            0x3fUL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_TYPE_SFT             0
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_TYPE_HWRM_ASYNC_EVENT  0x2eUL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_TYPE_LAST             ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_TYPE_HWRM_ASYNC_EVENT
-	__le16	event_id;
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_EVENT_ID_ERROR_REPORT 0x45UL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_EVENT_ID_LAST        ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_EVENT_ID_ERROR_REPORT
-	__le32	event_data2;
-	u8	opaque_v;
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_V          0x1UL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_OPAQUE_MASK 0xfeUL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_OPAQUE_SFT 1
-	u8	timestamp_lo;
-	__le16	timestamp_hi;
-	__le32	event_data1;
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_EVENT_DATA1_ERROR_TYPE_MASK                   0xffUL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_EVENT_DATA1_ERROR_TYPE_SFT                    0
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_EVENT_DATA1_ERROR_TYPE_DOORBELL_DROP_THRESHOLD  0x4UL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_EVENT_DATA1_ERROR_TYPE_LAST                    ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_EVENT_DATA1_ERROR_TYPE_DOORBELL_DROP_THRESHOLD
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_EVENT_DATA1_EPOCH_MASK                        0xffffff00UL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_EVENT_DATA1_EPOCH_SFT                         8
-};
-
-/* hwrm_async_event_cmpl_error_report_thermal (size:128b/16B) */
-struct hwrm_async_event_cmpl_error_report_thermal {
-	__le16	type;
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_TYPE_MASK            0x3fUL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_TYPE_SFT             0
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_TYPE_HWRM_ASYNC_EVENT  0x2eUL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_TYPE_LAST             ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_TYPE_HWRM_ASYNC_EVENT
-	__le16	event_id;
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_ID_ERROR_REPORT 0x45UL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_ID_LAST        ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_ID_ERROR_REPORT
-	__le32	event_data2;
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA2_CURRENT_TEMP_MASK  0xffUL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA2_CURRENT_TEMP_SFT   0
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA2_THRESHOLD_TEMP_MASK 0xff00UL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA2_THRESHOLD_TEMP_SFT 8
-	u8	opaque_v;
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_V          0x1UL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_OPAQUE_MASK 0xfeUL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_OPAQUE_SFT 1
-	u8	timestamp_lo;
-	__le16	timestamp_hi;
-	__le32	event_data1;
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA1_ERROR_TYPE_MASK          0xffUL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA1_ERROR_TYPE_SFT           0
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA1_ERROR_TYPE_THERMAL_EVENT   0x5UL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA1_ERROR_TYPE_LAST           ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA1_ERROR_TYPE_THERMAL_EVENT
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA1_THRESHOLD_TYPE_MASK      0x700UL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA1_THRESHOLD_TYPE_SFT       8
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA1_THRESHOLD_TYPE_WARN        (0x0UL << 8)
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA1_THRESHOLD_TYPE_CRITICAL    (0x1UL << 8)
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA1_THRESHOLD_TYPE_FATAL       (0x2UL << 8)
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA1_THRESHOLD_TYPE_SHUTDOWN    (0x3UL << 8)
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA1_THRESHOLD_TYPE_LAST       ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA1_THRESHOLD_TYPE_SHUTDOWN
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA1_TRANSITION_DIR           0x800UL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA1_TRANSITION_DIR_DECREASING  (0x0UL << 11)
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA1_TRANSITION_DIR_INCREASING  (0x1UL << 11)
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA1_TRANSITION_DIR_LAST       ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA1_TRANSITION_DIR_INCREASING
-};
-
-/* hwrm_async_event_cmpl_error_report_dual_data_rate_not_supported (size:128b/16B) */
-struct hwrm_async_event_cmpl_error_report_dual_data_rate_not_supported {
-	__le16	type;
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DUAL_DATA_RATE_NOT_SUPPORTED_TYPE_MASK            0x3fUL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DUAL_DATA_RATE_NOT_SUPPORTED_TYPE_SFT             0
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DUAL_DATA_RATE_NOT_SUPPORTED_TYPE_HWRM_ASYNC_EVENT  0x2eUL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DUAL_DATA_RATE_NOT_SUPPORTED_TYPE_LAST             ASYNC_EVENT_CMPL_ERROR_REPORT_DUAL_DATA_RATE_NOT_SUPPORTED_TYPE_HWRM_ASYNC_EVENT
-	__le16	event_id;
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DUAL_DATA_RATE_NOT_SUPPORTED_EVENT_ID_ERROR_REPORT 0x45UL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DUAL_DATA_RATE_NOT_SUPPORTED_EVENT_ID_LAST        ASYNC_EVENT_CMPL_ERROR_REPORT_DUAL_DATA_RATE_NOT_SUPPORTED_EVENT_ID_ERROR_REPORT
-	__le32	event_data2;
-	u8	opaque_v;
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DUAL_DATA_RATE_NOT_SUPPORTED_V          0x1UL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DUAL_DATA_RATE_NOT_SUPPORTED_OPAQUE_MASK 0xfeUL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DUAL_DATA_RATE_NOT_SUPPORTED_OPAQUE_SFT 1
-	u8	timestamp_lo;
-	__le16	timestamp_hi;
-	__le32	event_data1;
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DUAL_DATA_RATE_NOT_SUPPORTED_EVENT_DATA1_ERROR_TYPE_MASK                        0xffUL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DUAL_DATA_RATE_NOT_SUPPORTED_EVENT_DATA1_ERROR_TYPE_SFT                         0
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DUAL_DATA_RATE_NOT_SUPPORTED_EVENT_DATA1_ERROR_TYPE_DUAL_DATA_RATE_NOT_SUPPORTED  0x6UL
-	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DUAL_DATA_RATE_NOT_SUPPORTED_EVENT_DATA1_ERROR_TYPE_LAST                         ASYNC_EVENT_CMPL_ERROR_REPORT_DUAL_DATA_RATE_NOT_SUPPORTED_EVENT_DATA1_ERROR_TYPE_DUAL_DATA_RATE_NOT_SUPPORTED
-};
-
-/* hwrm_func_reset_input (size:192b/24B) */
-struct hwrm_func_reset_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	enables;
-	#define FUNC_RESET_REQ_ENABLES_VF_ID_VALID     0x1UL
-	__le16	vf_id;
-	u8	func_reset_level;
-	#define FUNC_RESET_REQ_FUNC_RESET_LEVEL_RESETALL      0x0UL
-	#define FUNC_RESET_REQ_FUNC_RESET_LEVEL_RESETME       0x1UL
-	#define FUNC_RESET_REQ_FUNC_RESET_LEVEL_RESETCHILDREN 0x2UL
-	#define FUNC_RESET_REQ_FUNC_RESET_LEVEL_RESETVF       0x3UL
-	#define FUNC_RESET_REQ_FUNC_RESET_LEVEL_LAST         FUNC_RESET_REQ_FUNC_RESET_LEVEL_RESETVF
-	u8	unused_0;
-};
-
-/* hwrm_func_reset_output (size:128b/16B) */
-struct hwrm_func_reset_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_func_getfid_input (size:192b/24B) */
-struct hwrm_func_getfid_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	enables;
-	#define FUNC_GETFID_REQ_ENABLES_PCI_ID     0x1UL
-	__le16	pci_id;
-	u8	unused_0[2];
-};
-
-/* hwrm_func_getfid_output (size:128b/16B) */
-struct hwrm_func_getfid_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le16	fid;
-	u8	unused_0[5];
-	u8	valid;
-};
-
-/* hwrm_func_vf_alloc_input (size:192b/24B) */
-struct hwrm_func_vf_alloc_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	enables;
-	#define FUNC_VF_ALLOC_REQ_ENABLES_FIRST_VF_ID     0x1UL
-	__le16	first_vf_id;
-	__le16	num_vfs;
-};
-
-/* hwrm_func_vf_alloc_output (size:128b/16B) */
-struct hwrm_func_vf_alloc_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le16	first_vf_id;
-	u8	unused_0[5];
-	u8	valid;
-};
-
-/* hwrm_func_vf_free_input (size:192b/24B) */
-struct hwrm_func_vf_free_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	enables;
-	#define FUNC_VF_FREE_REQ_ENABLES_FIRST_VF_ID     0x1UL
-	__le16	first_vf_id;
-	__le16	num_vfs;
-};
-
-/* hwrm_func_vf_free_output (size:128b/16B) */
-struct hwrm_func_vf_free_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_func_vf_cfg_input (size:576b/72B) */
-struct hwrm_func_vf_cfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	enables;
-	#define FUNC_VF_CFG_REQ_ENABLES_MTU                      0x1UL
-	#define FUNC_VF_CFG_REQ_ENABLES_GUEST_VLAN               0x2UL
-	#define FUNC_VF_CFG_REQ_ENABLES_ASYNC_EVENT_CR           0x4UL
-	#define FUNC_VF_CFG_REQ_ENABLES_DFLT_MAC_ADDR            0x8UL
-	#define FUNC_VF_CFG_REQ_ENABLES_NUM_RSSCOS_CTXS          0x10UL
-	#define FUNC_VF_CFG_REQ_ENABLES_NUM_CMPL_RINGS           0x20UL
-	#define FUNC_VF_CFG_REQ_ENABLES_NUM_TX_RINGS             0x40UL
-	#define FUNC_VF_CFG_REQ_ENABLES_NUM_RX_RINGS             0x80UL
-	#define FUNC_VF_CFG_REQ_ENABLES_NUM_L2_CTXS              0x100UL
-	#define FUNC_VF_CFG_REQ_ENABLES_NUM_VNICS                0x200UL
-	#define FUNC_VF_CFG_REQ_ENABLES_NUM_STAT_CTXS            0x400UL
-	#define FUNC_VF_CFG_REQ_ENABLES_NUM_HW_RING_GRPS         0x800UL
-	#define FUNC_VF_CFG_REQ_ENABLES_NUM_KTLS_TX_KEY_CTXS     0x1000UL
-	#define FUNC_VF_CFG_REQ_ENABLES_NUM_KTLS_RX_KEY_CTXS     0x2000UL
-	#define FUNC_VF_CFG_REQ_ENABLES_NUM_QUIC_TX_KEY_CTXS     0x4000UL
-	#define FUNC_VF_CFG_REQ_ENABLES_NUM_QUIC_RX_KEY_CTXS     0x8000UL
-	__le16	mtu;
-	__le16	guest_vlan;
-	__le16	async_event_cr;
-	u8	dflt_mac_addr[6];
-	__le32	flags;
-	#define FUNC_VF_CFG_REQ_FLAGS_TX_ASSETS_TEST             0x1UL
-	#define FUNC_VF_CFG_REQ_FLAGS_RX_ASSETS_TEST             0x2UL
-	#define FUNC_VF_CFG_REQ_FLAGS_CMPL_ASSETS_TEST           0x4UL
-	#define FUNC_VF_CFG_REQ_FLAGS_RSSCOS_CTX_ASSETS_TEST     0x8UL
-	#define FUNC_VF_CFG_REQ_FLAGS_RING_GRP_ASSETS_TEST       0x10UL
-	#define FUNC_VF_CFG_REQ_FLAGS_STAT_CTX_ASSETS_TEST       0x20UL
-	#define FUNC_VF_CFG_REQ_FLAGS_VNIC_ASSETS_TEST           0x40UL
-	#define FUNC_VF_CFG_REQ_FLAGS_L2_CTX_ASSETS_TEST         0x80UL
-	#define FUNC_VF_CFG_REQ_FLAGS_PPP_PUSH_MODE_ENABLE       0x100UL
-	#define FUNC_VF_CFG_REQ_FLAGS_PPP_PUSH_MODE_DISABLE      0x200UL
-	__le16	num_rsscos_ctxs;
-	__le16	num_cmpl_rings;
-	__le16	num_tx_rings;
-	__le16	num_rx_rings;
-	__le16	num_l2_ctxs;
-	__le16	num_vnics;
-	__le16	num_stat_ctxs;
-	__le16	num_hw_ring_grps;
-	__le32	num_ktls_tx_key_ctxs;
-	__le32	num_ktls_rx_key_ctxs;
-	__le16	num_msix;
-	u8	unused[2];
-	__le32	num_quic_tx_key_ctxs;
-	__le32	num_quic_rx_key_ctxs;
-};
-
-/* hwrm_func_vf_cfg_output (size:128b/16B) */
-struct hwrm_func_vf_cfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_func_qcaps_input (size:192b/24B) */
-struct hwrm_func_qcaps_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	fid;
-	u8	unused_0[6];
-};
-
-/* hwrm_func_qcaps_output (size:1152b/144B) */
-struct hwrm_func_qcaps_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le16	fid;
-	__le16	port_id;
-	__le32	flags;
-	#define FUNC_QCAPS_RESP_FLAGS_PUSH_MODE_SUPPORTED                   0x1UL
-	#define FUNC_QCAPS_RESP_FLAGS_GLOBAL_MSIX_AUTOMASKING               0x2UL
-	#define FUNC_QCAPS_RESP_FLAGS_PTP_SUPPORTED                         0x4UL
-	#define FUNC_QCAPS_RESP_FLAGS_ROCE_V1_SUPPORTED                     0x8UL
-	#define FUNC_QCAPS_RESP_FLAGS_ROCE_V2_SUPPORTED                     0x10UL
-	#define FUNC_QCAPS_RESP_FLAGS_WOL_MAGICPKT_SUPPORTED                0x20UL
-	#define FUNC_QCAPS_RESP_FLAGS_WOL_BMP_SUPPORTED                     0x40UL
-	#define FUNC_QCAPS_RESP_FLAGS_TX_RING_RL_SUPPORTED                  0x80UL
-	#define FUNC_QCAPS_RESP_FLAGS_TX_BW_CFG_SUPPORTED                   0x100UL
-	#define FUNC_QCAPS_RESP_FLAGS_VF_TX_RING_RL_SUPPORTED               0x200UL
-	#define FUNC_QCAPS_RESP_FLAGS_VF_BW_CFG_SUPPORTED                   0x400UL
-	#define FUNC_QCAPS_RESP_FLAGS_STD_TX_RING_MODE_SUPPORTED            0x800UL
-	#define FUNC_QCAPS_RESP_FLAGS_GENEVE_TUN_FLAGS_SUPPORTED            0x1000UL
-	#define FUNC_QCAPS_RESP_FLAGS_NVGRE_TUN_FLAGS_SUPPORTED             0x2000UL
-	#define FUNC_QCAPS_RESP_FLAGS_GRE_TUN_FLAGS_SUPPORTED               0x4000UL
-	#define FUNC_QCAPS_RESP_FLAGS_MPLS_TUN_FLAGS_SUPPORTED              0x8000UL
-	#define FUNC_QCAPS_RESP_FLAGS_PCIE_STATS_SUPPORTED                  0x10000UL
-	#define FUNC_QCAPS_RESP_FLAGS_ADOPTED_PF_SUPPORTED                  0x20000UL
-	#define FUNC_QCAPS_RESP_FLAGS_ADMIN_PF_SUPPORTED                    0x40000UL
-	#define FUNC_QCAPS_RESP_FLAGS_LINK_ADMIN_STATUS_SUPPORTED           0x80000UL
-	#define FUNC_QCAPS_RESP_FLAGS_WCB_PUSH_MODE                         0x100000UL
-	#define FUNC_QCAPS_RESP_FLAGS_DYNAMIC_TX_RING_ALLOC                 0x200000UL
-	#define FUNC_QCAPS_RESP_FLAGS_HOT_RESET_CAPABLE                     0x400000UL
-	#define FUNC_QCAPS_RESP_FLAGS_ERROR_RECOVERY_CAPABLE                0x800000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT_STATS_SUPPORTED                   0x1000000UL
-	#define FUNC_QCAPS_RESP_FLAGS_ERR_RECOVER_RELOAD                    0x2000000UL
-	#define FUNC_QCAPS_RESP_FLAGS_NOTIFY_VF_DEF_VNIC_CHNG_SUPPORTED     0x4000000UL
-	#define FUNC_QCAPS_RESP_FLAGS_VLAN_ACCELERATION_TX_DISABLED         0x8000000UL
-	#define FUNC_QCAPS_RESP_FLAGS_COREDUMP_CMD_SUPPORTED                0x10000000UL
-	#define FUNC_QCAPS_RESP_FLAGS_CRASHDUMP_CMD_SUPPORTED               0x20000000UL
-	#define FUNC_QCAPS_RESP_FLAGS_PFC_WD_STATS_SUPPORTED                0x40000000UL
-	#define FUNC_QCAPS_RESP_FLAGS_DBG_QCAPS_CMD_SUPPORTED               0x80000000UL
-	u8	mac_address[6];
-	__le16	max_rsscos_ctx;
-	__le16	max_cmpl_rings;
-	__le16	max_tx_rings;
-	__le16	max_rx_rings;
-	__le16	max_l2_ctxs;
-	__le16	max_vnics;
-	__le16	first_vf_id;
-	__le16	max_vfs;
-	__le16	max_stat_ctx;
-	__le32	max_encap_records;
-	__le32	max_decap_records;
-	__le32	max_tx_em_flows;
-	__le32	max_tx_wm_flows;
-	__le32	max_rx_em_flows;
-	__le32	max_rx_wm_flows;
-	__le32	max_mcast_filters;
-	__le32	max_flow_id;
-	__le32	max_hw_ring_grps;
-	__le16	max_sp_tx_rings;
-	__le16	max_msix_vfs;
-	__le32	flags_ext;
-	#define FUNC_QCAPS_RESP_FLAGS_EXT_ECN_MARK_SUPPORTED                          0x1UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT_ECN_STATS_SUPPORTED                         0x2UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT_EXT_HW_STATS_SUPPORTED                      0x4UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT_HOT_RESET_IF_SUPPORT                        0x8UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT_PROXY_MODE_SUPPORT                          0x10UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT_TX_PROXY_SRC_INTF_OVERRIDE_SUPPORT          0x20UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT_SCHQ_SUPPORTED                              0x40UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT_PPP_PUSH_MODE_SUPPORTED                     0x80UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT_EVB_MODE_CFG_NOT_SUPPORTED                  0x100UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT_SOC_SPD_SUPPORTED                           0x200UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT_FW_LIVEPATCH_SUPPORTED                      0x400UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT_FAST_RESET_CAPABLE                          0x800UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT_TX_METADATA_CFG_CAPABLE                     0x1000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT_NVM_OPTION_ACTION_SUPPORTED                 0x2000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT_BD_METADATA_SUPPORTED                       0x4000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT_ECHO_REQUEST_SUPPORTED                      0x8000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT_NPAR_1_2_SUPPORTED                          0x10000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT_PTP_PTM_SUPPORTED                           0x20000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT_PTP_PPS_SUPPORTED                           0x40000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT_VF_CFG_ASYNC_FOR_PF_SUPPORTED               0x80000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT_PARTITION_BW_SUPPORTED                      0x100000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT_DFLT_VLAN_TPID_PCP_SUPPORTED                0x200000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT_KTLS_SUPPORTED                              0x400000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT_EP_RATE_CONTROL                             0x800000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT_MIN_BW_SUPPORTED                            0x1000000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT_TX_COAL_CMPL_CAP                            0x2000000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT_BS_V2_SUPPORTED                             0x4000000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT_BS_V2_REQUIRED                              0x8000000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT_PTP_64BIT_RTC_SUPPORTED                     0x10000000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT_DBR_PACING_SUPPORTED                        0x20000000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT_HW_DBR_DROP_RECOV_SUPPORTED                 0x40000000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT_DISABLE_CQ_OVERFLOW_DETECTION_SUPPORTED     0x80000000UL
-	u8	max_schqs;
-	u8	mpc_chnls_cap;
-	#define FUNC_QCAPS_RESP_MPC_CHNLS_CAP_TCE         0x1UL
-	#define FUNC_QCAPS_RESP_MPC_CHNLS_CAP_RCE         0x2UL
-	#define FUNC_QCAPS_RESP_MPC_CHNLS_CAP_TE_CFA      0x4UL
-	#define FUNC_QCAPS_RESP_MPC_CHNLS_CAP_RE_CFA      0x8UL
-	#define FUNC_QCAPS_RESP_MPC_CHNLS_CAP_PRIMATE     0x10UL
-	__le16	max_key_ctxs_alloc;
-	__le32	flags_ext2;
-	#define FUNC_QCAPS_RESP_FLAGS_EXT2_RX_ALL_PKTS_TIMESTAMPS_SUPPORTED      0x1UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT2_QUIC_SUPPORTED                        0x2UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT2_KDNET_SUPPORTED                       0x4UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT2_DBR_PACING_EXT_SUPPORTED              0x8UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT2_SW_DBR_DROP_RECOVERY_SUPPORTED        0x10UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT2_GENERIC_STATS_SUPPORTED               0x20UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT2_UDP_GSO_SUPPORTED                     0x40UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT2_SYNCE_SUPPORTED                       0x80UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT2_DBR_PACING_V0_SUPPORTED               0x100UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT2_TX_PKT_TS_CMPL_SUPPORTED              0x200UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT2_HW_LAG_SUPPORTED                      0x400UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT2_ON_CHIP_CTX_SUPPORTED                 0x800UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT2_STEERING_TAG_SUPPORTED                0x1000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT2_ENHANCED_VF_SCALE_SUPPORTED           0x2000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT2_KEY_XID_PARTITION_SUPPORTED           0x4000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT2_CONCURRENT_KTLS_QUIC_SUPPORTED        0x8000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT2_SCHQ_CROSS_TC_CAP_SUPPORTED           0x10000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT2_SCHQ_PER_TC_CAP_SUPPORTED             0x20000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT2_SCHQ_PER_TC_RESERVATION_SUPPORTED     0x40000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT2_DB_ERROR_STATS_SUPPORTED              0x80000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT2_ROCE_VF_RESOURCE_MGMT_SUPPORTED       0x100000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT2_UDCC_SUPPORTED                        0x200000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT2_TIMED_TX_SO_TXTIME_SUPPORTED          0x400000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT2_SW_MAX_RESOURCE_LIMITS_SUPPORTED      0x800000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT2_TF_INGRESS_NIC_FLOW_SUPPORTED         0x1000000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT2_LPBK_STATS_SUPPORTED                  0x2000000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT2_TF_EGRESS_NIC_FLOW_SUPPORTED          0x4000000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT2_MULTI_LOSSLESS_QUEUES_SUPPORTED       0x8000000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT2_PEER_MMAP_SUPPORTED                   0x10000000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT2_TIMED_TX_PACING_SUPPORTED             0x20000000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT2_VF_STAT_EJECTION_SUPPORTED            0x40000000UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT2_HOST_COREDUMP_SUPPORTED               0x80000000UL
-	__le16	tunnel_disable_flag;
-	#define FUNC_QCAPS_RESP_TUNNEL_DISABLE_FLAG_DISABLE_VXLAN      0x1UL
-	#define FUNC_QCAPS_RESP_TUNNEL_DISABLE_FLAG_DISABLE_NGE        0x2UL
-	#define FUNC_QCAPS_RESP_TUNNEL_DISABLE_FLAG_DISABLE_NVGRE      0x4UL
-	#define FUNC_QCAPS_RESP_TUNNEL_DISABLE_FLAG_DISABLE_L2GRE      0x8UL
-	#define FUNC_QCAPS_RESP_TUNNEL_DISABLE_FLAG_DISABLE_GRE        0x10UL
-	#define FUNC_QCAPS_RESP_TUNNEL_DISABLE_FLAG_DISABLE_IPINIP     0x20UL
-	#define FUNC_QCAPS_RESP_TUNNEL_DISABLE_FLAG_DISABLE_MPLS       0x40UL
-	#define FUNC_QCAPS_RESP_TUNNEL_DISABLE_FLAG_DISABLE_PPPOE      0x80UL
-	__le16	xid_partition_cap;
-	#define FUNC_QCAPS_RESP_XID_PARTITION_CAP_TX_CK     0x1UL
-	#define FUNC_QCAPS_RESP_XID_PARTITION_CAP_RX_CK     0x2UL
-	u8	device_serial_number[8];
-	__le16	ctxs_per_partition;
-	__le16	max_tso_segs;
-	__le32	roce_vf_max_av;
-	__le32	roce_vf_max_cq;
-	__le32	roce_vf_max_mrw;
-	__le32	roce_vf_max_qp;
-	__le32	roce_vf_max_srq;
-	__le32	roce_vf_max_gid;
-	__le32	flags_ext3;
-	#define FUNC_QCAPS_RESP_FLAGS_EXT3_RM_RSV_WHILE_ALLOC_CAP            0x1UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT3_REQUIRE_L2_FILTER                 0x2UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT3_MAX_ROCE_VFS_SUPPORTED            0x4UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT3_RX_RATE_PROFILE_SEL_SUPPORTED     0x8UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT3_BIDI_OPT_SUPPORTED                0x10UL
-	#define FUNC_QCAPS_RESP_FLAGS_EXT3_MIRROR_ON_ROCE_SUPPORTED          0x20UL
-	__le16	max_roce_vfs;
-	__le16	max_crypto_rx_flow_filters;
-	u8	unused_3[3];
-	u8	valid;
-};
-
-/* hwrm_func_qcfg_input (size:192b/24B) */
-struct hwrm_func_qcfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	fid;
-	u8	unused_0[6];
-};
-
-/* hwrm_func_qcfg_output (size:1344b/168B) */
-struct hwrm_func_qcfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le16	fid;
-	__le16	port_id;
-	__le16	vlan;
-	__le16	flags;
-	#define FUNC_QCFG_RESP_FLAGS_OOB_WOL_MAGICPKT_ENABLED     0x1UL
-	#define FUNC_QCFG_RESP_FLAGS_OOB_WOL_BMP_ENABLED          0x2UL
-	#define FUNC_QCFG_RESP_FLAGS_FW_DCBX_AGENT_ENABLED        0x4UL
-	#define FUNC_QCFG_RESP_FLAGS_STD_TX_RING_MODE_ENABLED     0x8UL
-	#define FUNC_QCFG_RESP_FLAGS_FW_LLDP_AGENT_ENABLED        0x10UL
-	#define FUNC_QCFG_RESP_FLAGS_MULTI_HOST                   0x20UL
-	#define FUNC_QCFG_RESP_FLAGS_TRUSTED_VF                   0x40UL
-	#define FUNC_QCFG_RESP_FLAGS_SECURE_MODE_ENABLED          0x80UL
-	#define FUNC_QCFG_RESP_FLAGS_PREBOOT_LEGACY_L2_RINGS      0x100UL
-	#define FUNC_QCFG_RESP_FLAGS_HOT_RESET_ALLOWED            0x200UL
-	#define FUNC_QCFG_RESP_FLAGS_PPP_PUSH_MODE_ENABLED        0x400UL
-	#define FUNC_QCFG_RESP_FLAGS_RING_MONITOR_ENABLED         0x800UL
-	#define FUNC_QCFG_RESP_FLAGS_FAST_RESET_ALLOWED           0x1000UL
-	#define FUNC_QCFG_RESP_FLAGS_MULTI_ROOT                   0x2000UL
-	#define FUNC_QCFG_RESP_FLAGS_ENABLE_RDMA_SRIOV            0x4000UL
-	#define FUNC_QCFG_RESP_FLAGS_ROCE_VNIC_ID_VALID           0x8000UL
-	u8	mac_address[6];
-	__le16	pci_id;
-	__le16	alloc_rsscos_ctx;
-	__le16	alloc_cmpl_rings;
-	__le16	alloc_tx_rings;
-	__le16	alloc_rx_rings;
-	__le16	alloc_l2_ctx;
-	__le16	alloc_vnics;
-	__le16	admin_mtu;
-	__le16	mru;
-	__le16	stat_ctx_id;
-	u8	port_partition_type;
-	#define FUNC_QCFG_RESP_PORT_PARTITION_TYPE_SPF     0x0UL
-	#define FUNC_QCFG_RESP_PORT_PARTITION_TYPE_MPFS    0x1UL
-	#define FUNC_QCFG_RESP_PORT_PARTITION_TYPE_NPAR1_0 0x2UL
-	#define FUNC_QCFG_RESP_PORT_PARTITION_TYPE_NPAR1_5 0x3UL
-	#define FUNC_QCFG_RESP_PORT_PARTITION_TYPE_NPAR2_0 0x4UL
-	#define FUNC_QCFG_RESP_PORT_PARTITION_TYPE_NPAR1_2 0x5UL
-	#define FUNC_QCFG_RESP_PORT_PARTITION_TYPE_UNKNOWN 0xffUL
-	#define FUNC_QCFG_RESP_PORT_PARTITION_TYPE_LAST   FUNC_QCFG_RESP_PORT_PARTITION_TYPE_UNKNOWN
-	u8	port_pf_cnt;
-	#define FUNC_QCFG_RESP_PORT_PF_CNT_UNAVAIL 0x0UL
-	#define FUNC_QCFG_RESP_PORT_PF_CNT_LAST   FUNC_QCFG_RESP_PORT_PF_CNT_UNAVAIL
-	__le16	dflt_vnic_id;
-	__le16	max_mtu_configured;
-	__le32	min_bw;
-	#define FUNC_QCFG_RESP_MIN_BW_BW_VALUE_MASK             0xfffffffUL
-	#define FUNC_QCFG_RESP_MIN_BW_BW_VALUE_SFT              0
-	#define FUNC_QCFG_RESP_MIN_BW_SCALE                     0x10000000UL
-	#define FUNC_QCFG_RESP_MIN_BW_SCALE_BITS                  (0x0UL << 28)
-	#define FUNC_QCFG_RESP_MIN_BW_SCALE_BYTES                 (0x1UL << 28)
-	#define FUNC_QCFG_RESP_MIN_BW_SCALE_LAST                 FUNC_QCFG_RESP_MIN_BW_SCALE_BYTES
-	#define FUNC_QCFG_RESP_MIN_BW_BW_VALUE_UNIT_MASK        0xe0000000UL
-	#define FUNC_QCFG_RESP_MIN_BW_BW_VALUE_UNIT_SFT         29
-	#define FUNC_QCFG_RESP_MIN_BW_BW_VALUE_UNIT_MEGA          (0x0UL << 29)
-	#define FUNC_QCFG_RESP_MIN_BW_BW_VALUE_UNIT_KILO          (0x2UL << 29)
-	#define FUNC_QCFG_RESP_MIN_BW_BW_VALUE_UNIT_BASE          (0x4UL << 29)
-	#define FUNC_QCFG_RESP_MIN_BW_BW_VALUE_UNIT_GIGA          (0x6UL << 29)
-	#define FUNC_QCFG_RESP_MIN_BW_BW_VALUE_UNIT_PERCENT1_100  (0x1UL << 29)
-	#define FUNC_QCFG_RESP_MIN_BW_BW_VALUE_UNIT_INVALID       (0x7UL << 29)
-	#define FUNC_QCFG_RESP_MIN_BW_BW_VALUE_UNIT_LAST         FUNC_QCFG_RESP_MIN_BW_BW_VALUE_UNIT_INVALID
-	__le32	max_bw;
-	#define FUNC_QCFG_RESP_MAX_BW_BW_VALUE_MASK             0xfffffffUL
-	#define FUNC_QCFG_RESP_MAX_BW_BW_VALUE_SFT              0
-	#define FUNC_QCFG_RESP_MAX_BW_SCALE                     0x10000000UL
-	#define FUNC_QCFG_RESP_MAX_BW_SCALE_BITS                  (0x0UL << 28)
-	#define FUNC_QCFG_RESP_MAX_BW_SCALE_BYTES                 (0x1UL << 28)
-	#define FUNC_QCFG_RESP_MAX_BW_SCALE_LAST                 FUNC_QCFG_RESP_MAX_BW_SCALE_BYTES
-	#define FUNC_QCFG_RESP_MAX_BW_BW_VALUE_UNIT_MASK        0xe0000000UL
-	#define FUNC_QCFG_RESP_MAX_BW_BW_VALUE_UNIT_SFT         29
-	#define FUNC_QCFG_RESP_MAX_BW_BW_VALUE_UNIT_MEGA          (0x0UL << 29)
-	#define FUNC_QCFG_RESP_MAX_BW_BW_VALUE_UNIT_KILO          (0x2UL << 29)
-	#define FUNC_QCFG_RESP_MAX_BW_BW_VALUE_UNIT_BASE          (0x4UL << 29)
-	#define FUNC_QCFG_RESP_MAX_BW_BW_VALUE_UNIT_GIGA          (0x6UL << 29)
-	#define FUNC_QCFG_RESP_MAX_BW_BW_VALUE_UNIT_PERCENT1_100  (0x1UL << 29)
-	#define FUNC_QCFG_RESP_MAX_BW_BW_VALUE_UNIT_INVALID       (0x7UL << 29)
-	#define FUNC_QCFG_RESP_MAX_BW_BW_VALUE_UNIT_LAST         FUNC_QCFG_RESP_MAX_BW_BW_VALUE_UNIT_INVALID
-	u8	evb_mode;
-	#define FUNC_QCFG_RESP_EVB_MODE_NO_EVB 0x0UL
-	#define FUNC_QCFG_RESP_EVB_MODE_VEB    0x1UL
-	#define FUNC_QCFG_RESP_EVB_MODE_VEPA   0x2UL
-	#define FUNC_QCFG_RESP_EVB_MODE_LAST  FUNC_QCFG_RESP_EVB_MODE_VEPA
-	u8	options;
-	#define FUNC_QCFG_RESP_OPTIONS_CACHE_LINESIZE_MASK         0x3UL
-	#define FUNC_QCFG_RESP_OPTIONS_CACHE_LINESIZE_SFT          0
-	#define FUNC_QCFG_RESP_OPTIONS_CACHE_LINESIZE_SIZE_64        0x0UL
-	#define FUNC_QCFG_RESP_OPTIONS_CACHE_LINESIZE_SIZE_128       0x1UL
-	#define FUNC_QCFG_RESP_OPTIONS_CACHE_LINESIZE_LAST          FUNC_QCFG_RESP_OPTIONS_CACHE_LINESIZE_SIZE_128
-	#define FUNC_QCFG_RESP_OPTIONS_LINK_ADMIN_STATE_MASK       0xcUL
-	#define FUNC_QCFG_RESP_OPTIONS_LINK_ADMIN_STATE_SFT        2
-	#define FUNC_QCFG_RESP_OPTIONS_LINK_ADMIN_STATE_FORCED_DOWN  (0x0UL << 2)
-	#define FUNC_QCFG_RESP_OPTIONS_LINK_ADMIN_STATE_FORCED_UP    (0x1UL << 2)
-	#define FUNC_QCFG_RESP_OPTIONS_LINK_ADMIN_STATE_AUTO         (0x2UL << 2)
-	#define FUNC_QCFG_RESP_OPTIONS_LINK_ADMIN_STATE_LAST        FUNC_QCFG_RESP_OPTIONS_LINK_ADMIN_STATE_AUTO
-	#define FUNC_QCFG_RESP_OPTIONS_RSVD_MASK                   0xf0UL
-	#define FUNC_QCFG_RESP_OPTIONS_RSVD_SFT                    4
-	__le16	alloc_vfs;
-	__le32	alloc_mcast_filters;
-	__le32	alloc_hw_ring_grps;
-	__le16	alloc_sp_tx_rings;
-	__le16	alloc_stat_ctx;
-	__le16	alloc_msix;
-	__le16	registered_vfs;
-	__le16	l2_doorbell_bar_size_kb;
-	u8	active_endpoints;
-	u8	always_1;
-	__le32	reset_addr_poll;
-	__le16	legacy_l2_db_size_kb;
-	__le16	svif_info;
-	#define FUNC_QCFG_RESP_SVIF_INFO_SVIF_MASK      0x7fffUL
-	#define FUNC_QCFG_RESP_SVIF_INFO_SVIF_SFT       0
-	#define FUNC_QCFG_RESP_SVIF_INFO_SVIF_VALID     0x8000UL
-	u8	mpc_chnls;
-	#define FUNC_QCFG_RESP_MPC_CHNLS_TCE_ENABLED         0x1UL
-	#define FUNC_QCFG_RESP_MPC_CHNLS_RCE_ENABLED         0x2UL
-	#define FUNC_QCFG_RESP_MPC_CHNLS_TE_CFA_ENABLED      0x4UL
-	#define FUNC_QCFG_RESP_MPC_CHNLS_RE_CFA_ENABLED      0x8UL
-	#define FUNC_QCFG_RESP_MPC_CHNLS_PRIMATE_ENABLED     0x10UL
-	u8	db_page_size;
-	#define FUNC_QCFG_RESP_DB_PAGE_SIZE_4KB   0x0UL
-	#define FUNC_QCFG_RESP_DB_PAGE_SIZE_8KB   0x1UL
-	#define FUNC_QCFG_RESP_DB_PAGE_SIZE_16KB  0x2UL
-	#define FUNC_QCFG_RESP_DB_PAGE_SIZE_32KB  0x3UL
-	#define FUNC_QCFG_RESP_DB_PAGE_SIZE_64KB  0x4UL
-	#define FUNC_QCFG_RESP_DB_PAGE_SIZE_128KB 0x5UL
-	#define FUNC_QCFG_RESP_DB_PAGE_SIZE_256KB 0x6UL
-	#define FUNC_QCFG_RESP_DB_PAGE_SIZE_512KB 0x7UL
-	#define FUNC_QCFG_RESP_DB_PAGE_SIZE_1MB   0x8UL
-	#define FUNC_QCFG_RESP_DB_PAGE_SIZE_2MB   0x9UL
-	#define FUNC_QCFG_RESP_DB_PAGE_SIZE_4MB   0xaUL
-	#define FUNC_QCFG_RESP_DB_PAGE_SIZE_LAST FUNC_QCFG_RESP_DB_PAGE_SIZE_4MB
-	__le16	roce_vnic_id;
-	__le32	partition_min_bw;
-	#define FUNC_QCFG_RESP_PARTITION_MIN_BW_BW_VALUE_MASK             0xfffffffUL
-	#define FUNC_QCFG_RESP_PARTITION_MIN_BW_BW_VALUE_SFT              0
-	#define FUNC_QCFG_RESP_PARTITION_MIN_BW_SCALE                     0x10000000UL
-	#define FUNC_QCFG_RESP_PARTITION_MIN_BW_SCALE_BITS                  (0x0UL << 28)
-	#define FUNC_QCFG_RESP_PARTITION_MIN_BW_SCALE_BYTES                 (0x1UL << 28)
-	#define FUNC_QCFG_RESP_PARTITION_MIN_BW_SCALE_LAST                 FUNC_QCFG_RESP_PARTITION_MIN_BW_SCALE_BYTES
-	#define FUNC_QCFG_RESP_PARTITION_MIN_BW_BW_VALUE_UNIT_MASK        0xe0000000UL
-	#define FUNC_QCFG_RESP_PARTITION_MIN_BW_BW_VALUE_UNIT_SFT         29
-	#define FUNC_QCFG_RESP_PARTITION_MIN_BW_BW_VALUE_UNIT_PERCENT1_100  (0x1UL << 29)
-	#define FUNC_QCFG_RESP_PARTITION_MIN_BW_BW_VALUE_UNIT_LAST         FUNC_QCFG_RESP_PARTITION_MIN_BW_BW_VALUE_UNIT_PERCENT1_100
-	__le32	partition_max_bw;
-	#define FUNC_QCFG_RESP_PARTITION_MAX_BW_BW_VALUE_MASK             0xfffffffUL
-	#define FUNC_QCFG_RESP_PARTITION_MAX_BW_BW_VALUE_SFT              0
-	#define FUNC_QCFG_RESP_PARTITION_MAX_BW_SCALE                     0x10000000UL
-	#define FUNC_QCFG_RESP_PARTITION_MAX_BW_SCALE_BITS                  (0x0UL << 28)
-	#define FUNC_QCFG_RESP_PARTITION_MAX_BW_SCALE_BYTES                 (0x1UL << 28)
-	#define FUNC_QCFG_RESP_PARTITION_MAX_BW_SCALE_LAST                 FUNC_QCFG_RESP_PARTITION_MAX_BW_SCALE_BYTES
-	#define FUNC_QCFG_RESP_PARTITION_MAX_BW_BW_VALUE_UNIT_MASK        0xe0000000UL
-	#define FUNC_QCFG_RESP_PARTITION_MAX_BW_BW_VALUE_UNIT_SFT         29
-	#define FUNC_QCFG_RESP_PARTITION_MAX_BW_BW_VALUE_UNIT_PERCENT1_100  (0x1UL << 29)
-	#define FUNC_QCFG_RESP_PARTITION_MAX_BW_BW_VALUE_UNIT_LAST         FUNC_QCFG_RESP_PARTITION_MAX_BW_BW_VALUE_UNIT_PERCENT1_100
-	__le16	host_mtu;
-	__le16	flags2;
-	#define FUNC_QCFG_RESP_FLAGS2_SRIOV_DSCP_INSERT_ENABLED     0x1UL
-	__le16	stag_vid;
-	u8	port_kdnet_mode;
-	#define FUNC_QCFG_RESP_PORT_KDNET_MODE_DISABLED 0x0UL
-	#define FUNC_QCFG_RESP_PORT_KDNET_MODE_ENABLED  0x1UL
-	#define FUNC_QCFG_RESP_PORT_KDNET_MODE_LAST    FUNC_QCFG_RESP_PORT_KDNET_MODE_ENABLED
-	u8	kdnet_pcie_function;
-	__le16	port_kdnet_fid;
-	u8	unused_5;
-	u8	roce_bidi_opt_mode;
-	#define FUNC_QCFG_RESP_ROCE_BIDI_OPT_MODE_DISABLED      0x1UL
-	#define FUNC_QCFG_RESP_ROCE_BIDI_OPT_MODE_DEDICATED     0x2UL
-	#define FUNC_QCFG_RESP_ROCE_BIDI_OPT_MODE_SHARED        0x4UL
-	__le32	num_ktls_tx_key_ctxs;
-	__le32	num_ktls_rx_key_ctxs;
-	u8	lag_id;
-	u8	parif;
-	u8	fw_lag_id;
-	u8	unused_6;
-	__le32	num_quic_tx_key_ctxs;
-	__le32	num_quic_rx_key_ctxs;
-	__le32	roce_max_av_per_vf;
-	__le32	roce_max_cq_per_vf;
-	__le32	roce_max_mrw_per_vf;
-	__le32	roce_max_qp_per_vf;
-	__le32	roce_max_srq_per_vf;
-	__le32	roce_max_gid_per_vf;
-	__le16	xid_partition_cfg;
-	#define FUNC_QCFG_RESP_XID_PARTITION_CFG_TX_CK     0x1UL
-	#define FUNC_QCFG_RESP_XID_PARTITION_CFG_RX_CK     0x2UL
-	__le16	mirror_vnic_id;
-	u8	unused_7[7];
-	u8	valid;
-};
-
-/* hwrm_func_cfg_input (size:1280b/160B) */
-struct hwrm_func_cfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	fid;
-	__le16	num_msix;
-	__le32	flags;
-	#define FUNC_CFG_REQ_FLAGS_SRC_MAC_ADDR_CHECK_DISABLE     0x1UL
-	#define FUNC_CFG_REQ_FLAGS_SRC_MAC_ADDR_CHECK_ENABLE      0x2UL
-	#define FUNC_CFG_REQ_FLAGS_RSVD_MASK                      0x1fcUL
-	#define FUNC_CFG_REQ_FLAGS_RSVD_SFT                       2
-	#define FUNC_CFG_REQ_FLAGS_STD_TX_RING_MODE_ENABLE        0x200UL
-	#define FUNC_CFG_REQ_FLAGS_STD_TX_RING_MODE_DISABLE       0x400UL
-	#define FUNC_CFG_REQ_FLAGS_VIRT_MAC_PERSIST               0x800UL
-	#define FUNC_CFG_REQ_FLAGS_NO_AUTOCLEAR_STATISTIC         0x1000UL
-	#define FUNC_CFG_REQ_FLAGS_TX_ASSETS_TEST                 0x2000UL
-	#define FUNC_CFG_REQ_FLAGS_RX_ASSETS_TEST                 0x4000UL
-	#define FUNC_CFG_REQ_FLAGS_CMPL_ASSETS_TEST               0x8000UL
-	#define FUNC_CFG_REQ_FLAGS_RSSCOS_CTX_ASSETS_TEST         0x10000UL
-	#define FUNC_CFG_REQ_FLAGS_RING_GRP_ASSETS_TEST           0x20000UL
-	#define FUNC_CFG_REQ_FLAGS_STAT_CTX_ASSETS_TEST           0x40000UL
-	#define FUNC_CFG_REQ_FLAGS_VNIC_ASSETS_TEST               0x80000UL
-	#define FUNC_CFG_REQ_FLAGS_L2_CTX_ASSETS_TEST             0x100000UL
-	#define FUNC_CFG_REQ_FLAGS_TRUSTED_VF_ENABLE              0x200000UL
-	#define FUNC_CFG_REQ_FLAGS_DYNAMIC_TX_RING_ALLOC          0x400000UL
-	#define FUNC_CFG_REQ_FLAGS_NQ_ASSETS_TEST                 0x800000UL
-	#define FUNC_CFG_REQ_FLAGS_TRUSTED_VF_DISABLE             0x1000000UL
-	#define FUNC_CFG_REQ_FLAGS_PREBOOT_LEGACY_L2_RINGS        0x2000000UL
-	#define FUNC_CFG_REQ_FLAGS_HOT_RESET_IF_EN_DIS            0x4000000UL
-	#define FUNC_CFG_REQ_FLAGS_PPP_PUSH_MODE_ENABLE           0x8000000UL
-	#define FUNC_CFG_REQ_FLAGS_PPP_PUSH_MODE_DISABLE          0x10000000UL
-	#define FUNC_CFG_REQ_FLAGS_BD_METADATA_ENABLE             0x20000000UL
-	#define FUNC_CFG_REQ_FLAGS_BD_METADATA_DISABLE            0x40000000UL
-	__le32	enables;
-	#define FUNC_CFG_REQ_ENABLES_ADMIN_MTU                0x1UL
-	#define FUNC_CFG_REQ_ENABLES_MRU                      0x2UL
-	#define FUNC_CFG_REQ_ENABLES_NUM_RSSCOS_CTXS          0x4UL
-	#define FUNC_CFG_REQ_ENABLES_NUM_CMPL_RINGS           0x8UL
-	#define FUNC_CFG_REQ_ENABLES_NUM_TX_RINGS             0x10UL
-	#define FUNC_CFG_REQ_ENABLES_NUM_RX_RINGS             0x20UL
-	#define FUNC_CFG_REQ_ENABLES_NUM_L2_CTXS              0x40UL
-	#define FUNC_CFG_REQ_ENABLES_NUM_VNICS                0x80UL
-	#define FUNC_CFG_REQ_ENABLES_NUM_STAT_CTXS            0x100UL
-	#define FUNC_CFG_REQ_ENABLES_DFLT_MAC_ADDR            0x200UL
-	#define FUNC_CFG_REQ_ENABLES_DFLT_VLAN                0x400UL
-	#define FUNC_CFG_REQ_ENABLES_DFLT_IP_ADDR             0x800UL
-	#define FUNC_CFG_REQ_ENABLES_MIN_BW                   0x1000UL
-	#define FUNC_CFG_REQ_ENABLES_MAX_BW                   0x2000UL
-	#define FUNC_CFG_REQ_ENABLES_ASYNC_EVENT_CR           0x4000UL
-	#define FUNC_CFG_REQ_ENABLES_VLAN_ANTISPOOF_MODE      0x8000UL
-	#define FUNC_CFG_REQ_ENABLES_ALLOWED_VLAN_PRIS        0x10000UL
-	#define FUNC_CFG_REQ_ENABLES_EVB_MODE                 0x20000UL
-	#define FUNC_CFG_REQ_ENABLES_NUM_MCAST_FILTERS        0x40000UL
-	#define FUNC_CFG_REQ_ENABLES_NUM_HW_RING_GRPS         0x80000UL
-	#define FUNC_CFG_REQ_ENABLES_CACHE_LINESIZE           0x100000UL
-	#define FUNC_CFG_REQ_ENABLES_NUM_MSIX                 0x200000UL
-	#define FUNC_CFG_REQ_ENABLES_ADMIN_LINK_STATE         0x400000UL
-	#define FUNC_CFG_REQ_ENABLES_HOT_RESET_IF_SUPPORT     0x800000UL
-	#define FUNC_CFG_REQ_ENABLES_SCHQ_ID                  0x1000000UL
-	#define FUNC_CFG_REQ_ENABLES_MPC_CHNLS                0x2000000UL
-	#define FUNC_CFG_REQ_ENABLES_PARTITION_MIN_BW         0x4000000UL
-	#define FUNC_CFG_REQ_ENABLES_PARTITION_MAX_BW         0x8000000UL
-	#define FUNC_CFG_REQ_ENABLES_TPID                     0x10000000UL
-	#define FUNC_CFG_REQ_ENABLES_HOST_MTU                 0x20000000UL
-	#define FUNC_CFG_REQ_ENABLES_KTLS_TX_KEY_CTXS         0x40000000UL
-	#define FUNC_CFG_REQ_ENABLES_KTLS_RX_KEY_CTXS         0x80000000UL
-	__le16	admin_mtu;
-	__le16	mru;
-	__le16	num_rsscos_ctxs;
-	__le16	num_cmpl_rings;
-	__le16	num_tx_rings;
-	__le16	num_rx_rings;
-	__le16	num_l2_ctxs;
-	__le16	num_vnics;
-	__le16	num_stat_ctxs;
-	__le16	num_hw_ring_grps;
-	u8	dflt_mac_addr[6];
-	__le16	dflt_vlan;
-	__be32	dflt_ip_addr[4];
-	__le32	min_bw;
-	#define FUNC_CFG_REQ_MIN_BW_BW_VALUE_MASK             0xfffffffUL
-	#define FUNC_CFG_REQ_MIN_BW_BW_VALUE_SFT              0
-	#define FUNC_CFG_REQ_MIN_BW_SCALE                     0x10000000UL
-	#define FUNC_CFG_REQ_MIN_BW_SCALE_BITS                  (0x0UL << 28)
-	#define FUNC_CFG_REQ_MIN_BW_SCALE_BYTES                 (0x1UL << 28)
-	#define FUNC_CFG_REQ_MIN_BW_SCALE_LAST                 FUNC_CFG_REQ_MIN_BW_SCALE_BYTES
-	#define FUNC_CFG_REQ_MIN_BW_BW_VALUE_UNIT_MASK        0xe0000000UL
-	#define FUNC_CFG_REQ_MIN_BW_BW_VALUE_UNIT_SFT         29
-	#define FUNC_CFG_REQ_MIN_BW_BW_VALUE_UNIT_MEGA          (0x0UL << 29)
-	#define FUNC_CFG_REQ_MIN_BW_BW_VALUE_UNIT_KILO          (0x2UL << 29)
-	#define FUNC_CFG_REQ_MIN_BW_BW_VALUE_UNIT_BASE          (0x4UL << 29)
-	#define FUNC_CFG_REQ_MIN_BW_BW_VALUE_UNIT_GIGA          (0x6UL << 29)
-	#define FUNC_CFG_REQ_MIN_BW_BW_VALUE_UNIT_PERCENT1_100  (0x1UL << 29)
-	#define FUNC_CFG_REQ_MIN_BW_BW_VALUE_UNIT_INVALID       (0x7UL << 29)
-	#define FUNC_CFG_REQ_MIN_BW_BW_VALUE_UNIT_LAST         FUNC_CFG_REQ_MIN_BW_BW_VALUE_UNIT_INVALID
-	__le32	max_bw;
-	#define FUNC_CFG_REQ_MAX_BW_BW_VALUE_MASK             0xfffffffUL
-	#define FUNC_CFG_REQ_MAX_BW_BW_VALUE_SFT              0
-	#define FUNC_CFG_REQ_MAX_BW_SCALE                     0x10000000UL
-	#define FUNC_CFG_REQ_MAX_BW_SCALE_BITS                  (0x0UL << 28)
-	#define FUNC_CFG_REQ_MAX_BW_SCALE_BYTES                 (0x1UL << 28)
-	#define FUNC_CFG_REQ_MAX_BW_SCALE_LAST                 FUNC_CFG_REQ_MAX_BW_SCALE_BYTES
-	#define FUNC_CFG_REQ_MAX_BW_BW_VALUE_UNIT_MASK        0xe0000000UL
-	#define FUNC_CFG_REQ_MAX_BW_BW_VALUE_UNIT_SFT         29
-	#define FUNC_CFG_REQ_MAX_BW_BW_VALUE_UNIT_MEGA          (0x0UL << 29)
-	#define FUNC_CFG_REQ_MAX_BW_BW_VALUE_UNIT_KILO          (0x2UL << 29)
-	#define FUNC_CFG_REQ_MAX_BW_BW_VALUE_UNIT_BASE          (0x4UL << 29)
-	#define FUNC_CFG_REQ_MAX_BW_BW_VALUE_UNIT_GIGA          (0x6UL << 29)
-	#define FUNC_CFG_REQ_MAX_BW_BW_VALUE_UNIT_PERCENT1_100  (0x1UL << 29)
-	#define FUNC_CFG_REQ_MAX_BW_BW_VALUE_UNIT_INVALID       (0x7UL << 29)
-	#define FUNC_CFG_REQ_MAX_BW_BW_VALUE_UNIT_LAST         FUNC_CFG_REQ_MAX_BW_BW_VALUE_UNIT_INVALID
-	__le16	async_event_cr;
-	u8	vlan_antispoof_mode;
-	#define FUNC_CFG_REQ_VLAN_ANTISPOOF_MODE_NOCHECK                 0x0UL
-	#define FUNC_CFG_REQ_VLAN_ANTISPOOF_MODE_VALIDATE_VLAN           0x1UL
-	#define FUNC_CFG_REQ_VLAN_ANTISPOOF_MODE_INSERT_IF_VLANDNE       0x2UL
-	#define FUNC_CFG_REQ_VLAN_ANTISPOOF_MODE_INSERT_OR_OVERRIDE_VLAN 0x3UL
-	#define FUNC_CFG_REQ_VLAN_ANTISPOOF_MODE_LAST                   FUNC_CFG_REQ_VLAN_ANTISPOOF_MODE_INSERT_OR_OVERRIDE_VLAN
-	u8	allowed_vlan_pris;
-	u8	evb_mode;
-	#define FUNC_CFG_REQ_EVB_MODE_NO_EVB 0x0UL
-	#define FUNC_CFG_REQ_EVB_MODE_VEB    0x1UL
-	#define FUNC_CFG_REQ_EVB_MODE_VEPA   0x2UL
-	#define FUNC_CFG_REQ_EVB_MODE_LAST  FUNC_CFG_REQ_EVB_MODE_VEPA
-	u8	options;
-	#define FUNC_CFG_REQ_OPTIONS_CACHE_LINESIZE_MASK         0x3UL
-	#define FUNC_CFG_REQ_OPTIONS_CACHE_LINESIZE_SFT          0
-	#define FUNC_CFG_REQ_OPTIONS_CACHE_LINESIZE_SIZE_64        0x0UL
-	#define FUNC_CFG_REQ_OPTIONS_CACHE_LINESIZE_SIZE_128       0x1UL
-	#define FUNC_CFG_REQ_OPTIONS_CACHE_LINESIZE_LAST          FUNC_CFG_REQ_OPTIONS_CACHE_LINESIZE_SIZE_128
-	#define FUNC_CFG_REQ_OPTIONS_LINK_ADMIN_STATE_MASK       0xcUL
-	#define FUNC_CFG_REQ_OPTIONS_LINK_ADMIN_STATE_SFT        2
-	#define FUNC_CFG_REQ_OPTIONS_LINK_ADMIN_STATE_FORCED_DOWN  (0x0UL << 2)
-	#define FUNC_CFG_REQ_OPTIONS_LINK_ADMIN_STATE_FORCED_UP    (0x1UL << 2)
-	#define FUNC_CFG_REQ_OPTIONS_LINK_ADMIN_STATE_AUTO         (0x2UL << 2)
-	#define FUNC_CFG_REQ_OPTIONS_LINK_ADMIN_STATE_LAST        FUNC_CFG_REQ_OPTIONS_LINK_ADMIN_STATE_AUTO
-	#define FUNC_CFG_REQ_OPTIONS_RSVD_MASK                   0xf0UL
-	#define FUNC_CFG_REQ_OPTIONS_RSVD_SFT                    4
-	__le16	num_mcast_filters;
-	__le16	schq_id;
-	__le16	mpc_chnls;
-	#define FUNC_CFG_REQ_MPC_CHNLS_TCE_ENABLE          0x1UL
-	#define FUNC_CFG_REQ_MPC_CHNLS_TCE_DISABLE         0x2UL
-	#define FUNC_CFG_REQ_MPC_CHNLS_RCE_ENABLE          0x4UL
-	#define FUNC_CFG_REQ_MPC_CHNLS_RCE_DISABLE         0x8UL
-	#define FUNC_CFG_REQ_MPC_CHNLS_TE_CFA_ENABLE       0x10UL
-	#define FUNC_CFG_REQ_MPC_CHNLS_TE_CFA_DISABLE      0x20UL
-	#define FUNC_CFG_REQ_MPC_CHNLS_RE_CFA_ENABLE       0x40UL
-	#define FUNC_CFG_REQ_MPC_CHNLS_RE_CFA_DISABLE      0x80UL
-	#define FUNC_CFG_REQ_MPC_CHNLS_PRIMATE_ENABLE      0x100UL
-	#define FUNC_CFG_REQ_MPC_CHNLS_PRIMATE_DISABLE     0x200UL
-	__le32	partition_min_bw;
-	#define FUNC_CFG_REQ_PARTITION_MIN_BW_BW_VALUE_MASK             0xfffffffUL
-	#define FUNC_CFG_REQ_PARTITION_MIN_BW_BW_VALUE_SFT              0
-	#define FUNC_CFG_REQ_PARTITION_MIN_BW_SCALE                     0x10000000UL
-	#define FUNC_CFG_REQ_PARTITION_MIN_BW_SCALE_BITS                  (0x0UL << 28)
-	#define FUNC_CFG_REQ_PARTITION_MIN_BW_SCALE_BYTES                 (0x1UL << 28)
-	#define FUNC_CFG_REQ_PARTITION_MIN_BW_SCALE_LAST                 FUNC_CFG_REQ_PARTITION_MIN_BW_SCALE_BYTES
-	#define FUNC_CFG_REQ_PARTITION_MIN_BW_BW_VALUE_UNIT_MASK        0xe0000000UL
-	#define FUNC_CFG_REQ_PARTITION_MIN_BW_BW_VALUE_UNIT_SFT         29
-	#define FUNC_CFG_REQ_PARTITION_MIN_BW_BW_VALUE_UNIT_PERCENT1_100  (0x1UL << 29)
-	#define FUNC_CFG_REQ_PARTITION_MIN_BW_BW_VALUE_UNIT_LAST         FUNC_CFG_REQ_PARTITION_MIN_BW_BW_VALUE_UNIT_PERCENT1_100
-	__le32	partition_max_bw;
-	#define FUNC_CFG_REQ_PARTITION_MAX_BW_BW_VALUE_MASK             0xfffffffUL
-	#define FUNC_CFG_REQ_PARTITION_MAX_BW_BW_VALUE_SFT              0
-	#define FUNC_CFG_REQ_PARTITION_MAX_BW_SCALE                     0x10000000UL
-	#define FUNC_CFG_REQ_PARTITION_MAX_BW_SCALE_BITS                  (0x0UL << 28)
-	#define FUNC_CFG_REQ_PARTITION_MAX_BW_SCALE_BYTES                 (0x1UL << 28)
-	#define FUNC_CFG_REQ_PARTITION_MAX_BW_SCALE_LAST                 FUNC_CFG_REQ_PARTITION_MAX_BW_SCALE_BYTES
-	#define FUNC_CFG_REQ_PARTITION_MAX_BW_BW_VALUE_UNIT_MASK        0xe0000000UL
-	#define FUNC_CFG_REQ_PARTITION_MAX_BW_BW_VALUE_UNIT_SFT         29
-	#define FUNC_CFG_REQ_PARTITION_MAX_BW_BW_VALUE_UNIT_PERCENT1_100  (0x1UL << 29)
-	#define FUNC_CFG_REQ_PARTITION_MAX_BW_BW_VALUE_UNIT_LAST         FUNC_CFG_REQ_PARTITION_MAX_BW_BW_VALUE_UNIT_PERCENT1_100
-	__be16	tpid;
-	__le16	host_mtu;
-	__le32	flags2;
-	#define FUNC_CFG_REQ_FLAGS2_KTLS_KEY_CTX_ASSETS_TEST     0x1UL
-	#define FUNC_CFG_REQ_FLAGS2_QUIC_KEY_CTX_ASSETS_TEST     0x2UL
-	__le32	enables2;
-	#define FUNC_CFG_REQ_ENABLES2_KDNET                    0x1UL
-	#define FUNC_CFG_REQ_ENABLES2_DB_PAGE_SIZE             0x2UL
-	#define FUNC_CFG_REQ_ENABLES2_QUIC_TX_KEY_CTXS         0x4UL
-	#define FUNC_CFG_REQ_ENABLES2_QUIC_RX_KEY_CTXS         0x8UL
-	#define FUNC_CFG_REQ_ENABLES2_ROCE_MAX_AV_PER_VF       0x10UL
-	#define FUNC_CFG_REQ_ENABLES2_ROCE_MAX_CQ_PER_VF       0x20UL
-	#define FUNC_CFG_REQ_ENABLES2_ROCE_MAX_MRW_PER_VF      0x40UL
-	#define FUNC_CFG_REQ_ENABLES2_ROCE_MAX_QP_PER_VF       0x80UL
-	#define FUNC_CFG_REQ_ENABLES2_ROCE_MAX_SRQ_PER_VF      0x100UL
-	#define FUNC_CFG_REQ_ENABLES2_ROCE_MAX_GID_PER_VF      0x200UL
-	#define FUNC_CFG_REQ_ENABLES2_XID_PARTITION_CFG        0x400UL
-	#define FUNC_CFG_REQ_ENABLES2_PHYSICAL_SLOT_NUMBER     0x800UL
-	u8	port_kdnet_mode;
-	#define FUNC_CFG_REQ_PORT_KDNET_MODE_DISABLED 0x0UL
-	#define FUNC_CFG_REQ_PORT_KDNET_MODE_ENABLED  0x1UL
-	#define FUNC_CFG_REQ_PORT_KDNET_MODE_LAST    FUNC_CFG_REQ_PORT_KDNET_MODE_ENABLED
-	u8	db_page_size;
-	#define FUNC_CFG_REQ_DB_PAGE_SIZE_4KB   0x0UL
-	#define FUNC_CFG_REQ_DB_PAGE_SIZE_8KB   0x1UL
-	#define FUNC_CFG_REQ_DB_PAGE_SIZE_16KB  0x2UL
-	#define FUNC_CFG_REQ_DB_PAGE_SIZE_32KB  0x3UL
-	#define FUNC_CFG_REQ_DB_PAGE_SIZE_64KB  0x4UL
-	#define FUNC_CFG_REQ_DB_PAGE_SIZE_128KB 0x5UL
-	#define FUNC_CFG_REQ_DB_PAGE_SIZE_256KB 0x6UL
-	#define FUNC_CFG_REQ_DB_PAGE_SIZE_512KB 0x7UL
-	#define FUNC_CFG_REQ_DB_PAGE_SIZE_1MB   0x8UL
-	#define FUNC_CFG_REQ_DB_PAGE_SIZE_2MB   0x9UL
-	#define FUNC_CFG_REQ_DB_PAGE_SIZE_4MB   0xaUL
-	#define FUNC_CFG_REQ_DB_PAGE_SIZE_LAST FUNC_CFG_REQ_DB_PAGE_SIZE_4MB
-	__le16	physical_slot_number;
-	__le32	num_ktls_tx_key_ctxs;
-	__le32	num_ktls_rx_key_ctxs;
-	__le32	num_quic_tx_key_ctxs;
-	__le32	num_quic_rx_key_ctxs;
-	__le32	roce_max_av_per_vf;
-	__le32	roce_max_cq_per_vf;
-	__le32	roce_max_mrw_per_vf;
-	__le32	roce_max_qp_per_vf;
-	__le32	roce_max_srq_per_vf;
-	__le32	roce_max_gid_per_vf;
-	__le16	xid_partition_cfg;
-	#define FUNC_CFG_REQ_XID_PARTITION_CFG_TX_CK     0x1UL
-	#define FUNC_CFG_REQ_XID_PARTITION_CFG_RX_CK     0x2UL
-	__le16	unused_2;
-};
-
-/* hwrm_func_cfg_output (size:128b/16B) */
-struct hwrm_func_cfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_func_cfg_cmd_err (size:64b/8B) */
-struct hwrm_func_cfg_cmd_err {
-	u8	code;
-	#define FUNC_CFG_CMD_ERR_CODE_UNKNOWN                      0x0UL
-	#define FUNC_CFG_CMD_ERR_CODE_PARTITION_MIN_BW_RANGE       0x1UL
-	#define FUNC_CFG_CMD_ERR_CODE_PARTITION_MIN_MORE_THAN_MAX  0x2UL
-	#define FUNC_CFG_CMD_ERR_CODE_PARTITION_MIN_BW_UNSUPPORTED 0x3UL
-	#define FUNC_CFG_CMD_ERR_CODE_PARTITION_BW_PERCENT         0x4UL
-	#define FUNC_CFG_CMD_ERR_CODE_LAST                        FUNC_CFG_CMD_ERR_CODE_PARTITION_BW_PERCENT
-	u8	unused_0[7];
-};
-
-/* hwrm_func_qstats_input (size:192b/24B) */
-struct hwrm_func_qstats_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	fid;
-	u8	flags;
-	#define FUNC_QSTATS_REQ_FLAGS_ROCE_ONLY        0x1UL
-	#define FUNC_QSTATS_REQ_FLAGS_COUNTER_MASK     0x2UL
-	#define FUNC_QSTATS_REQ_FLAGS_L2_ONLY          0x4UL
-	u8	unused_0[5];
-};
-
-/* hwrm_func_qstats_output (size:1408b/176B) */
-struct hwrm_func_qstats_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le64	tx_ucast_pkts;
-	__le64	tx_mcast_pkts;
-	__le64	tx_bcast_pkts;
-	__le64	tx_discard_pkts;
-	__le64	tx_drop_pkts;
-	__le64	tx_ucast_bytes;
-	__le64	tx_mcast_bytes;
-	__le64	tx_bcast_bytes;
-	__le64	rx_ucast_pkts;
-	__le64	rx_mcast_pkts;
-	__le64	rx_bcast_pkts;
-	__le64	rx_discard_pkts;
-	__le64	rx_drop_pkts;
-	__le64	rx_ucast_bytes;
-	__le64	rx_mcast_bytes;
-	__le64	rx_bcast_bytes;
-	__le64	rx_agg_pkts;
-	__le64	rx_agg_bytes;
-	__le64	rx_agg_events;
-	__le64	rx_agg_aborts;
-	u8	clear_seq;
-	u8	unused_0[6];
-	u8	valid;
-};
-
-/* hwrm_func_qstats_ext_input (size:256b/32B) */
-struct hwrm_func_qstats_ext_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	fid;
-	u8	flags;
-	#define FUNC_QSTATS_EXT_REQ_FLAGS_ROCE_ONLY        0x1UL
-	#define FUNC_QSTATS_EXT_REQ_FLAGS_COUNTER_MASK     0x2UL
-	u8	unused_0[1];
-	__le32	enables;
-	#define FUNC_QSTATS_EXT_REQ_ENABLES_SCHQ_ID     0x1UL
-	__le16	schq_id;
-	__le16	traffic_class;
-	u8	unused_1[4];
-};
-
-/* hwrm_func_qstats_ext_output (size:1536b/192B) */
-struct hwrm_func_qstats_ext_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le64	rx_ucast_pkts;
-	__le64	rx_mcast_pkts;
-	__le64	rx_bcast_pkts;
-	__le64	rx_discard_pkts;
-	__le64	rx_error_pkts;
-	__le64	rx_ucast_bytes;
-	__le64	rx_mcast_bytes;
-	__le64	rx_bcast_bytes;
-	__le64	tx_ucast_pkts;
-	__le64	tx_mcast_pkts;
-	__le64	tx_bcast_pkts;
-	__le64	tx_error_pkts;
-	__le64	tx_discard_pkts;
-	__le64	tx_ucast_bytes;
-	__le64	tx_mcast_bytes;
-	__le64	tx_bcast_bytes;
-	__le64	rx_tpa_eligible_pkt;
-	__le64	rx_tpa_eligible_bytes;
-	__le64	rx_tpa_pkt;
-	__le64	rx_tpa_bytes;
-	__le64	rx_tpa_errors;
-	__le64	rx_tpa_events;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_func_clr_stats_input (size:192b/24B) */
-struct hwrm_func_clr_stats_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	fid;
-	u8	unused_0[6];
-};
-
-/* hwrm_func_clr_stats_output (size:128b/16B) */
-struct hwrm_func_clr_stats_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_func_vf_resc_free_input (size:192b/24B) */
-struct hwrm_func_vf_resc_free_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	vf_id;
-	u8	unused_0[6];
-};
-
-/* hwrm_func_vf_resc_free_output (size:128b/16B) */
-struct hwrm_func_vf_resc_free_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_func_drv_rgtr_input (size:896b/112B) */
-struct hwrm_func_drv_rgtr_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	flags;
-	#define FUNC_DRV_RGTR_REQ_FLAGS_FWD_ALL_MODE                     0x1UL
-	#define FUNC_DRV_RGTR_REQ_FLAGS_FWD_NONE_MODE                    0x2UL
-	#define FUNC_DRV_RGTR_REQ_FLAGS_16BIT_VER_MODE                   0x4UL
-	#define FUNC_DRV_RGTR_REQ_FLAGS_FLOW_HANDLE_64BIT_MODE           0x8UL
-	#define FUNC_DRV_RGTR_REQ_FLAGS_HOT_RESET_SUPPORT                0x10UL
-	#define FUNC_DRV_RGTR_REQ_FLAGS_ERROR_RECOVERY_SUPPORT           0x20UL
-	#define FUNC_DRV_RGTR_REQ_FLAGS_MASTER_SUPPORT                   0x40UL
-	#define FUNC_DRV_RGTR_REQ_FLAGS_FAST_RESET_SUPPORT               0x80UL
-	#define FUNC_DRV_RGTR_REQ_FLAGS_RSS_STRICT_HASH_TYPE_SUPPORT     0x100UL
-	#define FUNC_DRV_RGTR_REQ_FLAGS_NPAR_1_2_SUPPORT                 0x200UL
-	#define FUNC_DRV_RGTR_REQ_FLAGS_ASYM_QUEUE_CFG_SUPPORT           0x400UL
-	#define FUNC_DRV_RGTR_REQ_FLAGS_TF_INGRESS_NIC_FLOW_MODE         0x800UL
-	#define FUNC_DRV_RGTR_REQ_FLAGS_TF_EGRESS_NIC_FLOW_MODE          0x1000UL
-	__le32	enables;
-	#define FUNC_DRV_RGTR_REQ_ENABLES_OS_TYPE             0x1UL
-	#define FUNC_DRV_RGTR_REQ_ENABLES_VER                 0x2UL
-	#define FUNC_DRV_RGTR_REQ_ENABLES_TIMESTAMP           0x4UL
-	#define FUNC_DRV_RGTR_REQ_ENABLES_VF_REQ_FWD          0x8UL
-	#define FUNC_DRV_RGTR_REQ_ENABLES_ASYNC_EVENT_FWD     0x10UL
-	__le16	os_type;
-	#define FUNC_DRV_RGTR_REQ_OS_TYPE_UNKNOWN   0x0UL
-	#define FUNC_DRV_RGTR_REQ_OS_TYPE_OTHER     0x1UL
-	#define FUNC_DRV_RGTR_REQ_OS_TYPE_MSDOS     0xeUL
-	#define FUNC_DRV_RGTR_REQ_OS_TYPE_WINDOWS   0x12UL
-	#define FUNC_DRV_RGTR_REQ_OS_TYPE_SOLARIS   0x1dUL
-	#define FUNC_DRV_RGTR_REQ_OS_TYPE_LINUX     0x24UL
-	#define FUNC_DRV_RGTR_REQ_OS_TYPE_FREEBSD   0x2aUL
-	#define FUNC_DRV_RGTR_REQ_OS_TYPE_ESXI      0x68UL
-	#define FUNC_DRV_RGTR_REQ_OS_TYPE_WIN864    0x73UL
-	#define FUNC_DRV_RGTR_REQ_OS_TYPE_WIN2012R2 0x74UL
-	#define FUNC_DRV_RGTR_REQ_OS_TYPE_UEFI      0x8000UL
-	#define FUNC_DRV_RGTR_REQ_OS_TYPE_LAST     FUNC_DRV_RGTR_REQ_OS_TYPE_UEFI
-	u8	ver_maj_8b;
-	u8	ver_min_8b;
-	u8	ver_upd_8b;
-	u8	unused_0[3];
-	__le32	timestamp;
-	u8	unused_1[4];
-	__le32	vf_req_fwd[8];
-	__le32	async_event_fwd[8];
-	__le16	ver_maj;
-	__le16	ver_min;
-	__le16	ver_upd;
-	__le16	ver_patch;
-};
-
-/* hwrm_func_drv_rgtr_output (size:128b/16B) */
-struct hwrm_func_drv_rgtr_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le32	flags;
-	#define FUNC_DRV_RGTR_RESP_FLAGS_IF_CHANGE_SUPPORTED     0x1UL
-	u8	unused_0[3];
-	u8	valid;
-};
-
-/* hwrm_func_drv_unrgtr_input (size:192b/24B) */
-struct hwrm_func_drv_unrgtr_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	flags;
-	#define FUNC_DRV_UNRGTR_REQ_FLAGS_PREPARE_FOR_SHUTDOWN     0x1UL
-	u8	unused_0[4];
-};
-
-/* hwrm_func_drv_unrgtr_output (size:128b/16B) */
-struct hwrm_func_drv_unrgtr_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_func_buf_rgtr_input (size:1024b/128B) */
-struct hwrm_func_buf_rgtr_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	enables;
-	#define FUNC_BUF_RGTR_REQ_ENABLES_VF_ID            0x1UL
-	#define FUNC_BUF_RGTR_REQ_ENABLES_ERR_BUF_ADDR     0x2UL
-	__le16	vf_id;
-	__le16	req_buf_num_pages;
-	__le16	req_buf_page_size;
-	#define FUNC_BUF_RGTR_REQ_REQ_BUF_PAGE_SIZE_16B 0x4UL
-	#define FUNC_BUF_RGTR_REQ_REQ_BUF_PAGE_SIZE_4K  0xcUL
-	#define FUNC_BUF_RGTR_REQ_REQ_BUF_PAGE_SIZE_8K  0xdUL
-	#define FUNC_BUF_RGTR_REQ_REQ_BUF_PAGE_SIZE_64K 0x10UL
-	#define FUNC_BUF_RGTR_REQ_REQ_BUF_PAGE_SIZE_2M  0x15UL
-	#define FUNC_BUF_RGTR_REQ_REQ_BUF_PAGE_SIZE_4M  0x16UL
-	#define FUNC_BUF_RGTR_REQ_REQ_BUF_PAGE_SIZE_1G  0x1eUL
-	#define FUNC_BUF_RGTR_REQ_REQ_BUF_PAGE_SIZE_LAST FUNC_BUF_RGTR_REQ_REQ_BUF_PAGE_SIZE_1G
-	__le16	req_buf_len;
-	__le16	resp_buf_len;
-	u8	unused_0[2];
-	__le64	req_buf_page_addr0;
-	__le64	req_buf_page_addr1;
-	__le64	req_buf_page_addr2;
-	__le64	req_buf_page_addr3;
-	__le64	req_buf_page_addr4;
-	__le64	req_buf_page_addr5;
-	__le64	req_buf_page_addr6;
-	__le64	req_buf_page_addr7;
-	__le64	req_buf_page_addr8;
-	__le64	req_buf_page_addr9;
-	__le64	error_buf_addr;
-	__le64	resp_buf_addr;
-};
-
-/* hwrm_func_buf_rgtr_output (size:128b/16B) */
-struct hwrm_func_buf_rgtr_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_func_drv_qver_input (size:192b/24B) */
-struct hwrm_func_drv_qver_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	reserved;
-	__le16	fid;
-	u8	driver_type;
-	#define FUNC_DRV_QVER_REQ_DRIVER_TYPE_L2   0x0UL
-	#define FUNC_DRV_QVER_REQ_DRIVER_TYPE_ROCE 0x1UL
-	#define FUNC_DRV_QVER_REQ_DRIVER_TYPE_LAST FUNC_DRV_QVER_REQ_DRIVER_TYPE_ROCE
-	u8	unused_0;
-};
-
-/* hwrm_func_drv_qver_output (size:256b/32B) */
-struct hwrm_func_drv_qver_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le16	os_type;
-	#define FUNC_DRV_QVER_RESP_OS_TYPE_UNKNOWN   0x0UL
-	#define FUNC_DRV_QVER_RESP_OS_TYPE_OTHER     0x1UL
-	#define FUNC_DRV_QVER_RESP_OS_TYPE_MSDOS     0xeUL
-	#define FUNC_DRV_QVER_RESP_OS_TYPE_WINDOWS   0x12UL
-	#define FUNC_DRV_QVER_RESP_OS_TYPE_SOLARIS   0x1dUL
-	#define FUNC_DRV_QVER_RESP_OS_TYPE_LINUX     0x24UL
-	#define FUNC_DRV_QVER_RESP_OS_TYPE_FREEBSD   0x2aUL
-	#define FUNC_DRV_QVER_RESP_OS_TYPE_ESXI      0x68UL
-	#define FUNC_DRV_QVER_RESP_OS_TYPE_WIN864    0x73UL
-	#define FUNC_DRV_QVER_RESP_OS_TYPE_WIN2012R2 0x74UL
-	#define FUNC_DRV_QVER_RESP_OS_TYPE_UEFI      0x8000UL
-	#define FUNC_DRV_QVER_RESP_OS_TYPE_LAST     FUNC_DRV_QVER_RESP_OS_TYPE_UEFI
-	u8	ver_maj_8b;
-	u8	ver_min_8b;
-	u8	ver_upd_8b;
-	u8	unused_0[3];
-	__le16	ver_maj;
-	__le16	ver_min;
-	__le16	ver_upd;
-	__le16	ver_patch;
-	u8	unused_1[7];
-	u8	valid;
-};
-
-/* hwrm_func_resource_qcaps_input (size:192b/24B) */
-struct hwrm_func_resource_qcaps_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	fid;
-	u8	unused_0[6];
-};
-
-/* hwrm_func_resource_qcaps_output (size:704b/88B) */
-struct hwrm_func_resource_qcaps_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le16	max_vfs;
-	__le16	max_msix;
-	__le16	vf_reservation_strategy;
-	#define FUNC_RESOURCE_QCAPS_RESP_VF_RESERVATION_STRATEGY_MAXIMAL        0x0UL
-	#define FUNC_RESOURCE_QCAPS_RESP_VF_RESERVATION_STRATEGY_MINIMAL        0x1UL
-	#define FUNC_RESOURCE_QCAPS_RESP_VF_RESERVATION_STRATEGY_MINIMAL_STATIC 0x2UL
-	#define FUNC_RESOURCE_QCAPS_RESP_VF_RESERVATION_STRATEGY_LAST          FUNC_RESOURCE_QCAPS_RESP_VF_RESERVATION_STRATEGY_MINIMAL_STATIC
-	__le16	min_rsscos_ctx;
-	__le16	max_rsscos_ctx;
-	__le16	min_cmpl_rings;
-	__le16	max_cmpl_rings;
-	__le16	min_tx_rings;
-	__le16	max_tx_rings;
-	__le16	min_rx_rings;
-	__le16	max_rx_rings;
-	__le16	min_l2_ctxs;
-	__le16	max_l2_ctxs;
-	__le16	min_vnics;
-	__le16	max_vnics;
-	__le16	min_stat_ctx;
-	__le16	max_stat_ctx;
-	__le16	min_hw_ring_grps;
-	__le16	max_hw_ring_grps;
-	__le16	max_tx_scheduler_inputs;
-	__le16	flags;
-	#define FUNC_RESOURCE_QCAPS_RESP_FLAGS_MIN_GUARANTEED     0x1UL
-	__le16	min_msix;
-	__le32	min_ktls_tx_key_ctxs;
-	__le32	max_ktls_tx_key_ctxs;
-	__le32	min_ktls_rx_key_ctxs;
-	__le32	max_ktls_rx_key_ctxs;
-	__le32	min_quic_tx_key_ctxs;
-	__le32	max_quic_tx_key_ctxs;
-	__le32	min_quic_rx_key_ctxs;
-	__le32	max_quic_rx_key_ctxs;
-	u8	unused_0[3];
-	u8	valid;
-};
-
-/* hwrm_func_vf_resource_cfg_input (size:704b/88B) */
-struct hwrm_func_vf_resource_cfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	vf_id;
-	__le16	max_msix;
-	__le16	min_rsscos_ctx;
-	__le16	max_rsscos_ctx;
-	__le16	min_cmpl_rings;
-	__le16	max_cmpl_rings;
-	__le16	min_tx_rings;
-	__le16	max_tx_rings;
-	__le16	min_rx_rings;
-	__le16	max_rx_rings;
-	__le16	min_l2_ctxs;
-	__le16	max_l2_ctxs;
-	__le16	min_vnics;
-	__le16	max_vnics;
-	__le16	min_stat_ctx;
-	__le16	max_stat_ctx;
-	__le16	min_hw_ring_grps;
-	__le16	max_hw_ring_grps;
-	__le16	flags;
-	#define FUNC_VF_RESOURCE_CFG_REQ_FLAGS_MIN_GUARANTEED     0x1UL
-	__le16	min_msix;
-	__le32	min_ktls_tx_key_ctxs;
-	__le32	max_ktls_tx_key_ctxs;
-	__le32	min_ktls_rx_key_ctxs;
-	__le32	max_ktls_rx_key_ctxs;
-	__le32	min_quic_tx_key_ctxs;
-	__le32	max_quic_tx_key_ctxs;
-	__le32	min_quic_rx_key_ctxs;
-	__le32	max_quic_rx_key_ctxs;
-};
-
-/* hwrm_func_vf_resource_cfg_output (size:384b/48B) */
-struct hwrm_func_vf_resource_cfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le16	reserved_rsscos_ctx;
-	__le16	reserved_cmpl_rings;
-	__le16	reserved_tx_rings;
-	__le16	reserved_rx_rings;
-	__le16	reserved_l2_ctxs;
-	__le16	reserved_vnics;
-	__le16	reserved_stat_ctx;
-	__le16	reserved_hw_ring_grps;
-	__le32	reserved_ktls_tx_key_ctxs;
-	__le32	reserved_ktls_rx_key_ctxs;
-	__le32	reserved_quic_tx_key_ctxs;
-	__le32	reserved_quic_rx_key_ctxs;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_func_backing_store_qcaps_input (size:128b/16B) */
-struct hwrm_func_backing_store_qcaps_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-};
-
-/* hwrm_func_backing_store_qcaps_output (size:832b/104B) */
-struct hwrm_func_backing_store_qcaps_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le32	qp_max_entries;
-	__le16	qp_min_qp1_entries;
-	__le16	qp_max_l2_entries;
-	__le16	qp_entry_size;
-	__le16	srq_max_l2_entries;
-	__le32	srq_max_entries;
-	__le16	srq_entry_size;
-	__le16	cq_max_l2_entries;
-	__le32	cq_max_entries;
-	__le16	cq_entry_size;
-	__le16	vnic_max_vnic_entries;
-	__le16	vnic_max_ring_table_entries;
-	__le16	vnic_entry_size;
-	__le32	stat_max_entries;
-	__le16	stat_entry_size;
-	__le16	tqm_entry_size;
-	__le32	tqm_min_entries_per_ring;
-	__le32	tqm_max_entries_per_ring;
-	__le32	mrav_max_entries;
-	__le16	mrav_entry_size;
-	__le16	tim_entry_size;
-	__le32	tim_max_entries;
-	__le16	mrav_num_entries_units;
-	u8	tqm_entries_multiple;
-	u8	ctx_kind_initializer;
-	__le16	ctx_init_mask;
-	#define FUNC_BACKING_STORE_QCAPS_RESP_CTX_INIT_MASK_QP       0x1UL
-	#define FUNC_BACKING_STORE_QCAPS_RESP_CTX_INIT_MASK_SRQ      0x2UL
-	#define FUNC_BACKING_STORE_QCAPS_RESP_CTX_INIT_MASK_CQ       0x4UL
-	#define FUNC_BACKING_STORE_QCAPS_RESP_CTX_INIT_MASK_VNIC     0x8UL
-	#define FUNC_BACKING_STORE_QCAPS_RESP_CTX_INIT_MASK_STAT     0x10UL
-	#define FUNC_BACKING_STORE_QCAPS_RESP_CTX_INIT_MASK_MRAV     0x20UL
-	#define FUNC_BACKING_STORE_QCAPS_RESP_CTX_INIT_MASK_TKC      0x40UL
-	#define FUNC_BACKING_STORE_QCAPS_RESP_CTX_INIT_MASK_RKC      0x80UL
-	u8	qp_init_offset;
-	u8	srq_init_offset;
-	u8	cq_init_offset;
-	u8	vnic_init_offset;
-	u8	tqm_fp_rings_count;
-	u8	stat_init_offset;
-	u8	mrav_init_offset;
-	u8	tqm_fp_rings_count_ext;
-	u8	tkc_init_offset;
-	u8	rkc_init_offset;
-	__le16	tkc_entry_size;
-	__le16	rkc_entry_size;
-	__le32	tkc_max_entries;
-	__le32	rkc_max_entries;
-	__le16	fast_qpmd_qp_num_entries;
-	u8	rsvd1[5];
-	u8	valid;
-};
-
-/* tqm_fp_ring_cfg (size:128b/16B) */
-struct tqm_fp_ring_cfg {
-	u8	tqm_ring_pg_size_tqm_ring_lvl;
-	#define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_LVL_MASK      0xfUL
-	#define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_LVL_SFT       0
-	#define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_LVL_LVL_0       0x0UL
-	#define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_LVL_LVL_1       0x1UL
-	#define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_LVL_LVL_2       0x2UL
-	#define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_LVL_LAST       TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_LVL_LVL_2
-	#define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_PG_SIZE_MASK  0xf0UL
-	#define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_PG_SIZE_SFT   4
-	#define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_PG_SIZE_PG_4K   (0x0UL << 4)
-	#define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_PG_SIZE_PG_8K   (0x1UL << 4)
-	#define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_PG_SIZE_PG_64K  (0x2UL << 4)
-	#define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_PG_SIZE_PG_2M   (0x3UL << 4)
-	#define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_PG_SIZE_PG_8M   (0x4UL << 4)
-	#define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_PG_SIZE_PG_1G   (0x5UL << 4)
-	#define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_PG_SIZE_LAST   TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_PG_SIZE_PG_1G
-	u8	unused[3];
-	__le32	tqm_ring_num_entries;
-	__le64	tqm_ring_page_dir;
-};
-
-/* hwrm_func_backing_store_cfg_input (size:2688b/336B) */
-struct hwrm_func_backing_store_cfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	flags;
-	#define FUNC_BACKING_STORE_CFG_REQ_FLAGS_PREBOOT_MODE               0x1UL
-	#define FUNC_BACKING_STORE_CFG_REQ_FLAGS_MRAV_RESERVATION_SPLIT     0x2UL
-	__le32	enables;
-	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_QP               0x1UL
-	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_SRQ              0x2UL
-	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_CQ               0x4UL
-	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_VNIC             0x8UL
-	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_STAT             0x10UL
-	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_TQM_SP           0x20UL
-	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_TQM_RING0        0x40UL
-	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_TQM_RING1        0x80UL
-	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_TQM_RING2        0x100UL
-	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_TQM_RING3        0x200UL
-	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_TQM_RING4        0x400UL
-	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_TQM_RING5        0x800UL
-	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_TQM_RING6        0x1000UL
-	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_TQM_RING7        0x2000UL
-	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_MRAV             0x4000UL
-	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_TIM              0x8000UL
-	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_TQM_RING8        0x10000UL
-	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_TQM_RING9        0x20000UL
-	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_TQM_RING10       0x40000UL
-	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_TKC              0x80000UL
-	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_RKC              0x100000UL
-	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_QP_FAST_QPMD     0x200000UL
-	u8	qpc_pg_size_qpc_lvl;
-	#define FUNC_BACKING_STORE_CFG_REQ_QPC_LVL_MASK      0xfUL
-	#define FUNC_BACKING_STORE_CFG_REQ_QPC_LVL_SFT       0
-	#define FUNC_BACKING_STORE_CFG_REQ_QPC_LVL_LVL_0       0x0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_QPC_LVL_LVL_1       0x1UL
-	#define FUNC_BACKING_STORE_CFG_REQ_QPC_LVL_LVL_2       0x2UL
-	#define FUNC_BACKING_STORE_CFG_REQ_QPC_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_QPC_LVL_LVL_2
-	#define FUNC_BACKING_STORE_CFG_REQ_QPC_PG_SIZE_MASK  0xf0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_QPC_PG_SIZE_SFT   4
-	#define FUNC_BACKING_STORE_CFG_REQ_QPC_PG_SIZE_PG_4K   (0x0UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_QPC_PG_SIZE_PG_8K   (0x1UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_QPC_PG_SIZE_PG_64K  (0x2UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_QPC_PG_SIZE_PG_2M   (0x3UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_QPC_PG_SIZE_PG_8M   (0x4UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_QPC_PG_SIZE_PG_1G   (0x5UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_QPC_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_QPC_PG_SIZE_PG_1G
-	u8	srq_pg_size_srq_lvl;
-	#define FUNC_BACKING_STORE_CFG_REQ_SRQ_LVL_MASK      0xfUL
-	#define FUNC_BACKING_STORE_CFG_REQ_SRQ_LVL_SFT       0
-	#define FUNC_BACKING_STORE_CFG_REQ_SRQ_LVL_LVL_0       0x0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_SRQ_LVL_LVL_1       0x1UL
-	#define FUNC_BACKING_STORE_CFG_REQ_SRQ_LVL_LVL_2       0x2UL
-	#define FUNC_BACKING_STORE_CFG_REQ_SRQ_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_SRQ_LVL_LVL_2
-	#define FUNC_BACKING_STORE_CFG_REQ_SRQ_PG_SIZE_MASK  0xf0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_SRQ_PG_SIZE_SFT   4
-	#define FUNC_BACKING_STORE_CFG_REQ_SRQ_PG_SIZE_PG_4K   (0x0UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_SRQ_PG_SIZE_PG_8K   (0x1UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_SRQ_PG_SIZE_PG_64K  (0x2UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_SRQ_PG_SIZE_PG_2M   (0x3UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_SRQ_PG_SIZE_PG_8M   (0x4UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_SRQ_PG_SIZE_PG_1G   (0x5UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_SRQ_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_SRQ_PG_SIZE_PG_1G
-	u8	cq_pg_size_cq_lvl;
-	#define FUNC_BACKING_STORE_CFG_REQ_CQ_LVL_MASK      0xfUL
-	#define FUNC_BACKING_STORE_CFG_REQ_CQ_LVL_SFT       0
-	#define FUNC_BACKING_STORE_CFG_REQ_CQ_LVL_LVL_0       0x0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_CQ_LVL_LVL_1       0x1UL
-	#define FUNC_BACKING_STORE_CFG_REQ_CQ_LVL_LVL_2       0x2UL
-	#define FUNC_BACKING_STORE_CFG_REQ_CQ_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_CQ_LVL_LVL_2
-	#define FUNC_BACKING_STORE_CFG_REQ_CQ_PG_SIZE_MASK  0xf0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_CQ_PG_SIZE_SFT   4
-	#define FUNC_BACKING_STORE_CFG_REQ_CQ_PG_SIZE_PG_4K   (0x0UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_CQ_PG_SIZE_PG_8K   (0x1UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_CQ_PG_SIZE_PG_64K  (0x2UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_CQ_PG_SIZE_PG_2M   (0x3UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_CQ_PG_SIZE_PG_8M   (0x4UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_CQ_PG_SIZE_PG_1G   (0x5UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_CQ_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_CQ_PG_SIZE_PG_1G
-	u8	vnic_pg_size_vnic_lvl;
-	#define FUNC_BACKING_STORE_CFG_REQ_VNIC_LVL_MASK      0xfUL
-	#define FUNC_BACKING_STORE_CFG_REQ_VNIC_LVL_SFT       0
-	#define FUNC_BACKING_STORE_CFG_REQ_VNIC_LVL_LVL_0       0x0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_VNIC_LVL_LVL_1       0x1UL
-	#define FUNC_BACKING_STORE_CFG_REQ_VNIC_LVL_LVL_2       0x2UL
-	#define FUNC_BACKING_STORE_CFG_REQ_VNIC_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_VNIC_LVL_LVL_2
-	#define FUNC_BACKING_STORE_CFG_REQ_VNIC_PG_SIZE_MASK  0xf0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_VNIC_PG_SIZE_SFT   4
-	#define FUNC_BACKING_STORE_CFG_REQ_VNIC_PG_SIZE_PG_4K   (0x0UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_VNIC_PG_SIZE_PG_8K   (0x1UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_VNIC_PG_SIZE_PG_64K  (0x2UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_VNIC_PG_SIZE_PG_2M   (0x3UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_VNIC_PG_SIZE_PG_8M   (0x4UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_VNIC_PG_SIZE_PG_1G   (0x5UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_VNIC_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_VNIC_PG_SIZE_PG_1G
-	u8	stat_pg_size_stat_lvl;
-	#define FUNC_BACKING_STORE_CFG_REQ_STAT_LVL_MASK      0xfUL
-	#define FUNC_BACKING_STORE_CFG_REQ_STAT_LVL_SFT       0
-	#define FUNC_BACKING_STORE_CFG_REQ_STAT_LVL_LVL_0       0x0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_STAT_LVL_LVL_1       0x1UL
-	#define FUNC_BACKING_STORE_CFG_REQ_STAT_LVL_LVL_2       0x2UL
-	#define FUNC_BACKING_STORE_CFG_REQ_STAT_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_STAT_LVL_LVL_2
-	#define FUNC_BACKING_STORE_CFG_REQ_STAT_PG_SIZE_MASK  0xf0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_STAT_PG_SIZE_SFT   4
-	#define FUNC_BACKING_STORE_CFG_REQ_STAT_PG_SIZE_PG_4K   (0x0UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_STAT_PG_SIZE_PG_8K   (0x1UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_STAT_PG_SIZE_PG_64K  (0x2UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_STAT_PG_SIZE_PG_2M   (0x3UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_STAT_PG_SIZE_PG_8M   (0x4UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_STAT_PG_SIZE_PG_1G   (0x5UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_STAT_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_STAT_PG_SIZE_PG_1G
-	u8	tqm_sp_pg_size_tqm_sp_lvl;
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_SP_LVL_MASK      0xfUL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_SP_LVL_SFT       0
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_SP_LVL_LVL_0       0x0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_SP_LVL_LVL_1       0x1UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_SP_LVL_LVL_2       0x2UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_SP_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_TQM_SP_LVL_LVL_2
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_SP_PG_SIZE_MASK  0xf0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_SP_PG_SIZE_SFT   4
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_SP_PG_SIZE_PG_4K   (0x0UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_SP_PG_SIZE_PG_8K   (0x1UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_SP_PG_SIZE_PG_64K  (0x2UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_SP_PG_SIZE_PG_2M   (0x3UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_SP_PG_SIZE_PG_8M   (0x4UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_SP_PG_SIZE_PG_1G   (0x5UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_SP_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_TQM_SP_PG_SIZE_PG_1G
-	u8	tqm_ring0_pg_size_tqm_ring0_lvl;
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING0_LVL_MASK      0xfUL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING0_LVL_SFT       0
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING0_LVL_LVL_0       0x0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING0_LVL_LVL_1       0x1UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING0_LVL_LVL_2       0x2UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING0_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_TQM_RING0_LVL_LVL_2
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING0_PG_SIZE_MASK  0xf0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING0_PG_SIZE_SFT   4
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING0_PG_SIZE_PG_4K   (0x0UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING0_PG_SIZE_PG_8K   (0x1UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING0_PG_SIZE_PG_64K  (0x2UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING0_PG_SIZE_PG_2M   (0x3UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING0_PG_SIZE_PG_8M   (0x4UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING0_PG_SIZE_PG_1G   (0x5UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING0_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_TQM_RING0_PG_SIZE_PG_1G
-	u8	tqm_ring1_pg_size_tqm_ring1_lvl;
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING1_LVL_MASK      0xfUL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING1_LVL_SFT       0
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING1_LVL_LVL_0       0x0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING1_LVL_LVL_1       0x1UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING1_LVL_LVL_2       0x2UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING1_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_TQM_RING1_LVL_LVL_2
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING1_PG_SIZE_MASK  0xf0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING1_PG_SIZE_SFT   4
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING1_PG_SIZE_PG_4K   (0x0UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING1_PG_SIZE_PG_8K   (0x1UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING1_PG_SIZE_PG_64K  (0x2UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING1_PG_SIZE_PG_2M   (0x3UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING1_PG_SIZE_PG_8M   (0x4UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING1_PG_SIZE_PG_1G   (0x5UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING1_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_TQM_RING1_PG_SIZE_PG_1G
-	u8	tqm_ring2_pg_size_tqm_ring2_lvl;
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING2_LVL_MASK      0xfUL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING2_LVL_SFT       0
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING2_LVL_LVL_0       0x0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING2_LVL_LVL_1       0x1UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING2_LVL_LVL_2       0x2UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING2_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_TQM_RING2_LVL_LVL_2
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING2_PG_SIZE_MASK  0xf0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING2_PG_SIZE_SFT   4
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING2_PG_SIZE_PG_4K   (0x0UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING2_PG_SIZE_PG_8K   (0x1UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING2_PG_SIZE_PG_64K  (0x2UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING2_PG_SIZE_PG_2M   (0x3UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING2_PG_SIZE_PG_8M   (0x4UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING2_PG_SIZE_PG_1G   (0x5UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING2_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_TQM_RING2_PG_SIZE_PG_1G
-	u8	tqm_ring3_pg_size_tqm_ring3_lvl;
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING3_LVL_MASK      0xfUL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING3_LVL_SFT       0
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING3_LVL_LVL_0       0x0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING3_LVL_LVL_1       0x1UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING3_LVL_LVL_2       0x2UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING3_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_TQM_RING3_LVL_LVL_2
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING3_PG_SIZE_MASK  0xf0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING3_PG_SIZE_SFT   4
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING3_PG_SIZE_PG_4K   (0x0UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING3_PG_SIZE_PG_8K   (0x1UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING3_PG_SIZE_PG_64K  (0x2UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING3_PG_SIZE_PG_2M   (0x3UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING3_PG_SIZE_PG_8M   (0x4UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING3_PG_SIZE_PG_1G   (0x5UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING3_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_TQM_RING3_PG_SIZE_PG_1G
-	u8	tqm_ring4_pg_size_tqm_ring4_lvl;
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING4_LVL_MASK      0xfUL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING4_LVL_SFT       0
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING4_LVL_LVL_0       0x0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING4_LVL_LVL_1       0x1UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING4_LVL_LVL_2       0x2UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING4_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_TQM_RING4_LVL_LVL_2
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING4_PG_SIZE_MASK  0xf0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING4_PG_SIZE_SFT   4
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING4_PG_SIZE_PG_4K   (0x0UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING4_PG_SIZE_PG_8K   (0x1UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING4_PG_SIZE_PG_64K  (0x2UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING4_PG_SIZE_PG_2M   (0x3UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING4_PG_SIZE_PG_8M   (0x4UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING4_PG_SIZE_PG_1G   (0x5UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING4_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_TQM_RING4_PG_SIZE_PG_1G
-	u8	tqm_ring5_pg_size_tqm_ring5_lvl;
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING5_LVL_MASK      0xfUL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING5_LVL_SFT       0
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING5_LVL_LVL_0       0x0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING5_LVL_LVL_1       0x1UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING5_LVL_LVL_2       0x2UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING5_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_TQM_RING5_LVL_LVL_2
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING5_PG_SIZE_MASK  0xf0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING5_PG_SIZE_SFT   4
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING5_PG_SIZE_PG_4K   (0x0UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING5_PG_SIZE_PG_8K   (0x1UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING5_PG_SIZE_PG_64K  (0x2UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING5_PG_SIZE_PG_2M   (0x3UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING5_PG_SIZE_PG_8M   (0x4UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING5_PG_SIZE_PG_1G   (0x5UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING5_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_TQM_RING5_PG_SIZE_PG_1G
-	u8	tqm_ring6_pg_size_tqm_ring6_lvl;
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING6_LVL_MASK      0xfUL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING6_LVL_SFT       0
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING6_LVL_LVL_0       0x0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING6_LVL_LVL_1       0x1UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING6_LVL_LVL_2       0x2UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING6_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_TQM_RING6_LVL_LVL_2
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING6_PG_SIZE_MASK  0xf0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING6_PG_SIZE_SFT   4
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING6_PG_SIZE_PG_4K   (0x0UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING6_PG_SIZE_PG_8K   (0x1UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING6_PG_SIZE_PG_64K  (0x2UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING6_PG_SIZE_PG_2M   (0x3UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING6_PG_SIZE_PG_8M   (0x4UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING6_PG_SIZE_PG_1G   (0x5UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING6_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_TQM_RING6_PG_SIZE_PG_1G
-	u8	tqm_ring7_pg_size_tqm_ring7_lvl;
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING7_LVL_MASK      0xfUL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING7_LVL_SFT       0
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING7_LVL_LVL_0       0x0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING7_LVL_LVL_1       0x1UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING7_LVL_LVL_2       0x2UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING7_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_TQM_RING7_LVL_LVL_2
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING7_PG_SIZE_MASK  0xf0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING7_PG_SIZE_SFT   4
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING7_PG_SIZE_PG_4K   (0x0UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING7_PG_SIZE_PG_8K   (0x1UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING7_PG_SIZE_PG_64K  (0x2UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING7_PG_SIZE_PG_2M   (0x3UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING7_PG_SIZE_PG_8M   (0x4UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING7_PG_SIZE_PG_1G   (0x5UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING7_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_TQM_RING7_PG_SIZE_PG_1G
-	u8	mrav_pg_size_mrav_lvl;
-	#define FUNC_BACKING_STORE_CFG_REQ_MRAV_LVL_MASK      0xfUL
-	#define FUNC_BACKING_STORE_CFG_REQ_MRAV_LVL_SFT       0
-	#define FUNC_BACKING_STORE_CFG_REQ_MRAV_LVL_LVL_0       0x0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_MRAV_LVL_LVL_1       0x1UL
-	#define FUNC_BACKING_STORE_CFG_REQ_MRAV_LVL_LVL_2       0x2UL
-	#define FUNC_BACKING_STORE_CFG_REQ_MRAV_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_MRAV_LVL_LVL_2
-	#define FUNC_BACKING_STORE_CFG_REQ_MRAV_PG_SIZE_MASK  0xf0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_MRAV_PG_SIZE_SFT   4
-	#define FUNC_BACKING_STORE_CFG_REQ_MRAV_PG_SIZE_PG_4K   (0x0UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_MRAV_PG_SIZE_PG_8K   (0x1UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_MRAV_PG_SIZE_PG_64K  (0x2UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_MRAV_PG_SIZE_PG_2M   (0x3UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_MRAV_PG_SIZE_PG_8M   (0x4UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_MRAV_PG_SIZE_PG_1G   (0x5UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_MRAV_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_MRAV_PG_SIZE_PG_1G
-	u8	tim_pg_size_tim_lvl;
-	#define FUNC_BACKING_STORE_CFG_REQ_TIM_LVL_MASK      0xfUL
-	#define FUNC_BACKING_STORE_CFG_REQ_TIM_LVL_SFT       0
-	#define FUNC_BACKING_STORE_CFG_REQ_TIM_LVL_LVL_0       0x0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TIM_LVL_LVL_1       0x1UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TIM_LVL_LVL_2       0x2UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TIM_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_TIM_LVL_LVL_2
-	#define FUNC_BACKING_STORE_CFG_REQ_TIM_PG_SIZE_MASK  0xf0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TIM_PG_SIZE_SFT   4
-	#define FUNC_BACKING_STORE_CFG_REQ_TIM_PG_SIZE_PG_4K   (0x0UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TIM_PG_SIZE_PG_8K   (0x1UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TIM_PG_SIZE_PG_64K  (0x2UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TIM_PG_SIZE_PG_2M   (0x3UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TIM_PG_SIZE_PG_8M   (0x4UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TIM_PG_SIZE_PG_1G   (0x5UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TIM_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_TIM_PG_SIZE_PG_1G
-	__le64	qpc_page_dir;
-	__le64	srq_page_dir;
-	__le64	cq_page_dir;
-	__le64	vnic_page_dir;
-	__le64	stat_page_dir;
-	__le64	tqm_sp_page_dir;
-	__le64	tqm_ring0_page_dir;
-	__le64	tqm_ring1_page_dir;
-	__le64	tqm_ring2_page_dir;
-	__le64	tqm_ring3_page_dir;
-	__le64	tqm_ring4_page_dir;
-	__le64	tqm_ring5_page_dir;
-	__le64	tqm_ring6_page_dir;
-	__le64	tqm_ring7_page_dir;
-	__le64	mrav_page_dir;
-	__le64	tim_page_dir;
-	__le32	qp_num_entries;
-	__le32	srq_num_entries;
-	__le32	cq_num_entries;
-	__le32	stat_num_entries;
-	__le32	tqm_sp_num_entries;
-	__le32	tqm_ring0_num_entries;
-	__le32	tqm_ring1_num_entries;
-	__le32	tqm_ring2_num_entries;
-	__le32	tqm_ring3_num_entries;
-	__le32	tqm_ring4_num_entries;
-	__le32	tqm_ring5_num_entries;
-	__le32	tqm_ring6_num_entries;
-	__le32	tqm_ring7_num_entries;
-	__le32	mrav_num_entries;
-	__le32	tim_num_entries;
-	__le16	qp_num_qp1_entries;
-	__le16	qp_num_l2_entries;
-	__le16	qp_entry_size;
-	__le16	srq_num_l2_entries;
-	__le16	srq_entry_size;
-	__le16	cq_num_l2_entries;
-	__le16	cq_entry_size;
-	__le16	vnic_num_vnic_entries;
-	__le16	vnic_num_ring_table_entries;
-	__le16	vnic_entry_size;
-	__le16	stat_entry_size;
-	__le16	tqm_entry_size;
-	__le16	mrav_entry_size;
-	__le16	tim_entry_size;
-	u8	tqm_ring8_pg_size_tqm_ring_lvl;
-	#define FUNC_BACKING_STORE_CFG_REQ_RING8_TQM_RING_LVL_MASK      0xfUL
-	#define FUNC_BACKING_STORE_CFG_REQ_RING8_TQM_RING_LVL_SFT       0
-	#define FUNC_BACKING_STORE_CFG_REQ_RING8_TQM_RING_LVL_LVL_0       0x0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_RING8_TQM_RING_LVL_LVL_1       0x1UL
-	#define FUNC_BACKING_STORE_CFG_REQ_RING8_TQM_RING_LVL_LVL_2       0x2UL
-	#define FUNC_BACKING_STORE_CFG_REQ_RING8_TQM_RING_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_RING8_TQM_RING_LVL_LVL_2
-	#define FUNC_BACKING_STORE_CFG_REQ_RING8_TQM_RING_PG_SIZE_MASK  0xf0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_RING8_TQM_RING_PG_SIZE_SFT   4
-	#define FUNC_BACKING_STORE_CFG_REQ_RING8_TQM_RING_PG_SIZE_PG_4K   (0x0UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_RING8_TQM_RING_PG_SIZE_PG_8K   (0x1UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_RING8_TQM_RING_PG_SIZE_PG_64K  (0x2UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_RING8_TQM_RING_PG_SIZE_PG_2M   (0x3UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_RING8_TQM_RING_PG_SIZE_PG_8M   (0x4UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_RING8_TQM_RING_PG_SIZE_PG_1G   (0x5UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_RING8_TQM_RING_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_RING8_TQM_RING_PG_SIZE_PG_1G
-	u8	ring8_unused[3];
-	__le32	tqm_ring8_num_entries;
-	__le64	tqm_ring8_page_dir;
-	u8	tqm_ring9_pg_size_tqm_ring_lvl;
-	#define FUNC_BACKING_STORE_CFG_REQ_RING9_TQM_RING_LVL_MASK      0xfUL
-	#define FUNC_BACKING_STORE_CFG_REQ_RING9_TQM_RING_LVL_SFT       0
-	#define FUNC_BACKING_STORE_CFG_REQ_RING9_TQM_RING_LVL_LVL_0       0x0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_RING9_TQM_RING_LVL_LVL_1       0x1UL
-	#define FUNC_BACKING_STORE_CFG_REQ_RING9_TQM_RING_LVL_LVL_2       0x2UL
-	#define FUNC_BACKING_STORE_CFG_REQ_RING9_TQM_RING_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_RING9_TQM_RING_LVL_LVL_2
-	#define FUNC_BACKING_STORE_CFG_REQ_RING9_TQM_RING_PG_SIZE_MASK  0xf0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_RING9_TQM_RING_PG_SIZE_SFT   4
-	#define FUNC_BACKING_STORE_CFG_REQ_RING9_TQM_RING_PG_SIZE_PG_4K   (0x0UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_RING9_TQM_RING_PG_SIZE_PG_8K   (0x1UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_RING9_TQM_RING_PG_SIZE_PG_64K  (0x2UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_RING9_TQM_RING_PG_SIZE_PG_2M   (0x3UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_RING9_TQM_RING_PG_SIZE_PG_8M   (0x4UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_RING9_TQM_RING_PG_SIZE_PG_1G   (0x5UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_RING9_TQM_RING_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_RING9_TQM_RING_PG_SIZE_PG_1G
-	u8	ring9_unused[3];
-	__le32	tqm_ring9_num_entries;
-	__le64	tqm_ring9_page_dir;
-	u8	tqm_ring10_pg_size_tqm_ring_lvl;
-	#define FUNC_BACKING_STORE_CFG_REQ_RING10_TQM_RING_LVL_MASK      0xfUL
-	#define FUNC_BACKING_STORE_CFG_REQ_RING10_TQM_RING_LVL_SFT       0
-	#define FUNC_BACKING_STORE_CFG_REQ_RING10_TQM_RING_LVL_LVL_0       0x0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_RING10_TQM_RING_LVL_LVL_1       0x1UL
-	#define FUNC_BACKING_STORE_CFG_REQ_RING10_TQM_RING_LVL_LVL_2       0x2UL
-	#define FUNC_BACKING_STORE_CFG_REQ_RING10_TQM_RING_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_RING10_TQM_RING_LVL_LVL_2
-	#define FUNC_BACKING_STORE_CFG_REQ_RING10_TQM_RING_PG_SIZE_MASK  0xf0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_RING10_TQM_RING_PG_SIZE_SFT   4
-	#define FUNC_BACKING_STORE_CFG_REQ_RING10_TQM_RING_PG_SIZE_PG_4K   (0x0UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_RING10_TQM_RING_PG_SIZE_PG_8K   (0x1UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_RING10_TQM_RING_PG_SIZE_PG_64K  (0x2UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_RING10_TQM_RING_PG_SIZE_PG_2M   (0x3UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_RING10_TQM_RING_PG_SIZE_PG_8M   (0x4UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_RING10_TQM_RING_PG_SIZE_PG_1G   (0x5UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_RING10_TQM_RING_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_RING10_TQM_RING_PG_SIZE_PG_1G
-	u8	ring10_unused[3];
-	__le32	tqm_ring10_num_entries;
-	__le64	tqm_ring10_page_dir;
-	__le32	tkc_num_entries;
-	__le32	rkc_num_entries;
-	__le64	tkc_page_dir;
-	__le64	rkc_page_dir;
-	__le16	tkc_entry_size;
-	__le16	rkc_entry_size;
-	u8	tkc_pg_size_tkc_lvl;
-	#define FUNC_BACKING_STORE_CFG_REQ_TKC_LVL_MASK      0xfUL
-	#define FUNC_BACKING_STORE_CFG_REQ_TKC_LVL_SFT       0
-	#define FUNC_BACKING_STORE_CFG_REQ_TKC_LVL_LVL_0       0x0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TKC_LVL_LVL_1       0x1UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TKC_LVL_LVL_2       0x2UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TKC_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_TKC_LVL_LVL_2
-	#define FUNC_BACKING_STORE_CFG_REQ_TKC_PG_SIZE_MASK  0xf0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_TKC_PG_SIZE_SFT   4
-	#define FUNC_BACKING_STORE_CFG_REQ_TKC_PG_SIZE_PG_4K   (0x0UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TKC_PG_SIZE_PG_8K   (0x1UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TKC_PG_SIZE_PG_64K  (0x2UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TKC_PG_SIZE_PG_2M   (0x3UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TKC_PG_SIZE_PG_8M   (0x4UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TKC_PG_SIZE_PG_1G   (0x5UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_TKC_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_TKC_PG_SIZE_PG_1G
-	u8	rkc_pg_size_rkc_lvl;
-	#define FUNC_BACKING_STORE_CFG_REQ_RKC_LVL_MASK      0xfUL
-	#define FUNC_BACKING_STORE_CFG_REQ_RKC_LVL_SFT       0
-	#define FUNC_BACKING_STORE_CFG_REQ_RKC_LVL_LVL_0       0x0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_RKC_LVL_LVL_1       0x1UL
-	#define FUNC_BACKING_STORE_CFG_REQ_RKC_LVL_LVL_2       0x2UL
-	#define FUNC_BACKING_STORE_CFG_REQ_RKC_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_RKC_LVL_LVL_2
-	#define FUNC_BACKING_STORE_CFG_REQ_RKC_PG_SIZE_MASK  0xf0UL
-	#define FUNC_BACKING_STORE_CFG_REQ_RKC_PG_SIZE_SFT   4
-	#define FUNC_BACKING_STORE_CFG_REQ_RKC_PG_SIZE_PG_4K   (0x0UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_RKC_PG_SIZE_PG_8K   (0x1UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_RKC_PG_SIZE_PG_64K  (0x2UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_RKC_PG_SIZE_PG_2M   (0x3UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_RKC_PG_SIZE_PG_8M   (0x4UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_RKC_PG_SIZE_PG_1G   (0x5UL << 4)
-	#define FUNC_BACKING_STORE_CFG_REQ_RKC_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_RKC_PG_SIZE_PG_1G
-	__le16	qp_num_fast_qpmd_entries;
-};
-
-/* hwrm_func_backing_store_cfg_output (size:128b/16B) */
-struct hwrm_func_backing_store_cfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_error_recovery_qcfg_input (size:192b/24B) */
-struct hwrm_error_recovery_qcfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	u8	unused_0[8];
-};
-
-/* hwrm_error_recovery_qcfg_output (size:1664b/208B) */
-struct hwrm_error_recovery_qcfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le32	flags;
-	#define ERROR_RECOVERY_QCFG_RESP_FLAGS_HOST       0x1UL
-	#define ERROR_RECOVERY_QCFG_RESP_FLAGS_CO_CPU     0x2UL
-	__le32	driver_polling_freq;
-	__le32	master_func_wait_period;
-	__le32	normal_func_wait_period;
-	__le32	master_func_wait_period_after_reset;
-	__le32	max_bailout_time_after_reset;
-	__le32	fw_health_status_reg;
-	#define ERROR_RECOVERY_QCFG_RESP_FW_HEALTH_STATUS_REG_ADDR_SPACE_MASK    0x3UL
-	#define ERROR_RECOVERY_QCFG_RESP_FW_HEALTH_STATUS_REG_ADDR_SPACE_SFT     0
-	#define ERROR_RECOVERY_QCFG_RESP_FW_HEALTH_STATUS_REG_ADDR_SPACE_PCIE_CFG  0x0UL
-	#define ERROR_RECOVERY_QCFG_RESP_FW_HEALTH_STATUS_REG_ADDR_SPACE_GRC       0x1UL
-	#define ERROR_RECOVERY_QCFG_RESP_FW_HEALTH_STATUS_REG_ADDR_SPACE_BAR0      0x2UL
-	#define ERROR_RECOVERY_QCFG_RESP_FW_HEALTH_STATUS_REG_ADDR_SPACE_BAR1      0x3UL
-	#define ERROR_RECOVERY_QCFG_RESP_FW_HEALTH_STATUS_REG_ADDR_SPACE_LAST     ERROR_RECOVERY_QCFG_RESP_FW_HEALTH_STATUS_REG_ADDR_SPACE_BAR1
-	#define ERROR_RECOVERY_QCFG_RESP_FW_HEALTH_STATUS_REG_ADDR_MASK          0xfffffffcUL
-	#define ERROR_RECOVERY_QCFG_RESP_FW_HEALTH_STATUS_REG_ADDR_SFT           2
-	__le32	fw_heartbeat_reg;
-	#define ERROR_RECOVERY_QCFG_RESP_FW_HEARTBEAT_REG_ADDR_SPACE_MASK    0x3UL
-	#define ERROR_RECOVERY_QCFG_RESP_FW_HEARTBEAT_REG_ADDR_SPACE_SFT     0
-	#define ERROR_RECOVERY_QCFG_RESP_FW_HEARTBEAT_REG_ADDR_SPACE_PCIE_CFG  0x0UL
-	#define ERROR_RECOVERY_QCFG_RESP_FW_HEARTBEAT_REG_ADDR_SPACE_GRC       0x1UL
-	#define ERROR_RECOVERY_QCFG_RESP_FW_HEARTBEAT_REG_ADDR_SPACE_BAR0      0x2UL
-	#define ERROR_RECOVERY_QCFG_RESP_FW_HEARTBEAT_REG_ADDR_SPACE_BAR1      0x3UL
-	#define ERROR_RECOVERY_QCFG_RESP_FW_HEARTBEAT_REG_ADDR_SPACE_LAST     ERROR_RECOVERY_QCFG_RESP_FW_HEARTBEAT_REG_ADDR_SPACE_BAR1
-	#define ERROR_RECOVERY_QCFG_RESP_FW_HEARTBEAT_REG_ADDR_MASK          0xfffffffcUL
-	#define ERROR_RECOVERY_QCFG_RESP_FW_HEARTBEAT_REG_ADDR_SFT           2
-	__le32	fw_reset_cnt_reg;
-	#define ERROR_RECOVERY_QCFG_RESP_FW_RESET_CNT_REG_ADDR_SPACE_MASK    0x3UL
-	#define ERROR_RECOVERY_QCFG_RESP_FW_RESET_CNT_REG_ADDR_SPACE_SFT     0
-	#define ERROR_RECOVERY_QCFG_RESP_FW_RESET_CNT_REG_ADDR_SPACE_PCIE_CFG  0x0UL
-	#define ERROR_RECOVERY_QCFG_RESP_FW_RESET_CNT_REG_ADDR_SPACE_GRC       0x1UL
-	#define ERROR_RECOVERY_QCFG_RESP_FW_RESET_CNT_REG_ADDR_SPACE_BAR0      0x2UL
-	#define ERROR_RECOVERY_QCFG_RESP_FW_RESET_CNT_REG_ADDR_SPACE_BAR1      0x3UL
-	#define ERROR_RECOVERY_QCFG_RESP_FW_RESET_CNT_REG_ADDR_SPACE_LAST     ERROR_RECOVERY_QCFG_RESP_FW_RESET_CNT_REG_ADDR_SPACE_BAR1
-	#define ERROR_RECOVERY_QCFG_RESP_FW_RESET_CNT_REG_ADDR_MASK          0xfffffffcUL
-	#define ERROR_RECOVERY_QCFG_RESP_FW_RESET_CNT_REG_ADDR_SFT           2
-	__le32	reset_inprogress_reg;
-	#define ERROR_RECOVERY_QCFG_RESP_RESET_INPROGRESS_REG_ADDR_SPACE_MASK    0x3UL
-	#define ERROR_RECOVERY_QCFG_RESP_RESET_INPROGRESS_REG_ADDR_SPACE_SFT     0
-	#define ERROR_RECOVERY_QCFG_RESP_RESET_INPROGRESS_REG_ADDR_SPACE_PCIE_CFG  0x0UL
-	#define ERROR_RECOVERY_QCFG_RESP_RESET_INPROGRESS_REG_ADDR_SPACE_GRC       0x1UL
-	#define ERROR_RECOVERY_QCFG_RESP_RESET_INPROGRESS_REG_ADDR_SPACE_BAR0      0x2UL
-	#define ERROR_RECOVERY_QCFG_RESP_RESET_INPROGRESS_REG_ADDR_SPACE_BAR1      0x3UL
-	#define ERROR_RECOVERY_QCFG_RESP_RESET_INPROGRESS_REG_ADDR_SPACE_LAST     ERROR_RECOVERY_QCFG_RESP_RESET_INPROGRESS_REG_ADDR_SPACE_BAR1
-	#define ERROR_RECOVERY_QCFG_RESP_RESET_INPROGRESS_REG_ADDR_MASK          0xfffffffcUL
-	#define ERROR_RECOVERY_QCFG_RESP_RESET_INPROGRESS_REG_ADDR_SFT           2
-	__le32	reset_inprogress_reg_mask;
-	u8	unused_0[3];
-	u8	reg_array_cnt;
-	__le32	reset_reg[16];
-	#define ERROR_RECOVERY_QCFG_RESP_RESET_REG_ADDR_SPACE_MASK    0x3UL
-	#define ERROR_RECOVERY_QCFG_RESP_RESET_REG_ADDR_SPACE_SFT     0
-	#define ERROR_RECOVERY_QCFG_RESP_RESET_REG_ADDR_SPACE_PCIE_CFG  0x0UL
-	#define ERROR_RECOVERY_QCFG_RESP_RESET_REG_ADDR_SPACE_GRC       0x1UL
-	#define ERROR_RECOVERY_QCFG_RESP_RESET_REG_ADDR_SPACE_BAR0      0x2UL
-	#define ERROR_RECOVERY_QCFG_RESP_RESET_REG_ADDR_SPACE_BAR1      0x3UL
-	#define ERROR_RECOVERY_QCFG_RESP_RESET_REG_ADDR_SPACE_LAST     ERROR_RECOVERY_QCFG_RESP_RESET_REG_ADDR_SPACE_BAR1
-	#define ERROR_RECOVERY_QCFG_RESP_RESET_REG_ADDR_MASK          0xfffffffcUL
-	#define ERROR_RECOVERY_QCFG_RESP_RESET_REG_ADDR_SFT           2
-	__le32	reset_reg_val[16];
-	u8	delay_after_reset[16];
-	__le32	err_recovery_cnt_reg;
-	#define ERROR_RECOVERY_QCFG_RESP_ERR_RECOVERY_CNT_REG_ADDR_SPACE_MASK    0x3UL
-	#define ERROR_RECOVERY_QCFG_RESP_ERR_RECOVERY_CNT_REG_ADDR_SPACE_SFT     0
-	#define ERROR_RECOVERY_QCFG_RESP_ERR_RECOVERY_CNT_REG_ADDR_SPACE_PCIE_CFG  0x0UL
-	#define ERROR_RECOVERY_QCFG_RESP_ERR_RECOVERY_CNT_REG_ADDR_SPACE_GRC       0x1UL
-	#define ERROR_RECOVERY_QCFG_RESP_ERR_RECOVERY_CNT_REG_ADDR_SPACE_BAR0      0x2UL
-	#define ERROR_RECOVERY_QCFG_RESP_ERR_RECOVERY_CNT_REG_ADDR_SPACE_BAR1      0x3UL
-	#define ERROR_RECOVERY_QCFG_RESP_ERR_RECOVERY_CNT_REG_ADDR_SPACE_LAST     ERROR_RECOVERY_QCFG_RESP_ERR_RECOVERY_CNT_REG_ADDR_SPACE_BAR1
-	#define ERROR_RECOVERY_QCFG_RESP_ERR_RECOVERY_CNT_REG_ADDR_MASK          0xfffffffcUL
-	#define ERROR_RECOVERY_QCFG_RESP_ERR_RECOVERY_CNT_REG_ADDR_SFT           2
-	u8	unused_1[3];
-	u8	valid;
-};
-
-/* hwrm_func_echo_response_input (size:192b/24B) */
-struct hwrm_func_echo_response_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	event_data1;
-	__le32	event_data2;
-};
-
-/* hwrm_func_echo_response_output (size:128b/16B) */
-struct hwrm_func_echo_response_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_func_ptp_pin_qcfg_input (size:192b/24B) */
-struct hwrm_func_ptp_pin_qcfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	u8	unused_0[8];
-};
-
-/* hwrm_func_ptp_pin_qcfg_output (size:128b/16B) */
-struct hwrm_func_ptp_pin_qcfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	num_pins;
-	u8	state;
-	#define FUNC_PTP_PIN_QCFG_RESP_STATE_PIN0_ENABLED     0x1UL
-	#define FUNC_PTP_PIN_QCFG_RESP_STATE_PIN1_ENABLED     0x2UL
-	#define FUNC_PTP_PIN_QCFG_RESP_STATE_PIN2_ENABLED     0x4UL
-	#define FUNC_PTP_PIN_QCFG_RESP_STATE_PIN3_ENABLED     0x8UL
-	u8	pin0_usage;
-	#define FUNC_PTP_PIN_QCFG_RESP_PIN0_USAGE_NONE     0x0UL
-	#define FUNC_PTP_PIN_QCFG_RESP_PIN0_USAGE_PPS_IN   0x1UL
-	#define FUNC_PTP_PIN_QCFG_RESP_PIN0_USAGE_PPS_OUT  0x2UL
-	#define FUNC_PTP_PIN_QCFG_RESP_PIN0_USAGE_SYNC_IN  0x3UL
-	#define FUNC_PTP_PIN_QCFG_RESP_PIN0_USAGE_SYNC_OUT 0x4UL
-	#define FUNC_PTP_PIN_QCFG_RESP_PIN0_USAGE_LAST    FUNC_PTP_PIN_QCFG_RESP_PIN0_USAGE_SYNC_OUT
-	u8	pin1_usage;
-	#define FUNC_PTP_PIN_QCFG_RESP_PIN1_USAGE_NONE     0x0UL
-	#define FUNC_PTP_PIN_QCFG_RESP_PIN1_USAGE_PPS_IN   0x1UL
-	#define FUNC_PTP_PIN_QCFG_RESP_PIN1_USAGE_PPS_OUT  0x2UL
-	#define FUNC_PTP_PIN_QCFG_RESP_PIN1_USAGE_SYNC_IN  0x3UL
-	#define FUNC_PTP_PIN_QCFG_RESP_PIN1_USAGE_SYNC_OUT 0x4UL
-	#define FUNC_PTP_PIN_QCFG_RESP_PIN1_USAGE_LAST    FUNC_PTP_PIN_QCFG_RESP_PIN1_USAGE_SYNC_OUT
-	u8	pin2_usage;
-	#define FUNC_PTP_PIN_QCFG_RESP_PIN2_USAGE_NONE                      0x0UL
-	#define FUNC_PTP_PIN_QCFG_RESP_PIN2_USAGE_PPS_IN                    0x1UL
-	#define FUNC_PTP_PIN_QCFG_RESP_PIN2_USAGE_PPS_OUT                   0x2UL
-	#define FUNC_PTP_PIN_QCFG_RESP_PIN2_USAGE_SYNC_IN                   0x3UL
-	#define FUNC_PTP_PIN_QCFG_RESP_PIN2_USAGE_SYNC_OUT                  0x4UL
-	#define FUNC_PTP_PIN_QCFG_RESP_PIN2_USAGE_SYNCE_PRIMARY_CLOCK_OUT   0x5UL
-	#define FUNC_PTP_PIN_QCFG_RESP_PIN2_USAGE_SYNCE_SECONDARY_CLOCK_OUT 0x6UL
-	#define FUNC_PTP_PIN_QCFG_RESP_PIN2_USAGE_LAST                     FUNC_PTP_PIN_QCFG_RESP_PIN2_USAGE_SYNCE_SECONDARY_CLOCK_OUT
-	u8	pin3_usage;
-	#define FUNC_PTP_PIN_QCFG_RESP_PIN3_USAGE_NONE                      0x0UL
-	#define FUNC_PTP_PIN_QCFG_RESP_PIN3_USAGE_PPS_IN                    0x1UL
-	#define FUNC_PTP_PIN_QCFG_RESP_PIN3_USAGE_PPS_OUT                   0x2UL
-	#define FUNC_PTP_PIN_QCFG_RESP_PIN3_USAGE_SYNC_IN                   0x3UL
-	#define FUNC_PTP_PIN_QCFG_RESP_PIN3_USAGE_SYNC_OUT                  0x4UL
-	#define FUNC_PTP_PIN_QCFG_RESP_PIN3_USAGE_SYNCE_PRIMARY_CLOCK_OUT   0x5UL
-	#define FUNC_PTP_PIN_QCFG_RESP_PIN3_USAGE_SYNCE_SECONDARY_CLOCK_OUT 0x6UL
-	#define FUNC_PTP_PIN_QCFG_RESP_PIN3_USAGE_LAST                     FUNC_PTP_PIN_QCFG_RESP_PIN3_USAGE_SYNCE_SECONDARY_CLOCK_OUT
-	u8	unused_0;
-	u8	valid;
-};
-
-/* hwrm_func_ptp_pin_cfg_input (size:256b/32B) */
-struct hwrm_func_ptp_pin_cfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	enables;
-	#define FUNC_PTP_PIN_CFG_REQ_ENABLES_PIN0_STATE     0x1UL
-	#define FUNC_PTP_PIN_CFG_REQ_ENABLES_PIN0_USAGE     0x2UL
-	#define FUNC_PTP_PIN_CFG_REQ_ENABLES_PIN1_STATE     0x4UL
-	#define FUNC_PTP_PIN_CFG_REQ_ENABLES_PIN1_USAGE     0x8UL
-	#define FUNC_PTP_PIN_CFG_REQ_ENABLES_PIN2_STATE     0x10UL
-	#define FUNC_PTP_PIN_CFG_REQ_ENABLES_PIN2_USAGE     0x20UL
-	#define FUNC_PTP_PIN_CFG_REQ_ENABLES_PIN3_STATE     0x40UL
-	#define FUNC_PTP_PIN_CFG_REQ_ENABLES_PIN3_USAGE     0x80UL
-	u8	pin0_state;
-	#define FUNC_PTP_PIN_CFG_REQ_PIN0_STATE_DISABLED 0x0UL
-	#define FUNC_PTP_PIN_CFG_REQ_PIN0_STATE_ENABLED  0x1UL
-	#define FUNC_PTP_PIN_CFG_REQ_PIN0_STATE_LAST    FUNC_PTP_PIN_CFG_REQ_PIN0_STATE_ENABLED
-	u8	pin0_usage;
-	#define FUNC_PTP_PIN_CFG_REQ_PIN0_USAGE_NONE     0x0UL
-	#define FUNC_PTP_PIN_CFG_REQ_PIN0_USAGE_PPS_IN   0x1UL
-	#define FUNC_PTP_PIN_CFG_REQ_PIN0_USAGE_PPS_OUT  0x2UL
-	#define FUNC_PTP_PIN_CFG_REQ_PIN0_USAGE_SYNC_IN  0x3UL
-	#define FUNC_PTP_PIN_CFG_REQ_PIN0_USAGE_SYNC_OUT 0x4UL
-	#define FUNC_PTP_PIN_CFG_REQ_PIN0_USAGE_LAST    FUNC_PTP_PIN_CFG_REQ_PIN0_USAGE_SYNC_OUT
-	u8	pin1_state;
-	#define FUNC_PTP_PIN_CFG_REQ_PIN1_STATE_DISABLED 0x0UL
-	#define FUNC_PTP_PIN_CFG_REQ_PIN1_STATE_ENABLED  0x1UL
-	#define FUNC_PTP_PIN_CFG_REQ_PIN1_STATE_LAST    FUNC_PTP_PIN_CFG_REQ_PIN1_STATE_ENABLED
-	u8	pin1_usage;
-	#define FUNC_PTP_PIN_CFG_REQ_PIN1_USAGE_NONE     0x0UL
-	#define FUNC_PTP_PIN_CFG_REQ_PIN1_USAGE_PPS_IN   0x1UL
-	#define FUNC_PTP_PIN_CFG_REQ_PIN1_USAGE_PPS_OUT  0x2UL
-	#define FUNC_PTP_PIN_CFG_REQ_PIN1_USAGE_SYNC_IN  0x3UL
-	#define FUNC_PTP_PIN_CFG_REQ_PIN1_USAGE_SYNC_OUT 0x4UL
-	#define FUNC_PTP_PIN_CFG_REQ_PIN1_USAGE_LAST    FUNC_PTP_PIN_CFG_REQ_PIN1_USAGE_SYNC_OUT
-	u8	pin2_state;
-	#define FUNC_PTP_PIN_CFG_REQ_PIN2_STATE_DISABLED 0x0UL
-	#define FUNC_PTP_PIN_CFG_REQ_PIN2_STATE_ENABLED  0x1UL
-	#define FUNC_PTP_PIN_CFG_REQ_PIN2_STATE_LAST    FUNC_PTP_PIN_CFG_REQ_PIN2_STATE_ENABLED
-	u8	pin2_usage;
-	#define FUNC_PTP_PIN_CFG_REQ_PIN2_USAGE_NONE                      0x0UL
-	#define FUNC_PTP_PIN_CFG_REQ_PIN2_USAGE_PPS_IN                    0x1UL
-	#define FUNC_PTP_PIN_CFG_REQ_PIN2_USAGE_PPS_OUT                   0x2UL
-	#define FUNC_PTP_PIN_CFG_REQ_PIN2_USAGE_SYNC_IN                   0x3UL
-	#define FUNC_PTP_PIN_CFG_REQ_PIN2_USAGE_SYNC_OUT                  0x4UL
-	#define FUNC_PTP_PIN_CFG_REQ_PIN2_USAGE_SYNCE_PRIMARY_CLOCK_OUT   0x5UL
-	#define FUNC_PTP_PIN_CFG_REQ_PIN2_USAGE_SYNCE_SECONDARY_CLOCK_OUT 0x6UL
-	#define FUNC_PTP_PIN_CFG_REQ_PIN2_USAGE_LAST                     FUNC_PTP_PIN_CFG_REQ_PIN2_USAGE_SYNCE_SECONDARY_CLOCK_OUT
-	u8	pin3_state;
-	#define FUNC_PTP_PIN_CFG_REQ_PIN3_STATE_DISABLED 0x0UL
-	#define FUNC_PTP_PIN_CFG_REQ_PIN3_STATE_ENABLED  0x1UL
-	#define FUNC_PTP_PIN_CFG_REQ_PIN3_STATE_LAST    FUNC_PTP_PIN_CFG_REQ_PIN3_STATE_ENABLED
-	u8	pin3_usage;
-	#define FUNC_PTP_PIN_CFG_REQ_PIN3_USAGE_NONE                      0x0UL
-	#define FUNC_PTP_PIN_CFG_REQ_PIN3_USAGE_PPS_IN                    0x1UL
-	#define FUNC_PTP_PIN_CFG_REQ_PIN3_USAGE_PPS_OUT                   0x2UL
-	#define FUNC_PTP_PIN_CFG_REQ_PIN3_USAGE_SYNC_IN                   0x3UL
-	#define FUNC_PTP_PIN_CFG_REQ_PIN3_USAGE_SYNC_OUT                  0x4UL
-	#define FUNC_PTP_PIN_CFG_REQ_PIN3_USAGE_SYNCE_PRIMARY_CLOCK_OUT   0x5UL
-	#define FUNC_PTP_PIN_CFG_REQ_PIN3_USAGE_SYNCE_SECONDARY_CLOCK_OUT 0x6UL
-	#define FUNC_PTP_PIN_CFG_REQ_PIN3_USAGE_LAST                     FUNC_PTP_PIN_CFG_REQ_PIN3_USAGE_SYNCE_SECONDARY_CLOCK_OUT
-	u8	unused_0[4];
-};
-
-/* hwrm_func_ptp_pin_cfg_output (size:128b/16B) */
-struct hwrm_func_ptp_pin_cfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_func_ptp_cfg_input (size:384b/48B) */
-struct hwrm_func_ptp_cfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	enables;
-	#define FUNC_PTP_CFG_REQ_ENABLES_PTP_PPS_EVENT               0x1UL
-	#define FUNC_PTP_CFG_REQ_ENABLES_PTP_FREQ_ADJ_DLL_SOURCE     0x2UL
-	#define FUNC_PTP_CFG_REQ_ENABLES_PTP_FREQ_ADJ_DLL_PHASE      0x4UL
-	#define FUNC_PTP_CFG_REQ_ENABLES_PTP_FREQ_ADJ_EXT_PERIOD     0x8UL
-	#define FUNC_PTP_CFG_REQ_ENABLES_PTP_FREQ_ADJ_EXT_UP         0x10UL
-	#define FUNC_PTP_CFG_REQ_ENABLES_PTP_FREQ_ADJ_EXT_PHASE      0x20UL
-	#define FUNC_PTP_CFG_REQ_ENABLES_PTP_SET_TIME                0x40UL
-	u8	ptp_pps_event;
-	#define FUNC_PTP_CFG_REQ_PTP_PPS_EVENT_INTERNAL     0x1UL
-	#define FUNC_PTP_CFG_REQ_PTP_PPS_EVENT_EXTERNAL     0x2UL
-	u8	ptp_freq_adj_dll_source;
-	#define FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_SOURCE_NONE    0x0UL
-	#define FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_SOURCE_TSIO_0  0x1UL
-	#define FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_SOURCE_TSIO_1  0x2UL
-	#define FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_SOURCE_TSIO_2  0x3UL
-	#define FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_SOURCE_TSIO_3  0x4UL
-	#define FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_SOURCE_PORT_0  0x5UL
-	#define FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_SOURCE_PORT_1  0x6UL
-	#define FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_SOURCE_PORT_2  0x7UL
-	#define FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_SOURCE_PORT_3  0x8UL
-	#define FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_SOURCE_INVALID 0xffUL
-	#define FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_SOURCE_LAST   FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_SOURCE_INVALID
-	u8	ptp_freq_adj_dll_phase;
-	#define FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_PHASE_NONE 0x0UL
-	#define FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_PHASE_4K   0x1UL
-	#define FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_PHASE_8K   0x2UL
-	#define FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_PHASE_10M  0x3UL
-	#define FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_PHASE_25M  0x4UL
-	#define FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_PHASE_LAST FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_PHASE_25M
-	u8	unused_0[3];
-	__le32	ptp_freq_adj_ext_period;
-	__le32	ptp_freq_adj_ext_up;
-	__le32	ptp_freq_adj_ext_phase_lower;
-	__le32	ptp_freq_adj_ext_phase_upper;
-	__le64	ptp_set_time;
-};
-
-/* hwrm_func_ptp_cfg_output (size:128b/16B) */
-struct hwrm_func_ptp_cfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_func_ptp_ts_query_input (size:192b/24B) */
-struct hwrm_func_ptp_ts_query_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	flags;
-	#define FUNC_PTP_TS_QUERY_REQ_FLAGS_PPS_TIME     0x1UL
-	#define FUNC_PTP_TS_QUERY_REQ_FLAGS_PTM_TIME     0x2UL
-	u8	unused_0[4];
-};
-
-/* hwrm_func_ptp_ts_query_output (size:320b/40B) */
-struct hwrm_func_ptp_ts_query_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le64	pps_event_ts;
-	__le64	ptm_local_ts;
-	__le64	ptm_system_ts;
-	__le32	ptm_link_delay;
-	u8	unused_0[3];
-	u8	valid;
-};
-
-/* hwrm_func_ptp_ext_cfg_input (size:256b/32B) */
-struct hwrm_func_ptp_ext_cfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	enables;
-	#define FUNC_PTP_EXT_CFG_REQ_ENABLES_PHC_MASTER_FID     0x1UL
-	#define FUNC_PTP_EXT_CFG_REQ_ENABLES_PHC_SEC_FID        0x2UL
-	#define FUNC_PTP_EXT_CFG_REQ_ENABLES_PHC_SEC_MODE       0x4UL
-	#define FUNC_PTP_EXT_CFG_REQ_ENABLES_FAILOVER_TIMER     0x8UL
-	__le16	phc_master_fid;
-	__le16	phc_sec_fid;
-	u8	phc_sec_mode;
-	#define FUNC_PTP_EXT_CFG_REQ_PHC_SEC_MODE_SWITCH  0x0UL
-	#define FUNC_PTP_EXT_CFG_REQ_PHC_SEC_MODE_ALL     0x1UL
-	#define FUNC_PTP_EXT_CFG_REQ_PHC_SEC_MODE_PF_ONLY 0x2UL
-	#define FUNC_PTP_EXT_CFG_REQ_PHC_SEC_MODE_LAST   FUNC_PTP_EXT_CFG_REQ_PHC_SEC_MODE_PF_ONLY
-	u8	unused_0;
-	__le32	failover_timer;
-	u8	unused_1[4];
-};
-
-/* hwrm_func_ptp_ext_cfg_output (size:128b/16B) */
-struct hwrm_func_ptp_ext_cfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_func_ptp_ext_qcfg_input (size:192b/24B) */
-struct hwrm_func_ptp_ext_qcfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	u8	unused_0[8];
-};
-
-/* hwrm_func_ptp_ext_qcfg_output (size:256b/32B) */
-struct hwrm_func_ptp_ext_qcfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le16	phc_master_fid;
-	__le16	phc_sec_fid;
-	__le16	phc_active_fid0;
-	__le16	phc_active_fid1;
-	__le32	last_failover_event;
-	__le16	from_fid;
-	__le16	to_fid;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_func_backing_store_cfg_v2_input (size:512b/64B) */
-struct hwrm_func_backing_store_cfg_v2_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	type;
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_QP                  0x0UL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_SRQ                 0x1UL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_CQ                  0x2UL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_VNIC                0x3UL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_STAT                0x4UL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_SP_TQM_RING         0x5UL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_FP_TQM_RING         0x6UL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_MRAV                0xeUL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_TIM                 0xfUL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_TX_CK               0x13UL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_RX_CK               0x14UL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_MP_TQM_RING         0x15UL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_SQ_DB_SHADOW        0x16UL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_RQ_DB_SHADOW        0x17UL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_SRQ_DB_SHADOW       0x18UL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_CQ_DB_SHADOW        0x19UL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_TBL_SCOPE           0x1cUL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_XID_PARTITION       0x1dUL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_SRT_TRACE           0x1eUL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_SRT2_TRACE          0x1fUL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_CRT_TRACE           0x20UL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_CRT2_TRACE          0x21UL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_RIGP0_TRACE         0x22UL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_L2_HWRM_TRACE       0x23UL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_ROCE_HWRM_TRACE     0x24UL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_TTX_PACING_TQM_RING 0x25UL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_CA0_TRACE           0x26UL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_CA1_TRACE           0x27UL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_CA2_TRACE           0x28UL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_RIGP1_TRACE         0x29UL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_AFM_KONG_HWRM_TRACE 0x2aUL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_INVALID             0xffffUL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_LAST               FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_INVALID
-	__le16	instance;
-	__le32	flags;
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_FLAGS_PREBOOT_MODE        0x1UL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_FLAGS_BS_CFG_ALL_DONE     0x2UL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_FLAGS_BS_EXTEND           0x4UL
-	__le64	page_dir;
-	__le32	num_entries;
-	__le16	entry_size;
-	u8	page_size_pbl_level;
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_PBL_LEVEL_MASK  0xfUL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_PBL_LEVEL_SFT   0
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_PBL_LEVEL_LVL_0   0x0UL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_PBL_LEVEL_LVL_1   0x1UL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_PBL_LEVEL_LVL_2   0x2UL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_PBL_LEVEL_LAST   FUNC_BACKING_STORE_CFG_V2_REQ_PBL_LEVEL_LVL_2
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_PAGE_SIZE_MASK  0xf0UL
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_PAGE_SIZE_SFT   4
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_PAGE_SIZE_PG_4K   (0x0UL << 4)
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_PAGE_SIZE_PG_8K   (0x1UL << 4)
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_PAGE_SIZE_PG_64K  (0x2UL << 4)
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_PAGE_SIZE_PG_2M   (0x3UL << 4)
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_PAGE_SIZE_PG_8M   (0x4UL << 4)
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_PAGE_SIZE_PG_1G   (0x5UL << 4)
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_PAGE_SIZE_LAST   FUNC_BACKING_STORE_CFG_V2_REQ_PAGE_SIZE_PG_1G
-	u8	subtype_valid_cnt;
-	__le32	split_entry_0;
-	__le32	split_entry_1;
-	__le32	split_entry_2;
-	__le32	split_entry_3;
-	__le32	enables;
-	#define FUNC_BACKING_STORE_CFG_V2_REQ_ENABLES_NEXT_BS_OFFSET     0x1UL
-	__le32	next_bs_offset;
-};
-
-/* hwrm_func_backing_store_cfg_v2_output (size:128b/16B) */
-struct hwrm_func_backing_store_cfg_v2_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	rsvd0[7];
-	u8	valid;
-};
-
-/* hwrm_func_backing_store_qcfg_v2_input (size:192b/24B) */
-struct hwrm_func_backing_store_qcfg_v2_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	type;
-	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_QP                  0x0UL
-	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_SRQ                 0x1UL
-	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_CQ                  0x2UL
-	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_VNIC                0x3UL
-	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_STAT                0x4UL
-	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_SP_TQM_RING         0x5UL
-	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_FP_TQM_RING         0x6UL
-	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_MRAV                0xeUL
-	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_TIM                 0xfUL
-	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_TX_CK               0x13UL
-	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_RX_CK               0x14UL
-	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_MP_TQM_RING         0x15UL
-	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_SQ_DB_SHADOW        0x16UL
-	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_RQ_DB_SHADOW        0x17UL
-	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_SRQ_DB_SHADOW       0x18UL
-	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_CQ_DB_SHADOW        0x19UL
-	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_TBL_SCOPE           0x1cUL
-	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_XID_PARTITION_TABLE 0x1dUL
-	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_SRT_TRACE           0x1eUL
-	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_SRT2_TRACE          0x1fUL
-	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_CRT_TRACE           0x20UL
-	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_CRT2_TRACE          0x21UL
-	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_RIGP0_TRACE         0x22UL
-	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_L2_HWRM_TRACE       0x23UL
-	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_ROCE_HWRM_TRACE     0x24UL
-	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_TTX_PACING_TQM_RING 0x25UL
-	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_CA0_TRACE           0x26UL
-	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_CA1_TRACE           0x27UL
-	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_CA2_TRACE           0x28UL
-	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_RIGP1_TRACE         0x29UL
-	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_AFM_KONG_HWRM_TRACE 0x2aUL
-	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_INVALID             0xffffUL
-	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_LAST               FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_INVALID
-	__le16	instance;
-	u8	rsvd[4];
-};
-
-/* hwrm_func_backing_store_qcfg_v2_output (size:448b/56B) */
-struct hwrm_func_backing_store_qcfg_v2_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le16	type;
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_QP                  0x0UL
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_SRQ                 0x1UL
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_CQ                  0x2UL
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_VNIC                0x3UL
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_STAT                0x4UL
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_SP_TQM_RING         0x5UL
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_FP_TQM_RING         0x6UL
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_MRAV                0xeUL
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_TIM                 0xfUL
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_TX_CK               0x13UL
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_RX_CK               0x14UL
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_MP_TQM_RING         0x15UL
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_TBL_SCOPE           0x1cUL
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_XID_PARTITION       0x1dUL
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_SRT_TRACE           0x1eUL
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_SRT2_TRACE          0x1fUL
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_CRT_TRACE           0x20UL
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_CRT2_TRACE          0x21UL
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_RIGP0_TRACE         0x22UL
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_L2_HWRM_TRACE       0x23UL
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_ROCE_HWRM_TRACE     0x24UL
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_TTX_PACING_TQM_RING 0x25UL
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_CA0_TRACE           0x26UL
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_CA1_TRACE           0x27UL
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_CA2_TRACE           0x28UL
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_RIGP1_TRACE         0x29UL
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_INVALID             0xffffUL
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_LAST               FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_INVALID
-	__le16	instance;
-	__le32	flags;
-	__le64	page_dir;
-	__le32	num_entries;
-	u8	page_size_pbl_level;
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_PBL_LEVEL_MASK  0xfUL
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_PBL_LEVEL_SFT   0
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_PBL_LEVEL_LVL_0   0x0UL
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_PBL_LEVEL_LVL_1   0x1UL
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_PBL_LEVEL_LVL_2   0x2UL
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_PBL_LEVEL_LAST   FUNC_BACKING_STORE_QCFG_V2_RESP_PBL_LEVEL_LVL_2
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_PAGE_SIZE_MASK  0xf0UL
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_PAGE_SIZE_SFT   4
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_PAGE_SIZE_PG_4K   (0x0UL << 4)
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_PAGE_SIZE_PG_8K   (0x1UL << 4)
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_PAGE_SIZE_PG_64K  (0x2UL << 4)
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_PAGE_SIZE_PG_2M   (0x3UL << 4)
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_PAGE_SIZE_PG_8M   (0x4UL << 4)
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_PAGE_SIZE_PG_1G   (0x5UL << 4)
-	#define FUNC_BACKING_STORE_QCFG_V2_RESP_PAGE_SIZE_LAST   FUNC_BACKING_STORE_QCFG_V2_RESP_PAGE_SIZE_PG_1G
-	u8	subtype_valid_cnt;
-	u8	rsvd[2];
-	__le32	split_entry_0;
-	__le32	split_entry_1;
-	__le32	split_entry_2;
-	__le32	split_entry_3;
-	u8	rsvd2[7];
-	u8	valid;
-};
-
-/* qpc_split_entries (size:128b/16B) */
-struct qpc_split_entries {
-	__le32	qp_num_l2_entries;
-	__le32	qp_num_qp1_entries;
-	__le32	qp_num_fast_qpmd_entries;
-	__le32	rsvd;
-};
-
-/* srq_split_entries (size:128b/16B) */
-struct srq_split_entries {
-	__le32	srq_num_l2_entries;
-	__le32	rsvd;
-	__le32	rsvd2[2];
-};
-
-/* cq_split_entries (size:128b/16B) */
-struct cq_split_entries {
-	__le32	cq_num_l2_entries;
-	__le32	rsvd;
-	__le32	rsvd2[2];
-};
-
-/* vnic_split_entries (size:128b/16B) */
-struct vnic_split_entries {
-	__le32	vnic_num_vnic_entries;
-	__le32	rsvd;
-	__le32	rsvd2[2];
-};
-
-/* mrav_split_entries (size:128b/16B) */
-struct mrav_split_entries {
-	__le32	mrav_num_av_entries;
-	__le32	rsvd;
-	__le32	rsvd2[2];
-};
-
-/* ts_split_entries (size:128b/16B) */
-struct ts_split_entries {
-	__le32	region_num_entries;
-	u8	tsid;
-	u8	lkup_static_bkt_cnt_exp[2];
-	u8	locked;
-	__le32	rsvd2[2];
-};
-
-/* ck_split_entries (size:128b/16B) */
-struct ck_split_entries {
-	__le32	num_quic_entries;
-	__le32	rsvd;
-	__le32	rsvd2[2];
-};
-
-/* hwrm_func_backing_store_qcaps_v2_input (size:192b/24B) */
-struct hwrm_func_backing_store_qcaps_v2_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	type;
-	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_QP                  0x0UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_SRQ                 0x1UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_CQ                  0x2UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_VNIC                0x3UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_STAT                0x4UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_SP_TQM_RING         0x5UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_FP_TQM_RING         0x6UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_MRAV                0xeUL
-	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_TIM                 0xfUL
-	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_TX_CK               0x13UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_RX_CK               0x14UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_MP_TQM_RING         0x15UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_SQ_DB_SHADOW        0x16UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_RQ_DB_SHADOW        0x17UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_SRQ_DB_SHADOW       0x18UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_CQ_DB_SHADOW        0x19UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_TBL_SCOPE           0x1cUL
-	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_XID_PARTITION       0x1dUL
-	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_SRT_TRACE           0x1eUL
-	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_SRT2_TRACE          0x1fUL
-	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_CRT_TRACE           0x20UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_CRT2_TRACE          0x21UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_RIGP0_TRACE         0x22UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_L2_HWRM_TRACE       0x23UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_ROCE_HWRM_TRACE     0x24UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_TTX_PACING_TQM_RING 0x25UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_CA0_TRACE           0x26UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_CA1_TRACE           0x27UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_CA2_TRACE           0x28UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_RIGP1_TRACE         0x29UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_AFM_KONG_HWRM_TRACE 0x2aUL
-	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_INVALID             0xffffUL
-	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_LAST               FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_INVALID
-	u8	rsvd[6];
-};
-
-/* hwrm_func_backing_store_qcaps_v2_output (size:448b/56B) */
-struct hwrm_func_backing_store_qcaps_v2_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le16	type;
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_QP                  0x0UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_SRQ                 0x1UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_CQ                  0x2UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_VNIC                0x3UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_STAT                0x4UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_SP_TQM_RING         0x5UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_FP_TQM_RING         0x6UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_MRAV                0xeUL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_TIM                 0xfUL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_TX_CK               0x13UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_RX_CK               0x14UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_MP_TQM_RING         0x15UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_SQ_DB_SHADOW        0x16UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_RQ_DB_SHADOW        0x17UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_SRQ_DB_SHADOW       0x18UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_CQ_DB_SHADOW        0x19UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_TBL_SCOPE           0x1cUL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_XID_PARTITION       0x1dUL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_SRT_TRACE           0x1eUL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_SRT2_TRACE          0x1fUL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_CRT_TRACE           0x20UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_CRT2_TRACE          0x21UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_RIGP0_TRACE         0x22UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_L2_HWRM_TRACE       0x23UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_ROCE_HWRM_TRACE     0x24UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_TTX_PACING_TQM_RING 0x25UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_CA0_TRACE           0x26UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_CA1_TRACE           0x27UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_CA2_TRACE           0x28UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_RIGP1_TRACE         0x29UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_AFM_KONG_HWRM_TRACE 0x2aUL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_INVALID             0xffffUL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_LAST               FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_INVALID
-	__le16	entry_size;
-	__le32	flags;
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_FLAGS_ENABLE_CTX_KIND_INIT            0x1UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_FLAGS_TYPE_VALID                      0x2UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_FLAGS_DRIVER_MANAGED_MEMORY           0x4UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_FLAGS_ROCE_QP_PSEUDO_STATIC_ALLOC     0x8UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_FLAGS_FW_DBG_TRACE                    0x10UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_FLAGS_FW_BIN_DBG_TRACE                0x20UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_FLAGS_NEXT_BS_OFFSET                  0x40UL
-	__le32	instance_bit_map;
-	u8	ctx_init_value;
-	u8	ctx_init_offset;
-	u8	entry_multiple;
-	u8	rsvd;
-	__le32	max_num_entries;
-	__le32	min_num_entries;
-	__le16	next_valid_type;
-	u8	subtype_valid_cnt;
-	u8	exact_cnt_bit_map;
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_EXACT_CNT_BIT_MAP_SPLIT_ENTRY_0_EXACT     0x1UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_EXACT_CNT_BIT_MAP_SPLIT_ENTRY_1_EXACT     0x2UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_EXACT_CNT_BIT_MAP_SPLIT_ENTRY_2_EXACT     0x4UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_EXACT_CNT_BIT_MAP_SPLIT_ENTRY_3_EXACT     0x8UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_EXACT_CNT_BIT_MAP_UNUSED_MASK             0xf0UL
-	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_EXACT_CNT_BIT_MAP_UNUSED_SFT              4
-	__le32	split_entry_0;
-	__le32	split_entry_1;
-	__le32	split_entry_2;
-	__le32	split_entry_3;
-	__le16	max_instance_count;
-	u8	rsvd3;
-	u8	valid;
-};
-
-/* hwrm_func_dbr_pacing_qcfg_input (size:128b/16B) */
-struct hwrm_func_dbr_pacing_qcfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-};
-
-/* hwrm_func_dbr_pacing_qcfg_output (size:512b/64B) */
-struct hwrm_func_dbr_pacing_qcfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	flags;
-	#define FUNC_DBR_PACING_QCFG_RESP_FLAGS_DBR_NQ_EVENT_ENABLED     0x1UL
-	u8	unused_0[7];
-	__le32	dbr_stat_db_fifo_reg;
-	#define FUNC_DBR_PACING_QCFG_RESP_DBR_STAT_DB_FIFO_REG_ADDR_SPACE_MASK    0x3UL
-	#define FUNC_DBR_PACING_QCFG_RESP_DBR_STAT_DB_FIFO_REG_ADDR_SPACE_SFT     0
-	#define FUNC_DBR_PACING_QCFG_RESP_DBR_STAT_DB_FIFO_REG_ADDR_SPACE_PCIE_CFG  0x0UL
-	#define FUNC_DBR_PACING_QCFG_RESP_DBR_STAT_DB_FIFO_REG_ADDR_SPACE_GRC       0x1UL
-	#define FUNC_DBR_PACING_QCFG_RESP_DBR_STAT_DB_FIFO_REG_ADDR_SPACE_BAR0      0x2UL
-	#define FUNC_DBR_PACING_QCFG_RESP_DBR_STAT_DB_FIFO_REG_ADDR_SPACE_BAR1      0x3UL
-	#define FUNC_DBR_PACING_QCFG_RESP_DBR_STAT_DB_FIFO_REG_ADDR_SPACE_LAST     FUNC_DBR_PACING_QCFG_RESP_DBR_STAT_DB_FIFO_REG_ADDR_SPACE_BAR1
-	#define FUNC_DBR_PACING_QCFG_RESP_DBR_STAT_DB_FIFO_REG_ADDR_MASK          0xfffffffcUL
-	#define FUNC_DBR_PACING_QCFG_RESP_DBR_STAT_DB_FIFO_REG_ADDR_SFT           2
-	__le32	dbr_stat_db_fifo_reg_watermark_mask;
-	u8	dbr_stat_db_fifo_reg_watermark_shift;
-	u8	unused_1[3];
-	__le32	dbr_stat_db_fifo_reg_fifo_room_mask;
-	u8	dbr_stat_db_fifo_reg_fifo_room_shift;
-	u8	unused_2[3];
-	__le32	dbr_throttling_aeq_arm_reg;
-	#define FUNC_DBR_PACING_QCFG_RESP_DBR_THROTTLING_AEQ_ARM_REG_ADDR_SPACE_MASK    0x3UL
-	#define FUNC_DBR_PACING_QCFG_RESP_DBR_THROTTLING_AEQ_ARM_REG_ADDR_SPACE_SFT     0
-	#define FUNC_DBR_PACING_QCFG_RESP_DBR_THROTTLING_AEQ_ARM_REG_ADDR_SPACE_PCIE_CFG  0x0UL
-	#define FUNC_DBR_PACING_QCFG_RESP_DBR_THROTTLING_AEQ_ARM_REG_ADDR_SPACE_GRC       0x1UL
-	#define FUNC_DBR_PACING_QCFG_RESP_DBR_THROTTLING_AEQ_ARM_REG_ADDR_SPACE_BAR0      0x2UL
-	#define FUNC_DBR_PACING_QCFG_RESP_DBR_THROTTLING_AEQ_ARM_REG_ADDR_SPACE_BAR1      0x3UL
-	#define FUNC_DBR_PACING_QCFG_RESP_DBR_THROTTLING_AEQ_ARM_REG_ADDR_SPACE_LAST     FUNC_DBR_PACING_QCFG_RESP_DBR_THROTTLING_AEQ_ARM_REG_ADDR_SPACE_BAR1
-	#define FUNC_DBR_PACING_QCFG_RESP_DBR_THROTTLING_AEQ_ARM_REG_ADDR_MASK          0xfffffffcUL
-	#define FUNC_DBR_PACING_QCFG_RESP_DBR_THROTTLING_AEQ_ARM_REG_ADDR_SFT           2
-	u8	dbr_throttling_aeq_arm_reg_val;
-	u8	unused_3[3];
-	__le32	dbr_stat_db_max_fifo_depth;
-	__le32	primary_nq_id;
-	__le32	pacing_threshold;
-	u8	unused_4[7];
-	u8	valid;
-};
-
-/* hwrm_func_drv_if_change_input (size:192b/24B) */
-struct hwrm_func_drv_if_change_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	flags;
-	#define FUNC_DRV_IF_CHANGE_REQ_FLAGS_UP     0x1UL
-	__le32	unused;
-};
-
-/* hwrm_func_drv_if_change_output (size:128b/16B) */
-struct hwrm_func_drv_if_change_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le32	flags;
-	#define FUNC_DRV_IF_CHANGE_RESP_FLAGS_RESC_CHANGE           0x1UL
-	#define FUNC_DRV_IF_CHANGE_RESP_FLAGS_HOT_FW_RESET_DONE     0x2UL
-	#define FUNC_DRV_IF_CHANGE_RESP_FLAGS_CAPS_CHANGE           0x4UL
-	u8	unused_0[3];
-	u8	valid;
-};
-
-/* hwrm_port_phy_cfg_input (size:512b/64B) */
-struct hwrm_port_phy_cfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	flags;
-	#define PORT_PHY_CFG_REQ_FLAGS_RESET_PHY                  0x1UL
-	#define PORT_PHY_CFG_REQ_FLAGS_DEPRECATED                 0x2UL
-	#define PORT_PHY_CFG_REQ_FLAGS_FORCE                      0x4UL
-	#define PORT_PHY_CFG_REQ_FLAGS_RESTART_AUTONEG            0x8UL
-	#define PORT_PHY_CFG_REQ_FLAGS_EEE_ENABLE                 0x10UL
-	#define PORT_PHY_CFG_REQ_FLAGS_EEE_DISABLE                0x20UL
-	#define PORT_PHY_CFG_REQ_FLAGS_EEE_TX_LPI_ENABLE          0x40UL
-	#define PORT_PHY_CFG_REQ_FLAGS_EEE_TX_LPI_DISABLE         0x80UL
-	#define PORT_PHY_CFG_REQ_FLAGS_FEC_AUTONEG_ENABLE         0x100UL
-	#define PORT_PHY_CFG_REQ_FLAGS_FEC_AUTONEG_DISABLE        0x200UL
-	#define PORT_PHY_CFG_REQ_FLAGS_FEC_CLAUSE74_ENABLE        0x400UL
-	#define PORT_PHY_CFG_REQ_FLAGS_FEC_CLAUSE74_DISABLE       0x800UL
-	#define PORT_PHY_CFG_REQ_FLAGS_FEC_CLAUSE91_ENABLE        0x1000UL
-	#define PORT_PHY_CFG_REQ_FLAGS_FEC_CLAUSE91_DISABLE       0x2000UL
-	#define PORT_PHY_CFG_REQ_FLAGS_FORCE_LINK_DWN             0x4000UL
-	#define PORT_PHY_CFG_REQ_FLAGS_FEC_RS544_1XN_ENABLE       0x8000UL
-	#define PORT_PHY_CFG_REQ_FLAGS_FEC_RS544_1XN_DISABLE      0x10000UL
-	#define PORT_PHY_CFG_REQ_FLAGS_FEC_RS544_IEEE_ENABLE      0x20000UL
-	#define PORT_PHY_CFG_REQ_FLAGS_FEC_RS544_IEEE_DISABLE     0x40000UL
-	#define PORT_PHY_CFG_REQ_FLAGS_FEC_RS272_1XN_ENABLE       0x80000UL
-	#define PORT_PHY_CFG_REQ_FLAGS_FEC_RS272_1XN_DISABLE      0x100000UL
-	#define PORT_PHY_CFG_REQ_FLAGS_FEC_RS272_IEEE_ENABLE      0x200000UL
-	#define PORT_PHY_CFG_REQ_FLAGS_FEC_RS272_IEEE_DISABLE     0x400000UL
-	__le32	enables;
-	#define PORT_PHY_CFG_REQ_ENABLES_AUTO_MODE                     0x1UL
-	#define PORT_PHY_CFG_REQ_ENABLES_AUTO_DUPLEX                   0x2UL
-	#define PORT_PHY_CFG_REQ_ENABLES_AUTO_PAUSE                    0x4UL
-	#define PORT_PHY_CFG_REQ_ENABLES_AUTO_LINK_SPEED               0x8UL
-	#define PORT_PHY_CFG_REQ_ENABLES_AUTO_LINK_SPEED_MASK          0x10UL
-	#define PORT_PHY_CFG_REQ_ENABLES_WIRESPEED                     0x20UL
-	#define PORT_PHY_CFG_REQ_ENABLES_LPBK                          0x40UL
-	#define PORT_PHY_CFG_REQ_ENABLES_PREEMPHASIS                   0x80UL
-	#define PORT_PHY_CFG_REQ_ENABLES_FORCE_PAUSE                   0x100UL
-	#define PORT_PHY_CFG_REQ_ENABLES_EEE_LINK_SPEED_MASK           0x200UL
-	#define PORT_PHY_CFG_REQ_ENABLES_TX_LPI_TIMER                  0x400UL
-	#define PORT_PHY_CFG_REQ_ENABLES_FORCE_PAM4_LINK_SPEED         0x800UL
-	#define PORT_PHY_CFG_REQ_ENABLES_AUTO_PAM4_LINK_SPEED_MASK     0x1000UL
-	#define PORT_PHY_CFG_REQ_ENABLES_FORCE_LINK_SPEEDS2            0x2000UL
-	#define PORT_PHY_CFG_REQ_ENABLES_AUTO_LINK_SPEEDS2_MASK        0x4000UL
-	__le16	port_id;
-	__le16	force_link_speed;
-	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_100MB 0x1UL
-	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_1GB   0xaUL
-	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_2GB   0x14UL
-	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_2_5GB 0x19UL
-	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_10GB  0x64UL
-	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_20GB  0xc8UL
-	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_25GB  0xfaUL
-	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_40GB  0x190UL
-	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_50GB  0x1f4UL
-	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_100GB 0x3e8UL
-	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_10MB  0xffffUL
-	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_LAST PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_10MB
-	u8	auto_mode;
-	#define PORT_PHY_CFG_REQ_AUTO_MODE_NONE         0x0UL
-	#define PORT_PHY_CFG_REQ_AUTO_MODE_ALL_SPEEDS   0x1UL
-	#define PORT_PHY_CFG_REQ_AUTO_MODE_ONE_SPEED    0x2UL
-	#define PORT_PHY_CFG_REQ_AUTO_MODE_ONE_OR_BELOW 0x3UL
-	#define PORT_PHY_CFG_REQ_AUTO_MODE_SPEED_MASK   0x4UL
-	#define PORT_PHY_CFG_REQ_AUTO_MODE_LAST        PORT_PHY_CFG_REQ_AUTO_MODE_SPEED_MASK
-	u8	auto_duplex;
-	#define PORT_PHY_CFG_REQ_AUTO_DUPLEX_HALF 0x0UL
-	#define PORT_PHY_CFG_REQ_AUTO_DUPLEX_FULL 0x1UL
-	#define PORT_PHY_CFG_REQ_AUTO_DUPLEX_BOTH 0x2UL
-	#define PORT_PHY_CFG_REQ_AUTO_DUPLEX_LAST PORT_PHY_CFG_REQ_AUTO_DUPLEX_BOTH
-	u8	auto_pause;
-	#define PORT_PHY_CFG_REQ_AUTO_PAUSE_TX                0x1UL
-	#define PORT_PHY_CFG_REQ_AUTO_PAUSE_RX                0x2UL
-	#define PORT_PHY_CFG_REQ_AUTO_PAUSE_AUTONEG_PAUSE     0x4UL
-	u8	mgmt_flag;
-	#define PORT_PHY_CFG_REQ_MGMT_FLAG_LINK_RELEASE     0x1UL
-	#define PORT_PHY_CFG_REQ_MGMT_FLAG_MGMT_VALID       0x80UL
-	__le16	auto_link_speed;
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_100MB 0x1UL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_1GB   0xaUL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_2GB   0x14UL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_2_5GB 0x19UL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_10GB  0x64UL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_20GB  0xc8UL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_25GB  0xfaUL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_40GB  0x190UL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_50GB  0x1f4UL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_100GB 0x3e8UL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_10MB  0xffffUL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_LAST PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_10MB
-	__le16	auto_link_speed_mask;
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_100MBHD     0x1UL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_100MB       0x2UL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_1GBHD       0x4UL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_1GB         0x8UL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_2GB         0x10UL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_2_5GB       0x20UL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_10GB        0x40UL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_20GB        0x80UL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_25GB        0x100UL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_40GB        0x200UL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_50GB        0x400UL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_100GB       0x800UL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_10MBHD      0x1000UL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_10MB        0x2000UL
-	u8	wirespeed;
-	#define PORT_PHY_CFG_REQ_WIRESPEED_OFF 0x0UL
-	#define PORT_PHY_CFG_REQ_WIRESPEED_ON  0x1UL
-	#define PORT_PHY_CFG_REQ_WIRESPEED_LAST PORT_PHY_CFG_REQ_WIRESPEED_ON
-	u8	lpbk;
-	#define PORT_PHY_CFG_REQ_LPBK_NONE     0x0UL
-	#define PORT_PHY_CFG_REQ_LPBK_LOCAL    0x1UL
-	#define PORT_PHY_CFG_REQ_LPBK_REMOTE   0x2UL
-	#define PORT_PHY_CFG_REQ_LPBK_EXTERNAL 0x3UL
-	#define PORT_PHY_CFG_REQ_LPBK_LAST    PORT_PHY_CFG_REQ_LPBK_EXTERNAL
-	u8	force_pause;
-	#define PORT_PHY_CFG_REQ_FORCE_PAUSE_TX     0x1UL
-	#define PORT_PHY_CFG_REQ_FORCE_PAUSE_RX     0x2UL
-	u8	unused_1;
-	__le32	preemphasis;
-	__le16	eee_link_speed_mask;
-	#define PORT_PHY_CFG_REQ_EEE_LINK_SPEED_MASK_RSVD1     0x1UL
-	#define PORT_PHY_CFG_REQ_EEE_LINK_SPEED_MASK_100MB     0x2UL
-	#define PORT_PHY_CFG_REQ_EEE_LINK_SPEED_MASK_RSVD2     0x4UL
-	#define PORT_PHY_CFG_REQ_EEE_LINK_SPEED_MASK_1GB       0x8UL
-	#define PORT_PHY_CFG_REQ_EEE_LINK_SPEED_MASK_RSVD3     0x10UL
-	#define PORT_PHY_CFG_REQ_EEE_LINK_SPEED_MASK_RSVD4     0x20UL
-	#define PORT_PHY_CFG_REQ_EEE_LINK_SPEED_MASK_10GB      0x40UL
-	__le16	force_pam4_link_speed;
-	#define PORT_PHY_CFG_REQ_FORCE_PAM4_LINK_SPEED_50GB  0x1f4UL
-	#define PORT_PHY_CFG_REQ_FORCE_PAM4_LINK_SPEED_100GB 0x3e8UL
-	#define PORT_PHY_CFG_REQ_FORCE_PAM4_LINK_SPEED_200GB 0x7d0UL
-	#define PORT_PHY_CFG_REQ_FORCE_PAM4_LINK_SPEED_LAST PORT_PHY_CFG_REQ_FORCE_PAM4_LINK_SPEED_200GB
-	__le32	tx_lpi_timer;
-	#define PORT_PHY_CFG_REQ_TX_LPI_TIMER_MASK 0xffffffUL
-	#define PORT_PHY_CFG_REQ_TX_LPI_TIMER_SFT 0
-	__le16	auto_link_pam4_speed_mask;
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_PAM4_SPEED_MASK_50G      0x1UL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_PAM4_SPEED_MASK_100G     0x2UL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_PAM4_SPEED_MASK_200G     0x4UL
-	__le16	force_link_speeds2;
-	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_1GB            0xaUL
-	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_10GB           0x64UL
-	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_25GB           0xfaUL
-	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_40GB           0x190UL
-	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_50GB           0x1f4UL
-	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_100GB          0x3e8UL
-	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_50GB_PAM4_56   0x1f5UL
-	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_100GB_PAM4_56  0x3e9UL
-	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_200GB_PAM4_56  0x7d1UL
-	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_400GB_PAM4_56  0xfa1UL
-	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_100GB_PAM4_112 0x3eaUL
-	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_200GB_PAM4_112 0x7d2UL
-	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_400GB_PAM4_112 0xfa2UL
-	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_800GB_PAM4_112 0x1f42UL
-	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_LAST          PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_800GB_PAM4_112
-	__le16	auto_link_speeds2_mask;
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_1GB                0x1UL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_10GB               0x2UL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_25GB               0x4UL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_40GB               0x8UL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_50GB               0x10UL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_100GB              0x20UL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_50GB_PAM4_56       0x40UL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_100GB_PAM4_56      0x80UL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_200GB_PAM4_56      0x100UL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_400GB_PAM4_56      0x200UL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_100GB_PAM4_112     0x400UL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_200GB_PAM4_112     0x800UL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_400GB_PAM4_112     0x1000UL
-	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_800GB_PAM4_112     0x2000UL
-	u8	unused_2[6];
-};
-
-/* hwrm_port_phy_cfg_output (size:128b/16B) */
-struct hwrm_port_phy_cfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_port_phy_cfg_cmd_err (size:64b/8B) */
-struct hwrm_port_phy_cfg_cmd_err {
-	u8	code;
-	#define PORT_PHY_CFG_CMD_ERR_CODE_UNKNOWN       0x0UL
-	#define PORT_PHY_CFG_CMD_ERR_CODE_ILLEGAL_SPEED 0x1UL
-	#define PORT_PHY_CFG_CMD_ERR_CODE_RETRY         0x2UL
-	#define PORT_PHY_CFG_CMD_ERR_CODE_LAST         PORT_PHY_CFG_CMD_ERR_CODE_RETRY
-	u8	unused_0[7];
-};
-
-/* hwrm_port_phy_qcfg_input (size:192b/24B) */
-struct hwrm_port_phy_qcfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	port_id;
-	u8	unused_0[6];
-};
-
-/* hwrm_port_phy_qcfg_output (size:832b/104B) */
-struct hwrm_port_phy_qcfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	link;
-	#define PORT_PHY_QCFG_RESP_LINK_NO_LINK 0x0UL
-	#define PORT_PHY_QCFG_RESP_LINK_SIGNAL  0x1UL
-	#define PORT_PHY_QCFG_RESP_LINK_LINK    0x2UL
-	#define PORT_PHY_QCFG_RESP_LINK_LAST   PORT_PHY_QCFG_RESP_LINK_LINK
-	u8	active_fec_signal_mode;
-	#define PORT_PHY_QCFG_RESP_SIGNAL_MODE_MASK                0xfUL
-	#define PORT_PHY_QCFG_RESP_SIGNAL_MODE_SFT                 0
-	#define PORT_PHY_QCFG_RESP_SIGNAL_MODE_NRZ                   0x0UL
-	#define PORT_PHY_QCFG_RESP_SIGNAL_MODE_PAM4                  0x1UL
-	#define PORT_PHY_QCFG_RESP_SIGNAL_MODE_PAM4_112              0x2UL
-	#define PORT_PHY_QCFG_RESP_SIGNAL_MODE_LAST                 PORT_PHY_QCFG_RESP_SIGNAL_MODE_PAM4_112
-	#define PORT_PHY_QCFG_RESP_ACTIVE_FEC_MASK                 0xf0UL
-	#define PORT_PHY_QCFG_RESP_ACTIVE_FEC_SFT                  4
-	#define PORT_PHY_QCFG_RESP_ACTIVE_FEC_FEC_NONE_ACTIVE        (0x0UL << 4)
-	#define PORT_PHY_QCFG_RESP_ACTIVE_FEC_FEC_CLAUSE74_ACTIVE    (0x1UL << 4)
-	#define PORT_PHY_QCFG_RESP_ACTIVE_FEC_FEC_CLAUSE91_ACTIVE    (0x2UL << 4)
-	#define PORT_PHY_QCFG_RESP_ACTIVE_FEC_FEC_RS544_1XN_ACTIVE   (0x3UL << 4)
-	#define PORT_PHY_QCFG_RESP_ACTIVE_FEC_FEC_RS544_IEEE_ACTIVE  (0x4UL << 4)
-	#define PORT_PHY_QCFG_RESP_ACTIVE_FEC_FEC_RS272_1XN_ACTIVE   (0x5UL << 4)
-	#define PORT_PHY_QCFG_RESP_ACTIVE_FEC_FEC_RS272_IEEE_ACTIVE  (0x6UL << 4)
-	#define PORT_PHY_QCFG_RESP_ACTIVE_FEC_LAST                  PORT_PHY_QCFG_RESP_ACTIVE_FEC_FEC_RS272_IEEE_ACTIVE
-	__le16	link_speed;
-	#define PORT_PHY_QCFG_RESP_LINK_SPEED_100MB 0x1UL
-	#define PORT_PHY_QCFG_RESP_LINK_SPEED_1GB   0xaUL
-	#define PORT_PHY_QCFG_RESP_LINK_SPEED_2GB   0x14UL
-	#define PORT_PHY_QCFG_RESP_LINK_SPEED_2_5GB 0x19UL
-	#define PORT_PHY_QCFG_RESP_LINK_SPEED_10GB  0x64UL
-	#define PORT_PHY_QCFG_RESP_LINK_SPEED_20GB  0xc8UL
-	#define PORT_PHY_QCFG_RESP_LINK_SPEED_25GB  0xfaUL
-	#define PORT_PHY_QCFG_RESP_LINK_SPEED_40GB  0x190UL
-	#define PORT_PHY_QCFG_RESP_LINK_SPEED_50GB  0x1f4UL
-	#define PORT_PHY_QCFG_RESP_LINK_SPEED_100GB 0x3e8UL
-	#define PORT_PHY_QCFG_RESP_LINK_SPEED_200GB 0x7d0UL
-	#define PORT_PHY_QCFG_RESP_LINK_SPEED_400GB 0xfa0UL
-	#define PORT_PHY_QCFG_RESP_LINK_SPEED_800GB 0x1f40UL
-	#define PORT_PHY_QCFG_RESP_LINK_SPEED_10MB  0xffffUL
-	#define PORT_PHY_QCFG_RESP_LINK_SPEED_LAST PORT_PHY_QCFG_RESP_LINK_SPEED_10MB
-	u8	duplex_cfg;
-	#define PORT_PHY_QCFG_RESP_DUPLEX_CFG_HALF 0x0UL
-	#define PORT_PHY_QCFG_RESP_DUPLEX_CFG_FULL 0x1UL
-	#define PORT_PHY_QCFG_RESP_DUPLEX_CFG_LAST PORT_PHY_QCFG_RESP_DUPLEX_CFG_FULL
-	u8	pause;
-	#define PORT_PHY_QCFG_RESP_PAUSE_TX     0x1UL
-	#define PORT_PHY_QCFG_RESP_PAUSE_RX     0x2UL
-	__le16	support_speeds;
-	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_100MBHD     0x1UL
-	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_100MB       0x2UL
-	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_1GBHD       0x4UL
-	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_1GB         0x8UL
-	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_2GB         0x10UL
-	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_2_5GB       0x20UL
-	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_10GB        0x40UL
-	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_20GB        0x80UL
-	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_25GB        0x100UL
-	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_40GB        0x200UL
-	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_50GB        0x400UL
-	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_100GB       0x800UL
-	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_10MBHD      0x1000UL
-	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_10MB        0x2000UL
-	__le16	force_link_speed;
-	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_100MB 0x1UL
-	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_1GB   0xaUL
-	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_2GB   0x14UL
-	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_2_5GB 0x19UL
-	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_10GB  0x64UL
-	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_20GB  0xc8UL
-	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_25GB  0xfaUL
-	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_40GB  0x190UL
-	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_50GB  0x1f4UL
-	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_100GB 0x3e8UL
-	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_10MB  0xffffUL
-	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_LAST PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_10MB
-	u8	auto_mode;
-	#define PORT_PHY_QCFG_RESP_AUTO_MODE_NONE         0x0UL
-	#define PORT_PHY_QCFG_RESP_AUTO_MODE_ALL_SPEEDS   0x1UL
-	#define PORT_PHY_QCFG_RESP_AUTO_MODE_ONE_SPEED    0x2UL
-	#define PORT_PHY_QCFG_RESP_AUTO_MODE_ONE_OR_BELOW 0x3UL
-	#define PORT_PHY_QCFG_RESP_AUTO_MODE_SPEED_MASK   0x4UL
-	#define PORT_PHY_QCFG_RESP_AUTO_MODE_LAST        PORT_PHY_QCFG_RESP_AUTO_MODE_SPEED_MASK
-	u8	auto_pause;
-	#define PORT_PHY_QCFG_RESP_AUTO_PAUSE_TX                0x1UL
-	#define PORT_PHY_QCFG_RESP_AUTO_PAUSE_RX                0x2UL
-	#define PORT_PHY_QCFG_RESP_AUTO_PAUSE_AUTONEG_PAUSE     0x4UL
-	__le16	auto_link_speed;
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_100MB 0x1UL
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_1GB   0xaUL
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_2GB   0x14UL
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_2_5GB 0x19UL
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_10GB  0x64UL
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_20GB  0xc8UL
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_25GB  0xfaUL
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_40GB  0x190UL
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_50GB  0x1f4UL
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_100GB 0x3e8UL
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_10MB  0xffffUL
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_LAST PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_10MB
-	__le16	auto_link_speed_mask;
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_100MBHD     0x1UL
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_100MB       0x2UL
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_1GBHD       0x4UL
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_1GB         0x8UL
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_2GB         0x10UL
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_2_5GB       0x20UL
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_10GB        0x40UL
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_20GB        0x80UL
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_25GB        0x100UL
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_40GB        0x200UL
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_50GB        0x400UL
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_100GB       0x800UL
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_10MBHD      0x1000UL
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_10MB        0x2000UL
-	u8	wirespeed;
-	#define PORT_PHY_QCFG_RESP_WIRESPEED_OFF 0x0UL
-	#define PORT_PHY_QCFG_RESP_WIRESPEED_ON  0x1UL
-	#define PORT_PHY_QCFG_RESP_WIRESPEED_LAST PORT_PHY_QCFG_RESP_WIRESPEED_ON
-	u8	lpbk;
-	#define PORT_PHY_QCFG_RESP_LPBK_NONE     0x0UL
-	#define PORT_PHY_QCFG_RESP_LPBK_LOCAL    0x1UL
-	#define PORT_PHY_QCFG_RESP_LPBK_REMOTE   0x2UL
-	#define PORT_PHY_QCFG_RESP_LPBK_EXTERNAL 0x3UL
-	#define PORT_PHY_QCFG_RESP_LPBK_LAST    PORT_PHY_QCFG_RESP_LPBK_EXTERNAL
-	u8	force_pause;
-	#define PORT_PHY_QCFG_RESP_FORCE_PAUSE_TX     0x1UL
-	#define PORT_PHY_QCFG_RESP_FORCE_PAUSE_RX     0x2UL
-	u8	module_status;
-	#define PORT_PHY_QCFG_RESP_MODULE_STATUS_NONE          0x0UL
-	#define PORT_PHY_QCFG_RESP_MODULE_STATUS_DISABLETX     0x1UL
-	#define PORT_PHY_QCFG_RESP_MODULE_STATUS_WARNINGMSG    0x2UL
-	#define PORT_PHY_QCFG_RESP_MODULE_STATUS_PWRDOWN       0x3UL
-	#define PORT_PHY_QCFG_RESP_MODULE_STATUS_NOTINSERTED   0x4UL
-	#define PORT_PHY_QCFG_RESP_MODULE_STATUS_CURRENTFAULT  0x5UL
-	#define PORT_PHY_QCFG_RESP_MODULE_STATUS_OVERHEATED    0x6UL
-	#define PORT_PHY_QCFG_RESP_MODULE_STATUS_NOTAPPLICABLE 0xffUL
-	#define PORT_PHY_QCFG_RESP_MODULE_STATUS_LAST         PORT_PHY_QCFG_RESP_MODULE_STATUS_NOTAPPLICABLE
-	__le32	preemphasis;
-	u8	phy_maj;
-	u8	phy_min;
-	u8	phy_bld;
-	u8	phy_type;
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_UNKNOWN          0x0UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_BASECR           0x1UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_BASEKR4          0x2UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_BASELR           0x3UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_BASESR           0x4UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_BASEKR2          0x5UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_BASEKX           0x6UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_BASEKR           0x7UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_BASET            0x8UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_BASETE           0x9UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_SGMIIEXTPHY      0xaUL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_25G_BASECR_CA_L  0xbUL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_25G_BASECR_CA_S  0xcUL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_25G_BASECR_CA_N  0xdUL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_25G_BASESR       0xeUL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_100G_BASECR4     0xfUL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_100G_BASESR4     0x10UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_100G_BASELR4     0x11UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_100G_BASEER4     0x12UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_100G_BASESR10    0x13UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_40G_BASECR4      0x14UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_40G_BASESR4      0x15UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_40G_BASELR4      0x16UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_40G_BASEER4      0x17UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_40G_ACTIVE_CABLE 0x18UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_1G_BASET         0x19UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_1G_BASESX        0x1aUL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_1G_BASECX        0x1bUL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_200G_BASECR4     0x1cUL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_200G_BASESR4     0x1dUL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_200G_BASELR4     0x1eUL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_200G_BASEER4     0x1fUL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_50G_BASECR       0x20UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_50G_BASESR       0x21UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_50G_BASELR       0x22UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_50G_BASEER       0x23UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_100G_BASECR2     0x24UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_100G_BASESR2     0x25UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_100G_BASELR2     0x26UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_100G_BASEER2     0x27UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_100G_BASECR      0x28UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_100G_BASESR      0x29UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_100G_BASELR      0x2aUL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_100G_BASEER      0x2bUL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_200G_BASECR2     0x2cUL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_200G_BASESR2     0x2dUL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_200G_BASELR2     0x2eUL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_200G_BASEER2     0x2fUL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_400G_BASECR8     0x30UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_400G_BASESR8     0x31UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_400G_BASELR8     0x32UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_400G_BASEER8     0x33UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_400G_BASECR4     0x34UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_400G_BASESR4     0x35UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_400G_BASELR4     0x36UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_400G_BASEER4     0x37UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_800G_BASECR8     0x38UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_800G_BASESR8     0x39UL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_800G_BASELR8     0x3aUL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_800G_BASEER8     0x3bUL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_800G_BASEFR8     0x3cUL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_800G_BASEDR8     0x3dUL
-	#define PORT_PHY_QCFG_RESP_PHY_TYPE_LAST            PORT_PHY_QCFG_RESP_PHY_TYPE_800G_BASEDR8
-	u8	media_type;
-	#define PORT_PHY_QCFG_RESP_MEDIA_TYPE_UNKNOWN   0x0UL
-	#define PORT_PHY_QCFG_RESP_MEDIA_TYPE_TP        0x1UL
-	#define PORT_PHY_QCFG_RESP_MEDIA_TYPE_DAC       0x2UL
-	#define PORT_PHY_QCFG_RESP_MEDIA_TYPE_FIBRE     0x3UL
-	#define PORT_PHY_QCFG_RESP_MEDIA_TYPE_BACKPLANE 0x4UL
-	#define PORT_PHY_QCFG_RESP_MEDIA_TYPE_LAST     PORT_PHY_QCFG_RESP_MEDIA_TYPE_BACKPLANE
-	u8	xcvr_pkg_type;
-	#define PORT_PHY_QCFG_RESP_XCVR_PKG_TYPE_XCVR_INTERNAL 0x1UL
-	#define PORT_PHY_QCFG_RESP_XCVR_PKG_TYPE_XCVR_EXTERNAL 0x2UL
-	#define PORT_PHY_QCFG_RESP_XCVR_PKG_TYPE_LAST         PORT_PHY_QCFG_RESP_XCVR_PKG_TYPE_XCVR_EXTERNAL
-	u8	eee_config_phy_addr;
-	#define PORT_PHY_QCFG_RESP_PHY_ADDR_MASK              0x1fUL
-	#define PORT_PHY_QCFG_RESP_PHY_ADDR_SFT               0
-	#define PORT_PHY_QCFG_RESP_EEE_CONFIG_MASK            0xe0UL
-	#define PORT_PHY_QCFG_RESP_EEE_CONFIG_SFT             5
-	#define PORT_PHY_QCFG_RESP_EEE_CONFIG_EEE_ENABLED      0x20UL
-	#define PORT_PHY_QCFG_RESP_EEE_CONFIG_EEE_ACTIVE       0x40UL
-	#define PORT_PHY_QCFG_RESP_EEE_CONFIG_EEE_TX_LPI       0x80UL
-	u8	parallel_detect;
-	#define PORT_PHY_QCFG_RESP_PARALLEL_DETECT     0x1UL
-	__le16	link_partner_adv_speeds;
-	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_100MBHD     0x1UL
-	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_100MB       0x2UL
-	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_1GBHD       0x4UL
-	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_1GB         0x8UL
-	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_2GB         0x10UL
-	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_2_5GB       0x20UL
-	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_10GB        0x40UL
-	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_20GB        0x80UL
-	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_25GB        0x100UL
-	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_40GB        0x200UL
-	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_50GB        0x400UL
-	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_100GB       0x800UL
-	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_10MBHD      0x1000UL
-	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_10MB        0x2000UL
-	u8	link_partner_adv_auto_mode;
-	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_AUTO_MODE_NONE         0x0UL
-	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_AUTO_MODE_ALL_SPEEDS   0x1UL
-	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_AUTO_MODE_ONE_SPEED    0x2UL
-	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_AUTO_MODE_ONE_OR_BELOW 0x3UL
-	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_AUTO_MODE_SPEED_MASK   0x4UL
-	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_AUTO_MODE_LAST        PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_AUTO_MODE_SPEED_MASK
-	u8	link_partner_adv_pause;
-	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_PAUSE_TX     0x1UL
-	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_PAUSE_RX     0x2UL
-	__le16	adv_eee_link_speed_mask;
-	#define PORT_PHY_QCFG_RESP_ADV_EEE_LINK_SPEED_MASK_RSVD1     0x1UL
-	#define PORT_PHY_QCFG_RESP_ADV_EEE_LINK_SPEED_MASK_100MB     0x2UL
-	#define PORT_PHY_QCFG_RESP_ADV_EEE_LINK_SPEED_MASK_RSVD2     0x4UL
-	#define PORT_PHY_QCFG_RESP_ADV_EEE_LINK_SPEED_MASK_1GB       0x8UL
-	#define PORT_PHY_QCFG_RESP_ADV_EEE_LINK_SPEED_MASK_RSVD3     0x10UL
-	#define PORT_PHY_QCFG_RESP_ADV_EEE_LINK_SPEED_MASK_RSVD4     0x20UL
-	#define PORT_PHY_QCFG_RESP_ADV_EEE_LINK_SPEED_MASK_10GB      0x40UL
-	__le16	link_partner_adv_eee_link_speed_mask;
-	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_EEE_LINK_SPEED_MASK_RSVD1     0x1UL
-	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_EEE_LINK_SPEED_MASK_100MB     0x2UL
-	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_EEE_LINK_SPEED_MASK_RSVD2     0x4UL
-	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_EEE_LINK_SPEED_MASK_1GB       0x8UL
-	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_EEE_LINK_SPEED_MASK_RSVD3     0x10UL
-	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_EEE_LINK_SPEED_MASK_RSVD4     0x20UL
-	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_EEE_LINK_SPEED_MASK_10GB      0x40UL
-	__le32	xcvr_identifier_type_tx_lpi_timer;
-	#define PORT_PHY_QCFG_RESP_TX_LPI_TIMER_MASK            0xffffffUL
-	#define PORT_PHY_QCFG_RESP_TX_LPI_TIMER_SFT             0
-	#define PORT_PHY_QCFG_RESP_XCVR_IDENTIFIER_TYPE_MASK    0xff000000UL
-	#define PORT_PHY_QCFG_RESP_XCVR_IDENTIFIER_TYPE_SFT     24
-	#define PORT_PHY_QCFG_RESP_XCVR_IDENTIFIER_TYPE_UNKNOWN   (0x0UL << 24)
-	#define PORT_PHY_QCFG_RESP_XCVR_IDENTIFIER_TYPE_SFP       (0x3UL << 24)
-	#define PORT_PHY_QCFG_RESP_XCVR_IDENTIFIER_TYPE_QSFP      (0xcUL << 24)
-	#define PORT_PHY_QCFG_RESP_XCVR_IDENTIFIER_TYPE_QSFPPLUS  (0xdUL << 24)
-	#define PORT_PHY_QCFG_RESP_XCVR_IDENTIFIER_TYPE_QSFP28    (0x11UL << 24)
-	#define PORT_PHY_QCFG_RESP_XCVR_IDENTIFIER_TYPE_QSFPDD    (0x18UL << 24)
-	#define PORT_PHY_QCFG_RESP_XCVR_IDENTIFIER_TYPE_QSFP112   (0x1eUL << 24)
-	#define PORT_PHY_QCFG_RESP_XCVR_IDENTIFIER_TYPE_SFPDD     (0x1fUL << 24)
-	#define PORT_PHY_QCFG_RESP_XCVR_IDENTIFIER_TYPE_CSFP      (0x20UL << 24)
-	#define PORT_PHY_QCFG_RESP_XCVR_IDENTIFIER_TYPE_LAST     PORT_PHY_QCFG_RESP_XCVR_IDENTIFIER_TYPE_CSFP
-	__le16	fec_cfg;
-	#define PORT_PHY_QCFG_RESP_FEC_CFG_FEC_NONE_SUPPORTED           0x1UL
-	#define PORT_PHY_QCFG_RESP_FEC_CFG_FEC_AUTONEG_SUPPORTED        0x2UL
-	#define PORT_PHY_QCFG_RESP_FEC_CFG_FEC_AUTONEG_ENABLED          0x4UL
-	#define PORT_PHY_QCFG_RESP_FEC_CFG_FEC_CLAUSE74_SUPPORTED       0x8UL
-	#define PORT_PHY_QCFG_RESP_FEC_CFG_FEC_CLAUSE74_ENABLED         0x10UL
-	#define PORT_PHY_QCFG_RESP_FEC_CFG_FEC_CLAUSE91_SUPPORTED       0x20UL
-	#define PORT_PHY_QCFG_RESP_FEC_CFG_FEC_CLAUSE91_ENABLED         0x40UL
-	#define PORT_PHY_QCFG_RESP_FEC_CFG_FEC_RS544_1XN_SUPPORTED      0x80UL
-	#define PORT_PHY_QCFG_RESP_FEC_CFG_FEC_RS544_1XN_ENABLED        0x100UL
-	#define PORT_PHY_QCFG_RESP_FEC_CFG_FEC_RS544_IEEE_SUPPORTED     0x200UL
-	#define PORT_PHY_QCFG_RESP_FEC_CFG_FEC_RS544_IEEE_ENABLED       0x400UL
-	#define PORT_PHY_QCFG_RESP_FEC_CFG_FEC_RS272_1XN_SUPPORTED      0x800UL
-	#define PORT_PHY_QCFG_RESP_FEC_CFG_FEC_RS272_1XN_ENABLED        0x1000UL
-	#define PORT_PHY_QCFG_RESP_FEC_CFG_FEC_RS272_IEEE_SUPPORTED     0x2000UL
-	#define PORT_PHY_QCFG_RESP_FEC_CFG_FEC_RS272_IEEE_ENABLED       0x4000UL
-	u8	duplex_state;
-	#define PORT_PHY_QCFG_RESP_DUPLEX_STATE_HALF 0x0UL
-	#define PORT_PHY_QCFG_RESP_DUPLEX_STATE_FULL 0x1UL
-	#define PORT_PHY_QCFG_RESP_DUPLEX_STATE_LAST PORT_PHY_QCFG_RESP_DUPLEX_STATE_FULL
-	u8	option_flags;
-	#define PORT_PHY_QCFG_RESP_OPTION_FLAGS_MEDIA_AUTO_DETECT     0x1UL
-	#define PORT_PHY_QCFG_RESP_OPTION_FLAGS_SIGNAL_MODE_KNOWN     0x2UL
-	#define PORT_PHY_QCFG_RESP_OPTION_FLAGS_SPEEDS2_SUPPORTED     0x4UL
-	char	phy_vendor_name[16];
-	char	phy_vendor_partnumber[16];
-	__le16	support_pam4_speeds;
-	#define PORT_PHY_QCFG_RESP_SUPPORT_PAM4_SPEEDS_50G      0x1UL
-	#define PORT_PHY_QCFG_RESP_SUPPORT_PAM4_SPEEDS_100G     0x2UL
-	#define PORT_PHY_QCFG_RESP_SUPPORT_PAM4_SPEEDS_200G     0x4UL
-	__le16	force_pam4_link_speed;
-	#define PORT_PHY_QCFG_RESP_FORCE_PAM4_LINK_SPEED_50GB  0x1f4UL
-	#define PORT_PHY_QCFG_RESP_FORCE_PAM4_LINK_SPEED_100GB 0x3e8UL
-	#define PORT_PHY_QCFG_RESP_FORCE_PAM4_LINK_SPEED_200GB 0x7d0UL
-	#define PORT_PHY_QCFG_RESP_FORCE_PAM4_LINK_SPEED_LAST PORT_PHY_QCFG_RESP_FORCE_PAM4_LINK_SPEED_200GB
-	__le16	auto_pam4_link_speed_mask;
-	#define PORT_PHY_QCFG_RESP_AUTO_PAM4_LINK_SPEED_MASK_50G      0x1UL
-	#define PORT_PHY_QCFG_RESP_AUTO_PAM4_LINK_SPEED_MASK_100G     0x2UL
-	#define PORT_PHY_QCFG_RESP_AUTO_PAM4_LINK_SPEED_MASK_200G     0x4UL
-	u8	link_partner_pam4_adv_speeds;
-	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_PAM4_ADV_SPEEDS_50GB      0x1UL
-	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_PAM4_ADV_SPEEDS_100GB     0x2UL
-	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_PAM4_ADV_SPEEDS_200GB     0x4UL
-	u8	link_down_reason;
-	#define PORT_PHY_QCFG_RESP_LINK_DOWN_REASON_RF                      0x1UL
-	#define PORT_PHY_QCFG_RESP_LINK_DOWN_REASON_OTP_SPEED_VIOLATION     0x2UL
-	__le16	support_speeds2;
-	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS2_1GB                0x1UL
-	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS2_10GB               0x2UL
-	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS2_25GB               0x4UL
-	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS2_40GB               0x8UL
-	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS2_50GB               0x10UL
-	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS2_100GB              0x20UL
-	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS2_50GB_PAM4_56       0x40UL
-	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS2_100GB_PAM4_56      0x80UL
-	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS2_200GB_PAM4_56      0x100UL
-	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS2_400GB_PAM4_56      0x200UL
-	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS2_100GB_PAM4_112     0x400UL
-	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS2_200GB_PAM4_112     0x800UL
-	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS2_400GB_PAM4_112     0x1000UL
-	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS2_800GB_PAM4_112     0x2000UL
-	__le16	force_link_speeds2;
-	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEEDS2_1GB            0xaUL
-	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEEDS2_10GB           0x64UL
-	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEEDS2_25GB           0xfaUL
-	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEEDS2_40GB           0x190UL
-	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEEDS2_50GB           0x1f4UL
-	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEEDS2_100GB          0x3e8UL
-	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEEDS2_50GB_PAM4_56   0x1f5UL
-	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEEDS2_100GB_PAM4_56  0x3e9UL
-	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEEDS2_200GB_PAM4_56  0x7d1UL
-	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEEDS2_400GB_PAM4_56  0xfa1UL
-	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEEDS2_100GB_PAM4_112 0x3eaUL
-	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEEDS2_200GB_PAM4_112 0x7d2UL
-	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEEDS2_400GB_PAM4_112 0xfa2UL
-	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEEDS2_800GB_PAM4_112 0x1f42UL
-	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEEDS2_LAST          PORT_PHY_QCFG_RESP_FORCE_LINK_SPEEDS2_800GB_PAM4_112
-	__le16	auto_link_speeds2;
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEEDS2_1GB                0x1UL
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEEDS2_10GB               0x2UL
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEEDS2_25GB               0x4UL
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEEDS2_40GB               0x8UL
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEEDS2_50GB               0x10UL
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEEDS2_100GB              0x20UL
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEEDS2_50GB_PAM4_56       0x40UL
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEEDS2_100GB_PAM4_56      0x80UL
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEEDS2_200GB_PAM4_56      0x100UL
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEEDS2_400GB_PAM4_56      0x200UL
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEEDS2_100GB_PAM4_112     0x400UL
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEEDS2_200GB_PAM4_112     0x800UL
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEEDS2_400GB_PAM4_112     0x1000UL
-	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEEDS2_800GB_PAM4_112     0x2000UL
-	u8	active_lanes;
-	u8	valid;
-};
-
-/* hwrm_port_mac_cfg_input (size:448b/56B) */
-struct hwrm_port_mac_cfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	flags;
-	#define PORT_MAC_CFG_REQ_FLAGS_MATCH_LINK                    0x1UL
-	#define PORT_MAC_CFG_REQ_FLAGS_VLAN_PRI2COS_ENABLE           0x2UL
-	#define PORT_MAC_CFG_REQ_FLAGS_TUNNEL_PRI2COS_ENABLE         0x4UL
-	#define PORT_MAC_CFG_REQ_FLAGS_IP_DSCP2COS_ENABLE            0x8UL
-	#define PORT_MAC_CFG_REQ_FLAGS_PTP_RX_TS_CAPTURE_ENABLE      0x10UL
-	#define PORT_MAC_CFG_REQ_FLAGS_PTP_RX_TS_CAPTURE_DISABLE     0x20UL
-	#define PORT_MAC_CFG_REQ_FLAGS_PTP_TX_TS_CAPTURE_ENABLE      0x40UL
-	#define PORT_MAC_CFG_REQ_FLAGS_PTP_TX_TS_CAPTURE_DISABLE     0x80UL
-	#define PORT_MAC_CFG_REQ_FLAGS_OOB_WOL_ENABLE                0x100UL
-	#define PORT_MAC_CFG_REQ_FLAGS_OOB_WOL_DISABLE               0x200UL
-	#define PORT_MAC_CFG_REQ_FLAGS_VLAN_PRI2COS_DISABLE          0x400UL
-	#define PORT_MAC_CFG_REQ_FLAGS_TUNNEL_PRI2COS_DISABLE        0x800UL
-	#define PORT_MAC_CFG_REQ_FLAGS_IP_DSCP2COS_DISABLE           0x1000UL
-	#define PORT_MAC_CFG_REQ_FLAGS_PTP_ONE_STEP_TX_TS            0x2000UL
-	#define PORT_MAC_CFG_REQ_FLAGS_ALL_RX_TS_CAPTURE_ENABLE      0x4000UL
-	#define PORT_MAC_CFG_REQ_FLAGS_ALL_RX_TS_CAPTURE_DISABLE     0x8000UL
-	__le32	enables;
-	#define PORT_MAC_CFG_REQ_ENABLES_IPG                            0x1UL
-	#define PORT_MAC_CFG_REQ_ENABLES_LPBK                           0x2UL
-	#define PORT_MAC_CFG_REQ_ENABLES_VLAN_PRI2COS_MAP_PRI           0x4UL
-	#define PORT_MAC_CFG_REQ_ENABLES_TUNNEL_PRI2COS_MAP_PRI         0x10UL
-	#define PORT_MAC_CFG_REQ_ENABLES_DSCP2COS_MAP_PRI               0x20UL
-	#define PORT_MAC_CFG_REQ_ENABLES_RX_TS_CAPTURE_PTP_MSG_TYPE     0x40UL
-	#define PORT_MAC_CFG_REQ_ENABLES_TX_TS_CAPTURE_PTP_MSG_TYPE     0x80UL
-	#define PORT_MAC_CFG_REQ_ENABLES_COS_FIELD_CFG                  0x100UL
-	#define PORT_MAC_CFG_REQ_ENABLES_PTP_FREQ_ADJ_PPB               0x200UL
-	#define PORT_MAC_CFG_REQ_ENABLES_PTP_ADJ_PHASE                  0x400UL
-	#define PORT_MAC_CFG_REQ_ENABLES_PTP_LOAD_CONTROL               0x800UL
-	__le16	port_id;
-	u8	ipg;
-	u8	lpbk;
-	#define PORT_MAC_CFG_REQ_LPBK_NONE   0x0UL
-	#define PORT_MAC_CFG_REQ_LPBK_LOCAL  0x1UL
-	#define PORT_MAC_CFG_REQ_LPBK_REMOTE 0x2UL
-	#define PORT_MAC_CFG_REQ_LPBK_LAST  PORT_MAC_CFG_REQ_LPBK_REMOTE
-	u8	vlan_pri2cos_map_pri;
-	u8	reserved1;
-	u8	tunnel_pri2cos_map_pri;
-	u8	dscp2pri_map_pri;
-	__le16	rx_ts_capture_ptp_msg_type;
-	__le16	tx_ts_capture_ptp_msg_type;
-	u8	cos_field_cfg;
-	#define PORT_MAC_CFG_REQ_COS_FIELD_CFG_RSVD1                     0x1UL
-	#define PORT_MAC_CFG_REQ_COS_FIELD_CFG_VLAN_PRI_SEL_MASK         0x6UL
-	#define PORT_MAC_CFG_REQ_COS_FIELD_CFG_VLAN_PRI_SEL_SFT          1
-	#define PORT_MAC_CFG_REQ_COS_FIELD_CFG_VLAN_PRI_SEL_INNERMOST      (0x0UL << 1)
-	#define PORT_MAC_CFG_REQ_COS_FIELD_CFG_VLAN_PRI_SEL_OUTER          (0x1UL << 1)
-	#define PORT_MAC_CFG_REQ_COS_FIELD_CFG_VLAN_PRI_SEL_OUTERMOST      (0x2UL << 1)
-	#define PORT_MAC_CFG_REQ_COS_FIELD_CFG_VLAN_PRI_SEL_UNSPECIFIED    (0x3UL << 1)
-	#define PORT_MAC_CFG_REQ_COS_FIELD_CFG_VLAN_PRI_SEL_LAST          PORT_MAC_CFG_REQ_COS_FIELD_CFG_VLAN_PRI_SEL_UNSPECIFIED
-	#define PORT_MAC_CFG_REQ_COS_FIELD_CFG_T_VLAN_PRI_SEL_MASK       0x18UL
-	#define PORT_MAC_CFG_REQ_COS_FIELD_CFG_T_VLAN_PRI_SEL_SFT        3
-	#define PORT_MAC_CFG_REQ_COS_FIELD_CFG_T_VLAN_PRI_SEL_INNERMOST    (0x0UL << 3)
-	#define PORT_MAC_CFG_REQ_COS_FIELD_CFG_T_VLAN_PRI_SEL_OUTER        (0x1UL << 3)
-	#define PORT_MAC_CFG_REQ_COS_FIELD_CFG_T_VLAN_PRI_SEL_OUTERMOST    (0x2UL << 3)
-	#define PORT_MAC_CFG_REQ_COS_FIELD_CFG_T_VLAN_PRI_SEL_UNSPECIFIED  (0x3UL << 3)
-	#define PORT_MAC_CFG_REQ_COS_FIELD_CFG_T_VLAN_PRI_SEL_LAST        PORT_MAC_CFG_REQ_COS_FIELD_CFG_T_VLAN_PRI_SEL_UNSPECIFIED
-	#define PORT_MAC_CFG_REQ_COS_FIELD_CFG_DEFAULT_COS_MASK          0xe0UL
-	#define PORT_MAC_CFG_REQ_COS_FIELD_CFG_DEFAULT_COS_SFT           5
-	u8	unused_0[3];
-	__le32	ptp_freq_adj_ppb;
-	u8	unused_1[3];
-	u8	ptp_load_control;
-	#define PORT_MAC_CFG_REQ_PTP_LOAD_CONTROL_NONE      0x0UL
-	#define PORT_MAC_CFG_REQ_PTP_LOAD_CONTROL_IMMEDIATE 0x1UL
-	#define PORT_MAC_CFG_REQ_PTP_LOAD_CONTROL_PPS_EVENT 0x2UL
-	#define PORT_MAC_CFG_REQ_PTP_LOAD_CONTROL_LAST     PORT_MAC_CFG_REQ_PTP_LOAD_CONTROL_PPS_EVENT
-	__le64	ptp_adj_phase;
-};
-
-/* hwrm_port_mac_cfg_output (size:128b/16B) */
-struct hwrm_port_mac_cfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le16	mru;
-	__le16	mtu;
-	u8	ipg;
-	u8	lpbk;
-	#define PORT_MAC_CFG_RESP_LPBK_NONE   0x0UL
-	#define PORT_MAC_CFG_RESP_LPBK_LOCAL  0x1UL
-	#define PORT_MAC_CFG_RESP_LPBK_REMOTE 0x2UL
-	#define PORT_MAC_CFG_RESP_LPBK_LAST  PORT_MAC_CFG_RESP_LPBK_REMOTE
-	u8	unused_0;
-	u8	valid;
-};
-
-/* hwrm_port_mac_ptp_qcfg_input (size:192b/24B) */
-struct hwrm_port_mac_ptp_qcfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	port_id;
-	u8	unused_0[6];
-};
-
-/* hwrm_port_mac_ptp_qcfg_output (size:704b/88B) */
-struct hwrm_port_mac_ptp_qcfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	flags;
-	#define PORT_MAC_PTP_QCFG_RESP_FLAGS_DIRECT_ACCESS                       0x1UL
-	#define PORT_MAC_PTP_QCFG_RESP_FLAGS_ONE_STEP_TX_TS                      0x4UL
-	#define PORT_MAC_PTP_QCFG_RESP_FLAGS_HWRM_ACCESS                         0x8UL
-	#define PORT_MAC_PTP_QCFG_RESP_FLAGS_PARTIAL_DIRECT_ACCESS_REF_CLOCK     0x10UL
-	#define PORT_MAC_PTP_QCFG_RESP_FLAGS_RTC_CONFIGURED                      0x20UL
-	#define PORT_MAC_PTP_QCFG_RESP_FLAGS_64B_PHC_TIME                        0x40UL
-	u8	unused_0[3];
-	__le32	rx_ts_reg_off_lower;
-	__le32	rx_ts_reg_off_upper;
-	__le32	rx_ts_reg_off_seq_id;
-	__le32	rx_ts_reg_off_src_id_0;
-	__le32	rx_ts_reg_off_src_id_1;
-	__le32	rx_ts_reg_off_src_id_2;
-	__le32	rx_ts_reg_off_domain_id;
-	__le32	rx_ts_reg_off_fifo;
-	__le32	rx_ts_reg_off_fifo_adv;
-	__le32	rx_ts_reg_off_granularity;
-	__le32	tx_ts_reg_off_lower;
-	__le32	tx_ts_reg_off_upper;
-	__le32	tx_ts_reg_off_seq_id;
-	__le32	tx_ts_reg_off_fifo;
-	__le32	tx_ts_reg_off_granularity;
-	__le32	ts_ref_clock_reg_lower;
-	__le32	ts_ref_clock_reg_upper;
-	u8	unused_1[7];
-	u8	valid;
-};
-
-/* tx_port_stats (size:3264b/408B) */
-struct tx_port_stats {
-	__le64	tx_64b_frames;
-	__le64	tx_65b_127b_frames;
-	__le64	tx_128b_255b_frames;
-	__le64	tx_256b_511b_frames;
-	__le64	tx_512b_1023b_frames;
-	__le64	tx_1024b_1518b_frames;
-	__le64	tx_good_vlan_frames;
-	__le64	tx_1519b_2047b_frames;
-	__le64	tx_2048b_4095b_frames;
-	__le64	tx_4096b_9216b_frames;
-	__le64	tx_9217b_16383b_frames;
-	__le64	tx_good_frames;
-	__le64	tx_total_frames;
-	__le64	tx_ucast_frames;
-	__le64	tx_mcast_frames;
-	__le64	tx_bcast_frames;
-	__le64	tx_pause_frames;
-	__le64	tx_pfc_frames;
-	__le64	tx_jabber_frames;
-	__le64	tx_fcs_err_frames;
-	__le64	tx_control_frames;
-	__le64	tx_oversz_frames;
-	__le64	tx_single_dfrl_frames;
-	__le64	tx_multi_dfrl_frames;
-	__le64	tx_single_coll_frames;
-	__le64	tx_multi_coll_frames;
-	__le64	tx_late_coll_frames;
-	__le64	tx_excessive_coll_frames;
-	__le64	tx_frag_frames;
-	__le64	tx_err;
-	__le64	tx_tagged_frames;
-	__le64	tx_dbl_tagged_frames;
-	__le64	tx_runt_frames;
-	__le64	tx_fifo_underruns;
-	__le64	tx_pfc_ena_frames_pri0;
-	__le64	tx_pfc_ena_frames_pri1;
-	__le64	tx_pfc_ena_frames_pri2;
-	__le64	tx_pfc_ena_frames_pri3;
-	__le64	tx_pfc_ena_frames_pri4;
-	__le64	tx_pfc_ena_frames_pri5;
-	__le64	tx_pfc_ena_frames_pri6;
-	__le64	tx_pfc_ena_frames_pri7;
-	__le64	tx_eee_lpi_events;
-	__le64	tx_eee_lpi_duration;
-	__le64	tx_llfc_logical_msgs;
-	__le64	tx_hcfc_msgs;
-	__le64	tx_total_collisions;
-	__le64	tx_bytes;
-	__le64	tx_xthol_frames;
-	__le64	tx_stat_discard;
-	__le64	tx_stat_error;
-};
-
-/* rx_port_stats (size:4224b/528B) */
-struct rx_port_stats {
-	__le64	rx_64b_frames;
-	__le64	rx_65b_127b_frames;
-	__le64	rx_128b_255b_frames;
-	__le64	rx_256b_511b_frames;
-	__le64	rx_512b_1023b_frames;
-	__le64	rx_1024b_1518b_frames;
-	__le64	rx_good_vlan_frames;
-	__le64	rx_1519b_2047b_frames;
-	__le64	rx_2048b_4095b_frames;
-	__le64	rx_4096b_9216b_frames;
-	__le64	rx_9217b_16383b_frames;
-	__le64	rx_total_frames;
-	__le64	rx_ucast_frames;
-	__le64	rx_mcast_frames;
-	__le64	rx_bcast_frames;
-	__le64	rx_fcs_err_frames;
-	__le64	rx_ctrl_frames;
-	__le64	rx_pause_frames;
-	__le64	rx_pfc_frames;
-	__le64	rx_unsupported_opcode_frames;
-	__le64	rx_unsupported_da_pausepfc_frames;
-	__le64	rx_wrong_sa_frames;
-	__le64	rx_align_err_frames;
-	__le64	rx_oor_len_frames;
-	__le64	rx_code_err_frames;
-	__le64	rx_false_carrier_frames;
-	__le64	rx_ovrsz_frames;
-	__le64	rx_jbr_frames;
-	__le64	rx_mtu_err_frames;
-	__le64	rx_match_crc_frames;
-	__le64	rx_promiscuous_frames;
-	__le64	rx_tagged_frames;
-	__le64	rx_double_tagged_frames;
-	__le64	rx_trunc_frames;
-	__le64	rx_good_frames;
-	__le64	rx_pfc_xon2xoff_frames_pri0;
-	__le64	rx_pfc_xon2xoff_frames_pri1;
-	__le64	rx_pfc_xon2xoff_frames_pri2;
-	__le64	rx_pfc_xon2xoff_frames_pri3;
-	__le64	rx_pfc_xon2xoff_frames_pri4;
-	__le64	rx_pfc_xon2xoff_frames_pri5;
-	__le64	rx_pfc_xon2xoff_frames_pri6;
-	__le64	rx_pfc_xon2xoff_frames_pri7;
-	__le64	rx_pfc_ena_frames_pri0;
-	__le64	rx_pfc_ena_frames_pri1;
-	__le64	rx_pfc_ena_frames_pri2;
-	__le64	rx_pfc_ena_frames_pri3;
-	__le64	rx_pfc_ena_frames_pri4;
-	__le64	rx_pfc_ena_frames_pri5;
-	__le64	rx_pfc_ena_frames_pri6;
-	__le64	rx_pfc_ena_frames_pri7;
-	__le64	rx_sch_crc_err_frames;
-	__le64	rx_undrsz_frames;
-	__le64	rx_frag_frames;
-	__le64	rx_eee_lpi_events;
-	__le64	rx_eee_lpi_duration;
-	__le64	rx_llfc_physical_msgs;
-	__le64	rx_llfc_logical_msgs;
-	__le64	rx_llfc_msgs_with_crc_err;
-	__le64	rx_hcfc_msgs;
-	__le64	rx_hcfc_msgs_with_crc_err;
-	__le64	rx_bytes;
-	__le64	rx_runt_bytes;
-	__le64	rx_runt_frames;
-	__le64	rx_stat_discard;
-	__le64	rx_stat_err;
-};
-
-/* hwrm_port_qstats_input (size:320b/40B) */
-struct hwrm_port_qstats_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	port_id;
-	u8	flags;
-	#define PORT_QSTATS_REQ_FLAGS_COUNTER_MASK     0x1UL
-	u8	unused_0[5];
-	__le64	tx_stat_host_addr;
-	__le64	rx_stat_host_addr;
-};
-
-/* hwrm_port_qstats_output (size:128b/16B) */
-struct hwrm_port_qstats_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le16	tx_stat_size;
-	__le16	rx_stat_size;
-	u8	flags;
-	#define PORT_QSTATS_RESP_FLAGS_CLEARED     0x1UL
-	u8	unused_0[2];
-	u8	valid;
-};
-
-/* tx_port_stats_ext (size:2048b/256B) */
-struct tx_port_stats_ext {
-	__le64	tx_bytes_cos0;
-	__le64	tx_bytes_cos1;
-	__le64	tx_bytes_cos2;
-	__le64	tx_bytes_cos3;
-	__le64	tx_bytes_cos4;
-	__le64	tx_bytes_cos5;
-	__le64	tx_bytes_cos6;
-	__le64	tx_bytes_cos7;
-	__le64	tx_packets_cos0;
-	__le64	tx_packets_cos1;
-	__le64	tx_packets_cos2;
-	__le64	tx_packets_cos3;
-	__le64	tx_packets_cos4;
-	__le64	tx_packets_cos5;
-	__le64	tx_packets_cos6;
-	__le64	tx_packets_cos7;
-	__le64	pfc_pri0_tx_duration_us;
-	__le64	pfc_pri0_tx_transitions;
-	__le64	pfc_pri1_tx_duration_us;
-	__le64	pfc_pri1_tx_transitions;
-	__le64	pfc_pri2_tx_duration_us;
-	__le64	pfc_pri2_tx_transitions;
-	__le64	pfc_pri3_tx_duration_us;
-	__le64	pfc_pri3_tx_transitions;
-	__le64	pfc_pri4_tx_duration_us;
-	__le64	pfc_pri4_tx_transitions;
-	__le64	pfc_pri5_tx_duration_us;
-	__le64	pfc_pri5_tx_transitions;
-	__le64	pfc_pri6_tx_duration_us;
-	__le64	pfc_pri6_tx_transitions;
-	__le64	pfc_pri7_tx_duration_us;
-	__le64	pfc_pri7_tx_transitions;
-};
-
-/* rx_port_stats_ext (size:3904b/488B) */
-struct rx_port_stats_ext {
-	__le64	link_down_events;
-	__le64	continuous_pause_events;
-	__le64	resume_pause_events;
-	__le64	continuous_roce_pause_events;
-	__le64	resume_roce_pause_events;
-	__le64	rx_bytes_cos0;
-	__le64	rx_bytes_cos1;
-	__le64	rx_bytes_cos2;
-	__le64	rx_bytes_cos3;
-	__le64	rx_bytes_cos4;
-	__le64	rx_bytes_cos5;
-	__le64	rx_bytes_cos6;
-	__le64	rx_bytes_cos7;
-	__le64	rx_packets_cos0;
-	__le64	rx_packets_cos1;
-	__le64	rx_packets_cos2;
-	__le64	rx_packets_cos3;
-	__le64	rx_packets_cos4;
-	__le64	rx_packets_cos5;
-	__le64	rx_packets_cos6;
-	__le64	rx_packets_cos7;
-	__le64	pfc_pri0_rx_duration_us;
-	__le64	pfc_pri0_rx_transitions;
-	__le64	pfc_pri1_rx_duration_us;
-	__le64	pfc_pri1_rx_transitions;
-	__le64	pfc_pri2_rx_duration_us;
-	__le64	pfc_pri2_rx_transitions;
-	__le64	pfc_pri3_rx_duration_us;
-	__le64	pfc_pri3_rx_transitions;
-	__le64	pfc_pri4_rx_duration_us;
-	__le64	pfc_pri4_rx_transitions;
-	__le64	pfc_pri5_rx_duration_us;
-	__le64	pfc_pri5_rx_transitions;
-	__le64	pfc_pri6_rx_duration_us;
-	__le64	pfc_pri6_rx_transitions;
-	__le64	pfc_pri7_rx_duration_us;
-	__le64	pfc_pri7_rx_transitions;
-	__le64	rx_bits;
-	__le64	rx_buffer_passed_threshold;
-	__le64	rx_pcs_symbol_err;
-	__le64	rx_corrected_bits;
-	__le64	rx_discard_bytes_cos0;
-	__le64	rx_discard_bytes_cos1;
-	__le64	rx_discard_bytes_cos2;
-	__le64	rx_discard_bytes_cos3;
-	__le64	rx_discard_bytes_cos4;
-	__le64	rx_discard_bytes_cos5;
-	__le64	rx_discard_bytes_cos6;
-	__le64	rx_discard_bytes_cos7;
-	__le64	rx_discard_packets_cos0;
-	__le64	rx_discard_packets_cos1;
-	__le64	rx_discard_packets_cos2;
-	__le64	rx_discard_packets_cos3;
-	__le64	rx_discard_packets_cos4;
-	__le64	rx_discard_packets_cos5;
-	__le64	rx_discard_packets_cos6;
-	__le64	rx_discard_packets_cos7;
-	__le64	rx_fec_corrected_blocks;
-	__le64	rx_fec_uncorrectable_blocks;
-	__le64	rx_filter_miss;
-	__le64	rx_fec_symbol_err;
-};
-
-/* hwrm_port_qstats_ext_input (size:320b/40B) */
-struct hwrm_port_qstats_ext_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	port_id;
-	__le16	tx_stat_size;
-	__le16	rx_stat_size;
-	u8	flags;
-	#define PORT_QSTATS_EXT_REQ_FLAGS_COUNTER_MASK     0x1UL
-	u8	unused_0;
-	__le64	tx_stat_host_addr;
-	__le64	rx_stat_host_addr;
-};
-
-/* hwrm_port_qstats_ext_output (size:128b/16B) */
-struct hwrm_port_qstats_ext_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le16	tx_stat_size;
-	__le16	rx_stat_size;
-	__le16	total_active_cos_queues;
-	u8	flags;
-	#define PORT_QSTATS_EXT_RESP_FLAGS_CLEAR_ROCE_COUNTERS_SUPPORTED     0x1UL
-	#define PORT_QSTATS_EXT_RESP_FLAGS_CLEARED                           0x2UL
-	u8	valid;
-};
-
-/* hwrm_port_lpbk_qstats_input (size:256b/32B) */
-struct hwrm_port_lpbk_qstats_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	lpbk_stat_size;
-	u8	flags;
-	#define PORT_LPBK_QSTATS_REQ_FLAGS_COUNTER_MASK     0x1UL
-	u8	unused_0[5];
-	__le64	lpbk_stat_host_addr;
-};
-
-/* hwrm_port_lpbk_qstats_output (size:128b/16B) */
-struct hwrm_port_lpbk_qstats_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le16	lpbk_stat_size;
-	u8	unused_0[5];
-	u8	valid;
-};
-
-/* port_lpbk_stats (size:640b/80B) */
-struct port_lpbk_stats {
-	__le64	lpbk_ucast_frames;
-	__le64	lpbk_mcast_frames;
-	__le64	lpbk_bcast_frames;
-	__le64	lpbk_ucast_bytes;
-	__le64	lpbk_mcast_bytes;
-	__le64	lpbk_bcast_bytes;
-	__le64	lpbk_tx_discards;
-	__le64	lpbk_tx_errors;
-	__le64	lpbk_rx_discards;
-	__le64	lpbk_rx_errors;
-};
-
-/* hwrm_port_ecn_qstats_input (size:256b/32B) */
-struct hwrm_port_ecn_qstats_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	port_id;
-	__le16	ecn_stat_buf_size;
-	u8	flags;
-	#define PORT_ECN_QSTATS_REQ_FLAGS_COUNTER_MASK     0x1UL
-	u8	unused_0[3];
-	__le64	ecn_stat_host_addr;
-};
-
-/* hwrm_port_ecn_qstats_output (size:128b/16B) */
-struct hwrm_port_ecn_qstats_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le16	ecn_stat_buf_size;
-	u8	mark_en;
-	u8	unused_0[4];
-	u8	valid;
-};
-
-/* port_stats_ecn (size:512b/64B) */
-struct port_stats_ecn {
-	__le64	mark_cnt_cos0;
-	__le64	mark_cnt_cos1;
-	__le64	mark_cnt_cos2;
-	__le64	mark_cnt_cos3;
-	__le64	mark_cnt_cos4;
-	__le64	mark_cnt_cos5;
-	__le64	mark_cnt_cos6;
-	__le64	mark_cnt_cos7;
-};
-
-/* hwrm_port_clr_stats_input (size:192b/24B) */
-struct hwrm_port_clr_stats_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	port_id;
-	u8	flags;
-	#define PORT_CLR_STATS_REQ_FLAGS_ROCE_COUNTERS     0x1UL
-	u8	unused_0[5];
-};
-
-/* hwrm_port_clr_stats_output (size:128b/16B) */
-struct hwrm_port_clr_stats_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_port_lpbk_clr_stats_input (size:192b/24B) */
-struct hwrm_port_lpbk_clr_stats_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	port_id;
-	u8	unused_0[6];
-};
-
-/* hwrm_port_lpbk_clr_stats_output (size:128b/16B) */
-struct hwrm_port_lpbk_clr_stats_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_port_ts_query_input (size:320b/40B) */
-struct hwrm_port_ts_query_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	flags;
-	#define PORT_TS_QUERY_REQ_FLAGS_PATH             0x1UL
-	#define PORT_TS_QUERY_REQ_FLAGS_PATH_TX            0x0UL
-	#define PORT_TS_QUERY_REQ_FLAGS_PATH_RX            0x1UL
-	#define PORT_TS_QUERY_REQ_FLAGS_PATH_LAST         PORT_TS_QUERY_REQ_FLAGS_PATH_RX
-	#define PORT_TS_QUERY_REQ_FLAGS_CURRENT_TIME     0x2UL
-	__le16	port_id;
-	u8	unused_0[2];
-	__le16	enables;
-	#define PORT_TS_QUERY_REQ_ENABLES_TS_REQ_TIMEOUT     0x1UL
-	#define PORT_TS_QUERY_REQ_ENABLES_PTP_SEQ_ID         0x2UL
-	#define PORT_TS_QUERY_REQ_ENABLES_PTP_HDR_OFFSET     0x4UL
-	__le16	ts_req_timeout;
-	__le32	ptp_seq_id;
-	__le16	ptp_hdr_offset;
-	u8	unused_1[6];
-};
-
-/* hwrm_port_ts_query_output (size:192b/24B) */
-struct hwrm_port_ts_query_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le64	ptp_msg_ts;
-	__le16	ptp_msg_seqid;
-	u8	unused_0[5];
-	u8	valid;
-};
-
-/* hwrm_port_phy_qcaps_input (size:192b/24B) */
-struct hwrm_port_phy_qcaps_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	port_id;
-	u8	unused_0[6];
-};
-
-/* hwrm_port_phy_qcaps_output (size:320b/40B) */
-struct hwrm_port_phy_qcaps_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	flags;
-	#define PORT_PHY_QCAPS_RESP_FLAGS_EEE_SUPPORTED                    0x1UL
-	#define PORT_PHY_QCAPS_RESP_FLAGS_EXTERNAL_LPBK_SUPPORTED          0x2UL
-	#define PORT_PHY_QCAPS_RESP_FLAGS_AUTONEG_LPBK_SUPPORTED           0x4UL
-	#define PORT_PHY_QCAPS_RESP_FLAGS_SHARED_PHY_CFG_SUPPORTED         0x8UL
-	#define PORT_PHY_QCAPS_RESP_FLAGS_CUMULATIVE_COUNTERS_ON_RESET     0x10UL
-	#define PORT_PHY_QCAPS_RESP_FLAGS_LOCAL_LPBK_NOT_SUPPORTED         0x20UL
-	#define PORT_PHY_QCAPS_RESP_FLAGS_FW_MANAGED_LINK_DOWN             0x40UL
-	#define PORT_PHY_QCAPS_RESP_FLAGS_NO_FCS                           0x80UL
-	u8	port_cnt;
-	#define PORT_PHY_QCAPS_RESP_PORT_CNT_UNKNOWN 0x0UL
-	#define PORT_PHY_QCAPS_RESP_PORT_CNT_1       0x1UL
-	#define PORT_PHY_QCAPS_RESP_PORT_CNT_2       0x2UL
-	#define PORT_PHY_QCAPS_RESP_PORT_CNT_3       0x3UL
-	#define PORT_PHY_QCAPS_RESP_PORT_CNT_4       0x4UL
-	#define PORT_PHY_QCAPS_RESP_PORT_CNT_12      0xcUL
-	#define PORT_PHY_QCAPS_RESP_PORT_CNT_LAST   PORT_PHY_QCAPS_RESP_PORT_CNT_12
-	__le16	supported_speeds_force_mode;
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_100MBHD     0x1UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_100MB       0x2UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_1GBHD       0x4UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_1GB         0x8UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_2GB         0x10UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_2_5GB       0x20UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_10GB        0x40UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_20GB        0x80UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_25GB        0x100UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_40GB        0x200UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_50GB        0x400UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_100GB       0x800UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_10MBHD      0x1000UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_10MB        0x2000UL
-	__le16	supported_speeds_auto_mode;
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_100MBHD     0x1UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_100MB       0x2UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_1GBHD       0x4UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_1GB         0x8UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_2GB         0x10UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_2_5GB       0x20UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_10GB        0x40UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_20GB        0x80UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_25GB        0x100UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_40GB        0x200UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_50GB        0x400UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_100GB       0x800UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_10MBHD      0x1000UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_10MB        0x2000UL
-	__le16	supported_speeds_eee_mode;
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_EEE_MODE_RSVD1     0x1UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_EEE_MODE_100MB     0x2UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_EEE_MODE_RSVD2     0x4UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_EEE_MODE_1GB       0x8UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_EEE_MODE_RSVD3     0x10UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_EEE_MODE_RSVD4     0x20UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_EEE_MODE_10GB      0x40UL
-	__le32	tx_lpi_timer_low;
-	#define PORT_PHY_QCAPS_RESP_TX_LPI_TIMER_LOW_MASK 0xffffffUL
-	#define PORT_PHY_QCAPS_RESP_TX_LPI_TIMER_LOW_SFT 0
-	#define PORT_PHY_QCAPS_RESP_RSVD2_MASK           0xff000000UL
-	#define PORT_PHY_QCAPS_RESP_RSVD2_SFT            24
-	__le32	valid_tx_lpi_timer_high;
-	#define PORT_PHY_QCAPS_RESP_TX_LPI_TIMER_HIGH_MASK 0xffffffUL
-	#define PORT_PHY_QCAPS_RESP_TX_LPI_TIMER_HIGH_SFT 0
-	#define PORT_PHY_QCAPS_RESP_RSVD_MASK             0xff000000UL
-	#define PORT_PHY_QCAPS_RESP_RSVD_SFT              24
-	__le16	supported_pam4_speeds_auto_mode;
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_PAM4_SPEEDS_AUTO_MODE_50G      0x1UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_PAM4_SPEEDS_AUTO_MODE_100G     0x2UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_PAM4_SPEEDS_AUTO_MODE_200G     0x4UL
-	__le16	supported_pam4_speeds_force_mode;
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_PAM4_SPEEDS_FORCE_MODE_50G      0x1UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_PAM4_SPEEDS_FORCE_MODE_100G     0x2UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_PAM4_SPEEDS_FORCE_MODE_200G     0x4UL
-	__le16	flags2;
-	#define PORT_PHY_QCAPS_RESP_FLAGS2_PAUSE_UNSUPPORTED           0x1UL
-	#define PORT_PHY_QCAPS_RESP_FLAGS2_PFC_UNSUPPORTED             0x2UL
-	#define PORT_PHY_QCAPS_RESP_FLAGS2_BANK_ADDR_SUPPORTED         0x4UL
-	#define PORT_PHY_QCAPS_RESP_FLAGS2_SPEEDS2_SUPPORTED           0x8UL
-	#define PORT_PHY_QCAPS_RESP_FLAGS2_REMOTE_LPBK_UNSUPPORTED     0x10UL
-	u8	internal_port_cnt;
-	u8	unused_0;
-	__le16	supported_speeds2_force_mode;
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_FORCE_MODE_1GB                0x1UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_FORCE_MODE_10GB               0x2UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_FORCE_MODE_25GB               0x4UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_FORCE_MODE_40GB               0x8UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_FORCE_MODE_50GB               0x10UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_FORCE_MODE_100GB              0x20UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_FORCE_MODE_50GB_PAM4_56       0x40UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_FORCE_MODE_100GB_PAM4_56      0x80UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_FORCE_MODE_200GB_PAM4_56      0x100UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_FORCE_MODE_400GB_PAM4_56      0x200UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_FORCE_MODE_100GB_PAM4_112     0x400UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_FORCE_MODE_200GB_PAM4_112     0x800UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_FORCE_MODE_400GB_PAM4_112     0x1000UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_FORCE_MODE_800GB_PAM4_112     0x2000UL
-	__le16	supported_speeds2_auto_mode;
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_AUTO_MODE_1GB                0x1UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_AUTO_MODE_10GB               0x2UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_AUTO_MODE_25GB               0x4UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_AUTO_MODE_40GB               0x8UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_AUTO_MODE_50GB               0x10UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_AUTO_MODE_100GB              0x20UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_AUTO_MODE_50GB_PAM4_56       0x40UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_AUTO_MODE_100GB_PAM4_56      0x80UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_AUTO_MODE_200GB_PAM4_56      0x100UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_AUTO_MODE_400GB_PAM4_56      0x200UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_AUTO_MODE_100GB_PAM4_112     0x400UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_AUTO_MODE_200GB_PAM4_112     0x800UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_AUTO_MODE_400GB_PAM4_112     0x1000UL
-	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_AUTO_MODE_800GB_PAM4_112     0x2000UL
-	u8	unused_1[3];
-	u8	valid;
-};
-
-/* hwrm_port_phy_i2c_write_input (size:832b/104B) */
-struct hwrm_port_phy_i2c_write_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	flags;
-	__le32	enables;
-	#define PORT_PHY_I2C_WRITE_REQ_ENABLES_PAGE_OFFSET     0x1UL
-	#define PORT_PHY_I2C_WRITE_REQ_ENABLES_BANK_NUMBER     0x2UL
-	__le16	port_id;
-	u8	i2c_slave_addr;
-	u8	bank_number;
-	__le16	page_number;
-	__le16	page_offset;
-	u8	data_length;
-	u8	unused_1[7];
-	__le32	data[16];
-};
-
-/* hwrm_port_phy_i2c_write_output (size:128b/16B) */
-struct hwrm_port_phy_i2c_write_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_port_phy_i2c_read_input (size:320b/40B) */
-struct hwrm_port_phy_i2c_read_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	flags;
-	__le32	enables;
-	#define PORT_PHY_I2C_READ_REQ_ENABLES_PAGE_OFFSET     0x1UL
-	#define PORT_PHY_I2C_READ_REQ_ENABLES_BANK_NUMBER     0x2UL
-	__le16	port_id;
-	u8	i2c_slave_addr;
-	u8	bank_number;
-	__le16	page_number;
-	__le16	page_offset;
-	u8	data_length;
-	u8	unused_1[7];
-};
-
-/* hwrm_port_phy_i2c_read_output (size:640b/80B) */
-struct hwrm_port_phy_i2c_read_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le32	data[16];
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_port_phy_mdio_write_input (size:320b/40B) */
-struct hwrm_port_phy_mdio_write_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	unused_0[2];
-	__le16	port_id;
-	u8	phy_addr;
-	u8	dev_addr;
-	__le16	reg_addr;
-	__le16	reg_data;
-	u8	cl45_mdio;
-	u8	unused_1[7];
-};
-
-/* hwrm_port_phy_mdio_write_output (size:128b/16B) */
-struct hwrm_port_phy_mdio_write_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_port_phy_mdio_read_input (size:256b/32B) */
-struct hwrm_port_phy_mdio_read_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	unused_0[2];
-	__le16	port_id;
-	u8	phy_addr;
-	u8	dev_addr;
-	__le16	reg_addr;
-	u8	cl45_mdio;
-	u8	unused_1;
-};
-
-/* hwrm_port_phy_mdio_read_output (size:128b/16B) */
-struct hwrm_port_phy_mdio_read_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le16	reg_data;
-	u8	unused_0[5];
-	u8	valid;
-};
-
-/* hwrm_port_led_cfg_input (size:512b/64B) */
-struct hwrm_port_led_cfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	enables;
-	#define PORT_LED_CFG_REQ_ENABLES_LED0_ID            0x1UL
-	#define PORT_LED_CFG_REQ_ENABLES_LED0_STATE         0x2UL
-	#define PORT_LED_CFG_REQ_ENABLES_LED0_COLOR         0x4UL
-	#define PORT_LED_CFG_REQ_ENABLES_LED0_BLINK_ON      0x8UL
-	#define PORT_LED_CFG_REQ_ENABLES_LED0_BLINK_OFF     0x10UL
-	#define PORT_LED_CFG_REQ_ENABLES_LED0_GROUP_ID      0x20UL
-	#define PORT_LED_CFG_REQ_ENABLES_LED1_ID            0x40UL
-	#define PORT_LED_CFG_REQ_ENABLES_LED1_STATE         0x80UL
-	#define PORT_LED_CFG_REQ_ENABLES_LED1_COLOR         0x100UL
-	#define PORT_LED_CFG_REQ_ENABLES_LED1_BLINK_ON      0x200UL
-	#define PORT_LED_CFG_REQ_ENABLES_LED1_BLINK_OFF     0x400UL
-	#define PORT_LED_CFG_REQ_ENABLES_LED1_GROUP_ID      0x800UL
-	#define PORT_LED_CFG_REQ_ENABLES_LED2_ID            0x1000UL
-	#define PORT_LED_CFG_REQ_ENABLES_LED2_STATE         0x2000UL
-	#define PORT_LED_CFG_REQ_ENABLES_LED2_COLOR         0x4000UL
-	#define PORT_LED_CFG_REQ_ENABLES_LED2_BLINK_ON      0x8000UL
-	#define PORT_LED_CFG_REQ_ENABLES_LED2_BLINK_OFF     0x10000UL
-	#define PORT_LED_CFG_REQ_ENABLES_LED2_GROUP_ID      0x20000UL
-	#define PORT_LED_CFG_REQ_ENABLES_LED3_ID            0x40000UL
-	#define PORT_LED_CFG_REQ_ENABLES_LED3_STATE         0x80000UL
-	#define PORT_LED_CFG_REQ_ENABLES_LED3_COLOR         0x100000UL
-	#define PORT_LED_CFG_REQ_ENABLES_LED3_BLINK_ON      0x200000UL
-	#define PORT_LED_CFG_REQ_ENABLES_LED3_BLINK_OFF     0x400000UL
-	#define PORT_LED_CFG_REQ_ENABLES_LED3_GROUP_ID      0x800000UL
-	__le16	port_id;
-	u8	num_leds;
-	u8	rsvd;
-	u8	led0_id;
-	u8	led0_state;
-	#define PORT_LED_CFG_REQ_LED0_STATE_DEFAULT  0x0UL
-	#define PORT_LED_CFG_REQ_LED0_STATE_OFF      0x1UL
-	#define PORT_LED_CFG_REQ_LED0_STATE_ON       0x2UL
-	#define PORT_LED_CFG_REQ_LED0_STATE_BLINK    0x3UL
-	#define PORT_LED_CFG_REQ_LED0_STATE_BLINKALT 0x4UL
-	#define PORT_LED_CFG_REQ_LED0_STATE_LAST    PORT_LED_CFG_REQ_LED0_STATE_BLINKALT
-	u8	led0_color;
-	#define PORT_LED_CFG_REQ_LED0_COLOR_DEFAULT    0x0UL
-	#define PORT_LED_CFG_REQ_LED0_COLOR_AMBER      0x1UL
-	#define PORT_LED_CFG_REQ_LED0_COLOR_GREEN      0x2UL
-	#define PORT_LED_CFG_REQ_LED0_COLOR_GREENAMBER 0x3UL
-	#define PORT_LED_CFG_REQ_LED0_COLOR_LAST      PORT_LED_CFG_REQ_LED0_COLOR_GREENAMBER
-	u8	unused_0;
-	__le16	led0_blink_on;
-	__le16	led0_blink_off;
-	u8	led0_group_id;
-	u8	rsvd0;
-	u8	led1_id;
-	u8	led1_state;
-	#define PORT_LED_CFG_REQ_LED1_STATE_DEFAULT  0x0UL
-	#define PORT_LED_CFG_REQ_LED1_STATE_OFF      0x1UL
-	#define PORT_LED_CFG_REQ_LED1_STATE_ON       0x2UL
-	#define PORT_LED_CFG_REQ_LED1_STATE_BLINK    0x3UL
-	#define PORT_LED_CFG_REQ_LED1_STATE_BLINKALT 0x4UL
-	#define PORT_LED_CFG_REQ_LED1_STATE_LAST    PORT_LED_CFG_REQ_LED1_STATE_BLINKALT
-	u8	led1_color;
-	#define PORT_LED_CFG_REQ_LED1_COLOR_DEFAULT    0x0UL
-	#define PORT_LED_CFG_REQ_LED1_COLOR_AMBER      0x1UL
-	#define PORT_LED_CFG_REQ_LED1_COLOR_GREEN      0x2UL
-	#define PORT_LED_CFG_REQ_LED1_COLOR_GREENAMBER 0x3UL
-	#define PORT_LED_CFG_REQ_LED1_COLOR_LAST      PORT_LED_CFG_REQ_LED1_COLOR_GREENAMBER
-	u8	unused_1;
-	__le16	led1_blink_on;
-	__le16	led1_blink_off;
-	u8	led1_group_id;
-	u8	rsvd1;
-	u8	led2_id;
-	u8	led2_state;
-	#define PORT_LED_CFG_REQ_LED2_STATE_DEFAULT  0x0UL
-	#define PORT_LED_CFG_REQ_LED2_STATE_OFF      0x1UL
-	#define PORT_LED_CFG_REQ_LED2_STATE_ON       0x2UL
-	#define PORT_LED_CFG_REQ_LED2_STATE_BLINK    0x3UL
-	#define PORT_LED_CFG_REQ_LED2_STATE_BLINKALT 0x4UL
-	#define PORT_LED_CFG_REQ_LED2_STATE_LAST    PORT_LED_CFG_REQ_LED2_STATE_BLINKALT
-	u8	led2_color;
-	#define PORT_LED_CFG_REQ_LED2_COLOR_DEFAULT    0x0UL
-	#define PORT_LED_CFG_REQ_LED2_COLOR_AMBER      0x1UL
-	#define PORT_LED_CFG_REQ_LED2_COLOR_GREEN      0x2UL
-	#define PORT_LED_CFG_REQ_LED2_COLOR_GREENAMBER 0x3UL
-	#define PORT_LED_CFG_REQ_LED2_COLOR_LAST      PORT_LED_CFG_REQ_LED2_COLOR_GREENAMBER
-	u8	unused_2;
-	__le16	led2_blink_on;
-	__le16	led2_blink_off;
-	u8	led2_group_id;
-	u8	rsvd2;
-	u8	led3_id;
-	u8	led3_state;
-	#define PORT_LED_CFG_REQ_LED3_STATE_DEFAULT  0x0UL
-	#define PORT_LED_CFG_REQ_LED3_STATE_OFF      0x1UL
-	#define PORT_LED_CFG_REQ_LED3_STATE_ON       0x2UL
-	#define PORT_LED_CFG_REQ_LED3_STATE_BLINK    0x3UL
-	#define PORT_LED_CFG_REQ_LED3_STATE_BLINKALT 0x4UL
-	#define PORT_LED_CFG_REQ_LED3_STATE_LAST    PORT_LED_CFG_REQ_LED3_STATE_BLINKALT
-	u8	led3_color;
-	#define PORT_LED_CFG_REQ_LED3_COLOR_DEFAULT    0x0UL
-	#define PORT_LED_CFG_REQ_LED3_COLOR_AMBER      0x1UL
-	#define PORT_LED_CFG_REQ_LED3_COLOR_GREEN      0x2UL
-	#define PORT_LED_CFG_REQ_LED3_COLOR_GREENAMBER 0x3UL
-	#define PORT_LED_CFG_REQ_LED3_COLOR_LAST      PORT_LED_CFG_REQ_LED3_COLOR_GREENAMBER
-	u8	unused_3;
-	__le16	led3_blink_on;
-	__le16	led3_blink_off;
-	u8	led3_group_id;
-	u8	rsvd3;
-};
-
-/* hwrm_port_led_cfg_output (size:128b/16B) */
-struct hwrm_port_led_cfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_port_led_qcfg_input (size:192b/24B) */
-struct hwrm_port_led_qcfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	port_id;
-	u8	unused_0[6];
-};
-
-/* hwrm_port_led_qcfg_output (size:448b/56B) */
-struct hwrm_port_led_qcfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	num_leds;
-	u8	led0_id;
-	u8	led0_type;
-	#define PORT_LED_QCFG_RESP_LED0_TYPE_SPEED    0x0UL
-	#define PORT_LED_QCFG_RESP_LED0_TYPE_ACTIVITY 0x1UL
-	#define PORT_LED_QCFG_RESP_LED0_TYPE_INVALID  0xffUL
-	#define PORT_LED_QCFG_RESP_LED0_TYPE_LAST    PORT_LED_QCFG_RESP_LED0_TYPE_INVALID
-	u8	led0_state;
-	#define PORT_LED_QCFG_RESP_LED0_STATE_DEFAULT  0x0UL
-	#define PORT_LED_QCFG_RESP_LED0_STATE_OFF      0x1UL
-	#define PORT_LED_QCFG_RESP_LED0_STATE_ON       0x2UL
-	#define PORT_LED_QCFG_RESP_LED0_STATE_BLINK    0x3UL
-	#define PORT_LED_QCFG_RESP_LED0_STATE_BLINKALT 0x4UL
-	#define PORT_LED_QCFG_RESP_LED0_STATE_LAST    PORT_LED_QCFG_RESP_LED0_STATE_BLINKALT
-	u8	led0_color;
-	#define PORT_LED_QCFG_RESP_LED0_COLOR_DEFAULT    0x0UL
-	#define PORT_LED_QCFG_RESP_LED0_COLOR_AMBER      0x1UL
-	#define PORT_LED_QCFG_RESP_LED0_COLOR_GREEN      0x2UL
-	#define PORT_LED_QCFG_RESP_LED0_COLOR_GREENAMBER 0x3UL
-	#define PORT_LED_QCFG_RESP_LED0_COLOR_LAST      PORT_LED_QCFG_RESP_LED0_COLOR_GREENAMBER
-	u8	unused_0;
-	__le16	led0_blink_on;
-	__le16	led0_blink_off;
-	u8	led0_group_id;
-	u8	led1_id;
-	u8	led1_type;
-	#define PORT_LED_QCFG_RESP_LED1_TYPE_SPEED    0x0UL
-	#define PORT_LED_QCFG_RESP_LED1_TYPE_ACTIVITY 0x1UL
-	#define PORT_LED_QCFG_RESP_LED1_TYPE_INVALID  0xffUL
-	#define PORT_LED_QCFG_RESP_LED1_TYPE_LAST    PORT_LED_QCFG_RESP_LED1_TYPE_INVALID
-	u8	led1_state;
-	#define PORT_LED_QCFG_RESP_LED1_STATE_DEFAULT  0x0UL
-	#define PORT_LED_QCFG_RESP_LED1_STATE_OFF      0x1UL
-	#define PORT_LED_QCFG_RESP_LED1_STATE_ON       0x2UL
-	#define PORT_LED_QCFG_RESP_LED1_STATE_BLINK    0x3UL
-	#define PORT_LED_QCFG_RESP_LED1_STATE_BLINKALT 0x4UL
-	#define PORT_LED_QCFG_RESP_LED1_STATE_LAST    PORT_LED_QCFG_RESP_LED1_STATE_BLINKALT
-	u8	led1_color;
-	#define PORT_LED_QCFG_RESP_LED1_COLOR_DEFAULT    0x0UL
-	#define PORT_LED_QCFG_RESP_LED1_COLOR_AMBER      0x1UL
-	#define PORT_LED_QCFG_RESP_LED1_COLOR_GREEN      0x2UL
-	#define PORT_LED_QCFG_RESP_LED1_COLOR_GREENAMBER 0x3UL
-	#define PORT_LED_QCFG_RESP_LED1_COLOR_LAST      PORT_LED_QCFG_RESP_LED1_COLOR_GREENAMBER
-	u8	unused_1;
-	__le16	led1_blink_on;
-	__le16	led1_blink_off;
-	u8	led1_group_id;
-	u8	led2_id;
-	u8	led2_type;
-	#define PORT_LED_QCFG_RESP_LED2_TYPE_SPEED    0x0UL
-	#define PORT_LED_QCFG_RESP_LED2_TYPE_ACTIVITY 0x1UL
-	#define PORT_LED_QCFG_RESP_LED2_TYPE_INVALID  0xffUL
-	#define PORT_LED_QCFG_RESP_LED2_TYPE_LAST    PORT_LED_QCFG_RESP_LED2_TYPE_INVALID
-	u8	led2_state;
-	#define PORT_LED_QCFG_RESP_LED2_STATE_DEFAULT  0x0UL
-	#define PORT_LED_QCFG_RESP_LED2_STATE_OFF      0x1UL
-	#define PORT_LED_QCFG_RESP_LED2_STATE_ON       0x2UL
-	#define PORT_LED_QCFG_RESP_LED2_STATE_BLINK    0x3UL
-	#define PORT_LED_QCFG_RESP_LED2_STATE_BLINKALT 0x4UL
-	#define PORT_LED_QCFG_RESP_LED2_STATE_LAST    PORT_LED_QCFG_RESP_LED2_STATE_BLINKALT
-	u8	led2_color;
-	#define PORT_LED_QCFG_RESP_LED2_COLOR_DEFAULT    0x0UL
-	#define PORT_LED_QCFG_RESP_LED2_COLOR_AMBER      0x1UL
-	#define PORT_LED_QCFG_RESP_LED2_COLOR_GREEN      0x2UL
-	#define PORT_LED_QCFG_RESP_LED2_COLOR_GREENAMBER 0x3UL
-	#define PORT_LED_QCFG_RESP_LED2_COLOR_LAST      PORT_LED_QCFG_RESP_LED2_COLOR_GREENAMBER
-	u8	unused_2;
-	__le16	led2_blink_on;
-	__le16	led2_blink_off;
-	u8	led2_group_id;
-	u8	led3_id;
-	u8	led3_type;
-	#define PORT_LED_QCFG_RESP_LED3_TYPE_SPEED    0x0UL
-	#define PORT_LED_QCFG_RESP_LED3_TYPE_ACTIVITY 0x1UL
-	#define PORT_LED_QCFG_RESP_LED3_TYPE_INVALID  0xffUL
-	#define PORT_LED_QCFG_RESP_LED3_TYPE_LAST    PORT_LED_QCFG_RESP_LED3_TYPE_INVALID
-	u8	led3_state;
-	#define PORT_LED_QCFG_RESP_LED3_STATE_DEFAULT  0x0UL
-	#define PORT_LED_QCFG_RESP_LED3_STATE_OFF      0x1UL
-	#define PORT_LED_QCFG_RESP_LED3_STATE_ON       0x2UL
-	#define PORT_LED_QCFG_RESP_LED3_STATE_BLINK    0x3UL
-	#define PORT_LED_QCFG_RESP_LED3_STATE_BLINKALT 0x4UL
-	#define PORT_LED_QCFG_RESP_LED3_STATE_LAST    PORT_LED_QCFG_RESP_LED3_STATE_BLINKALT
-	u8	led3_color;
-	#define PORT_LED_QCFG_RESP_LED3_COLOR_DEFAULT    0x0UL
-	#define PORT_LED_QCFG_RESP_LED3_COLOR_AMBER      0x1UL
-	#define PORT_LED_QCFG_RESP_LED3_COLOR_GREEN      0x2UL
-	#define PORT_LED_QCFG_RESP_LED3_COLOR_GREENAMBER 0x3UL
-	#define PORT_LED_QCFG_RESP_LED3_COLOR_LAST      PORT_LED_QCFG_RESP_LED3_COLOR_GREENAMBER
-	u8	unused_3;
-	__le16	led3_blink_on;
-	__le16	led3_blink_off;
-	u8	led3_group_id;
-	u8	unused_4[6];
-	u8	valid;
-};
-
-/* hwrm_port_led_qcaps_input (size:192b/24B) */
-struct hwrm_port_led_qcaps_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	port_id;
-	u8	unused_0[6];
-};
-
-/* hwrm_port_led_qcaps_output (size:384b/48B) */
-struct hwrm_port_led_qcaps_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	num_leds;
-	u8	unused[3];
-	u8	led0_id;
-	u8	led0_type;
-	#define PORT_LED_QCAPS_RESP_LED0_TYPE_SPEED    0x0UL
-	#define PORT_LED_QCAPS_RESP_LED0_TYPE_ACTIVITY 0x1UL
-	#define PORT_LED_QCAPS_RESP_LED0_TYPE_INVALID  0xffUL
-	#define PORT_LED_QCAPS_RESP_LED0_TYPE_LAST    PORT_LED_QCAPS_RESP_LED0_TYPE_INVALID
-	u8	led0_group_id;
-	u8	unused_0;
-	__le16	led0_state_caps;
-	#define PORT_LED_QCAPS_RESP_LED0_STATE_CAPS_ENABLED                 0x1UL
-	#define PORT_LED_QCAPS_RESP_LED0_STATE_CAPS_OFF_SUPPORTED           0x2UL
-	#define PORT_LED_QCAPS_RESP_LED0_STATE_CAPS_ON_SUPPORTED            0x4UL
-	#define PORT_LED_QCAPS_RESP_LED0_STATE_CAPS_BLINK_SUPPORTED         0x8UL
-	#define PORT_LED_QCAPS_RESP_LED0_STATE_CAPS_BLINK_ALT_SUPPORTED     0x10UL
-	__le16	led0_color_caps;
-	#define PORT_LED_QCAPS_RESP_LED0_COLOR_CAPS_RSVD                0x1UL
-	#define PORT_LED_QCAPS_RESP_LED0_COLOR_CAPS_AMBER_SUPPORTED     0x2UL
-	#define PORT_LED_QCAPS_RESP_LED0_COLOR_CAPS_GREEN_SUPPORTED     0x4UL
-	u8	led1_id;
-	u8	led1_type;
-	#define PORT_LED_QCAPS_RESP_LED1_TYPE_SPEED    0x0UL
-	#define PORT_LED_QCAPS_RESP_LED1_TYPE_ACTIVITY 0x1UL
-	#define PORT_LED_QCAPS_RESP_LED1_TYPE_INVALID  0xffUL
-	#define PORT_LED_QCAPS_RESP_LED1_TYPE_LAST    PORT_LED_QCAPS_RESP_LED1_TYPE_INVALID
-	u8	led1_group_id;
-	u8	unused_1;
-	__le16	led1_state_caps;
-	#define PORT_LED_QCAPS_RESP_LED1_STATE_CAPS_ENABLED                 0x1UL
-	#define PORT_LED_QCAPS_RESP_LED1_STATE_CAPS_OFF_SUPPORTED           0x2UL
-	#define PORT_LED_QCAPS_RESP_LED1_STATE_CAPS_ON_SUPPORTED            0x4UL
-	#define PORT_LED_QCAPS_RESP_LED1_STATE_CAPS_BLINK_SUPPORTED         0x8UL
-	#define PORT_LED_QCAPS_RESP_LED1_STATE_CAPS_BLINK_ALT_SUPPORTED     0x10UL
-	__le16	led1_color_caps;
-	#define PORT_LED_QCAPS_RESP_LED1_COLOR_CAPS_RSVD                0x1UL
-	#define PORT_LED_QCAPS_RESP_LED1_COLOR_CAPS_AMBER_SUPPORTED     0x2UL
-	#define PORT_LED_QCAPS_RESP_LED1_COLOR_CAPS_GREEN_SUPPORTED     0x4UL
-	u8	led2_id;
-	u8	led2_type;
-	#define PORT_LED_QCAPS_RESP_LED2_TYPE_SPEED    0x0UL
-	#define PORT_LED_QCAPS_RESP_LED2_TYPE_ACTIVITY 0x1UL
-	#define PORT_LED_QCAPS_RESP_LED2_TYPE_INVALID  0xffUL
-	#define PORT_LED_QCAPS_RESP_LED2_TYPE_LAST    PORT_LED_QCAPS_RESP_LED2_TYPE_INVALID
-	u8	led2_group_id;
-	u8	unused_2;
-	__le16	led2_state_caps;
-	#define PORT_LED_QCAPS_RESP_LED2_STATE_CAPS_ENABLED                 0x1UL
-	#define PORT_LED_QCAPS_RESP_LED2_STATE_CAPS_OFF_SUPPORTED           0x2UL
-	#define PORT_LED_QCAPS_RESP_LED2_STATE_CAPS_ON_SUPPORTED            0x4UL
-	#define PORT_LED_QCAPS_RESP_LED2_STATE_CAPS_BLINK_SUPPORTED         0x8UL
-	#define PORT_LED_QCAPS_RESP_LED2_STATE_CAPS_BLINK_ALT_SUPPORTED     0x10UL
-	__le16	led2_color_caps;
-	#define PORT_LED_QCAPS_RESP_LED2_COLOR_CAPS_RSVD                0x1UL
-	#define PORT_LED_QCAPS_RESP_LED2_COLOR_CAPS_AMBER_SUPPORTED     0x2UL
-	#define PORT_LED_QCAPS_RESP_LED2_COLOR_CAPS_GREEN_SUPPORTED     0x4UL
-	u8	led3_id;
-	u8	led3_type;
-	#define PORT_LED_QCAPS_RESP_LED3_TYPE_SPEED    0x0UL
-	#define PORT_LED_QCAPS_RESP_LED3_TYPE_ACTIVITY 0x1UL
-	#define PORT_LED_QCAPS_RESP_LED3_TYPE_INVALID  0xffUL
-	#define PORT_LED_QCAPS_RESP_LED3_TYPE_LAST    PORT_LED_QCAPS_RESP_LED3_TYPE_INVALID
-	u8	led3_group_id;
-	u8	unused_3;
-	__le16	led3_state_caps;
-	#define PORT_LED_QCAPS_RESP_LED3_STATE_CAPS_ENABLED                 0x1UL
-	#define PORT_LED_QCAPS_RESP_LED3_STATE_CAPS_OFF_SUPPORTED           0x2UL
-	#define PORT_LED_QCAPS_RESP_LED3_STATE_CAPS_ON_SUPPORTED            0x4UL
-	#define PORT_LED_QCAPS_RESP_LED3_STATE_CAPS_BLINK_SUPPORTED         0x8UL
-	#define PORT_LED_QCAPS_RESP_LED3_STATE_CAPS_BLINK_ALT_SUPPORTED     0x10UL
-	__le16	led3_color_caps;
-	#define PORT_LED_QCAPS_RESP_LED3_COLOR_CAPS_RSVD                0x1UL
-	#define PORT_LED_QCAPS_RESP_LED3_COLOR_CAPS_AMBER_SUPPORTED     0x2UL
-	#define PORT_LED_QCAPS_RESP_LED3_COLOR_CAPS_GREEN_SUPPORTED     0x4UL
-	u8	unused_4[3];
-	u8	valid;
-};
-
-/* hwrm_port_mac_qcaps_input (size:192b/24B) */
-struct hwrm_port_mac_qcaps_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	port_id;
-	u8	unused_0[6];
-};
-
-/* hwrm_port_mac_qcaps_output (size:128b/16B) */
-struct hwrm_port_mac_qcaps_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	flags;
-	#define PORT_MAC_QCAPS_RESP_FLAGS_LOCAL_LPBK_NOT_SUPPORTED     0x1UL
-	#define PORT_MAC_QCAPS_RESP_FLAGS_REMOTE_LPBK_SUPPORTED        0x2UL
-	u8	unused_0[6];
-	u8	valid;
-};
-
-/* hwrm_queue_qportcfg_input (size:192b/24B) */
-struct hwrm_queue_qportcfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	flags;
-	#define QUEUE_QPORTCFG_REQ_FLAGS_PATH     0x1UL
-	#define QUEUE_QPORTCFG_REQ_FLAGS_PATH_TX    0x0UL
-	#define QUEUE_QPORTCFG_REQ_FLAGS_PATH_RX    0x1UL
-	#define QUEUE_QPORTCFG_REQ_FLAGS_PATH_LAST QUEUE_QPORTCFG_REQ_FLAGS_PATH_RX
-	__le16	port_id;
-	u8	drv_qmap_cap;
-	#define QUEUE_QPORTCFG_REQ_DRV_QMAP_CAP_DISABLED 0x0UL
-	#define QUEUE_QPORTCFG_REQ_DRV_QMAP_CAP_ENABLED  0x1UL
-	#define QUEUE_QPORTCFG_REQ_DRV_QMAP_CAP_LAST    QUEUE_QPORTCFG_REQ_DRV_QMAP_CAP_ENABLED
-	u8	unused_0;
-};
-
-/* hwrm_queue_qportcfg_output (size:1344b/168B) */
-struct hwrm_queue_qportcfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	max_configurable_queues;
-	u8	max_configurable_lossless_queues;
-	u8	queue_cfg_allowed;
-	u8	queue_cfg_info;
-	#define QUEUE_QPORTCFG_RESP_QUEUE_CFG_INFO_ASYM_CFG             0x1UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_CFG_INFO_USE_PROFILE_TYPE     0x2UL
-	u8	queue_pfcenable_cfg_allowed;
-	u8	queue_pri2cos_cfg_allowed;
-	u8	queue_cos2bw_cfg_allowed;
-	u8	queue_id0;
-	u8	queue_id0_service_profile;
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID0_SERVICE_PROFILE_LOSSY          0x0UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID0_SERVICE_PROFILE_LOSSLESS       0x1UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID0_SERVICE_PROFILE_LOSSLESS_ROCE  0x1UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID0_SERVICE_PROFILE_LOSSY_ROCE_CNP 0x2UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID0_SERVICE_PROFILE_LOSSLESS_NIC   0x3UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID0_SERVICE_PROFILE_UNKNOWN        0xffUL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID0_SERVICE_PROFILE_LAST          QUEUE_QPORTCFG_RESP_QUEUE_ID0_SERVICE_PROFILE_UNKNOWN
-	u8	queue_id1;
-	u8	queue_id1_service_profile;
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID1_SERVICE_PROFILE_LOSSY          0x0UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID1_SERVICE_PROFILE_LOSSLESS       0x1UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID1_SERVICE_PROFILE_LOSSLESS_ROCE  0x1UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID1_SERVICE_PROFILE_LOSSY_ROCE_CNP 0x2UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID1_SERVICE_PROFILE_LOSSLESS_NIC   0x3UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID1_SERVICE_PROFILE_UNKNOWN        0xffUL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID1_SERVICE_PROFILE_LAST          QUEUE_QPORTCFG_RESP_QUEUE_ID1_SERVICE_PROFILE_UNKNOWN
-	u8	queue_id2;
-	u8	queue_id2_service_profile;
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID2_SERVICE_PROFILE_LOSSY          0x0UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID2_SERVICE_PROFILE_LOSSLESS       0x1UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID2_SERVICE_PROFILE_LOSSLESS_ROCE  0x1UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID2_SERVICE_PROFILE_LOSSY_ROCE_CNP 0x2UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID2_SERVICE_PROFILE_LOSSLESS_NIC   0x3UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID2_SERVICE_PROFILE_UNKNOWN        0xffUL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID2_SERVICE_PROFILE_LAST          QUEUE_QPORTCFG_RESP_QUEUE_ID2_SERVICE_PROFILE_UNKNOWN
-	u8	queue_id3;
-	u8	queue_id3_service_profile;
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID3_SERVICE_PROFILE_LOSSY          0x0UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID3_SERVICE_PROFILE_LOSSLESS       0x1UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID3_SERVICE_PROFILE_LOSSLESS_ROCE  0x1UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID3_SERVICE_PROFILE_LOSSY_ROCE_CNP 0x2UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID3_SERVICE_PROFILE_LOSSLESS_NIC   0x3UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID3_SERVICE_PROFILE_UNKNOWN        0xffUL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID3_SERVICE_PROFILE_LAST          QUEUE_QPORTCFG_RESP_QUEUE_ID3_SERVICE_PROFILE_UNKNOWN
-	u8	queue_id4;
-	u8	queue_id4_service_profile;
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID4_SERVICE_PROFILE_LOSSY          0x0UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID4_SERVICE_PROFILE_LOSSLESS       0x1UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID4_SERVICE_PROFILE_LOSSLESS_ROCE  0x1UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID4_SERVICE_PROFILE_LOSSY_ROCE_CNP 0x2UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID4_SERVICE_PROFILE_LOSSLESS_NIC   0x3UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID4_SERVICE_PROFILE_UNKNOWN        0xffUL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID4_SERVICE_PROFILE_LAST          QUEUE_QPORTCFG_RESP_QUEUE_ID4_SERVICE_PROFILE_UNKNOWN
-	u8	queue_id5;
-	u8	queue_id5_service_profile;
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID5_SERVICE_PROFILE_LOSSY          0x0UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID5_SERVICE_PROFILE_LOSSLESS       0x1UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID5_SERVICE_PROFILE_LOSSLESS_ROCE  0x1UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID5_SERVICE_PROFILE_LOSSY_ROCE_CNP 0x2UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID5_SERVICE_PROFILE_LOSSLESS_NIC   0x3UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID5_SERVICE_PROFILE_UNKNOWN        0xffUL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID5_SERVICE_PROFILE_LAST          QUEUE_QPORTCFG_RESP_QUEUE_ID5_SERVICE_PROFILE_UNKNOWN
-	u8	queue_id6;
-	u8	queue_id6_service_profile;
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID6_SERVICE_PROFILE_LOSSY          0x0UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID6_SERVICE_PROFILE_LOSSLESS       0x1UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID6_SERVICE_PROFILE_LOSSLESS_ROCE  0x1UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID6_SERVICE_PROFILE_LOSSY_ROCE_CNP 0x2UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID6_SERVICE_PROFILE_LOSSLESS_NIC   0x3UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID6_SERVICE_PROFILE_UNKNOWN        0xffUL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID6_SERVICE_PROFILE_LAST          QUEUE_QPORTCFG_RESP_QUEUE_ID6_SERVICE_PROFILE_UNKNOWN
-	u8	queue_id7;
-	u8	queue_id7_service_profile;
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID7_SERVICE_PROFILE_LOSSY          0x0UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID7_SERVICE_PROFILE_LOSSLESS       0x1UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID7_SERVICE_PROFILE_LOSSLESS_ROCE  0x1UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID7_SERVICE_PROFILE_LOSSY_ROCE_CNP 0x2UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID7_SERVICE_PROFILE_LOSSLESS_NIC   0x3UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID7_SERVICE_PROFILE_UNKNOWN        0xffUL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID7_SERVICE_PROFILE_LAST          QUEUE_QPORTCFG_RESP_QUEUE_ID7_SERVICE_PROFILE_UNKNOWN
-	u8	queue_id0_service_profile_type;
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID0_SERVICE_PROFILE_TYPE_ROCE     0x1UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID0_SERVICE_PROFILE_TYPE_NIC      0x2UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID0_SERVICE_PROFILE_TYPE_CNP      0x4UL
-	char	qid0_name[16];
-	char	qid1_name[16];
-	char	qid2_name[16];
-	char	qid3_name[16];
-	char	qid4_name[16];
-	char	qid5_name[16];
-	char	qid6_name[16];
-	char	qid7_name[16];
-	u8	queue_id1_service_profile_type;
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID1_SERVICE_PROFILE_TYPE_ROCE     0x1UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID1_SERVICE_PROFILE_TYPE_NIC      0x2UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID1_SERVICE_PROFILE_TYPE_CNP      0x4UL
-	u8	queue_id2_service_profile_type;
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID2_SERVICE_PROFILE_TYPE_ROCE     0x1UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID2_SERVICE_PROFILE_TYPE_NIC      0x2UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID2_SERVICE_PROFILE_TYPE_CNP      0x4UL
-	u8	queue_id3_service_profile_type;
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID3_SERVICE_PROFILE_TYPE_ROCE     0x1UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID3_SERVICE_PROFILE_TYPE_NIC      0x2UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID3_SERVICE_PROFILE_TYPE_CNP      0x4UL
-	u8	queue_id4_service_profile_type;
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID4_SERVICE_PROFILE_TYPE_ROCE     0x1UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID4_SERVICE_PROFILE_TYPE_NIC      0x2UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID4_SERVICE_PROFILE_TYPE_CNP      0x4UL
-	u8	queue_id5_service_profile_type;
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID5_SERVICE_PROFILE_TYPE_ROCE     0x1UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID5_SERVICE_PROFILE_TYPE_NIC      0x2UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID5_SERVICE_PROFILE_TYPE_CNP      0x4UL
-	u8	queue_id6_service_profile_type;
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID6_SERVICE_PROFILE_TYPE_ROCE     0x1UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID6_SERVICE_PROFILE_TYPE_NIC      0x2UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID6_SERVICE_PROFILE_TYPE_CNP      0x4UL
-	u8	queue_id7_service_profile_type;
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID7_SERVICE_PROFILE_TYPE_ROCE     0x1UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID7_SERVICE_PROFILE_TYPE_NIC      0x2UL
-	#define QUEUE_QPORTCFG_RESP_QUEUE_ID7_SERVICE_PROFILE_TYPE_CNP      0x4UL
-	u8	valid;
-};
-
-/* hwrm_queue_qcfg_input (size:192b/24B) */
-struct hwrm_queue_qcfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	flags;
-	#define QUEUE_QCFG_REQ_FLAGS_PATH     0x1UL
-	#define QUEUE_QCFG_REQ_FLAGS_PATH_TX    0x0UL
-	#define QUEUE_QCFG_REQ_FLAGS_PATH_RX    0x1UL
-	#define QUEUE_QCFG_REQ_FLAGS_PATH_LAST QUEUE_QCFG_REQ_FLAGS_PATH_RX
-	__le32	queue_id;
-};
-
-/* hwrm_queue_qcfg_output (size:128b/16B) */
-struct hwrm_queue_qcfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le32	queue_len;
-	u8	service_profile;
-	#define QUEUE_QCFG_RESP_SERVICE_PROFILE_LOSSY    0x0UL
-	#define QUEUE_QCFG_RESP_SERVICE_PROFILE_LOSSLESS 0x1UL
-	#define QUEUE_QCFG_RESP_SERVICE_PROFILE_UNKNOWN  0xffUL
-	#define QUEUE_QCFG_RESP_SERVICE_PROFILE_LAST    QUEUE_QCFG_RESP_SERVICE_PROFILE_UNKNOWN
-	u8	queue_cfg_info;
-	#define QUEUE_QCFG_RESP_QUEUE_CFG_INFO_ASYM_CFG     0x1UL
-	u8	unused_0;
-	u8	valid;
-};
-
-/* hwrm_queue_cfg_input (size:320b/40B) */
-struct hwrm_queue_cfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	flags;
-	#define QUEUE_CFG_REQ_FLAGS_PATH_MASK 0x3UL
-	#define QUEUE_CFG_REQ_FLAGS_PATH_SFT  0
-	#define QUEUE_CFG_REQ_FLAGS_PATH_TX     0x0UL
-	#define QUEUE_CFG_REQ_FLAGS_PATH_RX     0x1UL
-	#define QUEUE_CFG_REQ_FLAGS_PATH_BIDIR  0x2UL
-	#define QUEUE_CFG_REQ_FLAGS_PATH_LAST  QUEUE_CFG_REQ_FLAGS_PATH_BIDIR
-	__le32	enables;
-	#define QUEUE_CFG_REQ_ENABLES_DFLT_LEN            0x1UL
-	#define QUEUE_CFG_REQ_ENABLES_SERVICE_PROFILE     0x2UL
-	__le32	queue_id;
-	__le32	dflt_len;
-	u8	service_profile;
-	#define QUEUE_CFG_REQ_SERVICE_PROFILE_LOSSY    0x0UL
-	#define QUEUE_CFG_REQ_SERVICE_PROFILE_LOSSLESS 0x1UL
-	#define QUEUE_CFG_REQ_SERVICE_PROFILE_UNKNOWN  0xffUL
-	#define QUEUE_CFG_REQ_SERVICE_PROFILE_LAST    QUEUE_CFG_REQ_SERVICE_PROFILE_UNKNOWN
-	u8	unused_0[7];
-};
-
-/* hwrm_queue_cfg_output (size:128b/16B) */
-struct hwrm_queue_cfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_queue_pfcenable_qcfg_input (size:192b/24B) */
-struct hwrm_queue_pfcenable_qcfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	port_id;
-	u8	unused_0[6];
-};
-
-/* hwrm_queue_pfcenable_qcfg_output (size:128b/16B) */
-struct hwrm_queue_pfcenable_qcfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le32	flags;
-	#define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI0_PFC_ENABLED              0x1UL
-	#define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI1_PFC_ENABLED              0x2UL
-	#define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI2_PFC_ENABLED              0x4UL
-	#define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI3_PFC_ENABLED              0x8UL
-	#define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI4_PFC_ENABLED              0x10UL
-	#define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI5_PFC_ENABLED              0x20UL
-	#define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI6_PFC_ENABLED              0x40UL
-	#define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI7_PFC_ENABLED              0x80UL
-	#define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI0_PFC_WATCHDOG_ENABLED     0x100UL
-	#define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI1_PFC_WATCHDOG_ENABLED     0x200UL
-	#define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI2_PFC_WATCHDOG_ENABLED     0x400UL
-	#define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI3_PFC_WATCHDOG_ENABLED     0x800UL
-	#define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI4_PFC_WATCHDOG_ENABLED     0x1000UL
-	#define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI5_PFC_WATCHDOG_ENABLED     0x2000UL
-	#define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI6_PFC_WATCHDOG_ENABLED     0x4000UL
-	#define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI7_PFC_WATCHDOG_ENABLED     0x8000UL
-	u8	unused_0[3];
-	u8	valid;
-};
-
-/* hwrm_queue_pfcenable_cfg_input (size:192b/24B) */
-struct hwrm_queue_pfcenable_cfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	flags;
-	#define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI0_PFC_ENABLED              0x1UL
-	#define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI1_PFC_ENABLED              0x2UL
-	#define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI2_PFC_ENABLED              0x4UL
-	#define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI3_PFC_ENABLED              0x8UL
-	#define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI4_PFC_ENABLED              0x10UL
-	#define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI5_PFC_ENABLED              0x20UL
-	#define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI6_PFC_ENABLED              0x40UL
-	#define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI7_PFC_ENABLED              0x80UL
-	#define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI0_PFC_WATCHDOG_ENABLED     0x100UL
-	#define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI1_PFC_WATCHDOG_ENABLED     0x200UL
-	#define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI2_PFC_WATCHDOG_ENABLED     0x400UL
-	#define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI3_PFC_WATCHDOG_ENABLED     0x800UL
-	#define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI4_PFC_WATCHDOG_ENABLED     0x1000UL
-	#define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI5_PFC_WATCHDOG_ENABLED     0x2000UL
-	#define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI6_PFC_WATCHDOG_ENABLED     0x4000UL
-	#define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI7_PFC_WATCHDOG_ENABLED     0x8000UL
-	__le16	port_id;
-	u8	unused_0[2];
-};
-
-/* hwrm_queue_pfcenable_cfg_output (size:128b/16B) */
-struct hwrm_queue_pfcenable_cfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_queue_pri2cos_qcfg_input (size:192b/24B) */
-struct hwrm_queue_pri2cos_qcfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	flags;
-	#define QUEUE_PRI2COS_QCFG_REQ_FLAGS_PATH      0x1UL
-	#define QUEUE_PRI2COS_QCFG_REQ_FLAGS_PATH_TX     0x0UL
-	#define QUEUE_PRI2COS_QCFG_REQ_FLAGS_PATH_RX     0x1UL
-	#define QUEUE_PRI2COS_QCFG_REQ_FLAGS_PATH_LAST  QUEUE_PRI2COS_QCFG_REQ_FLAGS_PATH_RX
-	#define QUEUE_PRI2COS_QCFG_REQ_FLAGS_IVLAN     0x2UL
-	u8	port_id;
-	u8	unused_0[3];
-};
-
-/* hwrm_queue_pri2cos_qcfg_output (size:192b/24B) */
-struct hwrm_queue_pri2cos_qcfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	pri0_cos_queue_id;
-	u8	pri1_cos_queue_id;
-	u8	pri2_cos_queue_id;
-	u8	pri3_cos_queue_id;
-	u8	pri4_cos_queue_id;
-	u8	pri5_cos_queue_id;
-	u8	pri6_cos_queue_id;
-	u8	pri7_cos_queue_id;
-	u8	queue_cfg_info;
-	#define QUEUE_PRI2COS_QCFG_RESP_QUEUE_CFG_INFO_ASYM_CFG     0x1UL
-	u8	unused_0[6];
-	u8	valid;
-};
-
-/* hwrm_queue_pri2cos_cfg_input (size:320b/40B) */
-struct hwrm_queue_pri2cos_cfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	flags;
-	#define QUEUE_PRI2COS_CFG_REQ_FLAGS_PATH_MASK 0x3UL
-	#define QUEUE_PRI2COS_CFG_REQ_FLAGS_PATH_SFT  0
-	#define QUEUE_PRI2COS_CFG_REQ_FLAGS_PATH_TX     0x0UL
-	#define QUEUE_PRI2COS_CFG_REQ_FLAGS_PATH_RX     0x1UL
-	#define QUEUE_PRI2COS_CFG_REQ_FLAGS_PATH_BIDIR  0x2UL
-	#define QUEUE_PRI2COS_CFG_REQ_FLAGS_PATH_LAST  QUEUE_PRI2COS_CFG_REQ_FLAGS_PATH_BIDIR
-	#define QUEUE_PRI2COS_CFG_REQ_FLAGS_IVLAN     0x4UL
-	__le32	enables;
-	#define QUEUE_PRI2COS_CFG_REQ_ENABLES_PRI0_COS_QUEUE_ID     0x1UL
-	#define QUEUE_PRI2COS_CFG_REQ_ENABLES_PRI1_COS_QUEUE_ID     0x2UL
-	#define QUEUE_PRI2COS_CFG_REQ_ENABLES_PRI2_COS_QUEUE_ID     0x4UL
-	#define QUEUE_PRI2COS_CFG_REQ_ENABLES_PRI3_COS_QUEUE_ID     0x8UL
-	#define QUEUE_PRI2COS_CFG_REQ_ENABLES_PRI4_COS_QUEUE_ID     0x10UL
-	#define QUEUE_PRI2COS_CFG_REQ_ENABLES_PRI5_COS_QUEUE_ID     0x20UL
-	#define QUEUE_PRI2COS_CFG_REQ_ENABLES_PRI6_COS_QUEUE_ID     0x40UL
-	#define QUEUE_PRI2COS_CFG_REQ_ENABLES_PRI7_COS_QUEUE_ID     0x80UL
-	u8	port_id;
-	u8	pri0_cos_queue_id;
-	u8	pri1_cos_queue_id;
-	u8	pri2_cos_queue_id;
-	u8	pri3_cos_queue_id;
-	u8	pri4_cos_queue_id;
-	u8	pri5_cos_queue_id;
-	u8	pri6_cos_queue_id;
-	u8	pri7_cos_queue_id;
-	u8	unused_0[7];
-};
-
-/* hwrm_queue_pri2cos_cfg_output (size:128b/16B) */
-struct hwrm_queue_pri2cos_cfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_queue_cos2bw_qcfg_input (size:192b/24B) */
-struct hwrm_queue_cos2bw_qcfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	port_id;
-	u8	unused_0[6];
-};
-
-/* hwrm_queue_cos2bw_qcfg_output (size:896b/112B) */
-struct hwrm_queue_cos2bw_qcfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	queue_id0;
-	u8	unused_0;
-	__le16	unused_1;
-	__le32	queue_id0_min_bw;
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_BW_VALUE_MASK             0xfffffffUL
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_BW_VALUE_SFT              0
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_SCALE                     0x10000000UL
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_SCALE_BITS                  (0x0UL << 28)
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_SCALE_BYTES                 (0x1UL << 28)
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_SCALE_LAST                 QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_SCALE_BYTES
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_MASK        0xe0000000UL
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_SFT         29
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_MEGA          (0x0UL << 29)
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_KILO          (0x2UL << 29)
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_BASE          (0x4UL << 29)
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_GIGA          (0x6UL << 29)
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_PERCENT1_100  (0x1UL << 29)
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_INVALID       (0x7UL << 29)
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_LAST         QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_INVALID
-	__le32	queue_id0_max_bw;
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_BW_VALUE_MASK             0xfffffffUL
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_BW_VALUE_SFT              0
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_SCALE                     0x10000000UL
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_SCALE_BITS                  (0x0UL << 28)
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_SCALE_BYTES                 (0x1UL << 28)
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_SCALE_LAST                 QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_SCALE_BYTES
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_MASK        0xe0000000UL
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_SFT         29
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_MEGA          (0x0UL << 29)
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_KILO          (0x2UL << 29)
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_BASE          (0x4UL << 29)
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_GIGA          (0x6UL << 29)
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_PERCENT1_100  (0x1UL << 29)
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_INVALID       (0x7UL << 29)
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_LAST         QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_INVALID
-	u8	queue_id0_tsa_assign;
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_TSA_ASSIGN_SP             0x0UL
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_TSA_ASSIGN_ETS            0x1UL
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_TSA_ASSIGN_RESERVED_FIRST 0x2UL
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_TSA_ASSIGN_RESERVED_LAST  0xffUL
-	u8	queue_id0_pri_lvl;
-	u8	queue_id0_bw_weight;
-	struct {
-		u8	queue_id;
-		__le32	queue_id_min_bw;
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MIN_BW_BW_VALUE_MASK             0xfffffffUL
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MIN_BW_BW_VALUE_SFT              0
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MIN_BW_SCALE                     0x10000000UL
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MIN_BW_SCALE_BITS                  (0x0UL << 28)
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MIN_BW_SCALE_BYTES                 (0x1UL << 28)
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MIN_BW_SCALE_LAST                 QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MIN_BW_SCALE_BYTES
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_MASK        0xe0000000UL
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_SFT         29
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_MEGA          (0x0UL << 29)
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_KILO          (0x2UL << 29)
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_BASE          (0x4UL << 29)
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_GIGA          (0x6UL << 29)
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_PERCENT1_100  (0x1UL << 29)
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_INVALID       (0x7UL << 29)
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_LAST         QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_INVALID
-		__le32	queue_id_max_bw;
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MAX_BW_BW_VALUE_MASK             0xfffffffUL
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MAX_BW_BW_VALUE_SFT              0
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MAX_BW_SCALE                     0x10000000UL
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MAX_BW_SCALE_BITS                  (0x0UL << 28)
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MAX_BW_SCALE_BYTES                 (0x1UL << 28)
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MAX_BW_SCALE_LAST                 QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MAX_BW_SCALE_BYTES
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_MASK        0xe0000000UL
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_SFT         29
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_MEGA          (0x0UL << 29)
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_KILO          (0x2UL << 29)
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_BASE          (0x4UL << 29)
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_GIGA          (0x6UL << 29)
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_PERCENT1_100  (0x1UL << 29)
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_INVALID       (0x7UL << 29)
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_LAST         QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_INVALID
-		u8	queue_id_tsa_assign;
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_TSA_ASSIGN_SP             0x0UL
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_TSA_ASSIGN_ETS            0x1UL
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_TSA_ASSIGN_RESERVED_FIRST 0x2UL
-	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_TSA_ASSIGN_RESERVED_LAST  0xffUL
-		u8	queue_id_pri_lvl;
-		u8	queue_id_bw_weight;
-	} __packed cfg[7];
-	u8	unused_2[4];
-	u8	valid;
-};
-
-/* hwrm_queue_cos2bw_cfg_input (size:1024b/128B) */
-struct hwrm_queue_cos2bw_cfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	flags;
-	__le32	enables;
-	#define QUEUE_COS2BW_CFG_REQ_ENABLES_COS_QUEUE_ID0_VALID     0x1UL
-	#define QUEUE_COS2BW_CFG_REQ_ENABLES_COS_QUEUE_ID1_VALID     0x2UL
-	#define QUEUE_COS2BW_CFG_REQ_ENABLES_COS_QUEUE_ID2_VALID     0x4UL
-	#define QUEUE_COS2BW_CFG_REQ_ENABLES_COS_QUEUE_ID3_VALID     0x8UL
-	#define QUEUE_COS2BW_CFG_REQ_ENABLES_COS_QUEUE_ID4_VALID     0x10UL
-	#define QUEUE_COS2BW_CFG_REQ_ENABLES_COS_QUEUE_ID5_VALID     0x20UL
-	#define QUEUE_COS2BW_CFG_REQ_ENABLES_COS_QUEUE_ID6_VALID     0x40UL
-	#define QUEUE_COS2BW_CFG_REQ_ENABLES_COS_QUEUE_ID7_VALID     0x80UL
-	__le16	port_id;
-	u8	queue_id0;
-	u8	unused_0;
-	__le32	queue_id0_min_bw;
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_MASK             0xfffffffUL
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_SFT              0
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_SCALE                     0x10000000UL
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_SCALE_BITS                  (0x0UL << 28)
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_SCALE_BYTES                 (0x1UL << 28)
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_SCALE_LAST                 QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_SCALE_BYTES
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_MASK        0xe0000000UL
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_SFT         29
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_MEGA          (0x0UL << 29)
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_KILO          (0x2UL << 29)
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_BASE          (0x4UL << 29)
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_GIGA          (0x6UL << 29)
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_PERCENT1_100  (0x1UL << 29)
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_INVALID       (0x7UL << 29)
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_LAST         QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_INVALID
-	__le32	queue_id0_max_bw;
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_BW_VALUE_MASK             0xfffffffUL
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_BW_VALUE_SFT              0
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_SCALE                     0x10000000UL
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_SCALE_BITS                  (0x0UL << 28)
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_SCALE_BYTES                 (0x1UL << 28)
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_SCALE_LAST                 QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_SCALE_BYTES
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_MASK        0xe0000000UL
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_SFT         29
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_MEGA          (0x0UL << 29)
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_KILO          (0x2UL << 29)
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_BASE          (0x4UL << 29)
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_GIGA          (0x6UL << 29)
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_PERCENT1_100  (0x1UL << 29)
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_INVALID       (0x7UL << 29)
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_LAST         QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_INVALID
-	u8	queue_id0_tsa_assign;
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_TSA_ASSIGN_SP             0x0UL
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_TSA_ASSIGN_ETS            0x1UL
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_TSA_ASSIGN_RESERVED_FIRST 0x2UL
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_TSA_ASSIGN_RESERVED_LAST  0xffUL
-	u8	queue_id0_pri_lvl;
-	u8	queue_id0_bw_weight;
-	struct {
-		u8	queue_id;
-		__le32	queue_id_min_bw;
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MIN_BW_BW_VALUE_MASK             0xfffffffUL
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MIN_BW_BW_VALUE_SFT              0
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MIN_BW_SCALE                     0x10000000UL
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MIN_BW_SCALE_BITS                  (0x0UL << 28)
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MIN_BW_SCALE_BYTES                 (0x1UL << 28)
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MIN_BW_SCALE_LAST                 QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MIN_BW_SCALE_BYTES
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_MASK        0xe0000000UL
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_SFT         29
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_MEGA          (0x0UL << 29)
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_KILO          (0x2UL << 29)
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_BASE          (0x4UL << 29)
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_GIGA          (0x6UL << 29)
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_PERCENT1_100  (0x1UL << 29)
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_INVALID       (0x7UL << 29)
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_LAST         QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_INVALID
-		__le32	queue_id_max_bw;
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MAX_BW_BW_VALUE_MASK             0xfffffffUL
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MAX_BW_BW_VALUE_SFT              0
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MAX_BW_SCALE                     0x10000000UL
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MAX_BW_SCALE_BITS                  (0x0UL << 28)
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MAX_BW_SCALE_BYTES                 (0x1UL << 28)
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MAX_BW_SCALE_LAST                 QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MAX_BW_SCALE_BYTES
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_MASK        0xe0000000UL
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_SFT         29
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_MEGA          (0x0UL << 29)
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_KILO          (0x2UL << 29)
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_BASE          (0x4UL << 29)
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_GIGA          (0x6UL << 29)
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_PERCENT1_100  (0x1UL << 29)
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_INVALID       (0x7UL << 29)
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_LAST         QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_INVALID
-		u8	queue_id_tsa_assign;
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_TSA_ASSIGN_SP             0x0UL
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_TSA_ASSIGN_ETS            0x1UL
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_TSA_ASSIGN_RESERVED_FIRST 0x2UL
-	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_TSA_ASSIGN_RESERVED_LAST  0xffUL
-		u8	queue_id_pri_lvl;
-		u8	queue_id_bw_weight;
-	} __packed cfg[7];
-	u8	unused_1[5];
-};
-
-/* hwrm_queue_cos2bw_cfg_output (size:128b/16B) */
-struct hwrm_queue_cos2bw_cfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_queue_dscp_qcaps_input (size:192b/24B) */
-struct hwrm_queue_dscp_qcaps_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	u8	port_id;
-	u8	unused_0[7];
-};
-
-/* hwrm_queue_dscp_qcaps_output (size:128b/16B) */
-struct hwrm_queue_dscp_qcaps_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	num_dscp_bits;
-	u8	unused_0;
-	__le16	max_entries;
-	u8	unused_1[3];
-	u8	valid;
-};
-
-/* hwrm_queue_dscp2pri_qcfg_input (size:256b/32B) */
-struct hwrm_queue_dscp2pri_qcfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le64	dest_data_addr;
-	u8	port_id;
-	u8	unused_0;
-	__le16	dest_data_buffer_size;
-	u8	unused_1[4];
-};
-
-/* hwrm_queue_dscp2pri_qcfg_output (size:128b/16B) */
-struct hwrm_queue_dscp2pri_qcfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le16	entry_cnt;
-	u8	default_pri;
-	u8	unused_0[4];
-	u8	valid;
-};
-
-/* hwrm_queue_dscp2pri_cfg_input (size:320b/40B) */
-struct hwrm_queue_dscp2pri_cfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le64	src_data_addr;
-	__le32	flags;
-	#define QUEUE_DSCP2PRI_CFG_REQ_FLAGS_USE_HW_DEFAULT_PRI     0x1UL
-	__le32	enables;
-	#define QUEUE_DSCP2PRI_CFG_REQ_ENABLES_DEFAULT_PRI     0x1UL
-	u8	port_id;
-	u8	default_pri;
-	__le16	entry_cnt;
-	u8	unused_0[4];
-};
-
-/* hwrm_queue_dscp2pri_cfg_output (size:128b/16B) */
-struct hwrm_queue_dscp2pri_cfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_vnic_alloc_input (size:192b/24B) */
-struct hwrm_vnic_alloc_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	flags;
-	#define VNIC_ALLOC_REQ_FLAGS_DEFAULT                  0x1UL
-	#define VNIC_ALLOC_REQ_FLAGS_VIRTIO_NET_FID_VALID     0x2UL
-	#define VNIC_ALLOC_REQ_FLAGS_VNIC_ID_VALID            0x4UL
-	__le16	virtio_net_fid;
-	__le16	vnic_id;
-};
-
-/* hwrm_vnic_alloc_output (size:128b/16B) */
-struct hwrm_vnic_alloc_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le32	vnic_id;
-	u8	unused_0[3];
-	u8	valid;
-};
-
-/* hwrm_vnic_update_input (size:256b/32B) */
-struct hwrm_vnic_update_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	vnic_id;
-	__le32	enables;
-	#define VNIC_UPDATE_REQ_ENABLES_VNIC_STATE_VALID               0x1UL
-	#define VNIC_UPDATE_REQ_ENABLES_MRU_VALID                      0x2UL
-	#define VNIC_UPDATE_REQ_ENABLES_METADATA_FORMAT_TYPE_VALID     0x4UL
-	u8	vnic_state;
-	#define VNIC_UPDATE_REQ_VNIC_STATE_NORMAL 0x0UL
-	#define VNIC_UPDATE_REQ_VNIC_STATE_DROP   0x1UL
-	#define VNIC_UPDATE_REQ_VNIC_STATE_LAST  VNIC_UPDATE_REQ_VNIC_STATE_DROP
-	u8	metadata_format_type;
-	#define VNIC_UPDATE_REQ_METADATA_FORMAT_TYPE_0 0x0UL
-	#define VNIC_UPDATE_REQ_METADATA_FORMAT_TYPE_1 0x1UL
-	#define VNIC_UPDATE_REQ_METADATA_FORMAT_TYPE_2 0x2UL
-	#define VNIC_UPDATE_REQ_METADATA_FORMAT_TYPE_3 0x3UL
-	#define VNIC_UPDATE_REQ_METADATA_FORMAT_TYPE_4 0x4UL
-	#define VNIC_UPDATE_REQ_METADATA_FORMAT_TYPE_LAST VNIC_UPDATE_REQ_METADATA_FORMAT_TYPE_4
-	__le16	mru;
-	u8	unused_1[4];
-};
-
-/* hwrm_vnic_update_output (size:128b/16B) */
-struct hwrm_vnic_update_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_vnic_free_input (size:192b/24B) */
-struct hwrm_vnic_free_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	vnic_id;
-	u8	unused_0[4];
-};
-
-/* hwrm_vnic_free_output (size:128b/16B) */
-struct hwrm_vnic_free_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_vnic_cfg_input (size:384b/48B) */
-struct hwrm_vnic_cfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	flags;
-	#define VNIC_CFG_REQ_FLAGS_DEFAULT                              0x1UL
-	#define VNIC_CFG_REQ_FLAGS_VLAN_STRIP_MODE                      0x2UL
-	#define VNIC_CFG_REQ_FLAGS_BD_STALL_MODE                        0x4UL
-	#define VNIC_CFG_REQ_FLAGS_ROCE_DUAL_VNIC_MODE                  0x8UL
-	#define VNIC_CFG_REQ_FLAGS_ROCE_ONLY_VNIC_MODE                  0x10UL
-	#define VNIC_CFG_REQ_FLAGS_RSS_DFLT_CR_MODE                     0x20UL
-	#define VNIC_CFG_REQ_FLAGS_ROCE_MIRRORING_CAPABLE_VNIC_MODE     0x40UL
-	#define VNIC_CFG_REQ_FLAGS_PORTCOS_MAPPING_MODE                 0x80UL
-	__le32	enables;
-	#define VNIC_CFG_REQ_ENABLES_DFLT_RING_GRP            0x1UL
-	#define VNIC_CFG_REQ_ENABLES_RSS_RULE                 0x2UL
-	#define VNIC_CFG_REQ_ENABLES_COS_RULE                 0x4UL
-	#define VNIC_CFG_REQ_ENABLES_LB_RULE                  0x8UL
-	#define VNIC_CFG_REQ_ENABLES_MRU                      0x10UL
-	#define VNIC_CFG_REQ_ENABLES_DEFAULT_RX_RING_ID       0x20UL
-	#define VNIC_CFG_REQ_ENABLES_DEFAULT_CMPL_RING_ID     0x40UL
-	#define VNIC_CFG_REQ_ENABLES_QUEUE_ID                 0x80UL
-	#define VNIC_CFG_REQ_ENABLES_RX_CSUM_V2_MODE          0x100UL
-	#define VNIC_CFG_REQ_ENABLES_L2_CQE_MODE              0x200UL
-	#define VNIC_CFG_REQ_ENABLES_RAW_QP_ID                0x400UL
-	__le16	vnic_id;
-	__le16	dflt_ring_grp;
-	__le16	rss_rule;
-	__le16	cos_rule;
-	__le16	lb_rule;
-	__le16	mru;
-	__le16	default_rx_ring_id;
-	__le16	default_cmpl_ring_id;
-	__le16	queue_id;
-	u8	rx_csum_v2_mode;
-	#define VNIC_CFG_REQ_RX_CSUM_V2_MODE_DEFAULT 0x0UL
-	#define VNIC_CFG_REQ_RX_CSUM_V2_MODE_ALL_OK  0x1UL
-	#define VNIC_CFG_REQ_RX_CSUM_V2_MODE_MAX     0x2UL
-	#define VNIC_CFG_REQ_RX_CSUM_V2_MODE_LAST   VNIC_CFG_REQ_RX_CSUM_V2_MODE_MAX
-	u8	l2_cqe_mode;
-	#define VNIC_CFG_REQ_L2_CQE_MODE_DEFAULT    0x0UL
-	#define VNIC_CFG_REQ_L2_CQE_MODE_COMPRESSED 0x1UL
-	#define VNIC_CFG_REQ_L2_CQE_MODE_MIXED      0x2UL
-	#define VNIC_CFG_REQ_L2_CQE_MODE_LAST      VNIC_CFG_REQ_L2_CQE_MODE_MIXED
-	__le32	raw_qp_id;
-};
-
-/* hwrm_vnic_cfg_output (size:128b/16B) */
-struct hwrm_vnic_cfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_vnic_qcaps_input (size:192b/24B) */
-struct hwrm_vnic_qcaps_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	enables;
-	u8	unused_0[4];
-};
-
-/* hwrm_vnic_qcaps_output (size:192b/24B) */
-struct hwrm_vnic_qcaps_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le16	mru;
-	u8	unused_0[2];
-	__le32	flags;
-	#define VNIC_QCAPS_RESP_FLAGS_UNUSED                                  0x1UL
-	#define VNIC_QCAPS_RESP_FLAGS_VLAN_STRIP_CAP                          0x2UL
-	#define VNIC_QCAPS_RESP_FLAGS_BD_STALL_CAP                            0x4UL
-	#define VNIC_QCAPS_RESP_FLAGS_ROCE_DUAL_VNIC_CAP                      0x8UL
-	#define VNIC_QCAPS_RESP_FLAGS_ROCE_ONLY_VNIC_CAP                      0x10UL
-	#define VNIC_QCAPS_RESP_FLAGS_RSS_DFLT_CR_CAP                         0x20UL
-	#define VNIC_QCAPS_RESP_FLAGS_ROCE_MIRRORING_CAPABLE_VNIC_CAP         0x40UL
-	#define VNIC_QCAPS_RESP_FLAGS_OUTERMOST_RSS_CAP                       0x80UL
-	#define VNIC_QCAPS_RESP_FLAGS_COS_ASSIGNMENT_CAP                      0x100UL
-	#define VNIC_QCAPS_RESP_FLAGS_RX_CMPL_V2_CAP                          0x200UL
-	#define VNIC_QCAPS_RESP_FLAGS_VNIC_STATE_CAP                          0x400UL
-	#define VNIC_QCAPS_RESP_FLAGS_VIRTIO_NET_VNIC_ALLOC_CAP               0x800UL
-	#define VNIC_QCAPS_RESP_FLAGS_METADATA_FORMAT_CAP                     0x1000UL
-	#define VNIC_QCAPS_RESP_FLAGS_RSS_STRICT_HASH_TYPE_CAP                0x2000UL
-	#define VNIC_QCAPS_RESP_FLAGS_RSS_HASH_TYPE_DELTA_CAP                 0x4000UL
-	#define VNIC_QCAPS_RESP_FLAGS_RING_SELECT_MODE_TOEPLITZ_CAP           0x8000UL
-	#define VNIC_QCAPS_RESP_FLAGS_RING_SELECT_MODE_XOR_CAP                0x10000UL
-	#define VNIC_QCAPS_RESP_FLAGS_RING_SELECT_MODE_TOEPLITZ_CHKSM_CAP     0x20000UL
-	#define VNIC_QCAPS_RESP_FLAGS_RSS_IPV6_FLOW_LABEL_CAP                 0x40000UL
-	#define VNIC_QCAPS_RESP_FLAGS_RX_CMPL_V3_CAP                          0x80000UL
-	#define VNIC_QCAPS_RESP_FLAGS_L2_CQE_MODE_CAP                         0x100000UL
-	#define VNIC_QCAPS_RESP_FLAGS_RSS_IPSEC_AH_SPI_IPV4_CAP               0x200000UL
-	#define VNIC_QCAPS_RESP_FLAGS_RSS_IPSEC_ESP_SPI_IPV4_CAP              0x400000UL
-	#define VNIC_QCAPS_RESP_FLAGS_RSS_IPSEC_AH_SPI_IPV6_CAP               0x800000UL
-	#define VNIC_QCAPS_RESP_FLAGS_RSS_IPSEC_ESP_SPI_IPV6_CAP              0x1000000UL
-	#define VNIC_QCAPS_RESP_FLAGS_OUTERMOST_RSS_TRUSTED_VF_CAP            0x2000000UL
-	#define VNIC_QCAPS_RESP_FLAGS_PORTCOS_MAPPING_MODE                    0x4000000UL
-	#define VNIC_QCAPS_RESP_FLAGS_RSS_PROF_TCAM_MODE_ENABLED              0x8000000UL
-	#define VNIC_QCAPS_RESP_FLAGS_VNIC_RSS_HASH_MODE_CAP                  0x10000000UL
-	#define VNIC_QCAPS_RESP_FLAGS_HW_TUNNEL_TPA_CAP                       0x20000000UL
-	#define VNIC_QCAPS_RESP_FLAGS_RE_FLUSH_CAP                            0x40000000UL
-	__le16	max_aggs_supported;
-	u8	unused_1[5];
-	u8	valid;
-};
-
-/* hwrm_vnic_tpa_cfg_input (size:384b/48B) */
-struct hwrm_vnic_tpa_cfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	flags;
-	#define VNIC_TPA_CFG_REQ_FLAGS_TPA                       0x1UL
-	#define VNIC_TPA_CFG_REQ_FLAGS_ENCAP_TPA                 0x2UL
-	#define VNIC_TPA_CFG_REQ_FLAGS_RSC_WND_UPDATE            0x4UL
-	#define VNIC_TPA_CFG_REQ_FLAGS_GRO                       0x8UL
-	#define VNIC_TPA_CFG_REQ_FLAGS_AGG_WITH_ECN              0x10UL
-	#define VNIC_TPA_CFG_REQ_FLAGS_AGG_WITH_SAME_GRE_SEQ     0x20UL
-	#define VNIC_TPA_CFG_REQ_FLAGS_GRO_IPID_CHECK            0x40UL
-	#define VNIC_TPA_CFG_REQ_FLAGS_GRO_TTL_CHECK             0x80UL
-	#define VNIC_TPA_CFG_REQ_FLAGS_AGG_PACK_AS_GRO           0x100UL
-	__le32	enables;
-	#define VNIC_TPA_CFG_REQ_ENABLES_MAX_AGG_SEGS      0x1UL
-	#define VNIC_TPA_CFG_REQ_ENABLES_MAX_AGGS          0x2UL
-	#define VNIC_TPA_CFG_REQ_ENABLES_MAX_AGG_TIMER     0x4UL
-	#define VNIC_TPA_CFG_REQ_ENABLES_MIN_AGG_LEN       0x8UL
-	#define VNIC_TPA_CFG_REQ_ENABLES_TNL_TPA_EN        0x10UL
-	__le16	vnic_id;
-	__le16	max_agg_segs;
-	#define VNIC_TPA_CFG_REQ_MAX_AGG_SEGS_1   0x0UL
-	#define VNIC_TPA_CFG_REQ_MAX_AGG_SEGS_2   0x1UL
-	#define VNIC_TPA_CFG_REQ_MAX_AGG_SEGS_4   0x2UL
-	#define VNIC_TPA_CFG_REQ_MAX_AGG_SEGS_8   0x3UL
-	#define VNIC_TPA_CFG_REQ_MAX_AGG_SEGS_MAX 0x1fUL
-	#define VNIC_TPA_CFG_REQ_MAX_AGG_SEGS_LAST VNIC_TPA_CFG_REQ_MAX_AGG_SEGS_MAX
-	__le16	max_aggs;
-	#define VNIC_TPA_CFG_REQ_MAX_AGGS_1   0x0UL
-	#define VNIC_TPA_CFG_REQ_MAX_AGGS_2   0x1UL
-	#define VNIC_TPA_CFG_REQ_MAX_AGGS_4   0x2UL
-	#define VNIC_TPA_CFG_REQ_MAX_AGGS_8   0x3UL
-	#define VNIC_TPA_CFG_REQ_MAX_AGGS_16  0x4UL
-	#define VNIC_TPA_CFG_REQ_MAX_AGGS_MAX 0x7UL
-	#define VNIC_TPA_CFG_REQ_MAX_AGGS_LAST VNIC_TPA_CFG_REQ_MAX_AGGS_MAX
-	u8	unused_0[2];
-	__le32	max_agg_timer;
-	__le32	min_agg_len;
-	__le32	tnl_tpa_en_bitmap;
-	#define VNIC_TPA_CFG_REQ_TNL_TPA_EN_BITMAP_VXLAN           0x1UL
-	#define VNIC_TPA_CFG_REQ_TNL_TPA_EN_BITMAP_GENEVE          0x2UL
-	#define VNIC_TPA_CFG_REQ_TNL_TPA_EN_BITMAP_NVGRE           0x4UL
-	#define VNIC_TPA_CFG_REQ_TNL_TPA_EN_BITMAP_GRE             0x8UL
-	#define VNIC_TPA_CFG_REQ_TNL_TPA_EN_BITMAP_IPV4            0x10UL
-	#define VNIC_TPA_CFG_REQ_TNL_TPA_EN_BITMAP_IPV6            0x20UL
-	#define VNIC_TPA_CFG_REQ_TNL_TPA_EN_BITMAP_VXLAN_GPE       0x40UL
-	#define VNIC_TPA_CFG_REQ_TNL_TPA_EN_BITMAP_VXLAN_CUST1     0x80UL
-	#define VNIC_TPA_CFG_REQ_TNL_TPA_EN_BITMAP_GRE_CUST1       0x100UL
-	#define VNIC_TPA_CFG_REQ_TNL_TPA_EN_BITMAP_UPAR1           0x200UL
-	#define VNIC_TPA_CFG_REQ_TNL_TPA_EN_BITMAP_UPAR2           0x400UL
-	#define VNIC_TPA_CFG_REQ_TNL_TPA_EN_BITMAP_UPAR3           0x800UL
-	#define VNIC_TPA_CFG_REQ_TNL_TPA_EN_BITMAP_UPAR4           0x1000UL
-	#define VNIC_TPA_CFG_REQ_TNL_TPA_EN_BITMAP_UPAR5           0x2000UL
-	#define VNIC_TPA_CFG_REQ_TNL_TPA_EN_BITMAP_UPAR6           0x4000UL
-	#define VNIC_TPA_CFG_REQ_TNL_TPA_EN_BITMAP_UPAR7           0x8000UL
-	#define VNIC_TPA_CFG_REQ_TNL_TPA_EN_BITMAP_UPAR8           0x10000UL
-	u8	unused_1[4];
-};
-
-/* hwrm_vnic_tpa_cfg_output (size:128b/16B) */
-struct hwrm_vnic_tpa_cfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_vnic_tpa_qcfg_input (size:192b/24B) */
-struct hwrm_vnic_tpa_qcfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	vnic_id;
-	u8	unused_0[6];
-};
-
-/* hwrm_vnic_tpa_qcfg_output (size:256b/32B) */
-struct hwrm_vnic_tpa_qcfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le32	flags;
-	#define VNIC_TPA_QCFG_RESP_FLAGS_TPA                       0x1UL
-	#define VNIC_TPA_QCFG_RESP_FLAGS_ENCAP_TPA                 0x2UL
-	#define VNIC_TPA_QCFG_RESP_FLAGS_RSC_WND_UPDATE            0x4UL
-	#define VNIC_TPA_QCFG_RESP_FLAGS_GRO                       0x8UL
-	#define VNIC_TPA_QCFG_RESP_FLAGS_AGG_WITH_ECN              0x10UL
-	#define VNIC_TPA_QCFG_RESP_FLAGS_AGG_WITH_SAME_GRE_SEQ     0x20UL
-	#define VNIC_TPA_QCFG_RESP_FLAGS_GRO_IPID_CHECK            0x40UL
-	#define VNIC_TPA_QCFG_RESP_FLAGS_GRO_TTL_CHECK             0x80UL
-	__le16	max_agg_segs;
-	#define VNIC_TPA_QCFG_RESP_MAX_AGG_SEGS_1   0x0UL
-	#define VNIC_TPA_QCFG_RESP_MAX_AGG_SEGS_2   0x1UL
-	#define VNIC_TPA_QCFG_RESP_MAX_AGG_SEGS_4   0x2UL
-	#define VNIC_TPA_QCFG_RESP_MAX_AGG_SEGS_8   0x3UL
-	#define VNIC_TPA_QCFG_RESP_MAX_AGG_SEGS_MAX 0x1fUL
-	#define VNIC_TPA_QCFG_RESP_MAX_AGG_SEGS_LAST VNIC_TPA_QCFG_RESP_MAX_AGG_SEGS_MAX
-	__le16	max_aggs;
-	#define VNIC_TPA_QCFG_RESP_MAX_AGGS_1   0x0UL
-	#define VNIC_TPA_QCFG_RESP_MAX_AGGS_2   0x1UL
-	#define VNIC_TPA_QCFG_RESP_MAX_AGGS_4   0x2UL
-	#define VNIC_TPA_QCFG_RESP_MAX_AGGS_8   0x3UL
-	#define VNIC_TPA_QCFG_RESP_MAX_AGGS_16  0x4UL
-	#define VNIC_TPA_QCFG_RESP_MAX_AGGS_MAX 0x7UL
-	#define VNIC_TPA_QCFG_RESP_MAX_AGGS_LAST VNIC_TPA_QCFG_RESP_MAX_AGGS_MAX
-	__le32	max_agg_timer;
-	__le32	min_agg_len;
-	__le32	tnl_tpa_en_bitmap;
-	#define VNIC_TPA_QCFG_RESP_TNL_TPA_EN_BITMAP_VXLAN           0x1UL
-	#define VNIC_TPA_QCFG_RESP_TNL_TPA_EN_BITMAP_GENEVE          0x2UL
-	#define VNIC_TPA_QCFG_RESP_TNL_TPA_EN_BITMAP_NVGRE           0x4UL
-	#define VNIC_TPA_QCFG_RESP_TNL_TPA_EN_BITMAP_GRE             0x8UL
-	#define VNIC_TPA_QCFG_RESP_TNL_TPA_EN_BITMAP_IPV4            0x10UL
-	#define VNIC_TPA_QCFG_RESP_TNL_TPA_EN_BITMAP_IPV6            0x20UL
-	#define VNIC_TPA_QCFG_RESP_TNL_TPA_EN_BITMAP_VXLAN_GPE       0x40UL
-	#define VNIC_TPA_QCFG_RESP_TNL_TPA_EN_BITMAP_VXLAN_CUST1     0x80UL
-	#define VNIC_TPA_QCFG_RESP_TNL_TPA_EN_BITMAP_GRE_CUST1       0x100UL
-	#define VNIC_TPA_QCFG_RESP_TNL_TPA_EN_BITMAP_UPAR1           0x200UL
-	#define VNIC_TPA_QCFG_RESP_TNL_TPA_EN_BITMAP_UPAR2           0x400UL
-	#define VNIC_TPA_QCFG_RESP_TNL_TPA_EN_BITMAP_UPAR3           0x800UL
-	#define VNIC_TPA_QCFG_RESP_TNL_TPA_EN_BITMAP_UPAR4           0x1000UL
-	#define VNIC_TPA_QCFG_RESP_TNL_TPA_EN_BITMAP_UPAR5           0x2000UL
-	#define VNIC_TPA_QCFG_RESP_TNL_TPA_EN_BITMAP_UPAR6           0x4000UL
-	#define VNIC_TPA_QCFG_RESP_TNL_TPA_EN_BITMAP_UPAR7           0x8000UL
-	#define VNIC_TPA_QCFG_RESP_TNL_TPA_EN_BITMAP_UPAR8           0x10000UL
-	u8	unused_0[3];
-	u8	valid;
-};
-
-/* hwrm_vnic_rss_cfg_input (size:384b/48B) */
-struct hwrm_vnic_rss_cfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	hash_type;
-	#define VNIC_RSS_CFG_REQ_HASH_TYPE_IPV4                0x1UL
-	#define VNIC_RSS_CFG_REQ_HASH_TYPE_TCP_IPV4            0x2UL
-	#define VNIC_RSS_CFG_REQ_HASH_TYPE_UDP_IPV4            0x4UL
-	#define VNIC_RSS_CFG_REQ_HASH_TYPE_IPV6                0x8UL
-	#define VNIC_RSS_CFG_REQ_HASH_TYPE_TCP_IPV6            0x10UL
-	#define VNIC_RSS_CFG_REQ_HASH_TYPE_UDP_IPV6            0x20UL
-	#define VNIC_RSS_CFG_REQ_HASH_TYPE_IPV6_FLOW_LABEL     0x40UL
-	#define VNIC_RSS_CFG_REQ_HASH_TYPE_AH_SPI_IPV4         0x80UL
-	#define VNIC_RSS_CFG_REQ_HASH_TYPE_ESP_SPI_IPV4        0x100UL
-	#define VNIC_RSS_CFG_REQ_HASH_TYPE_AH_SPI_IPV6         0x200UL
-	#define VNIC_RSS_CFG_REQ_HASH_TYPE_ESP_SPI_IPV6        0x400UL
-	__le16	vnic_id;
-	u8	ring_table_pair_index;
-	u8	hash_mode_flags;
-	#define VNIC_RSS_CFG_REQ_HASH_MODE_FLAGS_DEFAULT         0x1UL
-	#define VNIC_RSS_CFG_REQ_HASH_MODE_FLAGS_INNERMOST_4     0x2UL
-	#define VNIC_RSS_CFG_REQ_HASH_MODE_FLAGS_INNERMOST_2     0x4UL
-	#define VNIC_RSS_CFG_REQ_HASH_MODE_FLAGS_OUTERMOST_4     0x8UL
-	#define VNIC_RSS_CFG_REQ_HASH_MODE_FLAGS_OUTERMOST_2     0x10UL
-	__le64	ring_grp_tbl_addr;
-	__le64	hash_key_tbl_addr;
-	__le16	rss_ctx_idx;
-	u8	flags;
-	#define VNIC_RSS_CFG_REQ_FLAGS_HASH_TYPE_INCLUDE               0x1UL
-	#define VNIC_RSS_CFG_REQ_FLAGS_HASH_TYPE_EXCLUDE               0x2UL
-	#define VNIC_RSS_CFG_REQ_FLAGS_IPSEC_HASH_TYPE_CFG_SUPPORT     0x4UL
-	u8	ring_select_mode;
-	#define VNIC_RSS_CFG_REQ_RING_SELECT_MODE_TOEPLITZ          0x0UL
-	#define VNIC_RSS_CFG_REQ_RING_SELECT_MODE_XOR               0x1UL
-	#define VNIC_RSS_CFG_REQ_RING_SELECT_MODE_TOEPLITZ_CHECKSUM 0x2UL
-	#define VNIC_RSS_CFG_REQ_RING_SELECT_MODE_LAST             VNIC_RSS_CFG_REQ_RING_SELECT_MODE_TOEPLITZ_CHECKSUM
-	u8	unused_1[4];
-};
-
-/* hwrm_vnic_rss_cfg_output (size:128b/16B) */
-struct hwrm_vnic_rss_cfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_vnic_rss_cfg_cmd_err (size:64b/8B) */
-struct hwrm_vnic_rss_cfg_cmd_err {
-	u8	code;
-	#define VNIC_RSS_CFG_CMD_ERR_CODE_UNKNOWN             0x0UL
-	#define VNIC_RSS_CFG_CMD_ERR_CODE_INTERFACE_NOT_READY 0x1UL
-	#define VNIC_RSS_CFG_CMD_ERR_CODE_LAST               VNIC_RSS_CFG_CMD_ERR_CODE_INTERFACE_NOT_READY
-	u8	unused_0[7];
-};
-
-/* hwrm_vnic_rss_qcfg_input (size:192b/24B) */
-struct hwrm_vnic_rss_qcfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	rss_ctx_idx;
-	__le16	vnic_id;
-	u8	unused_0[4];
-};
-
-/* hwrm_vnic_rss_qcfg_output (size:512b/64B) */
-struct hwrm_vnic_rss_qcfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le32	hash_type;
-	#define VNIC_RSS_QCFG_RESP_HASH_TYPE_IPV4                0x1UL
-	#define VNIC_RSS_QCFG_RESP_HASH_TYPE_TCP_IPV4            0x2UL
-	#define VNIC_RSS_QCFG_RESP_HASH_TYPE_UDP_IPV4            0x4UL
-	#define VNIC_RSS_QCFG_RESP_HASH_TYPE_IPV6                0x8UL
-	#define VNIC_RSS_QCFG_RESP_HASH_TYPE_TCP_IPV6            0x10UL
-	#define VNIC_RSS_QCFG_RESP_HASH_TYPE_UDP_IPV6            0x20UL
-	#define VNIC_RSS_QCFG_RESP_HASH_TYPE_IPV6_FLOW_LABEL     0x40UL
-	#define VNIC_RSS_QCFG_RESP_HASH_TYPE_AH_SPI_IPV4         0x80UL
-	#define VNIC_RSS_QCFG_RESP_HASH_TYPE_ESP_SPI_IPV4        0x100UL
-	#define VNIC_RSS_QCFG_RESP_HASH_TYPE_AH_SPI_IPV6         0x200UL
-	#define VNIC_RSS_QCFG_RESP_HASH_TYPE_ESP_SPI_IPV6        0x400UL
-	u8	unused_0[4];
-	__le32	hash_key[10];
-	u8	hash_mode_flags;
-	#define VNIC_RSS_QCFG_RESP_HASH_MODE_FLAGS_DEFAULT         0x1UL
-	#define VNIC_RSS_QCFG_RESP_HASH_MODE_FLAGS_INNERMOST_4     0x2UL
-	#define VNIC_RSS_QCFG_RESP_HASH_MODE_FLAGS_INNERMOST_2     0x4UL
-	#define VNIC_RSS_QCFG_RESP_HASH_MODE_FLAGS_OUTERMOST_4     0x8UL
-	#define VNIC_RSS_QCFG_RESP_HASH_MODE_FLAGS_OUTERMOST_2     0x10UL
-	u8	ring_select_mode;
-	#define VNIC_RSS_QCFG_RESP_RING_SELECT_MODE_TOEPLITZ          0x0UL
-	#define VNIC_RSS_QCFG_RESP_RING_SELECT_MODE_XOR               0x1UL
-	#define VNIC_RSS_QCFG_RESP_RING_SELECT_MODE_TOEPLITZ_CHECKSUM 0x2UL
-	#define VNIC_RSS_QCFG_RESP_RING_SELECT_MODE_LAST             VNIC_RSS_QCFG_RESP_RING_SELECT_MODE_TOEPLITZ_CHECKSUM
-	u8	unused_1[5];
-	u8	valid;
-};
-
-/* hwrm_vnic_plcmodes_cfg_input (size:320b/40B) */
-struct hwrm_vnic_plcmodes_cfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	flags;
-	#define VNIC_PLCMODES_CFG_REQ_FLAGS_REGULAR_PLACEMENT     0x1UL
-	#define VNIC_PLCMODES_CFG_REQ_FLAGS_JUMBO_PLACEMENT       0x2UL
-	#define VNIC_PLCMODES_CFG_REQ_FLAGS_HDS_IPV4              0x4UL
-	#define VNIC_PLCMODES_CFG_REQ_FLAGS_HDS_IPV6              0x8UL
-	#define VNIC_PLCMODES_CFG_REQ_FLAGS_HDS_FCOE              0x10UL
-	#define VNIC_PLCMODES_CFG_REQ_FLAGS_HDS_ROCE              0x20UL
-	#define VNIC_PLCMODES_CFG_REQ_FLAGS_VIRTIO_PLACEMENT      0x40UL
-	__le32	enables;
-	#define VNIC_PLCMODES_CFG_REQ_ENABLES_JUMBO_THRESH_VALID      0x1UL
-	#define VNIC_PLCMODES_CFG_REQ_ENABLES_HDS_OFFSET_VALID        0x2UL
-	#define VNIC_PLCMODES_CFG_REQ_ENABLES_HDS_THRESHOLD_VALID     0x4UL
-	#define VNIC_PLCMODES_CFG_REQ_ENABLES_MAX_BDS_VALID           0x8UL
-	__le32	vnic_id;
-	__le16	jumbo_thresh;
-	__le16	hds_offset;
-	__le16	hds_threshold;
-	__le16	max_bds;
-	u8	unused_0[4];
-};
-
-/* hwrm_vnic_plcmodes_cfg_output (size:128b/16B) */
-struct hwrm_vnic_plcmodes_cfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_vnic_plcmodes_cfg_cmd_err (size:64b/8B) */
-struct hwrm_vnic_plcmodes_cfg_cmd_err {
-	u8	code;
-	#define VNIC_PLCMODES_CFG_CMD_ERR_CODE_UNKNOWN               0x0UL
-	#define VNIC_PLCMODES_CFG_CMD_ERR_CODE_INVALID_HDS_THRESHOLD 0x1UL
-	#define VNIC_PLCMODES_CFG_CMD_ERR_CODE_LAST                 VNIC_PLCMODES_CFG_CMD_ERR_CODE_INVALID_HDS_THRESHOLD
-	u8	unused_0[7];
-};
-
-/* hwrm_vnic_rss_cos_lb_ctx_alloc_input (size:128b/16B) */
-struct hwrm_vnic_rss_cos_lb_ctx_alloc_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-};
-
-/* hwrm_vnic_rss_cos_lb_ctx_alloc_output (size:128b/16B) */
-struct hwrm_vnic_rss_cos_lb_ctx_alloc_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le16	rss_cos_lb_ctx_id;
-	u8	unused_0[5];
-	u8	valid;
-};
-
-/* hwrm_vnic_rss_cos_lb_ctx_free_input (size:192b/24B) */
-struct hwrm_vnic_rss_cos_lb_ctx_free_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	rss_cos_lb_ctx_id;
-	u8	unused_0[6];
-};
-
-/* hwrm_vnic_rss_cos_lb_ctx_free_output (size:128b/16B) */
-struct hwrm_vnic_rss_cos_lb_ctx_free_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_ring_alloc_input (size:704b/88B) */
-struct hwrm_ring_alloc_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	enables;
-	#define RING_ALLOC_REQ_ENABLES_RING_ARB_CFG              0x2UL
-	#define RING_ALLOC_REQ_ENABLES_STAT_CTX_ID_VALID         0x8UL
-	#define RING_ALLOC_REQ_ENABLES_MAX_BW_VALID              0x20UL
-	#define RING_ALLOC_REQ_ENABLES_RX_RING_ID_VALID          0x40UL
-	#define RING_ALLOC_REQ_ENABLES_NQ_RING_ID_VALID          0x80UL
-	#define RING_ALLOC_REQ_ENABLES_RX_BUF_SIZE_VALID         0x100UL
-	#define RING_ALLOC_REQ_ENABLES_SCHQ_ID                   0x200UL
-	#define RING_ALLOC_REQ_ENABLES_MPC_CHNLS_TYPE            0x400UL
-	#define RING_ALLOC_REQ_ENABLES_STEERING_TAG_VALID        0x800UL
-	#define RING_ALLOC_REQ_ENABLES_RX_RATE_PROFILE_VALID     0x1000UL
-	u8	ring_type;
-	#define RING_ALLOC_REQ_RING_TYPE_L2_CMPL   0x0UL
-	#define RING_ALLOC_REQ_RING_TYPE_TX        0x1UL
-	#define RING_ALLOC_REQ_RING_TYPE_RX        0x2UL
-	#define RING_ALLOC_REQ_RING_TYPE_ROCE_CMPL 0x3UL
-	#define RING_ALLOC_REQ_RING_TYPE_RX_AGG    0x4UL
-	#define RING_ALLOC_REQ_RING_TYPE_NQ        0x5UL
-	#define RING_ALLOC_REQ_RING_TYPE_LAST     RING_ALLOC_REQ_RING_TYPE_NQ
-	u8	cmpl_coal_cnt;
-	#define RING_ALLOC_REQ_CMPL_COAL_CNT_COAL_OFF 0x0UL
-	#define RING_ALLOC_REQ_CMPL_COAL_CNT_COAL_4   0x1UL
-	#define RING_ALLOC_REQ_CMPL_COAL_CNT_COAL_8   0x2UL
-	#define RING_ALLOC_REQ_CMPL_COAL_CNT_COAL_12  0x3UL
-	#define RING_ALLOC_REQ_CMPL_COAL_CNT_COAL_16  0x4UL
-	#define RING_ALLOC_REQ_CMPL_COAL_CNT_COAL_24  0x5UL
-	#define RING_ALLOC_REQ_CMPL_COAL_CNT_COAL_32  0x6UL
-	#define RING_ALLOC_REQ_CMPL_COAL_CNT_COAL_48  0x7UL
-	#define RING_ALLOC_REQ_CMPL_COAL_CNT_COAL_64  0x8UL
-	#define RING_ALLOC_REQ_CMPL_COAL_CNT_COAL_96  0x9UL
-	#define RING_ALLOC_REQ_CMPL_COAL_CNT_COAL_128 0xaUL
-	#define RING_ALLOC_REQ_CMPL_COAL_CNT_COAL_192 0xbUL
-	#define RING_ALLOC_REQ_CMPL_COAL_CNT_COAL_256 0xcUL
-	#define RING_ALLOC_REQ_CMPL_COAL_CNT_COAL_320 0xdUL
-	#define RING_ALLOC_REQ_CMPL_COAL_CNT_COAL_384 0xeUL
-	#define RING_ALLOC_REQ_CMPL_COAL_CNT_COAL_MAX 0xfUL
-	#define RING_ALLOC_REQ_CMPL_COAL_CNT_LAST    RING_ALLOC_REQ_CMPL_COAL_CNT_COAL_MAX
-	__le16	flags;
-	#define RING_ALLOC_REQ_FLAGS_RX_SOP_PAD                        0x1UL
-	#define RING_ALLOC_REQ_FLAGS_DISABLE_CQ_OVERFLOW_DETECTION     0x2UL
-	#define RING_ALLOC_REQ_FLAGS_NQ_DBR_PACING                     0x4UL
-	#define RING_ALLOC_REQ_FLAGS_TX_PKT_TS_CMPL_ENABLE             0x8UL
-	__le64	page_tbl_addr;
-	__le32	fbo;
-	u8	page_size;
-	u8	page_tbl_depth;
-	__le16	schq_id;
-	__le32	length;
-	__le16	logical_id;
-	__le16	cmpl_ring_id;
-	__le16	queue_id;
-	__le16	rx_buf_size;
-	__le16	rx_ring_id;
-	__le16	nq_ring_id;
-	__le16	ring_arb_cfg;
-	#define RING_ALLOC_REQ_RING_ARB_CFG_ARB_POLICY_MASK      0xfUL
-	#define RING_ALLOC_REQ_RING_ARB_CFG_ARB_POLICY_SFT       0
-	#define RING_ALLOC_REQ_RING_ARB_CFG_ARB_POLICY_SP          0x1UL
-	#define RING_ALLOC_REQ_RING_ARB_CFG_ARB_POLICY_WFQ         0x2UL
-	#define RING_ALLOC_REQ_RING_ARB_CFG_ARB_POLICY_LAST       RING_ALLOC_REQ_RING_ARB_CFG_ARB_POLICY_WFQ
-	#define RING_ALLOC_REQ_RING_ARB_CFG_RSVD_MASK            0xf0UL
-	#define RING_ALLOC_REQ_RING_ARB_CFG_RSVD_SFT             4
-	#define RING_ALLOC_REQ_RING_ARB_CFG_ARB_POLICY_PARAM_MASK 0xff00UL
-	#define RING_ALLOC_REQ_RING_ARB_CFG_ARB_POLICY_PARAM_SFT 8
-	__le16	steering_tag;
-	__le32	reserved3;
-	__le32	stat_ctx_id;
-	__le32	reserved4;
-	__le32	max_bw;
-	#define RING_ALLOC_REQ_MAX_BW_BW_VALUE_MASK             0xfffffffUL
-	#define RING_ALLOC_REQ_MAX_BW_BW_VALUE_SFT              0
-	#define RING_ALLOC_REQ_MAX_BW_SCALE                     0x10000000UL
-	#define RING_ALLOC_REQ_MAX_BW_SCALE_BITS                  (0x0UL << 28)
-	#define RING_ALLOC_REQ_MAX_BW_SCALE_BYTES                 (0x1UL << 28)
-	#define RING_ALLOC_REQ_MAX_BW_SCALE_LAST                 RING_ALLOC_REQ_MAX_BW_SCALE_BYTES
-	#define RING_ALLOC_REQ_MAX_BW_BW_VALUE_UNIT_MASK        0xe0000000UL
-	#define RING_ALLOC_REQ_MAX_BW_BW_VALUE_UNIT_SFT         29
-	#define RING_ALLOC_REQ_MAX_BW_BW_VALUE_UNIT_MEGA          (0x0UL << 29)
-	#define RING_ALLOC_REQ_MAX_BW_BW_VALUE_UNIT_KILO          (0x2UL << 29)
-	#define RING_ALLOC_REQ_MAX_BW_BW_VALUE_UNIT_BASE          (0x4UL << 29)
-	#define RING_ALLOC_REQ_MAX_BW_BW_VALUE_UNIT_GIGA          (0x6UL << 29)
-	#define RING_ALLOC_REQ_MAX_BW_BW_VALUE_UNIT_PERCENT1_100  (0x1UL << 29)
-	#define RING_ALLOC_REQ_MAX_BW_BW_VALUE_UNIT_INVALID       (0x7UL << 29)
-	#define RING_ALLOC_REQ_MAX_BW_BW_VALUE_UNIT_LAST         RING_ALLOC_REQ_MAX_BW_BW_VALUE_UNIT_INVALID
-	u8	int_mode;
-	#define RING_ALLOC_REQ_INT_MODE_LEGACY 0x0UL
-	#define RING_ALLOC_REQ_INT_MODE_RSVD   0x1UL
-	#define RING_ALLOC_REQ_INT_MODE_MSIX   0x2UL
-	#define RING_ALLOC_REQ_INT_MODE_POLL   0x3UL
-	#define RING_ALLOC_REQ_INT_MODE_LAST  RING_ALLOC_REQ_INT_MODE_POLL
-	u8	mpc_chnls_type;
-	#define RING_ALLOC_REQ_MPC_CHNLS_TYPE_TCE     0x0UL
-	#define RING_ALLOC_REQ_MPC_CHNLS_TYPE_RCE     0x1UL
-	#define RING_ALLOC_REQ_MPC_CHNLS_TYPE_TE_CFA  0x2UL
-	#define RING_ALLOC_REQ_MPC_CHNLS_TYPE_RE_CFA  0x3UL
-	#define RING_ALLOC_REQ_MPC_CHNLS_TYPE_PRIMATE 0x4UL
-	#define RING_ALLOC_REQ_MPC_CHNLS_TYPE_LAST   RING_ALLOC_REQ_MPC_CHNLS_TYPE_PRIMATE
-	u8	rx_rate_profile_sel;
-	#define RING_ALLOC_REQ_RX_RATE_PROFILE_SEL_DEFAULT   0x0UL
-	#define RING_ALLOC_REQ_RX_RATE_PROFILE_SEL_POLL_MODE 0x1UL
-	#define RING_ALLOC_REQ_RX_RATE_PROFILE_SEL_LAST     RING_ALLOC_REQ_RX_RATE_PROFILE_SEL_POLL_MODE
-	u8	unused_4;
-	__le64	cq_handle;
-};
-
-/* hwrm_ring_alloc_output (size:128b/16B) */
-struct hwrm_ring_alloc_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le16	ring_id;
-	__le16	logical_ring_id;
-	u8	push_buffer_index;
-	#define RING_ALLOC_RESP_PUSH_BUFFER_INDEX_PING_BUFFER 0x0UL
-	#define RING_ALLOC_RESP_PUSH_BUFFER_INDEX_PONG_BUFFER 0x1UL
-	#define RING_ALLOC_RESP_PUSH_BUFFER_INDEX_LAST       RING_ALLOC_RESP_PUSH_BUFFER_INDEX_PONG_BUFFER
-	u8	unused_0[2];
-	u8	valid;
-};
-
-/* hwrm_ring_free_input (size:256b/32B) */
-struct hwrm_ring_free_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	u8	ring_type;
-	#define RING_FREE_REQ_RING_TYPE_L2_CMPL   0x0UL
-	#define RING_FREE_REQ_RING_TYPE_TX        0x1UL
-	#define RING_FREE_REQ_RING_TYPE_RX        0x2UL
-	#define RING_FREE_REQ_RING_TYPE_ROCE_CMPL 0x3UL
-	#define RING_FREE_REQ_RING_TYPE_RX_AGG    0x4UL
-	#define RING_FREE_REQ_RING_TYPE_NQ        0x5UL
-	#define RING_FREE_REQ_RING_TYPE_LAST     RING_FREE_REQ_RING_TYPE_NQ
-	u8	flags;
-	#define RING_FREE_REQ_FLAGS_VIRTIO_RING_VALID 0x1UL
-	#define RING_FREE_REQ_FLAGS_LAST             RING_FREE_REQ_FLAGS_VIRTIO_RING_VALID
-	__le16	ring_id;
-	__le32	prod_idx;
-	__le32	opaque;
-	__le32	unused_1;
-};
-
-/* hwrm_ring_free_output (size:128b/16B) */
-struct hwrm_ring_free_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_ring_reset_input (size:192b/24B) */
-struct hwrm_ring_reset_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	u8	ring_type;
-	#define RING_RESET_REQ_RING_TYPE_L2_CMPL     0x0UL
-	#define RING_RESET_REQ_RING_TYPE_TX          0x1UL
-	#define RING_RESET_REQ_RING_TYPE_RX          0x2UL
-	#define RING_RESET_REQ_RING_TYPE_ROCE_CMPL   0x3UL
-	#define RING_RESET_REQ_RING_TYPE_RX_RING_GRP 0x6UL
-	#define RING_RESET_REQ_RING_TYPE_LAST       RING_RESET_REQ_RING_TYPE_RX_RING_GRP
-	u8	unused_0;
-	__le16	ring_id;
-	u8	unused_1[4];
-};
-
-/* hwrm_ring_reset_output (size:128b/16B) */
-struct hwrm_ring_reset_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	push_buffer_index;
-	#define RING_RESET_RESP_PUSH_BUFFER_INDEX_PING_BUFFER 0x0UL
-	#define RING_RESET_RESP_PUSH_BUFFER_INDEX_PONG_BUFFER 0x1UL
-	#define RING_RESET_RESP_PUSH_BUFFER_INDEX_LAST       RING_RESET_RESP_PUSH_BUFFER_INDEX_PONG_BUFFER
-	u8	unused_0[3];
-	u8	consumer_idx[3];
-	u8	valid;
-};
-
-/* hwrm_ring_aggint_qcaps_input (size:128b/16B) */
-struct hwrm_ring_aggint_qcaps_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-};
-
-/* hwrm_ring_aggint_qcaps_output (size:384b/48B) */
-struct hwrm_ring_aggint_qcaps_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le32	cmpl_params;
-	#define RING_AGGINT_QCAPS_RESP_CMPL_PARAMS_INT_LAT_TMR_MIN                  0x1UL
-	#define RING_AGGINT_QCAPS_RESP_CMPL_PARAMS_INT_LAT_TMR_MAX                  0x2UL
-	#define RING_AGGINT_QCAPS_RESP_CMPL_PARAMS_TIMER_RESET                      0x4UL
-	#define RING_AGGINT_QCAPS_RESP_CMPL_PARAMS_RING_IDLE                        0x8UL
-	#define RING_AGGINT_QCAPS_RESP_CMPL_PARAMS_NUM_CMPL_DMA_AGGR                0x10UL
-	#define RING_AGGINT_QCAPS_RESP_CMPL_PARAMS_NUM_CMPL_DMA_AGGR_DURING_INT     0x20UL
-	#define RING_AGGINT_QCAPS_RESP_CMPL_PARAMS_CMPL_AGGR_DMA_TMR                0x40UL
-	#define RING_AGGINT_QCAPS_RESP_CMPL_PARAMS_CMPL_AGGR_DMA_TMR_DURING_INT     0x80UL
-	#define RING_AGGINT_QCAPS_RESP_CMPL_PARAMS_NUM_CMPL_AGGR_INT                0x100UL
-	__le32	nq_params;
-	#define RING_AGGINT_QCAPS_RESP_NQ_PARAMS_INT_LAT_TMR_MIN     0x1UL
-	__le16	num_cmpl_dma_aggr_min;
-	__le16	num_cmpl_dma_aggr_max;
-	__le16	num_cmpl_dma_aggr_during_int_min;
-	__le16	num_cmpl_dma_aggr_during_int_max;
-	__le16	cmpl_aggr_dma_tmr_min;
-	__le16	cmpl_aggr_dma_tmr_max;
-	__le16	cmpl_aggr_dma_tmr_during_int_min;
-	__le16	cmpl_aggr_dma_tmr_during_int_max;
-	__le16	int_lat_tmr_min_min;
-	__le16	int_lat_tmr_min_max;
-	__le16	int_lat_tmr_max_min;
-	__le16	int_lat_tmr_max_max;
-	__le16	num_cmpl_aggr_int_min;
-	__le16	num_cmpl_aggr_int_max;
-	__le16	timer_units;
-	u8	unused_0[1];
-	u8	valid;
-};
-
-/* hwrm_ring_cmpl_ring_qaggint_params_input (size:192b/24B) */
-struct hwrm_ring_cmpl_ring_qaggint_params_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	ring_id;
-	__le16	flags;
-	#define RING_CMPL_RING_QAGGINT_PARAMS_REQ_FLAGS_UNUSED_0_MASK 0x3UL
-	#define RING_CMPL_RING_QAGGINT_PARAMS_REQ_FLAGS_UNUSED_0_SFT 0
-	#define RING_CMPL_RING_QAGGINT_PARAMS_REQ_FLAGS_IS_NQ        0x4UL
-	u8	unused_0[4];
-};
-
-/* hwrm_ring_cmpl_ring_qaggint_params_output (size:256b/32B) */
-struct hwrm_ring_cmpl_ring_qaggint_params_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le16	flags;
-	#define RING_CMPL_RING_QAGGINT_PARAMS_RESP_FLAGS_TIMER_RESET     0x1UL
-	#define RING_CMPL_RING_QAGGINT_PARAMS_RESP_FLAGS_RING_IDLE       0x2UL
-	__le16	num_cmpl_dma_aggr;
-	__le16	num_cmpl_dma_aggr_during_int;
-	__le16	cmpl_aggr_dma_tmr;
-	__le16	cmpl_aggr_dma_tmr_during_int;
-	__le16	int_lat_tmr_min;
-	__le16	int_lat_tmr_max;
-	__le16	num_cmpl_aggr_int;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_ring_cmpl_ring_cfg_aggint_params_input (size:320b/40B) */
-struct hwrm_ring_cmpl_ring_cfg_aggint_params_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	ring_id;
-	__le16	flags;
-	#define RING_CMPL_RING_CFG_AGGINT_PARAMS_REQ_FLAGS_TIMER_RESET     0x1UL
-	#define RING_CMPL_RING_CFG_AGGINT_PARAMS_REQ_FLAGS_RING_IDLE       0x2UL
-	#define RING_CMPL_RING_CFG_AGGINT_PARAMS_REQ_FLAGS_IS_NQ           0x4UL
-	__le16	num_cmpl_dma_aggr;
-	__le16	num_cmpl_dma_aggr_during_int;
-	__le16	cmpl_aggr_dma_tmr;
-	__le16	cmpl_aggr_dma_tmr_during_int;
-	__le16	int_lat_tmr_min;
-	__le16	int_lat_tmr_max;
-	__le16	num_cmpl_aggr_int;
-	__le16	enables;
-	#define RING_CMPL_RING_CFG_AGGINT_PARAMS_REQ_ENABLES_NUM_CMPL_DMA_AGGR                0x1UL
-	#define RING_CMPL_RING_CFG_AGGINT_PARAMS_REQ_ENABLES_NUM_CMPL_DMA_AGGR_DURING_INT     0x2UL
-	#define RING_CMPL_RING_CFG_AGGINT_PARAMS_REQ_ENABLES_CMPL_AGGR_DMA_TMR                0x4UL
-	#define RING_CMPL_RING_CFG_AGGINT_PARAMS_REQ_ENABLES_INT_LAT_TMR_MIN                  0x8UL
-	#define RING_CMPL_RING_CFG_AGGINT_PARAMS_REQ_ENABLES_INT_LAT_TMR_MAX                  0x10UL
-	#define RING_CMPL_RING_CFG_AGGINT_PARAMS_REQ_ENABLES_NUM_CMPL_AGGR_INT                0x20UL
-	u8	unused_0[4];
-};
-
-/* hwrm_ring_cmpl_ring_cfg_aggint_params_output (size:128b/16B) */
-struct hwrm_ring_cmpl_ring_cfg_aggint_params_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_ring_grp_alloc_input (size:192b/24B) */
-struct hwrm_ring_grp_alloc_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	cr;
-	__le16	rr;
-	__le16	ar;
-	__le16	sc;
-};
-
-/* hwrm_ring_grp_alloc_output (size:128b/16B) */
-struct hwrm_ring_grp_alloc_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le32	ring_group_id;
-	u8	unused_0[3];
-	u8	valid;
-};
-
-/* hwrm_ring_grp_free_input (size:192b/24B) */
-struct hwrm_ring_grp_free_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	ring_group_id;
-	u8	unused_0[4];
-};
-
-/* hwrm_ring_grp_free_output (size:128b/16B) */
-struct hwrm_ring_grp_free_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-#define DEFAULT_FLOW_ID 0xFFFFFFFFUL
-#define ROCEV1_FLOW_ID 0xFFFFFFFEUL
-#define ROCEV2_FLOW_ID 0xFFFFFFFDUL
-#define ROCEV2_CNP_FLOW_ID 0xFFFFFFFCUL
-
-/* hwrm_cfa_l2_filter_alloc_input (size:768b/96B) */
-struct hwrm_cfa_l2_filter_alloc_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	flags;
-	#define CFA_L2_FILTER_ALLOC_REQ_FLAGS_PATH              0x1UL
-	#define CFA_L2_FILTER_ALLOC_REQ_FLAGS_PATH_TX             0x0UL
-	#define CFA_L2_FILTER_ALLOC_REQ_FLAGS_PATH_RX             0x1UL
-	#define CFA_L2_FILTER_ALLOC_REQ_FLAGS_PATH_LAST          CFA_L2_FILTER_ALLOC_REQ_FLAGS_PATH_RX
-	#define CFA_L2_FILTER_ALLOC_REQ_FLAGS_LOOPBACK          0x2UL
-	#define CFA_L2_FILTER_ALLOC_REQ_FLAGS_DROP              0x4UL
-	#define CFA_L2_FILTER_ALLOC_REQ_FLAGS_OUTERMOST         0x8UL
-	#define CFA_L2_FILTER_ALLOC_REQ_FLAGS_TRAFFIC_MASK      0x30UL
-	#define CFA_L2_FILTER_ALLOC_REQ_FLAGS_TRAFFIC_SFT       4
-	#define CFA_L2_FILTER_ALLOC_REQ_FLAGS_TRAFFIC_NO_ROCE_L2  (0x0UL << 4)
-	#define CFA_L2_FILTER_ALLOC_REQ_FLAGS_TRAFFIC_L2          (0x1UL << 4)
-	#define CFA_L2_FILTER_ALLOC_REQ_FLAGS_TRAFFIC_ROCE        (0x2UL << 4)
-	#define CFA_L2_FILTER_ALLOC_REQ_FLAGS_TRAFFIC_LAST       CFA_L2_FILTER_ALLOC_REQ_FLAGS_TRAFFIC_ROCE
-	#define CFA_L2_FILTER_ALLOC_REQ_FLAGS_XDP_DISABLE       0x40UL
-	#define CFA_L2_FILTER_ALLOC_REQ_FLAGS_SOURCE_VALID      0x80UL
-	__le32	enables;
-	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_L2_ADDR             0x1UL
-	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_L2_ADDR_MASK        0x2UL
-	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_L2_OVLAN            0x4UL
-	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_L2_OVLAN_MASK       0x8UL
-	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_L2_IVLAN            0x10UL
-	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_L2_IVLAN_MASK       0x20UL
-	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_T_L2_ADDR           0x40UL
-	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_T_L2_ADDR_MASK      0x80UL
-	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_T_L2_OVLAN          0x100UL
-	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_T_L2_OVLAN_MASK     0x200UL
-	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_T_L2_IVLAN          0x400UL
-	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_T_L2_IVLAN_MASK     0x800UL
-	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_SRC_TYPE            0x1000UL
-	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_SRC_ID              0x2000UL
-	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_TUNNEL_TYPE         0x4000UL
-	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_DST_ID              0x8000UL
-	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_MIRROR_VNIC_ID      0x10000UL
-	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_NUM_VLANS           0x20000UL
-	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_T_NUM_VLANS         0x40000UL
-	u8	l2_addr[6];
-	u8	num_vlans;
-	u8	t_num_vlans;
-	u8	l2_addr_mask[6];
-	__le16	l2_ovlan;
-	__le16	l2_ovlan_mask;
-	__le16	l2_ivlan;
-	__le16	l2_ivlan_mask;
-	u8	unused_1[2];
-	u8	t_l2_addr[6];
-	u8	unused_2[2];
-	u8	t_l2_addr_mask[6];
-	__le16	t_l2_ovlan;
-	__le16	t_l2_ovlan_mask;
-	__le16	t_l2_ivlan;
-	__le16	t_l2_ivlan_mask;
-	u8	src_type;
-	#define CFA_L2_FILTER_ALLOC_REQ_SRC_TYPE_NPORT 0x0UL
-	#define CFA_L2_FILTER_ALLOC_REQ_SRC_TYPE_PF    0x1UL
-	#define CFA_L2_FILTER_ALLOC_REQ_SRC_TYPE_VF    0x2UL
-	#define CFA_L2_FILTER_ALLOC_REQ_SRC_TYPE_VNIC  0x3UL
-	#define CFA_L2_FILTER_ALLOC_REQ_SRC_TYPE_KONG  0x4UL
-	#define CFA_L2_FILTER_ALLOC_REQ_SRC_TYPE_APE   0x5UL
-	#define CFA_L2_FILTER_ALLOC_REQ_SRC_TYPE_BONO  0x6UL
-	#define CFA_L2_FILTER_ALLOC_REQ_SRC_TYPE_TANG  0x7UL
-	#define CFA_L2_FILTER_ALLOC_REQ_SRC_TYPE_LAST CFA_L2_FILTER_ALLOC_REQ_SRC_TYPE_TANG
-	u8	unused_3;
-	__le32	src_id;
-	u8	tunnel_type;
-	#define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_NONTUNNEL    0x0UL
-	#define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN        0x1UL
-	#define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_NVGRE        0x2UL
-	#define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_L2GRE        0x3UL
-	#define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_IPIP         0x4UL
-	#define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_GENEVE       0x5UL
-	#define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_MPLS         0x6UL
-	#define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_STT          0x7UL
-	#define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_IPGRE        0x8UL
-	#define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN_V4     0x9UL
-	#define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_IPGRE_V1     0xaUL
-	#define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_L2_ETYPE     0xbUL
-	#define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN_GPE_V6 0xcUL
-	#define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN_GPE    0x10UL
-	#define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_ANYTUNNEL    0xffUL
-	#define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_LAST        CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_ANYTUNNEL
-	u8	unused_4;
-	__le16	dst_id;
-	__le16	mirror_vnic_id;
-	u8	pri_hint;
-	#define CFA_L2_FILTER_ALLOC_REQ_PRI_HINT_NO_PREFER    0x0UL
-	#define CFA_L2_FILTER_ALLOC_REQ_PRI_HINT_ABOVE_FILTER 0x1UL
-	#define CFA_L2_FILTER_ALLOC_REQ_PRI_HINT_BELOW_FILTER 0x2UL
-	#define CFA_L2_FILTER_ALLOC_REQ_PRI_HINT_MAX          0x3UL
-	#define CFA_L2_FILTER_ALLOC_REQ_PRI_HINT_MIN          0x4UL
-	#define CFA_L2_FILTER_ALLOC_REQ_PRI_HINT_LAST        CFA_L2_FILTER_ALLOC_REQ_PRI_HINT_MIN
-	u8	unused_5;
-	__le32	unused_6;
-	__le64	l2_filter_id_hint;
-};
-
-/* hwrm_cfa_l2_filter_alloc_output (size:192b/24B) */
-struct hwrm_cfa_l2_filter_alloc_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le64	l2_filter_id;
-	__le32	flow_id;
-	#define CFA_L2_FILTER_ALLOC_RESP_FLOW_ID_VALUE_MASK 0x3fffffffUL
-	#define CFA_L2_FILTER_ALLOC_RESP_FLOW_ID_VALUE_SFT 0
-	#define CFA_L2_FILTER_ALLOC_RESP_FLOW_ID_TYPE      0x40000000UL
-	#define CFA_L2_FILTER_ALLOC_RESP_FLOW_ID_TYPE_INT    (0x0UL << 30)
-	#define CFA_L2_FILTER_ALLOC_RESP_FLOW_ID_TYPE_EXT    (0x1UL << 30)
-	#define CFA_L2_FILTER_ALLOC_RESP_FLOW_ID_TYPE_LAST  CFA_L2_FILTER_ALLOC_RESP_FLOW_ID_TYPE_EXT
-	#define CFA_L2_FILTER_ALLOC_RESP_FLOW_ID_DIR       0x80000000UL
-	#define CFA_L2_FILTER_ALLOC_RESP_FLOW_ID_DIR_RX      (0x0UL << 31)
-	#define CFA_L2_FILTER_ALLOC_RESP_FLOW_ID_DIR_TX      (0x1UL << 31)
-	#define CFA_L2_FILTER_ALLOC_RESP_FLOW_ID_DIR_LAST   CFA_L2_FILTER_ALLOC_RESP_FLOW_ID_DIR_TX
-	u8	unused_0[3];
-	u8	valid;
-};
-
-/* hwrm_cfa_l2_filter_free_input (size:192b/24B) */
-struct hwrm_cfa_l2_filter_free_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le64	l2_filter_id;
-};
-
-/* hwrm_cfa_l2_filter_free_output (size:128b/16B) */
-struct hwrm_cfa_l2_filter_free_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_cfa_l2_filter_cfg_input (size:384b/48B) */
-struct hwrm_cfa_l2_filter_cfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	flags;
-	#define CFA_L2_FILTER_CFG_REQ_FLAGS_PATH                  0x1UL
-	#define CFA_L2_FILTER_CFG_REQ_FLAGS_PATH_TX                 0x0UL
-	#define CFA_L2_FILTER_CFG_REQ_FLAGS_PATH_RX                 0x1UL
-	#define CFA_L2_FILTER_CFG_REQ_FLAGS_PATH_LAST              CFA_L2_FILTER_CFG_REQ_FLAGS_PATH_RX
-	#define CFA_L2_FILTER_CFG_REQ_FLAGS_DROP                  0x2UL
-	#define CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_MASK          0xcUL
-	#define CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_SFT           2
-	#define CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_NO_ROCE_L2      (0x0UL << 2)
-	#define CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_L2              (0x1UL << 2)
-	#define CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_ROCE            (0x2UL << 2)
-	#define CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_LAST           CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_ROCE
-	#define CFA_L2_FILTER_CFG_REQ_FLAGS_REMAP_OP_MASK         0x30UL
-	#define CFA_L2_FILTER_CFG_REQ_FLAGS_REMAP_OP_SFT          4
-	#define CFA_L2_FILTER_CFG_REQ_FLAGS_REMAP_OP_NO_UPDATE      (0x0UL << 4)
-	#define CFA_L2_FILTER_CFG_REQ_FLAGS_REMAP_OP_BYPASS_LKUP    (0x1UL << 4)
-	#define CFA_L2_FILTER_CFG_REQ_FLAGS_REMAP_OP_ENABLE_LKUP    (0x2UL << 4)
-	#define CFA_L2_FILTER_CFG_REQ_FLAGS_REMAP_OP_RESTORE_FW_OP  (0x3UL << 4)
-	#define CFA_L2_FILTER_CFG_REQ_FLAGS_REMAP_OP_LAST          CFA_L2_FILTER_CFG_REQ_FLAGS_REMAP_OP_RESTORE_FW_OP
-	__le32	enables;
-	#define CFA_L2_FILTER_CFG_REQ_ENABLES_DST_ID                 0x1UL
-	#define CFA_L2_FILTER_CFG_REQ_ENABLES_NEW_MIRROR_VNIC_ID     0x2UL
-	#define CFA_L2_FILTER_CFG_REQ_ENABLES_PROF_FUNC              0x4UL
-	#define CFA_L2_FILTER_CFG_REQ_ENABLES_L2_CONTEXT_ID          0x8UL
-	__le64	l2_filter_id;
-	__le32	dst_id;
-	__le32	new_mirror_vnic_id;
-	__le32	prof_func;
-	__le32	l2_context_id;
-};
-
-/* hwrm_cfa_l2_filter_cfg_output (size:128b/16B) */
-struct hwrm_cfa_l2_filter_cfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_cfa_l2_set_rx_mask_input (size:448b/56B) */
-struct hwrm_cfa_l2_set_rx_mask_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	vnic_id;
-	__le32	mask;
-	#define CFA_L2_SET_RX_MASK_REQ_MASK_MCAST               0x2UL
-	#define CFA_L2_SET_RX_MASK_REQ_MASK_ALL_MCAST           0x4UL
-	#define CFA_L2_SET_RX_MASK_REQ_MASK_BCAST               0x8UL
-	#define CFA_L2_SET_RX_MASK_REQ_MASK_PROMISCUOUS         0x10UL
-	#define CFA_L2_SET_RX_MASK_REQ_MASK_OUTERMOST           0x20UL
-	#define CFA_L2_SET_RX_MASK_REQ_MASK_VLANONLY            0x40UL
-	#define CFA_L2_SET_RX_MASK_REQ_MASK_VLAN_NONVLAN        0x80UL
-	#define CFA_L2_SET_RX_MASK_REQ_MASK_ANYVLAN_NONVLAN     0x100UL
-	__le64	mc_tbl_addr;
-	__le32	num_mc_entries;
-	u8	unused_0[4];
-	__le64	vlan_tag_tbl_addr;
-	__le32	num_vlan_tags;
-	u8	unused_1[4];
-};
-
-/* hwrm_cfa_l2_set_rx_mask_output (size:128b/16B) */
-struct hwrm_cfa_l2_set_rx_mask_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_cfa_l2_set_rx_mask_cmd_err (size:64b/8B) */
-struct hwrm_cfa_l2_set_rx_mask_cmd_err {
-	u8	code;
-	#define CFA_L2_SET_RX_MASK_CMD_ERR_CODE_UNKNOWN                    0x0UL
-	#define CFA_L2_SET_RX_MASK_CMD_ERR_CODE_NTUPLE_FILTER_CONFLICT_ERR 0x1UL
-	#define CFA_L2_SET_RX_MASK_CMD_ERR_CODE_LAST                      CFA_L2_SET_RX_MASK_CMD_ERR_CODE_NTUPLE_FILTER_CONFLICT_ERR
-	u8	unused_0[7];
-};
-
-/* hwrm_cfa_tunnel_filter_alloc_input (size:704b/88B) */
-struct hwrm_cfa_tunnel_filter_alloc_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	flags;
-	#define CFA_TUNNEL_FILTER_ALLOC_REQ_FLAGS_LOOPBACK     0x1UL
-	__le32	enables;
-	#define CFA_TUNNEL_FILTER_ALLOC_REQ_ENABLES_L2_FILTER_ID       0x1UL
-	#define CFA_TUNNEL_FILTER_ALLOC_REQ_ENABLES_L2_ADDR            0x2UL
-	#define CFA_TUNNEL_FILTER_ALLOC_REQ_ENABLES_L2_IVLAN           0x4UL
-	#define CFA_TUNNEL_FILTER_ALLOC_REQ_ENABLES_L3_ADDR            0x8UL
-	#define CFA_TUNNEL_FILTER_ALLOC_REQ_ENABLES_L3_ADDR_TYPE       0x10UL
-	#define CFA_TUNNEL_FILTER_ALLOC_REQ_ENABLES_T_L3_ADDR_TYPE     0x20UL
-	#define CFA_TUNNEL_FILTER_ALLOC_REQ_ENABLES_T_L3_ADDR          0x40UL
-	#define CFA_TUNNEL_FILTER_ALLOC_REQ_ENABLES_TUNNEL_TYPE        0x80UL
-	#define CFA_TUNNEL_FILTER_ALLOC_REQ_ENABLES_VNI                0x100UL
-	#define CFA_TUNNEL_FILTER_ALLOC_REQ_ENABLES_DST_VNIC_ID        0x200UL
-	#define CFA_TUNNEL_FILTER_ALLOC_REQ_ENABLES_MIRROR_VNIC_ID     0x400UL
-	__le64	l2_filter_id;
-	u8	l2_addr[6];
-	__le16	l2_ivlan;
-	__le32	l3_addr[4];
-	__le32	t_l3_addr[4];
-	u8	l3_addr_type;
-	u8	t_l3_addr_type;
-	u8	tunnel_type;
-	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_NONTUNNEL    0x0UL
-	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN        0x1UL
-	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_NVGRE        0x2UL
-	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_L2GRE        0x3UL
-	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_IPIP         0x4UL
-	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_GENEVE       0x5UL
-	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_MPLS         0x6UL
-	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_STT          0x7UL
-	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_IPGRE        0x8UL
-	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN_V4     0x9UL
-	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_IPGRE_V1     0xaUL
-	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_L2_ETYPE     0xbUL
-	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN_GPE_V6 0xcUL
-	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN_GPE    0x10UL
-	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_ANYTUNNEL    0xffUL
-	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_LAST        CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_ANYTUNNEL
-	u8	tunnel_flags;
-	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_FLAGS_TUN_FLAGS_OAM_CHECKSUM_EXPLHDR     0x1UL
-	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_FLAGS_TUN_FLAGS_CRITICAL_OPT_S1          0x2UL
-	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_FLAGS_TUN_FLAGS_EXTHDR_SEQNUM_S0         0x4UL
-	__le32	vni;
-	__le32	dst_vnic_id;
-	__le32	mirror_vnic_id;
-};
-
-/* hwrm_cfa_tunnel_filter_alloc_output (size:192b/24B) */
-struct hwrm_cfa_tunnel_filter_alloc_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le64	tunnel_filter_id;
-	__le32	flow_id;
-	#define CFA_TUNNEL_FILTER_ALLOC_RESP_FLOW_ID_VALUE_MASK 0x3fffffffUL
-	#define CFA_TUNNEL_FILTER_ALLOC_RESP_FLOW_ID_VALUE_SFT 0
-	#define CFA_TUNNEL_FILTER_ALLOC_RESP_FLOW_ID_TYPE      0x40000000UL
-	#define CFA_TUNNEL_FILTER_ALLOC_RESP_FLOW_ID_TYPE_INT    (0x0UL << 30)
-	#define CFA_TUNNEL_FILTER_ALLOC_RESP_FLOW_ID_TYPE_EXT    (0x1UL << 30)
-	#define CFA_TUNNEL_FILTER_ALLOC_RESP_FLOW_ID_TYPE_LAST  CFA_TUNNEL_FILTER_ALLOC_RESP_FLOW_ID_TYPE_EXT
-	#define CFA_TUNNEL_FILTER_ALLOC_RESP_FLOW_ID_DIR       0x80000000UL
-	#define CFA_TUNNEL_FILTER_ALLOC_RESP_FLOW_ID_DIR_RX      (0x0UL << 31)
-	#define CFA_TUNNEL_FILTER_ALLOC_RESP_FLOW_ID_DIR_TX      (0x1UL << 31)
-	#define CFA_TUNNEL_FILTER_ALLOC_RESP_FLOW_ID_DIR_LAST   CFA_TUNNEL_FILTER_ALLOC_RESP_FLOW_ID_DIR_TX
-	u8	unused_0[3];
-	u8	valid;
-};
-
-/* hwrm_cfa_tunnel_filter_free_input (size:192b/24B) */
-struct hwrm_cfa_tunnel_filter_free_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le64	tunnel_filter_id;
-};
-
-/* hwrm_cfa_tunnel_filter_free_output (size:128b/16B) */
-struct hwrm_cfa_tunnel_filter_free_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_vxlan_ipv4_hdr (size:128b/16B) */
-struct hwrm_vxlan_ipv4_hdr {
-	u8	ver_hlen;
-	#define VXLAN_IPV4_HDR_VER_HLEN_HEADER_LENGTH_MASK 0xfUL
-	#define VXLAN_IPV4_HDR_VER_HLEN_HEADER_LENGTH_SFT 0
-	#define VXLAN_IPV4_HDR_VER_HLEN_VERSION_MASK      0xf0UL
-	#define VXLAN_IPV4_HDR_VER_HLEN_VERSION_SFT       4
-	u8	tos;
-	__be16	ip_id;
-	__be16	flags_frag_offset;
-	u8	ttl;
-	u8	protocol;
-	__be32	src_ip_addr;
-	__be32	dest_ip_addr;
-};
-
-/* hwrm_vxlan_ipv6_hdr (size:320b/40B) */
-struct hwrm_vxlan_ipv6_hdr {
-	__be32	ver_tc_flow_label;
-	#define VXLAN_IPV6_HDR_VER_TC_FLOW_LABEL_VER_SFT         0x1cUL
-	#define VXLAN_IPV6_HDR_VER_TC_FLOW_LABEL_VER_MASK        0xf0000000UL
-	#define VXLAN_IPV6_HDR_VER_TC_FLOW_LABEL_TC_SFT          0x14UL
-	#define VXLAN_IPV6_HDR_VER_TC_FLOW_LABEL_TC_MASK         0xff00000UL
-	#define VXLAN_IPV6_HDR_VER_TC_FLOW_LABEL_FLOW_LABEL_SFT  0x0UL
-	#define VXLAN_IPV6_HDR_VER_TC_FLOW_LABEL_FLOW_LABEL_MASK 0xfffffUL
-	#define VXLAN_IPV6_HDR_VER_TC_FLOW_LABEL_LAST           VXLAN_IPV6_HDR_VER_TC_FLOW_LABEL_FLOW_LABEL_MASK
-	__be16	payload_len;
-	u8	next_hdr;
-	u8	ttl;
-	__be32	src_ip_addr[4];
-	__be32	dest_ip_addr[4];
-};
-
-/* hwrm_cfa_encap_data_vxlan (size:640b/80B) */
-struct hwrm_cfa_encap_data_vxlan {
-	u8	src_mac_addr[6];
-	__le16	unused_0;
-	u8	dst_mac_addr[6];
-	u8	num_vlan_tags;
-	u8	unused_1;
-	__be16	ovlan_tpid;
-	__be16	ovlan_tci;
-	__be16	ivlan_tpid;
-	__be16	ivlan_tci;
-	__le32	l3[10];
-	#define CFA_ENCAP_DATA_VXLAN_L3_VER_MASK 0xfUL
-	#define CFA_ENCAP_DATA_VXLAN_L3_VER_IPV4 0x4UL
-	#define CFA_ENCAP_DATA_VXLAN_L3_VER_IPV6 0x6UL
-	#define CFA_ENCAP_DATA_VXLAN_L3_LAST    CFA_ENCAP_DATA_VXLAN_L3_VER_IPV6
-	__be16	src_port;
-	__be16	dst_port;
-	__be32	vni;
-	u8	hdr_rsvd0[3];
-	u8	hdr_rsvd1;
-	u8	hdr_flags;
-	u8	unused[3];
-};
-
-/* hwrm_cfa_encap_record_alloc_input (size:832b/104B) */
-struct hwrm_cfa_encap_record_alloc_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	flags;
-	#define CFA_ENCAP_RECORD_ALLOC_REQ_FLAGS_LOOPBACK     0x1UL
-	#define CFA_ENCAP_RECORD_ALLOC_REQ_FLAGS_EXTERNAL     0x2UL
-	u8	encap_type;
-	#define CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_VXLAN        0x1UL
-	#define CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_NVGRE        0x2UL
-	#define CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_L2GRE        0x3UL
-	#define CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_IPIP         0x4UL
-	#define CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_GENEVE       0x5UL
-	#define CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_MPLS         0x6UL
-	#define CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_VLAN         0x7UL
-	#define CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_IPGRE        0x8UL
-	#define CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_VXLAN_V4     0x9UL
-	#define CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_IPGRE_V1     0xaUL
-	#define CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_L2_ETYPE     0xbUL
-	#define CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_VXLAN_GPE_V6 0xcUL
-	#define CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_VXLAN_GPE    0x10UL
-	#define CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_LAST        CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_VXLAN_GPE
-	u8	unused_0[3];
-	__le32	encap_data[20];
-};
-
-/* hwrm_cfa_encap_record_alloc_output (size:128b/16B) */
-struct hwrm_cfa_encap_record_alloc_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le32	encap_record_id;
-	u8	unused_0[3];
-	u8	valid;
-};
-
-/* hwrm_cfa_encap_record_free_input (size:192b/24B) */
-struct hwrm_cfa_encap_record_free_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	encap_record_id;
-	u8	unused_0[4];
-};
-
-/* hwrm_cfa_encap_record_free_output (size:128b/16B) */
-struct hwrm_cfa_encap_record_free_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_cfa_ntuple_filter_alloc_input (size:1024b/128B) */
-struct hwrm_cfa_ntuple_filter_alloc_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	flags;
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_FLAGS_LOOPBACK              0x1UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_FLAGS_DROP                  0x2UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_FLAGS_METER                 0x4UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_FLAGS_DEST_FID              0x8UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_FLAGS_ARP_REPLY             0x10UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_FLAGS_DEST_RFS_RING_IDX     0x20UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_FLAGS_NO_L2_CONTEXT         0x40UL
-	__le32	enables;
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_L2_FILTER_ID         0x1UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_ETHERTYPE            0x2UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_TUNNEL_TYPE          0x4UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_SRC_MACADDR          0x8UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_IPADDR_TYPE          0x10UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_SRC_IPADDR           0x20UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_SRC_IPADDR_MASK      0x40UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_DST_IPADDR           0x80UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_DST_IPADDR_MASK      0x100UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_IP_PROTOCOL          0x200UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_SRC_PORT             0x400UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_SRC_PORT_MASK        0x800UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_DST_PORT             0x1000UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_DST_PORT_MASK        0x2000UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_PRI_HINT             0x4000UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_NTUPLE_FILTER_ID     0x8000UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_DST_ID               0x10000UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_MIRROR_VNIC_ID       0x20000UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_DST_MACADDR          0x40000UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_RFS_RING_TBL_IDX     0x80000UL
-	__le64	l2_filter_id;
-	u8	src_macaddr[6];
-	__be16	ethertype;
-	u8	ip_addr_type;
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_IP_ADDR_TYPE_UNKNOWN 0x0UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_IP_ADDR_TYPE_IPV4    0x4UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_IP_ADDR_TYPE_IPV6    0x6UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_IP_ADDR_TYPE_LAST   CFA_NTUPLE_FILTER_ALLOC_REQ_IP_ADDR_TYPE_IPV6
-	u8	ip_protocol;
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_IP_PROTOCOL_UNKNOWN 0x0UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_IP_PROTOCOL_TCP     0x6UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_IP_PROTOCOL_UDP     0x11UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_IP_PROTOCOL_ICMP    0x1UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_IP_PROTOCOL_ICMPV6  0x3aUL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_IP_PROTOCOL_RSVD    0xffUL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_IP_PROTOCOL_LAST   CFA_NTUPLE_FILTER_ALLOC_REQ_IP_PROTOCOL_RSVD
-	__le16	dst_id;
-	__le16	rfs_ring_tbl_idx;
-	u8	tunnel_type;
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_NONTUNNEL    0x0UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN        0x1UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_NVGRE        0x2UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_L2GRE        0x3UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_IPIP         0x4UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_GENEVE       0x5UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_MPLS         0x6UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_STT          0x7UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_IPGRE        0x8UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN_V4     0x9UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_IPGRE_V1     0xaUL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_L2_ETYPE     0xbUL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN_GPE_V6 0xcUL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN_GPE    0x10UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_ANYTUNNEL    0xffUL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_LAST        CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_ANYTUNNEL
-	u8	pri_hint;
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_PRI_HINT_NO_PREFER 0x0UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_PRI_HINT_ABOVE     0x1UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_PRI_HINT_BELOW     0x2UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_PRI_HINT_HIGHEST   0x3UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_PRI_HINT_LOWEST    0x4UL
-	#define CFA_NTUPLE_FILTER_ALLOC_REQ_PRI_HINT_LAST     CFA_NTUPLE_FILTER_ALLOC_REQ_PRI_HINT_LOWEST
-	__be32	src_ipaddr[4];
-	__be32	src_ipaddr_mask[4];
-	__be32	dst_ipaddr[4];
-	__be32	dst_ipaddr_mask[4];
-	__be16	src_port;
-	__be16	src_port_mask;
-	__be16	dst_port;
-	__be16	dst_port_mask;
-	__le64	ntuple_filter_id_hint;
-};
-
-/* hwrm_cfa_ntuple_filter_alloc_output (size:192b/24B) */
-struct hwrm_cfa_ntuple_filter_alloc_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le64	ntuple_filter_id;
-	__le32	flow_id;
-	#define CFA_NTUPLE_FILTER_ALLOC_RESP_FLOW_ID_VALUE_MASK 0x3fffffffUL
-	#define CFA_NTUPLE_FILTER_ALLOC_RESP_FLOW_ID_VALUE_SFT 0
-	#define CFA_NTUPLE_FILTER_ALLOC_RESP_FLOW_ID_TYPE      0x40000000UL
-	#define CFA_NTUPLE_FILTER_ALLOC_RESP_FLOW_ID_TYPE_INT    (0x0UL << 30)
-	#define CFA_NTUPLE_FILTER_ALLOC_RESP_FLOW_ID_TYPE_EXT    (0x1UL << 30)
-	#define CFA_NTUPLE_FILTER_ALLOC_RESP_FLOW_ID_TYPE_LAST  CFA_NTUPLE_FILTER_ALLOC_RESP_FLOW_ID_TYPE_EXT
-	#define CFA_NTUPLE_FILTER_ALLOC_RESP_FLOW_ID_DIR       0x80000000UL
-	#define CFA_NTUPLE_FILTER_ALLOC_RESP_FLOW_ID_DIR_RX      (0x0UL << 31)
-	#define CFA_NTUPLE_FILTER_ALLOC_RESP_FLOW_ID_DIR_TX      (0x1UL << 31)
-	#define CFA_NTUPLE_FILTER_ALLOC_RESP_FLOW_ID_DIR_LAST   CFA_NTUPLE_FILTER_ALLOC_RESP_FLOW_ID_DIR_TX
-	u8	unused_0[3];
-	u8	valid;
-};
-
-/* hwrm_cfa_ntuple_filter_alloc_cmd_err (size:64b/8B) */
-struct hwrm_cfa_ntuple_filter_alloc_cmd_err {
-	u8	code;
-	#define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_UNKNOWN                   0x0UL
-	#define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_RX_MASK_VLAN_CONFLICT_ERR 0x1UL
-	#define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_LAST                     CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_RX_MASK_VLAN_CONFLICT_ERR
-	u8	unused_0[7];
-};
-
-/* hwrm_cfa_ntuple_filter_free_input (size:192b/24B) */
-struct hwrm_cfa_ntuple_filter_free_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le64	ntuple_filter_id;
-};
-
-/* hwrm_cfa_ntuple_filter_free_output (size:128b/16B) */
-struct hwrm_cfa_ntuple_filter_free_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_cfa_ntuple_filter_cfg_input (size:384b/48B) */
-struct hwrm_cfa_ntuple_filter_cfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	enables;
-	#define CFA_NTUPLE_FILTER_CFG_REQ_ENABLES_NEW_DST_ID                0x1UL
-	#define CFA_NTUPLE_FILTER_CFG_REQ_ENABLES_NEW_MIRROR_VNIC_ID        0x2UL
-	#define CFA_NTUPLE_FILTER_CFG_REQ_ENABLES_NEW_METER_INSTANCE_ID     0x4UL
-	__le32	flags;
-	#define CFA_NTUPLE_FILTER_CFG_REQ_FLAGS_DEST_FID              0x1UL
-	#define CFA_NTUPLE_FILTER_CFG_REQ_FLAGS_DEST_RFS_RING_IDX     0x2UL
-	#define CFA_NTUPLE_FILTER_CFG_REQ_FLAGS_NO_L2_CONTEXT         0x4UL
-	__le64	ntuple_filter_id;
-	__le32	new_dst_id;
-	__le32	new_mirror_vnic_id;
-	__le16	new_meter_instance_id;
-	#define CFA_NTUPLE_FILTER_CFG_REQ_NEW_METER_INSTANCE_ID_INVALID 0xffffUL
-	#define CFA_NTUPLE_FILTER_CFG_REQ_NEW_METER_INSTANCE_ID_LAST   CFA_NTUPLE_FILTER_CFG_REQ_NEW_METER_INSTANCE_ID_INVALID
-	u8	unused_1[6];
-};
-
-/* hwrm_cfa_ntuple_filter_cfg_output (size:128b/16B) */
-struct hwrm_cfa_ntuple_filter_cfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_cfa_decap_filter_alloc_input (size:832b/104B) */
-struct hwrm_cfa_decap_filter_alloc_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	flags;
-	#define CFA_DECAP_FILTER_ALLOC_REQ_FLAGS_OVS_TUNNEL     0x1UL
-	__le32	enables;
-	#define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_TUNNEL_TYPE        0x1UL
-	#define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_TUNNEL_ID          0x2UL
-	#define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_SRC_MACADDR        0x4UL
-	#define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_DST_MACADDR        0x8UL
-	#define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_OVLAN_VID          0x10UL
-	#define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_IVLAN_VID          0x20UL
-	#define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_T_OVLAN_VID        0x40UL
-	#define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_T_IVLAN_VID        0x80UL
-	#define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_ETHERTYPE          0x100UL
-	#define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_SRC_IPADDR         0x200UL
-	#define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_DST_IPADDR         0x400UL
-	#define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_IPADDR_TYPE        0x800UL
-	#define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_IP_PROTOCOL        0x1000UL
-	#define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_SRC_PORT           0x2000UL
-	#define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_DST_PORT           0x4000UL
-	#define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_DST_ID             0x8000UL
-	#define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_MIRROR_VNIC_ID     0x10000UL
-	__be32	tunnel_id;
-	u8	tunnel_type;
-	#define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_NONTUNNEL    0x0UL
-	#define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN        0x1UL
-	#define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_NVGRE        0x2UL
-	#define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_L2GRE        0x3UL
-	#define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_IPIP         0x4UL
-	#define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_GENEVE       0x5UL
-	#define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_MPLS         0x6UL
-	#define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_STT          0x7UL
-	#define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_IPGRE        0x8UL
-	#define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN_V4     0x9UL
-	#define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_IPGRE_V1     0xaUL
-	#define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_L2_ETYPE     0xbUL
-	#define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN_GPE_V6 0xcUL
-	#define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN_GPE    0x10UL
-	#define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_ANYTUNNEL    0xffUL
-	#define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_LAST        CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_ANYTUNNEL
-	u8	unused_0;
-	__le16	unused_1;
-	u8	src_macaddr[6];
-	u8	unused_2[2];
-	u8	dst_macaddr[6];
-	__be16	ovlan_vid;
-	__be16	ivlan_vid;
-	__be16	t_ovlan_vid;
-	__be16	t_ivlan_vid;
-	__be16	ethertype;
-	u8	ip_addr_type;
-	#define CFA_DECAP_FILTER_ALLOC_REQ_IP_ADDR_TYPE_UNKNOWN 0x0UL
-	#define CFA_DECAP_FILTER_ALLOC_REQ_IP_ADDR_TYPE_IPV4    0x4UL
-	#define CFA_DECAP_FILTER_ALLOC_REQ_IP_ADDR_TYPE_IPV6    0x6UL
-	#define CFA_DECAP_FILTER_ALLOC_REQ_IP_ADDR_TYPE_LAST   CFA_DECAP_FILTER_ALLOC_REQ_IP_ADDR_TYPE_IPV6
-	u8	ip_protocol;
-	#define CFA_DECAP_FILTER_ALLOC_REQ_IP_PROTOCOL_UNKNOWN 0x0UL
-	#define CFA_DECAP_FILTER_ALLOC_REQ_IP_PROTOCOL_TCP     0x6UL
-	#define CFA_DECAP_FILTER_ALLOC_REQ_IP_PROTOCOL_UDP     0x11UL
-	#define CFA_DECAP_FILTER_ALLOC_REQ_IP_PROTOCOL_LAST   CFA_DECAP_FILTER_ALLOC_REQ_IP_PROTOCOL_UDP
-	__le16	unused_3;
-	__le32	unused_4;
-	__be32	src_ipaddr[4];
-	__be32	dst_ipaddr[4];
-	__be16	src_port;
-	__be16	dst_port;
-	__le16	dst_id;
-	__le16	l2_ctxt_ref_id;
-};
-
-/* hwrm_cfa_decap_filter_alloc_output (size:128b/16B) */
-struct hwrm_cfa_decap_filter_alloc_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le32	decap_filter_id;
-	u8	unused_0[3];
-	u8	valid;
-};
-
-/* hwrm_cfa_decap_filter_free_input (size:192b/24B) */
-struct hwrm_cfa_decap_filter_free_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	decap_filter_id;
-	u8	unused_0[4];
-};
-
-/* hwrm_cfa_decap_filter_free_output (size:128b/16B) */
-struct hwrm_cfa_decap_filter_free_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_cfa_flow_alloc_input (size:1024b/128B) */
-struct hwrm_cfa_flow_alloc_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	flags;
-	#define CFA_FLOW_ALLOC_REQ_FLAGS_TUNNEL                 0x1UL
-	#define CFA_FLOW_ALLOC_REQ_FLAGS_NUM_VLAN_MASK          0x6UL
-	#define CFA_FLOW_ALLOC_REQ_FLAGS_NUM_VLAN_SFT           1
-	#define CFA_FLOW_ALLOC_REQ_FLAGS_NUM_VLAN_NONE            (0x0UL << 1)
-	#define CFA_FLOW_ALLOC_REQ_FLAGS_NUM_VLAN_ONE             (0x1UL << 1)
-	#define CFA_FLOW_ALLOC_REQ_FLAGS_NUM_VLAN_TWO             (0x2UL << 1)
-	#define CFA_FLOW_ALLOC_REQ_FLAGS_NUM_VLAN_LAST           CFA_FLOW_ALLOC_REQ_FLAGS_NUM_VLAN_TWO
-	#define CFA_FLOW_ALLOC_REQ_FLAGS_FLOWTYPE_MASK          0x38UL
-	#define CFA_FLOW_ALLOC_REQ_FLAGS_FLOWTYPE_SFT           3
-	#define CFA_FLOW_ALLOC_REQ_FLAGS_FLOWTYPE_L2              (0x0UL << 3)
-	#define CFA_FLOW_ALLOC_REQ_FLAGS_FLOWTYPE_IPV4            (0x1UL << 3)
-	#define CFA_FLOW_ALLOC_REQ_FLAGS_FLOWTYPE_IPV6            (0x2UL << 3)
-	#define CFA_FLOW_ALLOC_REQ_FLAGS_FLOWTYPE_LAST           CFA_FLOW_ALLOC_REQ_FLAGS_FLOWTYPE_IPV6
-	#define CFA_FLOW_ALLOC_REQ_FLAGS_PATH_TX                0x40UL
-	#define CFA_FLOW_ALLOC_REQ_FLAGS_PATH_RX                0x80UL
-	#define CFA_FLOW_ALLOC_REQ_FLAGS_MATCH_VXLAN_IP_VNI     0x100UL
-	#define CFA_FLOW_ALLOC_REQ_FLAGS_VHOST_ID_USE_VLAN      0x200UL
-	__le16	src_fid;
-	__le32	tunnel_handle;
-	__le16	action_flags;
-	#define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_FWD                       0x1UL
-	#define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_RECYCLE                   0x2UL
-	#define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_DROP                      0x4UL
-	#define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_METER                     0x8UL
-	#define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_TUNNEL                    0x10UL
-	#define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_NAT_SRC                   0x20UL
-	#define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_NAT_DEST                  0x40UL
-	#define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_NAT_IPV4_ADDRESS          0x80UL
-	#define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_L2_HEADER_REWRITE         0x100UL
-	#define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_TTL_DECREMENT             0x200UL
-	#define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_TUNNEL_IP                 0x400UL
-	#define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_FLOW_AGING_ENABLED        0x800UL
-	#define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_PRI_HINT                  0x1000UL
-	#define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_NO_FLOW_COUNTER_ALLOC     0x2000UL
-	__le16	dst_fid;
-	__be16	l2_rewrite_vlan_tpid;
-	__be16	l2_rewrite_vlan_tci;
-	__le16	act_meter_id;
-	__le16	ref_flow_handle;
-	__be16	ethertype;
-	__be16	outer_vlan_tci;
-	__be16	dmac[3];
-	__be16	inner_vlan_tci;
-	__be16	smac[3];
-	u8	ip_dst_mask_len;
-	u8	ip_src_mask_len;
-	__be32	ip_dst[4];
-	__be32	ip_src[4];
-	__be16	l4_src_port;
-	__be16	l4_src_port_mask;
-	__be16	l4_dst_port;
-	__be16	l4_dst_port_mask;
-	__be32	nat_ip_address[4];
-	__be16	l2_rewrite_dmac[3];
-	__be16	nat_port;
-	__be16	l2_rewrite_smac[3];
-	u8	ip_proto;
-	u8	tunnel_type;
-	#define CFA_FLOW_ALLOC_REQ_TUNNEL_TYPE_NONTUNNEL    0x0UL
-	#define CFA_FLOW_ALLOC_REQ_TUNNEL_TYPE_VXLAN        0x1UL
-	#define CFA_FLOW_ALLOC_REQ_TUNNEL_TYPE_NVGRE        0x2UL
-	#define CFA_FLOW_ALLOC_REQ_TUNNEL_TYPE_L2GRE        0x3UL
-	#define CFA_FLOW_ALLOC_REQ_TUNNEL_TYPE_IPIP         0x4UL
-	#define CFA_FLOW_ALLOC_REQ_TUNNEL_TYPE_GENEVE       0x5UL
-	#define CFA_FLOW_ALLOC_REQ_TUNNEL_TYPE_MPLS         0x6UL
-	#define CFA_FLOW_ALLOC_REQ_TUNNEL_TYPE_STT          0x7UL
-	#define CFA_FLOW_ALLOC_REQ_TUNNEL_TYPE_IPGRE        0x8UL
-	#define CFA_FLOW_ALLOC_REQ_TUNNEL_TYPE_VXLAN_V4     0x9UL
-	#define CFA_FLOW_ALLOC_REQ_TUNNEL_TYPE_IPGRE_V1     0xaUL
-	#define CFA_FLOW_ALLOC_REQ_TUNNEL_TYPE_L2_ETYPE     0xbUL
-	#define CFA_FLOW_ALLOC_REQ_TUNNEL_TYPE_VXLAN_GPE_V6 0xcUL
-	#define CFA_FLOW_ALLOC_REQ_TUNNEL_TYPE_VXLAN_GPE    0x10UL
-	#define CFA_FLOW_ALLOC_REQ_TUNNEL_TYPE_ANYTUNNEL    0xffUL
-	#define CFA_FLOW_ALLOC_REQ_TUNNEL_TYPE_LAST        CFA_FLOW_ALLOC_REQ_TUNNEL_TYPE_ANYTUNNEL
-};
-
-/* hwrm_cfa_flow_alloc_output (size:256b/32B) */
-struct hwrm_cfa_flow_alloc_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le16	flow_handle;
-	u8	unused_0[2];
-	__le32	flow_id;
-	#define CFA_FLOW_ALLOC_RESP_FLOW_ID_VALUE_MASK 0x3fffffffUL
-	#define CFA_FLOW_ALLOC_RESP_FLOW_ID_VALUE_SFT 0
-	#define CFA_FLOW_ALLOC_RESP_FLOW_ID_TYPE      0x40000000UL
-	#define CFA_FLOW_ALLOC_RESP_FLOW_ID_TYPE_INT    (0x0UL << 30)
-	#define CFA_FLOW_ALLOC_RESP_FLOW_ID_TYPE_EXT    (0x1UL << 30)
-	#define CFA_FLOW_ALLOC_RESP_FLOW_ID_TYPE_LAST  CFA_FLOW_ALLOC_RESP_FLOW_ID_TYPE_EXT
-	#define CFA_FLOW_ALLOC_RESP_FLOW_ID_DIR       0x80000000UL
-	#define CFA_FLOW_ALLOC_RESP_FLOW_ID_DIR_RX      (0x0UL << 31)
-	#define CFA_FLOW_ALLOC_RESP_FLOW_ID_DIR_TX      (0x1UL << 31)
-	#define CFA_FLOW_ALLOC_RESP_FLOW_ID_DIR_LAST   CFA_FLOW_ALLOC_RESP_FLOW_ID_DIR_TX
-	__le64	ext_flow_handle;
-	__le32	flow_counter_id;
-	u8	unused_1[3];
-	u8	valid;
-};
-
-/* hwrm_cfa_flow_alloc_cmd_err (size:64b/8B) */
-struct hwrm_cfa_flow_alloc_cmd_err {
-	u8	code;
-	#define CFA_FLOW_ALLOC_CMD_ERR_CODE_UNKNOWN         0x0UL
-	#define CFA_FLOW_ALLOC_CMD_ERR_CODE_L2_CONTEXT_TCAM 0x1UL
-	#define CFA_FLOW_ALLOC_CMD_ERR_CODE_ACTION_RECORD   0x2UL
-	#define CFA_FLOW_ALLOC_CMD_ERR_CODE_FLOW_COUNTER    0x3UL
-	#define CFA_FLOW_ALLOC_CMD_ERR_CODE_WILD_CARD_TCAM  0x4UL
-	#define CFA_FLOW_ALLOC_CMD_ERR_CODE_HASH_COLLISION  0x5UL
-	#define CFA_FLOW_ALLOC_CMD_ERR_CODE_KEY_EXISTS      0x6UL
-	#define CFA_FLOW_ALLOC_CMD_ERR_CODE_FLOW_CTXT_DB    0x7UL
-	#define CFA_FLOW_ALLOC_CMD_ERR_CODE_LAST           CFA_FLOW_ALLOC_CMD_ERR_CODE_FLOW_CTXT_DB
-	u8	unused_0[7];
-};
-
-/* hwrm_cfa_flow_free_input (size:256b/32B) */
-struct hwrm_cfa_flow_free_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	flow_handle;
-	__le16	unused_0;
-	__le32	flow_counter_id;
-	__le64	ext_flow_handle;
-};
-
-/* hwrm_cfa_flow_free_output (size:256b/32B) */
-struct hwrm_cfa_flow_free_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le64	packet;
-	__le64	byte;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_cfa_flow_info_input (size:256b/32B) */
-struct hwrm_cfa_flow_info_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	flow_handle;
-	#define CFA_FLOW_INFO_REQ_FLOW_HANDLE_MAX_MASK      0xfffUL
-	#define CFA_FLOW_INFO_REQ_FLOW_HANDLE_CNP_CNT       0x1000UL
-	#define CFA_FLOW_INFO_REQ_FLOW_HANDLE_ROCEV1_CNT    0x2000UL
-	#define CFA_FLOW_INFO_REQ_FLOW_HANDLE_NIC_TX        0x3000UL
-	#define CFA_FLOW_INFO_REQ_FLOW_HANDLE_ROCEV2_CNT    0x4000UL
-	#define CFA_FLOW_INFO_REQ_FLOW_HANDLE_DIR_RX        0x8000UL
-	#define CFA_FLOW_INFO_REQ_FLOW_HANDLE_CNP_CNT_RX    0x9000UL
-	#define CFA_FLOW_INFO_REQ_FLOW_HANDLE_ROCEV1_CNT_RX 0xa000UL
-	#define CFA_FLOW_INFO_REQ_FLOW_HANDLE_NIC_RX        0xb000UL
-	#define CFA_FLOW_INFO_REQ_FLOW_HANDLE_ROCEV2_CNT_RX 0xc000UL
-	#define CFA_FLOW_INFO_REQ_FLOW_HANDLE_LAST         CFA_FLOW_INFO_REQ_FLOW_HANDLE_ROCEV2_CNT_RX
-	u8	unused_0[6];
-	__le64	ext_flow_handle;
-};
-
-/* hwrm_cfa_flow_info_output (size:5632b/704B) */
-struct hwrm_cfa_flow_info_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	flags;
-	#define CFA_FLOW_INFO_RESP_FLAGS_PATH_TX     0x1UL
-	#define CFA_FLOW_INFO_RESP_FLAGS_PATH_RX     0x2UL
-	u8	profile;
-	__le16	src_fid;
-	__le16	dst_fid;
-	__le16	l2_ctxt_id;
-	__le64	em_info;
-	__le64	tcam_info;
-	__le64	vfp_tcam_info;
-	__le16	ar_id;
-	__le16	flow_handle;
-	__le32	tunnel_handle;
-	__le16	flow_timer;
-	u8	unused_0[6];
-	__le32	flow_key_data[130];
-	__le32	flow_action_info[30];
-	u8	unused_1[7];
-	u8	valid;
-};
-
-/* hwrm_cfa_flow_stats_input (size:640b/80B) */
-struct hwrm_cfa_flow_stats_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	num_flows;
-	__le16	flow_handle_0;
-	__le16	flow_handle_1;
-	__le16	flow_handle_2;
-	__le16	flow_handle_3;
-	__le16	flow_handle_4;
-	__le16	flow_handle_5;
-	__le16	flow_handle_6;
-	__le16	flow_handle_7;
-	__le16	flow_handle_8;
-	__le16	flow_handle_9;
-	u8	unused_0[2];
-	__le32	flow_id_0;
-	__le32	flow_id_1;
-	__le32	flow_id_2;
-	__le32	flow_id_3;
-	__le32	flow_id_4;
-	__le32	flow_id_5;
-	__le32	flow_id_6;
-	__le32	flow_id_7;
-	__le32	flow_id_8;
-	__le32	flow_id_9;
-};
-
-/* hwrm_cfa_flow_stats_output (size:1408b/176B) */
-struct hwrm_cfa_flow_stats_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le64	packet_0;
-	__le64	packet_1;
-	__le64	packet_2;
-	__le64	packet_3;
-	__le64	packet_4;
-	__le64	packet_5;
-	__le64	packet_6;
-	__le64	packet_7;
-	__le64	packet_8;
-	__le64	packet_9;
-	__le64	byte_0;
-	__le64	byte_1;
-	__le64	byte_2;
-	__le64	byte_3;
-	__le64	byte_4;
-	__le64	byte_5;
-	__le64	byte_6;
-	__le64	byte_7;
-	__le64	byte_8;
-	__le64	byte_9;
-	__le16	flow_hits;
-	u8	unused_0[5];
-	u8	valid;
-};
-
-/* hwrm_cfa_vfr_alloc_input (size:448b/56B) */
-struct hwrm_cfa_vfr_alloc_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	vf_id;
-	__le16	reserved;
-	u8	unused_0[4];
-	char	vfr_name[32];
-};
-
-/* hwrm_cfa_vfr_alloc_output (size:128b/16B) */
-struct hwrm_cfa_vfr_alloc_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le16	rx_cfa_code;
-	__le16	tx_cfa_action;
-	u8	unused_0[3];
-	u8	valid;
-};
-
-/* hwrm_cfa_vfr_free_input (size:448b/56B) */
-struct hwrm_cfa_vfr_free_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	char	vfr_name[32];
-	__le16	vf_id;
-	__le16	reserved;
-	u8	unused_0[4];
-};
-
-/* hwrm_cfa_vfr_free_output (size:128b/16B) */
-struct hwrm_cfa_vfr_free_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_cfa_eem_qcaps_input (size:192b/24B) */
-struct hwrm_cfa_eem_qcaps_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	flags;
-	#define CFA_EEM_QCAPS_REQ_FLAGS_PATH_TX               0x1UL
-	#define CFA_EEM_QCAPS_REQ_FLAGS_PATH_RX               0x2UL
-	#define CFA_EEM_QCAPS_REQ_FLAGS_PREFERRED_OFFLOAD     0x4UL
-	__le32	unused_0;
-};
-
-/* hwrm_cfa_eem_qcaps_output (size:320b/40B) */
-struct hwrm_cfa_eem_qcaps_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le32	flags;
-	#define CFA_EEM_QCAPS_RESP_FLAGS_PATH_TX                                         0x1UL
-	#define CFA_EEM_QCAPS_RESP_FLAGS_PATH_RX                                         0x2UL
-	#define CFA_EEM_QCAPS_RESP_FLAGS_CENTRALIZED_MEMORY_MODEL_SUPPORTED              0x4UL
-	#define CFA_EEM_QCAPS_RESP_FLAGS_DETACHED_CENTRALIZED_MEMORY_MODEL_SUPPORTED     0x8UL
-	__le32	unused_0;
-	__le32	supported;
-	#define CFA_EEM_QCAPS_RESP_SUPPORTED_KEY0_TABLE                       0x1UL
-	#define CFA_EEM_QCAPS_RESP_SUPPORTED_KEY1_TABLE                       0x2UL
-	#define CFA_EEM_QCAPS_RESP_SUPPORTED_EXTERNAL_RECORD_TABLE            0x4UL
-	#define CFA_EEM_QCAPS_RESP_SUPPORTED_EXTERNAL_FLOW_COUNTERS_TABLE     0x8UL
-	#define CFA_EEM_QCAPS_RESP_SUPPORTED_FID_TABLE                        0x10UL
-	__le32	max_entries_supported;
-	__le16	key_entry_size;
-	__le16	record_entry_size;
-	__le16	efc_entry_size;
-	__le16	fid_entry_size;
-	u8	unused_1[7];
-	u8	valid;
-};
-
-/* hwrm_cfa_eem_cfg_input (size:384b/48B) */
-struct hwrm_cfa_eem_cfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	flags;
-	#define CFA_EEM_CFG_REQ_FLAGS_PATH_TX               0x1UL
-	#define CFA_EEM_CFG_REQ_FLAGS_PATH_RX               0x2UL
-	#define CFA_EEM_CFG_REQ_FLAGS_PREFERRED_OFFLOAD     0x4UL
-	#define CFA_EEM_CFG_REQ_FLAGS_SECONDARY_PF          0x8UL
-	__le16	group_id;
-	__le16	unused_0;
-	__le32	num_entries;
-	__le32	unused_1;
-	__le16	key0_ctx_id;
-	__le16	key1_ctx_id;
-	__le16	record_ctx_id;
-	__le16	efc_ctx_id;
-	__le16	fid_ctx_id;
-	__le16	unused_2;
-	__le32	unused_3;
-};
-
-/* hwrm_cfa_eem_cfg_output (size:128b/16B) */
-struct hwrm_cfa_eem_cfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_cfa_eem_qcfg_input (size:192b/24B) */
-struct hwrm_cfa_eem_qcfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	flags;
-	#define CFA_EEM_QCFG_REQ_FLAGS_PATH_TX     0x1UL
-	#define CFA_EEM_QCFG_REQ_FLAGS_PATH_RX     0x2UL
-	__le32	unused_0;
-};
-
-/* hwrm_cfa_eem_qcfg_output (size:256b/32B) */
-struct hwrm_cfa_eem_qcfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le32	flags;
-	#define CFA_EEM_QCFG_RESP_FLAGS_PATH_TX               0x1UL
-	#define CFA_EEM_QCFG_RESP_FLAGS_PATH_RX               0x2UL
-	#define CFA_EEM_QCFG_RESP_FLAGS_PREFERRED_OFFLOAD     0x4UL
-	__le32	num_entries;
-	__le16	key0_ctx_id;
-	__le16	key1_ctx_id;
-	__le16	record_ctx_id;
-	__le16	efc_ctx_id;
-	__le16	fid_ctx_id;
-	u8	unused_2[5];
-	u8	valid;
-};
-
-/* hwrm_cfa_eem_op_input (size:192b/24B) */
-struct hwrm_cfa_eem_op_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	flags;
-	#define CFA_EEM_OP_REQ_FLAGS_PATH_TX     0x1UL
-	#define CFA_EEM_OP_REQ_FLAGS_PATH_RX     0x2UL
-	__le16	unused_0;
-	__le16	op;
-	#define CFA_EEM_OP_REQ_OP_RESERVED    0x0UL
-	#define CFA_EEM_OP_REQ_OP_EEM_DISABLE 0x1UL
-	#define CFA_EEM_OP_REQ_OP_EEM_ENABLE  0x2UL
-	#define CFA_EEM_OP_REQ_OP_EEM_CLEANUP 0x3UL
-	#define CFA_EEM_OP_REQ_OP_LAST       CFA_EEM_OP_REQ_OP_EEM_CLEANUP
-};
-
-/* hwrm_cfa_eem_op_output (size:128b/16B) */
-struct hwrm_cfa_eem_op_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_cfa_adv_flow_mgnt_qcaps_input (size:256b/32B) */
-struct hwrm_cfa_adv_flow_mgnt_qcaps_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	unused_0[4];
-};
-
-/* hwrm_cfa_adv_flow_mgnt_qcaps_output (size:128b/16B) */
-struct hwrm_cfa_adv_flow_mgnt_qcaps_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le32	flags;
-	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_FLOW_HND_16BIT_SUPPORTED                     0x1UL
-	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_FLOW_HND_64BIT_SUPPORTED                     0x2UL
-	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_FLOW_BATCH_DELETE_SUPPORTED                  0x4UL
-	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_FLOW_RESET_ALL_SUPPORTED                     0x8UL
-	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_NTUPLE_FLOW_DEST_FUNC_SUPPORTED              0x10UL
-	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_TX_EEM_FLOW_SUPPORTED                        0x20UL
-	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_RX_EEM_FLOW_SUPPORTED                        0x40UL
-	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_FLOW_COUNTER_ALLOC_SUPPORTED                 0x80UL
-	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_RFS_RING_TBL_IDX_SUPPORTED                   0x100UL
-	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_UNTAGGED_VLAN_SUPPORTED                      0x200UL
-	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_XDP_SUPPORTED                                0x400UL
-	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_L2_HEADER_SOURCE_FIELDS_SUPPORTED            0x800UL
-	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_NTUPLE_FLOW_RX_ARP_SUPPORTED                 0x1000UL
-	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_RFS_RING_TBL_IDX_V2_SUPPORTED                0x2000UL
-	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_NTUPLE_FLOW_RX_ETHERTYPE_IP_SUPPORTED        0x4000UL
-	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_TRUFLOW_CAPABLE                              0x8000UL
-	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_L2_FILTER_TRAFFIC_TYPE_L2_ROCE_SUPPORTED     0x10000UL
-	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_LAG_SUPPORTED                                0x20000UL
-	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_NTUPLE_FLOW_NO_L2CTX_SUPPORTED               0x40000UL
-	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_NIC_FLOW_STATS_SUPPORTED                     0x80000UL
-	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_NTUPLE_FLOW_RX_EXT_IP_PROTO_SUPPORTED        0x100000UL
-	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_RFS_RING_TBL_IDX_V3_SUPPORTED                0x200000UL
-	u8	unused_0[3];
-	u8	valid;
-};
-
-/* hwrm_tunnel_dst_port_query_input (size:192b/24B) */
-struct hwrm_tunnel_dst_port_query_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	u8	tunnel_type;
-	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_VXLAN              0x1UL
-	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_GENEVE             0x5UL
-	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_VXLAN_V4           0x9UL
-	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_IPGRE_V1           0xaUL
-	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_L2_ETYPE           0xbUL
-	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_VXLAN_GPE_V6       0xcUL
-	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_CUSTOM_GRE         0xdUL
-	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_ECPRI              0xeUL
-	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_SRV6               0xfUL
-	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_VXLAN_GPE          0x10UL
-	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_GRE                0x11UL
-	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_ULP_DYN_UPAR       0x12UL
-	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES01 0x13UL
-	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES02 0x14UL
-	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES03 0x15UL
-	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES04 0x16UL
-	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES05 0x17UL
-	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES06 0x18UL
-	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES07 0x19UL
-	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_LAST              TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES07
-	u8	tunnel_next_proto;
-	u8	unused_0[6];
-};
-
-/* hwrm_tunnel_dst_port_query_output (size:128b/16B) */
-struct hwrm_tunnel_dst_port_query_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le16	tunnel_dst_port_id;
-	__be16	tunnel_dst_port_val;
-	u8	upar_in_use;
-	#define TUNNEL_DST_PORT_QUERY_RESP_UPAR_IN_USE_UPAR0     0x1UL
-	#define TUNNEL_DST_PORT_QUERY_RESP_UPAR_IN_USE_UPAR1     0x2UL
-	#define TUNNEL_DST_PORT_QUERY_RESP_UPAR_IN_USE_UPAR2     0x4UL
-	#define TUNNEL_DST_PORT_QUERY_RESP_UPAR_IN_USE_UPAR3     0x8UL
-	#define TUNNEL_DST_PORT_QUERY_RESP_UPAR_IN_USE_UPAR4     0x10UL
-	#define TUNNEL_DST_PORT_QUERY_RESP_UPAR_IN_USE_UPAR5     0x20UL
-	#define TUNNEL_DST_PORT_QUERY_RESP_UPAR_IN_USE_UPAR6     0x40UL
-	#define TUNNEL_DST_PORT_QUERY_RESP_UPAR_IN_USE_UPAR7     0x80UL
-	u8	status;
-	#define TUNNEL_DST_PORT_QUERY_RESP_STATUS_CHIP_LEVEL     0x1UL
-	#define TUNNEL_DST_PORT_QUERY_RESP_STATUS_FUNC_LEVEL     0x2UL
-	u8	unused_0;
-	u8	valid;
-};
-
-/* hwrm_tunnel_dst_port_alloc_input (size:192b/24B) */
-struct hwrm_tunnel_dst_port_alloc_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	u8	tunnel_type;
-	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_VXLAN              0x1UL
-	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_GENEVE             0x5UL
-	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_VXLAN_V4           0x9UL
-	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_IPGRE_V1           0xaUL
-	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_L2_ETYPE           0xbUL
-	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_VXLAN_GPE_V6       0xcUL
-	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_CUSTOM_GRE         0xdUL
-	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_ECPRI              0xeUL
-	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_SRV6               0xfUL
-	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_VXLAN_GPE          0x10UL
-	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_GRE                0x11UL
-	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_ULP_DYN_UPAR       0x12UL
-	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES01 0x13UL
-	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES02 0x14UL
-	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES03 0x15UL
-	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES04 0x16UL
-	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES05 0x17UL
-	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES06 0x18UL
-	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES07 0x19UL
-	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_LAST              TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES07
-	u8	tunnel_next_proto;
-	__be16	tunnel_dst_port_val;
-	u8	unused_0[4];
-};
-
-/* hwrm_tunnel_dst_port_alloc_output (size:128b/16B) */
-struct hwrm_tunnel_dst_port_alloc_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le16	tunnel_dst_port_id;
-	u8	error_info;
-	#define TUNNEL_DST_PORT_ALLOC_RESP_ERROR_INFO_SUCCESS         0x0UL
-	#define TUNNEL_DST_PORT_ALLOC_RESP_ERROR_INFO_ERR_ALLOCATED   0x1UL
-	#define TUNNEL_DST_PORT_ALLOC_RESP_ERROR_INFO_ERR_NO_RESOURCE 0x2UL
-	#define TUNNEL_DST_PORT_ALLOC_RESP_ERROR_INFO_ERR_ENABLED     0x3UL
-	#define TUNNEL_DST_PORT_ALLOC_RESP_ERROR_INFO_LAST           TUNNEL_DST_PORT_ALLOC_RESP_ERROR_INFO_ERR_ENABLED
-	u8	upar_in_use;
-	#define TUNNEL_DST_PORT_ALLOC_RESP_UPAR_IN_USE_UPAR0     0x1UL
-	#define TUNNEL_DST_PORT_ALLOC_RESP_UPAR_IN_USE_UPAR1     0x2UL
-	#define TUNNEL_DST_PORT_ALLOC_RESP_UPAR_IN_USE_UPAR2     0x4UL
-	#define TUNNEL_DST_PORT_ALLOC_RESP_UPAR_IN_USE_UPAR3     0x8UL
-	#define TUNNEL_DST_PORT_ALLOC_RESP_UPAR_IN_USE_UPAR4     0x10UL
-	#define TUNNEL_DST_PORT_ALLOC_RESP_UPAR_IN_USE_UPAR5     0x20UL
-	#define TUNNEL_DST_PORT_ALLOC_RESP_UPAR_IN_USE_UPAR6     0x40UL
-	#define TUNNEL_DST_PORT_ALLOC_RESP_UPAR_IN_USE_UPAR7     0x80UL
-	u8	unused_0[3];
-	u8	valid;
-};
-
-/* hwrm_tunnel_dst_port_free_input (size:192b/24B) */
-struct hwrm_tunnel_dst_port_free_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	u8	tunnel_type;
-	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN              0x1UL
-	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_GENEVE             0x5UL
-	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN_V4           0x9UL
-	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_IPGRE_V1           0xaUL
-	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_L2_ETYPE           0xbUL
-	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN_GPE_V6       0xcUL
-	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_CUSTOM_GRE         0xdUL
-	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_ECPRI              0xeUL
-	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_SRV6               0xfUL
-	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN_GPE          0x10UL
-	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_GRE                0x11UL
-	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_ULP_DYN_UPAR       0x12UL
-	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES01 0x13UL
-	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES02 0x14UL
-	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES03 0x15UL
-	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES04 0x16UL
-	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES05 0x17UL
-	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES06 0x18UL
-	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES07 0x19UL
-	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_LAST              TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES07
-	u8	tunnel_next_proto;
-	__le16	tunnel_dst_port_id;
-	u8	unused_0[4];
-};
-
-/* hwrm_tunnel_dst_port_free_output (size:128b/16B) */
-struct hwrm_tunnel_dst_port_free_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	error_info;
-	#define TUNNEL_DST_PORT_FREE_RESP_ERROR_INFO_SUCCESS           0x0UL
-	#define TUNNEL_DST_PORT_FREE_RESP_ERROR_INFO_ERR_NOT_OWNER     0x1UL
-	#define TUNNEL_DST_PORT_FREE_RESP_ERROR_INFO_ERR_NOT_ALLOCATED 0x2UL
-	#define TUNNEL_DST_PORT_FREE_RESP_ERROR_INFO_LAST             TUNNEL_DST_PORT_FREE_RESP_ERROR_INFO_ERR_NOT_ALLOCATED
-	u8	unused_1[6];
-	u8	valid;
-};
-
-/* ctx_hw_stats (size:1280b/160B) */
-struct ctx_hw_stats {
-	__le64	rx_ucast_pkts;
-	__le64	rx_mcast_pkts;
-	__le64	rx_bcast_pkts;
-	__le64	rx_discard_pkts;
-	__le64	rx_error_pkts;
-	__le64	rx_ucast_bytes;
-	__le64	rx_mcast_bytes;
-	__le64	rx_bcast_bytes;
-	__le64	tx_ucast_pkts;
-	__le64	tx_mcast_pkts;
-	__le64	tx_bcast_pkts;
-	__le64	tx_error_pkts;
-	__le64	tx_discard_pkts;
-	__le64	tx_ucast_bytes;
-	__le64	tx_mcast_bytes;
-	__le64	tx_bcast_bytes;
-	__le64	tpa_pkts;
-	__le64	tpa_bytes;
-	__le64	tpa_events;
-	__le64	tpa_aborts;
-};
-
-/* ctx_hw_stats_ext (size:1408b/176B) */
-struct ctx_hw_stats_ext {
-	__le64	rx_ucast_pkts;
-	__le64	rx_mcast_pkts;
-	__le64	rx_bcast_pkts;
-	__le64	rx_discard_pkts;
-	__le64	rx_error_pkts;
-	__le64	rx_ucast_bytes;
-	__le64	rx_mcast_bytes;
-	__le64	rx_bcast_bytes;
-	__le64	tx_ucast_pkts;
-	__le64	tx_mcast_pkts;
-	__le64	tx_bcast_pkts;
-	__le64	tx_error_pkts;
-	__le64	tx_discard_pkts;
-	__le64	tx_ucast_bytes;
-	__le64	tx_mcast_bytes;
-	__le64	tx_bcast_bytes;
-	__le64	rx_tpa_eligible_pkt;
-	__le64	rx_tpa_eligible_bytes;
-	__le64	rx_tpa_pkt;
-	__le64	rx_tpa_bytes;
-	__le64	rx_tpa_errors;
-	__le64	rx_tpa_events;
-};
-
-/* hwrm_stat_ctx_alloc_input (size:384b/48B) */
-struct hwrm_stat_ctx_alloc_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le64	stats_dma_addr;
-	__le32	update_period_ms;
-	u8	stat_ctx_flags;
-	#define STAT_CTX_ALLOC_REQ_STAT_CTX_FLAGS_ROCE             0x1UL
-	#define STAT_CTX_ALLOC_REQ_STAT_CTX_FLAGS_DUP_HOST_BUF     0x2UL
-	u8	unused_0;
-	__le16	stats_dma_length;
-	__le16	flags;
-	#define STAT_CTX_ALLOC_REQ_FLAGS_STEERING_TAG_VALID     0x1UL
-	__le16	steering_tag;
-	__le32	stat_ctx_id;
-	__le16	alloc_seq_id;
-	u8	unused_1[6];
-};
-
-/* hwrm_stat_ctx_alloc_output (size:128b/16B) */
-struct hwrm_stat_ctx_alloc_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le32	stat_ctx_id;
-	u8	unused_0[3];
-	u8	valid;
-};
-
-/* hwrm_stat_ctx_free_input (size:192b/24B) */
-struct hwrm_stat_ctx_free_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	stat_ctx_id;
-	u8	unused_0[4];
-};
-
-/* hwrm_stat_ctx_free_output (size:128b/16B) */
-struct hwrm_stat_ctx_free_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le32	stat_ctx_id;
-	u8	unused_0[3];
-	u8	valid;
-};
-
-/* hwrm_stat_ctx_query_input (size:192b/24B) */
-struct hwrm_stat_ctx_query_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	stat_ctx_id;
-	u8	flags;
-	#define STAT_CTX_QUERY_REQ_FLAGS_COUNTER_MASK     0x1UL
-	u8	unused_0[3];
-};
-
-/* hwrm_stat_ctx_query_output (size:1408b/176B) */
-struct hwrm_stat_ctx_query_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le64	tx_ucast_pkts;
-	__le64	tx_mcast_pkts;
-	__le64	tx_bcast_pkts;
-	__le64	tx_discard_pkts;
-	__le64	tx_error_pkts;
-	__le64	tx_ucast_bytes;
-	__le64	tx_mcast_bytes;
-	__le64	tx_bcast_bytes;
-	__le64	rx_ucast_pkts;
-	__le64	rx_mcast_pkts;
-	__le64	rx_bcast_pkts;
-	__le64	rx_discard_pkts;
-	__le64	rx_error_pkts;
-	__le64	rx_ucast_bytes;
-	__le64	rx_mcast_bytes;
-	__le64	rx_bcast_bytes;
-	__le64	rx_agg_pkts;
-	__le64	rx_agg_bytes;
-	__le64	rx_agg_events;
-	__le64	rx_agg_aborts;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_stat_ext_ctx_query_input (size:192b/24B) */
-struct hwrm_stat_ext_ctx_query_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	stat_ctx_id;
-	u8	flags;
-	#define STAT_EXT_CTX_QUERY_REQ_FLAGS_COUNTER_MASK     0x1UL
-	u8	unused_0[3];
-};
-
-/* hwrm_stat_ext_ctx_query_output (size:1536b/192B) */
-struct hwrm_stat_ext_ctx_query_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le64	rx_ucast_pkts;
-	__le64	rx_mcast_pkts;
-	__le64	rx_bcast_pkts;
-	__le64	rx_discard_pkts;
-	__le64	rx_error_pkts;
-	__le64	rx_ucast_bytes;
-	__le64	rx_mcast_bytes;
-	__le64	rx_bcast_bytes;
-	__le64	tx_ucast_pkts;
-	__le64	tx_mcast_pkts;
-	__le64	tx_bcast_pkts;
-	__le64	tx_error_pkts;
-	__le64	tx_discard_pkts;
-	__le64	tx_ucast_bytes;
-	__le64	tx_mcast_bytes;
-	__le64	tx_bcast_bytes;
-	__le64	rx_tpa_eligible_pkt;
-	__le64	rx_tpa_eligible_bytes;
-	__le64	rx_tpa_pkt;
-	__le64	rx_tpa_bytes;
-	__le64	rx_tpa_errors;
-	__le64	rx_tpa_events;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_stat_ctx_clr_stats_input (size:192b/24B) */
-struct hwrm_stat_ctx_clr_stats_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	stat_ctx_id;
-	u8	unused_0[4];
-};
-
-/* hwrm_stat_ctx_clr_stats_output (size:128b/16B) */
-struct hwrm_stat_ctx_clr_stats_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_pcie_qstats_input (size:256b/32B) */
-struct hwrm_pcie_qstats_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	pcie_stat_size;
-	u8	unused_0[6];
-	__le64	pcie_stat_host_addr;
-};
-
-/* hwrm_pcie_qstats_output (size:128b/16B) */
-struct hwrm_pcie_qstats_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le16	pcie_stat_size;
-	u8	unused_0[5];
-	u8	valid;
-};
-
-/* pcie_ctx_hw_stats (size:768b/96B) */
-struct pcie_ctx_hw_stats {
-	__le64	pcie_pl_signal_integrity;
-	__le64	pcie_dl_signal_integrity;
-	__le64	pcie_tl_signal_integrity;
-	__le64	pcie_link_integrity;
-	__le64	pcie_tx_traffic_rate;
-	__le64	pcie_rx_traffic_rate;
-	__le64	pcie_tx_dllp_statistics;
-	__le64	pcie_rx_dllp_statistics;
-	__le64	pcie_equalization_time;
-	__le32	pcie_ltssm_histogram[4];
-	__le64	pcie_recovery_histogram;
-};
-
-/* pcie_ctx_hw_stats_v2 (size:4096b/512B) */
-struct pcie_ctx_hw_stats_v2 {
-	__le64	pcie_pl_signal_integrity;
-	__le64	pcie_dl_signal_integrity;
-	__le64	pcie_tl_signal_integrity;
-	__le64	pcie_link_integrity;
-	__le64	pcie_tx_traffic_rate;
-	__le64	pcie_rx_traffic_rate;
-	__le64	pcie_tx_dllp_statistics;
-	__le64	pcie_rx_dllp_statistics;
-	__le64	pcie_equalization_time;
-	__le32	pcie_ltssm_histogram[4];
-	__le64	pcie_recovery_histogram;
-	__le32	pcie_tl_credit_nph_histogram[8];
-	__le32	pcie_tl_credit_ph_histogram[8];
-	__le32	pcie_tl_credit_pd_histogram[8];
-	__le32	pcie_cmpl_latest_times[4];
-	__le32	pcie_cmpl_longest_time;
-	__le32	pcie_cmpl_shortest_time;
-	__le32	unused_0[2];
-	__le32	pcie_cmpl_latest_headers[4][4];
-	__le32	pcie_cmpl_longest_headers[4][4];
-	__le32	pcie_cmpl_shortest_headers[4][4];
-	__le32	pcie_wr_latency_histogram[12];
-	__le32	pcie_wr_latency_all_normal_count;
-	__le32	unused_1;
-	__le64	pcie_posted_packet_count;
-	__le64	pcie_non_posted_packet_count;
-	__le64	pcie_other_packet_count;
-	__le64	pcie_blocked_packet_count;
-	__le64	pcie_cmpl_packet_count;
-};
-
-/* hwrm_stat_generic_qstats_input (size:256b/32B) */
-struct hwrm_stat_generic_qstats_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	generic_stat_size;
-	u8	flags;
-	#define STAT_GENERIC_QSTATS_REQ_FLAGS_COUNTER_MASK     0x1UL
-	u8	unused_0[5];
-	__le64	generic_stat_host_addr;
-};
-
-/* hwrm_stat_generic_qstats_output (size:128b/16B) */
-struct hwrm_stat_generic_qstats_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le16	generic_stat_size;
-	u8	unused_0[5];
-	u8	valid;
-};
-
-/* generic_sw_hw_stats (size:1472b/184B) */
-struct generic_sw_hw_stats {
-	__le64	pcie_statistics_tx_tlp;
-	__le64	pcie_statistics_rx_tlp;
-	__le64	pcie_credit_fc_hdr_posted;
-	__le64	pcie_credit_fc_hdr_nonposted;
-	__le64	pcie_credit_fc_hdr_cmpl;
-	__le64	pcie_credit_fc_data_posted;
-	__le64	pcie_credit_fc_data_nonposted;
-	__le64	pcie_credit_fc_data_cmpl;
-	__le64	pcie_credit_fc_tgt_nonposted;
-	__le64	pcie_credit_fc_tgt_data_posted;
-	__le64	pcie_credit_fc_tgt_hdr_posted;
-	__le64	pcie_credit_fc_cmpl_hdr_posted;
-	__le64	pcie_credit_fc_cmpl_data_posted;
-	__le64	pcie_cmpl_longest;
-	__le64	pcie_cmpl_shortest;
-	__le64	cache_miss_count_cfcq;
-	__le64	cache_miss_count_cfcs;
-	__le64	cache_miss_count_cfcc;
-	__le64	cache_miss_count_cfcm;
-	__le64	hw_db_recov_dbs_dropped;
-	__le64	hw_db_recov_drops_serviced;
-	__le64	hw_db_recov_dbs_recovered;
-	__le64	hw_db_recov_oo_drop_count;
-};
-
-/* hwrm_fw_reset_input (size:192b/24B) */
-struct hwrm_fw_reset_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	u8	embedded_proc_type;
-	#define FW_RESET_REQ_EMBEDDED_PROC_TYPE_BOOT                  0x0UL
-	#define FW_RESET_REQ_EMBEDDED_PROC_TYPE_MGMT                  0x1UL
-	#define FW_RESET_REQ_EMBEDDED_PROC_TYPE_NETCTRL               0x2UL
-	#define FW_RESET_REQ_EMBEDDED_PROC_TYPE_ROCE                  0x3UL
-	#define FW_RESET_REQ_EMBEDDED_PROC_TYPE_HOST                  0x4UL
-	#define FW_RESET_REQ_EMBEDDED_PROC_TYPE_AP                    0x5UL
-	#define FW_RESET_REQ_EMBEDDED_PROC_TYPE_CHIP                  0x6UL
-	#define FW_RESET_REQ_EMBEDDED_PROC_TYPE_HOST_RESOURCE_REINIT  0x7UL
-	#define FW_RESET_REQ_EMBEDDED_PROC_TYPE_IMPACTLESS_ACTIVATION 0x8UL
-	#define FW_RESET_REQ_EMBEDDED_PROC_TYPE_LAST                 FW_RESET_REQ_EMBEDDED_PROC_TYPE_IMPACTLESS_ACTIVATION
-	u8	selfrst_status;
-	#define FW_RESET_REQ_SELFRST_STATUS_SELFRSTNONE      0x0UL
-	#define FW_RESET_REQ_SELFRST_STATUS_SELFRSTASAP      0x1UL
-	#define FW_RESET_REQ_SELFRST_STATUS_SELFRSTPCIERST   0x2UL
-	#define FW_RESET_REQ_SELFRST_STATUS_SELFRSTIMMEDIATE 0x3UL
-	#define FW_RESET_REQ_SELFRST_STATUS_LAST            FW_RESET_REQ_SELFRST_STATUS_SELFRSTIMMEDIATE
-	u8	host_idx;
-	u8	flags;
-	#define FW_RESET_REQ_FLAGS_RESET_GRACEFUL     0x1UL
-	#define FW_RESET_REQ_FLAGS_FW_ACTIVATION      0x2UL
-	u8	unused_0[4];
-};
-
-/* hwrm_fw_reset_output (size:128b/16B) */
-struct hwrm_fw_reset_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	selfrst_status;
-	#define FW_RESET_RESP_SELFRST_STATUS_SELFRSTNONE      0x0UL
-	#define FW_RESET_RESP_SELFRST_STATUS_SELFRSTASAP      0x1UL
-	#define FW_RESET_RESP_SELFRST_STATUS_SELFRSTPCIERST   0x2UL
-	#define FW_RESET_RESP_SELFRST_STATUS_SELFRSTIMMEDIATE 0x3UL
-	#define FW_RESET_RESP_SELFRST_STATUS_LAST            FW_RESET_RESP_SELFRST_STATUS_SELFRSTIMMEDIATE
-	u8	unused_0[6];
-	u8	valid;
-};
-
-/* hwrm_fw_qstatus_input (size:192b/24B) */
-struct hwrm_fw_qstatus_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	u8	embedded_proc_type;
-	#define FW_QSTATUS_REQ_EMBEDDED_PROC_TYPE_BOOT    0x0UL
-	#define FW_QSTATUS_REQ_EMBEDDED_PROC_TYPE_MGMT    0x1UL
-	#define FW_QSTATUS_REQ_EMBEDDED_PROC_TYPE_NETCTRL 0x2UL
-	#define FW_QSTATUS_REQ_EMBEDDED_PROC_TYPE_ROCE    0x3UL
-	#define FW_QSTATUS_REQ_EMBEDDED_PROC_TYPE_HOST    0x4UL
-	#define FW_QSTATUS_REQ_EMBEDDED_PROC_TYPE_AP      0x5UL
-	#define FW_QSTATUS_REQ_EMBEDDED_PROC_TYPE_CHIP    0x6UL
-	#define FW_QSTATUS_REQ_EMBEDDED_PROC_TYPE_LAST   FW_QSTATUS_REQ_EMBEDDED_PROC_TYPE_CHIP
-	u8	unused_0[7];
-};
-
-/* hwrm_fw_qstatus_output (size:128b/16B) */
-struct hwrm_fw_qstatus_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	selfrst_status;
-	#define FW_QSTATUS_RESP_SELFRST_STATUS_SELFRSTNONE    0x0UL
-	#define FW_QSTATUS_RESP_SELFRST_STATUS_SELFRSTASAP    0x1UL
-	#define FW_QSTATUS_RESP_SELFRST_STATUS_SELFRSTPCIERST 0x2UL
-	#define FW_QSTATUS_RESP_SELFRST_STATUS_SELFRSTPOWER   0x3UL
-	#define FW_QSTATUS_RESP_SELFRST_STATUS_LAST          FW_QSTATUS_RESP_SELFRST_STATUS_SELFRSTPOWER
-	u8	nvm_option_action_status;
-	#define FW_QSTATUS_RESP_NVM_OPTION_ACTION_STATUS_NVMOPT_ACTION_NONE     0x0UL
-	#define FW_QSTATUS_RESP_NVM_OPTION_ACTION_STATUS_NVMOPT_ACTION_HOTRESET 0x1UL
-	#define FW_QSTATUS_RESP_NVM_OPTION_ACTION_STATUS_NVMOPT_ACTION_WARMBOOT 0x2UL
-	#define FW_QSTATUS_RESP_NVM_OPTION_ACTION_STATUS_NVMOPT_ACTION_COLDBOOT 0x3UL
-	#define FW_QSTATUS_RESP_NVM_OPTION_ACTION_STATUS_LAST                  FW_QSTATUS_RESP_NVM_OPTION_ACTION_STATUS_NVMOPT_ACTION_COLDBOOT
-	u8	unused_0[5];
-	u8	valid;
-};
-
-/* hwrm_fw_set_time_input (size:256b/32B) */
-struct hwrm_fw_set_time_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	year;
-	#define FW_SET_TIME_REQ_YEAR_UNKNOWN 0x0UL
-	#define FW_SET_TIME_REQ_YEAR_LAST   FW_SET_TIME_REQ_YEAR_UNKNOWN
-	u8	month;
-	u8	day;
-	u8	hour;
-	u8	minute;
-	u8	second;
-	u8	unused_0;
-	__le16	millisecond;
-	__le16	zone;
-	#define FW_SET_TIME_REQ_ZONE_UTC     0
-	#define FW_SET_TIME_REQ_ZONE_UNKNOWN 65535
-	#define FW_SET_TIME_REQ_ZONE_LAST   FW_SET_TIME_REQ_ZONE_UNKNOWN
-	u8	unused_1[4];
-};
-
-/* hwrm_fw_set_time_output (size:128b/16B) */
-struct hwrm_fw_set_time_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_struct_hdr (size:128b/16B) */
-struct hwrm_struct_hdr {
-	__le16	struct_id;
-	#define STRUCT_HDR_STRUCT_ID_LLDP_CFG              0x41bUL
-	#define STRUCT_HDR_STRUCT_ID_DCBX_ETS              0x41dUL
-	#define STRUCT_HDR_STRUCT_ID_DCBX_PFC              0x41fUL
-	#define STRUCT_HDR_STRUCT_ID_DCBX_APP              0x421UL
-	#define STRUCT_HDR_STRUCT_ID_DCBX_FEATURE_STATE    0x422UL
-	#define STRUCT_HDR_STRUCT_ID_LLDP_GENERIC          0x424UL
-	#define STRUCT_HDR_STRUCT_ID_LLDP_DEVICE           0x426UL
-	#define STRUCT_HDR_STRUCT_ID_POWER_BKUP            0x427UL
-	#define STRUCT_HDR_STRUCT_ID_PEER_MMAP             0x429UL
-	#define STRUCT_HDR_STRUCT_ID_AFM_OPAQUE            0x1UL
-	#define STRUCT_HDR_STRUCT_ID_PORT_DESCRIPTION      0xaUL
-	#define STRUCT_HDR_STRUCT_ID_RSS_V2                0x64UL
-	#define STRUCT_HDR_STRUCT_ID_MSIX_PER_VF           0xc8UL
-	#define STRUCT_HDR_STRUCT_ID_UDCC_RTT_BUCKET_COUNT 0x12cUL
-	#define STRUCT_HDR_STRUCT_ID_UDCC_RTT_BUCKET_BOUND 0x12dUL
-	#define STRUCT_HDR_STRUCT_ID_LAST                 STRUCT_HDR_STRUCT_ID_UDCC_RTT_BUCKET_BOUND
-	__le16	len;
-	u8	version;
-	#define STRUCT_HDR_VERSION_0 0x0UL
-	#define STRUCT_HDR_VERSION_1 0x1UL
-	#define STRUCT_HDR_VERSION_LAST STRUCT_HDR_VERSION_1
-	u8	count;
-	__le16	subtype;
-	__le16	next_offset;
-	#define STRUCT_HDR_NEXT_OFFSET_LAST 0x0UL
-	u8	unused_0[6];
-};
-
-/* hwrm_struct_data_dcbx_app (size:64b/8B) */
-struct hwrm_struct_data_dcbx_app {
-	__be16	protocol_id;
-	u8	protocol_selector;
-	#define STRUCT_DATA_DCBX_APP_PROTOCOL_SELECTOR_ETHER_TYPE   0x1UL
-	#define STRUCT_DATA_DCBX_APP_PROTOCOL_SELECTOR_TCP_PORT     0x2UL
-	#define STRUCT_DATA_DCBX_APP_PROTOCOL_SELECTOR_UDP_PORT     0x3UL
-	#define STRUCT_DATA_DCBX_APP_PROTOCOL_SELECTOR_TCP_UDP_PORT 0x4UL
-	#define STRUCT_DATA_DCBX_APP_PROTOCOL_SELECTOR_LAST        STRUCT_DATA_DCBX_APP_PROTOCOL_SELECTOR_TCP_UDP_PORT
-	u8	priority;
-	u8	valid;
-	u8	unused_0[3];
-};
-
-/* hwrm_fw_set_structured_data_input (size:256b/32B) */
-struct hwrm_fw_set_structured_data_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le64	src_data_addr;
-	__le16	data_len;
-	u8	hdr_cnt;
-	u8	unused_0[5];
-};
-
-/* hwrm_fw_set_structured_data_output (size:128b/16B) */
-struct hwrm_fw_set_structured_data_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_fw_set_structured_data_cmd_err (size:64b/8B) */
-struct hwrm_fw_set_structured_data_cmd_err {
-	u8	code;
-	#define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_UNKNOWN     0x0UL
-	#define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_BAD_HDR_CNT 0x1UL
-	#define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_BAD_FMT     0x2UL
-	#define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_BAD_ID      0x3UL
-	#define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_LAST       FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_BAD_ID
-	u8	unused_0[7];
-};
-
-/* hwrm_fw_get_structured_data_input (size:256b/32B) */
-struct hwrm_fw_get_structured_data_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le64	dest_data_addr;
-	__le16	data_len;
-	__le16	structure_id;
-	__le16	subtype;
-	#define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_UNUSED                  0x0UL
-	#define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_ALL                     0xffffUL
-	#define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_NEAR_BRIDGE_ADMIN       0x100UL
-	#define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_NEAR_BRIDGE_PEER        0x101UL
-	#define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_NEAR_BRIDGE_OPERATIONAL 0x102UL
-	#define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_NON_TPMR_ADMIN          0x200UL
-	#define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_NON_TPMR_PEER           0x201UL
-	#define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_NON_TPMR_OPERATIONAL    0x202UL
-	#define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_HOST_OPERATIONAL        0x300UL
-	#define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_LAST                   FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_HOST_OPERATIONAL
-	u8	count;
-	u8	unused_0;
-};
-
-/* hwrm_fw_get_structured_data_output (size:128b/16B) */
-struct hwrm_fw_get_structured_data_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	hdr_cnt;
-	u8	unused_0[6];
-	u8	valid;
-};
-
-/* hwrm_fw_get_structured_data_cmd_err (size:64b/8B) */
-struct hwrm_fw_get_structured_data_cmd_err {
-	u8	code;
-	#define FW_GET_STRUCTURED_DATA_CMD_ERR_CODE_UNKNOWN 0x0UL
-	#define FW_GET_STRUCTURED_DATA_CMD_ERR_CODE_BAD_ID  0x3UL
-	#define FW_GET_STRUCTURED_DATA_CMD_ERR_CODE_LAST   FW_GET_STRUCTURED_DATA_CMD_ERR_CODE_BAD_ID
-	u8	unused_0[7];
-};
-
-/* hwrm_fw_livepatch_query_input (size:192b/24B) */
-struct hwrm_fw_livepatch_query_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	u8	fw_target;
-	#define FW_LIVEPATCH_QUERY_REQ_FW_TARGET_COMMON_FW 0x1UL
-	#define FW_LIVEPATCH_QUERY_REQ_FW_TARGET_SECURE_FW 0x2UL
-	#define FW_LIVEPATCH_QUERY_REQ_FW_TARGET_LAST     FW_LIVEPATCH_QUERY_REQ_FW_TARGET_SECURE_FW
-	u8	unused_0[7];
-};
-
-/* hwrm_fw_livepatch_query_output (size:640b/80B) */
-struct hwrm_fw_livepatch_query_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	char	install_ver[32];
-	char	active_ver[32];
-	__le16	status_flags;
-	#define FW_LIVEPATCH_QUERY_RESP_STATUS_FLAGS_INSTALL     0x1UL
-	#define FW_LIVEPATCH_QUERY_RESP_STATUS_FLAGS_ACTIVE      0x2UL
-	u8	unused_0[5];
-	u8	valid;
-};
-
-/* hwrm_fw_livepatch_input (size:256b/32B) */
-struct hwrm_fw_livepatch_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	u8	opcode;
-	#define FW_LIVEPATCH_REQ_OPCODE_ACTIVATE   0x1UL
-	#define FW_LIVEPATCH_REQ_OPCODE_DEACTIVATE 0x2UL
-	#define FW_LIVEPATCH_REQ_OPCODE_LAST      FW_LIVEPATCH_REQ_OPCODE_DEACTIVATE
-	u8	fw_target;
-	#define FW_LIVEPATCH_REQ_FW_TARGET_COMMON_FW 0x1UL
-	#define FW_LIVEPATCH_REQ_FW_TARGET_SECURE_FW 0x2UL
-	#define FW_LIVEPATCH_REQ_FW_TARGET_LAST     FW_LIVEPATCH_REQ_FW_TARGET_SECURE_FW
-	u8	loadtype;
-	#define FW_LIVEPATCH_REQ_LOADTYPE_NVM_INSTALL   0x1UL
-	#define FW_LIVEPATCH_REQ_LOADTYPE_MEMORY_DIRECT 0x2UL
-	#define FW_LIVEPATCH_REQ_LOADTYPE_LAST         FW_LIVEPATCH_REQ_LOADTYPE_MEMORY_DIRECT
-	u8	flags;
-	__le32	patch_len;
-	__le64	host_addr;
-};
-
-/* hwrm_fw_livepatch_output (size:128b/16B) */
-struct hwrm_fw_livepatch_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_fw_livepatch_cmd_err (size:64b/8B) */
-struct hwrm_fw_livepatch_cmd_err {
-	u8	code;
-	#define FW_LIVEPATCH_CMD_ERR_CODE_UNKNOWN         0x0UL
-	#define FW_LIVEPATCH_CMD_ERR_CODE_INVALID_OPCODE  0x1UL
-	#define FW_LIVEPATCH_CMD_ERR_CODE_INVALID_TARGET  0x2UL
-	#define FW_LIVEPATCH_CMD_ERR_CODE_NOT_SUPPORTED   0x3UL
-	#define FW_LIVEPATCH_CMD_ERR_CODE_NOT_INSTALLED   0x4UL
-	#define FW_LIVEPATCH_CMD_ERR_CODE_NOT_PATCHED     0x5UL
-	#define FW_LIVEPATCH_CMD_ERR_CODE_AUTH_FAIL       0x6UL
-	#define FW_LIVEPATCH_CMD_ERR_CODE_INVALID_HEADER  0x7UL
-	#define FW_LIVEPATCH_CMD_ERR_CODE_INVALID_SIZE    0x8UL
-	#define FW_LIVEPATCH_CMD_ERR_CODE_ALREADY_PATCHED 0x9UL
-	#define FW_LIVEPATCH_CMD_ERR_CODE_LAST           FW_LIVEPATCH_CMD_ERR_CODE_ALREADY_PATCHED
-	u8	unused_0[7];
-};
-
-/* hwrm_exec_fwd_resp_input (size:1024b/128B) */
-struct hwrm_exec_fwd_resp_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	encap_request[26];
-	__le16	encap_resp_target_id;
-	u8	unused_0[6];
-};
-
-/* hwrm_exec_fwd_resp_output (size:128b/16B) */
-struct hwrm_exec_fwd_resp_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_reject_fwd_resp_input (size:1024b/128B) */
-struct hwrm_reject_fwd_resp_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	encap_request[26];
-	__le16	encap_resp_target_id;
-	u8	unused_0[6];
-};
-
-/* hwrm_reject_fwd_resp_output (size:128b/16B) */
-struct hwrm_reject_fwd_resp_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_fwd_resp_input (size:1024b/128B) */
-struct hwrm_fwd_resp_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	encap_resp_target_id;
-	__le16	encap_resp_cmpl_ring;
-	__le16	encap_resp_len;
-	u8	unused_0;
-	u8	unused_1;
-	__le64	encap_resp_addr;
-	__le32	encap_resp[24];
-};
-
-/* hwrm_fwd_resp_output (size:128b/16B) */
-struct hwrm_fwd_resp_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_fwd_async_event_cmpl_input (size:320b/40B) */
-struct hwrm_fwd_async_event_cmpl_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	encap_async_event_target_id;
-	u8	unused_0[6];
-	__le32	encap_async_event_cmpl[4];
-};
-
-/* hwrm_fwd_async_event_cmpl_output (size:128b/16B) */
-struct hwrm_fwd_async_event_cmpl_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_temp_monitor_query_input (size:128b/16B) */
-struct hwrm_temp_monitor_query_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-};
-
-/* hwrm_temp_monitor_query_output (size:192b/24B) */
-struct hwrm_temp_monitor_query_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	temp;
-	u8	phy_temp;
-	u8	om_temp;
-	u8	flags;
-	#define TEMP_MONITOR_QUERY_RESP_FLAGS_TEMP_NOT_AVAILABLE             0x1UL
-	#define TEMP_MONITOR_QUERY_RESP_FLAGS_PHY_TEMP_NOT_AVAILABLE         0x2UL
-	#define TEMP_MONITOR_QUERY_RESP_FLAGS_OM_NOT_PRESENT                 0x4UL
-	#define TEMP_MONITOR_QUERY_RESP_FLAGS_OM_TEMP_NOT_AVAILABLE          0x8UL
-	#define TEMP_MONITOR_QUERY_RESP_FLAGS_EXT_TEMP_FIELDS_AVAILABLE      0x10UL
-	#define TEMP_MONITOR_QUERY_RESP_FLAGS_THRESHOLD_VALUES_AVAILABLE     0x20UL
-	u8	temp2;
-	u8	phy_temp2;
-	u8	om_temp2;
-	u8	warn_threshold;
-	u8	critical_threshold;
-	u8	fatal_threshold;
-	u8	shutdown_threshold;
-	u8	unused_0[4];
-	u8	valid;
-};
-
-/* hwrm_wol_filter_alloc_input (size:512b/64B) */
-struct hwrm_wol_filter_alloc_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	flags;
-	__le32	enables;
-	#define WOL_FILTER_ALLOC_REQ_ENABLES_MAC_ADDRESS           0x1UL
-	#define WOL_FILTER_ALLOC_REQ_ENABLES_PATTERN_OFFSET        0x2UL
-	#define WOL_FILTER_ALLOC_REQ_ENABLES_PATTERN_BUF_SIZE      0x4UL
-	#define WOL_FILTER_ALLOC_REQ_ENABLES_PATTERN_BUF_ADDR      0x8UL
-	#define WOL_FILTER_ALLOC_REQ_ENABLES_PATTERN_MASK_ADDR     0x10UL
-	#define WOL_FILTER_ALLOC_REQ_ENABLES_PATTERN_MASK_SIZE     0x20UL
-	__le16	port_id;
-	u8	wol_type;
-	#define WOL_FILTER_ALLOC_REQ_WOL_TYPE_MAGICPKT 0x0UL
-	#define WOL_FILTER_ALLOC_REQ_WOL_TYPE_BMP      0x1UL
-	#define WOL_FILTER_ALLOC_REQ_WOL_TYPE_INVALID  0xffUL
-	#define WOL_FILTER_ALLOC_REQ_WOL_TYPE_LAST    WOL_FILTER_ALLOC_REQ_WOL_TYPE_INVALID
-	u8	unused_0[5];
-	u8	mac_address[6];
-	__le16	pattern_offset;
-	__le16	pattern_buf_size;
-	__le16	pattern_mask_size;
-	u8	unused_1[4];
-	__le64	pattern_buf_addr;
-	__le64	pattern_mask_addr;
-};
-
-/* hwrm_wol_filter_alloc_output (size:128b/16B) */
-struct hwrm_wol_filter_alloc_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	wol_filter_id;
-	u8	unused_0[6];
-	u8	valid;
-};
-
-/* hwrm_wol_filter_free_input (size:256b/32B) */
-struct hwrm_wol_filter_free_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	flags;
-	#define WOL_FILTER_FREE_REQ_FLAGS_FREE_ALL_WOL_FILTERS     0x1UL
-	__le32	enables;
-	#define WOL_FILTER_FREE_REQ_ENABLES_WOL_FILTER_ID     0x1UL
-	__le16	port_id;
-	u8	wol_filter_id;
-	u8	unused_0[5];
-};
-
-/* hwrm_wol_filter_free_output (size:128b/16B) */
-struct hwrm_wol_filter_free_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_wol_filter_qcfg_input (size:448b/56B) */
-struct hwrm_wol_filter_qcfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	port_id;
-	__le16	handle;
-	u8	unused_0[4];
-	__le64	pattern_buf_addr;
-	__le16	pattern_buf_size;
-	u8	unused_1[6];
-	__le64	pattern_mask_addr;
-	__le16	pattern_mask_size;
-	u8	unused_2[6];
-};
-
-/* hwrm_wol_filter_qcfg_output (size:256b/32B) */
-struct hwrm_wol_filter_qcfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le16	next_handle;
-	u8	wol_filter_id;
-	u8	wol_type;
-	#define WOL_FILTER_QCFG_RESP_WOL_TYPE_MAGICPKT 0x0UL
-	#define WOL_FILTER_QCFG_RESP_WOL_TYPE_BMP      0x1UL
-	#define WOL_FILTER_QCFG_RESP_WOL_TYPE_INVALID  0xffUL
-	#define WOL_FILTER_QCFG_RESP_WOL_TYPE_LAST    WOL_FILTER_QCFG_RESP_WOL_TYPE_INVALID
-	__le32	unused_0;
-	u8	mac_address[6];
-	__le16	pattern_offset;
-	__le16	pattern_size;
-	__le16	pattern_mask_size;
-	u8	unused_1[3];
-	u8	valid;
-};
-
-/* hwrm_wol_reason_qcfg_input (size:320b/40B) */
-struct hwrm_wol_reason_qcfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	port_id;
-	u8	unused_0[6];
-	__le64	wol_pkt_buf_addr;
-	__le16	wol_pkt_buf_size;
-	u8	unused_1[6];
-};
-
-/* hwrm_wol_reason_qcfg_output (size:128b/16B) */
-struct hwrm_wol_reason_qcfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	wol_filter_id;
-	u8	wol_reason;
-	#define WOL_REASON_QCFG_RESP_WOL_REASON_MAGICPKT 0x0UL
-	#define WOL_REASON_QCFG_RESP_WOL_REASON_BMP      0x1UL
-	#define WOL_REASON_QCFG_RESP_WOL_REASON_INVALID  0xffUL
-	#define WOL_REASON_QCFG_RESP_WOL_REASON_LAST    WOL_REASON_QCFG_RESP_WOL_REASON_INVALID
-	u8	wol_pkt_len;
-	u8	unused_0[4];
-	u8	valid;
-};
-
-/* hwrm_dbg_read_direct_input (size:256b/32B) */
-struct hwrm_dbg_read_direct_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le64	host_dest_addr;
-	__le32	read_addr;
-	__le32	read_len32;
-};
-
-/* hwrm_dbg_read_direct_output (size:128b/16B) */
-struct hwrm_dbg_read_direct_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le32	crc32;
-	u8	unused_0[3];
-	u8	valid;
-};
-
-/* hwrm_dbg_qcaps_input (size:192b/24B) */
-struct hwrm_dbg_qcaps_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	fid;
-	u8	unused_0[6];
-};
-
-/* hwrm_dbg_qcaps_output (size:192b/24B) */
-struct hwrm_dbg_qcaps_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le16	fid;
-	u8	unused_0[2];
-	__le32	coredump_component_disable_caps;
-	#define DBG_QCAPS_RESP_COREDUMP_COMPONENT_DISABLE_CAPS_NVRAM     0x1UL
-	__le32	flags;
-	#define DBG_QCAPS_RESP_FLAGS_CRASHDUMP_NVM             0x1UL
-	#define DBG_QCAPS_RESP_FLAGS_CRASHDUMP_HOST_DDR        0x2UL
-	#define DBG_QCAPS_RESP_FLAGS_CRASHDUMP_SOC_DDR         0x4UL
-	#define DBG_QCAPS_RESP_FLAGS_USEQ                      0x8UL
-	#define DBG_QCAPS_RESP_FLAGS_COREDUMP_HOST_DDR         0x10UL
-	#define DBG_QCAPS_RESP_FLAGS_COREDUMP_HOST_CAPTURE     0x20UL
-	#define DBG_QCAPS_RESP_FLAGS_PTRACE                    0x40UL
-	#define DBG_QCAPS_RESP_FLAGS_REG_ACCESS_RESTRICTED     0x80UL
-	u8	unused_1[3];
-	u8	valid;
-};
-
-/* hwrm_dbg_qcfg_input (size:192b/24B) */
-struct hwrm_dbg_qcfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	fid;
-	__le16	flags;
-	#define DBG_QCFG_REQ_FLAGS_CRASHDUMP_SIZE_FOR_DEST_MASK         0x3UL
-	#define DBG_QCFG_REQ_FLAGS_CRASHDUMP_SIZE_FOR_DEST_SFT          0
-	#define DBG_QCFG_REQ_FLAGS_CRASHDUMP_SIZE_FOR_DEST_DEST_NVM       0x0UL
-	#define DBG_QCFG_REQ_FLAGS_CRASHDUMP_SIZE_FOR_DEST_DEST_HOST_DDR  0x1UL
-	#define DBG_QCFG_REQ_FLAGS_CRASHDUMP_SIZE_FOR_DEST_DEST_SOC_DDR   0x2UL
-	#define DBG_QCFG_REQ_FLAGS_CRASHDUMP_SIZE_FOR_DEST_LAST          DBG_QCFG_REQ_FLAGS_CRASHDUMP_SIZE_FOR_DEST_DEST_SOC_DDR
-	__le32	coredump_component_disable_flags;
-	#define DBG_QCFG_REQ_COREDUMP_COMPONENT_DISABLE_FLAGS_NVRAM     0x1UL
-};
-
-/* hwrm_dbg_qcfg_output (size:256b/32B) */
-struct hwrm_dbg_qcfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le16	fid;
-	u8	unused_0[2];
-	__le32	coredump_size;
-	__le32	flags;
-	#define DBG_QCFG_RESP_FLAGS_UART_LOG               0x1UL
-	#define DBG_QCFG_RESP_FLAGS_UART_LOG_SECONDARY     0x2UL
-	#define DBG_QCFG_RESP_FLAGS_FW_TRACE               0x4UL
-	#define DBG_QCFG_RESP_FLAGS_FW_TRACE_SECONDARY     0x8UL
-	#define DBG_QCFG_RESP_FLAGS_DEBUG_NOTIFY           0x10UL
-	#define DBG_QCFG_RESP_FLAGS_JTAG_DEBUG             0x20UL
-	__le16	async_cmpl_ring;
-	u8	unused_2[2];
-	__le32	crashdump_size;
-	u8	unused_3[3];
-	u8	valid;
-};
-
-/* hwrm_dbg_crashdump_medium_cfg_input (size:320b/40B) */
-struct hwrm_dbg_crashdump_medium_cfg_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	output_dest_flags;
-	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_TYPE_DDR     0x1UL
-	__le16	pg_size_lvl;
-	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_LVL_MASK      0x3UL
-	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_LVL_SFT       0
-	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_LVL_LVL_0       0x0UL
-	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_LVL_LVL_1       0x1UL
-	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_LVL_LVL_2       0x2UL
-	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_LVL_LAST       DBG_CRASHDUMP_MEDIUM_CFG_REQ_LVL_LVL_2
-	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_PG_SIZE_MASK  0x1cUL
-	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_PG_SIZE_SFT   2
-	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_PG_SIZE_PG_4K   (0x0UL << 2)
-	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_PG_SIZE_PG_8K   (0x1UL << 2)
-	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_PG_SIZE_PG_64K  (0x2UL << 2)
-	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_PG_SIZE_PG_2M   (0x3UL << 2)
-	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_PG_SIZE_PG_8M   (0x4UL << 2)
-	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_PG_SIZE_PG_1G   (0x5UL << 2)
-	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_PG_SIZE_LAST   DBG_CRASHDUMP_MEDIUM_CFG_REQ_PG_SIZE_PG_1G
-	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_UNUSED11_MASK 0xffe0UL
-	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_UNUSED11_SFT  5
-	__le32	size;
-	__le32	coredump_component_disable_flags;
-	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_NVRAM     0x1UL
-	__le32	unused_0;
-	__le64	pbl;
-};
-
-/* hwrm_dbg_crashdump_medium_cfg_output (size:128b/16B) */
-struct hwrm_dbg_crashdump_medium_cfg_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_1[7];
-	u8	valid;
-};
-
-/* coredump_segment_record (size:128b/16B) */
-struct coredump_segment_record {
-	__le16	component_id;
-	__le16	segment_id;
-	__le16	max_instances;
-	u8	version_hi;
-	u8	version_low;
-	u8	seg_flags;
-	u8	compress_flags;
-	#define SFLAG_COMPRESSED_ZLIB     0x1UL
-	u8	unused_0[2];
-	__le32	segment_len;
-};
-
-/* hwrm_dbg_coredump_list_input (size:256b/32B) */
-struct hwrm_dbg_coredump_list_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le64	host_dest_addr;
-	__le32	host_buf_len;
-	__le16	seq_no;
-	u8	flags;
-	#define DBG_COREDUMP_LIST_REQ_FLAGS_CRASHDUMP     0x1UL
-	u8	unused_0[1];
-};
-
-/* hwrm_dbg_coredump_list_output (size:128b/16B) */
-struct hwrm_dbg_coredump_list_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	flags;
-	#define DBG_COREDUMP_LIST_RESP_FLAGS_MORE     0x1UL
-	u8	unused_0;
-	__le16	total_segments;
-	__le16	data_len;
-	u8	unused_1;
-	u8	valid;
-};
-
-/* hwrm_dbg_coredump_initiate_input (size:256b/32B) */
-struct hwrm_dbg_coredump_initiate_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	component_id;
-	__le16	segment_id;
-	__le16	instance;
-	__le16	unused_0;
-	u8	seg_flags;
-	#define DBG_COREDUMP_INITIATE_REQ_SEG_FLAGS_LIVE_DATA                0x1UL
-	#define DBG_COREDUMP_INITIATE_REQ_SEG_FLAGS_CRASH_DATA               0x2UL
-	#define DBG_COREDUMP_INITIATE_REQ_SEG_FLAGS_COLLECT_CTX_L1_CACHE     0x4UL
-	u8	unused_1[7];
-};
-
-/* hwrm_dbg_coredump_initiate_output (size:128b/16B) */
-struct hwrm_dbg_coredump_initiate_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* coredump_data_hdr (size:128b/16B) */
-struct coredump_data_hdr {
-	__le32	address;
-	__le32	flags_length;
-	#define COREDUMP_DATA_HDR_FLAGS_LENGTH_ACTUAL_LEN_MASK     0xffffffUL
-	#define COREDUMP_DATA_HDR_FLAGS_LENGTH_ACTUAL_LEN_SFT      0
-	#define COREDUMP_DATA_HDR_FLAGS_LENGTH_INDIRECT_ACCESS     0x1000000UL
-	__le32	instance;
-	__le32	next_offset;
-};
-
-/* hwrm_dbg_coredump_retrieve_input (size:448b/56B) */
-struct hwrm_dbg_coredump_retrieve_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le64	host_dest_addr;
-	__le32	host_buf_len;
-	__le32	unused_0;
-	__le16	component_id;
-	__le16	segment_id;
-	__le16	instance;
-	__le16	unused_1;
-	u8	seg_flags;
-	u8	unused_2;
-	__le16	unused_3;
-	__le32	unused_4;
-	__le32	seq_no;
-	__le32	unused_5;
-};
-
-/* hwrm_dbg_coredump_retrieve_output (size:128b/16B) */
-struct hwrm_dbg_coredump_retrieve_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	flags;
-	#define DBG_COREDUMP_RETRIEVE_RESP_FLAGS_MORE     0x1UL
-	u8	unused_0;
-	__le16	data_len;
-	u8	unused_1[3];
-	u8	valid;
-};
-
-/* hwrm_dbg_ring_info_get_input (size:192b/24B) */
-struct hwrm_dbg_ring_info_get_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	u8	ring_type;
-	#define DBG_RING_INFO_GET_REQ_RING_TYPE_L2_CMPL 0x0UL
-	#define DBG_RING_INFO_GET_REQ_RING_TYPE_TX      0x1UL
-	#define DBG_RING_INFO_GET_REQ_RING_TYPE_RX      0x2UL
-	#define DBG_RING_INFO_GET_REQ_RING_TYPE_NQ      0x3UL
-	#define DBG_RING_INFO_GET_REQ_RING_TYPE_LAST   DBG_RING_INFO_GET_REQ_RING_TYPE_NQ
-	u8	unused_0[3];
-	__le32	fw_ring_id;
-};
-
-/* hwrm_dbg_ring_info_get_output (size:192b/24B) */
-struct hwrm_dbg_ring_info_get_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le32	producer_index;
-	__le32	consumer_index;
-	__le32	cag_vector_ctrl;
-	__le16	st_tag;
-	u8	unused_0;
-	u8	valid;
-};
-
-/* hwrm_dbg_log_buffer_flush_input (size:192b/24B) */
-struct hwrm_dbg_log_buffer_flush_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	type;
-	#define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_SRT_TRACE           0x0UL
-	#define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_SRT2_TRACE          0x1UL
-	#define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_CRT_TRACE           0x2UL
-	#define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_CRT2_TRACE          0x3UL
-	#define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_RIGP0_TRACE         0x4UL
-	#define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_L2_HWRM_TRACE       0x5UL
-	#define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_ROCE_HWRM_TRACE     0x6UL
-	#define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_CA0_TRACE           0x7UL
-	#define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_CA1_TRACE           0x8UL
-	#define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_CA2_TRACE           0x9UL
-	#define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_RIGP1_TRACE         0xaUL
-	#define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_AFM_KONG_HWRM_TRACE 0xbUL
-	#define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_LAST               DBG_LOG_BUFFER_FLUSH_REQ_TYPE_AFM_KONG_HWRM_TRACE
-	u8	unused_1[2];
-	__le32	flags;
-	#define DBG_LOG_BUFFER_FLUSH_REQ_FLAGS_FLUSH_ALL_BUFFERS     0x1UL
-};
-
-/* hwrm_dbg_log_buffer_flush_output (size:128b/16B) */
-struct hwrm_dbg_log_buffer_flush_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le32	current_buffer_offset;
-	u8	unused_1[3];
-	u8	valid;
-};
-
-/* hwrm_nvm_read_input (size:320b/40B) */
-struct hwrm_nvm_read_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le64	host_dest_addr;
-	__le16	dir_idx;
-	u8	unused_0[2];
-	__le32	offset;
-	__le32	len;
-	u8	unused_1[4];
-};
-
-/* hwrm_nvm_read_output (size:128b/16B) */
-struct hwrm_nvm_read_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_nvm_get_dir_entries_input (size:192b/24B) */
-struct hwrm_nvm_get_dir_entries_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le64	host_dest_addr;
-};
-
-/* hwrm_nvm_get_dir_entries_output (size:128b/16B) */
-struct hwrm_nvm_get_dir_entries_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_nvm_get_dir_info_input (size:128b/16B) */
-struct hwrm_nvm_get_dir_info_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-};
-
-/* hwrm_nvm_get_dir_info_output (size:192b/24B) */
-struct hwrm_nvm_get_dir_info_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le32	entries;
-	__le32	entry_length;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_nvm_write_input (size:448b/56B) */
-struct hwrm_nvm_write_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le64	host_src_addr;
-	__le16	dir_type;
-	__le16	dir_ordinal;
-	__le16	dir_ext;
-	__le16	dir_attr;
-	__le32	dir_data_length;
-	__le16	option;
-	__le16	flags;
-	#define NVM_WRITE_REQ_FLAGS_KEEP_ORIG_ACTIVE_IMG     0x1UL
-	#define NVM_WRITE_REQ_FLAGS_BATCH_MODE               0x2UL
-	#define NVM_WRITE_REQ_FLAGS_BATCH_LAST               0x4UL
-	#define NVM_WRITE_REQ_FLAGS_SKIP_CRID_CHECK          0x8UL
-	__le32	dir_item_length;
-	__le32	offset;
-	__le32	len;
-	__le32	unused_0;
-};
-
-/* hwrm_nvm_write_output (size:128b/16B) */
-struct hwrm_nvm_write_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le32	dir_item_length;
-	__le16	dir_idx;
-	u8	unused_0;
-	u8	valid;
-};
-
-/* hwrm_nvm_write_cmd_err (size:64b/8B) */
-struct hwrm_nvm_write_cmd_err {
-	u8	code;
-	#define NVM_WRITE_CMD_ERR_CODE_UNKNOWN  0x0UL
-	#define NVM_WRITE_CMD_ERR_CODE_FRAG_ERR 0x1UL
-	#define NVM_WRITE_CMD_ERR_CODE_NO_SPACE 0x2UL
-	#define NVM_WRITE_CMD_ERR_CODE_LAST    NVM_WRITE_CMD_ERR_CODE_NO_SPACE
-	u8	unused_0[7];
-};
-
-/* hwrm_nvm_modify_input (size:320b/40B) */
-struct hwrm_nvm_modify_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le64	host_src_addr;
-	__le16	dir_idx;
-	__le16	flags;
-	#define NVM_MODIFY_REQ_FLAGS_BATCH_MODE     0x1UL
-	#define NVM_MODIFY_REQ_FLAGS_BATCH_LAST     0x2UL
-	__le32	offset;
-	__le32	len;
-	u8	unused_1[4];
-};
-
-/* hwrm_nvm_modify_output (size:128b/16B) */
-struct hwrm_nvm_modify_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_nvm_find_dir_entry_input (size:256b/32B) */
-struct hwrm_nvm_find_dir_entry_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	enables;
-	#define NVM_FIND_DIR_ENTRY_REQ_ENABLES_DIR_IDX_VALID     0x1UL
-	__le16	dir_idx;
-	__le16	dir_type;
-	__le16	dir_ordinal;
-	__le16	dir_ext;
-	u8	opt_ordinal;
-	#define NVM_FIND_DIR_ENTRY_REQ_OPT_ORDINAL_MASK 0x3UL
-	#define NVM_FIND_DIR_ENTRY_REQ_OPT_ORDINAL_SFT 0
-	#define NVM_FIND_DIR_ENTRY_REQ_OPT_ORDINAL_EQ    0x0UL
-	#define NVM_FIND_DIR_ENTRY_REQ_OPT_ORDINAL_GE    0x1UL
-	#define NVM_FIND_DIR_ENTRY_REQ_OPT_ORDINAL_GT    0x2UL
-	#define NVM_FIND_DIR_ENTRY_REQ_OPT_ORDINAL_LAST NVM_FIND_DIR_ENTRY_REQ_OPT_ORDINAL_GT
-	u8	unused_0[3];
-};
-
-/* hwrm_nvm_find_dir_entry_output (size:256b/32B) */
-struct hwrm_nvm_find_dir_entry_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le32	dir_item_length;
-	__le32	dir_data_length;
-	__le32	fw_ver;
-	__le16	dir_ordinal;
-	__le16	dir_idx;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_nvm_erase_dir_entry_input (size:192b/24B) */
-struct hwrm_nvm_erase_dir_entry_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	dir_idx;
-	u8	unused_0[6];
-};
-
-/* hwrm_nvm_erase_dir_entry_output (size:128b/16B) */
-struct hwrm_nvm_erase_dir_entry_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_nvm_get_dev_info_input (size:192b/24B) */
-struct hwrm_nvm_get_dev_info_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	u8	flags;
-	#define NVM_GET_DEV_INFO_REQ_FLAGS_SECURITY_SOC_NVM     0x1UL
-	u8	unused_0[7];
-};
-
-/* hwrm_nvm_get_dev_info_output (size:768b/96B) */
-struct hwrm_nvm_get_dev_info_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le16	manufacturer_id;
-	__le16	device_id;
-	__le32	sector_size;
-	__le32	nvram_size;
-	__le32	reserved_size;
-	__le32	available_size;
-	u8	nvm_cfg_ver_maj;
-	u8	nvm_cfg_ver_min;
-	u8	nvm_cfg_ver_upd;
-	u8	flags;
-	#define NVM_GET_DEV_INFO_RESP_FLAGS_FW_VER_VALID     0x1UL
-	char	pkg_name[16];
-	__le16	hwrm_fw_major;
-	__le16	hwrm_fw_minor;
-	__le16	hwrm_fw_build;
-	__le16	hwrm_fw_patch;
-	__le16	mgmt_fw_major;
-	__le16	mgmt_fw_minor;
-	__le16	mgmt_fw_build;
-	__le16	mgmt_fw_patch;
-	__le16	roce_fw_major;
-	__le16	roce_fw_minor;
-	__le16	roce_fw_build;
-	__le16	roce_fw_patch;
-	__le16	netctrl_fw_major;
-	__le16	netctrl_fw_minor;
-	__le16	netctrl_fw_build;
-	__le16	netctrl_fw_patch;
-	__le16	srt2_fw_major;
-	__le16	srt2_fw_minor;
-	__le16	srt2_fw_build;
-	__le16	srt2_fw_patch;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_nvm_mod_dir_entry_input (size:256b/32B) */
-struct hwrm_nvm_mod_dir_entry_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	enables;
-	#define NVM_MOD_DIR_ENTRY_REQ_ENABLES_CHECKSUM     0x1UL
-	__le16	dir_idx;
-	__le16	dir_ordinal;
-	__le16	dir_ext;
-	__le16	dir_attr;
-	__le32	checksum;
-};
-
-/* hwrm_nvm_mod_dir_entry_output (size:128b/16B) */
-struct hwrm_nvm_mod_dir_entry_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_nvm_verify_update_input (size:192b/24B) */
-struct hwrm_nvm_verify_update_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le16	dir_type;
-	__le16	dir_ordinal;
-	__le16	dir_ext;
-	u8	unused_0[2];
-};
-
-/* hwrm_nvm_verify_update_output (size:128b/16B) */
-struct hwrm_nvm_verify_update_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_nvm_install_update_input (size:192b/24B) */
-struct hwrm_nvm_install_update_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le32	install_type;
-	#define NVM_INSTALL_UPDATE_REQ_INSTALL_TYPE_NORMAL 0x0UL
-	#define NVM_INSTALL_UPDATE_REQ_INSTALL_TYPE_ALL    0xffffffffUL
-	#define NVM_INSTALL_UPDATE_REQ_INSTALL_TYPE_LAST  NVM_INSTALL_UPDATE_REQ_INSTALL_TYPE_ALL
-	__le16	flags;
-	#define NVM_INSTALL_UPDATE_REQ_FLAGS_ERASE_UNUSED_SPACE     0x1UL
-	#define NVM_INSTALL_UPDATE_REQ_FLAGS_REMOVE_UNUSED_PKG      0x2UL
-	#define NVM_INSTALL_UPDATE_REQ_FLAGS_ALLOWED_TO_DEFRAG      0x4UL
-	#define NVM_INSTALL_UPDATE_REQ_FLAGS_VERIFY_ONLY            0x8UL
-	u8	unused_0[2];
-};
-
-/* hwrm_nvm_install_update_output (size:192b/24B) */
-struct hwrm_nvm_install_update_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le64	installed_items;
-	u8	result;
-	#define NVM_INSTALL_UPDATE_RESP_RESULT_SUCCESS                      0x0UL
-	#define NVM_INSTALL_UPDATE_RESP_RESULT_FAILURE                      0xffUL
-	#define NVM_INSTALL_UPDATE_RESP_RESULT_MALLOC_FAILURE               0xfdUL
-	#define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_INDEX_PARAMETER      0xfbUL
-	#define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_TYPE_PARAMETER       0xf3UL
-	#define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_PREREQUISITE         0xf2UL
-	#define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_FILE_HEADER          0xecUL
-	#define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_SIGNATURE            0xebUL
-	#define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_PROP_STREAM          0xeaUL
-	#define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_PROP_LENGTH          0xe9UL
-	#define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_MANIFEST             0xe8UL
-	#define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_TRAILER              0xe7UL
-	#define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_CHECKSUM             0xe6UL
-	#define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_ITEM_CHECKSUM        0xe5UL
-	#define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_DATA_LENGTH          0xe4UL
-	#define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_DIRECTIVE            0xe1UL
-	#define NVM_INSTALL_UPDATE_RESP_RESULT_UNSUPPORTED_CHIP_REV         0xceUL
-	#define NVM_INSTALL_UPDATE_RESP_RESULT_UNSUPPORTED_DEVICE_ID        0xcdUL
-	#define NVM_INSTALL_UPDATE_RESP_RESULT_UNSUPPORTED_SUBSYS_VENDOR    0xccUL
-	#define NVM_INSTALL_UPDATE_RESP_RESULT_UNSUPPORTED_SUBSYS_ID        0xcbUL
-	#define NVM_INSTALL_UPDATE_RESP_RESULT_UNSUPPORTED_PLATFORM         0xc5UL
-	#define NVM_INSTALL_UPDATE_RESP_RESULT_DUPLICATE_ITEM               0xc4UL
-	#define NVM_INSTALL_UPDATE_RESP_RESULT_ZERO_LENGTH_ITEM             0xc3UL
-	#define NVM_INSTALL_UPDATE_RESP_RESULT_INSTALL_CHECKSUM_ERROR       0xb9UL
-	#define NVM_INSTALL_UPDATE_RESP_RESULT_INSTALL_DATA_ERROR           0xb8UL
-	#define NVM_INSTALL_UPDATE_RESP_RESULT_INSTALL_AUTHENTICATION_ERROR 0xb7UL
-	#define NVM_INSTALL_UPDATE_RESP_RESULT_ITEM_NOT_FOUND               0xb0UL
-	#define NVM_INSTALL_UPDATE_RESP_RESULT_ITEM_LOCKED                  0xa7UL
-	#define NVM_INSTALL_UPDATE_RESP_RESULT_LAST                        NVM_INSTALL_UPDATE_RESP_RESULT_ITEM_LOCKED
-	u8	problem_item;
-	#define NVM_INSTALL_UPDATE_RESP_PROBLEM_ITEM_NONE    0x0UL
-	#define NVM_INSTALL_UPDATE_RESP_PROBLEM_ITEM_PACKAGE 0xffUL
-	#define NVM_INSTALL_UPDATE_RESP_PROBLEM_ITEM_LAST   NVM_INSTALL_UPDATE_RESP_PROBLEM_ITEM_PACKAGE
-	u8	reset_required;
-	#define NVM_INSTALL_UPDATE_RESP_RESET_REQUIRED_NONE  0x0UL
-	#define NVM_INSTALL_UPDATE_RESP_RESET_REQUIRED_PCI   0x1UL
-	#define NVM_INSTALL_UPDATE_RESP_RESET_REQUIRED_POWER 0x2UL
-	#define NVM_INSTALL_UPDATE_RESP_RESET_REQUIRED_LAST NVM_INSTALL_UPDATE_RESP_RESET_REQUIRED_POWER
-	u8	unused_0[4];
-	u8	valid;
-};
-
-/* hwrm_nvm_install_update_cmd_err (size:64b/8B) */
-struct hwrm_nvm_install_update_cmd_err {
-	u8	code;
-	#define NVM_INSTALL_UPDATE_CMD_ERR_CODE_UNKNOWN            0x0UL
-	#define NVM_INSTALL_UPDATE_CMD_ERR_CODE_FRAG_ERR           0x1UL
-	#define NVM_INSTALL_UPDATE_CMD_ERR_CODE_NO_SPACE           0x2UL
-	#define NVM_INSTALL_UPDATE_CMD_ERR_CODE_ANTI_ROLLBACK      0x3UL
-	#define NVM_INSTALL_UPDATE_CMD_ERR_CODE_NO_VOLTREG_SUPPORT 0x4UL
-	#define NVM_INSTALL_UPDATE_CMD_ERR_CODE_LAST              NVM_INSTALL_UPDATE_CMD_ERR_CODE_NO_VOLTREG_SUPPORT
-	u8	unused_0[7];
-};
-
-/* hwrm_nvm_get_variable_input (size:320b/40B) */
-struct hwrm_nvm_get_variable_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le64	dest_data_addr;
-	__le16	data_len;
-	__le16	option_num;
-	#define NVM_GET_VARIABLE_REQ_OPTION_NUM_RSVD_0    0x0UL
-	#define NVM_GET_VARIABLE_REQ_OPTION_NUM_RSVD_FFFF 0xffffUL
-	#define NVM_GET_VARIABLE_REQ_OPTION_NUM_LAST     NVM_GET_VARIABLE_REQ_OPTION_NUM_RSVD_FFFF
-	__le16	dimensions;
-	__le16	index_0;
-	__le16	index_1;
-	__le16	index_2;
-	__le16	index_3;
-	u8	flags;
-	#define NVM_GET_VARIABLE_REQ_FLAGS_FACTORY_DFLT     0x1UL
-	u8	unused_0;
-};
-
-/* hwrm_nvm_get_variable_output (size:128b/16B) */
-struct hwrm_nvm_get_variable_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	__le16	data_len;
-	__le16	option_num;
-	#define NVM_GET_VARIABLE_RESP_OPTION_NUM_RSVD_0    0x0UL
-	#define NVM_GET_VARIABLE_RESP_OPTION_NUM_RSVD_FFFF 0xffffUL
-	#define NVM_GET_VARIABLE_RESP_OPTION_NUM_LAST     NVM_GET_VARIABLE_RESP_OPTION_NUM_RSVD_FFFF
-	u8	unused_0[3];
-	u8	valid;
-};
-
-/* hwrm_nvm_get_variable_cmd_err (size:64b/8B) */
-struct hwrm_nvm_get_variable_cmd_err {
-	u8	code;
-	#define NVM_GET_VARIABLE_CMD_ERR_CODE_UNKNOWN       0x0UL
-	#define NVM_GET_VARIABLE_CMD_ERR_CODE_VAR_NOT_EXIST 0x1UL
-	#define NVM_GET_VARIABLE_CMD_ERR_CODE_CORRUPT_VAR   0x2UL
-	#define NVM_GET_VARIABLE_CMD_ERR_CODE_LEN_TOO_SHORT 0x3UL
-	#define NVM_GET_VARIABLE_CMD_ERR_CODE_LAST         NVM_GET_VARIABLE_CMD_ERR_CODE_LEN_TOO_SHORT
-	u8	unused_0[7];
-};
-
-/* hwrm_nvm_set_variable_input (size:320b/40B) */
-struct hwrm_nvm_set_variable_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	__le64	src_data_addr;
-	__le16	data_len;
-	__le16	option_num;
-	#define NVM_SET_VARIABLE_REQ_OPTION_NUM_RSVD_0    0x0UL
-	#define NVM_SET_VARIABLE_REQ_OPTION_NUM_RSVD_FFFF 0xffffUL
-	#define NVM_SET_VARIABLE_REQ_OPTION_NUM_LAST     NVM_SET_VARIABLE_REQ_OPTION_NUM_RSVD_FFFF
-	__le16	dimensions;
-	__le16	index_0;
-	__le16	index_1;
-	__le16	index_2;
-	__le16	index_3;
-	u8	flags;
-	#define NVM_SET_VARIABLE_REQ_FLAGS_FORCE_FLUSH                0x1UL
-	#define NVM_SET_VARIABLE_REQ_FLAGS_ENCRYPT_MODE_MASK          0xeUL
-	#define NVM_SET_VARIABLE_REQ_FLAGS_ENCRYPT_MODE_SFT           1
-	#define NVM_SET_VARIABLE_REQ_FLAGS_ENCRYPT_MODE_NONE            (0x0UL << 1)
-	#define NVM_SET_VARIABLE_REQ_FLAGS_ENCRYPT_MODE_HMAC_SHA1       (0x1UL << 1)
-	#define NVM_SET_VARIABLE_REQ_FLAGS_ENCRYPT_MODE_AES256          (0x2UL << 1)
-	#define NVM_SET_VARIABLE_REQ_FLAGS_ENCRYPT_MODE_HMAC_SHA1_AUTH  (0x3UL << 1)
-	#define NVM_SET_VARIABLE_REQ_FLAGS_ENCRYPT_MODE_LAST           NVM_SET_VARIABLE_REQ_FLAGS_ENCRYPT_MODE_HMAC_SHA1_AUTH
-	#define NVM_SET_VARIABLE_REQ_FLAGS_FLAGS_UNUSED_0_MASK        0x70UL
-	#define NVM_SET_VARIABLE_REQ_FLAGS_FLAGS_UNUSED_0_SFT         4
-	#define NVM_SET_VARIABLE_REQ_FLAGS_FACTORY_DEFAULT            0x80UL
-	u8	unused_0;
-};
-
-/* hwrm_nvm_set_variable_output (size:128b/16B) */
-struct hwrm_nvm_set_variable_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* hwrm_nvm_set_variable_cmd_err (size:64b/8B) */
-struct hwrm_nvm_set_variable_cmd_err {
-	u8	code;
-	#define NVM_SET_VARIABLE_CMD_ERR_CODE_UNKNOWN       0x0UL
-	#define NVM_SET_VARIABLE_CMD_ERR_CODE_VAR_NOT_EXIST 0x1UL
-	#define NVM_SET_VARIABLE_CMD_ERR_CODE_CORRUPT_VAR   0x2UL
-	#define NVM_SET_VARIABLE_CMD_ERR_CODE_LAST         NVM_SET_VARIABLE_CMD_ERR_CODE_CORRUPT_VAR
-	u8	unused_0[7];
-};
-
-/* hwrm_selftest_qlist_input (size:128b/16B) */
-struct hwrm_selftest_qlist_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-};
-
-/* hwrm_selftest_qlist_output (size:2240b/280B) */
-struct hwrm_selftest_qlist_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	num_tests;
-	u8	available_tests;
-	#define SELFTEST_QLIST_RESP_AVAILABLE_TESTS_NVM_TEST                 0x1UL
-	#define SELFTEST_QLIST_RESP_AVAILABLE_TESTS_LINK_TEST                0x2UL
-	#define SELFTEST_QLIST_RESP_AVAILABLE_TESTS_REGISTER_TEST            0x4UL
-	#define SELFTEST_QLIST_RESP_AVAILABLE_TESTS_MEMORY_TEST              0x8UL
-	#define SELFTEST_QLIST_RESP_AVAILABLE_TESTS_PCIE_SERDES_TEST         0x10UL
-	#define SELFTEST_QLIST_RESP_AVAILABLE_TESTS_ETHERNET_SERDES_TEST     0x20UL
-	u8	offline_tests;
-	#define SELFTEST_QLIST_RESP_OFFLINE_TESTS_NVM_TEST                 0x1UL
-	#define SELFTEST_QLIST_RESP_OFFLINE_TESTS_LINK_TEST                0x2UL
-	#define SELFTEST_QLIST_RESP_OFFLINE_TESTS_REGISTER_TEST            0x4UL
-	#define SELFTEST_QLIST_RESP_OFFLINE_TESTS_MEMORY_TEST              0x8UL
-	#define SELFTEST_QLIST_RESP_OFFLINE_TESTS_PCIE_SERDES_TEST         0x10UL
-	#define SELFTEST_QLIST_RESP_OFFLINE_TESTS_ETHERNET_SERDES_TEST     0x20UL
-	u8	unused_0;
-	__le16	test_timeout;
-	u8	unused_1[2];
-	char	test_name[8][32];
-	u8	eyescope_target_BER_support;
-	#define SELFTEST_QLIST_RESP_EYESCOPE_TARGET_BER_SUPPORT_BER_1E8_SUPPORTED  0x0UL
-	#define SELFTEST_QLIST_RESP_EYESCOPE_TARGET_BER_SUPPORT_BER_1E9_SUPPORTED  0x1UL
-	#define SELFTEST_QLIST_RESP_EYESCOPE_TARGET_BER_SUPPORT_BER_1E10_SUPPORTED 0x2UL
-	#define SELFTEST_QLIST_RESP_EYESCOPE_TARGET_BER_SUPPORT_BER_1E11_SUPPORTED 0x3UL
-	#define SELFTEST_QLIST_RESP_EYESCOPE_TARGET_BER_SUPPORT_BER_1E12_SUPPORTED 0x4UL
-	#define SELFTEST_QLIST_RESP_EYESCOPE_TARGET_BER_SUPPORT_LAST              SELFTEST_QLIST_RESP_EYESCOPE_TARGET_BER_SUPPORT_BER_1E12_SUPPORTED
-	u8	unused_2[6];
-	u8	valid;
-};
-
-/* hwrm_selftest_exec_input (size:192b/24B) */
-struct hwrm_selftest_exec_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-	u8	flags;
-	#define SELFTEST_EXEC_REQ_FLAGS_NVM_TEST                 0x1UL
-	#define SELFTEST_EXEC_REQ_FLAGS_LINK_TEST                0x2UL
-	#define SELFTEST_EXEC_REQ_FLAGS_REGISTER_TEST            0x4UL
-	#define SELFTEST_EXEC_REQ_FLAGS_MEMORY_TEST              0x8UL
-	#define SELFTEST_EXEC_REQ_FLAGS_PCIE_SERDES_TEST         0x10UL
-	#define SELFTEST_EXEC_REQ_FLAGS_ETHERNET_SERDES_TEST     0x20UL
-	u8	unused_0[7];
-};
-
-/* hwrm_selftest_exec_output (size:128b/16B) */
-struct hwrm_selftest_exec_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	requested_tests;
-	#define SELFTEST_EXEC_RESP_REQUESTED_TESTS_NVM_TEST                 0x1UL
-	#define SELFTEST_EXEC_RESP_REQUESTED_TESTS_LINK_TEST                0x2UL
-	#define SELFTEST_EXEC_RESP_REQUESTED_TESTS_REGISTER_TEST            0x4UL
-	#define SELFTEST_EXEC_RESP_REQUESTED_TESTS_MEMORY_TEST              0x8UL
-	#define SELFTEST_EXEC_RESP_REQUESTED_TESTS_PCIE_SERDES_TEST         0x10UL
-	#define SELFTEST_EXEC_RESP_REQUESTED_TESTS_ETHERNET_SERDES_TEST     0x20UL
-	u8	test_success;
-	#define SELFTEST_EXEC_RESP_TEST_SUCCESS_NVM_TEST                 0x1UL
-	#define SELFTEST_EXEC_RESP_TEST_SUCCESS_LINK_TEST                0x2UL
-	#define SELFTEST_EXEC_RESP_TEST_SUCCESS_REGISTER_TEST            0x4UL
-	#define SELFTEST_EXEC_RESP_TEST_SUCCESS_MEMORY_TEST              0x8UL
-	#define SELFTEST_EXEC_RESP_TEST_SUCCESS_PCIE_SERDES_TEST         0x10UL
-	#define SELFTEST_EXEC_RESP_TEST_SUCCESS_ETHERNET_SERDES_TEST     0x20UL
-	u8	unused_0[5];
-	u8	valid;
-};
-
-/* hwrm_selftest_irq_input (size:128b/16B) */
-struct hwrm_selftest_irq_input {
-	__le16	req_type;
-	__le16	cmpl_ring;
-	__le16	seq_id;
-	__le16	target_id;
-	__le64	resp_addr;
-};
-
-/* hwrm_selftest_irq_output (size:128b/16B) */
-struct hwrm_selftest_irq_output {
-	__le16	error_code;
-	__le16	req_type;
-	__le16	seq_id;
-	__le16	resp_len;
-	u8	unused_0[7];
-	u8	valid;
-};
-
-/* dbc_dbc (size:64b/8B) */
-struct dbc_dbc {
-	__le32	index;
-	#define DBC_DBC_INDEX_MASK 0xffffffUL
-	#define DBC_DBC_INDEX_SFT  0
-	#define DBC_DBC_EPOCH      0x1000000UL
-	#define DBC_DBC_TOGGLE_MASK 0x6000000UL
-	#define DBC_DBC_TOGGLE_SFT 25
-	__le32	type_path_xid;
-	#define DBC_DBC_XID_MASK          0xfffffUL
-	#define DBC_DBC_XID_SFT           0
-	#define DBC_DBC_PATH_MASK         0x3000000UL
-	#define DBC_DBC_PATH_SFT          24
-	#define DBC_DBC_PATH_ROCE           (0x0UL << 24)
-	#define DBC_DBC_PATH_L2             (0x1UL << 24)
-	#define DBC_DBC_PATH_ENGINE         (0x2UL << 24)
-	#define DBC_DBC_PATH_LAST          DBC_DBC_PATH_ENGINE
-	#define DBC_DBC_VALID             0x4000000UL
-	#define DBC_DBC_DEBUG_TRACE       0x8000000UL
-	#define DBC_DBC_TYPE_MASK         0xf0000000UL
-	#define DBC_DBC_TYPE_SFT          28
-	#define DBC_DBC_TYPE_SQ             (0x0UL << 28)
-	#define DBC_DBC_TYPE_RQ             (0x1UL << 28)
-	#define DBC_DBC_TYPE_SRQ            (0x2UL << 28)
-	#define DBC_DBC_TYPE_SRQ_ARM        (0x3UL << 28)
-	#define DBC_DBC_TYPE_CQ             (0x4UL << 28)
-	#define DBC_DBC_TYPE_CQ_ARMSE       (0x5UL << 28)
-	#define DBC_DBC_TYPE_CQ_ARMALL      (0x6UL << 28)
-	#define DBC_DBC_TYPE_CQ_ARMENA      (0x7UL << 28)
-	#define DBC_DBC_TYPE_SRQ_ARMENA     (0x8UL << 28)
-	#define DBC_DBC_TYPE_CQ_CUTOFF_ACK  (0x9UL << 28)
-	#define DBC_DBC_TYPE_NQ             (0xaUL << 28)
-	#define DBC_DBC_TYPE_NQ_ARM         (0xbUL << 28)
-	#define DBC_DBC_TYPE_NQ_MASK        (0xeUL << 28)
-	#define DBC_DBC_TYPE_NULL           (0xfUL << 28)
-	#define DBC_DBC_TYPE_LAST          DBC_DBC_TYPE_NULL
-};
-
-/* db_push_start (size:64b/8B) */
-struct db_push_start {
-	u64	db;
-	#define DB_PUSH_START_DB_INDEX_MASK     0xffffffUL
-	#define DB_PUSH_START_DB_INDEX_SFT      0
-	#define DB_PUSH_START_DB_PI_LO_MASK     0xff000000UL
-	#define DB_PUSH_START_DB_PI_LO_SFT      24
-	#define DB_PUSH_START_DB_XID_MASK       0xfffff00000000ULL
-	#define DB_PUSH_START_DB_XID_SFT        32
-	#define DB_PUSH_START_DB_PI_HI_MASK     0xf0000000000000ULL
-	#define DB_PUSH_START_DB_PI_HI_SFT      52
-	#define DB_PUSH_START_DB_TYPE_MASK      0xf000000000000000ULL
-	#define DB_PUSH_START_DB_TYPE_SFT       60
-	#define DB_PUSH_START_DB_TYPE_PUSH_START  (0xcULL << 60)
-	#define DB_PUSH_START_DB_TYPE_PUSH_END    (0xdULL << 60)
-	#define DB_PUSH_START_DB_TYPE_LAST       DB_PUSH_START_DB_TYPE_PUSH_END
-};
-
-/* db_push_end (size:64b/8B) */
-struct db_push_end {
-	u64	db;
-	#define DB_PUSH_END_DB_INDEX_MASK      0xffffffUL
-	#define DB_PUSH_END_DB_INDEX_SFT       0
-	#define DB_PUSH_END_DB_PI_LO_MASK      0xff000000UL
-	#define DB_PUSH_END_DB_PI_LO_SFT       24
-	#define DB_PUSH_END_DB_XID_MASK        0xfffff00000000ULL
-	#define DB_PUSH_END_DB_XID_SFT         32
-	#define DB_PUSH_END_DB_PI_HI_MASK      0xf0000000000000ULL
-	#define DB_PUSH_END_DB_PI_HI_SFT       52
-	#define DB_PUSH_END_DB_PATH_MASK       0x300000000000000ULL
-	#define DB_PUSH_END_DB_PATH_SFT        56
-	#define DB_PUSH_END_DB_PATH_ROCE         (0x0ULL << 56)
-	#define DB_PUSH_END_DB_PATH_L2           (0x1ULL << 56)
-	#define DB_PUSH_END_DB_PATH_ENGINE       (0x2ULL << 56)
-	#define DB_PUSH_END_DB_PATH_LAST        DB_PUSH_END_DB_PATH_ENGINE
-	#define DB_PUSH_END_DB_DEBUG_TRACE     0x800000000000000ULL
-	#define DB_PUSH_END_DB_TYPE_MASK       0xf000000000000000ULL
-	#define DB_PUSH_END_DB_TYPE_SFT        60
-	#define DB_PUSH_END_DB_TYPE_PUSH_START   (0xcULL << 60)
-	#define DB_PUSH_END_DB_TYPE_PUSH_END     (0xdULL << 60)
-	#define DB_PUSH_END_DB_TYPE_LAST        DB_PUSH_END_DB_TYPE_PUSH_END
-};
-
-/* db_push_info (size:64b/8B) */
-struct db_push_info {
-	u32	push_size_push_index;
-	#define DB_PUSH_INFO_PUSH_INDEX_MASK 0xffffffUL
-	#define DB_PUSH_INFO_PUSH_INDEX_SFT 0
-	#define DB_PUSH_INFO_PUSH_SIZE_MASK 0x1f000000UL
-	#define DB_PUSH_INFO_PUSH_SIZE_SFT  24
-	u32	reserved32;
-};
-
-/* fw_status_reg (size:32b/4B) */
-struct fw_status_reg {
-	u32	fw_status;
-	#define FW_STATUS_REG_CODE_MASK              0xffffUL
-	#define FW_STATUS_REG_CODE_SFT               0
-	#define FW_STATUS_REG_CODE_READY               0x8000UL
-	#define FW_STATUS_REG_CODE_LAST               FW_STATUS_REG_CODE_READY
-	#define FW_STATUS_REG_IMAGE_DEGRADED         0x10000UL
-	#define FW_STATUS_REG_RECOVERABLE            0x20000UL
-	#define FW_STATUS_REG_CRASHDUMP_ONGOING      0x40000UL
-	#define FW_STATUS_REG_CRASHDUMP_COMPLETE     0x80000UL
-	#define FW_STATUS_REG_SHUTDOWN               0x100000UL
-	#define FW_STATUS_REG_CRASHED_NO_MASTER      0x200000UL
-	#define FW_STATUS_REG_RECOVERING             0x400000UL
-	#define FW_STATUS_REG_MANU_DEBUG_STATUS      0x800000UL
-};
-
-/* hcomm_status (size:64b/8B) */
-struct hcomm_status {
-	u32	sig_ver;
-	#define HCOMM_STATUS_VER_MASK      0xffUL
-	#define HCOMM_STATUS_VER_SFT       0
-	#define HCOMM_STATUS_VER_LATEST      0x1UL
-	#define HCOMM_STATUS_VER_LAST       HCOMM_STATUS_VER_LATEST
-	#define HCOMM_STATUS_SIGNATURE_MASK 0xffffff00UL
-	#define HCOMM_STATUS_SIGNATURE_SFT 8
-	#define HCOMM_STATUS_SIGNATURE_VAL   (0x484353UL << 8)
-	#define HCOMM_STATUS_SIGNATURE_LAST HCOMM_STATUS_SIGNATURE_VAL
-	u32	fw_status_loc;
-	#define HCOMM_STATUS_TRUE_ADDR_SPACE_MASK    0x3UL
-	#define HCOMM_STATUS_TRUE_ADDR_SPACE_SFT     0
-	#define HCOMM_STATUS_FW_STATUS_LOC_ADDR_SPACE_PCIE_CFG  0x0UL
-	#define HCOMM_STATUS_FW_STATUS_LOC_ADDR_SPACE_GRC       0x1UL
-	#define HCOMM_STATUS_FW_STATUS_LOC_ADDR_SPACE_BAR0      0x2UL
-	#define HCOMM_STATUS_FW_STATUS_LOC_ADDR_SPACE_BAR1      0x3UL
-	#define HCOMM_STATUS_FW_STATUS_LOC_ADDR_SPACE_LAST     HCOMM_STATUS_FW_STATUS_LOC_ADDR_SPACE_BAR1
-	#define HCOMM_STATUS_TRUE_OFFSET_MASK        0xfffffffcUL
-	#define HCOMM_STATUS_TRUE_OFFSET_SFT         2
-};
-#define HCOMM_STATUS_STRUCT_LOC 0x31001F0UL
-
-#endif /* _BNXT_HSI_H_ */
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_hwmon.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_hwmon.c
index 669d24ba0e87..de3427c6c6aa 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_hwmon.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_hwmon.c
@@ -12,8 +12,8 @@
 #include <linux/hwmon.h>
 #include <linux/hwmon-sysfs.h>
 #include <linux/pci.h>
+#include <linux/bnxt/hsi.h>
 
-#include "bnxt_hsi.h"
 #include "bnxt.h"
 #include "bnxt_hwrm.h"
 #include "bnxt_hwmon.h"
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.c
index d2fd2d04ed47..5ce190f50120 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.c
@@ -20,8 +20,8 @@
 #include <linux/netdevice.h>
 #include <linux/pci.h>
 #include <linux/skbuff.h>
+#include <linux/bnxt/hsi.h>
 
-#include "bnxt_hsi.h"
 #include "bnxt.h"
 #include "bnxt_hwrm.h"
 
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.h
index fb5f5b063c3d..791b3a0cdb83 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.h
@@ -10,7 +10,7 @@
 #ifndef BNXT_HWRM_H
 #define BNXT_HWRM_H
 
-#include "bnxt_hsi.h"
+#include <linux/bnxt/hsi.h>
 
 enum bnxt_hwrm_ctx_flags {
 	/* Update the HWRM_API_FLAGS right below for any new non-internal bit added here */
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c
index 0669d43472f5..471b1393ce6c 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c
@@ -15,7 +15,7 @@
 #include <linux/timekeeping.h>
 #include <linux/ptp_classify.h>
 #include <linux/clocksource.h>
-#include "bnxt_hsi.h"
+#include <linux/bnxt/hsi.h>
 #include "bnxt.h"
 #include "bnxt_hwrm.h"
 #include "bnxt_ptp.h"
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
index bc0d80356568..ec14b51ba38e 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c
@@ -16,7 +16,7 @@
 #include <linux/interrupt.h>
 #include <linux/etherdevice.h>
 #include <net/dcbnl.h>
-#include "bnxt_hsi.h"
+#include <linux/bnxt/hsi.h>
 #include "bnxt.h"
 #include "bnxt_hwrm.h"
 #include "bnxt_ulp.h"
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
index 0599d3016224..d72fd248f3aa 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
@@ -19,8 +19,8 @@
 #include <net/tc_act/tc_pedit.h>
 #include <net/tc_act/tc_tunnel_key.h>
 #include <net/vxlan.h>
+#include <linux/bnxt/hsi.h>
 
-#include "bnxt_hsi.h"
 #include "bnxt.h"
 #include "bnxt_hwrm.h"
 #include "bnxt_sriov.h"
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c
index 2450a369b792..61cf201bb0dc 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c
@@ -21,8 +21,8 @@
 #include <linux/bitmap.h>
 #include <linux/auxiliary_bus.h>
 #include <net/netdev_lock.h>
+#include <linux/bnxt/hsi.h>
 
-#include "bnxt_hsi.h"
 #include "bnxt.h"
 #include "bnxt_hwrm.h"
 #include "bnxt_ulp.h"
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c
index 619f0844e778..bd116fd578d8 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c
@@ -12,8 +12,8 @@
 #include <linux/rtnetlink.h>
 #include <linux/jhash.h>
 #include <net/pkt_cls.h>
+#include <linux/bnxt/hsi.h>
 
-#include "bnxt_hsi.h"
 #include "bnxt.h"
 #include "bnxt_hwrm.h"
 #include "bnxt_vfr.h"
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
index 09e7e8efa6fa..58d579dca3f1 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
@@ -17,7 +17,7 @@
 #include <linux/filter.h>
 #include <net/netdev_lock.h>
 #include <net/page_pool/helpers.h>
-#include "bnxt_hsi.h"
+#include <linux/bnxt/hsi.h>
 #include "bnxt.h"
 #include "bnxt_xdp.h"
 
diff --git a/include/linux/bnxt/hsi.h b/include/linux/bnxt/hsi.h
new file mode 100644
index 000000000000..549231703bce
--- /dev/null
+++ b/include/linux/bnxt/hsi.h
@@ -0,0 +1,10914 @@
+/* Broadcom NetXtreme-C/E network driver.
+ *
+ * Copyright (c) 2014-2016 Broadcom Corporation
+ * Copyright (c) 2014-2018 Broadcom Limited
+ * Copyright (c) 2018-2025 Broadcom Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ *
+ * DO NOT MODIFY!!! This file is automatically generated.
+ */
+
+#ifndef _BNXT_HSI_H_
+#define _BNXT_HSI_H_
+
+/* hwrm_cmd_hdr (size:128b/16B): request header; struct input below has the same field list */
+struct hwrm_cmd_hdr {
+	__le16	req_type;	/* HWRM_* codes are listed under req_type in struct cmd_nums */
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;	/* presumably where the response is to be written; confirm */
+};
+
+/* hwrm_resp_hdr (size:64b/8B): response header; the same four fields open hwrm_err_output below */
+struct hwrm_resp_hdr {
+	__le16	error_code;	/* HWRM_ERR_CODE_* values are listed in struct ret_codes */
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+};
+
+#define CMD_DISCR_TLV_ENCAP 0x8000UL	/* presumably a value for tlv.cmd_discr; confirm */
+#define CMD_DISCR_LAST     CMD_DISCR_TLV_ENCAP	/* alias of the only CMD_DISCR_* value */
+
+
+#define TLV_TYPE_HWRM_REQUEST                    0x1UL
+#define TLV_TYPE_HWRM_RESPONSE                   0x2UL
+#define TLV_TYPE_ROCE_SP_COMMAND                 0x3UL
+#define TLV_TYPE_QUERY_ROCE_CC_GEN1              0x4UL
+#define TLV_TYPE_MODIFY_ROCE_CC_GEN1             0x5UL
+#define TLV_TYPE_QUERY_ROCE_CC_GEN2              0x6UL
+#define TLV_TYPE_MODIFY_ROCE_CC_GEN2             0x7UL
+#define TLV_TYPE_QUERY_ROCE_CC_GEN1_EXT          0x8UL
+#define TLV_TYPE_MODIFY_ROCE_CC_GEN1_EXT         0x9UL
+#define TLV_TYPE_QUERY_ROCE_CC_GEN2_EXT          0xaUL
+#define TLV_TYPE_MODIFY_ROCE_CC_GEN2_EXT         0xbUL
+#define TLV_TYPE_ENGINE_CKV_ALIAS_ECC_PUBLIC_KEY 0x8001UL
+#define TLV_TYPE_ENGINE_CKV_IV                   0x8003UL	/* 0x8002 is not defined in this list */
+#define TLV_TYPE_ENGINE_CKV_AUTH_TAG             0x8004UL
+#define TLV_TYPE_ENGINE_CKV_CIPHERTEXT           0x8005UL
+#define TLV_TYPE_ENGINE_CKV_HOST_ALGORITHMS      0x8006UL
+#define TLV_TYPE_ENGINE_CKV_HOST_ECC_PUBLIC_KEY  0x8007UL
+#define TLV_TYPE_ENGINE_CKV_ECDSA_SIGNATURE      0x8008UL
+#define TLV_TYPE_ENGINE_CKV_FW_ECC_PUBLIC_KEY    0x8009UL
+#define TLV_TYPE_ENGINE_CKV_FW_ALGORITHMS        0x800aUL
+#define TLV_TYPE_LAST                           TLV_TYPE_ENGINE_CKV_FW_ALGORITHMS	/* alias of the highest TLV_TYPE_* value */
+
+
+/* tlv (size:64b/8B): TLV header; flags carries MORE in bit 0 and REQUIRED in bit 1 */
+struct tlv {
+	__le16	cmd_discr;
+	u8	reserved_8b;
+	u8	flags;
+	#define TLV_FLAGS_MORE         0x1UL
+	#define TLV_FLAGS_MORE_LAST      0x0UL
+	#define TLV_FLAGS_MORE_NOT_LAST  0x1UL
+	#define TLV_FLAGS_REQUIRED     0x2UL
+	#define TLV_FLAGS_REQUIRED_NO    (0x0UL << 1)
+	#define TLV_FLAGS_REQUIRED_YES   (0x1UL << 1)
+	#define TLV_FLAGS_REQUIRED_LAST TLV_FLAGS_REQUIRED_YES
+	__le16	tlv_type;	/* presumably one of the TLV_TYPE_* values defined above; confirm */
+	__le16	length;
+};
+
+/* input (size:128b/16B): same five fields, in the same order, as struct hwrm_cmd_hdr above */
+struct input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+};
+
+/* output (size:64b/8B): same four fields, in the same order, as struct hwrm_resp_hdr above */
+struct output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+};
+
+/* hwrm_short_input (size:128b/16B): short request form; presumably req_addr and size locate the full request (confirm) */
+struct hwrm_short_input {
+	__le16	req_type;
+	__le16	signature;
+	#define SHORT_REQ_SIGNATURE_SHORT_CMD 0x4321UL
+	#define SHORT_REQ_SIGNATURE_LAST     SHORT_REQ_SIGNATURE_SHORT_CMD
+	__le16	target_id;
+	#define SHORT_REQ_TARGET_ID_DEFAULT 0x0UL
+	#define SHORT_REQ_TARGET_ID_TOOLS   0xfffdUL
+	#define SHORT_REQ_TARGET_ID_LAST   SHORT_REQ_TARGET_ID_TOOLS
+	__le16	size;
+	__le64	req_addr;
+};
+
+/* cmd_nums (size:64b/8B): HWRM_* request codes for req_type; numbering is sparse (0x0 is followed by 0xb) */
+struct cmd_nums {
+	__le16	req_type;
+	#define HWRM_VER_GET                              0x0UL
+	#define HWRM_FUNC_ECHO_RESPONSE                   0xbUL
+	#define HWRM_ERROR_RECOVERY_QCFG                  0xcUL
+	#define HWRM_FUNC_DRV_IF_CHANGE                   0xdUL
+	#define HWRM_FUNC_BUF_UNRGTR                      0xeUL
+	#define HWRM_FUNC_VF_CFG                          0xfUL
+	#define HWRM_RESERVED1                            0x10UL
+	#define HWRM_FUNC_RESET                           0x11UL
+	#define HWRM_FUNC_GETFID                          0x12UL
+	#define HWRM_FUNC_VF_ALLOC                        0x13UL
+	#define HWRM_FUNC_VF_FREE                         0x14UL
+	#define HWRM_FUNC_QCAPS                           0x15UL
+	#define HWRM_FUNC_QCFG                            0x16UL
+	#define HWRM_FUNC_CFG                             0x17UL
+	#define HWRM_FUNC_QSTATS                          0x18UL
+	#define HWRM_FUNC_CLR_STATS                       0x19UL
+	#define HWRM_FUNC_DRV_UNRGTR                      0x1aUL
+	#define HWRM_FUNC_VF_RESC_FREE                    0x1bUL
+	#define HWRM_FUNC_VF_VNIC_IDS_QUERY               0x1cUL
+	#define HWRM_FUNC_DRV_RGTR                        0x1dUL
+	#define HWRM_FUNC_DRV_QVER                        0x1eUL
+	#define HWRM_FUNC_BUF_RGTR                        0x1fUL
+	#define HWRM_PORT_PHY_CFG                         0x20UL
+	#define HWRM_PORT_MAC_CFG                         0x21UL
+	#define HWRM_PORT_TS_QUERY                        0x22UL
+	#define HWRM_PORT_QSTATS                          0x23UL
+	#define HWRM_PORT_LPBK_QSTATS                     0x24UL
+	#define HWRM_PORT_CLR_STATS                       0x25UL
+	#define HWRM_PORT_LPBK_CLR_STATS                  0x26UL
+	#define HWRM_PORT_PHY_QCFG                        0x27UL
+	#define HWRM_PORT_MAC_QCFG                        0x28UL
+	#define HWRM_PORT_MAC_PTP_QCFG                    0x29UL
+	#define HWRM_PORT_PHY_QCAPS                       0x2aUL
+	#define HWRM_PORT_PHY_I2C_WRITE                   0x2bUL
+	#define HWRM_PORT_PHY_I2C_READ                    0x2cUL
+	#define HWRM_PORT_LED_CFG                         0x2dUL
+	#define HWRM_PORT_LED_QCFG                        0x2eUL
+	#define HWRM_PORT_LED_QCAPS                       0x2fUL
+	#define HWRM_QUEUE_QPORTCFG                       0x30UL
+	#define HWRM_QUEUE_QCFG                           0x31UL
+	#define HWRM_QUEUE_CFG                            0x32UL
+	#define HWRM_FUNC_VLAN_CFG                        0x33UL
+	#define HWRM_FUNC_VLAN_QCFG                       0x34UL
+	#define HWRM_QUEUE_PFCENABLE_QCFG                 0x35UL
+	#define HWRM_QUEUE_PFCENABLE_CFG                  0x36UL
+	#define HWRM_QUEUE_PRI2COS_QCFG                   0x37UL
+	#define HWRM_QUEUE_PRI2COS_CFG                    0x38UL
+	#define HWRM_QUEUE_COS2BW_QCFG                    0x39UL
+	#define HWRM_QUEUE_COS2BW_CFG                     0x3aUL
+	#define HWRM_QUEUE_DSCP_QCAPS                     0x3bUL
+	#define HWRM_QUEUE_DSCP2PRI_QCFG                  0x3cUL
+	#define HWRM_QUEUE_DSCP2PRI_CFG                   0x3dUL
+	#define HWRM_VNIC_ALLOC                           0x40UL
+	#define HWRM_VNIC_FREE                            0x41UL
+	#define HWRM_VNIC_CFG                             0x42UL
+	#define HWRM_VNIC_QCFG                            0x43UL
+	#define HWRM_VNIC_TPA_CFG                         0x44UL
+	#define HWRM_VNIC_TPA_QCFG                        0x45UL
+	#define HWRM_VNIC_RSS_CFG                         0x46UL
+	#define HWRM_VNIC_RSS_QCFG                        0x47UL
+	#define HWRM_VNIC_PLCMODES_CFG                    0x48UL
+	#define HWRM_VNIC_PLCMODES_QCFG                   0x49UL
+	#define HWRM_VNIC_QCAPS                           0x4aUL
+	#define HWRM_VNIC_UPDATE                          0x4bUL
+	#define HWRM_RING_ALLOC                           0x50UL
+	#define HWRM_RING_FREE                            0x51UL
+	#define HWRM_RING_CMPL_RING_QAGGINT_PARAMS        0x52UL
+	#define HWRM_RING_CMPL_RING_CFG_AGGINT_PARAMS     0x53UL
+	#define HWRM_RING_AGGINT_QCAPS                    0x54UL
+	#define HWRM_RING_SCHQ_ALLOC                      0x55UL
+	#define HWRM_RING_SCHQ_CFG                        0x56UL
+	#define HWRM_RING_SCHQ_FREE                       0x57UL
+	#define HWRM_RING_RESET                           0x5eUL
+	#define HWRM_RING_GRP_ALLOC                       0x60UL
+	#define HWRM_RING_GRP_FREE                        0x61UL
+	#define HWRM_RING_CFG                             0x62UL
+	#define HWRM_RING_QCFG                            0x63UL
+	#define HWRM_RESERVED5                            0x64UL
+	#define HWRM_RESERVED6                            0x65UL
+	#define HWRM_VNIC_RSS_COS_LB_CTX_ALLOC            0x70UL
+	#define HWRM_VNIC_RSS_COS_LB_CTX_FREE             0x71UL
+	#define HWRM_QUEUE_MPLS_QCAPS                     0x80UL
+	#define HWRM_QUEUE_MPLSTC2PRI_QCFG                0x81UL
+	#define HWRM_QUEUE_MPLSTC2PRI_CFG                 0x82UL
+	#define HWRM_QUEUE_VLANPRI_QCAPS                  0x83UL
+	#define HWRM_QUEUE_VLANPRI2PRI_QCFG               0x84UL
+	#define HWRM_QUEUE_VLANPRI2PRI_CFG                0x85UL
+	#define HWRM_QUEUE_GLOBAL_CFG                     0x86UL
+	#define HWRM_QUEUE_GLOBAL_QCFG                    0x87UL
+	#define HWRM_QUEUE_ADPTV_QOS_RX_FEATURE_QCFG      0x88UL
+	#define HWRM_QUEUE_ADPTV_QOS_RX_FEATURE_CFG       0x89UL
+	#define HWRM_QUEUE_ADPTV_QOS_TX_FEATURE_QCFG      0x8aUL
+	#define HWRM_QUEUE_ADPTV_QOS_TX_FEATURE_CFG       0x8bUL
+	#define HWRM_QUEUE_QCAPS                          0x8cUL
+	#define HWRM_QUEUE_ADPTV_QOS_RX_TUNING_QCFG       0x8dUL
+	#define HWRM_QUEUE_ADPTV_QOS_RX_TUNING_CFG        0x8eUL
+	#define HWRM_QUEUE_ADPTV_QOS_TX_TUNING_QCFG       0x8fUL
+	#define HWRM_CFA_L2_FILTER_ALLOC                  0x90UL
+	#define HWRM_CFA_L2_FILTER_FREE                   0x91UL
+	#define HWRM_CFA_L2_FILTER_CFG                    0x92UL
+	#define HWRM_CFA_L2_SET_RX_MASK                   0x93UL
+	#define HWRM_CFA_VLAN_ANTISPOOF_CFG               0x94UL
+	#define HWRM_CFA_TUNNEL_FILTER_ALLOC              0x95UL
+	#define HWRM_CFA_TUNNEL_FILTER_FREE               0x96UL
+	#define HWRM_CFA_ENCAP_RECORD_ALLOC               0x97UL
+	#define HWRM_CFA_ENCAP_RECORD_FREE                0x98UL
+	#define HWRM_CFA_NTUPLE_FILTER_ALLOC              0x99UL
+	#define HWRM_CFA_NTUPLE_FILTER_FREE               0x9aUL
+	#define HWRM_CFA_NTUPLE_FILTER_CFG                0x9bUL
+	#define HWRM_CFA_EM_FLOW_ALLOC                    0x9cUL
+	#define HWRM_CFA_EM_FLOW_FREE                     0x9dUL
+	#define HWRM_CFA_EM_FLOW_CFG                      0x9eUL
+	#define HWRM_TUNNEL_DST_PORT_QUERY                0xa0UL
+	#define HWRM_TUNNEL_DST_PORT_ALLOC                0xa1UL
+	#define HWRM_TUNNEL_DST_PORT_FREE                 0xa2UL
+	#define HWRM_QUEUE_ADPTV_QOS_TX_TUNING_CFG        0xa3UL
+	#define HWRM_STAT_CTX_ENG_QUERY                   0xafUL
+	#define HWRM_STAT_CTX_ALLOC                       0xb0UL
+	#define HWRM_STAT_CTX_FREE                        0xb1UL
+	#define HWRM_STAT_CTX_QUERY                       0xb2UL
+	#define HWRM_STAT_CTX_CLR_STATS                   0xb3UL
+	#define HWRM_PORT_QSTATS_EXT                      0xb4UL
+	#define HWRM_PORT_PHY_MDIO_WRITE                  0xb5UL
+	#define HWRM_PORT_PHY_MDIO_READ                   0xb6UL
+	#define HWRM_PORT_PHY_MDIO_BUS_ACQUIRE            0xb7UL
+	#define HWRM_PORT_PHY_MDIO_BUS_RELEASE            0xb8UL
+	#define HWRM_PORT_QSTATS_EXT_PFC_WD               0xb9UL
+	#define HWRM_RESERVED7                            0xbaUL
+	#define HWRM_PORT_TX_FIR_CFG                      0xbbUL
+	#define HWRM_PORT_TX_FIR_QCFG                     0xbcUL
+	#define HWRM_PORT_ECN_QSTATS                      0xbdUL
+	#define HWRM_FW_LIVEPATCH_QUERY                   0xbeUL
+	#define HWRM_FW_LIVEPATCH                         0xbfUL
+	#define HWRM_FW_RESET                             0xc0UL
+	#define HWRM_FW_QSTATUS                           0xc1UL
+	#define HWRM_FW_HEALTH_CHECK                      0xc2UL
+	#define HWRM_FW_SYNC                              0xc3UL
+	#define HWRM_FW_STATE_QCAPS                       0xc4UL
+	#define HWRM_FW_STATE_QUIESCE                     0xc5UL
+	#define HWRM_FW_STATE_BACKUP                      0xc6UL
+	#define HWRM_FW_STATE_RESTORE                     0xc7UL
+	#define HWRM_FW_SET_TIME                          0xc8UL
+	#define HWRM_FW_GET_TIME                          0xc9UL
+	#define HWRM_FW_SET_STRUCTURED_DATA               0xcaUL
+	#define HWRM_FW_GET_STRUCTURED_DATA               0xcbUL
+	#define HWRM_FW_IPC_MAILBOX                       0xccUL
+	#define HWRM_FW_ECN_CFG                           0xcdUL
+	#define HWRM_FW_ECN_QCFG                          0xceUL
+	#define HWRM_FW_SECURE_CFG                        0xcfUL
+	#define HWRM_EXEC_FWD_RESP                        0xd0UL
+	#define HWRM_REJECT_FWD_RESP                      0xd1UL
+	#define HWRM_FWD_RESP                             0xd2UL
+	#define HWRM_FWD_ASYNC_EVENT_CMPL                 0xd3UL
+	#define HWRM_OEM_CMD                              0xd4UL
+	#define HWRM_PORT_PRBS_TEST                       0xd5UL
+	#define HWRM_PORT_SFP_SIDEBAND_CFG                0xd6UL
+	#define HWRM_PORT_SFP_SIDEBAND_QCFG               0xd7UL
+	#define HWRM_FW_STATE_UNQUIESCE                   0xd8UL
+	#define HWRM_PORT_DSC_DUMP                        0xd9UL
+	#define HWRM_PORT_EP_TX_QCFG                      0xdaUL
+	#define HWRM_PORT_EP_TX_CFG                       0xdbUL
+	#define HWRM_PORT_CFG                             0xdcUL
+	#define HWRM_PORT_QCFG                            0xddUL
+	#define HWRM_PORT_MAC_QCAPS                       0xdfUL
+	#define HWRM_TEMP_MONITOR_QUERY                   0xe0UL
+	#define HWRM_REG_POWER_QUERY                      0xe1UL
+	#define HWRM_CORE_FREQUENCY_QUERY                 0xe2UL
+	#define HWRM_REG_POWER_HISTOGRAM                  0xe3UL
+	#define HWRM_WOL_FILTER_ALLOC                     0xf0UL
+	#define HWRM_WOL_FILTER_FREE                      0xf1UL
+	#define HWRM_WOL_FILTER_QCFG                      0xf2UL
+	#define HWRM_WOL_REASON_QCFG                      0xf3UL
+	#define HWRM_CFA_METER_QCAPS                      0xf4UL
+	#define HWRM_CFA_METER_PROFILE_ALLOC              0xf5UL
+	#define HWRM_CFA_METER_PROFILE_FREE               0xf6UL
+	#define HWRM_CFA_METER_PROFILE_CFG                0xf7UL
+	#define HWRM_CFA_METER_INSTANCE_ALLOC             0xf8UL
+	#define HWRM_CFA_METER_INSTANCE_FREE              0xf9UL
+	#define HWRM_CFA_METER_INSTANCE_CFG               0xfaUL
+	#define HWRM_CFA_VFR_ALLOC                        0xfdUL
+	#define HWRM_CFA_VFR_FREE                         0xfeUL
+	#define HWRM_CFA_VF_PAIR_ALLOC                    0x100UL
+	#define HWRM_CFA_VF_PAIR_FREE                     0x101UL
+	#define HWRM_CFA_VF_PAIR_INFO                     0x102UL
+	#define HWRM_CFA_FLOW_ALLOC                       0x103UL
+	#define HWRM_CFA_FLOW_FREE                        0x104UL
+	#define HWRM_CFA_FLOW_FLUSH                       0x105UL
+	#define HWRM_CFA_FLOW_STATS                       0x106UL
+	#define HWRM_CFA_FLOW_INFO                        0x107UL
+	#define HWRM_CFA_DECAP_FILTER_ALLOC               0x108UL
+	#define HWRM_CFA_DECAP_FILTER_FREE                0x109UL
+	#define HWRM_CFA_VLAN_ANTISPOOF_QCFG              0x10aUL
+	#define HWRM_CFA_REDIRECT_TUNNEL_TYPE_ALLOC       0x10bUL
+	#define HWRM_CFA_REDIRECT_TUNNEL_TYPE_FREE        0x10cUL
+	#define HWRM_CFA_PAIR_ALLOC                       0x10dUL
+	#define HWRM_CFA_PAIR_FREE                        0x10eUL
+	#define HWRM_CFA_PAIR_INFO                        0x10fUL
+	#define HWRM_FW_IPC_MSG                           0x110UL
+	#define HWRM_CFA_REDIRECT_TUNNEL_TYPE_INFO        0x111UL
+	#define HWRM_CFA_REDIRECT_QUERY_TUNNEL_TYPE       0x112UL
+	#define HWRM_CFA_FLOW_AGING_TIMER_RESET           0x113UL
+	#define HWRM_CFA_FLOW_AGING_CFG                   0x114UL
+	#define HWRM_CFA_FLOW_AGING_QCFG                  0x115UL
+	#define HWRM_CFA_FLOW_AGING_QCAPS                 0x116UL
+	#define HWRM_CFA_CTX_MEM_RGTR                     0x117UL
+	#define HWRM_CFA_CTX_MEM_UNRGTR                   0x118UL
+	#define HWRM_CFA_CTX_MEM_QCTX                     0x119UL
+	#define HWRM_CFA_CTX_MEM_QCAPS                    0x11aUL
+	#define HWRM_CFA_COUNTER_QCAPS                    0x11bUL
+	#define HWRM_CFA_COUNTER_CFG                      0x11cUL
+	#define HWRM_CFA_COUNTER_QCFG                     0x11dUL
+	#define HWRM_CFA_COUNTER_QSTATS                   0x11eUL
+	#define HWRM_CFA_TCP_FLAG_PROCESS_QCFG            0x11fUL
+	#define HWRM_CFA_EEM_QCAPS                        0x120UL
+	#define HWRM_CFA_EEM_CFG                          0x121UL
+	#define HWRM_CFA_EEM_QCFG                         0x122UL
+	#define HWRM_CFA_EEM_OP                           0x123UL
+	#define HWRM_CFA_ADV_FLOW_MGNT_QCAPS              0x124UL
+	#define HWRM_CFA_TFLIB                            0x125UL
+	#define HWRM_CFA_LAG_GROUP_MEMBER_RGTR            0x126UL
+	#define HWRM_CFA_LAG_GROUP_MEMBER_UNRGTR          0x127UL
+	#define HWRM_CFA_TLS_FILTER_ALLOC                 0x128UL
+	#define HWRM_CFA_TLS_FILTER_FREE                  0x129UL
+	#define HWRM_CFA_RELEASE_AFM_FUNC                 0x12aUL
+	#define HWRM_ENGINE_CKV_STATUS                    0x12eUL
+	#define HWRM_ENGINE_CKV_CKEK_ADD                  0x12fUL
+	#define HWRM_ENGINE_CKV_CKEK_DELETE               0x130UL
+	#define HWRM_ENGINE_CKV_KEY_ADD                   0x131UL
+	#define HWRM_ENGINE_CKV_KEY_DELETE                0x132UL
+	#define HWRM_ENGINE_CKV_FLUSH                     0x133UL
+	#define HWRM_ENGINE_CKV_RNG_GET                   0x134UL
+	#define HWRM_ENGINE_CKV_KEY_GEN                   0x135UL
+	#define HWRM_ENGINE_CKV_KEY_LABEL_CFG             0x136UL
+	#define HWRM_ENGINE_CKV_KEY_LABEL_QCFG            0x137UL
+	#define HWRM_ENGINE_QG_CONFIG_QUERY               0x13cUL
+	#define HWRM_ENGINE_QG_QUERY                      0x13dUL
+	#define HWRM_ENGINE_QG_METER_PROFILE_CONFIG_QUERY 0x13eUL
+	#define HWRM_ENGINE_QG_METER_PROFILE_QUERY        0x13fUL
+	#define HWRM_ENGINE_QG_METER_PROFILE_ALLOC        0x140UL
+	#define HWRM_ENGINE_QG_METER_PROFILE_FREE         0x141UL
+	#define HWRM_ENGINE_QG_METER_QUERY                0x142UL
+	#define HWRM_ENGINE_QG_METER_BIND                 0x143UL
+	#define HWRM_ENGINE_QG_METER_UNBIND               0x144UL
+	#define HWRM_ENGINE_QG_FUNC_BIND                  0x145UL
+	#define HWRM_ENGINE_SG_CONFIG_QUERY               0x146UL
+	#define HWRM_ENGINE_SG_QUERY                      0x147UL
+	#define HWRM_ENGINE_SG_METER_QUERY                0x148UL
+	#define HWRM_ENGINE_SG_METER_CONFIG               0x149UL
+	#define HWRM_ENGINE_SG_QG_BIND                    0x14aUL
+	#define HWRM_ENGINE_QG_SG_UNBIND                  0x14bUL
+	#define HWRM_ENGINE_CONFIG_QUERY                  0x154UL
+	#define HWRM_ENGINE_STATS_CONFIG                  0x155UL
+	#define HWRM_ENGINE_STATS_CLEAR                   0x156UL
+	#define HWRM_ENGINE_STATS_QUERY                   0x157UL
+	#define HWRM_ENGINE_STATS_QUERY_CONTINUOUS_ERROR  0x158UL
+	#define HWRM_ENGINE_RQ_ALLOC                      0x15eUL
+	#define HWRM_ENGINE_RQ_FREE                       0x15fUL
+	#define HWRM_ENGINE_CQ_ALLOC                      0x160UL
+	#define HWRM_ENGINE_CQ_FREE                       0x161UL
+	#define HWRM_ENGINE_NQ_ALLOC                      0x162UL
+	#define HWRM_ENGINE_NQ_FREE                       0x163UL
+	#define HWRM_ENGINE_ON_DIE_RQE_CREDITS            0x164UL
+	#define HWRM_ENGINE_FUNC_QCFG                     0x165UL
+	#define HWRM_FUNC_RESOURCE_QCAPS                  0x190UL
+	#define HWRM_FUNC_VF_RESOURCE_CFG                 0x191UL
+	#define HWRM_FUNC_BACKING_STORE_QCAPS             0x192UL
+	#define HWRM_FUNC_BACKING_STORE_CFG               0x193UL
+	#define HWRM_FUNC_BACKING_STORE_QCFG              0x194UL
+	#define HWRM_FUNC_VF_BW_CFG                       0x195UL
+	#define HWRM_FUNC_VF_BW_QCFG                      0x196UL
+	#define HWRM_FUNC_HOST_PF_IDS_QUERY               0x197UL
+	#define HWRM_FUNC_QSTATS_EXT                      0x198UL
+	#define HWRM_STAT_EXT_CTX_QUERY                   0x199UL
+	#define HWRM_FUNC_SPD_CFG                         0x19aUL
+	#define HWRM_FUNC_SPD_QCFG                        0x19bUL
+	#define HWRM_FUNC_PTP_PIN_QCFG                    0x19cUL
+	#define HWRM_FUNC_PTP_PIN_CFG                     0x19dUL
+	#define HWRM_FUNC_PTP_CFG                         0x19eUL
+	#define HWRM_FUNC_PTP_TS_QUERY                    0x19fUL
+	#define HWRM_FUNC_PTP_EXT_CFG                     0x1a0UL
+	#define HWRM_FUNC_PTP_EXT_QCFG                    0x1a1UL
+	#define HWRM_FUNC_KEY_CTX_ALLOC                   0x1a2UL
+	#define HWRM_FUNC_BACKING_STORE_CFG_V2            0x1a3UL
+	#define HWRM_FUNC_BACKING_STORE_QCFG_V2           0x1a4UL
+	#define HWRM_FUNC_DBR_PACING_CFG                  0x1a5UL
+	#define HWRM_FUNC_DBR_PACING_QCFG                 0x1a6UL
+	#define HWRM_FUNC_DBR_PACING_BROADCAST_EVENT      0x1a7UL
+	#define HWRM_FUNC_BACKING_STORE_QCAPS_V2          0x1a8UL
+	#define HWRM_FUNC_DBR_PACING_NQLIST_QUERY         0x1a9UL
+	#define HWRM_FUNC_DBR_RECOVERY_COMPLETED          0x1aaUL
+	#define HWRM_FUNC_SYNCE_CFG                       0x1abUL
+	#define HWRM_FUNC_SYNCE_QCFG                      0x1acUL
+	#define HWRM_FUNC_KEY_CTX_FREE                    0x1adUL
+	#define HWRM_FUNC_LAG_MODE_CFG                    0x1aeUL
+	#define HWRM_FUNC_LAG_MODE_QCFG                   0x1afUL
+	#define HWRM_FUNC_LAG_CREATE                      0x1b0UL
+	#define HWRM_FUNC_LAG_UPDATE                      0x1b1UL
+	#define HWRM_FUNC_LAG_FREE                        0x1b2UL
+	#define HWRM_FUNC_LAG_QCFG                        0x1b3UL
+	#define HWRM_FUNC_TIMEDTX_PACING_RATE_ADD         0x1c2UL
+	#define HWRM_FUNC_TIMEDTX_PACING_RATE_DELETE      0x1c3UL
+	#define HWRM_FUNC_TIMEDTX_PACING_RATE_QUERY       0x1c4UL
+	#define HWRM_SELFTEST_QLIST                       0x200UL
+	#define HWRM_SELFTEST_EXEC                        0x201UL
+	#define HWRM_SELFTEST_IRQ                         0x202UL
+	#define HWRM_SELFTEST_RETRIEVE_SERDES_DATA        0x203UL
+	#define HWRM_PCIE_QSTATS                          0x204UL
+	#define HWRM_MFG_FRU_WRITE_CONTROL                0x205UL
+	#define HWRM_MFG_TIMERS_QUERY                     0x206UL
+	#define HWRM_MFG_OTP_CFG                          0x207UL
+	#define HWRM_MFG_OTP_QCFG                         0x208UL
+	#define HWRM_MFG_HDMA_TEST                        0x209UL
+	#define HWRM_MFG_FRU_EEPROM_WRITE                 0x20aUL
+	#define HWRM_MFG_FRU_EEPROM_READ                  0x20bUL
+	#define HWRM_MFG_SOC_IMAGE                        0x20cUL
+	#define HWRM_MFG_SOC_QSTATUS                      0x20dUL
+	#define HWRM_MFG_PARAM_CRITICAL_DATA_FINALIZE     0x20eUL
+	#define HWRM_MFG_PARAM_CRITICAL_DATA_READ         0x20fUL
+	#define HWRM_MFG_PARAM_CRITICAL_DATA_HEALTH       0x210UL
+	#define HWRM_MFG_PRVSN_EXPORT_CSR                 0x211UL
+	#define HWRM_MFG_PRVSN_IMPORT_CERT                0x212UL
+	#define HWRM_MFG_PRVSN_GET_STATE                  0x213UL
+	#define HWRM_MFG_GET_NVM_MEASUREMENT              0x214UL
+	#define HWRM_MFG_PSOC_QSTATUS                     0x215UL
+	#define HWRM_MFG_SELFTEST_QLIST                   0x216UL
+	#define HWRM_MFG_SELFTEST_EXEC                    0x217UL
+	#define HWRM_STAT_GENERIC_QSTATS                  0x218UL
+	#define HWRM_MFG_PRVSN_EXPORT_CERT                0x219UL
+	#define HWRM_STAT_DB_ERROR_QSTATS                 0x21aUL
+	#define HWRM_MFG_TESTS                            0x21bUL
+	#define HWRM_MFG_WRITE_CERT_NVM                   0x21cUL
+	#define HWRM_PORT_POE_CFG                         0x230UL
+	#define HWRM_PORT_POE_QCFG                        0x231UL
+	#define HWRM_UDCC_QCAPS                           0x258UL
+	#define HWRM_UDCC_CFG                             0x259UL
+	#define HWRM_UDCC_QCFG                            0x25aUL
+	#define HWRM_UDCC_SESSION_CFG                     0x25bUL
+	#define HWRM_UDCC_SESSION_QCFG                    0x25cUL
+	#define HWRM_UDCC_SESSION_QUERY                   0x25dUL
+	#define HWRM_UDCC_COMP_CFG                        0x25eUL
+	#define HWRM_UDCC_COMP_QCFG                       0x25fUL
+	#define HWRM_UDCC_COMP_QUERY                      0x260UL
+	#define HWRM_QUEUE_PFCWD_TIMEOUT_QCAPS            0x261UL
+	#define HWRM_QUEUE_PFCWD_TIMEOUT_CFG              0x262UL
+	#define HWRM_QUEUE_PFCWD_TIMEOUT_QCFG             0x263UL
+	#define HWRM_TF                                   0x2bcUL
+	#define HWRM_TF_VERSION_GET                       0x2bdUL
+	#define HWRM_TF_SESSION_OPEN                      0x2c6UL
+	#define HWRM_TF_SESSION_REGISTER                  0x2c8UL
+	#define HWRM_TF_SESSION_UNREGISTER                0x2c9UL
+	#define HWRM_TF_SESSION_CLOSE                     0x2caUL
+	#define HWRM_TF_SESSION_QCFG                      0x2cbUL
+	#define HWRM_TF_SESSION_RESC_QCAPS                0x2ccUL
+	#define HWRM_TF_SESSION_RESC_ALLOC                0x2cdUL
+	#define HWRM_TF_SESSION_RESC_FREE                 0x2ceUL
+	#define HWRM_TF_SESSION_RESC_FLUSH                0x2cfUL
+	#define HWRM_TF_SESSION_RESC_INFO                 0x2d0UL
+	#define HWRM_TF_SESSION_HOTUP_STATE_SET           0x2d1UL
+	#define HWRM_TF_SESSION_HOTUP_STATE_GET           0x2d2UL
+	#define HWRM_TF_TBL_TYPE_GET                      0x2daUL
+	#define HWRM_TF_TBL_TYPE_SET                      0x2dbUL
+	#define HWRM_TF_TBL_TYPE_BULK_GET                 0x2dcUL
+	#define HWRM_TF_EM_INSERT                         0x2eaUL
+	#define HWRM_TF_EM_DELETE                         0x2ebUL
+	#define HWRM_TF_EM_HASH_INSERT                    0x2ecUL
+	#define HWRM_TF_EM_MOVE                           0x2edUL
+	#define HWRM_TF_TCAM_SET                          0x2f8UL
+	#define HWRM_TF_TCAM_GET                          0x2f9UL
+	#define HWRM_TF_TCAM_MOVE                         0x2faUL
+	#define HWRM_TF_TCAM_FREE                         0x2fbUL
+	#define HWRM_TF_GLOBAL_CFG_SET                    0x2fcUL
+	#define HWRM_TF_GLOBAL_CFG_GET                    0x2fdUL
+	#define HWRM_TF_IF_TBL_SET                        0x2feUL
+	#define HWRM_TF_IF_TBL_GET                        0x2ffUL
+	#define HWRM_TF_RESC_USAGE_SET                    0x300UL
+	#define HWRM_TF_RESC_USAGE_QUERY                  0x301UL
+	#define HWRM_TF_TBL_TYPE_ALLOC                    0x302UL
+	#define HWRM_TF_TBL_TYPE_FREE                     0x303UL
+	#define HWRM_TFC_TBL_SCOPE_QCAPS                  0x380UL
+	#define HWRM_TFC_TBL_SCOPE_ID_ALLOC               0x381UL
+	#define HWRM_TFC_TBL_SCOPE_CONFIG                 0x382UL
+	#define HWRM_TFC_TBL_SCOPE_DECONFIG               0x383UL
+	#define HWRM_TFC_TBL_SCOPE_FID_ADD                0x384UL
+	#define HWRM_TFC_TBL_SCOPE_FID_REM                0x385UL
+	#define HWRM_TFC_TBL_SCOPE_POOL_ALLOC             0x386UL
+	#define HWRM_TFC_TBL_SCOPE_POOL_FREE              0x387UL
+	#define HWRM_TFC_SESSION_ID_ALLOC                 0x388UL
+	#define HWRM_TFC_SESSION_FID_ADD                  0x389UL
+	#define HWRM_TFC_SESSION_FID_REM                  0x38aUL
+	#define HWRM_TFC_IDENT_ALLOC                      0x38bUL
+	#define HWRM_TFC_IDENT_FREE                       0x38cUL
+	#define HWRM_TFC_IDX_TBL_ALLOC                    0x38dUL
+	#define HWRM_TFC_IDX_TBL_ALLOC_SET                0x38eUL
+	#define HWRM_TFC_IDX_TBL_SET                      0x38fUL
+	#define HWRM_TFC_IDX_TBL_GET                      0x390UL
+	#define HWRM_TFC_IDX_TBL_FREE                     0x391UL
+	#define HWRM_TFC_GLOBAL_ID_ALLOC                  0x392UL
+	#define HWRM_TFC_TCAM_SET                         0x393UL
+	#define HWRM_TFC_TCAM_GET                         0x394UL
+	#define HWRM_TFC_TCAM_ALLOC                       0x395UL
+	#define HWRM_TFC_TCAM_ALLOC_SET                   0x396UL
+	#define HWRM_TFC_TCAM_FREE                        0x397UL
+	#define HWRM_TFC_IF_TBL_SET                       0x398UL
+	#define HWRM_TFC_IF_TBL_GET                       0x399UL
+	#define HWRM_TFC_TBL_SCOPE_CONFIG_GET             0x39aUL
+	#define HWRM_TFC_RESC_USAGE_QUERY                 0x39bUL
+	#define HWRM_TFC_GLOBAL_ID_FREE                   0x39cUL
+	#define HWRM_TFC_TCAM_PRI_UPDATE                  0x39dUL
+	#define HWRM_TFC_HOT_UPGRADE_PROCESS              0x3a0UL
+	#define HWRM_SV                                   0x400UL
+	#define HWRM_DBG_SERDES_TEST                      0xff0eUL
+	#define HWRM_DBG_LOG_BUFFER_FLUSH                 0xff0fUL
+	#define HWRM_DBG_READ_DIRECT                      0xff10UL
+	#define HWRM_DBG_READ_INDIRECT                    0xff11UL
+	#define HWRM_DBG_WRITE_DIRECT                     0xff12UL
+	#define HWRM_DBG_WRITE_INDIRECT                   0xff13UL
+	#define HWRM_DBG_DUMP                             0xff14UL
+	#define HWRM_DBG_ERASE_NVM                        0xff15UL
+	#define HWRM_DBG_CFG                              0xff16UL
+	#define HWRM_DBG_COREDUMP_LIST                    0xff17UL
+	#define HWRM_DBG_COREDUMP_INITIATE                0xff18UL
+	#define HWRM_DBG_COREDUMP_RETRIEVE                0xff19UL
+	#define HWRM_DBG_FW_CLI                           0xff1aUL
+	#define HWRM_DBG_I2C_CMD                          0xff1bUL
+	#define HWRM_DBG_RING_INFO_GET                    0xff1cUL
+	#define HWRM_DBG_CRASHDUMP_HEADER                 0xff1dUL
+	#define HWRM_DBG_CRASHDUMP_ERASE                  0xff1eUL
+	#define HWRM_DBG_DRV_TRACE                        0xff1fUL
+	#define HWRM_DBG_QCAPS                            0xff20UL
+	#define HWRM_DBG_QCFG                             0xff21UL
+	#define HWRM_DBG_CRASHDUMP_MEDIUM_CFG             0xff22UL
+	#define HWRM_DBG_USEQ_ALLOC                       0xff23UL
+	#define HWRM_DBG_USEQ_FREE                        0xff24UL
+	#define HWRM_DBG_USEQ_FLUSH                       0xff25UL
+	#define HWRM_DBG_USEQ_QCAPS                       0xff26UL
+	#define HWRM_DBG_USEQ_CW_CFG                      0xff27UL
+	#define HWRM_DBG_USEQ_SCHED_CFG                   0xff28UL
+	#define HWRM_DBG_USEQ_RUN                         0xff29UL
+	#define HWRM_DBG_USEQ_DELIVERY_REQ                0xff2aUL
+	#define HWRM_DBG_USEQ_RESP_HDR                    0xff2bUL
+	#define HWRM_DBG_COREDUMP_CAPTURE                 0xff2cUL
+	#define HWRM_DBG_PTRACE                           0xff2dUL
+	#define HWRM_DBG_SIM_CABLE_STATE                  0xff2eUL
+	#define HWRM_NVM_GET_VPD_FIELD_INFO               0xffeaUL
+	#define HWRM_NVM_SET_VPD_FIELD_INFO               0xffebUL
+	#define HWRM_NVM_DEFRAG                           0xffecUL
+	#define HWRM_NVM_REQ_ARBITRATION                  0xffedUL
+	#define HWRM_NVM_FACTORY_DEFAULTS                 0xffeeUL
+	#define HWRM_NVM_VALIDATE_OPTION                  0xffefUL
+	#define HWRM_NVM_FLUSH                            0xfff0UL
+	#define HWRM_NVM_GET_VARIABLE                     0xfff1UL
+	#define HWRM_NVM_SET_VARIABLE                     0xfff2UL
+	#define HWRM_NVM_INSTALL_UPDATE                   0xfff3UL
+	#define HWRM_NVM_MODIFY                           0xfff4UL
+	#define HWRM_NVM_VERIFY_UPDATE                    0xfff5UL
+	#define HWRM_NVM_GET_DEV_INFO                     0xfff6UL
+	#define HWRM_NVM_ERASE_DIR_ENTRY                  0xfff7UL
+	#define HWRM_NVM_MOD_DIR_ENTRY                    0xfff8UL
+	#define HWRM_NVM_FIND_DIR_ENTRY                   0xfff9UL
+	#define HWRM_NVM_GET_DIR_ENTRIES                  0xfffaUL
+	#define HWRM_NVM_GET_DIR_INFO                     0xfffbUL
+	#define HWRM_NVM_RAW_DUMP                         0xfffcUL
+	#define HWRM_NVM_READ                             0xfffdUL
+	#define HWRM_NVM_WRITE                            0xfffeUL
+	#define HWRM_NVM_RAW_WRITE_BLK                    0xffffUL
+	#define HWRM_LAST                                HWRM_NVM_RAW_WRITE_BLK
+	__le16	unused_0[3];	/* pads the 2-byte req_type up to the 8-byte struct size */
+};
+
+/* ret_codes (size:64b/8B): HWRM_ERR_CODE_* values for error_code; 0x0 is SUCCESS */
+struct ret_codes {
+	__le16	error_code;
+	#define HWRM_ERR_CODE_SUCCESS                      0x0UL
+	#define HWRM_ERR_CODE_FAIL                         0x1UL
+	#define HWRM_ERR_CODE_INVALID_PARAMS               0x2UL
+	#define HWRM_ERR_CODE_RESOURCE_ACCESS_DENIED       0x3UL
+	#define HWRM_ERR_CODE_RESOURCE_ALLOC_ERROR         0x4UL
+	#define HWRM_ERR_CODE_INVALID_FLAGS                0x5UL
+	#define HWRM_ERR_CODE_INVALID_ENABLES              0x6UL
+	#define HWRM_ERR_CODE_UNSUPPORTED_TLV              0x7UL
+	#define HWRM_ERR_CODE_NO_BUFFER                    0x8UL
+	#define HWRM_ERR_CODE_UNSUPPORTED_OPTION_ERR       0x9UL
+	#define HWRM_ERR_CODE_HOT_RESET_PROGRESS           0xaUL
+	#define HWRM_ERR_CODE_HOT_RESET_FAIL               0xbUL
+	#define HWRM_ERR_CODE_NO_FLOW_COUNTER_DURING_ALLOC 0xcUL
+	#define HWRM_ERR_CODE_KEY_HASH_COLLISION           0xdUL
+	#define HWRM_ERR_CODE_KEY_ALREADY_EXISTS           0xeUL
+	#define HWRM_ERR_CODE_HWRM_ERROR                   0xfUL
+	#define HWRM_ERR_CODE_BUSY                         0x10UL
+	#define HWRM_ERR_CODE_RESOURCE_LOCKED              0x11UL
+	#define HWRM_ERR_CODE_PF_UNAVAILABLE               0x12UL
+	#define HWRM_ERR_CODE_ENTITY_NOT_PRESENT           0x13UL
+	#define HWRM_ERR_CODE_SECURE_SOC_ERROR             0x14UL
+	#define HWRM_ERR_CODE_TLV_ENCAPSULATED_RESPONSE    0x8000UL
+	#define HWRM_ERR_CODE_UNKNOWN_ERR                  0xfffeUL
+	#define HWRM_ERR_CODE_CMD_NOT_SUPPORTED            0xffffUL
+	#define HWRM_ERR_CODE_LAST                        HWRM_ERR_CODE_CMD_NOT_SUPPORTED
+	__le16	unused_0[3];	/* pads the 2-byte error_code up to the 8-byte struct size */
+};
+
+/* hwrm_err_output (size:128b/16B): four 16-bit words (error_code, req_type, seq_id, resp_len), a 32-bit opaque_0 and 16-bit opaque_1, then the cmd_err and valid bytes */
+struct hwrm_err_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le32	opaque_0;
+	__le16	opaque_1;
+	u8	cmd_err;
+	u8	valid;
+};
+#define HWRM_NA_SIGNATURE ((__le32)(-1))
+#define HWRM_MAX_REQ_LEN 128
+#define HWRM_MAX_RESP_LEN 704
+#define HW_HASH_INDEX_SIZE 0x80
+#define HW_HASH_KEY_SIZE 40
+#define HWRM_RESP_VALID_KEY 1
+#define HWRM_TARGET_ID_BONO 0xFFF8
+#define HWRM_TARGET_ID_KONG 0xFFF9
+#define HWRM_TARGET_ID_APE 0xFFFA
+#define HWRM_TARGET_ID_TOOLS 0xFFFD
+#define HWRM_VERSION_MAJOR 1
+#define HWRM_VERSION_MINOR 10
+#define HWRM_VERSION_UPDATE 3
+#define HWRM_VERSION_RSVD 97
+#define HWRM_VERSION_STR "1.10.3.97" /* MAJOR.MINOR.UPDATE.RSVD, matching the four HWRM_VERSION_ macros above */
+
+/* hwrm_ver_get_input (size:192b/24B): 16 bytes of req_type, cmpl_ring, seq_id, target_id and 64-bit resp_addr, then the hwrm_intf_maj, hwrm_intf_min and hwrm_intf_upd bytes and 5 unused bytes */
+struct hwrm_ver_get_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	u8	hwrm_intf_maj;
+	u8	hwrm_intf_min;
+	u8	hwrm_intf_upd;
+	u8	unused_0[5];
+};
+
+/* hwrm_ver_get_output (size:1408b/176B): 8-byte header, 8-bit version fields (suffix _8b), dev_caps_cfg flag word, five 16-byte name strings, chip fields, request/response length and timeout fields, 16-bit version fields, trailing valid byte */
+struct hwrm_ver_get_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	hwrm_intf_maj_8b;
+	u8	hwrm_intf_min_8b;
+	u8	hwrm_intf_upd_8b;
+	u8	hwrm_intf_rsvd_8b;
+	u8	hwrm_fw_maj_8b;
+	u8	hwrm_fw_min_8b;
+	u8	hwrm_fw_bld_8b;
+	u8	hwrm_fw_rsvd_8b;
+	u8	mgmt_fw_maj_8b;
+	u8	mgmt_fw_min_8b;
+	u8	mgmt_fw_bld_8b;
+	u8	mgmt_fw_rsvd_8b;
+	u8	netctrl_fw_maj_8b;
+	u8	netctrl_fw_min_8b;
+	u8	netctrl_fw_bld_8b;
+	u8	netctrl_fw_rsvd_8b;
+	__le32	dev_caps_cfg;
+	#define VER_GET_RESP_DEV_CAPS_CFG_SECURE_FW_UPD_SUPPORTED                  0x1UL
+	#define VER_GET_RESP_DEV_CAPS_CFG_FW_DCBX_AGENT_SUPPORTED                  0x2UL
+	#define VER_GET_RESP_DEV_CAPS_CFG_SHORT_CMD_SUPPORTED                      0x4UL
+	#define VER_GET_RESP_DEV_CAPS_CFG_SHORT_CMD_REQUIRED                       0x8UL
+	#define VER_GET_RESP_DEV_CAPS_CFG_KONG_MB_CHNL_SUPPORTED                   0x10UL
+	#define VER_GET_RESP_DEV_CAPS_CFG_FLOW_HANDLE_64BIT_SUPPORTED              0x20UL
+	#define VER_GET_RESP_DEV_CAPS_CFG_L2_FILTER_TYPES_ROCE_OR_L2_SUPPORTED     0x40UL
+	#define VER_GET_RESP_DEV_CAPS_CFG_VIRTIO_VSWITCH_OFFLOAD_SUPPORTED         0x80UL
+	#define VER_GET_RESP_DEV_CAPS_CFG_TRUSTED_VF_SUPPORTED                     0x100UL
+	#define VER_GET_RESP_DEV_CAPS_CFG_FLOW_AGING_SUPPORTED                     0x200UL
+	#define VER_GET_RESP_DEV_CAPS_CFG_ADV_FLOW_COUNTERS_SUPPORTED              0x400UL
+	#define VER_GET_RESP_DEV_CAPS_CFG_CFA_EEM_SUPPORTED                        0x800UL
+	#define VER_GET_RESP_DEV_CAPS_CFG_CFA_ADV_FLOW_MGNT_SUPPORTED              0x1000UL
+	#define VER_GET_RESP_DEV_CAPS_CFG_CFA_TFLIB_SUPPORTED                      0x2000UL
+	#define VER_GET_RESP_DEV_CAPS_CFG_CFA_TRUFLOW_SUPPORTED                    0x4000UL
+	#define VER_GET_RESP_DEV_CAPS_CFG_SECURE_BOOT_CAPABLE                      0x8000UL
+	#define VER_GET_RESP_DEV_CAPS_CFG_SECURE_SOC_CAPABLE                       0x10000UL
+	u8	roce_fw_maj_8b;
+	u8	roce_fw_min_8b;
+	u8	roce_fw_bld_8b;
+	u8	roce_fw_rsvd_8b;
+	char	hwrm_fw_name[16];
+	char	mgmt_fw_name[16];
+	char	netctrl_fw_name[16];
+	char	active_pkg_name[16];
+	char	roce_fw_name[16];
+	__le16	chip_num;
+	u8	chip_rev;
+	u8	chip_metal;
+	u8	chip_bond_id;
+	u8	chip_platform_type;
+	#define VER_GET_RESP_CHIP_PLATFORM_TYPE_ASIC      0x0UL
+	#define VER_GET_RESP_CHIP_PLATFORM_TYPE_FPGA      0x1UL
+	#define VER_GET_RESP_CHIP_PLATFORM_TYPE_PALLADIUM 0x2UL
+	#define VER_GET_RESP_CHIP_PLATFORM_TYPE_LAST     VER_GET_RESP_CHIP_PLATFORM_TYPE_PALLADIUM
+	__le16	max_req_win_len;
+	__le16	max_resp_len;
+	__le16	def_req_timeout;
+	u8	flags;
+	#define VER_GET_RESP_FLAGS_DEV_NOT_RDY                   0x1UL
+	#define VER_GET_RESP_FLAGS_EXT_VER_AVAIL                 0x2UL
+	#define VER_GET_RESP_FLAGS_DEV_NOT_RDY_BACKING_STORE     0x4UL
+	u8	unused_0[2];
+	u8	always_1;
+	__le16	hwrm_intf_major;
+	__le16	hwrm_intf_minor;
+	__le16	hwrm_intf_build;
+	__le16	hwrm_intf_patch;
+	__le16	hwrm_fw_major;
+	__le16	hwrm_fw_minor;
+	__le16	hwrm_fw_build;
+	__le16	hwrm_fw_patch;
+	__le16	mgmt_fw_major;
+	__le16	mgmt_fw_minor;
+	__le16	mgmt_fw_build;
+	__le16	mgmt_fw_patch;
+	__le16	netctrl_fw_major;
+	__le16	netctrl_fw_minor;
+	__le16	netctrl_fw_build;
+	__le16	netctrl_fw_patch;
+	__le16	roce_fw_major;
+	__le16	roce_fw_minor;
+	__le16	roce_fw_build;
+	__le16	roce_fw_patch;
+	__le16	max_ext_req_len;
+	__le16	max_req_timeout;
+	u8	unused_1[3];
+	u8	valid;
+};
+
+/* eject_cmpl (size:128b/16B): type word (bits 0-5 type, bits 6-15 flags), len, opaque, v word (bit 0 V, bits 1-15 errors with a buffer-error code in bits 1-3), reserved16, unused_2 */
+struct eject_cmpl {
+	__le16	type;
+	#define EJECT_CMPL_TYPE_MASK       0x3fUL
+	#define EJECT_CMPL_TYPE_SFT        0
+	#define EJECT_CMPL_TYPE_STAT_EJECT   0x1aUL
+	#define EJECT_CMPL_TYPE_LAST        EJECT_CMPL_TYPE_STAT_EJECT
+	#define EJECT_CMPL_FLAGS_MASK      0xffc0UL
+	#define EJECT_CMPL_FLAGS_SFT       6
+	#define EJECT_CMPL_FLAGS_ERROR      0x40UL
+	__le16	len;
+	__le32	opaque;
+	__le16	v;
+	#define EJECT_CMPL_V                              0x1UL
+	#define EJECT_CMPL_ERRORS_MASK                    0xfffeUL
+	#define EJECT_CMPL_ERRORS_SFT                     1
+	#define EJECT_CMPL_ERRORS_BUFFER_ERROR_MASK        0xeUL
+	#define EJECT_CMPL_ERRORS_BUFFER_ERROR_SFT         1
+	#define EJECT_CMPL_ERRORS_BUFFER_ERROR_NO_BUFFER     (0x0UL << 1)
+	#define EJECT_CMPL_ERRORS_BUFFER_ERROR_DID_NOT_FIT   (0x1UL << 1)
+	#define EJECT_CMPL_ERRORS_BUFFER_ERROR_BAD_FORMAT    (0x3UL << 1)
+	#define EJECT_CMPL_ERRORS_BUFFER_ERROR_FLUSH         (0x5UL << 1)
+	#define EJECT_CMPL_ERRORS_BUFFER_ERROR_LAST         EJECT_CMPL_ERRORS_BUFFER_ERROR_FLUSH
+	__le16	reserved16;
+	__le32	unused_2;
+};
+
+/* hwrm_cmpl (size:128b/16B): type (low 6 bits, CMPL_TYPE_HWRM_DONE = 0x20), sequence_id, unused_1, v (CMPL_V = bit 0), unused_3 */
+struct hwrm_cmpl {
+	__le16	type;
+	#define CMPL_TYPE_MASK     0x3fUL
+	#define CMPL_TYPE_SFT      0
+	#define CMPL_TYPE_HWRM_DONE  0x20UL
+	#define CMPL_TYPE_LAST      CMPL_TYPE_HWRM_DONE
+	__le16	sequence_id;
+	__le32	unused_1;
+	__le32	v;
+	#define CMPL_V     0x1UL
+	__le32	unused_3;
+};
+
+/* hwrm_fwd_req_cmpl (size:128b/16B): req_len_type (bits 0-5 type, 0x22 = HWRM_FWD_REQ; bits 6-15 req_len), source_id, unused0, and two 32-bit req_buf_addr_v words (V = 0x1, address mask 0xfffffffe) */
+struct hwrm_fwd_req_cmpl {
+	__le16	req_len_type;
+	#define FWD_REQ_CMPL_TYPE_MASK        0x3fUL
+	#define FWD_REQ_CMPL_TYPE_SFT         0
+	#define FWD_REQ_CMPL_TYPE_HWRM_FWD_REQ  0x22UL
+	#define FWD_REQ_CMPL_TYPE_LAST         FWD_REQ_CMPL_TYPE_HWRM_FWD_REQ
+	#define FWD_REQ_CMPL_REQ_LEN_MASK     0xffc0UL
+	#define FWD_REQ_CMPL_REQ_LEN_SFT      6
+	__le16	source_id;
+	__le32	unused0;
+	__le32	req_buf_addr_v[2];
+	#define FWD_REQ_CMPL_V                0x1UL
+	#define FWD_REQ_CMPL_REQ_BUF_ADDR_MASK 0xfffffffeUL
+	#define FWD_REQ_CMPL_REQ_BUF_ADDR_SFT 1
+};
+
+/* hwrm_fwd_resp_cmpl (size:128b/16B): type (low 6 bits, 0x24 = HWRM_FWD_RESP), source_id, resp_len, unused_1, and two 32-bit resp_buf_addr_v words (V = 0x1, address mask 0xfffffffe) */
+struct hwrm_fwd_resp_cmpl {
+	__le16	type;
+	#define FWD_RESP_CMPL_TYPE_MASK         0x3fUL
+	#define FWD_RESP_CMPL_TYPE_SFT          0
+	#define FWD_RESP_CMPL_TYPE_HWRM_FWD_RESP  0x24UL
+	#define FWD_RESP_CMPL_TYPE_LAST          FWD_RESP_CMPL_TYPE_HWRM_FWD_RESP
+	__le16	source_id;
+	__le16	resp_len;
+	__le16	unused_1;
+	__le32	resp_buf_addr_v[2];
+	#define FWD_RESP_CMPL_V                 0x1UL
+	#define FWD_RESP_CMPL_RESP_BUF_ADDR_MASK 0xfffffffeUL
+	#define FWD_RESP_CMPL_RESP_BUF_ADDR_SFT 1
+};
+
+/* hwrm_async_event_cmpl (size:128b/16B): type (low 6 bits, 0x2e = HWRM_ASYNC_EVENT), event_id (ASYNC_EVENT_CMPL_EVENT_ID_* values), event_data2, opaque_v (bit 0 V, bits 1-7 opaque), timestamp_lo, timestamp_hi, event_data1 */
+struct hwrm_async_event_cmpl {
+	__le16	type;
+	#define ASYNC_EVENT_CMPL_TYPE_MASK            0x3fUL
+	#define ASYNC_EVENT_CMPL_TYPE_SFT             0
+	#define ASYNC_EVENT_CMPL_TYPE_HWRM_ASYNC_EVENT  0x2eUL
+	#define ASYNC_EVENT_CMPL_TYPE_LAST             ASYNC_EVENT_CMPL_TYPE_HWRM_ASYNC_EVENT
+	__le16	event_id;
+	#define ASYNC_EVENT_CMPL_EVENT_ID_LINK_STATUS_CHANGE              0x0UL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_LINK_MTU_CHANGE                 0x1UL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_LINK_SPEED_CHANGE               0x2UL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_DCB_CONFIG_CHANGE               0x3UL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_PORT_CONN_NOT_ALLOWED           0x4UL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_LINK_SPEED_CFG_NOT_ALLOWED      0x5UL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_LINK_SPEED_CFG_CHANGE           0x6UL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_PORT_PHY_CFG_CHANGE             0x7UL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_RESET_NOTIFY                    0x8UL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_ERROR_RECOVERY                  0x9UL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_RING_MONITOR_MSG                0xaUL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_FUNC_DRVR_UNLOAD                0x10UL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_FUNC_DRVR_LOAD                  0x11UL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_FUNC_FLR_PROC_CMPLT             0x12UL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_PF_DRVR_UNLOAD                  0x20UL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_PF_DRVR_LOAD                    0x21UL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_VF_FLR                          0x30UL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_VF_MAC_ADDR_CHANGE              0x31UL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_PF_VF_COMM_STATUS_CHANGE        0x32UL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_VF_CFG_CHANGE                   0x33UL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_LLFC_PFC_CHANGE                 0x34UL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_DEFAULT_VNIC_CHANGE             0x35UL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_HW_FLOW_AGED                    0x36UL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_DEBUG_NOTIFICATION              0x37UL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_EEM_CACHE_FLUSH_REQ             0x38UL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_EEM_CACHE_FLUSH_DONE            0x39UL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_TCP_FLAG_ACTION_CHANGE          0x3aUL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_EEM_FLOW_ACTIVE                 0x3bUL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_EEM_CFG_CHANGE                  0x3cUL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_TFLIB_DEFAULT_VNIC_CHANGE       0x3dUL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_TFLIB_LINK_STATUS_CHANGE        0x3eUL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_QUIESCE_DONE                    0x3fUL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_DEFERRED_RESPONSE               0x40UL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_PFC_WATCHDOG_CFG_CHANGE         0x41UL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_ECHO_REQUEST                    0x42UL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_PHC_UPDATE                      0x43UL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_PPS_TIMESTAMP                   0x44UL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_ERROR_REPORT                    0x45UL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_DOORBELL_PACING_THRESHOLD       0x46UL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_RSS_CHANGE                      0x47UL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_DOORBELL_PACING_NQ_UPDATE       0x48UL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_HW_DOORBELL_RECOVERY_READ_ERROR 0x49UL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_CTX_ERROR                       0x4aUL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_UDCC_SESSION_CHANGE             0x4bUL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_DBG_BUF_PRODUCER                0x4cUL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_PEER_MMAP_CHANGE                0x4dUL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_REPRESENTOR_PAIR_CHANGE         0x4eUL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_VF_STAT_CHANGE                  0x4fUL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_HOST_COREDUMP                   0x50UL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_MAX_RGTR_EVENT_ID               0x51UL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_FW_TRACE_MSG                    0xfeUL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_HWRM_ERROR                      0xffUL
+	#define ASYNC_EVENT_CMPL_EVENT_ID_LAST                           ASYNC_EVENT_CMPL_EVENT_ID_HWRM_ERROR
+	__le32	event_data2;
+	u8	opaque_v;
+	#define ASYNC_EVENT_CMPL_V          0x1UL
+	#define ASYNC_EVENT_CMPL_OPAQUE_MASK 0xfeUL
+	#define ASYNC_EVENT_CMPL_OPAQUE_SFT 1
+	u8	timestamp_lo;
+	__le16	timestamp_hi;
+	__le32	event_data1;
+};
+
+/* hwrm_async_event_cmpl_link_status_change (size:128b/16B): event_id 0x0; event_data1 packs link change (bit 0: 0 = down, 1 = up), port (bits 1-3), port_id (bits 4-19) and pf_id (bits 20-27) */
+struct hwrm_async_event_cmpl_link_status_change {
+	__le16	type;
+	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_TYPE_MASK            0x3fUL
+	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_TYPE_SFT             0
+	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_TYPE_HWRM_ASYNC_EVENT  0x2eUL
+	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_TYPE_LAST             ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_TYPE_HWRM_ASYNC_EVENT
+	__le16	event_id;
+	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_ID_LINK_STATUS_CHANGE 0x0UL
+	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_ID_LAST              ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_ID_LINK_STATUS_CHANGE
+	__le32	event_data2;
+	u8	opaque_v;
+	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_V          0x1UL
+	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_OPAQUE_MASK 0xfeUL
+	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_OPAQUE_SFT 1
+	u8	timestamp_lo;
+	__le16	timestamp_hi;
+	__le32	event_data1;
+	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_DATA1_LINK_CHANGE     0x1UL
+	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_DATA1_LINK_CHANGE_DOWN  0x0UL
+	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_DATA1_LINK_CHANGE_UP    0x1UL
+	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_DATA1_LINK_CHANGE_LAST ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_DATA1_LINK_CHANGE_UP
+	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_DATA1_PORT_MASK       0xeUL
+	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_DATA1_PORT_SFT        1
+	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_DATA1_PORT_ID_MASK    0xffff0UL
+	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_DATA1_PORT_ID_SFT     4
+	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_DATA1_PF_ID_MASK      0xff00000UL
+	#define ASYNC_EVENT_CMPL_LINK_STATUS_CHANGE_EVENT_DATA1_PF_ID_SFT       20
+};
+
+/* hwrm_async_event_cmpl_port_conn_not_allowed (size:128b/16B): event_id 0x4; event_data1 packs port_id (bits 0-15) and enforcement policy (bits 16-23: NONE, DISABLETX, WARNINGMSG or PWRDOWN) */
+struct hwrm_async_event_cmpl_port_conn_not_allowed {
+	__le16	type;
+	#define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_TYPE_MASK            0x3fUL
+	#define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_TYPE_SFT             0
+	#define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_TYPE_HWRM_ASYNC_EVENT  0x2eUL
+	#define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_TYPE_LAST             ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_TYPE_HWRM_ASYNC_EVENT
+	__le16	event_id;
+	#define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_ID_PORT_CONN_NOT_ALLOWED 0x4UL
+	#define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_ID_LAST                 ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_ID_PORT_CONN_NOT_ALLOWED
+	__le32	event_data2;
+	u8	opaque_v;
+	#define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_V          0x1UL
+	#define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_OPAQUE_MASK 0xfeUL
+	#define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_OPAQUE_SFT 1
+	u8	timestamp_lo;
+	__le16	timestamp_hi;
+	__le32	event_data1;
+	#define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_DATA1_PORT_ID_MASK                 0xffffUL
+	#define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_DATA1_PORT_ID_SFT                  0
+	#define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_DATA1_ENFORCEMENT_POLICY_MASK      0xff0000UL
+	#define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_DATA1_ENFORCEMENT_POLICY_SFT       16
+	#define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_DATA1_ENFORCEMENT_POLICY_NONE        (0x0UL << 16)
+	#define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_DATA1_ENFORCEMENT_POLICY_DISABLETX   (0x1UL << 16)
+	#define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_DATA1_ENFORCEMENT_POLICY_WARNINGMSG  (0x2UL << 16)
+	#define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_DATA1_ENFORCEMENT_POLICY_PWRDOWN     (0x3UL << 16)
+	#define ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_DATA1_ENFORCEMENT_POLICY_LAST       ASYNC_EVENT_CMPL_PORT_CONN_NOT_ALLOWED_EVENT_DATA1_ENFORCEMENT_POLICY_PWRDOWN
+};
+
+/* hwrm_async_event_cmpl_link_speed_cfg_change (size:128b/16B): event_id 0x6; event_data1 packs port_id (bits 0-15), SUPPORTED_LINK_SPEEDS_CHANGE (bit 16) and ILLEGAL_LINK_SPEED_CFG (bit 17) */
+struct hwrm_async_event_cmpl_link_speed_cfg_change {
+	__le16	type;
+	#define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_TYPE_MASK            0x3fUL
+	#define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_TYPE_SFT             0
+	#define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_TYPE_HWRM_ASYNC_EVENT  0x2eUL
+	#define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_TYPE_LAST             ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_TYPE_HWRM_ASYNC_EVENT
+	__le16	event_id;
+	#define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_EVENT_ID_LINK_SPEED_CFG_CHANGE 0x6UL
+	#define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_EVENT_ID_LAST                 ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_EVENT_ID_LINK_SPEED_CFG_CHANGE
+	__le32	event_data2;
+	u8	opaque_v;
+	#define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_V          0x1UL
+	#define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_OPAQUE_MASK 0xfeUL
+	#define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_OPAQUE_SFT 1
+	u8	timestamp_lo;
+	__le16	timestamp_hi;
+	__le32	event_data1;
+	#define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_EVENT_DATA1_PORT_ID_MASK                     0xffffUL
+	#define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_EVENT_DATA1_PORT_ID_SFT                      0
+	#define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_EVENT_DATA1_SUPPORTED_LINK_SPEEDS_CHANGE     0x10000UL
+	#define ASYNC_EVENT_CMPL_LINK_SPEED_CFG_CHANGE_EVENT_DATA1_ILLEGAL_LINK_SPEED_CFG           0x20000UL
+};
+
+/* hwrm_async_event_cmpl_reset_notify (size:128b/16B): event_id 0x8; event_data2 bits 0-15 hold fw_status_code; event_data1 packs driver_action (bits 0-7), reason_code (bits 8-15) and delay_in_100ms_ticks (bits 16-31) */
+struct hwrm_async_event_cmpl_reset_notify {
+	__le16	type;
+	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_TYPE_MASK            0x3fUL
+	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_TYPE_SFT             0
+	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_TYPE_HWRM_ASYNC_EVENT  0x2eUL
+	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_TYPE_LAST             ASYNC_EVENT_CMPL_RESET_NOTIFY_TYPE_HWRM_ASYNC_EVENT
+	__le16	event_id;
+	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_ID_RESET_NOTIFY 0x8UL
+	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_ID_LAST        ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_ID_RESET_NOTIFY
+	__le32	event_data2;
+	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA2_FW_STATUS_CODE_MASK 0xffffUL
+	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA2_FW_STATUS_CODE_SFT 0
+	u8	opaque_v;
+	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_V          0x1UL
+	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_OPAQUE_MASK 0xfeUL
+	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_OPAQUE_SFT 1
+	u8	timestamp_lo;
+	__le16	timestamp_hi;
+	__le32	event_data1;
+	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_DRIVER_ACTION_MASK                  0xffUL
+	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_DRIVER_ACTION_SFT                   0
+	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_DRIVER_ACTION_DRIVER_STOP_TX_QUEUE    0x1UL
+	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_DRIVER_ACTION_DRIVER_IFDOWN           0x2UL
+	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_DRIVER_ACTION_LAST                   ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_DRIVER_ACTION_DRIVER_IFDOWN
+	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_REASON_CODE_MASK                    0xff00UL
+	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_REASON_CODE_SFT                     8
+	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_REASON_CODE_MANAGEMENT_RESET_REQUEST  (0x1UL << 8)
+	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_REASON_CODE_FW_EXCEPTION_FATAL        (0x2UL << 8)
+	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_REASON_CODE_FW_EXCEPTION_NON_FATAL    (0x3UL << 8)
+	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_REASON_CODE_FAST_RESET                (0x4UL << 8)
+	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_REASON_CODE_FW_ACTIVATION             (0x5UL << 8)
+	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_REASON_CODE_LAST                     ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_REASON_CODE_FW_ACTIVATION
+	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_DELAY_IN_100MS_TICKS_MASK           0xffff0000UL
+	#define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_DELAY_IN_100MS_TICKS_SFT            16
+};
+
+/* hwrm_async_event_cmpl_error_recovery (size:128b/16B): event_id 0x9; event_data1 bits 0-7 are flags, with MASTER_FUNC = 0x1 and RECOVERY_ENABLED = 0x2 */
+struct hwrm_async_event_cmpl_error_recovery {
+	__le16	type;
+	#define ASYNC_EVENT_CMPL_ERROR_RECOVERY_TYPE_MASK            0x3fUL
+	#define ASYNC_EVENT_CMPL_ERROR_RECOVERY_TYPE_SFT             0
+	#define ASYNC_EVENT_CMPL_ERROR_RECOVERY_TYPE_HWRM_ASYNC_EVENT  0x2eUL
+	#define ASYNC_EVENT_CMPL_ERROR_RECOVERY_TYPE_LAST             ASYNC_EVENT_CMPL_ERROR_RECOVERY_TYPE_HWRM_ASYNC_EVENT
+	__le16	event_id;
+	#define ASYNC_EVENT_CMPL_ERROR_RECOVERY_EVENT_ID_ERROR_RECOVERY 0x9UL
+	#define ASYNC_EVENT_CMPL_ERROR_RECOVERY_EVENT_ID_LAST          ASYNC_EVENT_CMPL_ERROR_RECOVERY_EVENT_ID_ERROR_RECOVERY
+	__le32	event_data2;
+	u8	opaque_v;
+	#define ASYNC_EVENT_CMPL_ERROR_RECOVERY_V          0x1UL
+	#define ASYNC_EVENT_CMPL_ERROR_RECOVERY_OPAQUE_MASK 0xfeUL
+	#define ASYNC_EVENT_CMPL_ERROR_RECOVERY_OPAQUE_SFT 1
+	u8	timestamp_lo;
+	__le16	timestamp_hi;
+	__le32	event_data1;
+	#define ASYNC_EVENT_CMPL_ERROR_RECOVERY_EVENT_DATA1_FLAGS_MASK                 0xffUL
+	#define ASYNC_EVENT_CMPL_ERROR_RECOVERY_EVENT_DATA1_FLAGS_SFT                  0
+	#define ASYNC_EVENT_CMPL_ERROR_RECOVERY_EVENT_DATA1_FLAGS_MASTER_FUNC           0x1UL
+	#define ASYNC_EVENT_CMPL_ERROR_RECOVERY_EVENT_DATA1_FLAGS_RECOVERY_ENABLED      0x2UL
+};
+
+/* hwrm_async_event_cmpl_ring_monitor_msg (size:128b/16B): event_id 0xa; event_data2 bits 0-7 hold disable_ring_type (TX = 0x0, RX = 0x1, CMPL = 0x2) */
+struct hwrm_async_event_cmpl_ring_monitor_msg {
+	__le16	type;
+	#define ASYNC_EVENT_CMPL_RING_MONITOR_MSG_TYPE_MASK            0x3fUL
+	#define ASYNC_EVENT_CMPL_RING_MONITOR_MSG_TYPE_SFT             0
+	#define ASYNC_EVENT_CMPL_RING_MONITOR_MSG_TYPE_HWRM_ASYNC_EVENT  0x2eUL
+	#define ASYNC_EVENT_CMPL_RING_MONITOR_MSG_TYPE_LAST             ASYNC_EVENT_CMPL_RING_MONITOR_MSG_TYPE_HWRM_ASYNC_EVENT
+	__le16	event_id;
+	#define ASYNC_EVENT_CMPL_RING_MONITOR_MSG_EVENT_ID_RING_MONITOR_MSG 0xaUL
+	#define ASYNC_EVENT_CMPL_RING_MONITOR_MSG_EVENT_ID_LAST            ASYNC_EVENT_CMPL_RING_MONITOR_MSG_EVENT_ID_RING_MONITOR_MSG
+	__le32	event_data2;
+	#define ASYNC_EVENT_CMPL_RING_MONITOR_MSG_EVENT_DATA2_DISABLE_RING_TYPE_MASK 0xffUL
+	#define ASYNC_EVENT_CMPL_RING_MONITOR_MSG_EVENT_DATA2_DISABLE_RING_TYPE_SFT 0
+	#define ASYNC_EVENT_CMPL_RING_MONITOR_MSG_EVENT_DATA2_DISABLE_RING_TYPE_TX    0x0UL
+	#define ASYNC_EVENT_CMPL_RING_MONITOR_MSG_EVENT_DATA2_DISABLE_RING_TYPE_RX    0x1UL
+	#define ASYNC_EVENT_CMPL_RING_MONITOR_MSG_EVENT_DATA2_DISABLE_RING_TYPE_CMPL  0x2UL
+	#define ASYNC_EVENT_CMPL_RING_MONITOR_MSG_EVENT_DATA2_DISABLE_RING_TYPE_LAST ASYNC_EVENT_CMPL_RING_MONITOR_MSG_EVENT_DATA2_DISABLE_RING_TYPE_CMPL
+	u8	opaque_v;
+	#define ASYNC_EVENT_CMPL_RING_MONITOR_MSG_V          0x1UL
+	#define ASYNC_EVENT_CMPL_RING_MONITOR_MSG_OPAQUE_MASK 0xfeUL
+	#define ASYNC_EVENT_CMPL_RING_MONITOR_MSG_OPAQUE_SFT 1
+	u8	timestamp_lo;
+	__le16	timestamp_hi;
+	__le32	event_data1;
+};
+
+/* hwrm_async_event_cmpl_vf_cfg_change (size:128b/16B): event_id 0x33; event_data2 bits 0-15 hold vf_id; event_data1 bits 0-5 are single-bit change flags (MTU, MRU, DFLT_MAC_ADDR, DFLT_VLAN, TRUSTED_VF_CFG, TF_OWNERSHIP_RELEASE) */
+struct hwrm_async_event_cmpl_vf_cfg_change {
+	__le16	type;
+	#define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_TYPE_MASK            0x3fUL
+	#define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_TYPE_SFT             0
+	#define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_TYPE_HWRM_ASYNC_EVENT  0x2eUL
+	#define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_TYPE_LAST             ASYNC_EVENT_CMPL_VF_CFG_CHANGE_TYPE_HWRM_ASYNC_EVENT
+	__le16	event_id;
+	#define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_EVENT_ID_VF_CFG_CHANGE 0x33UL
+	#define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_EVENT_ID_LAST         ASYNC_EVENT_CMPL_VF_CFG_CHANGE_EVENT_ID_VF_CFG_CHANGE
+	__le32	event_data2;
+	#define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_EVENT_DATA2_VF_ID_MASK 0xffffUL
+	#define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_EVENT_DATA2_VF_ID_SFT 0
+	u8	opaque_v;
+	#define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_V          0x1UL
+	#define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_OPAQUE_MASK 0xfeUL
+	#define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_OPAQUE_SFT 1
+	u8	timestamp_lo;
+	__le16	timestamp_hi;
+	__le32	event_data1;
+	#define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_EVENT_DATA1_MTU_CHANGE                0x1UL
+	#define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_EVENT_DATA1_MRU_CHANGE                0x2UL
+	#define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_EVENT_DATA1_DFLT_MAC_ADDR_CHANGE      0x4UL
+	#define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_EVENT_DATA1_DFLT_VLAN_CHANGE          0x8UL
+	#define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_EVENT_DATA1_TRUSTED_VF_CFG_CHANGE     0x10UL
+	#define ASYNC_EVENT_CMPL_VF_CFG_CHANGE_EVENT_DATA1_TF_OWNERSHIP_RELEASE      0x20UL
+};
+
+/* hwrm_async_event_cmpl_default_vnic_change (size:128b/16B): event_id 0x35 (ALLOC_FREE_NOTIFICATION); event_data1 packs def_vnic_state (bits 0-1: 0x1 = alloc, 0x2 = free), pf_id (bits 2-9) and vf_id (bits 10-25) */
+struct hwrm_async_event_cmpl_default_vnic_change {
+	__le16	type;
+	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_TYPE_MASK            0x3fUL
+	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_TYPE_SFT             0
+	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_TYPE_HWRM_ASYNC_EVENT  0x2eUL
+	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_TYPE_LAST             ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_TYPE_HWRM_ASYNC_EVENT
+	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_UNUSED1_MASK         0xffc0UL
+	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_UNUSED1_SFT          6
+	__le16	event_id;
+	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_EVENT_ID_ALLOC_FREE_NOTIFICATION 0x35UL
+	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_EVENT_ID_LAST                   ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_EVENT_ID_ALLOC_FREE_NOTIFICATION
+	__le32	event_data2;
+	u8	opaque_v;
+	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_V          0x1UL
+	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_OPAQUE_MASK 0xfeUL
+	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_OPAQUE_SFT 1
+	u8	timestamp_lo;
+	__le16	timestamp_hi;
+	__le32	event_data1;
+	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_EVENT_DATA1_DEF_VNIC_STATE_MASK          0x3UL
+	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_EVENT_DATA1_DEF_VNIC_STATE_SFT           0
+	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_EVENT_DATA1_DEF_VNIC_STATE_DEF_VNIC_ALLOC  0x1UL
+	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_EVENT_DATA1_DEF_VNIC_STATE_DEF_VNIC_FREE   0x2UL
+	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_EVENT_DATA1_DEF_VNIC_STATE_LAST           ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_EVENT_DATA1_DEF_VNIC_STATE_DEF_VNIC_FREE
+	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_EVENT_DATA1_PF_ID_MASK                   0x3fcUL
+	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_EVENT_DATA1_PF_ID_SFT                    2
+	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_EVENT_DATA1_VF_ID_MASK                   0x3fffc00UL
+	#define ASYNC_EVENT_CMPL_DEFAULT_VNIC_CHANGE_EVENT_DATA1_VF_ID_SFT                    10
+};
+
+/* hwrm_async_event_cmpl_hw_flow_aged (size:128b/16B): event_id 0x36; event_data1 packs flow_id (bits 0-30) and flow direction (bit 31: 0 = RX, 1 = TX) */
+struct hwrm_async_event_cmpl_hw_flow_aged {
+	__le16	type;
+	#define ASYNC_EVENT_CMPL_HW_FLOW_AGED_TYPE_MASK            0x3fUL
+	#define ASYNC_EVENT_CMPL_HW_FLOW_AGED_TYPE_SFT             0
+	#define ASYNC_EVENT_CMPL_HW_FLOW_AGED_TYPE_HWRM_ASYNC_EVENT  0x2eUL
+	#define ASYNC_EVENT_CMPL_HW_FLOW_AGED_TYPE_LAST             ASYNC_EVENT_CMPL_HW_FLOW_AGED_TYPE_HWRM_ASYNC_EVENT
+	__le16	event_id;
+	#define ASYNC_EVENT_CMPL_HW_FLOW_AGED_EVENT_ID_HW_FLOW_AGED 0x36UL
+	#define ASYNC_EVENT_CMPL_HW_FLOW_AGED_EVENT_ID_LAST        ASYNC_EVENT_CMPL_HW_FLOW_AGED_EVENT_ID_HW_FLOW_AGED
+	__le32	event_data2;
+	u8	opaque_v;
+	#define ASYNC_EVENT_CMPL_HW_FLOW_AGED_V          0x1UL
+	#define ASYNC_EVENT_CMPL_HW_FLOW_AGED_OPAQUE_MASK 0xfeUL
+	#define ASYNC_EVENT_CMPL_HW_FLOW_AGED_OPAQUE_SFT 1
+	u8	timestamp_lo;
+	__le16	timestamp_hi;
+	__le32	event_data1;
+	#define ASYNC_EVENT_CMPL_HW_FLOW_AGED_EVENT_DATA1_FLOW_ID_MASK       0x7fffffffUL
+	#define ASYNC_EVENT_CMPL_HW_FLOW_AGED_EVENT_DATA1_FLOW_ID_SFT        0
+	#define ASYNC_EVENT_CMPL_HW_FLOW_AGED_EVENT_DATA1_FLOW_DIRECTION     0x80000000UL
+	#define ASYNC_EVENT_CMPL_HW_FLOW_AGED_EVENT_DATA1_FLOW_DIRECTION_RX    (0x0UL << 31)
+	#define ASYNC_EVENT_CMPL_HW_FLOW_AGED_EVENT_DATA1_FLOW_DIRECTION_TX    (0x1UL << 31)
+	#define ASYNC_EVENT_CMPL_HW_FLOW_AGED_EVENT_DATA1_FLOW_DIRECTION_LAST ASYNC_EVENT_CMPL_HW_FLOW_AGED_EVENT_DATA1_FLOW_DIRECTION_TX
+};
+
+/* hwrm_async_event_cmpl_eem_cache_flush_req (size:128b/16B): event_id 0x38; no sub-field macros are defined here for event_data1 or event_data2 */
+struct hwrm_async_event_cmpl_eem_cache_flush_req {
+	__le16	type;
+	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_REQ_TYPE_MASK            0x3fUL
+	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_REQ_TYPE_SFT             0
+	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_REQ_TYPE_HWRM_ASYNC_EVENT  0x2eUL
+	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_REQ_TYPE_LAST             ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_REQ_TYPE_HWRM_ASYNC_EVENT
+	__le16	event_id;
+	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_REQ_EVENT_ID_EEM_CACHE_FLUSH_REQ 0x38UL
+	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_REQ_EVENT_ID_LAST               ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_REQ_EVENT_ID_EEM_CACHE_FLUSH_REQ
+	__le32	event_data2;
+	u8	opaque_v;
+	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_REQ_V          0x1UL
+	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_REQ_OPAQUE_MASK 0xfeUL
+	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_REQ_OPAQUE_SFT 1
+	u8	timestamp_lo;
+	__le16	timestamp_hi;
+	__le32	event_data1;
+};
+
+/* hwrm_async_event_cmpl_eem_cache_flush_done (size:128b/16B): event_id 0x39; event_data1 bits 0-15 hold fid */
+struct hwrm_async_event_cmpl_eem_cache_flush_done {
+	__le16	type;
+	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_DONE_TYPE_MASK            0x3fUL
+	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_DONE_TYPE_SFT             0
+	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_DONE_TYPE_HWRM_ASYNC_EVENT  0x2eUL
+	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_DONE_TYPE_LAST             ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_DONE_TYPE_HWRM_ASYNC_EVENT
+	__le16	event_id;
+	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_DONE_EVENT_ID_EEM_CACHE_FLUSH_DONE 0x39UL
+	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_DONE_EVENT_ID_LAST                ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_DONE_EVENT_ID_EEM_CACHE_FLUSH_DONE
+	__le32	event_data2;
+	u8	opaque_v;
+	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_DONE_V          0x1UL
+	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_DONE_OPAQUE_MASK 0xfeUL
+	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_DONE_OPAQUE_SFT 1
+	u8	timestamp_lo;
+	__le16	timestamp_hi;
+	__le32	event_data1;
+	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_DONE_EVENT_DATA1_FID_MASK 0xffffUL
+	#define ASYNC_EVENT_CMPL_EEM_CACHE_FLUSH_DONE_EVENT_DATA1_FID_SFT 0
+};
+
+/* hwrm_async_event_cmpl_deferred_response (size:128b/16B): event_id 0x40; event_data2 bits 0-15 hold seq_id */
+struct hwrm_async_event_cmpl_deferred_response {
+	__le16	type;
+	#define ASYNC_EVENT_CMPL_DEFERRED_RESPONSE_TYPE_MASK            0x3fUL
+	#define ASYNC_EVENT_CMPL_DEFERRED_RESPONSE_TYPE_SFT             0
+	#define ASYNC_EVENT_CMPL_DEFERRED_RESPONSE_TYPE_HWRM_ASYNC_EVENT  0x2eUL
+	#define ASYNC_EVENT_CMPL_DEFERRED_RESPONSE_TYPE_LAST             ASYNC_EVENT_CMPL_DEFERRED_RESPONSE_TYPE_HWRM_ASYNC_EVENT
+	__le16	event_id;
+	#define ASYNC_EVENT_CMPL_DEFERRED_RESPONSE_EVENT_ID_DEFERRED_RESPONSE 0x40UL
+	#define ASYNC_EVENT_CMPL_DEFERRED_RESPONSE_EVENT_ID_LAST             ASYNC_EVENT_CMPL_DEFERRED_RESPONSE_EVENT_ID_DEFERRED_RESPONSE
+	__le32	event_data2;
+	#define ASYNC_EVENT_CMPL_DEFERRED_RESPONSE_EVENT_DATA2_SEQ_ID_MASK 0xffffUL
+	#define ASYNC_EVENT_CMPL_DEFERRED_RESPONSE_EVENT_DATA2_SEQ_ID_SFT 0
+	u8	opaque_v;
+	#define ASYNC_EVENT_CMPL_DEFERRED_RESPONSE_V          0x1UL
+	#define ASYNC_EVENT_CMPL_DEFERRED_RESPONSE_OPAQUE_MASK 0xfeUL
+	#define ASYNC_EVENT_CMPL_DEFERRED_RESPONSE_OPAQUE_SFT 1
+	u8	timestamp_lo;
+	__le16	timestamp_hi;
+	__le32	event_data1;
+};
+
+/* hwrm_async_event_cmpl_echo_request (size:128b/16B): event_id 0x42; no sub-field macros are defined here for event_data1 or event_data2 */
+struct hwrm_async_event_cmpl_echo_request {
+	__le16	type;
+	#define ASYNC_EVENT_CMPL_ECHO_REQUEST_TYPE_MASK            0x3fUL
+	#define ASYNC_EVENT_CMPL_ECHO_REQUEST_TYPE_SFT             0
+	#define ASYNC_EVENT_CMPL_ECHO_REQUEST_TYPE_HWRM_ASYNC_EVENT  0x2eUL
+	#define ASYNC_EVENT_CMPL_ECHO_REQUEST_TYPE_LAST             ASYNC_EVENT_CMPL_ECHO_REQUEST_TYPE_HWRM_ASYNC_EVENT
+	__le16	event_id;
+	#define ASYNC_EVENT_CMPL_ECHO_REQUEST_EVENT_ID_ECHO_REQUEST 0x42UL
+	#define ASYNC_EVENT_CMPL_ECHO_REQUEST_EVENT_ID_LAST        ASYNC_EVENT_CMPL_ECHO_REQUEST_EVENT_ID_ECHO_REQUEST
+	__le32	event_data2;
+	u8	opaque_v;
+	#define ASYNC_EVENT_CMPL_ECHO_REQUEST_V          0x1UL
+	#define ASYNC_EVENT_CMPL_ECHO_REQUEST_OPAQUE_MASK 0xfeUL
+	#define ASYNC_EVENT_CMPL_ECHO_REQUEST_OPAQUE_SFT 1
+	u8	timestamp_lo;
+	__le16	timestamp_hi;
+	__le32	event_data1;
+};
+
+/* hwrm_async_event_cmpl_phc_update (size:128b/16B): async event completion record; the only event_id defined for it is PHC_UPDATE (0x43). */
+struct hwrm_async_event_cmpl_phc_update {
+	__le16	type; /* type code sits in bits 5:0 (TYPE_MASK); HWRM_ASYNC_EVENT (0x2e) is the only value defined */
+	#define ASYNC_EVENT_CMPL_PHC_UPDATE_TYPE_MASK            0x3fUL
+	#define ASYNC_EVENT_CMPL_PHC_UPDATE_TYPE_SFT             0
+	#define ASYNC_EVENT_CMPL_PHC_UPDATE_TYPE_HWRM_ASYNC_EVENT  0x2eUL
+	#define ASYNC_EVENT_CMPL_PHC_UPDATE_TYPE_LAST             ASYNC_EVENT_CMPL_PHC_UPDATE_TYPE_HWRM_ASYNC_EVENT
+	__le16	event_id; /* PHC_UPDATE (0x43) is the only value defined */
+	#define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_ID_PHC_UPDATE 0x43UL
+	#define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_ID_LAST      ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_ID_PHC_UPDATE
+	__le32	event_data2; /* bits 15:0: PHC_MASTER_FID; bits 31:16: PHC_SEC_FID */
+	#define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA2_PHC_MASTER_FID_MASK 0xffffUL
+	#define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA2_PHC_MASTER_FID_SFT 0
+	#define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA2_PHC_SEC_FID_MASK   0xffff0000UL
+	#define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA2_PHC_SEC_FID_SFT    16
+	u8	opaque_v; /* bit 0: V; bits 7:1: OPAQUE (OPAQUE_MASK 0xfe, OPAQUE_SFT 1) */
+	#define ASYNC_EVENT_CMPL_PHC_UPDATE_V          0x1UL
+	#define ASYNC_EVENT_CMPL_PHC_UPDATE_OPAQUE_MASK 0xfeUL
+	#define ASYNC_EVENT_CMPL_PHC_UPDATE_OPAQUE_SFT 1
+	u8	timestamp_lo;
+	__le16	timestamp_hi;
+	__le32	event_data1; /* bits 3:0: FLAGS, an enumerated code (0x1..0x4 defined), not independent bits; bits 19:4: PHC_TIME_MSB */
+	#define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_FLAGS_MASK          0xfUL
+	#define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_FLAGS_SFT           0
+	#define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_FLAGS_PHC_MASTER      0x1UL
+	#define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_FLAGS_PHC_SECONDARY   0x2UL
+	#define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_FLAGS_PHC_FAILOVER    0x3UL
+	#define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_FLAGS_PHC_RTC_UPDATE  0x4UL
+	#define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_FLAGS_LAST           ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_FLAGS_PHC_RTC_UPDATE
+	#define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_PHC_TIME_MSB_MASK   0xffff0UL
+	#define ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_PHC_TIME_MSB_SFT    4
+};
+
+/* hwrm_async_event_cmpl_pps_timestamp (size:128b/16B): async event completion record; the only event_id defined for it is PPS_TIMESTAMP (0x44). */
+struct hwrm_async_event_cmpl_pps_timestamp {
+	__le16	type; /* type code sits in bits 5:0 (TYPE_MASK); HWRM_ASYNC_EVENT (0x2e) is the only value defined */
+	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_TYPE_MASK            0x3fUL
+	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_TYPE_SFT             0
+	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_TYPE_HWRM_ASYNC_EVENT  0x2eUL
+	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_TYPE_LAST             ASYNC_EVENT_CMPL_PPS_TIMESTAMP_TYPE_HWRM_ASYNC_EVENT
+	__le16	event_id; /* PPS_TIMESTAMP (0x44) is the only value defined */
+	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_EVENT_ID_PPS_TIMESTAMP 0x44UL
+	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_EVENT_ID_LAST         ASYNC_EVENT_CMPL_PPS_TIMESTAMP_EVENT_ID_PPS_TIMESTAMP
+	__le32	event_data2; /* bit 0: EVENT_TYPE (INTERNAL=0, EXTERNAL=1); bits 3:1: PIN_NUMBER; bits 19:4: PPS_TIMESTAMP_UPPER */
+	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_EVENT_DATA2_EVENT_TYPE              0x1UL
+	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_EVENT_DATA2_EVENT_TYPE_INTERNAL       0x0UL
+	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_EVENT_DATA2_EVENT_TYPE_EXTERNAL       0x1UL
+	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_EVENT_DATA2_EVENT_TYPE_LAST          ASYNC_EVENT_CMPL_PPS_TIMESTAMP_EVENT_DATA2_EVENT_TYPE_EXTERNAL
+	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_EVENT_DATA2_PIN_NUMBER_MASK         0xeUL
+	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_EVENT_DATA2_PIN_NUMBER_SFT          1
+	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_EVENT_DATA2_PPS_TIMESTAMP_UPPER_MASK 0xffff0UL
+	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_EVENT_DATA2_PPS_TIMESTAMP_UPPER_SFT 4
+	u8	opaque_v; /* bit 0: V; bits 7:1: OPAQUE (OPAQUE_MASK 0xfe, OPAQUE_SFT 1) */
+	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_V          0x1UL
+	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_OPAQUE_MASK 0xfeUL
+	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_OPAQUE_SFT 1
+	u8	timestamp_lo;
+	__le16	timestamp_hi;
+	__le32	event_data1; /* bits 31:0 (whole word): PPS_TIMESTAMP_LOWER */
+	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_EVENT_DATA1_PPS_TIMESTAMP_LOWER_MASK 0xffffffffUL
+	#define ASYNC_EVENT_CMPL_PPS_TIMESTAMP_EVENT_DATA1_PPS_TIMESTAMP_LOWER_SFT 0
+};
+
+/* hwrm_async_event_cmpl_error_report (size:128b/16B): async event completion record; the only event_id defined for it is ERROR_REPORT (0x45). */
+struct hwrm_async_event_cmpl_error_report {
+	__le16	type; /* type code sits in bits 5:0 (TYPE_MASK); HWRM_ASYNC_EVENT (0x2e) is the only value defined */
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_TYPE_MASK            0x3fUL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_TYPE_SFT             0
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_TYPE_HWRM_ASYNC_EVENT  0x2eUL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_TYPE_LAST             ASYNC_EVENT_CMPL_ERROR_REPORT_TYPE_HWRM_ASYNC_EVENT
+	__le16	event_id; /* ERROR_REPORT (0x45) is the only value defined */
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_EVENT_ID_ERROR_REPORT 0x45UL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_EVENT_ID_LAST        ASYNC_EVENT_CMPL_ERROR_REPORT_EVENT_ID_ERROR_REPORT
+	__le32	event_data2; /* no sub-fields are defined for this word in this struct */
+	u8	opaque_v; /* bit 0: V; bits 7:1: OPAQUE (OPAQUE_MASK 0xfe, OPAQUE_SFT 1) */
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_V          0x1UL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_OPAQUE_MASK 0xfeUL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_OPAQUE_SFT 1
+	u8	timestamp_lo;
+	__le16	timestamp_hi;
+	__le32	event_data1; /* bits 7:0: ERROR_TYPE; no ERROR_TYPE values are enumerated in this struct */
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_EVENT_DATA1_ERROR_TYPE_MASK 0xffUL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_EVENT_DATA1_ERROR_TYPE_SFT 0
+};
+
+/* hwrm_async_event_cmpl_dbg_buf_producer (size:128b/16B): async event completion record; the only event_id defined for it is DBG_BUF_PRODUCER (0x4c). */
+struct hwrm_async_event_cmpl_dbg_buf_producer {
+	__le16	type; /* type code sits in bits 5:0 (TYPE_MASK); HWRM_ASYNC_EVENT (0x2e) is the only value defined */
+	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_TYPE_MASK            0x3fUL
+	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_TYPE_SFT             0
+	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_TYPE_HWRM_ASYNC_EVENT  0x2eUL
+	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_TYPE_LAST             ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_TYPE_HWRM_ASYNC_EVENT
+	__le16	event_id; /* DBG_BUF_PRODUCER (0x4c) is the only value defined */
+	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_ID_DBG_BUF_PRODUCER 0x4cUL
+	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_ID_LAST            ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_ID_DBG_BUF_PRODUCER
+	__le32	event_data2; /* bits 31:0 (whole word): CURR_OFF; NOTE(review): presumably the producer's current offset in the buffer - confirm */
+	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA2_CURR_OFF_MASK 0xffffffffUL
+	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA2_CURR_OFF_SFT 0
+	u8	opaque_v; /* bit 0: V; bits 7:1: OPAQUE (OPAQUE_MASK 0xfe, OPAQUE_SFT 1) */
+	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_V          0x1UL
+	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_OPAQUE_MASK 0xfeUL
+	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_OPAQUE_SFT 1
+	u8	timestamp_lo;
+	__le16	timestamp_hi;
+	__le32	event_data1; /* bits 15:0: TYPE, an enumerated code from SRT_TRACE (0x0) through AFM_KONG_HWRM_TRACE (0xb) */
+	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_MASK               0xffffUL
+	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_SFT                0
+	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_SRT_TRACE            0x0UL
+	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_SRT2_TRACE           0x1UL
+	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_CRT_TRACE            0x2UL
+	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_CRT2_TRACE           0x3UL
+	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_RIGP0_TRACE          0x4UL
+	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_L2_HWRM_TRACE        0x5UL
+	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_ROCE_HWRM_TRACE      0x6UL
+	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_CA0_TRACE            0x7UL
+	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_CA1_TRACE            0x8UL
+	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_CA2_TRACE            0x9UL
+	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_RIGP1_TRACE          0xaUL
+	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_AFM_KONG_HWRM_TRACE  0xbUL
+	#define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_LAST                ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_AFM_KONG_HWRM_TRACE
+};
+
+/* hwrm_async_event_cmpl_hwrm_error (size:128b/16B): async event completion record; the only event_id defined for it is HWRM_ERROR (0xff). */
+struct hwrm_async_event_cmpl_hwrm_error {
+	__le16	type; /* type code sits in bits 5:0 (TYPE_MASK); HWRM_ASYNC_EVENT (0x2e) is the only value defined */
+	#define ASYNC_EVENT_CMPL_HWRM_ERROR_TYPE_MASK            0x3fUL
+	#define ASYNC_EVENT_CMPL_HWRM_ERROR_TYPE_SFT             0
+	#define ASYNC_EVENT_CMPL_HWRM_ERROR_TYPE_HWRM_ASYNC_EVENT  0x2eUL
+	#define ASYNC_EVENT_CMPL_HWRM_ERROR_TYPE_LAST             ASYNC_EVENT_CMPL_HWRM_ERROR_TYPE_HWRM_ASYNC_EVENT
+	__le16	event_id; /* HWRM_ERROR (0xff) is the only value defined */
+	#define ASYNC_EVENT_CMPL_HWRM_ERROR_EVENT_ID_HWRM_ERROR 0xffUL
+	#define ASYNC_EVENT_CMPL_HWRM_ERROR_EVENT_ID_LAST      ASYNC_EVENT_CMPL_HWRM_ERROR_EVENT_ID_HWRM_ERROR
+	__le32	event_data2; /* bits 7:0: SEVERITY (WARNING=0, NONFATAL=1, FATAL=2) */
+	#define ASYNC_EVENT_CMPL_HWRM_ERROR_EVENT_DATA2_SEVERITY_MASK    0xffUL
+	#define ASYNC_EVENT_CMPL_HWRM_ERROR_EVENT_DATA2_SEVERITY_SFT     0
+	#define ASYNC_EVENT_CMPL_HWRM_ERROR_EVENT_DATA2_SEVERITY_WARNING   0x0UL
+	#define ASYNC_EVENT_CMPL_HWRM_ERROR_EVENT_DATA2_SEVERITY_NONFATAL  0x1UL
+	#define ASYNC_EVENT_CMPL_HWRM_ERROR_EVENT_DATA2_SEVERITY_FATAL     0x2UL
+	#define ASYNC_EVENT_CMPL_HWRM_ERROR_EVENT_DATA2_SEVERITY_LAST     ASYNC_EVENT_CMPL_HWRM_ERROR_EVENT_DATA2_SEVERITY_FATAL
+	u8	opaque_v; /* bit 0: V; bits 7:1: OPAQUE (OPAQUE_MASK 0xfe, OPAQUE_SFT 1) */
+	#define ASYNC_EVENT_CMPL_HWRM_ERROR_V          0x1UL
+	#define ASYNC_EVENT_CMPL_HWRM_ERROR_OPAQUE_MASK 0xfeUL
+	#define ASYNC_EVENT_CMPL_HWRM_ERROR_OPAQUE_SFT 1
+	u8	timestamp_lo;
+	__le16	timestamp_hi;
+	__le32	event_data1; /* bit 0: TIMESTAMP flag; NOTE(review): exact meaning is not visible here - confirm against the HWRM spec */
+	#define ASYNC_EVENT_CMPL_HWRM_ERROR_EVENT_DATA1_TIMESTAMP     0x1UL
+};
+
+/* hwrm_async_event_cmpl_error_report_base (size:128b/16B): ERROR_REPORT (0x45) completion record that enumerates the ERROR_TYPE codes 0x0..0x6. */
+struct hwrm_async_event_cmpl_error_report_base {
+	__le16	type; /* type code sits in bits 5:0 (TYPE_MASK); HWRM_ASYNC_EVENT (0x2e) is the only value defined */
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_TYPE_MASK            0x3fUL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_TYPE_SFT             0
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_TYPE_HWRM_ASYNC_EVENT  0x2eUL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_TYPE_LAST             ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_TYPE_HWRM_ASYNC_EVENT
+	__le16	event_id; /* ERROR_REPORT (0x45) is the only value defined */
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_ID_ERROR_REPORT 0x45UL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_ID_LAST        ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_ID_ERROR_REPORT
+	__le32	event_data2; /* no sub-fields are defined for this word in this struct */
+	u8	opaque_v; /* bit 0: V; bits 7:1: OPAQUE (OPAQUE_MASK 0xfe, OPAQUE_SFT 1) */
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_V          0x1UL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_OPAQUE_MASK 0xfeUL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_OPAQUE_SFT 1
+	u8	timestamp_lo;
+	__le16	timestamp_hi;
+	__le32	event_data1; /* bits 7:0: ERROR_TYPE, an enumerated code from RESERVED (0x0) through DUAL_DATA_RATE_NOT_SUPPORTED (0x6) */
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_MASK                        0xffUL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_SFT                         0
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_RESERVED                      0x0UL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_PAUSE_STORM                   0x1UL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_INVALID_SIGNAL                0x2UL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_NVM                           0x3UL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_DOORBELL_DROP_THRESHOLD       0x4UL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_THERMAL_THRESHOLD             0x5UL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_DUAL_DATA_RATE_NOT_SUPPORTED  0x6UL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_LAST                         ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_DUAL_DATA_RATE_NOT_SUPPORTED
+};
+
+/* hwrm_async_event_cmpl_error_report_pause_storm (size:128b/16B): ERROR_REPORT (0x45) completion record whose only defined ERROR_TYPE is PAUSE_STORM (0x1). */
+struct hwrm_async_event_cmpl_error_report_pause_storm {
+	__le16	type; /* type code sits in bits 5:0 (TYPE_MASK); HWRM_ASYNC_EVENT (0x2e) is the only value defined */
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_PAUSE_STORM_TYPE_MASK            0x3fUL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_PAUSE_STORM_TYPE_SFT             0
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_PAUSE_STORM_TYPE_HWRM_ASYNC_EVENT  0x2eUL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_PAUSE_STORM_TYPE_LAST             ASYNC_EVENT_CMPL_ERROR_REPORT_PAUSE_STORM_TYPE_HWRM_ASYNC_EVENT
+	__le16	event_id; /* ERROR_REPORT (0x45) is the only value defined */
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_PAUSE_STORM_EVENT_ID_ERROR_REPORT 0x45UL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_PAUSE_STORM_EVENT_ID_LAST        ASYNC_EVENT_CMPL_ERROR_REPORT_PAUSE_STORM_EVENT_ID_ERROR_REPORT
+	__le32	event_data2; /* no sub-fields are defined for this word in this struct */
+	u8	opaque_v; /* bit 0: V; bits 7:1: OPAQUE (OPAQUE_MASK 0xfe, OPAQUE_SFT 1) */
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_PAUSE_STORM_V          0x1UL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_PAUSE_STORM_OPAQUE_MASK 0xfeUL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_PAUSE_STORM_OPAQUE_SFT 1
+	u8	timestamp_lo;
+	__le16	timestamp_hi;
+	__le32	event_data1; /* bits 7:0: ERROR_TYPE; PAUSE_STORM (0x1) is the only value defined */
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_PAUSE_STORM_EVENT_DATA1_ERROR_TYPE_MASK       0xffUL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_PAUSE_STORM_EVENT_DATA1_ERROR_TYPE_SFT        0
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_PAUSE_STORM_EVENT_DATA1_ERROR_TYPE_PAUSE_STORM  0x1UL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_PAUSE_STORM_EVENT_DATA1_ERROR_TYPE_LAST        ASYNC_EVENT_CMPL_ERROR_REPORT_PAUSE_STORM_EVENT_DATA1_ERROR_TYPE_PAUSE_STORM
+};
+
+/* hwrm_async_event_cmpl_error_report_invalid_signal (size:128b/16B): ERROR_REPORT (0x45) completion record whose only defined ERROR_TYPE is INVALID_SIGNAL (0x2). */
+struct hwrm_async_event_cmpl_error_report_invalid_signal {
+	__le16	type; /* type code sits in bits 5:0 (TYPE_MASK); HWRM_ASYNC_EVENT (0x2e) is the only value defined */
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_TYPE_MASK            0x3fUL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_TYPE_SFT             0
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_TYPE_HWRM_ASYNC_EVENT  0x2eUL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_TYPE_LAST             ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_TYPE_HWRM_ASYNC_EVENT
+	__le16	event_id; /* ERROR_REPORT (0x45) is the only value defined */
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_EVENT_ID_ERROR_REPORT 0x45UL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_EVENT_ID_LAST        ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_EVENT_ID_ERROR_REPORT
+	__le32	event_data2; /* bits 7:0: PIN_ID */
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_EVENT_DATA2_PIN_ID_MASK 0xffUL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_EVENT_DATA2_PIN_ID_SFT 0
+	u8	opaque_v; /* bit 0: V; bits 7:1: OPAQUE (OPAQUE_MASK 0xfe, OPAQUE_SFT 1) */
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_V          0x1UL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_OPAQUE_MASK 0xfeUL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_OPAQUE_SFT 1
+	u8	timestamp_lo;
+	__le16	timestamp_hi;
+	__le32	event_data1; /* bits 7:0: ERROR_TYPE; INVALID_SIGNAL (0x2) is the only value defined */
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_EVENT_DATA1_ERROR_TYPE_MASK          0xffUL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_EVENT_DATA1_ERROR_TYPE_SFT           0
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_EVENT_DATA1_ERROR_TYPE_INVALID_SIGNAL  0x2UL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_EVENT_DATA1_ERROR_TYPE_LAST           ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_EVENT_DATA1_ERROR_TYPE_INVALID_SIGNAL
+};
+
+/* hwrm_async_event_cmpl_error_report_nvm (size:128b/16B): ERROR_REPORT (0x45) completion record whose only defined ERROR_TYPE is NVM_ERROR (0x3). */
+struct hwrm_async_event_cmpl_error_report_nvm {
+	__le16	type; /* type code sits in bits 5:0 (TYPE_MASK); HWRM_ASYNC_EVENT (0x2e) is the only value defined */
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_TYPE_MASK            0x3fUL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_TYPE_SFT             0
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_TYPE_HWRM_ASYNC_EVENT  0x2eUL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_TYPE_LAST             ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_TYPE_HWRM_ASYNC_EVENT
+	__le16	event_id; /* ERROR_REPORT (0x45) is the only value defined */
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_ID_ERROR_REPORT 0x45UL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_ID_LAST        ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_ID_ERROR_REPORT
+	__le32	event_data2; /* bits 31:0 (whole word): ERR_ADDR */
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA2_ERR_ADDR_MASK 0xffffffffUL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA2_ERR_ADDR_SFT 0
+	u8	opaque_v; /* bit 0: V; bits 7:1: OPAQUE (OPAQUE_MASK 0xfe, OPAQUE_SFT 1) */
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_V          0x1UL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_OPAQUE_MASK 0xfeUL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_OPAQUE_SFT 1
+	u8	timestamp_lo;
+	__le16	timestamp_hi;
+	__le32	event_data1; /* bits 7:0: ERROR_TYPE (NVM_ERROR=0x3); bits 15:8: NVM_ERR_TYPE (WRITE, ERASE), whose constants are already shifted into place */
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_ERROR_TYPE_MASK     0xffUL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_ERROR_TYPE_SFT      0
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_ERROR_TYPE_NVM_ERROR  0x3UL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_ERROR_TYPE_LAST      ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_ERROR_TYPE_NVM_ERROR
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_NVM_ERR_TYPE_MASK   0xff00UL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_NVM_ERR_TYPE_SFT    8
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_NVM_ERR_TYPE_WRITE    (0x1UL << 8)
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_NVM_ERR_TYPE_ERASE    (0x2UL << 8)
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_NVM_ERR_TYPE_LAST    ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_NVM_ERR_TYPE_ERASE
+};
+
+/* hwrm_async_event_cmpl_error_report_doorbell_drop_threshold (size:128b/16B): ERROR_REPORT (0x45) completion record whose only defined ERROR_TYPE is DOORBELL_DROP_THRESHOLD (0x4). */
+struct hwrm_async_event_cmpl_error_report_doorbell_drop_threshold {
+	__le16	type; /* type code sits in bits 5:0 (TYPE_MASK); HWRM_ASYNC_EVENT (0x2e) is the only value defined */
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_TYPE_MASK            0x3fUL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_TYPE_SFT             0
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_TYPE_HWRM_ASYNC_EVENT  0x2eUL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_TYPE_LAST             ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_TYPE_HWRM_ASYNC_EVENT
+	__le16	event_id; /* ERROR_REPORT (0x45) is the only value defined */
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_EVENT_ID_ERROR_REPORT 0x45UL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_EVENT_ID_LAST        ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_EVENT_ID_ERROR_REPORT
+	__le32	event_data2; /* no sub-fields are defined for this word in this struct */
+	u8	opaque_v; /* bit 0: V; bits 7:1: OPAQUE (OPAQUE_MASK 0xfe, OPAQUE_SFT 1) */
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_V          0x1UL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_OPAQUE_MASK 0xfeUL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_OPAQUE_SFT 1
+	u8	timestamp_lo;
+	__le16	timestamp_hi;
+	__le32	event_data1; /* bits 7:0: ERROR_TYPE (DOORBELL_DROP_THRESHOLD=0x4); bits 31:8: EPOCH */
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_EVENT_DATA1_ERROR_TYPE_MASK                   0xffUL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_EVENT_DATA1_ERROR_TYPE_SFT                    0
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_EVENT_DATA1_ERROR_TYPE_DOORBELL_DROP_THRESHOLD  0x4UL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_EVENT_DATA1_ERROR_TYPE_LAST                    ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_EVENT_DATA1_ERROR_TYPE_DOORBELL_DROP_THRESHOLD
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_EVENT_DATA1_EPOCH_MASK                        0xffffff00UL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DOORBELL_DROP_THRESHOLD_EVENT_DATA1_EPOCH_SFT                         8
+};
+
+/* hwrm_async_event_cmpl_error_report_thermal (size:128b/16B): ERROR_REPORT (0x45) completion record whose only defined ERROR_TYPE is THERMAL_EVENT (0x5). */
+struct hwrm_async_event_cmpl_error_report_thermal {
+	__le16	type; /* type code sits in bits 5:0 (TYPE_MASK); HWRM_ASYNC_EVENT (0x2e) is the only value defined */
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_TYPE_MASK            0x3fUL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_TYPE_SFT             0
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_TYPE_HWRM_ASYNC_EVENT  0x2eUL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_TYPE_LAST             ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_TYPE_HWRM_ASYNC_EVENT
+	__le16	event_id; /* ERROR_REPORT (0x45) is the only value defined */
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_ID_ERROR_REPORT 0x45UL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_ID_LAST        ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_ID_ERROR_REPORT
+	__le32	event_data2; /* bits 7:0: CURRENT_TEMP; bits 15:8: THRESHOLD_TEMP; NOTE(review): temperature units are not visible here - confirm against the HWRM spec */
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA2_CURRENT_TEMP_MASK  0xffUL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA2_CURRENT_TEMP_SFT   0
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA2_THRESHOLD_TEMP_MASK 0xff00UL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA2_THRESHOLD_TEMP_SFT 8
+	u8	opaque_v; /* bit 0: V; bits 7:1: OPAQUE (OPAQUE_MASK 0xfe, OPAQUE_SFT 1) */
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_V          0x1UL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_OPAQUE_MASK 0xfeUL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_OPAQUE_SFT 1
+	u8	timestamp_lo;
+	__le16	timestamp_hi;
+	__le32	event_data1; /* bits 7:0: ERROR_TYPE (THERMAL_EVENT=0x5); bits 10:8: THRESHOLD_TYPE; bit 11: TRANSITION_DIR; the THRESHOLD_TYPE_* and TRANSITION_DIR_* constants are already shifted into place */
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA1_ERROR_TYPE_MASK          0xffUL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA1_ERROR_TYPE_SFT           0
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA1_ERROR_TYPE_THERMAL_EVENT   0x5UL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA1_ERROR_TYPE_LAST           ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA1_ERROR_TYPE_THERMAL_EVENT
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA1_THRESHOLD_TYPE_MASK      0x700UL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA1_THRESHOLD_TYPE_SFT       8
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA1_THRESHOLD_TYPE_WARN        (0x0UL << 8)
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA1_THRESHOLD_TYPE_CRITICAL    (0x1UL << 8)
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA1_THRESHOLD_TYPE_FATAL       (0x2UL << 8)
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA1_THRESHOLD_TYPE_SHUTDOWN    (0x3UL << 8)
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA1_THRESHOLD_TYPE_LAST       ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA1_THRESHOLD_TYPE_SHUTDOWN
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA1_TRANSITION_DIR           0x800UL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA1_TRANSITION_DIR_DECREASING  (0x0UL << 11)
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA1_TRANSITION_DIR_INCREASING  (0x1UL << 11)
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA1_TRANSITION_DIR_LAST       ASYNC_EVENT_CMPL_ERROR_REPORT_THERMAL_EVENT_DATA1_TRANSITION_DIR_INCREASING
+};
+
+/* hwrm_async_event_cmpl_error_report_dual_data_rate_not_supported (size:128b/16B): ERROR_REPORT (0x45) completion record whose only defined ERROR_TYPE is DUAL_DATA_RATE_NOT_SUPPORTED (0x6). */
+struct hwrm_async_event_cmpl_error_report_dual_data_rate_not_supported {
+	__le16	type; /* type code sits in bits 5:0 (TYPE_MASK); HWRM_ASYNC_EVENT (0x2e) is the only value defined */
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DUAL_DATA_RATE_NOT_SUPPORTED_TYPE_MASK            0x3fUL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DUAL_DATA_RATE_NOT_SUPPORTED_TYPE_SFT             0
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DUAL_DATA_RATE_NOT_SUPPORTED_TYPE_HWRM_ASYNC_EVENT  0x2eUL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DUAL_DATA_RATE_NOT_SUPPORTED_TYPE_LAST             ASYNC_EVENT_CMPL_ERROR_REPORT_DUAL_DATA_RATE_NOT_SUPPORTED_TYPE_HWRM_ASYNC_EVENT
+	__le16	event_id; /* ERROR_REPORT (0x45) is the only value defined */
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DUAL_DATA_RATE_NOT_SUPPORTED_EVENT_ID_ERROR_REPORT 0x45UL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DUAL_DATA_RATE_NOT_SUPPORTED_EVENT_ID_LAST        ASYNC_EVENT_CMPL_ERROR_REPORT_DUAL_DATA_RATE_NOT_SUPPORTED_EVENT_ID_ERROR_REPORT
+	__le32	event_data2; /* no sub-fields are defined for this word in this struct */
+	u8	opaque_v; /* bit 0: V; bits 7:1: OPAQUE (OPAQUE_MASK 0xfe, OPAQUE_SFT 1) */
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DUAL_DATA_RATE_NOT_SUPPORTED_V          0x1UL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DUAL_DATA_RATE_NOT_SUPPORTED_OPAQUE_MASK 0xfeUL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DUAL_DATA_RATE_NOT_SUPPORTED_OPAQUE_SFT 1
+	u8	timestamp_lo;
+	__le16	timestamp_hi;
+	__le32	event_data1; /* bits 7:0: ERROR_TYPE; DUAL_DATA_RATE_NOT_SUPPORTED (0x6) is the only value defined */
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DUAL_DATA_RATE_NOT_SUPPORTED_EVENT_DATA1_ERROR_TYPE_MASK                        0xffUL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DUAL_DATA_RATE_NOT_SUPPORTED_EVENT_DATA1_ERROR_TYPE_SFT                         0
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DUAL_DATA_RATE_NOT_SUPPORTED_EVENT_DATA1_ERROR_TYPE_DUAL_DATA_RATE_NOT_SUPPORTED  0x6UL
+	#define ASYNC_EVENT_CMPL_ERROR_REPORT_DUAL_DATA_RATE_NOT_SUPPORTED_EVENT_DATA1_ERROR_TYPE_LAST                         ASYNC_EVENT_CMPL_ERROR_REPORT_DUAL_DATA_RATE_NOT_SUPPORTED_EVENT_DATA1_ERROR_TYPE_DUAL_DATA_RATE_NOT_SUPPORTED
+};
+
+/* hwrm_func_reset_input (size:192b/24B): request layout; 16 bytes of leading header fields followed by enables, vf_id and func_reset_level. */
+struct hwrm_func_reset_input {
+	__le16	req_type; /* req_type..resp_addr: the same five leading fields (16 bytes) as the neighbouring hwrm_func_*_input structs */
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	enables; /* bit 0: VF_ID_VALID; NOTE(review): presumably marks vf_id below as meaningful - confirm against the HWRM spec */
+	#define FUNC_RESET_REQ_ENABLES_VF_ID_VALID     0x1UL
+	__le16	vf_id;
+	u8	func_reset_level; /* enumerated code: RESETALL=0, RESETME=1, RESETCHILDREN=2, RESETVF=3 */
+	#define FUNC_RESET_REQ_FUNC_RESET_LEVEL_RESETALL      0x0UL
+	#define FUNC_RESET_REQ_FUNC_RESET_LEVEL_RESETME       0x1UL
+	#define FUNC_RESET_REQ_FUNC_RESET_LEVEL_RESETCHILDREN 0x2UL
+	#define FUNC_RESET_REQ_FUNC_RESET_LEVEL_RESETVF       0x3UL
+	#define FUNC_RESET_REQ_FUNC_RESET_LEVEL_LAST         FUNC_RESET_REQ_FUNC_RESET_LEVEL_RESETVF
+	u8	unused_0; /* single byte that brings the struct to its stated 24 bytes */
+};
+
+/* hwrm_func_reset_output (size:128b/16B): response layout; carries only the 8 leading header bytes, padding and the trailing valid byte. */
+struct hwrm_func_reset_output {
+	__le16	error_code; /* error_code..resp_len: the same four leading fields (8 bytes) as the neighbouring hwrm_func_*_output structs */
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid; /* last byte of the response; NOTE(review): presumably written last to signal a complete response - confirm against the HWRM spec */
+};
+
+/* hwrm_func_getfid_input (size:192b/24B): request layout; 16 bytes of leading header fields followed by enables and pci_id. */
+struct hwrm_func_getfid_input {
+	__le16	req_type; /* req_type..resp_addr: the same five leading fields (16 bytes) as the neighbouring hwrm_func_*_input structs */
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	enables; /* bit 0: PCI_ID; NOTE(review): presumably marks pci_id below as meaningful - confirm against the HWRM spec */
+	#define FUNC_GETFID_REQ_ENABLES_PCI_ID     0x1UL
+	__le16	pci_id;
+	u8	unused_0[2]; /* two bytes that bring the struct to its stated 24 bytes */
+};
+
+/* hwrm_func_getfid_output (size:128b/16B): response layout; returns a 16-bit fid after the 8 leading header bytes. */
+struct hwrm_func_getfid_output {
+	__le16	error_code; /* error_code..resp_len: the same four leading fields (8 bytes) as the neighbouring hwrm_func_*_output structs */
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le16	fid;
+	u8	unused_0[5];
+	u8	valid; /* last byte of the response; NOTE(review): presumably written last to signal a complete response - confirm against the HWRM spec */
+};
+
+/* hwrm_func_vf_alloc_input (size:192b/24B): request layout; 16 bytes of leading header fields followed by enables, first_vf_id and num_vfs. */
+struct hwrm_func_vf_alloc_input {
+	__le16	req_type; /* req_type..resp_addr: the same five leading fields (16 bytes) as the neighbouring hwrm_func_*_input structs */
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	enables; /* bit 0: FIRST_VF_ID; NOTE(review): presumably marks first_vf_id below as meaningful - confirm against the HWRM spec */
+	#define FUNC_VF_ALLOC_REQ_ENABLES_FIRST_VF_ID     0x1UL
+	__le16	first_vf_id;
+	__le16	num_vfs;
+};
+
+/* hwrm_func_vf_alloc_output (size:128b/16B): response layout; returns a 16-bit first_vf_id after the 8 leading header bytes. */
+struct hwrm_func_vf_alloc_output {
+	__le16	error_code; /* error_code..resp_len: the same four leading fields (8 bytes) as the neighbouring hwrm_func_*_output structs */
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le16	first_vf_id;
+	u8	unused_0[5];
+	u8	valid; /* last byte of the response; NOTE(review): presumably written last to signal a complete response - confirm against the HWRM spec */
+};
+
+/* hwrm_func_vf_free_input (size:192b/24B): request layout; 16 bytes of leading header fields followed by enables, first_vf_id and num_vfs. */
+struct hwrm_func_vf_free_input {
+	__le16	req_type; /* req_type..resp_addr: the same five leading fields (16 bytes) as the neighbouring hwrm_func_*_input structs */
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	enables; /* bit 0: FIRST_VF_ID; NOTE(review): presumably marks first_vf_id below as meaningful - confirm against the HWRM spec */
+	#define FUNC_VF_FREE_REQ_ENABLES_FIRST_VF_ID     0x1UL
+	__le16	first_vf_id;
+	__le16	num_vfs;
+};
+
+/* hwrm_func_vf_free_output (size:128b/16B): response layout; carries only the 8 leading header bytes, padding and the trailing valid byte. */
+struct hwrm_func_vf_free_output {
+	__le16	error_code; /* error_code..resp_len: the same four leading fields (8 bytes) as the neighbouring hwrm_func_*_output structs */
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid; /* last byte of the response; NOTE(review): presumably written last to signal a complete response - confirm against the HWRM spec */
+};
+
+/* hwrm_func_vf_cfg_input (size:576b/72B): request layout; 16 bytes of leading header fields, an enables bitmap, a flags bitmap and the per-resource configuration values. */
+struct hwrm_func_vf_cfg_input {
+	__le16	req_type; /* req_type..resp_addr: the same five leading fields (16 bytes) as the neighbouring hwrm_func_*_input structs */
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	enables; /* one bit per ENABLES_* name (0x1..0x8000), each named after a field below; NOTE(review): presumably a set bit marks that field as valid - confirm against the HWRM spec */
+	#define FUNC_VF_CFG_REQ_ENABLES_MTU                      0x1UL
+	#define FUNC_VF_CFG_REQ_ENABLES_GUEST_VLAN               0x2UL
+	#define FUNC_VF_CFG_REQ_ENABLES_ASYNC_EVENT_CR           0x4UL
+	#define FUNC_VF_CFG_REQ_ENABLES_DFLT_MAC_ADDR            0x8UL
+	#define FUNC_VF_CFG_REQ_ENABLES_NUM_RSSCOS_CTXS          0x10UL
+	#define FUNC_VF_CFG_REQ_ENABLES_NUM_CMPL_RINGS           0x20UL
+	#define FUNC_VF_CFG_REQ_ENABLES_NUM_TX_RINGS             0x40UL
+	#define FUNC_VF_CFG_REQ_ENABLES_NUM_RX_RINGS             0x80UL
+	#define FUNC_VF_CFG_REQ_ENABLES_NUM_L2_CTXS              0x100UL
+	#define FUNC_VF_CFG_REQ_ENABLES_NUM_VNICS                0x200UL
+	#define FUNC_VF_CFG_REQ_ENABLES_NUM_STAT_CTXS            0x400UL
+	#define FUNC_VF_CFG_REQ_ENABLES_NUM_HW_RING_GRPS         0x800UL
+	#define FUNC_VF_CFG_REQ_ENABLES_NUM_KTLS_TX_KEY_CTXS     0x1000UL
+	#define FUNC_VF_CFG_REQ_ENABLES_NUM_KTLS_RX_KEY_CTXS     0x2000UL
+	#define FUNC_VF_CFG_REQ_ENABLES_NUM_QUIC_TX_KEY_CTXS     0x4000UL
+	#define FUNC_VF_CFG_REQ_ENABLES_NUM_QUIC_RX_KEY_CTXS     0x8000UL
+	__le16	mtu;
+	__le16	guest_vlan;
+	__le16	async_event_cr;
+	u8	dflt_mac_addr[6]; /* 6-byte array */
+	__le32	flags; /* independent bits: eight *_ASSETS_TEST flags (0x1..0x80) plus PPP_PUSH_MODE_ENABLE (0x100) and PPP_PUSH_MODE_DISABLE (0x200) */
+	#define FUNC_VF_CFG_REQ_FLAGS_TX_ASSETS_TEST             0x1UL
+	#define FUNC_VF_CFG_REQ_FLAGS_RX_ASSETS_TEST             0x2UL
+	#define FUNC_VF_CFG_REQ_FLAGS_CMPL_ASSETS_TEST           0x4UL
+	#define FUNC_VF_CFG_REQ_FLAGS_RSSCOS_CTX_ASSETS_TEST     0x8UL
+	#define FUNC_VF_CFG_REQ_FLAGS_RING_GRP_ASSETS_TEST       0x10UL
+	#define FUNC_VF_CFG_REQ_FLAGS_STAT_CTX_ASSETS_TEST       0x20UL
+	#define FUNC_VF_CFG_REQ_FLAGS_VNIC_ASSETS_TEST           0x40UL
+	#define FUNC_VF_CFG_REQ_FLAGS_L2_CTX_ASSETS_TEST         0x80UL
+	#define FUNC_VF_CFG_REQ_FLAGS_PPP_PUSH_MODE_ENABLE       0x100UL
+	#define FUNC_VF_CFG_REQ_FLAGS_PPP_PUSH_MODE_DISABLE      0x200UL
+	__le16	num_rsscos_ctxs;
+	__le16	num_cmpl_rings;
+	__le16	num_tx_rings;
+	__le16	num_rx_rings;
+	__le16	num_l2_ctxs;
+	__le16	num_vnics;
+	__le16	num_stat_ctxs;
+	__le16	num_hw_ring_grps;
+	__le32	num_ktls_tx_key_ctxs;
+	__le32	num_ktls_rx_key_ctxs;
+	__le16	num_msix; /* no ENABLES_* bit in this struct is named after this field */
+	u8	unused[2];
+	__le32	num_quic_tx_key_ctxs;
+	__le32	num_quic_rx_key_ctxs;
+};
+
+/* hwrm_func_vf_cfg_output (size:128b/16B): response layout; carries only the 8 leading header bytes, padding and the trailing valid byte. */
+struct hwrm_func_vf_cfg_output {
+	__le16	error_code; /* error_code..resp_len: the same four leading fields (8 bytes) as the neighbouring hwrm_func_*_output structs */
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid; /* last byte of the response; NOTE(review): presumably written last to signal a complete response - confirm against the HWRM spec */
+};
+
+/* hwrm_func_qcaps_input (size:192b/24B): request layout; 16 bytes of leading header fields followed by a 16-bit fid. */
+struct hwrm_func_qcaps_input {
+	__le16	req_type; /* req_type..resp_addr: the same five leading fields (16 bytes) as the neighbouring hwrm_func_*_input structs */
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	fid; /* NOTE(review): presumably the function ID whose capabilities are queried - confirm against the HWRM spec */
+	u8	unused_0[6]; /* six bytes that bring the struct to its stated 24 bytes */
+};
+
+/* hwrm_func_qcaps_output (size:1152b/144B) */
+struct hwrm_func_qcaps_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le16	fid;
+	__le16	port_id;
+	__le32	flags;
+	#define FUNC_QCAPS_RESP_FLAGS_PUSH_MODE_SUPPORTED                   0x1UL
+	#define FUNC_QCAPS_RESP_FLAGS_GLOBAL_MSIX_AUTOMASKING               0x2UL
+	#define FUNC_QCAPS_RESP_FLAGS_PTP_SUPPORTED                         0x4UL
+	#define FUNC_QCAPS_RESP_FLAGS_ROCE_V1_SUPPORTED                     0x8UL
+	#define FUNC_QCAPS_RESP_FLAGS_ROCE_V2_SUPPORTED                     0x10UL
+	#define FUNC_QCAPS_RESP_FLAGS_WOL_MAGICPKT_SUPPORTED                0x20UL
+	#define FUNC_QCAPS_RESP_FLAGS_WOL_BMP_SUPPORTED                     0x40UL
+	#define FUNC_QCAPS_RESP_FLAGS_TX_RING_RL_SUPPORTED                  0x80UL
+	#define FUNC_QCAPS_RESP_FLAGS_TX_BW_CFG_SUPPORTED                   0x100UL
+	#define FUNC_QCAPS_RESP_FLAGS_VF_TX_RING_RL_SUPPORTED               0x200UL
+	#define FUNC_QCAPS_RESP_FLAGS_VF_BW_CFG_SUPPORTED                   0x400UL
+	#define FUNC_QCAPS_RESP_FLAGS_STD_TX_RING_MODE_SUPPORTED            0x800UL
+	#define FUNC_QCAPS_RESP_FLAGS_GENEVE_TUN_FLAGS_SUPPORTED            0x1000UL
+	#define FUNC_QCAPS_RESP_FLAGS_NVGRE_TUN_FLAGS_SUPPORTED             0x2000UL
+	#define FUNC_QCAPS_RESP_FLAGS_GRE_TUN_FLAGS_SUPPORTED               0x4000UL
+	#define FUNC_QCAPS_RESP_FLAGS_MPLS_TUN_FLAGS_SUPPORTED              0x8000UL
+	#define FUNC_QCAPS_RESP_FLAGS_PCIE_STATS_SUPPORTED                  0x10000UL
+	#define FUNC_QCAPS_RESP_FLAGS_ADOPTED_PF_SUPPORTED                  0x20000UL
+	#define FUNC_QCAPS_RESP_FLAGS_ADMIN_PF_SUPPORTED                    0x40000UL
+	#define FUNC_QCAPS_RESP_FLAGS_LINK_ADMIN_STATUS_SUPPORTED           0x80000UL
+	#define FUNC_QCAPS_RESP_FLAGS_WCB_PUSH_MODE                         0x100000UL
+	#define FUNC_QCAPS_RESP_FLAGS_DYNAMIC_TX_RING_ALLOC                 0x200000UL
+	#define FUNC_QCAPS_RESP_FLAGS_HOT_RESET_CAPABLE                     0x400000UL
+	#define FUNC_QCAPS_RESP_FLAGS_ERROR_RECOVERY_CAPABLE                0x800000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT_STATS_SUPPORTED                   0x1000000UL
+	#define FUNC_QCAPS_RESP_FLAGS_ERR_RECOVER_RELOAD                    0x2000000UL
+	#define FUNC_QCAPS_RESP_FLAGS_NOTIFY_VF_DEF_VNIC_CHNG_SUPPORTED     0x4000000UL
+	#define FUNC_QCAPS_RESP_FLAGS_VLAN_ACCELERATION_TX_DISABLED         0x8000000UL
+	#define FUNC_QCAPS_RESP_FLAGS_COREDUMP_CMD_SUPPORTED                0x10000000UL
+	#define FUNC_QCAPS_RESP_FLAGS_CRASHDUMP_CMD_SUPPORTED               0x20000000UL
+	#define FUNC_QCAPS_RESP_FLAGS_PFC_WD_STATS_SUPPORTED                0x40000000UL
+	#define FUNC_QCAPS_RESP_FLAGS_DBG_QCAPS_CMD_SUPPORTED               0x80000000UL
+	u8	mac_address[6];
+	__le16	max_rsscos_ctx;
+	__le16	max_cmpl_rings;
+	__le16	max_tx_rings;
+	__le16	max_rx_rings;
+	__le16	max_l2_ctxs;
+	__le16	max_vnics;
+	__le16	first_vf_id;
+	__le16	max_vfs;
+	__le16	max_stat_ctx;
+	__le32	max_encap_records;
+	__le32	max_decap_records;
+	__le32	max_tx_em_flows;
+	__le32	max_tx_wm_flows;
+	__le32	max_rx_em_flows;
+	__le32	max_rx_wm_flows;
+	__le32	max_mcast_filters;
+	__le32	max_flow_id;
+	__le32	max_hw_ring_grps;
+	__le16	max_sp_tx_rings;
+	__le16	max_msix_vfs;
+	__le32	flags_ext;
+	#define FUNC_QCAPS_RESP_FLAGS_EXT_ECN_MARK_SUPPORTED                          0x1UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT_ECN_STATS_SUPPORTED                         0x2UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT_EXT_HW_STATS_SUPPORTED                      0x4UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT_HOT_RESET_IF_SUPPORT                        0x8UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT_PROXY_MODE_SUPPORT                          0x10UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT_TX_PROXY_SRC_INTF_OVERRIDE_SUPPORT          0x20UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT_SCHQ_SUPPORTED                              0x40UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT_PPP_PUSH_MODE_SUPPORTED                     0x80UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT_EVB_MODE_CFG_NOT_SUPPORTED                  0x100UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT_SOC_SPD_SUPPORTED                           0x200UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT_FW_LIVEPATCH_SUPPORTED                      0x400UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT_FAST_RESET_CAPABLE                          0x800UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT_TX_METADATA_CFG_CAPABLE                     0x1000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT_NVM_OPTION_ACTION_SUPPORTED                 0x2000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT_BD_METADATA_SUPPORTED                       0x4000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT_ECHO_REQUEST_SUPPORTED                      0x8000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT_NPAR_1_2_SUPPORTED                          0x10000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT_PTP_PTM_SUPPORTED                           0x20000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT_PTP_PPS_SUPPORTED                           0x40000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT_VF_CFG_ASYNC_FOR_PF_SUPPORTED               0x80000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT_PARTITION_BW_SUPPORTED                      0x100000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT_DFLT_VLAN_TPID_PCP_SUPPORTED                0x200000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT_KTLS_SUPPORTED                              0x400000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT_EP_RATE_CONTROL                             0x800000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT_MIN_BW_SUPPORTED                            0x1000000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT_TX_COAL_CMPL_CAP                            0x2000000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT_BS_V2_SUPPORTED                             0x4000000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT_BS_V2_REQUIRED                              0x8000000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT_PTP_64BIT_RTC_SUPPORTED                     0x10000000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT_DBR_PACING_SUPPORTED                        0x20000000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT_HW_DBR_DROP_RECOV_SUPPORTED                 0x40000000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT_DISABLE_CQ_OVERFLOW_DETECTION_SUPPORTED     0x80000000UL
+	u8	max_schqs;
+	u8	mpc_chnls_cap;
+	#define FUNC_QCAPS_RESP_MPC_CHNLS_CAP_TCE         0x1UL
+	#define FUNC_QCAPS_RESP_MPC_CHNLS_CAP_RCE         0x2UL
+	#define FUNC_QCAPS_RESP_MPC_CHNLS_CAP_TE_CFA      0x4UL
+	#define FUNC_QCAPS_RESP_MPC_CHNLS_CAP_RE_CFA      0x8UL
+	#define FUNC_QCAPS_RESP_MPC_CHNLS_CAP_PRIMATE     0x10UL
+	__le16	max_key_ctxs_alloc;
+	__le32	flags_ext2;
+	#define FUNC_QCAPS_RESP_FLAGS_EXT2_RX_ALL_PKTS_TIMESTAMPS_SUPPORTED      0x1UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT2_QUIC_SUPPORTED                        0x2UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT2_KDNET_SUPPORTED                       0x4UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT2_DBR_PACING_EXT_SUPPORTED              0x8UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT2_SW_DBR_DROP_RECOVERY_SUPPORTED        0x10UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT2_GENERIC_STATS_SUPPORTED               0x20UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT2_UDP_GSO_SUPPORTED                     0x40UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT2_SYNCE_SUPPORTED                       0x80UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT2_DBR_PACING_V0_SUPPORTED               0x100UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT2_TX_PKT_TS_CMPL_SUPPORTED              0x200UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT2_HW_LAG_SUPPORTED                      0x400UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT2_ON_CHIP_CTX_SUPPORTED                 0x800UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT2_STEERING_TAG_SUPPORTED                0x1000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT2_ENHANCED_VF_SCALE_SUPPORTED           0x2000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT2_KEY_XID_PARTITION_SUPPORTED           0x4000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT2_CONCURRENT_KTLS_QUIC_SUPPORTED        0x8000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT2_SCHQ_CROSS_TC_CAP_SUPPORTED           0x10000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT2_SCHQ_PER_TC_CAP_SUPPORTED             0x20000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT2_SCHQ_PER_TC_RESERVATION_SUPPORTED     0x40000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT2_DB_ERROR_STATS_SUPPORTED              0x80000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT2_ROCE_VF_RESOURCE_MGMT_SUPPORTED       0x100000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT2_UDCC_SUPPORTED                        0x200000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT2_TIMED_TX_SO_TXTIME_SUPPORTED          0x400000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT2_SW_MAX_RESOURCE_LIMITS_SUPPORTED      0x800000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT2_TF_INGRESS_NIC_FLOW_SUPPORTED         0x1000000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT2_LPBK_STATS_SUPPORTED                  0x2000000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT2_TF_EGRESS_NIC_FLOW_SUPPORTED          0x4000000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT2_MULTI_LOSSLESS_QUEUES_SUPPORTED       0x8000000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT2_PEER_MMAP_SUPPORTED                   0x10000000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT2_TIMED_TX_PACING_SUPPORTED             0x20000000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT2_VF_STAT_EJECTION_SUPPORTED            0x40000000UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT2_HOST_COREDUMP_SUPPORTED               0x80000000UL
+	__le16	tunnel_disable_flag;
+	#define FUNC_QCAPS_RESP_TUNNEL_DISABLE_FLAG_DISABLE_VXLAN      0x1UL
+	#define FUNC_QCAPS_RESP_TUNNEL_DISABLE_FLAG_DISABLE_NGE        0x2UL
+	#define FUNC_QCAPS_RESP_TUNNEL_DISABLE_FLAG_DISABLE_NVGRE      0x4UL
+	#define FUNC_QCAPS_RESP_TUNNEL_DISABLE_FLAG_DISABLE_L2GRE      0x8UL
+	#define FUNC_QCAPS_RESP_TUNNEL_DISABLE_FLAG_DISABLE_GRE        0x10UL
+	#define FUNC_QCAPS_RESP_TUNNEL_DISABLE_FLAG_DISABLE_IPINIP     0x20UL
+	#define FUNC_QCAPS_RESP_TUNNEL_DISABLE_FLAG_DISABLE_MPLS       0x40UL
+	#define FUNC_QCAPS_RESP_TUNNEL_DISABLE_FLAG_DISABLE_PPPOE      0x80UL
+	__le16	xid_partition_cap;
+	#define FUNC_QCAPS_RESP_XID_PARTITION_CAP_TX_CK     0x1UL
+	#define FUNC_QCAPS_RESP_XID_PARTITION_CAP_RX_CK     0x2UL
+	u8	device_serial_number[8];
+	__le16	ctxs_per_partition;
+	__le16	max_tso_segs;
+	__le32	roce_vf_max_av;
+	__le32	roce_vf_max_cq;
+	__le32	roce_vf_max_mrw;
+	__le32	roce_vf_max_qp;
+	__le32	roce_vf_max_srq;
+	__le32	roce_vf_max_gid;
+	__le32	flags_ext3;
+	#define FUNC_QCAPS_RESP_FLAGS_EXT3_RM_RSV_WHILE_ALLOC_CAP            0x1UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT3_REQUIRE_L2_FILTER                 0x2UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT3_MAX_ROCE_VFS_SUPPORTED            0x4UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT3_RX_RATE_PROFILE_SEL_SUPPORTED     0x8UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT3_BIDI_OPT_SUPPORTED                0x10UL
+	#define FUNC_QCAPS_RESP_FLAGS_EXT3_MIRROR_ON_ROCE_SUPPORTED          0x20UL
+	__le16	max_roce_vfs;
+	__le16	max_crypto_rx_flow_filters;
+	u8	unused_3[3];
+	u8	valid;
+};
+
+/* hwrm_func_qcfg_input (size:192b/24B): 16-byte request header (req_type..resp_addr), a 16-bit fid, then padding up to 24 bytes. */
+struct hwrm_func_qcfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;	/* ends the 16-byte header */
+	__le16	fid;	/* presumably the ID of the function being queried - confirm against the HWRM spec */
+	u8	unused_0[6];	/* pads the struct to 24 bytes */
+};
+
+/* hwrm_func_qcfg_output (size:1344b/168B): 8-byte response header (error_code..resp_len), payload fields, then padding and a final 'valid' byte. Each indented #define group carries the name of the field declared just above it. */
+struct hwrm_func_qcfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le16	fid;
+	__le16	port_id;
+	__le16	vlan;
+	__le16	flags;	/* every define below is a distinct single-bit mask, bits 0..15 */
+	#define FUNC_QCFG_RESP_FLAGS_OOB_WOL_MAGICPKT_ENABLED     0x1UL
+	#define FUNC_QCFG_RESP_FLAGS_OOB_WOL_BMP_ENABLED          0x2UL
+	#define FUNC_QCFG_RESP_FLAGS_FW_DCBX_AGENT_ENABLED        0x4UL
+	#define FUNC_QCFG_RESP_FLAGS_STD_TX_RING_MODE_ENABLED     0x8UL
+	#define FUNC_QCFG_RESP_FLAGS_FW_LLDP_AGENT_ENABLED        0x10UL
+	#define FUNC_QCFG_RESP_FLAGS_MULTI_HOST                   0x20UL
+	#define FUNC_QCFG_RESP_FLAGS_TRUSTED_VF                   0x40UL
+	#define FUNC_QCFG_RESP_FLAGS_SECURE_MODE_ENABLED          0x80UL
+	#define FUNC_QCFG_RESP_FLAGS_PREBOOT_LEGACY_L2_RINGS      0x100UL
+	#define FUNC_QCFG_RESP_FLAGS_HOT_RESET_ALLOWED            0x200UL
+	#define FUNC_QCFG_RESP_FLAGS_PPP_PUSH_MODE_ENABLED        0x400UL
+	#define FUNC_QCFG_RESP_FLAGS_RING_MONITOR_ENABLED         0x800UL
+	#define FUNC_QCFG_RESP_FLAGS_FAST_RESET_ALLOWED           0x1000UL
+	#define FUNC_QCFG_RESP_FLAGS_MULTI_ROOT                   0x2000UL
+	#define FUNC_QCFG_RESP_FLAGS_ENABLE_RDMA_SRIOV            0x4000UL
+	#define FUNC_QCFG_RESP_FLAGS_ROCE_VNIC_ID_VALID           0x8000UL
+	u8	mac_address[6];
+	__le16	pci_id;
+	__le16	alloc_rsscos_ctx;
+	__le16	alloc_cmpl_rings;
+	__le16	alloc_tx_rings;
+	__le16	alloc_rx_rings;
+	__le16	alloc_l2_ctx;
+	__le16	alloc_vnics;
+	__le16	admin_mtu;
+	__le16	mru;
+	__le16	stat_ctx_id;
+	u8	port_partition_type;	/* enumerated value (0x0..0x5, 0xff), not a bit mask */
+	#define FUNC_QCFG_RESP_PORT_PARTITION_TYPE_SPF     0x0UL
+	#define FUNC_QCFG_RESP_PORT_PARTITION_TYPE_MPFS    0x1UL
+	#define FUNC_QCFG_RESP_PORT_PARTITION_TYPE_NPAR1_0 0x2UL
+	#define FUNC_QCFG_RESP_PORT_PARTITION_TYPE_NPAR1_5 0x3UL
+	#define FUNC_QCFG_RESP_PORT_PARTITION_TYPE_NPAR2_0 0x4UL
+	#define FUNC_QCFG_RESP_PORT_PARTITION_TYPE_NPAR1_2 0x5UL
+	#define FUNC_QCFG_RESP_PORT_PARTITION_TYPE_UNKNOWN 0xffUL
+	#define FUNC_QCFG_RESP_PORT_PARTITION_TYPE_LAST   FUNC_QCFG_RESP_PORT_PARTITION_TYPE_UNKNOWN
+	u8	port_pf_cnt;	/* only the value 0x0 (UNAVAIL) is named */
+	#define FUNC_QCFG_RESP_PORT_PF_CNT_UNAVAIL 0x0UL
+	#define FUNC_QCFG_RESP_PORT_PF_CNT_LAST   FUNC_QCFG_RESP_PORT_PF_CNT_UNAVAIL
+	__le16	dflt_vnic_id;
+	__le16	max_mtu_configured;
+	__le32	min_bw;	/* bits 27:0 value, bit 28 scale, bits 31:29 unit */
+	#define FUNC_QCFG_RESP_MIN_BW_BW_VALUE_MASK             0xfffffffUL
+	#define FUNC_QCFG_RESP_MIN_BW_BW_VALUE_SFT              0
+	#define FUNC_QCFG_RESP_MIN_BW_SCALE                     0x10000000UL
+	#define FUNC_QCFG_RESP_MIN_BW_SCALE_BITS                  (0x0UL << 28)
+	#define FUNC_QCFG_RESP_MIN_BW_SCALE_BYTES                 (0x1UL << 28)
+	#define FUNC_QCFG_RESP_MIN_BW_SCALE_LAST                 FUNC_QCFG_RESP_MIN_BW_SCALE_BYTES
+	#define FUNC_QCFG_RESP_MIN_BW_BW_VALUE_UNIT_MASK        0xe0000000UL
+	#define FUNC_QCFG_RESP_MIN_BW_BW_VALUE_UNIT_SFT         29
+	#define FUNC_QCFG_RESP_MIN_BW_BW_VALUE_UNIT_MEGA          (0x0UL << 29)
+	#define FUNC_QCFG_RESP_MIN_BW_BW_VALUE_UNIT_KILO          (0x2UL << 29)
+	#define FUNC_QCFG_RESP_MIN_BW_BW_VALUE_UNIT_BASE          (0x4UL << 29)
+	#define FUNC_QCFG_RESP_MIN_BW_BW_VALUE_UNIT_GIGA          (0x6UL << 29)
+	#define FUNC_QCFG_RESP_MIN_BW_BW_VALUE_UNIT_PERCENT1_100  (0x1UL << 29)
+	#define FUNC_QCFG_RESP_MIN_BW_BW_VALUE_UNIT_INVALID       (0x7UL << 29)
+	#define FUNC_QCFG_RESP_MIN_BW_BW_VALUE_UNIT_LAST         FUNC_QCFG_RESP_MIN_BW_BW_VALUE_UNIT_INVALID
+	__le32	max_bw;	/* same bit layout as min_bw */
+	#define FUNC_QCFG_RESP_MAX_BW_BW_VALUE_MASK             0xfffffffUL
+	#define FUNC_QCFG_RESP_MAX_BW_BW_VALUE_SFT              0
+	#define FUNC_QCFG_RESP_MAX_BW_SCALE                     0x10000000UL
+	#define FUNC_QCFG_RESP_MAX_BW_SCALE_BITS                  (0x0UL << 28)
+	#define FUNC_QCFG_RESP_MAX_BW_SCALE_BYTES                 (0x1UL << 28)
+	#define FUNC_QCFG_RESP_MAX_BW_SCALE_LAST                 FUNC_QCFG_RESP_MAX_BW_SCALE_BYTES
+	#define FUNC_QCFG_RESP_MAX_BW_BW_VALUE_UNIT_MASK        0xe0000000UL
+	#define FUNC_QCFG_RESP_MAX_BW_BW_VALUE_UNIT_SFT         29
+	#define FUNC_QCFG_RESP_MAX_BW_BW_VALUE_UNIT_MEGA          (0x0UL << 29)
+	#define FUNC_QCFG_RESP_MAX_BW_BW_VALUE_UNIT_KILO          (0x2UL << 29)
+	#define FUNC_QCFG_RESP_MAX_BW_BW_VALUE_UNIT_BASE          (0x4UL << 29)
+	#define FUNC_QCFG_RESP_MAX_BW_BW_VALUE_UNIT_GIGA          (0x6UL << 29)
+	#define FUNC_QCFG_RESP_MAX_BW_BW_VALUE_UNIT_PERCENT1_100  (0x1UL << 29)
+	#define FUNC_QCFG_RESP_MAX_BW_BW_VALUE_UNIT_INVALID       (0x7UL << 29)
+	#define FUNC_QCFG_RESP_MAX_BW_BW_VALUE_UNIT_LAST         FUNC_QCFG_RESP_MAX_BW_BW_VALUE_UNIT_INVALID
+	u8	evb_mode;	/* enumerated value 0x0..0x2 */
+	#define FUNC_QCFG_RESP_EVB_MODE_NO_EVB 0x0UL
+	#define FUNC_QCFG_RESP_EVB_MODE_VEB    0x1UL
+	#define FUNC_QCFG_RESP_EVB_MODE_VEPA   0x2UL
+	#define FUNC_QCFG_RESP_EVB_MODE_LAST  FUNC_QCFG_RESP_EVB_MODE_VEPA
+	u8	options;	/* bits 1:0 cache line size, bits 3:2 link admin state, bits 7:4 reserved */
+	#define FUNC_QCFG_RESP_OPTIONS_CACHE_LINESIZE_MASK         0x3UL
+	#define FUNC_QCFG_RESP_OPTIONS_CACHE_LINESIZE_SFT          0
+	#define FUNC_QCFG_RESP_OPTIONS_CACHE_LINESIZE_SIZE_64        0x0UL
+	#define FUNC_QCFG_RESP_OPTIONS_CACHE_LINESIZE_SIZE_128       0x1UL
+	#define FUNC_QCFG_RESP_OPTIONS_CACHE_LINESIZE_LAST          FUNC_QCFG_RESP_OPTIONS_CACHE_LINESIZE_SIZE_128
+	#define FUNC_QCFG_RESP_OPTIONS_LINK_ADMIN_STATE_MASK       0xcUL
+	#define FUNC_QCFG_RESP_OPTIONS_LINK_ADMIN_STATE_SFT        2
+	#define FUNC_QCFG_RESP_OPTIONS_LINK_ADMIN_STATE_FORCED_DOWN  (0x0UL << 2)
+	#define FUNC_QCFG_RESP_OPTIONS_LINK_ADMIN_STATE_FORCED_UP    (0x1UL << 2)
+	#define FUNC_QCFG_RESP_OPTIONS_LINK_ADMIN_STATE_AUTO         (0x2UL << 2)
+	#define FUNC_QCFG_RESP_OPTIONS_LINK_ADMIN_STATE_LAST        FUNC_QCFG_RESP_OPTIONS_LINK_ADMIN_STATE_AUTO
+	#define FUNC_QCFG_RESP_OPTIONS_RSVD_MASK                   0xf0UL
+	#define FUNC_QCFG_RESP_OPTIONS_RSVD_SFT                    4
+	__le16	alloc_vfs;
+	__le32	alloc_mcast_filters;
+	__le32	alloc_hw_ring_grps;
+	__le16	alloc_sp_tx_rings;
+	__le16	alloc_stat_ctx;
+	__le16	alloc_msix;
+	__le16	registered_vfs;
+	__le16	l2_doorbell_bar_size_kb;
+	u8	active_endpoints;
+	u8	always_1;
+	__le32	reset_addr_poll;
+	__le16	legacy_l2_db_size_kb;
+	__le16	svif_info;	/* bits 14:0 SVIF value, bit 15 valid flag */
+	#define FUNC_QCFG_RESP_SVIF_INFO_SVIF_MASK      0x7fffUL
+	#define FUNC_QCFG_RESP_SVIF_INFO_SVIF_SFT       0
+	#define FUNC_QCFG_RESP_SVIF_INFO_SVIF_VALID     0x8000UL
+	u8	mpc_chnls;	/* single-bit masks, bits 0..4 */
+	#define FUNC_QCFG_RESP_MPC_CHNLS_TCE_ENABLED         0x1UL
+	#define FUNC_QCFG_RESP_MPC_CHNLS_RCE_ENABLED         0x2UL
+	#define FUNC_QCFG_RESP_MPC_CHNLS_TE_CFA_ENABLED      0x4UL
+	#define FUNC_QCFG_RESP_MPC_CHNLS_RE_CFA_ENABLED      0x8UL
+	#define FUNC_QCFG_RESP_MPC_CHNLS_PRIMATE_ENABLED     0x10UL
+	u8	db_page_size;	/* enumerated code 0x0..0xa; each named size is double the previous one (4KB..4MB) */
+	#define FUNC_QCFG_RESP_DB_PAGE_SIZE_4KB   0x0UL
+	#define FUNC_QCFG_RESP_DB_PAGE_SIZE_8KB   0x1UL
+	#define FUNC_QCFG_RESP_DB_PAGE_SIZE_16KB  0x2UL
+	#define FUNC_QCFG_RESP_DB_PAGE_SIZE_32KB  0x3UL
+	#define FUNC_QCFG_RESP_DB_PAGE_SIZE_64KB  0x4UL
+	#define FUNC_QCFG_RESP_DB_PAGE_SIZE_128KB 0x5UL
+	#define FUNC_QCFG_RESP_DB_PAGE_SIZE_256KB 0x6UL
+	#define FUNC_QCFG_RESP_DB_PAGE_SIZE_512KB 0x7UL
+	#define FUNC_QCFG_RESP_DB_PAGE_SIZE_1MB   0x8UL
+	#define FUNC_QCFG_RESP_DB_PAGE_SIZE_2MB   0x9UL
+	#define FUNC_QCFG_RESP_DB_PAGE_SIZE_4MB   0xaUL
+	#define FUNC_QCFG_RESP_DB_PAGE_SIZE_LAST FUNC_QCFG_RESP_DB_PAGE_SIZE_4MB
+	__le16	roce_vnic_id;
+	__le32	partition_min_bw;	/* same value/scale/unit bit split as min_bw; only the PERCENT1_100 unit is defined */
+	#define FUNC_QCFG_RESP_PARTITION_MIN_BW_BW_VALUE_MASK             0xfffffffUL
+	#define FUNC_QCFG_RESP_PARTITION_MIN_BW_BW_VALUE_SFT              0
+	#define FUNC_QCFG_RESP_PARTITION_MIN_BW_SCALE                     0x10000000UL
+	#define FUNC_QCFG_RESP_PARTITION_MIN_BW_SCALE_BITS                  (0x0UL << 28)
+	#define FUNC_QCFG_RESP_PARTITION_MIN_BW_SCALE_BYTES                 (0x1UL << 28)
+	#define FUNC_QCFG_RESP_PARTITION_MIN_BW_SCALE_LAST                 FUNC_QCFG_RESP_PARTITION_MIN_BW_SCALE_BYTES
+	#define FUNC_QCFG_RESP_PARTITION_MIN_BW_BW_VALUE_UNIT_MASK        0xe0000000UL
+	#define FUNC_QCFG_RESP_PARTITION_MIN_BW_BW_VALUE_UNIT_SFT         29
+	#define FUNC_QCFG_RESP_PARTITION_MIN_BW_BW_VALUE_UNIT_PERCENT1_100  (0x1UL << 29)
+	#define FUNC_QCFG_RESP_PARTITION_MIN_BW_BW_VALUE_UNIT_LAST         FUNC_QCFG_RESP_PARTITION_MIN_BW_BW_VALUE_UNIT_PERCENT1_100
+	__le32	partition_max_bw;	/* same bit layout as partition_min_bw */
+	#define FUNC_QCFG_RESP_PARTITION_MAX_BW_BW_VALUE_MASK             0xfffffffUL
+	#define FUNC_QCFG_RESP_PARTITION_MAX_BW_BW_VALUE_SFT              0
+	#define FUNC_QCFG_RESP_PARTITION_MAX_BW_SCALE                     0x10000000UL
+	#define FUNC_QCFG_RESP_PARTITION_MAX_BW_SCALE_BITS                  (0x0UL << 28)
+	#define FUNC_QCFG_RESP_PARTITION_MAX_BW_SCALE_BYTES                 (0x1UL << 28)
+	#define FUNC_QCFG_RESP_PARTITION_MAX_BW_SCALE_LAST                 FUNC_QCFG_RESP_PARTITION_MAX_BW_SCALE_BYTES
+	#define FUNC_QCFG_RESP_PARTITION_MAX_BW_BW_VALUE_UNIT_MASK        0xe0000000UL
+	#define FUNC_QCFG_RESP_PARTITION_MAX_BW_BW_VALUE_UNIT_SFT         29
+	#define FUNC_QCFG_RESP_PARTITION_MAX_BW_BW_VALUE_UNIT_PERCENT1_100  (0x1UL << 29)
+	#define FUNC_QCFG_RESP_PARTITION_MAX_BW_BW_VALUE_UNIT_LAST         FUNC_QCFG_RESP_PARTITION_MAX_BW_BW_VALUE_UNIT_PERCENT1_100
+	__le16	host_mtu;
+	__le16	flags2;	/* only bit 0 is defined */
+	#define FUNC_QCFG_RESP_FLAGS2_SRIOV_DSCP_INSERT_ENABLED     0x1UL
+	__le16	stag_vid;
+	u8	port_kdnet_mode;	/* enumerated value 0x0 or 0x1 */
+	#define FUNC_QCFG_RESP_PORT_KDNET_MODE_DISABLED 0x0UL
+	#define FUNC_QCFG_RESP_PORT_KDNET_MODE_ENABLED  0x1UL
+	#define FUNC_QCFG_RESP_PORT_KDNET_MODE_LAST    FUNC_QCFG_RESP_PORT_KDNET_MODE_ENABLED
+	u8	kdnet_pcie_function;
+	__le16	port_kdnet_fid;
+	u8	unused_5;
+	u8	roce_bidi_opt_mode;	/* NOTE(review): values are 0x1/0x2/0x4 with no 0x0 and no _LAST alias, unlike the other mode fields here - confirm against the HWRM spec */
+	#define FUNC_QCFG_RESP_ROCE_BIDI_OPT_MODE_DISABLED      0x1UL
+	#define FUNC_QCFG_RESP_ROCE_BIDI_OPT_MODE_DEDICATED     0x2UL
+	#define FUNC_QCFG_RESP_ROCE_BIDI_OPT_MODE_SHARED        0x4UL
+	__le32	num_ktls_tx_key_ctxs;
+	__le32	num_ktls_rx_key_ctxs;
+	u8	lag_id;
+	u8	parif;
+	u8	fw_lag_id;
+	u8	unused_6;
+	__le32	num_quic_tx_key_ctxs;
+	__le32	num_quic_rx_key_ctxs;
+	__le32	roce_max_av_per_vf;
+	__le32	roce_max_cq_per_vf;
+	__le32	roce_max_mrw_per_vf;
+	__le32	roce_max_qp_per_vf;
+	__le32	roce_max_srq_per_vf;
+	__le32	roce_max_gid_per_vf;
+	__le16	xid_partition_cfg;	/* single-bit masks, bits 0..1 */
+	#define FUNC_QCFG_RESP_XID_PARTITION_CFG_TX_CK     0x1UL
+	#define FUNC_QCFG_RESP_XID_PARTITION_CFG_RX_CK     0x2UL
+	__le16	mirror_vnic_id;
+	u8	unused_7[7];	/* pads the struct so 'valid' is its last byte */
+	u8	valid;
+};
+
+/* hwrm_func_cfg_input (size:1280b/160B): 16-byte request header (req_type..resp_addr), then the request's settings together with the flags/flags2 and enables/enables2 mask words. Each indented #define group carries the name of the field declared just above it. */
+struct hwrm_func_cfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	fid;
+	__le16	num_msix;
+	__le32	flags;	/* bits 0-1 and 9-30 are individual flags; bits 8:2 are reserved (RSVD_MASK) */
+	#define FUNC_CFG_REQ_FLAGS_SRC_MAC_ADDR_CHECK_DISABLE     0x1UL
+	#define FUNC_CFG_REQ_FLAGS_SRC_MAC_ADDR_CHECK_ENABLE      0x2UL
+	#define FUNC_CFG_REQ_FLAGS_RSVD_MASK                      0x1fcUL
+	#define FUNC_CFG_REQ_FLAGS_RSVD_SFT                       2
+	#define FUNC_CFG_REQ_FLAGS_STD_TX_RING_MODE_ENABLE        0x200UL
+	#define FUNC_CFG_REQ_FLAGS_STD_TX_RING_MODE_DISABLE       0x400UL
+	#define FUNC_CFG_REQ_FLAGS_VIRT_MAC_PERSIST               0x800UL
+	#define FUNC_CFG_REQ_FLAGS_NO_AUTOCLEAR_STATISTIC         0x1000UL
+	#define FUNC_CFG_REQ_FLAGS_TX_ASSETS_TEST                 0x2000UL
+	#define FUNC_CFG_REQ_FLAGS_RX_ASSETS_TEST                 0x4000UL
+	#define FUNC_CFG_REQ_FLAGS_CMPL_ASSETS_TEST               0x8000UL
+	#define FUNC_CFG_REQ_FLAGS_RSSCOS_CTX_ASSETS_TEST         0x10000UL
+	#define FUNC_CFG_REQ_FLAGS_RING_GRP_ASSETS_TEST           0x20000UL
+	#define FUNC_CFG_REQ_FLAGS_STAT_CTX_ASSETS_TEST           0x40000UL
+	#define FUNC_CFG_REQ_FLAGS_VNIC_ASSETS_TEST               0x80000UL
+	#define FUNC_CFG_REQ_FLAGS_L2_CTX_ASSETS_TEST             0x100000UL
+	#define FUNC_CFG_REQ_FLAGS_TRUSTED_VF_ENABLE              0x200000UL
+	#define FUNC_CFG_REQ_FLAGS_DYNAMIC_TX_RING_ALLOC          0x400000UL
+	#define FUNC_CFG_REQ_FLAGS_NQ_ASSETS_TEST                 0x800000UL
+	#define FUNC_CFG_REQ_FLAGS_TRUSTED_VF_DISABLE             0x1000000UL
+	#define FUNC_CFG_REQ_FLAGS_PREBOOT_LEGACY_L2_RINGS        0x2000000UL
+	#define FUNC_CFG_REQ_FLAGS_HOT_RESET_IF_EN_DIS            0x4000000UL
+	#define FUNC_CFG_REQ_FLAGS_PPP_PUSH_MODE_ENABLE           0x8000000UL
+	#define FUNC_CFG_REQ_FLAGS_PPP_PUSH_MODE_DISABLE          0x10000000UL
+	#define FUNC_CFG_REQ_FLAGS_BD_METADATA_ENABLE             0x20000000UL
+	#define FUNC_CFG_REQ_FLAGS_BD_METADATA_DISABLE            0x40000000UL
+	__le32	enables;	/* 32 single-bit masks, bits 0..31; presumably each marks the like-named field as valid - confirm against the HWRM spec */
+	#define FUNC_CFG_REQ_ENABLES_ADMIN_MTU                0x1UL
+	#define FUNC_CFG_REQ_ENABLES_MRU                      0x2UL
+	#define FUNC_CFG_REQ_ENABLES_NUM_RSSCOS_CTXS          0x4UL
+	#define FUNC_CFG_REQ_ENABLES_NUM_CMPL_RINGS           0x8UL
+	#define FUNC_CFG_REQ_ENABLES_NUM_TX_RINGS             0x10UL
+	#define FUNC_CFG_REQ_ENABLES_NUM_RX_RINGS             0x20UL
+	#define FUNC_CFG_REQ_ENABLES_NUM_L2_CTXS              0x40UL
+	#define FUNC_CFG_REQ_ENABLES_NUM_VNICS                0x80UL
+	#define FUNC_CFG_REQ_ENABLES_NUM_STAT_CTXS            0x100UL
+	#define FUNC_CFG_REQ_ENABLES_DFLT_MAC_ADDR            0x200UL
+	#define FUNC_CFG_REQ_ENABLES_DFLT_VLAN                0x400UL
+	#define FUNC_CFG_REQ_ENABLES_DFLT_IP_ADDR             0x800UL
+	#define FUNC_CFG_REQ_ENABLES_MIN_BW                   0x1000UL
+	#define FUNC_CFG_REQ_ENABLES_MAX_BW                   0x2000UL
+	#define FUNC_CFG_REQ_ENABLES_ASYNC_EVENT_CR           0x4000UL
+	#define FUNC_CFG_REQ_ENABLES_VLAN_ANTISPOOF_MODE      0x8000UL
+	#define FUNC_CFG_REQ_ENABLES_ALLOWED_VLAN_PRIS        0x10000UL
+	#define FUNC_CFG_REQ_ENABLES_EVB_MODE                 0x20000UL
+	#define FUNC_CFG_REQ_ENABLES_NUM_MCAST_FILTERS        0x40000UL
+	#define FUNC_CFG_REQ_ENABLES_NUM_HW_RING_GRPS         0x80000UL
+	#define FUNC_CFG_REQ_ENABLES_CACHE_LINESIZE           0x100000UL
+	#define FUNC_CFG_REQ_ENABLES_NUM_MSIX                 0x200000UL
+	#define FUNC_CFG_REQ_ENABLES_ADMIN_LINK_STATE         0x400000UL
+	#define FUNC_CFG_REQ_ENABLES_HOT_RESET_IF_SUPPORT     0x800000UL
+	#define FUNC_CFG_REQ_ENABLES_SCHQ_ID                  0x1000000UL
+	#define FUNC_CFG_REQ_ENABLES_MPC_CHNLS                0x2000000UL
+	#define FUNC_CFG_REQ_ENABLES_PARTITION_MIN_BW         0x4000000UL
+	#define FUNC_CFG_REQ_ENABLES_PARTITION_MAX_BW         0x8000000UL
+	#define FUNC_CFG_REQ_ENABLES_TPID                     0x10000000UL
+	#define FUNC_CFG_REQ_ENABLES_HOST_MTU                 0x20000000UL
+	#define FUNC_CFG_REQ_ENABLES_KTLS_TX_KEY_CTXS         0x40000000UL
+	#define FUNC_CFG_REQ_ENABLES_KTLS_RX_KEY_CTXS         0x80000000UL
+	__le16	admin_mtu;
+	__le16	mru;
+	__le16	num_rsscos_ctxs;
+	__le16	num_cmpl_rings;
+	__le16	num_tx_rings;
+	__le16	num_rx_rings;
+	__le16	num_l2_ctxs;
+	__le16	num_vnics;
+	__le16	num_stat_ctxs;
+	__le16	num_hw_ring_grps;
+	u8	dflt_mac_addr[6];
+	__le16	dflt_vlan;
+	__be32	dflt_ip_addr[4];	/* declared big-endian (__be32), unlike the little-endian fields around it */
+	__le32	min_bw;	/* bits 27:0 value, bit 28 scale, bits 31:29 unit */
+	#define FUNC_CFG_REQ_MIN_BW_BW_VALUE_MASK             0xfffffffUL
+	#define FUNC_CFG_REQ_MIN_BW_BW_VALUE_SFT              0
+	#define FUNC_CFG_REQ_MIN_BW_SCALE                     0x10000000UL
+	#define FUNC_CFG_REQ_MIN_BW_SCALE_BITS                  (0x0UL << 28)
+	#define FUNC_CFG_REQ_MIN_BW_SCALE_BYTES                 (0x1UL << 28)
+	#define FUNC_CFG_REQ_MIN_BW_SCALE_LAST                 FUNC_CFG_REQ_MIN_BW_SCALE_BYTES
+	#define FUNC_CFG_REQ_MIN_BW_BW_VALUE_UNIT_MASK        0xe0000000UL
+	#define FUNC_CFG_REQ_MIN_BW_BW_VALUE_UNIT_SFT         29
+	#define FUNC_CFG_REQ_MIN_BW_BW_VALUE_UNIT_MEGA          (0x0UL << 29)
+	#define FUNC_CFG_REQ_MIN_BW_BW_VALUE_UNIT_KILO          (0x2UL << 29)
+	#define FUNC_CFG_REQ_MIN_BW_BW_VALUE_UNIT_BASE          (0x4UL << 29)
+	#define FUNC_CFG_REQ_MIN_BW_BW_VALUE_UNIT_GIGA          (0x6UL << 29)
+	#define FUNC_CFG_REQ_MIN_BW_BW_VALUE_UNIT_PERCENT1_100  (0x1UL << 29)
+	#define FUNC_CFG_REQ_MIN_BW_BW_VALUE_UNIT_INVALID       (0x7UL << 29)
+	#define FUNC_CFG_REQ_MIN_BW_BW_VALUE_UNIT_LAST         FUNC_CFG_REQ_MIN_BW_BW_VALUE_UNIT_INVALID
+	__le32	max_bw;	/* same bit layout as min_bw */
+	#define FUNC_CFG_REQ_MAX_BW_BW_VALUE_MASK             0xfffffffUL
+	#define FUNC_CFG_REQ_MAX_BW_BW_VALUE_SFT              0
+	#define FUNC_CFG_REQ_MAX_BW_SCALE                     0x10000000UL
+	#define FUNC_CFG_REQ_MAX_BW_SCALE_BITS                  (0x0UL << 28)
+	#define FUNC_CFG_REQ_MAX_BW_SCALE_BYTES                 (0x1UL << 28)
+	#define FUNC_CFG_REQ_MAX_BW_SCALE_LAST                 FUNC_CFG_REQ_MAX_BW_SCALE_BYTES
+	#define FUNC_CFG_REQ_MAX_BW_BW_VALUE_UNIT_MASK        0xe0000000UL
+	#define FUNC_CFG_REQ_MAX_BW_BW_VALUE_UNIT_SFT         29
+	#define FUNC_CFG_REQ_MAX_BW_BW_VALUE_UNIT_MEGA          (0x0UL << 29)
+	#define FUNC_CFG_REQ_MAX_BW_BW_VALUE_UNIT_KILO          (0x2UL << 29)
+	#define FUNC_CFG_REQ_MAX_BW_BW_VALUE_UNIT_BASE          (0x4UL << 29)
+	#define FUNC_CFG_REQ_MAX_BW_BW_VALUE_UNIT_GIGA          (0x6UL << 29)
+	#define FUNC_CFG_REQ_MAX_BW_BW_VALUE_UNIT_PERCENT1_100  (0x1UL << 29)
+	#define FUNC_CFG_REQ_MAX_BW_BW_VALUE_UNIT_INVALID       (0x7UL << 29)
+	#define FUNC_CFG_REQ_MAX_BW_BW_VALUE_UNIT_LAST         FUNC_CFG_REQ_MAX_BW_BW_VALUE_UNIT_INVALID
+	__le16	async_event_cr;
+	u8	vlan_antispoof_mode;	/* enumerated value 0x0..0x3 */
+	#define FUNC_CFG_REQ_VLAN_ANTISPOOF_MODE_NOCHECK                 0x0UL
+	#define FUNC_CFG_REQ_VLAN_ANTISPOOF_MODE_VALIDATE_VLAN           0x1UL
+	#define FUNC_CFG_REQ_VLAN_ANTISPOOF_MODE_INSERT_IF_VLANDNE       0x2UL
+	#define FUNC_CFG_REQ_VLAN_ANTISPOOF_MODE_INSERT_OR_OVERRIDE_VLAN 0x3UL
+	#define FUNC_CFG_REQ_VLAN_ANTISPOOF_MODE_LAST                   FUNC_CFG_REQ_VLAN_ANTISPOOF_MODE_INSERT_OR_OVERRIDE_VLAN
+	u8	allowed_vlan_pris;
+	u8	evb_mode;	/* enumerated value 0x0..0x2 */
+	#define FUNC_CFG_REQ_EVB_MODE_NO_EVB 0x0UL
+	#define FUNC_CFG_REQ_EVB_MODE_VEB    0x1UL
+	#define FUNC_CFG_REQ_EVB_MODE_VEPA   0x2UL
+	#define FUNC_CFG_REQ_EVB_MODE_LAST  FUNC_CFG_REQ_EVB_MODE_VEPA
+	u8	options;	/* bits 1:0 cache line size, bits 3:2 link admin state, bits 7:4 reserved */
+	#define FUNC_CFG_REQ_OPTIONS_CACHE_LINESIZE_MASK         0x3UL
+	#define FUNC_CFG_REQ_OPTIONS_CACHE_LINESIZE_SFT          0
+	#define FUNC_CFG_REQ_OPTIONS_CACHE_LINESIZE_SIZE_64        0x0UL
+	#define FUNC_CFG_REQ_OPTIONS_CACHE_LINESIZE_SIZE_128       0x1UL
+	#define FUNC_CFG_REQ_OPTIONS_CACHE_LINESIZE_LAST          FUNC_CFG_REQ_OPTIONS_CACHE_LINESIZE_SIZE_128
+	#define FUNC_CFG_REQ_OPTIONS_LINK_ADMIN_STATE_MASK       0xcUL
+	#define FUNC_CFG_REQ_OPTIONS_LINK_ADMIN_STATE_SFT        2
+	#define FUNC_CFG_REQ_OPTIONS_LINK_ADMIN_STATE_FORCED_DOWN  (0x0UL << 2)
+	#define FUNC_CFG_REQ_OPTIONS_LINK_ADMIN_STATE_FORCED_UP    (0x1UL << 2)
+	#define FUNC_CFG_REQ_OPTIONS_LINK_ADMIN_STATE_AUTO         (0x2UL << 2)
+	#define FUNC_CFG_REQ_OPTIONS_LINK_ADMIN_STATE_LAST        FUNC_CFG_REQ_OPTIONS_LINK_ADMIN_STATE_AUTO
+	#define FUNC_CFG_REQ_OPTIONS_RSVD_MASK                   0xf0UL
+	#define FUNC_CFG_REQ_OPTIONS_RSVD_SFT                    4
+	__le16	num_mcast_filters;
+	__le16	schq_id;
+	__le16	mpc_chnls;	/* separate ENABLE and DISABLE bit per channel type, bits 0..9 */
+	#define FUNC_CFG_REQ_MPC_CHNLS_TCE_ENABLE          0x1UL
+	#define FUNC_CFG_REQ_MPC_CHNLS_TCE_DISABLE         0x2UL
+	#define FUNC_CFG_REQ_MPC_CHNLS_RCE_ENABLE          0x4UL
+	#define FUNC_CFG_REQ_MPC_CHNLS_RCE_DISABLE         0x8UL
+	#define FUNC_CFG_REQ_MPC_CHNLS_TE_CFA_ENABLE       0x10UL
+	#define FUNC_CFG_REQ_MPC_CHNLS_TE_CFA_DISABLE      0x20UL
+	#define FUNC_CFG_REQ_MPC_CHNLS_RE_CFA_ENABLE       0x40UL
+	#define FUNC_CFG_REQ_MPC_CHNLS_RE_CFA_DISABLE      0x80UL
+	#define FUNC_CFG_REQ_MPC_CHNLS_PRIMATE_ENABLE      0x100UL
+	#define FUNC_CFG_REQ_MPC_CHNLS_PRIMATE_DISABLE     0x200UL
+	__le32	partition_min_bw;	/* same value/scale/unit bit split as min_bw; only the PERCENT1_100 unit is defined */
+	#define FUNC_CFG_REQ_PARTITION_MIN_BW_BW_VALUE_MASK             0xfffffffUL
+	#define FUNC_CFG_REQ_PARTITION_MIN_BW_BW_VALUE_SFT              0
+	#define FUNC_CFG_REQ_PARTITION_MIN_BW_SCALE                     0x10000000UL
+	#define FUNC_CFG_REQ_PARTITION_MIN_BW_SCALE_BITS                  (0x0UL << 28)
+	#define FUNC_CFG_REQ_PARTITION_MIN_BW_SCALE_BYTES                 (0x1UL << 28)
+	#define FUNC_CFG_REQ_PARTITION_MIN_BW_SCALE_LAST                 FUNC_CFG_REQ_PARTITION_MIN_BW_SCALE_BYTES
+	#define FUNC_CFG_REQ_PARTITION_MIN_BW_BW_VALUE_UNIT_MASK        0xe0000000UL
+	#define FUNC_CFG_REQ_PARTITION_MIN_BW_BW_VALUE_UNIT_SFT         29
+	#define FUNC_CFG_REQ_PARTITION_MIN_BW_BW_VALUE_UNIT_PERCENT1_100  (0x1UL << 29)
+	#define FUNC_CFG_REQ_PARTITION_MIN_BW_BW_VALUE_UNIT_LAST         FUNC_CFG_REQ_PARTITION_MIN_BW_BW_VALUE_UNIT_PERCENT1_100
+	__le32	partition_max_bw;	/* same bit layout as partition_min_bw */
+	#define FUNC_CFG_REQ_PARTITION_MAX_BW_BW_VALUE_MASK             0xfffffffUL
+	#define FUNC_CFG_REQ_PARTITION_MAX_BW_BW_VALUE_SFT              0
+	#define FUNC_CFG_REQ_PARTITION_MAX_BW_SCALE                     0x10000000UL
+	#define FUNC_CFG_REQ_PARTITION_MAX_BW_SCALE_BITS                  (0x0UL << 28)
+	#define FUNC_CFG_REQ_PARTITION_MAX_BW_SCALE_BYTES                 (0x1UL << 28)
+	#define FUNC_CFG_REQ_PARTITION_MAX_BW_SCALE_LAST                 FUNC_CFG_REQ_PARTITION_MAX_BW_SCALE_BYTES
+	#define FUNC_CFG_REQ_PARTITION_MAX_BW_BW_VALUE_UNIT_MASK        0xe0000000UL
+	#define FUNC_CFG_REQ_PARTITION_MAX_BW_BW_VALUE_UNIT_SFT         29
+	#define FUNC_CFG_REQ_PARTITION_MAX_BW_BW_VALUE_UNIT_PERCENT1_100  (0x1UL << 29)
+	#define FUNC_CFG_REQ_PARTITION_MAX_BW_BW_VALUE_UNIT_LAST         FUNC_CFG_REQ_PARTITION_MAX_BW_BW_VALUE_UNIT_PERCENT1_100
+	__be16	tpid;	/* declared big-endian (__be16) */
+	__le16	host_mtu;
+	__le32	flags2;	/* single-bit masks, bits 0..1 */
+	#define FUNC_CFG_REQ_FLAGS2_KTLS_KEY_CTX_ASSETS_TEST     0x1UL
+	#define FUNC_CFG_REQ_FLAGS2_QUIC_KEY_CTX_ASSETS_TEST     0x2UL
+	__le32	enables2;	/* single-bit masks, bits 0..11 */
+	#define FUNC_CFG_REQ_ENABLES2_KDNET                    0x1UL
+	#define FUNC_CFG_REQ_ENABLES2_DB_PAGE_SIZE             0x2UL
+	#define FUNC_CFG_REQ_ENABLES2_QUIC_TX_KEY_CTXS         0x4UL
+	#define FUNC_CFG_REQ_ENABLES2_QUIC_RX_KEY_CTXS         0x8UL
+	#define FUNC_CFG_REQ_ENABLES2_ROCE_MAX_AV_PER_VF       0x10UL
+	#define FUNC_CFG_REQ_ENABLES2_ROCE_MAX_CQ_PER_VF       0x20UL
+	#define FUNC_CFG_REQ_ENABLES2_ROCE_MAX_MRW_PER_VF      0x40UL
+	#define FUNC_CFG_REQ_ENABLES2_ROCE_MAX_QP_PER_VF       0x80UL
+	#define FUNC_CFG_REQ_ENABLES2_ROCE_MAX_SRQ_PER_VF      0x100UL
+	#define FUNC_CFG_REQ_ENABLES2_ROCE_MAX_GID_PER_VF      0x200UL
+	#define FUNC_CFG_REQ_ENABLES2_XID_PARTITION_CFG        0x400UL
+	#define FUNC_CFG_REQ_ENABLES2_PHYSICAL_SLOT_NUMBER     0x800UL
+	u8	port_kdnet_mode;	/* enumerated value 0x0 or 0x1 */
+	#define FUNC_CFG_REQ_PORT_KDNET_MODE_DISABLED 0x0UL
+	#define FUNC_CFG_REQ_PORT_KDNET_MODE_ENABLED  0x1UL
+	#define FUNC_CFG_REQ_PORT_KDNET_MODE_LAST    FUNC_CFG_REQ_PORT_KDNET_MODE_ENABLED
+	u8	db_page_size;	/* enumerated code 0x0..0xa; each named size is double the previous one (4KB..4MB) */
+	#define FUNC_CFG_REQ_DB_PAGE_SIZE_4KB   0x0UL
+	#define FUNC_CFG_REQ_DB_PAGE_SIZE_8KB   0x1UL
+	#define FUNC_CFG_REQ_DB_PAGE_SIZE_16KB  0x2UL
+	#define FUNC_CFG_REQ_DB_PAGE_SIZE_32KB  0x3UL
+	#define FUNC_CFG_REQ_DB_PAGE_SIZE_64KB  0x4UL
+	#define FUNC_CFG_REQ_DB_PAGE_SIZE_128KB 0x5UL
+	#define FUNC_CFG_REQ_DB_PAGE_SIZE_256KB 0x6UL
+	#define FUNC_CFG_REQ_DB_PAGE_SIZE_512KB 0x7UL
+	#define FUNC_CFG_REQ_DB_PAGE_SIZE_1MB   0x8UL
+	#define FUNC_CFG_REQ_DB_PAGE_SIZE_2MB   0x9UL
+	#define FUNC_CFG_REQ_DB_PAGE_SIZE_4MB   0xaUL
+	#define FUNC_CFG_REQ_DB_PAGE_SIZE_LAST FUNC_CFG_REQ_DB_PAGE_SIZE_4MB
+	__le16	physical_slot_number;
+	__le32	num_ktls_tx_key_ctxs;
+	__le32	num_ktls_rx_key_ctxs;
+	__le32	num_quic_tx_key_ctxs;
+	__le32	num_quic_rx_key_ctxs;
+	__le32	roce_max_av_per_vf;
+	__le32	roce_max_cq_per_vf;
+	__le32	roce_max_mrw_per_vf;
+	__le32	roce_max_qp_per_vf;
+	__le32	roce_max_srq_per_vf;
+	__le32	roce_max_gid_per_vf;
+	__le16	xid_partition_cfg;	/* single-bit masks, bits 0..1 */
+	#define FUNC_CFG_REQ_XID_PARTITION_CFG_TX_CK     0x1UL
+	#define FUNC_CFG_REQ_XID_PARTITION_CFG_RX_CK     0x2UL
+	__le16	unused_2;	/* pads the struct to 160 bytes */
+};
+
+/* hwrm_func_cfg_output (size:128b/16B): 8-byte response header, 7 pad bytes and a final 'valid' byte; no other payload fields. */
+struct hwrm_func_cfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];	/* pads the struct so 'valid' is its last byte */
+	u8	valid;
+};
+
+/* hwrm_func_cfg_cmd_err (size:64b/8B): one error-code byte followed by 7 pad bytes. */
+struct hwrm_func_cfg_cmd_err {
+	u8	code;	/* enumerated value 0x0..0x4; _LAST aliases the highest defined code */
+	#define FUNC_CFG_CMD_ERR_CODE_UNKNOWN                      0x0UL
+	#define FUNC_CFG_CMD_ERR_CODE_PARTITION_MIN_BW_RANGE       0x1UL
+	#define FUNC_CFG_CMD_ERR_CODE_PARTITION_MIN_MORE_THAN_MAX  0x2UL
+	#define FUNC_CFG_CMD_ERR_CODE_PARTITION_MIN_BW_UNSUPPORTED 0x3UL
+	#define FUNC_CFG_CMD_ERR_CODE_PARTITION_BW_PERCENT         0x4UL
+	#define FUNC_CFG_CMD_ERR_CODE_LAST                        FUNC_CFG_CMD_ERR_CODE_PARTITION_BW_PERCENT
+	u8	unused_0[7];	/* pads the struct to 8 bytes */
+};
+
+/* hwrm_func_qstats_input (size:192b/24B): 16-byte request header (req_type..resp_addr), fid, a one-byte flags field, then padding up to 24 bytes. */
+struct hwrm_func_qstats_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	fid;
+	u8	flags;	/* single-bit masks, bits 0..2 */
+	#define FUNC_QSTATS_REQ_FLAGS_ROCE_ONLY        0x1UL
+	#define FUNC_QSTATS_REQ_FLAGS_COUNTER_MASK     0x2UL
+	#define FUNC_QSTATS_REQ_FLAGS_L2_ONLY          0x4UL
+	u8	unused_0[5];	/* pads the struct to 24 bytes */
+};
+
+/* hwrm_func_qstats_output (size:1408b/176B): 8-byte header, 20 __le64 counters (8 tx, 8 rx, 4 rx_agg), clear_seq, 6 unused bytes, valid byte */
+struct hwrm_func_qstats_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le64	tx_ucast_pkts;
+	__le64	tx_mcast_pkts;
+	__le64	tx_bcast_pkts;
+	__le64	tx_discard_pkts;
+	__le64	tx_drop_pkts;
+	__le64	tx_ucast_bytes;
+	__le64	tx_mcast_bytes;
+	__le64	tx_bcast_bytes;
+	__le64	rx_ucast_pkts;
+	__le64	rx_mcast_pkts;
+	__le64	rx_bcast_pkts;
+	__le64	rx_discard_pkts;
+	__le64	rx_drop_pkts;
+	__le64	rx_ucast_bytes;
+	__le64	rx_mcast_bytes;
+	__le64	rx_bcast_bytes;
+	__le64	rx_agg_pkts;
+	__le64	rx_agg_bytes;
+	__le64	rx_agg_events;
+	__le64	rx_agg_aborts;
+	u8	clear_seq;
+	u8	unused_0[6];
+	u8	valid;
+};
+
+/* hwrm_func_qstats_ext_input (size:256b/32B): 16-byte header, fid, flags byte, __le32 enables, schq_id, traffic_class, padding */
+struct hwrm_func_qstats_ext_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	fid;
+	u8	flags;	/* FUNC_QSTATS_EXT_REQ_FLAGS_* bits below */
+	#define FUNC_QSTATS_EXT_REQ_FLAGS_ROCE_ONLY        0x1UL
+	#define FUNC_QSTATS_EXT_REQ_FLAGS_COUNTER_MASK     0x2UL
+	u8	unused_0[1];
+	__le32	enables;	/* FUNC_QSTATS_EXT_REQ_ENABLES_* bits below */
+	#define FUNC_QSTATS_EXT_REQ_ENABLES_SCHQ_ID     0x1UL
+	__le16	schq_id;
+	__le16	traffic_class;
+	u8	unused_1[4];
+};
+
+/* hwrm_func_qstats_ext_output (size:1536b/192B): 8-byte header, 22 __le64 counters (8 rx, 8 tx, 6 rx_tpa), 7 unused bytes, valid byte */
+struct hwrm_func_qstats_ext_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le64	rx_ucast_pkts;
+	__le64	rx_mcast_pkts;
+	__le64	rx_bcast_pkts;
+	__le64	rx_discard_pkts;
+	__le64	rx_error_pkts;
+	__le64	rx_ucast_bytes;
+	__le64	rx_mcast_bytes;
+	__le64	rx_bcast_bytes;
+	__le64	tx_ucast_pkts;
+	__le64	tx_mcast_pkts;
+	__le64	tx_bcast_pkts;
+	__le64	tx_error_pkts;
+	__le64	tx_discard_pkts;
+	__le64	tx_ucast_bytes;
+	__le64	tx_mcast_bytes;
+	__le64	tx_bcast_bytes;
+	__le64	rx_tpa_eligible_pkt;
+	__le64	rx_tpa_eligible_bytes;
+	__le64	rx_tpa_pkt;
+	__le64	rx_tpa_bytes;
+	__le64	rx_tpa_errors;
+	__le64	rx_tpa_events;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_func_clr_stats_input (size:192b/24B): 16-byte header (req_type..resp_addr), fid, 6 unused bytes */
+struct hwrm_func_clr_stats_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	fid;
+	u8	unused_0[6];
+};
+
+/* hwrm_func_clr_stats_output (size:128b/16B): four __le16 header words, 7 unused bytes, then the trailing valid byte */
+struct hwrm_func_clr_stats_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_func_vf_resc_free_input (size:192b/24B): 16-byte header (req_type..resp_addr), vf_id, 6 unused bytes */
+struct hwrm_func_vf_resc_free_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	vf_id;
+	u8	unused_0[6];
+};
+
+/* hwrm_func_vf_resc_free_output (size:128b/16B): four __le16 header words, 7 unused bytes, then the trailing valid byte */
+struct hwrm_func_vf_resc_free_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_func_drv_rgtr_input (size:896b/112B): 16-byte header, flags/enables words, os_type, 8-bit and 16-bit version fields, timestamp, two __le32[8] arrays */
+struct hwrm_func_drv_rgtr_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	flags;	/* FUNC_DRV_RGTR_REQ_FLAGS_* bits below */
+	#define FUNC_DRV_RGTR_REQ_FLAGS_FWD_ALL_MODE                     0x1UL
+	#define FUNC_DRV_RGTR_REQ_FLAGS_FWD_NONE_MODE                    0x2UL
+	#define FUNC_DRV_RGTR_REQ_FLAGS_16BIT_VER_MODE                   0x4UL
+	#define FUNC_DRV_RGTR_REQ_FLAGS_FLOW_HANDLE_64BIT_MODE           0x8UL
+	#define FUNC_DRV_RGTR_REQ_FLAGS_HOT_RESET_SUPPORT                0x10UL
+	#define FUNC_DRV_RGTR_REQ_FLAGS_ERROR_RECOVERY_SUPPORT           0x20UL
+	#define FUNC_DRV_RGTR_REQ_FLAGS_MASTER_SUPPORT                   0x40UL
+	#define FUNC_DRV_RGTR_REQ_FLAGS_FAST_RESET_SUPPORT               0x80UL
+	#define FUNC_DRV_RGTR_REQ_FLAGS_RSS_STRICT_HASH_TYPE_SUPPORT     0x100UL
+	#define FUNC_DRV_RGTR_REQ_FLAGS_NPAR_1_2_SUPPORT                 0x200UL
+	#define FUNC_DRV_RGTR_REQ_FLAGS_ASYM_QUEUE_CFG_SUPPORT           0x400UL
+	#define FUNC_DRV_RGTR_REQ_FLAGS_TF_INGRESS_NIC_FLOW_MODE         0x800UL
+	#define FUNC_DRV_RGTR_REQ_FLAGS_TF_EGRESS_NIC_FLOW_MODE          0x1000UL
+	__le32	enables;	/* FUNC_DRV_RGTR_REQ_ENABLES_* bits below */
+	#define FUNC_DRV_RGTR_REQ_ENABLES_OS_TYPE             0x1UL
+	#define FUNC_DRV_RGTR_REQ_ENABLES_VER                 0x2UL
+	#define FUNC_DRV_RGTR_REQ_ENABLES_TIMESTAMP           0x4UL
+	#define FUNC_DRV_RGTR_REQ_ENABLES_VF_REQ_FWD          0x8UL
+	#define FUNC_DRV_RGTR_REQ_ENABLES_ASYNC_EVENT_FWD     0x10UL
+	__le16	os_type;	/* one of the FUNC_DRV_RGTR_REQ_OS_TYPE_* codes below */
+	#define FUNC_DRV_RGTR_REQ_OS_TYPE_UNKNOWN   0x0UL
+	#define FUNC_DRV_RGTR_REQ_OS_TYPE_OTHER     0x1UL
+	#define FUNC_DRV_RGTR_REQ_OS_TYPE_MSDOS     0xeUL
+	#define FUNC_DRV_RGTR_REQ_OS_TYPE_WINDOWS   0x12UL
+	#define FUNC_DRV_RGTR_REQ_OS_TYPE_SOLARIS   0x1dUL
+	#define FUNC_DRV_RGTR_REQ_OS_TYPE_LINUX     0x24UL
+	#define FUNC_DRV_RGTR_REQ_OS_TYPE_FREEBSD   0x2aUL
+	#define FUNC_DRV_RGTR_REQ_OS_TYPE_ESXI      0x68UL
+	#define FUNC_DRV_RGTR_REQ_OS_TYPE_WIN864    0x73UL
+	#define FUNC_DRV_RGTR_REQ_OS_TYPE_WIN2012R2 0x74UL
+	#define FUNC_DRV_RGTR_REQ_OS_TYPE_UEFI      0x8000UL
+	#define FUNC_DRV_RGTR_REQ_OS_TYPE_LAST     FUNC_DRV_RGTR_REQ_OS_TYPE_UEFI
+	u8	ver_maj_8b;
+	u8	ver_min_8b;
+	u8	ver_upd_8b;
+	u8	unused_0[3];
+	__le32	timestamp;
+	u8	unused_1[4];
+	__le32	vf_req_fwd[8];
+	__le32	async_event_fwd[8];
+	__le16	ver_maj;
+	__le16	ver_min;
+	__le16	ver_upd;
+	__le16	ver_patch;
+};
+
+/* hwrm_func_drv_rgtr_output (size:128b/16B): 8-byte header, __le32 flags, 3 unused bytes, then the trailing valid byte */
+struct hwrm_func_drv_rgtr_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le32	flags;	/* FUNC_DRV_RGTR_RESP_FLAGS_* bits below */
+	#define FUNC_DRV_RGTR_RESP_FLAGS_IF_CHANGE_SUPPORTED     0x1UL
+	u8	unused_0[3];
+	u8	valid;
+};
+
+/* hwrm_func_drv_unrgtr_input (size:192b/24B): 16-byte header (req_type..resp_addr), __le32 flags, 4 unused bytes */
+struct hwrm_func_drv_unrgtr_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	flags;	/* FUNC_DRV_UNRGTR_REQ_FLAGS_* bits below */
+	#define FUNC_DRV_UNRGTR_REQ_FLAGS_PREPARE_FOR_SHUTDOWN     0x1UL
+	u8	unused_0[4];
+};
+
+/* hwrm_func_drv_unrgtr_output (size:128b/16B): four __le16 header words, 7 unused bytes, then the trailing valid byte */
+struct hwrm_func_drv_unrgtr_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_func_buf_rgtr_input (size:1024b/128B): 16-byte header, enables, vf_id, buffer page count/size/length fields, ten req_buf_page_addr slots, error and response buffer addresses */
+struct hwrm_func_buf_rgtr_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	enables;	/* FUNC_BUF_RGTR_REQ_ENABLES_* bits below */
+	#define FUNC_BUF_RGTR_REQ_ENABLES_VF_ID            0x1UL
+	#define FUNC_BUF_RGTR_REQ_ENABLES_ERR_BUF_ADDR     0x2UL
+	__le16	vf_id;
+	__le16	req_buf_num_pages;
+	__le16	req_buf_page_size;	/* each encoding listed below equals log2 of the named size in bytes (0xc -> 4K, 0x1e -> 1G) */
+	#define FUNC_BUF_RGTR_REQ_REQ_BUF_PAGE_SIZE_16B 0x4UL
+	#define FUNC_BUF_RGTR_REQ_REQ_BUF_PAGE_SIZE_4K  0xcUL
+	#define FUNC_BUF_RGTR_REQ_REQ_BUF_PAGE_SIZE_8K  0xdUL
+	#define FUNC_BUF_RGTR_REQ_REQ_BUF_PAGE_SIZE_64K 0x10UL
+	#define FUNC_BUF_RGTR_REQ_REQ_BUF_PAGE_SIZE_2M  0x15UL
+	#define FUNC_BUF_RGTR_REQ_REQ_BUF_PAGE_SIZE_4M  0x16UL
+	#define FUNC_BUF_RGTR_REQ_REQ_BUF_PAGE_SIZE_1G  0x1eUL
+	#define FUNC_BUF_RGTR_REQ_REQ_BUF_PAGE_SIZE_LAST FUNC_BUF_RGTR_REQ_REQ_BUF_PAGE_SIZE_1G
+	__le16	req_buf_len;
+	__le16	resp_buf_len;
+	u8	unused_0[2];
+	__le64	req_buf_page_addr0;
+	__le64	req_buf_page_addr1;
+	__le64	req_buf_page_addr2;
+	__le64	req_buf_page_addr3;
+	__le64	req_buf_page_addr4;
+	__le64	req_buf_page_addr5;
+	__le64	req_buf_page_addr6;
+	__le64	req_buf_page_addr7;
+	__le64	req_buf_page_addr8;
+	__le64	req_buf_page_addr9;
+	__le64	error_buf_addr;
+	__le64	resp_buf_addr;
+};
+
+/* hwrm_func_buf_rgtr_output (size:128b/16B): four __le16 header words, 7 unused bytes, then the trailing valid byte */
+struct hwrm_func_buf_rgtr_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_func_drv_qver_input (size:192b/24B): 16-byte header, __le32 reserved word, fid, driver_type byte, 1 unused byte */
+struct hwrm_func_drv_qver_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	reserved;
+	__le16	fid;
+	u8	driver_type;	/* one of the FUNC_DRV_QVER_REQ_DRIVER_TYPE_* codes below */
+	#define FUNC_DRV_QVER_REQ_DRIVER_TYPE_L2   0x0UL
+	#define FUNC_DRV_QVER_REQ_DRIVER_TYPE_ROCE 0x1UL
+	#define FUNC_DRV_QVER_REQ_DRIVER_TYPE_LAST FUNC_DRV_QVER_REQ_DRIVER_TYPE_ROCE
+	u8	unused_0;
+};
+
+/* hwrm_func_drv_qver_output (size:256b/32B): 8-byte header, os_type, 8-bit and 16-bit version fields, 7 unused bytes, then the trailing valid byte */
+struct hwrm_func_drv_qver_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le16	os_type;	/* one of the FUNC_DRV_QVER_RESP_OS_TYPE_* codes below */
+	#define FUNC_DRV_QVER_RESP_OS_TYPE_UNKNOWN   0x0UL
+	#define FUNC_DRV_QVER_RESP_OS_TYPE_OTHER     0x1UL
+	#define FUNC_DRV_QVER_RESP_OS_TYPE_MSDOS     0xeUL
+	#define FUNC_DRV_QVER_RESP_OS_TYPE_WINDOWS   0x12UL
+	#define FUNC_DRV_QVER_RESP_OS_TYPE_SOLARIS   0x1dUL
+	#define FUNC_DRV_QVER_RESP_OS_TYPE_LINUX     0x24UL
+	#define FUNC_DRV_QVER_RESP_OS_TYPE_FREEBSD   0x2aUL
+	#define FUNC_DRV_QVER_RESP_OS_TYPE_ESXI      0x68UL
+	#define FUNC_DRV_QVER_RESP_OS_TYPE_WIN864    0x73UL
+	#define FUNC_DRV_QVER_RESP_OS_TYPE_WIN2012R2 0x74UL
+	#define FUNC_DRV_QVER_RESP_OS_TYPE_UEFI      0x8000UL
+	#define FUNC_DRV_QVER_RESP_OS_TYPE_LAST     FUNC_DRV_QVER_RESP_OS_TYPE_UEFI
+	u8	ver_maj_8b;
+	u8	ver_min_8b;
+	u8	ver_upd_8b;
+	u8	unused_0[3];
+	__le16	ver_maj;
+	__le16	ver_min;
+	__le16	ver_upd;
+	__le16	ver_patch;
+	u8	unused_1[7];
+	u8	valid;
+};
+
+/* hwrm_func_resource_qcaps_input (size:192b/24B): 16-byte header (req_type..resp_addr), fid, 6 unused bytes */
+struct hwrm_func_resource_qcaps_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	fid;
+	u8	unused_0[6];
+};
+
+/* hwrm_func_resource_qcaps_output (size:704b/88B): 8-byte header, max_vfs/max_msix, VF reservation strategy, __le16 min/max pairs, flags, min_msix, __le32 ktls/quic key-context min/max pairs, valid byte */
+struct hwrm_func_resource_qcaps_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le16	max_vfs;
+	__le16	max_msix;
+	__le16	vf_reservation_strategy;	/* one of the ..._VF_RESERVATION_STRATEGY_* codes below */
+	#define FUNC_RESOURCE_QCAPS_RESP_VF_RESERVATION_STRATEGY_MAXIMAL        0x0UL
+	#define FUNC_RESOURCE_QCAPS_RESP_VF_RESERVATION_STRATEGY_MINIMAL        0x1UL
+	#define FUNC_RESOURCE_QCAPS_RESP_VF_RESERVATION_STRATEGY_MINIMAL_STATIC 0x2UL
+	#define FUNC_RESOURCE_QCAPS_RESP_VF_RESERVATION_STRATEGY_LAST          FUNC_RESOURCE_QCAPS_RESP_VF_RESERVATION_STRATEGY_MINIMAL_STATIC
+	__le16	min_rsscos_ctx;
+	__le16	max_rsscos_ctx;
+	__le16	min_cmpl_rings;
+	__le16	max_cmpl_rings;
+	__le16	min_tx_rings;
+	__le16	max_tx_rings;
+	__le16	min_rx_rings;
+	__le16	max_rx_rings;
+	__le16	min_l2_ctxs;
+	__le16	max_l2_ctxs;
+	__le16	min_vnics;
+	__le16	max_vnics;
+	__le16	min_stat_ctx;
+	__le16	max_stat_ctx;
+	__le16	min_hw_ring_grps;
+	__le16	max_hw_ring_grps;
+	__le16	max_tx_scheduler_inputs;
+	__le16	flags;	/* FUNC_RESOURCE_QCAPS_RESP_FLAGS_* bits below */
+	#define FUNC_RESOURCE_QCAPS_RESP_FLAGS_MIN_GUARANTEED     0x1UL
+	__le16	min_msix;
+	__le32	min_ktls_tx_key_ctxs;
+	__le32	max_ktls_tx_key_ctxs;
+	__le32	min_ktls_rx_key_ctxs;
+	__le32	max_ktls_rx_key_ctxs;
+	__le32	min_quic_tx_key_ctxs;
+	__le32	max_quic_tx_key_ctxs;
+	__le32	min_quic_rx_key_ctxs;
+	__le32	max_quic_rx_key_ctxs;
+	u8	unused_0[3];
+	u8	valid;
+};
+
+/* hwrm_func_vf_resource_cfg_input (size:704b/88B): 16-byte header, vf_id, max_msix, __le16 min/max pairs, flags, min_msix, __le32 ktls/quic key-context min/max pairs; no trailing padding */
+struct hwrm_func_vf_resource_cfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	vf_id;
+	__le16	max_msix;
+	__le16	min_rsscos_ctx;
+	__le16	max_rsscos_ctx;
+	__le16	min_cmpl_rings;
+	__le16	max_cmpl_rings;
+	__le16	min_tx_rings;
+	__le16	max_tx_rings;
+	__le16	min_rx_rings;
+	__le16	max_rx_rings;
+	__le16	min_l2_ctxs;
+	__le16	max_l2_ctxs;
+	__le16	min_vnics;
+	__le16	max_vnics;
+	__le16	min_stat_ctx;
+	__le16	max_stat_ctx;
+	__le16	min_hw_ring_grps;
+	__le16	max_hw_ring_grps;
+	__le16	flags;	/* FUNC_VF_RESOURCE_CFG_REQ_FLAGS_* bits below */
+	#define FUNC_VF_RESOURCE_CFG_REQ_FLAGS_MIN_GUARANTEED     0x1UL
+	__le16	min_msix;
+	__le32	min_ktls_tx_key_ctxs;
+	__le32	max_ktls_tx_key_ctxs;
+	__le32	min_ktls_rx_key_ctxs;
+	__le32	max_ktls_rx_key_ctxs;
+	__le32	min_quic_tx_key_ctxs;
+	__le32	max_quic_tx_key_ctxs;
+	__le32	min_quic_rx_key_ctxs;
+	__le32	max_quic_rx_key_ctxs;
+};
+
+/* hwrm_func_vf_resource_cfg_output (size:384b/48B): 8-byte header, eight __le16 and four __le32 reserved_* counts, 7 unused bytes, then the trailing valid byte */
+struct hwrm_func_vf_resource_cfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le16	reserved_rsscos_ctx;
+	__le16	reserved_cmpl_rings;
+	__le16	reserved_tx_rings;
+	__le16	reserved_rx_rings;
+	__le16	reserved_l2_ctxs;
+	__le16	reserved_vnics;
+	__le16	reserved_stat_ctx;
+	__le16	reserved_hw_ring_grps;
+	__le32	reserved_ktls_tx_key_ctxs;
+	__le32	reserved_ktls_rx_key_ctxs;
+	__le32	reserved_quic_tx_key_ctxs;
+	__le32	reserved_quic_rx_key_ctxs;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_func_backing_store_qcaps_input (size:128b/16B): consists solely of the 16-byte header (req_type..resp_addr) */
+struct hwrm_func_backing_store_qcaps_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+};
+
+/* hwrm_func_backing_store_qcaps_output (size:832b/104B): 8-byte header, entry-size and entry-count fields (qp, srq, cq, vnic, stat, tqm, mrav, tim, tkc, rkc), ctx_init_mask with *_init_offset bytes, 5 reserved bytes, valid byte */
+struct hwrm_func_backing_store_qcaps_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le32	qp_max_entries;
+	__le16	qp_min_qp1_entries;
+	__le16	qp_max_l2_entries;
+	__le16	qp_entry_size;
+	__le16	srq_max_l2_entries;
+	__le32	srq_max_entries;
+	__le16	srq_entry_size;
+	__le16	cq_max_l2_entries;
+	__le32	cq_max_entries;
+	__le16	cq_entry_size;
+	__le16	vnic_max_vnic_entries;
+	__le16	vnic_max_ring_table_entries;
+	__le16	vnic_entry_size;
+	__le32	stat_max_entries;
+	__le16	stat_entry_size;
+	__le16	tqm_entry_size;
+	__le32	tqm_min_entries_per_ring;
+	__le32	tqm_max_entries_per_ring;
+	__le32	mrav_max_entries;
+	__le16	mrav_entry_size;
+	__le16	tim_entry_size;
+	__le32	tim_max_entries;
+	__le16	mrav_num_entries_units;
+	u8	tqm_entries_multiple;
+	u8	ctx_kind_initializer;
+	__le16	ctx_init_mask;	/* FUNC_BACKING_STORE_QCAPS_RESP_CTX_INIT_MASK_* bits below */
+	#define FUNC_BACKING_STORE_QCAPS_RESP_CTX_INIT_MASK_QP       0x1UL
+	#define FUNC_BACKING_STORE_QCAPS_RESP_CTX_INIT_MASK_SRQ      0x2UL
+	#define FUNC_BACKING_STORE_QCAPS_RESP_CTX_INIT_MASK_CQ       0x4UL
+	#define FUNC_BACKING_STORE_QCAPS_RESP_CTX_INIT_MASK_VNIC     0x8UL
+	#define FUNC_BACKING_STORE_QCAPS_RESP_CTX_INIT_MASK_STAT     0x10UL
+	#define FUNC_BACKING_STORE_QCAPS_RESP_CTX_INIT_MASK_MRAV     0x20UL
+	#define FUNC_BACKING_STORE_QCAPS_RESP_CTX_INIT_MASK_TKC      0x40UL
+	#define FUNC_BACKING_STORE_QCAPS_RESP_CTX_INIT_MASK_RKC      0x80UL
+	u8	qp_init_offset;
+	u8	srq_init_offset;
+	u8	cq_init_offset;
+	u8	vnic_init_offset;
+	u8	tqm_fp_rings_count;
+	u8	stat_init_offset;
+	u8	mrav_init_offset;
+	u8	tqm_fp_rings_count_ext;
+	u8	tkc_init_offset;
+	u8	rkc_init_offset;
+	__le16	tkc_entry_size;
+	__le16	rkc_entry_size;
+	__le32	tkc_max_entries;
+	__le32	rkc_max_entries;
+	__le16	fast_qpmd_qp_num_entries;
+	u8	rsvd1[5];
+	u8	valid;
+};
+
+/* tqm_fp_ring_cfg (size:128b/16B): one packed level/page-size byte, 3 unused bytes, __le32 entry count, __le64 page directory field */
+struct tqm_fp_ring_cfg {
+	u8	tqm_ring_pg_size_tqm_ring_lvl;	/* low nibble (mask 0xf): ring level; high nibble (mask 0xf0, shift 4): page size */
+	#define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_LVL_MASK      0xfUL
+	#define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_LVL_SFT       0
+	#define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_LVL_LVL_0       0x0UL
+	#define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_LVL_LVL_1       0x1UL
+	#define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_LVL_LVL_2       0x2UL
+	#define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_LVL_LAST       TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_LVL_LVL_2
+	#define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_PG_SIZE_MASK  0xf0UL
+	#define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_PG_SIZE_SFT   4
+	#define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_PG_SIZE_PG_4K   (0x0UL << 4)
+	#define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_PG_SIZE_PG_8K   (0x1UL << 4)
+	#define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_PG_SIZE_PG_64K  (0x2UL << 4)
+	#define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_PG_SIZE_PG_2M   (0x3UL << 4)
+	#define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_PG_SIZE_PG_8M   (0x4UL << 4)
+	#define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_PG_SIZE_PG_1G   (0x5UL << 4)
+	#define TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_PG_SIZE_LAST   TQM_FP_RING_CFG_TQM_RING_CFG_TQM_RING_PG_SIZE_PG_1G
+	u8	unused[3];
+	__le32	tqm_ring_num_entries;
+	__le64	tqm_ring_page_dir;
+};
+
+/* hwrm_func_backing_store_cfg_input (size:2688b/336B) */
+struct hwrm_func_backing_store_cfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	flags;
+	#define FUNC_BACKING_STORE_CFG_REQ_FLAGS_PREBOOT_MODE               0x1UL
+	#define FUNC_BACKING_STORE_CFG_REQ_FLAGS_MRAV_RESERVATION_SPLIT     0x2UL
+	__le32	enables;
+	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_QP               0x1UL
+	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_SRQ              0x2UL
+	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_CQ               0x4UL
+	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_VNIC             0x8UL
+	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_STAT             0x10UL
+	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_TQM_SP           0x20UL
+	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_TQM_RING0        0x40UL
+	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_TQM_RING1        0x80UL
+	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_TQM_RING2        0x100UL
+	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_TQM_RING3        0x200UL
+	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_TQM_RING4        0x400UL
+	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_TQM_RING5        0x800UL
+	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_TQM_RING6        0x1000UL
+	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_TQM_RING7        0x2000UL
+	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_MRAV             0x4000UL
+	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_TIM              0x8000UL
+	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_TQM_RING8        0x10000UL
+	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_TQM_RING9        0x20000UL
+	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_TQM_RING10       0x40000UL
+	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_TKC              0x80000UL
+	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_RKC              0x100000UL
+	#define FUNC_BACKING_STORE_CFG_REQ_ENABLES_QP_FAST_QPMD     0x200000UL
+	u8	qpc_pg_size_qpc_lvl;
+	#define FUNC_BACKING_STORE_CFG_REQ_QPC_LVL_MASK      0xfUL
+	#define FUNC_BACKING_STORE_CFG_REQ_QPC_LVL_SFT       0
+	#define FUNC_BACKING_STORE_CFG_REQ_QPC_LVL_LVL_0       0x0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_QPC_LVL_LVL_1       0x1UL
+	#define FUNC_BACKING_STORE_CFG_REQ_QPC_LVL_LVL_2       0x2UL
+	#define FUNC_BACKING_STORE_CFG_REQ_QPC_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_QPC_LVL_LVL_2
+	#define FUNC_BACKING_STORE_CFG_REQ_QPC_PG_SIZE_MASK  0xf0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_QPC_PG_SIZE_SFT   4
+	#define FUNC_BACKING_STORE_CFG_REQ_QPC_PG_SIZE_PG_4K   (0x0UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_QPC_PG_SIZE_PG_8K   (0x1UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_QPC_PG_SIZE_PG_64K  (0x2UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_QPC_PG_SIZE_PG_2M   (0x3UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_QPC_PG_SIZE_PG_8M   (0x4UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_QPC_PG_SIZE_PG_1G   (0x5UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_QPC_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_QPC_PG_SIZE_PG_1G
+	u8	srq_pg_size_srq_lvl;
+	#define FUNC_BACKING_STORE_CFG_REQ_SRQ_LVL_MASK      0xfUL
+	#define FUNC_BACKING_STORE_CFG_REQ_SRQ_LVL_SFT       0
+	#define FUNC_BACKING_STORE_CFG_REQ_SRQ_LVL_LVL_0       0x0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_SRQ_LVL_LVL_1       0x1UL
+	#define FUNC_BACKING_STORE_CFG_REQ_SRQ_LVL_LVL_2       0x2UL
+	#define FUNC_BACKING_STORE_CFG_REQ_SRQ_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_SRQ_LVL_LVL_2
+	#define FUNC_BACKING_STORE_CFG_REQ_SRQ_PG_SIZE_MASK  0xf0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_SRQ_PG_SIZE_SFT   4
+	#define FUNC_BACKING_STORE_CFG_REQ_SRQ_PG_SIZE_PG_4K   (0x0UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_SRQ_PG_SIZE_PG_8K   (0x1UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_SRQ_PG_SIZE_PG_64K  (0x2UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_SRQ_PG_SIZE_PG_2M   (0x3UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_SRQ_PG_SIZE_PG_8M   (0x4UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_SRQ_PG_SIZE_PG_1G   (0x5UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_SRQ_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_SRQ_PG_SIZE_PG_1G
+	u8	cq_pg_size_cq_lvl;
+	#define FUNC_BACKING_STORE_CFG_REQ_CQ_LVL_MASK      0xfUL
+	#define FUNC_BACKING_STORE_CFG_REQ_CQ_LVL_SFT       0
+	#define FUNC_BACKING_STORE_CFG_REQ_CQ_LVL_LVL_0       0x0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_CQ_LVL_LVL_1       0x1UL
+	#define FUNC_BACKING_STORE_CFG_REQ_CQ_LVL_LVL_2       0x2UL
+	#define FUNC_BACKING_STORE_CFG_REQ_CQ_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_CQ_LVL_LVL_2
+	#define FUNC_BACKING_STORE_CFG_REQ_CQ_PG_SIZE_MASK  0xf0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_CQ_PG_SIZE_SFT   4
+	#define FUNC_BACKING_STORE_CFG_REQ_CQ_PG_SIZE_PG_4K   (0x0UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_CQ_PG_SIZE_PG_8K   (0x1UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_CQ_PG_SIZE_PG_64K  (0x2UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_CQ_PG_SIZE_PG_2M   (0x3UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_CQ_PG_SIZE_PG_8M   (0x4UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_CQ_PG_SIZE_PG_1G   (0x5UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_CQ_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_CQ_PG_SIZE_PG_1G
+	u8	vnic_pg_size_vnic_lvl;
+	#define FUNC_BACKING_STORE_CFG_REQ_VNIC_LVL_MASK      0xfUL
+	#define FUNC_BACKING_STORE_CFG_REQ_VNIC_LVL_SFT       0
+	#define FUNC_BACKING_STORE_CFG_REQ_VNIC_LVL_LVL_0       0x0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_VNIC_LVL_LVL_1       0x1UL
+	#define FUNC_BACKING_STORE_CFG_REQ_VNIC_LVL_LVL_2       0x2UL
+	#define FUNC_BACKING_STORE_CFG_REQ_VNIC_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_VNIC_LVL_LVL_2
+	#define FUNC_BACKING_STORE_CFG_REQ_VNIC_PG_SIZE_MASK  0xf0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_VNIC_PG_SIZE_SFT   4
+	#define FUNC_BACKING_STORE_CFG_REQ_VNIC_PG_SIZE_PG_4K   (0x0UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_VNIC_PG_SIZE_PG_8K   (0x1UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_VNIC_PG_SIZE_PG_64K  (0x2UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_VNIC_PG_SIZE_PG_2M   (0x3UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_VNIC_PG_SIZE_PG_8M   (0x4UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_VNIC_PG_SIZE_PG_1G   (0x5UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_VNIC_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_VNIC_PG_SIZE_PG_1G
+	u8	stat_pg_size_stat_lvl;
+	#define FUNC_BACKING_STORE_CFG_REQ_STAT_LVL_MASK      0xfUL
+	#define FUNC_BACKING_STORE_CFG_REQ_STAT_LVL_SFT       0
+	#define FUNC_BACKING_STORE_CFG_REQ_STAT_LVL_LVL_0       0x0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_STAT_LVL_LVL_1       0x1UL
+	#define FUNC_BACKING_STORE_CFG_REQ_STAT_LVL_LVL_2       0x2UL
+	#define FUNC_BACKING_STORE_CFG_REQ_STAT_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_STAT_LVL_LVL_2
+	#define FUNC_BACKING_STORE_CFG_REQ_STAT_PG_SIZE_MASK  0xf0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_STAT_PG_SIZE_SFT   4
+	#define FUNC_BACKING_STORE_CFG_REQ_STAT_PG_SIZE_PG_4K   (0x0UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_STAT_PG_SIZE_PG_8K   (0x1UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_STAT_PG_SIZE_PG_64K  (0x2UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_STAT_PG_SIZE_PG_2M   (0x3UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_STAT_PG_SIZE_PG_8M   (0x4UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_STAT_PG_SIZE_PG_1G   (0x5UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_STAT_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_STAT_PG_SIZE_PG_1G
+	u8	tqm_sp_pg_size_tqm_sp_lvl;
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_SP_LVL_MASK      0xfUL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_SP_LVL_SFT       0
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_SP_LVL_LVL_0       0x0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_SP_LVL_LVL_1       0x1UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_SP_LVL_LVL_2       0x2UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_SP_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_TQM_SP_LVL_LVL_2
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_SP_PG_SIZE_MASK  0xf0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_SP_PG_SIZE_SFT   4
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_SP_PG_SIZE_PG_4K   (0x0UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_SP_PG_SIZE_PG_8K   (0x1UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_SP_PG_SIZE_PG_64K  (0x2UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_SP_PG_SIZE_PG_2M   (0x3UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_SP_PG_SIZE_PG_8M   (0x4UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_SP_PG_SIZE_PG_1G   (0x5UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_SP_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_TQM_SP_PG_SIZE_PG_1G
+	u8	tqm_ring0_pg_size_tqm_ring0_lvl;
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING0_LVL_MASK      0xfUL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING0_LVL_SFT       0
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING0_LVL_LVL_0       0x0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING0_LVL_LVL_1       0x1UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING0_LVL_LVL_2       0x2UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING0_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_TQM_RING0_LVL_LVL_2
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING0_PG_SIZE_MASK  0xf0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING0_PG_SIZE_SFT   4
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING0_PG_SIZE_PG_4K   (0x0UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING0_PG_SIZE_PG_8K   (0x1UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING0_PG_SIZE_PG_64K  (0x2UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING0_PG_SIZE_PG_2M   (0x3UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING0_PG_SIZE_PG_8M   (0x4UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING0_PG_SIZE_PG_1G   (0x5UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING0_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_TQM_RING0_PG_SIZE_PG_1G
+	u8	tqm_ring1_pg_size_tqm_ring1_lvl;
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING1_LVL_MASK      0xfUL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING1_LVL_SFT       0
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING1_LVL_LVL_0       0x0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING1_LVL_LVL_1       0x1UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING1_LVL_LVL_2       0x2UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING1_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_TQM_RING1_LVL_LVL_2
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING1_PG_SIZE_MASK  0xf0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING1_PG_SIZE_SFT   4
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING1_PG_SIZE_PG_4K   (0x0UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING1_PG_SIZE_PG_8K   (0x1UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING1_PG_SIZE_PG_64K  (0x2UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING1_PG_SIZE_PG_2M   (0x3UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING1_PG_SIZE_PG_8M   (0x4UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING1_PG_SIZE_PG_1G   (0x5UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING1_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_TQM_RING1_PG_SIZE_PG_1G
+	u8	tqm_ring2_pg_size_tqm_ring2_lvl;
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING2_LVL_MASK      0xfUL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING2_LVL_SFT       0
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING2_LVL_LVL_0       0x0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING2_LVL_LVL_1       0x1UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING2_LVL_LVL_2       0x2UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING2_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_TQM_RING2_LVL_LVL_2
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING2_PG_SIZE_MASK  0xf0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING2_PG_SIZE_SFT   4
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING2_PG_SIZE_PG_4K   (0x0UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING2_PG_SIZE_PG_8K   (0x1UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING2_PG_SIZE_PG_64K  (0x2UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING2_PG_SIZE_PG_2M   (0x3UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING2_PG_SIZE_PG_8M   (0x4UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING2_PG_SIZE_PG_1G   (0x5UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING2_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_TQM_RING2_PG_SIZE_PG_1G
+	u8	tqm_ring3_pg_size_tqm_ring3_lvl;
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING3_LVL_MASK      0xfUL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING3_LVL_SFT       0
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING3_LVL_LVL_0       0x0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING3_LVL_LVL_1       0x1UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING3_LVL_LVL_2       0x2UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING3_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_TQM_RING3_LVL_LVL_2
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING3_PG_SIZE_MASK  0xf0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING3_PG_SIZE_SFT   4
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING3_PG_SIZE_PG_4K   (0x0UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING3_PG_SIZE_PG_8K   (0x1UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING3_PG_SIZE_PG_64K  (0x2UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING3_PG_SIZE_PG_2M   (0x3UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING3_PG_SIZE_PG_8M   (0x4UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING3_PG_SIZE_PG_1G   (0x5UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING3_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_TQM_RING3_PG_SIZE_PG_1G
+	u8	tqm_ring4_pg_size_tqm_ring4_lvl;
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING4_LVL_MASK      0xfUL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING4_LVL_SFT       0
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING4_LVL_LVL_0       0x0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING4_LVL_LVL_1       0x1UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING4_LVL_LVL_2       0x2UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING4_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_TQM_RING4_LVL_LVL_2
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING4_PG_SIZE_MASK  0xf0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING4_PG_SIZE_SFT   4
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING4_PG_SIZE_PG_4K   (0x0UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING4_PG_SIZE_PG_8K   (0x1UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING4_PG_SIZE_PG_64K  (0x2UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING4_PG_SIZE_PG_2M   (0x3UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING4_PG_SIZE_PG_8M   (0x4UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING4_PG_SIZE_PG_1G   (0x5UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING4_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_TQM_RING4_PG_SIZE_PG_1G
+	u8	tqm_ring5_pg_size_tqm_ring5_lvl;
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING5_LVL_MASK      0xfUL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING5_LVL_SFT       0
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING5_LVL_LVL_0       0x0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING5_LVL_LVL_1       0x1UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING5_LVL_LVL_2       0x2UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING5_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_TQM_RING5_LVL_LVL_2
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING5_PG_SIZE_MASK  0xf0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING5_PG_SIZE_SFT   4
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING5_PG_SIZE_PG_4K   (0x0UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING5_PG_SIZE_PG_8K   (0x1UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING5_PG_SIZE_PG_64K  (0x2UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING5_PG_SIZE_PG_2M   (0x3UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING5_PG_SIZE_PG_8M   (0x4UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING5_PG_SIZE_PG_1G   (0x5UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING5_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_TQM_RING5_PG_SIZE_PG_1G
+	u8	tqm_ring6_pg_size_tqm_ring6_lvl;
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING6_LVL_MASK      0xfUL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING6_LVL_SFT       0
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING6_LVL_LVL_0       0x0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING6_LVL_LVL_1       0x1UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING6_LVL_LVL_2       0x2UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING6_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_TQM_RING6_LVL_LVL_2
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING6_PG_SIZE_MASK  0xf0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING6_PG_SIZE_SFT   4
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING6_PG_SIZE_PG_4K   (0x0UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING6_PG_SIZE_PG_8K   (0x1UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING6_PG_SIZE_PG_64K  (0x2UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING6_PG_SIZE_PG_2M   (0x3UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING6_PG_SIZE_PG_8M   (0x4UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING6_PG_SIZE_PG_1G   (0x5UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING6_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_TQM_RING6_PG_SIZE_PG_1G
+	u8	tqm_ring7_pg_size_tqm_ring7_lvl;
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING7_LVL_MASK      0xfUL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING7_LVL_SFT       0
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING7_LVL_LVL_0       0x0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING7_LVL_LVL_1       0x1UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING7_LVL_LVL_2       0x2UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING7_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_TQM_RING7_LVL_LVL_2
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING7_PG_SIZE_MASK  0xf0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING7_PG_SIZE_SFT   4
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING7_PG_SIZE_PG_4K   (0x0UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING7_PG_SIZE_PG_8K   (0x1UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING7_PG_SIZE_PG_64K  (0x2UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING7_PG_SIZE_PG_2M   (0x3UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING7_PG_SIZE_PG_8M   (0x4UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING7_PG_SIZE_PG_1G   (0x5UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TQM_RING7_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_TQM_RING7_PG_SIZE_PG_1G
+	u8	mrav_pg_size_mrav_lvl;
+	#define FUNC_BACKING_STORE_CFG_REQ_MRAV_LVL_MASK      0xfUL
+	#define FUNC_BACKING_STORE_CFG_REQ_MRAV_LVL_SFT       0
+	#define FUNC_BACKING_STORE_CFG_REQ_MRAV_LVL_LVL_0       0x0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_MRAV_LVL_LVL_1       0x1UL
+	#define FUNC_BACKING_STORE_CFG_REQ_MRAV_LVL_LVL_2       0x2UL
+	#define FUNC_BACKING_STORE_CFG_REQ_MRAV_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_MRAV_LVL_LVL_2
+	#define FUNC_BACKING_STORE_CFG_REQ_MRAV_PG_SIZE_MASK  0xf0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_MRAV_PG_SIZE_SFT   4
+	#define FUNC_BACKING_STORE_CFG_REQ_MRAV_PG_SIZE_PG_4K   (0x0UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_MRAV_PG_SIZE_PG_8K   (0x1UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_MRAV_PG_SIZE_PG_64K  (0x2UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_MRAV_PG_SIZE_PG_2M   (0x3UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_MRAV_PG_SIZE_PG_8M   (0x4UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_MRAV_PG_SIZE_PG_1G   (0x5UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_MRAV_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_MRAV_PG_SIZE_PG_1G
+	u8	tim_pg_size_tim_lvl;
+	#define FUNC_BACKING_STORE_CFG_REQ_TIM_LVL_MASK      0xfUL
+	#define FUNC_BACKING_STORE_CFG_REQ_TIM_LVL_SFT       0
+	#define FUNC_BACKING_STORE_CFG_REQ_TIM_LVL_LVL_0       0x0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TIM_LVL_LVL_1       0x1UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TIM_LVL_LVL_2       0x2UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TIM_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_TIM_LVL_LVL_2
+	#define FUNC_BACKING_STORE_CFG_REQ_TIM_PG_SIZE_MASK  0xf0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TIM_PG_SIZE_SFT   4
+	#define FUNC_BACKING_STORE_CFG_REQ_TIM_PG_SIZE_PG_4K   (0x0UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TIM_PG_SIZE_PG_8K   (0x1UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TIM_PG_SIZE_PG_64K  (0x2UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TIM_PG_SIZE_PG_2M   (0x3UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TIM_PG_SIZE_PG_8M   (0x4UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TIM_PG_SIZE_PG_1G   (0x5UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TIM_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_TIM_PG_SIZE_PG_1G
+	__le64	qpc_page_dir;
+	__le64	srq_page_dir;
+	__le64	cq_page_dir;
+	__le64	vnic_page_dir;
+	__le64	stat_page_dir;
+	__le64	tqm_sp_page_dir;
+	__le64	tqm_ring0_page_dir;
+	__le64	tqm_ring1_page_dir;
+	__le64	tqm_ring2_page_dir;
+	__le64	tqm_ring3_page_dir;
+	__le64	tqm_ring4_page_dir;
+	__le64	tqm_ring5_page_dir;
+	__le64	tqm_ring6_page_dir;
+	__le64	tqm_ring7_page_dir;
+	__le64	mrav_page_dir;
+	__le64	tim_page_dir;
+	__le32	qp_num_entries;
+	__le32	srq_num_entries;
+	__le32	cq_num_entries;
+	__le32	stat_num_entries;
+	__le32	tqm_sp_num_entries;
+	__le32	tqm_ring0_num_entries;
+	__le32	tqm_ring1_num_entries;
+	__le32	tqm_ring2_num_entries;
+	__le32	tqm_ring3_num_entries;
+	__le32	tqm_ring4_num_entries;
+	__le32	tqm_ring5_num_entries;
+	__le32	tqm_ring6_num_entries;
+	__le32	tqm_ring7_num_entries;
+	__le32	mrav_num_entries;
+	__le32	tim_num_entries;
+	__le16	qp_num_qp1_entries;
+	__le16	qp_num_l2_entries;
+	__le16	qp_entry_size;
+	__le16	srq_num_l2_entries;
+	__le16	srq_entry_size;
+	__le16	cq_num_l2_entries;
+	__le16	cq_entry_size;
+	__le16	vnic_num_vnic_entries;
+	__le16	vnic_num_ring_table_entries;
+	__le16	vnic_entry_size;
+	__le16	stat_entry_size;
+	__le16	tqm_entry_size;
+	__le16	mrav_entry_size;
+	__le16	tim_entry_size;
+	u8	tqm_ring8_pg_size_tqm_ring_lvl;
+	#define FUNC_BACKING_STORE_CFG_REQ_RING8_TQM_RING_LVL_MASK      0xfUL
+	#define FUNC_BACKING_STORE_CFG_REQ_RING8_TQM_RING_LVL_SFT       0
+	#define FUNC_BACKING_STORE_CFG_REQ_RING8_TQM_RING_LVL_LVL_0       0x0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_RING8_TQM_RING_LVL_LVL_1       0x1UL
+	#define FUNC_BACKING_STORE_CFG_REQ_RING8_TQM_RING_LVL_LVL_2       0x2UL
+	#define FUNC_BACKING_STORE_CFG_REQ_RING8_TQM_RING_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_RING8_TQM_RING_LVL_LVL_2
+	#define FUNC_BACKING_STORE_CFG_REQ_RING8_TQM_RING_PG_SIZE_MASK  0xf0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_RING8_TQM_RING_PG_SIZE_SFT   4
+	#define FUNC_BACKING_STORE_CFG_REQ_RING8_TQM_RING_PG_SIZE_PG_4K   (0x0UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_RING8_TQM_RING_PG_SIZE_PG_8K   (0x1UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_RING8_TQM_RING_PG_SIZE_PG_64K  (0x2UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_RING8_TQM_RING_PG_SIZE_PG_2M   (0x3UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_RING8_TQM_RING_PG_SIZE_PG_8M   (0x4UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_RING8_TQM_RING_PG_SIZE_PG_1G   (0x5UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_RING8_TQM_RING_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_RING8_TQM_RING_PG_SIZE_PG_1G
+	u8	ring8_unused[3];
+	__le32	tqm_ring8_num_entries;
+	__le64	tqm_ring8_page_dir;
+	u8	tqm_ring9_pg_size_tqm_ring_lvl;
+	#define FUNC_BACKING_STORE_CFG_REQ_RING9_TQM_RING_LVL_MASK      0xfUL
+	#define FUNC_BACKING_STORE_CFG_REQ_RING9_TQM_RING_LVL_SFT       0
+	#define FUNC_BACKING_STORE_CFG_REQ_RING9_TQM_RING_LVL_LVL_0       0x0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_RING9_TQM_RING_LVL_LVL_1       0x1UL
+	#define FUNC_BACKING_STORE_CFG_REQ_RING9_TQM_RING_LVL_LVL_2       0x2UL
+	#define FUNC_BACKING_STORE_CFG_REQ_RING9_TQM_RING_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_RING9_TQM_RING_LVL_LVL_2
+	#define FUNC_BACKING_STORE_CFG_REQ_RING9_TQM_RING_PG_SIZE_MASK  0xf0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_RING9_TQM_RING_PG_SIZE_SFT   4
+	#define FUNC_BACKING_STORE_CFG_REQ_RING9_TQM_RING_PG_SIZE_PG_4K   (0x0UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_RING9_TQM_RING_PG_SIZE_PG_8K   (0x1UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_RING9_TQM_RING_PG_SIZE_PG_64K  (0x2UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_RING9_TQM_RING_PG_SIZE_PG_2M   (0x3UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_RING9_TQM_RING_PG_SIZE_PG_8M   (0x4UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_RING9_TQM_RING_PG_SIZE_PG_1G   (0x5UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_RING9_TQM_RING_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_RING9_TQM_RING_PG_SIZE_PG_1G
+	u8	ring9_unused[3];
+	__le32	tqm_ring9_num_entries;
+	__le64	tqm_ring9_page_dir;
+	u8	tqm_ring10_pg_size_tqm_ring_lvl;
+	#define FUNC_BACKING_STORE_CFG_REQ_RING10_TQM_RING_LVL_MASK      0xfUL
+	#define FUNC_BACKING_STORE_CFG_REQ_RING10_TQM_RING_LVL_SFT       0
+	#define FUNC_BACKING_STORE_CFG_REQ_RING10_TQM_RING_LVL_LVL_0       0x0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_RING10_TQM_RING_LVL_LVL_1       0x1UL
+	#define FUNC_BACKING_STORE_CFG_REQ_RING10_TQM_RING_LVL_LVL_2       0x2UL
+	#define FUNC_BACKING_STORE_CFG_REQ_RING10_TQM_RING_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_RING10_TQM_RING_LVL_LVL_2
+	#define FUNC_BACKING_STORE_CFG_REQ_RING10_TQM_RING_PG_SIZE_MASK  0xf0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_RING10_TQM_RING_PG_SIZE_SFT   4
+	#define FUNC_BACKING_STORE_CFG_REQ_RING10_TQM_RING_PG_SIZE_PG_4K   (0x0UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_RING10_TQM_RING_PG_SIZE_PG_8K   (0x1UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_RING10_TQM_RING_PG_SIZE_PG_64K  (0x2UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_RING10_TQM_RING_PG_SIZE_PG_2M   (0x3UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_RING10_TQM_RING_PG_SIZE_PG_8M   (0x4UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_RING10_TQM_RING_PG_SIZE_PG_1G   (0x5UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_RING10_TQM_RING_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_RING10_TQM_RING_PG_SIZE_PG_1G
+	u8	ring10_unused[3];
+	__le32	tqm_ring10_num_entries;
+	__le64	tqm_ring10_page_dir;
+	__le32	tkc_num_entries;
+	__le32	rkc_num_entries;
+	__le64	tkc_page_dir;
+	__le64	rkc_page_dir;
+	__le16	tkc_entry_size;
+	__le16	rkc_entry_size;
+	u8	tkc_pg_size_tkc_lvl;
+	#define FUNC_BACKING_STORE_CFG_REQ_TKC_LVL_MASK      0xfUL
+	#define FUNC_BACKING_STORE_CFG_REQ_TKC_LVL_SFT       0
+	#define FUNC_BACKING_STORE_CFG_REQ_TKC_LVL_LVL_0       0x0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TKC_LVL_LVL_1       0x1UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TKC_LVL_LVL_2       0x2UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TKC_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_TKC_LVL_LVL_2
+	#define FUNC_BACKING_STORE_CFG_REQ_TKC_PG_SIZE_MASK  0xf0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_TKC_PG_SIZE_SFT   4
+	#define FUNC_BACKING_STORE_CFG_REQ_TKC_PG_SIZE_PG_4K   (0x0UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TKC_PG_SIZE_PG_8K   (0x1UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TKC_PG_SIZE_PG_64K  (0x2UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TKC_PG_SIZE_PG_2M   (0x3UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TKC_PG_SIZE_PG_8M   (0x4UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TKC_PG_SIZE_PG_1G   (0x5UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_TKC_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_TKC_PG_SIZE_PG_1G
+	u8	rkc_pg_size_rkc_lvl;
+	#define FUNC_BACKING_STORE_CFG_REQ_RKC_LVL_MASK      0xfUL
+	#define FUNC_BACKING_STORE_CFG_REQ_RKC_LVL_SFT       0
+	#define FUNC_BACKING_STORE_CFG_REQ_RKC_LVL_LVL_0       0x0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_RKC_LVL_LVL_1       0x1UL
+	#define FUNC_BACKING_STORE_CFG_REQ_RKC_LVL_LVL_2       0x2UL
+	#define FUNC_BACKING_STORE_CFG_REQ_RKC_LVL_LAST       FUNC_BACKING_STORE_CFG_REQ_RKC_LVL_LVL_2
+	#define FUNC_BACKING_STORE_CFG_REQ_RKC_PG_SIZE_MASK  0xf0UL
+	#define FUNC_BACKING_STORE_CFG_REQ_RKC_PG_SIZE_SFT   4
+	#define FUNC_BACKING_STORE_CFG_REQ_RKC_PG_SIZE_PG_4K   (0x0UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_RKC_PG_SIZE_PG_8K   (0x1UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_RKC_PG_SIZE_PG_64K  (0x2UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_RKC_PG_SIZE_PG_2M   (0x3UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_RKC_PG_SIZE_PG_8M   (0x4UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_RKC_PG_SIZE_PG_1G   (0x5UL << 4)
+	#define FUNC_BACKING_STORE_CFG_REQ_RKC_PG_SIZE_LAST   FUNC_BACKING_STORE_CFG_REQ_RKC_PG_SIZE_PG_1G
+	__le16	qp_num_fast_qpmd_entries;
+};
+
+/* hwrm_func_backing_store_cfg_output (size:128b/16B): four __le16 header words, 7 unused bytes, then the valid byte */
+struct hwrm_func_backing_store_cfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];	/* fills bytes 8..14 so that valid is the 16th (last) byte */
+	u8	valid;
+};
+
+/* hwrm_error_recovery_qcfg_input (size:192b/24B): 16 bytes of request header fields followed by 8 unused bytes */
+struct hwrm_error_recovery_qcfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	u8	unused_0[8];	/* no command-specific fields; brings the struct to 24 bytes */
+};
+
+/* hwrm_error_recovery_qcfg_output (size:1664b/208B): timing parameters plus register descriptors; each *_reg word packs an address-space selector and an address */
+struct hwrm_error_recovery_qcfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le32	flags;	/* bit flags: HOST (0x1), CO_CPU (0x2) */
+	#define ERROR_RECOVERY_QCFG_RESP_FLAGS_HOST       0x1UL
+	#define ERROR_RECOVERY_QCFG_RESP_FLAGS_CO_CPU     0x2UL
+	__le32	driver_polling_freq;
+	__le32	master_func_wait_period;
+	__le32	normal_func_wait_period;
+	__le32	master_func_wait_period_after_reset;
+	__le32	max_bailout_time_after_reset;
+	__le32	fw_health_status_reg;	/* bits 1:0 = ADDR_SPACE selector, bits 31:2 = ADDR field */
+	#define ERROR_RECOVERY_QCFG_RESP_FW_HEALTH_STATUS_REG_ADDR_SPACE_MASK    0x3UL
+	#define ERROR_RECOVERY_QCFG_RESP_FW_HEALTH_STATUS_REG_ADDR_SPACE_SFT     0
+	#define ERROR_RECOVERY_QCFG_RESP_FW_HEALTH_STATUS_REG_ADDR_SPACE_PCIE_CFG  0x0UL
+	#define ERROR_RECOVERY_QCFG_RESP_FW_HEALTH_STATUS_REG_ADDR_SPACE_GRC       0x1UL
+	#define ERROR_RECOVERY_QCFG_RESP_FW_HEALTH_STATUS_REG_ADDR_SPACE_BAR0      0x2UL
+	#define ERROR_RECOVERY_QCFG_RESP_FW_HEALTH_STATUS_REG_ADDR_SPACE_BAR1      0x3UL
+	#define ERROR_RECOVERY_QCFG_RESP_FW_HEALTH_STATUS_REG_ADDR_SPACE_LAST     ERROR_RECOVERY_QCFG_RESP_FW_HEALTH_STATUS_REG_ADDR_SPACE_BAR1
+	#define ERROR_RECOVERY_QCFG_RESP_FW_HEALTH_STATUS_REG_ADDR_MASK          0xfffffffcUL
+	#define ERROR_RECOVERY_QCFG_RESP_FW_HEALTH_STATUS_REG_ADDR_SFT           2
+	__le32	fw_heartbeat_reg;	/* bits 1:0 = ADDR_SPACE selector, bits 31:2 = ADDR field */
+	#define ERROR_RECOVERY_QCFG_RESP_FW_HEARTBEAT_REG_ADDR_SPACE_MASK    0x3UL
+	#define ERROR_RECOVERY_QCFG_RESP_FW_HEARTBEAT_REG_ADDR_SPACE_SFT     0
+	#define ERROR_RECOVERY_QCFG_RESP_FW_HEARTBEAT_REG_ADDR_SPACE_PCIE_CFG  0x0UL
+	#define ERROR_RECOVERY_QCFG_RESP_FW_HEARTBEAT_REG_ADDR_SPACE_GRC       0x1UL
+	#define ERROR_RECOVERY_QCFG_RESP_FW_HEARTBEAT_REG_ADDR_SPACE_BAR0      0x2UL
+	#define ERROR_RECOVERY_QCFG_RESP_FW_HEARTBEAT_REG_ADDR_SPACE_BAR1      0x3UL
+	#define ERROR_RECOVERY_QCFG_RESP_FW_HEARTBEAT_REG_ADDR_SPACE_LAST     ERROR_RECOVERY_QCFG_RESP_FW_HEARTBEAT_REG_ADDR_SPACE_BAR1
+	#define ERROR_RECOVERY_QCFG_RESP_FW_HEARTBEAT_REG_ADDR_MASK          0xfffffffcUL
+	#define ERROR_RECOVERY_QCFG_RESP_FW_HEARTBEAT_REG_ADDR_SFT           2
+	__le32	fw_reset_cnt_reg;	/* bits 1:0 = ADDR_SPACE selector, bits 31:2 = ADDR field */
+	#define ERROR_RECOVERY_QCFG_RESP_FW_RESET_CNT_REG_ADDR_SPACE_MASK    0x3UL
+	#define ERROR_RECOVERY_QCFG_RESP_FW_RESET_CNT_REG_ADDR_SPACE_SFT     0
+	#define ERROR_RECOVERY_QCFG_RESP_FW_RESET_CNT_REG_ADDR_SPACE_PCIE_CFG  0x0UL
+	#define ERROR_RECOVERY_QCFG_RESP_FW_RESET_CNT_REG_ADDR_SPACE_GRC       0x1UL
+	#define ERROR_RECOVERY_QCFG_RESP_FW_RESET_CNT_REG_ADDR_SPACE_BAR0      0x2UL
+	#define ERROR_RECOVERY_QCFG_RESP_FW_RESET_CNT_REG_ADDR_SPACE_BAR1      0x3UL
+	#define ERROR_RECOVERY_QCFG_RESP_FW_RESET_CNT_REG_ADDR_SPACE_LAST     ERROR_RECOVERY_QCFG_RESP_FW_RESET_CNT_REG_ADDR_SPACE_BAR1
+	#define ERROR_RECOVERY_QCFG_RESP_FW_RESET_CNT_REG_ADDR_MASK          0xfffffffcUL
+	#define ERROR_RECOVERY_QCFG_RESP_FW_RESET_CNT_REG_ADDR_SFT           2
+	__le32	reset_inprogress_reg;	/* bits 1:0 = ADDR_SPACE selector, bits 31:2 = ADDR field */
+	#define ERROR_RECOVERY_QCFG_RESP_RESET_INPROGRESS_REG_ADDR_SPACE_MASK    0x3UL
+	#define ERROR_RECOVERY_QCFG_RESP_RESET_INPROGRESS_REG_ADDR_SPACE_SFT     0
+	#define ERROR_RECOVERY_QCFG_RESP_RESET_INPROGRESS_REG_ADDR_SPACE_PCIE_CFG  0x0UL
+	#define ERROR_RECOVERY_QCFG_RESP_RESET_INPROGRESS_REG_ADDR_SPACE_GRC       0x1UL
+	#define ERROR_RECOVERY_QCFG_RESP_RESET_INPROGRESS_REG_ADDR_SPACE_BAR0      0x2UL
+	#define ERROR_RECOVERY_QCFG_RESP_RESET_INPROGRESS_REG_ADDR_SPACE_BAR1      0x3UL
+	#define ERROR_RECOVERY_QCFG_RESP_RESET_INPROGRESS_REG_ADDR_SPACE_LAST     ERROR_RECOVERY_QCFG_RESP_RESET_INPROGRESS_REG_ADDR_SPACE_BAR1
+	#define ERROR_RECOVERY_QCFG_RESP_RESET_INPROGRESS_REG_ADDR_MASK          0xfffffffcUL
+	#define ERROR_RECOVERY_QCFG_RESP_RESET_INPROGRESS_REG_ADDR_SFT           2
+	__le32	reset_inprogress_reg_mask;
+	u8	unused_0[3];
+	u8	reg_array_cnt;	/* NOTE(review): presumably the number of used entries in the 16-element arrays below - confirm */
+	__le32	reset_reg[16];	/* NOTE(review): presumably each element uses the RESET_REG ADDR_SPACE/ADDR encoding below - confirm */
+	#define ERROR_RECOVERY_QCFG_RESP_RESET_REG_ADDR_SPACE_MASK    0x3UL
+	#define ERROR_RECOVERY_QCFG_RESP_RESET_REG_ADDR_SPACE_SFT     0
+	#define ERROR_RECOVERY_QCFG_RESP_RESET_REG_ADDR_SPACE_PCIE_CFG  0x0UL
+	#define ERROR_RECOVERY_QCFG_RESP_RESET_REG_ADDR_SPACE_GRC       0x1UL
+	#define ERROR_RECOVERY_QCFG_RESP_RESET_REG_ADDR_SPACE_BAR0      0x2UL
+	#define ERROR_RECOVERY_QCFG_RESP_RESET_REG_ADDR_SPACE_BAR1      0x3UL
+	#define ERROR_RECOVERY_QCFG_RESP_RESET_REG_ADDR_SPACE_LAST     ERROR_RECOVERY_QCFG_RESP_RESET_REG_ADDR_SPACE_BAR1
+	#define ERROR_RECOVERY_QCFG_RESP_RESET_REG_ADDR_MASK          0xfffffffcUL
+	#define ERROR_RECOVERY_QCFG_RESP_RESET_REG_ADDR_SFT           2
+	__le32	reset_reg_val[16];	/* same element count as reset_reg[] */
+	u8	delay_after_reset[16];	/* same element count as reset_reg[]; units are not stated here */
+	__le32	err_recovery_cnt_reg;	/* bits 1:0 = ADDR_SPACE selector, bits 31:2 = ADDR field */
+	#define ERROR_RECOVERY_QCFG_RESP_ERR_RECOVERY_CNT_REG_ADDR_SPACE_MASK    0x3UL
+	#define ERROR_RECOVERY_QCFG_RESP_ERR_RECOVERY_CNT_REG_ADDR_SPACE_SFT     0
+	#define ERROR_RECOVERY_QCFG_RESP_ERR_RECOVERY_CNT_REG_ADDR_SPACE_PCIE_CFG  0x0UL
+	#define ERROR_RECOVERY_QCFG_RESP_ERR_RECOVERY_CNT_REG_ADDR_SPACE_GRC       0x1UL
+	#define ERROR_RECOVERY_QCFG_RESP_ERR_RECOVERY_CNT_REG_ADDR_SPACE_BAR0      0x2UL
+	#define ERROR_RECOVERY_QCFG_RESP_ERR_RECOVERY_CNT_REG_ADDR_SPACE_BAR1      0x3UL
+	#define ERROR_RECOVERY_QCFG_RESP_ERR_RECOVERY_CNT_REG_ADDR_SPACE_LAST     ERROR_RECOVERY_QCFG_RESP_ERR_RECOVERY_CNT_REG_ADDR_SPACE_BAR1
+	#define ERROR_RECOVERY_QCFG_RESP_ERR_RECOVERY_CNT_REG_ADDR_MASK          0xfffffffcUL
+	#define ERROR_RECOVERY_QCFG_RESP_ERR_RECOVERY_CNT_REG_ADDR_SFT           2
+	u8	unused_1[3];	/* fills bytes 204..206 so that valid is the 208th (last) byte */
+	u8	valid;
+};
+
+/* hwrm_func_echo_response_input (size:192b/24B): 16 bytes of request header fields followed by two 32-bit event data words */
+struct hwrm_func_echo_response_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	event_data1;
+	__le32	event_data2;
+};
+
+/* hwrm_func_echo_response_output (size:128b/16B): four __le16 header words, 7 unused bytes, then the valid byte */
+struct hwrm_func_echo_response_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];	/* fills bytes 8..14 so that valid is the 16th (last) byte */
+	u8	valid;
+};
+
+/* hwrm_func_ptp_pin_qcfg_input (size:192b/24B): 16 bytes of request header fields followed by 8 unused bytes */
+struct hwrm_func_ptp_pin_qcfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	u8	unused_0[8];	/* no command-specific fields; brings the struct to 24 bytes */
+};
+
+/* hwrm_func_ptp_pin_qcfg_output (size:128b/16B): pin count, per-pin enabled bits, and one usage byte for each of pins 0-3 */
+struct hwrm_func_ptp_pin_qcfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	num_pins;
+	u8	state;	/* bit flags: one PINn_ENABLED bit per pin, bit 0 = pin 0 ... bit 3 = pin 3 */
+	#define FUNC_PTP_PIN_QCFG_RESP_STATE_PIN0_ENABLED     0x1UL
+	#define FUNC_PTP_PIN_QCFG_RESP_STATE_PIN1_ENABLED     0x2UL
+	#define FUNC_PTP_PIN_QCFG_RESP_STATE_PIN2_ENABLED     0x4UL
+	#define FUNC_PTP_PIN_QCFG_RESP_STATE_PIN3_ENABLED     0x8UL
+	u8	pin0_usage;	/* enumerated value, 0x0 (NONE) .. 0x4 (SYNC_OUT) */
+	#define FUNC_PTP_PIN_QCFG_RESP_PIN0_USAGE_NONE     0x0UL
+	#define FUNC_PTP_PIN_QCFG_RESP_PIN0_USAGE_PPS_IN   0x1UL
+	#define FUNC_PTP_PIN_QCFG_RESP_PIN0_USAGE_PPS_OUT  0x2UL
+	#define FUNC_PTP_PIN_QCFG_RESP_PIN0_USAGE_SYNC_IN  0x3UL
+	#define FUNC_PTP_PIN_QCFG_RESP_PIN0_USAGE_SYNC_OUT 0x4UL
+	#define FUNC_PTP_PIN_QCFG_RESP_PIN0_USAGE_LAST    FUNC_PTP_PIN_QCFG_RESP_PIN0_USAGE_SYNC_OUT
+	u8	pin1_usage;	/* enumerated value, 0x0 (NONE) .. 0x4 (SYNC_OUT) */
+	#define FUNC_PTP_PIN_QCFG_RESP_PIN1_USAGE_NONE     0x0UL
+	#define FUNC_PTP_PIN_QCFG_RESP_PIN1_USAGE_PPS_IN   0x1UL
+	#define FUNC_PTP_PIN_QCFG_RESP_PIN1_USAGE_PPS_OUT  0x2UL
+	#define FUNC_PTP_PIN_QCFG_RESP_PIN1_USAGE_SYNC_IN  0x3UL
+	#define FUNC_PTP_PIN_QCFG_RESP_PIN1_USAGE_SYNC_OUT 0x4UL
+	#define FUNC_PTP_PIN_QCFG_RESP_PIN1_USAGE_LAST    FUNC_PTP_PIN_QCFG_RESP_PIN1_USAGE_SYNC_OUT
+	u8	pin2_usage;	/* as pin0/pin1, plus SYNCE clock-out values 0x5 and 0x6 */
+	#define FUNC_PTP_PIN_QCFG_RESP_PIN2_USAGE_NONE                      0x0UL
+	#define FUNC_PTP_PIN_QCFG_RESP_PIN2_USAGE_PPS_IN                    0x1UL
+	#define FUNC_PTP_PIN_QCFG_RESP_PIN2_USAGE_PPS_OUT                   0x2UL
+	#define FUNC_PTP_PIN_QCFG_RESP_PIN2_USAGE_SYNC_IN                   0x3UL
+	#define FUNC_PTP_PIN_QCFG_RESP_PIN2_USAGE_SYNC_OUT                  0x4UL
+	#define FUNC_PTP_PIN_QCFG_RESP_PIN2_USAGE_SYNCE_PRIMARY_CLOCK_OUT   0x5UL
+	#define FUNC_PTP_PIN_QCFG_RESP_PIN2_USAGE_SYNCE_SECONDARY_CLOCK_OUT 0x6UL
+	#define FUNC_PTP_PIN_QCFG_RESP_PIN2_USAGE_LAST                     FUNC_PTP_PIN_QCFG_RESP_PIN2_USAGE_SYNCE_SECONDARY_CLOCK_OUT
+	u8	pin3_usage;	/* as pin0/pin1, plus SYNCE clock-out values 0x5 and 0x6 */
+	#define FUNC_PTP_PIN_QCFG_RESP_PIN3_USAGE_NONE                      0x0UL
+	#define FUNC_PTP_PIN_QCFG_RESP_PIN3_USAGE_PPS_IN                    0x1UL
+	#define FUNC_PTP_PIN_QCFG_RESP_PIN3_USAGE_PPS_OUT                   0x2UL
+	#define FUNC_PTP_PIN_QCFG_RESP_PIN3_USAGE_SYNC_IN                   0x3UL
+	#define FUNC_PTP_PIN_QCFG_RESP_PIN3_USAGE_SYNC_OUT                  0x4UL
+	#define FUNC_PTP_PIN_QCFG_RESP_PIN3_USAGE_SYNCE_PRIMARY_CLOCK_OUT   0x5UL
+	#define FUNC_PTP_PIN_QCFG_RESP_PIN3_USAGE_SYNCE_SECONDARY_CLOCK_OUT 0x6UL
+	#define FUNC_PTP_PIN_QCFG_RESP_PIN3_USAGE_LAST                     FUNC_PTP_PIN_QCFG_RESP_PIN3_USAGE_SYNCE_SECONDARY_CLOCK_OUT
+	u8	unused_0;	/* fills byte 14 so that valid is the 16th (last) byte */
+	u8	valid;
+};
+
+/* hwrm_func_ptp_pin_cfg_input (size:256b/32B): enables bitmask followed by a state byte and a usage byte for each of pins 0-3 */
+struct hwrm_func_ptp_pin_cfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	enables;	/* one STATE bit and one USAGE bit per pin; NOTE(review): presumably selects which fields below are applied - confirm */
+	#define FUNC_PTP_PIN_CFG_REQ_ENABLES_PIN0_STATE     0x1UL
+	#define FUNC_PTP_PIN_CFG_REQ_ENABLES_PIN0_USAGE     0x2UL
+	#define FUNC_PTP_PIN_CFG_REQ_ENABLES_PIN1_STATE     0x4UL
+	#define FUNC_PTP_PIN_CFG_REQ_ENABLES_PIN1_USAGE     0x8UL
+	#define FUNC_PTP_PIN_CFG_REQ_ENABLES_PIN2_STATE     0x10UL
+	#define FUNC_PTP_PIN_CFG_REQ_ENABLES_PIN2_USAGE     0x20UL
+	#define FUNC_PTP_PIN_CFG_REQ_ENABLES_PIN3_STATE     0x40UL
+	#define FUNC_PTP_PIN_CFG_REQ_ENABLES_PIN3_USAGE     0x80UL
+	u8	pin0_state;	/* 0x0 = DISABLED, 0x1 = ENABLED */
+	#define FUNC_PTP_PIN_CFG_REQ_PIN0_STATE_DISABLED 0x0UL
+	#define FUNC_PTP_PIN_CFG_REQ_PIN0_STATE_ENABLED  0x1UL
+	#define FUNC_PTP_PIN_CFG_REQ_PIN0_STATE_LAST    FUNC_PTP_PIN_CFG_REQ_PIN0_STATE_ENABLED
+	u8	pin0_usage;	/* enumerated value, 0x0 (NONE) .. 0x4 (SYNC_OUT) */
+	#define FUNC_PTP_PIN_CFG_REQ_PIN0_USAGE_NONE     0x0UL
+	#define FUNC_PTP_PIN_CFG_REQ_PIN0_USAGE_PPS_IN   0x1UL
+	#define FUNC_PTP_PIN_CFG_REQ_PIN0_USAGE_PPS_OUT  0x2UL
+	#define FUNC_PTP_PIN_CFG_REQ_PIN0_USAGE_SYNC_IN  0x3UL
+	#define FUNC_PTP_PIN_CFG_REQ_PIN0_USAGE_SYNC_OUT 0x4UL
+	#define FUNC_PTP_PIN_CFG_REQ_PIN0_USAGE_LAST    FUNC_PTP_PIN_CFG_REQ_PIN0_USAGE_SYNC_OUT
+	u8	pin1_state;	/* 0x0 = DISABLED, 0x1 = ENABLED */
+	#define FUNC_PTP_PIN_CFG_REQ_PIN1_STATE_DISABLED 0x0UL
+	#define FUNC_PTP_PIN_CFG_REQ_PIN1_STATE_ENABLED  0x1UL
+	#define FUNC_PTP_PIN_CFG_REQ_PIN1_STATE_LAST    FUNC_PTP_PIN_CFG_REQ_PIN1_STATE_ENABLED
+	u8	pin1_usage;	/* enumerated value, 0x0 (NONE) .. 0x4 (SYNC_OUT) */
+	#define FUNC_PTP_PIN_CFG_REQ_PIN1_USAGE_NONE     0x0UL
+	#define FUNC_PTP_PIN_CFG_REQ_PIN1_USAGE_PPS_IN   0x1UL
+	#define FUNC_PTP_PIN_CFG_REQ_PIN1_USAGE_PPS_OUT  0x2UL
+	#define FUNC_PTP_PIN_CFG_REQ_PIN1_USAGE_SYNC_IN  0x3UL
+	#define FUNC_PTP_PIN_CFG_REQ_PIN1_USAGE_SYNC_OUT 0x4UL
+	#define FUNC_PTP_PIN_CFG_REQ_PIN1_USAGE_LAST    FUNC_PTP_PIN_CFG_REQ_PIN1_USAGE_SYNC_OUT
+	u8	pin2_state;	/* 0x0 = DISABLED, 0x1 = ENABLED */
+	#define FUNC_PTP_PIN_CFG_REQ_PIN2_STATE_DISABLED 0x0UL
+	#define FUNC_PTP_PIN_CFG_REQ_PIN2_STATE_ENABLED  0x1UL
+	#define FUNC_PTP_PIN_CFG_REQ_PIN2_STATE_LAST    FUNC_PTP_PIN_CFG_REQ_PIN2_STATE_ENABLED
+	u8	pin2_usage;	/* as pin0/pin1, plus SYNCE clock-out values 0x5 and 0x6 */
+	#define FUNC_PTP_PIN_CFG_REQ_PIN2_USAGE_NONE                      0x0UL
+	#define FUNC_PTP_PIN_CFG_REQ_PIN2_USAGE_PPS_IN                    0x1UL
+	#define FUNC_PTP_PIN_CFG_REQ_PIN2_USAGE_PPS_OUT                   0x2UL
+	#define FUNC_PTP_PIN_CFG_REQ_PIN2_USAGE_SYNC_IN                   0x3UL
+	#define FUNC_PTP_PIN_CFG_REQ_PIN2_USAGE_SYNC_OUT                  0x4UL
+	#define FUNC_PTP_PIN_CFG_REQ_PIN2_USAGE_SYNCE_PRIMARY_CLOCK_OUT   0x5UL
+	#define FUNC_PTP_PIN_CFG_REQ_PIN2_USAGE_SYNCE_SECONDARY_CLOCK_OUT 0x6UL
+	#define FUNC_PTP_PIN_CFG_REQ_PIN2_USAGE_LAST                     FUNC_PTP_PIN_CFG_REQ_PIN2_USAGE_SYNCE_SECONDARY_CLOCK_OUT
+	u8	pin3_state;	/* 0x0 = DISABLED, 0x1 = ENABLED */
+	#define FUNC_PTP_PIN_CFG_REQ_PIN3_STATE_DISABLED 0x0UL
+	#define FUNC_PTP_PIN_CFG_REQ_PIN3_STATE_ENABLED  0x1UL
+	#define FUNC_PTP_PIN_CFG_REQ_PIN3_STATE_LAST    FUNC_PTP_PIN_CFG_REQ_PIN3_STATE_ENABLED
+	u8	pin3_usage;	/* as pin0/pin1, plus SYNCE clock-out values 0x5 and 0x6 */
+	#define FUNC_PTP_PIN_CFG_REQ_PIN3_USAGE_NONE                      0x0UL
+	#define FUNC_PTP_PIN_CFG_REQ_PIN3_USAGE_PPS_IN                    0x1UL
+	#define FUNC_PTP_PIN_CFG_REQ_PIN3_USAGE_PPS_OUT                   0x2UL
+	#define FUNC_PTP_PIN_CFG_REQ_PIN3_USAGE_SYNC_IN                   0x3UL
+	#define FUNC_PTP_PIN_CFG_REQ_PIN3_USAGE_SYNC_OUT                  0x4UL
+	#define FUNC_PTP_PIN_CFG_REQ_PIN3_USAGE_SYNCE_PRIMARY_CLOCK_OUT   0x5UL
+	#define FUNC_PTP_PIN_CFG_REQ_PIN3_USAGE_SYNCE_SECONDARY_CLOCK_OUT 0x6UL
+	#define FUNC_PTP_PIN_CFG_REQ_PIN3_USAGE_LAST                     FUNC_PTP_PIN_CFG_REQ_PIN3_USAGE_SYNCE_SECONDARY_CLOCK_OUT
+	u8	unused_0[4];	/* brings the struct to 32 bytes */
+};
+
+/* hwrm_func_ptp_pin_cfg_output (size:128b/16B): four __le16 header words, 7 unused bytes, then the valid byte */
+struct hwrm_func_ptp_pin_cfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];	/* fills bytes 8..14 so that valid is the 16th (last) byte */
+	u8	valid;
+};
+
+/* hwrm_func_ptp_cfg_input (size:384b/48B): enables bitmask, PPS event / DLL selectors, external frequency-adjust words, and a 64-bit set-time value */
+struct hwrm_func_ptp_cfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	enables;	/* bit flags 0x1..0x40, each named after a field below; note a single EXT_PHASE bit for the lower/upper pair */
+	#define FUNC_PTP_CFG_REQ_ENABLES_PTP_PPS_EVENT               0x1UL
+	#define FUNC_PTP_CFG_REQ_ENABLES_PTP_FREQ_ADJ_DLL_SOURCE     0x2UL
+	#define FUNC_PTP_CFG_REQ_ENABLES_PTP_FREQ_ADJ_DLL_PHASE      0x4UL
+	#define FUNC_PTP_CFG_REQ_ENABLES_PTP_FREQ_ADJ_EXT_PERIOD     0x8UL
+	#define FUNC_PTP_CFG_REQ_ENABLES_PTP_FREQ_ADJ_EXT_UP         0x10UL
+	#define FUNC_PTP_CFG_REQ_ENABLES_PTP_FREQ_ADJ_EXT_PHASE      0x20UL
+	#define FUNC_PTP_CFG_REQ_ENABLES_PTP_SET_TIME                0x40UL
+	u8	ptp_pps_event;	/* bit flags: INTERNAL (0x1), EXTERNAL (0x2); no _LAST enumerator is defined for this field */
+	#define FUNC_PTP_CFG_REQ_PTP_PPS_EVENT_INTERNAL     0x1UL
+	#define FUNC_PTP_CFG_REQ_PTP_PPS_EVENT_EXTERNAL     0x2UL
+	u8	ptp_freq_adj_dll_source;	/* enumerated: NONE, TSIO_0..3 (0x1-0x4), PORT_0..3 (0x5-0x8), INVALID (0xff) */
+	#define FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_SOURCE_NONE    0x0UL
+	#define FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_SOURCE_TSIO_0  0x1UL
+	#define FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_SOURCE_TSIO_1  0x2UL
+	#define FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_SOURCE_TSIO_2  0x3UL
+	#define FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_SOURCE_TSIO_3  0x4UL
+	#define FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_SOURCE_PORT_0  0x5UL
+	#define FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_SOURCE_PORT_1  0x6UL
+	#define FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_SOURCE_PORT_2  0x7UL
+	#define FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_SOURCE_PORT_3  0x8UL
+	#define FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_SOURCE_INVALID 0xffUL
+	#define FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_SOURCE_LAST   FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_SOURCE_INVALID
+	u8	ptp_freq_adj_dll_phase;	/* enumerated: NONE, 4K, 8K, 10M, 25M (0x0-0x4) */
+	#define FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_PHASE_NONE 0x0UL
+	#define FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_PHASE_4K   0x1UL
+	#define FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_PHASE_8K   0x2UL
+	#define FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_PHASE_10M  0x3UL
+	#define FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_PHASE_25M  0x4UL
+	#define FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_PHASE_LAST FUNC_PTP_CFG_REQ_PTP_FREQ_ADJ_DLL_PHASE_25M
+	u8	unused_0[3];	/* pads offset 21 up to 24, so the following __le32 fields start 4-byte aligned */
+	__le32	ptp_freq_adj_ext_period;
+	__le32	ptp_freq_adj_ext_up;
+	__le32	ptp_freq_adj_ext_phase_lower;	/* NOTE(review): presumably the low half of a 64-bit phase value - confirm */
+	__le32	ptp_freq_adj_ext_phase_upper;	/* NOTE(review): presumably the high half of a 64-bit phase value - confirm */
+	__le64	ptp_set_time;
+};
+
+/* hwrm_func_ptp_cfg_output (size:128b/16B): four __le16 header words, 7 unused bytes, then the valid byte */
+struct hwrm_func_ptp_cfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];	/* fills bytes 8..14 so that valid is the 16th (last) byte */
+	u8	valid;
+};
+
+/* hwrm_func_ptp_ts_query_input (size:192b/24B): 16 bytes of request header fields, a 32-bit flags word, and 4 unused bytes */
+struct hwrm_func_ptp_ts_query_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	flags;	/* bit flags: PPS_TIME (0x1), PTM_TIME (0x2) */
+	#define FUNC_PTP_TS_QUERY_REQ_FLAGS_PPS_TIME     0x1UL
+	#define FUNC_PTP_TS_QUERY_REQ_FLAGS_PTM_TIME     0x2UL
+	u8	unused_0[4];	/* brings the struct to 24 bytes */
+};
+
+/* hwrm_func_ptp_ts_query_output (size:320b/40B): three 64-bit timestamps and a 32-bit PTM link delay, then padding and the valid byte */
+struct hwrm_func_ptp_ts_query_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le64	pps_event_ts;
+	__le64	ptm_local_ts;
+	__le64	ptm_system_ts;
+	__le32	ptm_link_delay;
+	u8	unused_0[3];	/* fills bytes 36..38 so that valid is the 40th (last) byte */
+	u8	valid;
+};
+
+/* hwrm_func_ptp_ext_cfg_input (size:256b/32B): enables bitmask plus PHC master/secondary function ids, secondary mode, and failover timer */
+struct hwrm_func_ptp_ext_cfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	enables;	/* bit flags 0x1..0x8, one per configurable field below */
+	#define FUNC_PTP_EXT_CFG_REQ_ENABLES_PHC_MASTER_FID     0x1UL
+	#define FUNC_PTP_EXT_CFG_REQ_ENABLES_PHC_SEC_FID        0x2UL
+	#define FUNC_PTP_EXT_CFG_REQ_ENABLES_PHC_SEC_MODE       0x4UL
+	#define FUNC_PTP_EXT_CFG_REQ_ENABLES_FAILOVER_TIMER     0x8UL
+	__le16	phc_master_fid;
+	__le16	phc_sec_fid;
+	u8	phc_sec_mode;	/* enumerated: SWITCH (0x0), ALL (0x1), PF_ONLY (0x2) */
+	#define FUNC_PTP_EXT_CFG_REQ_PHC_SEC_MODE_SWITCH  0x0UL
+	#define FUNC_PTP_EXT_CFG_REQ_PHC_SEC_MODE_ALL     0x1UL
+	#define FUNC_PTP_EXT_CFG_REQ_PHC_SEC_MODE_PF_ONLY 0x2UL
+	#define FUNC_PTP_EXT_CFG_REQ_PHC_SEC_MODE_LAST   FUNC_PTP_EXT_CFG_REQ_PHC_SEC_MODE_PF_ONLY
+	u8	unused_0;	/* pads offset 23 up to 24, so failover_timer starts 4-byte aligned */
+	__le32	failover_timer;
+	u8	unused_1[4];	/* brings the struct to 32 bytes */
+};
+
+/* hwrm_func_ptp_ext_cfg_output (size:128b/16B): four __le16 header words, 7 unused bytes, then the valid byte */
+struct hwrm_func_ptp_ext_cfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];	/* fills bytes 8..14 so that valid is the 16th (last) byte */
+	u8	valid;
+};
+
+/* hwrm_func_ptp_ext_qcfg_input (size:192b/24B): 16 bytes of request header fields followed by 8 unused bytes */
+struct hwrm_func_ptp_ext_qcfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	u8	unused_0[8];	/* no command-specific fields; brings the struct to 24 bytes */
+};
+
+/* hwrm_func_ptp_ext_qcfg_output (size:256b/32B): PHC master/secondary/active function ids and the last failover event with its from/to ids */
+struct hwrm_func_ptp_ext_qcfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le16	phc_master_fid;
+	__le16	phc_sec_fid;
+	__le16	phc_active_fid0;
+	__le16	phc_active_fid1;
+	__le32	last_failover_event;
+	__le16	from_fid;
+	__le16	to_fid;
+	u8	unused_0[7];	/* fills bytes 24..30 so that valid is the 32nd (last) byte */
+	u8	valid;
+};
+
+/* hwrm_func_backing_store_cfg_v2_input (size:512b/64B): describes one backing store region, selected by type and instance */
+struct hwrm_func_backing_store_cfg_v2_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	type;	/* enumerated; the values defined here are not contiguous, and INVALID is 0xffff */
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_QP                  0x0UL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_SRQ                 0x1UL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_CQ                  0x2UL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_VNIC                0x3UL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_STAT                0x4UL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_SP_TQM_RING         0x5UL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_FP_TQM_RING         0x6UL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_MRAV                0xeUL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_TIM                 0xfUL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_TX_CK               0x13UL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_RX_CK               0x14UL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_MP_TQM_RING         0x15UL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_SQ_DB_SHADOW        0x16UL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_RQ_DB_SHADOW        0x17UL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_SRQ_DB_SHADOW       0x18UL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_CQ_DB_SHADOW        0x19UL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_TBL_SCOPE           0x1cUL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_XID_PARTITION       0x1dUL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_SRT_TRACE           0x1eUL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_SRT2_TRACE          0x1fUL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_CRT_TRACE           0x20UL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_CRT2_TRACE          0x21UL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_RIGP0_TRACE         0x22UL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_L2_HWRM_TRACE       0x23UL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_ROCE_HWRM_TRACE     0x24UL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_TTX_PACING_TQM_RING 0x25UL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_CA0_TRACE           0x26UL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_CA1_TRACE           0x27UL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_CA2_TRACE           0x28UL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_RIGP1_TRACE         0x29UL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_AFM_KONG_HWRM_TRACE 0x2aUL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_INVALID             0xffffUL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_LAST               FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_INVALID
+	__le16	instance;
+	__le32	flags;	/* bit flags: PREBOOT_MODE (0x1), BS_CFG_ALL_DONE (0x2), BS_EXTEND (0x4) */
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_FLAGS_PREBOOT_MODE        0x1UL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_FLAGS_BS_CFG_ALL_DONE     0x2UL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_FLAGS_BS_EXTEND           0x4UL
+	__le64	page_dir;
+	__le32	num_entries;
+	__le16	entry_size;
+	u8	page_size_pbl_level;	/* low nibble = PBL level (LVL_0..LVL_2), high nibble = page size code (PG_4K..PG_1G) */
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_PBL_LEVEL_MASK  0xfUL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_PBL_LEVEL_SFT   0
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_PBL_LEVEL_LVL_0   0x0UL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_PBL_LEVEL_LVL_1   0x1UL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_PBL_LEVEL_LVL_2   0x2UL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_PBL_LEVEL_LAST   FUNC_BACKING_STORE_CFG_V2_REQ_PBL_LEVEL_LVL_2
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_PAGE_SIZE_MASK  0xf0UL
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_PAGE_SIZE_SFT   4
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_PAGE_SIZE_PG_4K   (0x0UL << 4)
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_PAGE_SIZE_PG_8K   (0x1UL << 4)
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_PAGE_SIZE_PG_64K  (0x2UL << 4)
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_PAGE_SIZE_PG_2M   (0x3UL << 4)
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_PAGE_SIZE_PG_8M   (0x4UL << 4)
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_PAGE_SIZE_PG_1G   (0x5UL << 4)
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_PAGE_SIZE_LAST   FUNC_BACKING_STORE_CFG_V2_REQ_PAGE_SIZE_PG_1G
+	u8	subtype_valid_cnt;	/* NOTE(review): presumably the number of split_entry_N words below that are meaningful - confirm */
+	__le32	split_entry_0;
+	__le32	split_entry_1;
+	__le32	split_entry_2;
+	__le32	split_entry_3;
+	__le32	enables;	/* single defined bit: NEXT_BS_OFFSET (0x1) */
+	#define FUNC_BACKING_STORE_CFG_V2_REQ_ENABLES_NEXT_BS_OFFSET     0x1UL
+	__le32	next_bs_offset;
+};
+
+/* hwrm_func_backing_store_cfg_v2_output (size:128b/16B): four __le16 header words, 7 reserved bytes, then valid */
+struct hwrm_func_backing_store_cfg_v2_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	rsvd0[7];	/* pads the layout so that valid lands on byte 15 */
+	u8	valid;	/* final byte of the 16-byte layout */
+};
+
+/* hwrm_func_backing_store_qcfg_v2_input (size:192b/24B): 16-byte header, then type, instance and 4 reserved bytes */
+struct hwrm_func_backing_store_qcfg_v2_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	type;	/* 16-bit selector; the REQ_TYPE_* constants follow, INVALID is 0xffff */
+	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_QP                  0x0UL
+	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_SRQ                 0x1UL
+	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_CQ                  0x2UL
+	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_VNIC                0x3UL
+	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_STAT                0x4UL
+	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_SP_TQM_RING         0x5UL
+	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_FP_TQM_RING         0x6UL
+	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_MRAV                0xeUL
+	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_TIM                 0xfUL
+	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_TX_CK               0x13UL
+	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_RX_CK               0x14UL
+	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_MP_TQM_RING         0x15UL
+	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_SQ_DB_SHADOW        0x16UL
+	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_RQ_DB_SHADOW        0x17UL
+	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_SRQ_DB_SHADOW       0x18UL
+	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_CQ_DB_SHADOW        0x19UL
+	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_TBL_SCOPE           0x1cUL
+	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_XID_PARTITION_TABLE 0x1dUL /* NOTE(review): the other backing_store *_v2 TYPE lists name 0x1d XID_PARTITION - confirm the _TABLE suffix is intended */
+	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_SRT_TRACE           0x1eUL
+	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_SRT2_TRACE          0x1fUL
+	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_CRT_TRACE           0x20UL
+	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_CRT2_TRACE          0x21UL
+	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_RIGP0_TRACE         0x22UL
+	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_L2_HWRM_TRACE       0x23UL
+	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_ROCE_HWRM_TRACE     0x24UL
+	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_TTX_PACING_TQM_RING 0x25UL
+	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_CA0_TRACE           0x26UL
+	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_CA1_TRACE           0x27UL
+	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_CA2_TRACE           0x28UL
+	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_RIGP1_TRACE         0x29UL
+	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_AFM_KONG_HWRM_TRACE 0x2aUL
+	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_INVALID             0xffffUL
+	#define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_LAST               FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_INVALID
+	__le16	instance;
+	u8	rsvd[4];	/* brings the layout to 24 bytes */
+};
+
+/* hwrm_func_backing_store_qcfg_v2_output (size:448b/56B): 8-byte header, type/instance/flags, page directory info, four split entries, valid in the last byte */
+struct hwrm_func_backing_store_qcfg_v2_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le16	type;	/* NOTE(review): list below has no 0x16-0x19 (*_DB_SHADOW) or 0x2a entries, unlike the qcfg_v2 _input list - confirm */
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_QP                  0x0UL
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_SRQ                 0x1UL
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_CQ                  0x2UL
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_VNIC                0x3UL
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_STAT                0x4UL
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_SP_TQM_RING         0x5UL
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_FP_TQM_RING         0x6UL
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_MRAV                0xeUL
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_TIM                 0xfUL
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_TX_CK               0x13UL
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_RX_CK               0x14UL
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_MP_TQM_RING         0x15UL
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_TBL_SCOPE           0x1cUL
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_XID_PARTITION       0x1dUL
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_SRT_TRACE           0x1eUL
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_SRT2_TRACE          0x1fUL
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_CRT_TRACE           0x20UL
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_CRT2_TRACE          0x21UL
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_RIGP0_TRACE         0x22UL
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_L2_HWRM_TRACE       0x23UL
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_ROCE_HWRM_TRACE     0x24UL
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_TTX_PACING_TQM_RING 0x25UL
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_CA0_TRACE           0x26UL
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_CA1_TRACE           0x27UL
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_CA2_TRACE           0x28UL
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_RIGP1_TRACE         0x29UL
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_INVALID             0xffffUL
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_LAST               FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_INVALID
+	__le16	instance;
+	__le32	flags;	/* no flag constants are defined for this field in this struct */
+	__le64	page_dir;
+	__le32	num_entries;
+	u8	page_size_pbl_level;	/* packed byte: bits 3:0 PBL level, bits 7:4 page size (masks below) */
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_PBL_LEVEL_MASK  0xfUL
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_PBL_LEVEL_SFT   0
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_PBL_LEVEL_LVL_0   0x0UL
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_PBL_LEVEL_LVL_1   0x1UL
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_PBL_LEVEL_LVL_2   0x2UL
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_PBL_LEVEL_LAST   FUNC_BACKING_STORE_QCFG_V2_RESP_PBL_LEVEL_LVL_2
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_PAGE_SIZE_MASK  0xf0UL
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_PAGE_SIZE_SFT   4
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_PAGE_SIZE_PG_4K   (0x0UL << 4)
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_PAGE_SIZE_PG_8K   (0x1UL << 4)
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_PAGE_SIZE_PG_64K  (0x2UL << 4)
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_PAGE_SIZE_PG_2M   (0x3UL << 4)
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_PAGE_SIZE_PG_8M   (0x4UL << 4)
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_PAGE_SIZE_PG_1G   (0x5UL << 4)
+	#define FUNC_BACKING_STORE_QCFG_V2_RESP_PAGE_SIZE_LAST   FUNC_BACKING_STORE_QCFG_V2_RESP_PAGE_SIZE_PG_1G
+	u8	subtype_valid_cnt;
+	u8	rsvd[2];	/* keeps split_entry_0 on a 4-byte boundary (offset 32) */
+	__le32	split_entry_0;
+	__le32	split_entry_1;
+	__le32	split_entry_2;
+	__le32	split_entry_3;
+	u8	rsvd2[7];
+	u8	valid;	/* final byte of the 56-byte layout */
+};
+
+/* qpc_split_entries (size:128b/16B): three __le32 QP entry counts plus one reserved word; presumably overlays split_entry_0..3 - TODO confirm */
+struct qpc_split_entries {
+	__le32	qp_num_l2_entries;
+	__le32	qp_num_qp1_entries;
+	__le32	qp_num_fast_qpmd_entries;
+	__le32	rsvd;	/* fourth word, unused in this view */
+};
+
+/* srq_split_entries (size:128b/16B): one __le32 count followed by three reserved words; presumably overlays split_entry_0..3 - TODO confirm */
+struct srq_split_entries {
+	__le32	srq_num_l2_entries;
+	__le32	rsvd;
+	__le32	rsvd2[2];	/* 8 more reserved bytes, padding the struct to 16 */
+};
+
+/* cq_split_entries (size:128b/16B): one __le32 count followed by three reserved words; presumably overlays split_entry_0..3 - TODO confirm */
+struct cq_split_entries {
+	__le32	cq_num_l2_entries;
+	__le32	rsvd;
+	__le32	rsvd2[2];	/* 8 more reserved bytes, padding the struct to 16 */
+};
+
+/* vnic_split_entries (size:128b/16B): one __le32 count followed by three reserved words; presumably overlays split_entry_0..3 - TODO confirm */
+struct vnic_split_entries {
+	__le32	vnic_num_vnic_entries;
+	__le32	rsvd;
+	__le32	rsvd2[2];	/* 8 more reserved bytes, padding the struct to 16 */
+};
+
+/* mrav_split_entries (size:128b/16B): one __le32 count followed by three reserved words; presumably overlays split_entry_0..3 - TODO confirm */
+struct mrav_split_entries {
+	__le32	mrav_num_av_entries;
+	__le32	rsvd;
+	__le32	rsvd2[2];	/* 8 more reserved bytes, padding the struct to 16 */
+};
+
+/* ts_split_entries (size:128b/16B): one __le32 count, four single-byte fields packed into the second word, two reserved words */
+struct ts_split_entries {
+	__le32	region_num_entries;
+	u8	tsid;
+	u8	lkup_static_bkt_cnt_exp[2];	/* two one-byte values */
+	u8	locked;	/* completes the second 32-bit word (bytes 4-7) */
+	__le32	rsvd2[2];	/* 8 reserved bytes, padding the struct to 16 */
+};
+
+/* ck_split_entries (size:128b/16B): one __le32 count followed by three reserved words; presumably overlays split_entry_0..3 - TODO confirm */
+struct ck_split_entries {
+	__le32	num_quic_entries;
+	__le32	rsvd;
+	__le32	rsvd2[2];	/* 8 more reserved bytes, padding the struct to 16 */
+};
+
+/* hwrm_func_backing_store_qcaps_v2_input (size:192b/24B): 16-byte header, a 16-bit type selector, 6 reserved bytes */
+struct hwrm_func_backing_store_qcaps_v2_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	type;	/* 16-bit selector; the REQ_TYPE_* constants follow, INVALID is 0xffff */
+	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_QP                  0x0UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_SRQ                 0x1UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_CQ                  0x2UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_VNIC                0x3UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_STAT                0x4UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_SP_TQM_RING         0x5UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_FP_TQM_RING         0x6UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_MRAV                0xeUL
+	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_TIM                 0xfUL
+	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_TX_CK               0x13UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_RX_CK               0x14UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_MP_TQM_RING         0x15UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_SQ_DB_SHADOW        0x16UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_RQ_DB_SHADOW        0x17UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_SRQ_DB_SHADOW       0x18UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_CQ_DB_SHADOW        0x19UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_TBL_SCOPE           0x1cUL
+	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_XID_PARTITION       0x1dUL
+	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_SRT_TRACE           0x1eUL
+	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_SRT2_TRACE          0x1fUL
+	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_CRT_TRACE           0x20UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_CRT2_TRACE          0x21UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_RIGP0_TRACE         0x22UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_L2_HWRM_TRACE       0x23UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_ROCE_HWRM_TRACE     0x24UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_TTX_PACING_TQM_RING 0x25UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_CA0_TRACE           0x26UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_CA1_TRACE           0x27UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_CA2_TRACE           0x28UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_RIGP1_TRACE         0x29UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_AFM_KONG_HWRM_TRACE 0x2aUL
+	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_INVALID             0xffffUL
+	#define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_LAST               FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_INVALID
+	u8	rsvd[6];	/* brings the layout to 24 bytes */
+};
+
+/* hwrm_func_backing_store_qcaps_v2_output (size:448b/56B): 8-byte header, type and sizing fields, capability flags, four split entries, valid in the last byte */
+struct hwrm_func_backing_store_qcaps_v2_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le16	type;	/* 16-bit selector; the RESP_TYPE_* constants follow, INVALID is 0xffff */
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_QP                  0x0UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_SRQ                 0x1UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_CQ                  0x2UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_VNIC                0x3UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_STAT                0x4UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_SP_TQM_RING         0x5UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_FP_TQM_RING         0x6UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_MRAV                0xeUL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_TIM                 0xfUL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_TX_CK               0x13UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_RX_CK               0x14UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_MP_TQM_RING         0x15UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_SQ_DB_SHADOW        0x16UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_RQ_DB_SHADOW        0x17UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_SRQ_DB_SHADOW       0x18UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_CQ_DB_SHADOW        0x19UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_TBL_SCOPE           0x1cUL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_XID_PARTITION       0x1dUL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_SRT_TRACE           0x1eUL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_SRT2_TRACE          0x1fUL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_CRT_TRACE           0x20UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_CRT2_TRACE          0x21UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_RIGP0_TRACE         0x22UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_L2_HWRM_TRACE       0x23UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_ROCE_HWRM_TRACE     0x24UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_TTX_PACING_TQM_RING 0x25UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_CA0_TRACE           0x26UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_CA1_TRACE           0x27UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_CA2_TRACE           0x28UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_RIGP1_TRACE         0x29UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_AFM_KONG_HWRM_TRACE 0x2aUL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_INVALID             0xffffUL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_LAST               FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_INVALID
+	__le16	entry_size;
+	__le32	flags;	/* bit flags; seven single-bit values 0x1..0x40 are defined below */
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_FLAGS_ENABLE_CTX_KIND_INIT            0x1UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_FLAGS_TYPE_VALID                      0x2UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_FLAGS_DRIVER_MANAGED_MEMORY           0x4UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_FLAGS_ROCE_QP_PSEUDO_STATIC_ALLOC     0x8UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_FLAGS_FW_DBG_TRACE                    0x10UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_FLAGS_FW_BIN_DBG_TRACE                0x20UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_FLAGS_NEXT_BS_OFFSET                  0x40UL
+	__le32	instance_bit_map;
+	u8	ctx_init_value;
+	u8	ctx_init_offset;
+	u8	entry_multiple;
+	u8	rsvd;	/* fourth byte of this group; keeps max_num_entries 4-byte aligned */
+	__le32	max_num_entries;
+	__le32	min_num_entries;
+	__le16	next_valid_type;
+	u8	subtype_valid_cnt;
+	u8	exact_cnt_bit_map;	/* bits 0-3: one *_EXACT bit per split_entry_0..3; bits 7:4 covered by UNUSED_MASK */
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_EXACT_CNT_BIT_MAP_SPLIT_ENTRY_0_EXACT     0x1UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_EXACT_CNT_BIT_MAP_SPLIT_ENTRY_1_EXACT     0x2UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_EXACT_CNT_BIT_MAP_SPLIT_ENTRY_2_EXACT     0x4UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_EXACT_CNT_BIT_MAP_SPLIT_ENTRY_3_EXACT     0x8UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_EXACT_CNT_BIT_MAP_UNUSED_MASK             0xf0UL
+	#define FUNC_BACKING_STORE_QCAPS_V2_RESP_EXACT_CNT_BIT_MAP_UNUSED_SFT              4
+	__le32	split_entry_0;
+	__le32	split_entry_1;
+	__le32	split_entry_2;
+	__le32	split_entry_3;
+	__le16	max_instance_count;
+	u8	rsvd3;
+	u8	valid;	/* final byte of the 56-byte layout */
+};
+
+/* hwrm_func_dbr_pacing_qcfg_input (size:128b/16B): only the 16-byte header, no command-specific fields */
+struct hwrm_func_dbr_pacing_qcfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;	/* 64-bit field occupying bytes 8-15 */
+};
+
+/* hwrm_func_dbr_pacing_qcfg_output (size:512b/64B): 8-byte header, a flags byte, two packed register descriptors with their masks/shifts, pacing values, valid in the last byte */
+struct hwrm_func_dbr_pacing_qcfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	flags;	/* only bit 0 (DBR_NQ_EVENT_ENABLED) is defined */
+	#define FUNC_DBR_PACING_QCFG_RESP_FLAGS_DBR_NQ_EVENT_ENABLED     0x1UL
+	u8	unused_0[7];
+	__le32	dbr_stat_db_fifo_reg;	/* packed: bits 1:0 address space, bits 31:2 address (masks below) */
+	#define FUNC_DBR_PACING_QCFG_RESP_DBR_STAT_DB_FIFO_REG_ADDR_SPACE_MASK    0x3UL
+	#define FUNC_DBR_PACING_QCFG_RESP_DBR_STAT_DB_FIFO_REG_ADDR_SPACE_SFT     0
+	#define FUNC_DBR_PACING_QCFG_RESP_DBR_STAT_DB_FIFO_REG_ADDR_SPACE_PCIE_CFG  0x0UL
+	#define FUNC_DBR_PACING_QCFG_RESP_DBR_STAT_DB_FIFO_REG_ADDR_SPACE_GRC       0x1UL
+	#define FUNC_DBR_PACING_QCFG_RESP_DBR_STAT_DB_FIFO_REG_ADDR_SPACE_BAR0      0x2UL
+	#define FUNC_DBR_PACING_QCFG_RESP_DBR_STAT_DB_FIFO_REG_ADDR_SPACE_BAR1      0x3UL
+	#define FUNC_DBR_PACING_QCFG_RESP_DBR_STAT_DB_FIFO_REG_ADDR_SPACE_LAST     FUNC_DBR_PACING_QCFG_RESP_DBR_STAT_DB_FIFO_REG_ADDR_SPACE_BAR1
+	#define FUNC_DBR_PACING_QCFG_RESP_DBR_STAT_DB_FIFO_REG_ADDR_MASK          0xfffffffcUL
+	#define FUNC_DBR_PACING_QCFG_RESP_DBR_STAT_DB_FIFO_REG_ADDR_SFT           2
+	__le32	dbr_stat_db_fifo_reg_watermark_mask;
+	u8	dbr_stat_db_fifo_reg_watermark_shift;
+	u8	unused_1[3];	/* pads the one-byte shift out to 4 bytes */
+	__le32	dbr_stat_db_fifo_reg_fifo_room_mask;
+	u8	dbr_stat_db_fifo_reg_fifo_room_shift;
+	u8	unused_2[3];	/* pads the one-byte shift out to 4 bytes */
+	__le32	dbr_throttling_aeq_arm_reg;	/* packed: bits 1:0 address space, bits 31:2 address (masks below) */
+	#define FUNC_DBR_PACING_QCFG_RESP_DBR_THROTTLING_AEQ_ARM_REG_ADDR_SPACE_MASK    0x3UL
+	#define FUNC_DBR_PACING_QCFG_RESP_DBR_THROTTLING_AEQ_ARM_REG_ADDR_SPACE_SFT     0
+	#define FUNC_DBR_PACING_QCFG_RESP_DBR_THROTTLING_AEQ_ARM_REG_ADDR_SPACE_PCIE_CFG  0x0UL
+	#define FUNC_DBR_PACING_QCFG_RESP_DBR_THROTTLING_AEQ_ARM_REG_ADDR_SPACE_GRC       0x1UL
+	#define FUNC_DBR_PACING_QCFG_RESP_DBR_THROTTLING_AEQ_ARM_REG_ADDR_SPACE_BAR0      0x2UL
+	#define FUNC_DBR_PACING_QCFG_RESP_DBR_THROTTLING_AEQ_ARM_REG_ADDR_SPACE_BAR1      0x3UL
+	#define FUNC_DBR_PACING_QCFG_RESP_DBR_THROTTLING_AEQ_ARM_REG_ADDR_SPACE_LAST     FUNC_DBR_PACING_QCFG_RESP_DBR_THROTTLING_AEQ_ARM_REG_ADDR_SPACE_BAR1
+	#define FUNC_DBR_PACING_QCFG_RESP_DBR_THROTTLING_AEQ_ARM_REG_ADDR_MASK          0xfffffffcUL
+	#define FUNC_DBR_PACING_QCFG_RESP_DBR_THROTTLING_AEQ_ARM_REG_ADDR_SFT           2
+	u8	dbr_throttling_aeq_arm_reg_val;
+	u8	unused_3[3];	/* pads the one-byte value out to 4 bytes */
+	__le32	dbr_stat_db_max_fifo_depth;
+	__le32	primary_nq_id;
+	__le32	pacing_threshold;
+	u8	unused_4[7];
+	u8	valid;	/* final byte of the 64-byte layout */
+};
+
+/* hwrm_func_drv_if_change_input (size:192b/24B): 16-byte header, a 32-bit flags word, one unused 32-bit word */
+struct hwrm_func_drv_if_change_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	flags;	/* only bit 0 (UP) is defined */
+	#define FUNC_DRV_IF_CHANGE_REQ_FLAGS_UP     0x1UL
+	__le32	unused;	/* brings the layout to 24 bytes */
+};
+
+/* hwrm_func_drv_if_change_output (size:128b/16B): 8-byte header, a 32-bit flags word, 3 unused bytes, valid in the last byte */
+struct hwrm_func_drv_if_change_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le32	flags;	/* bit flags; bits 0-2 are defined below */
+	#define FUNC_DRV_IF_CHANGE_RESP_FLAGS_RESC_CHANGE           0x1UL
+	#define FUNC_DRV_IF_CHANGE_RESP_FLAGS_HOT_FW_RESET_DONE     0x2UL
+	#define FUNC_DRV_IF_CHANGE_RESP_FLAGS_CAPS_CHANGE           0x4UL
+	u8	unused_0[3];
+	u8	valid;	/* final byte of the 16-byte layout */
+};
+
+/* hwrm_port_phy_cfg_input (size:512b/64B): 16-byte header, flags and enables bit words, then per-port link speed, autoneg, pause, loopback and EEE fields */
+struct hwrm_port_phy_cfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	flags;	/* bit flags; single-bit values 0x1..0x400000 are defined below */
+	#define PORT_PHY_CFG_REQ_FLAGS_RESET_PHY                  0x1UL
+	#define PORT_PHY_CFG_REQ_FLAGS_DEPRECATED                 0x2UL
+	#define PORT_PHY_CFG_REQ_FLAGS_FORCE                      0x4UL
+	#define PORT_PHY_CFG_REQ_FLAGS_RESTART_AUTONEG            0x8UL
+	#define PORT_PHY_CFG_REQ_FLAGS_EEE_ENABLE                 0x10UL
+	#define PORT_PHY_CFG_REQ_FLAGS_EEE_DISABLE                0x20UL
+	#define PORT_PHY_CFG_REQ_FLAGS_EEE_TX_LPI_ENABLE          0x40UL
+	#define PORT_PHY_CFG_REQ_FLAGS_EEE_TX_LPI_DISABLE         0x80UL
+	#define PORT_PHY_CFG_REQ_FLAGS_FEC_AUTONEG_ENABLE         0x100UL
+	#define PORT_PHY_CFG_REQ_FLAGS_FEC_AUTONEG_DISABLE        0x200UL
+	#define PORT_PHY_CFG_REQ_FLAGS_FEC_CLAUSE74_ENABLE        0x400UL
+	#define PORT_PHY_CFG_REQ_FLAGS_FEC_CLAUSE74_DISABLE       0x800UL
+	#define PORT_PHY_CFG_REQ_FLAGS_FEC_CLAUSE91_ENABLE        0x1000UL
+	#define PORT_PHY_CFG_REQ_FLAGS_FEC_CLAUSE91_DISABLE       0x2000UL
+	#define PORT_PHY_CFG_REQ_FLAGS_FORCE_LINK_DWN             0x4000UL
+	#define PORT_PHY_CFG_REQ_FLAGS_FEC_RS544_1XN_ENABLE       0x8000UL
+	#define PORT_PHY_CFG_REQ_FLAGS_FEC_RS544_1XN_DISABLE      0x10000UL
+	#define PORT_PHY_CFG_REQ_FLAGS_FEC_RS544_IEEE_ENABLE      0x20000UL
+	#define PORT_PHY_CFG_REQ_FLAGS_FEC_RS544_IEEE_DISABLE     0x40000UL
+	#define PORT_PHY_CFG_REQ_FLAGS_FEC_RS272_1XN_ENABLE       0x80000UL
+	#define PORT_PHY_CFG_REQ_FLAGS_FEC_RS272_1XN_DISABLE      0x100000UL
+	#define PORT_PHY_CFG_REQ_FLAGS_FEC_RS272_IEEE_ENABLE      0x200000UL
+	#define PORT_PHY_CFG_REQ_FLAGS_FEC_RS272_IEEE_DISABLE     0x400000UL
+	__le32	enables;	/* bit flags 0x1..0x4000; presumably each marks one of the fields below as valid - TODO confirm */
+	#define PORT_PHY_CFG_REQ_ENABLES_AUTO_MODE                     0x1UL
+	#define PORT_PHY_CFG_REQ_ENABLES_AUTO_DUPLEX                   0x2UL
+	#define PORT_PHY_CFG_REQ_ENABLES_AUTO_PAUSE                    0x4UL
+	#define PORT_PHY_CFG_REQ_ENABLES_AUTO_LINK_SPEED               0x8UL
+	#define PORT_PHY_CFG_REQ_ENABLES_AUTO_LINK_SPEED_MASK          0x10UL
+	#define PORT_PHY_CFG_REQ_ENABLES_WIRESPEED                     0x20UL
+	#define PORT_PHY_CFG_REQ_ENABLES_LPBK                          0x40UL
+	#define PORT_PHY_CFG_REQ_ENABLES_PREEMPHASIS                   0x80UL
+	#define PORT_PHY_CFG_REQ_ENABLES_FORCE_PAUSE                   0x100UL
+	#define PORT_PHY_CFG_REQ_ENABLES_EEE_LINK_SPEED_MASK           0x200UL
+	#define PORT_PHY_CFG_REQ_ENABLES_TX_LPI_TIMER                  0x400UL
+	#define PORT_PHY_CFG_REQ_ENABLES_FORCE_PAM4_LINK_SPEED         0x800UL
+	#define PORT_PHY_CFG_REQ_ENABLES_AUTO_PAM4_LINK_SPEED_MASK     0x1000UL
+	#define PORT_PHY_CFG_REQ_ENABLES_FORCE_LINK_SPEEDS2            0x2000UL
+	#define PORT_PHY_CFG_REQ_ENABLES_AUTO_LINK_SPEEDS2_MASK        0x4000UL
+	__le16	port_id;
+	__le16	force_link_speed;	/* enumerated; codes appear to be in units of 100 Mb/s (1GB = 0xa, 100GB = 0x3e8), except 10MB which is 0xffff */
+	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_100MB 0x1UL
+	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_1GB   0xaUL
+	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_2GB   0x14UL
+	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_2_5GB 0x19UL
+	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_10GB  0x64UL
+	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_20GB  0xc8UL
+	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_25GB  0xfaUL
+	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_40GB  0x190UL
+	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_50GB  0x1f4UL
+	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_100GB 0x3e8UL
+	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_10MB  0xffffUL
+	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_LAST PORT_PHY_CFG_REQ_FORCE_LINK_SPEED_10MB
+	u8	auto_mode;	/* enumerated, 0x0..0x4 */
+	#define PORT_PHY_CFG_REQ_AUTO_MODE_NONE         0x0UL
+	#define PORT_PHY_CFG_REQ_AUTO_MODE_ALL_SPEEDS   0x1UL
+	#define PORT_PHY_CFG_REQ_AUTO_MODE_ONE_SPEED    0x2UL
+	#define PORT_PHY_CFG_REQ_AUTO_MODE_ONE_OR_BELOW 0x3UL
+	#define PORT_PHY_CFG_REQ_AUTO_MODE_SPEED_MASK   0x4UL
+	#define PORT_PHY_CFG_REQ_AUTO_MODE_LAST        PORT_PHY_CFG_REQ_AUTO_MODE_SPEED_MASK
+	u8	auto_duplex;	/* enumerated, 0x0..0x2 */
+	#define PORT_PHY_CFG_REQ_AUTO_DUPLEX_HALF 0x0UL
+	#define PORT_PHY_CFG_REQ_AUTO_DUPLEX_FULL 0x1UL
+	#define PORT_PHY_CFG_REQ_AUTO_DUPLEX_BOTH 0x2UL
+	#define PORT_PHY_CFG_REQ_AUTO_DUPLEX_LAST PORT_PHY_CFG_REQ_AUTO_DUPLEX_BOTH
+	u8	auto_pause;	/* bit flags: 0x1 TX, 0x2 RX, 0x4 AUTONEG_PAUSE */
+	#define PORT_PHY_CFG_REQ_AUTO_PAUSE_TX                0x1UL
+	#define PORT_PHY_CFG_REQ_AUTO_PAUSE_RX                0x2UL
+	#define PORT_PHY_CFG_REQ_AUTO_PAUSE_AUTONEG_PAUSE     0x4UL
+	u8	mgmt_flag;	/* bit flags: 0x1 LINK_RELEASE, 0x80 MGMT_VALID */
+	#define PORT_PHY_CFG_REQ_MGMT_FLAG_LINK_RELEASE     0x1UL
+	#define PORT_PHY_CFG_REQ_MGMT_FLAG_MGMT_VALID       0x80UL
+	__le16	auto_link_speed;	/* enumerated; same code values as the force_link_speed list above */
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_100MB 0x1UL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_1GB   0xaUL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_2GB   0x14UL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_2_5GB 0x19UL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_10GB  0x64UL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_20GB  0xc8UL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_25GB  0xfaUL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_40GB  0x190UL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_50GB  0x1f4UL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_100GB 0x3e8UL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_10MB  0xffffUL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_LAST PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_10MB
+	__le16	auto_link_speed_mask;	/* bit mask, one bit per speed/duplex combination (0x1..0x2000) */
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_100MBHD     0x1UL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_100MB       0x2UL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_1GBHD       0x4UL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_1GB         0x8UL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_2GB         0x10UL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_2_5GB       0x20UL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_10GB        0x40UL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_20GB        0x80UL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_25GB        0x100UL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_40GB        0x200UL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_50GB        0x400UL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_100GB       0x800UL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_10MBHD      0x1000UL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEED_MASK_10MB        0x2000UL
+	u8	wirespeed;	/* enumerated: 0x0 OFF, 0x1 ON */
+	#define PORT_PHY_CFG_REQ_WIRESPEED_OFF 0x0UL
+	#define PORT_PHY_CFG_REQ_WIRESPEED_ON  0x1UL
+	#define PORT_PHY_CFG_REQ_WIRESPEED_LAST PORT_PHY_CFG_REQ_WIRESPEED_ON
+	u8	lpbk;	/* enumerated, 0x0..0x3 */
+	#define PORT_PHY_CFG_REQ_LPBK_NONE     0x0UL
+	#define PORT_PHY_CFG_REQ_LPBK_LOCAL    0x1UL
+	#define PORT_PHY_CFG_REQ_LPBK_REMOTE   0x2UL
+	#define PORT_PHY_CFG_REQ_LPBK_EXTERNAL 0x3UL
+	#define PORT_PHY_CFG_REQ_LPBK_LAST    PORT_PHY_CFG_REQ_LPBK_EXTERNAL
+	u8	force_pause;	/* bit flags: 0x1 TX, 0x2 RX */
+	#define PORT_PHY_CFG_REQ_FORCE_PAUSE_TX     0x1UL
+	#define PORT_PHY_CFG_REQ_FORCE_PAUSE_RX     0x2UL
+	u8	unused_1;
+	__le32	preemphasis;
+	__le16	eee_link_speed_mask;	/* bit mask; bits 0, 2, 4 and 5 are named RSVD1..RSVD4 */
+	#define PORT_PHY_CFG_REQ_EEE_LINK_SPEED_MASK_RSVD1     0x1UL
+	#define PORT_PHY_CFG_REQ_EEE_LINK_SPEED_MASK_100MB     0x2UL
+	#define PORT_PHY_CFG_REQ_EEE_LINK_SPEED_MASK_RSVD2     0x4UL
+	#define PORT_PHY_CFG_REQ_EEE_LINK_SPEED_MASK_1GB       0x8UL
+	#define PORT_PHY_CFG_REQ_EEE_LINK_SPEED_MASK_RSVD3     0x10UL
+	#define PORT_PHY_CFG_REQ_EEE_LINK_SPEED_MASK_RSVD4     0x20UL
+	#define PORT_PHY_CFG_REQ_EEE_LINK_SPEED_MASK_10GB      0x40UL
+	__le16	force_pam4_link_speed;	/* enumerated: 50GB, 100GB or 200GB codes below */
+	#define PORT_PHY_CFG_REQ_FORCE_PAM4_LINK_SPEED_50GB  0x1f4UL
+	#define PORT_PHY_CFG_REQ_FORCE_PAM4_LINK_SPEED_100GB 0x3e8UL
+	#define PORT_PHY_CFG_REQ_FORCE_PAM4_LINK_SPEED_200GB 0x7d0UL
+	#define PORT_PHY_CFG_REQ_FORCE_PAM4_LINK_SPEED_LAST PORT_PHY_CFG_REQ_FORCE_PAM4_LINK_SPEED_200GB
+	__le32	tx_lpi_timer;	/* only the low 24 bits are covered by the mask (0xffffff) */
+	#define PORT_PHY_CFG_REQ_TX_LPI_TIMER_MASK 0xffffffUL
+	#define PORT_PHY_CFG_REQ_TX_LPI_TIMER_SFT 0
+	__le16	auto_link_pam4_speed_mask;	/* bit mask: 0x1 50G, 0x2 100G, 0x4 200G */
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_PAM4_SPEED_MASK_50G      0x1UL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_PAM4_SPEED_MASK_100G     0x2UL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_PAM4_SPEED_MASK_200G     0x4UL
+	__le16	force_link_speeds2;	/* enumerated; PAM4_56 codes are the plain speed code + 1, PAM4_112 codes are + 2 */
+	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_1GB            0xaUL
+	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_10GB           0x64UL
+	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_25GB           0xfaUL
+	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_40GB           0x190UL
+	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_50GB           0x1f4UL
+	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_100GB          0x3e8UL
+	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_50GB_PAM4_56   0x1f5UL
+	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_100GB_PAM4_56  0x3e9UL
+	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_200GB_PAM4_56  0x7d1UL
+	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_400GB_PAM4_56  0xfa1UL
+	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_100GB_PAM4_112 0x3eaUL
+	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_200GB_PAM4_112 0x7d2UL
+	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_400GB_PAM4_112 0xfa2UL
+	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_800GB_PAM4_112 0x1f42UL
+	#define PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_LAST          PORT_PHY_CFG_REQ_FORCE_LINK_SPEEDS2_800GB_PAM4_112
+	__le16	auto_link_speeds2_mask;	/* bit mask, one bit per speed/signalling combination (0x1..0x2000) */
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_1GB                0x1UL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_10GB               0x2UL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_25GB               0x4UL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_40GB               0x8UL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_50GB               0x10UL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_100GB              0x20UL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_50GB_PAM4_56       0x40UL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_100GB_PAM4_56      0x80UL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_200GB_PAM4_56      0x100UL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_400GB_PAM4_56      0x200UL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_100GB_PAM4_112     0x400UL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_200GB_PAM4_112     0x800UL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_400GB_PAM4_112     0x1000UL
+	#define PORT_PHY_CFG_REQ_AUTO_LINK_SPEEDS2_MASK_800GB_PAM4_112     0x2000UL
+	u8	unused_2[6];	/* brings the layout to 64 bytes */
+};
+
+/* hwrm_port_phy_cfg_output (size:128b/16B): four __le16 header words, 7 unused bytes, then valid */
+struct hwrm_port_phy_cfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];	/* pads the layout so that valid lands on byte 15 */
+	u8	valid;	/* final byte of the 16-byte layout */
+};
+
+/* hwrm_port_phy_cfg_cmd_err (size:64b/8B): one error-code byte followed by 7 unused bytes */
+struct hwrm_port_phy_cfg_cmd_err {
+	u8	code;	/* enumerated: 0x0 UNKNOWN, 0x1 ILLEGAL_SPEED, 0x2 RETRY */
+	#define PORT_PHY_CFG_CMD_ERR_CODE_UNKNOWN       0x0UL
+	#define PORT_PHY_CFG_CMD_ERR_CODE_ILLEGAL_SPEED 0x1UL
+	#define PORT_PHY_CFG_CMD_ERR_CODE_RETRY         0x2UL
+	#define PORT_PHY_CFG_CMD_ERR_CODE_LAST         PORT_PHY_CFG_CMD_ERR_CODE_RETRY
+	u8	unused_0[7];	/* brings the layout to 8 bytes */
+};
+
+/* hwrm_port_phy_qcfg_input (size:192b/24B): 16-byte header, a 16-bit port_id, 6 unused bytes */
+struct hwrm_port_phy_qcfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	port_id;
+	u8	unused_0[6];	/* brings the layout to 24 bytes */
+};
+
+/* hwrm_port_phy_qcfg_output (size:832b/104B): response layout; the #defines after each field list that field's values or bits */
+struct hwrm_port_phy_qcfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	link; /* one of the LINK_* values below */
+	#define PORT_PHY_QCFG_RESP_LINK_NO_LINK 0x0UL
+	#define PORT_PHY_QCFG_RESP_LINK_SIGNAL  0x1UL
+	#define PORT_PHY_QCFG_RESP_LINK_LINK    0x2UL
+	#define PORT_PHY_QCFG_RESP_LINK_LAST   PORT_PHY_QCFG_RESP_LINK_LINK
+	u8	active_fec_signal_mode; /* packed: bits 3:0 signal mode, bits 7:4 active FEC (see the MASK/SFT pairs) */
+	#define PORT_PHY_QCFG_RESP_SIGNAL_MODE_MASK                0xfUL
+	#define PORT_PHY_QCFG_RESP_SIGNAL_MODE_SFT                 0
+	#define PORT_PHY_QCFG_RESP_SIGNAL_MODE_NRZ                   0x0UL
+	#define PORT_PHY_QCFG_RESP_SIGNAL_MODE_PAM4                  0x1UL
+	#define PORT_PHY_QCFG_RESP_SIGNAL_MODE_PAM4_112              0x2UL
+	#define PORT_PHY_QCFG_RESP_SIGNAL_MODE_LAST                 PORT_PHY_QCFG_RESP_SIGNAL_MODE_PAM4_112
+	#define PORT_PHY_QCFG_RESP_ACTIVE_FEC_MASK                 0xf0UL
+	#define PORT_PHY_QCFG_RESP_ACTIVE_FEC_SFT                  4
+	#define PORT_PHY_QCFG_RESP_ACTIVE_FEC_FEC_NONE_ACTIVE        (0x0UL << 4)
+	#define PORT_PHY_QCFG_RESP_ACTIVE_FEC_FEC_CLAUSE74_ACTIVE    (0x1UL << 4)
+	#define PORT_PHY_QCFG_RESP_ACTIVE_FEC_FEC_CLAUSE91_ACTIVE    (0x2UL << 4)
+	#define PORT_PHY_QCFG_RESP_ACTIVE_FEC_FEC_RS544_1XN_ACTIVE   (0x3UL << 4)
+	#define PORT_PHY_QCFG_RESP_ACTIVE_FEC_FEC_RS544_IEEE_ACTIVE  (0x4UL << 4)
+	#define PORT_PHY_QCFG_RESP_ACTIVE_FEC_FEC_RS272_1XN_ACTIVE   (0x5UL << 4)
+	#define PORT_PHY_QCFG_RESP_ACTIVE_FEC_FEC_RS272_IEEE_ACTIVE  (0x6UL << 4)
+	#define PORT_PHY_QCFG_RESP_ACTIVE_FEC_LAST                  PORT_PHY_QCFG_RESP_ACTIVE_FEC_FEC_RS272_IEEE_ACTIVE
+	__le16	link_speed; /* codes below equal the speed in 100 Mb/s units; 10MB uses the special code 0xffff */
+	#define PORT_PHY_QCFG_RESP_LINK_SPEED_100MB 0x1UL
+	#define PORT_PHY_QCFG_RESP_LINK_SPEED_1GB   0xaUL
+	#define PORT_PHY_QCFG_RESP_LINK_SPEED_2GB   0x14UL
+	#define PORT_PHY_QCFG_RESP_LINK_SPEED_2_5GB 0x19UL
+	#define PORT_PHY_QCFG_RESP_LINK_SPEED_10GB  0x64UL
+	#define PORT_PHY_QCFG_RESP_LINK_SPEED_20GB  0xc8UL
+	#define PORT_PHY_QCFG_RESP_LINK_SPEED_25GB  0xfaUL
+	#define PORT_PHY_QCFG_RESP_LINK_SPEED_40GB  0x190UL
+	#define PORT_PHY_QCFG_RESP_LINK_SPEED_50GB  0x1f4UL
+	#define PORT_PHY_QCFG_RESP_LINK_SPEED_100GB 0x3e8UL
+	#define PORT_PHY_QCFG_RESP_LINK_SPEED_200GB 0x7d0UL
+	#define PORT_PHY_QCFG_RESP_LINK_SPEED_400GB 0xfa0UL
+	#define PORT_PHY_QCFG_RESP_LINK_SPEED_800GB 0x1f40UL
+	#define PORT_PHY_QCFG_RESP_LINK_SPEED_10MB  0xffffUL
+	#define PORT_PHY_QCFG_RESP_LINK_SPEED_LAST PORT_PHY_QCFG_RESP_LINK_SPEED_10MB
+	u8	duplex_cfg; /* HALF or FULL */
+	#define PORT_PHY_QCFG_RESP_DUPLEX_CFG_HALF 0x0UL
+	#define PORT_PHY_QCFG_RESP_DUPLEX_CFG_FULL 0x1UL
+	#define PORT_PHY_QCFG_RESP_DUPLEX_CFG_LAST PORT_PHY_QCFG_RESP_DUPLEX_CFG_FULL
+	u8	pause; /* bit flags: TX and RX */
+	#define PORT_PHY_QCFG_RESP_PAUSE_TX     0x1UL
+	#define PORT_PHY_QCFG_RESP_PAUSE_RX     0x2UL
+	__le16	support_speeds; /* bitmask, one bit per speed */
+	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_100MBHD     0x1UL
+	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_100MB       0x2UL
+	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_1GBHD       0x4UL
+	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_1GB         0x8UL
+	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_2GB         0x10UL
+	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_2_5GB       0x20UL
+	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_10GB        0x40UL
+	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_20GB        0x80UL
+	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_25GB        0x100UL
+	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_40GB        0x200UL
+	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_50GB        0x400UL
+	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_100GB       0x800UL
+	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_10MBHD      0x1000UL
+	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS_10MB        0x2000UL
+	__le16	force_link_speed; /* same code values as the link_speed defines, up to 100GB */
+	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_100MB 0x1UL
+	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_1GB   0xaUL
+	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_2GB   0x14UL
+	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_2_5GB 0x19UL
+	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_10GB  0x64UL
+	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_20GB  0xc8UL
+	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_25GB  0xfaUL
+	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_40GB  0x190UL
+	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_50GB  0x1f4UL
+	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_100GB 0x3e8UL
+	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_10MB  0xffffUL
+	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_LAST PORT_PHY_QCFG_RESP_FORCE_LINK_SPEED_10MB
+	u8	auto_mode; /* enumerated value, not a bitmask (values 0x0..0x4) */
+	#define PORT_PHY_QCFG_RESP_AUTO_MODE_NONE         0x0UL
+	#define PORT_PHY_QCFG_RESP_AUTO_MODE_ALL_SPEEDS   0x1UL
+	#define PORT_PHY_QCFG_RESP_AUTO_MODE_ONE_SPEED    0x2UL
+	#define PORT_PHY_QCFG_RESP_AUTO_MODE_ONE_OR_BELOW 0x3UL
+	#define PORT_PHY_QCFG_RESP_AUTO_MODE_SPEED_MASK   0x4UL
+	#define PORT_PHY_QCFG_RESP_AUTO_MODE_LAST        PORT_PHY_QCFG_RESP_AUTO_MODE_SPEED_MASK
+	u8	auto_pause; /* bit flags */
+	#define PORT_PHY_QCFG_RESP_AUTO_PAUSE_TX                0x1UL
+	#define PORT_PHY_QCFG_RESP_AUTO_PAUSE_RX                0x2UL
+	#define PORT_PHY_QCFG_RESP_AUTO_PAUSE_AUTONEG_PAUSE     0x4UL
+	__le16	auto_link_speed; /* same code values as force_link_speed */
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_100MB 0x1UL
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_1GB   0xaUL
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_2GB   0x14UL
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_2_5GB 0x19UL
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_10GB  0x64UL
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_20GB  0xc8UL
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_25GB  0xfaUL
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_40GB  0x190UL
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_50GB  0x1f4UL
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_100GB 0x3e8UL
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_10MB  0xffffUL
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_LAST PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_10MB
+	__le16	auto_link_speed_mask; /* bitmask with the same bit assignments as support_speeds */
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_100MBHD     0x1UL
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_100MB       0x2UL
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_1GBHD       0x4UL
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_1GB         0x8UL
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_2GB         0x10UL
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_2_5GB       0x20UL
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_10GB        0x40UL
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_20GB        0x80UL
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_25GB        0x100UL
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_40GB        0x200UL
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_50GB        0x400UL
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_100GB       0x800UL
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_10MBHD      0x1000UL
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEED_MASK_10MB        0x2000UL
+	u8	wirespeed; /* OFF or ON */
+	#define PORT_PHY_QCFG_RESP_WIRESPEED_OFF 0x0UL
+	#define PORT_PHY_QCFG_RESP_WIRESPEED_ON  0x1UL
+	#define PORT_PHY_QCFG_RESP_WIRESPEED_LAST PORT_PHY_QCFG_RESP_WIRESPEED_ON
+	u8	lpbk; /* loopback selector, enumerated */
+	#define PORT_PHY_QCFG_RESP_LPBK_NONE     0x0UL
+	#define PORT_PHY_QCFG_RESP_LPBK_LOCAL    0x1UL
+	#define PORT_PHY_QCFG_RESP_LPBK_REMOTE   0x2UL
+	#define PORT_PHY_QCFG_RESP_LPBK_EXTERNAL 0x3UL
+	#define PORT_PHY_QCFG_RESP_LPBK_LAST    PORT_PHY_QCFG_RESP_LPBK_EXTERNAL
+	u8	force_pause; /* bit flags: TX and RX */
+	#define PORT_PHY_QCFG_RESP_FORCE_PAUSE_TX     0x1UL
+	#define PORT_PHY_QCFG_RESP_FORCE_PAUSE_RX     0x2UL
+	u8	module_status; /* enumerated; 0xff means NOTAPPLICABLE */
+	#define PORT_PHY_QCFG_RESP_MODULE_STATUS_NONE          0x0UL
+	#define PORT_PHY_QCFG_RESP_MODULE_STATUS_DISABLETX     0x1UL
+	#define PORT_PHY_QCFG_RESP_MODULE_STATUS_WARNINGMSG    0x2UL
+	#define PORT_PHY_QCFG_RESP_MODULE_STATUS_PWRDOWN       0x3UL
+	#define PORT_PHY_QCFG_RESP_MODULE_STATUS_NOTINSERTED   0x4UL
+	#define PORT_PHY_QCFG_RESP_MODULE_STATUS_CURRENTFAULT  0x5UL
+	#define PORT_PHY_QCFG_RESP_MODULE_STATUS_OVERHEATED    0x6UL
+	#define PORT_PHY_QCFG_RESP_MODULE_STATUS_NOTAPPLICABLE 0xffUL
+	#define PORT_PHY_QCFG_RESP_MODULE_STATUS_LAST         PORT_PHY_QCFG_RESP_MODULE_STATUS_NOTAPPLICABLE
+	__le32	preemphasis;
+	u8	phy_maj; /* NOTE(review): phy_maj/phy_min/phy_bld look like a PHY version triple - confirm */
+	u8	phy_min;
+	u8	phy_bld;
+	u8	phy_type; /* enumerated PHY type, values 0x0..0x3d */
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_UNKNOWN          0x0UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_BASECR           0x1UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_BASEKR4          0x2UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_BASELR           0x3UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_BASESR           0x4UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_BASEKR2          0x5UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_BASEKX           0x6UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_BASEKR           0x7UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_BASET            0x8UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_BASETE           0x9UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_SGMIIEXTPHY      0xaUL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_25G_BASECR_CA_L  0xbUL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_25G_BASECR_CA_S  0xcUL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_25G_BASECR_CA_N  0xdUL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_25G_BASESR       0xeUL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_100G_BASECR4     0xfUL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_100G_BASESR4     0x10UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_100G_BASELR4     0x11UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_100G_BASEER4     0x12UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_100G_BASESR10    0x13UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_40G_BASECR4      0x14UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_40G_BASESR4      0x15UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_40G_BASELR4      0x16UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_40G_BASEER4      0x17UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_40G_ACTIVE_CABLE 0x18UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_1G_BASET         0x19UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_1G_BASESX        0x1aUL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_1G_BASECX        0x1bUL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_200G_BASECR4     0x1cUL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_200G_BASESR4     0x1dUL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_200G_BASELR4     0x1eUL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_200G_BASEER4     0x1fUL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_50G_BASECR       0x20UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_50G_BASESR       0x21UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_50G_BASELR       0x22UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_50G_BASEER       0x23UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_100G_BASECR2     0x24UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_100G_BASESR2     0x25UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_100G_BASELR2     0x26UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_100G_BASEER2     0x27UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_100G_BASECR      0x28UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_100G_BASESR      0x29UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_100G_BASELR      0x2aUL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_100G_BASEER      0x2bUL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_200G_BASECR2     0x2cUL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_200G_BASESR2     0x2dUL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_200G_BASELR2     0x2eUL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_200G_BASEER2     0x2fUL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_400G_BASECR8     0x30UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_400G_BASESR8     0x31UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_400G_BASELR8     0x32UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_400G_BASEER8     0x33UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_400G_BASECR4     0x34UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_400G_BASESR4     0x35UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_400G_BASELR4     0x36UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_400G_BASEER4     0x37UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_800G_BASECR8     0x38UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_800G_BASESR8     0x39UL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_800G_BASELR8     0x3aUL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_800G_BASEER8     0x3bUL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_800G_BASEFR8     0x3cUL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_800G_BASEDR8     0x3dUL
+	#define PORT_PHY_QCFG_RESP_PHY_TYPE_LAST            PORT_PHY_QCFG_RESP_PHY_TYPE_800G_BASEDR8
+	u8	media_type; /* enumerated media type */
+	#define PORT_PHY_QCFG_RESP_MEDIA_TYPE_UNKNOWN   0x0UL
+	#define PORT_PHY_QCFG_RESP_MEDIA_TYPE_TP        0x1UL
+	#define PORT_PHY_QCFG_RESP_MEDIA_TYPE_DAC       0x2UL
+	#define PORT_PHY_QCFG_RESP_MEDIA_TYPE_FIBRE     0x3UL
+	#define PORT_PHY_QCFG_RESP_MEDIA_TYPE_BACKPLANE 0x4UL
+	#define PORT_PHY_QCFG_RESP_MEDIA_TYPE_LAST     PORT_PHY_QCFG_RESP_MEDIA_TYPE_BACKPLANE
+	u8	xcvr_pkg_type; /* INTERNAL (0x1) or EXTERNAL (0x2); no define for 0x0 here */
+	#define PORT_PHY_QCFG_RESP_XCVR_PKG_TYPE_XCVR_INTERNAL 0x1UL
+	#define PORT_PHY_QCFG_RESP_XCVR_PKG_TYPE_XCVR_EXTERNAL 0x2UL
+	#define PORT_PHY_QCFG_RESP_XCVR_PKG_TYPE_LAST         PORT_PHY_QCFG_RESP_XCVR_PKG_TYPE_XCVR_EXTERNAL
+	u8	eee_config_phy_addr; /* packed: bits 4:0 PHY address, bits 7:5 EEE flags */
+	#define PORT_PHY_QCFG_RESP_PHY_ADDR_MASK              0x1fUL
+	#define PORT_PHY_QCFG_RESP_PHY_ADDR_SFT               0
+	#define PORT_PHY_QCFG_RESP_EEE_CONFIG_MASK            0xe0UL
+	#define PORT_PHY_QCFG_RESP_EEE_CONFIG_SFT             5
+	#define PORT_PHY_QCFG_RESP_EEE_CONFIG_EEE_ENABLED      0x20UL
+	#define PORT_PHY_QCFG_RESP_EEE_CONFIG_EEE_ACTIVE       0x40UL
+	#define PORT_PHY_QCFG_RESP_EEE_CONFIG_EEE_TX_LPI       0x80UL
+	u8	parallel_detect; /* only bit 0 is defined */
+	#define PORT_PHY_QCFG_RESP_PARALLEL_DETECT     0x1UL
+	__le16	link_partner_adv_speeds; /* bitmask with the same bit assignments as support_speeds */
+	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_100MBHD     0x1UL
+	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_100MB       0x2UL
+	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_1GBHD       0x4UL
+	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_1GB         0x8UL
+	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_2GB         0x10UL
+	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_2_5GB       0x20UL
+	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_10GB        0x40UL
+	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_20GB        0x80UL
+	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_25GB        0x100UL
+	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_40GB        0x200UL
+	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_50GB        0x400UL
+	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_100GB       0x800UL
+	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_10MBHD      0x1000UL
+	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_SPEEDS_10MB        0x2000UL
+	u8	link_partner_adv_auto_mode; /* same value set as auto_mode */
+	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_AUTO_MODE_NONE         0x0UL
+	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_AUTO_MODE_ALL_SPEEDS   0x1UL
+	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_AUTO_MODE_ONE_SPEED    0x2UL
+	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_AUTO_MODE_ONE_OR_BELOW 0x3UL
+	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_AUTO_MODE_SPEED_MASK   0x4UL
+	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_AUTO_MODE_LAST        PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_AUTO_MODE_SPEED_MASK
+	u8	link_partner_adv_pause; /* bit flags: TX and RX */
+	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_PAUSE_TX     0x1UL
+	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_PAUSE_RX     0x2UL
+	__le16	adv_eee_link_speed_mask; /* bitmask; the RSVD* bits carry no speed meaning here */
+	#define PORT_PHY_QCFG_RESP_ADV_EEE_LINK_SPEED_MASK_RSVD1     0x1UL
+	#define PORT_PHY_QCFG_RESP_ADV_EEE_LINK_SPEED_MASK_100MB     0x2UL
+	#define PORT_PHY_QCFG_RESP_ADV_EEE_LINK_SPEED_MASK_RSVD2     0x4UL
+	#define PORT_PHY_QCFG_RESP_ADV_EEE_LINK_SPEED_MASK_1GB       0x8UL
+	#define PORT_PHY_QCFG_RESP_ADV_EEE_LINK_SPEED_MASK_RSVD3     0x10UL
+	#define PORT_PHY_QCFG_RESP_ADV_EEE_LINK_SPEED_MASK_RSVD4     0x20UL
+	#define PORT_PHY_QCFG_RESP_ADV_EEE_LINK_SPEED_MASK_10GB      0x40UL
+	__le16	link_partner_adv_eee_link_speed_mask; /* same bit layout as adv_eee_link_speed_mask */
+	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_EEE_LINK_SPEED_MASK_RSVD1     0x1UL
+	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_EEE_LINK_SPEED_MASK_100MB     0x2UL
+	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_EEE_LINK_SPEED_MASK_RSVD2     0x4UL
+	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_EEE_LINK_SPEED_MASK_1GB       0x8UL
+	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_EEE_LINK_SPEED_MASK_RSVD3     0x10UL
+	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_EEE_LINK_SPEED_MASK_RSVD4     0x20UL
+	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_ADV_EEE_LINK_SPEED_MASK_10GB      0x40UL
+	__le32	xcvr_identifier_type_tx_lpi_timer; /* packed: bits 23:0 TX LPI timer, bits 31:24 transceiver identifier type */
+	#define PORT_PHY_QCFG_RESP_TX_LPI_TIMER_MASK            0xffffffUL
+	#define PORT_PHY_QCFG_RESP_TX_LPI_TIMER_SFT             0
+	#define PORT_PHY_QCFG_RESP_XCVR_IDENTIFIER_TYPE_MASK    0xff000000UL
+	#define PORT_PHY_QCFG_RESP_XCVR_IDENTIFIER_TYPE_SFT     24
+	#define PORT_PHY_QCFG_RESP_XCVR_IDENTIFIER_TYPE_UNKNOWN   (0x0UL << 24)
+	#define PORT_PHY_QCFG_RESP_XCVR_IDENTIFIER_TYPE_SFP       (0x3UL << 24)
+	#define PORT_PHY_QCFG_RESP_XCVR_IDENTIFIER_TYPE_QSFP      (0xcUL << 24)
+	#define PORT_PHY_QCFG_RESP_XCVR_IDENTIFIER_TYPE_QSFPPLUS  (0xdUL << 24)
+	#define PORT_PHY_QCFG_RESP_XCVR_IDENTIFIER_TYPE_QSFP28    (0x11UL << 24)
+	#define PORT_PHY_QCFG_RESP_XCVR_IDENTIFIER_TYPE_QSFPDD    (0x18UL << 24)
+	#define PORT_PHY_QCFG_RESP_XCVR_IDENTIFIER_TYPE_QSFP112   (0x1eUL << 24)
+	#define PORT_PHY_QCFG_RESP_XCVR_IDENTIFIER_TYPE_SFPDD     (0x1fUL << 24)
+	#define PORT_PHY_QCFG_RESP_XCVR_IDENTIFIER_TYPE_CSFP      (0x20UL << 24)
+	#define PORT_PHY_QCFG_RESP_XCVR_IDENTIFIER_TYPE_LAST     PORT_PHY_QCFG_RESP_XCVR_IDENTIFIER_TYPE_CSFP
+	__le16	fec_cfg; /* bitmask; most FEC modes have separate SUPPORTED and ENABLED bits */
+	#define PORT_PHY_QCFG_RESP_FEC_CFG_FEC_NONE_SUPPORTED           0x1UL
+	#define PORT_PHY_QCFG_RESP_FEC_CFG_FEC_AUTONEG_SUPPORTED        0x2UL
+	#define PORT_PHY_QCFG_RESP_FEC_CFG_FEC_AUTONEG_ENABLED          0x4UL
+	#define PORT_PHY_QCFG_RESP_FEC_CFG_FEC_CLAUSE74_SUPPORTED       0x8UL
+	#define PORT_PHY_QCFG_RESP_FEC_CFG_FEC_CLAUSE74_ENABLED         0x10UL
+	#define PORT_PHY_QCFG_RESP_FEC_CFG_FEC_CLAUSE91_SUPPORTED       0x20UL
+	#define PORT_PHY_QCFG_RESP_FEC_CFG_FEC_CLAUSE91_ENABLED         0x40UL
+	#define PORT_PHY_QCFG_RESP_FEC_CFG_FEC_RS544_1XN_SUPPORTED      0x80UL
+	#define PORT_PHY_QCFG_RESP_FEC_CFG_FEC_RS544_1XN_ENABLED        0x100UL
+	#define PORT_PHY_QCFG_RESP_FEC_CFG_FEC_RS544_IEEE_SUPPORTED     0x200UL
+	#define PORT_PHY_QCFG_RESP_FEC_CFG_FEC_RS544_IEEE_ENABLED       0x400UL
+	#define PORT_PHY_QCFG_RESP_FEC_CFG_FEC_RS272_1XN_SUPPORTED      0x800UL
+	#define PORT_PHY_QCFG_RESP_FEC_CFG_FEC_RS272_1XN_ENABLED        0x1000UL
+	#define PORT_PHY_QCFG_RESP_FEC_CFG_FEC_RS272_IEEE_SUPPORTED     0x2000UL
+	#define PORT_PHY_QCFG_RESP_FEC_CFG_FEC_RS272_IEEE_ENABLED       0x4000UL
+	u8	duplex_state; /* HALF or FULL */
+	#define PORT_PHY_QCFG_RESP_DUPLEX_STATE_HALF 0x0UL
+	#define PORT_PHY_QCFG_RESP_DUPLEX_STATE_FULL 0x1UL
+	#define PORT_PHY_QCFG_RESP_DUPLEX_STATE_LAST PORT_PHY_QCFG_RESP_DUPLEX_STATE_FULL
+	u8	option_flags; /* bit flags */
+	#define PORT_PHY_QCFG_RESP_OPTION_FLAGS_MEDIA_AUTO_DETECT     0x1UL
+	#define PORT_PHY_QCFG_RESP_OPTION_FLAGS_SIGNAL_MODE_KNOWN     0x2UL
+	#define PORT_PHY_QCFG_RESP_OPTION_FLAGS_SPEEDS2_SUPPORTED     0x4UL
+	char	phy_vendor_name[16]; /* fixed 16-byte field; NOTE(review): NUL termination is not shown here - confirm before treating as a C string */
+	char	phy_vendor_partnumber[16]; /* fixed 16-byte field; same caveat as phy_vendor_name */
+	__le16	support_pam4_speeds; /* bitmask */
+	#define PORT_PHY_QCFG_RESP_SUPPORT_PAM4_SPEEDS_50G      0x1UL
+	#define PORT_PHY_QCFG_RESP_SUPPORT_PAM4_SPEEDS_100G     0x2UL
+	#define PORT_PHY_QCFG_RESP_SUPPORT_PAM4_SPEEDS_200G     0x4UL
+	__le16	force_pam4_link_speed; /* same code values as the matching link_speed defines */
+	#define PORT_PHY_QCFG_RESP_FORCE_PAM4_LINK_SPEED_50GB  0x1f4UL
+	#define PORT_PHY_QCFG_RESP_FORCE_PAM4_LINK_SPEED_100GB 0x3e8UL
+	#define PORT_PHY_QCFG_RESP_FORCE_PAM4_LINK_SPEED_200GB 0x7d0UL
+	#define PORT_PHY_QCFG_RESP_FORCE_PAM4_LINK_SPEED_LAST PORT_PHY_QCFG_RESP_FORCE_PAM4_LINK_SPEED_200GB
+	__le16	auto_pam4_link_speed_mask; /* bitmask with the same bit assignments as support_pam4_speeds */
+	#define PORT_PHY_QCFG_RESP_AUTO_PAM4_LINK_SPEED_MASK_50G      0x1UL
+	#define PORT_PHY_QCFG_RESP_AUTO_PAM4_LINK_SPEED_MASK_100G     0x2UL
+	#define PORT_PHY_QCFG_RESP_AUTO_PAM4_LINK_SPEED_MASK_200G     0x4UL
+	u8	link_partner_pam4_adv_speeds; /* bitmask; note this one is only 8 bits wide */
+	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_PAM4_ADV_SPEEDS_50GB      0x1UL
+	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_PAM4_ADV_SPEEDS_100GB     0x2UL
+	#define PORT_PHY_QCFG_RESP_LINK_PARTNER_PAM4_ADV_SPEEDS_200GB     0x4UL
+	u8	link_down_reason; /* bit flags */
+	#define PORT_PHY_QCFG_RESP_LINK_DOWN_REASON_RF                      0x1UL
+	#define PORT_PHY_QCFG_RESP_LINK_DOWN_REASON_OTP_SPEED_VIOLATION     0x2UL
+	__le16	support_speeds2; /* bitmask; bit assignments differ from support_speeds */
+	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS2_1GB                0x1UL
+	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS2_10GB               0x2UL
+	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS2_25GB               0x4UL
+	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS2_40GB               0x8UL
+	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS2_50GB               0x10UL
+	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS2_100GB              0x20UL
+	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS2_50GB_PAM4_56       0x40UL
+	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS2_100GB_PAM4_56      0x80UL
+	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS2_200GB_PAM4_56      0x100UL
+	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS2_400GB_PAM4_56      0x200UL
+	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS2_100GB_PAM4_112     0x400UL
+	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS2_200GB_PAM4_112     0x800UL
+	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS2_400GB_PAM4_112     0x1000UL
+	#define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS2_800GB_PAM4_112     0x2000UL
+	__le16	force_link_speeds2; /* codes: speed in 100 Mb/s units, +1 for PAM4_56 and +2 for PAM4_112 variants */
+	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEEDS2_1GB            0xaUL
+	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEEDS2_10GB           0x64UL
+	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEEDS2_25GB           0xfaUL
+	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEEDS2_40GB           0x190UL
+	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEEDS2_50GB           0x1f4UL
+	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEEDS2_100GB          0x3e8UL
+	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEEDS2_50GB_PAM4_56   0x1f5UL
+	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEEDS2_100GB_PAM4_56  0x3e9UL
+	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEEDS2_200GB_PAM4_56  0x7d1UL
+	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEEDS2_400GB_PAM4_56  0xfa1UL
+	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEEDS2_100GB_PAM4_112 0x3eaUL
+	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEEDS2_200GB_PAM4_112 0x7d2UL
+	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEEDS2_400GB_PAM4_112 0xfa2UL
+	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEEDS2_800GB_PAM4_112 0x1f42UL
+	#define PORT_PHY_QCFG_RESP_FORCE_LINK_SPEEDS2_LAST          PORT_PHY_QCFG_RESP_FORCE_LINK_SPEEDS2_800GB_PAM4_112
+	__le16	auto_link_speeds2; /* bitmask with the same bit assignments as support_speeds2 */
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEEDS2_1GB                0x1UL
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEEDS2_10GB               0x2UL
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEEDS2_25GB               0x4UL
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEEDS2_40GB               0x8UL
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEEDS2_50GB               0x10UL
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEEDS2_100GB              0x20UL
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEEDS2_50GB_PAM4_56       0x40UL
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEEDS2_100GB_PAM4_56      0x80UL
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEEDS2_200GB_PAM4_56      0x100UL
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEEDS2_400GB_PAM4_56      0x200UL
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEEDS2_100GB_PAM4_112     0x400UL
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEEDS2_200GB_PAM4_112     0x800UL
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEEDS2_400GB_PAM4_112     0x1000UL
+	#define PORT_PHY_QCFG_RESP_AUTO_LINK_SPEEDS2_800GB_PAM4_112     0x2000UL
+	u8	active_lanes;
+	u8	valid; /* last byte of the struct; NOTE(review): presumably a response-complete marker - confirm against the HWRM spec */
+};
+
+/* hwrm_port_mac_cfg_input (size:448b/56B): request layout; flags and enables are bitmasks, explicit pad bytes keep wider fields aligned */
+struct hwrm_port_mac_cfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	flags; /* bitmask; most features have separate ENABLE and DISABLE bits */
+	#define PORT_MAC_CFG_REQ_FLAGS_MATCH_LINK                    0x1UL
+	#define PORT_MAC_CFG_REQ_FLAGS_VLAN_PRI2COS_ENABLE           0x2UL
+	#define PORT_MAC_CFG_REQ_FLAGS_TUNNEL_PRI2COS_ENABLE         0x4UL
+	#define PORT_MAC_CFG_REQ_FLAGS_IP_DSCP2COS_ENABLE            0x8UL
+	#define PORT_MAC_CFG_REQ_FLAGS_PTP_RX_TS_CAPTURE_ENABLE      0x10UL
+	#define PORT_MAC_CFG_REQ_FLAGS_PTP_RX_TS_CAPTURE_DISABLE     0x20UL
+	#define PORT_MAC_CFG_REQ_FLAGS_PTP_TX_TS_CAPTURE_ENABLE      0x40UL
+	#define PORT_MAC_CFG_REQ_FLAGS_PTP_TX_TS_CAPTURE_DISABLE     0x80UL
+	#define PORT_MAC_CFG_REQ_FLAGS_OOB_WOL_ENABLE                0x100UL
+	#define PORT_MAC_CFG_REQ_FLAGS_OOB_WOL_DISABLE               0x200UL
+	#define PORT_MAC_CFG_REQ_FLAGS_VLAN_PRI2COS_DISABLE          0x400UL
+	#define PORT_MAC_CFG_REQ_FLAGS_TUNNEL_PRI2COS_DISABLE        0x800UL
+	#define PORT_MAC_CFG_REQ_FLAGS_IP_DSCP2COS_DISABLE           0x1000UL
+	#define PORT_MAC_CFG_REQ_FLAGS_PTP_ONE_STEP_TX_TS            0x2000UL
+	#define PORT_MAC_CFG_REQ_FLAGS_ALL_RX_TS_CAPTURE_ENABLE      0x4000UL
+	#define PORT_MAC_CFG_REQ_FLAGS_ALL_RX_TS_CAPTURE_DISABLE     0x8000UL
+	__le32	enables; /* bitmask named after the fields below; bit 0x8 has no define here */
+	#define PORT_MAC_CFG_REQ_ENABLES_IPG                            0x1UL
+	#define PORT_MAC_CFG_REQ_ENABLES_LPBK                           0x2UL
+	#define PORT_MAC_CFG_REQ_ENABLES_VLAN_PRI2COS_MAP_PRI           0x4UL
+	#define PORT_MAC_CFG_REQ_ENABLES_TUNNEL_PRI2COS_MAP_PRI         0x10UL
+	#define PORT_MAC_CFG_REQ_ENABLES_DSCP2COS_MAP_PRI               0x20UL
+	#define PORT_MAC_CFG_REQ_ENABLES_RX_TS_CAPTURE_PTP_MSG_TYPE     0x40UL
+	#define PORT_MAC_CFG_REQ_ENABLES_TX_TS_CAPTURE_PTP_MSG_TYPE     0x80UL
+	#define PORT_MAC_CFG_REQ_ENABLES_COS_FIELD_CFG                  0x100UL
+	#define PORT_MAC_CFG_REQ_ENABLES_PTP_FREQ_ADJ_PPB               0x200UL
+	#define PORT_MAC_CFG_REQ_ENABLES_PTP_ADJ_PHASE                  0x400UL
+	#define PORT_MAC_CFG_REQ_ENABLES_PTP_LOAD_CONTROL               0x800UL
+	__le16	port_id;
+	u8	ipg;
+	u8	lpbk; /* loopback selector: NONE, LOCAL or REMOTE */
+	#define PORT_MAC_CFG_REQ_LPBK_NONE   0x0UL
+	#define PORT_MAC_CFG_REQ_LPBK_LOCAL  0x1UL
+	#define PORT_MAC_CFG_REQ_LPBK_REMOTE 0x2UL
+	#define PORT_MAC_CFG_REQ_LPBK_LAST  PORT_MAC_CFG_REQ_LPBK_REMOTE
+	u8	vlan_pri2cos_map_pri;
+	u8	reserved1;
+	u8	tunnel_pri2cos_map_pri;
+	u8	dscp2pri_map_pri; /* NOTE(review): presumably gated by ENABLES_DSCP2COS_MAP_PRI despite the dscp2pri/dscp2cos name difference - confirm */
+	__le16	rx_ts_capture_ptp_msg_type;
+	__le16	tx_ts_capture_ptp_msg_type;
+	u8	cos_field_cfg; /* packed: bit 0 reserved, bits 2:1 VLAN_PRI_SEL, bits 4:3 T_VLAN_PRI_SEL, bits 7:5 DEFAULT_COS */
+	#define PORT_MAC_CFG_REQ_COS_FIELD_CFG_RSVD1                     0x1UL
+	#define PORT_MAC_CFG_REQ_COS_FIELD_CFG_VLAN_PRI_SEL_MASK         0x6UL
+	#define PORT_MAC_CFG_REQ_COS_FIELD_CFG_VLAN_PRI_SEL_SFT          1
+	#define PORT_MAC_CFG_REQ_COS_FIELD_CFG_VLAN_PRI_SEL_INNERMOST      (0x0UL << 1)
+	#define PORT_MAC_CFG_REQ_COS_FIELD_CFG_VLAN_PRI_SEL_OUTER          (0x1UL << 1)
+	#define PORT_MAC_CFG_REQ_COS_FIELD_CFG_VLAN_PRI_SEL_OUTERMOST      (0x2UL << 1)
+	#define PORT_MAC_CFG_REQ_COS_FIELD_CFG_VLAN_PRI_SEL_UNSPECIFIED    (0x3UL << 1)
+	#define PORT_MAC_CFG_REQ_COS_FIELD_CFG_VLAN_PRI_SEL_LAST          PORT_MAC_CFG_REQ_COS_FIELD_CFG_VLAN_PRI_SEL_UNSPECIFIED
+	#define PORT_MAC_CFG_REQ_COS_FIELD_CFG_T_VLAN_PRI_SEL_MASK       0x18UL
+	#define PORT_MAC_CFG_REQ_COS_FIELD_CFG_T_VLAN_PRI_SEL_SFT        3
+	#define PORT_MAC_CFG_REQ_COS_FIELD_CFG_T_VLAN_PRI_SEL_INNERMOST    (0x0UL << 3)
+	#define PORT_MAC_CFG_REQ_COS_FIELD_CFG_T_VLAN_PRI_SEL_OUTER        (0x1UL << 3)
+	#define PORT_MAC_CFG_REQ_COS_FIELD_CFG_T_VLAN_PRI_SEL_OUTERMOST    (0x2UL << 3)
+	#define PORT_MAC_CFG_REQ_COS_FIELD_CFG_T_VLAN_PRI_SEL_UNSPECIFIED  (0x3UL << 3)
+	#define PORT_MAC_CFG_REQ_COS_FIELD_CFG_T_VLAN_PRI_SEL_LAST        PORT_MAC_CFG_REQ_COS_FIELD_CFG_T_VLAN_PRI_SEL_UNSPECIFIED
+	#define PORT_MAC_CFG_REQ_COS_FIELD_CFG_DEFAULT_COS_MASK          0xe0UL
+	#define PORT_MAC_CFG_REQ_COS_FIELD_CFG_DEFAULT_COS_SFT           5
+	u8	unused_0[3]; /* pad: puts ptp_freq_adj_ppb at byte offset 40 */
+	__le32	ptp_freq_adj_ppb;
+	u8	unused_1[3]; /* pad: ptp_load_control lands at offset 47, ptp_adj_phase at 48 */
+	u8	ptp_load_control; /* enumerated: NONE, IMMEDIATE or PPS_EVENT */
+	#define PORT_MAC_CFG_REQ_PTP_LOAD_CONTROL_NONE      0x0UL
+	#define PORT_MAC_CFG_REQ_PTP_LOAD_CONTROL_IMMEDIATE 0x1UL
+	#define PORT_MAC_CFG_REQ_PTP_LOAD_CONTROL_PPS_EVENT 0x2UL
+	#define PORT_MAC_CFG_REQ_PTP_LOAD_CONTROL_LAST     PORT_MAC_CFG_REQ_PTP_LOAD_CONTROL_PPS_EVENT
+	__le64	ptp_adj_phase;
+};
+
+/* hwrm_port_mac_cfg_output (size:128b/16B): response layout; 8 bytes of header fields followed by mru, mtu, ipg and lpbk */
+struct hwrm_port_mac_cfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le16	mru;
+	__le16	mtu;
+	u8	ipg;
+	u8	lpbk; /* loopback selector: NONE, LOCAL or REMOTE (same values as the request-side LPBK defines) */
+	#define PORT_MAC_CFG_RESP_LPBK_NONE   0x0UL
+	#define PORT_MAC_CFG_RESP_LPBK_LOCAL  0x1UL
+	#define PORT_MAC_CFG_RESP_LPBK_REMOTE 0x2UL
+	#define PORT_MAC_CFG_RESP_LPBK_LAST  PORT_MAC_CFG_RESP_LPBK_REMOTE
+	u8	unused_0; /* pad so that valid is the final (16th) byte */
+	u8	valid; /* NOTE(review): presumably a response-complete marker - confirm against the HWRM spec */
+};
+
+/* hwrm_port_mac_ptp_qcfg_input (size:192b/24B): request layout; port_id is the only field after the five leading header fields */
+struct hwrm_port_mac_ptp_qcfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	port_id; /* little-endian port identifier */
+	u8	unused_0[6]; /* pads the struct to the 24 bytes stated above */
+};
+
+/* hwrm_port_mac_ptp_qcfg_output (size:704b/88B): response layout; a flags byte followed by seventeen little-endian 32-bit fields */
+struct hwrm_port_mac_ptp_qcfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	flags; /* bit flags; bit 0x2 has no define here */
+	#define PORT_MAC_PTP_QCFG_RESP_FLAGS_DIRECT_ACCESS                       0x1UL
+	#define PORT_MAC_PTP_QCFG_RESP_FLAGS_ONE_STEP_TX_TS                      0x4UL
+	#define PORT_MAC_PTP_QCFG_RESP_FLAGS_HWRM_ACCESS                         0x8UL
+	#define PORT_MAC_PTP_QCFG_RESP_FLAGS_PARTIAL_DIRECT_ACCESS_REF_CLOCK     0x10UL
+	#define PORT_MAC_PTP_QCFG_RESP_FLAGS_RTC_CONFIGURED                      0x20UL
+	#define PORT_MAC_PTP_QCFG_RESP_FLAGS_64B_PHC_TIME                        0x40UL
+	u8	unused_0[3]; /* pad so the 32-bit fields start at byte offset 12 */
+	__le32	rx_ts_reg_off_lower; /* NOTE(review): the *_reg_off_* fields are presumably register offsets for RX/TX timestamp access - confirm */
+	__le32	rx_ts_reg_off_upper;
+	__le32	rx_ts_reg_off_seq_id;
+	__le32	rx_ts_reg_off_src_id_0;
+	__le32	rx_ts_reg_off_src_id_1;
+	__le32	rx_ts_reg_off_src_id_2;
+	__le32	rx_ts_reg_off_domain_id;
+	__le32	rx_ts_reg_off_fifo;
+	__le32	rx_ts_reg_off_fifo_adv;
+	__le32	rx_ts_reg_off_granularity;
+	__le32	tx_ts_reg_off_lower;
+	__le32	tx_ts_reg_off_upper;
+	__le32	tx_ts_reg_off_seq_id;
+	__le32	tx_ts_reg_off_fifo;
+	__le32	tx_ts_reg_off_granularity;
+	__le32	ts_ref_clock_reg_lower;
+	__le32	ts_ref_clock_reg_upper;
+	u8	unused_1[7]; /* pad so that valid is the final (88th) byte */
+	u8	valid; /* NOTE(review): presumably a response-complete marker - confirm against the HWRM spec */
+};
+
+/* tx_port_stats (size:3264b/408B): 51 little-endian 64-bit transmit counters (51 * 8 = 408 bytes), no padding fields */
+struct tx_port_stats {
+	__le64	tx_64b_frames; /* frame-size buckets run from here through tx_9217b_16383b_frames, with tx_good_vlan_frames interleaved */
+	__le64	tx_65b_127b_frames;
+	__le64	tx_128b_255b_frames;
+	__le64	tx_256b_511b_frames;
+	__le64	tx_512b_1023b_frames;
+	__le64	tx_1024b_1518b_frames;
+	__le64	tx_good_vlan_frames;
+	__le64	tx_1519b_2047b_frames;
+	__le64	tx_2048b_4095b_frames;
+	__le64	tx_4096b_9216b_frames;
+	__le64	tx_9217b_16383b_frames;
+	__le64	tx_good_frames;
+	__le64	tx_total_frames;
+	__le64	tx_ucast_frames;
+	__le64	tx_mcast_frames;
+	__le64	tx_bcast_frames;
+	__le64	tx_pause_frames;
+	__le64	tx_pfc_frames;
+	__le64	tx_jabber_frames;
+	__le64	tx_fcs_err_frames;
+	__le64	tx_control_frames;
+	__le64	tx_oversz_frames;
+	__le64	tx_single_dfrl_frames;
+	__le64	tx_multi_dfrl_frames;
+	__le64	tx_single_coll_frames;
+	__le64	tx_multi_coll_frames;
+	__le64	tx_late_coll_frames;
+	__le64	tx_excessive_coll_frames;
+	__le64	tx_frag_frames;
+	__le64	tx_err;
+	__le64	tx_tagged_frames;
+	__le64	tx_dbl_tagged_frames;
+	__le64	tx_runt_frames;
+	__le64	tx_fifo_underruns;
+	__le64	tx_pfc_ena_frames_pri0; /* one counter per priority 0..7 */
+	__le64	tx_pfc_ena_frames_pri1;
+	__le64	tx_pfc_ena_frames_pri2;
+	__le64	tx_pfc_ena_frames_pri3;
+	__le64	tx_pfc_ena_frames_pri4;
+	__le64	tx_pfc_ena_frames_pri5;
+	__le64	tx_pfc_ena_frames_pri6;
+	__le64	tx_pfc_ena_frames_pri7;
+	__le64	tx_eee_lpi_events;
+	__le64	tx_eee_lpi_duration;
+	__le64	tx_llfc_logical_msgs;
+	__le64	tx_hcfc_msgs;
+	__le64	tx_total_collisions;
+	__le64	tx_bytes;
+	__le64	tx_xthol_frames;
+	__le64	tx_stat_discard;
+	__le64	tx_stat_error;
+};
+
+/* rx_port_stats (size:4224b/528B): 66 little-endian 64-bit receive counters (66 * 8 = 528 bytes), no padding fields */
+struct rx_port_stats {
+	__le64	rx_64b_frames; /* frame-size buckets run from here through rx_9217b_16383b_frames, with rx_good_vlan_frames interleaved */
+	__le64	rx_65b_127b_frames;
+	__le64	rx_128b_255b_frames;
+	__le64	rx_256b_511b_frames;
+	__le64	rx_512b_1023b_frames;
+	__le64	rx_1024b_1518b_frames;
+	__le64	rx_good_vlan_frames;
+	__le64	rx_1519b_2047b_frames;
+	__le64	rx_2048b_4095b_frames;
+	__le64	rx_4096b_9216b_frames;
+	__le64	rx_9217b_16383b_frames;
+	__le64	rx_total_frames;
+	__le64	rx_ucast_frames;
+	__le64	rx_mcast_frames;
+	__le64	rx_bcast_frames;
+	__le64	rx_fcs_err_frames;
+	__le64	rx_ctrl_frames;
+	__le64	rx_pause_frames;
+	__le64	rx_pfc_frames;
+	__le64	rx_unsupported_opcode_frames;
+	__le64	rx_unsupported_da_pausepfc_frames;
+	__le64	rx_wrong_sa_frames;
+	__le64	rx_align_err_frames;
+	__le64	rx_oor_len_frames;
+	__le64	rx_code_err_frames;
+	__le64	rx_false_carrier_frames;
+	__le64	rx_ovrsz_frames;
+	__le64	rx_jbr_frames;
+	__le64	rx_mtu_err_frames;
+	__le64	rx_match_crc_frames;
+	__le64	rx_promiscuous_frames;
+	__le64	rx_tagged_frames;
+	__le64	rx_double_tagged_frames;
+	__le64	rx_trunc_frames;
+	__le64	rx_good_frames; /* note: unlike tx_port_stats, the good-frames counter sits after the error counters */
+	__le64	rx_pfc_xon2xoff_frames_pri0; /* one counter per priority 0..7 */
+	__le64	rx_pfc_xon2xoff_frames_pri1;
+	__le64	rx_pfc_xon2xoff_frames_pri2;
+	__le64	rx_pfc_xon2xoff_frames_pri3;
+	__le64	rx_pfc_xon2xoff_frames_pri4;
+	__le64	rx_pfc_xon2xoff_frames_pri5;
+	__le64	rx_pfc_xon2xoff_frames_pri6;
+	__le64	rx_pfc_xon2xoff_frames_pri7;
+	__le64	rx_pfc_ena_frames_pri0; /* one counter per priority 0..7 */
+	__le64	rx_pfc_ena_frames_pri1;
+	__le64	rx_pfc_ena_frames_pri2;
+	__le64	rx_pfc_ena_frames_pri3;
+	__le64	rx_pfc_ena_frames_pri4;
+	__le64	rx_pfc_ena_frames_pri5;
+	__le64	rx_pfc_ena_frames_pri6;
+	__le64	rx_pfc_ena_frames_pri7;
+	__le64	rx_sch_crc_err_frames;
+	__le64	rx_undrsz_frames;
+	__le64	rx_frag_frames;
+	__le64	rx_eee_lpi_events;
+	__le64	rx_eee_lpi_duration;
+	__le64	rx_llfc_physical_msgs;
+	__le64	rx_llfc_logical_msgs;
+	__le64	rx_llfc_msgs_with_crc_err;
+	__le64	rx_hcfc_msgs;
+	__le64	rx_hcfc_msgs_with_crc_err;
+	__le64	rx_bytes;
+	__le64	rx_runt_bytes;
+	__le64	rx_runt_frames;
+	__le64	rx_stat_discard;
+	__le64	rx_stat_err;
+};
+
+/* hwrm_port_qstats_input (size:320b/40B): request layout; port_id and flags plus two 64-bit host addresses */
+struct hwrm_port_qstats_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	port_id;
+	u8	flags; /* only bit 0 (COUNTER_MASK) is defined */
+	#define PORT_QSTATS_REQ_FLAGS_COUNTER_MASK     0x1UL
+	u8	unused_0[5]; /* pad: puts tx_stat_host_addr at byte offset 24 */
+	__le64	tx_stat_host_addr; /* NOTE(review): presumably the host buffer address for struct tx_port_stats - confirm */
+	__le64	rx_stat_host_addr; /* NOTE(review): presumably the host buffer address for struct rx_port_stats - confirm */
+};
+
+/* hwrm_port_qstats_output (size:128b/16B): response layout; 8 bytes of header fields followed by the two stat sizes and a flags byte */
+struct hwrm_port_qstats_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le16	tx_stat_size; /* NOTE(review): unit is not shown here; presumably bytes - confirm */
+	__le16	rx_stat_size; /* NOTE(review): unit is not shown here; presumably bytes - confirm */
+	u8	flags; /* only bit 0 (CLEARED) is defined */
+	#define PORT_QSTATS_RESP_FLAGS_CLEARED     0x1UL
+	u8	unused_0[2]; /* pad so that valid is the final (16th) byte */
+	u8	valid; /* NOTE(review): presumably a response-complete marker - confirm against the HWRM spec */
+};
+
+/* tx_port_stats_ext (size:2048b/256B): 32 little-endian 64-bit counters: tx bytes and packets per CoS (cos0-7), then PFC tx duration_us/transitions per priority (pri0-7). */
+struct tx_port_stats_ext {
+	__le64	tx_bytes_cos0;
+	__le64	tx_bytes_cos1;
+	__le64	tx_bytes_cos2;
+	__le64	tx_bytes_cos3;
+	__le64	tx_bytes_cos4;
+	__le64	tx_bytes_cos5;
+	__le64	tx_bytes_cos6;
+	__le64	tx_bytes_cos7;
+	__le64	tx_packets_cos0;
+	__le64	tx_packets_cos1;
+	__le64	tx_packets_cos2;
+	__le64	tx_packets_cos3;
+	__le64	tx_packets_cos4;
+	__le64	tx_packets_cos5;
+	__le64	tx_packets_cos6;
+	__le64	tx_packets_cos7;
+	__le64	pfc_pri0_tx_duration_us;
+	__le64	pfc_pri0_tx_transitions;
+	__le64	pfc_pri1_tx_duration_us;
+	__le64	pfc_pri1_tx_transitions;
+	__le64	pfc_pri2_tx_duration_us;
+	__le64	pfc_pri2_tx_transitions;
+	__le64	pfc_pri3_tx_duration_us;
+	__le64	pfc_pri3_tx_transitions;
+	__le64	pfc_pri4_tx_duration_us;
+	__le64	pfc_pri4_tx_transitions;
+	__le64	pfc_pri5_tx_duration_us;
+	__le64	pfc_pri5_tx_transitions;
+	__le64	pfc_pri6_tx_duration_us;
+	__le64	pfc_pri6_tx_transitions;
+	__le64	pfc_pri7_tx_duration_us;
+	__le64	pfc_pri7_tx_transitions;
+};
+
+/* rx_port_stats_ext (size:3904b/488B): 61 little-endian 64-bit counters: link/pause events, rx bytes/packets per CoS, PFC rx duration_us/transitions per priority, bit/symbol/FEC counters and discard bytes/packets per CoS. */
+struct rx_port_stats_ext {
+	__le64	link_down_events;
+	__le64	continuous_pause_events;
+	__le64	resume_pause_events;
+	__le64	continuous_roce_pause_events;
+	__le64	resume_roce_pause_events;
+	__le64	rx_bytes_cos0;
+	__le64	rx_bytes_cos1;
+	__le64	rx_bytes_cos2;
+	__le64	rx_bytes_cos3;
+	__le64	rx_bytes_cos4;
+	__le64	rx_bytes_cos5;
+	__le64	rx_bytes_cos6;
+	__le64	rx_bytes_cos7;
+	__le64	rx_packets_cos0;
+	__le64	rx_packets_cos1;
+	__le64	rx_packets_cos2;
+	__le64	rx_packets_cos3;
+	__le64	rx_packets_cos4;
+	__le64	rx_packets_cos5;
+	__le64	rx_packets_cos6;
+	__le64	rx_packets_cos7;
+	__le64	pfc_pri0_rx_duration_us;
+	__le64	pfc_pri0_rx_transitions;
+	__le64	pfc_pri1_rx_duration_us;
+	__le64	pfc_pri1_rx_transitions;
+	__le64	pfc_pri2_rx_duration_us;
+	__le64	pfc_pri2_rx_transitions;
+	__le64	pfc_pri3_rx_duration_us;
+	__le64	pfc_pri3_rx_transitions;
+	__le64	pfc_pri4_rx_duration_us;
+	__le64	pfc_pri4_rx_transitions;
+	__le64	pfc_pri5_rx_duration_us;
+	__le64	pfc_pri5_rx_transitions;
+	__le64	pfc_pri6_rx_duration_us;
+	__le64	pfc_pri6_rx_transitions;
+	__le64	pfc_pri7_rx_duration_us;
+	__le64	pfc_pri7_rx_transitions;
+	__le64	rx_bits;
+	__le64	rx_buffer_passed_threshold;
+	__le64	rx_pcs_symbol_err;
+	__le64	rx_corrected_bits;
+	__le64	rx_discard_bytes_cos0;
+	__le64	rx_discard_bytes_cos1;
+	__le64	rx_discard_bytes_cos2;
+	__le64	rx_discard_bytes_cos3;
+	__le64	rx_discard_bytes_cos4;
+	__le64	rx_discard_bytes_cos5;
+	__le64	rx_discard_bytes_cos6;
+	__le64	rx_discard_bytes_cos7;
+	__le64	rx_discard_packets_cos0;
+	__le64	rx_discard_packets_cos1;
+	__le64	rx_discard_packets_cos2;
+	__le64	rx_discard_packets_cos3;
+	__le64	rx_discard_packets_cos4;
+	__le64	rx_discard_packets_cos5;
+	__le64	rx_discard_packets_cos6;
+	__le64	rx_discard_packets_cos7;
+	__le64	rx_fec_corrected_blocks;
+	__le64	rx_fec_uncorrectable_blocks;
+	__le64	rx_filter_miss;
+	__le64	rx_fec_symbol_err;
+};
+
+/* hwrm_port_qstats_ext_input (size:320b/40B): common request header, then port_id, tx/rx stat sizes, flags (COUNTER_MASK bit) and the tx/rx stat host addresses. */
+struct hwrm_port_qstats_ext_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	port_id;
+	__le16	tx_stat_size;
+	__le16	rx_stat_size;
+	u8	flags;
+	#define PORT_QSTATS_EXT_REQ_FLAGS_COUNTER_MASK     0x1UL
+	u8	unused_0;
+	__le64	tx_stat_host_addr;
+	__le64	rx_stat_host_addr;
+};
+
+/* hwrm_port_qstats_ext_output (size:128b/16B): common response header, then tx/rx stat sizes, total_active_cos_queues, flags and the trailing valid byte. */
+struct hwrm_port_qstats_ext_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le16	tx_stat_size;
+	__le16	rx_stat_size;
+	__le16	total_active_cos_queues;
+	u8	flags;
+	#define PORT_QSTATS_EXT_RESP_FLAGS_CLEAR_ROCE_COUNTERS_SUPPORTED     0x1UL
+	#define PORT_QSTATS_EXT_RESP_FLAGS_CLEARED                           0x2UL
+	u8	valid;
+};
+
+/* hwrm_port_lpbk_qstats_input (size:256b/32B): common request header, then lpbk_stat_size, flags (COUNTER_MASK bit), padding and lpbk_stat_host_addr. */
+struct hwrm_port_lpbk_qstats_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	lpbk_stat_size;
+	u8	flags;
+	#define PORT_LPBK_QSTATS_REQ_FLAGS_COUNTER_MASK     0x1UL
+	u8	unused_0[5];
+	__le64	lpbk_stat_host_addr;
+};
+
+/* hwrm_port_lpbk_qstats_output (size:128b/16B): common response header, then lpbk_stat_size, padding and the trailing valid byte. */
+struct hwrm_port_lpbk_qstats_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le16	lpbk_stat_size;
+	u8	unused_0[5];
+	u8	valid;
+};
+
+/* port_lpbk_stats (size:640b/80B): 10 little-endian 64-bit loopback counters: ucast/mcast/bcast frames and bytes, then tx/rx discards and errors. */
+struct port_lpbk_stats {
+	__le64	lpbk_ucast_frames;
+	__le64	lpbk_mcast_frames;
+	__le64	lpbk_bcast_frames;
+	__le64	lpbk_ucast_bytes;
+	__le64	lpbk_mcast_bytes;
+	__le64	lpbk_bcast_bytes;
+	__le64	lpbk_tx_discards;
+	__le64	lpbk_tx_errors;
+	__le64	lpbk_rx_discards;
+	__le64	lpbk_rx_errors;
+};
+
+/* hwrm_port_ecn_qstats_input (size:256b/32B): common request header, then port_id, ecn_stat_buf_size, flags (COUNTER_MASK bit), padding and ecn_stat_host_addr. */
+struct hwrm_port_ecn_qstats_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	port_id;
+	__le16	ecn_stat_buf_size;
+	u8	flags;
+	#define PORT_ECN_QSTATS_REQ_FLAGS_COUNTER_MASK     0x1UL
+	u8	unused_0[3];
+	__le64	ecn_stat_host_addr;
+};
+
+/* hwrm_port_ecn_qstats_output (size:128b/16B): common response header, then ecn_stat_buf_size, mark_en, padding and the trailing valid byte. */
+struct hwrm_port_ecn_qstats_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le16	ecn_stat_buf_size;
+	u8	mark_en;
+	u8	unused_0[4];
+	u8	valid;
+};
+
+/* port_stats_ecn (size:512b/64B): 8 little-endian 64-bit mark_cnt counters, one per CoS (cos0-7). */
+struct port_stats_ecn {
+	__le64	mark_cnt_cos0;
+	__le64	mark_cnt_cos1;
+	__le64	mark_cnt_cos2;
+	__le64	mark_cnt_cos3;
+	__le64	mark_cnt_cos4;
+	__le64	mark_cnt_cos5;
+	__le64	mark_cnt_cos6;
+	__le64	mark_cnt_cos7;
+};
+
+/* hwrm_port_clr_stats_input (size:192b/24B): common request header, then port_id and flags (ROCE_COUNTERS bit), padded to 24 bytes. */
+struct hwrm_port_clr_stats_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	port_id;
+	u8	flags;
+	#define PORT_CLR_STATS_REQ_FLAGS_ROCE_COUNTERS     0x1UL
+	u8	unused_0[5];
+};
+
+/* hwrm_port_clr_stats_output (size:128b/16B): common response header, padding and the trailing valid byte; no payload fields. */
+struct hwrm_port_clr_stats_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_port_lpbk_clr_stats_input (size:192b/24B): common request header, then port_id, padded to 24 bytes. */
+struct hwrm_port_lpbk_clr_stats_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	port_id;
+	u8	unused_0[6];
+};
+
+/* hwrm_port_lpbk_clr_stats_output (size:128b/16B): common response header, padding and the trailing valid byte; no payload fields. */
+struct hwrm_port_lpbk_clr_stats_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_port_ts_query_input (size:320b/40B): common request header, then flags (PATH TX/RX, CURRENT_TIME), port_id, enables bits and the like-named ts_req_timeout, ptp_seq_id and ptp_hdr_offset fields. */
+struct hwrm_port_ts_query_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	flags;
+	#define PORT_TS_QUERY_REQ_FLAGS_PATH             0x1UL
+	#define PORT_TS_QUERY_REQ_FLAGS_PATH_TX            0x0UL
+	#define PORT_TS_QUERY_REQ_FLAGS_PATH_RX            0x1UL
+	#define PORT_TS_QUERY_REQ_FLAGS_PATH_LAST         PORT_TS_QUERY_REQ_FLAGS_PATH_RX
+	#define PORT_TS_QUERY_REQ_FLAGS_CURRENT_TIME     0x2UL
+	__le16	port_id;
+	u8	unused_0[2];
+	__le16	enables;
+	#define PORT_TS_QUERY_REQ_ENABLES_TS_REQ_TIMEOUT     0x1UL
+	#define PORT_TS_QUERY_REQ_ENABLES_PTP_SEQ_ID         0x2UL
+	#define PORT_TS_QUERY_REQ_ENABLES_PTP_HDR_OFFSET     0x4UL
+	__le16	ts_req_timeout;
+	__le32	ptp_seq_id;
+	__le16	ptp_hdr_offset;
+	u8	unused_1[6];
+};
+
+/* hwrm_port_ts_query_output (size:192b/24B): common response header, then ptp_msg_ts, ptp_msg_seqid, padding and the trailing valid byte. */
+struct hwrm_port_ts_query_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le64	ptp_msg_ts;
+	__le16	ptp_msg_seqid;
+	u8	unused_0[5];
+	u8	valid;
+};
+
+/* hwrm_port_phy_qcaps_input (size:192b/24B): common request header, then port_id, padded to 24 bytes. */
+struct hwrm_port_phy_qcaps_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	port_id;
+	u8	unused_0[6];
+};
+
+/* hwrm_port_phy_qcaps_output (size:320b/40B): common response header, then capability flags, port_cnt, supported-speed bitmasks (force/auto/EEE/PAM4/speeds2), tx LPI timer fields, flags2, internal_port_cnt and the trailing valid byte. */
+struct hwrm_port_phy_qcaps_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	flags;
+	#define PORT_PHY_QCAPS_RESP_FLAGS_EEE_SUPPORTED                    0x1UL
+	#define PORT_PHY_QCAPS_RESP_FLAGS_EXTERNAL_LPBK_SUPPORTED          0x2UL
+	#define PORT_PHY_QCAPS_RESP_FLAGS_AUTONEG_LPBK_SUPPORTED           0x4UL
+	#define PORT_PHY_QCAPS_RESP_FLAGS_SHARED_PHY_CFG_SUPPORTED         0x8UL
+	#define PORT_PHY_QCAPS_RESP_FLAGS_CUMULATIVE_COUNTERS_ON_RESET     0x10UL
+	#define PORT_PHY_QCAPS_RESP_FLAGS_LOCAL_LPBK_NOT_SUPPORTED         0x20UL
+	#define PORT_PHY_QCAPS_RESP_FLAGS_FW_MANAGED_LINK_DOWN             0x40UL
+	#define PORT_PHY_QCAPS_RESP_FLAGS_NO_FCS                           0x80UL
+	u8	port_cnt;
+	#define PORT_PHY_QCAPS_RESP_PORT_CNT_UNKNOWN 0x0UL
+	#define PORT_PHY_QCAPS_RESP_PORT_CNT_1       0x1UL
+	#define PORT_PHY_QCAPS_RESP_PORT_CNT_2       0x2UL
+	#define PORT_PHY_QCAPS_RESP_PORT_CNT_3       0x3UL
+	#define PORT_PHY_QCAPS_RESP_PORT_CNT_4       0x4UL
+	#define PORT_PHY_QCAPS_RESP_PORT_CNT_12      0xcUL
+	#define PORT_PHY_QCAPS_RESP_PORT_CNT_LAST   PORT_PHY_QCAPS_RESP_PORT_CNT_12
+	__le16	supported_speeds_force_mode;
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_100MBHD     0x1UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_100MB       0x2UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_1GBHD       0x4UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_1GB         0x8UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_2GB         0x10UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_2_5GB       0x20UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_10GB        0x40UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_20GB        0x80UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_25GB        0x100UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_40GB        0x200UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_50GB        0x400UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_100GB       0x800UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_10MBHD      0x1000UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_FORCE_MODE_10MB        0x2000UL
+	__le16	supported_speeds_auto_mode;
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_100MBHD     0x1UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_100MB       0x2UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_1GBHD       0x4UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_1GB         0x8UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_2GB         0x10UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_2_5GB       0x20UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_10GB        0x40UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_20GB        0x80UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_25GB        0x100UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_40GB        0x200UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_50GB        0x400UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_100GB       0x800UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_10MBHD      0x1000UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_AUTO_MODE_10MB        0x2000UL
+	__le16	supported_speeds_eee_mode;
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_EEE_MODE_RSVD1     0x1UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_EEE_MODE_100MB     0x2UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_EEE_MODE_RSVD2     0x4UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_EEE_MODE_1GB       0x8UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_EEE_MODE_RSVD3     0x10UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_EEE_MODE_RSVD4     0x20UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS_EEE_MODE_10GB      0x40UL
+	__le32	tx_lpi_timer_low;
+	#define PORT_PHY_QCAPS_RESP_TX_LPI_TIMER_LOW_MASK 0xffffffUL
+	#define PORT_PHY_QCAPS_RESP_TX_LPI_TIMER_LOW_SFT 0
+	#define PORT_PHY_QCAPS_RESP_RSVD2_MASK           0xff000000UL
+	#define PORT_PHY_QCAPS_RESP_RSVD2_SFT            24
+	__le32	valid_tx_lpi_timer_high;
+	#define PORT_PHY_QCAPS_RESP_TX_LPI_TIMER_HIGH_MASK 0xffffffUL
+	#define PORT_PHY_QCAPS_RESP_TX_LPI_TIMER_HIGH_SFT 0
+	#define PORT_PHY_QCAPS_RESP_RSVD_MASK             0xff000000UL
+	#define PORT_PHY_QCAPS_RESP_RSVD_SFT              24
+	__le16	supported_pam4_speeds_auto_mode;
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_PAM4_SPEEDS_AUTO_MODE_50G      0x1UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_PAM4_SPEEDS_AUTO_MODE_100G     0x2UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_PAM4_SPEEDS_AUTO_MODE_200G     0x4UL
+	__le16	supported_pam4_speeds_force_mode;
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_PAM4_SPEEDS_FORCE_MODE_50G      0x1UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_PAM4_SPEEDS_FORCE_MODE_100G     0x2UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_PAM4_SPEEDS_FORCE_MODE_200G     0x4UL
+	__le16	flags2;
+	#define PORT_PHY_QCAPS_RESP_FLAGS2_PAUSE_UNSUPPORTED           0x1UL
+	#define PORT_PHY_QCAPS_RESP_FLAGS2_PFC_UNSUPPORTED             0x2UL
+	#define PORT_PHY_QCAPS_RESP_FLAGS2_BANK_ADDR_SUPPORTED         0x4UL
+	#define PORT_PHY_QCAPS_RESP_FLAGS2_SPEEDS2_SUPPORTED           0x8UL
+	#define PORT_PHY_QCAPS_RESP_FLAGS2_REMOTE_LPBK_UNSUPPORTED     0x10UL
+	u8	internal_port_cnt;
+	u8	unused_0;
+	__le16	supported_speeds2_force_mode;
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_FORCE_MODE_1GB                0x1UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_FORCE_MODE_10GB               0x2UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_FORCE_MODE_25GB               0x4UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_FORCE_MODE_40GB               0x8UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_FORCE_MODE_50GB               0x10UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_FORCE_MODE_100GB              0x20UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_FORCE_MODE_50GB_PAM4_56       0x40UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_FORCE_MODE_100GB_PAM4_56      0x80UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_FORCE_MODE_200GB_PAM4_56      0x100UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_FORCE_MODE_400GB_PAM4_56      0x200UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_FORCE_MODE_100GB_PAM4_112     0x400UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_FORCE_MODE_200GB_PAM4_112     0x800UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_FORCE_MODE_400GB_PAM4_112     0x1000UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_FORCE_MODE_800GB_PAM4_112     0x2000UL
+	__le16	supported_speeds2_auto_mode;
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_AUTO_MODE_1GB                0x1UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_AUTO_MODE_10GB               0x2UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_AUTO_MODE_25GB               0x4UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_AUTO_MODE_40GB               0x8UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_AUTO_MODE_50GB               0x10UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_AUTO_MODE_100GB              0x20UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_AUTO_MODE_50GB_PAM4_56       0x40UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_AUTO_MODE_100GB_PAM4_56      0x80UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_AUTO_MODE_200GB_PAM4_56      0x100UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_AUTO_MODE_400GB_PAM4_56      0x200UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_AUTO_MODE_100GB_PAM4_112     0x400UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_AUTO_MODE_200GB_PAM4_112     0x800UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_AUTO_MODE_400GB_PAM4_112     0x1000UL
+	#define PORT_PHY_QCAPS_RESP_SUPPORTED_SPEEDS2_AUTO_MODE_800GB_PAM4_112     0x2000UL
+	u8	unused_1[3];
+	u8	valid;
+};
+
+/* hwrm_port_phy_i2c_write_input (size:832b/104B): common request header, then flags, enables, port_id, i2c_slave_addr, bank/page addressing fields, data_length and a 16-word (64-byte) data array. */
+struct hwrm_port_phy_i2c_write_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	flags;
+	__le32	enables;
+	#define PORT_PHY_I2C_WRITE_REQ_ENABLES_PAGE_OFFSET     0x1UL
+	#define PORT_PHY_I2C_WRITE_REQ_ENABLES_BANK_NUMBER     0x2UL
+	__le16	port_id;
+	u8	i2c_slave_addr;
+	u8	bank_number;
+	__le16	page_number;
+	__le16	page_offset;
+	u8	data_length;
+	u8	unused_1[7];
+	__le32	data[16];
+};
+
+/* hwrm_port_phy_i2c_write_output (size:128b/16B): common response header, padding and the trailing valid byte; no payload fields. */
+struct hwrm_port_phy_i2c_write_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_port_phy_i2c_read_input (size:320b/40B): common request header, then flags, enables, port_id, i2c_slave_addr, bank/page addressing fields and data_length, padded to 40 bytes. */
+struct hwrm_port_phy_i2c_read_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	flags;
+	__le32	enables;
+	#define PORT_PHY_I2C_READ_REQ_ENABLES_PAGE_OFFSET     0x1UL
+	#define PORT_PHY_I2C_READ_REQ_ENABLES_BANK_NUMBER     0x2UL
+	__le16	port_id;
+	u8	i2c_slave_addr;
+	u8	bank_number;
+	__le16	page_number;
+	__le16	page_offset;
+	u8	data_length;
+	u8	unused_1[7];
+};
+
+/* hwrm_port_phy_i2c_read_output (size:640b/80B): common response header, then a 16-word (64-byte) data array, padding and the trailing valid byte. */
+struct hwrm_port_phy_i2c_read_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le32	data[16];
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_port_phy_mdio_write_input (size:320b/40B): common request header, 8 unused bytes, then port_id, phy_addr, dev_addr, reg_addr, reg_data and cl45_mdio, padded to 40 bytes. */
+struct hwrm_port_phy_mdio_write_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	unused_0[2];
+	__le16	port_id;
+	u8	phy_addr;
+	u8	dev_addr;
+	__le16	reg_addr;
+	__le16	reg_data;
+	u8	cl45_mdio;
+	u8	unused_1[7];
+};
+
+/* hwrm_port_phy_mdio_write_output (size:128b/16B): common response header, padding and the trailing valid byte; no payload fields. */
+struct hwrm_port_phy_mdio_write_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_port_phy_mdio_read_input (size:256b/32B): common request header, 8 unused bytes, then port_id, phy_addr, dev_addr, reg_addr and cl45_mdio, padded to 32 bytes. */
+struct hwrm_port_phy_mdio_read_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	unused_0[2];
+	__le16	port_id;
+	u8	phy_addr;
+	u8	dev_addr;
+	__le16	reg_addr;
+	u8	cl45_mdio;
+	u8	unused_1;
+};
+
+/* hwrm_port_phy_mdio_read_output (size:128b/16B): common response header, then reg_data, padding and the trailing valid byte. */
+struct hwrm_port_phy_mdio_read_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le16	reg_data;
+	u8	unused_0[5];
+	u8	valid;
+};
+
+/* hwrm_port_led_cfg_input (size:512b/64B): common request header, then enables (six bits per LED, LED0-LED3), port_id, num_leds and four 10-byte per-LED groups: id, state, color, blink_on/off, group_id. */
+struct hwrm_port_led_cfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	enables;
+	#define PORT_LED_CFG_REQ_ENABLES_LED0_ID            0x1UL
+	#define PORT_LED_CFG_REQ_ENABLES_LED0_STATE         0x2UL
+	#define PORT_LED_CFG_REQ_ENABLES_LED0_COLOR         0x4UL
+	#define PORT_LED_CFG_REQ_ENABLES_LED0_BLINK_ON      0x8UL
+	#define PORT_LED_CFG_REQ_ENABLES_LED0_BLINK_OFF     0x10UL
+	#define PORT_LED_CFG_REQ_ENABLES_LED0_GROUP_ID      0x20UL
+	#define PORT_LED_CFG_REQ_ENABLES_LED1_ID            0x40UL
+	#define PORT_LED_CFG_REQ_ENABLES_LED1_STATE         0x80UL
+	#define PORT_LED_CFG_REQ_ENABLES_LED1_COLOR         0x100UL
+	#define PORT_LED_CFG_REQ_ENABLES_LED1_BLINK_ON      0x200UL
+	#define PORT_LED_CFG_REQ_ENABLES_LED1_BLINK_OFF     0x400UL
+	#define PORT_LED_CFG_REQ_ENABLES_LED1_GROUP_ID      0x800UL
+	#define PORT_LED_CFG_REQ_ENABLES_LED2_ID            0x1000UL
+	#define PORT_LED_CFG_REQ_ENABLES_LED2_STATE         0x2000UL
+	#define PORT_LED_CFG_REQ_ENABLES_LED2_COLOR         0x4000UL
+	#define PORT_LED_CFG_REQ_ENABLES_LED2_BLINK_ON      0x8000UL
+	#define PORT_LED_CFG_REQ_ENABLES_LED2_BLINK_OFF     0x10000UL
+	#define PORT_LED_CFG_REQ_ENABLES_LED2_GROUP_ID      0x20000UL
+	#define PORT_LED_CFG_REQ_ENABLES_LED3_ID            0x40000UL
+	#define PORT_LED_CFG_REQ_ENABLES_LED3_STATE         0x80000UL
+	#define PORT_LED_CFG_REQ_ENABLES_LED3_COLOR         0x100000UL
+	#define PORT_LED_CFG_REQ_ENABLES_LED3_BLINK_ON      0x200000UL
+	#define PORT_LED_CFG_REQ_ENABLES_LED3_BLINK_OFF     0x400000UL
+	#define PORT_LED_CFG_REQ_ENABLES_LED3_GROUP_ID      0x800000UL
+	__le16	port_id;
+	u8	num_leds;
+	u8	rsvd;
+	u8	led0_id;
+	u8	led0_state;
+	#define PORT_LED_CFG_REQ_LED0_STATE_DEFAULT  0x0UL
+	#define PORT_LED_CFG_REQ_LED0_STATE_OFF      0x1UL
+	#define PORT_LED_CFG_REQ_LED0_STATE_ON       0x2UL
+	#define PORT_LED_CFG_REQ_LED0_STATE_BLINK    0x3UL
+	#define PORT_LED_CFG_REQ_LED0_STATE_BLINKALT 0x4UL
+	#define PORT_LED_CFG_REQ_LED0_STATE_LAST    PORT_LED_CFG_REQ_LED0_STATE_BLINKALT
+	u8	led0_color;
+	#define PORT_LED_CFG_REQ_LED0_COLOR_DEFAULT    0x0UL
+	#define PORT_LED_CFG_REQ_LED0_COLOR_AMBER      0x1UL
+	#define PORT_LED_CFG_REQ_LED0_COLOR_GREEN      0x2UL
+	#define PORT_LED_CFG_REQ_LED0_COLOR_GREENAMBER 0x3UL
+	#define PORT_LED_CFG_REQ_LED0_COLOR_LAST      PORT_LED_CFG_REQ_LED0_COLOR_GREENAMBER
+	u8	unused_0;
+	__le16	led0_blink_on;
+	__le16	led0_blink_off;
+	u8	led0_group_id;
+	u8	rsvd0;
+	u8	led1_id;
+	u8	led1_state;
+	#define PORT_LED_CFG_REQ_LED1_STATE_DEFAULT  0x0UL
+	#define PORT_LED_CFG_REQ_LED1_STATE_OFF      0x1UL
+	#define PORT_LED_CFG_REQ_LED1_STATE_ON       0x2UL
+	#define PORT_LED_CFG_REQ_LED1_STATE_BLINK    0x3UL
+	#define PORT_LED_CFG_REQ_LED1_STATE_BLINKALT 0x4UL
+	#define PORT_LED_CFG_REQ_LED1_STATE_LAST    PORT_LED_CFG_REQ_LED1_STATE_BLINKALT
+	u8	led1_color;
+	#define PORT_LED_CFG_REQ_LED1_COLOR_DEFAULT    0x0UL
+	#define PORT_LED_CFG_REQ_LED1_COLOR_AMBER      0x1UL
+	#define PORT_LED_CFG_REQ_LED1_COLOR_GREEN      0x2UL
+	#define PORT_LED_CFG_REQ_LED1_COLOR_GREENAMBER 0x3UL
+	#define PORT_LED_CFG_REQ_LED1_COLOR_LAST      PORT_LED_CFG_REQ_LED1_COLOR_GREENAMBER
+	u8	unused_1;
+	__le16	led1_blink_on;
+	__le16	led1_blink_off;
+	u8	led1_group_id;
+	u8	rsvd1;
+	u8	led2_id;
+	u8	led2_state;
+	#define PORT_LED_CFG_REQ_LED2_STATE_DEFAULT  0x0UL
+	#define PORT_LED_CFG_REQ_LED2_STATE_OFF      0x1UL
+	#define PORT_LED_CFG_REQ_LED2_STATE_ON       0x2UL
+	#define PORT_LED_CFG_REQ_LED2_STATE_BLINK    0x3UL
+	#define PORT_LED_CFG_REQ_LED2_STATE_BLINKALT 0x4UL
+	#define PORT_LED_CFG_REQ_LED2_STATE_LAST    PORT_LED_CFG_REQ_LED2_STATE_BLINKALT
+	u8	led2_color;
+	#define PORT_LED_CFG_REQ_LED2_COLOR_DEFAULT    0x0UL
+	#define PORT_LED_CFG_REQ_LED2_COLOR_AMBER      0x1UL
+	#define PORT_LED_CFG_REQ_LED2_COLOR_GREEN      0x2UL
+	#define PORT_LED_CFG_REQ_LED2_COLOR_GREENAMBER 0x3UL
+	#define PORT_LED_CFG_REQ_LED2_COLOR_LAST      PORT_LED_CFG_REQ_LED2_COLOR_GREENAMBER
+	u8	unused_2;
+	__le16	led2_blink_on;
+	__le16	led2_blink_off;
+	u8	led2_group_id;
+	u8	rsvd2;
+	u8	led3_id;
+	u8	led3_state;
+	#define PORT_LED_CFG_REQ_LED3_STATE_DEFAULT  0x0UL
+	#define PORT_LED_CFG_REQ_LED3_STATE_OFF      0x1UL
+	#define PORT_LED_CFG_REQ_LED3_STATE_ON       0x2UL
+	#define PORT_LED_CFG_REQ_LED3_STATE_BLINK    0x3UL
+	#define PORT_LED_CFG_REQ_LED3_STATE_BLINKALT 0x4UL
+	#define PORT_LED_CFG_REQ_LED3_STATE_LAST    PORT_LED_CFG_REQ_LED3_STATE_BLINKALT
+	u8	led3_color;
+	#define PORT_LED_CFG_REQ_LED3_COLOR_DEFAULT    0x0UL
+	#define PORT_LED_CFG_REQ_LED3_COLOR_AMBER      0x1UL
+	#define PORT_LED_CFG_REQ_LED3_COLOR_GREEN      0x2UL
+	#define PORT_LED_CFG_REQ_LED3_COLOR_GREENAMBER 0x3UL
+	#define PORT_LED_CFG_REQ_LED3_COLOR_LAST      PORT_LED_CFG_REQ_LED3_COLOR_GREENAMBER
+	u8	unused_3;
+	__le16	led3_blink_on;
+	__le16	led3_blink_off;
+	u8	led3_group_id;
+	u8	rsvd3;
+};
+
+/* hwrm_port_led_cfg_output (size:128b/16B): common response header, padding and the trailing valid byte; no payload fields. */
+struct hwrm_port_led_cfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_port_led_qcfg_input (size:192b/24B): common request header, then port_id, padded to 24 bytes. */
+struct hwrm_port_led_qcfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	port_id;
+	u8	unused_0[6];
+};
+
+/* hwrm_port_led_qcfg_output (size:448b/56B): common response header, num_leds, then four 10-byte per-LED groups (id, type, state, color, blink_on/off, group_id), padding and the trailing valid byte. */
+struct hwrm_port_led_qcfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	num_leds;
+	u8	led0_id;
+	u8	led0_type;
+	#define PORT_LED_QCFG_RESP_LED0_TYPE_SPEED    0x0UL
+	#define PORT_LED_QCFG_RESP_LED0_TYPE_ACTIVITY 0x1UL
+	#define PORT_LED_QCFG_RESP_LED0_TYPE_INVALID  0xffUL
+	#define PORT_LED_QCFG_RESP_LED0_TYPE_LAST    PORT_LED_QCFG_RESP_LED0_TYPE_INVALID
+	u8	led0_state;
+	#define PORT_LED_QCFG_RESP_LED0_STATE_DEFAULT  0x0UL
+	#define PORT_LED_QCFG_RESP_LED0_STATE_OFF      0x1UL
+	#define PORT_LED_QCFG_RESP_LED0_STATE_ON       0x2UL
+	#define PORT_LED_QCFG_RESP_LED0_STATE_BLINK    0x3UL
+	#define PORT_LED_QCFG_RESP_LED0_STATE_BLINKALT 0x4UL
+	#define PORT_LED_QCFG_RESP_LED0_STATE_LAST    PORT_LED_QCFG_RESP_LED0_STATE_BLINKALT
+	u8	led0_color;
+	#define PORT_LED_QCFG_RESP_LED0_COLOR_DEFAULT    0x0UL
+	#define PORT_LED_QCFG_RESP_LED0_COLOR_AMBER      0x1UL
+	#define PORT_LED_QCFG_RESP_LED0_COLOR_GREEN      0x2UL
+	#define PORT_LED_QCFG_RESP_LED0_COLOR_GREENAMBER 0x3UL
+	#define PORT_LED_QCFG_RESP_LED0_COLOR_LAST      PORT_LED_QCFG_RESP_LED0_COLOR_GREENAMBER
+	u8	unused_0;
+	__le16	led0_blink_on;
+	__le16	led0_blink_off;
+	u8	led0_group_id;
+	u8	led1_id;
+	u8	led1_type;
+	#define PORT_LED_QCFG_RESP_LED1_TYPE_SPEED    0x0UL
+	#define PORT_LED_QCFG_RESP_LED1_TYPE_ACTIVITY 0x1UL
+	#define PORT_LED_QCFG_RESP_LED1_TYPE_INVALID  0xffUL
+	#define PORT_LED_QCFG_RESP_LED1_TYPE_LAST    PORT_LED_QCFG_RESP_LED1_TYPE_INVALID
+	u8	led1_state;
+	#define PORT_LED_QCFG_RESP_LED1_STATE_DEFAULT  0x0UL
+	#define PORT_LED_QCFG_RESP_LED1_STATE_OFF      0x1UL
+	#define PORT_LED_QCFG_RESP_LED1_STATE_ON       0x2UL
+	#define PORT_LED_QCFG_RESP_LED1_STATE_BLINK    0x3UL
+	#define PORT_LED_QCFG_RESP_LED1_STATE_BLINKALT 0x4UL
+	#define PORT_LED_QCFG_RESP_LED1_STATE_LAST    PORT_LED_QCFG_RESP_LED1_STATE_BLINKALT
+	u8	led1_color;
+	#define PORT_LED_QCFG_RESP_LED1_COLOR_DEFAULT    0x0UL
+	#define PORT_LED_QCFG_RESP_LED1_COLOR_AMBER      0x1UL
+	#define PORT_LED_QCFG_RESP_LED1_COLOR_GREEN      0x2UL
+	#define PORT_LED_QCFG_RESP_LED1_COLOR_GREENAMBER 0x3UL
+	#define PORT_LED_QCFG_RESP_LED1_COLOR_LAST      PORT_LED_QCFG_RESP_LED1_COLOR_GREENAMBER
+	u8	unused_1;
+	__le16	led1_blink_on;
+	__le16	led1_blink_off;
+	u8	led1_group_id;
+	u8	led2_id;
+	u8	led2_type;
+	#define PORT_LED_QCFG_RESP_LED2_TYPE_SPEED    0x0UL
+	#define PORT_LED_QCFG_RESP_LED2_TYPE_ACTIVITY 0x1UL
+	#define PORT_LED_QCFG_RESP_LED2_TYPE_INVALID  0xffUL
+	#define PORT_LED_QCFG_RESP_LED2_TYPE_LAST    PORT_LED_QCFG_RESP_LED2_TYPE_INVALID
+	u8	led2_state;
+	#define PORT_LED_QCFG_RESP_LED2_STATE_DEFAULT  0x0UL
+	#define PORT_LED_QCFG_RESP_LED2_STATE_OFF      0x1UL
+	#define PORT_LED_QCFG_RESP_LED2_STATE_ON       0x2UL
+	#define PORT_LED_QCFG_RESP_LED2_STATE_BLINK    0x3UL
+	#define PORT_LED_QCFG_RESP_LED2_STATE_BLINKALT 0x4UL
+	#define PORT_LED_QCFG_RESP_LED2_STATE_LAST    PORT_LED_QCFG_RESP_LED2_STATE_BLINKALT
+	u8	led2_color;
+	#define PORT_LED_QCFG_RESP_LED2_COLOR_DEFAULT    0x0UL
+	#define PORT_LED_QCFG_RESP_LED2_COLOR_AMBER      0x1UL
+	#define PORT_LED_QCFG_RESP_LED2_COLOR_GREEN      0x2UL
+	#define PORT_LED_QCFG_RESP_LED2_COLOR_GREENAMBER 0x3UL
+	#define PORT_LED_QCFG_RESP_LED2_COLOR_LAST      PORT_LED_QCFG_RESP_LED2_COLOR_GREENAMBER
+	u8	unused_2;
+	__le16	led2_blink_on;
+	__le16	led2_blink_off;
+	u8	led2_group_id;
+	u8	led3_id;
+	u8	led3_type;
+	#define PORT_LED_QCFG_RESP_LED3_TYPE_SPEED    0x0UL
+	#define PORT_LED_QCFG_RESP_LED3_TYPE_ACTIVITY 0x1UL
+	#define PORT_LED_QCFG_RESP_LED3_TYPE_INVALID  0xffUL
+	#define PORT_LED_QCFG_RESP_LED3_TYPE_LAST    PORT_LED_QCFG_RESP_LED3_TYPE_INVALID
+	u8	led3_state;
+	#define PORT_LED_QCFG_RESP_LED3_STATE_DEFAULT  0x0UL
+	#define PORT_LED_QCFG_RESP_LED3_STATE_OFF      0x1UL
+	#define PORT_LED_QCFG_RESP_LED3_STATE_ON       0x2UL
+	#define PORT_LED_QCFG_RESP_LED3_STATE_BLINK    0x3UL
+	#define PORT_LED_QCFG_RESP_LED3_STATE_BLINKALT 0x4UL
+	#define PORT_LED_QCFG_RESP_LED3_STATE_LAST    PORT_LED_QCFG_RESP_LED3_STATE_BLINKALT
+	u8	led3_color;
+	#define PORT_LED_QCFG_RESP_LED3_COLOR_DEFAULT    0x0UL
+	#define PORT_LED_QCFG_RESP_LED3_COLOR_AMBER      0x1UL
+	#define PORT_LED_QCFG_RESP_LED3_COLOR_GREEN      0x2UL
+	#define PORT_LED_QCFG_RESP_LED3_COLOR_GREENAMBER 0x3UL
+	#define PORT_LED_QCFG_RESP_LED3_COLOR_LAST      PORT_LED_QCFG_RESP_LED3_COLOR_GREENAMBER
+	u8	unused_3;
+	__le16	led3_blink_on;
+	__le16	led3_blink_off;
+	u8	led3_group_id;
+	u8	unused_4[6];
+	u8	valid;
+};
+
+/* hwrm_port_led_qcaps_input (size:192b/24B): common request header, then port_id, padded to 24 bytes. */
+struct hwrm_port_led_qcaps_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	port_id;
+	u8	unused_0[6];
+};
+
+/* hwrm_port_led_qcaps_output (size:384b/48B): common response header, num_leds, then four 8-byte per-LED groups (id, type, group_id, state_caps, color_caps), padding and the trailing valid byte. */
+struct hwrm_port_led_qcaps_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	num_leds;
+	u8	unused[3];
+	u8	led0_id;
+	u8	led0_type;
+	#define PORT_LED_QCAPS_RESP_LED0_TYPE_SPEED    0x0UL
+	#define PORT_LED_QCAPS_RESP_LED0_TYPE_ACTIVITY 0x1UL
+	#define PORT_LED_QCAPS_RESP_LED0_TYPE_INVALID  0xffUL
+	#define PORT_LED_QCAPS_RESP_LED0_TYPE_LAST    PORT_LED_QCAPS_RESP_LED0_TYPE_INVALID
+	u8	led0_group_id;
+	u8	unused_0;
+	__le16	led0_state_caps;
+	#define PORT_LED_QCAPS_RESP_LED0_STATE_CAPS_ENABLED                 0x1UL
+	#define PORT_LED_QCAPS_RESP_LED0_STATE_CAPS_OFF_SUPPORTED           0x2UL
+	#define PORT_LED_QCAPS_RESP_LED0_STATE_CAPS_ON_SUPPORTED            0x4UL
+	#define PORT_LED_QCAPS_RESP_LED0_STATE_CAPS_BLINK_SUPPORTED         0x8UL
+	#define PORT_LED_QCAPS_RESP_LED0_STATE_CAPS_BLINK_ALT_SUPPORTED     0x10UL
+	__le16	led0_color_caps;
+	#define PORT_LED_QCAPS_RESP_LED0_COLOR_CAPS_RSVD                0x1UL
+	#define PORT_LED_QCAPS_RESP_LED0_COLOR_CAPS_AMBER_SUPPORTED     0x2UL
+	#define PORT_LED_QCAPS_RESP_LED0_COLOR_CAPS_GREEN_SUPPORTED     0x4UL
+	u8	led1_id;
+	u8	led1_type;
+	#define PORT_LED_QCAPS_RESP_LED1_TYPE_SPEED    0x0UL
+	#define PORT_LED_QCAPS_RESP_LED1_TYPE_ACTIVITY 0x1UL
+	#define PORT_LED_QCAPS_RESP_LED1_TYPE_INVALID  0xffUL
+	#define PORT_LED_QCAPS_RESP_LED1_TYPE_LAST    PORT_LED_QCAPS_RESP_LED1_TYPE_INVALID
+	u8	led1_group_id;
+	u8	unused_1;
+	__le16	led1_state_caps;
+	#define PORT_LED_QCAPS_RESP_LED1_STATE_CAPS_ENABLED                 0x1UL
+	#define PORT_LED_QCAPS_RESP_LED1_STATE_CAPS_OFF_SUPPORTED           0x2UL
+	#define PORT_LED_QCAPS_RESP_LED1_STATE_CAPS_ON_SUPPORTED            0x4UL
+	#define PORT_LED_QCAPS_RESP_LED1_STATE_CAPS_BLINK_SUPPORTED         0x8UL
+	#define PORT_LED_QCAPS_RESP_LED1_STATE_CAPS_BLINK_ALT_SUPPORTED     0x10UL
+	__le16	led1_color_caps;
+	#define PORT_LED_QCAPS_RESP_LED1_COLOR_CAPS_RSVD                0x1UL
+	#define PORT_LED_QCAPS_RESP_LED1_COLOR_CAPS_AMBER_SUPPORTED     0x2UL
+	#define PORT_LED_QCAPS_RESP_LED1_COLOR_CAPS_GREEN_SUPPORTED     0x4UL
+	u8	led2_id;
+	u8	led2_type;
+	#define PORT_LED_QCAPS_RESP_LED2_TYPE_SPEED    0x0UL
+	#define PORT_LED_QCAPS_RESP_LED2_TYPE_ACTIVITY 0x1UL
+	#define PORT_LED_QCAPS_RESP_LED2_TYPE_INVALID  0xffUL
+	#define PORT_LED_QCAPS_RESP_LED2_TYPE_LAST    PORT_LED_QCAPS_RESP_LED2_TYPE_INVALID
+	u8	led2_group_id;
+	u8	unused_2;
+	__le16	led2_state_caps;
+	#define PORT_LED_QCAPS_RESP_LED2_STATE_CAPS_ENABLED                 0x1UL
+	#define PORT_LED_QCAPS_RESP_LED2_STATE_CAPS_OFF_SUPPORTED           0x2UL
+	#define PORT_LED_QCAPS_RESP_LED2_STATE_CAPS_ON_SUPPORTED            0x4UL
+	#define PORT_LED_QCAPS_RESP_LED2_STATE_CAPS_BLINK_SUPPORTED         0x8UL
+	#define PORT_LED_QCAPS_RESP_LED2_STATE_CAPS_BLINK_ALT_SUPPORTED     0x10UL
+	__le16	led2_color_caps;
+	#define PORT_LED_QCAPS_RESP_LED2_COLOR_CAPS_RSVD                0x1UL
+	#define PORT_LED_QCAPS_RESP_LED2_COLOR_CAPS_AMBER_SUPPORTED     0x2UL
+	#define PORT_LED_QCAPS_RESP_LED2_COLOR_CAPS_GREEN_SUPPORTED     0x4UL
+	u8	led3_id;
+	u8	led3_type;
+	#define PORT_LED_QCAPS_RESP_LED3_TYPE_SPEED    0x0UL
+	#define PORT_LED_QCAPS_RESP_LED3_TYPE_ACTIVITY 0x1UL
+	#define PORT_LED_QCAPS_RESP_LED3_TYPE_INVALID  0xffUL
+	#define PORT_LED_QCAPS_RESP_LED3_TYPE_LAST    PORT_LED_QCAPS_RESP_LED3_TYPE_INVALID
+	u8	led3_group_id;
+	u8	unused_3;
+	__le16	led3_state_caps;
+	#define PORT_LED_QCAPS_RESP_LED3_STATE_CAPS_ENABLED                 0x1UL
+	#define PORT_LED_QCAPS_RESP_LED3_STATE_CAPS_OFF_SUPPORTED           0x2UL
+	#define PORT_LED_QCAPS_RESP_LED3_STATE_CAPS_ON_SUPPORTED            0x4UL
+	#define PORT_LED_QCAPS_RESP_LED3_STATE_CAPS_BLINK_SUPPORTED         0x8UL
+	#define PORT_LED_QCAPS_RESP_LED3_STATE_CAPS_BLINK_ALT_SUPPORTED     0x10UL
+	__le16	led3_color_caps;
+	#define PORT_LED_QCAPS_RESP_LED3_COLOR_CAPS_RSVD                0x1UL
+	#define PORT_LED_QCAPS_RESP_LED3_COLOR_CAPS_AMBER_SUPPORTED     0x2UL
+	#define PORT_LED_QCAPS_RESP_LED3_COLOR_CAPS_GREEN_SUPPORTED     0x4UL
+	u8	unused_4[3];
+	u8	valid;
+};
+
+/* hwrm_port_mac_qcaps_input (size:192b/24B): input message whose only payload after the five leading header fields is port_id */
+struct hwrm_port_mac_qcaps_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	port_id;
+	u8	unused_0[6];	/* fills the struct out to the stated 24 bytes */
+};
+
+/* hwrm_port_mac_qcaps_output (size:128b/16B): output message carrying one byte of loopback capability flags */
+struct hwrm_port_mac_qcaps_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	flags;	/* bit mask; note the two bit names below have opposite polarity (NOT_SUPPORTED vs SUPPORTED) */
+	#define PORT_MAC_QCAPS_RESP_FLAGS_LOCAL_LPBK_NOT_SUPPORTED     0x1UL
+	#define PORT_MAC_QCAPS_RESP_FLAGS_REMOTE_LPBK_SUPPORTED        0x2UL
+	u8	unused_0[6];
+	u8	valid;	/* last byte of the 16-byte message */
+};
+
+/* hwrm_queue_qportcfg_input (size:192b/24B): input message with a path flag, a port_id and a drv_qmap_cap selector */
+struct hwrm_queue_qportcfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	flags;	/* only bit 0 (PATH) is defined in this struct */
+	#define QUEUE_QPORTCFG_REQ_FLAGS_PATH     0x1UL
+	#define QUEUE_QPORTCFG_REQ_FLAGS_PATH_TX    0x0UL
+	#define QUEUE_QPORTCFG_REQ_FLAGS_PATH_RX    0x1UL
+	#define QUEUE_QPORTCFG_REQ_FLAGS_PATH_LAST QUEUE_QPORTCFG_REQ_FLAGS_PATH_RX
+	__le16	port_id;
+	u8	drv_qmap_cap;	/* enumerated value: DISABLED (0) or ENABLED (1) */
+	#define QUEUE_QPORTCFG_REQ_DRV_QMAP_CAP_DISABLED 0x0UL
+	#define QUEUE_QPORTCFG_REQ_DRV_QMAP_CAP_ENABLED  0x1UL
+	#define QUEUE_QPORTCFG_REQ_DRV_QMAP_CAP_LAST    QUEUE_QPORTCFG_REQ_DRV_QMAP_CAP_ENABLED
+	u8	unused_0;
+};
+
+/* hwrm_queue_qportcfg_output (size:1344b/168B): describes eight queues via queue_idN/service_profile pairs, 16-byte qidN_name arrays and service_profile_type bytes */
+struct hwrm_queue_qportcfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	max_configurable_queues;
+	u8	max_configurable_lossless_queues;
+	u8	queue_cfg_allowed;
+	u8	queue_cfg_info;	/* bit mask: ASYM_CFG (0x1), USE_PROFILE_TYPE (0x2) */
+	#define QUEUE_QPORTCFG_RESP_QUEUE_CFG_INFO_ASYM_CFG             0x1UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_CFG_INFO_USE_PROFILE_TYPE     0x2UL
+	u8	queue_pfcenable_cfg_allowed;
+	u8	queue_pri2cos_cfg_allowed;
+	u8	queue_cos2bw_cfg_allowed;
+	u8	queue_id0;
+	u8	queue_id0_service_profile;	/* enumerated value; the same value set is repeated for queues 1-7 below */
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID0_SERVICE_PROFILE_LOSSY          0x0UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID0_SERVICE_PROFILE_LOSSLESS       0x1UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID0_SERVICE_PROFILE_LOSSLESS_ROCE  0x1UL	/* same value as ..._LOSSLESS */
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID0_SERVICE_PROFILE_LOSSY_ROCE_CNP 0x2UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID0_SERVICE_PROFILE_LOSSLESS_NIC   0x3UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID0_SERVICE_PROFILE_UNKNOWN        0xffUL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID0_SERVICE_PROFILE_LAST          QUEUE_QPORTCFG_RESP_QUEUE_ID0_SERVICE_PROFILE_UNKNOWN
+	u8	queue_id1;
+	u8	queue_id1_service_profile;
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID1_SERVICE_PROFILE_LOSSY          0x0UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID1_SERVICE_PROFILE_LOSSLESS       0x1UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID1_SERVICE_PROFILE_LOSSLESS_ROCE  0x1UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID1_SERVICE_PROFILE_LOSSY_ROCE_CNP 0x2UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID1_SERVICE_PROFILE_LOSSLESS_NIC   0x3UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID1_SERVICE_PROFILE_UNKNOWN        0xffUL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID1_SERVICE_PROFILE_LAST          QUEUE_QPORTCFG_RESP_QUEUE_ID1_SERVICE_PROFILE_UNKNOWN
+	u8	queue_id2;
+	u8	queue_id2_service_profile;
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID2_SERVICE_PROFILE_LOSSY          0x0UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID2_SERVICE_PROFILE_LOSSLESS       0x1UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID2_SERVICE_PROFILE_LOSSLESS_ROCE  0x1UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID2_SERVICE_PROFILE_LOSSY_ROCE_CNP 0x2UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID2_SERVICE_PROFILE_LOSSLESS_NIC   0x3UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID2_SERVICE_PROFILE_UNKNOWN        0xffUL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID2_SERVICE_PROFILE_LAST          QUEUE_QPORTCFG_RESP_QUEUE_ID2_SERVICE_PROFILE_UNKNOWN
+	u8	queue_id3;
+	u8	queue_id3_service_profile;
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID3_SERVICE_PROFILE_LOSSY          0x0UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID3_SERVICE_PROFILE_LOSSLESS       0x1UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID3_SERVICE_PROFILE_LOSSLESS_ROCE  0x1UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID3_SERVICE_PROFILE_LOSSY_ROCE_CNP 0x2UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID3_SERVICE_PROFILE_LOSSLESS_NIC   0x3UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID3_SERVICE_PROFILE_UNKNOWN        0xffUL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID3_SERVICE_PROFILE_LAST          QUEUE_QPORTCFG_RESP_QUEUE_ID3_SERVICE_PROFILE_UNKNOWN
+	u8	queue_id4;
+	u8	queue_id4_service_profile;
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID4_SERVICE_PROFILE_LOSSY          0x0UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID4_SERVICE_PROFILE_LOSSLESS       0x1UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID4_SERVICE_PROFILE_LOSSLESS_ROCE  0x1UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID4_SERVICE_PROFILE_LOSSY_ROCE_CNP 0x2UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID4_SERVICE_PROFILE_LOSSLESS_NIC   0x3UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID4_SERVICE_PROFILE_UNKNOWN        0xffUL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID4_SERVICE_PROFILE_LAST          QUEUE_QPORTCFG_RESP_QUEUE_ID4_SERVICE_PROFILE_UNKNOWN
+	u8	queue_id5;
+	u8	queue_id5_service_profile;
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID5_SERVICE_PROFILE_LOSSY          0x0UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID5_SERVICE_PROFILE_LOSSLESS       0x1UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID5_SERVICE_PROFILE_LOSSLESS_ROCE  0x1UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID5_SERVICE_PROFILE_LOSSY_ROCE_CNP 0x2UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID5_SERVICE_PROFILE_LOSSLESS_NIC   0x3UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID5_SERVICE_PROFILE_UNKNOWN        0xffUL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID5_SERVICE_PROFILE_LAST          QUEUE_QPORTCFG_RESP_QUEUE_ID5_SERVICE_PROFILE_UNKNOWN
+	u8	queue_id6;
+	u8	queue_id6_service_profile;
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID6_SERVICE_PROFILE_LOSSY          0x0UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID6_SERVICE_PROFILE_LOSSLESS       0x1UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID6_SERVICE_PROFILE_LOSSLESS_ROCE  0x1UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID6_SERVICE_PROFILE_LOSSY_ROCE_CNP 0x2UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID6_SERVICE_PROFILE_LOSSLESS_NIC   0x3UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID6_SERVICE_PROFILE_UNKNOWN        0xffUL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID6_SERVICE_PROFILE_LAST          QUEUE_QPORTCFG_RESP_QUEUE_ID6_SERVICE_PROFILE_UNKNOWN
+	u8	queue_id7;
+	u8	queue_id7_service_profile;
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID7_SERVICE_PROFILE_LOSSY          0x0UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID7_SERVICE_PROFILE_LOSSLESS       0x1UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID7_SERVICE_PROFILE_LOSSLESS_ROCE  0x1UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID7_SERVICE_PROFILE_LOSSY_ROCE_CNP 0x2UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID7_SERVICE_PROFILE_LOSSLESS_NIC   0x3UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID7_SERVICE_PROFILE_UNKNOWN        0xffUL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID7_SERVICE_PROFILE_LAST          QUEUE_QPORTCFG_RESP_QUEUE_ID7_SERVICE_PROFILE_UNKNOWN
+	u8	queue_id0_service_profile_type;	/* bit mask (ROCE/NIC/CNP); queue 0's type byte precedes the name arrays, queues 1-7 follow them */
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID0_SERVICE_PROFILE_TYPE_ROCE     0x1UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID0_SERVICE_PROFILE_TYPE_NIC      0x2UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID0_SERVICE_PROFILE_TYPE_CNP      0x4UL
+	char	qid0_name[16];
+	char	qid1_name[16];
+	char	qid2_name[16];
+	char	qid3_name[16];
+	char	qid4_name[16];
+	char	qid5_name[16];
+	char	qid6_name[16];
+	char	qid7_name[16];
+	u8	queue_id1_service_profile_type;
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID1_SERVICE_PROFILE_TYPE_ROCE     0x1UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID1_SERVICE_PROFILE_TYPE_NIC      0x2UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID1_SERVICE_PROFILE_TYPE_CNP      0x4UL
+	u8	queue_id2_service_profile_type;
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID2_SERVICE_PROFILE_TYPE_ROCE     0x1UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID2_SERVICE_PROFILE_TYPE_NIC      0x2UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID2_SERVICE_PROFILE_TYPE_CNP      0x4UL
+	u8	queue_id3_service_profile_type;
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID3_SERVICE_PROFILE_TYPE_ROCE     0x1UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID3_SERVICE_PROFILE_TYPE_NIC      0x2UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID3_SERVICE_PROFILE_TYPE_CNP      0x4UL
+	u8	queue_id4_service_profile_type;
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID4_SERVICE_PROFILE_TYPE_ROCE     0x1UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID4_SERVICE_PROFILE_TYPE_NIC      0x2UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID4_SERVICE_PROFILE_TYPE_CNP      0x4UL
+	u8	queue_id5_service_profile_type;
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID5_SERVICE_PROFILE_TYPE_ROCE     0x1UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID5_SERVICE_PROFILE_TYPE_NIC      0x2UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID5_SERVICE_PROFILE_TYPE_CNP      0x4UL
+	u8	queue_id6_service_profile_type;
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID6_SERVICE_PROFILE_TYPE_ROCE     0x1UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID6_SERVICE_PROFILE_TYPE_NIC      0x2UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID6_SERVICE_PROFILE_TYPE_CNP      0x4UL
+	u8	queue_id7_service_profile_type;
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID7_SERVICE_PROFILE_TYPE_ROCE     0x1UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID7_SERVICE_PROFILE_TYPE_NIC      0x2UL
+	#define QUEUE_QPORTCFG_RESP_QUEUE_ID7_SERVICE_PROFILE_TYPE_CNP      0x4UL
+	u8	valid;	/* last byte of the 168-byte message */
+};
+
+/* hwrm_queue_qcfg_input (size:192b/24B): input message naming one queue_id plus a TX/RX path flag */
+struct hwrm_queue_qcfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	flags;	/* only bit 0 (PATH) is defined: TX = 0, RX = 1 */
+	#define QUEUE_QCFG_REQ_FLAGS_PATH     0x1UL
+	#define QUEUE_QCFG_REQ_FLAGS_PATH_TX    0x0UL
+	#define QUEUE_QCFG_REQ_FLAGS_PATH_RX    0x1UL
+	#define QUEUE_QCFG_REQ_FLAGS_PATH_LAST QUEUE_QCFG_REQ_FLAGS_PATH_RX
+	__le32	queue_id;
+};
+
+/* hwrm_queue_qcfg_output (size:128b/16B): output message with queue_len, a service_profile value and a queue_cfg_info bit mask */
+struct hwrm_queue_qcfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le32	queue_len;
+	u8	service_profile;	/* enumerated value: LOSSY (0), LOSSLESS (1) or UNKNOWN (0xff) */
+	#define QUEUE_QCFG_RESP_SERVICE_PROFILE_LOSSY    0x0UL
+	#define QUEUE_QCFG_RESP_SERVICE_PROFILE_LOSSLESS 0x1UL
+	#define QUEUE_QCFG_RESP_SERVICE_PROFILE_UNKNOWN  0xffUL
+	#define QUEUE_QCFG_RESP_SERVICE_PROFILE_LAST    QUEUE_QCFG_RESP_SERVICE_PROFILE_UNKNOWN
+	u8	queue_cfg_info;	/* bit mask; only ASYM_CFG (0x1) is defined here */
+	#define QUEUE_QCFG_RESP_QUEUE_CFG_INFO_ASYM_CFG     0x1UL
+	u8	unused_0;
+	u8	valid;
+};
+
+/* hwrm_queue_cfg_input (size:320b/40B): input message carrying dflt_len and service_profile for one queue_id, with an enables bit per value */
+struct hwrm_queue_cfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	flags;	/* PATH is a 2-bit field here (mask 0x3, shift 0): TX, RX or BIDIR */
+	#define QUEUE_CFG_REQ_FLAGS_PATH_MASK 0x3UL
+	#define QUEUE_CFG_REQ_FLAGS_PATH_SFT  0
+	#define QUEUE_CFG_REQ_FLAGS_PATH_TX     0x0UL
+	#define QUEUE_CFG_REQ_FLAGS_PATH_RX     0x1UL
+	#define QUEUE_CFG_REQ_FLAGS_PATH_BIDIR  0x2UL
+	#define QUEUE_CFG_REQ_FLAGS_PATH_LAST  QUEUE_CFG_REQ_FLAGS_PATH_BIDIR
+	__le32	enables;	/* bit mask; presumably marks which of dflt_len / service_profile are meaningful - TODO confirm */
+	#define QUEUE_CFG_REQ_ENABLES_DFLT_LEN            0x1UL
+	#define QUEUE_CFG_REQ_ENABLES_SERVICE_PROFILE     0x2UL
+	__le32	queue_id;
+	__le32	dflt_len;
+	u8	service_profile;	/* enumerated value: LOSSY (0), LOSSLESS (1) or UNKNOWN (0xff) */
+	#define QUEUE_CFG_REQ_SERVICE_PROFILE_LOSSY    0x0UL
+	#define QUEUE_CFG_REQ_SERVICE_PROFILE_LOSSLESS 0x1UL
+	#define QUEUE_CFG_REQ_SERVICE_PROFILE_UNKNOWN  0xffUL
+	#define QUEUE_CFG_REQ_SERVICE_PROFILE_LAST    QUEUE_CFG_REQ_SERVICE_PROFILE_UNKNOWN
+	u8	unused_0[7];
+};
+
+/* hwrm_queue_cfg_output (size:128b/16B): output message with no payload beyond the four leading fields and the trailing valid byte */
+struct hwrm_queue_cfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_queue_pfcenable_qcfg_input (size:192b/24B): input message whose only payload after the five leading header fields is port_id */
+struct hwrm_queue_pfcenable_qcfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	port_id;
+	u8	unused_0[6];	/* fills the struct out to the stated 24 bytes */
+};
+
+/* hwrm_queue_pfcenable_qcfg_output (size:128b/16B): output message with one 32-bit flags word of per-priority PFC bits */
+struct hwrm_queue_pfcenable_qcfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le32	flags;	/* bits 0-7: PRI0..PRI7 PFC_ENABLED; bits 8-15: PRI0..PRI7 PFC_WATCHDOG_ENABLED */
+	#define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI0_PFC_ENABLED              0x1UL
+	#define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI1_PFC_ENABLED              0x2UL
+	#define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI2_PFC_ENABLED              0x4UL
+	#define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI3_PFC_ENABLED              0x8UL
+	#define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI4_PFC_ENABLED              0x10UL
+	#define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI5_PFC_ENABLED              0x20UL
+	#define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI6_PFC_ENABLED              0x40UL
+	#define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI7_PFC_ENABLED              0x80UL
+	#define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI0_PFC_WATCHDOG_ENABLED     0x100UL
+	#define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI1_PFC_WATCHDOG_ENABLED     0x200UL
+	#define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI2_PFC_WATCHDOG_ENABLED     0x400UL
+	#define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI3_PFC_WATCHDOG_ENABLED     0x800UL
+	#define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI4_PFC_WATCHDOG_ENABLED     0x1000UL
+	#define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI5_PFC_WATCHDOG_ENABLED     0x2000UL
+	#define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI6_PFC_WATCHDOG_ENABLED     0x4000UL
+	#define QUEUE_PFCENABLE_QCFG_RESP_FLAGS_PRI7_PFC_WATCHDOG_ENABLED     0x8000UL
+	u8	unused_0[3];
+	u8	valid;
+};
+
+/* hwrm_queue_pfcenable_cfg_input (size:192b/24B): input message with a flags word of per-priority PFC bits and a port_id */
+struct hwrm_queue_pfcenable_cfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	flags;	/* bits 0-7: PRI0..PRI7 PFC_ENABLED; bits 8-15: PRI0..PRI7 PFC_WATCHDOG_ENABLED */
+	#define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI0_PFC_ENABLED              0x1UL
+	#define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI1_PFC_ENABLED              0x2UL
+	#define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI2_PFC_ENABLED              0x4UL
+	#define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI3_PFC_ENABLED              0x8UL
+	#define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI4_PFC_ENABLED              0x10UL
+	#define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI5_PFC_ENABLED              0x20UL
+	#define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI6_PFC_ENABLED              0x40UL
+	#define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI7_PFC_ENABLED              0x80UL
+	#define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI0_PFC_WATCHDOG_ENABLED     0x100UL
+	#define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI1_PFC_WATCHDOG_ENABLED     0x200UL
+	#define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI2_PFC_WATCHDOG_ENABLED     0x400UL
+	#define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI3_PFC_WATCHDOG_ENABLED     0x800UL
+	#define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI4_PFC_WATCHDOG_ENABLED     0x1000UL
+	#define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI5_PFC_WATCHDOG_ENABLED     0x2000UL
+	#define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI6_PFC_WATCHDOG_ENABLED     0x4000UL
+	#define QUEUE_PFCENABLE_CFG_REQ_FLAGS_PRI7_PFC_WATCHDOG_ENABLED     0x8000UL
+	__le16	port_id;
+	u8	unused_0[2];
+};
+
+/* hwrm_queue_pfcenable_cfg_output (size:128b/16B): output message with no payload beyond the four leading fields and the trailing valid byte */
+struct hwrm_queue_pfcenable_cfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_queue_pri2cos_qcfg_input (size:192b/24B): input message with PATH/IVLAN flags and a one-byte port_id */
+struct hwrm_queue_pri2cos_qcfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	flags;	/* bit 0 = PATH (TX 0 / RX 1), bit 1 = IVLAN */
+	#define QUEUE_PRI2COS_QCFG_REQ_FLAGS_PATH      0x1UL
+	#define QUEUE_PRI2COS_QCFG_REQ_FLAGS_PATH_TX     0x0UL
+	#define QUEUE_PRI2COS_QCFG_REQ_FLAGS_PATH_RX     0x1UL
+	#define QUEUE_PRI2COS_QCFG_REQ_FLAGS_PATH_LAST  QUEUE_PRI2COS_QCFG_REQ_FLAGS_PATH_RX
+	#define QUEUE_PRI2COS_QCFG_REQ_FLAGS_IVLAN     0x2UL
+	u8	port_id;	/* a single byte in this message */
+	u8	unused_0[3];
+};
+
+/* hwrm_queue_pri2cos_qcfg_output (size:192b/24B): output message with one cos_queue_id byte for each of pri0..pri7 */
+struct hwrm_queue_pri2cos_qcfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	pri0_cos_queue_id;
+	u8	pri1_cos_queue_id;
+	u8	pri2_cos_queue_id;
+	u8	pri3_cos_queue_id;
+	u8	pri4_cos_queue_id;
+	u8	pri5_cos_queue_id;
+	u8	pri6_cos_queue_id;
+	u8	pri7_cos_queue_id;
+	u8	queue_cfg_info;	/* bit mask; only ASYM_CFG (0x1) is defined here */
+	#define QUEUE_PRI2COS_QCFG_RESP_QUEUE_CFG_INFO_ASYM_CFG     0x1UL
+	u8	unused_0[6];
+	u8	valid;
+};
+
+/* hwrm_queue_pri2cos_cfg_input (size:320b/40B): input message with one cos_queue_id byte per priority and a matching enables bit for each */
+struct hwrm_queue_pri2cos_cfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	flags;	/* bits 0-1 = PATH field (TX/RX/BIDIR), bit 2 = IVLAN (0x4 here, unlike 0x2 in the qcfg request) */
+	#define QUEUE_PRI2COS_CFG_REQ_FLAGS_PATH_MASK 0x3UL
+	#define QUEUE_PRI2COS_CFG_REQ_FLAGS_PATH_SFT  0
+	#define QUEUE_PRI2COS_CFG_REQ_FLAGS_PATH_TX     0x0UL
+	#define QUEUE_PRI2COS_CFG_REQ_FLAGS_PATH_RX     0x1UL
+	#define QUEUE_PRI2COS_CFG_REQ_FLAGS_PATH_BIDIR  0x2UL
+	#define QUEUE_PRI2COS_CFG_REQ_FLAGS_PATH_LAST  QUEUE_PRI2COS_CFG_REQ_FLAGS_PATH_BIDIR
+	#define QUEUE_PRI2COS_CFG_REQ_FLAGS_IVLAN     0x4UL
+	__le32	enables;	/* bit n corresponds to priN_cos_queue_id; presumably marks which entries are meaningful - TODO confirm */
+	#define QUEUE_PRI2COS_CFG_REQ_ENABLES_PRI0_COS_QUEUE_ID     0x1UL
+	#define QUEUE_PRI2COS_CFG_REQ_ENABLES_PRI1_COS_QUEUE_ID     0x2UL
+	#define QUEUE_PRI2COS_CFG_REQ_ENABLES_PRI2_COS_QUEUE_ID     0x4UL
+	#define QUEUE_PRI2COS_CFG_REQ_ENABLES_PRI3_COS_QUEUE_ID     0x8UL
+	#define QUEUE_PRI2COS_CFG_REQ_ENABLES_PRI4_COS_QUEUE_ID     0x10UL
+	#define QUEUE_PRI2COS_CFG_REQ_ENABLES_PRI5_COS_QUEUE_ID     0x20UL
+	#define QUEUE_PRI2COS_CFG_REQ_ENABLES_PRI6_COS_QUEUE_ID     0x40UL
+	#define QUEUE_PRI2COS_CFG_REQ_ENABLES_PRI7_COS_QUEUE_ID     0x80UL
+	u8	port_id;
+	u8	pri0_cos_queue_id;
+	u8	pri1_cos_queue_id;
+	u8	pri2_cos_queue_id;
+	u8	pri3_cos_queue_id;
+	u8	pri4_cos_queue_id;
+	u8	pri5_cos_queue_id;
+	u8	pri6_cos_queue_id;
+	u8	pri7_cos_queue_id;
+	u8	unused_0[7];
+};
+
+/* hwrm_queue_pri2cos_cfg_output (size:128b/16B): output message with no payload beyond the four leading fields and the trailing valid byte */
+struct hwrm_queue_pri2cos_cfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_queue_cos2bw_qcfg_input (size:192b/24B): input message whose only payload after the five leading header fields is port_id */
+struct hwrm_queue_cos2bw_qcfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	port_id;
+	u8	unused_0[6];	/* fills the struct out to the stated 24 bytes */
+};
+
+/* hwrm_queue_cos2bw_qcfg_output (size:896b/112B): bandwidth settings as discrete queue_id0_* fields followed by a packed 7-entry cfg[] array */
+struct hwrm_queue_cos2bw_qcfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	queue_id0;
+	u8	unused_0;
+	__le16	unused_1;
+	__le32	queue_id0_min_bw;	/* bits 0-27 value, bit 28 scale (bits/bytes), bits 29-31 unit, per the masks below */
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_BW_VALUE_MASK             0xfffffffUL
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_BW_VALUE_SFT              0
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_SCALE                     0x10000000UL
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_SCALE_BITS                  (0x0UL << 28)
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_SCALE_BYTES                 (0x1UL << 28)
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_SCALE_LAST                 QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_SCALE_BYTES
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_MASK        0xe0000000UL
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_SFT         29
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_MEGA          (0x0UL << 29)
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_KILO          (0x2UL << 29)
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_BASE          (0x4UL << 29)
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_GIGA          (0x6UL << 29)
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_PERCENT1_100  (0x1UL << 29)
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_INVALID       (0x7UL << 29)
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_LAST         QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_INVALID
+	__le32	queue_id0_max_bw;	/* same bit layout as queue_id0_min_bw */
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_BW_VALUE_MASK             0xfffffffUL
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_BW_VALUE_SFT              0
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_SCALE                     0x10000000UL
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_SCALE_BITS                  (0x0UL << 28)
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_SCALE_BYTES                 (0x1UL << 28)
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_SCALE_LAST                 QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_SCALE_BYTES
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_MASK        0xe0000000UL
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_SFT         29
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_MEGA          (0x0UL << 29)
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_KILO          (0x2UL << 29)
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_BASE          (0x4UL << 29)
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_GIGA          (0x6UL << 29)
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_PERCENT1_100  (0x1UL << 29)
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_INVALID       (0x7UL << 29)
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_LAST         QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_INVALID
+	u8	queue_id0_tsa_assign;	/* SP = 0, ETS = 1; 0x2..0xff are named RESERVED_FIRST..RESERVED_LAST */
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_TSA_ASSIGN_SP             0x0UL
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_TSA_ASSIGN_ETS            0x1UL
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_TSA_ASSIGN_RESERVED_FIRST 0x2UL
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID0_TSA_ASSIGN_RESERVED_LAST  0xffUL
+	u8	queue_id0_pri_lvl;
+	u8	queue_id0_bw_weight;
+	struct {	/* same field set as the queue_id0_* members above, without their pad bytes; presumably queues 1-7 - TODO confirm */
+		u8	queue_id;
+		__le32	queue_id_min_bw;
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MIN_BW_BW_VALUE_MASK             0xfffffffUL
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MIN_BW_BW_VALUE_SFT              0
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MIN_BW_SCALE                     0x10000000UL
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MIN_BW_SCALE_BITS                  (0x0UL << 28)
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MIN_BW_SCALE_BYTES                 (0x1UL << 28)
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MIN_BW_SCALE_LAST                 QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MIN_BW_SCALE_BYTES
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_MASK        0xe0000000UL
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_SFT         29
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_MEGA          (0x0UL << 29)
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_KILO          (0x2UL << 29)
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_BASE          (0x4UL << 29)
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_GIGA          (0x6UL << 29)
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_PERCENT1_100  (0x1UL << 29)
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_INVALID       (0x7UL << 29)
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_LAST         QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_INVALID
+		__le32	queue_id_max_bw;
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MAX_BW_BW_VALUE_MASK             0xfffffffUL
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MAX_BW_BW_VALUE_SFT              0
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MAX_BW_SCALE                     0x10000000UL
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MAX_BW_SCALE_BITS                  (0x0UL << 28)
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MAX_BW_SCALE_BYTES                 (0x1UL << 28)
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MAX_BW_SCALE_LAST                 QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MAX_BW_SCALE_BYTES
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_MASK        0xe0000000UL
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_SFT         29
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_MEGA          (0x0UL << 29)
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_KILO          (0x2UL << 29)
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_BASE          (0x4UL << 29)
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_GIGA          (0x6UL << 29)
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_PERCENT1_100  (0x1UL << 29)
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_INVALID       (0x7UL << 29)
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_LAST         QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_INVALID
+		u8	queue_id_tsa_assign;
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_TSA_ASSIGN_SP             0x0UL
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_TSA_ASSIGN_ETS            0x1UL
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_TSA_ASSIGN_RESERVED_FIRST 0x2UL
+	#define QUEUE_COS2BW_QCFG_RESP_QUEUE_ID_TSA_ASSIGN_RESERVED_LAST  0xffUL
+		u8	queue_id_pri_lvl;
+		u8	queue_id_bw_weight;
+	} __packed cfg[7];	/* 7 entries of 12 bytes each (1+4+4+1+1+1) with no padding, which the stated 112-byte total requires */
+	u8	unused_2[4];
+	u8	valid;
+};
+
+/* hwrm_queue_cos2bw_cfg_input (size:1024b/128B): bandwidth settings as discrete queue_id0_* fields followed by a packed 7-entry cfg[] array */
+struct hwrm_queue_cos2bw_cfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	flags;	/* no flag bits are defined for this field in this struct */
+	__le32	enables;	/* bits 0-7: COS_QUEUE_ID0..7_VALID */
+	#define QUEUE_COS2BW_CFG_REQ_ENABLES_COS_QUEUE_ID0_VALID     0x1UL
+	#define QUEUE_COS2BW_CFG_REQ_ENABLES_COS_QUEUE_ID1_VALID     0x2UL
+	#define QUEUE_COS2BW_CFG_REQ_ENABLES_COS_QUEUE_ID2_VALID     0x4UL
+	#define QUEUE_COS2BW_CFG_REQ_ENABLES_COS_QUEUE_ID3_VALID     0x8UL
+	#define QUEUE_COS2BW_CFG_REQ_ENABLES_COS_QUEUE_ID4_VALID     0x10UL
+	#define QUEUE_COS2BW_CFG_REQ_ENABLES_COS_QUEUE_ID5_VALID     0x20UL
+	#define QUEUE_COS2BW_CFG_REQ_ENABLES_COS_QUEUE_ID6_VALID     0x40UL
+	#define QUEUE_COS2BW_CFG_REQ_ENABLES_COS_QUEUE_ID7_VALID     0x80UL
+	__le16	port_id;
+	u8	queue_id0;
+	u8	unused_0;
+	__le32	queue_id0_min_bw;	/* bits 0-27 value, bit 28 scale (bits/bytes), bits 29-31 unit, per the masks below */
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_MASK             0xfffffffUL
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_SFT              0
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_SCALE                     0x10000000UL
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_SCALE_BITS                  (0x0UL << 28)
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_SCALE_BYTES                 (0x1UL << 28)
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_SCALE_LAST                 QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_SCALE_BYTES
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_MASK        0xe0000000UL
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_SFT         29
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_MEGA          (0x0UL << 29)
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_KILO          (0x2UL << 29)
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_BASE          (0x4UL << 29)
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_GIGA          (0x6UL << 29)
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_PERCENT1_100  (0x1UL << 29)
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_INVALID       (0x7UL << 29)
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_LAST         QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MIN_BW_BW_VALUE_UNIT_INVALID
+	__le32	queue_id0_max_bw;	/* same bit layout as queue_id0_min_bw */
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_BW_VALUE_MASK             0xfffffffUL
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_BW_VALUE_SFT              0
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_SCALE                     0x10000000UL
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_SCALE_BITS                  (0x0UL << 28)
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_SCALE_BYTES                 (0x1UL << 28)
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_SCALE_LAST                 QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_SCALE_BYTES
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_MASK        0xe0000000UL
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_SFT         29
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_MEGA          (0x0UL << 29)
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_KILO          (0x2UL << 29)
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_BASE          (0x4UL << 29)
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_GIGA          (0x6UL << 29)
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_PERCENT1_100  (0x1UL << 29)
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_INVALID       (0x7UL << 29)
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_LAST         QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_MAX_BW_BW_VALUE_UNIT_INVALID
+	u8	queue_id0_tsa_assign;	/* SP = 0, ETS = 1; 0x2..0xff are named RESERVED_FIRST..RESERVED_LAST */
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_TSA_ASSIGN_SP             0x0UL
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_TSA_ASSIGN_ETS            0x1UL
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_TSA_ASSIGN_RESERVED_FIRST 0x2UL
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID0_TSA_ASSIGN_RESERVED_LAST  0xffUL
+	u8	queue_id0_pri_lvl;
+	u8	queue_id0_bw_weight;
+	struct {	/* same field set as the queue_id0_* members above, without their pad byte; presumably queues 1-7 - TODO confirm */
+		u8	queue_id;
+		__le32	queue_id_min_bw;
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MIN_BW_BW_VALUE_MASK             0xfffffffUL
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MIN_BW_BW_VALUE_SFT              0
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MIN_BW_SCALE                     0x10000000UL
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MIN_BW_SCALE_BITS                  (0x0UL << 28)
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MIN_BW_SCALE_BYTES                 (0x1UL << 28)
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MIN_BW_SCALE_LAST                 QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MIN_BW_SCALE_BYTES
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_MASK        0xe0000000UL
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_SFT         29
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_MEGA          (0x0UL << 29)
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_KILO          (0x2UL << 29)
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_BASE          (0x4UL << 29)
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_GIGA          (0x6UL << 29)
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_PERCENT1_100  (0x1UL << 29)
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_INVALID       (0x7UL << 29)
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_LAST         QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MIN_BW_BW_VALUE_UNIT_INVALID
+		__le32	queue_id_max_bw;
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MAX_BW_BW_VALUE_MASK             0xfffffffUL
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MAX_BW_BW_VALUE_SFT              0
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MAX_BW_SCALE                     0x10000000UL
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MAX_BW_SCALE_BITS                  (0x0UL << 28)
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MAX_BW_SCALE_BYTES                 (0x1UL << 28)
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MAX_BW_SCALE_LAST                 QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MAX_BW_SCALE_BYTES
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_MASK        0xe0000000UL
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_SFT         29
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_MEGA          (0x0UL << 29)
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_KILO          (0x2UL << 29)
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_BASE          (0x4UL << 29)
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_GIGA          (0x6UL << 29)
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_PERCENT1_100  (0x1UL << 29)
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_INVALID       (0x7UL << 29)
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_LAST         QUEUE_COS2BW_CFG_REQ_QUEUE_ID_MAX_BW_BW_VALUE_UNIT_INVALID
+		u8	queue_id_tsa_assign;
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_TSA_ASSIGN_SP             0x0UL
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_TSA_ASSIGN_ETS            0x1UL
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_TSA_ASSIGN_RESERVED_FIRST 0x2UL
+	#define QUEUE_COS2BW_CFG_REQ_QUEUE_ID_TSA_ASSIGN_RESERVED_LAST  0xffUL
+		u8	queue_id_pri_lvl;
+		u8	queue_id_bw_weight;
+	} __packed cfg[7];	/* 7 entries of 12 bytes each (1+4+4+1+1+1) with no padding, which the stated 128-byte total requires */
+	u8	unused_1[5];
+};
+
+/* hwrm_queue_cos2bw_cfg_output (size:128b/16B): four __le16 header fields (8 bytes), 7 unused bytes, valid as last byte */
+struct hwrm_queue_cos2bw_cfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_queue_dscp_qcaps_input (size:192b/24B): 16-byte header (req_type..resp_addr), port_id, 7 unused bytes */
+struct hwrm_queue_dscp_qcaps_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	u8	port_id;
+	u8	unused_0[7];
+};
+
+/* hwrm_queue_dscp_qcaps_output (size:128b/16B): 8-byte header (error_code..resp_len), num_dscp_bits, max_entries, valid last */
+struct hwrm_queue_dscp_qcaps_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	num_dscp_bits;
+	u8	unused_0;
+	__le16	max_entries;
+	u8	unused_1[3];
+	u8	valid;
+};
+
+/* hwrm_queue_dscp2pri_qcfg_input (size:256b/32B): 16-byte header, dest_data_addr, port_id, dest_data_buffer_size */
+struct hwrm_queue_dscp2pri_qcfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le64	dest_data_addr;	/* NOTE(review): presumably sized by dest_data_buffer_size below - confirm */
+	u8	port_id;
+	u8	unused_0;
+	__le16	dest_data_buffer_size;
+	u8	unused_1[4];
+};
+
+/* hwrm_queue_dscp2pri_qcfg_output (size:128b/16B): 8-byte header, entry_cnt, default_pri, 4 unused bytes, valid last */
+struct hwrm_queue_dscp2pri_qcfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le16	entry_cnt;
+	u8	default_pri;
+	u8	unused_0[4];
+	u8	valid;
+};
+
+/* hwrm_queue_dscp2pri_cfg_input (size:320b/40B): 16-byte header, src_data_addr, flags/enables masks, port_id, default_pri, entry_cnt */
+struct hwrm_queue_dscp2pri_cfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le64	src_data_addr;
+	__le32	flags;	/* single bit defined: QUEUE_DSCP2PRI_CFG_REQ_FLAGS_USE_HW_DEFAULT_PRI */
+	#define QUEUE_DSCP2PRI_CFG_REQ_FLAGS_USE_HW_DEFAULT_PRI     0x1UL
+	__le32	enables;	/* single bit defined: QUEUE_DSCP2PRI_CFG_REQ_ENABLES_DEFAULT_PRI */
+	#define QUEUE_DSCP2PRI_CFG_REQ_ENABLES_DEFAULT_PRI     0x1UL
+	u8	port_id;
+	u8	default_pri;
+	__le16	entry_cnt;
+	u8	unused_0[4];
+};
+
+/* hwrm_queue_dscp2pri_cfg_output (size:128b/16B): 8-byte header (error_code..resp_len), 7 unused bytes, valid last */
+struct hwrm_queue_dscp2pri_cfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_vnic_alloc_input (size:192b/24B): 16-byte header (req_type..resp_addr), flags, virtio_net_fid, vnic_id */
+struct hwrm_vnic_alloc_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	flags;	/* bit mask built from VNIC_ALLOC_REQ_FLAGS_* (bits 0-2 defined) */
+	#define VNIC_ALLOC_REQ_FLAGS_DEFAULT                  0x1UL
+	#define VNIC_ALLOC_REQ_FLAGS_VIRTIO_NET_FID_VALID     0x2UL
+	#define VNIC_ALLOC_REQ_FLAGS_VNIC_ID_VALID            0x4UL
+	__le16	virtio_net_fid;
+	__le16	vnic_id;	/* 16 bits wide here; hwrm_vnic_alloc_output carries vnic_id as __le32 */
+};
+
+/* hwrm_vnic_alloc_output (size:128b/16B): 8-byte header (error_code..resp_len), 32-bit vnic_id, 3 unused bytes, valid last */
+struct hwrm_vnic_alloc_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le32	vnic_id;
+	u8	unused_0[3];
+	u8	valid;
+};
+
+/* hwrm_vnic_update_input (size:256b/32B): 16-byte header, vnic_id, enables mask, vnic_state, metadata_format_type, mru */
+struct hwrm_vnic_update_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	vnic_id;
+	__le32	enables;	/* one *_VALID bit per optional field: vnic_state, mru, metadata_format_type */
+	#define VNIC_UPDATE_REQ_ENABLES_VNIC_STATE_VALID               0x1UL
+	#define VNIC_UPDATE_REQ_ENABLES_MRU_VALID                      0x2UL
+	#define VNIC_UPDATE_REQ_ENABLES_METADATA_FORMAT_TYPE_VALID     0x4UL
+	u8	vnic_state;	/* enumerated value, not a bit mask: NORMAL (0) or DROP (1) */
+	#define VNIC_UPDATE_REQ_VNIC_STATE_NORMAL 0x0UL
+	#define VNIC_UPDATE_REQ_VNIC_STATE_DROP   0x1UL
+	#define VNIC_UPDATE_REQ_VNIC_STATE_LAST  VNIC_UPDATE_REQ_VNIC_STATE_DROP
+	u8	metadata_format_type;	/* enumerated value 0..4 */
+	#define VNIC_UPDATE_REQ_METADATA_FORMAT_TYPE_0 0x0UL
+	#define VNIC_UPDATE_REQ_METADATA_FORMAT_TYPE_1 0x1UL
+	#define VNIC_UPDATE_REQ_METADATA_FORMAT_TYPE_2 0x2UL
+	#define VNIC_UPDATE_REQ_METADATA_FORMAT_TYPE_3 0x3UL
+	#define VNIC_UPDATE_REQ_METADATA_FORMAT_TYPE_4 0x4UL
+	#define VNIC_UPDATE_REQ_METADATA_FORMAT_TYPE_LAST VNIC_UPDATE_REQ_METADATA_FORMAT_TYPE_4
+	__le16	mru;
+	u8	unused_1[4];
+};
+
+/* hwrm_vnic_update_output (size:128b/16B): 8-byte header (error_code..resp_len), 7 unused bytes, valid last */
+struct hwrm_vnic_update_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_vnic_free_input (size:192b/24B): 16-byte header (req_type..resp_addr), 32-bit vnic_id, 4 unused bytes */
+struct hwrm_vnic_free_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	vnic_id;
+	u8	unused_0[4];
+};
+
+/* hwrm_vnic_free_output (size:128b/16B): 8-byte header (error_code..resp_len), 7 unused bytes, valid last */
+struct hwrm_vnic_free_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_vnic_cfg_input (size:384b/48B): 16-byte header, flags and enables masks, nine __le16 ids/rules, two u8 modes, raw_qp_id */
+struct hwrm_vnic_cfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	flags;	/* bit mask built from VNIC_CFG_REQ_FLAGS_* (bits 0-7 defined) */
+	#define VNIC_CFG_REQ_FLAGS_DEFAULT                              0x1UL
+	#define VNIC_CFG_REQ_FLAGS_VLAN_STRIP_MODE                      0x2UL
+	#define VNIC_CFG_REQ_FLAGS_BD_STALL_MODE                        0x4UL
+	#define VNIC_CFG_REQ_FLAGS_ROCE_DUAL_VNIC_MODE                  0x8UL
+	#define VNIC_CFG_REQ_FLAGS_ROCE_ONLY_VNIC_MODE                  0x10UL
+	#define VNIC_CFG_REQ_FLAGS_RSS_DFLT_CR_MODE                     0x20UL
+	#define VNIC_CFG_REQ_FLAGS_ROCE_MIRRORING_CAPABLE_VNIC_MODE     0x40UL
+	#define VNIC_CFG_REQ_FLAGS_PORTCOS_MAPPING_MODE                 0x80UL
+	__le32	enables;	/* bit mask built from VNIC_CFG_REQ_ENABLES_* (bits 0-10 defined), each named after a field below */
+	#define VNIC_CFG_REQ_ENABLES_DFLT_RING_GRP            0x1UL
+	#define VNIC_CFG_REQ_ENABLES_RSS_RULE                 0x2UL
+	#define VNIC_CFG_REQ_ENABLES_COS_RULE                 0x4UL
+	#define VNIC_CFG_REQ_ENABLES_LB_RULE                  0x8UL
+	#define VNIC_CFG_REQ_ENABLES_MRU                      0x10UL
+	#define VNIC_CFG_REQ_ENABLES_DEFAULT_RX_RING_ID       0x20UL
+	#define VNIC_CFG_REQ_ENABLES_DEFAULT_CMPL_RING_ID     0x40UL
+	#define VNIC_CFG_REQ_ENABLES_QUEUE_ID                 0x80UL
+	#define VNIC_CFG_REQ_ENABLES_RX_CSUM_V2_MODE          0x100UL
+	#define VNIC_CFG_REQ_ENABLES_L2_CQE_MODE              0x200UL
+	#define VNIC_CFG_REQ_ENABLES_RAW_QP_ID                0x400UL
+	__le16	vnic_id;
+	__le16	dflt_ring_grp;
+	__le16	rss_rule;
+	__le16	cos_rule;
+	__le16	lb_rule;
+	__le16	mru;
+	__le16	default_rx_ring_id;
+	__le16	default_cmpl_ring_id;
+	__le16	queue_id;
+	u8	rx_csum_v2_mode;	/* enumerated value: DEFAULT (0), ALL_OK (1) or MAX (2) */
+	#define VNIC_CFG_REQ_RX_CSUM_V2_MODE_DEFAULT 0x0UL
+	#define VNIC_CFG_REQ_RX_CSUM_V2_MODE_ALL_OK  0x1UL
+	#define VNIC_CFG_REQ_RX_CSUM_V2_MODE_MAX     0x2UL
+	#define VNIC_CFG_REQ_RX_CSUM_V2_MODE_LAST   VNIC_CFG_REQ_RX_CSUM_V2_MODE_MAX
+	u8	l2_cqe_mode;	/* enumerated value: DEFAULT (0), COMPRESSED (1) or MIXED (2) */
+	#define VNIC_CFG_REQ_L2_CQE_MODE_DEFAULT    0x0UL
+	#define VNIC_CFG_REQ_L2_CQE_MODE_COMPRESSED 0x1UL
+	#define VNIC_CFG_REQ_L2_CQE_MODE_MIXED      0x2UL
+	#define VNIC_CFG_REQ_L2_CQE_MODE_LAST      VNIC_CFG_REQ_L2_CQE_MODE_MIXED
+	__le32	raw_qp_id;
+};
+
+/* hwrm_vnic_cfg_output (size:128b/16B): 8-byte header (error_code..resp_len), 7 unused bytes, valid last */
+struct hwrm_vnic_cfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_vnic_qcaps_input (size:192b/24B): 16-byte header (req_type..resp_addr), enables, 4 unused bytes */
+struct hwrm_vnic_qcaps_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	enables;	/* no bit macros are defined for this field here */
+	u8	unused_0[4];
+};
+
+/* hwrm_vnic_qcaps_output (size:192b/24B): 8-byte header, mru, flags capability mask, max_aggs_supported, valid last */
+struct hwrm_vnic_qcaps_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le16	mru;
+	u8	unused_0[2];
+	__le32	flags;	/* bit mask built from VNIC_QCAPS_RESP_FLAGS_*; bits 0-30 have macros, bit 31 has none */
+	#define VNIC_QCAPS_RESP_FLAGS_UNUSED                                  0x1UL
+	#define VNIC_QCAPS_RESP_FLAGS_VLAN_STRIP_CAP                          0x2UL
+	#define VNIC_QCAPS_RESP_FLAGS_BD_STALL_CAP                            0x4UL
+	#define VNIC_QCAPS_RESP_FLAGS_ROCE_DUAL_VNIC_CAP                      0x8UL
+	#define VNIC_QCAPS_RESP_FLAGS_ROCE_ONLY_VNIC_CAP                      0x10UL
+	#define VNIC_QCAPS_RESP_FLAGS_RSS_DFLT_CR_CAP                         0x20UL
+	#define VNIC_QCAPS_RESP_FLAGS_ROCE_MIRRORING_CAPABLE_VNIC_CAP         0x40UL
+	#define VNIC_QCAPS_RESP_FLAGS_OUTERMOST_RSS_CAP                       0x80UL
+	#define VNIC_QCAPS_RESP_FLAGS_COS_ASSIGNMENT_CAP                      0x100UL
+	#define VNIC_QCAPS_RESP_FLAGS_RX_CMPL_V2_CAP                          0x200UL
+	#define VNIC_QCAPS_RESP_FLAGS_VNIC_STATE_CAP                          0x400UL
+	#define VNIC_QCAPS_RESP_FLAGS_VIRTIO_NET_VNIC_ALLOC_CAP               0x800UL
+	#define VNIC_QCAPS_RESP_FLAGS_METADATA_FORMAT_CAP                     0x1000UL
+	#define VNIC_QCAPS_RESP_FLAGS_RSS_STRICT_HASH_TYPE_CAP                0x2000UL
+	#define VNIC_QCAPS_RESP_FLAGS_RSS_HASH_TYPE_DELTA_CAP                 0x4000UL
+	#define VNIC_QCAPS_RESP_FLAGS_RING_SELECT_MODE_TOEPLITZ_CAP           0x8000UL
+	#define VNIC_QCAPS_RESP_FLAGS_RING_SELECT_MODE_XOR_CAP                0x10000UL
+	#define VNIC_QCAPS_RESP_FLAGS_RING_SELECT_MODE_TOEPLITZ_CHKSM_CAP     0x20000UL
+	#define VNIC_QCAPS_RESP_FLAGS_RSS_IPV6_FLOW_LABEL_CAP                 0x40000UL
+	#define VNIC_QCAPS_RESP_FLAGS_RX_CMPL_V3_CAP                          0x80000UL
+	#define VNIC_QCAPS_RESP_FLAGS_L2_CQE_MODE_CAP                         0x100000UL
+	#define VNIC_QCAPS_RESP_FLAGS_RSS_IPSEC_AH_SPI_IPV4_CAP               0x200000UL
+	#define VNIC_QCAPS_RESP_FLAGS_RSS_IPSEC_ESP_SPI_IPV4_CAP              0x400000UL
+	#define VNIC_QCAPS_RESP_FLAGS_RSS_IPSEC_AH_SPI_IPV6_CAP               0x800000UL
+	#define VNIC_QCAPS_RESP_FLAGS_RSS_IPSEC_ESP_SPI_IPV6_CAP              0x1000000UL
+	#define VNIC_QCAPS_RESP_FLAGS_OUTERMOST_RSS_TRUSTED_VF_CAP            0x2000000UL
+	#define VNIC_QCAPS_RESP_FLAGS_PORTCOS_MAPPING_MODE                    0x4000000UL
+	#define VNIC_QCAPS_RESP_FLAGS_RSS_PROF_TCAM_MODE_ENABLED              0x8000000UL
+	#define VNIC_QCAPS_RESP_FLAGS_VNIC_RSS_HASH_MODE_CAP                  0x10000000UL
+	#define VNIC_QCAPS_RESP_FLAGS_HW_TUNNEL_TPA_CAP                       0x20000000UL
+	#define VNIC_QCAPS_RESP_FLAGS_RE_FLUSH_CAP                            0x40000000UL
+	__le16	max_aggs_supported;
+	u8	unused_1[5];
+	u8	valid;
+};
+
+/* hwrm_vnic_tpa_cfg_input (size:384b/48B): 16-byte header, flags/enables masks, vnic_id, aggregation limits, tunnel TPA bitmap */
+struct hwrm_vnic_tpa_cfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	flags;	/* bit mask built from VNIC_TPA_CFG_REQ_FLAGS_* (bits 0-8 defined) */
+	#define VNIC_TPA_CFG_REQ_FLAGS_TPA                       0x1UL
+	#define VNIC_TPA_CFG_REQ_FLAGS_ENCAP_TPA                 0x2UL
+	#define VNIC_TPA_CFG_REQ_FLAGS_RSC_WND_UPDATE            0x4UL
+	#define VNIC_TPA_CFG_REQ_FLAGS_GRO                       0x8UL
+	#define VNIC_TPA_CFG_REQ_FLAGS_AGG_WITH_ECN              0x10UL
+	#define VNIC_TPA_CFG_REQ_FLAGS_AGG_WITH_SAME_GRE_SEQ     0x20UL
+	#define VNIC_TPA_CFG_REQ_FLAGS_GRO_IPID_CHECK            0x40UL
+	#define VNIC_TPA_CFG_REQ_FLAGS_GRO_TTL_CHECK             0x80UL
+	#define VNIC_TPA_CFG_REQ_FLAGS_AGG_PACK_AS_GRO           0x100UL
+	__le32	enables;	/* bit mask built from VNIC_TPA_CFG_REQ_ENABLES_* (bits 0-4 defined) */
+	#define VNIC_TPA_CFG_REQ_ENABLES_MAX_AGG_SEGS      0x1UL
+	#define VNIC_TPA_CFG_REQ_ENABLES_MAX_AGGS          0x2UL
+	#define VNIC_TPA_CFG_REQ_ENABLES_MAX_AGG_TIMER     0x4UL
+	#define VNIC_TPA_CFG_REQ_ENABLES_MIN_AGG_LEN       0x8UL
+	#define VNIC_TPA_CFG_REQ_ENABLES_TNL_TPA_EN        0x10UL
+	__le16	vnic_id;
+	__le16	max_agg_segs;	/* NOTE(review): names _1/_2/_4/_8 map to 0..3, presumably a log2 encoding - confirm */
+	#define VNIC_TPA_CFG_REQ_MAX_AGG_SEGS_1   0x0UL
+	#define VNIC_TPA_CFG_REQ_MAX_AGG_SEGS_2   0x1UL
+	#define VNIC_TPA_CFG_REQ_MAX_AGG_SEGS_4   0x2UL
+	#define VNIC_TPA_CFG_REQ_MAX_AGG_SEGS_8   0x3UL
+	#define VNIC_TPA_CFG_REQ_MAX_AGG_SEGS_MAX 0x1fUL
+	#define VNIC_TPA_CFG_REQ_MAX_AGG_SEGS_LAST VNIC_TPA_CFG_REQ_MAX_AGG_SEGS_MAX
+	__le16	max_aggs;	/* NOTE(review): names _1/_2/_4/_8/_16 map to 0..4, presumably a log2 encoding - confirm */
+	#define VNIC_TPA_CFG_REQ_MAX_AGGS_1   0x0UL
+	#define VNIC_TPA_CFG_REQ_MAX_AGGS_2   0x1UL
+	#define VNIC_TPA_CFG_REQ_MAX_AGGS_4   0x2UL
+	#define VNIC_TPA_CFG_REQ_MAX_AGGS_8   0x3UL
+	#define VNIC_TPA_CFG_REQ_MAX_AGGS_16  0x4UL
+	#define VNIC_TPA_CFG_REQ_MAX_AGGS_MAX 0x7UL
+	#define VNIC_TPA_CFG_REQ_MAX_AGGS_LAST VNIC_TPA_CFG_REQ_MAX_AGGS_MAX
+	u8	unused_0[2];
+	__le32	max_agg_timer;
+	__le32	min_agg_len;
+	__le32	tnl_tpa_en_bitmap;	/* bit mask built from VNIC_TPA_CFG_REQ_TNL_TPA_EN_BITMAP_* (bits 0-16 defined) */
+	#define VNIC_TPA_CFG_REQ_TNL_TPA_EN_BITMAP_VXLAN           0x1UL
+	#define VNIC_TPA_CFG_REQ_TNL_TPA_EN_BITMAP_GENEVE          0x2UL
+	#define VNIC_TPA_CFG_REQ_TNL_TPA_EN_BITMAP_NVGRE           0x4UL
+	#define VNIC_TPA_CFG_REQ_TNL_TPA_EN_BITMAP_GRE             0x8UL
+	#define VNIC_TPA_CFG_REQ_TNL_TPA_EN_BITMAP_IPV4            0x10UL
+	#define VNIC_TPA_CFG_REQ_TNL_TPA_EN_BITMAP_IPV6            0x20UL
+	#define VNIC_TPA_CFG_REQ_TNL_TPA_EN_BITMAP_VXLAN_GPE       0x40UL
+	#define VNIC_TPA_CFG_REQ_TNL_TPA_EN_BITMAP_VXLAN_CUST1     0x80UL
+	#define VNIC_TPA_CFG_REQ_TNL_TPA_EN_BITMAP_GRE_CUST1       0x100UL
+	#define VNIC_TPA_CFG_REQ_TNL_TPA_EN_BITMAP_UPAR1           0x200UL
+	#define VNIC_TPA_CFG_REQ_TNL_TPA_EN_BITMAP_UPAR2           0x400UL
+	#define VNIC_TPA_CFG_REQ_TNL_TPA_EN_BITMAP_UPAR3           0x800UL
+	#define VNIC_TPA_CFG_REQ_TNL_TPA_EN_BITMAP_UPAR4           0x1000UL
+	#define VNIC_TPA_CFG_REQ_TNL_TPA_EN_BITMAP_UPAR5           0x2000UL
+	#define VNIC_TPA_CFG_REQ_TNL_TPA_EN_BITMAP_UPAR6           0x4000UL
+	#define VNIC_TPA_CFG_REQ_TNL_TPA_EN_BITMAP_UPAR7           0x8000UL
+	#define VNIC_TPA_CFG_REQ_TNL_TPA_EN_BITMAP_UPAR8           0x10000UL
+	u8	unused_1[4];
+};
+
+/* hwrm_vnic_tpa_cfg_output (size:128b/16B): 8-byte header (error_code..resp_len), 7 unused bytes, valid last */
+struct hwrm_vnic_tpa_cfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_vnic_tpa_qcfg_input (size:192b/24B): 16-byte header (req_type..resp_addr), 16-bit vnic_id, 6 unused bytes */
+struct hwrm_vnic_tpa_qcfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	vnic_id;
+	u8	unused_0[6];
+};
+
+/* hwrm_vnic_tpa_qcfg_output (size:256b/32B): 8-byte header, flags, aggregation limits, tunnel TPA bitmap, valid last */
+struct hwrm_vnic_tpa_qcfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le32	flags;	/* bits 0-7 defined; unlike VNIC_TPA_CFG_REQ_FLAGS_* there is no AGG_PACK_AS_GRO (0x100) macro */
+	#define VNIC_TPA_QCFG_RESP_FLAGS_TPA                       0x1UL
+	#define VNIC_TPA_QCFG_RESP_FLAGS_ENCAP_TPA                 0x2UL
+	#define VNIC_TPA_QCFG_RESP_FLAGS_RSC_WND_UPDATE            0x4UL
+	#define VNIC_TPA_QCFG_RESP_FLAGS_GRO                       0x8UL
+	#define VNIC_TPA_QCFG_RESP_FLAGS_AGG_WITH_ECN              0x10UL
+	#define VNIC_TPA_QCFG_RESP_FLAGS_AGG_WITH_SAME_GRE_SEQ     0x20UL
+	#define VNIC_TPA_QCFG_RESP_FLAGS_GRO_IPID_CHECK            0x40UL
+	#define VNIC_TPA_QCFG_RESP_FLAGS_GRO_TTL_CHECK             0x80UL
+	__le16	max_agg_segs;	/* same value set as VNIC_TPA_CFG_REQ_MAX_AGG_SEGS_* */
+	#define VNIC_TPA_QCFG_RESP_MAX_AGG_SEGS_1   0x0UL
+	#define VNIC_TPA_QCFG_RESP_MAX_AGG_SEGS_2   0x1UL
+	#define VNIC_TPA_QCFG_RESP_MAX_AGG_SEGS_4   0x2UL
+	#define VNIC_TPA_QCFG_RESP_MAX_AGG_SEGS_8   0x3UL
+	#define VNIC_TPA_QCFG_RESP_MAX_AGG_SEGS_MAX 0x1fUL
+	#define VNIC_TPA_QCFG_RESP_MAX_AGG_SEGS_LAST VNIC_TPA_QCFG_RESP_MAX_AGG_SEGS_MAX
+	__le16	max_aggs;	/* same value set as VNIC_TPA_CFG_REQ_MAX_AGGS_* */
+	#define VNIC_TPA_QCFG_RESP_MAX_AGGS_1   0x0UL
+	#define VNIC_TPA_QCFG_RESP_MAX_AGGS_2   0x1UL
+	#define VNIC_TPA_QCFG_RESP_MAX_AGGS_4   0x2UL
+	#define VNIC_TPA_QCFG_RESP_MAX_AGGS_8   0x3UL
+	#define VNIC_TPA_QCFG_RESP_MAX_AGGS_16  0x4UL
+	#define VNIC_TPA_QCFG_RESP_MAX_AGGS_MAX 0x7UL
+	#define VNIC_TPA_QCFG_RESP_MAX_AGGS_LAST VNIC_TPA_QCFG_RESP_MAX_AGGS_MAX
+	__le32	max_agg_timer;
+	__le32	min_agg_len;
+	__le32	tnl_tpa_en_bitmap;	/* bit mask built from VNIC_TPA_QCFG_RESP_TNL_TPA_EN_BITMAP_* (bits 0-16 defined) */
+	#define VNIC_TPA_QCFG_RESP_TNL_TPA_EN_BITMAP_VXLAN           0x1UL
+	#define VNIC_TPA_QCFG_RESP_TNL_TPA_EN_BITMAP_GENEVE          0x2UL
+	#define VNIC_TPA_QCFG_RESP_TNL_TPA_EN_BITMAP_NVGRE           0x4UL
+	#define VNIC_TPA_QCFG_RESP_TNL_TPA_EN_BITMAP_GRE             0x8UL
+	#define VNIC_TPA_QCFG_RESP_TNL_TPA_EN_BITMAP_IPV4            0x10UL
+	#define VNIC_TPA_QCFG_RESP_TNL_TPA_EN_BITMAP_IPV6            0x20UL
+	#define VNIC_TPA_QCFG_RESP_TNL_TPA_EN_BITMAP_VXLAN_GPE       0x40UL
+	#define VNIC_TPA_QCFG_RESP_TNL_TPA_EN_BITMAP_VXLAN_CUST1     0x80UL
+	#define VNIC_TPA_QCFG_RESP_TNL_TPA_EN_BITMAP_GRE_CUST1       0x100UL
+	#define VNIC_TPA_QCFG_RESP_TNL_TPA_EN_BITMAP_UPAR1           0x200UL
+	#define VNIC_TPA_QCFG_RESP_TNL_TPA_EN_BITMAP_UPAR2           0x400UL
+	#define VNIC_TPA_QCFG_RESP_TNL_TPA_EN_BITMAP_UPAR3           0x800UL
+	#define VNIC_TPA_QCFG_RESP_TNL_TPA_EN_BITMAP_UPAR4           0x1000UL
+	#define VNIC_TPA_QCFG_RESP_TNL_TPA_EN_BITMAP_UPAR5           0x2000UL
+	#define VNIC_TPA_QCFG_RESP_TNL_TPA_EN_BITMAP_UPAR6           0x4000UL
+	#define VNIC_TPA_QCFG_RESP_TNL_TPA_EN_BITMAP_UPAR7           0x8000UL
+	#define VNIC_TPA_QCFG_RESP_TNL_TPA_EN_BITMAP_UPAR8           0x10000UL
+	u8	unused_0[3];
+	u8	valid;
+};
+
+/* hwrm_vnic_rss_cfg_input (size:384b/48B): 16-byte header, hash_type mask, vnic_id, two 64-bit table addresses, mode fields */
+struct hwrm_vnic_rss_cfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	hash_type;	/* bit mask built from VNIC_RSS_CFG_REQ_HASH_TYPE_* (bits 0-10 defined) */
+	#define VNIC_RSS_CFG_REQ_HASH_TYPE_IPV4                0x1UL
+	#define VNIC_RSS_CFG_REQ_HASH_TYPE_TCP_IPV4            0x2UL
+	#define VNIC_RSS_CFG_REQ_HASH_TYPE_UDP_IPV4            0x4UL
+	#define VNIC_RSS_CFG_REQ_HASH_TYPE_IPV6                0x8UL
+	#define VNIC_RSS_CFG_REQ_HASH_TYPE_TCP_IPV6            0x10UL
+	#define VNIC_RSS_CFG_REQ_HASH_TYPE_UDP_IPV6            0x20UL
+	#define VNIC_RSS_CFG_REQ_HASH_TYPE_IPV6_FLOW_LABEL     0x40UL
+	#define VNIC_RSS_CFG_REQ_HASH_TYPE_AH_SPI_IPV4         0x80UL
+	#define VNIC_RSS_CFG_REQ_HASH_TYPE_ESP_SPI_IPV4        0x100UL
+	#define VNIC_RSS_CFG_REQ_HASH_TYPE_AH_SPI_IPV6         0x200UL
+	#define VNIC_RSS_CFG_REQ_HASH_TYPE_ESP_SPI_IPV6        0x400UL
+	__le16	vnic_id;
+	u8	ring_table_pair_index;
+	u8	hash_mode_flags;	/* bit mask built from VNIC_RSS_CFG_REQ_HASH_MODE_FLAGS_* (bits 0-4 defined) */
+	#define VNIC_RSS_CFG_REQ_HASH_MODE_FLAGS_DEFAULT         0x1UL
+	#define VNIC_RSS_CFG_REQ_HASH_MODE_FLAGS_INNERMOST_4     0x2UL
+	#define VNIC_RSS_CFG_REQ_HASH_MODE_FLAGS_INNERMOST_2     0x4UL
+	#define VNIC_RSS_CFG_REQ_HASH_MODE_FLAGS_OUTERMOST_4     0x8UL
+	#define VNIC_RSS_CFG_REQ_HASH_MODE_FLAGS_OUTERMOST_2     0x10UL
+	__le64	ring_grp_tbl_addr;
+	__le64	hash_key_tbl_addr;
+	__le16	rss_ctx_idx;
+	u8	flags;	/* bit mask built from VNIC_RSS_CFG_REQ_FLAGS_* (bits 0-2 defined) */
+	#define VNIC_RSS_CFG_REQ_FLAGS_HASH_TYPE_INCLUDE               0x1UL
+	#define VNIC_RSS_CFG_REQ_FLAGS_HASH_TYPE_EXCLUDE               0x2UL
+	#define VNIC_RSS_CFG_REQ_FLAGS_IPSEC_HASH_TYPE_CFG_SUPPORT     0x4UL
+	u8	ring_select_mode;	/* enumerated value: TOEPLITZ (0), XOR (1) or TOEPLITZ_CHECKSUM (2) */
+	#define VNIC_RSS_CFG_REQ_RING_SELECT_MODE_TOEPLITZ          0x0UL
+	#define VNIC_RSS_CFG_REQ_RING_SELECT_MODE_XOR               0x1UL
+	#define VNIC_RSS_CFG_REQ_RING_SELECT_MODE_TOEPLITZ_CHECKSUM 0x2UL
+	#define VNIC_RSS_CFG_REQ_RING_SELECT_MODE_LAST             VNIC_RSS_CFG_REQ_RING_SELECT_MODE_TOEPLITZ_CHECKSUM
+	u8	unused_1[4];
+};
+
+/* hwrm_vnic_rss_cfg_output (size:128b/16B): 8-byte header (error_code..resp_len), 7 unused bytes, valid last */
+struct hwrm_vnic_rss_cfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_vnic_rss_cfg_cmd_err (size:64b/8B): one-byte code (UNKNOWN or INTERFACE_NOT_READY) followed by 7 unused bytes */
+struct hwrm_vnic_rss_cfg_cmd_err {
+	u8	code;
+	#define VNIC_RSS_CFG_CMD_ERR_CODE_UNKNOWN             0x0UL
+	#define VNIC_RSS_CFG_CMD_ERR_CODE_INTERFACE_NOT_READY 0x1UL
+	#define VNIC_RSS_CFG_CMD_ERR_CODE_LAST               VNIC_RSS_CFG_CMD_ERR_CODE_INTERFACE_NOT_READY
+	u8	unused_0[7];
+};
+
+/* hwrm_vnic_rss_qcfg_input (size:192b/24B): 16-byte header (req_type..resp_addr), rss_ctx_idx, vnic_id, 4 unused bytes */
+struct hwrm_vnic_rss_qcfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	rss_ctx_idx;
+	__le16	vnic_id;
+	u8	unused_0[4];
+};
+
+/* hwrm_vnic_rss_qcfg_output (size:512b/64B): 8-byte header, hash_type mask, 40-byte hash_key, mode fields, valid last */
+struct hwrm_vnic_rss_qcfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le32	hash_type;	/* bit mask built from VNIC_RSS_QCFG_RESP_HASH_TYPE_* (bits 0-10 defined) */
+	#define VNIC_RSS_QCFG_RESP_HASH_TYPE_IPV4                0x1UL
+	#define VNIC_RSS_QCFG_RESP_HASH_TYPE_TCP_IPV4            0x2UL
+	#define VNIC_RSS_QCFG_RESP_HASH_TYPE_UDP_IPV4            0x4UL
+	#define VNIC_RSS_QCFG_RESP_HASH_TYPE_IPV6                0x8UL
+	#define VNIC_RSS_QCFG_RESP_HASH_TYPE_TCP_IPV6            0x10UL
+	#define VNIC_RSS_QCFG_RESP_HASH_TYPE_UDP_IPV6            0x20UL
+	#define VNIC_RSS_QCFG_RESP_HASH_TYPE_IPV6_FLOW_LABEL     0x40UL
+	#define VNIC_RSS_QCFG_RESP_HASH_TYPE_AH_SPI_IPV4         0x80UL
+	#define VNIC_RSS_QCFG_RESP_HASH_TYPE_ESP_SPI_IPV4        0x100UL
+	#define VNIC_RSS_QCFG_RESP_HASH_TYPE_AH_SPI_IPV6         0x200UL
+	#define VNIC_RSS_QCFG_RESP_HASH_TYPE_ESP_SPI_IPV6        0x400UL
+	u8	unused_0[4];
+	__le32	hash_key[10];	/* ten little-endian 32-bit words, 40 bytes in total */
+	u8	hash_mode_flags;	/* bit mask built from VNIC_RSS_QCFG_RESP_HASH_MODE_FLAGS_* (bits 0-4 defined) */
+	#define VNIC_RSS_QCFG_RESP_HASH_MODE_FLAGS_DEFAULT         0x1UL
+	#define VNIC_RSS_QCFG_RESP_HASH_MODE_FLAGS_INNERMOST_4     0x2UL
+	#define VNIC_RSS_QCFG_RESP_HASH_MODE_FLAGS_INNERMOST_2     0x4UL
+	#define VNIC_RSS_QCFG_RESP_HASH_MODE_FLAGS_OUTERMOST_4     0x8UL
+	#define VNIC_RSS_QCFG_RESP_HASH_MODE_FLAGS_OUTERMOST_2     0x10UL
+	u8	ring_select_mode;	/* enumerated value: TOEPLITZ (0), XOR (1) or TOEPLITZ_CHECKSUM (2) */
+	#define VNIC_RSS_QCFG_RESP_RING_SELECT_MODE_TOEPLITZ          0x0UL
+	#define VNIC_RSS_QCFG_RESP_RING_SELECT_MODE_XOR               0x1UL
+	#define VNIC_RSS_QCFG_RESP_RING_SELECT_MODE_TOEPLITZ_CHECKSUM 0x2UL
+	#define VNIC_RSS_QCFG_RESP_RING_SELECT_MODE_LAST             VNIC_RSS_QCFG_RESP_RING_SELECT_MODE_TOEPLITZ_CHECKSUM
+	u8	unused_1[5];
+	u8	valid;
+};
+
+/* hwrm_vnic_plcmodes_cfg_input (size:320b/40B): 16-byte header, flags/enables masks, 32-bit vnic_id, four __le16 thresholds */
+struct hwrm_vnic_plcmodes_cfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	flags;	/* bit mask built from VNIC_PLCMODES_CFG_REQ_FLAGS_* (bits 0-6 defined) */
+	#define VNIC_PLCMODES_CFG_REQ_FLAGS_REGULAR_PLACEMENT     0x1UL
+	#define VNIC_PLCMODES_CFG_REQ_FLAGS_JUMBO_PLACEMENT       0x2UL
+	#define VNIC_PLCMODES_CFG_REQ_FLAGS_HDS_IPV4              0x4UL
+	#define VNIC_PLCMODES_CFG_REQ_FLAGS_HDS_IPV6              0x8UL
+	#define VNIC_PLCMODES_CFG_REQ_FLAGS_HDS_FCOE              0x10UL
+	#define VNIC_PLCMODES_CFG_REQ_FLAGS_HDS_ROCE              0x20UL
+	#define VNIC_PLCMODES_CFG_REQ_FLAGS_VIRTIO_PLACEMENT      0x40UL
+	__le32	enables;	/* one *_VALID bit per field below: jumbo_thresh, hds_offset, hds_threshold, max_bds */
+	#define VNIC_PLCMODES_CFG_REQ_ENABLES_JUMBO_THRESH_VALID      0x1UL
+	#define VNIC_PLCMODES_CFG_REQ_ENABLES_HDS_OFFSET_VALID        0x2UL
+	#define VNIC_PLCMODES_CFG_REQ_ENABLES_HDS_THRESHOLD_VALID     0x4UL
+	#define VNIC_PLCMODES_CFG_REQ_ENABLES_MAX_BDS_VALID           0x8UL
+	__le32	vnic_id;
+	__le16	jumbo_thresh;
+	__le16	hds_offset;
+	__le16	hds_threshold;
+	__le16	max_bds;
+	u8	unused_0[4];
+};
+
+/* hwrm_vnic_plcmodes_cfg_output (size:128b/16B): 8-byte header (error_code..resp_len), 7 unused bytes, valid last */
+struct hwrm_vnic_plcmodes_cfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_vnic_plcmodes_cfg_cmd_err (size:64b/8B): one-byte code (UNKNOWN or INVALID_HDS_THRESHOLD) followed by 7 unused bytes */
+struct hwrm_vnic_plcmodes_cfg_cmd_err {
+	u8	code;
+	#define VNIC_PLCMODES_CFG_CMD_ERR_CODE_UNKNOWN               0x0UL
+	#define VNIC_PLCMODES_CFG_CMD_ERR_CODE_INVALID_HDS_THRESHOLD 0x1UL
+	#define VNIC_PLCMODES_CFG_CMD_ERR_CODE_LAST                 VNIC_PLCMODES_CFG_CMD_ERR_CODE_INVALID_HDS_THRESHOLD
+	u8	unused_0[7];
+};
+
+/* hwrm_vnic_rss_cos_lb_ctx_alloc_input (size:128b/16B): only the 16-byte header (req_type..resp_addr), no payload fields */
+struct hwrm_vnic_rss_cos_lb_ctx_alloc_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+};
+
+/* hwrm_vnic_rss_cos_lb_ctx_alloc_output (size:128b/16B): 8-byte header, rss_cos_lb_ctx_id, 5 unused bytes, valid last */
+struct hwrm_vnic_rss_cos_lb_ctx_alloc_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le16	rss_cos_lb_ctx_id;
+	u8	unused_0[5];
+	u8	valid;
+};
+
+/* hwrm_vnic_rss_cos_lb_ctx_free_input (size:192b/24B): 16-byte header (req_type..resp_addr), rss_cos_lb_ctx_id, 6 unused bytes */
+struct hwrm_vnic_rss_cos_lb_ctx_free_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	rss_cos_lb_ctx_id;
+	u8	unused_0[6];
+};
+
+/* hwrm_vnic_rss_cos_lb_ctx_free_output (size:128b/16B): 8-byte header (error_code..resp_len), 7 unused bytes, valid last */
+struct hwrm_vnic_rss_cos_lb_ctx_free_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_ring_alloc_input (size:704b/88B): 16-byte header, enables mask, ring type/flags, page table and id fields, max_bw, cq_handle */
+struct hwrm_ring_alloc_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	enables;	/* bit mask built from RING_ALLOC_REQ_ENABLES_*; bits 0x1, 0x4 and 0x10 have no macro here */
+	#define RING_ALLOC_REQ_ENABLES_RING_ARB_CFG              0x2UL
+	#define RING_ALLOC_REQ_ENABLES_STAT_CTX_ID_VALID         0x8UL
+	#define RING_ALLOC_REQ_ENABLES_MAX_BW_VALID              0x20UL
+	#define RING_ALLOC_REQ_ENABLES_RX_RING_ID_VALID          0x40UL
+	#define RING_ALLOC_REQ_ENABLES_NQ_RING_ID_VALID          0x80UL
+	#define RING_ALLOC_REQ_ENABLES_RX_BUF_SIZE_VALID         0x100UL
+	#define RING_ALLOC_REQ_ENABLES_SCHQ_ID                   0x200UL
+	#define RING_ALLOC_REQ_ENABLES_MPC_CHNLS_TYPE            0x400UL
+	#define RING_ALLOC_REQ_ENABLES_STEERING_TAG_VALID        0x800UL
+	#define RING_ALLOC_REQ_ENABLES_RX_RATE_PROFILE_VALID     0x1000UL
+	u8	ring_type;	/* enumerated value 0..5, see RING_ALLOC_REQ_RING_TYPE_* */
+	#define RING_ALLOC_REQ_RING_TYPE_L2_CMPL   0x0UL
+	#define RING_ALLOC_REQ_RING_TYPE_TX        0x1UL
+	#define RING_ALLOC_REQ_RING_TYPE_RX        0x2UL
+	#define RING_ALLOC_REQ_RING_TYPE_ROCE_CMPL 0x3UL
+	#define RING_ALLOC_REQ_RING_TYPE_RX_AGG    0x4UL
+	#define RING_ALLOC_REQ_RING_TYPE_NQ        0x5UL
+	#define RING_ALLOC_REQ_RING_TYPE_LAST     RING_ALLOC_REQ_RING_TYPE_NQ
+	u8	cmpl_coal_cnt;	/* enumerated code 0x0..0xf; the macro name carries the count, the value is only an index */
+	#define RING_ALLOC_REQ_CMPL_COAL_CNT_COAL_OFF 0x0UL
+	#define RING_ALLOC_REQ_CMPL_COAL_CNT_COAL_4   0x1UL
+	#define RING_ALLOC_REQ_CMPL_COAL_CNT_COAL_8   0x2UL
+	#define RING_ALLOC_REQ_CMPL_COAL_CNT_COAL_12  0x3UL
+	#define RING_ALLOC_REQ_CMPL_COAL_CNT_COAL_16  0x4UL
+	#define RING_ALLOC_REQ_CMPL_COAL_CNT_COAL_24  0x5UL
+	#define RING_ALLOC_REQ_CMPL_COAL_CNT_COAL_32  0x6UL
+	#define RING_ALLOC_REQ_CMPL_COAL_CNT_COAL_48  0x7UL
+	#define RING_ALLOC_REQ_CMPL_COAL_CNT_COAL_64  0x8UL
+	#define RING_ALLOC_REQ_CMPL_COAL_CNT_COAL_96  0x9UL
+	#define RING_ALLOC_REQ_CMPL_COAL_CNT_COAL_128 0xaUL
+	#define RING_ALLOC_REQ_CMPL_COAL_CNT_COAL_192 0xbUL
+	#define RING_ALLOC_REQ_CMPL_COAL_CNT_COAL_256 0xcUL
+	#define RING_ALLOC_REQ_CMPL_COAL_CNT_COAL_320 0xdUL
+	#define RING_ALLOC_REQ_CMPL_COAL_CNT_COAL_384 0xeUL
+	#define RING_ALLOC_REQ_CMPL_COAL_CNT_COAL_MAX 0xfUL
+	#define RING_ALLOC_REQ_CMPL_COAL_CNT_LAST    RING_ALLOC_REQ_CMPL_COAL_CNT_COAL_MAX
+	__le16	flags;	/* bit mask built from RING_ALLOC_REQ_FLAGS_* (bits 0-3 defined) */
+	#define RING_ALLOC_REQ_FLAGS_RX_SOP_PAD                        0x1UL
+	#define RING_ALLOC_REQ_FLAGS_DISABLE_CQ_OVERFLOW_DETECTION     0x2UL
+	#define RING_ALLOC_REQ_FLAGS_NQ_DBR_PACING                     0x4UL
+	#define RING_ALLOC_REQ_FLAGS_TX_PKT_TS_CMPL_ENABLE             0x8UL
+	__le64	page_tbl_addr;
+	__le32	fbo;
+	u8	page_size;
+	u8	page_tbl_depth;
+	__le16	schq_id;
+	__le32	length;
+	__le16	logical_id;
+	__le16	cmpl_ring_id;
+	__le16	queue_id;
+	__le16	rx_buf_size;
+	__le16	rx_ring_id;
+	__le16	nq_ring_id;
+	__le16	ring_arb_cfg;	/* packed: bits 3:0 arb policy, bits 7:4 rsvd, bits 15:8 arb policy param */
+	#define RING_ALLOC_REQ_RING_ARB_CFG_ARB_POLICY_MASK      0xfUL
+	#define RING_ALLOC_REQ_RING_ARB_CFG_ARB_POLICY_SFT       0
+	#define RING_ALLOC_REQ_RING_ARB_CFG_ARB_POLICY_SP          0x1UL
+	#define RING_ALLOC_REQ_RING_ARB_CFG_ARB_POLICY_WFQ         0x2UL
+	#define RING_ALLOC_REQ_RING_ARB_CFG_ARB_POLICY_LAST       RING_ALLOC_REQ_RING_ARB_CFG_ARB_POLICY_WFQ
+	#define RING_ALLOC_REQ_RING_ARB_CFG_RSVD_MASK            0xf0UL
+	#define RING_ALLOC_REQ_RING_ARB_CFG_RSVD_SFT             4
+	#define RING_ALLOC_REQ_RING_ARB_CFG_ARB_POLICY_PARAM_MASK 0xff00UL
+	#define RING_ALLOC_REQ_RING_ARB_CFG_ARB_POLICY_PARAM_SFT 8
+	__le16	steering_tag;
+	__le32	reserved3;
+	__le32	stat_ctx_id;
+	__le32	reserved4;
+	__le32	max_bw;	/* packed: bits 27:0 bw value, bit 28 scale (bits/bytes), bits 31:29 bw value unit */
+	#define RING_ALLOC_REQ_MAX_BW_BW_VALUE_MASK             0xfffffffUL
+	#define RING_ALLOC_REQ_MAX_BW_BW_VALUE_SFT              0
+	#define RING_ALLOC_REQ_MAX_BW_SCALE                     0x10000000UL
+	#define RING_ALLOC_REQ_MAX_BW_SCALE_BITS                  (0x0UL << 28)
+	#define RING_ALLOC_REQ_MAX_BW_SCALE_BYTES                 (0x1UL << 28)
+	#define RING_ALLOC_REQ_MAX_BW_SCALE_LAST                 RING_ALLOC_REQ_MAX_BW_SCALE_BYTES
+	#define RING_ALLOC_REQ_MAX_BW_BW_VALUE_UNIT_MASK        0xe0000000UL
+	#define RING_ALLOC_REQ_MAX_BW_BW_VALUE_UNIT_SFT         29
+	#define RING_ALLOC_REQ_MAX_BW_BW_VALUE_UNIT_MEGA          (0x0UL << 29)
+	#define RING_ALLOC_REQ_MAX_BW_BW_VALUE_UNIT_KILO          (0x2UL << 29)
+	#define RING_ALLOC_REQ_MAX_BW_BW_VALUE_UNIT_BASE          (0x4UL << 29)
+	#define RING_ALLOC_REQ_MAX_BW_BW_VALUE_UNIT_GIGA          (0x6UL << 29)
+	#define RING_ALLOC_REQ_MAX_BW_BW_VALUE_UNIT_PERCENT1_100  (0x1UL << 29)
+	#define RING_ALLOC_REQ_MAX_BW_BW_VALUE_UNIT_INVALID       (0x7UL << 29)
+	#define RING_ALLOC_REQ_MAX_BW_BW_VALUE_UNIT_LAST         RING_ALLOC_REQ_MAX_BW_BW_VALUE_UNIT_INVALID
+	u8	int_mode;	/* enumerated value: LEGACY (0), RSVD (1), MSIX (2) or POLL (3) */
+	#define RING_ALLOC_REQ_INT_MODE_LEGACY 0x0UL
+	#define RING_ALLOC_REQ_INT_MODE_RSVD   0x1UL
+	#define RING_ALLOC_REQ_INT_MODE_MSIX   0x2UL
+	#define RING_ALLOC_REQ_INT_MODE_POLL   0x3UL
+	#define RING_ALLOC_REQ_INT_MODE_LAST  RING_ALLOC_REQ_INT_MODE_POLL
+	u8	mpc_chnls_type;	/* enumerated value 0..4, see RING_ALLOC_REQ_MPC_CHNLS_TYPE_* */
+	#define RING_ALLOC_REQ_MPC_CHNLS_TYPE_TCE     0x0UL
+	#define RING_ALLOC_REQ_MPC_CHNLS_TYPE_RCE     0x1UL
+	#define RING_ALLOC_REQ_MPC_CHNLS_TYPE_TE_CFA  0x2UL
+	#define RING_ALLOC_REQ_MPC_CHNLS_TYPE_RE_CFA  0x3UL
+	#define RING_ALLOC_REQ_MPC_CHNLS_TYPE_PRIMATE 0x4UL
+	#define RING_ALLOC_REQ_MPC_CHNLS_TYPE_LAST   RING_ALLOC_REQ_MPC_CHNLS_TYPE_PRIMATE
+	u8	rx_rate_profile_sel;	/* enumerated value: DEFAULT (0) or POLL_MODE (1) */
+	#define RING_ALLOC_REQ_RX_RATE_PROFILE_SEL_DEFAULT   0x0UL
+	#define RING_ALLOC_REQ_RX_RATE_PROFILE_SEL_POLL_MODE 0x1UL
+	#define RING_ALLOC_REQ_RX_RATE_PROFILE_SEL_LAST     RING_ALLOC_REQ_RX_RATE_PROFILE_SEL_POLL_MODE
+	u8	unused_4;
+	__le64	cq_handle;
+};
+
+/* hwrm_ring_alloc_output (size:128b/16B): 8-byte header, ring_id, logical_ring_id, push_buffer_index, valid last */
+struct hwrm_ring_alloc_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le16	ring_id;
+	__le16	logical_ring_id;
+	u8	push_buffer_index;	/* enumerated value: PING_BUFFER (0) or PONG_BUFFER (1) */
+	#define RING_ALLOC_RESP_PUSH_BUFFER_INDEX_PING_BUFFER 0x0UL
+	#define RING_ALLOC_RESP_PUSH_BUFFER_INDEX_PONG_BUFFER 0x1UL
+	#define RING_ALLOC_RESP_PUSH_BUFFER_INDEX_LAST       RING_ALLOC_RESP_PUSH_BUFFER_INDEX_PONG_BUFFER
+	u8	unused_0[2];
+	u8	valid;
+};
+
+/* hwrm_ring_free_input (size:256b/32B): 16-byte header (req_type..resp_addr), ring_type, flags, ring_id, prod_idx, opaque */
+struct hwrm_ring_free_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	u8	ring_type;	/* enumerated value 0..5; same value set as RING_ALLOC_REQ_RING_TYPE_* */
+	#define RING_FREE_REQ_RING_TYPE_L2_CMPL   0x0UL
+	#define RING_FREE_REQ_RING_TYPE_TX        0x1UL
+	#define RING_FREE_REQ_RING_TYPE_RX        0x2UL
+	#define RING_FREE_REQ_RING_TYPE_ROCE_CMPL 0x3UL
+	#define RING_FREE_REQ_RING_TYPE_RX_AGG    0x4UL
+	#define RING_FREE_REQ_RING_TYPE_NQ        0x5UL
+	#define RING_FREE_REQ_RING_TYPE_LAST     RING_FREE_REQ_RING_TYPE_NQ
+	u8	flags;	/* only VIRTIO_RING_VALID (0x1) is defined */
+	#define RING_FREE_REQ_FLAGS_VIRTIO_RING_VALID 0x1UL
+	#define RING_FREE_REQ_FLAGS_LAST             RING_FREE_REQ_FLAGS_VIRTIO_RING_VALID
+	__le16	ring_id;
+	__le32	prod_idx;
+	__le32	opaque;
+	__le32	unused_1;
+};
+
+/* hwrm_ring_free_output (size:128b/16B): 8-byte header (error_code..resp_len), 7 unused bytes, valid last */
+struct hwrm_ring_free_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_ring_reset_input (size:192b/24B): 16-byte header (req_type..resp_addr), ring_type, ring_id, 4 unused bytes */
+struct hwrm_ring_reset_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	u8	ring_type;	/* enumerated value; 0x4 and 0x5 have no macro here, RX_RING_GRP is 0x6 */
+	#define RING_RESET_REQ_RING_TYPE_L2_CMPL     0x0UL
+	#define RING_RESET_REQ_RING_TYPE_TX          0x1UL
+	#define RING_RESET_REQ_RING_TYPE_RX          0x2UL
+	#define RING_RESET_REQ_RING_TYPE_ROCE_CMPL   0x3UL
+	#define RING_RESET_REQ_RING_TYPE_RX_RING_GRP 0x6UL
+	#define RING_RESET_REQ_RING_TYPE_LAST       RING_RESET_REQ_RING_TYPE_RX_RING_GRP
+	u8	unused_0;
+	__le16	ring_id;
+	u8	unused_1[4];
+};
+
+/* hwrm_ring_reset_output (size:128b/16B): 8-byte header, push_buffer_index, 3 unused bytes, 3-byte consumer_idx, valid last */
+struct hwrm_ring_reset_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	push_buffer_index;	/* enumerated value: PING_BUFFER (0) or PONG_BUFFER (1) */
+	#define RING_RESET_RESP_PUSH_BUFFER_INDEX_PING_BUFFER 0x0UL
+	#define RING_RESET_RESP_PUSH_BUFFER_INDEX_PONG_BUFFER 0x1UL
+	#define RING_RESET_RESP_PUSH_BUFFER_INDEX_LAST       RING_RESET_RESP_PUSH_BUFFER_INDEX_PONG_BUFFER
+	u8	unused_0[3];
+	u8	consumer_idx[3];	/* three raw bytes; NOTE(review): byte order is not visible here - confirm before decoding */
+	u8	valid;
+};
+
+/* hwrm_ring_aggint_qcaps_input (size:128b/16B): only the 16-byte header (req_type..resp_addr), no payload fields */
+struct hwrm_ring_aggint_qcaps_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+};
+
+/* hwrm_ring_aggint_qcaps_output (size:384b/48B): 8-byte header, cmpl_params/nq_params masks, seven min/max pairs, timer_units, valid */
+struct hwrm_ring_aggint_qcaps_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le32	cmpl_params;	/* bit mask built from RING_AGGINT_QCAPS_RESP_CMPL_PARAMS_* (bits 0-8 defined) */
+	#define RING_AGGINT_QCAPS_RESP_CMPL_PARAMS_INT_LAT_TMR_MIN                  0x1UL
+	#define RING_AGGINT_QCAPS_RESP_CMPL_PARAMS_INT_LAT_TMR_MAX                  0x2UL
+	#define RING_AGGINT_QCAPS_RESP_CMPL_PARAMS_TIMER_RESET                      0x4UL
+	#define RING_AGGINT_QCAPS_RESP_CMPL_PARAMS_RING_IDLE                        0x8UL
+	#define RING_AGGINT_QCAPS_RESP_CMPL_PARAMS_NUM_CMPL_DMA_AGGR                0x10UL
+	#define RING_AGGINT_QCAPS_RESP_CMPL_PARAMS_NUM_CMPL_DMA_AGGR_DURING_INT     0x20UL
+	#define RING_AGGINT_QCAPS_RESP_CMPL_PARAMS_CMPL_AGGR_DMA_TMR                0x40UL
+	#define RING_AGGINT_QCAPS_RESP_CMPL_PARAMS_CMPL_AGGR_DMA_TMR_DURING_INT     0x80UL
+	#define RING_AGGINT_QCAPS_RESP_CMPL_PARAMS_NUM_CMPL_AGGR_INT                0x100UL
+	__le32	nq_params;	/* only INT_LAT_TMR_MIN (0x1) is defined */
+	#define RING_AGGINT_QCAPS_RESP_NQ_PARAMS_INT_LAT_TMR_MIN     0x1UL
+	__le16	num_cmpl_dma_aggr_min;
+	__le16	num_cmpl_dma_aggr_max;
+	__le16	num_cmpl_dma_aggr_during_int_min;
+	__le16	num_cmpl_dma_aggr_during_int_max;
+	__le16	cmpl_aggr_dma_tmr_min;
+	__le16	cmpl_aggr_dma_tmr_max;
+	__le16	cmpl_aggr_dma_tmr_during_int_min;
+	__le16	cmpl_aggr_dma_tmr_during_int_max;
+	__le16	int_lat_tmr_min_min;
+	__le16	int_lat_tmr_min_max;
+	__le16	int_lat_tmr_max_min;
+	__le16	int_lat_tmr_max_max;
+	__le16	num_cmpl_aggr_int_min;
+	__le16	num_cmpl_aggr_int_max;
+	__le16	timer_units;
+	u8	unused_0[1];
+	u8	valid;
+};
+
+/* hwrm_ring_cmpl_ring_qaggint_params_input (size:192b/24B): request header, ring_id, flags and padding */
+struct hwrm_ring_cmpl_ring_qaggint_params_input {
+	__le16	req_type;	/* request header: four 16-bit words plus the 64-bit resp_addr (16 bytes) */
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	ring_id;
+	__le16	flags;	/* bits 0-1 are covered by UNUSED_0_MASK; IS_NQ is bit 2 (0x4) */
+	#define RING_CMPL_RING_QAGGINT_PARAMS_REQ_FLAGS_UNUSED_0_MASK 0x3UL
+	#define RING_CMPL_RING_QAGGINT_PARAMS_REQ_FLAGS_UNUSED_0_SFT 0
+	#define RING_CMPL_RING_QAGGINT_PARAMS_REQ_FLAGS_IS_NQ        0x4UL
+	u8	unused_0[4];	/* pads the request to 24 bytes */
+};
+
+/* hwrm_ring_cmpl_ring_qaggint_params_output (size:256b/32B): response header, flags, seven 16-bit parameter values, padding, valid byte */
+struct hwrm_ring_cmpl_ring_qaggint_params_output {
+	__le16	error_code;	/* response header: four 16-bit words (8 bytes) */
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le16	flags;	/* TIMER_RESET is bit 0, RING_IDLE is bit 1 */
+	#define RING_CMPL_RING_QAGGINT_PARAMS_RESP_FLAGS_TIMER_RESET     0x1UL
+	#define RING_CMPL_RING_QAGGINT_PARAMS_RESP_FLAGS_RING_IDLE       0x2UL
+	__le16	num_cmpl_dma_aggr;	/* same seven field names, in the same order, as the cfg_aggint_params request */
+	__le16	num_cmpl_dma_aggr_during_int;
+	__le16	cmpl_aggr_dma_tmr;
+	__le16	cmpl_aggr_dma_tmr_during_int;
+	__le16	int_lat_tmr_min;
+	__le16	int_lat_tmr_max;
+	__le16	num_cmpl_aggr_int;
+	u8	unused_0[7];
+	u8	valid;	/* final byte of the response */
+};
+
+/* hwrm_ring_cmpl_ring_cfg_aggint_params_input (size:320b/40B): request header, ring_id, flags, seven 16-bit parameters and an enables bitmap */
+struct hwrm_ring_cmpl_ring_cfg_aggint_params_input {
+	__le16	req_type;	/* request header: four 16-bit words plus the 64-bit resp_addr (16 bytes) */
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	ring_id;
+	__le16	flags;	/* TIMER_RESET bit 0, RING_IDLE bit 1, IS_NQ bit 2 */
+	#define RING_CMPL_RING_CFG_AGGINT_PARAMS_REQ_FLAGS_TIMER_RESET     0x1UL
+	#define RING_CMPL_RING_CFG_AGGINT_PARAMS_REQ_FLAGS_RING_IDLE       0x2UL
+	#define RING_CMPL_RING_CFG_AGGINT_PARAMS_REQ_FLAGS_IS_NQ           0x4UL
+	__le16	num_cmpl_dma_aggr;
+	__le16	num_cmpl_dma_aggr_during_int;
+	__le16	cmpl_aggr_dma_tmr;
+	__le16	cmpl_aggr_dma_tmr_during_int;	/* note: no ENABLES_* bit below carries this field's name */
+	__le16	int_lat_tmr_min;
+	__le16	int_lat_tmr_max;
+	__le16	num_cmpl_aggr_int;
+	__le16	enables;	/* six single-bit flags named after parameter fields above; presumably each marks its field as supplied - TODO confirm */
+	#define RING_CMPL_RING_CFG_AGGINT_PARAMS_REQ_ENABLES_NUM_CMPL_DMA_AGGR                0x1UL
+	#define RING_CMPL_RING_CFG_AGGINT_PARAMS_REQ_ENABLES_NUM_CMPL_DMA_AGGR_DURING_INT     0x2UL
+	#define RING_CMPL_RING_CFG_AGGINT_PARAMS_REQ_ENABLES_CMPL_AGGR_DMA_TMR                0x4UL
+	#define RING_CMPL_RING_CFG_AGGINT_PARAMS_REQ_ENABLES_INT_LAT_TMR_MIN                  0x8UL
+	#define RING_CMPL_RING_CFG_AGGINT_PARAMS_REQ_ENABLES_INT_LAT_TMR_MAX                  0x10UL
+	#define RING_CMPL_RING_CFG_AGGINT_PARAMS_REQ_ENABLES_NUM_CMPL_AGGR_INT                0x20UL
+	u8	unused_0[4];	/* pads the request to 40 bytes */
+};
+
+/* hwrm_ring_cmpl_ring_cfg_aggint_params_output (size:128b/16B): response header, padding and valid byte; no payload fields */
+struct hwrm_ring_cmpl_ring_cfg_aggint_params_output {
+	__le16	error_code;	/* response header: four 16-bit words (8 bytes) */
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;	/* final byte of the response */
+};
+
+/* hwrm_ring_grp_alloc_input (size:192b/24B): request header followed by four 16-bit ids (cr, rr, ar, sc) */
+struct hwrm_ring_grp_alloc_input {
+	__le16	req_type;	/* request header: four 16-bit words plus the 64-bit resp_addr (16 bytes) */
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	cr;	/* NOTE(review): cr/rr/ar/sc are not expanded in this header; presumably completion ring, rx ring, aggregation ring, stat context - confirm */
+	__le16	rr;
+	__le16	ar;
+	__le16	sc;
+};
+
+/* hwrm_ring_grp_alloc_output (size:128b/16B): response header, 32-bit ring_group_id, padding and valid byte */
+struct hwrm_ring_grp_alloc_output {
+	__le16	error_code;	/* response header: four 16-bit words (8 bytes) */
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le32	ring_group_id;	/* same name and width as the field taken by hwrm_ring_grp_free_input */
+	u8	unused_0[3];
+	u8	valid;	/* final byte of the response */
+};
+
+/* hwrm_ring_grp_free_input (size:192b/24B): request header, 32-bit ring_group_id and padding */
+struct hwrm_ring_grp_free_input {
+	__le16	req_type;	/* request header: four 16-bit words plus the 64-bit resp_addr (16 bytes) */
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	ring_group_id;
+	u8	unused_0[4];	/* pads the request to 24 bytes */
+};
+
+/* hwrm_ring_grp_free_output (size:128b/16B): response header, padding and valid byte; no payload fields */
+struct hwrm_ring_grp_free_output {
+	__le16	error_code;	/* response header: four 16-bit words (8 bytes) */
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;	/* final byte of the response */
+};
+
+#define DEFAULT_FLOW_ID 0xFFFFFFFFUL	/* four named flow ids occupy the top of the 32-bit range, counting down from 0xFFFFFFFF */
+#define ROCEV1_FLOW_ID 0xFFFFFFFEUL
+#define ROCEV2_FLOW_ID 0xFFFFFFFDUL
+#define ROCEV2_CNP_FLOW_ID 0xFFFFFFFCUL	/* lowest of the four; NOTE(review): where these ids are consumed is not visible here */
+
+/* hwrm_cfa_l2_filter_alloc_input (size:768b/96B): request header, flags/enables words, L2 and tunnel-L2 (t_*) match fields, source/destination selectors and a placement hint */
+struct hwrm_cfa_l2_filter_alloc_input {
+	__le16	req_type;	/* request header: four 16-bit words plus the 64-bit resp_addr (16 bytes) */
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	flags;	/* bit 0 = PATH (TX 0 / RX 1), bits 1-3 and 6-7 single flags, bits 4-5 = TRAFFIC sub-field */
+	#define CFA_L2_FILTER_ALLOC_REQ_FLAGS_PATH              0x1UL
+	#define CFA_L2_FILTER_ALLOC_REQ_FLAGS_PATH_TX             0x0UL
+	#define CFA_L2_FILTER_ALLOC_REQ_FLAGS_PATH_RX             0x1UL
+	#define CFA_L2_FILTER_ALLOC_REQ_FLAGS_PATH_LAST          CFA_L2_FILTER_ALLOC_REQ_FLAGS_PATH_RX
+	#define CFA_L2_FILTER_ALLOC_REQ_FLAGS_LOOPBACK          0x2UL
+	#define CFA_L2_FILTER_ALLOC_REQ_FLAGS_DROP              0x4UL
+	#define CFA_L2_FILTER_ALLOC_REQ_FLAGS_OUTERMOST         0x8UL
+	#define CFA_L2_FILTER_ALLOC_REQ_FLAGS_TRAFFIC_MASK      0x30UL
+	#define CFA_L2_FILTER_ALLOC_REQ_FLAGS_TRAFFIC_SFT       4
+	#define CFA_L2_FILTER_ALLOC_REQ_FLAGS_TRAFFIC_NO_ROCE_L2  (0x0UL << 4)
+	#define CFA_L2_FILTER_ALLOC_REQ_FLAGS_TRAFFIC_L2          (0x1UL << 4)
+	#define CFA_L2_FILTER_ALLOC_REQ_FLAGS_TRAFFIC_ROCE        (0x2UL << 4)
+	#define CFA_L2_FILTER_ALLOC_REQ_FLAGS_TRAFFIC_LAST       CFA_L2_FILTER_ALLOC_REQ_FLAGS_TRAFFIC_ROCE
+	#define CFA_L2_FILTER_ALLOC_REQ_FLAGS_XDP_DISABLE       0x40UL
+	#define CFA_L2_FILTER_ALLOC_REQ_FLAGS_SOURCE_VALID      0x80UL
+	__le32	enables;	/* bits 0-18, each named after a field below; presumably marks that field as supplied - TODO confirm */
+	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_L2_ADDR             0x1UL
+	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_L2_ADDR_MASK        0x2UL
+	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_L2_OVLAN            0x4UL
+	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_L2_OVLAN_MASK       0x8UL
+	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_L2_IVLAN            0x10UL
+	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_L2_IVLAN_MASK       0x20UL
+	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_T_L2_ADDR           0x40UL
+	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_T_L2_ADDR_MASK      0x80UL
+	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_T_L2_OVLAN          0x100UL
+	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_T_L2_OVLAN_MASK     0x200UL
+	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_T_L2_IVLAN          0x400UL
+	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_T_L2_IVLAN_MASK     0x800UL
+	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_SRC_TYPE            0x1000UL
+	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_SRC_ID              0x2000UL
+	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_TUNNEL_TYPE         0x4000UL
+	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_DST_ID              0x8000UL
+	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_MIRROR_VNIC_ID      0x10000UL
+	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_NUM_VLANS           0x20000UL
+	#define CFA_L2_FILTER_ALLOC_REQ_ENABLES_T_NUM_VLANS         0x40000UL
+	u8	l2_addr[6];	/* 6-byte address; the two count bytes that follow complete an 8-byte group */
+	u8	num_vlans;
+	u8	t_num_vlans;
+	u8	l2_addr_mask[6];
+	__le16	l2_ovlan;
+	__le16	l2_ovlan_mask;
+	__le16	l2_ivlan;
+	__le16	l2_ivlan_mask;
+	u8	unused_1[2];
+	u8	t_l2_addr[6];	/* t_* fields repeat the address/VLAN layout above with a t_ prefix */
+	u8	unused_2[2];
+	u8	t_l2_addr_mask[6];
+	__le16	t_l2_ovlan;
+	__le16	t_l2_ovlan_mask;
+	__le16	t_l2_ivlan;
+	__le16	t_l2_ivlan_mask;
+	u8	src_type;	/* eight consecutive values 0x0-0x7 are defined */
+	#define CFA_L2_FILTER_ALLOC_REQ_SRC_TYPE_NPORT 0x0UL
+	#define CFA_L2_FILTER_ALLOC_REQ_SRC_TYPE_PF    0x1UL
+	#define CFA_L2_FILTER_ALLOC_REQ_SRC_TYPE_VF    0x2UL
+	#define CFA_L2_FILTER_ALLOC_REQ_SRC_TYPE_VNIC  0x3UL
+	#define CFA_L2_FILTER_ALLOC_REQ_SRC_TYPE_KONG  0x4UL
+	#define CFA_L2_FILTER_ALLOC_REQ_SRC_TYPE_APE   0x5UL
+	#define CFA_L2_FILTER_ALLOC_REQ_SRC_TYPE_BONO  0x6UL
+	#define CFA_L2_FILTER_ALLOC_REQ_SRC_TYPE_TANG  0x7UL
+	#define CFA_L2_FILTER_ALLOC_REQ_SRC_TYPE_LAST CFA_L2_FILTER_ALLOC_REQ_SRC_TYPE_TANG
+	u8	unused_3;
+	__le32	src_id;
+	u8	tunnel_type;	/* values 0x0-0xc are consecutive, then 0x10 and 0xff (ANYTUNNEL) */
+	#define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_NONTUNNEL    0x0UL
+	#define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN        0x1UL
+	#define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_NVGRE        0x2UL
+	#define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_L2GRE        0x3UL
+	#define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_IPIP         0x4UL
+	#define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_GENEVE       0x5UL
+	#define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_MPLS         0x6UL
+	#define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_STT          0x7UL
+	#define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_IPGRE        0x8UL
+	#define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN_V4     0x9UL
+	#define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_IPGRE_V1     0xaUL
+	#define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_L2_ETYPE     0xbUL
+	#define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN_GPE_V6 0xcUL
+	#define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN_GPE    0x10UL
+	#define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_ANYTUNNEL    0xffUL
+	#define CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_LAST        CFA_L2_FILTER_ALLOC_REQ_TUNNEL_TYPE_ANYTUNNEL
+	u8	unused_4;
+	__le16	dst_id;
+	__le16	mirror_vnic_id;
+	u8	pri_hint;	/* five values 0x0-0x4; presumably positions the new filter relative to l2_filter_id_hint - TODO confirm */
+	#define CFA_L2_FILTER_ALLOC_REQ_PRI_HINT_NO_PREFER    0x0UL
+	#define CFA_L2_FILTER_ALLOC_REQ_PRI_HINT_ABOVE_FILTER 0x1UL
+	#define CFA_L2_FILTER_ALLOC_REQ_PRI_HINT_BELOW_FILTER 0x2UL
+	#define CFA_L2_FILTER_ALLOC_REQ_PRI_HINT_MAX          0x3UL
+	#define CFA_L2_FILTER_ALLOC_REQ_PRI_HINT_MIN          0x4UL
+	#define CFA_L2_FILTER_ALLOC_REQ_PRI_HINT_LAST        CFA_L2_FILTER_ALLOC_REQ_PRI_HINT_MIN
+	u8	unused_5;
+	__le32	unused_6;	/* keeps the 64-bit hint below on an 8-byte boundary */
+	__le64	l2_filter_id_hint;
+};
+
+/* hwrm_cfa_l2_filter_alloc_output (size:192b/24B): response header, 64-bit l2_filter_id, packed 32-bit flow_id, padding and valid byte */
+struct hwrm_cfa_l2_filter_alloc_output {
+	__le16	error_code;	/* response header: four 16-bit words (8 bytes) */
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le64	l2_filter_id;
+	__le32	flow_id;	/* packed word: bits 0-29 VALUE, bit 30 TYPE (INT 0 / EXT 1), bit 31 DIR (RX 0 / TX 1) */
+	#define CFA_L2_FILTER_ALLOC_RESP_FLOW_ID_VALUE_MASK 0x3fffffffUL
+	#define CFA_L2_FILTER_ALLOC_RESP_FLOW_ID_VALUE_SFT 0
+	#define CFA_L2_FILTER_ALLOC_RESP_FLOW_ID_TYPE      0x40000000UL
+	#define CFA_L2_FILTER_ALLOC_RESP_FLOW_ID_TYPE_INT    (0x0UL << 30)
+	#define CFA_L2_FILTER_ALLOC_RESP_FLOW_ID_TYPE_EXT    (0x1UL << 30)
+	#define CFA_L2_FILTER_ALLOC_RESP_FLOW_ID_TYPE_LAST  CFA_L2_FILTER_ALLOC_RESP_FLOW_ID_TYPE_EXT
+	#define CFA_L2_FILTER_ALLOC_RESP_FLOW_ID_DIR       0x80000000UL
+	#define CFA_L2_FILTER_ALLOC_RESP_FLOW_ID_DIR_RX      (0x0UL << 31)
+	#define CFA_L2_FILTER_ALLOC_RESP_FLOW_ID_DIR_TX      (0x1UL << 31)
+	#define CFA_L2_FILTER_ALLOC_RESP_FLOW_ID_DIR_LAST   CFA_L2_FILTER_ALLOC_RESP_FLOW_ID_DIR_TX
+	u8	unused_0[3];
+	u8	valid;	/* final byte of the response */
+};
+
+/* hwrm_cfa_l2_filter_free_input (size:192b/24B): request header followed by the 64-bit l2_filter_id */
+struct hwrm_cfa_l2_filter_free_input {
+	__le16	req_type;	/* request header: four 16-bit words plus the 64-bit resp_addr (16 bytes) */
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le64	l2_filter_id;	/* same name and width as the id in hwrm_cfa_l2_filter_alloc_output */
+};
+
+/* hwrm_cfa_l2_filter_free_output (size:128b/16B): response header, padding and valid byte; no payload fields */
+struct hwrm_cfa_l2_filter_free_output {
+	__le16	error_code;	/* response header: four 16-bit words (8 bytes) */
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;	/* final byte of the response */
+};
+
+/* hwrm_cfa_l2_filter_cfg_input (size:384b/48B): request header, flags/enables words, l2_filter_id and four 32-bit values matching the ENABLES_* names */
+struct hwrm_cfa_l2_filter_cfg_input {
+	__le16	req_type;	/* request header: four 16-bit words plus the 64-bit resp_addr (16 bytes) */
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	flags;	/* bit 0 PATH, bit 1 DROP, bits 2-3 TRAFFIC, bits 4-5 REMAP_OP; bit positions differ from the filter_alloc request flags */
+	#define CFA_L2_FILTER_CFG_REQ_FLAGS_PATH                  0x1UL
+	#define CFA_L2_FILTER_CFG_REQ_FLAGS_PATH_TX                 0x0UL
+	#define CFA_L2_FILTER_CFG_REQ_FLAGS_PATH_RX                 0x1UL
+	#define CFA_L2_FILTER_CFG_REQ_FLAGS_PATH_LAST              CFA_L2_FILTER_CFG_REQ_FLAGS_PATH_RX
+	#define CFA_L2_FILTER_CFG_REQ_FLAGS_DROP                  0x2UL
+	#define CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_MASK          0xcUL
+	#define CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_SFT           2
+	#define CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_NO_ROCE_L2      (0x0UL << 2)
+	#define CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_L2              (0x1UL << 2)
+	#define CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_ROCE            (0x2UL << 2)
+	#define CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_LAST           CFA_L2_FILTER_CFG_REQ_FLAGS_TRAFFIC_ROCE
+	#define CFA_L2_FILTER_CFG_REQ_FLAGS_REMAP_OP_MASK         0x30UL
+	#define CFA_L2_FILTER_CFG_REQ_FLAGS_REMAP_OP_SFT          4
+	#define CFA_L2_FILTER_CFG_REQ_FLAGS_REMAP_OP_NO_UPDATE      (0x0UL << 4)
+	#define CFA_L2_FILTER_CFG_REQ_FLAGS_REMAP_OP_BYPASS_LKUP    (0x1UL << 4)
+	#define CFA_L2_FILTER_CFG_REQ_FLAGS_REMAP_OP_ENABLE_LKUP    (0x2UL << 4)
+	#define CFA_L2_FILTER_CFG_REQ_FLAGS_REMAP_OP_RESTORE_FW_OP  (0x3UL << 4)
+	#define CFA_L2_FILTER_CFG_REQ_FLAGS_REMAP_OP_LAST          CFA_L2_FILTER_CFG_REQ_FLAGS_REMAP_OP_RESTORE_FW_OP
+	__le32	enables;	/* four bits named after the four 32-bit fields at the end of the struct; presumably marks each as supplied - TODO confirm */
+	#define CFA_L2_FILTER_CFG_REQ_ENABLES_DST_ID                 0x1UL
+	#define CFA_L2_FILTER_CFG_REQ_ENABLES_NEW_MIRROR_VNIC_ID     0x2UL
+	#define CFA_L2_FILTER_CFG_REQ_ENABLES_PROF_FUNC              0x4UL
+	#define CFA_L2_FILTER_CFG_REQ_ENABLES_L2_CONTEXT_ID          0x8UL
+	__le64	l2_filter_id;
+	__le32	dst_id;	/* 32 bits wide here, whereas dst_id in the filter_alloc request is 16 bits */
+	__le32	new_mirror_vnic_id;
+	__le32	prof_func;
+	__le32	l2_context_id;
+};
+
+/* hwrm_cfa_l2_filter_cfg_output (size:128b/16B): response header, padding and valid byte; no payload fields */
+struct hwrm_cfa_l2_filter_cfg_output {
+	__le16	error_code;	/* response header: four 16-bit words (8 bytes) */
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;	/* final byte of the response */
+};
+
+/* hwrm_cfa_l2_set_rx_mask_input (size:448b/56B): request header, vnic_id, rx mask bits, and two (64-bit address, 32-bit count) table descriptors */
+struct hwrm_cfa_l2_set_rx_mask_input {
+	__le16	req_type;	/* request header: four 16-bit words plus the 64-bit resp_addr (16 bytes) */
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	vnic_id;
+	__le32	mask;	/* single-bit flags 0x2..0x100; bit 0 (0x1) has no name defined here */
+	#define CFA_L2_SET_RX_MASK_REQ_MASK_MCAST               0x2UL
+	#define CFA_L2_SET_RX_MASK_REQ_MASK_ALL_MCAST           0x4UL
+	#define CFA_L2_SET_RX_MASK_REQ_MASK_BCAST               0x8UL
+	#define CFA_L2_SET_RX_MASK_REQ_MASK_PROMISCUOUS         0x10UL
+	#define CFA_L2_SET_RX_MASK_REQ_MASK_OUTERMOST           0x20UL
+	#define CFA_L2_SET_RX_MASK_REQ_MASK_VLANONLY            0x40UL
+	#define CFA_L2_SET_RX_MASK_REQ_MASK_VLAN_NONVLAN        0x80UL
+	#define CFA_L2_SET_RX_MASK_REQ_MASK_ANYVLAN_NONVLAN     0x100UL
+	__le64	mc_tbl_addr;	/* NOTE(review): presumably a DMA address of a table holding num_mc_entries entries; entry format is not shown here - confirm */
+	__le32	num_mc_entries;
+	u8	unused_0[4];	/* keeps the next 64-bit address on an 8-byte boundary */
+	__le64	vlan_tag_tbl_addr;	/* paired with num_vlan_tags in the same way as the multicast table above */
+	__le32	num_vlan_tags;
+	u8	unused_1[4];	/* pads the request to 56 bytes */
+};
+
+/* hwrm_cfa_l2_set_rx_mask_output (size:128b/16B): response header, padding and valid byte; no payload fields */
+struct hwrm_cfa_l2_set_rx_mask_output {
+	__le16	error_code;	/* response header: four 16-bit words (8 bytes) */
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;	/* final byte of the response */
+};
+
+/* hwrm_cfa_l2_set_rx_mask_cmd_err (size:64b/8B): one-byte command-specific error code plus padding; carries no request/response header */
+struct hwrm_cfa_l2_set_rx_mask_cmd_err {
+	u8	code;	/* two values defined: UNKNOWN (0x0) and NTUPLE_FILTER_CONFLICT_ERR (0x1) */
+	#define CFA_L2_SET_RX_MASK_CMD_ERR_CODE_UNKNOWN                    0x0UL
+	#define CFA_L2_SET_RX_MASK_CMD_ERR_CODE_NTUPLE_FILTER_CONFLICT_ERR 0x1UL
+	#define CFA_L2_SET_RX_MASK_CMD_ERR_CODE_LAST                      CFA_L2_SET_RX_MASK_CMD_ERR_CODE_NTUPLE_FILTER_CONFLICT_ERR
+	u8	unused_0[7];	/* pads the record to 8 bytes */
+};
+
+/* hwrm_cfa_tunnel_filter_alloc_input (size:704b/88B): request header, flags/enables words, L2/L3 and tunnel-L3 match fields, tunnel type/flags, vni and vnic ids */
+struct hwrm_cfa_tunnel_filter_alloc_input {
+	__le16	req_type;	/* request header: four 16-bit words plus the 64-bit resp_addr (16 bytes) */
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	flags;	/* only bit 0 (LOOPBACK) is named */
+	#define CFA_TUNNEL_FILTER_ALLOC_REQ_FLAGS_LOOPBACK     0x1UL
+	__le32	enables;	/* bits 0-10, each named after a field below; presumably marks that field as supplied - TODO confirm */
+	#define CFA_TUNNEL_FILTER_ALLOC_REQ_ENABLES_L2_FILTER_ID       0x1UL
+	#define CFA_TUNNEL_FILTER_ALLOC_REQ_ENABLES_L2_ADDR            0x2UL
+	#define CFA_TUNNEL_FILTER_ALLOC_REQ_ENABLES_L2_IVLAN           0x4UL
+	#define CFA_TUNNEL_FILTER_ALLOC_REQ_ENABLES_L3_ADDR            0x8UL
+	#define CFA_TUNNEL_FILTER_ALLOC_REQ_ENABLES_L3_ADDR_TYPE       0x10UL
+	#define CFA_TUNNEL_FILTER_ALLOC_REQ_ENABLES_T_L3_ADDR_TYPE     0x20UL
+	#define CFA_TUNNEL_FILTER_ALLOC_REQ_ENABLES_T_L3_ADDR          0x40UL
+	#define CFA_TUNNEL_FILTER_ALLOC_REQ_ENABLES_TUNNEL_TYPE        0x80UL
+	#define CFA_TUNNEL_FILTER_ALLOC_REQ_ENABLES_VNI                0x100UL
+	#define CFA_TUNNEL_FILTER_ALLOC_REQ_ENABLES_DST_VNIC_ID        0x200UL
+	#define CFA_TUNNEL_FILTER_ALLOC_REQ_ENABLES_MIRROR_VNIC_ID     0x400UL
+	__le64	l2_filter_id;
+	u8	l2_addr[6];
+	__le16	l2_ivlan;
+	__le32	l3_addr[4];	/* 16 bytes; declared little-endian, unlike the __be32 address arrays in the ntuple filter request */
+	__le32	t_l3_addr[4];
+	u8	l3_addr_type;	/* no value constants are defined for the two *_addr_type fields in this struct */
+	u8	t_l3_addr_type;
+	u8	tunnel_type;	/* values 0x0-0xc are consecutive, then 0x10 and 0xff (ANYTUNNEL) */
+	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_NONTUNNEL    0x0UL
+	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN        0x1UL
+	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_NVGRE        0x2UL
+	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_L2GRE        0x3UL
+	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_IPIP         0x4UL
+	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_GENEVE       0x5UL
+	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_MPLS         0x6UL
+	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_STT          0x7UL
+	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_IPGRE        0x8UL
+	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN_V4     0x9UL
+	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_IPGRE_V1     0xaUL
+	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_L2_ETYPE     0xbUL
+	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN_GPE_V6 0xcUL
+	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN_GPE    0x10UL
+	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_ANYTUNNEL    0xffUL
+	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_LAST        CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_TYPE_ANYTUNNEL
+	u8	tunnel_flags;	/* three single-bit flags in bits 0-2 */
+	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_FLAGS_TUN_FLAGS_OAM_CHECKSUM_EXPLHDR     0x1UL
+	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_FLAGS_TUN_FLAGS_CRITICAL_OPT_S1          0x2UL
+	#define CFA_TUNNEL_FILTER_ALLOC_REQ_TUNNEL_FLAGS_TUN_FLAGS_EXTHDR_SEQNUM_S0         0x4UL
+	__le32	vni;
+	__le32	dst_vnic_id;
+	__le32	mirror_vnic_id;
+};
+
+/* hwrm_cfa_tunnel_filter_alloc_output (size:192b/24B): response header, 64-bit tunnel_filter_id, packed 32-bit flow_id, padding and valid byte */
+struct hwrm_cfa_tunnel_filter_alloc_output {
+	__le16	error_code;	/* response header: four 16-bit words (8 bytes) */
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le64	tunnel_filter_id;
+	__le32	flow_id;	/* packed word: bits 0-29 VALUE, bit 30 TYPE (INT 0 / EXT 1), bit 31 DIR (RX 0 / TX 1) */
+	#define CFA_TUNNEL_FILTER_ALLOC_RESP_FLOW_ID_VALUE_MASK 0x3fffffffUL
+	#define CFA_TUNNEL_FILTER_ALLOC_RESP_FLOW_ID_VALUE_SFT 0
+	#define CFA_TUNNEL_FILTER_ALLOC_RESP_FLOW_ID_TYPE      0x40000000UL
+	#define CFA_TUNNEL_FILTER_ALLOC_RESP_FLOW_ID_TYPE_INT    (0x0UL << 30)
+	#define CFA_TUNNEL_FILTER_ALLOC_RESP_FLOW_ID_TYPE_EXT    (0x1UL << 30)
+	#define CFA_TUNNEL_FILTER_ALLOC_RESP_FLOW_ID_TYPE_LAST  CFA_TUNNEL_FILTER_ALLOC_RESP_FLOW_ID_TYPE_EXT
+	#define CFA_TUNNEL_FILTER_ALLOC_RESP_FLOW_ID_DIR       0x80000000UL
+	#define CFA_TUNNEL_FILTER_ALLOC_RESP_FLOW_ID_DIR_RX      (0x0UL << 31)
+	#define CFA_TUNNEL_FILTER_ALLOC_RESP_FLOW_ID_DIR_TX      (0x1UL << 31)
+	#define CFA_TUNNEL_FILTER_ALLOC_RESP_FLOW_ID_DIR_LAST   CFA_TUNNEL_FILTER_ALLOC_RESP_FLOW_ID_DIR_TX
+	u8	unused_0[3];
+	u8	valid;	/* final byte of the response */
+};
+
+/* hwrm_cfa_tunnel_filter_free_input (size:192b/24B): request header followed by the 64-bit tunnel_filter_id */
+struct hwrm_cfa_tunnel_filter_free_input {
+	__le16	req_type;	/* request header: four 16-bit words plus the 64-bit resp_addr (16 bytes) */
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le64	tunnel_filter_id;	/* same name and width as the id in hwrm_cfa_tunnel_filter_alloc_output */
+};
+
+/* hwrm_cfa_tunnel_filter_free_output (size:128b/16B): response header, padding and valid byte; no payload fields */
+struct hwrm_cfa_tunnel_filter_free_output {
+	__le16	error_code;	/* response header: four 16-bit words (8 bytes) */
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;	/* final byte of the response */
+};
+
+/* hwrm_vxlan_ipv4_hdr (size:128b/16B): big-endian IPv4 header fields; 16 bytes, with no total-length or header-checksum field (an RFC 791 header is 20 bytes) */
+struct hwrm_vxlan_ipv4_hdr {
+	u8	ver_hlen;	/* low nibble = header length, high nibble = version, per the masks below */
+	#define VXLAN_IPV4_HDR_VER_HLEN_HEADER_LENGTH_MASK 0xfUL
+	#define VXLAN_IPV4_HDR_VER_HLEN_HEADER_LENGTH_SFT 0
+	#define VXLAN_IPV4_HDR_VER_HLEN_VERSION_MASK      0xf0UL
+	#define VXLAN_IPV4_HDR_VER_HLEN_VERSION_SFT       4
+	u8	tos;
+	__be16	ip_id;	/* multi-byte fields here are __be*, unlike the __le* request/response fields elsewhere in this header */
+	__be16	flags_frag_offset;
+	u8	ttl;
+	u8	protocol;
+	__be32	src_ip_addr;
+	__be32	dest_ip_addr;
+};
+
+/* hwrm_vxlan_ipv6_hdr (size:320b/40B): big-endian IPv6 header fields; same 40-byte size and field order as the RFC 8200 fixed header */
+struct hwrm_vxlan_ipv6_hdr {
+	__be32	ver_tc_flow_label;	/* bits 28-31 version, bits 20-27 traffic class, bits 0-19 flow label; the *_SFT values are written in hex (0x1c = 28, 0x14 = 20) */
+	#define VXLAN_IPV6_HDR_VER_TC_FLOW_LABEL_VER_SFT         0x1cUL
+	#define VXLAN_IPV6_HDR_VER_TC_FLOW_LABEL_VER_MASK        0xf0000000UL
+	#define VXLAN_IPV6_HDR_VER_TC_FLOW_LABEL_TC_SFT          0x14UL
+	#define VXLAN_IPV6_HDR_VER_TC_FLOW_LABEL_TC_MASK         0xff00000UL
+	#define VXLAN_IPV6_HDR_VER_TC_FLOW_LABEL_FLOW_LABEL_SFT  0x0UL
+	#define VXLAN_IPV6_HDR_VER_TC_FLOW_LABEL_FLOW_LABEL_MASK 0xfffffUL
+	#define VXLAN_IPV6_HDR_VER_TC_FLOW_LABEL_LAST           VXLAN_IPV6_HDR_VER_TC_FLOW_LABEL_FLOW_LABEL_MASK
+	__be16	payload_len;
+	u8	next_hdr;
+	u8	ttl;	/* sits where RFC 8200 places the hop limit */
+	__be32	src_ip_addr[4];	/* 16-byte address as four big-endian words */
+	__be32	dest_ip_addr[4];
+};
+
+/* hwrm_cfa_encap_data_vxlan (size:640b/80B): MAC addresses, VLAN tags, a 40-byte L3 header area, UDP ports, VNI and header flag bytes */
+struct hwrm_cfa_encap_data_vxlan {
+	u8	src_mac_addr[6];
+	__le16	unused_0;
+	u8	dst_mac_addr[6];
+	u8	num_vlan_tags;
+	u8	unused_1;
+	__be16	ovlan_tpid;	/* VLAN tag fields are big-endian */
+	__be16	ovlan_tci;
+	__be16	ivlan_tpid;
+	__be16	ivlan_tci;
+	__le32	l3[10];	/* 40 bytes, the size of struct hwrm_vxlan_ipv6_hdr; presumably holds an ipv4 or ipv6 header selected by the L3_VER value - TODO confirm */
+	#define CFA_ENCAP_DATA_VXLAN_L3_VER_MASK 0xfUL
+	#define CFA_ENCAP_DATA_VXLAN_L3_VER_IPV4 0x4UL
+	#define CFA_ENCAP_DATA_VXLAN_L3_VER_IPV6 0x6UL
+	#define CFA_ENCAP_DATA_VXLAN_L3_LAST    CFA_ENCAP_DATA_VXLAN_L3_VER_IPV6
+	__be16	src_port;
+	__be16	dst_port;
+	__be32	vni;
+	u8	hdr_rsvd0[3];
+	u8	hdr_rsvd1;
+	u8	hdr_flags;
+	u8	unused[3];	/* pads the record to 80 bytes */
+};
+
+/* hwrm_cfa_encap_record_alloc_input (size:832b/104B): request header, flags, encap_type selector and an 80-byte encap_data area */
+struct hwrm_cfa_encap_record_alloc_input {
+	__le16	req_type;	/* request header: four 16-bit words plus the 64-bit resp_addr (16 bytes) */
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	flags;	/* LOOPBACK is bit 0, EXTERNAL is bit 1 */
+	#define CFA_ENCAP_RECORD_ALLOC_REQ_FLAGS_LOOPBACK     0x1UL
+	#define CFA_ENCAP_RECORD_ALLOC_REQ_FLAGS_EXTERNAL     0x2UL
+	u8	encap_type;	/* values start at 0x1 (no 0x0); 0x7 is VLAN here; the list ends at 0x10 with no 0xff entry */
+	#define CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_VXLAN        0x1UL
+	#define CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_NVGRE        0x2UL
+	#define CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_L2GRE        0x3UL
+	#define CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_IPIP         0x4UL
+	#define CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_GENEVE       0x5UL
+	#define CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_MPLS         0x6UL
+	#define CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_VLAN         0x7UL
+	#define CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_IPGRE        0x8UL
+	#define CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_VXLAN_V4     0x9UL
+	#define CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_IPGRE_V1     0xaUL
+	#define CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_L2_ETYPE     0xbUL
+	#define CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_VXLAN_GPE_V6 0xcUL
+	#define CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_VXLAN_GPE    0x10UL
+	#define CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_LAST        CFA_ENCAP_RECORD_ALLOC_REQ_ENCAP_TYPE_VXLAN_GPE
+	u8	unused_0[3];
+	__le32	encap_data[20];	/* 80 bytes, the size of struct hwrm_cfa_encap_data_vxlan; presumably carries the per-type encap record - TODO confirm */
+};
+
+/* hwrm_cfa_encap_record_alloc_output (size:128b/16B): response header, 32-bit encap_record_id, padding and valid byte */
+struct hwrm_cfa_encap_record_alloc_output {
+	__le16	error_code;	/* response header: four 16-bit words (8 bytes) */
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le32	encap_record_id;	/* same name and width as the field taken by hwrm_cfa_encap_record_free_input */
+	u8	unused_0[3];
+	u8	valid;	/* final byte of the response */
+};
+
+/* hwrm_cfa_encap_record_free_input (size:192b/24B): request header, 32-bit encap_record_id and padding */
+struct hwrm_cfa_encap_record_free_input {
+	__le16	req_type;	/* request header: four 16-bit words plus the 64-bit resp_addr (16 bytes) */
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	encap_record_id;
+	u8	unused_0[4];	/* pads the request to 24 bytes */
+};
+
+/* hwrm_cfa_encap_record_free_output (size:128b/16B): response header, padding and valid byte; no payload fields */
+struct hwrm_cfa_encap_record_free_output {
+	__le16	error_code;	/* response header: four 16-bit words (8 bytes) */
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;	/* final byte of the response */
+};
+
+/* hwrm_cfa_ntuple_filter_alloc_input (size:1024b/128B): request header, flags/enables words, L2 reference, then MAC/ethertype, IP address/mask and port/mask match fields, plus a placement hint */
+struct hwrm_cfa_ntuple_filter_alloc_input {
+	__le16	req_type;	/* request header: four 16-bit words plus the 64-bit resp_addr (16 bytes) */
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	flags;	/* seven single-bit flags in bits 0-6 */
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_FLAGS_LOOPBACK              0x1UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_FLAGS_DROP                  0x2UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_FLAGS_METER                 0x4UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_FLAGS_DEST_FID              0x8UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_FLAGS_ARP_REPLY             0x10UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_FLAGS_DEST_RFS_RING_IDX     0x20UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_FLAGS_NO_L2_CONTEXT         0x40UL
+	__le32	enables;	/* bits 0-19; several names (MIRROR_VNIC_ID, DST_MACADDR, NTUPLE_FILTER_ID) have no same-named field in this struct - NOTE(review): confirm mapping against the HWRM spec */
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_L2_FILTER_ID         0x1UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_ETHERTYPE            0x2UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_TUNNEL_TYPE          0x4UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_SRC_MACADDR          0x8UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_IPADDR_TYPE          0x10UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_SRC_IPADDR           0x20UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_SRC_IPADDR_MASK      0x40UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_DST_IPADDR           0x80UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_DST_IPADDR_MASK      0x100UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_IP_PROTOCOL          0x200UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_SRC_PORT             0x400UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_SRC_PORT_MASK        0x800UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_DST_PORT             0x1000UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_DST_PORT_MASK        0x2000UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_PRI_HINT             0x4000UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_NTUPLE_FILTER_ID     0x8000UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_DST_ID               0x10000UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_MIRROR_VNIC_ID       0x20000UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_DST_MACADDR          0x40000UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_ENABLES_RFS_RING_TBL_IDX     0x80000UL
+	__le64	l2_filter_id;
+	u8	src_macaddr[6];
+	__be16	ethertype;	/* packet-header match fields (ethertype, addresses, ports) are big-endian */
+	u8	ip_addr_type;	/* UNKNOWN 0x0, IPV4 0x4, IPV6 0x6 */
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_IP_ADDR_TYPE_UNKNOWN 0x0UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_IP_ADDR_TYPE_IPV4    0x4UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_IP_ADDR_TYPE_IPV6    0x6UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_IP_ADDR_TYPE_LAST   CFA_NTUPLE_FILTER_ALLOC_REQ_IP_ADDR_TYPE_IPV6
+	u8	ip_protocol;	/* TCP/UDP/ICMP/ICMPV6 values coincide with IANA protocol numbers 6/17/1/58; listed out of numeric order */
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_IP_PROTOCOL_UNKNOWN 0x0UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_IP_PROTOCOL_TCP     0x6UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_IP_PROTOCOL_UDP     0x11UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_IP_PROTOCOL_ICMP    0x1UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_IP_PROTOCOL_ICMPV6  0x3aUL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_IP_PROTOCOL_RSVD    0xffUL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_IP_PROTOCOL_LAST   CFA_NTUPLE_FILTER_ALLOC_REQ_IP_PROTOCOL_RSVD
+	__le16	dst_id;
+	__le16	rfs_ring_tbl_idx;
+	u8	tunnel_type;	/* values 0x0-0xc are consecutive, then 0x10 and 0xff (ANYTUNNEL) */
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_NONTUNNEL    0x0UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN        0x1UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_NVGRE        0x2UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_L2GRE        0x3UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_IPIP         0x4UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_GENEVE       0x5UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_MPLS         0x6UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_STT          0x7UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_IPGRE        0x8UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN_V4     0x9UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_IPGRE_V1     0xaUL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_L2_ETYPE     0xbUL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN_GPE_V6 0xcUL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN_GPE    0x10UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_ANYTUNNEL    0xffUL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_LAST        CFA_NTUPLE_FILTER_ALLOC_REQ_TUNNEL_TYPE_ANYTUNNEL
+	u8	pri_hint;	/* five values 0x0-0x4; presumably positions the new filter relative to ntuple_filter_id_hint - TODO confirm */
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_PRI_HINT_NO_PREFER 0x0UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_PRI_HINT_ABOVE     0x1UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_PRI_HINT_BELOW     0x2UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_PRI_HINT_HIGHEST   0x3UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_PRI_HINT_LOWEST    0x4UL
+	#define CFA_NTUPLE_FILTER_ALLOC_REQ_PRI_HINT_LAST     CFA_NTUPLE_FILTER_ALLOC_REQ_PRI_HINT_LOWEST
+	__be32	src_ipaddr[4];	/* each address and mask is 16 bytes, stored as four big-endian words */
+	__be32	src_ipaddr_mask[4];
+	__be32	dst_ipaddr[4];
+	__be32	dst_ipaddr_mask[4];
+	__be16	src_port;
+	__be16	src_port_mask;
+	__be16	dst_port;
+	__be16	dst_port_mask;
+	__le64	ntuple_filter_id_hint;
+};
+
+/* hwrm_cfa_ntuple_filter_alloc_output (size:192b/24B): response header, 64-bit ntuple_filter_id, packed 32-bit flow_id, padding and valid byte */
+struct hwrm_cfa_ntuple_filter_alloc_output {
+	__le16	error_code;	/* response header: four 16-bit words (8 bytes) */
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le64	ntuple_filter_id;
+	__le32	flow_id;	/* packed word: bits 0-29 VALUE, bit 30 TYPE (INT 0 / EXT 1), bit 31 DIR (RX 0 / TX 1) */
+	#define CFA_NTUPLE_FILTER_ALLOC_RESP_FLOW_ID_VALUE_MASK 0x3fffffffUL
+	#define CFA_NTUPLE_FILTER_ALLOC_RESP_FLOW_ID_VALUE_SFT 0
+	#define CFA_NTUPLE_FILTER_ALLOC_RESP_FLOW_ID_TYPE      0x40000000UL
+	#define CFA_NTUPLE_FILTER_ALLOC_RESP_FLOW_ID_TYPE_INT    (0x0UL << 30)
+	#define CFA_NTUPLE_FILTER_ALLOC_RESP_FLOW_ID_TYPE_EXT    (0x1UL << 30)
+	#define CFA_NTUPLE_FILTER_ALLOC_RESP_FLOW_ID_TYPE_LAST  CFA_NTUPLE_FILTER_ALLOC_RESP_FLOW_ID_TYPE_EXT
+	#define CFA_NTUPLE_FILTER_ALLOC_RESP_FLOW_ID_DIR       0x80000000UL
+	#define CFA_NTUPLE_FILTER_ALLOC_RESP_FLOW_ID_DIR_RX      (0x0UL << 31)
+	#define CFA_NTUPLE_FILTER_ALLOC_RESP_FLOW_ID_DIR_TX      (0x1UL << 31)
+	#define CFA_NTUPLE_FILTER_ALLOC_RESP_FLOW_ID_DIR_LAST   CFA_NTUPLE_FILTER_ALLOC_RESP_FLOW_ID_DIR_TX
+	u8	unused_0[3];
+	u8	valid;	/* final byte of the response */
+};
+
+/* hwrm_cfa_ntuple_filter_alloc_cmd_err (size:64b/8B): one-byte command-specific error code plus padding; carries no request/response header */
+struct hwrm_cfa_ntuple_filter_alloc_cmd_err {
+	u8	code;	/* two values defined: UNKNOWN (0x0) and RX_MASK_VLAN_CONFLICT_ERR (0x1) */
+	#define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_UNKNOWN                   0x0UL
+	#define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_RX_MASK_VLAN_CONFLICT_ERR 0x1UL
+	#define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_LAST                     CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_RX_MASK_VLAN_CONFLICT_ERR
+	u8	unused_0[7];	/* pads the record to 8 bytes */
+};
+
+/* hwrm_cfa_ntuple_filter_free_input (size:192b/24B): request header followed by the 64-bit ntuple_filter_id */
+struct hwrm_cfa_ntuple_filter_free_input {
+	__le16	req_type;	/* request header: four 16-bit words plus the 64-bit resp_addr (16 bytes) */
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le64	ntuple_filter_id;	/* same name and width as the id in hwrm_cfa_ntuple_filter_alloc_output */
+};
+
+/* hwrm_cfa_ntuple_filter_free_output (size:128b/16B): response header, padding and valid byte; no payload fields */
+struct hwrm_cfa_ntuple_filter_free_output {
+	__le16	error_code;	/* response header: four 16-bit words (8 bytes) */
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;	/* final byte of the response */
+};
+
+/* hwrm_cfa_ntuple_filter_cfg_input (size:384b/48B): request header, enables/flags words, ntuple_filter_id and three new_* values */
+struct hwrm_cfa_ntuple_filter_cfg_input {
+	__le16	req_type;	/* request header: four 16-bit words plus the 64-bit resp_addr (16 bytes) */
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	enables;	/* placed before flags in this request; three bits named after the new_* fields below */
+	#define CFA_NTUPLE_FILTER_CFG_REQ_ENABLES_NEW_DST_ID                0x1UL
+	#define CFA_NTUPLE_FILTER_CFG_REQ_ENABLES_NEW_MIRROR_VNIC_ID        0x2UL
+	#define CFA_NTUPLE_FILTER_CFG_REQ_ENABLES_NEW_METER_INSTANCE_ID     0x4UL
+	__le32	flags;	/* three single-bit flags in bits 0-2 */
+	#define CFA_NTUPLE_FILTER_CFG_REQ_FLAGS_DEST_FID              0x1UL
+	#define CFA_NTUPLE_FILTER_CFG_REQ_FLAGS_DEST_RFS_RING_IDX     0x2UL
+	#define CFA_NTUPLE_FILTER_CFG_REQ_FLAGS_NO_L2_CONTEXT         0x4UL
+	__le64	ntuple_filter_id;
+	__le32	new_dst_id;
+	__le32	new_mirror_vnic_id;
+	__le16	new_meter_instance_id;	/* 0xffff is named INVALID and is the only value constant defined */
+	#define CFA_NTUPLE_FILTER_CFG_REQ_NEW_METER_INSTANCE_ID_INVALID 0xffffUL
+	#define CFA_NTUPLE_FILTER_CFG_REQ_NEW_METER_INSTANCE_ID_LAST   CFA_NTUPLE_FILTER_CFG_REQ_NEW_METER_INSTANCE_ID_INVALID
+	u8	unused_1[6];	/* pads the request to 48 bytes */
+};
+
+/* hwrm_cfa_ntuple_filter_cfg_output (size:128b/16B): response header, padding and valid byte; no payload fields */
+struct hwrm_cfa_ntuple_filter_cfg_output {
+	__le16	error_code;	/* response header: four 16-bit words (8 bytes) */
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;	/* final byte of the response */
+};
+
+/* hwrm_cfa_decap_filter_alloc_input (size:832b/104B): 16-byte req_type..resp_addr prefix, flags/enables masks, then match fields; tunnel_id, VLAN IDs, ethertype, IP addresses and ports are declared big-endian */
+struct hwrm_cfa_decap_filter_alloc_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	flags;
+	#define CFA_DECAP_FILTER_ALLOC_REQ_FLAGS_OVS_TUNNEL     0x1UL
+	__le32	enables; /* one bit per ..._ENABLES_* value below (0x1 through 0x10000) */
+	#define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_TUNNEL_TYPE        0x1UL
+	#define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_TUNNEL_ID          0x2UL
+	#define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_SRC_MACADDR        0x4UL
+	#define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_DST_MACADDR        0x8UL
+	#define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_OVLAN_VID          0x10UL
+	#define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_IVLAN_VID          0x20UL
+	#define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_T_OVLAN_VID        0x40UL
+	#define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_T_IVLAN_VID        0x80UL
+	#define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_ETHERTYPE          0x100UL
+	#define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_SRC_IPADDR         0x200UL
+	#define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_DST_IPADDR         0x400UL
+	#define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_IPADDR_TYPE        0x800UL
+	#define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_IP_PROTOCOL        0x1000UL
+	#define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_SRC_PORT           0x2000UL
+	#define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_DST_PORT           0x4000UL
+	#define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_DST_ID             0x8000UL
+	#define CFA_DECAP_FILTER_ALLOC_REQ_ENABLES_MIRROR_VNIC_ID     0x10000UL
+	__be32	tunnel_id;
+	u8	tunnel_type; /* takes one of the ..._TUNNEL_TYPE_* codes below */
+	#define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_NONTUNNEL    0x0UL
+	#define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN        0x1UL
+	#define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_NVGRE        0x2UL
+	#define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_L2GRE        0x3UL
+	#define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_IPIP         0x4UL
+	#define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_GENEVE       0x5UL
+	#define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_MPLS         0x6UL
+	#define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_STT          0x7UL
+	#define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_IPGRE        0x8UL
+	#define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN_V4     0x9UL
+	#define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_IPGRE_V1     0xaUL
+	#define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_L2_ETYPE     0xbUL
+	#define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN_GPE_V6 0xcUL
+	#define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_VXLAN_GPE    0x10UL
+	#define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_ANYTUNNEL    0xffUL
+	#define CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_LAST        CFA_DECAP_FILTER_ALLOC_REQ_TUNNEL_TYPE_ANYTUNNEL
+	u8	unused_0;
+	__le16	unused_1;
+	u8	src_macaddr[6];
+	u8	unused_2[2];
+	u8	dst_macaddr[6];
+	__be16	ovlan_vid;
+	__be16	ivlan_vid;
+	__be16	t_ovlan_vid;
+	__be16	t_ivlan_vid;
+	__be16	ethertype;
+	u8	ip_addr_type;
+	#define CFA_DECAP_FILTER_ALLOC_REQ_IP_ADDR_TYPE_UNKNOWN 0x0UL
+	#define CFA_DECAP_FILTER_ALLOC_REQ_IP_ADDR_TYPE_IPV4    0x4UL
+	#define CFA_DECAP_FILTER_ALLOC_REQ_IP_ADDR_TYPE_IPV6    0x6UL
+	#define CFA_DECAP_FILTER_ALLOC_REQ_IP_ADDR_TYPE_LAST   CFA_DECAP_FILTER_ALLOC_REQ_IP_ADDR_TYPE_IPV6
+	u8	ip_protocol;
+	#define CFA_DECAP_FILTER_ALLOC_REQ_IP_PROTOCOL_UNKNOWN 0x0UL
+	#define CFA_DECAP_FILTER_ALLOC_REQ_IP_PROTOCOL_TCP     0x6UL
+	#define CFA_DECAP_FILTER_ALLOC_REQ_IP_PROTOCOL_UDP     0x11UL
+	#define CFA_DECAP_FILTER_ALLOC_REQ_IP_PROTOCOL_LAST   CFA_DECAP_FILTER_ALLOC_REQ_IP_PROTOCOL_UDP
+	__le16	unused_3;
+	__le32	unused_4;
+	__be32	src_ipaddr[4]; /* 4 x 32 bits = 16 bytes */
+	__be32	dst_ipaddr[4]; /* 4 x 32 bits = 16 bytes */
+	__be16	src_port;
+	__be16	dst_port;
+	__le16	dst_id;
+	__le16	l2_ctxt_ref_id;
+};
+
+/* hwrm_cfa_decap_filter_alloc_output (size:128b/16B): four 16-bit fields, a 32-bit decap_filter_id, 3 unused bytes, then valid as the last byte */
+struct hwrm_cfa_decap_filter_alloc_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le32	decap_filter_id; /* same name and type as the field in hwrm_cfa_decap_filter_free_input */
+	u8	unused_0[3];
+	u8	valid;
+};
+
+/* hwrm_cfa_decap_filter_free_input (size:192b/24B): 16-byte req_type..resp_addr prefix, a 32-bit decap_filter_id, then 4 unused bytes */
+struct hwrm_cfa_decap_filter_free_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	decap_filter_id;
+	u8	unused_0[4];
+};
+
+/* hwrm_cfa_decap_filter_free_output (size:128b/16B): four 16-bit fields (8 bytes), 7 unused bytes, then valid as the last byte; no command-specific payload */
+struct hwrm_cfa_decap_filter_free_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_cfa_flow_alloc_input (size:1024b/128B): 16-byte req_type..resp_addr prefix followed by flow flags, action flags and match/rewrite fields; fields typed __be* are big-endian, __le* little-endian */
+struct hwrm_cfa_flow_alloc_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	flags; /* bit 0 TUNNEL, bits 1-2 NUM_VLAN, bits 3-5 FLOWTYPE, single-bit flags from 0x40 to 0x200 */
+	#define CFA_FLOW_ALLOC_REQ_FLAGS_TUNNEL                 0x1UL
+	#define CFA_FLOW_ALLOC_REQ_FLAGS_NUM_VLAN_MASK          0x6UL
+	#define CFA_FLOW_ALLOC_REQ_FLAGS_NUM_VLAN_SFT           1
+	#define CFA_FLOW_ALLOC_REQ_FLAGS_NUM_VLAN_NONE            (0x0UL << 1)
+	#define CFA_FLOW_ALLOC_REQ_FLAGS_NUM_VLAN_ONE             (0x1UL << 1)
+	#define CFA_FLOW_ALLOC_REQ_FLAGS_NUM_VLAN_TWO             (0x2UL << 1)
+	#define CFA_FLOW_ALLOC_REQ_FLAGS_NUM_VLAN_LAST           CFA_FLOW_ALLOC_REQ_FLAGS_NUM_VLAN_TWO
+	#define CFA_FLOW_ALLOC_REQ_FLAGS_FLOWTYPE_MASK          0x38UL
+	#define CFA_FLOW_ALLOC_REQ_FLAGS_FLOWTYPE_SFT           3
+	#define CFA_FLOW_ALLOC_REQ_FLAGS_FLOWTYPE_L2              (0x0UL << 3)
+	#define CFA_FLOW_ALLOC_REQ_FLAGS_FLOWTYPE_IPV4            (0x1UL << 3)
+	#define CFA_FLOW_ALLOC_REQ_FLAGS_FLOWTYPE_IPV6            (0x2UL << 3)
+	#define CFA_FLOW_ALLOC_REQ_FLAGS_FLOWTYPE_LAST           CFA_FLOW_ALLOC_REQ_FLAGS_FLOWTYPE_IPV6
+	#define CFA_FLOW_ALLOC_REQ_FLAGS_PATH_TX                0x40UL
+	#define CFA_FLOW_ALLOC_REQ_FLAGS_PATH_RX                0x80UL
+	#define CFA_FLOW_ALLOC_REQ_FLAGS_MATCH_VXLAN_IP_VNI     0x100UL
+	#define CFA_FLOW_ALLOC_REQ_FLAGS_VHOST_ID_USE_VLAN      0x200UL
+	__le16	src_fid;
+	__le32	tunnel_handle;
+	__le16	action_flags; /* one bit per ..._ACTION_FLAGS_* value below (0x1 through 0x2000) */
+	#define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_FWD                       0x1UL
+	#define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_RECYCLE                   0x2UL
+	#define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_DROP                      0x4UL
+	#define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_METER                     0x8UL
+	#define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_TUNNEL                    0x10UL
+	#define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_NAT_SRC                   0x20UL
+	#define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_NAT_DEST                  0x40UL
+	#define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_NAT_IPV4_ADDRESS          0x80UL
+	#define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_L2_HEADER_REWRITE         0x100UL
+	#define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_TTL_DECREMENT             0x200UL
+	#define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_TUNNEL_IP                 0x400UL
+	#define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_FLOW_AGING_ENABLED        0x800UL
+	#define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_PRI_HINT                  0x1000UL
+	#define CFA_FLOW_ALLOC_REQ_ACTION_FLAGS_NO_FLOW_COUNTER_ALLOC     0x2000UL
+	__le16	dst_fid;
+	__be16	l2_rewrite_vlan_tpid;
+	__be16	l2_rewrite_vlan_tci;
+	__le16	act_meter_id;
+	__le16	ref_flow_handle;
+	__be16	ethertype;
+	__be16	outer_vlan_tci;
+	__be16	dmac[3]; /* 3 x 16 bits = 6 bytes */
+	__be16	inner_vlan_tci;
+	__be16	smac[3]; /* 3 x 16 bits = 6 bytes */
+	u8	ip_dst_mask_len;
+	u8	ip_src_mask_len;
+	__be32	ip_dst[4]; /* 4 x 32 bits = 16 bytes */
+	__be32	ip_src[4]; /* 4 x 32 bits = 16 bytes */
+	__be16	l4_src_port;
+	__be16	l4_src_port_mask;
+	__be16	l4_dst_port;
+	__be16	l4_dst_port_mask;
+	__be32	nat_ip_address[4];
+	__be16	l2_rewrite_dmac[3];
+	__be16	nat_port;
+	__be16	l2_rewrite_smac[3];
+	u8	ip_proto;
+	u8	tunnel_type; /* takes one of the ..._TUNNEL_TYPE_* codes below; last byte of the structure */
+	#define CFA_FLOW_ALLOC_REQ_TUNNEL_TYPE_NONTUNNEL    0x0UL
+	#define CFA_FLOW_ALLOC_REQ_TUNNEL_TYPE_VXLAN        0x1UL
+	#define CFA_FLOW_ALLOC_REQ_TUNNEL_TYPE_NVGRE        0x2UL
+	#define CFA_FLOW_ALLOC_REQ_TUNNEL_TYPE_L2GRE        0x3UL
+	#define CFA_FLOW_ALLOC_REQ_TUNNEL_TYPE_IPIP         0x4UL
+	#define CFA_FLOW_ALLOC_REQ_TUNNEL_TYPE_GENEVE       0x5UL
+	#define CFA_FLOW_ALLOC_REQ_TUNNEL_TYPE_MPLS         0x6UL
+	#define CFA_FLOW_ALLOC_REQ_TUNNEL_TYPE_STT          0x7UL
+	#define CFA_FLOW_ALLOC_REQ_TUNNEL_TYPE_IPGRE        0x8UL
+	#define CFA_FLOW_ALLOC_REQ_TUNNEL_TYPE_VXLAN_V4     0x9UL
+	#define CFA_FLOW_ALLOC_REQ_TUNNEL_TYPE_IPGRE_V1     0xaUL
+	#define CFA_FLOW_ALLOC_REQ_TUNNEL_TYPE_L2_ETYPE     0xbUL
+	#define CFA_FLOW_ALLOC_REQ_TUNNEL_TYPE_VXLAN_GPE_V6 0xcUL
+	#define CFA_FLOW_ALLOC_REQ_TUNNEL_TYPE_VXLAN_GPE    0x10UL
+	#define CFA_FLOW_ALLOC_REQ_TUNNEL_TYPE_ANYTUNNEL    0xffUL
+	#define CFA_FLOW_ALLOC_REQ_TUNNEL_TYPE_LAST        CFA_FLOW_ALLOC_REQ_TUNNEL_TYPE_ANYTUNNEL
+};
+
+/* hwrm_cfa_flow_alloc_output (size:256b/32B): four 16-bit fields, then a 16-bit flow_handle, a packed 32-bit flow_id, a 64-bit ext_flow_handle and a 32-bit flow_counter_id; valid is the last byte */
+struct hwrm_cfa_flow_alloc_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le16	flow_handle;
+	u8	unused_0[2];
+	__le32	flow_id; /* bits 0-29 value, bit 30 type (INT=0, EXT=1), bit 31 direction (RX=0, TX=1) */
+	#define CFA_FLOW_ALLOC_RESP_FLOW_ID_VALUE_MASK 0x3fffffffUL
+	#define CFA_FLOW_ALLOC_RESP_FLOW_ID_VALUE_SFT 0
+	#define CFA_FLOW_ALLOC_RESP_FLOW_ID_TYPE      0x40000000UL
+	#define CFA_FLOW_ALLOC_RESP_FLOW_ID_TYPE_INT    (0x0UL << 30)
+	#define CFA_FLOW_ALLOC_RESP_FLOW_ID_TYPE_EXT    (0x1UL << 30)
+	#define CFA_FLOW_ALLOC_RESP_FLOW_ID_TYPE_LAST  CFA_FLOW_ALLOC_RESP_FLOW_ID_TYPE_EXT
+	#define CFA_FLOW_ALLOC_RESP_FLOW_ID_DIR       0x80000000UL
+	#define CFA_FLOW_ALLOC_RESP_FLOW_ID_DIR_RX      (0x0UL << 31)
+	#define CFA_FLOW_ALLOC_RESP_FLOW_ID_DIR_TX      (0x1UL << 31)
+	#define CFA_FLOW_ALLOC_RESP_FLOW_ID_DIR_LAST   CFA_FLOW_ALLOC_RESP_FLOW_ID_DIR_TX
+	__le64	ext_flow_handle;
+	__le32	flow_counter_id;
+	u8	unused_1[3];
+	u8	valid;
+};
+
+/* hwrm_cfa_flow_alloc_cmd_err (size:64b/8B): a one-byte code taking one of the ..._CMD_ERR_CODE_* values (0x0-0x7), followed by 7 unused bytes */
+struct hwrm_cfa_flow_alloc_cmd_err {
+	u8	code;
+	#define CFA_FLOW_ALLOC_CMD_ERR_CODE_UNKNOWN         0x0UL
+	#define CFA_FLOW_ALLOC_CMD_ERR_CODE_L2_CONTEXT_TCAM 0x1UL
+	#define CFA_FLOW_ALLOC_CMD_ERR_CODE_ACTION_RECORD   0x2UL
+	#define CFA_FLOW_ALLOC_CMD_ERR_CODE_FLOW_COUNTER    0x3UL
+	#define CFA_FLOW_ALLOC_CMD_ERR_CODE_WILD_CARD_TCAM  0x4UL
+	#define CFA_FLOW_ALLOC_CMD_ERR_CODE_HASH_COLLISION  0x5UL
+	#define CFA_FLOW_ALLOC_CMD_ERR_CODE_KEY_EXISTS      0x6UL
+	#define CFA_FLOW_ALLOC_CMD_ERR_CODE_FLOW_CTXT_DB    0x7UL
+	#define CFA_FLOW_ALLOC_CMD_ERR_CODE_LAST           CFA_FLOW_ALLOC_CMD_ERR_CODE_FLOW_CTXT_DB
+	u8	unused_0[7];
+};
+
+/* hwrm_cfa_flow_free_input (size:256b/32B): 16-byte req_type..resp_addr prefix, then flow_handle, flow_counter_id and ext_flow_handle (same names and widths as in hwrm_cfa_flow_alloc_output) */
+struct hwrm_cfa_flow_free_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	flow_handle;
+	__le16	unused_0;
+	__le32	flow_counter_id;
+	__le64	ext_flow_handle;
+};
+
+/* hwrm_cfa_flow_free_output (size:256b/32B): four 16-bit fields, two 64-bit values (packet, byte), 7 unused bytes, then valid as the last byte */
+struct hwrm_cfa_flow_free_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le64	packet; /* NOTE(review): presumably the freed flow's final packet count - confirm against the HWRM spec */
+	__le64	byte; /* NOTE(review): presumably the freed flow's final byte count - confirm against the HWRM spec */
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_cfa_flow_info_input (size:256b/32B): 16-byte req_type..resp_addr prefix, a 16-bit flow_handle with reserved encodings, 6 unused bytes, then a 64-bit ext_flow_handle */
+struct hwrm_cfa_flow_info_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	flow_handle; /* MAX_MASK covers the low 12 bits; each *_RX constant equals its TX-side counterpart OR'ed with DIR_RX (0x8000) */
+	#define CFA_FLOW_INFO_REQ_FLOW_HANDLE_MAX_MASK      0xfffUL
+	#define CFA_FLOW_INFO_REQ_FLOW_HANDLE_CNP_CNT       0x1000UL
+	#define CFA_FLOW_INFO_REQ_FLOW_HANDLE_ROCEV1_CNT    0x2000UL
+	#define CFA_FLOW_INFO_REQ_FLOW_HANDLE_NIC_TX        0x3000UL
+	#define CFA_FLOW_INFO_REQ_FLOW_HANDLE_ROCEV2_CNT    0x4000UL
+	#define CFA_FLOW_INFO_REQ_FLOW_HANDLE_DIR_RX        0x8000UL
+	#define CFA_FLOW_INFO_REQ_FLOW_HANDLE_CNP_CNT_RX    0x9000UL
+	#define CFA_FLOW_INFO_REQ_FLOW_HANDLE_ROCEV1_CNT_RX 0xa000UL
+	#define CFA_FLOW_INFO_REQ_FLOW_HANDLE_NIC_RX        0xb000UL
+	#define CFA_FLOW_INFO_REQ_FLOW_HANDLE_ROCEV2_CNT_RX 0xc000UL
+	#define CFA_FLOW_INFO_REQ_FLOW_HANDLE_LAST         CFA_FLOW_INFO_REQ_FLOW_HANDLE_ROCEV2_CNT_RX
+	u8	unused_0[6];
+	__le64	ext_flow_handle;
+};
+
+/* hwrm_cfa_flow_info_output (size:5632b/704B): four 16-bit fields, per-flow identifiers and info words, then two 32-bit word arrays (flow_key_data, flow_action_info); valid is the last byte */
+struct hwrm_cfa_flow_info_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	flags;
+	#define CFA_FLOW_INFO_RESP_FLAGS_PATH_TX     0x1UL
+	#define CFA_FLOW_INFO_RESP_FLAGS_PATH_RX     0x2UL
+	u8	profile;
+	__le16	src_fid;
+	__le16	dst_fid;
+	__le16	l2_ctxt_id;
+	__le64	em_info;
+	__le64	tcam_info;
+	__le64	vfp_tcam_info;
+	__le16	ar_id;
+	__le16	flow_handle;
+	__le32	tunnel_handle;
+	__le16	flow_timer;
+	u8	unused_0[6];
+	__le32	flow_key_data[130]; /* 130 x 4 = 520 bytes */
+	__le32	flow_action_info[30]; /* 30 x 4 = 120 bytes */
+	u8	unused_1[7];
+	u8	valid;
+};
+
+/* hwrm_cfa_flow_stats_input (size:640b/80B): 16-byte req_type..resp_addr prefix, num_flows, ten 16-bit flow_handle_N slots, 2 unused bytes, then ten 32-bit flow_id_N slots (N = 0..9) */
+struct hwrm_cfa_flow_stats_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	num_flows; /* NOTE(review): presumably how many of the ten slots below are populated - confirm against the caller */
+	__le16	flow_handle_0;
+	__le16	flow_handle_1;
+	__le16	flow_handle_2;
+	__le16	flow_handle_3;
+	__le16	flow_handle_4;
+	__le16	flow_handle_5;
+	__le16	flow_handle_6;
+	__le16	flow_handle_7;
+	__le16	flow_handle_8;
+	__le16	flow_handle_9;
+	u8	unused_0[2];
+	__le32	flow_id_0;
+	__le32	flow_id_1;
+	__le32	flow_id_2;
+	__le32	flow_id_3;
+	__le32	flow_id_4;
+	__le32	flow_id_5;
+	__le32	flow_id_6;
+	__le32	flow_id_7;
+	__le32	flow_id_8;
+	__le32	flow_id_9;
+};
+
+/* hwrm_cfa_flow_stats_output (size:1408b/176B): four 16-bit fields, ten 64-bit packet_N values, ten 64-bit byte_N values (N = 0..9), a 16-bit flow_hits, 5 unused bytes, then valid as the last byte */
+struct hwrm_cfa_flow_stats_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le64	packet_0;
+	__le64	packet_1;
+	__le64	packet_2;
+	__le64	packet_3;
+	__le64	packet_4;
+	__le64	packet_5;
+	__le64	packet_6;
+	__le64	packet_7;
+	__le64	packet_8;
+	__le64	packet_9;
+	__le64	byte_0;
+	__le64	byte_1;
+	__le64	byte_2;
+	__le64	byte_3;
+	__le64	byte_4;
+	__le64	byte_5;
+	__le64	byte_6;
+	__le64	byte_7;
+	__le64	byte_8;
+	__le64	byte_9;
+	__le16	flow_hits;
+	u8	unused_0[5];
+	u8	valid;
+};
+
+/* hwrm_cfa_vfr_alloc_input (size:448b/56B): 16-byte req_type..resp_addr prefix, vf_id, reserved, 4 unused bytes, then a 32-byte vfr_name; note vfr_name comes last here but first in hwrm_cfa_vfr_free_input */
+struct hwrm_cfa_vfr_alloc_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	vf_id;
+	__le16	reserved;
+	u8	unused_0[4];
+	char	vfr_name[32];
+};
+
+/* hwrm_cfa_vfr_alloc_output (size:128b/16B): four 16-bit fields, 16-bit rx_cfa_code and tx_cfa_action, 3 unused bytes, then valid as the last byte */
+struct hwrm_cfa_vfr_alloc_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le16	rx_cfa_code;
+	__le16	tx_cfa_action;
+	u8	unused_0[3];
+	u8	valid;
+};
+
+/* hwrm_cfa_vfr_free_input (size:448b/56B): 16-byte req_type..resp_addr prefix, a 32-byte vfr_name, then vf_id, reserved and 4 unused bytes; same fields as hwrm_cfa_vfr_alloc_input but with vfr_name placed first */
+struct hwrm_cfa_vfr_free_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	char	vfr_name[32];
+	__le16	vf_id;
+	__le16	reserved;
+	u8	unused_0[4];
+};
+
+/* hwrm_cfa_vfr_free_output (size:128b/16B): four 16-bit fields (8 bytes), 7 unused bytes, then valid as the last byte; no command-specific payload */
+struct hwrm_cfa_vfr_free_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_cfa_eem_qcaps_input (size:192b/24B): 16-byte req_type..resp_addr prefix, a 32-bit flags word (PATH_TX, PATH_RX, PREFERRED_OFFLOAD bits), then an unused 32-bit word */
+struct hwrm_cfa_eem_qcaps_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	flags;
+	#define CFA_EEM_QCAPS_REQ_FLAGS_PATH_TX               0x1UL
+	#define CFA_EEM_QCAPS_REQ_FLAGS_PATH_RX               0x2UL
+	#define CFA_EEM_QCAPS_REQ_FLAGS_PREFERRED_OFFLOAD     0x4UL
+	__le32	unused_0;
+};
+
+/* hwrm_cfa_eem_qcaps_output (size:320b/40B): four 16-bit fields, a flags word, a supported-tables bit mask, max_entries_supported and four 16-bit entry sizes; valid is the last byte */
+struct hwrm_cfa_eem_qcaps_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le32	flags;
+	#define CFA_EEM_QCAPS_RESP_FLAGS_PATH_TX                                         0x1UL
+	#define CFA_EEM_QCAPS_RESP_FLAGS_PATH_RX                                         0x2UL
+	#define CFA_EEM_QCAPS_RESP_FLAGS_CENTRALIZED_MEMORY_MODEL_SUPPORTED              0x4UL
+	#define CFA_EEM_QCAPS_RESP_FLAGS_DETACHED_CENTRALIZED_MEMORY_MODEL_SUPPORTED     0x8UL
+	__le32	unused_0;
+	__le32	supported; /* one bit per table type (..._SUPPORTED_* values 0x1 through 0x10) */
+	#define CFA_EEM_QCAPS_RESP_SUPPORTED_KEY0_TABLE                       0x1UL
+	#define CFA_EEM_QCAPS_RESP_SUPPORTED_KEY1_TABLE                       0x2UL
+	#define CFA_EEM_QCAPS_RESP_SUPPORTED_EXTERNAL_RECORD_TABLE            0x4UL
+	#define CFA_EEM_QCAPS_RESP_SUPPORTED_EXTERNAL_FLOW_COUNTERS_TABLE     0x8UL
+	#define CFA_EEM_QCAPS_RESP_SUPPORTED_FID_TABLE                        0x10UL
+	__le32	max_entries_supported;
+	__le16	key_entry_size;
+	__le16	record_entry_size;
+	__le16	efc_entry_size;
+	__le16	fid_entry_size;
+	u8	unused_1[7];
+	u8	valid;
+};
+
+/* hwrm_cfa_eem_cfg_input (size:384b/48B): 16-byte req_type..resp_addr prefix, a flags word, group_id, num_entries and five 16-bit *_ctx_id fields, with unused words in between */
+struct hwrm_cfa_eem_cfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	flags;
+	#define CFA_EEM_CFG_REQ_FLAGS_PATH_TX               0x1UL
+	#define CFA_EEM_CFG_REQ_FLAGS_PATH_RX               0x2UL
+	#define CFA_EEM_CFG_REQ_FLAGS_PREFERRED_OFFLOAD     0x4UL
+	#define CFA_EEM_CFG_REQ_FLAGS_SECONDARY_PF          0x8UL
+	__le16	group_id;
+	__le16	unused_0;
+	__le32	num_entries;
+	__le32	unused_1;
+	__le16	key0_ctx_id;
+	__le16	key1_ctx_id;
+	__le16	record_ctx_id;
+	__le16	efc_ctx_id;
+	__le16	fid_ctx_id;
+	__le16	unused_2;
+	__le32	unused_3;
+};
+
+/* hwrm_cfa_eem_cfg_output (size:128b/16B): four 16-bit fields (8 bytes), 7 unused bytes, then valid as the last byte; no command-specific payload */
+struct hwrm_cfa_eem_cfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_cfa_eem_qcfg_input (size:192b/24B): 16-byte req_type..resp_addr prefix, a 32-bit flags word (PATH_TX, PATH_RX bits), then an unused 32-bit word */
+struct hwrm_cfa_eem_qcfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	flags;
+	#define CFA_EEM_QCFG_REQ_FLAGS_PATH_TX     0x1UL
+	#define CFA_EEM_QCFG_REQ_FLAGS_PATH_RX     0x2UL
+	__le32	unused_0;
+};
+
+/* hwrm_cfa_eem_qcfg_output (size:256b/32B): four 16-bit fields, a flags word, num_entries and five 16-bit *_ctx_id fields (same names as in hwrm_cfa_eem_cfg_input), 5 unused bytes, then valid as the last byte */
+struct hwrm_cfa_eem_qcfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le32	flags;
+	#define CFA_EEM_QCFG_RESP_FLAGS_PATH_TX               0x1UL
+	#define CFA_EEM_QCFG_RESP_FLAGS_PATH_RX               0x2UL
+	#define CFA_EEM_QCFG_RESP_FLAGS_PREFERRED_OFFLOAD     0x4UL
+	__le32	num_entries;
+	__le16	key0_ctx_id;
+	__le16	key1_ctx_id;
+	__le16	record_ctx_id;
+	__le16	efc_ctx_id;
+	__le16	fid_ctx_id;
+	u8	unused_2[5];
+	u8	valid;
+};
+
+/* hwrm_cfa_eem_op_input (size:192b/24B): 16-byte req_type..resp_addr prefix, a 32-bit flags word (PATH_TX, PATH_RX bits), an unused 16-bit word, then a 16-bit op code */
+struct hwrm_cfa_eem_op_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	flags;
+	#define CFA_EEM_OP_REQ_FLAGS_PATH_TX     0x1UL
+	#define CFA_EEM_OP_REQ_FLAGS_PATH_RX     0x2UL
+	__le16	unused_0;
+	__le16	op; /* takes one of the ..._OP_* codes below (0x0 through 0x3) */
+	#define CFA_EEM_OP_REQ_OP_RESERVED    0x0UL
+	#define CFA_EEM_OP_REQ_OP_EEM_DISABLE 0x1UL
+	#define CFA_EEM_OP_REQ_OP_EEM_ENABLE  0x2UL
+	#define CFA_EEM_OP_REQ_OP_EEM_CLEANUP 0x3UL
+	#define CFA_EEM_OP_REQ_OP_LAST       CFA_EEM_OP_REQ_OP_EEM_CLEANUP
+};
+
+/* hwrm_cfa_eem_op_output (size:128b/16B): four 16-bit fields (8 bytes), 7 unused bytes, then valid as the last byte; no command-specific payload */
+struct hwrm_cfa_eem_op_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_cfa_adv_flow_mgnt_qcaps_input (size:256b/32B): 16-byte req_type..resp_addr prefix followed by four unused 32-bit words; carries no command-specific fields */
+struct hwrm_cfa_adv_flow_mgnt_qcaps_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	unused_0[4];
+};
+
+/* hwrm_cfa_adv_flow_mgnt_qcaps_output (size:128b/16B): four 16-bit fields, a 32-bit capability flags word, 3 unused bytes, then valid as the last byte */
+struct hwrm_cfa_adv_flow_mgnt_qcaps_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le32	flags; /* one bit per ..._FLAGS_* value below; bits 0 through 21 are defined */
+	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_FLOW_HND_16BIT_SUPPORTED                     0x1UL
+	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_FLOW_HND_64BIT_SUPPORTED                     0x2UL
+	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_FLOW_BATCH_DELETE_SUPPORTED                  0x4UL
+	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_FLOW_RESET_ALL_SUPPORTED                     0x8UL
+	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_NTUPLE_FLOW_DEST_FUNC_SUPPORTED              0x10UL
+	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_TX_EEM_FLOW_SUPPORTED                        0x20UL
+	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_RX_EEM_FLOW_SUPPORTED                        0x40UL
+	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_FLOW_COUNTER_ALLOC_SUPPORTED                 0x80UL
+	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_RFS_RING_TBL_IDX_SUPPORTED                   0x100UL
+	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_UNTAGGED_VLAN_SUPPORTED                      0x200UL
+	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_XDP_SUPPORTED                                0x400UL
+	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_L2_HEADER_SOURCE_FIELDS_SUPPORTED            0x800UL
+	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_NTUPLE_FLOW_RX_ARP_SUPPORTED                 0x1000UL
+	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_RFS_RING_TBL_IDX_V2_SUPPORTED                0x2000UL
+	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_NTUPLE_FLOW_RX_ETHERTYPE_IP_SUPPORTED        0x4000UL
+	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_TRUFLOW_CAPABLE                              0x8000UL
+	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_L2_FILTER_TRAFFIC_TYPE_L2_ROCE_SUPPORTED     0x10000UL
+	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_LAG_SUPPORTED                                0x20000UL
+	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_NTUPLE_FLOW_NO_L2CTX_SUPPORTED               0x40000UL
+	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_NIC_FLOW_STATS_SUPPORTED                     0x80000UL
+	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_NTUPLE_FLOW_RX_EXT_IP_PROTO_SUPPORTED        0x100000UL
+	#define CFA_ADV_FLOW_MGNT_QCAPS_RESP_FLAGS_RFS_RING_TBL_IDX_V3_SUPPORTED                0x200000UL
+	u8	unused_0[3];
+	u8	valid;
+};
+
+/* hwrm_tunnel_dst_port_query_input (size:192b/24B): 16-byte req_type..resp_addr prefix, a one-byte tunnel_type code, a one-byte tunnel_next_proto, then 6 unused bytes */
+struct hwrm_tunnel_dst_port_query_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	u8	tunnel_type; /* takes one of the ..._TUNNEL_TYPE_* codes below; the list is not contiguous (0x1, 0x5, 0x9-0x19) */
+	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_VXLAN              0x1UL
+	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_GENEVE             0x5UL
+	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_VXLAN_V4           0x9UL
+	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_IPGRE_V1           0xaUL
+	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_L2_ETYPE           0xbUL
+	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_VXLAN_GPE_V6       0xcUL
+	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_CUSTOM_GRE         0xdUL
+	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_ECPRI              0xeUL
+	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_SRV6               0xfUL
+	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_VXLAN_GPE          0x10UL
+	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_GRE                0x11UL
+	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_ULP_DYN_UPAR       0x12UL
+	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES01 0x13UL
+	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES02 0x14UL
+	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES03 0x15UL
+	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES04 0x16UL
+	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES05 0x17UL
+	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES06 0x18UL
+	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES07 0x19UL
+	#define TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_LAST              TUNNEL_DST_PORT_QUERY_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES07
+	u8	tunnel_next_proto;
+	u8	unused_0[6];
+};
+
+/* hwrm_tunnel_dst_port_query_output (size:128b/16B): four 16-bit fields, a little-endian port id, a big-endian port value, a UPAR bit mask, a status byte, one unused byte, then valid as the last byte */
+struct hwrm_tunnel_dst_port_query_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le16	tunnel_dst_port_id;
+	__be16	tunnel_dst_port_val; /* declared big-endian, unlike the __le16 id above */
+	u8	upar_in_use; /* one bit per UPAR0..UPAR7 */
+	#define TUNNEL_DST_PORT_QUERY_RESP_UPAR_IN_USE_UPAR0     0x1UL
+	#define TUNNEL_DST_PORT_QUERY_RESP_UPAR_IN_USE_UPAR1     0x2UL
+	#define TUNNEL_DST_PORT_QUERY_RESP_UPAR_IN_USE_UPAR2     0x4UL
+	#define TUNNEL_DST_PORT_QUERY_RESP_UPAR_IN_USE_UPAR3     0x8UL
+	#define TUNNEL_DST_PORT_QUERY_RESP_UPAR_IN_USE_UPAR4     0x10UL
+	#define TUNNEL_DST_PORT_QUERY_RESP_UPAR_IN_USE_UPAR5     0x20UL
+	#define TUNNEL_DST_PORT_QUERY_RESP_UPAR_IN_USE_UPAR6     0x40UL
+	#define TUNNEL_DST_PORT_QUERY_RESP_UPAR_IN_USE_UPAR7     0x80UL
+	u8	status;
+	#define TUNNEL_DST_PORT_QUERY_RESP_STATUS_CHIP_LEVEL     0x1UL
+	#define TUNNEL_DST_PORT_QUERY_RESP_STATUS_FUNC_LEVEL     0x2UL
+	u8	unused_0;
+	u8	valid;
+};
+
+/* hwrm_tunnel_dst_port_alloc_input (size:192b/24B): 16-byte req_type..resp_addr prefix, a one-byte tunnel_type code, a one-byte tunnel_next_proto, a big-endian 16-bit port value, then 4 unused bytes */
+struct hwrm_tunnel_dst_port_alloc_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	u8	tunnel_type; /* takes one of the ..._TUNNEL_TYPE_* codes below; the list is not contiguous (0x1, 0x5, 0x9-0x19) */
+	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_VXLAN              0x1UL
+	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_GENEVE             0x5UL
+	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_VXLAN_V4           0x9UL
+	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_IPGRE_V1           0xaUL
+	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_L2_ETYPE           0xbUL
+	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_VXLAN_GPE_V6       0xcUL
+	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_CUSTOM_GRE         0xdUL
+	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_ECPRI              0xeUL
+	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_SRV6               0xfUL
+	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_VXLAN_GPE          0x10UL
+	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_GRE                0x11UL
+	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_ULP_DYN_UPAR       0x12UL
+	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES01 0x13UL
+	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES02 0x14UL
+	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES03 0x15UL
+	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES04 0x16UL
+	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES05 0x17UL
+	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES06 0x18UL
+	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES07 0x19UL
+	#define TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_LAST              TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES07
+	u8	tunnel_next_proto;
+	__be16	tunnel_dst_port_val; /* declared big-endian */
+	u8	unused_0[4];
+};
+
+/* hwrm_tunnel_dst_port_alloc_output (size:128b/16B): four 16-bit fields, a 16-bit tunnel_dst_port_id, a one-byte error_info code, a UPAR bit mask, 3 unused bytes, then valid as the last byte */
+struct hwrm_tunnel_dst_port_alloc_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le16	tunnel_dst_port_id;
+	u8	error_info; /* takes one of the ..._ERROR_INFO_* codes below; SUCCESS is 0x0 */
+	#define TUNNEL_DST_PORT_ALLOC_RESP_ERROR_INFO_SUCCESS         0x0UL
+	#define TUNNEL_DST_PORT_ALLOC_RESP_ERROR_INFO_ERR_ALLOCATED   0x1UL
+	#define TUNNEL_DST_PORT_ALLOC_RESP_ERROR_INFO_ERR_NO_RESOURCE 0x2UL
+	#define TUNNEL_DST_PORT_ALLOC_RESP_ERROR_INFO_ERR_ENABLED     0x3UL
+	#define TUNNEL_DST_PORT_ALLOC_RESP_ERROR_INFO_LAST           TUNNEL_DST_PORT_ALLOC_RESP_ERROR_INFO_ERR_ENABLED
+	u8	upar_in_use; /* one bit per UPAR0..UPAR7 */
+	#define TUNNEL_DST_PORT_ALLOC_RESP_UPAR_IN_USE_UPAR0     0x1UL
+	#define TUNNEL_DST_PORT_ALLOC_RESP_UPAR_IN_USE_UPAR1     0x2UL
+	#define TUNNEL_DST_PORT_ALLOC_RESP_UPAR_IN_USE_UPAR2     0x4UL
+	#define TUNNEL_DST_PORT_ALLOC_RESP_UPAR_IN_USE_UPAR3     0x8UL
+	#define TUNNEL_DST_PORT_ALLOC_RESP_UPAR_IN_USE_UPAR4     0x10UL
+	#define TUNNEL_DST_PORT_ALLOC_RESP_UPAR_IN_USE_UPAR5     0x20UL
+	#define TUNNEL_DST_PORT_ALLOC_RESP_UPAR_IN_USE_UPAR6     0x40UL
+	#define TUNNEL_DST_PORT_ALLOC_RESP_UPAR_IN_USE_UPAR7     0x80UL
+	u8	unused_0[3];
+	u8	valid;
+};
+
+/* hwrm_tunnel_dst_port_free_input (size:192b/24B): 16-byte req_type..resp_addr prefix, a one-byte tunnel_type code, a one-byte tunnel_next_proto, a little-endian 16-bit tunnel_dst_port_id, then 4 unused bytes */
+struct hwrm_tunnel_dst_port_free_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	u8	tunnel_type; /* takes one of the ..._TUNNEL_TYPE_* codes below; the list is not contiguous (0x1, 0x5, 0x9-0x19) */
+	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN              0x1UL
+	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_GENEVE             0x5UL
+	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN_V4           0x9UL
+	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_IPGRE_V1           0xaUL
+	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_L2_ETYPE           0xbUL
+	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN_GPE_V6       0xcUL
+	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_CUSTOM_GRE         0xdUL
+	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_ECPRI              0xeUL
+	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_SRV6               0xfUL
+	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN_GPE          0x10UL
+	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_GRE                0x11UL
+	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_ULP_DYN_UPAR       0x12UL
+	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES01 0x13UL
+	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES02 0x14UL
+	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES03 0x15UL
+	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES04 0x16UL
+	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES05 0x17UL
+	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES06 0x18UL
+	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES07 0x19UL
+	#define TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_LAST              TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_ULP_DYN_UPAR_RES07
+	u8	tunnel_next_proto;
+	__le16	tunnel_dst_port_id;
+	u8	unused_0[4];
+};
+
+/* hwrm_tunnel_dst_port_free_output (size:128b/16B): four 16-bit fields, a one-byte error_info code, 6 unused bytes, then valid as the last byte */
+struct hwrm_tunnel_dst_port_free_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	error_info; /* takes one of the ..._ERROR_INFO_* codes below; SUCCESS is 0x0 */
+	#define TUNNEL_DST_PORT_FREE_RESP_ERROR_INFO_SUCCESS           0x0UL
+	#define TUNNEL_DST_PORT_FREE_RESP_ERROR_INFO_ERR_NOT_OWNER     0x1UL
+	#define TUNNEL_DST_PORT_FREE_RESP_ERROR_INFO_ERR_NOT_ALLOCATED 0x2UL
+	#define TUNNEL_DST_PORT_FREE_RESP_ERROR_INFO_LAST             TUNNEL_DST_PORT_FREE_RESP_ERROR_INFO_ERR_NOT_ALLOCATED
+	u8	unused_1[6];
+	u8	valid;
+};
+
+/* ctx_hw_stats (size:1280b/160B): twenty little-endian 64-bit counters - 8 rx, 8 tx, 4 tpa; note rx lists discard before error while tx lists error before discard */
+struct ctx_hw_stats {
+	__le64	rx_ucast_pkts;
+	__le64	rx_mcast_pkts;
+	__le64	rx_bcast_pkts;
+	__le64	rx_discard_pkts;
+	__le64	rx_error_pkts;
+	__le64	rx_ucast_bytes;
+	__le64	rx_mcast_bytes;
+	__le64	rx_bcast_bytes;
+	__le64	tx_ucast_pkts;
+	__le64	tx_mcast_pkts;
+	__le64	tx_bcast_pkts;
+	__le64	tx_error_pkts;
+	__le64	tx_discard_pkts;
+	__le64	tx_ucast_bytes;
+	__le64	tx_mcast_bytes;
+	__le64	tx_bcast_bytes;
+	__le64	tpa_pkts;
+	__le64	tpa_bytes;
+	__le64	tpa_events;
+	__le64	tpa_aborts;
+};
+
+/* ctx_hw_stats_ext (size:1408b/176B): twenty-two little-endian 64-bit counters; the first sixteen (rx_* and tx_*) have the same names and order as in ctx_hw_stats, followed by six rx_tpa_* counters */
+struct ctx_hw_stats_ext {
+	__le64	rx_ucast_pkts;
+	__le64	rx_mcast_pkts;
+	__le64	rx_bcast_pkts;
+	__le64	rx_discard_pkts;
+	__le64	rx_error_pkts;
+	__le64	rx_ucast_bytes;
+	__le64	rx_mcast_bytes;
+	__le64	rx_bcast_bytes;
+	__le64	tx_ucast_pkts;
+	__le64	tx_mcast_pkts;
+	__le64	tx_bcast_pkts;
+	__le64	tx_error_pkts;
+	__le64	tx_discard_pkts;
+	__le64	tx_ucast_bytes;
+	__le64	tx_mcast_bytes;
+	__le64	tx_bcast_bytes;
+	__le64	rx_tpa_eligible_pkt;
+	__le64	rx_tpa_eligible_bytes;
+	__le64	rx_tpa_pkt;
+	__le64	rx_tpa_bytes;
+	__le64	rx_tpa_errors;
+	__le64	rx_tpa_events;
+};
+
+/* hwrm_stat_ctx_alloc_input (size:384b/48B): 16-byte req_type..resp_addr prefix, a 64-bit stats_dma_addr with a 16-bit stats_dma_length, update_period_ms, two flag fields, steering_tag, stat_ctx_id and alloc_seq_id, then 6 unused bytes */
+struct hwrm_stat_ctx_alloc_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le64	stats_dma_addr;
+	__le32	update_period_ms;
+	u8	stat_ctx_flags; /* 8-bit flags; distinct from the 16-bit flags field further down */
+	#define STAT_CTX_ALLOC_REQ_STAT_CTX_FLAGS_ROCE             0x1UL
+	#define STAT_CTX_ALLOC_REQ_STAT_CTX_FLAGS_DUP_HOST_BUF     0x2UL
+	u8	unused_0;
+	__le16	stats_dma_length;
+	__le16	flags;
+	#define STAT_CTX_ALLOC_REQ_FLAGS_STEERING_TAG_VALID     0x1UL
+	__le16	steering_tag; /* NOTE(review): presumably only meaningful when STEERING_TAG_VALID is set in flags - confirm */
+	__le32	stat_ctx_id;
+	__le16	alloc_seq_id;
+	u8	unused_1[6];
+};
+
+/* hwrm_stat_ctx_alloc_output (size:128b/16B): four 16-bit fields, a 32-bit stat_ctx_id, 3 unused bytes, then valid as the last byte */
+struct hwrm_stat_ctx_alloc_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le32	stat_ctx_id;
+	u8	unused_0[3];
+	u8	valid;
+};
+
+/* hwrm_stat_ctx_free_input (size:192b/24B): 16-byte req_type..resp_addr prefix, a 32-bit stat_ctx_id, then 4 unused bytes */
+struct hwrm_stat_ctx_free_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	stat_ctx_id;
+	u8	unused_0[4];
+};
+
+/* hwrm_stat_ctx_free_output (size:128b/16B): four 16-bit fields, a 32-bit stat_ctx_id, 3 unused bytes, then valid as the last byte; same layout as hwrm_stat_ctx_alloc_output */
+struct hwrm_stat_ctx_free_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le32	stat_ctx_id;
+	u8	unused_0[3];
+	u8	valid;
+};
+
+/* hwrm_stat_ctx_query_input (size:192b/24B): 16-byte req_type..resp_addr prefix, a 32-bit stat_ctx_id, a one-byte flags field with a single defined bit (COUNTER_MASK), then 3 unused bytes */
+struct hwrm_stat_ctx_query_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	stat_ctx_id;
+	u8	flags;
+	#define STAT_CTX_QUERY_REQ_FLAGS_COUNTER_MASK     0x1UL
+	u8	unused_0[3];
+};
+
+/* hwrm_stat_ctx_query_output (size:1408b/176B): four 16-bit fields, then twenty 64-bit counters - 8 tx, 8 rx, 4 rx_agg; unlike struct ctx_hw_stats, tx precedes rx here and tx lists discard before error */
+struct hwrm_stat_ctx_query_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le64	tx_ucast_pkts;
+	__le64	tx_mcast_pkts;
+	__le64	tx_bcast_pkts;
+	__le64	tx_discard_pkts;
+	__le64	tx_error_pkts;
+	__le64	tx_ucast_bytes;
+	__le64	tx_mcast_bytes;
+	__le64	tx_bcast_bytes;
+	__le64	rx_ucast_pkts;
+	__le64	rx_mcast_pkts;
+	__le64	rx_bcast_pkts;
+	__le64	rx_discard_pkts;
+	__le64	rx_error_pkts;
+	__le64	rx_ucast_bytes;
+	__le64	rx_mcast_bytes;
+	__le64	rx_bcast_bytes;
+	__le64	rx_agg_pkts;
+	__le64	rx_agg_bytes;
+	__le64	rx_agg_events;
+	__le64	rx_agg_aborts;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_stat_ext_ctx_query_input (size:192b/24B): 16-byte req_type..resp_addr prefix, a 32-bit stat_ctx_id, a one-byte flags field with a single defined bit (COUNTER_MASK), then 3 unused bytes */
+struct hwrm_stat_ext_ctx_query_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	stat_ctx_id;
+	u8	flags;
+	#define STAT_EXT_CTX_QUERY_REQ_FLAGS_COUNTER_MASK     0x1UL
+	u8	unused_0[3];
+};
+
+/* hwrm_stat_ext_ctx_query_output (size:1536b/192B): four 16-bit fields, then twenty-two 64-bit counters with the same names and order as struct ctx_hw_stats_ext, 7 unused bytes, then valid as the last byte */
+struct hwrm_stat_ext_ctx_query_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le64	rx_ucast_pkts;
+	__le64	rx_mcast_pkts;
+	__le64	rx_bcast_pkts;
+	__le64	rx_discard_pkts;
+	__le64	rx_error_pkts;
+	__le64	rx_ucast_bytes;
+	__le64	rx_mcast_bytes;
+	__le64	rx_bcast_bytes;
+	__le64	tx_ucast_pkts;
+	__le64	tx_mcast_pkts;
+	__le64	tx_bcast_pkts;
+	__le64	tx_error_pkts;
+	__le64	tx_discard_pkts;
+	__le64	tx_ucast_bytes;
+	__le64	tx_mcast_bytes;
+	__le64	tx_bcast_bytes;
+	__le64	rx_tpa_eligible_pkt;
+	__le64	rx_tpa_eligible_bytes;
+	__le64	rx_tpa_pkt;
+	__le64	rx_tpa_bytes;
+	__le64	rx_tpa_errors;
+	__le64	rx_tpa_events;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_stat_ctx_clr_stats_input (size:192b/24B): 16B request header (req_type..resp_addr), stat_ctx_id, 4B padding */
+struct hwrm_stat_ctx_clr_stats_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	stat_ctx_id;
+	u8	unused_0[4];
+};
+
+/* hwrm_stat_ctx_clr_stats_output (size:128b/16B): 8B response header (error_code..resp_len), 7B padding, valid byte; no payload fields */
+struct hwrm_stat_ctx_clr_stats_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_pcie_qstats_input (size:256b/32B): 16B request header (req_type..resp_addr), pcie_stat_size, 6B padding, 64-bit pcie_stat_host_addr */
+struct hwrm_pcie_qstats_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	pcie_stat_size;
+	u8	unused_0[6];
+	__le64	pcie_stat_host_addr;
+};
+
+/* hwrm_pcie_qstats_output (size:128b/16B): 8B response header (error_code..resp_len), pcie_stat_size, 5B padding, valid byte */
+struct hwrm_pcie_qstats_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le16	pcie_stat_size;
+	u8	unused_0[5];
+	u8	valid;
+};
+
+/* pcie_ctx_hw_stats (size:768b/96B): ten 64-bit fields plus a 4 x 32-bit pcie_ltssm_histogram; carries no request/response header fields */
+struct pcie_ctx_hw_stats {
+	__le64	pcie_pl_signal_integrity;
+	__le64	pcie_dl_signal_integrity;
+	__le64	pcie_tl_signal_integrity;
+	__le64	pcie_link_integrity;
+	__le64	pcie_tx_traffic_rate;
+	__le64	pcie_rx_traffic_rate;
+	__le64	pcie_tx_dllp_statistics;
+	__le64	pcie_rx_dllp_statistics;
+	__le64	pcie_equalization_time;
+	__le32	pcie_ltssm_histogram[4];
+	__le64	pcie_recovery_histogram;
+};
+
+/* pcie_ctx_hw_stats_v2 (size:4096b/512B): first 96B repeat the members of pcie_ctx_hw_stats in the same order; the remaining 416B add credit/latency histograms, completion times and headers, and packet counts */
+struct pcie_ctx_hw_stats_v2 {
+	__le64	pcie_pl_signal_integrity;
+	__le64	pcie_dl_signal_integrity;
+	__le64	pcie_tl_signal_integrity;
+	__le64	pcie_link_integrity;
+	__le64	pcie_tx_traffic_rate;
+	__le64	pcie_rx_traffic_rate;
+	__le64	pcie_tx_dllp_statistics;
+	__le64	pcie_rx_dllp_statistics;
+	__le64	pcie_equalization_time;
+	__le32	pcie_ltssm_histogram[4];
+	__le64	pcie_recovery_histogram;
+	__le32	pcie_tl_credit_nph_histogram[8];
+	__le32	pcie_tl_credit_ph_histogram[8];
+	__le32	pcie_tl_credit_pd_histogram[8];
+	__le32	pcie_cmpl_latest_times[4];
+	__le32	pcie_cmpl_longest_time;
+	__le32	pcie_cmpl_shortest_time;
+	__le32	unused_0[2];
+	__le32	pcie_cmpl_latest_headers[4][4];
+	__le32	pcie_cmpl_longest_headers[4][4];
+	__le32	pcie_cmpl_shortest_headers[4][4];
+	__le32	pcie_wr_latency_histogram[12];
+	__le32	pcie_wr_latency_all_normal_count;
+	__le32	unused_1;
+	__le64	pcie_posted_packet_count;
+	__le64	pcie_non_posted_packet_count;
+	__le64	pcie_other_packet_count;
+	__le64	pcie_blocked_packet_count;
+	__le64	pcie_cmpl_packet_count;
+};
+
+/* hwrm_stat_generic_qstats_input (size:256b/32B): 16B request header, generic_stat_size, flags (bit 0: COUNTER_MASK), 5B padding, 64-bit generic_stat_host_addr */
+struct hwrm_stat_generic_qstats_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	generic_stat_size;
+	u8	flags;
+	#define STAT_GENERIC_QSTATS_REQ_FLAGS_COUNTER_MASK     0x1UL
+	u8	unused_0[5];
+	__le64	generic_stat_host_addr;
+};
+
+/* hwrm_stat_generic_qstats_output (size:128b/16B): 8B response header (error_code..resp_len), generic_stat_size, 5B padding, valid byte */
+struct hwrm_stat_generic_qstats_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le16	generic_stat_size;
+	u8	unused_0[5];
+	u8	valid;
+};
+
+/* generic_sw_hw_stats (size:1472b/184B): 23 64-bit fields - 15 pcie_*, 4 cache_miss_count_* and 4 hw_db_recov_*; no header or padding members */
+struct generic_sw_hw_stats {
+	__le64	pcie_statistics_tx_tlp;
+	__le64	pcie_statistics_rx_tlp;
+	__le64	pcie_credit_fc_hdr_posted;
+	__le64	pcie_credit_fc_hdr_nonposted;
+	__le64	pcie_credit_fc_hdr_cmpl;
+	__le64	pcie_credit_fc_data_posted;
+	__le64	pcie_credit_fc_data_nonposted;
+	__le64	pcie_credit_fc_data_cmpl;
+	__le64	pcie_credit_fc_tgt_nonposted;
+	__le64	pcie_credit_fc_tgt_data_posted;
+	__le64	pcie_credit_fc_tgt_hdr_posted;
+	__le64	pcie_credit_fc_cmpl_hdr_posted;
+	__le64	pcie_credit_fc_cmpl_data_posted;
+	__le64	pcie_cmpl_longest;
+	__le64	pcie_cmpl_shortest;
+	__le64	cache_miss_count_cfcq;
+	__le64	cache_miss_count_cfcs;
+	__le64	cache_miss_count_cfcc;
+	__le64	cache_miss_count_cfcm;
+	__le64	hw_db_recov_dbs_dropped;
+	__le64	hw_db_recov_drops_serviced;
+	__le64	hw_db_recov_dbs_recovered;
+	__le64	hw_db_recov_oo_drop_count;
+};
+
+/* hwrm_fw_reset_input (size:192b/24B): 16B request header, embedded_proc_type (0x0-0x8), selfrst_status (0x0-0x3), host_idx, flags (bits 0-1 defined), 4B padding */
+struct hwrm_fw_reset_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	u8	embedded_proc_type;
+	#define FW_RESET_REQ_EMBEDDED_PROC_TYPE_BOOT                  0x0UL
+	#define FW_RESET_REQ_EMBEDDED_PROC_TYPE_MGMT                  0x1UL
+	#define FW_RESET_REQ_EMBEDDED_PROC_TYPE_NETCTRL               0x2UL
+	#define FW_RESET_REQ_EMBEDDED_PROC_TYPE_ROCE                  0x3UL
+	#define FW_RESET_REQ_EMBEDDED_PROC_TYPE_HOST                  0x4UL
+	#define FW_RESET_REQ_EMBEDDED_PROC_TYPE_AP                    0x5UL
+	#define FW_RESET_REQ_EMBEDDED_PROC_TYPE_CHIP                  0x6UL
+	#define FW_RESET_REQ_EMBEDDED_PROC_TYPE_HOST_RESOURCE_REINIT  0x7UL
+	#define FW_RESET_REQ_EMBEDDED_PROC_TYPE_IMPACTLESS_ACTIVATION 0x8UL
+	#define FW_RESET_REQ_EMBEDDED_PROC_TYPE_LAST                 FW_RESET_REQ_EMBEDDED_PROC_TYPE_IMPACTLESS_ACTIVATION
+	u8	selfrst_status;
+	#define FW_RESET_REQ_SELFRST_STATUS_SELFRSTNONE      0x0UL
+	#define FW_RESET_REQ_SELFRST_STATUS_SELFRSTASAP      0x1UL
+	#define FW_RESET_REQ_SELFRST_STATUS_SELFRSTPCIERST   0x2UL
+	#define FW_RESET_REQ_SELFRST_STATUS_SELFRSTIMMEDIATE 0x3UL
+	#define FW_RESET_REQ_SELFRST_STATUS_LAST            FW_RESET_REQ_SELFRST_STATUS_SELFRSTIMMEDIATE
+	u8	host_idx;
+	u8	flags;
+	#define FW_RESET_REQ_FLAGS_RESET_GRACEFUL     0x1UL
+	#define FW_RESET_REQ_FLAGS_FW_ACTIVATION      0x2UL
+	u8	unused_0[4];
+};
+
+/* hwrm_fw_reset_output (size:128b/16B): 8B response header (error_code..resp_len), selfrst_status (0x0-0x3), 6B padding, valid byte */
+struct hwrm_fw_reset_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	selfrst_status;
+	#define FW_RESET_RESP_SELFRST_STATUS_SELFRSTNONE      0x0UL
+	#define FW_RESET_RESP_SELFRST_STATUS_SELFRSTASAP      0x1UL
+	#define FW_RESET_RESP_SELFRST_STATUS_SELFRSTPCIERST   0x2UL
+	#define FW_RESET_RESP_SELFRST_STATUS_SELFRSTIMMEDIATE 0x3UL
+	#define FW_RESET_RESP_SELFRST_STATUS_LAST            FW_RESET_RESP_SELFRST_STATUS_SELFRSTIMMEDIATE
+	u8	unused_0[6];
+	u8	valid;
+};
+
+/* hwrm_fw_qstatus_input (size:192b/24B): 16B request header (req_type..resp_addr), embedded_proc_type (0x0-0x6), 7B padding */
+struct hwrm_fw_qstatus_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	u8	embedded_proc_type;
+	#define FW_QSTATUS_REQ_EMBEDDED_PROC_TYPE_BOOT    0x0UL
+	#define FW_QSTATUS_REQ_EMBEDDED_PROC_TYPE_MGMT    0x1UL
+	#define FW_QSTATUS_REQ_EMBEDDED_PROC_TYPE_NETCTRL 0x2UL
+	#define FW_QSTATUS_REQ_EMBEDDED_PROC_TYPE_ROCE    0x3UL
+	#define FW_QSTATUS_REQ_EMBEDDED_PROC_TYPE_HOST    0x4UL
+	#define FW_QSTATUS_REQ_EMBEDDED_PROC_TYPE_AP      0x5UL
+	#define FW_QSTATUS_REQ_EMBEDDED_PROC_TYPE_CHIP    0x6UL
+	#define FW_QSTATUS_REQ_EMBEDDED_PROC_TYPE_LAST   FW_QSTATUS_REQ_EMBEDDED_PROC_TYPE_CHIP
+	u8	unused_0[7];
+};
+
+/* hwrm_fw_qstatus_output (size:128b/16B): 8B response header, selfrst_status (0x0-0x3), nvm_option_action_status (0x0-0x3), 5B padding, valid byte */
+struct hwrm_fw_qstatus_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	selfrst_status;
+	#define FW_QSTATUS_RESP_SELFRST_STATUS_SELFRSTNONE    0x0UL
+	#define FW_QSTATUS_RESP_SELFRST_STATUS_SELFRSTASAP    0x1UL
+	#define FW_QSTATUS_RESP_SELFRST_STATUS_SELFRSTPCIERST 0x2UL
+	#define FW_QSTATUS_RESP_SELFRST_STATUS_SELFRSTPOWER   0x3UL
+	#define FW_QSTATUS_RESP_SELFRST_STATUS_LAST          FW_QSTATUS_RESP_SELFRST_STATUS_SELFRSTPOWER
+	u8	nvm_option_action_status;
+	#define FW_QSTATUS_RESP_NVM_OPTION_ACTION_STATUS_NVMOPT_ACTION_NONE     0x0UL
+	#define FW_QSTATUS_RESP_NVM_OPTION_ACTION_STATUS_NVMOPT_ACTION_HOTRESET 0x1UL
+	#define FW_QSTATUS_RESP_NVM_OPTION_ACTION_STATUS_NVMOPT_ACTION_WARMBOOT 0x2UL
+	#define FW_QSTATUS_RESP_NVM_OPTION_ACTION_STATUS_NVMOPT_ACTION_COLDBOOT 0x3UL
+	#define FW_QSTATUS_RESP_NVM_OPTION_ACTION_STATUS_LAST                  FW_QSTATUS_RESP_NVM_OPTION_ACTION_STATUS_NVMOPT_ACTION_COLDBOOT
+	u8	unused_0[5];
+	u8	valid;
+};
+
+/* hwrm_fw_set_time_input (size:256b/32B): 16B request header, year/month/day/hour/minute/second, 1B padding, millisecond, zone, 4B padding; the ZONE_* values are plain decimal constants, not UL-suffixed hex */
+struct hwrm_fw_set_time_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	year;
+	#define FW_SET_TIME_REQ_YEAR_UNKNOWN 0x0UL
+	#define FW_SET_TIME_REQ_YEAR_LAST   FW_SET_TIME_REQ_YEAR_UNKNOWN
+	u8	month;
+	u8	day;
+	u8	hour;
+	u8	minute;
+	u8	second;
+	u8	unused_0;
+	__le16	millisecond;
+	__le16	zone;
+	#define FW_SET_TIME_REQ_ZONE_UTC     0
+	#define FW_SET_TIME_REQ_ZONE_UNKNOWN 65535
+	#define FW_SET_TIME_REQ_ZONE_LAST   FW_SET_TIME_REQ_ZONE_UNKNOWN
+	u8	unused_1[4];
+};
+
+/* hwrm_fw_set_time_output (size:128b/16B): 8B response header (error_code..resp_len), 7B padding, valid byte; no payload fields */
+struct hwrm_fw_set_time_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_struct_hdr (size:128b/16B): struct_id, len, version (0 or 1), count, subtype, next_offset, 6B padding; carries no request/response header fields */
+struct hwrm_struct_hdr {
+	__le16	struct_id;
+	#define STRUCT_HDR_STRUCT_ID_LLDP_CFG              0x41bUL
+	#define STRUCT_HDR_STRUCT_ID_DCBX_ETS              0x41dUL
+	#define STRUCT_HDR_STRUCT_ID_DCBX_PFC              0x41fUL
+	#define STRUCT_HDR_STRUCT_ID_DCBX_APP              0x421UL
+	#define STRUCT_HDR_STRUCT_ID_DCBX_FEATURE_STATE    0x422UL
+	#define STRUCT_HDR_STRUCT_ID_LLDP_GENERIC          0x424UL
+	#define STRUCT_HDR_STRUCT_ID_LLDP_DEVICE           0x426UL
+	#define STRUCT_HDR_STRUCT_ID_POWER_BKUP            0x427UL
+	#define STRUCT_HDR_STRUCT_ID_PEER_MMAP             0x429UL
+	#define STRUCT_HDR_STRUCT_ID_AFM_OPAQUE            0x1UL
+	#define STRUCT_HDR_STRUCT_ID_PORT_DESCRIPTION      0xaUL
+	#define STRUCT_HDR_STRUCT_ID_RSS_V2                0x64UL
+	#define STRUCT_HDR_STRUCT_ID_MSIX_PER_VF           0xc8UL
+	#define STRUCT_HDR_STRUCT_ID_UDCC_RTT_BUCKET_COUNT 0x12cUL
+	#define STRUCT_HDR_STRUCT_ID_UDCC_RTT_BUCKET_BOUND 0x12dUL
+	#define STRUCT_HDR_STRUCT_ID_LAST                 STRUCT_HDR_STRUCT_ID_UDCC_RTT_BUCKET_BOUND
+	__le16	len;
+	u8	version;
+	#define STRUCT_HDR_VERSION_0 0x0UL
+	#define STRUCT_HDR_VERSION_1 0x1UL
+	#define STRUCT_HDR_VERSION_LAST STRUCT_HDR_VERSION_1
+	u8	count;
+	__le16	subtype;
+	__le16	next_offset;
+	#define STRUCT_HDR_NEXT_OFFSET_LAST 0x0UL
+	u8	unused_0[6];
+};
+
+/* hwrm_struct_data_dcbx_app (size:64b/8B): protocol_id (declared __be16, unlike the __le fields of the neighbouring structs), protocol_selector (0x1-0x4), priority, valid, 3B padding */
+struct hwrm_struct_data_dcbx_app {
+	__be16	protocol_id;
+	u8	protocol_selector;
+	#define STRUCT_DATA_DCBX_APP_PROTOCOL_SELECTOR_ETHER_TYPE   0x1UL
+	#define STRUCT_DATA_DCBX_APP_PROTOCOL_SELECTOR_TCP_PORT     0x2UL
+	#define STRUCT_DATA_DCBX_APP_PROTOCOL_SELECTOR_UDP_PORT     0x3UL
+	#define STRUCT_DATA_DCBX_APP_PROTOCOL_SELECTOR_TCP_UDP_PORT 0x4UL
+	#define STRUCT_DATA_DCBX_APP_PROTOCOL_SELECTOR_LAST        STRUCT_DATA_DCBX_APP_PROTOCOL_SELECTOR_TCP_UDP_PORT
+	u8	priority;
+	u8	valid;
+	u8	unused_0[3];
+};
+
+/* hwrm_fw_set_structured_data_input (size:256b/32B): 16B request header (req_type..resp_addr), 64-bit src_data_addr, data_len, hdr_cnt, 5B padding */
+struct hwrm_fw_set_structured_data_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le64	src_data_addr;
+	__le16	data_len;
+	u8	hdr_cnt;
+	u8	unused_0[5];
+};
+
+/* hwrm_fw_set_structured_data_output (size:128b/16B): 8B response header (error_code..resp_len), 7B padding, valid byte; no payload fields */
+struct hwrm_fw_set_structured_data_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_fw_set_structured_data_cmd_err (size:64b/8B): one-byte error code (0x0-0x3) followed by 7B padding */
+struct hwrm_fw_set_structured_data_cmd_err {
+	u8	code;
+	#define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_UNKNOWN     0x0UL
+	#define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_BAD_HDR_CNT 0x1UL
+	#define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_BAD_FMT     0x2UL
+	#define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_BAD_ID      0x3UL
+	#define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_LAST       FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_BAD_ID
+	u8	unused_0[7];
+};
+
+/* hwrm_fw_get_structured_data_input (size:256b/32B): 16B request header, 64-bit dest_data_addr, data_len, structure_id, subtype, count, 1B padding; SUBTYPE_LAST aliases HOST_OPERATIONAL (0x300), not the numerically largest value ALL (0xffff) */
+struct hwrm_fw_get_structured_data_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le64	dest_data_addr;
+	__le16	data_len;
+	__le16	structure_id;
+	__le16	subtype;
+	#define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_UNUSED                  0x0UL
+	#define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_ALL                     0xffffUL
+	#define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_NEAR_BRIDGE_ADMIN       0x100UL
+	#define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_NEAR_BRIDGE_PEER        0x101UL
+	#define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_NEAR_BRIDGE_OPERATIONAL 0x102UL
+	#define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_NON_TPMR_ADMIN          0x200UL
+	#define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_NON_TPMR_PEER           0x201UL
+	#define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_NON_TPMR_OPERATIONAL    0x202UL
+	#define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_HOST_OPERATIONAL        0x300UL
+	#define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_LAST                   FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_HOST_OPERATIONAL
+	u8	count;
+	u8	unused_0;
+};
+
+/* hwrm_fw_get_structured_data_output (size:128b/16B): 8B response header (error_code..resp_len), hdr_cnt, 6B padding, valid byte */
+struct hwrm_fw_get_structured_data_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	hdr_cnt;
+	u8	unused_0[6];
+	u8	valid;
+};
+
+/* hwrm_fw_get_structured_data_cmd_err (size:64b/8B): one-byte error code (only 0x0 and 0x3 are defined) followed by 7B padding */
+struct hwrm_fw_get_structured_data_cmd_err {
+	u8	code;
+	#define FW_GET_STRUCTURED_DATA_CMD_ERR_CODE_UNKNOWN 0x0UL
+	#define FW_GET_STRUCTURED_DATA_CMD_ERR_CODE_BAD_ID  0x3UL
+	#define FW_GET_STRUCTURED_DATA_CMD_ERR_CODE_LAST   FW_GET_STRUCTURED_DATA_CMD_ERR_CODE_BAD_ID
+	u8	unused_0[7];
+};
+
+/* hwrm_fw_livepatch_query_input (size:192b/24B): 16B request header (req_type..resp_addr), fw_target (0x1 or 0x2), 7B padding */
+struct hwrm_fw_livepatch_query_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	u8	fw_target;
+	#define FW_LIVEPATCH_QUERY_REQ_FW_TARGET_COMMON_FW 0x1UL
+	#define FW_LIVEPATCH_QUERY_REQ_FW_TARGET_SECURE_FW 0x2UL
+	#define FW_LIVEPATCH_QUERY_REQ_FW_TARGET_LAST     FW_LIVEPATCH_QUERY_REQ_FW_TARGET_SECURE_FW
+	u8	unused_0[7];
+};
+
+/* hwrm_fw_livepatch_query_output (size:640b/80B): 8B response header, 32-byte install_ver and active_ver char arrays, status_flags (bits 0-1 defined), 5B padding, valid byte */
+struct hwrm_fw_livepatch_query_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	char	install_ver[32];
+	char	active_ver[32];
+	__le16	status_flags;
+	#define FW_LIVEPATCH_QUERY_RESP_STATUS_FLAGS_INSTALL     0x1UL
+	#define FW_LIVEPATCH_QUERY_RESP_STATUS_FLAGS_ACTIVE      0x2UL
+	u8	unused_0[5];
+	u8	valid;
+};
+
+/* hwrm_fw_livepatch_input (size:256b/32B): 16B request header, opcode, fw_target, loadtype (each 0x1 or 0x2), flags (no bits defined here), patch_len, 64-bit host_addr */
+struct hwrm_fw_livepatch_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	u8	opcode;
+	#define FW_LIVEPATCH_REQ_OPCODE_ACTIVATE   0x1UL
+	#define FW_LIVEPATCH_REQ_OPCODE_DEACTIVATE 0x2UL
+	#define FW_LIVEPATCH_REQ_OPCODE_LAST      FW_LIVEPATCH_REQ_OPCODE_DEACTIVATE
+	u8	fw_target;
+	#define FW_LIVEPATCH_REQ_FW_TARGET_COMMON_FW 0x1UL
+	#define FW_LIVEPATCH_REQ_FW_TARGET_SECURE_FW 0x2UL
+	#define FW_LIVEPATCH_REQ_FW_TARGET_LAST     FW_LIVEPATCH_REQ_FW_TARGET_SECURE_FW
+	u8	loadtype;
+	#define FW_LIVEPATCH_REQ_LOADTYPE_NVM_INSTALL   0x1UL
+	#define FW_LIVEPATCH_REQ_LOADTYPE_MEMORY_DIRECT 0x2UL
+	#define FW_LIVEPATCH_REQ_LOADTYPE_LAST         FW_LIVEPATCH_REQ_LOADTYPE_MEMORY_DIRECT
+	u8	flags;
+	__le32	patch_len;
+	__le64	host_addr;
+};
+
+/* hwrm_fw_livepatch_output (size:128b/16B): 8B response header (error_code..resp_len), 7B padding, valid byte; no payload fields */
+struct hwrm_fw_livepatch_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_fw_livepatch_cmd_err (size:64b/8B): one-byte error code (0x0-0x9) followed by 7B padding */
+struct hwrm_fw_livepatch_cmd_err {
+	u8	code;
+	#define FW_LIVEPATCH_CMD_ERR_CODE_UNKNOWN         0x0UL
+	#define FW_LIVEPATCH_CMD_ERR_CODE_INVALID_OPCODE  0x1UL
+	#define FW_LIVEPATCH_CMD_ERR_CODE_INVALID_TARGET  0x2UL
+	#define FW_LIVEPATCH_CMD_ERR_CODE_NOT_SUPPORTED   0x3UL
+	#define FW_LIVEPATCH_CMD_ERR_CODE_NOT_INSTALLED   0x4UL
+	#define FW_LIVEPATCH_CMD_ERR_CODE_NOT_PATCHED     0x5UL
+	#define FW_LIVEPATCH_CMD_ERR_CODE_AUTH_FAIL       0x6UL
+	#define FW_LIVEPATCH_CMD_ERR_CODE_INVALID_HEADER  0x7UL
+	#define FW_LIVEPATCH_CMD_ERR_CODE_INVALID_SIZE    0x8UL
+	#define FW_LIVEPATCH_CMD_ERR_CODE_ALREADY_PATCHED 0x9UL
+	#define FW_LIVEPATCH_CMD_ERR_CODE_LAST           FW_LIVEPATCH_CMD_ERR_CODE_ALREADY_PATCHED
+	u8	unused_0[7];
+};
+
+/* hwrm_exec_fwd_resp_input (size:1024b/128B): 16B request header, encap_request[26] (104B of 32-bit words), encap_resp_target_id, 6B padding */
+struct hwrm_exec_fwd_resp_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	encap_request[26];
+	__le16	encap_resp_target_id;
+	u8	unused_0[6];
+};
+
+/* hwrm_exec_fwd_resp_output (size:128b/16B): 8B response header (error_code..resp_len), 7B padding, valid byte; no payload fields */
+struct hwrm_exec_fwd_resp_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_reject_fwd_resp_input (size:1024b/128B): member-for-member the same layout as hwrm_exec_fwd_resp_input - 16B request header, encap_request[26], encap_resp_target_id, 6B padding */
+struct hwrm_reject_fwd_resp_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	encap_request[26];
+	__le16	encap_resp_target_id;
+	u8	unused_0[6];
+};
+
+/* hwrm_reject_fwd_resp_output (size:128b/16B): 8B response header (error_code..resp_len), 7B padding, valid byte; no payload fields */
+struct hwrm_reject_fwd_resp_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_fwd_resp_input (size:1024b/128B): 16B request header, encap_resp_target_id, encap_resp_cmpl_ring, encap_resp_len, 2 unused bytes, 64-bit encap_resp_addr, encap_resp[24] (96B of 32-bit words) */
+struct hwrm_fwd_resp_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	encap_resp_target_id;
+	__le16	encap_resp_cmpl_ring;
+	__le16	encap_resp_len;
+	u8	unused_0;
+	u8	unused_1;
+	__le64	encap_resp_addr;
+	__le32	encap_resp[24];
+};
+
+/* hwrm_fwd_resp_output (size:128b/16B): 8B response header (error_code..resp_len), 7B padding, valid byte; no payload fields */
+struct hwrm_fwd_resp_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_fwd_async_event_cmpl_input (size:320b/40B): 16B request header, encap_async_event_target_id, 6B padding, encap_async_event_cmpl[4] (16B of 32-bit words) */
+struct hwrm_fwd_async_event_cmpl_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	encap_async_event_target_id;
+	u8	unused_0[6];
+	__le32	encap_async_event_cmpl[4];
+};
+
+/* hwrm_fwd_async_event_cmpl_output (size:128b/16B): 8B response header (error_code..resp_len), 7B padding, valid byte; no payload fields */
+struct hwrm_fwd_async_event_cmpl_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_temp_monitor_query_input (size:128b/16B): consists solely of the 16B request header (req_type..resp_addr); no payload fields */
+struct hwrm_temp_monitor_query_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+};
+
+/* hwrm_temp_monitor_query_output (size:192b/24B): 8B response header, temp/phy_temp/om_temp, flags (bits 0-5 defined), temp2/phy_temp2/om_temp2, four *_threshold bytes, 4B padding, valid byte */
+struct hwrm_temp_monitor_query_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	temp;
+	u8	phy_temp;
+	u8	om_temp;
+	u8	flags;
+	#define TEMP_MONITOR_QUERY_RESP_FLAGS_TEMP_NOT_AVAILABLE             0x1UL
+	#define TEMP_MONITOR_QUERY_RESP_FLAGS_PHY_TEMP_NOT_AVAILABLE         0x2UL
+	#define TEMP_MONITOR_QUERY_RESP_FLAGS_OM_NOT_PRESENT                 0x4UL
+	#define TEMP_MONITOR_QUERY_RESP_FLAGS_OM_TEMP_NOT_AVAILABLE          0x8UL
+	#define TEMP_MONITOR_QUERY_RESP_FLAGS_EXT_TEMP_FIELDS_AVAILABLE      0x10UL
+	#define TEMP_MONITOR_QUERY_RESP_FLAGS_THRESHOLD_VALUES_AVAILABLE     0x20UL
+	u8	temp2;
+	u8	phy_temp2;
+	u8	om_temp2;
+	u8	warn_threshold;
+	u8	critical_threshold;
+	u8	fatal_threshold;
+	u8	shutdown_threshold;
+	u8	unused_0[4];
+	u8	valid;
+};
+
+/* hwrm_wol_filter_alloc_input (size:512b/64B): 16B request header, flags, enables (bits 0-5 defined), port_id, wol_type, 5B padding, mac_address[6], pattern offset/buf_size/mask_size, 4B padding, 64-bit pattern_buf_addr and pattern_mask_addr */
+struct hwrm_wol_filter_alloc_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	flags;
+	__le32	enables;
+	#define WOL_FILTER_ALLOC_REQ_ENABLES_MAC_ADDRESS           0x1UL
+	#define WOL_FILTER_ALLOC_REQ_ENABLES_PATTERN_OFFSET        0x2UL
+	#define WOL_FILTER_ALLOC_REQ_ENABLES_PATTERN_BUF_SIZE      0x4UL
+	#define WOL_FILTER_ALLOC_REQ_ENABLES_PATTERN_BUF_ADDR      0x8UL
+	#define WOL_FILTER_ALLOC_REQ_ENABLES_PATTERN_MASK_ADDR     0x10UL
+	#define WOL_FILTER_ALLOC_REQ_ENABLES_PATTERN_MASK_SIZE     0x20UL
+	__le16	port_id;
+	u8	wol_type;
+	#define WOL_FILTER_ALLOC_REQ_WOL_TYPE_MAGICPKT 0x0UL
+	#define WOL_FILTER_ALLOC_REQ_WOL_TYPE_BMP      0x1UL
+	#define WOL_FILTER_ALLOC_REQ_WOL_TYPE_INVALID  0xffUL
+	#define WOL_FILTER_ALLOC_REQ_WOL_TYPE_LAST    WOL_FILTER_ALLOC_REQ_WOL_TYPE_INVALID
+	u8	unused_0[5];
+	u8	mac_address[6];
+	__le16	pattern_offset;
+	__le16	pattern_buf_size;
+	__le16	pattern_mask_size;
+	u8	unused_1[4];
+	__le64	pattern_buf_addr;
+	__le64	pattern_mask_addr;
+};
+
+/* hwrm_wol_filter_alloc_output (size:128b/16B): 8B response header (error_code..resp_len), wol_filter_id, 6B padding, valid byte */
+struct hwrm_wol_filter_alloc_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	wol_filter_id;
+	u8	unused_0[6];
+	u8	valid;
+};
+
+/* hwrm_wol_filter_free_input (size:256b/32B): 16B request header, flags (bit 0: FREE_ALL_WOL_FILTERS), enables (bit 0: WOL_FILTER_ID), port_id, wol_filter_id, 5B padding */
+struct hwrm_wol_filter_free_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	flags;
+	#define WOL_FILTER_FREE_REQ_FLAGS_FREE_ALL_WOL_FILTERS     0x1UL
+	__le32	enables;
+	#define WOL_FILTER_FREE_REQ_ENABLES_WOL_FILTER_ID     0x1UL
+	__le16	port_id;
+	u8	wol_filter_id;
+	u8	unused_0[5];
+};
+
+/* hwrm_wol_filter_free_output (size:128b/16B): 8B response header (error_code..resp_len), 7B padding, valid byte; no payload fields */
+struct hwrm_wol_filter_free_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_wol_filter_qcfg_input (size:448b/56B): 16B request header, port_id, handle, 4B padding, then two (64-bit address, 16-bit size, 6B padding) groups for pattern_buf and pattern_mask */
+struct hwrm_wol_filter_qcfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	port_id;
+	__le16	handle;
+	u8	unused_0[4];
+	__le64	pattern_buf_addr;
+	__le16	pattern_buf_size;
+	u8	unused_1[6];
+	__le64	pattern_mask_addr;
+	__le16	pattern_mask_size;
+	u8	unused_2[6];
+};
+
+/* hwrm_wol_filter_qcfg_output (size:256b/32B): 8B response header, next_handle, wol_filter_id, wol_type, 32-bit unused_0, mac_address[6], pattern_offset, pattern_size, pattern_mask_size, 3B padding, valid byte */
+struct hwrm_wol_filter_qcfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le16	next_handle;
+	u8	wol_filter_id;
+	u8	wol_type;
+	#define WOL_FILTER_QCFG_RESP_WOL_TYPE_MAGICPKT 0x0UL
+	#define WOL_FILTER_QCFG_RESP_WOL_TYPE_BMP      0x1UL
+	#define WOL_FILTER_QCFG_RESP_WOL_TYPE_INVALID  0xffUL
+	#define WOL_FILTER_QCFG_RESP_WOL_TYPE_LAST    WOL_FILTER_QCFG_RESP_WOL_TYPE_INVALID
+	__le32	unused_0;
+	u8	mac_address[6];
+	__le16	pattern_offset;
+	__le16	pattern_size;
+	__le16	pattern_mask_size;
+	u8	unused_1[3];
+	u8	valid;
+};
+
+/* hwrm_wol_reason_qcfg_input (size:320b/40B): 16B request header, port_id, 6B padding, 64-bit wol_pkt_buf_addr, wol_pkt_buf_size, 6B padding */
+struct hwrm_wol_reason_qcfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	port_id;
+	u8	unused_0[6];
+	__le64	wol_pkt_buf_addr;
+	__le16	wol_pkt_buf_size;
+	u8	unused_1[6];
+};
+
+/* hwrm_wol_reason_qcfg_output (size:128b/16B): 8B response header, wol_filter_id, wol_reason (0x0, 0x1 or 0xff), wol_pkt_len, 4B padding, valid byte */
+struct hwrm_wol_reason_qcfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	wol_filter_id;
+	u8	wol_reason;
+	#define WOL_REASON_QCFG_RESP_WOL_REASON_MAGICPKT 0x0UL
+	#define WOL_REASON_QCFG_RESP_WOL_REASON_BMP      0x1UL
+	#define WOL_REASON_QCFG_RESP_WOL_REASON_INVALID  0xffUL
+	#define WOL_REASON_QCFG_RESP_WOL_REASON_LAST    WOL_REASON_QCFG_RESP_WOL_REASON_INVALID
+	u8	wol_pkt_len;
+	u8	unused_0[4];
+	u8	valid;
+};
+
+/* hwrm_dbg_read_direct_input (size:256b/32B): 16B request header (req_type..resp_addr), 64-bit host_dest_addr, read_addr, read_len32; no padding members */
+struct hwrm_dbg_read_direct_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le64	host_dest_addr;
+	__le32	read_addr;
+	__le32	read_len32;
+};
+
+/* hwrm_dbg_read_direct_output (size:128b/16B): 8B response header (error_code..resp_len), crc32, 3B padding, valid byte */
+struct hwrm_dbg_read_direct_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le32	crc32;
+	u8	unused_0[3];
+	u8	valid;
+};
+
+/* hwrm_dbg_qcaps_input (size:192b/24B): 16B request header (req_type..resp_addr), fid, 6B padding */
+struct hwrm_dbg_qcaps_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	fid;
+	u8	unused_0[6];
+};
+
+/* hwrm_dbg_qcaps_output (size:192b/24B): 8B response header, fid, 2B padding, coredump_component_disable_caps (bit 0: NVRAM), flags (bits 0-7 defined), 3B padding, valid byte */
+struct hwrm_dbg_qcaps_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le16	fid;
+	u8	unused_0[2];
+	__le32	coredump_component_disable_caps;
+	#define DBG_QCAPS_RESP_COREDUMP_COMPONENT_DISABLE_CAPS_NVRAM     0x1UL
+	__le32	flags;
+	#define DBG_QCAPS_RESP_FLAGS_CRASHDUMP_NVM             0x1UL
+	#define DBG_QCAPS_RESP_FLAGS_CRASHDUMP_HOST_DDR        0x2UL
+	#define DBG_QCAPS_RESP_FLAGS_CRASHDUMP_SOC_DDR         0x4UL
+	#define DBG_QCAPS_RESP_FLAGS_USEQ                      0x8UL
+	#define DBG_QCAPS_RESP_FLAGS_COREDUMP_HOST_DDR         0x10UL
+	#define DBG_QCAPS_RESP_FLAGS_COREDUMP_HOST_CAPTURE     0x20UL
+	#define DBG_QCAPS_RESP_FLAGS_PTRACE                    0x40UL
+	#define DBG_QCAPS_RESP_FLAGS_REG_ACCESS_RESTRICTED     0x80UL
+	u8	unused_1[3];
+	u8	valid;
+};
+
+/* hwrm_dbg_qcfg_input (size:192b/24B): 16B request header, fid, flags (bits 1:0 = CRASHDUMP_SIZE_FOR_DEST selector, values 0-2), coredump_component_disable_flags (bit 0: NVRAM) */
+struct hwrm_dbg_qcfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	fid;
+	__le16	flags;
+	#define DBG_QCFG_REQ_FLAGS_CRASHDUMP_SIZE_FOR_DEST_MASK         0x3UL
+	#define DBG_QCFG_REQ_FLAGS_CRASHDUMP_SIZE_FOR_DEST_SFT          0
+	#define DBG_QCFG_REQ_FLAGS_CRASHDUMP_SIZE_FOR_DEST_DEST_NVM       0x0UL
+	#define DBG_QCFG_REQ_FLAGS_CRASHDUMP_SIZE_FOR_DEST_DEST_HOST_DDR  0x1UL
+	#define DBG_QCFG_REQ_FLAGS_CRASHDUMP_SIZE_FOR_DEST_DEST_SOC_DDR   0x2UL
+	#define DBG_QCFG_REQ_FLAGS_CRASHDUMP_SIZE_FOR_DEST_LAST          DBG_QCFG_REQ_FLAGS_CRASHDUMP_SIZE_FOR_DEST_DEST_SOC_DDR
+	__le32	coredump_component_disable_flags;
+	#define DBG_QCFG_REQ_COREDUMP_COMPONENT_DISABLE_FLAGS_NVRAM     0x1UL
+};
+
+/* hwrm_dbg_qcfg_output (size:256b/32B): 8B response header, fid, 2B padding, coredump_size, flags (bits 0-5 defined), async_cmpl_ring, 2B padding, crashdump_size, 3B padding, valid byte; padding members are unused_0/2/3 (there is no unused_1) */
+struct hwrm_dbg_qcfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le16	fid;
+	u8	unused_0[2];
+	__le32	coredump_size;
+	__le32	flags;
+	#define DBG_QCFG_RESP_FLAGS_UART_LOG               0x1UL
+	#define DBG_QCFG_RESP_FLAGS_UART_LOG_SECONDARY     0x2UL
+	#define DBG_QCFG_RESP_FLAGS_FW_TRACE               0x4UL
+	#define DBG_QCFG_RESP_FLAGS_FW_TRACE_SECONDARY     0x8UL
+	#define DBG_QCFG_RESP_FLAGS_DEBUG_NOTIFY           0x10UL
+	#define DBG_QCFG_RESP_FLAGS_JTAG_DEBUG             0x20UL
+	__le16	async_cmpl_ring;
+	u8	unused_2[2];
+	__le32	crashdump_size;
+	u8	unused_3[3];
+	u8	valid;
+};
+
+/* hwrm_dbg_crashdump_medium_cfg_input (size:320b/40B): 16B request header, output_dest_flags, pg_size_lvl (bits 1:0 LVL, bits 4:2 PG_SIZE, bits 15:5 UNUSED11), size, coredump_component_disable_flags, 32-bit unused_0, 64-bit pbl */
+struct hwrm_dbg_crashdump_medium_cfg_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	output_dest_flags;
+	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_TYPE_DDR     0x1UL
+	__le16	pg_size_lvl;
+	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_LVL_MASK      0x3UL
+	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_LVL_SFT       0
+	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_LVL_LVL_0       0x0UL
+	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_LVL_LVL_1       0x1UL
+	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_LVL_LVL_2       0x2UL
+	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_LVL_LAST       DBG_CRASHDUMP_MEDIUM_CFG_REQ_LVL_LVL_2
+	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_PG_SIZE_MASK  0x1cUL
+	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_PG_SIZE_SFT   2
+	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_PG_SIZE_PG_4K   (0x0UL << 2)
+	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_PG_SIZE_PG_8K   (0x1UL << 2)
+	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_PG_SIZE_PG_64K  (0x2UL << 2)
+	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_PG_SIZE_PG_2M   (0x3UL << 2)
+	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_PG_SIZE_PG_8M   (0x4UL << 2)
+	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_PG_SIZE_PG_1G   (0x5UL << 2)
+	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_PG_SIZE_LAST   DBG_CRASHDUMP_MEDIUM_CFG_REQ_PG_SIZE_PG_1G
+	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_UNUSED11_MASK 0xffe0UL
+	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_UNUSED11_SFT  5
+	__le32	size;
+	__le32	coredump_component_disable_flags;
+	#define DBG_CRASHDUMP_MEDIUM_CFG_REQ_NVRAM     0x1UL
+	__le32	unused_0;
+	__le64	pbl;
+};
+
+/* hwrm_dbg_crashdump_medium_cfg_output (size:128b/16B): 8B response header (error_code..resp_len), 7B padding (named unused_1 here), valid byte */
+struct hwrm_dbg_crashdump_medium_cfg_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_1[7];
+	u8	valid;
+};
+
+/* coredump_segment_record (size:128b/16B): component_id, segment_id, max_instances, version_hi/version_low, seg_flags, compress_flags (bit 0: SFLAG_COMPRESSED_ZLIB), 2B padding, segment_len */
+struct coredump_segment_record {
+	__le16	component_id;
+	__le16	segment_id;
+	__le16	max_instances;
+	u8	version_hi;
+	u8	version_low;
+	u8	seg_flags;
+	u8	compress_flags;
+	#define SFLAG_COMPRESSED_ZLIB     0x1UL
+	u8	unused_0[2];
+	__le32	segment_len;
+};
+
+/* hwrm_dbg_coredump_list_input (size:256b/32B): 16B request header, 64-bit host_dest_addr, host_buf_len, seq_no, flags (bit 0: CRASHDUMP), 1B padding */
+struct hwrm_dbg_coredump_list_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le64	host_dest_addr;
+	__le32	host_buf_len;
+	__le16	seq_no;
+	u8	flags;
+	#define DBG_COREDUMP_LIST_REQ_FLAGS_CRASHDUMP     0x1UL
+	u8	unused_0[1];
+};
+
+/* hwrm_dbg_coredump_list_output (size:128b/16B): 8B response header, flags (bit 0: MORE), 1B padding, total_segments, data_len, 1B padding, valid byte */
+struct hwrm_dbg_coredump_list_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	flags;
+	#define DBG_COREDUMP_LIST_RESP_FLAGS_MORE     0x1UL
+	u8	unused_0;
+	__le16	total_segments;
+	__le16	data_len;
+	u8	unused_1;
+	u8	valid;
+};
+
+/* hwrm_dbg_coredump_initiate_input (size:256b/32B): 16B request header, component_id, segment_id, instance, 16-bit unused_0, seg_flags (bits 0-2 defined), 7B padding */
+struct hwrm_dbg_coredump_initiate_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	component_id;
+	__le16	segment_id;
+	__le16	instance;
+	__le16	unused_0;
+	u8	seg_flags;
+	#define DBG_COREDUMP_INITIATE_REQ_SEG_FLAGS_LIVE_DATA                0x1UL
+	#define DBG_COREDUMP_INITIATE_REQ_SEG_FLAGS_CRASH_DATA               0x2UL
+	#define DBG_COREDUMP_INITIATE_REQ_SEG_FLAGS_COLLECT_CTX_L1_CACHE     0x4UL
+	u8	unused_1[7];
+};
+
+/* hwrm_dbg_coredump_initiate_output (size:128b/16B): 8B response header (error_code..resp_len), 7B padding, valid byte; no payload fields */
+struct hwrm_dbg_coredump_initiate_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* coredump_data_hdr (size:128b/16B): four 32-bit words; flags_length packs ACTUAL_LEN in bits 23:0 and the INDIRECT_ACCESS flag in bit 24 */
+struct coredump_data_hdr {
+	__le32	address;
+	__le32	flags_length;
+	#define COREDUMP_DATA_HDR_FLAGS_LENGTH_ACTUAL_LEN_MASK     0xffffffUL
+	#define COREDUMP_DATA_HDR_FLAGS_LENGTH_ACTUAL_LEN_SFT      0
+	#define COREDUMP_DATA_HDR_FLAGS_LENGTH_INDIRECT_ACCESS     0x1000000UL
+	__le32	instance;
+	__le32	next_offset;
+};
+
+/* hwrm_dbg_coredump_retrieve_input (size:448b/56B): 16B request header, 64-bit host_dest_addr, host_buf_len, component_id, segment_id, instance, seg_flags and seq_no, interleaved with padding members unused_0..unused_5 */
+struct hwrm_dbg_coredump_retrieve_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le64	host_dest_addr;
+	__le32	host_buf_len;
+	__le32	unused_0;
+	__le16	component_id;
+	__le16	segment_id;
+	__le16	instance;
+	__le16	unused_1;
+	u8	seg_flags;
+	u8	unused_2;
+	__le16	unused_3;
+	__le32	unused_4;
+	__le32	seq_no;
+	__le32	unused_5;
+};
+
+/* hwrm_dbg_coredump_retrieve_output (size:128b/16B): 8B response header, flags (bit 0: MORE), 1B padding, data_len, 3B padding, valid byte */
+struct hwrm_dbg_coredump_retrieve_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	flags;
+	#define DBG_COREDUMP_RETRIEVE_RESP_FLAGS_MORE     0x1UL
+	u8	unused_0;
+	__le16	data_len;
+	u8	unused_1[3];
+	u8	valid;
+};
+
+/* hwrm_dbg_ring_info_get_input (size:192b/24B): 16B request header (req_type..resp_addr), ring_type (0x0-0x3), 3B padding, fw_ring_id */
+struct hwrm_dbg_ring_info_get_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	u8	ring_type;
+	#define DBG_RING_INFO_GET_REQ_RING_TYPE_L2_CMPL 0x0UL
+	#define DBG_RING_INFO_GET_REQ_RING_TYPE_TX      0x1UL
+	#define DBG_RING_INFO_GET_REQ_RING_TYPE_RX      0x2UL
+	#define DBG_RING_INFO_GET_REQ_RING_TYPE_NQ      0x3UL
+	#define DBG_RING_INFO_GET_REQ_RING_TYPE_LAST   DBG_RING_INFO_GET_REQ_RING_TYPE_NQ
+	u8	unused_0[3];
+	__le32	fw_ring_id;
+};
+
+/* hwrm_dbg_ring_info_get_output (size:192b/24B): 8B response header, producer_index, consumer_index, cag_vector_ctrl, st_tag, 1B padding, valid byte */
+struct hwrm_dbg_ring_info_get_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le32	producer_index;
+	__le32	consumer_index;
+	__le32	cag_vector_ctrl;
+	__le16	st_tag;
+	u8	unused_0;
+	u8	valid;
+};
+
+/* hwrm_dbg_log_buffer_flush_input (size:192b/24B): 16B request header, type (0x0-0xb), 2B padding (named unused_1), flags (bit 0: FLUSH_ALL_BUFFERS) */
+struct hwrm_dbg_log_buffer_flush_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	type;
+	#define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_SRT_TRACE           0x0UL
+	#define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_SRT2_TRACE          0x1UL
+	#define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_CRT_TRACE           0x2UL
+	#define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_CRT2_TRACE          0x3UL
+	#define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_RIGP0_TRACE         0x4UL
+	#define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_L2_HWRM_TRACE       0x5UL
+	#define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_ROCE_HWRM_TRACE     0x6UL
+	#define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_CA0_TRACE           0x7UL
+	#define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_CA1_TRACE           0x8UL
+	#define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_CA2_TRACE           0x9UL
+	#define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_RIGP1_TRACE         0xaUL
+	#define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_AFM_KONG_HWRM_TRACE 0xbUL
+	#define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_LAST               DBG_LOG_BUFFER_FLUSH_REQ_TYPE_AFM_KONG_HWRM_TRACE
+	u8	unused_1[2];
+	__le32	flags;
+	#define DBG_LOG_BUFFER_FLUSH_REQ_FLAGS_FLUSH_ALL_BUFFERS     0x1UL
+};
+
+/* hwrm_dbg_log_buffer_flush_output (size:128b/16B) */
+struct hwrm_dbg_log_buffer_flush_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le32	current_buffer_offset;
+	u8	unused_1[3];
+	u8	valid;
+};
+
+/* hwrm_nvm_read_input (size:320b/40B) */
+struct hwrm_nvm_read_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le64	host_dest_addr;
+	__le16	dir_idx;
+	u8	unused_0[2];
+	__le32	offset;
+	__le32	len;
+	u8	unused_1[4];
+};
+
+/* hwrm_nvm_read_output (size:128b/16B) */
+struct hwrm_nvm_read_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_nvm_get_dir_entries_input (size:192b/24B) */
+struct hwrm_nvm_get_dir_entries_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le64	host_dest_addr;
+};
+
+/* hwrm_nvm_get_dir_entries_output (size:128b/16B) */
+struct hwrm_nvm_get_dir_entries_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_nvm_get_dir_info_input (size:128b/16B) */
+struct hwrm_nvm_get_dir_info_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+};
+
+/* hwrm_nvm_get_dir_info_output (size:192b/24B) */
+struct hwrm_nvm_get_dir_info_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le32	entries;
+	__le32	entry_length;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_nvm_write_input (size:448b/56B) */
+struct hwrm_nvm_write_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le64	host_src_addr;
+	__le16	dir_type;
+	__le16	dir_ordinal;
+	__le16	dir_ext;
+	__le16	dir_attr;
+	__le32	dir_data_length;
+	__le16	option;
+	__le16	flags;
+	#define NVM_WRITE_REQ_FLAGS_KEEP_ORIG_ACTIVE_IMG     0x1UL
+	#define NVM_WRITE_REQ_FLAGS_BATCH_MODE               0x2UL
+	#define NVM_WRITE_REQ_FLAGS_BATCH_LAST               0x4UL
+	#define NVM_WRITE_REQ_FLAGS_SKIP_CRID_CHECK          0x8UL
+	__le32	dir_item_length;
+	__le32	offset;
+	__le32	len;
+	__le32	unused_0;
+};
+
+/* hwrm_nvm_write_output (size:128b/16B) */
+struct hwrm_nvm_write_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le32	dir_item_length;
+	__le16	dir_idx;
+	u8	unused_0;
+	u8	valid;
+};
+
+/* hwrm_nvm_write_cmd_err (size:64b/8B) */
+struct hwrm_nvm_write_cmd_err {
+	u8	code;
+	#define NVM_WRITE_CMD_ERR_CODE_UNKNOWN  0x0UL
+	#define NVM_WRITE_CMD_ERR_CODE_FRAG_ERR 0x1UL
+	#define NVM_WRITE_CMD_ERR_CODE_NO_SPACE 0x2UL
+	#define NVM_WRITE_CMD_ERR_CODE_LAST    NVM_WRITE_CMD_ERR_CODE_NO_SPACE
+	u8	unused_0[7];
+};
+
+/* hwrm_nvm_modify_input (size:320b/40B) */
+struct hwrm_nvm_modify_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le64	host_src_addr;
+	__le16	dir_idx;
+	__le16	flags;
+	#define NVM_MODIFY_REQ_FLAGS_BATCH_MODE     0x1UL
+	#define NVM_MODIFY_REQ_FLAGS_BATCH_LAST     0x2UL
+	__le32	offset;
+	__le32	len;
+	u8	unused_1[4];
+};
+
+/* hwrm_nvm_modify_output (size:128b/16B) */
+struct hwrm_nvm_modify_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_nvm_find_dir_entry_input (size:256b/32B) */
+struct hwrm_nvm_find_dir_entry_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	enables;
+	#define NVM_FIND_DIR_ENTRY_REQ_ENABLES_DIR_IDX_VALID     0x1UL
+	__le16	dir_idx;
+	__le16	dir_type;
+	__le16	dir_ordinal;
+	__le16	dir_ext;
+	u8	opt_ordinal;
+	#define NVM_FIND_DIR_ENTRY_REQ_OPT_ORDINAL_MASK 0x3UL
+	#define NVM_FIND_DIR_ENTRY_REQ_OPT_ORDINAL_SFT 0
+	#define NVM_FIND_DIR_ENTRY_REQ_OPT_ORDINAL_EQ    0x0UL
+	#define NVM_FIND_DIR_ENTRY_REQ_OPT_ORDINAL_GE    0x1UL
+	#define NVM_FIND_DIR_ENTRY_REQ_OPT_ORDINAL_GT    0x2UL
+	#define NVM_FIND_DIR_ENTRY_REQ_OPT_ORDINAL_LAST NVM_FIND_DIR_ENTRY_REQ_OPT_ORDINAL_GT
+	u8	unused_0[3];
+};
+
+/* hwrm_nvm_find_dir_entry_output (size:256b/32B) */
+struct hwrm_nvm_find_dir_entry_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le32	dir_item_length;
+	__le32	dir_data_length;
+	__le32	fw_ver;
+	__le16	dir_ordinal;
+	__le16	dir_idx;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_nvm_erase_dir_entry_input (size:192b/24B) */
+struct hwrm_nvm_erase_dir_entry_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	dir_idx;
+	u8	unused_0[6];
+};
+
+/* hwrm_nvm_erase_dir_entry_output (size:128b/16B) */
+struct hwrm_nvm_erase_dir_entry_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_nvm_get_dev_info_input (size:192b/24B) */
+struct hwrm_nvm_get_dev_info_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	u8	flags;
+	#define NVM_GET_DEV_INFO_REQ_FLAGS_SECURITY_SOC_NVM     0x1UL
+	u8	unused_0[7];
+};
+
+/* hwrm_nvm_get_dev_info_output (size:768b/96B) */
+struct hwrm_nvm_get_dev_info_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le16	manufacturer_id;
+	__le16	device_id;
+	__le32	sector_size;
+	__le32	nvram_size;
+	__le32	reserved_size;
+	__le32	available_size;
+	u8	nvm_cfg_ver_maj;
+	u8	nvm_cfg_ver_min;
+	u8	nvm_cfg_ver_upd;
+	u8	flags;
+	#define NVM_GET_DEV_INFO_RESP_FLAGS_FW_VER_VALID     0x1UL
+	char	pkg_name[16];
+	__le16	hwrm_fw_major;
+	__le16	hwrm_fw_minor;
+	__le16	hwrm_fw_build;
+	__le16	hwrm_fw_patch;
+	__le16	mgmt_fw_major;
+	__le16	mgmt_fw_minor;
+	__le16	mgmt_fw_build;
+	__le16	mgmt_fw_patch;
+	__le16	roce_fw_major;
+	__le16	roce_fw_minor;
+	__le16	roce_fw_build;
+	__le16	roce_fw_patch;
+	__le16	netctrl_fw_major;
+	__le16	netctrl_fw_minor;
+	__le16	netctrl_fw_build;
+	__le16	netctrl_fw_patch;
+	__le16	srt2_fw_major;
+	__le16	srt2_fw_minor;
+	__le16	srt2_fw_build;
+	__le16	srt2_fw_patch;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_nvm_mod_dir_entry_input (size:256b/32B) */
+struct hwrm_nvm_mod_dir_entry_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	enables;
+	#define NVM_MOD_DIR_ENTRY_REQ_ENABLES_CHECKSUM     0x1UL
+	__le16	dir_idx;
+	__le16	dir_ordinal;
+	__le16	dir_ext;
+	__le16	dir_attr;
+	__le32	checksum;
+};
+
+/* hwrm_nvm_mod_dir_entry_output (size:128b/16B) */
+struct hwrm_nvm_mod_dir_entry_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_nvm_verify_update_input (size:192b/24B) */
+struct hwrm_nvm_verify_update_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le16	dir_type;
+	__le16	dir_ordinal;
+	__le16	dir_ext;
+	u8	unused_0[2];
+};
+
+/* hwrm_nvm_verify_update_output (size:128b/16B) */
+struct hwrm_nvm_verify_update_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_nvm_install_update_input (size:192b/24B) */
+struct hwrm_nvm_install_update_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le32	install_type;
+	#define NVM_INSTALL_UPDATE_REQ_INSTALL_TYPE_NORMAL 0x0UL
+	#define NVM_INSTALL_UPDATE_REQ_INSTALL_TYPE_ALL    0xffffffffUL
+	#define NVM_INSTALL_UPDATE_REQ_INSTALL_TYPE_LAST  NVM_INSTALL_UPDATE_REQ_INSTALL_TYPE_ALL
+	__le16	flags;
+	#define NVM_INSTALL_UPDATE_REQ_FLAGS_ERASE_UNUSED_SPACE     0x1UL
+	#define NVM_INSTALL_UPDATE_REQ_FLAGS_REMOVE_UNUSED_PKG      0x2UL
+	#define NVM_INSTALL_UPDATE_REQ_FLAGS_ALLOWED_TO_DEFRAG      0x4UL
+	#define NVM_INSTALL_UPDATE_REQ_FLAGS_VERIFY_ONLY            0x8UL
+	u8	unused_0[2];
+};
+
+/* hwrm_nvm_install_update_output (size:192b/24B) */
+struct hwrm_nvm_install_update_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le64	installed_items;
+	u8	result;
+	#define NVM_INSTALL_UPDATE_RESP_RESULT_SUCCESS                      0x0UL
+	#define NVM_INSTALL_UPDATE_RESP_RESULT_FAILURE                      0xffUL
+	#define NVM_INSTALL_UPDATE_RESP_RESULT_MALLOC_FAILURE               0xfdUL
+	#define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_INDEX_PARAMETER      0xfbUL
+	#define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_TYPE_PARAMETER       0xf3UL
+	#define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_PREREQUISITE         0xf2UL
+	#define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_FILE_HEADER          0xecUL
+	#define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_SIGNATURE            0xebUL
+	#define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_PROP_STREAM          0xeaUL
+	#define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_PROP_LENGTH          0xe9UL
+	#define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_MANIFEST             0xe8UL
+	#define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_TRAILER              0xe7UL
+	#define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_CHECKSUM             0xe6UL
+	#define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_ITEM_CHECKSUM        0xe5UL
+	#define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_DATA_LENGTH          0xe4UL
+	#define NVM_INSTALL_UPDATE_RESP_RESULT_INVALID_DIRECTIVE            0xe1UL
+	#define NVM_INSTALL_UPDATE_RESP_RESULT_UNSUPPORTED_CHIP_REV         0xceUL
+	#define NVM_INSTALL_UPDATE_RESP_RESULT_UNSUPPORTED_DEVICE_ID        0xcdUL
+	#define NVM_INSTALL_UPDATE_RESP_RESULT_UNSUPPORTED_SUBSYS_VENDOR    0xccUL
+	#define NVM_INSTALL_UPDATE_RESP_RESULT_UNSUPPORTED_SUBSYS_ID        0xcbUL
+	#define NVM_INSTALL_UPDATE_RESP_RESULT_UNSUPPORTED_PLATFORM         0xc5UL
+	#define NVM_INSTALL_UPDATE_RESP_RESULT_DUPLICATE_ITEM               0xc4UL
+	#define NVM_INSTALL_UPDATE_RESP_RESULT_ZERO_LENGTH_ITEM             0xc3UL
+	#define NVM_INSTALL_UPDATE_RESP_RESULT_INSTALL_CHECKSUM_ERROR       0xb9UL
+	#define NVM_INSTALL_UPDATE_RESP_RESULT_INSTALL_DATA_ERROR           0xb8UL
+	#define NVM_INSTALL_UPDATE_RESP_RESULT_INSTALL_AUTHENTICATION_ERROR 0xb7UL
+	#define NVM_INSTALL_UPDATE_RESP_RESULT_ITEM_NOT_FOUND               0xb0UL
+	#define NVM_INSTALL_UPDATE_RESP_RESULT_ITEM_LOCKED                  0xa7UL
+	#define NVM_INSTALL_UPDATE_RESP_RESULT_LAST                        NVM_INSTALL_UPDATE_RESP_RESULT_ITEM_LOCKED
+	u8	problem_item;
+	#define NVM_INSTALL_UPDATE_RESP_PROBLEM_ITEM_NONE    0x0UL
+	#define NVM_INSTALL_UPDATE_RESP_PROBLEM_ITEM_PACKAGE 0xffUL
+	#define NVM_INSTALL_UPDATE_RESP_PROBLEM_ITEM_LAST   NVM_INSTALL_UPDATE_RESP_PROBLEM_ITEM_PACKAGE
+	u8	reset_required;
+	#define NVM_INSTALL_UPDATE_RESP_RESET_REQUIRED_NONE  0x0UL
+	#define NVM_INSTALL_UPDATE_RESP_RESET_REQUIRED_PCI   0x1UL
+	#define NVM_INSTALL_UPDATE_RESP_RESET_REQUIRED_POWER 0x2UL
+	#define NVM_INSTALL_UPDATE_RESP_RESET_REQUIRED_LAST NVM_INSTALL_UPDATE_RESP_RESET_REQUIRED_POWER
+	u8	unused_0[4];
+	u8	valid;
+};
+
+/* hwrm_nvm_install_update_cmd_err (size:64b/8B) */
+struct hwrm_nvm_install_update_cmd_err {
+	u8	code;
+	#define NVM_INSTALL_UPDATE_CMD_ERR_CODE_UNKNOWN            0x0UL
+	#define NVM_INSTALL_UPDATE_CMD_ERR_CODE_FRAG_ERR           0x1UL
+	#define NVM_INSTALL_UPDATE_CMD_ERR_CODE_NO_SPACE           0x2UL
+	#define NVM_INSTALL_UPDATE_CMD_ERR_CODE_ANTI_ROLLBACK      0x3UL
+	#define NVM_INSTALL_UPDATE_CMD_ERR_CODE_NO_VOLTREG_SUPPORT 0x4UL
+	#define NVM_INSTALL_UPDATE_CMD_ERR_CODE_LAST              NVM_INSTALL_UPDATE_CMD_ERR_CODE_NO_VOLTREG_SUPPORT
+	u8	unused_0[7];
+};
+
+/* hwrm_nvm_get_variable_input (size:320b/40B) */
+struct hwrm_nvm_get_variable_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le64	dest_data_addr;
+	__le16	data_len;
+	__le16	option_num;
+	#define NVM_GET_VARIABLE_REQ_OPTION_NUM_RSVD_0    0x0UL
+	#define NVM_GET_VARIABLE_REQ_OPTION_NUM_RSVD_FFFF 0xffffUL
+	#define NVM_GET_VARIABLE_REQ_OPTION_NUM_LAST     NVM_GET_VARIABLE_REQ_OPTION_NUM_RSVD_FFFF
+	__le16	dimensions;
+	__le16	index_0;
+	__le16	index_1;
+	__le16	index_2;
+	__le16	index_3;
+	u8	flags;
+	#define NVM_GET_VARIABLE_REQ_FLAGS_FACTORY_DFLT     0x1UL
+	u8	unused_0;
+};
+
+/* hwrm_nvm_get_variable_output (size:128b/16B) */
+struct hwrm_nvm_get_variable_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	__le16	data_len;
+	__le16	option_num;
+	#define NVM_GET_VARIABLE_RESP_OPTION_NUM_RSVD_0    0x0UL
+	#define NVM_GET_VARIABLE_RESP_OPTION_NUM_RSVD_FFFF 0xffffUL
+	#define NVM_GET_VARIABLE_RESP_OPTION_NUM_LAST     NVM_GET_VARIABLE_RESP_OPTION_NUM_RSVD_FFFF
+	u8	unused_0[3];
+	u8	valid;
+};
+
+/* hwrm_nvm_get_variable_cmd_err (size:64b/8B) */
+struct hwrm_nvm_get_variable_cmd_err {
+	u8	code;
+	#define NVM_GET_VARIABLE_CMD_ERR_CODE_UNKNOWN       0x0UL
+	#define NVM_GET_VARIABLE_CMD_ERR_CODE_VAR_NOT_EXIST 0x1UL
+	#define NVM_GET_VARIABLE_CMD_ERR_CODE_CORRUPT_VAR   0x2UL
+	#define NVM_GET_VARIABLE_CMD_ERR_CODE_LEN_TOO_SHORT 0x3UL
+	#define NVM_GET_VARIABLE_CMD_ERR_CODE_LAST         NVM_GET_VARIABLE_CMD_ERR_CODE_LEN_TOO_SHORT
+	u8	unused_0[7];
+};
+
+/* hwrm_nvm_set_variable_input (size:320b/40B) */
+struct hwrm_nvm_set_variable_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	__le64	src_data_addr;
+	__le16	data_len;
+	__le16	option_num;
+	#define NVM_SET_VARIABLE_REQ_OPTION_NUM_RSVD_0    0x0UL
+	#define NVM_SET_VARIABLE_REQ_OPTION_NUM_RSVD_FFFF 0xffffUL
+	#define NVM_SET_VARIABLE_REQ_OPTION_NUM_LAST     NVM_SET_VARIABLE_REQ_OPTION_NUM_RSVD_FFFF
+	__le16	dimensions;
+	__le16	index_0;
+	__le16	index_1;
+	__le16	index_2;
+	__le16	index_3;
+	u8	flags;
+	#define NVM_SET_VARIABLE_REQ_FLAGS_FORCE_FLUSH                0x1UL
+	#define NVM_SET_VARIABLE_REQ_FLAGS_ENCRYPT_MODE_MASK          0xeUL
+	#define NVM_SET_VARIABLE_REQ_FLAGS_ENCRYPT_MODE_SFT           1
+	#define NVM_SET_VARIABLE_REQ_FLAGS_ENCRYPT_MODE_NONE            (0x0UL << 1)
+	#define NVM_SET_VARIABLE_REQ_FLAGS_ENCRYPT_MODE_HMAC_SHA1       (0x1UL << 1)
+	#define NVM_SET_VARIABLE_REQ_FLAGS_ENCRYPT_MODE_AES256          (0x2UL << 1)
+	#define NVM_SET_VARIABLE_REQ_FLAGS_ENCRYPT_MODE_HMAC_SHA1_AUTH  (0x3UL << 1)
+	#define NVM_SET_VARIABLE_REQ_FLAGS_ENCRYPT_MODE_LAST           NVM_SET_VARIABLE_REQ_FLAGS_ENCRYPT_MODE_HMAC_SHA1_AUTH
+	#define NVM_SET_VARIABLE_REQ_FLAGS_FLAGS_UNUSED_0_MASK        0x70UL
+	#define NVM_SET_VARIABLE_REQ_FLAGS_FLAGS_UNUSED_0_SFT         4
+	#define NVM_SET_VARIABLE_REQ_FLAGS_FACTORY_DEFAULT            0x80UL
+	u8	unused_0;
+};
+
+/* hwrm_nvm_set_variable_output (size:128b/16B) */
+struct hwrm_nvm_set_variable_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* hwrm_nvm_set_variable_cmd_err (size:64b/8B) */
+struct hwrm_nvm_set_variable_cmd_err {
+	u8	code;
+	#define NVM_SET_VARIABLE_CMD_ERR_CODE_UNKNOWN       0x0UL
+	#define NVM_SET_VARIABLE_CMD_ERR_CODE_VAR_NOT_EXIST 0x1UL
+	#define NVM_SET_VARIABLE_CMD_ERR_CODE_CORRUPT_VAR   0x2UL
+	#define NVM_SET_VARIABLE_CMD_ERR_CODE_LAST         NVM_SET_VARIABLE_CMD_ERR_CODE_CORRUPT_VAR
+	u8	unused_0[7];
+};
+
+/* hwrm_selftest_qlist_input (size:128b/16B) */
+struct hwrm_selftest_qlist_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+};
+
+/* hwrm_selftest_qlist_output (size:2240b/280B) */
+struct hwrm_selftest_qlist_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	num_tests;
+	u8	available_tests;
+	#define SELFTEST_QLIST_RESP_AVAILABLE_TESTS_NVM_TEST                 0x1UL
+	#define SELFTEST_QLIST_RESP_AVAILABLE_TESTS_LINK_TEST                0x2UL
+	#define SELFTEST_QLIST_RESP_AVAILABLE_TESTS_REGISTER_TEST            0x4UL
+	#define SELFTEST_QLIST_RESP_AVAILABLE_TESTS_MEMORY_TEST              0x8UL
+	#define SELFTEST_QLIST_RESP_AVAILABLE_TESTS_PCIE_SERDES_TEST         0x10UL
+	#define SELFTEST_QLIST_RESP_AVAILABLE_TESTS_ETHERNET_SERDES_TEST     0x20UL
+	u8	offline_tests;
+	#define SELFTEST_QLIST_RESP_OFFLINE_TESTS_NVM_TEST                 0x1UL
+	#define SELFTEST_QLIST_RESP_OFFLINE_TESTS_LINK_TEST                0x2UL
+	#define SELFTEST_QLIST_RESP_OFFLINE_TESTS_REGISTER_TEST            0x4UL
+	#define SELFTEST_QLIST_RESP_OFFLINE_TESTS_MEMORY_TEST              0x8UL
+	#define SELFTEST_QLIST_RESP_OFFLINE_TESTS_PCIE_SERDES_TEST         0x10UL
+	#define SELFTEST_QLIST_RESP_OFFLINE_TESTS_ETHERNET_SERDES_TEST     0x20UL
+	u8	unused_0;
+	__le16	test_timeout;
+	u8	unused_1[2];
+	char	test_name[8][32];
+	u8	eyescope_target_BER_support;
+	#define SELFTEST_QLIST_RESP_EYESCOPE_TARGET_BER_SUPPORT_BER_1E8_SUPPORTED  0x0UL
+	#define SELFTEST_QLIST_RESP_EYESCOPE_TARGET_BER_SUPPORT_BER_1E9_SUPPORTED  0x1UL
+	#define SELFTEST_QLIST_RESP_EYESCOPE_TARGET_BER_SUPPORT_BER_1E10_SUPPORTED 0x2UL
+	#define SELFTEST_QLIST_RESP_EYESCOPE_TARGET_BER_SUPPORT_BER_1E11_SUPPORTED 0x3UL
+	#define SELFTEST_QLIST_RESP_EYESCOPE_TARGET_BER_SUPPORT_BER_1E12_SUPPORTED 0x4UL
+	#define SELFTEST_QLIST_RESP_EYESCOPE_TARGET_BER_SUPPORT_LAST              SELFTEST_QLIST_RESP_EYESCOPE_TARGET_BER_SUPPORT_BER_1E12_SUPPORTED
+	u8	unused_2[6];
+	u8	valid;
+};
+
+/* hwrm_selftest_exec_input (size:192b/24B) */
+struct hwrm_selftest_exec_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+	u8	flags;
+	#define SELFTEST_EXEC_REQ_FLAGS_NVM_TEST                 0x1UL
+	#define SELFTEST_EXEC_REQ_FLAGS_LINK_TEST                0x2UL
+	#define SELFTEST_EXEC_REQ_FLAGS_REGISTER_TEST            0x4UL
+	#define SELFTEST_EXEC_REQ_FLAGS_MEMORY_TEST              0x8UL
+	#define SELFTEST_EXEC_REQ_FLAGS_PCIE_SERDES_TEST         0x10UL
+	#define SELFTEST_EXEC_REQ_FLAGS_ETHERNET_SERDES_TEST     0x20UL
+	u8	unused_0[7];
+};
+
+/* hwrm_selftest_exec_output (size:128b/16B) */
+struct hwrm_selftest_exec_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	requested_tests;
+	#define SELFTEST_EXEC_RESP_REQUESTED_TESTS_NVM_TEST                 0x1UL
+	#define SELFTEST_EXEC_RESP_REQUESTED_TESTS_LINK_TEST                0x2UL
+	#define SELFTEST_EXEC_RESP_REQUESTED_TESTS_REGISTER_TEST            0x4UL
+	#define SELFTEST_EXEC_RESP_REQUESTED_TESTS_MEMORY_TEST              0x8UL
+	#define SELFTEST_EXEC_RESP_REQUESTED_TESTS_PCIE_SERDES_TEST         0x10UL
+	#define SELFTEST_EXEC_RESP_REQUESTED_TESTS_ETHERNET_SERDES_TEST     0x20UL
+	u8	test_success;
+	#define SELFTEST_EXEC_RESP_TEST_SUCCESS_NVM_TEST                 0x1UL
+	#define SELFTEST_EXEC_RESP_TEST_SUCCESS_LINK_TEST                0x2UL
+	#define SELFTEST_EXEC_RESP_TEST_SUCCESS_REGISTER_TEST            0x4UL
+	#define SELFTEST_EXEC_RESP_TEST_SUCCESS_MEMORY_TEST              0x8UL
+	#define SELFTEST_EXEC_RESP_TEST_SUCCESS_PCIE_SERDES_TEST         0x10UL
+	#define SELFTEST_EXEC_RESP_TEST_SUCCESS_ETHERNET_SERDES_TEST     0x20UL
+	u8	unused_0[5];
+	u8	valid;
+};
+
+/* hwrm_selftest_irq_input (size:128b/16B) */
+struct hwrm_selftest_irq_input {
+	__le16	req_type;
+	__le16	cmpl_ring;
+	__le16	seq_id;
+	__le16	target_id;
+	__le64	resp_addr;
+};
+
+/* hwrm_selftest_irq_output (size:128b/16B) */
+struct hwrm_selftest_irq_output {
+	__le16	error_code;
+	__le16	req_type;
+	__le16	seq_id;
+	__le16	resp_len;
+	u8	unused_0[7];
+	u8	valid;
+};
+
+/* dbc_dbc (size:64b/8B) */
+struct dbc_dbc {
+	__le32	index;
+	#define DBC_DBC_INDEX_MASK 0xffffffUL
+	#define DBC_DBC_INDEX_SFT  0
+	#define DBC_DBC_EPOCH      0x1000000UL
+	#define DBC_DBC_TOGGLE_MASK 0x6000000UL
+	#define DBC_DBC_TOGGLE_SFT 25
+	__le32	type_path_xid;
+	#define DBC_DBC_XID_MASK          0xfffffUL
+	#define DBC_DBC_XID_SFT           0
+	#define DBC_DBC_PATH_MASK         0x3000000UL
+	#define DBC_DBC_PATH_SFT          24
+	#define DBC_DBC_PATH_ROCE           (0x0UL << 24)
+	#define DBC_DBC_PATH_L2             (0x1UL << 24)
+	#define DBC_DBC_PATH_ENGINE         (0x2UL << 24)
+	#define DBC_DBC_PATH_LAST          DBC_DBC_PATH_ENGINE
+	#define DBC_DBC_VALID             0x4000000UL
+	#define DBC_DBC_DEBUG_TRACE       0x8000000UL
+	#define DBC_DBC_TYPE_MASK         0xf0000000UL
+	#define DBC_DBC_TYPE_SFT          28
+	#define DBC_DBC_TYPE_SQ             (0x0UL << 28)
+	#define DBC_DBC_TYPE_RQ             (0x1UL << 28)
+	#define DBC_DBC_TYPE_SRQ            (0x2UL << 28)
+	#define DBC_DBC_TYPE_SRQ_ARM        (0x3UL << 28)
+	#define DBC_DBC_TYPE_CQ             (0x4UL << 28)
+	#define DBC_DBC_TYPE_CQ_ARMSE       (0x5UL << 28)
+	#define DBC_DBC_TYPE_CQ_ARMALL      (0x6UL << 28)
+	#define DBC_DBC_TYPE_CQ_ARMENA      (0x7UL << 28)
+	#define DBC_DBC_TYPE_SRQ_ARMENA     (0x8UL << 28)
+	#define DBC_DBC_TYPE_CQ_CUTOFF_ACK  (0x9UL << 28)
+	#define DBC_DBC_TYPE_NQ             (0xaUL << 28)
+	#define DBC_DBC_TYPE_NQ_ARM         (0xbUL << 28)
+	#define DBC_DBC_TYPE_NQ_MASK        (0xeUL << 28)
+	#define DBC_DBC_TYPE_NULL           (0xfUL << 28)
+	#define DBC_DBC_TYPE_LAST          DBC_DBC_TYPE_NULL
+};
+
+/* db_push_start (size:64b/8B) */
+struct db_push_start {
+	u64	db;
+	#define DB_PUSH_START_DB_INDEX_MASK     0xffffffUL
+	#define DB_PUSH_START_DB_INDEX_SFT      0
+	#define DB_PUSH_START_DB_PI_LO_MASK     0xff000000UL
+	#define DB_PUSH_START_DB_PI_LO_SFT      24
+	#define DB_PUSH_START_DB_XID_MASK       0xfffff00000000ULL
+	#define DB_PUSH_START_DB_XID_SFT        32
+	#define DB_PUSH_START_DB_PI_HI_MASK     0xf0000000000000ULL
+	#define DB_PUSH_START_DB_PI_HI_SFT      52
+	#define DB_PUSH_START_DB_TYPE_MASK      0xf000000000000000ULL
+	#define DB_PUSH_START_DB_TYPE_SFT       60
+	#define DB_PUSH_START_DB_TYPE_PUSH_START  (0xcULL << 60)
+	#define DB_PUSH_START_DB_TYPE_PUSH_END    (0xdULL << 60)
+	#define DB_PUSH_START_DB_TYPE_LAST       DB_PUSH_START_DB_TYPE_PUSH_END
+};
+
+/* db_push_end (size:64b/8B) */
+struct db_push_end {
+	u64	db;
+	#define DB_PUSH_END_DB_INDEX_MASK      0xffffffUL
+	#define DB_PUSH_END_DB_INDEX_SFT       0
+	#define DB_PUSH_END_DB_PI_LO_MASK      0xff000000UL
+	#define DB_PUSH_END_DB_PI_LO_SFT       24
+	#define DB_PUSH_END_DB_XID_MASK        0xfffff00000000ULL
+	#define DB_PUSH_END_DB_XID_SFT         32
+	#define DB_PUSH_END_DB_PI_HI_MASK      0xf0000000000000ULL
+	#define DB_PUSH_END_DB_PI_HI_SFT       52
+	#define DB_PUSH_END_DB_PATH_MASK       0x300000000000000ULL
+	#define DB_PUSH_END_DB_PATH_SFT        56
+	#define DB_PUSH_END_DB_PATH_ROCE         (0x0ULL << 56)
+	#define DB_PUSH_END_DB_PATH_L2           (0x1ULL << 56)
+	#define DB_PUSH_END_DB_PATH_ENGINE       (0x2ULL << 56)
+	#define DB_PUSH_END_DB_PATH_LAST        DB_PUSH_END_DB_PATH_ENGINE
+	#define DB_PUSH_END_DB_DEBUG_TRACE     0x800000000000000ULL
+	#define DB_PUSH_END_DB_TYPE_MASK       0xf000000000000000ULL
+	#define DB_PUSH_END_DB_TYPE_SFT        60
+	#define DB_PUSH_END_DB_TYPE_PUSH_START   (0xcULL << 60)
+	#define DB_PUSH_END_DB_TYPE_PUSH_END     (0xdULL << 60)
+	#define DB_PUSH_END_DB_TYPE_LAST        DB_PUSH_END_DB_TYPE_PUSH_END
+};
+
+/* db_push_info (size:64b/8B) */
+struct db_push_info {
+	u32	push_size_push_index;
+	#define DB_PUSH_INFO_PUSH_INDEX_MASK 0xffffffUL
+	#define DB_PUSH_INFO_PUSH_INDEX_SFT 0
+	#define DB_PUSH_INFO_PUSH_SIZE_MASK 0x1f000000UL
+	#define DB_PUSH_INFO_PUSH_SIZE_SFT  24
+	u32	reserved32;
+};
+
+/* fw_status_reg (size:32b/4B) */
+struct fw_status_reg {
+	u32	fw_status;
+	#define FW_STATUS_REG_CODE_MASK              0xffffUL
+	#define FW_STATUS_REG_CODE_SFT               0
+	#define FW_STATUS_REG_CODE_READY               0x8000UL
+	#define FW_STATUS_REG_CODE_LAST               FW_STATUS_REG_CODE_READY
+	#define FW_STATUS_REG_IMAGE_DEGRADED         0x10000UL
+	#define FW_STATUS_REG_RECOVERABLE            0x20000UL
+	#define FW_STATUS_REG_CRASHDUMP_ONGOING      0x40000UL
+	#define FW_STATUS_REG_CRASHDUMP_COMPLETE     0x80000UL
+	#define FW_STATUS_REG_SHUTDOWN               0x100000UL
+	#define FW_STATUS_REG_CRASHED_NO_MASTER      0x200000UL
+	#define FW_STATUS_REG_RECOVERING             0x400000UL
+	#define FW_STATUS_REG_MANU_DEBUG_STATUS      0x800000UL
+};
+
+/* hcomm_status (size:64b/8B) */
+struct hcomm_status {
+	u32	sig_ver;
+	#define HCOMM_STATUS_VER_MASK      0xffUL
+	#define HCOMM_STATUS_VER_SFT       0
+	#define HCOMM_STATUS_VER_LATEST      0x1UL
+	#define HCOMM_STATUS_VER_LAST       HCOMM_STATUS_VER_LATEST
+	#define HCOMM_STATUS_SIGNATURE_MASK 0xffffff00UL
+	#define HCOMM_STATUS_SIGNATURE_SFT 8
+	#define HCOMM_STATUS_SIGNATURE_VAL   (0x484353UL << 8)
+	#define HCOMM_STATUS_SIGNATURE_LAST HCOMM_STATUS_SIGNATURE_VAL
+	u32	fw_status_loc;
+	#define HCOMM_STATUS_TRUE_ADDR_SPACE_MASK    0x3UL
+	#define HCOMM_STATUS_TRUE_ADDR_SPACE_SFT     0
+	#define HCOMM_STATUS_FW_STATUS_LOC_ADDR_SPACE_PCIE_CFG  0x0UL
+	#define HCOMM_STATUS_FW_STATUS_LOC_ADDR_SPACE_GRC       0x1UL
+	#define HCOMM_STATUS_FW_STATUS_LOC_ADDR_SPACE_BAR0      0x2UL
+	#define HCOMM_STATUS_FW_STATUS_LOC_ADDR_SPACE_BAR1      0x3UL
+	#define HCOMM_STATUS_FW_STATUS_LOC_ADDR_SPACE_LAST     HCOMM_STATUS_FW_STATUS_LOC_ADDR_SPACE_BAR1
+	#define HCOMM_STATUS_TRUE_OFFSET_MASK        0xfffffffcUL
+	#define HCOMM_STATUS_TRUE_OFFSET_SFT         2
+};
+#define HCOMM_STATUS_STRUCT_LOC 0x31001F0UL
+
+#endif /* _BNXT_HSI_H_ */
-- 
cgit v1.2.3


From 941ab0b369c983f7867de54c8579fd7f1676ee3c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 1 Jul 2025 17:23:28 -0700
Subject: rcutorture: Remove support for SRCU-lite

Because SRCU-lite is being replaced by SRCU-fast, this commit removes
support for SRCU-lite from rcutorture.c.

Both SRCU-lite and SRCU-fast provide faster readers by dropping the
smp_mb() call from their lock and unlock primitives, but incur a pair
of added RCU grace periods during the SRCU grace period.  There is a
trivial mapping from the SRCU-lite API to that of SRCU-fast, so there
should be no transition issues.

[ paulmck: Apply Christoph Hellwig feedback. ]

Signed-off-by: "Paul E. McKenney" <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay (AMD) <neeraj.upadhyay@kernel.org>
---
 include/linux/srcu.h    | 2 +-
 kernel/rcu/rcutorture.c | 7 -------
 2 files changed, 1 insertion(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 900b0d5c05f5..c20dacb563e5 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -49,7 +49,7 @@ int init_srcu_struct(struct srcu_struct *ssp);
 #define SRCU_READ_FLAVOR_LITE	0x4		// srcu_read_lock_lite().
 #define SRCU_READ_FLAVOR_FAST	0x8		// srcu_read_lock_fast().
 #define SRCU_READ_FLAVOR_ALL   (SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_NMI | \
-				SRCU_READ_FLAVOR_LITE | SRCU_READ_FLAVOR_FAST) // All of the above.
+				SRCU_READ_FLAVOR_FAST) // All of the above.
 #define SRCU_READ_FLAVOR_SLOWGP	(SRCU_READ_FLAVOR_LITE | SRCU_READ_FLAVOR_FAST)
 						// Flavors requiring synchronize_rcu()
 						// instead of smp_mb().
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 213f23f20a64..7a893d51d02b 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -718,11 +718,6 @@ static int srcu_torture_read_lock(void)
 		WARN_ON_ONCE(idx & ~0x1);
 		ret += idx << 1;
 	}
-	if (reader_flavor & SRCU_READ_FLAVOR_LITE) {
-		idx = srcu_read_lock_lite(srcu_ctlp);
-		WARN_ON_ONCE(idx & ~0x1);
-		ret += idx << 2;
-	}
 	if (reader_flavor & SRCU_READ_FLAVOR_FAST) {
 		scp = srcu_read_lock_fast(srcu_ctlp);
 		idx = __srcu_ptr_to_ctr(srcu_ctlp, scp);
@@ -756,8 +751,6 @@ static void srcu_torture_read_unlock(int idx)
 	WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1)));
 	if (reader_flavor & SRCU_READ_FLAVOR_FAST)
 		srcu_read_unlock_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3));
-	if (reader_flavor & SRCU_READ_FLAVOR_LITE)
-		srcu_read_unlock_lite(srcu_ctlp, (idx & 0x4) >> 2);
 	if (reader_flavor & SRCU_READ_FLAVOR_NMI)
 		srcu_read_unlock_nmisafe(srcu_ctlp, (idx & 0x2) >> 1);
 	if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL))
-- 
cgit v1.2.3


From 623baa01d5b43ca06ba337751d9a4f62199d1715 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 1 Jul 2025 17:23:29 -0700
Subject: srcu: Remove SRCU-lite implementation

This commit removes the SRCU-lite implementation, which has been replaced
by SRCU-fast.

Both SRCU-lite and SRCU-fast provide faster readers by dropping the
smp_mb() call from their lock and unlock primitives, but incur a pair
of added RCU grace periods during the SRCU grace period.  There is a
trivial mapping from the SRCU-lite API to that of SRCU-fast, so there
should be no transition issues.

[ paulmck: Apply Christoph Hellwig feedback. ]

Signed-off-by: "Paul E. McKenney" <paulmck@kernel.org>
Signed-off-by: Neeraj Upadhyay (AMD) <neeraj.upadhyay@kernel.org>
---
 include/linux/srcu.h     | 47 ++---------------------------------------------
 include/linux/srcutiny.h |  3 ---
 include/linux/srcutree.h | 38 --------------------------------------
 3 files changed, 2 insertions(+), 86 deletions(-)

(limited to 'include')

diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index c20dacb563e5..cf711a0f440b 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -46,11 +46,11 @@ int init_srcu_struct(struct srcu_struct *ssp);
 /* Values for SRCU Tree srcu_data ->srcu_reader_flavor, but also used by rcutorture. */
 #define SRCU_READ_FLAVOR_NORMAL	0x1		// srcu_read_lock().
 #define SRCU_READ_FLAVOR_NMI	0x2		// srcu_read_lock_nmisafe().
-#define SRCU_READ_FLAVOR_LITE	0x4		// srcu_read_lock_lite().
+//				0x4		// SRCU-lite is no longer with us.
 #define SRCU_READ_FLAVOR_FAST	0x8		// srcu_read_lock_fast().
 #define SRCU_READ_FLAVOR_ALL   (SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_NMI | \
 				SRCU_READ_FLAVOR_FAST) // All of the above.
-#define SRCU_READ_FLAVOR_SLOWGP	(SRCU_READ_FLAVOR_LITE | SRCU_READ_FLAVOR_FAST)
+#define SRCU_READ_FLAVOR_SLOWGP	SRCU_READ_FLAVOR_FAST
 						// Flavors requiring synchronize_rcu()
 						// instead of smp_mb().
 void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp);
@@ -299,33 +299,6 @@ static inline struct srcu_ctr __percpu *srcu_down_read_fast(struct srcu_struct *
 	return __srcu_read_lock_fast(ssp);
 }
 
-/**
- * srcu_read_lock_lite - register a new reader for an SRCU-protected structure.
- * @ssp: srcu_struct in which to register the new reader.
- *
- * Enter an SRCU read-side critical section, but for a light-weight
- * smp_mb()-free reader.  See srcu_read_lock() for more information.
- *
- * If srcu_read_lock_lite() is ever used on an srcu_struct structure,
- * then none of the other flavors may be used, whether before, during,
- * or after.  Note that grace-period auto-expediting is disabled for _lite
- * srcu_struct structures because auto-expedited grace periods invoke
- * synchronize_rcu_expedited(), IPIs and all.
- *
- * Note that srcu_read_lock_lite() can be invoked only from those contexts
- * where RCU is watching, that is, from contexts where it would be legal
- * to invoke rcu_read_lock().  Otherwise, lockdep will complain.
- */
-static inline int srcu_read_lock_lite(struct srcu_struct *ssp) __acquires(ssp)
-{
-	int retval;
-
-	srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_LITE);
-	retval = __srcu_read_lock_lite(ssp);
-	rcu_try_lock_acquire(&ssp->dep_map);
-	return retval;
-}
-
 /**
  * srcu_read_lock_nmisafe - register a new reader for an SRCU-protected structure.
  * @ssp: srcu_struct in which to register the new reader.
@@ -434,22 +407,6 @@ static inline void srcu_up_read_fast(struct srcu_struct *ssp, struct srcu_ctr __
 	__srcu_read_unlock_fast(ssp, scp);
 }
 
-/**
- * srcu_read_unlock_lite - unregister a old reader from an SRCU-protected structure.
- * @ssp: srcu_struct in which to unregister the old reader.
- * @idx: return value from corresponding srcu_read_lock_lite().
- *
- * Exit a light-weight SRCU read-side critical section.
- */
-static inline void srcu_read_unlock_lite(struct srcu_struct *ssp, int idx)
-	__releases(ssp)
-{
-	WARN_ON_ONCE(idx & ~0x1);
-	srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_LITE);
-	srcu_lock_release(&ssp->dep_map);
-	__srcu_read_unlock_lite(ssp, idx);
-}
-
 /**
  * srcu_read_unlock_nmisafe - unregister a old reader from an SRCU-protected structure.
  * @ssp: srcu_struct in which to unregister the old reader.
diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h
index 380260317d98..51ce25f07930 100644
--- a/include/linux/srcutiny.h
+++ b/include/linux/srcutiny.h
@@ -93,9 +93,6 @@ static inline void __srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_
 	__srcu_read_unlock(ssp, __srcu_ptr_to_ctr(ssp, scp));
 }
 
-#define __srcu_read_lock_lite __srcu_read_lock
-#define __srcu_read_unlock_lite __srcu_read_unlock
-
 static inline void synchronize_srcu_expedited(struct srcu_struct *ssp)
 {
 	synchronize_srcu(ssp);
diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index 8bed7e6cc4c1..bf44d8d1e69e 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -278,44 +278,6 @@ static inline void __srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_
 	RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_fast().");
 }
 
-/*
- * Counts the new reader in the appropriate per-CPU element of the
- * srcu_struct.  Returns an index that must be passed to the matching
- * srcu_read_unlock_lite().
- *
- * Note that this_cpu_inc() is an RCU read-side critical section either
- * because it disables interrupts, because it is a single instruction,
- * or because it is a read-modify-write atomic operation, depending on
- * the whims of the architecture.
- */
-static inline int __srcu_read_lock_lite(struct srcu_struct *ssp)
-{
-	struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp);
-
-	RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_lite().");
-	this_cpu_inc(scp->srcu_locks.counter); /* Y */
-	barrier(); /* Avoid leaking the critical section. */
-	return __srcu_ptr_to_ctr(ssp, scp);
-}
-
-/*
- * Removes the count for the old reader from the appropriate
- * per-CPU element of the srcu_struct.  Note that this may well be a
- * different CPU than that which was incremented by the corresponding
- * srcu_read_lock_lite(), but it must be within the same task.
- *
- * Note that this_cpu_inc() is an RCU read-side critical section either
- * because it disables interrupts, because it is a single instruction,
- * or because it is a read-modify-write atomic operation, depending on
- * the whims of the architecture.
- */
-static inline void __srcu_read_unlock_lite(struct srcu_struct *ssp, int idx)
-{
-	barrier();  /* Avoid leaking the critical section. */
-	this_cpu_inc(__srcu_ctr_to_ptr(ssp, idx)->srcu_unlocks.counter);  /* Z */
-	RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_lite().");
-}
-
 void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor);
 
 // Record reader usage even for CONFIG_PROVE_RCU=n kernels.  This is
-- 
cgit v1.2.3


From df6f9a918ea856fc288b9001b0414c5be136d7d0 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Wed, 16 Jul 2025 11:03:13 +0900
Subject: ata: libata-eh: Remove ata_do_eh()

The only reason for ata_do_eh() to exist is that the two caller sites,
ata_std_error_handler() and ata_sff_error_handler(), may pass it a
NULL hardreset operation so that the built-in (generic) hardreset
operation for a driver is ignored if the adapter SCR access is not
available.

However, ata_std_error_handler() and ata_sff_error_handler()
modifications of the hardreset port operation can easily be combined as
they are mutually exclusive. That is, a driver using sata_std_hardreset()
as its hardreset operation cannot use sata_sff_hardreset() and
vice-versa.

With this observation, ata_do_eh() can be removed and its code moved to
ata_std_error_handler(). The condition used to ignore the built-in
hardreset port operation is modified to be the one that was used in
ata_sff_error_handler(). This requires defining a stub for the function
sata_sff_hardreset() to avoid compilation errors when CONFIG_ATA_SFF is
not enabled. Furthermore, instead of modifying the local hardreset
operation definition, set the ATA_LFLAG_NO_HRST link flag to prevent
the use of built-in hardreset methods for ports without a valid scr_read
function. This flag is checked in ata_eh_reset() and if set, the
hardreset method is ignored.

This change simplifies ata_sff_error_handler() as this function now only
needs to call ata_std_error_handler().

No functional changes.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Niklas Cassel <cassel@kernel.org>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Link: https://lore.kernel.org/r/20250716020315.235457-2-dlemoal@kernel.org
Signed-off-by: Niklas Cassel <cassel@kernel.org>
---
 drivers/ata/libata-eh.c  | 48 ++++++++++++++----------------------------------
 drivers/ata/libata-sff.c | 10 +---------
 include/linux/libata.h   |  9 ++++++---
 3 files changed, 21 insertions(+), 46 deletions(-)

(limited to 'include')

diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 436536112043..30c831e56a7f 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -4067,59 +4067,39 @@ void ata_eh_finish(struct ata_port *ap)
 }
 
 /**
- *	ata_do_eh - do standard error handling
+ *	ata_std_error_handler - standard error handler
  *	@ap: host port to handle error for
  *
- *	@prereset: prereset method (can be NULL)
- *	@softreset: softreset method (can be NULL)
- *	@hardreset: hardreset method (can be NULL)
- *	@postreset: postreset method (can be NULL)
- *
  *	Perform standard error handling sequence.
  *
  *	LOCKING:
  *	Kernel thread context (may sleep).
  */
-void ata_do_eh(struct ata_port *ap, ata_prereset_fn_t prereset,
-	       ata_reset_fn_t softreset, ata_reset_fn_t hardreset,
-	       ata_postreset_fn_t postreset)
+void ata_std_error_handler(struct ata_port *ap)
 {
-	struct ata_device *dev;
+	struct ata_port_operations *ops = ap->ops;
+	struct ata_link *link = &ap->link;
 	int rc;
 
+	/* Ignore built-in hardresets if SCR access is not available */
+	if ((ops->hardreset == sata_std_hardreset ||
+	     ops->hardreset == sata_sff_hardreset) && !sata_scr_valid(link))
+		link->flags |= ATA_LFLAG_NO_HRST;
+
 	ata_eh_autopsy(ap);
 	ata_eh_report(ap);
 
-	rc = ata_eh_recover(ap, prereset, softreset, hardreset, postreset,
-			    NULL);
+	rc = ata_eh_recover(ap, ops->prereset, ops->softreset,
+			    ops->hardreset, ops->postreset, NULL);
 	if (rc) {
-		ata_for_each_dev(dev, &ap->link, ALL)
+		struct ata_device *dev;
+
+		ata_for_each_dev(dev, link, ALL)
 			ata_dev_disable(dev);
 	}
 
 	ata_eh_finish(ap);
 }
-
-/**
- *	ata_std_error_handler - standard error handler
- *	@ap: host port to handle error for
- *
- *	Standard error handler
- *
- *	LOCKING:
- *	Kernel thread context (may sleep).
- */
-void ata_std_error_handler(struct ata_port *ap)
-{
-	struct ata_port_operations *ops = ap->ops;
-	ata_reset_fn_t hardreset = ops->hardreset;
-
-	/* ignore built-in hardreset if SCR access is not available */
-	if (hardreset == sata_std_hardreset && !sata_scr_valid(&ap->link))
-		hardreset = NULL;
-
-	ata_do_eh(ap, ops->prereset, ops->softreset, hardreset, ops->postreset);
-}
 EXPORT_SYMBOL_GPL(ata_std_error_handler);
 
 #ifdef CONFIG_PM
diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
index 5a46c066abc3..e61f00779e40 100644
--- a/drivers/ata/libata-sff.c
+++ b/drivers/ata/libata-sff.c
@@ -2054,8 +2054,6 @@ EXPORT_SYMBOL_GPL(ata_sff_drain_fifo);
  */
 void ata_sff_error_handler(struct ata_port *ap)
 {
-	ata_reset_fn_t softreset = ap->ops->softreset;
-	ata_reset_fn_t hardreset = ap->ops->hardreset;
 	struct ata_queued_cmd *qc;
 	unsigned long flags;
 
@@ -2077,13 +2075,7 @@ void ata_sff_error_handler(struct ata_port *ap)
 
 	spin_unlock_irqrestore(ap->lock, flags);
 
-	/* ignore built-in hardresets if SCR access is not available */
-	if ((hardreset == sata_std_hardreset ||
-	     hardreset == sata_sff_hardreset) && !sata_scr_valid(&ap->link))
-		hardreset = NULL;
-
-	ata_do_eh(ap, ap->ops->prereset, softreset, hardreset,
-		  ap->ops->postreset);
+	ata_std_error_handler(ap);
 }
 EXPORT_SYMBOL_GPL(ata_sff_error_handler);
 
diff --git a/include/linux/libata.h b/include/linux/libata.h
index d092747be588..cf0b3fff3198 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1412,9 +1412,6 @@ extern void ata_eh_thaw_port(struct ata_port *ap);
 extern void ata_eh_qc_complete(struct ata_queued_cmd *qc);
 extern void ata_eh_qc_retry(struct ata_queued_cmd *qc);
 
-extern void ata_do_eh(struct ata_port *ap, ata_prereset_fn_t prereset,
-		      ata_reset_fn_t softreset, ata_reset_fn_t hardreset,
-		      ata_postreset_fn_t postreset);
 extern void ata_std_error_handler(struct ata_port *ap);
 extern void ata_std_sched_eh(struct ata_port *ap);
 extern void ata_std_end_eh(struct ata_port *ap);
@@ -2152,6 +2149,12 @@ static inline u8 ata_wait_idle(struct ata_port *ap)
 
 	return status;
 }
+#else /* CONFIG_ATA_SFF */
+static inline int sata_sff_hardreset(struct ata_link *link, unsigned int *class,
+				     unsigned long deadline)
+{
+	return -EOPNOTSUPP;
+}
 #endif /* CONFIG_ATA_SFF */
 
 #endif /* __LINUX_LIBATA_H__ */
-- 
cgit v1.2.3


From a4daf088a77323154514eb1f8626bbdf9329cfd4 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Wed, 16 Jul 2025 11:03:14 +0900
Subject: ata: libata-eh: Simplify reset operation management

Introduce struct ata_reset_operations to aggregate in a single structure
the definitions of the 4 reset methods (prereset, softreset, hardreset
and postreset) for a port. This new structure is used in struct ata_port
to define the reset methods for a regular port (reset field) and for a
port-multiplier port (pmp_reset field). A pointer to either of these
fields replaces the 4 reset method arguments passed to ata_eh_recover()
and ata_eh_reset().

The definition of the reset methods for all drivers is changed to use
the reset and pmp_reset fields in struct ata_port_operations.

A large number of files is modified, but no functional changes are
introduced.

Suggested-by: Niklas Cassel <cassel@kernel.org>
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Niklas Cassel <cassel@kernel.org>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Link: https://lore.kernel.org/r/20250716020315.235457-3-dlemoal@kernel.org
Signed-off-by: Niklas Cassel <cassel@kernel.org>
---
 drivers/ata/ahci.c                      |  6 +++---
 drivers/ata/ahci_da850.c                |  6 +++---
 drivers/ata/ahci_dm816.c                |  2 +-
 drivers/ata/ahci_imx.c                  | 13 +++++++------
 drivers/ata/ahci_qoriq.c                |  4 ++--
 drivers/ata/ahci_xgene.c                |  8 ++++----
 drivers/ata/ata_piix.c                  |  4 ++--
 drivers/ata/libahci.c                   | 10 +++++-----
 drivers/ata/libata-core.c               |  4 ++--
 drivers/ata/libata-eh.c                 | 29 +++++++++++++----------------
 drivers/ata/libata-pmp.c                | 26 +++++++++-----------------
 drivers/ata/libata-sata.c               |  2 +-
 drivers/ata/libata-sff.c                |  8 ++++----
 drivers/ata/libata.h                    |  8 +++-----
 drivers/ata/pata_acpi.c                 |  2 +-
 drivers/ata/pata_ali.c                  | 10 +++++-----
 drivers/ata/pata_amd.c                  |  4 ++--
 drivers/ata/pata_artop.c                |  4 ++--
 drivers/ata/pata_atiixp.c               |  2 +-
 drivers/ata/pata_efar.c                 |  2 +-
 drivers/ata/pata_ep93xx.c               |  4 ++--
 drivers/ata/pata_hpt366.c               |  2 +-
 drivers/ata/pata_hpt37x.c               |  4 ++--
 drivers/ata/pata_hpt3x2n.c              |  2 +-
 drivers/ata/pata_icside.c               |  2 +-
 drivers/ata/pata_it8213.c               |  2 +-
 drivers/ata/pata_jmicron.c              |  2 +-
 drivers/ata/pata_marvell.c              |  2 +-
 drivers/ata/pata_mpiix.c                |  2 +-
 drivers/ata/pata_ns87410.c              |  2 +-
 drivers/ata/pata_octeon_cf.c            |  2 +-
 drivers/ata/pata_oldpiix.c              |  2 +-
 drivers/ata/pata_opti.c                 |  2 +-
 drivers/ata/pata_optidma.c              |  2 +-
 drivers/ata/pata_parport/pata_parport.c |  4 ++--
 drivers/ata/pata_pdc2027x.c             |  2 +-
 drivers/ata/pata_rdc.c                  |  2 +-
 drivers/ata/pata_sis.c                  |  2 +-
 drivers/ata/pata_sl82c105.c             |  2 +-
 drivers/ata/pata_triflex.c              |  2 +-
 drivers/ata/pata_via.c                  |  2 +-
 drivers/ata/pdc_adma.c                  |  2 +-
 drivers/ata/sata_dwc_460ex.c            |  2 +-
 drivers/ata/sata_fsl.c                  |  6 +++---
 drivers/ata/sata_highbank.c             |  2 +-
 drivers/ata/sata_inic162x.c             |  2 +-
 drivers/ata/sata_mv.c                   | 10 +++++-----
 drivers/ata/sata_nv.c                   |  2 +-
 drivers/ata/sata_promise.c              |  4 ++--
 drivers/ata/sata_qstor.c                |  4 ++--
 drivers/ata/sata_rcar.c                 |  2 +-
 drivers/ata/sata_sil24.c                |  8 ++++----
 drivers/ata/sata_svw.c                  |  4 ++--
 drivers/ata/sata_sx4.c                  |  2 +-
 drivers/ata/sata_uli.c                  |  2 +-
 drivers/ata/sata_via.c                  |  4 ++--
 drivers/scsi/libsas/sas_ata.c           |  4 ++--
 include/linux/libata.h                  | 17 +++++++++--------
 58 files changed, 134 insertions(+), 145 deletions(-)

(limited to 'include')

diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index 5558e9f7b85d..e1c24bbacf64 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -110,17 +110,17 @@ static const struct scsi_host_template ahci_sht = {
 
 static struct ata_port_operations ahci_vt8251_ops = {
 	.inherits		= &ahci_ops,
-	.hardreset		= ahci_vt8251_hardreset,
+	.reset.hardreset	= ahci_vt8251_hardreset,
 };
 
 static struct ata_port_operations ahci_p5wdh_ops = {
 	.inherits		= &ahci_ops,
-	.hardreset		= ahci_p5wdh_hardreset,
+	.reset.hardreset	= ahci_p5wdh_hardreset,
 };
 
 static struct ata_port_operations ahci_avn_ops = {
 	.inherits		= &ahci_ops,
-	.hardreset		= ahci_avn_hardreset,
+	.reset.hardreset	= ahci_avn_hardreset,
 };
 
 static const struct ata_port_info ahci_port_info[] = {
diff --git a/drivers/ata/ahci_da850.c b/drivers/ata/ahci_da850.c
index ca0924dc5bd2..f97566c420f8 100644
--- a/drivers/ata/ahci_da850.c
+++ b/drivers/ata/ahci_da850.c
@@ -137,13 +137,13 @@ static int ahci_da850_hardreset(struct ata_link *link,
 
 static struct ata_port_operations ahci_da850_port_ops = {
 	.inherits = &ahci_platform_ops,
-	.softreset = ahci_da850_softreset,
+	.reset.softreset = ahci_da850_softreset,
 	/*
 	 * No need to override .pmp_softreset - it's only used for actual
 	 * PMP-enabled ports.
 	 */
-	.hardreset = ahci_da850_hardreset,
-	.pmp_hardreset = ahci_da850_hardreset,
+	.reset.hardreset = ahci_da850_hardreset,
+	.pmp_reset.hardreset = ahci_da850_hardreset,
 };
 
 static const struct ata_port_info ahci_da850_port_info = {
diff --git a/drivers/ata/ahci_dm816.c b/drivers/ata/ahci_dm816.c
index b08547b877a1..93faed2cfeb6 100644
--- a/drivers/ata/ahci_dm816.c
+++ b/drivers/ata/ahci_dm816.c
@@ -124,7 +124,7 @@ static int ahci_dm816_softreset(struct ata_link *link,
 
 static struct ata_port_operations ahci_dm816_port_ops = {
 	.inherits = &ahci_platform_ops,
-	.softreset = ahci_dm816_softreset,
+	.reset.softreset = ahci_dm816_softreset,
 };
 
 static const struct ata_port_info ahci_dm816_port_info = {
diff --git a/drivers/ata/ahci_imx.c b/drivers/ata/ahci_imx.c
index f01f08048f97..86aedd5923ac 100644
--- a/drivers/ata/ahci_imx.c
+++ b/drivers/ata/ahci_imx.c
@@ -642,18 +642,19 @@ static int ahci_imx_softreset(struct ata_link *link, unsigned int *class,
 	int ret;
 
 	if (imxpriv->type == AHCI_IMX53)
-		ret = ahci_pmp_retry_srst_ops.softreset(link, class, deadline);
+		ret = ahci_pmp_retry_srst_ops.reset.softreset(link, class,
+							      deadline);
 	else
-		ret = ahci_ops.softreset(link, class, deadline);
+		ret = ahci_ops.reset.softreset(link, class, deadline);
 
 	return ret;
 }
 
 static struct ata_port_operations ahci_imx_ops = {
-	.inherits	= &ahci_ops,
-	.host_stop	= ahci_imx_host_stop,
-	.error_handler	= ahci_imx_error_handler,
-	.softreset	= ahci_imx_softreset,
+	.inherits		= &ahci_ops,
+	.host_stop		= ahci_imx_host_stop,
+	.error_handler		= ahci_imx_error_handler,
+	.reset.softreset	= ahci_imx_softreset,
 };
 
 static const struct ata_port_info ahci_imx_port_info = {
diff --git a/drivers/ata/ahci_qoriq.c b/drivers/ata/ahci_qoriq.c
index 30e39885b64e..0dec1a17e5b1 100644
--- a/drivers/ata/ahci_qoriq.c
+++ b/drivers/ata/ahci_qoriq.c
@@ -146,8 +146,8 @@ static int ahci_qoriq_hardreset(struct ata_link *link, unsigned int *class,
 }
 
 static struct ata_port_operations ahci_qoriq_ops = {
-	.inherits	= &ahci_ops,
-	.hardreset	= ahci_qoriq_hardreset,
+	.inherits		= &ahci_ops,
+	.reset.hardreset	= ahci_qoriq_hardreset,
 };
 
 static const struct ata_port_info ahci_qoriq_port_info = {
diff --git a/drivers/ata/ahci_xgene.c b/drivers/ata/ahci_xgene.c
index dfbd8c53abcb..5d5a51a77f5d 100644
--- a/drivers/ata/ahci_xgene.c
+++ b/drivers/ata/ahci_xgene.c
@@ -613,11 +613,11 @@ static irqreturn_t xgene_ahci_irq_intr(int irq, void *dev_instance)
 static struct ata_port_operations xgene_ahci_v1_ops = {
 	.inherits = &ahci_ops,
 	.host_stop = xgene_ahci_host_stop,
-	.hardreset = xgene_ahci_hardreset,
+	.reset.hardreset = xgene_ahci_hardreset,
+	.reset.softreset = xgene_ahci_softreset,
+	.pmp_reset.softreset = xgene_ahci_pmp_softreset,
 	.read_id = xgene_ahci_read_id,
 	.qc_issue = xgene_ahci_qc_issue,
-	.softreset = xgene_ahci_softreset,
-	.pmp_softreset = xgene_ahci_pmp_softreset
 };
 
 static const struct ata_port_info xgene_ahci_v1_port_info = {
@@ -630,7 +630,7 @@ static const struct ata_port_info xgene_ahci_v1_port_info = {
 static struct ata_port_operations xgene_ahci_v2_ops = {
 	.inherits = &ahci_ops,
 	.host_stop = xgene_ahci_host_stop,
-	.hardreset = xgene_ahci_hardreset,
+	.reset.hardreset = xgene_ahci_hardreset,
 	.read_id = xgene_ahci_read_id,
 };
 
diff --git a/drivers/ata/ata_piix.c b/drivers/ata/ata_piix.c
index d441246fa357..229429ba5027 100644
--- a/drivers/ata/ata_piix.c
+++ b/drivers/ata/ata_piix.c
@@ -1074,7 +1074,7 @@ static struct ata_port_operations piix_pata_ops = {
 	.cable_detect		= ata_cable_40wire,
 	.set_piomode		= piix_set_piomode,
 	.set_dmamode		= piix_set_dmamode,
-	.prereset		= piix_pata_prereset,
+	.reset.prereset		= piix_pata_prereset,
 };
 
 static struct ata_port_operations piix_vmw_ops = {
@@ -1102,7 +1102,7 @@ static const struct scsi_host_template piix_sidpr_sht = {
 
 static struct ata_port_operations piix_sidpr_sata_ops = {
 	.inherits		= &piix_sata_ops,
-	.hardreset		= sata_std_hardreset,
+	.reset.hardreset	= sata_std_hardreset,
 	.scr_read		= piix_sidpr_scr_read,
 	.scr_write		= piix_sidpr_scr_write,
 	.set_lpm		= piix_sidpr_set_lpm,
diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c
index 4e9c82f36df1..b335fb7e5cb4 100644
--- a/drivers/ata/libahci.c
+++ b/drivers/ata/libahci.c
@@ -162,10 +162,10 @@ struct ata_port_operations ahci_ops = {
 
 	.freeze			= ahci_freeze,
 	.thaw			= ahci_thaw,
-	.softreset		= ahci_softreset,
-	.hardreset		= ahci_hardreset,
-	.postreset		= ahci_postreset,
-	.pmp_softreset		= ahci_softreset,
+	.reset.softreset	= ahci_softreset,
+	.reset.hardreset	= ahci_hardreset,
+	.reset.postreset	= ahci_postreset,
+	.pmp_reset.softreset	= ahci_softreset,
 	.error_handler		= ahci_error_handler,
 	.post_internal_cmd	= ahci_post_internal_cmd,
 	.dev_config		= ahci_dev_config,
@@ -192,7 +192,7 @@ EXPORT_SYMBOL_GPL(ahci_ops);
 
 struct ata_port_operations ahci_pmp_retry_srst_ops = {
 	.inherits		= &ahci_ops,
-	.softreset		= ahci_pmp_retry_softreset,
+	.reset.softreset	= ahci_pmp_retry_softreset,
 };
 EXPORT_SYMBOL_GPL(ahci_pmp_retry_srst_ops);
 
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index bbf1318a2b9a..97d9f0488cc1 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -65,8 +65,8 @@
 #include "libata-transport.h"
 
 const struct ata_port_operations ata_base_port_ops = {
-	.prereset		= ata_std_prereset,
-	.postreset		= ata_std_postreset,
+	.reset.prereset		= ata_std_prereset,
+	.reset.postreset	= ata_std_postreset,
 	.error_handler		= ata_std_error_handler,
 	.sched_eh		= ata_std_sched_eh,
 	.end_eh			= ata_std_end_eh,
diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 30c831e56a7f..2946ae6d4b2c 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -2793,13 +2793,16 @@ static bool ata_eh_followup_srst_needed(struct ata_link *link, int rc)
 }
 
 int ata_eh_reset(struct ata_link *link, int classify,
-		 ata_prereset_fn_t prereset, ata_reset_fn_t softreset,
-		 ata_reset_fn_t hardreset, ata_postreset_fn_t postreset)
+		 struct ata_reset_operations *reset_ops)
 {
 	struct ata_port *ap = link->ap;
 	struct ata_link *slave = ap->slave_link;
 	struct ata_eh_context *ehc = &link->eh_context;
 	struct ata_eh_context *sehc = slave ? &slave->eh_context : NULL;
+	ata_reset_fn_t hardreset = reset_ops->hardreset;
+	ata_reset_fn_t softreset = reset_ops->softreset;
+	ata_prereset_fn_t prereset = reset_ops->prereset;
+	ata_postreset_fn_t postreset = reset_ops->postreset;
 	unsigned int *classes = ehc->classes;
 	unsigned int lflags = link->flags;
 	int verbose = !(ehc->i.flags & ATA_EHI_QUIET);
@@ -3756,10 +3759,7 @@ static int ata_eh_handle_dev_fail(struct ata_device *dev, int err)
 /**
  *	ata_eh_recover - recover host port after error
  *	@ap: host port to recover
- *	@prereset: prereset method (can be NULL)
- *	@softreset: softreset method (can be NULL)
- *	@hardreset: hardreset method (can be NULL)
- *	@postreset: postreset method (can be NULL)
+ *	@reset_ops: The set of reset operations to use
  *	@r_failed_link: out parameter for failed link
  *
  *	This is the alpha and omega, eum and yang, heart and soul of
@@ -3775,9 +3775,7 @@ static int ata_eh_handle_dev_fail(struct ata_device *dev, int err)
  *	RETURNS:
  *	0 on success, -errno on failure.
  */
-int ata_eh_recover(struct ata_port *ap, ata_prereset_fn_t prereset,
-		   ata_reset_fn_t softreset, ata_reset_fn_t hardreset,
-		   ata_postreset_fn_t postreset,
+int ata_eh_recover(struct ata_port *ap, struct ata_reset_operations *reset_ops,
 		   struct ata_link **r_failed_link)
 {
 	struct ata_link *link;
@@ -3845,8 +3843,7 @@ int ata_eh_recover(struct ata_port *ap, ata_prereset_fn_t prereset,
 		if (!(ehc->i.action & ATA_EH_RESET))
 			continue;
 
-		rc = ata_eh_reset(link, ata_link_nr_vacant(link),
-				  prereset, softreset, hardreset, postreset);
+		rc = ata_eh_reset(link, ata_link_nr_vacant(link), reset_ops);
 		if (rc) {
 			ata_link_err(link, "reset failed, giving up\n");
 			goto out;
@@ -4077,20 +4074,20 @@ void ata_eh_finish(struct ata_port *ap)
  */
 void ata_std_error_handler(struct ata_port *ap)
 {
-	struct ata_port_operations *ops = ap->ops;
+	struct ata_reset_operations *reset_ops = &ap->ops->reset;
 	struct ata_link *link = &ap->link;
 	int rc;
 
 	/* Ignore built-in hardresets if SCR access is not available */
-	if ((ops->hardreset == sata_std_hardreset ||
-	     ops->hardreset == sata_sff_hardreset) && !sata_scr_valid(link))
+	if ((reset_ops->hardreset == sata_std_hardreset ||
+	     reset_ops->hardreset == sata_sff_hardreset) &&
+	    !sata_scr_valid(link))
 		link->flags |= ATA_LFLAG_NO_HRST;
 
 	ata_eh_autopsy(ap);
 	ata_eh_report(ap);
 
-	rc = ata_eh_recover(ap, ops->prereset, ops->softreset,
-			    ops->hardreset, ops->postreset, NULL);
+	rc = ata_eh_recover(ap, reset_ops, NULL);
 	if (rc) {
 		struct ata_device *dev;
 
diff --git a/drivers/ata/libata-pmp.c b/drivers/ata/libata-pmp.c
index d5d189328ae6..57023324a56f 100644
--- a/drivers/ata/libata-pmp.c
+++ b/drivers/ata/libata-pmp.c
@@ -15,9 +15,9 @@
 
 const struct ata_port_operations sata_pmp_port_ops = {
 	.inherits		= &sata_port_ops,
-	.pmp_prereset		= ata_std_prereset,
-	.pmp_hardreset		= sata_std_hardreset,
-	.pmp_postreset		= ata_std_postreset,
+	.pmp_reset.prereset	= ata_std_prereset,
+	.pmp_reset.hardreset	= sata_std_hardreset,
+	.pmp_reset.postreset	= ata_std_postreset,
 	.error_handler		= sata_pmp_error_handler,
 };
 
@@ -727,10 +727,7 @@ static int sata_pmp_revalidate_quick(struct ata_device *dev)
 /**
  *	sata_pmp_eh_recover_pmp - recover PMP
  *	@ap: ATA port PMP is attached to
- *	@prereset: prereset method (can be NULL)
- *	@softreset: softreset method
- *	@hardreset: hardreset method
- *	@postreset: postreset method (can be NULL)
+ *	@reset_ops: The set of reset operations to use
  *
  *	Recover PMP attached to @ap.  Recovery procedure is somewhat
  *	similar to that of ata_eh_recover() except that reset should
@@ -744,8 +741,7 @@ static int sata_pmp_revalidate_quick(struct ata_device *dev)
  *	0 on success, -errno on failure.
  */
 static int sata_pmp_eh_recover_pmp(struct ata_port *ap,
-		ata_prereset_fn_t prereset, ata_reset_fn_t softreset,
-		ata_reset_fn_t hardreset, ata_postreset_fn_t postreset)
+				   struct ata_reset_operations *reset_ops)
 {
 	struct ata_link *link = &ap->link;
 	struct ata_eh_context *ehc = &link->eh_context;
@@ -767,8 +763,7 @@ static int sata_pmp_eh_recover_pmp(struct ata_port *ap,
 		struct ata_link *tlink;
 
 		/* reset */
-		rc = ata_eh_reset(link, 0, prereset, softreset, hardreset,
-				  postreset);
+		rc = ata_eh_reset(link, 0, reset_ops);
 		if (rc) {
 			ata_link_err(link, "failed to reset PMP, giving up\n");
 			goto fail;
@@ -932,8 +927,7 @@ static int sata_pmp_eh_recover(struct ata_port *ap)
  retry:
 	/* PMP attached? */
 	if (!sata_pmp_attached(ap)) {
-		rc = ata_eh_recover(ap, ops->prereset, ops->softreset,
-				    ops->hardreset, ops->postreset, NULL);
+		rc = ata_eh_recover(ap, &ops->reset, NULL);
 		if (rc) {
 			ata_for_each_dev(dev, &ap->link, ALL)
 				ata_dev_disable(dev);
@@ -951,8 +945,7 @@ static int sata_pmp_eh_recover(struct ata_port *ap)
 	}
 
 	/* recover pmp */
-	rc = sata_pmp_eh_recover_pmp(ap, ops->prereset, ops->softreset,
-				     ops->hardreset, ops->postreset);
+	rc = sata_pmp_eh_recover_pmp(ap, &ops->reset);
 	if (rc)
 		goto pmp_fail;
 
@@ -978,8 +971,7 @@ static int sata_pmp_eh_recover(struct ata_port *ap)
 		goto pmp_fail;
 
 	/* recover links */
-	rc = ata_eh_recover(ap, ops->pmp_prereset, ops->pmp_softreset,
-			    ops->pmp_hardreset, ops->pmp_postreset, &link);
+	rc = ata_eh_recover(ap, &ops->pmp_reset, &link);
 	if (rc)
 		goto link_fail;
 
diff --git a/drivers/ata/libata-sata.c b/drivers/ata/libata-sata.c
index 47169c469f43..4734465d3b1e 100644
--- a/drivers/ata/libata-sata.c
+++ b/drivers/ata/libata-sata.c
@@ -1704,6 +1704,6 @@ const struct ata_port_operations sata_port_ops = {
 	.inherits		= &ata_base_port_ops,
 
 	.qc_defer		= ata_std_qc_defer,
-	.hardreset		= sata_std_hardreset,
+	.reset.hardreset	= sata_std_hardreset,
 };
 EXPORT_SYMBOL_GPL(sata_port_ops);
diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
index e61f00779e40..7fc407255eb4 100644
--- a/drivers/ata/libata-sff.c
+++ b/drivers/ata/libata-sff.c
@@ -31,10 +31,10 @@ const struct ata_port_operations ata_sff_port_ops = {
 
 	.freeze			= ata_sff_freeze,
 	.thaw			= ata_sff_thaw,
-	.prereset		= ata_sff_prereset,
-	.softreset		= ata_sff_softreset,
-	.hardreset		= sata_sff_hardreset,
-	.postreset		= ata_sff_postreset,
+	.reset.prereset		= ata_sff_prereset,
+	.reset.softreset	= ata_sff_softreset,
+	.reset.hardreset	= sata_sff_hardreset,
+	.reset.postreset	= ata_sff_postreset,
 	.error_handler		= ata_sff_error_handler,
 
 	.sff_dev_select		= ata_sff_dev_select,
diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h
index ca44fb792aeb..e5b977a8d3e1 100644
--- a/drivers/ata/libata.h
+++ b/drivers/ata/libata.h
@@ -180,11 +180,9 @@ extern void ata_eh_autopsy(struct ata_port *ap);
 const char *ata_get_cmd_name(u8 command);
 extern void ata_eh_report(struct ata_port *ap);
 extern int ata_eh_reset(struct ata_link *link, int classify,
-			ata_prereset_fn_t prereset, ata_reset_fn_t softreset,
-			ata_reset_fn_t hardreset, ata_postreset_fn_t postreset);
-extern int ata_eh_recover(struct ata_port *ap, ata_prereset_fn_t prereset,
-			  ata_reset_fn_t softreset, ata_reset_fn_t hardreset,
-			  ata_postreset_fn_t postreset,
+			struct ata_reset_operations *reset_ops);
+extern int ata_eh_recover(struct ata_port *ap,
+			  struct ata_reset_operations *reset_ops,
 			  struct ata_link **r_failed_disk);
 extern void ata_eh_finish(struct ata_port *ap);
 extern int ata_ering_map(struct ata_ering *ering,
diff --git a/drivers/ata/pata_acpi.c b/drivers/ata/pata_acpi.c
index ab38871b5e00..23fff10af2ac 100644
--- a/drivers/ata/pata_acpi.c
+++ b/drivers/ata/pata_acpi.c
@@ -216,7 +216,7 @@ static struct ata_port_operations pacpi_ops = {
 	.mode_filter		= pacpi_mode_filter,
 	.set_piomode		= pacpi_set_piomode,
 	.set_dmamode		= pacpi_set_dmamode,
-	.prereset		= pacpi_pre_reset,
+	.reset.prereset		= pacpi_pre_reset,
 	.port_start		= pacpi_port_start,
 };
 
diff --git a/drivers/ata/pata_ali.c b/drivers/ata/pata_ali.c
index bb790edd6036..9d5cb9c34c52 100644
--- a/drivers/ata/pata_ali.c
+++ b/drivers/ata/pata_ali.c
@@ -392,11 +392,11 @@ static struct ata_port_operations ali_20_port_ops = {
  *	Port operations for DMA capable ALi with cable detect
  */
 static struct ata_port_operations ali_c2_port_ops = {
-	.inherits	= &ali_dma_base_ops,
-	.check_atapi_dma = ali_check_atapi_dma,
-	.cable_detect	= ali_c2_cable_detect,
-	.dev_config	= ali_lock_sectors,
-	.postreset	= ali_c2_c3_postreset,
+	.inherits		= &ali_dma_base_ops,
+	.check_atapi_dma	= ali_check_atapi_dma,
+	.cable_detect		= ali_c2_cable_detect,
+	.dev_config		= ali_lock_sectors,
+	.reset.postreset	= ali_c2_c3_postreset,
 };
 
 /*
diff --git a/drivers/ata/pata_amd.c b/drivers/ata/pata_amd.c
index 5b02b89748b7..a2fecadc927d 100644
--- a/drivers/ata/pata_amd.c
+++ b/drivers/ata/pata_amd.c
@@ -394,7 +394,7 @@ static const struct scsi_host_template amd_sht = {
 
 static const struct ata_port_operations amd_base_port_ops = {
 	.inherits	= &ata_bmdma32_port_ops,
-	.prereset	= amd_pre_reset,
+	.reset.prereset	= amd_pre_reset,
 };
 
 static struct ata_port_operations amd33_port_ops = {
@@ -429,7 +429,7 @@ static const struct ata_port_operations nv_base_port_ops = {
 	.inherits	= &ata_bmdma_port_ops,
 	.cable_detect	= ata_cable_ignore,
 	.mode_filter	= nv_mode_filter,
-	.prereset	= nv_pre_reset,
+	.reset.prereset	= nv_pre_reset,
 	.host_stop	= nv_host_stop,
 };
 
diff --git a/drivers/ata/pata_artop.c b/drivers/ata/pata_artop.c
index 40544282f455..6160414172a3 100644
--- a/drivers/ata/pata_artop.c
+++ b/drivers/ata/pata_artop.c
@@ -301,7 +301,7 @@ static struct ata_port_operations artop6210_ops = {
 	.cable_detect		= ata_cable_40wire,
 	.set_piomode		= artop6210_set_piomode,
 	.set_dmamode		= artop6210_set_dmamode,
-	.prereset		= artop62x0_pre_reset,
+	.reset.prereset		= artop62x0_pre_reset,
 	.qc_defer		= artop6210_qc_defer,
 };
 
@@ -310,7 +310,7 @@ static struct ata_port_operations artop6260_ops = {
 	.cable_detect		= artop6260_cable_detect,
 	.set_piomode		= artop6260_set_piomode,
 	.set_dmamode		= artop6260_set_dmamode,
-	.prereset		= artop62x0_pre_reset,
+	.reset.prereset		= artop62x0_pre_reset,
 };
 
 static void atp8xx_fixup(struct pci_dev *pdev)
diff --git a/drivers/ata/pata_atiixp.c b/drivers/ata/pata_atiixp.c
index 8c5cc803aab3..4c612f9543f6 100644
--- a/drivers/ata/pata_atiixp.c
+++ b/drivers/ata/pata_atiixp.c
@@ -264,7 +264,7 @@ static struct ata_port_operations atiixp_port_ops = {
 	.bmdma_start 	= atiixp_bmdma_start,
 	.bmdma_stop	= atiixp_bmdma_stop,
 
-	.prereset	= atiixp_prereset,
+	.reset.prereset	= atiixp_prereset,
 	.cable_detect	= atiixp_cable_detect,
 	.set_piomode	= atiixp_set_piomode,
 	.set_dmamode	= atiixp_set_dmamode,
diff --git a/drivers/ata/pata_efar.c b/drivers/ata/pata_efar.c
index 2e6eccf2902f..6fe49b303fee 100644
--- a/drivers/ata/pata_efar.c
+++ b/drivers/ata/pata_efar.c
@@ -243,7 +243,7 @@ static struct ata_port_operations efar_ops = {
 	.cable_detect		= efar_cable_detect,
 	.set_piomode		= efar_set_piomode,
 	.set_dmamode		= efar_set_dmamode,
-	.prereset		= efar_pre_reset,
+	.reset.prereset		= efar_pre_reset,
 };
 
 
diff --git a/drivers/ata/pata_ep93xx.c b/drivers/ata/pata_ep93xx.c
index e8cda988feb5..b2b9e0058333 100644
--- a/drivers/ata/pata_ep93xx.c
+++ b/drivers/ata/pata_ep93xx.c
@@ -879,8 +879,8 @@ static const struct scsi_host_template ep93xx_pata_sht = {
 static struct ata_port_operations ep93xx_pata_port_ops = {
 	.inherits		= &ata_bmdma_port_ops,
 
-	.softreset		= ep93xx_pata_softreset,
-	.hardreset		= ATA_OP_NULL,
+	.reset.softreset	= ep93xx_pata_softreset,
+	.reset.hardreset	= ATA_OP_NULL,
 
 	.sff_dev_select		= ep93xx_pata_dev_select,
 	.sff_set_devctl		= ep93xx_pata_set_devctl,
diff --git a/drivers/ata/pata_hpt366.c b/drivers/ata/pata_hpt366.c
index 5280e9960025..b96e8bd2a3f8 100644
--- a/drivers/ata/pata_hpt366.c
+++ b/drivers/ata/pata_hpt366.c
@@ -322,7 +322,7 @@ static const struct scsi_host_template hpt36x_sht = {
 
 static struct ata_port_operations hpt366_port_ops = {
 	.inherits	= &ata_bmdma_port_ops,
-	.prereset	= hpt366_prereset,
+	.reset.prereset	= hpt366_prereset,
 	.cable_detect	= hpt36x_cable_detect,
 	.mode_filter	= hpt366_filter,
 	.set_piomode	= hpt366_set_piomode,
diff --git a/drivers/ata/pata_hpt37x.c b/drivers/ata/pata_hpt37x.c
index 4af22b819416..07e3a984cbb1 100644
--- a/drivers/ata/pata_hpt37x.c
+++ b/drivers/ata/pata_hpt37x.c
@@ -543,7 +543,7 @@ static struct ata_port_operations hpt370_port_ops = {
 	.cable_detect	= hpt37x_cable_detect,
 	.set_piomode	= hpt37x_set_piomode,
 	.set_dmamode	= hpt37x_set_dmamode,
-	.prereset	= hpt37x_pre_reset,
+	.reset.prereset	= hpt37x_pre_reset,
 };
 
 /*
@@ -567,7 +567,7 @@ static struct ata_port_operations hpt302_port_ops = {
 	.cable_detect	= hpt37x_cable_detect,
 	.set_piomode	= hpt37x_set_piomode,
 	.set_dmamode	= hpt37x_set_dmamode,
-	.prereset	= hpt37x_pre_reset,
+	.reset.prereset	= hpt37x_pre_reset,
 };
 
 /*
diff --git a/drivers/ata/pata_hpt3x2n.c b/drivers/ata/pata_hpt3x2n.c
index 5b1ecccf3c83..2cc57fcf2c46 100644
--- a/drivers/ata/pata_hpt3x2n.c
+++ b/drivers/ata/pata_hpt3x2n.c
@@ -356,7 +356,7 @@ static struct ata_port_operations hpt3xxn_port_ops = {
 	.cable_detect	= hpt3x2n_cable_detect,
 	.set_piomode	= hpt3x2n_set_piomode,
 	.set_dmamode	= hpt3x2n_set_dmamode,
-	.prereset	= hpt3x2n_pre_reset,
+	.reset.prereset	= hpt3x2n_pre_reset,
 };
 
 /*
diff --git a/drivers/ata/pata_icside.c b/drivers/ata/pata_icside.c
index 61d8760f09d9..70f056e47e6b 100644
--- a/drivers/ata/pata_icside.c
+++ b/drivers/ata/pata_icside.c
@@ -336,7 +336,7 @@ static struct ata_port_operations pata_icside_port_ops = {
 
 	.cable_detect		= ata_cable_40wire,
 	.set_dmamode		= pata_icside_set_dmamode,
-	.postreset		= pata_icside_postreset,
+	.reset.postreset	= pata_icside_postreset,
 
 	.port_start		= ATA_OP_NULL,	/* don't need PRD table */
 };
diff --git a/drivers/ata/pata_it8213.c b/drivers/ata/pata_it8213.c
index 9cbe2132ce59..a6f2cfc1602e 100644
--- a/drivers/ata/pata_it8213.c
+++ b/drivers/ata/pata_it8213.c
@@ -238,7 +238,7 @@ static struct ata_port_operations it8213_ops = {
 	.cable_detect		= it8213_cable_detect,
 	.set_piomode		= it8213_set_piomode,
 	.set_dmamode		= it8213_set_dmamode,
-	.prereset		= it8213_pre_reset,
+	.reset.prereset		= it8213_pre_reset,
 };
 
 
diff --git a/drivers/ata/pata_jmicron.c b/drivers/ata/pata_jmicron.c
index f51fb8219762..b885f33e8980 100644
--- a/drivers/ata/pata_jmicron.c
+++ b/drivers/ata/pata_jmicron.c
@@ -113,7 +113,7 @@ static const struct scsi_host_template jmicron_sht = {
 
 static struct ata_port_operations jmicron_ops = {
 	.inherits		= &ata_bmdma_port_ops,
-	.prereset		= jmicron_pre_reset,
+	.reset.prereset		= jmicron_pre_reset,
 };
 
 
diff --git a/drivers/ata/pata_marvell.c b/drivers/ata/pata_marvell.c
index 8119caaad605..deab67328388 100644
--- a/drivers/ata/pata_marvell.c
+++ b/drivers/ata/pata_marvell.c
@@ -99,7 +99,7 @@ static const struct scsi_host_template marvell_sht = {
 static struct ata_port_operations marvell_ops = {
 	.inherits		= &ata_bmdma_port_ops,
 	.cable_detect		= marvell_cable_detect,
-	.prereset		= marvell_pre_reset,
+	.reset.prereset		= marvell_pre_reset,
 };
 
 
diff --git a/drivers/ata/pata_mpiix.c b/drivers/ata/pata_mpiix.c
index 69e4baf27d72..ce310ae7c93a 100644
--- a/drivers/ata/pata_mpiix.c
+++ b/drivers/ata/pata_mpiix.c
@@ -145,7 +145,7 @@ static struct ata_port_operations mpiix_port_ops = {
 	.qc_issue	= mpiix_qc_issue,
 	.cable_detect	= ata_cable_40wire,
 	.set_piomode	= mpiix_set_piomode,
-	.prereset	= mpiix_pre_reset,
+	.reset.prereset	= mpiix_pre_reset,
 	.sff_data_xfer	= ata_sff_data_xfer32,
 };
 
diff --git a/drivers/ata/pata_ns87410.c b/drivers/ata/pata_ns87410.c
index 44cc24d21d5f..bdb55c1a3280 100644
--- a/drivers/ata/pata_ns87410.c
+++ b/drivers/ata/pata_ns87410.c
@@ -123,7 +123,7 @@ static struct ata_port_operations ns87410_port_ops = {
 	.qc_issue	= ns87410_qc_issue,
 	.cable_detect	= ata_cable_40wire,
 	.set_piomode	= ns87410_set_piomode,
-	.prereset	= ns87410_pre_reset,
+	.reset.prereset	= ns87410_pre_reset,
 };
 
 static int ns87410_init_one(struct pci_dev *dev, const struct pci_device_id *id)
diff --git a/drivers/ata/pata_octeon_cf.c b/drivers/ata/pata_octeon_cf.c
index 2d32125c16fd..df42ebe98db7 100644
--- a/drivers/ata/pata_octeon_cf.c
+++ b/drivers/ata/pata_octeon_cf.c
@@ -941,7 +941,7 @@ static int octeon_cf_probe(struct platform_device *pdev)
 		/* 16 bit but not True IDE */
 		base = cs0 + 0x800;
 		octeon_cf_ops.sff_data_xfer	= octeon_cf_data_xfer16;
-		octeon_cf_ops.softreset		= octeon_cf_softreset16;
+		octeon_cf_ops.reset.softreset	= octeon_cf_softreset16;
 		octeon_cf_ops.sff_check_status	= octeon_cf_check_status16;
 		octeon_cf_ops.sff_tf_read	= octeon_cf_tf_read16;
 		octeon_cf_ops.sff_tf_load	= octeon_cf_tf_load16;
diff --git a/drivers/ata/pata_oldpiix.c b/drivers/ata/pata_oldpiix.c
index 3d01b7000e41..81a7f3eb5654 100644
--- a/drivers/ata/pata_oldpiix.c
+++ b/drivers/ata/pata_oldpiix.c
@@ -214,7 +214,7 @@ static struct ata_port_operations oldpiix_pata_ops = {
 	.cable_detect		= ata_cable_40wire,
 	.set_piomode		= oldpiix_set_piomode,
 	.set_dmamode		= oldpiix_set_dmamode,
-	.prereset		= oldpiix_pre_reset,
+	.reset.prereset		= oldpiix_pre_reset,
 };
 
 
diff --git a/drivers/ata/pata_opti.c b/drivers/ata/pata_opti.c
index 3d23f57eb128..3db1b95d1404 100644
--- a/drivers/ata/pata_opti.c
+++ b/drivers/ata/pata_opti.c
@@ -156,7 +156,7 @@ static struct ata_port_operations opti_port_ops = {
 	.inherits	= &ata_sff_port_ops,
 	.cable_detect	= ata_cable_40wire,
 	.set_piomode	= opti_set_piomode,
-	.prereset	= opti_pre_reset,
+	.reset.prereset	= opti_pre_reset,
 };
 
 static int opti_init_one(struct pci_dev *dev, const struct pci_device_id *id)
diff --git a/drivers/ata/pata_optidma.c b/drivers/ata/pata_optidma.c
index cc876dc7a9d8..b42dba5f4e05 100644
--- a/drivers/ata/pata_optidma.c
+++ b/drivers/ata/pata_optidma.c
@@ -346,7 +346,7 @@ static struct ata_port_operations optidma_port_ops = {
 	.set_piomode	= optidma_set_pio_mode,
 	.set_dmamode	= optidma_set_dma_mode,
 	.set_mode	= optidma_set_mode,
-	.prereset	= optidma_pre_reset,
+	.reset.prereset	= optidma_pre_reset,
 };
 
 static struct ata_port_operations optiplus_port_ops = {
diff --git a/drivers/ata/pata_parport/pata_parport.c b/drivers/ata/pata_parport/pata_parport.c
index 93ebf566b54e..22bd3ff6b7ae 100644
--- a/drivers/ata/pata_parport/pata_parport.c
+++ b/drivers/ata/pata_parport/pata_parport.c
@@ -321,8 +321,8 @@ static void pata_parport_drain_fifo(struct ata_queued_cmd *qc)
 static struct ata_port_operations pata_parport_port_ops = {
 	.inherits		= &ata_sff_port_ops,
 
-	.softreset		= pata_parport_softreset,
-	.hardreset		= NULL,
+	.reset.softreset	= pata_parport_softreset,
+	.reset.hardreset	= NULL,
 
 	.sff_dev_select		= pata_parport_dev_select,
 	.sff_set_devctl		= pata_parport_set_devctl,
diff --git a/drivers/ata/pata_pdc2027x.c b/drivers/ata/pata_pdc2027x.c
index a4ee3b92c9aa..d792ce6d97bf 100644
--- a/drivers/ata/pata_pdc2027x.c
+++ b/drivers/ata/pata_pdc2027x.c
@@ -130,7 +130,7 @@ static struct ata_port_operations pdc2027x_pata100_ops = {
 	.inherits		= &ata_bmdma_port_ops,
 	.check_atapi_dma	= pdc2027x_check_atapi_dma,
 	.cable_detect		= pdc2027x_cable_detect,
-	.prereset		= pdc2027x_prereset,
+	.reset.prereset		= pdc2027x_prereset,
 };
 
 static struct ata_port_operations pdc2027x_pata133_ops = {
diff --git a/drivers/ata/pata_rdc.c b/drivers/ata/pata_rdc.c
index 19cbb5c94b42..6ff4c11e937d 100644
--- a/drivers/ata/pata_rdc.c
+++ b/drivers/ata/pata_rdc.c
@@ -276,7 +276,7 @@ static struct ata_port_operations rdc_pata_ops = {
 	.cable_detect		= rdc_pata_cable_detect,
 	.set_piomode		= rdc_set_piomode,
 	.set_dmamode		= rdc_set_dmamode,
-	.prereset		= rdc_pata_prereset,
+	.reset.prereset		= rdc_pata_prereset,
 };
 
 static const struct ata_port_info rdc_port_info = {
diff --git a/drivers/ata/pata_sis.c b/drivers/ata/pata_sis.c
index 31de06b66221..2b751e393771 100644
--- a/drivers/ata/pata_sis.c
+++ b/drivers/ata/pata_sis.c
@@ -552,7 +552,7 @@ static struct ata_port_operations sis_133_for_sata_ops = {
 
 static struct ata_port_operations sis_base_ops = {
 	.inherits		= &ata_bmdma_port_ops,
-	.prereset		= sis_pre_reset,
+	.reset.prereset		= sis_pre_reset,
 };
 
 static struct ata_port_operations sis_133_ops = {
diff --git a/drivers/ata/pata_sl82c105.c b/drivers/ata/pata_sl82c105.c
index 93882e976ede..2d24c6b3e9d9 100644
--- a/drivers/ata/pata_sl82c105.c
+++ b/drivers/ata/pata_sl82c105.c
@@ -248,7 +248,7 @@ static struct ata_port_operations sl82c105_port_ops = {
 	.bmdma_stop	= sl82c105_bmdma_stop,
 	.cable_detect	= ata_cable_40wire,
 	.set_piomode	= sl82c105_set_piomode,
-	.prereset	= sl82c105_pre_reset,
+	.reset.prereset	= sl82c105_pre_reset,
 	.sff_irq_check	= sl82c105_sff_irq_check,
 };
 
diff --git a/drivers/ata/pata_triflex.c b/drivers/ata/pata_triflex.c
index 26d448a869e2..596e86a031b3 100644
--- a/drivers/ata/pata_triflex.c
+++ b/drivers/ata/pata_triflex.c
@@ -170,7 +170,7 @@ static struct ata_port_operations triflex_port_ops = {
 	.bmdma_stop	= triflex_bmdma_stop,
 	.cable_detect	= ata_cable_40wire,
 	.set_piomode	= triflex_set_piomode,
-	.prereset	= triflex_prereset,
+	.reset.prereset	= triflex_prereset,
 };
 
 static int triflex_init_one(struct pci_dev *dev, const struct pci_device_id *id)
diff --git a/drivers/ata/pata_via.c b/drivers/ata/pata_via.c
index bb80e7800dcb..a8c9cf685b4b 100644
--- a/drivers/ata/pata_via.c
+++ b/drivers/ata/pata_via.c
@@ -451,7 +451,7 @@ static struct ata_port_operations via_port_ops = {
 	.cable_detect	= via_cable_detect,
 	.set_piomode	= via_set_piomode,
 	.set_dmamode	= via_set_dmamode,
-	.prereset	= via_pre_reset,
+	.reset.prereset	= via_pre_reset,
 	.sff_tf_load	= via_tf_load,
 	.port_start	= via_port_start,
 	.mode_filter	= via_mode_filter,
diff --git a/drivers/ata/pdc_adma.c b/drivers/ata/pdc_adma.c
index 8e6b2599f0d5..17a5a59861c3 100644
--- a/drivers/ata/pdc_adma.c
+++ b/drivers/ata/pdc_adma.c
@@ -140,7 +140,7 @@ static struct ata_port_operations adma_ata_ops = {
 
 	.freeze			= adma_freeze,
 	.thaw			= adma_thaw,
-	.prereset		= adma_prereset,
+	.reset.prereset		= adma_prereset,
 
 	.port_start		= adma_port_start,
 	.port_stop		= adma_port_stop,
diff --git a/drivers/ata/sata_dwc_460ex.c b/drivers/ata/sata_dwc_460ex.c
index 6e1dd0d9c035..7a4f59202156 100644
--- a/drivers/ata/sata_dwc_460ex.c
+++ b/drivers/ata/sata_dwc_460ex.c
@@ -1097,7 +1097,7 @@ static struct ata_port_operations sata_dwc_ops = {
 	.inherits		= &ata_sff_port_ops,
 
 	.error_handler		= sata_dwc_error_handler,
-	.hardreset		= sata_dwc_hardreset,
+	.reset.hardreset	= sata_dwc_hardreset,
 
 	.qc_issue		= sata_dwc_qc_issue,
 
diff --git a/drivers/ata/sata_fsl.c b/drivers/ata/sata_fsl.c
index 87e91a937a44..84da8d6ef28e 100644
--- a/drivers/ata/sata_fsl.c
+++ b/drivers/ata/sata_fsl.c
@@ -1395,9 +1395,9 @@ static struct ata_port_operations sata_fsl_ops = {
 
 	.freeze = sata_fsl_freeze,
 	.thaw = sata_fsl_thaw,
-	.softreset = sata_fsl_softreset,
-	.hardreset = sata_fsl_hardreset,
-	.pmp_softreset = sata_fsl_softreset,
+	.reset.softreset = sata_fsl_softreset,
+	.reset.hardreset = sata_fsl_hardreset,
+	.pmp_reset.softreset = sata_fsl_softreset,
 	.error_handler = sata_fsl_error_handler,
 	.post_internal_cmd = sata_fsl_post_internal_cmd,
 
diff --git a/drivers/ata/sata_highbank.c b/drivers/ata/sata_highbank.c
index c8c817c51230..3421039f4bae 100644
--- a/drivers/ata/sata_highbank.c
+++ b/drivers/ata/sata_highbank.c
@@ -428,7 +428,7 @@ static int ahci_highbank_hardreset(struct ata_link *link, unsigned int *class,
 
 static struct ata_port_operations ahci_highbank_ops = {
 	.inherits		= &ahci_ops,
-	.hardreset		= ahci_highbank_hardreset,
+	.reset.hardreset	= ahci_highbank_hardreset,
 	.transmit_led_message   = ecx_transmit_led_message,
 };
 
diff --git a/drivers/ata/sata_inic162x.c b/drivers/ata/sata_inic162x.c
index db9c255dc9f2..46a8c20daf18 100644
--- a/drivers/ata/sata_inic162x.c
+++ b/drivers/ata/sata_inic162x.c
@@ -730,7 +730,7 @@ static struct ata_port_operations inic_port_ops = {
 
 	.freeze			= inic_freeze,
 	.thaw			= inic_thaw,
-	.hardreset		= inic_hardreset,
+	.reset.hardreset	= inic_hardreset,
 	.error_handler		= inic_error_handler,
 	.post_internal_cmd	= inic_post_internal_cmd,
 
diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c
index bcbf96867f89..ffb396f61731 100644
--- a/drivers/ata/sata_mv.c
+++ b/drivers/ata/sata_mv.c
@@ -687,7 +687,7 @@ static struct ata_port_operations mv5_ops = {
 
 	.freeze			= mv_eh_freeze,
 	.thaw			= mv_eh_thaw,
-	.hardreset		= mv_hardreset,
+	.reset.hardreset	= mv_hardreset,
 
 	.scr_read		= mv5_scr_read,
 	.scr_write		= mv5_scr_write,
@@ -709,10 +709,10 @@ static struct ata_port_operations mv6_ops = {
 
 	.freeze			= mv_eh_freeze,
 	.thaw			= mv_eh_thaw,
-	.hardreset		= mv_hardreset,
-	.softreset		= mv_softreset,
-	.pmp_hardreset		= mv_pmp_hardreset,
-	.pmp_softreset		= mv_softreset,
+	.reset.hardreset	= mv_hardreset,
+	.reset.softreset	= mv_softreset,
+	.pmp_reset.hardreset	= mv_pmp_hardreset,
+	.pmp_reset.softreset	= mv_softreset,
 	.error_handler		= mv_pmp_error_handler,
 
 	.scr_read		= mv_scr_read,
diff --git a/drivers/ata/sata_nv.c b/drivers/ata/sata_nv.c
index f36e2915ccf1..841e7de2bba6 100644
--- a/drivers/ata/sata_nv.c
+++ b/drivers/ata/sata_nv.c
@@ -462,7 +462,7 @@ static struct ata_port_operations nv_generic_ops = {
 	.lost_interrupt		= ATA_OP_NULL,
 	.scr_read		= nv_scr_read,
 	.scr_write		= nv_scr_write,
-	.hardreset		= nv_hardreset,
+	.reset.hardreset	= nv_hardreset,
 };
 
 static struct ata_port_operations nv_nf2_ops = {
diff --git a/drivers/ata/sata_promise.c b/drivers/ata/sata_promise.c
index 2df1a070b25a..2a005aede123 100644
--- a/drivers/ata/sata_promise.c
+++ b/drivers/ata/sata_promise.c
@@ -188,7 +188,7 @@ static struct ata_port_operations pdc_sata_ops = {
 	.scr_read		= pdc_sata_scr_read,
 	.scr_write		= pdc_sata_scr_write,
 	.port_start		= pdc_sata_port_start,
-	.hardreset		= pdc_sata_hardreset,
+	.reset.hardreset	= pdc_sata_hardreset,
 };
 
 /* First-generation chips need a more restrictive ->check_atapi_dma op,
@@ -206,7 +206,7 @@ static struct ata_port_operations pdc_pata_ops = {
 	.freeze			= pdc_freeze,
 	.thaw			= pdc_thaw,
 	.port_start		= pdc_common_port_start,
-	.softreset		= pdc_pata_softreset,
+	.reset.softreset	= pdc_pata_softreset,
 };
 
 static const struct ata_port_info pdc_port_info[] = {
diff --git a/drivers/ata/sata_qstor.c b/drivers/ata/sata_qstor.c
index 8a6286159044..cfb9b5b61cd7 100644
--- a/drivers/ata/sata_qstor.c
+++ b/drivers/ata/sata_qstor.c
@@ -123,8 +123,8 @@ static struct ata_port_operations qs_ata_ops = {
 
 	.freeze			= qs_freeze,
 	.thaw			= qs_thaw,
-	.prereset		= qs_prereset,
-	.softreset		= ATA_OP_NULL,
+	.reset.prereset		= qs_prereset,
+	.reset.softreset	= ATA_OP_NULL,
 	.error_handler		= qs_error_handler,
 	.lost_interrupt		= ATA_OP_NULL,
 
diff --git a/drivers/ata/sata_rcar.c b/drivers/ata/sata_rcar.c
index 22820a02d740..487eadd4073f 100644
--- a/drivers/ata/sata_rcar.c
+++ b/drivers/ata/sata_rcar.c
@@ -624,7 +624,7 @@ static struct ata_port_operations sata_rcar_port_ops = {
 
 	.freeze			= sata_rcar_freeze,
 	.thaw			= sata_rcar_thaw,
-	.softreset		= sata_rcar_softreset,
+	.reset.softreset	= sata_rcar_softreset,
 
 	.scr_read		= sata_rcar_scr_read,
 	.scr_write		= sata_rcar_scr_write,
diff --git a/drivers/ata/sata_sil24.c b/drivers/ata/sata_sil24.c
index 87f4cde6a686..d642ece9f07a 100644
--- a/drivers/ata/sata_sil24.c
+++ b/drivers/ata/sata_sil24.c
@@ -393,10 +393,10 @@ static struct ata_port_operations sil24_ops = {
 
 	.freeze			= sil24_freeze,
 	.thaw			= sil24_thaw,
-	.softreset		= sil24_softreset,
-	.hardreset		= sil24_hardreset,
-	.pmp_softreset		= sil24_softreset,
-	.pmp_hardreset		= sil24_pmp_hardreset,
+	.reset.softreset	= sil24_softreset,
+	.reset.hardreset	= sil24_hardreset,
+	.pmp_reset.softreset	= sil24_softreset,
+	.pmp_reset.hardreset	= sil24_pmp_hardreset,
 	.error_handler		= sil24_error_handler,
 	.post_internal_cmd	= sil24_post_internal_cmd,
 	.dev_config		= sil24_dev_config,
diff --git a/drivers/ata/sata_svw.c b/drivers/ata/sata_svw.c
index 598a872f6a08..c5d6aa36c9c3 100644
--- a/drivers/ata/sata_svw.c
+++ b/drivers/ata/sata_svw.c
@@ -340,8 +340,8 @@ static const struct scsi_host_template k2_sata_sht = {
 
 static struct ata_port_operations k2_sata_ops = {
 	.inherits		= &ata_bmdma_port_ops,
-	.softreset              = k2_sata_softreset,
-	.hardreset              = k2_sata_hardreset,
+	.reset.softreset	= k2_sata_softreset,
+	.reset.hardreset	= k2_sata_hardreset,
 	.sff_tf_load		= k2_sata_tf_load,
 	.sff_tf_read		= k2_sata_tf_read,
 	.sff_check_status	= k2_stat_check_status,
diff --git a/drivers/ata/sata_sx4.c b/drivers/ata/sata_sx4.c
index f7f5131af937..0986ebd1eb4e 100644
--- a/drivers/ata/sata_sx4.c
+++ b/drivers/ata/sata_sx4.c
@@ -241,7 +241,7 @@ static struct ata_port_operations pdc_20621_ops = {
 
 	.freeze			= pdc_freeze,
 	.thaw			= pdc_thaw,
-	.softreset		= pdc_softreset,
+	.reset.softreset	= pdc_softreset,
 	.error_handler		= pdc_error_handler,
 	.lost_interrupt		= ATA_OP_NULL,
 	.post_internal_cmd	= pdc_post_internal_cmd,
diff --git a/drivers/ata/sata_uli.c b/drivers/ata/sata_uli.c
index 52894ff49dcb..44985796cc47 100644
--- a/drivers/ata/sata_uli.c
+++ b/drivers/ata/sata_uli.c
@@ -67,7 +67,7 @@ static struct ata_port_operations uli_ops = {
 	.inherits		= &ata_bmdma_port_ops,
 	.scr_read		= uli_scr_read,
 	.scr_write		= uli_scr_write,
-	.hardreset		= ATA_OP_NULL,
+	.reset.hardreset	= ATA_OP_NULL,
 };
 
 static const struct ata_port_info uli_port_info = {
diff --git a/drivers/ata/sata_via.c b/drivers/ata/sata_via.c
index 4ecd8f33b082..68e9003ec2d4 100644
--- a/drivers/ata/sata_via.c
+++ b/drivers/ata/sata_via.c
@@ -120,7 +120,7 @@ static struct ata_port_operations svia_base_ops = {
 static struct ata_port_operations vt6420_sata_ops = {
 	.inherits		= &svia_base_ops,
 	.freeze			= svia_noop_freeze,
-	.prereset		= vt6420_prereset,
+	.reset.prereset		= vt6420_prereset,
 	.bmdma_start		= vt6420_bmdma_start,
 };
 
@@ -140,7 +140,7 @@ static struct ata_port_operations vt6421_sata_ops = {
 
 static struct ata_port_operations vt8251_ops = {
 	.inherits		= &svia_base_ops,
-	.hardreset		= sata_std_hardreset,
+	.reset.hardreset	= sata_std_hardreset,
 	.scr_read		= vt8251_scr_read,
 	.scr_write		= vt8251_scr_write,
 };
diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c
index 7b4e7a61965a..adb9e7a94785 100644
--- a/drivers/scsi/libsas/sas_ata.c
+++ b/drivers/scsi/libsas/sas_ata.c
@@ -559,8 +559,8 @@ static int sas_ata_prereset(struct ata_link *link, unsigned long deadline)
 }
 
 static struct ata_port_operations sas_sata_ops = {
-	.prereset		= sas_ata_prereset,
-	.hardreset		= sas_ata_hard_reset,
+	.reset.prereset		= sas_ata_prereset,
+	.reset.hardreset	= sas_ata_hard_reset,
 	.error_handler		= ata_std_error_handler,
 	.post_internal_cmd	= sas_ata_post_internal,
 	.qc_defer               = ata_std_qc_defer,
diff --git a/include/linux/libata.h b/include/linux/libata.h
index cf0b3fff3198..912ace523880 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -944,6 +944,13 @@ struct ata_port {
  */
 #define ATA_OP_NULL		(void *)(unsigned long)(-ENOENT)
 
+struct ata_reset_operations {
+	ata_prereset_fn_t	prereset;
+	ata_reset_fn_t		softreset;
+	ata_reset_fn_t		hardreset;
+	ata_postreset_fn_t	postreset;
+};
+
 struct ata_port_operations {
 	/*
 	 * Command execution
@@ -970,14 +977,8 @@ struct ata_port_operations {
 
 	void (*freeze)(struct ata_port *ap);
 	void (*thaw)(struct ata_port *ap);
-	ata_prereset_fn_t	prereset;
-	ata_reset_fn_t		softreset;
-	ata_reset_fn_t		hardreset;
-	ata_postreset_fn_t	postreset;
-	ata_prereset_fn_t	pmp_prereset;
-	ata_reset_fn_t		pmp_softreset;
-	ata_reset_fn_t		pmp_hardreset;
-	ata_postreset_fn_t	pmp_postreset;
+	struct ata_reset_operations reset;
+	struct ata_reset_operations pmp_reset;
 	void (*error_handler)(struct ata_port *ap);
 	void (*lost_interrupt)(struct ata_port *ap);
 	void (*post_internal_cmd)(struct ata_queued_cmd *qc);
-- 
cgit v1.2.3


From 95a16160ca1d75c66bf7a1c5e0bcaffb18e7c7fc Mon Sep 17 00:00:00 2001
From: Arunpravin Paneer Selvam <Arunpravin.PaneerSelvam@amd.com>
Date: Wed, 16 Jul 2025 13:21:24 +0530
Subject: drm/amdgpu: Reset the clear flag in buddy during resume
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Added a handler in DRM buddy manager to reset the cleared
  flag for the blocks in the freelist.

- This is necessary because, upon resuming, the VRAM becomes
  cluttered with BIOS data, yet the VRAM backend manager
  believes that everything has been cleared.

v2:
  - Add lock before accessing drm_buddy_clear_reset_blocks(). (Matthew Auld)
  - Force merge the two dirty blocks. (Matthew Auld)
  - Add a new unit test case for this issue. (Matthew Auld)
  - Make this function able to flip the state either way.
    (Matthew Brost)

v3(Matthew Auld):
  - Do merge step first to avoid the use of extra reset flag.

Signed-off-by: Arunpravin Paneer Selvam <Arunpravin.PaneerSelvam@amd.com>
Suggested-by: Christian König <christian.koenig@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Matthew Auld <matthew.auld@intel.com>
Cc: stable@vger.kernel.org
Fixes: a68c7eaa7a8f ("drm/amdgpu: Enable clear page functionality")
Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3812
Signed-off-by: Christian König <christian.koenig@amd.com>
Link: https://lore.kernel.org/r/20250716075125.240637-2-Arunpravin.PaneerSelvam@amd.com
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c   |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h      |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 17 +++++++++++
 drivers/gpu/drm/drm_buddy.c                  | 43 ++++++++++++++++++++++++++++
 include/drm/drm_buddy.h                      |  2 ++
 5 files changed, 65 insertions(+)

(limited to 'include')

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e1bab6a96cb6..4ecacbcb66bb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5193,6 +5193,8 @@ exit:
 		dev->dev->power.disable_depth--;
 #endif
 	}
+
+	amdgpu_vram_mgr_clear_reset_blocks(adev);
 	adev->in_suspend = false;
 
 	if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
index 208b7d1d8a27..450e4bf093b7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
@@ -154,6 +154,7 @@ int amdgpu_vram_mgr_reserve_range(struct amdgpu_vram_mgr *mgr,
 				  uint64_t start, uint64_t size);
 int amdgpu_vram_mgr_query_page_status(struct amdgpu_vram_mgr *mgr,
 				      uint64_t start);
+void amdgpu_vram_mgr_clear_reset_blocks(struct amdgpu_device *adev);
 
 bool amdgpu_res_cpu_visible(struct amdgpu_device *adev,
 			    struct ttm_resource *res);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index abdc52b0895a..07c936e90d8e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -782,6 +782,23 @@ uint64_t amdgpu_vram_mgr_vis_usage(struct amdgpu_vram_mgr *mgr)
 	return atomic64_read(&mgr->vis_usage);
 }
 
+/**
+ * amdgpu_vram_mgr_clear_reset_blocks - reset clear blocks
+ *
+ * @adev: amdgpu device pointer
+ *
+ * Reset the cleared drm buddy blocks.
+ */
+void amdgpu_vram_mgr_clear_reset_blocks(struct amdgpu_device *adev)
+{
+	struct amdgpu_vram_mgr *mgr = &adev->mman.vram_mgr;
+	struct drm_buddy *mm = &mgr->mm;
+
+	mutex_lock(&mgr->lock);
+	drm_buddy_reset_clear(mm, false);
+	mutex_unlock(&mgr->lock);
+}
+
 /**
  * amdgpu_vram_mgr_intersects - test each drm buddy block for intersection
  *
diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 241c855f891f..66aff35f8647 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -404,6 +404,49 @@ drm_get_buddy(struct drm_buddy_block *block)
 }
 EXPORT_SYMBOL(drm_get_buddy);
 
+/**
+ * drm_buddy_reset_clear - reset blocks clear state
+ *
+ * @mm: DRM buddy manager
+ * @is_clear: blocks clear state
+ *
+ * Reset the clear state based on @is_clear value for each block
+ * in the freelist.
+ */
+void drm_buddy_reset_clear(struct drm_buddy *mm, bool is_clear)
+{
+	u64 root_size, size, start;
+	unsigned int order;
+	int i;
+
+	size = mm->size;
+	for (i = 0; i < mm->n_roots; ++i) {
+		order = ilog2(size) - ilog2(mm->chunk_size);
+		start = drm_buddy_block_offset(mm->roots[i]);
+		__force_merge(mm, start, start + size, order);
+
+		root_size = mm->chunk_size << order;
+		size -= root_size;
+	}
+
+	for (i = 0; i <= mm->max_order; ++i) {
+		struct drm_buddy_block *block;
+
+		list_for_each_entry_reverse(block, &mm->free_list[i], link) {
+			if (is_clear != drm_buddy_block_is_clear(block)) {
+				if (is_clear) {
+					mark_cleared(block);
+					mm->clear_avail += drm_buddy_block_size(mm, block);
+				} else {
+					clear_reset(block);
+					mm->clear_avail -= drm_buddy_block_size(mm, block);
+				}
+			}
+		}
+	}
+}
+EXPORT_SYMBOL(drm_buddy_reset_clear);
+
 /**
  * drm_buddy_free_block - free a block
  *
diff --git a/include/drm/drm_buddy.h b/include/drm/drm_buddy.h
index 9689a7c5dd36..513837632b7d 100644
--- a/include/drm/drm_buddy.h
+++ b/include/drm/drm_buddy.h
@@ -160,6 +160,8 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
 			 u64 new_size,
 			 struct list_head *blocks);
 
+void drm_buddy_reset_clear(struct drm_buddy *mm, bool is_clear);
+
 void drm_buddy_free_block(struct drm_buddy *mm, struct drm_buddy_block *block);
 
 void drm_buddy_free_list(struct drm_buddy *mm,
-- 
cgit v1.2.3


From 35cff7af7598b9eb143cc0556e5532e2ded3b61a Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Tue, 20 May 2025 13:34:37 +0300
Subject: container_of: Document container_of() is not to be used in new code

There is a warning in the kerneldoc documentation of container_of() that
the constness of its ptr argument is lost. While this is a valid hint that
container_of_const() should be used instead, the vast majority of new
code still uses container_of():

$ git diff v6.13 v6.14|grep container_of\(|wc -l
646
$ git diff v6.13 v6.14|grep container_of_const|wc -l
9

Make an explicit recommendation to use container_of_const().

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Link: https://lore.kernel.org/r/20250520103437.468691-1-sakari.ailus@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/container_of.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/container_of.h b/include/linux/container_of.h
index 713890c867be..1f6ebf27d962 100644
--- a/include/linux/container_of.h
+++ b/include/linux/container_of.h
@@ -14,6 +14,7 @@
  * @member:	the name of the member within the struct.
  *
  * WARNING: any const qualifier of @ptr is lost.
+ * Do not use container_of() in new code.
  */
 #define container_of(ptr, type, member) ({				\
 	void *__mptr = (void *)(ptr);					\
@@ -28,6 +29,8 @@
  * @ptr:		the pointer to the member
  * @type:		the type of the container struct this is embedded in.
  * @member:		the name of the member within the struct.
+ *
+ * Always prefer container_of_const() instead of container_of() in new code.
  */
 #define container_of_const(ptr, type, member)				\
 	_Generic(ptr,							\
-- 
cgit v1.2.3


From bdfa89c489296f092751fcee23b5d171c9fdc7f5 Mon Sep 17 00:00:00 2001
From: Tiffany Yang <ynaffit@google.com>
Date: Mon, 14 Jul 2025 11:53:16 -0700
Subject: kunit: test: Export kunit_attach_mm()

Tests can allocate from virtual memory using kunit_vm_mmap(), which
transparently creates and attaches an mm_struct to the test runner if
one is not already attached. This is suitable for most cases, except for
when the code under test must access a task's mm before performing an
mmap. Expose kunit_attach_mm() as part of the interface for those
cases. This does not change the existing behavior.

Cc: David Gow <davidgow@google.com>
Signed-off-by: Tiffany Yang <ynaffit@google.com>
Reviewed-by: Carlos Llamas <cmllamas@google.com>
Link: https://lore.kernel.org/r/20250714185321.2417234-4-ynaffit@google.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/kunit/test.h   | 12 ++++++++++++
 lib/kunit/user_alloc.c |  4 ++--
 2 files changed, 14 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/kunit/test.h b/include/kunit/test.h
index 39c768f87dc9..d958ee53050e 100644
--- a/include/kunit/test.h
+++ b/include/kunit/test.h
@@ -531,6 +531,18 @@ static inline char *kunit_kstrdup(struct kunit *test, const char *str, gfp_t gfp
  */
 const char *kunit_kstrdup_const(struct kunit *test, const char *str, gfp_t gfp);
 
+/**
+ * kunit_attach_mm() - Create and attach a new mm if it doesn't already exist.
+ *
+ * Allocates a &struct mm_struct and attaches it to @current. In most cases, call
+ * kunit_vm_mmap() without calling kunit_attach_mm() directly. Only necessary when
+ * code under test accesses the mm before executing the mmap (e.g., to perform
+ * additional initialization beforehand).
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int kunit_attach_mm(void);
+
 /**
  * kunit_vm_mmap() - Allocate KUnit-tracked vm_mmap() area
  * @test: The test context object.
diff --git a/lib/kunit/user_alloc.c b/lib/kunit/user_alloc.c
index 46951be018be..b8cac765e620 100644
--- a/lib/kunit/user_alloc.c
+++ b/lib/kunit/user_alloc.c
@@ -22,8 +22,7 @@ struct kunit_vm_mmap_params {
 	unsigned long offset;
 };
 
-/* Create and attach a new mm if it doesn't already exist. */
-static int kunit_attach_mm(void)
+int kunit_attach_mm(void)
 {
 	struct mm_struct *mm;
 
@@ -49,6 +48,7 @@ static int kunit_attach_mm(void)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(kunit_attach_mm);
 
 static int kunit_vm_mmap_init(struct kunit_resource *res, void *context)
 {
-- 
cgit v1.2.3


From 5eac636917486f3f072328d7f5bcdc22bbc9a1d1 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 1 Jul 2025 14:07:01 +0200
Subject: fsi: make fsi_bus_type constant

Now that the driver core can properly handle constant struct bus_type,
move the fsi_bus_type variable to be a constant structure as well,
placing it into read-only memory which can not be modified at runtime.

Cc: Ninad Palsule <ninad@linux.ibm.com>
Cc: linux-fsi@lists.ozlabs.org
Reviewed-by: Eddie James <eajames@linux.ibm.com>
Link: https://lore.kernel.org/r/2025070100-overblown-busily-a04b@gregkh
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/fsi/fsi-core.c | 2 +-
 include/linux/fsi.h    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/fsi/fsi-core.c b/drivers/fsi/fsi-core.c
index 50e8736039fe..ee39d1699387 100644
--- a/drivers/fsi/fsi-core.c
+++ b/drivers/fsi/fsi-core.c
@@ -1404,7 +1404,7 @@ void fsi_driver_unregister(struct fsi_driver *fsi_drv)
 }
 EXPORT_SYMBOL_GPL(fsi_driver_unregister);
 
-struct bus_type fsi_bus_type = {
+const struct bus_type fsi_bus_type = {
 	.name		= "fsi",
 	.match		= fsi_bus_match,
 };
diff --git a/include/linux/fsi.h b/include/linux/fsi.h
index 8c5eef808788..adea1b432f2d 100644
--- a/include/linux/fsi.h
+++ b/include/linux/fsi.h
@@ -68,7 +68,7 @@ extern int fsi_slave_read(struct fsi_slave *slave, uint32_t addr,
 extern int fsi_slave_write(struct fsi_slave *slave, uint32_t addr,
 		const void *val, size_t size);
 
-extern struct bus_type fsi_bus_type;
+extern const struct bus_type fsi_bus_type;
 extern const struct device_type fsi_cdev_type;
 
 enum fsi_dev_type {
-- 
cgit v1.2.3


From e9d8e2bf23206825ca9b4d3caf587945ba807939 Mon Sep 17 00:00:00 2001
From: Taotao Chen <chentaotao@didiglobal.com>
Date: Wed, 16 Jul 2025 09:36:06 +0000
Subject: fs: change write_begin/write_end interface to take struct kiocb *

Change the address_space_operations callbacks write_begin() and
write_end() to take struct kiocb * as the first argument instead of
struct file *.

Update all affected function prototypes, implementations, call sites,
and related documentation across VFS, filesystems, and block layer.

Part of a series refactoring address_space_operations write_begin and
write_end callbacks to use struct kiocb for passing write context and
flags.

Signed-off-by: Taotao Chen <chentaotao@didiglobal.com>
Link: https://lore.kernel.org/20250716093559.217344-4-chentaotao@didiglobal.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 Documentation/filesystems/locking.rst |  4 ++--
 Documentation/filesystems/vfs.rst     |  6 +++---
 block/fops.c                          | 13 ++++++++-----
 fs/adfs/inode.c                       |  9 +++++----
 fs/affs/file.c                        | 26 +++++++++++++++-----------
 fs/bcachefs/fs-io-buffered.c          |  4 ++--
 fs/bcachefs/fs-io-buffered.h          |  4 ++--
 fs/bfs/file.c                         |  7 ++++---
 fs/buffer.c                           | 26 +++++++++++++-------------
 fs/ceph/addr.c                        | 10 +++++++---
 fs/ecryptfs/mmap.c                    | 10 +++++-----
 fs/exfat/file.c                       | 11 +++++------
 fs/exfat/inode.c                      | 16 +++++++++-------
 fs/ext2/inode.c                       | 11 ++++++-----
 fs/ext4/inode.c                       | 18 ++++++++++--------
 fs/f2fs/data.c                        |  8 +++++---
 fs/fat/inode.c                        | 18 ++++++++++--------
 fs/fuse/file.c                        | 14 +++++++++-----
 fs/hfs/hfs_fs.h                       |  2 +-
 fs/hfs/inode.c                        |  4 ++--
 fs/hfsplus/hfsplus_fs.h               |  6 ++++--
 fs/hfsplus/inode.c                    |  8 +++++---
 fs/hostfs/hostfs_kern.c               |  8 +++++---
 fs/hpfs/file.c                        | 18 ++++++++++--------
 fs/hugetlbfs/inode.c                  |  9 +++++----
 fs/jffs2/file.c                       | 28 ++++++++++++++++------------
 fs/jfs/inode.c                        | 16 +++++++++-------
 fs/libfs.c                            | 11 ++++++-----
 fs/minix/inode.c                      |  7 ++++---
 fs/nfs/file.c                         |  8 ++++++--
 fs/nilfs2/inode.c                     |  8 +++++---
 fs/ntfs3/file.c                       |  4 ++--
 fs/ntfs3/inode.c                      |  7 ++++---
 fs/ntfs3/ntfs_fs.h                    | 10 ++++++----
 fs/ocfs2/aops.c                       |  6 ++++--
 fs/omfs/file.c                        |  7 ++++---
 fs/orangefs/inode.c                   | 16 +++++++++-------
 fs/ubifs/file.c                       |  8 +++++---
 fs/udf/inode.c                        | 11 +++++++----
 fs/ufs/inode.c                        | 16 +++++++++-------
 fs/vboxsf/file.c                      |  5 +++--
 include/linux/buffer_head.h           |  4 ++--
 include/linux/fs.h                    | 11 ++++++-----
 mm/filemap.c                          |  4 ++--
 mm/shmem.c                            | 12 ++++++------
 45 files changed, 267 insertions(+), 202 deletions(-)

(limited to 'include')

diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index 2e567e341c3b..580581281ed7 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -253,10 +253,10 @@ prototypes::
 	int (*writepages)(struct address_space *, struct writeback_control *);
 	bool (*dirty_folio)(struct address_space *, struct folio *folio);
 	void (*readahead)(struct readahead_control *);
-	int (*write_begin)(struct file *, struct address_space *mapping,
+	int (*write_begin)(const struct kiocb *, struct address_space *mapping,
 				loff_t pos, unsigned len,
 				struct folio **foliop, void **fsdata);
-	int (*write_end)(struct file *, struct address_space *mapping,
+	int (*write_end)(const struct kiocb *, struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned copied,
 				struct folio *folio, void *fsdata);
 	sector_t (*bmap)(struct address_space *, sector_t);
diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
index dd9da7e04a99..57604b07bdc9 100644
--- a/Documentation/filesystems/vfs.rst
+++ b/Documentation/filesystems/vfs.rst
@@ -823,10 +823,10 @@ cache in your filesystem.  The following members are defined:
 		int (*writepages)(struct address_space *, struct writeback_control *);
 		bool (*dirty_folio)(struct address_space *, struct folio *);
 		void (*readahead)(struct readahead_control *);
-		int (*write_begin)(struct file *, struct address_space *mapping,
+		int (*write_begin)(const struct kiocb *, struct address_space *mapping,
 				   loff_t pos, unsigned len,
-				struct page **pagep, void **fsdata);
-		int (*write_end)(struct file *, struct address_space *mapping,
+				   struct page **pagep, void **fsdata);
+		int (*write_end)(const struct kiocb *, struct address_space *mapping,
 				 loff_t pos, unsigned len, unsigned copied,
 				 struct folio *folio, void *fsdata);
 		sector_t (*bmap)(struct address_space *, sector_t);
diff --git a/block/fops.c b/block/fops.c
index 35cea0cb304d..f34e7315c83c 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -496,15 +496,18 @@ static void blkdev_readahead(struct readahead_control *rac)
 	mpage_readahead(rac, blkdev_get_block);
 }
 
-static int blkdev_write_begin(struct file *file, struct address_space *mapping,
-		loff_t pos, unsigned len, struct folio **foliop, void **fsdata)
+static int blkdev_write_begin(const struct kiocb *iocb,
+			      struct address_space *mapping, loff_t pos,
+			      unsigned len, struct folio **foliop,
+			      void **fsdata)
 {
 	return block_write_begin(mapping, pos, len, foliop, blkdev_get_block);
 }
 
-static int blkdev_write_end(struct file *file, struct address_space *mapping,
-		loff_t pos, unsigned len, unsigned copied, struct folio *folio,
-		void *fsdata)
+static int blkdev_write_end(const struct kiocb *iocb,
+			    struct address_space *mapping,
+			    loff_t pos, unsigned len, unsigned copied,
+			    struct folio *folio, void *fsdata)
 {
 	int ret;
 	ret = block_write_end(pos, len, copied, folio);
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 21527189e430..6830f8bc8d4e 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -53,13 +53,14 @@ static void adfs_write_failed(struct address_space *mapping, loff_t to)
 		truncate_pagecache(inode, inode->i_size);
 }
 
-static int adfs_write_begin(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len,
-			struct folio **foliop, void **fsdata)
+static int adfs_write_begin(const struct kiocb *iocb,
+			    struct address_space *mapping,
+			    loff_t pos, unsigned len,
+			    struct folio **foliop, void **fsdata)
 {
 	int ret;
 
-	ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata,
+	ret = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata,
 				adfs_get_block,
 				&ADFS_I(mapping->host)->mmu_private);
 	if (unlikely(ret))
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 7a71018e3f67..219ea0353906 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -415,13 +415,14 @@ affs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	return ret;
 }
 
-static int affs_write_begin(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len,
-			struct folio **foliop, void **fsdata)
+static int affs_write_begin(const struct kiocb *iocb,
+			    struct address_space *mapping,
+			    loff_t pos, unsigned len,
+			    struct folio **foliop, void **fsdata)
 {
 	int ret;
 
-	ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata,
+	ret = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata,
 				affs_get_block,
 				&AFFS_I(mapping->host)->mmu_private);
 	if (unlikely(ret))
@@ -430,14 +431,15 @@ static int affs_write_begin(struct file *file, struct address_space *mapping,
 	return ret;
 }
 
-static int affs_write_end(struct file *file, struct address_space *mapping,
-			  loff_t pos, unsigned int len, unsigned int copied,
+static int affs_write_end(const struct kiocb *iocb,
+			  struct address_space *mapping, loff_t pos,
+			  unsigned int len, unsigned int copied,
 			  struct folio *folio, void *fsdata)
 {
 	struct inode *inode = mapping->host;
 	int ret;
 
-	ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata);
+	ret = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata);
 
 	/* Clear Archived bit on file writes, as AmigaOS would do */
 	if (AFFS_I(inode)->i_protect & FIBF_ARCHIVED) {
@@ -645,7 +647,8 @@ static int affs_read_folio_ofs(struct file *file, struct folio *folio)
 	return err;
 }
 
-static int affs_write_begin_ofs(struct file *file, struct address_space *mapping,
+static int affs_write_begin_ofs(const struct kiocb *iocb,
+				struct address_space *mapping,
 				loff_t pos, unsigned len,
 				struct folio **foliop, void **fsdata)
 {
@@ -684,9 +687,10 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
 	return err;
 }
 
-static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
-				loff_t pos, unsigned len, unsigned copied,
-				struct folio *folio, void *fsdata)
+static int affs_write_end_ofs(const struct kiocb *iocb,
+			      struct address_space *mapping,
+			      loff_t pos, unsigned len, unsigned copied,
+			      struct folio *folio, void *fsdata)
 {
 	struct inode *inode = mapping->host;
 	struct super_block *sb = inode->i_sb;
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index 66bacdd49f78..1c54b9b5bd69 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -674,7 +674,7 @@ int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc
 
 /* buffered writes: */
 
-int bch2_write_begin(struct file *file, struct address_space *mapping,
+int bch2_write_begin(const struct kiocb *iocb, struct address_space *mapping,
 		     loff_t pos, unsigned len,
 		     struct folio **foliop, void **fsdata)
 {
@@ -757,7 +757,7 @@ err_unlock:
 	return bch2_err_class(ret);
 }
 
-int bch2_write_end(struct file *file, struct address_space *mapping,
+int bch2_write_end(const struct kiocb *iocb, struct address_space *mapping,
 		   loff_t pos, unsigned len, unsigned copied,
 		   struct folio *folio, void *fsdata)
 {
diff --git a/fs/bcachefs/fs-io-buffered.h b/fs/bcachefs/fs-io-buffered.h
index 3207ebbb4ab4..14de91c27656 100644
--- a/fs/bcachefs/fs-io-buffered.h
+++ b/fs/bcachefs/fs-io-buffered.h
@@ -10,9 +10,9 @@ int bch2_read_folio(struct file *, struct folio *);
 int bch2_writepages(struct address_space *, struct writeback_control *);
 void bch2_readahead(struct readahead_control *);
 
-int bch2_write_begin(struct file *, struct address_space *, loff_t pos,
+int bch2_write_begin(const struct kiocb *, struct address_space *, loff_t pos,
 		     unsigned len, struct folio **, void **);
-int bch2_write_end(struct file *, struct address_space *, loff_t,
+int bch2_write_end(const struct kiocb *, struct address_space *, loff_t,
 		   unsigned len, unsigned copied, struct folio *, void *);
 
 ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *);
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index fa66a09e496a..10dc0151ea55 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -170,9 +170,10 @@ static void bfs_write_failed(struct address_space *mapping, loff_t to)
 		truncate_pagecache(inode, inode->i_size);
 }
 
-static int bfs_write_begin(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len,
-			struct folio **foliop, void **fsdata)
+static int bfs_write_begin(const struct kiocb *iocb,
+			   struct address_space *mapping,
+			   loff_t pos, unsigned len,
+			   struct folio **foliop, void **fsdata)
 {
 	int ret;
 
diff --git a/fs/buffer.c b/fs/buffer.c
index 565fe88773c2..ead4dc85debd 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2297,9 +2297,9 @@ int block_write_end(loff_t pos, unsigned len, unsigned copied,
 }
 EXPORT_SYMBOL(block_write_end);
 
-int generic_write_end(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len, unsigned copied,
-			struct folio *folio, void *fsdata)
+int generic_write_end(const struct kiocb *iocb, struct address_space *mapping,
+		      loff_t pos, unsigned len, unsigned copied,
+		      struct folio *folio, void *fsdata)
 {
 	struct inode *inode = mapping->host;
 	loff_t old_size = inode->i_size;
@@ -2494,7 +2494,8 @@ out:
 }
 EXPORT_SYMBOL(generic_cont_expand_simple);
 
-static int cont_expand_zero(struct file *file, struct address_space *mapping,
+static int cont_expand_zero(const struct kiocb *iocb,
+			    struct address_space *mapping,
 			    loff_t pos, loff_t *bytes)
 {
 	struct inode *inode = mapping->host;
@@ -2518,12 +2519,12 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
 		}
 		len = PAGE_SIZE - zerofrom;
 
-		err = aops->write_begin(file, mapping, curpos, len,
+		err = aops->write_begin(iocb, mapping, curpos, len,
 					    &folio, &fsdata);
 		if (err)
 			goto out;
 		folio_zero_range(folio, offset_in_folio(folio, curpos), len);
-		err = aops->write_end(file, mapping, curpos, len, len,
+		err = aops->write_end(iocb, mapping, curpos, len, len,
 						folio, fsdata);
 		if (err < 0)
 			goto out;
@@ -2551,12 +2552,12 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
 		}
 		len = offset - zerofrom;
 
-		err = aops->write_begin(file, mapping, curpos, len,
+		err = aops->write_begin(iocb, mapping, curpos, len,
 					    &folio, &fsdata);
 		if (err)
 			goto out;
 		folio_zero_range(folio, offset_in_folio(folio, curpos), len);
-		err = aops->write_end(file, mapping, curpos, len, len,
+		err = aops->write_end(iocb, mapping, curpos, len, len,
 						folio, fsdata);
 		if (err < 0)
 			goto out;
@@ -2571,17 +2572,16 @@ out:
  * For moronic filesystems that do not allow holes in file.
  * We may have to extend the file.
  */
-int cont_write_begin(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len,
-			struct folio **foliop, void **fsdata,
-			get_block_t *get_block, loff_t *bytes)
+int cont_write_begin(const struct kiocb *iocb, struct address_space *mapping,
+		     loff_t pos, unsigned len, struct folio **foliop,
+		     void **fsdata, get_block_t *get_block, loff_t *bytes)
 {
 	struct inode *inode = mapping->host;
 	unsigned int blocksize = i_blocksize(inode);
 	unsigned int zerofrom;
 	int err;
 
-	err = cont_expand_zero(file, mapping, pos, bytes);
+	err = cont_expand_zero(iocb, mapping, pos, bytes);
 	if (err)
 		return err;
 
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 60a621b00c65..02468c848cce 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1864,10 +1864,12 @@ static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned
  * We are only allowed to write into/dirty the page if the page is
  * clean, or already dirty within the same snap context.
  */
-static int ceph_write_begin(struct file *file, struct address_space *mapping,
+static int ceph_write_begin(const struct kiocb *iocb,
+			    struct address_space *mapping,
 			    loff_t pos, unsigned len,
 			    struct folio **foliop, void **fsdata)
 {
+	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(file);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int r;
@@ -1885,10 +1887,12 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
  * we don't do anything in here that simple_write_end doesn't do
  * except adjust dirty page accounting
  */
-static int ceph_write_end(struct file *file, struct address_space *mapping,
-			  loff_t pos, unsigned len, unsigned copied,
+static int ceph_write_end(const struct kiocb *iocb,
+			  struct address_space *mapping, loff_t pos,
+			  unsigned len, unsigned copied,
 			  struct folio *folio, void *fsdata)
 {
+	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(file);
 	struct ceph_client *cl = ceph_inode_to_client(inode);
 	bool check_cap = false;
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 60f0ac8744b5..2c2b12fedeae 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -228,7 +228,7 @@ out:
 
 /**
  * ecryptfs_write_begin
- * @file: The eCryptfs file
+ * @iocb: I/O control block for the eCryptfs file
  * @mapping: The eCryptfs object
  * @pos: The file offset at which to start writing
  * @len: Length of the write
@@ -239,7 +239,7 @@ out:
  *
  * Returns zero on success; non-zero otherwise
  */
-static int ecryptfs_write_begin(struct file *file,
+static int ecryptfs_write_begin(const struct kiocb *iocb,
 			struct address_space *mapping,
 			loff_t pos, unsigned len,
 			struct folio **foliop, void **fsdata)
@@ -322,7 +322,7 @@ static int ecryptfs_write_begin(struct file *file,
 	 * Note, this will increase i_size. */
 	if (index != 0) {
 		if (prev_page_end_size > i_size_read(mapping->host)) {
-			rc = ecryptfs_truncate(file->f_path.dentry,
+			rc = ecryptfs_truncate(iocb->ki_filp->f_path.dentry,
 					       prev_page_end_size);
 			if (rc) {
 				printk(KERN_ERR "%s: Error on attempt to "
@@ -429,7 +429,7 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode)
 
 /**
  * ecryptfs_write_end
- * @file: The eCryptfs file object
+ * @iocb: I/O control block for the eCryptfs file
  * @mapping: The eCryptfs object
  * @pos: The file position
  * @len: The length of the data (unused)
@@ -437,7 +437,7 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode)
  * @folio: The eCryptfs folio
  * @fsdata: The fsdata (unused)
  */
-static int ecryptfs_write_end(struct file *file,
+static int ecryptfs_write_end(const struct kiocb *iocb,
 			struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned copied,
 			struct folio *folio, void *fsdata)
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index 841a5b18e3df..70f53edd0a10 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -532,11 +532,10 @@ int exfat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 	return blkdev_issue_flush(inode->i_sb->s_bdev);
 }
 
-static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size)
+static int exfat_extend_valid_size(struct inode *inode, loff_t new_valid_size)
 {
 	int err;
 	loff_t pos;
-	struct inode *inode = file_inode(file);
 	struct exfat_inode_info *ei = EXFAT_I(inode);
 	struct address_space *mapping = inode->i_mapping;
 	const struct address_space_operations *ops = mapping->a_ops;
@@ -551,14 +550,14 @@ static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size)
 		if (pos + len > new_valid_size)
 			len = new_valid_size - pos;
 
-		err = ops->write_begin(file, mapping, pos, len, &folio, NULL);
+		err = ops->write_begin(NULL, mapping, pos, len, &folio, NULL);
 		if (err)
 			goto out;
 
 		off = offset_in_folio(folio, pos);
 		folio_zero_new_buffers(folio, off, off + len);
 
-		err = ops->write_end(file, mapping, pos, len, len, folio, NULL);
+		err = ops->write_end(NULL, mapping, pos, len, len, folio, NULL);
 		if (err < 0)
 			goto out;
 		pos += len;
@@ -604,7 +603,7 @@ static ssize_t exfat_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
 	}
 
 	if (pos > valid_size) {
-		ret = exfat_extend_valid_size(file, pos);
+		ret = exfat_extend_valid_size(inode, pos);
 		if (ret < 0 && ret != -ENOSPC) {
 			exfat_err(inode->i_sb,
 				"write: fail to zero from %llu to %llu(%zd)",
@@ -665,7 +664,7 @@ static vm_fault_t exfat_page_mkwrite(struct vm_fault *vmf)
 			start + vma->vm_end - vma->vm_start);
 
 	if (ei->valid_size < end) {
-		err = exfat_extend_valid_size(file, end);
+		err = exfat_extend_valid_size(inode, end);
 		if (err < 0) {
 			inode_unlock(inode);
 			return vmf_fs_error(err);
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index b22c02d6000f..c10844e1e16c 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -446,9 +446,10 @@ static void exfat_write_failed(struct address_space *mapping, loff_t to)
 	}
 }
 
-static int exfat_write_begin(struct file *file, struct address_space *mapping,
-		loff_t pos, unsigned int len,
-		struct folio **foliop, void **fsdata)
+static int exfat_write_begin(const struct kiocb *iocb,
+			     struct address_space *mapping,
+			     loff_t pos, unsigned int len,
+			     struct folio **foliop, void **fsdata)
 {
 	int ret;
 
@@ -463,15 +464,16 @@ static int exfat_write_begin(struct file *file, struct address_space *mapping,
 	return ret;
 }
 
-static int exfat_write_end(struct file *file, struct address_space *mapping,
-		loff_t pos, unsigned int len, unsigned int copied,
-		struct folio *folio, void *fsdata)
+static int exfat_write_end(const struct kiocb *iocb,
+			   struct address_space *mapping,
+			   loff_t pos, unsigned int len, unsigned int copied,
+			   struct folio *folio, void *fsdata)
 {
 	struct inode *inode = mapping->host;
 	struct exfat_inode_info *ei = EXFAT_I(inode);
 	int err;
 
-	err = generic_write_end(file, mapping, pos, len, copied, folio, fsdata);
+	err = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata);
 	if (err < len)
 		exfat_write_failed(mapping, pos+len);
 
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 30f8201c155f..d35ca26eee3c 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -915,7 +915,7 @@ static void ext2_readahead(struct readahead_control *rac)
 }
 
 static int
-ext2_write_begin(struct file *file, struct address_space *mapping,
+ext2_write_begin(const struct kiocb *iocb, struct address_space *mapping,
 		loff_t pos, unsigned len, struct folio **foliop, void **fsdata)
 {
 	int ret;
@@ -926,13 +926,14 @@ ext2_write_begin(struct file *file, struct address_space *mapping,
 	return ret;
 }
 
-static int ext2_write_end(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len, unsigned copied,
-			struct folio *folio, void *fsdata)
+static int ext2_write_end(const struct kiocb *iocb,
+			  struct address_space *mapping,
+			  loff_t pos, unsigned len, unsigned copied,
+			  struct folio *folio, void *fsdata)
 {
 	int ret;
 
-	ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata);
+	ret = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata);
 	if (ret < len)
 		ext2_write_failed(mapping, pos + len);
 	return ret;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e6aa7ca6d842..9a16efd072bb 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1252,7 +1252,8 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio,
  * and the ext4_write_end().  So doing the jbd2_journal_start at the start of
  * ext4_write_begin() is the right place.
  */
-static int ext4_write_begin(struct file *file, struct address_space *mapping,
+static int ext4_write_begin(const struct kiocb *iocb,
+			    struct address_space *mapping,
 			    loff_t pos, unsigned len,
 			    struct folio **foliop, void **fsdata)
 {
@@ -1400,12 +1401,12 @@ static int write_end_fn(handle_t *handle, struct inode *inode,
 
 /*
  * We need to pick up the new inode size which generic_commit_write gave us
- * `file' can be NULL - eg, when called from page_symlink().
+ * `iocb` can be NULL - eg, when called from page_symlink().
  *
  * ext4 never places buffers on inode->i_mapping->i_private_list.  metadata
  * buffers are managed internally.
  */
-static int ext4_write_end(struct file *file,
+static int ext4_write_end(const struct kiocb *iocb,
 			  struct address_space *mapping,
 			  loff_t pos, unsigned len, unsigned copied,
 			  struct folio *folio, void *fsdata)
@@ -1510,7 +1511,7 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle,
 	} while (bh != head);
 }
 
-static int ext4_journalled_write_end(struct file *file,
+static int ext4_journalled_write_end(const struct kiocb *iocb,
 				     struct address_space *mapping,
 				     loff_t pos, unsigned len, unsigned copied,
 				     struct folio *folio, void *fsdata)
@@ -3036,7 +3037,8 @@ static int ext4_nonda_switch(struct super_block *sb)
 	return 0;
 }
 
-static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
+static int ext4_da_write_begin(const struct kiocb *iocb,
+			       struct address_space *mapping,
 			       loff_t pos, unsigned len,
 			       struct folio **foliop, void **fsdata)
 {
@@ -3054,7 +3056,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
 
 	if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) {
 		*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
-		return ext4_write_begin(file, mapping, pos,
+		return ext4_write_begin(iocb, mapping, pos,
 					len, foliop, fsdata);
 	}
 	*fsdata = (void *)0;
@@ -3195,7 +3197,7 @@ static int ext4_da_do_write_end(struct address_space *mapping,
 	return copied;
 }
 
-static int ext4_da_write_end(struct file *file,
+static int ext4_da_write_end(const struct kiocb *iocb,
 			     struct address_space *mapping,
 			     loff_t pos, unsigned len, unsigned copied,
 			     struct folio *folio, void *fsdata)
@@ -3204,7 +3206,7 @@ static int ext4_da_write_end(struct file *file,
 	int write_mode = (int)(unsigned long)fsdata;
 
 	if (write_mode == FALL_BACK_TO_NONDELALLOC)
-		return ext4_write_end(file, mapping, pos,
+		return ext4_write_end(iocb, mapping, pos,
 				      len, copied, folio, fsdata);
 
 	trace_ext4_da_write_end(inode, pos, len, copied);
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 31e892842625..711ad80b38d0 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -3519,8 +3519,10 @@ reserve_block:
 	return 0;
 }
 
-static int f2fs_write_begin(struct file *file, struct address_space *mapping,
-		loff_t pos, unsigned len, struct folio **foliop, void **fsdata)
+static int f2fs_write_begin(const struct kiocb *iocb,
+			    struct address_space *mapping,
+			    loff_t pos, unsigned len, struct folio **foliop,
+			    void **fsdata)
 {
 	struct inode *inode = mapping->host;
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -3656,7 +3658,7 @@ fail:
 	return err;
 }
 
-static int f2fs_write_end(struct file *file,
+static int f2fs_write_end(const struct kiocb *iocb,
 			struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned copied,
 			struct folio *folio, void *fsdata)
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 3852bb66358c..9648ed097816 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -219,13 +219,14 @@ static void fat_write_failed(struct address_space *mapping, loff_t to)
 	}
 }
 
-static int fat_write_begin(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len,
-			struct folio **foliop, void **fsdata)
+static int fat_write_begin(const struct kiocb *iocb,
+			   struct address_space *mapping,
+			   loff_t pos, unsigned len,
+			   struct folio **foliop, void **fsdata)
 {
 	int err;
 
-	err = cont_write_begin(file, mapping, pos, len,
+	err = cont_write_begin(iocb, mapping, pos, len,
 				foliop, fsdata, fat_get_block,
 				&MSDOS_I(mapping->host)->mmu_private);
 	if (err < 0)
@@ -233,13 +234,14 @@ static int fat_write_begin(struct file *file, struct address_space *mapping,
 	return err;
 }
 
-static int fat_write_end(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len, unsigned copied,
-			struct folio *folio, void *fsdata)
+static int fat_write_end(const struct kiocb *iocb,
+			 struct address_space *mapping,
+			 loff_t pos, unsigned len, unsigned copied,
+			 struct folio *folio, void *fsdata)
 {
 	struct inode *inode = mapping->host;
 	int err;
-	err = generic_write_end(file, mapping, pos, len, copied, folio, fsdata);
+	err = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata);
 	if (err < len)
 		fat_write_failed(mapping, pos + len);
 	if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index f102afc03359..21c6f8654bfe 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -2213,10 +2213,13 @@ out:
  * It's worthy to make sure that space is reserved on disk for the write,
  * but how to implement it without killing performance need more thinking.
  */
-static int fuse_write_begin(struct file *file, struct address_space *mapping,
-		loff_t pos, unsigned len, struct folio **foliop, void **fsdata)
+static int fuse_write_begin(const struct kiocb *iocb,
+			    struct address_space *mapping,
+			    loff_t pos, unsigned len, struct folio **foliop,
+			    void **fsdata)
 {
 	pgoff_t index = pos >> PAGE_SHIFT;
+	struct file *file = iocb->ki_filp;
 	struct fuse_conn *fc = get_fuse_conn(file_inode(file));
 	struct folio *folio;
 	loff_t fsize;
@@ -2256,9 +2259,10 @@ error:
 	return err;
 }
 
-static int fuse_write_end(struct file *file, struct address_space *mapping,
-		loff_t pos, unsigned len, unsigned copied,
-		struct folio *folio, void *fsdata)
+static int fuse_write_end(const struct kiocb *iocb,
+			  struct address_space *mapping,
+			  loff_t pos, unsigned len, unsigned copied,
+			  struct folio *folio, void *fsdata)
 {
 	struct inode *inode = folio->mapping->host;
 
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index a0c7cb0f79fc..c3fd3172fdd6 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -201,7 +201,7 @@ extern int hfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
 extern const struct address_space_operations hfs_aops;
 extern const struct address_space_operations hfs_btree_aops;
 
-int hfs_write_begin(struct file *file, struct address_space *mapping,
+int hfs_write_begin(const struct kiocb *iocb, struct address_space *mapping,
 		loff_t pos, unsigned len, struct folio **foliop, void **fsdata);
 extern struct inode *hfs_new_inode(struct inode *, const struct qstr *, umode_t);
 extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index a81ce7a740b9..096f338134f9 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -44,12 +44,12 @@ static void hfs_write_failed(struct address_space *mapping, loff_t to)
 	}
 }
 
-int hfs_write_begin(struct file *file, struct address_space *mapping,
+int hfs_write_begin(const struct kiocb *iocb, struct address_space *mapping,
 		loff_t pos, unsigned len, struct folio **foliop, void **fsdata)
 {
 	int ret;
 
-	ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata,
+	ret = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata,
 				hfs_get_block,
 				&HFS_I(mapping->host)->phys_size);
 	if (unlikely(ret))
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 2f089bff0095..3d5c65aef3b2 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -473,8 +473,10 @@ extern const struct address_space_operations hfsplus_aops;
 extern const struct address_space_operations hfsplus_btree_aops;
 extern const struct dentry_operations hfsplus_dentry_operations;
 
-int hfsplus_write_begin(struct file *file, struct address_space *mapping,
-		loff_t pos, unsigned len, struct folio **foliop, void **fsdata);
+int hfsplus_write_begin(const struct kiocb *iocb,
+			struct address_space *mapping,
+			loff_t pos, unsigned len, struct folio **foliop,
+			void **fsdata);
 struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir,
 				umode_t mode);
 void hfsplus_delete_inode(struct inode *inode);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index f331e9574217..97d75bb2c388 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -38,12 +38,14 @@ static void hfsplus_write_failed(struct address_space *mapping, loff_t to)
 	}
 }
 
-int hfsplus_write_begin(struct file *file, struct address_space *mapping,
-		loff_t pos, unsigned len, struct folio **foliop, void **fsdata)
+int hfsplus_write_begin(const struct kiocb *iocb,
+			struct address_space *mapping, loff_t pos,
+			unsigned len, struct folio **foliop,
+			void **fsdata)
 {
 	int ret;
 
-	ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata,
+	ret = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata,
 				hfsplus_get_block,
 				&HFSPLUS_I(mapping->host)->phys_size);
 	if (unlikely(ret))
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 702c41317589..6c2cf0cdf3d6 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -445,7 +445,8 @@ static int hostfs_read_folio(struct file *file, struct folio *folio)
 	return ret;
 }
 
-static int hostfs_write_begin(struct file *file, struct address_space *mapping,
+static int hostfs_write_begin(const struct kiocb *iocb,
+			      struct address_space *mapping,
 			      loff_t pos, unsigned len,
 			      struct folio **foliop, void **fsdata)
 {
@@ -458,7 +459,8 @@ static int hostfs_write_begin(struct file *file, struct address_space *mapping,
 	return 0;
 }
 
-static int hostfs_write_end(struct file *file, struct address_space *mapping,
+static int hostfs_write_end(const struct kiocb *iocb,
+			    struct address_space *mapping,
 			    loff_t pos, unsigned len, unsigned copied,
 			    struct folio *folio, void *fsdata)
 {
@@ -468,7 +470,7 @@ static int hostfs_write_end(struct file *file, struct address_space *mapping,
 	int err;
 
 	buffer = kmap_local_folio(folio, from);
-	err = write_file(FILE_HOSTFS_I(file)->fd, &pos, buffer, copied);
+	err = write_file(FILE_HOSTFS_I(iocb->ki_filp)->fd, &pos, buffer, copied);
 	kunmap_local(buffer);
 
 	if (!folio_test_uptodate(folio) && err == folio_size(folio))
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 449a3fc1b8d9..7b95a3d2e2a6 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -188,13 +188,14 @@ static void hpfs_write_failed(struct address_space *mapping, loff_t to)
 	hpfs_unlock(inode->i_sb);
 }
 
-static int hpfs_write_begin(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len,
-			struct folio **foliop, void **fsdata)
+static int hpfs_write_begin(const struct kiocb *iocb,
+			    struct address_space *mapping,
+			    loff_t pos, unsigned len,
+			    struct folio **foliop, void **fsdata)
 {
 	int ret;
 
-	ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata,
+	ret = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata,
 				hpfs_get_block,
 				&hpfs_i(mapping->host)->mmu_private);
 	if (unlikely(ret))
@@ -203,13 +204,14 @@ static int hpfs_write_begin(struct file *file, struct address_space *mapping,
 	return ret;
 }
 
-static int hpfs_write_end(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len, unsigned copied,
-			struct folio *folio, void *fsdata)
+static int hpfs_write_end(const struct kiocb *iocb,
+			  struct address_space *mapping,
+			  loff_t pos, unsigned len, unsigned copied,
+			  struct folio *folio, void *fsdata)
 {
 	struct inode *inode = mapping->host;
 	int err;
-	err = generic_write_end(file, mapping, pos, len, copied, folio, fsdata);
+	err = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata);
 	if (err < len)
 		hpfs_write_failed(mapping, pos + len);
 	if (!(err < 0)) {
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index e4de5425838d..541aae502d4d 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -311,7 +311,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	return retval;
 }
 
-static int hugetlbfs_write_begin(struct file *file,
+static int hugetlbfs_write_begin(const struct kiocb *iocb,
 			struct address_space *mapping,
 			loff_t pos, unsigned len,
 			struct folio **foliop, void **fsdata)
@@ -319,9 +319,10 @@ static int hugetlbfs_write_begin(struct file *file,
 	return -EINVAL;
 }
 
-static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len, unsigned copied,
-			struct folio *folio, void *fsdata)
+static int hugetlbfs_write_end(const struct kiocb *iocb,
+			       struct address_space *mapping,
+			       loff_t pos, unsigned len, unsigned copied,
+			       struct folio *folio, void *fsdata)
 {
 	BUG();
 	return -EINVAL;
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 13c18ccc13b0..adec3af9bf8d 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -21,12 +21,14 @@
 #include <linux/jffs2.h>
 #include "nodelist.h"
 
-static int jffs2_write_end(struct file *filp, struct address_space *mapping,
-			loff_t pos, unsigned len, unsigned copied,
-			struct folio *folio, void *fsdata);
-static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
-			loff_t pos, unsigned len,
-			struct folio **foliop, void **fsdata);
+static int jffs2_write_end(const struct kiocb *iocb,
+			   struct address_space *mapping,
+			   loff_t pos, unsigned len, unsigned copied,
+			   struct folio *folio, void *fsdata);
+static int jffs2_write_begin(const struct kiocb *iocb,
+			     struct address_space *mapping,
+			     loff_t pos, unsigned len,
+			     struct folio **foliop, void **fsdata);
 static int jffs2_read_folio(struct file *filp, struct folio *folio);
 
 int jffs2_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
@@ -121,9 +123,10 @@ static int jffs2_read_folio(struct file *file, struct folio *folio)
 	return ret;
 }
 
-static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
-			loff_t pos, unsigned len,
-			struct folio **foliop, void **fsdata)
+static int jffs2_write_begin(const struct kiocb *iocb,
+			     struct address_space *mapping,
+			     loff_t pos, unsigned len,
+			     struct folio **foliop, void **fsdata)
 {
 	struct folio *folio;
 	struct inode *inode = mapping->host;
@@ -235,9 +238,10 @@ out_err:
 	return ret;
 }
 
-static int jffs2_write_end(struct file *filp, struct address_space *mapping,
-			loff_t pos, unsigned len, unsigned copied,
-			struct folio *folio, void *fsdata)
+static int jffs2_write_end(const struct kiocb *iocb,
+			   struct address_space *mapping,
+			   loff_t pos, unsigned len, unsigned copied,
+			   struct folio *folio, void *fsdata)
 {
 	/* Actually commit the write from the page cache page we're looking at.
 	 * For now, we write the full page out each time. It sucks, but it's simple
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 60fc92dee24d..083e7fa54709 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -290,9 +290,10 @@ static void jfs_write_failed(struct address_space *mapping, loff_t to)
 	}
 }
 
-static int jfs_write_begin(struct file *file, struct address_space *mapping,
-				loff_t pos, unsigned len,
-				struct folio **foliop, void **fsdata)
+static int jfs_write_begin(const struct kiocb *iocb,
+			   struct address_space *mapping,
+			   loff_t pos, unsigned len,
+			   struct folio **foliop, void **fsdata)
 {
 	int ret;
 
@@ -303,13 +304,14 @@ static int jfs_write_begin(struct file *file, struct address_space *mapping,
 	return ret;
 }
 
-static int jfs_write_end(struct file *file, struct address_space *mapping,
-		loff_t pos, unsigned len, unsigned copied, struct folio *folio,
-		void *fsdata)
+static int jfs_write_end(const struct kiocb *iocb,
+			 struct address_space *mapping,
+			 loff_t pos, unsigned len, unsigned copied,
+			 struct folio *folio, void *fsdata)
 {
 	int ret;
 
-	ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata);
+	ret = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata);
 	if (ret < len)
 		jfs_write_failed(mapping, pos + len);
 	return ret;
diff --git a/fs/libfs.c b/fs/libfs.c
index f99ecc300647..52196566ccbc 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -910,7 +910,7 @@ static int simple_read_folio(struct file *file, struct folio *folio)
 	return 0;
 }
 
-int simple_write_begin(struct file *file, struct address_space *mapping,
+int simple_write_begin(const struct kiocb *iocb, struct address_space *mapping,
 			loff_t pos, unsigned len,
 			struct folio **foliop, void **fsdata)
 {
@@ -935,7 +935,7 @@ EXPORT_SYMBOL(simple_write_begin);
 
 /**
  * simple_write_end - .write_end helper for non-block-device FSes
- * @file: See .write_end of address_space_operations
+ * @iocb: kernel I/O control block
  * @mapping: 		"
  * @pos: 		"
  * @len: 		"
@@ -956,9 +956,10 @@ EXPORT_SYMBOL(simple_write_begin);
  *
  * Use *ONLY* with simple_read_folio()
  */
-static int simple_write_end(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len, unsigned copied,
-			struct folio *folio, void *fsdata)
+static int simple_write_end(const struct kiocb *iocb,
+			    struct address_space *mapping,
+			    loff_t pos, unsigned len, unsigned copied,
+			    struct folio *folio, void *fsdata)
 {
 	struct inode *inode = folio->mapping->host;
 	loff_t last_pos = pos + copied;
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index f007e389d5d2..df9d11479caf 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -442,9 +442,10 @@ static void minix_write_failed(struct address_space *mapping, loff_t to)
 	}
 }
 
-static int minix_write_begin(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len,
-			struct folio **foliop, void **fsdata)
+static int minix_write_begin(const struct kiocb *iocb,
+			     struct address_space *mapping,
+			     loff_t pos, unsigned len,
+			     struct folio **foliop, void **fsdata)
 {
 	int ret;
 
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 033feeab8c34..2bd557ca1af9 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -342,12 +342,14 @@ static bool nfs_want_read_modify_write(struct file *file, struct folio *folio,
  * If the writer ends up delaying the write, the writer needs to
  * increment the page use counts until he is done with the page.
  */
-static int nfs_write_begin(struct file *file, struct address_space *mapping,
+static int nfs_write_begin(const struct kiocb *iocb,
+			   struct address_space *mapping,
 			   loff_t pos, unsigned len, struct folio **foliop,
 			   void **fsdata)
 {
 	fgf_t fgp = FGP_WRITEBEGIN;
 	struct folio *folio;
+	struct file *file = iocb->ki_filp;
 	int once_thru = 0;
 	int ret;
 
@@ -377,10 +379,12 @@ start:
 	return ret;
 }
 
-static int nfs_write_end(struct file *file, struct address_space *mapping,
+static int nfs_write_end(const struct kiocb *iocb,
+			 struct address_space *mapping,
 			 loff_t pos, unsigned len, unsigned copied,
 			 struct folio *folio, void *fsdata)
 {
+	struct file *file = iocb->ki_filp;
 	struct nfs_open_context *ctx = nfs_file_open_context(file);
 	unsigned offset = offset_in_folio(folio, pos);
 	int status;
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 6613b8fcceb0..c2ccafdf4a19 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -218,7 +218,8 @@ void nilfs_write_failed(struct address_space *mapping, loff_t to)
 	}
 }
 
-static int nilfs_write_begin(struct file *file, struct address_space *mapping,
+static int nilfs_write_begin(const struct kiocb *iocb,
+			     struct address_space *mapping,
 			     loff_t pos, unsigned len,
 			     struct folio **foliop, void **fsdata)
 
@@ -237,7 +238,8 @@ static int nilfs_write_begin(struct file *file, struct address_space *mapping,
 	return err;
 }
 
-static int nilfs_write_end(struct file *file, struct address_space *mapping,
+static int nilfs_write_end(const struct kiocb *iocb,
+			   struct address_space *mapping,
 			   loff_t pos, unsigned len, unsigned copied,
 			   struct folio *folio, void *fsdata)
 {
@@ -248,7 +250,7 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping,
 
 	nr_dirty = nilfs_page_count_clean_buffers(folio, start,
 						  start + copied);
-	copied = generic_write_end(file, mapping, pos, len, copied, folio,
+	copied = generic_write_end(iocb, mapping, pos, len, copied, folio,
 				   fsdata);
 	nilfs_set_file_dirty(inode, nr_dirty);
 	err = nilfs_transaction_commit(inode->i_sb);
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index 1e99a35691cd..1590c2a3c48f 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -154,13 +154,13 @@ static int ntfs_extend_initialized_size(struct file *file,
 		if (pos + len > new_valid)
 			len = new_valid - pos;
 
-		err = ntfs_write_begin(file, mapping, pos, len, &folio, NULL);
+		err = ntfs_write_begin(NULL, mapping, pos, len, &folio, NULL);
 		if (err)
 			goto out;
 
 		folio_zero_range(folio, zerofrom, folio_size(folio) - zerofrom);
 
-		err = ntfs_write_end(file, mapping, pos, len, len, folio, NULL);
+		err = ntfs_write_end(NULL, mapping, pos, len, len, folio, NULL);
 		if (err < 0)
 			goto out;
 		pos += len;
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index 0f0d27d4644a..dad088e64b3c 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -912,7 +912,7 @@ static int ntfs_get_block_write_begin(struct inode *inode, sector_t vbn,
 				  bh_result, create, GET_BLOCK_WRITE_BEGIN);
 }
 
-int ntfs_write_begin(struct file *file, struct address_space *mapping,
+int ntfs_write_begin(const struct kiocb *iocb, struct address_space *mapping,
 		     loff_t pos, u32 len, struct folio **foliop, void **fsdata)
 {
 	int err;
@@ -957,7 +957,8 @@ out:
 /*
  * ntfs_write_end - Address_space_operations::write_end.
  */
-int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos,
+int ntfs_write_end(const struct kiocb *iocb,
+		   struct address_space *mapping, loff_t pos,
 		   u32 len, u32 copied, struct folio *folio, void *fsdata)
 {
 	struct inode *inode = mapping->host;
@@ -989,7 +990,7 @@ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos,
 		folio_unlock(folio);
 		folio_put(folio);
 	} else {
-		err = generic_write_end(file, mapping, pos, len, copied, folio,
+		err = generic_write_end(iocb, mapping, pos, len, copied, folio,
 					fsdata);
 	}
 
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h
index 36b8052660d5..921257773eec 100644
--- a/fs/ntfs3/ntfs_fs.h
+++ b/fs/ntfs3/ntfs_fs.h
@@ -702,10 +702,12 @@ struct inode *ntfs_iget5(struct super_block *sb, const struct MFT_REF *ref,
 int ntfs_set_size(struct inode *inode, u64 new_size);
 int ntfs_get_block(struct inode *inode, sector_t vbn,
 		   struct buffer_head *bh_result, int create);
-int ntfs_write_begin(struct file *file, struct address_space *mapping,
-		     loff_t pos, u32 len, struct folio **foliop, void **fsdata);
-int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos,
-		   u32 len, u32 copied, struct folio *folio, void *fsdata);
+int ntfs_write_begin(const struct kiocb *iocb, struct address_space *mapping,
+		     loff_t pos, u32 len, struct folio **foliop,
+		     void **fsdata);
+int ntfs_write_end(const struct kiocb *iocb, struct address_space *mapping,
+		   loff_t pos, u32 len, u32 copied, struct folio *folio,
+		   void *fsdata);
 int ntfs3_write_inode(struct inode *inode, struct writeback_control *wbc);
 int ntfs_sync_inode(struct inode *inode);
 int inode_read_data(struct inode *inode, void *data, size_t bytes);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 40b6bce12951..2203438738f6 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1856,7 +1856,8 @@ out:
 	return ret;
 }
 
-static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
+static int ocfs2_write_begin(const struct kiocb *iocb,
+			     struct address_space *mapping,
 			     loff_t pos, unsigned len,
 			     struct folio **foliop, void **fsdata)
 {
@@ -2047,7 +2048,8 @@ out:
 	return copied;
 }
 
-static int ocfs2_write_end(struct file *file, struct address_space *mapping,
+static int ocfs2_write_end(const struct kiocb *iocb,
+			   struct address_space *mapping,
 			   loff_t pos, unsigned len, unsigned copied,
 			   struct folio *folio, void *fsdata)
 {
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 98358d405b6a..8d70f816b0c9 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -310,9 +310,10 @@ static void omfs_write_failed(struct address_space *mapping, loff_t to)
 	}
 }
 
-static int omfs_write_begin(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len,
-			struct folio **foliop, void **fsdata)
+static int omfs_write_begin(const struct kiocb *iocb,
+			    struct address_space *mapping,
+			    loff_t pos, unsigned len,
+			    struct folio **foliop, void **fsdata)
 {
 	int ret;
 
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 08a6f372a352..a7ab63776735 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -285,9 +285,10 @@ static int orangefs_read_folio(struct file *file, struct folio *folio)
 	return ret;
 }
 
-static int orangefs_write_begin(struct file *file,
-		struct address_space *mapping, loff_t pos, unsigned len,
-		struct folio **foliop, void **fsdata)
+static int orangefs_write_begin(const struct kiocb *iocb,
+				struct address_space *mapping, loff_t pos,
+				unsigned len, struct folio **foliop,
+				void **fsdata)
 {
 	struct orangefs_write_range *wr;
 	struct folio *folio;
@@ -340,9 +341,10 @@ okay:
 	return 0;
 }
 
-static int orangefs_write_end(struct file *file, struct address_space *mapping,
-		loff_t pos, unsigned len, unsigned copied, struct folio *folio,
-		void *fsdata)
+static int orangefs_write_end(const struct kiocb *iocb,
+			      struct address_space *mapping,
+			      loff_t pos, unsigned len, unsigned copied,
+			      struct folio *folio, void *fsdata)
 {
 	struct inode *inode = folio->mapping->host;
 	loff_t last_pos = pos + copied;
@@ -372,7 +374,7 @@ static int orangefs_write_end(struct file *file, struct address_space *mapping,
 	folio_unlock(folio);
 	folio_put(folio);
 
-	mark_inode_dirty_sync(file_inode(file));
+	mark_inode_dirty_sync(file_inode(iocb->ki_filp));
 	return copied;
 }
 
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index bf311c38d9a8..cee8bec1ea26 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -404,7 +404,8 @@ static int allocate_budget(struct ubifs_info *c, struct folio *folio,
  * there is a plenty of flash space and the budget will be acquired quickly,
  * without forcing write-back. The slow path does not make this assumption.
  */
-static int ubifs_write_begin(struct file *file, struct address_space *mapping,
+static int ubifs_write_begin(const struct kiocb *iocb,
+			     struct address_space *mapping,
 			     loff_t pos, unsigned len,
 			     struct folio **foliop, void **fsdata)
 {
@@ -514,8 +515,9 @@ static void cancel_budget(struct ubifs_info *c, struct folio *folio,
 	}
 }
 
-static int ubifs_write_end(struct file *file, struct address_space *mapping,
-			   loff_t pos, unsigned len, unsigned copied,
+static int ubifs_write_end(const struct kiocb *iocb,
+			   struct address_space *mapping, loff_t pos,
+			   unsigned len, unsigned copied,
 			   struct folio *folio, void *fsdata)
 {
 	struct inode *inode = mapping->host;
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 4386dd845e40..356b75676fa9 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -244,10 +244,12 @@ static void udf_readahead(struct readahead_control *rac)
 	mpage_readahead(rac, udf_get_block);
 }
 
-static int udf_write_begin(struct file *file, struct address_space *mapping,
+static int udf_write_begin(const struct kiocb *iocb,
+			   struct address_space *mapping,
 			   loff_t pos, unsigned len,
 			   struct folio **foliop, void **fsdata)
 {
+	struct file *file = iocb->ki_filp;
 	struct udf_inode_info *iinfo = UDF_I(file_inode(file));
 	struct folio *folio;
 	int ret;
@@ -271,15 +273,16 @@ static int udf_write_begin(struct file *file, struct address_space *mapping,
 	return 0;
 }
 
-static int udf_write_end(struct file *file, struct address_space *mapping,
+static int udf_write_end(const struct kiocb *iocb,
+			 struct address_space *mapping,
 			 loff_t pos, unsigned len, unsigned copied,
 			 struct folio *folio, void *fsdata)
 {
-	struct inode *inode = file_inode(file);
+	struct inode *inode = file_inode(iocb->ki_filp);
 	loff_t last_pos;
 
 	if (UDF_I(inode)->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB)
-		return generic_write_end(file, mapping, pos, len, copied, folio,
+		return generic_write_end(iocb, mapping, pos, len, copied, folio,
 					 fsdata);
 	last_pos = pos + copied;
 	if (last_pos > inode->i_size)
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 7dc38fdef2ea..8361c00e8fa6 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -474,9 +474,10 @@ static void ufs_write_failed(struct address_space *mapping, loff_t to)
 	}
 }
 
-static int ufs_write_begin(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len,
-			struct folio **foliop, void **fsdata)
+static int ufs_write_begin(const struct kiocb *iocb,
+			   struct address_space *mapping,
+			   loff_t pos, unsigned len,
+			   struct folio **foliop, void **fsdata)
 {
 	int ret;
 
@@ -487,13 +488,14 @@ static int ufs_write_begin(struct file *file, struct address_space *mapping,
 	return ret;
 }
 
-static int ufs_write_end(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len, unsigned copied,
-			struct folio *folio, void *fsdata)
+static int ufs_write_end(const struct kiocb *iocb,
+			 struct address_space *mapping,
+			 loff_t pos, unsigned len, unsigned copied,
+			 struct folio *folio, void *fsdata)
 {
 	int ret;
 
-	ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata);
+	ret = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata);
 	if (ret < len)
 		ufs_write_failed(mapping, pos + len);
 	return ret;
diff --git a/fs/vboxsf/file.c b/fs/vboxsf/file.c
index b492794f8e9a..af01e3beaa42 100644
--- a/fs/vboxsf/file.c
+++ b/fs/vboxsf/file.c
@@ -300,12 +300,13 @@ static int vboxsf_writepages(struct address_space *mapping,
 	return error;
 }
 
-static int vboxsf_write_end(struct file *file, struct address_space *mapping,
+static int vboxsf_write_end(const struct kiocb *iocb,
+			    struct address_space *mapping,
 			    loff_t pos, unsigned int len, unsigned int copied,
 			    struct folio *folio, void *fsdata)
 {
 	struct inode *inode = mapping->host;
-	struct vboxsf_handle *sf_handle = file->private_data;
+	struct vboxsf_handle *sf_handle = iocb->ki_filp->private_data;
 	size_t from = offset_in_folio(folio, pos);
 	u32 nwritten = len;
 	u8 *buf;
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 178eb90e9cf3..b16b88bfbc3e 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -263,11 +263,11 @@ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
 int __block_write_begin(struct folio *folio, loff_t pos, unsigned len,
 		get_block_t *get_block);
 int block_write_end(loff_t pos, unsigned len, unsigned copied, struct folio *);
-int generic_write_end(struct file *, struct address_space *,
+int generic_write_end(const struct kiocb *, struct address_space *,
 				loff_t, unsigned len, unsigned copied,
 				struct folio *, void *);
 void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to);
-int cont_write_begin(struct file *, struct address_space *, loff_t,
+int cont_write_begin(const struct kiocb *, struct address_space *, loff_t,
 			unsigned, struct folio **, void **,
 			get_block_t *, loff_t *);
 int generic_cont_expand_simple(struct inode *inode, loff_t size);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 09e3e80b0528..df8c503100c4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -444,10 +444,10 @@ struct address_space_operations {
 
 	void (*readahead)(struct readahead_control *);
 
-	int (*write_begin)(struct file *, struct address_space *mapping,
+	int (*write_begin)(const struct kiocb *, struct address_space *mapping,
 				loff_t pos, unsigned len,
 				struct folio **foliop, void **fsdata);
-	int (*write_end)(struct file *, struct address_space *mapping,
+	int (*write_end)(const struct kiocb *, struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned copied,
 				struct folio *folio, void *fsdata);
 
@@ -3598,9 +3598,10 @@ extern void simple_recursive_removal(struct dentry *,
 extern int noop_fsync(struct file *, loff_t, loff_t, int);
 extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
 extern int simple_empty(struct dentry *);
-extern int simple_write_begin(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len,
-			struct folio **foliop, void **fsdata);
+extern int simple_write_begin(const struct kiocb *iocb,
+			      struct address_space *mapping,
+			      loff_t pos, unsigned len,
+			      struct folio **foliop, void **fsdata);
 extern const struct address_space_operations ram_aops;
 extern int always_delete_dentry(const struct dentry *);
 extern struct inode *alloc_anon_inode(struct super_block *);
diff --git a/mm/filemap.c b/mm/filemap.c
index bada249b9fb7..ba089d75fc86 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -4109,7 +4109,7 @@ retry:
 			break;
 		}
 
-		status = a_ops->write_begin(file, mapping, pos, bytes,
+		status = a_ops->write_begin(iocb, mapping, pos, bytes,
 						&folio, &fsdata);
 		if (unlikely(status < 0))
 			break;
@@ -4130,7 +4130,7 @@ retry:
 		copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
 		flush_dcache_folio(folio);
 
-		status = a_ops->write_end(file, mapping, pos, bytes, copied,
+		status = a_ops->write_end(iocb, mapping, pos, bytes, copied,
 						folio, fsdata);
 		if (unlikely(status != copied)) {
 			iov_iter_revert(i, copied - max(status, 0L));
diff --git a/mm/shmem.c b/mm/shmem.c
index 0c5fb4ffa03a..2229425e1b29 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3266,9 +3266,9 @@ static const struct inode_operations shmem_symlink_inode_operations;
 static const struct inode_operations shmem_short_symlink_operations;
 
 static int
-shmem_write_begin(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len,
-			struct folio **foliop, void **fsdata)
+shmem_write_begin(const struct kiocb *iocb, struct address_space *mapping,
+		  loff_t pos, unsigned len,
+		  struct folio **foliop, void **fsdata)
 {
 	struct inode *inode = mapping->host;
 	struct shmem_inode_info *info = SHMEM_I(inode);
@@ -3300,9 +3300,9 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
 }
 
 static int
-shmem_write_end(struct file *file, struct address_space *mapping,
-			loff_t pos, unsigned len, unsigned copied,
-			struct folio *folio, void *fsdata)
+shmem_write_end(const struct kiocb *iocb, struct address_space *mapping,
+		loff_t pos, unsigned len, unsigned copied,
+		struct folio *folio, void *fsdata)
 {
 	struct inode *inode = mapping->host;
 
-- 
cgit v1.2.3


From b799474b9aeb46ec698874d4de1a799de8b5f64f Mon Sep 17 00:00:00 2001
From: Taotao Chen <chentaotao@didiglobal.com>
Date: Wed, 16 Jul 2025 09:36:08 +0000
Subject: mm/pagemap: add write_begin_get_folio() helper function

Add write_begin_get_folio() to simplify the common folio lookup logic
used by filesystem ->write_begin() implementations.

This helper wraps __filemap_get_folio() with common flags such as
FGP_WRITEBEGIN and a conditional FGP_DONTCACHE, and sets the folio
order based on the write length.

Part of a series refactoring address_space_operations write_begin and
write_end callbacks to use struct kiocb for passing write context and
flags.

Signed-off-by: Taotao Chen <chentaotao@didiglobal.com>
Link: https://lore.kernel.org/20250716093559.217344-5-chentaotao@didiglobal.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/pagemap.h | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

(limited to 'include')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index e63fbfbd5b0f..ce2bcdcadb73 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -750,6 +750,33 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
 		fgf_t fgp_flags, gfp_t gfp);
 
+/**
+ * write_begin_get_folio - Get folio for write_begin with flags.
+ * @iocb: The kiocb passed from write_begin (may be NULL).
+ * @mapping: The address space to search.
+ * @index: The page cache index.
+ * @len: Length of data being written.
+ *
+ * This is a helper for filesystem write_begin() implementations.
+ * It wraps __filemap_get_folio(), setting appropriate flags in
+ * the write begin context.
+ *
+ * Return: A folio or an ERR_PTR.
+ */
+static inline struct folio *write_begin_get_folio(const struct kiocb *iocb,
+		  struct address_space *mapping, pgoff_t index, size_t len)
+{
+        fgf_t fgp_flags = FGP_WRITEBEGIN;
+
+        fgp_flags |= fgf_set_order(len);
+
+        if (iocb && iocb->ki_flags & IOCB_DONTCACHE)
+                fgp_flags |= FGP_DONTCACHE;
+
+        return __filemap_get_folio(mapping, index, fgp_flags,
+                                   mapping_gfp_mask(mapping));
+}
+
 /**
  * filemap_get_folio - Find and get a folio.
  * @mapping: The address_space to search.
-- 
cgit v1.2.3


From ab17ead0e0ee8650cd1cf4e481b1ed0ee9731956 Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Date: Wed, 16 Jul 2025 15:36:31 +0200
Subject: block: fix blk_zone_append_update_request_bio() kernel-doc

Stephen reported new 'make htmldocs' warnings introduced by 4cc21a00762b
("block: add tracepoint for blk_zone_update_request_bio").

One is a wrong function name in the tracepoint's kernel-doc and one is a
wrong function parameter.

Fix these so 'make htmldocs' is warning free again for the block layer
tracepoints.

Fixes: 4cc21a00762b ("block: add tracepoint for blk_zone_update_request_bio")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20250716133631.94898-1-johannes.thumshirn@wdc.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/trace/events/block.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 3e582d5e3a57..6aa79e2d799c 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -405,8 +405,8 @@ DEFINE_EVENT(block_bio, block_getrq,
 );
 
 /**
- * block_zone_update_request_bio - update the bio sector after a zone append
- * @bio: the completed block IO operation
+ * blk_zone_append_update_request_bio - update bio sector after zone append
+ * @rq: the completed request that sets the bio sector
  *
  * Update the bio's bi_sector after a zone append command has been completed.
  */
-- 
cgit v1.2.3


From 857d18f23ab17284d1b6de6f61f4e74958596376 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 11 Jul 2025 16:49:25 -0700
Subject: cleanup: Introduce ACQUIRE() and ACQUIRE_ERR() for conditional locks

scoped_cond_guard(), automatic cleanup for conditional locks, has a couple
pain points:

* It causes existing straight-line code to be re-indented into a new
  bracketed scope. While this can be mitigated by a new helper function
  to contain the scope, that is not always a comfortable conversion.

* The return code from the conditional lock is tossed in favor of a scheme
  to pass a 'return err;' statement to the macro.

Other attempts to clean this up, to behave more like guard() [1], got hung
up trying to both establish and evaluate the conditional lock in one
statement.

ACQUIRE() solves this by reflecting the result of the condition in the
automatic variable established by the lock CLASS(). The result is
separately retrieved with the ACQUIRE_ERR() helper, effectively a PTR_ERR()
operation.

Link: http://lore.kernel.org/all/Z1LBnX9TpZLR5Dkf@gmail.com [1]
Link: http://patch.msgid.link/20250512105026.GP4439@noisy.programming.kicks-ass.net
Link: http://patch.msgid.link/20250512185817.GA1808@noisy.programming.kicks-ass.net
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: David Lechner <dlechner@baylibre.com>
Cc: Fabio M. De Francesco <fabio.m.de.francesco@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
[djbw: wrap Peter's proposal with changelog and comments]
Co-developed-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Link: https://patch.msgid.link/20250711234932.671292-2-dan.j.williams@intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 include/linux/cleanup.h | 95 +++++++++++++++++++++++++++++++++++++++++--------
 include/linux/mutex.h   |  2 +-
 include/linux/rwsem.h   |  2 +-
 3 files changed, 83 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h
index 7093e1d08af0..4eb83dd71cfe 100644
--- a/include/linux/cleanup.h
+++ b/include/linux/cleanup.h
@@ -3,6 +3,8 @@
 #define _LINUX_CLEANUP_H
 
 #include <linux/compiler.h>
+#include <linux/err.h>
+#include <linux/args.h>
 
 /**
  * DOC: scope-based cleanup helpers
@@ -61,9 +63,21 @@
  * Observe the lock is held for the remainder of the "if ()" block not
  * the remainder of "func()".
  *
- * Now, when a function uses both __free() and guard(), or multiple
- * instances of __free(), the LIFO order of variable definition order
- * matters. GCC documentation says:
+ * The ACQUIRE() macro can be used in all places that guard() can be
+ * used and additionally supports conditional locks:
+ *
+ *
+ *	DEFINE_GUARD_COND(pci_dev, _try, pci_dev_trylock(_T))
+ *	...
+ *	ACQUIRE(pci_dev_try, lock)(dev);
+ *	rc = ACQUIRE_ERR(pci_dev_try, &lock);
+ *	if (rc)
+ *		return rc;
+ *	// @lock is held
+ *
+ * Now, when a function uses both __free() and guard()/ACQUIRE(), or
+ * multiple instances of __free(), the LIFO order of variable definition
+ * matters. GCC documentation says:
  *
  * "When multiple variables in the same scope have cleanup attributes,
  * at exit from the scope their associated cleanup functions are run in
@@ -305,14 +319,46 @@ static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \
  *      acquire fails.
  *
  *      Only for conditional locks.
+ *
+ * ACQUIRE(name, var):
+ *	a named instance of the (guard) class, suitable for conditional
+ *	locks when paired with ACQUIRE_ERR().
+ *
+ * ACQUIRE_ERR(name, &var):
+ *	a helper that is effectively a PTR_ERR() conversion of the guard
+ *	pointer. Returns 0 when the lock was acquired and a negative
+ *	error code otherwise.
  */
 
 #define __DEFINE_CLASS_IS_CONDITIONAL(_name, _is_cond)	\
 static __maybe_unused const bool class_##_name##_is_conditional = _is_cond
 
-#define __DEFINE_GUARD_LOCK_PTR(_name, _exp) \
-	static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \
-	{ return (void *)(__force unsigned long)*(_exp); }
+#define __GUARD_IS_ERR(_ptr)                                       \
+	({                                                         \
+		unsigned long _rc = (__force unsigned long)(_ptr); \
+		unlikely((_rc - 1) >= -MAX_ERRNO - 1);             \
+	})
+
+#define __DEFINE_GUARD_LOCK_PTR(_name, _exp)                                \
+	static inline void *class_##_name##_lock_ptr(class_##_name##_t *_T) \
+	{                                                                   \
+		void *_ptr = (void *)(__force unsigned long)*(_exp);        \
+		if (IS_ERR(_ptr)) {                                         \
+			_ptr = NULL;                                        \
+		}                                                           \
+		return _ptr;                                                \
+	}                                                                   \
+	static inline int class_##_name##_lock_err(class_##_name##_t *_T)   \
+	{                                                                   \
+		long _rc = (__force unsigned long)*(_exp);                  \
+		if (!_rc) {                                                 \
+			_rc = -EBUSY;                                       \
+		}                                                           \
+		if (!IS_ERR_VALUE(_rc)) {                                   \
+			_rc = 0;                                            \
+		}                                                           \
+		return _rc;                                                 \
+	}
 
 #define DEFINE_CLASS_IS_GUARD(_name) \
 	__DEFINE_CLASS_IS_CONDITIONAL(_name, false); \
@@ -323,23 +369,37 @@ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond
 	__DEFINE_GUARD_LOCK_PTR(_name, _T)
 
 #define DEFINE_GUARD(_name, _type, _lock, _unlock) \
-	DEFINE_CLASS(_name, _type, if (_T) { _unlock; }, ({ _lock; _T; }), _type _T); \
+	DEFINE_CLASS(_name, _type, if (!__GUARD_IS_ERR(_T)) { _unlock; }, ({ _lock; _T; }), _type _T); \
 	DEFINE_CLASS_IS_GUARD(_name)
 
-#define DEFINE_GUARD_COND(_name, _ext, _condlock) \
+#define DEFINE_GUARD_COND_4(_name, _ext, _lock, _cond) \
 	__DEFINE_CLASS_IS_CONDITIONAL(_name##_ext, true); \
 	EXTEND_CLASS(_name, _ext, \
-		     ({ void *_t = _T; if (_T && !(_condlock)) _t = NULL; _t; }), \
+		     ({ void *_t = _T; int _RET = (_lock); if (_T && !(_cond)) _t = ERR_PTR(_RET); _t; }), \
 		     class_##_name##_t _T) \
 	static inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \
-	{ return class_##_name##_lock_ptr(_T); }
+	{ return class_##_name##_lock_ptr(_T); } \
+	static inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \
+	{ return class_##_name##_lock_err(_T); }
+
+/*
+ * Default binary condition; success on 'true'.
+ */
+#define DEFINE_GUARD_COND_3(_name, _ext, _lock) \
+	DEFINE_GUARD_COND_4(_name, _ext, _lock, _RET)
+
+#define DEFINE_GUARD_COND(X...) CONCATENATE(DEFINE_GUARD_COND_, COUNT_ARGS(X))(X)
 
 #define guard(_name) \
 	CLASS(_name, __UNIQUE_ID(guard))
 
 #define __guard_ptr(_name) class_##_name##_lock_ptr
+#define __guard_err(_name) class_##_name##_lock_err
 #define __is_cond_ptr(_name) class_##_name##_is_conditional
 
+#define ACQUIRE(_name, _var)     CLASS(_name, _var)
+#define ACQUIRE_ERR(_name, _var) __guard_err(_name)(_var)
+
 /*
  * Helper macro for scoped_guard().
  *
@@ -401,7 +461,7 @@ typedef struct {							\
 									\
 static inline void class_##_name##_destructor(class_##_name##_t *_T)	\
 {									\
-	if (_T->lock) { _unlock; }					\
+	if (!__GUARD_IS_ERR(_T->lock)) { _unlock; }			\
 }									\
 									\
 __DEFINE_GUARD_LOCK_PTR(_name, &_T->lock)
@@ -433,15 +493,22 @@ __DEFINE_CLASS_IS_CONDITIONAL(_name, false);				\
 __DEFINE_UNLOCK_GUARD(_name, void, _unlock, __VA_ARGS__)		\
 __DEFINE_LOCK_GUARD_0(_name, _lock)
 
-#define DEFINE_LOCK_GUARD_1_COND(_name, _ext, _condlock)		\
+#define DEFINE_LOCK_GUARD_1_COND_4(_name, _ext, _lock, _cond)		\
 	__DEFINE_CLASS_IS_CONDITIONAL(_name##_ext, true);		\
 	EXTEND_CLASS(_name, _ext,					\
 		     ({ class_##_name##_t _t = { .lock = l }, *_T = &_t;\
-		        if (_T->lock && !(_condlock)) _T->lock = NULL;	\
+		        int _RET = (_lock);                             \
+		        if (_T->lock && !(_cond)) _T->lock = ERR_PTR(_RET);\
 			_t; }),						\
 		     typeof_member(class_##_name##_t, lock) l)		\
 	static inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \
-	{ return class_##_name##_lock_ptr(_T); }
+	{ return class_##_name##_lock_ptr(_T); } \
+	static inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \
+	{ return class_##_name##_lock_err(_T); }
+
+#define DEFINE_LOCK_GUARD_1_COND_3(_name, _ext, _lock) \
+	DEFINE_LOCK_GUARD_1_COND_4(_name, _ext, _lock, _RET)
 
+#define DEFINE_LOCK_GUARD_1_COND(X...) CONCATENATE(DEFINE_LOCK_GUARD_1_COND_, COUNT_ARGS(X))(X)
 
 #endif /* _LINUX_CLEANUP_H */
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index a039fa8c1780..9d5d7ed5c101 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -224,7 +224,7 @@ extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
 
 DEFINE_GUARD(mutex, struct mutex *, mutex_lock(_T), mutex_unlock(_T))
 DEFINE_GUARD_COND(mutex, _try, mutex_trylock(_T))
-DEFINE_GUARD_COND(mutex, _intr, mutex_lock_interruptible(_T) == 0)
+DEFINE_GUARD_COND(mutex, _intr, mutex_lock_interruptible(_T), _RET == 0)
 
 extern unsigned long mutex_get_owner(struct mutex *lock);
 
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index c8b543d428b0..c810deb88d13 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -240,7 +240,7 @@ extern void up_write(struct rw_semaphore *sem);
 
 DEFINE_GUARD(rwsem_read, struct rw_semaphore *, down_read(_T), up_read(_T))
 DEFINE_GUARD_COND(rwsem_read, _try, down_read_trylock(_T))
-DEFINE_GUARD_COND(rwsem_read, _intr, down_read_interruptible(_T) == 0)
+DEFINE_GUARD_COND(rwsem_read, _intr, down_read_interruptible(_T), _RET == 0)
 
 DEFINE_GUARD(rwsem_write, struct rw_semaphore *, down_write(_T), up_write(_T))
 DEFINE_GUARD_COND(rwsem_write, _try, down_write_trylock(_T))
-- 
cgit v1.2.3


From d03fcf50ba56f4479685b951506422eeca230853 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 11 Jul 2025 16:49:32 -0700
Subject: cxl: Convert to ACQUIRE() for conditional rwsem locking

Use ACQUIRE() to clean up conditional locking paths in the CXL driver.
The ACQUIRE() macro and its associated ACQUIRE_ERR() helper, like
scoped_cond_guard(), arrange for scope-based conditional locking. Unlike
scoped_cond_guard(), these macros arrange for an ERR_PTR() to be retrieved
representing the state of the conditional lock.

The goal of this conversion is to complete the removal of all explicit
unlock calls in the subsystem. I.e. the methods to acquire a lock are
solely via guard(), scoped_guard() (for limited cases), or ACQUIRE(). All
unlocking is implicit / scope-based. In order to make sure all lock sites
are converted, the existing rwsems are consolidated and renamed in 'struct
cxl_rwsem'. While that makes the patch noisier, it gives a clean cut-off
between the old world (explicit unlock allowed) and the new world (explicit
unlock deleted).

Cc: David Lechner <dlechner@baylibre.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Fabio M. De Francesco <fabio.m.de.francesco@linux.intel.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Jonathan Cameron <jonathan.cameron@huawei.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Alison Schofield <alison.schofield@intel.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Shiju Jose <shiju.jose@huawei.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Fabio M. De Francesco <fabio.m.de.francesco@linux.intel.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Tested-by: Shiju Jose <shiju.jose@huawei.com>
Link: https://patch.msgid.link/20250711234932.671292-9-dan.j.williams@intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/cxl/core/cdat.c   |   6 +-
 drivers/cxl/core/core.h   |  17 ++-
 drivers/cxl/core/edac.c   |  44 ++++---
 drivers/cxl/core/hdm.c    |  41 +++----
 drivers/cxl/core/mbox.c   |   6 +-
 drivers/cxl/core/memdev.c |  50 +++-----
 drivers/cxl/core/port.c   |  18 +--
 drivers/cxl/core/region.c | 295 +++++++++++++++++++---------------------------
 drivers/cxl/cxl.h         |  13 +-
 include/linux/rwsem.h     |   1 +
 10 files changed, 212 insertions(+), 279 deletions(-)

(limited to 'include')

diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c
index 0ccef2f2a26a..c0af645425f4 100644
--- a/drivers/cxl/core/cdat.c
+++ b/drivers/cxl/core/cdat.c
@@ -336,7 +336,7 @@ static int match_cxlrd_hb(struct device *dev, void *data)
 	cxlrd = to_cxl_root_decoder(dev);
 	cxlsd = &cxlrd->cxlsd;
 
-	guard(rwsem_read)(&cxl_region_rwsem);
+	guard(rwsem_read)(&cxl_rwsem.region);
 	for (int i = 0; i < cxlsd->nr_targets; i++) {
 		if (host_bridge == cxlsd->target[i]->dport_dev)
 			return 1;
@@ -987,7 +987,7 @@ void cxl_region_shared_upstream_bandwidth_update(struct cxl_region *cxlr)
 	bool is_root;
 	int rc;
 
-	lockdep_assert_held(&cxl_dpa_rwsem);
+	lockdep_assert_held(&cxl_rwsem.dpa);
 
 	struct xarray *usp_xa __free(free_perf_xa) =
 		kzalloc(sizeof(*usp_xa), GFP_KERNEL);
@@ -1057,7 +1057,7 @@ void cxl_region_perf_data_calculate(struct cxl_region *cxlr,
 {
 	struct cxl_dpa_perf *perf;
 
-	lockdep_assert_held(&cxl_dpa_rwsem);
+	lockdep_assert_held(&cxl_rwsem.dpa);
 
 	perf = cxled_get_dpa_perf(cxled);
 	if (IS_ERR(perf))
diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
index 2be37084409f..f796731deedf 100644
--- a/drivers/cxl/core/core.h
+++ b/drivers/cxl/core/core.h
@@ -5,6 +5,7 @@
 #define __CXL_CORE_H__
 
 #include <cxl/mailbox.h>
+#include <linux/rwsem.h>
 
 extern const struct device_type cxl_nvdimm_bridge_type;
 extern const struct device_type cxl_nvdimm_type;
@@ -107,8 +108,20 @@ u16 cxl_rcrb_to_aer(struct device *dev, resource_size_t rcrb);
 #define PCI_RCRB_CAP_HDR_NEXT_MASK	GENMASK(15, 8)
 #define PCI_CAP_EXP_SIZEOF		0x3c
 
-extern struct rw_semaphore cxl_dpa_rwsem;
-extern struct rw_semaphore cxl_region_rwsem;
+struct cxl_rwsem {
+	/*
+	 * All changes to HPA (interleave configuration) occur with this
+	 * lock held for write.
+	 */
+	struct rw_semaphore region;
+	/*
+	 * All changes to a device DPA space occur with this lock held
+	 * for write.
+	 */
+	struct rw_semaphore dpa;
+};
+
+extern struct cxl_rwsem cxl_rwsem;
 
 int cxl_memdev_init(void);
 void cxl_memdev_exit(void);
diff --git a/drivers/cxl/core/edac.c b/drivers/cxl/core/edac.c
index 623aaa4439c4..9ed1b670efb8 100644
--- a/drivers/cxl/core/edac.c
+++ b/drivers/cxl/core/edac.c
@@ -115,10 +115,9 @@ static int cxl_scrub_get_attrbs(struct cxl_patrol_scrub_context *cxl_ps_ctx,
 						flags, min_cycle);
 	}
 
-	struct rw_semaphore *region_lock __free(rwsem_read_release) =
-		rwsem_read_intr_acquire(&cxl_region_rwsem);
-	if (!region_lock)
-		return -EINTR;
+	ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
+	if ((ret = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
+		return ret;
 
 	cxlr = cxl_ps_ctx->cxlr;
 	p = &cxlr->params;
@@ -158,10 +157,9 @@ static int cxl_scrub_set_attrbs_region(struct device *dev,
 	struct cxl_region *cxlr;
 	int ret, i;
 
-	struct rw_semaphore *region_lock __free(rwsem_read_release) =
-		rwsem_read_intr_acquire(&cxl_region_rwsem);
-	if (!region_lock)
-		return -EINTR;
+	ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
+	if ((ret = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
+		return ret;
 
 	cxlr = cxl_ps_ctx->cxlr;
 	p = &cxlr->params;
@@ -1340,16 +1338,15 @@ cxl_mem_perform_sparing(struct device *dev,
 	struct cxl_memdev_sparing_in_payload sparing_pi;
 	struct cxl_event_dram *rec = NULL;
 	u16 validity_flags = 0;
+	int ret;
 
-	struct rw_semaphore *region_lock __free(rwsem_read_release) =
-		rwsem_read_intr_acquire(&cxl_region_rwsem);
-	if (!region_lock)
-		return -EINTR;
+	ACQUIRE(rwsem_read_intr, region_rwsem)(&cxl_rwsem.region);
+	if ((ret = ACQUIRE_ERR(rwsem_read_intr, &region_rwsem)))
+		return ret;
 
-	struct rw_semaphore *dpa_lock __free(rwsem_read_release) =
-		rwsem_read_intr_acquire(&cxl_dpa_rwsem);
-	if (!dpa_lock)
-		return -EINTR;
+	ACQUIRE(rwsem_read_intr, dpa_rwsem)(&cxl_rwsem.dpa);
+	if ((ret = ACQUIRE_ERR(rwsem_read_intr, &dpa_rwsem)))
+		return ret;
 
 	if (!cxl_sparing_ctx->cap_safe_when_in_use) {
 		/* Memory to repair must be offline */
@@ -1787,16 +1784,15 @@ static int cxl_mem_perform_ppr(struct cxl_ppr_context *cxl_ppr_ctx)
 	struct cxl_memdev_ppr_maintenance_attrbs maintenance_attrbs;
 	struct cxl_memdev *cxlmd = cxl_ppr_ctx->cxlmd;
 	struct cxl_mem_repair_attrbs attrbs = { 0 };
+	int ret;
 
-	struct rw_semaphore *region_lock __free(rwsem_read_release) =
-		rwsem_read_intr_acquire(&cxl_region_rwsem);
-	if (!region_lock)
-		return -EINTR;
+	ACQUIRE(rwsem_read_intr, region_rwsem)(&cxl_rwsem.region);
+	if ((ret = ACQUIRE_ERR(rwsem_read_intr, &region_rwsem)))
+		return ret;
 
-	struct rw_semaphore *dpa_lock __free(rwsem_read_release) =
-		rwsem_read_intr_acquire(&cxl_dpa_rwsem);
-	if (!dpa_lock)
-		return -EINTR;
+	ACQUIRE(rwsem_read_intr, dpa_rwsem)(&cxl_rwsem.dpa);
+	if ((ret = ACQUIRE_ERR(rwsem_read_intr, &dpa_rwsem)))
+		return ret;
 
 	if (!cxl_ppr_ctx->media_accessible || !cxl_ppr_ctx->data_retained) {
 		/* Memory to repair must be offline */
diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
index e9cb34e30248..865a71bce251 100644
--- a/drivers/cxl/core/hdm.c
+++ b/drivers/cxl/core/hdm.c
@@ -16,7 +16,10 @@
  * for enumerating these registers and capabilities.
  */
 
-DECLARE_RWSEM(cxl_dpa_rwsem);
+struct cxl_rwsem cxl_rwsem = {
+	.region = __RWSEM_INITIALIZER(cxl_rwsem.region),
+	.dpa = __RWSEM_INITIALIZER(cxl_rwsem.dpa),
+};
 
 static int add_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld,
 			   int *target_map)
@@ -214,7 +217,7 @@ void cxl_dpa_debug(struct seq_file *file, struct cxl_dev_state *cxlds)
 {
 	struct resource *p1, *p2;
 
-	guard(rwsem_read)(&cxl_dpa_rwsem);
+	guard(rwsem_read)(&cxl_rwsem.dpa);
 	for (p1 = cxlds->dpa_res.child; p1; p1 = p1->sibling) {
 		__cxl_dpa_debug(file, p1, 0);
 		for (p2 = p1->child; p2; p2 = p2->sibling)
@@ -266,7 +269,7 @@ static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
 	struct resource *res = cxled->dpa_res;
 	resource_size_t skip_start;
 
-	lockdep_assert_held_write(&cxl_dpa_rwsem);
+	lockdep_assert_held_write(&cxl_rwsem.dpa);
 
 	/* save @skip_start, before @res is released */
 	skip_start = res->start - cxled->skip;
@@ -281,7 +284,7 @@ static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
 
 static void cxl_dpa_release(void *cxled)
 {
-	guard(rwsem_write)(&cxl_dpa_rwsem);
+	guard(rwsem_write)(&cxl_rwsem.dpa);
 	__cxl_dpa_release(cxled);
 }
 
@@ -293,7 +296,7 @@ static void devm_cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
 {
 	struct cxl_port *port = cxled_to_port(cxled);
 
-	lockdep_assert_held_write(&cxl_dpa_rwsem);
+	lockdep_assert_held_write(&cxl_rwsem.dpa);
 	devm_remove_action(&port->dev, cxl_dpa_release, cxled);
 	__cxl_dpa_release(cxled);
 }
@@ -361,7 +364,7 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
 	struct resource *res;
 	int rc;
 
-	lockdep_assert_held_write(&cxl_dpa_rwsem);
+	lockdep_assert_held_write(&cxl_rwsem.dpa);
 
 	if (!len) {
 		dev_warn(dev, "decoder%d.%d: empty reservation attempted\n",
@@ -470,7 +473,7 @@ int cxl_dpa_setup(struct cxl_dev_state *cxlds, const struct cxl_dpa_info *info)
 {
 	struct device *dev = cxlds->dev;
 
-	guard(rwsem_write)(&cxl_dpa_rwsem);
+	guard(rwsem_write)(&cxl_rwsem.dpa);
 
 	if (cxlds->nr_partitions)
 		return -EBUSY;
@@ -516,9 +519,8 @@ int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
 	struct cxl_port *port = cxled_to_port(cxled);
 	int rc;
 
-	down_write(&cxl_dpa_rwsem);
-	rc = __cxl_dpa_reserve(cxled, base, len, skipped);
-	up_write(&cxl_dpa_rwsem);
+	scoped_guard(rwsem_write, &cxl_rwsem.dpa)
+		rc = __cxl_dpa_reserve(cxled, base, len, skipped);
 
 	if (rc)
 		return rc;
@@ -529,7 +531,7 @@ EXPORT_SYMBOL_NS_GPL(devm_cxl_dpa_reserve, "CXL");
 
 resource_size_t cxl_dpa_size(struct cxl_endpoint_decoder *cxled)
 {
-	guard(rwsem_read)(&cxl_dpa_rwsem);
+	guard(rwsem_read)(&cxl_rwsem.dpa);
 	if (cxled->dpa_res)
 		return resource_size(cxled->dpa_res);
 
@@ -540,7 +542,7 @@ resource_size_t cxl_dpa_resource_start(struct cxl_endpoint_decoder *cxled)
 {
 	resource_size_t base = -1;
 
-	lockdep_assert_held(&cxl_dpa_rwsem);
+	lockdep_assert_held(&cxl_rwsem.dpa);
 	if (cxled->dpa_res)
 		base = cxled->dpa_res->start;
 
@@ -552,7 +554,7 @@ int cxl_dpa_free(struct cxl_endpoint_decoder *cxled)
 	struct cxl_port *port = cxled_to_port(cxled);
 	struct device *dev = &cxled->cxld.dev;
 
-	guard(rwsem_write)(&cxl_dpa_rwsem);
+	guard(rwsem_write)(&cxl_rwsem.dpa);
 	if (!cxled->dpa_res)
 		return 0;
 	if (cxled->cxld.region) {
@@ -582,7 +584,7 @@ int cxl_dpa_set_part(struct cxl_endpoint_decoder *cxled,
 	struct device *dev = &cxled->cxld.dev;
 	int part;
 
-	guard(rwsem_write)(&cxl_dpa_rwsem);
+	guard(rwsem_write)(&cxl_rwsem.dpa);
 	if (cxled->cxld.flags & CXL_DECODER_F_ENABLE)
 		return -EBUSY;
 
@@ -614,7 +616,7 @@ static int __cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, u64 size)
 	struct resource *p, *last;
 	int part;
 
-	guard(rwsem_write)(&cxl_dpa_rwsem);
+	guard(rwsem_write)(&cxl_rwsem.dpa);
 	if (cxled->cxld.region) {
 		dev_dbg(dev, "decoder attached to %s\n",
 			dev_name(&cxled->cxld.region->dev));
@@ -842,9 +844,8 @@ static int cxl_decoder_commit(struct cxl_decoder *cxld)
 		}
 	}
 
-	down_read(&cxl_dpa_rwsem);
-	setup_hw_decoder(cxld, hdm);
-	up_read(&cxl_dpa_rwsem);
+	scoped_guard(rwsem_read, &cxl_rwsem.dpa)
+		setup_hw_decoder(cxld, hdm);
 
 	port->commit_end++;
 	rc = cxld_await_commit(hdm, cxld->id);
@@ -882,7 +883,7 @@ void cxl_port_commit_reap(struct cxl_decoder *cxld)
 {
 	struct cxl_port *port = to_cxl_port(cxld->dev.parent);
 
-	lockdep_assert_held_write(&cxl_region_rwsem);
+	lockdep_assert_held_write(&cxl_rwsem.region);
 
 	/*
 	 * Once the highest committed decoder is disabled, free any other
@@ -1030,7 +1031,7 @@ static int init_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld,
 		else
 			cxld->target_type = CXL_DECODER_DEVMEM;
 
-		guard(rwsem_write)(&cxl_region_rwsem);
+		guard(rwsem_write)(&cxl_rwsem.region);
 		if (cxld->id != cxl_num_decoders_committed(port)) {
 			dev_warn(&port->dev,
 				 "decoder%d.%d: Committed out of order\n",
diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 81b21effe8cf..92cd3cbdd8ec 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -909,8 +909,8 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
 		 * translations. Take topology mutation locks and lookup
 		 * { HPA, REGION } from { DPA, MEMDEV } in the event record.
 		 */
-		guard(rwsem_read)(&cxl_region_rwsem);
-		guard(rwsem_read)(&cxl_dpa_rwsem);
+		guard(rwsem_read)(&cxl_rwsem.region);
+		guard(rwsem_read)(&cxl_rwsem.dpa);
 
 		dpa = le64_to_cpu(evt->media_hdr.phys_addr) & CXL_DPA_MASK;
 		cxlr = cxl_dpa_to_region(cxlmd, dpa);
@@ -1265,7 +1265,7 @@ int cxl_mem_sanitize(struct cxl_memdev *cxlmd, u16 cmd)
 	/* synchronize with cxl_mem_probe() and decoder write operations */
 	guard(device)(&cxlmd->dev);
 	endpoint = cxlmd->endpoint;
-	guard(rwsem_read)(&cxl_region_rwsem);
+	guard(rwsem_read)(&cxl_rwsem.region);
 	/*
 	 * Require an endpoint to be safe otherwise the driver can not
 	 * be sure that the device is unmapped.
diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
index f88a13adf7fa..f5fbd34310fd 100644
--- a/drivers/cxl/core/memdev.c
+++ b/drivers/cxl/core/memdev.c
@@ -232,15 +232,13 @@ int cxl_trigger_poison_list(struct cxl_memdev *cxlmd)
 	if (!port || !is_cxl_endpoint(port))
 		return -EINVAL;
 
-	rc = down_read_interruptible(&cxl_region_rwsem);
-	if (rc)
+	ACQUIRE(rwsem_read_intr, region_rwsem)(&cxl_rwsem.region);
+	if ((rc = ACQUIRE_ERR(rwsem_read_intr, &region_rwsem)))
 		return rc;
 
-	rc = down_read_interruptible(&cxl_dpa_rwsem);
-	if (rc) {
-		up_read(&cxl_region_rwsem);
+	ACQUIRE(rwsem_read_intr, dpa_rwsem)(&cxl_rwsem.dpa);
+	if ((rc = ACQUIRE_ERR(rwsem_read_intr, &dpa_rwsem)))
 		return rc;
-	}
 
 	if (cxl_num_decoders_committed(port) == 0) {
 		/* No regions mapped to this memdev */
@@ -249,8 +247,6 @@ int cxl_trigger_poison_list(struct cxl_memdev *cxlmd)
 		/* Regions mapped, collect poison by endpoint */
 		rc =  cxl_get_poison_by_endpoint(port);
 	}
-	up_read(&cxl_dpa_rwsem);
-	up_read(&cxl_region_rwsem);
 
 	return rc;
 }
@@ -292,19 +288,17 @@ int cxl_inject_poison(struct cxl_memdev *cxlmd, u64 dpa)
 	if (!IS_ENABLED(CONFIG_DEBUG_FS))
 		return 0;
 
-	rc = down_read_interruptible(&cxl_region_rwsem);
-	if (rc)
+	ACQUIRE(rwsem_read_intr, region_rwsem)(&cxl_rwsem.region);
+	if ((rc = ACQUIRE_ERR(rwsem_read_intr, &region_rwsem)))
 		return rc;
 
-	rc = down_read_interruptible(&cxl_dpa_rwsem);
-	if (rc) {
-		up_read(&cxl_region_rwsem);
+	ACQUIRE(rwsem_read_intr, dpa_rwsem)(&cxl_rwsem.dpa);
+	if ((rc = ACQUIRE_ERR(rwsem_read_intr, &dpa_rwsem)))
 		return rc;
-	}
 
 	rc = cxl_validate_poison_dpa(cxlmd, dpa);
 	if (rc)
-		goto out;
+		return rc;
 
 	inject.address = cpu_to_le64(dpa);
 	mbox_cmd = (struct cxl_mbox_cmd) {
@@ -314,7 +308,7 @@ int cxl_inject_poison(struct cxl_memdev *cxlmd, u64 dpa)
 	};
 	rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
 	if (rc)
-		goto out;
+		return rc;
 
 	cxlr = cxl_dpa_to_region(cxlmd, dpa);
 	if (cxlr)
@@ -327,11 +321,8 @@ int cxl_inject_poison(struct cxl_memdev *cxlmd, u64 dpa)
 		.length = cpu_to_le32(1),
 	};
 	trace_cxl_poison(cxlmd, cxlr, &record, 0, 0, CXL_POISON_TRACE_INJECT);
-out:
-	up_read(&cxl_dpa_rwsem);
-	up_read(&cxl_region_rwsem);
 
-	return rc;
+	return 0;
 }
 EXPORT_SYMBOL_NS_GPL(cxl_inject_poison, "CXL");
 
@@ -347,19 +338,17 @@ int cxl_clear_poison(struct cxl_memdev *cxlmd, u64 dpa)
 	if (!IS_ENABLED(CONFIG_DEBUG_FS))
 		return 0;
 
-	rc = down_read_interruptible(&cxl_region_rwsem);
-	if (rc)
+	ACQUIRE(rwsem_read_intr, region_rwsem)(&cxl_rwsem.region);
+	if ((rc = ACQUIRE_ERR(rwsem_read_intr, &region_rwsem)))
 		return rc;
 
-	rc = down_read_interruptible(&cxl_dpa_rwsem);
-	if (rc) {
-		up_read(&cxl_region_rwsem);
+	ACQUIRE(rwsem_read_intr, dpa_rwsem)(&cxl_rwsem.dpa);
+	if ((rc = ACQUIRE_ERR(rwsem_read_intr, &dpa_rwsem)))
 		return rc;
-	}
 
 	rc = cxl_validate_poison_dpa(cxlmd, dpa);
 	if (rc)
-		goto out;
+		return rc;
 
 	/*
 	 * In CXL 3.0 Spec 8.2.9.8.4.3, the Clear Poison mailbox command
@@ -378,7 +367,7 @@ int cxl_clear_poison(struct cxl_memdev *cxlmd, u64 dpa)
 
 	rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
 	if (rc)
-		goto out;
+		return rc;
 
 	cxlr = cxl_dpa_to_region(cxlmd, dpa);
 	if (cxlr)
@@ -391,11 +380,8 @@ int cxl_clear_poison(struct cxl_memdev *cxlmd, u64 dpa)
 		.length = cpu_to_le32(1),
 	};
 	trace_cxl_poison(cxlmd, cxlr, &record, 0, 0, CXL_POISON_TRACE_CLEAR);
-out:
-	up_read(&cxl_dpa_rwsem);
-	up_read(&cxl_region_rwsem);
 
-	return rc;
+	return 0;
 }
 EXPORT_SYMBOL_NS_GPL(cxl_clear_poison, "CXL");
 
diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
index 087a20a9ee1c..bacf1380dc4d 100644
--- a/drivers/cxl/core/port.c
+++ b/drivers/cxl/core/port.c
@@ -30,18 +30,12 @@
  * instantiated by the core.
  */
 
-/*
- * All changes to the interleave configuration occur with this lock held
- * for write.
- */
-DECLARE_RWSEM(cxl_region_rwsem);
-
 static DEFINE_IDA(cxl_port_ida);
 static DEFINE_XARRAY(cxl_root_buses);
 
 int cxl_num_decoders_committed(struct cxl_port *port)
 {
-	lockdep_assert_held(&cxl_region_rwsem);
+	lockdep_assert_held(&cxl_rwsem.region);
 
 	return port->commit_end + 1;
 }
@@ -176,7 +170,7 @@ static ssize_t target_list_show(struct device *dev,
 	ssize_t offset;
 	int rc;
 
-	guard(rwsem_read)(&cxl_region_rwsem);
+	guard(rwsem_read)(&cxl_rwsem.region);
 	rc = emit_target_list(cxlsd, buf);
 	if (rc < 0)
 		return rc;
@@ -196,7 +190,7 @@ static ssize_t mode_show(struct device *dev, struct device_attribute *attr,
 	struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev);
 	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
 	struct cxl_dev_state *cxlds = cxlmd->cxlds;
-	/* without @cxl_dpa_rwsem, make sure @part is not reloaded */
+	/* without @cxl_rwsem.dpa, make sure @part is not reloaded */
 	int part = READ_ONCE(cxled->part);
 	const char *desc;
 
@@ -235,7 +229,7 @@ static ssize_t dpa_resource_show(struct device *dev, struct device_attribute *at
 {
 	struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev);
 
-	guard(rwsem_read)(&cxl_dpa_rwsem);
+	guard(rwsem_read)(&cxl_rwsem.dpa);
 	return sysfs_emit(buf, "%#llx\n", (u64)cxl_dpa_resource_start(cxled));
 }
 static DEVICE_ATTR_RO(dpa_resource);
@@ -560,7 +554,7 @@ static ssize_t decoders_committed_show(struct device *dev,
 {
 	struct cxl_port *port = to_cxl_port(dev);
 
-	guard(rwsem_read)(&cxl_region_rwsem);
+	guard(rwsem_read)(&cxl_rwsem.region);
 	return sysfs_emit(buf, "%d\n", cxl_num_decoders_committed(port));
 }
 
@@ -1722,7 +1716,7 @@ static int decoder_populate_targets(struct cxl_switch_decoder *cxlsd,
 	if (xa_empty(&port->dports))
 		return -EINVAL;
 
-	guard(rwsem_write)(&cxl_region_rwsem);
+	guard(rwsem_write)(&cxl_rwsem.region);
 	for (i = 0; i < cxlsd->cxld.interleave_ways; i++) {
 		struct cxl_dport *dport = find_dport(port, target_map[i]);
 
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index 4314aaed8ad8..ad60c93be803 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -141,16 +141,12 @@ static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
 	struct cxl_region_params *p = &cxlr->params;
 	ssize_t rc;
 
-	rc = down_read_interruptible(&cxl_region_rwsem);
-	if (rc)
+	ACQUIRE(rwsem_read_intr, region_rwsem)(&cxl_rwsem.region);
+	if ((rc = ACQUIRE_ERR(rwsem_read_intr, &region_rwsem)))
 		return rc;
 	if (cxlr->mode != CXL_PARTMODE_PMEM)
-		rc = sysfs_emit(buf, "\n");
-	else
-		rc = sysfs_emit(buf, "%pUb\n", &p->uuid);
-	up_read(&cxl_region_rwsem);
-
-	return rc;
+		return sysfs_emit(buf, "\n");
+	return sysfs_emit(buf, "%pUb\n", &p->uuid);
 }
 
 static int is_dup(struct device *match, void *data)
@@ -162,7 +158,7 @@ static int is_dup(struct device *match, void *data)
 	if (!is_cxl_region(match))
 		return 0;
 
-	lockdep_assert_held(&cxl_region_rwsem);
+	lockdep_assert_held(&cxl_rwsem.region);
 	cxlr = to_cxl_region(match);
 	p = &cxlr->params;
 
@@ -192,27 +188,22 @@ static ssize_t uuid_store(struct device *dev, struct device_attribute *attr,
 	if (uuid_is_null(&temp))
 		return -EINVAL;
 
-	rc = down_write_killable(&cxl_region_rwsem);
-	if (rc)
+	ACQUIRE(rwsem_write_kill, region_rwsem)(&cxl_rwsem.region);
+	if ((rc = ACQUIRE_ERR(rwsem_write_kill, &region_rwsem)))
 		return rc;
 
 	if (uuid_equal(&p->uuid, &temp))
-		goto out;
+		return len;
 
-	rc = -EBUSY;
 	if (p->state >= CXL_CONFIG_ACTIVE)
-		goto out;
+		return -EBUSY;
 
 	rc = bus_for_each_dev(&cxl_bus_type, NULL, &temp, is_dup);
 	if (rc < 0)
-		goto out;
+		return rc;
 
 	uuid_copy(&p->uuid, &temp);
-out:
-	up_write(&cxl_region_rwsem);
 
-	if (rc)
-		return rc;
 	return len;
 }
 static DEVICE_ATTR_RW(uuid);
@@ -354,20 +345,17 @@ static int queue_reset(struct cxl_region *cxlr)
 	struct cxl_region_params *p = &cxlr->params;
 	int rc;
 
-	rc = down_write_killable(&cxl_region_rwsem);
-	if (rc)
+	ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region);
+	if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem)))
 		return rc;
 
 	/* Already in the requested state? */
 	if (p->state < CXL_CONFIG_COMMIT)
-		goto out;
+		return 0;
 
 	p->state = CXL_CONFIG_RESET_PENDING;
 
-out:
-	up_write(&cxl_region_rwsem);
-
-	return rc;
+	return 0;
 }
 
 static int __commit(struct cxl_region *cxlr)
@@ -375,19 +363,17 @@ static int __commit(struct cxl_region *cxlr)
 	struct cxl_region_params *p = &cxlr->params;
 	int rc;
 
-	rc = down_write_killable(&cxl_region_rwsem);
-	if (rc)
+	ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region);
+	if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem)))
 		return rc;
 
 	/* Already in the requested state? */
 	if (p->state >= CXL_CONFIG_COMMIT)
-		goto out;
+		return 0;
 
 	/* Not ready to commit? */
-	if (p->state < CXL_CONFIG_ACTIVE) {
-		rc = -ENXIO;
-		goto out;
-	}
+	if (p->state < CXL_CONFIG_ACTIVE)
+		return -ENXIO;
 
 	/*
 	 * Invalidate caches before region setup to drop any speculative
@@ -395,16 +381,15 @@ static int __commit(struct cxl_region *cxlr)
 	 */
 	rc = cxl_region_invalidate_memregion(cxlr);
 	if (rc)
-		goto out;
+		return rc;
 
 	rc = cxl_region_decode_commit(cxlr);
-	if (rc == 0)
-		p->state = CXL_CONFIG_COMMIT;
+	if (rc)
+		return rc;
 
-out:
-	up_write(&cxl_region_rwsem);
+	p->state = CXL_CONFIG_COMMIT;
 
-	return rc;
+	return 0;
 }
 
 static ssize_t commit_store(struct device *dev, struct device_attribute *attr,
@@ -437,10 +422,10 @@ static ssize_t commit_store(struct device *dev, struct device_attribute *attr,
 	device_release_driver(&cxlr->dev);
 
 	/*
-	 * With the reset pending take cxl_region_rwsem unconditionally
+	 * With the reset pending take cxl_rwsem.region unconditionally
 	 * to ensure the reset gets handled before returning.
 	 */
-	guard(rwsem_write)(&cxl_region_rwsem);
+	guard(rwsem_write)(&cxl_rwsem.region);
 
 	/*
 	 * Revalidate that the reset is still pending in case another
@@ -461,13 +446,10 @@ static ssize_t commit_show(struct device *dev, struct device_attribute *attr,
 	struct cxl_region_params *p = &cxlr->params;
 	ssize_t rc;
 
-	rc = down_read_interruptible(&cxl_region_rwsem);
-	if (rc)
+	ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
+	if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
 		return rc;
-	rc = sysfs_emit(buf, "%d\n", p->state >= CXL_CONFIG_COMMIT);
-	up_read(&cxl_region_rwsem);
-
-	return rc;
+	return sysfs_emit(buf, "%d\n", p->state >= CXL_CONFIG_COMMIT);
 }
 static DEVICE_ATTR_RW(commit);
 
@@ -491,15 +473,12 @@ static ssize_t interleave_ways_show(struct device *dev,
 {
 	struct cxl_region *cxlr = to_cxl_region(dev);
 	struct cxl_region_params *p = &cxlr->params;
-	ssize_t rc;
+	int rc;
 
-	rc = down_read_interruptible(&cxl_region_rwsem);
-	if (rc)
+	ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
+	if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
 		return rc;
-	rc = sysfs_emit(buf, "%d\n", p->interleave_ways);
-	up_read(&cxl_region_rwsem);
-
-	return rc;
+	return sysfs_emit(buf, "%d\n", p->interleave_ways);
 }
 
 static const struct attribute_group *get_cxl_region_target_group(void);
@@ -534,23 +513,21 @@ static ssize_t interleave_ways_store(struct device *dev,
 		return -EINVAL;
 	}
 
-	rc = down_write_killable(&cxl_region_rwsem);
-	if (rc)
+	ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region);
+	if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem)))
 		return rc;
-	if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) {
-		rc = -EBUSY;
-		goto out;
-	}
+
+	if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE)
+		return -EBUSY;
 
 	save = p->interleave_ways;
 	p->interleave_ways = val;
 	rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_target_group());
-	if (rc)
+	if (rc) {
 		p->interleave_ways = save;
-out:
-	up_write(&cxl_region_rwsem);
-	if (rc)
 		return rc;
+	}
+
 	return len;
 }
 static DEVICE_ATTR_RW(interleave_ways);
@@ -561,15 +538,12 @@ static ssize_t interleave_granularity_show(struct device *dev,
 {
 	struct cxl_region *cxlr = to_cxl_region(dev);
 	struct cxl_region_params *p = &cxlr->params;
-	ssize_t rc;
+	int rc;
 
-	rc = down_read_interruptible(&cxl_region_rwsem);
-	if (rc)
+	ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
+	if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
 		return rc;
-	rc = sysfs_emit(buf, "%d\n", p->interleave_granularity);
-	up_read(&cxl_region_rwsem);
-
-	return rc;
+	return sysfs_emit(buf, "%d\n", p->interleave_granularity);
 }
 
 static ssize_t interleave_granularity_store(struct device *dev,
@@ -602,19 +576,15 @@ static ssize_t interleave_granularity_store(struct device *dev,
 	if (cxld->interleave_ways > 1 && val != cxld->interleave_granularity)
 		return -EINVAL;
 
-	rc = down_write_killable(&cxl_region_rwsem);
-	if (rc)
+	ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region);
+	if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem)))
 		return rc;
-	if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) {
-		rc = -EBUSY;
-		goto out;
-	}
+
+	if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE)
+		return -EBUSY;
 
 	p->interleave_granularity = val;
-out:
-	up_write(&cxl_region_rwsem);
-	if (rc)
-		return rc;
+
 	return len;
 }
 static DEVICE_ATTR_RW(interleave_granularity);
@@ -625,17 +595,15 @@ static ssize_t resource_show(struct device *dev, struct device_attribute *attr,
 	struct cxl_region *cxlr = to_cxl_region(dev);
 	struct cxl_region_params *p = &cxlr->params;
 	u64 resource = -1ULL;
-	ssize_t rc;
+	int rc;
 
-	rc = down_read_interruptible(&cxl_region_rwsem);
-	if (rc)
+	ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
+	if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
 		return rc;
+
 	if (p->res)
 		resource = p->res->start;
-	rc = sysfs_emit(buf, "%#llx\n", resource);
-	up_read(&cxl_region_rwsem);
-
-	return rc;
+	return sysfs_emit(buf, "%#llx\n", resource);
 }
 static DEVICE_ATTR_RO(resource);
 
@@ -663,7 +631,7 @@ static int alloc_hpa(struct cxl_region *cxlr, resource_size_t size)
 	struct resource *res;
 	u64 remainder = 0;
 
-	lockdep_assert_held_write(&cxl_region_rwsem);
+	lockdep_assert_held_write(&cxl_rwsem.region);
 
 	/* Nothing to do... */
 	if (p->res && resource_size(p->res) == size)
@@ -705,7 +673,7 @@ static void cxl_region_iomem_release(struct cxl_region *cxlr)
 	struct cxl_region_params *p = &cxlr->params;
 
 	if (device_is_registered(&cxlr->dev))
-		lockdep_assert_held_write(&cxl_region_rwsem);
+		lockdep_assert_held_write(&cxl_rwsem.region);
 	if (p->res) {
 		/*
 		 * Autodiscovered regions may not have been able to insert their
@@ -722,7 +690,7 @@ static int free_hpa(struct cxl_region *cxlr)
 {
 	struct cxl_region_params *p = &cxlr->params;
 
-	lockdep_assert_held_write(&cxl_region_rwsem);
+	lockdep_assert_held_write(&cxl_rwsem.region);
 
 	if (!p->res)
 		return 0;
@@ -746,15 +714,14 @@ static ssize_t size_store(struct device *dev, struct device_attribute *attr,
 	if (rc)
 		return rc;
 
-	rc = down_write_killable(&cxl_region_rwsem);
-	if (rc)
+	ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region);
+	if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem)))
 		return rc;
 
 	if (val)
 		rc = alloc_hpa(cxlr, val);
 	else
 		rc = free_hpa(cxlr);
-	up_write(&cxl_region_rwsem);
 
 	if (rc)
 		return rc;
@@ -770,15 +737,12 @@ static ssize_t size_show(struct device *dev, struct device_attribute *attr,
 	u64 size = 0;
 	ssize_t rc;
 
-	rc = down_read_interruptible(&cxl_region_rwsem);
-	if (rc)
+	ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
+	if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
 		return rc;
 	if (p->res)
 		size = resource_size(p->res);
-	rc = sysfs_emit(buf, "%#llx\n", size);
-	up_read(&cxl_region_rwsem);
-
-	return rc;
+	return sysfs_emit(buf, "%#llx\n", size);
 }
 static DEVICE_ATTR_RW(size);
 
@@ -804,26 +768,20 @@ static size_t show_targetN(struct cxl_region *cxlr, char *buf, int pos)
 	struct cxl_endpoint_decoder *cxled;
 	int rc;
 
-	rc = down_read_interruptible(&cxl_region_rwsem);
-	if (rc)
+	ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
+	if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
 		return rc;
 
 	if (pos >= p->interleave_ways) {
 		dev_dbg(&cxlr->dev, "position %d out of range %d\n", pos,
 			p->interleave_ways);
-		rc = -ENXIO;
-		goto out;
+		return -ENXIO;
 	}
 
 	cxled = p->targets[pos];
 	if (!cxled)
-		rc = sysfs_emit(buf, "\n");
-	else
-		rc = sysfs_emit(buf, "%s\n", dev_name(&cxled->cxld.dev));
-out:
-	up_read(&cxl_region_rwsem);
-
-	return rc;
+		return sysfs_emit(buf, "\n");
+	return sysfs_emit(buf, "%s\n", dev_name(&cxled->cxld.dev));
 }
 
 static int check_commit_order(struct device *dev, void *data)
@@ -938,7 +896,7 @@ cxl_port_pick_region_decoder(struct cxl_port *port,
 	/*
 	 * This decoder is pinned registered as long as the endpoint decoder is
 	 * registered, and endpoint decoder unregistration holds the
-	 * cxl_region_rwsem over unregister events, so no need to hold on to
+	 * cxl_rwsem.region over unregister events, so no need to hold on to
 	 * this extra reference.
 	 */
 	put_device(dev);
@@ -1129,7 +1087,7 @@ static int cxl_port_attach_region(struct cxl_port *port,
 	unsigned long index;
 	int rc = -EBUSY;
 
-	lockdep_assert_held_write(&cxl_region_rwsem);
+	lockdep_assert_held_write(&cxl_rwsem.region);
 
 	cxl_rr = cxl_rr_load(port, cxlr);
 	if (cxl_rr) {
@@ -1239,7 +1197,7 @@ static void cxl_port_detach_region(struct cxl_port *port,
 	struct cxl_region_ref *cxl_rr;
 	struct cxl_ep *ep = NULL;
 
-	lockdep_assert_held_write(&cxl_region_rwsem);
+	lockdep_assert_held_write(&cxl_rwsem.region);
 
 	cxl_rr = cxl_rr_load(port, cxlr);
 	if (!cxl_rr)
@@ -2142,7 +2100,7 @@ __cxl_decoder_detach(struct cxl_region *cxlr,
 {
 	struct cxl_region_params *p;
 
-	lockdep_assert_held_write(&cxl_region_rwsem);
+	lockdep_assert_held_write(&cxl_rwsem.region);
 
 	if (!cxled) {
 		p = &cxlr->params;
@@ -2215,18 +2173,18 @@ int cxl_decoder_detach(struct cxl_region *cxlr,
 	struct cxl_region *detach;
 
 	/* when the decoder is being destroyed lock unconditionally */
-	if (mode == DETACH_INVALIDATE)
-		down_write(&cxl_region_rwsem);
-	else {
-		int rc = down_write_killable(&cxl_region_rwsem);
+	if (mode == DETACH_INVALIDATE) {
+		guard(rwsem_write)(&cxl_rwsem.region);
+		detach = __cxl_decoder_detach(cxlr, cxled, pos, mode);
+	} else {
+		int rc;
 
-		if (rc)
+		ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region);
+		if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem)))
 			return rc;
+		detach = __cxl_decoder_detach(cxlr, cxled, pos, mode);
 	}
 
-	detach = __cxl_decoder_detach(cxlr, cxled, pos, mode);
-	up_write(&cxl_region_rwsem);
-
 	if (detach) {
 		device_release_driver(&detach->dev);
 		put_device(&detach->dev);
@@ -2234,29 +2192,35 @@ int cxl_decoder_detach(struct cxl_region *cxlr,
 	return 0;
 }
 
+static int __attach_target(struct cxl_region *cxlr,
+			   struct cxl_endpoint_decoder *cxled, int pos,
+			   unsigned int state)
+{
+	int rc;
+
+	if (state == TASK_INTERRUPTIBLE) {
+		ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region);
+		if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem)))
+			return rc;
+		guard(rwsem_read)(&cxl_rwsem.dpa);
+		return cxl_region_attach(cxlr, cxled, pos);
+	}
+	guard(rwsem_write)(&cxl_rwsem.region);
+	guard(rwsem_read)(&cxl_rwsem.dpa);
+	return cxl_region_attach(cxlr, cxled, pos);
+}
+
 static int attach_target(struct cxl_region *cxlr,
 			 struct cxl_endpoint_decoder *cxled, int pos,
 			 unsigned int state)
 {
-	int rc = 0;
-
-	if (state == TASK_INTERRUPTIBLE)
-		rc = down_write_killable(&cxl_region_rwsem);
-	else
-		down_write(&cxl_region_rwsem);
-	if (rc)
-		return rc;
-
-	down_read(&cxl_dpa_rwsem);
-	rc = cxl_region_attach(cxlr, cxled, pos);
-	up_read(&cxl_dpa_rwsem);
-	up_write(&cxl_region_rwsem);
+	int rc = __attach_target(cxlr, cxled, pos, state);
 
-	if (rc)
-		dev_warn(cxled->cxld.dev.parent,
-			"failed to attach %s to %s: %d\n",
-			dev_name(&cxled->cxld.dev), dev_name(&cxlr->dev), rc);
+	if (rc == 0)
+		return 0;
 
+	dev_warn(cxled->cxld.dev.parent, "failed to attach %s to %s: %d\n",
+		 dev_name(&cxled->cxld.dev), dev_name(&cxlr->dev), rc);
 	return rc;
 }
 
@@ -2516,7 +2480,7 @@ static int cxl_region_perf_attrs_callback(struct notifier_block *nb,
 		return NOTIFY_DONE;
 
 	/*
-	 * No need to hold cxl_region_rwsem; region parameters are stable
+	 * No need to hold cxl_rwsem.region; region parameters are stable
 	 * within the cxl_region driver.
 	 */
 	region_nid = phys_to_target_node(cxlr->params.res->start);
@@ -2539,7 +2503,7 @@ static int cxl_region_calculate_adistance(struct notifier_block *nb,
 	int region_nid;
 
 	/*
-	 * No need to hold cxl_region_rwsem; region parameters are stable
+	 * No need to hold cxl_rwsem.region; region parameters are stable
 	 * within the cxl_region driver.
 	 */
 	region_nid = phys_to_target_node(cxlr->params.res->start);
@@ -2688,17 +2652,13 @@ static ssize_t region_show(struct device *dev, struct device_attribute *attr,
 	struct cxl_decoder *cxld = to_cxl_decoder(dev);
 	ssize_t rc;
 
-	rc = down_read_interruptible(&cxl_region_rwsem);
-	if (rc)
+	ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
+	if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
 		return rc;
 
 	if (cxld->region)
-		rc = sysfs_emit(buf, "%s\n", dev_name(&cxld->region->dev));
-	else
-		rc = sysfs_emit(buf, "\n");
-	up_read(&cxl_region_rwsem);
-
-	return rc;
+		return sysfs_emit(buf, "%s\n", dev_name(&cxld->region->dev));
+	return sysfs_emit(buf, "\n");
 }
 DEVICE_ATTR_RO(region);
 
@@ -3037,7 +2997,7 @@ static int cxl_pmem_region_alloc(struct cxl_region *cxlr)
 	struct device *dev;
 	int i;
 
-	guard(rwsem_read)(&cxl_region_rwsem);
+	guard(rwsem_read)(&cxl_rwsem.region);
 	if (p->state != CXL_CONFIG_COMMIT)
 		return -ENXIO;
 
@@ -3049,7 +3009,7 @@ static int cxl_pmem_region_alloc(struct cxl_region *cxlr)
 	cxlr_pmem->hpa_range.start = p->res->start;
 	cxlr_pmem->hpa_range.end = p->res->end;
 
-	/* Snapshot the region configuration underneath the cxl_region_rwsem */
+	/* Snapshot the region configuration underneath the cxl_rwsem.region */
 	cxlr_pmem->nr_mappings = p->nr_targets;
 	for (i = 0; i < p->nr_targets; i++) {
 		struct cxl_endpoint_decoder *cxled = p->targets[i];
@@ -3126,7 +3086,7 @@ static struct cxl_dax_region *cxl_dax_region_alloc(struct cxl_region *cxlr)
 	struct cxl_dax_region *cxlr_dax;
 	struct device *dev;
 
-	guard(rwsem_read)(&cxl_region_rwsem);
+	guard(rwsem_read)(&cxl_rwsem.region);
 	if (p->state != CXL_CONFIG_COMMIT)
 		return ERR_PTR(-ENXIO);
 
@@ -3326,7 +3286,7 @@ static int match_region_by_range(struct device *dev, const void *data)
 	cxlr = to_cxl_region(dev);
 	p = &cxlr->params;
 
-	guard(rwsem_read)(&cxl_region_rwsem);
+	guard(rwsem_read)(&cxl_rwsem.region);
 	if (p->res && p->res->start == r->start && p->res->end == r->end)
 		return 1;
 
@@ -3386,7 +3346,7 @@ static int __construct_region(struct cxl_region *cxlr,
 	struct resource *res;
 	int rc;
 
-	guard(rwsem_write)(&cxl_region_rwsem);
+	guard(rwsem_write)(&cxl_rwsem.region);
 	p = &cxlr->params;
 	if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) {
 		dev_err(cxlmd->dev.parent,
@@ -3522,10 +3482,10 @@ int cxl_add_to_region(struct cxl_endpoint_decoder *cxled)
 
 	attach_target(cxlr, cxled, -1, TASK_UNINTERRUPTIBLE);
 
-	down_read(&cxl_region_rwsem);
-	p = &cxlr->params;
-	attach = p->state == CXL_CONFIG_COMMIT;
-	up_read(&cxl_region_rwsem);
+	scoped_guard(rwsem_read, &cxl_rwsem.region) {
+		p = &cxlr->params;
+		attach = p->state == CXL_CONFIG_COMMIT;
+	}
 
 	if (attach) {
 		/*
@@ -3550,7 +3510,7 @@ u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 spa)
 	if (!endpoint)
 		return ~0ULL;
 
-	guard(rwsem_write)(&cxl_region_rwsem);
+	guard(rwsem_write)(&cxl_rwsem.region);
 
 	xa_for_each(&endpoint->regions, index, iter) {
 		struct cxl_region_params *p = &iter->region->params;
@@ -3592,30 +3552,23 @@ static int cxl_region_can_probe(struct cxl_region *cxlr)
 	struct cxl_region_params *p = &cxlr->params;
 	int rc;
 
-	rc = down_read_interruptible(&cxl_region_rwsem);
-	if (rc) {
+	ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
+	if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem))) {
 		dev_dbg(&cxlr->dev, "probe interrupted\n");
 		return rc;
 	}
 
 	if (p->state < CXL_CONFIG_COMMIT) {
 		dev_dbg(&cxlr->dev, "config state: %d\n", p->state);
-		rc = -ENXIO;
-		goto out;
+		return -ENXIO;
 	}
 
 	if (test_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags)) {
 		dev_err(&cxlr->dev,
 			"failed to activate, re-commit region and retry\n");
-		rc = -ENXIO;
-		goto out;
+		return -ENXIO;
 	}
 
-out:
-	up_read(&cxl_region_rwsem);
-
-	if (rc)
-		return rc;
 	return 0;
 }
 
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index 3f1695c96abc..50799a681231 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -469,7 +469,7 @@ enum cxl_config_state {
  * @nr_targets: number of targets
  * @cache_size: extended linear cache size if exists, otherwise zero.
  *
- * State transitions are protected by the cxl_region_rwsem
+ * State transitions are protected by cxl_rwsem.region
  */
 struct cxl_region_params {
 	enum cxl_config_state state;
@@ -912,15 +912,4 @@ bool cxl_endpoint_decoder_reset_detected(struct cxl_port *port);
 #endif
 
 u16 cxl_gpf_get_dvsec(struct device *dev);
-
-static inline struct rw_semaphore *rwsem_read_intr_acquire(struct rw_semaphore *rwsem)
-{
-	if (down_read_interruptible(rwsem))
-		return NULL;
-
-	return rwsem;
-}
-
-DEFINE_FREE(rwsem_read_release, struct rw_semaphore *, if (_T) up_read(_T))
-
 #endif /* __CXL_H__ */
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index c810deb88d13..cbafdc12e743 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -244,6 +244,7 @@ DEFINE_GUARD_COND(rwsem_read, _intr, down_read_interruptible(_T), _RET == 0)
 
 DEFINE_GUARD(rwsem_write, struct rw_semaphore *, down_write(_T), up_write(_T))
 DEFINE_GUARD_COND(rwsem_write, _try, down_write_trylock(_T))
+DEFINE_GUARD_COND(rwsem_write, _kill, down_write_killable(_T), _RET == 0)
 
 /*
  * downgrade write lock to read lock
-- 
cgit v1.2.3


From 09813cde376d9d8f30eaf761534532101a0a7755 Mon Sep 17 00:00:00 2001
From: Hiago De Franco <hiago.franco@toradex.com>
Date: Sun, 29 Jun 2025 14:25:10 -0300
Subject: pmdomain: core: introduce dev_pm_genpd_is_on()

This helper function returns the current power status of a given generic
power domain.

As an example, remoteproc/imx_rproc.c can now use this function to check
the power status of the remote core to properly set "attached" or
"offline" modes.

Suggested-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Bjorn Andersson <andersson@kernel.org>
Reviewed-by: Peng Fan <peng.fan@nxp.com>
Signed-off-by: Hiago De Franco <hiago.franco@toradex.com>
Link: https://lore.kernel.org/r/20250629172512.14857-2-hiagofranco@gmail.com
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/pmdomain/core.c   | 33 +++++++++++++++++++++++++++++++++
 include/linux/pm_domain.h |  6 ++++++
 2 files changed, 39 insertions(+)

(limited to 'include')

diff --git a/drivers/pmdomain/core.c b/drivers/pmdomain/core.c
index a86aeda1c955..0006ab3d0789 100644
--- a/drivers/pmdomain/core.c
+++ b/drivers/pmdomain/core.c
@@ -769,6 +769,39 @@ int dev_pm_genpd_rpm_always_on(struct device *dev, bool on)
 }
 EXPORT_SYMBOL_GPL(dev_pm_genpd_rpm_always_on);
 
+/**
+ * dev_pm_genpd_is_on() - Get device's current power domain status
+ *
+ * @dev: Device to get the current power status
+ *
+ * This function checks whether the generic power domain associated with the
+ * given device is on or not by verifying if genpd_status_on equals
+ * GENPD_STATE_ON.
+ *
+ * Note: this function returns the power status of the genpd at the time of the
+ * call. The power status may change afterwards due to activity from other
+ * devices sharing the same genpd. Therefore, this information should not be
+ * relied upon for long-term decisions about the device power state.
+ *
+ * Return: 'true' if the device's power domain is on, 'false' otherwise.
+ */
+bool dev_pm_genpd_is_on(struct device *dev)
+{
+	struct generic_pm_domain *genpd;
+	bool is_on;
+
+	genpd = dev_to_genpd_safe(dev);
+	if (!genpd)
+		return false;
+
+	genpd_lock(genpd);
+	is_on = genpd_status_on(genpd);
+	genpd_unlock(genpd);
+
+	return is_on;
+}
+EXPORT_SYMBOL_GPL(dev_pm_genpd_is_on);
+
 /**
  * pm_genpd_inc_rejected() - Adjust the rejected/usage counts for an idle-state.
  *
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index 99556589f45e..b9d3c7d5c4f8 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -315,6 +315,7 @@ void dev_pm_genpd_synced_poweroff(struct device *dev);
 int dev_pm_genpd_set_hwmode(struct device *dev, bool enable);
 bool dev_pm_genpd_get_hwmode(struct device *dev);
 int dev_pm_genpd_rpm_always_on(struct device *dev, bool on);
+bool dev_pm_genpd_is_on(struct device *dev);
 
 extern struct dev_power_governor simple_qos_governor;
 extern struct dev_power_governor pm_domain_always_on_gov;
@@ -407,6 +408,11 @@ static inline int dev_pm_genpd_rpm_always_on(struct device *dev, bool on)
 	return -EOPNOTSUPP;
 }
 
+static inline bool dev_pm_genpd_is_on(struct device *dev)
+{
+	return false;
+}
+
 #define simple_qos_governor		(*(struct dev_power_governor *)(NULL))
 #define pm_domain_always_on_gov		(*(struct dev_power_governor *)(NULL))
 #endif
-- 
cgit v1.2.3


From c1f1fda141373d7253b4c1497043b0ef85f534ce Mon Sep 17 00:00:00 2001
From: Shuai Xue <xueshuai@linux.alibaba.com>
Date: Mon, 14 Jul 2025 19:42:12 +0800
Subject: ACPI: APEI: handle synchronous exceptions in task work

The memory uncorrected error could be signaled by asynchronous interrupt
(specifically, SPI in arm64 platform), e.g. when an error is detected by
a background scrubber, or signaled by synchronous exception
(specifically, data abort exception in arm64 platform), e.g. when a CPU
tries to access a poisoned cache line. Currently, both synchronous and
asynchronous errors use memory_failure_queue() to schedule
memory_failure() to execute in a kworker context.

As a result, when a user-space process is accessing a poisoned data, a
data abort is taken and the memory_failure() is executed in the kworker
context, which:

  - will send the wrong si_code with the SIGBUS signal in early_kill
    mode, and
  - cannot kill the user-space process in some cases, resulting in a
    synchronous error infinite loop

Issue 1: send wrong si_code in early_kill mode

Since commit a70297d22132 ("ACPI: APEI: set memory failure flags as
MF_ACTION_REQUIRED on synchronous events"), the flag MF_ACTION_REQUIRED
could be used to determine whether a synchronous exception occurs on
ARM64 platform.  When a synchronous exception is detected, the kernel is
expected to terminate the current process which has accessed a poisoned
page. This is done by sending a SIGBUS signal with error code
BUS_MCEERR_AR, indicating an action-required machine check error on
read.

However, when kill_proc() is called to terminate the processes that have
the poisoned page mapped, it sends the incorrect SIGBUS error code
BUS_MCEERR_AO because the context in which it operates is not the one
where the error was triggered.

To reproduce this problem:

  # STEP1: enable early kill mode
  #sysctl -w vm.memory_failure_early_kill=1
  vm.memory_failure_early_kill = 1

  # STEP2: inject an UCE error and consume it to trigger a synchronous error
  #einj_mem_uc single
  0: single   vaddr = 0xffffb0d75400 paddr = 4092d55b400
  injecting ...
  triggering ...
  signal 7 code 5 addr 0xffffb0d75000
  page not present
  Test passed

The si_code (code 5) from einj_mem_uc indicates a BUS_MCEERR_AO error,
which is not factually correct.

After this change:

  # STEP1: enable early kill mode
  #sysctl -w vm.memory_failure_early_kill=1
  vm.memory_failure_early_kill = 1
  # STEP2: inject an UCE error and consume it to trigger a synchronous error
  #einj_mem_uc single
  0: single   vaddr = 0xffffb0d75400 paddr = 4092d55b400
  injecting ...
  triggering ...
  signal 7 code 4 addr 0xffffb0d75000
  page not present
  Test passed

The si_code (code 4) from einj_mem_uc indicates that it is a BUS_MCEERR_AR
error as expected.

Issue 2: a synchronous error infinite loop

If a user-space process, e.g. devmem, accesses a poisoned page for which
the HWPoison flag is set, kill_accessing_process() is called to send
SIGBUS to the current process with error info. Since memory_failure()
is executed in the kworker context, it will just do nothing but return
EFAULT. So, devmem will access the poisoned page and trigger an
exception again, resulting in a synchronous error infinite loop. Such
exception loop may cause platform firmware to exceed some threshold and
reboot when Linux could have recovered from this error.

To reproduce this problem:

  # STEP 1: inject an UCE error, and kernel will set HWPoison flag for related page
  #einj_mem_uc single
  0: single   vaddr = 0xffffb0d75400 paddr = 4092d55b400
  injecting ...
  triggering ...
  signal 7 code 4 addr 0xffffb0d75000
  page not present
  Test passed

  # STEP 2: access the same page and it will trigger a synchronous error infinite loop
  devmem 0x4092d55b400

To fix the above two issues, queue memory_failure() as a task_work so that
it runs in the context of the process that is actually consuming the
poisoned data.

Signed-off-by: Shuai Xue <xueshuai@linux.alibaba.com>
Tested-by: Ma Wupeng <mawupeng1@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reviewed-by: Xiaofei Tan <tanxiaofei@huawei.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Jane Chu <jane.chu@oracle.com>
Reviewed-by: Yazen Ghannam <yazen.ghannam@amd.com>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>
Link: https://patch.msgid.link/20250714114212.31660-3-xueshuai@linux.alibaba.com
[ rjw: Changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/apei/ghes.c | 79 +++++++++++++++++++++++++++---------------------
 include/acpi/ghes.h      |  3 --
 include/linux/mm.h       |  1 -
 mm/memory-failure.c      | 13 --------
 4 files changed, 45 insertions(+), 51 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index bda33a0f0a01..a0d54993edb3 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -464,28 +464,41 @@ static void ghes_clear_estatus(struct ghes *ghes,
 		ghes_ack_error(ghes->generic_v2);
 }
 
-/*
- * Called as task_work before returning to user-space.
- * Ensure any queued work has been done before we return to the context that
- * triggered the notification.
+/**
+ * struct ghes_task_work - for synchronous RAS event
+ *
+ * @twork:                callback_head for task work
+ * @pfn:                  page frame number of corrupted page
+ * @flags:                work control flags
+ *
+ * Structure to pass task work to be handled before
+ * returning to user-space via task_work_add().
  */
-static void ghes_kick_task_work(struct callback_head *head)
+struct ghes_task_work {
+	struct callback_head twork;
+	u64 pfn;
+	int flags;
+};
+
+static void memory_failure_cb(struct callback_head *twork)
 {
-	struct acpi_hest_generic_status *estatus;
-	struct ghes_estatus_node *estatus_node;
-	u32 node_len;
+	struct ghes_task_work *twcb = container_of(twork, struct ghes_task_work, twork);
+	int ret;
 
-	estatus_node = container_of(head, struct ghes_estatus_node, task_work);
-	if (IS_ENABLED(CONFIG_ACPI_APEI_MEMORY_FAILURE))
-		memory_failure_queue_kick(estatus_node->task_work_cpu);
+	ret = memory_failure(twcb->pfn, twcb->flags);
+	if (ret && ret != -EHWPOISON && ret != -EOPNOTSUPP) {
+		pr_err("%#llx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
+				twcb->pfn, current->comm, task_pid_nr(current));
+		force_sig(SIGBUS);
+	}
 
-	estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
-	node_len = GHES_ESTATUS_NODE_LEN(cper_estatus_len(estatus));
-	gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node, node_len);
+	/* Free only after the last read of twcb->pfn above. */
+	gen_pool_free(ghes_estatus_pool, (unsigned long)twcb, sizeof(*twcb));
 }
 
 static bool ghes_do_memory_failure(u64 physical_addr, int flags)
 {
+	struct ghes_task_work *twcb;
 	unsigned long pfn;
 
 	if (!IS_ENABLED(CONFIG_ACPI_APEI_MEMORY_FAILURE))
@@ -499,6 +512,18 @@ static bool ghes_do_memory_failure(u64 physical_addr, int flags)
 		return false;
 	}
 
+	if (flags == MF_ACTION_REQUIRED && current->mm) {
+		twcb = (void *)gen_pool_alloc(ghes_estatus_pool, sizeof(*twcb));
+		if (!twcb)
+			return false;
+
+		twcb->pfn = pfn;
+		twcb->flags = flags;
+		init_task_work(&twcb->twork, memory_failure_cb);
+		task_work_add(current, &twcb->twork, TWA_RESUME);
+		return true;
+	}
+
 	memory_failure_queue(pfn, flags);
 	return true;
 }
@@ -842,7 +867,7 @@ int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd)
 }
 EXPORT_SYMBOL_NS_GPL(cxl_cper_kfifo_get, "CXL");
 
-static bool ghes_do_proc(struct ghes *ghes,
+static void ghes_do_proc(struct ghes *ghes,
 			 const struct acpi_hest_generic_status *estatus)
 {
 	int sev, sec_sev;
@@ -912,8 +937,6 @@ static bool ghes_do_proc(struct ghes *ghes,
 			current->comm, task_pid_nr(current));
 		force_sig(SIGBUS);
 	}
-
-	return queued;
 }
 
 static void __ghes_print_estatus(const char *pfx,
@@ -1219,9 +1242,7 @@ static void ghes_proc_in_irq(struct irq_work *irq_work)
 	struct ghes_estatus_node *estatus_node;
 	struct acpi_hest_generic *generic;
 	struct acpi_hest_generic_status *estatus;
-	bool task_work_pending;
 	u32 len, node_len;
-	int ret;
 
 	llnode = llist_del_all(&ghes_estatus_llist);
 	/*
@@ -1236,25 +1257,16 @@ static void ghes_proc_in_irq(struct irq_work *irq_work)
 		estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
 		len = cper_estatus_len(estatus);
 		node_len = GHES_ESTATUS_NODE_LEN(len);
-		task_work_pending = ghes_do_proc(estatus_node->ghes, estatus);
+
+		ghes_do_proc(estatus_node->ghes, estatus);
+
 		if (!ghes_estatus_cached(estatus)) {
 			generic = estatus_node->generic;
 			if (ghes_print_estatus(NULL, generic, estatus))
 				ghes_estatus_cache_add(generic, estatus);
 		}
-
-		if (task_work_pending && current->mm) {
-			estatus_node->task_work.func = ghes_kick_task_work;
-			estatus_node->task_work_cpu = smp_processor_id();
-			ret = task_work_add(current, &estatus_node->task_work,
-					    TWA_RESUME);
-			if (ret)
-				estatus_node->task_work.func = NULL;
-		}
-
-		if (!estatus_node->task_work.func)
-			gen_pool_free(ghes_estatus_pool,
-				      (unsigned long)estatus_node, node_len);
+		gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node,
+			      node_len);
 
 		llnode = next;
 	}
@@ -1315,7 +1327,6 @@ static int ghes_in_nmi_queue_one_entry(struct ghes *ghes,
 
 	estatus_node->ghes = ghes;
 	estatus_node->generic = ghes->generic;
-	estatus_node->task_work.func = NULL;
 	estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
 
 	if (__ghes_read_estatus(estatus, buf_paddr, fixmap_idx, len)) {
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index be1dd4c1a917..ebd21b05fe6e 100644
--- a/include/acpi/ghes.h
+++ b/include/acpi/ghes.h
@@ -35,9 +35,6 @@ struct ghes_estatus_node {
 	struct llist_node llnode;
 	struct acpi_hest_generic *generic;
 	struct ghes *ghes;
-
-	int task_work_cpu;
-	struct callback_head task_work;
 };
 
 struct ghes_estatus_cache {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0ef2ba0c667a..02dffff2508b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3896,7 +3896,6 @@ enum mf_flags {
 int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
 		      unsigned long count, int mf_flags);
 extern int memory_failure(unsigned long pfn, int flags);
-extern void memory_failure_queue_kick(int cpu);
 extern int unpoison_memory(unsigned long pfn);
 extern atomic_long_t num_poisoned_pages __read_mostly;
 extern int soft_offline_page(unsigned long pfn, int flags);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index b91a33fb6c69..5278132a2b6d 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -2503,19 +2503,6 @@ static void memory_failure_work_func(struct work_struct *work)
 	}
 }
 
-/*
- * Process memory_failure work queued on the specified CPU.
- * Used to avoid return-to-userspace racing with the memory_failure workqueue.
- */
-void memory_failure_queue_kick(int cpu)
-{
-	struct memory_failure_cpu *mf_cpu;
-
-	mf_cpu = &per_cpu(memory_failure_cpu, cpu);
-	cancel_work_sync(&mf_cpu->work);
-	memory_failure_work_func(&mf_cpu->work);
-}
-
 static int __init memory_failure_init(void)
 {
 	struct memory_failure_cpu *mf_cpu;
-- 
cgit v1.2.3


From dfef8d87a031ac1a46dde3de804e0fcf3c3a6afd Mon Sep 17 00:00:00 2001
From: Christian Eggers <ceggers@arri.de>
Date: Mon, 14 Jul 2025 22:27:43 +0200
Subject: Bluetooth: hci_core: fix typos in macros

The provided macro parameter is named 'dev' (rather than 'hdev', which
may be a variable on the stack where the macro is used).

Fixes: a9a830a676a9 ("Bluetooth: hci_event: Fix sending HCI_OP_READ_ENC_KEY_SIZE")
Fixes: 6126ffabba6b ("Bluetooth: Introduce HCI_CONN_FLAG_DEVICE_PRIVACY device flag")
Signed-off-by: Christian Eggers <ceggers@arri.de>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 0da011fc8146..052c91613bb9 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -1940,11 +1940,11 @@ void hci_conn_del_sysfs(struct hci_conn *conn);
 #define ll_privacy_capable(dev) ((dev)->le_features[0] & HCI_LE_LL_PRIVACY)
 
 #define privacy_mode_capable(dev) (ll_privacy_capable(dev) && \
-				   (hdev->commands[39] & 0x04))
+				   ((dev)->commands[39] & 0x04))
 
 #define read_key_size_capable(dev) \
 	((dev)->commands[20] & 0x10 && \
-	 !test_bit(HCI_QUIRK_BROKEN_READ_ENC_KEY_SIZE, &hdev->quirks))
+	 !test_bit(HCI_QUIRK_BROKEN_READ_ENC_KEY_SIZE, &(dev)->quirks))
 
 #define read_voice_setting_capable(dev) \
 	((dev)->commands[9] & 0x04 && \
-- 
cgit v1.2.3


From cdee6a4416b2a57c89082929cc60e2275bb32a3a Mon Sep 17 00:00:00 2001
From: Christian Eggers <ceggers@arri.de>
Date: Mon, 14 Jul 2025 22:27:44 +0200
Subject: Bluetooth: hci_core: add missing braces when using macro parameters

Macro parameters should always be enclosed in parentheses when they are
used in the macro body.

Fixes: 4fc9857ab8c6 ("Bluetooth: hci_sync: Add check simultaneous roles support")
Signed-off-by: Christian Eggers <ceggers@arri.de>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 052c91613bb9..367ca43f45d1 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -829,20 +829,20 @@ extern struct mutex hci_cb_list_lock;
 #define hci_dev_test_and_clear_flag(hdev, nr)  test_and_clear_bit((nr), (hdev)->dev_flags)
 #define hci_dev_test_and_change_flag(hdev, nr) test_and_change_bit((nr), (hdev)->dev_flags)
 
-#define hci_dev_clear_volatile_flags(hdev)			\
-	do {							\
-		hci_dev_clear_flag(hdev, HCI_LE_SCAN);		\
-		hci_dev_clear_flag(hdev, HCI_LE_ADV);		\
-		hci_dev_clear_flag(hdev, HCI_LL_RPA_RESOLUTION);\
-		hci_dev_clear_flag(hdev, HCI_PERIODIC_INQ);	\
-		hci_dev_clear_flag(hdev, HCI_QUALITY_REPORT);	\
+#define hci_dev_clear_volatile_flags(hdev)				\
+	do {								\
+		hci_dev_clear_flag((hdev), HCI_LE_SCAN);		\
+		hci_dev_clear_flag((hdev), HCI_LE_ADV);			\
+		hci_dev_clear_flag((hdev), HCI_LL_RPA_RESOLUTION);	\
+		hci_dev_clear_flag((hdev), HCI_PERIODIC_INQ);		\
+		hci_dev_clear_flag((hdev), HCI_QUALITY_REPORT);		\
 	} while (0)
 
 #define hci_dev_le_state_simultaneous(hdev) \
-	(!test_bit(HCI_QUIRK_BROKEN_LE_STATES, &hdev->quirks) && \
-	 (hdev->le_states[4] & 0x08) &&	/* Central */ \
-	 (hdev->le_states[4] & 0x40) &&	/* Peripheral */ \
-	 (hdev->le_states[3] & 0x10))	/* Simultaneous */
+	(!test_bit(HCI_QUIRK_BROKEN_LE_STATES, &(hdev)->quirks) && \
+	 ((hdev)->le_states[4] & 0x08) &&	/* Central */ \
+	 ((hdev)->le_states[4] & 0x40) &&	/* Peripheral */ \
+	 ((hdev)->le_states[3] & 0x10))		/* Simultaneous */
 
 /* ----- HCI interface to upper protocols ----- */
 int l2cap_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr);
-- 
cgit v1.2.3


From 6851a0c228fc040dce8e4c393004209e7372e0a3 Mon Sep 17 00:00:00 2001
From: Christian Eggers <ceggers@arri.de>
Date: Mon, 14 Jul 2025 22:27:45 +0200
Subject: Bluetooth: hci_dev: replace 'quirks' integer by 'quirk_flags' bitmap

The 'quirks' member already ran out of bits on some platforms some time
ago. Replace the integer member with a bitmap so that enough bits are
available in the future, and replace raw bit operations with accessor
macros.

Fixes: ff26b2dd6568 ("Bluetooth: Add quirk for broken READ_VOICE_SETTING")
Fixes: 127881334eaa ("Bluetooth: Add quirk for broken READ_PAGE_SCAN_TYPE")
Suggested-by: Pauli Virtanen <pav@iki.fi>
Tested-by: Ivan Pravdin <ipravdin.official@gmail.com>
Signed-off-by: Kiran K <kiran.k@intel.com>
Signed-off-by: Christian Eggers <ceggers@arri.de>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 drivers/bluetooth/bfusb.c        |  2 +-
 drivers/bluetooth/bpa10x.c       |  2 +-
 drivers/bluetooth/btbcm.c        |  8 ++---
 drivers/bluetooth/btintel.c      | 28 ++++++++--------
 drivers/bluetooth/btintel_pcie.c |  8 ++---
 drivers/bluetooth/btmtksdio.c    |  4 +--
 drivers/bluetooth/btmtkuart.c    |  2 +-
 drivers/bluetooth/btnxpuart.c    |  2 +-
 drivers/bluetooth/btqca.c        |  2 +-
 drivers/bluetooth/btqcomsmd.c    |  2 +-
 drivers/bluetooth/btrtl.c        | 10 +++---
 drivers/bluetooth/btsdio.c       |  2 +-
 drivers/bluetooth/btusb.c        | 70 ++++++++++++++++++++--------------------
 drivers/bluetooth/hci_aml.c      |  2 +-
 drivers/bluetooth/hci_bcm.c      |  4 +--
 drivers/bluetooth/hci_bcm4377.c  | 10 +++---
 drivers/bluetooth/hci_intel.c    |  2 +-
 drivers/bluetooth/hci_ldisc.c    |  6 ++--
 drivers/bluetooth/hci_ll.c       |  4 +--
 drivers/bluetooth/hci_nokia.c    |  2 +-
 drivers/bluetooth/hci_qca.c      | 14 ++++----
 drivers/bluetooth/hci_serdev.c   |  8 ++---
 drivers/bluetooth/hci_vhci.c     |  8 ++---
 drivers/bluetooth/virtio_bt.c    | 10 +++---
 include/net/bluetooth/hci.h      |  2 ++
 include/net/bluetooth/hci_core.h | 28 +++++++++-------
 net/bluetooth/hci_core.c         |  4 +--
 net/bluetooth/hci_debugfs.c      |  8 ++---
 net/bluetooth/hci_event.c        | 19 ++++++-----
 net/bluetooth/hci_sync.c         | 59 +++++++++++++++++----------------
 net/bluetooth/mgmt.c             | 38 +++++++++++-----------
 net/bluetooth/msft.c             |  2 +-
 32 files changed, 187 insertions(+), 185 deletions(-)

(limited to 'include')

diff --git a/drivers/bluetooth/bfusb.c b/drivers/bluetooth/bfusb.c
index 0d6ad50da046..8df310983bf6 100644
--- a/drivers/bluetooth/bfusb.c
+++ b/drivers/bluetooth/bfusb.c
@@ -670,7 +670,7 @@ static int bfusb_probe(struct usb_interface *intf, const struct usb_device_id *i
 	hdev->flush = bfusb_flush;
 	hdev->send  = bfusb_send_frame;
 
-	set_bit(HCI_QUIRK_BROKEN_LOCAL_COMMANDS, &hdev->quirks);
+	hci_set_quirk(hdev, HCI_QUIRK_BROKEN_LOCAL_COMMANDS);
 
 	if (hci_register_dev(hdev) < 0) {
 		BT_ERR("Can't register HCI device");
diff --git a/drivers/bluetooth/bpa10x.c b/drivers/bluetooth/bpa10x.c
index 1fa58c059cbf..8b43dfc755de 100644
--- a/drivers/bluetooth/bpa10x.c
+++ b/drivers/bluetooth/bpa10x.c
@@ -398,7 +398,7 @@ static int bpa10x_probe(struct usb_interface *intf,
 	hdev->send     = bpa10x_send_frame;
 	hdev->set_diag = bpa10x_set_diag;
 
-	set_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks);
+	hci_set_quirk(hdev, HCI_QUIRK_RESET_ON_CLOSE);
 
 	err = hci_register_dev(hdev);
 	if (err < 0) {
diff --git a/drivers/bluetooth/btbcm.c b/drivers/bluetooth/btbcm.c
index 0a60660fc8ce..3a3a56ddbb06 100644
--- a/drivers/bluetooth/btbcm.c
+++ b/drivers/bluetooth/btbcm.c
@@ -135,7 +135,7 @@ int btbcm_check_bdaddr(struct hci_dev *hdev)
 		if (btbcm_set_bdaddr_from_efi(hdev) != 0) {
 			bt_dev_info(hdev, "BCM: Using default device address (%pMR)",
 				    &bda->bdaddr);
-			set_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks);
+			hci_set_quirk(hdev, HCI_QUIRK_INVALID_BDADDR);
 		}
 	}
 
@@ -467,7 +467,7 @@ static int btbcm_print_controller_features(struct hci_dev *hdev)
 
 	/* Read DMI and disable broken Read LE Min/Max Tx Power */
 	if (dmi_first_match(disable_broken_read_transmit_power))
-		set_bit(HCI_QUIRK_BROKEN_READ_TRANSMIT_POWER, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_BROKEN_READ_TRANSMIT_POWER);
 
 	return 0;
 }
@@ -706,7 +706,7 @@ int btbcm_finalize(struct hci_dev *hdev, bool *fw_load_done, bool use_autobaud_m
 
 	btbcm_check_bdaddr(hdev);
 
-	set_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks);
+	hci_set_quirk(hdev, HCI_QUIRK_STRICT_DUPLICATE_FILTER);
 
 	return 0;
 }
@@ -769,7 +769,7 @@ int btbcm_setup_apple(struct hci_dev *hdev)
 		kfree_skb(skb);
 	}
 
-	set_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks);
+	hci_set_quirk(hdev, HCI_QUIRK_STRICT_DUPLICATE_FILTER);
 
 	return 0;
 }
diff --git a/drivers/bluetooth/btintel.c b/drivers/bluetooth/btintel.c
index 06575a0b9aee..06016ac3965c 100644
--- a/drivers/bluetooth/btintel.c
+++ b/drivers/bluetooth/btintel.c
@@ -88,7 +88,7 @@ int btintel_check_bdaddr(struct hci_dev *hdev)
 	if (!bacmp(&bda->bdaddr, BDADDR_INTEL)) {
 		bt_dev_err(hdev, "Found Intel default device address (%pMR)",
 			   &bda->bdaddr);
-		set_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_INVALID_BDADDR);
 	}
 
 	kfree_skb(skb);
@@ -2027,7 +2027,7 @@ static int btintel_download_fw(struct hci_dev *hdev,
 	 */
 	if (!bacmp(&params->otp_bdaddr, BDADDR_ANY)) {
 		bt_dev_info(hdev, "No device address configured");
-		set_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_INVALID_BDADDR);
 	}
 
 download:
@@ -2295,7 +2295,7 @@ static int btintel_prepare_fw_download_tlv(struct hci_dev *hdev,
 		 */
 		if (!bacmp(&ver->otp_bd_addr, BDADDR_ANY)) {
 			bt_dev_info(hdev, "No device address configured");
-			set_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks);
+			hci_set_quirk(hdev, HCI_QUIRK_INVALID_BDADDR);
 		}
 	}
 
@@ -3435,9 +3435,9 @@ static int btintel_setup_combined(struct hci_dev *hdev)
 	}
 
 	/* Apply the common HCI quirks for Intel device */
-	set_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks);
-	set_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks);
-	set_bit(HCI_QUIRK_NON_PERSISTENT_DIAG, &hdev->quirks);
+	hci_set_quirk(hdev, HCI_QUIRK_STRICT_DUPLICATE_FILTER);
+	hci_set_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY);
+	hci_set_quirk(hdev, HCI_QUIRK_NON_PERSISTENT_DIAG);
 
 	/* Set up the quality report callback for Intel devices */
 	hdev->set_quality_report = btintel_set_quality_report;
@@ -3475,8 +3475,8 @@ static int btintel_setup_combined(struct hci_dev *hdev)
 			 */
 			if (!btintel_test_flag(hdev,
 					       INTEL_ROM_LEGACY_NO_WBS_SUPPORT))
-				set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED,
-					&hdev->quirks);
+				hci_set_quirk(hdev,
+					      HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED);
 
 			err = btintel_legacy_rom_setup(hdev, &ver);
 			break;
@@ -3491,11 +3491,11 @@ static int btintel_setup_combined(struct hci_dev *hdev)
 			 *
 			 * All Legacy bootloader devices support WBS
 			 */
-			set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED,
-				&hdev->quirks);
+			hci_set_quirk(hdev,
+				      HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED);
 
 			/* These variants don't seem to support LE Coded PHY */
-			set_bit(HCI_QUIRK_BROKEN_LE_CODED, &hdev->quirks);
+			hci_set_quirk(hdev, HCI_QUIRK_BROKEN_LE_CODED);
 
 			/* Setup MSFT Extension support */
 			btintel_set_msft_opcode(hdev, ver.hw_variant);
@@ -3571,10 +3571,10 @@ static int btintel_setup_combined(struct hci_dev *hdev)
 		 *
 		 * All Legacy bootloader devices support WBS
 		 */
-		set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED);
 
 		/* These variants don't seem to support LE Coded PHY */
-		set_bit(HCI_QUIRK_BROKEN_LE_CODED, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_BROKEN_LE_CODED);
 
 		/* Setup MSFT Extension support */
 		btintel_set_msft_opcode(hdev, ver.hw_variant);
@@ -3600,7 +3600,7 @@ static int btintel_setup_combined(struct hci_dev *hdev)
 		 *
 		 * All TLV based devices support WBS
 		 */
-		set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED);
 
 		/* Setup MSFT Extension support */
 		btintel_set_msft_opcode(hdev,
diff --git a/drivers/bluetooth/btintel_pcie.c b/drivers/bluetooth/btintel_pcie.c
index e1c688dd2d45..f4e3fb54fe76 100644
--- a/drivers/bluetooth/btintel_pcie.c
+++ b/drivers/bluetooth/btintel_pcie.c
@@ -2081,9 +2081,9 @@ static int btintel_pcie_setup_internal(struct hci_dev *hdev)
 	}
 
 	/* Apply the common HCI quirks for Intel device */
-	set_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks);
-	set_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks);
-	set_bit(HCI_QUIRK_NON_PERSISTENT_DIAG, &hdev->quirks);
+	hci_set_quirk(hdev, HCI_QUIRK_STRICT_DUPLICATE_FILTER);
+	hci_set_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY);
+	hci_set_quirk(hdev, HCI_QUIRK_NON_PERSISTENT_DIAG);
 
 	/* Set up the quality report callback for Intel devices */
 	hdev->set_quality_report = btintel_set_quality_report;
@@ -2123,7 +2123,7 @@ static int btintel_pcie_setup_internal(struct hci_dev *hdev)
 		 *
 		 * All TLV based devices support WBS
 		 */
-		set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED);
 
 		/* Setup MSFT Extension support */
 		btintel_set_msft_opcode(hdev,
diff --git a/drivers/bluetooth/btmtksdio.c b/drivers/bluetooth/btmtksdio.c
index c16a3518b8ff..4fc673640bfc 100644
--- a/drivers/bluetooth/btmtksdio.c
+++ b/drivers/bluetooth/btmtksdio.c
@@ -1141,7 +1141,7 @@ static int btmtksdio_setup(struct hci_dev *hdev)
 		}
 
 		/* Enable WBS with mSBC codec */
-		set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED);
 
 		/* Enable GPIO reset mechanism */
 		if (bdev->reset) {
@@ -1384,7 +1384,7 @@ static int btmtksdio_probe(struct sdio_func *func,
 	SET_HCIDEV_DEV(hdev, &func->dev);
 
 	hdev->manufacturer = 70;
-	set_bit(HCI_QUIRK_NON_PERSISTENT_SETUP, &hdev->quirks);
+	hci_set_quirk(hdev, HCI_QUIRK_NON_PERSISTENT_SETUP);
 
 	sdio_set_drvdata(func, bdev);
 
diff --git a/drivers/bluetooth/btmtkuart.c b/drivers/bluetooth/btmtkuart.c
index c97e260fcb0c..51400a891f6e 100644
--- a/drivers/bluetooth/btmtkuart.c
+++ b/drivers/bluetooth/btmtkuart.c
@@ -872,7 +872,7 @@ static int btmtkuart_probe(struct serdev_device *serdev)
 	SET_HCIDEV_DEV(hdev, &serdev->dev);
 
 	hdev->manufacturer = 70;
-	set_bit(HCI_QUIRK_NON_PERSISTENT_SETUP, &hdev->quirks);
+	hci_set_quirk(hdev, HCI_QUIRK_NON_PERSISTENT_SETUP);
 
 	if (btmtkuart_is_standalone(bdev)) {
 		err = clk_prepare_enable(bdev->osc);
diff --git a/drivers/bluetooth/btnxpuart.c b/drivers/bluetooth/btnxpuart.c
index 1088db6056a4..24f9b52605a1 100644
--- a/drivers/bluetooth/btnxpuart.c
+++ b/drivers/bluetooth/btnxpuart.c
@@ -1807,7 +1807,7 @@ static int nxp_serdev_probe(struct serdev_device *serdev)
 				      "local-bd-address",
 				      (u8 *)&ba, sizeof(ba));
 	if (bacmp(&ba, BDADDR_ANY))
-		set_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_USE_BDADDR_PROPERTY);
 
 	if (hci_register_dev(hdev) < 0) {
 		dev_err(&serdev->dev, "Can't register HCI device\n");
diff --git a/drivers/bluetooth/btqca.c b/drivers/bluetooth/btqca.c
index edefb9dc76aa..7c958d6065be 100644
--- a/drivers/bluetooth/btqca.c
+++ b/drivers/bluetooth/btqca.c
@@ -739,7 +739,7 @@ static int qca_check_bdaddr(struct hci_dev *hdev, const struct qca_fw_config *co
 
 	bda = (struct hci_rp_read_bd_addr *)skb->data;
 	if (!bacmp(&bda->bdaddr, &config->bdaddr))
-		set_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_USE_BDADDR_PROPERTY);
 
 	kfree_skb(skb);
 
diff --git a/drivers/bluetooth/btqcomsmd.c b/drivers/bluetooth/btqcomsmd.c
index c0eb71d6ffd3..d2e13fcb6bab 100644
--- a/drivers/bluetooth/btqcomsmd.c
+++ b/drivers/bluetooth/btqcomsmd.c
@@ -117,7 +117,7 @@ static int btqcomsmd_setup(struct hci_dev *hdev)
 	/* Devices do not have persistent storage for BD address. Retrieve
 	 * it from the firmware node property.
 	 */
-	set_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks);
+	hci_set_quirk(hdev, HCI_QUIRK_USE_BDADDR_PROPERTY);
 
 	return 0;
 }
diff --git a/drivers/bluetooth/btrtl.c b/drivers/bluetooth/btrtl.c
index 7838c89e529e..4d182cf6e037 100644
--- a/drivers/bluetooth/btrtl.c
+++ b/drivers/bluetooth/btrtl.c
@@ -1287,7 +1287,7 @@ void btrtl_set_quirks(struct hci_dev *hdev, struct btrtl_device_info *btrtl_dev)
 	/* Enable controller to do both LE scan and BR/EDR inquiry
 	 * simultaneously.
 	 */
-	set_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks);
+	hci_set_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY);
 
 	/* Enable central-peripheral role (able to create new connections with
 	 * an existing connection in slave role).
@@ -1301,7 +1301,7 @@ void btrtl_set_quirks(struct hci_dev *hdev, struct btrtl_device_info *btrtl_dev)
 	case CHIP_ID_8851B:
 	case CHIP_ID_8922A:
 	case CHIP_ID_8852BT:
-		set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED);
 
 		/* RTL8852C needs to transmit mSBC data continuously without
 		 * the zero length of USB packets for the ALT 6 supported chips
@@ -1312,7 +1312,8 @@ void btrtl_set_quirks(struct hci_dev *hdev, struct btrtl_device_info *btrtl_dev)
 		if (btrtl_dev->project_id == CHIP_ID_8852A ||
 		    btrtl_dev->project_id == CHIP_ID_8852B ||
 		    btrtl_dev->project_id == CHIP_ID_8852C)
-			set_bit(HCI_QUIRK_USE_MSFT_EXT_ADDRESS_FILTER, &hdev->quirks);
+			hci_set_quirk(hdev,
+				      HCI_QUIRK_USE_MSFT_EXT_ADDRESS_FILTER);
 
 		hci_set_aosp_capable(hdev);
 		break;
@@ -1331,8 +1332,7 @@ void btrtl_set_quirks(struct hci_dev *hdev, struct btrtl_device_info *btrtl_dev)
 		 * but it doesn't support any features from page 2 -
 		 * it either responds with garbage or with error status
 		 */
-		set_bit(HCI_QUIRK_BROKEN_LOCAL_EXT_FEATURES_PAGE_2,
-			&hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_BROKEN_LOCAL_EXT_FEATURES_PAGE_2);
 		break;
 	default:
 		break;
diff --git a/drivers/bluetooth/btsdio.c b/drivers/bluetooth/btsdio.c
index a69feb08486a..8325655ce6aa 100644
--- a/drivers/bluetooth/btsdio.c
+++ b/drivers/bluetooth/btsdio.c
@@ -327,7 +327,7 @@ static int btsdio_probe(struct sdio_func *func,
 	hdev->send     = btsdio_send_frame;
 
 	if (func->vendor == 0x0104 && func->device == 0x00c5)
-		set_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_RESET_ON_CLOSE);
 
 	err = hci_register_dev(hdev);
 	if (err < 0) {
diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
index 9ab661d2d1e6..64509f5bfc99 100644
--- a/drivers/bluetooth/btusb.c
+++ b/drivers/bluetooth/btusb.c
@@ -2472,18 +2472,18 @@ static int btusb_setup_csr(struct hci_dev *hdev)
 		 * Probably will need to be expanded in the future;
 		 * without these the controller will lock up.
 		 */
-		set_bit(HCI_QUIRK_BROKEN_STORED_LINK_KEY, &hdev->quirks);
-		set_bit(HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, &hdev->quirks);
-		set_bit(HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL, &hdev->quirks);
-		set_bit(HCI_QUIRK_NO_SUSPEND_NOTIFIER, &hdev->quirks);
-		set_bit(HCI_QUIRK_BROKEN_READ_VOICE_SETTING, &hdev->quirks);
-		set_bit(HCI_QUIRK_BROKEN_READ_PAGE_SCAN_TYPE, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_BROKEN_STORED_LINK_KEY);
+		hci_set_quirk(hdev, HCI_QUIRK_BROKEN_ERR_DATA_REPORTING);
+		hci_set_quirk(hdev, HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL);
+		hci_set_quirk(hdev, HCI_QUIRK_NO_SUSPEND_NOTIFIER);
+		hci_set_quirk(hdev, HCI_QUIRK_BROKEN_READ_VOICE_SETTING);
+		hci_set_quirk(hdev, HCI_QUIRK_BROKEN_READ_PAGE_SCAN_TYPE);
 
 		/* Clear the reset quirk since this is not an actual
 		 * early Bluetooth 1.1 device from CSR.
 		 */
-		clear_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks);
-		clear_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks);
+		hci_clear_quirk(hdev, HCI_QUIRK_RESET_ON_CLOSE);
+		hci_clear_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY);
 
 		/*
 		 * Special workaround for these BT 4.0 chip clones, and potentially more:
@@ -3494,7 +3494,7 @@ static int btusb_setup_qca(struct hci_dev *hdev)
 	/* Mark HCI_OP_ENHANCED_SETUP_SYNC_CONN as broken as it doesn't seem to
 	 * work with the likes of HSP/HFP mSBC.
 	 */
-	set_bit(HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN, &hdev->quirks);
+	hci_set_quirk(hdev, HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN);
 
 	return 0;
 }
@@ -4008,10 +4008,10 @@ static int btusb_probe(struct usb_interface *intf,
 	}
 #endif
 	if (id->driver_info & BTUSB_CW6622)
-		set_bit(HCI_QUIRK_BROKEN_STORED_LINK_KEY, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_BROKEN_STORED_LINK_KEY);
 
 	if (id->driver_info & BTUSB_BCM2045)
-		set_bit(HCI_QUIRK_BROKEN_STORED_LINK_KEY, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_BROKEN_STORED_LINK_KEY);
 
 	if (id->driver_info & BTUSB_BCM92035)
 		hdev->setup = btusb_setup_bcm92035;
@@ -4068,8 +4068,8 @@ static int btusb_probe(struct usb_interface *intf,
 		hdev->reset = btmtk_reset_sync;
 		hdev->set_bdaddr = btmtk_set_bdaddr;
 		hdev->send = btusb_send_frame_mtk;
-		set_bit(HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN, &hdev->quirks);
-		set_bit(HCI_QUIRK_NON_PERSISTENT_SETUP, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN);
+		hci_set_quirk(hdev, HCI_QUIRK_NON_PERSISTENT_SETUP);
 		data->recv_acl = btmtk_usb_recv_acl;
 		data->suspend = btmtk_usb_suspend;
 		data->resume = btmtk_usb_resume;
@@ -4077,20 +4077,20 @@ static int btusb_probe(struct usb_interface *intf,
 	}
 
 	if (id->driver_info & BTUSB_SWAVE) {
-		set_bit(HCI_QUIRK_FIXUP_INQUIRY_MODE, &hdev->quirks);
-		set_bit(HCI_QUIRK_BROKEN_LOCAL_COMMANDS, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_FIXUP_INQUIRY_MODE);
+		hci_set_quirk(hdev, HCI_QUIRK_BROKEN_LOCAL_COMMANDS);
 	}
 
 	if (id->driver_info & BTUSB_INTEL_BOOT) {
 		hdev->manufacturer = 2;
-		set_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_RAW_DEVICE);
 	}
 
 	if (id->driver_info & BTUSB_ATH3012) {
 		data->setup_on_usb = btusb_setup_qca;
 		hdev->set_bdaddr = btusb_set_bdaddr_ath3012;
-		set_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks);
-		set_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY);
+		hci_set_quirk(hdev, HCI_QUIRK_STRICT_DUPLICATE_FILTER);
 	}
 
 	if (id->driver_info & BTUSB_QCA_ROME) {
@@ -4098,7 +4098,7 @@ static int btusb_probe(struct usb_interface *intf,
 		hdev->shutdown = btusb_shutdown_qca;
 		hdev->set_bdaddr = btusb_set_bdaddr_ath3012;
 		hdev->reset = btusb_qca_reset;
-		set_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY);
 		btusb_check_needs_reset_resume(intf);
 	}
 
@@ -4112,7 +4112,7 @@ static int btusb_probe(struct usb_interface *intf,
 		hdev->shutdown = btusb_shutdown_qca;
 		hdev->set_bdaddr = btusb_set_bdaddr_wcn6855;
 		hdev->reset = btusb_qca_reset;
-		set_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY);
 		hci_set_msft_opcode(hdev, 0xFD70);
 	}
 
@@ -4140,35 +4140,35 @@ static int btusb_probe(struct usb_interface *intf,
 
 	if (id->driver_info & BTUSB_ACTIONS_SEMI) {
 		/* Support is advertised, but not implemented */
-		set_bit(HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, &hdev->quirks);
-		set_bit(HCI_QUIRK_BROKEN_READ_TRANSMIT_POWER, &hdev->quirks);
-		set_bit(HCI_QUIRK_BROKEN_SET_RPA_TIMEOUT, &hdev->quirks);
-		set_bit(HCI_QUIRK_BROKEN_EXT_SCAN, &hdev->quirks);
-		set_bit(HCI_QUIRK_BROKEN_READ_ENC_KEY_SIZE, &hdev->quirks);
-		set_bit(HCI_QUIRK_BROKEN_EXT_CREATE_CONN, &hdev->quirks);
-		set_bit(HCI_QUIRK_BROKEN_WRITE_AUTH_PAYLOAD_TIMEOUT, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_BROKEN_ERR_DATA_REPORTING);
+		hci_set_quirk(hdev, HCI_QUIRK_BROKEN_READ_TRANSMIT_POWER);
+		hci_set_quirk(hdev, HCI_QUIRK_BROKEN_SET_RPA_TIMEOUT);
+		hci_set_quirk(hdev, HCI_QUIRK_BROKEN_EXT_SCAN);
+		hci_set_quirk(hdev, HCI_QUIRK_BROKEN_READ_ENC_KEY_SIZE);
+		hci_set_quirk(hdev, HCI_QUIRK_BROKEN_EXT_CREATE_CONN);
+		hci_set_quirk(hdev, HCI_QUIRK_BROKEN_WRITE_AUTH_PAYLOAD_TIMEOUT);
 	}
 
 	if (!reset)
-		set_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_RESET_ON_CLOSE);
 
 	if (force_scofix || id->driver_info & BTUSB_WRONG_SCO_MTU) {
 		if (!disable_scofix)
-			set_bit(HCI_QUIRK_FIXUP_BUFFER_SIZE, &hdev->quirks);
+			hci_set_quirk(hdev, HCI_QUIRK_FIXUP_BUFFER_SIZE);
 	}
 
 	if (id->driver_info & BTUSB_BROKEN_ISOC)
 		data->isoc = NULL;
 
 	if (id->driver_info & BTUSB_WIDEBAND_SPEECH)
-		set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED);
 
 	if (id->driver_info & BTUSB_INVALID_LE_STATES)
-		set_bit(HCI_QUIRK_BROKEN_LE_STATES, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_BROKEN_LE_STATES);
 
 	if (id->driver_info & BTUSB_DIGIANSWER) {
 		data->cmdreq_type = USB_TYPE_VENDOR;
-		set_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_RESET_ON_CLOSE);
 	}
 
 	if (id->driver_info & BTUSB_CSR) {
@@ -4177,10 +4177,10 @@ static int btusb_probe(struct usb_interface *intf,
 
 		/* Old firmware would otherwise execute USB reset */
 		if (bcdDevice < 0x117)
-			set_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks);
+			hci_set_quirk(hdev, HCI_QUIRK_RESET_ON_CLOSE);
 
 		/* This must be set first in case we disable it for fakes */
-		set_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY);
 
 		/* Fake CSR devices with broken commands */
 		if (le16_to_cpu(udev->descriptor.idVendor)  == 0x0a12 &&
@@ -4193,7 +4193,7 @@ static int btusb_probe(struct usb_interface *intf,
 
 		/* New sniffer firmware has crippled HCI interface */
 		if (le16_to_cpu(udev->descriptor.bcdDevice) > 0x997)
-			set_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks);
+			hci_set_quirk(hdev, HCI_QUIRK_RAW_DEVICE);
 	}
 
 	if (id->driver_info & BTUSB_INTEL_BOOT) {
diff --git a/drivers/bluetooth/hci_aml.c b/drivers/bluetooth/hci_aml.c
index 1394c575aa6d..707e90f80130 100644
--- a/drivers/bluetooth/hci_aml.c
+++ b/drivers/bluetooth/hci_aml.c
@@ -424,7 +424,7 @@ static int aml_check_bdaddr(struct hci_dev *hdev)
 
 	if (!bacmp(&paddr->bdaddr, AML_BDADDR_DEFAULT)) {
 		bt_dev_info(hdev, "amlbt using default bdaddr (%pM)", &paddr->bdaddr);
-		set_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_INVALID_BDADDR);
 	}
 
 exit:
diff --git a/drivers/bluetooth/hci_bcm.c b/drivers/bluetooth/hci_bcm.c
index 9684eb16059b..f96617b85d87 100644
--- a/drivers/bluetooth/hci_bcm.c
+++ b/drivers/bluetooth/hci_bcm.c
@@ -643,8 +643,8 @@ static int bcm_setup(struct hci_uart *hu)
 	 * Allow the bootloader to set a valid address through the
 	 * device tree.
 	 */
-	if (test_bit(HCI_QUIRK_INVALID_BDADDR, &hu->hdev->quirks))
-		set_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hu->hdev->quirks);
+	if (hci_test_quirk(hu->hdev, HCI_QUIRK_INVALID_BDADDR))
+		hci_set_quirk(hu->hdev, HCI_QUIRK_USE_BDADDR_PROPERTY);
 
 	if (!bcm_request_irq(bcm))
 		err = bcm_setup_sleep(hu);
diff --git a/drivers/bluetooth/hci_bcm4377.c b/drivers/bluetooth/hci_bcm4377.c
index 9bce53e49cfa..8a9aa33776b0 100644
--- a/drivers/bluetooth/hci_bcm4377.c
+++ b/drivers/bluetooth/hci_bcm4377.c
@@ -1435,7 +1435,7 @@ static int bcm4377_check_bdaddr(struct bcm4377_data *bcm4377)
 
 	bda = (struct hci_rp_read_bd_addr *)skb->data;
 	if (!bcm4377_is_valid_bdaddr(bcm4377, &bda->bdaddr))
-		set_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &bcm4377->hdev->quirks);
+		hci_set_quirk(bcm4377->hdev, HCI_QUIRK_USE_BDADDR_PROPERTY);
 
 	kfree_skb(skb);
 	return 0;
@@ -2389,13 +2389,13 @@ static int bcm4377_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	hdev->setup = bcm4377_hci_setup;
 
 	if (bcm4377->hw->broken_mws_transport_config)
-		set_bit(HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG);
 	if (bcm4377->hw->broken_ext_scan)
-		set_bit(HCI_QUIRK_BROKEN_EXT_SCAN, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_BROKEN_EXT_SCAN);
 	if (bcm4377->hw->broken_le_coded)
-		set_bit(HCI_QUIRK_BROKEN_LE_CODED, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_BROKEN_LE_CODED);
 	if (bcm4377->hw->broken_le_ext_adv_report_phy)
-		set_bit(HCI_QUIRK_FIXUP_LE_EXT_ADV_REPORT_PHY, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_FIXUP_LE_EXT_ADV_REPORT_PHY);
 
 	pci_set_drvdata(pdev, bcm4377);
 	hci_set_drvdata(hdev, bcm4377);
diff --git a/drivers/bluetooth/hci_intel.c b/drivers/bluetooth/hci_intel.c
index 811f33701f84..d22fbb7f9fc5 100644
--- a/drivers/bluetooth/hci_intel.c
+++ b/drivers/bluetooth/hci_intel.c
@@ -660,7 +660,7 @@ static int intel_setup(struct hci_uart *hu)
 	 */
 	if (!bacmp(&params.otp_bdaddr, BDADDR_ANY)) {
 		bt_dev_info(hdev, "No device address configured");
-		set_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_INVALID_BDADDR);
 	}
 
 	/* With this Intel bootloader only the hardware variant and device
diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c
index acba83156de9..d0adae3267b4 100644
--- a/drivers/bluetooth/hci_ldisc.c
+++ b/drivers/bluetooth/hci_ldisc.c
@@ -667,13 +667,13 @@ static int hci_uart_register_dev(struct hci_uart *hu)
 	SET_HCIDEV_DEV(hdev, hu->tty->dev);
 
 	if (test_bit(HCI_UART_RAW_DEVICE, &hu->hdev_flags))
-		set_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_RAW_DEVICE);
 
 	if (test_bit(HCI_UART_EXT_CONFIG, &hu->hdev_flags))
-		set_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_EXTERNAL_CONFIG);
 
 	if (!test_bit(HCI_UART_RESET_ON_INIT, &hu->hdev_flags))
-		set_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_RESET_ON_CLOSE);
 
 	/* Only call open() for the protocol after hdev is fully initialized as
 	 * open() (or a timer/workqueue it starts) may attempt to reference it.
diff --git a/drivers/bluetooth/hci_ll.c b/drivers/bluetooth/hci_ll.c
index e19e9bd49555..7044c86325ce 100644
--- a/drivers/bluetooth/hci_ll.c
+++ b/drivers/bluetooth/hci_ll.c
@@ -649,11 +649,11 @@ static int ll_setup(struct hci_uart *hu)
 		/* This means that there was an error getting the BD address
 		 * during probe, so mark the device as having a bad address.
 		 */
-		set_bit(HCI_QUIRK_INVALID_BDADDR, &hu->hdev->quirks);
+		hci_set_quirk(hu->hdev, HCI_QUIRK_INVALID_BDADDR);
 	} else if (bacmp(&lldev->bdaddr, BDADDR_ANY)) {
 		err = ll_set_bdaddr(hu->hdev, &lldev->bdaddr);
 		if (err)
-			set_bit(HCI_QUIRK_INVALID_BDADDR, &hu->hdev->quirks);
+			hci_set_quirk(hu->hdev, HCI_QUIRK_INVALID_BDADDR);
 	}
 
 	/* Operational speed if any */
diff --git a/drivers/bluetooth/hci_nokia.c b/drivers/bluetooth/hci_nokia.c
index 9fc10a16fd96..cd7575c20f65 100644
--- a/drivers/bluetooth/hci_nokia.c
+++ b/drivers/bluetooth/hci_nokia.c
@@ -439,7 +439,7 @@ static int nokia_setup(struct hci_uart *hu)
 
 	if (btdev->man_id == NOKIA_ID_BCM2048) {
 		hu->hdev->set_bdaddr = btbcm_set_bdaddr;
-		set_bit(HCI_QUIRK_INVALID_BDADDR, &hu->hdev->quirks);
+		hci_set_quirk(hu->hdev, HCI_QUIRK_INVALID_BDADDR);
 		dev_dbg(dev, "bcm2048 has invalid bluetooth address!");
 	}
 
diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c
index 3ec0be496820..33c43503714b 100644
--- a/drivers/bluetooth/hci_qca.c
+++ b/drivers/bluetooth/hci_qca.c
@@ -1892,7 +1892,7 @@ static int qca_setup(struct hci_uart *hu)
 	/* Enable controller to do both LE scan and BR/EDR inquiry
 	 * simultaneously.
 	 */
-	set_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks);
+	hci_set_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY);
 
 	switch (soc_type) {
 	case QCA_QCA2066:
@@ -1944,7 +1944,7 @@ retry:
 	case QCA_WCN7850:
 		qcadev = serdev_device_get_drvdata(hu->serdev);
 		if (qcadev->bdaddr_property_broken)
-			set_bit(HCI_QUIRK_BDADDR_PROPERTY_BROKEN, &hdev->quirks);
+			hci_set_quirk(hdev, HCI_QUIRK_BDADDR_PROPERTY_BROKEN);
 
 		hci_set_aosp_capable(hdev);
 
@@ -2487,7 +2487,7 @@ static int qca_serdev_probe(struct serdev_device *serdev)
 	hdev = qcadev->serdev_hu.hdev;
 
 	if (power_ctrl_enabled) {
-		set_bit(HCI_QUIRK_NON_PERSISTENT_SETUP, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_NON_PERSISTENT_SETUP);
 		hdev->shutdown = qca_power_off;
 	}
 
@@ -2496,11 +2496,11 @@ static int qca_serdev_probe(struct serdev_device *serdev)
 		 * be queried via hci. Same with the valid le states quirk.
 		 */
 		if (data->capabilities & QCA_CAP_WIDEBAND_SPEECH)
-			set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED,
-				&hdev->quirks);
+			hci_set_quirk(hdev,
+				      HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED);
 
 		if (!(data->capabilities & QCA_CAP_VALID_LE_STATES))
-			set_bit(HCI_QUIRK_BROKEN_LE_STATES, &hdev->quirks);
+			hci_set_quirk(hdev, HCI_QUIRK_BROKEN_LE_STATES);
 	}
 
 	return 0;
@@ -2550,7 +2550,7 @@ static void qca_serdev_shutdown(struct device *dev)
 		 * invoked and the SOC is already in the initial state, so
 		 * don't also need to send the VSC.
 		 */
-		if (test_bit(HCI_QUIRK_NON_PERSISTENT_SETUP, &hdev->quirks) ||
+		if (hci_test_quirk(hdev, HCI_QUIRK_NON_PERSISTENT_SETUP) ||
 		    hci_dev_test_flag(hdev, HCI_SETUP))
 			return;
 
diff --git a/drivers/bluetooth/hci_serdev.c b/drivers/bluetooth/hci_serdev.c
index 89a22e9b3253..593d9cefbbf9 100644
--- a/drivers/bluetooth/hci_serdev.c
+++ b/drivers/bluetooth/hci_serdev.c
@@ -152,7 +152,7 @@ static int hci_uart_close(struct hci_dev *hdev)
 	 * BT SOC is completely powered OFF during BT OFF, holding port
 	 * open may drain the battery.
 	 */
-	if (test_bit(HCI_QUIRK_NON_PERSISTENT_SETUP, &hdev->quirks)) {
+	if (hci_test_quirk(hdev, HCI_QUIRK_NON_PERSISTENT_SETUP)) {
 		clear_bit(HCI_UART_PROTO_READY, &hu->flags);
 		serdev_device_close(hu->serdev);
 	}
@@ -358,13 +358,13 @@ int hci_uart_register_device_priv(struct hci_uart *hu,
 	SET_HCIDEV_DEV(hdev, &hu->serdev->dev);
 
 	if (test_bit(HCI_UART_NO_SUSPEND_NOTIFIER, &hu->flags))
-		set_bit(HCI_QUIRK_NO_SUSPEND_NOTIFIER, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_NO_SUSPEND_NOTIFIER);
 
 	if (test_bit(HCI_UART_RAW_DEVICE, &hu->hdev_flags))
-		set_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_RAW_DEVICE);
 
 	if (test_bit(HCI_UART_EXT_CONFIG, &hu->hdev_flags))
-		set_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_EXTERNAL_CONFIG);
 
 	if (test_bit(HCI_UART_INIT_PENDING, &hu->hdev_flags))
 		return 0;
diff --git a/drivers/bluetooth/hci_vhci.c b/drivers/bluetooth/hci_vhci.c
index 59f4d7bdffdc..f7d8c3c00655 100644
--- a/drivers/bluetooth/hci_vhci.c
+++ b/drivers/bluetooth/hci_vhci.c
@@ -415,16 +415,16 @@ static int __vhci_create_device(struct vhci_data *data, __u8 opcode)
 	hdev->get_codec_config_data = vhci_get_codec_config_data;
 	hdev->wakeup = vhci_wakeup;
 	hdev->setup = vhci_setup;
-	set_bit(HCI_QUIRK_NON_PERSISTENT_SETUP, &hdev->quirks);
-	set_bit(HCI_QUIRK_SYNC_FLOWCTL_SUPPORTED, &hdev->quirks);
+	hci_set_quirk(hdev, HCI_QUIRK_NON_PERSISTENT_SETUP);
+	hci_set_quirk(hdev, HCI_QUIRK_SYNC_FLOWCTL_SUPPORTED);
 
 	/* bit 6 is for external configuration */
 	if (opcode & 0x40)
-		set_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_EXTERNAL_CONFIG);
 
 	/* bit 7 is for raw device */
 	if (opcode & 0x80)
-		set_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks);
+		hci_set_quirk(hdev, HCI_QUIRK_RAW_DEVICE);
 
 	if (hci_register_dev(hdev) < 0) {
 		BT_ERR("Can't register HCI device");
diff --git a/drivers/bluetooth/virtio_bt.c b/drivers/bluetooth/virtio_bt.c
index 756f292df9e8..6f1a37e85c6a 100644
--- a/drivers/bluetooth/virtio_bt.c
+++ b/drivers/bluetooth/virtio_bt.c
@@ -327,17 +327,17 @@ static int virtbt_probe(struct virtio_device *vdev)
 			hdev->setup = virtbt_setup_intel;
 			hdev->shutdown = virtbt_shutdown_generic;
 			hdev->set_bdaddr = virtbt_set_bdaddr_intel;
-			set_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks);
-			set_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks);
-			set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks);
+			hci_set_quirk(hdev, HCI_QUIRK_STRICT_DUPLICATE_FILTER);
+			hci_set_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY);
+			hci_set_quirk(hdev, HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED);
 			break;
 
 		case VIRTIO_BT_CONFIG_VENDOR_REALTEK:
 			hdev->manufacturer = 93;
 			hdev->setup = virtbt_setup_realtek;
 			hdev->shutdown = virtbt_shutdown_generic;
-			set_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks);
-			set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks);
+			hci_set_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY);
+			hci_set_quirk(hdev, HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED);
 			break;
 		}
 	}
diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index 82cbd54443ac..c79901f2dc2a 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -377,6 +377,8 @@ enum {
 	 * This quirk must be set before hci_register_dev is called.
 	 */
 	HCI_QUIRK_BROKEN_READ_PAGE_SCAN_TYPE,
+
+	__HCI_NUM_QUIRKS,
 };
 
 /* HCI device flags */
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 367ca43f45d1..f79f59e67114 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -464,7 +464,7 @@ struct hci_dev {
 
 	unsigned int	auto_accept_delay;
 
-	unsigned long	quirks;
+	DECLARE_BITMAP(quirk_flags, __HCI_NUM_QUIRKS);
 
 	atomic_t	cmd_cnt;
 	unsigned int	acl_cnt;
@@ -656,6 +656,10 @@ struct hci_dev {
 	u8 (*classify_pkt_type)(struct hci_dev *hdev, struct sk_buff *skb);
 };
 
+#define hci_set_quirk(hdev, nr) set_bit((nr), (hdev)->quirk_flags)
+#define hci_clear_quirk(hdev, nr) clear_bit((nr), (hdev)->quirk_flags)
+#define hci_test_quirk(hdev, nr) test_bit((nr), (hdev)->quirk_flags)
+
 #define HCI_PHY_HANDLE(handle)	(handle & 0xff)
 
 enum conn_reasons {
@@ -839,7 +843,7 @@ extern struct mutex hci_cb_list_lock;
 	} while (0)
 
 #define hci_dev_le_state_simultaneous(hdev) \
-	(!test_bit(HCI_QUIRK_BROKEN_LE_STATES, &(hdev)->quirks) && \
+	(!hci_test_quirk((hdev), HCI_QUIRK_BROKEN_LE_STATES) && \
 	 ((hdev)->le_states[4] & 0x08) &&	/* Central */ \
 	 ((hdev)->le_states[4] & 0x40) &&	/* Peripheral */ \
 	 ((hdev)->le_states[3] & 0x10))		/* Simultaneous */
@@ -1931,8 +1935,8 @@ void hci_conn_del_sysfs(struct hci_conn *conn);
 		      ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_2M))
 
 #define le_coded_capable(dev) (((dev)->le_features[1] & HCI_LE_PHY_CODED) && \
-			       !test_bit(HCI_QUIRK_BROKEN_LE_CODED, \
-					 &(dev)->quirks))
+			       !hci_test_quirk((dev), \
+					       HCI_QUIRK_BROKEN_LE_CODED))
 
 #define scan_coded(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_CODED) || \
 			 ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_CODED))
@@ -1944,27 +1948,27 @@ void hci_conn_del_sysfs(struct hci_conn *conn);
 
 #define read_key_size_capable(dev) \
 	((dev)->commands[20] & 0x10 && \
-	 !test_bit(HCI_QUIRK_BROKEN_READ_ENC_KEY_SIZE, &(dev)->quirks))
+	 !hci_test_quirk((dev), HCI_QUIRK_BROKEN_READ_ENC_KEY_SIZE))
 
 #define read_voice_setting_capable(dev) \
 	((dev)->commands[9] & 0x04 && \
-	 !test_bit(HCI_QUIRK_BROKEN_READ_VOICE_SETTING, &(dev)->quirks))
+	 !hci_test_quirk((dev), HCI_QUIRK_BROKEN_READ_VOICE_SETTING))
 
 /* Use enhanced synchronous connection if command is supported and its quirk
  * has not been set.
  */
 #define enhanced_sync_conn_capable(dev) \
 	(((dev)->commands[29] & 0x08) && \
-	 !test_bit(HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN, &(dev)->quirks))
+	 !hci_test_quirk((dev), HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN))
 
 /* Use ext scanning if set ext scan param and ext scan enable is supported */
 #define use_ext_scan(dev) (((dev)->commands[37] & 0x20) && \
 			   ((dev)->commands[37] & 0x40) && \
-			   !test_bit(HCI_QUIRK_BROKEN_EXT_SCAN, &(dev)->quirks))
+			   !hci_test_quirk((dev), HCI_QUIRK_BROKEN_EXT_SCAN))
 
 /* Use ext create connection if command is supported */
 #define use_ext_conn(dev) (((dev)->commands[37] & 0x80) && \
-	!test_bit(HCI_QUIRK_BROKEN_EXT_CREATE_CONN, &(dev)->quirks))
+	!hci_test_quirk((dev), HCI_QUIRK_BROKEN_EXT_CREATE_CONN))
 /* Extended advertising support */
 #define ext_adv_capable(dev) (((dev)->le_features[1] & HCI_LE_EXT_ADV))
 
@@ -1979,8 +1983,8 @@ void hci_conn_del_sysfs(struct hci_conn *conn);
  */
 #define use_enhanced_conn_complete(dev) ((ll_privacy_capable(dev) || \
 					 ext_adv_capable(dev)) && \
-					 !test_bit(HCI_QUIRK_BROKEN_EXT_CREATE_CONN, \
-						 &(dev)->quirks))
+					 !hci_test_quirk((dev), \
+							 HCI_QUIRK_BROKEN_EXT_CREATE_CONN))
 
 /* Periodic advertising support */
 #define per_adv_capable(dev) (((dev)->le_features[1] & HCI_LE_PERIODIC_ADV))
@@ -1997,7 +2001,7 @@ void hci_conn_del_sysfs(struct hci_conn *conn);
 #define sync_recv_capable(dev) ((dev)->le_features[3] & HCI_LE_ISO_SYNC_RECEIVER)
 
 #define mws_transport_config_capable(dev) (((dev)->commands[30] & 0x08) && \
-	(!test_bit(HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG, &(dev)->quirks)))
+	(!hci_test_quirk((dev), HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG)))
 
 /* ----- HCI protocols ----- */
 #define HCI_PROTO_DEFER             0x01
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 14d7221b8ac0..441cb1700f99 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -2654,7 +2654,7 @@ int hci_register_dev(struct hci_dev *hdev)
 	/* Devices that are marked for raw-only usage are unconfigured
 	 * and should not be included in normal operation.
 	 */
-	if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks))
+	if (hci_test_quirk(hdev, HCI_QUIRK_RAW_DEVICE))
 		hci_dev_set_flag(hdev, HCI_UNCONFIGURED);
 
 	/* Mark Remote Wakeup connection flag as supported if driver has wakeup
@@ -2784,7 +2784,7 @@ int hci_register_suspend_notifier(struct hci_dev *hdev)
 	int ret = 0;
 
 	if (!hdev->suspend_notifier.notifier_call &&
-	    !test_bit(HCI_QUIRK_NO_SUSPEND_NOTIFIER, &hdev->quirks)) {
+	    !hci_test_quirk(hdev, HCI_QUIRK_NO_SUSPEND_NOTIFIER)) {
 		hdev->suspend_notifier.notifier_call = hci_suspend_notifier;
 		ret = register_pm_notifier(&hdev->suspend_notifier);
 	}
diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c
index f625074d1f00..99e2e9fc70e8 100644
--- a/net/bluetooth/hci_debugfs.c
+++ b/net/bluetooth/hci_debugfs.c
@@ -38,7 +38,7 @@ static ssize_t __name ## _read(struct file *file,			      \
 	struct hci_dev *hdev = file->private_data;			      \
 	char buf[3];							      \
 									      \
-	buf[0] = test_bit(__quirk, &hdev->quirks) ? 'Y' : 'N';		      \
+	buf[0] = test_bit(__quirk, hdev->quirk_flags) ? 'Y' : 'N';	      \
 	buf[1] = '\n';							      \
 	buf[2] = '\0';							      \
 	return simple_read_from_buffer(user_buf, count, ppos, buf, 2);	      \
@@ -59,10 +59,10 @@ static ssize_t __name ## _write(struct file *file,			      \
 	if (err)							      \
 		return err;						      \
 									      \
-	if (enable == test_bit(__quirk, &hdev->quirks))			      \
+	if (enable == test_bit(__quirk, hdev->quirk_flags))		      \
 		return -EALREADY;					      \
 									      \
-	change_bit(__quirk, &hdev->quirks);				      \
+	change_bit(__quirk, hdev->quirk_flags);				      \
 									      \
 	return count;							      \
 }									      \
@@ -1356,7 +1356,7 @@ static ssize_t vendor_diag_write(struct file *file, const char __user *user_buf,
 	 * for the vendor callback. Instead just store the desired value and
 	 * the setting will be programmed when the controller gets powered on.
 	 */
-	if (test_bit(HCI_QUIRK_NON_PERSISTENT_DIAG, &hdev->quirks) &&
+	if (hci_test_quirk(hdev, HCI_QUIRK_NON_PERSISTENT_DIAG) &&
 	    (!test_bit(HCI_RUNNING, &hdev->flags) ||
 	     hci_dev_test_flag(hdev, HCI_USER_CHANNEL)))
 		goto done;
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 992131f88a45..cf4b30ac9e0e 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -908,8 +908,8 @@ static u8 hci_cc_read_local_ext_features(struct hci_dev *hdev, void *data,
 		return rp->status;
 
 	if (hdev->max_page < rp->max_page) {
-		if (test_bit(HCI_QUIRK_BROKEN_LOCAL_EXT_FEATURES_PAGE_2,
-			     &hdev->quirks))
+		if (hci_test_quirk(hdev,
+				   HCI_QUIRK_BROKEN_LOCAL_EXT_FEATURES_PAGE_2))
 			bt_dev_warn(hdev, "broken local ext features page 2");
 		else
 			hdev->max_page = rp->max_page;
@@ -936,7 +936,7 @@ static u8 hci_cc_read_buffer_size(struct hci_dev *hdev, void *data,
 	hdev->acl_pkts = __le16_to_cpu(rp->acl_max_pkt);
 	hdev->sco_pkts = __le16_to_cpu(rp->sco_max_pkt);
 
-	if (test_bit(HCI_QUIRK_FIXUP_BUFFER_SIZE, &hdev->quirks)) {
+	if (hci_test_quirk(hdev, HCI_QUIRK_FIXUP_BUFFER_SIZE)) {
 		hdev->sco_mtu  = 64;
 		hdev->sco_pkts = 8;
 	}
@@ -2971,7 +2971,7 @@ static void hci_inquiry_complete_evt(struct hci_dev *hdev, void *data,
 		 * state to indicate completion.
 		 */
 		if (!hci_dev_test_flag(hdev, HCI_LE_SCAN) ||
-		    !test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks))
+		    !hci_test_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY))
 			hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
 		goto unlock;
 	}
@@ -2990,7 +2990,7 @@ static void hci_inquiry_complete_evt(struct hci_dev *hdev, void *data,
 		 * state to indicate completion.
 		 */
 		if (!hci_dev_test_flag(hdev, HCI_LE_SCAN) ||
-		    !test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks))
+		    !hci_test_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY))
 			hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
 	}
 
@@ -3614,8 +3614,7 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, void *data,
 	/* We skip the WRITE_AUTH_PAYLOAD_TIMEOUT for ATS2851 based controllers
 	 * to avoid unexpected SMP command errors when pairing.
 	 */
-	if (test_bit(HCI_QUIRK_BROKEN_WRITE_AUTH_PAYLOAD_TIMEOUT,
-		     &hdev->quirks))
+	if (hci_test_quirk(hdev, HCI_QUIRK_BROKEN_WRITE_AUTH_PAYLOAD_TIMEOUT))
 		goto notify;
 
 	/* Set the default Authenticated Payload Timeout after
@@ -5914,7 +5913,7 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev,
 	 * while we have an existing one in peripheral role.
 	 */
 	if (hdev->conn_hash.le_num_peripheral > 0 &&
-	    (test_bit(HCI_QUIRK_BROKEN_LE_STATES, &hdev->quirks) ||
+	    (hci_test_quirk(hdev, HCI_QUIRK_BROKEN_LE_STATES) ||
 	     !(hdev->le_states[3] & 0x10)))
 		return NULL;
 
@@ -6310,8 +6309,8 @@ static void hci_le_ext_adv_report_evt(struct hci_dev *hdev, void *data,
 		evt_type = __le16_to_cpu(info->type) & LE_EXT_ADV_EVT_TYPE_MASK;
 		legacy_evt_type = ext_evt_type_to_legacy(hdev, evt_type);
 
-		if (test_bit(HCI_QUIRK_FIXUP_LE_EXT_ADV_REPORT_PHY,
-			     &hdev->quirks)) {
+		if (hci_test_quirk(hdev,
+				   HCI_QUIRK_FIXUP_LE_EXT_ADV_REPORT_PHY)) {
 			info->primary_phy &= 0x1f;
 			info->secondary_phy &= 0x1f;
 		}
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index acbf06aa3dd7..7938c004071c 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -393,7 +393,7 @@ static void le_scan_disable(struct work_struct *work)
 	if (hdev->discovery.type != DISCOV_TYPE_INTERLEAVED)
 		goto _return;
 
-	if (test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks)) {
+	if (hci_test_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY)) {
 		if (!test_bit(HCI_INQUIRY, &hdev->flags) &&
 		    hdev->discovery.state != DISCOVERY_RESOLVING)
 			goto discov_stopped;
@@ -3587,7 +3587,7 @@ static void hci_dev_get_bd_addr_from_property(struct hci_dev *hdev)
 	if (ret < 0 || !bacmp(&ba, BDADDR_ANY))
 		return;
 
-	if (test_bit(HCI_QUIRK_BDADDR_PROPERTY_BROKEN, &hdev->quirks))
+	if (hci_test_quirk(hdev, HCI_QUIRK_BDADDR_PROPERTY_BROKEN))
 		baswap(&hdev->public_addr, &ba);
 	else
 		bacpy(&hdev->public_addr, &ba);
@@ -3662,7 +3662,7 @@ static int hci_init0_sync(struct hci_dev *hdev)
 	bt_dev_dbg(hdev, "");
 
 	/* Reset */
-	if (!test_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks)) {
+	if (!hci_test_quirk(hdev, HCI_QUIRK_RESET_ON_CLOSE)) {
 		err = hci_reset_sync(hdev);
 		if (err)
 			return err;
@@ -3675,7 +3675,7 @@ static int hci_unconf_init_sync(struct hci_dev *hdev)
 {
 	int err;
 
-	if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks))
+	if (hci_test_quirk(hdev, HCI_QUIRK_RAW_DEVICE))
 		return 0;
 
 	err = hci_init0_sync(hdev);
@@ -3718,7 +3718,7 @@ static int hci_read_local_cmds_sync(struct hci_dev *hdev)
 	 * supported commands.
 	 */
 	if (hdev->hci_ver > BLUETOOTH_VER_1_1 &&
-	    !test_bit(HCI_QUIRK_BROKEN_LOCAL_COMMANDS, &hdev->quirks))
+	    !hci_test_quirk(hdev, HCI_QUIRK_BROKEN_LOCAL_COMMANDS))
 		return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_COMMANDS,
 					     0, NULL, HCI_CMD_TIMEOUT);
 
@@ -3732,7 +3732,7 @@ static int hci_init1_sync(struct hci_dev *hdev)
 	bt_dev_dbg(hdev, "");
 
 	/* Reset */
-	if (!test_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks)) {
+	if (!hci_test_quirk(hdev, HCI_QUIRK_RESET_ON_CLOSE)) {
 		err = hci_reset_sync(hdev);
 		if (err)
 			return err;
@@ -3795,7 +3795,7 @@ static int hci_set_event_filter_sync(struct hci_dev *hdev, u8 flt_type,
 	if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
 		return 0;
 
-	if (test_bit(HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL, &hdev->quirks))
+	if (hci_test_quirk(hdev, HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL))
 		return 0;
 
 	memset(&cp, 0, sizeof(cp));
@@ -3822,7 +3822,7 @@ static int hci_clear_event_filter_sync(struct hci_dev *hdev)
 	 * a hci_set_event_filter_sync() call succeeds, but we do
 	 * the check both for parity and as a future reminder.
 	 */
-	if (test_bit(HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL, &hdev->quirks))
+	if (hci_test_quirk(hdev, HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL))
 		return 0;
 
 	return hci_set_event_filter_sync(hdev, HCI_FLT_CLEAR_ALL, 0x00,
@@ -3846,7 +3846,7 @@ static int hci_write_sync_flowctl_sync(struct hci_dev *hdev)
 
 	/* Check if the controller supports SCO and HCI_OP_WRITE_SYNC_FLOWCTL */
 	if (!lmp_sco_capable(hdev) || !(hdev->commands[10] & BIT(4)) ||
-	    !test_bit(HCI_QUIRK_SYNC_FLOWCTL_SUPPORTED, &hdev->quirks))
+	    !hci_test_quirk(hdev, HCI_QUIRK_SYNC_FLOWCTL_SUPPORTED))
 		return 0;
 
 	memset(&cp, 0, sizeof(cp));
@@ -3921,7 +3921,7 @@ static int hci_write_inquiry_mode_sync(struct hci_dev *hdev)
 	u8 mode;
 
 	if (!lmp_inq_rssi_capable(hdev) &&
-	    !test_bit(HCI_QUIRK_FIXUP_INQUIRY_MODE, &hdev->quirks))
+	    !hci_test_quirk(hdev, HCI_QUIRK_FIXUP_INQUIRY_MODE))
 		return 0;
 
 	/* If Extended Inquiry Result events are supported, then
@@ -4111,7 +4111,7 @@ static int hci_set_event_mask_sync(struct hci_dev *hdev)
 	}
 
 	if (lmp_inq_rssi_capable(hdev) ||
-	    test_bit(HCI_QUIRK_FIXUP_INQUIRY_MODE, &hdev->quirks))
+	    hci_test_quirk(hdev, HCI_QUIRK_FIXUP_INQUIRY_MODE))
 		events[4] |= 0x02; /* Inquiry Result with RSSI */
 
 	if (lmp_ext_feat_capable(hdev))
@@ -4163,7 +4163,7 @@ static int hci_read_stored_link_key_sync(struct hci_dev *hdev)
 	struct hci_cp_read_stored_link_key cp;
 
 	if (!(hdev->commands[6] & 0x20) ||
-	    test_bit(HCI_QUIRK_BROKEN_STORED_LINK_KEY, &hdev->quirks))
+	    hci_test_quirk(hdev, HCI_QUIRK_BROKEN_STORED_LINK_KEY))
 		return 0;
 
 	memset(&cp, 0, sizeof(cp));
@@ -4212,7 +4212,7 @@ static int hci_read_def_err_data_reporting_sync(struct hci_dev *hdev)
 {
 	if (!(hdev->commands[18] & 0x04) ||
 	    !(hdev->features[0][6] & LMP_ERR_DATA_REPORTING) ||
-	    test_bit(HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, &hdev->quirks))
+	    hci_test_quirk(hdev, HCI_QUIRK_BROKEN_ERR_DATA_REPORTING))
 		return 0;
 
 	return __hci_cmd_sync_status(hdev, HCI_OP_READ_DEF_ERR_DATA_REPORTING,
@@ -4226,7 +4226,7 @@ static int hci_read_page_scan_type_sync(struct hci_dev *hdev)
 	 * this command in the bit mask of supported commands.
 	 */
 	if (!(hdev->commands[13] & 0x01) ||
-	    test_bit(HCI_QUIRK_BROKEN_READ_PAGE_SCAN_TYPE, &hdev->quirks))
+	    hci_test_quirk(hdev, HCI_QUIRK_BROKEN_READ_PAGE_SCAN_TYPE))
 		return 0;
 
 	return __hci_cmd_sync_status(hdev, HCI_OP_READ_PAGE_SCAN_TYPE,
@@ -4421,7 +4421,7 @@ static int hci_le_read_adv_tx_power_sync(struct hci_dev *hdev)
 static int hci_le_read_tx_power_sync(struct hci_dev *hdev)
 {
 	if (!(hdev->commands[38] & 0x80) ||
-	    test_bit(HCI_QUIRK_BROKEN_READ_TRANSMIT_POWER, &hdev->quirks))
+	    hci_test_quirk(hdev, HCI_QUIRK_BROKEN_READ_TRANSMIT_POWER))
 		return 0;
 
 	return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_TRANSMIT_POWER,
@@ -4464,7 +4464,7 @@ static int hci_le_set_rpa_timeout_sync(struct hci_dev *hdev)
 	__le16 timeout = cpu_to_le16(hdev->rpa_timeout);
 
 	if (!(hdev->commands[35] & 0x04) ||
-	    test_bit(HCI_QUIRK_BROKEN_SET_RPA_TIMEOUT, &hdev->quirks))
+	    hci_test_quirk(hdev, HCI_QUIRK_BROKEN_SET_RPA_TIMEOUT))
 		return 0;
 
 	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_RPA_TIMEOUT,
@@ -4609,7 +4609,7 @@ static int hci_delete_stored_link_key_sync(struct hci_dev *hdev)
 	 * just disable this command.
 	 */
 	if (!(hdev->commands[6] & 0x80) ||
-	    test_bit(HCI_QUIRK_BROKEN_STORED_LINK_KEY, &hdev->quirks))
+	    hci_test_quirk(hdev, HCI_QUIRK_BROKEN_STORED_LINK_KEY))
 		return 0;
 
 	memset(&cp, 0, sizeof(cp));
@@ -4735,7 +4735,7 @@ static int hci_set_err_data_report_sync(struct hci_dev *hdev)
 
 	if (!(hdev->commands[18] & 0x08) ||
 	    !(hdev->features[0][6] & LMP_ERR_DATA_REPORTING) ||
-	    test_bit(HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, &hdev->quirks))
+	    hci_test_quirk(hdev, HCI_QUIRK_BROKEN_ERR_DATA_REPORTING))
 		return 0;
 
 	if (enabled == hdev->err_data_reporting)
@@ -4948,7 +4948,7 @@ static int hci_dev_setup_sync(struct hci_dev *hdev)
 	size_t i;
 
 	if (!hci_dev_test_flag(hdev, HCI_SETUP) &&
-	    !test_bit(HCI_QUIRK_NON_PERSISTENT_SETUP, &hdev->quirks))
+	    !hci_test_quirk(hdev, HCI_QUIRK_NON_PERSISTENT_SETUP))
 		return 0;
 
 	bt_dev_dbg(hdev, "");
@@ -4959,7 +4959,7 @@ static int hci_dev_setup_sync(struct hci_dev *hdev)
 		ret = hdev->setup(hdev);
 
 	for (i = 0; i < ARRAY_SIZE(hci_broken_table); i++) {
-		if (test_bit(hci_broken_table[i].quirk, &hdev->quirks))
+		if (hci_test_quirk(hdev, hci_broken_table[i].quirk))
 			bt_dev_warn(hdev, "%s", hci_broken_table[i].desc);
 	}
 
@@ -4967,10 +4967,10 @@ static int hci_dev_setup_sync(struct hci_dev *hdev)
 	 * BD_ADDR invalid before creating the HCI device or in
 	 * its setup callback.
 	 */
-	invalid_bdaddr = test_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks) ||
-			 test_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks);
+	invalid_bdaddr = hci_test_quirk(hdev, HCI_QUIRK_INVALID_BDADDR) ||
+			 hci_test_quirk(hdev, HCI_QUIRK_USE_BDADDR_PROPERTY);
 	if (!ret) {
-		if (test_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks) &&
+		if (hci_test_quirk(hdev, HCI_QUIRK_USE_BDADDR_PROPERTY) &&
 		    !bacmp(&hdev->public_addr, BDADDR_ANY))
 			hci_dev_get_bd_addr_from_property(hdev);
 
@@ -4992,7 +4992,7 @@ static int hci_dev_setup_sync(struct hci_dev *hdev)
 	 * In case any of them is set, the controller has to
 	 * start up as unconfigured.
 	 */
-	if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) ||
+	if (hci_test_quirk(hdev, HCI_QUIRK_EXTERNAL_CONFIG) ||
 	    invalid_bdaddr)
 		hci_dev_set_flag(hdev, HCI_UNCONFIGURED);
 
@@ -5052,7 +5052,7 @@ static int hci_dev_init_sync(struct hci_dev *hdev)
 	 * then they need to be reprogrammed after the init procedure
 	 * completed.
 	 */
-	if (test_bit(HCI_QUIRK_NON_PERSISTENT_DIAG, &hdev->quirks) &&
+	if (hci_test_quirk(hdev, HCI_QUIRK_NON_PERSISTENT_DIAG) &&
 	    !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) &&
 	    hci_dev_test_flag(hdev, HCI_VENDOR_DIAG) && hdev->set_diag)
 		ret = hdev->set_diag(hdev, true);
@@ -5309,7 +5309,7 @@ int hci_dev_close_sync(struct hci_dev *hdev)
 	/* Reset device */
 	skb_queue_purge(&hdev->cmd_q);
 	atomic_set(&hdev->cmd_cnt, 1);
-	if (test_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks) &&
+	if (hci_test_quirk(hdev, HCI_QUIRK_RESET_ON_CLOSE) &&
 	    !auto_off && !hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
 		set_bit(HCI_INIT, &hdev->flags);
 		hci_reset_sync(hdev);
@@ -5959,7 +5959,7 @@ static int hci_active_scan_sync(struct hci_dev *hdev, uint16_t interval)
 		own_addr_type = ADDR_LE_DEV_PUBLIC;
 
 	if (hci_is_adv_monitoring(hdev) ||
-	    (test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks) &&
+	    (hci_test_quirk(hdev, HCI_QUIRK_STRICT_DUPLICATE_FILTER) &&
 	    hdev->discovery.result_filtering)) {
 		/* Duplicate filter should be disabled when some advertisement
 		 * monitor is activated, otherwise AdvMon can only receive one
@@ -6022,8 +6022,7 @@ int hci_start_discovery_sync(struct hci_dev *hdev)
 		 * and LE scanning are done sequentially with separate
 		 * timeouts.
 		 */
-		if (test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY,
-			     &hdev->quirks)) {
+		if (hci_test_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY)) {
 			timeout = msecs_to_jiffies(DISCOV_LE_TIMEOUT);
 			/* During simultaneous discovery, we double LE scan
 			 * interval. We must leave some time for the controller
@@ -6100,7 +6099,7 @@ static int hci_update_event_filter_sync(struct hci_dev *hdev)
 	/* Some fake CSR controllers lock up after setting this type of
 	 * filter, so avoid sending the request altogether.
 	 */
-	if (test_bit(HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL, &hdev->quirks))
+	if (hci_test_quirk(hdev, HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL))
 		return 0;
 
 	/* Always clear event filter when starting */
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 1485b455ade4..63dba0503653 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -464,7 +464,7 @@ static int read_index_list(struct sock *sk, struct hci_dev *hdev, void *data,
 		/* Devices marked as raw-only are neither configured
 		 * nor unconfigured controllers.
 		 */
-		if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks))
+		if (hci_test_quirk(d, HCI_QUIRK_RAW_DEVICE))
 			continue;
 
 		if (!hci_dev_test_flag(d, HCI_UNCONFIGURED)) {
@@ -522,7 +522,7 @@ static int read_unconf_index_list(struct sock *sk, struct hci_dev *hdev,
 		/* Devices marked as raw-only are neither configured
 		 * nor unconfigured controllers.
 		 */
-		if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks))
+		if (hci_test_quirk(d, HCI_QUIRK_RAW_DEVICE))
 			continue;
 
 		if (hci_dev_test_flag(d, HCI_UNCONFIGURED)) {
@@ -576,7 +576,7 @@ static int read_ext_index_list(struct sock *sk, struct hci_dev *hdev,
 		/* Devices marked as raw-only are neither configured
 		 * nor unconfigured controllers.
 		 */
-		if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks))
+		if (hci_test_quirk(d, HCI_QUIRK_RAW_DEVICE))
 			continue;
 
 		if (hci_dev_test_flag(d, HCI_UNCONFIGURED))
@@ -612,12 +612,12 @@ static int read_ext_index_list(struct sock *sk, struct hci_dev *hdev,
 
 static bool is_configured(struct hci_dev *hdev)
 {
-	if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) &&
+	if (hci_test_quirk(hdev, HCI_QUIRK_EXTERNAL_CONFIG) &&
 	    !hci_dev_test_flag(hdev, HCI_EXT_CONFIGURED))
 		return false;
 
-	if ((test_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks) ||
-	     test_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks)) &&
+	if ((hci_test_quirk(hdev, HCI_QUIRK_INVALID_BDADDR) ||
+	     hci_test_quirk(hdev, HCI_QUIRK_USE_BDADDR_PROPERTY)) &&
 	    !bacmp(&hdev->public_addr, BDADDR_ANY))
 		return false;
 
@@ -628,12 +628,12 @@ static __le32 get_missing_options(struct hci_dev *hdev)
 {
 	u32 options = 0;
 
-	if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) &&
+	if (hci_test_quirk(hdev, HCI_QUIRK_EXTERNAL_CONFIG) &&
 	    !hci_dev_test_flag(hdev, HCI_EXT_CONFIGURED))
 		options |= MGMT_OPTION_EXTERNAL_CONFIG;
 
-	if ((test_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks) ||
-	     test_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks)) &&
+	if ((hci_test_quirk(hdev, HCI_QUIRK_INVALID_BDADDR) ||
+	     hci_test_quirk(hdev, HCI_QUIRK_USE_BDADDR_PROPERTY)) &&
 	    !bacmp(&hdev->public_addr, BDADDR_ANY))
 		options |= MGMT_OPTION_PUBLIC_ADDRESS;
 
@@ -669,7 +669,7 @@ static int read_config_info(struct sock *sk, struct hci_dev *hdev,
 	memset(&rp, 0, sizeof(rp));
 	rp.manufacturer = cpu_to_le16(hdev->manufacturer);
 
-	if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks))
+	if (hci_test_quirk(hdev, HCI_QUIRK_EXTERNAL_CONFIG))
 		options |= MGMT_OPTION_EXTERNAL_CONFIG;
 
 	if (hdev->set_bdaddr)
@@ -828,8 +828,7 @@ static u32 get_supported_settings(struct hci_dev *hdev)
 		if (lmp_sc_capable(hdev))
 			settings |= MGMT_SETTING_SECURE_CONN;
 
-		if (test_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED,
-			     &hdev->quirks))
+		if (hci_test_quirk(hdev, HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED))
 			settings |= MGMT_SETTING_WIDEBAND_SPEECH;
 	}
 
@@ -841,8 +840,7 @@ static u32 get_supported_settings(struct hci_dev *hdev)
 		settings |= MGMT_SETTING_ADVERTISING;
 	}
 
-	if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) ||
-	    hdev->set_bdaddr)
+	if (hci_test_quirk(hdev, HCI_QUIRK_EXTERNAL_CONFIG) || hdev->set_bdaddr)
 		settings |= MGMT_SETTING_CONFIGURATION;
 
 	if (cis_central_capable(hdev))
@@ -4307,7 +4305,7 @@ static int set_wideband_speech(struct sock *sk, struct hci_dev *hdev,
 
 	bt_dev_dbg(hdev, "sock %p", sk);
 
-	if (!test_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks))
+	if (!hci_test_quirk(hdev, HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED))
 		return mgmt_cmd_status(sk, hdev->id,
 				       MGMT_OP_SET_WIDEBAND_SPEECH,
 				       MGMT_STATUS_NOT_SUPPORTED);
@@ -7935,7 +7933,7 @@ static int set_external_config(struct sock *sk, struct hci_dev *hdev,
 		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXTERNAL_CONFIG,
 				         MGMT_STATUS_INVALID_PARAMS);
 
-	if (!test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks))
+	if (!hci_test_quirk(hdev, HCI_QUIRK_EXTERNAL_CONFIG))
 		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXTERNAL_CONFIG,
 				       MGMT_STATUS_NOT_SUPPORTED);
 
@@ -9338,7 +9336,7 @@ void mgmt_index_added(struct hci_dev *hdev)
 {
 	struct mgmt_ev_ext_index ev;
 
-	if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks))
+	if (hci_test_quirk(hdev, HCI_QUIRK_RAW_DEVICE))
 		return;
 
 	if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
@@ -9362,7 +9360,7 @@ void mgmt_index_removed(struct hci_dev *hdev)
 	struct mgmt_ev_ext_index ev;
 	struct cmd_lookup match = { NULL, hdev, MGMT_STATUS_INVALID_INDEX };
 
-	if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks))
+	if (hci_test_quirk(hdev, HCI_QUIRK_RAW_DEVICE))
 		return;
 
 	mgmt_pending_foreach(0, hdev, true, cmd_complete_rsp, &match);
@@ -10089,7 +10087,7 @@ static bool is_filter_match(struct hci_dev *hdev, s8 rssi, u8 *eir,
 	if (hdev->discovery.rssi != HCI_RSSI_INVALID &&
 	    (rssi == HCI_RSSI_INVALID ||
 	    (rssi < hdev->discovery.rssi &&
-	     !test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks))))
+	     !hci_test_quirk(hdev, HCI_QUIRK_STRICT_DUPLICATE_FILTER))))
 		return  false;
 
 	if (hdev->discovery.uuid_count != 0) {
@@ -10107,7 +10105,7 @@ static bool is_filter_match(struct hci_dev *hdev, s8 rssi, u8 *eir,
 	/* If duplicate filtering does not report RSSI changes, then restart
 	 * scanning to ensure updated result with updated RSSI values.
 	 */
-	if (test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks)) {
+	if (hci_test_quirk(hdev, HCI_QUIRK_STRICT_DUPLICATE_FILTER)) {
 		/* Validate RSSI value against the RSSI threshold once more. */
 		if (hdev->discovery.rssi != HCI_RSSI_INVALID &&
 		    rssi < hdev->discovery.rssi)
diff --git a/net/bluetooth/msft.c b/net/bluetooth/msft.c
index 5a8ccc491b14..c560d8467669 100644
--- a/net/bluetooth/msft.c
+++ b/net/bluetooth/msft.c
@@ -989,7 +989,7 @@ static void msft_monitor_device_evt(struct hci_dev *hdev, struct sk_buff *skb)
 
 	handle_data = msft_find_handle_data(hdev, ev->monitor_handle, false);
 
-	if (!test_bit(HCI_QUIRK_USE_MSFT_EXT_ADDRESS_FILTER, &hdev->quirks)) {
+	if (!hci_test_quirk(hdev, HCI_QUIRK_USE_MSFT_EXT_ADDRESS_FILTER)) {
 		if (!handle_data)
 			return;
 		mgmt_handle = handle_data->mgmt_handle;
-- 
cgit v1.2.3


From 55d42f6169760d052330f3c949c02e37867b87d8 Mon Sep 17 00:00:00 2001
From: Pratap Nirujogi <pratap.nirujogi@amd.com>
Date: Mon, 23 Jun 2025 18:44:50 -0400
Subject: drm/amd/amdgpu: Add helper functions for isp buffers

Accessing amdgpu internal data structures "struct amdgpu_device"
and "struct amdgpu_bo" in ISP V4L2 driver to alloc/free GART
buffers is not recommended.

Add new amdgpu_isp helper functions that take opaque params
from the ISP V4L2 driver and call the amdgpu internal functions
amdgpu_bo_create_isp_user() and amdgpu_bo_create_kernel() to
alloc/free GART buffers.

Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Pratap Nirujogi <pratap.nirujogi@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_isp.c    | 175 +++++++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_isp.h    |   7 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c |   4 -
 include/drm/amd/isp.h                      |  51 +++++++++
 4 files changed, 227 insertions(+), 10 deletions(-)
 create mode 100644 include/drm/amd/isp.h

(limited to 'include')

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_isp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_isp.c
index 43fc941dfa57..9cddbf50442a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_isp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_isp.c
@@ -33,6 +33,8 @@
 #include "isp_v4_1_0.h"
 #include "isp_v4_1_1.h"
 
+#define ISP_MC_ADDR_ALIGN (1024 * 32)
+
 /**
  * isp_hw_init - start and test isp block
  *
@@ -141,6 +143,179 @@ static int isp_set_powergating_state(struct amdgpu_ip_block *ip_block,
 	return 0;
 }
 
+static int is_valid_isp_device(struct device *isp_parent, struct device *amdgpu_dev)
+{
+	if (isp_parent != amdgpu_dev)
+		return -EINVAL;
+
+	return 0;
+}
+
+/**
+ * isp_user_buffer_alloc - create user buffer object (BO) for isp
+ *
+ * @dev: isp device handle
+ * @dmabuf: DMABUF handle for isp buffer allocated in system memory
+ * @buf_obj: GPU buffer object handle to initialize
+ * @buf_addr: GPU addr of the pinned BO to initialize
+ *
+ * Imports isp DMABUF to allocate and pin a user BO for isp internal use. It does
+ * GART alloc to generate GPU addr for BO to make it accessible through the
+ * GART aperture for ISP HW.
+ *
+ * This function is exported to allow the V4L2 isp device external to drm device
+ * to create and access the isp user BO.
+ *
+ * Returns:
+ * 0 on success, negative error code otherwise.
+ */
+int isp_user_buffer_alloc(struct device *dev, void *dmabuf,
+			  void **buf_obj, u64 *buf_addr)
+{
+	struct platform_device *ispdev = to_platform_device(dev);
+	const struct isp_platform_data *isp_pdata;
+	struct amdgpu_device *adev;
+	struct mfd_cell *mfd_cell;
+	struct amdgpu_bo *bo;
+	u64 gpu_addr;
+	int ret;
+
+	if (WARN_ON(!ispdev))
+		return -ENODEV;
+
+	if (WARN_ON(!buf_obj))
+		return -EINVAL;
+
+	if (WARN_ON(!buf_addr))
+		return -EINVAL;
+
+	mfd_cell = &ispdev->mfd_cell[0];
+	if (!mfd_cell)
+		return -ENODEV;
+
+	isp_pdata = mfd_cell->platform_data;
+	adev = isp_pdata->adev;
+
+	ret = is_valid_isp_device(ispdev->dev.parent, adev->dev);
+	if (ret)
+		return ret;
+
+	ret = amdgpu_bo_create_isp_user(adev, dmabuf,
+					AMDGPU_GEM_DOMAIN_GTT, &bo, &gpu_addr);
+	if (ret) {
+		drm_err(&adev->ddev, "failed to alloc gart user buffer (%d)", ret);
+		return ret;
+	}
+
+	*buf_obj = (void *)bo;
+	*buf_addr = gpu_addr;
+
+	return 0;
+}
+EXPORT_SYMBOL(isp_user_buffer_alloc);
+
+/**
+ * isp_user_buffer_free - free isp user buffer object (BO)
+ *
+ * @buf_obj: amdgpu isp user BO to free
+ *
+ * unpin and unref BO for isp internal use.
+ *
+ * This function is exported to allow the V4L2 isp device
+ * external to drm device to free the isp user BO.
+ */
+void isp_user_buffer_free(void *buf_obj)
+{
+	amdgpu_bo_free_isp_user(buf_obj);
+}
+EXPORT_SYMBOL(isp_user_buffer_free);
+
+/**
+ * isp_kernel_buffer_alloc - create kernel buffer object (BO) for isp
+ *
+ * @dev: isp device handle
+ * @size: size for the new BO
+ * @buf_obj: GPU BO handle to initialize
+ * @gpu_addr: GPU addr of the pinned BO
+ * @cpu_addr: CPU address mapping of BO
+ *
+ * Allocates and pins a kernel BO for internal isp firmware use.
+ *
+ * This function is exported to allow the V4L2 isp device
+ * external to drm device to create and access the kernel BO.
+ *
+ * Returns:
+ * 0 on success, negative error code otherwise.
+ */
+int isp_kernel_buffer_alloc(struct device *dev, u64 size,
+			    void **buf_obj, u64 *gpu_addr, void **cpu_addr)
+{
+	struct platform_device *ispdev = to_platform_device(dev);
+	struct amdgpu_bo **bo = (struct amdgpu_bo **)buf_obj;
+	const struct isp_platform_data *isp_pdata;
+	struct amdgpu_device *adev;
+	struct mfd_cell *mfd_cell;
+	int ret;
+
+	if (WARN_ON(!ispdev))
+		return -ENODEV;
+
+	if (WARN_ON(!buf_obj))
+		return -EINVAL;
+
+	if (WARN_ON(!gpu_addr))
+		return -EINVAL;
+
+	if (WARN_ON(!cpu_addr))
+		return -EINVAL;
+
+	mfd_cell = &ispdev->mfd_cell[0];
+	if (!mfd_cell)
+		return -ENODEV;
+
+	isp_pdata = mfd_cell->platform_data;
+	adev = isp_pdata->adev;
+
+	ret = is_valid_isp_device(ispdev->dev.parent, adev->dev);
+	if (ret)
+		return ret;
+
+	ret = amdgpu_bo_create_kernel(adev,
+				      size,
+				      ISP_MC_ADDR_ALIGN,
+				      AMDGPU_GEM_DOMAIN_GTT,
+				      bo,
+				      gpu_addr,
+				      cpu_addr);
+	if (!cpu_addr || ret) {
+		drm_err(&adev->ddev, "failed to alloc gart kernel buffer (%d)", ret);
+		return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(isp_kernel_buffer_alloc);
+
+/**
+ * isp_kernel_buffer_free - free isp kernel buffer object (BO)
+ *
+ * @buf_obj: amdgpu isp kernel BO to free
+ * @gpu_addr: GPU addr of isp kernel BO
+ * @cpu_addr: CPU addr of isp kernel BO
+ *
+ * Unmaps and unpins an isp kernel BO.
+ *
+ * This function is exported to allow the V4L2 isp device
+ * external to drm device to free the kernel BO.
+ */
+void isp_kernel_buffer_free(void **buf_obj, u64 *gpu_addr, void **cpu_addr)
+{
+	struct amdgpu_bo **bo = (struct amdgpu_bo **)buf_obj;
+
+	amdgpu_bo_free_kernel(bo, gpu_addr, cpu_addr);
+}
+EXPORT_SYMBOL(isp_kernel_buffer_free);
+
 static const struct amd_ip_funcs isp_ip_funcs = {
 	.name = "isp_ip",
 	.early_init = isp_early_init,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_isp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_isp.h
index 1d1c4b1ec7e7..d6f4ffa4c97c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_isp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_isp.h
@@ -28,18 +28,13 @@
 #ifndef __AMDGPU_ISP_H__
 #define __AMDGPU_ISP_H__
 
+#include <drm/amd/isp.h>
 #include <linux/pm_domain.h>
 
 #define ISP_REGS_OFFSET_END 0x629A4
 
 struct amdgpu_isp;
 
-struct isp_platform_data {
-	void *adev;
-	u32 asic_type;
-	resource_size_t base_rmmio_size;
-};
-
 struct isp_funcs {
 	int (*hw_init)(struct amdgpu_isp *isp);
 	int (*hw_fini)(struct amdgpu_isp *isp);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index c5fda18967c8..122a88294883 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -352,7 +352,6 @@ int amdgpu_bo_create_kernel(struct amdgpu_device *adev,
 
 	return 0;
 }
-EXPORT_SYMBOL(amdgpu_bo_create_kernel);
 
 /**
  * amdgpu_bo_create_isp_user - create user BO for isp
@@ -421,7 +420,6 @@ error_unreserve:
 
 	return r;
 }
-EXPORT_SYMBOL(amdgpu_bo_create_isp_user);
 
 /**
  * amdgpu_bo_create_kernel_at - create BO for kernel use at specific location
@@ -525,7 +523,6 @@ void amdgpu_bo_free_kernel(struct amdgpu_bo **bo, u64 *gpu_addr,
 	if (cpu_addr)
 		*cpu_addr = NULL;
 }
-EXPORT_SYMBOL(amdgpu_bo_free_kernel);
 
 /**
  * amdgpu_bo_free_isp_user - free BO for isp use
@@ -548,7 +545,6 @@ void amdgpu_bo_free_isp_user(struct amdgpu_bo *bo)
 	}
 	amdgpu_bo_unref(&bo);
 }
-EXPORT_SYMBOL(amdgpu_bo_free_isp_user);
 
 /* Validate bo size is bit bigger than the request domain */
 static bool amdgpu_bo_validate_size(struct amdgpu_device *adev,
diff --git a/include/drm/amd/isp.h b/include/drm/amd/isp.h
new file mode 100644
index 000000000000..ec868288abf2
--- /dev/null
+++ b/include/drm/amd/isp.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ */
+
+#ifndef __ISP_H__
+#define __ISP_H__
+
+#include <linux/types.h>
+
+struct device;
+
+struct isp_platform_data {
+	void *adev;
+	u32 asic_type;
+	resource_size_t base_rmmio_size;
+};
+
+int isp_user_buffer_alloc(struct device *dev, void *dmabuf,
+			  void **buf_obj, u64 *buf_addr);
+
+void isp_user_buffer_free(void *buf_obj);
+
+int isp_kernel_buffer_alloc(struct device *dev, u64 size,
+			    void **buf_obj, u64 *gpu_addr, void **cpu_addr);
+
+void isp_kernel_buffer_free(void **buf_obj, u64 *gpu_addr, void **cpu_addr);
+
+#endif
-- 
cgit v1.2.3


From 78d0a27ae0e2e70b22895f4b388cc0ab88e3c6ca Mon Sep 17 00:00:00 2001
From: Jesse Zhang <jesse.zhang@amd.com>
Date: Wed, 25 Jun 2025 15:29:45 +0800
Subject: drm/amdgpu: Add user queue instance count in HW IP info

This change exposes the number of available user queue instances
for each hardware IP type (GFX, COMPUTE, SDMA) through the
drm_amdgpu_info_hw_ip interface.

Key changes:
1. Added userq_num_instance field to drm_amdgpu_info_hw_ip structure
2. Implemented counting of available HQD slots using:
   - mes.gfx_hqd_mask for GFX queues
   - mes.compute_hqd_mask for COMPUTE queues
   - mes.sdma_hqd_mask for SDMA queues
3. Only counts available instances when user queues are enabled
   (!disable_uq)

v2: using the adev->mes.gfx_hqd_mask[]/compute_hqd_mask[]/sdma_hqd_mask[] masks
  to determine the number of queue slots available for each engine type (Alex)
v3: rename userq_num_instance to userq_num_hqds (Alex)

Suggested-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 20 ++++++++++++++++++++
 include/uapi/drm/amdgpu_drm.h           |  2 ++
 2 files changed, 22 insertions(+)

(limited to 'include')

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index c78eea14c70a..3d4185a1d938 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -399,6 +399,7 @@ static int amdgpu_hw_ip_info(struct amdgpu_device *adev,
 	uint32_t ib_size_alignment = 0;
 	enum amd_ip_block_type type;
 	unsigned int num_rings = 0;
+	uint32_t num_hqds = 0;
 	unsigned int i, j;
 
 	if (info->query_hw_ip.ip_instance >= AMDGPU_HW_IP_INSTANCE_MAX_COUNT)
@@ -411,6 +412,12 @@ static int amdgpu_hw_ip_info(struct amdgpu_device *adev,
 			if (adev->gfx.gfx_ring[i].sched.ready &&
 			    !adev->gfx.gfx_ring[i].no_user_submission)
 				++num_rings;
+
+		if (!adev->gfx.disable_uq) {
+			for (i = 0; i < AMDGPU_MES_MAX_GFX_PIPES; i++)
+				num_hqds += hweight32(adev->mes.gfx_hqd_mask[i]);
+		}
+
 		ib_start_alignment = 32;
 		ib_size_alignment = 32;
 		break;
@@ -420,6 +427,12 @@ static int amdgpu_hw_ip_info(struct amdgpu_device *adev,
 			if (adev->gfx.compute_ring[i].sched.ready &&
 			    !adev->gfx.compute_ring[i].no_user_submission)
 				++num_rings;
+
+		if (!adev->sdma.disable_uq) {
+			for (i = 0; i < AMDGPU_MES_MAX_COMPUTE_PIPES; i++)
+				num_hqds += hweight32(adev->mes.compute_hqd_mask[i]);
+		}
+
 		ib_start_alignment = 32;
 		ib_size_alignment = 32;
 		break;
@@ -429,6 +442,12 @@ static int amdgpu_hw_ip_info(struct amdgpu_device *adev,
 			if (adev->sdma.instance[i].ring.sched.ready &&
 			    !adev->sdma.instance[i].ring.no_user_submission)
 				++num_rings;
+
+		if (!adev->gfx.disable_uq) {
+			for (i = 0; i < AMDGPU_MES_MAX_SDMA_PIPES; i++)
+				num_hqds += hweight32(adev->mes.sdma_hqd_mask[i]);
+		}
+
 		ib_start_alignment = 256;
 		ib_size_alignment = 4;
 		break;
@@ -570,6 +589,7 @@ static int amdgpu_hw_ip_info(struct amdgpu_device *adev,
 	}
 	result->capabilities_flags = 0;
 	result->available_rings = (1 << num_rings) - 1;
+	result->userq_num_hqds = num_hqds;
 	result->ib_start_alignment = ib_start_alignment;
 	result->ib_size_alignment = ib_size_alignment;
 	return 0;
diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index 45c4fa13499c..66c4a03ac9f9 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -1493,6 +1493,8 @@ struct drm_amdgpu_info_hw_ip {
 	__u32  available_rings;
 	/** version info: bits 23:16 major, 15:8 minor, 7:0 revision */
 	__u32  ip_discovery_version;
+	/* Userq available hqds */
+	__u32  userq_num_hqds;
 };
 
 /* GFX metadata BO sizes and alignment info (in bytes) */
-- 
cgit v1.2.3


From 9ffab039bcb0bbfade0e659552d2fb912347a871 Mon Sep 17 00:00:00 2001
From: Jesse Zhang <jesse.zhang@amd.com>
Date: Fri, 4 Jul 2025 15:17:43 +0800
Subject: drm/amdgpu: Replace HQD terminology with slots naming
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The term "HQD" is CP-specific and doesn't
accurately describe the queue resources for other IP blocks like SDMA,
VCN, or VPE. This change:

1. Renames `num_hqds` to `num_slots` in amdgpu_kms.c to better reflect
   the generic nature of the resource counting
2. Updates the UAPI struct member from `userq_num_hqds` to `userq_num_slots`
3. Maintains the same functionality while using more appropriate terminology

Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 10 +++++-----
 include/uapi/drm/amdgpu_drm.h           |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 3d4185a1d938..8a76960803c6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -399,7 +399,7 @@ static int amdgpu_hw_ip_info(struct amdgpu_device *adev,
 	uint32_t ib_size_alignment = 0;
 	enum amd_ip_block_type type;
 	unsigned int num_rings = 0;
-	uint32_t num_hqds = 0;
+	uint32_t num_slots = 0;
 	unsigned int i, j;
 
 	if (info->query_hw_ip.ip_instance >= AMDGPU_HW_IP_INSTANCE_MAX_COUNT)
@@ -415,7 +415,7 @@ static int amdgpu_hw_ip_info(struct amdgpu_device *adev,
 
 		if (!adev->gfx.disable_uq) {
 			for (i = 0; i < AMDGPU_MES_MAX_GFX_PIPES; i++)
-				num_hqds += hweight32(adev->mes.gfx_hqd_mask[i]);
+				num_slots += hweight32(adev->mes.gfx_hqd_mask[i]);
 		}
 
 		ib_start_alignment = 32;
@@ -430,7 +430,7 @@ static int amdgpu_hw_ip_info(struct amdgpu_device *adev,
 
 		if (!adev->sdma.disable_uq) {
 			for (i = 0; i < AMDGPU_MES_MAX_COMPUTE_PIPES; i++)
-				num_hqds += hweight32(adev->mes.compute_hqd_mask[i]);
+				num_slots += hweight32(adev->mes.compute_hqd_mask[i]);
 		}
 
 		ib_start_alignment = 32;
@@ -445,7 +445,7 @@ static int amdgpu_hw_ip_info(struct amdgpu_device *adev,
 
 		if (!adev->gfx.disable_uq) {
 			for (i = 0; i < AMDGPU_MES_MAX_SDMA_PIPES; i++)
-				num_hqds += hweight32(adev->mes.sdma_hqd_mask[i]);
+				num_slots += hweight32(adev->mes.sdma_hqd_mask[i]);
 		}
 
 		ib_start_alignment = 256;
@@ -589,7 +589,7 @@ static int amdgpu_hw_ip_info(struct amdgpu_device *adev,
 	}
 	result->capabilities_flags = 0;
 	result->available_rings = (1 << num_rings) - 1;
-	result->userq_num_hqds = num_hqds;
+	result->userq_num_slots = num_slots;
 	result->ib_start_alignment = ib_start_alignment;
 	result->ib_size_alignment = ib_size_alignment;
 	return 0;
diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index 66c4a03ac9f9..bdedbaccf776 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -1493,8 +1493,8 @@ struct drm_amdgpu_info_hw_ip {
 	__u32  available_rings;
 	/** version info: bits 23:16 major, 15:8 minor, 7:0 revision */
 	__u32  ip_discovery_version;
-	/* Userq available hqds */
-	__u32  userq_num_hqds;
+	/* Userq available slots */
+	__u32  userq_num_slots;
 };
 
 /* GFX metadata BO sizes and alignment info (in bytes) */
-- 
cgit v1.2.3


From 19d18fdfc79217c86802271c9ce5b4ed174628cc Mon Sep 17 00:00:00 2001
From: Tao Chen <chen.dylane@linux.dev>
Date: Wed, 16 Jul 2025 21:46:53 +0800
Subject: bpf: Add struct bpf_token_info

Commit 35f96de04127 ("bpf: Introduce BPF token object") added
BPF token as a new kind of BPF kernel object. BPF_OBJ_GET_INFO_BY_FD
is already used to get BPF object info, so we can also get token info
with this cmd.
One usage scenario: when a program fails to run with a token because
of a permission failure, we can use this API to report what the BPF
token allows, for debugging.

Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Tao Chen <chen.dylane@linux.dev>
Link: https://lore.kernel.org/r/20250716134654.1162635-1-chen.dylane@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h            | 11 +++++++++++
 include/uapi/linux/bpf.h       |  8 ++++++++
 kernel/bpf/syscall.c           | 18 ++++++++++++++++++
 kernel/bpf/token.c             | 25 ++++++++++++++++++++++++-
 tools/include/uapi/linux/bpf.h |  8 ++++++++
 5 files changed, 69 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index bc887831eaa5..f9cd2164ed23 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2354,6 +2354,7 @@ extern const struct super_operations bpf_super_ops;
 extern const struct file_operations bpf_map_fops;
 extern const struct file_operations bpf_prog_fops;
 extern const struct file_operations bpf_iter_fops;
+extern const struct file_operations bpf_token_fops;
 
 #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
 	extern const struct bpf_prog_ops _name ## _prog_ops; \
@@ -2551,6 +2552,9 @@ void bpf_token_inc(struct bpf_token *token);
 void bpf_token_put(struct bpf_token *token);
 int bpf_token_create(union bpf_attr *attr);
 struct bpf_token *bpf_token_get_from_fd(u32 ufd);
+int bpf_token_get_info_by_fd(struct bpf_token *token,
+			     const union bpf_attr *attr,
+			     union bpf_attr __user *uattr);
 
 bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd);
 bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type);
@@ -2949,6 +2953,13 @@ static inline struct bpf_token *bpf_token_get_from_fd(u32 ufd)
 	return ERR_PTR(-EOPNOTSUPP);
 }
 
+static inline int bpf_token_get_info_by_fd(struct bpf_token *token,
+					   const union bpf_attr *attr,
+					   union bpf_attr __user *uattr)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline void __dev_flush(struct list_head *flush_list)
 {
 }
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0670e15a6100..233de8677382 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -450,6 +450,7 @@ union bpf_iter_link_info {
  *		* **struct bpf_map_info**
  *		* **struct bpf_btf_info**
  *		* **struct bpf_link_info**
+ *		* **struct bpf_token_info**
  *
  *	Return
  *		Returns zero on success. On error, -1 is returned and *errno*
@@ -6803,6 +6804,13 @@ struct bpf_link_info {
 	};
 } __attribute__((aligned(8)));
 
+struct bpf_token_info {
+	__u64 allowed_cmds;
+	__u64 allowed_maps;
+	__u64 allowed_progs;
+	__u64 allowed_attachs;
+} __attribute__((aligned(8)));
+
 /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed
  * by user and intended to be used by socket (e.g. to bind to, depends on
  * attach type).
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 1a26d17536be..e63039817af3 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -5239,6 +5239,21 @@ static int bpf_link_get_info_by_fd(struct file *file,
 }
 
 
+static int token_get_info_by_fd(struct file *file,
+				struct bpf_token *token,
+				const union bpf_attr *attr,
+				union bpf_attr __user *uattr)
+{
+	struct bpf_token_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+	u32 info_len = attr->info.info_len;
+	int err;
+
+	err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len);
+	if (err)
+		return err;
+	return bpf_token_get_info_by_fd(token, attr, uattr);
+}
+
 #define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info
 
 static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
@@ -5262,6 +5277,9 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
 	else if (fd_file(f)->f_op == &bpf_link_fops || fd_file(f)->f_op == &bpf_link_fops_poll)
 		return bpf_link_get_info_by_fd(fd_file(f), fd_file(f)->private_data,
 					      attr, uattr);
+	else if (fd_file(f)->f_op == &bpf_token_fops)
+		return token_get_info_by_fd(fd_file(f), fd_file(f)->private_data,
+					    attr, uattr);
 	return -EINVAL;
 }
 
diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c
index 26057aa13503..0bbe412f854e 100644
--- a/kernel/bpf/token.c
+++ b/kernel/bpf/token.c
@@ -103,7 +103,7 @@ static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp)
 
 static const struct inode_operations bpf_token_iops = { };
 
-static const struct file_operations bpf_token_fops = {
+const struct file_operations bpf_token_fops = {
 	.release	= bpf_token_release,
 	.show_fdinfo	= bpf_token_show_fdinfo,
 };
@@ -210,6 +210,29 @@ out_file:
 	return err;
 }
 
+int bpf_token_get_info_by_fd(struct bpf_token *token,
+			     const union bpf_attr *attr,
+			     union bpf_attr __user *uattr)
+{
+	struct bpf_token_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+	struct bpf_token_info info;
+	u32 info_len = attr->info.info_len;
+
+	info_len = min_t(u32, info_len, sizeof(info));
+	memset(&info, 0, sizeof(info));
+
+	info.allowed_cmds = token->allowed_cmds;
+	info.allowed_maps = token->allowed_maps;
+	info.allowed_progs = token->allowed_progs;
+	info.allowed_attachs = token->allowed_attachs;
+
+	if (copy_to_user(uinfo, &info, info_len) ||
+	    put_user(info_len, &uattr->info.info_len))
+		return -EFAULT;
+
+	return 0;
+}
+
 struct bpf_token *bpf_token_get_from_fd(u32 ufd)
 {
 	CLASS(fd, f)(ufd);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 0670e15a6100..233de8677382 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -450,6 +450,7 @@ union bpf_iter_link_info {
  *		* **struct bpf_map_info**
  *		* **struct bpf_btf_info**
  *		* **struct bpf_link_info**
+ *		* **struct bpf_token_info**
  *
  *	Return
  *		Returns zero on success. On error, -1 is returned and *errno*
@@ -6803,6 +6804,13 @@ struct bpf_link_info {
 	};
 } __attribute__((aligned(8)));
 
+struct bpf_token_info {
+	__u64 allowed_cmds;
+	__u64 allowed_maps;
+	__u64 allowed_progs;
+	__u64 allowed_attachs;
+} __attribute__((aligned(8)));
+
 /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed
  * by user and intended to be used by socket (e.g. to bind to, depends on
  * attach type).
-- 
cgit v1.2.3


From 23972da96e1eee7f10c8ef641d56202ab9af8ba7 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Mon, 30 Jun 2025 14:12:02 +0200
Subject: firmware: qcom: scm: remove unused arguments from SHM bridge routines

qcom_scm_shm_bridge_create() and qcom_scm_shm_bridge_delete() take
struct device as argument but don't use it. Remove it from these
functions' prototypes.

Reviewed-by: Konrad Dybcio <konrad.dybcio@oss.qualcomm.com>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Link: https://lore.kernel.org/r/20250630-qcom-scm-race-v2-1-fa3851c98611@linaro.org
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 drivers/firmware/qcom/qcom_scm.c       | 4 ++--
 drivers/firmware/qcom/qcom_tzmem.c     | 8 ++++----
 include/linux/firmware/qcom/qcom_scm.h | 4 ++--
 3 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/firmware/qcom/qcom_scm.c b/drivers/firmware/qcom/qcom_scm.c
index f63b716be5b0..d830511a0082 100644
--- a/drivers/firmware/qcom/qcom_scm.c
+++ b/drivers/firmware/qcom/qcom_scm.c
@@ -1631,7 +1631,7 @@ int qcom_scm_shm_bridge_enable(void)
 }
 EXPORT_SYMBOL_GPL(qcom_scm_shm_bridge_enable);
 
-int qcom_scm_shm_bridge_create(struct device *dev, u64 pfn_and_ns_perm_flags,
+int qcom_scm_shm_bridge_create(u64 pfn_and_ns_perm_flags,
 			       u64 ipfn_and_s_perm_flags, u64 size_and_flags,
 			       u64 ns_vmids, u64 *handle)
 {
@@ -1659,7 +1659,7 @@ int qcom_scm_shm_bridge_create(struct device *dev, u64 pfn_and_ns_perm_flags,
 }
 EXPORT_SYMBOL_GPL(qcom_scm_shm_bridge_create);
 
-int qcom_scm_shm_bridge_delete(struct device *dev, u64 handle)
+int qcom_scm_shm_bridge_delete(u64 handle)
 {
 	struct qcom_scm_desc desc = {
 		.svc = QCOM_SCM_SVC_MP,
diff --git a/drivers/firmware/qcom/qcom_tzmem.c b/drivers/firmware/qcom/qcom_tzmem.c
index 94196ad87105..4fe333fd2f07 100644
--- a/drivers/firmware/qcom/qcom_tzmem.c
+++ b/drivers/firmware/qcom/qcom_tzmem.c
@@ -124,9 +124,9 @@ static int qcom_tzmem_init_area(struct qcom_tzmem_area *area)
 	if (!handle)
 		return -ENOMEM;
 
-	ret = qcom_scm_shm_bridge_create(qcom_tzmem_dev, pfn_and_ns_perm,
-					 ipfn_and_s_perm, size_and_flags,
-					 QCOM_SCM_VMID_HLOS, handle);
+	ret = qcom_scm_shm_bridge_create(pfn_and_ns_perm, ipfn_and_s_perm,
+					 size_and_flags, QCOM_SCM_VMID_HLOS,
+					 handle);
 	if (ret)
 		return ret;
 
@@ -142,7 +142,7 @@ static void qcom_tzmem_cleanup_area(struct qcom_tzmem_area *area)
 	if (!qcom_tzmem_using_shm_bridge)
 		return;
 
-	qcom_scm_shm_bridge_delete(qcom_tzmem_dev, *handle);
+	qcom_scm_shm_bridge_delete(*handle);
 	kfree(handle);
 }
 
diff --git a/include/linux/firmware/qcom/qcom_scm.h b/include/linux/firmware/qcom/qcom_scm.h
index 983e1591bbba..82b1b8c50ca3 100644
--- a/include/linux/firmware/qcom/qcom_scm.h
+++ b/include/linux/firmware/qcom/qcom_scm.h
@@ -149,10 +149,10 @@ bool qcom_scm_lmh_dcvsh_available(void);
 int qcom_scm_gpu_init_regs(u32 gpu_req);
 
 int qcom_scm_shm_bridge_enable(void);
-int qcom_scm_shm_bridge_create(struct device *dev, u64 pfn_and_ns_perm_flags,
+int qcom_scm_shm_bridge_create(u64 pfn_and_ns_perm_flags,
 			       u64 ipfn_and_s_perm_flags, u64 size_and_flags,
 			       u64 ns_vmids, u64 *handle);
-int qcom_scm_shm_bridge_delete(struct device *dev, u64 handle);
+int qcom_scm_shm_bridge_delete(u64 handle);
 
 #ifdef CONFIG_QCOM_QSEECOM
 
-- 
cgit v1.2.3


From dc3f4e75c54c19bad9a70419afae00ce6baf3ebf Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Mon, 30 Jun 2025 14:12:03 +0200
Subject: firmware: qcom: scm: take struct device as argument in SHM bridge
 enable

qcom_scm_shm_bridge_enable() is used early in the SCM initialization
routine. It makes an SCM call and so expects the internal __scm pointer
in the SCM driver to be assigned. For this reason the tzmem memory pool
is allocated *after* this pointer is assigned. However, this can lead to
a crash if another consumer of the SCM API makes a call using the memory
pool between the assignment of the __scm pointer and the initialization
of the tzmem memory pool.

As qcom_scm_shm_bridge_enable() is a special case, not meant to be
called by ordinary users, pull it into the local SCM header. Make it
take struct device as argument. This is the device that will be used to
make the SCM call as opposed to the global __scm pointer. This will
allow us to move the tzmem initialization *before* the __scm assignment
in the core SCM driver.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Konrad Dybcio <konrad.dybcio@oss.qualcomm.com>
Link: https://lore.kernel.org/r/20250630-qcom-scm-race-v2-2-fa3851c98611@linaro.org
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 drivers/firmware/qcom/qcom_scm.c       | 12 +++++++++---
 drivers/firmware/qcom/qcom_scm.h       |  1 +
 drivers/firmware/qcom/qcom_tzmem.c     |  3 ++-
 include/linux/firmware/qcom/qcom_scm.h |  1 -
 4 files changed, 12 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/firmware/qcom/qcom_scm.c b/drivers/firmware/qcom/qcom_scm.c
index d830511a0082..09b698b90216 100644
--- a/drivers/firmware/qcom/qcom_scm.c
+++ b/drivers/firmware/qcom/qcom_scm.c
@@ -1603,7 +1603,13 @@ bool qcom_scm_lmh_dcvsh_available(void)
 }
 EXPORT_SYMBOL_GPL(qcom_scm_lmh_dcvsh_available);
 
-int qcom_scm_shm_bridge_enable(void)
+/*
+ * This is only supposed to be called once by the TZMem module. It takes the
+ * SCM struct device as argument and uses it to pass the call as at the time
+ * the SHM Bridge is enabled, the SCM is not yet fully set up and doesn't
+ * accept global user calls. Don't try to use the __scm pointer here.
+ */
+int qcom_scm_shm_bridge_enable(struct device *scm_dev)
 {
 	int ret;
 
@@ -1615,11 +1621,11 @@ int qcom_scm_shm_bridge_enable(void)
 
 	struct qcom_scm_res res;
 
-	if (!__qcom_scm_is_call_available(__scm->dev, QCOM_SCM_SVC_MP,
+	if (!__qcom_scm_is_call_available(scm_dev, QCOM_SCM_SVC_MP,
 					  QCOM_SCM_MP_SHM_BRIDGE_ENABLE))
 		return -EOPNOTSUPP;
 
-	ret = qcom_scm_call(__scm->dev, &desc, &res);
+	ret = qcom_scm_call(scm_dev, &desc, &res);
 
 	if (ret)
 		return ret;
diff --git a/drivers/firmware/qcom/qcom_scm.h b/drivers/firmware/qcom/qcom_scm.h
index 3133d826f5fa..0e8dd838099e 100644
--- a/drivers/firmware/qcom/qcom_scm.h
+++ b/drivers/firmware/qcom/qcom_scm.h
@@ -83,6 +83,7 @@ int scm_legacy_call(struct device *dev, const struct qcom_scm_desc *desc,
 		    struct qcom_scm_res *res);
 
 struct qcom_tzmem_pool *qcom_scm_get_tzmem_pool(void);
+int qcom_scm_shm_bridge_enable(struct device *scm_dev);
 
 #define QCOM_SCM_SVC_BOOT		0x01
 #define QCOM_SCM_BOOT_SET_ADDR		0x01
diff --git a/drivers/firmware/qcom/qcom_tzmem.c b/drivers/firmware/qcom/qcom_tzmem.c
index 4fe333fd2f07..ea0a35355657 100644
--- a/drivers/firmware/qcom/qcom_tzmem.c
+++ b/drivers/firmware/qcom/qcom_tzmem.c
@@ -20,6 +20,7 @@
 #include <linux/spinlock.h>
 #include <linux/types.h>
 
+#include "qcom_scm.h"
 #include "qcom_tzmem.h"
 
 struct qcom_tzmem_area {
@@ -94,7 +95,7 @@ static int qcom_tzmem_init(void)
 			goto notsupp;
 	}
 
-	ret = qcom_scm_shm_bridge_enable();
+	ret = qcom_scm_shm_bridge_enable(qcom_tzmem_dev);
 	if (ret == -EOPNOTSUPP)
 		goto notsupp;
 
diff --git a/include/linux/firmware/qcom/qcom_scm.h b/include/linux/firmware/qcom/qcom_scm.h
index 82b1b8c50ca3..0f667bf1d4d9 100644
--- a/include/linux/firmware/qcom/qcom_scm.h
+++ b/include/linux/firmware/qcom/qcom_scm.h
@@ -148,7 +148,6 @@ bool qcom_scm_lmh_dcvsh_available(void);
 
 int qcom_scm_gpu_init_regs(u32 gpu_req);
 
-int qcom_scm_shm_bridge_enable(void);
 int qcom_scm_shm_bridge_create(u64 pfn_and_ns_perm_flags,
 			       u64 ipfn_and_s_perm_flags, u64 size_and_flags,
 			       u64 ns_vmids, u64 *handle);
-- 
cgit v1.2.3


From e53ff5b79fbac35d1fbf2b8c28a5a5dcf125567e Mon Sep 17 00:00:00 2001
From: Luca Weiss <luca.weiss@fairphone.com>
Date: Wed, 25 Jun 2025 11:11:44 +0200
Subject: dt-bindings: arm: qcom,ids: Add SoC IDs for SM7635 family

Add the SoC IDs of the 'volcano' family, namely SM7635, SM6650, SM6650P,
QCM6690 and QCS6690.

Signed-off-by: Luca Weiss <luca.weiss@fairphone.com>
Link: https://lore.kernel.org/r/20250625-sm7635-socinfo-v1-1-be09d5c697b8@fairphone.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 include/dt-bindings/arm/qcom,ids.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/arm/qcom,ids.h b/include/dt-bindings/arm/qcom,ids.h
index 897b8135dc12..cb8ce53146f0 100644
--- a/include/dt-bindings/arm/qcom,ids.h
+++ b/include/dt-bindings/arm/qcom,ids.h
@@ -279,8 +279,13 @@
 #define QCOM_ID_QCM8550			604
 #define QCOM_ID_SM8750			618
 #define QCOM_ID_IPQ5300			624
+#define QCOM_ID_SM7635			636
+#define QCOM_ID_SM6650			640
+#define QCOM_ID_SM6650P			641
 #define QCOM_ID_IPQ5321			650
 #define QCOM_ID_IPQ5424			651
+#define QCOM_ID_QCM6690			657
+#define QCOM_ID_QCS6690			658
 #define QCOM_ID_IPQ5404			671
 #define QCOM_ID_QCS9100			667
 #define QCOM_ID_QCS8300			674
-- 
cgit v1.2.3


From 50b749fab108c2354bb6368d95aaec82e3c99912 Mon Sep 17 00:00:00 2001
From: Rakesh Kota <rakesh.kota@oss.qualcomm.com>
Date: Fri, 4 Jul 2025 17:00:36 +0530
Subject: soc: qcom: spmi-pmic: add more PMIC SUBTYPE IDs

Add the PMM8650AU and PMM8650AU_PSAIL PMIC SUBTYPE IDs.
These PMICs are used by the qcs8300 and qcs9100 platforms.

Signed-off-by: Rakesh Kota <rakesh.kota@oss.qualcomm.com>
Reviewed-by: Konrad Dybcio <konrad.dybcio@oss.qualcomm.com>
Link: https://lore.kernel.org/r/20250704113036.1627695-1-rakesh.kota@oss.qualcomm.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 drivers/soc/qcom/socinfo.c        | 2 ++
 include/soc/qcom/qcom-spmi-pmic.h | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'include')

diff --git a/drivers/soc/qcom/socinfo.c b/drivers/soc/qcom/socinfo.c
index e73644058ddd..30f09a8053f1 100644
--- a/drivers/soc/qcom/socinfo.c
+++ b/drivers/soc/qcom/socinfo.c
@@ -128,6 +128,8 @@ static const char *const pmic_models[] = {
 	[72] = "PMR735D",
 	[73] = "PM8550",
 	[74] = "PMK8550",
+	[78] = "PMM8650AU",
+	[79] = "PMM8650AU_PSAIL",
 	[80] = "PM7550",
 	[82] = "PMC8380",
 	[83] = "SMB2360",
diff --git a/include/soc/qcom/qcom-spmi-pmic.h b/include/soc/qcom/qcom-spmi-pmic.h
index df3d3a0af98a..2cf9e2d8cd55 100644
--- a/include/soc/qcom/qcom-spmi-pmic.h
+++ b/include/soc/qcom/qcom-spmi-pmic.h
@@ -50,6 +50,8 @@
 #define PMR735B_SUBTYPE		0x34
 #define PM6350_SUBTYPE		0x36
 #define PM4125_SUBTYPE		0x37
+#define PMM8650AU_SUBTYPE       0x4e
+#define PMM8650AU_PSAIL_SUBTYPE 0x4f
 
 #define PMI8998_FAB_ID_SMIC	0x11
 #define PMI8998_FAB_ID_GF	0x30
-- 
cgit v1.2.3


From 314b903c30040632db7edd187cd33003b2aee512 Mon Sep 17 00:00:00 2001
From: George Moussalem <george.moussalem@outlook.com>
Date: Fri, 16 May 2025 16:36:09 +0400
Subject: dt-bindings: clock: qcom: Add CMN PLL support for IPQ5018 SoC

The CMN PLL block in the IPQ5018 SoC takes 96 MHz as the reference
input clock. Its output clocks are the XO (24 MHz), sleep (32 kHz), and
ethernet (50 MHz) clocks.

Reviewed-by: Rob Herring (Arm) <robh@kernel.org>
Signed-off-by: George Moussalem <george.moussalem@outlook.com>
Link: https://lore.kernel.org/r/20250516-ipq5018-cmn-pll-v4-2-389a6b30e504@outlook.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 .../devicetree/bindings/clock/qcom,ipq9574-cmn-pll.yaml  |  1 +
 include/dt-bindings/clock/qcom,ipq5018-cmn-pll.h         | 16 ++++++++++++++++
 2 files changed, 17 insertions(+)
 create mode 100644 include/dt-bindings/clock/qcom,ipq5018-cmn-pll.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/qcom,ipq9574-cmn-pll.yaml b/Documentation/devicetree/bindings/clock/qcom,ipq9574-cmn-pll.yaml
index f869b3739be8..b5f6dcfbb0fc 100644
--- a/Documentation/devicetree/bindings/clock/qcom,ipq9574-cmn-pll.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,ipq9574-cmn-pll.yaml
@@ -24,6 +24,7 @@ description:
 properties:
   compatible:
     enum:
+      - qcom,ipq5018-cmn-pll
       - qcom,ipq9574-cmn-pll
 
   reg:
diff --git a/include/dt-bindings/clock/qcom,ipq5018-cmn-pll.h b/include/dt-bindings/clock/qcom,ipq5018-cmn-pll.h
new file mode 100644
index 000000000000..586d1c9b33b3
--- /dev/null
+++ b/include/dt-bindings/clock/qcom,ipq5018-cmn-pll.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ */
+
+#ifndef _DT_BINDINGS_CLK_QCOM_IPQ5018_CMN_PLL_H
+#define _DT_BINDINGS_CLK_QCOM_IPQ5018_CMN_PLL_H
+
+/* CMN PLL core clock. */
+#define IPQ5018_CMN_PLL_CLK			0
+
+/* The output clocks from CMN PLL of IPQ5018. */
+#define IPQ5018_XO_24MHZ_CLK			1
+#define IPQ5018_SLEEP_32KHZ_CLK			2
+#define IPQ5018_ETH_50MHZ_CLK			3
+#endif
-- 
cgit v1.2.3


From 8df29649903c067138180ef89f315b6f166b8732 Mon Sep 17 00:00:00 2001
From: Taniya Das <quic_tdas@quicinc.com>
Date: Wed, 2 Jul 2025 14:34:22 +0530
Subject: dt-bindings: clock: Add Qualcomm QCS615 Camera clock controller

Add DT bindings for the Camera clock on QCS615 platforms. Add the
relevant DT include definitions as well.

Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Taniya Das <quic_tdas@quicinc.com>
Link: https://lore.kernel.org/r/20250702-qcs615-mm-v10-clock-controllers-v11-2-9c216e1615ab@quicinc.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 .../bindings/clock/qcom,sm6350-camcc.yaml          |  11 ++-
 include/dt-bindings/clock/qcom,qcs615-camcc.h      | 110 +++++++++++++++++++++
 2 files changed, 118 insertions(+), 3 deletions(-)
 create mode 100644 include/dt-bindings/clock/qcom,qcs615-camcc.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/qcom,sm6350-camcc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm6350-camcc.yaml
index c03b30f64f35..e31cd4300f7d 100644
--- a/Documentation/devicetree/bindings/clock/qcom,sm6350-camcc.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,sm6350-camcc.yaml
@@ -8,16 +8,21 @@ title: Qualcomm Camera Clock & Reset Controller on SM6350
 
 maintainers:
   - Konrad Dybcio <konradybcio@kernel.org>
+  - Taniya Das <quic_tdas@quicinc.com>
 
 description: |
   Qualcomm camera clock control module provides the clocks, resets and  power
-  domains on SM6350.
+  domains on SM6350 and QCS615 SoC.
 
-  See also:: include/dt-bindings/clock/qcom,sm6350-camcc.h
+  See also:
+    include/dt-bindings/clock/qcom,qcs615-camcc.h
+    include/dt-bindings/clock/qcom,sm6350-camcc.h
 
 properties:
   compatible:
-    const: qcom,sm6350-camcc
+    enum:
+      - qcom,qcs615-camcc
+      - qcom,sm6350-camcc
 
   clocks:
     items:
diff --git a/include/dt-bindings/clock/qcom,qcs615-camcc.h b/include/dt-bindings/clock/qcom,qcs615-camcc.h
new file mode 100644
index 000000000000..aec57dddc067
--- /dev/null
+++ b/include/dt-bindings/clock/qcom,qcs615-camcc.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (c) 2024, Qualcomm Innovation Center, Inc. All rights reserved.
+ */
+
+#ifndef _DT_BINDINGS_CLK_QCOM_CAM_CC_QCS615_H
+#define _DT_BINDINGS_CLK_QCOM_CAM_CC_QCS615_H
+
+/* CAM_CC clocks */
+#define CAM_CC_BPS_AHB_CLK					0
+#define CAM_CC_BPS_AREG_CLK					1
+#define CAM_CC_BPS_AXI_CLK					2
+#define CAM_CC_BPS_CLK						3
+#define CAM_CC_BPS_CLK_SRC					4
+#define CAM_CC_CAMNOC_ATB_CLK					5
+#define CAM_CC_CAMNOC_AXI_CLK					6
+#define CAM_CC_CCI_CLK						7
+#define CAM_CC_CCI_CLK_SRC					8
+#define CAM_CC_CORE_AHB_CLK					9
+#define CAM_CC_CPAS_AHB_CLK					10
+#define CAM_CC_CPHY_RX_CLK_SRC					11
+#define CAM_CC_CSI0PHYTIMER_CLK					12
+#define CAM_CC_CSI0PHYTIMER_CLK_SRC				13
+#define CAM_CC_CSI1PHYTIMER_CLK					14
+#define CAM_CC_CSI1PHYTIMER_CLK_SRC				15
+#define CAM_CC_CSI2PHYTIMER_CLK					16
+#define CAM_CC_CSI2PHYTIMER_CLK_SRC				17
+#define CAM_CC_CSIPHY0_CLK					18
+#define CAM_CC_CSIPHY1_CLK					19
+#define CAM_CC_CSIPHY2_CLK					20
+#define CAM_CC_FAST_AHB_CLK_SRC					21
+#define CAM_CC_ICP_ATB_CLK					22
+#define CAM_CC_ICP_CLK						23
+#define CAM_CC_ICP_CLK_SRC					24
+#define CAM_CC_ICP_CTI_CLK					25
+#define CAM_CC_ICP_TS_CLK					26
+#define CAM_CC_IFE_0_AXI_CLK					27
+#define CAM_CC_IFE_0_CLK					28
+#define CAM_CC_IFE_0_CLK_SRC					29
+#define CAM_CC_IFE_0_CPHY_RX_CLK				30
+#define CAM_CC_IFE_0_CSID_CLK					31
+#define CAM_CC_IFE_0_CSID_CLK_SRC				32
+#define CAM_CC_IFE_0_DSP_CLK					33
+#define CAM_CC_IFE_1_AXI_CLK					34
+#define CAM_CC_IFE_1_CLK					35
+#define CAM_CC_IFE_1_CLK_SRC					36
+#define CAM_CC_IFE_1_CPHY_RX_CLK				37
+#define CAM_CC_IFE_1_CSID_CLK					38
+#define CAM_CC_IFE_1_CSID_CLK_SRC				39
+#define CAM_CC_IFE_1_DSP_CLK					40
+#define CAM_CC_IFE_LITE_CLK					41
+#define CAM_CC_IFE_LITE_CLK_SRC					42
+#define CAM_CC_IFE_LITE_CPHY_RX_CLK				43
+#define CAM_CC_IFE_LITE_CSID_CLK				44
+#define CAM_CC_IFE_LITE_CSID_CLK_SRC				45
+#define CAM_CC_IPE_0_AHB_CLK					46
+#define CAM_CC_IPE_0_AREG_CLK					47
+#define CAM_CC_IPE_0_AXI_CLK					48
+#define CAM_CC_IPE_0_CLK					49
+#define CAM_CC_IPE_0_CLK_SRC					50
+#define CAM_CC_JPEG_CLK						51
+#define CAM_CC_JPEG_CLK_SRC					52
+#define CAM_CC_LRME_CLK						53
+#define CAM_CC_LRME_CLK_SRC					54
+#define CAM_CC_MCLK0_CLK					55
+#define CAM_CC_MCLK0_CLK_SRC					56
+#define CAM_CC_MCLK1_CLK					57
+#define CAM_CC_MCLK1_CLK_SRC					58
+#define CAM_CC_MCLK2_CLK					59
+#define CAM_CC_MCLK2_CLK_SRC					60
+#define CAM_CC_MCLK3_CLK					61
+#define CAM_CC_MCLK3_CLK_SRC					62
+#define CAM_CC_PLL0						63
+#define CAM_CC_PLL1						64
+#define CAM_CC_PLL2						65
+#define CAM_CC_PLL2_OUT_AUX2					66
+#define CAM_CC_PLL3						67
+#define CAM_CC_SLOW_AHB_CLK_SRC					68
+#define CAM_CC_SOC_AHB_CLK					69
+#define CAM_CC_SYS_TMR_CLK					70
+
+/* CAM_CC power domains */
+#define BPS_GDSC						0
+#define IFE_0_GDSC						1
+#define IFE_1_GDSC						2
+#define IPE_0_GDSC						3
+#define TITAN_TOP_GDSC						4
+
+/* CAM_CC resets */
+#define CAM_CC_BPS_BCR						0
+#define CAM_CC_CAMNOC_BCR					1
+#define CAM_CC_CCI_BCR						2
+#define CAM_CC_CPAS_BCR						3
+#define CAM_CC_CSI0PHY_BCR					4
+#define CAM_CC_CSI1PHY_BCR					5
+#define CAM_CC_CSI2PHY_BCR					6
+#define CAM_CC_ICP_BCR						7
+#define CAM_CC_IFE_0_BCR					8
+#define CAM_CC_IFE_1_BCR					9
+#define CAM_CC_IFE_LITE_BCR					10
+#define CAM_CC_IPE_0_BCR					11
+#define CAM_CC_JPEG_BCR						12
+#define CAM_CC_LRME_BCR						13
+#define CAM_CC_MCLK0_BCR					14
+#define CAM_CC_MCLK1_BCR					15
+#define CAM_CC_MCLK2_BCR					16
+#define CAM_CC_MCLK3_BCR					17
+#define CAM_CC_TITAN_TOP_BCR					18
+
+#endif
-- 
cgit v1.2.3


From 8b1750ea009f2774a3acd6c7bc9e61b5157101d1 Mon Sep 17 00:00:00 2001
From: Taniya Das <quic_tdas@quicinc.com>
Date: Wed, 2 Jul 2025 14:34:24 +0530
Subject: dt-bindings: clock: Add Qualcomm QCS615 Display clock controller

Add DT bindings for the Display clock on QCS615 platforms. Add the
relevant DT include definitions as well.

Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Taniya Das <quic_tdas@quicinc.com>
Link: https://lore.kernel.org/r/20250702-qcs615-mm-v10-clock-controllers-v11-4-9c216e1615ab@quicinc.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 .../bindings/clock/qcom,qcs615-dispcc.yaml         | 55 ++++++++++++++++++++++
 include/dt-bindings/clock/qcom,qcs615-dispcc.h     | 52 ++++++++++++++++++++
 2 files changed, 107 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/clock/qcom,qcs615-dispcc.yaml
 create mode 100644 include/dt-bindings/clock/qcom,qcs615-dispcc.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/qcom,qcs615-dispcc.yaml b/Documentation/devicetree/bindings/clock/qcom,qcs615-dispcc.yaml
new file mode 100644
index 000000000000..d566f19beb0d
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,qcs615-dispcc.yaml
@@ -0,0 +1,55 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,qcs615-dispcc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Display Clock & Reset Controller on QCS615
+
+maintainers:
+  - Taniya Das <quic_tdas@quicinc.com>
+
+description: |
+  Qualcomm display clock control module provides the clocks, resets and power
+  domains on QCS615.
+
+  See also: include/dt-bindings/clock/qcom,qcs615-dispcc.h
+
+properties:
+  compatible:
+    const: qcom,qcs615-dispcc
+
+  clocks:
+    items:
+      - description: Board XO source
+      - description: GPLL0 clock source from GCC
+      - description: Byte clock from DSI PHY0
+      - description: Pixel clock from DSI PHY0
+      - description: Pixel clock from DSI PHY1
+      - description: Display port PLL link clock
+      - description: Display port PLL VCO DIV clock
+
+allOf:
+  - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/clock/qcom,rpmh.h>
+    #include <dt-bindings/clock/qcom,qcs615-gcc.h>
+    clock-controller@af00000 {
+      compatible = "qcom,qcs615-dispcc";
+      reg = <0x0af00000 0x20000>;
+      clocks = <&rpmhcc RPMH_CXO_CLK>,
+               <&gcc GCC_DISP_GPLL0_DIV_CLK_SRC>,
+               <&mdss_dsi0_phy 0>,
+               <&mdss_dsi0_phy 1>,
+               <&mdss_dsi1_phy 0>,
+               <&mdss_dp_phy 0>,
+               <&mdss_dp_vco 0>;
+      #clock-cells = <1>;
+      #reset-cells = <1>;
+      #power-domain-cells = <1>;
+    };
+...
diff --git a/include/dt-bindings/clock/qcom,qcs615-dispcc.h b/include/dt-bindings/clock/qcom,qcs615-dispcc.h
new file mode 100644
index 000000000000..9a29945c5762
--- /dev/null
+++ b/include/dt-bindings/clock/qcom,qcs615-dispcc.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (c) 2024, Qualcomm Innovation Center, Inc. All rights reserved.
+ */
+
+#ifndef _DT_BINDINGS_CLK_QCOM_DISP_CC_QCS615_H
+#define _DT_BINDINGS_CLK_QCOM_DISP_CC_QCS615_H
+
+/* DISP_CC clocks */
+#define DISP_CC_MDSS_AHB_CLK					0
+#define DISP_CC_MDSS_AHB_CLK_SRC				1
+#define DISP_CC_MDSS_BYTE0_CLK					2
+#define DISP_CC_MDSS_BYTE0_CLK_SRC				3
+#define DISP_CC_MDSS_BYTE0_DIV_CLK_SRC				4
+#define DISP_CC_MDSS_BYTE0_INTF_CLK				5
+#define DISP_CC_MDSS_DP_AUX_CLK					6
+#define DISP_CC_MDSS_DP_AUX_CLK_SRC				7
+#define DISP_CC_MDSS_DP_CRYPTO_CLK				8
+#define DISP_CC_MDSS_DP_CRYPTO_CLK_SRC				9
+#define DISP_CC_MDSS_DP_LINK_CLK				10
+#define DISP_CC_MDSS_DP_LINK_CLK_SRC				11
+#define DISP_CC_MDSS_DP_LINK_DIV_CLK_SRC			12
+#define DISP_CC_MDSS_DP_LINK_INTF_CLK				13
+#define DISP_CC_MDSS_DP_PIXEL1_CLK				14
+#define DISP_CC_MDSS_DP_PIXEL1_CLK_SRC				15
+#define DISP_CC_MDSS_DP_PIXEL_CLK				16
+#define DISP_CC_MDSS_DP_PIXEL_CLK_SRC				17
+#define DISP_CC_MDSS_ESC0_CLK					18
+#define DISP_CC_MDSS_ESC0_CLK_SRC				19
+#define DISP_CC_MDSS_MDP_CLK					20
+#define DISP_CC_MDSS_MDP_CLK_SRC				21
+#define DISP_CC_MDSS_MDP_LUT_CLK				22
+#define DISP_CC_MDSS_NON_GDSC_AHB_CLK				23
+#define DISP_CC_MDSS_PCLK0_CLK					24
+#define DISP_CC_MDSS_PCLK0_CLK_SRC				25
+#define DISP_CC_MDSS_ROT_CLK					26
+#define DISP_CC_MDSS_ROT_CLK_SRC				27
+#define DISP_CC_MDSS_RSCC_AHB_CLK				28
+#define DISP_CC_MDSS_RSCC_VSYNC_CLK				29
+#define DISP_CC_MDSS_VSYNC_CLK					30
+#define DISP_CC_MDSS_VSYNC_CLK_SRC				31
+#define DISP_CC_PLL0						32
+#define DISP_CC_XO_CLK						33
+
+/* DISP_CC power domains */
+#define MDSS_CORE_GDSC						0
+
+/* DISP_CC resets */
+#define DISP_CC_MDSS_CORE_BCR					0
+#define DISP_CC_MDSS_RSCC_BCR					1
+
+#endif
-- 
cgit v1.2.3


From 3590dfbdd1b3e4ceba0b7daed2a396f644c277c4 Mon Sep 17 00:00:00 2001
From: Taniya Das <quic_tdas@quicinc.com>
Date: Wed, 2 Jul 2025 14:34:26 +0530
Subject: dt-bindings: clock: Add Qualcomm QCS615 Graphics clock controller

Add DT bindings for the Graphics clock on QCS615 platforms. Add the
relevant DT include definitions as well.

Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Taniya Das <quic_tdas@quicinc.com>
Link: https://lore.kernel.org/r/20250702-qcs615-mm-v10-clock-controllers-v11-6-9c216e1615ab@quicinc.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 .../bindings/clock/qcom,qcs615-gpucc.yaml          | 49 ++++++++++++++++++++++
 include/dt-bindings/clock/qcom,qcs615-gpucc.h      | 39 +++++++++++++++++
 2 files changed, 88 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/clock/qcom,qcs615-gpucc.yaml
 create mode 100644 include/dt-bindings/clock/qcom,qcs615-gpucc.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/qcom,qcs615-gpucc.yaml b/Documentation/devicetree/bindings/clock/qcom,qcs615-gpucc.yaml
new file mode 100644
index 000000000000..5f7d83d1a7be
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,qcs615-gpucc.yaml
@@ -0,0 +1,49 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,qcs615-gpucc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Graphics Clock & Reset Controller on QCS615
+
+maintainers:
+  - Taniya Das <quic_tdas@quicinc.com>
+
+description: |
+  Qualcomm graphics clock control module provides clocks, resets and power
+  domains on QCS615 Qualcomm SoCs.
+
+  See also: include/dt-bindings/clock/qcom,qcs615-gpucc.h
+
+properties:
+  compatible:
+    const: qcom,qcs615-gpucc
+
+  clocks:
+    items:
+      - description: Board XO source
+      - description: GPLL0 main branch source
+      - description: GPLL0 GPUCC div branch source
+
+allOf:
+  - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/clock/qcom,rpmh.h>
+    #include <dt-bindings/clock/qcom,qcs615-gcc.h>
+
+    clock-controller@5090000 {
+      compatible = "qcom,qcs615-gpucc";
+      reg = <0x5090000 0x9000>;
+      clocks = <&rpmhcc RPMH_CXO_CLK>,
+               <&gcc GPLL0>,
+               <&gcc GCC_GPU_GPLL0_DIV_CLK_SRC>;
+
+      #clock-cells = <1>;
+      #reset-cells = <1>;
+      #power-domain-cells = <1>;
+    };
+...
diff --git a/include/dt-bindings/clock/qcom,qcs615-gpucc.h b/include/dt-bindings/clock/qcom,qcs615-gpucc.h
new file mode 100644
index 000000000000..6d8394b90d59
--- /dev/null
+++ b/include/dt-bindings/clock/qcom,qcs615-gpucc.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (c) 2024, Qualcomm Innovation Center, Inc. All rights reserved.
+ */
+
+#ifndef _DT_BINDINGS_CLK_QCOM_GPU_CC_QCS615_H
+#define _DT_BINDINGS_CLK_QCOM_GPU_CC_QCS615_H
+
+/* GPU_CC clocks */
+#define CRC_DIV_PLL0						0
+#define CRC_DIV_PLL1						1
+#define GPU_CC_PLL0						2
+#define GPU_CC_PLL1						3
+#define GPU_CC_CRC_AHB_CLK					4
+#define GPU_CC_CX_GFX3D_CLK					5
+#define GPU_CC_CX_GFX3D_SLV_CLK					6
+#define GPU_CC_CX_GMU_CLK					7
+#define GPU_CC_CX_SNOC_DVM_CLK					8
+#define GPU_CC_CXO_AON_CLK					9
+#define GPU_CC_CXO_CLK						10
+#define GPU_CC_GMU_CLK_SRC					11
+#define GPU_CC_GX_GFX3D_CLK					12
+#define GPU_CC_GX_GFX3D_CLK_SRC					13
+#define GPU_CC_GX_GMU_CLK					14
+#define GPU_CC_HLOS1_VOTE_GPU_SMMU_CLK				15
+#define GPU_CC_SLEEP_CLK					16
+
+/* GPU_CC power domains */
+#define CX_GDSC							0
+#define GX_GDSC							1
+
+/* GPU_CC resets */
+#define GPU_CC_CX_BCR						0
+#define GPU_CC_GFX3D_AON_BCR					1
+#define GPU_CC_GMU_BCR						2
+#define GPU_CC_GX_BCR						3
+#define GPU_CC_XO_BCR						4
+
+#endif
-- 
cgit v1.2.3


From 9c51c66c997cae09c12ec250a9f538c0c23d8930 Mon Sep 17 00:00:00 2001
From: Taniya Das <quic_tdas@quicinc.com>
Date: Wed, 2 Jul 2025 14:34:28 +0530
Subject: dt-bindings: clock: Add Qualcomm QCS615 Video clock controller

Add DT bindings for the Video clock on QCS615 platforms. Add the
relevant DT include definitions as well.

Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Taniya Das <quic_tdas@quicinc.com>
Link: https://lore.kernel.org/r/20250702-qcs615-mm-v10-clock-controllers-v11-8-9c216e1615ab@quicinc.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 .../bindings/clock/qcom,qcs615-videocc.yaml        | 47 ++++++++++++++++++++++
 include/dt-bindings/clock/qcom,qcs615-videocc.h    | 30 ++++++++++++++
 2 files changed, 77 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/clock/qcom,qcs615-videocc.yaml
 create mode 100644 include/dt-bindings/clock/qcom,qcs615-videocc.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/qcom,qcs615-videocc.yaml b/Documentation/devicetree/bindings/clock/qcom,qcs615-videocc.yaml
new file mode 100644
index 000000000000..f51b69de1047
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,qcs615-videocc.yaml
@@ -0,0 +1,47 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,qcs615-videocc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Video Clock & Reset Controller on QCS615
+
+maintainers:
+  - Taniya Das <quic_tdas@quicinc.com>
+
+description: |
+  Qualcomm video clock control module provides clocks, resets and power
+  domains on QCS615 Qualcomm SoCs.
+
+  See also: include/dt-bindings/clock/qcom,qcs615-videocc.h
+
+properties:
+  compatible:
+    const: qcom,qcs615-videocc
+
+  clocks:
+    items:
+      - description: Board XO source
+      - description: Sleep clock source
+
+allOf:
+  - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/clock/qcom,rpmh.h>
+    #include <dt-bindings/clock/qcom,qcs615-gcc.h>
+
+    clock-controller@ab00000 {
+      compatible = "qcom,qcs615-videocc";
+      reg = <0xab00000 0x10000>;
+      clocks = <&rpmhcc RPMH_CXO_CLK>,
+               <&sleep_clk>;
+
+      #clock-cells = <1>;
+      #reset-cells = <1>;
+      #power-domain-cells = <1>;
+    };
+...
diff --git a/include/dt-bindings/clock/qcom,qcs615-videocc.h b/include/dt-bindings/clock/qcom,qcs615-videocc.h
new file mode 100644
index 000000000000..0ca3efb21103
--- /dev/null
+++ b/include/dt-bindings/clock/qcom,qcs615-videocc.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (c) 2024, Qualcomm Innovation Center, Inc. All rights reserved.
+ */
+
+#ifndef _DT_BINDINGS_CLK_QCOM_VIDEO_CC_QCS615_H
+#define _DT_BINDINGS_CLK_QCOM_VIDEO_CC_QCS615_H
+
+/* VIDEO_CC clocks */
+#define VIDEO_CC_SLEEP_CLK					0
+#define VIDEO_CC_SLEEP_CLK_SRC					1
+#define VIDEO_CC_VCODEC0_AXI_CLK				2
+#define VIDEO_CC_VCODEC0_CORE_CLK				3
+#define VIDEO_CC_VENUS_AHB_CLK					4
+#define VIDEO_CC_VENUS_CLK_SRC					5
+#define VIDEO_CC_VENUS_CTL_AXI_CLK				6
+#define VIDEO_CC_VENUS_CTL_CORE_CLK				7
+#define VIDEO_CC_XO_CLK						8
+#define VIDEO_PLL0						9
+
+/* VIDEO_CC power domains */
+#define VCODEC0_GDSC						0
+#define VENUS_GDSC						1
+
+/* VIDEO_CC resets */
+#define VIDEO_CC_INTERFACE_BCR					0
+#define VIDEO_CC_VCODEC0_BCR					1
+#define VIDEO_CC_VENUS_BCR					2
+
+#endif
-- 
cgit v1.2.3


From d0b706509fb04449add5446e51a494bfeadcac10 Mon Sep 17 00:00:00 2001
From: Stephan Gerhold <stephan.gerhold@linaro.org>
Date: Wed, 9 Jul 2025 12:08:56 +0200
Subject: dt-bindings: clock: qcom,x1e80100-gcc: Add missing video resets

Add the missing video resets that are needed for the iris video codec.

Acked-by: Rob Herring (Arm) <robh@kernel.org>
Reviewed-by: Bryan O'Donoghue <bryan.odonoghue@linaro.org>
Signed-off-by: Stephan Gerhold <stephan.gerhold@linaro.org>
Link: https://lore.kernel.org/r/20250709-x1e-videocc-v2-4-ad1acf5674b4@linaro.org
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 include/dt-bindings/clock/qcom,x1e80100-gcc.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/qcom,x1e80100-gcc.h b/include/dt-bindings/clock/qcom,x1e80100-gcc.h
index 24ba9e2a5cf6..710c340f24a5 100644
--- a/include/dt-bindings/clock/qcom,x1e80100-gcc.h
+++ b/include/dt-bindings/clock/qcom,x1e80100-gcc.h
@@ -482,4 +482,6 @@
 #define GCC_USB_1_PHY_BCR					85
 #define GCC_USB_2_PHY_BCR					86
 #define GCC_VIDEO_BCR						87
+#define GCC_VIDEO_AXI0_CLK_ARES					88
+#define GCC_VIDEO_AXI1_CLK_ARES					89
 #endif
-- 
cgit v1.2.3


From 95ba6820a665c25f372a3cdc9c469bb0a86bf174 Mon Sep 17 00:00:00 2001
From: Luca Weiss <luca.weiss@fairphone.com>
Date: Tue, 15 Jul 2025 09:19:02 +0200
Subject: dt-bindings: clock: qcom: document the Milos Global Clock Controller

Add bindings documentation for the Milos (e.g. SM7635) Global Clock
Controller.

Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Luca Weiss <luca.weiss@fairphone.com>
Link: https://lore.kernel.org/r/20250715-sm7635-clocks-v3-2-18f9faac4984@fairphone.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 .../devicetree/bindings/clock/qcom,milos-gcc.yaml  |  62 ++++++
 include/dt-bindings/clock/qcom,milos-gcc.h         | 210 +++++++++++++++++++++
 2 files changed, 272 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/clock/qcom,milos-gcc.yaml
 create mode 100644 include/dt-bindings/clock/qcom,milos-gcc.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/qcom,milos-gcc.yaml b/Documentation/devicetree/bindings/clock/qcom,milos-gcc.yaml
new file mode 100644
index 000000000000..cf244c155f9a
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,milos-gcc.yaml
@@ -0,0 +1,62 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,milos-gcc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Global Clock & Reset Controller on Milos
+
+maintainers:
+  - Luca Weiss <luca.weiss@fairphone.com>
+
+description: |
+  Qualcomm global clock control module provides the clocks, resets and power
+  domains on Milos.
+
+  See also: include/dt-bindings/clock/qcom,milos-gcc.h
+
+properties:
+  compatible:
+    const: qcom,milos-gcc
+
+  clocks:
+    items:
+      - description: Board XO source
+      - description: Sleep clock source
+      - description: PCIE 0 Pipe clock source
+      - description: PCIE 1 Pipe clock source
+      - description: UFS Phy Rx symbol 0 clock source
+      - description: UFS Phy Rx symbol 1 clock source
+      - description: UFS Phy Tx symbol 0 clock source
+      - description: USB3 Phy wrapper pipe clock source
+
+required:
+  - compatible
+  - clocks
+  - '#power-domain-cells'
+
+allOf:
+  - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/clock/qcom,rpmh.h>
+    clock-controller@100000 {
+        compatible = "qcom,milos-gcc";
+        reg = <0x00100000 0x1f4200>;
+        clocks = <&rpmhcc RPMH_CXO_CLK>,
+                 <&sleep_clk>,
+                 <&pcie0_phy>,
+                 <&pcie1_phy>,
+                 <&ufs_mem_phy 0>,
+                 <&ufs_mem_phy 1>,
+                 <&ufs_mem_phy 2>,
+                 <&usb_1_qmpphy>;
+        #clock-cells = <1>;
+        #reset-cells = <1>;
+        #power-domain-cells = <1>;
+    };
+
+...
diff --git a/include/dt-bindings/clock/qcom,milos-gcc.h b/include/dt-bindings/clock/qcom,milos-gcc.h
new file mode 100644
index 000000000000..a530ca39e1ef
--- /dev/null
+++ b/include/dt-bindings/clock/qcom,milos-gcc.h
@@ -0,0 +1,210 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (c) 2023, Qualcomm Innovation Center, Inc. All rights reserved.
+ * Copyright (c) 2025, Luca Weiss <luca.weiss@fairphone.com>
+ */
+
+#ifndef _DT_BINDINGS_CLK_QCOM_GCC_MILOS_H
+#define _DT_BINDINGS_CLK_QCOM_GCC_MILOS_H
+
+/* GCC clocks */
+#define GCC_GPLL0						0
+#define GCC_GPLL0_OUT_EVEN					1
+#define GCC_GPLL2						2
+#define GCC_GPLL4						3
+#define GCC_GPLL6						4
+#define GCC_GPLL7						5
+#define GCC_GPLL9						6
+#define GCC_AGGRE_NOC_PCIE_AXI_CLK				7
+#define GCC_AGGRE_UFS_PHY_AXI_CLK				8
+#define GCC_AGGRE_UFS_PHY_AXI_HW_CTL_CLK			9
+#define GCC_AGGRE_USB3_PRIM_AXI_CLK				10
+#define GCC_BOOT_ROM_AHB_CLK					11
+#define GCC_CAMERA_AHB_CLK					12
+#define GCC_CAMERA_HF_AXI_CLK					13
+#define GCC_CAMERA_HF_XO_CLK					14
+#define GCC_CAMERA_SF_AXI_CLK					15
+#define GCC_CAMERA_SF_XO_CLK					16
+#define GCC_CFG_NOC_PCIE_ANOC_AHB_CLK				17
+#define GCC_CFG_NOC_USB3_PRIM_AXI_CLK				18
+#define GCC_CNOC_PCIE_SF_AXI_CLK				19
+#define GCC_DDRSS_GPU_AXI_CLK					20
+#define GCC_DDRSS_PCIE_SF_QTB_CLK				21
+#define GCC_DISP_AHB_CLK					22
+#define GCC_DISP_GPLL0_DIV_CLK_SRC				23
+#define GCC_DISP_HF_AXI_CLK					24
+#define GCC_DISP_XO_CLK						25
+#define GCC_GP1_CLK						26
+#define GCC_GP1_CLK_SRC						27
+#define GCC_GP2_CLK						28
+#define GCC_GP2_CLK_SRC						29
+#define GCC_GP3_CLK						30
+#define GCC_GP3_CLK_SRC						31
+#define GCC_GPU_CFG_AHB_CLK					32
+#define GCC_GPU_GPLL0_CLK_SRC					33
+#define GCC_GPU_GPLL0_DIV_CLK_SRC				34
+#define GCC_GPU_MEMNOC_GFX_CLK					35
+#define GCC_GPU_SNOC_DVM_GFX_CLK				36
+#define GCC_PCIE_0_AUX_CLK					37
+#define GCC_PCIE_0_AUX_CLK_SRC					38
+#define GCC_PCIE_0_CFG_AHB_CLK					39
+#define GCC_PCIE_0_MSTR_AXI_CLK					40
+#define GCC_PCIE_0_PHY_RCHNG_CLK				41
+#define GCC_PCIE_0_PHY_RCHNG_CLK_SRC				42
+#define GCC_PCIE_0_PIPE_CLK					43
+#define GCC_PCIE_0_PIPE_CLK_SRC					44
+#define GCC_PCIE_0_PIPE_DIV2_CLK				45
+#define GCC_PCIE_0_PIPE_DIV2_CLK_SRC				46
+#define GCC_PCIE_0_SLV_AXI_CLK					47
+#define GCC_PCIE_0_SLV_Q2A_AXI_CLK				48
+#define GCC_PCIE_1_AUX_CLK					49
+#define GCC_PCIE_1_AUX_CLK_SRC					50
+#define GCC_PCIE_1_CFG_AHB_CLK					51
+#define GCC_PCIE_1_MSTR_AXI_CLK					52
+#define GCC_PCIE_1_PHY_RCHNG_CLK				53
+#define GCC_PCIE_1_PHY_RCHNG_CLK_SRC				54
+#define GCC_PCIE_1_PIPE_CLK					55
+#define GCC_PCIE_1_PIPE_CLK_SRC					56
+#define GCC_PCIE_1_PIPE_DIV2_CLK				57
+#define GCC_PCIE_1_PIPE_DIV2_CLK_SRC				58
+#define GCC_PCIE_1_SLV_AXI_CLK					59
+#define GCC_PCIE_1_SLV_Q2A_AXI_CLK				60
+#define GCC_PCIE_RSCC_CFG_AHB_CLK				61
+#define GCC_PCIE_RSCC_XO_CLK					62
+#define GCC_PDM2_CLK						63
+#define GCC_PDM2_CLK_SRC					64
+#define GCC_PDM_AHB_CLK						65
+#define GCC_PDM_XO4_CLK						66
+#define GCC_QMIP_CAMERA_NRT_AHB_CLK				67
+#define GCC_QMIP_CAMERA_RT_AHB_CLK				68
+#define GCC_QMIP_DISP_AHB_CLK					69
+#define GCC_QMIP_GPU_AHB_CLK					70
+#define GCC_QMIP_PCIE_AHB_CLK					71
+#define GCC_QMIP_VIDEO_CV_CPU_AHB_CLK				72
+#define GCC_QMIP_VIDEO_CVP_AHB_CLK				73
+#define GCC_QMIP_VIDEO_V_CPU_AHB_CLK				74
+#define GCC_QMIP_VIDEO_VCODEC_AHB_CLK				75
+#define GCC_QUPV3_WRAP0_CORE_2X_CLK				76
+#define GCC_QUPV3_WRAP0_CORE_CLK				77
+#define GCC_QUPV3_WRAP0_QSPI_REF_CLK				78
+#define GCC_QUPV3_WRAP0_QSPI_REF_CLK_SRC			79
+#define GCC_QUPV3_WRAP0_S0_CLK					80
+#define GCC_QUPV3_WRAP0_S0_CLK_SRC				81
+#define GCC_QUPV3_WRAP0_S1_CLK					82
+#define GCC_QUPV3_WRAP0_S1_CLK_SRC				83
+#define GCC_QUPV3_WRAP0_S2_CLK					84
+#define GCC_QUPV3_WRAP0_S2_CLK_SRC				85
+#define GCC_QUPV3_WRAP0_S3_CLK					86
+#define GCC_QUPV3_WRAP0_S3_CLK_SRC				87
+#define GCC_QUPV3_WRAP0_S4_CLK					88
+#define GCC_QUPV3_WRAP0_S4_CLK_SRC				89
+#define GCC_QUPV3_WRAP0_S5_CLK					90
+#define GCC_QUPV3_WRAP0_S5_CLK_SRC				91
+#define GCC_QUPV3_WRAP0_S6_CLK					92
+#define GCC_QUPV3_WRAP0_S6_CLK_SRC				93
+#define GCC_QUPV3_WRAP1_CORE_2X_CLK				94
+#define GCC_QUPV3_WRAP1_CORE_CLK				95
+#define GCC_QUPV3_WRAP1_QSPI_REF_CLK				96
+#define GCC_QUPV3_WRAP1_QSPI_REF_CLK_SRC			97
+#define GCC_QUPV3_WRAP1_S0_CLK					98
+#define GCC_QUPV3_WRAP1_S0_CLK_SRC				99
+#define GCC_QUPV3_WRAP1_S1_CLK					100
+#define GCC_QUPV3_WRAP1_S1_CLK_SRC				101
+#define GCC_QUPV3_WRAP1_S2_CLK					102
+#define GCC_QUPV3_WRAP1_S2_CLK_SRC				103
+#define GCC_QUPV3_WRAP1_S3_CLK					104
+#define GCC_QUPV3_WRAP1_S3_CLK_SRC				105
+#define GCC_QUPV3_WRAP1_S4_CLK					106
+#define GCC_QUPV3_WRAP1_S4_CLK_SRC				107
+#define GCC_QUPV3_WRAP1_S5_CLK					108
+#define GCC_QUPV3_WRAP1_S5_CLK_SRC				109
+#define GCC_QUPV3_WRAP1_S6_CLK					110
+#define GCC_QUPV3_WRAP1_S6_CLK_SRC				111
+#define GCC_QUPV3_WRAP_0_M_AHB_CLK				112
+#define GCC_QUPV3_WRAP_0_S_AHB_CLK				113
+#define GCC_QUPV3_WRAP_1_M_AHB_CLK				114
+#define GCC_QUPV3_WRAP_1_S_AHB_CLK				115
+#define GCC_SDCC1_AHB_CLK					116
+#define GCC_SDCC1_APPS_CLK					117
+#define GCC_SDCC1_APPS_CLK_SRC					118
+#define GCC_SDCC1_ICE_CORE_CLK					119
+#define GCC_SDCC1_ICE_CORE_CLK_SRC				120
+#define GCC_SDCC2_AHB_CLK					121
+#define GCC_SDCC2_APPS_CLK					122
+#define GCC_SDCC2_APPS_CLK_SRC					123
+#define GCC_UFS_PHY_AHB_CLK					124
+#define GCC_UFS_PHY_AXI_CLK					125
+#define GCC_UFS_PHY_AXI_CLK_SRC					126
+#define GCC_UFS_PHY_AXI_HW_CTL_CLK				127
+#define GCC_UFS_PHY_ICE_CORE_CLK				128
+#define GCC_UFS_PHY_ICE_CORE_CLK_SRC				129
+#define GCC_UFS_PHY_ICE_CORE_HW_CTL_CLK				130
+#define GCC_UFS_PHY_PHY_AUX_CLK					131
+#define GCC_UFS_PHY_PHY_AUX_CLK_SRC				132
+#define GCC_UFS_PHY_PHY_AUX_HW_CTL_CLK				133
+#define GCC_UFS_PHY_RX_SYMBOL_0_CLK				134
+#define GCC_UFS_PHY_RX_SYMBOL_0_CLK_SRC				135
+#define GCC_UFS_PHY_RX_SYMBOL_1_CLK				136
+#define GCC_UFS_PHY_RX_SYMBOL_1_CLK_SRC				137
+#define GCC_UFS_PHY_TX_SYMBOL_0_CLK				138
+#define GCC_UFS_PHY_TX_SYMBOL_0_CLK_SRC				139
+#define GCC_UFS_PHY_UNIPRO_CORE_CLK				140
+#define GCC_UFS_PHY_UNIPRO_CORE_CLK_SRC				141
+#define GCC_UFS_PHY_UNIPRO_CORE_HW_CTL_CLK			142
+#define GCC_USB30_PRIM_ATB_CLK					143
+#define GCC_USB30_PRIM_MASTER_CLK				144
+#define GCC_USB30_PRIM_MASTER_CLK_SRC				145
+#define GCC_USB30_PRIM_MOCK_UTMI_CLK				146
+#define GCC_USB30_PRIM_MOCK_UTMI_CLK_SRC			147
+#define GCC_USB30_PRIM_MOCK_UTMI_POSTDIV_CLK_SRC		148
+#define GCC_USB30_PRIM_SLEEP_CLK				149
+#define GCC_USB3_PRIM_PHY_AUX_CLK				150
+#define GCC_USB3_PRIM_PHY_AUX_CLK_SRC				151
+#define GCC_USB3_PRIM_PHY_COM_AUX_CLK				152
+#define GCC_USB3_PRIM_PHY_PIPE_CLK				153
+#define GCC_USB3_PRIM_PHY_PIPE_CLK_SRC				154
+#define GCC_VIDEO_AHB_CLK					155
+#define GCC_VIDEO_AXI0_CLK					156
+#define GCC_VIDEO_XO_CLK					157
+
+/* GCC resets */
+#define GCC_CAMERA_BCR						0
+#define GCC_DISPLAY_BCR						1
+#define GCC_GPU_BCR						2
+#define GCC_PCIE_0_BCR						3
+#define GCC_PCIE_0_LINK_DOWN_BCR				4
+#define GCC_PCIE_0_NOCSR_COM_PHY_BCR				5
+#define GCC_PCIE_0_PHY_BCR					6
+#define GCC_PCIE_0_PHY_NOCSR_COM_PHY_BCR			7
+#define GCC_PCIE_1_BCR						8
+#define GCC_PCIE_1_LINK_DOWN_BCR				9
+#define GCC_PCIE_1_NOCSR_COM_PHY_BCR				10
+#define GCC_PCIE_1_PHY_BCR					11
+#define GCC_PCIE_1_PHY_NOCSR_COM_PHY_BCR			12
+#define GCC_PCIE_RSCC_BCR					13
+#define GCC_PDM_BCR						14
+#define GCC_QUPV3_WRAPPER_0_BCR					15
+#define GCC_QUPV3_WRAPPER_1_BCR					16
+#define GCC_QUSB2PHY_PRIM_BCR					17
+#define GCC_QUSB2PHY_SEC_BCR					18
+#define GCC_SDCC1_BCR						19
+#define GCC_SDCC2_BCR						20
+#define GCC_UFS_PHY_BCR						21
+#define GCC_USB30_PRIM_BCR					22
+#define GCC_USB3_DP_PHY_PRIM_BCR				23
+#define GCC_USB3_PHY_PRIM_BCR					24
+#define GCC_USB3PHY_PHY_PRIM_BCR				25
+#define GCC_VIDEO_AXI0_CLK_ARES					26
+#define GCC_VIDEO_BCR						27
+
+/* GCC power domains */
+#define PCIE_0_GDSC						0
+#define PCIE_0_PHY_GDSC						1
+#define PCIE_1_GDSC						2
+#define PCIE_1_PHY_GDSC						3
+#define UFS_PHY_GDSC						4
+#define UFS_MEM_PHY_GDSC					5
+#define USB30_PRIM_GDSC						6
+#define USB3_PHY_GDSC						7
+
+#endif
-- 
cgit v1.2.3


From dbb9d53b7197b6b13d0137c0ea45902ef26e2bb4 Mon Sep 17 00:00:00 2001
From: Luca Weiss <luca.weiss@fairphone.com>
Date: Tue, 15 Jul 2025 09:19:04 +0200
Subject: dt-bindings: clock: qcom: document the Milos Camera Clock Controller

Add bindings documentation for the Milos (e.g. SM7635) Camera Clock Controller.

Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Luca Weiss <luca.weiss@fairphone.com>
Link: https://lore.kernel.org/r/20250715-sm7635-clocks-v3-4-18f9faac4984@fairphone.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 .../bindings/clock/qcom,milos-camcc.yaml           |  51 ++++++++
 include/dt-bindings/clock/qcom,milos-camcc.h       | 131 +++++++++++++++++++++
 2 files changed, 182 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/clock/qcom,milos-camcc.yaml
 create mode 100644 include/dt-bindings/clock/qcom,milos-camcc.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/qcom,milos-camcc.yaml b/Documentation/devicetree/bindings/clock/qcom,milos-camcc.yaml
new file mode 100644
index 000000000000..f63149ecf3e1
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,milos-camcc.yaml
@@ -0,0 +1,51 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,milos-camcc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Camera Clock & Reset Controller on Milos
+
+maintainers:
+  - Luca Weiss <luca.weiss@fairphone.com>
+
+description: |
+  Qualcomm camera clock control module provides the clocks, resets and power
+  domains on Milos.
+
+  See also: include/dt-bindings/clock/qcom,milos-camcc.h
+
+properties:
+  compatible:
+    const: qcom,milos-camcc
+
+  clocks:
+    items:
+      - description: Board XO source
+      - description: Sleep clock source
+      - description: Camera AHB clock from GCC
+
+required:
+  - compatible
+  - clocks
+
+allOf:
+  - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/clock/qcom,milos-gcc.h>
+    clock-controller@adb0000 {
+        compatible = "qcom,milos-camcc";
+        reg = <0x0adb0000 0x40000>;
+        clocks = <&bi_tcxo_div2>,
+                 <&sleep_clk>,
+                 <&gcc GCC_CAMERA_AHB_CLK>;
+        #clock-cells = <1>;
+        #reset-cells = <1>;
+        #power-domain-cells = <1>;
+    };
+
+...
diff --git a/include/dt-bindings/clock/qcom,milos-camcc.h b/include/dt-bindings/clock/qcom,milos-camcc.h
new file mode 100644
index 000000000000..21925dca9a20
--- /dev/null
+++ b/include/dt-bindings/clock/qcom,milos-camcc.h
@@ -0,0 +1,131 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (c) 2023, Qualcomm Innovation Center, Inc. All rights reserved.
+ * Copyright (c) 2025, Luca Weiss <luca.weiss@fairphone.com>
+ */
+
+#ifndef _DT_BINDINGS_CLK_QCOM_CAM_CC_MILOS_H
+#define _DT_BINDINGS_CLK_QCOM_CAM_CC_MILOS_H
+
+/* CAM_CC clocks */
+#define CAM_CC_PLL0						0
+#define CAM_CC_PLL0_OUT_EVEN					1
+#define CAM_CC_PLL0_OUT_ODD					2
+#define CAM_CC_PLL1						3
+#define CAM_CC_PLL1_OUT_EVEN					4
+#define CAM_CC_PLL2						5
+#define CAM_CC_PLL2_OUT_EVEN					6
+#define CAM_CC_PLL3						7
+#define CAM_CC_PLL3_OUT_EVEN					8
+#define CAM_CC_PLL4						9
+#define CAM_CC_PLL4_OUT_EVEN					10
+#define CAM_CC_PLL5						11
+#define CAM_CC_PLL5_OUT_EVEN					12
+#define CAM_CC_PLL6						13
+#define CAM_CC_PLL6_OUT_EVEN					14
+#define CAM_CC_BPS_AHB_CLK					15
+#define CAM_CC_BPS_AREG_CLK					16
+#define CAM_CC_BPS_CLK						17
+#define CAM_CC_BPS_CLK_SRC					18
+#define CAM_CC_CAMNOC_ATB_CLK					19
+#define CAM_CC_CAMNOC_AXI_CLK_SRC				20
+#define CAM_CC_CAMNOC_AXI_HF_CLK				21
+#define CAM_CC_CAMNOC_AXI_SF_CLK				22
+#define CAM_CC_CAMNOC_NRT_AXI_CLK				23
+#define CAM_CC_CAMNOC_RT_AXI_CLK				24
+#define CAM_CC_CCI_0_CLK					25
+#define CAM_CC_CCI_0_CLK_SRC					26
+#define CAM_CC_CCI_1_CLK					27
+#define CAM_CC_CCI_1_CLK_SRC					28
+#define CAM_CC_CORE_AHB_CLK					29
+#define CAM_CC_CPAS_AHB_CLK					30
+#define CAM_CC_CPHY_RX_CLK_SRC					31
+#define CAM_CC_CRE_AHB_CLK					32
+#define CAM_CC_CRE_CLK						33
+#define CAM_CC_CRE_CLK_SRC					34
+#define CAM_CC_CSI0PHYTIMER_CLK					35
+#define CAM_CC_CSI0PHYTIMER_CLK_SRC				36
+#define CAM_CC_CSI1PHYTIMER_CLK					37
+#define CAM_CC_CSI1PHYTIMER_CLK_SRC				38
+#define CAM_CC_CSI2PHYTIMER_CLK					39
+#define CAM_CC_CSI2PHYTIMER_CLK_SRC				40
+#define CAM_CC_CSI3PHYTIMER_CLK					41
+#define CAM_CC_CSI3PHYTIMER_CLK_SRC				42
+#define CAM_CC_CSIPHY0_CLK					43
+#define CAM_CC_CSIPHY1_CLK					44
+#define CAM_CC_CSIPHY2_CLK					45
+#define CAM_CC_CSIPHY3_CLK					46
+#define CAM_CC_FAST_AHB_CLK_SRC					47
+#define CAM_CC_GDSC_CLK						48
+#define CAM_CC_ICP_ATB_CLK					49
+#define CAM_CC_ICP_CLK						50
+#define CAM_CC_ICP_CLK_SRC					51
+#define CAM_CC_ICP_CTI_CLK					52
+#define CAM_CC_ICP_TS_CLK					53
+#define CAM_CC_MCLK0_CLK					54
+#define CAM_CC_MCLK0_CLK_SRC					55
+#define CAM_CC_MCLK1_CLK					56
+#define CAM_CC_MCLK1_CLK_SRC					57
+#define CAM_CC_MCLK2_CLK					58
+#define CAM_CC_MCLK2_CLK_SRC					59
+#define CAM_CC_MCLK3_CLK					60
+#define CAM_CC_MCLK3_CLK_SRC					61
+#define CAM_CC_MCLK4_CLK					62
+#define CAM_CC_MCLK4_CLK_SRC					63
+#define CAM_CC_OPE_0_AHB_CLK					64
+#define CAM_CC_OPE_0_AREG_CLK					65
+#define CAM_CC_OPE_0_CLK					66
+#define CAM_CC_OPE_0_CLK_SRC					67
+#define CAM_CC_SLEEP_CLK					68
+#define CAM_CC_SLEEP_CLK_SRC					69
+#define CAM_CC_SLOW_AHB_CLK_SRC					70
+#define CAM_CC_SOC_AHB_CLK					71
+#define CAM_CC_SYS_TMR_CLK					72
+#define CAM_CC_TFE_0_AHB_CLK					73
+#define CAM_CC_TFE_0_CLK					74
+#define CAM_CC_TFE_0_CLK_SRC					75
+#define CAM_CC_TFE_0_CPHY_RX_CLK				76
+#define CAM_CC_TFE_0_CSID_CLK					77
+#define CAM_CC_TFE_0_CSID_CLK_SRC				78
+#define CAM_CC_TFE_1_AHB_CLK					79
+#define CAM_CC_TFE_1_CLK					80
+#define CAM_CC_TFE_1_CLK_SRC					81
+#define CAM_CC_TFE_1_CPHY_RX_CLK				82
+#define CAM_CC_TFE_1_CSID_CLK					83
+#define CAM_CC_TFE_1_CSID_CLK_SRC				84
+#define CAM_CC_TFE_2_AHB_CLK					85
+#define CAM_CC_TFE_2_CLK					86
+#define CAM_CC_TFE_2_CLK_SRC					87
+#define CAM_CC_TFE_2_CPHY_RX_CLK				88
+#define CAM_CC_TFE_2_CSID_CLK					89
+#define CAM_CC_TFE_2_CSID_CLK_SRC				90
+#define CAM_CC_TOP_SHIFT_CLK					91
+#define CAM_CC_XO_CLK_SRC					92
+
+/* CAM_CC resets */
+#define CAM_CC_BPS_BCR						0
+#define CAM_CC_CAMNOC_BCR					1
+#define CAM_CC_CAMSS_TOP_BCR					2
+#define CAM_CC_CCI_0_BCR					3
+#define CAM_CC_CCI_1_BCR					4
+#define CAM_CC_CPAS_BCR						5
+#define CAM_CC_CRE_BCR						6
+#define CAM_CC_CSI0PHY_BCR					7
+#define CAM_CC_CSI1PHY_BCR					8
+#define CAM_CC_CSI2PHY_BCR					9
+#define CAM_CC_CSI3PHY_BCR					10
+#define CAM_CC_ICP_BCR						11
+#define CAM_CC_MCLK0_BCR					12
+#define CAM_CC_MCLK1_BCR					13
+#define CAM_CC_MCLK2_BCR					14
+#define CAM_CC_MCLK3_BCR					15
+#define CAM_CC_MCLK4_BCR					16
+#define CAM_CC_OPE_0_BCR					17
+#define CAM_CC_TFE_0_BCR					18
+#define CAM_CC_TFE_1_BCR					19
+#define CAM_CC_TFE_2_BCR					20
+
+/* CAM_CC power domains */
+#define CAM_CC_CAMSS_TOP_GDSC					0
+
+#endif
-- 
cgit v1.2.3


From 63edb206a3a93f523579df7f49f2989aae4e8450 Mon Sep 17 00:00:00 2001
From: Luca Weiss <luca.weiss@fairphone.com>
Date: Tue, 15 Jul 2025 09:19:06 +0200
Subject: dt-bindings: clock: qcom: document the Milos Display Clock Controller

Add bindings documentation for the Milos (e.g. SM7635) Display Clock
Controller.

Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Luca Weiss <luca.weiss@fairphone.com>
Link: https://lore.kernel.org/r/20250715-sm7635-clocks-v3-6-18f9faac4984@fairphone.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 .../bindings/clock/qcom,milos-dispcc.yaml          | 63 ++++++++++++++++++++++
 include/dt-bindings/clock/qcom,milos-dispcc.h      | 61 +++++++++++++++++++++
 2 files changed, 124 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/clock/qcom,milos-dispcc.yaml
 create mode 100644 include/dt-bindings/clock/qcom,milos-dispcc.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/qcom,milos-dispcc.yaml b/Documentation/devicetree/bindings/clock/qcom,milos-dispcc.yaml
new file mode 100644
index 000000000000..94908804756b
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,milos-dispcc.yaml
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,milos-dispcc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Display Clock & Reset Controller on Milos
+
+maintainers:
+  - Luca Weiss <luca.weiss@fairphone.com>
+
+description: |
+  Qualcomm display clock control module provides the clocks, resets and power
+  domains on Milos.
+
+  See also: include/dt-bindings/clock/qcom,milos-dispcc.h
+
+properties:
+  compatible:
+    const: qcom,milos-dispcc
+
+  clocks:
+    items:
+      - description: Board XO source
+      - description: Sleep clock source
+      - description: Display's AHB clock
+      - description: GPLL0 source from GCC
+      - description: Byte clock from DSI PHY0
+      - description: Pixel clock from DSI PHY0
+      - description: Link clock from DP PHY0
+      - description: VCO DIV clock from DP PHY0
+
+required:
+  - compatible
+  - clocks
+  - '#power-domain-cells'
+
+allOf:
+  - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/clock/qcom,milos-gcc.h>
+    #include <dt-bindings/phy/phy-qcom-qmp.h>
+    clock-controller@af00000 {
+        compatible = "qcom,milos-dispcc";
+        reg = <0x0af00000 0x20000>;
+        clocks = <&bi_tcxo_div2>,
+                 <&sleep_clk>,
+                 <&gcc GCC_DISP_AHB_CLK>,
+                 <&gcc GCC_DISP_GPLL0_DIV_CLK_SRC>,
+                 <&mdss_dsi0_phy 0>,
+                 <&mdss_dsi0_phy 1>,
+                 <&usb_dp_qmpphy QMP_USB43DP_DP_LINK_CLK>,
+                 <&usb_dp_qmpphy QMP_USB43DP_DP_VCO_DIV_CLK>;
+        #clock-cells = <1>;
+        #reset-cells = <1>;
+        #power-domain-cells = <1>;
+    };
+
+...
diff --git a/include/dt-bindings/clock/qcom,milos-dispcc.h b/include/dt-bindings/clock/qcom,milos-dispcc.h
new file mode 100644
index 000000000000..c70f23f32f0a
--- /dev/null
+++ b/include/dt-bindings/clock/qcom,milos-dispcc.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (c) 2023, Qualcomm Innovation Center, Inc. All rights reserved.
+ * Copyright (c) 2025, Luca Weiss <luca.weiss@fairphone.com>
+ */
+
+#ifndef _DT_BINDINGS_CLK_QCOM_DISP_CC_MILOS_H
+#define _DT_BINDINGS_CLK_QCOM_DISP_CC_MILOS_H
+
+/* DISP_CC clocks */
+#define DISP_CC_PLL0						0
+#define DISP_CC_MDSS_ACCU_CLK					1
+#define DISP_CC_MDSS_AHB1_CLK					2
+#define DISP_CC_MDSS_AHB_CLK					3
+#define DISP_CC_MDSS_AHB_CLK_SRC				4
+#define DISP_CC_MDSS_BYTE0_CLK					5
+#define DISP_CC_MDSS_BYTE0_CLK_SRC				6
+#define DISP_CC_MDSS_BYTE0_DIV_CLK_SRC				7
+#define DISP_CC_MDSS_BYTE0_INTF_CLK				8
+#define DISP_CC_MDSS_DPTX0_AUX_CLK				9
+#define DISP_CC_MDSS_DPTX0_AUX_CLK_SRC				10
+#define DISP_CC_MDSS_DPTX0_CRYPTO_CLK				11
+#define DISP_CC_MDSS_DPTX0_LINK_CLK				12
+#define DISP_CC_MDSS_DPTX0_LINK_CLK_SRC				13
+#define DISP_CC_MDSS_DPTX0_LINK_DIV_CLK_SRC			14
+#define DISP_CC_MDSS_DPTX0_LINK_INTF_CLK			15
+#define DISP_CC_MDSS_DPTX0_PIXEL0_CLK				16
+#define DISP_CC_MDSS_DPTX0_PIXEL0_CLK_SRC			17
+#define DISP_CC_MDSS_DPTX0_PIXEL1_CLK				18
+#define DISP_CC_MDSS_DPTX0_PIXEL1_CLK_SRC			19
+#define DISP_CC_MDSS_DPTX0_USB_ROUTER_LINK_INTF_CLK		20
+#define DISP_CC_MDSS_ESC0_CLK					21
+#define DISP_CC_MDSS_ESC0_CLK_SRC				22
+#define DISP_CC_MDSS_MDP1_CLK					23
+#define DISP_CC_MDSS_MDP_CLK					24
+#define DISP_CC_MDSS_MDP_CLK_SRC				25
+#define DISP_CC_MDSS_MDP_LUT1_CLK				26
+#define DISP_CC_MDSS_MDP_LUT_CLK				27
+#define DISP_CC_MDSS_NON_GDSC_AHB_CLK				28
+#define DISP_CC_MDSS_PCLK0_CLK					29
+#define DISP_CC_MDSS_PCLK0_CLK_SRC				30
+#define DISP_CC_MDSS_RSCC_AHB_CLK				31
+#define DISP_CC_MDSS_RSCC_VSYNC_CLK				32
+#define DISP_CC_MDSS_VSYNC1_CLK					33
+#define DISP_CC_MDSS_VSYNC_CLK					34
+#define DISP_CC_MDSS_VSYNC_CLK_SRC				35
+#define DISP_CC_SLEEP_CLK					36
+#define DISP_CC_SLEEP_CLK_SRC					37
+#define DISP_CC_XO_CLK						38
+#define DISP_CC_XO_CLK_SRC					39
+
+/* DISP_CC resets */
+#define DISP_CC_MDSS_CORE_BCR					0
+#define DISP_CC_MDSS_CORE_INT2_BCR				1
+#define DISP_CC_MDSS_RSCC_BCR					2
+
+/* DISP_CC power domains */
+#define DISP_CC_MDSS_CORE_GDSC					0
+#define DISP_CC_MDSS_CORE_INT2_GDSC				1
+
+#endif
-- 
cgit v1.2.3


From 7e5368a14b8c295470ab07d2a9ad8ee9bf7187ee Mon Sep 17 00:00:00 2001
From: Luca Weiss <luca.weiss@fairphone.com>
Date: Tue, 15 Jul 2025 09:19:08 +0200
Subject: dt-bindings: clock: qcom: document the Milos GPU Clock Controller

Add bindings documentation for the Milos (e.g. SM7635) Graphics Clock
Controller.

Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Luca Weiss <luca.weiss@fairphone.com>
Link: https://lore.kernel.org/r/20250715-sm7635-clocks-v3-8-18f9faac4984@fairphone.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 .../bindings/clock/qcom,sm8450-gpucc.yaml          |  2 +
 include/dt-bindings/clock/qcom,milos-gpucc.h       | 56 ++++++++++++++++++++++
 2 files changed, 58 insertions(+)
 create mode 100644 include/dt-bindings/clock/qcom,milos-gpucc.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/qcom,sm8450-gpucc.yaml b/Documentation/devicetree/bindings/clock/qcom,sm8450-gpucc.yaml
index 02968632fb3a..44380f6f8136 100644
--- a/Documentation/devicetree/bindings/clock/qcom,sm8450-gpucc.yaml
+++ b/Documentation/devicetree/bindings/clock/qcom,sm8450-gpucc.yaml
@@ -14,6 +14,7 @@ description: |
   domains on Qualcomm SoCs.
 
   See also::
+    include/dt-bindings/clock/qcom,milos-gpucc.h
     include/dt-bindings/clock/qcom,sar2130p-gpucc.h
     include/dt-bindings/clock/qcom,sm4450-gpucc.h
     include/dt-bindings/clock/qcom,sm8450-gpucc.h
@@ -25,6 +26,7 @@ description: |
 properties:
   compatible:
     enum:
+      - qcom,milos-gpucc
       - qcom,sar2130p-gpucc
       - qcom,sm4450-gpucc
       - qcom,sm8450-gpucc
diff --git a/include/dt-bindings/clock/qcom,milos-gpucc.h b/include/dt-bindings/clock/qcom,milos-gpucc.h
new file mode 100644
index 000000000000..6ff1925d409f
--- /dev/null
+++ b/include/dt-bindings/clock/qcom,milos-gpucc.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (c) 2023, Qualcomm Innovation Center, Inc. All rights reserved.
+ * Copyright (c) 2025, Luca Weiss <luca.weiss@fairphone.com>
+ */
+
+#ifndef _DT_BINDINGS_CLK_QCOM_GPU_CC_MILOS_H
+#define _DT_BINDINGS_CLK_QCOM_GPU_CC_MILOS_H
+
+/* GPU_CC clocks */
+#define GPU_CC_PLL0						0
+#define GPU_CC_PLL0_OUT_EVEN					1
+#define GPU_CC_AHB_CLK						2
+#define GPU_CC_CB_CLK						3
+#define GPU_CC_CX_ACCU_SHIFT_CLK				4
+#define GPU_CC_CX_FF_CLK					5
+#define GPU_CC_CX_GMU_CLK					6
+#define GPU_CC_CXO_AON_CLK					7
+#define GPU_CC_CXO_CLK						8
+#define GPU_CC_DEMET_CLK					9
+#define GPU_CC_DEMET_DIV_CLK_SRC				10
+#define GPU_CC_DPM_CLK						11
+#define GPU_CC_FF_CLK_SRC					12
+#define GPU_CC_FREQ_MEASURE_CLK					13
+#define GPU_CC_GMU_CLK_SRC					14
+#define GPU_CC_GX_ACCU_SHIFT_CLK				15
+#define GPU_CC_GX_ACD_AHB_FF_CLK				16
+#define GPU_CC_GX_AHB_FF_CLK					17
+#define GPU_CC_GX_GMU_CLK					18
+#define GPU_CC_GX_RCG_AHB_FF_CLK				19
+#define GPU_CC_HLOS1_VOTE_GPU_SMMU_CLK				20
+#define GPU_CC_HUB_AON_CLK					21
+#define GPU_CC_HUB_CLK_SRC					22
+#define GPU_CC_HUB_CX_INT_CLK					23
+#define GPU_CC_HUB_DIV_CLK_SRC					24
+#define GPU_CC_MEMNOC_GFX_CLK					25
+#define GPU_CC_RSCC_HUB_AON_CLK					26
+#define GPU_CC_RSCC_XO_AON_CLK					27
+#define GPU_CC_SLEEP_CLK					28
+#define GPU_CC_XO_CLK_SRC					29
+#define GPU_CC_XO_DIV_CLK_SRC					30
+
+/* GPU_CC resets */
+#define GPU_CC_CB_BCR						0
+#define GPU_CC_CX_BCR						1
+#define GPU_CC_FAST_HUB_BCR					2
+#define GPU_CC_FF_BCR						3
+#define GPU_CC_GMU_BCR						4
+#define GPU_CC_GX_BCR						5
+#define GPU_CC_RBCPR_BCR					6
+#define GPU_CC_XO_BCR						7
+
+/* GPU_CC power domains */
+#define GPU_CC_CX_GDSC						0
+
+#endif
-- 
cgit v1.2.3


From a4937e9741867865bb307ae9dde6ef393b68540b Mon Sep 17 00:00:00 2001
From: Luca Weiss <luca.weiss@fairphone.com>
Date: Tue, 15 Jul 2025 09:19:10 +0200
Subject: dt-bindings: clock: qcom: document the Milos Video Clock Controller

Add bindings documentation for the Milos (e.g. SM7635) Video Clock
Controller.

Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Luca Weiss <luca.weiss@fairphone.com>
Link: https://lore.kernel.org/r/20250715-sm7635-clocks-v3-10-18f9faac4984@fairphone.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 .../bindings/clock/qcom,milos-videocc.yaml         | 53 ++++++++++++++++++++++
 include/dt-bindings/clock/qcom,milos-videocc.h     | 36 +++++++++++++++
 2 files changed, 89 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/clock/qcom,milos-videocc.yaml
 create mode 100644 include/dt-bindings/clock/qcom,milos-videocc.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/qcom,milos-videocc.yaml b/Documentation/devicetree/bindings/clock/qcom,milos-videocc.yaml
new file mode 100644
index 000000000000..14c31efe1308
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/qcom,milos-videocc.yaml
@@ -0,0 +1,53 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/clock/qcom,milos-videocc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Video Clock & Reset Controller on Milos
+
+maintainers:
+  - Luca Weiss <luca.weiss@fairphone.com>
+
+description: |
+  Qualcomm video clock control module provides the clocks, resets and power
+  domains on Milos.
+
+  See also: include/dt-bindings/clock/qcom,milos-videocc.h
+
+properties:
+  compatible:
+    const: qcom,milos-videocc
+
+  clocks:
+    items:
+      - description: Board XO source
+      - description: Board active XO source
+      - description: Sleep clock source
+      - description: Video AHB clock from GCC
+
+required:
+  - compatible
+  - clocks
+
+allOf:
+  - $ref: qcom,gcc.yaml#
+
+unevaluatedProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/clock/qcom,milos-gcc.h>
+    clock-controller@aaf0000 {
+        compatible = "qcom,milos-videocc";
+        reg = <0x0aaf0000 0x10000>;
+        clocks = <&bi_tcxo_div2>,
+                 <&bi_tcxo_ao_div2>,
+                 <&sleep_clk>,
+                 <&gcc GCC_VIDEO_AHB_CLK>;
+        #clock-cells = <1>;
+        #reset-cells = <1>;
+        #power-domain-cells = <1>;
+    };
+
+...
diff --git a/include/dt-bindings/clock/qcom,milos-videocc.h b/include/dt-bindings/clock/qcom,milos-videocc.h
new file mode 100644
index 000000000000..3544db81ffae
--- /dev/null
+++ b/include/dt-bindings/clock/qcom,milos-videocc.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (c) 2023, Qualcomm Innovation Center, Inc. All rights reserved.
+ * Copyright (c) 2025, Luca Weiss <luca.weiss@fairphone.com>
+ */
+
+#ifndef _DT_BINDINGS_CLK_QCOM_VIDEO_CC_MILOS_H
+#define _DT_BINDINGS_CLK_QCOM_VIDEO_CC_MILOS_H
+
+/* VIDEO_CC clocks */
+#define VIDEO_CC_PLL0						0
+#define VIDEO_CC_AHB_CLK					1
+#define VIDEO_CC_AHB_CLK_SRC					2
+#define VIDEO_CC_MVS0_CLK					3
+#define VIDEO_CC_MVS0_CLK_SRC					4
+#define VIDEO_CC_MVS0_DIV_CLK_SRC				5
+#define VIDEO_CC_MVS0_SHIFT_CLK					6
+#define VIDEO_CC_MVS0C_CLK					7
+#define VIDEO_CC_MVS0C_DIV2_DIV_CLK_SRC				8
+#define VIDEO_CC_MVS0C_SHIFT_CLK				9
+#define VIDEO_CC_SLEEP_CLK					10
+#define VIDEO_CC_SLEEP_CLK_SRC					11
+#define VIDEO_CC_XO_CLK						12
+#define VIDEO_CC_XO_CLK_SRC					13
+
+/* VIDEO_CC resets */
+#define VIDEO_CC_INTERFACE_BCR					0
+#define VIDEO_CC_MVS0_BCR					1
+#define VIDEO_CC_MVS0C_CLK_ARES					2
+#define VIDEO_CC_MVS0C_BCR					3
+
+/* VIDEO_CC power domains */
+#define VIDEO_CC_MVS0_GDSC					0
+#define VIDEO_CC_MVS0C_GDSC					1
+
+#endif
-- 
cgit v1.2.3


From 2d72afb340657f03f7261e9243b44457a9228ac7 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 16 Jul 2025 20:39:14 +0200
Subject: netfilter: nf_conntrack: fix crash due to removal of uninitialised
 entry

A crash in conntrack was reported while trying to unlink the conntrack
entry from the hash bucket list:
    [exception RIP: __nf_ct_delete_from_lists+172]
    [..]
 #7 [ff539b5a2b043aa0] nf_ct_delete at ffffffffc124d421 [nf_conntrack]
 #8 [ff539b5a2b043ad0] nf_ct_gc_expired at ffffffffc124d999 [nf_conntrack]
 #9 [ff539b5a2b043ae0] __nf_conntrack_find_get at ffffffffc124efbc [nf_conntrack]
    [..]

The nf_conn struct is marked as allocated from slab but appears to be in
a partially initialised state:

 ct hlist pointer is garbage; looks like the ct hash value
 (hence crash).
 ct->status is equal to IPS_CONFIRMED|IPS_DYING, which is expected
 ct->timeout is 30000 (=30s), which is unexpected.

Everything else looks like a normal udp conntrack entry.  If we ignore
ct->status and pretend it's 0, the entry matches those that are newly
allocated but not yet inserted into the hash:
  - ct hlist pointers are overloaded and store/cache the raw tuple hash
  - ct->timeout matches the relative time expected for a new udp flow
    rather than the absolute 'jiffies' value.

If it were not for the presence of IPS_CONFIRMED,
__nf_conntrack_find_get() would have skipped the entry.

The theory is that we hit the following race:

cpu x 			cpu y			cpu z
 found entry E		found entry E
 E is expired		<preemption>
 nf_ct_delete()
 return E to rcu slab
					init_conntrack
					E is re-inited,
					ct->status set to 0
					reply tuplehash hnnode.pprev
					stores hash value.

cpu y found E right before it was deleted on cpu x.
E is now re-inited on cpu z.  cpu y was preempted before
checking for expiry and/or confirm bit.

					->refcnt set to 1
					E now owned by skb
					->timeout set to 30000

If cpu y were to resume now, it would observe E as
expired but would skip E due to missing CONFIRMED bit.

					nf_conntrack_confirm gets called
					sets: ct->status |= CONFIRMED
					This is wrong: E is not yet added
					to hashtable.

cpu y resumes, it observes E as expired but CONFIRMED:
			<resumes>
			nf_ct_expired()
			 -> yes (ct->timeout is 30s)
			confirmed bit set.

cpu y will try to delete E from the hashtable:
			nf_ct_delete() -> set DYING bit
			__nf_ct_delete_from_lists

Even this scenario doesn't guarantee a crash:
cpu z still holds the table bucket lock(s) so y blocks:

			wait for spinlock held by z

					CONFIRMED is set but there is no
					guarantee ct will be added to hash:
					"chaintoolong" or "clash resolution"
					logic both skip the insert step.
					reply hnnode.pprev still stores the
					hash value.

					unlocks spinlock
					return NF_DROP
			<unblocks, then
			 crashes on hlist_nulls_del_rcu pprev>

In case CPU z does insert the entry into the hashtable, cpu y will unlink
E again right away but no crash occurs.

Without 'cpu y' race, 'garbage' hlist is of no consequence:
ct refcnt remains at 1, eventually skb will be free'd and E gets
destroyed via: nf_conntrack_put -> nf_conntrack_destroy -> nf_ct_destroy.

To resolve this, move the IPS_CONFIRMED assignment after the table
insertion but before the unlock.

Pablo points out that the confirm-bit-store could be reordered to happen
before the hlist add or the timeout fixup, so switch to set_bit() and an
smp_mb__before_atomic() memory barrier to prevent this.

It doesn't matter if other CPUs can observe a newly inserted entry right
before the CONFIRMED bit was set:

Such event cannot be distinguished from above "E is the old incarnation"
case: the entry will be skipped.

Also change nf_ct_should_gc() to first check the confirmed bit.

The gc sequence is:
 1. Check if entry has expired, if not skip to next entry
 2. Obtain a reference to the expired entry.
 3. Call nf_ct_should_gc() to double-check step 1.

nf_ct_should_gc() is thus called only for entries that already failed an
expiry check. After this patch, once the confirmed bit check passes
ct->timeout has been altered to reflect the absolute 'best before' date
instead of a relative time.  Step 3 will therefore not remove the entry.

Without this change to nf_ct_should_gc() we could still get this sequence:

 1. Check if entry has expired.
 2. Obtain a reference.
 3. Call nf_ct_should_gc() to double-check step 1:
    4 - entry is still observed as expired
    5 - meanwhile, ct->timeout is corrected to absolute value on other CPU
      and confirm bit gets set
    6 - confirm bit is seen
    7 - valid entry is removed again

First do check 6), then 4) so the gc expiry check always picks up either
confirmed bit unset (entry gets skipped) or expiry re-check failure for
re-inited conntrack objects.

This change cannot be backported to releases before 5.19. Without
commit 8a75a2c17410 ("netfilter: conntrack: remove unconfirmed list")
the '|= IPS_CONFIRMED' line cannot be moved without further changes.

Cc: Razvan Cojocaru <rzvncj@gmail.com>
Link: https://lore.kernel.org/netfilter-devel/20250627142758.25664-1-fw@strlen.de/
Link: https://lore.kernel.org/netfilter-devel/4239da15-83ff-4ca4-939d-faef283471bb@gmail.com/
Fixes: 1397af5bfd7d ("netfilter: conntrack: remove the percpu dying list")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack.h | 15 +++++++++++++--
 net/netfilter/nf_conntrack_core.c    | 26 ++++++++++++++++++++------
 2 files changed, 33 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 3f02a45773e8..ca26274196b9 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -306,8 +306,19 @@ static inline bool nf_ct_is_expired(const struct nf_conn *ct)
 /* use after obtaining a reference count */
 static inline bool nf_ct_should_gc(const struct nf_conn *ct)
 {
-	return nf_ct_is_expired(ct) && nf_ct_is_confirmed(ct) &&
-	       !nf_ct_is_dying(ct);
+	if (!nf_ct_is_confirmed(ct))
+		return false;
+
+	/* load ct->timeout after is_confirmed() test.
+	 * Pairs with __nf_conntrack_confirm() which:
+	 * 1. Increases ct->timeout value
+	 * 2. Inserts ct into rcu hlist
+	 * 3. Sets the confirmed bit
+	 * 4. Unlocks the hlist lock
+	 */
+	smp_acquire__after_ctrl_dep();
+
+	return nf_ct_is_expired(ct) && !nf_ct_is_dying(ct);
 }
 
 #define	NF_CT_DAY	(86400 * HZ)
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 201d3c4ec623..e51f0b441109 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1124,6 +1124,12 @@ static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
 
 	hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
 				 &nf_conntrack_hash[repl_idx]);
+	/* confirmed bit must be set after hlist add, not before:
+	 * loser_ct can still be visible to other cpu due to
+	 * SLAB_TYPESAFE_BY_RCU.
+	 */
+	smp_mb__before_atomic();
+	set_bit(IPS_CONFIRMED_BIT, &loser_ct->status);
 
 	NF_CT_STAT_INC(net, clash_resolve);
 	return NF_ACCEPT;
@@ -1260,8 +1266,6 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 	 * user context, else we insert an already 'dead' hash, blocking
 	 * further use of that particular connection -JM.
 	 */
-	ct->status |= IPS_CONFIRMED;
-
 	if (unlikely(nf_ct_is_dying(ct))) {
 		NF_CT_STAT_INC(net, insert_failed);
 		goto dying;
@@ -1293,7 +1297,7 @@ chaintoolong:
 		}
 	}
 
-	/* Timer relative to confirmation time, not original
+	/* Timeout is relative to confirmation time, not original
 	   setting time, otherwise we'd get timer wrap in
 	   weird delay cases. */
 	ct->timeout += nfct_time_stamp;
@@ -1301,11 +1305,21 @@ chaintoolong:
 	__nf_conntrack_insert_prepare(ct);
 
 	/* Since the lookup is lockless, hash insertion must be done after
-	 * starting the timer and setting the CONFIRMED bit. The RCU barriers
-	 * guarantee that no other CPU can find the conntrack before the above
-	 * stores are visible.
+	 * setting ct->timeout. The RCU barriers guarantee that no other CPU
+	 * can find the conntrack before the above stores are visible.
 	 */
 	__nf_conntrack_hash_insert(ct, hash, reply_hash);
+
+	/* IPS_CONFIRMED unset means 'ct not (yet) in hash', conntrack lookups
+	 * skip entries that lack this bit.  This happens when a CPU is looking
+	 * at a stale entry that is being recycled due to SLAB_TYPESAFE_BY_RCU
+	 * or when another CPU encounters this entry right after the insertion
+	 * but before the set-confirm-bit below.  This bit must not be set until
+	 * after __nf_conntrack_hash_insert().
+	 */
+	smp_mb__before_atomic();
+	set_bit(IPS_CONFIRMED_BIT, &ct->status);
+
 	nf_conntrack_double_unlock(hash, reply_hash);
 	local_bh_enable();
 
-- 
cgit v1.2.3


From ddb8172cdf8854a215ce23ad0f20b2578fa512db Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 8 Jul 2025 16:33:44 +0300
Subject: watchdog: Don't use "proxy" headers

Update header inclusions to follow IWYU (Include What You Use)
principle.

Note that including kernel.h is discouraged, as stated in the comment
at the top of that file.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Link: https://lore.kernel.org/r/20250708133646.70384-3-andriy.shevchenko@linux.intel.com
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
---
 drivers/watchdog/watchdog_core.h       |  8 +++++++-
 drivers/watchdog/watchdog_pretimeout.c |  2 ++
 include/linux/watchdog.h               | 12 ++++++++----
 3 files changed, 17 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/watchdog/watchdog_core.h b/drivers/watchdog/watchdog_core.h
index 5b35a8439e26..ab825d9f9248 100644
--- a/drivers/watchdog/watchdog_core.h
+++ b/drivers/watchdog/watchdog_core.h
@@ -24,8 +24,14 @@
  *	This material is provided "AS-IS" and at no charge.
  */
 
-#include <linux/hrtimer.h>
+#include <linux/cdev.h>
+#include <linux/device.h>
+#include <linux/hrtimer_types.h>
+#include <linux/init.h>
 #include <linux/kthread.h>
+#include <linux/mutex_types.h>
+#include <linux/types.h>
+#include <linux/watchdog.h>
 
 #define MAX_DOGS	32	/* Maximum number of watchdog devices */
 
diff --git a/drivers/watchdog/watchdog_pretimeout.c b/drivers/watchdog/watchdog_pretimeout.c
index e5295c990fa1..2526436dc74d 100644
--- a/drivers/watchdog/watchdog_pretimeout.c
+++ b/drivers/watchdog/watchdog_pretimeout.c
@@ -7,6 +7,8 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/string.h>
+#include <linux/sysfs.h>
+#include <linux/types.h>
 #include <linux/watchdog.h>
 
 #include "watchdog_core.h"
diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h
index 99660197a36c..8c60687a3e55 100644
--- a/include/linux/watchdog.h
+++ b/include/linux/watchdog.h
@@ -9,14 +9,18 @@
 #ifndef _LINUX_WATCHDOG_H
 #define _LINUX_WATCHDOG_H
 
-
 #include <linux/bitops.h>
-#include <linux/cdev.h>
-#include <linux/device.h>
-#include <linux/kernel.h>
+#include <linux/limits.h>
 #include <linux/notifier.h>
+#include <linux/printk.h>
+#include <linux/types.h>
+
 #include <uapi/linux/watchdog.h>
 
+struct attribute_group;
+struct device;
+struct module;
+
 struct watchdog_ops;
 struct watchdog_device;
 struct watchdog_core_data;
-- 
cgit v1.2.3


From b5cd5f1e50205831cb078f5c52359004eb1cbe74 Mon Sep 17 00:00:00 2001
From: Alok Tiwari <alok.a.tiwari@oracle.com>
Date: Tue, 24 Jun 2025 21:16:33 -0700
Subject: nvme: fix typo in status code constant for self-test in progress

Correct a typo in the NVMe status code constant from
NVME_SC_SELT_TEST_IN_PROGRESS to NVME_SC_SELF_TEST_IN_PROGRESS to
accurately reflect its meaning.

Signed-off-by: Alok Tiwari <alok.a.tiwari@oracle.com>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/constants.c | 4 ++--
 include/linux/nvme.h          | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/nvme/host/constants.c b/drivers/nvme/host/constants.c
index 1a0058be5821..dc90df9e13a2 100644
--- a/drivers/nvme/host/constants.c
+++ b/drivers/nvme/host/constants.c
@@ -133,7 +133,7 @@ static const char * const nvme_statuses[] = {
 	[NVME_SC_NS_NOT_ATTACHED] = "Namespace Not Attached",
 	[NVME_SC_THIN_PROV_NOT_SUPP] = "Thin Provisioning Not Supported",
 	[NVME_SC_CTRL_LIST_INVALID] = "Controller List Invalid",
-	[NVME_SC_SELT_TEST_IN_PROGRESS] = "Device Self-test In Progress",
+	[NVME_SC_SELF_TEST_IN_PROGRESS] = "Device Self-test In Progress",
 	[NVME_SC_BP_WRITE_PROHIBITED] = "Boot Partition Write Prohibited",
 	[NVME_SC_CTRL_ID_INVALID] = "Invalid Controller Identifier",
 	[NVME_SC_SEC_CTRL_STATE_INVALID] = "Invalid Secondary Controller State",
@@ -145,7 +145,7 @@ static const char * const nvme_statuses[] = {
 	[NVME_SC_BAD_ATTRIBUTES] = "Conflicting Attributes",
 	[NVME_SC_INVALID_PI] = "Invalid Protection Information",
 	[NVME_SC_READ_ONLY] = "Attempted Write to Read Only Range",
-	[NVME_SC_CMD_SIZE_LIM_EXCEEDED	] = "Command Size Limits Exceeded",
+	[NVME_SC_CMD_SIZE_LIM_EXCEEDED] = "Command Size Limits Exceeded",
 	[NVME_SC_ZONE_BOUNDARY_ERROR] = "Zoned Boundary Error",
 	[NVME_SC_ZONE_FULL] = "Zone Is Full",
 	[NVME_SC_ZONE_READ_ONLY] = "Zone Is Read Only",
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index b65a1b9f2116..655d194f8e72 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -2155,7 +2155,7 @@ enum {
 	NVME_SC_NS_NOT_ATTACHED		= 0x11a,
 	NVME_SC_THIN_PROV_NOT_SUPP	= 0x11b,
 	NVME_SC_CTRL_LIST_INVALID	= 0x11c,
-	NVME_SC_SELT_TEST_IN_PROGRESS	= 0x11d,
+	NVME_SC_SELF_TEST_IN_PROGRESS	= 0x11d,
 	NVME_SC_BP_WRITE_PROHIBITED	= 0x11e,
 	NVME_SC_CTRL_ID_INVALID		= 0x11f,
 	NVME_SC_SEC_CTRL_STATE_INVALID	= 0x120,
-- 
cgit v1.2.3


From 6381061d82141909c382811978ccdd7566698bca Mon Sep 17 00:00:00 2001
From: John Garry <john.g.garry@oracle.com>
Date: Fri, 11 Jul 2025 10:52:53 +0000
Subject: ilog2: add max_pow_of_two_factor()

Relocate the function max_pow_of_two_factor() to common ilog2.h from the
xfs code, as it will be used elsewhere.

Also simplify the function, as advised by Mikulas Patocka.

Signed-off-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Link: https://lore.kernel.org/r/20250711105258.3135198-2-john.g.garry@oracle.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/xfs/xfs_mount.c   |  5 -----
 include/linux/log2.h | 14 ++++++++++++++
 2 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 29276fe60df9..6c669ae082d4 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -672,11 +672,6 @@ static inline xfs_extlen_t xfs_calc_atomic_write_max(struct xfs_mount *mp)
 	return rounddown_pow_of_two(XFS_B_TO_FSB(mp, MAX_RW_COUNT));
 }
 
-static inline unsigned int max_pow_of_two_factor(const unsigned int nr)
-{
-	return 1 << (ffs(nr) - 1);
-}
-
 /*
  * If the data device advertises atomic write support, limit the size of data
  * device atomic writes to the greatest power-of-two factor of the AG size so
diff --git a/include/linux/log2.h b/include/linux/log2.h
index 1366cb688a6d..2eac3fc9303d 100644
--- a/include/linux/log2.h
+++ b/include/linux/log2.h
@@ -255,4 +255,18 @@ int __bits_per(unsigned long n)
 	) :					\
 	__bits_per(n)				\
 )
+
+/**
+ * max_pow_of_two_factor - return highest power-of-2 factor
+ * @n: parameter
+ *
+ * Find the highest power-of-2 which divides evenly into n.
+ * 0 is returned for n == 0; for odd n (including 1) the result is 1.
+ */
+static inline __attribute__((const))
+unsigned int max_pow_of_two_factor(unsigned int n)
+{
+	return n & -n;
+}
+
 #endif /* _LINUX_LOG2_H */
-- 
cgit v1.2.3


From 962fb1f651c2cf2083e0c3ef53ba69e3b96d3fbc Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 17 Jul 2025 08:43:42 +0100
Subject: rxrpc: Fix recv-recv race of completed call

If a call receives an event (such as incoming data), the call gets placed
on the socket's queue and a thread in recvmsg can be awakened to go and
process it.  Once the thread has picked up the call off of the queue,
further events will cause it to be requeued, and once the socket lock is
dropped (recvmsg uses call->user_mutex to allow the socket to be used in
parallel), a second thread can come in and its recvmsg can pop the call off
the socket queue again.

In such a case, the first thread will be receiving stuff from the call and
the second thread will be blocked on call->user_mutex.  The first thread
can, at this point, process both the event that it picked the call for and the
event that the second thread picked the call for and may see the call
terminate - in which case the call will be "released", decoupling the call
from the user call ID assigned to it (RXRPC_USER_CALL_ID in the control
message).

The first thread will return okay, but then the second thread will wake up
holding the user_mutex and, if it sees that the call has been released by
the first thread, it will BUG thusly:

	kernel BUG at net/rxrpc/recvmsg.c:474!

Fix this by just dequeuing the call and ignoring it if it is seen to be
already released.  We can't tell userspace about it anyway as the user call
ID has become stale.

Fixes: 248f219cb8bc ("rxrpc: Rewrite the data and ack handling code")
Reported-by: Junvyyang, Tencent Zhuque Lab <zhuque@tencent.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeffrey Altman <jaltman@auristor.com>
cc: LePremierHomme <kwqcheii@proton.me>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: Simon Horman <horms@kernel.org>
cc: linux-afs@lists.infradead.org
Link: https://patch.msgid.link/20250717074350.3767366-3-dhowells@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/trace/events/rxrpc.h |  3 +++
 net/rxrpc/call_accept.c      |  1 +
 net/rxrpc/recvmsg.c          | 19 +++++++++++++++++--
 3 files changed, 21 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h
index 378d2dfc7392..e7dcfb1369b6 100644
--- a/include/trace/events/rxrpc.h
+++ b/include/trace/events/rxrpc.h
@@ -330,12 +330,15 @@
 	EM(rxrpc_call_put_userid,		"PUT user-id ") \
 	EM(rxrpc_call_see_accept,		"SEE accept  ") \
 	EM(rxrpc_call_see_activate_client,	"SEE act-clnt") \
+	EM(rxrpc_call_see_already_released,	"SEE alrdy-rl") \
 	EM(rxrpc_call_see_connect_failed,	"SEE con-fail") \
 	EM(rxrpc_call_see_connected,		"SEE connect ") \
 	EM(rxrpc_call_see_conn_abort,		"SEE conn-abt") \
+	EM(rxrpc_call_see_discard,		"SEE discard ") \
 	EM(rxrpc_call_see_disconnected,		"SEE disconn ") \
 	EM(rxrpc_call_see_distribute_error,	"SEE dist-err") \
 	EM(rxrpc_call_see_input,		"SEE input   ") \
+	EM(rxrpc_call_see_recvmsg,		"SEE recvmsg ") \
 	EM(rxrpc_call_see_release,		"SEE release ") \
 	EM(rxrpc_call_see_userid_exists,	"SEE u-exists") \
 	EM(rxrpc_call_see_waiting_call,		"SEE q-conn  ") \
diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index 226b4bf82747..a4d76f2da684 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -219,6 +219,7 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx)
 	tail = b->call_backlog_tail;
 	while (CIRC_CNT(head, tail, size) > 0) {
 		struct rxrpc_call *call = b->call_backlog[tail];
+		rxrpc_see_call(call, rxrpc_call_see_discard);
 		rcu_assign_pointer(call->socket, rx);
 		if (rx->app_ops &&
 		    rx->app_ops->discard_new_call) {
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index 86a27fb55a1c..6990e37697de 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -447,6 +447,16 @@ try_again:
 		goto try_again;
 	}
 
+	rxrpc_see_call(call, rxrpc_call_see_recvmsg);
+	if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) {
+		rxrpc_see_call(call, rxrpc_call_see_already_released);
+		list_del_init(&call->recvmsg_link);
+		spin_unlock_irq(&rx->recvmsg_lock);
+		release_sock(&rx->sk);
+		trace_rxrpc_recvmsg(call->debug_id, rxrpc_recvmsg_unqueue, 0);
+		rxrpc_put_call(call, rxrpc_call_put_recvmsg);
+		goto try_again;
+	}
 	if (!(flags & MSG_PEEK))
 		list_del_init(&call->recvmsg_link);
 	else
@@ -470,8 +480,13 @@ try_again:
 
 	release_sock(&rx->sk);
 
-	if (test_bit(RXRPC_CALL_RELEASED, &call->flags))
-		BUG();
+	if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) {
+		rxrpc_see_call(call, rxrpc_call_see_already_released);
+		mutex_unlock(&call->user_mutex);
+		if (!(flags & MSG_PEEK))
+			rxrpc_put_call(call, rxrpc_call_put_recvmsg);
+		goto try_again;
+	}
 
 	ret = rxrpc_recvmsg_user_id(call, msg, flags);
 	if (ret < 0)
-- 
cgit v1.2.3


From 2fd895842d49c23137ae48252dd211e5d6d8a3ed Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 17 Jul 2025 08:43:43 +0100
Subject: rxrpc: Fix notification vs call-release vs recvmsg

When a call is released, rxrpc takes the spinlock and removes it from
->recvmsg_q in an effort to prevent racing recvmsg() invocations from
seeing the same call.  Now, rxrpc_recvmsg() only takes the spinlock when
actually removing a call from the queue; it doesn't, however, take it in
the lead up to that when it checks to see if the queue is empty.  It *does*
hold the socket lock, which prevents a recvmsg/recvmsg race - but this
doesn't prevent sendmsg from ending the call because sendmsg() drops the
socket lock and relies on the call->user_mutex.

Fix this by firstly removing the bit in rxrpc_release_call() that dequeues
the released call and, instead, rely on recvmsg() to simply discard
released calls (done in a preceding fix).

Secondly, rxrpc_notify_socket() is abandoned if the call is already marked
as released rather than trying to be clever by setting both pointers in
call->recvmsg_link to NULL to trick list_empty().  This isn't perfect and
can still race, resulting in a released call on the queue, but recvmsg()
will now clean that up.

Fixes: 17926a79320a ("[AF_RXRPC]: Provide secure RxRPC sockets for use by userspace and kernel both")
Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeffrey Altman <jaltman@auristor.com>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: Junvyyang, Tencent Zhuque Lab <zhuque@tencent.com>
cc: LePremierHomme <kwqcheii@proton.me>
cc: Simon Horman <horms@kernel.org>
cc: linux-afs@lists.infradead.org
Link: https://patch.msgid.link/20250717074350.3767366-4-dhowells@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/trace/events/rxrpc.h |  3 ++-
 net/rxrpc/call_object.c      | 28 ++++++++++++----------------
 net/rxrpc/recvmsg.c          |  4 ++++
 3 files changed, 18 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h
index e7dcfb1369b6..de6f6d25767c 100644
--- a/include/trace/events/rxrpc.h
+++ b/include/trace/events/rxrpc.h
@@ -322,10 +322,10 @@
 	EM(rxrpc_call_put_kernel,		"PUT kernel  ") \
 	EM(rxrpc_call_put_poke,			"PUT poke    ") \
 	EM(rxrpc_call_put_recvmsg,		"PUT recvmsg ") \
+	EM(rxrpc_call_put_release_recvmsg_q,	"PUT rls-rcmq") \
 	EM(rxrpc_call_put_release_sock,		"PUT rls-sock") \
 	EM(rxrpc_call_put_release_sock_tba,	"PUT rls-sk-a") \
 	EM(rxrpc_call_put_sendmsg,		"PUT sendmsg ") \
-	EM(rxrpc_call_put_unnotify,		"PUT unnotify") \
 	EM(rxrpc_call_put_userid_exists,	"PUT u-exists") \
 	EM(rxrpc_call_put_userid,		"PUT user-id ") \
 	EM(rxrpc_call_see_accept,		"SEE accept  ") \
@@ -338,6 +338,7 @@
 	EM(rxrpc_call_see_disconnected,		"SEE disconn ") \
 	EM(rxrpc_call_see_distribute_error,	"SEE dist-err") \
 	EM(rxrpc_call_see_input,		"SEE input   ") \
+	EM(rxrpc_call_see_notify_released,	"SEE nfy-rlsd") \
 	EM(rxrpc_call_see_recvmsg,		"SEE recvmsg ") \
 	EM(rxrpc_call_see_release,		"SEE release ") \
 	EM(rxrpc_call_see_userid_exists,	"SEE u-exists") \
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index 15067ff7b1f2..918f41d97a2f 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -561,7 +561,7 @@ static void rxrpc_cleanup_rx_buffers(struct rxrpc_call *call)
 void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call)
 {
 	struct rxrpc_connection *conn = call->conn;
-	bool put = false, putu = false;
+	bool putu = false;
 
 	_enter("{%d,%d}", call->debug_id, refcount_read(&call->ref));
 
@@ -573,23 +573,13 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call)
 
 	rxrpc_put_call_slot(call);
 
-	/* Make sure we don't get any more notifications */
+	/* Note that at this point, the call may still be on or may have been
+	 * added back on to the socket receive queue.  recvmsg() must discard
+	 * released calls.  The CALL_RELEASED flag should prevent further
+	 * notifications.
+	 */
 	spin_lock_irq(&rx->recvmsg_lock);
-
-	if (!list_empty(&call->recvmsg_link)) {
-		_debug("unlinking once-pending call %p { e=%lx f=%lx }",
-		       call, call->events, call->flags);
-		list_del(&call->recvmsg_link);
-		put = true;
-	}
-
-	/* list_empty() must return false in rxrpc_notify_socket() */
-	call->recvmsg_link.next = NULL;
-	call->recvmsg_link.prev = NULL;
-
 	spin_unlock_irq(&rx->recvmsg_lock);
-	if (put)
-		rxrpc_put_call(call, rxrpc_call_put_unnotify);
 
 	write_lock(&rx->call_lock);
 
@@ -638,6 +628,12 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx)
 		rxrpc_put_call(call, rxrpc_call_put_release_sock);
 	}
 
+	while ((call = list_first_entry_or_null(&rx->recvmsg_q,
+						struct rxrpc_call, recvmsg_link))) {
+		list_del_init(&call->recvmsg_link);
+		rxrpc_put_call(call, rxrpc_call_put_release_recvmsg_q);
+	}
+
 	_leave("");
 }
 
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index 6990e37697de..7fa7e77f6bb9 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -29,6 +29,10 @@ void rxrpc_notify_socket(struct rxrpc_call *call)
 
 	if (!list_empty(&call->recvmsg_link))
 		return;
+	if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) {
+		rxrpc_see_call(call, rxrpc_call_see_notify_released);
+		return;
+	}
 
 	rcu_read_lock();
 
-- 
cgit v1.2.3


From dfe25fbaedfc2a07281ed5ff0442270217d25b31 Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeel.butt@linux.dev>
Date: Fri, 4 Jul 2025 11:08:04 -0700
Subject: cgroup: llist: avoid memory tears for llist_node

Before the commit 36df6e3dbd7e ("cgroup: make css_rstat_updated nmi
safe"), the struct llist_node is expected to be private to the one
inserting the node to the lockless list or the one removing the node
from the lockless list. After the mentioned commit, the llist_node in
the rstat code is per-cpu shared between the stacked contexts i.e.
process, softirq, hardirq & nmi. It is possible the compiler may tear
the loads or stores of llist_node. Let's avoid that.

KCSAN reported the following race:

 Reported by Kernel Concurrency Sanitizer on:
 CPU: 60 UID: 0 PID: 5425 ... 6.16.0-rc3-next-20250626 #1 NONE
 Tainted: [E]=UNSIGNED_MODULE
 Hardware name: ...
 ==================================================================
 ==================================================================
 BUG: KCSAN: data-race in css_rstat_flush / css_rstat_updated
 write to 0xffffe8fffe1c85f0 of 8 bytes by task 1061 on cpu 1:
  css_rstat_flush+0x1b8/0xeb0
  __mem_cgroup_flush_stats+0x184/0x190
  flush_memcg_stats_dwork+0x22/0x50
  process_one_work+0x335/0x630
  worker_thread+0x5f1/0x8a0
  kthread+0x197/0x340
  ret_from_fork+0xd3/0x110
  ret_from_fork_asm+0x11/0x20
 read to 0xffffe8fffe1c85f0 of 8 bytes by task 3551 on cpu 15:
  css_rstat_updated+0x81/0x180
  mod_memcg_lruvec_state+0x113/0x2d0
  __mod_lruvec_state+0x3d/0x50
  lru_add+0x21e/0x3f0
  folio_batch_move_lru+0x80/0x1b0
  __folio_batch_add_and_move+0xd7/0x160
  folio_add_lru_vma+0x42/0x50
  do_anonymous_page+0x892/0xe90
  __handle_mm_fault+0xfaa/0x1520
  handle_mm_fault+0xdc/0x350
  do_user_addr_fault+0x1dc/0x650
  exc_page_fault+0x5c/0x110
  asm_exc_page_fault+0x22/0x30
 value changed: 0xffffe8fffe18e0d0 -> 0xffffe8fffe1c85f0

$ ./scripts/faddr2line vmlinux css_rstat_flush+0x1b8/0xeb0
css_rstat_flush+0x1b8/0xeb0:
init_llist_node at include/linux/llist.h:86
(inlined by) llist_del_first_init at include/linux/llist.h:308
(inlined by) css_process_update_tree at kernel/cgroup/rstat.c:148
(inlined by) css_rstat_updated_list at kernel/cgroup/rstat.c:258
(inlined by) css_rstat_flush at kernel/cgroup/rstat.c:389

$ ./scripts/faddr2line vmlinux css_rstat_updated+0x81/0x180
css_rstat_updated+0x81/0x180:
css_rstat_updated at kernel/cgroup/rstat.c:90 (discriminator 1)

These races are expected and a simple READ_ONCE/WRITE_ONCE resolves these
reports. However, let's add comments to explain the race and the need for
memory barriers if stronger guarantees are needed.

More specifically the rstat updater and the flusher can race and cause a
scenario where the stats updater skips adding the css to the lockless
list but the flusher might not see those updates done by the skipped
updater. This is a benign race and the subsequent flusher will flush those
stats and at the moment there aren't any rstat users which are not fine
with this kind of race. However some future user might want a stricter
guarantee, so let's add appropriate comments to ease the job of
future users.

Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Fixes: 36df6e3dbd7e ("cgroup: make css_rstat_updated nmi safe")
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/llist.h |  6 +++---
 kernel/cgroup/rstat.c | 28 +++++++++++++++++++++++++++-
 2 files changed, 30 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/llist.h b/include/linux/llist.h
index 27b17f64bcee..607b2360c938 100644
--- a/include/linux/llist.h
+++ b/include/linux/llist.h
@@ -83,7 +83,7 @@ static inline void init_llist_head(struct llist_head *list)
  */
 static inline void init_llist_node(struct llist_node *node)
 {
-	node->next = node;
+	WRITE_ONCE(node->next, node);
 }
 
 /**
@@ -97,7 +97,7 @@ static inline void init_llist_node(struct llist_node *node)
  */
 static inline bool llist_on_list(const struct llist_node *node)
 {
-	return node->next != node;
+	return READ_ONCE(node->next) != node;
 }
 
 /**
@@ -220,7 +220,7 @@ static inline bool llist_empty(const struct llist_head *head)
 
 static inline struct llist_node *llist_next(struct llist_node *node)
 {
-	return node->next;
+	return READ_ONCE(node->next);
 }
 
 /**
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index c8a48cf83878..981e2f77ad4e 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -60,6 +60,12 @@ static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu)
  * Atomically inserts the css in the ss's llist for the given cpu. This is
  * reentrant safe i.e. safe against softirq, hardirq and nmi. The ss's llist
  * will be processed at the flush time to create the update tree.
+ *
+ * NOTE: if the user needs the guarantee that the updater either adds itself to
+ * the lockless list or the concurrent flusher flushes its updated stats, a
+ * memory barrier is needed before the call to css_rstat_updated() i.e. a
+ * barrier after updating the per-cpu stats and before calling
+ * css_rstat_updated().
  */
 __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
 {
@@ -86,7 +92,12 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
 		return;
 
 	rstatc = css_rstat_cpu(css, cpu);
-	/* If already on list return. */
+	/*
+	 * If already on list return. This check is racy and smp_mb() is needed
+	 * to pair it with the smp_mb() in css_process_update_tree() if the
+	 * guarantee that the updated stats are visible to concurrent flusher is
+	 * needed.
+	 */
 	if (llist_on_list(&rstatc->lnode))
 		return;
 
@@ -148,6 +159,21 @@ static void css_process_update_tree(struct cgroup_subsys *ss, int cpu)
 	while ((lnode = llist_del_first_init(lhead))) {
 		struct css_rstat_cpu *rstatc;
 
+		/*
+		 * smp_mb() is needed here (more specifically in between
+		 * init_llist_node() and per-cpu stats flushing) if the
+		 * guarantee is required by a rstat user where either the
+		 * updater should add itself on the lockless list or the
+		 * flusher flush the stats updated by the updater who have
+		 * observed that they are already on the list. The
+		 * corresponding barrier pair for this one should be before
+		 * css_rstat_updated() by the user.
+		 *
+		 * For now, there aren't any such users, so the barrier is not
+		 * added here, but if such a use-case arises, please add
+		 * smp_mb() here.
+		 */
+
 		rstatc = container_of(lnode, struct css_rstat_cpu, lnode);
 		__css_process_update_tree(rstatc->owner, cpu);
 	}
-- 
cgit v1.2.3


From fc6f89dc707838564abbb8e22dad8e4d75c7fa26 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 8 Jul 2025 15:47:29 -0700
Subject: stop_machine: Improve kernel-doc function-header comments

Add more detail to the kernel-doc function-header comments for
stop_machine(), stop_machine_cpuslocked(), and stop_core_cpuslocked().

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/stop_machine.h | 64 ++++++++++++++++++++++++++++----------------
 1 file changed, 41 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index 3132262a404d..72820503514c 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -88,55 +88,73 @@ static inline void print_stop_info(const char *log_lvl, struct task_struct *task
 #endif	/* CONFIG_SMP */
 
 /*
- * stop_machine "Bogolock": stop the entire machine, disable
- * interrupts.  This is a very heavy lock, which is equivalent to
- * grabbing every spinlock (and more).  So the "read" side to such a
- * lock is anything which disables preemption.
+ * stop_machine "Bogolock": stop the entire machine, disable interrupts.
+ * This is a very heavy lock, which is equivalent to grabbing every raw
+ * spinlock (and more).  So the "read" side to such a lock is anything
+ * which disables preemption.
  */
 #if defined(CONFIG_SMP) || defined(CONFIG_HOTPLUG_CPU)
 
 /**
  * stop_machine: freeze the machine on all CPUs and run this function
  * @fn: the function to run
- * @data: the data ptr for the @fn()
- * @cpus: the cpus to run the @fn() on (NULL = any online cpu)
+ * @data: the data ptr to pass to @fn()
+ * @cpus: the cpus to run @fn() on (NULL = run on each online CPU)
  *
- * Description: This causes a thread to be scheduled on every cpu,
- * each of which disables interrupts.  The result is that no one is
- * holding a spinlock or inside any other preempt-disabled region when
- * @fn() runs.
+ * Description: This causes a thread to be scheduled on every CPU, which
+ * will run with interrupts disabled.  Each CPU specified by @cpus will
+ * run @fn.  While @fn is executing, there will be no other CPUs holding
+ * a raw spinlock or running within any other type of preempt-disabled
+ * region of code.
  *
- * This can be thought of as a very heavy write lock, equivalent to
- * grabbing every spinlock in the kernel.
+ * When @cpus specifies only a single CPU, this can be thought of as
+ * a reader-writer lock where readers disable preemption (for example,
+ * by holding a raw spinlock) and where the insanely heavy writers run
+ * @fn while also preventing any other CPU from doing any useful work.
+ * These writers can also be thought of as having implicitly grabbed every
+ * raw spinlock in the kernel.
  *
- * Protects against CPU hotplug.
+ * When @fn is a no-op, this can be thought of as an RCU implementation
+ * where readers again disable preemption and writers use stop_machine()
+ * in place of synchronize_rcu(), albeit with orders of magnitude more
+ * disruption than even that of synchronize_rcu_expedited().
+ *
+ * Although only one stop_machine() operation can proceed at a time,
+ * the possibility of blocking in cpus_read_lock() means that the caller
+ * cannot usefully rely on this serialization.
+ *
+ * Return: 0 if all invocations of @fn return zero.  Otherwise, the
+ * value returned by an arbitrarily chosen member of the set of calls to
+ * @fn that returned non-zero.
  */
 int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus);
 
 /**
  * stop_machine_cpuslocked: freeze the machine on all CPUs and run this function
  * @fn: the function to run
- * @data: the data ptr for the @fn()
- * @cpus: the cpus to run the @fn() on (NULL = any online cpu)
+ * @data: the data ptr to pass to @fn()
+ * @cpus: the cpus to run @fn() on (NULL = run on each online CPU)
+ *
+ * Same as above.  Avoids nested calls to cpus_read_lock().
  *
- * Same as above. Must be called from with in a cpus_read_lock() protected
- * region. Avoids nested calls to cpus_read_lock().
+ * Context: Must be called from within a cpus_read_lock() protected region.
  */
 int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus);
 
 /**
  * stop_core_cpuslocked: - stop all threads on just one core
  * @cpu: any cpu in the targeted core
- * @fn: the function to run
- * @data: the data ptr for @fn()
+ * @fn: the function to run on each CPU in the core containing @cpu
+ * @data: the data ptr to pass to @fn()
  *
- * Same as above, but instead of every CPU, only the logical CPUs of a
- * single core are affected.
+ * Same as above, but instead of every CPU, only the logical CPUs of the
+ * single core containing @cpu are affected.
  *
  * Context: Must be called from within a cpus_read_lock() protected region.
  *
- * Return: 0 if all executions of @fn returned 0, any non zero return
- * value if any returned non zero.
+ * Return: 0 if all invocations of @fn return zero.  Otherwise, the
+ * value returned by an arbitrarily chosen member of the set of calls to
+ * @fn that returned non-zero.
  */
 int stop_core_cpuslocked(unsigned int cpu, cpu_stop_fn_t fn, void *data);
 
-- 
cgit v1.2.3


From 76720eed7d18baf51c0f31fe8a3784702f50e3fc Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 17 Jul 2025 12:38:04 -0500
Subject: PCI: Add pci_is_display() to check if device is a display controller

Several places in the kernel do class shifting to match whether a PCI
device is display class.  Add pci_is_display() for those places to use.

Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Daniel Dadap <ddadap@nvidia.com>
Reviewed-by: Simona Vetter <simona.vetter@ffwll.ch>
Link: https://patch.msgid.link/20250717173812.3633478-2-superm1@kernel.org
---
 include/linux/pci.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 05e68f35f392..4fff6405a830 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -744,6 +744,21 @@ static inline bool pci_is_vga(struct pci_dev *pdev)
 	return false;
 }
 
+/**
+ * pci_is_display - check if the PCI device is a display controller
+ * @pdev: PCI device
+ *
+ * Determine whether the given PCI device corresponds to a display
+ * controller. Display controllers are typically used for graphical output
+ * and are identified based on their class code.
+ *
+ * Return: true if the PCI device is a display controller, false otherwise.
+ */
+static inline bool pci_is_display(struct pci_dev *pdev)
+{
+	return (pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY;
+}
+
 #define for_each_pci_bridge(dev, bus)				\
 	list_for_each_entry(dev, &bus->devices, bus_list)	\
 		if (!pci_is_bridge(dev)) {} else
-- 
cgit v1.2.3


From c0ae03588bbb95378758fe80e7436a9b4cfc71f6 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 15 Jul 2025 17:03:21 -0700
Subject: ethtool: rss: initial RSS_SET (indirection table handling)

Add initial support for RSS_SET, for now only operations on
the indirection table are supported.

Unlike the ioctl, don't check whether at least one parameter is
being changed. This is how other ethtool-nl ops behave,
so pick ethtool-nl consistency over copying the ioctl behavior.

There are two special cases here:
 1) resetting the table to defaults;
 2) support for tables of different size.

For (1) I use an empty Netlink attribute (array of size 0).

(2) may require some background. AFAICT a lot of modern devices
allow allocating RSS tables of different sizes. mlx5 can upsize
its tables, bnxt has some "table size calculation", and Intel
folks asked about RSS table sizing in context of resource allocation
in the past. The ethtool IOCTL API has a concept of table size,
but right now the user is expected to provide a table exactly
the size the device requests. Some drivers may change the table
size at runtime (in response to queue count changes) but the
user is not in control of this. What's not great is that all
RSS contexts share the same table size. For example, a device
with 128 queues enabled and 16 RSS contexts with 8 queues each will
likely have 256-entry tables for each of the 16 contexts,
while 32 entries would be more than enough given each context only
has 8 queues. To address this, the Netlink API should avoid enforcing
table size at the uAPI level, and should allow the user to express
the min table size they expect.

To fully solve (2) we will need more driver plumbing but
at the uAPI level this patch allows the user to specify
a table size smaller than what the device advertises. The device
table size must be a multiple of the user requested table size.
We then replicate the user-provided table to fill the full device
size table. This addresses the "allow the user to express the min
table size" objective, while not enforcing any fixed size.
From Netlink perspective .get_rxfh_indir_size() is now de facto
the "max" table size supported by the device.

We may choose to support table replication in ethtool, too,
when we actually plumb this thru the device APIs.

Initially I was considering moving full pattern generation
to the kernel (which queues to use, at which frequency and
what min sequence length). I don't think this complexity
would buy us much and most if not all devices have pow-2
table sizes, which simplifies the replication a lot.

Reviewed-by: Gal Pressman <gal@nvidia.com>
Link: https://patch.msgid.link/20250716000331.1378807-2-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/ethtool.yaml       |  12 ++
 Documentation/networking/ethtool-netlink.rst   |  26 +++-
 include/uapi/linux/ethtool_netlink_generated.h |   1 +
 net/ethtool/netlink.c                          |   8 +
 net/ethtool/netlink.h                          |   1 +
 net/ethtool/rss.c                              | 195 +++++++++++++++++++++++++
 6 files changed, 242 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml
index c38c03c624f0..1eca88a508a0 100644
--- a/Documentation/netlink/specs/ethtool.yaml
+++ b/Documentation/netlink/specs/ethtool.yaml
@@ -2643,6 +2643,18 @@ operations:
         attributes:
           - header
           - events
+    -
+      name: rss-set
+      doc: Set RSS params.
+
+      attribute-set: rss
+
+      do:
+        request:
+          attributes:
+            - header
+            - context
+            - indir
     -
       name: rss-ntf
       doc: |
diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst
index 248bc3d93da9..27db7540e60e 100644
--- a/Documentation/networking/ethtool-netlink.rst
+++ b/Documentation/networking/ethtool-netlink.rst
@@ -239,6 +239,7 @@ Userspace to kernel:
   ``ETHTOOL_MSG_PHY_GET``               get Ethernet PHY information
   ``ETHTOOL_MSG_TSCONFIG_GET``          get hw timestamping configuration
   ``ETHTOOL_MSG_TSCONFIG_SET``          set hw timestamping configuration
+  ``ETHTOOL_MSG_RSS_SET``               set RSS settings
   ===================================== =================================
 
 Kernel to userspace:
@@ -292,6 +293,7 @@ Kernel to userspace:
   ``ETHTOOL_MSG_TSCONFIG_GET_REPLY``       hw timestamping configuration
   ``ETHTOOL_MSG_TSCONFIG_SET_REPLY``       new hw timestamping configuration
   ``ETHTOOL_MSG_PSE_NTF``                  PSE events notification
+  ``ETHTOOL_MSG_RSS_NTF``                  RSS settings notification
   ======================================== =================================
 
 ``GET`` requests are sent by userspace applications to retrieve device
@@ -1989,6 +1991,28 @@ hfunc. Current supported options are symmetric-xor and symmetric-or-xor.
 ETHTOOL_A_RSS_FLOW_HASH carries per-flow type bitmask of which header
 fields are included in the hash calculation.
 
+RSS_SET
+=======
+
+Request contents:
+
+=====================================  ======  ==============================
+  ``ETHTOOL_A_RSS_HEADER``             nested  request header
+  ``ETHTOOL_A_RSS_CONTEXT``            u32     context number
+  ``ETHTOOL_A_RSS_INDIR``              binary  Indir table bytes
+=====================================  ======  ==============================
+
+``ETHTOOL_A_RSS_INDIR`` is the minimal RSS table the user expects. Kernel and
+the device driver may replicate the table if it is smaller than the smallest
+table size supported by the device. For example, if the user requests ``[0, 1]``
+but the device needs at least 8 entries, the real table in use will end up being
+``[0, 1, 0, 1, 0, 1, 0, 1]``. Most devices require the table size to be a power
+of 2, so tables whose size is not a power of 2 will likely be rejected.
+Using a table of size 0 will reset the indirection table to the default.
+
+Note that, at present, only a subset of RSS configuration can be accomplished
+over Netlink.
+
 PLCA_GET_CFG
 ============
 
@@ -2455,7 +2479,7 @@ are netlink only.
   ``ETHTOOL_GRXNTUPLE``               n/a
   ``ETHTOOL_GSSET_INFO``              ``ETHTOOL_MSG_STRSET_GET``
   ``ETHTOOL_GRXFHINDIR``              ``ETHTOOL_MSG_RSS_GET``
-  ``ETHTOOL_SRXFHINDIR``              n/a
+  ``ETHTOOL_SRXFHINDIR``              ``ETHTOOL_MSG_RSS_SET``
   ``ETHTOOL_GFEATURES``               ``ETHTOOL_MSG_FEATURES_GET``
   ``ETHTOOL_SFEATURES``               ``ETHTOOL_MSG_FEATURES_SET``
   ``ETHTOOL_GCHANNELS``               ``ETHTOOL_MSG_CHANNELS_GET``
diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h
index 96027e26ffba..130bdf5c3516 100644
--- a/include/uapi/linux/ethtool_netlink_generated.h
+++ b/include/uapi/linux/ethtool_netlink_generated.h
@@ -840,6 +840,7 @@ enum {
 	ETHTOOL_MSG_PHY_GET,
 	ETHTOOL_MSG_TSCONFIG_GET,
 	ETHTOOL_MSG_TSCONFIG_SET,
+	ETHTOOL_MSG_RSS_SET,
 
 	__ETHTOOL_MSG_USER_CNT,
 	ETHTOOL_MSG_USER_MAX = (__ETHTOOL_MSG_USER_CNT - 1)
diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c
index b1f8999c1adc..0ae0d7a9667c 100644
--- a/net/ethtool/netlink.c
+++ b/net/ethtool/netlink.c
@@ -405,6 +405,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = {
 	[ETHTOOL_MSG_PSE_GET]		= &ethnl_pse_request_ops,
 	[ETHTOOL_MSG_PSE_SET]		= &ethnl_pse_request_ops,
 	[ETHTOOL_MSG_RSS_GET]		= &ethnl_rss_request_ops,
+	[ETHTOOL_MSG_RSS_SET]		= &ethnl_rss_request_ops,
 	[ETHTOOL_MSG_PLCA_GET_CFG]	= &ethnl_plca_cfg_request_ops,
 	[ETHTOOL_MSG_PLCA_SET_CFG]	= &ethnl_plca_cfg_request_ops,
 	[ETHTOOL_MSG_PLCA_GET_STATUS]	= &ethnl_plca_status_request_ops,
@@ -1504,6 +1505,13 @@ static const struct genl_ops ethtool_genl_ops[] = {
 		.policy = ethnl_tsconfig_set_policy,
 		.maxattr = ARRAY_SIZE(ethnl_tsconfig_set_policy) - 1,
 	},
+	{
+		.cmd	= ETHTOOL_MSG_RSS_SET,
+		.flags	= GENL_UNS_ADMIN_PERM,
+		.doit	= ethnl_default_set_doit,
+		.policy = ethnl_rss_set_policy,
+		.maxattr = ARRAY_SIZE(ethnl_rss_set_policy) - 1,
+	},
 };
 
 static const struct genl_multicast_group ethtool_nl_mcgrps[] = {
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
index 94a7eb402022..620dd1ab9b3b 100644
--- a/net/ethtool/netlink.h
+++ b/net/ethtool/netlink.h
@@ -484,6 +484,7 @@ extern const struct nla_policy ethnl_module_set_policy[ETHTOOL_A_MODULE_POWER_MO
 extern const struct nla_policy ethnl_pse_get_policy[ETHTOOL_A_PSE_HEADER + 1];
 extern const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1];
 extern const struct nla_policy ethnl_rss_get_policy[ETHTOOL_A_RSS_START_CONTEXT + 1];
+extern const struct nla_policy ethnl_rss_set_policy[ETHTOOL_A_RSS_START_CONTEXT + 1];
 extern const struct nla_policy ethnl_plca_get_cfg_policy[ETHTOOL_A_PLCA_HEADER + 1];
 extern const struct nla_policy ethnl_plca_set_cfg_policy[ETHTOOL_A_PLCA_MAX + 1];
 extern const struct nla_policy ethnl_plca_get_status_policy[ETHTOOL_A_PLCA_HEADER + 1];
diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c
index 41ab9fc67652..c8db523671de 100644
--- a/net/ethtool/rss.c
+++ b/net/ethtool/rss.c
@@ -218,6 +218,10 @@ rss_prepare(const struct rss_req_info *request, struct net_device *dev,
 {
 	rss_prepare_flow_hash(request, dev, data, info);
 
+	/* Coming from RSS_SET, driver may only have flow_hash_fields ops */
+	if (!dev->ethtool_ops->get_rxfh)
+		return 0;
+
 	if (request->rss_context)
 		return rss_prepare_ctx(request, dev, data, info);
 	return rss_prepare_get(request, dev, data, info);
@@ -466,6 +470,193 @@ void ethtool_rss_notify(struct net_device *dev, u32 rss_context)
 	ethnl_notify(dev, ETHTOOL_MSG_RSS_NTF, &req_info.base);
 }
 
+/* RSS_SET */
+
+const struct nla_policy ethnl_rss_set_policy[ETHTOOL_A_RSS_START_CONTEXT + 1] = {
+	[ETHTOOL_A_RSS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
+	[ETHTOOL_A_RSS_CONTEXT] = { .type = NLA_U32, },
+	[ETHTOOL_A_RSS_INDIR] = { .type = NLA_BINARY, },
+};
+
+static int
+ethnl_rss_set_validate(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+	const struct ethtool_ops *ops = req_info->dev->ethtool_ops;
+	struct rss_req_info *request = RSS_REQINFO(req_info);
+	struct nlattr **tb = info->attrs;
+	struct nlattr *bad_attr = NULL;
+
+	if (request->rss_context && !ops->create_rxfh_context)
+		bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_CONTEXT];
+
+	if (bad_attr) {
+		NL_SET_BAD_ATTR(info->extack, bad_attr);
+		return -EOPNOTSUPP;
+	}
+
+	return 1;
+}
+
+static int
+rss_set_prep_indir(struct net_device *dev, struct genl_info *info,
+		   struct rss_reply_data *data, struct ethtool_rxfh_param *rxfh,
+		   bool *reset, bool *mod)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct netlink_ext_ack *extack = info->extack;
+	struct nlattr **tb = info->attrs;
+	struct ethtool_rxnfc rx_rings;
+	size_t alloc_size;
+	u32 user_size;
+	int i, err;
+
+	if (!tb[ETHTOOL_A_RSS_INDIR])
+		return 0;
+	if (!data->indir_size || !ops->get_rxnfc)
+		return -EOPNOTSUPP;
+
+	rx_rings.cmd = ETHTOOL_GRXRINGS;
+	err = ops->get_rxnfc(dev, &rx_rings, NULL);
+	if (err)
+		return err;
+
+	if (nla_len(tb[ETHTOOL_A_RSS_INDIR]) % 4) {
+		NL_SET_BAD_ATTR(info->extack, tb[ETHTOOL_A_RSS_INDIR]);
+		return -EINVAL;
+	}
+	user_size = nla_len(tb[ETHTOOL_A_RSS_INDIR]) / 4;
+	if (!user_size) {
+		if (rxfh->rss_context) {
+			NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_RSS_INDIR],
+					    "can't reset table for a context");
+			return -EINVAL;
+		}
+		*reset = true;
+	} else if (data->indir_size % user_size) {
+		NL_SET_ERR_MSG_ATTR_FMT(extack, tb[ETHTOOL_A_RSS_INDIR],
+					"size (%d) mismatch with device indir table (%d)",
+					user_size, data->indir_size);
+		return -EINVAL;
+	}
+
+	rxfh->indir_size = data->indir_size;
+	alloc_size = array_size(data->indir_size, sizeof(rxfh->indir[0]));
+	rxfh->indir = kzalloc(alloc_size, GFP_KERNEL);
+	if (!rxfh->indir)
+		return -ENOMEM;
+
+	nla_memcpy(rxfh->indir, tb[ETHTOOL_A_RSS_INDIR], alloc_size);
+	for (i = 0; i < user_size; i++) {
+		if (rxfh->indir[i] < rx_rings.data)
+			continue;
+
+		NL_SET_ERR_MSG_ATTR_FMT(extack, tb[ETHTOOL_A_RSS_INDIR],
+					"entry %d: queue out of range (%d)",
+					i, rxfh->indir[i]);
+		err = -EINVAL;
+		goto err_free;
+	}
+
+	if (user_size) {
+		/* Replicate the user-provided table to fill the device table */
+		for (i = user_size; i < data->indir_size; i++)
+			rxfh->indir[i] = rxfh->indir[i % user_size];
+	} else {
+		for (i = 0; i < data->indir_size; i++)
+			rxfh->indir[i] =
+				ethtool_rxfh_indir_default(i, rx_rings.data);
+	}
+
+	*mod |= memcmp(rxfh->indir, data->indir_table, data->indir_size);
+
+	return 0;
+
+err_free:
+	kfree(rxfh->indir);
+	rxfh->indir = NULL;
+	return err;
+}
+
+static void
+rss_set_ctx_update(struct ethtool_rxfh_context *ctx, struct nlattr **tb,
+		   struct rss_reply_data *data, struct ethtool_rxfh_param *rxfh)
+{
+	int i;
+
+	if (rxfh->indir) {
+		for (i = 0; i < data->indir_size; i++)
+			ethtool_rxfh_context_indir(ctx)[i] = rxfh->indir[i];
+		ctx->indir_configured = !!nla_len(tb[ETHTOOL_A_RSS_INDIR]);
+	}
+}
+
+static int
+ethnl_rss_set(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+	struct rss_req_info *request = RSS_REQINFO(req_info);
+	struct ethtool_rxfh_context *ctx = NULL;
+	struct net_device *dev = req_info->dev;
+	struct ethtool_rxfh_param rxfh = {};
+	bool indir_reset = false, indir_mod;
+	struct nlattr **tb = info->attrs;
+	struct rss_reply_data data = {};
+	const struct ethtool_ops *ops;
+	bool mod = false;
+	int ret;
+
+	ops = dev->ethtool_ops;
+	data.base.dev = dev;
+
+	ret = rss_prepare(request, dev, &data, info);
+	if (ret)
+		return ret;
+
+	rxfh.rss_context = request->rss_context;
+
+	ret = rss_set_prep_indir(dev, info, &data, &rxfh, &indir_reset, &mod);
+	if (ret)
+		goto exit_clean_data;
+	indir_mod = !!tb[ETHTOOL_A_RSS_INDIR];
+
+	rxfh.hfunc = ETH_RSS_HASH_NO_CHANGE;
+	rxfh.input_xfrm = RXH_XFRM_NO_CHANGE;
+
+	mutex_lock(&dev->ethtool->rss_lock);
+	if (request->rss_context) {
+		ctx = xa_load(&dev->ethtool->rss_ctx, request->rss_context);
+		if (!ctx) {
+			ret = -ENOENT;
+			goto exit_unlock;
+		}
+	}
+
+	if (!mod)
+		ret = 0; /* nothing to tell the driver */
+	else if (!ops->set_rxfh)
+		ret = -EOPNOTSUPP;
+	else if (!rxfh.rss_context)
+		ret = ops->set_rxfh(dev, &rxfh, info->extack);
+	else
+		ret = ops->modify_rxfh_context(dev, ctx, &rxfh, info->extack);
+	if (ret)
+		goto exit_unlock;
+
+	if (ctx)
+		rss_set_ctx_update(ctx, tb, &data, &rxfh);
+	else if (indir_reset)
+		dev->priv_flags &= ~IFF_RXFH_CONFIGURED;
+	else if (indir_mod)
+		dev->priv_flags |= IFF_RXFH_CONFIGURED;
+
+exit_unlock:
+	mutex_unlock(&dev->ethtool->rss_lock);
+	kfree(rxfh.indir);
+exit_clean_data:
+	rss_cleanup_data(&data.base);
+
+	return ret ?: mod;
+}
+
 const struct ethnl_request_ops ethnl_rss_request_ops = {
 	.request_cmd		= ETHTOOL_MSG_RSS_GET,
 	.reply_cmd		= ETHTOOL_MSG_RSS_GET_REPLY,
@@ -478,4 +669,8 @@ const struct ethnl_request_ops ethnl_rss_request_ops = {
 	.reply_size		= rss_reply_size,
 	.fill_reply		= rss_fill_reply,
 	.cleanup_data		= rss_cleanup_data,
+
+	.set_validate		= ethnl_rss_set_validate,
+	.set			= ethnl_rss_set,
+	.set_ntf_cmd		= ETHTOOL_MSG_RSS_NTF,
 };
-- 
cgit v1.2.3


From e804bd83c1fd7e1f03899c948812ebc207ac5a7e Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@google.com>
Date: Wed, 16 Jul 2025 22:08:10 +0000
Subject: neighbour: Split pneigh_lookup().

pneigh_lookup() has ASSERT_RTNL() in the middle of the function, which
is confusing.

When called with the last argument, creat, set to 0, pneigh_lookup()
only looks up a proxy neighbour entry.  This is the case for the reader
paths, i.e. the fast path and RTM_GETNEIGH.

pneigh_lookup(), however, creates a pneigh_entry when called with creat
set to 1 from RTM_NEWNEIGH and SIOCSARP, which require RTNL.

Let's split pneigh_lookup() into two functions.

We will convert all the reader paths to RCU, and read_lock_bh(&tbl->lock)
in the new pneigh_lookup() will be dropped.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20250716221221.442239-6-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/neighbour.h |  5 +++--
 net/core/neighbour.c    | 39 +++++++++++++++++++++++++++++----------
 net/ipv4/arp.c          |  4 ++--
 net/ipv6/ip6_output.c   |  2 +-
 net/ipv6/ndisc.c        |  2 +-
 5 files changed, 36 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 7e865b14749d..7f3d57da5689 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -376,10 +376,11 @@ unsigned long neigh_rand_reach_time(unsigned long base);
 void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
 		    struct sk_buff *skb);
 struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net,
-				   const void *key, struct net_device *dev,
-				   int creat);
+				   const void *key, struct net_device *dev);
 struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, struct net *net,
 				     const void *key, struct net_device *dev);
+struct pneigh_entry *pneigh_create(struct neigh_table *tbl, struct net *net,
+				   const void *key, struct net_device *dev);
 int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *key,
 		  struct net_device *dev);
 
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index ffb8d80328ed..d0e303360b2c 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -28,6 +28,7 @@
 #include <net/neighbour.h>
 #include <net/arp.h>
 #include <net/dst.h>
+#include <net/ip.h>
 #include <net/sock.h>
 #include <net/netevent.h>
 #include <net/netlink.h>
@@ -746,24 +747,44 @@ struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl,
 }
 EXPORT_SYMBOL_GPL(__pneigh_lookup);
 
-struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl,
-				    struct net *net, const void *pkey,
-				    struct net_device *dev, int creat)
+struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl,
+				   struct net *net, const void *pkey,
+				   struct net_device *dev)
+{
+	struct pneigh_entry *n;
+	unsigned int key_len;
+	u32 hash_val;
+
+	key_len = tbl->key_len;
+	hash_val = pneigh_hash(pkey, key_len);
+
+	read_lock_bh(&tbl->lock);
+	n = __pneigh_lookup_1(tbl->phash_buckets[hash_val],
+			      net, pkey, key_len, dev);
+	read_unlock_bh(&tbl->lock);
+
+	return n;
+}
+EXPORT_IPV6_MOD(pneigh_lookup);
+
+struct pneigh_entry *pneigh_create(struct neigh_table *tbl,
+				   struct net *net, const void *pkey,
+				   struct net_device *dev)
 {
 	struct pneigh_entry *n;
 	unsigned int key_len = tbl->key_len;
 	u32 hash_val = pneigh_hash(pkey, key_len);
 
+	ASSERT_RTNL();
+
 	read_lock_bh(&tbl->lock);
 	n = __pneigh_lookup_1(tbl->phash_buckets[hash_val],
 			      net, pkey, key_len, dev);
 	read_unlock_bh(&tbl->lock);
 
-	if (n || !creat)
+	if (n)
 		goto out;
 
-	ASSERT_RTNL();
-
 	n = kzalloc(sizeof(*n) + key_len, GFP_KERNEL);
 	if (!n)
 		goto out;
@@ -787,8 +808,6 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl,
 out:
 	return n;
 }
-EXPORT_SYMBOL(pneigh_lookup);
-
 
 int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey,
 		  struct net_device *dev)
@@ -2007,7 +2026,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
 		}
 
 		err = -ENOBUFS;
-		pn = pneigh_lookup(tbl, net, dst, dev, 1);
+		pn = pneigh_create(tbl, net, dst, dev);
 		if (pn) {
 			pn->flags = ndm_flags;
 			pn->permanent = !!(ndm->ndm_state & NUD_PERMANENT);
@@ -3036,7 +3055,7 @@ static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	if (ndm->ndm_flags & NTF_PROXY) {
 		struct pneigh_entry *pn;
 
-		pn = pneigh_lookup(tbl, net, dst, dev, 0);
+		pn = pneigh_lookup(tbl, net, dst, dev);
 		if (!pn) {
 			NL_SET_ERR_MSG(extack, "Proxy neighbour entry not found");
 			err = -ENOENT;
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index c0440d61cf2f..d93b5735b0ba 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -864,7 +864,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
 			    (arp_fwd_proxy(in_dev, dev, rt) ||
 			     arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
 			     (rt->dst.dev != dev &&
-			      pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) {
+			      pneigh_lookup(&arp_tbl, net, &tip, dev)))) {
 				n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
 				if (n)
 					neigh_release(n);
@@ -1089,7 +1089,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r,
 	if (mask) {
 		__be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
 
-		if (!pneigh_lookup(&arp_tbl, net, &ip, dev, 1))
+		if (!pneigh_create(&arp_tbl, net, &ip, dev))
 			return -ENOBUFS;
 		return 0;
 	}
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index fcc20c7250eb..0412f8544695 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -563,7 +563,7 @@ int ip6_forward(struct sk_buff *skb)
 
 	/* XXX: idev->cnf.proxy_ndp? */
 	if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
-	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
+	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev)) {
 		int proxied = ip6_forward_proxy_check(skb);
 		if (proxied > 0) {
 			/* It's tempting to decrease the hop limit
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index d4c5876e1771..a3ac26c1df6d 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1100,7 +1100,7 @@ static enum skb_drop_reason ndisc_recv_na(struct sk_buff *skb)
 		if (lladdr && !memcmp(lladdr, dev->dev_addr, dev->addr_len) &&
 		    READ_ONCE(net->ipv6.devconf_all->forwarding) &&
 		    READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
-		    pneigh_lookup(&nd_tbl, net, &msg->target, dev, 0)) {
+		    pneigh_lookup(&nd_tbl, net, &msg->target, dev)) {
 			/* XXX: idev->cnf.proxy_ndp */
 			goto out;
 		}
-- 
cgit v1.2.3


From d63382aea70aa4ecb516126e00930bc8ab5e55ef Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@google.com>
Date: Wed, 16 Jul 2025 22:08:11 +0000
Subject: neighbour: Annotate neigh_table.phash_buckets and pneigh_entry.next
 with __rcu.

The next patch will free pneigh_entry with call_rcu().

Then, we need to annotate neigh_table.phash_buckets[] and
pneigh_entry.next with __rcu.

To make the next patch cleaner, let's annotate the fields in advance.

Currently, all accesses to the fields are under the neigh table lock,
so rcu_dereference_protected() is used with 1 for now, but most of them
(except in pneigh_delete() and pneigh_ifdown_and_unlock()) will be
replaced with rcu_dereference() and rcu_dereference_check().

Note that pneigh_ifdown_and_unlock() changes pneigh_entry.next to a
local list, which is illegal because the RCU iterator could be moved
to another list.  This part will be fixed in the next patch.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20250716221221.442239-7-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/neighbour.h |  4 ++--
 net/core/neighbour.c    | 52 +++++++++++++++++++++++++++++--------------------
 2 files changed, 33 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 7f3d57da5689..1ddc44a04200 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -176,7 +176,7 @@ struct neigh_ops {
 };
 
 struct pneigh_entry {
-	struct pneigh_entry	*next;
+	struct pneigh_entry	__rcu *next;
 	possible_net_t		net;
 	struct net_device	*dev;
 	netdevice_tracker	dev_tracker;
@@ -236,7 +236,7 @@ struct neigh_table {
 	unsigned long		last_rand;
 	struct neigh_statistics	__percpu *stats;
 	struct neigh_hash_table __rcu *nht;
-	struct pneigh_entry	**phash_buckets;
+	struct pneigh_entry	__rcu **phash_buckets;
 };
 
 static inline int neigh_parms_family(struct neigh_parms *p)
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index d0e303360b2c..7fcb0a8d655f 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -731,7 +731,8 @@ static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n,
 		    net_eq(pneigh_net(n), net) &&
 		    (n->dev == dev || !n->dev))
 			return n;
-		n = n->next;
+
+		n = rcu_dereference_protected(n->next, 1);
 	}
 	return NULL;
 }
@@ -742,7 +743,7 @@ struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl,
 	unsigned int key_len = tbl->key_len;
 	u32 hash_val = pneigh_hash(pkey, key_len);
 
-	return __pneigh_lookup_1(tbl->phash_buckets[hash_val],
+	return __pneigh_lookup_1(rcu_dereference_protected(tbl->phash_buckets[hash_val], 1),
 				 net, pkey, key_len, dev);
 }
 EXPORT_SYMBOL_GPL(__pneigh_lookup);
@@ -759,7 +760,7 @@ struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl,
 	hash_val = pneigh_hash(pkey, key_len);
 
 	read_lock_bh(&tbl->lock);
-	n = __pneigh_lookup_1(tbl->phash_buckets[hash_val],
+	n = __pneigh_lookup_1(rcu_dereference_protected(tbl->phash_buckets[hash_val], 1),
 			      net, pkey, key_len, dev);
 	read_unlock_bh(&tbl->lock);
 
@@ -778,7 +779,7 @@ struct pneigh_entry *pneigh_create(struct neigh_table *tbl,
 	ASSERT_RTNL();
 
 	read_lock_bh(&tbl->lock);
-	n = __pneigh_lookup_1(tbl->phash_buckets[hash_val],
+	n = __pneigh_lookup_1(rcu_dereference_protected(tbl->phash_buckets[hash_val], 1),
 			      net, pkey, key_len, dev);
 	read_unlock_bh(&tbl->lock);
 
@@ -803,7 +804,7 @@ struct pneigh_entry *pneigh_create(struct neigh_table *tbl,
 
 	write_lock_bh(&tbl->lock);
 	n->next = tbl->phash_buckets[hash_val];
-	tbl->phash_buckets[hash_val] = n;
+	rcu_assign_pointer(tbl->phash_buckets[hash_val], n);
 	write_unlock_bh(&tbl->lock);
 out:
 	return n;
@@ -812,16 +813,20 @@ out:
 int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey,
 		  struct net_device *dev)
 {
-	struct pneigh_entry *n, **np;
-	unsigned int key_len = tbl->key_len;
-	u32 hash_val = pneigh_hash(pkey, key_len);
+	struct pneigh_entry *n, __rcu **np;
+	unsigned int key_len;
+	u32 hash_val;
+
+	key_len = tbl->key_len;
+	hash_val = pneigh_hash(pkey, key_len);
 
 	write_lock_bh(&tbl->lock);
-	for (np = &tbl->phash_buckets[hash_val]; (n = *np) != NULL;
+	for (np = &tbl->phash_buckets[hash_val];
+	     (n = rcu_dereference_protected(*np, 1)) != NULL;
 	     np = &n->next) {
 		if (!memcmp(n->key, pkey, key_len) && n->dev == dev &&
 		    net_eq(pneigh_net(n), net)) {
-			*np = n->next;
+			rcu_assign_pointer(*np, n->next);
 			write_unlock_bh(&tbl->lock);
 			if (tbl->pdestructor)
 				tbl->pdestructor(n);
@@ -838,17 +843,17 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,
 				    struct net_device *dev,
 				    bool skip_perm)
 {
-	struct pneigh_entry *n, **np, *freelist = NULL;
+	struct pneigh_entry *n, __rcu **np, *freelist = NULL;
 	u32 h;
 
 	for (h = 0; h <= PNEIGH_HASHMASK; h++) {
 		np = &tbl->phash_buckets[h];
-		while ((n = *np) != NULL) {
+		while ((n = rcu_dereference_protected(*np, 1)) != NULL) {
 			if (skip_perm && n->permanent)
 				goto skip;
 			if (!dev || n->dev == dev) {
-				*np = n->next;
-				n->next = freelist;
+				rcu_assign_pointer(*np, n->next);
+				rcu_assign_pointer(n->next, freelist);
 				freelist = n;
 				continue;
 			}
@@ -858,7 +863,7 @@ skip:
 	}
 	write_unlock_bh(&tbl->lock);
 	while ((n = freelist)) {
-		freelist = n->next;
+		freelist = rcu_dereference_protected(n->next, 1);
 		n->next = NULL;
 		if (tbl->pdestructor)
 			tbl->pdestructor(n);
@@ -2794,7 +2799,9 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
 	for (h = s_h; h <= PNEIGH_HASHMASK; h++) {
 		if (h > s_h)
 			s_idx = 0;
-		for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) {
+		for (n = rcu_dereference_protected(tbl->phash_buckets[h], 1), idx = 0;
+		     n;
+		     n = rcu_dereference_protected(n->next, 1)) {
 			if (idx < s_idx || pneigh_net(n) != net)
 				goto next;
 			if (neigh_ifindex_filtered(n->dev, filter->dev_idx) ||
@@ -3288,9 +3295,10 @@ static struct pneigh_entry *pneigh_get_first(struct seq_file *seq)
 
 	state->flags |= NEIGH_SEQ_IS_PNEIGH;
 	for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) {
-		pn = tbl->phash_buckets[bucket];
+		pn = rcu_dereference_protected(tbl->phash_buckets[bucket], 1);
+
 		while (pn && !net_eq(pneigh_net(pn), net))
-			pn = pn->next;
+			pn = rcu_dereference_protected(pn->next, 1);
 		if (pn)
 			break;
 	}
@@ -3308,15 +3316,17 @@ static struct pneigh_entry *pneigh_get_next(struct seq_file *seq,
 	struct neigh_table *tbl = state->tbl;
 
 	do {
-		pn = pn->next;
+		pn = rcu_dereference_protected(pn->next, 1);
 	} while (pn && !net_eq(pneigh_net(pn), net));
 
 	while (!pn) {
 		if (++state->bucket > PNEIGH_HASHMASK)
 			break;
-		pn = tbl->phash_buckets[state->bucket];
+
+		pn = rcu_dereference_protected(tbl->phash_buckets[state->bucket], 1);
+
 		while (pn && !net_eq(pneigh_net(pn), net))
-			pn = pn->next;
+			pn = rcu_dereference_protected(pn->next, 1);
 		if (pn)
 			break;
 	}
-- 
cgit v1.2.3


From d539d8fbd8fcf64a1492c51f5ee99aaa8a8dc9ab Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@google.com>
Date: Wed, 16 Jul 2025 22:08:12 +0000
Subject: neighbour: Free pneigh_entry after RCU grace period.

We will convert RTM_GETNEIGH to RCU.

neigh_get() looks up pneigh_entry by pneigh_lookup() and passes
it to pneigh_fill_info().

Then, we must ensure that the entry is alive till pneigh_fill_info()
completes, but read_lock_bh(&tbl->lock) in pneigh_lookup() does not
guarantee that.

Also, we will convert all readers of tbl->phash_buckets[] to RCU.

Let's use call_rcu() to free pneigh_entry and update phash_buckets[]
and ->next by rcu_assign_pointer().

pneigh_ifdown_and_unlock() uses list_head to avoid overwriting
->next and moving RCU iterators to another list.

pndisc_destructor() (only IPv6 ndisc uses this) uses a mutex, so it
is not delayed to call_rcu(), where we cannot sleep.  This is fine
because the mcast code works with RCU and ipv6_dev_mc_dec() frees
mcast objects after RCU grace period.

While at it, we change the return type of pneigh_ifdown_and_unlock()
to void.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20250716221221.442239-8-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/neighbour.h |  4 ++++
 net/core/neighbour.c    | 45 ++++++++++++++++++++++++++++-----------------
 2 files changed, 32 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 1ddc44a04200..6d7f9aa53a7a 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -180,6 +180,10 @@ struct pneigh_entry {
 	possible_net_t		net;
 	struct net_device	*dev;
 	netdevice_tracker	dev_tracker;
+	union {
+		struct list_head	free_node;
+		struct rcu_head		rcu;
+	};
 	u32			flags;
 	u8			protocol;
 	bool			permanent;
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 7fcb0a8d655f..fa2e60a479ef 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -54,9 +54,9 @@ static void neigh_timer_handler(struct timer_list *t);
 static void __neigh_notify(struct neighbour *n, int type, int flags,
 			   u32 pid);
 static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid);
-static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,
-				    struct net_device *dev,
-				    bool skip_perm);
+static void pneigh_ifdown_and_unlock(struct neigh_table *tbl,
+				     struct net_device *dev,
+				     bool skip_perm);
 
 #ifdef CONFIG_PROC_FS
 static const struct seq_operations neigh_stat_seq_ops;
@@ -810,6 +810,14 @@ out:
 	return n;
 }
 
+static void pneigh_destroy(struct rcu_head *rcu)
+{
+	struct pneigh_entry *n = container_of(rcu, struct pneigh_entry, rcu);
+
+	netdev_put(n->dev, &n->dev_tracker);
+	kfree(n);
+}
+
 int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey,
 		  struct net_device *dev)
 {
@@ -828,10 +836,11 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey,
 		    net_eq(pneigh_net(n), net)) {
 			rcu_assign_pointer(*np, n->next);
 			write_unlock_bh(&tbl->lock);
+
 			if (tbl->pdestructor)
 				tbl->pdestructor(n);
-			netdev_put(n->dev, &n->dev_tracker);
-			kfree(n);
+
+			call_rcu(&n->rcu, pneigh_destroy);
 			return 0;
 		}
 	}
@@ -839,11 +848,12 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey,
 	return -ENOENT;
 }
 
-static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,
-				    struct net_device *dev,
-				    bool skip_perm)
+static void pneigh_ifdown_and_unlock(struct neigh_table *tbl,
+				     struct net_device *dev,
+				     bool skip_perm)
 {
-	struct pneigh_entry *n, __rcu **np, *freelist = NULL;
+	struct pneigh_entry *n, __rcu **np;
+	LIST_HEAD(head);
 	u32 h;
 
 	for (h = 0; h <= PNEIGH_HASHMASK; h++) {
@@ -853,24 +863,25 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,
 				goto skip;
 			if (!dev || n->dev == dev) {
 				rcu_assign_pointer(*np, n->next);
-				rcu_assign_pointer(n->next, freelist);
-				freelist = n;
+				list_add(&n->free_node, &head);
 				continue;
 			}
 skip:
 			np = &n->next;
 		}
 	}
+
 	write_unlock_bh(&tbl->lock);
-	while ((n = freelist)) {
-		freelist = rcu_dereference_protected(n->next, 1);
-		n->next = NULL;
+
+	while (!list_empty(&head)) {
+		n = list_first_entry(&head, typeof(*n), free_node);
+		list_del(&n->free_node);
+
 		if (tbl->pdestructor)
 			tbl->pdestructor(n);
-		netdev_put(n->dev, &n->dev_tracker);
-		kfree(n);
+
+		call_rcu(&n->rcu, pneigh_destroy);
 	}
-	return -ENOENT;
 }
 
 static inline void neigh_parms_put(struct neigh_parms *parms)
-- 
cgit v1.2.3


From dd103c9a53752d3754a3182ec8dd97885680cfe2 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@google.com>
Date: Wed, 16 Jul 2025 22:08:17 +0000
Subject: neighbour: Remove __pneigh_lookup().

__pneigh_lookup() is the lockless version of pneigh_lookup(),
but its only caller pndisc_is_router() holds the table lock and
reads pneigh_entry.flags.

This is because accessing pneigh_entry after pneigh_lookup() was
illegal unless the caller holds RTNL or the table lock.

Now, pneigh_entry is guaranteed to be alive during the RCU critical
section.

Let's call pneigh_lookup() and use READ_ONCE() for n->flags in
pndisc_is_router() and remove __pneigh_lookup().

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20250716221221.442239-13-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/neighbour.h |  2 --
 net/core/neighbour.c    | 11 -----------
 net/ipv6/ndisc.c        |  6 ++----
 3 files changed, 2 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 6d7f9aa53a7a..f8c7261cd4eb 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -381,8 +381,6 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
 		    struct sk_buff *skb);
 struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net,
 				   const void *key, struct net_device *dev);
-struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, struct net *net,
-				     const void *key, struct net_device *dev);
 struct pneigh_entry *pneigh_create(struct neigh_table *tbl, struct net *net,
 				   const void *key, struct net_device *dev);
 int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *key,
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index b76ff416b9a7..e7bd8111f97f 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -737,17 +737,6 @@ static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n,
 	return NULL;
 }
 
-struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl,
-		struct net *net, const void *pkey, struct net_device *dev)
-{
-	unsigned int key_len = tbl->key_len;
-	u32 hash_val = pneigh_hash(pkey, key_len);
-
-	return __pneigh_lookup_1(rcu_dereference_protected(tbl->phash_buckets[hash_val], 1),
-				 net, pkey, key_len, dev);
-}
-EXPORT_SYMBOL_GPL(__pneigh_lookup);
-
 struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl,
 				   struct net *net, const void *pkey,
 				   struct net_device *dev)
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index a3ac26c1df6d..7d5abb3158ec 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -768,11 +768,9 @@ static int pndisc_is_router(const void *pkey,
 	struct pneigh_entry *n;
 	int ret = -1;
 
-	read_lock_bh(&nd_tbl.lock);
-	n = __pneigh_lookup(&nd_tbl, dev_net(dev), pkey, dev);
+	n = pneigh_lookup(&nd_tbl, dev_net(dev), pkey, dev);
 	if (n)
-		ret = !!(n->flags & NTF_ROUTER);
-	read_unlock_bh(&nd_tbl.lock);
+		ret = !!(READ_ONCE(n->flags) & NTF_ROUTER);
 
 	return ret;
 }
-- 
cgit v1.2.3


From 13a936bb99fb6385dc8620d24d7111e514448371 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@google.com>
Date: Wed, 16 Jul 2025 22:08:19 +0000
Subject: neighbour: Protect tbl->phash_buckets[] with a dedicated mutex.

tbl->phash_buckets[] is only modified in the slow path by pneigh_create()
and pneigh_delete() under the table lock.

Both of them are called under RTNL, so no extra lock is needed, but we
will remove RTNL from the paths.

pneigh_create() looks up a pneigh_entry, and this part can be lockless,
but it would complicate the logic like

  1. lookup
  2. allocate pneigh_entry with GFP_KERNEL
  3. lookup again but under lock
  4. if found, return it after freeing the allocated memory
  5. else, return the new one

Instead, let's add a per-table mutex and run lookup and allocation
under it.

Note that updating pneigh_entry part in neigh_add() is still protected
by RTNL and will be moved to pneigh_create() in the next patch.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20250716221221.442239-15-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/neighbour.h |  1 +
 net/core/neighbour.c    | 39 +++++++++++++++++++++------------------
 2 files changed, 22 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index f8c7261cd4eb..f333f9ebc425 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -240,6 +240,7 @@ struct neigh_table {
 	unsigned long		last_rand;
 	struct neigh_statistics	__percpu *stats;
 	struct neigh_hash_table __rcu *nht;
+	struct mutex		phash_lock;
 	struct pneigh_entry	__rcu **phash_buckets;
 };
 
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 38f0067068c5..d312b6323ff2 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -54,9 +54,8 @@ static void neigh_timer_handler(struct timer_list *t);
 static void __neigh_notify(struct neighbour *n, int type, int flags,
 			   u32 pid);
 static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid);
-static void pneigh_ifdown_and_unlock(struct neigh_table *tbl,
-				     struct net_device *dev,
-				     bool skip_perm);
+static void pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev,
+			  bool skip_perm);
 
 #ifdef CONFIG_PROC_FS
 static const struct seq_operations neigh_stat_seq_ops;
@@ -437,7 +436,9 @@ static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev,
 {
 	write_lock_bh(&tbl->lock);
 	neigh_flush_dev(tbl, dev, skip_perm);
-	pneigh_ifdown_and_unlock(tbl, dev, skip_perm);
+	write_unlock_bh(&tbl->lock);
+
+	pneigh_ifdown(tbl, dev, skip_perm);
 	pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL,
 			   tbl->family);
 	if (skb_queue_empty_lockless(&tbl->proxy_queue))
@@ -731,7 +732,7 @@ struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl,
 	key_len = tbl->key_len;
 	hash_val = pneigh_hash(pkey, key_len);
 	n = rcu_dereference_check(tbl->phash_buckets[hash_val],
-				  lockdep_is_held(&tbl->lock));
+				  lockdep_is_held(&tbl->phash_lock));
 
 	while (n) {
 		if (!memcmp(n->key, pkey, key_len) &&
@@ -739,7 +740,7 @@ struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl,
 		    (n->dev == dev || !n->dev))
 			return n;
 
-		n = rcu_dereference_check(n->next, lockdep_is_held(&tbl->lock));
+		n = rcu_dereference_check(n->next, lockdep_is_held(&tbl->phash_lock));
 	}
 
 	return NULL;
@@ -754,11 +755,9 @@ struct pneigh_entry *pneigh_create(struct neigh_table *tbl,
 	unsigned int key_len;
 	u32 hash_val;
 
-	ASSERT_RTNL();
+	mutex_lock(&tbl->phash_lock);
 
-	read_lock_bh(&tbl->lock);
 	n = pneigh_lookup(tbl, net, pkey, dev);
-	read_unlock_bh(&tbl->lock);
 	if (n)
 		goto out;
 
@@ -780,11 +779,10 @@ struct pneigh_entry *pneigh_create(struct neigh_table *tbl,
 	}
 
 	hash_val = pneigh_hash(pkey, key_len);
-	write_lock_bh(&tbl->lock);
 	n->next = tbl->phash_buckets[hash_val];
 	rcu_assign_pointer(tbl->phash_buckets[hash_val], n);
-	write_unlock_bh(&tbl->lock);
 out:
+	mutex_unlock(&tbl->phash_lock);
 	return n;
 }
 
@@ -806,14 +804,16 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey,
 	key_len = tbl->key_len;
 	hash_val = pneigh_hash(pkey, key_len);
 
-	write_lock_bh(&tbl->lock);
+	mutex_lock(&tbl->phash_lock);
+
 	for (np = &tbl->phash_buckets[hash_val];
 	     (n = rcu_dereference_protected(*np, 1)) != NULL;
 	     np = &n->next) {
 		if (!memcmp(n->key, pkey, key_len) && n->dev == dev &&
 		    net_eq(pneigh_net(n), net)) {
 			rcu_assign_pointer(*np, n->next);
-			write_unlock_bh(&tbl->lock);
+
+			mutex_unlock(&tbl->phash_lock);
 
 			if (tbl->pdestructor)
 				tbl->pdestructor(n);
@@ -822,18 +822,20 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey,
 			return 0;
 		}
 	}
-	write_unlock_bh(&tbl->lock);
+
+	mutex_unlock(&tbl->phash_lock);
 	return -ENOENT;
 }
 
-static void pneigh_ifdown_and_unlock(struct neigh_table *tbl,
-				     struct net_device *dev,
-				     bool skip_perm)
+static void pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev,
+			  bool skip_perm)
 {
 	struct pneigh_entry *n, __rcu **np;
 	LIST_HEAD(head);
 	u32 h;
 
+	mutex_lock(&tbl->phash_lock);
+
 	for (h = 0; h <= PNEIGH_HASHMASK; h++) {
 		np = &tbl->phash_buckets[h];
 		while ((n = rcu_dereference_protected(*np, 1)) != NULL) {
@@ -849,7 +851,7 @@ skip:
 		}
 	}
 
-	write_unlock_bh(&tbl->lock);
+	mutex_unlock(&tbl->phash_lock);
 
 	while (!list_empty(&head)) {
 		n = list_first_entry(&head, typeof(*n), free_node);
@@ -1796,6 +1798,7 @@ void neigh_table_init(int index, struct neigh_table *tbl)
 		WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN);
 
 	rwlock_init(&tbl->lock);
+	mutex_init(&tbl->phash_lock);
 
 	INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work);
 	queue_delayed_work(system_power_efficient_wq, &tbl->gc_work,
-- 
cgit v1.2.3


From dc2a27e524ac13e7a599bc693934ed81f868dc2d Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@google.com>
Date: Wed, 16 Jul 2025 22:08:20 +0000
Subject: neighbour: Update pneigh_entry in pneigh_create().

neigh_add() updates the pneigh_entry found or created by pneigh_create().

This update is serialised by RTNL, but we will remove it.

Let's move the update part to pneigh_create() and make it return errno
instead of a pointer of pneigh_entry.

Now, the pneigh code is RTNL free.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20250716221221.442239-16-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/neighbour.h |  5 +++--
 net/core/neighbour.c    | 34 ++++++++++++++++------------------
 net/ipv4/arp.c          |  4 +---
 3 files changed, 20 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index f333f9ebc425..4a30bd458c5a 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -382,8 +382,9 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
 		    struct sk_buff *skb);
 struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net,
 				   const void *key, struct net_device *dev);
-struct pneigh_entry *pneigh_create(struct neigh_table *tbl, struct net *net,
-				   const void *key, struct net_device *dev);
+int pneigh_create(struct neigh_table *tbl, struct net *net, const void *key,
+		  struct net_device *dev, u32 flags, u8 protocol,
+		  bool permanent);
 int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *key,
 		  struct net_device *dev);
 
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index d312b6323ff2..4316ca3d9872 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -747,24 +747,27 @@ struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl,
 }
 EXPORT_IPV6_MOD(pneigh_lookup);
 
-struct pneigh_entry *pneigh_create(struct neigh_table *tbl,
-				   struct net *net, const void *pkey,
-				   struct net_device *dev)
+int pneigh_create(struct neigh_table *tbl, struct net *net,
+		  const void *pkey, struct net_device *dev,
+		  u32 flags, u8 protocol, bool permanent)
 {
 	struct pneigh_entry *n;
 	unsigned int key_len;
 	u32 hash_val;
+	int err = 0;
 
 	mutex_lock(&tbl->phash_lock);
 
 	n = pneigh_lookup(tbl, net, pkey, dev);
 	if (n)
-		goto out;
+		goto update;
 
 	key_len = tbl->key_len;
 	n = kzalloc(sizeof(*n) + key_len, GFP_KERNEL);
-	if (!n)
+	if (!n) {
+		err = -ENOBUFS;
 		goto out;
+	}
 
 	write_pnet(&n->net, net);
 	memcpy(n->key, pkey, key_len);
@@ -774,16 +777,20 @@ struct pneigh_entry *pneigh_create(struct neigh_table *tbl,
 	if (tbl->pconstructor && tbl->pconstructor(n)) {
 		netdev_put(dev, &n->dev_tracker);
 		kfree(n);
-		n = NULL;
+		err = -ENOBUFS;
 		goto out;
 	}
 
 	hash_val = pneigh_hash(pkey, key_len);
 	n->next = tbl->phash_buckets[hash_val];
 	rcu_assign_pointer(tbl->phash_buckets[hash_val], n);
+update:
+	WRITE_ONCE(n->flags, flags);
+	n->permanent = permanent;
+	WRITE_ONCE(n->protocol, protocol);
 out:
 	mutex_unlock(&tbl->phash_lock);
-	return n;
+	return err;
 }
 
 static void pneigh_destroy(struct rcu_head *rcu)
@@ -2015,22 +2022,13 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (tb[NDA_PROTOCOL])
 		protocol = nla_get_u8(tb[NDA_PROTOCOL]);
 	if (ndm_flags & NTF_PROXY) {
-		struct pneigh_entry *pn;
-
 		if (ndm_flags & (NTF_MANAGED | NTF_EXT_VALIDATED)) {
 			NL_SET_ERR_MSG(extack, "Invalid NTF_* flag combination");
 			goto out;
 		}
 
-		err = -ENOBUFS;
-		pn = pneigh_create(tbl, net, dst, dev);
-		if (pn) {
-			WRITE_ONCE(pn->flags, ndm_flags);
-			pn->permanent = !!(ndm->ndm_state & NUD_PERMANENT);
-			if (protocol)
-				WRITE_ONCE(pn->protocol, protocol);
-			err = 0;
-		}
+		err = pneigh_create(tbl, net, dst, dev, ndm_flags, protocol,
+				    !!(ndm->ndm_state & NUD_PERMANENT));
 		goto out;
 	}
 
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index d93b5735b0ba..5cfc1c939673 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1089,9 +1089,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r,
 	if (mask) {
 		__be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
 
-		if (!pneigh_create(&arp_tbl, net, &ip, dev))
-			return -ENOBUFS;
-		return 0;
+		return pneigh_create(&arp_tbl, net, &ip, dev, 0, 0, false);
 	}
 
 	return arp_req_set_proxy(net, dev, 1);
-- 
cgit v1.2.3


From 2d8ae9a4f1bc04a118e3d438ac50dd49281b34fd Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 11 Jul 2025 11:55:14 +0300
Subject: string: Group str_has_prefix() and strstarts()

str_has_prefix() and strstarts() do nearly the same thing and
differ only in what they return. Group them together in
the header.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20250711085514.1294428-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Kees Cook <kees@kernel.org>
---
 include/linux/string.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/string.h b/include/linux/string.h
index 01621ad0f598..fdd3442c6bcb 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -345,16 +345,6 @@ extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos,
 
 int ptr_to_hashval(const void *ptr, unsigned long *hashval_out);
 
-/**
- * strstarts - does @str start with @prefix?
- * @str: string to examine
- * @prefix: prefix to look for.
- */
-static inline bool strstarts(const char *str, const char *prefix)
-{
-	return strncmp(str, prefix, strlen(prefix)) == 0;
-}
-
 size_t memweight(const void *ptr, size_t bytes);
 
 /**
@@ -562,4 +552,14 @@ static __always_inline size_t str_has_prefix(const char *str, const char *prefix
 	return strncmp(str, prefix, len) == 0 ? len : 0;
 }
 
+/**
+ * strstarts - does @str start with @prefix?
+ * @str: string to examine
+ * @prefix: prefix to look for.
+ */
+static inline bool strstarts(const char *str, const char *prefix)
+{
+	return strncmp(str, prefix, strlen(prefix)) == 0;
+}
+
 #endif /* _LINUX_STRING_H_ */
-- 
cgit v1.2.3


From b2df55a98672f4be076ff69d0f0d0b1fc81f2044 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 17 Jul 2025 09:30:36 -0700
Subject: cleanup: Fix documentation build error for ACQUIRE updates

Stephen reports:

Documentation/core-api/cleanup:7: include/linux/cleanup.h:73: ERROR: Unexpected indentation. [docutils]
Documentation/core-api/cleanup:7: include/linux/cleanup.h:74: WARNING: Block quote ends without a blank line; unexpected unindent. [docutils]

Which points out that the ACQUIRE() example in cleanup.h missed the "::"
suffix to mark the following text as a code-block.

Fixes: 857d18f23ab1 ("cleanup: Introduce ACQUIRE() and ACQUIRE_ERR() for conditional locks")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Closes: http://lore.kernel.org/20250717173354.34375751@canb.auug.org.au
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://patch.msgid.link/20250717163036.1275791-1-dan.j.williams@intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 include/linux/cleanup.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h
index 4eb83dd71cfe..0fb796db4811 100644
--- a/include/linux/cleanup.h
+++ b/include/linux/cleanup.h
@@ -64,8 +64,7 @@
  * the remainder of "func()".
  *
  * The ACQUIRE() macro can be used in all places that guard() can be
- * used and additionally support conditional locks
- *
+ * used and additionally support conditional locks::
  *
  *	DEFINE_GUARD_COND(pci_dev, _try, pci_dev_trylock(_T))
  *	...
-- 
cgit v1.2.3


From 4e301d858af17ae2ce56886296e5458c5a08219a Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Sat, 7 Jun 2025 13:53:03 +0200
Subject: fs: constify file ptr in backing_file accessor helpers

Add internal helper backing_file_set_user_path() for the only
two cases that need to modify backing_file fields.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/20250607115304.2521155-2-amir73il@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/backing-file.c  |  4 ++--
 fs/file_table.c    | 13 ++++++++-----
 fs/internal.h      |  1 +
 include/linux/fs.h |  6 +++---
 4 files changed, 14 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/fs/backing-file.c b/fs/backing-file.c
index 763fbe9b72b2..8c7396bff121 100644
--- a/fs/backing-file.c
+++ b/fs/backing-file.c
@@ -41,7 +41,7 @@ struct file *backing_file_open(const struct path *user_path, int flags,
 		return f;
 
 	path_get(user_path);
-	*backing_file_user_path(f) = *user_path;
+	backing_file_set_user_path(f, user_path);
 	error = vfs_open(real_path, f);
 	if (error) {
 		fput(f);
@@ -65,7 +65,7 @@ struct file *backing_tmpfile_open(const struct path *user_path, int flags,
 		return f;
 
 	path_get(user_path);
-	*backing_file_user_path(f) = *user_path;
+	backing_file_set_user_path(f, user_path);
 	error = vfs_tmpfile(real_idmap, real_parentpath, f, mode);
 	if (error) {
 		fput(f);
diff --git a/fs/file_table.c b/fs/file_table.c
index 138114d64307..f09d79a98111 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -52,17 +52,20 @@ struct backing_file {
 	};
 };
 
-static inline struct backing_file *backing_file(struct file *f)
-{
-	return container_of(f, struct backing_file, file);
-}
+#define backing_file(f) container_of(f, struct backing_file, file)
 
-struct path *backing_file_user_path(struct file *f)
+struct path *backing_file_user_path(const struct file *f)
 {
 	return &backing_file(f)->user_path;
 }
 EXPORT_SYMBOL_GPL(backing_file_user_path);
 
+void backing_file_set_user_path(struct file *f, const struct path *path)
+{
+	backing_file(f)->user_path = *path;
+}
+EXPORT_SYMBOL_GPL(backing_file_set_user_path);
+
 static inline void file_free(struct file *f)
 {
 	security_file_free(f);
diff --git a/fs/internal.h b/fs/internal.h
index 393f6c5c24f6..d733d8bb3d1f 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -101,6 +101,7 @@ extern void chroot_fs_refs(const struct path *, const struct path *);
 struct file *alloc_empty_file(int flags, const struct cred *cred);
 struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred);
 struct file *alloc_empty_backing_file(int flags, const struct cred *cred);
+void backing_file_set_user_path(struct file *f, const struct path *path);
 
 static inline void file_put_write_access(struct file *file)
 {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 1d9586a78041..8116b1080457 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2864,7 +2864,7 @@ struct file *dentry_open_nonotify(const struct path *path, int flags,
 				  const struct cred *cred);
 struct file *dentry_create(const struct path *path, int flags, umode_t mode,
 			   const struct cred *cred);
-struct path *backing_file_user_path(struct file *f);
+struct path *backing_file_user_path(const struct file *f);
 
 /*
  * When mmapping a file on a stackable filesystem (e.g., overlayfs), the file
@@ -2876,14 +2876,14 @@ struct path *backing_file_user_path(struct file *f);
  * by fstat() on that same fd.
  */
 /* Get the path to display in /proc/<pid>/maps */
-static inline const struct path *file_user_path(struct file *f)
+static inline const struct path *file_user_path(const struct file *f)
 {
 	if (unlikely(f->f_mode & FMODE_BACKING))
 		return backing_file_user_path(f);
 	return &f->f_path;
 }
 /* Get the inode whose inode number to display in /proc/<pid>/maps */
-static inline const struct inode *file_user_inode(struct file *f)
+static inline const struct inode *file_user_inode(const struct file *f)
 {
 	if (unlikely(f->f_mode & FMODE_BACKING))
 		return d_inode(backing_file_user_path(f)->dentry);
-- 
cgit v1.2.3


From 962ddc5a7a4b04c007bba0f3e7298cda13c62efd Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Tue, 8 Jul 2025 17:59:54 -0700
Subject: crypto: acomp - Fix CFI failure due to type punning

To avoid a crash when control flow integrity is enabled, make the
workspace ("stream") free function use a consistent type, and call it
through a function pointer that has that same type.

Fixes: 42d9f6c77479 ("crypto: acomp - Move scomp stream allocation code into acomp")
Cc: stable@vger.kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
Reviewed-by: Giovanni Cabiddu <giovanni.cabiddu@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/deflate.c                    | 7 ++++++-
 crypto/zstd.c                       | 7 ++++++-
 include/crypto/internal/acompress.h | 5 +----
 3 files changed, 13 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/crypto/deflate.c b/crypto/deflate.c
index fe8e4ad0fee1..21404515dc77 100644
--- a/crypto/deflate.c
+++ b/crypto/deflate.c
@@ -48,9 +48,14 @@ static void *deflate_alloc_stream(void)
 	return ctx;
 }
 
+static void deflate_free_stream(void *ctx)
+{
+	kvfree(ctx);
+}
+
 static struct crypto_acomp_streams deflate_streams = {
 	.alloc_ctx = deflate_alloc_stream,
-	.cfree_ctx = kvfree,
+	.free_ctx = deflate_free_stream,
 };
 
 static int deflate_compress_one(struct acomp_req *req,
diff --git a/crypto/zstd.c b/crypto/zstd.c
index ebeadc1f3b5f..c2a19cb0879d 100644
--- a/crypto/zstd.c
+++ b/crypto/zstd.c
@@ -54,9 +54,14 @@ static void *zstd_alloc_stream(void)
 	return ctx;
 }
 
+static void zstd_free_stream(void *ctx)
+{
+	kvfree(ctx);
+}
+
 static struct crypto_acomp_streams zstd_streams = {
 	.alloc_ctx = zstd_alloc_stream,
-	.cfree_ctx = kvfree,
+	.free_ctx = zstd_free_stream,
 };
 
 static int zstd_init(struct crypto_acomp *acomp_tfm)
diff --git a/include/crypto/internal/acompress.h b/include/crypto/internal/acompress.h
index ffffd88bbbad..2d97440028ff 100644
--- a/include/crypto/internal/acompress.h
+++ b/include/crypto/internal/acompress.h
@@ -63,10 +63,7 @@ struct crypto_acomp_stream {
 struct crypto_acomp_streams {
 	/* These must come first because of struct scomp_alg. */
 	void *(*alloc_ctx)(void);
-	union {
-		void (*free_ctx)(void *);
-		void (*cfree_ctx)(const void *);
-	};
+	void (*free_ctx)(void *);
 
 	struct crypto_acomp_stream __percpu *streams;
 	struct work_struct stream_work;
-- 
cgit v1.2.3


From c470ffa6f48619e8ea2442206b31b7965f8826a5 Mon Sep 17 00:00:00 2001
From: Ovidiu Panait <ovidiu.panait.oss@gmail.com>
Date: Fri, 11 Jul 2025 21:29:31 +0300
Subject: crypto: engine - remove request batching support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove request batching support from crypto_engine, as there are no
drivers using this feature and it doesn't really work that well.

Instead of doing batching based on backlog, a more optimal approach
would be for the user to handle the batching (similar to how IPsec
can hook into GSO to get 64K of data each time or how block encryption
can use unit sizes much greater than 4K).

Suggested-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Ovidiu Panait <ovidiu.panait.oss@gmail.com>
Reviewed-by: Horia Geantă <horia.geanta@nxp.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/s390/crypto/paes_s390.c               |  2 +-
 arch/s390/crypto/phmac_s390.c              |  2 +-
 crypto/crypto_engine.c                     | 25 +------------------------
 drivers/crypto/caam/jr.c                   |  3 +--
 drivers/crypto/virtio/virtio_crypto_core.c |  2 +-
 include/crypto/engine.h                    |  1 -
 include/crypto/internal/engine.h           |  4 ----
 7 files changed, 5 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/arch/s390/crypto/paes_s390.c b/arch/s390/crypto/paes_s390.c
index 8a340c16acb4..a624a43a2b54 100644
--- a/arch/s390/crypto/paes_s390.c
+++ b/arch/s390/crypto/paes_s390.c
@@ -1633,7 +1633,7 @@ static int __init paes_s390_init(void)
 	/* with this pseudo devie alloc and start a crypto engine */
 	paes_crypto_engine =
 		crypto_engine_alloc_init_and_set(paes_dev.this_device,
-						 true, NULL, false, MAX_QLEN);
+						 true, false, MAX_QLEN);
 	if (!paes_crypto_engine) {
 		rc = -ENOMEM;
 		goto out_err;
diff --git a/arch/s390/crypto/phmac_s390.c b/arch/s390/crypto/phmac_s390.c
index 90602f72108f..7ecfdc4fba2d 100644
--- a/arch/s390/crypto/phmac_s390.c
+++ b/arch/s390/crypto/phmac_s390.c
@@ -1006,7 +1006,7 @@ static int __init s390_phmac_init(void)
 	/* with this pseudo device alloc and start a crypto engine */
 	phmac_crypto_engine =
 		crypto_engine_alloc_init_and_set(phmac_dev.this_device,
-						 true, NULL, false, MAX_QLEN);
+						 true, false, MAX_QLEN);
 	if (!phmac_crypto_engine) {
 		rc = -ENOMEM;
 		goto out_err;
diff --git a/crypto/crypto_engine.c b/crypto/crypto_engine.c
index 445d3c113ee1..8a2400f240d4 100644
--- a/crypto/crypto_engine.c
+++ b/crypto/crypto_engine.c
@@ -195,17 +195,6 @@ retry:
 out:
 	spin_unlock_irqrestore(&engine->queue_lock, flags);
 
-	/*
-	 * Batch requests is possible only if
-	 * hardware can enqueue multiple requests
-	 */
-	if (engine->do_batch_requests) {
-		ret = engine->do_batch_requests(engine);
-		if (ret)
-			dev_err(engine->dev, "failed to do batch requests: %d\n",
-				ret);
-	}
-
 	return;
 }
 
@@ -462,12 +451,6 @@ EXPORT_SYMBOL_GPL(crypto_engine_stop);
  * crypto-engine queue.
  * @dev: the device attached with one hardware engine
  * @retry_support: whether hardware has support for retry mechanism
- * @cbk_do_batch: pointer to a callback function to be invoked when executing
- *                a batch of requests.
- *                This has the form:
- *                callback(struct crypto_engine *engine)
- *                where:
- *                engine: the crypto engine structure.
  * @rt: whether this queue is set to run as a realtime task
  * @qlen: maximum size of the crypto-engine queue
  *
@@ -476,7 +459,6 @@ EXPORT_SYMBOL_GPL(crypto_engine_stop);
  */
 struct crypto_engine *crypto_engine_alloc_init_and_set(struct device *dev,
 						       bool retry_support,
-						       int (*cbk_do_batch)(struct crypto_engine *engine),
 						       bool rt, int qlen)
 {
 	struct crypto_engine *engine;
@@ -495,11 +477,6 @@ struct crypto_engine *crypto_engine_alloc_init_and_set(struct device *dev,
 	engine->idling = false;
 	engine->retry_support = retry_support;
 	engine->priv_data = dev;
-	/*
-	 * Batch requests is possible only if
-	 * hardware has support for retry mechanism.
-	 */
-	engine->do_batch_requests = retry_support ? cbk_do_batch : NULL;
 
 	snprintf(engine->name, sizeof(engine->name),
 		 "%s-engine", dev_name(dev));
@@ -534,7 +511,7 @@ EXPORT_SYMBOL_GPL(crypto_engine_alloc_init_and_set);
  */
 struct crypto_engine *crypto_engine_alloc_init(struct device *dev, bool rt)
 {
-	return crypto_engine_alloc_init_and_set(dev, false, NULL, rt,
+	return crypto_engine_alloc_init_and_set(dev, false, rt,
 						CRYPTO_ENGINE_MAX_QLEN);
 }
 EXPORT_SYMBOL_GPL(crypto_engine_alloc_init);
diff --git a/drivers/crypto/caam/jr.c b/drivers/crypto/caam/jr.c
index 9fcdb64084ac..0ef00df9730e 100644
--- a/drivers/crypto/caam/jr.c
+++ b/drivers/crypto/caam/jr.c
@@ -629,8 +629,7 @@ static int caam_jr_probe(struct platform_device *pdev)
 	}
 
 	/* Initialize crypto engine */
-	jrpriv->engine = crypto_engine_alloc_init_and_set(jrdev, true, NULL,
-							  false,
+	jrpriv->engine = crypto_engine_alloc_init_and_set(jrdev, true, false,
 							  CRYPTO_ENGINE_MAX_QLEN);
 	if (!jrpriv->engine) {
 		dev_err(jrdev, "Could not init crypto-engine\n");
diff --git a/drivers/crypto/virtio/virtio_crypto_core.c b/drivers/crypto/virtio/virtio_crypto_core.c
index 0d522049f595..3d241446099c 100644
--- a/drivers/crypto/virtio/virtio_crypto_core.c
+++ b/drivers/crypto/virtio/virtio_crypto_core.c
@@ -139,7 +139,7 @@ static int virtcrypto_find_vqs(struct virtio_crypto *vi)
 		spin_lock_init(&vi->data_vq[i].lock);
 		vi->data_vq[i].vq = vqs[i];
 		/* Initialize crypto engine */
-		vi->data_vq[i].engine = crypto_engine_alloc_init_and_set(dev, true, NULL, true,
+		vi->data_vq[i].engine = crypto_engine_alloc_init_and_set(dev, true, true,
 						virtqueue_get_vring_size(vqs[i]));
 		if (!vi->data_vq[i].engine) {
 			ret = -ENOMEM;
diff --git a/include/crypto/engine.h b/include/crypto/engine.h
index 545dbefe3e13..2e60344437da 100644
--- a/include/crypto/engine.h
+++ b/include/crypto/engine.h
@@ -76,7 +76,6 @@ int crypto_engine_stop(struct crypto_engine *engine);
 struct crypto_engine *crypto_engine_alloc_init(struct device *dev, bool rt);
 struct crypto_engine *crypto_engine_alloc_init_and_set(struct device *dev,
 						       bool retry_support,
-						       int (*cbk_do_batch)(struct crypto_engine *engine),
 						       bool rt, int qlen);
 void crypto_engine_exit(struct crypto_engine *engine);
 
diff --git a/include/crypto/internal/engine.h b/include/crypto/internal/engine.h
index b6a4ea2240fc..8da1a13619c9 100644
--- a/include/crypto/internal/engine.h
+++ b/include/crypto/internal/engine.h
@@ -37,8 +37,6 @@ struct device;
  * @unprepare_crypt_hardware: there are currently no more requests on the
  * queue so the subsystem notifies the driver that it may relax the
  * hardware by issuing this call
- * @do_batch_requests: execute a batch of requests. Depends on multiple
- * requests support.
  * @kworker: kthread worker struct for request pump
  * @pump_requests: work struct for scheduling work to the request pump
  * @priv_data: the engine private data
@@ -60,8 +58,6 @@ struct crypto_engine {
 
 	int (*prepare_crypt_hardware)(struct crypto_engine *engine);
 	int (*unprepare_crypt_hardware)(struct crypto_engine *engine);
-	int (*do_batch_requests)(struct crypto_engine *engine);
-
 
 	struct kthread_worker           *kworker;
 	struct kthread_work             pump_requests;
-- 
cgit v1.2.3


From 5eb32430df783e212ffed8d35cc494a8941cda0a Mon Sep 17 00:00:00 2001
From: Ovidiu Panait <ovidiu.panait.oss@gmail.com>
Date: Fri, 11 Jul 2025 21:29:32 +0300
Subject: crypto: engine - remove {prepare,unprepare}_crypt_hardware callbacks

The {prepare,unprepare}_crypt_hardware callbacks were added back in 2016
by commit 735d37b5424b ("crypto: engine - Introduce the block request
crypto engine framework"), but they were never implemented by any driver.
Remove them as they are unused.

Since the 'engine->idling' and 'was_busy' flags are no longer needed,
remove them as well.

Signed-off-by: Ovidiu Panait <ovidiu.panait.oss@gmail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 Documentation/crypto/crypto_engine.rst |  6 ------
 crypto/crypto_engine.c                 | 30 +-----------------------------
 include/crypto/internal/engine.h       | 11 -----------
 3 files changed, 1 insertion(+), 46 deletions(-)

(limited to 'include')

diff --git a/Documentation/crypto/crypto_engine.rst b/Documentation/crypto/crypto_engine.rst
index d562ea17d994..7ef850e28016 100644
--- a/Documentation/crypto/crypto_engine.rst
+++ b/Documentation/crypto/crypto_engine.rst
@@ -36,12 +36,6 @@ engine using ``crypto_engine_stop()`` and destroy the engine with
 Before transferring any request, you have to fill the context enginectx by
 providing functions for the following:
 
-* ``prepare_crypt_hardware``: Called once before any prepare functions are
-  called.
-
-* ``unprepare_crypt_hardware``: Called once after all unprepare functions have
-  been called.
-
 * ``prepare_cipher_request``/``prepare_hash_request``: Called before each
   corresponding request is performed. If some processing or other preparatory
   work is required, do it here.
diff --git a/crypto/crypto_engine.c b/crypto/crypto_engine.c
index 8a2400f240d4..18e1689efe12 100644
--- a/crypto/crypto_engine.c
+++ b/crypto/crypto_engine.c
@@ -74,7 +74,6 @@ static void crypto_pump_requests(struct crypto_engine *engine,
 	struct crypto_engine_alg *alg;
 	struct crypto_engine_op *op;
 	unsigned long flags;
-	bool was_busy = false;
 	int ret;
 
 	spin_lock_irqsave(&engine->queue_lock, flags);
@@ -83,12 +82,6 @@ static void crypto_pump_requests(struct crypto_engine *engine,
 	if (!engine->retry_support && engine->cur_req)
 		goto out;
 
-	/* If another context is idling then defer */
-	if (engine->idling) {
-		kthread_queue_work(engine->kworker, &engine->pump_requests);
-		goto out;
-	}
-
 	/* Check if the engine queue is idle */
 	if (!crypto_queue_len(&engine->queue) || !engine->running) {
 		if (!engine->busy)
@@ -102,15 +95,6 @@ static void crypto_pump_requests(struct crypto_engine *engine,
 		}
 
 		engine->busy = false;
-		engine->idling = true;
-		spin_unlock_irqrestore(&engine->queue_lock, flags);
-
-		if (engine->unprepare_crypt_hardware &&
-		    engine->unprepare_crypt_hardware(engine))
-			dev_err(engine->dev, "failed to unprepare crypt hardware\n");
-
-		spin_lock_irqsave(&engine->queue_lock, flags);
-		engine->idling = false;
 		goto out;
 	}
 
@@ -129,22 +113,11 @@ start_request:
 	if (!engine->retry_support)
 		engine->cur_req = async_req;
 
-	if (engine->busy)
-		was_busy = true;
-	else
+	if (!engine->busy)
 		engine->busy = true;
 
 	spin_unlock_irqrestore(&engine->queue_lock, flags);
 
-	/* Until here we get the request need to be encrypted successfully */
-	if (!was_busy && engine->prepare_crypt_hardware) {
-		ret = engine->prepare_crypt_hardware(engine);
-		if (ret) {
-			dev_err(engine->dev, "failed to prepare crypt hardware\n");
-			goto req_err_1;
-		}
-	}
-
 	alg = container_of(async_req->tfm->__crt_alg,
 			   struct crypto_engine_alg, base);
 	op = &alg->op;
@@ -474,7 +447,6 @@ struct crypto_engine *crypto_engine_alloc_init_and_set(struct device *dev,
 	engine->rt = rt;
 	engine->running = false;
 	engine->busy = false;
-	engine->idling = false;
 	engine->retry_support = retry_support;
 	engine->priv_data = dev;
 
diff --git a/include/crypto/internal/engine.h b/include/crypto/internal/engine.h
index 8da1a13619c9..f19ef376833f 100644
--- a/include/crypto/internal/engine.h
+++ b/include/crypto/internal/engine.h
@@ -21,7 +21,6 @@ struct device;
 /*
  * struct crypto_engine - crypto hardware engine
  * @name: the engine name
- * @idling: the engine is entering idle state
  * @busy: request pump is busy
  * @running: the engine is on working
  * @retry_support: indication that the hardware allows re-execution
@@ -31,12 +30,6 @@ struct device;
  * @list: link with the global crypto engine list
  * @queue_lock: spinlock to synchronise access to request queue
  * @queue: the crypto queue of the engine
- * @prepare_crypt_hardware: a request will soon arrive from the queue
- * so the subsystem requests the driver to prepare the hardware
- * by issuing this call
- * @unprepare_crypt_hardware: there are currently no more requests on the
- * queue so the subsystem notifies the driver that it may relax the
- * hardware by issuing this call
  * @kworker: kthread worker struct for request pump
  * @pump_requests: work struct for scheduling work to the request pump
  * @priv_data: the engine private data
@@ -44,7 +37,6 @@ struct device;
  */
 struct crypto_engine {
 	char			name[ENGINE_NAME_LEN];
-	bool			idling;
 	bool			busy;
 	bool			running;
 
@@ -56,9 +48,6 @@ struct crypto_engine {
 	struct crypto_queue	queue;
 	struct device		*dev;
 
-	int (*prepare_crypt_hardware)(struct crypto_engine *engine);
-	int (*unprepare_crypt_hardware)(struct crypto_engine *engine);
-
 	struct kthread_worker           *kworker;
 	struct kthread_work             pump_requests;
 
-- 
cgit v1.2.3


From 9b7fc3f14576c268f62fe0b882fac5e61239b659 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Date: Tue, 1 Jul 2025 10:58:04 +0200
Subject: vdso: Introduce aux_clock_resolution_ns()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move the constant resolution to a shared header,
so the vDSO can use it and return it without going through a syscall.

Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/20250701-vdso-auxclock-v1-10-df7d9f87b9b8@linutronix.de
---
 include/vdso/auxclock.h   | 13 +++++++++++++
 kernel/time/timekeeping.c |  6 ++++--
 2 files changed, 17 insertions(+), 2 deletions(-)
 create mode 100644 include/vdso/auxclock.h

(limited to 'include')

diff --git a/include/vdso/auxclock.h b/include/vdso/auxclock.h
new file mode 100644
index 000000000000..6d6e74cbc400
--- /dev/null
+++ b/include/vdso/auxclock.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _VDSO_AUXCLOCK_H
+#define _VDSO_AUXCLOCK_H
+
+#include <uapi/linux/time.h>
+#include <uapi/linux/types.h>
+
+static __always_inline u64 aux_clock_resolution_ns(void)
+{
+	return 1;
+}
+
+#endif /* _VDSO_AUXCLOCK_H */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index c6fe89bded02..cbcf090bb4be 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -26,6 +26,8 @@
 #include <linux/audit.h>
 #include <linux/random.h>
 
+#include <vdso/auxclock.h>
+
 #include "tick-internal.h"
 #include "ntp_internal.h"
 #include "timekeeping_internal.h"
@@ -2876,8 +2878,8 @@ static int aux_get_res(clockid_t id, struct timespec64 *tp)
 	if (!clockid_aux_valid(id))
 		return -ENODEV;
 
-	tp->tv_sec = 0;
-	tp->tv_nsec = 1;
+	tp->tv_sec = aux_clock_resolution_ns() / NSEC_PER_SEC;
+	tp->tv_nsec = aux_clock_resolution_ns() % NSEC_PER_SEC;
 	return 0;
 }
 
-- 
cgit v1.2.3


From 380b84e168e57c54d0a9e053a5558fddc43f0c1a Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Date: Tue, 1 Jul 2025 10:58:05 +0200
Subject: vdso/vsyscall: Update auxiliary clock data in the datapage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Expose the auxiliary clock data so it can be read from the vDSO.

Architectures not using the generic vDSO time framework,
namely SPARC64, are not supported.

Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/20250701-vdso-auxclock-v1-11-df7d9f87b9b8@linutronix.de
---
 include/linux/timekeeper_internal.h |  6 ++++++
 include/vdso/datapage.h             |  3 +++
 kernel/time/namespace.c             |  5 +++++
 kernel/time/timekeeping.c           | 12 +++++++++++
 kernel/time/vsyscall.c              | 40 +++++++++++++++++++++++++++++++++++++
 5 files changed, 66 insertions(+)

(limited to 'include')

diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index ca79938b62f3..c27aac67cb3f 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -190,4 +190,10 @@ static inline void update_vsyscall_tz(void)
 }
 #endif
 
+#if defined(CONFIG_GENERIC_GETTIMEOFDAY) && defined(CONFIG_POSIX_AUX_CLOCKS)
+extern void vdso_time_update_aux(struct timekeeper *tk);
+#else
+static inline void vdso_time_update_aux(struct timekeeper *tk) { }
+#endif
+
 #endif /* _LINUX_TIMEKEEPER_INTERNAL_H */
diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h
index 1864e76e8f69..f4c96d9ce674 100644
--- a/include/vdso/datapage.h
+++ b/include/vdso/datapage.h
@@ -38,6 +38,7 @@ struct vdso_arch_data {
 #endif
 
 #define VDSO_BASES	(CLOCK_TAI + 1)
+#define VDSO_BASE_AUX	0
 #define VDSO_HRES	(BIT(CLOCK_REALTIME)		| \
 			 BIT(CLOCK_MONOTONIC)		| \
 			 BIT(CLOCK_BOOTTIME)		| \
@@ -117,6 +118,7 @@ struct vdso_clock {
  * @arch_data:		architecture specific data (optional, defaults
  *			to an empty struct)
  * @clock_data:		clocksource related data (array)
+ * @aux_clock_data:	auxiliary clocksource related data (array)
  * @tz_minuteswest:	minutes west of Greenwich
  * @tz_dsttime:		type of DST correction
  * @hrtimer_res:	hrtimer resolution
@@ -133,6 +135,7 @@ struct vdso_time_data {
 	struct arch_vdso_time_data	arch_data;
 
 	struct vdso_clock		clock_data[CS_BASES];
+	struct vdso_clock		aux_clock_data[MAX_AUX_CLOCKS];
 
 	s32				tz_minuteswest;
 	s32				tz_dsttime;
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index e3642278df43..667452768ed3 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -242,6 +242,11 @@ static void timens_set_vvar_page(struct task_struct *task,
 	for (i = 0; i < CS_BASES; i++)
 		timens_setup_vdso_clock_data(&vc[i], ns);
 
+	if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) {
+		for (i = 0; i < ARRAY_SIZE(vdata->aux_clock_data); i++)
+			timens_setup_vdso_clock_data(&vdata->aux_clock_data[i], ns);
+	}
+
 out:
 	mutex_unlock(&offset_lock);
 }
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index cbcf090bb4be..243fe25e680a 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -66,11 +66,21 @@ static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts)
 {
 	return ktime_get_aux_ts64(CLOCK_AUX + tkid - TIMEKEEPER_AUX_FIRST, ts);
 }
+
+static inline bool tk_is_aux(const struct timekeeper *tk)
+{
+	return tk->id >= TIMEKEEPER_AUX_FIRST && tk->id <= TIMEKEEPER_AUX_LAST;
+}
 #else
 static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts)
 {
 	return false;
 }
+
+static inline bool tk_is_aux(const struct timekeeper *tk)
+{
+	return false;
+}
 #endif
 
 /* flag for if timekeeping is suspended */
@@ -719,6 +729,8 @@ static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int act
 
 		update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
 		update_fast_timekeeper(&tk->tkr_raw,  &tk_fast_raw);
+	} else if (tk_is_aux(tk)) {
+		vdso_time_update_aux(tk);
 	}
 
 	if (action & TK_CLOCK_WAS_SET)
diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c
index df6bada2d58e..8ba8b0d8a387 100644
--- a/kernel/time/vsyscall.c
+++ b/kernel/time/vsyscall.c
@@ -136,6 +136,46 @@ void update_vsyscall_tz(void)
 	__arch_sync_vdso_time_data(vdata);
 }
 
+#ifdef CONFIG_POSIX_AUX_CLOCKS
+void vdso_time_update_aux(struct timekeeper *tk)
+{
+	struct vdso_time_data *vdata = vdso_k_time_data;
+	struct vdso_timestamp *vdso_ts;
+	struct vdso_clock *vc;
+	s32 clock_mode;
+	u64 nsec;
+
+	vc = &vdata->aux_clock_data[tk->id - TIMEKEEPER_AUX_FIRST];
+	vdso_ts = &vc->basetime[VDSO_BASE_AUX];
+	clock_mode = tk->tkr_mono.clock->vdso_clock_mode;
+	if (!tk->clock_valid)
+		clock_mode = VDSO_CLOCKMODE_NONE;
+
+	/* copy vsyscall data */
+	vdso_write_begin_clock(vc);
+
+	vc->clock_mode = clock_mode;
+
+	if (clock_mode != VDSO_CLOCKMODE_NONE) {
+		fill_clock_configuration(vc, &tk->tkr_mono);
+
+		vdso_ts->sec	= tk->xtime_sec;
+
+		nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+		nsec += tk->offs_aux;
+		vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &nsec);
+		nsec = nsec << tk->tkr_mono.shift;
+		vdso_ts->nsec = nsec;
+	}
+
+	__arch_update_vdso_clock(vc);
+
+	vdso_write_end_clock(vc);
+
+	__arch_sync_vdso_time_data(vdata);
+}
+#endif
+
 /**
  * vdso_update_begin - Start of a VDSO update section
  *
-- 
cgit v1.2.3


From cd3557a7618bf5c1935e9f66b58a329f1f1f4b27 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Date: Tue, 1 Jul 2025 10:58:06 +0200
Subject: vdso/gettimeofday: Add support for auxiliary clocks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Expose the auxiliary clocks through the vDSO.

Architectures not using the generic vDSO time framework,
namely SPARC64, are not supported.

Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/20250701-vdso-auxclock-v1-12-df7d9f87b9b8@linutronix.de
---
 include/vdso/datapage.h |  2 ++
 lib/vdso/gettimeofday.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 50 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h
index f4c96d9ce674..02533038640e 100644
--- a/include/vdso/datapage.h
+++ b/include/vdso/datapage.h
@@ -5,6 +5,7 @@
 #ifndef __ASSEMBLY__
 
 #include <linux/compiler.h>
+#include <uapi/linux/bits.h>
 #include <uapi/linux/time.h>
 #include <uapi/linux/types.h>
 #include <uapi/asm-generic/errno-base.h>
@@ -46,6 +47,7 @@ struct vdso_arch_data {
 #define VDSO_COARSE	(BIT(CLOCK_REALTIME_COARSE)	| \
 			 BIT(CLOCK_MONOTONIC_COARSE))
 #define VDSO_RAW	(BIT(CLOCK_MONOTONIC_RAW))
+#define VDSO_AUX	__GENMASK(CLOCK_AUX_LAST, CLOCK_AUX)
 
 #define CS_HRES_COARSE	0
 #define CS_RAW		1
diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c
index fc0038e83b5c..02ea19f67164 100644
--- a/lib/vdso/gettimeofday.c
+++ b/lib/vdso/gettimeofday.c
@@ -2,6 +2,7 @@
 /*
  * Generic userspace implementations of gettimeofday() and similar.
  */
+#include <vdso/auxclock.h>
 #include <vdso/datapage.h>
 #include <vdso/helpers.h>
 
@@ -74,7 +75,7 @@ static inline bool vdso_cycles_ok(u64 cycles)
 static __always_inline bool vdso_clockid_valid(clockid_t clock)
 {
 	/* Check for negative values or invalid clocks */
-	return likely((u32) clock < MAX_CLOCKS);
+	return likely((u32) clock <= CLOCK_AUX_LAST);
 }
 
 /*
@@ -268,6 +269,48 @@ bool do_coarse(const struct vdso_time_data *vd, const struct vdso_clock *vc,
 	return true;
 }
 
+static __always_inline
+bool do_aux(const struct vdso_time_data *vd, clockid_t clock, struct __kernel_timespec *ts)
+{
+	const struct vdso_clock *vc;
+	u32 seq, idx;
+	u64 sec, ns;
+
+	if (!IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS))
+		return false;
+
+	idx = clock - CLOCK_AUX;
+	vc = &vd->aux_clock_data[idx];
+
+	do {
+		/*
+		 * Open coded function vdso_read_begin() to handle
+		 * VDSO_CLOCK_TIMENS. See comment in do_hres().
+		 */
+		while ((seq = READ_ONCE(vc->seq)) & 1) {
+			if (IS_ENABLED(CONFIG_TIME_NS) && vc->clock_mode == VDSO_CLOCKMODE_TIMENS) {
+				vd = __arch_get_vdso_u_timens_data(vd);
+				vc = &vd->aux_clock_data[idx];
+				/* Re-read from the real time data page */
+				continue;
+			}
+			cpu_relax();
+		}
+		smp_rmb();
+
+		/* Auxclock disabled? */
+		if (vc->clock_mode == VDSO_CLOCKMODE_NONE)
+			return false;
+
+		if (!vdso_get_timestamp(vd, vc, VDSO_BASE_AUX, &sec, &ns))
+			return false;
+	} while (unlikely(vdso_read_retry(vc, seq)));
+
+	vdso_set_timespec(ts, sec, ns);
+
+	return true;
+}
+
 static __always_inline bool
 __cvdso_clock_gettime_common(const struct vdso_time_data *vd, clockid_t clock,
 			     struct __kernel_timespec *ts)
@@ -289,6 +332,8 @@ __cvdso_clock_gettime_common(const struct vdso_time_data *vd, clockid_t clock,
 		return do_coarse(vd, &vc[CS_HRES_COARSE], clock, ts);
 	else if (msk & VDSO_RAW)
 		vc = &vc[CS_RAW];
+	else if (msk & VDSO_AUX)
+		return do_aux(vd, clock, ts);
 	else
 		return false;
 
@@ -433,6 +478,8 @@ bool __cvdso_clock_getres_common(const struct vdso_time_data *vd, clockid_t cloc
 		 * Preserves the behaviour of posix_get_coarse_res().
 		 */
 		ns = LOW_RES_NSEC;
+	} else if (msk & VDSO_AUX) {
+		ns = aux_clock_resolution_ns();
 	} else {
 		return false;
 	}
-- 
cgit v1.2.3


From 78e50d88998a4a634eeda3e0d136cb8b9c9bc9d8 Mon Sep 17 00:00:00 2001
From: Fabio Estevam <festevam@gmail.com>
Date: Sat, 12 Jul 2025 18:53:07 -0300
Subject: wifi: brcmfmac: Add support for the SDIO 43751 device

Add the SDIO ID and firmware matching for the 43751 device.

Based on the previous work from Marc Gonzalez <mgonzalez@freebox.fr>.

Tested on an i.MX6DL board connected to an AP6398SV chip with the
brcmfmac43752-sdio.bin firmware taken from:

https://source.puri.sm/Librem5/firmware-brcm43752-nonfree

Signed-off-by: Fabio Estevam <festevam@gmail.com>
Acked-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Link: https://patch.msgid.link/20250712215307.1310802-1-festevam@gmail.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c     | 1 +
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c       | 2 ++
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c       | 5 ++++-
 drivers/net/wireless/broadcom/brcm80211/include/brcm_hw_ids.h | 1 +
 include/linux/mmc/sdio_ids.h                                  | 1 +
 5 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c
index 6bc107476a2a..8ab7d1e34a6e 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c
@@ -996,6 +996,7 @@ static const struct sdio_device_id brcmf_sdmmc_ids[] = {
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4354, WCC),
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4356, WCC),
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4359, WCC),
+	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_43751, WCC),
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_CYPRESS_4373, CYW),
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_CYPRESS_43012, CYW),
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_CYPRESS_43752, CYW),
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c
index 2ef92ef25517..9074ab49e806 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c
@@ -739,6 +739,7 @@ static u32 brcmf_chip_tcm_rambase(struct brcmf_chip_priv *ci)
 	case CY_CC_4373_CHIP_ID:
 		return 0x160000;
 	case CY_CC_43752_CHIP_ID:
+	case BRCM_CC_43751_CHIP_ID:
 	case BRCM_CC_4377_CHIP_ID:
 		return 0x170000;
 	case BRCM_CC_4378_CHIP_ID:
@@ -1450,6 +1451,7 @@ bool brcmf_chip_sr_capable(struct brcmf_chip *pub)
 		reg = chip->ops->read32(chip->ctx, addr);
 		return (reg & CC_SR_CTL0_ENABLE_MASK) != 0;
 	case BRCM_CC_4359_CHIP_ID:
+	case BRCM_CC_43751_CHIP_ID:
 	case CY_CC_43752_CHIP_ID:
 	case CY_CC_43012_CHIP_ID:
 		addr = CORE_CC_REG(pmu->base, retention_ctl);
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
index cf26ab15ee0c..8a0bad5119a0 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
@@ -654,6 +654,7 @@ static const struct brcmf_firmware_mapping brcmf_sdio_fwnames[] = {
 	BRCMF_FW_ENTRY(BRCM_CC_4354_CHIP_ID, 0xFFFFFFFF, 4354),
 	BRCMF_FW_ENTRY(BRCM_CC_4356_CHIP_ID, 0xFFFFFFFF, 4356),
 	BRCMF_FW_ENTRY(BRCM_CC_4359_CHIP_ID, 0xFFFFFFFF, 4359),
+	BRCMF_FW_ENTRY(BRCM_CC_43751_CHIP_ID, 0xFFFFFFFF, 43752),
 	BRCMF_FW_ENTRY(CY_CC_4373_CHIP_ID, 0xFFFFFFFF, 4373),
 	BRCMF_FW_ENTRY(CY_CC_43012_CHIP_ID, 0xFFFFFFFF, 43012),
 	BRCMF_FW_ENTRY(CY_CC_43439_CHIP_ID, 0xFFFFFFFF, 43439),
@@ -3424,7 +3425,8 @@ err:
 
 static bool brcmf_sdio_aos_no_decode(struct brcmf_sdio *bus)
 {
-	if (bus->ci->chip == CY_CC_43012_CHIP_ID ||
+	if (bus->ci->chip == BRCM_CC_43751_CHIP_ID ||
+	    bus->ci->chip == CY_CC_43012_CHIP_ID ||
 	    bus->ci->chip == CY_CC_43752_CHIP_ID)
 		return true;
 	else
@@ -4275,6 +4277,7 @@ static void brcmf_sdio_firmware_callback(struct device *dev, int err,
 				   bus->hostintmask, NULL);
 
 		switch (sdiod->func1->device) {
+		case SDIO_DEVICE_ID_BROADCOM_43751:
 		case SDIO_DEVICE_ID_BROADCOM_CYPRESS_4373:
 		case SDIO_DEVICE_ID_BROADCOM_CYPRESS_43752:
 			brcmf_dbg(INFO, "set F2 watermark to 0x%x*4 bytes\n",
diff --git a/drivers/net/wireless/broadcom/brcm80211/include/brcm_hw_ids.h b/drivers/net/wireless/broadcom/brcm80211/include/brcm_hw_ids.h
index c1e22c589d85..6564616a57df 100644
--- a/drivers/net/wireless/broadcom/brcm80211/include/brcm_hw_ids.h
+++ b/drivers/net/wireless/broadcom/brcm80211/include/brcm_hw_ids.h
@@ -52,6 +52,7 @@
 #define BRCM_CC_43664_CHIP_ID		43664
 #define BRCM_CC_43666_CHIP_ID		43666
 #define BRCM_CC_4371_CHIP_ID		0x4371
+#define BRCM_CC_43751_CHIP_ID		43751
 #define BRCM_CC_43752_CHIP_ID		43752
 #define BRCM_CC_4377_CHIP_ID		0x4377
 #define BRCM_CC_4378_CHIP_ID		0x4378
diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h
index 7cddfdac2f57..fe3d6d98f8da 100644
--- a/include/linux/mmc/sdio_ids.h
+++ b/include/linux/mmc/sdio_ids.h
@@ -76,6 +76,7 @@
 #define SDIO_DEVICE_ID_BROADCOM_43430		0xa9a6
 #define SDIO_DEVICE_ID_BROADCOM_43439		0xa9af
 #define SDIO_DEVICE_ID_BROADCOM_43455		0xa9bf
+#define SDIO_DEVICE_ID_BROADCOM_43751		0xaae7
 #define SDIO_DEVICE_ID_BROADCOM_CYPRESS_43752	0xaae8
 
 #define SDIO_VENDOR_ID_CYPRESS			0x04b4
-- 
cgit v1.2.3


From 6624a0af82a6e3a4d3609264ef591a8fa3467139 Mon Sep 17 00:00:00 2001
From: Lachlan Hodges <lachlan.hodges@morsemicro.com>
Date: Thu, 17 Jul 2025 17:42:02 +1000
Subject: wifi: cfg80211: support configuring an S1G short beaconing BSS

S1G short beacons are an optional frame type used in an S1G BSS
that contain a limited set of elements. While they are optional,
they are a fundamental part of S1G that enables significant
power saving.

Expose 2 additional netlink attributes,
NL80211_ATTR_S1G_LONG_BEACON_PERIOD which denotes the number of beacon
intervals between each long beacon and NL80211_ATTR_S1G_SHORT_BEACON
which is a nested attribute containing the short beacon tail and
head. We split them as the long beacon period cannot be updated,
and is only used when initialising the interface, whereas the short
beacon data can be used to both initialise and update the templates.
This follows how things such as the beacon interval and DTIM period
currently operate.

During the initialisation path, we ensure we have the long beacon
period if the short beacon data is being passed down, whereas
the update path will simply update the template if it's sent down.

The short beacon data is validated using the same routines for regular
beacons as they support correctly parsing the short beacon format
while ensuring the frame is well-formed.

Signed-off-by: Lachlan Hodges <lachlan.hodges@morsemicro.com>
Link: https://patch.msgid.link/20250717074205.312577-2-lachlan.hodges@morsemicro.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 23 ++++++++++++++
 include/uapi/linux/nl80211.h | 39 ++++++++++++++++++++++++
 net/wireless/nl80211.c       | 72 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 134 insertions(+)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 17f2a665dce6..44a1055a81ba 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1423,6 +1423,23 @@ struct cfg80211_unsol_bcast_probe_resp {
 	const u8 *tmpl;
 };
 
+/**
+ * struct cfg80211_s1g_short_beacon - S1G short beacon data.
+ *
+ * @update: Set to true if the feature configuration should be updated.
+ * @short_head: Short beacon head.
+ * @short_tail: Short beacon tail.
+ * @short_head_len: Short beacon head len.
+ * @short_tail_len: Short beacon tail len.
+ */
+struct cfg80211_s1g_short_beacon {
+	bool update;
+	const u8 *short_head;
+	const u8 *short_tail;
+	size_t short_head_len;
+	size_t short_tail_len;
+};
+
 /**
  * struct cfg80211_ap_settings - AP configuration
  *
@@ -1463,6 +1480,8 @@ struct cfg80211_unsol_bcast_probe_resp {
  * @fils_discovery: FILS discovery transmission parameters
  * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters
  * @mbssid_config: AP settings for multiple bssid
+ * @s1g_long_beacon_period: S1G long beacon period
+ * @s1g_short_beacon: S1G short beacon data
  */
 struct cfg80211_ap_settings {
 	struct cfg80211_chan_def chandef;
@@ -1496,6 +1515,8 @@ struct cfg80211_ap_settings {
 	struct cfg80211_fils_discovery fils_discovery;
 	struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp;
 	struct cfg80211_mbssid_config mbssid_config;
+	u8 s1g_long_beacon_period;
+	struct cfg80211_s1g_short_beacon s1g_short_beacon;
 };
 
 
@@ -1507,11 +1528,13 @@ struct cfg80211_ap_settings {
  * @beacon: beacon data
  * @fils_discovery: FILS discovery transmission parameters
  * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters
+ * @s1g_short_beacon: S1G short beacon data
  */
 struct cfg80211_ap_update {
 	struct cfg80211_beacon_data beacon;
 	struct cfg80211_fils_discovery fils_discovery;
 	struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp;
+	struct cfg80211_s1g_short_beacon s1g_short_beacon;
 };
 
 /**
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 39460334dafb..d1a14f2892d9 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -2915,6 +2915,19 @@ enum nl80211_commands {
  *	applicable to that specific radio only. If the radio id is greater
  *	thank the number of radios, error denoting invalid value is returned.
  *
+ * @NL80211_ATTR_S1G_LONG_BEACON_PERIOD: (u8) Integer attribute that represents
+ *	the number of beacon intervals between each long beacon transmission
+ *	for an S1G BSS with short beaconing enabled. This is a required
+ *	attribute for initialising an S1G short beaconing BSS. When updating
+ *	the short beacon data, this is not required. It has a minimum value of
+ *	2 (i.e 2 beacon intervals).
+ *
+ * @NL80211_ATTR_S1G_SHORT_BEACON: Nested attribute containing the short beacon
+ *	head and tail used to set or update the short beacon templates. When
+ *	bringing up a new interface, %NL80211_ATTR_S1G_LONG_BEACON_PERIOD is
+ *	required alongside this attribute. Refer to
+ *	@enum nl80211_s1g_short_beacon_attrs for the attribute definitions.
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -3474,6 +3487,9 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_WIPHY_RADIO_INDEX,
 
+	NL80211_ATTR_S1G_LONG_BEACON_PERIOD,
+	NL80211_ATTR_S1G_SHORT_BEACON,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
@@ -8148,4 +8164,27 @@ enum nl80211_wiphy_radio_freq_range {
 	NL80211_WIPHY_RADIO_FREQ_ATTR_MAX = __NL80211_WIPHY_RADIO_FREQ_ATTR_LAST - 1,
 };
 
+/**
+ * enum nl80211_s1g_short_beacon_attrs - S1G short beacon data
+ *
+ * @__NL80211_S1G_SHORT_BEACON_ATTR_INVALID: Invalid
+ *
+ * @NL80211_S1G_SHORT_BEACON_ATTR_HEAD: Short beacon head (binary).
+ * @NL80211_S1G_SHORT_BEACON_ATTR_TAIL: Short beacon tail (binary).
+ *
+ * @__NL80211_S1G_SHORT_BEACON_ATTR_LAST: Internal
+ * @NL80211_S1G_SHORT_BEACON_ATTR_MAX: Highest attribute
+ */
+enum nl80211_s1g_short_beacon_attrs {
+	__NL80211_S1G_SHORT_BEACON_ATTR_INVALID,
+
+	NL80211_S1G_SHORT_BEACON_ATTR_HEAD,
+	NL80211_S1G_SHORT_BEACON_ATTR_TAIL,
+
+	/* keep last */
+	__NL80211_S1G_SHORT_BEACON_ATTR_LAST,
+	NL80211_S1G_SHORT_BEACON_ATTR_MAX =
+		__NL80211_S1G_SHORT_BEACON_ATTR_LAST - 1
+};
+
 #endif /* __LINUX_NL80211_H */
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 20bc0f052c16..1c808b08b747 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -482,6 +482,16 @@ nl80211_sta_wme_policy[NL80211_STA_WME_MAX + 1] = {
 	[NL80211_STA_WME_MAX_SP] = { .type = NLA_U8 },
 };
 
+static const struct nla_policy
+nl80211_s1g_short_beacon[NL80211_S1G_SHORT_BEACON_ATTR_MAX + 1] = {
+	[NL80211_S1G_SHORT_BEACON_ATTR_HEAD] =
+		NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_beacon_head,
+				       IEEE80211_MAX_DATA_LEN),
+	[NL80211_S1G_SHORT_BEACON_ATTR_TAIL] =
+		NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_ie_attr,
+				       IEEE80211_MAX_DATA_LEN),
+};
+
 static const struct netlink_range_validation nl80211_punct_bitmap_range = {
 	.min = 0,
 	.max = 0xffff,
@@ -858,6 +868,9 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	[NL80211_ATTR_EPCS] = { .type = NLA_FLAG },
 	[NL80211_ATTR_ASSOC_MLD_EXT_CAPA_OPS] = { .type = NLA_U16 },
 	[NL80211_ATTR_WIPHY_RADIO_INDEX] = { .type = NLA_U8 },
+	[NL80211_ATTR_S1G_LONG_BEACON_PERIOD] = NLA_POLICY_MIN(NLA_U8, 2),
+	[NL80211_ATTR_S1G_SHORT_BEACON] =
+		NLA_POLICY_NESTED(nl80211_s1g_short_beacon),
 };
 
 /* policy for the key attributes */
@@ -6202,6 +6215,41 @@ static int nl80211_validate_ap_phy_operation(struct cfg80211_ap_settings *params
 	return 0;
 }
 
+static int
+nl80211_parse_s1g_short_beacon(struct cfg80211_registered_device *rdev,
+			       struct nlattr *attrs,
+			       struct cfg80211_s1g_short_beacon *sb)
+{
+	struct nlattr *tb[NL80211_S1G_SHORT_BEACON_ATTR_MAX + 1];
+	int ret;
+
+	if (!rdev->wiphy.bands[NL80211_BAND_S1GHZ])
+		return -EINVAL;
+
+	ret = nla_parse_nested(tb, NL80211_S1G_SHORT_BEACON_ATTR_MAX, attrs,
+			       NULL, NULL);
+	if (ret)
+		return ret;
+
+	/* Short beacon tail is optional (i.e. might only include the TIM) */
+	if (!tb[NL80211_S1G_SHORT_BEACON_ATTR_HEAD])
+		return -EINVAL;
+
+	sb->short_head = nla_data(tb[NL80211_S1G_SHORT_BEACON_ATTR_HEAD]);
+	sb->short_head_len = nla_len(tb[NL80211_S1G_SHORT_BEACON_ATTR_HEAD]);
+	sb->short_tail_len = 0;
+
+	if (tb[NL80211_S1G_SHORT_BEACON_ATTR_TAIL]) {
+		sb->short_tail =
+			nla_data(tb[NL80211_S1G_SHORT_BEACON_ATTR_TAIL]);
+		sb->short_tail_len =
+			nla_len(tb[NL80211_S1G_SHORT_BEACON_ATTR_TAIL]);
+	}
+
+	sb->update = true;
+	return 0;
+}
+
 static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 {
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
@@ -6442,6 +6490,22 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 		goto out;
 	}
 
+	if (info->attrs[NL80211_ATTR_S1G_SHORT_BEACON]) {
+		if (!info->attrs[NL80211_ATTR_S1G_LONG_BEACON_PERIOD]) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		params->s1g_long_beacon_period = nla_get_u8(
+			info->attrs[NL80211_ATTR_S1G_LONG_BEACON_PERIOD]);
+
+		err = nl80211_parse_s1g_short_beacon(
+			rdev, info->attrs[NL80211_ATTR_S1G_SHORT_BEACON],
+			&params->s1g_short_beacon);
+		if (err)
+			goto out;
+	}
+
 	err = nl80211_calculate_ap_params(params);
 	if (err)
 		goto out;
@@ -6550,6 +6614,14 @@ static int nl80211_set_beacon(struct sk_buff *skb, struct genl_info *info)
 			goto out;
 	}
 
+	attr = info->attrs[NL80211_ATTR_S1G_SHORT_BEACON];
+	if (attr) {
+		err = nl80211_parse_s1g_short_beacon(rdev, attr,
+						     &params->s1g_short_beacon);
+		if (err)
+			goto out;
+	}
+
 	err = rdev_change_beacon(rdev, dev, params);
 
 out:
-- 
cgit v1.2.3


From bbf93a06d73505591db3a93797f44b9c44555d9b Mon Sep 17 00:00:00 2001
From: Lachlan Hodges <lachlan.hodges@morsemicro.com>
Date: Thu, 17 Jul 2025 17:42:03 +1000
Subject: wifi: mac80211: support initialising an S1G short beaconing BSS

Introduce the ability to parse the short beacon data and long
beacon period. The long beacon period represents the number of beacon
intervals between each long beacon transmission. Additionally,
because a BSS cannot change its configuration such that short
beaconing is dynamically disabled/enabled without tearing down the
interface, we ensure we have an existing short beacon before
performing the update.

Signed-off-by: Lachlan Hodges <lachlan.hodges@morsemicro.com>
Link: https://patch.msgid.link/20250717074205.312577-3-lachlan.hodges@morsemicro.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     |  4 +++
 net/mac80211/cfg.c         | 66 ++++++++++++++++++++++++++++++++++++++++++++--
 net/mac80211/ieee80211_i.h |  9 +++++++
 3 files changed, 77 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 577fd6a8c372..a2dbaad2f6d3 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -758,6 +758,8 @@ struct ieee80211_parsed_tpe {
  *	be updated to 1, even if bss_param_ch_cnt didn't change. This allows
  *	the link to know that it heard the latest value from its own beacon
  *	(as opposed to hearing its value from another link's beacon).
+ * @s1g_long_beacon_period: number of beacon intervals between each long
+ *	beacon transmission.
  */
 struct ieee80211_bss_conf {
 	struct ieee80211_vif *vif;
@@ -857,6 +859,8 @@ struct ieee80211_bss_conf {
 
 	u8 bss_param_ch_cnt;
 	u8 bss_param_ch_cnt_link_id;
+
+	u8 s1g_long_beacon_period;
 };
 
 /**
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index b99e39cb808b..2f97e2d5bb8b 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1071,6 +1071,47 @@ ieee80211_set_unsol_bcast_probe_resp(struct ieee80211_sub_if_data *sdata,
 	return 0;
 }
 
+static int
+ieee80211_set_s1g_short_beacon(struct ieee80211_sub_if_data *sdata,
+			       struct ieee80211_link_data *link,
+			       struct cfg80211_s1g_short_beacon *params)
+{
+	struct s1g_short_beacon_data *new;
+	struct s1g_short_beacon_data *old =
+		sdata_dereference(link->u.ap.s1g_short_beacon, sdata);
+	size_t new_len =
+		sizeof(*new) + params->short_head_len + params->short_tail_len;
+
+	if (!params->update)
+		return 0;
+
+	if (!params->short_head)
+		return -EINVAL;
+
+	new = kzalloc(new_len, GFP_KERNEL);
+	if (!new)
+		return -ENOMEM;
+
+	/* Memory layout: | struct | head | tail | */
+	new->short_head = (u8 *)new + sizeof(*new);
+	new->short_head_len = params->short_head_len;
+	memcpy(new->short_head, params->short_head, params->short_head_len);
+
+	if (params->short_tail) {
+		new->short_tail = new->short_head + params->short_head_len;
+		new->short_tail_len = params->short_tail_len;
+		memcpy(new->short_tail, params->short_tail,
+		       params->short_tail_len);
+	}
+
+	rcu_assign_pointer(link->u.ap.s1g_short_beacon, new);
+
+	if (old)
+		kfree_rcu(old, rcu_head);
+
+	return 0;
+}
+
 static int ieee80211_set_ftm_responder_params(
 				struct ieee80211_sub_if_data *sdata,
 				const u8 *lci, size_t lci_len,
@@ -1493,8 +1534,8 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
 	link_conf->twt_responder = params->twt_responder;
 	link_conf->he_obss_pd = params->he_obss_pd;
 	link_conf->he_bss_color = params->beacon.he_bss_color;
-	sdata->vif.cfg.s1g = params->chandef.chan->band ==
-				  NL80211_BAND_S1GHZ;
+	link_conf->s1g_long_beacon_period = params->s1g_long_beacon_period;
+	sdata->vif.cfg.s1g = params->chandef.chan->band == NL80211_BAND_S1GHZ;
 
 	sdata->vif.cfg.ssid_len = params->ssid_len;
 	if (params->ssid_len)
@@ -1541,6 +1582,13 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
 	if (err < 0)
 		goto error;
 
+	if (sdata->vif.cfg.s1g) {
+		err = ieee80211_set_s1g_short_beacon(sdata, link,
+						     &params->s1g_short_beacon);
+		if (err < 0)
+			goto error;
+	}
+
 	err = drv_start_ap(sdata->local, sdata, link_conf);
 	if (err) {
 		old = sdata_dereference(link->u.ap.beacon, sdata);
@@ -1619,6 +1667,13 @@ static int ieee80211_change_beacon(struct wiphy *wiphy, struct net_device *dev,
 	if (err < 0)
 		return err;
 
+	if (link->u.ap.s1g_short_beacon) {
+		err = ieee80211_set_s1g_short_beacon(sdata, link,
+						     &params->s1g_short_beacon);
+		if (err < 0)
+			return err;
+	}
+
 	if (beacon->he_bss_color_valid &&
 	    beacon->he_bss_color.enabled != link_conf->he_bss_color.enabled) {
 		link_conf->he_bss_color.enabled = beacon->he_bss_color.enabled;
@@ -1650,6 +1705,7 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev,
 	struct probe_resp *old_probe_resp;
 	struct fils_discovery_data *old_fils_discovery;
 	struct unsol_bcast_probe_resp_data *old_unsol_bcast_probe_resp;
+	struct s1g_short_beacon_data *old_s1g_short_beacon;
 	struct cfg80211_chan_def chandef;
 	struct ieee80211_link_data *link =
 		sdata_dereference(sdata->link[link_id], sdata);
@@ -1668,6 +1724,8 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev,
 	old_unsol_bcast_probe_resp =
 		sdata_dereference(link->u.ap.unsol_bcast_probe_resp,
 				  sdata);
+	old_s1g_short_beacon =
+		sdata_dereference(link->u.ap.s1g_short_beacon, sdata);
 
 	/* abort any running channel switch or color change */
 	link_conf->csa_active = false;
@@ -1690,6 +1748,7 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev,
 	RCU_INIT_POINTER(link->u.ap.probe_resp, NULL);
 	RCU_INIT_POINTER(link->u.ap.fils_discovery, NULL);
 	RCU_INIT_POINTER(link->u.ap.unsol_bcast_probe_resp, NULL);
+	RCU_INIT_POINTER(link->u.ap.s1g_short_beacon, NULL);
 	kfree_rcu(old_beacon, rcu_head);
 	if (old_probe_resp)
 		kfree_rcu(old_probe_resp, rcu_head);
@@ -1697,6 +1756,8 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev,
 		kfree_rcu(old_fils_discovery, rcu_head);
 	if (old_unsol_bcast_probe_resp)
 		kfree_rcu(old_unsol_bcast_probe_resp, rcu_head);
+	if (old_s1g_short_beacon)
+		kfree_rcu(old_s1g_short_beacon, rcu_head);
 
 	kfree(link_conf->ftmr_params);
 	link_conf->ftmr_params = NULL;
@@ -1720,6 +1781,7 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev,
 	link_conf->enable_beacon = false;
 	sdata->beacon_rate_set = false;
 	sdata->vif.cfg.ssid_len = 0;
+	sdata->vif.cfg.s1g = false;
 	clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state);
 	ieee80211_link_info_change_notify(sdata, link,
 					  BSS_CHANGED_BEACON_ENABLED);
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 9c0603eb580f..61cd1cc098ac 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -296,6 +296,14 @@ struct unsol_bcast_probe_resp_data {
 	u8 data[];
 };
 
+struct s1g_short_beacon_data {
+	struct rcu_head rcu_head;
+	u8 *short_head;
+	u8 *short_tail;
+	int short_head_len;
+	int short_tail_len;
+};
+
 struct ps_data {
 	/* yes, this looks ugly, but guarantees that we can later use
 	 * bitmap_empty :)
@@ -1042,6 +1050,7 @@ struct ieee80211_link_data_ap {
 	struct probe_resp __rcu *probe_resp;
 	struct fils_discovery_data __rcu *fils_discovery;
 	struct unsol_bcast_probe_resp_data __rcu *unsol_bcast_probe_resp;
+	struct s1g_short_beacon_data __rcu *s1g_short_beacon;
 
 	/* to be used after channel switch. */
 	struct cfg80211_beacon_data *next_beacon;
-- 
cgit v1.2.3


From 1f4f8166110f037f15a89c2203ff887b98a8393a Mon Sep 17 00:00:00 2001
From: Shiju Jose <shiju.jose@huawei.com>
Date: Thu, 17 Jul 2025 11:18:14 +0100
Subject: cxl/events: Update Common Event Record to CXL spec rev 3.2

CXL spec 3.2 section 8.2.10.2.1 Table 8-55, Common Event Record format
defined new fields LD-ID and Head ID.

LD-ID: ID of logical device from where the event originated, which is
valid only if LD-ID valid flag is set to 1.
CXL spec 3.2 Section 2.4 describes that a Type 3 Multi-Logical Device (MLD)
can partition its resources into up to 16 isolated Logical Devices.
Each Logical Device is identified by a Logical Device Identifier (LD-ID)
in CXL.mem and CXL.io protocols. LD-ID is a 16-bit Logical Device
identifier applicable for CXL.io and CXL.mem requests and responses.
CXL.mem supports only the lower 4 bits of LD-ID and therefore can support
up to 16 unique LD-ID values over the link. Requests and responses
forwarded over an MLD Port are tagged with LD-ID.

Head ID: ID of the device head, from where the event originated, which is
valid only if head valid flag is set to 1.

Add updates for the above spec changes in the CXL events record and CXL
common trace event implementation.

Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Link: https://patch.msgid.link/20250717101817.2104-2-shiju.jose@huawei.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/cxl/core/trace.h | 18 ++++++++++++++----
 include/cxl/event.h      |  4 +++-
 2 files changed, 17 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h
index 25ebfbc1616c..a77487a257b3 100644
--- a/drivers/cxl/core/trace.h
+++ b/drivers/cxl/core/trace.h
@@ -214,12 +214,16 @@ TRACE_EVENT(cxl_overflow,
 #define CXL_EVENT_RECORD_FLAG_PERF_DEGRADED	BIT(4)
 #define CXL_EVENT_RECORD_FLAG_HW_REPLACE	BIT(5)
 #define CXL_EVENT_RECORD_FLAG_MAINT_OP_SUB_CLASS_VALID	BIT(6)
+#define CXL_EVENT_RECORD_FLAG_LD_ID_VALID	BIT(7)
+#define CXL_EVENT_RECORD_FLAG_HEAD_ID_VALID	BIT(8)
 #define show_hdr_flags(flags)	__print_flags(flags, " | ",			   \
 	{ CXL_EVENT_RECORD_FLAG_PERMANENT,	"PERMANENT_CONDITION"		}, \
 	{ CXL_EVENT_RECORD_FLAG_MAINT_NEEDED,	"MAINTENANCE_NEEDED"		}, \
 	{ CXL_EVENT_RECORD_FLAG_PERF_DEGRADED,	"PERFORMANCE_DEGRADED"		}, \
 	{ CXL_EVENT_RECORD_FLAG_HW_REPLACE,	"HARDWARE_REPLACEMENT_NEEDED"	},  \
-	{ CXL_EVENT_RECORD_FLAG_MAINT_OP_SUB_CLASS_VALID,	"MAINT_OP_SUB_CLASS_VALID" }	\
+	{ CXL_EVENT_RECORD_FLAG_MAINT_OP_SUB_CLASS_VALID,	"MAINT_OP_SUB_CLASS_VALID" }, \
+	{ CXL_EVENT_RECORD_FLAG_LD_ID_VALID,	"LD_ID_VALID" }, \
+	{ CXL_EVENT_RECORD_FLAG_HEAD_ID_VALID,	"HEAD_ID_VALID" } \
 )
 
 /*
@@ -247,7 +251,9 @@ TRACE_EVENT(cxl_overflow,
 	__field(u64, hdr_timestamp)				\
 	__field(u8, hdr_length)					\
 	__field(u8, hdr_maint_op_class)				\
-	__field(u8, hdr_maint_op_sub_class)
+	__field(u8, hdr_maint_op_sub_class)			\
+	__field(u16, hdr_ld_id)					\
+	__field(u8, hdr_head_id)
 
 #define CXL_EVT_TP_fast_assign(cxlmd, l, hdr)					\
 	__assign_str(memdev);				\
@@ -260,18 +266,22 @@ TRACE_EVENT(cxl_overflow,
 	__entry->hdr_related_handle = le16_to_cpu((hdr).related_handle);	\
 	__entry->hdr_timestamp = le64_to_cpu((hdr).timestamp);			\
 	__entry->hdr_maint_op_class = (hdr).maint_op_class;			\
-	__entry->hdr_maint_op_sub_class = (hdr).maint_op_sub_class
+	__entry->hdr_maint_op_sub_class = (hdr).maint_op_sub_class;		\
+	__entry->hdr_ld_id = le16_to_cpu((hdr).ld_id);				\
+	__entry->hdr_head_id = (hdr).head_id
 
 #define CXL_EVT_TP_printk(fmt, ...) \
 	TP_printk("memdev=%s host=%s serial=%lld log=%s : time=%llu uuid=%pUb "	\
 		"len=%d flags='%s' handle=%x related_handle=%x "		\
-		"maint_op_class=%u maint_op_sub_class=%u : " fmt,		\
+		"maint_op_class=%u maint_op_sub_class=%u "			\
+		"ld_id=%x head_id=%x : " fmt,					\
 		__get_str(memdev), __get_str(host), __entry->serial,		\
 		cxl_event_log_type_str(__entry->log),				\
 		__entry->hdr_timestamp, &__entry->hdr_uuid, __entry->hdr_length,\
 		show_hdr_flags(__entry->hdr_flags), __entry->hdr_handle,	\
 		__entry->hdr_related_handle, __entry->hdr_maint_op_class,	\
 		__entry->hdr_maint_op_sub_class,	\
+		__entry->hdr_ld_id, __entry->hdr_head_id,			\
 		##__VA_ARGS__)
 
 TRACE_EVENT(cxl_generic_event,
diff --git a/include/cxl/event.h b/include/cxl/event.h
index f9ae1796da85..f4cb8568566b 100644
--- a/include/cxl/event.h
+++ b/include/cxl/event.h
@@ -19,7 +19,9 @@ struct cxl_event_record_hdr {
 	__le64 timestamp;
 	u8 maint_op_class;
 	u8 maint_op_sub_class;
-	u8 reserved[14];
+	__le16 ld_id;
+	u8 head_id;
+	u8 reserved[11];
 } __packed;
 
 struct cxl_event_media_hdr {
-- 
cgit v1.2.3


From f10f46a0ee53420f707195fe33b7c235a1c0e48a Mon Sep 17 00:00:00 2001
From: Shiju Jose <shiju.jose@huawei.com>
Date: Thu, 17 Jul 2025 11:18:17 +0100
Subject: cxl/events: Trace Memory Sparing Event Record

CXL rev 3.2 section 8.2.10.2.1.4 Table 8-60 defines the Memory Sparing
Event Record.

Determine if the event read is a memory sparing record and, if so, trace
the record.

Memory device shall produce a memory sparing event record
1. After completion of a PPR maintenance operation if the memory sparing
event record enable bit is set (Field: sPPR/hPPR Operation Mode in
Table 8-128/Table 8-131).
2. In response to a query request by the host (see section 8.2.10.7.1.4)
to determine the availability of sparing resources.
The device shall report the resource availability by producing the Memory
Sparing Event Record (see Table 8-60) in which the channel, rank, nibble
mask, bank group, bank, row, column, sub-channel fields are a copy of the
values specified in the request. If the controller does not support
reporting whether a resource is available, and a perform maintenance
operation for memory sparing is issued with query resources set to 1, the
controller shall return invalid input.

Example trace log for a memory sparing event record produced on completion
of a soft PPR operation:
cxl_memory_sparing: memdev=mem1 host=0000:0f:00.0 serial=3
log=Informational : time=55045163029
uuid=e71f3a40-2d29-4092-8a39-4d1c966c7c65 len=128 flags='0x1' handle=1
related_handle=0 maint_op_class=2 maint_op_sub_class=1
ld_id=0 head_id=0 : flags='' result=0
validity_flags='CHANNEL|RANK|NIBBLE|BANK GROUP|BANK|ROW|COLUMN'
spare resource avail=1 channel=2 rank=5 nibble_mask=a59c bank_group=2
bank=4 row=13 column=23 sub_channel=0
comp_id=00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
comp_id_pldm_valid_flags='' pldm_entity_id=0x00 pldm_resource_id=0x00

Note: For the memory sparing event record, the fields 'maintenance
operation class' and 'maintenance operation subclass' are defined twice:
first in the common event record (Table 8-55) and second in the memory
sparing event record (Table 8-60). Thus those in the sparing event
record are coded as reserved, to be removed when the spec is updated.

Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Link: https://patch.msgid.link/20250717101817.2104-5-shiju.jose@huawei.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/cxl/core/mbox.c  |   6 +++
 drivers/cxl/core/trace.h | 105 +++++++++++++++++++++++++++++++++++++++++++++++
 drivers/cxl/cxlmem.h     |   8 ++++
 include/cxl/event.h      |  33 +++++++++++++++
 4 files changed, 152 insertions(+)

(limited to 'include')

diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 445889b128cd..f7e081c00c49 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -899,6 +899,10 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
 		trace_cxl_generic_event(cxlmd, type, uuid, &evt->generic);
 		return;
 	}
+	if (event_type == CXL_CPER_EVENT_MEM_SPARING) {
+		trace_cxl_memory_sparing(cxlmd, type, &evt->mem_sparing);
+		return;
+	}
 
 	if (trace_cxl_general_media_enabled() || trace_cxl_dram_enabled()) {
 		u64 dpa, hpa = ULLONG_MAX, hpa_alias = ULLONG_MAX;
@@ -970,6 +974,8 @@ static void __cxl_event_trace_record(const struct cxl_memdev *cxlmd,
 		ev_type = CXL_CPER_EVENT_DRAM;
 	else if (uuid_equal(uuid, &CXL_EVENT_MEM_MODULE_UUID))
 		ev_type = CXL_CPER_EVENT_MEM_MODULE;
+	else if (uuid_equal(uuid, &CXL_EVENT_MEM_SPARING_UUID))
+		ev_type = CXL_CPER_EVENT_MEM_SPARING;
 
 	cxl_event_trace_record(cxlmd, type, ev_type, uuid, &record->event);
 }
diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h
index 462c2e892ba2..a53ec4798b12 100644
--- a/drivers/cxl/core/trace.h
+++ b/drivers/cxl/core/trace.h
@@ -887,6 +887,111 @@ TRACE_EVENT(cxl_memory_module,
 	)
 );
 
+/*
+ * Memory Sparing Event Record - MSER
+ *
+ * CXL rev 3.2 section 8.2.10.2.1.4; Table 8-60
+ */
+#define CXL_MSER_QUERY_RESOURCE_FLAG			BIT(0)
+#define CXL_MSER_HARD_SPARING_FLAG			BIT(1)
+#define CXL_MSER_DEV_INITED_FLAG			BIT(2)
+#define show_mem_sparing_flags(flags)	__print_flags(flags, "|",	\
+	{ CXL_MSER_QUERY_RESOURCE_FLAG,		"Query Resources" },	\
+	{ CXL_MSER_HARD_SPARING_FLAG,		"Hard Sparing" },	\
+	{ CXL_MSER_DEV_INITED_FLAG,	"Device Initiated Sparing" }	\
+)
+
+#define CXL_MSER_VALID_CHANNEL				BIT(0)
+#define CXL_MSER_VALID_RANK				BIT(1)
+#define CXL_MSER_VALID_NIBBLE				BIT(2)
+#define CXL_MSER_VALID_BANK_GROUP			BIT(3)
+#define CXL_MSER_VALID_BANK				BIT(4)
+#define CXL_MSER_VALID_ROW				BIT(5)
+#define CXL_MSER_VALID_COLUMN				BIT(6)
+#define CXL_MSER_VALID_COMPONENT_ID			BIT(7)
+#define CXL_MSER_VALID_COMPONENT_ID_FORMAT		BIT(8)
+#define CXL_MSER_VALID_SUB_CHANNEL			BIT(9)
+#define show_mem_sparing_valid_flags(flags)	__print_flags(flags, "|",		\
+	{ CXL_MSER_VALID_CHANNEL,			"CHANNEL" },			\
+	{ CXL_MSER_VALID_RANK,				"RANK" },			\
+	{ CXL_MSER_VALID_NIBBLE,			"NIBBLE" },			\
+	{ CXL_MSER_VALID_BANK_GROUP,			"BANK GROUP" },			\
+	{ CXL_MSER_VALID_BANK,				"BANK" },			\
+	{ CXL_MSER_VALID_ROW,				"ROW" },			\
+	{ CXL_MSER_VALID_COLUMN,			"COLUMN" },			\
+	{ CXL_MSER_VALID_COMPONENT_ID,			"COMPONENT ID" },		\
+	{ CXL_MSER_VALID_COMPONENT_ID_FORMAT,		"COMPONENT ID PLDM FORMAT" },	\
+	{ CXL_MSER_VALID_SUB_CHANNEL,			"SUB CHANNEL" }			\
+)
+
+TRACE_EVENT(cxl_memory_sparing,
+
+	TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log,
+		 struct cxl_event_mem_sparing *rec),
+
+	TP_ARGS(cxlmd, log, rec),
+
+	TP_STRUCT__entry(
+		CXL_EVT_TP_entry
+
+		/* Memory Sparing Event */
+		__field(u8, flags)
+		__field(u8, result)
+		__field(u16, validity_flags)
+		__field(u16, res_avail)
+		__field(u8, channel)
+		__field(u8, rank)
+		__field(u32, nibble_mask)
+		__field(u8, bank_group)
+		__field(u8, bank)
+		__field(u32, row)
+		__field(u16, column)
+		__field(u8, sub_channel)
+		__array(u8, comp_id, CXL_EVENT_GEN_MED_COMP_ID_SIZE)
+	),
+
+	TP_fast_assign(
+		CXL_EVT_TP_fast_assign(cxlmd, log, rec->hdr);
+		__entry->hdr_uuid = CXL_EVENT_MEM_SPARING_UUID;
+
+		/* Memory Sparing Event */
+		__entry->flags = rec->flags;
+		__entry->result = rec->result;
+		__entry->validity_flags = le16_to_cpu(rec->validity_flags);
+		__entry->res_avail = le16_to_cpu(rec->res_avail);
+		__entry->channel = rec->channel;
+		__entry->rank = rec->rank;
+		__entry->nibble_mask = get_unaligned_le24(rec->nibble_mask);
+		__entry->bank_group = rec->bank_group;
+		__entry->bank = rec->bank;
+		__entry->row = get_unaligned_le24(rec->row);
+		__entry->column = le16_to_cpu(rec->column);
+		__entry->sub_channel = rec->sub_channel;
+		memcpy(__entry->comp_id, &rec->component_id,
+		       CXL_EVENT_GEN_MED_COMP_ID_SIZE);
+	),
+
+	CXL_EVT_TP_printk("flags='%s' result=%u validity_flags='%s' " \
+		"spare resource avail=%u channel=%u rank=%u " \
+		"nibble_mask=%x bank_group=%u bank=%u " \
+		"row=%u column=%u sub_channel=%u " \
+		"comp_id=%s comp_id_pldm_valid_flags='%s' " \
+		"pldm_entity_id=%s pldm_resource_id=%s",
+		show_mem_sparing_flags(__entry->flags),
+		__entry->result,
+		show_mem_sparing_valid_flags(__entry->validity_flags),
+		__entry->res_avail, __entry->channel, __entry->rank,
+		__entry->nibble_mask, __entry->bank_group, __entry->bank,
+		__entry->row, __entry->column, __entry->sub_channel,
+		__print_hex(__entry->comp_id, CXL_EVENT_GEN_MED_COMP_ID_SIZE),
+		show_comp_id_pldm_flags(__entry->comp_id[0]),
+		show_pldm_entity_id(__entry->validity_flags, CXL_MSER_VALID_COMPONENT_ID,
+				    CXL_MSER_VALID_COMPONENT_ID_FORMAT, __entry->comp_id),
+		show_pldm_resource_id(__entry->validity_flags, CXL_MSER_VALID_COMPONENT_ID,
+				      CXL_MSER_VALID_COMPONENT_ID_FORMAT, __entry->comp_id)
+	)
+);
+
 #define show_poison_trace_type(type)			\
 	__print_symbolic(type,				\
 	{ CXL_POISON_TRACE_LIST,	"List"   },	\
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index 551b0ba2caa1..f98311f357b7 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -633,6 +633,14 @@ struct cxl_mbox_identify {
 	UUID_INIT(0xfe927475, 0xdd59, 0x4339, 0xa5, 0x86, 0x79, 0xba, 0xb1, \
 		  0x13, 0xb7, 0x74)
 
+/*
+ * Memory Sparing Event Record UUID
+ * CXL rev 3.2 section 8.2.10.2.1.4: Table 8-60
+ */
+#define CXL_EVENT_MEM_SPARING_UUID                                          \
+	UUID_INIT(0xe71f3a40, 0x2d29, 0x4092, 0x8a, 0x39, 0x4d, 0x1c, 0x96, \
+		  0x6c, 0x7c, 0x65)
+
 /*
  * Get Event Records output payload
  * CXL rev 3.0 section 8.2.9.2.2; Table 8-50
diff --git a/include/cxl/event.h b/include/cxl/event.h
index f4cb8568566b..6fd90f9cc203 100644
--- a/include/cxl/event.h
+++ b/include/cxl/event.h
@@ -110,11 +110,43 @@ struct cxl_event_mem_module {
 	u8 reserved[0x2a];
 } __packed;
 
+/*
+ * Memory Sparing Event Record - MSER
+ * CXL rev 3.2 section 8.2.10.2.1.4; Table 8-60
+ */
+struct cxl_event_mem_sparing {
+	struct cxl_event_record_hdr hdr;
+	/*
+	 * The maintenance operation class and maintenance operation subclass
+	 * fields defined in the Memory Sparing Event Record duplicate those
+	 * in the common event record. They are therefore defined as reserved
+	 * here, to be removed after the spec is corrected.
+	 */
+	u8 rsv1;
+	u8 rsv2;
+	u8 flags;
+	u8 result;
+	__le16 validity_flags;
+	u8 reserved1[6];
+	__le16 res_avail;
+	u8 channel;
+	u8 rank;
+	u8 nibble_mask[3];
+	u8 bank_group;
+	u8 bank;
+	u8 row[3];
+	__le16 column;
+	u8 component_id[CXL_EVENT_GEN_MED_COMP_ID_SIZE];
+	u8 sub_channel;
+	u8 reserved2[0x25];
+} __packed;
+
 union cxl_event {
 	struct cxl_event_generic generic;
 	struct cxl_event_gen_media gen_media;
 	struct cxl_event_dram dram;
 	struct cxl_event_mem_module mem_module;
+	struct cxl_event_mem_sparing mem_sparing;
 	/* dram & gen_media event header */
 	struct cxl_event_media_hdr media_hdr;
 } __packed;
@@ -133,6 +165,7 @@ enum cxl_event_type {
 	CXL_CPER_EVENT_GEN_MEDIA,
 	CXL_CPER_EVENT_DRAM,
 	CXL_CPER_EVENT_MEM_MODULE,
+	CXL_CPER_EVENT_MEM_SPARING,
 };
 
 #define CPER_CXL_DEVICE_ID_VALID		BIT(0)
-- 
cgit v1.2.3


From 733c43f1df34f9185b945e6f12ac00c8556c6dfe Mon Sep 17 00:00:00 2001
From: Caleb Sander Mateos <csander@purestorage.com>
Date: Tue, 8 Jul 2025 14:22:10 -0600
Subject: io_uring/cmd: introduce IORING_URING_CMD_REISSUE flag

Add a flag IORING_URING_CMD_REISSUE that ->uring_cmd() implementations
can use to tell whether this is the first or subsequent issue of the
uring_cmd. This will allow ->uring_cmd() implementations to store
information in the io_uring_cmd's pdu across issues.

Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Acked-by: David Sterba <dsterba@suse.com>
Link: https://lore.kernel.org/r/20250708202212.2851548-3-csander@purestorage.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring/cmd.h | 2 ++
 io_uring/uring_cmd.c         | 6 +++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
index 53408124c1e5..29892f54e0ac 100644
--- a/include/linux/io_uring/cmd.h
+++ b/include/linux/io_uring/cmd.h
@@ -8,6 +8,8 @@
 
 /* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */
 #define IORING_URING_CMD_CANCELABLE	(1U << 30)
+/* io_uring_cmd is being issued again */
+#define IORING_URING_CMD_REISSUE	(1U << 31)
 
 struct io_uring_cmd {
 	struct file	*file;
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index b228b84a510f..58964a2f8582 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -261,7 +261,11 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
 	}
 
 	ret = file->f_op->uring_cmd(ioucmd, issue_flags);
-	if (ret == -EAGAIN || ret == -EIOCBQUEUED)
+	if (ret == -EAGAIN) {
+		ioucmd->flags |= IORING_URING_CMD_REISSUE;
+		return ret;
+	}
+	if (ret == -EIOCBQUEUED)
 		return ret;
 	if (ret < 0)
 		req_set_fail(req);
-- 
cgit v1.2.3


From 2e6dbb25ea15844c8b617260d635731c37c85ac9 Mon Sep 17 00:00:00 2001
From: Caleb Sander Mateos <csander@purestorage.com>
Date: Tue, 8 Jul 2025 14:22:12 -0600
Subject: io_uring/cmd: remove struct io_uring_cmd_data

There are no more users of struct io_uring_cmd_data and its op_data
field. Remove it to shave 8 bytes from struct io_async_cmd and eliminate
a store and load for every uring_cmd.

Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Acked-by: David Sterba <dsterba@suse.com>
Link: https://lore.kernel.org/r/20250708202212.2851548-5-csander@purestorage.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring/cmd.h |  9 ---------
 io_uring/uring_cmd.c         | 12 +-----------
 io_uring/uring_cmd.h         |  1 -
 3 files changed, 1 insertion(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
index 29892f54e0ac..cfa6d0c0c322 100644
--- a/include/linux/io_uring/cmd.h
+++ b/include/linux/io_uring/cmd.h
@@ -21,10 +21,6 @@ struct io_uring_cmd {
 	u8		pdu[32]; /* available inline for free use */
 };
 
-struct io_uring_cmd_data {
-	void			*op_data;
-};
-
 static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe)
 {
 	return sqe->cmd;
@@ -137,11 +133,6 @@ static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd
 	return cmd_to_io_kiocb(cmd)->tctx->task;
 }
 
-static inline struct io_uring_cmd_data *io_uring_cmd_get_async_data(struct io_uring_cmd *cmd)
-{
-	return cmd_to_io_kiocb(cmd)->async_data;
-}
-
 /*
  * Return uring_cmd's context reference as its context handle for driver to
  * track per-context resource, such as registered kernel IO buffer
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 58964a2f8582..053bac89b6c0 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -26,12 +26,6 @@ static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
 	struct io_async_cmd *ac = req->async_data;
-	struct io_uring_cmd_data *cache = &ac->data;
-
-	if (cache->op_data) {
-		kfree(cache->op_data);
-		cache->op_data = NULL;
-	}
 
 	if (issue_flags & IO_URING_F_UNLOCKED)
 		return;
@@ -40,7 +34,7 @@ static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags)
 	if (ac->vec.nr > IO_VEC_CACHE_SOFT_CAP)
 		io_vec_free(&ac->vec);
 
-	if (io_alloc_cache_put(&req->ctx->cmd_cache, cache)) {
+	if (io_alloc_cache_put(&req->ctx->cmd_cache, ac)) {
 		ioucmd->sqe = NULL;
 		req->async_data = NULL;
 		req->flags &= ~(REQ_F_ASYNC_DATA|REQ_F_NEED_CLEANUP);
@@ -193,9 +187,6 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
 	struct io_async_cmd *ac;
 
-	/* see io_uring_cmd_get_async_data() */
-	BUILD_BUG_ON(offsetof(struct io_async_cmd, data) != 0);
-
 	if (sqe->__pad1)
 		return -EINVAL;
 
@@ -211,7 +202,6 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	ac = io_uring_alloc_async_data(&req->ctx->cmd_cache, req);
 	if (!ac)
 		return -ENOMEM;
-	ac->data.op_data = NULL;
 	ioucmd->sqe = sqe;
 	return 0;
 }
diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h
index 9e11da10ecab..041aef8a8aa3 100644
--- a/io_uring/uring_cmd.h
+++ b/io_uring/uring_cmd.h
@@ -4,7 +4,6 @@
 #include <linux/io_uring_types.h>
 
 struct io_async_cmd {
-	struct io_uring_cmd_data	data;
 	struct iou_vec			vec;
 	struct io_uring_sqe		sqes[2];
 };
-- 
cgit v1.2.3


From 850f14f5b91986e586b66565c9c75bdd4c834571 Mon Sep 17 00:00:00 2001
From: Xu Yilun <yilun.xu@linux.intel.com>
Date: Wed, 16 Jul 2025 15:03:45 +0800
Subject: iommufd: Destroy vdevice on idevice destroy

Destroy iommufd_vdevice (vdev) on iommufd_idevice (idev) destruction so
that vdev can't outlive idev.

idev represents the physical device bound to iommufd, while the vdev
represents the virtual instance of the physical device in the VM. The
lifecycle of the vdev should not be longer than that of the idev. This
doesn't cause a real problem for existing use cases because the vdev
doesn't impact the physical device; it only provides virtualization
information. But to extend vdev for Confidential Computing (CC), secure
configuration needs to be done for the vdev, e.g. TSM Bind/Unbind. These
configurations should be rolled back on idev destroy, or the external
driver (VFIO) functionality may be impacted.

The idev is created by external driver so its destruction can't fail.
The idev implements pre_destroy() op to actively remove its associated
vdev before destroying itself. There are 3 cases on idev pre_destroy():

  1. vdev is already destroyed by userspace. No extra handling needed.
  2. vdev is still alive. Use iommufd_object_tombstone_user() to
     destroy vdev and tombstone the vdev ID.
  3. vdev is being destroyed by userspace. The vdev ID is already
     freed, but the vdev destroy handler has not completed. This
     requires multi-thread synchronization: vdev holds idev's short
     term users reference until vdev destruction completes, and idev
     leverages the existing wait_shortterm mechanism for syncing.

idev should also block any new reference to it after pre_destroy(),
or the following wait shortterm would time out. Introduce a 'destroying'
flag and set it to true on idev pre_destroy(). Any attempt to reference
idev should honor this flag under the protection of
idev->igroup->lock.

Link: https://patch.msgid.link/r/20250716070349.1807226-5-yilun.xu@linux.intel.com
Originally-by: Nicolin Chen <nicolinc@nvidia.com>
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Co-developed-by: "Aneesh Kumar K.V (Arm)" <aneesh.kumar@kernel.org>
Signed-off-by: "Aneesh Kumar K.V (Arm)" <aneesh.kumar@kernel.org>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/iommufd/device.c          | 51 ++++++++++++++++++++++++++++++++
 drivers/iommu/iommufd/iommufd_private.h | 12 ++++++++
 drivers/iommu/iommufd/main.c            |  2 ++
 drivers/iommu/iommufd/viommu.c          | 52 ++++++++++++++++++++++++++++++---
 include/linux/iommufd.h                 |  1 +
 include/uapi/linux/iommufd.h            |  5 ++++
 6 files changed, 119 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index e2ba21c43ad2..ee6ff4caf398 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -137,6 +137,57 @@ static struct iommufd_group *iommufd_get_group(struct iommufd_ctx *ictx,
 	}
 }
 
+static void iommufd_device_remove_vdev(struct iommufd_device *idev)
+{
+	struct iommufd_vdevice *vdev;
+
+	mutex_lock(&idev->igroup->lock);
+	/* prevent new references from vdev */
+	idev->destroying = true;
+	/* vdev has been completely destroyed by userspace */
+	if (!idev->vdev)
+		goto out_unlock;
+
+	vdev = iommufd_get_vdevice(idev->ictx, idev->vdev->obj.id);
+	/*
+	 * An ongoing vdev destroy ioctl has removed the vdev from the object
+	 * xarray, but has not finished iommufd_vdevice_destroy() yet as it
+	 * needs the same mutex. We exit the locking then wait on short term
+	 * users for the vdev destruction.
+	 */
+	if (IS_ERR(vdev))
+		goto out_unlock;
+
+	/* Should never happen */
+	if (WARN_ON(vdev != idev->vdev)) {
+		iommufd_put_object(idev->ictx, &vdev->obj);
+		goto out_unlock;
+	}
+
+	/*
+	 * vdev is still alive. Hold a users refcount to prevent racing with
+	 * userspace destruction, then use iommufd_object_tombstone_user() to
+	 * destroy it and leave a tombstone.
+	 */
+	refcount_inc(&vdev->obj.users);
+	iommufd_put_object(idev->ictx, &vdev->obj);
+	mutex_unlock(&idev->igroup->lock);
+	iommufd_object_tombstone_user(idev->ictx, &vdev->obj);
+	return;
+
+out_unlock:
+	mutex_unlock(&idev->igroup->lock);
+}
+
+void iommufd_device_pre_destroy(struct iommufd_object *obj)
+{
+	struct iommufd_device *idev =
+		container_of(obj, struct iommufd_device, obj);
+
+	/* Release the short term users on this */
+	iommufd_device_remove_vdev(idev);
+}
+
 void iommufd_device_destroy(struct iommufd_object *obj)
 {
 	struct iommufd_device *idev =
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 149545060029..5d6ea5395cfe 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -489,6 +489,8 @@ struct iommufd_device {
 	/* always the physical device */
 	struct device *dev;
 	bool enforce_cache_coherency;
+	struct iommufd_vdevice *vdev;
+	bool destroying;
 };
 
 static inline struct iommufd_device *
@@ -499,6 +501,7 @@ iommufd_get_device(struct iommufd_ucmd *ucmd, u32 id)
 			    struct iommufd_device, obj);
 }
 
+void iommufd_device_pre_destroy(struct iommufd_object *obj);
 void iommufd_device_destroy(struct iommufd_object *obj);
 int iommufd_get_hw_info(struct iommufd_ucmd *ucmd);
 
@@ -687,9 +690,18 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd);
 void iommufd_viommu_destroy(struct iommufd_object *obj);
 int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd);
 void iommufd_vdevice_destroy(struct iommufd_object *obj);
+void iommufd_vdevice_abort(struct iommufd_object *obj);
 int iommufd_hw_queue_alloc_ioctl(struct iommufd_ucmd *ucmd);
 void iommufd_hw_queue_destroy(struct iommufd_object *obj);
 
+static inline struct iommufd_vdevice *
+iommufd_get_vdevice(struct iommufd_ctx *ictx, u32 id)
+{
+	return container_of(iommufd_get_object(ictx, id,
+					       IOMMUFD_OBJ_VDEVICE),
+			    struct iommufd_vdevice, obj);
+}
+
 #ifdef CONFIG_IOMMUFD_TEST
 int iommufd_test(struct iommufd_ucmd *ucmd);
 void iommufd_selftest_destroy(struct iommufd_object *obj);
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 53085d24ce4a..99c1aab3d396 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -655,6 +655,7 @@ static const struct iommufd_object_ops iommufd_object_ops[] = {
 		.destroy = iommufd_access_destroy_object,
 	},
 	[IOMMUFD_OBJ_DEVICE] = {
+		.pre_destroy = iommufd_device_pre_destroy,
 		.destroy = iommufd_device_destroy,
 	},
 	[IOMMUFD_OBJ_FAULT] = {
@@ -676,6 +677,7 @@ static const struct iommufd_object_ops iommufd_object_ops[] = {
 	},
 	[IOMMUFD_OBJ_VDEVICE] = {
 		.destroy = iommufd_vdevice_destroy,
+		.abort = iommufd_vdevice_abort,
 	},
 	[IOMMUFD_OBJ_VEVENTQ] = {
 		.destroy = iommufd_veventq_destroy,
diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
index dcf8a85b9f6e..ecbae5091ffe 100644
--- a/drivers/iommu/iommufd/viommu.c
+++ b/drivers/iommu/iommufd/viommu.c
@@ -110,20 +110,37 @@ out_put_idev:
 	return rc;
 }
 
-void iommufd_vdevice_destroy(struct iommufd_object *obj)
+void iommufd_vdevice_abort(struct iommufd_object *obj)
 {
 	struct iommufd_vdevice *vdev =
 		container_of(obj, struct iommufd_vdevice, obj);
 	struct iommufd_viommu *viommu = vdev->viommu;
+	struct iommufd_device *idev = vdev->idev;
+
+	lockdep_assert_held(&idev->igroup->lock);
 
 	if (vdev->destroy)
 		vdev->destroy(vdev);
 	/* xa_cmpxchg is okay to fail if alloc failed xa_cmpxchg previously */
 	xa_cmpxchg(&viommu->vdevs, vdev->virt_id, vdev, NULL, GFP_KERNEL);
 	refcount_dec(&viommu->obj.users);
+	idev->vdev = NULL;
 	put_device(vdev->dev);
 }
 
+void iommufd_vdevice_destroy(struct iommufd_object *obj)
+{
+	struct iommufd_vdevice *vdev =
+		container_of(obj, struct iommufd_vdevice, obj);
+	struct iommufd_device *idev = vdev->idev;
+	struct iommufd_ctx *ictx = idev->ictx;
+
+	mutex_lock(&idev->igroup->lock);
+	iommufd_vdevice_abort(obj);
+	mutex_unlock(&idev->igroup->lock);
+	iommufd_put_object(ictx, &idev->obj);
+}
+
 int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
 {
 	struct iommu_vdevice_alloc *cmd = ucmd->cmd;
@@ -153,6 +170,17 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
 		goto out_put_idev;
 	}
 
+	mutex_lock(&idev->igroup->lock);
+	if (idev->destroying) {
+		rc = -ENOENT;
+		goto out_unlock_igroup;
+	}
+
+	if (idev->vdev) {
+		rc = -EEXIST;
+		goto out_unlock_igroup;
+	}
+
 	if (viommu->ops && viommu->ops->vdevice_size) {
 		/*
 		 * It is a driver bug for:
@@ -171,7 +199,7 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
 		ucmd->ictx, vdev_size, IOMMUFD_OBJ_VDEVICE);
 	if (IS_ERR(vdev)) {
 		rc = PTR_ERR(vdev);
-		goto out_put_idev;
+		goto out_unlock_igroup;
 	}
 
 	vdev->virt_id = virt_id;
@@ -179,6 +207,19 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
 	get_device(idev->dev);
 	vdev->viommu = viommu;
 	refcount_inc(&viommu->obj.users);
+	/*
+	 * A short term users reference is held on the idev so long as we have
+	 * the pointer. iommufd_device_pre_destroy() will revoke it before the
+	 * idev real destruction.
+	 */
+	vdev->idev = idev;
+
+	/*
+	 * iommufd_device_destroy() delays until idev->vdev is NULL before
+	 * freeing the idev, which only happens once the vdev is finished
+	 * destruction.
+	 */
+	idev->vdev = vdev;
 
 	curr = xa_cmpxchg(&viommu->vdevs, virt_id, NULL, vdev, GFP_KERNEL);
 	if (curr) {
@@ -197,12 +238,15 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
 	if (rc)
 		goto out_abort;
 	iommufd_object_finalize(ucmd->ictx, &vdev->obj);
-	goto out_put_idev;
+	goto out_unlock_igroup;
 
 out_abort:
 	iommufd_object_abort_and_destroy(ucmd->ictx, &vdev->obj);
+out_unlock_igroup:
+	mutex_unlock(&idev->igroup->lock);
 out_put_idev:
-	iommufd_put_object(ucmd->ictx, &idev->obj);
+	if (rc)
+		iommufd_put_object(ucmd->ictx, &idev->obj);
 out_put_viommu:
 	iommufd_put_object(ucmd->ictx, &viommu->obj);
 	return rc;
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index e3a0cd47384d..b88911026bc4 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -108,6 +108,7 @@ struct iommufd_viommu {
 struct iommufd_vdevice {
 	struct iommufd_object obj;
 	struct iommufd_viommu *viommu;
+	struct iommufd_device *idev;
 	struct device *dev;
 
 	/*
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 554aacf89ea7..c218c89e0e2e 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -1070,6 +1070,11 @@ struct iommu_viommu_alloc {
  *
  * Allocate a virtual device instance (for a physical device) against a vIOMMU.
  * This instance holds the device's information (related to its vIOMMU) in a VM.
+ * User should use IOMMU_DESTROY to destroy the virtual device before
+ * destroying the physical device (by closing vfio_cdev fd). Otherwise the
+ * virtual device would be forcibly destroyed on physical device destruction,
+ * its vdevice_id would be permanently leaked (unremovable & unreusable) until
+ * iommu fd closed.
  */
 struct iommu_vdevice_alloc {
 	__u32 size;
-- 
cgit v1.2.3


From 651f733675c4a26e59dd34522917eace20c557c0 Mon Sep 17 00:00:00 2001
From: Xu Yilun <yilun.xu@linux.intel.com>
Date: Wed, 16 Jul 2025 15:03:46 +0800
Subject: iommufd/vdevice: Remove struct device reference from struct vdevice

Remove struct device *dev from struct vdevice.

The dev pointer was the Plan B for vdevice to reference the physical
device. Now that vdev->idev has been added without refcounting concerns,
just use vdev->idev->dev when needed. To avoid exposing
struct iommufd_device in the public header, export an
iommufd_vdevice_to_device() helper.

Link: https://patch.msgid.link/r/20250716070349.1807226-6-yilun.xu@linux.intel.com
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Co-developed-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c |  3 ++-
 drivers/iommu/iommufd/driver.c                 | 10 ++++++++--
 drivers/iommu/iommufd/viommu.c                 |  3 ---
 include/linux/iommufd.h                        |  8 +++++++-
 4 files changed, 17 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
index eb90af5093d8..4c86eacd36b1 100644
--- a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
+++ b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
@@ -1218,7 +1218,8 @@ static void tegra241_vintf_destroy_vsid(struct iommufd_vdevice *vdev)
 
 static int tegra241_vintf_init_vsid(struct iommufd_vdevice *vdev)
 {
-	struct arm_smmu_master *master = dev_iommu_priv_get(vdev->dev);
+	struct device *dev = iommufd_vdevice_to_device(vdev);
+	struct arm_smmu_master *master = dev_iommu_priv_get(dev);
 	struct tegra241_vintf *vintf = viommu_to_vintf(vdev->viommu);
 	struct tegra241_vintf_sid *vsid = vdev_to_vsid(vdev);
 	struct arm_smmu_stream *stream = &master->streams[0];
diff --git a/drivers/iommu/iommufd/driver.c b/drivers/iommu/iommufd/driver.c
index e4eae20bcd4e..6f1010da221c 100644
--- a/drivers/iommu/iommufd/driver.c
+++ b/drivers/iommu/iommufd/driver.c
@@ -83,6 +83,12 @@ void _iommufd_destroy_mmap(struct iommufd_ctx *ictx,
 }
 EXPORT_SYMBOL_NS_GPL(_iommufd_destroy_mmap, "IOMMUFD");
 
+struct device *iommufd_vdevice_to_device(struct iommufd_vdevice *vdev)
+{
+	return vdev->idev->dev;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_vdevice_to_device, "IOMMUFD");
+
 /* Caller should xa_lock(&viommu->vdevs) to protect the return value */
 struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu,
 				       unsigned long vdev_id)
@@ -92,7 +98,7 @@ struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu,
 	lockdep_assert_held(&viommu->vdevs.xa_lock);
 
 	vdev = xa_load(&viommu->vdevs, vdev_id);
-	return vdev ? vdev->dev : NULL;
+	return vdev ? iommufd_vdevice_to_device(vdev) : NULL;
 }
 EXPORT_SYMBOL_NS_GPL(iommufd_viommu_find_dev, "IOMMUFD");
 
@@ -109,7 +115,7 @@ int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu,
 
 	xa_lock(&viommu->vdevs);
 	xa_for_each(&viommu->vdevs, index, vdev) {
-		if (vdev->dev == dev) {
+		if (iommufd_vdevice_to_device(vdev) == dev) {
 			*vdev_id = vdev->virt_id;
 			rc = 0;
 			break;
diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
index ecbae5091ffe..6cf0bd5d8f08 100644
--- a/drivers/iommu/iommufd/viommu.c
+++ b/drivers/iommu/iommufd/viommu.c
@@ -125,7 +125,6 @@ void iommufd_vdevice_abort(struct iommufd_object *obj)
 	xa_cmpxchg(&viommu->vdevs, vdev->virt_id, vdev, NULL, GFP_KERNEL);
 	refcount_dec(&viommu->obj.users);
 	idev->vdev = NULL;
-	put_device(vdev->dev);
 }
 
 void iommufd_vdevice_destroy(struct iommufd_object *obj)
@@ -203,8 +202,6 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
 	}
 
 	vdev->virt_id = virt_id;
-	vdev->dev = idev->dev;
-	get_device(idev->dev);
 	vdev->viommu = viommu;
 	refcount_inc(&viommu->obj.users);
 	/*
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index b88911026bc4..810e4d8ac912 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -109,7 +109,6 @@ struct iommufd_vdevice {
 	struct iommufd_object obj;
 	struct iommufd_viommu *viommu;
 	struct iommufd_device *idev;
-	struct device *dev;
 
 	/*
 	 * Virtual device ID per vIOMMU, e.g. vSID of ARM SMMUv3, vDeviceID of
@@ -261,6 +260,7 @@ int _iommufd_alloc_mmap(struct iommufd_ctx *ictx, struct iommufd_object *owner,
 			unsigned long *offset);
 void _iommufd_destroy_mmap(struct iommufd_ctx *ictx,
 			   struct iommufd_object *owner, unsigned long offset);
+struct device *iommufd_vdevice_to_device(struct iommufd_vdevice *vdev);
 struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu,
 				       unsigned long vdev_id);
 int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu,
@@ -295,6 +295,12 @@ static inline void _iommufd_destroy_mmap(struct iommufd_ctx *ictx,
 {
 }
 
+static inline struct device *
+iommufd_vdevice_to_device(struct iommufd_vdevice *vdev)
+{
+	return NULL;
+}
+
 static inline struct device *
 iommufd_viommu_find_dev(struct iommufd_viommu *viommu, unsigned long vdev_id)
 {
-- 
cgit v1.2.3


From ab6bc44159d8f0c4ee757e0ce041fa9033e0ead8 Mon Sep 17 00:00:00 2001
From: Xu Yilun <yilun.xu@linux.intel.com>
Date: Wed, 16 Jul 2025 15:03:49 +0800
Subject: iommufd: Rename some shortterm-related identifiers

Rename the shortterm-related identifiers to wait-related.

The shortterm_users refcount is now used beyond what its name implies.
It is also used for references which live longer than an ioctl
execution. E.g. vdev takes idev's shortterm_users refcount on vdev
allocation and releases it during idev's pre_destroy(). Rename the
refcount to wait_cnt, since it is always used to synchronize referencing
and destruction of the object by waiting for it to go to zero.

List all changed identifiers:

  iommufd_object::shortterm_users -> iommufd_object::wait_cnt
  REMOVE_WAIT_SHORTTERM -> REMOVE_WAIT
  iommufd_object_dec_wait_shortterm() -> iommufd_object_dec_wait()
  zerod_shortterm -> zerod_wait_cnt

No functional change intended.

Link: https://patch.msgid.link/r/20250716070349.1807226-9-yilun.xu@linux.intel.com
Suggested-by: Kevin Tian <kevin.tian@intel.com>
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Nicolin Chen <nicolinc@nvidia.com>
Tested-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/iommufd/device.c          |  6 ++---
 drivers/iommu/iommufd/iommufd_private.h | 18 +++++++--------
 drivers/iommu/iommufd/main.c            | 39 +++++++++++++++++----------------
 drivers/iommu/iommufd/viommu.c          |  4 ++--
 include/linux/iommufd.h                 |  8 ++++++-
 5 files changed, 41 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index ee6ff4caf398..65fbd098f9e9 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -152,8 +152,8 @@ static void iommufd_device_remove_vdev(struct iommufd_device *idev)
 	/*
 	 * An ongoing vdev destroy ioctl has removed the vdev from the object
 	 * xarray, but has not finished iommufd_vdevice_destroy() yet as it
-	 * needs the same mutex. We exit the locking then wait on short term
-	 * users for the vdev destruction.
+	 * needs the same mutex. We exit the locking then wait on wait_cnt
+	 * reference for the vdev destruction.
 	 */
 	if (IS_ERR(vdev))
 		goto out_unlock;
@@ -184,7 +184,7 @@ void iommufd_device_pre_destroy(struct iommufd_object *obj)
 	struct iommufd_device *idev =
 		container_of(obj, struct iommufd_device, obj);
 
-	/* Release the short term users on this */
+	/* Release the wait_cnt reference on this */
 	iommufd_device_remove_vdev(idev);
 }
 
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 5d6ea5395cfe..0da2a81eedfa 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -169,7 +169,7 @@ static inline bool iommufd_lock_obj(struct iommufd_object *obj)
 {
 	if (!refcount_inc_not_zero(&obj->users))
 		return false;
-	if (!refcount_inc_not_zero(&obj->shortterm_users)) {
+	if (!refcount_inc_not_zero(&obj->wait_cnt)) {
 		/*
 		 * If the caller doesn't already have a ref on obj this must be
 		 * called under the xa_lock. Otherwise the caller is holding a
@@ -187,11 +187,11 @@ static inline void iommufd_put_object(struct iommufd_ctx *ictx,
 				      struct iommufd_object *obj)
 {
 	/*
-	 * Users first, then shortterm so that REMOVE_WAIT_SHORTTERM never sees
-	 * a spurious !0 users with a 0 shortterm_users.
+	 * Users first, then wait_cnt so that REMOVE_WAIT never sees a spurious
+	 * !0 users with a 0 wait_cnt.
 	 */
 	refcount_dec(&obj->users);
-	if (refcount_dec_and_test(&obj->shortterm_users))
+	if (refcount_dec_and_test(&obj->wait_cnt))
 		wake_up_interruptible_all(&ictx->destroy_wait);
 }
 
@@ -202,7 +202,7 @@ void iommufd_object_finalize(struct iommufd_ctx *ictx,
 			     struct iommufd_object *obj);
 
 enum {
-	REMOVE_WAIT_SHORTTERM	= BIT(0),
+	REMOVE_WAIT		= BIT(0),
 	REMOVE_OBJ_TOMBSTONE	= BIT(1),
 };
 int iommufd_object_remove(struct iommufd_ctx *ictx,
@@ -211,15 +211,15 @@ int iommufd_object_remove(struct iommufd_ctx *ictx,
 
 /*
  * The caller holds a users refcount and wants to destroy the object. At this
- * point the caller has no shortterm_users reference and at least the xarray
- * will be holding one.
+ * point the caller has no wait_cnt reference and at least the xarray will be
+ * holding one.
  */
 static inline void iommufd_object_destroy_user(struct iommufd_ctx *ictx,
 					       struct iommufd_object *obj)
 {
 	int ret;
 
-	ret = iommufd_object_remove(ictx, obj, obj->id, REMOVE_WAIT_SHORTTERM);
+	ret = iommufd_object_remove(ictx, obj, obj->id, REMOVE_WAIT);
 
 	/*
 	 * If there is a bug and we couldn't destroy the object then we did put
@@ -239,7 +239,7 @@ static inline void iommufd_object_tombstone_user(struct iommufd_ctx *ictx,
 	int ret;
 
 	ret = iommufd_object_remove(ictx, obj, obj->id,
-				    REMOVE_WAIT_SHORTTERM | REMOVE_OBJ_TOMBSTONE);
+				    REMOVE_WAIT | REMOVE_OBJ_TOMBSTONE);
 
 	/*
 	 * If there is a bug and we couldn't destroy the object then we did put
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 99c1aab3d396..15af7ced0501 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -42,7 +42,7 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
 		return ERR_PTR(-ENOMEM);
 	obj->type = type;
 	/* Starts out bias'd by 1 until it is removed from the xarray */
-	refcount_set(&obj->shortterm_users, 1);
+	refcount_set(&obj->wait_cnt, 1);
 	refcount_set(&obj->users, 1);
 
 	/*
@@ -155,22 +155,22 @@ struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id,
 	return obj;
 }
 
-static int iommufd_object_dec_wait_shortterm(struct iommufd_ctx *ictx,
-					     struct iommufd_object *to_destroy)
+static int iommufd_object_dec_wait(struct iommufd_ctx *ictx,
+				   struct iommufd_object *to_destroy)
 {
-	if (refcount_dec_and_test(&to_destroy->shortterm_users))
+	if (refcount_dec_and_test(&to_destroy->wait_cnt))
 		return 0;
 
 	if (iommufd_object_ops[to_destroy->type].pre_destroy)
 		iommufd_object_ops[to_destroy->type].pre_destroy(to_destroy);
 
 	if (wait_event_timeout(ictx->destroy_wait,
-			       refcount_read(&to_destroy->shortterm_users) == 0,
+			       refcount_read(&to_destroy->wait_cnt) == 0,
 			       msecs_to_jiffies(60000)))
 		return 0;
 
 	pr_crit("Time out waiting for iommufd object to become free\n");
-	refcount_inc(&to_destroy->shortterm_users);
+	refcount_inc(&to_destroy->wait_cnt);
 	return -EBUSY;
 }
 
@@ -184,17 +184,18 @@ int iommufd_object_remove(struct iommufd_ctx *ictx,
 {
 	struct iommufd_object *obj;
 	XA_STATE(xas, &ictx->objects, id);
-	bool zerod_shortterm = false;
+	bool zerod_wait_cnt = false;
 	int ret;
 
 	/*
-	 * The purpose of the shortterm_users is to ensure deterministic
-	 * destruction of objects used by external drivers and destroyed by this
-	 * function. Any temporary increment of the refcount must increment
-	 * shortterm_users, such as during ioctl execution.
+	 * The purpose of the wait_cnt is to ensure deterministic destruction
+	 * of objects used by external drivers and destroyed by this function.
+	 * Incrementing this wait_cnt should either be short lived, such as
+	 * during ioctl execution, or be revoked and blocked during
+	 * pre_destroy(), such as vdev holding the idev's refcount.
 	 */
-	if (flags & REMOVE_WAIT_SHORTTERM) {
-		ret = iommufd_object_dec_wait_shortterm(ictx, to_destroy);
+	if (flags & REMOVE_WAIT) {
+		ret = iommufd_object_dec_wait(ictx, to_destroy);
 		if (ret) {
 			/*
 			 * We have a bug. Put back the callers reference and
@@ -203,7 +204,7 @@ int iommufd_object_remove(struct iommufd_ctx *ictx,
 			refcount_dec(&to_destroy->users);
 			return ret;
 		}
-		zerod_shortterm = true;
+		zerod_wait_cnt = true;
 	}
 
 	xa_lock(&ictx->objects);
@@ -235,11 +236,11 @@ int iommufd_object_remove(struct iommufd_ctx *ictx,
 	xa_unlock(&ictx->objects);
 
 	/*
-	 * Since users is zero any positive users_shortterm must be racing
+	 * Since users is zero any positive wait_cnt must be racing
 	 * iommufd_put_object(), or we have a bug.
 	 */
-	if (!zerod_shortterm) {
-		ret = iommufd_object_dec_wait_shortterm(ictx, obj);
+	if (!zerod_wait_cnt) {
+		ret = iommufd_object_dec_wait(ictx, obj);
 		if (WARN_ON(ret))
 			return ret;
 	}
@@ -249,9 +250,9 @@ int iommufd_object_remove(struct iommufd_ctx *ictx,
 	return 0;
 
 err_xa:
-	if (zerod_shortterm) {
+	if (zerod_wait_cnt) {
 		/* Restore the xarray owned reference */
-		refcount_set(&obj->shortterm_users, 1);
+		refcount_set(&obj->wait_cnt, 1);
 	}
 	xa_unlock(&ictx->objects);
 
diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
index 6cf0bd5d8f08..2ca5809b238b 100644
--- a/drivers/iommu/iommufd/viommu.c
+++ b/drivers/iommu/iommufd/viommu.c
@@ -205,8 +205,8 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd)
 	vdev->viommu = viommu;
 	refcount_inc(&viommu->obj.users);
 	/*
-	 * A short term users reference is held on the idev so long as we have
-	 * the pointer. iommufd_device_pre_destroy() will revoke it before the
+	 * A wait_cnt reference is held on the idev so long as we have the
+	 * pointer. iommufd_device_pre_destroy() will revoke it before the
 	 * idev real destruction.
 	 */
 	vdev->idev = idev;
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index 810e4d8ac912..6e7efe83bc5d 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -46,7 +46,13 @@ enum iommufd_object_type {
 
 /* Base struct for all objects with a userspace ID handle. */
 struct iommufd_object {
-	refcount_t shortterm_users;
+	/*
+	 * Destroy will sleep and wait for wait_cnt to go to zero. This allows
+	 * concurrent users of the ID to reliably avoid causing a spurious
+	 * destroy failure. Incrementing this count should either be short
+	 * lived or be revoked and blocked during pre_destroy().
+	 */
+	refcount_t wait_cnt;
 	refcount_t users;
 	enum iommufd_object_type type;
 	unsigned int id;
-- 
cgit v1.2.3


From a6f190630d070173897a7e98a30188b7638ba0a1 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <hawk@kernel.org>
Date: Wed, 16 Jul 2025 18:26:53 +0200
Subject: net: track pfmemalloc drops via SKB_DROP_REASON_PFMEMALLOC

Add a new SKB drop reason (SKB_DROP_REASON_PFMEMALLOC) to track packets
dropped due to memory pressure. In production environments, we've observed
memory exhaustion reported by memory layer stack traces, but these drops
were not properly tracked in the SKB drop reason infrastructure.

While most network code paths now properly report pfmemalloc drops, some
protocol-specific socket implementations still use sk_filter() without
drop reason tracking:
- Bluetooth L2CAP sockets
- CAIF sockets
- IUCV sockets
- Netlink sockets
- SCTP sockets
- Unix domain sockets

These remaining cases represent less common paths and could be converted
in a follow-up patch if needed. The current implementation provides
significantly improved observability into memory pressure events in the
network stack, especially for key protocols like TCP and UDP, helping to
diagnose problems in production environments.

Reported-by: Matt Fleming <mfleming@cloudflare.com>
Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
Link: https://patch.msgid.link/175268316579.2407873.11634752355644843509.stgit@firesoul
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/tun.c             |  6 ++----
 include/linux/filter.h        | 14 ++++++++++++--
 include/net/dropreason-core.h |  6 ++++++
 include/net/tcp.h             |  2 +-
 net/core/dev.c                |  8 ++++++--
 net/core/filter.c             | 15 ++++++++++++---
 net/core/sock.c               | 20 +++++++++++++-------
 net/ipv4/tcp_ipv4.c           | 26 +++++++++++++++-----------
 net/ipv4/udp.c                |  6 ++----
 net/ipv6/tcp_ipv6.c           |  9 +++------
 net/ipv6/udp.c                |  4 +---
 net/rose/rose_in.c            |  3 ++-
 12 files changed, 75 insertions(+), 44 deletions(-)

(limited to 'include')

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 49bcd12a4ac8..e65228ba3fae 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1002,8 +1002,8 @@ static unsigned int run_ebpf_filter(struct tun_struct *tun,
 /* Net device start xmit */
 static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 {
+	enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
 	struct tun_struct *tun = netdev_priv(dev);
-	enum skb_drop_reason drop_reason;
 	int txq = skb->queue_mapping;
 	struct netdev_queue *queue;
 	struct tun_file *tfile;
@@ -1032,10 +1032,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 	}
 
 	if (tfile->socket.sk->sk_filter &&
-	    sk_filter(tfile->socket.sk, skb)) {
-		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
+	    sk_filter_reason(tfile->socket.sk, skb, &drop_reason))
 		goto drop;
-	}
 
 	len = run_ebpf_filter(tun, skb, len);
 	if (len == 0) {
diff --git a/include/linux/filter.h b/include/linux/filter.h
index f5cf4d35d83e..4e82332afe03 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1073,10 +1073,20 @@ bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr)
 	return set_memory_rox((unsigned long)hdr, hdr->size >> PAGE_SHIFT);
 }
 
-int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap);
+int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap,
+		       enum skb_drop_reason *reason);
+
 static inline int sk_filter(struct sock *sk, struct sk_buff *skb)
 {
-	return sk_filter_trim_cap(sk, skb, 1);
+	enum skb_drop_reason ignore_reason;
+
+	return sk_filter_trim_cap(sk, skb, 1, &ignore_reason);
+}
+
+static inline int sk_filter_reason(struct sock *sk, struct sk_buff *skb,
+				   enum skb_drop_reason *reason)
+{
+	return sk_filter_trim_cap(sk, skb, 1, reason);
 }
 
 struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err);
diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
index 229bb1826f2a..e19184dd1b0f 100644
--- a/include/net/dropreason-core.h
+++ b/include/net/dropreason-core.h
@@ -125,6 +125,7 @@
 	FN(CAN_RX_INVALID_FRAME)	\
 	FN(CANFD_RX_INVALID_FRAME)	\
 	FN(CANXL_RX_INVALID_FRAME)	\
+	FN(PFMEMALLOC)	\
 	FNe(MAX)
 
 /**
@@ -598,6 +599,11 @@ enum skb_drop_reason {
 	 * non conform CAN-XL frame (or device is unable to receive CAN frames)
 	 */
 	SKB_DROP_REASON_CANXL_RX_INVALID_FRAME,
+	/**
+	 * @SKB_DROP_REASON_PFMEMALLOC: packet allocated from memory reserve
+	 * reached a path or socket not eligible for use of memory reserves
+	 */
+	SKB_DROP_REASON_PFMEMALLOC,
 	/**
 	 * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which
 	 * shouldn't be used as a real 'reason' - only for tracing code gen
diff --git a/include/net/tcp.h b/include/net/tcp.h
index bc08de49805c..b3815d104340 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1559,7 +1559,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
 		     enum skb_drop_reason *reason);
 
 
-int tcp_filter(struct sock *sk, struct sk_buff *skb);
+int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason);
 void tcp_set_state(struct sock *sk, int state);
 void tcp_done(struct sock *sk);
 int tcp_abort(struct sock *sk, int err);
diff --git a/net/core/dev.c b/net/core/dev.c
index 621a639aeba1..59a9089117de 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5749,6 +5749,7 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
 static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
 				    struct packet_type **ppt_prev)
 {
+	enum skb_drop_reason drop_reason = SKB_DROP_REASON_UNHANDLED_PROTO;
 	struct packet_type *ptype, *pt_prev;
 	rx_handler_func_t *rx_handler;
 	struct sk_buff *skb = *pskb;
@@ -5840,8 +5841,10 @@ skip_taps:
 #endif
 	skb_reset_redirect(skb);
 skip_classify:
-	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
+	if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) {
+		drop_reason = SKB_DROP_REASON_PFMEMALLOC;
 		goto drop;
+	}
 
 	if (skb_vlan_tag_present(skb)) {
 		if (pt_prev) {
@@ -5946,7 +5949,8 @@ drop:
 			dev_core_stats_rx_dropped_inc(skb->dev);
 		else
 			dev_core_stats_rx_nohandler_inc(skb->dev);
-		kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO);
+
+		kfree_skb_reason(skb, drop_reason);
 		/* Jamal, now you will not able to escape explaining
 		 * me how you were going to use this. :-)
 		 */
diff --git a/net/core/filter.c b/net/core/filter.c
index 7a72f766aacf..2eb8947d8097 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -122,6 +122,7 @@ EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user);
  *	@sk: sock associated with &sk_buff
  *	@skb: buffer to filter
  *	@cap: limit on how short the eBPF program may trim the packet
+ *	@reason: record drop reason on errors (negative return value)
  *
  * Run the eBPF program and then cut skb->data to correct size returned by
  * the program. If pkt_len is 0 we toss packet. If skb->len is smaller
@@ -130,7 +131,8 @@ EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user);
  * be accepted or -EPERM if the packet should be tossed.
  *
  */
-int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
+int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb,
+		       unsigned int cap, enum skb_drop_reason *reason)
 {
 	int err;
 	struct sk_filter *filter;
@@ -142,15 +144,20 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
 	 */
 	if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) {
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
+		*reason = SKB_DROP_REASON_PFMEMALLOC;
 		return -ENOMEM;
 	}
 	err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb);
-	if (err)
+	if (err) {
+		*reason = SKB_DROP_REASON_SOCKET_FILTER;
 		return err;
+	}
 
 	err = security_sock_rcv_skb(sk, skb);
-	if (err)
+	if (err) {
+		*reason = SKB_DROP_REASON_SECURITY_HOOK;
 		return err;
+	}
 
 	rcu_read_lock();
 	filter = rcu_dereference(sk->sk_filter);
@@ -162,6 +169,8 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
 		pkt_len = bpf_prog_run_save_cb(filter->prog, skb);
 		skb->sk = save_sk;
 		err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM;
+		if (err)
+			*reason = SKB_DROP_REASON_SOCKET_FILTER;
 	}
 	rcu_read_unlock();
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 8b7623c7d547..7c26ec8dce63 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -526,11 +526,10 @@ int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
 	enum skb_drop_reason drop_reason;
 	int err;
 
-	err = sk_filter(sk, skb);
-	if (err) {
-		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
+	err = sk_filter_reason(sk, skb, &drop_reason);
+	if (err)
 		goto out;
-	}
+
 	err = __sock_queue_rcv_skb(sk, skb);
 	switch (err) {
 	case -ENOMEM:
@@ -553,15 +552,18 @@ EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 		     const int nested, unsigned int trim_cap, bool refcounted)
 {
+	enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
 	int rc = NET_RX_SUCCESS;
+	int err;
 
-	if (sk_filter_trim_cap(sk, skb, trim_cap))
+	if (sk_filter_trim_cap(sk, skb, trim_cap, &reason))
 		goto discard_and_relse;
 
 	skb->dev = NULL;
 
 	if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
 		atomic_inc(&sk->sk_drops);
+		reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
 		goto discard_and_relse;
 	}
 	if (nested)
@@ -577,8 +579,12 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 		rc = sk_backlog_rcv(sk, skb);
 
 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
-	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
+	} else if ((err = sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)))) {
 		bh_unlock_sock(sk);
+		if (err == -ENOMEM)
+			reason = SKB_DROP_REASON_PFMEMALLOC;
+		if (err == -ENOBUFS)
+			reason = SKB_DROP_REASON_SOCKET_BACKLOG;
 		atomic_inc(&sk->sk_drops);
 		goto discard_and_relse;
 	}
@@ -589,7 +595,7 @@ out:
 		sock_put(sk);
 	return rc;
 discard_and_relse:
-	kfree_skb(skb);
+	sk_skb_reason_drop(sk, skb, reason);
 	goto out;
 }
 EXPORT_SYMBOL(__sk_receive_skb);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 16bf6fdff96b..84d3d556ed80 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2026,6 +2026,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
 	u32 gso_size;
 	u64 limit;
 	int delta;
+	int err;
 
 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
 	 * we can fix skb->truesize to its real value to avoid future drops.
@@ -2136,21 +2137,27 @@ no_coalesce:
 
 	limit = min_t(u64, limit, UINT_MAX);
 
-	if (unlikely(sk_add_backlog(sk, skb, limit))) {
+	err = sk_add_backlog(sk, skb, limit);
+	if (unlikely(err)) {
 		bh_unlock_sock(sk);
-		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
-		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
+		if (err == -ENOMEM) {
+			*reason = SKB_DROP_REASON_PFMEMALLOC;
+			__NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
+		} else {
+			*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
+			__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
+		}
 		return true;
 	}
 	return false;
 }
 EXPORT_IPV6_MOD(tcp_add_backlog);
 
-int tcp_filter(struct sock *sk, struct sk_buff *skb)
+int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason)
 {
 	struct tcphdr *th = (struct tcphdr *)skb->data;
 
-	return sk_filter_trim_cap(sk, skb, th->doff * 4);
+	return sk_filter_trim_cap(sk, skb, th->doff * 4, reason);
 }
 EXPORT_IPV6_MOD(tcp_filter);
 
@@ -2277,14 +2284,12 @@ lookup:
 		}
 		refcounted = true;
 		nsk = NULL;
-		if (!tcp_filter(sk, skb)) {
+		if (!tcp_filter(sk, skb, &drop_reason)) {
 			th = (const struct tcphdr *)skb->data;
 			iph = ip_hdr(skb);
 			tcp_v4_fill_cb(skb, iph, th);
 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen,
 					    &drop_reason);
-		} else {
-			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
 		}
 		if (!nsk) {
 			reqsk_put(req);
@@ -2340,10 +2345,9 @@ process:
 
 	nf_reset_ct(skb);
 
-	if (tcp_filter(sk, skb)) {
-		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
+	if (tcp_filter(sk, skb, &drop_reason))
 		goto discard_and_relse;
-	}
+
 	th = (const struct tcphdr *)skb->data;
 	iph = ip_hdr(skb);
 	tcp_v4_fill_cb(skb, iph, th);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 49f43c54cfb0..cc3ce0f762ec 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2347,7 +2347,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
  */
 static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
 {
-	int drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
+	enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
 	struct udp_sock *up = udp_sk(sk);
 	int is_udplite = IS_UDPLITE(sk);
 
@@ -2436,10 +2436,8 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
 	    udp_lib_checksum_complete(skb))
 			goto csum_error;
 
-	if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr))) {
-		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
+	if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr), &drop_reason))
 		goto drop;
-	}
 
 	udp_csum_pull_header(skb);
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 8f2c3cba1f1f..7577e7eb2c97 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1834,14 +1834,12 @@ lookup:
 		}
 		refcounted = true;
 		nsk = NULL;
-		if (!tcp_filter(sk, skb)) {
+		if (!tcp_filter(sk, skb, &drop_reason)) {
 			th = (const struct tcphdr *)skb->data;
 			hdr = ipv6_hdr(skb);
 			tcp_v6_fill_cb(skb, hdr, th);
 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen,
 					    &drop_reason);
-		} else {
-			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
 		}
 		if (!nsk) {
 			reqsk_put(req);
@@ -1897,10 +1895,9 @@ process:
 
 	nf_reset_ct(skb);
 
-	if (tcp_filter(sk, skb)) {
-		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
+	if (tcp_filter(sk, skb, &drop_reason))
 		goto discard_and_relse;
-	}
+
 	th = (const struct tcphdr *)skb->data;
 	hdr = ipv6_hdr(skb);
 	tcp_v6_fill_cb(skb, hdr, th);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 6bbdadbd5fec..6a68f77da44b 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -894,10 +894,8 @@ static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
 	    udp_lib_checksum_complete(skb))
 		goto csum_error;
 
-	if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr))) {
-		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
+	if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr), &drop_reason))
 		goto drop;
-	}
 
 	udp_csum_pull_header(skb);
 
diff --git a/net/rose/rose_in.c b/net/rose/rose_in.c
index 4d67f36dce1b..3e99181e759f 100644
--- a/net/rose/rose_in.c
+++ b/net/rose/rose_in.c
@@ -101,6 +101,7 @@ static int rose_state2_machine(struct sock *sk, struct sk_buff *skb, int framety
  */
 static int rose_state3_machine(struct sock *sk, struct sk_buff *skb, int frametype, int ns, int nr, int q, int d, int m)
 {
+	enum skb_drop_reason dr; /* ignored */
 	struct rose_sock *rose = rose_sk(sk);
 	int queued = 0;
 
@@ -162,7 +163,7 @@ static int rose_state3_machine(struct sock *sk, struct sk_buff *skb, int framety
 		rose_frames_acked(sk, nr);
 		if (ns == rose->vr) {
 			rose_start_idletimer(sk);
-			if (sk_filter_trim_cap(sk, skb, ROSE_MIN_LEN) == 0 &&
+			if (!sk_filter_trim_cap(sk, skb, ROSE_MIN_LEN, &dr) &&
 			    __sock_queue_rcv_skb(sk, skb) == 0) {
 				rose->vr = (rose->vr + 1) % ROSE_MODULUS;
 				queued = 1;
-- 
cgit v1.2.3


From ffea1168346120df9417fcafd8f3a1c93033ae34 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@fomichev.me>
Date: Thu, 17 Jul 2025 10:23:27 -0700
Subject: net: s/dev_get_port_parent_id/netif_get_port_parent_id/

Commit cc34acd577f1 ("docs: net: document new locking reality")
introduced netif_ vs dev_ function semantics: the former expects locked
netdev, the latter takes care of the locking. We don't strictly
follow these semantics on either side, but there are more dev_xxx handlers
now that don't fit. Rename them to netif_xxx where appropriate.

Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20250717172333.1288349-2-sdf@fomichev.me
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c |  2 +-
 include/linux/netdevice.h                       |  4 ++--
 net/bridge/br_switchdev.c                       |  2 +-
 net/core/dev.c                                  | 25 +++++++++++++------------
 net/core/net-sysfs.c                            |  2 +-
 net/core/rtnetlink.c                            |  2 +-
 net/ipv4/ipmr.c                                 |  2 +-
 7 files changed, 20 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index fef418e1ed1a..32c07a8b03d1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -5446,7 +5446,7 @@ int mlx5e_tc_esw_init(struct mlx5_rep_uplink_priv *uplink_priv)
 		goto err_action_counter;
 	}
 
-	err = dev_get_port_parent_id(priv->netdev, &ppid, false);
+	err = netif_get_port_parent_id(priv->netdev, &ppid, false);
 	if (!err) {
 		memcpy(&key, &ppid.id, sizeof(key));
 		mlx5_esw_offloads_devcom_init(esw, key);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e49d8c98d284..c6ba4ea66039 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4223,8 +4223,8 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss,
 int dev_set_mac_address_user(struct net_device *dev, struct sockaddr_storage *ss,
 			     struct netlink_ext_ack *extack);
 int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name);
-int dev_get_port_parent_id(struct net_device *dev,
-			   struct netdev_phys_item_id *ppid, bool recurse);
+int netif_get_port_parent_id(struct net_device *dev,
+			     struct netdev_phys_item_id *ppid, bool recurse);
 bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b);
 
 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again);
diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c
index 9a910cf0256e..fe3f7bbe86ee 100644
--- a/net/bridge/br_switchdev.c
+++ b/net/bridge/br_switchdev.c
@@ -837,7 +837,7 @@ int br_switchdev_port_offload(struct net_bridge_port *p,
 	struct netdev_phys_item_id ppid;
 	int err;
 
-	err = dev_get_port_parent_id(dev, &ppid, false);
+	err = netif_get_port_parent_id(dev, &ppid, false);
 	if (err)
 		return err;
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 59a9089117de..4979a9197b18 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -9844,16 +9844,17 @@ int dev_get_phys_port_name(struct net_device *dev,
 }
 
 /**
- *	dev_get_port_parent_id - Get the device's port parent identifier
- *	@dev: network device
- *	@ppid: pointer to a storage for the port's parent identifier
- *	@recurse: allow/disallow recursion to lower devices
+ * netif_get_port_parent_id() - Get the device's port parent identifier
+ * @dev: network device
+ * @ppid: pointer to a storage for the port's parent identifier
+ * @recurse: allow/disallow recursion to lower devices
+ *
+ * Get the device's port parent identifier.
  *
- *	Get the devices's port parent identifier
+ * Return: 0 on success, -errno on failure.
  */
-int dev_get_port_parent_id(struct net_device *dev,
-			   struct netdev_phys_item_id *ppid,
-			   bool recurse)
+int netif_get_port_parent_id(struct net_device *dev,
+			     struct netdev_phys_item_id *ppid, bool recurse)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
 	struct netdev_phys_item_id first = { };
@@ -9872,7 +9873,7 @@ int dev_get_port_parent_id(struct net_device *dev,
 		return err;
 
 	netdev_for_each_lower_dev(dev, lower_dev, iter) {
-		err = dev_get_port_parent_id(lower_dev, ppid, true);
+		err = netif_get_port_parent_id(lower_dev, ppid, true);
 		if (err)
 			break;
 		if (!first.id_len)
@@ -9883,7 +9884,7 @@ int dev_get_port_parent_id(struct net_device *dev,
 
 	return err;
 }
-EXPORT_SYMBOL(dev_get_port_parent_id);
+EXPORT_SYMBOL(netif_get_port_parent_id);
 
 /**
  *	netdev_port_same_parent_id - Indicate if two network devices have
@@ -9896,8 +9897,8 @@ bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
 	struct netdev_phys_item_id a_id = { };
 	struct netdev_phys_item_id b_id = { };
 
-	if (dev_get_port_parent_id(a, &a_id, true) ||
-	    dev_get_port_parent_id(b, &b_id, true))
+	if (netif_get_port_parent_id(a, &a_id, true) ||
+	    netif_get_port_parent_id(b, &b_id, true))
 		return false;
 
 	return netdev_phys_item_id_same(&a_id, &b_id);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 8f897e2c8b4f..f7a6cc7aea79 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -687,7 +687,7 @@ static ssize_t phys_switch_id_show(struct device *dev,
 	if (ret)
 		return ret;
 
-	ret = dev_get_port_parent_id(netdev, &ppid, false);
+	ret = netif_get_port_parent_id(netdev, &ppid, false);
 	if (!ret)
 		ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id);
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index a9555bfc372f..108995b6eced 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1448,7 +1448,7 @@ static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev)
 	struct netdev_phys_item_id ppid = { };
 	int err;
 
-	err = dev_get_port_parent_id(dev, &ppid, false);
+	err = netif_get_port_parent_id(dev, &ppid, false);
 	if (err) {
 		if (err == -EOPNOTSUPP)
 			return 0;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 3a2044e6033d..e86a8a862c41 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -901,7 +901,7 @@ static int vif_add(struct net *net, struct mr_table *mrt,
 			vifc->vifc_flags | (!mrtsock ? VIFF_STATIC : 0),
 			(VIFF_TUNNEL | VIFF_REGISTER));
 
-	err = dev_get_port_parent_id(dev, &ppid, true);
+	err = netif_get_port_parent_id(dev, &ppid, true);
 	if (err == 0) {
 		memcpy(v->dev_parent_id.id, ppid.id, ppid.id_len);
 		v->dev_parent_id.id_len = ppid.id_len;
-- 
cgit v1.2.3


From af1d017377c1c1931bfb898e719ab712cf79f944 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@fomichev.me>
Date: Thu, 17 Jul 2025 10:23:28 -0700
Subject: net: s/dev_get_mac_address/netif_get_mac_address/

Commit cc34acd577f1 ("docs: net: document new locking reality")
introduced netif_ vs dev_ function semantics: the former expects locked
netdev, the latter takes care of the locking. We don't strictly
follow these semantics on either side, but there are more dev_xxx handlers
now that don't fit. Rename them to netif_xxx where appropriate.

netif_get_mac_address is used only by tun/tap, so move it into
NETDEV_INTERNAL namespace.

Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20250717172333.1288349-3-sdf@fomichev.me
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/tap.c         | 5 +++--
 drivers/net/tun.c         | 3 ++-
 include/linux/netdevice.h | 2 +-
 net/core/dev.c            | 4 ++--
 net/core/dev_ioctl.c      | 3 ++-
 net/core/net-sysfs.c      | 2 +-
 6 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index d82eb7276a8b..1197f245e873 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -1000,8 +1000,8 @@ static long tap_ioctl(struct file *file, unsigned int cmd,
 			return -ENOLINK;
 		}
 		ret = 0;
-		dev_get_mac_address((struct sockaddr *)&ss, dev_net(tap->dev),
-				    tap->dev->name);
+		netif_get_mac_address((struct sockaddr *)&ss, dev_net(tap->dev),
+				      tap->dev->name);
 		if (copy_to_user(&ifr->ifr_name, tap->dev->name, IFNAMSIZ) ||
 		    copy_to_user(&ifr->ifr_hwaddr, &ss, sizeof(ifr->ifr_hwaddr)))
 			ret = -EFAULT;
@@ -1282,3 +1282,4 @@ MODULE_DESCRIPTION("Common library for drivers implementing the TAP interface");
 MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>");
 MODULE_AUTHOR("Sainath Grandhi <sainath.grandhi@intel.com>");
 MODULE_LICENSE("GPL");
+MODULE_IMPORT_NS("NETDEV_INTERNAL");
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index e65228ba3fae..cc6c50180663 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -3223,7 +3223,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 
 	case SIOCGIFHWADDR:
 		/* Get hw address */
-		dev_get_mac_address(&ifr.ifr_hwaddr, net, tun->dev->name);
+		netif_get_mac_address(&ifr.ifr_hwaddr, net, tun->dev->name);
 		if (copy_to_user(argp, &ifr, ifreq_len))
 			ret = -EFAULT;
 		break;
@@ -3732,3 +3732,4 @@ MODULE_AUTHOR(DRV_COPYRIGHT);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_MISCDEV(TUN_MINOR);
 MODULE_ALIAS("devname:net/tun");
+MODULE_IMPORT_NS("NETDEV_INTERNAL");
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c6ba4ea66039..b3a48934b4cb 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4222,7 +4222,7 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss,
 			struct netlink_ext_ack *extack);
 int dev_set_mac_address_user(struct net_device *dev, struct sockaddr_storage *ss,
 			     struct netlink_ext_ack *extack);
-int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name);
+int netif_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name);
 int netif_get_port_parent_id(struct net_device *dev,
 			     struct netdev_phys_item_id *ppid, bool recurse);
 bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b);
diff --git a/net/core/dev.c b/net/core/dev.c
index 4979a9197b18..d71f03874057 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -9765,7 +9765,7 @@ int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss,
 DECLARE_RWSEM(dev_addr_sem);
 
 /* "sa" is a true struct sockaddr with limited "sa_data" member. */
-int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
+int netif_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
 {
 	size_t size = sizeof(sa->sa_data_min);
 	struct net_device *dev;
@@ -9791,7 +9791,7 @@ unlock:
 	up_read(&dev_addr_sem);
 	return ret;
 }
-EXPORT_SYMBOL(dev_get_mac_address);
+EXPORT_SYMBOL_NS_GPL(netif_get_mac_address, "NETDEV_INTERNAL");
 
 int netif_change_carrier(struct net_device *dev, bool new_carrier)
 {
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 616479e71466..ceb2d63a818a 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -728,7 +728,8 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr,
 	switch (cmd) {
 	case SIOCGIFHWADDR:
 		dev_load(net, ifr->ifr_name);
-		ret = dev_get_mac_address(&ifr->ifr_hwaddr, net, ifr->ifr_name);
+		ret = netif_get_mac_address(&ifr->ifr_hwaddr, net,
+					    ifr->ifr_name);
 		if (colon)
 			*colon = ':';
 		return ret;
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index f7a6cc7aea79..e41ad1890e49 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -256,7 +256,7 @@ static ssize_t name_assign_type_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(name_assign_type);
 
-/* use same locking rules as GIFHWADDR ioctl's (dev_get_mac_address()) */
+/* use same locking rules as GIFHWADDR ioctl's (netif_get_mac_address()) */
 static ssize_t address_show(struct device *dev, struct device_attribute *attr,
 			    char *buf)
 {
-- 
cgit v1.2.3


From 0413a34ef678c3e2f0fafb4e113e810a05197030 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@fomichev.me>
Date: Thu, 17 Jul 2025 10:23:29 -0700
Subject: net: s/dev_pre_changeaddr_notify/netif_pre_changeaddr_notify/

Commit cc34acd577f1 ("docs: net: document new locking reality")
introduced netif_ vs dev_ function semantics: the former expects locked
netdev, the latter takes care of the locking. We don't strictly
follow these semantics on either side, but there are more dev_xxx handlers
now that don't fit. Rename them to netif_xxx where appropriate.

netif_pre_changeaddr_notify is used only by ipvlan/bond, so move it into
NETDEV_INTERNAL namespace.

Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20250717172333.1288349-4-sdf@fomichev.me
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/bonding/bond_main.c  |  3 ++-
 drivers/net/ipvlan/ipvlan_main.c |  7 ++++---
 include/linux/netdevice.h        |  4 ++--
 net/bridge/br.c                  |  7 ++++---
 net/bridge/br_if.c               |  3 ++-
 net/core/dev.c                   | 18 ++++++++++--------
 net/core/dev_addr_lists.c        |  2 +-
 7 files changed, 25 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 17c7542be6a5..d8281c486a44 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1040,7 +1040,7 @@ static int bond_set_dev_addr(struct net_device *bond_dev,
 
 	slave_dbg(bond_dev, slave_dev, "bond_dev=%p slave_dev=%p slave_dev->addr_len=%d\n",
 		  bond_dev, slave_dev, slave_dev->addr_len);
-	err = dev_pre_changeaddr_notify(bond_dev, slave_dev->dev_addr, NULL);
+	err = netif_pre_changeaddr_notify(bond_dev, slave_dev->dev_addr, NULL);
 	if (err)
 		return err;
 
@@ -6743,3 +6743,4 @@ module_exit(bonding_exit);
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION(DRV_DESCRIPTION);
 MODULE_AUTHOR("Thomas Davis, tadavis@lbl.gov and many others");
+MODULE_IMPORT_NS("NETDEV_INTERNAL");
diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index 0ed2fd833a5d..660f3db11766 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -784,9 +784,9 @@ static int ipvlan_device_event(struct notifier_block *unused,
 	case NETDEV_PRE_CHANGEADDR:
 		prechaddr_info = ptr;
 		list_for_each_entry(ipvlan, &port->ipvlans, pnode) {
-			err = dev_pre_changeaddr_notify(ipvlan->dev,
-						    prechaddr_info->dev_addr,
-						    extack);
+			err = netif_pre_changeaddr_notify(ipvlan->dev,
+							  prechaddr_info->dev_addr,
+							  extack);
 			if (err)
 				return notifier_from_errno(err);
 		}
@@ -1094,3 +1094,4 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Mahesh Bandewar <maheshb@google.com>");
 MODULE_DESCRIPTION("Driver for L3 (IPv6/IPv4) based VLANs");
 MODULE_ALIAS_RTNL_LINK("ipvlan");
+MODULE_IMPORT_NS("NETDEV_INTERNAL");
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b3a48934b4cb..55c5cd9d1929 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4214,8 +4214,8 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net,
 int __dev_set_mtu(struct net_device *, int);
 int netif_set_mtu(struct net_device *dev, int new_mtu);
 int dev_set_mtu(struct net_device *, int);
-int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
-			      struct netlink_ext_ack *extack);
+int netif_pre_changeaddr_notify(struct net_device *dev, const char *addr,
+				struct netlink_ext_ack *extack);
 int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss,
 			  struct netlink_ext_ack *extack);
 int dev_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss,
diff --git a/net/bridge/br.c b/net/bridge/br.c
index 0adeafe11a36..1885d0c315f0 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -74,9 +74,9 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
 		if (br->dev->addr_assign_type == NET_ADDR_SET)
 			break;
 		prechaddr_info = ptr;
-		err = dev_pre_changeaddr_notify(br->dev,
-						prechaddr_info->dev_addr,
-						extack);
+		err = netif_pre_changeaddr_notify(br->dev,
+						  prechaddr_info->dev_addr,
+						  extack);
 		if (err)
 			return notifier_from_errno(err);
 		break;
@@ -484,3 +484,4 @@ MODULE_LICENSE("GPL");
 MODULE_VERSION(BR_VERSION);
 MODULE_ALIAS_RTNL_LINK("bridge");
 MODULE_DESCRIPTION("Ethernet bridge driver");
+MODULE_IMPORT_NS("NETDEV_INTERNAL");
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 2450690f98cf..98c5b9c3145f 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -668,7 +668,8 @@ int br_add_if(struct net_bridge *br, struct net_device *dev,
 		/* Ask for permission to use this MAC address now, even if we
 		 * don't end up choosing it below.
 		 */
-		err = dev_pre_changeaddr_notify(br->dev, dev->dev_addr, extack);
+		err = netif_pre_changeaddr_notify(br->dev, dev->dev_addr,
+						  extack);
 		if (err)
 			goto err6;
 	}
diff --git a/net/core/dev.c b/net/core/dev.c
index d71f03874057..a47754fa7b15 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -9716,13 +9716,15 @@ void netif_set_group(struct net_device *dev, int new_group)
 }
 
 /**
- *	dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
- *	@dev: device
- *	@addr: new address
- *	@extack: netlink extended ack
+ * netif_pre_changeaddr_notify() - Call NETDEV_PRE_CHANGEADDR.
+ * @dev: device
+ * @addr: new address
+ * @extack: netlink extended ack
+ *
+ * Return: 0 on success, -errno on failure.
  */
-int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
-			      struct netlink_ext_ack *extack)
+int netif_pre_changeaddr_notify(struct net_device *dev, const char *addr,
+				struct netlink_ext_ack *extack)
 {
 	struct netdev_notifier_pre_changeaddr_info info = {
 		.info.dev = dev,
@@ -9734,7 +9736,7 @@ int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
 	rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
 	return notifier_to_errno(rc);
 }
-EXPORT_SYMBOL(dev_pre_changeaddr_notify);
+EXPORT_SYMBOL_NS_GPL(netif_pre_changeaddr_notify, "NETDEV_INTERNAL");
 
 int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss,
 			  struct netlink_ext_ack *extack)
@@ -9748,7 +9750,7 @@ int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss,
 		return -EINVAL;
 	if (!netif_device_present(dev))
 		return -ENODEV;
-	err = dev_pre_changeaddr_notify(dev, ss->__data, extack);
+	err = netif_pre_changeaddr_notify(dev, ss->__data, extack);
 	if (err)
 		return err;
 	if (memcmp(dev->dev_addr, ss->__data, dev->addr_len)) {
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index 90716bd736f3..76c91f224886 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -603,7 +603,7 @@ int dev_addr_add(struct net_device *dev, const unsigned char *addr,
 
 	ASSERT_RTNL();
 
-	err = dev_pre_changeaddr_notify(dev, addr, NULL);
+	err = netif_pre_changeaddr_notify(dev, addr, NULL);
 	if (err)
 		return err;
 	err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type);
-- 
cgit v1.2.3


From 303a8487a657c357ca6abc06a4045f72cdae90d5 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@fomichev.me>
Date: Thu, 17 Jul 2025 10:23:30 -0700
Subject: net: s/__dev_set_mtu/__netif_set_mtu/

Commit cc34acd577f1 ("docs: net: document new locking reality")
introduced netif_ vs dev_ function semantics: the former expects locked
netdev, the latter takes care of the locking. We don't strictly
follow these semantics on either side, but there are more dev_xxx handlers
now that don't fit. Rename them to netif_xxx where appropriate.

__netif_set_mtu is used only by bond, so move it into
NETDEV_INTERNAL namespace.

Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20250717172333.1288349-5-sdf@fomichev.me
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/bonding/bond_main.c |  2 +-
 include/linux/netdevice.h       |  2 +-
 net/core/dev.c                  | 22 +++++++++++++---------
 3 files changed, 15 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index d8281c486a44..257333c88710 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2669,7 +2669,7 @@ static int __bond_release_one(struct net_device *bond_dev,
 
 	if (unregister) {
 		netdev_lock_ops(slave_dev);
-		__dev_set_mtu(slave_dev, slave->original_mtu);
+		__netif_set_mtu(slave_dev, slave->original_mtu);
 		netdev_unlock_ops(slave_dev);
 	} else {
 		dev_set_mtu(slave_dev, slave->original_mtu);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 55c5cd9d1929..8978fbfbd644 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4211,7 +4211,7 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
 			       struct netlink_ext_ack *extack);
 int dev_change_net_namespace(struct net_device *dev, struct net *net,
 			     const char *pat);
-int __dev_set_mtu(struct net_device *, int);
+int __netif_set_mtu(struct net_device *dev, int new_mtu);
 int netif_set_mtu(struct net_device *dev, int new_mtu);
 int dev_set_mtu(struct net_device *, int);
 int netif_pre_changeaddr_notify(struct net_device *dev, const char *addr,
diff --git a/net/core/dev.c b/net/core/dev.c
index a47754fa7b15..a056f0dfc516 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -9594,7 +9594,7 @@ int netif_change_flags(struct net_device *dev, unsigned int flags,
 	return ret;
 }
 
-int __dev_set_mtu(struct net_device *dev, int new_mtu)
+int __netif_set_mtu(struct net_device *dev, int new_mtu)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
 
@@ -9605,7 +9605,7 @@ int __dev_set_mtu(struct net_device *dev, int new_mtu)
 	WRITE_ONCE(dev->mtu, new_mtu);
 	return 0;
 }
-EXPORT_SYMBOL(__dev_set_mtu);
+EXPORT_SYMBOL_NS_GPL(__netif_set_mtu, "NETDEV_INTERNAL");
 
 int dev_validate_mtu(struct net_device *dev, int new_mtu,
 		     struct netlink_ext_ack *extack)
@@ -9624,18 +9624,22 @@ int dev_validate_mtu(struct net_device *dev, int new_mtu,
 }
 
 /**
- *	netif_set_mtu_ext - Change maximum transfer unit
- *	@dev: device
- *	@new_mtu: new transfer unit
- *	@extack: netlink extended ack
+ * netif_set_mtu_ext() - Change maximum transfer unit
+ * @dev: device
+ * @new_mtu: new transfer unit
+ * @extack: netlink extended ack
  *
- *	Change the maximum transfer size of the network device.
+ * Change the maximum transfer size of the network device.
+ *
+ * Return: 0 on success, -errno on failure.
  */
 int netif_set_mtu_ext(struct net_device *dev, int new_mtu,
 		      struct netlink_ext_ack *extack)
 {
 	int err, orig_mtu;
 
+	netdev_ops_assert_locked(dev);
+
 	if (new_mtu == dev->mtu)
 		return 0;
 
@@ -9652,7 +9656,7 @@ int netif_set_mtu_ext(struct net_device *dev, int new_mtu,
 		return err;
 
 	orig_mtu = dev->mtu;
-	err = __dev_set_mtu(dev, new_mtu);
+	err = __netif_set_mtu(dev, new_mtu);
 
 	if (!err) {
 		err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
@@ -9662,7 +9666,7 @@ int netif_set_mtu_ext(struct net_device *dev, int new_mtu,
 			/* setting mtu back and notifying everyone again,
 			 * so that they have a chance to revert changes.
 			 */
-			__dev_set_mtu(dev, orig_mtu);
+			__netif_set_mtu(dev, orig_mtu);
 			call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
 						     new_mtu);
 		}
-- 
cgit v1.2.3


From 93893a57efd431b9b4e72359bc8a8428681ca688 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@fomichev.me>
Date: Thu, 17 Jul 2025 10:23:31 -0700
Subject: net: s/dev_get_flags/netif_get_flags/

Commit cc34acd577f1 ("docs: net: document new locking reality")
introduced netif_ vs dev_ function semantics: the former expects locked
netdev, the latter takes care of the locking. We don't strictly
follow these semantics on either side, but there are more dev_xxx handlers
now that don't fit. Rename them to netif_xxx where appropriate.

Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20250717172333.1288349-6-sdf@fomichev.me
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/infiniband/sw/rxe/rxe_verbs.c |  2 +-
 fs/smb/server/smb2pdu.c               |  2 +-
 include/linux/netdevice.h             |  2 +-
 net/8021q/vlan.c                      |  2 +-
 net/bridge/br_netlink.c               |  2 +-
 net/core/dev.c                        | 10 +++++-----
 net/core/dev_ioctl.c                  |  2 +-
 net/core/rtnetlink.c                  |  4 ++--
 net/ipv4/fib_frontend.c               |  2 +-
 net/ipv4/fib_semantics.c              |  2 +-
 net/ipv4/nexthop.c                    |  2 +-
 net/ipv6/addrconf.c                   |  2 +-
 net/mpls/af_mpls.c                    |  6 +++---
 net/wireless/wext-core.c              |  2 +-
 14 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index 2331e698a65b..4f86b56fee26 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -65,7 +65,7 @@ static int rxe_query_port(struct ib_device *ibdev,
 	attr->state = ib_get_curr_port_state(ndev);
 	if (attr->state == IB_PORT_ACTIVE)
 		attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP;
-	else if (dev_get_flags(ndev) & IFF_UP)
+	else if (netif_get_flags(ndev) & IFF_UP)
 		attr->phys_state = IB_PORT_PHYS_STATE_POLLING;
 	else
 		attr->phys_state = IB_PORT_PHYS_STATE_DISABLED;
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index 63d17cea2e95..fca92d1fea22 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -7847,7 +7847,7 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn,
 		if (!ksmbd_find_netdev_name_iface_list(netdev->name))
 			continue;
 
-		flags = dev_get_flags(netdev);
+		flags = netif_get_flags(netdev);
 		if (!(flags & IFF_RUNNING))
 			continue;
 ipv6_retry:
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 8978fbfbd644..8370cd0f8f6b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4196,7 +4196,7 @@ int generic_hwtstamp_set_lower(struct net_device *dev,
 			       struct kernel_hwtstamp_config *kernel_cfg,
 			       struct netlink_ext_ack *extack);
 int dev_ethtool(struct net *net, struct ifreq *ifr, void __user *userdata);
-unsigned int dev_get_flags(const struct net_device *);
+unsigned int netif_get_flags(const struct net_device *dev);
 int __dev_change_flags(struct net_device *dev, unsigned int flags,
 		       struct netlink_ext_ack *extack);
 int netif_change_flags(struct net_device *dev, unsigned int flags,
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 9a6df8c1daf9..7ffd3386a842 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -483,7 +483,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
 	case NETDEV_UP:
 		/* Put all VLANs for this dev in the up state too.  */
 		vlan_group_for_each_dev(grp, i, vlandev) {
-			flgs = dev_get_flags(vlandev);
+			flgs = netif_get_flags(vlandev);
 			if (flgs & IFF_UP)
 				continue;
 
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 6e337937d0d7..4e2d53b27221 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -479,7 +479,7 @@ static int br_fill_ifinfo(struct sk_buff *skb,
 	hdr->__ifi_pad = 0;
 	hdr->ifi_type = dev->type;
 	hdr->ifi_index = dev->ifindex;
-	hdr->ifi_flags = dev_get_flags(dev);
+	hdr->ifi_flags = netif_get_flags(dev);
 	hdr->ifi_change = 0;
 
 	if (nla_put_string(skb, IFLA_IFNAME, dev->name) ||
diff --git a/net/core/dev.c b/net/core/dev.c
index a056f0dfc516..25905bbf1972 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -9454,12 +9454,12 @@ void dev_set_rx_mode(struct net_device *dev)
 }
 
 /**
- *	dev_get_flags - get flags reported to userspace
- *	@dev: device
+ * netif_get_flags() - get flags reported to userspace
+ * @dev: device
  *
- *	Get the combination of flag bits exported through APIs to userspace.
+ * Get the combination of flag bits exported through APIs to userspace.
  */
-unsigned int dev_get_flags(const struct net_device *dev)
+unsigned int netif_get_flags(const struct net_device *dev)
 {
 	unsigned int flags;
 
@@ -9482,7 +9482,7 @@ unsigned int dev_get_flags(const struct net_device *dev)
 
 	return flags;
 }
-EXPORT_SYMBOL(dev_get_flags);
+EXPORT_SYMBOL(netif_get_flags);
 
 int __dev_change_flags(struct net_device *dev, unsigned int flags,
 		       struct netlink_ext_ack *extack)
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index ceb2d63a818a..9c0ad7f4b5d8 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -147,7 +147,7 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm
 
 	switch (cmd) {
 	case SIOCGIFFLAGS:	/* Get interface flags */
-		ifr->ifr_flags = (short) dev_get_flags(dev);
+		ifr->ifr_flags = (short)netif_get_flags(dev);
 		return 0;
 
 	case SIOCGIFMETRIC:	/* Get the metric on the interface
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 108995b6eced..094b085cff20 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2038,7 +2038,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
 	ifm->__ifi_pad = 0;
 	ifm->ifi_type = READ_ONCE(dev->type);
 	ifm->ifi_index = READ_ONCE(dev->ifindex);
-	ifm->ifi_flags = dev_get_flags(dev);
+	ifm->ifi_flags = netif_get_flags(dev);
 	ifm->ifi_change = change;
 
 	if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid))
@@ -5227,7 +5227,7 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
 	ifm->__ifi_pad = 0;
 	ifm->ifi_type = dev->type;
 	ifm->ifi_index = dev->ifindex;
-	ifm->ifi_flags = dev_get_flags(dev);
+	ifm->ifi_flags = netif_get_flags(dev);
 	ifm->ifi_change = 0;
 
 
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index fd1e1507a224..6e1b94796f67 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1524,7 +1524,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
 		fib_disable_ip(dev, event, false);
 		break;
 	case NETDEV_CHANGE:
-		flags = dev_get_flags(dev);
+		flags = netif_get_flags(dev);
 		if (flags & (IFF_RUNNING | IFF_LOWER_UP))
 			fib_sync_up(dev, RTNH_F_LINKDOWN);
 		else
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index a2f04992f579..a5f3c8459758 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -2087,7 +2087,7 @@ int fib_sync_up(struct net_device *dev, unsigned char nh_flags)
 		return 0;
 
 	if (nh_flags & RTNH_F_DEAD) {
-		unsigned int flags = dev_get_flags(dev);
+		unsigned int flags = netif_get_flags(dev);
 
 		if (flags & (IFF_RUNNING | IFF_LOWER_UP))
 			nh_flags |= RTNH_F_LINKDOWN;
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index e808801ab9b8..29118c43ebf5 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -3884,7 +3884,7 @@ static int nh_netdev_event(struct notifier_block *this,
 		nexthop_flush_dev(dev, event);
 		break;
 	case NETDEV_CHANGE:
-		if (!(dev_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP)))
+		if (!(netif_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP)))
 			nexthop_flush_dev(dev, event);
 		break;
 	case NETDEV_CHANGEMTU:
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index c85b1db74b1a..4f1d7d110302 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -6072,7 +6072,7 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev,
 	hdr->ifi_type = dev->type;
 	ifindex = READ_ONCE(dev->ifindex);
 	hdr->ifi_index = ifindex;
-	hdr->ifi_flags = dev_get_flags(dev);
+	hdr->ifi_flags = netif_get_flags(dev);
 	hdr->ifi_change = 0;
 
 	iflink = dev_get_iflink(dev);
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 47d7dfd9ad09..25c88cba5c48 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -706,7 +706,7 @@ static int mpls_nh_assign_dev(struct net *net, struct mpls_route *rt,
 	} else {
 		unsigned int flags;
 
-		flags = dev_get_flags(dev);
+		flags = netif_get_flags(dev);
 		if (!(flags & (IFF_RUNNING | IFF_LOWER_UP)))
 			nh->nh_flags |= RTNH_F_LINKDOWN;
 	}
@@ -1616,14 +1616,14 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
 			return notifier_from_errno(err);
 		break;
 	case NETDEV_UP:
-		flags = dev_get_flags(dev);
+		flags = netif_get_flags(dev);
 		if (flags & (IFF_RUNNING | IFF_LOWER_UP))
 			mpls_ifup(dev, RTNH_F_DEAD | RTNH_F_LINKDOWN);
 		else
 			mpls_ifup(dev, RTNH_F_DEAD);
 		break;
 	case NETDEV_CHANGE:
-		flags = dev_get_flags(dev);
+		flags = netif_get_flags(dev);
 		if (flags & (IFF_RUNNING | IFF_LOWER_UP)) {
 			mpls_ifup(dev, RTNH_F_DEAD | RTNH_F_LINKDOWN);
 		} else {
diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c
index bea70eb6f034..c32a7c6903d5 100644
--- a/net/wireless/wext-core.c
+++ b/net/wireless/wext-core.c
@@ -431,7 +431,7 @@ static struct nlmsghdr *rtnetlink_ifinfo_prep(struct net_device *dev,
 	r->__ifi_pad = 0;
 	r->ifi_type = dev->type;
 	r->ifi_index = dev->ifindex;
-	r->ifi_flags = dev_get_flags(dev);
+	r->ifi_flags = netif_get_flags(dev);
 	r->ifi_change = 0;	/* Wireless changes don't affect those flags */
 
 	if (nla_put_string(skb, IFLA_IFNAME, dev->name))
-- 
cgit v1.2.3


From 5d4d84618e1aa2c9531afa3a6323f56e1db4dcf7 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@fomichev.me>
Date: Thu, 17 Jul 2025 10:23:32 -0700
Subject: net: s/dev_set_threaded/netif_set_threaded/

Commit cc34acd577f1 ("docs: net: document new locking reality")
introduced netif_ vs dev_ function semantics: the former expects locked
netdev, the latter takes care of the locking. We don't strictly
follow this semantics on either side, but there are more dev_xxx handlers
now that don't fit. Rename them to netif_xxx where appropriate.

Note that one dev_set_threaded call still remains in mt76 for debugfs file.

Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20250717172333.1288349-7-sdf@fomichev.me
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/net_cachelines/net_device.rst |  2 +-
 drivers/net/ethernet/atheros/atl1c/atl1c_main.c        |  2 +-
 drivers/net/ethernet/mellanox/mlxsw/pci.c              |  2 +-
 drivers/net/ethernet/renesas/ravb_main.c               |  2 +-
 drivers/net/wireguard/device.c                         |  2 +-
 drivers/net/wireless/ath/ath10k/snoc.c                 |  2 +-
 include/linux/netdevice.h                              |  1 +
 net/core/dev.c                                         |  6 +++---
 net/core/dev_api.c                                     | 12 ++++++++++++
 net/core/net-sysfs.c                                   |  2 +-
 10 files changed, 23 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst
index c69cc89c958e..2d3dc4692d20 100644
--- a/Documentation/networking/net_cachelines/net_device.rst
+++ b/Documentation/networking/net_cachelines/net_device.rst
@@ -165,7 +165,7 @@ struct sfp_bus*                     sfp_bus
 struct lock_class_key*              qdisc_tx_busylock
 bool                                proto_down
 unsigned:1                          wol_enabled
-unsigned:1                          threaded                                                            napi_poll(napi_enable,dev_set_threaded)
+unsigned:1                          threaded                                                            napi_poll(napi_enable,netif_set_threaded)
 unsigned_long:1                     see_all_hwtstamp_requests
 unsigned_long:1                     change_proto_down
 unsigned_long:1                     netns_immutable
diff --git a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
index ef1a51347351..3a9ad4a9c1cb 100644
--- a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
+++ b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
@@ -2688,7 +2688,7 @@ static int atl1c_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	adapter->mii.mdio_write = atl1c_mdio_write;
 	adapter->mii.phy_id_mask = 0x1f;
 	adapter->mii.reg_num_mask = MDIO_CTRL_REG_MASK;
-	dev_set_threaded(netdev, true);
+	netif_set_threaded(netdev, true);
 	for (i = 0; i < adapter->rx_queue_count; ++i)
 		netif_napi_add(netdev, &adapter->rrd_ring[i].napi,
 			       atl1c_clean_rx);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c
index 058dcabfaa2e..a2e97b712a3d 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/pci.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c
@@ -156,7 +156,7 @@ static int mlxsw_pci_napi_devs_init(struct mlxsw_pci *mlxsw_pci)
 	}
 	strscpy(mlxsw_pci->napi_dev_rx->name, "mlxsw_rx",
 		sizeof(mlxsw_pci->napi_dev_rx->name));
-	dev_set_threaded(mlxsw_pci->napi_dev_rx, true);
+	netif_set_threaded(mlxsw_pci->napi_dev_rx, true);
 
 	return 0;
 
diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c
index c9f4976a3527..4e79bf88688a 100644
--- a/drivers/net/ethernet/renesas/ravb_main.c
+++ b/drivers/net/ethernet/renesas/ravb_main.c
@@ -3075,7 +3075,7 @@ static int ravb_probe(struct platform_device *pdev)
 	if (info->coalesce_irqs) {
 		netdev_sw_irq_coalesce_default_on(ndev);
 		if (num_present_cpus() == 1)
-			dev_set_threaded(ndev, true);
+			netif_set_threaded(ndev, true);
 	}
 
 	/* Network device register */
diff --git a/drivers/net/wireguard/device.c b/drivers/net/wireguard/device.c
index 4a529f1f9bea..5afec5a865f4 100644
--- a/drivers/net/wireguard/device.c
+++ b/drivers/net/wireguard/device.c
@@ -366,7 +366,7 @@ static int wg_newlink(struct net_device *dev,
 	if (ret < 0)
 		goto err_free_handshake_queue;
 
-	dev_set_threaded(dev, true);
+	netif_set_threaded(dev, true);
 	ret = register_netdevice(dev);
 	if (ret < 0)
 		goto err_uninit_ratelimiter;
diff --git a/drivers/net/wireless/ath/ath10k/snoc.c b/drivers/net/wireless/ath/ath10k/snoc.c
index d51f2e5a79a4..0ee68d3dad12 100644
--- a/drivers/net/wireless/ath/ath10k/snoc.c
+++ b/drivers/net/wireless/ath/ath10k/snoc.c
@@ -936,7 +936,7 @@ static int ath10k_snoc_hif_start(struct ath10k *ar)
 
 	bitmap_clear(ar_snoc->pending_ce_irqs, 0, CE_COUNT_MAX);
 
-	dev_set_threaded(ar->napi_dev, true);
+	netif_set_threaded(ar->napi_dev, true);
 	ath10k_core_napi_enable(ar);
 	/* IRQs are left enabled when we restart due to a firmware crash */
 	if (!test_bit(ATH10K_SNOC_FLAG_RECOVERY, &ar_snoc->flags))
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 8370cd0f8f6b..7929ddfd4433 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -589,6 +589,7 @@ static inline bool napi_complete(struct napi_struct *n)
 	return napi_complete_done(n, 0);
 }
 
+int netif_set_threaded(struct net_device *dev, bool threaded);
 int dev_set_threaded(struct net_device *dev, bool threaded);
 
 void napi_disable(struct napi_struct *n);
diff --git a/net/core/dev.c b/net/core/dev.c
index 25905bbf1972..a22f26997b94 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4798,7 +4798,7 @@ static inline void ____napi_schedule(struct softnet_data *sd,
 
 	if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
 		/* Paired with smp_mb__before_atomic() in
-		 * napi_enable()/dev_set_threaded().
+		 * napi_enable()/netif_set_threaded().
 		 * Use READ_ONCE() to guarantee a complete
 		 * read on napi->thread. Only call
 		 * wake_up_process() when it's not NULL.
@@ -6990,7 +6990,7 @@ int napi_set_threaded(struct napi_struct *napi, bool threaded)
 	return 0;
 }
 
-int dev_set_threaded(struct net_device *dev, bool threaded)
+int netif_set_threaded(struct net_device *dev, bool threaded)
 {
 	struct napi_struct *napi;
 	int err = 0;
@@ -7031,7 +7031,7 @@ int dev_set_threaded(struct net_device *dev, bool threaded)
 
 	return err;
 }
-EXPORT_SYMBOL(dev_set_threaded);
+EXPORT_SYMBOL(netif_set_threaded);
 
 /**
  * netif_queue_set_napi - Associate queue with the napi
diff --git a/net/core/dev_api.c b/net/core/dev_api.c
index 1bf0153195f2..dd7f57013ce5 100644
--- a/net/core/dev_api.c
+++ b/net/core/dev_api.c
@@ -367,3 +367,15 @@ void netdev_state_change(struct net_device *dev)
 	netdev_unlock_ops(dev);
 }
 EXPORT_SYMBOL(netdev_state_change);
+
+int dev_set_threaded(struct net_device *dev, bool threaded)
+{
+	int ret;
+
+	netdev_lock(dev);
+	ret = netif_set_threaded(dev, threaded);
+	netdev_unlock(dev);
+
+	return ret;
+}
+EXPORT_SYMBOL(dev_set_threaded);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index e41ad1890e49..c28cd6665444 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -757,7 +757,7 @@ static int modify_napi_threaded(struct net_device *dev, unsigned long val)
 	if (val != 0 && val != 1)
 		return -EOPNOTSUPP;
 
-	ret = dev_set_threaded(dev, val);
+	ret = netif_set_threaded(dev, val);
 
 	return ret;
 }
-- 
cgit v1.2.3


From 88d3cec28274f9c15355835466c0c694e313680e Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@fomichev.me>
Date: Thu, 17 Jul 2025 10:23:33 -0700
Subject: net: s/dev_close_many/netif_close_many/

Commit cc34acd577f1 ("docs: net: document new locking reality")
introduced netif_ vs dev_ function semantics: the former expects locked
netdev, the latter takes care of the locking. We don't strictly
follow this semantics on either side, but there are more dev_xxx handlers
now that don't fit. Rename them to netif_xxx where appropriate.

netif_close_many is used only by vlan/dsa and one mtk driver, so move it into
NETDEV_INTERNAL namespace.

Signed-off-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20250717172333.1288349-8-sdf@fomichev.me
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mediatek/mtk_eth_soc.c |  3 ++-
 include/linux/netdevice.h                   |  2 +-
 net/8021q/vlan.c                            |  3 ++-
 net/core/dev.c                              | 10 +++++-----
 net/dsa/dsa.c                               |  3 ++-
 net/dsa/user.c                              |  2 +-
 6 files changed, 13 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
index 11ee7e1829bf..5a5fcde76dc0 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -4967,7 +4967,7 @@ void mtk_eth_set_dma_device(struct mtk_eth *eth, struct device *dma_dev)
 		list_add_tail(&dev->close_list, &dev_list);
 	}
 
-	dev_close_many(&dev_list, false);
+	netif_close_many(&dev_list, false);
 
 	eth->dma_dev = dma_dev;
 
@@ -5610,3 +5610,4 @@ module_platform_driver(mtk_driver);
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("John Crispin <blogic@openwrt.org>");
 MODULE_DESCRIPTION("Ethernet driver for MediaTek SoC");
+MODULE_IMPORT_NS("NETDEV_INTERNAL");
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 7929ddfd4433..5aee8d3895f4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3343,7 +3343,7 @@ int netif_open(struct net_device *dev, struct netlink_ext_ack *extack);
 int dev_open(struct net_device *dev, struct netlink_ext_ack *extack);
 void netif_close(struct net_device *dev);
 void dev_close(struct net_device *dev);
-void dev_close_many(struct list_head *head, bool unlink);
+void netif_close_many(struct list_head *head, bool unlink);
 void netif_disable_lro(struct net_device *dev);
 void dev_disable_lro(struct net_device *dev);
 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb);
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 7ffd3386a842..fda3a80e9340 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -470,7 +470,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
 				list_add(&vlandev->close_list, &close_list);
 		}
 
-		dev_close_many(&close_list, false);
+		netif_close_many(&close_list, false);
 
 		list_for_each_entry_safe(vlandev, tmp, &close_list, close_list) {
 			vlan_stacked_transfer_operstate(dev, vlandev,
@@ -765,3 +765,4 @@ module_exit(vlan_cleanup_module);
 MODULE_DESCRIPTION("802.1Q/802.1ad VLAN Protocol");
 MODULE_LICENSE("GPL");
 MODULE_VERSION(DRV_VERSION);
+MODULE_IMPORT_NS("NETDEV_INTERNAL");
diff --git a/net/core/dev.c b/net/core/dev.c
index a22f26997b94..354d3453b407 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1768,7 +1768,7 @@ static void __dev_close(struct net_device *dev)
 	list_del(&single);
 }
 
-void dev_close_many(struct list_head *head, bool unlink)
+void netif_close_many(struct list_head *head, bool unlink)
 {
 	struct net_device *dev, *tmp;
 
@@ -1786,7 +1786,7 @@ void dev_close_many(struct list_head *head, bool unlink)
 			list_del_init(&dev->close_list);
 	}
 }
-EXPORT_SYMBOL(dev_close_many);
+EXPORT_SYMBOL_NS_GPL(netif_close_many, "NETDEV_INTERNAL");
 
 void netif_close(struct net_device *dev)
 {
@@ -1794,7 +1794,7 @@ void netif_close(struct net_device *dev)
 		LIST_HEAD(single);
 
 		list_add(&dev->close_list, &single);
-		dev_close_many(&single, true);
+		netif_close_many(&single, true);
 		list_del(&single);
 	}
 }
@@ -12099,7 +12099,7 @@ void unregister_netdevice_many_notify(struct list_head *head,
 			netdev_lock(dev);
 		}
 	}
-	dev_close_many(&close_head, true);
+	netif_close_many(&close_head, true);
 	/* ... now unlock them and go over the rest. */
 	list_for_each_entry(dev, head, unreg_list) {
 		if (netdev_need_ops_lock(dev))
@@ -12107,7 +12107,7 @@ void unregister_netdevice_many_notify(struct list_head *head,
 		else
 			list_add_tail(&dev->close_list, &close_head);
 	}
-	dev_close_many(&close_head, true);
+	netif_close_many(&close_head, true);
 
 	list_for_each_entry(dev, head, unreg_list) {
 		/* And unlink it from device chain. */
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 436a7e1b412a..5b01a0e43ebe 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -1621,7 +1621,7 @@ void dsa_switch_shutdown(struct dsa_switch *ds)
 	dsa_switch_for_each_cpu_port(dp, ds)
 		list_add(&dp->conduit->close_list, &close_list);
 
-	dev_close_many(&close_list, true);
+	netif_close_many(&close_list, true);
 
 	dsa_switch_for_each_user_port(dp, ds) {
 		conduit = dsa_port_to_conduit(dp);
@@ -1829,3 +1829,4 @@ MODULE_AUTHOR("Lennert Buytenhek <buytenh@wantstofly.org>");
 MODULE_DESCRIPTION("Driver for Distributed Switch Architecture switch chips");
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("platform:dsa");
+MODULE_IMPORT_NS("NETDEV_INTERNAL");
diff --git a/net/dsa/user.c b/net/dsa/user.c
index e9334520c54a..f59d66f0975d 100644
--- a/net/dsa/user.c
+++ b/net/dsa/user.c
@@ -3604,7 +3604,7 @@ static int dsa_user_netdevice_event(struct notifier_block *nb,
 			list_add(&dp->user->close_list, &close_list);
 		}
 
-		dev_close_many(&close_list, true);
+		netif_close_many(&close_list, true);
 
 		return NOTIFY_OK;
 	}
-- 
cgit v1.2.3


From 954c0d74129948eed5c8f4a6898d3d5b344c8b18 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 17 Jul 2025 11:55:41 -0700
Subject: srcu: Add guards for SRCU-fast readers

This adds the usual scoped_guard(srcu_fast, &my_srcu) and
guard(srcu_fast)(&my_srcu).

Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Neeraj Upadhyay (AMD) <neeraj.upadhyay@kernel.org>
---
 include/linux/srcu.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index cf711a0f440b..f179700fecaf 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -481,4 +481,9 @@ DEFINE_LOCK_GUARD_1(srcu, struct srcu_struct,
 		    srcu_read_unlock(_T->lock, _T->idx),
 		    int idx)
 
+DEFINE_LOCK_GUARD_1(srcu_fast, struct srcu_struct,
+		    _T->scp = srcu_read_lock_fast(_T->lock),
+		    srcu_read_unlock_fast(_T->lock, _T->scp),
+		    struct srcu_ctr __percpu *scp)
+
 #endif
-- 
cgit v1.2.3


From d2b5be741a5045272b9d711908eab017632ac022 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Sat, 5 Jul 2025 10:49:58 -0700
Subject: mm/damon/sysfs: use DAMON core API damon_is_running()

DAMON core implements a static function to see if a given DAMON context is
running.  DAMON sysfs interface is implementing the same one on its own.
Make the core function non-static and reuse it from the DAMON sysfs
interface.

Link: https://lkml.kernel.org/r/20250705175000.56259-5-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h |  1 +
 mm/damon/core.c       |  8 +++++++-
 mm/damon/sysfs.c      | 14 ++------------
 3 files changed, 10 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index bb58e36f019e..e1fea3119538 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -934,6 +934,7 @@ static inline unsigned int damon_max_nr_accesses(const struct damon_attrs *attrs
 
 int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive);
 int damon_stop(struct damon_ctx **ctxs, int nr_ctxs);
+bool damon_is_running(struct damon_ctx *ctx);
 
 int damon_call(struct damon_ctx *ctx, struct damon_call_control *control);
 int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control);
diff --git a/mm/damon/core.c b/mm/damon/core.c
index e8036254cc98..c66583869e95 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -1311,7 +1311,13 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs)
 	return err;
 }
 
-static bool damon_is_running(struct damon_ctx *ctx)
+/**
+ * damon_is_running() - Returns if a given DAMON context is running.
+ * @ctx:	The DAMON context to see if running.
+ *
+ * Return: true if @ctx is running, false otherwise.
+ */
+bool damon_is_running(struct damon_ctx *ctx)
 {
 	bool running;
 
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index 1b1476b79cdb..79d65dcc9dd0 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -1189,16 +1189,6 @@ static void damon_sysfs_kdamond_rm_dirs(struct damon_sysfs_kdamond *kdamond)
 	kobject_put(&kdamond->contexts->kobj);
 }
 
-static bool damon_sysfs_ctx_running(struct damon_ctx *ctx)
-{
-	bool running;
-
-	mutex_lock(&ctx->kdamond_lock);
-	running = ctx->kdamond != NULL;
-	mutex_unlock(&ctx->kdamond_lock);
-	return running;
-}
-
 /*
  * enum damon_sysfs_cmd - Commands for a specific kdamond.
  */
@@ -1275,7 +1265,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
 	if (!ctx)
 		running = false;
 	else
-		running = damon_sysfs_ctx_running(ctx);
+		running = damon_is_running(ctx);
 
 	return sysfs_emit(buf, "%s\n", running ?
 			damon_sysfs_cmd_strs[DAMON_SYSFS_CMD_ON] :
@@ -1429,7 +1419,7 @@ static inline bool damon_sysfs_kdamond_running(
 		struct damon_sysfs_kdamond *kdamond)
 {
 	return kdamond->damon_ctx &&
-		damon_sysfs_ctx_running(kdamond->damon_ctx);
+		damon_is_running(kdamond->damon_ctx);
 }
 
 static int damon_sysfs_apply_inputs(struct damon_ctx *ctx,
-- 
cgit v1.2.3


From fdc5001b002eaca9989b5f3563245662fd1e4d40 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Wed, 4 Jun 2025 12:51:11 +0300
Subject: mm/vmstat: make MEMCG select VM_EVENT_COUNTERS

The vmstat_text array contains labels for counters displayed in
/proc/vmstat.  It is important to keep the labels in sync with the
counters.

There is a BUILD_BUG_ON() check in vmstat_start() that ensures the size of
the vmstat_text is not smaller than VM_EVENT_COUNTERS.  This helps to
catch cases where a new counter is added but the label is not.  However,
it does not help if a counter is removed but the label remains.

It would be nice to make the BUILD_BUG_ON() check more strict to catch
such cases.  However, when compiling with MEMCG enabled but
VM_EVENT_COUNTERS disabled, the vmstat_text array is larger than
NR_VMSTAT_ITEMS.

This issue arises because some elements of the vmstat_text array are
present when either MEMCG or VM_EVENT_COUNTERS is enabled, but
NR_VMSTAT_ITEMS only accounts for these elements if VM_EVENT_COUNTERS is
enabled.

Instead of adjusting the NR_VMSTAT_ITEMS definition to account for MEMCG,
make MEMCG select VM_EVENT_COUNTERS.  VM_EVENT_COUNTERS is enabled in most
configurations anyway.

Link: https://lkml.kernel.org/r/20250604095111.533783-1-kirill.shutemov@linux.intel.com
Fixes: ebc5d83d0443 ("mm/memcontrol: use vmstat names for printing statistics")
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reported-by: Randy Dunlap <rdunlap@infradead.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/vmstat.h | 4 ++--
 init/Kconfig           | 1 +
 mm/vmstat.c            | 4 ++--
 3 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index b2ccb6845595..c287998908bf 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -507,7 +507,7 @@ static inline const char *lru_list_name(enum lru_list lru)
 	return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_"
 }
 
-#if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
+#if defined(CONFIG_VM_EVENT_COUNTERS)
 static inline const char *vm_event_name(enum vm_event_item item)
 {
 	return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
@@ -516,7 +516,7 @@ static inline const char *vm_event_name(enum vm_event_item item)
 			   NR_VM_STAT_ITEMS +
 			   item];
 }
-#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
+#endif /* CONFIG_VM_EVENT_COUNTERS */
 
 #ifdef CONFIG_MEMCG
 
diff --git a/init/Kconfig b/init/Kconfig
index 666783eb50ab..e9b88d527aeb 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -992,6 +992,7 @@ config MEMCG
 	select PAGE_COUNTER
 	select EVENTFD
 	select SLAB_OBJ_EXT
+	select VM_EVENT_COUNTERS
 	help
 	  Provides control over the memory footprint of tasks in a cgroup.
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index a78d70ddeacd..01d76216d65a 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1287,7 +1287,7 @@ const char * const vmstat_text[] = {
 	"nr_memmap_pages",
 	"nr_memmap_boot_pages",
 
-#if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
+#if defined(CONFIG_VM_EVENT_COUNTERS)
 	/* enum vm_event_item counters */
 	"pgpgin",
 	"pgpgout",
@@ -1475,7 +1475,7 @@ const char * const vmstat_text[] = {
 	"kstack_rest",
 #endif
 #endif
-#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
+#endif /* CONFIG_VM_EVENT_COUNTERS */
 };
 #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
 
-- 
cgit v1.2.3


From 8356a5a3b078ca89c526dd6d71e9a76fec571c37 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Wed, 25 Jun 2025 17:51:52 +0200
Subject: mm, vmstat: remove the NR_WRITEBACK_TEMP node_stat_item counter

The only user of the counter (FUSE) was removed in commit 0c58a97f919c
("fuse: remove tmp folio for writebacks and internal rb tree") so follow
the established pattern of removing the counter and hardcoding 0 in
meminfo output, as done recently with NR_BOUNCE.  Update documentation for
procfs, including for the value for Bounce that was missed when removing
its counter.

Also remove the mention of NR_WRITEBACK_TEMP implications from a comment
in wb_position_ratio(). The rest of the comment there about fuse setting
bdi->max_ratio to 1% is still correct.

[vbabka@suse.cz: v2]
  Link: https://lkml.kernel.org/r/5a848e15-6a57-4ecb-a015-d4f358b8a5d3@suse.cz
Link: https://lkml.kernel.org/r/20250625-nr_writeback_removal-v1-1-7f2a0df70faa@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Jeffle Xu <jefflexu@linux.alibaba.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Joanne Koong <joannelkoong@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Maxim Patlasov <mpatlasov@parallels.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Miklos Szeredi <mszeredi@redhat.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Zach O'Keefe <zokeefe@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/filesystems/proc.rst | 8 +++++---
 drivers/base/node.c                | 2 +-
 fs/proc/meminfo.c                  | 3 +--
 include/linux/mmzone.h             | 1 -
 mm/page-writeback.c                | 4 +---
 mm/show_mem.c                      | 2 --
 mm/vmstat.c                        | 1 -
 7 files changed, 8 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index 5236cb52e357..2971551b7235 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -1196,12 +1196,14 @@ SecPageTables
               Memory consumed by secondary page tables, this currently includes
               KVM mmu and IOMMU allocations on x86 and arm64.
 NFS_Unstable
-              Always zero. Previous counted pages which had been written to
+              Always zero. Previously counted pages which had been written to
               the server, but has not been committed to stable storage.
 Bounce
-              Memory used for block device "bounce buffers"
+              Always zero. Previously memory used for block device
+              "bounce buffers".
 WritebackTmp
-              Memory used by FUSE for temporary writeback buffers
+              Always zero. Previously memory used by FUSE for temporary
+              writeback buffers.
 CommitLimit
               Based on the overcommit ratio ('vm.overcommit_ratio'),
               this is the total amount of  memory currently available to
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 6d66382dae65..e434cb260e61 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -500,7 +500,7 @@ static ssize_t node_read_meminfo(struct device *dev,
 			     nid, K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
 			     nid, 0UL,
 			     nid, 0UL,
-			     nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
+			     nid, 0UL,
 			     nid, K(sreclaimable +
 				    node_page_state(pgdat, NR_KERNEL_MISC_RECLAIMABLE)),
 			     nid, K(sreclaimable + sunreclaimable),
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index bc2bc60c36cc..a458f1e112fd 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -121,8 +121,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 
 	show_val_kb(m, "NFS_Unstable:   ", 0);
 	show_val_kb(m, "Bounce:         ", 0);
-	show_val_kb(m, "WritebackTmp:   ",
-		    global_node_page_state(NR_WRITEBACK_TEMP));
+	show_val_kb(m, "WritebackTmp:   ", 0);
 	show_val_kb(m, "CommitLimit:    ", vm_commit_limit());
 	show_val_kb(m, "Committed_AS:   ", committed);
 	seq_printf(m, "VmallocTotal:   %8lu kB\n",
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 1d1bb2b7f40d..0c5da9141983 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -206,7 +206,6 @@ enum node_stat_item {
 	NR_FILE_PAGES,
 	NR_FILE_DIRTY,
 	NR_WRITEBACK,
-	NR_WRITEBACK_TEMP,	/* Writeback using temporary buffers */
 	NR_SHMEM,		/* shmem pages (included tmpfs/GEM pages) */
 	NR_SHMEM_THPS,
 	NR_SHMEM_PMDMAPPED,
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 72b0ff0d4bae..3e248d1c3969 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1101,9 +1101,7 @@ static void wb_position_ratio(struct dirty_throttle_control *dtc)
 	 * such filesystems balance_dirty_pages always checks wb counters
 	 * against wb limits. Even if global "nr_dirty" is under "freerun".
 	 * This is especially important for fuse which sets bdi->max_ratio to
-	 * 1% by default. Without strictlimit feature, fuse writeback may
-	 * consume arbitrary amount of RAM because it is accounted in
-	 * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty".
+	 * 1% by default.
 	 *
 	 * Here, in wb_position_ratio(), we calculate pos_ratio based on
 	 * two values: wb_dirty and wb_thresh. Let's consider an example:
diff --git a/mm/show_mem.c b/mm/show_mem.c
index 0cf8bf5d832d..41999e94a56d 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -246,7 +246,6 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
 			" shmem_pmdmapped:%lukB"
 			" anon_thp:%lukB"
 #endif
-			" writeback_tmp:%lukB"
 			" kernel_stack:%lukB"
 #ifdef CONFIG_SHADOW_CALL_STACK
 			" shadow_call_stack:%lukB"
@@ -273,7 +272,6 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
 			K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)),
 			K(node_page_state(pgdat, NR_ANON_THPS)),
 #endif
-			K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
 			node_page_state(pgdat, NR_KERNEL_STACK_KB),
 #ifdef CONFIG_SHADOW_CALL_STACK
 			node_page_state(pgdat, NR_KERNEL_SCS_KB),
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c250eeba0f16..71cd1ceba191 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1251,7 +1251,6 @@ const char * const vmstat_text[] = {
 	[I(NR_FILE_PAGES)]			= "nr_file_pages",
 	[I(NR_FILE_DIRTY)]			= "nr_dirty",
 	[I(NR_WRITEBACK)]			= "nr_writeback",
-	[I(NR_WRITEBACK_TEMP)]			= "nr_writeback_temp",
 	[I(NR_SHMEM)]				= "nr_shmem",
 	[I(NR_SHMEM_THPS)]			= "nr_shmem_hugepages",
 	[I(NR_SHMEM_PMDMAPPED)]			= "nr_shmem_pmdmapped",
-- 
cgit v1.2.3


From a2c24eae5a15f79673eba2913d87d658a04830cf Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Tue, 8 Jul 2025 19:59:31 -0500
Subject: mm/damon: add struct damos_migrate_dests

Patch series "mm/damon/vaddr: Allow interleaving in migrate_{hot,cold}
actions", v4.

A recent patchset automatically sets the interleave weight for each node
according to the node's maximum bandwidth [1].  In another thread, the
patch set's author, Joshua Hahn, wondered if/how these weights should be
changed if the bandwidth utilization of the system changes [2].

This patch set adds the mechanism for dynamically changing how application
data is interleaved across nodes while leaving the policy of what the
interleave weights should be to userspace.  It does this by having the
migrate_{hot,cold} operating schemes interleave application data according
to the list of migration nodes and weights passed in via the DAMON sysfs
interface.  This functionality can be used to dynamically adjust how
folios are interleaved by having a userspace process adjust those weights.
If no specific destination nodes or weights are provided, the
migrate_{hot,cold} actions will only migrate folios to damos->target_nid
as before.

The algorithm used to interleave the folios is similar to the one used for
the weighted interleave mempolicy [3].  It uses the offset from which a
folio is mapped into a VMA to determine the node the folio should be
placed in.  This method is convenient because for a given set of
interleave weights, a folio has only one valid node it can be placed in,
limiting the amount of unnecessary data movement.  However, finding out how
a folio is mapped inside of a VMA requires a costly rmap walk when using a
paddr scheme.  As such, we have decided that this functionality makes more
sense as a vaddr scheme [4].  To this end, this patch set also adds vaddr
versions of the migrate_{hot,cold}.

Motivation
==========
There have been prior discussions about how changing the interleave
weights in response to the system's bandwidth utilization can be
beneficial [2].  However, currently the interleave weights only are
applied when data is allocated.  Migrating already allocated pages
according to the dynamically changing weights will better help balance the
bandwidth utilization across nodes.

As a toy example, imagine some application that uses 75% of the local
bandwidth.  Assuming sufficient capacity, when running alone, we want to
keep that application's data in local memory.  However, if a second
instance of that application begins, using the same amount of bandwidth,
it would be best to interleave the data of both processes to alleviate the
bandwidth pressure from the local node.  Likewise, when one of the
processes ends, the data should be moved back to local memory.

We imagine there would be a userspace application that would monitor
system performance characteristics, such as bandwidth utilization or
memory access latency, and uses that information to tune the interleave
weights.  Others seem to have come to a similar conclusion in previous
discussions [5].  We are currently working on a userspace program that
does this, but it is not quite ready to be published yet.

After the userspace application tunes the interleave weights, there must
be some mechanism that actually migrates pages to be consistent with those
weights.  This patchset is what provides this mechanism.

We believe DAMON is the correct venue for the interleaving mechanism for a
few reasons.  First, we noticed that we don't have to migrate all of the
application's pages to improve performance.  We just need to migrate the
frequently accessed pages.  DAMON's existing hotness tracking is very
useful for this.  Second, DAMON's quota system can be used to ensure we
are not using too much bandwidth for migrations.  Finally, as Ying pointed
out [6], a complete solution must also handle when a memory node is at
capacity.  The existing migrate_cold action can be used in conjunction
with the functionality added in this patch set to provide that complete
solution.

Functionality Test
==================
Below is an example of this new functionality in use to confirm that these
patches behave as intended.

In this example, the user starts an application, alloc_data, which
allocates 1GB using the default memory policy (i.e.  allocate to local
memory) then sleeps.  Afterwards, we start DAMON to interleave the data at
a 1:1 ratio.  Using numastat, we show that DAMON has migrated the
application's data to match the new interleave ratio.

For this example, I modified the userspace damo tool [8] to write to the
migration_dest sysfs files.  I plan to upstream these changes when these
patches are merged.

  $ # Allocate the data initially
  $ ./alloc_data 1G &
  [1] 6587
  $ numastat -c -p alloc_data

  Per-node process memory usage (in MBs) for PID 6587 (alloc_data)
           Node 0 Node 1 Total
           ------ ------ -----
  Huge          0      0     0
  Heap          0      0     0
  Stack         0      0     0
  Private    1027      0  1027
  -------  ------ ------ -----
  Total      1027      0  1027
  $ # Start DAMON to interleave data at a 1:1 ratio
  $ cat ./interleave_vaddr.yaml
  kdamonds:
  - contexts:
    - ops: vaddr
      addr_unit: null
      targets:
      - pid: 6587
        regions: []
      intervals:
        sample_us: 500 ms
        aggr_us: 5 s
        ops_update_us: 20 s
        intervals_goal:
          access_bp: 0 %
          aggrs: '0'
          min_sample_us: 0 ns
          max_sample_us: 0 ns
      nr_regions:
        min: '20'
        max: '50'
      schemes:
      - action: migrate_hot
        dests:
        - nid: 0
          weight: 1
        - nid: 1
          weight: 1
        access_pattern:
          sz_bytes:
            min: 0 B
            max: max
          nr_accesses:
            min: 0 %
            max: 100 %
          age:
            min: 0 ns
            max: max
  $ sudo ./damo/damo interleave_vaddr.yaml
  $ # Verify that DAMON has migrated data to match the 1:1 ratio
  $ numastat -c -p alloc_data

  Per-node process memory usage (in MBs) for PID 6587 (alloc_data)
           Node 0 Node 1 Total
           ------ ------ -----
  Huge          0      0     0
  Heap          0      0     0
  Stack         0      0     0
  Private     514    514  1027
  -------  ------ ------ -----
  Total       514    514  1027

Performance Test
================
Below is a simple example showing that interleaving application data using
these patches can improve application performance.  To do this, we run a
bandwidth intensive embedding reduction application [7].  This workload is
useful for this test because it reports the time it takes each iteration
to run and each iteration reuses the same allocation, allowing us to see
the benefits of the migration.

We evaluate this on a 128 core/256 thread AMD CPU with 72GB/s of local DDR
bandwidth and 26 GB/s of CXL bandwidth.

Before we start the workload, the system bandwidth utilization is low, so
we start with the interleave weights of 1:0, i.e.  allocating all data to
local memory.  When the workload begins, it saturates the local bandwidth,
making the page placement suboptimal.  To alleviate this, we modify the
interleave weights, triggering DAMON to migrate the workload's data.

We use the same interleave_vaddr.yaml file to setup DAMON, except we
configure it to begin with a 1:0 interleave ratio, and attach it to the
shell and its children processes.

  $ sudo ./damo/damo start interleave_vaddr.yaml --include_child_tasks &
  $ <path>/eval_baseline -d amazon_All -c 255 -r 100
  <clip startup output>
  Eval Phase 3: Running Baseline...

  REPEAT # 0 Baseline Total time : 7323.54 ms
  REPEAT # 1 Baseline Total time : 7624.56 ms
  REPEAT # 2 Baseline Total time : 7619.61 ms
  REPEAT # 3 Baseline Total time : 7617.12 ms
  REPEAT # 4 Baseline Total time : 7638.64 ms
  REPEAT # 5 Baseline Total time : 7611.27 ms
  REPEAT # 6 Baseline Total time : 7629.32 ms
  REPEAT # 7 Baseline Total time : 7695.63 ms
  # Interleave weights set to 3:1
  REPEAT # 8 Baseline Total time : 7077.5 ms
  REPEAT # 9 Baseline Total time : 5633.23 ms
  REPEAT # 10 Baseline Total time : 5644.6 ms
  REPEAT # 11 Baseline Total time : 5627.66 ms
  REPEAT # 12 Baseline Total time : 5629.76 ms
  REPEAT # 13 Baseline Total time : 5633.05 ms
  REPEAT # 14 Baseline Total time : 5641.24 ms
  REPEAT # 15 Baseline Total time : 5631.18 ms
  REPEAT # 16 Baseline Total time : 5631.33 ms

Updating the interleave weights and having DAMON migrate the workload data
according to the weights resulted in an approximately 25% speedup.

Patches Sequence
================
Patches 1-7 extend the DAMON API to specify multiple destination nodes and
weights for the migrate_{hot,cold} actions.  These patches are from SJ's
RFC [8].

Patches 8-10 add a vaddr implementation of the migrate_{hot,cold} schemes.

Patch 11 modifies the vaddr migrate_{hot,cold} schemes to interleave data
according to the weights provided by damos->migrate_dest.

Patches 12-13 allow the vaddr migrate_{hot,cold} implementation to filter
out folios like the paddr version.


This patch (of 13):

Introduce a new struct, namely damos_migrate_dests, for specifying
multiple DAMOS' migration destination nodes and their weights.

Link: https://lkml.kernel.org/r/20250709005952.17776-1-bijan311@gmail.com
Link: https://lkml.kernel.org/r/20250709005952.17776-2-bijan311@gmail.com
Link: https://lore.kernel.org/linux-mm/20250520141236.2987309-1-joshua.hahnjy@gmail.com/ [1]
Link: https://lore.kernel.org/linux-mm/20250313155705.1943522-1-joshua.hahnjy@gmail.com/ [2]
Link: https://elixir.bootlin.com/linux/v6.15.4/source/mm/mempolicy.c#L2015 [3]
Link: https://lore.kernel.org/damon/20250624223310.55786-1-sj@kernel.org/ [4]
Link: https://lore.kernel.org/linux-mm/20250314151137.892379-1-joshua.hahnjy@gmail.com/ [5]
Link: https://lore.kernel.org/linux-mm/87frjfx6u4.fsf@DESKTOP-5N7EMDA/ [6]
Link: https://github.com/SNU-ARC/MERCI [7]
Link: https://lore.kernel.org/damon/20250702051558.54138-1-sj@kernel.org/ [8]
Signed-off-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Bijan Tabatabai <bijantabatab@micron.com>
Reviewed-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Ravi Shankar Jonnalagadda <ravis.opensrc@micron.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index e1fea3119538..07cee590ff09 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -447,6 +447,22 @@ struct damos_access_pattern {
 	unsigned int max_age_region;
 };
 
+/**
+ * struct damos_migrate_dests - Migration destination nodes and their weights.
+ * @node_id_arr:	Array of migration destination node ids.
+ * @weight_arr:		Array of migration weights for @node_id_arr.
+ * @nr_dests:		Length of the @node_id_arr and @weight_arr arrays.
+ *
+ * @node_id_arr is an array of the ids of migration destination nodes.
+ * @weight_arr is an array of the weights for those.  The weights in
+ * @weight_arr are for nodes in @node_id_arr of same array index.
+ */
+struct damos_migrate_dests {
+	unsigned int *node_id_arr;
+	unsigned int *weight_arr;
+	size_t nr_dests;
+};
+
 /**
  * struct damos - Represents a Data Access Monitoring-based Operation Scheme.
  * @pattern:		Access pattern of target regions.
-- 
cgit v1.2.3


From aabc85ee33c883243f2c506a5d88963f2456faa6 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Tue, 8 Jul 2025 19:59:32 -0500
Subject: mm/damon/core: add damos->migrate_dests field

Add a new field to 'struct damos', namely migrate_dests, to allow DAMON
API callers to specify multiple migration destination nodes and their
weights.  Also update 'struct damos' creation and destruction functions
accordingly to initialize the new field and free up the API
caller-allocated buffers on those, respectively.

Link: https://lkml.kernel.org/r/20250709005952.17776-3-bijan311@gmail.com
Signed-off-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Bijan Tabatabai <bijantabatab@micron.com>
Reviewed-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Ravi Shankar Jonnalagadda <ravis.opensrc@micron.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 13 ++++++++++---
 mm/damon/core.c       |  4 ++++
 2 files changed, 14 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 07cee590ff09..1f425d830bb9 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -470,6 +470,7 @@ struct damos_migrate_dests {
  * @apply_interval_us:	The time between applying the @action.
  * @quota:		Control the aggressiveness of this scheme.
  * @wmarks:		Watermarks for automated (in)activation of this scheme.
+ * @migrate_dests:	Destination nodes if @action is "migrate_{hot,cold}".
  * @target_nid:		Destination node if @action is "migrate_{hot,cold}".
  * @filters:		Additional set of &struct damos_filter for &action.
  * @ops_filters:	ops layer handling &struct damos_filter objects list.
@@ -488,9 +489,12 @@ struct damos_migrate_dests {
  * monitoring context are inactive, DAMON stops monitoring either, and just
  * repeatedly checks the watermarks.
  *
+ * @migrate_dests specifies multiple migration target nodes with different
+ * weights for migrate_hot or migrate_cold actions.  @target_nid is ignored if
+ * this is set.
+ *
  * @target_nid is used to set the migration target node for migrate_hot or
- * migrate_cold actions, which means it's only meaningful when @action is either
- * "migrate_hot" or "migrate_cold".
+ * migrate_cold actions, and @migrate_dests is unset.
  *
  * Before applying the &action to a memory region, &struct damon_operations
  * implementation could check pages of the region and skip &action to respect
@@ -533,7 +537,10 @@ struct damos {
 	struct damos_quota quota;
 	struct damos_watermarks wmarks;
 	union {
-		int target_nid;
+		struct {
+			int target_nid;
+			struct damos_migrate_dests migrate_dests;
+		};
 	};
 	struct list_head filters;
 	struct list_head ops_filters;
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 04e01e08253a..6c8170d4f695 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -407,6 +407,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
 	scheme->wmarks = *wmarks;
 	scheme->wmarks.activated = true;
 
+	scheme->migrate_dests = (struct damos_migrate_dests){};
 	scheme->target_nid = target_nid;
 
 	return scheme;
@@ -449,6 +450,9 @@ void damon_destroy_scheme(struct damos *s)
 
 	damos_for_each_filter_safe(f, next, s)
 		damos_destroy_filter(f);
+
+	kfree(s->migrate_dests.node_id_arr);
+	kfree(s->migrate_dests.weight_arr);
 	damon_del_scheme(s);
 	damon_free_scheme(s);
 }
-- 
cgit v1.2.3


From e05d3a6014fd4ae224b44b89fbeacfaa4ace0a8e Mon Sep 17 00:00:00 2001
From: "Vishal Moola (Oracle)" <vishal.moola@gmail.com>
Date: Wed, 9 Jul 2025 12:40:18 -0700
Subject: mm: remove unmap_and_put_page()

There are no callers of unmap_and_put_page() left. Remove it.

Link: https://lkml.kernel.org/r/20250709194017.927978-6-vishal.moola@gmail.com
Signed-off-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Jordan Rome <linux@jordanrome.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/highmem.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index a30526cc53a7..6234f316468c 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -682,10 +682,4 @@ static inline void folio_release_kmap(struct folio *folio, void *addr)
 	kunmap_local(addr);
 	folio_put(folio);
 }
-
-static inline void unmap_and_put_page(struct page *page, void *addr)
-{
-	folio_release_kmap(page_folio(page), addr);
-}
-
 #endif /* _LINUX_HIGHMEM_H */
-- 
cgit v1.2.3


From b980077899ea49cc747afe003e01ca303b00d463 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Mon, 23 Jun 2025 11:58:51 -0700
Subject: mm: introduce per-node proactive reclaim interface

This adds support for allowing proactive reclaim in general on a NUMA
system.  A per-node interface extends support beyond a memcg-specific
interface, respecting the current semantics of memory.reclaim: respecting
aging LRU and not supporting artificially triggering eviction on nodes
belonging to non-bottom tiers.

This patch allows userspace to do:

     echo "512M swappiness=10" > /sys/devices/system/node/nodeX/reclaim

One of the premises for this is to semantically align as best as possible
with memory.reclaim.  During a brief time memcg did support nodemask until
55ab834a86a9 (Revert "mm: add nodes= arg to memory.reclaim"), for which
semantics around reclaim (eviction) vs demotion were not clear, rendering
charging expectations to be broken.

With this approach:

1. Users who do not use memcg can benefit from proactive reclaim.  The
   memcg interface is not NUMA aware and there are usecases that are
   focusing on NUMA balancing rather than workload memory footprint.

2. Proactive reclaim on top tiers will trigger demotion, for which
   memory is still byte-addressable.  Reclaiming on the bottom nodes will
   trigger evicting to swap (the traditional sense of reclaim).  This
   follows the semantics of what is today part of the aging process on
   tiered memory, mirroring what every other form of reclaim does
   (reactive and memcg proactive reclaim).  Furthermore per-node proactive
   reclaim is not as susceptible to the memcg charging problem mentioned
   above.

3. Unlike the nodes= arg, this interface avoids confusing semantics,
   such as what exactly the user wants when mixing top-tier and low-tier
   nodes in the nodemask.  Further per-node interface is less exposed to
   "free up memory in my container" usecases, where eviction is intended.

4. Users that *really* want to free up memory can use proactive
   reclaim on nodes known to be on the bottom tiers to force eviction
   in a natural way - higher access latencies are still better than swap.
   If compelled, while no guarantees and perhaps not worth the effort,
   users could also potentially follow a ladder-like approach to
   eventually free up the memory.  Alternatively, perhaps an 'evict'
   option could be added to the parameters for both memory.reclaim and
   per-node interfaces to force this action unconditionally.

[akpm@linux-foundation.org: user_proactive_reclaim(): return -EBUSY on PGDAT_RECLAIM_LOCKED contention, per Roman]
[dave@stgolabs.net: memcg && node is also a bogus case, per Shakeel]
  Link: https://lkml.kernel.org/r/20250717235604.2atyx2aobwowpge3@offworld
Link: https://lkml.kernel.org/r/20250623185851.830632-5-dave@stgolabs.net
Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/ABI/stable/sysfs-devices-node |  9 +++++
 drivers/base/node.c                         |  2 ++
 include/linux/swap.h                        | 16 +++++++++
 mm/vmscan.c                                 | 55 +++++++++++++++++++++++++----
 4 files changed, 75 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node
index a02707cb7cbc..2d0e023f22a7 100644
--- a/Documentation/ABI/stable/sysfs-devices-node
+++ b/Documentation/ABI/stable/sysfs-devices-node
@@ -227,3 +227,12 @@ Contact:	Jiaqi Yan <jiaqiyan@google.com>
 Description:
 		Of the raw poisoned pages on a NUMA node, how many pages are
 		recovered by memory error recovery attempt.
+
+What:		/sys/devices/system/node/nodeX/reclaim
+Date:		June 2025
+Contact:	Linux Memory Management list <linux-mm@kvack.org>
+Description:
+		Perform user-triggered proactive reclaim on a NUMA node.
+		This interface is equivalent to the memcg variant.
+
+		See Documentation/admin-guide/cgroup-v2.rst
diff --git a/drivers/base/node.c b/drivers/base/node.c
index e434cb260e61..bef84f01712f 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -659,6 +659,7 @@ static int register_node(struct node *node, int num)
 	} else {
 		hugetlb_register_node(node);
 		compaction_register_node(node);
+		reclaim_register_node(node);
 	}
 
 	return error;
@@ -675,6 +676,7 @@ void unregister_node(struct node *node)
 {
 	hugetlb_unregister_node(node);
 	compaction_unregister_node(node);
+	reclaim_unregister_node(node);
 	node_remove_accesses(node);
 	node_remove_caches(node);
 	device_unregister(&node->dev);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index a49be950c485..95c6061fa1dc 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -431,6 +431,22 @@ extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
 long remove_mapping(struct address_space *mapping, struct folio *folio);
 
+#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
+extern int reclaim_register_node(struct node *node);
+extern void reclaim_unregister_node(struct node *node);
+
+#else
+
+static inline int reclaim_register_node(struct node *node)
+{
+	return 0;
+}
+
+static inline void reclaim_unregister_node(struct node *node)
+{
+}
+#endif /* CONFIG_SYSFS && CONFIG_NUMA */
+
 #ifdef CONFIG_NUMA
 extern int sysctl_min_unmapped_ratio;
 extern int sysctl_min_slab_ratio;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d165b66da796..19bfce93b373 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -94,10 +94,8 @@ struct scan_control {
 	unsigned long	anon_cost;
 	unsigned long	file_cost;
 
-#ifdef CONFIG_MEMCG
 	/* Swappiness value for proactive reclaim. Always use sc_swappiness()! */
 	int *proactive_swappiness;
-#endif
 
 	/* Can active folios be deactivated as part of reclaim? */
 #define DEACTIVATE_ANON 1
@@ -121,7 +119,7 @@ struct scan_control {
 	/* Has cache_trim_mode failed at least once? */
 	unsigned int cache_trim_mode_failed:1;
 
-	/* Proactive reclaim invoked by userspace through memory.reclaim */
+	/* Proactive reclaim invoked by userspace */
 	unsigned int proactive:1;
 
 	/*
@@ -7732,15 +7730,17 @@ static const match_table_t tokens = {
 	{ MEMORY_RECLAIM_NULL, NULL },
 };
 
-int user_proactive_reclaim(char *buf, struct mem_cgroup *memcg, pg_data_t *pgdat)
+int user_proactive_reclaim(char *buf,
+			   struct mem_cgroup *memcg, pg_data_t *pgdat)
 {
 	unsigned int nr_retries = MAX_RECLAIM_RETRIES;
 	unsigned long nr_to_reclaim, nr_reclaimed = 0;
 	int swappiness = -1;
 	char *old_buf, *start;
 	substring_t args[MAX_OPT_ARGS];
+	gfp_t gfp_mask = GFP_KERNEL;
 
-	if (!buf || (!memcg && !pgdat))
+	if (!buf || (!memcg && !pgdat) || (memcg && pgdat))
 		return -EINVAL;
 
 	buf = strstrip(buf);
@@ -7792,11 +7792,29 @@ int user_proactive_reclaim(char *buf, struct mem_cgroup *memcg, pg_data_t *pgdat
 			reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
 					  MEMCG_RECLAIM_PROACTIVE;
 			reclaimed = try_to_free_mem_cgroup_pages(memcg,
-						 batch_size, GFP_KERNEL,
+						 batch_size, gfp_mask,
 						 reclaim_options,
 						 swappiness == -1 ? NULL : &swappiness);
 		} else {
-			return -EINVAL;
+			struct scan_control sc = {
+				.gfp_mask = current_gfp_context(gfp_mask),
+				.reclaim_idx = gfp_zone(gfp_mask),
+				.proactive_swappiness = swappiness == -1 ? NULL : &swappiness,
+				.priority = DEF_PRIORITY,
+				.may_writepage = !laptop_mode,
+				.nr_to_reclaim = max(batch_size, SWAP_CLUSTER_MAX),
+				.may_unmap = 1,
+				.may_swap = 1,
+				.proactive = 1,
+			};
+
+			if (test_and_set_bit_lock(PGDAT_RECLAIM_LOCKED,
+						  &pgdat->flags))
+				return -EBUSY;
+
+			reclaimed = __node_reclaim(pgdat, gfp_mask,
+						   batch_size, &sc);
+			clear_bit_unlock(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
 		}
 
 		if (!reclaimed && !nr_retries--)
@@ -7855,3 +7873,26 @@ void check_move_unevictable_folios(struct folio_batch *fbatch)
 	}
 }
 EXPORT_SYMBOL_GPL(check_move_unevictable_folios);
+
+#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
+static ssize_t reclaim_store(struct device *dev,
+			     struct device_attribute *attr,
+			     const char *buf, size_t count)
+{
+	int ret, nid = dev->id;
+
+	ret = user_proactive_reclaim((char *)buf, NULL, NODE_DATA(nid));
+	return ret ? -EAGAIN : count;
+}
+
+static DEVICE_ATTR_WO(reclaim);
+int reclaim_register_node(struct node *node)
+{
+	return device_create_file(&node->dev, &dev_attr_reclaim);
+}
+
+void reclaim_unregister_node(struct node *node)
+{
+	return device_remove_file(&node->dev, &dev_attr_reclaim);
+}
+#endif
-- 
cgit v1.2.3


From 004ded6bee11b8ed463cdc54b89a4390f4b64f6d Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Sat, 12 Jul 2025 12:50:03 -0700
Subject: mm/damon: accept parallel damon_call() requests

Patch series "mm/damon: remove damon_callback".

damon_callback was the only way for communicating with DAMON for contexts
running on its worker thread.  The interface is flexible and simple.  But
as DAMON evolves with more features, damon_callback has become somewhat
too old.  With runtime parameters update, for example, its lack of
synchronization support was found to be inconvenient.  Arguably it is also
not easy to use correctly since the callers should understand when each
callback is called, and implication of the return values from the
callbacks.

To replace it, damon_call() and damos_walk() are introduced.  And those
replaced a few damon_callback use cases.  Some use cases of damon_callback
such as parallel or repetitive DAMON internal data reading and additional
cleanups cannot simply be replaced by damon_call() and damos_walk(),
though.

To make those replaceable, extend damon_call() for parallel and/or
repeated callbacks and modify the core/ops layers for additional resources
cleanup.  With the updates, replace the remaining damon_callback usages
and finally say goodbye to damon_callback.


This patch (of 14):

Calling damon_call() while it is serving for another parallel thread
immediately fails with -EBUSY.  The caller should call it again, later.
Each caller implementing such retry logic would be redundant.  Accept
parallel damon_call() requests and do the wait instead of the caller.

Link: https://lkml.kernel.org/r/20250712195016.151108-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20250712195016.151108-2-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h |  7 +++++--
 mm/damon/core.c       | 49 +++++++++++++++++++++++++------------------------
 2 files changed, 30 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 1f425d830bb9..562c7876ba88 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -673,6 +673,8 @@ struct damon_call_control {
 	struct completion completion;
 	/* informs if the kdamond canceled @fn infocation */
 	bool canceled;
+	/* List head for siblings. */
+	struct list_head list;
 };
 
 /**
@@ -798,8 +800,9 @@ struct damon_ctx {
 	/* for scheme quotas prioritization */
 	unsigned long *regions_score_histogram;
 
-	struct damon_call_control *call_control;
-	struct mutex call_control_lock;
+	/* lists of &struct damon_call_control */
+	struct list_head call_controls;
+	struct mutex call_controls_lock;
 
 	struct damos_walk_control *walk_control;
 	struct mutex walk_control_lock;
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 2714a7a023db..b0a0b98f6889 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -533,7 +533,8 @@ struct damon_ctx *damon_new_ctx(void)
 	ctx->next_ops_update_sis = 0;
 
 	mutex_init(&ctx->kdamond_lock);
-	mutex_init(&ctx->call_control_lock);
+	INIT_LIST_HEAD(&ctx->call_controls);
+	mutex_init(&ctx->call_controls_lock);
 	mutex_init(&ctx->walk_control_lock);
 
 	ctx->attrs.min_nr_regions = 10;
@@ -1393,14 +1394,11 @@ int damon_call(struct damon_ctx *ctx, struct damon_call_control *control)
 {
 	init_completion(&control->completion);
 	control->canceled = false;
+	INIT_LIST_HEAD(&control->list);
 
-	mutex_lock(&ctx->call_control_lock);
-	if (ctx->call_control) {
-		mutex_unlock(&ctx->call_control_lock);
-		return -EBUSY;
-	}
-	ctx->call_control = control;
-	mutex_unlock(&ctx->call_control_lock);
+	mutex_lock(&ctx->call_controls_lock);
+	list_add_tail(&ctx->call_controls, &control->list);
+	mutex_unlock(&ctx->call_controls_lock);
 	if (!damon_is_running(ctx))
 		return -EINVAL;
 	wait_for_completion(&control->completion);
@@ -2419,11 +2417,11 @@ static void kdamond_usleep(unsigned long usecs)
 }
 
 /*
- * kdamond_call() - handle damon_call_control.
+ * kdamond_call() - handle damon_call_control objects.
  * @ctx:	The &struct damon_ctx of the kdamond.
  * @cancel:	Whether to cancel the invocation of the function.
  *
- * If there is a &struct damon_call_control request that registered via
+ * If there are &struct damon_call_control requests that registered via
  * &damon_call() on @ctx, do or cancel the invocation of the function depending
  * on @cancel.  @cancel is set when the kdamond is already out of the main loop
  * and therefore will be terminated.
@@ -2433,21 +2431,24 @@ static void kdamond_call(struct damon_ctx *ctx, bool cancel)
 	struct damon_call_control *control;
 	int ret = 0;
 
-	mutex_lock(&ctx->call_control_lock);
-	control = ctx->call_control;
-	mutex_unlock(&ctx->call_control_lock);
-	if (!control)
-		return;
-	if (cancel) {
-		control->canceled = true;
-	} else {
-		ret = control->fn(control->data);
-		control->return_code = ret;
+	while (true) {
+		mutex_lock(&ctx->call_controls_lock);
+		control = list_first_entry_or_null(&ctx->call_controls,
+				struct damon_call_control, list);
+		mutex_unlock(&ctx->call_controls_lock);
+		if (!control)
+			return;
+		if (cancel) {
+			control->canceled = true;
+		} else {
+			ret = control->fn(control->data);
+			control->return_code = ret;
+		}
+		mutex_lock(&ctx->call_controls_lock);
+		list_del(&control->list);
+		mutex_unlock(&ctx->call_controls_lock);
+		complete(&control->completion);
 	}
-	complete(&control->completion);
-	mutex_lock(&ctx->call_control_lock);
-	ctx->call_control = NULL;
-	mutex_unlock(&ctx->call_control_lock);
 }
 
 /* Returns negative error code if it's not activated but should return */
-- 
cgit v1.2.3


From 43df7676e5508895c33b846d37cff9cf3b52674c Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Sat, 12 Jul 2025 12:50:04 -0700
Subject: mm/damon/core: introduce repeat mode damon_call()

damon_call() can be useful for reading or writing DAMON internal data for
one time.  A common pattern of DAMON core usage from DAMON modules is
doing such reads and writes repeatedly, for example, to periodically
update the DAMOS stats.  To do that with damon_call(), callers should call
damon_call() repeatedly, with their own delay loop.  Each caller doing
that is repetitive.  Introduce a repeat mode damon_call().  Callers can
use the mode by setting a new field in damon_call_control.  If the mode is
turned on, damon_call() returns success immediately, and DAMON repeats
invoking the callback function inside the kdamond main loop.

Link: https://lkml.kernel.org/r/20250712195016.151108-3-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h |  2 ++
 mm/damon/core.c       | 25 ++++++++++++++++++++-----
 2 files changed, 22 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 562c7876ba88..b83987275ff9 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -659,6 +659,7 @@ struct damon_callback {
  *
  * @fn:			Function to be called back.
  * @data:		Data that will be passed to @fn.
+ * @repeat:		Repeat invocations.
  * @return_code:	Return code from @fn invocation.
  *
  * Control damon_call(), which requests specific kdamond to invoke a given
@@ -667,6 +668,7 @@ struct damon_callback {
 struct damon_call_control {
 	int (*fn)(void *data);
 	void *data;
+	bool repeat;
 	int return_code;
 /* private: internal use only */
 	/* informs if the kdamond finished handling of the request */
diff --git a/mm/damon/core.c b/mm/damon/core.c
index b0a0b98f6889..ffb87497dbb5 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -1379,8 +1379,9 @@ bool damon_is_running(struct damon_ctx *ctx)
  *
  * Ask DAMON worker thread (kdamond) of @ctx to call a function with an
  * argument data that respectively passed via &damon_call_control->fn and
- * &damon_call_control->data of @control, and wait until the kdamond finishes
- * handling of the request.
+ * &damon_call_control->data of @control.  If &damon_call_control->repeat of
+ * @control is unset, further wait until the kdamond finishes handling of the
+ * request.  Otherwise, return as soon as the request is made.
  *
  * The kdamond executes the function with the argument in the main loop, just
  * after a sampling of the iteration is finished.  The function can hence
@@ -1392,7 +1393,8 @@ bool damon_is_running(struct damon_ctx *ctx)
  */
 int damon_call(struct damon_ctx *ctx, struct damon_call_control *control)
 {
-	init_completion(&control->completion);
+	if (!control->repeat)
+		init_completion(&control->completion);
 	control->canceled = false;
 	INIT_LIST_HEAD(&control->list);
 
@@ -1401,6 +1403,8 @@ int damon_call(struct damon_ctx *ctx, struct damon_call_control *control)
 	mutex_unlock(&ctx->call_controls_lock);
 	if (!damon_is_running(ctx))
 		return -EINVAL;
+	if (control->repeat)
+		return 0;
 	wait_for_completion(&control->completion);
 	if (control->canceled)
 		return -ECANCELED;
@@ -2429,6 +2433,7 @@ static void kdamond_usleep(unsigned long usecs)
 static void kdamond_call(struct damon_ctx *ctx, bool cancel)
 {
 	struct damon_call_control *control;
+	LIST_HEAD(repeat_controls);
 	int ret = 0;
 
 	while (true) {
@@ -2437,7 +2442,7 @@ static void kdamond_call(struct damon_ctx *ctx, bool cancel)
 				struct damon_call_control, list);
 		mutex_unlock(&ctx->call_controls_lock);
 		if (!control)
-			return;
+			break;
 		if (cancel) {
 			control->canceled = true;
 		} else {
@@ -2447,8 +2452,18 @@ static void kdamond_call(struct damon_ctx *ctx, bool cancel)
 		mutex_lock(&ctx->call_controls_lock);
 		list_del(&control->list);
 		mutex_unlock(&ctx->call_controls_lock);
-		complete(&control->completion);
+		if (!control->repeat)
+			complete(&control->completion);
+		else
+			list_add(&control->list, &repeat_controls);
 	}
+	control = list_first_entry_or_null(&repeat_controls,
+			struct damon_call_control, list);
+	if (!control || cancel)
+		return;
+	mutex_lock(&ctx->call_controls_lock);
+	list_add_tail(&control->list, &ctx->call_controls);
+	mutex_unlock(&ctx->call_controls_lock);
 }
 
 /* Returns negative error code if it's not activated but should return */
-- 
cgit v1.2.3


From 7114bc5e01cf393e1fdc97e10399eb9451b6af45 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Sat, 12 Jul 2025 12:50:11 -0700
Subject: mm/damon/core: add cleanup_target() ops callback

Some DAMON operation sets may need additional cleanup per target.  For
example, [f]vaddr need to put pids of each target.  Each user and core
logic is doing that redundantly.  Add another DAMON ops callback that will
be used for doing such cleanups in operations set layer.

[sj@kernel.org: add kernel-doc comment for damon_operations->cleanup_target]
  Link: https://lkml.kernel.org/r/20250715185239.89152-2-sj@kernel.org
[sj@kernel.org: remove damon_ctx->callback kernel-doc comment]
  Link: https://lkml.kernel.org/r/20250715185239.89152-3-sj@kernel.org
Link: https://lkml.kernel.org/r/20250712195016.151108-10-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h        |  6 ++++--
 mm/damon/core.c              | 12 ++++++++----
 mm/damon/sysfs.c             |  4 ++--
 mm/damon/tests/core-kunit.h  |  4 ++--
 mm/damon/tests/vaddr-kunit.h |  2 +-
 5 files changed, 17 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index b83987275ff9..8c765e36623a 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -576,6 +576,7 @@ enum damon_ops_id {
  * @get_scheme_score:		Get the score of a region for a scheme.
  * @apply_scheme:		Apply a DAMON-based operation scheme.
  * @target_valid:		Determine if the target is valid.
+ * @cleanup_target:		Clean up each target before deallocation.
  * @cleanup:			Clean up the context.
  *
  * DAMON can be extended for various address spaces and usages.  For this,
@@ -608,6 +609,7 @@ enum damon_ops_id {
  * filters (&struct damos_filter) that handled by itself.
  * @target_valid should check whether the target is still valid for the
  * monitoring.
+ * @cleanup_target is called before the target will be deallocated.
  * @cleanup is called from @kdamond just before its termination.
  */
 struct damon_operations {
@@ -623,6 +625,7 @@ struct damon_operations {
 			struct damon_target *t, struct damon_region *r,
 			struct damos *scheme, unsigned long *sz_filter_passed);
 	bool (*target_valid)(struct damon_target *t);
+	void (*cleanup_target)(struct damon_target *t);
 	void (*cleanup)(struct damon_ctx *context);
 };
 
@@ -771,7 +774,6 @@ struct damon_attrs {
  * Accesses to other fields must be protected by themselves.
  *
  * @ops:	Set of monitoring operations for given use cases.
- * @callback:	Set of callbacks for monitoring events notifications.
  *
  * @adaptive_targets:	Head of monitoring targets (&damon_target) list.
  * @schemes:		Head of schemes (&damos) list.
@@ -933,7 +935,7 @@ struct damon_target *damon_new_target(void);
 void damon_add_target(struct damon_ctx *ctx, struct damon_target *t);
 bool damon_targets_empty(struct damon_ctx *ctx);
 void damon_free_target(struct damon_target *t);
-void damon_destroy_target(struct damon_target *t);
+void damon_destroy_target(struct damon_target *t, struct damon_ctx *ctx);
 unsigned int damon_nr_regions(struct damon_target *t);
 
 struct damon_ctx *damon_new_ctx(void);
diff --git a/mm/damon/core.c b/mm/damon/core.c
index b82a838b5a0e..678c9b4e038c 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -502,8 +502,12 @@ void damon_free_target(struct damon_target *t)
 	kfree(t);
 }
 
-void damon_destroy_target(struct damon_target *t)
+void damon_destroy_target(struct damon_target *t, struct damon_ctx *ctx)
 {
+
+	if (ctx && ctx->ops.cleanup_target)
+		ctx->ops.cleanup_target(t);
+
 	damon_del_target(t);
 	damon_free_target(t);
 }
@@ -551,7 +555,7 @@ static void damon_destroy_targets(struct damon_ctx *ctx)
 	struct damon_target *t, *next_t;
 
 	damon_for_each_target_safe(t, next_t, ctx)
-		damon_destroy_target(t);
+		damon_destroy_target(t, ctx);
 }
 
 void damon_destroy_ctx(struct damon_ctx *ctx)
@@ -1137,7 +1141,7 @@ static int damon_commit_targets(
 
 			if (damon_target_has_pid(dst))
 				put_pid(dst_target->pid);
-			damon_destroy_target(dst_target);
+			damon_destroy_target(dst_target, dst);
 			damon_for_each_scheme(s, dst) {
 				if (s->quota.charge_target_from == dst_target) {
 					s->quota.charge_target_from = NULL;
@@ -1156,7 +1160,7 @@ static int damon_commit_targets(
 		err = damon_commit_target(new_target, false,
 				src_target, damon_target_has_pid(src));
 		if (err) {
-			damon_destroy_target(new_target);
+			damon_destroy_target(new_target, NULL);
 			return err;
 		}
 		damon_add_target(dst, new_target);
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index c0193de6fb9a..f2f9f756f5a2 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -1303,7 +1303,7 @@ static void damon_sysfs_destroy_targets(struct damon_ctx *ctx)
 	damon_for_each_target_safe(t, next, ctx) {
 		if (has_pid)
 			put_pid(t->pid);
-		damon_destroy_target(t);
+		damon_destroy_target(t, ctx);
 	}
 }
 
@@ -1389,7 +1389,7 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx)
 
 	damon_for_each_target_safe(t, next, ctx) {
 		put_pid(t->pid);
-		damon_destroy_target(t);
+		damon_destroy_target(t, ctx);
 	}
 }
 
diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h
index 298c67557fae..dfedfff19940 100644
--- a/mm/damon/tests/core-kunit.h
+++ b/mm/damon/tests/core-kunit.h
@@ -58,7 +58,7 @@ static void damon_test_target(struct kunit *test)
 	damon_add_target(c, t);
 	KUNIT_EXPECT_EQ(test, 1u, nr_damon_targets(c));
 
-	damon_destroy_target(t);
+	damon_destroy_target(t, c);
 	KUNIT_EXPECT_EQ(test, 0u, nr_damon_targets(c));
 
 	damon_destroy_ctx(c);
@@ -310,7 +310,7 @@ static void damon_test_set_regions(struct kunit *test)
 		KUNIT_EXPECT_EQ(test, r->ar.start, expects[expect_idx++]);
 		KUNIT_EXPECT_EQ(test, r->ar.end, expects[expect_idx++]);
 	}
-	damon_destroy_target(t);
+	damon_destroy_target(t, NULL);
 }
 
 static void damon_test_nr_accesses_to_accesses_bp(struct kunit *test)
diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h
index 7cd944266a92..d2b37ccf2cc0 100644
--- a/mm/damon/tests/vaddr-kunit.h
+++ b/mm/damon/tests/vaddr-kunit.h
@@ -149,7 +149,7 @@ static void damon_do_test_apply_three_regions(struct kunit *test,
 		KUNIT_EXPECT_EQ(test, r->ar.end, expected[i * 2 + 1]);
 	}
 
-	damon_destroy_target(t);
+	damon_destroy_target(t, NULL);
 }
 
 /*
-- 
cgit v1.2.3


From 5add26c0a18636e8e9fe409d4591c8a36e1bf695 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Sat, 12 Jul 2025 12:50:16 -0700
Subject: mm/damon/core: remove damon_callback

All damon_callback usages are replaced by damon_call() and damos_walk().
Time to say goodbye.  Remove damon_callback.

Link: https://lkml.kernel.org/r/20250712195016.151108-15-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h | 31 +------------------------------
 mm/damon/core.c       | 26 +++++++-------------------
 2 files changed, 8 insertions(+), 49 deletions(-)

(limited to 'include')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 8c765e36623a..f13664c62ddd 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -629,34 +629,6 @@ struct damon_operations {
 	void (*cleanup)(struct damon_ctx *context);
 };
 
-/**
- * struct damon_callback - Monitoring events notification callbacks.
- *
- * @after_wmarks_check:	Called after each schemes' watermarks check.
- * @after_aggregation:	Called after each aggregation.
- * @before_terminate:	Called before terminating the monitoring.
- *
- * The monitoring thread (&damon_ctx.kdamond) calls @before_terminate just
- * before finishing the monitoring.
- *
- * The monitoring thread calls @after_wmarks_check after each DAMON-based
- * operation schemes' watermarks check.  If users need to make changes to the
- * attributes of the monitoring context while it's deactivated due to the
- * watermarks, this is the good place to do.
- *
- * The monitoring thread calls @after_aggregation for each of the aggregation
- * intervals.  Therefore, users can safely access the monitoring results
- * without additional protection.  For the reason, users are recommended to use
- * these callback for the accesses to the results.
- *
- * If any callback returns non-zero, monitoring stops.
- */
-struct damon_callback {
-	int (*after_wmarks_check)(struct damon_ctx *context);
-	int (*after_aggregation)(struct damon_ctx *context);
-	void (*before_terminate)(struct damon_ctx *context);
-};
-
 /*
  * struct damon_call_control - Control damon_call().
  *
@@ -727,7 +699,7 @@ struct damon_intervals_goal {
  * ``mmap()`` calls from the application, in case of virtual memory monitoring)
  * and applies the changes for each @ops_update_interval.  All time intervals
  * are in micro-seconds.  Please refer to &struct damon_operations and &struct
- * damon_callback for more detail.
+ * damon_call_control for more detail.
  */
 struct damon_attrs {
 	unsigned long sample_interval;
@@ -816,7 +788,6 @@ struct damon_ctx {
 	struct mutex kdamond_lock;
 
 	struct damon_operations ops;
-	struct damon_callback callback;
 
 	struct list_head adaptive_targets;
 	struct list_head schemes;
diff --git a/mm/damon/core.c b/mm/damon/core.c
index ffd1a061c2cb..f3ec3bd736ec 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -680,9 +680,7 @@ static bool damon_valid_intervals_goal(struct damon_attrs *attrs)
  * @attrs:		monitoring attributes
  *
  * This function should be called while the kdamond is not running, an access
- * check results aggregation is not ongoing (e.g., from &struct
- * damon_callback->after_aggregation or &struct
- * damon_callback->after_wmarks_check callbacks), or from damon_call().
+ * check results aggregation is not ongoing (e.g., from damon_call()).
  *
  * Every time interval is in micro-seconds.
  *
@@ -778,7 +776,7 @@ static void damos_commit_quota_goal(
  * DAMON contexts, instead of manual in-place updates.
  *
  * This function should be called from parameters-update safe context, like
- * DAMON callbacks.
+ * damon_call().
  */
 int damos_commit_quota_goals(struct damos_quota *dst, struct damos_quota *src)
 {
@@ -1177,7 +1175,7 @@ static int damon_commit_targets(
  * in-place updates.
  *
  * This function should be called from parameters-update safe context, like
- * DAMON callbacks.
+ * damon_call().
  */
 int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src)
 {
@@ -2484,9 +2482,6 @@ static int kdamond_wait_activation(struct damon_ctx *ctx)
 
 		kdamond_usleep(min_wait_time);
 
-		if (ctx->callback.after_wmarks_check &&
-				ctx->callback.after_wmarks_check(ctx))
-			break;
 		kdamond_call(ctx, false);
 		damos_walk_cancel(ctx);
 	}
@@ -2543,10 +2538,9 @@ static int kdamond_fn(void *data)
 	while (!kdamond_need_stop(ctx)) {
 		/*
 		 * ctx->attrs and ctx->next_{aggregation,ops_update}_sis could
-		 * be changed from after_wmarks_check() or after_aggregation()
-		 * callbacks.  Read the values here, and use those for this
-		 * iteration.  That is, damon_set_attrs() updated new values
-		 * are respected from next iteration.
+		 * be changed from kdamond_call().  Read the values here, and
+		 * use those for this iteration.  That is, damon_set_attrs()
+		 * updated new values are respected from next iteration.
 		 */
 		unsigned long next_aggregation_sis = ctx->next_aggregation_sis;
 		unsigned long next_ops_update_sis = ctx->next_ops_update_sis;
@@ -2564,14 +2558,10 @@ static int kdamond_fn(void *data)
 		if (ctx->ops.check_accesses)
 			max_nr_accesses = ctx->ops.check_accesses(ctx);
 
-		if (ctx->passed_sample_intervals >= next_aggregation_sis) {
+		if (ctx->passed_sample_intervals >= next_aggregation_sis)
 			kdamond_merge_regions(ctx,
 					max_nr_accesses / 10,
 					sz_limit);
-			if (ctx->callback.after_aggregation &&
-					ctx->callback.after_aggregation(ctx))
-				break;
-		}
 
 		/*
 		 * do kdamond_call() and kdamond_apply_schemes() after
@@ -2637,8 +2627,6 @@ done:
 			damon_destroy_region(r, t);
 	}
 
-	if (ctx->callback.before_terminate)
-		ctx->callback.before_terminate(ctx);
 	if (ctx->ops.cleanup)
 		ctx->ops.cleanup(ctx);
 	kfree(ctx->regions_score_histogram);
-- 
cgit v1.2.3


From 9989db9f230542cfc097be3291b9457173371eb1 Mon Sep 17 00:00:00 2001
From: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Date: Fri, 11 Jul 2025 10:59:10 -0400
Subject: mm/page_owner: convert set_page_owner_migrate_reason() to folios

Both callers of set_page_owner_migrate_reason() use folios.  Convert the
function to take a folio directly and move the &folio->page conversion
inside __set_page_owner_migrate_reason().

Link: https://lkml.kernel.org/r/20250711145910.90135-1-sidhartha.kumar@oracle.com
Signed-off-by: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page_owner.h | 8 ++++----
 mm/hugetlb.c               | 2 +-
 mm/migrate.c               | 2 +-
 mm/page_owner.c            | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h
index debdc25f08b9..3328357f6dba 100644
--- a/include/linux/page_owner.h
+++ b/include/linux/page_owner.h
@@ -14,7 +14,7 @@ extern void __set_page_owner(struct page *page,
 extern void __split_page_owner(struct page *page, int old_order,
 			int new_order);
 extern void __folio_copy_owner(struct folio *newfolio, struct folio *old);
-extern void __set_page_owner_migrate_reason(struct page *page, int reason);
+extern void __folio_set_owner_migrate_reason(struct folio *folio, int reason);
 extern void __dump_page_owner(const struct page *page);
 extern void pagetypeinfo_showmixedcount_print(struct seq_file *m,
 					pg_data_t *pgdat, struct zone *zone);
@@ -43,10 +43,10 @@ static inline void folio_copy_owner(struct folio *newfolio, struct folio *old)
 	if (static_branch_unlikely(&page_owner_inited))
 		__folio_copy_owner(newfolio, old);
 }
-static inline void set_page_owner_migrate_reason(struct page *page, int reason)
+static inline void folio_set_owner_migrate_reason(struct folio *folio, int reason)
 {
 	if (static_branch_unlikely(&page_owner_inited))
-		__set_page_owner_migrate_reason(page, reason);
+		__folio_set_owner_migrate_reason(folio, reason);
 }
 static inline void dump_page_owner(const struct page *page)
 {
@@ -68,7 +68,7 @@ static inline void split_page_owner(struct page *page, int old_order,
 static inline void folio_copy_owner(struct folio *newfolio, struct folio *folio)
 {
 }
-static inline void set_page_owner_migrate_reason(struct page *page, int reason)
+static inline void folio_set_owner_migrate_reason(struct folio *folio, int reason)
 {
 }
 static inline void dump_page_owner(const struct page *page)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f13fa5aa6624..753f99b4c718 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7835,7 +7835,7 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re
 	struct hstate *h = folio_hstate(old_folio);
 
 	hugetlb_cgroup_migrate(old_folio, new_folio);
-	set_page_owner_migrate_reason(&new_folio->page, reason);
+	folio_set_owner_migrate_reason(new_folio, reason);
 
 	/*
 	 * transfer temporary state of the new hugetlb folio. This is
diff --git a/mm/migrate.c b/mm/migrate.c
index 36b2764204b6..425401b2d4e1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1367,7 +1367,7 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
 
 out_unlock_both:
 	folio_unlock(dst);
-	set_page_owner_migrate_reason(&dst->page, reason);
+	folio_set_owner_migrate_reason(dst, reason);
 	/*
 	 * If migration is successful, decrease refcount of dst,
 	 * which will not free the page because new page owner increased
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 9928c9ac8c31..c3ca21132c2c 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -333,9 +333,9 @@ noinline void __set_page_owner(struct page *page, unsigned short order,
 	inc_stack_record_count(handle, gfp_mask, 1 << order);
 }
 
-void __set_page_owner_migrate_reason(struct page *page, int reason)
+void __folio_set_owner_migrate_reason(struct folio *folio, int reason)
 {
-	struct page_ext *page_ext = page_ext_get(page);
+	struct page_ext *page_ext = page_ext_get(&folio->page);
 	struct page_owner *page_owner;
 
 	if (unlikely(!page_ext))
-- 
cgit v1.2.3


From 35c18f2933c596b4fd6a98baee36f3137d133a5f Mon Sep 17 00:00:00 2001
From: Jiri Bohac <jbohac@suse.cz>
Date: Thu, 12 Jun 2025 12:13:21 +0200
Subject: Add a new optional ",cma" suffix to the crashkernel= command line
 option

Patch series "kdump: crashkernel reservation from CMA", v5.

This series implements a way to reserve additional crash kernel memory
using CMA.

Currently, all the memory for the crash kernel is not usable by the 1st
(production) kernel.  It is also unmapped so that it can't be corrupted by
the fault that will eventually trigger the crash.  This makes sense for
the memory actually used by the kexec-loaded crash kernel image and initrd
and the data prepared during the load (vmcoreinfo, ...).  However, the
reserved space needs to be much larger than that to provide enough
run-time memory for the crash kernel and the kdump userspace.  Estimating
the amount of memory to reserve is difficult.  Being too careful makes
kdump likely to end in OOM, being too generous takes even more memory from
the production system.  Also, the reservation only allows reserving a
single contiguous block (or two with the "low" suffix).  I've seen systems
where this fails because the physical memory is fragmented.

By reserving additional crashkernel memory from CMA, the main crashkernel
reservation can be just large enough to fit the kernel and initrd image,
minimizing the memory taken away from the production system.  Most of the
run-time memory for the crash kernel will be memory previously available
to userspace in the production system.  As this memory is no longer
wasted, the reservation can be done with a generous margin, making kdump
more reliable.  Kernel memory that we need to preserve for dumping is
normally not allocated from CMA, unless it is explicitly allocated as
movable.  Currently this is only the case for memory ballooning and zswap.
Such movable memory will be missing from the vmcore.  User data is
typically not dumped by makedumpfile.  When dumping of user data is
intended this new CMA reservation cannot be used.

There are five patches in this series:

The first adds a new ",cma" suffix to the recently introduced generic
crashkernel parsing code.  parse_crashkernel() takes one more argument to
store the cma reservation size.

The second patch implements reserve_crashkernel_cma() which performs the
reservation.  If the requested size is not available in a single range,
multiple smaller ranges will be reserved.

The third patch updates Documentation/, explicitly mentioning the
potential DMA corruption of the CMA-reserved memory.

The fourth patch adds a short delay before booting the kdump kernel,
allowing pending DMA transfers to finish.

The fifth patch enables the functionality for x86 as a proof of
concept. There are just three things every arch needs to do:
- call reserve_crashkernel_cma()
- include the CMA-reserved ranges in the physical memory map
- exclude the CMA-reserved ranges from the memory available
  through /proc/vmcore by excluding them from the vmcoreinfo
  PT_LOAD ranges.

Adding other architectures is easy and I can do that as soon as this
series is merged.

With this series applied, specifying
	crashkernel=100M crashkernel=1G,cma
on the command line will make a standard crashkernel reservation
of 100M, where kexec will load the kernel and initrd.

An additional 1G will be reserved from CMA, still usable by the production
system.  The crash kernel will have 1.1G memory available.  The 100M can
be reliably predicted based on the size of the kernel and initrd.

The new cma suffix is completely optional. When no
crashkernel=size,cma is specified, everything works as before.


This patch (of 5):

Add a new cma_size parameter to parse_crashkernel().  When not NULL, call
__parse_crashkernel to parse the CMA reservation size from
"crashkernel=size,cma" and store it in cma_size.

Set cma_size to NULL in all calls to parse_crashkernel().

Link: https://lkml.kernel.org/r/aEqnxxfLZMllMC8I@dwarf.suse.cz
Link: https://lkml.kernel.org/r/aEqoQckgoTQNULnh@dwarf.suse.cz
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Cc: Baoquan He <bhe@redhat.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Donald Dutile <ddutile@redhat.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Philipp Rudo <prudo@redhat.com>
Cc: Pingfan Liu <piliu@redhat.com>
Cc: Tao Liu <ltao@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm/kernel/setup.c              |  2 +-
 arch/arm64/mm/init.c                 |  2 +-
 arch/loongarch/kernel/setup.c        |  2 +-
 arch/mips/kernel/setup.c             |  2 +-
 arch/powerpc/kernel/fadump.c         |  2 +-
 arch/powerpc/kexec/core.c            |  2 +-
 arch/powerpc/mm/nohash/kaslr_booke.c |  2 +-
 arch/riscv/mm/init.c                 |  2 +-
 arch/s390/kernel/setup.c             |  2 +-
 arch/sh/kernel/machine_kexec.c       |  2 +-
 arch/x86/kernel/setup.c              |  2 +-
 include/linux/crash_reserve.h        |  3 ++-
 kernel/crash_reserve.c               | 16 ++++++++++++++--
 13 files changed, 27 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index a41c93988d2c..0bfd66c7ada0 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -1004,7 +1004,7 @@ static void __init reserve_crashkernel(void)
 	total_mem = get_total_mem();
 	ret = parse_crashkernel(boot_command_line, total_mem,
 				&crash_size, &crash_base,
-				NULL, NULL);
+				NULL, NULL, NULL);
 	/* invalid value specified or crashkernel=0 */
 	if (ret || !crash_size)
 		return;
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 0c8c35dd645e..ea84a61ed508 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -106,7 +106,7 @@ static void __init arch_reserve_crashkernel(void)
 
 	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
 				&crash_size, &crash_base,
-				&low_size, &high);
+				&low_size, NULL, &high);
 	if (ret)
 		return;
 
diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c
index b99fbb388fe0..22b27cd447a1 100644
--- a/arch/loongarch/kernel/setup.c
+++ b/arch/loongarch/kernel/setup.c
@@ -265,7 +265,7 @@ static void __init arch_reserve_crashkernel(void)
 		return;
 
 	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
-				&crash_size, &crash_base, &low_size, &high);
+				&crash_size, &crash_base, &low_size, NULL, &high);
 	if (ret)
 		return;
 
diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c
index fbfe0771317e..11b9b6b63e19 100644
--- a/arch/mips/kernel/setup.c
+++ b/arch/mips/kernel/setup.c
@@ -458,7 +458,7 @@ static void __init mips_parse_crashkernel(void)
 	total_mem = memblock_phys_mem_size();
 	ret = parse_crashkernel(boot_command_line, total_mem,
 				&crash_size, &crash_base,
-				NULL, NULL);
+				NULL, NULL, NULL);
 	if (ret != 0 || crash_size <= 0)
 		return;
 
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 8ca49e40c473..28cab25d5b33 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -333,7 +333,7 @@ static __init u64 fadump_calculate_reserve_size(void)
 	 * memory at a predefined offset.
 	 */
 	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
-				&size, &base, NULL, NULL);
+				&size, &base, NULL, NULL, NULL);
 	if (ret == 0 && size > 0) {
 		unsigned long max_size;
 
diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c
index 00e9c267b912..d1a2d755381c 100644
--- a/arch/powerpc/kexec/core.c
+++ b/arch/powerpc/kexec/core.c
@@ -110,7 +110,7 @@ void __init arch_reserve_crashkernel(void)
 
 	/* use common parsing */
 	ret = parse_crashkernel(boot_command_line, total_mem_sz, &crash_size,
-				&crash_base, NULL, NULL);
+				&crash_base, NULL, NULL, NULL);
 
 	if (ret)
 		return;
diff --git a/arch/powerpc/mm/nohash/kaslr_booke.c b/arch/powerpc/mm/nohash/kaslr_booke.c
index 5c8d1bb98b3e..5e4897daaaea 100644
--- a/arch/powerpc/mm/nohash/kaslr_booke.c
+++ b/arch/powerpc/mm/nohash/kaslr_booke.c
@@ -178,7 +178,7 @@ static void __init get_crash_kernel(void *fdt, unsigned long size)
 	int ret;
 
 	ret = parse_crashkernel(boot_command_line, size, &crash_size,
-				&crash_base, NULL, NULL);
+				&crash_base, NULL, NULL, NULL);
 	if (ret != 0 || crash_size == 0)
 		return;
 	if (crash_base == 0)
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 8d0374d7ce8e..15683ae13fa5 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -1408,7 +1408,7 @@ static void __init arch_reserve_crashkernel(void)
 
 	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
 				&crash_size, &crash_base,
-				&low_size, &high);
+				&low_size, NULL, &high);
 	if (ret)
 		return;
 
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index f244c5560e7f..b99aeb0db2ee 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -605,7 +605,7 @@ static void __init reserve_crashkernel(void)
 	int rc;
 
 	rc = parse_crashkernel(boot_command_line, ident_map_size,
-			       &crash_size, &crash_base, NULL, NULL);
+			       &crash_size, &crash_base, NULL, NULL, NULL);
 
 	crash_base = ALIGN(crash_base, KEXEC_CRASH_MEM_ALIGN);
 	crash_size = ALIGN(crash_size, KEXEC_CRASH_MEM_ALIGN);
diff --git a/arch/sh/kernel/machine_kexec.c b/arch/sh/kernel/machine_kexec.c
index 8321b31d2e19..37073ca1e0ad 100644
--- a/arch/sh/kernel/machine_kexec.c
+++ b/arch/sh/kernel/machine_kexec.c
@@ -146,7 +146,7 @@ void __init reserve_crashkernel(void)
 		return;
 
 	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
-			&crash_size, &crash_base, NULL, NULL);
+			&crash_size, &crash_base, NULL, NULL, NULL);
 	if (ret == 0 && crash_size > 0) {
 		crashk_res.start = crash_base;
 		crashk_res.end = crash_base + crash_size - 1;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index fb27be697128..c22dc630c297 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -608,7 +608,7 @@ static void __init arch_reserve_crashkernel(void)
 
 	ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
 				&crash_size, &crash_base,
-				&low_size, &high);
+				&low_size, NULL, &high);
 	if (ret)
 		return;
 
diff --git a/include/linux/crash_reserve.h b/include/linux/crash_reserve.h
index 1fe7e7d1b214..e784aaff2f5a 100644
--- a/include/linux/crash_reserve.h
+++ b/include/linux/crash_reserve.h
@@ -16,7 +16,8 @@ extern struct resource crashk_low_res;
 
 int __init parse_crashkernel(char *cmdline, unsigned long long system_ram,
 		unsigned long long *crash_size, unsigned long long *crash_base,
-		unsigned long long *low_size, bool *high);
+		unsigned long long *low_size, unsigned long long *cma_size,
+		bool *high);
 
 #ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
 #ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE
diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c
index acb6bf42e30d..86ae1365d04e 100644
--- a/kernel/crash_reserve.c
+++ b/kernel/crash_reserve.c
@@ -172,17 +172,19 @@ static int __init parse_crashkernel_simple(char *cmdline,
 
 #define SUFFIX_HIGH 0
 #define SUFFIX_LOW  1
-#define SUFFIX_NULL 2
+#define SUFFIX_CMA  2
+#define SUFFIX_NULL 3
 static __initdata char *suffix_tbl[] = {
 	[SUFFIX_HIGH] = ",high",
 	[SUFFIX_LOW]  = ",low",
+	[SUFFIX_CMA]  = ",cma",
 	[SUFFIX_NULL] = NULL,
 };
 
 /*
  * That function parses "suffix"  crashkernel command lines like
  *
- *	crashkernel=size,[high|low]
+ *	crashkernel=size,[high|low|cma]
  *
  * It returns 0 on success and -EINVAL on failure.
  */
@@ -298,9 +300,11 @@ int __init parse_crashkernel(char *cmdline,
 			     unsigned long long *crash_size,
 			     unsigned long long *crash_base,
 			     unsigned long long *low_size,
+			     unsigned long long *cma_size,
 			     bool *high)
 {
 	int ret;
+	unsigned long long __always_unused cma_base;
 
 	/* crashkernel=X[@offset] */
 	ret = __parse_crashkernel(cmdline, system_ram, crash_size,
@@ -331,6 +335,14 @@ int __init parse_crashkernel(char *cmdline,
 
 		*high = true;
 	}
+
+	/*
+	 * optional CMA reservation
+	 * cma_base is ignored
+	 */
+	if (cma_size)
+		__parse_crashkernel(cmdline, 0, cma_size,
+			&cma_base, suffix_tbl[SUFFIX_CMA]);
 #endif
 	if (!*crash_size)
 		ret = -EINVAL;
-- 
cgit v1.2.3


From ab475510e0422bb5672d465f9d0f523d72fdb7f1 Mon Sep 17 00:00:00 2001
From: Jiri Bohac <jbohac@suse.cz>
Date: Thu, 12 Jun 2025 12:16:39 +0200
Subject: kdump: implement reserve_crashkernel_cma

reserve_crashkernel_cma() reserves CMA ranges for the crash kernel.  If
allocating the requested size fails, try to reserve in smaller blocks.

Store the reserved ranges in the crashk_cma_ranges array and the number of
ranges in crashk_cma_cnt.

Link: https://lkml.kernel.org/r/aEqpBwOy_ekm0gw9@dwarf.suse.cz
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Cc: Baoquan He <bhe@redhat.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Donald Dutile <ddutile@redhat.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Philipp Rudo <prudo@redhat.com>
Cc: Pingfan Liu <piliu@redhat.com>
Cc: Tao Liu <ltao@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/crash_reserve.h | 12 ++++++++++
 kernel/crash_reserve.c        | 52 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+)

(limited to 'include')

diff --git a/include/linux/crash_reserve.h b/include/linux/crash_reserve.h
index e784aaff2f5a..7b44b41d0a20 100644
--- a/include/linux/crash_reserve.h
+++ b/include/linux/crash_reserve.h
@@ -13,12 +13,24 @@
  */
 extern struct resource crashk_res;
 extern struct resource crashk_low_res;
+extern struct range crashk_cma_ranges[];
+#if defined(CONFIG_CMA) && defined(CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION)
+#define CRASHKERNEL_CMA
+#define CRASHKERNEL_CMA_RANGES_MAX 4
+extern int crashk_cma_cnt;
+#else
+#define crashk_cma_cnt 0
+#define CRASHKERNEL_CMA_RANGES_MAX 0
+#endif
+
 
 int __init parse_crashkernel(char *cmdline, unsigned long long system_ram,
 		unsigned long long *crash_size, unsigned long long *crash_base,
 		unsigned long long *low_size, unsigned long long *cma_size,
 		bool *high);
 
+void __init reserve_crashkernel_cma(unsigned long long cma_size);
+
 #ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
 #ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE
 #define DEFAULT_CRASH_KERNEL_LOW_SIZE	(128UL << 20)
diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c
index 86ae1365d04e..87bf4d41eabb 100644
--- a/kernel/crash_reserve.c
+++ b/kernel/crash_reserve.c
@@ -14,6 +14,8 @@
 #include <linux/cpuhotplug.h>
 #include <linux/memblock.h>
 #include <linux/kmemleak.h>
+#include <linux/cma.h>
+#include <linux/crash_reserve.h>
 
 #include <asm/page.h>
 #include <asm/sections.h>
@@ -469,6 +471,56 @@ retry:
 #endif
 }
 
+struct range crashk_cma_ranges[CRASHKERNEL_CMA_RANGES_MAX];
+#ifdef CRASHKERNEL_CMA
+int crashk_cma_cnt;
+void __init reserve_crashkernel_cma(unsigned long long cma_size)
+{
+	unsigned long long request_size = roundup(cma_size, PAGE_SIZE);
+	unsigned long long reserved_size = 0;
+
+	if (!cma_size)
+		return;
+
+	while (cma_size > reserved_size &&
+	       crashk_cma_cnt < CRASHKERNEL_CMA_RANGES_MAX) {
+
+		struct cma *res;
+
+		if (cma_declare_contiguous(0, request_size, 0, 0, 0, false,
+				       "crashkernel", &res)) {
+			/* reservation failed, try half-sized blocks */
+			if (request_size <= PAGE_SIZE)
+				break;
+
+			request_size = roundup(request_size / 2, PAGE_SIZE);
+			continue;
+		}
+
+		crashk_cma_ranges[crashk_cma_cnt].start = cma_get_base(res);
+		crashk_cma_ranges[crashk_cma_cnt].end =
+			crashk_cma_ranges[crashk_cma_cnt].start +
+			cma_get_size(res) - 1;
+		++crashk_cma_cnt;
+		reserved_size += request_size;
+	}
+
+	if (cma_size > reserved_size)
+		pr_warn("crashkernel CMA reservation failed: %lld MB requested, %lld MB reserved in %d ranges\n",
+			cma_size >> 20, reserved_size >> 20, crashk_cma_cnt);
+	else
+		pr_info("crashkernel CMA reserved: %lld MB in %d ranges\n",
+			reserved_size >> 20, crashk_cma_cnt);
+}
+
+#else /* CRASHKERNEL_CMA */
+void __init reserve_crashkernel_cma(unsigned long long cma_size)
+{
+	if (cma_size)
+		pr_warn("crashkernel CMA reservation not supported\n");
+}
+#endif
+
 #ifndef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY
 static __init int insert_crashkernel_resources(void)
 {
-- 
cgit v1.2.3


From b76e89e50fc3693b7b8a443ed906320d8ccb93fd Mon Sep 17 00:00:00 2001
From: Feng Tang <feng.tang@linux.alibaba.com>
Date: Thu, 3 Jul 2025 10:10:01 +0800
Subject: panic: generalize panic_print's function to show sys info

'panic_print' was introduced to help debugging kernel panic by dumping
different kinds of system information like tasks' call stack, memory,
ftrace buffer, etc.  Actually this function could also be used to help
debugging other cases like task-hung, soft/hard lockup, etc.  where user
may need the snapshot of system info at that time.

Extract system info dump function related code from panic.c to separate
file sys_info.[ch], for wider usage by other kernel parts for debugging.

Also rename the macros, switching some names from singular to plural forms.

Link: https://lkml.kernel.org/r/20250703021004.42328-3-feng.tang@linux.alibaba.com
Signed-off-by: Feng Tang <feng.tang@linux.alibaba.com>
Suggested-by: Petr Mladek <pmladek@suse.com>
Cc: John Ogness <john.ogness@linutronix.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: "Paul E . McKenney" <paulmck@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sys_info.h | 20 ++++++++++++++++++++
 kernel/panic.c           | 36 ++++--------------------------------
 lib/Makefile             |  2 +-
 lib/sys_info.c           | 32 ++++++++++++++++++++++++++++++++
 4 files changed, 57 insertions(+), 33 deletions(-)
 create mode 100644 include/linux/sys_info.h
 create mode 100644 lib/sys_info.c

(limited to 'include')

diff --git a/include/linux/sys_info.h b/include/linux/sys_info.h
new file mode 100644
index 000000000000..53b7e27dbf2a
--- /dev/null
+++ b/include/linux/sys_info.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SYS_INFO_H
+#define _LINUX_SYS_INFO_H
+
+/*
+ * SYS_INFO_PANIC_CONSOLE_REPLAY is for panic case only, as it needs special
+ * handling which only fits panic case.
+ */
+#define SYS_INFO_TASKS			0x00000001
+#define SYS_INFO_MEM			0x00000002
+#define SYS_INFO_TIMERS			0x00000004
+#define SYS_INFO_LOCKS			0x00000008
+#define SYS_INFO_FTRACE			0x00000010
+#define SYS_INFO_PANIC_CONSOLE_REPLAY	0x00000020
+#define SYS_INFO_ALL_CPU_BT		0x00000040
+#define SYS_INFO_BLOCKED_TASKS		0x00000080
+
+void sys_info(unsigned long si_mask);
+
+#endif	/* _LINUX_SYS_INFO_H */
diff --git a/kernel/panic.c b/kernel/panic.c
index 9b6c5dc28a65..cbb0681177b3 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -36,6 +36,7 @@
 #include <linux/sysfs.h>
 #include <linux/context_tracking.h>
 #include <linux/seq_buf.h>
+#include <linux/sys_info.h>
 #include <trace/events/error_report.h>
 #include <asm/sections.h>
 
@@ -69,14 +70,6 @@ bool panic_triggering_all_cpu_backtrace;
 int panic_timeout = CONFIG_PANIC_TIMEOUT;
 EXPORT_SYMBOL_GPL(panic_timeout);
 
-#define PANIC_PRINT_TASK_INFO		0x00000001
-#define PANIC_PRINT_MEM_INFO		0x00000002
-#define PANIC_PRINT_TIMER_INFO		0x00000004
-#define PANIC_PRINT_LOCK_INFO		0x00000008
-#define PANIC_PRINT_FTRACE_INFO		0x00000010
-#define PANIC_CONSOLE_REPLAY		0x00000020
-#define PANIC_PRINT_ALL_CPU_BT		0x00000040
-#define PANIC_PRINT_BLOCKED_TASKS	0x00000080
 unsigned long panic_print;
 
 ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
@@ -240,31 +233,10 @@ EXPORT_SYMBOL(nmi_panic);
 
 static void panic_console_replay(void)
 {
-	if (panic_print & PANIC_CONSOLE_REPLAY)
+	if (panic_print & SYS_INFO_PANIC_CONSOLE_REPLAY)
 		console_flush_on_panic(CONSOLE_REPLAY_ALL);
 }
 
-static void panic_print_sys_info(void)
-{
-	if (panic_print & PANIC_PRINT_TASK_INFO)
-		show_state();
-
-	if (panic_print & PANIC_PRINT_MEM_INFO)
-		show_mem();
-
-	if (panic_print & PANIC_PRINT_TIMER_INFO)
-		sysrq_timer_list_show();
-
-	if (panic_print & PANIC_PRINT_LOCK_INFO)
-		debug_show_all_locks();
-
-	if (panic_print & PANIC_PRINT_FTRACE_INFO)
-		ftrace_dump(DUMP_ALL);
-
-	if (panic_print & PANIC_PRINT_BLOCKED_TASKS)
-		show_state_filter(TASK_UNINTERRUPTIBLE);
-}
-
 void check_panic_on_warn(const char *origin)
 {
 	unsigned int limit;
@@ -285,7 +257,7 @@ void check_panic_on_warn(const char *origin)
  */
 static void panic_other_cpus_shutdown(bool crash_kexec)
 {
-	if (panic_print & PANIC_PRINT_ALL_CPU_BT) {
+	if (panic_print & SYS_INFO_ALL_CPU_BT) {
 		/* Temporary allow non-panic CPUs to write their backtraces. */
 		panic_triggering_all_cpu_backtrace = true;
 		trigger_all_cpu_backtrace();
@@ -410,7 +382,7 @@ void panic(const char *fmt, ...)
 	 */
 	atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
 
-	panic_print_sys_info();
+	sys_info(panic_print);
 
 	kmsg_dump_desc(KMSG_DUMP_PANIC, buf);
 
diff --git a/lib/Makefile b/lib/Makefile
index c38582f187dd..88d6228089a8 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -40,7 +40,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
 	 earlycpio.o seq_buf.o siphash.o dec_and_lock.o \
 	 nmi_backtrace.o win_minmax.o memcat_p.o \
-	 buildid.o objpool.o iomem_copy.o
+	 buildid.o objpool.o iomem_copy.o sys_info.o
 
 lib-$(CONFIG_UNION_FIND) += union_find.o
 lib-$(CONFIG_PRINTK) += dump_stack.o
diff --git a/lib/sys_info.c b/lib/sys_info.c
new file mode 100644
index 000000000000..53031e5cb98e
--- /dev/null
+++ b/lib/sys_info.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/sched/debug.h>
+#include <linux/console.h>
+#include <linux/kernel.h>
+#include <linux/ftrace.h>
+#include <linux/nmi.h>
+
+#include <linux/sys_info.h>
+
+void sys_info(unsigned long si_mask)
+{
+	if (si_mask & SYS_INFO_TASKS)
+		show_state();
+
+	if (si_mask & SYS_INFO_MEM)
+		show_mem();
+
+	if (si_mask & SYS_INFO_TIMERS)
+		sysrq_timer_list_show();
+
+	if (si_mask & SYS_INFO_LOCKS)
+		debug_show_all_locks();
+
+	if (si_mask & SYS_INFO_FTRACE)
+		ftrace_dump(DUMP_ALL);
+
+	if (si_mask & SYS_INFO_ALL_CPU_BT)
+		trigger_all_cpu_backtrace();
+
+	if (si_mask & SYS_INFO_BLOCKED_TASKS)
+		show_state_filter(TASK_UNINTERRUPTIBLE);
+}
-- 
cgit v1.2.3


From d747755917bf8ae08f490c3fe7d8e321afab8127 Mon Sep 17 00:00:00 2001
From: Feng Tang <feng.tang@linux.alibaba.com>
Date: Thu, 3 Jul 2025 10:10:02 +0800
Subject: panic: add 'panic_sys_info' sysctl to take human readable string
 parameter

The bitmap definition of 'panic_print' is hard to remember and decode.
Add a 'panic_sys_info' sysctl that takes a human readable string like
"tasks,mem,timers,locks,ftrace,..." and translates it into the bitmap.

The detailed mapping is:
	SYS_INFO_TASKS		"tasks"
	SYS_INFO_MEM		"mem"
	SYS_INFO_TIMERS		"timers"
	SYS_INFO_LOCKS		"locks"
	SYS_INFO_FTRACE		"ftrace"
	SYS_INFO_ALL_CPU_BT	"all_bt"
	SYS_INFO_BLOCKED_TASKS	"blocked_tasks"

[nathan@kernel.org: add __maybe_unused to sys_info_avail]
  Link: https://lkml.kernel.org/r/20250708-fix-clang-sys_info_avail-warning-v1-1-60d239eacd64@kernel.org
Link: https://lkml.kernel.org/r/20250703021004.42328-4-feng.tang@linux.alibaba.com
Signed-off-by: Feng Tang <feng.tang@linux.alibaba.com>
Suggested-by: Petr Mladek <pmladek@suse.com>
Cc: John Ogness <john.ogness@linutronix.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: "Paul E . McKenney" <paulmck@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/sysctl/kernel.rst | 18 ++++++
 include/linux/sys_info.h                    |  8 +++
 kernel/panic.c                              |  7 +++
 lib/sys_info.c                              | 90 +++++++++++++++++++++++++++++
 4 files changed, 123 insertions(+)

(limited to 'include')

diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index 0d08b7a2db2d..cccb06d1a6bf 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -899,6 +899,24 @@ So for example to print tasks and memory info on panic, user can::
   echo 3 > /proc/sys/kernel/panic_print
 
 
+panic_sys_info
+==============
+
+A comma separated list of extra information to be dumped on panic,
+for example, "tasks,mem,timers,...".  It is a human readable alternative
+to 'panic_print'. Possible values are:
+
+=============   ===================================================
+tasks           print all tasks info
+mem             print system memory info
+timers          print timers info
+locks           print locks info if CONFIG_LOCKDEP is on
+ftrace          print ftrace buffer
+all_bt          print all CPUs backtrace (if available in the arch)
+blocked_tasks   print only tasks in uninterruptible (blocked) state
+=============   ===================================================
+
+
 panic_on_rcu_stall
 ==================
 
diff --git a/include/linux/sys_info.h b/include/linux/sys_info.h
index 53b7e27dbf2a..89d77dc4f2ed 100644
--- a/include/linux/sys_info.h
+++ b/include/linux/sys_info.h
@@ -2,6 +2,8 @@
 #ifndef _LINUX_SYS_INFO_H
 #define _LINUX_SYS_INFO_H
 
+#include <linux/sysctl.h>
+
 /*
  * SYS_INFO_PANIC_CONSOLE_REPLAY is for panic case only, as it needs special
  * handling which only fits panic case.
@@ -16,5 +18,11 @@
 #define SYS_INFO_BLOCKED_TASKS		0x00000080
 
 void sys_info(unsigned long si_mask);
+unsigned long sys_info_parse_param(char *str);
 
+#ifdef CONFIG_SYSCTL
+int sysctl_sys_info_handler(const struct ctl_table *ro_table, int write,
+					  void *buffer, size_t *lenp,
+					  loff_t *ppos);
+#endif
 #endif	/* _LINUX_SYS_INFO_H */
diff --git a/kernel/panic.c b/kernel/panic.c
index cbb0681177b3..d7aa427dc23c 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -126,6 +126,13 @@ static const struct ctl_table kern_panic_table[] = {
 		.mode           = 0644,
 		.proc_handler   = proc_douintvec,
 	},
+	{
+		.procname	= "panic_sys_info",
+		.data		= &panic_print,
+		.maxlen         = sizeof(panic_print),
+		.mode		= 0644,
+		.proc_handler	= sysctl_sys_info_handler,
+	},
 };
 
 static __init int kernel_panic_sysctls_init(void)
diff --git a/lib/sys_info.c b/lib/sys_info.c
index 53031e5cb98e..5bf503fd7ec1 100644
--- a/lib/sys_info.c
+++ b/lib/sys_info.c
@@ -3,10 +3,100 @@
 #include <linux/console.h>
 #include <linux/kernel.h>
 #include <linux/ftrace.h>
+#include <linux/sysctl.h>
 #include <linux/nmi.h>
 
 #include <linux/sys_info.h>
 
+struct sys_info_name {
+	unsigned long bit;
+	const char *name;
+};
+
+/*
+ * When 'si_names' gets updated,  please make sure the 'sys_info_avail'
+ * below is updated accordingly.
+ */
+static const struct sys_info_name  si_names[] = {
+	{ SYS_INFO_TASKS,		"tasks" },
+	{ SYS_INFO_MEM,			"mem" },
+	{ SYS_INFO_TIMERS,		"timers" },
+	{ SYS_INFO_LOCKS,		"locks" },
+	{ SYS_INFO_FTRACE,		"ftrace" },
+	{ SYS_INFO_ALL_CPU_BT,		"all_bt" },
+	{ SYS_INFO_BLOCKED_TASKS,	"blocked_tasks" },
+};
+
+/* Expecting string like "xxx_sys_info=tasks,mem,timers,locks,ftrace,..." */
+unsigned long sys_info_parse_param(char *str)
+{
+	unsigned long si_bits = 0;
+	char *s, *name;
+	int i;
+
+	s = str;
+	while ((name = strsep(&s, ",")) && *name) {
+		for (i = 0; i < ARRAY_SIZE(si_names); i++) {
+			if (!strcmp(name, si_names[i].name)) {
+				si_bits |= si_names[i].bit;
+				break;
+			}
+		}
+	}
+
+	return si_bits;
+}
+
+#ifdef CONFIG_SYSCTL
+
+static const char sys_info_avail[] __maybe_unused = "tasks,mem,timers,locks,ftrace,all_bt,blocked_tasks";
+
+int sysctl_sys_info_handler(const struct ctl_table *ro_table, int write,
+					  void *buffer, size_t *lenp,
+					  loff_t *ppos)
+{
+	char names[sizeof(sys_info_avail) + 1] = "";
+	struct ctl_table table;
+	unsigned long *si_bits_global;
+
+	si_bits_global = ro_table->data;
+
+	if (write) {
+		unsigned long si_bits;
+		int ret;
+
+		table = *ro_table;
+		table.data = names;
+		table.maxlen = sizeof(names);
+		ret = proc_dostring(&table, write, buffer, lenp, ppos);
+		if (ret)
+			return ret;
+
+		si_bits = sys_info_parse_param(names);
+		/* The access to the global value is not synchronized. */
+		WRITE_ONCE(*si_bits_global, si_bits);
+		return 0;
+	} else {
+		/* read: 'names' starts as "" so a mask with no known bit set is safe */
+		char *delim = "";
+		int i, len = 0;
+
+		for (i = 0; i < ARRAY_SIZE(si_names); i++) {
+			if (*si_bits_global & si_names[i].bit) {
+				len += scnprintf(names + len, sizeof(names) - len,
+					"%s%s", delim, si_names[i].name);
+				delim = ",";
+			}
+		}
+
+		table = *ro_table;
+		table.data = names;
+		table.maxlen = sizeof(names);
+		return proc_dostring(&table, write, buffer, lenp, ppos);
+	}
+}
+#endif
+
 void sys_info(unsigned long si_mask)
 {
 	if (si_mask & SYS_INFO_TASKS)
-- 
cgit v1.2.3


From ae2da51def76020fa16f53cd3446c00cafe41008 Mon Sep 17 00:00:00 2001
From: Lance Yang <lance.yang@linux.dev>
Date: Fri, 27 Jun 2025 15:29:22 +0800
Subject: locking/rwsem: make owner helpers globally available

Patch series "extend hung task blocker tracking to rwsems".

Inspired by mutex blocker tracking[1], and having already extended it to
semaphores, let's now add support for reader-writer semaphores (rwsems).

The approach is simple: when a task enters TASK_UNINTERRUPTIBLE while
waiting for an rwsem, we just call hung_task_set_blocker().  The hung task
detector can then query the rwsem's owner to identify the lock holder.

Tracking works reliably for writers, as there can only be a single writer
holding the lock, and its task struct is stored in the owner field.

The main challenge lies with readers.  The owner field points to only one
of many concurrent readers, so we might lose track of the blocker if that
specific reader unlocks, even while others remain.  This is not a
significant issue, however.  In practice, long-lasting lock contention is
almost always caused by a writer.  Therefore, reliably tracking the writer
is the primary goal of this patch series ;)

With this change, the hung task detector can now show blocker task's info
like below:

[Fri Jun 27 15:21:34 2025] INFO: task cat:28631 blocked for more than 122 seconds.
[Fri Jun 27 15:21:34 2025]       Tainted: G S                  6.16.0-rc3 #8
[Fri Jun 27 15:21:34 2025] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[Fri Jun 27 15:21:34 2025] task:cat             state:D stack:0     pid:28631 tgid:28631 ppid:28501  task_flags:0x400000 flags:0x00004000
[Fri Jun 27 15:21:34 2025] Call Trace:
[Fri Jun 27 15:21:34 2025]  <TASK>
[Fri Jun 27 15:21:34 2025]  __schedule+0x7c7/0x1930
[Fri Jun 27 15:21:34 2025]  ? __pfx___schedule+0x10/0x10
[Fri Jun 27 15:21:34 2025]  ? policy_nodemask+0x215/0x340
[Fri Jun 27 15:21:34 2025]  ? _raw_spin_lock_irq+0x8a/0xe0
[Fri Jun 27 15:21:34 2025]  ? __pfx__raw_spin_lock_irq+0x10/0x10
[Fri Jun 27 15:21:34 2025]  schedule+0x6a/0x180
[Fri Jun 27 15:21:34 2025]  schedule_preempt_disabled+0x15/0x30
[Fri Jun 27 15:21:34 2025]  rwsem_down_read_slowpath+0x55e/0xe10
[Fri Jun 27 15:21:34 2025]  ? __pfx_rwsem_down_read_slowpath+0x10/0x10
[Fri Jun 27 15:21:34 2025]  ? __pfx___might_resched+0x10/0x10
[Fri Jun 27 15:21:34 2025]  down_read+0xc9/0x230
[Fri Jun 27 15:21:34 2025]  ? __pfx_down_read+0x10/0x10
[Fri Jun 27 15:21:34 2025]  ? __debugfs_file_get+0x14d/0x700
[Fri Jun 27 15:21:34 2025]  ? __pfx___debugfs_file_get+0x10/0x10
[Fri Jun 27 15:21:34 2025]  ? handle_pte_fault+0x52a/0x710
[Fri Jun 27 15:21:34 2025]  ? selinux_file_permission+0x3a9/0x590
[Fri Jun 27 15:21:34 2025]  read_dummy_rwsem_read+0x4a/0x90
[Fri Jun 27 15:21:34 2025]  full_proxy_read+0xff/0x1c0
[Fri Jun 27 15:21:34 2025]  ? rw_verify_area+0x6d/0x410
[Fri Jun 27 15:21:34 2025]  vfs_read+0x177/0xa50
[Fri Jun 27 15:21:34 2025]  ? __pfx_vfs_read+0x10/0x10
[Fri Jun 27 15:21:34 2025]  ? fdget_pos+0x1cf/0x4c0
[Fri Jun 27 15:21:34 2025]  ksys_read+0xfc/0x1d0
[Fri Jun 27 15:21:34 2025]  ? __pfx_ksys_read+0x10/0x10
[Fri Jun 27 15:21:34 2025]  do_syscall_64+0x66/0x2d0
[Fri Jun 27 15:21:34 2025]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[Fri Jun 27 15:21:34 2025] RIP: 0033:0x7f3f8faefb40
[Fri Jun 27 15:21:34 2025] RSP: 002b:00007ffdeda5ab98 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[Fri Jun 27 15:21:34 2025] RAX: ffffffffffffffda RBX: 0000000000010000 RCX: 00007f3f8faefb40
[Fri Jun 27 15:21:34 2025] RDX: 0000000000010000 RSI: 00000000010fa000 RDI: 0000000000000003
[Fri Jun 27 15:21:34 2025] RBP: 00000000010fa000 R08: 0000000000000000 R09: 0000000000010fff
[Fri Jun 27 15:21:34 2025] R10: 00007ffdeda59fe0 R11: 0000000000000246 R12: 00000000010fa000
[Fri Jun 27 15:21:34 2025] R13: 0000000000000003 R14: 0000000000000000 R15: 0000000000000fff
[Fri Jun 27 15:21:34 2025]  </TASK>
[Fri Jun 27 15:21:34 2025] INFO: task cat:28631 <reader> blocked on an rw-semaphore likely owned by task cat:28630 <writer>
[Fri Jun 27 15:21:34 2025] task:cat             state:S stack:0     pid:28630 tgid:28630 ppid:28501  task_flags:0x400000 flags:0x00004000
[Fri Jun 27 15:21:34 2025] Call Trace:
[Fri Jun 27 15:21:34 2025]  <TASK>
[Fri Jun 27 15:21:34 2025]  __schedule+0x7c7/0x1930
[Fri Jun 27 15:21:34 2025]  ? __pfx___schedule+0x10/0x10
[Fri Jun 27 15:21:34 2025]  ? __mod_timer+0x304/0xa80
[Fri Jun 27 15:21:34 2025]  schedule+0x6a/0x180
[Fri Jun 27 15:21:34 2025]  schedule_timeout+0xfb/0x230
[Fri Jun 27 15:21:34 2025]  ? __pfx_schedule_timeout+0x10/0x10
[Fri Jun 27 15:21:34 2025]  ? __pfx_process_timeout+0x10/0x10
[Fri Jun 27 15:21:34 2025]  ? down_write+0xc4/0x140
[Fri Jun 27 15:21:34 2025]  msleep_interruptible+0xbe/0x150
[Fri Jun 27 15:21:34 2025]  read_dummy_rwsem_write+0x54/0x90
[Fri Jun 27 15:21:34 2025]  full_proxy_read+0xff/0x1c0
[Fri Jun 27 15:21:34 2025]  ? rw_verify_area+0x6d/0x410
[Fri Jun 27 15:21:34 2025]  vfs_read+0x177/0xa50
[Fri Jun 27 15:21:34 2025]  ? __pfx_vfs_read+0x10/0x10
[Fri Jun 27 15:21:34 2025]  ? fdget_pos+0x1cf/0x4c0
[Fri Jun 27 15:21:34 2025]  ksys_read+0xfc/0x1d0
[Fri Jun 27 15:21:34 2025]  ? __pfx_ksys_read+0x10/0x10
[Fri Jun 27 15:21:34 2025]  do_syscall_64+0x66/0x2d0
[Fri Jun 27 15:21:34 2025]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[Fri Jun 27 15:21:34 2025] RIP: 0033:0x7f8f288efb40
[Fri Jun 27 15:21:34 2025] RSP: 002b:00007ffffb631038 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[Fri Jun 27 15:21:34 2025] RAX: ffffffffffffffda RBX: 0000000000010000 RCX: 00007f8f288efb40
[Fri Jun 27 15:21:34 2025] RDX: 0000000000010000 RSI: 000000002a4b5000 RDI: 0000000000000003
[Fri Jun 27 15:21:34 2025] RBP: 000000002a4b5000 R08: 0000000000000000 R09: 0000000000010fff
[Fri Jun 27 15:21:34 2025] R10: 00007ffffb630460 R11: 0000000000000246 R12: 000000002a4b5000
[Fri Jun 27 15:21:34 2025] R13: 0000000000000003 R14: 0000000000000000 R15: 0000000000000fff
[Fri Jun 27 15:21:34 2025]  </TASK>


This patch (of 3):

In preparation for extending blocker tracking to support rwsems, make the
rwsem_owner() and is_rwsem_reader_owned() helpers globally available for
determining if the blocker is a writer or one of the readers.

Additionally, a stale owner pointer in a reader-owned rwsem can lead to
false positives in blocker tracking when CONFIG_DETECT_HUNG_TASK_BLOCKER
is enabled.  To mitigate this, clear the owner field on the reader unlock
path, similar to what CONFIG_DEBUG_RWSEMS does.  A NULL owner is better
than a stale one for diagnostics.

Link: https://lkml.kernel.org/r/20250627072924.36567-1-lance.yang@linux.dev
Link: https://lkml.kernel.org/r/20250627072924.36567-2-lance.yang@linux.dev
Link: https://lore.kernel.org/all/174046694331.2194069.15472952050240807469.stgit@mhiramat.tok.corp.google.com/ [1]
Signed-off-by: Lance Yang <lance.yang@linux.dev>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Cc: Anna Schumaker <anna.schumaker@oracle.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: John Stultz <jstultz@google.com>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Mingzhe Yang <mingzhe.yang@ly.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tomasz Figa <tfiga@chromium.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Will Deacon <will@kernel.org>
Cc: Yongliang Gao <leonylgao@tencent.com>
Cc: Zi Li <zi.li@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rwsem.h  | 12 ++++++++++++
 kernel/locking/rwsem.c | 14 +++++++-------
 2 files changed, 19 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index c8b543d428b0..544853bed5b9 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -132,6 +132,18 @@ static inline int rwsem_is_contended(struct rw_semaphore *sem)
 	return !list_empty(&sem->wait_list);
 }
 
+#if defined(CONFIG_DEBUG_RWSEMS) || defined(CONFIG_DETECT_HUNG_TASK_BLOCKER)
+/*
+ * Return just the real task structure pointer of the owner
+ */
+extern struct task_struct *rwsem_owner(struct rw_semaphore *sem);
+
+/*
+ * Return true if the rwsem is owned by a reader.
+ */
+extern bool is_rwsem_reader_owned(struct rw_semaphore *sem);
+#endif
+
 #else /* !CONFIG_PREEMPT_RT */
 
 #include <linux/rwbase_rt.h>
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 2ddb827e3bea..a310eb9896de 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -181,11 +181,11 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
 	__rwsem_set_reader_owned(sem, current);
 }
 
-#ifdef CONFIG_DEBUG_RWSEMS
+#if defined(CONFIG_DEBUG_RWSEMS) || defined(CONFIG_DETECT_HUNG_TASK_BLOCKER)
 /*
  * Return just the real task structure pointer of the owner
  */
-static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem)
+struct task_struct *rwsem_owner(struct rw_semaphore *sem)
 {
 	return (struct task_struct *)
 		(atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK);
@@ -194,7 +194,7 @@ static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem)
 /*
  * Return true if the rwsem is owned by a reader.
  */
-static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
+bool is_rwsem_reader_owned(struct rw_semaphore *sem)
 {
 	/*
 	 * Check the count to see if it is write-locked.
@@ -207,10 +207,10 @@ static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
 }
 
 /*
- * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there
- * is a task pointer in owner of a reader-owned rwsem, it will be the
- * real owner or one of the real owners. The only exception is when the
- * unlock is done by up_read_non_owner().
+ * With CONFIG_DEBUG_RWSEMS or CONFIG_DETECT_HUNG_TASK_BLOCKER configured,
+ * it will make sure that the owner field of a reader-owned rwsem either
+ * points to a real reader-owner or gets cleared. The only exception is
+ * when the unlock is done by up_read_non_owner().
  */
 static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
 {
-- 
cgit v1.2.3


From 77da18de55ac6417e48905bec8b3c66f023b15a9 Mon Sep 17 00:00:00 2001
From: Lance Yang <lance.yang@linux.dev>
Date: Fri, 27 Jun 2025 15:29:23 +0800
Subject: hung_task: extend hung task blocker tracking to rwsems

Inspired by mutex blocker tracking[1], and having already extended it to
semaphores, let's now add support for reader-writer semaphores (rwsems).

The approach is simple: when a task enters TASK_UNINTERRUPTIBLE while
waiting for an rwsem, we just call hung_task_set_blocker().  The hung task
detector can then query the rwsem's owner to identify the lock holder.

Tracking works reliably for writers, as there can only be a single writer
holding the lock, and its task struct is stored in the owner field.

The main challenge lies with readers.  The owner field points to only one
of many concurrent readers, so we might lose track of the blocker if that
specific reader unlocks, even while others remain.  This is not a
significant issue, however.  In practice, long-lasting lock contention is
almost always caused by a writer.  Therefore, reliably tracking the writer
is the primary goal of this patch series ;)

With this change, the hung task detector can now show the blocker task's
info, as shown below:

[Fri Jun 27 15:21:34 2025] INFO: task cat:28631 blocked for more than 122 seconds.
[Fri Jun 27 15:21:34 2025]       Tainted: G S                  6.16.0-rc3 #8
[Fri Jun 27 15:21:34 2025] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[Fri Jun 27 15:21:34 2025] task:cat             state:D stack:0     pid:28631 tgid:28631 ppid:28501  task_flags:0x400000 flags:0x00004000
[Fri Jun 27 15:21:34 2025] Call Trace:
[Fri Jun 27 15:21:34 2025]  <TASK>
[Fri Jun 27 15:21:34 2025]  __schedule+0x7c7/0x1930
[Fri Jun 27 15:21:34 2025]  ? __pfx___schedule+0x10/0x10
[Fri Jun 27 15:21:34 2025]  ? policy_nodemask+0x215/0x340
[Fri Jun 27 15:21:34 2025]  ? _raw_spin_lock_irq+0x8a/0xe0
[Fri Jun 27 15:21:34 2025]  ? __pfx__raw_spin_lock_irq+0x10/0x10
[Fri Jun 27 15:21:34 2025]  schedule+0x6a/0x180
[Fri Jun 27 15:21:34 2025]  schedule_preempt_disabled+0x15/0x30
[Fri Jun 27 15:21:34 2025]  rwsem_down_read_slowpath+0x55e/0xe10
[Fri Jun 27 15:21:34 2025]  ? __pfx_rwsem_down_read_slowpath+0x10/0x10
[Fri Jun 27 15:21:34 2025]  ? __pfx___might_resched+0x10/0x10
[Fri Jun 27 15:21:34 2025]  down_read+0xc9/0x230
[Fri Jun 27 15:21:34 2025]  ? __pfx_down_read+0x10/0x10
[Fri Jun 27 15:21:34 2025]  ? __debugfs_file_get+0x14d/0x700
[Fri Jun 27 15:21:34 2025]  ? __pfx___debugfs_file_get+0x10/0x10
[Fri Jun 27 15:21:34 2025]  ? handle_pte_fault+0x52a/0x710
[Fri Jun 27 15:21:34 2025]  ? selinux_file_permission+0x3a9/0x590
[Fri Jun 27 15:21:34 2025]  read_dummy_rwsem_read+0x4a/0x90
[Fri Jun 27 15:21:34 2025]  full_proxy_read+0xff/0x1c0
[Fri Jun 27 15:21:34 2025]  ? rw_verify_area+0x6d/0x410
[Fri Jun 27 15:21:34 2025]  vfs_read+0x177/0xa50
[Fri Jun 27 15:21:34 2025]  ? __pfx_vfs_read+0x10/0x10
[Fri Jun 27 15:21:34 2025]  ? fdget_pos+0x1cf/0x4c0
[Fri Jun 27 15:21:34 2025]  ksys_read+0xfc/0x1d0
[Fri Jun 27 15:21:34 2025]  ? __pfx_ksys_read+0x10/0x10
[Fri Jun 27 15:21:34 2025]  do_syscall_64+0x66/0x2d0
[Fri Jun 27 15:21:34 2025]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[Fri Jun 27 15:21:34 2025] RIP: 0033:0x7f3f8faefb40
[Fri Jun 27 15:21:34 2025] RSP: 002b:00007ffdeda5ab98 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[Fri Jun 27 15:21:34 2025] RAX: ffffffffffffffda RBX: 0000000000010000 RCX: 00007f3f8faefb40
[Fri Jun 27 15:21:34 2025] RDX: 0000000000010000 RSI: 00000000010fa000 RDI: 0000000000000003
[Fri Jun 27 15:21:34 2025] RBP: 00000000010fa000 R08: 0000000000000000 R09: 0000000000010fff
[Fri Jun 27 15:21:34 2025] R10: 00007ffdeda59fe0 R11: 0000000000000246 R12: 00000000010fa000
[Fri Jun 27 15:21:34 2025] R13: 0000000000000003 R14: 0000000000000000 R15: 0000000000000fff
[Fri Jun 27 15:21:34 2025]  </TASK>
[Fri Jun 27 15:21:34 2025] INFO: task cat:28631 <reader> blocked on an rw-semaphore likely owned by task cat:28630 <writer>
[Fri Jun 27 15:21:34 2025] task:cat             state:S stack:0     pid:28630 tgid:28630 ppid:28501  task_flags:0x400000 flags:0x00004000
[Fri Jun 27 15:21:34 2025] Call Trace:
[Fri Jun 27 15:21:34 2025]  <TASK>
[Fri Jun 27 15:21:34 2025]  __schedule+0x7c7/0x1930
[Fri Jun 27 15:21:34 2025]  ? __pfx___schedule+0x10/0x10
[Fri Jun 27 15:21:34 2025]  ? __mod_timer+0x304/0xa80
[Fri Jun 27 15:21:34 2025]  schedule+0x6a/0x180
[Fri Jun 27 15:21:34 2025]  schedule_timeout+0xfb/0x230
[Fri Jun 27 15:21:34 2025]  ? __pfx_schedule_timeout+0x10/0x10
[Fri Jun 27 15:21:34 2025]  ? __pfx_process_timeout+0x10/0x10
[Fri Jun 27 15:21:34 2025]  ? down_write+0xc4/0x140
[Fri Jun 27 15:21:34 2025]  msleep_interruptible+0xbe/0x150
[Fri Jun 27 15:21:34 2025]  read_dummy_rwsem_write+0x54/0x90
[Fri Jun 27 15:21:34 2025]  full_proxy_read+0xff/0x1c0
[Fri Jun 27 15:21:34 2025]  ? rw_verify_area+0x6d/0x410
[Fri Jun 27 15:21:34 2025]  vfs_read+0x177/0xa50
[Fri Jun 27 15:21:34 2025]  ? __pfx_vfs_read+0x10/0x10
[Fri Jun 27 15:21:34 2025]  ? fdget_pos+0x1cf/0x4c0
[Fri Jun 27 15:21:34 2025]  ksys_read+0xfc/0x1d0
[Fri Jun 27 15:21:34 2025]  ? __pfx_ksys_read+0x10/0x10
[Fri Jun 27 15:21:34 2025]  do_syscall_64+0x66/0x2d0
[Fri Jun 27 15:21:34 2025]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[Fri Jun 27 15:21:34 2025] RIP: 0033:0x7f8f288efb40
[Fri Jun 27 15:21:34 2025] RSP: 002b:00007ffffb631038 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[Fri Jun 27 15:21:34 2025] RAX: ffffffffffffffda RBX: 0000000000010000 RCX: 00007f8f288efb40
[Fri Jun 27 15:21:34 2025] RDX: 0000000000010000 RSI: 000000002a4b5000 RDI: 0000000000000003
[Fri Jun 27 15:21:34 2025] RBP: 000000002a4b5000 R08: 0000000000000000 R09: 0000000000010fff
[Fri Jun 27 15:21:34 2025] R10: 00007ffffb630460 R11: 0000000000000246 R12: 000000002a4b5000
[Fri Jun 27 15:21:34 2025] R13: 0000000000000003 R14: 0000000000000000 R15: 0000000000000fff
[Fri Jun 27 15:21:34 2025]  </TASK>

[1] https://lore.kernel.org/all/174046694331.2194069.15472952050240807469.stgit@mhiramat.tok.corp.google.com/

Link: https://lkml.kernel.org/r/20250627072924.36567-3-lance.yang@linux.dev
Signed-off-by: Lance Yang <lance.yang@linux.dev>
Suggested-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Cc: Anna Schumaker <anna.schumaker@oracle.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: John Stultz <jstultz@google.com>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Mingzhe Yang <mingzhe.yang@ly.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tomasz Figa <tfiga@chromium.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Will Deacon <will@kernel.org>
Cc: Yongliang Gao <leonylgao@tencent.com>
Cc: Zi Li <zi.li@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hung_task.h | 18 +++++++++---------
 kernel/hung_task.c        | 29 +++++++++++++++++++++++++----
 kernel/locking/rwsem.c    | 17 ++++++++++++++++-
 3 files changed, 50 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/hung_task.h b/include/linux/hung_task.h
index 1bc2b3244613..34e615c76ca5 100644
--- a/include/linux/hung_task.h
+++ b/include/linux/hung_task.h
@@ -21,17 +21,17 @@
  * type.
  *
  * Type encoding:
- * 00 - Blocked on mutex        (BLOCKER_TYPE_MUTEX)
- * 01 - Blocked on semaphore    (BLOCKER_TYPE_SEM)
- * 10 - Blocked on rt-mutex     (BLOCKER_TYPE_RTMUTEX)
- * 11 - Blocked on rw-semaphore (BLOCKER_TYPE_RWSEM)
+ * 00 - Blocked on mutex			(BLOCKER_TYPE_MUTEX)
+ * 01 - Blocked on semaphore			(BLOCKER_TYPE_SEM)
+ * 10 - Blocked on rw-semaphore as READER	(BLOCKER_TYPE_RWSEM_READER)
+ * 11 - Blocked on rw-semaphore as WRITER	(BLOCKER_TYPE_RWSEM_WRITER)
  */
-#define BLOCKER_TYPE_MUTEX      0x00UL
-#define BLOCKER_TYPE_SEM        0x01UL
-#define BLOCKER_TYPE_RTMUTEX    0x02UL
-#define BLOCKER_TYPE_RWSEM      0x03UL
+#define BLOCKER_TYPE_MUTEX		0x00UL
+#define BLOCKER_TYPE_SEM		0x01UL
+#define BLOCKER_TYPE_RWSEM_READER	0x02UL
+#define BLOCKER_TYPE_RWSEM_WRITER	0x03UL
 
-#define BLOCKER_TYPE_MASK       0x03UL
+#define BLOCKER_TYPE_MASK		0x03UL
 
 #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
 static inline void hung_task_set_blocker(void *lock, unsigned long type)
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index d2432df2b905..8708a1205f82 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -23,6 +23,7 @@
 #include <linux/sched/debug.h>
 #include <linux/sched/sysctl.h>
 #include <linux/hung_task.h>
+#include <linux/rwsem.h>
 
 #include <trace/events/sched.h>
 
@@ -100,6 +101,7 @@ static void debug_show_blocker(struct task_struct *task)
 {
 	struct task_struct *g, *t;
 	unsigned long owner, blocker, blocker_type;
+	const char *rwsem_blocked_by, *rwsem_blocked_as;
 
 	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "No rcu lock held");
 
@@ -111,12 +113,20 @@ static void debug_show_blocker(struct task_struct *task)
 
 	switch (blocker_type) {
 	case BLOCKER_TYPE_MUTEX:
-		owner = mutex_get_owner(
-			(struct mutex *)hung_task_blocker_to_lock(blocker));
+		owner = mutex_get_owner(hung_task_blocker_to_lock(blocker));
 		break;
 	case BLOCKER_TYPE_SEM:
-		owner = sem_last_holder(
-			(struct semaphore *)hung_task_blocker_to_lock(blocker));
+		owner = sem_last_holder(hung_task_blocker_to_lock(blocker));
+		break;
+	case BLOCKER_TYPE_RWSEM_READER:
+	case BLOCKER_TYPE_RWSEM_WRITER:
+		owner = (unsigned long)rwsem_owner(
+					hung_task_blocker_to_lock(blocker));
+		rwsem_blocked_as = (blocker_type == BLOCKER_TYPE_RWSEM_READER) ?
+					"reader" : "writer";
+		rwsem_blocked_by = is_rwsem_reader_owned(
+					hung_task_blocker_to_lock(blocker)) ?
+					"reader" : "writer";
 		break;
 	default:
 		WARN_ON_ONCE(1);
@@ -134,6 +144,11 @@ static void debug_show_blocker(struct task_struct *task)
 			pr_err("INFO: task %s:%d is blocked on a semaphore, but the last holder is not found.\n",
 			       task->comm, task->pid);
 			break;
+		case BLOCKER_TYPE_RWSEM_READER:
+		case BLOCKER_TYPE_RWSEM_WRITER:
+			pr_err("INFO: task %s:%d is blocked on an rw-semaphore, but the owner is not found.\n",
+			       task->comm, task->pid);
+			break;
 		}
 		return;
 	}
@@ -152,6 +167,12 @@ static void debug_show_blocker(struct task_struct *task)
 			pr_err("INFO: task %s:%d blocked on a semaphore likely last held by task %s:%d\n",
 			       task->comm, task->pid, t->comm, t->pid);
 			break;
+		case BLOCKER_TYPE_RWSEM_READER:
+		case BLOCKER_TYPE_RWSEM_WRITER:
+			pr_err("INFO: task %s:%d <%s> blocked on an rw-semaphore likely owned by task %s:%d <%s>\n",
+			       task->comm, task->pid, rwsem_blocked_as, t->comm,
+			       t->pid, rwsem_blocked_by);
+			break;
 		}
 		sched_show_task(t);
 		return;
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index a310eb9896de..92c6332da401 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -27,6 +27,7 @@
 #include <linux/export.h>
 #include <linux/rwsem.h>
 #include <linux/atomic.h>
+#include <linux/hung_task.h>
 #include <trace/events/lock.h>
 
 #ifndef CONFIG_PREEMPT_RT
@@ -1065,10 +1066,13 @@ queue:
 		wake_up_q(&wake_q);
 
 	trace_contention_begin(sem, LCB_F_READ);
+	set_current_state(state);
+
+	if (state == TASK_UNINTERRUPTIBLE)
+		hung_task_set_blocker(sem, BLOCKER_TYPE_RWSEM_READER);
 
 	/* wait to be given the lock */
 	for (;;) {
-		set_current_state(state);
 		if (!smp_load_acquire(&waiter.task)) {
 			/* Matches rwsem_mark_wake()'s smp_store_release(). */
 			break;
@@ -1083,8 +1087,12 @@ queue:
 		}
 		schedule_preempt_disabled();
 		lockevent_inc(rwsem_sleep_reader);
+		set_current_state(state);
 	}
 
+	if (state == TASK_UNINTERRUPTIBLE)
+		hung_task_clear_blocker();
+
 	__set_current_state(TASK_RUNNING);
 	lockevent_inc(rwsem_rlock);
 	trace_contention_end(sem, 0);
@@ -1146,6 +1154,9 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 	set_current_state(state);
 	trace_contention_begin(sem, LCB_F_WRITE);
 
+	if (state == TASK_UNINTERRUPTIBLE)
+		hung_task_set_blocker(sem, BLOCKER_TYPE_RWSEM_WRITER);
+
 	for (;;) {
 		if (rwsem_try_write_lock(sem, &waiter)) {
 			/* rwsem_try_write_lock() implies ACQUIRE on success */
@@ -1179,6 +1190,10 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 trylock_again:
 		raw_spin_lock_irq(&sem->wait_lock);
 	}
+
+	if (state == TASK_UNINTERRUPTIBLE)
+		hung_task_clear_blocker();
+
 	__set_current_state(TASK_RUNNING);
 	raw_spin_unlock_irq(&sem->wait_lock);
 	lockevent_inc(rwsem_wlock);
-- 
cgit v1.2.3


From b3d5fd6f82dde8c906dc2a587003a44252ae5eae Mon Sep 17 00:00:00 2001
From: Kuan-Wei Chiu <visitorckw@gmail.com>
Date: Fri, 6 Jun 2025 21:47:56 +0800
Subject: lib/math/gcd: use static key to select implementation at runtime

Patch series "Optimize GCD performance on RISC-V by selecting
implementation at runtime", v3.

The current implementation of gcd() selects between the binary GCD and the
odd-even GCD algorithm at compile time, depending on whether
CONFIG_CPU_NO_EFFICIENT_FFS is set.  On platforms like RISC-V, however,
this compile-time decision can be misleading: even when the compiler emits
ctz instructions based on the assumption that they are efficient (as is
the case when CONFIG_RISCV_ISA_ZBB is enabled), the actual hardware may
lack support for the Zbb extension.  In such cases, ffs() falls back to a
software implementation at runtime, making the binary GCD algorithm
significantly slower than the odd-even variant.

To address this, we introduce a static key to allow runtime selection
between the binary and odd-even GCD implementations.  On RISC-V, the
kernel now checks for Zbb support during boot.  If Zbb is unavailable, the
static key is disabled so that gcd() consistently uses the more efficient
odd-even algorithm in that scenario.  Additionally, to further reduce code
size, we select CONFIG_CPU_NO_EFFICIENT_FFS automatically when
CONFIG_RISCV_ISA_ZBB is not enabled, avoiding compilation of the unused
binary GCD implementation entirely on systems where it would never be
executed.

This series ensures that the most efficient GCD algorithm is used in
practice and avoids compiling unnecessary code based on hardware
capabilities and kernel configuration.


This patch (of 3):

On platforms like RISC-V, the compiler may generate hardware FFS
instructions even if the underlying CPU does not actually support them.
Currently, the GCD implementation is chosen at compile time based on
CONFIG_CPU_NO_EFFICIENT_FFS, which can result in suboptimal behavior on
such systems.

Introduce a static key, efficient_ffs_key, to enable runtime selection
between the binary GCD (using ffs) and the odd-even GCD implementation.
This allows the kernel to default to the faster binary GCD when FFS is
efficient, while retaining the ability to fall back when needed.

Link: https://lkml.kernel.org/r/20250606134758.1308400-1-visitorckw@gmail.com
Link: https://lkml.kernel.org/r/20250606134758.1308400-2-visitorckw@gmail.com
Co-developed-by: Yu-Chun Lin <eleanor15x@gmail.com>
Signed-off-by: Yu-Chun Lin <eleanor15x@gmail.com>
Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Ching-Chun (Jim) Huang <jserv@ccns.ncku.edu.tw>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/gcd.h |  3 +++
 lib/math/gcd.c      | 27 +++++++++++++++------------
 2 files changed, 18 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/gcd.h b/include/linux/gcd.h
index cb572677fd7f..616e81a7f7e3 100644
--- a/include/linux/gcd.h
+++ b/include/linux/gcd.h
@@ -3,6 +3,9 @@
 #define _GCD_H
 
 #include <linux/compiler.h>
+#include <linux/jump_label.h>
+
+DECLARE_STATIC_KEY_TRUE(efficient_ffs_key);
 
 unsigned long gcd(unsigned long a, unsigned long b) __attribute_const__;
 
diff --git a/lib/math/gcd.c b/lib/math/gcd.c
index e3b042214d1b..62efca6787ae 100644
--- a/lib/math/gcd.c
+++ b/lib/math/gcd.c
@@ -11,22 +11,16 @@
  * has decent hardware division.
  */
 
+DEFINE_STATIC_KEY_TRUE(efficient_ffs_key);
+
 #if !defined(CONFIG_CPU_NO_EFFICIENT_FFS)
 
 /* If __ffs is available, the even/odd algorithm benchmarks slower. */
 
-/**
- * gcd - calculate and return the greatest common divisor of 2 unsigned longs
- * @a: first value
- * @b: second value
- */
-unsigned long gcd(unsigned long a, unsigned long b)
+static unsigned long binary_gcd(unsigned long a, unsigned long b)
 {
 	unsigned long r = a | b;
 
-	if (!a || !b)
-		return r;
-
 	b >>= __ffs(b);
 	if (b == 1)
 		return r & -r;
@@ -44,9 +38,15 @@ unsigned long gcd(unsigned long a, unsigned long b)
 	}
 }
 
-#else
+#endif
 
 /* If normalization is done by loops, the even/odd algorithm is a win. */
+
+/**
+ * gcd - calculate and return the greatest common divisor of 2 unsigned longs
+ * @a: first value
+ * @b: second value
+ */
 unsigned long gcd(unsigned long a, unsigned long b)
 {
 	unsigned long r = a | b;
@@ -54,6 +54,11 @@ unsigned long gcd(unsigned long a, unsigned long b)
 	if (!a || !b)
 		return r;
 
+#if !defined(CONFIG_CPU_NO_EFFICIENT_FFS)
+	if (static_branch_likely(&efficient_ffs_key))
+		return binary_gcd(a, b);
+#endif
+
 	/* Isolate lsbit of r */
 	r &= -r;
 
@@ -80,6 +85,4 @@ unsigned long gcd(unsigned long a, unsigned long b)
 	}
 }
 
-#endif
-
 EXPORT_SYMBOL_GPL(gcd);
-- 
cgit v1.2.3


From 438794e93f6271af93f0d16a1851725115b5fd51 Mon Sep 17 00:00:00 2001
From: Jianbo Liu <jianbol@nvidia.com>
Date: Thu, 17 Jul 2025 09:48:13 +0300
Subject: net/mlx5: Add IFC bits to support RSS for IPSec offload

Add the capability bits and the ipsec_next_header and inner/outer
l4_type_ext match fields needed to support RSS for decrypted packets.

These fields are specifically for firmware steering. HWS validation
logic is updated to correctly handle the changes, ensuring the
unsupported fields are not set.

In addition, reserved_at_c4 is renamed to reserved_at_d4 so that its name
reflects its actual offset within the structure.

Signed-off-by: Jianbo Liu <jianbol@nvidia.com>
Reviewed-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Dragos Tatulea <dtatulea@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1752734895-257735-2-git-send-email-tariqt@nvidia.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 .../mellanox/mlx5/core/steering/hws/definer.c      | 13 ++++++-----
 include/linux/mlx5/mlx5_ifc.h                      | 25 ++++++++++++++++------
 2 files changed, 26 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c
index d45e1145d197..c6436c3a7a83 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c
@@ -727,8 +727,9 @@ hws_definer_conv_outer(struct mlx5hws_definer_conv_data *cd,
 	u32 *s_ipv6, *d_ipv6;
 
 	if (HWS_IS_FLD_SET_SZ(match_param, outer_headers.l4_type, 0x2) ||
-	    HWS_IS_FLD_SET_SZ(match_param, outer_headers.reserved_at_c2, 0xe) ||
-	    HWS_IS_FLD_SET_SZ(match_param, outer_headers.reserved_at_c4, 0x4)) {
+	    HWS_IS_FLD_SET_SZ(match_param, outer_headers.l4_type_ext, 0x4) ||
+	    HWS_IS_FLD_SET_SZ(match_param, outer_headers.reserved_at_c6, 0xa) ||
+	    HWS_IS_FLD_SET_SZ(match_param, outer_headers.reserved_at_d4, 0x4)) {
 		mlx5hws_err(cd->ctx, "Unsupported outer parameters set\n");
 		return -EINVAL;
 	}
@@ -903,8 +904,9 @@ hws_definer_conv_inner(struct mlx5hws_definer_conv_data *cd,
 	u32 *s_ipv6, *d_ipv6;
 
 	if (HWS_IS_FLD_SET_SZ(match_param, inner_headers.l4_type, 0x2) ||
-	    HWS_IS_FLD_SET_SZ(match_param, inner_headers.reserved_at_c2, 0xe) ||
-	    HWS_IS_FLD_SET_SZ(match_param, inner_headers.reserved_at_c4, 0x4)) {
+	    HWS_IS_FLD_SET_SZ(match_param, inner_headers.l4_type_ext, 0x4) ||
+	    HWS_IS_FLD_SET_SZ(match_param, inner_headers.reserved_at_c6, 0xa) ||
+	    HWS_IS_FLD_SET_SZ(match_param, inner_headers.reserved_at_d4, 0x4)) {
 		mlx5hws_err(cd->ctx, "Unsupported inner parameters set\n");
 		return -EINVAL;
 	}
@@ -1279,7 +1281,8 @@ hws_definer_conv_misc2(struct mlx5hws_definer_conv_data *cd,
 	struct mlx5hws_definer_fc *curr_fc;
 
 	if (HWS_IS_FLD_SET_SZ(match_param, misc_parameters_2.reserved_at_1a0, 0x8) ||
-	    HWS_IS_FLD_SET_SZ(match_param, misc_parameters_2.reserved_at_1b8, 0x8) ||
+	    HWS_IS_FLD_SET_SZ(match_param,
+			      misc_parameters_2.ipsec_next_header, 0x8) ||
 	    HWS_IS_FLD_SET_SZ(match_param, misc_parameters_2.reserved_at_1c0, 0x40) ||
 	    HWS_IS_FLD_SET(match_param, misc_parameters_2.macsec_syndrome) ||
 	    HWS_IS_FLD_SET(match_param, misc_parameters_2.ipsec_syndrome)) {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 639dd0b56655..c9a7773ac8ec 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -420,7 +420,8 @@ struct mlx5_ifc_flow_table_fields_supported_bits {
 
 /* Table 2170 - Flow Table Fields Supported 2 Format */
 struct mlx5_ifc_flow_table_fields_supported_2_bits {
-	u8         reserved_at_0[0x2];
+	u8         inner_l4_type_ext[0x1];
+	u8         outer_l4_type_ext[0x1];
 	u8         inner_l4_type[0x1];
 	u8         outer_l4_type[0x1];
 	u8         reserved_at_4[0xa];
@@ -429,7 +430,11 @@ struct mlx5_ifc_flow_table_fields_supported_2_bits {
 	u8         tunnel_header_0_1[0x1];
 	u8         reserved_at_11[0xf];
 
-	u8         reserved_at_20[0x60];
+	u8         reserved_at_20[0xf];
+	u8         ipsec_next_header[0x1];
+	u8         reserved_at_30[0x10];
+
+	u8         reserved_at_40[0x40];
 };
 
 struct mlx5_ifc_flow_table_prop_layout_bits {
@@ -552,6 +557,13 @@ enum {
 	MLX5_PACKET_L4_TYPE_UDP,
 };
 
+enum {
+	MLX5_PACKET_L4_TYPE_EXT_NONE,
+	MLX5_PACKET_L4_TYPE_EXT_TCP,
+	MLX5_PACKET_L4_TYPE_EXT_UDP,
+	MLX5_PACKET_L4_TYPE_EXT_ICMP,
+};
+
 struct mlx5_ifc_fte_match_set_lyr_2_4_bits {
 	u8         smac_47_16[0x20];
 
@@ -578,10 +590,10 @@ struct mlx5_ifc_fte_match_set_lyr_2_4_bits {
 	u8         tcp_dport[0x10];
 
 	u8         l4_type[0x2];
-	u8         reserved_at_c2[0xe];
+	u8         l4_type_ext[0x4];
+	u8         reserved_at_c6[0xa];
 	u8         ipv4_ihl[0x4];
-	u8         reserved_at_c4[0x4];
-
+	u8         reserved_at_d4[0x4];
 	u8         ttl_hoplimit[0x8];
 
 	u8         udp_sport[0x10];
@@ -689,10 +701,9 @@ struct mlx5_ifc_fte_match_set_misc2_bits {
 	u8         metadata_reg_a[0x20];
 
 	u8         reserved_at_1a0[0x8];
-
 	u8         macsec_syndrome[0x8];
 	u8         ipsec_syndrome[0x8];
-	u8         reserved_at_1b8[0x8];
+	u8         ipsec_next_header[0x8];
 
 	u8         reserved_at_1c0[0x40];
 };
-- 
cgit v1.2.3


From 6f09ee0b583cad4f2b6a82842c26235bee3d5c2e Mon Sep 17 00:00:00 2001
From: Oren Sidi <osidi@nvidia.com>
Date: Thu, 17 Jul 2025 09:48:14 +0300
Subject: net/mlx5: Add IFC bits and enums for buf_ownership

Extend the structure layouts and add defines for buf_ownership.
buf_ownership indicates whether the buffer is managed by SW or FW.

Signed-off-by: Oren Sidi <osidi@nvidia.com>
Reviewed-by: Alex Lazar <alazar@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1752734895-257735-3-git-send-email-tariqt@nvidia.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/mlx5_ifc.h | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index c9a7773ac8ec..e1220aa1e7dc 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -10474,8 +10474,16 @@ struct mlx5_ifc_pifr_reg_bits {
 	u8         port_filter_update_en[8][0x20];
 };
 
+enum {
+	MLX5_BUF_OWNERSHIP_UNKNOWN	= 0x0,
+	MLX5_BUF_OWNERSHIP_FW_OWNED	= 0x1,
+	MLX5_BUF_OWNERSHIP_SW_OWNED	= 0x2,
+};
+
 struct mlx5_ifc_pfcc_reg_bits {
-	u8         reserved_at_0[0x8];
+	u8         reserved_at_0[0x4];
+	u8	   buf_ownership[0x2];
+	u8	   reserved_at_6[0x2];
 	u8         local_port[0x8];
 	u8         reserved_at_10[0xb];
 	u8         ppan_mask_n[0x1];
@@ -10611,7 +10619,9 @@ struct mlx5_ifc_pcam_enhanced_features_bits {
 	u8         fec_200G_per_lane_in_pplm[0x1];
 	u8         reserved_at_1e[0x2a];
 	u8         fec_100G_per_lane_in_pplm[0x1];
-	u8         reserved_at_49[0x1f];
+	u8         reserved_at_49[0xa];
+	u8	   buffer_ownership[0x1];
+	u8	   resereved_at_54[0x14];
 	u8         fec_50G_per_lane_in_pplm[0x1];
 	u8         reserved_at_69[0x4];
 	u8         rx_icrc_encapsulated_counter[0x1];
-- 
cgit v1.2.3


From 9a0048e0ae14cb7babfd459ec920234e8a2ab86e Mon Sep 17 00:00:00 2001
From: Oren Sidi <osidi@nvidia.com>
Date: Thu, 17 Jul 2025 09:48:15 +0300
Subject: net/mlx5: Expose cable_length field in PFCC register

Introduce new "cable_length" field in PFCC register and related fields
to enhance rx buffer configuration management:
1. cable_length: shifts cable length handling to the firmware by storing
   a user-supplied length in PFCC.cable_length.
2. lane_rate_oper: when PFCC.cable_length is not supported, helps
   compute a default cable length.

Signed-off-by: Oren Sidi <osidi@nvidia.com>
Reviewed-by: Alex Lazar <alazar@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://patch.msgid.link/1752734895-257735-4-git-send-email-tariqt@nvidia.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/mlx5_ifc.h | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index e1220aa1e7dc..ed4130e49c27 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -9994,6 +9994,10 @@ struct mlx5_ifc_pude_reg_bits {
 	u8         reserved_at_20[0x60];
 };
 
+enum {
+	MLX5_PTYS_CONNECTOR_TYPE_PORT_DA = 0x7,
+};
+
 struct mlx5_ifc_ptys_reg_bits {
 	u8         reserved_at_0[0x1];
 	u8         an_disable_admin[0x1];
@@ -10030,7 +10034,8 @@ struct mlx5_ifc_ptys_reg_bits {
 	u8         ib_link_width_oper[0x10];
 	u8         ib_proto_oper[0x10];
 
-	u8         reserved_at_160[0x1c];
+	u8         reserved_at_160[0x8];
+	u8         lane_rate_oper[0x14];
 	u8         connector_type[0x4];
 
 	u8         eth_proto_lp_advertise[0x20];
@@ -10485,7 +10490,8 @@ struct mlx5_ifc_pfcc_reg_bits {
 	u8	   buf_ownership[0x2];
 	u8	   reserved_at_6[0x2];
 	u8         local_port[0x8];
-	u8         reserved_at_10[0xb];
+	u8         reserved_at_10[0xa];
+	u8	   cable_length_mask[0x1];
 	u8         ppan_mask_n[0x1];
 	u8         minor_stall_mask[0x1];
 	u8         critical_stall_mask[0x1];
@@ -10514,7 +10520,10 @@ struct mlx5_ifc_pfcc_reg_bits {
 	u8         device_stall_minor_watermark[0x10];
 	u8         device_stall_critical_watermark[0x10];
 
-	u8         reserved_at_a0[0x60];
+	u8	   reserved_at_a0[0x18];
+	u8	   cable_length[0x8];
+
+	u8         reserved_at_c0[0x40];
 };
 
 struct mlx5_ifc_pelc_reg_bits {
@@ -10615,7 +10624,9 @@ struct mlx5_ifc_mtutc_reg_bits {
 struct mlx5_ifc_pcam_enhanced_features_bits {
 	u8         reserved_at_0[0x10];
 	u8         ppcnt_recovery_counters[0x1];
-	u8         reserved_at_11[0xc];
+	u8         reserved_at_11[0x7];
+	u8	   cable_length[0x1];
+	u8	   reserved_at_19[0x4];
 	u8         fec_200G_per_lane_in_pplm[0x1];
 	u8         reserved_at_1e[0x2a];
 	u8         fec_100G_per_lane_in_pplm[0x1];
-- 
cgit v1.2.3


From 7114b74d99a3cd588da4ecb6011858c06f8408a1 Mon Sep 17 00:00:00 2001
From: Florin Leotescu <florin.leotescu@nxp.com>
Date: Tue, 3 Jun 2025 14:31:22 +0300
Subject: hwmon: (emc2305) Add support for PWM frequency, polarity and output

Add three new attributes to the driver data structures to support
configuration of PWM frequency, PWM polarity and PWM output config.

Signed-off-by: Florin Leotescu <florin.leotescu@nxp.com>
Link: https://lore.kernel.org/r/20250603113125.3175103-2-florin.leotescu@oss.nxp.com
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 drivers/hwmon/emc2305.c               | 6 ++++++
 include/linux/platform_data/emc2305.h | 6 ++++++
 2 files changed, 12 insertions(+)

(limited to 'include')

diff --git a/drivers/hwmon/emc2305.c b/drivers/hwmon/emc2305.c
index 234c54956a4b..8fc4fcf8a063 100644
--- a/drivers/hwmon/emc2305.c
+++ b/drivers/hwmon/emc2305.c
@@ -89,8 +89,11 @@ struct emc2305_cdev_data {
  * @hwmon_dev: hwmon device
  * @max_state: maximum cooling state of the cooling device
  * @pwm_num: number of PWM channels
+ * @pwm_output_mask: PWM output mask
+ * @pwm_polarity_mask: PWM polarity mask
  * @pwm_separate: separate PWM settings for every channel
  * @pwm_min: array of minimum PWM per channel
+ * @pwm_freq: array of PWM frequency per channel
  * @cdev_data: array of cooling devices data
  */
 struct emc2305_data {
@@ -98,8 +101,11 @@ struct emc2305_data {
 	struct device *hwmon_dev;
 	u8 max_state;
 	u8 pwm_num;
+	u8 pwm_output_mask;
+	u8 pwm_polarity_mask;
 	bool pwm_separate;
 	u8 pwm_min[EMC2305_PWM_MAX];
+	u16 pwm_freq[EMC2305_PWM_MAX];
 	struct emc2305_cdev_data cdev_data[EMC2305_PWM_MAX];
 };
 
diff --git a/include/linux/platform_data/emc2305.h b/include/linux/platform_data/emc2305.h
index 54d672dd6f7d..76043a97f975 100644
--- a/include/linux/platform_data/emc2305.h
+++ b/include/linux/platform_data/emc2305.h
@@ -9,14 +9,20 @@
  * struct emc2305_platform_data - EMC2305 driver platform data
  * @max_state: maximum cooling state of the cooling device;
  * @pwm_num: number of active channels;
+ * @pwm_output_mask: PWM output mask
+ * @pwm_polarity_mask: PWM polarity mask
  * @pwm_separate: separate PWM settings for every channel;
  * @pwm_min: array of minimum PWM per channel;
+ * @pwm_freq: array of PWM frequency per channel
  */
 struct emc2305_platform_data {
 	u8 max_state;
 	u8 pwm_num;
+	u8 pwm_output_mask;
+	u8 pwm_polarity_mask;
 	bool pwm_separate;
 	u8 pwm_min[EMC2305_PWM_MAX];
+	u16 pwm_freq[EMC2305_PWM_MAX];
 };
 
 #endif
-- 
cgit v1.2.3


From 34b1cb4ec286603127aa8c4191ea527eb8dd3567 Mon Sep 17 00:00:00 2001
From: Venkata Prasad Potturu <venkataprasad.potturu@amd.com>
Date: Tue, 15 Jul 2025 17:40:41 +0530
Subject: soundwire: amd: Add support for acp7.2 platform

Add soundwire support for acp7.2 platform.

Signed-off-by: Venkata Prasad Potturu <venkataprasad.potturu@amd.com>
Link: https://lore.kernel.org/r/20250715121048.1795607-1-venkataprasad.potturu@amd.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/soundwire/amd_manager.c   | 4 ++++
 include/linux/soundwire/sdw_amd.h | 1 +
 2 files changed, 5 insertions(+)

(limited to 'include')

diff --git a/drivers/soundwire/amd_manager.c b/drivers/soundwire/amd_manager.c
index d4e62c383b12..3632838f3ed9 100644
--- a/drivers/soundwire/amd_manager.c
+++ b/drivers/soundwire/amd_manager.c
@@ -499,6 +499,7 @@ static int amd_sdw_port_params(struct sdw_bus *bus, struct sdw_port_params *p_pa
 		break;
 	case ACP70_PCI_REV_ID:
 	case ACP71_PCI_REV_ID:
+	case ACP72_PCI_REV_ID:
 		frame_fmt_reg = acp70_sdw_dp_reg[p_params->num].frame_fmt_reg;
 		break;
 	default:
@@ -551,6 +552,7 @@ static int amd_sdw_transport_params(struct sdw_bus *bus,
 		break;
 	case ACP70_PCI_REV_ID:
 	case ACP71_PCI_REV_ID:
+	case ACP72_PCI_REV_ID:
 		frame_fmt_reg = acp70_sdw_dp_reg[params->port_num].frame_fmt_reg;
 		sample_int_reg = acp70_sdw_dp_reg[params->port_num].sample_int_reg;
 		hctrl_dp0_reg = acp70_sdw_dp_reg[params->port_num].hctrl_dp0_reg;
@@ -614,6 +616,7 @@ static int amd_sdw_port_enable(struct sdw_bus *bus,
 		break;
 	case ACP70_PCI_REV_ID:
 	case ACP71_PCI_REV_ID:
+	case ACP72_PCI_REV_ID:
 		lane_ctrl_ch_en_reg = acp70_sdw_dp_reg[enable_ch->port_num].lane_ctrl_ch_en_reg;
 		break;
 	default:
@@ -1038,6 +1041,7 @@ static int amd_sdw_manager_probe(struct platform_device *pdev)
 		break;
 	case ACP70_PCI_REV_ID:
 	case ACP71_PCI_REV_ID:
+	case ACP72_PCI_REV_ID:
 		amd_manager->num_dout_ports = AMD_ACP70_SDW_MAX_TX_PORTS;
 		amd_manager->num_din_ports = AMD_ACP70_SDW_MAX_RX_PORTS;
 		break;
diff --git a/include/linux/soundwire/sdw_amd.h b/include/linux/soundwire/sdw_amd.h
index 6b839987f14c..fe31773d5210 100644
--- a/include/linux/soundwire/sdw_amd.h
+++ b/include/linux/soundwire/sdw_amd.h
@@ -30,6 +30,7 @@
 #define ACP63_PCI_REV_ID		0x63
 #define ACP70_PCI_REV_ID		0x70
 #define ACP71_PCI_REV_ID		0x71
+#define ACP72_PCI_REV_ID		0x72
 
 struct acp_sdw_pdata {
 	u16 instance;
-- 
cgit v1.2.3


From 0b0cd1857b783711b4bdfb8eb513c263b8a84f6d Mon Sep 17 00:00:00 2001
From: Peng Fan <peng.fan@nxp.com>
Date: Mon, 7 Jul 2025 10:24:37 +0800
Subject: dt-bindings: clock: Add support for i.MX94 LVDS/DISPLAY CSR

Add i.MX94 LVDS/DISPLAY CSR compatible string.

Add clock index for the two CSRs.

Reviewed-by: Abel Vesa <abel.vesa@linaro.org>
Signed-off-by: Peng Fan <peng.fan@nxp.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20250707-imx95-blk-ctl-7-1-v3-1-c1b676ec13be@nxp.com
Signed-off-by: Abel Vesa <abel.vesa@linaro.org>
---
 .../devicetree/bindings/clock/nxp,imx95-blk-ctl.yaml        |  2 ++
 include/dt-bindings/clock/nxp,imx94-clock.h                 | 13 +++++++++++++
 2 files changed, 15 insertions(+)
 create mode 100644 include/dt-bindings/clock/nxp,imx94-clock.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/nxp,imx95-blk-ctl.yaml b/Documentation/devicetree/bindings/clock/nxp,imx95-blk-ctl.yaml
index d0291bfff23a..27403b4c52d6 100644
--- a/Documentation/devicetree/bindings/clock/nxp,imx95-blk-ctl.yaml
+++ b/Documentation/devicetree/bindings/clock/nxp,imx95-blk-ctl.yaml
@@ -13,6 +13,8 @@ properties:
   compatible:
     items:
       - enum:
+          - nxp,imx94-display-csr
+          - nxp,imx94-lvds-csr
           - nxp,imx95-camera-csr
           - nxp,imx95-display-csr
           - nxp,imx95-hsio-blk-ctl
diff --git a/include/dt-bindings/clock/nxp,imx94-clock.h b/include/dt-bindings/clock/nxp,imx94-clock.h
new file mode 100644
index 000000000000..c4ba13352b99
--- /dev/null
+++ b/include/dt-bindings/clock/nxp,imx94-clock.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright 2025 NXP
+ */
+
+#ifndef __DT_BINDINGS_CLOCK_IMX94_H
+#define __DT_BINDINGS_CLOCK_IMX94_H
+
+#define IMX94_CLK_DISPMIX_CLK_SEL	0
+
+#define IMX94_CLK_DISPMIX_LVDS_CLK_GATE	0
+
+#endif /* __DT_BINDINGS_CLOCK_IMX94_H */
-- 
cgit v1.2.3


From ee4a2e08c10188f02fe3fb36b6beddc3c2fdb287 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Tue, 8 Jul 2025 19:27:41 +1000
Subject: entry: Add arch_in_rcu_eqs()

All architectures have an interruptible RCU extended quiescent state
(EQS) as part of their idle sequences, where interrupts can occur
without RCU watching. Entry code must account for this and wake RCU as
necessary; the common entry code deals with this in irqentry_enter() by
treating any interrupt from an idle thread as potentially having
occurred within an EQS and waking RCU for the duration of the interrupt
via rcu_irq_enter() .. rcu_irq_exit().

Some architectures may have other interruptible EQSs which require
similar treatment. For example, on s390 it is necessary to enable
interrupts around guest entry in the middle of a period where core KVM
code has entered an EQS.

So that architectures can wake RCU in these cases, this patch adds a
new arch_in_rcu_eqs() hook to the common entry code which is checked in
addition to the existing is_idle_thread() check, with RCU woken if
either returns true. A default implementation is provided which always
returns false, which suffices for most architectures.

As no architectures currently implement arch_in_rcu_eqs(), there should
be no functional change as a result of this patch alone. A subsequent
patch will add an s390 implementation to fix a latent bug with missing
RCU wakeups.

[ajd@linux.ibm.com: rebase, fix commit message]

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Reviewed-by: Christian Borntraeger <borntraeger@linux.ibm.com>
Signed-off-by: Andrew Donnellan <ajd@linux.ibm.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Janosch Frank <frankja@linux.ibm.com>
Link: https://lore.kernel.org/r/20250708092742.104309-2-ajd@linux.ibm.com
Signed-off-by: Janosch Frank <frankja@linux.ibm.com>
Message-ID: <20250708092742.104309-2-ajd@linux.ibm.com>
---
 include/linux/entry-common.h | 16 ++++++++++++++++
 kernel/entry/common.c        |  3 ++-
 2 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index f94f3fdf15fc..3bf99cbad8a3 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -86,6 +86,22 @@ static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs);
 static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs) {}
 #endif
 
+/**
+ * arch_in_rcu_eqs - Architecture specific check for RCU extended quiescent
+ * states.
+ *
+ * Returns: true if the CPU is potentially in an RCU EQS, false otherwise.
+ *
+ * Architectures only need to define this if threads other than the idle thread
+ * may have an interruptible EQS. This does not need to handle idle threads. It
+ * is safe to over-estimate at the cost of redundant RCU management work.
+ *
+ * Invoked from irqentry_enter()
+ */
+#ifndef arch_in_rcu_eqs
+static __always_inline bool arch_in_rcu_eqs(void) { return false; }
+#endif
+
 /**
  * enter_from_user_mode - Establish state when coming from user mode
  *
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index a8dd1f27417c..eb52d38e8099 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -220,7 +220,8 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
 	 * TINY_RCU does not support EQS, so let the compiler eliminate
 	 * this part when enabled.
 	 */
-	if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
+	if (!IS_ENABLED(CONFIG_TINY_RCU) &&
+	    (is_idle_task(current) || arch_in_rcu_eqs())) {
 		/*
 		 * If RCU is not watching then the same careful
 		 * sequence vs. lockdep and tracing is required
-- 
cgit v1.2.3


From 2b752ae0231f7b20cd2b8cad0b4ab36b16d4be88 Mon Sep 17 00:00:00 2001
From: Gary Yang <gary.yang@cixtech.com>
Date: Mon, 21 Jul 2025 22:44:58 +0800
Subject: dt-bindings: clock: cix: Add CIX sky1 scmi clock id

Add device tree bindings for the SCMI clock IDs on the
Cix Sky1 platform.

Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Reviewed-by: Peter Chen <peter.chen@cixtech.com>
Signed-off-by: Gary Yang <gary.yang@cixtech.com>
Signed-off-by: Peter Chen <peter.chen@cixtech.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 include/dt-bindings/clock/cix,sky1.h | 279 +++++++++++++++++++++++++++++++++++
 1 file changed, 279 insertions(+)
 create mode 100644 include/dt-bindings/clock/cix,sky1.h

(limited to 'include')

diff --git a/include/dt-bindings/clock/cix,sky1.h b/include/dt-bindings/clock/cix,sky1.h
new file mode 100644
index 000000000000..9245ebd1e80a
--- /dev/null
+++ b/include/dt-bindings/clock/cix,sky1.h
@@ -0,0 +1,279 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright 2024-2025 Cix Technology Group Co., Ltd.
+ */
+
+#ifndef _DT_BINDINGS_CLK_CIX_SKY1_H
+#define _DT_BINDINGS_CLK_CIX_SKY1_H
+
+#define CLK_TREE_CPU_GICxCLK			0
+#define CLK_TREE_CPU_PPUCLK			1
+#define CLK_TREE_CPU_PERIPHCLK			2
+#define CLK_TREE_DSU_CLK			3
+#define CLK_TREE_DSU_PCLK			4
+#define CLK_TREE_CPU_CLK_BC0			5
+#define CLK_TREE_CPU_CLK_BC1			6
+#define CLK_TREE_CPU_CLK_BC2			7
+#define CLK_TREE_CPU_CLK_BC3			8
+#define CLK_TREE_CPU_CLK_MC0			9
+#define CLK_TREE_CPU_CLK_MC1			10
+#define CLK_TREE_CPU_CLK_MC2			11
+#define CLK_TREE_CPU_CLK_MC3			12
+#define CLK_TREE_CPU_CLK_LC0			13
+#define CLK_TREE_CPU_CLK_LC1			14
+#define CLK_TREE_CPU_CLK_LC2			15
+#define CLK_TREE_CPU_CLK_LC3			16
+#define CLK_TREE_CSI_CTRL0_PCLK			17
+#define CLK_TREE_CSI_CTRL1_PCLK			18
+#define CLK_TREE_CSI_CTRL2_PCLK			19
+#define CLK_TREE_CSI_CTRL3_PCLK			20
+#define CLK_TREE_CSI_DMA0_PCLK			21
+#define CLK_TREE_CSI_DMA1_PCLK			22
+#define CLK_TREE_CSI_DMA2_PCLK			23
+#define CLK_TREE_CSI_DMA3_PCLK			24
+#define CLK_TREE_CSI_PHY0_PSM			25
+#define CLK_TREE_CSI_PHY1_PSM			26
+#define CLK_TREE_CSI_PHY0_APBCLK		27
+#define CLK_TREE_CSI_PHY1_APBCLK		28
+#define CLK_TREE_FCH_APB_CLK			29
+#define CLK_TREE_GPU_CLK_400M			30
+#define CLK_TREE_GPU_CLK_CORE			31
+#define CLK_TREE_GPU_CLK_STACKS			32
+#define CLK_TREE_DP0_PIXEL0			33
+#define CLK_TREE_DP0_PIXEL1			34
+#define CLK_TREE_DP1_PIXEL0			35
+#define CLK_TREE_DP1_PIXEL1			36
+#define CLK_TREE_DP2_PIXEL0			37
+#define CLK_TREE_DP2_PIXEL1			38
+#define CLK_TREE_DP3_PIXEL0			39
+#define CLK_TREE_DP3_PIXEL1			40
+#define CLK_TREE_DP4_PIXEL0			41
+#define CLK_TREE_DP4_PIXEL1			42
+#define CLK_TREE_DPU_CLK			43
+#define CLK_TREE_DPU0_ACLK			44
+#define CLK_TREE_DPU1_ACLK			45
+#define CLK_TREE_DPU2_ACLK			46
+#define CLK_TREE_DPU3_ACLK			47
+#define CLK_TREE_DPU4_ACLK			48
+#define CLK_TREE_DPC0_VIDCLK0			49
+#define CLK_TREE_DPC0_VIDCLK1			50
+#define CLK_TREE_DPC1_VIDCLK0			51
+#define CLK_TREE_DPC1_VIDCLK1			52
+#define CLK_TREE_DPC2_VIDCLK0			53
+#define CLK_TREE_DPC2_VIDCLK1			54
+#define CLK_TREE_DPC3_VIDCLK0			55
+#define CLK_TREE_DPC3_VIDCLK1			56
+#define CLK_TREE_DPC4_VIDCLK0			57
+#define CLK_TREE_DPC4_VIDCLK1			58
+#define CLK_TREE_DPC0_APBCLK			59
+#define CLK_TREE_DPC1_APBCLK			60
+#define CLK_TREE_DPC2_APBCLK			61
+#define CLK_TREE_DPC3_APBCLK			62
+#define CLK_TREE_DPC4_APBCLK			63
+#define CLK_TREE_NPU_MEMCLK			64
+#define CLK_TREE_NPU_SYSCLK			65
+#define CLK_TREE_NPU_DBGCLK			66
+#define CLK_TREE_VPU_APBCLK			67
+#define CLK_TREE_ISP_ACLK			68
+#define CLK_TREE_ISP_SCLK			69
+#define CLK_TREE_AUDIO_CLK4			70
+#define CLK_TREE_AUDIO_CLK5			71
+#define CLK_TREE_CAMERA_MCLK0			72
+#define CLK_TREE_CAMERA_MCLK1			73
+#define CLK_TREE_CAMERA_MCLK2			74
+#define CLK_TREE_CAMERA_MCLK3			75
+#define CLK_TREE_AUDIO_CLK0			76
+#define CLK_TREE_AUDIO_CLK1			77
+#define CLK_TREE_AUDIO_CLK2			78
+#define CLK_TREE_AUDIO_CLK3			79
+#define CLK_TREE_MM_NI700_CLK			80
+#define CLK_TREE_SYS_NI700_CLK			81
+#define CLK_TREE_GMAC0_ACLK			82
+#define CLK_TREE_GMAC1_ACLK			83
+#define CLK_TREE_GMAC0_DIV_ACLK			84
+#define CLK_TREE_GMAC0_DIV_TXCLK		85
+#define CLK_TREE_GMAC0_RGMII0_TXCLK		86
+#define CLK_TREE_GMAC1_DIV_ACLK			87
+#define CLK_TREE_GMAC1_DIV_TXCLK		88
+#define CLK_TREE_GMAC1_RGMII0_TXCLK		89
+#define CLK_TREE_GMAC0_PCLK			90
+#define CLK_TREE_GMAC1_PCLK			91
+#define CLK_TREE_USB2_0_AXI_GATE		92
+#define CLK_TREE_USB2_0_APB_GATE		93
+#define CLK_TREE_USB2_1_AXI_GATE		94
+#define CLK_TREE_USB2_1_APB_GATE		95
+#define CLK_TREE_USB2_2_AXI_GATE		96
+#define CLK_TREE_USB2_2_APB_GATE		97
+#define CLK_TREE_USB2_3_AXI_GATE		98
+#define CLK_TREE_USB2_3_APB_GATE		99
+#define CLK_TREE_USB2_0_PHY_GATE		100
+#define CLK_TREE_USB2_1_PHY_GATE		101
+#define CLK_TREE_USB2_2_PHY_GATE		102
+#define CLK_TREE_USB2_3_PHY_GATE		103
+#define CLK_TREE_USB3C_DRD_AXI_GATE		104
+#define CLK_TREE_USB3C_DRD_APB_GATE		105
+#define CLK_TREE_USB3C_DRD_PHY2_GATE		106
+#define CLK_TREE_USB3C_DRD_PHY3_GATE		107
+#define CLK_TREE_USB3C_0_AXI_GATE		108
+#define CLK_TREE_USB3C_0_APB_GATE		109
+#define CLK_TREE_USB3C_0_PHY2_GATE		110
+#define CLK_TREE_USB3C_0_PHY3_GATE		111
+#define CLK_TREE_USB3C_1_AXI_GATE		112
+#define CLK_TREE_USB3C_1_APB_GATE		113
+#define CLK_TREE_USB3C_1_PHY2_GATE		114
+#define CLK_TREE_USB3C_1_PHY3_GATE		115
+#define CLK_TREE_USB3C_2_AXI_GATE		116
+#define CLK_TREE_USB3C_2_APB_GATE		117
+#define CLK_TREE_USB3C_2_PHY2_GATE		118
+#define CLK_TREE_USB3C_2_PHY3_GATE		119
+#define CLK_TREE_USB3A_0_AXI_GATE		120
+#define CLK_TREE_USB3A_0_APB_GATE		121
+#define CLK_TREE_USB3A_0_PHY2_GATE		122
+#define CLK_TREE_USB3A_1_AXI_GATE		123
+#define CLK_TREE_USB3A_1_APB_GATE		124
+#define CLK_TREE_USB3A_1_PHY2_GATE		125
+#define CLK_TREE_USB3A_PHY3_GATE		126
+#define CLK_TREE_USB2_0_CLK_SOF			127
+#define CLK_TREE_USB2_1_CLK_SOF			128
+#define CLK_TREE_USB2_2_CLK_SOF			129
+#define CLK_TREE_USB2_3_CLK_SOF			130
+#define CLK_TREE_USB3C_DRD_CLK_SOF		131
+#define CLK_TREE_USB3C_H0_CLK_SOF		132
+#define CLK_TREE_USB3C_H1_CLK_SOF		133
+#define CLK_TREE_USB3C_H2_CLK_SOF		134
+#define CLK_TREE_USB3A_H0_CLK_SOF		135
+#define CLK_TREE_USB3A_H1_CLK_SOF		136
+#define CLK_TREE_USB2_0_CLK_LPM			137
+#define CLK_TREE_USB2_1_CLK_LPM			138
+#define CLK_TREE_USB2_2_CLK_LPM			139
+#define CLK_TREE_USB2_3_CLK_LPM			140
+#define CLK_TREE_USB3C_DRD_CLK_LPM		141
+#define CLK_TREE_USB3C_H0_CLK_LPM		142
+#define CLK_TREE_USB3C_H1_CLK_LPM		143
+#define CLK_TREE_USB3C_H2_CLK_LPM		144
+#define CLK_TREE_USB3A_H0_CLK_LPM		145
+#define CLK_TREE_USB3A_H1_CLK_LPM		146
+#define CLK_TREE_USB2_0_PHY_REF			147
+#define CLK_TREE_USB2_1_PHY_REF			148
+#define CLK_TREE_USB2_2_PHY_REF			149
+#define CLK_TREE_USB2_3_PHY_REF			150
+#define CLK_TREE_USB3C_DRD_PHY_REF		151
+#define CLK_TREE_USB3C_H0_PHY_REF		152
+#define CLK_TREE_USB3C_H1_PHY_REF		153
+#define CLK_TREE_USB3C_H2_PHY_REF		154
+#define CLK_TREE_USB3A_H0_PHY_REF		155
+#define CLK_TREE_USB3A_H1_PHY_REF		156
+#define CLK_TREE_USB3C_DRD_PHY_x4_REF		157
+#define CLK_TREE_USB3C_H0_PHY_x4_REF		158
+#define CLK_TREE_USB3C_H1_PHY_x4_REF		159
+#define CLK_TREE_USB3C_H2_PHY_x4_REF		160
+#define CLK_TREE_USB3A_PHY_x2_REF		161
+#define CLK_TREE_PCIE_X8CTRL_APB		162
+#define CLK_TREE_PCIE_X4CTRL_APB		163
+#define CLK_TREE_PCIE_X2CTRL_APB		164
+#define CLK_TREE_PCIE_X1_0CTRL_APB		165
+#define CLK_TREE_PCIE_X1_1CTRL_APB		166
+#define CLK_TREE_PCIE_X8_PHY_APB		167
+#define CLK_TREE_PCIE_X4_PHY_APB		168
+#define CLK_TREE_PCIE_X211_PHY_APB		169
+#define CLK_TREE_PCIE_NI700_CLK			170
+#define CLK_TREE_PCIE_CTRL0_CLK			171
+#define CLK_TREE_PCIE_CTRL1_CLK			172
+#define CLK_TREE_PCIE_CTRL2_CLK			173
+#define CLK_TREE_PCIE_CTRL3_CLK			174
+#define CLK_TREE_PCIE_CTRL4_CLK			175
+#define CLK_TREE_CSI_CTRL0_SYSCLK		176
+#define CLK_TREE_CSI_CTRL1_SYSCLK		177
+#define CLK_TREE_CSI_CTRL2_SYSCLK		178
+#define CLK_TREE_CSI_CTRL3_SYSCLK		179
+#define CLK_TREE_CSI_CTRL0_PIXEL0_CLK		180
+#define CLK_TREE_CSI_CTRL0_PIXEL1_CLK		181
+#define CLK_TREE_CSI_CTRL0_PIXEL2_CLK		182
+#define CLK_TREE_CSI_CTRL0_PIXEL3_CLK		183
+#define CLK_TREE_CSI_CTRL1_PIXEL0_CLK		184
+#define CLK_TREE_CSI_CTRL2_PIXEL0_CLK		185
+#define CLK_TREE_CSI_CTRL2_PIXEL1_CLK		186
+#define CLK_TREE_CSI_CTRL2_PIXEL2_CLK		187
+#define CLK_TREE_CSI_CTRL2_PIXEL3_CLK		188
+#define CLK_TREE_CSI_CTRL3_PIXEL0_CLK		189
+#define CLK_TREE_CI700_GCLK0			190
+#define CLK_TREE_DDRC0_ACLK_CLK			191
+#define CLK_TREE_DDRC1_ACLK_CLK			192
+#define CLK_TREE_DDRC2_ACLK_CLK			193
+#define CLK_TREE_DDRC3_ACLK_CLK			194
+#define CLK_TREE_DDRC0_DFICLK_CLK		195
+#define CLK_TREE_DDRC1_DFICLK_CLK		196
+#define CLK_TREE_DDRC2_DFICLK_CLK		197
+#define CLK_TREE_DDRC3_DFICLK_CLK		198
+#define CLK_TREE_PHY0_SYNC_CLK			199
+#define CLK_TREE_PHY1_SYNC_CLK			200
+#define CLK_TREE_PHY2_SYNC_CLK			201
+#define CLK_TREE_PHY3_SYNC_CLK			202
+#define CLK_TREE_PHY0_BYPASS_CLK		203
+#define CLK_TREE_PHY1_BYPASS_CLK		204
+#define CLK_TREE_PHY2_BYPASS_CLK		205
+#define CLK_TREE_PHY3_BYPASS_CLK		206
+#define CLK_TREE_DDRC_0_APB			207
+#define CLK_TREE_DDRC_1_APB			208
+#define CLK_TREE_DDRC_2_APB			209
+#define CLK_TREE_DDRC_3_APB			210
+#define CLK_TREE_TZC400_0_APB			211
+#define CLK_TREE_TZC400_1_APB			212
+#define CLK_TREE_TZC400_2_APB			213
+#define CLK_TREE_TZC400_3_APB			214
+#define CLK_TREE_S5_SENSOR_HUB_25M		215
+#define CLK_TREE_S5_SENSOR_HUB_400M		216
+#define CLK_TREE_S5_CSS600_100M			217
+#define CLK_TREE_S5_DFD_800M			218
+#define CLK_TREE_S5_CSU_SE_800M			219
+#define CLK_TREE_S5_CSU_PM_800M			220
+#define CLK_TREE_PCIE_REF_B0			221
+#define CLK_TREE_PCIE_REF_B1			222
+#define CLK_TREE_PCIE_REF_B2			223
+#define CLK_TREE_PCIE_REF_B3			224
+#define CLK_TREE_PCIE_REF_B4			225
+#define CLK_TREE_PCIE_REF_PHY_X8		226
+#define CLK_TREE_PCIE_REF_PHY_X4		227
+#define CLK_TREE_PCIE_REF_PHY_X211		228
+#define CLK_TREE_GMAC_REC_CLK			229
+#define CLK_TREE_GPUTOP_PLL			230
+#define CLK_TREE_GPUCORE_PLL			231
+#define CLK_TREE_CPU_PLL_LIT			232
+#define CLK_TREE_CPU_PLL0			233
+#define CLK_TREE_CPU_PLL1			234
+#define CLK_TREE_CPU_PLL2			235
+#define CLK_TREE_CPU_PLL3			236
+#define CLK_TREE_FCH_I3C0_FUNC			237
+#define CLK_TREE_FCH_I3C1_FUNC			238
+#define CLK_TREE_FCH_DMA_ACLK			239
+#define CLK_TREE_FCH_XSPI_FUNC			240
+#define CLK_TREE_FCH_XSPI_MACLK			241
+#define CLK_TREE_FCH_TIMER_FUN			242
+#define CLK_TREE_FCH_APB_IO_S0			243
+#define CLK_TREE_FCH_I3C0_APB			244
+#define CLK_TREE_FCH_I3C1_APB			245
+#define CLK_TREE_FCH_UART0_APB			246
+#define CLK_TREE_FCH_UART1_APB			247
+#define CLK_TREE_FCH_UART2_APB			248
+#define CLK_TREE_FCH_UART3_APB			249
+#define CLK_TREE_FCH_SPI0_APB			250
+#define CLK_TREE_FCH_SPI1_APB			251
+#define CLK_TREE_FCH_XSPI_APB			252
+#define CLK_TREE_FCH_I2C0_APB			253
+#define CLK_TREE_FCH_I2C1_APB			254
+#define CLK_TREE_FCH_I2C2_APB			255
+#define CLK_TREE_FCH_I2C3_APB			256
+#define CLK_TREE_FCH_I2C4_APB			257
+#define CLK_TREE_FCH_I2C5_APB			258
+#define CLK_TREE_FCH_I2C6_APB			259
+#define CLK_TREE_FCH_I2C7_APB			260
+#define CLK_TREE_FCH_TIMER_APB			261
+#define CLK_TREE_FCH_GPIO_APB			262
+#define CLK_TREE_FCH_UART0_FUNC			263
+#define CLK_TREE_FCH_UART1_FUNC			264
+#define CLK_TREE_FCH_UART2_FUNC			265
+#define CLK_TREE_FCH_UART3_FUNC			266
+/* 267~271 not used by AP, skip */
+#define CLK_TREE_GPU_CLK_200M			272
+
+#endif
-- 
cgit v1.2.3


From fcb476990beb55c958db0b5aa2e9ca772d0fc982 Mon Sep 17 00:00:00 2001
From: Xu Yang <xu.yang_2@nxp.com>
Date: Mon, 21 Jul 2025 18:44:17 +0800
Subject: usb: core: add urb->sgt parameter description

The kernel-doc description of the urb->sgt member is missing. Add it for
completeness.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Closes: https://lore.kernel.org/all/20250711182803.1d548467@canb.auug.org.au/
Signed-off-by: Xu Yang <xu.yang_2@nxp.com>
Fixes: 488e6eaab88c ("usb: core: add dma-noncoherent buffer alloc and free API")
Link: https://lore.kernel.org/r/20250721104417.3442530-1-xu.yang_2@nxp.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/usb.h b/include/linux/usb.h
index 535ac37198a1..9d662c6abb4d 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -1455,6 +1455,10 @@ typedef void (*usb_complete_t)(struct urb *);
  * @sg: scatter gather buffer list, the buffer size of each element in
  * 	the list (except the last) must be divisible by the endpoint's
  * 	max packet size if no_sg_constraint isn't set in 'struct usb_bus'
+ * @sgt: used to hold a scatter gather table returned by usb_alloc_noncoherent(),
+ *      which describes the allocated non-coherent and possibly non-contiguous
+ *      memory and is guaranteed to have a single DMA-mapped segment. The
+ *      allocated memory needs to be freed by usb_free_noncoherent().
  * @num_mapped_sgs: (internal) number of mapped sg entries
  * @num_sgs: number of entries in the sg list
  * @transfer_buffer_length: How big is transfer_buffer.  The transfer may
-- 
cgit v1.2.3


From 0f29e33fbadd7517b96f3f3e86220215d99875cb Mon Sep 17 00:00:00 2001
From: Luca Weiss <luca.weiss@fairphone.com>
Date: Wed, 9 Jul 2025 15:14:49 +0200
Subject: dt-bindings: interconnect: document the RPMh Network-On-Chip
 Interconnect in Qualcomm Milos SoC

Document the RPMh Network-On-Chip Interconnect of the Milos (e.g.
SM7635) SoC.

Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Luca Weiss <luca.weiss@fairphone.com>
Link: https://lore.kernel.org/r/20250709-sm7635-icc-v3-1-c446203c3b3a@fairphone.com
Signed-off-by: Georgi Djakov <djakov@kernel.org>
---
 .../bindings/interconnect/qcom,milos-rpmh.yaml     | 136 ++++++++++++++++++++
 include/dt-bindings/interconnect/qcom,milos-rpmh.h | 141 +++++++++++++++++++++
 2 files changed, 277 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/interconnect/qcom,milos-rpmh.yaml
 create mode 100644 include/dt-bindings/interconnect/qcom,milos-rpmh.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/interconnect/qcom,milos-rpmh.yaml b/Documentation/devicetree/bindings/interconnect/qcom,milos-rpmh.yaml
new file mode 100644
index 000000000000..00b7a4108d45
--- /dev/null
+++ b/Documentation/devicetree/bindings/interconnect/qcom,milos-rpmh.yaml
@@ -0,0 +1,136 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/interconnect/qcom,milos-rpmh.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm RPMh Network-On-Chip Interconnect on Milos SoC
+
+maintainers:
+  - Luca Weiss <luca.weiss@fairphone.com>
+
+description: |
+  RPMh interconnect providers support system bandwidth requirements through
+  RPMh hardware accelerators known as Bus Clock Manager (BCM). The provider is
+  able to communicate with the BCM through the Resource State Coordinator (RSC)
+  associated with each execution environment. Provider nodes must point to at
+  least one RPMh device child node pertaining to their RSC and each provider
+  can map to multiple RPMh resources.
+
+  See also: include/dt-bindings/interconnect/qcom,milos-rpmh.h
+
+properties:
+  compatible:
+    enum:
+      - qcom,milos-aggre1-noc
+      - qcom,milos-aggre2-noc
+      - qcom,milos-clk-virt
+      - qcom,milos-cnoc-cfg
+      - qcom,milos-cnoc-main
+      - qcom,milos-gem-noc
+      - qcom,milos-lpass-ag-noc
+      - qcom,milos-mc-virt
+      - qcom,milos-mmss-noc
+      - qcom,milos-nsp-noc
+      - qcom,milos-pcie-anoc
+      - qcom,milos-system-noc
+
+  reg:
+    maxItems: 1
+
+  clocks:
+    minItems: 1
+    maxItems: 2
+
+required:
+  - compatible
+
+allOf:
+  - $ref: qcom,rpmh-common.yaml#
+  - if:
+      properties:
+        compatible:
+          contains:
+            enum:
+              - qcom,milos-clk-virt
+              - qcom,milos-mc-virt
+    then:
+      properties:
+        reg: false
+    else:
+      required:
+        - reg
+
+  - if:
+      properties:
+        compatible:
+          contains:
+            enum:
+              - qcom,milos-pcie-anoc
+    then:
+      properties:
+        clocks:
+          items:
+            - description: aggre-NOC PCIe AXI clock
+            - description: cfg-NOC PCIe a-NOC AHB clock
+
+  - if:
+      properties:
+        compatible:
+          contains:
+            enum:
+              - qcom,milos-aggre1-noc
+    then:
+      properties:
+        clocks:
+          items:
+            - description: aggre USB3 PRIM AXI clock
+            - description: aggre UFS PHY AXI clock
+
+  - if:
+      properties:
+        compatible:
+          contains:
+            enum:
+              - qcom,milos-aggre2-noc
+    then:
+      properties:
+        clocks:
+          items:
+            - description: RPMH CC IPA clock
+
+  - if:
+      properties:
+        compatible:
+          contains:
+            enum:
+              - qcom,milos-aggre1-noc
+              - qcom,milos-aggre2-noc
+              - qcom,milos-pcie-anoc
+    then:
+      required:
+        - clocks
+    else:
+      properties:
+        clocks: false
+
+unevaluatedProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/clock/qcom,milos-gcc.h>
+
+    interconnect-0 {
+        compatible = "qcom,milos-clk-virt";
+        #interconnect-cells = <2>;
+        qcom,bcm-voters = <&apps_bcm_voter>;
+    };
+
+    interconnect@16e0000 {
+        compatible = "qcom,milos-aggre1-noc";
+        reg = <0x016e0000 0x16400>;
+        #interconnect-cells = <2>;
+        clocks = <&gcc GCC_AGGRE_USB3_PRIM_AXI_CLK>,
+                 <&gcc GCC_AGGRE_UFS_PHY_AXI_CLK>;
+        qcom,bcm-voters = <&apps_bcm_voter>;
+    };
diff --git a/include/dt-bindings/interconnect/qcom,milos-rpmh.h b/include/dt-bindings/interconnect/qcom,milos-rpmh.h
new file mode 100644
index 000000000000..9326d7d9c2a3
--- /dev/null
+++ b/include/dt-bindings/interconnect/qcom,milos-rpmh.h
@@ -0,0 +1,141 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * Copyright (c) 2023-2024, Qualcomm Innovation Center, Inc. All rights reserved.
+ * Copyright (c) 2025, Luca Weiss <luca.weiss@fairphone.com>
+ */
+
+#ifndef __DT_BINDINGS_INTERCONNECT_QCOM_MILOS_H
+#define __DT_BINDINGS_INTERCONNECT_QCOM_MILOS_H
+
+#define MASTER_QUP_1				0
+#define MASTER_UFS_MEM				1
+#define MASTER_USB3_0				2
+#define SLAVE_A1NOC_SNOC			3
+
+#define MASTER_QDSS_BAM				0
+#define MASTER_QSPI_0				1
+#define MASTER_QUP_0				2
+#define MASTER_CRYPTO				3
+#define MASTER_IPA				4
+#define MASTER_QDSS_ETR				5
+#define MASTER_QDSS_ETR_1			6
+#define MASTER_SDCC_1				7
+#define MASTER_SDCC_2				8
+#define SLAVE_A2NOC_SNOC			9
+
+#define MASTER_QUP_CORE_0			0
+#define MASTER_QUP_CORE_1			1
+#define SLAVE_QUP_CORE_0			2
+#define SLAVE_QUP_CORE_1			3
+
+#define MASTER_CNOC_CFG				0
+#define SLAVE_AHB2PHY_SOUTH			1
+#define SLAVE_AHB2PHY_NORTH			2
+#define SLAVE_CAMERA_CFG			3
+#define SLAVE_CLK_CTL				4
+#define SLAVE_RBCPR_CX_CFG			5
+#define SLAVE_RBCPR_MXA_CFG			6
+#define SLAVE_CRYPTO_0_CFG			7
+#define SLAVE_CX_RDPM				8
+#define SLAVE_GFX3D_CFG				9
+#define SLAVE_IMEM_CFG				10
+#define SLAVE_CNOC_MSS				11
+#define SLAVE_MX_2_RDPM				12
+#define SLAVE_MX_RDPM				13
+#define SLAVE_PDM				14
+#define SLAVE_QDSS_CFG				15
+#define SLAVE_QSPI_0				16
+#define SLAVE_QUP_0				17
+#define SLAVE_QUP_1				18
+#define SLAVE_SDC1				19
+#define SLAVE_SDCC_2				20
+#define SLAVE_TCSR				21
+#define SLAVE_TLMM				22
+#define SLAVE_UFS_MEM_CFG			23
+#define SLAVE_USB3_0				24
+#define SLAVE_VENUS_CFG				25
+#define SLAVE_VSENSE_CTRL_CFG			26
+#define SLAVE_WLAN				27
+#define SLAVE_CNOC_MNOC_HF_CFG			28
+#define SLAVE_CNOC_MNOC_SF_CFG			29
+#define SLAVE_NSP_QTB_CFG			30
+#define SLAVE_PCIE_ANOC_CFG			31
+#define SLAVE_WLAN_Q6_THROTTLE_CFG		32
+#define SLAVE_SERVICE_CNOC_CFG			33
+#define SLAVE_QDSS_STM				34
+#define SLAVE_TCU				35
+
+#define MASTER_GEM_NOC_CNOC			0
+#define MASTER_GEM_NOC_PCIE_SNOC		1
+#define SLAVE_AOSS				2
+#define SLAVE_DISPLAY_CFG			3
+#define SLAVE_IPA_CFG				4
+#define SLAVE_IPC_ROUTER_CFG			5
+#define SLAVE_PCIE_0_CFG			6
+#define SLAVE_PCIE_1_CFG			7
+#define SLAVE_PRNG				8
+#define SLAVE_TME_CFG				9
+#define SLAVE_APPSS				10
+#define SLAVE_CNOC_CFG				11
+#define SLAVE_DDRSS_CFG				12
+#define SLAVE_IMEM				13
+#define SLAVE_PIMEM				14
+#define SLAVE_SERVICE_CNOC			15
+#define SLAVE_PCIE_0				16
+#define SLAVE_PCIE_1				17
+
+#define MASTER_GPU_TCU				0
+#define MASTER_SYS_TCU				1
+#define MASTER_APPSS_PROC			2
+#define MASTER_GFX3D				3
+#define MASTER_LPASS_GEM_NOC			4
+#define MASTER_MSS_PROC				5
+#define MASTER_MNOC_HF_MEM_NOC			6
+#define MASTER_MNOC_SF_MEM_NOC			7
+#define MASTER_COMPUTE_NOC			8
+#define MASTER_ANOC_PCIE_GEM_NOC		9
+#define MASTER_SNOC_GC_MEM_NOC			10
+#define MASTER_SNOC_SF_MEM_NOC			11
+#define MASTER_WLAN_Q6				12
+#define SLAVE_GEM_NOC_CNOC			13
+#define SLAVE_LLCC				14
+#define SLAVE_MEM_NOC_PCIE_SNOC			15
+
+#define MASTER_LPASS_PROC			0
+#define SLAVE_LPASS_GEM_NOC			1
+
+#define MASTER_LLCC				0
+#define SLAVE_EBI1				1
+
+#define MASTER_CAMNOC_HF			0
+#define MASTER_CAMNOC_ICP			1
+#define MASTER_CAMNOC_SF			2
+#define MASTER_MDP				3
+#define MASTER_VIDEO				4
+#define MASTER_CNOC_MNOC_HF_CFG			5
+#define MASTER_CNOC_MNOC_SF_CFG			6
+#define SLAVE_MNOC_HF_MEM_NOC			7
+#define SLAVE_MNOC_SF_MEM_NOC			8
+#define SLAVE_SERVICE_MNOC_HF			9
+#define SLAVE_SERVICE_MNOC_SF			10
+
+#define MASTER_CDSP_PROC			0
+#define SLAVE_CDSP_MEM_NOC			1
+
+#define MASTER_PCIE_ANOC_CFG			0
+#define MASTER_PCIE_0				1
+#define MASTER_PCIE_1				2
+#define SLAVE_ANOC_PCIE_GEM_NOC			3
+#define SLAVE_SERVICE_PCIE_ANOC			4
+
+#define MASTER_A1NOC_SNOC			0
+#define MASTER_A2NOC_SNOC			1
+#define MASTER_APSS_NOC				2
+#define MASTER_CNOC_SNOC			3
+#define MASTER_PIMEM				4
+#define MASTER_GIC				5
+#define SLAVE_SNOC_GEM_NOC_GC			6
+#define SLAVE_SNOC_GEM_NOC_SF			7
+
+
+#endif
-- 
cgit v1.2.3


From ab16122115327b2a602595f539cae7e39a331d0e Mon Sep 17 00:00:00 2001
From: Jack Thomson <jackabt@amazon.com>
Date: Mon, 21 Jul 2025 14:05:58 +0100
Subject: arm64: kvm, smccc: Fix vendor uuid

Commit 13423063c7cb ("arm64: kvm, smccc: Introduce and use API for
getting hypervisor UUID") replaced the explicit register constants
with the UUID_INIT macro. However, there is an endianness issue, meaning
the UUID generated and used in the handshake didn't match the UUID used
prior to that commit.

The change in UUID causes the SMCCC vendor handshake to fail with older
guest kernels, meaning devices such as PTP were not available in the
guest.

This patch updates the parameters to the macro to generate a UUID which
matches the previous value, and re-establishes backwards compatibility
with older guest kernels.

Fixes: 13423063c7cb ("arm64: kvm, smccc: Introduce and use API for getting hypervisor UUID")
Signed-off-by: Jack Thomson <jackabt@amazon.com>
Reviewed-by: Marc Zyngier <maz@kernel.org>
Tested-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Sudeep Holla <sudeep.holla@arm.com>
Link: https://lore.kernel.org/r/20250721130558.50823-1-jackabt.amazon@gmail.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/arm-smccc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h
index 784ebe4607a4..50b47eba7d01 100644
--- a/include/linux/arm-smccc.h
+++ b/include/linux/arm-smccc.h
@@ -113,7 +113,7 @@
 
 /* KVM UID value: 28b46fb6-2ec5-11e9-a9ca-4b564d003a74 */
 #define ARM_SMCCC_VENDOR_HYP_UID_KVM UUID_INIT(\
-	0xb66fb428, 0xc52e, 0xe911, \
+	0x28b46fb6, 0x2ec5, 0x11e9, \
 	0xa9, 0xca, 0x4b, 0x56, \
 	0x4d, 0x00, 0x3a, 0x74)
 
-- 
cgit v1.2.3


From 460114eae8284155b51f6e72ed26f627ee338a30 Mon Sep 17 00:00:00 2001
From: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Date: Mon, 21 Jul 2025 09:20:03 +0300
Subject: wifi: mac80211: remove ieee80211_remove_key

It is no longer used, remove it.

Reviewed-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20250721091956.e964ceacd85c.Idecab8ef161fa58e000b3969bc936399284b79f0@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 16 +---------------
 net/mac80211/key.c     | 27 +--------------------------
 2 files changed, 2 insertions(+), 41 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index a2dbaad2f6d3..a0cf976a9117 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -6032,18 +6032,6 @@ void ieee80211_get_key_rx_seq(struct ieee80211_key_conf *keyconf,
 void ieee80211_set_key_rx_seq(struct ieee80211_key_conf *keyconf,
 			      int tid, struct ieee80211_key_seq *seq);
 
-/**
- * ieee80211_remove_key - remove the given key
- * @keyconf: the parameter passed with the set key
- *
- * Context: Must be called with the wiphy mutex held.
- *
- * Remove the given key. If the key was uploaded to the hardware at the
- * time this function is called, it is not deleted in the hardware but
- * instead assumed to have been removed already.
- */
-void ieee80211_remove_key(struct ieee80211_key_conf *keyconf);
-
 /**
  * ieee80211_gtk_rekey_add - add a GTK key from rekeying during WoWLAN
  * @vif: the virtual interface to add the key on
@@ -6070,9 +6058,7 @@ void ieee80211_remove_key(struct ieee80211_key_conf *keyconf);
  * for the new key for each TID to set up sequence counters properly.
  *
  * IMPORTANT: If this replaces a key that is present in the hardware,
- * then it will attempt to remove it during this call. In many cases
- * this isn't what you want, so call ieee80211_remove_key() first for
- * the key that's being replaced.
+ * then it will attempt to remove it during this call.
  */
 struct ieee80211_key_conf *
 ieee80211_gtk_rekey_add(struct ieee80211_vif *vif,
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index 997892da8886..9d65013ddac7 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -6,7 +6,7 @@
  * Copyright 2007-2008	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
  * Copyright 2015-2017	Intel Deutschland GmbH
- * Copyright 2018-2020, 2022-2024  Intel Corporation
+ * Copyright 2018-2020, 2022-2025  Intel Corporation
  */
 
 #include <crypto/utils.h>
@@ -1354,31 +1354,6 @@ void ieee80211_set_key_rx_seq(struct ieee80211_key_conf *keyconf,
 }
 EXPORT_SYMBOL_GPL(ieee80211_set_key_rx_seq);
 
-void ieee80211_remove_key(struct ieee80211_key_conf *keyconf)
-{
-	struct ieee80211_key *key;
-
-	key = container_of(keyconf, struct ieee80211_key, conf);
-
-	lockdep_assert_wiphy(key->local->hw.wiphy);
-
-	/*
-	 * if key was uploaded, we assume the driver will/has remove(d)
-	 * it, so adjust bookkeeping accordingly
-	 */
-	if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) {
-		key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE;
-
-		if (!(key->conf.flags & (IEEE80211_KEY_FLAG_GENERATE_MMIC |
-					 IEEE80211_KEY_FLAG_PUT_MIC_SPACE |
-					 IEEE80211_KEY_FLAG_RESERVE_TAILROOM)))
-			increment_tailroom_need_count(key->sdata);
-	}
-
-	ieee80211_key_free(key, false);
-}
-EXPORT_SYMBOL_GPL(ieee80211_remove_key);
-
 struct ieee80211_key_conf *
 ieee80211_gtk_rekey_add(struct ieee80211_vif *vif,
 			struct ieee80211_key_conf *keyconf,
-- 
cgit v1.2.3


From 84b62b72b4c759b51568e44b0e8dc80f4cb8a2b9 Mon Sep 17 00:00:00 2001
From: Michael-CY Lee <michael-cy.lee@mediatek.com>
Date: Mon, 21 Jul 2025 14:51:59 +0800
Subject: wifi: cfg80211/mac80211: report link ID for unexpected frames

The upper layer may require the link ID to properly handle
unexpected frames. For instance, if hostapd, operating as an
AP MLD, receives a data frame from a non-associated STA,
it must send deauthentication to the link on which the STA is
operating.

Signed-off-by: Michael-CY Lee <michael-cy.lee@mediatek.com>
Reviewed-by: Money Wang <money.wang@mediatek.com>
Link: https://patch.msgid.link/20250721065159.1740992-1-michael-cy.lee@mediatek.com
[edit commit message]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 10 ++++++----
 net/mac80211/rx.c      | 10 +++++-----
 net/wireless/nl80211.c | 22 ++++++++++++----------
 net/wireless/trace.h   | 17 ++++++++++-------
 4 files changed, 33 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 44a1055a81ba..406626ff6cc8 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -9048,6 +9048,7 @@ void cfg80211_pmksa_candidate_notify(struct net_device *dev, int index,
 /**
  * cfg80211_rx_spurious_frame - inform userspace about a spurious frame
  * @dev: The device the frame matched to
+ * @link_id: the link the frame was received on, -1 if not applicable or unknown
  * @addr: the transmitter address
  * @gfp: context flags
  *
@@ -9057,13 +9058,14 @@ void cfg80211_pmksa_candidate_notify(struct net_device *dev, int index,
  * Return: %true if the frame was passed to userspace (or this failed
  * for a reason other than not having a subscription.)
  */
-bool cfg80211_rx_spurious_frame(struct net_device *dev,
-				const u8 *addr, gfp_t gfp);
+bool cfg80211_rx_spurious_frame(struct net_device *dev, const u8 *addr,
+				int link_id, gfp_t gfp);
 
 /**
  * cfg80211_rx_unexpected_4addr_frame - inform about unexpected WDS frame
  * @dev: The device the frame matched to
  * @addr: the transmitter address
+ * @link_id: the link the frame was received on, -1 if not applicable or unknown
  * @gfp: context flags
  *
  * This function is used in AP mode (only!) to inform userspace that
@@ -9073,8 +9075,8 @@ bool cfg80211_rx_spurious_frame(struct net_device *dev,
  * Return: %true if the frame was passed to userspace (or this failed
  * for a reason other than not having a subscription.)
  */
-bool cfg80211_rx_unexpected_4addr_frame(struct net_device *dev,
-					const u8 *addr, gfp_t gfp);
+bool cfg80211_rx_unexpected_4addr_frame(struct net_device *dev, const u8 *addr,
+					int link_id, gfp_t gfp);
 
 /**
  * cfg80211_probe_status - notify userspace about probe status
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 576e399fc99c..4d4ff4d4917a 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1532,9 +1532,8 @@ ieee80211_rx_h_check(struct ieee80211_rx_data *rx)
 		}
 
 		if (rx->sdata->vif.type == NL80211_IFTYPE_AP &&
-		    cfg80211_rx_spurious_frame(rx->sdata->dev,
-					       hdr->addr2,
-					       GFP_ATOMIC))
+		    cfg80211_rx_spurious_frame(rx->sdata->dev, hdr->addr2,
+					       rx->link_id, GFP_ATOMIC))
 			return RX_DROP_U_SPURIOUS;
 
 		return RX_DROP;
@@ -1872,7 +1871,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
 			if (!test_and_set_sta_flag(sta, WLAN_STA_4ADDR_EVENT))
 				cfg80211_rx_unexpected_4addr_frame(
 					rx->sdata->dev, sta->sta.addr,
-					GFP_ATOMIC);
+					rx->link_id, GFP_ATOMIC);
 			return RX_DROP_U_UNEXPECTED_4ADDR_FRAME;
 		}
 		/*
@@ -3191,7 +3190,8 @@ ieee80211_rx_h_data(struct ieee80211_rx_data *rx)
 		if (rx->sta &&
 		    !test_and_set_sta_flag(rx->sta, WLAN_STA_4ADDR_EVENT))
 			cfg80211_rx_unexpected_4addr_frame(
-				rx->sdata->dev, rx->sta->sta.addr, GFP_ATOMIC);
+				rx->sdata->dev, rx->sta->sta.addr, rx->link_id,
+				GFP_ATOMIC);
 		return RX_DROP;
 	}
 
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 1c808b08b747..b4bf44768dc8 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -19755,7 +19755,7 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr,
 EXPORT_SYMBOL(cfg80211_conn_failed);
 
 static bool __nl80211_unexpected_frame(struct net_device *dev, u8 cmd,
-				       const u8 *addr, gfp_t gfp)
+				       const u8 *addr, int link_id, gfp_t gfp)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
@@ -19778,7 +19778,9 @@ static bool __nl80211_unexpected_frame(struct net_device *dev, u8 cmd,
 
 	if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
 	    nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) ||
-	    nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, addr))
+	    nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, addr) ||
+	    (link_id >= 0 &&
+	     nla_put_u8(msg, NL80211_ATTR_MLO_LINK_ID, link_id)))
 		goto nla_put_failure;
 
 	genlmsg_end(msg, hdr);
@@ -19790,13 +19792,13 @@ static bool __nl80211_unexpected_frame(struct net_device *dev, u8 cmd,
 	return true;
 }
 
-bool cfg80211_rx_spurious_frame(struct net_device *dev,
-				const u8 *addr, gfp_t gfp)
+bool cfg80211_rx_spurious_frame(struct net_device *dev, const u8 *addr,
+				int link_id, gfp_t gfp)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	bool ret;
 
-	trace_cfg80211_rx_spurious_frame(dev, addr);
+	trace_cfg80211_rx_spurious_frame(dev, addr, link_id);
 
 	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_AP &&
 		    wdev->iftype != NL80211_IFTYPE_P2P_GO)) {
@@ -19804,19 +19806,19 @@ bool cfg80211_rx_spurious_frame(struct net_device *dev,
 		return false;
 	}
 	ret = __nl80211_unexpected_frame(dev, NL80211_CMD_UNEXPECTED_FRAME,
-					 addr, gfp);
+					 addr, link_id, gfp);
 	trace_cfg80211_return_bool(ret);
 	return ret;
 }
 EXPORT_SYMBOL(cfg80211_rx_spurious_frame);
 
-bool cfg80211_rx_unexpected_4addr_frame(struct net_device *dev,
-					const u8 *addr, gfp_t gfp)
+bool cfg80211_rx_unexpected_4addr_frame(struct net_device *dev, const u8 *addr,
+					int link_id, gfp_t gfp)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	bool ret;
 
-	trace_cfg80211_rx_unexpected_4addr_frame(dev, addr);
+	trace_cfg80211_rx_unexpected_4addr_frame(dev, addr, link_id);
 
 	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_AP &&
 		    wdev->iftype != NL80211_IFTYPE_P2P_GO &&
@@ -19826,7 +19828,7 @@ bool cfg80211_rx_unexpected_4addr_frame(struct net_device *dev,
 	}
 	ret = __nl80211_unexpected_frame(dev,
 					 NL80211_CMD_UNEXPECTED_4ADDR_FRAME,
-					 addr, gfp);
+					 addr, link_id, gfp);
 	trace_cfg80211_return_bool(ret);
 	return ret;
 }
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index a07d88d61bec..34c584a215e5 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -3570,27 +3570,30 @@ TRACE_EVENT(cfg80211_cac_event,
 );
 
 DECLARE_EVENT_CLASS(cfg80211_rx_evt,
-	TP_PROTO(struct net_device *netdev, const u8 *addr),
-	TP_ARGS(netdev, addr),
+	TP_PROTO(struct net_device *netdev, const u8 *addr, int link_id),
+	TP_ARGS(netdev, addr, link_id),
 	TP_STRUCT__entry(
 		NETDEV_ENTRY
 		MAC_ENTRY(addr)
+		__field(int, link_id)
 	),
 	TP_fast_assign(
 		NETDEV_ASSIGN;
 		MAC_ASSIGN(addr, addr);
+		__entry->link_id = link_id;
 	),
-	TP_printk(NETDEV_PR_FMT ", %pM", NETDEV_PR_ARG, __entry->addr)
+	TP_printk(NETDEV_PR_FMT ", %pM, link_id:%d", NETDEV_PR_ARG,
+		  __entry->addr, __entry->link_id)
 );
 
 DEFINE_EVENT(cfg80211_rx_evt, cfg80211_rx_spurious_frame,
-	TP_PROTO(struct net_device *netdev, const u8 *addr),
-	TP_ARGS(netdev, addr)
+	TP_PROTO(struct net_device *netdev, const u8 *addr, int link_id),
+	TP_ARGS(netdev, addr, link_id)
 );
 
 DEFINE_EVENT(cfg80211_rx_evt, cfg80211_rx_unexpected_4addr_frame,
-	TP_PROTO(struct net_device *netdev, const u8 *addr),
-	TP_ARGS(netdev, addr)
+	TP_PROTO(struct net_device *netdev, const u8 *addr, int link_id),
+	TP_ARGS(netdev, addr, link_id)
 );
 
 TRACE_EVENT(cfg80211_ibss_joined,
-- 
cgit v1.2.3


From a3b366dbf468788baa2e0ccdccd8482ac277e47b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 12 Jun 2025 09:49:32 -0400
Subject: tracing, AER: Hide PCIe AER event when PCIEAER is not configured

The aer_event trace event is only used when CONFIG_PCIEAER is enabled, so
it should not be created when that option is disabled. Each trace event
that is defined takes up around 5K of text and metadata, regardless of
whether its tracepoint is used. Instead of wasting this memory, put an
#ifdef around the event so that it is not created when it is not used.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Tony Luck <tony.luck@intel.com>
Link: https://lore.kernel.org/20250612094932.4a08abd6@batman.local.home
Acked-by: Borislav Petkov (AMD) <bp@alien8.de>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/ras/ras_event.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index 14c9f943d53f..c8cd0f00c845 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -252,6 +252,7 @@ TRACE_EVENT(non_standard_event,
 		  __print_hex(__get_dynamic_array(buf), __entry->len))
 );
 
+#ifdef CONFIG_PCIEAER
 /*
  * PCIe AER Trace event
  *
@@ -337,6 +338,7 @@ TRACE_EVENT(aer_event,
 			__print_array(__entry->tlp_header, PCIE_STD_MAX_TLP_HEADERLOG, 4) :
 			"Not available")
 );
+#endif /* CONFIG_PCIEAER */
 
 /*
  * memory-failure recovery action result event
-- 
cgit v1.2.3


From 06cc77a63f1b10e8f05e7223753883e14c35b512 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 12 Jun 2025 09:58:28 -0400
Subject: alarmtimer: Hide alarmtimer_suspend event when RTC_CLASS is not
 configured

The trace event alarmtimer_suspend is only called when CONFIG_RTC_CLASS
is defined. As every event that is created can take up to 5K of text and
metadata regardless of whether it is called, it should not be created
just to waste memory. Hide the event when CONFIG_RTC_CLASS is not
defined.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/20250612095828.6d75dfa3@batman.local.home
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/trace/events/alarmtimer.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/trace/events/alarmtimer.h b/include/trace/events/alarmtimer.h
index 13483c7ca70b..8e9c76a7f21b 100644
--- a/include/trace/events/alarmtimer.h
+++ b/include/trace/events/alarmtimer.h
@@ -20,6 +20,7 @@ TRACE_DEFINE_ENUM(ALARM_BOOTTIME_FREEZER);
 	{ 1 << ALARM_REALTIME_FREEZER, "REALTIME Freezer" },	\
 	{ 1 << ALARM_BOOTTIME_FREEZER, "BOOTTIME Freezer" })
 
+#ifdef CONFIG_RTC_CLASS
 TRACE_EVENT(alarmtimer_suspend,
 
 	TP_PROTO(ktime_t expires, int flag),
@@ -41,6 +42,7 @@ TRACE_EVENT(alarmtimer_suspend,
 		  __entry->expires
 	)
 );
+#endif /* CONFIG_RTC_CLASS */
 
 DECLARE_EVENT_CLASS(alarm_class,
 
-- 
cgit v1.2.3


From 647fe16b46999258ce1aec41f4bdeabb4f0cc8e7 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 12 Jun 2025 10:53:11 -0400
Subject: PM: cpufreq: powernv/tracing: Move powernv_throttle trace event

As the trace event powernv_throttle is only used by the powernv code, move
it to a separate include file and have that code directly enable it.

Trace events can take up around 5K of memory when they are defined
regardless if they are used or not. It wastes memory to have them defined
in configurations where the tracepoint is not used.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/20250612145407.906308844@goodmis.org
Fixes: 0306e481d479a ("cpufreq: powernv/tracing: Add powernv_throttle tracepoint")
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 drivers/cpufreq/Makefile          |  1 +
 drivers/cpufreq/powernv-cpufreq.c |  4 +++-
 drivers/cpufreq/powernv-trace.h   | 44 +++++++++++++++++++++++++++++++++++++++
 include/trace/events/power.h      | 22 --------------------
 kernel/trace/power-traces.c       |  1 -
 5 files changed, 48 insertions(+), 24 deletions(-)
 create mode 100644 drivers/cpufreq/powernv-trace.h

(limited to 'include')

diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
index d38526b8e063..681d687b5a18 100644
--- a/drivers/cpufreq/Makefile
+++ b/drivers/cpufreq/Makefile
@@ -21,6 +21,7 @@ obj-$(CONFIG_CPUFREQ_VIRT)		+= virtual-cpufreq.o
 
 # Traces
 CFLAGS_amd-pstate-trace.o               := -I$(src)
+CFLAGS_powernv-cpufreq.o                := -I$(src)
 amd_pstate-y				:= amd-pstate.o amd-pstate-trace.o
 
 ##################################################################################
diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
index a8943e2a93be..7d9a5f656de8 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -21,7 +21,6 @@
 #include <linux/string_choices.h>
 #include <linux/cpu.h>
 #include <linux/hashtable.h>
-#include <trace/events/power.h>
 
 #include <asm/cputhreads.h>
 #include <asm/firmware.h>
@@ -30,6 +29,9 @@
 #include <asm/opal.h>
 #include <linux/timer.h>
 
+#define CREATE_TRACE_POINTS
+#include "powernv-trace.h"
+
 #define POWERNV_MAX_PSTATES_ORDER  8
 #define POWERNV_MAX_PSTATES	(1UL << (POWERNV_MAX_PSTATES_ORDER))
 #define PMSR_PSAFE_ENABLE	(1UL << 30)
diff --git a/drivers/cpufreq/powernv-trace.h b/drivers/cpufreq/powernv-trace.h
new file mode 100644
index 000000000000..8cadb7c9427b
--- /dev/null
+++ b/drivers/cpufreq/powernv-trace.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#if !defined(_POWERNV_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _POWERNV_TRACE_H
+
+#include <linux/cpufreq.h>
+#include <linux/tracepoint.h>
+#include <linux/trace_events.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM power
+
+TRACE_EVENT(powernv_throttle,
+
+	TP_PROTO(int chip_id, const char *reason, int pmax),
+
+	TP_ARGS(chip_id, reason, pmax),
+
+	TP_STRUCT__entry(
+		__field(int, chip_id)
+		__string(reason, reason)
+		__field(int, pmax)
+	),
+
+	TP_fast_assign(
+		__entry->chip_id = chip_id;
+		__assign_str(reason);
+		__entry->pmax = pmax;
+	),
+
+	TP_printk("Chip %d Pmax %d %s", __entry->chip_id,
+		  __entry->pmax, __get_str(reason))
+);
+
+#endif /* _POWERNV_TRACE_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE powernv-trace
+
+#include <trace/define_trace.h>
diff --git a/include/trace/events/power.h b/include/trace/events/power.h
index 6c631eec23e3..913181cebfe9 100644
--- a/include/trace/events/power.h
+++ b/include/trace/events/power.h
@@ -99,28 +99,6 @@ DEFINE_EVENT(psci_domain_idle, psci_domain_idle_exit,
 	TP_ARGS(cpu_id, state, s2idle)
 );
 
-TRACE_EVENT(powernv_throttle,
-
-	TP_PROTO(int chip_id, const char *reason, int pmax),
-
-	TP_ARGS(chip_id, reason, pmax),
-
-	TP_STRUCT__entry(
-		__field(int, chip_id)
-		__string(reason, reason)
-		__field(int, pmax)
-	),
-
-	TP_fast_assign(
-		__entry->chip_id = chip_id;
-		__assign_str(reason);
-		__entry->pmax = pmax;
-	),
-
-	TP_printk("Chip %d Pmax %d %s", __entry->chip_id,
-		  __entry->pmax, __get_str(reason))
-);
-
 TRACE_EVENT(pstate_sample,
 
 	TP_PROTO(u32 core_busy,
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index 21bb161c2316..f2fe33573e54 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -17,5 +17,4 @@
 EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume);
 EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
 EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_frequency);
-EXPORT_TRACEPOINT_SYMBOL_GPL(powernv_throttle);
 
-- 
cgit v1.2.3


From e64397f81c5422d5323f5f9a0fd2438e4c403015 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 12 Jun 2025 10:53:12 -0400
Subject: PM: tracing: Hide psci_domain_idle events under ARM_PSCI_CPUIDLE

The psci_domain_idle_enter and psci_domain_idle_exit events are only
called when CONFIG_ARM_PSCI_CPUIDLE is defined. As each event can take up
to 5K (less for DEFINE_EVENT()) regardless of whether it is used, it's
best not to define them when they are not used. Add an #ifdef around
these events so that they are only defined when they are used.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/20250612145408.074769245@goodmis.org
Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/trace/events/power.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/trace/events/power.h b/include/trace/events/power.h
index 913181cebfe9..a10ad300d660 100644
--- a/include/trace/events/power.h
+++ b/include/trace/events/power.h
@@ -62,6 +62,7 @@ TRACE_EVENT(cpu_idle_miss,
 		(unsigned long)__entry->state, (__entry->below)?"below":"above")
 );
 
+#ifdef CONFIG_ARM_PSCI_CPUIDLE
 DECLARE_EVENT_CLASS(psci_domain_idle,
 
 	TP_PROTO(unsigned int cpu_id, unsigned int state, bool s2idle),
@@ -98,6 +99,7 @@ DEFINE_EVENT(psci_domain_idle, psci_domain_idle_exit,
 
 	TP_ARGS(cpu_id, state, s2idle)
 );
+#endif
 
 TRACE_EVENT(pstate_sample,
 
-- 
cgit v1.2.3


From e68849097e82f7e42f3212cefbdcb7d1428e6ffd Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 12 Jun 2025 10:53:13 -0400
Subject: PM: tracing: Hide device_pm_callback events under PM_SLEEP

The device_pm_callback_start and device_pm_callback_end events are only
called when CONFIG_PM_SLEEP is defined. As each event can take up to 5K
regardless of whether it is used, it's best not to define them when they
are not used. Add an #ifdef around these events so that they are only
defined when they are used.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/20250612145408.246703478@goodmis.org
Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/trace/events/power.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/trace/events/power.h b/include/trace/events/power.h
index a10ad300d660..5b1df5e1e092 100644
--- a/include/trace/events/power.h
+++ b/include/trace/events/power.h
@@ -212,6 +212,7 @@ TRACE_EVENT(cpu_frequency_limits,
 		  (unsigned long)__entry->cpu_id)
 );
 
+#ifdef CONFIG_PM_SLEEP
 TRACE_EVENT(device_pm_callback_start,
 
 	TP_PROTO(struct device *dev, const char *pm_ops, int event),
@@ -260,6 +261,7 @@ TRACE_EVENT(device_pm_callback_end,
 	TP_printk("%s %s, err=%d",
 		__get_str(driver), __get_str(device), __entry->error)
 );
+#endif
 
 TRACE_EVENT(suspend_resume,
 
-- 
cgit v1.2.3


From dd4186c2f2733389749182775498b03528622d70 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 12 Jun 2025 10:53:14 -0400
Subject: PM: tracing: Hide power_domain_target event under ARCH_OMAP2PLUS

The power_domain_target event is only called when CONFIG_ARCH_OMAP2PLUS
is defined. As each event can take up to 5K regardless of whether it is
used, it's best not to define it when it is not used. Add an #ifdef
around this event so that it is only defined when it is used.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/20250612145408.415483176@goodmis.org
Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/trace/events/power.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/trace/events/power.h b/include/trace/events/power.h
index 5b1df5e1e092..82904291c2b8 100644
--- a/include/trace/events/power.h
+++ b/include/trace/events/power.h
@@ -319,6 +319,7 @@ DEFINE_EVENT(wakeup_source, wakeup_source_deactivate,
 	TP_ARGS(name, state)
 );
 
+#ifdef CONFIG_ARCH_OMAP2PLUS
 /*
  * The power domain events are used for power domains transitions
  */
@@ -350,6 +351,7 @@ DEFINE_EVENT(power_domain, power_domain_target,
 
 	TP_ARGS(name, state, cpu_id)
 );
+#endif
 
 /*
  * CPU latency QoS events used for global CPU latency QoS list updates
-- 
cgit v1.2.3


From b769777d927af168b1389388392bfd7dc4e38399 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 2 Jun 2025 13:56:48 +0100
Subject: btrfs: use refcount_t type for the extent buffer reference counter

Instead of using a bare atomic, use the refcount_t type, which despite
being a structure that contains only an atomic, has an API that checks
for underflows and other hazards. This doesn't change the size of the
extent_buffer structure.

This removes the need to do things like this:

    WARN_ON(atomic_read(&eb->refs) == 0);
    if (atomic_dec_and_test(&eb->refs)) {
        (...)
    }

And do just:

    if (refcount_dec_and_test(&eb->refs)) {
        (...)
    }

Since refcount_dec_and_test() already triggers a warning when we decrement
a ref count that has a value of 0 (or below zero).

Reviewed-by: Boris Burkov <boris@bur.io>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.c             | 14 +++++++-------
 fs/btrfs/extent-tree.c       |  2 +-
 fs/btrfs/extent_io.c         | 45 ++++++++++++++++++++++----------------------
 fs/btrfs/extent_io.h         |  2 +-
 fs/btrfs/fiemap.c            |  2 +-
 fs/btrfs/print-tree.c        |  2 +-
 fs/btrfs/qgroup.c            |  6 +++---
 fs/btrfs/relocation.c        |  4 ++--
 fs/btrfs/tree-log.c          |  4 ++--
 fs/btrfs/zoned.c             |  2 +-
 include/trace/events/btrfs.h |  2 +-
 11 files changed, 42 insertions(+), 43 deletions(-)

(limited to 'include')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 94c4ed1b99d0..1b36ee2d8044 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -198,7 +198,7 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
 		 * the inc_not_zero dance and if it doesn't work then
 		 * synchronize_rcu and try again.
 		 */
-		if (atomic_inc_not_zero(&eb->refs)) {
+		if (refcount_inc_not_zero(&eb->refs)) {
 			rcu_read_unlock();
 			break;
 		}
@@ -560,7 +560,7 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
 			btrfs_abort_transaction(trans, ret);
 			goto error_unlock_cow;
 		}
-		atomic_inc(&cow->refs);
+		refcount_inc(&cow->refs);
 		rcu_assign_pointer(root->node, cow);
 
 		ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
@@ -1092,7 +1092,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	/* update the path */
 	if (left) {
 		if (btrfs_header_nritems(left) > orig_slot) {
-			atomic_inc(&left->refs);
+			refcount_inc(&left->refs);
 			/* left was locked after cow */
 			path->nodes[level] = left;
 			path->slots[level + 1] -= 1;
@@ -1696,7 +1696,7 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
 
 	if (p->search_commit_root) {
 		b = root->commit_root;
-		atomic_inc(&b->refs);
+		refcount_inc(&b->refs);
 		level = btrfs_header_level(b);
 		/*
 		 * Ensure that all callers have set skip_locking when
@@ -2894,7 +2894,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 	free_extent_buffer(old);
 
 	add_root_to_dirty_list(root);
-	atomic_inc(&c->refs);
+	refcount_inc(&c->refs);
 	path->nodes[level] = c;
 	path->locks[level] = BTRFS_WRITE_LOCK;
 	path->slots[level] = 0;
@@ -4451,7 +4451,7 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
 
 	root_sub_used_bytes(root);
 
-	atomic_inc(&leaf->refs);
+	refcount_inc(&leaf->refs);
 	ret = btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
 	free_extent_buffer_stale(leaf);
 	if (ret < 0)
@@ -4536,7 +4536,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			 * for possible call to btrfs_del_ptr below
 			 */
 			slot = path->slots[1];
-			atomic_inc(&leaf->refs);
+			refcount_inc(&leaf->refs);
 			/*
 			 * We want to be able to at least push one item to the
 			 * left neighbour leaf, and that's the first item.
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2c122f9f8280..46d4963a8241 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6348,7 +6348,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 
 	btrfs_assert_tree_write_locked(parent);
 	parent_level = btrfs_header_level(parent);
-	atomic_inc(&parent->refs);
+	refcount_inc(&parent->refs);
 	path->nodes[parent_level] = parent;
 	path->slots[parent_level] = btrfs_header_nritems(parent);
 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 5e8dd9a99e58..ac639d50706a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -77,7 +77,7 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
 				      struct extent_buffer, leak_list);
 		pr_err(
 	"BTRFS: buffer leak start %llu len %u refs %d bflags %lu owner %llu\n",
-		       eb->start, eb->len, atomic_read(&eb->refs), eb->bflags,
+		       eb->start, eb->len, refcount_read(&eb->refs), eb->bflags,
 		       btrfs_header_owner(eb));
 		list_del(&eb->leak_list);
 		WARN_ON_ONCE(1);
@@ -1961,7 +1961,7 @@ retry:
 	if (!eb)
 		return NULL;
 
-	if (!atomic_inc_not_zero(&eb->refs)) {
+	if (!refcount_inc_not_zero(&eb->refs)) {
 		xas_reset(xas);
 		goto retry;
 	}
@@ -2012,7 +2012,7 @@ static struct extent_buffer *find_extent_buffer_nolock(
 
 	rcu_read_lock();
 	eb = xa_load(&fs_info->buffer_tree, index);
-	if (eb && !atomic_inc_not_zero(&eb->refs))
+	if (eb && !refcount_inc_not_zero(&eb->refs))
 		eb = NULL;
 	rcu_read_unlock();
 	return eb;
@@ -2842,7 +2842,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct btrfs_fs_info *fs_info
 	btrfs_leak_debug_add_eb(eb);
 
 	spin_lock_init(&eb->refs_lock);
-	atomic_set(&eb->refs, 1);
+	refcount_set(&eb->refs, 1);
 
 	ASSERT(eb->len <= BTRFS_MAX_METADATA_BLOCKSIZE);
 
@@ -2975,13 +2975,13 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
 	 * once io is initiated, TREE_REF can no longer be cleared, so that is
 	 * the moment at which any such race is best fixed.
 	 */
-	refs = atomic_read(&eb->refs);
+	refs = refcount_read(&eb->refs);
 	if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
 		return;
 
 	spin_lock(&eb->refs_lock);
 	if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
-		atomic_inc(&eb->refs);
+		refcount_inc(&eb->refs);
 	spin_unlock(&eb->refs_lock);
 }
 
@@ -3047,7 +3047,7 @@ again:
 		return ERR_PTR(ret);
 	}
 	if (exists) {
-		if (!atomic_inc_not_zero(&exists->refs)) {
+		if (!refcount_inc_not_zero(&exists->refs)) {
 			/* The extent buffer is being freed, retry. */
 			xa_unlock_irq(&fs_info->buffer_tree);
 			goto again;
@@ -3092,7 +3092,7 @@ static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info,
 	 * just overwrite folio private.
 	 */
 	exists = folio_get_private(folio);
-	if (atomic_inc_not_zero(&exists->refs))
+	if (refcount_inc_not_zero(&exists->refs))
 		return exists;
 
 	WARN_ON(folio_test_dirty(folio));
@@ -3362,7 +3362,7 @@ again:
 		goto out;
 	}
 	if (existing_eb) {
-		if (!atomic_inc_not_zero(&existing_eb->refs)) {
+		if (!refcount_inc_not_zero(&existing_eb->refs)) {
 			xa_unlock_irq(&fs_info->buffer_tree);
 			goto again;
 		}
@@ -3391,7 +3391,7 @@ again:
 	return eb;
 
 out:
-	WARN_ON(!atomic_dec_and_test(&eb->refs));
+	WARN_ON(!refcount_dec_and_test(&eb->refs));
 
 	/*
 	 * Any attached folios need to be detached before we unlock them.  This
@@ -3437,8 +3437,7 @@ static int release_extent_buffer(struct extent_buffer *eb)
 {
 	lockdep_assert_held(&eb->refs_lock);
 
-	WARN_ON(atomic_read(&eb->refs) == 0);
-	if (atomic_dec_and_test(&eb->refs)) {
+	if (refcount_dec_and_test(&eb->refs)) {
 		struct btrfs_fs_info *fs_info = eb->fs_info;
 
 		spin_unlock(&eb->refs_lock);
@@ -3484,7 +3483,7 @@ void free_extent_buffer(struct extent_buffer *eb)
 	if (!eb)
 		return;
 
-	refs = atomic_read(&eb->refs);
+	refs = refcount_read(&eb->refs);
 	while (1) {
 		if (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)) {
 			if (refs == 1)
@@ -3494,16 +3493,16 @@ void free_extent_buffer(struct extent_buffer *eb)
 		}
 
 		/* Optimization to avoid locking eb->refs_lock. */
-		if (atomic_try_cmpxchg(&eb->refs, &refs, refs - 1))
+		if (atomic_try_cmpxchg(&eb->refs.refs, &refs, refs - 1))
 			return;
 	}
 
 	spin_lock(&eb->refs_lock);
-	if (atomic_read(&eb->refs) == 2 &&
+	if (refcount_read(&eb->refs) == 2 &&
 	    test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
 	    !extent_buffer_under_io(eb) &&
 	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
-		atomic_dec(&eb->refs);
+		refcount_dec(&eb->refs);
 
 	/*
 	 * I know this is terrible, but it's temporary until we stop tracking
@@ -3520,9 +3519,9 @@ void free_extent_buffer_stale(struct extent_buffer *eb)
 	spin_lock(&eb->refs_lock);
 	set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
 
-	if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
+	if (refcount_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
 	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
-		atomic_dec(&eb->refs);
+		refcount_dec(&eb->refs);
 	release_extent_buffer(eb);
 }
 
@@ -3580,7 +3579,7 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
 			btree_clear_folio_dirty_tag(folio);
 		folio_unlock(folio);
 	}
-	WARN_ON(atomic_read(&eb->refs) == 0);
+	WARN_ON(refcount_read(&eb->refs) == 0);
 }
 
 void set_extent_buffer_dirty(struct extent_buffer *eb)
@@ -3591,7 +3590,7 @@ void set_extent_buffer_dirty(struct extent_buffer *eb)
 
 	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
 
-	WARN_ON(atomic_read(&eb->refs) == 0);
+	WARN_ON(refcount_read(&eb->refs) == 0);
 	WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
 	WARN_ON(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags));
 
@@ -3717,7 +3716,7 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num,
 
 	eb->read_mirror = 0;
 	check_buffer_tree_ref(eb);
-	atomic_inc(&eb->refs);
+	refcount_inc(&eb->refs);
 
 	bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES,
 			       REQ_OP_READ | REQ_META, eb->fs_info,
@@ -4312,7 +4311,7 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
 		 * won't disappear out from under us.
 		 */
 		spin_lock(&eb->refs_lock);
-		if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
+		if (refcount_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
 			spin_unlock(&eb->refs_lock);
 			continue;
 		}
@@ -4378,7 +4377,7 @@ int try_release_extent_buffer(struct folio *folio)
 	 * this page.
 	 */
 	spin_lock(&eb->refs_lock);
-	if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
+	if (refcount_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
 		spin_unlock(&eb->refs_lock);
 		spin_unlock(&folio->mapping->i_private_lock);
 		return 0;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index e36e8d6a00bc..65bb87f1dce6 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -98,7 +98,7 @@ struct extent_buffer {
 	void *addr;
 
 	spinlock_t refs_lock;
-	atomic_t refs;
+	refcount_t refs;
 	int read_mirror;
 	/* >= 0 if eb belongs to a log tree, -1 otherwise */
 	s8 log_index;
diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c
index 43bf0979fd53..7935586a9dbd 100644
--- a/fs/btrfs/fiemap.c
+++ b/fs/btrfs/fiemap.c
@@ -320,7 +320,7 @@ static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *p
 	 * the cost of allocating a new one.
 	 */
 	ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, &clone->bflags));
-	atomic_inc(&clone->refs);
+	refcount_inc(&clone->refs);
 
 	ret = btrfs_next_leaf(inode->root, path);
 	if (ret != 0)
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index fc821aa446f0..21605b03f511 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -223,7 +223,7 @@ static void print_eb_refs_lock(const struct extent_buffer *eb)
 {
 #ifdef CONFIG_BTRFS_DEBUG
 	btrfs_info(eb->fs_info, "refs %u lock_owner %u current %u",
-		   atomic_read(&eb->refs), eb->lock_owner, current->pid);
+		   refcount_read(&eb->refs), eb->lock_owner, current->pid);
 #endif
 }
 
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 90685812ee56..a1afc549c404 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2338,7 +2338,7 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
 		btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
 
 	/* For src_path */
-	atomic_inc(&src_eb->refs);
+	refcount_inc(&src_eb->refs);
 	src_path->nodes[root_level] = src_eb;
 	src_path->slots[root_level] = dst_path->slots[root_level];
 	src_path->locks[root_level] = 0;
@@ -2571,7 +2571,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 	/* For dst_path */
-	atomic_inc(&dst_eb->refs);
+	refcount_inc(&dst_eb->refs);
 	dst_path->nodes[level] = dst_eb;
 	dst_path->slots[level] = 0;
 	dst_path->locks[level] = 0;
@@ -2663,7 +2663,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
 	 * walk back up the tree (adjusting slot pointers as we go)
 	 * and restart the search process.
 	 */
-	atomic_inc(&root_eb->refs);	/* For path */
+	refcount_inc(&root_eb->refs);	/* For path */
 	path->nodes[root_level] = root_eb;
 	path->slots[root_level] = 0;
 	path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 0b73f58db33f..d7ec1d72821c 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1524,7 +1524,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 
 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
 		level = btrfs_root_level(root_item);
-		atomic_inc(&reloc_root->node->refs);
+		refcount_inc(&reloc_root->node->refs);
 		path->nodes[level] = reloc_root->node;
 		path->slots[level] = 0;
 	} else {
@@ -4347,7 +4347,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
 		}
 
 		btrfs_backref_drop_node_buffer(node);
-		atomic_inc(&cow->refs);
+		refcount_inc(&cow->refs);
 		node->eb = cow;
 		node->new_bytenr = cow->start;
 
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index cd37c385541c..a7590c8c9996 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2719,7 +2719,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 	level = btrfs_header_level(log->node);
 	orig_level = level;
 	path->nodes[level] = log->node;
-	atomic_inc(&log->node->refs);
+	refcount_inc(&log->node->refs);
 	path->slots[level] = 0;
 
 	while (1) {
@@ -3683,7 +3683,7 @@ static int clone_leaf(struct btrfs_path *path, struct btrfs_log_ctx *ctx)
 	 * Add extra ref to scratch eb so that it is not freed when callers
 	 * release the path, so we can reuse it later if needed.
 	 */
-	atomic_inc(&ctx->scratch_eb->refs);
+	refcount_inc(&ctx->scratch_eb->refs);
 
 	return 0;
 }
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 3fa526a0e37b..805f2eca20e9 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -2485,7 +2485,7 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
 
 	/* For the work */
 	btrfs_get_block_group(bg);
-	atomic_inc(&eb->refs);
+	refcount_inc(&eb->refs);
 	bg->last_eb = eb;
 	INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn);
 	queue_work(system_unbound_wq, &bg->zone_finish_work);
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index bebc252db865..a32305044371 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -1095,7 +1095,7 @@ TRACE_EVENT(btrfs_cow_block,
 	TP_fast_assign_btrfs(root->fs_info,
 		__entry->root_objectid	= btrfs_root_id(root);
 		__entry->buf_start	= buf->start;
-		__entry->refs		= atomic_read(&buf->refs);
+		__entry->refs		= refcount_read(&buf->refs);
 		__entry->cow_start	= cow->start;
 		__entry->buf_level	= btrfs_header_level(buf);
 		__entry->cow_level	= btrfs_header_level(cow);
-- 
cgit v1.2.3


From 44892c5a3e2d779fb056cbb69954d80e718edce1 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Tue, 10 Jun 2025 18:30:09 +0200
Subject: btrfs: tree-log: add and rename extent bits for dirty_log_pages tree

The dirty_log_pages tree is used for tree logging and marks extents
based on log_transid. The bits could be renamed to resemble the
LOG1/LOG2 naming used for the BTRFS_FS_LOG1_ERR bits.

The DIRTY bit is renamed to LOG1 and NEW to LOG2.

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent-io-tree.h        |  3 ++-
 fs/btrfs/extent-tree.c           |  4 ++--
 fs/btrfs/tests/extent-io-tests.c |  3 ++-
 fs/btrfs/transaction.c           |  4 ++--
 fs/btrfs/tree-log.c              | 12 ++++++------
 include/trace/events/btrfs.h     |  3 ++-
 6 files changed, 16 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index 0a18ca9c59c3..819da07bff09 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -19,7 +19,8 @@ enum {
 	ENUM_BIT(EXTENT_DIRTY),
 	ENUM_BIT(EXTENT_LOCKED),
 	ENUM_BIT(EXTENT_DIO_LOCKED),
-	ENUM_BIT(EXTENT_NEW),
+	ENUM_BIT(EXTENT_DIRTY_LOG1),
+	ENUM_BIT(EXTENT_DIRTY_LOG2),
 	ENUM_BIT(EXTENT_DELALLOC),
 	ENUM_BIT(EXTENT_DEFRAG),
 	ENUM_BIT(EXTENT_BOUNDARY),
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b227c4690e5d..348b1920f33b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5113,11 +5113,11 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		if (buf->log_index == 0)
 			btrfs_set_extent_bit(&root->dirty_log_pages, buf->start,
 					     buf->start + buf->len - 1,
-					     EXTENT_DIRTY, NULL);
+					     EXTENT_DIRTY_LOG1, NULL);
 		else
 			btrfs_set_extent_bit(&root->dirty_log_pages, buf->start,
 					     buf->start + buf->len - 1,
-					     EXTENT_NEW, NULL);
+					     EXTENT_DIRTY_LOG2, NULL);
 	} else {
 		buf->log_index = -1;
 		btrfs_set_extent_bit(&trans->transaction->dirty_pages, buf->start,
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 00da54f0164c..557d05220de1 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -75,7 +75,8 @@ static void extent_flag_to_str(const struct extent_state *state, char *dest)
 	dest[0] = 0;
 	PRINT_ONE_FLAG(state, dest, cur, DIRTY);
 	PRINT_ONE_FLAG(state, dest, cur, LOCKED);
-	PRINT_ONE_FLAG(state, dest, cur, NEW);
+	PRINT_ONE_FLAG(state, dest, cur, DIRTY_LOG1);
+	PRINT_ONE_FLAG(state, dest, cur, DIRTY_LOG2);
 	PRINT_ONE_FLAG(state, dest, cur, DELALLOC);
 	PRINT_ONE_FLAG(state, dest, cur, DEFRAG);
 	PRINT_ONE_FLAG(state, dest, cur, BOUNDARY);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 825d135ef6c7..2e07c90be5cd 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1232,11 +1232,11 @@ int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark)
 	ASSERT(btrfs_root_id(log_root) == BTRFS_TREE_LOG_OBJECTID);
 
 	ret = __btrfs_wait_marked_extents(fs_info, dirty_pages);
-	if ((mark & EXTENT_DIRTY) &&
+	if ((mark & EXTENT_DIRTY_LOG1) &&
 	    test_and_clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags))
 		errors = true;
 
-	if ((mark & EXTENT_NEW) &&
+	if ((mark & EXTENT_DIRTY_LOG2) &&
 	    test_and_clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags))
 		errors = true;
 
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index a7590c8c9996..a1e8ba7577f8 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2960,9 +2960,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	}
 
 	if (log_transid % 2 == 0)
-		mark = EXTENT_DIRTY;
+		mark = EXTENT_DIRTY_LOG1;
 	else
-		mark = EXTENT_NEW;
+		mark = EXTENT_DIRTY_LOG2;
 
 	/* we start IO on  all the marked extents here, but we don't actually
 	 * wait for them until later.
@@ -3093,7 +3093,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
 	ret = btrfs_write_marked_extents(fs_info,
 					 &log_root_tree->dirty_log_pages,
-					 EXTENT_DIRTY | EXTENT_NEW);
+					 EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
 	blk_finish_plug(&plug);
 	/*
 	 * As described above, -EAGAIN indicates a hole in the extents. We
@@ -3113,7 +3113,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	ret = btrfs_wait_tree_log_extents(log, mark);
 	if (!ret)
 		ret = btrfs_wait_tree_log_extents(log_root_tree,
-						  EXTENT_NEW | EXTENT_DIRTY);
+						  EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
 	if (ret) {
 		btrfs_set_log_full_commit(trans);
 		mutex_unlock(&log_root_tree->log_mutex);
@@ -3239,9 +3239,9 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
 			 */
 			btrfs_write_marked_extents(log->fs_info,
 						   &log->dirty_log_pages,
-						   EXTENT_DIRTY | EXTENT_NEW);
+						   EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
 			btrfs_wait_tree_log_extents(log,
-						    EXTENT_DIRTY | EXTENT_NEW);
+						    EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
 
 			if (trans)
 				btrfs_abort_transaction(trans, ret);
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index a32305044371..d54fe354b390 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -144,7 +144,8 @@ FLUSH_STATES
 #define EXTENT_FLAGS						\
 	{ EXTENT_DIRTY,			"DIRTY"},		\
 	{ EXTENT_LOCKED,		"LOCKED"},		\
-	{ EXTENT_NEW,			"NEW"},			\
+	{ EXTENT_DIRTY_LOG1,		"DIRTY_LOG1"},		\
+	{ EXTENT_DIRTY_LOG2,		"DIRTY_LOG2"},		\
 	{ EXTENT_DELALLOC,		"DELALLOC"},		\
 	{ EXTENT_DEFRAG,		"DEFRAG"},		\
 	{ EXTENT_BOUNDARY,		"BOUNDARY"},		\
-- 
cgit v1.2.3


From 009b2056cb259c90426b3c57e5b145d1cd9fa9e2 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Wed, 9 Jul 2025 16:29:17 +0200
Subject: btrfs: defrag: add flag to force no-compression

Currently the defrag ioctl cannot rewrite the extents without
compression. Add a new flag for that, because setting compression to 0
(or "no compression") means "do not change the compression", i.e. use
the current default, such as mount options or properties.

The defrag setting overrides mount options and properties. The
compression type BTRFS_DEFRAG_DONT_COMPRESS is only used for in-memory
operations and does not need to have a fixed value.

Mount with zstd:9, copy test file from /usr/bin/ (about 260KB):

  $ mount -o compress=zstd:9 /dev/vda /mnt
  $ filefrag -vsb testfile
  filefrag: -b needs a blocksize option, assuming 1024-byte blocks.
  Filesystem type is: 9123683e
  File size of testfile is 297704 (292 blocks of 1024 bytes)
   ext:     logical_offset:        physical_offset: length:   expected: flags:
     0:        0..     127:      13312..     13439:    128:             encoded
     1:      128..     255:      13364..     13491:    128:      13440: encoded
     2:      256..     291:      13424..     13459:     36:      13492: last,encoded,eof
  testfile: 3 extents found

  $ compsize testfile
  Processed 1 file, 3 regular extents (3 refs), 0 inline, 1 fragments.
  Type       Perc     Disk Usage   Uncompressed Referenced
  TOTAL       42%      124K         292K         292K
  zstd        42%      124K         292K         292K

Defrag to uncompressed:

  $ btrfs fi defrag --nocomp testfile
  $ filefrag -vsb testfile
  filefrag: -b needs a blocksize option, assuming 1024-byte blocks.
  Filesystem type is: 9123683e
  File size of testfile is 297704 (292 blocks of 1024 bytes)
   ext:     logical_offset:        physical_offset: length:   expected: flags:
     0:        0..     291:     291840..    292131:    292:             last,eof
  testfile: 1 extent found

  $ compsize testfile
  Processed 1 file, 1 regular extents (1 refs), 0 inline, 1 fragments.
  Type       Perc     Disk Usage   Uncompressed Referenced
  TOTAL      100%      292K         292K         292K
  none       100%      292K         292K         292K

Compress again with LZO:

  $ btrfs fi defrag -clzo testfile
  $ filefrag -vsb testfile
  filefrag: -b needs a blocksize option, assuming 1024-byte blocks.
  Filesystem type is: 9123683e
  File size of testfile is 297704 (292 blocks of 1024 bytes)
   ext:     logical_offset:        physical_offset: length:   expected: flags:
     0:        0..     127:      13312..     13439:    128:             encoded
     1:      128..     255:      13392..     13519:    128:      13440: encoded
     2:      256..     291:      13480..     13515:     36:      13520: last,encoded,eof
  testfile: 3 extents found

  $ compsize testfile
  Processed 1 file, 3 regular extents (3 refs), 0 inline, 1 fragments.
  Type       Perc     Disk Usage   Uncompressed Referenced
  TOTAL       64%      188K         292K         292K
  lzo         64%      188K         292K         292K

Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/compression.h     |  2 ++
 fs/btrfs/defrag.c          | 13 +++++++++----
 fs/btrfs/inode.c           | 11 +++++++----
 fs/btrfs/ioctl.c           | 10 ++++++++--
 include/uapi/linux/btrfs.h |  3 +++
 5 files changed, 29 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 1df3c8dec40a..1b38e707bbd9 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -113,6 +113,8 @@ enum btrfs_compression_type {
 	BTRFS_COMPRESS_LZO   = 2,
 	BTRFS_COMPRESS_ZSTD  = 3,
 	BTRFS_NR_COMPRESS_TYPES = 4,
+
+	BTRFS_DEFRAG_DONT_COMPRESS,
 };
 
 struct workspace_manager {
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index 701b6b51ea85..738179a5e170 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -947,7 +947,7 @@ struct defrag_target_range {
  * @extent_thresh: file extent size threshold, any extent size >= this value
  *		   will be ignored
  * @newer_than:    only defrag extents newer than this value
- * @do_compress:   whether the defrag is doing compression
+ * @do_compress:   whether the defrag is doing compression or no-compression
  *		   if true, @extent_thresh will be ignored and all regular
  *		   file extents meeting @newer_than will be targets.
  * @locked:	   if the range has already held extent lock
@@ -1364,6 +1364,7 @@ int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra,
 	u64 cur;
 	u64 last_byte;
 	bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS);
+	bool no_compress = (range->flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS);
 	int compress_type = BTRFS_COMPRESS_ZLIB;
 	int compress_level = 0;
 	int ret = 0;
@@ -1394,6 +1395,9 @@ int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra,
 			if (range->compress_type)
 				compress_type = range->compress_type;
 		}
+	} else if (range->flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS) {
+		compress_type = BTRFS_DEFRAG_DONT_COMPRESS;
+		compress_level = 1;
 	}
 
 	if (extent_thresh == 0)
@@ -1444,13 +1448,14 @@ int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra,
 			btrfs_inode_unlock(inode, 0);
 			break;
 		}
-		if (do_compress) {
+		if (do_compress || no_compress) {
 			inode->defrag_compress = compress_type;
 			inode->defrag_compress_level = compress_level;
 		}
 		ret = defrag_one_cluster(inode, ra, cur,
 				cluster_end + 1 - cur, extent_thresh,
-				newer_than, do_compress, &sectors_defragged,
+				newer_than, do_compress || no_compress,
+				&sectors_defragged,
 				max_to_defrag, &last_scanned);
 
 		if (sectors_defragged > prev_sectors_defragged)
@@ -1489,7 +1494,7 @@ int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra,
 			btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
 		ret = sectors_defragged;
 	}
-	if (do_compress) {
+	if (do_compress || no_compress) {
 		btrfs_inode_lock(inode, 0);
 		inode->defrag_compress = BTRFS_COMPRESS_NONE;
 		btrfs_inode_unlock(inode, 0);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7ed340cac33f..b77dd22b8cdb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -781,12 +781,15 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
 		return 0;
 	}
 
+	/* Defrag ioctl takes precedence over mount options and properties. */
+	if (inode->defrag_compress == BTRFS_DEFRAG_DONT_COMPRESS)
+		return 0;
+	if (BTRFS_COMPRESS_NONE < inode->defrag_compress &&
+	    inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES)
+		return 1;
 	/* force compress */
 	if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
 		return 1;
-	/* defrag ioctl */
-	if (inode->defrag_compress)
-		return 1;
 	/* bad compression ratios */
 	if (inode->flags & BTRFS_INODE_NOCOMPRESS)
 		return 0;
@@ -942,7 +945,7 @@ again:
 		goto cleanup_and_bail_uncompressed;
 	}
 
-	if (inode->defrag_compress) {
+	if (0 < inode->defrag_compress && inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES) {
 		compress_type = inode->defrag_compress;
 		compress_level = inode->defrag_compress_level;
 	} else if (inode->prop_compress) {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 680c4e794e67..bf561be18885 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2554,8 +2554,14 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
 				ret = -EOPNOTSUPP;
 				goto out;
 			}
-			/* compression requires us to start the IO */
-			if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
+			if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS) &&
+			    (range.flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS)) {
+				ret = -EINVAL;
+				goto out;
+			}
+			/* Compression or no-compression require to start the IO. */
+			if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS) ||
+			    (range.flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS)) {
 				range.flags |= BTRFS_DEFRAG_RANGE_START_IO;
 				range.extent_thresh = (u32)-1;
 			}
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index dd02160015b2..8e710bbb688e 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -616,8 +616,11 @@ struct btrfs_ioctl_clone_range_args {
 #define BTRFS_DEFRAG_RANGE_COMPRESS 1
 #define BTRFS_DEFRAG_RANGE_START_IO 2
 #define BTRFS_DEFRAG_RANGE_COMPRESS_LEVEL 4
+/* Request no compression on the range (uncompress if necessary). */
+#define BTRFS_DEFRAG_RANGE_NOCOMPRESS	8
 #define BTRFS_DEFRAG_RANGE_FLAGS_SUPP	(BTRFS_DEFRAG_RANGE_COMPRESS |		\
 					 BTRFS_DEFRAG_RANGE_COMPRESS_LEVEL |	\
+					 BTRFS_DEFRAG_RANGE_NOCOMPRESS |	\
 					 BTRFS_DEFRAG_RANGE_START_IO)
 
 struct btrfs_ioctl_defrag_range_args {
-- 
cgit v1.2.3


From 61c3e8940f2d8b5bfeaeec4bedc2f3e7d873abb3 Mon Sep 17 00:00:00 2001
From: Oliver Neukum <oneukum@suse.com>
Date: Thu, 17 Jul 2025 14:06:17 +0200
Subject: net: usb: cdc-ncm: check for filtering capability

If the device does not support filtering, filtering
must not be used and all packets must be delivered for
the upper layers to sort.

Signed-off-by: Oliver Neukum <oneukum@suse.com>
Link: https://patch.msgid.link/20250717120649.2090929-1-oneukum@suse.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/usb/cdc_ncm.c   | 20 ++++++++++++++++----
 include/linux/usb/cdc_ncm.h |  1 +
 2 files changed, 17 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c
index 34e82f1e37d9..ea0e5e276cd6 100644
--- a/drivers/net/usb/cdc_ncm.c
+++ b/drivers/net/usb/cdc_ncm.c
@@ -892,6 +892,10 @@ int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_
 		}
 	}
 
+	if (ctx->func_desc)
+		ctx->filtering_supported = !!(ctx->func_desc->bmNetworkCapabilities
+			& USB_CDC_NCM_NCAP_ETH_FILTER);
+
 	iface_no = ctx->data->cur_altsetting->desc.bInterfaceNumber;
 
 	/* Device-specific flags */
@@ -1898,6 +1902,14 @@ static void cdc_ncm_status(struct usbnet *dev, struct urb *urb)
 	}
 }
 
+static void cdc_ncm_update_filter(struct usbnet *dev)
+{
+	struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0];
+
+	if (ctx->filtering_supported)
+		usbnet_cdc_update_filter(dev);
+}
+
 static const struct driver_info cdc_ncm_info = {
 	.description = "CDC NCM (NO ZLP)",
 	.flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET
@@ -1908,7 +1920,7 @@ static const struct driver_info cdc_ncm_info = {
 	.status = cdc_ncm_status,
 	.rx_fixup = cdc_ncm_rx_fixup,
 	.tx_fixup = cdc_ncm_tx_fixup,
-	.set_rx_mode = usbnet_cdc_update_filter,
+	.set_rx_mode = cdc_ncm_update_filter,
 };
 
 /* Same as cdc_ncm_info, but with FLAG_SEND_ZLP  */
@@ -1922,7 +1934,7 @@ static const struct driver_info cdc_ncm_zlp_info = {
 	.status = cdc_ncm_status,
 	.rx_fixup = cdc_ncm_rx_fixup,
 	.tx_fixup = cdc_ncm_tx_fixup,
-	.set_rx_mode = usbnet_cdc_update_filter,
+	.set_rx_mode = cdc_ncm_update_filter,
 };
 
 /* Same as cdc_ncm_info, but with FLAG_SEND_ZLP */
@@ -1964,7 +1976,7 @@ static const struct driver_info wwan_info = {
 	.status = cdc_ncm_status,
 	.rx_fixup = cdc_ncm_rx_fixup,
 	.tx_fixup = cdc_ncm_tx_fixup,
-	.set_rx_mode = usbnet_cdc_update_filter,
+	.set_rx_mode = cdc_ncm_update_filter,
 };
 
 /* Same as wwan_info, but with FLAG_NOARP  */
@@ -1978,7 +1990,7 @@ static const struct driver_info wwan_noarp_info = {
 	.status = cdc_ncm_status,
 	.rx_fixup = cdc_ncm_rx_fixup,
 	.tx_fixup = cdc_ncm_tx_fixup,
-	.set_rx_mode = usbnet_cdc_update_filter,
+	.set_rx_mode = cdc_ncm_update_filter,
 };
 
 static const struct usb_device_id cdc_devs[] = {
diff --git a/include/linux/usb/cdc_ncm.h b/include/linux/usb/cdc_ncm.h
index 2d207cb4837d..4ac082a63173 100644
--- a/include/linux/usb/cdc_ncm.h
+++ b/include/linux/usb/cdc_ncm.h
@@ -119,6 +119,7 @@ struct cdc_ncm_ctx {
 	u32 timer_interval;
 	u32 max_ndp_size;
 	u8 is_ndp16;
+	u8 filtering_supported;
 	union {
 		struct usb_cdc_ncm_ndp16 *delayed_ndp16;
 		struct usb_cdc_ncm_ndp32 *delayed_ndp32;
-- 
cgit v1.2.3


From 51a62199a8aaac0d1645b1dd8e670a6f35aead81 Mon Sep 17 00:00:00 2001
From: Geliang Tang <tanggeliang@kylinos.cn>
Date: Sat, 19 Jul 2025 00:06:57 +0200
Subject: tcp: add tcp_sock_set_maxseg

Add a helper tcp_sock_set_maxseg() to directly set the TCP_MAXSEG
sockopt from kernel space.

This new helper will be used in the following patch from MPTCP.

Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
Acked-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20250719-net-next-mptcp-tcp_maxseg-v2-2-8c910fbc5307@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/tcp.h |  1 +
 net/ipv4/tcp.c      | 23 ++++++++++++++---------
 2 files changed, 15 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 1a5737b3753d..57e478bfaef2 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -621,6 +621,7 @@ void tcp_sock_set_nodelay(struct sock *sk);
 void tcp_sock_set_quickack(struct sock *sk, int val);
 int tcp_sock_set_syncnt(struct sock *sk, int val);
 int tcp_sock_set_user_timeout(struct sock *sk, int val);
+int tcp_sock_set_maxseg(struct sock *sk, int val);
 
 static inline bool dst_tcp_usec_ts(const struct dst_entry *dst)
 {
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 31149a0ac849..71a956fbfc55 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3751,6 +3751,19 @@ int tcp_set_window_clamp(struct sock *sk, int val)
 	return 0;
 }
 
+int tcp_sock_set_maxseg(struct sock *sk, int val)
+{
+	/* Values greater than interface MTU won't take effect. However
+	 * at the point when this call is done we typically don't yet
+	 * know which interface is going to be used
+	 */
+	if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW))
+		return -EINVAL;
+
+	tcp_sk(sk)->rx_opt.user_mss = val;
+	return 0;
+}
+
 /*
  *	Socket option code for TCP.
  */
@@ -3883,15 +3896,7 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
 
 	switch (optname) {
 	case TCP_MAXSEG:
-		/* Values greater than interface MTU won't take effect. However
-		 * at the point when this call is done we typically don't yet
-		 * know which interface is going to be used
-		 */
-		if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) {
-			err = -EINVAL;
-			break;
-		}
-		tp->rx_opt.user_mss = val;
+		err = tcp_sock_set_maxseg(sk, val);
 		break;
 
 	case TCP_NODELAY:
-- 
cgit v1.2.3


From a166ab7816c534973745b0fe7bce3c8cefc5426f Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 17 Jul 2025 16:43:41 -0700
Subject: ethtool: rss: support creating contexts via Netlink

Support creating contexts via Netlink. Setting flow hashing
fields on the new context is not supported at this stage,
it can be added later.

An empty indirection table is not supported. This is a carry
over from the IOCTL interface where empty indirection table
meant delete. We can repurpose empty indirection table in
Netlink but for now to avoid confusion reject it using the
policy.

Support letting user choose the ID for the new context. This was
not possible in IOCTL since the context ID field for the create
action had to be set to the ETH_RXFH_CONTEXT_ALLOC magic value.

Link: https://patch.msgid.link/20250717234343.2328602-7-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/ethtool.yaml       |  23 ++-
 Documentation/networking/ethtool-netlink.rst   |  27 ++++
 include/uapi/linux/ethtool_netlink_generated.h |   3 +
 net/ethtool/ioctl.c                            |   1 +
 net/ethtool/netlink.c                          |  15 ++
 net/ethtool/netlink.h                          |   3 +
 net/ethtool/rss.c                              | 203 +++++++++++++++++++++++++
 7 files changed, 273 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml
index 069269edde01..25ffed5fddd5 100644
--- a/Documentation/netlink/specs/ethtool.yaml
+++ b/Documentation/netlink/specs/ethtool.yaml
@@ -2684,9 +2684,28 @@ operations:
       name: rss-ntf
       doc: |
         Notification for change in RSS configuration.
-        For additional contexts only modifications are modified, not creation
-        or removal of the contexts.
+        For additional contexts only modifications use this notification,
+        creation and deletion have dedicated messages.
       notify: rss-get
+    -
+      name: rss-create-act
+      doc: Create an RSS context.
+      attribute-set: rss
+      do:
+        request: &rss-create-attrs
+          attributes:
+            - header
+            - context
+            - hfunc
+            - indir
+            - hkey
+            - input-xfrm
+        reply: *rss-create-attrs
+    -
+      name: rss-create-ntf
+      doc: |
+        Notification for creation of an additional RSS context.
+      notify: rss-create-act
 
 mcast-groups:
   list:
diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst
index 056832c77ffd..2646fafb8512 100644
--- a/Documentation/networking/ethtool-netlink.rst
+++ b/Documentation/networking/ethtool-netlink.rst
@@ -240,6 +240,7 @@ Userspace to kernel:
   ``ETHTOOL_MSG_TSCONFIG_GET``          get hw timestamping configuration
   ``ETHTOOL_MSG_TSCONFIG_SET``          set hw timestamping configuration
   ``ETHTOOL_MSG_RSS_SET``               set RSS settings
+  ``ETHTOOL_MSG_RSS_CREATE_ACT``        create an additional RSS context
   ===================================== =================================
 
 Kernel to userspace:
@@ -294,6 +295,8 @@ Kernel to userspace:
   ``ETHTOOL_MSG_TSCONFIG_SET_REPLY``       new hw timestamping configuration
   ``ETHTOOL_MSG_PSE_NTF``                  PSE events notification
   ``ETHTOOL_MSG_RSS_NTF``                  RSS settings notification
+  ``ETHTOOL_MSG_RSS_CREATE_ACT_REPLY``     create an additional RSS context
+  ``ETHTOOL_MSG_RSS_CREATE_NTF``           additional RSS context created
   ======================================== =================================
 
 ``GET`` requests are sent by userspace applications to retrieve device
@@ -2014,6 +2017,30 @@ device needs at least 8 entries - the real table in use will end up being
 of 2, so tables which size is not a power of 2 will likely be rejected.
 Using table of size 0 will reset the indirection table to the default.
 
+RSS_CREATE_ACT
+==============
+
+Request contents:
+
+=====================================  ======  ==============================
+  ``ETHTOOL_A_RSS_HEADER``             nested  request header
+  ``ETHTOOL_A_RSS_CONTEXT``            u32     context number
+  ``ETHTOOL_A_RSS_HFUNC``              u32     RSS hash func
+  ``ETHTOOL_A_RSS_INDIR``              binary  Indir table bytes
+  ``ETHTOOL_A_RSS_HKEY``               binary  Hash key bytes
+  ``ETHTOOL_A_RSS_INPUT_XFRM``         u32     RSS input data transformation
+=====================================  ======  ==============================
+
+Kernel response contents:
+
+=====================================  ======  ==============================
+  ``ETHTOOL_A_RSS_HEADER``             nested  request header
+  ``ETHTOOL_A_RSS_CONTEXT``            u32     context number
+=====================================  ======  ==============================
+
+Create an additional RSS context, if ``ETHTOOL_A_RSS_CONTEXT`` is not
+specified kernel will allocate one automatically.
+
 PLCA_GET_CFG
 ============
 
diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h
index 130bdf5c3516..dea77abd295f 100644
--- a/include/uapi/linux/ethtool_netlink_generated.h
+++ b/include/uapi/linux/ethtool_netlink_generated.h
@@ -841,6 +841,7 @@ enum {
 	ETHTOOL_MSG_TSCONFIG_GET,
 	ETHTOOL_MSG_TSCONFIG_SET,
 	ETHTOOL_MSG_RSS_SET,
+	ETHTOOL_MSG_RSS_CREATE_ACT,
 
 	__ETHTOOL_MSG_USER_CNT,
 	ETHTOOL_MSG_USER_MAX = (__ETHTOOL_MSG_USER_CNT - 1)
@@ -898,6 +899,8 @@ enum {
 	ETHTOOL_MSG_TSCONFIG_SET_REPLY,
 	ETHTOOL_MSG_PSE_NTF,
 	ETHTOOL_MSG_RSS_NTF,
+	ETHTOOL_MSG_RSS_CREATE_ACT_REPLY,
+	ETHTOOL_MSG_RSS_CREATE_NTF,
 
 	__ETHTOOL_MSG_KERNEL_CNT,
 	ETHTOOL_MSG_KERNEL_MAX = (__ETHTOOL_MSG_KERNEL_CNT - 1)
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index c53868889969..4b586b0f18e8 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -1640,6 +1640,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
 		ntf = ETHTOOL_MSG_RSS_NTF;
 		ret = ops->set_rxfh(dev, &rxfh_dev, extack);
 	} else if (create) {
+		ntf = ETHTOOL_MSG_RSS_CREATE_NTF;
 		ret = ops->create_rxfh_context(dev, ctx, &rxfh_dev, extack);
 		/* Make sure driver populates defaults */
 		WARN_ON_ONCE(!ret && !rxfh_dev.key && ops->rxfh_per_ctx_key &&
diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c
index 0ae0d7a9667c..e9696113a96b 100644
--- a/net/ethtool/netlink.c
+++ b/net/ethtool/netlink.c
@@ -81,6 +81,12 @@ static void ethnl_sock_priv_destroy(void *priv)
 	}
 }
 
+u32 ethnl_bcast_seq_next(void)
+{
+	ASSERT_RTNL();
+	return ++ethnl_bcast_seq;
+}
+
 int ethnl_ops_begin(struct net_device *dev)
 {
 	int ret;
@@ -954,6 +960,7 @@ ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = {
 	[ETHTOOL_MSG_PLCA_NTF]		= &ethnl_plca_cfg_request_ops,
 	[ETHTOOL_MSG_MM_NTF]		= &ethnl_mm_request_ops,
 	[ETHTOOL_MSG_RSS_NTF]		= &ethnl_rss_request_ops,
+	[ETHTOOL_MSG_RSS_CREATE_NTF]	= &ethnl_rss_request_ops,
 };
 
 /* default notification handler */
@@ -1061,6 +1068,7 @@ static const ethnl_notify_handler_t ethnl_notify_handlers[] = {
 	[ETHTOOL_MSG_PLCA_NTF]		= ethnl_default_notify,
 	[ETHTOOL_MSG_MM_NTF]		= ethnl_default_notify,
 	[ETHTOOL_MSG_RSS_NTF]		= ethnl_default_notify,
+	[ETHTOOL_MSG_RSS_CREATE_NTF]	= ethnl_default_notify,
 };
 
 void ethnl_notify(struct net_device *dev, unsigned int cmd,
@@ -1512,6 +1520,13 @@ static const struct genl_ops ethtool_genl_ops[] = {
 		.policy = ethnl_rss_set_policy,
 		.maxattr = ARRAY_SIZE(ethnl_rss_set_policy) - 1,
 	},
+	{
+		.cmd	= ETHTOOL_MSG_RSS_CREATE_ACT,
+		.flags	= GENL_UNS_ADMIN_PERM,
+		.doit	= ethnl_rss_create_doit,
+		.policy	= ethnl_rss_create_policy,
+		.maxattr = ARRAY_SIZE(ethnl_rss_create_policy) - 1,
+	},
 };
 
 static const struct genl_multicast_group ethtool_nl_mcgrps[] = {
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
index ddb2fb00f929..b530bf9f85ee 100644
--- a/net/ethtool/netlink.h
+++ b/net/ethtool/netlink.h
@@ -10,6 +10,7 @@
 
 struct ethnl_req_info;
 
+u32 ethnl_bcast_seq_next(void);
 int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info,
 			       const struct nlattr *nest, struct net *net,
 			       struct netlink_ext_ack *extack,
@@ -485,6 +486,7 @@ extern const struct nla_policy ethnl_pse_get_policy[ETHTOOL_A_PSE_HEADER + 1];
 extern const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1];
 extern const struct nla_policy ethnl_rss_get_policy[ETHTOOL_A_RSS_START_CONTEXT + 1];
 extern const struct nla_policy ethnl_rss_set_policy[ETHTOOL_A_RSS_FLOW_HASH + 1];
+extern const struct nla_policy ethnl_rss_create_policy[ETHTOOL_A_RSS_INPUT_XFRM + 1];
 extern const struct nla_policy ethnl_plca_get_cfg_policy[ETHTOOL_A_PLCA_HEADER + 1];
 extern const struct nla_policy ethnl_plca_set_cfg_policy[ETHTOOL_A_PLCA_MAX + 1];
 extern const struct nla_policy ethnl_plca_get_status_policy[ETHTOOL_A_PLCA_HEADER + 1];
@@ -507,6 +509,7 @@ int ethnl_rss_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
 int ethnl_tsinfo_start(struct netlink_callback *cb);
 int ethnl_tsinfo_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
 int ethnl_tsinfo_done(struct netlink_callback *cb);
+int ethnl_rss_create_doit(struct sk_buff *skb, struct genl_info *info);
 
 extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN];
 extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN];
diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c
index e5516e529b4a..be092dfa4407 100644
--- a/net/ethtool/rss.c
+++ b/net/ethtool/rss.c
@@ -893,3 +893,206 @@ const struct ethnl_request_ops ethnl_rss_request_ops = {
 	.set			= ethnl_rss_set,
 	.set_ntf_cmd		= ETHTOOL_MSG_RSS_NTF,
 };
+
+/* RSS_CREATE */
+
+const struct nla_policy ethnl_rss_create_policy[ETHTOOL_A_RSS_INPUT_XFRM + 1] = {
+	[ETHTOOL_A_RSS_HEADER]	= NLA_POLICY_NESTED(ethnl_header_policy),
+	[ETHTOOL_A_RSS_CONTEXT]	= NLA_POLICY_MIN(NLA_U32, 1),
+	[ETHTOOL_A_RSS_HFUNC]	= NLA_POLICY_MIN(NLA_U32, 1),
+	[ETHTOOL_A_RSS_INDIR]	= NLA_POLICY_MIN(NLA_BINARY, 1),
+	[ETHTOOL_A_RSS_HKEY]	= NLA_POLICY_MIN(NLA_BINARY, 1),
+	[ETHTOOL_A_RSS_INPUT_XFRM] =
+		NLA_POLICY_MAX(NLA_U32, RXH_XFRM_SYM_OR_XOR),
+};
+
+static int
+ethnl_rss_create_validate(struct net_device *dev, struct genl_info *info)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct nlattr **tb = info->attrs;
+	struct nlattr *bad_attr = NULL;
+	u32 rss_context, input_xfrm;
+
+	if (!ops->create_rxfh_context)
+		return -EOPNOTSUPP;
+
+	rss_context = nla_get_u32_default(tb[ETHTOOL_A_RSS_CONTEXT], 0);
+	if (ops->rxfh_max_num_contexts &&
+	    ops->rxfh_max_num_contexts <= rss_context) {
+		NL_SET_BAD_ATTR(info->extack, tb[ETHTOOL_A_RSS_CONTEXT]);
+		return -ERANGE;
+	}
+
+	if (!ops->rxfh_per_ctx_key) {
+		bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_HFUNC];
+		bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_HKEY];
+		bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_INPUT_XFRM];
+	}
+
+	input_xfrm = nla_get_u32_default(tb[ETHTOOL_A_RSS_INPUT_XFRM], 0);
+	if (input_xfrm & ~ops->supported_input_xfrm)
+		bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_INPUT_XFRM];
+
+	if (bad_attr) {
+		NL_SET_BAD_ATTR(info->extack, bad_attr);
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static void
+ethnl_rss_create_send_ntf(struct sk_buff *rsp, struct net_device *dev)
+{
+	struct nlmsghdr *nlh = (void *)rsp->data;
+	struct genlmsghdr *genl_hdr;
+
+	/* Convert the reply into a notification */
+	nlh->nlmsg_pid = 0;
+	nlh->nlmsg_seq = ethnl_bcast_seq_next();
+
+	genl_hdr = nlmsg_data(nlh);
+	genl_hdr->cmd =	ETHTOOL_MSG_RSS_CREATE_NTF;
+
+	ethnl_multicast(rsp, dev);
+}
+
+int ethnl_rss_create_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	bool indir_dflt = false, mod = false, ntf_fail = false;
+	struct ethtool_rxfh_param rxfh = {};
+	struct ethtool_rxfh_context *ctx;
+	struct nlattr **tb = info->attrs;
+	struct rss_reply_data data = {};
+	const struct ethtool_ops *ops;
+	struct rss_req_info req = {};
+	struct net_device *dev;
+	struct sk_buff *rsp;
+	void *hdr;
+	u32 limit;
+	int ret;
+
+	rsp = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!rsp)
+		return -ENOMEM;
+
+	ret = ethnl_parse_header_dev_get(&req.base, tb[ETHTOOL_A_RSS_HEADER],
+					 genl_info_net(info), info->extack,
+					 true);
+	if (ret < 0)
+		goto exit_free_rsp;
+
+	dev = req.base.dev;
+	ops = dev->ethtool_ops;
+
+	req.rss_context = nla_get_u32_default(tb[ETHTOOL_A_RSS_CONTEXT], 0);
+
+	ret = ethnl_rss_create_validate(dev, info);
+	if (ret)
+		goto exit_free_dev;
+
+	rtnl_lock();
+	netdev_lock_ops(dev);
+
+	ret = ethnl_ops_begin(dev);
+	if (ret < 0)
+		goto exit_dev_unlock;
+
+	ret = rss_get_data_alloc(dev, &data);
+	if (ret)
+		goto exit_ops;
+
+	ret = rss_set_prep_indir(dev, info, &data, &rxfh, &indir_dflt, &mod);
+	if (ret)
+		goto exit_clean_data;
+
+	ethnl_update_u8(&rxfh.hfunc, tb[ETHTOOL_A_RSS_HFUNC], &mod);
+
+	ret = rss_set_prep_hkey(dev, info, &data, &rxfh, &mod);
+	if (ret)
+		goto exit_free_indir;
+
+	rxfh.input_xfrm = RXH_XFRM_NO_CHANGE;
+	ethnl_update_u8(&rxfh.input_xfrm, tb[ETHTOOL_A_RSS_INPUT_XFRM], &mod);
+
+	ctx = ethtool_rxfh_ctx_alloc(ops, data.indir_size, data.hkey_size);
+	if (!ctx) {
+		ret = -ENOMEM;
+		goto exit_free_hkey;
+	}
+
+	mutex_lock(&dev->ethtool->rss_lock);
+	if (!req.rss_context) {
+		limit = ops->rxfh_max_num_contexts ?: U32_MAX;
+		ret = xa_alloc(&dev->ethtool->rss_ctx, &req.rss_context, ctx,
+			       XA_LIMIT(1, limit - 1), GFP_KERNEL_ACCOUNT);
+	} else {
+		ret = xa_insert(&dev->ethtool->rss_ctx,
+				req.rss_context, ctx, GFP_KERNEL_ACCOUNT);
+	}
+	if (ret < 0) {
+		NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RSS_CONTEXT],
+				    "error allocating context ID");
+		goto err_unlock_free_ctx;
+	}
+	rxfh.rss_context = req.rss_context;
+
+	ret = ops->create_rxfh_context(dev, ctx, &rxfh, info->extack);
+	if (ret)
+		goto err_ctx_id_free;
+
+	/* Make sure driver populates defaults */
+	WARN_ON_ONCE(!rxfh.key && ops->rxfh_per_ctx_key &&
+		     !memchr_inv(ethtool_rxfh_context_key(ctx), 0,
+				 ctx->key_size));
+
+	/* Store the config from rxfh to Xarray.. */
+	rss_set_ctx_update(ctx, tb, &data, &rxfh);
+	/* .. copy from Xarray to data. */
+	__rss_prepare_ctx(dev, &data, ctx);
+
+	hdr = ethnl_unicast_put(rsp, info->snd_portid, info->snd_seq,
+				ETHTOOL_MSG_RSS_CREATE_ACT_REPLY);
+	ntf_fail = ethnl_fill_reply_header(rsp, dev, ETHTOOL_A_RSS_HEADER);
+	ntf_fail |= rss_fill_reply(rsp, &req.base, &data.base);
+	if (WARN_ON(!hdr || ntf_fail)) {
+		ret = -EMSGSIZE;
+		goto exit_unlock;
+	}
+
+	genlmsg_end(rsp, hdr);
+
+	/* Use the same skb for the response and the notification,
+	 * genlmsg_reply() will copy the skb if it has elevated user count.
+	 */
+	skb_get(rsp);
+	ret = genlmsg_reply(rsp, info);
+	ethnl_rss_create_send_ntf(rsp, dev);
+	rsp = NULL;
+
+exit_unlock:
+	mutex_unlock(&dev->ethtool->rss_lock);
+exit_free_hkey:
+	kfree(rxfh.key);
+exit_free_indir:
+	kfree(rxfh.indir);
+exit_clean_data:
+	rss_get_data_free(&data);
+exit_ops:
+	ethnl_ops_complete(dev);
+exit_dev_unlock:
+	netdev_unlock_ops(dev);
+	rtnl_unlock();
+exit_free_dev:
+	ethnl_parse_header_dev_put(&req.base);
+exit_free_rsp:
+	nlmsg_free(rsp);
+	return ret;
+
+err_ctx_id_free:
+	xa_erase(&dev->ethtool->rss_ctx, req.rss_context);
+err_unlock_free_ctx:
+	kfree(ctx);
+	goto exit_unlock;
+}
-- 
cgit v1.2.3


From fbe09277fa6324b50cc4eedb4d99498cf7dad897 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 17 Jul 2025 16:43:42 -0700
Subject: ethtool: rss: support removing contexts via Netlink

Implement removing additional RSS contexts via Netlink.
Technically it'd be possible to shoehorn the delete operation
into an ethnl_request_ops-compatible handler, but the code ends
up longer than the open-coded version, and I think we'll need
a custom way of sending notifications at some stage (if we
allow tying the context lifetime to the netlink socket in
the future).

Link: https://patch.msgid.link/20250717234343.2328602-8-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/ethtool.yaml       |  18 ++++
 Documentation/networking/ethtool-netlink.rst   |  14 ++++
 include/uapi/linux/ethtool_netlink_generated.h |   2 +
 net/ethtool/common.c                           |   1 +
 net/ethtool/ioctl.c                            |   1 +
 net/ethtool/netlink.c                          |   7 ++
 net/ethtool/netlink.h                          |   2 +
 net/ethtool/rss.c                              | 109 ++++++++++++++++++++++++-
 8 files changed, 153 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml
index 25ffed5fddd5..1063d5d32fea 100644
--- a/Documentation/netlink/specs/ethtool.yaml
+++ b/Documentation/netlink/specs/ethtool.yaml
@@ -2706,6 +2706,24 @@ operations:
       doc: |
         Notification for creation of an additional RSS context.
       notify: rss-create-act
+    -
+      name: rss-delete-act
+      doc: Delete an RSS context.
+      attribute-set: rss
+      do:
+        request:
+          attributes:
+            - header
+            - context
+    -
+      name: rss-delete-ntf
+      doc: |
+        Notification for deletion of an additional RSS context.
+      attribute-set: rss
+      event:
+        attributes:
+          - header
+          - context
 
 mcast-groups:
   list:
diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst
index 2646fafb8512..ab20c644af24 100644
--- a/Documentation/networking/ethtool-netlink.rst
+++ b/Documentation/networking/ethtool-netlink.rst
@@ -241,6 +241,7 @@ Userspace to kernel:
   ``ETHTOOL_MSG_TSCONFIG_SET``          set hw timestamping configuration
   ``ETHTOOL_MSG_RSS_SET``               set RSS settings
   ``ETHTOOL_MSG_RSS_CREATE_ACT``        create an additional RSS context
+  ``ETHTOOL_MSG_RSS_DELETE_ACT``        delete an additional RSS context
   ===================================== =================================
 
 Kernel to userspace:
@@ -297,6 +298,7 @@ Kernel to userspace:
   ``ETHTOOL_MSG_RSS_NTF``                  RSS settings notification
   ``ETHTOOL_MSG_RSS_CREATE_ACT_REPLY``     create an additional RSS context
   ``ETHTOOL_MSG_RSS_CREATE_NTF``           additional RSS context created
+  ``ETHTOOL_MSG_RSS_DELETE_NTF``           additional RSS context deleted
   ======================================== =================================
 
 ``GET`` requests are sent by userspace applications to retrieve device
@@ -2041,6 +2043,18 @@ Kernel response contents:
 Create an additional RSS context, if ``ETHTOOL_A_RSS_CONTEXT`` is not
 specified kernel will allocate one automatically.
 
+RSS_DELETE_ACT
+==============
+
+Request contents:
+
+=====================================  ======  ==============================
+  ``ETHTOOL_A_RSS_HEADER``             nested  request header
+  ``ETHTOOL_A_RSS_CONTEXT``            u32     context number
+=====================================  ======  ==============================
+
+Delete an additional RSS context.
+
 PLCA_GET_CFG
 ============
 
diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h
index dea77abd295f..e3b8813465d7 100644
--- a/include/uapi/linux/ethtool_netlink_generated.h
+++ b/include/uapi/linux/ethtool_netlink_generated.h
@@ -842,6 +842,7 @@ enum {
 	ETHTOOL_MSG_TSCONFIG_SET,
 	ETHTOOL_MSG_RSS_SET,
 	ETHTOOL_MSG_RSS_CREATE_ACT,
+	ETHTOOL_MSG_RSS_DELETE_ACT,
 
 	__ETHTOOL_MSG_USER_CNT,
 	ETHTOOL_MSG_USER_MAX = (__ETHTOOL_MSG_USER_CNT - 1)
@@ -901,6 +902,7 @@ enum {
 	ETHTOOL_MSG_RSS_NTF,
 	ETHTOOL_MSG_RSS_CREATE_ACT_REPLY,
 	ETHTOOL_MSG_RSS_CREATE_NTF,
+	ETHTOOL_MSG_RSS_DELETE_NTF,
 
 	__ETHTOOL_MSG_KERNEL_CNT,
 	ETHTOOL_MSG_KERNEL_MAX = (__ETHTOOL_MSG_KERNEL_CNT - 1)
diff --git a/net/ethtool/common.c b/net/ethtool/common.c
index 2a1d40efb1fc..4f58648a27ad 100644
--- a/net/ethtool/common.c
+++ b/net/ethtool/common.c
@@ -1136,5 +1136,6 @@ void ethtool_rxfh_context_lost(struct net_device *dev, u32 context_id)
 	netdev_err(dev, "device error, RSS context %d lost\n", context_id);
 	ctx = xa_erase(&dev->ethtool->rss_ctx, context_id);
 	kfree(ctx);
+	ethtool_rss_notify(dev, ETHTOOL_MSG_RSS_DELETE_NTF, context_id);
 }
 EXPORT_SYMBOL(ethtool_rxfh_context_lost);
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index 4b586b0f18e8..43a7854e784e 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -1647,6 +1647,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
 			     !memchr_inv(ethtool_rxfh_context_key(ctx), 0,
 					 ctx->key_size));
 	} else if (rxfh_dev.rss_delete) {
+		ntf = ETHTOOL_MSG_RSS_DELETE_NTF;
 		ret = ops->remove_rxfh_context(dev, ctx, rxfh.rss_context,
 					       extack);
 	} else {
diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c
index e9696113a96b..2f813f25f07e 100644
--- a/net/ethtool/netlink.c
+++ b/net/ethtool/netlink.c
@@ -1527,6 +1527,13 @@ static const struct genl_ops ethtool_genl_ops[] = {
 		.policy	= ethnl_rss_create_policy,
 		.maxattr = ARRAY_SIZE(ethnl_rss_create_policy) - 1,
 	},
+	{
+		.cmd	= ETHTOOL_MSG_RSS_DELETE_ACT,
+		.flags	= GENL_UNS_ADMIN_PERM,
+		.doit	= ethnl_rss_delete_doit,
+		.policy	= ethnl_rss_delete_policy,
+		.maxattr = ARRAY_SIZE(ethnl_rss_delete_policy) - 1,
+	},
 };
 
 static const struct genl_multicast_group ethtool_nl_mcgrps[] = {
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
index b530bf9f85ee..1d4f9ecb3d26 100644
--- a/net/ethtool/netlink.h
+++ b/net/ethtool/netlink.h
@@ -487,6 +487,7 @@ extern const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1];
 extern const struct nla_policy ethnl_rss_get_policy[ETHTOOL_A_RSS_START_CONTEXT + 1];
 extern const struct nla_policy ethnl_rss_set_policy[ETHTOOL_A_RSS_FLOW_HASH + 1];
 extern const struct nla_policy ethnl_rss_create_policy[ETHTOOL_A_RSS_INPUT_XFRM + 1];
+extern const struct nla_policy ethnl_rss_delete_policy[ETHTOOL_A_RSS_CONTEXT + 1];
 extern const struct nla_policy ethnl_plca_get_cfg_policy[ETHTOOL_A_PLCA_HEADER + 1];
 extern const struct nla_policy ethnl_plca_set_cfg_policy[ETHTOOL_A_PLCA_MAX + 1];
 extern const struct nla_policy ethnl_plca_get_status_policy[ETHTOOL_A_PLCA_HEADER + 1];
@@ -510,6 +511,7 @@ int ethnl_tsinfo_start(struct netlink_callback *cb);
 int ethnl_tsinfo_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
 int ethnl_tsinfo_done(struct netlink_callback *cb);
 int ethnl_rss_create_doit(struct sk_buff *skb, struct genl_info *info);
+int ethnl_rss_delete_doit(struct sk_buff *skb, struct genl_info *info);
 
 extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN];
 extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN];
diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c
index be092dfa4407..992e98abe9dd 100644
--- a/net/ethtool/rss.c
+++ b/net/ethtool/rss.c
@@ -486,13 +486,49 @@ int ethnl_rss_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
 
 /* RSS_NTF */
 
+/* Multicast ETHTOOL_MSG_RSS_DELETE_NTF with the header and the context ID */
+static void ethnl_rss_delete_notify(struct net_device *dev, u32 rss_context)
+{
+	struct sk_buff *ntf;
+	size_t ntf_size;
+	void *hdr;
+
+	ntf_size = ethnl_reply_header_size() +
+		nla_total_size(sizeof(u32));	/* _RSS_CONTEXT */
+	ntf = genlmsg_new(ntf_size, GFP_KERNEL);
+	if (!ntf)
+		goto out_warn;
+
+	hdr = ethnl_bcastmsg_put(ntf, ETHTOOL_MSG_RSS_DELETE_NTF);
+	if (!hdr)
+		goto out_free_ntf;
+
+	if (ethnl_fill_reply_header(ntf, dev, ETHTOOL_A_RSS_HEADER) ||
+	    nla_put_u32(ntf, ETHTOOL_A_RSS_CONTEXT, rss_context))
+		goto out_free_ntf;
+
+	genlmsg_end(ntf, hdr);
+	if (ethnl_multicast(ntf, dev))
+		goto out_warn;
+
+	return;
+
+out_free_ntf:
+	nlmsg_free(ntf);
+out_warn:
+	pr_warn_once("Failed to send a RSS delete notification\n");
+}
+
 void ethtool_rss_notify(struct net_device *dev, u32 type, u32 rss_context)
 {
 	struct rss_req_info req_info = {
 		.rss_context = rss_context,
 	};
 
-	ethnl_notify(dev, type, &req_info.base);
+	if (type == ETHTOOL_MSG_RSS_DELETE_NTF)
+		ethnl_rss_delete_notify(dev, rss_context);
+	else
+		ethnl_notify(dev, type, &req_info.base);
 }
 
 /* RSS_SET */
@@ -1096,3 +1132,82 @@ err_unlock_free_ctx:
 	kfree(ctx);
 	goto exit_unlock;
 }
+
+/* RSS_DELETE */
+
+const struct nla_policy ethnl_rss_delete_policy[ETHTOOL_A_RSS_CONTEXT + 1] = {
+	[ETHTOOL_A_RSS_HEADER]	= NLA_POLICY_NESTED(ethnl_header_policy),
+	[ETHTOOL_A_RSS_CONTEXT]	= NLA_POLICY_MIN(NLA_U32, 1),
+};
+
+/* Handle ETHTOOL_MSG_RSS_DELETE_ACT: remove the additional RSS context given
+ * by ETHTOOL_A_RSS_CONTEXT via the driver's remove_rxfh_context(), erase it
+ * from dev->ethtool->rss_ctx and send a delete notification.
+ * Returns 0 on success or a negative errno.
+ */
+int ethnl_rss_delete_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct ethtool_rxfh_context *ctx;
+	struct nlattr **tb = info->attrs;
+	struct ethnl_req_info req = {};
+	const struct ethtool_ops *ops;
+	struct net_device *dev;
+	u32 rss_context;
+	int ret;
+
+	if (GENL_REQ_ATTR_CHECK(info, ETHTOOL_A_RSS_CONTEXT))
+		return -EINVAL;
+	rss_context = nla_get_u32(tb[ETHTOOL_A_RSS_CONTEXT]);
+
+	ret = ethnl_parse_header_dev_get(&req, tb[ETHTOOL_A_RSS_HEADER],
+					 genl_info_net(info), info->extack,
+					 true);
+	if (ret < 0)
+		return ret;
+
+	dev = req.dev;
+	ops = dev->ethtool_ops;
+
+	/* ret is 0 at this point; report the lack of driver support */
+	if (!ops->create_rxfh_context) {
+		ret = -EOPNOTSUPP;
+		goto exit_free_dev;
+	}
+
+	rtnl_lock();
+	netdev_lock_ops(dev);
+
+	ret = ethnl_ops_begin(dev);
+	if (ret < 0)
+		goto exit_dev_unlock;
+
+	mutex_lock(&dev->ethtool->rss_lock);
+	ret = ethtool_check_rss_ctx_busy(dev, rss_context);
+	if (ret)
+		goto exit_unlock;
+
+	ctx = xa_load(&dev->ethtool->rss_ctx, rss_context);
+	if (!ctx) {
+		ret = -ENOENT;
+		goto exit_unlock;
+	}
+
+	ret = ops->remove_rxfh_context(dev, ctx, rss_context, info->extack);
+	if (ret)
+		goto exit_unlock;
+
+	WARN_ON(xa_erase(&dev->ethtool->rss_ctx, rss_context) != ctx);
+	kfree(ctx);
+
+	ethnl_rss_delete_notify(dev, rss_context);
+
+exit_unlock:
+	mutex_unlock(&dev->ethtool->rss_lock);
+	ethnl_ops_complete(dev);
+exit_dev_unlock:
+	netdev_unlock_ops(dev);
+	rtnl_unlock();
+exit_free_dev:
+	ethnl_parse_header_dev_put(&req);
+	return ret;
+}
-- 
cgit v1.2.3


From 57fbad15c2eee77276a541c616589b32976d2b8e Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@kernel.org>
Date: Thu, 17 Jul 2025 16:25:06 -0700
Subject: stackleak: Rename STACKLEAK to KSTACK_ERASE

In preparation for adding Clang sanitizer coverage stack depth tracking
that can support stack depth callbacks:

- Add the new top-level CONFIG_KSTACK_ERASE option which will be
  implemented either with the stackleak GCC plugin, or with the Clang
  stack depth callback support.
- Rename CONFIG_GCC_PLUGIN_STACKLEAK as needed to CONFIG_KSTACK_ERASE,
  but keep it for anything specific to the GCC plugin itself.
- Rename all exposed "STACKLEAK" names and files to "KSTACK_ERASE" (named
  for what it does rather than what it protects against), but leave as
  many of the internals alone as possible to avoid even more churn.

While here, also split "prev_lowest_stack" into CONFIG_KSTACK_ERASE_METRICS,
since that's the only place it is referenced from.

Suggested-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20250717232519.2984886-1-kees@kernel.org
Signed-off-by: Kees Cook <kees@kernel.org>
---
 Documentation/admin-guide/sysctl/kernel.rst        |   4 +-
 Documentation/arch/x86/x86_64/mm.rst               |   2 +-
 Documentation/security/self-protection.rst         |   2 +-
 .../zh_CN/security/self-protection.rst             |   2 +-
 MAINTAINERS                                        |   4 +-
 arch/Kconfig                                       |   4 +-
 arch/arm/Kconfig                                   |   2 +-
 arch/arm/boot/compressed/Makefile                  |   2 +-
 arch/arm/kernel/entry-common.S                     |   2 +-
 arch/arm64/Kconfig                                 |   2 +-
 arch/arm64/kernel/entry.S                          |   2 +-
 arch/arm64/kernel/pi/Makefile                      |   2 +-
 arch/arm64/kvm/hyp/nvhe/Makefile                   |   2 +-
 arch/loongarch/Kconfig                             |   2 +-
 arch/riscv/Kconfig                                 |   2 +-
 arch/riscv/kernel/entry.S                          |   2 +-
 arch/riscv/kernel/pi/Makefile                      |   2 +-
 arch/riscv/purgatory/Makefile                      |   2 +-
 arch/s390/Kconfig                                  |   2 +-
 arch/s390/kernel/entry.S                           |   2 +-
 arch/x86/Kconfig                                   |   2 +-
 arch/x86/entry/calling.h                           |   4 +-
 arch/x86/purgatory/Makefile                        |   2 +-
 drivers/firmware/efi/libstub/Makefile              |   8 +-
 drivers/misc/lkdtm/Makefile                        |   2 +-
 drivers/misc/lkdtm/kstack_erase.c                  | 150 +++++++++++++++++
 drivers/misc/lkdtm/stackleak.c                     | 150 -----------------
 fs/proc/base.c                                     |   6 +-
 include/linux/kstack_erase.h                       |  89 +++++++++++
 include/linux/sched.h                              |   4 +-
 include/linux/stackleak.h                          |  89 -----------
 kernel/Makefile                                    |  10 +-
 kernel/fork.c                                      |   2 +-
 kernel/kstack_erase.c                              | 177 +++++++++++++++++++++
 kernel/stackleak.c                                 | 177 ---------------------
 lib/Makefile                                       |   2 +-
 scripts/Makefile.gcc-plugins                       |   6 +-
 security/Kconfig.hardening                         |  36 +++--
 tools/objtool/check.c                              |   2 +-
 tools/testing/selftests/lkdtm/config               |   2 +-
 40 files changed, 486 insertions(+), 480 deletions(-)
 create mode 100644 drivers/misc/lkdtm/kstack_erase.c
 delete mode 100644 drivers/misc/lkdtm/stackleak.c
 create mode 100644 include/linux/kstack_erase.h
 delete mode 100644 include/linux/stackleak.h
 create mode 100644 kernel/kstack_erase.c
 delete mode 100644 kernel/stackleak.c

(limited to 'include')

diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index dd49a89a62d3..19224eeac1c2 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -1465,7 +1465,7 @@ stack_erasing
 =============
 
 This parameter can be used to control kernel stack erasing at the end
-of syscalls for kernels built with ``CONFIG_GCC_PLUGIN_STACKLEAK``.
+of syscalls for kernels built with ``CONFIG_KSTACK_ERASE``.
 
 That erasing reduces the information which kernel stack leak bugs
 can reveal and blocks some uninitialized stack variable attacks.
@@ -1473,7 +1473,7 @@ The tradeoff is the performance impact: on a single CPU system kernel
 compilation sees a 1% slowdown, other systems and workloads may vary.
 
 = ====================================================================
-0 Kernel stack erasing is disabled, STACKLEAK_METRICS are not updated.
+0 Kernel stack erasing is disabled, KSTACK_ERASE_METRICS are not updated.
 1 Kernel stack erasing is enabled (default), it is performed before
   returning to the userspace at the end of syscalls.
 = ====================================================================
diff --git a/Documentation/arch/x86/x86_64/mm.rst b/Documentation/arch/x86/x86_64/mm.rst
index f2db178b353f..a6cf05d51bd8 100644
--- a/Documentation/arch/x86/x86_64/mm.rst
+++ b/Documentation/arch/x86/x86_64/mm.rst
@@ -176,5 +176,5 @@ Be very careful vs. KASLR when changing anything here. The KASLR address
 range must not overlap with anything except the KASAN shadow area, which is
 correct as KASAN disables KASLR.
 
-For both 4- and 5-level layouts, the STACKLEAK_POISON value in the last 2MB
+For both 4- and 5-level layouts, the KSTACK_ERASE_POISON value in the last 2MB
 hole: ffffffffffff4111
diff --git a/Documentation/security/self-protection.rst b/Documentation/security/self-protection.rst
index 910668e665cb..a32ca23c21b0 100644
--- a/Documentation/security/self-protection.rst
+++ b/Documentation/security/self-protection.rst
@@ -303,7 +303,7 @@ Memory poisoning
 
 When releasing memory, it is best to poison the contents, to avoid reuse
 attacks that rely on the old contents of memory. E.g., clear stack on a
-syscall return (``CONFIG_GCC_PLUGIN_STACKLEAK``), wipe heap memory on a
+syscall return (``CONFIG_KSTACK_ERASE``), wipe heap memory on a
 free. This frustrates many uninitialized variable attacks, stack content
 exposures, heap content exposures, and use-after-free attacks.
 
diff --git a/Documentation/translations/zh_CN/security/self-protection.rst b/Documentation/translations/zh_CN/security/self-protection.rst
index 3c8a68b1e1be..93de9cee5c1a 100644
--- a/Documentation/translations/zh_CN/security/self-protection.rst
+++ b/Documentation/translations/zh_CN/security/self-protection.rst
@@ -259,7 +259,7 @@ KALLSYSM，则会直接打印原始地址。
 --------
 
 在释放内存时，最好对内存内容进行清除处理，以防止攻击者重用内存中以前
-的内容。例如，在系统调用返回时清除堆栈（CONFIG_GCC_PLUGIN_STACKLEAK）,
+的内容。例如，在系统调用返回时清除堆栈（CONFIG_KSTACK_ERASE）,
 在释放堆内容是清除其内容。这有助于防止许多未初始化变量攻击、堆栈内容
 泄露、堆内容泄露以及使用后释放攻击（user-after-free）。
 
diff --git a/MAINTAINERS b/MAINTAINERS
index 0c1d245bf7b8..470d159d8fea 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9992,8 +9992,6 @@ L:	linux-hardening@vger.kernel.org
 S:	Maintained
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/hardening
 F:	Documentation/kbuild/gcc-plugins.rst
-F:	include/linux/stackleak.h
-F:	kernel/stackleak.c
 F:	scripts/Makefile.gcc-plugins
 F:	scripts/gcc-plugins/
 
@@ -13087,10 +13085,12 @@ T:	git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/har
 F:	Documentation/ABI/testing/sysfs-kernel-oops_count
 F:	Documentation/ABI/testing/sysfs-kernel-warn_count
 F:	arch/*/configs/hardening.config
+F:	include/linux/kstack_erase.h
 F:	include/linux/overflow.h
 F:	include/linux/randomize_kstack.h
 F:	include/linux/ucopysize.h
 F:	kernel/configs/hardening.config
+F:	kernel/kstack_erase.c
 F:	lib/tests/randstruct_kunit.c
 F:	lib/tests/usercopy_kunit.c
 F:	mm/usercopy.c
diff --git a/arch/Kconfig b/arch/Kconfig
index a3308a220f86..4d1908f6f084 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -630,11 +630,11 @@ config SECCOMP_CACHE_DEBUG
 
 	  If unsure, say N.
 
-config HAVE_ARCH_STACKLEAK
+config HAVE_ARCH_KSTACK_ERASE
 	bool
 	help
 	  An architecture should select this if it has the code which
-	  fills the used part of the kernel stack with the STACKLEAK_POISON
+	  fills the used part of the kernel stack with the KSTACK_ERASE_POISON
 	  value before returning from system calls.
 
 config HAVE_STACKPROTECTOR
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 3072731fe09c..cb0b2e2211ca 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -87,11 +87,11 @@ config ARM
 	select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU
 	select HAVE_ARCH_KASAN if MMU && !XIP_KERNEL
 	select HAVE_ARCH_KASAN_VMALLOC if HAVE_ARCH_KASAN
+	select HAVE_ARCH_KSTACK_ERASE
 	select HAVE_ARCH_MMAP_RND_BITS if MMU
 	select HAVE_ARCH_PFN_VALID
 	select HAVE_ARCH_SECCOMP
 	select HAVE_ARCH_SECCOMP_FILTER if AEABI && !OABI_COMPAT
-	select HAVE_ARCH_STACKLEAK
 	select HAVE_ARCH_THREAD_STRUCT_WHITELIST
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE if ARM_LPAE
diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile
index d61369b1eabe..f9075edfd773 100644
--- a/arch/arm/boot/compressed/Makefile
+++ b/arch/arm/boot/compressed/Makefile
@@ -9,7 +9,7 @@ OBJS		=
 
 HEAD	= head.o
 OBJS	+= misc.o decompress.o
-CFLAGS_decompress.o += $(DISABLE_STACKLEAK_PLUGIN)
+CFLAGS_decompress.o += $(DISABLE_KSTACK_ERASE)
 ifeq ($(CONFIG_DEBUG_UNCOMPRESS),y)
 OBJS	+= debug.o
 AFLAGS_head.o += -DDEBUG
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index f379c852dcb7..88336a1292bb 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
@@ -119,7 +119,7 @@ no_work_pending:
 
 	ct_user_enter save = 0
 
-#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+#ifdef CONFIG_KSTACK_ERASE
 	bl	stackleak_erase_on_task_stack
 #endif
 	restore_user_regs fast = 0, offset = 0
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 55fc331af337..e2a9e013b6a9 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -187,12 +187,12 @@ config ARM64
 	select HAVE_ARCH_KCSAN if EXPERT
 	select HAVE_ARCH_KFENCE
 	select HAVE_ARCH_KGDB
+	select HAVE_ARCH_KSTACK_ERASE
 	select HAVE_ARCH_MMAP_RND_BITS
 	select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
 	select HAVE_ARCH_PREL32_RELOCATIONS
 	select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
 	select HAVE_ARCH_SECCOMP_FILTER
-	select HAVE_ARCH_STACKLEAK
 	select HAVE_ARCH_THREAD_STRUCT_WHITELIST
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 5ae2a34b50bd..67331437b2aa 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -614,7 +614,7 @@ SYM_CODE_END(ret_to_kernel)
 SYM_CODE_START_LOCAL(ret_to_user)
 	ldr	x19, [tsk, #TSK_TI_FLAGS]	// re-check for single-step
 	enable_step_tsk x19, x2
-#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+#ifdef CONFIG_KSTACK_ERASE
 	bl	stackleak_erase_on_task_stack
 #endif
 	kernel_exit 0
diff --git a/arch/arm64/kernel/pi/Makefile b/arch/arm64/kernel/pi/Makefile
index 4d11a8c29181..f440bf57b1a5 100644
--- a/arch/arm64/kernel/pi/Makefile
+++ b/arch/arm64/kernel/pi/Makefile
@@ -2,7 +2,7 @@
 # Copyright 2022 Google LLC
 
 KBUILD_CFLAGS	:= $(subst $(CC_FLAGS_FTRACE),,$(KBUILD_CFLAGS)) -fpie \
-		   -Os -DDISABLE_BRANCH_PROFILING $(DISABLE_STACKLEAK_PLUGIN) \
+		   -Os -DDISABLE_BRANCH_PROFILING $(DISABLE_KSTACK_ERASE) \
 		   $(DISABLE_LATENT_ENTROPY_PLUGIN) \
 		   $(call cc-option,-mbranch-protection=none) \
 		   -I$(srctree)/scripts/dtc/libfdt -fno-stack-protector \
diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
index a76522d63c3e..0b0a68b663d4 100644
--- a/arch/arm64/kvm/hyp/nvhe/Makefile
+++ b/arch/arm64/kvm/hyp/nvhe/Makefile
@@ -12,7 +12,7 @@ asflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS
 ccflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS -D__DISABLE_TRACE_MMIO__
 ccflags-y += -fno-stack-protector	\
 	     -DDISABLE_BRANCH_PROFILING	\
-	     $(DISABLE_STACKLEAK_PLUGIN)
+	     $(DISABLE_KSTACK_ERASE)
 
 hostprogs := gen-hyprel
 HOST_EXTRACFLAGS += -I$(objtree)/include
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 4b19f93379a1..1514789bea4a 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -120,11 +120,11 @@ config LOONGARCH
 	select HAVE_ARCH_KASAN
 	select HAVE_ARCH_KFENCE
 	select HAVE_ARCH_KGDB if PERF_EVENTS
+	select HAVE_ARCH_KSTACK_ERASE
 	select HAVE_ARCH_MMAP_RND_BITS if MMU
 	select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
 	select HAVE_ARCH_SECCOMP
 	select HAVE_ARCH_SECCOMP_FILTER
-	select HAVE_ARCH_STACKLEAK
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
 	select HAVE_ARCH_USERFAULTFD_MINOR if USERFAULTFD
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 36061f4732b7..cfc084fc9e6f 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -135,13 +135,13 @@ config RISCV
 	select HAVE_ARCH_KASAN if MMU && 64BIT
 	select HAVE_ARCH_KASAN_VMALLOC if MMU && 64BIT
 	select HAVE_ARCH_KFENCE if MMU && 64BIT
+	select HAVE_ARCH_KSTACK_ERASE
 	select HAVE_ARCH_KGDB if !XIP_KERNEL
 	select HAVE_ARCH_KGDB_QXFER_PKT
 	select HAVE_ARCH_MMAP_RND_BITS if MMU
 	select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
 	select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
 	select HAVE_ARCH_SECCOMP_FILTER
-	select HAVE_ARCH_STACKLEAK
 	select HAVE_ARCH_THREAD_STRUCT_WHITELIST
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE if 64BIT && MMU
diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
index 75656afa2d6b..3a0ec6fd5956 100644
--- a/arch/riscv/kernel/entry.S
+++ b/arch/riscv/kernel/entry.S
@@ -220,7 +220,7 @@ SYM_CODE_START_NOALIGN(ret_from_exception)
 #endif
 	bnez s0, 1f
 
-#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+#ifdef CONFIG_KSTACK_ERASE
 	call	stackleak_erase_on_task_stack
 #endif
 
diff --git a/arch/riscv/kernel/pi/Makefile b/arch/riscv/kernel/pi/Makefile
index 81d69d45c06c..7dd15be69c90 100644
--- a/arch/riscv/kernel/pi/Makefile
+++ b/arch/riscv/kernel/pi/Makefile
@@ -2,7 +2,7 @@
 # This file was copied from arm64/kernel/pi/Makefile.
 
 KBUILD_CFLAGS	:= $(subst $(CC_FLAGS_FTRACE),,$(KBUILD_CFLAGS)) -fpie \
-		   -Os -DDISABLE_BRANCH_PROFILING $(DISABLE_STACKLEAK_PLUGIN) \
+		   -Os -DDISABLE_BRANCH_PROFILING $(DISABLE_KSTACK_ERASE) \
 		   $(call cc-option,-mbranch-protection=none) \
 		   -I$(srctree)/scripts/dtc/libfdt -fno-stack-protector \
 		   -include $(srctree)/include/linux/hidden.h \
diff --git a/arch/riscv/purgatory/Makefile b/arch/riscv/purgatory/Makefile
index fb9c917c9b45..240592e3f5c2 100644
--- a/arch/riscv/purgatory/Makefile
+++ b/arch/riscv/purgatory/Makefile
@@ -53,7 +53,7 @@ targets += purgatory.ro purgatory.chk
 
 PURGATORY_CFLAGS_REMOVE := -mcmodel=kernel
 PURGATORY_CFLAGS := -mcmodel=medany -ffreestanding -fno-zero-initialized-in-bss
-PURGATORY_CFLAGS += $(DISABLE_STACKLEAK_PLUGIN) -DDISABLE_BRANCH_PROFILING
+PURGATORY_CFLAGS += $(DISABLE_KSTACK_ERASE) -DDISABLE_BRANCH_PROFILING
 PURGATORY_CFLAGS += -fno-stack-protector -g0
 
 # Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. That
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 0c16dc443e2f..a8e74ed8e3cc 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -176,10 +176,10 @@ config S390
 	select HAVE_ARCH_KCSAN
 	select HAVE_ARCH_KMSAN
 	select HAVE_ARCH_KFENCE
+	select HAVE_ARCH_KSTACK_ERASE
 	select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
 	select HAVE_ARCH_SECCOMP_FILTER
 	select HAVE_ARCH_SOFT_DIRTY
-	select HAVE_ARCH_STACKLEAK
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
 	select HAVE_ARCH_VMAP_STACK
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 0f00f4b06d51..75b0fbb236d0 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -124,7 +124,7 @@ _LPP_OFFSET	= __LC_LPP
 #endif
 
 	.macro STACKLEAK_ERASE
-#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+#ifdef CONFIG_KSTACK_ERASE
 	brasl	%r14,stackleak_erase_on_task_stack
 #endif
 	.endm
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 340e5468980e..bc3708cad46b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -204,13 +204,13 @@ config X86
 	select HAVE_ARCH_KFENCE
 	select HAVE_ARCH_KMSAN			if X86_64
 	select HAVE_ARCH_KGDB
+	select HAVE_ARCH_KSTACK_ERASE
 	select HAVE_ARCH_MMAP_RND_BITS		if MMU
 	select HAVE_ARCH_MMAP_RND_COMPAT_BITS	if MMU && COMPAT
 	select HAVE_ARCH_COMPAT_MMAP_BASES	if MMU && COMPAT
 	select HAVE_ARCH_PREL32_RELOCATIONS
 	select HAVE_ARCH_SECCOMP_FILTER
 	select HAVE_ARCH_THREAD_STRUCT_WHITELIST
-	select HAVE_ARCH_STACKLEAK
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index d83236b96f22..94519688b007 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -369,7 +369,7 @@ For 32-bit we have the following conventions - kernel is built with
 .endm
 
 .macro STACKLEAK_ERASE_NOCLOBBER
-#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+#ifdef CONFIG_KSTACK_ERASE
 	PUSH_AND_CLEAR_REGS
 	call stackleak_erase
 	POP_REGS
@@ -388,7 +388,7 @@ For 32-bit we have the following conventions - kernel is built with
 #endif /* !CONFIG_X86_64 */
 
 .macro STACKLEAK_ERASE
-#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+#ifdef CONFIG_KSTACK_ERASE
 	call stackleak_erase
 #endif
 .endm
diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
index ebdfd7b84feb..e0a607a14e7e 100644
--- a/arch/x86/purgatory/Makefile
+++ b/arch/x86/purgatory/Makefile
@@ -35,7 +35,7 @@ targets += purgatory.ro purgatory.chk
 PURGATORY_CFLAGS_REMOVE := -mcmodel=kernel
 PURGATORY_CFLAGS := -mcmodel=small -ffreestanding -fno-zero-initialized-in-bss -g0
 PURGATORY_CFLAGS += -fpic -fvisibility=hidden
-PURGATORY_CFLAGS += $(DISABLE_STACKLEAK_PLUGIN) -DDISABLE_BRANCH_PROFILING
+PURGATORY_CFLAGS += $(DISABLE_KSTACK_ERASE) -DDISABLE_BRANCH_PROFILING
 PURGATORY_CFLAGS += -fno-stack-protector
 
 # Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. That
diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
index 939a4955e00b..94b05e4451dd 100644
--- a/drivers/firmware/efi/libstub/Makefile
+++ b/drivers/firmware/efi/libstub/Makefile
@@ -22,16 +22,16 @@ cflags-$(CONFIG_X86)		+= -m$(BITS) -D__KERNEL__ -std=gnu11 \
 
 # arm64 uses the full KBUILD_CFLAGS so it's necessary to explicitly
 # disable the stackleak plugin
-cflags-$(CONFIG_ARM64)		+= -fpie $(DISABLE_STACKLEAK_PLUGIN) \
+cflags-$(CONFIG_ARM64)		+= -fpie $(DISABLE_KSTACK_ERASE) \
 				   -fno-unwind-tables -fno-asynchronous-unwind-tables
 cflags-$(CONFIG_ARM)		+= -DEFI_HAVE_STRLEN -DEFI_HAVE_STRNLEN \
 				   -DEFI_HAVE_MEMCHR -DEFI_HAVE_STRRCHR \
 				   -DEFI_HAVE_STRCMP -fno-builtin -fpic \
 				   $(call cc-option,-mno-single-pic-base) \
-				   $(DISABLE_STACKLEAK_PLUGIN)
+				   $(DISABLE_KSTACK_ERASE)
 cflags-$(CONFIG_RISCV)		+= -fpic -DNO_ALTERNATIVE -mno-relax \
-				   $(DISABLE_STACKLEAK_PLUGIN)
-cflags-$(CONFIG_LOONGARCH)	+= -fpie $(DISABLE_STACKLEAK_PLUGIN)
+				   $(DISABLE_KSTACK_ERASE)
+cflags-$(CONFIG_LOONGARCH)	+= -fpie $(DISABLE_KSTACK_ERASE)
 
 cflags-$(CONFIG_EFI_PARAMS_FROM_FDT)	+= -I$(srctree)/scripts/dtc/libfdt
 
diff --git a/drivers/misc/lkdtm/Makefile b/drivers/misc/lkdtm/Makefile
index 39468bd27b85..03ebe33185f9 100644
--- a/drivers/misc/lkdtm/Makefile
+++ b/drivers/misc/lkdtm/Makefile
@@ -8,7 +8,7 @@ lkdtm-$(CONFIG_LKDTM)		+= perms.o
 lkdtm-$(CONFIG_LKDTM)		+= refcount.o
 lkdtm-$(CONFIG_LKDTM)		+= rodata_objcopy.o
 lkdtm-$(CONFIG_LKDTM)		+= usercopy.o
-lkdtm-$(CONFIG_LKDTM)		+= stackleak.o
+lkdtm-$(CONFIG_LKDTM)		+= kstack_erase.o
 lkdtm-$(CONFIG_LKDTM)		+= cfi.o
 lkdtm-$(CONFIG_LKDTM)		+= fortify.o
 lkdtm-$(CONFIG_PPC_64S_HASH_MMU)	+= powerpc.o
diff --git a/drivers/misc/lkdtm/kstack_erase.c b/drivers/misc/lkdtm/kstack_erase.c
new file mode 100644
index 000000000000..4fd9b0bfb874
--- /dev/null
+++ b/drivers/misc/lkdtm/kstack_erase.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This code tests that the current task stack is properly erased (filled
+ * with KSTACK_ERASE_POISON).
+ *
+ * Authors:
+ *   Alexander Popov <alex.popov@linux.com>
+ *   Tycho Andersen <tycho@tycho.ws>
+ */
+
+#include "lkdtm.h"
+#include <linux/kstack_erase.h>
+
+#if defined(CONFIG_KSTACK_ERASE)
+/*
+ * Check that stackleak tracks the lowest stack pointer and erases the stack
+ * below this as expected.
+ *
+ * To prevent the lowest stack pointer changing during the test, IRQs are
+ * masked and instrumentation of this function is disabled. We assume that the
+ * compiler will create a fixed-size stack frame for this function.
+ *
+ * Any non-inlined function may make further use of the stack, altering the
+ * lowest stack pointer and/or clobbering poison values. To avoid spurious
+ * failures we must avoid printing until the end of the test, or until we
+ * have already encountered a failure condition.
+ */
+static void noinstr check_stackleak_irqoff(void)
+{
+	const unsigned long task_stack_base = (unsigned long)task_stack_page(current);
+	const unsigned long task_stack_low = stackleak_task_low_bound(current);
+	const unsigned long task_stack_high = stackleak_task_high_bound(current);
+	const unsigned long current_sp = current_stack_pointer;
+	const unsigned long lowest_sp = current->lowest_stack;
+	unsigned long untracked_high;
+	unsigned long poison_high, poison_low;
+	bool test_failed = false;
+
+	/*
+	 * Check that the current and lowest recorded stack pointer values fall
+	 * within the expected task stack boundaries. These tests should never
+	 * fail unless the boundaries are incorrect or we're clobbering the
+	 * STACK_END_MAGIC, and in either case something is seriously wrong.
+	 */
+	if (current_sp < task_stack_low || current_sp >= task_stack_high) {
+		instrumentation_begin();
+		pr_err("FAIL: current_stack_pointer (0x%lx) outside of task stack bounds [0x%lx..0x%lx]\n",
+		       current_sp, task_stack_low, task_stack_high - 1);
+		test_failed = true;
+		goto out;
+	}
+	if (lowest_sp < task_stack_low || lowest_sp >= task_stack_high) {
+		instrumentation_begin();
+		pr_err("FAIL: current->lowest_stack (0x%lx) outside of task stack bounds [0x%lx..0x%lx]\n",
+		       lowest_sp, task_stack_low, task_stack_high - 1);
+		test_failed = true;
+		goto out;
+	}
+
+	/*
+	 * Depending on what has run prior to this test, the lowest recorded
+	 * stack pointer could be above or below the current stack pointer.
+	 * Start from the lowest of the two.
+	 *
+	 * Poison values are naturally-aligned unsigned longs. As the current
+	 * stack pointer might not be sufficiently aligned, we must align
+	 * downwards to find the lowest known stack pointer value. This is the
+	 * high boundary for a portion of the stack which may have been used
+	 * without being tracked, and has to be scanned for poison.
+	 */
+	untracked_high = min(current_sp, lowest_sp);
+	untracked_high = ALIGN_DOWN(untracked_high, sizeof(unsigned long));
+
+	/*
+	 * Find the top of the poison in the same way as the erasing code.
+	 */
+	poison_high = stackleak_find_top_of_poison(task_stack_low, untracked_high);
+
+	/*
+	 * Check whether the poisoned portion of the stack (if any) consists
+	 * entirely of poison. This verifies the entries that
+	 * stackleak_find_top_of_poison() should have checked.
+	 */
+	poison_low = poison_high;
+	while (poison_low > task_stack_low) {
+		poison_low -= sizeof(unsigned long);
+
+		if (*(unsigned long *)poison_low == KSTACK_ERASE_POISON)
+			continue;
+
+		instrumentation_begin();
+		pr_err("FAIL: non-poison value %lu bytes below poison boundary: 0x%lx\n",
+		       poison_high - poison_low, *(unsigned long *)poison_low);
+		test_failed = true;
+		goto out;
+	}
+
+	instrumentation_begin();
+	pr_info("kstack erase stack usage:\n"
+		"  high offset: %lu bytes\n"
+		"  current:     %lu bytes\n"
+		"  lowest:      %lu bytes\n"
+		"  tracked:     %lu bytes\n"
+		"  untracked:   %lu bytes\n"
+		"  poisoned:    %lu bytes\n"
+		"  low offset:  %lu bytes\n",
+		task_stack_base + THREAD_SIZE - task_stack_high,
+		task_stack_high - current_sp,
+		task_stack_high - lowest_sp,
+		task_stack_high - untracked_high,
+		untracked_high - poison_high,
+		poison_high - task_stack_low,
+		task_stack_low - task_stack_base);
+
+out:
+	if (test_failed) {
+		pr_err("FAIL: the thread stack is NOT properly erased!\n");
+	} else {
+		pr_info("OK: the rest of the thread stack is properly erased\n");
+	}
+	instrumentation_end();
+}
+
+static void lkdtm_KSTACK_ERASE(void)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	check_stackleak_irqoff();
+	local_irq_restore(flags);
+}
+#else /* defined(CONFIG_KSTACK_ERASE) */
+static void lkdtm_KSTACK_ERASE(void)
+{
+	if (IS_ENABLED(CONFIG_HAVE_ARCH_KSTACK_ERASE)) {
+		pr_err("XFAIL: stackleak is not enabled (CONFIG_KSTACK_ERASE=n)\n");
+	} else {
+		pr_err("XFAIL: stackleak is not supported on this arch (HAVE_ARCH_KSTACK_ERASE=n)\n");
+	}
+}
+#endif /* defined(CONFIG_KSTACK_ERASE) */
+
+static struct crashtype crashtypes[] = {
+	CRASHTYPE(KSTACK_ERASE),
+};
+
+struct crashtype_category stackleak_crashtypes = {
+	.crashtypes = crashtypes,
+	.len	    = ARRAY_SIZE(crashtypes),
+};
diff --git a/drivers/misc/lkdtm/stackleak.c b/drivers/misc/lkdtm/stackleak.c
deleted file mode 100644
index f1d022160913..000000000000
--- a/drivers/misc/lkdtm/stackleak.c
+++ /dev/null
@@ -1,150 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * This code tests that the current task stack is properly erased (filled
- * with STACKLEAK_POISON).
- *
- * Authors:
- *   Alexander Popov <alex.popov@linux.com>
- *   Tycho Andersen <tycho@tycho.ws>
- */
-
-#include "lkdtm.h"
-#include <linux/stackleak.h>
-
-#if defined(CONFIG_GCC_PLUGIN_STACKLEAK)
-/*
- * Check that stackleak tracks the lowest stack pointer and erases the stack
- * below this as expected.
- *
- * To prevent the lowest stack pointer changing during the test, IRQs are
- * masked and instrumentation of this function is disabled. We assume that the
- * compiler will create a fixed-size stack frame for this function.
- *
- * Any non-inlined function may make further use of the stack, altering the
- * lowest stack pointer and/or clobbering poison values. To avoid spurious
- * failures we must avoid printing until the end of the test or have already
- * encountered a failure condition.
- */
-static void noinstr check_stackleak_irqoff(void)
-{
-	const unsigned long task_stack_base = (unsigned long)task_stack_page(current);
-	const unsigned long task_stack_low = stackleak_task_low_bound(current);
-	const unsigned long task_stack_high = stackleak_task_high_bound(current);
-	const unsigned long current_sp = current_stack_pointer;
-	const unsigned long lowest_sp = current->lowest_stack;
-	unsigned long untracked_high;
-	unsigned long poison_high, poison_low;
-	bool test_failed = false;
-
-	/*
-	 * Check that the current and lowest recorded stack pointer values fall
-	 * within the expected task stack boundaries. These tests should never
-	 * fail unless the boundaries are incorrect or we're clobbering the
-	 * STACK_END_MAGIC, and in either casee something is seriously wrong.
-	 */
-	if (current_sp < task_stack_low || current_sp >= task_stack_high) {
-		instrumentation_begin();
-		pr_err("FAIL: current_stack_pointer (0x%lx) outside of task stack bounds [0x%lx..0x%lx]\n",
-		       current_sp, task_stack_low, task_stack_high - 1);
-		test_failed = true;
-		goto out;
-	}
-	if (lowest_sp < task_stack_low || lowest_sp >= task_stack_high) {
-		instrumentation_begin();
-		pr_err("FAIL: current->lowest_stack (0x%lx) outside of task stack bounds [0x%lx..0x%lx]\n",
-		       lowest_sp, task_stack_low, task_stack_high - 1);
-		test_failed = true;
-		goto out;
-	}
-
-	/*
-	 * Depending on what has run prior to this test, the lowest recorded
-	 * stack pointer could be above or below the current stack pointer.
-	 * Start from the lowest of the two.
-	 *
-	 * Poison values are naturally-aligned unsigned longs. As the current
-	 * stack pointer might not be sufficiently aligned, we must align
-	 * downwards to find the lowest known stack pointer value. This is the
-	 * high boundary for a portion of the stack which may have been used
-	 * without being tracked, and has to be scanned for poison.
-	 */
-	untracked_high = min(current_sp, lowest_sp);
-	untracked_high = ALIGN_DOWN(untracked_high, sizeof(unsigned long));
-
-	/*
-	 * Find the top of the poison in the same way as the erasing code.
-	 */
-	poison_high = stackleak_find_top_of_poison(task_stack_low, untracked_high);
-
-	/*
-	 * Check whether the poisoned portion of the stack (if any) consists
-	 * entirely of poison. This verifies the entries that
-	 * stackleak_find_top_of_poison() should have checked.
-	 */
-	poison_low = poison_high;
-	while (poison_low > task_stack_low) {
-		poison_low -= sizeof(unsigned long);
-
-		if (*(unsigned long *)poison_low == STACKLEAK_POISON)
-			continue;
-
-		instrumentation_begin();
-		pr_err("FAIL: non-poison value %lu bytes below poison boundary: 0x%lx\n",
-		       poison_high - poison_low, *(unsigned long *)poison_low);
-		test_failed = true;
-		goto out;
-	}
-
-	instrumentation_begin();
-	pr_info("stackleak stack usage:\n"
-		"  high offset: %lu bytes\n"
-		"  current:     %lu bytes\n"
-		"  lowest:      %lu bytes\n"
-		"  tracked:     %lu bytes\n"
-		"  untracked:   %lu bytes\n"
-		"  poisoned:    %lu bytes\n"
-		"  low offset:  %lu bytes\n",
-		task_stack_base + THREAD_SIZE - task_stack_high,
-		task_stack_high - current_sp,
-		task_stack_high - lowest_sp,
-		task_stack_high - untracked_high,
-		untracked_high - poison_high,
-		poison_high - task_stack_low,
-		task_stack_low - task_stack_base);
-
-out:
-	if (test_failed) {
-		pr_err("FAIL: the thread stack is NOT properly erased!\n");
-	} else {
-		pr_info("OK: the rest of the thread stack is properly erased\n");
-	}
-	instrumentation_end();
-}
-
-static void lkdtm_STACKLEAK_ERASING(void)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	check_stackleak_irqoff();
-	local_irq_restore(flags);
-}
-#else /* defined(CONFIG_GCC_PLUGIN_STACKLEAK) */
-static void lkdtm_STACKLEAK_ERASING(void)
-{
-	if (IS_ENABLED(CONFIG_HAVE_ARCH_STACKLEAK)) {
-		pr_err("XFAIL: stackleak is not enabled (CONFIG_GCC_PLUGIN_STACKLEAK=n)\n");
-	} else {
-		pr_err("XFAIL: stackleak is not supported on this arch (HAVE_ARCH_STACKLEAK=n)\n");
-	}
-}
-#endif /* defined(CONFIG_GCC_PLUGIN_STACKLEAK) */
-
-static struct crashtype crashtypes[] = {
-	CRASHTYPE(STACKLEAK_ERASING),
-};
-
-struct crashtype_category stackleak_crashtypes = {
-	.crashtypes = crashtypes,
-	.len	    = ARRAY_SIZE(crashtypes),
-};
diff --git a/fs/proc/base.c b/fs/proc/base.c
index c667702dc69b..be34612af8b6 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3291,7 +3291,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns,
 }
 #endif /* CONFIG_KSM */
 
-#ifdef CONFIG_STACKLEAK_METRICS
+#ifdef CONFIG_KSTACK_ERASE_METRICS
 static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
 				struct pid *pid, struct task_struct *task)
 {
@@ -3304,7 +3304,7 @@ static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
 							prev_depth, depth);
 	return 0;
 }
-#endif /* CONFIG_STACKLEAK_METRICS */
+#endif /* CONFIG_KSTACK_ERASE_METRICS */
 
 /*
  * Thread groups
@@ -3411,7 +3411,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_LIVEPATCH
 	ONE("patch_state",  S_IRUSR, proc_pid_patch_state),
 #endif
-#ifdef CONFIG_STACKLEAK_METRICS
+#ifdef CONFIG_KSTACK_ERASE_METRICS
 	ONE("stack_depth", S_IRUGO, proc_stack_depth),
 #endif
 #ifdef CONFIG_PROC_PID_ARCH_STATUS
diff --git a/include/linux/kstack_erase.h b/include/linux/kstack_erase.h
new file mode 100644
index 000000000000..4e432eefa4d0
--- /dev/null
+++ b/include/linux/kstack_erase.h
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_KSTACK_ERASE_H
+#define _LINUX_KSTACK_ERASE_H
+
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+
+/*
+ * Check that the poison value points to the unused hole in the
+ * virtual memory map for your platform.
+ */
+#define KSTACK_ERASE_POISON -0xBEEF
+#define KSTACK_ERASE_SEARCH_DEPTH 128
+
+#ifdef CONFIG_KSTACK_ERASE
+#include <asm/stacktrace.h>
+#include <linux/linkage.h>
+
+/*
+ * The lowest address on tsk's stack which we can plausibly erase.
+ */
+static __always_inline unsigned long
+stackleak_task_low_bound(const struct task_struct *tsk)
+{
+	/*
+	 * The lowest unsigned long on the task stack contains STACK_END_MAGIC,
+	 * which we must not corrupt.
+	 */
+	return (unsigned long)end_of_stack(tsk) + sizeof(unsigned long);
+}
+
+/*
+ * The address immediately after the highest address on tsk's stack which we
+ * can plausibly erase.
+ */
+static __always_inline unsigned long
+stackleak_task_high_bound(const struct task_struct *tsk)
+{
+	/*
+	 * The task's pt_regs lives at the top of the task stack and will be
+	 * overwritten by exception entry, so there's no need to erase them.
+	 */
+	return (unsigned long)task_pt_regs(tsk);
+}
+
+/*
+ * Find the address immediately above the poisoned region of the stack, where
+ * that region falls between 'low' (inclusive) and 'high' (exclusive).
+ */
+static __always_inline unsigned long
+stackleak_find_top_of_poison(const unsigned long low, const unsigned long high)
+{
+	const unsigned int depth = KSTACK_ERASE_SEARCH_DEPTH / sizeof(unsigned long);
+	unsigned int poison_count = 0;
+	unsigned long poison_high = high;
+	unsigned long sp = high;
+
+	while (sp > low && poison_count < depth) {
+		sp -= sizeof(unsigned long);
+
+		if (*(unsigned long *)sp == KSTACK_ERASE_POISON) {
+			poison_count++;
+		} else {
+			poison_count = 0;
+			poison_high = sp;
+		}
+	}
+
+	return poison_high;
+}
+
+static inline void stackleak_task_init(struct task_struct *t)
+{
+	t->lowest_stack = stackleak_task_low_bound(t);
+# ifdef CONFIG_KSTACK_ERASE_METRICS
+	t->prev_lowest_stack = t->lowest_stack;
+# endif
+}
+
+asmlinkage void noinstr stackleak_erase(void);
+asmlinkage void noinstr stackleak_erase_on_task_stack(void);
+asmlinkage void noinstr stackleak_erase_off_task_stack(void);
+void __no_caller_saved_registers noinstr stackleak_track_stack(void);
+
+#else /* !CONFIG_KSTACK_ERASE */
+static inline void stackleak_task_init(struct task_struct *t) { }
+#endif
+
+#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4f78a64beb52..b7d2f2fd4cd4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1607,8 +1607,10 @@ struct task_struct {
 	/* Used by BPF for per-TASK xdp storage */
 	struct bpf_net_context		*bpf_net_context;
 
-#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+#ifdef CONFIG_KSTACK_ERASE
 	unsigned long			lowest_stack;
+#endif
+#ifdef CONFIG_KSTACK_ERASE_METRICS
 	unsigned long			prev_lowest_stack;
 #endif
 
diff --git a/include/linux/stackleak.h b/include/linux/stackleak.h
deleted file mode 100644
index 3be2cb564710..000000000000
--- a/include/linux/stackleak.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_STACKLEAK_H
-#define _LINUX_STACKLEAK_H
-
-#include <linux/sched.h>
-#include <linux/sched/task_stack.h>
-
-/*
- * Check that the poison value points to the unused hole in the
- * virtual memory map for your platform.
- */
-#define STACKLEAK_POISON -0xBEEF
-#define STACKLEAK_SEARCH_DEPTH 128
-
-#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
-#include <asm/stacktrace.h>
-#include <linux/linkage.h>
-
-/*
- * The lowest address on tsk's stack which we can plausibly erase.
- */
-static __always_inline unsigned long
-stackleak_task_low_bound(const struct task_struct *tsk)
-{
-	/*
-	 * The lowest unsigned long on the task stack contains STACK_END_MAGIC,
-	 * which we must not corrupt.
-	 */
-	return (unsigned long)end_of_stack(tsk) + sizeof(unsigned long);
-}
-
-/*
- * The address immediately after the highest address on tsk's stack which we
- * can plausibly erase.
- */
-static __always_inline unsigned long
-stackleak_task_high_bound(const struct task_struct *tsk)
-{
-	/*
-	 * The task's pt_regs lives at the top of the task stack and will be
-	 * overwritten by exception entry, so there's no need to erase them.
-	 */
-	return (unsigned long)task_pt_regs(tsk);
-}
-
-/*
- * Find the address immediately above the poisoned region of the stack, where
- * that region falls between 'low' (inclusive) and 'high' (exclusive).
- */
-static __always_inline unsigned long
-stackleak_find_top_of_poison(const unsigned long low, const unsigned long high)
-{
-	const unsigned int depth = STACKLEAK_SEARCH_DEPTH / sizeof(unsigned long);
-	unsigned int poison_count = 0;
-	unsigned long poison_high = high;
-	unsigned long sp = high;
-
-	while (sp > low && poison_count < depth) {
-		sp -= sizeof(unsigned long);
-
-		if (*(unsigned long *)sp == STACKLEAK_POISON) {
-			poison_count++;
-		} else {
-			poison_count = 0;
-			poison_high = sp;
-		}
-	}
-
-	return poison_high;
-}
-
-static inline void stackleak_task_init(struct task_struct *t)
-{
-	t->lowest_stack = stackleak_task_low_bound(t);
-# ifdef CONFIG_STACKLEAK_METRICS
-	t->prev_lowest_stack = t->lowest_stack;
-# endif
-}
-
-asmlinkage void noinstr stackleak_erase(void);
-asmlinkage void noinstr stackleak_erase_on_task_stack(void);
-asmlinkage void noinstr stackleak_erase_off_task_stack(void);
-void __no_caller_saved_registers noinstr stackleak_track_stack(void);
-
-#else /* !CONFIG_GCC_PLUGIN_STACKLEAK */
-static inline void stackleak_task_init(struct task_struct *t) { }
-#endif
-
-#endif
diff --git a/kernel/Makefile b/kernel/Makefile
index 32e80dd626af..e4f01f1d4d0c 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -139,11 +139,11 @@ obj-$(CONFIG_WATCH_QUEUE) += watch_queue.o
 obj-$(CONFIG_RESOURCE_KUNIT_TEST) += resource_kunit.o
 obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o
 
-CFLAGS_stackleak.o += $(DISABLE_STACKLEAK_PLUGIN)
-obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o
-KASAN_SANITIZE_stackleak.o := n
-KCSAN_SANITIZE_stackleak.o := n
-KCOV_INSTRUMENT_stackleak.o := n
+CFLAGS_kstack_erase.o += $(DISABLE_KSTACK_ERASE)
+obj-$(CONFIG_KSTACK_ERASE) += kstack_erase.o
+KASAN_SANITIZE_kstack_erase.o := n
+KCSAN_SANITIZE_kstack_erase.o := n
+KCOV_INSTRUMENT_kstack_erase.o := n
 
 obj-$(CONFIG_SCF_TORTURE_TEST) += scftorture.o
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 1ee8eb11f38b..1ec66911f6f6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -93,7 +93,7 @@
 #include <linux/kcov.h>
 #include <linux/livepatch.h>
 #include <linux/thread_info.h>
-#include <linux/stackleak.h>
+#include <linux/kstack_erase.h>
 #include <linux/kasan.h>
 #include <linux/scs.h>
 #include <linux/io_uring.h>
diff --git a/kernel/kstack_erase.c b/kernel/kstack_erase.c
new file mode 100644
index 000000000000..201b846f8345
--- /dev/null
+++ b/kernel/kstack_erase.c
@@ -0,0 +1,177 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This code fills the used part of the kernel stack with a poison value
+ * before returning to userspace. It's part of the STACKLEAK feature
+ * ported from grsecurity/PaX.
+ *
+ * Author: Alexander Popov <alex.popov@linux.com>
+ *
+ * KSTACK_ERASE reduces the information which kernel stack leak bugs can
+ * reveal and blocks some uninitialized stack variable attacks.
+ */
+
+#include <linux/kstack_erase.h>
+#include <linux/kprobes.h>
+
+#ifdef CONFIG_KSTACK_ERASE_RUNTIME_DISABLE
+#include <linux/jump_label.h>
+#include <linux/string_choices.h>
+#include <linux/sysctl.h>
+#include <linux/init.h>
+
+static DEFINE_STATIC_KEY_FALSE(stack_erasing_bypass);
+
+#ifdef CONFIG_SYSCTL
+static int stack_erasing_sysctl(const struct ctl_table *table, int write,
+			void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret = 0;
+	int state = !static_branch_unlikely(&stack_erasing_bypass);
+	int prev_state = state;
+	struct ctl_table table_copy = *table;
+
+	table_copy.data = &state;
+	ret = proc_dointvec_minmax(&table_copy, write, buffer, lenp, ppos);
+	state = !!state;
+	if (ret || !write || state == prev_state)
+		return ret;
+
+	if (state)
+		static_branch_disable(&stack_erasing_bypass);
+	else
+		static_branch_enable(&stack_erasing_bypass);
+
+	pr_warn("stackleak: kernel stack erasing is %s\n",
+					str_enabled_disabled(state));
+	return ret;
+}
+static const struct ctl_table stackleak_sysctls[] = {
+	{
+		.procname	= "stack_erasing",
+		.data		= NULL,
+		.maxlen		= sizeof(int),
+		.mode		= 0600,
+		.proc_handler	= stack_erasing_sysctl,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+};
+
+static int __init stackleak_sysctls_init(void)
+{
+	register_sysctl_init("kernel", stackleak_sysctls);
+	return 0;
+}
+late_initcall(stackleak_sysctls_init);
+#endif /* CONFIG_SYSCTL */
+
+#define skip_erasing()	static_branch_unlikely(&stack_erasing_bypass)
+#else
+#define skip_erasing()	false
+#endif /* CONFIG_KSTACK_ERASE_RUNTIME_DISABLE */
+
+#ifndef __stackleak_poison
+static __always_inline void __stackleak_poison(unsigned long erase_low,
+					       unsigned long erase_high,
+					       unsigned long poison)
+{
+	while (erase_low < erase_high) {
+		*(unsigned long *)erase_low = poison;
+		erase_low += sizeof(unsigned long);
+	}
+}
+#endif
+
+static __always_inline void __stackleak_erase(bool on_task_stack)
+{
+	const unsigned long task_stack_low = stackleak_task_low_bound(current);
+	const unsigned long task_stack_high = stackleak_task_high_bound(current);
+	unsigned long erase_low, erase_high;
+
+	erase_low = stackleak_find_top_of_poison(task_stack_low,
+						 current->lowest_stack);
+
+#ifdef CONFIG_KSTACK_ERASE_METRICS
+	current->prev_lowest_stack = erase_low;
+#endif
+
+	/*
+	 * Write poison to the task's stack between 'erase_low' and
+	 * 'erase_high'.
+	 *
+	 * If we're running on a different stack (e.g. an entry trampoline
+	 * stack) we can erase everything below the pt_regs at the top of the
+	 * task stack.
+	 *
+	 * If we're running on the task stack itself, we must not clobber any
+	 * stack used by this function and its caller. We assume that this
+	 * function has a fixed-size stack frame, and the current stack pointer
+	 * doesn't change while we write poison.
+	 */
+	if (on_task_stack)
+		erase_high = current_stack_pointer;
+	else
+		erase_high = task_stack_high;
+
+	__stackleak_poison(erase_low, erase_high, KSTACK_ERASE_POISON);
+
+	/* Reset the 'lowest_stack' value for the next syscall */
+	current->lowest_stack = task_stack_high;
+}
+
+/*
+ * Erase and poison the portion of the task stack used since the last erase.
+ * Can be called from the task stack or an entry stack when the task stack is
+ * no longer in use.
+ */
+asmlinkage void noinstr stackleak_erase(void)
+{
+	if (skip_erasing())
+		return;
+
+	__stackleak_erase(on_thread_stack());
+}
+
+/*
+ * Erase and poison the portion of the task stack used since the last erase.
+ * Can only be called from the task stack.
+ */
+asmlinkage void noinstr stackleak_erase_on_task_stack(void)
+{
+	if (skip_erasing())
+		return;
+
+	__stackleak_erase(true);
+}
+
+/*
+ * Erase and poison the portion of the task stack used since the last erase.
+ * Can only be called from a stack other than the task stack.
+ */
+asmlinkage void noinstr stackleak_erase_off_task_stack(void)
+{
+	if (skip_erasing())
+		return;
+
+	__stackleak_erase(false);
+}
+
+void __used __no_caller_saved_registers noinstr stackleak_track_stack(void)
+{
+	unsigned long sp = current_stack_pointer;
+
+	/*
+	 * Having CONFIG_KSTACK_ERASE_TRACK_MIN_SIZE larger than
+	 * KSTACK_ERASE_SEARCH_DEPTH makes the poison search in
+	 * stackleak_erase() unreliable. Let's prevent that.
+	 */
+	BUILD_BUG_ON(CONFIG_KSTACK_ERASE_TRACK_MIN_SIZE > KSTACK_ERASE_SEARCH_DEPTH);
+
+	/* 'lowest_stack' should be aligned on the register width boundary */
+	sp = ALIGN(sp, sizeof(unsigned long));
+	if (sp < current->lowest_stack &&
+	    sp >= stackleak_task_low_bound(current)) {
+		current->lowest_stack = sp;
+	}
+}
+EXPORT_SYMBOL(stackleak_track_stack);
diff --git a/kernel/stackleak.c b/kernel/stackleak.c
deleted file mode 100644
index bb65321761b4..000000000000
--- a/kernel/stackleak.c
+++ /dev/null
@@ -1,177 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * This code fills the used part of the kernel stack with a poison value
- * before returning to userspace. It's part of the STACKLEAK feature
- * ported from grsecurity/PaX.
- *
- * Author: Alexander Popov <alex.popov@linux.com>
- *
- * STACKLEAK reduces the information which kernel stack leak bugs can
- * reveal and blocks some uninitialized stack variable attacks.
- */
-
-#include <linux/stackleak.h>
-#include <linux/kprobes.h>
-
-#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE
-#include <linux/jump_label.h>
-#include <linux/string_choices.h>
-#include <linux/sysctl.h>
-#include <linux/init.h>
-
-static DEFINE_STATIC_KEY_FALSE(stack_erasing_bypass);
-
-#ifdef CONFIG_SYSCTL
-static int stack_erasing_sysctl(const struct ctl_table *table, int write,
-			void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-	int ret = 0;
-	int state = !static_branch_unlikely(&stack_erasing_bypass);
-	int prev_state = state;
-	struct ctl_table table_copy = *table;
-
-	table_copy.data = &state;
-	ret = proc_dointvec_minmax(&table_copy, write, buffer, lenp, ppos);
-	state = !!state;
-	if (ret || !write || state == prev_state)
-		return ret;
-
-	if (state)
-		static_branch_disable(&stack_erasing_bypass);
-	else
-		static_branch_enable(&stack_erasing_bypass);
-
-	pr_warn("stackleak: kernel stack erasing is %s\n",
-					str_enabled_disabled(state));
-	return ret;
-}
-static const struct ctl_table stackleak_sysctls[] = {
-	{
-		.procname	= "stack_erasing",
-		.data		= NULL,
-		.maxlen		= sizeof(int),
-		.mode		= 0600,
-		.proc_handler	= stack_erasing_sysctl,
-		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_ONE,
-	},
-};
-
-static int __init stackleak_sysctls_init(void)
-{
-	register_sysctl_init("kernel", stackleak_sysctls);
-	return 0;
-}
-late_initcall(stackleak_sysctls_init);
-#endif /* CONFIG_SYSCTL */
-
-#define skip_erasing()	static_branch_unlikely(&stack_erasing_bypass)
-#else
-#define skip_erasing()	false
-#endif /* CONFIG_STACKLEAK_RUNTIME_DISABLE */
-
-#ifndef __stackleak_poison
-static __always_inline void __stackleak_poison(unsigned long erase_low,
-					       unsigned long erase_high,
-					       unsigned long poison)
-{
-	while (erase_low < erase_high) {
-		*(unsigned long *)erase_low = poison;
-		erase_low += sizeof(unsigned long);
-	}
-}
-#endif
-
-static __always_inline void __stackleak_erase(bool on_task_stack)
-{
-	const unsigned long task_stack_low = stackleak_task_low_bound(current);
-	const unsigned long task_stack_high = stackleak_task_high_bound(current);
-	unsigned long erase_low, erase_high;
-
-	erase_low = stackleak_find_top_of_poison(task_stack_low,
-						 current->lowest_stack);
-
-#ifdef CONFIG_STACKLEAK_METRICS
-	current->prev_lowest_stack = erase_low;
-#endif
-
-	/*
-	 * Write poison to the task's stack between 'erase_low' and
-	 * 'erase_high'.
-	 *
-	 * If we're running on a different stack (e.g. an entry trampoline
-	 * stack) we can erase everything below the pt_regs at the top of the
-	 * task stack.
-	 *
-	 * If we're running on the task stack itself, we must not clobber any
-	 * stack used by this function and its caller. We assume that this
-	 * function has a fixed-size stack frame, and the current stack pointer
-	 * doesn't change while we write poison.
-	 */
-	if (on_task_stack)
-		erase_high = current_stack_pointer;
-	else
-		erase_high = task_stack_high;
-
-	__stackleak_poison(erase_low, erase_high, STACKLEAK_POISON);
-
-	/* Reset the 'lowest_stack' value for the next syscall */
-	current->lowest_stack = task_stack_high;
-}
-
-/*
- * Erase and poison the portion of the task stack used since the last erase.
- * Can be called from the task stack or an entry stack when the task stack is
- * no longer in use.
- */
-asmlinkage void noinstr stackleak_erase(void)
-{
-	if (skip_erasing())
-		return;
-
-	__stackleak_erase(on_thread_stack());
-}
-
-/*
- * Erase and poison the portion of the task stack used since the last erase.
- * Can only be called from the task stack.
- */
-asmlinkage void noinstr stackleak_erase_on_task_stack(void)
-{
-	if (skip_erasing())
-		return;
-
-	__stackleak_erase(true);
-}
-
-/*
- * Erase and poison the portion of the task stack used since the last erase.
- * Can only be called from a stack other than the task stack.
- */
-asmlinkage void noinstr stackleak_erase_off_task_stack(void)
-{
-	if (skip_erasing())
-		return;
-
-	__stackleak_erase(false);
-}
-
-void __used __no_caller_saved_registers noinstr stackleak_track_stack(void)
-{
-	unsigned long sp = current_stack_pointer;
-
-	/*
-	 * Having CONFIG_STACKLEAK_TRACK_MIN_SIZE larger than
-	 * STACKLEAK_SEARCH_DEPTH makes the poison search in
-	 * stackleak_erase() unreliable. Let's prevent that.
-	 */
-	BUILD_BUG_ON(CONFIG_STACKLEAK_TRACK_MIN_SIZE > STACKLEAK_SEARCH_DEPTH);
-
-	/* 'lowest_stack' should be aligned on the register width boundary */
-	sp = ALIGN(sp, sizeof(unsigned long));
-	if (sp < current->lowest_stack &&
-	    sp >= stackleak_task_low_bound(current)) {
-		current->lowest_stack = sp;
-	}
-}
-EXPORT_SYMBOL(stackleak_track_stack);
diff --git a/lib/Makefile b/lib/Makefile
index c38582f187dd..632e69d25feb 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -337,7 +337,7 @@ obj-$(CONFIG_UBSAN) += ubsan.o
 UBSAN_SANITIZE_ubsan.o := n
 KASAN_SANITIZE_ubsan.o := n
 KCSAN_SANITIZE_ubsan.o := n
-CFLAGS_ubsan.o := -fno-stack-protector $(DISABLE_STACKLEAK_PLUGIN)
+CFLAGS_ubsan.o := -fno-stack-protector $(DISABLE_KSTACK_ERASE)
 
 obj-$(CONFIG_SBITMAP) += sbitmap.o
 
diff --git a/scripts/Makefile.gcc-plugins b/scripts/Makefile.gcc-plugins
index 435ab3f0ec44..28b8867c4e84 100644
--- a/scripts/Makefile.gcc-plugins
+++ b/scripts/Makefile.gcc-plugins
@@ -12,15 +12,15 @@ gcc-plugin-$(CONFIG_GCC_PLUGIN_STACKLEAK)	+= stackleak_plugin.so
 gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_STACKLEAK)		\
 		+= -DSTACKLEAK_PLUGIN
 gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_STACKLEAK)		\
-		+= -fplugin-arg-stackleak_plugin-track-min-size=$(CONFIG_STACKLEAK_TRACK_MIN_SIZE)
+		+= -fplugin-arg-stackleak_plugin-track-min-size=$(CONFIG_KSTACK_ERASE_TRACK_MIN_SIZE)
 gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_STACKLEAK)		\
 		+= -fplugin-arg-stackleak_plugin-arch=$(SRCARCH)
 gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_STACKLEAK_VERBOSE)	\
 		+= -fplugin-arg-stackleak_plugin-verbose
 ifdef CONFIG_GCC_PLUGIN_STACKLEAK
-    DISABLE_STACKLEAK_PLUGIN += -fplugin-arg-stackleak_plugin-disable
+    DISABLE_KSTACK_ERASE += -fplugin-arg-stackleak_plugin-disable
 endif
-export DISABLE_STACKLEAK_PLUGIN
+export DISABLE_KSTACK_ERASE
 
 # All the plugin CFLAGS are collected here in case a build target needs to
 # filter them out of the KBUILD_CFLAGS.
diff --git a/security/Kconfig.hardening b/security/Kconfig.hardening
index fd1238753cad..125b35e2ef0f 100644
--- a/security/Kconfig.hardening
+++ b/security/Kconfig.hardening
@@ -82,10 +82,10 @@ choice
 
 endchoice
 
-config GCC_PLUGIN_STACKLEAK
+config KSTACK_ERASE
 	bool "Poison kernel stack before returning from syscalls"
+	depends on HAVE_ARCH_KSTACK_ERASE
 	depends on GCC_PLUGINS
-	depends on HAVE_ARCH_STACKLEAK
 	help
 	  This option makes the kernel erase the kernel stack before
 	  returning from system calls. This has the effect of leaving
@@ -103,6 +103,10 @@ config GCC_PLUGIN_STACKLEAK
 	  are advised to test this feature on your expected workload before
 	  deploying it.
 
+config GCC_PLUGIN_STACKLEAK
+	def_bool KSTACK_ERASE
+	depends on GCC_PLUGINS
+	help
 	  This plugin was ported from grsecurity/PaX. More information at:
 	   * https://grsecurity.net/
 	   * https://pax.grsecurity.net/
@@ -117,37 +121,37 @@ config GCC_PLUGIN_STACKLEAK_VERBOSE
 	  instrumented. This is useful for comparing coverage between
 	  builds.
 
-config STACKLEAK_TRACK_MIN_SIZE
-	int "Minimum stack frame size of functions tracked by STACKLEAK"
+config KSTACK_ERASE_TRACK_MIN_SIZE
+	int "Minimum stack frame size of functions tracked by KSTACK_ERASE"
 	default 100
 	range 0 4096
-	depends on GCC_PLUGIN_STACKLEAK
+	depends on KSTACK_ERASE
 	help
-	  The STACKLEAK gcc plugin instruments the kernel code for tracking
+	  The KSTACK_ERASE option instruments the kernel code for tracking
 	  the lowest border of the kernel stack (and for some other purposes).
 	  It inserts the stackleak_track_stack() call for the functions with
 	  a stack frame size greater than or equal to this parameter.
 	  If unsure, leave the default value 100.
 
-config STACKLEAK_METRICS
-	bool "Show STACKLEAK metrics in the /proc file system"
-	depends on GCC_PLUGIN_STACKLEAK
+config KSTACK_ERASE_METRICS
+	bool "Show KSTACK_ERASE metrics in the /proc file system"
+	depends on KSTACK_ERASE
 	depends on PROC_FS
 	help
-	  If this is set, STACKLEAK metrics for every task are available in
-	  the /proc file system. In particular, /proc/<pid>/stack_depth
+	  If this is set, KSTACK_ERASE metrics for every task are available
+	  in the /proc file system. In particular, /proc/<pid>/stack_depth
 	  shows the maximum kernel stack consumption for the current and
 	  previous syscalls. Although this information is not precise, it
-	  can be useful for estimating the STACKLEAK performance impact for
-	  your workloads.
+	  can be useful for estimating the KSTACK_ERASE performance impact
+	  for your workloads.
 
-config STACKLEAK_RUNTIME_DISABLE
+config KSTACK_ERASE_RUNTIME_DISABLE
 	bool "Allow runtime disabling of kernel stack erasing"
-	depends on GCC_PLUGIN_STACKLEAK
+	depends on KSTACK_ERASE
 	help
 	  This option provides 'stack_erasing' sysctl, which can be used in
 	  runtime to control kernel stack erasing for kernels built with
-	  CONFIG_GCC_PLUGIN_STACKLEAK.
+	  CONFIG_KSTACK_ERASE.
 
 config INIT_ON_ALLOC_DEFAULT_ON
 	bool "Enable heap memory zeroing on allocation by default"
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index f23bdda737aa..5451bdbcf84a 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -1192,7 +1192,7 @@ static const char *uaccess_safe_builtin[] = {
 	"__ubsan_handle_type_mismatch_v1",
 	"__ubsan_handle_shift_out_of_bounds",
 	"__ubsan_handle_load_invalid_value",
-	/* STACKLEAK */
+	/* KSTACK_ERASE */
 	"stackleak_track_stack",
 	/* TRACE_BRANCH_PROFILING */
 	"ftrace_likely_update",
diff --git a/tools/testing/selftests/lkdtm/config b/tools/testing/selftests/lkdtm/config
index 7afe05e8c4d7..bd09fdaf53e0 100644
--- a/tools/testing/selftests/lkdtm/config
+++ b/tools/testing/selftests/lkdtm/config
@@ -2,7 +2,7 @@ CONFIG_LKDTM=y
 CONFIG_DEBUG_LIST=y
 CONFIG_SLAB_FREELIST_HARDENED=y
 CONFIG_FORTIFY_SOURCE=y
-CONFIG_GCC_PLUGIN_STACKLEAK=y
+CONFIG_KSTACK_ERASE=y
 CONFIG_HARDENED_USERCOPY=y
 CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT=y
 CONFIG_INIT_ON_FREE_DEFAULT_ON=y
-- 
cgit v1.2.3


From 9ea1e8d28add49ab3c1ecfa43f08d92ee23f3e33 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@kernel.org>
Date: Thu, 17 Jul 2025 16:25:07 -0700
Subject: stackleak: Rename stackleak_track_stack to
 __sanitizer_cov_stack_depth

The Clang stack depth tracking implementation has a fixed name for
the stack depth tracking callback, "__sanitizer_cov_stack_depth", so
rename the GCC plugin function to match since the plugin has no external
dependencies on naming.

Link: https://lore.kernel.org/r/20250717232519.2984886-2-kees@kernel.org
Signed-off-by: Kees Cook <kees@kernel.org>
---
 include/linux/kstack_erase.h           |  2 +-
 kernel/kstack_erase.c                  |  4 +--
 scripts/gcc-plugins/stackleak_plugin.c | 52 +++++++++++++++++-----------------
 security/Kconfig.hardening             |  4 +--
 tools/objtool/check.c                  |  2 +-
 5 files changed, 32 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/include/linux/kstack_erase.h b/include/linux/kstack_erase.h
index 4e432eefa4d0..bf3bf1905557 100644
--- a/include/linux/kstack_erase.h
+++ b/include/linux/kstack_erase.h
@@ -80,7 +80,7 @@ static inline void stackleak_task_init(struct task_struct *t)
 asmlinkage void noinstr stackleak_erase(void);
 asmlinkage void noinstr stackleak_erase_on_task_stack(void);
 asmlinkage void noinstr stackleak_erase_off_task_stack(void);
-void __no_caller_saved_registers noinstr stackleak_track_stack(void);
+void __no_caller_saved_registers noinstr __sanitizer_cov_stack_depth(void);
 
 #else /* !CONFIG_KSTACK_ERASE */
 static inline void stackleak_task_init(struct task_struct *t) { }
diff --git a/kernel/kstack_erase.c b/kernel/kstack_erase.c
index 201b846f8345..e49bb88b4f0a 100644
--- a/kernel/kstack_erase.c
+++ b/kernel/kstack_erase.c
@@ -156,7 +156,7 @@ asmlinkage void noinstr stackleak_erase_off_task_stack(void)
 	__stackleak_erase(false);
 }
 
-void __used __no_caller_saved_registers noinstr stackleak_track_stack(void)
+void __used __no_caller_saved_registers noinstr __sanitizer_cov_stack_depth(void)
 {
 	unsigned long sp = current_stack_pointer;
 
@@ -174,4 +174,4 @@ void __used __no_caller_saved_registers noinstr stackleak_track_stack(void)
 		current->lowest_stack = sp;
 	}
 }
-EXPORT_SYMBOL(stackleak_track_stack);
+EXPORT_SYMBOL(__sanitizer_cov_stack_depth);
diff --git a/scripts/gcc-plugins/stackleak_plugin.c b/scripts/gcc-plugins/stackleak_plugin.c
index d20c47d21ad8..e486488c867d 100644
--- a/scripts/gcc-plugins/stackleak_plugin.c
+++ b/scripts/gcc-plugins/stackleak_plugin.c
@@ -9,7 +9,7 @@
  * any of the gcc libraries
  *
  * This gcc plugin is needed for tracking the lowest border of the kernel stack.
- * It instruments the kernel code inserting stackleak_track_stack() calls:
+ * It instruments the kernel code inserting __sanitizer_cov_stack_depth() calls:
  *  - after alloca();
  *  - for the functions with a stack frame size greater than or equal
  *     to the "track-min-size" plugin parameter.
@@ -33,7 +33,7 @@ __visible int plugin_is_GPL_compatible;
 
 static int track_frame_size = -1;
 static bool build_for_x86 = false;
-static const char track_function[] = "stackleak_track_stack";
+static const char track_function[] = "__sanitizer_cov_stack_depth";
 static bool disable = false;
 static bool verbose = false;
 
@@ -58,7 +58,7 @@ static void add_stack_tracking_gcall(gimple_stmt_iterator *gsi, bool after)
 	cgraph_node_ptr node;
 	basic_block bb;
 
-	/* Insert calling stackleak_track_stack() */
+	/* Insert calling __sanitizer_cov_stack_depth() */
 	stmt = gimple_build_call(track_function_decl, 0);
 	gimple_call = as_a_gcall(stmt);
 	if (after)
@@ -120,12 +120,12 @@ static void add_stack_tracking_gasm(gimple_stmt_iterator *gsi, bool after)
 	gcc_assert(build_for_x86);
 
 	/*
-	 * Insert calling stackleak_track_stack() in asm:
-	 *   asm volatile("call stackleak_track_stack"
+	 * Insert calling __sanitizer_cov_stack_depth() in asm:
+	 *   asm volatile("call __sanitizer_cov_stack_depth"
 	 *		  :: "r" (current_stack_pointer))
 	 * Use ASM_CALL_CONSTRAINT trick from arch/x86/include/asm/asm.h.
 	 * This constraint is taken into account during gcc shrink-wrapping
-	 * optimization. It is needed to be sure that stackleak_track_stack()
+	 * optimization. It is needed to be sure that __sanitizer_cov_stack_depth()
 	 * call is inserted after the prologue of the containing function,
 	 * when the stack frame is prepared.
 	 */
@@ -137,7 +137,7 @@ static void add_stack_tracking_gasm(gimple_stmt_iterator *gsi, bool after)
 	input = build_tree_list(NULL_TREE, build_const_char_string(2, "r"));
 	input = chainon(NULL_TREE, build_tree_list(input, sp_decl));
 	vec_safe_push(inputs, input);
-	asm_call = gimple_build_asm_vec("call stackleak_track_stack",
+	asm_call = gimple_build_asm_vec("call __sanitizer_cov_stack_depth",
 					inputs, NULL, NULL, NULL);
 	gimple_asm_set_volatile(asm_call, true);
 	if (after)
@@ -151,11 +151,11 @@ static void add_stack_tracking(gimple_stmt_iterator *gsi, bool after)
 {
 	/*
 	 * The 'no_caller_saved_registers' attribute is used for
-	 * stackleak_track_stack(). If the compiler supports this attribute for
-	 * the target arch, we can add calling stackleak_track_stack() in asm.
+	 * __sanitizer_cov_stack_depth(). If the compiler supports this attribute for
+	 * the target arch, we can add calling __sanitizer_cov_stack_depth() in asm.
 	 * That improves performance: we avoid useless operations with the
 	 * caller-saved registers in the functions from which we will remove
-	 * stackleak_track_stack() call during the stackleak_cleanup pass.
+	 * __sanitizer_cov_stack_depth() call during the stackleak_cleanup pass.
 	 */
 	if (lookup_attribute_spec(get_identifier("no_caller_saved_registers")))
 		add_stack_tracking_gasm(gsi, after);
@@ -165,7 +165,7 @@ static void add_stack_tracking(gimple_stmt_iterator *gsi, bool after)
 
 /*
  * Work with the GIMPLE representation of the code. Insert the
- * stackleak_track_stack() call after alloca() and into the beginning
+ * __sanitizer_cov_stack_depth() call after alloca() and into the beginning
  * of the function if it is not instrumented.
  */
 static unsigned int stackleak_instrument_execute(void)
@@ -205,7 +205,7 @@ static unsigned int stackleak_instrument_execute(void)
 					DECL_NAME_POINTER(current_function_decl));
 			}
 
-			/* Insert stackleak_track_stack() call after alloca() */
+			/* Insert __sanitizer_cov_stack_depth() call after alloca() */
 			add_stack_tracking(&gsi, true);
 			if (bb == entry_bb)
 				prologue_instrumented = true;
@@ -241,7 +241,7 @@ static unsigned int stackleak_instrument_execute(void)
 		return 0;
 	}
 
-	/* Insert stackleak_track_stack() call at the function beginning */
+	/* Insert __sanitizer_cov_stack_depth() call at the function beginning */
 	bb = entry_bb;
 	if (!single_pred_p(bb)) {
 		/* gcc_assert(bb_loop_depth(bb) ||
@@ -270,15 +270,15 @@ static void remove_stack_tracking_gcall(void)
 	rtx_insn *insn, *next;
 
 	/*
-	 * Find stackleak_track_stack() calls. Loop through the chain of insns,
+	 * Find __sanitizer_cov_stack_depth() calls. Loop through the chain of insns,
 	 * which is an RTL representation of the code for a function.
 	 *
 	 * The example of a matching insn:
-	 *  (call_insn 8 4 10 2 (call (mem (symbol_ref ("stackleak_track_stack")
-	 *  [flags 0x41] <function_decl 0x7f7cd3302a80 stackleak_track_stack>)
-	 *  [0 stackleak_track_stack S1 A8]) (0)) 675 {*call} (expr_list
-	 *  (symbol_ref ("stackleak_track_stack") [flags 0x41] <function_decl
-	 *  0x7f7cd3302a80 stackleak_track_stack>) (expr_list (0) (nil))) (nil))
+	 *  (call_insn 8 4 10 2 (call (mem (symbol_ref ("__sanitizer_cov_stack_depth")
+	 *  [flags 0x41] <function_decl 0x7f7cd3302a80 __sanitizer_cov_stack_depth>)
+	 *  [0 __sanitizer_cov_stack_depth S1 A8]) (0)) 675 {*call} (expr_list
+	 *  (symbol_ref ("__sanitizer_cov_stack_depth") [flags 0x41] <function_decl
+	 *  0x7f7cd3302a80 __sanitizer_cov_stack_depth>) (expr_list (0) (nil))) (nil))
 	 */
 	for (insn = get_insns(); insn; insn = next) {
 		rtx body;
@@ -318,7 +318,7 @@ static void remove_stack_tracking_gcall(void)
 		if (SYMBOL_REF_DECL(body) != track_function_decl)
 			continue;
 
-		/* Delete the stackleak_track_stack() call */
+		/* Delete the __sanitizer_cov_stack_depth() call */
 		delete_insn_and_edges(insn);
 #if BUILDING_GCC_VERSION < 8000
 		if (GET_CODE(next) == NOTE &&
@@ -340,12 +340,12 @@ static bool remove_stack_tracking_gasm(void)
 	gcc_assert(build_for_x86);
 
 	/*
-	 * Find stackleak_track_stack() asm calls. Loop through the chain of
+	 * Find __sanitizer_cov_stack_depth() asm calls. Loop through the chain of
 	 * insns, which is an RTL representation of the code for a function.
 	 *
 	 * The example of a matching insn:
 	 *  (insn 11 5 12 2 (parallel [ (asm_operands/v
-	 *  ("call stackleak_track_stack") ("") 0
+	 *  ("call __sanitizer_cov_stack_depth") ("") 0
 	 *  [ (reg/v:DI 7 sp [ current_stack_pointer ]) ]
 	 *  [ (asm_input:DI ("r")) ] [])
 	 *  (clobber (reg:CC 17 flags)) ]) -1 (nil))
@@ -375,7 +375,7 @@ static bool remove_stack_tracking_gasm(void)
 			continue;
 
 		if (strcmp(ASM_OPERANDS_TEMPLATE(body),
-						"call stackleak_track_stack")) {
+						"call __sanitizer_cov_stack_depth")) {
 			continue;
 		}
 
@@ -389,7 +389,7 @@ static bool remove_stack_tracking_gasm(void)
 
 /*
  * Work with the RTL representation of the code.
- * Remove the unneeded stackleak_track_stack() calls from the functions
+ * Remove the unneeded __sanitizer_cov_stack_depth() calls from the functions
  * which don't call alloca() and don't have a large enough stack frame size.
  */
 static unsigned int stackleak_cleanup_execute(void)
@@ -474,13 +474,13 @@ static bool stackleak_gate(void)
 	return track_frame_size >= 0;
 }
 
-/* Build the function declaration for stackleak_track_stack() */
+/* Build the function declaration for __sanitizer_cov_stack_depth() */
 static void stackleak_start_unit(void *gcc_data __unused,
 				 void *user_data __unused)
 {
 	tree fntype;
 
-	/* void stackleak_track_stack(void) */
+	/* void __sanitizer_cov_stack_depth(void) */
 	fntype = build_function_type_list(void_type_node, NULL_TREE);
 	track_function_decl = build_fn_decl(track_function, fntype);
 	DECL_ASSEMBLER_NAME(track_function_decl); /* for LTO */
diff --git a/security/Kconfig.hardening b/security/Kconfig.hardening
index 125b35e2ef0f..f7aa2024ab25 100644
--- a/security/Kconfig.hardening
+++ b/security/Kconfig.hardening
@@ -129,8 +129,8 @@ config KSTACK_ERASE_TRACK_MIN_SIZE
 	help
 	  The KSTACK_ERASE option instruments the kernel code for tracking
 	  the lowest border of the kernel stack (and for some other purposes).
-	  It inserts the stackleak_track_stack() call for the functions with
-	  a stack frame size greater than or equal to this parameter.
+	  It inserts the __sanitizer_cov_stack_depth() call for the functions
+	  with a stack frame size greater than or equal to this parameter.
 	  If unsure, leave the default value 100.
 
 config KSTACK_ERASE_METRICS
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 5451bdbcf84a..b7e24684f592 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -1193,7 +1193,7 @@ static const char *uaccess_safe_builtin[] = {
 	"__ubsan_handle_shift_out_of_bounds",
 	"__ubsan_handle_load_invalid_value",
 	/* KSTACK_ERASE */
-	"stackleak_track_stack",
+	"__sanitizer_cov_stack_depth",
 	/* TRACE_BRANCH_PROFILING */
 	"ftrace_likely_update",
 	/* STACKPROTECTOR */
-- 
cgit v1.2.3


From 2424fe1cac4fc8ea0520ba22ede7544c3ddc8dd1 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@kernel.org>
Date: Thu, 17 Jul 2025 16:25:10 -0700
Subject: arm: Handle KCOV __init vs inline mismatches

When KCOV is enabled all functions get instrumented, unless
the __no_sanitize_coverage attribute is used. To prepare for
__no_sanitize_coverage being applied to __init functions, we have to
handle differences in how GCC's inline optimizations get resolved. For
arm this exposed several places where __init annotations were missing
but ended up being "accidentally correct". Fix these cases and force
several functions to be inline with __always_inline.

Acked-by: Nishanth Menon <nm@ti.com>
Acked-by: Lee Jones <lee@kernel.org>
Reviewed-by: Nishanth Menon <nm@ti.com>
Link: https://lore.kernel.org/r/20250717232519.2984886-5-kees@kernel.org
Signed-off-by: Kees Cook <kees@kernel.org>
---
 arch/arm/mm/cache-feroceon-l2.c   | 2 +-
 arch/arm/mm/cache-tauros2.c       | 2 +-
 drivers/clocksource/timer-orion.c | 2 +-
 drivers/soc/ti/pm33xx.c           | 2 +-
 include/linux/mfd/dbx500-prcmu.h  | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/arch/arm/mm/cache-feroceon-l2.c b/arch/arm/mm/cache-feroceon-l2.c
index 25dbd84a1aaf..2bfefb252ffd 100644
--- a/arch/arm/mm/cache-feroceon-l2.c
+++ b/arch/arm/mm/cache-feroceon-l2.c
@@ -295,7 +295,7 @@ static inline u32 read_extra_features(void)
 	return u;
 }
 
-static inline void write_extra_features(u32 u)
+static inline void __init write_extra_features(u32 u)
 {
 	__asm__("mcr p15, 1, %0, c15, c1, 0" : : "r" (u));
 }
diff --git a/arch/arm/mm/cache-tauros2.c b/arch/arm/mm/cache-tauros2.c
index b1e1aba602f7..bfe166ccace0 100644
--- a/arch/arm/mm/cache-tauros2.c
+++ b/arch/arm/mm/cache-tauros2.c
@@ -177,7 +177,7 @@ static inline void __init write_actlr(u32 actlr)
 	__asm__("mcr p15, 0, %0, c1, c0, 1\n" : : "r" (actlr));
 }
 
-static void enable_extra_feature(unsigned int features)
+static void __init enable_extra_feature(unsigned int features)
 {
 	u32 u;
 
diff --git a/drivers/clocksource/timer-orion.c b/drivers/clocksource/timer-orion.c
index 49e86cb70a7a..61f1e27fc41e 100644
--- a/drivers/clocksource/timer-orion.c
+++ b/drivers/clocksource/timer-orion.c
@@ -43,7 +43,7 @@ static struct delay_timer orion_delay_timer = {
 	.read_current_timer = orion_read_timer,
 };
 
-static void orion_delay_timer_init(unsigned long rate)
+static void __init orion_delay_timer_init(unsigned long rate)
 {
 	orion_delay_timer.freq = rate;
 	register_current_timer_delay(&orion_delay_timer);
diff --git a/drivers/soc/ti/pm33xx.c b/drivers/soc/ti/pm33xx.c
index dfdff186c805..dc52a2197d24 100644
--- a/drivers/soc/ti/pm33xx.c
+++ b/drivers/soc/ti/pm33xx.c
@@ -145,7 +145,7 @@ static int am33xx_do_sram_idle(u32 wfi_flags)
 	return pm_ops->cpu_suspend(am33xx_do_wfi_sram, wfi_flags);
 }
 
-static int __init am43xx_map_gic(void)
+static int am43xx_map_gic(void)
 {
 	gic_dist_base = ioremap(AM43XX_GIC_DIST_BASE, SZ_4K);
 
diff --git a/include/linux/mfd/dbx500-prcmu.h b/include/linux/mfd/dbx500-prcmu.h
index 98567623c9df..828362b7860c 100644
--- a/include/linux/mfd/dbx500-prcmu.h
+++ b/include/linux/mfd/dbx500-prcmu.h
@@ -213,7 +213,7 @@ struct prcmu_fw_version {
 
 #if defined(CONFIG_UX500_SOC_DB8500)
 
-static inline void prcmu_early_init(void)
+static inline void __init prcmu_early_init(void)
 {
 	db8500_prcmu_early_init();
 }
-- 
cgit v1.2.3


From 69fdb084355d6c0b353536024cc51aa5f7ffb62c Mon Sep 17 00:00:00 2001
From: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Date: Mon, 21 Jul 2025 21:50:49 +0300
Subject: wifi: mac80211: don't require cipher and keylen in gtk rekey

ieee80211_gtk_rekey_add receives a keyconf as an argument, and the
cipher and keylen are taken from there for the newly allocated key.
But in rekey, both the cipher and the keylen should be the same as those
of the old key, so let ieee80211_gtk_rekey_add find them itself, so
drivers won't have to fill them in.

Reviewed-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://patch.msgid.link/20250721214922.3c5c023bfae9.Ie6594ae2b4b6d5b3d536e642b349046ebfce7a5d@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/mld/d3.c |  7 +++++-
 drivers/net/wireless/intel/iwlwifi/mvm/d3.c | 11 +++++++--
 drivers/net/wireless/realtek/rtw89/wow.c    |  7 ++++--
 include/net/mac80211.h                      |  7 ++++--
 net/mac80211/key.c                          | 36 ++++++++++++++++++++++++++---
 5 files changed, 58 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/intel/iwlwifi/mld/d3.c b/drivers/net/wireless/intel/iwlwifi/mld/d3.c
index 26255246a320..ed0a0f76f1c5 100644
--- a/drivers/net/wireless/intel/iwlwifi/mld/d3.c
+++ b/drivers/net/wireless/intel/iwlwifi/mld/d3.c
@@ -762,6 +762,7 @@ iwl_mld_add_mcast_rekey(struct ieee80211_vif *vif,
 		.conf.keyidx = key_data->id,
 	};
 	int link_id = vif->active_links ? __ffs(vif->active_links) : -1;
+	u8 key[WOWLAN_KEY_MAX_SIZE];
 
 	BUILD_BUG_ON(WLAN_KEY_LEN_CCMP != WLAN_KEY_LEN_GCMP);
 	BUILD_BUG_ON(sizeof(conf.key) < WLAN_KEY_LEN_CCMP);
@@ -803,7 +804,11 @@ iwl_mld_add_mcast_rekey(struct ieee80211_vif *vif,
 	}
 
 	memcpy(conf.conf.key, key_data->key, conf.conf.keylen);
-	key_config = ieee80211_gtk_rekey_add(vif, &conf.conf, link_id);
+
+	memcpy(key, key_data->key, sizeof(key_data->key));
+
+	key_config = ieee80211_gtk_rekey_add(vif, key_data->id, key,
+					     sizeof(key), link_id);
 	if (IS_ERR(key_config))
 		return;
 
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c
index ef9bab042902..997cdd76b13c 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c
@@ -1954,6 +1954,7 @@ static bool iwl_mvm_gtk_rekey(struct iwl_wowlan_status_data *status,
 	DEFINE_RAW_FLEX(struct ieee80211_key_conf, conf, key,
 			WOWLAN_KEY_MAX_SIZE);
 	int link_id = vif->active_links ? __ffs(vif->active_links) : -1;
+	u8 key_data[WOWLAN_KEY_MAX_SIZE];
 
 	conf->cipher = gtk_cipher;
 
@@ -1988,8 +1989,10 @@ static bool iwl_mvm_gtk_rekey(struct iwl_wowlan_status_data *status,
 				 conf->cipher, conf->keyidx);
 		memcpy(conf->key, status->gtk[i].key,
 		       sizeof(status->gtk[i].key));
+		memcpy(key_data, status->gtk[i].key, sizeof(status->gtk[i].key));
 
-		key = ieee80211_gtk_rekey_add(vif, conf, link_id);
+		key = ieee80211_gtk_rekey_add(vif, status->gtk[i].id, key_data,
+					      sizeof(key_data), link_id);
 		if (IS_ERR(key)) {
 			/* FW may send also the old keys */
 			if (PTR_ERR(key) == -EALREADY)
@@ -2021,6 +2024,7 @@ iwl_mvm_d3_igtk_bigtk_rekey_add(struct iwl_wowlan_status_data *status,
 	struct ieee80211_key_conf *key_config;
 	struct ieee80211_key_seq seq;
 	int link_id = vif->active_links ? __ffs(vif->active_links) : -1;
+	u8 key[WOWLAN_KEY_MAX_SIZE];
 	s8 keyidx = key_data->id;
 
 	conf->cipher = cipher;
@@ -2050,7 +2054,10 @@ iwl_mvm_d3_igtk_bigtk_rekey_add(struct iwl_wowlan_status_data *status,
 	BUILD_BUG_ON(WOWLAN_KEY_MAX_SIZE < sizeof(key_data->key));
 	memcpy(conf->key, key_data->key, conf->keylen);
 
-	key_config = ieee80211_gtk_rekey_add(vif, conf, link_id);
+	memcpy(key, key_data->key, sizeof(key_data->key));
+
+	key_config = ieee80211_gtk_rekey_add(vif, keyidx, key, sizeof(key),
+					     link_id);
 	if (IS_ERR(key_config)) {
 		/* FW may send also the old keys */
 		return PTR_ERR(key_config) == -EALREADY;
diff --git a/drivers/net/wireless/realtek/rtw89/wow.c b/drivers/net/wireless/realtek/rtw89/wow.c
index 071c7577df52..5bb7c1a42f1d 100644
--- a/drivers/net/wireless/realtek/rtw89/wow.c
+++ b/drivers/net/wireless/realtek/rtw89/wow.c
@@ -619,9 +619,12 @@ static struct ieee80211_key_conf *rtw89_wow_gtk_rekey(struct rtw89_dev *rtwdev,
 	       flex_array_size(rekey_conf, key, cipher_info->len));
 
 	if (ieee80211_vif_is_mld(wow_vif))
-		key = ieee80211_gtk_rekey_add(wow_vif, rekey_conf, rtwvif_link->link_id);
+		key = ieee80211_gtk_rekey_add(wow_vif, keyidx, gtk,
+					      cipher_info->len,
+					      rtwvif_link->link_id);
 	else
-		key = ieee80211_gtk_rekey_add(wow_vif, rekey_conf, -1);
+		key = ieee80211_gtk_rekey_add(wow_vif, keyidx, gtk,
+					      cipher_info->len, -1);
 
 	kfree(rekey_conf);
 	if (IS_ERR(key)) {
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index a0cf976a9117..a45e4bee65d4 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -6035,7 +6035,10 @@ void ieee80211_set_key_rx_seq(struct ieee80211_key_conf *keyconf,
 /**
  * ieee80211_gtk_rekey_add - add a GTK key from rekeying during WoWLAN
  * @vif: the virtual interface to add the key on
- * @keyconf: new key data
+ * @idx: the keyidx of the key
+ * @key_data: the key data
+ * @key_len: length of @key_data. Might be bigger than the actual key length,
+ *	but not smaller (for the driver's convenience)
  * @link_id: the link id of the key or -1 for non-MLO
  *
  * When GTK rekeying was done while the system was suspended, (a) new
@@ -6062,7 +6065,7 @@ void ieee80211_set_key_rx_seq(struct ieee80211_key_conf *keyconf,
  */
 struct ieee80211_key_conf *
 ieee80211_gtk_rekey_add(struct ieee80211_vif *vif,
-			struct ieee80211_key_conf *keyconf,
+			u8 idx, u8 *key_data, u8 key_len,
 			int link_id);
 
 /**
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index 9d65013ddac7..b14e9cd9713f 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -1356,11 +1356,12 @@ EXPORT_SYMBOL_GPL(ieee80211_set_key_rx_seq);
 
 struct ieee80211_key_conf *
 ieee80211_gtk_rekey_add(struct ieee80211_vif *vif,
-			struct ieee80211_key_conf *keyconf,
+			u8 idx, u8 *key_data, u8 key_len,
 			int link_id)
 {
 	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
 	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_key *prev_key;
 	struct ieee80211_key *key;
 	int err;
 	struct ieee80211_link_data *link_data =
@@ -1376,8 +1377,37 @@ ieee80211_gtk_rekey_add(struct ieee80211_vif *vif,
 	if (WARN_ON(vif->type != NL80211_IFTYPE_STATION))
 		return ERR_PTR(-EINVAL);
 
-	key = ieee80211_key_alloc(keyconf->cipher, keyconf->keyidx,
-				  keyconf->keylen, keyconf->key,
+	if (WARN_ON(idx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS +
+		    NUM_DEFAULT_BEACON_KEYS))
+		return ERR_PTR(-EINVAL);
+
+	prev_key = wiphy_dereference(local->hw.wiphy,
+				     link_data->gtk[idx]);
+	if (!prev_key) {
+		if (idx < NUM_DEFAULT_KEYS) {
+			for (int i = 0; i < NUM_DEFAULT_KEYS; i++) {
+				if (i == idx)
+					continue;
+				prev_key = wiphy_dereference(local->hw.wiphy,
+							     link_data->gtk[i]);
+				if (prev_key)
+					break;
+			}
+		} else {
+			/* For IGTK we have 4 and 5 and for BIGTK - 6 and 7 */
+			prev_key = wiphy_dereference(local->hw.wiphy,
+						     link_data->gtk[idx ^ 1]);
+		}
+	}
+
+	if (WARN_ON(!prev_key))
+		return ERR_PTR(-EINVAL);
+
+	if (WARN_ON(key_len < prev_key->conf.keylen))
+		return ERR_PTR(-EINVAL);
+
+	key = ieee80211_key_alloc(prev_key->conf.cipher, idx,
+				  prev_key->conf.keylen, key_data,
 				  0, NULL);
 	if (IS_ERR(key))
 		return ERR_CAST(key);
-- 
cgit v1.2.3


From 1b5f1454091e9e9fb5c944b3161acf4ec0894d0d Mon Sep 17 00:00:00 2001
From: Feng Lee <379943137@qq.com>
Date: Mon, 21 Jul 2025 16:04:35 +0800
Subject: sched/idle: Remove play_idle()

play_idle() is no longer in use, so delete it.

Signed-off-by: Feng Lee <379943137@qq.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/tencent_C3E0BD9B812C27A30FC49F1EA6A4B1352707@qq.com
---
 include/linux/cpu.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 6378370a952f..8b1abbf5b6d2 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -187,11 +187,6 @@ static inline void arch_cpu_finalize_init(void) { }
 
 void play_idle_precise(u64 duration_ns, u64 latency_ns);
 
-static inline void play_idle(unsigned long duration_us)
-{
-	play_idle_precise(duration_us * NSEC_PER_USEC, U64_MAX);
-}
-
 #ifdef CONFIG_HOTPLUG_CPU
 void cpuhp_report_idle_dead(void);
 #else
-- 
cgit v1.2.3


From 2fb4af5ea3c735a205d97de10f044f809b20af51 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 18 Jul 2025 10:14:49 +0200
Subject: NFS: track active delegations per-server

The active delegation watermark was added to avoid overloading servers.
Track the number of active delegations per server instead of globally so
that clients talking to multiple servers aren't throttled by a shared count.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Link: https://lore.kernel.org/r/20250718081509.2607553-5-hch@lst.de
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/client.c           |  1 +
 fs/nfs/delegation.c       | 35 +++++++++++++++++++----------------
 include/linux/nfs_fs_sb.h |  1 +
 3 files changed, 21 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 47258dc3af70..e13eb429b8b5 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1005,6 +1005,7 @@ struct nfs_server *nfs_alloc_server(void)
 	INIT_LIST_HEAD(&server->ss_src_copies);
 
 	atomic_set(&server->active, 0);
+	atomic_long_set(&server->nr_active_delegations, 0);
 
 	server->io_stats = nfs_alloc_iostats();
 	if (!server->io_stats) {
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 5f85966d7709..ea96f77e38c2 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -27,7 +27,6 @@
 
 #define NFS_DEFAULT_DELEGATION_WATERMARK (5000U)
 
-static atomic_long_t nfs_active_delegations;
 static unsigned nfs_delegation_watermark = NFS_DEFAULT_DELEGATION_WATERMARK;
 module_param_named(delegation_watermark, nfs_delegation_watermark, uint, 0644);
 
@@ -38,11 +37,12 @@ static void __nfs_free_delegation(struct nfs_delegation *delegation)
 	kfree_rcu(delegation, rcu);
 }
 
-static void nfs_mark_delegation_revoked(struct nfs_delegation *delegation)
+static void nfs_mark_delegation_revoked(struct nfs_server *server,
+		struct nfs_delegation *delegation)
 {
 	if (!test_and_set_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) {
 		delegation->stateid.type = NFS4_INVALID_STATEID_TYPE;
-		atomic_long_dec(&nfs_active_delegations);
+		atomic_long_dec(&server->nr_active_delegations);
 		if (!test_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
 			nfs_clear_verifier_delegated(delegation->inode);
 	}
@@ -60,9 +60,10 @@ static void nfs_put_delegation(struct nfs_delegation *delegation)
 		__nfs_free_delegation(delegation);
 }
 
-static void nfs_free_delegation(struct nfs_delegation *delegation)
+static void nfs_free_delegation(struct nfs_server *server,
+		struct nfs_delegation *delegation)
 {
-	nfs_mark_delegation_revoked(delegation);
+	nfs_mark_delegation_revoked(server, delegation);
 	nfs_put_delegation(delegation);
 }
 
@@ -261,7 +262,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred,
 	}
 	clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
 	if (test_and_clear_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
-		atomic_long_inc(&nfs_active_delegations);
+		atomic_long_inc(&NFS_SERVER(inode)->nr_active_delegations);
 	spin_unlock(&delegation->lock);
 	rcu_read_unlock();
 	put_cred(oldcred);
@@ -413,7 +414,8 @@ nfs_update_delegation_cred(struct nfs_delegation *delegation,
 }
 
 static void
-nfs_update_inplace_delegation(struct nfs_delegation *delegation,
+nfs_update_inplace_delegation(struct nfs_server *server,
+		struct nfs_delegation *delegation,
 		const struct nfs_delegation *update)
 {
 	if (nfs4_stateid_is_newer(&update->stateid, &delegation->stateid)) {
@@ -426,7 +428,7 @@ nfs_update_inplace_delegation(struct nfs_delegation *delegation,
 			nfs_update_delegation_cred(delegation, update->cred);
 			/* smp_mb__before_atomic() is implicit due to xchg() */
 			clear_bit(NFS_DELEGATION_REVOKED, &delegation->flags);
-			atomic_long_inc(&nfs_active_delegations);
+			atomic_long_inc(&server->nr_active_delegations);
 		}
 	}
 }
@@ -481,7 +483,7 @@ int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred,
 	if (nfs4_stateid_match_other(&old_delegation->stateid,
 				&delegation->stateid)) {
 		spin_lock(&old_delegation->lock);
-		nfs_update_inplace_delegation(old_delegation,
+		nfs_update_inplace_delegation(server, old_delegation,
 				delegation);
 		spin_unlock(&old_delegation->lock);
 		goto out;
@@ -530,7 +532,7 @@ add_new:
 	rcu_assign_pointer(nfsi->delegation, delegation);
 	delegation = NULL;
 
-	atomic_long_inc(&nfs_active_delegations);
+	atomic_long_inc(&server->nr_active_delegations);
 
 	trace_nfs4_set_delegation(inode, type);
 
@@ -544,7 +546,7 @@ out:
 		__nfs_free_delegation(delegation);
 	if (freeme != NULL) {
 		nfs_do_return_delegation(inode, freeme, 0);
-		nfs_free_delegation(freeme);
+		nfs_free_delegation(server, freeme);
 	}
 	return status;
 }
@@ -756,7 +758,7 @@ void nfs_inode_evict_delegation(struct inode *inode)
 		set_bit(NFS_DELEGATION_RETURNING, &delegation->flags);
 		set_bit(NFS_DELEGATION_INODE_FREEING, &delegation->flags);
 		nfs_do_return_delegation(inode, delegation, 1);
-		nfs_free_delegation(delegation);
+		nfs_free_delegation(NFS_SERVER(inode), delegation);
 	}
 }
 
@@ -842,7 +844,8 @@ void nfs4_inode_return_delegation_on_close(struct inode *inode)
 	if (!delegation)
 		goto out;
 	if (test_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags) ||
-	    atomic_long_read(&nfs_active_delegations) >= nfs_delegation_watermark) {
+	    atomic_long_read(&NFS_SERVER(inode)->nr_active_delegations) >=
+	    nfs_delegation_watermark) {
 		spin_lock(&delegation->lock);
 		if (delegation->inode &&
 		    list_empty(&NFS_I(inode)->open_files) &&
@@ -1018,7 +1021,7 @@ static void nfs_revoke_delegation(struct inode *inode,
 		}
 		spin_unlock(&delegation->lock);
 	}
-	nfs_mark_delegation_revoked(delegation);
+	nfs_mark_delegation_revoked(NFS_SERVER(inode), delegation);
 	ret = true;
 out:
 	rcu_read_unlock();
@@ -1050,7 +1053,7 @@ void nfs_delegation_mark_returned(struct inode *inode,
 			delegation->stateid.seqid = stateid->seqid;
 	}
 
-	nfs_mark_delegation_revoked(delegation);
+	nfs_mark_delegation_revoked(NFS_SERVER(inode), delegation);
 	clear_bit(NFS_DELEGATION_RETURNING, &delegation->flags);
 	spin_unlock(&delegation->lock);
 	if (nfs_detach_delegation(NFS_I(inode), delegation, NFS_SERVER(inode)))
@@ -1270,7 +1273,7 @@ restart:
 		if (delegation != NULL) {
 			if (nfs_detach_delegation(NFS_I(inode), delegation,
 						server) != NULL)
-				nfs_free_delegation(delegation);
+				nfs_free_delegation(server, delegation);
 			/* Match nfs_start_delegation_return_locked */
 			nfs_put_delegation(delegation);
 		}
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index d2d36711a119..a9b44f12623f 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -254,6 +254,7 @@ struct nfs_server {
 	struct list_head	state_owners_lru;
 	struct list_head	layouts;
 	struct list_head	delegations;
+	atomic_long_t		nr_active_delegations;
 	struct list_head	ss_copies;
 	struct list_head	ss_src_copies;
 
-- 
cgit v1.2.3


From f5b3108e6a14418b120a3c38ca589b8d6cf87627 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 18 Jul 2025 10:14:50 +0200
Subject: NFS: use a hash table for delegation lookup

nfs_delegation_find_inode currently has to walk the server's entire list of
delegations on every lookup. That list can become pretty large, and becomes
even larger when the delegation watermark is increased.

Add a hash table to speed up the delegation lookup, sized as a fraction
of the delegation watermark.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20250718081509.2607553-6-hch@lst.de
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/delegation.c       | 28 +++++++++++++++++++++++++++-
 fs/nfs/delegation.h       |  3 +++
 fs/nfs/nfs4client.c       |  5 +++++
 fs/nfs/nfs4proc.c         | 22 +++++++++++++++++++++-
 include/linux/nfs_fs_sb.h |  2 ++
 5 files changed, 58 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index ea96f77e38c2..9d3a5f29f17f 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -30,6 +30,13 @@
 static unsigned nfs_delegation_watermark = NFS_DEFAULT_DELEGATION_WATERMARK;
 module_param_named(delegation_watermark, nfs_delegation_watermark, uint, 0644);
 
+static struct hlist_head *nfs_delegation_hash(struct nfs_server *server,
+		const struct nfs_fh *fhandle)
+{
+	return server->delegation_hash_table +
+		(nfs_fhandle_hash(fhandle) & server->delegation_hash_mask);
+}
+
 static void __nfs_free_delegation(struct nfs_delegation *delegation)
 {
 	put_cred(delegation->cred);
@@ -367,6 +374,7 @@ nfs_detach_delegation_locked(struct nfs_inode *nfsi,
 		spin_unlock(&delegation->lock);
 		return NULL;
 	}
+	hlist_del_init_rcu(&delegation->hash);
 	list_del_rcu(&delegation->super_list);
 	delegation->inode = NULL;
 	rcu_assign_pointer(nfsi->delegation, NULL);
@@ -529,6 +537,8 @@ add_new:
 	spin_unlock(&inode->i_lock);
 
 	list_add_tail_rcu(&delegation->super_list, &server->delegations);
+	hlist_add_head_rcu(&delegation->hash,
+			nfs_delegation_hash(server, &NFS_I(inode)->fh));
 	rcu_assign_pointer(nfsi->delegation, delegation);
 	delegation = NULL;
 
@@ -1166,11 +1176,12 @@ static struct inode *
 nfs_delegation_find_inode_server(struct nfs_server *server,
 				 const struct nfs_fh *fhandle)
 {
+	struct hlist_head *head = nfs_delegation_hash(server, fhandle);
 	struct nfs_delegation *delegation;
 	struct super_block *freeme = NULL;
 	struct inode *res = NULL;
 
-	list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
+	hlist_for_each_entry_rcu(delegation, head, hash) {
 		spin_lock(&delegation->lock);
 		if (delegation->inode != NULL &&
 		    !test_bit(NFS_DELEGATION_REVOKED, &delegation->flags) &&
@@ -1577,3 +1588,18 @@ out:
 	rcu_read_unlock();
 	return ret;
 }
+
+int nfs4_delegation_hash_alloc(struct nfs_server *server)
+{
+	int delegation_buckets, i;
+
+	delegation_buckets = roundup_pow_of_two(nfs_delegation_watermark / 16);
+	server->delegation_hash_mask = delegation_buckets - 1;
+	server->delegation_hash_table = kmalloc_array(delegation_buckets,
+			sizeof(*server->delegation_hash_table), GFP_KERNEL);
+	if (!server->delegation_hash_table)
+		return -ENOMEM;
+	for (i = 0; i < delegation_buckets; i++)
+		INIT_HLIST_HEAD(&server->delegation_hash_table[i]);
+	return 0;
+}
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 8ff5ab9c5c25..08ec2e9c68a4 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -14,6 +14,7 @@
  * NFSv4 delegation
  */
 struct nfs_delegation {
+	struct hlist_node hash;
 	struct list_head super_list;
 	const struct cred *cred;
 	struct inode *inode;
@@ -123,4 +124,6 @@ static inline int nfs_have_delegated_mtime(struct inode *inode)
 						 NFS_DELEGATION_FLAG_TIME);
 }
 
+int nfs4_delegation_hash_alloc(struct nfs_server *server);
+
 #endif
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 5943a192f36b..2ea98f1f116f 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -802,6 +802,7 @@ static void nfs4_destroy_server(struct nfs_server *server)
 	unset_pnfs_layoutdriver(server);
 	nfs4_purge_state_owners(server, &freeme);
 	nfs4_free_state_owners(&freeme);
+	kfree(server->delegation_hash_table);
 }
 
 /*
@@ -1096,6 +1097,10 @@ static int nfs4_server_common_setup(struct nfs_server *server,
 {
 	int error;
 
+	error = nfs4_delegation_hash_alloc(server);
+	if (error)
+		return error;
+
 	/* data servers support only a subset of NFSv4.1 */
 	if (is_ds_only_client(server->nfs_client))
 		return -EPROTONOSUPPORT;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index ef2077e185b6..d8bebd757af3 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -10967,6 +10967,26 @@ static const struct inode_operations nfs4_file_inode_operations = {
 	.listxattr	= nfs4_listxattr,
 };
 
+static struct nfs_server *nfs4_clone_server(struct nfs_server *source,
+		struct nfs_fh *fh, struct nfs_fattr *fattr,
+		rpc_authflavor_t flavor)
+{
+	struct nfs_server *server;
+	int error;
+
+	server = nfs_clone_server(source, fh, fattr, flavor);
+	if (IS_ERR(server))
+		return server;
+
+	error = nfs4_delegation_hash_alloc(server);
+	if (error) {
+		nfs_free_server(server);
+		return ERR_PTR(error);
+	}
+
+	return server;
+}
+
 const struct nfs_rpc_ops nfs_v4_clientops = {
 	.version	= 4,			/* protocol version */
 	.dentry_ops	= &nfs4_dentry_operations,
@@ -11019,7 +11039,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
 	.init_client	= nfs4_init_client,
 	.free_client	= nfs4_free_client,
 	.create_server	= nfs4_create_server,
-	.clone_server	= nfs_clone_server,
+	.clone_server	= nfs4_clone_server,
 	.discover_trunking = nfs4_discover_trunking,
 	.enable_swap	= nfs4_enable_swap,
 	.disable_swap	= nfs4_disable_swap,
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index a9b44f12623f..d30c0245031c 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -255,6 +255,8 @@ struct nfs_server {
 	struct list_head	layouts;
 	struct list_head	delegations;
 	atomic_long_t		nr_active_delegations;
+	unsigned int		delegation_hash_mask;
+	struct hlist_head	*delegation_hash_table;
 	struct list_head	ss_copies;
 	struct list_head	ss_src_copies;
 
-- 
cgit v1.2.3


From ad38574a8e8223361e265973fbd87013ea058c5d Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 8 Jul 2025 18:03:33 +0100
Subject: f2fs: Pass a folio to ADDRS_PER_PAGE()

All callers now have a folio so pass it in.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c          |  2 +-
 fs/f2fs/file.c          | 18 +++++++++---------
 fs/f2fs/recovery.c      |  4 ++--
 fs/f2fs/segment.c       |  2 +-
 include/linux/f2fs_fs.h |  2 +-
 5 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 0e261caf2f91..8a2414ce39ff 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1588,7 +1588,7 @@ next_dnode:
 	start_pgofs = pgofs;
 	prealloc = 0;
 	last_ofs_in_node = ofs_in_node = dn.ofs_in_node;
-	end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode);
+	end_offset = ADDRS_PER_PAGE(dn.node_folio, inode);
 
 next_block:
 	blkaddr = f2fs_data_blkaddr(&dn);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 04a5a1089320..60618c52ba50 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -489,7 +489,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
 			}
 		}
 
-		end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode);
+		end_offset = ADDRS_PER_PAGE(dn.node_folio, inode);
 
 		/* find data/hole in dnode block */
 		for (; dn.ofs_in_node < end_offset;
@@ -814,7 +814,7 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock)
 		goto out;
 	}
 
-	count = ADDRS_PER_PAGE(&dn.node_folio->page, inode);
+	count = ADDRS_PER_PAGE(dn.node_folio, inode);
 
 	count -= dn.ofs_in_node;
 	f2fs_bug_on(sbi, count < 0);
@@ -1233,7 +1233,7 @@ int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
 			return err;
 		}
 
-		end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode);
+		end_offset = ADDRS_PER_PAGE(dn.node_folio, inode);
 		count = min(end_offset - dn.ofs_in_node, pg_end - pg_start);
 
 		f2fs_bug_on(F2FS_I_SB(inode), count == 0 || count > end_offset);
@@ -1332,7 +1332,7 @@ next_dnode:
 		goto next;
 	}
 
-	done = min((pgoff_t)ADDRS_PER_PAGE(&dn.node_folio->page, inode) -
+	done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_folio, inode) -
 							dn.ofs_in_node, len);
 	for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) {
 		*blkaddr = f2fs_data_blkaddr(&dn);
@@ -1421,7 +1421,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode,
 			}
 
 			ilen = min((pgoff_t)
-				ADDRS_PER_PAGE(&dn.node_folio->page, dst_inode) -
+				ADDRS_PER_PAGE(dn.node_folio, dst_inode) -
 						dn.ofs_in_node, len - i);
 			do {
 				dn.data_blkaddr = f2fs_data_blkaddr(&dn);
@@ -1717,7 +1717,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
 				goto out;
 			}
 
-			end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode);
+			end_offset = ADDRS_PER_PAGE(dn.node_folio, inode);
 			end = min(pg_end, end_offset - dn.ofs_in_node + index);
 
 			ret = f2fs_do_zero_range(&dn, index, end);
@@ -3885,7 +3885,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
 			break;
 		}
 
-		end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode);
+		end_offset = ADDRS_PER_PAGE(dn.node_folio, inode);
 		count = min(end_offset - dn.ofs_in_node, last_idx - page_idx);
 		count = round_up(count, fi->i_cluster_size);
 
@@ -4063,7 +4063,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
 			break;
 		}
 
-		end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode);
+		end_offset = ADDRS_PER_PAGE(dn.node_folio, inode);
 		count = min(end_offset - dn.ofs_in_node, last_idx - page_idx);
 		count = round_up(count, fi->i_cluster_size);
 
@@ -4227,7 +4227,7 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg)
 			goto out;
 		}
 
-		end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode);
+		end_offset = ADDRS_PER_PAGE(dn.node_folio, inode);
 		count = min(end_offset - dn.ofs_in_node, pg_end - index);
 		for (i = 0; i < count; i++, index++, dn.ofs_in_node++) {
 			struct block_device *cur_bdev;
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 5a45d0d1f05c..894b27b0329d 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -527,7 +527,7 @@ got_it:
 	nid = le32_to_cpu(sum.nid);
 	ofs_in_node = le16_to_cpu(sum.ofs_in_node);
 
-	max_addrs = ADDRS_PER_PAGE(&dn->node_folio->page, dn->inode);
+	max_addrs = ADDRS_PER_PAGE(dn->node_folio, dn->inode);
 	if (ofs_in_node >= max_addrs) {
 		f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%lu, nid:%u, max:%u",
 			ofs_in_node, dn->inode->i_ino, nid, max_addrs);
@@ -649,7 +649,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
 
 	/* step 3: recover data indices */
 	start = f2fs_start_bidx_of_node(ofs_of_node(folio), inode);
-	end = start + ADDRS_PER_PAGE(&folio->page, inode);
+	end = start + ADDRS_PER_PAGE(folio, inode);
 
 	set_new_dnode(&dn, inode, NULL, NULL, 0);
 retry_dn:
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 2a6dcfba911f..909637873ff7 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -334,7 +334,7 @@ static int __f2fs_commit_atomic_write(struct inode *inode)
 			goto next;
 		}
 
-		blen = min((pgoff_t)ADDRS_PER_PAGE(&dn.node_folio->page, cow_inode),
+		blen = min((pgoff_t)ADDRS_PER_PAGE(dn.node_folio, cow_inode),
 				len);
 		index = off;
 		for (i = 0; i < blen; i++, dn.ofs_in_node++, index++) {
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index 5206d63b3386..25857877eaec 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -268,7 +268,7 @@ struct node_footer {
 /* Node IDs in an Indirect Block */
 #define NIDS_PER_BLOCK		((F2FS_BLKSIZE - sizeof(struct node_footer)) / sizeof(__le32))
 
-#define ADDRS_PER_PAGE(page, inode)	(addrs_per_page(inode, IS_INODE(page)))
+#define ADDRS_PER_PAGE(folio, inode)	(addrs_per_page(inode, IS_INODE(&folio->page)))
 
 #define	NODE_DIR1_BLOCK		(DEF_ADDRS_PER_INODE + 1)
 #define	NODE_DIR2_BLOCK		(DEF_ADDRS_PER_INODE + 2)
-- 
cgit v1.2.3


From a5f3be6e652a7beaaf6c482bc013b64129a5d239 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 8 Jul 2025 18:03:34 +0100
Subject: f2fs: Pass a folio to IS_INODE()

All callers now have a folio so pass it in.  Also make it const to help
the compiler.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/f2fs.h          |  6 +++---
 fs/f2fs/file.c          |  2 +-
 fs/f2fs/gc.c            |  2 +-
 fs/f2fs/inline.c        |  2 +-
 fs/f2fs/inode.c         |  2 +-
 fs/f2fs/node.c          | 14 ++++++--------
 fs/f2fs/recovery.c      |  8 ++++----
 include/linux/f2fs_fs.h |  2 +-
 8 files changed, 18 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 3933327d8cc3..09ddc0626dfe 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -3019,9 +3019,9 @@ static inline void f2fs_radix_tree_insert(struct radix_tree_root *root,
 
 #define RAW_IS_INODE(p)	((p)->footer.nid == (p)->footer.ino)
 
-static inline bool IS_INODE(struct page *page)
+static inline bool IS_INODE(const struct folio *folio)
 {
-	struct f2fs_node *p = F2FS_NODE(page);
+	struct f2fs_node *p = F2FS_NODE(&folio->page);
 
 	return RAW_IS_INODE(p);
 }
@@ -3041,7 +3041,7 @@ static inline int f2fs_has_extra_attr(struct inode *inode);
 static inline unsigned int get_dnode_base(struct inode *inode,
 					struct folio *node_folio)
 {
-	if (!IS_INODE(&node_folio->page))
+	if (!IS_INODE(node_folio))
 		return 0;
 
 	return inode ? get_extra_isize(inode) :
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 60618c52ba50..36b32757d5b9 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -819,7 +819,7 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock)
 	count -= dn.ofs_in_node;
 	f2fs_bug_on(sbi, count < 0);
 
-	if (dn.ofs_in_node || IS_INODE(&dn.node_folio->page)) {
+	if (dn.ofs_in_node || IS_INODE(dn.node_folio)) {
 		f2fs_truncate_data_blocks_range(&dn, count);
 		free_from += count;
 	}
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 781b955cbb77..c1d4ecbd2505 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -1162,7 +1162,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
 		return false;
 	}
 
-	if (IS_INODE(&node_folio->page)) {
+	if (IS_INODE(node_folio)) {
 		base = offset_in_addr(F2FS_INODE(node_folio));
 		max_addrs = DEF_ADDRS_PER_INODE;
 	} else {
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 9851310cdb87..51adc43d5a5c 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -305,7 +305,7 @@ int f2fs_recover_inline_data(struct inode *inode, struct folio *nfolio)
 	 *    x       o  -> remove data blocks, and then recover inline_data
 	 *    x       x  -> recover data blocks
 	 */
-	if (IS_INODE(&nfolio->page))
+	if (IS_INODE(nfolio))
 		ri = F2FS_INODE(nfolio);
 
 	if (f2fs_has_inline_data(inode) &&
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index a8f64d206d19..dd3b43c24831 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -124,7 +124,7 @@ bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct folio *folio)
 	if (!f2fs_sb_has_inode_chksum(sbi))
 		return false;
 
-	if (!IS_INODE(&folio->page) || !(ri->i_inline & F2FS_EXTRA_ATTR))
+	if (!IS_INODE(folio) || !(ri->i_inline & F2FS_EXTRA_ATTR))
 		return false;
 
 	if (!F2FS_FITS_IN_INODE(ri, le16_to_cpu(ri->i_extra_isize),
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index b56e627e0b56..908a1eb9c415 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -973,7 +973,7 @@ static int truncate_dnode(struct dnode_of_data *dn)
 	else if (IS_ERR(folio))
 		return PTR_ERR(folio);
 
-	if (IS_INODE(&folio->page) || ino_of_node(folio) != dn->inode->i_ino) {
+	if (IS_INODE(folio) || ino_of_node(folio) != dn->inode->i_ino) {
 		f2fs_err(sbi, "incorrect node reference, ino: %lu, nid: %u, ino_of_node: %u",
 				dn->inode->i_ino, dn->nid, ino_of_node(folio));
 		set_sbi_flag(sbi, SBI_NEED_FSCK);
@@ -1474,10 +1474,8 @@ static int sanity_check_node_footer(struct f2fs_sb_info *sbi,
 					struct folio *folio, pgoff_t nid,
 					enum node_type ntype)
 {
-	struct page *page = &folio->page;
-
 	if (unlikely(nid != nid_of_node(folio) ||
-		(ntype == NODE_TYPE_INODE && !IS_INODE(page)) ||
+		(ntype == NODE_TYPE_INODE && !IS_INODE(folio)) ||
 		(ntype == NODE_TYPE_XATTR &&
 		!f2fs_has_xattr_block(ofs_of_node(folio))) ||
 		time_to_inject(sbi, FAULT_INCONSISTENT_FOOTER))) {
@@ -1867,7 +1865,7 @@ continue_unlock:
 			if (!atomic || folio == last_folio) {
 				set_fsync_mark(folio, 1);
 				percpu_counter_inc(&sbi->rf_node_block_count);
-				if (IS_INODE(&folio->page)) {
+				if (IS_INODE(folio)) {
 					if (is_inode_flag_set(inode,
 								FI_DIRTY_INODE))
 						f2fs_update_inode(inode, folio);
@@ -1976,7 +1974,7 @@ void f2fs_flush_inline_data(struct f2fs_sb_info *sbi)
 		for (i = 0; i < nr_folios; i++) {
 			struct folio *folio = fbatch.folios[i];
 
-			if (!IS_INODE(&folio->page))
+			if (!IS_INODE(folio))
 				continue;
 
 			folio_lock(folio);
@@ -2077,7 +2075,7 @@ continue_unlock:
 			}
 
 			/* flush dirty inode */
-			if (IS_INODE(&folio->page) && flush_dirty_inode(folio))
+			if (IS_INODE(folio) && flush_dirty_inode(folio))
 				goto lock_node;
 write_node:
 			f2fs_folio_wait_writeback(folio, NODE, true, true);
@@ -2213,7 +2211,7 @@ static bool f2fs_dirty_node_folio(struct address_space *mapping,
 	if (!folio_test_uptodate(folio))
 		folio_mark_uptodate(folio);
 #ifdef CONFIG_F2FS_CHECK_FS
-	if (IS_INODE(&folio->page))
+	if (IS_INODE(folio))
 		f2fs_inode_chksum_set(F2FS_M_SB(mapping), folio);
 #endif
 	if (filemap_dirty_folio(mapping, folio)) {
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 894b27b0329d..4cb3a91801b4 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -437,7 +437,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head,
 			bool quota_inode = false;
 
 			if (!check_only &&
-					IS_INODE(&folio->page) &&
+					IS_INODE(folio) &&
 					is_dent_dnode(folio)) {
 				err = f2fs_recover_inode_page(sbi, folio);
 				if (err) {
@@ -463,7 +463,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head,
 		}
 		entry->blkaddr = blkaddr;
 
-		if (IS_INODE(&folio->page) && is_dent_dnode(folio))
+		if (IS_INODE(folio) && is_dent_dnode(folio))
 			entry->last_dentry = blkaddr;
 next:
 		/* check next segment */
@@ -628,7 +628,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
 	int err = 0, recovered = 0;
 
 	/* step 1: recover xattr */
-	if (IS_INODE(&folio->page)) {
+	if (IS_INODE(folio)) {
 		err = f2fs_recover_inline_xattr(inode, folio);
 		if (err)
 			goto out;
@@ -821,7 +821,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
 		 * In this case, we can lose the latest inode(x).
 		 * So, call recover_inode for the inode update.
 		 */
-		if (IS_INODE(&folio->page)) {
+		if (IS_INODE(folio)) {
 			err = recover_inode(entry->inode, folio);
 			if (err) {
 				f2fs_folio_put(folio, true);
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index 25857877eaec..2f8b8bfc0e73 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -268,7 +268,7 @@ struct node_footer {
 /* Node IDs in an Indirect Block */
 #define NIDS_PER_BLOCK		((F2FS_BLKSIZE - sizeof(struct node_footer)) / sizeof(__le32))
 
-#define ADDRS_PER_PAGE(folio, inode)	(addrs_per_page(inode, IS_INODE(&folio->page)))
+#define ADDRS_PER_PAGE(folio, inode)	(addrs_per_page(inode, IS_INODE(folio)))
 
 #define	NODE_DIR1_BLOCK		(DEF_ADDRS_PER_INODE + 1)
 #define	NODE_DIR2_BLOCK		(DEF_ADDRS_PER_INODE + 2)
-- 
cgit v1.2.3


From a824388d911927b2a82bf7dcfd7cef6ee45c8b43 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 8 Jul 2025 18:03:36 +0100
Subject: f2fs: Use a folio in f2fs_is_cp_guaranteed()

Convert the passed page to a folio and use it throughout.  Removes
a use of fscrypt_is_bounce_page(), which we're trying to remove.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/data.c          | 11 ++++++-----
 fs/f2fs/f2fs.h          |  2 +-
 include/linux/fscrypt.h | 10 ++++++----
 3 files changed, 13 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index f3e11f5672ec..c1fc8c7b1256 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -47,14 +47,15 @@ void f2fs_destroy_bioset(void)
 	bioset_exit(&f2fs_bioset);
 }
 
-bool f2fs_is_cp_guaranteed(struct page *page)
+bool f2fs_is_cp_guaranteed(const struct page *page)
 {
-	struct address_space *mapping = page_folio(page)->mapping;
+	const struct folio *folio = page_folio(page);
+	struct address_space *mapping = folio->mapping;
 	struct inode *inode;
 	struct f2fs_sb_info *sbi;
 
-	if (fscrypt_is_bounce_page(page))
-		return page_private_gcing(fscrypt_pagecache_page(page));
+	if (fscrypt_is_bounce_folio(folio))
+		return folio_test_f2fs_gcing(fscrypt_pagecache_folio(folio));
 
 	inode = mapping->host;
 	sbi = F2FS_I_SB(inode);
@@ -65,7 +66,7 @@ bool f2fs_is_cp_guaranteed(struct page *page)
 		return true;
 
 	if ((S_ISREG(inode->i_mode) && IS_NOQUOTA(inode)) ||
-			page_private_gcing(page))
+			folio_test_f2fs_gcing(folio))
 		return true;
 	return false;
 }
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 0e607305e308..be9b7a0120a9 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -3990,7 +3990,7 @@ void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi);
  */
 int __init f2fs_init_bioset(void);
 void f2fs_destroy_bioset(void);
-bool f2fs_is_cp_guaranteed(struct page *page);
+bool f2fs_is_cp_guaranteed(const struct page *page);
 int f2fs_init_bio_entry_cache(void);
 void f2fs_destroy_bio_entry_cache(void);
 void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio,
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index 56fad33043d5..8d9127a0fdb3 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -332,12 +332,13 @@ static inline struct page *fscrypt_pagecache_page(struct page *bounce_page)
 	return (struct page *)page_private(bounce_page);
 }
 
-static inline bool fscrypt_is_bounce_folio(struct folio *folio)
+static inline bool fscrypt_is_bounce_folio(const struct folio *folio)
 {
 	return folio->mapping == NULL;
 }
 
-static inline struct folio *fscrypt_pagecache_folio(struct folio *bounce_folio)
+static inline
+struct folio *fscrypt_pagecache_folio(const struct folio *bounce_folio)
 {
 	return bounce_folio->private;
 }
@@ -518,12 +519,13 @@ static inline struct page *fscrypt_pagecache_page(struct page *bounce_page)
 	return ERR_PTR(-EINVAL);
 }
 
-static inline bool fscrypt_is_bounce_folio(struct folio *folio)
+static inline bool fscrypt_is_bounce_folio(const struct folio *folio)
 {
 	return false;
 }
 
-static inline struct folio *fscrypt_pagecache_folio(struct folio *bounce_folio)
+static inline
+struct folio *fscrypt_pagecache_folio(const struct folio *bounce_folio)
 {
 	WARN_ON_ONCE(1);
 	return ERR_PTR(-EINVAL);
-- 
cgit v1.2.3


From e612423be33465d2b9822bf09e03d4e6c165e384 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Tue, 8 Jul 2025 18:34:04 +0100
Subject: cpu/hotplug: Remove unused cpuhp_state CPUHP_PCI_XGENE_DEAD

Now that the XGene MSI driver has been mostly rewritten and doesn't
use the CPU hotplug infrastructure, CPUHP_PCI_XGENE_DEAD is unused.

Remove it to reduce the size of cpuhp_hp_states[].

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Lorenzo Pieralisi <lpieralisi@kernel.org>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://lore.kernel.org/r/20250708173404.1278635-14-maz@kernel.org
---
 include/linux/cpuhotplug.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index df366ee15456..eaca70eb6136 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -90,7 +90,6 @@ enum cpuhp_state {
 	CPUHP_RADIX_DEAD,
 	CPUHP_PAGE_ALLOC,
 	CPUHP_NET_DEV_DEAD,
-	CPUHP_PCI_XGENE_DEAD,
 	CPUHP_IOMMU_IOVA_DEAD,
 	CPUHP_AP_ARM_CACHE_B15_RAC_DEAD,
 	CPUHP_PADATA_DEAD,
-- 
cgit v1.2.3


From 07d8004d6fb95cbe48918e56012f16454cfdfe89 Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Fri, 20 Jun 2025 15:08:07 +0200
Subject: tpm: add bufsiz parameter in the .send callback

Add a new `bufsiz` parameter to the `.send` callback in `tpm_class_ops`.
This parameter will allow drivers to differentiate between the actual
command length to send and the total buffer size. Currently `bufsiz` is
not used, but it will be used to implement devices with synchronous send()
to send the command and receive the response on the same buffer.

Also rename the previous parameter `len` to `cmd_len` in the declaration
to make it clear that it contains the length in bytes of the command
stored in the buffer. The semantics don't change and it can be used as
before by drivers. This is an optimization since the drivers could get it
from the header, but let's avoid duplicating code.

While we are here, resolve a checkpatch warning:
  WARNING: Unnecessary space before function pointer arguments
  #66: FILE: include/linux/tpm.h:90:
  +	int (*send) (struct tpm_chip *chip, u8 *buf, size_t bufsiz,

Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Suggested-by: Jarkko Sakkinen <jarkko@kernel.org>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 drivers/char/tpm/st33zp24/st33zp24.c | 2 +-
 drivers/char/tpm/tpm-interface.c     | 2 +-
 drivers/char/tpm/tpm_atmel.c         | 3 ++-
 drivers/char/tpm/tpm_crb.c           | 2 +-
 drivers/char/tpm/tpm_ftpm_tee.c      | 4 +++-
 drivers/char/tpm/tpm_i2c_atmel.c     | 3 ++-
 drivers/char/tpm/tpm_i2c_infineon.c  | 3 ++-
 drivers/char/tpm/tpm_i2c_nuvoton.c   | 3 ++-
 drivers/char/tpm/tpm_ibmvtpm.c       | 6 ++++--
 drivers/char/tpm/tpm_infineon.c      | 3 ++-
 drivers/char/tpm/tpm_nsc.c           | 3 ++-
 drivers/char/tpm/tpm_svsm.c          | 3 ++-
 drivers/char/tpm/tpm_tis_core.c      | 3 ++-
 drivers/char/tpm/tpm_tis_i2c_cr50.c  | 6 ++++--
 drivers/char/tpm/tpm_vtpm_proxy.c    | 4 +++-
 drivers/char/tpm/xen-tpmfront.c      | 3 ++-
 include/linux/tpm.h                  | 3 ++-
 17 files changed, 37 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/drivers/char/tpm/st33zp24/st33zp24.c b/drivers/char/tpm/st33zp24/st33zp24.c
index c0771980bc2f..2ed7815e4899 100644
--- a/drivers/char/tpm/st33zp24/st33zp24.c
+++ b/drivers/char/tpm/st33zp24/st33zp24.c
@@ -300,7 +300,7 @@ static irqreturn_t tpm_ioserirq_handler(int irq, void *dev_id)
  * send TPM commands through the I2C bus.
  */
 static int st33zp24_send(struct tpm_chip *chip, unsigned char *buf,
-			 size_t len)
+			 size_t bufsiz, size_t len)
 {
 	struct st33zp24_dev *tpm_dev = dev_get_drvdata(&chip->dev);
 	u32 status, i, size, ordinal;
diff --git a/drivers/char/tpm/tpm-interface.c b/drivers/char/tpm/tpm-interface.c
index 8d7e4da6ed53..816b7c690bc9 100644
--- a/drivers/char/tpm/tpm-interface.c
+++ b/drivers/char/tpm/tpm-interface.c
@@ -106,7 +106,7 @@ static ssize_t tpm_try_transmit(struct tpm_chip *chip, void *buf, size_t bufsiz)
 		return -E2BIG;
 	}
 
-	rc = chip->ops->send(chip, buf, count);
+	rc = chip->ops->send(chip, buf, bufsiz, count);
 	if (rc < 0) {
 		if (rc != -EPIPE)
 			dev_err(&chip->dev,
diff --git a/drivers/char/tpm/tpm_atmel.c b/drivers/char/tpm/tpm_atmel.c
index 54a0360a3c95..f25faf468bba 100644
--- a/drivers/char/tpm/tpm_atmel.c
+++ b/drivers/char/tpm/tpm_atmel.c
@@ -148,7 +148,8 @@ static int tpm_atml_recv(struct tpm_chip *chip, u8 *buf, size_t count)
 	return size;
 }
 
-static int tpm_atml_send(struct tpm_chip *chip, u8 *buf, size_t count)
+static int tpm_atml_send(struct tpm_chip *chip, u8 *buf, size_t bufsiz,
+			 size_t count)
 {
 	struct tpm_atmel_priv *priv = dev_get_drvdata(&chip->dev);
 	int i;
diff --git a/drivers/char/tpm/tpm_crb.c b/drivers/char/tpm/tpm_crb.c
index 876edf2705ab..ed97344f2324 100644
--- a/drivers/char/tpm/tpm_crb.c
+++ b/drivers/char/tpm/tpm_crb.c
@@ -426,7 +426,7 @@ static int tpm_crb_smc_start(struct device *dev, unsigned long func_id)
 }
 #endif
 
-static int crb_send(struct tpm_chip *chip, u8 *buf, size_t len)
+static int crb_send(struct tpm_chip *chip, u8 *buf, size_t bufsiz, size_t len)
 {
 	struct crb_priv *priv = dev_get_drvdata(&chip->dev);
 	int rc = 0;
diff --git a/drivers/char/tpm/tpm_ftpm_tee.c b/drivers/char/tpm/tpm_ftpm_tee.c
index 53ba28ccd5d3..dbad83bf798e 100644
--- a/drivers/char/tpm/tpm_ftpm_tee.c
+++ b/drivers/char/tpm/tpm_ftpm_tee.c
@@ -63,13 +63,15 @@ static int ftpm_tee_tpm_op_recv(struct tpm_chip *chip, u8 *buf, size_t count)
  * ftpm_tee_tpm_op_send() - send TPM commands through the TEE shared memory.
  * @chip:	the tpm_chip description as specified in driver/char/tpm/tpm.h
  * @buf:	the buffer to send.
+ * @bufsiz:	the size of the buffer.
  * @len:	the number of bytes to send.
  *
  * Return:
  *	In case of success, returns 0.
  *	On failure, -errno
  */
-static int ftpm_tee_tpm_op_send(struct tpm_chip *chip, u8 *buf, size_t len)
+static int ftpm_tee_tpm_op_send(struct tpm_chip *chip, u8 *buf, size_t bufsiz,
+				size_t len)
 {
 	struct ftpm_tee_private *pvt_data = dev_get_drvdata(chip->dev.parent);
 	size_t resp_len;
diff --git a/drivers/char/tpm/tpm_i2c_atmel.c b/drivers/char/tpm/tpm_i2c_atmel.c
index d1d27fdfe523..4f229656a8e2 100644
--- a/drivers/char/tpm/tpm_i2c_atmel.c
+++ b/drivers/char/tpm/tpm_i2c_atmel.c
@@ -37,7 +37,8 @@ struct priv_data {
 	u8 buffer[sizeof(struct tpm_header) + 25];
 };
 
-static int i2c_atmel_send(struct tpm_chip *chip, u8 *buf, size_t len)
+static int i2c_atmel_send(struct tpm_chip *chip, u8 *buf, size_t bufsiz,
+			  size_t len)
 {
 	struct priv_data *priv = dev_get_drvdata(&chip->dev);
 	struct i2c_client *client = to_i2c_client(chip->dev.parent);
diff --git a/drivers/char/tpm/tpm_i2c_infineon.c b/drivers/char/tpm/tpm_i2c_infineon.c
index 81d8a78dc655..bdf1f329a679 100644
--- a/drivers/char/tpm/tpm_i2c_infineon.c
+++ b/drivers/char/tpm/tpm_i2c_infineon.c
@@ -514,7 +514,8 @@ out:
 	return size;
 }
 
-static int tpm_tis_i2c_send(struct tpm_chip *chip, u8 *buf, size_t len)
+static int tpm_tis_i2c_send(struct tpm_chip *chip, u8 *buf, size_t bufsiz,
+			    size_t len)
 {
 	int rc, status;
 	ssize_t burstcnt;
diff --git a/drivers/char/tpm/tpm_i2c_nuvoton.c b/drivers/char/tpm/tpm_i2c_nuvoton.c
index 3c3ee5f551db..d44903b29929 100644
--- a/drivers/char/tpm/tpm_i2c_nuvoton.c
+++ b/drivers/char/tpm/tpm_i2c_nuvoton.c
@@ -350,7 +350,8 @@ static int i2c_nuvoton_recv(struct tpm_chip *chip, u8 *buf, size_t count)
  * tpm.c can skip polling for the data to be available as the interrupt is
  * waited for here
  */
-static int i2c_nuvoton_send(struct tpm_chip *chip, u8 *buf, size_t len)
+static int i2c_nuvoton_send(struct tpm_chip *chip, u8 *buf, size_t bufsiz,
+			    size_t len)
 {
 	struct priv_data *priv = dev_get_drvdata(&chip->dev);
 	struct device *dev = chip->dev.parent;
diff --git a/drivers/char/tpm/tpm_ibmvtpm.c b/drivers/char/tpm/tpm_ibmvtpm.c
index 76d048f63d55..4734a69406ce 100644
--- a/drivers/char/tpm/tpm_ibmvtpm.c
+++ b/drivers/char/tpm/tpm_ibmvtpm.c
@@ -191,13 +191,15 @@ static int tpm_ibmvtpm_resume(struct device *dev)
  * tpm_ibmvtpm_send() - Send a TPM command
  * @chip:	tpm chip struct
  * @buf:	buffer contains data to send
- * @count:	size of buffer
+ * @bufsiz:	size of the buffer
+ * @count:	length of the command
  *
  * Return:
  *   0 on success,
  *   -errno on error
  */
-static int tpm_ibmvtpm_send(struct tpm_chip *chip, u8 *buf, size_t count)
+static int tpm_ibmvtpm_send(struct tpm_chip *chip, u8 *buf, size_t bufsiz,
+			    size_t count)
 {
 	struct ibmvtpm_dev *ibmvtpm = dev_get_drvdata(&chip->dev);
 	bool retry = true;
diff --git a/drivers/char/tpm/tpm_infineon.c b/drivers/char/tpm/tpm_infineon.c
index 2d2ae37153ba..7638b65b851b 100644
--- a/drivers/char/tpm/tpm_infineon.c
+++ b/drivers/char/tpm/tpm_infineon.c
@@ -312,7 +312,8 @@ recv_begin:
 	return -EIO;
 }
 
-static int tpm_inf_send(struct tpm_chip *chip, u8 * buf, size_t count)
+static int tpm_inf_send(struct tpm_chip *chip, u8 *buf, size_t bufsiz,
+			size_t count)
 {
 	int i;
 	int ret;
diff --git a/drivers/char/tpm/tpm_nsc.c b/drivers/char/tpm/tpm_nsc.c
index 0f62bbc940da..879ac88f5783 100644
--- a/drivers/char/tpm/tpm_nsc.c
+++ b/drivers/char/tpm/tpm_nsc.c
@@ -178,7 +178,8 @@ static int tpm_nsc_recv(struct tpm_chip *chip, u8 * buf, size_t count)
 	return size;
 }
 
-static int tpm_nsc_send(struct tpm_chip *chip, u8 * buf, size_t count)
+static int tpm_nsc_send(struct tpm_chip *chip, u8 *buf, size_t bufsiz,
+			size_t count)
 {
 	struct tpm_nsc_priv *priv = dev_get_drvdata(&chip->dev);
 	u8 data;
diff --git a/drivers/char/tpm/tpm_svsm.c b/drivers/char/tpm/tpm_svsm.c
index 4280edf427d6..0847cbf450b4 100644
--- a/drivers/char/tpm/tpm_svsm.c
+++ b/drivers/char/tpm/tpm_svsm.c
@@ -25,7 +25,8 @@ struct tpm_svsm_priv {
 	void *buffer;
 };
 
-static int tpm_svsm_send(struct tpm_chip *chip, u8 *buf, size_t len)
+static int tpm_svsm_send(struct tpm_chip *chip, u8 *buf, size_t bufsiz,
+			 size_t len)
 {
 	struct tpm_svsm_priv *priv = dev_get_drvdata(&chip->dev);
 	int ret;
diff --git a/drivers/char/tpm/tpm_tis_core.c b/drivers/char/tpm/tpm_tis_core.c
index ed0d3d8449b3..4b12c4b9da8b 100644
--- a/drivers/char/tpm/tpm_tis_core.c
+++ b/drivers/char/tpm/tpm_tis_core.c
@@ -580,7 +580,8 @@ out_err:
 	return rc;
 }
 
-static int tpm_tis_send(struct tpm_chip *chip, u8 *buf, size_t len)
+static int tpm_tis_send(struct tpm_chip *chip, u8 *buf, size_t bufsiz,
+			size_t len)
 {
 	int rc, irq;
 	struct tpm_tis_data *priv = dev_get_drvdata(&chip->dev);
diff --git a/drivers/char/tpm/tpm_tis_i2c_cr50.c b/drivers/char/tpm/tpm_tis_i2c_cr50.c
index 3b55a7b05c46..fc6891a0b693 100644
--- a/drivers/char/tpm/tpm_tis_i2c_cr50.c
+++ b/drivers/char/tpm/tpm_tis_i2c_cr50.c
@@ -546,13 +546,15 @@ out_err:
  * tpm_cr50_i2c_tis_send() - TPM transmission callback.
  * @chip:	A TPM chip.
  * @buf:	Buffer to send.
- * @len:	Buffer length.
+ * @bufsiz:	Buffer size.
+ * @len:	Command length.
  *
  * Return:
  * - 0:		Success.
  * - -errno:	A POSIX error code.
  */
-static int tpm_cr50_i2c_tis_send(struct tpm_chip *chip, u8 *buf, size_t len)
+static int tpm_cr50_i2c_tis_send(struct tpm_chip *chip, u8 *buf, size_t bufsiz,
+				 size_t len)
 {
 	size_t burstcnt, limit, sent = 0;
 	u8 tpm_go[4] = { TPM_STS_GO };
diff --git a/drivers/char/tpm/tpm_vtpm_proxy.c b/drivers/char/tpm/tpm_vtpm_proxy.c
index 8fe4a01eea12..0818bb517805 100644
--- a/drivers/char/tpm/tpm_vtpm_proxy.c
+++ b/drivers/char/tpm/tpm_vtpm_proxy.c
@@ -321,12 +321,14 @@ static int vtpm_proxy_is_driver_command(struct tpm_chip *chip,
  *
  * @chip: tpm chip to use
  * @buf: send buffer
+ * @bufsiz: size of the buffer
  * @count: bytes to send
  *
  * Return:
  *      0 in case of success, negative error value otherwise.
  */
-static int vtpm_proxy_tpm_op_send(struct tpm_chip *chip, u8 *buf, size_t count)
+static int vtpm_proxy_tpm_op_send(struct tpm_chip *chip, u8 *buf, size_t bufsiz,
+				  size_t count)
 {
 	struct proxy_dev *proxy_dev = dev_get_drvdata(&chip->dev);
 
diff --git a/drivers/char/tpm/xen-tpmfront.c b/drivers/char/tpm/xen-tpmfront.c
index 80cca3b83b22..556bf2256716 100644
--- a/drivers/char/tpm/xen-tpmfront.c
+++ b/drivers/char/tpm/xen-tpmfront.c
@@ -131,7 +131,8 @@ static size_t shr_data_offset(struct vtpm_shared_page *shr)
 	return struct_size(shr, extra_pages, shr->nr_extra_pages);
 }
 
-static int vtpm_send(struct tpm_chip *chip, u8 *buf, size_t count)
+static int vtpm_send(struct tpm_chip *chip, u8 *buf, size_t bufsiz,
+		     size_t count)
 {
 	struct tpm_private *priv = dev_get_drvdata(&chip->dev);
 	struct vtpm_shared_page *shr = priv->shr;
diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index a3d8305e88a5..cafe8c283e88 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -87,7 +87,8 @@ struct tpm_class_ops {
 	const u8 req_complete_val;
 	bool (*req_canceled)(struct tpm_chip *chip, u8 status);
 	int (*recv) (struct tpm_chip *chip, u8 *buf, size_t len);
-	int (*send) (struct tpm_chip *chip, u8 *buf, size_t len);
+	int (*send)(struct tpm_chip *chip, u8 *buf, size_t bufsiz,
+		    size_t cmd_len);
 	void (*cancel) (struct tpm_chip *chip);
 	u8 (*status) (struct tpm_chip *chip);
 	void (*update_timeouts)(struct tpm_chip *chip,
-- 
cgit v1.2.3


From 04fe47015d7726b42c34615c124697c7a3537bf0 Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Fri, 20 Jun 2025 15:08:08 +0200
Subject: tpm: support devices with synchronous send()

Some devices do not support interrupts and provide a single synchronous
operation to send the command and receive the response on the same buffer.

Currently, these types of drivers must use an internal buffer where they
temporarily store the response between .send() and .recv() calls.

Introduce a new flag (TPM_CHIP_FLAG_SYNC) to support synchronous send().
If that flag is set by the driver, tpm_try_transmit() will use the send()
callback to send the command and receive the response on the same buffer
synchronously. In that case send() returns the number of bytes of the
response on success, or -errno on failure.

Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
Suggested-by: Jarkko Sakkinen <jarkko@kernel.org>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 drivers/char/tpm/tpm-interface.c | 20 +++++++++++++++++---
 include/linux/tpm.h              |  1 +
 2 files changed, 18 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/char/tpm/tpm-interface.c b/drivers/char/tpm/tpm-interface.c
index 816b7c690bc9..512882ac0db1 100644
--- a/drivers/char/tpm/tpm-interface.c
+++ b/drivers/char/tpm/tpm-interface.c
@@ -114,8 +114,19 @@ static ssize_t tpm_try_transmit(struct tpm_chip *chip, void *buf, size_t bufsiz)
 		return rc;
 	}
 
-	/* A sanity check. send() should just return zero on success e.g.
-	 * not the command length.
+	/*
+	 * Synchronous devices return the response directly during the send()
+	 * call in the same buffer.
+	 */
+	if (chip->flags & TPM_CHIP_FLAG_SYNC) {
+		len = rc;
+		rc = 0;
+		goto out_sync;
+	}
+
+	/*
+	 * A sanity check. send() of asynchronous devices should just return
+	 * zero on success e.g. not the command length.
 	 */
 	if (rc > 0) {
 		dev_warn(&chip->dev,
@@ -151,7 +162,10 @@ out_recv:
 	if (len < 0) {
 		rc = len;
 		dev_err(&chip->dev, "tpm_transmit: tpm_recv: error %d\n", rc);
-	} else if (len < TPM_HEADER_SIZE || len != be32_to_cpu(header->length))
+		return rc;
+	}
+out_sync:
+	if (len < TPM_HEADER_SIZE || len != be32_to_cpu(header->length))
 		rc = -EFAULT;
 
 	return rc ? rc : len;
diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index cafe8c283e88..804fbbe3873d 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -351,6 +351,7 @@ enum tpm_chip_flags {
 	TPM_CHIP_FLAG_SUSPENDED			= BIT(8),
 	TPM_CHIP_FLAG_HWRNG_DISABLED		= BIT(9),
 	TPM_CHIP_FLAG_DISABLE			= BIT(10),
+	TPM_CHIP_FLAG_SYNC			= BIT(11),
 };
 
 #define to_tpm_chip(d) container_of(d, struct tpm_chip, dev)
-- 
cgit v1.2.3


From 119a5d573622ae90ba730d18acfae9bb75d77b9a Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 30 Jun 2025 18:04:40 -0400
Subject: ring-buffer: Remove ring_buffer_read_prepare_sync()

When the ring buffer was first introduced, reading the non-consuming
"trace" file required disabling the writing of the ring buffer. To make
sure the writing was fully disabled before iterating the buffer with a
non-consuming read, it would set the disable flag of the buffer and then
call an RCU synchronization to make sure all the buffers were
synchronized.

The function ring_buffer_read_start() originally would initialize the
iterator and call an RCU synchronization, but this was for each individual
per CPU buffer where this would get called many times on a machine with
many CPUs before the trace file could be read. The commit 72c9ddfd4c5bf
("ring-buffer: Make non-consuming read less expensive with lots of cpus.")
separated ring_buffer_read_start() into ring_buffer_read_prepare(),
ring_buffer_read_prepare_sync() and then ring_buffer_read_start() to allow
each of the per CPU buffers to be prepared, call
ring_buffer_read_prepare_sync() once, and then call
ring_buffer_read_start() for each of the CPUs, which made things much
faster.

The commit 1039221cc278 ("ring-buffer: Do not disable recording when there
is an iterator") removed the requirement of disabling the recording of the
ring buffer in order to iterate it, but it did not remove the
synchronization that was happening that was required to wait for all the
buffers to have no more writers. It's now OK for the buffers to have
writers and no synchronization is needed.

Remove the synchronization and restore the interface for the ring buffer
iterator to what it was before commit 72c9ddfd4c5bf was applied.

Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://lore.kernel.org/20250630180440.3eabb514@batman.local.home
Reported-by: David Howells <dhowells@redhat.com>
Fixes: 1039221cc278 ("ring-buffer: Do not disable recording when there is an iterator")
Tested-by: David Howells <dhowells@redhat.com>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ring_buffer.h |  4 +--
 kernel/trace/ring_buffer.c  | 63 +++++++--------------------------------------
 kernel/trace/trace.c        | 14 +++-------
 kernel/trace/trace_kdb.c    |  8 +++---
 4 files changed, 18 insertions(+), 71 deletions(-)

(limited to 'include')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index cd7f0ae26615..bc90c3c7b5fd 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -152,9 +152,7 @@ ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts,
 		    unsigned long *lost_events);
 
 struct ring_buffer_iter *
-ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags);
-void ring_buffer_read_prepare_sync(void);
-void ring_buffer_read_start(struct ring_buffer_iter *iter);
+ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags);
 void ring_buffer_read_finish(struct ring_buffer_iter *iter);
 
 struct ring_buffer_event *
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a99ed4716de9..903d9db75e12 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5943,24 +5943,20 @@ ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts,
 EXPORT_SYMBOL_GPL(ring_buffer_consume);
 
 /**
- * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
+ * ring_buffer_read_start - start a non consuming read of the buffer
  * @buffer: The ring buffer to read from
  * @cpu: The cpu buffer to iterate over
  * @flags: gfp flags to use for memory allocation
  *
- * This performs the initial preparations necessary to iterate
- * through the buffer.  Memory is allocated, buffer resizing
- * is disabled, and the iterator pointer is returned to the caller.
- *
- * After a sequence of ring_buffer_read_prepare calls, the user is
- * expected to make at least one call to ring_buffer_read_prepare_sync.
- * Afterwards, ring_buffer_read_start is invoked to get things going
- * for real.
+ * This creates an iterator to allow non-consuming iteration through
+ * the buffer. If the buffer is disabled for writing, it will produce
+ * the same information each time, but if the buffer is still writing
+ * then the first hit of a write will cause the iteration to stop.
  *
- * This overall must be paired with ring_buffer_read_finish.
+ * Must be paired with ring_buffer_read_finish.
  */
 struct ring_buffer_iter *
-ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
+ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 	struct ring_buffer_iter *iter;
@@ -5986,51 +5982,12 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
 
 	atomic_inc(&cpu_buffer->resize_disabled);
 
-	return iter;
-}
-EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
-
-/**
- * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
- *
- * All previously invoked ring_buffer_read_prepare calls to prepare
- * iterators will be synchronized.  Afterwards, read_buffer_read_start
- * calls on those iterators are allowed.
- */
-void
-ring_buffer_read_prepare_sync(void)
-{
-	synchronize_rcu();
-}
-EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
-
-/**
- * ring_buffer_read_start - start a non consuming read of the buffer
- * @iter: The iterator returned by ring_buffer_read_prepare
- *
- * This finalizes the startup of an iteration through the buffer.
- * The iterator comes from a call to ring_buffer_read_prepare and
- * an intervening ring_buffer_read_prepare_sync must have been
- * performed.
- *
- * Must be paired with ring_buffer_read_finish.
- */
-void
-ring_buffer_read_start(struct ring_buffer_iter *iter)
-{
-	struct ring_buffer_per_cpu *cpu_buffer;
-	unsigned long flags;
-
-	if (!iter)
-		return;
-
-	cpu_buffer = iter->cpu_buffer;
-
-	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+	guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock);
 	arch_spin_lock(&cpu_buffer->lock);
 	rb_iter_reset(iter);
 	arch_spin_unlock(&cpu_buffer->lock);
-	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+	return iter;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_read_start);
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 95ae7c4e5835..7996f26c3f46 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4735,21 +4735,15 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
 	if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
 		for_each_tracing_cpu(cpu) {
 			iter->buffer_iter[cpu] =
-				ring_buffer_read_prepare(iter->array_buffer->buffer,
-							 cpu, GFP_KERNEL);
-		}
-		ring_buffer_read_prepare_sync();
-		for_each_tracing_cpu(cpu) {
-			ring_buffer_read_start(iter->buffer_iter[cpu]);
+				ring_buffer_read_start(iter->array_buffer->buffer,
+						       cpu, GFP_KERNEL);
 			tracing_iter_reset(iter, cpu);
 		}
 	} else {
 		cpu = iter->cpu_file;
 		iter->buffer_iter[cpu] =
-			ring_buffer_read_prepare(iter->array_buffer->buffer,
-						 cpu, GFP_KERNEL);
-		ring_buffer_read_prepare_sync();
-		ring_buffer_read_start(iter->buffer_iter[cpu]);
+			ring_buffer_read_start(iter->array_buffer->buffer,
+					       cpu, GFP_KERNEL);
 		tracing_iter_reset(iter, cpu);
 	}
 
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index d7b135de958a..896ff78b8349 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -43,17 +43,15 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file)
 	if (cpu_file == RING_BUFFER_ALL_CPUS) {
 		for_each_tracing_cpu(cpu) {
 			iter.buffer_iter[cpu] =
-			ring_buffer_read_prepare(iter.array_buffer->buffer,
-						 cpu, GFP_ATOMIC);
-			ring_buffer_read_start(iter.buffer_iter[cpu]);
+			ring_buffer_read_start(iter.array_buffer->buffer,
+					       cpu, GFP_ATOMIC);
 			tracing_iter_reset(&iter, cpu);
 		}
 	} else {
 		iter.cpu_file = cpu_file;
 		iter.buffer_iter[cpu_file] =
-			ring_buffer_read_prepare(iter.array_buffer->buffer,
+			ring_buffer_read_start(iter.array_buffer->buffer,
 						 cpu_file, GFP_ATOMIC);
-		ring_buffer_read_start(iter.buffer_iter[cpu_file]);
 		tracing_iter_reset(&iter, cpu_file);
 	}
 
-- 
cgit v1.2.3


From 07c3f391bcb217b6949b49785ccb5fee02be21fe Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 2 Jul 2025 14:36:57 -0400
Subject: tracing: Remove EVENT_FILE_FL_SOFT_MODE flag

When soft disabling of trace events was first created, it needed to have a
way to know if a file had a user that was using it with soft disabled (for
triggers that need to enable or disable events from a context that can not
really enable or disable the event, it would set SOFT_DISABLED to state it
is disabled). The flag SOFT_MODE was used to denote that an event had a
user that would enable or disable it via the SOFT_DISABLED flag.

Commit 1cf4c0732db3c ("tracing: Modify soft-mode only if there's no other
referrer") fixed a bug where if two users were using the SOFT_DISABLED
flag the accounting would get messed up as the SOFT_MODE flag could only
handle one user. That commit added the sm_ref counter which kept track of
how many users were using the event in "soft mode". This made the
SOFT_MODE flag redundant as it should only be set if the sm_ref counter is
non zero.

Remove the SOFT_MODE flag and just use the sm_ref counter to know the
event is in soft mode or not. This makes the code a bit simpler.

Link: https://lore.kernel.org/all/20250702111908.03759998@batman.local.home/

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Gabriele Paoloni <gpaoloni@redhat.com>
Link: https://lore.kernel.org/20250702143657.18dd1882@batman.local.home
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/trace_events.h |  3 ---
 kernel/trace/trace_events.c  | 24 ++++++++++++------------
 2 files changed, 12 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index fa9cf4292dff..04307a19cde3 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -480,7 +480,6 @@ enum {
 	EVENT_FILE_FL_RECORDED_TGID_BIT,
 	EVENT_FILE_FL_FILTERED_BIT,
 	EVENT_FILE_FL_NO_SET_FILTER_BIT,
-	EVENT_FILE_FL_SOFT_MODE_BIT,
 	EVENT_FILE_FL_SOFT_DISABLED_BIT,
 	EVENT_FILE_FL_TRIGGER_MODE_BIT,
 	EVENT_FILE_FL_TRIGGER_COND_BIT,
@@ -618,7 +617,6 @@ extern int __kprobe_event_add_fields(struct dynevent_cmd *cmd, ...);
  *  RECORDED_TGID - The tgids should be recorded at sched_switch
  *  FILTERED	  - The event has a filter attached
  *  NO_SET_FILTER - Set when filter has error and is to be ignored
- *  SOFT_MODE     - The event is enabled/disabled by SOFT_DISABLED
  *  SOFT_DISABLED - When set, do not trace the event (even though its
  *                   tracepoint may be enabled)
  *  TRIGGER_MODE  - When set, invoke the triggers associated with the event
@@ -633,7 +631,6 @@ enum {
 	EVENT_FILE_FL_RECORDED_TGID	= (1 << EVENT_FILE_FL_RECORDED_TGID_BIT),
 	EVENT_FILE_FL_FILTERED		= (1 << EVENT_FILE_FL_FILTERED_BIT),
 	EVENT_FILE_FL_NO_SET_FILTER	= (1 << EVENT_FILE_FL_NO_SET_FILTER_BIT),
-	EVENT_FILE_FL_SOFT_MODE		= (1 << EVENT_FILE_FL_SOFT_MODE_BIT),
 	EVENT_FILE_FL_SOFT_DISABLED	= (1 << EVENT_FILE_FL_SOFT_DISABLED_BIT),
 	EVENT_FILE_FL_TRIGGER_MODE	= (1 << EVENT_FILE_FL_TRIGGER_MODE_BIT),
 	EVENT_FILE_FL_TRIGGER_COND	= (1 << EVENT_FILE_FL_TRIGGER_COND_BIT),
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 120531268abf..0980f4def360 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -768,6 +768,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
 {
 	struct trace_event_call *call = file->event_call;
 	struct trace_array *tr = file->tr;
+	bool soft_mode = atomic_read(&file->sm_ref) != 0;
 	int ret = 0;
 	int disable;
 
@@ -782,7 +783,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
 		 * is set we do not want the event to be enabled before we
 		 * clear the bit.
 		 *
-		 * When soft_disable is not set but the SOFT_MODE flag is,
+		 * When soft_disable is not set but the soft_mode is,
 		 * we do nothing. Do not disable the tracepoint, otherwise
 		 * "soft enable"s (clearing the SOFT_DISABLED bit) wont work.
 		 */
@@ -790,11 +791,11 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
 			if (atomic_dec_return(&file->sm_ref) > 0)
 				break;
 			disable = file->flags & EVENT_FILE_FL_SOFT_DISABLED;
-			clear_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags);
+			soft_mode = false;
 			/* Disable use of trace_buffered_event */
 			trace_buffered_event_disable();
 		} else
-			disable = !(file->flags & EVENT_FILE_FL_SOFT_MODE);
+			disable = !soft_mode;
 
 		if (disable && (file->flags & EVENT_FILE_FL_ENABLED)) {
 			clear_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags);
@@ -812,8 +813,8 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
 
 			WARN_ON_ONCE(ret);
 		}
-		/* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */
-		if (file->flags & EVENT_FILE_FL_SOFT_MODE)
+		/* If in soft mode, just set the SOFT_DISABLE_BIT, else clear it */
+		if (soft_mode)
 			set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
 		else
 			clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
@@ -823,7 +824,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
 		 * When soft_disable is set and enable is set, we want to
 		 * register the tracepoint for the event, but leave the event
 		 * as is. That means, if the event was already enabled, we do
-		 * nothing (but set SOFT_MODE). If the event is disabled, we
+		 * nothing (but set soft_mode). If the event is disabled, we
 		 * set SOFT_DISABLED before enabling the event tracepoint, so
 		 * it still seems to be disabled.
 		 */
@@ -832,7 +833,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
 		else {
 			if (atomic_inc_return(&file->sm_ref) > 1)
 				break;
-			set_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags);
+			soft_mode = true;
 			/* Enable use of trace_buffered_event */
 			trace_buffered_event_enable();
 		}
@@ -840,7 +841,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
 		if (!(file->flags & EVENT_FILE_FL_ENABLED)) {
 			bool cmd = false, tgid = false;
 
-			/* Keep the event disabled, when going to SOFT_MODE. */
+			/* Keep the event disabled, when going to soft mode. */
 			if (soft_disable)
 				set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
 
@@ -1792,8 +1793,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 	    !(flags & EVENT_FILE_FL_SOFT_DISABLED))
 		strcpy(buf, "1");
 
-	if (flags & EVENT_FILE_FL_SOFT_DISABLED ||
-	    flags & EVENT_FILE_FL_SOFT_MODE)
+	if (atomic_read(&file->sm_ref) != 0)
 		strcat(buf, "*");
 
 	strcat(buf, "\n");
@@ -3584,7 +3584,7 @@ static int probe_remove_event_call(struct trace_event_call *call)
 			continue;
 		/*
 		 * We can't rely on ftrace_event_enable_disable(enable => 0)
-		 * we are going to do, EVENT_FILE_FL_SOFT_MODE can suppress
+		 * we are going to do, soft mode can suppress
 		 * TRACE_REG_UNREGISTER.
 		 */
 		if (file->flags & EVENT_FILE_FL_ENABLED)
@@ -3997,7 +3997,7 @@ static int free_probe_data(void *data)
 
 	edata->ref--;
 	if (!edata->ref) {
-		/* Remove the SOFT_MODE flag */
+		/* Remove soft mode */
 		__ftrace_event_enable_disable(edata->file, 0, 1);
 		trace_event_put_ref(edata->file->event_call);
 		kfree(edata);
-- 
cgit v1.2.3


From 4d6d0a6263babf7c43faa55de4fa3c6637dec624 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Fri, 4 Jul 2025 10:48:38 -0400
Subject: tracing: Remove redundant config HAVE_FTRACE_MCOUNT_RECORD

Ftrace is tightly coupled with architecture specific code because it
requires the use of trampolines written in assembly. This means that when
a new feature or optimization is made, it must be done for all
architectures. To simplify the approach, CONFIG_HAVE_FTRACE_* configs are
added to denote which architecture has the new enhancement so that other
architectures can still function until they too have been updated.

The CONFIG_HAVE_FTRACE_MCOUNT_RECORD option was added to help simplify the
DYNAMIC_FTRACE work, but now every architecture that implements
DYNAMIC_FTRACE also has HAVE_FTRACE_MCOUNT_RECORD set, making it redundant
with HAVE_DYNAMIC_FTRACE.

Remove the HAVE_FTRACE_MCOUNT_RECORD config and use DYNAMIC_FTRACE directly
where applicable.

Link: https://lore.kernel.org/all/20250703154916.48e3ada7@gandalf.local.home/

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/20250704104838.27a18690@gandalf.local.home
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 Documentation/trace/ftrace-design.rst | 12 ++++--------
 arch/arm/Kconfig                      |  1 -
 arch/arm64/Kconfig                    |  1 -
 arch/csky/Kconfig                     |  1 -
 arch/loongarch/Kconfig                |  1 -
 arch/microblaze/Kconfig               |  1 -
 arch/mips/Kconfig                     |  1 -
 arch/parisc/Kconfig                   |  1 -
 arch/powerpc/Kconfig                  |  1 -
 arch/riscv/Kconfig                    |  1 -
 arch/s390/Kconfig                     |  1 -
 arch/sh/Kconfig                       |  1 -
 arch/sparc/Kconfig                    |  1 -
 arch/x86/Kconfig                      |  1 -
 include/asm-generic/vmlinux.lds.h     |  2 +-
 include/linux/ftrace.h                |  2 +-
 include/linux/kernel.h                |  6 +++---
 include/linux/module.h                |  2 +-
 kernel/module/main.c                  |  2 +-
 kernel/trace/Kconfig                  | 18 ++++--------------
 kernel/trace/ftrace.c                 |  4 ----
 scripts/recordmcount.pl               |  2 +-
 22 files changed, 16 insertions(+), 47 deletions(-)

(limited to 'include')

diff --git a/Documentation/trace/ftrace-design.rst b/Documentation/trace/ftrace-design.rst
index dc82d64b3a44..8f4fab3f9324 100644
--- a/Documentation/trace/ftrace-design.rst
+++ b/Documentation/trace/ftrace-design.rst
@@ -238,19 +238,15 @@ You need very few things to get the syscalls tracing in an arch.
   - Tag this arch as HAVE_SYSCALL_TRACEPOINTS.
 
 
-HAVE_FTRACE_MCOUNT_RECORD
--------------------------
+HAVE_DYNAMIC_FTRACE
+-------------------
 
 See scripts/recordmcount.pl for more info.  Just fill in the arch-specific
 details for how to locate the addresses of mcount call sites via objdump.
 This option doesn't make much sense without also implementing dynamic ftrace.
 
-
-HAVE_DYNAMIC_FTRACE
--------------------
-
-You will first need HAVE_FTRACE_MCOUNT_RECORD and HAVE_FUNCTION_TRACER, so
-scroll your reader back up if you got over eager.
+You will first need HAVE_FUNCTION_TRACER, so scroll your reader back up if you
+got over eager.
 
 Once those are out of the way, you will need to implement:
 	- asm/ftrace.h:
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 3072731fe09c..33cc9dbb7f68 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -107,7 +107,6 @@ config ARM
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS if (CPU_V6 || CPU_V6K || CPU_V7) && MMU
 	select HAVE_EXIT_THREAD
 	select HAVE_GUP_FAST if ARM_LPAE
-	select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL
 	select HAVE_FUNCTION_ERROR_INJECTION
 	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_FUNCTION_TRACER if !XIP_KERNEL
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 55fc331af337..f943a07db139 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -223,7 +223,6 @@ config ARM64
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS
 	select HAVE_GUP_FAST
 	select HAVE_FTRACE_GRAPH_FUNC
-	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_FUNCTION_TRACER
 	select HAVE_FUNCTION_ERROR_INJECTION
 	select HAVE_FUNCTION_GRAPH_FREGS
diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index acc431c331b0..4331313a42ff 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -80,7 +80,6 @@ config CSKY
 	select HAVE_FUNCTION_TRACER
 	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_FUNCTION_ERROR_INJECTION
-	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_LZO
 	select HAVE_KERNEL_LZMA
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 4b19f93379a1..bead8266dc5c 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -144,7 +144,6 @@ config LOONGARCH
 	select HAVE_EXIT_THREAD
 	select HAVE_GUP_FAST
 	select HAVE_FTRACE_GRAPH_FUNC
-	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_FUNCTION_ARG_ACCESS_API
 	select HAVE_FUNCTION_ERROR_INJECTION
 	select HAVE_FUNCTION_GRAPH_FREGS
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index f18ec02ddeb2..484ebb3baedf 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -28,7 +28,6 @@ config MICROBLAZE
 	select HAVE_DEBUG_KMEMLEAK
 	select HAVE_DMA_CONTIGUOUS
 	select HAVE_DYNAMIC_FTRACE
-	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_FUNCTION_TRACER
 	select HAVE_PAGE_SIZE_4KB
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 1e48184ecf1e..268730824f75 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -73,7 +73,6 @@ config MIPS
 	select HAVE_EBPF_JIT if !CPU_MICROMIPS
 	select HAVE_EXIT_THREAD
 	select HAVE_GUP_FAST
-	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_FUNCTION_TRACER
 	select HAVE_GCC_PLUGINS
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index fcc5973f7519..2efa4b08b7b8 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -81,7 +81,6 @@ config PARISC
 	select HAVE_KPROBES
 	select HAVE_KRETPROBES
 	select HAVE_DYNAMIC_FTRACE if $(cc-option,-fpatchable-function-entry=1,1)
-	select HAVE_FTRACE_MCOUNT_RECORD if HAVE_DYNAMIC_FTRACE
 	select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY if DYNAMIC_FTRACE
 	select HAVE_KPROBES_ON_FTRACE
 	select HAVE_DYNAMIC_FTRACE_WITH_REGS
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index c3e0cc83f120..cb4fb8d73300 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -246,7 +246,6 @@ config PPC
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS
 	select HAVE_GUP_FAST
 	select HAVE_FTRACE_GRAPH_FUNC
-	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_FUNCTION_ARG_ACCESS_API
 	select HAVE_FUNCTION_DESCRIPTORS	if PPC64_ELF_ABI_V1
 	select HAVE_FUNCTION_ERROR_INJECTION
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index d71ea0f4466f..62ec265e2962 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -159,7 +159,6 @@ config RISCV
 	select HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS if (DYNAMIC_FTRACE_WITH_ARGS && !CFI_CLANG)
 	select HAVE_DYNAMIC_FTRACE_WITH_ARGS if HAVE_DYNAMIC_FTRACE
 	select HAVE_FTRACE_GRAPH_FUNC
-	select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL
 	select HAVE_FUNCTION_GRAPH_TRACER if HAVE_DYNAMIC_FTRACE_WITH_ARGS
 	select HAVE_FUNCTION_GRAPH_FREGS
 	select HAVE_FUNCTION_TRACER if !XIP_KERNEL
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 0c16dc443e2f..d956e85f0465 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -199,7 +199,6 @@ config S390
 	select HAVE_GUP_FAST
 	select HAVE_FENTRY
 	select HAVE_FTRACE_GRAPH_FUNC
-	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_FUNCTION_ARG_ACCESS_API
 	select HAVE_FUNCTION_ERROR_INJECTION
 	select HAVE_FUNCTION_GRAPH_FREGS
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 89185af7bcc9..d5795067befa 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -40,7 +40,6 @@ config SUPERH
 	select HAVE_GUP_FAST if MMU
 	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_FUNCTION_TRACER
-	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_HW_BREAKPOINT
 	select HAVE_IOREMAP_PROT if MMU && !X2TLB
 	select HAVE_KERNEL_BZIP2
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 0f88123925a4..f307e730446c 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -78,7 +78,6 @@ config SPARC64
 	select MMU_GATHER_NO_FLUSH_CACHE
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
 	select HAVE_DYNAMIC_FTRACE
-	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_PAGE_SIZE_8KB
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_CONTEXT_TRACKING_USER
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 71019b3b54ea..eb07f236ce53 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -244,7 +244,6 @@ config X86
 	select HAVE_GUP_FAST
 	select HAVE_FENTRY			if X86_64 || DYNAMIC_FTRACE
 	select HAVE_FTRACE_GRAPH_FUNC		if HAVE_FUNCTION_GRAPH_TRACER
-	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_FUNCTION_GRAPH_FREGS	if HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_FUNCTION_GRAPH_TRACER	if X86_32 || (X86_64 && DYNAMIC_FTRACE)
 	select HAVE_FUNCTION_TRACER
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index fa5f19b8d53a..ae2d2359b79e 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -167,7 +167,7 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG)
 #define FTRACE_STUB_HACK
 #endif
 
-#ifdef CONFIG_FTRACE_MCOUNT_RECORD
+#ifdef CONFIG_DYNAMIC_FTRACE
 /*
  * The ftrace call sites are logged to a section whose name depends on the
  * compiler option used. A given kernel image will only use one, AKA
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index b672ca15f265..7ded7df6e9b5 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -1108,7 +1108,7 @@ static __always_inline unsigned long get_lock_parent_ip(void)
 # define trace_preempt_off(a0, a1) do { } while (0)
 #endif
 
-#ifdef CONFIG_FTRACE_MCOUNT_RECORD
+#ifdef CONFIG_DYNAMIC_FTRACE
 extern void ftrace_init(void);
 #ifdef CC_USING_PATCHABLE_FUNCTION_ENTRY
 #define FTRACE_CALLSITE_SECTION	"__patchable_function_entries"
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 1cce1f6410a9..989315dabb86 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -373,9 +373,9 @@ ftrace_vprintk(const char *fmt, va_list ap)
 static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
 #endif /* CONFIG_TRACING */
 
-/* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */
-#ifdef CONFIG_FTRACE_MCOUNT_RECORD
-# define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD
+/* Rebuild everything on CONFIG_DYNAMIC_FTRACE */
+#ifdef CONFIG_DYNAMIC_FTRACE
+# define REBUILD_DUE_TO_DYNAMIC_FTRACE
 #endif
 
 /* Permissions on a sysfs file: you didn't miss the 0 prefix did you? */
diff --git a/include/linux/module.h b/include/linux/module.h
index 5faa1fb1f4b4..800e6fde9bf7 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -539,7 +539,7 @@ struct module {
 	struct trace_eval_map **trace_evals;
 	unsigned int num_trace_evals;
 #endif
-#ifdef CONFIG_FTRACE_MCOUNT_RECORD
+#ifdef CONFIG_DYNAMIC_FTRACE
 	unsigned int num_ftrace_callsites;
 	unsigned long *ftrace_callsites;
 #endif
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 413ac6ea3702..58d36f8cef0d 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -2639,7 +2639,7 @@ static int find_module_sections(struct module *mod, struct load_info *info)
 					 sizeof(*mod->trace_bprintk_fmt_start),
 					 &mod->num_trace_bprintk_fmt);
 #endif
-#ifdef CONFIG_FTRACE_MCOUNT_RECORD
+#ifdef CONFIG_DYNAMIC_FTRACE
 	/* sechdrs[0].sh_size is always zero */
 	mod->ftrace_callsites = section_objs(info, FTRACE_CALLSITE_SECTION,
 					     sizeof(*mod->ftrace_callsites),
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 28afc6941e7a..9f2b1661a8ac 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -74,11 +74,6 @@ config HAVE_DYNAMIC_FTRACE_NO_PATCHABLE
 	  If the architecture generates __patchable_function_entries sections
 	  but does not want them included in the ftrace locations.
 
-config HAVE_FTRACE_MCOUNT_RECORD
-	bool
-	help
-	  See Documentation/trace/ftrace-design.rst
-
 config HAVE_SYSCALL_TRACEPOINTS
 	bool
 	help
@@ -803,27 +798,22 @@ config BPF_KPROBE_OVERRIDE
 	 Allows BPF to override the execution of a probed function and
 	 set a different return value.  This is used for error injection.
 
-config FTRACE_MCOUNT_RECORD
-	def_bool y
-	depends on DYNAMIC_FTRACE
-	depends on HAVE_FTRACE_MCOUNT_RECORD
-
 config FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY
 	bool
-	depends on FTRACE_MCOUNT_RECORD
+	depends on DYNAMIC_FTRACE
 
 config FTRACE_MCOUNT_USE_CC
 	def_bool y
 	depends on $(cc-option,-mrecord-mcount)
 	depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY
-	depends on FTRACE_MCOUNT_RECORD
+	depends on DYNAMIC_FTRACE
 
 config FTRACE_MCOUNT_USE_OBJTOOL
 	def_bool y
 	depends on HAVE_OBJTOOL_MCOUNT
 	depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY
 	depends on !FTRACE_MCOUNT_USE_CC
-	depends on FTRACE_MCOUNT_RECORD
+	depends on DYNAMIC_FTRACE
 	select OBJTOOL
 
 config FTRACE_MCOUNT_USE_RECORDMCOUNT
@@ -831,7 +821,7 @@ config FTRACE_MCOUNT_USE_RECORDMCOUNT
 	depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY
 	depends on !FTRACE_MCOUNT_USE_CC
 	depends on !FTRACE_MCOUNT_USE_OBJTOOL
-	depends on FTRACE_MCOUNT_RECORD
+	depends on DYNAMIC_FTRACE
 
 config TRACING_MAP
 	bool
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4203fad56b6c..00b76d450a89 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1042,10 +1042,6 @@ static struct ftrace_ops *removed_ops;
  */
 static bool update_all_ops;
 
-#ifndef CONFIG_FTRACE_MCOUNT_RECORD
-# error Dynamic ftrace depends on MCOUNT_RECORD
-#endif
-
 struct ftrace_func_probe {
 	struct ftrace_probe_ops	*probe_ops;
 	struct ftrace_ops	ops;
diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl
index 0871b2e92584..861b56dda64e 100755
--- a/scripts/recordmcount.pl
+++ b/scripts/recordmcount.pl
@@ -359,7 +359,7 @@ if ($arch eq "x86_64") {
     $mcount_regex = "^\\s*([0-9a-fA-F]+):\\s*R_CKCORE_PCREL_JSR_IMM26BY2\\s+_mcount\$";
     $alignment = 2;
 } else {
-    die "Arch $arch is not supported with CONFIG_FTRACE_MCOUNT_RECORD";
+    die "Arch $arch is not supported with CONFIG_DYNAMIC_FTRACE";
 }
 
 my $text_found = 0;
-- 
cgit v1.2.3


From 0a949252556809ce922e0289c148883e838cb9bb Mon Sep 17 00:00:00 2001
From: Nam Cao <namcao@linutronix.de>
Date: Fri, 11 Jul 2025 15:17:37 +0200
Subject: rv/ltl: Do not execute the Buchi automaton twice on start condition

On start condition of a Buchi automaton, the automaton is executed twice.

This is fine for now, as all the current LTL operators do not care about
this. But it would break the 'next' operator, which will be introduced in a
follow-up patch.

To prepare for the introduction of the 'next' operator, only execute the
automaton once on start condition.

Cc: John Ogness <john.ogness@linutronix.de>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Gabriele Monaco <gmonaco@redhat.com>
Link: https://lore.kernel.org/9379f4e7b9c1c69a6dca3e20a22936c850a25ca7.1752239482.git.namcao@linutronix.de
Signed-off-by: Nam Cao <namcao@linutronix.de>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/rv/ltl_monitor.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/rv/ltl_monitor.h b/include/rv/ltl_monitor.h
index 9a583125b566..67031a774e3d 100644
--- a/include/rv/ltl_monitor.h
+++ b/include/rv/ltl_monitor.h
@@ -167,8 +167,10 @@ static void ltl_atom_update(struct task_struct *task, enum ltl_atom atom, bool v
 	ltl_atom_set(mon, atom, value);
 	ltl_atoms_fetch(task, mon);
 
-	if (!rv_ltl_valid_state(mon))
+	if (!rv_ltl_valid_state(mon)) {
 		ltl_attempt_start(task, mon);
+		return;
+	}
 
 	ltl_validate(task, mon);
 }
-- 
cgit v1.2.3


From 5ec9d26b78c4eb7c2fab54dcec6c0eb845302a98 Mon Sep 17 00:00:00 2001
From: Phillip Potter <phil@philpotter.co.uk>
Date: Wed, 23 Jul 2025 00:19:00 +0100
Subject: cdrom: Call cdrom_mrw_exit from cdrom_release function

Remove the cdrom_mrw_exit call from unregister_cdrom, as it invokes
block commands that can fail due to a NULL pointer dereference from the
call happening too late, during the unloading of the driver (e.g.
unplugging of USB optical drives).

Instead perform the call inside cdrom_release, thus also removing the
need for the exit function pointer inside the cdrom_device_info struct.

Reported-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Closes: https://lore.kernel.org/linux-block/uxgzea5ibqxygv3x7i4ojbpvcpv2wziorvb3ns5cdtyvobyn7h@y4g4l5ezv2ec
Suggested-by: Jens Axboe <axboe@kernel.dk>
Link: https://lore.kernel.org/linux-block/6686fe78-a050-4a1d-aa27-b7bf7ca6e912@kernel.dk
Tested-by: Phillip Potter <phil@philpotter.co.uk>
Signed-off-by: Phillip Potter <phil@philpotter.co.uk>
Link: https://lore.kernel.org/r/20250722231900.1164-2-phil@philpotter.co.uk
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 Documentation/cdrom/cdrom-standard.rst | 1 -
 drivers/cdrom/cdrom.c                  | 8 ++------
 include/linux/cdrom.h                  | 1 -
 3 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/Documentation/cdrom/cdrom-standard.rst b/Documentation/cdrom/cdrom-standard.rst
index 6c1303cff159..b97a4e9b9bd3 100644
--- a/Documentation/cdrom/cdrom-standard.rst
+++ b/Documentation/cdrom/cdrom-standard.rst
@@ -273,7 +273,6 @@ The drive-specific, minor-like information that is registered with
 	__u8 media_written;			/*  dirty flag, DVD+RW bookkeeping */
 	unsigned short mmc3_profile;		/*  current MMC3 profile */
 	int for_data;				/*  unknown:TBD */
-	int (*exit)(struct cdrom_device_info *);/*  unknown:TBD */
 	int mrw_mode_page;			/*  which MRW mode page is in use */
   };
 
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 21a10552da61..31ba1f8c1f78 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -624,9 +624,6 @@ int register_cdrom(struct gendisk *disk, struct cdrom_device_info *cdi)
 	if (check_media_type == 1)
 		cdi->options |= (int) CDO_CHECK_TYPE;
 
-	if (CDROM_CAN(CDC_MRW_W))
-		cdi->exit = cdrom_mrw_exit;
-
 	if (cdi->ops->read_cdda_bpc)
 		cdi->cdda_method = CDDA_BPC_FULL;
 	else
@@ -651,9 +648,6 @@ void unregister_cdrom(struct cdrom_device_info *cdi)
 	list_del(&cdi->list);
 	mutex_unlock(&cdrom_mutex);
 
-	if (cdi->exit)
-		cdi->exit(cdi);
-
 	cd_dbg(CD_REG_UNREG, "drive \"/dev/%s\" unregistered\n", cdi->name);
 }
 EXPORT_SYMBOL(unregister_cdrom);
@@ -1264,6 +1258,8 @@ void cdrom_release(struct cdrom_device_info *cdi)
 		cd_dbg(CD_CLOSE, "Use count for \"/dev/%s\" now zero\n",
 		       cdi->name);
 		cdrom_dvd_rw_close_write(cdi);
+		if (CDROM_CAN(CDC_MRW_W))
+			cdrom_mrw_exit(cdi);
 
 		if ((cdo->capability & CDC_LOCK) && !cdi->keeplocked) {
 			cd_dbg(CD_CLOSE, "Unlocking door!\n");
diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h
index fdfb61ccf55a..b907e6c2307d 100644
--- a/include/linux/cdrom.h
+++ b/include/linux/cdrom.h
@@ -62,7 +62,6 @@ struct cdrom_device_info {
 	__u8 last_sense;
 	__u8 media_written;		/* dirty flag, DVD+RW bookkeeping */
 	unsigned short mmc3_profile;	/* current MMC3 profile */
-	int (*exit)(struct cdrom_device_info *);
 	int mrw_mode_page;
 	bool opened_for_data;
 	__s64 last_media_change_ms;
-- 
cgit v1.2.3


From ad892e912b84b706ed399a212174978ddd1ac1f9 Mon Sep 17 00:00:00 2001
From: Fan Yu <fan.yu9@zte.com.cn>
Date: Mon, 21 Jul 2025 11:16:07 +0800
Subject: tcp: trace retransmit failures in tcp_retransmit_skb

Background
==========
When TCP retransmits a packet due to missing ACKs, the
retransmission may fail for various reasons (e.g., packets
stuck in driver queues, receiver zero windows, or routing issues).

The original tcp_retransmit_skb tracepoint:

  'commit e086101b150a ("tcp: add a tracepoint for tcp retransmission")'

lacks visibility into these failure causes, making production
diagnostics difficult.

Solution
========
Add the return value ("err") to the tcp_retransmit_skb tracepoint.
This lets users see why a TCP retransmission failed and
filter retransmission failures by return value.

Compatibility description
=========================
This patch extends the tcp_retransmit_skb tracepoint
by adding a new "err" field at the end of its
existing structure (within TP_STRUCT__entry). The
compatibility implications are detailed as follows:

1) Structural compatibility for legacy user-space tools
Legacy tools/BPF programs accessing existing fields
(by offset or name) can still work without modification
or recompilation. The new field is appended to the end,
preserving the original memory layout.

2) Note: semantic changes
The original tracepoint fired only for successfully
retransmitted packets. With this patch, the tracepoint
also fires for retransmissions that terminate early
with an error. For accurate statistics, users should
filter on "err" to distinguish outcomes.

Before the patch:
field:const void * skbaddr; offset:8; size:8; signed:0;
field:const void * skaddr; offset:16; size:8; signed:0;
field:int state; offset:24; size:4; signed:1;
field:__u16 sport; offset:28; size:2; signed:0;
field:__u16 dport; offset:30; size:2; signed:0;
field:__u16 family; offset:32; size:2; signed:0;
field:__u8 saddr[4]; offset:34; size:4; signed:0;
field:__u8 daddr[4]; offset:38; size:4; signed:0;
field:__u8 saddr_v6[16]; offset:42; size:16; signed:0;
field:__u8 daddr_v6[16]; offset:58; size:16; signed:0;

print fmt: "skbaddr=%p skaddr=%p family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c state=%s"

After the patch:
field:const void * skbaddr; offset:8; size:8; signed:0;
field:const void * skaddr; offset:16; size:8; signed:0;
field:int state; offset:24; size:4; signed:1;
field:__u16 sport; offset:28; size:2; signed:0;
field:__u16 dport; offset:30; size:2; signed:0;
field:__u16 family; offset:32; size:2; signed:0;
field:__u8 saddr[4]; offset:34; size:4; signed:0;
field:__u8 daddr[4]; offset:38; size:4; signed:0;
field:__u8 saddr_v6[16]; offset:42; size:16; signed:0;
field:__u8 daddr_v6[16]; offset:58; size:16; signed:0;
field:int err; offset:76; size:4; signed:1;

print fmt: "skbaddr=%p skaddr=%p family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c state=%s err=%d"

Co-developed-by: xu xin <xu.xin16@zte.com.cn>
Signed-off-by: xu xin <xu.xin16@zte.com.cn>
Signed-off-by: Fan Yu <fan.yu9@zte.com.cn>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250721111607626_BDnIJB0ywk6FghN63bor@zte.com.cn
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/trace/events/tcp.h | 27 +++++++++------------------
 net/ipv4/tcp_output.c      | 46 +++++++++++++++++++++++++++++-----------------
 2 files changed, 38 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 54e60c6009e3..9d2c36c6a0ed 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -13,17 +13,11 @@
 #include <linux/sock_diag.h>
 #include <net/rstreason.h>
 
-/*
- * tcp event with arguments sk and skb
- *
- * Note: this class requires a valid sk pointer; while skb pointer could
- *       be NULL.
- */
-DECLARE_EVENT_CLASS(tcp_event_sk_skb,
+TRACE_EVENT(tcp_retransmit_skb,
 
-	TP_PROTO(const struct sock *sk, const struct sk_buff *skb),
+	TP_PROTO(const struct sock *sk, const struct sk_buff *skb, int err),
 
-	TP_ARGS(sk, skb),
+	TP_ARGS(sk, skb, err),
 
 	TP_STRUCT__entry(
 		__field(const void *, skbaddr)
@@ -36,6 +30,7 @@ DECLARE_EVENT_CLASS(tcp_event_sk_skb,
 		__array(__u8, daddr, 4)
 		__array(__u8, saddr_v6, 16)
 		__array(__u8, daddr_v6, 16)
+		__field(int, err)
 	),
 
 	TP_fast_assign(
@@ -58,21 +53,17 @@ DECLARE_EVENT_CLASS(tcp_event_sk_skb,
 
 		TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
 			      sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
+
+		__entry->err = err;
 	),
 
-	TP_printk("skbaddr=%p skaddr=%p family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c state=%s",
+	TP_printk("skbaddr=%p skaddr=%p family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c state=%s err=%d",
 		  __entry->skbaddr, __entry->skaddr,
 		  show_family_name(__entry->family),
 		  __entry->sport, __entry->dport, __entry->saddr, __entry->daddr,
 		  __entry->saddr_v6, __entry->daddr_v6,
-		  show_tcp_state_name(__entry->state))
-);
-
-DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb,
-
-	TP_PROTO(const struct sock *sk, const struct sk_buff *skb),
-
-	TP_ARGS(sk, skb)
+		  show_tcp_state_name(__entry->state),
+		  __entry->err)
 );
 
 #undef FN
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b616776e3354..caf11920a878 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3330,8 +3330,10 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 	if (icsk->icsk_mtup.probe_size)
 		icsk->icsk_mtup.probe_size = 0;
 
-	if (skb_still_in_host_queue(sk, skb))
-		return -EBUSY;
+	if (skb_still_in_host_queue(sk, skb)) {
+		err = -EBUSY;
+		goto out;
+	}
 
 start:
 	if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
@@ -3342,14 +3344,19 @@ start:
 		}
 		if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) {
 			WARN_ON_ONCE(1);
-			return -EINVAL;
+			err = -EINVAL;
+			goto out;
+		}
+		if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) {
+			err = -ENOMEM;
+			goto out;
 		}
-		if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
-			return -ENOMEM;
 	}
 
-	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
-		return -EHOSTUNREACH; /* Routing failure or similar. */
+	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) {
+		err = -EHOSTUNREACH; /* Routing failure or similar. */
+		goto out;
+	}
 
 	cur_mss = tcp_current_mss(sk);
 	avail_wnd = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
@@ -3360,8 +3367,10 @@ start:
 	 * our retransmit of one segment serves as a zero window probe.
 	 */
 	if (avail_wnd <= 0) {
-		if (TCP_SKB_CB(skb)->seq != tp->snd_una)
-			return -EAGAIN;
+		if (TCP_SKB_CB(skb)->seq != tp->snd_una) {
+			err = -EAGAIN;
+			goto out;
+		}
 		avail_wnd = cur_mss;
 	}
 
@@ -3373,11 +3382,15 @@ start:
 	}
 	if (skb->len > len) {
 		if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len,
-				 cur_mss, GFP_ATOMIC))
-			return -ENOMEM; /* We'll try again later. */
+				 cur_mss, GFP_ATOMIC)) {
+			err = -ENOMEM;  /* We'll try again later. */
+			goto out;
+		}
 	} else {
-		if (skb_unclone_keeptruesize(skb, GFP_ATOMIC))
-			return -ENOMEM;
+		if (skb_unclone_keeptruesize(skb, GFP_ATOMIC)) {
+			err = -ENOMEM;
+			goto out;
+		}
 
 		diff = tcp_skb_pcount(skb);
 		tcp_set_skb_tso_segs(skb, cur_mss);
@@ -3431,17 +3444,16 @@ start:
 		tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB,
 				  TCP_SKB_CB(skb)->seq, segs, err);
 
-	if (likely(!err)) {
-		trace_tcp_retransmit_skb(sk, skb);
-	} else if (err != -EBUSY) {
+	if (unlikely(err) && err != -EBUSY)
 		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
-	}
 
 	/* To avoid taking spuriously low RTT samples based on a timestamp
 	 * for a transmit that never happened, always mark EVER_RETRANS
 	 */
 	TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
 
+out:
+	trace_tcp_retransmit_skb(sk, skb, err);
 	return err;
 }
 
-- 
cgit v1.2.3


From 0a61ec9cc51b0e43981222005444508437e95b33 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@nvidia.com>
Date: Thu, 17 Jul 2025 15:17:25 +0300
Subject: PCI/TPH: Expose pcie_tph_get_st_table_size()

Expose pcie_tph_get_st_table_size() so that drivers can use it, as will
be done in the next patch of the series.

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://patch.msgid.link/9ae851e0ee42cc56d2a30276e116b65091030ceb.1752752567.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 drivers/pci/tph.c       | 11 ++++++-----
 include/linux/pci-tph.h |  1 +
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/pci/tph.c b/drivers/pci/tph.c
index 77fce5e1b830..cc64f93709a4 100644
--- a/drivers/pci/tph.c
+++ b/drivers/pci/tph.c
@@ -168,7 +168,7 @@ static u32 get_st_table_loc(struct pci_dev *pdev)
  * Return the size of ST table. If ST table is not in TPH Requester Extended
  * Capability space, return 0. Otherwise return the ST Table Size + 1.
  */
-static u16 get_st_table_size(struct pci_dev *pdev)
+u16 pcie_tph_get_st_table_size(struct pci_dev *pdev)
 {
 	u32 reg;
 	u32 loc;
@@ -185,6 +185,7 @@ static u16 get_st_table_size(struct pci_dev *pdev)
 
 	return FIELD_GET(PCI_TPH_CAP_ST_MASK, reg) + 1;
 }
+EXPORT_SYMBOL(pcie_tph_get_st_table_size);
 
 /* Return device's Root Port completer capability */
 static u8 get_rp_completer_type(struct pci_dev *pdev)
@@ -211,7 +212,7 @@ static int write_tag_to_st_table(struct pci_dev *pdev, int index, u16 tag)
 	int offset;
 
 	/* Check if index is out of bound */
-	st_table_size = get_st_table_size(pdev);
+	st_table_size = pcie_tph_get_st_table_size(pdev);
 	if (index >= st_table_size)
 		return -ENXIO;
 
@@ -443,7 +444,7 @@ void pci_restore_tph_state(struct pci_dev *pdev)
 	pci_write_config_dword(pdev, pdev->tph_cap + PCI_TPH_CTRL, *cap++);
 	st_entry = (u16 *)cap;
 	offset = PCI_TPH_BASE_SIZEOF;
-	num_entries = get_st_table_size(pdev);
+	num_entries = pcie_tph_get_st_table_size(pdev);
 	for (i = 0; i < num_entries; i++) {
 		pci_write_config_word(pdev, pdev->tph_cap + offset,
 				      *st_entry++);
@@ -475,7 +476,7 @@ void pci_save_tph_state(struct pci_dev *pdev)
 	/* Save all ST entries in extended capability structure */
 	st_entry = (u16 *)cap;
 	offset = PCI_TPH_BASE_SIZEOF;
-	num_entries = get_st_table_size(pdev);
+	num_entries = pcie_tph_get_st_table_size(pdev);
 	for (i = 0; i < num_entries; i++) {
 		pci_read_config_word(pdev, pdev->tph_cap + offset,
 				     st_entry++);
@@ -499,7 +500,7 @@ void pci_tph_init(struct pci_dev *pdev)
 	if (!pdev->tph_cap)
 		return;
 
-	num_entries = get_st_table_size(pdev);
+	num_entries = pcie_tph_get_st_table_size(pdev);
 	save_size = sizeof(u32) + num_entries * sizeof(u16);
 	pci_add_ext_cap_save_buffer(pdev, PCI_EXT_CAP_ID_TPH, save_size);
 }
diff --git a/include/linux/pci-tph.h b/include/linux/pci-tph.h
index c3e806c13d64..9e4e331b1603 100644
--- a/include/linux/pci-tph.h
+++ b/include/linux/pci-tph.h
@@ -28,6 +28,7 @@ int pcie_tph_get_cpu_st(struct pci_dev *dev,
 			unsigned int cpu_uid, u16 *tag);
 void pcie_disable_tph(struct pci_dev *pdev);
 int pcie_enable_tph(struct pci_dev *pdev, int mode);
+u16 pcie_tph_get_st_table_size(struct pci_dev *pdev);
 #else
 static inline int pcie_tph_set_st_entry(struct pci_dev *pdev,
 					unsigned int index, u16 tag)
-- 
cgit v1.2.3


From 5f9ec7880e6b3c4d0cf242fe28506d0b084328b1 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@nvidia.com>
Date: Thu, 17 Jul 2025 15:17:26 +0300
Subject: net/mlx5: Expose IFC bits for TPH

Expose IFC bits for the TPH functionality.

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Reviewed-by: Edward Srouji <edwards@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Link: https://patch.msgid.link/38ea3a0d56551364214e8edf359c9c77c9a3b71b.1752752567.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/mlx5_ifc.h | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index ed4130e49c27..8360d9011d4f 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1871,7 +1871,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         reserved_at_280[0x10];
 	u8         max_wqe_sz_sq[0x10];
 
-	u8         reserved_at_2a0[0xb];
+	u8         reserved_at_2a0[0x7];
+	u8         mkey_pcie_tph[0x1];
+	u8         reserved_at_2a8[0x3];
 	u8         shampo[0x1];
 	u8         reserved_at_2ac[0x4];
 	u8         max_wqe_sz_rq[0x10];
@@ -4418,6 +4420,10 @@ enum {
 	MLX5_MKC_ACCESS_MODE_CROSSING = 0x6,
 };
 
+enum {
+	MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX = 0,
+};
+
 struct mlx5_ifc_mkc_bits {
 	u8         reserved_at_0[0x1];
 	u8         free[0x1];
@@ -4469,7 +4475,11 @@ struct mlx5_ifc_mkc_bits {
 	u8         relaxed_ordering_read[0x1];
 	u8         log_page_size[0x6];
 
-	u8         reserved_at_1e0[0x20];
+	u8         reserved_at_1e0[0x5];
+	u8         pcie_tph_en[0x1];
+	u8         pcie_tph_ph[0x2];
+	u8         pcie_tph_steering_tag_index[0x8];
+	u8         reserved_at_1f0[0x10];
 };
 
 struct mlx5_ifc_pkey_bits {
-- 
cgit v1.2.3


From 888a7776f4fb04c19bec70c737c61c2f383c6b1e Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@nvidia.com>
Date: Thu, 17 Jul 2025 15:17:27 +0300
Subject: net/mlx5: Add support for device steering tag

Background, from PCIe specification 6.2.

TLP Processing Hints (TPH)
--------------------------
TLP Processing Hints is an optional feature that provides hints in
Request TLP headers to facilitate optimized processing of Requests that
target Memory Space. These Processing Hints enable the system hardware
(e.g., the Root Complex and/or Endpoints) to optimize platform
resources such as system and memory interconnect on a per TLP basis.
Steering Tags are system-specific values used to identify a processing
resource that a Requester explicitly targets. System software discovers
and identifies TPH capabilities to determine the Steering Tag allocation
for each Function that supports TPH.

This patch adds steering tag support for mlx5 based NICs by:

- Enabling the TPH functionality over PCI if both FW and OS support it.
- Managing steering tags and their matching steering indexes by
  writing a ST to an ST index over the PCI configuration space.
- Exposing APIs to upper layers (e.g., mlx5_ib) to allow usage of
  the PCI TPH infrastructure.

Further details:
- Upon probing of a device, the feature will be enabled based
  on both capability detection and OS support.

- It will retrieve the appropriate ST for a given CPU ID and memory
  type using the pcie_tph_get_cpu_st() API.

- It will track available ST indices according to the configuration
  space table size (expected to be 63 entries), reserving index 0 to
  indicate non-TPH use.

- It will assign a free ST index with a ST using the
  pcie_tph_set_st_entry() API.

- It will reuse the same index for identical (CPU ID + memory type)
  combinations by maintaining a reference count per entry.

- It will expose APIs to upper layers (e.g., mlx5_ib) to allow usage of
  the PCI TPH infrastructure.

- An SF will use its parent PF's ST object.

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Link: https://patch.msgid.link/de1ae7398e9e34eacd8c10845683df44fc9e32f8.1752752567.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   5 +
 drivers/net/ethernet/mellanox/mlx5/core/lib/st.c   | 164 +++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |   2 +
 .../net/ethernet/mellanox/mlx5/core/mlx5_core.h    |   9 ++
 include/linux/mlx5/driver.h                        |  20 +++
 5 files changed, 200 insertions(+)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/st.c

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index d292e6a9e22c..bd9d46c6719f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -167,5 +167,10 @@ mlx5_core-$(CONFIG_MLX5_SF) += sf/vhca_event.o sf/dev/dev.o sf/dev/driver.o irq_
 #
 mlx5_core-$(CONFIG_MLX5_SF_MANAGER) += sf/cmd.o sf/hw_table.o sf/devlink.o
 
+#
+# TPH support
+#
+mlx5_core-$(CONFIG_PCIE_TPH) += lib/st.o
+
 obj-$(CONFIG_MLX5_DPLL) += mlx5_dpll.o
 mlx5_dpll-y :=	dpll.o
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c
new file mode 100644
index 000000000000..47fe215f66bf
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c
@@ -0,0 +1,164 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/device.h>
+
+#include "mlx5_core.h"
+#include "lib/mlx5.h"
+
+struct mlx5_st_idx_data {
+	refcount_t usecount;
+	u16 tag;
+};
+
+struct mlx5_st {
+	/* serialize access upon alloc/free flows */
+	struct mutex lock;
+	struct xa_limit index_limit;
+	struct xarray idx_xa; /* key == index, value == struct mlx5_st_idx_data */
+};
+
+struct mlx5_st *mlx5_st_create(struct mlx5_core_dev *dev)
+{
+	struct pci_dev *pdev = dev->pdev;
+	struct mlx5_st *st;
+	u16 num_entries;
+	int ret;
+
+	if (!MLX5_CAP_GEN(dev, mkey_pcie_tph))
+		return NULL;
+
+#ifdef CONFIG_MLX5_SF
+	if (mlx5_core_is_sf(dev))
+		return dev->priv.parent_mdev->st;
+#endif
+
+	/* Checking whether the device is capable */
+	if (!pdev->tph_cap)
+		return NULL;
+
+	num_entries = pcie_tph_get_st_table_size(pdev);
+	/* We need a reserved entry for non TPH cases */
+	if (num_entries < 2)
+		return NULL;
+
+	/* The OS doesn't support ST */
+	ret = pcie_enable_tph(pdev, PCI_TPH_ST_DS_MODE);
+	if (ret)
+		return NULL;
+
+	st = kzalloc(sizeof(*st), GFP_KERNEL);
+	if (!st)
+		goto end;
+
+	mutex_init(&st->lock);
+	xa_init_flags(&st->idx_xa, XA_FLAGS_ALLOC);
+	/* entry 0 is reserved for non TPH cases */
+	st->index_limit.min = MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX + 1;
+	st->index_limit.max = num_entries - 1;
+
+	return st;
+
+end:
+	pcie_disable_tph(dev->pdev);
+	return NULL;
+}
+
+void mlx5_st_destroy(struct mlx5_core_dev *dev)
+{
+	struct mlx5_st *st = dev->st;
+
+	if (mlx5_core_is_sf(dev) || !st)
+		return;
+
+	pcie_disable_tph(dev->pdev);
+	WARN_ON_ONCE(!xa_empty(&st->idx_xa));
+	kfree(st);
+}
+
+int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type,
+			unsigned int cpu_uid, u16 *st_index)
+{
+	struct mlx5_st_idx_data *idx_data;
+	struct mlx5_st *st = dev->st;
+	unsigned long index;
+	u32 xa_id;
+	u16 tag;
+	int ret;
+
+	if (!st)
+		return -EOPNOTSUPP;
+
+	ret = pcie_tph_get_cpu_st(dev->pdev, mem_type, cpu_uid, &tag);
+	if (ret)
+		return ret;
+
+	mutex_lock(&st->lock);
+
+	xa_for_each(&st->idx_xa, index, idx_data) {
+		if (tag == idx_data->tag) {
+			refcount_inc(&idx_data->usecount);
+			*st_index = index;
+			goto end;
+		}
+	}
+
+	idx_data = kzalloc(sizeof(*idx_data), GFP_KERNEL);
+	if (!idx_data) {
+		ret = -ENOMEM;
+		goto end;
+	}
+
+	refcount_set(&idx_data->usecount, 1);
+	idx_data->tag = tag;
+
+	ret = xa_alloc(&st->idx_xa, &xa_id, idx_data, st->index_limit, GFP_KERNEL);
+	if (ret)
+		goto clean_idx_data;
+
+	ret = pcie_tph_set_st_entry(dev->pdev, xa_id, tag);
+	if (ret)
+		goto clean_idx_xa;
+
+	*st_index = xa_id;
+	goto end;
+
+clean_idx_xa:
+	xa_erase(&st->idx_xa, xa_id);
+clean_idx_data:
+	kfree(idx_data);
+end:
+	mutex_unlock(&st->lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mlx5_st_alloc_index);
+
+int mlx5_st_dealloc_index(struct mlx5_core_dev *dev, u16 st_index)
+{
+	struct mlx5_st_idx_data *idx_data;
+	struct mlx5_st *st = dev->st;
+	int ret = 0;
+
+	if (!st)
+		return -EOPNOTSUPP;
+
+	mutex_lock(&st->lock);
+	idx_data = xa_load(&st->idx_xa, st_index);
+	if (WARN_ON_ONCE(!idx_data)) {
+		ret = -EINVAL;
+		goto end;
+	}
+
+	if (refcount_dec_and_test(&idx_data->usecount)) {
+		xa_erase(&st->idx_xa, st_index);
+		/* We leave PCI config space as was before, no mkey will refer to it */
+	}
+
+end:
+	mutex_unlock(&st->lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mlx5_st_dealloc_index);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index b0043cfee29b..be3be043134f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1102,6 +1102,7 @@ static int mlx5_init_once(struct mlx5_core_dev *dev)
 	}
 
 	dev->dm = mlx5_dm_create(dev);
+	dev->st = mlx5_st_create(dev);
 	dev->tracer = mlx5_fw_tracer_create(dev);
 	dev->hv_vhca = mlx5_hv_vhca_create(dev);
 	dev->rsc_dump = mlx5_rsc_dump_create(dev);
@@ -1150,6 +1151,7 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev)
 	mlx5_rsc_dump_destroy(dev);
 	mlx5_hv_vhca_destroy(dev->hv_vhca);
 	mlx5_fw_tracer_destroy(dev->tracer);
+	mlx5_st_destroy(dev);
 	mlx5_dm_cleanup(dev);
 	mlx5_fs_core_free(dev);
 	mlx5_sf_table_cleanup(dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index 2e02bdea8361..1cada2f87acf 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -300,6 +300,15 @@ int mlx5_set_mtppse(struct mlx5_core_dev *mdev, u8 pin, u8 arm, u8 mode);
 struct mlx5_dm *mlx5_dm_create(struct mlx5_core_dev *dev);
 void mlx5_dm_cleanup(struct mlx5_core_dev *dev);
 
+#ifdef CONFIG_PCIE_TPH
+struct mlx5_st *mlx5_st_create(struct mlx5_core_dev *dev);
+void mlx5_st_destroy(struct mlx5_core_dev *dev);
+#else
+static inline struct mlx5_st *
+mlx5_st_create(struct mlx5_core_dev *dev) { return NULL; }
+static inline void mlx5_st_destroy(struct mlx5_core_dev *dev) { return; }
+#endif
+
 void mlx5_toggle_port_link(struct mlx5_core_dev *dev);
 int mlx5_set_port_admin_status(struct mlx5_core_dev *dev,
 			       enum mlx5_port_status status);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index e6ba8f4f4bd1..104d4921c032 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -36,6 +36,7 @@
 #include <linux/kernel.h>
 #include <linux/completion.h>
 #include <linux/pci.h>
+#include <linux/pci-tph.h>
 #include <linux/irq.h>
 #include <linux/spinlock_types.h>
 #include <linux/semaphore.h>
@@ -688,6 +689,7 @@ struct mlx5_fw_tracer;
 struct mlx5_vxlan;
 struct mlx5_geneve;
 struct mlx5_hv_vhca;
+struct mlx5_st;
 
 #define MLX5_LOG_SW_ICM_BLOCK_SIZE(dev) (MLX5_CAP_DEV_MEM(dev, log_sw_icm_alloc_granularity))
 #define MLX5_SW_ICM_BLOCK_SIZE(dev) (1 << MLX5_LOG_SW_ICM_BLOCK_SIZE(dev))
@@ -757,6 +759,7 @@ struct mlx5_core_dev {
 	u32			issi;
 	struct mlx5e_resources  mlx5e_res;
 	struct mlx5_dm          *dm;
+	struct mlx5_st          *st;
 	struct mlx5_vxlan       *vxlan;
 	struct mlx5_geneve      *geneve;
 	struct {
@@ -1160,6 +1163,23 @@ int mlx5_dm_sw_icm_alloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type,
 int mlx5_dm_sw_icm_dealloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type,
 			   u64 length, u16 uid, phys_addr_t addr, u32 obj_id);
 
+#ifdef CONFIG_PCIE_TPH
+int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type,
+			unsigned int cpu_uid, u16 *st_index);
+int mlx5_st_dealloc_index(struct mlx5_core_dev *dev, u16 st_index);
+#else
+static inline int mlx5_st_alloc_index(struct mlx5_core_dev *dev,
+				      enum tph_mem_type mem_type,
+				      unsigned int cpu_uid, u16 *st_index)
+{
+	return -EOPNOTSUPP;
+}
+static inline int mlx5_st_dealloc_index(struct mlx5_core_dev *dev, u16 st_index)
+{
+	return -EOPNOTSUPP;
+}
+#endif
+
 struct mlx5_core_dev *mlx5_vf_get_core_dev(struct pci_dev *pdev);
 void mlx5_vf_put_core_dev(struct mlx5_core_dev *mdev);
 
-- 
cgit v1.2.3


From 5b2e45049dc06a876bc6b138218ddeb0814502ef Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@nvidia.com>
Date: Thu, 17 Jul 2025 15:17:28 +0300
Subject: IB/core: Add UVERBS_METHOD_REG_MR on the MR object

This new method enables us to use a single ioctl from user space which
supports the below variants of reg_mr [1].

The method will be extended in the next patches from the series with an
extra attribute to let us pass DMA handle to be used as part of the
registration.

[1] ibv_reg_mr(), ibv_reg_mr_iova(), ibv_reg_mr_iova2(),
ibv_reg_dmabuf_mr().

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Reviewed-by: Edward Srouji <edwards@nvidia.com>
Link: https://patch.msgid.link/5a3822ceef084efe967c9752e89c58d8250337c7.1752752567.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 drivers/infiniband/core/uverbs_std_types_mr.c | 153 +++++++++++++++++++++++++-
 include/uapi/rdma/ib_user_ioctl_cmds.h        |  14 +++
 2 files changed, 166 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c
index 7ebc7bd3caae..1bd4b17b5515 100644
--- a/drivers/infiniband/core/uverbs_std_types_mr.c
+++ b/drivers/infiniband/core/uverbs_std_types_mr.c
@@ -266,6 +266,122 @@ static int UVERBS_HANDLER(UVERBS_METHOD_REG_DMABUF_MR)(
 	return ret;
 }
 
+static int UVERBS_HANDLER(UVERBS_METHOD_REG_MR)(
+	struct uverbs_attr_bundle *attrs)
+{
+	struct ib_uobject *uobj =
+		uverbs_attr_get_uobject(attrs, UVERBS_ATTR_REG_MR_HANDLE);
+	struct ib_pd *pd =
+		uverbs_attr_get_obj(attrs, UVERBS_ATTR_REG_MR_PD_HANDLE);
+	u32 valid_access_flags = IB_ACCESS_SUPPORTED;
+	u64 length, iova, fd_offset = 0, addr = 0;
+	struct ib_device *ib_dev = pd->device;
+	bool has_fd_offset = false;
+	bool has_addr = false;
+	bool has_fd = false;
+	u32 access_flags;
+	struct ib_mr *mr;
+	int fd;
+	int ret;
+
+	ret = uverbs_copy_from(&iova, attrs, UVERBS_ATTR_REG_MR_IOVA);
+	if (ret)
+		return ret;
+
+	ret = uverbs_copy_from(&length, attrs, UVERBS_ATTR_REG_MR_LENGTH);
+	if (ret)
+		return ret;
+
+	if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_REG_MR_ADDR)) {
+		ret = uverbs_copy_from(&addr, attrs,
+				       UVERBS_ATTR_REG_MR_ADDR);
+		if (ret)
+			return ret;
+		has_addr = true;
+	}
+
+	if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_REG_MR_FD_OFFSET)) {
+		ret = uverbs_copy_from(&fd_offset, attrs,
+				       UVERBS_ATTR_REG_MR_FD_OFFSET);
+		if (ret)
+			return ret;
+		has_fd_offset = true;
+	}
+
+	if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_REG_MR_FD)) {
+		ret = uverbs_get_raw_fd(&fd, attrs,
+					UVERBS_ATTR_REG_MR_FD);
+		if (ret)
+			return ret;
+		has_fd = true;
+	}
+
+	if (has_fd) {
+		if (!ib_dev->ops.reg_user_mr_dmabuf)
+			return -EOPNOTSUPP;
+
+		/* FD requires offset and can't come with addr */
+		if (!has_fd_offset || has_addr)
+			return -EINVAL;
+
+		if ((fd_offset & ~PAGE_MASK) != (iova & ~PAGE_MASK))
+			return -EINVAL;
+
+		valid_access_flags = IB_ACCESS_LOCAL_WRITE |
+				     IB_ACCESS_REMOTE_READ |
+				     IB_ACCESS_REMOTE_WRITE |
+				     IB_ACCESS_REMOTE_ATOMIC |
+				     IB_ACCESS_RELAXED_ORDERING;
+	} else {
+		if (!has_addr || has_fd_offset)
+			return -EINVAL;
+
+		if ((addr & ~PAGE_MASK) != (iova & ~PAGE_MASK))
+			return -EINVAL;
+	}
+
+	ret = uverbs_get_flags32(&access_flags, attrs,
+				 UVERBS_ATTR_REG_MR_ACCESS_FLAGS,
+				 valid_access_flags);
+	if (ret)
+		return ret;
+
+	ret = ib_check_mr_access(ib_dev, access_flags);
+	if (ret)
+		return ret;
+
+	if (has_fd)
+		mr = pd->device->ops.reg_user_mr_dmabuf(pd, fd_offset, length, iova,
+							fd, access_flags, attrs);
+	else
+		mr = pd->device->ops.reg_user_mr(pd, addr, length,
+						 iova, access_flags, NULL);
+
+	if (IS_ERR(mr))
+		return PTR_ERR(mr);
+
+	mr->device = pd->device;
+	mr->pd = pd;
+	mr->type = IB_MR_TYPE_USER;
+	mr->uobject = uobj;
+	atomic_inc(&pd->usecnt);
+	rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR);
+	rdma_restrack_set_name(&mr->res, NULL);
+	rdma_restrack_add(&mr->res);
+	uobj->object = mr;
+
+	uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_REG_MR_HANDLE);
+
+	ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_MR_RESP_LKEY,
+			     &mr->lkey, sizeof(mr->lkey));
+	if (ret)
+		return ret;
+
+	ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_MR_RESP_RKEY,
+			     &mr->rkey, sizeof(mr->rkey));
+	return ret;
+}
+
 DECLARE_UVERBS_NAMED_METHOD(
 	UVERBS_METHOD_ADVISE_MR,
 	UVERBS_ATTR_IDR(UVERBS_ATTR_ADVISE_MR_PD_HANDLE,
@@ -362,6 +478,40 @@ DECLARE_UVERBS_NAMED_METHOD(
 			    UVERBS_ATTR_TYPE(u32),
 			    UA_MANDATORY));
 
+DECLARE_UVERBS_NAMED_METHOD(
+	UVERBS_METHOD_REG_MR,
+	UVERBS_ATTR_IDR(UVERBS_ATTR_REG_MR_HANDLE,
+			UVERBS_OBJECT_MR,
+			UVERBS_ACCESS_NEW,
+			UA_MANDATORY),
+	UVERBS_ATTR_IDR(UVERBS_ATTR_REG_MR_PD_HANDLE,
+			UVERBS_OBJECT_PD,
+			UVERBS_ACCESS_READ,
+			UA_MANDATORY),
+	UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_MR_IOVA,
+			   UVERBS_ATTR_TYPE(u64),
+			   UA_MANDATORY),
+	UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_MR_LENGTH,
+			   UVERBS_ATTR_TYPE(u64),
+			   UA_MANDATORY),
+	UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_REG_MR_ACCESS_FLAGS,
+			     enum ib_access_flags,
+			     UA_MANDATORY),
+	UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_MR_ADDR,
+			   UVERBS_ATTR_TYPE(u64),
+			   UA_OPTIONAL),
+	UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_MR_FD_OFFSET,
+			   UVERBS_ATTR_TYPE(u64),
+			   UA_OPTIONAL),
+	UVERBS_ATTR_RAW_FD(UVERBS_ATTR_REG_MR_FD,
+			   UA_OPTIONAL),
+	UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_MR_RESP_LKEY,
+			    UVERBS_ATTR_TYPE(u32),
+			    UA_MANDATORY),
+	UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_MR_RESP_RKEY,
+			    UVERBS_ATTR_TYPE(u32),
+			    UA_MANDATORY));
+
 DECLARE_UVERBS_NAMED_METHOD_DESTROY(
 	UVERBS_METHOD_MR_DESTROY,
 	UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_MR_HANDLE,
@@ -376,7 +526,8 @@ DECLARE_UVERBS_NAMED_OBJECT(
 	&UVERBS_METHOD(UVERBS_METHOD_DM_MR_REG),
 	&UVERBS_METHOD(UVERBS_METHOD_MR_DESTROY),
 	&UVERBS_METHOD(UVERBS_METHOD_QUERY_MR),
-	&UVERBS_METHOD(UVERBS_METHOD_REG_DMABUF_MR));
+	&UVERBS_METHOD(UVERBS_METHOD_REG_DMABUF_MR),
+	&UVERBS_METHOD(UVERBS_METHOD_REG_MR));
 
 const struct uapi_definition uverbs_def_obj_mr[] = {
 	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_MR,
diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h
index 5f3e5bee51b2..ece923ab48a0 100644
--- a/include/uapi/rdma/ib_user_ioctl_cmds.h
+++ b/include/uapi/rdma/ib_user_ioctl_cmds.h
@@ -257,6 +257,7 @@ enum uverbs_methods_mr {
 	UVERBS_METHOD_ADVISE_MR,
 	UVERBS_METHOD_QUERY_MR,
 	UVERBS_METHOD_REG_DMABUF_MR,
+	UVERBS_METHOD_REG_MR,
 };
 
 enum uverbs_attrs_mr_destroy_ids {
@@ -290,6 +291,19 @@ enum uverbs_attrs_reg_dmabuf_mr_cmd_attr_ids {
 	UVERBS_ATTR_REG_DMABUF_MR_RESP_RKEY,
 };
 
+enum uverbs_attrs_reg_mr_cmd_attr_ids {
+	UVERBS_ATTR_REG_MR_HANDLE,
+	UVERBS_ATTR_REG_MR_PD_HANDLE,
+	UVERBS_ATTR_REG_MR_IOVA,
+	UVERBS_ATTR_REG_MR_ADDR,
+	UVERBS_ATTR_REG_MR_LENGTH,
+	UVERBS_ATTR_REG_MR_ACCESS_FLAGS,
+	UVERBS_ATTR_REG_MR_FD,
+	UVERBS_ATTR_REG_MR_FD_OFFSET,
+	UVERBS_ATTR_REG_MR_RESP_LKEY,
+	UVERBS_ATTR_REG_MR_RESP_RKEY,
+};
+
 enum uverbs_attrs_create_counters_cmd_attr_ids {
 	UVERBS_ATTR_CREATE_COUNTERS_HANDLE,
 };
-- 
cgit v1.2.3


From d83edab562a496a42720902a1d2effccd05c37c5 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@nvidia.com>
Date: Thu, 17 Jul 2025 15:17:29 +0300
Subject: RDMA/core: Introduce a DMAH object and its alloc/free APIs

Introduce a new DMA handle (DMAH) object along with its corresponding
allocation and deallocation APIs.

This DMAH object encapsulates attributes intended for use in DMA
transactions.

While its initial purpose is to support TPH functionality, it is
designed to be extensible for future features such as DMA PCI multipath,
PCI UIO configurations, PCI traffic class selection, and more.

Further details:
----------------
We ensure that a caller requesting a DMA handle for a specific CPU ID is
permitted to be scheduled on it. This prevents a potential security issue
where a non-privileged user may trigger DMA operations toward a CPU that
it is not allowed to run on.

We manage reference counting for the DMAH object and its consumers
(e.g., memory regions) as will be detailed in subsequent patches in the
series.

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Reviewed-by: Edward Srouji <edwards@nvidia.com>
Link: https://patch.msgid.link/2cad097e849597e49d6b61e6865dba878257f371.1752752567.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 drivers/infiniband/core/Makefile                |   1 +
 drivers/infiniband/core/device.c                |   3 +
 drivers/infiniband/core/rdma_core.h             |   1 +
 drivers/infiniband/core/restrack.c              |   2 +
 drivers/infiniband/core/uverbs_std_types_dmah.c | 145 ++++++++++++++++++++++++
 drivers/infiniband/core/uverbs_uapi.c           |   1 +
 include/rdma/ib_verbs.h                         |  26 +++++
 include/rdma/restrack.h                         |   4 +
 include/uapi/rdma/ib_user_ioctl_cmds.h          |  17 +++
 9 files changed, 200 insertions(+)
 create mode 100644 drivers/infiniband/core/uverbs_std_types_dmah.c

(limited to 'include')

diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index d49ded7e95f0..f483e0c12444 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -33,6 +33,7 @@ ib_umad-y :=			user_mad.o
 ib_uverbs-y :=			uverbs_main.o uverbs_cmd.o uverbs_marshall.o \
 				rdma_core.o uverbs_std_types.o uverbs_ioctl.o \
 				uverbs_std_types_cq.o \
+				uverbs_std_types_dmah.o \
 				uverbs_std_types_flow_action.o uverbs_std_types_dm.o \
 				uverbs_std_types_mr.o uverbs_std_types_counters.o \
 				uverbs_uapi.o uverbs_std_types_device.o \
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index f301cdce1728..3145cb34a1d2 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -2708,6 +2708,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
 	SET_DEVICE_OP(dev_ops, add_sub_dev);
 	SET_DEVICE_OP(dev_ops, advise_mr);
 	SET_DEVICE_OP(dev_ops, alloc_dm);
+	SET_DEVICE_OP(dev_ops, alloc_dmah);
 	SET_DEVICE_OP(dev_ops, alloc_hw_device_stats);
 	SET_DEVICE_OP(dev_ops, alloc_hw_port_stats);
 	SET_DEVICE_OP(dev_ops, alloc_mr);
@@ -2736,6 +2737,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
 	SET_DEVICE_OP(dev_ops, create_user_ah);
 	SET_DEVICE_OP(dev_ops, create_wq);
 	SET_DEVICE_OP(dev_ops, dealloc_dm);
+	SET_DEVICE_OP(dev_ops, dealloc_dmah);
 	SET_DEVICE_OP(dev_ops, dealloc_driver);
 	SET_DEVICE_OP(dev_ops, dealloc_mw);
 	SET_DEVICE_OP(dev_ops, dealloc_pd);
@@ -2833,6 +2835,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
 	SET_OBJ_SIZE(dev_ops, ib_ah);
 	SET_OBJ_SIZE(dev_ops, ib_counters);
 	SET_OBJ_SIZE(dev_ops, ib_cq);
+	SET_OBJ_SIZE(dev_ops, ib_dmah);
 	SET_OBJ_SIZE(dev_ops, ib_mw);
 	SET_OBJ_SIZE(dev_ops, ib_pd);
 	SET_OBJ_SIZE(dev_ops, ib_qp);
diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h
index 33706dad6c0f..a59b087611cb 100644
--- a/drivers/infiniband/core/rdma_core.h
+++ b/drivers/infiniband/core/rdma_core.h
@@ -156,6 +156,7 @@ extern const struct uapi_definition uverbs_def_obj_counters[];
 extern const struct uapi_definition uverbs_def_obj_cq[];
 extern const struct uapi_definition uverbs_def_obj_device[];
 extern const struct uapi_definition uverbs_def_obj_dm[];
+extern const struct uapi_definition uverbs_def_obj_dmah[];
 extern const struct uapi_definition uverbs_def_obj_flow_action[];
 extern const struct uapi_definition uverbs_def_obj_intf[];
 extern const struct uapi_definition uverbs_def_obj_mr[];
diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c
index 3313410014cd..a7de6f403fca 100644
--- a/drivers/infiniband/core/restrack.c
+++ b/drivers/infiniband/core/restrack.c
@@ -100,6 +100,8 @@ static struct ib_device *res_to_dev(struct rdma_restrack_entry *res)
 		return container_of(res, struct rdma_counter, res)->device;
 	case RDMA_RESTRACK_SRQ:
 		return container_of(res, struct ib_srq, res)->device;
+	case RDMA_RESTRACK_DMAH:
+		return container_of(res, struct ib_dmah, res)->device;
 	default:
 		WARN_ONCE(true, "Wrong resource tracking type %u\n", res->type);
 		return NULL;
diff --git a/drivers/infiniband/core/uverbs_std_types_dmah.c b/drivers/infiniband/core/uverbs_std_types_dmah.c
new file mode 100644
index 000000000000..453ce656c6f2
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_std_types_dmah.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#include "rdma_core.h"
+#include "uverbs.h"
+#include <rdma/uverbs_std_types.h>
+#include "restrack.h"
+
+static int uverbs_free_dmah(struct ib_uobject *uobject,
+			    enum rdma_remove_reason why,
+			    struct uverbs_attr_bundle *attrs)
+{
+	struct ib_dmah *dmah = uobject->object;
+	int ret;
+
+	if (atomic_read(&dmah->usecnt))
+		return -EBUSY;
+
+	ret = dmah->device->ops.dealloc_dmah(dmah, attrs);
+	if (ret)
+		return ret;
+
+	rdma_restrack_del(&dmah->res);
+	kfree(dmah);
+	return 0;
+}
+
+static int UVERBS_HANDLER(UVERBS_METHOD_DMAH_ALLOC)(
+	struct uverbs_attr_bundle *attrs)
+{
+	struct ib_uobject *uobj =
+		uverbs_attr_get(attrs, UVERBS_ATTR_ALLOC_DMAH_HANDLE)
+			->obj_attr.uobject;
+	struct ib_device *ib_dev = attrs->context->device;
+	struct ib_dmah *dmah;
+	int ret;
+
+	dmah = rdma_zalloc_drv_obj(ib_dev, ib_dmah);
+	if (!dmah)
+		return -ENOMEM;
+
+	if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_ALLOC_DMAH_CPU_ID)) {
+		ret = uverbs_copy_from(&dmah->cpu_id, attrs,
+				       UVERBS_ATTR_ALLOC_DMAH_CPU_ID);
+		if (ret)
+			goto err;
+
+		if (!cpumask_test_cpu(dmah->cpu_id, current->cpus_ptr)) {
+			ret = -EPERM;
+			goto err;
+		}
+
+		dmah->valid_fields |= BIT(IB_DMAH_CPU_ID_EXISTS);
+	}
+
+	if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_ALLOC_DMAH_TPH_MEM_TYPE)) {
+		dmah->mem_type = uverbs_attr_get_enum_id(attrs,
+					UVERBS_ATTR_ALLOC_DMAH_TPH_MEM_TYPE);
+		dmah->valid_fields |= BIT(IB_DMAH_MEM_TYPE_EXISTS);
+	}
+
+	if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_ALLOC_DMAH_PH)) {
+		ret = uverbs_copy_from(&dmah->ph, attrs,
+				       UVERBS_ATTR_ALLOC_DMAH_PH);
+		if (ret)
+			goto err;
+
+		/* Per PCIe spec 6.2-1.0, only the lowest two bits are applicable */
+		if (dmah->ph & 0xFC) {
+			ret = -EINVAL;
+			goto err;
+		}
+
+		dmah->valid_fields |= BIT(IB_DMAH_PH_EXISTS);
+	}
+
+	dmah->device = ib_dev;
+	dmah->uobject = uobj;
+	atomic_set(&dmah->usecnt, 0);
+
+	rdma_restrack_new(&dmah->res, RDMA_RESTRACK_DMAH);
+	rdma_restrack_set_name(&dmah->res, NULL);
+
+	ret = ib_dev->ops.alloc_dmah(dmah, attrs);
+	if (ret) {
+		rdma_restrack_put(&dmah->res);
+		goto err;
+	}
+
+	uobj->object = dmah;
+	rdma_restrack_add(&dmah->res);
+	uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_ALLOC_DMAH_HANDLE);
+	return 0;
+err:
+	kfree(dmah);
+	return ret;
+}
+
+static const struct uverbs_attr_spec uverbs_dmah_mem_type[] = {
+	[TPH_MEM_TYPE_VM] = {
+		.type = UVERBS_ATTR_TYPE_PTR_IN,
+		UVERBS_ATTR_NO_DATA(),
+	},
+	[TPH_MEM_TYPE_PM] = {
+		.type = UVERBS_ATTR_TYPE_PTR_IN,
+		UVERBS_ATTR_NO_DATA(),
+	},
+};
+
+DECLARE_UVERBS_NAMED_METHOD(
+	UVERBS_METHOD_DMAH_ALLOC,
+	UVERBS_ATTR_IDR(UVERBS_ATTR_ALLOC_DMAH_HANDLE,
+			UVERBS_OBJECT_DMAH,
+			UVERBS_ACCESS_NEW,
+			UA_MANDATORY),
+	UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ALLOC_DMAH_CPU_ID,
+			   UVERBS_ATTR_TYPE(u32),
+			   UA_OPTIONAL),
+	UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_ALLOC_DMAH_TPH_MEM_TYPE,
+			    uverbs_dmah_mem_type,
+			    UA_OPTIONAL),
+	UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ALLOC_DMAH_PH,
+			   UVERBS_ATTR_TYPE(u8),
+			   UA_OPTIONAL));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+	UVERBS_METHOD_DMAH_FREE,
+	UVERBS_ATTR_IDR(UVERBS_ATTR_FREE_DMA_HANDLE,
+			UVERBS_OBJECT_DMAH,
+			UVERBS_ACCESS_DESTROY,
+			UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_DMAH,
+			    UVERBS_TYPE_ALLOC_IDR(uverbs_free_dmah),
+			    &UVERBS_METHOD(UVERBS_METHOD_DMAH_ALLOC),
+			    &UVERBS_METHOD(UVERBS_METHOD_DMAH_FREE));
+
+const struct uapi_definition uverbs_def_obj_dmah[] = {
+	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_DMAH,
+				      UAPI_DEF_OBJ_NEEDS_FN(dealloc_dmah),
+				      UAPI_DEF_OBJ_NEEDS_FN(alloc_dmah)),
+	{}
+};
diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c
index a02916a3a79c..e00ea63175bd 100644
--- a/drivers/infiniband/core/uverbs_uapi.c
+++ b/drivers/infiniband/core/uverbs_uapi.c
@@ -631,6 +631,7 @@ static const struct uapi_definition uverbs_core_api[] = {
 	UAPI_DEF_CHAIN(uverbs_def_obj_cq),
 	UAPI_DEF_CHAIN(uverbs_def_obj_device),
 	UAPI_DEF_CHAIN(uverbs_def_obj_dm),
+	UAPI_DEF_CHAIN(uverbs_def_obj_dmah),
 	UAPI_DEF_CHAIN(uverbs_def_obj_flow_action),
 	UAPI_DEF_CHAIN(uverbs_def_obj_intf),
 	UAPI_DEF_CHAIN(uverbs_def_obj_mr),
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 3fb1c963eeb0..9ad253687935 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -42,6 +42,7 @@
 #include <rdma/signature.h>
 #include <uapi/rdma/rdma_user_ioctl.h>
 #include <uapi/rdma/ib_user_ioctl_verbs.h>
+#include <linux/pci-tph.h>
 
 #define IB_FW_VERSION_NAME_MAX	ETHTOOL_FWVERS_LEN
 
@@ -1846,6 +1847,27 @@ struct ib_dm {
 	atomic_t	   usecnt;
 };
 
+/* bit values to mark existence of ib_dmah fields */
+enum {
+	IB_DMAH_CPU_ID_EXISTS,
+	IB_DMAH_MEM_TYPE_EXISTS,
+	IB_DMAH_PH_EXISTS,
+};
+
+struct ib_dmah {
+	struct ib_device *device;
+	struct ib_uobject *uobject;
+	/*
+	 * Implementation details of the RDMA core, don't use in drivers:
+	 */
+	struct rdma_restrack_entry res;
+	u32 cpu_id;
+	enum tph_mem_type mem_type;
+	atomic_t usecnt;
+	u8 ph;
+	u8 valid_fields; /* use IB_DMAH_XXX_EXISTS */
+};
+
 struct ib_mr {
 	struct ib_device  *device;
 	struct ib_pd	  *pd;
@@ -2573,6 +2595,9 @@ struct ib_device_ops {
 				  struct ib_dm_alloc_attr *attr,
 				  struct uverbs_attr_bundle *attrs);
 	int (*dealloc_dm)(struct ib_dm *dm, struct uverbs_attr_bundle *attrs);
+	int (*alloc_dmah)(struct ib_dmah *ibdmah,
+			  struct uverbs_attr_bundle *attrs);
+	int (*dealloc_dmah)(struct ib_dmah *dmah, struct uverbs_attr_bundle *attrs);
 	struct ib_mr *(*reg_dm_mr)(struct ib_pd *pd, struct ib_dm *dm,
 				   struct ib_dm_mr_attr *attr,
 				   struct uverbs_attr_bundle *attrs);
@@ -2730,6 +2755,7 @@ struct ib_device_ops {
 	DECLARE_RDMA_OBJ_SIZE(ib_ah);
 	DECLARE_RDMA_OBJ_SIZE(ib_counters);
 	DECLARE_RDMA_OBJ_SIZE(ib_cq);
+	DECLARE_RDMA_OBJ_SIZE(ib_dmah);
 	DECLARE_RDMA_OBJ_SIZE(ib_mw);
 	DECLARE_RDMA_OBJ_SIZE(ib_pd);
 	DECLARE_RDMA_OBJ_SIZE(ib_qp);
diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h
index 0d69ded73bf2..8a9bcf77dace 100644
--- a/include/rdma/restrack.h
+++ b/include/rdma/restrack.h
@@ -56,6 +56,10 @@ enum rdma_restrack_type {
 	 * @RDMA_RESTRACK_SRQ: Shared receive queue (SRQ)
 	 */
 	RDMA_RESTRACK_SRQ,
+	/**
+	 * @RDMA_RESTRACK_DMAH: DMA handle
+	 */
+	RDMA_RESTRACK_DMAH,
 	/**
 	 * @RDMA_RESTRACK_MAX: Last entry, used for array dclarations
 	 */
diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h
index ece923ab48a0..3bb72a259c29 100644
--- a/include/uapi/rdma/ib_user_ioctl_cmds.h
+++ b/include/uapi/rdma/ib_user_ioctl_cmds.h
@@ -55,6 +55,7 @@ enum uverbs_default_objects {
 	UVERBS_OBJECT_DM,
 	UVERBS_OBJECT_COUNTERS,
 	UVERBS_OBJECT_ASYNC_EVENT,
+	UVERBS_OBJECT_DMAH,
 };
 
 enum {
@@ -240,6 +241,22 @@ enum uverbs_methods_dm {
 	UVERBS_METHOD_DM_FREE,
 };
 
+enum uverbs_attrs_alloc_dmah_cmd_attr_ids {
+	UVERBS_ATTR_ALLOC_DMAH_HANDLE,
+	UVERBS_ATTR_ALLOC_DMAH_CPU_ID,
+	UVERBS_ATTR_ALLOC_DMAH_TPH_MEM_TYPE,
+	UVERBS_ATTR_ALLOC_DMAH_PH,
+};
+
+enum uverbs_attrs_free_dmah_cmd_attr_ids {
+	UVERBS_ATTR_FREE_DMA_HANDLE,
+};
+
+enum uverbs_methods_dmah {
+	UVERBS_METHOD_DMAH_ALLOC,
+	UVERBS_METHOD_DMAH_FREE,
+};
+
 enum uverbs_attrs_reg_dm_mr_cmd_attr_ids {
 	UVERBS_ATTR_REG_DM_MR_HANDLE,
 	UVERBS_ATTR_REG_DM_MR_OFFSET,
-- 
cgit v1.2.3


From a272019a46c918575f10cc529c893585d46b3b55 Mon Sep 17 00:00:00 2001
From: Yishai Hadas <yishaih@nvidia.com>
Date: Thu, 17 Jul 2025 15:17:31 +0300
Subject: IB: Extend UVERBS_METHOD_REG_MR to get DMAH

Extend UVERBS_METHOD_REG_MR to accept an optional DMA handle (DMAH)
attribute and pass it to the drivers' reg_user_mr() and
reg_user_mr_dmabuf() callbacks.

It will be used in the mlx5 driver as part of the next patch in the
series.

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Reviewed-by: Edward Srouji <edwards@nvidia.com>
Link: https://patch.msgid.link/2ae1e628c0675db81f092cc00d3ad6fbf6139405.1752752567.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 drivers/infiniband/core/uverbs_cmd.c            |  2 +-
 drivers/infiniband/core/uverbs_std_types_mr.c   | 27 ++++++++++++++++++++-----
 drivers/infiniband/core/verbs.c                 |  5 ++++-
 drivers/infiniband/hw/bnxt_re/ib_verbs.c        |  8 ++++++++
 drivers/infiniband/hw/bnxt_re/ib_verbs.h        |  2 ++
 drivers/infiniband/hw/cxgb4/iw_cxgb4.h          |  1 +
 drivers/infiniband/hw/cxgb4/mem.c               |  6 +++++-
 drivers/infiniband/hw/efa/efa.h                 |  2 ++
 drivers/infiniband/hw/efa/efa_verbs.c           | 12 +++++++++++
 drivers/infiniband/hw/erdma/erdma_verbs.c       |  6 +++++-
 drivers/infiniband/hw/erdma/erdma_verbs.h       |  3 ++-
 drivers/infiniband/hw/hns/hns_roce_device.h     |  1 +
 drivers/infiniband/hw/hns/hns_roce_mr.c         |  6 ++++++
 drivers/infiniband/hw/irdma/verbs.c             |  9 +++++++++
 drivers/infiniband/hw/mana/mana_ib.h            |  2 ++
 drivers/infiniband/hw/mana/mr.c                 |  8 ++++++++
 drivers/infiniband/hw/mlx4/mlx4_ib.h            |  1 +
 drivers/infiniband/hw/mlx4/mr.c                 |  4 ++++
 drivers/infiniband/hw/mlx5/mlx5_ib.h            |  2 ++
 drivers/infiniband/hw/mlx5/mr.c                 |  8 +++++---
 drivers/infiniband/hw/mthca/mthca_provider.c    |  6 +++++-
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.c     |  6 +++++-
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.h     |  3 ++-
 drivers/infiniband/hw/qedr/verbs.c              |  6 +++++-
 drivers/infiniband/hw/qedr/verbs.h              |  3 ++-
 drivers/infiniband/hw/usnic/usnic_ib_verbs.c    |  4 ++++
 drivers/infiniband/hw/usnic/usnic_ib_verbs.h    |  1 +
 drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c    |  5 +++++
 drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h |  1 +
 drivers/infiniband/sw/rdmavt/mr.c               |  5 +++++
 drivers/infiniband/sw/rdmavt/mr.h               |  1 +
 drivers/infiniband/sw/rxe/rxe_verbs.c           |  4 ++++
 drivers/infiniband/sw/siw/siw_verbs.c           |  7 ++++++-
 drivers/infiniband/sw/siw/siw_verbs.h           |  3 ++-
 include/rdma/ib_verbs.h                         |  3 +++
 include/uapi/rdma/ib_user_ioctl_cmds.h          |  1 +
 36 files changed, 154 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 88aa8d4599df..ce16404cdfb8 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -741,7 +741,7 @@ static int ib_uverbs_reg_mr(struct uverbs_attr_bundle *attrs)
 	}
 
 	mr = pd->device->ops.reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
-					 cmd.access_flags,
+					 cmd.access_flags, NULL,
 					 &attrs->driver_udata);
 	if (IS_ERR(mr)) {
 		ret = PTR_ERR(mr);
diff --git a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c
index 1bd4b17b5515..570b9656801d 100644
--- a/drivers/infiniband/core/uverbs_std_types_mr.c
+++ b/drivers/infiniband/core/uverbs_std_types_mr.c
@@ -238,7 +238,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_REG_DMABUF_MR)(
 		return ret;
 
 	mr = pd->device->ops.reg_user_mr_dmabuf(pd, offset, length, iova, fd,
-						access_flags,
+						access_flags, NULL,
 						attrs);
 	if (IS_ERR(mr))
 		return PTR_ERR(mr);
@@ -276,6 +276,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_REG_MR)(
 	u32 valid_access_flags = IB_ACCESS_SUPPORTED;
 	u64 length, iova, fd_offset = 0, addr = 0;
 	struct ib_device *ib_dev = pd->device;
+	struct ib_dmah *dmah = NULL;
 	bool has_fd_offset = false;
 	bool has_addr = false;
 	bool has_fd = false;
@@ -340,6 +341,13 @@ static int UVERBS_HANDLER(UVERBS_METHOD_REG_MR)(
 			return -EINVAL;
 	}
 
+	if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_REG_MR_DMA_HANDLE)) {
+		dmah = uverbs_attr_get_obj(attrs,
+					   UVERBS_ATTR_REG_MR_DMA_HANDLE);
+		if (IS_ERR(dmah))
+			return PTR_ERR(dmah);
+	}
+
 	ret = uverbs_get_flags32(&access_flags, attrs,
 				 UVERBS_ATTR_REG_MR_ACCESS_FLAGS,
 				 valid_access_flags);
@@ -351,11 +359,12 @@ static int UVERBS_HANDLER(UVERBS_METHOD_REG_MR)(
 		return ret;
 
 	if (has_fd)
-		mr = pd->device->ops.reg_user_mr_dmabuf(pd, fd_offset, length, iova,
-							fd, access_flags, attrs);
+		mr = pd->device->ops.reg_user_mr_dmabuf(pd, fd_offset, length,
+							iova, fd, access_flags,
+							dmah, attrs);
 	else
-		mr = pd->device->ops.reg_user_mr(pd, addr, length,
-						 iova, access_flags, NULL);
+		mr = pd->device->ops.reg_user_mr(pd, addr, length, iova,
+						 access_flags, dmah, NULL);
 
 	if (IS_ERR(mr))
 		return PTR_ERR(mr);
@@ -365,6 +374,10 @@ static int UVERBS_HANDLER(UVERBS_METHOD_REG_MR)(
 	mr->type = IB_MR_TYPE_USER;
 	mr->uobject = uobj;
 	atomic_inc(&pd->usecnt);
+	if (dmah) {
+		mr->dmah = dmah;
+		atomic_inc(&dmah->usecnt);
+	}
 	rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR);
 	rdma_restrack_set_name(&mr->res, NULL);
 	rdma_restrack_add(&mr->res);
@@ -488,6 +501,10 @@ DECLARE_UVERBS_NAMED_METHOD(
 			UVERBS_OBJECT_PD,
 			UVERBS_ACCESS_READ,
 			UA_MANDATORY),
+	UVERBS_ATTR_IDR(UVERBS_ATTR_REG_MR_DMA_HANDLE,
+			UVERBS_OBJECT_DMAH,
+			UVERBS_ACCESS_READ,
+			UA_OPTIONAL),
 	UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_MR_IOVA,
 			   UVERBS_ATTR_TYPE(u64),
 			   UA_MANDATORY),
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 75fde0fe9989..3a5f81402d2f 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -2223,7 +2223,7 @@ struct ib_mr *ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	}
 
 	mr = pd->device->ops.reg_user_mr(pd, start, length, virt_addr,
-					 access_flags, NULL);
+					 access_flags, NULL, NULL);
 
 	if (IS_ERR(mr))
 		return mr;
@@ -2262,6 +2262,7 @@ int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata)
 {
 	struct ib_pd *pd = mr->pd;
 	struct ib_dm *dm = mr->dm;
+	struct ib_dmah *dmah = mr->dmah;
 	struct ib_sig_attrs *sig_attrs = mr->sig_attrs;
 	int ret;
 
@@ -2272,6 +2273,8 @@ int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata)
 		atomic_dec(&pd->usecnt);
 		if (dm)
 			atomic_dec(&dm->usecnt);
+		if (dmah)
+			atomic_dec(&dmah->usecnt);
 		kfree(sig_attrs);
 	}
 
diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
index 3a627acb82ce..37c2bc3bdba5 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
@@ -4235,6 +4235,7 @@ free_mr:
 
 struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length,
 				  u64 virt_addr, int mr_access_flags,
+				  struct ib_dmah *dmah,
 				  struct ib_udata *udata)
 {
 	struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd);
@@ -4242,6 +4243,9 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length,
 	struct ib_umem *umem;
 	struct ib_mr *ib_mr;
 
+	if (dmah)
+		return ERR_PTR(-EOPNOTSUPP);
+
 	umem = ib_umem_get(&rdev->ibdev, start, length, mr_access_flags);
 	if (IS_ERR(umem))
 		return ERR_CAST(umem);
@@ -4255,6 +4259,7 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length,
 struct ib_mr *bnxt_re_reg_user_mr_dmabuf(struct ib_pd *ib_pd, u64 start,
 					 u64 length, u64 virt_addr, int fd,
 					 int mr_access_flags,
+					 struct ib_dmah *dmah,
 					 struct uverbs_attr_bundle *attrs)
 {
 	struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd);
@@ -4263,6 +4268,9 @@ struct ib_mr *bnxt_re_reg_user_mr_dmabuf(struct ib_pd *ib_pd, u64 start,
 	struct ib_umem *umem;
 	struct ib_mr *ib_mr;
 
+	if (dmah)
+		return ERR_PTR(-EOPNOTSUPP);
+
 	umem_dmabuf = ib_umem_dmabuf_get_pinned(&rdev->ibdev, start, length,
 						fd, mr_access_flags);
 	if (IS_ERR(umem_dmabuf))
diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h
index 22c9eb8e9cfc..fe00ab691a51 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h
@@ -258,10 +258,12 @@ struct ib_mw *bnxt_re_alloc_mw(struct ib_pd *ib_pd, enum ib_mw_type type,
 int bnxt_re_dealloc_mw(struct ib_mw *mw);
 struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 				  u64 virt_addr, int mr_access_flags,
+				  struct ib_dmah *dmah,
 				  struct ib_udata *udata);
 struct ib_mr *bnxt_re_reg_user_mr_dmabuf(struct ib_pd *ib_pd, u64 start,
 					 u64 length, u64 virt_addr,
 					 int fd, int mr_access_flags,
+					 struct ib_dmah *dmah,
 					 struct uverbs_attr_bundle *attrs);
 int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata);
 void bnxt_re_dealloc_ucontext(struct ib_ucontext *context);
diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
index 5b3007acaa1f..e17c1252536b 100644
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -1006,6 +1006,7 @@ int c4iw_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
 void c4iw_dealloc(struct uld_ctx *ctx);
 struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start,
 					   u64 length, u64 virt, int acc,
+					   struct ib_dmah *dmah,
 					   struct ib_udata *udata);
 struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc);
 int c4iw_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata);
diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c
index a2c71a1d93d5..dcdfe250bdbe 100644
--- a/drivers/infiniband/hw/cxgb4/mem.c
+++ b/drivers/infiniband/hw/cxgb4/mem.c
@@ -489,7 +489,8 @@ err_free_mhp:
 }
 
 struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
-			       u64 virt, int acc, struct ib_udata *udata)
+			       u64 virt, int acc, struct ib_dmah *dmah,
+			       struct ib_udata *udata)
 {
 	__be64 *pages;
 	int shift, n, i;
@@ -501,6 +502,9 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 
 	pr_debug("ib_pd %p\n", pd);
 
+	if (dmah)
+		return ERR_PTR(-EOPNOTSUPP);
+
 	if (length == ~0ULL)
 		return ERR_PTR(-EINVAL);
 
diff --git a/drivers/infiniband/hw/efa/efa.h b/drivers/infiniband/hw/efa/efa.h
index 3d49c1db928e..96f9c3bc98b2 100644
--- a/drivers/infiniband/hw/efa/efa.h
+++ b/drivers/infiniband/hw/efa/efa.h
@@ -167,10 +167,12 @@ int efa_create_cq_umem(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 		       struct ib_umem *umem, struct uverbs_attr_bundle *attrs);
 struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
 			 u64 virt_addr, int access_flags,
+			 struct ib_dmah *dmah,
 			 struct ib_udata *udata);
 struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start,
 				     u64 length, u64 virt_addr,
 				     int fd, int access_flags,
+				     struct ib_dmah *dmah,
 				     struct uverbs_attr_bundle *attrs);
 int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata);
 int efa_get_port_immutable(struct ib_device *ibdev, u32 port_num,
diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c
index 0f68aec12883..886923d5fe50 100644
--- a/drivers/infiniband/hw/efa/efa_verbs.c
+++ b/drivers/infiniband/hw/efa/efa_verbs.c
@@ -1765,6 +1765,7 @@ static int efa_register_mr(struct ib_pd *ibpd, struct efa_mr *mr, u64 start,
 struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start,
 				     u64 length, u64 virt_addr,
 				     int fd, int access_flags,
+				     struct ib_dmah *dmah,
 				     struct uverbs_attr_bundle *attrs)
 {
 	struct efa_dev *dev = to_edev(ibpd->device);
@@ -1772,6 +1773,11 @@ struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start,
 	struct efa_mr *mr;
 	int err;
 
+	if (dmah) {
+		err = -EOPNOTSUPP;
+		goto err_out;
+	}
+
 	mr = efa_alloc_mr(ibpd, access_flags, &attrs->driver_udata);
 	if (IS_ERR(mr)) {
 		err = PTR_ERR(mr);
@@ -1804,12 +1810,18 @@ err_out:
 
 struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
 			 u64 virt_addr, int access_flags,
+			 struct ib_dmah *dmah,
 			 struct ib_udata *udata)
 {
 	struct efa_dev *dev = to_edev(ibpd->device);
 	struct efa_mr *mr;
 	int err;
 
+	if (dmah) {
+		err = -EOPNOTSUPP;
+		goto err_out;
+	}
+
 	mr = efa_alloc_mr(ibpd, access_flags, udata);
 	if (IS_ERR(mr)) {
 		err = PTR_ERR(mr);
diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c
index ec0ad4086066..94c211df09d8 100644
--- a/drivers/infiniband/hw/erdma/erdma_verbs.c
+++ b/drivers/infiniband/hw/erdma/erdma_verbs.c
@@ -1200,13 +1200,17 @@ int erdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
 }
 
 struct ib_mr *erdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len,
-				u64 virt, int access, struct ib_udata *udata)
+				u64 virt, int access, struct ib_dmah *dmah,
+				struct ib_udata *udata)
 {
 	struct erdma_mr *mr = NULL;
 	struct erdma_dev *dev = to_edev(ibpd->device);
 	u32 stag;
 	int ret;
 
+	if (dmah)
+		return ERR_PTR(-EOPNOTSUPP);
+
 	if (!len || len > dev->attrs.max_mr_size)
 		return ERR_PTR(-EINVAL);
 
diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.h b/drivers/infiniband/hw/erdma/erdma_verbs.h
index f9408ccc8bad..ef411b81fbd7 100644
--- a/drivers/infiniband/hw/erdma/erdma_verbs.h
+++ b/drivers/infiniband/hw/erdma/erdma_verbs.h
@@ -452,7 +452,8 @@ int erdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
 void erdma_disassociate_ucontext(struct ib_ucontext *ibcontext);
 int erdma_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
 struct ib_mr *erdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len,
-				u64 virt, int access, struct ib_udata *udata);
+				u64 virt, int access, struct ib_dmah *dmah,
+				struct ib_udata *udata);
 struct ib_mr *erdma_get_dma_mr(struct ib_pd *ibpd, int rights);
 int erdma_dereg_mr(struct ib_mr *ibmr, struct ib_udata *data);
 int erdma_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma);
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 25f77b1fa773..78ee04a48a74 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -1219,6 +1219,7 @@ int hns_roce_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata);
 struct ib_mr *hns_roce_get_dma_mr(struct ib_pd *pd, int acc);
 struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 				   u64 virt_addr, int access_flags,
+				   struct ib_dmah *dmah,
 				   struct ib_udata *udata);
 struct ib_mr *hns_roce_rereg_user_mr(struct ib_mr *mr, int flags, u64 start,
 				     u64 length, u64 virt_addr,
diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c
index ebef93559225..0f037e545520 100644
--- a/drivers/infiniband/hw/hns/hns_roce_mr.c
+++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
@@ -231,12 +231,18 @@ err_free:
 
 struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 				   u64 virt_addr, int access_flags,
+				   struct ib_dmah *dmah,
 				   struct ib_udata *udata)
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(pd->device);
 	struct hns_roce_mr *mr;
 	int ret;
 
+	if (dmah) {
+		ret = -EOPNOTSUPP;
+		goto err_out;
+	}
+
 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
 	if (!mr) {
 		ret = -ENOMEM;
diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c
index 1e8c92826de2..da5a41b275d8 100644
--- a/drivers/infiniband/hw/irdma/verbs.c
+++ b/drivers/infiniband/hw/irdma/verbs.c
@@ -3013,10 +3013,12 @@ static int irdma_reg_user_mr_type_cq(struct irdma_mem_reg_req req,
  * @len: length of mr
  * @virt: virtual address
  * @access: access of mr
+ * @dmah: dma handle
  * @udata: user data
  */
 static struct ib_mr *irdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 len,
 				       u64 virt, int access,
+				       struct ib_dmah *dmah,
 				       struct ib_udata *udata)
 {
 #define IRDMA_MEM_REG_MIN_REQ_LEN offsetofend(struct irdma_mem_reg_req, sq_pages)
@@ -3026,6 +3028,9 @@ static struct ib_mr *irdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 len,
 	struct irdma_mr *iwmr = NULL;
 	int err;
 
+	if (dmah)
+		return ERR_PTR(-EOPNOTSUPP);
+
 	if (len > iwdev->rf->sc_dev.hw_attrs.max_mr_size)
 		return ERR_PTR(-EINVAL);
 
@@ -3085,6 +3090,7 @@ error:
 static struct ib_mr *irdma_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start,
 					      u64 len, u64 virt,
 					      int fd, int access,
+					      struct ib_dmah *dmah,
 					      struct uverbs_attr_bundle *attrs)
 {
 	struct irdma_device *iwdev = to_iwdev(pd->device);
@@ -3092,6 +3098,9 @@ static struct ib_mr *irdma_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start,
 	struct irdma_mr *iwmr;
 	int err;
 
+	if (dmah)
+		return ERR_PTR(-EOPNOTSUPP);
+
 	if (len > iwdev->rf->sc_dev.hw_attrs.max_mr_size)
 		return ERR_PTR(-EINVAL);
 
diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h
index 369825fdeff8..60a53f1958cf 100644
--- a/drivers/infiniband/hw/mana/mana_ib.h
+++ b/drivers/infiniband/hw/mana/mana_ib.h
@@ -632,6 +632,7 @@ struct ib_mr *mana_ib_get_dma_mr(struct ib_pd *ibpd, int access_flags);
 
 struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 				  u64 iova, int access_flags,
+				  struct ib_dmah *dmah,
 				  struct ib_udata *udata);
 
 int mana_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata);
@@ -721,5 +722,6 @@ int mana_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
 
 struct ib_mr *mana_ib_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, u64 length,
 					 u64 iova, int fd, int mr_access_flags,
+					 struct ib_dmah *dmah,
 					 struct uverbs_attr_bundle *attrs);
 #endif
diff --git a/drivers/infiniband/hw/mana/mr.c b/drivers/infiniband/hw/mana/mr.c
index 6d974d0a8400..55701046ffba 100644
--- a/drivers/infiniband/hw/mana/mr.c
+++ b/drivers/infiniband/hw/mana/mr.c
@@ -106,6 +106,7 @@ static int mana_ib_gd_destroy_mr(struct mana_ib_dev *dev, u64 mr_handle)
 
 struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length,
 				  u64 iova, int access_flags,
+				  struct ib_dmah *dmah,
 				  struct ib_udata *udata)
 {
 	struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd);
@@ -116,6 +117,9 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length,
 	u64 dma_region_handle;
 	int err;
 
+	if (dmah)
+		return ERR_PTR(-EOPNOTSUPP);
+
 	dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
 
 	ibdev_dbg(ibdev,
@@ -188,6 +192,7 @@ err_free:
 
 struct ib_mr *mana_ib_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, u64 length,
 					 u64 iova, int fd, int access_flags,
+					 struct ib_dmah *dmah,
 					 struct uverbs_attr_bundle *attrs)
 {
 	struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd);
@@ -199,6 +204,9 @@ struct ib_mr *mana_ib_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, u64 leng
 	u64 dma_region_handle;
 	int err;
 
+	if (dmah)
+		return ERR_PTR(-EOPNOTSUPP);
+
 	dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
 
 	access_flags &= ~IB_ACCESS_OPTIONAL;
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index f53b1846594c..5df5b955114e 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -759,6 +759,7 @@ int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
 			   struct ib_umem *umem);
 struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 				  u64 virt_addr, int access_flags,
+				  struct ib_dmah *dmah,
 				  struct ib_udata *udata);
 int mlx4_ib_dereg_mr(struct ib_mr *mr, struct ib_udata *udata);
 int mlx4_ib_alloc_mw(struct ib_mw *mw, struct ib_udata *udata);
diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c
index e77645a673fb..94464f1694d9 100644
--- a/drivers/infiniband/hw/mlx4/mr.c
+++ b/drivers/infiniband/hw/mlx4/mr.c
@@ -139,6 +139,7 @@ static struct ib_umem *mlx4_get_umem_mr(struct ib_device *device, u64 start,
 
 struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 				  u64 virt_addr, int access_flags,
+				  struct ib_dmah *dmah,
 				  struct ib_udata *udata)
 {
 	struct mlx4_ib_dev *dev = to_mdev(pd->device);
@@ -147,6 +148,9 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	int err;
 	int n;
 
+	if (dmah)
+		return ERR_PTR(-EOPNOTSUPP);
+
 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
 	if (!mr)
 		return ERR_PTR(-ENOMEM);
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index ff6026d2ed7c..dfd231333509 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -1370,10 +1370,12 @@ int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata);
 struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc);
 struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 				  u64 virt_addr, int access_flags,
+				  struct ib_dmah *dmah,
 				  struct ib_udata *udata);
 struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start,
 					 u64 length, u64 virt_addr,
 					 int fd, int access_flags,
+					 struct ib_dmah *dmah,
 					 struct uverbs_attr_bundle *attrs);
 int mlx5_ib_advise_mr(struct ib_pd *pd,
 		      enum ib_uverbs_advise_mr_advice advice,
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 50f7a882efde..69f1403cc578 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1529,13 +1529,14 @@ err_dereg_mr:
 
 struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 				  u64 iova, int access_flags,
+				  struct ib_dmah *dmah,
 				  struct ib_udata *udata)
 {
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
 	struct ib_umem *umem;
 	int err;
 
-	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
+	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || dmah)
 		return ERR_PTR(-EOPNOTSUPP);
 
 	mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
@@ -1689,6 +1690,7 @@ end:
 struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
 					 u64 length, u64 virt_addr,
 					 int fd, int access_flags,
+					 struct ib_dmah *dmah,
 					 struct uverbs_attr_bundle *attrs)
 {
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
@@ -1696,7 +1698,7 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
 	int err;
 
 	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
-	    !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
+	    !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) || dmah)
 		return ERR_PTR(-EOPNOTSUPP);
 
 	if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS)) {
@@ -1903,7 +1905,7 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
 	 */
 recreate:
 	return mlx5_ib_reg_user_mr(new_pd, start, length, iova,
-				   new_access_flags, udata);
+				   new_access_flags, NULL, udata);
 }
 
 static int
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c
index 6a1e2e79ddc3..dd572d76866c 100644
--- a/drivers/infiniband/hw/mthca/mthca_provider.c
+++ b/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -825,7 +825,8 @@ static struct ib_mr *mthca_get_dma_mr(struct ib_pd *pd, int acc)
 }
 
 static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
-				       u64 virt, int acc, struct ib_udata *udata)
+				       u64 virt, int acc, struct ib_dmah *dmah,
+				       struct ib_udata *udata)
 {
 	struct mthca_dev *dev = to_mdev(pd->device);
 	struct ib_block_iter biter;
@@ -838,6 +839,9 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	int err = 0;
 	int write_mtt_size;
 
+	if (dmah)
+		return ERR_PTR(-EOPNOTSUPP);
+
 	if (udata->inlen < sizeof ucmd) {
 		if (!context->reg_mr_warned) {
 			mthca_warn(dev, "Process '%s' did not pass in MR attrs.\n",
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
index 979de8f8df14..46d911fd38de 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
@@ -847,13 +847,17 @@ static void build_user_pbes(struct ocrdma_dev *dev, struct ocrdma_mr *mr)
 }
 
 struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len,
-				 u64 usr_addr, int acc, struct ib_udata *udata)
+				 u64 usr_addr, int acc, struct ib_dmah *dmah,
+				 struct ib_udata *udata)
 {
 	int status = -ENOMEM;
 	struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
 	struct ocrdma_mr *mr;
 	struct ocrdma_pd *pd;
 
+	if (dmah)
+		return ERR_PTR(-EOPNOTSUPP);
+
 	pd = get_ocrdma_pd(ibpd);
 
 	if (acc & IB_ACCESS_REMOTE_WRITE && !(acc & IB_ACCESS_LOCAL_WRITE))
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
index 0644346d8d98..6c5c3755b8a9 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
@@ -98,7 +98,8 @@ int ocrdma_post_srq_recv(struct ib_srq *, const struct ib_recv_wr *,
 int ocrdma_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata);
 struct ib_mr *ocrdma_get_dma_mr(struct ib_pd *, int acc);
 struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *, u64 start, u64 length,
-				 u64 virt, int acc, struct ib_udata *);
+				 u64 virt, int acc, struct ib_dmah *dmah,
+				 struct ib_udata *);
 struct ib_mr *ocrdma_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
 			      u32 max_num_sg);
 int ocrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c
index 568a5b18803f..ab9bf0922979 100644
--- a/drivers/infiniband/hw/qedr/verbs.c
+++ b/drivers/infiniband/hw/qedr/verbs.c
@@ -2953,13 +2953,17 @@ done:
 }
 
 struct ib_mr *qedr_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len,
-			       u64 usr_addr, int acc, struct ib_udata *udata)
+			       u64 usr_addr, int acc, struct ib_dmah *dmah,
+			       struct ib_udata *udata)
 {
 	struct qedr_dev *dev = get_qedr_dev(ibpd->device);
 	struct qedr_mr *mr;
 	struct qedr_pd *pd;
 	int rc = -ENOMEM;
 
+	if (dmah)
+		return ERR_PTR(-EOPNOTSUPP);
+
 	pd = get_qedr_pd(ibpd);
 	DP_DEBUG(dev, QEDR_MSG_MR,
 		 "qedr_register user mr pd = %d start = %lld, len = %lld, usr_addr = %lld, acc = %d\n",
diff --git a/drivers/infiniband/hw/qedr/verbs.h b/drivers/infiniband/hw/qedr/verbs.h
index 5731458abb06..62420a15101b 100644
--- a/drivers/infiniband/hw/qedr/verbs.h
+++ b/drivers/infiniband/hw/qedr/verbs.h
@@ -79,7 +79,8 @@ int qedr_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata);
 struct ib_mr *qedr_get_dma_mr(struct ib_pd *, int acc);
 
 struct ib_mr *qedr_reg_user_mr(struct ib_pd *, u64 start, u64 length,
-			       u64 virt, int acc, struct ib_udata *);
+			       u64 virt, int acc, struct ib_dmah *dmah,
+			       struct ib_udata *);
 
 int qedr_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
 		   int sg_nents, unsigned int *sg_offset);
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
index 217af34e82b3..ae5df96589d9 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
@@ -592,6 +592,7 @@ int usnic_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
 
 struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length,
 					u64 virt_addr, int access_flags,
+					struct ib_dmah *dmah,
 					struct ib_udata *udata)
 {
 	struct usnic_ib_mr *mr;
@@ -600,6 +601,9 @@ struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length,
 	usnic_dbg("start 0x%llx va 0x%llx length 0x%llx\n", start,
 			virt_addr, length);
 
+	if (dmah)
+		return ERR_PTR(-EOPNOTSUPP);
+
 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
 	if (!mr)
 		return ERR_PTR(-ENOMEM);
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h
index 53f53f2d53be..e3031ac32488 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h
+++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h
@@ -60,6 +60,7 @@ int usnic_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 int usnic_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
 struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length,
 				u64 virt_addr, int access_flags,
+				struct ib_dmah *dmah,
 				struct ib_udata *udata);
 int usnic_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata);
 int usnic_ib_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata);
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c
index e80848bfb3bd..ec7a00c8285b 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c
@@ -104,12 +104,14 @@ struct ib_mr *pvrdma_get_dma_mr(struct ib_pd *pd, int acc)
  * @length: length of region
  * @virt_addr: I/O virtual address
  * @access_flags: access flags for memory region
+ * @dmah: dma handle
  * @udata: user data
  *
  * @return: ib_mr pointer on success, otherwise returns an errno.
  */
 struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 				 u64 virt_addr, int access_flags,
+				 struct ib_dmah *dmah,
 				 struct ib_udata *udata)
 {
 	struct pvrdma_dev *dev = to_vdev(pd->device);
@@ -121,6 +123,9 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	struct pvrdma_cmd_create_mr_resp *resp = &rsp.create_mr_resp;
 	int ret, npages;
 
+	if (dmah)
+		return ERR_PTR(-EOPNOTSUPP);
+
 	if (length == 0 || length > dev->dsr->caps.max_mr_size) {
 		dev_warn(&dev->pdev->dev, "invalid mem region length\n");
 		return ERR_PTR(-EINVAL);
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h
index fd47b0b1df5c..603e5a9311eb 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h
@@ -366,6 +366,7 @@ int pvrdma_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata);
 struct ib_mr *pvrdma_get_dma_mr(struct ib_pd *pd, int acc);
 struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 				 u64 virt_addr, int access_flags,
+				 struct ib_dmah *dmah,
 				 struct ib_udata *udata);
 int pvrdma_dereg_mr(struct ib_mr *mr, struct ib_udata *udata);
 struct ib_mr *pvrdma_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c
index 5ed5cfc2b280..86e482593a85 100644
--- a/drivers/infiniband/sw/rdmavt/mr.c
+++ b/drivers/infiniband/sw/rdmavt/mr.c
@@ -329,12 +329,14 @@ bail:
  * @length: length of region to register
  * @virt_addr: associated virtual address
  * @mr_access_flags: access flags for this memory region
+ * @dmah: dma handle
  * @udata: unused by the driver
  *
  * Return: the memory region on success, otherwise returns an errno.
  */
 struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 			      u64 virt_addr, int mr_access_flags,
+			      struct ib_dmah *dmah,
 			      struct ib_udata *udata)
 {
 	struct rvt_mr *mr;
@@ -343,6 +345,9 @@ struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	int n, m;
 	struct ib_mr *ret;
 
+	if (dmah)
+		return ERR_PTR(-EOPNOTSUPP);
+
 	if (length == 0)
 		return ERR_PTR(-EINVAL);
 
diff --git a/drivers/infiniband/sw/rdmavt/mr.h b/drivers/infiniband/sw/rdmavt/mr.h
index 44afe2731741..72dab48307b7 100644
--- a/drivers/infiniband/sw/rdmavt/mr.h
+++ b/drivers/infiniband/sw/rdmavt/mr.h
@@ -26,6 +26,7 @@ void rvt_mr_exit(struct rvt_dev_info *rdi);
 struct ib_mr *rvt_get_dma_mr(struct ib_pd *pd, int acc);
 struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 			      u64 virt_addr, int mr_access_flags,
+			      struct ib_dmah *dmah,
 			      struct ib_udata *udata);
 int rvt_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata);
 struct ib_mr *rvt_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index 2331e698a65b..f48d6e132954 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -1271,6 +1271,7 @@ err_free:
 
 static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, u64 start,
 				     u64 length, u64 iova, int access,
+				     struct ib_dmah *dmah,
 				     struct ib_udata *udata)
 {
 	struct rxe_dev *rxe = to_rdev(ibpd->device);
@@ -1278,6 +1279,9 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, u64 start,
 	struct rxe_mr *mr;
 	int err, cleanup_err;
 
+	if (dmah)
+		return ERR_PTR(-EOPNOTSUPP);
+
 	if (access & ~RXE_ACCESS_SUPPORTED_MR) {
 		rxe_err_pd(pd, "access = %#x not supported (%#x)\n", access,
 				RXE_ACCESS_SUPPORTED_MR);
diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c
index 2b2a7b8e93b0..35c3bde0d00a 100644
--- a/drivers/infiniband/sw/siw/siw_verbs.c
+++ b/drivers/infiniband/sw/siw/siw_verbs.c
@@ -1321,10 +1321,12 @@ int siw_dereg_mr(struct ib_mr *base_mr, struct ib_udata *udata)
  * @len:	len of MR
  * @rnic_va:	not used by siw
  * @rights:	MR access rights
+ * @dmah:	dma handle
  * @udata:	user buffer to communicate STag and Key.
  */
 struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len,
-			      u64 rnic_va, int rights, struct ib_udata *udata)
+			      u64 rnic_va, int rights,  struct ib_dmah *dmah,
+			      struct ib_udata *udata)
 {
 	struct siw_mr *mr = NULL;
 	struct siw_umem *umem = NULL;
@@ -1336,6 +1338,9 @@ struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len,
 		   (void *)(uintptr_t)start, (void *)(uintptr_t)rnic_va,
 		   (unsigned long long)len);
 
+	if (dmah)
+		return ERR_PTR(-EOPNOTSUPP);
+
 	if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
 		siw_dbg_pd(pd, "too many mr's\n");
 		rv = -ENOMEM;
diff --git a/drivers/infiniband/sw/siw/siw_verbs.h b/drivers/infiniband/sw/siw/siw_verbs.h
index 1f1a305540af..e9f4463aecdc 100644
--- a/drivers/infiniband/sw/siw/siw_verbs.h
+++ b/drivers/infiniband/sw/siw/siw_verbs.h
@@ -65,7 +65,8 @@ int siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata);
 int siw_poll_cq(struct ib_cq *base_cq, int num_entries, struct ib_wc *wc);
 int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags);
 struct ib_mr *siw_reg_user_mr(struct ib_pd *base_pd, u64 start, u64 len,
-			      u64 rnic_va, int rights, struct ib_udata *udata);
+			      u64 rnic_va, int rights, struct ib_dmah *dmah,
+			      struct ib_udata *udata);
 struct ib_mr *siw_alloc_mr(struct ib_pd *base_pd, enum ib_mr_type mr_type,
 			   u32 max_sge);
 struct ib_mr *siw_get_dma_mr(struct ib_pd *base_pd, int rights);
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 9ad253687935..6139223e92e4 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1885,6 +1885,7 @@ struct ib_mr {
 
 	struct ib_dm      *dm;
 	struct ib_sig_attrs *sig_attrs; /* only for IB_MR_TYPE_INTEGRITY MRs */
+	struct ib_dmah *dmah;
 	/*
 	 * Implementation details of the RDMA core, don't use in drivers:
 	 */
@@ -2527,10 +2528,12 @@ struct ib_device_ops {
 	struct ib_mr *(*get_dma_mr)(struct ib_pd *pd, int mr_access_flags);
 	struct ib_mr *(*reg_user_mr)(struct ib_pd *pd, u64 start, u64 length,
 				     u64 virt_addr, int mr_access_flags,
+				     struct ib_dmah *dmah,
 				     struct ib_udata *udata);
 	struct ib_mr *(*reg_user_mr_dmabuf)(struct ib_pd *pd, u64 offset,
 					    u64 length, u64 virt_addr, int fd,
 					    int mr_access_flags,
+					    struct ib_dmah *dmah,
 					    struct uverbs_attr_bundle *attrs);
 	struct ib_mr *(*rereg_user_mr)(struct ib_mr *mr, int flags, u64 start,
 				       u64 length, u64 virt_addr,
diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h
index 3bb72a259c29..de6f5a94f1e3 100644
--- a/include/uapi/rdma/ib_user_ioctl_cmds.h
+++ b/include/uapi/rdma/ib_user_ioctl_cmds.h
@@ -311,6 +311,7 @@ enum uverbs_attrs_reg_dmabuf_mr_cmd_attr_ids {
 enum uverbs_attrs_reg_mr_cmd_attr_ids {
 	UVERBS_ATTR_REG_MR_HANDLE,
 	UVERBS_ATTR_REG_MR_PD_HANDLE,
+	UVERBS_ATTR_REG_MR_DMA_HANDLE,
 	UVERBS_ATTR_REG_MR_IOVA,
 	UVERBS_ATTR_REG_MR_ADDR,
 	UVERBS_ATTR_REG_MR_LENGTH,
-- 
cgit v1.2.3


From d0d05f602c1504fb868ed4a560d1465d88a3c5e5 Mon Sep 17 00:00:00 2001
From: Joel Granados <joel.granados@kernel.org>
Date: Tue, 29 Apr 2025 14:30:00 +0200
Subject: module: Move modprobe_path and modules_disabled ctl_tables into the
 module subsys

Move module sysctl (modprobe_path and modules_disabled) out of sysctl.c
and into the modules subsystem. Make modules_disabled static as it no
longer needs to be exported. Remove module.h from the includes in sysctl.c
as it no longer uses any module-exported variables.

This is part of a greater effort to move ctl tables into their
respective subsystems which will reduce the merge conflicts in
kernel/sysctl.c.

Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
Reviewed-by: Petr Pavlu <petr.pavlu@suse.com>
Signed-off-by: Joel Granados <joel.granados@kernel.org>
---
 include/linux/kmod.h     |  3 ---
 include/linux/module.h   |  1 -
 kernel/module/internal.h |  3 +++
 kernel/module/main.c     | 30 +++++++++++++++++++++++++++++-
 kernel/sysctl.c          | 20 --------------------
 5 files changed, 32 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/include/linux/kmod.h b/include/linux/kmod.h
index 68f69362d427..9a07c3215389 100644
--- a/include/linux/kmod.h
+++ b/include/linux/kmod.h
@@ -14,10 +14,7 @@
 #include <linux/workqueue.h>
 #include <linux/sysctl.h>
 
-#define KMOD_PATH_LEN 256
-
 #ifdef CONFIG_MODULES
-extern char modprobe_path[]; /* for sysctl */
 /* modprobe exit status on success, -ve on error.  Return value
  * usually useless though. */
 extern __printf(2, 3)
diff --git a/include/linux/module.h b/include/linux/module.h
index 92e1420fccdf..e93cdb92ad92 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -304,7 +304,6 @@ struct notifier_block;
 
 #ifdef CONFIG_MODULES
 
-extern int modules_disabled; /* for sysctl */
 /* Get/put a kernel symbol (calls must be symmetric) */
 void *__symbol_get(const char *symbol);
 void *__symbol_get_gpl(const char *symbol);
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index 8d74b0a21c82..51ddd8866ef3 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -58,6 +58,9 @@ extern const struct kernel_symbol __stop___ksymtab_gpl[];
 extern const u32 __start___kcrctab[];
 extern const u32 __start___kcrctab_gpl[];
 
+#define KMOD_PATH_LEN 256
+extern char modprobe_path[];
+
 struct load_info {
 	const char *name;
 	/* pointer to module in temporary copy, freed at end of load_module() */
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 413ac6ea3702..c11d9a125001 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -126,9 +126,37 @@ static void mod_update_bounds(struct module *mod)
 }
 
 /* Block module loading/unloading? */
-int modules_disabled;
+static int modules_disabled;
 core_param(nomodule, modules_disabled, bint, 0);
 
+static const struct ctl_table module_sysctl_table[] = {
+	{
+		.procname	= "modprobe",
+		.data		= &modprobe_path,
+		.maxlen		= KMOD_PATH_LEN,
+		.mode		= 0644,
+		.proc_handler	= proc_dostring,
+	},
+	{
+		.procname	= "modules_disabled",
+		.data		= &modules_disabled,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		/* only handle a transition from default "0" to "1" */
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ONE,
+		.extra2		= SYSCTL_ONE,
+	},
+};
+
+static int __init init_module_sysctl(void)
+{
+	register_sysctl_init("kernel", module_sysctl_table);
+	return 0;
+}
+
+subsys_initcall(init_module_sysctl);
+
 /* Waiting for a module to finish initializing? */
 static DECLARE_WAIT_QUEUE_HEAD(module_wq);
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9b4f0cff76ea..473133d9651e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -19,7 +19,6 @@
  *  Removed it and replaced it with older style, 03/23/00, Bill Wendling
  */
 
-#include <linux/module.h>
 #include <linux/sysctl.h>
 #include <linux/bitmap.h>
 #include <linux/printk.h>
@@ -1616,25 +1615,6 @@ static const struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 #endif
-#ifdef CONFIG_MODULES
-	{
-		.procname	= "modprobe",
-		.data		= &modprobe_path,
-		.maxlen		= KMOD_PATH_LEN,
-		.mode		= 0644,
-		.proc_handler	= proc_dostring,
-	},
-	{
-		.procname	= "modules_disabled",
-		.data		= &modules_disabled,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		/* only handle a transition from default "0" to "1" */
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= SYSCTL_ONE,
-		.extra2		= SYSCTL_ONE,
-	},
-#endif
 #ifdef CONFIG_UEVENT_HELPER
 	{
 		.procname	= "hotplug",
-- 
cgit v1.2.3


From f1b4f23a52c272f6c1e205e8ec243f563323c5aa Mon Sep 17 00:00:00 2001
From: Joel Granados <joel.granados@kernel.org>
Date: Tue, 29 Apr 2025 15:12:17 +0200
Subject: locking/rtmutex: Move max_lock_depth into rtmutex.c

Move the max_lock_depth sysctl table element into rtmutex_api.c and
remove the rtmutex.h include from sysctl.c. The table is placed in
rtmutex_api.c rather than rtmutex.c to avoid multiple registrations
every time rtmutex.c is included in other files.

This is part of a greater effort to move ctl tables into their
respective subsystems which will reduce the merge conflicts in
kernel/sysctl.c.

Signed-off-by: Joel Granados <joel.granados@kernel.org>
---
 include/linux/rtmutex.h      |  2 +-
 kernel/locking/rtmutex_api.c | 18 ++++++++++++++++++
 kernel/sysctl.c              | 12 ------------
 3 files changed, 19 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
index 7d049883a08a..fa9f1021541e 100644
--- a/include/linux/rtmutex.h
+++ b/include/linux/rtmutex.h
@@ -18,7 +18,7 @@
 #include <linux/rbtree_types.h>
 #include <linux/spinlock_types_raw.h>
 
-extern int max_lock_depth; /* for sysctl */
+extern int max_lock_depth;
 
 struct rt_mutex_base {
 	raw_spinlock_t		wait_lock;
diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c
index 2d933528a0fa..bafd5af98eae 100644
--- a/kernel/locking/rtmutex_api.c
+++ b/kernel/locking/rtmutex_api.c
@@ -13,6 +13,24 @@
  */
 int max_lock_depth = 1024;
 
+static const struct ctl_table rtmutex_sysctl_table[] = {
+	{
+		.procname	= "max_lock_depth",
+		.data		= &max_lock_depth,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+};
+
+static int __init init_rtmutex_sysctl(void)
+{
+	register_sysctl_init("kernel", rtmutex_sysctl_table);
+	return 0;
+}
+
+subsys_initcall(init_rtmutex_sysctl);
+
 /*
  * Debug aware fast / slowpath lock,trylock,unlock
  *
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 473133d9651e..a22f35013da0 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -59,9 +59,6 @@
 #include <asm/nmi.h>
 #include <asm/io.h>
 #endif
-#ifdef CONFIG_RT_MUTEXES
-#include <linux/rtmutex.h>
-#endif
 
 /* shared constants to be used in various sysctls */
 const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 };
@@ -1709,15 +1706,6 @@ static const struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 #endif
-#ifdef CONFIG_RT_MUTEXES
-	{
-		.procname	= "max_lock_depth",
-		.data		= &max_lock_depth,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
-#endif
 #ifdef CONFIG_TREE_RCU
 	{
 		.procname	= "panic_on_rcu_stall",
-- 
cgit v1.2.3


From fff6703fc843569d7a2f78ca08e7a69a9be22b0f Mon Sep 17 00:00:00 2001
From: Joel Granados <joel.granados@kernel.org>
Date: Wed, 30 Apr 2025 14:07:33 +0200
Subject: rcu: Move rcu_stall related sysctls into rcu/tree_stall.h

Move sysctl_panic_on_rcu_stall and sysctl_max_rcu_stall_to_panic into
the kernel/rcu subdirectory. Make them static in tree_stall.h and
remove their extern declarations from panic.h, as their scope is now
confined to one file.

This is part of a greater effort to move ctl tables into their
respective subsystems which will reduce the merge conflicts in
kernel/sysctl.c.

Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
Reviewed-by: Joel Fernandes <joelagnelf@nvidia.com>
Reviewed-by: Kees Cook <kees@kernel.org>
Signed-off-by: Joel Granados <joel.granados@kernel.org>
---
 include/linux/panic.h   |  2 --
 kernel/rcu/tree_stall.h | 33 +++++++++++++++++++++++++++++++--
 kernel/sysctl.c         | 20 --------------------
 3 files changed, 31 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/linux/panic.h b/include/linux/panic.h
index 4adc65766935..8f2b5d92ac05 100644
--- a/include/linux/panic.h
+++ b/include/linux/panic.h
@@ -25,8 +25,6 @@ extern int panic_on_warn;
 extern unsigned long panic_on_taint;
 extern bool panic_on_taint_nousertaint;
 
-extern int sysctl_panic_on_rcu_stall;
-extern int sysctl_max_rcu_stall_to_panic;
 extern int sysctl_panic_on_stackoverflow;
 
 extern bool crash_kexec_post_notifiers;
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 486c00536207..69482c2f0771 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -17,8 +17,37 @@
 // Controlling CPU stall warnings, including delay calculation.
 
 /* panic() on RCU Stall sysctl. */
-int sysctl_panic_on_rcu_stall __read_mostly;
-int sysctl_max_rcu_stall_to_panic __read_mostly;
+static int sysctl_panic_on_rcu_stall __read_mostly;
+static int sysctl_max_rcu_stall_to_panic __read_mostly;
+
+static const struct ctl_table rcu_stall_sysctl_table[] = {
+	{
+		.procname	= "panic_on_rcu_stall",
+		.data		= &sysctl_panic_on_rcu_stall,
+		.maxlen		= sizeof(sysctl_panic_on_rcu_stall),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+	{
+		.procname	= "max_rcu_stall_to_panic",
+		.data		= &sysctl_max_rcu_stall_to_panic,
+		.maxlen		= sizeof(sysctl_max_rcu_stall_to_panic),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ONE,
+		.extra2		= SYSCTL_INT_MAX,
+	},
+};
+
+static int __init init_rcu_stall_sysctl(void)
+{
+	register_sysctl_init("kernel", rcu_stall_sysctl_table);
+	return 0;
+}
+
+subsys_initcall(init_rcu_stall_sysctl);
 
 #ifdef CONFIG_SYSFS
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index a22f35013da0..fd76f0e1d490 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1706,26 +1706,6 @@ static const struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 #endif
-#ifdef CONFIG_TREE_RCU
-	{
-		.procname	= "panic_on_rcu_stall",
-		.data		= &sysctl_panic_on_rcu_stall,
-		.maxlen		= sizeof(sysctl_panic_on_rcu_stall),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_ONE,
-	},
-	{
-		.procname	= "max_rcu_stall_to_panic",
-		.data		= &sysctl_max_rcu_stall_to_panic,
-		.maxlen		= sizeof(sysctl_max_rcu_stall_to_panic),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= SYSCTL_ONE,
-		.extra2		= SYSCTL_INT_MAX,
-	},
-#endif
 };
 
 int __init sysctl_init_bases(void)
-- 
cgit v1.2.3


From 9e2f403dd8c2b07aff012e72c1fe5455538d72d2 Mon Sep 17 00:00:00 2001
From: Joel Granados <joel.granados@kernel.org>
Date: Fri, 2 May 2025 15:32:17 +0200
Subject: parisc/power: Move soft-power into power.c

Move the soft-power ctl table into parisc/power.c. As a consequence the
pwrsw_enabled var is made static.

This is part of a greater effort to move ctl tables into their
respective subsystems which will reduce the merge conflicts in
kernel/sysctl.c.

Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
Reviewed-by: Kees Cook <kees@kernel.org>
Signed-off-by: Joel Granados <joel.granados@kernel.org>
---
 drivers/parisc/power.c | 20 +++++++++++++++++++-
 include/linux/sysctl.h |  1 -
 kernel/sysctl.c        |  9 ---------
 3 files changed, 19 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/drivers/parisc/power.c b/drivers/parisc/power.c
index 7a6a3e7f2825..9d6c7bf72e29 100644
--- a/drivers/parisc/power.c
+++ b/drivers/parisc/power.c
@@ -83,7 +83,25 @@ static struct task_struct *power_task;
 #define SYSCTL_FILENAME	"sys/kernel/power"
 
 /* soft power switch enabled/disabled */
-int pwrsw_enabled __read_mostly = 1;
+static int pwrsw_enabled __read_mostly = 1;
+
+static const struct ctl_table power_sysctl_table[] = {
+	{
+		.procname	= "soft-power",
+		.data		= &pwrsw_enabled,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+};
+
+static int __init init_power_sysctl(void)
+{
+	register_sysctl_init("kernel", power_sysctl_table);
+	return 0;
+}
+
+arch_initcall(init_power_sysctl);
 
 /* main kernel thread worker. It polls the button state */
 static int kpowerswd(void *param)
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 40a6ac6c9713..ae762eabb7c9 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -242,7 +242,6 @@ int do_proc_douintvec(const struct ctl_table *table, int write,
 				  int write, void *data),
 		      void *data);
 
-extern int pwrsw_enabled;
 extern int unaligned_enabled;
 extern int unaligned_dump_stack;
 extern int no_unaligned_warning;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index adc2d3ea1278..718140251972 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1594,15 +1594,6 @@ static const struct ctl_table kern_table[] = {
 		.extra2		= SYSCTL_ONE,
 	},
 #endif
-#ifdef CONFIG_PARISC
-	{
-		.procname	= "soft-power",
-		.data		= &pwrsw_enabled,
-		.maxlen		= sizeof (int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
-#endif
 #ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW
 	{
 		.procname	= "unaligned-trap",
-- 
cgit v1.2.3


From 8e5f04b0d58c734c69a0b6e26317561919299638 Mon Sep 17 00:00:00 2001
From: Joel Granados <joel.granados@kernel.org>
Date: Fri, 2 May 2025 22:27:38 +0200
Subject: fork: mv threads-max into kernel/fork.c

Move the threads-max ctl table entry into kernel/fork.c and make
sysctl_max_threads static, as it no longer needs to be exported to
sysctl.c.

This is part of a greater effort to move ctl tables into their
respective subsystems which will reduce the merge conflicts in
kernel/sysctl.c.

Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
Reviewed-by: Kees Cook <kees@kernel.org>
Signed-off-by: Joel Granados <joel.granados@kernel.org>
---
 include/linux/sysctl.h |  3 ---
 kernel/fork.c          | 20 +++++++++++++++++++-
 kernel/sysctl.c        |  7 -------
 3 files changed, 19 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index ae762eabb7c9..30bcbc59d12d 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -284,7 +284,4 @@ static inline bool sysctl_is_alias(char *param)
 }
 #endif /* CONFIG_SYSCTL */
 
-int sysctl_max_threads(const struct ctl_table *table, int write, void *buffer,
-		size_t *lenp, loff_t *ppos);
-
 #endif /* _LINUX_SYSCTL_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 1ee8eb11f38b..dea8e7740ad2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -3216,7 +3216,7 @@ int unshare_files(void)
 	return 0;
 }
 
-int sysctl_max_threads(const struct ctl_table *table, int write,
+static int sysctl_max_threads(const struct ctl_table *table, int write,
 		       void *buffer, size_t *lenp, loff_t *ppos)
 {
 	struct ctl_table t;
@@ -3238,3 +3238,21 @@ int sysctl_max_threads(const struct ctl_table *table, int write,
 
 	return 0;
 }
+
+static const struct ctl_table fork_sysctl_table[] = {
+	{
+		.procname	= "threads-max",
+		.data		= NULL,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= sysctl_max_threads,
+	},
+};
+
+static int __init init_fork_sysctl(void)
+{
+	register_sysctl_init("kernel", fork_sysctl_table);
+	return 0;
+}
+
+subsys_initcall(init_fork_sysctl);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 718140251972..febf328054aa 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1630,13 +1630,6 @@ static const struct ctl_table kern_table[] = {
 		.proc_handler	= proc_do_cad_pid,
 	},
 #endif
-	{
-		.procname	= "threads-max",
-		.data		= NULL,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= sysctl_max_threads,
-	},
 	{
 		.procname	= "overflowuid",
 		.data		= &overflowuid,
-- 
cgit v1.2.3


From 39dac316f09ae5a0930878d2cae8aea113648b5a Mon Sep 17 00:00:00 2001
From: Joel Granados <joel.granados@kernel.org>
Date: Fri, 27 Jun 2025 09:00:05 +0200
Subject: sysctl: Removed unused variable

Remove unaligned_dump_stack from sysctl.h; it is no longer used or
defined.

Signed-off-by: Joel Granados <joel.granados@kernel.org>
---
 include/linux/sysctl.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 30bcbc59d12d..92e9146b1104 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -243,7 +243,6 @@ int do_proc_douintvec(const struct ctl_table *table, int write,
 		      void *data);
 
 extern int unaligned_enabled;
-extern int unaligned_dump_stack;
 extern int no_unaligned_warning;
 
 #else /* CONFIG_SYSCTL */
-- 
cgit v1.2.3


From 4e8fc4f7208b032674ef8a4977b96484c328515c Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Wed, 23 Jul 2025 20:23:29 +0800
Subject: netfs: Remove unused declaration netfs_queue_write_request()

Commit c245868524cc ("netfs: Remove the old writeback code") removed
the implementation but left the declaration behind.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Link: https://lore.kernel.org/20250723122329.923223-1-yuehaibing@huawei.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/netfs.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 065c17385e53..486d5766fe77 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -443,7 +443,6 @@ size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset,
 			size_t max_size, size_t max_segs);
 void netfs_prepare_write_failed(struct netfs_io_subrequest *subreq);
 void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error);
-void netfs_queue_write_request(struct netfs_io_subrequest *subreq);
 
 int netfs_start_io_read(struct inode *inode);
 void netfs_end_io_read(struct inode *inode);
-- 
cgit v1.2.3


From 70c672f933337fc1de2df8628567ee0a8146562b Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Fri, 20 Jun 2025 15:03:45 +0800
Subject: Bluetooth: Remove hci_conn_hash_lookup_state()

Since commit 4aa42119d971 ("Bluetooth: Remove pending ACL connection
attempts") this function is unused.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h | 20 --------------------
 1 file changed, 20 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index f79f59e67114..69f491399dac 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -1420,26 +1420,6 @@ hci_conn_hash_lookup_pa_sync_handle(struct hci_dev *hdev, __u16 sync_handle)
 	return NULL;
 }
 
-static inline struct hci_conn *hci_conn_hash_lookup_state(struct hci_dev *hdev,
-							__u8 type, __u16 state)
-{
-	struct hci_conn_hash *h = &hdev->conn_hash;
-	struct hci_conn  *c;
-
-	rcu_read_lock();
-
-	list_for_each_entry_rcu(c, &h->list, list) {
-		if (c->type == type && c->state == state) {
-			rcu_read_unlock();
-			return c;
-		}
-	}
-
-	rcu_read_unlock();
-
-	return NULL;
-}
-
 typedef void (*hci_conn_func_t)(struct hci_conn *conn, void *data);
 static inline void hci_conn_hash_list_state(struct hci_dev *hdev,
 					    hci_conn_func_t func, __u8 type,
-- 
cgit v1.2.3


From b2a5f2e1c127cb431df22e114998ff72eb4578c8 Mon Sep 17 00:00:00 2001
From: Yang Li <yang.li@amlogic.com>
Date: Tue, 1 Jul 2025 15:56:22 +0800
Subject: Bluetooth: hci_event: Add support for handling LE BIG Sync Lost event

When the BIS source stops, the controller sends an LE BIG Sync Lost
event (subevent 0x1E). Currently, this event is not handled, causing
the BIS stream to remain active in BlueZ and preventing recovery.

Signed-off-by: Yang Li <yang.li@amlogic.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci.h      |  6 ++++++
 include/net/bluetooth/hci_core.h |  5 +++--
 net/bluetooth/hci_conn.c         |  3 ++-
 net/bluetooth/hci_event.c        | 39 ++++++++++++++++++++++++++++++++++++++-
 4 files changed, 49 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index c79901f2dc2a..6213012610d7 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -2851,6 +2851,12 @@ struct hci_evt_le_big_sync_estabilished {
 	__le16  bis[];
 } __packed;
 
+#define HCI_EVT_LE_BIG_SYNC_LOST 0x1e
+struct hci_evt_le_big_sync_lost {
+	__u8    handle;
+	__u8    reason;
+} __packed;
+
 #define HCI_EVT_LE_BIG_INFO_ADV_REPORT	0x22
 struct hci_evt_le_big_info_adv_report {
 	__le16  sync_handle;
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 69f491399dac..1ef9279cfd6f 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -1346,7 +1346,8 @@ hci_conn_hash_lookup_big_sync_pend(struct hci_dev *hdev,
 }
 
 static inline struct hci_conn *
-hci_conn_hash_lookup_big_state(struct hci_dev *hdev, __u8 handle,  __u16 state)
+hci_conn_hash_lookup_big_state(struct hci_dev *hdev, __u8 handle, __u16 state,
+			       __u8 role)
 {
 	struct hci_conn_hash *h = &hdev->conn_hash;
 	struct hci_conn  *c;
@@ -1354,7 +1355,7 @@ hci_conn_hash_lookup_big_state(struct hci_dev *hdev, __u8 handle,  __u16 state)
 	rcu_read_lock();
 
 	list_for_each_entry_rcu(c, &h->list, list) {
-		if (c->type != BIS_LINK || c->state != state)
+		if (c->type != BIS_LINK || c->state != state || c->role != role)
 			continue;
 
 		if (handle == c->iso_qos.bcast.big) {
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 4f379184df5b..f5cd935490ad 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -2146,7 +2146,8 @@ struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid,
 	struct hci_link *link;
 
 	/* Look for any BIS that is open for rebinding */
-	conn = hci_conn_hash_lookup_big_state(hdev, qos->bcast.big, BT_OPEN);
+	conn = hci_conn_hash_lookup_big_state(hdev, qos->bcast.big, BT_OPEN,
+					      HCI_ROLE_MASTER);
 	if (conn) {
 		memcpy(qos, &conn->iso_qos, sizeof(*qos));
 		conn->state = BT_CONNECTED;
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index f668bde007d4..fca58984ee4e 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -6876,7 +6876,8 @@ static void hci_le_create_big_complete_evt(struct hci_dev *hdev, void *data,
 
 	/* Connect all BISes that are bound to the BIG */
 	while ((conn = hci_conn_hash_lookup_big_state(hdev, ev->handle,
-						      BT_BOUND))) {
+						      BT_BOUND,
+						      HCI_ROLE_MASTER))) {
 		if (ev->status) {
 			hci_connect_cfm(conn, ev->status);
 			hci_conn_del(conn);
@@ -6992,6 +6993,37 @@ unlock:
 	hci_dev_unlock(hdev);
 }
 
+static void hci_le_big_sync_lost_evt(struct hci_dev *hdev, void *data,
+				     struct sk_buff *skb)
+{
+	struct hci_evt_le_big_sync_lost *ev = data;
+	struct hci_conn *bis, *conn;
+
+	bt_dev_dbg(hdev, "big handle 0x%2.2x", ev->handle);
+
+	hci_dev_lock(hdev);
+
+	/* Delete the pa sync connection */
+	bis = hci_conn_hash_lookup_pa_sync_big_handle(hdev, ev->handle);
+	if (bis) {
+		conn = hci_conn_hash_lookup_pa_sync_handle(hdev,
+							   bis->sync_handle);
+		if (conn)
+			hci_conn_del(conn);
+	}
+
+	/* Delete each bis connection */
+	while ((bis = hci_conn_hash_lookup_big_state(hdev, ev->handle,
+						     BT_CONNECTED,
+						     HCI_ROLE_SLAVE))) {
+		clear_bit(HCI_CONN_BIG_SYNC, &bis->flags);
+		hci_disconn_cfm(bis, ev->reason);
+		hci_conn_del(bis);
+	}
+
+	hci_dev_unlock(hdev);
+}
+
 static void hci_le_big_info_adv_report_evt(struct hci_dev *hdev, void *data,
 					   struct sk_buff *skb)
 {
@@ -7115,6 +7147,11 @@ static const struct hci_le_ev {
 		     hci_le_big_sync_established_evt,
 		     sizeof(struct hci_evt_le_big_sync_estabilished),
 		     HCI_MAX_EVENT_SIZE),
+	/* [0x1e = HCI_EVT_LE_BIG_SYNC_LOST] */
+	HCI_LE_EV_VL(HCI_EVT_LE_BIG_SYNC_LOST,
+		     hci_le_big_sync_lost_evt,
+		     sizeof(struct hci_evt_le_big_sync_lost),
+		     HCI_MAX_EVENT_SIZE),
 	/* [0x22 = HCI_EVT_LE_BIG_INFO_ADV_REPORT] */
 	HCI_LE_EV_VL(HCI_EVT_LE_BIG_INFO_ADV_REPORT,
 		     hci_le_big_info_adv_report_evt,
-- 
cgit v1.2.3


From be31d11ec9144f7f8f7fcbf84ba6971b664683f3 Mon Sep 17 00:00:00 2001
From: Yang Li <yang.li@amlogic.com>
Date: Tue, 1 Jul 2025 16:47:26 +0800
Subject: Bluetooth: Fix spelling mistakes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Correct the misspelling of “estabilished” in the code.

Signed-off-by: Yang Li <yang.li@amlogic.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci.h |  2 +-
 net/bluetooth/hci_event.c   | 16 ++++++++--------
 net/bluetooth/iso.c         |  8 ++++----
 3 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index 6213012610d7..94f365b75166 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -2837,7 +2837,7 @@ struct hci_evt_le_create_big_complete {
 } __packed;
 
 #define HCI_EVT_LE_BIG_SYNC_ESTABLISHED 0x1d
-struct hci_evt_le_big_sync_estabilished {
+struct hci_evt_le_big_sync_established {
 	__u8    status;
 	__u8    handle;
 	__u8    latency[3];
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index fca58984ee4e..d44463e65194 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -6350,8 +6350,8 @@ static int hci_le_pa_term_sync(struct hci_dev *hdev, __le16 handle)
 	return hci_send_cmd(hdev, HCI_OP_LE_PA_TERM_SYNC, sizeof(cp), &cp);
 }
 
-static void hci_le_pa_sync_estabilished_evt(struct hci_dev *hdev, void *data,
-					    struct sk_buff *skb)
+static void hci_le_pa_sync_established_evt(struct hci_dev *hdev, void *data,
+					   struct sk_buff *skb)
 {
 	struct hci_ev_le_pa_sync_established *ev = data;
 	int mask = hdev->link_mode;
@@ -6681,8 +6681,8 @@ unlock:
 	hci_dev_unlock(hdev);
 }
 
-static void hci_le_cis_estabilished_evt(struct hci_dev *hdev, void *data,
-					struct sk_buff *skb)
+static void hci_le_cis_established_evt(struct hci_dev *hdev, void *data,
+				       struct sk_buff *skb)
 {
 	struct hci_evt_le_cis_established *ev = data;
 	struct hci_conn *conn;
@@ -6910,7 +6910,7 @@ static void hci_le_create_big_complete_evt(struct hci_dev *hdev, void *data,
 static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data,
 					    struct sk_buff *skb)
 {
-	struct hci_evt_le_big_sync_estabilished *ev = data;
+	struct hci_evt_le_big_sync_established *ev = data;
 	struct hci_conn *bis, *conn;
 	int i;
 
@@ -7121,7 +7121,7 @@ static const struct hci_le_ev {
 		     HCI_MAX_EVENT_SIZE),
 	/* [0x0e = HCI_EV_LE_PA_SYNC_ESTABLISHED] */
 	HCI_LE_EV(HCI_EV_LE_PA_SYNC_ESTABLISHED,
-		  hci_le_pa_sync_estabilished_evt,
+		  hci_le_pa_sync_established_evt,
 		  sizeof(struct hci_ev_le_pa_sync_established)),
 	/* [0x0f = HCI_EV_LE_PER_ADV_REPORT] */
 	HCI_LE_EV_VL(HCI_EV_LE_PER_ADV_REPORT,
@@ -7132,7 +7132,7 @@ static const struct hci_le_ev {
 	HCI_LE_EV(HCI_EV_LE_EXT_ADV_SET_TERM, hci_le_ext_adv_term_evt,
 		  sizeof(struct hci_evt_le_ext_adv_set_term)),
 	/* [0x19 = HCI_EVT_LE_CIS_ESTABLISHED] */
-	HCI_LE_EV(HCI_EVT_LE_CIS_ESTABLISHED, hci_le_cis_estabilished_evt,
+	HCI_LE_EV(HCI_EVT_LE_CIS_ESTABLISHED, hci_le_cis_established_evt,
 		  sizeof(struct hci_evt_le_cis_established)),
 	/* [0x1a = HCI_EVT_LE_CIS_REQ] */
 	HCI_LE_EV(HCI_EVT_LE_CIS_REQ, hci_le_cis_req_evt,
@@ -7145,7 +7145,7 @@ static const struct hci_le_ev {
 	/* [0x1d = HCI_EV_LE_BIG_SYNC_ESTABLISHED] */
 	HCI_LE_EV_VL(HCI_EVT_LE_BIG_SYNC_ESTABLISHED,
 		     hci_le_big_sync_established_evt,
-		     sizeof(struct hci_evt_le_big_sync_estabilished),
+		     sizeof(struct hci_evt_le_big_sync_established),
 		     HCI_MAX_EVENT_SIZE),
 	/* [0x1e = HCI_EVT_LE_BIG_SYNC_LOST] */
 	HCI_LE_EV_VL(HCI_EVT_LE_BIG_SYNC_LOST,
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
index 34e89bb5f384..5e752950e266 100644
--- a/net/bluetooth/iso.c
+++ b/net/bluetooth/iso.c
@@ -1891,7 +1891,7 @@ static void iso_sock_ready(struct sock *sk)
 
 static bool iso_match_big(struct sock *sk, void *data)
 {
-	struct hci_evt_le_big_sync_estabilished *ev = data;
+	struct hci_evt_le_big_sync_established *ev = data;
 
 	return ev->handle == iso_pi(sk)->qos.bcast.big;
 }
@@ -1912,7 +1912,7 @@ static void iso_conn_ready(struct iso_conn *conn)
 {
 	struct sock *parent = NULL;
 	struct sock *sk = conn->sk;
-	struct hci_ev_le_big_sync_estabilished *ev = NULL;
+	struct hci_ev_le_big_sync_established *ev = NULL;
 	struct hci_ev_le_pa_sync_established *ev2 = NULL;
 	struct hci_ev_le_per_adv_report *ev3 = NULL;
 	struct hci_conn *hcon;
@@ -2023,7 +2023,7 @@ static void iso_conn_ready(struct iso_conn *conn)
 		hci_conn_hold(hcon);
 		iso_chan_add(conn, sk, parent);
 
-		if ((ev && ((struct hci_evt_le_big_sync_estabilished *)ev)->status) ||
+		if ((ev && ((struct hci_evt_le_big_sync_established *)ev)->status) ||
 		    (ev2 && ev2->status)) {
 			/* Trigger error signal on child socket */
 			sk->sk_err = ECONNREFUSED;
@@ -2082,7 +2082,7 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags)
 	 * proceed to establishing a BIG sync:
 	 *
 	 * 1. HCI_EV_LE_PA_SYNC_ESTABLISHED: The socket may specify a specific
-	 * SID to listen to and once sync is estabilished its handle needs to
+	 * SID to listen to and once sync is established its handle needs to
 	 * be stored in iso_pi(sk)->sync_handle so it can be matched once
 	 * receiving the BIG Info.
 	 * 2. HCI_EVT_LE_BIG_INFO_ADV_REPORT: When connect_ind is triggered by a
-- 
cgit v1.2.3


From 7565bc56598c3d135318f1bd76a0178dd3ea918f Mon Sep 17 00:00:00 2001
From: Pauli Virtanen <pav@iki.fi>
Date: Mon, 14 Jul 2025 19:40:37 +0300
Subject: Bluetooth: ISO: add socket option to report packet seqnum via CMSG

User applications need a way to track which ISO interval a given SDU
belongs to, to properly detect packet loss. Not all controllers set
timestamps, and it's not guaranteed that the user application receives
all packet reports (small socket buffer, or the controller doesn't send
all reports, as is the case with the Intel AX210).

Add socket option BT_PKT_SEQNUM that enables reporting of received
packet ISO sequence number in BT_SCM_PKT_SEQNUM CMSG.

Use BT_PKT_SEQNUM == 22 for the socket option, as 21 was used earlier
for a removed experimental feature that never got into mainline.

Signed-off-by: Pauli Virtanen <pav@iki.fi>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/bluetooth.h | 11 ++++++++++-
 net/bluetooth/af_bluetooth.c      |  7 +++++++
 net/bluetooth/iso.c               | 21 ++++++++++++++++++---
 3 files changed, 35 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h
index 114299bd8b98..ada5b56a4413 100644
--- a/include/net/bluetooth/bluetooth.h
+++ b/include/net/bluetooth/bluetooth.h
@@ -244,6 +244,12 @@ struct bt_codecs {
 
 #define BT_ISO_BASE		20
 
+/* Socket option value 21 reserved */
+
+#define BT_PKT_SEQNUM		22
+
+#define BT_SCM_PKT_SEQNUM	0x05
+
 __printf(1, 2)
 void bt_info(const char *fmt, ...);
 __printf(1, 2)
@@ -391,7 +397,8 @@ struct bt_sock {
 enum {
 	BT_SK_DEFER_SETUP,
 	BT_SK_SUSPEND,
-	BT_SK_PKT_STATUS
+	BT_SK_PKT_STATUS,
+	BT_SK_PKT_SEQNUM,
 };
 
 struct bt_sock_list {
@@ -475,6 +482,7 @@ struct bt_skb_cb {
 	u8 pkt_type;
 	u8 force_active;
 	u16 expect;
+	u16 pkt_seqnum;
 	u8 incoming:1;
 	u8 pkt_status:2;
 	union {
@@ -488,6 +496,7 @@ struct bt_skb_cb {
 
 #define hci_skb_pkt_type(skb) bt_cb((skb))->pkt_type
 #define hci_skb_pkt_status(skb) bt_cb((skb))->pkt_status
+#define hci_skb_pkt_seqnum(skb) bt_cb((skb))->pkt_seqnum
 #define hci_skb_expect(skb) bt_cb((skb))->expect
 #define hci_skb_opcode(skb) bt_cb((skb))->hci.opcode
 #define hci_skb_event(skb) bt_cb((skb))->hci.req_event
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index ee9bf84c88a7..2b94e2077203 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -364,6 +364,13 @@ int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 			put_cmsg(msg, SOL_BLUETOOTH, BT_SCM_PKT_STATUS,
 				 sizeof(pkt_status), &pkt_status);
 		}
+
+		if (test_bit(BT_SK_PKT_SEQNUM, &bt_sk(sk)->flags)) {
+			u16 pkt_seqnum = hci_skb_pkt_seqnum(skb);
+
+			put_cmsg(msg, SOL_BLUETOOTH, BT_SCM_PKT_SEQNUM,
+				 sizeof(pkt_seqnum), &pkt_seqnum);
+		}
 	}
 
 	skb_free_datagram(sk, skb);
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
index 5e752950e266..2f229f2077e8 100644
--- a/net/bluetooth/iso.c
+++ b/net/bluetooth/iso.c
@@ -1687,6 +1687,17 @@ static int iso_sock_setsockopt(struct socket *sock, int level, int optname,
 			clear_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags);
 		break;
 
+	case BT_PKT_SEQNUM:
+		err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen);
+		if (err)
+			break;
+
+		if (opt)
+			set_bit(BT_SK_PKT_SEQNUM, &bt_sk(sk)->flags);
+		else
+			clear_bit(BT_SK_PKT_SEQNUM, &bt_sk(sk)->flags);
+		break;
+
 	case BT_ISO_QOS:
 		if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND &&
 		    sk->sk_state != BT_CONNECT2 &&
@@ -2278,7 +2289,7 @@ static void iso_disconn_cfm(struct hci_conn *hcon, __u8 reason)
 void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags)
 {
 	struct iso_conn *conn = hcon->iso_data;
-	__u16 pb, ts, len;
+	__u16 pb, ts, len, sn;
 
 	if (!conn)
 		goto drop;
@@ -2308,6 +2319,7 @@ void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags)
 				goto drop;
 			}
 
+			sn = __le16_to_cpu(hdr->sn);
 			len = __le16_to_cpu(hdr->slen);
 		} else {
 			struct hci_iso_data_hdr *hdr;
@@ -2318,18 +2330,20 @@ void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags)
 				goto drop;
 			}
 
+			sn = __le16_to_cpu(hdr->sn);
 			len = __le16_to_cpu(hdr->slen);
 		}
 
 		flags  = hci_iso_data_flags(len);
 		len    = hci_iso_data_len(len);
 
-		BT_DBG("Start: total len %d, frag len %d flags 0x%4.4x", len,
-		       skb->len, flags);
+		BT_DBG("Start: total len %d, frag len %d flags 0x%4.4x sn %d",
+		       len, skb->len, flags, sn);
 
 		if (len == skb->len) {
 			/* Complete frame received */
 			hci_skb_pkt_status(skb) = flags & 0x03;
+			hci_skb_pkt_seqnum(skb) = sn;
 			iso_recv_frame(conn, skb);
 			return;
 		}
@@ -2352,6 +2366,7 @@ void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags)
 			goto drop;
 
 		hci_skb_pkt_status(conn->rx_skb) = flags & 0x03;
+		hci_skb_pkt_seqnum(conn->rx_skb) = sn;
 		skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len),
 					  skb->len);
 		conn->rx_len = len - skb->len;
-- 
cgit v1.2.3


From 2935e556850e9c94d7a00adf14d3cd7fe406ac03 Mon Sep 17 00:00:00 2001
From: Arseniy Krasnov <avkrasnov@salutedevices.com>
Date: Wed, 16 Jul 2025 22:23:58 +0300
Subject: Bluetooth: hci_sync: fix double free in
 'hci_discovery_filter_clear()'

Function 'hci_discovery_filter_clear()' frees 'uuids' array and then
sets it to NULL. There is a tiny chance of the following race:

'hci_cmd_sync_work()'

 'update_passive_scan_sync()'

   'hci_update_passive_scan_sync()'

     'hci_discovery_filter_clear()'
       kfree(uuids);

       <-------------------------preempted-------------------------------->
                                           'start_service_discovery()'

                                             'hci_discovery_filter_clear()'
                                               kfree(uuids); // DOUBLE FREE

       <-------------------------preempted-------------------------------->

      uuids = NULL;

To fix it, let's add locking around the 'kfree()' call and the NULL
pointer assignment. Otherwise the following backtrace fires:

[ ] ------------[ cut here ]------------
[ ] kernel BUG at mm/slub.c:547!
[ ] Internal error: Oops - BUG: 00000000f2000800 [#1] PREEMPT SMP
[ ] CPU: 3 UID: 0 PID: 246 Comm: bluetoothd Tainted: G O 6.12.19-kernel #1
[ ] Tainted: [O]=OOT_MODULE
[ ] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[ ] pc : __slab_free+0xf8/0x348
[ ] lr : __slab_free+0x48/0x348
...
[ ] Call trace:
[ ]  __slab_free+0xf8/0x348
[ ]  kfree+0x164/0x27c
[ ]  start_service_discovery+0x1d0/0x2c0
[ ]  hci_sock_sendmsg+0x518/0x924
[ ]  __sock_sendmsg+0x54/0x60
[ ]  sock_write_iter+0x98/0xf8
[ ]  do_iter_readv_writev+0xe4/0x1c8
[ ]  vfs_writev+0x128/0x2b0
[ ]  do_writev+0xfc/0x118
[ ]  __arm64_sys_writev+0x20/0x2c
[ ]  invoke_syscall+0x68/0xf0
[ ]  el0_svc_common.constprop.0+0x40/0xe0
[ ]  do_el0_svc+0x1c/0x28
[ ]  el0_svc+0x30/0xd0
[ ]  el0t_64_sync_handler+0x100/0x12c
[ ]  el0t_64_sync+0x194/0x198
[ ] Code: 8b0002e6 eb17031f 54fffbe1 d503201f (d4210000)
[ ] ---[ end trace 0000000000000000 ]---

Fixes: ad383c2c65a5 ("Bluetooth: hci_sync: Enable advertising when LL privacy is enabled")
Signed-off-by: Arseniy Krasnov <avkrasnov@salutedevices.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 1ef9279cfd6f..3728495f0819 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -29,6 +29,7 @@
 #include <linux/idr.h>
 #include <linux/leds.h>
 #include <linux/rculist.h>
+#include <linux/spinlock.h>
 #include <linux/srcu.h>
 
 #include <net/bluetooth/hci.h>
@@ -94,6 +95,7 @@ struct discovery_state {
 	u16			uuid_count;
 	u8			(*uuids)[16];
 	unsigned long		name_resolve_timeout;
+	spinlock_t		lock;
 };
 
 #define SUSPEND_NOTIFIER_TIMEOUT	msecs_to_jiffies(2000) /* 2 seconds */
@@ -889,6 +891,7 @@ static inline void iso_recv(struct hci_conn *hcon, struct sk_buff *skb,
 
 static inline void discovery_init(struct hci_dev *hdev)
 {
+	spin_lock_init(&hdev->discovery.lock);
 	hdev->discovery.state = DISCOVERY_STOPPED;
 	INIT_LIST_HEAD(&hdev->discovery.all);
 	INIT_LIST_HEAD(&hdev->discovery.unknown);
@@ -903,8 +906,11 @@ static inline void hci_discovery_filter_clear(struct hci_dev *hdev)
 	hdev->discovery.report_invalid_rssi = true;
 	hdev->discovery.rssi = HCI_RSSI_INVALID;
 	hdev->discovery.uuid_count = 0;
+
+	spin_lock(&hdev->discovery.lock);
 	kfree(hdev->discovery.uuids);
 	hdev->discovery.uuids = NULL;
+	spin_unlock(&hdev->discovery.lock);
 }
 
 bool hci_discovery_active(struct hci_dev *hdev);
-- 
cgit v1.2.3


From 0cadf8534f2a727bc3a01e8c583b085d25963ee0 Mon Sep 17 00:00:00 2001
From: Chris Down <chris@chrisdown.name>
Date: Mon, 21 Jul 2025 16:30:23 +0100
Subject: Bluetooth: hci_event: Mask data status from LE ext adv reports

The Event_Type field in an LE Extended Advertising Report uses bits 5
and 6 for data status (e.g. truncation or fragmentation), not the PDU
type itself.

The ext_evt_type_to_legacy() function fails to mask these status bits
before evaluation. This causes valid advertisements with status bits set
(e.g. a truncated non-connectable advertisement, which ends up showing
as PDU type 0x40) to be misclassified as unknown and subsequently
dropped. This is okay for most checks which use bitwise AND on the
relevant event type bits, but it doesn't work for non-connectable types,
which are checked with '== LE_EXT_ADV_NON_CONN_IND' (that is, zero).

In terms of behaviour, first the device sends a truncated report:

> HCI Event: LE Meta Event (0x3e) plen 26
      LE Extended Advertising Report (0x0d)
        Entry 0
          Event type: 0x0040
            Data status: Incomplete, data truncated, no more to come
          Address type: Random (0x01)
          Address: 1D:12:46:FA:F8:6E (Non-Resolvable)
          SID: 0x03
          RSSI: -98 dBm (0x9e)
          Data length: 0x00

Then, a few seconds later, it sends the subsequent complete report:

> HCI Event: LE Meta Event (0x3e) plen 122
      LE Extended Advertising Report (0x0d)
        Entry 0
          Event type: 0x0000
            Data status: Complete
          Address type: Random (0x01)
          Address: 1D:12:46:FA:F8:6E (Non-Resolvable)
          SID: 0x03
          RSSI: -97 dBm (0x9f)
          Data length: 0x60
          Service Data: Google (0xfef3)
            Data[92]: ...

These devices often send multiple truncated reports per second.

This patch introduces a PDU type mask to ensure only the relevant bits
are evaluated, allowing for the correct translation of all valid
extended advertising packets.

Fixes: b2cc9761f144 ("Bluetooth: Handle extended ADV PDU types")
Signed-off-by: Chris Down <chris@chrisdown.name>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci.h | 1 +
 net/bluetooth/hci_event.c   | 8 ++++++--
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index 94f365b75166..e90a7b753926 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -2634,6 +2634,7 @@ struct hci_ev_le_conn_complete {
 #define LE_EXT_ADV_DIRECT_IND		0x0004
 #define LE_EXT_ADV_SCAN_RSP		0x0008
 #define LE_EXT_ADV_LEGACY_PDU		0x0010
+#define LE_EXT_ADV_DATA_STATUS_MASK	0x0060
 #define LE_EXT_ADV_EVT_TYPE_MASK	0x007f
 
 #define ADDR_LE_DEV_PUBLIC		0x00
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index c0eb03e5cbf8..b7b473473b70 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -6239,6 +6239,11 @@ static void hci_le_adv_report_evt(struct hci_dev *hdev, void *data,
 
 static u8 ext_evt_type_to_legacy(struct hci_dev *hdev, u16 evt_type)
 {
+	u16 pdu_type = evt_type & ~LE_EXT_ADV_DATA_STATUS_MASK;
+
+	if (!pdu_type)
+		return LE_ADV_NONCONN_IND;
+
 	if (evt_type & LE_EXT_ADV_LEGACY_PDU) {
 		switch (evt_type) {
 		case LE_LEGACY_ADV_IND:
@@ -6270,8 +6275,7 @@ static u8 ext_evt_type_to_legacy(struct hci_dev *hdev, u16 evt_type)
 	if (evt_type & LE_EXT_ADV_SCAN_IND)
 		return LE_ADV_SCAN_IND;
 
-	if (evt_type == LE_EXT_ADV_NON_CONN_IND ||
-	    evt_type & LE_EXT_ADV_DIRECT_IND)
+	if (evt_type & LE_EXT_ADV_DIRECT_IND)
 		return LE_ADV_NONCONN_IND;
 
 invalid:
-- 
cgit v1.2.3


From a7bcffc673de219af2698fbb90627016233de67b Mon Sep 17 00:00:00 2001
From: Yang Li <yang.li@amlogic.com>
Date: Thu, 10 Jul 2025 18:52:47 +0800
Subject: Bluetooth: Add PA_LINK to distinguish BIG sync and PA sync
 connections

Currently, BIS_LINK is used for both BIG sync and PA sync connections,
which makes it impossible to distinguish them when searching for a PA
sync connection.

Adding PA_LINK will make the distinction clearer and simplify future
extensions for PA-related features.

Signed-off-by: Yang Li <yang.li@amlogic.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci.h      |  1 +
 include/net/bluetooth/hci_core.h | 10 +++++++---
 net/bluetooth/hci_conn.c         | 14 +++++++++-----
 net/bluetooth/hci_core.c         | 27 +++++++++++++++------------
 net/bluetooth/hci_event.c        |  7 ++++---
 net/bluetooth/hci_sync.c         | 10 +++++-----
 net/bluetooth/iso.c              |  6 ++++--
 net/bluetooth/mgmt.c             |  1 +
 8 files changed, 46 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index e90a7b753926..df1847b74e55 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -562,6 +562,7 @@ enum {
 #define LE_LINK		0x80
 #define CIS_LINK	0x82
 #define BIS_LINK	0x83
+#define PA_LINK		0x84
 #define INVALID_LINK	0xff
 
 /* LMP features */
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 3728495f0819..4dc11c66f7b8 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -1015,6 +1015,7 @@ static inline void hci_conn_hash_add(struct hci_dev *hdev, struct hci_conn *c)
 		break;
 	case CIS_LINK:
 	case BIS_LINK:
+	case PA_LINK:
 		h->iso_num++;
 		break;
 	}
@@ -1042,6 +1043,7 @@ static inline void hci_conn_hash_del(struct hci_dev *hdev, struct hci_conn *c)
 		break;
 	case CIS_LINK:
 	case BIS_LINK:
+	case PA_LINK:
 		h->iso_num--;
 		break;
 	}
@@ -1060,6 +1062,7 @@ static inline unsigned int hci_conn_num(struct hci_dev *hdev, __u8 type)
 		return h->sco_num;
 	case CIS_LINK:
 	case BIS_LINK:
+	case PA_LINK:
 		return h->iso_num;
 	default:
 		return 0;
@@ -1142,7 +1145,7 @@ hci_conn_hash_lookup_create_pa_sync(struct hci_dev *hdev)
 	rcu_read_lock();
 
 	list_for_each_entry_rcu(c, &h->list, list) {
-		if (c->type != BIS_LINK)
+		if (c->type != PA_LINK)
 			continue;
 
 		if (!test_bit(HCI_CONN_CREATE_PA_SYNC, &c->flags))
@@ -1337,7 +1340,7 @@ hci_conn_hash_lookup_big_sync_pend(struct hci_dev *hdev,
 	rcu_read_lock();
 
 	list_for_each_entry_rcu(c, &h->list, list) {
-		if (c->type != BIS_LINK)
+		if (c->type != PA_LINK)
 			continue;
 
 		if (handle == c->iso_qos.bcast.big && num_bis == c->num_bis) {
@@ -1407,7 +1410,7 @@ hci_conn_hash_lookup_pa_sync_handle(struct hci_dev *hdev, __u16 sync_handle)
 	rcu_read_lock();
 
 	list_for_each_entry_rcu(c, &h->list, list) {
-		if (c->type != BIS_LINK)
+		if (c->type != PA_LINK)
 			continue;
 
 		/* Ignore the listen hcon, we are looking
@@ -2006,6 +2009,7 @@ static inline int hci_proto_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr,
 
 	case CIS_LINK:
 	case BIS_LINK:
+	case PA_LINK:
 		return iso_connect_ind(hdev, bdaddr, flags);
 
 	default:
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index b2e09e7f70e1..7d1e79f69cd1 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -785,7 +785,7 @@ static int hci_le_big_terminate(struct hci_dev *hdev, u8 big, struct hci_conn *c
 	d->sync_handle = conn->sync_handle;
 
 	if (test_and_clear_bit(HCI_CONN_PA_SYNC, &conn->flags)) {
-		hci_conn_hash_list_flag(hdev, find_bis, BIS_LINK,
+		hci_conn_hash_list_flag(hdev, find_bis, PA_LINK,
 					HCI_CONN_PA_SYNC, d);
 
 		if (!d->count)
@@ -914,6 +914,7 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t
 		break;
 	case CIS_LINK:
 	case BIS_LINK:
+	case PA_LINK:
 		if (hdev->iso_mtu)
 			/* Dedicated ISO Buffer exists */
 			break;
@@ -979,6 +980,7 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t
 		break;
 	case CIS_LINK:
 	case BIS_LINK:
+	case PA_LINK:
 		/* conn->src should reflect the local identity address */
 		hci_copy_identity_address(hdev, &conn->src, &conn->src_type);
 
@@ -1033,7 +1035,6 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t
 	}
 
 	hci_conn_init_sysfs(conn);
-
 	return conn;
 }
 
@@ -1077,6 +1078,7 @@ static void hci_conn_cleanup_child(struct hci_conn *conn, u8 reason)
 		break;
 	case CIS_LINK:
 	case BIS_LINK:
+	case PA_LINK:
 		if ((conn->state != BT_CONNECTED &&
 		    !test_bit(HCI_CONN_CREATE_CIS, &conn->flags)) ||
 		    test_bit(HCI_CONN_BIG_CREATED, &conn->flags))
@@ -1152,7 +1154,8 @@ void hci_conn_del(struct hci_conn *conn)
 	} else {
 		/* Unacked ISO frames */
 		if (conn->type == CIS_LINK ||
-		    conn->type == BIS_LINK) {
+		    conn->type == BIS_LINK ||
+		    conn->type == PA_LINK) {
 			if (hdev->iso_pkts)
 				hdev->iso_cnt += conn->sent;
 			else if (hdev->le_pkts)
@@ -2081,7 +2084,7 @@ struct hci_conn *hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst,
 
 	bt_dev_dbg(hdev, "dst %pMR type %d sid %d", dst, dst_type, sid);
 
-	conn = hci_conn_add_unset(hdev, BIS_LINK, dst, HCI_ROLE_SLAVE);
+	conn = hci_conn_add_unset(hdev, PA_LINK, dst, HCI_ROLE_SLAVE);
 	if (IS_ERR(conn))
 		return conn;
 
@@ -2246,7 +2249,7 @@ struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst,
 	 * the start periodic advertising and create BIG commands have
 	 * been queued
 	 */
-	hci_conn_hash_list_state(hdev, bis_mark_per_adv, BIS_LINK,
+	hci_conn_hash_list_state(hdev, bis_mark_per_adv, PA_LINK,
 				 BT_BOUND, &data);
 
 	/* Queue start periodic advertising and create BIG */
@@ -2980,6 +2983,7 @@ void hci_conn_tx_queue(struct hci_conn *conn, struct sk_buff *skb)
 	switch (conn->type) {
 	case CIS_LINK:
 	case BIS_LINK:
+	case PA_LINK:
 	case ACL_LINK:
 	case LE_LINK:
 		break;
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index f2fbe9c8e1be..55e0722fd066 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -2936,12 +2936,14 @@ int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb)
 	case HCI_ACLDATA_PKT:
 		/* Detect if ISO packet has been sent as ACL */
 		if (hci_conn_num(hdev, CIS_LINK) ||
-		    hci_conn_num(hdev, BIS_LINK)) {
+		    hci_conn_num(hdev, BIS_LINK) ||
+			hci_conn_num(hdev, PA_LINK)) {
 			__u16 handle = __le16_to_cpu(hci_acl_hdr(skb)->handle);
 			__u8 type;
 
 			type = hci_conn_lookup_type(hdev, hci_handle(handle));
-			if (type == CIS_LINK || type == BIS_LINK)
+			if (type == CIS_LINK || type == BIS_LINK ||
+			    type == PA_LINK)
 				hci_skb_pkt_type(skb) = HCI_ISODATA_PKT;
 		}
 		break;
@@ -3396,6 +3398,7 @@ static inline void hci_quote_sent(struct hci_conn *conn, int num, int *quote)
 		break;
 	case CIS_LINK:
 	case BIS_LINK:
+	case PA_LINK:
 		cnt = hdev->iso_mtu ? hdev->iso_cnt :
 			hdev->le_mtu ? hdev->le_cnt : hdev->acl_cnt;
 		break;
@@ -3409,7 +3412,7 @@ static inline void hci_quote_sent(struct hci_conn *conn, int num, int *quote)
 }
 
 static struct hci_conn *hci_low_sent(struct hci_dev *hdev, __u8 type,
-				     __u8 type2, int *quote)
+				     int *quote)
 {
 	struct hci_conn_hash *h = &hdev->conn_hash;
 	struct hci_conn *conn = NULL, *c;
@@ -3421,7 +3424,7 @@ static struct hci_conn *hci_low_sent(struct hci_dev *hdev, __u8 type,
 	rcu_read_lock();
 
 	list_for_each_entry_rcu(c, &h->list, list) {
-		if ((c->type != type && c->type != type2) ||
+		if (c->type != type ||
 		    skb_queue_empty(&c->data_q))
 			continue;
 
@@ -3625,7 +3628,7 @@ static void hci_sched_sco(struct hci_dev *hdev, __u8 type)
 	else
 		cnt = &hdev->sco_cnt;
 
-	while (*cnt && (conn = hci_low_sent(hdev, type, type, &quote))) {
+	while (*cnt && (conn = hci_low_sent(hdev, type, &quote))) {
 		while (quote-- && (skb = skb_dequeue(&conn->data_q))) {
 			BT_DBG("skb %p len %d", skb, skb->len);
 			hci_send_conn_frame(hdev, conn, skb);
@@ -3744,8 +3747,8 @@ static void hci_sched_le(struct hci_dev *hdev)
 		hci_prio_recalculate(hdev, LE_LINK);
 }
 
-/* Schedule CIS */
-static void hci_sched_iso(struct hci_dev *hdev)
+/* Schedule iso */
+static void hci_sched_iso(struct hci_dev *hdev, __u8 type)
 {
 	struct hci_conn *conn;
 	struct sk_buff *skb;
@@ -3753,14 +3756,12 @@ static void hci_sched_iso(struct hci_dev *hdev)
 
 	BT_DBG("%s", hdev->name);
 
-	if (!hci_conn_num(hdev, CIS_LINK) &&
-	    !hci_conn_num(hdev, BIS_LINK))
+	if (!hci_conn_num(hdev, type))
 		return;
 
 	cnt = hdev->iso_pkts ? &hdev->iso_cnt :
 		hdev->le_pkts ? &hdev->le_cnt : &hdev->acl_cnt;
-	while (*cnt && (conn = hci_low_sent(hdev, CIS_LINK, BIS_LINK,
-					    &quote))) {
+	while (*cnt && (conn = hci_low_sent(hdev, type, &quote))) {
 		while (quote-- && (skb = skb_dequeue(&conn->data_q))) {
 			BT_DBG("skb %p len %d", skb, skb->len);
 			hci_send_conn_frame(hdev, conn, skb);
@@ -3785,7 +3786,9 @@ static void hci_tx_work(struct work_struct *work)
 		/* Schedule queues and send stuff to HCI driver */
 		hci_sched_sco(hdev, SCO_LINK);
 		hci_sched_sco(hdev, ESCO_LINK);
-		hci_sched_iso(hdev);
+		hci_sched_iso(hdev, CIS_LINK);
+		hci_sched_iso(hdev, BIS_LINK);
+		hci_sched_iso(hdev, PA_LINK);
 		hci_sched_acl(hdev);
 		hci_sched_le(hdev);
 	}
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index b7b473473b70..8aa5039b975a 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -4432,6 +4432,7 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, void *data,
 
 		case CIS_LINK:
 		case BIS_LINK:
+		case PA_LINK:
 			if (hdev->iso_pkts) {
 				hdev->iso_cnt += count;
 				if (hdev->iso_cnt > hdev->iso_pkts)
@@ -6381,7 +6382,7 @@ static void hci_le_pa_sync_established_evt(struct hci_dev *hdev, void *data,
 	conn->sync_handle = le16_to_cpu(ev->handle);
 	conn->sid = HCI_SID_INVALID;
 
-	mask |= hci_proto_connect_ind(hdev, &ev->bdaddr, BIS_LINK,
+	mask |= hci_proto_connect_ind(hdev, &ev->bdaddr, PA_LINK,
 				      &flags);
 	if (!(mask & HCI_LM_ACCEPT)) {
 		hci_le_pa_term_sync(hdev, ev->handle);
@@ -6392,7 +6393,7 @@ static void hci_le_pa_sync_established_evt(struct hci_dev *hdev, void *data,
 		goto unlock;
 
 	/* Add connection to indicate PA sync event */
-	pa_sync = hci_conn_add_unset(hdev, BIS_LINK, BDADDR_ANY,
+	pa_sync = hci_conn_add_unset(hdev, PA_LINK, BDADDR_ANY,
 				     HCI_ROLE_SLAVE);
 
 	if (IS_ERR(pa_sync))
@@ -6423,7 +6424,7 @@ static void hci_le_per_adv_report_evt(struct hci_dev *hdev, void *data,
 
 	hci_dev_lock(hdev);
 
-	mask |= hci_proto_connect_ind(hdev, BDADDR_ANY, BIS_LINK, &flags);
+	mask |= hci_proto_connect_ind(hdev, BDADDR_ANY, PA_LINK, &flags);
 	if (!(mask & HCI_LM_ACCEPT))
 		goto unlock;
 
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index e9df6502e58e..2b4f21fbf9c1 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -2929,7 +2929,7 @@ static int hci_le_set_ext_scan_param_sync(struct hci_dev *hdev, u8 type,
 		if (sent) {
 			struct hci_conn *conn;
 
-			conn = hci_conn_hash_lookup_ba(hdev, BIS_LINK,
+			conn = hci_conn_hash_lookup_ba(hdev, PA_LINK,
 						       &sent->bdaddr);
 			if (conn) {
 				struct bt_iso_qos *qos = &conn->iso_qos;
@@ -5493,7 +5493,7 @@ static int hci_disconnect_sync(struct hci_dev *hdev, struct hci_conn *conn,
 {
 	struct hci_cp_disconnect cp;
 
-	if (conn->type == BIS_LINK) {
+	if (conn->type == BIS_LINK || conn->type == PA_LINK) {
 		/* This is a BIS connection, hci_conn_del will
 		 * do the necessary cleanup.
 		 */
@@ -5562,7 +5562,7 @@ static int hci_connect_cancel_sync(struct hci_dev *hdev, struct hci_conn *conn,
 		return HCI_ERROR_LOCAL_HOST_TERM;
 	}
 
-	if (conn->type == BIS_LINK) {
+	if (conn->type == BIS_LINK || conn->type == PA_LINK) {
 		/* There is no way to cancel a BIS without terminating the BIG
 		 * which is done later on connection cleanup.
 		 */
@@ -5627,7 +5627,7 @@ static int hci_reject_conn_sync(struct hci_dev *hdev, struct hci_conn *conn,
 	if (conn->type == CIS_LINK)
 		return hci_le_reject_cis_sync(hdev, conn, reason);
 
-	if (conn->type == BIS_LINK)
+	if (conn->type == BIS_LINK || conn->type == PA_LINK)
 		return -EINVAL;
 
 	if (conn->type == SCO_LINK || conn->type == ESCO_LINK)
@@ -6994,7 +6994,7 @@ static void create_pa_complete(struct hci_dev *hdev, void *data, int err)
 		goto unlock;
 
 	/* Add connection to indicate PA sync error */
-	pa_sync = hci_conn_add_unset(hdev, BIS_LINK, BDADDR_ANY,
+	pa_sync = hci_conn_add_unset(hdev, PA_LINK, BDADDR_ANY,
 				     HCI_ROLE_SLAVE);
 
 	if (IS_ERR(pa_sync))
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
index 2f45e46a9b6a..7bd3aa0a6db9 100644
--- a/net/bluetooth/iso.c
+++ b/net/bluetooth/iso.c
@@ -2237,7 +2237,8 @@ done:
 
 static void iso_connect_cfm(struct hci_conn *hcon, __u8 status)
 {
-	if (hcon->type != CIS_LINK && hcon->type != BIS_LINK) {
+	if (hcon->type != CIS_LINK && hcon->type != BIS_LINK &&
+	    hcon->type != PA_LINK) {
 		if (hcon->type != LE_LINK)
 			return;
 
@@ -2278,7 +2279,8 @@ static void iso_connect_cfm(struct hci_conn *hcon, __u8 status)
 
 static void iso_disconn_cfm(struct hci_conn *hcon, __u8 reason)
 {
-	if (hcon->type != CIS_LINK && hcon->type != BIS_LINK)
+	if (hcon->type != CIS_LINK && hcon->type !=  BIS_LINK &&
+	    hcon->type != PA_LINK)
 		return;
 
 	BT_DBG("hcon %p reason %d", hcon, reason);
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 63dba0503653..1ce682038b51 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -3237,6 +3237,7 @@ static u8 link_to_bdaddr(u8 link_type, u8 addr_type)
 	switch (link_type) {
 	case CIS_LINK:
 	case BIS_LINK:
+	case PA_LINK:
 	case LE_LINK:
 		switch (addr_type) {
 		case ADDR_LE_DEV_PUBLIC:
-- 
cgit v1.2.3


From ab29b3460c5cec24bde75278d7ffca23cac1b867 Mon Sep 17 00:00:00 2001
From: Baojun Xu <baojun.xu@ti.com>
Date: Wed, 23 Jul 2025 22:24:23 +0800
Subject: ALSA: hda: Add TAS2770 support

Add TAS2770 support to TI's HDA driver and add hda_chip_id for
more products. Distinguish between DSP and non-DSP devices in the
firmware loading function.

Signed-off-by: Baojun Xu <baojun.xu@ti.com>

Link: https://patch.msgid.link/20250723142423.38768-1-baojun.xu@ti.com
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/tas2770-tlv.h                    |  23 ++++
 sound/hda/codecs/side-codecs/tas2781_hda_i2c.c | 157 ++++++++++++++++---------
 2 files changed, 125 insertions(+), 55 deletions(-)
 create mode 100644 include/sound/tas2770-tlv.h

(limited to 'include')

diff --git a/include/sound/tas2770-tlv.h b/include/sound/tas2770-tlv.h
new file mode 100644
index 000000000000..c0bd495b4a07
--- /dev/null
+++ b/include/sound/tas2770-tlv.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+//
+// ALSA SoC Texas Instruments TAS2770 Audio Smart Amplifier
+//
+// Copyright (C) 2025 Texas Instruments Incorporated
+// https://www.ti.com
+//
+// The TAS2770 hda driver implements for one, two, or even multiple
+// TAS2770 chips.
+//
+// Author: Baojun Xu <baojun.xu@ti.com>
+//
+
+#ifndef __TAS2770_TLV_H__
+#define __TAS2770_TLV_H__
+
+#define TAS2770_DVC_LEVEL		TASDEVICE_REG(0x0, 0x0, 0x17)
+#define TAS2770_AMP_LEVEL		TASDEVICE_REG(0x0, 0x0, 0x03)
+
+static const __maybe_unused DECLARE_TLV_DB_SCALE(tas2770_dvc_tlv, 1650, 50, 0);
+static const __maybe_unused DECLARE_TLV_DB_SCALE(tas2770_amp_tlv, 1100, 50, 0);
+
+#endif
diff --git a/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c b/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c
index bacc3f6ed4bd..a0b132681804 100644
--- a/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c
+++ b/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c
@@ -24,6 +24,7 @@
 #include <sound/tas2781.h>
 #include <sound/tas2781-comlib-i2c.h>
 #include <sound/tlv.h>
+#include <sound/tas2770-tlv.h>
 #include <sound/tas2781-tlv.h>
 
 #include "hda_local.h"
@@ -45,9 +46,18 @@
 #define TAS2563_CAL_TLIM		TASDEVICE_REG(0, 0x10, 0x14)
 #define TAS2563_CAL_R0			TASDEVICE_REG(0, 0x0f, 0x34)
 
+enum device_chip_id {
+	HDA_TAS2563,
+	HDA_TAS2770,
+	HDA_TAS2781,
+	HDA_OTHERS
+};
+
 struct tas2781_hda_i2c_priv {
 	struct snd_kcontrol *snd_ctls[2];
 	int (*save_calibration)(struct tas2781_hda *h);
+
+	int hda_chip_id;
 };
 
 static int tas2781_get_i2c_res(struct acpi_resource *ares, void *data)
@@ -245,6 +255,15 @@ static int tas2781_force_fwload_put(struct snd_kcontrol *kcontrol,
 	return change;
 }
 
+static const struct snd_kcontrol_new tas2770_snd_controls[] = {
+	ACARD_SINGLE_RANGE_EXT_TLV("Speaker Analog Volume", TAS2770_AMP_LEVEL,
+		0, 0, 20, 0, tas2781_amp_getvol,
+		tas2781_amp_putvol, tas2770_amp_tlv),
+	ACARD_SINGLE_RANGE_EXT_TLV("Speaker Digital Volume", TAS2770_DVC_LEVEL,
+		0, 0, 31, 0, tas2781_amp_getvol,
+		tas2781_amp_putvol, tas2770_dvc_tlv),
+};
+
 static const struct snd_kcontrol_new tas2781_snd_controls[] = {
 	ACARD_SINGLE_RANGE_EXT_TLV("Speaker Analog Gain", TAS2781_AMP_LEVEL,
 		1, 0, 20, 0, tas2781_amp_getvol,
@@ -253,7 +272,7 @@ static const struct snd_kcontrol_new tas2781_snd_controls[] = {
 		tas2781_force_fwload_get, tas2781_force_fwload_put),
 };
 
-static const struct snd_kcontrol_new tas2781_prof_ctrl = {
+static const struct snd_kcontrol_new tasdevice_prof_ctrl = {
 	.name = "Speaker Profile Id",
 	.iface = SNDRV_CTL_ELEM_IFACE_CARD,
 	.info = tasdevice_info_profile,
@@ -261,7 +280,7 @@ static const struct snd_kcontrol_new tas2781_prof_ctrl = {
 	.put = tasdevice_set_profile_id,
 };
 
-static const struct snd_kcontrol_new tas2781_dsp_prog_ctrl = {
+static const struct snd_kcontrol_new tasdevice_dsp_prog_ctrl = {
 	.name = "Speaker Program Id",
 	.iface = SNDRV_CTL_ELEM_IFACE_CARD,
 	.info = tasdevice_info_programs,
@@ -269,7 +288,7 @@ static const struct snd_kcontrol_new tas2781_dsp_prog_ctrl = {
 	.put = tasdevice_program_put,
 };
 
-static const struct snd_kcontrol_new tas2781_dsp_conf_ctrl = {
+static const struct snd_kcontrol_new tasdevice_dsp_conf_ctrl = {
 	.name = "Speaker Config Id",
 	.iface = SNDRV_CTL_ELEM_IFACE_CARD,
 	.info = tasdevice_info_config,
@@ -378,44 +397,34 @@ static void tas2781_hda_remove_controls(struct tas2781_hda *tas_hda)
 	snd_ctl_remove(codec->card, tas_hda->prof_ctl);
 }
 
-static void tasdev_fw_ready(const struct firmware *fmw, void *context)
+static void tasdev_add_kcontrols(struct tasdevice_priv *tas_priv,
+	struct snd_kcontrol **ctls, struct hda_codec *codec,
+	const struct snd_kcontrol_new *tas_snd_ctrls, int num_ctls)
 {
-	struct tasdevice_priv *tas_priv = context;
-	struct tas2781_hda *tas_hda = dev_get_drvdata(tas_priv->dev);
-	struct tas2781_hda_i2c_priv *hda_priv = tas_hda->hda_priv;
-	struct hda_codec *codec = tas_priv->codec;
-	int i, ret, spk_id;
-
-	pm_runtime_get_sync(tas_priv->dev);
-	mutex_lock(&tas_priv->codec_lock);
+	int i, ret;
 
-	ret = tasdevice_rca_parser(tas_priv, fmw);
-	if (ret)
-		goto out;
-
-	tas_hda->prof_ctl = snd_ctl_new1(&tas2781_prof_ctrl, tas_priv);
-	ret = snd_ctl_add(codec->card, tas_hda->prof_ctl);
-	if (ret) {
-		dev_err(tas_priv->dev,
-			"Failed to add KControl %s = %d\n",
-			tas2781_prof_ctrl.name, ret);
-		goto out;
-	}
-
-	for (i = 0; i < ARRAY_SIZE(tas2781_snd_controls); i++) {
-		hda_priv->snd_ctls[i] = snd_ctl_new1(&tas2781_snd_controls[i],
-			tas_priv);
-		ret = snd_ctl_add(codec->card, hda_priv->snd_ctls[i]);
+	for (i = 0; i < num_ctls; i++) {
+		ctls[i] = snd_ctl_new1(
+			&tas_snd_ctrls[i], tas_priv);
+		ret = snd_ctl_add(codec->card, ctls[i]);
 		if (ret) {
 			dev_err(tas_priv->dev,
 				"Failed to add KControl %s = %d\n",
-				tas2781_snd_controls[i].name, ret);
-			goto out;
+				tas_snd_ctrls[i].name, ret);
+			break;
 		}
 	}
+}
 
-	tasdevice_dsp_remove(tas_priv);
+static void tasdevice_dspfw_init(void *context)
+{
+	struct tasdevice_priv *tas_priv = context;
+	struct tas2781_hda *tas_hda = dev_get_drvdata(tas_priv->dev);
+	struct tas2781_hda_i2c_priv *hda_priv = tas_hda->hda_priv;
+	struct hda_codec *codec = tas_priv->codec;
+	int ret, spk_id;
 
+	tasdevice_dsp_remove(tas_priv);
 	tas_priv->fw_state = TASDEVICE_DSP_FW_PENDING;
 	if (tas_priv->speaker_id != NULL) {
 		// Speaker id need to be checked for ASUS only.
@@ -441,28 +450,12 @@ static void tasdev_fw_ready(const struct firmware *fmw, void *context)
 		dev_err(tas_priv->dev, "dspfw load %s error\n",
 			tas_priv->coef_binaryname);
 		tas_priv->fw_state = TASDEVICE_DSP_FW_FAIL;
-		goto out;
-	}
-
-	tas_hda->dsp_prog_ctl = snd_ctl_new1(&tas2781_dsp_prog_ctrl,
-		tas_priv);
-	ret = snd_ctl_add(codec->card, tas_hda->dsp_prog_ctl);
-	if (ret) {
-		dev_err(tas_priv->dev,
-			"Failed to add KControl %s = %d\n",
-			tas2781_dsp_prog_ctrl.name, ret);
-		goto out;
-	}
-
-	tas_hda->dsp_conf_ctl = snd_ctl_new1(&tas2781_dsp_conf_ctrl,
-		tas_priv);
-	ret = snd_ctl_add(codec->card, tas_hda->dsp_conf_ctl);
-	if (ret) {
-		dev_err(tas_priv->dev,
-			"Failed to add KControl %s = %d\n",
-			tas2781_dsp_conf_ctrl.name, ret);
-		goto out;
+		return;
 	}
+	tasdev_add_kcontrols(tas_priv, &tas_hda->dsp_prog_ctl, codec,
+			     &tasdevice_dsp_prog_ctrl, 1);
+	tasdev_add_kcontrols(tas_priv, &tas_hda->dsp_conf_ctl, codec,
+			     &tasdevice_dsp_conf_ctrl, 1);
 
 	tas_priv->fw_state = TASDEVICE_DSP_FW_ALL_OK;
 	tasdevice_prmg_load(tas_priv, 0);
@@ -475,9 +468,45 @@ static void tasdev_fw_ready(const struct firmware *fmw, void *context)
 	 * calibrated data inside algo.
 	 */
 	hda_priv->save_calibration(tas_hda);
+}
+
+static void tasdev_fw_ready(const struct firmware *fmw, void *context)
+{
+	struct tasdevice_priv *tas_priv = context;
+	struct tas2781_hda *tas_hda = dev_get_drvdata(tas_priv->dev);
+	struct tas2781_hda_i2c_priv *hda_priv = tas_hda->hda_priv;
+	struct hda_codec *codec = tas_priv->codec;
+	int ret;
+
+	pm_runtime_get_sync(tas_priv->dev);
+	mutex_lock(&tas_priv->codec_lock);
+
+	ret = tasdevice_rca_parser(tas_priv, fmw);
+	if (ret)
+		goto out;
 
-	tasdevice_tuning_switch(tas_hda->priv, 0);
-	tas_hda->priv->playback_started = true;
+	tas_priv->fw_state = TASDEVICE_RCA_FW_OK;
+	tasdev_add_kcontrols(tas_priv, &tas_hda->prof_ctl, codec,
+		&tasdevice_prof_ctrl, 1);
+
+	switch (hda_priv->hda_chip_id) {
+	case HDA_TAS2770:
+		tasdev_add_kcontrols(tas_priv, hda_priv->snd_ctls, codec,
+				     &tas2770_snd_controls[0],
+				     ARRAY_SIZE(tas2770_snd_controls));
+		break;
+	case HDA_TAS2781:
+		tasdev_add_kcontrols(tas_priv, hda_priv->snd_ctls, codec,
+				     &tas2781_snd_controls[0],
+				     ARRAY_SIZE(tas2781_snd_controls));
+		tasdevice_dspfw_init(context);
+		break;
+	case HDA_TAS2563:
+		tasdevice_dspfw_init(context);
+		break;
+	default:
+		break;
+	}
 
 out:
 	mutex_unlock(&tas_hda->priv->codec_lock);
@@ -581,16 +610,33 @@ static int tas2781_hda_i2c_probe(struct i2c_client *clt)
 		return -ENOMEM;
 
 	if (strstr(dev_name(&clt->dev), "TIAS2781")) {
+		/*
+		 * TAS2781, integrated on-chip DSP with
+		 * global I2C address supported.
+		 */
 		device_name = "TIAS2781";
+		hda_priv->hda_chip_id = HDA_TAS2781;
 		hda_priv->save_calibration = tas2781_save_calibration;
 		tas_hda->priv->global_addr = TAS2781_GLOBAL_ADDR;
+	} else if (strstarts(dev_name(&clt->dev), "i2c-TXNW2770")) {
+		/*
+		 * TAS2770, has no on-chip DSP, so no calibration data
+		 * required; has no global I2C address supported.
+		 */
+		device_name = "TXNW2770";
+		hda_priv->hda_chip_id = HDA_TAS2770;
 	} else if (strstarts(dev_name(&clt->dev),
 			     "i2c-TXNW2781:00-tas2781-hda.0")) {
 		device_name = "TXNW2781";
 		hda_priv->save_calibration = tas2781_save_calibration;
 		tas_hda->priv->global_addr = TAS2781_GLOBAL_ADDR;
 	} else if (strstr(dev_name(&clt->dev), "INT8866")) {
+		/*
+		 * TAS2563, integrated on-chip DSP with
+		 * global I2C address supported.
+		 */
 		device_name = "INT8866";
+		hda_priv->hda_chip_id = HDA_TAS2563;
 		hda_priv->save_calibration = tas2563_save_calibration;
 		tas_hda->priv->global_addr = TAS2563_GLOBAL_ADDR;
 	} else {
@@ -727,6 +773,7 @@ static const struct i2c_device_id tas2781_hda_i2c_id[] = {
 static const struct acpi_device_id tas2781_acpi_hda_match[] = {
 	{"INT8866", 0 },
 	{"TIAS2781", 0 },
+	{"TXNW2770", 0 },
 	{"TXNW2781", 0 },
 	{}
 };
-- 
cgit v1.2.3


From 9f0cb91767f582df6b17c1e2f22f684c36962295 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 22 Jul 2025 10:37:14 -0400
Subject: tracing: arm: arm64: Hide trace events ipi_raise, ipi_entry and
 ipi_exit

The ipi tracepoints are mostly generic, but the tracepoints ipi_raise,
ipi_entry and ipi_exit are only used by arm and arm64. This means these
trace events are wasting memory in all the other architectures that do not
use them.

Add CONFIG_HAVE_EXTRA_IPI_TRACEPOINTS and have arm and arm64 select it to
enable these trace events. The config makes it easy if other architectures
decide to trace these as well.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Will Deacon <will@kernel.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Nicolas Pitre <nico@fluxnic.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/20250722103714.64eba013@gandalf.local.home
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm/Kconfig           |  1 +
 arch/arm64/Kconfig         |  1 +
 include/trace/events/ipi.h | 58 ++++++++++++++++++++++++----------------------
 kernel/trace/Kconfig       |  6 +++++
 4 files changed, 38 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 3072731fe09c..e8f98217d0fc 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -102,6 +102,7 @@ config ARM
 	select HAVE_BUILDTIME_MCOUNT_SORT
 	select HAVE_DEBUG_KMEMLEAK if !XIP_KERNEL
 	select HAVE_DMA_CONTIGUOUS if MMU
+	select HAVE_EXTRA_IPI_TRACEPOINTS
 	select HAVE_DYNAMIC_FTRACE if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU
 	select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS if (CPU_V6 || CPU_V6K || CPU_V7) && MMU
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 393d71124f5d..a027cb81dfcc 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -136,6 +136,7 @@ config ARM64
 	select CPU_PM if (SUSPEND || CPU_IDLE)
 	select CPUMASK_OFFSTACK if NR_CPUS > 256
 	select DCACHE_WORD_ACCESS
+	select HAVE_EXTRA_IPI_TRACEPOINTS
 	select DYNAMIC_FTRACE if FUNCTION_TRACER
 	select DMA_BOUNCE_UNALIGNED_KMALLOC
 	select DMA_DIRECT_REMAP
diff --git a/include/trace/events/ipi.h b/include/trace/events/ipi.h
index 3de9bfc982ce..9912f0ded81d 100644
--- a/include/trace/events/ipi.h
+++ b/include/trace/events/ipi.h
@@ -7,34 +7,6 @@
 
 #include <linux/tracepoint.h>
 
-/**
- * ipi_raise - called when a smp cross call is made
- *
- * @mask: mask of recipient CPUs for the IPI
- * @reason: string identifying the IPI purpose
- *
- * It is necessary for @reason to be a static string declared with
- * __tracepoint_string.
- */
-TRACE_EVENT(ipi_raise,
-
-	TP_PROTO(const struct cpumask *mask, const char *reason),
-
-	TP_ARGS(mask, reason),
-
-	TP_STRUCT__entry(
-		__bitmask(target_cpus, nr_cpumask_bits)
-		__field(const char *, reason)
-	),
-
-	TP_fast_assign(
-		__assign_bitmask(target_cpus, cpumask_bits(mask), nr_cpumask_bits);
-		__entry->reason = reason;
-	),
-
-	TP_printk("target_mask=%s (%s)", __get_bitmask(target_cpus), __entry->reason)
-);
-
 TRACE_EVENT(ipi_send_cpu,
 
 	TP_PROTO(const unsigned int cpu, unsigned long callsite, void *callback),
@@ -79,6 +51,35 @@ TRACE_EVENT(ipi_send_cpumask,
 		  __get_cpumask(cpumask), __entry->callsite, __entry->callback)
 );
 
+#ifdef CONFIG_HAVE_EXTRA_IPI_TRACEPOINTS
+/**
+ * ipi_raise - called when a smp cross call is made
+ *
+ * @mask: mask of recipient CPUs for the IPI
+ * @reason: string identifying the IPI purpose
+ *
+ * It is necessary for @reason to be a static string declared with
+ * __tracepoint_string.
+ */
+TRACE_EVENT(ipi_raise,
+
+	TP_PROTO(const struct cpumask *mask, const char *reason),
+
+	TP_ARGS(mask, reason),
+
+	TP_STRUCT__entry(
+		__bitmask(target_cpus, nr_cpumask_bits)
+		__field(const char *, reason)
+	),
+
+	TP_fast_assign(
+		__assign_bitmask(target_cpus, cpumask_bits(mask), nr_cpumask_bits);
+		__entry->reason = reason;
+	),
+
+	TP_printk("target_mask=%s (%s)", __get_bitmask(target_cpus), __entry->reason)
+);
+
 DECLARE_EVENT_CLASS(ipi_handler,
 
 	TP_PROTO(const char *reason),
@@ -127,6 +128,7 @@ DEFINE_EVENT(ipi_handler, ipi_exit,
 
 	TP_ARGS(reason)
 );
+#endif /* CONFIG_HAVE_EXTRA_IPI_TRACEPOINTS */
 
 #endif /* _TRACE_IPI_H */
 
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index a3f35c7d83b6..35448f7233fe 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -53,6 +53,12 @@ config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
 config HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS
 	bool
 
+config HAVE_EXTRA_IPI_TRACEPOINTS
+	bool
+	help
+	 For architectures that use ipi_raise, ipi_entry and ipi_exit
+	 tracepoints.
+
 config HAVE_DYNAMIC_FTRACE_WITH_ARGS
 	bool
 	help
-- 
cgit v1.2.3


From 1bbdb81a98363fd5cd0c2ac16ad5346bdf814dff Mon Sep 17 00:00:00 2001
From: Carolina Jubran <cjubran@nvidia.com>
Date: Tue, 22 Jul 2025 12:13:29 +0300
Subject: devlink: Fix excessive stack usage in rate TC bandwidth parsing

The devlink_nl_rate_tc_bw_parse function uses a large stack array for
devlink attributes, which triggers a warning about excessive stack
usage:

net/devlink/rate.c: In function 'devlink_nl_rate_tc_bw_parse':
net/devlink/rate.c:382:1: error: the frame size of 1648 bytes is larger than 1536 bytes [-Werror=frame-larger-than=]

Introduce a separate attribute set specifically for rate TC bandwidth
parsing that only contains the two attributes actually used: index
and bandwidth. This reduces the stack array from DEVLINK_ATTR_MAX
entries to just 2 entries, solving the stack usage issue.

Update devlink selftest to use the new 'index' and 'bw' attribute names
consistent with the YAML spec.

Example usage with ynl with the new spec:

    ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/devlink.yaml \
      --do rate-set --json '{
      "bus-name": "pci",
      "dev-name": "0000:08:00.0",
      "port-index": 1,
      "rate-tc-bws": [
        {"index": 0, "bw": 50},
        {"index": 1, "bw": 50},
        {"index": 2, "bw": 0},
        {"index": 3, "bw": 0},
        {"index": 4, "bw": 0},
        {"index": 5, "bw": 0},
        {"index": 6, "bw": 0},
        {"index": 7, "bw": 0}
      ]
    }'

    ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/devlink.yaml \
      --do rate-get --json '{
      "bus-name": "pci",
      "dev-name": "0000:08:00.0",
      "port-index": 1
    }'

    output for rate-get:
    {'bus-name': 'pci',
     'dev-name': '0000:08:00.0',
     'port-index': 1,
     'rate-tc-bws': [{'bw': 50, 'index': 0},
                     {'bw': 50, 'index': 1},
                     {'bw': 0, 'index': 2},
                     {'bw': 0, 'index': 3},
                     {'bw': 0, 'index': 4},
                     {'bw': 0, 'index': 5},
                     {'bw': 0, 'index': 6},
                     {'bw': 0, 'index': 7}],
     'rate-tx-max': 0,
     'rate-tx-priority': 0,
     'rate-tx-share': 0,
     'rate-tx-weight': 0,
     'rate-type': 'leaf'}

Fixes: 566e8f108fc7 ("devlink: Extend devlink rate API with traffic classes bandwidth management")
Reported-by: Arnd Bergmann <arnd@arndb.de>
Closes: https://lore.kernel.org/netdev/20250708160652.1810573-1-arnd@kernel.org/
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202507171943.W7DJcs6Y-lkp@intel.com/
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Carolina Jubran <cjubran@nvidia.com>
Tested-by: Carolina Jubran <cjubran@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://patch.msgid.link/1753175609-330621-1-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/devlink.yaml           | 26 +++++++++-------------
 include/uapi/linux/devlink.h                       | 11 +++++++--
 net/devlink/netlink_gen.c                          |  6 ++---
 net/devlink/netlink_gen.h                          |  2 +-
 net/devlink/rate.c                                 | 20 ++++++++---------
 .../selftests/drivers/net/hw/devlink_rate_tc_bw.py | 16 ++++++-------
 6 files changed, 42 insertions(+), 39 deletions(-)

(limited to 'include')

diff --git a/Documentation/netlink/specs/devlink.yaml b/Documentation/netlink/specs/devlink.yaml
index 1c4bb0cbe5f0..bb87111d5e16 100644
--- a/Documentation/netlink/specs/devlink.yaml
+++ b/Documentation/netlink/specs/devlink.yaml
@@ -853,18 +853,6 @@ attribute-sets:
         type: nest
         multi-attr: true
         nested-attributes: dl-rate-tc-bws
-      -
-        name: rate-tc-index
-        type: u8
-        checks:
-          max: rate-tc-index-max
-      -
-        name: rate-tc-bw
-        type: u32
-        doc: |
-             Specifies the bandwidth share assigned to the Traffic Class.
-             The bandwidth for the traffic class is determined
-             in proportion to the sum of the shares of all configured classes.
   -
     name: dl-dev-stats
     subset-of: devlink
@@ -1271,12 +1259,20 @@ attribute-sets:
         type: flag
   -
     name: dl-rate-tc-bws
-    subset-of: devlink
+    name-prefix: devlink-rate-tc-attr-
     attributes:
       -
-        name: rate-tc-index
+        name: index
+        type: u8
+        checks:
+          max: rate-tc-index-max
       -
-        name: rate-tc-bw
+        name: bw
+        type: u32
+        doc: |
+             Specifies the bandwidth share assigned to the Traffic Class.
+             The bandwidth for the traffic class is determined
+             in proportion to the sum of the shares of all configured classes.
 
 operations:
   enum-model: directional
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index e72bcc239afd..9fcb25a0f447 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -635,8 +635,6 @@ enum devlink_attr {
 	DEVLINK_ATTR_REGION_DIRECT,		/* flag */
 
 	DEVLINK_ATTR_RATE_TC_BWS,		/* nested */
-	DEVLINK_ATTR_RATE_TC_INDEX,		/* u8 */
-	DEVLINK_ATTR_RATE_TC_BW,		/* u32 */
 
 	/* Add new attributes above here, update the spec in
 	 * Documentation/netlink/specs/devlink.yaml and re-generate
@@ -647,6 +645,15 @@ enum devlink_attr {
 	DEVLINK_ATTR_MAX = __DEVLINK_ATTR_MAX - 1
 };
 
+enum devlink_rate_tc_attr {
+	DEVLINK_RATE_TC_ATTR_UNSPEC,
+	DEVLINK_RATE_TC_ATTR_INDEX,		/* u8 */
+	DEVLINK_RATE_TC_ATTR_BW,		/* u32 */
+
+	__DEVLINK_RATE_TC_ATTR_MAX,
+	DEVLINK_RATE_TC_ATTR_MAX = __DEVLINK_RATE_TC_ATTR_MAX - 1
+};
+
 /* Mapping between internal resource described by the field and system
  * structure
  */
diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c
index c50436433c18..d97c326a9045 100644
--- a/net/devlink/netlink_gen.c
+++ b/net/devlink/netlink_gen.c
@@ -45,9 +45,9 @@ const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_
 	[DEVLINK_PORT_FN_ATTR_CAPS] = NLA_POLICY_BITFIELD32(15),
 };
 
-const struct nla_policy devlink_dl_rate_tc_bws_nl_policy[DEVLINK_ATTR_RATE_TC_BW + 1] = {
-	[DEVLINK_ATTR_RATE_TC_INDEX] = NLA_POLICY_MAX(NLA_U8, DEVLINK_RATE_TC_INDEX_MAX),
-	[DEVLINK_ATTR_RATE_TC_BW] = { .type = NLA_U32, },
+const struct nla_policy devlink_dl_rate_tc_bws_nl_policy[DEVLINK_RATE_TC_ATTR_BW + 1] = {
+	[DEVLINK_RATE_TC_ATTR_INDEX] = NLA_POLICY_MAX(NLA_U8, DEVLINK_RATE_TC_INDEX_MAX),
+	[DEVLINK_RATE_TC_ATTR_BW] = { .type = NLA_U32, },
 };
 
 const struct nla_policy devlink_dl_selftest_id_nl_policy[DEVLINK_ATTR_SELFTEST_ID_FLASH + 1] = {
diff --git a/net/devlink/netlink_gen.h b/net/devlink/netlink_gen.h
index fb733b5d4ff1..09cc6f264ccf 100644
--- a/net/devlink/netlink_gen.h
+++ b/net/devlink/netlink_gen.h
@@ -13,7 +13,7 @@
 
 /* Common nested types */
 extern const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_CAPS + 1];
-extern const struct nla_policy devlink_dl_rate_tc_bws_nl_policy[DEVLINK_ATTR_RATE_TC_BW + 1];
+extern const struct nla_policy devlink_dl_rate_tc_bws_nl_policy[DEVLINK_RATE_TC_ATTR_BW + 1];
 extern const struct nla_policy devlink_dl_selftest_id_nl_policy[DEVLINK_ATTR_SELFTEST_ID_FLASH + 1];
 
 /* Ops table for devlink */
diff --git a/net/devlink/rate.c b/net/devlink/rate.c
index d39300a9b3d4..110b3fa8a0b1 100644
--- a/net/devlink/rate.c
+++ b/net/devlink/rate.c
@@ -90,8 +90,8 @@ static int devlink_rate_put_tc_bws(struct sk_buff *msg, u32 *tc_bw)
 		if (!nla_tc_bw)
 			return -EMSGSIZE;
 
-		if (nla_put_u8(msg, DEVLINK_ATTR_RATE_TC_INDEX, i) ||
-		    nla_put_u32(msg, DEVLINK_ATTR_RATE_TC_BW, tc_bw[i]))
+		if (nla_put_u8(msg, DEVLINK_RATE_TC_ATTR_INDEX, i) ||
+		    nla_put_u32(msg, DEVLINK_RATE_TC_ATTR_BW, tc_bw[i]))
 			goto nla_put_failure;
 
 		nla_nest_end(msg, nla_tc_bw);
@@ -346,26 +346,26 @@ static int devlink_nl_rate_tc_bw_parse(struct nlattr *parent_nest, u32 *tc_bw,
 				       unsigned long *bitmap,
 				       struct netlink_ext_ack *extack)
 {
-	struct nlattr *tb[DEVLINK_ATTR_MAX + 1];
+	struct nlattr *tb[DEVLINK_RATE_TC_ATTR_MAX + 1];
 	u8 tc_index;
 	int err;
 
-	err = nla_parse_nested(tb, DEVLINK_ATTR_MAX, parent_nest,
+	err = nla_parse_nested(tb, DEVLINK_RATE_TC_ATTR_MAX, parent_nest,
 			       devlink_dl_rate_tc_bws_nl_policy, extack);
 	if (err)
 		return err;
 
-	if (!tb[DEVLINK_ATTR_RATE_TC_INDEX]) {
+	if (!tb[DEVLINK_RATE_TC_ATTR_INDEX]) {
 		NL_SET_ERR_ATTR_MISS(extack, parent_nest,
-				     DEVLINK_ATTR_RATE_TC_INDEX);
+				     DEVLINK_RATE_TC_ATTR_INDEX);
 		return -EINVAL;
 	}
 
-	tc_index = nla_get_u8(tb[DEVLINK_ATTR_RATE_TC_INDEX]);
+	tc_index = nla_get_u8(tb[DEVLINK_RATE_TC_ATTR_INDEX]);
 
-	if (!tb[DEVLINK_ATTR_RATE_TC_BW]) {
+	if (!tb[DEVLINK_RATE_TC_ATTR_BW]) {
 		NL_SET_ERR_ATTR_MISS(extack, parent_nest,
-				     DEVLINK_ATTR_RATE_TC_BW);
+				     DEVLINK_RATE_TC_ATTR_BW);
 		return -EINVAL;
 	}
 
@@ -376,7 +376,7 @@ static int devlink_nl_rate_tc_bw_parse(struct nlattr *parent_nest, u32 *tc_bw,
 		return -EINVAL;
 	}
 
-	tc_bw[tc_index] = nla_get_u32(tb[DEVLINK_ATTR_RATE_TC_BW]);
+	tc_bw[tc_index] = nla_get_u32(tb[DEVLINK_RATE_TC_ATTR_BW]);
 
 	return 0;
 }
diff --git a/tools/testing/selftests/drivers/net/hw/devlink_rate_tc_bw.py b/tools/testing/selftests/drivers/net/hw/devlink_rate_tc_bw.py
index 820d8a03becc..835c357919a8 100755
--- a/tools/testing/selftests/drivers/net/hw/devlink_rate_tc_bw.py
+++ b/tools/testing/selftests/drivers/net/hw/devlink_rate_tc_bw.py
@@ -208,14 +208,14 @@ def setup_devlink_rate(cfg):
             "port-index": port_index,
             "rate-tx-max": 125000000,
             "rate-tc-bws": [
-                {"rate-tc-index": 0, "rate-tc-bw": 0},
-                {"rate-tc-index": 1, "rate-tc-bw": 0},
-                {"rate-tc-index": 2, "rate-tc-bw": 0},
-                {"rate-tc-index": 3, "rate-tc-bw": 20},
-                {"rate-tc-index": 4, "rate-tc-bw": 80},
-                {"rate-tc-index": 5, "rate-tc-bw": 0},
-                {"rate-tc-index": 6, "rate-tc-bw": 0},
-                {"rate-tc-index": 7, "rate-tc-bw": 0},
+                {"index": 0, "bw": 0},
+                {"index": 1, "bw": 0},
+                {"index": 2, "bw": 0},
+                {"index": 3, "bw": 20},
+                {"index": 4, "bw": 80},
+                {"index": 5, "bw": 0},
+                {"index": 6, "bw": 0},
+                {"index": 7, "bw": 0},
             ]
         })
     except NlError as exc:
-- 
cgit v1.2.3


From f3d85c9ee51036ac7ed129ec16eef5df2192763e Mon Sep 17 00:00:00 2001
From: Byungchul Park <byungchul@sk.com>
Date: Mon, 21 Jul 2025 11:18:24 +0900
Subject: netmem: introduce struct netmem_desc mirroring struct page
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To simplify struct page, the page pool members of struct page should be
moved elsewhere, allowing these members to be removed from struct page.

Introduce a network memory descriptor, struct netmem_desc, to store
these members, and union it with the existing fields in struct net_iov,
which allows the fields of struct net_iov to be organized.

Signed-off-by: Byungchul Park <byungchul@sk.com>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Reviewed-by: Mina Almasry <almasrymina@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Harry Yoo <harry.yoo@oracle.com>
Link: https://patch.msgid.link/20250721021835.63939-2-byungchul@sk.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/netmem.h | 116 +++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 95 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/net/netmem.h b/include/net/netmem.h
index de1d95f04076..535cf17b9134 100644
--- a/include/net/netmem.h
+++ b/include/net/netmem.h
@@ -12,6 +12,50 @@
 #include <linux/mm.h>
 #include <net/net_debug.h>
 
+/* These fields in struct page are used by the page_pool and net stack:
+ *
+ *        struct {
+ *                unsigned long pp_magic;
+ *                struct page_pool *pp;
+ *                unsigned long _pp_mapping_pad;
+ *                unsigned long dma_addr;
+ *                atomic_long_t pp_ref_count;
+ *        };
+ *
+ * We mirror the page_pool fields here so the page_pool can access these
+ * fields without worrying whether the underlying fields belong to a
+ * page or netmem_desc.
+ *
+ * CAUTION: Do not update the fields in netmem_desc without also
+ * updating the anonymous aliasing union in struct net_iov.
+ */
+struct netmem_desc {
+	unsigned long _flags;
+	unsigned long pp_magic;
+	struct page_pool *pp;
+	unsigned long _pp_mapping_pad;
+	unsigned long dma_addr;
+	atomic_long_t pp_ref_count;
+};
+
+#define NETMEM_DESC_ASSERT_OFFSET(pg, desc)        \
+	static_assert(offsetof(struct page, pg) == \
+		      offsetof(struct netmem_desc, desc))
+NETMEM_DESC_ASSERT_OFFSET(flags, _flags);
+NETMEM_DESC_ASSERT_OFFSET(pp_magic, pp_magic);
+NETMEM_DESC_ASSERT_OFFSET(pp, pp);
+NETMEM_DESC_ASSERT_OFFSET(_pp_mapping_pad, _pp_mapping_pad);
+NETMEM_DESC_ASSERT_OFFSET(dma_addr, dma_addr);
+NETMEM_DESC_ASSERT_OFFSET(pp_ref_count, pp_ref_count);
+#undef NETMEM_DESC_ASSERT_OFFSET
+
+/*
+ * Since struct netmem_desc uses the space in struct page, the size
+ * should be checked, until struct netmem_desc has its own instance from
+ * slab, to avoid conflicting with other members within struct page.
+ */
+static_assert(sizeof(struct netmem_desc) <= offsetof(struct page, _refcount));
+
 /* net_iov */
 
 DECLARE_STATIC_KEY_FALSE(page_pool_mem_providers);
@@ -30,13 +74,48 @@ enum net_iov_type {
 	NET_IOV_MAX = ULONG_MAX
 };
 
+/* A memory descriptor representing abstract networking I/O vectors,
+ * generally for non-pages memory that doesn't have its corresponding
+ * struct page and needs to be explicitly allocated through slab.
+ *
+ * net_iovs are allocated and used by networking code, and the size of
+ * the chunk is PAGE_SIZE.
+ *
+ * This memory can be any form of non-struct paged memory.  Examples
+ * include imported dmabuf memory and imported io_uring memory.  See
+ * net_iov_type for all the supported types.
+ *
+ * @pp_magic:	pp field, similar to the one in struct page/struct
+ *		netmem_desc.
+ * @pp:		the pp this net_iov belongs to, if any.
+ * @dma_addr:	the dma addrs of the net_iov. Needed for the network
+ *		card to send/receive this net_iov.
+ * @pp_ref_count: the pp ref count of this net_iov, exactly the same
+ *		usage as struct page/struct netmem_desc.
+ * @owner:	the net_iov_area this net_iov belongs to, if any.
+ * @type:	the type of the memory.  Different types of net_iovs are
+ *		supported.
+ */
 struct net_iov {
-	enum net_iov_type type;
-	unsigned long pp_magic;
-	struct page_pool *pp;
+	union {
+		struct netmem_desc desc;
+
+		/* XXX: The following part should be removed once all
+		 * the references to them are converted so as to be
+		 * accessed via netmem_desc e.g. niov->desc.pp instead
+		 * of niov->pp.
+		 */
+		struct {
+			unsigned long _flags;
+			unsigned long pp_magic;
+			struct page_pool *pp;
+			unsigned long _pp_mapping_pad;
+			unsigned long dma_addr;
+			atomic_long_t pp_ref_count;
+		};
+	};
 	struct net_iov_area *owner;
-	unsigned long dma_addr;
-	atomic_long_t pp_ref_count;
+	enum net_iov_type type;
 };
 
 struct net_iov_area {
@@ -48,27 +127,22 @@ struct net_iov_area {
 	unsigned long base_virtual;
 };
 
-/* These fields in struct page are used by the page_pool and net stack:
+/* net_iov is union'ed with struct netmem_desc mirroring struct page, so
+ * the page_pool can access these fields without worrying whether the
+ * underlying fields are accessed via netmem_desc or directly via
+ * net_iov, until all the references to them are converted so as to be
+ * accessed via netmem_desc e.g. niov->desc.pp instead of niov->pp.
  *
- *        struct {
- *                unsigned long pp_magic;
- *                struct page_pool *pp;
- *                unsigned long _pp_mapping_pad;
- *                unsigned long dma_addr;
- *                atomic_long_t pp_ref_count;
- *        };
- *
- * We mirror the page_pool fields here so the page_pool can access these fields
- * without worrying whether the underlying fields belong to a page or net_iov.
- *
- * The non-net stack fields of struct page are private to the mm stack and must
- * never be mirrored to net_iov.
+ * The non-net stack fields of struct page are private to the mm stack
+ * and must never be mirrored to net_iov.
  */
-#define NET_IOV_ASSERT_OFFSET(pg, iov)             \
-	static_assert(offsetof(struct page, pg) == \
+#define NET_IOV_ASSERT_OFFSET(desc, iov)                    \
+	static_assert(offsetof(struct netmem_desc, desc) == \
 		      offsetof(struct net_iov, iov))
+NET_IOV_ASSERT_OFFSET(_flags, _flags);
 NET_IOV_ASSERT_OFFSET(pp_magic, pp_magic);
 NET_IOV_ASSERT_OFFSET(pp, pp);
+NET_IOV_ASSERT_OFFSET(_pp_mapping_pad, _pp_mapping_pad);
 NET_IOV_ASSERT_OFFSET(dma_addr, dma_addr);
 NET_IOV_ASSERT_OFFSET(pp_ref_count, pp_ref_count);
 #undef NET_IOV_ASSERT_OFFSET
-- 
cgit v1.2.3


From 38a436d4e26487e16ac6c1de17c030b1bef84d83 Mon Sep 17 00:00:00 2001
From: Byungchul Park <byungchul@sk.com>
Date: Mon, 21 Jul 2025 11:18:25 +0900
Subject: netmem: use netmem_desc instead of page to access ->pp in
 __netmem_get_pp()

To eliminate the use of the page pool fields in struct page, the page
pool code should use netmem descriptor and APIs instead.

However, __netmem_get_pp() still accesses ->pp via struct page.  So
change it to use struct netmem_desc instead, since ->pp will no longer
be available in struct page.

While at it, add a helper, __netmem_to_nmdesc(), that can be used to
unsafely get a pointer to the netmem_desc backing the netmem_ref, only
when the netmem_ref is always backed by system memory.

Signed-off-by: Byungchul Park <byungchul@sk.com>
Link: https://patch.msgid.link/20250721021835.63939-3-byungchul@sk.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/netmem.h | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/netmem.h b/include/net/netmem.h
index 535cf17b9134..097bc74d9555 100644
--- a/include/net/netmem.h
+++ b/include/net/netmem.h
@@ -247,6 +247,24 @@ static inline unsigned long netmem_pfn_trace(netmem_ref netmem)
 	return page_to_pfn(netmem_to_page(netmem));
 }
 
+/**
+ * __netmem_to_nmdesc - unsafely get pointer to the &netmem_desc backing
+ * @netmem
+ * @netmem: netmem reference to convert
+ *
+ * Unsafe version that can be used only when @netmem is always backed by
+ * system memory, performs faster and generates smaller object code (no
+ * check for the LSB, no WARN). When @netmem points to IOV, provokes
+ * undefined behaviour.
+ *
+ * Return: pointer to the &netmem_desc (garbage if @netmem is not backed
+ * by system memory).
+ */
+static inline struct netmem_desc *__netmem_to_nmdesc(netmem_ref netmem)
+{
+	return (__force struct netmem_desc *)netmem;
+}
+
 /* __netmem_clear_lsb - convert netmem_ref to struct net_iov * for access to
  * common fields.
  * @netmem: netmem reference to extract as net_iov.
@@ -280,7 +298,7 @@ static inline struct net_iov *__netmem_clear_lsb(netmem_ref netmem)
  */
 static inline struct page_pool *__netmem_get_pp(netmem_ref netmem)
 {
-	return __netmem_to_page(netmem)->pp;
+	return __netmem_to_nmdesc(netmem)->pp;
 }
 
 static inline struct page_pool *netmem_get_pp(netmem_ref netmem)
-- 
cgit v1.2.3


From 89ade7c7306508f46b811cd43960eaed88e0e1dd Mon Sep 17 00:00:00 2001
From: Byungchul Park <byungchul@sk.com>
Date: Mon, 21 Jul 2025 11:18:26 +0900
Subject: netmem, mlx4: access ->pp_ref_count through netmem_desc instead of
 page

To eliminate the use of struct page in page pool, the page pool users
should use netmem descriptor and APIs instead.

Make mlx4 access ->pp_ref_count through netmem_desc instead of page.

While at it, add helpers, pp_page_to_nmdesc() and __pp_page_to_nmdesc(),
that can be used to get netmem_desc from page only if it's a pp page.
For now, since netmem_desc overlays struct page, this can be achieved by
just casting; a macro with _Generic is used to cover const casting as well.

Plus, change page_pool_page_is_pp() to check for 'const struct page *'
instead of 'struct page *' since it doesn't modify data and additionally
covers const type.

Signed-off-by: Byungchul Park <byungchul@sk.com>
Link: https://patch.msgid.link/20250721021835.63939-4-byungchul@sk.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx4/en_rx.c |  4 +++-
 include/linux/mm.h                         |  4 ++--
 include/net/netmem.h                       | 17 +++++++++++++++++
 3 files changed, 22 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index b33285d755b9..92a16ddb7d86 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -460,9 +460,11 @@ static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
 
 		truesize += frag_info->frag_stride;
 		if (frag_info->frag_stride == PAGE_SIZE / 2) {
+			struct netmem_desc *desc = pp_page_to_nmdesc(page);
+
 			frags->page_offset ^= PAGE_SIZE / 2;
 			release = page_count(page) != 1 ||
-				  atomic_long_read(&page->pp_ref_count) != 1 ||
+				  atomic_long_read(&desc->pp_ref_count) != 1 ||
 				  page_is_pfmemalloc(page) ||
 				  page_to_nid(page) != numa_mem_id();
 		} else if (!priv->rx_headroom) {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index fa538feaa8d9..ae50c1641bed 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4178,12 +4178,12 @@ int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status);
 #define PP_MAGIC_MASK ~(PP_DMA_INDEX_MASK | 0x3UL)
 
 #ifdef CONFIG_PAGE_POOL
-static inline bool page_pool_page_is_pp(struct page *page)
+static inline bool page_pool_page_is_pp(const struct page *page)
 {
 	return (page->pp_magic & PP_MAGIC_MASK) == PP_SIGNATURE;
 }
 #else
-static inline bool page_pool_page_is_pp(struct page *page)
+static inline bool page_pool_page_is_pp(const struct page *page)
 {
 	return false;
 }
diff --git a/include/net/netmem.h b/include/net/netmem.h
index 097bc74d9555..f7dacc9e75fd 100644
--- a/include/net/netmem.h
+++ b/include/net/netmem.h
@@ -285,6 +285,23 @@ static inline struct net_iov *__netmem_clear_lsb(netmem_ref netmem)
 	return (struct net_iov *)((__force unsigned long)netmem & ~NET_IOV);
 }
 
+/* XXX: How to extract netmem_desc from page must be changed, once
+ * netmem_desc no longer overlays on page and will be allocated through
+ * slab.
+ */
+#define __pp_page_to_nmdesc(p)	(_Generic((p),				\
+	const struct page * :	(const struct netmem_desc *)(p),	\
+	struct page * :		(struct netmem_desc *)(p)))
+
+/* CAUTION: Check if the page is a pp page before calling this helper or
+ * know it's a pp page.
+ */
+#define pp_page_to_nmdesc(p)						\
+({									\
+	DEBUG_NET_WARN_ON_ONCE(!page_pool_page_is_pp(p));		\
+	__pp_page_to_nmdesc(p);						\
+})
+
 /**
  * __netmem_get_pp - unsafely get pointer to the &page_pool backing @netmem
  * @netmem: netmem reference to get the pointer from
-- 
cgit v1.2.3


From 9dfd871a3e2ed433d5fee519b90b7e619b972043 Mon Sep 17 00:00:00 2001
From: Byungchul Park <byungchul@sk.com>
Date: Mon, 21 Jul 2025 11:18:35 +0900
Subject: libeth: xdp: access ->pp through netmem_desc instead of page

To eliminate the use of struct page in page pool, the page pool users
should use netmem descriptor and APIs instead.

Make xdp access ->pp through netmem_desc instead of page.

Signed-off-by: Byungchul Park <byungchul@sk.com>
Link: https://patch.msgid.link/20250721021835.63939-13-byungchul@sk.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/libeth/xdp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h
index 6ce6aec6884c..f4880b50e804 100644
--- a/include/net/libeth/xdp.h
+++ b/include/net/libeth/xdp.h
@@ -1292,7 +1292,7 @@ static inline void libeth_xdp_prepare_buff(struct libeth_xdp_buff *xdp,
 	xdp_init_buff(&xdp->base, fqe->truesize, xdp->base.rxq);
 #endif
 	xdp_prepare_buff(&xdp->base, page_address(page) + fqe->offset,
-			 page->pp->p.offset, len, true);
+			 pp_page_to_nmdesc(page)->pp->p.offset, len, true);
 }
 
 /**
-- 
cgit v1.2.3


From 320d031ad6e4d67e8e1ab08ac71efda02bc85683 Mon Sep 17 00:00:00 2001
From: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
Date: Tue, 22 Jul 2025 11:59:10 +0200
Subject: sched: Struct definition and parsing of dualpi2 qdisc

DualPI2 is the reference implementation of IETF RFC9332 DualQ Coupled
AQM (https://datatracker.ietf.org/doc/html/rfc9332) providing two
queues called low latency (L-queue) and classic (C-queue). By default,
it enqueues non-ECN and ECT(0) packets into the C-queue and ECT(1) and
CE packets into the low latency queue (L-queue), as per IETF RFC9332 spec.

This patch defines the dualpi2 Qdisc structure and parsing, and the
following two patches include dumping and enqueue/dequeue for the DualPI2.

Signed-off-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
Link: https://patch.msgid.link/20250722095915.24485-2-chia-yu.chang@nokia-bell-labs.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/pkt_sched.h |  53 ++++
 net/sched/sch_dualpi2.c        | 591 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 644 insertions(+)
 create mode 100644 net/sched/sch_dualpi2.c

(limited to 'include')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 3e41349f3fa2..75d685ea8368 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -1211,4 +1211,57 @@ enum {
 
 #define TCA_ETS_MAX (__TCA_ETS_MAX - 1)
 
+/* DUALPI2 */
+enum tc_dualpi2_drop_overload {
+	TC_DUALPI2_DROP_OVERLOAD_OVERFLOW = 0,
+	TC_DUALPI2_DROP_OVERLOAD_DROP = 1,
+	__TCA_DUALPI2_DROP_OVERLOAD_MAX,
+};
+#define TCA_DUALPI2_DROP_OVERLOAD_MAX (__TCA_DUALPI2_DROP_OVERLOAD_MAX - 1)
+
+enum tc_dualpi2_drop_early {
+	TC_DUALPI2_DROP_EARLY_DROP_DEQUEUE = 0,
+	TC_DUALPI2_DROP_EARLY_DROP_ENQUEUE = 1,
+	__TCA_DUALPI2_DROP_EARLY_MAX,
+};
+#define TCA_DUALPI2_DROP_EARLY_MAX (__TCA_DUALPI2_DROP_EARLY_MAX - 1)
+
+enum tc_dualpi2_ecn_mask {
+	TC_DUALPI2_ECN_MASK_L4S_ECT = 1,
+	TC_DUALPI2_ECN_MASK_CLA_ECT = 2,
+	TC_DUALPI2_ECN_MASK_ANY_ECT = 3,
+	__TCA_DUALPI2_ECN_MASK_MAX,
+};
+#define TCA_DUALPI2_ECN_MASK_MAX (__TCA_DUALPI2_ECN_MASK_MAX - 1)
+
+enum tc_dualpi2_split_gso {
+	TC_DUALPI2_SPLIT_GSO_NO_SPLIT_GSO = 0,
+	TC_DUALPI2_SPLIT_GSO_SPLIT_GSO = 1,
+	__TCA_DUALPI2_SPLIT_GSO_MAX,
+};
+#define TCA_DUALPI2_SPLIT_GSO_MAX (__TCA_DUALPI2_SPLIT_GSO_MAX - 1)
+
+enum {
+	TCA_DUALPI2_UNSPEC,
+	TCA_DUALPI2_LIMIT,		/* Packets */
+	TCA_DUALPI2_MEMORY_LIMIT,	/* Bytes */
+	TCA_DUALPI2_TARGET,		/* us */
+	TCA_DUALPI2_TUPDATE,		/* us */
+	TCA_DUALPI2_ALPHA,		/* Hz scaled up by 256 */
+	TCA_DUALPI2_BETA,		/* Hz scaled up by 256 */
+	TCA_DUALPI2_STEP_THRESH_PKTS,	/* Step threshold in packets */
+	TCA_DUALPI2_STEP_THRESH_US,	/* Step threshold in microseconds */
+	TCA_DUALPI2_MIN_QLEN_STEP,	/* Minimum qlen to apply STEP_THRESH */
+	TCA_DUALPI2_COUPLING,		/* Coupling factor between queues */
+	TCA_DUALPI2_DROP_OVERLOAD,	/* Whether to drop on overload */
+	TCA_DUALPI2_DROP_EARLY,		/* Whether to drop on enqueue */
+	TCA_DUALPI2_C_PROTECTION,	/* Percentage */
+	TCA_DUALPI2_ECN_MASK,		/* L4S queue classification mask */
+	TCA_DUALPI2_SPLIT_GSO,		/* Split GSO packets at enqueue */
+	TCA_DUALPI2_PAD,
+	__TCA_DUALPI2_MAX
+};
+
+#define TCA_DUALPI2_MAX   (__TCA_DUALPI2_MAX - 1)
+
 #endif
diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c
new file mode 100644
index 000000000000..c11ec66786d4
--- /dev/null
+++ b/net/sched/sch_dualpi2.c
@@ -0,0 +1,591 @@
+// SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+/* Copyright (C) 2024 Nokia
+ *
+ * Author: Koen De Schepper <koen.de_schepper@nokia-bell-labs.com>
+ * Author: Olga Albisser <olga@albisser.org>
+ * Author: Henrik Steen <henrist@henrist.net>
+ * Author: Olivier Tilmans <olivier.tilmans@nokia.com>
+ * Author: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
+ *
+ * DualPI Improved with a Square (dualpi2):
+ * - Supports congestion controls that comply with the Prague requirements
+ *   in RFC9331 (e.g. TCP-Prague)
+ * - Supports coupled dual-queue with PI2 as defined in RFC9332
+ * - Supports ECN L4S-identifier (IP.ECN==0b*1)
+ *
+ * note: Although DCTCP and BBRv3 can use shallow-threshold ECN marks,
+ *   they do not meet the 'Prague L4S Requirements' listed in RFC 9331
+ *   Section 4, so they can only be used with DualPI2 in a datacenter
+ *   context.
+ *
+ * References:
+ * - RFC9332: https://datatracker.ietf.org/doc/html/rfc9332
+ * - De Schepper, Koen, et al. "PI 2: A linearized AQM for both classic and
+ *   scalable TCP."  in proc. ACM CoNEXT'16, 2016.
+ */
+
+#include <linux/errno.h>
+#include <linux/hrtimer.h>
+#include <linux/if_vlan.h>
+#include <linux/kernel.h>
+#include <linux/limits.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+
+#include <net/gso.h>
+#include <net/inet_ecn.h>
+#include <net/pkt_cls.h>
+#include <net/pkt_sched.h>
+
+/* 32b enables support for flows with windows up to ~8.6 * 1e9 packets
+ * i.e., twice the maximal snd_cwnd.
+ * MAX_PROB must be consistent with the RNG in dualpi2_roll().
+ */
+#define MAX_PROB U32_MAX
+
+/* alpha/beta values exchanged over netlink are in units of 256ns */
+#define ALPHA_BETA_SHIFT 8
+
+/* Scaled values of alpha/beta must fit in 32b to avoid overflow in later
+ * computations. Consequently (see dualpi2_scale_alpha_beta()), their
+ * netlink-provided values can use at most 31b, i.e. be at most (2^23)-1
+ * (~4MHz) as those are given in 1/256th. This enables tuning alpha/beta to
+ * control flows whose maximal RTTs range from usec up to a few secs.
+ */
+#define ALPHA_BETA_MAX ((1U << 31) - 1)
+
+/* Internal alpha/beta are in units of 64ns.
+ * This allows using all alpha/beta values in the allowed range without loss
+ * of precision due to rounding when scaling them internally, e.g.,
+ * scale_alpha_beta(1) will not round down to 0.
+ */
+#define ALPHA_BETA_GRANULARITY 6
+
+#define ALPHA_BETA_SCALING (ALPHA_BETA_SHIFT - ALPHA_BETA_GRANULARITY)
+
+/* We express the weights (wc, wl) in %, i.e., wc + wl = 100 */
+#define MAX_WC 100
+
+struct dualpi2_sched_data {
+	struct Qdisc *l_queue;	/* The L4S Low latency queue (L-queue) */
+	struct Qdisc *sch;	/* The Classic queue (C-queue) */
+
+	/* Registered tc filters */
+	struct tcf_proto __rcu *tcf_filters;
+	struct tcf_block *tcf_block;
+
+	/* PI2 parameters */
+	u64	pi2_target;	/* Target delay in nanoseconds */
+	u32	pi2_tupdate;	/* Timer period in nanoseconds */
+	u32	pi2_prob;	/* Base PI probability */
+	u32	pi2_alpha;	/* Gain factor for the integral rate response */
+	u32	pi2_beta;	/* Gain factor for the proportional response */
+	struct hrtimer pi2_timer; /* prob update timer */
+
+	/* Step AQM (L-queue only) parameters */
+	u32	step_thresh;	/* Step threshold */
+	bool	step_in_packets; /* Step thresh in packets (1) or time (0) */
+
+	/* C-queue starvation protection */
+	s32	c_protection_credit; /* Credit (sign indicates which queue) */
+	s32	c_protection_init; /* Reset value of the credit */
+	u8	c_protection_wc; /* C-queue weight (between 0 and MAX_WC) */
+	u8	c_protection_wl; /* L-queue weight (MAX_WC - wc) */
+
+	/* General dualQ parameters */
+	u32	memory_limit;	/* Memory limit of both queues */
+	u8	coupling_factor;/* Coupling factor (k) between both queues */
+	u8	ecn_mask;	/* Mask to match packets into L-queue */
+	u32	min_qlen_step;	/* Minimum queue length to apply step thresh */
+	bool	drop_early;	/* Drop at enqueue (1) instead of dequeue  (0) */
+	bool	drop_overload;	/* Drop (1) on overload, or overflow (0) */
+	bool	split_gso;	/* Split aggregated skb (1) or leave as is (0) */
+
+	/* Statistics */
+	u64	c_head_ts;	/* Enqueue timestamp of the C-queue head */
+	u64	l_head_ts;	/* Enqueue timestamp of the L-queue head */
+	u64	last_qdelay;	/* Q delay val at the last probability update */
+	u32	packets_in_c;	/* Enqueue packet counter of the C-queue */
+	u32	packets_in_l;	/* Enqueue packet counter of the L-queue */
+	u32	maxq;		/* Maximum queue size of the C-queue */
+	u32	ecn_mark;	/* ECN mark pkt counter due to PI probability */
+	u32	step_marks;	/* ECN mark pkt counter due to step AQM */
+	u32	memory_used;	/* Memory used of both queues */
+	u32	max_memory_used;/* Maximum used memory */
+};
+
+static u32 dualpi2_scale_alpha_beta(u32 param)
+{
+	u64 tmp = ((u64)param * MAX_PROB >> ALPHA_BETA_SCALING);
+
+	do_div(tmp, NSEC_PER_SEC);
+	return tmp;
+}
+
+static ktime_t next_pi2_timeout(struct dualpi2_sched_data *q)
+{
+	return ktime_add_ns(ktime_get_ns(), q->pi2_tupdate);
+}
+
+static void dualpi2_reset_c_protection(struct dualpi2_sched_data *q)
+{
+	q->c_protection_credit = q->c_protection_init;
+}
+
+/* This computes the initial credit value and WRR weight for the L queue (wl)
+ * from the weight of the C queue (wc).
+ * If wl > wc, the scheduler will start with the L queue when reset.
+ */
+static void dualpi2_calculate_c_protection(struct Qdisc *sch,
+					   struct dualpi2_sched_data *q, u32 wc)
+{
+	q->c_protection_wc = wc;
+	q->c_protection_wl = MAX_WC - wc;
+	q->c_protection_init = (s32)psched_mtu(qdisc_dev(sch)) *
+		((int)q->c_protection_wc - (int)q->c_protection_wl);
+	dualpi2_reset_c_protection(q);
+}
+
+static s64 __scale_delta(u64 diff)
+{
+	do_div(diff, 1 << ALPHA_BETA_GRANULARITY);
+	return diff;
+}
+
+static void get_queue_delays(struct dualpi2_sched_data *q, u64 *qdelay_c,
+			     u64 *qdelay_l)
+{
+	u64 now, qc, ql;
+
+	now = ktime_get_ns();
+	qc = q->c_head_ts;
+	ql = q->l_head_ts;
+
+	*qdelay_c = qc ? now - qc : 0;
+	*qdelay_l = ql ? now - ql : 0;
+}
+
+static u32 calculate_probability(struct Qdisc *sch)
+{
+	struct dualpi2_sched_data *q = qdisc_priv(sch);
+	u32 new_prob;
+	u64 qdelay_c;
+	u64 qdelay_l;
+	u64 qdelay;
+	s64 delta;
+
+	get_queue_delays(q, &qdelay_c, &qdelay_l);
+	qdelay = max(qdelay_l, qdelay_c);
+
+	/* Alpha and beta take at most 32b, i.e., the delay difference would
+	 * overflow for queuing delay differences > ~4.2sec.
+	 */
+	delta = ((s64)qdelay - (s64)q->pi2_target) * q->pi2_alpha;
+	delta += ((s64)qdelay - (s64)q->last_qdelay) * q->pi2_beta;
+	q->last_qdelay = qdelay;
+
+	/* Bound new_prob between 0 and MAX_PROB */
+	if (delta > 0) {
+		new_prob = __scale_delta(delta) + q->pi2_prob;
+		if (new_prob < q->pi2_prob)
+			new_prob = MAX_PROB;
+	} else {
+		new_prob = q->pi2_prob - __scale_delta(~delta + 1);
+		if (new_prob > q->pi2_prob)
+			new_prob = 0;
+	}
+
+	/* If we do not drop on overload, ensure we cap the L4S probability to
+	 * 100% to keep window fairness when overflowing.
+	 */
+	if (!q->drop_overload)
+		return min_t(u32, new_prob, MAX_PROB / q->coupling_factor);
+	return new_prob;
+}
+
+static u32 get_memory_limit(struct Qdisc *sch, u32 limit)
+{
+	/* Apply rule of thumb, i.e., doubling the packet length,
+	 * to further include per packet overhead in memory_limit.
+	 */
+	u64 memlim = mul_u32_u32(limit, 2 * psched_mtu(qdisc_dev(sch)));
+
+	if (upper_32_bits(memlim))
+		return U32_MAX;
+	else
+		return lower_32_bits(memlim);
+}
+
+static u32 convert_us_to_nsec(u32 us)
+{
+	u64 ns = mul_u32_u32(us, NSEC_PER_USEC);
+
+	if (upper_32_bits(ns))
+		return U32_MAX;
+
+	return lower_32_bits(ns);
+}
+
+static enum hrtimer_restart dualpi2_timer(struct hrtimer *timer)
+{
+	struct dualpi2_sched_data *q = timer_container_of(q, timer, pi2_timer);
+	struct Qdisc *sch = q->sch;
+	spinlock_t *root_lock; /* to lock qdisc for probability calculations */
+
+	rcu_read_lock();
+	root_lock = qdisc_lock(qdisc_root_sleeping(sch));
+	spin_lock(root_lock);
+
+	q->pi2_prob = calculate_probability(sch);
+	hrtimer_set_expires(&q->pi2_timer, next_pi2_timeout(q));
+
+	spin_unlock(root_lock);
+	rcu_read_unlock();
+	return HRTIMER_RESTART;
+}
+
+static struct netlink_range_validation dualpi2_alpha_beta_range = {
+	.min = 1,
+	.max = ALPHA_BETA_MAX,
+};
+
+static const struct nla_policy dualpi2_policy[TCA_DUALPI2_MAX + 1] = {
+	[TCA_DUALPI2_LIMIT]		= NLA_POLICY_MIN(NLA_U32, 1),
+	[TCA_DUALPI2_MEMORY_LIMIT]	= NLA_POLICY_MIN(NLA_U32, 1),
+	[TCA_DUALPI2_TARGET]		= { .type = NLA_U32 },
+	[TCA_DUALPI2_TUPDATE]		= NLA_POLICY_MIN(NLA_U32, 1),
+	[TCA_DUALPI2_ALPHA]		=
+		NLA_POLICY_FULL_RANGE(NLA_U32, &dualpi2_alpha_beta_range),
+	[TCA_DUALPI2_BETA]		=
+		NLA_POLICY_FULL_RANGE(NLA_U32, &dualpi2_alpha_beta_range),
+	[TCA_DUALPI2_STEP_THRESH_PKTS]	= { .type = NLA_U32 },
+	[TCA_DUALPI2_STEP_THRESH_US]	= { .type = NLA_U32 },
+	[TCA_DUALPI2_MIN_QLEN_STEP]	= { .type = NLA_U32 },
+	[TCA_DUALPI2_COUPLING]		= NLA_POLICY_MIN(NLA_U8, 1),
+	[TCA_DUALPI2_DROP_OVERLOAD]	=
+		NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_DROP_OVERLOAD_MAX),
+	[TCA_DUALPI2_DROP_EARLY]	=
+		NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_DROP_EARLY_MAX),
+	[TCA_DUALPI2_C_PROTECTION]	=
+		NLA_POLICY_RANGE(NLA_U8, 0, MAX_WC),
+	[TCA_DUALPI2_ECN_MASK]		=
+		NLA_POLICY_RANGE(NLA_U8, TC_DUALPI2_ECN_MASK_L4S_ECT,
+				 TCA_DUALPI2_ECN_MASK_MAX),
+	[TCA_DUALPI2_SPLIT_GSO]		=
+		NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_SPLIT_GSO_MAX),
+};
+
+static int dualpi2_change(struct Qdisc *sch, struct nlattr *opt,
+			  struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[TCA_DUALPI2_MAX + 1];
+	struct dualpi2_sched_data *q;
+	int old_backlog;
+	int old_qlen;
+	int err;
+
+	if (!opt || !nla_len(opt)) {
+		NL_SET_ERR_MSG_MOD(extack, "Dualpi2 options are required");
+		return -EINVAL;
+	}
+	err = nla_parse_nested(tb, TCA_DUALPI2_MAX, opt, dualpi2_policy,
+			       extack);
+	if (err < 0)
+		return err;
+	if (tb[TCA_DUALPI2_STEP_THRESH_PKTS] && tb[TCA_DUALPI2_STEP_THRESH_US]) {
+		NL_SET_ERR_MSG_MOD(extack, "multiple step thresh attributes");
+		return -EINVAL;
+	}
+
+	q = qdisc_priv(sch);
+	sch_tree_lock(sch);
+
+	if (tb[TCA_DUALPI2_LIMIT]) {
+		u32 limit = nla_get_u32(tb[TCA_DUALPI2_LIMIT]);
+
+		sch->limit = limit;
+		q->memory_limit = get_memory_limit(sch, limit);
+	}
+
+	if (tb[TCA_DUALPI2_MEMORY_LIMIT])
+		q->memory_limit = nla_get_u32(tb[TCA_DUALPI2_MEMORY_LIMIT]);
+
+	if (tb[TCA_DUALPI2_TARGET]) {
+		u64 target = nla_get_u32(tb[TCA_DUALPI2_TARGET]);
+
+		q->pi2_target = target * NSEC_PER_USEC;
+	}
+
+	if (tb[TCA_DUALPI2_TUPDATE]) {
+		u64 tupdate = nla_get_u32(tb[TCA_DUALPI2_TUPDATE]);
+
+		q->pi2_tupdate = convert_us_to_nsec(tupdate);
+	}
+
+	if (tb[TCA_DUALPI2_ALPHA]) {
+		u32 alpha = nla_get_u32(tb[TCA_DUALPI2_ALPHA]);
+
+		q->pi2_alpha = dualpi2_scale_alpha_beta(alpha);
+	}
+
+	if (tb[TCA_DUALPI2_BETA]) {
+		u32 beta = nla_get_u32(tb[TCA_DUALPI2_BETA]);
+
+		q->pi2_beta = dualpi2_scale_alpha_beta(beta);
+	}
+
+	if (tb[TCA_DUALPI2_STEP_THRESH_PKTS]) {
+		u32 step_th = nla_get_u32(tb[TCA_DUALPI2_STEP_THRESH_PKTS]);
+
+		q->step_in_packets = true;
+		q->step_thresh = step_th;
+	} else if (tb[TCA_DUALPI2_STEP_THRESH_US]) {
+		u32 step_th = nla_get_u32(tb[TCA_DUALPI2_STEP_THRESH_US]);
+
+		q->step_in_packets = false;
+		q->step_thresh = convert_us_to_nsec(step_th);
+	}
+
+	if (tb[TCA_DUALPI2_MIN_QLEN_STEP])
+		q->min_qlen_step = nla_get_u32(tb[TCA_DUALPI2_MIN_QLEN_STEP]);
+
+	if (tb[TCA_DUALPI2_COUPLING]) {
+		u8 coupling = nla_get_u8(tb[TCA_DUALPI2_COUPLING]);
+
+		q->coupling_factor = coupling;
+	}
+
+	if (tb[TCA_DUALPI2_DROP_OVERLOAD]) {
+		u8 drop_overload = nla_get_u8(tb[TCA_DUALPI2_DROP_OVERLOAD]);
+
+		q->drop_overload = (bool)drop_overload;
+	}
+
+	if (tb[TCA_DUALPI2_DROP_EARLY]) {
+		u8 drop_early = nla_get_u8(tb[TCA_DUALPI2_DROP_EARLY]);
+
+		q->drop_early = (bool)drop_early;
+	}
+
+	if (tb[TCA_DUALPI2_C_PROTECTION]) {
+		u8 wc = nla_get_u8(tb[TCA_DUALPI2_C_PROTECTION]);
+
+		dualpi2_calculate_c_protection(sch, q, wc);
+	}
+
+	if (tb[TCA_DUALPI2_ECN_MASK]) {
+		u8 ecn_mask = nla_get_u8(tb[TCA_DUALPI2_ECN_MASK]);
+
+		q->ecn_mask = ecn_mask;
+	}
+
+	if (tb[TCA_DUALPI2_SPLIT_GSO]) {
+		u8 split_gso = nla_get_u8(tb[TCA_DUALPI2_SPLIT_GSO]);
+
+		q->split_gso = (bool)split_gso;
+	}
+
+	old_qlen = qdisc_qlen(sch);
+	old_backlog = sch->qstats.backlog;
+	while (qdisc_qlen(sch) > sch->limit ||
+	       q->memory_used > q->memory_limit) {
+		struct sk_buff *skb = qdisc_dequeue_internal(sch, true);
+
+		q->memory_used -= skb->truesize;
+		qdisc_qstats_backlog_dec(sch, skb);
+		rtnl_qdisc_drop(skb, sch);
+	}
+	qdisc_tree_reduce_backlog(sch, old_qlen - qdisc_qlen(sch),
+				  old_backlog - sch->qstats.backlog);
+
+	sch_tree_unlock(sch);
+	return 0;
+}
+
+/* Default alpha/beta values give a 10dB stability margin with max_rtt=100ms. */
+static void dualpi2_reset_default(struct Qdisc *sch)
+{
+	struct dualpi2_sched_data *q = qdisc_priv(sch);
+
+	q->sch->limit = 10000;				/* Max 125ms at 1Gbps */
+	q->memory_limit = get_memory_limit(sch, q->sch->limit);
+
+	q->pi2_target = 15 * NSEC_PER_MSEC;
+	q->pi2_tupdate = 16 * NSEC_PER_MSEC;
+	q->pi2_alpha = dualpi2_scale_alpha_beta(41);	/* ~0.16 Hz * 256 */
+	q->pi2_beta = dualpi2_scale_alpha_beta(819);	/* ~3.20 Hz * 256 */
+
+	q->step_thresh = 1 * NSEC_PER_MSEC;
+	q->step_in_packets = false;
+
+	dualpi2_calculate_c_protection(q->sch, q, 10);	/* wc=10%, wl=90% */
+
+	q->ecn_mask = TC_DUALPI2_ECN_MASK_L4S_ECT;	/* INET_ECN_ECT_1 */
+	q->min_qlen_step = 0;		/* Always apply step mark in L-queue */
+	q->coupling_factor = 2;		/* window fairness for equal RTTs */
+	q->drop_overload = TC_DUALPI2_DROP_OVERLOAD_DROP; /* Drop overload */
+	q->drop_early = TC_DUALPI2_DROP_EARLY_DROP_DEQUEUE; /* Drop dequeue */
+	q->split_gso = TC_DUALPI2_SPLIT_GSO_SPLIT_GSO;	/* Split GSO */
+}
+
+static int dualpi2_init(struct Qdisc *sch, struct nlattr *opt,
+			struct netlink_ext_ack *extack)
+{
+	struct dualpi2_sched_data *q = qdisc_priv(sch);
+	int err;
+
+	q->l_queue = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+				       TC_H_MAKE(sch->handle, 1), extack);
+	if (!q->l_queue)
+		return -ENOMEM;
+
+	err = tcf_block_get(&q->tcf_block, &q->tcf_filters, sch, extack);
+	if (err)
+		return err;
+
+	q->sch = sch;
+	dualpi2_reset_default(sch);
+	hrtimer_setup(&q->pi2_timer, dualpi2_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+
+	if (opt && nla_len(opt)) {
+		err = dualpi2_change(sch, opt, extack);
+
+		if (err)
+			return err;
+	}
+
+	hrtimer_start(&q->pi2_timer, next_pi2_timeout(q),
+		      HRTIMER_MODE_ABS_PINNED);
+	return 0;
+}
+
+/* Reset both L-queue and C-queue, internal packet counters, PI probability,
+ * C-queue protection credit, and timestamps, while preserving current
+ * configuration of DUALPI2.
+ */
+static void dualpi2_reset(struct Qdisc *sch)
+{
+	struct dualpi2_sched_data *q = qdisc_priv(sch);
+
+	qdisc_reset_queue(sch);
+	qdisc_reset_queue(q->l_queue);
+	q->c_head_ts = 0;
+	q->l_head_ts = 0;
+	q->pi2_prob = 0;
+	q->packets_in_c = 0;
+	q->packets_in_l = 0;
+	q->maxq = 0;
+	q->ecn_mark = 0;
+	q->step_marks = 0;
+	q->memory_used = 0;
+	q->max_memory_used = 0;
+	dualpi2_reset_c_protection(q);
+}
+
+static void dualpi2_destroy(struct Qdisc *sch)
+{
+	struct dualpi2_sched_data *q = qdisc_priv(sch);
+
+	q->pi2_tupdate = 0;
+	hrtimer_cancel(&q->pi2_timer);
+	if (q->l_queue)
+		qdisc_put(q->l_queue);
+	tcf_block_put(q->tcf_block);
+}
+
+static struct Qdisc *dualpi2_leaf(struct Qdisc *sch, unsigned long arg)
+{
+	return NULL;
+}
+
+static unsigned long dualpi2_find(struct Qdisc *sch, u32 classid)
+{
+	return 0;
+}
+
+static unsigned long dualpi2_bind(struct Qdisc *sch, unsigned long parent,
+				  u32 classid)
+{
+	return 0;
+}
+
+static void dualpi2_unbind(struct Qdisc *q, unsigned long cl)
+{
+}
+
+static struct tcf_block *dualpi2_tcf_block(struct Qdisc *sch, unsigned long cl,
+					   struct netlink_ext_ack *extack)
+{
+	struct dualpi2_sched_data *q = qdisc_priv(sch);
+
+	if (cl)
+		return NULL;
+	return q->tcf_block;
+}
+
+static void dualpi2_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	unsigned int i;
+
+	if (arg->stop)
+		return;
+
+	/* We statically define only 2 queues */
+	for (i = 0; i < 2; i++) {
+		if (arg->count < arg->skip) {
+			arg->count++;
+			continue;
+		}
+		if (arg->fn(sch, i + 1, arg) < 0) {
+			arg->stop = 1;
+			break;
+		}
+		arg->count++;
+	}
+}
+
+/* Minimal class support to handle tc filters */
+static const struct Qdisc_class_ops dualpi2_class_ops = {
+	.leaf		= dualpi2_leaf,
+	.find		= dualpi2_find,
+	.tcf_block	= dualpi2_tcf_block,
+	.bind_tcf	= dualpi2_bind,
+	.unbind_tcf	= dualpi2_unbind,
+	.walk		= dualpi2_walk,
+};
+
+static struct Qdisc_ops dualpi2_qdisc_ops __read_mostly = {
+	.id		= "dualpi2",
+	.cl_ops		= &dualpi2_class_ops,
+	.priv_size	= sizeof(struct dualpi2_sched_data),
+	.peek		= qdisc_peek_dequeued,
+	.init		= dualpi2_init,
+	.destroy	= dualpi2_destroy,
+	.reset		= dualpi2_reset,
+	.change		= dualpi2_change,
+	.owner		= THIS_MODULE,
+};
+
+static int __init dualpi2_module_init(void)
+{
+	return register_qdisc(&dualpi2_qdisc_ops);
+}
+
+static void __exit dualpi2_module_exit(void)
+{
+	unregister_qdisc(&dualpi2_qdisc_ops);
+}
+
+module_init(dualpi2_module_init);
+module_exit(dualpi2_module_exit);
+
+MODULE_DESCRIPTION("Dual Queue with Proportional Integral controller Improved with a Square (dualpi2) scheduler");
+MODULE_AUTHOR("Koen De Schepper <koen.de_schepper@nokia-bell-labs.com>");
+MODULE_AUTHOR("Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>");
+MODULE_AUTHOR("Olga Albisser <olga@albisser.org>");
+MODULE_AUTHOR("Henrik Steen <henrist@henrist.net>");
+MODULE_AUTHOR("Olivier Tilmans <olivier.tilmans@nokia.com>");
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION("1.0");
-- 
cgit v1.2.3


From d4de8bffbef4a7e4ad14b9fd2ff8e2d0e06b3fa5 Mon Sep 17 00:00:00 2001
From: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
Date: Tue, 22 Jul 2025 11:59:11 +0200
Subject: sched: Dump configuration and statistics of dualpi2 qdisc

The configuration and statistics dump of the DualPI2 Qdisc provides
information related to both queues, such as packet numbers and queuing
delays in the L-queue and C-queue, as well as general information such as
probability value, WRR credits, memory usage, packet marking counters, max
queue size, etc.

The following patch includes enqueue/dequeue for DualPI2.

Signed-off-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
Link: https://patch.msgid.link/20250722095915.24485-3-chia-yu.chang@nokia-bell-labs.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/pkt_sched.h |  15 ++++
 net/sched/sch_dualpi2.c        | 154 ++++++++++++++++++++++++++++++++++++-----
 2 files changed, 152 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 75d685ea8368..c2da76e78bad 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -1264,4 +1264,19 @@ enum {
 
 #define TCA_DUALPI2_MAX   (__TCA_DUALPI2_MAX - 1)
 
+struct tc_dualpi2_xstats {
+	__u32 prob;		/* current probability */
+	__u32 delay_c;		/* current delay in C queue */
+	__u32 delay_l;		/* current delay in L queue */
+	__u32 packets_in_c;	/* number of packets enqueued in C queue */
+	__u32 packets_in_l;	/* number of packets enqueued in L queue */
+	__u32 maxq;		/* maximum queue size */
+	__u32 ecn_mark;		/* packets marked with ECN */
+	__u32 step_marks;	/* ECN marks due to the step AQM */
+	__s32 credit;		/* current c_protection credit */
+	__u32 memory_used;	/* Memory used by both queues */
+	__u32 max_memory_used;	/* Maximum used memory */
+	__u32 memory_limit;	/* Memory limit of both queues */
+};
+
 #endif
diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c
index c11ec66786d4..0a96d57c40d1 100644
--- a/net/sched/sch_dualpi2.c
+++ b/net/sched/sch_dualpi2.c
@@ -123,6 +123,14 @@ static u32 dualpi2_scale_alpha_beta(u32 param)
 	return tmp;
 }
 
+static u32 dualpi2_unscale_alpha_beta(u32 param)
+{
+	u64 tmp = ((u64)param * NSEC_PER_SEC << ALPHA_BETA_SCALING);
+
+	do_div(tmp, MAX_PROB);
+	return tmp;
+}
+
 static ktime_t next_pi2_timeout(struct dualpi2_sched_data *q)
 {
 	return ktime_add_ns(ktime_get_ns(), q->pi2_tupdate);
@@ -227,6 +235,15 @@ static u32 convert_us_to_nsec(u32 us)
 	return lower_32_bits(ns);
 }
 
+static u32 convert_ns_to_usec(u64 ns)
+{
+	do_div(ns, NSEC_PER_USEC);
+	if (upper_32_bits(ns))
+		return U32_MAX;
+
+	return lower_32_bits(ns);
+}
+
 static enum hrtimer_restart dualpi2_timer(struct hrtimer *timer)
 {
 	struct dualpi2_sched_data *q = timer_container_of(q, timer, pi2_timer);
@@ -304,68 +321,70 @@ static int dualpi2_change(struct Qdisc *sch, struct nlattr *opt,
 	if (tb[TCA_DUALPI2_LIMIT]) {
 		u32 limit = nla_get_u32(tb[TCA_DUALPI2_LIMIT]);
 
-		sch->limit = limit;
-		q->memory_limit = get_memory_limit(sch, limit);
+		WRITE_ONCE(sch->limit, limit);
+		WRITE_ONCE(q->memory_limit, get_memory_limit(sch, limit));
 	}
 
 	if (tb[TCA_DUALPI2_MEMORY_LIMIT])
-		q->memory_limit = nla_get_u32(tb[TCA_DUALPI2_MEMORY_LIMIT]);
+		WRITE_ONCE(q->memory_limit,
+			   nla_get_u32(tb[TCA_DUALPI2_MEMORY_LIMIT]));
 
 	if (tb[TCA_DUALPI2_TARGET]) {
 		u64 target = nla_get_u32(tb[TCA_DUALPI2_TARGET]);
 
-		q->pi2_target = target * NSEC_PER_USEC;
+		WRITE_ONCE(q->pi2_target, target * NSEC_PER_USEC);
 	}
 
 	if (tb[TCA_DUALPI2_TUPDATE]) {
 		u64 tupdate = nla_get_u32(tb[TCA_DUALPI2_TUPDATE]);
 
-		q->pi2_tupdate = convert_us_to_nsec(tupdate);
+		WRITE_ONCE(q->pi2_tupdate, convert_us_to_nsec(tupdate));
 	}
 
 	if (tb[TCA_DUALPI2_ALPHA]) {
 		u32 alpha = nla_get_u32(tb[TCA_DUALPI2_ALPHA]);
 
-		q->pi2_alpha = dualpi2_scale_alpha_beta(alpha);
+		WRITE_ONCE(q->pi2_alpha, dualpi2_scale_alpha_beta(alpha));
 	}
 
 	if (tb[TCA_DUALPI2_BETA]) {
 		u32 beta = nla_get_u32(tb[TCA_DUALPI2_BETA]);
 
-		q->pi2_beta = dualpi2_scale_alpha_beta(beta);
+		WRITE_ONCE(q->pi2_beta, dualpi2_scale_alpha_beta(beta));
 	}
 
 	if (tb[TCA_DUALPI2_STEP_THRESH_PKTS]) {
 		u32 step_th = nla_get_u32(tb[TCA_DUALPI2_STEP_THRESH_PKTS]);
 
-		q->step_in_packets = true;
-		q->step_thresh = step_th;
+		WRITE_ONCE(q->step_in_packets, true);
+		WRITE_ONCE(q->step_thresh, step_th);
 	} else if (tb[TCA_DUALPI2_STEP_THRESH_US]) {
 		u32 step_th = nla_get_u32(tb[TCA_DUALPI2_STEP_THRESH_US]);
 
-		q->step_in_packets = false;
-		q->step_thresh = convert_us_to_nsec(step_th);
+		WRITE_ONCE(q->step_in_packets, false);
+		WRITE_ONCE(q->step_thresh, convert_us_to_nsec(step_th));
 	}
 
 	if (tb[TCA_DUALPI2_MIN_QLEN_STEP])
-		q->min_qlen_step = nla_get_u32(tb[TCA_DUALPI2_MIN_QLEN_STEP]);
+		WRITE_ONCE(q->min_qlen_step,
+			   nla_get_u32(tb[TCA_DUALPI2_MIN_QLEN_STEP]));
 
 	if (tb[TCA_DUALPI2_COUPLING]) {
 		u8 coupling = nla_get_u8(tb[TCA_DUALPI2_COUPLING]);
 
-		q->coupling_factor = coupling;
+		WRITE_ONCE(q->coupling_factor, coupling);
 	}
 
 	if (tb[TCA_DUALPI2_DROP_OVERLOAD]) {
 		u8 drop_overload = nla_get_u8(tb[TCA_DUALPI2_DROP_OVERLOAD]);
 
-		q->drop_overload = (bool)drop_overload;
+		WRITE_ONCE(q->drop_overload, (bool)drop_overload);
 	}
 
 	if (tb[TCA_DUALPI2_DROP_EARLY]) {
 		u8 drop_early = nla_get_u8(tb[TCA_DUALPI2_DROP_EARLY]);
 
-		q->drop_early = (bool)drop_early;
+		WRITE_ONCE(q->drop_early, (bool)drop_early);
 	}
 
 	if (tb[TCA_DUALPI2_C_PROTECTION]) {
@@ -377,13 +396,13 @@ static int dualpi2_change(struct Qdisc *sch, struct nlattr *opt,
 	if (tb[TCA_DUALPI2_ECN_MASK]) {
 		u8 ecn_mask = nla_get_u8(tb[TCA_DUALPI2_ECN_MASK]);
 
-		q->ecn_mask = ecn_mask;
+		WRITE_ONCE(q->ecn_mask, ecn_mask);
 	}
 
 	if (tb[TCA_DUALPI2_SPLIT_GSO]) {
 		u8 split_gso = nla_get_u8(tb[TCA_DUALPI2_SPLIT_GSO]);
 
-		q->split_gso = (bool)split_gso;
+		WRITE_ONCE(q->split_gso, (bool)split_gso);
 	}
 
 	old_qlen = qdisc_qlen(sch);
@@ -460,6 +479,105 @@ static int dualpi2_init(struct Qdisc *sch, struct nlattr *opt,
 	return 0;
 }
 
+static int dualpi2_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct dualpi2_sched_data *q = qdisc_priv(sch);
+	struct nlattr *opts;
+	bool step_in_pkts;
+	u32 step_th;
+
+	step_in_pkts = READ_ONCE(q->step_in_packets);
+	step_th = READ_ONCE(q->step_thresh);
+
+	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
+	if (!opts)
+		goto nla_put_failure;
+
+	if (step_in_pkts &&
+	    (nla_put_u32(skb, TCA_DUALPI2_LIMIT, READ_ONCE(sch->limit)) ||
+	    nla_put_u32(skb, TCA_DUALPI2_MEMORY_LIMIT,
+			READ_ONCE(q->memory_limit)) ||
+	    nla_put_u32(skb, TCA_DUALPI2_TARGET,
+			convert_ns_to_usec(READ_ONCE(q->pi2_target))) ||
+	    nla_put_u32(skb, TCA_DUALPI2_TUPDATE,
+			convert_ns_to_usec(READ_ONCE(q->pi2_tupdate))) ||
+	    nla_put_u32(skb, TCA_DUALPI2_ALPHA,
+			dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_alpha))) ||
+	    nla_put_u32(skb, TCA_DUALPI2_BETA,
+			dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_beta))) ||
+	    nla_put_u32(skb, TCA_DUALPI2_STEP_THRESH_PKTS, step_th) ||
+	    nla_put_u32(skb, TCA_DUALPI2_MIN_QLEN_STEP,
+			READ_ONCE(q->min_qlen_step)) ||
+	    nla_put_u8(skb, TCA_DUALPI2_COUPLING,
+		       READ_ONCE(q->coupling_factor)) ||
+	    nla_put_u8(skb, TCA_DUALPI2_DROP_OVERLOAD,
+		       READ_ONCE(q->drop_overload)) ||
+	    nla_put_u8(skb, TCA_DUALPI2_DROP_EARLY,
+		       READ_ONCE(q->drop_early)) ||
+	    nla_put_u8(skb, TCA_DUALPI2_C_PROTECTION,
+		       READ_ONCE(q->c_protection_wc)) ||
+	    nla_put_u8(skb, TCA_DUALPI2_ECN_MASK, READ_ONCE(q->ecn_mask)) ||
+	    nla_put_u8(skb, TCA_DUALPI2_SPLIT_GSO, READ_ONCE(q->split_gso))))
+		goto nla_put_failure;
+
+	if (!step_in_pkts &&
+	    (nla_put_u32(skb, TCA_DUALPI2_LIMIT, READ_ONCE(sch->limit)) ||
+	    nla_put_u32(skb, TCA_DUALPI2_MEMORY_LIMIT,
+			READ_ONCE(q->memory_limit)) ||
+	    nla_put_u32(skb, TCA_DUALPI2_TARGET,
+			convert_ns_to_usec(READ_ONCE(q->pi2_target))) ||
+	    nla_put_u32(skb, TCA_DUALPI2_TUPDATE,
+			convert_ns_to_usec(READ_ONCE(q->pi2_tupdate))) ||
+	    nla_put_u32(skb, TCA_DUALPI2_ALPHA,
+			dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_alpha))) ||
+	    nla_put_u32(skb, TCA_DUALPI2_BETA,
+			dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_beta))) ||
+	    nla_put_u32(skb, TCA_DUALPI2_STEP_THRESH_US,
+			convert_ns_to_usec(step_th)) ||
+	    nla_put_u32(skb, TCA_DUALPI2_MIN_QLEN_STEP,
+			READ_ONCE(q->min_qlen_step)) ||
+	    nla_put_u8(skb, TCA_DUALPI2_COUPLING,
+		       READ_ONCE(q->coupling_factor)) ||
+	    nla_put_u8(skb, TCA_DUALPI2_DROP_OVERLOAD,
+		       READ_ONCE(q->drop_overload)) ||
+	    nla_put_u8(skb, TCA_DUALPI2_DROP_EARLY,
+		       READ_ONCE(q->drop_early)) ||
+	    nla_put_u8(skb, TCA_DUALPI2_C_PROTECTION,
+		       READ_ONCE(q->c_protection_wc)) ||
+	    nla_put_u8(skb, TCA_DUALPI2_ECN_MASK, READ_ONCE(q->ecn_mask)) ||
+	    nla_put_u8(skb, TCA_DUALPI2_SPLIT_GSO, READ_ONCE(q->split_gso))))
+		goto nla_put_failure;
+
+	return nla_nest_end(skb, opts);
+
+nla_put_failure:
+	nla_nest_cancel(skb, opts);
+	return -1;
+}
+
+static int dualpi2_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
+{
+	struct dualpi2_sched_data *q = qdisc_priv(sch);
+	struct tc_dualpi2_xstats st = {
+		.prob			= q->pi2_prob,
+		.packets_in_c		= q->packets_in_c,
+		.packets_in_l		= q->packets_in_l,
+		.maxq			= q->maxq,
+		.ecn_mark		= q->ecn_mark,
+		.credit			= q->c_protection_credit,
+		.step_marks		= q->step_marks,
+		.memory_used		= q->memory_used,
+		.max_memory_used	= q->max_memory_used,
+		.memory_limit		= q->memory_limit,
+	};
+	u64 qc, ql;
+
+	get_queue_delays(q, &qc, &ql);
+	st.delay_l = convert_ns_to_usec(ql);
+	st.delay_c = convert_ns_to_usec(qc);
+	return gnet_stats_copy_app(d, &st, sizeof(st));
+}
+
 /* Reset both L-queue and C-queue, internal packet counters, PI probability,
  * C-queue protection credit, and timestamps, while preserving current
  * configuration of DUALPI2.
@@ -564,6 +682,8 @@ static struct Qdisc_ops dualpi2_qdisc_ops __read_mostly = {
 	.destroy	= dualpi2_destroy,
 	.reset		= dualpi2_reset,
 	.change		= dualpi2_change,
+	.dump		= dualpi2_dump,
+	.dump_stats	= dualpi2_dump_stats,
 	.owner		= THIS_MODULE,
 };
 
-- 
cgit v1.2.3


From 8f9516daedd67097a0c6e463fcb7a42b5ee9d477 Mon Sep 17 00:00:00 2001
From: Koen De Schepper <koen.de_schepper@nokia-bell-labs.com>
Date: Tue, 22 Jul 2025 11:59:12 +0200
Subject: sched: Add enqueue/dequeue of dualpi2 qdisc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

DualPI2 provides L4S-type low latency & loss to traffic that uses a
scalable congestion controller (e.g. TCP-Prague, DCTCP) without
degrading the performance of 'classic' traffic (e.g. Reno,
Cubic etc.). It is to be the reference implementation of IETF RFC9332
DualQ Coupled AQM (https://datatracker.ietf.org/doc/html/rfc9332).

Note that creating two independent queues cannot meet the goal of
DualPI2 mentioned in RFC9332: "...to preserve fairness between
ECN-capable and non-ECN-capable traffic." Further, it could even
lead to starvation of Classic traffic, which is also inconsistent
with the requirements in RFC9332: "...although priority MUST be
bounded in order not to starve Classic traffic." DualPI2 is
designed to maintain approximate per-flow fairness on L-queue and
C-queue by forming a single qdisc using the coupling factor and
scheduler between two queues.

The qdisc provides two queues called low latency and classic. It
classifies packets based on the ECN field in the IP headers. By
default it directs non-ECN and ECT(0) into the classic queue and
ECT(1) and CE into the low latency queue, as per the IETF spec.

Each queue runs its own AQM:
* The classic AQM is called PI2, which is similar to the PIE AQM but
  more responsive and simpler. Classic traffic requires a decent
  target queue (default 15ms for Internet deployment) to fully
  utilize the link and to avoid high drop rates.
* The low latency AQM is, by default, a very shallow ECN marking
  threshold (1ms) similar to that used for DCTCP.

The DualQ isolates the low queuing delay of the Low Latency queue
from the larger delay of the 'Classic' queue. However, from a
bandwidth perspective, flows in either queue will share out the link
capacity as if there was just a single queue. This bandwidth pooling
effect is achieved by coupling together the drop and ECN-marking
probabilities of the two AQMs.

The PI2 AQM has two main parameters in addition to its target delay.
The integral gain factor alpha is used to slowly correct any persistent
standing queue error from the target delay, while the proportional gain
factor beta is used to quickly compensate for queue changes (growth or
shrinkage). Alpha and beta are either given as parameters, or they can
be calculated by tc from alternative typical and maximum RTT parameters.

Internally, the output of a linear Proportional Integral (PI)
controller is used for both queues. This output is squared to
calculate the drop or ECN-marking probability of the classic queue.
This counterbalances the square-root rate equation of Reno/Cubic,
which is the trick that balances flow rates across the queues. For
the ECN-marking probability of the low latency queue, the output of
the base AQM is multiplied by a coupling factor. This determines the
balance between the flow rates in each queue. The default setting
makes the flow rates roughly equal, which should be generally
applicable.

If DUALPI2 AQM has detected overload (due to excessive non-responsive
traffic in either queue), it will switch to signaling congestion
solely using drop, irrespective of the ECN field. Alternatively, it
can be configured to limit the drop probability and let the queue
grow and eventually overflow (like tail-drop).

GSO splitting in DUALPI2 is configurable from userspace; the default
behavior is to split GSO packets. In a test with 4 download streams
over an unshaped 10GigE link, splitting GSO packets apart halves the
latency with no loss in throughput:

Summary of tcp_4down run 'no_split_gso':
                         avg         median      # data pts
 Ping (ms) ICMP   :       0.53      0.30 ms         350
 TCP download avg :    2326.86       N/A Mbits/s    350
 TCP download sum :    9307.42       N/A Mbits/s    350
 TCP download::1  :    2672.99   2568.73 Mbits/s    350
 TCP download::2  :    2586.96   2570.51 Mbits/s    350
 TCP download::3  :    1786.26   1798.82 Mbits/s    350
 TCP download::4  :    2261.21   2309.49 Mbits/s    350

Summary of tcp_4down run 'split_gso':
                         avg          median      # data pts
 Ping (ms) ICMP   :       0.22      0.23 ms         350
 TCP download avg :    2335.02       N/A Mbits/s    350
 TCP download sum :    9340.09       N/A Mbits/s    350
 TCP download::1  :    2335.30   2334.22 Mbits/s    350
 TCP download::2  :    2334.72   2334.20 Mbits/s    350
 TCP download::3  :    2335.28   2334.58 Mbits/s    350
 TCP download::4  :    2334.79   2334.39 Mbits/s    350

A similar result is observed when running DUALPI2 over an unshaped
1GigE link in a test with 1 download stream:

Summary of tcp_1down run 'no_split_gso':
                         avg         median      # data pts
 Ping (ms) ICMP :         1.13      1.25 ms         350
 TCP download   :       941.41    941.46 Mbits/s    350

Summary of tcp_1down run 'split_gso':
                         avg         median      # data pts
 Ping (ms) ICMP :         0.51      0.55 ms         350
 TCP download   :       941.41    941.45 Mbits/s    350

Additional details can be found in the RFC:
  https://datatracker.ietf.org/doc/html/rfc9332

Signed-off-by: Koen De Schepper <koen.de_schepper@nokia-bell-labs.com>
Co-developed-by: Olga Albisser <olga@albisser.org>
Signed-off-by: Olga Albisser <olga@albisser.org>
Co-developed-by: Olivier Tilmans <olivier.tilmans@nokia.com>
Signed-off-by: Olivier Tilmans <olivier.tilmans@nokia.com>
Co-developed-by: Henrik Steen <henrist@henrist.net>
Signed-off-by: Henrik Steen <henrist@henrist.net>
Co-developed-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
Signed-off-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
Signed-off-by: Bob Briscoe <research@bobbriscoe.net>
Signed-off-by: Ilpo Järvinen <ij@kernel.org>
Acked-by: Dave Taht <dave.taht@gmail.com>
Link: https://patch.msgid.link/20250722095915.24485-4-chia-yu.chang@nokia-bell-labs.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dropreason-core.h |   6 +
 net/sched/Kconfig             |  12 ++
 net/sched/Makefile            |   1 +
 net/sched/sch_dualpi2.c       | 472 +++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 487 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
index e19184dd1b0f..d8ff24a33459 100644
--- a/include/net/dropreason-core.h
+++ b/include/net/dropreason-core.h
@@ -126,6 +126,7 @@
 	FN(CANFD_RX_INVALID_FRAME)	\
 	FN(CANXL_RX_INVALID_FRAME)	\
 	FN(PFMEMALLOC)	\
+	FN(DUALPI2_STEP_DROP)		\
 	FNe(MAX)
 
 /**
@@ -604,6 +605,11 @@ enum skb_drop_reason {
 	 * reached a path or socket not eligible for use of memory reserves
 	 */
 	SKB_DROP_REASON_PFMEMALLOC,
+	/**
+	 * @SKB_DROP_REASON_DUALPI2_STEP_DROP: dropped by the step drop
+	 * threshold of DualPI2 qdisc.
+	 */
+	SKB_DROP_REASON_DUALPI2_STEP_DROP,
 	/**
 	 * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which
 	 * shouldn't be used as a real 'reason' - only for tracing code gen
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index ad914d2b2e22..6ddff028b81a 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -415,6 +415,18 @@ config NET_SCH_BPF
 
 	  If unsure, say N.
 
+config NET_SCH_DUALPI2
+	tristate "Dual Queue PI Square (DUALPI2) scheduler"
+	help
+	  Say Y here if you want to use the Dual Queue Proportional Integral
+	  Controller Improved with a Square scheduling algorithm.
+	  For more information, please see https://tools.ietf.org/html/rfc9332
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called sch_dualpi2.
+
+	  If unsure, say N.
+
 menuconfig NET_SCH_DEFAULT
 	bool "Allow override default queue discipline"
 	help
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 904d784902d1..5078ea84e6ad 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -63,6 +63,7 @@ obj-$(CONFIG_NET_SCH_CBS)	+= sch_cbs.o
 obj-$(CONFIG_NET_SCH_ETF)	+= sch_etf.o
 obj-$(CONFIG_NET_SCH_TAPRIO)	+= sch_taprio.o
 obj-$(CONFIG_NET_SCH_BPF)	+= bpf_qdisc.o
+obj-$(CONFIG_NET_SCH_DUALPI2)	+= sch_dualpi2.o
 
 obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c
index 0a96d57c40d1..845375ebd4ea 100644
--- a/net/sched/sch_dualpi2.c
+++ b/net/sched/sch_dualpi2.c
@@ -113,8 +113,44 @@ struct dualpi2_sched_data {
 	u32	step_marks;	/* ECN mark pkt counter due to step AQM */
 	u32	memory_used;	/* Memory used of both queues */
 	u32	max_memory_used;/* Maximum used memory */
+
+	/* Deferred drop statistics */
+	u32	deferred_drops_cnt;	/* Packets dropped */
+	u32	deferred_drops_len;	/* Bytes dropped */
+};
+
+struct dualpi2_skb_cb {
+	u64 ts;			/* Timestamp at enqueue */
+	u8 apply_step:1,	/* Can we apply the step threshold */
+	   classified:2,	/* Packet classification results */
+	   ect:2;		/* Packet ECT codepoint */
+};
+
+enum dualpi2_classification_results {
+	DUALPI2_C_CLASSIC	= 0,	/* C-queue */
+	DUALPI2_C_L4S		= 1,	/* L-queue (scale mark/classic drop) */
+	DUALPI2_C_LLLL		= 2,	/* L-queue (no drops/marks) */
+	__DUALPI2_C_MAX			/* Keep last*/
 };
 
+static struct dualpi2_skb_cb *dualpi2_skb_cb(struct sk_buff *skb)
+{
+	qdisc_cb_private_validate(skb, sizeof(struct dualpi2_skb_cb));
+	return (struct dualpi2_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
+static u64 dualpi2_sojourn_time(struct sk_buff *skb, u64 reference)
+{
+	return reference - dualpi2_skb_cb(skb)->ts;
+}
+
+static u64 head_enqueue_time(struct Qdisc *q)
+{
+	struct sk_buff *skb = qdisc_peek_head(q);
+
+	return skb ? dualpi2_skb_cb(skb)->ts : 0;
+}
+
 static u32 dualpi2_scale_alpha_beta(u32 param)
 {
 	u64 tmp = ((u64)param * MAX_PROB >> ALPHA_BETA_SCALING);
@@ -136,6 +172,30 @@ static ktime_t next_pi2_timeout(struct dualpi2_sched_data *q)
 	return ktime_add_ns(ktime_get_ns(), q->pi2_tupdate);
 }
 
+static bool skb_is_l4s(struct sk_buff *skb)
+{
+	return dualpi2_skb_cb(skb)->classified == DUALPI2_C_L4S;
+}
+
+static bool skb_in_l_queue(struct sk_buff *skb)
+{
+	return dualpi2_skb_cb(skb)->classified != DUALPI2_C_CLASSIC;
+}
+
+static bool skb_apply_step(struct sk_buff *skb, struct dualpi2_sched_data *q)
+{
+	return skb_is_l4s(skb) && qdisc_qlen(q->l_queue) >= q->min_qlen_step;
+}
+
+static bool dualpi2_mark(struct dualpi2_sched_data *q, struct sk_buff *skb)
+{
+	if (INET_ECN_set_ce(skb)) {
+		q->ecn_mark++;
+		return true;
+	}
+	return false;
+}
+
 static void dualpi2_reset_c_protection(struct dualpi2_sched_data *q)
 {
 	q->c_protection_credit = q->c_protection_init;
@@ -155,6 +215,408 @@ static void dualpi2_calculate_c_protection(struct Qdisc *sch,
 	dualpi2_reset_c_protection(q);
 }
 
+static bool dualpi2_roll(u32 prob)
+{
+	return get_random_u32() <= prob;
+}
+
+/* Packets in the C-queue are subject to a marking probability pC, which is the
+ * square of the internal PI probability (i.e., have an overall lower mark/drop
+ * probability). If the qdisc is overloaded, ignore ECT values and only drop.
+ *
+ * Note that this marking scheme is also applied to L4S packets during overload.
+ * Return true if packet dropping is required in C queue
+ */
+static bool dualpi2_classic_marking(struct dualpi2_sched_data *q,
+				    struct sk_buff *skb, u32 prob,
+				    bool overload)
+{
+	if (dualpi2_roll(prob) && dualpi2_roll(prob)) {
+		if (overload || dualpi2_skb_cb(skb)->ect == INET_ECN_NOT_ECT)
+			return true;
+		dualpi2_mark(q, skb);
+	}
+	return false;
+}
+
+/* Packets in the L-queue are subject to a marking probability pL given by the
+ * internal PI probability scaled by the coupling factor.
+ *
+ * On overload (i.e., @local_l_prob is >= 100%):
+ * - if the qdisc is configured to trade losses to preserve latency (i.e.,
+ *   @q->drop_overload), apply classic drops first before marking.
+ * - otherwise, preserve the "no loss" property of ECN at the cost of queueing
+ *   delay, eventually resulting in taildrop behavior once sch->limit is
+ *   reached.
+ * Return true if packet dropping is required in L queue
+ */
+static bool dualpi2_scalable_marking(struct dualpi2_sched_data *q,
+				     struct sk_buff *skb,
+				     u64 local_l_prob, u32 prob,
+				     bool overload)
+{
+	if (overload) {
+		/* Apply classic drop */
+		if (!q->drop_overload ||
+		    !(dualpi2_roll(prob) && dualpi2_roll(prob)))
+			goto mark;
+		return true;
+	}
+
+	/* We can safely cut the upper 32b as overload==false */
+	if (dualpi2_roll(local_l_prob)) {
+		/* Non-ECT packets could have classified as L4S by filters. */
+		if (dualpi2_skb_cb(skb)->ect == INET_ECN_NOT_ECT)
+			return true;
+mark:
+		dualpi2_mark(q, skb);
+	}
+	return false;
+}
+
+/* Decide whether a given packet must be dropped (or marked if ECT), according
+ * to the PI2 probability.
+ *
+ * Never mark/drop if we have a standing queue of less than 2 MTUs.
+ */
+static bool must_drop(struct Qdisc *sch, struct dualpi2_sched_data *q,
+		      struct sk_buff *skb)
+{
+	u64 local_l_prob;
+	bool overload;
+	u32 prob;
+
+	if (sch->qstats.backlog < 2 * psched_mtu(qdisc_dev(sch)))
+		return false;
+
+	prob = READ_ONCE(q->pi2_prob);
+	local_l_prob = (u64)prob * q->coupling_factor;
+	overload = local_l_prob > MAX_PROB;
+
+	switch (dualpi2_skb_cb(skb)->classified) {
+	case DUALPI2_C_CLASSIC:
+		return dualpi2_classic_marking(q, skb, prob, overload);
+	case DUALPI2_C_L4S:
+		return dualpi2_scalable_marking(q, skb, local_l_prob, prob,
+						overload);
+	default: /* DUALPI2_C_LLLL */
+		return false;
+	}
+}
+
+static void dualpi2_read_ect(struct sk_buff *skb)
+{
+	struct dualpi2_skb_cb *cb = dualpi2_skb_cb(skb);
+	int wlen = skb_network_offset(skb);
+
+	switch (skb_protocol(skb, true)) {
+	case htons(ETH_P_IP):
+		wlen += sizeof(struct iphdr);
+		if (!pskb_may_pull(skb, wlen) ||
+		    skb_try_make_writable(skb, wlen))
+			goto not_ecn;
+
+		cb->ect = ipv4_get_dsfield(ip_hdr(skb)) & INET_ECN_MASK;
+		break;
+	case htons(ETH_P_IPV6):
+		wlen += sizeof(struct ipv6hdr);
+		if (!pskb_may_pull(skb, wlen) ||
+		    skb_try_make_writable(skb, wlen))
+			goto not_ecn;
+
+		cb->ect = ipv6_get_dsfield(ipv6_hdr(skb)) & INET_ECN_MASK;
+		break;
+	default:
+		goto not_ecn;
+	}
+	return;
+
+not_ecn:
+	/* Non pullable/writable packets can only be dropped hence are
+	 * classified as not ECT.
+	 */
+	cb->ect = INET_ECN_NOT_ECT;
+}
+
+static int dualpi2_skb_classify(struct dualpi2_sched_data *q,
+				struct sk_buff *skb)
+{
+	struct dualpi2_skb_cb *cb = dualpi2_skb_cb(skb);
+	struct tcf_result res;
+	struct tcf_proto *fl;
+	int result;
+
+	dualpi2_read_ect(skb);
+	if (cb->ect & q->ecn_mask) {
+		cb->classified = DUALPI2_C_L4S;
+		return NET_XMIT_SUCCESS;
+	}
+
+	if (TC_H_MAJ(skb->priority) == q->sch->handle &&
+	    TC_H_MIN(skb->priority) < __DUALPI2_C_MAX) {
+		cb->classified = TC_H_MIN(skb->priority);
+		return NET_XMIT_SUCCESS;
+	}
+
+	fl = rcu_dereference_bh(q->tcf_filters);
+	if (!fl) {
+		cb->classified = DUALPI2_C_CLASSIC;
+		return NET_XMIT_SUCCESS;
+	}
+
+	result = tcf_classify(skb, NULL, fl, &res, false);
+	if (result >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+		switch (result) {
+		case TC_ACT_STOLEN:
+		case TC_ACT_QUEUED:
+		case TC_ACT_TRAP:
+			return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
+		case TC_ACT_SHOT:
+			return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+		}
+#endif
+		cb->classified = TC_H_MIN(res.classid) < __DUALPI2_C_MAX ?
+			TC_H_MIN(res.classid) : DUALPI2_C_CLASSIC;
+	}
+	return NET_XMIT_SUCCESS;
+}
+
+static int dualpi2_enqueue_skb(struct sk_buff *skb, struct Qdisc *sch,
+			       struct sk_buff **to_free)
+{
+	struct dualpi2_sched_data *q = qdisc_priv(sch);
+	struct dualpi2_skb_cb *cb;
+
+	if (unlikely(qdisc_qlen(sch) >= sch->limit) ||
+	    unlikely((u64)q->memory_used + skb->truesize > q->memory_limit)) {
+		qdisc_qstats_overlimit(sch);
+		if (skb_in_l_queue(skb))
+			qdisc_qstats_overlimit(q->l_queue);
+		return qdisc_drop_reason(skb, sch, to_free,
+					 SKB_DROP_REASON_QDISC_OVERLIMIT);
+	}
+
+	if (q->drop_early && must_drop(sch, q, skb)) {
+		qdisc_drop_reason(skb, sch, to_free,
+				  SKB_DROP_REASON_QDISC_CONGESTED);
+		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
+	}
+
+	cb = dualpi2_skb_cb(skb);
+	cb->ts = ktime_get_ns();
+	q->memory_used += skb->truesize;
+	if (q->memory_used > q->max_memory_used)
+		q->max_memory_used = q->memory_used;
+
+	if (qdisc_qlen(sch) > q->maxq)
+		q->maxq = qdisc_qlen(sch);
+
+	if (skb_in_l_queue(skb)) {
+		/* Apply step thresh if skb is L4S && L-queue len >= min_qlen */
+		dualpi2_skb_cb(skb)->apply_step = skb_apply_step(skb, q);
+
+		/* Keep the overall qdisc stats consistent */
+		++sch->q.qlen;
+		qdisc_qstats_backlog_inc(sch, skb);
+		++q->packets_in_l;
+		if (!q->l_head_ts)
+			q->l_head_ts = cb->ts;
+		return qdisc_enqueue_tail(skb, q->l_queue);
+	}
+	++q->packets_in_c;
+	if (!q->c_head_ts)
+		q->c_head_ts = cb->ts;
+	return qdisc_enqueue_tail(skb, sch);
+}
+
+/* By default, dualpi2 will split GSO skbs into independent skbs and enqueue
+ * each of those individually. This yields the following benefits, at the
+ * expense of CPU usage:
+ * - Finer-grained AQM actions as the sub-packets of a burst no longer share the
+ *   same fate (e.g., the random mark/drop probability is applied individually)
+ * - Improved precision of the starvation protection/WRR scheduler at dequeue,
+ *   as the size of the dequeued packets will be smaller.
+ */
+static int dualpi2_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+				 struct sk_buff **to_free)
+{
+	struct dualpi2_sched_data *q = qdisc_priv(sch);
+	int err;
+
+	err = dualpi2_skb_classify(q, skb);
+	if (err != NET_XMIT_SUCCESS) {
+		if (err & __NET_XMIT_BYPASS)
+			qdisc_qstats_drop(sch);
+		__qdisc_drop(skb, to_free);
+		return err;
+	}
+
+	if (q->split_gso && skb_is_gso(skb)) {
+		netdev_features_t features;
+		struct sk_buff *nskb, *next;
+		int cnt, byte_len, orig_len;
+		int err;
+
+		features = netif_skb_features(skb);
+		nskb = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
+		if (IS_ERR_OR_NULL(nskb))
+			return qdisc_drop(skb, sch, to_free);
+
+		cnt = 1;
+		byte_len = 0;
+		orig_len = qdisc_pkt_len(skb);
+		skb_list_walk_safe(nskb, nskb, next) {
+			skb_mark_not_on_list(nskb);
+
+			/* Iterate through GSO fragments of an skb:
+			 * (1) Set pkt_len from the single GSO fragments
+			 * (2) Copy classified and ect values of an skb
+			 * (3) Enqueue fragment & set ts in dualpi2_enqueue_skb
+			 */
+			qdisc_skb_cb(nskb)->pkt_len = nskb->len;
+			dualpi2_skb_cb(nskb)->classified =
+				dualpi2_skb_cb(skb)->classified;
+			dualpi2_skb_cb(nskb)->ect = dualpi2_skb_cb(skb)->ect;
+			err = dualpi2_enqueue_skb(nskb, sch, to_free);
+
+			if (err == NET_XMIT_SUCCESS) {
+				/* Compute the backlog adjustment that needs
+				 * to be propagated in the qdisc tree to reflect
+				 * all new skbs successfully enqueued.
+				 */
+				++cnt;
+				byte_len += nskb->len;
+			}
+		}
+		if (cnt > 1) {
+			/* The caller will add the original skb stats to its
+			 * backlog, compensate this if any nskb is enqueued.
+			 */
+			--cnt;
+			byte_len -= orig_len;
+		}
+		qdisc_tree_reduce_backlog(sch, -cnt, -byte_len);
+		consume_skb(skb);
+		return err;
+	}
+	return dualpi2_enqueue_skb(skb, sch, to_free);
+}
+
+/* Select the queue from which the next packet can be dequeued, ensuring that
+ * neither queue can starve the other with a WRR scheduler.
+ *
+ * The sign of the WRR credit determines the next queue, while the size of
+ * the dequeued packet determines the magnitude of the WRR credit change. If
+ * either queue is empty, the WRR credit is kept unchanged.
+ *
+ * As the dequeued packet can be dropped later, the caller has to perform the
+ * qdisc_bstats_update() calls.
+ */
+static struct sk_buff *dequeue_packet(struct Qdisc *sch,
+				      struct dualpi2_sched_data *q,
+				      int *credit_change,
+				      u64 now)
+{
+	struct sk_buff *skb = NULL;
+	int c_len;
+
+	*credit_change = 0;
+	c_len = qdisc_qlen(sch) - qdisc_qlen(q->l_queue);
+	if (qdisc_qlen(q->l_queue) && (!c_len || q->c_protection_credit <= 0)) {
+		skb = __qdisc_dequeue_head(&q->l_queue->q);
+		WRITE_ONCE(q->l_head_ts, head_enqueue_time(q->l_queue));
+		if (c_len)
+			*credit_change = q->c_protection_wc;
+		qdisc_qstats_backlog_dec(q->l_queue, skb);
+
+		/* Keep the global queue size consistent */
+		--sch->q.qlen;
+		q->memory_used -= skb->truesize;
+	} else if (c_len) {
+		skb = __qdisc_dequeue_head(&sch->q);
+		WRITE_ONCE(q->c_head_ts, head_enqueue_time(sch));
+		if (qdisc_qlen(q->l_queue))
+			*credit_change = ~((s32)q->c_protection_wl) + 1;
+		q->memory_used -= skb->truesize;
+	} else {
+		dualpi2_reset_c_protection(q);
+		return NULL;
+	}
+	*credit_change *= qdisc_pkt_len(skb);
+	qdisc_qstats_backlog_dec(sch, skb);
+	return skb;
+}
+
+static int do_step_aqm(struct dualpi2_sched_data *q, struct sk_buff *skb,
+		       u64 now)
+{
+	u64 qdelay = 0;
+
+	if (q->step_in_packets)
+		qdelay = qdisc_qlen(q->l_queue);
+	else
+		qdelay = dualpi2_sojourn_time(skb, now);
+
+	if (dualpi2_skb_cb(skb)->apply_step && qdelay > q->step_thresh) {
+		if (!dualpi2_skb_cb(skb)->ect) {
+			/* Drop this non-ECT packet */
+			return 1;
+		}
+
+		if (dualpi2_mark(q, skb))
+			++q->step_marks;
+	}
+	qdisc_bstats_update(q->l_queue, skb);
+	return 0;
+}
+
+static void drop_and_retry(struct dualpi2_sched_data *q, struct sk_buff *skb,
+			   struct Qdisc *sch, enum skb_drop_reason reason)
+{
+	++q->deferred_drops_cnt;
+	q->deferred_drops_len += qdisc_pkt_len(skb);
+	kfree_skb_reason(skb, reason);
+	qdisc_qstats_drop(sch);
+}
+
+static struct sk_buff *dualpi2_qdisc_dequeue(struct Qdisc *sch)
+{
+	struct dualpi2_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb;
+	int credit_change;
+	u64 now;
+
+	now = ktime_get_ns();
+
+	while ((skb = dequeue_packet(sch, q, &credit_change, now))) {
+		if (!q->drop_early && must_drop(sch, q, skb)) {
+			drop_and_retry(q, skb, sch,
+				       SKB_DROP_REASON_QDISC_CONGESTED);
+			continue;
+		}
+
+		if (skb_in_l_queue(skb) && do_step_aqm(q, skb, now)) {
+			qdisc_qstats_drop(q->l_queue);
+			drop_and_retry(q, skb, sch,
+				       SKB_DROP_REASON_DUALPI2_STEP_DROP);
+			continue;
+		}
+
+		q->c_protection_credit += credit_change;
+		qdisc_bstats_update(sch, skb);
+		break;
+	}
+
+	if (q->deferred_drops_cnt) {
+		qdisc_tree_reduce_backlog(sch, q->deferred_drops_cnt,
+					  q->deferred_drops_len);
+		q->deferred_drops_cnt = 0;
+		q->deferred_drops_len = 0;
+	}
+	return skb;
+}
+
 static s64 __scale_delta(u64 diff)
 {
 	do_div(diff, 1 << ALPHA_BETA_GRANULARITY);
@@ -167,8 +629,8 @@ static void get_queue_delays(struct dualpi2_sched_data *q, u64 *qdelay_c,
 	u64 now, qc, ql;
 
 	now = ktime_get_ns();
-	qc = q->c_head_ts;
-	ql = q->l_head_ts;
+	qc = READ_ONCE(q->c_head_ts);
+	ql = READ_ONCE(q->l_head_ts);
 
 	*qdelay_c = qc ? now - qc : 0;
 	*qdelay_l = ql ? now - ql : 0;
@@ -254,7 +716,7 @@ static enum hrtimer_restart dualpi2_timer(struct hrtimer *timer)
 	root_lock = qdisc_lock(qdisc_root_sleeping(sch));
 	spin_lock(root_lock);
 
-	q->pi2_prob = calculate_probability(sch);
+	WRITE_ONCE(q->pi2_prob, calculate_probability(sch));
 	hrtimer_set_expires(&q->pi2_timer, next_pi2_timeout(q));
 
 	spin_unlock(root_lock);
@@ -559,7 +1021,7 @@ static int dualpi2_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 {
 	struct dualpi2_sched_data *q = qdisc_priv(sch);
 	struct tc_dualpi2_xstats st = {
-		.prob			= q->pi2_prob,
+		.prob			= READ_ONCE(q->pi2_prob),
 		.packets_in_c		= q->packets_in_c,
 		.packets_in_l		= q->packets_in_l,
 		.maxq			= q->maxq,
@@ -677,6 +1139,8 @@ static struct Qdisc_ops dualpi2_qdisc_ops __read_mostly = {
 	.id		= "dualpi2",
 	.cl_ops		= &dualpi2_class_ops,
 	.priv_size	= sizeof(struct dualpi2_sched_data),
+	.enqueue	= dualpi2_qdisc_enqueue,
+	.dequeue	= dualpi2_qdisc_dequeue,
 	.peek		= qdisc_peek_dequeued,
 	.init		= dualpi2_init,
 	.destroy	= dualpi2_destroy,
-- 
cgit v1.2.3


From 28517c8b6275a9cd25a4974d0e4d58eaba465a67 Mon Sep 17 00:00:00 2001
From: Dimitri Fedrau <dimitri.fedrau@liebherr.com>
Date: Wed, 23 Jul 2025 19:34:56 +0200
Subject: pwm: mc33xs2410: add hwmon support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Hwmon support is provided by a separate driver that resides in the hwmon
subsystem and is implemented as an auxiliary device. Add handling of
this device.

Signed-off-by: Dimitri Fedrau <dimitri.fedrau@liebherr.com>
Link: https://lore.kernel.org/r/20250723-mc33xs2410-hwmon-v5-1-f62aab71cd59@liebherr.com
Signed-off-by: Uwe Kleine-König <ukleinek@kernel.org>
---
 drivers/pwm/Kconfig          |  1 +
 drivers/pwm/pwm-mc33xs2410.c | 20 ++++++++++++++++++--
 include/linux/mc33xs2410.h   | 16 ++++++++++++++++
 3 files changed, 35 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/mc33xs2410.h

(limited to 'include')

diff --git a/drivers/pwm/Kconfig b/drivers/pwm/Kconfig
index 3ef1757502eb..64f1c86340fd 100644
--- a/drivers/pwm/Kconfig
+++ b/drivers/pwm/Kconfig
@@ -436,6 +436,7 @@ config PWM_MC33XS2410
 	tristate "MC33XS2410 PWM support"
 	depends on OF
 	depends on SPI
+	select AUXILIARY_BUS
 	help
 	  NXP MC33XS2410 high-side switch driver. The MC33XS2410 is a four
 	  channel high-side switch. The device is operational from 3.0 V
diff --git a/drivers/pwm/pwm-mc33xs2410.c b/drivers/pwm/pwm-mc33xs2410.c
index a1ac3445ccdb..6d99e3ff7239 100644
--- a/drivers/pwm/pwm-mc33xs2410.c
+++ b/drivers/pwm/pwm-mc33xs2410.c
@@ -17,11 +17,14 @@
  *   behavior of the output pin that is neither the old nor the new state,
  *   rather something in between.
  */
+#define DEFAULT_SYMBOL_NAMESPACE		"PWM_MC33XS2410"
 
+#include <linux/auxiliary_bus.h>
 #include <linux/bitfield.h>
 #include <linux/delay.h>
 #include <linux/err.h>
 #include <linux/math64.h>
+#include <linux/mc33xs2410.h>
 #include <linux/minmax.h>
 #include <linux/module.h>
 #include <linux/of.h>
@@ -120,12 +123,19 @@ static int mc33xs2410_read_reg(struct spi_device *spi, u8 reg, u16 *val, u8 flag
 	return mc33xs2410_read_regs(spi, &reg, flag, val, 1);
 }
 
-static int mc33xs2410_read_reg_ctrl(struct spi_device *spi, u8 reg, u16 *val)
+int mc33xs2410_read_reg_ctrl(struct spi_device *spi, u8 reg, u16 *val)
 {
 	return mc33xs2410_read_reg(spi, reg, val, MC33XS2410_FRAME_IN_DATA_RD);
 }
+EXPORT_SYMBOL_GPL(mc33xs2410_read_reg_ctrl);
 
-static int mc33xs2410_modify_reg(struct spi_device *spi, u8 reg, u8 mask, u8 val)
+int mc33xs2410_read_reg_diag(struct spi_device *spi, u8 reg, u16 *val)
+{
+	return mc33xs2410_read_reg(spi, reg, val, 0);
+}
+EXPORT_SYMBOL_GPL(mc33xs2410_read_reg_diag);
+
+int mc33xs2410_modify_reg(struct spi_device *spi, u8 reg, u8 mask, u8 val)
 {
 	u16 tmp;
 	int ret;
@@ -139,6 +149,7 @@ static int mc33xs2410_modify_reg(struct spi_device *spi, u8 reg, u8 mask, u8 val
 
 	return mc33xs2410_write_reg(spi, reg, tmp);
 }
+EXPORT_SYMBOL_GPL(mc33xs2410_modify_reg);
 
 static u8 mc33xs2410_pwm_get_freq(u64 period)
 {
@@ -314,6 +325,7 @@ static int mc33xs2410_reset(struct device *dev)
 static int mc33xs2410_probe(struct spi_device *spi)
 {
 	struct device *dev = &spi->dev;
+	struct auxiliary_device *adev;
 	struct pwm_chip *chip;
 	int ret;
 
@@ -361,6 +373,10 @@ static int mc33xs2410_probe(struct spi_device *spi)
 	if (ret < 0)
 		return dev_err_probe(dev, ret, "Failed to add pwm chip\n");
 
+	adev = devm_auxiliary_device_create(dev, "hwmon", NULL);
+	if (!adev)
+		return dev_err_probe(dev, -ENODEV, "Failed to register hwmon device\n");
+
 	return 0;
 }
 
diff --git a/include/linux/mc33xs2410.h b/include/linux/mc33xs2410.h
new file mode 100644
index 000000000000..31c0edf10dd7
--- /dev/null
+++ b/include/linux/mc33xs2410.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2024 Liebherr-Electronics and Drives GmbH
+ */
+#ifndef _MC33XS2410_H
+#define _MC33XS2410_H
+
+#include <linux/spi/spi.h>
+
+MODULE_IMPORT_NS("PWM_MC33XS2410");
+
+int mc33xs2410_read_reg_ctrl(struct spi_device *spi, u8 reg, u16 *val);
+int mc33xs2410_read_reg_diag(struct spi_device *spi, u8 reg, u16 *val);
+int mc33xs2410_modify_reg(struct spi_device *spi, u8 reg, u8 mask, u8 val);
+
+#endif /* _MC33XS2410_H */
-- 
cgit v1.2.3


From e038d985c9823a12cd64fa077d0c5aca2c644b67 Mon Sep 17 00:00:00 2001
From: Sven Peter <sven@kernel.org>
Date: Tue, 10 Jun 2025 15:29:46 +0000
Subject: mfd: Add Apple Silicon System Management Controller

The System Management Controller (SMC) on Apple Silicon machines is a
piece of hardware that exposes various functionalities such as
temperature sensors, voltage/power meters, shutdown/reboot handling,
GPIOs and more.

Communication happens via a shared mailbox using the RTKit protocol
which is also used for other co-processors. The SMC protocol then allows
reading and writing many different keys which implement the various
features. The MFD core device handles this protocol and exposes it
to the sub-devices.

Some of the sub-devices are potentially also useful on pre-M1 Apple
machines and support for SMCs on these machines can be added at a later
time.

Co-developed-by: Hector Martin <marcan@marcan.st>
Signed-off-by: Hector Martin <marcan@marcan.st>
Reviewed-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Neal Gompa <neal@gompa.dev>
Signed-off-by: Sven Peter <sven@kernel.org>
Link: https://lore.kernel.org/r/20250610-smc-6-15-v7-5-556cafd771d3@kernel.org
Signed-off-by: Lee Jones <lee@kernel.org>
---
 MAINTAINERS                |   2 +
 drivers/mfd/Kconfig        |  18 ++
 drivers/mfd/Makefile       |   1 +
 drivers/mfd/macsmc.c       | 498 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/macsmc.h | 279 +++++++++++++++++++++++++
 5 files changed, 798 insertions(+)
 create mode 100644 drivers/mfd/macsmc.c
 create mode 100644 include/linux/mfd/macsmc.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 329b2df08d4b..f6bf4643c20b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2364,6 +2364,7 @@ F:	drivers/input/touchscreen/apple_z2.c
 F:	drivers/iommu/apple-dart.c
 F:	drivers/iommu/io-pgtable-dart.c
 F:	drivers/irqchip/irq-apple-aic.c
+F:	drivers/mfd/macsmc.c
 F:	drivers/nvme/host/apple.c
 F:	drivers/nvmem/apple-efuses.c
 F:	drivers/nvmem/apple-spmi-nvmem.c
@@ -2376,6 +2377,7 @@ F:	drivers/video/backlight/apple_dwi_bl.c
 F:	drivers/watchdog/apple_wdt.c
 F:	include/dt-bindings/interrupt-controller/apple-aic.h
 F:	include/dt-bindings/pinctrl/apple.h
+F:	include/linux/mfd/macsmc.h
 F:	include/linux/soc/apple/*
 F:	include/uapi/drm/asahi_drm.h
 
diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 6fb3768e3d71..c6cc42360887 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -285,6 +285,24 @@ config MFD_CS42L43_SDW
 	  Select this to support the Cirrus Logic CS42L43 PC CODEC with
 	  headphone and class D speaker drivers over SoundWire.
 
+config MFD_MACSMC
+	tristate "Apple Silicon System Management Controller (SMC)"
+	depends on ARCH_APPLE || COMPILE_TEST
+	depends on OF
+	depends on APPLE_RTKIT
+	select MFD_CORE
+	help
+	  The System Management Controller (SMC) on Apple Silicon machines is a
+	  piece of hardware that exposes various functionalities such as
+	  temperature sensors, voltage/power meters, shutdown/reboot handling,
+	  GPIOs and more.
+
+	  Communication happens via a shared mailbox using the RTKit protocol
+	  which is also used for other co-processors. The SMC protocol then
+	  allows reading and writing many different keys which implement the
+	  various features. The MFD core device handles this protocol and
+	  exposes it to the sub-devices.
+
 config MFD_MADERA
 	tristate "Cirrus Logic Madera codecs"
 	select MFD_CORE
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index 79495f9f3457..f7bdedd5a66d 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -21,6 +21,7 @@ obj-$(CONFIG_MFD_CS42L43_SDW)	+= cs42l43-sdw.o
 obj-$(CONFIG_MFD_ENE_KB3930)	+= ene-kb3930.o
 obj-$(CONFIG_MFD_EXYNOS_LPASS)	+= exynos-lpass.o
 obj-$(CONFIG_MFD_GATEWORKS_GSC)	+= gateworks-gsc.o
+obj-$(CONFIG_MFD_MACSMC)	+= macsmc.o
 
 obj-$(CONFIG_MFD_TI_LP873X)	+= lp873x.o
 obj-$(CONFIG_MFD_TI_LP87565)	+= lp87565.o
diff --git a/drivers/mfd/macsmc.c b/drivers/mfd/macsmc.c
new file mode 100644
index 000000000000..870c8b2028a8
--- /dev/null
+++ b/drivers/mfd/macsmc.c
@@ -0,0 +1,498 @@
+// SPDX-License-Identifier: GPL-2.0-only OR MIT
+/*
+ * Apple SMC (System Management Controller) MFD driver
+ *
+ * Copyright The Asahi Linux Contributors
+ */
+
+#include <linux/bitfield.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/io.h>
+#include <linux/ioport.h>
+#include <linux/math.h>
+#include <linux/mfd/core.h>
+#include <linux/mfd/macsmc.h>
+#include <linux/notifier.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/overflow.h>
+#include <linux/platform_device.h>
+#include <linux/soc/apple/rtkit.h>
+#include <linux/unaligned.h>
+
+#define SMC_ENDPOINT			0x20
+
+/* We don't actually know the true size here but this seems reasonable */
+#define SMC_SHMEM_SIZE			0x1000
+#define SMC_MAX_SIZE			255
+
+#define SMC_MSG_READ_KEY		0x10
+#define SMC_MSG_WRITE_KEY		0x11
+#define SMC_MSG_GET_KEY_BY_INDEX	0x12
+#define SMC_MSG_GET_KEY_INFO		0x13
+#define SMC_MSG_INITIALIZE		0x17
+#define SMC_MSG_NOTIFICATION		0x18
+#define SMC_MSG_RW_KEY			0x20
+
+#define SMC_DATA			GENMASK_ULL(63, 32)
+#define SMC_WSIZE			GENMASK_ULL(31, 24)
+#define SMC_SIZE			GENMASK_ULL(23, 16)
+#define SMC_ID				GENMASK_ULL(15, 12)
+#define SMC_MSG				GENMASK_ULL(7, 0)
+#define SMC_RESULT			SMC_MSG
+
+#define SMC_TIMEOUT_MS		500
+
+static const struct mfd_cell apple_smc_devs[] = {
+	MFD_CELL_OF("macsmc-gpio", NULL, NULL, 0, 0, "apple,smc-gpio"),
+	MFD_CELL_OF("macsmc-reboot", NULL, NULL, 0, 0, "apple,smc-reboot"),
+};
+
+static int apple_smc_cmd_locked(struct apple_smc *smc, u64 cmd, u64 arg,
+				  u64 size, u64 wsize, u32 *ret_data)
+{
+	u8 result;
+	int ret;
+	u64 msg;
+
+	lockdep_assert_held(&smc->mutex);
+
+	if (smc->boot_stage != APPLE_SMC_INITIALIZED)
+		return -EIO;
+	if (smc->atomic_mode)
+		return -EIO;
+
+	reinit_completion(&smc->cmd_done);
+
+	smc->msg_id = (smc->msg_id + 1) & 0xf;
+	msg = (FIELD_PREP(SMC_MSG, cmd) |
+	       FIELD_PREP(SMC_SIZE, size) |
+	       FIELD_PREP(SMC_WSIZE, wsize) |
+	       FIELD_PREP(SMC_ID, smc->msg_id) |
+	       FIELD_PREP(SMC_DATA, arg));
+
+	ret = apple_rtkit_send_message(smc->rtk, SMC_ENDPOINT, msg, NULL, false);
+	if (ret) {
+		dev_err(smc->dev, "Failed to send command\n");
+		return ret;
+	}
+
+	if (wait_for_completion_timeout(&smc->cmd_done, msecs_to_jiffies(SMC_TIMEOUT_MS)) <= 0) {
+		dev_err(smc->dev, "Command timed out (%llx)", msg);
+		return -ETIMEDOUT;
+	}
+
+	if (FIELD_GET(SMC_ID, smc->cmd_ret) != smc->msg_id) {
+		dev_err(smc->dev, "Command sequence mismatch (expected %d, got %d)\n",
+			smc->msg_id, (unsigned int)FIELD_GET(SMC_ID, smc->cmd_ret));
+		return -EIO;
+	}
+
+	result = FIELD_GET(SMC_RESULT, smc->cmd_ret);
+	if (result)
+		return -EIO;
+
+	if (ret_data)
+		*ret_data = FIELD_GET(SMC_DATA, smc->cmd_ret);
+
+	return FIELD_GET(SMC_SIZE, smc->cmd_ret);
+}
+
+static int apple_smc_cmd(struct apple_smc *smc, u64 cmd, u64 arg,
+			 u64 size, u64 wsize, u32 *ret_data)
+{
+	guard(mutex)(&smc->mutex);
+
+	return apple_smc_cmd_locked(smc, cmd, arg, size, wsize, ret_data);
+}
+
+static int apple_smc_rw_locked(struct apple_smc *smc, smc_key key,
+				const void *wbuf, size_t wsize,
+				void *rbuf, size_t rsize)
+{
+	u64 smc_size, smc_wsize;
+	u32 rdata;
+	int ret;
+	u64 cmd;
+
+	lockdep_assert_held(&smc->mutex);
+
+	if (rsize > SMC_MAX_SIZE)
+		return -EINVAL;
+	if (wsize > SMC_MAX_SIZE)
+		return -EINVAL;
+
+	if (rsize && wsize) {
+		cmd = SMC_MSG_RW_KEY;
+		memcpy_toio(smc->shmem.iomem, wbuf, wsize);
+		smc_size = rsize;
+		smc_wsize = wsize;
+	} else if (wsize && !rsize) {
+		cmd = SMC_MSG_WRITE_KEY;
+		memcpy_toio(smc->shmem.iomem, wbuf, wsize);
+		/*
+		 * Setting size to the length we want to write and wsize to 0
+		 * looks silly but that's how the SMC protocol works ¯\_(ツ)_/¯
+		 */
+		smc_size = wsize;
+		smc_wsize = 0;
+	} else if (!wsize && rsize) {
+		cmd = SMC_MSG_READ_KEY;
+		smc_size = rsize;
+		smc_wsize = 0;
+	} else {
+		return -EINVAL;
+	}
+
+	ret = apple_smc_cmd_locked(smc, cmd, key, smc_size, smc_wsize, &rdata);
+	if (ret < 0)
+		return ret;
+
+	if (rsize) {
+		/*
+		 * Small data <= 4 bytes is returned as part of the reply
+		 * message which is sent over the mailbox FIFO. Everything
+		 * bigger has to be copied from SRAM which is mapped as
+		 * Device memory.
+		 */
+		if (rsize <= 4)
+			memcpy(rbuf, &rdata, rsize);
+		else
+			memcpy_fromio(rbuf, smc->shmem.iomem, rsize);
+	}
+
+	return ret;
+}
+
+int apple_smc_read(struct apple_smc *smc, smc_key key, void *buf, size_t size)
+{
+	guard(mutex)(&smc->mutex);
+
+	return apple_smc_rw_locked(smc, key, NULL, 0, buf, size);
+}
+EXPORT_SYMBOL(apple_smc_read);
+
+int apple_smc_write(struct apple_smc *smc, smc_key key, void *buf, size_t size)
+{
+	guard(mutex)(&smc->mutex);
+
+	return apple_smc_rw_locked(smc, key, buf, size, NULL, 0);
+}
+EXPORT_SYMBOL(apple_smc_write);
+
+int apple_smc_rw(struct apple_smc *smc, smc_key key, void *wbuf, size_t wsize,
+		 void *rbuf, size_t rsize)
+{
+	guard(mutex)(&smc->mutex);
+
+	return apple_smc_rw_locked(smc, key, wbuf, wsize, rbuf, rsize);
+}
+EXPORT_SYMBOL(apple_smc_rw);
+
+int apple_smc_get_key_by_index(struct apple_smc *smc, int index, smc_key *key)
+{
+	int ret;
+
+	ret = apple_smc_cmd(smc, SMC_MSG_GET_KEY_BY_INDEX, index, 0, 0, key);
+	if (ret >= 0)	/* on failure the command never wrote *key, so leave it alone */
+		*key = swab32(*key);
+	return ret;
+}
+EXPORT_SYMBOL(apple_smc_get_key_by_index);
+
+int apple_smc_get_key_info(struct apple_smc *smc, smc_key key, struct apple_smc_key_info *info)
+{
+	guard(mutex)(&smc->mutex);	/* reply sits in shmem; hold until it is copied out */
+	u8 key_info[6];
+	int ret = apple_smc_cmd_locked(smc, SMC_MSG_GET_KEY_INFO, key, 0, 0, NULL);
+
+	if (ret < 0 || !info)
+		return ret;
+	memcpy_fromio(key_info, smc->shmem.iomem, sizeof(key_info));
+	info->size = key_info[0];
+	info->type_code = get_unaligned_be32(&key_info[1]);
+	info->flags = key_info[5];
+	return ret;
+}
+EXPORT_SYMBOL(apple_smc_get_key_info);
+
+int apple_smc_enter_atomic(struct apple_smc *smc)
+{
+	guard(mutex)(&smc->mutex);
+
+	/*
+	 * Disable notifications since this is called before shutdown and no
+	 * notification handler will be able to handle the notification
+	 * using atomic operations only. Also ignore any failure here
+	 * because we're about to shut down or reboot anyway.
+	 * We can't use apple_smc_write_flag here since that would try to lock
+	 * smc->mutex again.
+	 */
+	const u8 flag = 0;
+
+	apple_smc_rw_locked(smc, SMC_KEY(NTAP), &flag, sizeof(flag), NULL, 0);
+
+	smc->atomic_mode = true;
+
+	return 0;
+}
+EXPORT_SYMBOL(apple_smc_enter_atomic);
+
+int apple_smc_write_atomic(struct apple_smc *smc, smc_key key, void *buf, size_t size)
+{
+	guard(spinlock_irqsave)(&smc->lock);
+	u8 result;
+	int ret;
+	u64 msg;
+
+	if (size > SMC_MAX_SIZE || size == 0)
+		return -EINVAL;
+
+	if (smc->boot_stage != APPLE_SMC_INITIALIZED)
+		return -EIO;
+	if (!smc->atomic_mode)
+		return -EIO;
+
+	memcpy_toio(smc->shmem.iomem, buf, size);
+	smc->msg_id = (smc->msg_id + 1) & 0xf;
+	msg = (FIELD_PREP(SMC_MSG, SMC_MSG_WRITE_KEY) |
+	       FIELD_PREP(SMC_SIZE, size) |
+	       FIELD_PREP(SMC_ID, smc->msg_id) |
+	       FIELD_PREP(SMC_DATA, key));
+	smc->atomic_pending = true;
+
+	ret = apple_rtkit_send_message(smc->rtk, SMC_ENDPOINT, msg, NULL, true);
+	if (ret < 0) {
+		dev_err(smc->dev, "Failed to send command (%d)\n", ret);
+		return ret;
+	}
+
+	while (smc->atomic_pending) {
+		ret = apple_rtkit_poll(smc->rtk);
+		if (ret < 0) {
+			dev_err(smc->dev, "RTKit poll failed (%llx)", msg);
+			return ret;
+		}
+		udelay(100);
+	}
+
+	if (FIELD_GET(SMC_ID, smc->cmd_ret) != smc->msg_id) {
+		dev_err(smc->dev, "Command sequence mismatch (expected %d, got %d)\n",
+			smc->msg_id, (unsigned int)FIELD_GET(SMC_ID, smc->cmd_ret));
+		return -EIO;
+	}
+
+	result = FIELD_GET(SMC_RESULT, smc->cmd_ret);
+	if (result)
+		return -EIO;
+
+	return FIELD_GET(SMC_SIZE, smc->cmd_ret);
+}
+EXPORT_SYMBOL(apple_smc_write_atomic);
+
+static void apple_smc_rtkit_crashed(void *cookie, const void *bfr, size_t bfr_len)
+{
+	struct apple_smc *smc = cookie;
+
+	smc->boot_stage = APPLE_SMC_ERROR_CRASHED;
+	dev_err(smc->dev, "SMC crashed! Your system will reboot in a few seconds...\n");
+}
+
+static int apple_smc_rtkit_shmem_setup(void *cookie, struct apple_rtkit_shmem *bfr)
+{
+	struct apple_smc *smc = cookie;
+	size_t bfr_end;
+
+	if (!bfr->iova) {
+		dev_err(smc->dev, "RTKit wants a RAM buffer\n");
+		return -EIO;
+	}
+
+	if (check_add_overflow(bfr->iova, bfr->size - 1, &bfr_end))
+		return -EFAULT;
+
+	if (bfr->iova < smc->sram->start || bfr->iova > smc->sram->end ||
+	    bfr_end > smc->sram->end) {
+		dev_err(smc->dev, "RTKit buffer request outside SRAM region: [0x%llx, 0x%llx]\n",
+			(unsigned long long)bfr->iova,
+			(unsigned long long)bfr_end);
+		return -EFAULT;
+	}
+
+	bfr->iomem = smc->sram_base + (bfr->iova - smc->sram->start);
+	bfr->is_mapped = true;
+
+	return 0;
+}
+
+static bool apple_smc_rtkit_recv_early(void *cookie, u8 endpoint, u64 message)
+{
+	struct apple_smc *smc = cookie;
+
+	if (endpoint != SMC_ENDPOINT) {
+		dev_warn(smc->dev, "Received message for unknown endpoint 0x%x\n", endpoint);
+		return false;
+	}
+
+	if (smc->boot_stage == APPLE_SMC_BOOTING) {
+		int ret;
+
+		smc->shmem.iova = message;
+		smc->shmem.size = SMC_SHMEM_SIZE;
+		ret = apple_smc_rtkit_shmem_setup(smc, &smc->shmem);
+		if (ret < 0) {
+			smc->boot_stage = APPLE_SMC_ERROR_NO_SHMEM;
+			dev_err(smc->dev, "Failed to initialize shared memory (%d)\n", ret);
+		} else {
+			smc->boot_stage = APPLE_SMC_INITIALIZED;
+		}
+		complete(&smc->init_done);
+	} else if (FIELD_GET(SMC_MSG, message) == SMC_MSG_NOTIFICATION) {
+		/* Handle these in the RTKit worker thread */
+		return false;
+	} else {
+		smc->cmd_ret = message;
+		if (smc->atomic_pending)
+			smc->atomic_pending = false;
+		else
+			complete(&smc->cmd_done);
+	}
+
+	return true;
+}
+
+static void apple_smc_rtkit_recv(void *cookie, u8 endpoint, u64 message)
+{
+	struct apple_smc *smc = cookie;
+
+	if (endpoint != SMC_ENDPOINT) {
+		dev_warn(smc->dev, "Received message for unknown endpoint 0x%x\n", endpoint);
+		return;
+	}
+
+	if (FIELD_GET(SMC_MSG, message) != SMC_MSG_NOTIFICATION) {
+		dev_warn(smc->dev, "Received unknown message from worker: 0x%llx\n", message);
+		return;
+	}
+
+	blocking_notifier_call_chain(&smc->event_handlers, FIELD_GET(SMC_DATA, message), NULL);
+}
+
+static const struct apple_rtkit_ops apple_smc_rtkit_ops = {
+	.crashed = apple_smc_rtkit_crashed,
+	.recv_message = apple_smc_rtkit_recv,
+	.recv_message_early = apple_smc_rtkit_recv_early,
+	.shmem_setup = apple_smc_rtkit_shmem_setup,
+};
+
+static void apple_smc_rtkit_shutdown(void *data)
+{
+	struct apple_smc *smc = data;
+
+	/* Shut down SMC firmware, if it's not completely wedged */
+	if (apple_rtkit_is_running(smc->rtk))
+		apple_rtkit_quiesce(smc->rtk);
+}
+
+static void apple_smc_disable_notifications(void *data)
+{
+	struct apple_smc *smc = data;
+
+	apple_smc_write_flag(smc, SMC_KEY(NTAP), false);
+}
+
+static int apple_smc_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct apple_smc *smc;
+	u32 count;
+	int ret;
+
+	smc = devm_kzalloc(dev, sizeof(*smc), GFP_KERNEL);
+	if (!smc)
+		return -ENOMEM;
+
+	/* Must be set up before any key accessor below takes them via guard() */
+	mutex_init(&smc->mutex);
+	spin_lock_init(&smc->lock);
+
+	smc->dev = &pdev->dev;
+	smc->sram_base = devm_platform_get_and_ioremap_resource(pdev, 1, &smc->sram);
+	if (IS_ERR(smc->sram_base))
+		return dev_err_probe(dev, PTR_ERR(smc->sram_base), "Failed to map SRAM region");
+
+	smc->rtk = devm_apple_rtkit_init(dev, smc, NULL, 0, &apple_smc_rtkit_ops);
+	if (IS_ERR(smc->rtk))
+		return dev_err_probe(dev, PTR_ERR(smc->rtk), "Failed to initialize RTKit");
+
+	smc->boot_stage = APPLE_SMC_BOOTING;
+	ret = apple_rtkit_wake(smc->rtk);
+	if (ret)
+		return dev_err_probe(dev, ret, "Failed to wake up SMC");
+
+	ret = devm_add_action_or_reset(dev, apple_smc_rtkit_shutdown, smc);
+	if (ret)
+		return dev_err_probe(dev, ret, "Failed to register rtkit shutdown action");
+
+	ret = apple_rtkit_start_ep(smc->rtk, SMC_ENDPOINT);
+	if (ret)
+		return dev_err_probe(dev, ret, "Failed to start SMC endpoint");
+
+	init_completion(&smc->init_done);
+	init_completion(&smc->cmd_done);
+
+	ret = apple_rtkit_send_message(smc->rtk, SMC_ENDPOINT,
+				       FIELD_PREP(SMC_MSG, SMC_MSG_INITIALIZE), NULL, false);
+	if (ret)
+		return dev_err_probe(dev, ret, "Failed to send init message");
+
+	if (!wait_for_completion_timeout(&smc->init_done, msecs_to_jiffies(SMC_TIMEOUT_MS)))
+		return dev_err_probe(dev, -ETIMEDOUT, "Timed out initializing SMC");
+
+	if (smc->boot_stage != APPLE_SMC_INITIALIZED)
+		return dev_err_probe(dev, -EIO, "SMC failed to boot successfully, boot stage=%d\n",
+				     smc->boot_stage);
+
+	dev_set_drvdata(&pdev->dev, smc);
+	BLOCKING_INIT_NOTIFIER_HEAD(&smc->event_handlers);
+
+	ret = apple_smc_read_u32(smc, SMC_KEY(#KEY), &count);
+	if (ret)
+		return dev_err_probe(smc->dev, ret, "Failed to get key count");
+	smc->key_count = be32_to_cpu(count);
+
+	/* Enable notifications */
+	apple_smc_write_flag(smc, SMC_KEY(NTAP), true);
+	ret = devm_add_action_or_reset(dev, apple_smc_disable_notifications, smc);
+	if (ret)
+		return dev_err_probe(dev, ret, "Failed to register notification disable action");
+
+	ret = devm_mfd_add_devices(smc->dev, PLATFORM_DEVID_NONE,
+				   apple_smc_devs, ARRAY_SIZE(apple_smc_devs),
+				   NULL, 0, NULL);
+	if (ret)
+		return dev_err_probe(smc->dev, ret, "Failed to register sub-devices");
+
+	return 0;
+}
+
+static const struct of_device_id apple_smc_of_match[] = {
+	{ .compatible = "apple,smc" },
+	{},
+};
+MODULE_DEVICE_TABLE(of, apple_smc_of_match);
+
+static struct platform_driver apple_smc_driver = {
+	.driver = {
+		.name = "macsmc",
+		.of_match_table = apple_smc_of_match,
+	},
+	.probe = apple_smc_probe,
+};
+module_platform_driver(apple_smc_driver);
+
+MODULE_AUTHOR("Hector Martin <marcan@marcan.st>");
+MODULE_AUTHOR("Sven Peter <sven@kernel.org>");
+MODULE_LICENSE("Dual MIT/GPL");
+MODULE_DESCRIPTION("Apple SMC driver");
diff --git a/include/linux/mfd/macsmc.h b/include/linux/mfd/macsmc.h
new file mode 100644
index 000000000000..6b13f01a8592
--- /dev/null
+++ b/include/linux/mfd/macsmc.h
@@ -0,0 +1,279 @@
+/* SPDX-License-Identifier: GPL-2.0-only OR MIT */
+/*
+ * Apple SMC (System Management Controller) core definitions
+ *
+ * Copyright (C) The Asahi Linux Contributors
+ */
+
+#ifndef _LINUX_MFD_MACSMC_H
+#define _LINUX_MFD_MACSMC_H
+
+#include <linux/soc/apple/rtkit.h>
+
+/**
+ * typedef smc_key - Alias for u32 to be used for SMC keys
+ *
+ * SMC keys are 32bit integers containing packed ASCII characters in natural
+ * integer order, i.e. 0xAABBCCDD, which represent the FourCC ABCD.
+ * The SMC driver is designed with this assumption and ensures the right
+ * endianness is used when these are stored to memory and sent to or received
+ * from the actual SMC firmware (which can be done in either shared memory or
+ * as 64bit mailbox message on Apple Silicon).
+ * Internally, SMC stores these keys in a table sorted lexicographically and
+ * allows resolving an index into this table to the corresponding SMC key.
+ * Thus, storing keys as u32 is very convenient as it allows to e.g. use
+ * normal comparison operators which directly map to the natural order used
+ * by SMC firmware.
+ *
+ * This simple type alias is introduced to allow easy recognition of SMC key
+ * variables and arguments.
+ */
+typedef u32 smc_key;
+
+/**
+ * SMC_KEY - Convert FourCC SMC keys in source code to smc_key
+ *
+ * This macro can be used to easily define FourCC SMC keys in source code
+ * and convert these to u32 / smc_key, e.g. SMC_KEY(NTAP) will expand to
+ * 0x4e544150.
+ *
+ * @s: FourCC SMC key to be converted
+ */
+#define SMC_KEY(s) ((smc_key)(_SMC_KEY(#s)))
+#define _SMC_KEY(s) (((s)[0] << 24) | ((s)[1] << 16) | ((s)[2] << 8) | (s)[3])
+
+#define APPLE_SMC_READABLE BIT(7)
+#define APPLE_SMC_WRITABLE BIT(6)
+#define APPLE_SMC_FUNCTION BIT(4)
+
+/**
+ * struct apple_smc_key_info - Information for a SMC key as returned by SMC
+ * @type_code: FourCC code indicating the type for this key.
+ *             Known types:
+ *              ch8*: ASCII string
+ *              flag: Boolean, 1 or 0
+ *              flt: 32-bit single-precision IEEE 754 float
+ *              hex: Binary data
+ *              ioft: 64bit Unsigned fixed-point integer (48.16)
+ *              {si,ui}{8,16,32,64}: Signed/Unsigned 8-/16-/32-/64-bit integer
+ * @size: Size of the buffer associated with this key
+ * @flags: Bitfield encoding flags (APPLE_SMC_{READABLE,WRITABLE,FUNCTION})
+ */
+struct apple_smc_key_info {
+	u32 type_code;
+	u8 size;
+	u8 flags;
+};
+
+/**
+ * enum apple_smc_boot_stage - SMC boot stage
+ * @APPLE_SMC_BOOTING: SMC is booting
+ * @APPLE_SMC_INITIALIZED: SMC is initialized and ready to use
+ * @APPLE_SMC_ERROR_NO_SHMEM: Shared memory could not be initialized during boot
+ * @APPLE_SMC_ERROR_CRASHED: SMC has crashed
+ */
+enum apple_smc_boot_stage {
+	APPLE_SMC_BOOTING,
+	APPLE_SMC_INITIALIZED,
+	APPLE_SMC_ERROR_NO_SHMEM,
+	APPLE_SMC_ERROR_CRASHED
+};
+
+/**
+ * struct apple_smc - State of one Apple SMC core device
+ * @dev: Underlying device struct for the physical backend device
+ * @key_count: Number of available SMC keys
+ * @first_key: First valid SMC key
+ * @last_key: Last valid SMC key
+ * @event_handlers: Notifier call chain for events received from SMC
+ * @rtk: Pointer to Apple RTKit instance
+ * @init_done: Completion for initialization
+ * @boot_stage: Current boot stage of SMC
+ * @sram: Pointer to SRAM resource
+ * @sram_base: SRAM base address
+ * @shmem: RTKit shared memory structure for SRAM
+ * @msg_id: Current message id for commands, will be incremented for each command
+ * @atomic_mode: Flag set when atomic mode is entered
+ * @atomic_pending: Flag indicating pending atomic command
+ * @cmd_done: Completion for command execution in non-atomic mode
+ * @cmd_ret: Return value from SMC for last command
+ * @mutex: Mutex for non-atomic mode
+ * @lock: Spinlock for atomic mode
+ */
+struct apple_smc {
+	struct device *dev;
+
+	u32 key_count;
+	smc_key first_key;
+	smc_key last_key;
+
+	struct blocking_notifier_head event_handlers;
+
+	struct apple_rtkit *rtk;
+
+	struct completion init_done;
+	enum apple_smc_boot_stage boot_stage;
+
+	struct resource *sram;
+	void __iomem *sram_base;
+	struct apple_rtkit_shmem shmem;
+
+	unsigned int msg_id;
+
+	bool atomic_mode;
+	bool atomic_pending;
+	struct completion cmd_done;
+	u64 cmd_ret;
+
+	struct mutex mutex;
+	spinlock_t lock;
+};
+
+/**
+ * apple_smc_read - Read size bytes from given SMC key into buf
+ * @smc: Pointer to apple_smc struct
+ * @key: smc_key to be read
+ * @buf: Buffer into which size bytes of data will be read from SMC
+ * @size: Number of bytes to be read into buf
+ *
+ * Return: Size field of the SMC reply on success, negative errno on error
+ */
+int apple_smc_read(struct apple_smc *smc, smc_key key, void *buf, size_t size);
+
+/**
+ * apple_smc_write - Write size bytes into given SMC key from buf
+ * @smc: Pointer to apple_smc struct
+ * @key: smc_key data will be written to
+ * @buf: Buffer from which size bytes of data will be written to SMC
+ * @size: Number of bytes to be written
+ *
+ * Return: Zero on success, negative errno on error
+ */
+int apple_smc_write(struct apple_smc *smc, smc_key key, void *buf, size_t size);
+
+/**
+ * apple_smc_enter_atomic - Enter atomic mode to be able to use apple_smc_write_atomic
+ * @smc: Pointer to apple_smc struct
+ *
+ * This function switches the SMC backend to atomic mode which allows the
+ * use of apple_smc_write_atomic while disabling *all* other functions.
+ * This is only used for shutdown/reboot which requires writing to a SMC
+ * key from atomic context.
+ *
+ * Return: Zero on success, negative errno on error
+ */
+int apple_smc_enter_atomic(struct apple_smc *smc);
+
+/**
+ * apple_smc_write_atomic - Write size bytes into given SMC key from buf without sleeping
+ * @smc: Pointer to apple_smc struct
+ * @key: smc_key data will be written to
+ * @buf: Buffer from which size bytes of data will be written to SMC
+ * @size: Number of bytes to be written
+ *
+ * Note that this function will fail if apple_smc_enter_atomic hasn't been
+ * called before.
+ *
+ * Return: Zero on success, negative errno on error
+ */
+int apple_smc_write_atomic(struct apple_smc *smc, smc_key key, void *buf, size_t size);
+
+/**
+ * apple_smc_rw - Write and then read using the given SMC key
+ * @smc: Pointer to apple_smc struct
+ * @key: smc_key data will be written to
+ * @wbuf: Buffer from which wsize bytes of data will be written to SMC
+ * @wsize: Number of bytes to be written
+ * @rbuf: Buffer to which rsize bytes of data will be read from SMC
+ * @rsize: Number of bytes to be read
+ *
+ * Return: Size field of the SMC reply on success, negative errno on error
+ */
+int apple_smc_rw(struct apple_smc *smc, smc_key key, void *wbuf, size_t wsize,
+		 void *rbuf, size_t rsize);
+
+/**
+ * apple_smc_get_key_by_index - Given an index return the corresponding SMC key
+ * @smc: Pointer to apple_smc struct
+ * @index: Index to be resolved
+ * @key: Buffer for SMC key to be returned
+ *
+ * Return: Zero on success, negative errno on error
+ */
+int apple_smc_get_key_by_index(struct apple_smc *smc, int index, smc_key *key);
+
+/**
+ * apple_smc_get_key_info - Get key information from SMC
+ * @smc: Pointer to apple_smc struct
+ * @key: Key to acquire information for
+ * @info: Pointer to struct apple_smc_key_info which will be filled
+ *
+ * Return: Zero on success, negative errno on error
+ */
+int apple_smc_get_key_info(struct apple_smc *smc, smc_key key, struct apple_smc_key_info *info);
+
+/**
+ * apple_smc_key_exists - Check if the given SMC key exists
+ * @smc: Pointer to apple_smc struct
+ * @key: smc_key to be checked
+ *
+ * Return: True if the key exists, false otherwise
+ */
+static inline bool apple_smc_key_exists(struct apple_smc *smc, smc_key key)
+{
+	return apple_smc_get_key_info(smc, key, NULL) >= 0;
+}
+
+#define APPLE_SMC_TYPE_OPS(type) \
+	static inline int apple_smc_read_##type(struct apple_smc *smc, smc_key key, type *p) \
+	{ \
+		int ret = apple_smc_read(smc, key, p, sizeof(*p)); \
+		return (ret < 0) ? ret : ((ret != sizeof(*p)) ? -EINVAL : 0); \
+	} \
+	static inline int apple_smc_write_##type(struct apple_smc *smc, smc_key key, type p) \
+	{ \
+		return apple_smc_write(smc, key, &p, sizeof(p)); \
+	} \
+	static inline int apple_smc_write_##type##_atomic(struct apple_smc *smc, smc_key key, type p) \
+	{ \
+		return apple_smc_write_atomic(smc, key, &p, sizeof(p)); \
+	} \
+	static inline int apple_smc_rw_##type(struct apple_smc *smc, smc_key key, \
+					      type w, type *r) \
+	{ \
+		int ret = apple_smc_rw(smc, key, &w, sizeof(w), r, sizeof(*r)); \
+		return (ret < 0) ? ret : ((ret != sizeof(*r)) ? -EINVAL : 0); \
+	}
+
+APPLE_SMC_TYPE_OPS(u64)
+APPLE_SMC_TYPE_OPS(u32)
+APPLE_SMC_TYPE_OPS(u16)
+APPLE_SMC_TYPE_OPS(u8)
+APPLE_SMC_TYPE_OPS(s64)
+APPLE_SMC_TYPE_OPS(s32)
+APPLE_SMC_TYPE_OPS(s16)
+APPLE_SMC_TYPE_OPS(s8)
+
+static inline int apple_smc_read_flag(struct apple_smc *smc, smc_key key, bool *flag)
+{
+	u8 val;
+	int ret = apple_smc_read_u8(smc, key, &val);
+
+	if (ret < 0)
+		return ret;
+
+	*flag = val ? true : false;
+	return ret;
+}
+
+static inline int apple_smc_write_flag(struct apple_smc *smc, smc_key key, bool state)
+{
+	return apple_smc_write_u8(smc, key, state ? 1 : 0);
+}
+
+static inline int apple_smc_write_flag_atomic(struct apple_smc *smc, smc_key key, bool state)
+{
+	return apple_smc_write_u8_atomic(smc, key, state ? 1 : 0);
+}
+
+#endif
-- 
cgit v1.2.3


From 897e8601b9cff1d054cdd53047f568b0e1995726 Mon Sep 17 00:00:00 2001
From: Halil Pasic <pasic@linux.ibm.com>
Date: Tue, 22 Jul 2025 18:18:17 +0200
Subject: s390/ism: fix concurrency management in ism_cmd()

The s390x ISM device data sheet clearly states that only one
request-response sequence is allowable per ISM function at any point in
time.  Unfortunately as of today the s390/ism driver in Linux does not
honor that requirement. This patch aims to rectify that.

This problem was discovered based on Aliaksei's bug report which states
that for certain workloads the ISM functions end up entering error state
(with PEC 2 as seen from the logs) after a while and as a consequence
connections handled by the respective function break, and for future
connection requests the ISM device is not considered -- given it is in a
dysfunctional state. During further debugging PEC 3A was observed as
well.

A kernel message like
[ 1211.244319] zpci: 061a:00:00.0: Event 0x2 reports an error for PCI function 0x61a
is a reliable indicator of the stated function entering error state
with PEC 2. Let me also point out that a kernel message like
[ 1211.244325] zpci: 061a:00:00.0: The ism driver bound to the device does not support error recovery
is a reliable indicator that the ISM function won't be auto-recovered
because the ISM driver currently lacks support for it.

On a technical level, without this synchronization, commands (inputs to
the FW) may be partially or fully overwritten (corrupted) by another CPU
trying to issue commands on the same function. There is hard evidence that
this can lead to DMB token values being used as DMB IOVAs, leading to
PEC 2 PCI events indicating invalid DMA. But this is only one of the
failure modes imaginable. In theory even completely losing one command
and executing another one twice and then trying to interpret the outputs
as if the command we intended to execute was actually executed and not
the other one is also possible.  Frankly, I don't feel confident about
providing an exhaustive list of possible consequences.

Fixes: 684b89bc39ce ("s390/ism: add device driver for internal shared memory")
Reported-by: Aliaksei Makarau <Aliaksei.Makarau@ibm.com>
Tested-by: Mahanta Jambigi <mjambigi@linux.ibm.com>
Tested-by: Aliaksei Makarau <Aliaksei.Makarau@ibm.com>
Signed-off-by: Halil Pasic <pasic@linux.ibm.com>
Reviewed-by: Alexandra Winter <wintera@linux.ibm.com>
Signed-off-by: Alexandra Winter <wintera@linux.ibm.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20250722161817.1298473-1-wintera@linux.ibm.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/s390/net/ism_drv.c | 3 +++
 include/linux/ism.h        | 1 +
 2 files changed, 4 insertions(+)

(limited to 'include')

diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c
index b7f15f303ea2..967dc4f9eea8 100644
--- a/drivers/s390/net/ism_drv.c
+++ b/drivers/s390/net/ism_drv.c
@@ -130,6 +130,7 @@ static int ism_cmd(struct ism_dev *ism, void *cmd)
 	struct ism_req_hdr *req = cmd;
 	struct ism_resp_hdr *resp = cmd;
 
+	spin_lock(&ism->cmd_lock);
 	__ism_write_cmd(ism, req + 1, sizeof(*req), req->len - sizeof(*req));
 	__ism_write_cmd(ism, req, 0, sizeof(*req));
 
@@ -143,6 +144,7 @@ static int ism_cmd(struct ism_dev *ism, void *cmd)
 	}
 	__ism_read_cmd(ism, resp + 1, sizeof(*resp), resp->len - sizeof(*resp));
 out:
+	spin_unlock(&ism->cmd_lock);
 	return resp->ret;
 }
 
@@ -606,6 +608,7 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 		return -ENOMEM;
 
 	spin_lock_init(&ism->lock);
+	spin_lock_init(&ism->cmd_lock);
 	dev_set_drvdata(&pdev->dev, ism);
 	ism->pdev = pdev;
 	ism->dev.parent = &pdev->dev;
diff --git a/include/linux/ism.h b/include/linux/ism.h
index 5428edd90982..8358b4cd7ba6 100644
--- a/include/linux/ism.h
+++ b/include/linux/ism.h
@@ -28,6 +28,7 @@ struct ism_dmb {
 
 struct ism_dev {
 	spinlock_t lock; /* protects the ism device */
+	spinlock_t cmd_lock; /* serializes cmds */
 	struct list_head list;
 	struct pci_dev *pdev;
 
-- 
cgit v1.2.3


From fe473fba6435f631a4c7459c02057bf57457e128 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Sat, 7 Jun 2025 21:22:32 +0100
Subject: mfd: twl6030-irq: Remove unused twl6030_mmc_card_detect*

twl6030_mmc_card_detect() and twl6030_mmc_card_detect_config() have been
unused since 2013's commit b2ff4790612b ("ARM: OMAP2+: Remove legacy
omap4_twl6030_hsmmc_init").

Remove them.

Signed-off-by: "Dr. David Alan Gilbert" <linux@treblig.org>
Link: https://lore.kernel.org/r/20250607202232.265344-1-linux@treblig.org
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/mfd/twl6030-irq.c | 74 -----------------------------------------------
 include/linux/mfd/twl.h   | 21 --------------
 2 files changed, 95 deletions(-)

(limited to 'include')

diff --git a/drivers/mfd/twl6030-irq.c b/drivers/mfd/twl6030-irq.c
index 00b14cef1dfb..ffb4b98639c7 100644
--- a/drivers/mfd/twl6030-irq.c
+++ b/drivers/mfd/twl6030-irq.c
@@ -256,80 +256,6 @@ int twl6030_interrupt_mask(u8 bit_mask, u8 offset)
 }
 EXPORT_SYMBOL(twl6030_interrupt_mask);
 
-int twl6030_mmc_card_detect_config(void)
-{
-	int ret;
-	u8 reg_val = 0;
-
-	/* Unmasking the Card detect Interrupt line for MMC1 from Phoenix */
-	twl6030_interrupt_unmask(TWL6030_MMCDETECT_INT_MASK,
-						REG_INT_MSK_LINE_B);
-	twl6030_interrupt_unmask(TWL6030_MMCDETECT_INT_MASK,
-						REG_INT_MSK_STS_B);
-	/*
-	 * Initially Configuring MMC_CTRL for receiving interrupts &
-	 * Card status on TWL6030 for MMC1
-	 */
-	ret = twl_i2c_read_u8(TWL6030_MODULE_ID0, &reg_val, TWL6030_MMCCTRL);
-	if (ret < 0) {
-		pr_err("twl6030: Failed to read MMCCTRL, error %d\n", ret);
-		return ret;
-	}
-	reg_val &= ~VMMC_AUTO_OFF;
-	reg_val |= SW_FC;
-	ret = twl_i2c_write_u8(TWL6030_MODULE_ID0, reg_val, TWL6030_MMCCTRL);
-	if (ret < 0) {
-		pr_err("twl6030: Failed to write MMCCTRL, error %d\n", ret);
-		return ret;
-	}
-
-	/* Configuring PullUp-PullDown register */
-	ret = twl_i2c_read_u8(TWL6030_MODULE_ID0, &reg_val,
-						TWL6030_CFG_INPUT_PUPD3);
-	if (ret < 0) {
-		pr_err("twl6030: Failed to read CFG_INPUT_PUPD3, error %d\n",
-									ret);
-		return ret;
-	}
-	reg_val &= ~(MMC_PU | MMC_PD);
-	ret = twl_i2c_write_u8(TWL6030_MODULE_ID0, reg_val,
-						TWL6030_CFG_INPUT_PUPD3);
-	if (ret < 0) {
-		pr_err("twl6030: Failed to write CFG_INPUT_PUPD3, error %d\n",
-									ret);
-		return ret;
-	}
-
-	return irq_find_mapping(twl6030_irq->irq_domain,
-				 MMCDETECT_INTR_OFFSET);
-}
-EXPORT_SYMBOL(twl6030_mmc_card_detect_config);
-
-int twl6030_mmc_card_detect(struct device *dev, int slot)
-{
-	int ret = -EIO;
-	u8 read_reg = 0;
-	struct platform_device *pdev = to_platform_device(dev);
-
-	if (pdev->id) {
-		/* TWL6030 provide's Card detect support for
-		 * only MMC1 controller.
-		 */
-		pr_err("Unknown MMC controller %d in %s\n", pdev->id, __func__);
-		return ret;
-	}
-	/*
-	 * BIT0 of MMC_CTRL on TWL6030 provides card status for MMC1
-	 * 0 - Card not present ,1 - Card present
-	 */
-	ret = twl_i2c_read_u8(TWL6030_MODULE_ID0, &read_reg,
-						TWL6030_MMCCTRL);
-	if (ret >= 0)
-		ret = read_reg & STS_MMC;
-	return ret;
-}
-EXPORT_SYMBOL(twl6030_mmc_card_detect);
-
 static int twl6030_irq_map(struct irq_domain *d, unsigned int virq,
 			      irq_hw_number_t hwirq)
 {
diff --git a/include/linux/mfd/twl.h b/include/linux/mfd/twl.h
index 85dc406173db..b31e07fa4d51 100644
--- a/include/linux/mfd/twl.h
+++ b/include/linux/mfd/twl.h
@@ -205,27 +205,6 @@ int twl_get_hfclk_rate(void);
 int twl6030_interrupt_unmask(u8 bit_mask, u8 offset);
 int twl6030_interrupt_mask(u8 bit_mask, u8 offset);
 
-/* Card detect Configuration for MMC1 Controller on OMAP4 */
-#ifdef CONFIG_TWL4030_CORE
-int twl6030_mmc_card_detect_config(void);
-#else
-static inline int twl6030_mmc_card_detect_config(void)
-{
-	pr_debug("twl6030_mmc_card_detect_config not supported\n");
-	return 0;
-}
-#endif
-
-/* MMC1 Controller on OMAP4 uses Phoenix irq for Card detect */
-#ifdef CONFIG_TWL4030_CORE
-int twl6030_mmc_card_detect(struct device *dev, int slot);
-#else
-static inline int twl6030_mmc_card_detect(struct device *dev, int slot)
-{
-	pr_debug("Call back twl6030_mmc_card_detect not supported\n");
-	return -EIO;
-}
-#endif
 /*----------------------------------------------------------------------*/
 
 /*
-- 
cgit v1.2.3


From ea39dd2638ff6920c342313db98dbc152e815ecd Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sat, 21 Jun 2025 20:30:51 +0200
Subject: mfd: tps65219: Remove an unused field from 'struct tps65219'

Since commit 3df4c6367520 ("mfd: tps65219: Add support for soft shutdown
via sys-off API"), the 'nb' field from 'struct tps65219' is unused.
Remove it.

Also remove the now useless #include <linux/notifier.h> for the same
reason.

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Link: https://lore.kernel.org/r/8a264c3a92b8e62c1dadd374f2685030e042eb08.1750530460.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/mfd/tps65219.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/mfd/tps65219.h b/include/linux/mfd/tps65219.h
index 3e8d29189267..690002932377 100644
--- a/include/linux/mfd/tps65219.h
+++ b/include/linux/mfd/tps65219.h
@@ -10,7 +10,6 @@
 #define MFD_TPS65219_H
 
 #include <linux/bitops.h>
-#include <linux/notifier.h>
 #include <linux/regmap.h>
 #include <linux/regulator/driver.h>
 
@@ -440,7 +439,6 @@ enum tps65219_irqs {
  * @regmap: Regmap for accessing the device registers
  * @chip_id: Chip ID
  * @irq_data: Regmap irq data used for the irq chip
- * @nb: notifier block for the restart handler
  */
 struct tps65219 {
 	struct device *dev;
@@ -448,7 +446,6 @@ struct tps65219 {
 
 	unsigned int chip_id;
 	struct regmap_irq_chip_data *irq_data;
-	struct notifier_block nb;
 };
 
 #endif /* MFD_TPS65219_H */
-- 
cgit v1.2.3


From 83f9afe4689d96279d9ade6c1cce36fa86e8ac63 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sat, 21 Jun 2025 20:30:52 +0200
Subject: mfd: tps65219: Remove another unused field from 'struct tps65219'

The 'chip_id' field from 'struct tps65219' is unused.
Remove it.

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Link: https://lore.kernel.org/r/f20443e6e13b0b101648a41010a19ee56589fa0b.1750530460.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/mfd/tps65219.c       | 5 +++--
 include/linux/mfd/tps65219.h | 2 --
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/mfd/tps65219.c b/drivers/mfd/tps65219.c
index 83b8ab4707c2..65a952555218 100644
--- a/drivers/mfd/tps65219.c
+++ b/drivers/mfd/tps65219.c
@@ -477,6 +477,7 @@ static int tps65219_probe(struct i2c_client *client)
 {
 	struct tps65219 *tps;
 	const struct tps65219_chip_data *pmic;
+	unsigned int chip_id;
 	bool pwr_button;
 	int ret;
 
@@ -487,8 +488,8 @@ static int tps65219_probe(struct i2c_client *client)
 	i2c_set_clientdata(client, tps);
 
 	tps->dev = &client->dev;
-	tps->chip_id = (uintptr_t)i2c_get_match_data(client);
-	pmic = &chip_info_table[tps->chip_id];
+	chip_id = (uintptr_t)i2c_get_match_data(client);
+	pmic = &chip_info_table[chip_id];
 
 	tps->regmap = devm_regmap_init_i2c(client, &tps65219_regmap_config);
 	if (IS_ERR(tps->regmap)) {
diff --git a/include/linux/mfd/tps65219.h b/include/linux/mfd/tps65219.h
index 690002932377..55234e771ba7 100644
--- a/include/linux/mfd/tps65219.h
+++ b/include/linux/mfd/tps65219.h
@@ -437,14 +437,12 @@ enum tps65219_irqs {
  *
  * @dev: MFD device
  * @regmap: Regmap for accessing the device registers
- * @chip_id: Chip ID
  * @irq_data: Regmap irq data used for the irq chip
  */
 struct tps65219 {
 	struct device *dev;
 	struct regmap *regmap;
 
-	unsigned int chip_id;
 	struct regmap_irq_chip_data *irq_data;
 };
 
-- 
cgit v1.2.3


From e403cdf0704b7b1fedaf4ed5f05cc814dffbd6d5 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Tue, 1 Jul 2025 15:56:25 +0100
Subject: mfd: pcf50633: Remove the header file core.h

The patches to remove all of the pieces of the pcf50633
have gone in and we're left with the header.

Remove it.

The pcf50633 was used as part of the OpenMoko devices but
the support for its main chip was recently removed in:
commit 61b7f8920b17 ("ARM: s3c: remove all s3c24xx support")

See https://lore.kernel.org/all/Z8z236h4B5A6Ki3D@gallifrey/

Signed-off-by: "Dr. David Alan Gilbert" <linux@treblig.org>
Link: https://lore.kernel.org/r/20250701145625.204048-1-linux@treblig.org
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/mfd/pcf50633/core.h | 229 --------------------------------------
 1 file changed, 229 deletions(-)
 delete mode 100644 include/linux/mfd/pcf50633/core.h

(limited to 'include')

diff --git a/include/linux/mfd/pcf50633/core.h b/include/linux/mfd/pcf50633/core.h
deleted file mode 100644
index 42d2b0e4884e..000000000000
--- a/include/linux/mfd/pcf50633/core.h
+++ /dev/null
@@ -1,229 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * core.h  -- Core driver for NXP PCF50633
- *
- * (C) 2006-2008 by Openmoko, Inc.
- * All rights reserved.
- */
-
-#ifndef __LINUX_MFD_PCF50633_CORE_H
-#define __LINUX_MFD_PCF50633_CORE_H
-
-#include <linux/i2c.h>
-#include <linux/workqueue.h>
-#include <linux/regulator/driver.h>
-#include <linux/regulator/machine.h>
-#include <linux/pm.h>
-#include <linux/power_supply.h>
-
-struct pcf50633;
-struct regmap;
-
-#define PCF50633_NUM_REGULATORS	11
-
-struct pcf50633_platform_data {
-	struct regulator_init_data reg_init_data[PCF50633_NUM_REGULATORS];
-
-	char **batteries;
-	int num_batteries;
-
-	/*
-	 * Should be set accordingly to the reference resistor used, see
-	 * I_{ch(ref)} charger reference current in the pcf50633 User
-	 * Manual.
-	 */
-	int charger_reference_current_ma;
-
-	/* Callbacks */
-	void (*probe_done)(struct pcf50633 *);
-	void (*mbc_event_callback)(struct pcf50633 *, int);
-	void (*regulator_registered)(struct pcf50633 *, int);
-	void (*force_shutdown)(struct pcf50633 *);
-
-	u8 resumers[5];
-};
-
-struct pcf50633_irq {
-	void (*handler) (int, void *);
-	void *data;
-};
-
-int pcf50633_register_irq(struct pcf50633 *pcf, int irq,
-			void (*handler) (int, void *), void *data);
-int pcf50633_free_irq(struct pcf50633 *pcf, int irq);
-
-int pcf50633_irq_mask(struct pcf50633 *pcf, int irq);
-int pcf50633_irq_unmask(struct pcf50633 *pcf, int irq);
-int pcf50633_irq_mask_get(struct pcf50633 *pcf, int irq);
-
-int pcf50633_read_block(struct pcf50633 *, u8 reg,
-					int nr_regs, u8 *data);
-int pcf50633_write_block(struct pcf50633 *pcf, u8 reg,
-					int nr_regs, u8 *data);
-u8 pcf50633_reg_read(struct pcf50633 *, u8 reg);
-int pcf50633_reg_write(struct pcf50633 *pcf, u8 reg, u8 val);
-
-int pcf50633_reg_set_bit_mask(struct pcf50633 *pcf, u8 reg, u8 mask, u8 val);
-int pcf50633_reg_clear_bits(struct pcf50633 *pcf, u8 reg, u8 bits);
-
-/* Interrupt registers */
-
-#define PCF50633_REG_INT1	0x02
-#define PCF50633_REG_INT2	0x03
-#define PCF50633_REG_INT3	0x04
-#define PCF50633_REG_INT4	0x05
-#define PCF50633_REG_INT5	0x06
-
-#define PCF50633_REG_INT1M	0x07
-#define PCF50633_REG_INT2M	0x08
-#define PCF50633_REG_INT3M	0x09
-#define PCF50633_REG_INT4M	0x0a
-#define PCF50633_REG_INT5M	0x0b
-
-enum {
-	/* Chip IRQs */
-	PCF50633_IRQ_ADPINS,
-	PCF50633_IRQ_ADPREM,
-	PCF50633_IRQ_USBINS,
-	PCF50633_IRQ_USBREM,
-	PCF50633_IRQ_RESERVED1,
-	PCF50633_IRQ_RESERVED2,
-	PCF50633_IRQ_ALARM,
-	PCF50633_IRQ_SECOND,
-	PCF50633_IRQ_ONKEYR,
-	PCF50633_IRQ_ONKEYF,
-	PCF50633_IRQ_EXTON1R,
-	PCF50633_IRQ_EXTON1F,
-	PCF50633_IRQ_EXTON2R,
-	PCF50633_IRQ_EXTON2F,
-	PCF50633_IRQ_EXTON3R,
-	PCF50633_IRQ_EXTON3F,
-	PCF50633_IRQ_BATFULL,
-	PCF50633_IRQ_CHGHALT,
-	PCF50633_IRQ_THLIMON,
-	PCF50633_IRQ_THLIMOFF,
-	PCF50633_IRQ_USBLIMON,
-	PCF50633_IRQ_USBLIMOFF,
-	PCF50633_IRQ_ADCRDY,
-	PCF50633_IRQ_ONKEY1S,
-	PCF50633_IRQ_LOWSYS,
-	PCF50633_IRQ_LOWBAT,
-	PCF50633_IRQ_HIGHTMP,
-	PCF50633_IRQ_AUTOPWRFAIL,
-	PCF50633_IRQ_DWN1PWRFAIL,
-	PCF50633_IRQ_DWN2PWRFAIL,
-	PCF50633_IRQ_LEDPWRFAIL,
-	PCF50633_IRQ_LEDOVP,
-	PCF50633_IRQ_LDO1PWRFAIL,
-	PCF50633_IRQ_LDO2PWRFAIL,
-	PCF50633_IRQ_LDO3PWRFAIL,
-	PCF50633_IRQ_LDO4PWRFAIL,
-	PCF50633_IRQ_LDO5PWRFAIL,
-	PCF50633_IRQ_LDO6PWRFAIL,
-	PCF50633_IRQ_HCLDOPWRFAIL,
-	PCF50633_IRQ_HCLDOOVL,
-
-	/* Always last */
-	PCF50633_NUM_IRQ,
-};
-
-struct pcf50633 {
-	struct device *dev;
-	struct regmap *regmap;
-
-	struct pcf50633_platform_data *pdata;
-	int irq;
-	struct pcf50633_irq irq_handler[PCF50633_NUM_IRQ];
-	struct work_struct irq_work;
-	struct workqueue_struct *work_queue;
-	struct mutex lock;
-
-	u8 mask_regs[5];
-
-	u8 suspend_irq_masks[5];
-	u8 resume_reason[5];
-	int is_suspended;
-
-	int onkey1s_held;
-
-	struct platform_device *rtc_pdev;
-	struct platform_device *mbc_pdev;
-	struct platform_device *adc_pdev;
-	struct platform_device *input_pdev;
-	struct platform_device *bl_pdev;
-	struct platform_device *regulator_pdev[PCF50633_NUM_REGULATORS];
-};
-
-enum pcf50633_reg_int1 {
-	PCF50633_INT1_ADPINS	= 0x01,	/* Adapter inserted */
-	PCF50633_INT1_ADPREM	= 0x02,	/* Adapter removed */
-	PCF50633_INT1_USBINS	= 0x04,	/* USB inserted */
-	PCF50633_INT1_USBREM	= 0x08,	/* USB removed */
-	/* reserved */
-	PCF50633_INT1_ALARM	= 0x40, /* RTC alarm time is reached */
-	PCF50633_INT1_SECOND	= 0x80,	/* RTC periodic second interrupt */
-};
-
-enum pcf50633_reg_int2 {
-	PCF50633_INT2_ONKEYR	= 0x01, /* ONKEY rising edge */
-	PCF50633_INT2_ONKEYF	= 0x02, /* ONKEY falling edge */
-	PCF50633_INT2_EXTON1R	= 0x04, /* EXTON1 rising edge */
-	PCF50633_INT2_EXTON1F	= 0x08, /* EXTON1 falling edge */
-	PCF50633_INT2_EXTON2R	= 0x10, /* EXTON2 rising edge */
-	PCF50633_INT2_EXTON2F	= 0x20, /* EXTON2 falling edge */
-	PCF50633_INT2_EXTON3R	= 0x40, /* EXTON3 rising edge */
-	PCF50633_INT2_EXTON3F	= 0x80, /* EXTON3 falling edge */
-};
-
-enum pcf50633_reg_int3 {
-	PCF50633_INT3_BATFULL	= 0x01, /* Battery full */
-	PCF50633_INT3_CHGHALT	= 0x02,	/* Charger halt */
-	PCF50633_INT3_THLIMON	= 0x04,
-	PCF50633_INT3_THLIMOFF	= 0x08,
-	PCF50633_INT3_USBLIMON	= 0x10,
-	PCF50633_INT3_USBLIMOFF	= 0x20,
-	PCF50633_INT3_ADCRDY	= 0x40, /* ADC result ready */
-	PCF50633_INT3_ONKEY1S	= 0x80,	/* ONKEY pressed 1 second */
-};
-
-enum pcf50633_reg_int4 {
-	PCF50633_INT4_LOWSYS		= 0x01,
-	PCF50633_INT4_LOWBAT		= 0x02,
-	PCF50633_INT4_HIGHTMP		= 0x04,
-	PCF50633_INT4_AUTOPWRFAIL	= 0x08,
-	PCF50633_INT4_DWN1PWRFAIL	= 0x10,
-	PCF50633_INT4_DWN2PWRFAIL	= 0x20,
-	PCF50633_INT4_LEDPWRFAIL	= 0x40,
-	PCF50633_INT4_LEDOVP		= 0x80,
-};
-
-enum pcf50633_reg_int5 {
-	PCF50633_INT5_LDO1PWRFAIL	= 0x01,
-	PCF50633_INT5_LDO2PWRFAIL	= 0x02,
-	PCF50633_INT5_LDO3PWRFAIL	= 0x04,
-	PCF50633_INT5_LDO4PWRFAIL	= 0x08,
-	PCF50633_INT5_LDO5PWRFAIL	= 0x10,
-	PCF50633_INT5_LDO6PWRFAIL	= 0x20,
-	PCF50633_INT5_HCLDOPWRFAIL	= 0x40,
-	PCF50633_INT5_HCLDOOVL		= 0x80,
-};
-
-/* misc. registers */
-#define PCF50633_REG_OOCSHDWN	0x0c
-
-/* LED registers */
-#define PCF50633_REG_LEDOUT 0x28
-#define PCF50633_REG_LEDENA 0x29
-#define PCF50633_REG_LEDCTL 0x2a
-#define PCF50633_REG_LEDDIM 0x2b
-
-static inline struct pcf50633 *dev_to_pcf50633(struct device *dev)
-{
-	return dev_get_drvdata(dev);
-}
-
-int pcf50633_irq_init(struct pcf50633 *pcf, int irq);
-void pcf50633_irq_free(struct pcf50633 *pcf);
-extern const struct dev_pm_ops pcf50633_pm;
-
-#endif
-- 
cgit v1.2.3


From c371040f31ab63de992c7d811ffc4c7d450b46a5 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 26 Jun 2025 18:43:54 +0300
Subject: mfd: davinci_voicecodec: Don't use "proxy" headers

Update header inclusions to follow IWYU (Include What You Use)
principle.

Note that including kernel.h is discouraged, as stated at the top of
that file.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20250626154354.324439-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/mfd/davinci_voicecodec.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/mfd/davinci_voicecodec.h b/include/linux/mfd/davinci_voicecodec.h
index 556375b91316..9acd703dd5ca 100644
--- a/include/linux/mfd/davinci_voicecodec.h
+++ b/include/linux/mfd/davinci_voicecodec.h
@@ -10,11 +10,13 @@
 #ifndef __LINUX_MFD_DAVINCI_VOICECODEC_H_
 #define __LINUX_MFD_DAVINCI_VOICECODEC_H_
 
-#include <linux/kernel.h>
-#include <linux/platform_device.h>
+#include <linux/bits.h>
 #include <linux/mfd/core.h>
-#include <linux/platform_data/edma.h>
+#include <linux/types.h>
 
+struct clk;
+struct device;
+struct platform_device;
 struct regmap;
 
 /*
-- 
cgit v1.2.3


From dd1902b6e90508b4243af930db933ea3d26d2981 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 26 Jun 2025 18:59:51 +0300
Subject: mfd: wm8350-core: Don't use "proxy" headers

Update header inclusions to follow IWYU (Include What You Use)
principle.

Note that including kernel.h is discouraged, as stated at the top of
that file.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://lore.kernel.org/r/20250626155951.325683-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/mfd/wm8350/core.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/mfd/wm8350/core.h b/include/linux/mfd/wm8350/core.h
index a3241e4d7548..5f70d3b5d1b1 100644
--- a/include/linux/mfd/wm8350/core.h
+++ b/include/linux/mfd/wm8350/core.h
@@ -8,11 +8,12 @@
 #ifndef __LINUX_MFD_WM8350_CORE_H_
 #define __LINUX_MFD_WM8350_CORE_H_
 
-#include <linux/kernel.h>
-#include <linux/mutex.h>
-#include <linux/interrupt.h>
 #include <linux/completion.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/mutex.h>
 #include <linux/regmap.h>
+#include <linux/types.h>
 
 #include <linux/mfd/wm8350/audio.h>
 #include <linux/mfd/wm8350/gpio.h>
@@ -21,6 +22,9 @@
 #include <linux/mfd/wm8350/supply.h>
 #include <linux/mfd/wm8350/wdt.h>
 
+struct device;
+struct platform_device;
+
 /*
  * Register values.
  */
-- 
cgit v1.2.3


From dd394515d18aedd379e8dc886cb8286e1714f735 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 26 Jun 2025 18:45:44 +0300
Subject: mfd: madera: Don't use "proxy" headers

Update header inclusions to follow IWYU (Include What You Use)
principle.

Note that including kernel.h is discouraged, as stated at the top of
that file.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://lore.kernel.org/r/20250626154544.324724-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/mfd/madera/pdata.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mfd/madera/pdata.h b/include/linux/mfd/madera/pdata.h
index 32e3470708ed..7e84738cbb20 100644
--- a/include/linux/mfd/madera/pdata.h
+++ b/include/linux/mfd/madera/pdata.h
@@ -8,10 +8,11 @@
 #ifndef MADERA_PDATA_H
 #define MADERA_PDATA_H
 
-#include <linux/kernel.h>
 #include <linux/regulator/arizona-ldo1.h>
 #include <linux/regulator/arizona-micsupp.h>
 #include <linux/regulator/machine.h>
+#include <linux/types.h>
+
 #include <sound/madera-pdata.h>
 
 #define MADERA_MAX_MICBIAS		4
-- 
cgit v1.2.3


From b9ec71fbd572042770df16c9b65bbf91cbd556cf Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 27 Jun 2025 19:43:58 +0300
Subject: mfd: syscon: atmel-smc: Don't use "proxy" headers

Update header inclusions to follow IWYU (Include What You Use)
principle.

Note that including kernel.h is discouraged, as stated at the top of
that file.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20250627164414.1043434-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/mfd/atmel-smc.c              | 9 ++++++++-
 include/linux/mfd/syscon/atmel-smc.h | 8 +++++---
 2 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/mfd/atmel-smc.c b/drivers/mfd/atmel-smc.c
index 4628ca14e766..0a5b42c83f17 100644
--- a/drivers/mfd/atmel-smc.c
+++ b/drivers/mfd/atmel-smc.c
@@ -8,9 +8,16 @@
  * Author: Boris Brezillon <boris.brezillon@free-electrons.com>
  */
 
-#include <linux/mfd/syscon/atmel-smc.h>
+#include <linux/bits.h>
+#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/mod_devicetable.h>
+#include <linux/of.h>
+#include <linux/regmap.h>
 #include <linux/string.h>
 
+#include <linux/mfd/syscon/atmel-smc.h>
+
 /**
  * atmel_smc_cs_conf_init - initialize a SMC CS conf
  * @conf: the SMC CS conf to initialize
diff --git a/include/linux/mfd/syscon/atmel-smc.h b/include/linux/mfd/syscon/atmel-smc.h
index e9e24f4c4578..9b9119c742a2 100644
--- a/include/linux/mfd/syscon/atmel-smc.h
+++ b/include/linux/mfd/syscon/atmel-smc.h
@@ -11,9 +11,11 @@
 #ifndef _LINUX_MFD_SYSCON_ATMEL_SMC_H_
 #define _LINUX_MFD_SYSCON_ATMEL_SMC_H_
 
-#include <linux/kernel.h>
-#include <linux/of.h>
-#include <linux/regmap.h>
+#include <linux/bits.h>
+#include <linux/types.h>
+
+struct device_node;
+struct regmap;
 
 #define ATMEL_SMC_SETUP(cs)			(((cs) * 0x10))
 #define ATMEL_HSMC_SETUP(layout, cs)		\
-- 
cgit v1.2.3


From db8db85cff331eb5a520a18a606692ff85405c3d Mon Sep 17 00:00:00 2001
From: Quentin Schulz <quentin.schulz@cherry.de>
Date: Fri, 27 Jun 2025 12:53:54 +0200
Subject: mfd: rk8xx-core: Allow to customize RK806 reset mode

The RK806 PMIC has a bitfield for configuring the restart/reset behavior
(which I assume Rockchip calls "function") whenever the PMIC is reset
either programmatically (cf. DEV_RST in the datasheet) or via PWRCTRL
or RESETB pins.

For RK806, the following values are possible for RST_FUN:

0b00 means "Restart PMU"
0b01 means "Reset all the power off reset registers, forcing
	the state to switch to ACTIVE mode"
0b10 means "Reset all the power off reset registers, forcing
	the state to switch to ACTIVE mode, and simultaneously
	pull down the RESETB PIN for 5mS before releasing"
0b11 means the same as for 0b10 just above.

This adds the appropriate logic in the driver to parse the new
rockchip,reset-mode DT property to pass this information. It just
happens that the values in the binding match the values to write in the
bitfield so no mapping is necessary.

If it is missing, the register is left untouched and relies either on
the silicon default or on whatever was set earlier in the boot stages
(e.g. the bootloader).

Signed-off-by: Quentin Schulz <quentin.schulz@cherry.de>
Link: https://lore.kernel.org/r/20250627-rk8xx-rst-fun-v4-2-ce05d041b45f@cherry.de
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/mfd/rk8xx-core.c  | 12 ++++++++++++
 include/linux/mfd/rk808.h |  2 ++
 2 files changed, 14 insertions(+)

(limited to 'include')

diff --git a/drivers/mfd/rk8xx-core.c b/drivers/mfd/rk8xx-core.c
index 71c2b80a4678..def4587fdfb8 100644
--- a/drivers/mfd/rk8xx-core.c
+++ b/drivers/mfd/rk8xx-core.c
@@ -10,6 +10,7 @@
  * Author: Wadim Egorov <w.egorov@phytec.de>
  */
 
+#include <linux/bitfield.h>
 #include <linux/interrupt.h>
 #include <linux/mfd/rk808.h>
 #include <linux/mfd/core.h>
@@ -699,6 +700,7 @@ int rk8xx_probe(struct device *dev, int variant, unsigned int irq, struct regmap
 	const struct mfd_cell *cells;
 	int dual_support = 0;
 	int nr_pre_init_regs;
+	u32 rst_fun = 0;
 	int nr_cells;
 	int ret;
 	int i;
@@ -726,6 +728,16 @@ int rk8xx_probe(struct device *dev, int variant, unsigned int irq, struct regmap
 		cells = rk806s;
 		nr_cells = ARRAY_SIZE(rk806s);
 		dual_support = IRQF_SHARED;
+
+		ret = device_property_read_u32(dev, "rockchip,reset-mode", &rst_fun);
+		if (ret)
+			break;
+
+		ret = regmap_update_bits(rk808->regmap, RK806_SYS_CFG3, RK806_RST_FUN_MSK,
+					 FIELD_PREP(RK806_RST_FUN_MSK, rst_fun));
+		if (ret)
+			return dev_err_probe(dev, ret,
+					     "Failed to configure requested restart/reset behavior\n");
 		break;
 	case RK808_ID:
 		rk808->regmap_irq_chip = &rk808_irq_chip;
diff --git a/include/linux/mfd/rk808.h b/include/linux/mfd/rk808.h
index 69cbea78b430..28170ee08898 100644
--- a/include/linux/mfd/rk808.h
+++ b/include/linux/mfd/rk808.h
@@ -812,6 +812,8 @@ enum rk806_pin_dr_sel {
 #define RK806_INT_POL_H			BIT(1)
 #define RK806_INT_POL_L			0
 
+/* SYS_CFG3 */
+#define RK806_RST_FUN_MSK		GENMASK(7, 6)
 #define RK806_SLAVE_RESTART_FUN_MSK	BIT(1)
 #define RK806_SLAVE_RESTART_FUN_EN	BIT(1)
 #define RK806_SLAVE_RESTART_FUN_OFF	0
-- 
cgit v1.2.3


From 50a479527ef01f9b36dde1803a7e81741a222509 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Fri, 18 Jul 2025 14:54:32 +0100
Subject: ASoC: SDCA: Add support for -cn- value properties

Many of the DisCo properties that specify Control values have an
additional variant that specifies a separate value for each Control
Number. Add support for these.

Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://patch.msgid.link/20250718135432.1048566-3-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sdca_function.h   | 14 +++---
 sound/soc/sdca/sdca_functions.c | 99 ++++++++++++++++++++++++++---------------
 sound/soc/sdca/sdca_regmap.c    | 13 ++++--
 3 files changed, 78 insertions(+), 48 deletions(-)

(limited to 'include')

diff --git a/include/sound/sdca_function.h b/include/sound/sdca_function.h
index 90d77fc46416..06ec126cdcc3 100644
--- a/include/sound/sdca_function.h
+++ b/include/sound/sdca_function.h
@@ -742,14 +742,14 @@ struct sdca_control_range {
  * struct sdca_control - information for one SDCA Control
  * @label: Name for the Control, from SDCA Specification v1.0, section 7.1.7.
  * @sel: Identifier used for addressing.
- * @value: Holds the Control value for constants and defaults.
  * @nbits: Number of bits used in the Control.
- * @interrupt_position: SCDA interrupt line that will alert to changes on this
- * Control.
+ * @values: Holds the Control value for constants and defaults.
  * @cn_list: A bitmask showing the valid Control Numbers within this Control,
  * Control Numbers typically represent channels.
- * @range: Buffer describing valid range of values for the Control.
+ * @interrupt_position: SCDA interrupt line that will alert to changes on this
+ * Control.
  * @type: Format of the data in the Control.
+ * @range: Buffer describing valid range of values for the Control.
  * @mode: Access mode of the Control.
  * @layers: Bitmask of access layers of the Control.
  * @deferrable: Indicates if the access to the Control can be deferred.
@@ -760,13 +760,13 @@ struct sdca_control {
 	const char *label;
 	int sel;
 
-	int value;
 	int nbits;
-	int interrupt_position;
+	int *values;
 	u64 cn_list;
+	int interrupt_position;
 
-	struct sdca_control_range range;
 	enum sdca_control_datatype type;
+	struct sdca_control_range range;
 	enum sdca_access_mode mode;
 	u8 layers;
 
diff --git a/sound/soc/sdca/sdca_functions.c b/sound/soc/sdca/sdca_functions.c
index 4b6da587c4ac..0faee522b3af 100644
--- a/sound/soc/sdca/sdca_functions.c
+++ b/sound/soc/sdca/sdca_functions.c
@@ -814,6 +814,43 @@ static int find_sdca_control_range(struct device *dev,
 	return 0;
 }
 
+static int find_sdca_control_value(struct device *dev, struct sdca_entity *entity,
+				   struct fwnode_handle *control_node,
+				   struct sdca_control *control,
+				   const char * const label)
+{
+	char property[SDCA_PROPERTY_LENGTH];
+	bool global = true;
+	int ret, cn, i;
+	u32 tmp;
+
+	snprintf(property, sizeof(property), "mipi-sdca-control-%s", label);
+
+	ret = fwnode_property_read_u32(control_node, property, &tmp);
+	if (ret == -EINVAL)
+		global = false;
+	else if (ret)
+		return ret;
+
+	i = 0;
+	for_each_set_bit(cn, (unsigned long *)&control->cn_list,
+			 BITS_PER_TYPE(control->cn_list)) {
+		if (!global) {
+			snprintf(property, sizeof(property),
+				 "mipi-sdca-control-cn-%d-%s", cn, label);
+
+			ret = fwnode_property_read_u32(control_node, property, &tmp);
+			if (ret)
+				return ret;
+		}
+
+		control->values[i] = tmp;
+		i++;
+	}
+
+	return 0;
+}
+
 /*
  * TODO: Add support for -cn- properties, allowing different channels to have
  * different defaults etc.
@@ -843,44 +880,44 @@ static int find_sdca_entity_control(struct device *dev, struct sdca_entity *enti
 
 	control->layers = tmp;
 
+	ret = fwnode_property_read_u64(control_node, "mipi-sdca-control-cn-list",
+				       &control->cn_list);
+	if (ret == -EINVAL) {
+		/* Spec allows not specifying cn-list if only the first number is used */
+		control->cn_list = 0x1;
+	} else if (ret || !control->cn_list) {
+		dev_err(dev, "%s: control %#x: cn list missing: %d\n",
+			entity->label, control->sel, ret);
+		return ret;
+	}
+
+	control->values = devm_kzalloc(dev, hweight64(control->cn_list), GFP_KERNEL);
+	if (!control->values)
+		return -ENOMEM;
+
 	switch (control->mode) {
 	case SDCA_ACCESS_MODE_DC:
-		ret = fwnode_property_read_u32(control_node,
-					       "mipi-sdca-control-dc-value",
-					       &tmp);
+		ret = find_sdca_control_value(dev, entity, control_node, control,
+					      "dc-value");
 		if (ret) {
 			dev_err(dev, "%s: control %#x: dc value missing: %d\n",
 				entity->label, control->sel, ret);
 			return ret;
 		}
 
-		control->value = tmp;
 		control->has_fixed = true;
 		break;
 	case SDCA_ACCESS_MODE_RW:
 	case SDCA_ACCESS_MODE_DUAL:
-		ret = fwnode_property_read_u32(control_node,
-					       "mipi-sdca-control-default-value",
-					       &tmp);
-		if (!ret) {
-			control->value = tmp;
+		ret = find_sdca_control_value(dev, entity, control_node, control,
+					      "default-value");
+		if (!ret)
 			control->has_default = true;
-		}
-
-		ret = fwnode_property_read_u32(control_node,
-					       "mipi-sdca-control-fixed-value",
-					       &tmp);
-		if (!ret) {
-			if (control->has_default && control->value != tmp) {
-				dev_err(dev,
-					"%s: control %#x: default and fixed value don't match\n",
-					entity->label, control->sel);
-				return -EINVAL;
-			}
 
-			control->value = tmp;
+		ret = find_sdca_control_value(dev, entity, control_node, control,
+					      "fixed-value");
+		if (!ret)
 			control->has_fixed = true;
-		}
 		fallthrough;
 	case SDCA_ACCESS_MODE_RO:
 		control->deferrable = fwnode_property_read_bool(control_node,
@@ -897,17 +934,6 @@ static int find_sdca_entity_control(struct device *dev, struct sdca_entity *enti
 		return ret;
 	}
 
-	ret = fwnode_property_read_u64(control_node, "mipi-sdca-control-cn-list",
-				       &control->cn_list);
-	if (ret == -EINVAL) {
-		/* Spec allows not specifying cn-list if only the first number is used */
-		control->cn_list = 0x1;
-	} else if (ret || !control->cn_list) {
-		dev_err(dev, "%s: control %#x: cn list missing: %d\n",
-			entity->label, control->sel, ret);
-		return ret;
-	}
-
 	ret = fwnode_property_read_u32(control_node,
 				       "mipi-sdca-control-interrupt-position",
 				       &tmp);
@@ -923,11 +949,10 @@ static int find_sdca_entity_control(struct device *dev, struct sdca_entity *enti
 	control->type = find_sdca_control_datatype(entity, control);
 	control->nbits = find_sdca_control_bits(entity, control);
 
-	dev_info(dev, "%s: %s: control %#x mode %#x layers %#x cn %#llx int %d value %#x %s\n",
+	dev_info(dev, "%s: %s: control %#x mode %#x layers %#x cn %#llx int %d %s\n",
 		 entity->label, control->label, control->sel,
 		 control->mode, control->layers, control->cn_list,
-		 control->interrupt_position, control->value,
-		 control->deferrable ? "deferrable" : "");
+		 control->interrupt_position, control->deferrable ? "deferrable" : "");
 
 	return 0;
 }
diff --git a/sound/soc/sdca/sdca_regmap.c b/sound/soc/sdca/sdca_regmap.c
index c41c67c2204a..5cb3048ea8cf 100644
--- a/sound/soc/sdca/sdca_regmap.c
+++ b/sound/soc/sdca/sdca_regmap.c
@@ -253,7 +253,7 @@ int sdca_regmap_populate_constants(struct device *dev,
 				   struct sdca_function_data *function,
 				   struct reg_default *consts)
 {
-	int i, j, k;
+	int i, j, k, l;
 
 	for (i = 0, k = 0; i < function->num_entities; i++) {
 		struct sdca_entity *entity = &function->entities[i];
@@ -265,13 +265,15 @@ int sdca_regmap_populate_constants(struct device *dev,
 			if (control->mode != SDCA_ACCESS_MODE_DC)
 				continue;
 
+			l = 0;
 			for_each_set_bit(cn, (unsigned long *)&control->cn_list,
 					 BITS_PER_TYPE(control->cn_list)) {
 				consts[k].reg = SDW_SDCA_CTL(function->desc->adr,
 							     entity->id,
 							     control->sel, cn);
-				consts[k].def = control->value;
+				consts[k].def = control->values[l];
 				k++;
+				l++;
 			}
 		}
 	}
@@ -295,7 +297,7 @@ EXPORT_SYMBOL_NS(sdca_regmap_populate_constants, "SND_SOC_SDCA");
 int sdca_regmap_write_defaults(struct device *dev, struct regmap *regmap,
 			       struct sdca_function_data *function)
 {
-	int i, j;
+	int i, j, k;
 	int ret;
 
 	for (i = 0; i < function->num_entities; i++) {
@@ -311,6 +313,7 @@ int sdca_regmap_write_defaults(struct device *dev, struct regmap *regmap,
 			if (!control->has_default && !control->has_fixed)
 				continue;
 
+			k = 0;
 			for_each_set_bit(cn, (unsigned long *)&control->cn_list,
 					 BITS_PER_TYPE(control->cn_list)) {
 				unsigned int reg;
@@ -318,9 +321,11 @@ int sdca_regmap_write_defaults(struct device *dev, struct regmap *regmap,
 				reg = SDW_SDCA_CTL(function->desc->adr, entity->id,
 						   control->sel, cn);
 
-				ret = regmap_write(regmap, reg, control->value);
+				ret = regmap_write(regmap, reg, control->values[k]);
 				if (ret)
 					return ret;
+
+				k++;
 			}
 		}
 	}
-- 
cgit v1.2.3


From 678bae2eaa812662929a83b3de399645e9de93ad Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 22 Jul 2025 17:35:43 +0200
Subject: gpiolib: make legacy interfaces optional

The traditional interfaces are only used on a small number of ancient
boards. Make these optional now so they can be disabled by default.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Alexander Sverdlin <alexander.sverdlin@gmail.com>
Link: https://lore.kernel.org/r/20250722153634.3683927-1-arnd@kernel.org
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 drivers/gpio/Kconfig  |  3 +++
 drivers/gpio/Makefile |  2 +-
 include/linux/gpio.h  | 10 ++++++----
 3 files changed, 10 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig
index 6e3c3f0e3dcf..500d839f65ee 100644
--- a/drivers/gpio/Kconfig
+++ b/drivers/gpio/Kconfig
@@ -14,6 +14,9 @@ menuconfig GPIOLIB
 
 if GPIOLIB
 
+config GPIOLIB_LEGACY
+	def_bool y
+
 config GPIOLIB_FASTPATH_LIMIT
 	int "Maximum number of GPIOs for fast path"
 	range 32 512
diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile
index 3e5bc90ba59e..379f55e9ed1e 100644
--- a/drivers/gpio/Makefile
+++ b/drivers/gpio/Makefile
@@ -5,7 +5,7 @@ ccflags-$(CONFIG_DEBUG_GPIO)	+= -DDEBUG
 
 obj-$(CONFIG_GPIOLIB)		+= gpiolib.o
 obj-$(CONFIG_GPIOLIB)		+= gpiolib-devres.o
-obj-$(CONFIG_GPIOLIB)		+= gpiolib-legacy.o
+obj-$(CONFIG_GPIOLIB_LEGACY)	+= gpiolib-legacy.o
 obj-$(CONFIG_OF_GPIO)		+= gpiolib-of.o
 obj-$(CONFIG_GPIO_CDEV)		+= gpiolib-cdev.o
 obj-$(CONFIG_GPIO_SYSFS)	+= gpiolib-sysfs.o
diff --git a/include/linux/gpio.h b/include/linux/gpio.h
index ff99ed76fdc3..8f85ddb26429 100644
--- a/include/linux/gpio.h
+++ b/include/linux/gpio.h
@@ -13,6 +13,11 @@
 #define __LINUX_GPIO_H
 
 #include <linux/types.h>
+#ifdef CONFIG_GPIOLIB
+#include <linux/gpio/consumer.h>
+#endif
+
+#ifdef CONFIG_GPIOLIB_LEGACY
 
 struct device;
 
@@ -22,9 +27,6 @@ struct device;
 #define GPIOF_OUT_INIT_HIGH	((0 << 0) | (1 << 1))
 
 #ifdef CONFIG_GPIOLIB
-
-#include <linux/gpio/consumer.h>
-
 /*
  * "valid" GPIO numbers are nonnegative and may be passed to
  * setup routines like gpio_request().  Only some valid numbers
@@ -170,5 +172,5 @@ static inline int devm_gpio_request_one(struct device *dev, unsigned gpio,
 }
 
 #endif /* ! CONFIG_GPIOLIB */
-
+#endif /* CONFIG_GPIOLIB_LEGACY */
 #endif /* __LINUX_GPIO_H */
-- 
cgit v1.2.3


From fdb7f139864aa332ea8f161beb636dc0599c64f2 Mon Sep 17 00:00:00 2001
From: Michal Swiatkowski <michal.swiatkowski@linux.intel.com>
Date: Mon, 16 Jun 2025 13:29:56 -0700
Subject: ice, libie: move generic adminq descriptors to lib

The descriptor structure is the same in ice, ixgbe and i40e. Move it to
a common libie header so it can be used across different drivers.

Leave device-specific adminq commands in separate folders. This leads to
a change that needs to be made when filling/getting a descriptor:
- previous: struct specific_desc *cmd;
	    cmd = &desc.params.specific_desc;
- now: struct specific_desc *cmd;
       cmd = libie_aq_raw(&desc);

Make these changes across the driver to allow a clean build. The cast
only has to be done for device-specific descriptors; for generic ones the
union can still be used.

Changes besides moving code:
- change ICE_ prefix to LIBIE_ prefix (ice_ and libie_ too)
- remove shift variables not otherwise needed (in libie_aq_flags)
- fill/get descriptor data based on desc.params.raw whenever the
  descriptor isn't defined in libie
- move defines from the libie_aq_sth structure outside
- add libie_aq_raw helper and use it instead of explicit casting

Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Michal Swiatkowski <michal.swiatkowski@linux.intel.com>
Tested-by: Rinitha S <sx.rinitha@intel.com> (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/ice/devlink/health.c |   4 +-
 drivers/net/ethernet/intel/ice/ice.h            |   2 +-
 drivers/net/ethernet/intel/ice/ice_adminq_cmd.h | 270 +----------------
 drivers/net/ethernet/intel/ice/ice_common.c     | 384 ++++++++++++------------
 drivers/net/ethernet/intel/ice/ice_common.h     |   6 +-
 drivers/net/ethernet/intel/ice/ice_controlq.c   |  53 ++--
 drivers/net/ethernet/intel/ice/ice_controlq.h   |   8 +-
 drivers/net/ethernet/intel/ice/ice_dcb.c        |  36 +--
 drivers/net/ethernet/intel/ice/ice_dcb_lib.c    |   2 +-
 drivers/net/ethernet/intel/ice/ice_ddp.c        |  47 +--
 drivers/net/ethernet/intel/ice/ice_dpll.c       |   4 +-
 drivers/net/ethernet/intel/ice/ice_fw_update.c  |  24 +-
 drivers/net/ethernet/intel/ice/ice_fwlog.c      |  16 +-
 drivers/net/ethernet/intel/ice/ice_lag.c        |   4 +-
 drivers/net/ethernet/intel/ice/ice_lib.c        |   6 +-
 drivers/net/ethernet/intel/ice/ice_main.c       |  38 +--
 drivers/net/ethernet/intel/ice/ice_nvm.c        |  38 +--
 drivers/net/ethernet/intel/ice/ice_ptp_hw.c     |  16 +-
 drivers/net/ethernet/intel/ice/ice_sched.c      |  18 +-
 drivers/net/ethernet/intel/ice/ice_sriov.c      |   4 +-
 drivers/net/ethernet/intel/ice/ice_switch.c     |  55 ++--
 drivers/net/ethernet/intel/ice/ice_vf_mbx.c     |   6 +-
 drivers/net/ethernet/intel/ice/ice_virtchnl.c   |   2 +-
 drivers/net/ethernet/intel/ice/ice_vlan_mode.c  |   6 +-
 include/linux/net/intel/libie/adminq.h          | 273 +++++++++++++++++
 25 files changed, 673 insertions(+), 649 deletions(-)
 create mode 100644 include/linux/net/intel/libie/adminq.h

(limited to 'include')

diff --git a/drivers/net/ethernet/intel/ice/devlink/health.c b/drivers/net/ethernet/intel/ice/devlink/health.c
index 19c3d37aa768..b149b8185449 100644
--- a/drivers/net/ethernet/intel/ice/devlink/health.c
+++ b/drivers/net/ethernet/intel/ice/devlink/health.c
@@ -217,10 +217,12 @@ static void ice_config_health_events(struct ice_pf *pf, bool enable)
 void ice_process_health_status_event(struct ice_pf *pf, struct ice_rq_event_info *event)
 {
 	const struct ice_aqc_health_status_elem *health_info;
+	const struct ice_aqc_get_health_status *cmd;
 	u16 count;
 
 	health_info = (struct ice_aqc_health_status_elem *)event->msg_buf;
-	count = le16_to_cpu(event->desc.params.get_health_status.health_status_count);
+	cmd = libie_aq_raw(&event->desc);
+	count = le16_to_cpu(cmd->health_status_count);
 
 	if (count > (event->buf_len / sizeof(*health_info))) {
 		dev_err(ice_pf_to_dev(pf), "Received a health status event with invalid element count\n");
diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h
index 657e1f608f1a..2c35782c7800 100644
--- a/drivers/net/ethernet/intel/ice/ice.h
+++ b/drivers/net/ethernet/intel/ice/ice.h
@@ -960,7 +960,7 @@ int ice_plug_aux_dev(struct ice_pf *pf);
 void ice_unplug_aux_dev(struct ice_pf *pf);
 int ice_init_rdma(struct ice_pf *pf);
 void ice_deinit_rdma(struct ice_pf *pf);
-const char *ice_aq_str(enum ice_aq_err aq_err);
+const char *ice_aq_str(enum libie_aq_err aq_err);
 bool ice_is_wol_supported(struct ice_hw *hw);
 void ice_fdir_del_all_fltrs(struct ice_vsi *vsi);
 int
diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
index 39d99c2f7976..3bd3ea3af888 100644
--- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
@@ -4,6 +4,8 @@
 #ifndef _ICE_ADMINQ_CMD_H_
 #define _ICE_ADMINQ_CMD_H_
 
+#include <linux/net/intel/libie/adminq.h>
+
 /* This header file defines the Admin Queue commands, error codes and
  * descriptor format. It is shared between Firmware and Software.
  */
@@ -31,38 +33,6 @@ typedef struct __packed { u8 buf[ICE_TXQ_CTX_SZ]; } ice_txq_ctx_buf_t;
 
 typedef struct __packed { u8 buf[ICE_TXQ_CTX_FULL_SZ]; } ice_txq_ctx_buf_full_t;
 
-struct ice_aqc_generic {
-	__le32 param0;
-	__le32 param1;
-	__le32 addr_high;
-	__le32 addr_low;
-};
-
-/* Get version (direct 0x0001) */
-struct ice_aqc_get_ver {
-	__le32 rom_ver;
-	__le32 fw_build;
-	u8 fw_branch;
-	u8 fw_major;
-	u8 fw_minor;
-	u8 fw_patch;
-	u8 api_branch;
-	u8 api_major;
-	u8 api_minor;
-	u8 api_patch;
-};
-
-/* Send driver version (indirect 0x0002) */
-struct ice_aqc_driver_ver {
-	u8 major_ver;
-	u8 minor_ver;
-	u8 build_ver;
-	u8 subbuild_ver;
-	u8 reserved[4];
-	__le32 addr_high;
-	__le32 addr_low;
-};
-
 /* Queue Shutdown (direct 0x0003) */
 struct ice_aqc_q_shutdown {
 	u8 driver_unloading;
@@ -70,94 +40,6 @@ struct ice_aqc_q_shutdown {
 	u8 reserved[15];
 };
 
-/* Request resource ownership (direct 0x0008)
- * Release resource ownership (direct 0x0009)
- */
-struct ice_aqc_req_res {
-	__le16 res_id;
-#define ICE_AQC_RES_ID_NVM		1
-#define ICE_AQC_RES_ID_SDP		2
-#define ICE_AQC_RES_ID_CHNG_LOCK	3
-#define ICE_AQC_RES_ID_GLBL_LOCK	4
-	__le16 access_type;
-#define ICE_AQC_RES_ACCESS_READ		1
-#define ICE_AQC_RES_ACCESS_WRITE	2
-
-	/* Upon successful completion, FW writes this value and driver is
-	 * expected to release resource before timeout. This value is provided
-	 * in milliseconds.
-	 */
-	__le32 timeout;
-#define ICE_AQ_RES_NVM_READ_DFLT_TIMEOUT_MS	3000
-#define ICE_AQ_RES_NVM_WRITE_DFLT_TIMEOUT_MS	180000
-#define ICE_AQ_RES_CHNG_LOCK_DFLT_TIMEOUT_MS	1000
-#define ICE_AQ_RES_GLBL_LOCK_DFLT_TIMEOUT_MS	3000
-	/* For SDP: pin ID of the SDP */
-	__le32 res_number;
-	/* Status is only used for ICE_AQC_RES_ID_GLBL_LOCK */
-	__le16 status;
-#define ICE_AQ_RES_GLBL_SUCCESS		0
-#define ICE_AQ_RES_GLBL_IN_PROG		1
-#define ICE_AQ_RES_GLBL_DONE		2
-	u8 reserved[2];
-};
-
-/* Get function capabilities (indirect 0x000A)
- * Get device capabilities (indirect 0x000B)
- */
-struct ice_aqc_list_caps {
-	u8 cmd_flags;
-	u8 pf_index;
-	u8 reserved[2];
-	__le32 count;
-	__le32 addr_high;
-	__le32 addr_low;
-};
-
-/* Device/Function buffer entry, repeated per reported capability */
-struct ice_aqc_list_caps_elem {
-	__le16 cap;
-#define ICE_AQC_CAPS_VALID_FUNCTIONS			0x0005
-#define ICE_AQC_CAPS_SRIOV				0x0012
-#define ICE_AQC_CAPS_VF					0x0013
-#define ICE_AQC_CAPS_VSI				0x0017
-#define ICE_AQC_CAPS_DCB				0x0018
-#define ICE_AQC_CAPS_RSS				0x0040
-#define ICE_AQC_CAPS_RXQS				0x0041
-#define ICE_AQC_CAPS_TXQS				0x0042
-#define ICE_AQC_CAPS_MSIX				0x0043
-#define ICE_AQC_CAPS_FD					0x0045
-#define ICE_AQC_CAPS_1588				0x0046
-#define ICE_AQC_CAPS_MAX_MTU				0x0047
-#define ICE_AQC_CAPS_NVM_VER				0x0048
-#define ICE_AQC_CAPS_PENDING_NVM_VER			0x0049
-#define ICE_AQC_CAPS_OROM_VER				0x004A
-#define ICE_AQC_CAPS_PENDING_OROM_VER			0x004B
-#define ICE_AQC_CAPS_NET_VER				0x004C
-#define ICE_AQC_CAPS_PENDING_NET_VER			0x004D
-#define ICE_AQC_CAPS_RDMA				0x0051
-#define ICE_AQC_CAPS_SENSOR_READING			0x0067
-#define ICE_AQC_CAPS_PCIE_RESET_AVOIDANCE		0x0076
-#define ICE_AQC_CAPS_POST_UPDATE_RESET_RESTRICT		0x0077
-#define ICE_AQC_CAPS_NVM_MGMT				0x0080
-#define ICE_AQC_CAPS_TX_SCHED_TOPO_COMP_MODE		0x0085
-#define ICE_AQC_CAPS_NAC_TOPOLOGY			0x0087
-#define ICE_AQC_CAPS_FW_LAG_SUPPORT			0x0092
-#define ICE_AQC_BIT_ROCEV2_LAG				0x01
-#define ICE_AQC_BIT_SRIOV_LAG				0x02
-
-	u8 major_ver;
-	u8 minor_ver;
-	/* Number of resources described by this capability */
-	__le32 number;
-	/* Only meaningful for some types of resources */
-	__le32 logical_id;
-	/* Only meaningful for some types of resources */
-	__le32 phys_id;
-	__le64 rsvd1;
-	__le64 rsvd2;
-};
-
 /* Manage MAC address, read command - indirect (0x0107)
  * This struct is also used for the response
  */
@@ -2672,154 +2554,6 @@ struct ice_aqc_fw_log_cfg_resp {
 	u8 rsvd0;
 };
 
-/**
- * struct ice_aq_desc - Admin Queue (AQ) descriptor
- * @flags: ICE_AQ_FLAG_* flags
- * @opcode: AQ command opcode
- * @datalen: length in bytes of indirect/external data buffer
- * @retval: return value from firmware
- * @cookie_high: opaque data high-half
- * @cookie_low: opaque data low-half
- * @params: command-specific parameters
- *
- * Descriptor format for commands the driver posts on the Admin Transmit Queue
- * (ATQ). The firmware writes back onto the command descriptor and returns
- * the result of the command. Asynchronous events that are not an immediate
- * result of the command are written to the Admin Receive Queue (ARQ) using
- * the same descriptor format. Descriptors are in little-endian notation with
- * 32-bit words.
- */
-struct ice_aq_desc {
-	__le16 flags;
-	__le16 opcode;
-	__le16 datalen;
-	__le16 retval;
-	__le32 cookie_high;
-	__le32 cookie_low;
-	union {
-		u8 raw[16];
-		struct ice_aqc_generic generic;
-		struct ice_aqc_get_ver get_ver;
-		struct ice_aqc_driver_ver driver_ver;
-		struct ice_aqc_q_shutdown q_shutdown;
-		struct ice_aqc_req_res res_owner;
-		struct ice_aqc_manage_mac_read mac_read;
-		struct ice_aqc_manage_mac_write mac_write;
-		struct ice_aqc_clear_pxe clear_pxe;
-		struct ice_aqc_list_caps get_cap;
-		struct ice_aqc_get_phy_caps get_phy;
-		struct ice_aqc_set_phy_cfg set_phy;
-		struct ice_aqc_restart_an restart_an;
-		struct ice_aqc_set_phy_rec_clk_out set_phy_rec_clk_out;
-		struct ice_aqc_get_phy_rec_clk_out get_phy_rec_clk_out;
-		struct ice_aqc_get_sensor_reading get_sensor_reading;
-		struct ice_aqc_get_sensor_reading_resp get_sensor_reading_resp;
-		struct ice_aqc_gpio read_write_gpio;
-		struct ice_aqc_sff_eeprom read_write_sff_param;
-		struct ice_aqc_set_port_id_led set_port_id_led;
-		struct ice_aqc_get_port_options get_port_options;
-		struct ice_aqc_set_port_option set_port_option;
-		struct ice_aqc_get_sw_cfg get_sw_conf;
-		struct ice_aqc_set_port_params set_port_params;
-		struct ice_aqc_sw_rules sw_rules;
-		struct ice_aqc_add_get_recipe add_get_recipe;
-		struct ice_aqc_recipe_to_profile recipe_to_profile;
-		struct ice_aqc_get_topo get_topo;
-		struct ice_aqc_sched_elem_cmd sched_elem_cmd;
-		struct ice_aqc_query_txsched_res query_sched_res;
-		struct ice_aqc_query_port_ets port_ets;
-		struct ice_aqc_rl_profile rl_profile;
-		struct ice_aqc_nvm nvm;
-		struct ice_aqc_nvm_checksum nvm_checksum;
-		struct ice_aqc_nvm_pkg_data pkg_data;
-		struct ice_aqc_nvm_pass_comp_tbl pass_comp_tbl;
-		struct ice_aqc_pf_vf_msg virt;
-		struct ice_aqc_set_query_pfc_mode set_query_pfc_mode;
-		struct ice_aqc_lldp_get_mib lldp_get_mib;
-		struct ice_aqc_lldp_set_mib_change lldp_set_event;
-		struct ice_aqc_lldp_stop lldp_stop;
-		struct ice_aqc_lldp_start lldp_start;
-		struct ice_aqc_lldp_set_local_mib lldp_set_mib;
-		struct ice_aqc_lldp_stop_start_specific_agent lldp_agent_ctrl;
-		struct ice_aqc_lldp_filter_ctrl lldp_filter_ctrl;
-		struct ice_aqc_get_set_rss_lut get_set_rss_lut;
-		struct ice_aqc_get_set_rss_key get_set_rss_key;
-		struct ice_aqc_neigh_dev_req neigh_dev;
-		struct ice_aqc_add_txqs add_txqs;
-		struct ice_aqc_dis_txqs dis_txqs;
-		struct ice_aqc_cfg_txqs cfg_txqs;
-		struct ice_aqc_add_rdma_qset add_rdma_qset;
-		struct ice_aqc_add_get_update_free_vsi vsi_cmd;
-		struct ice_aqc_add_update_free_vsi_resp add_update_free_vsi_res;
-		struct ice_aqc_download_pkg download_pkg;
-		struct ice_aqc_get_cgu_input_measure get_cgu_input_measure;
-		struct ice_aqc_set_cgu_input_config set_cgu_input_config;
-		struct ice_aqc_get_cgu_input_config get_cgu_input_config;
-		struct ice_aqc_set_cgu_output_config set_cgu_output_config;
-		struct ice_aqc_get_cgu_output_config get_cgu_output_config;
-		struct ice_aqc_get_cgu_dpll_status get_cgu_dpll_status;
-		struct ice_aqc_set_cgu_dpll_config set_cgu_dpll_config;
-		struct ice_aqc_set_cgu_ref_prio set_cgu_ref_prio;
-		struct ice_aqc_get_cgu_ref_prio get_cgu_ref_prio;
-		struct ice_aqc_get_cgu_info get_cgu_info;
-		struct ice_aqc_driver_shared_params drv_shared_params;
-		struct ice_aqc_fw_log fw_log;
-		struct ice_aqc_set_mac_lb set_mac_lb;
-		struct ice_aqc_alloc_free_res_cmd sw_res_ctrl;
-		struct ice_aqc_set_mac_cfg set_mac_cfg;
-		struct ice_aqc_set_event_mask set_event_mask;
-		struct ice_aqc_get_link_status get_link_status;
-		struct ice_aqc_event_lan_overflow lan_overflow;
-		struct ice_aqc_get_link_topo get_link_topo;
-		struct ice_aqc_set_health_status_cfg set_health_status_cfg;
-		struct ice_aqc_get_health_status get_health_status;
-		struct ice_aqc_dnl_call_command dnl_call;
-		struct ice_aqc_i2c read_write_i2c;
-		struct ice_aqc_read_i2c_resp read_i2c_resp;
-		struct ice_aqc_get_set_tx_topo get_set_tx_topo;
-	} params;
-};
-
-/* FW defined boundary for a large buffer, 4k >= Large buffer > 512 bytes */
-#define ICE_AQ_LG_BUF	512
-
-#define ICE_AQ_FLAG_DD_S	0
-#define ICE_AQ_FLAG_CMP_S	1
-#define ICE_AQ_FLAG_ERR_S	2
-#define ICE_AQ_FLAG_LB_S	9
-#define ICE_AQ_FLAG_RD_S	10
-#define ICE_AQ_FLAG_BUF_S	12
-#define ICE_AQ_FLAG_SI_S	13
-
-#define ICE_AQ_FLAG_DD		BIT(ICE_AQ_FLAG_DD_S)  /* 0x1    */
-#define ICE_AQ_FLAG_CMP		BIT(ICE_AQ_FLAG_CMP_S) /* 0x2    */
-#define ICE_AQ_FLAG_ERR		BIT(ICE_AQ_FLAG_ERR_S) /* 0x4    */
-#define ICE_AQ_FLAG_LB		BIT(ICE_AQ_FLAG_LB_S)  /* 0x200  */
-#define ICE_AQ_FLAG_RD		BIT(ICE_AQ_FLAG_RD_S)  /* 0x400  */
-#define ICE_AQ_FLAG_BUF		BIT(ICE_AQ_FLAG_BUF_S) /* 0x1000 */
-#define ICE_AQ_FLAG_SI		BIT(ICE_AQ_FLAG_SI_S)  /* 0x2000 */
-
-/* error codes */
-enum ice_aq_err {
-	ICE_AQ_RC_OK		= 0,  /* Success */
-	ICE_AQ_RC_EPERM		= 1,  /* Operation not permitted */
-	ICE_AQ_RC_ENOENT	= 2,  /* No such element */
-	ICE_AQ_RC_ESRCH		= 3,  /* Bad opcode */
-	ICE_AQ_RC_EAGAIN	= 8,  /* Try again */
-	ICE_AQ_RC_ENOMEM	= 9,  /* Out of memory */
-	ICE_AQ_RC_EBUSY		= 12, /* Device or resource busy */
-	ICE_AQ_RC_EEXIST	= 13, /* Object already exists */
-	ICE_AQ_RC_EINVAL	= 14, /* Invalid argument */
-	ICE_AQ_RC_ENOSPC	= 16, /* No space left or allocation failure */
-	ICE_AQ_RC_ENOSYS	= 17, /* Function not implemented */
-	ICE_AQ_RC_EMODE		= 21, /* Op not allowed in current dev mode */
-	ICE_AQ_RC_ENOSEC	= 24, /* Missing security manifest */
-	ICE_AQ_RC_EBADSIG	= 25, /* Bad RSA signature */
-	ICE_AQ_RC_ESVN		= 26, /* SVN number prohibits this package */
-	ICE_AQ_RC_EBADMAN	= 27, /* Manifest hash mismatch */
-	ICE_AQ_RC_EBADBUF	= 28, /* Buffer hash mismatches manifest */
-};
-
 /* Admin Queue command opcodes */
 enum ice_adminq_opc {
 	/* AQ commands */
diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c
index b542e1e0f0c9..003d60a4db21 100644
--- a/drivers/net/ethernet/intel/ice/ice_common.c
+++ b/drivers/net/ethernet/intel/ice/ice_common.c
@@ -248,7 +248,7 @@ static bool ice_is_pf_c827(struct ice_hw *hw)
  */
 int ice_clear_pf_cfg(struct ice_hw *hw)
 {
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_clear_pf_cfg);
 
@@ -276,12 +276,12 @@ ice_aq_manage_mac_read(struct ice_hw *hw, void *buf, u16 buf_size,
 {
 	struct ice_aqc_manage_mac_read_resp *resp;
 	struct ice_aqc_manage_mac_read *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	int status;
 	u16 flags;
 	u8 i;
 
-	cmd = &desc.params.mac_read;
+	cmd = libie_aq_raw(&desc);
 
 	if (buf_size < sizeof(*resp))
 		return -EINVAL;
@@ -330,12 +330,12 @@ ice_aq_get_phy_caps(struct ice_port_info *pi, bool qual_mods, u8 report_mode,
 {
 	struct ice_aqc_get_phy_caps *cmd;
 	u16 pcaps_size = sizeof(*pcaps);
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	const char *prefix;
 	struct ice_hw *hw;
 	int status;
 
-	cmd = &desc.params.get_phy;
+	cmd = libie_aq_raw(&desc);
 
 	if (!pcaps || (report_mode & ~ICE_AQC_REPORT_MODE_M) || !pi)
 		return -EINVAL;
@@ -424,9 +424,9 @@ ice_aq_get_link_topo_handle(struct ice_port_info *pi, u8 node_type,
 			    struct ice_sq_cd *cd)
 {
 	struct ice_aqc_get_link_topo *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 
-	cmd = &desc.params.get_link_topo;
+	cmd = libie_aq_raw(&desc);
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_link_topo);
 
@@ -454,19 +454,20 @@ int
 ice_aq_get_netlist_node(struct ice_hw *hw, struct ice_aqc_get_link_topo *cmd,
 			u8 *node_part_number, u16 *node_handle)
 {
-	struct ice_aq_desc desc;
+	struct ice_aqc_get_link_topo *resp;
+	struct libie_aq_desc desc;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_link_topo);
-	desc.params.get_link_topo = *cmd;
+	resp = libie_aq_raw(&desc);
+	*resp = *cmd;
 
 	if (ice_aq_send_cmd(hw, &desc, NULL, 0, NULL))
 		return -EINTR;
 
 	if (node_handle)
-		*node_handle =
-			le16_to_cpu(desc.params.get_link_topo.addr.handle);
+		*node_handle = le16_to_cpu(resp->addr.handle);
 	if (node_part_number)
-		*node_part_number = desc.params.get_link_topo.node_part_num;
+		*node_part_number = resp->node_part_num;
 
 	return 0;
 }
@@ -689,8 +690,8 @@ ice_aq_get_link_info(struct ice_port_info *pi, bool ena_lse,
 	struct ice_link_status *li_old, *li;
 	enum ice_media_type *hw_media_type;
 	struct ice_fc_info *hw_fc_info;
+	struct libie_aq_desc desc;
 	bool tx_pause, rx_pause;
-	struct ice_aq_desc desc;
 	struct ice_hw *hw;
 	u16 cmd_flags;
 	int status;
@@ -705,7 +706,7 @@ ice_aq_get_link_info(struct ice_port_info *pi, bool ena_lse,
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_link_status);
 	cmd_flags = (ena_lse) ? ICE_AQ_LSE_ENA : ICE_AQ_LSE_DIS;
-	resp = &desc.params.get_link_status;
+	resp = libie_aq_raw(&desc);
 	resp->cmd_flags = cpu_to_le16(cmd_flags);
 	resp->lport_num = pi->lport;
 
@@ -834,9 +835,9 @@ int
 ice_aq_set_mac_cfg(struct ice_hw *hw, u16 max_frame_size, struct ice_sq_cd *cd)
 {
 	struct ice_aqc_set_mac_cfg *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 
-	cmd = &desc.params.set_mac_cfg;
+	cmd = libie_aq_raw(&desc);
 
 	if (max_frame_size == 0)
 		return -EINVAL;
@@ -1707,7 +1708,7 @@ ice_sbq_send_cmd(struct ice_hw *hw, struct ice_sbq_cmd_desc *desc,
 		 void *buf, u16 buf_size, struct ice_sq_cd *cd)
 {
 	return ice_sq_send_cmd(hw, ice_get_sbq(hw),
-			       (struct ice_aq_desc *)desc, buf, buf_size, cd);
+			       (struct libie_aq_desc *)desc, buf, buf_size, cd);
 }
 
 /**
@@ -1792,10 +1793,10 @@ static bool ice_should_retry_sq_send_cmd(u16 opcode)
  */
 static int
 ice_sq_send_cmd_retry(struct ice_hw *hw, struct ice_ctl_q_info *cq,
-		      struct ice_aq_desc *desc, void *buf, u16 buf_size,
+		      struct libie_aq_desc *desc, void *buf, u16 buf_size,
 		      struct ice_sq_cd *cd)
 {
-	struct ice_aq_desc desc_cpy;
+	struct libie_aq_desc desc_cpy;
 	bool is_cmd_for_retry;
 	u8 idx = 0;
 	u16 opcode;
@@ -1816,7 +1817,7 @@ ice_sq_send_cmd_retry(struct ice_hw *hw, struct ice_ctl_q_info *cq,
 		status = ice_sq_send_cmd(hw, cq, desc, buf, buf_size, cd);
 
 		if (!is_cmd_for_retry || !status ||
-		    hw->adminq.sq_last_status != ICE_AQ_RC_EBUSY)
+		    hw->adminq.sq_last_status != LIBIE_AQ_RC_EBUSY)
 			break;
 
 		memcpy(desc, &desc_cpy, sizeof(desc_cpy));
@@ -1839,10 +1840,10 @@ ice_sq_send_cmd_retry(struct ice_hw *hw, struct ice_ctl_q_info *cq,
  * Helper function to send FW Admin Queue commands to the FW Admin Queue.
  */
 int
-ice_aq_send_cmd(struct ice_hw *hw, struct ice_aq_desc *desc, void *buf,
+ice_aq_send_cmd(struct ice_hw *hw, struct libie_aq_desc *desc, void *buf,
 		u16 buf_size, struct ice_sq_cd *cd)
 {
-	struct ice_aqc_req_res *cmd = &desc->params.res_owner;
+	struct libie_aqc_req_res *cmd = libie_aq_raw(desc);
 	bool lock_acquired = false;
 	int status;
 
@@ -1873,7 +1874,7 @@ ice_aq_send_cmd(struct ice_hw *hw, struct ice_aq_desc *desc, void *buf,
 	case ice_aqc_opc_get_recipe_to_profile:
 		break;
 	case ice_aqc_opc_release_res:
-		if (le16_to_cpu(cmd->res_id) == ICE_AQC_RES_ID_GLBL_LOCK)
+		if (le16_to_cpu(cmd->res_id) == LIBIE_AQC_RES_ID_GLBL_LOCK)
 			break;
 		fallthrough;
 	default:
@@ -1898,8 +1899,8 @@ ice_aq_send_cmd(struct ice_hw *hw, struct ice_aq_desc *desc, void *buf,
  */
 int ice_aq_get_fw_ver(struct ice_hw *hw, struct ice_sq_cd *cd)
 {
-	struct ice_aqc_get_ver *resp;
-	struct ice_aq_desc desc;
+	struct libie_aqc_get_ver *resp;
+	struct libie_aq_desc desc;
 	int status;
 
 	resp = &desc.params.get_ver;
@@ -1935,8 +1936,8 @@ int
 ice_aq_send_driver_ver(struct ice_hw *hw, struct ice_driver_ver *dv,
 		       struct ice_sq_cd *cd)
 {
-	struct ice_aqc_driver_ver *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aqc_driver_ver *cmd;
+	struct libie_aq_desc desc;
 	u16 len;
 
 	cmd = &desc.params.driver_ver;
@@ -1946,7 +1947,7 @@ ice_aq_send_driver_ver(struct ice_hw *hw, struct ice_driver_ver *dv,
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_driver_ver);
 
-	desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
+	desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_RD);
 	cmd->major_ver = dv->major_ver;
 	cmd->minor_ver = dv->minor_ver;
 	cmd->build_ver = dv->build_ver;
@@ -1971,9 +1972,9 @@ ice_aq_send_driver_ver(struct ice_hw *hw, struct ice_driver_ver *dv,
 int ice_aq_q_shutdown(struct ice_hw *hw, bool unloading)
 {
 	struct ice_aqc_q_shutdown *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 
-	cmd = &desc.params.q_shutdown;
+	cmd = libie_aq_raw(&desc);
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_q_shutdown);
 
@@ -2014,8 +2015,8 @@ ice_aq_req_res(struct ice_hw *hw, enum ice_aq_res_ids res,
 	       enum ice_aq_res_access_type access, u8 sdp_number, u32 *timeout,
 	       struct ice_sq_cd *cd)
 {
-	struct ice_aqc_req_res *cmd_resp;
-	struct ice_aq_desc desc;
+	struct libie_aqc_req_res *cmd_resp;
+	struct libie_aq_desc desc;
 	int status;
 
 	cmd_resp = &desc.params.res_owner;
@@ -2037,20 +2038,20 @@ ice_aq_req_res(struct ice_hw *hw, enum ice_aq_res_ids res,
 	/* Global config lock response utilizes an additional status field.
 	 *
 	 * If the Global config lock resource is held by some other driver, the
-	 * command completes with ICE_AQ_RES_GLBL_IN_PROG in the status field
+	 * command completes with LIBIE_AQ_RES_GLBL_IN_PROG in the status field
 	 * and the timeout field indicates the maximum time the current owner
 	 * of the resource has to free it.
 	 */
 	if (res == ICE_GLOBAL_CFG_LOCK_RES_ID) {
-		if (le16_to_cpu(cmd_resp->status) == ICE_AQ_RES_GLBL_SUCCESS) {
+		if (le16_to_cpu(cmd_resp->status) == LIBIE_AQ_RES_GLBL_SUCCESS) {
 			*timeout = le32_to_cpu(cmd_resp->timeout);
 			return 0;
 		} else if (le16_to_cpu(cmd_resp->status) ==
-			   ICE_AQ_RES_GLBL_IN_PROG) {
+			   LIBIE_AQ_RES_GLBL_IN_PROG) {
 			*timeout = le32_to_cpu(cmd_resp->timeout);
 			return -EIO;
 		} else if (le16_to_cpu(cmd_resp->status) ==
-			   ICE_AQ_RES_GLBL_DONE) {
+			   LIBIE_AQ_RES_GLBL_DONE) {
 			return -EALREADY;
 		}
 
@@ -2063,7 +2064,7 @@ ice_aq_req_res(struct ice_hw *hw, enum ice_aq_res_ids res,
 	 * with a busy return value and the timeout field indicates the maximum
 	 * time the current owner of the resource has to free it.
 	 */
-	if (!status || hw->adminq.sq_last_status == ICE_AQ_RC_EBUSY)
+	if (!status || hw->adminq.sq_last_status == LIBIE_AQ_RC_EBUSY)
 		*timeout = le32_to_cpu(cmd_resp->timeout);
 
 	return status;
@@ -2082,8 +2083,8 @@ static int
 ice_aq_release_res(struct ice_hw *hw, enum ice_aq_res_ids res, u8 sdp_number,
 		   struct ice_sq_cd *cd)
 {
-	struct ice_aqc_req_res *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aqc_req_res *cmd;
+	struct libie_aq_desc desc;
 
 	cmd = &desc.params.res_owner;
 
@@ -2192,16 +2193,16 @@ int ice_aq_alloc_free_res(struct ice_hw *hw,
 			  enum ice_adminq_opc opc)
 {
 	struct ice_aqc_alloc_free_res_cmd *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 
-	cmd = &desc.params.sw_res_ctrl;
+	cmd = libie_aq_raw(&desc);
 
 	if (!buf || buf_size < flex_array_size(buf, elem, 1))
 		return -EINVAL;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, opc);
 
-	desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
+	desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_RD);
 
 	cmd->num_entries = cpu_to_le16(1);
 
@@ -2315,7 +2316,7 @@ static u32 ice_get_num_per_func(struct ice_hw *hw, u32 max)
  */
 static bool
 ice_parse_common_caps(struct ice_hw *hw, struct ice_hw_common_caps *caps,
-		      struct ice_aqc_list_caps_elem *elem, const char *prefix)
+		      struct libie_aqc_list_caps_elem *elem, const char *prefix)
 {
 	u32 logical_id = le32_to_cpu(elem->logical_id);
 	u32 phys_id = le32_to_cpu(elem->phys_id);
@@ -2324,17 +2325,17 @@ ice_parse_common_caps(struct ice_hw *hw, struct ice_hw_common_caps *caps,
 	bool found = true;
 
 	switch (cap) {
-	case ICE_AQC_CAPS_VALID_FUNCTIONS:
+	case LIBIE_AQC_CAPS_VALID_FUNCTIONS:
 		caps->valid_functions = number;
 		ice_debug(hw, ICE_DBG_INIT, "%s: valid_functions (bitmap) = %d\n", prefix,
 			  caps->valid_functions);
 		break;
-	case ICE_AQC_CAPS_SRIOV:
+	case LIBIE_AQC_CAPS_SRIOV:
 		caps->sr_iov_1_1 = (number == 1);
 		ice_debug(hw, ICE_DBG_INIT, "%s: sr_iov_1_1 = %d\n", prefix,
 			  caps->sr_iov_1_1);
 		break;
-	case ICE_AQC_CAPS_DCB:
+	case LIBIE_AQC_CAPS_DCB:
 		caps->dcb = (number == 1);
 		caps->active_tc_bitmap = logical_id;
 		caps->maxtc = phys_id;
@@ -2343,7 +2344,7 @@ ice_parse_common_caps(struct ice_hw *hw, struct ice_hw_common_caps *caps,
 			  caps->active_tc_bitmap);
 		ice_debug(hw, ICE_DBG_INIT, "%s: maxtc = %d\n", prefix, caps->maxtc);
 		break;
-	case ICE_AQC_CAPS_RSS:
+	case LIBIE_AQC_CAPS_RSS:
 		caps->rss_table_size = number;
 		caps->rss_table_entry_width = logical_id;
 		ice_debug(hw, ICE_DBG_INIT, "%s: rss_table_size = %d\n", prefix,
@@ -2351,7 +2352,7 @@ ice_parse_common_caps(struct ice_hw *hw, struct ice_hw_common_caps *caps,
 		ice_debug(hw, ICE_DBG_INIT, "%s: rss_table_entry_width = %d\n", prefix,
 			  caps->rss_table_entry_width);
 		break;
-	case ICE_AQC_CAPS_RXQS:
+	case LIBIE_AQC_CAPS_RXQS:
 		caps->num_rxq = number;
 		caps->rxq_first_id = phys_id;
 		ice_debug(hw, ICE_DBG_INIT, "%s: num_rxq = %d\n", prefix,
@@ -2359,7 +2360,7 @@ ice_parse_common_caps(struct ice_hw *hw, struct ice_hw_common_caps *caps,
 		ice_debug(hw, ICE_DBG_INIT, "%s: rxq_first_id = %d\n", prefix,
 			  caps->rxq_first_id);
 		break;
-	case ICE_AQC_CAPS_TXQS:
+	case LIBIE_AQC_CAPS_TXQS:
 		caps->num_txq = number;
 		caps->txq_first_id = phys_id;
 		ice_debug(hw, ICE_DBG_INIT, "%s: num_txq = %d\n", prefix,
@@ -2367,7 +2368,7 @@ ice_parse_common_caps(struct ice_hw *hw, struct ice_hw_common_caps *caps,
 		ice_debug(hw, ICE_DBG_INIT, "%s: txq_first_id = %d\n", prefix,
 			  caps->txq_first_id);
 		break;
-	case ICE_AQC_CAPS_MSIX:
+	case LIBIE_AQC_CAPS_MSIX:
 		caps->num_msix_vectors = number;
 		caps->msix_vector_first_id = phys_id;
 		ice_debug(hw, ICE_DBG_INIT, "%s: num_msix_vectors = %d\n", prefix,
@@ -2375,56 +2376,56 @@ ice_parse_common_caps(struct ice_hw *hw, struct ice_hw_common_caps *caps,
 		ice_debug(hw, ICE_DBG_INIT, "%s: msix_vector_first_id = %d\n", prefix,
 			  caps->msix_vector_first_id);
 		break;
-	case ICE_AQC_CAPS_PENDING_NVM_VER:
+	case LIBIE_AQC_CAPS_PENDING_NVM_VER:
 		caps->nvm_update_pending_nvm = true;
 		ice_debug(hw, ICE_DBG_INIT, "%s: update_pending_nvm\n", prefix);
 		break;
-	case ICE_AQC_CAPS_PENDING_OROM_VER:
+	case LIBIE_AQC_CAPS_PENDING_OROM_VER:
 		caps->nvm_update_pending_orom = true;
 		ice_debug(hw, ICE_DBG_INIT, "%s: update_pending_orom\n", prefix);
 		break;
-	case ICE_AQC_CAPS_PENDING_NET_VER:
+	case LIBIE_AQC_CAPS_PENDING_NET_VER:
 		caps->nvm_update_pending_netlist = true;
 		ice_debug(hw, ICE_DBG_INIT, "%s: update_pending_netlist\n", prefix);
 		break;
-	case ICE_AQC_CAPS_NVM_MGMT:
+	case LIBIE_AQC_CAPS_NVM_MGMT:
 		caps->nvm_unified_update =
 			(number & ICE_NVM_MGMT_UNIFIED_UPD_SUPPORT) ?
 			true : false;
 		ice_debug(hw, ICE_DBG_INIT, "%s: nvm_unified_update = %d\n", prefix,
 			  caps->nvm_unified_update);
 		break;
-	case ICE_AQC_CAPS_RDMA:
+	case LIBIE_AQC_CAPS_RDMA:
 		if (IS_ENABLED(CONFIG_INFINIBAND_IRDMA))
 			caps->rdma = (number == 1);
 		ice_debug(hw, ICE_DBG_INIT, "%s: rdma = %d\n", prefix, caps->rdma);
 		break;
-	case ICE_AQC_CAPS_MAX_MTU:
+	case LIBIE_AQC_CAPS_MAX_MTU:
 		caps->max_mtu = number;
 		ice_debug(hw, ICE_DBG_INIT, "%s: max_mtu = %d\n",
 			  prefix, caps->max_mtu);
 		break;
-	case ICE_AQC_CAPS_PCIE_RESET_AVOIDANCE:
+	case LIBIE_AQC_CAPS_PCIE_RESET_AVOIDANCE:
 		caps->pcie_reset_avoidance = (number > 0);
 		ice_debug(hw, ICE_DBG_INIT,
 			  "%s: pcie_reset_avoidance = %d\n", prefix,
 			  caps->pcie_reset_avoidance);
 		break;
-	case ICE_AQC_CAPS_POST_UPDATE_RESET_RESTRICT:
+	case LIBIE_AQC_CAPS_POST_UPDATE_RESET_RESTRICT:
 		caps->reset_restrict_support = (number == 1);
 		ice_debug(hw, ICE_DBG_INIT,
 			  "%s: reset_restrict_support = %d\n", prefix,
 			  caps->reset_restrict_support);
 		break;
-	case ICE_AQC_CAPS_FW_LAG_SUPPORT:
-		caps->roce_lag = !!(number & ICE_AQC_BIT_ROCEV2_LAG);
+	case LIBIE_AQC_CAPS_FW_LAG_SUPPORT:
+		caps->roce_lag = !!(number & LIBIE_AQC_BIT_ROCEV2_LAG);
 		ice_debug(hw, ICE_DBG_INIT, "%s: roce_lag = %u\n",
 			  prefix, caps->roce_lag);
-		caps->sriov_lag = !!(number & ICE_AQC_BIT_SRIOV_LAG);
+		caps->sriov_lag = !!(number & LIBIE_AQC_BIT_SRIOV_LAG);
 		ice_debug(hw, ICE_DBG_INIT, "%s: sriov_lag = %u\n",
 			  prefix, caps->sriov_lag);
 		break;
-	case ICE_AQC_CAPS_TX_SCHED_TOPO_COMP_MODE:
+	case LIBIE_AQC_CAPS_TX_SCHED_TOPO_COMP_MODE:
 		caps->tx_sched_topo_comp_mode_en = (number == 1);
 		break;
 	default:
@@ -2478,7 +2479,7 @@ ice_recalc_port_limited_caps(struct ice_hw *hw, struct ice_hw_common_caps *caps)
  */
 static void
 ice_parse_vf_func_caps(struct ice_hw *hw, struct ice_hw_func_caps *func_p,
-		       struct ice_aqc_list_caps_elem *cap)
+		       struct libie_aqc_list_caps_elem *cap)
 {
 	u32 logical_id = le32_to_cpu(cap->logical_id);
 	u32 number = le32_to_cpu(cap->number);
@@ -2501,7 +2502,7 @@ ice_parse_vf_func_caps(struct ice_hw *hw, struct ice_hw_func_caps *func_p,
  */
 static void
 ice_parse_vsi_func_caps(struct ice_hw *hw, struct ice_hw_func_caps *func_p,
-			struct ice_aqc_list_caps_elem *cap)
+			struct libie_aqc_list_caps_elem *cap)
 {
 	func_p->guar_num_vsi = ice_get_num_per_func(hw, ICE_MAX_VSI);
 	ice_debug(hw, ICE_DBG_INIT, "func caps: guar_num_vsi (fw) = %d\n",
@@ -2520,7 +2521,7 @@ ice_parse_vsi_func_caps(struct ice_hw *hw, struct ice_hw_func_caps *func_p,
  */
 static void
 ice_parse_1588_func_caps(struct ice_hw *hw, struct ice_hw_func_caps *func_p,
-			 struct ice_aqc_list_caps_elem *cap)
+			 struct libie_aqc_list_caps_elem *cap)
 {
 	struct ice_ts_func_info *info = &func_p->ts_func_info;
 	u32 number = le32_to_cpu(cap->number);
@@ -2619,7 +2620,7 @@ static void
 ice_parse_func_caps(struct ice_hw *hw, struct ice_hw_func_caps *func_p,
 		    void *buf, u32 cap_count)
 {
-	struct ice_aqc_list_caps_elem *cap_resp;
+	struct libie_aqc_list_caps_elem *cap_resp;
 	u32 i;
 
 	cap_resp = buf;
@@ -2634,16 +2635,16 @@ ice_parse_func_caps(struct ice_hw *hw, struct ice_hw_func_caps *func_p,
 					      &cap_resp[i], "func caps");
 
 		switch (cap) {
-		case ICE_AQC_CAPS_VF:
+		case LIBIE_AQC_CAPS_VF:
 			ice_parse_vf_func_caps(hw, func_p, &cap_resp[i]);
 			break;
-		case ICE_AQC_CAPS_VSI:
+		case LIBIE_AQC_CAPS_VSI:
 			ice_parse_vsi_func_caps(hw, func_p, &cap_resp[i]);
 			break;
-		case ICE_AQC_CAPS_1588:
+		case LIBIE_AQC_CAPS_1588:
 			ice_parse_1588_func_caps(hw, func_p, &cap_resp[i]);
 			break;
-		case ICE_AQC_CAPS_FD:
+		case LIBIE_AQC_CAPS_FD:
 			ice_parse_fdir_func_caps(hw, func_p);
 			break;
 		default:
@@ -2687,7 +2688,7 @@ static int ice_func_id_to_logical_id(u32 active_function_bitmap, u8 pf_id)
  */
 static void
 ice_parse_valid_functions_cap(struct ice_hw *hw, struct ice_hw_dev_caps *dev_p,
-			      struct ice_aqc_list_caps_elem *cap)
+			      struct libie_aqc_list_caps_elem *cap)
 {
 	u32 number = le32_to_cpu(cap->number);
 
@@ -2708,7 +2709,7 @@ ice_parse_valid_functions_cap(struct ice_hw *hw, struct ice_hw_dev_caps *dev_p,
  */
 static void
 ice_parse_vf_dev_caps(struct ice_hw *hw, struct ice_hw_dev_caps *dev_p,
-		      struct ice_aqc_list_caps_elem *cap)
+		      struct libie_aqc_list_caps_elem *cap)
 {
 	u32 number = le32_to_cpu(cap->number);
 
@@ -2727,7 +2728,7 @@ ice_parse_vf_dev_caps(struct ice_hw *hw, struct ice_hw_dev_caps *dev_p,
  */
 static void
 ice_parse_vsi_dev_caps(struct ice_hw *hw, struct ice_hw_dev_caps *dev_p,
-		       struct ice_aqc_list_caps_elem *cap)
+		       struct libie_aqc_list_caps_elem *cap)
 {
 	u32 number = le32_to_cpu(cap->number);
 
@@ -2746,7 +2747,7 @@ ice_parse_vsi_dev_caps(struct ice_hw *hw, struct ice_hw_dev_caps *dev_p,
  */
 static void
 ice_parse_1588_dev_caps(struct ice_hw *hw, struct ice_hw_dev_caps *dev_p,
-			struct ice_aqc_list_caps_elem *cap)
+			struct libie_aqc_list_caps_elem *cap)
 {
 	struct ice_ts_dev_info *info = &dev_p->ts_dev_info;
 	u32 logical_id = le32_to_cpu(cap->logical_id);
@@ -2807,7 +2808,7 @@ ice_parse_1588_dev_caps(struct ice_hw *hw, struct ice_hw_dev_caps *dev_p,
  */
 static void
 ice_parse_fdir_dev_caps(struct ice_hw *hw, struct ice_hw_dev_caps *dev_p,
-			struct ice_aqc_list_caps_elem *cap)
+			struct libie_aqc_list_caps_elem *cap)
 {
 	u32 number = le32_to_cpu(cap->number);
 
@@ -2827,7 +2828,7 @@ ice_parse_fdir_dev_caps(struct ice_hw *hw, struct ice_hw_dev_caps *dev_p,
  */
 static void
 ice_parse_sensor_reading_cap(struct ice_hw *hw, struct ice_hw_dev_caps *dev_p,
-			     struct ice_aqc_list_caps_elem *cap)
+			     struct libie_aqc_list_caps_elem *cap)
 {
 	dev_p->supported_sensors = le32_to_cpu(cap->number);
 
@@ -2846,7 +2847,7 @@ ice_parse_sensor_reading_cap(struct ice_hw *hw, struct ice_hw_dev_caps *dev_p,
  */
 static void ice_parse_nac_topo_dev_caps(struct ice_hw *hw,
 					struct ice_hw_dev_caps *dev_p,
-					struct ice_aqc_list_caps_elem *cap)
+					struct libie_aqc_list_caps_elem *cap)
 {
 	dev_p->nac_topo.mode = le32_to_cpu(cap->number);
 	dev_p->nac_topo.id = le32_to_cpu(cap->phys_id) & ICE_NAC_TOPO_ID_M;
@@ -2882,7 +2883,7 @@ static void
 ice_parse_dev_caps(struct ice_hw *hw, struct ice_hw_dev_caps *dev_p,
 		   void *buf, u32 cap_count)
 {
-	struct ice_aqc_list_caps_elem *cap_resp;
+	struct libie_aqc_list_caps_elem *cap_resp;
 	u32 i;
 
 	cap_resp = buf;
@@ -2897,25 +2898,25 @@ ice_parse_dev_caps(struct ice_hw *hw, struct ice_hw_dev_caps *dev_p,
 					      &cap_resp[i], "dev caps");
 
 		switch (cap) {
-		case ICE_AQC_CAPS_VALID_FUNCTIONS:
+		case LIBIE_AQC_CAPS_VALID_FUNCTIONS:
 			ice_parse_valid_functions_cap(hw, dev_p, &cap_resp[i]);
 			break;
-		case ICE_AQC_CAPS_VF:
+		case LIBIE_AQC_CAPS_VF:
 			ice_parse_vf_dev_caps(hw, dev_p, &cap_resp[i]);
 			break;
-		case ICE_AQC_CAPS_VSI:
+		case LIBIE_AQC_CAPS_VSI:
 			ice_parse_vsi_dev_caps(hw, dev_p, &cap_resp[i]);
 			break;
-		case ICE_AQC_CAPS_1588:
+		case LIBIE_AQC_CAPS_1588:
 			ice_parse_1588_dev_caps(hw, dev_p, &cap_resp[i]);
 			break;
-		case ICE_AQC_CAPS_FD:
+		case LIBIE_AQC_CAPS_FD:
 			ice_parse_fdir_dev_caps(hw, dev_p, &cap_resp[i]);
 			break;
-		case ICE_AQC_CAPS_SENSOR_READING:
+		case LIBIE_AQC_CAPS_SENSOR_READING:
 			ice_parse_sensor_reading_cap(hw, dev_p, &cap_resp[i]);
 			break;
-		case ICE_AQC_CAPS_NAC_TOPOLOGY:
+		case LIBIE_AQC_CAPS_NAC_TOPOLOGY:
 			ice_parse_nac_topo_dev_caps(hw, dev_p, &cap_resp[i]);
 			break;
 		default:
@@ -3035,8 +3036,8 @@ int
 ice_aq_list_caps(struct ice_hw *hw, void *buf, u16 buf_size, u32 *cap_count,
 		 enum ice_adminq_opc opc, struct ice_sq_cd *cd)
 {
-	struct ice_aqc_list_caps *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aqc_list_caps *cmd;
+	struct libie_aq_desc desc;
 	int status;
 
 	cmd = &desc.params.get_cap;
@@ -3077,7 +3078,7 @@ ice_discover_dev_caps(struct ice_hw *hw, struct ice_hw_dev_caps *dev_caps)
 	 * device will return, we can simply send a 4KB buffer, the maximum
 	 * possible size that firmware can return.
 	 */
-	cap_count = ICE_AQ_MAX_BUF_LEN / sizeof(struct ice_aqc_list_caps_elem);
+	cap_count = ICE_AQ_MAX_BUF_LEN / sizeof(struct libie_aqc_list_caps_elem);
 
 	status = ice_aq_list_caps(hw, cbuf, ICE_AQ_MAX_BUF_LEN, &cap_count,
 				  ice_aqc_opc_list_dev_caps, NULL);
@@ -3111,7 +3112,7 @@ ice_discover_func_caps(struct ice_hw *hw, struct ice_hw_func_caps *func_caps)
 	 * device will return, we can simply send a 4KB buffer, the maximum
 	 * possible size that firmware can return.
 	 */
-	cap_count = ICE_AQ_MAX_BUF_LEN / sizeof(struct ice_aqc_list_caps_elem);
+	cap_count = ICE_AQ_MAX_BUF_LEN / sizeof(struct libie_aqc_list_caps_elem);
 
 	status = ice_aq_list_caps(hw, cbuf, ICE_AQ_MAX_BUF_LEN, &cap_count,
 				  ice_aqc_opc_list_func_caps, NULL);
@@ -3220,9 +3221,9 @@ ice_aq_manage_mac_write(struct ice_hw *hw, const u8 *mac_addr, u8 flags,
 			struct ice_sq_cd *cd)
 {
 	struct ice_aqc_manage_mac_write *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 
-	cmd = &desc.params.mac_write;
+	cmd = libie_aq_raw(&desc);
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_manage_mac_write);
 
 	cmd->flags = flags;
@@ -3239,10 +3240,12 @@ ice_aq_manage_mac_write(struct ice_hw *hw, const u8 *mac_addr, u8 flags,
  */
 static int ice_aq_clear_pxe_mode(struct ice_hw *hw)
 {
-	struct ice_aq_desc desc;
+	struct ice_aqc_clear_pxe *cmd;
+	struct libie_aq_desc desc;
 
+	cmd = libie_aq_raw(&desc);
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_clear_pxe_mode);
-	desc.params.clear_pxe.rx_cnt = ICE_AQC_CLEAR_PXE_RX_CNT;
+	cmd->rx_cnt = ICE_AQC_CLEAR_PXE_RX_CNT;
 
 	return ice_aq_send_cmd(hw, &desc, NULL, 0, NULL);
 }
@@ -3275,10 +3278,10 @@ ice_aq_set_port_params(struct ice_port_info *pi, bool double_vlan,
 {
 	struct ice_aqc_set_port_params *cmd;
 	struct ice_hw *hw = pi->hw;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	u16 cmd_flags = 0;
 
-	cmd = &desc.params.set_port_params;
+	cmd = libie_aq_raw(&desc);
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_port_params);
 	if (double_vlan)
@@ -3515,7 +3518,8 @@ int
 ice_aq_set_phy_cfg(struct ice_hw *hw, struct ice_port_info *pi,
 		   struct ice_aqc_set_phy_cfg_data *cfg, struct ice_sq_cd *cd)
 {
-	struct ice_aq_desc desc;
+	struct ice_aqc_set_phy_cfg *cmd;
+	struct libie_aq_desc desc;
 	int status;
 
 	if (!cfg)
@@ -3530,8 +3534,9 @@ ice_aq_set_phy_cfg(struct ice_hw *hw, struct ice_port_info *pi,
 	}
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_phy_cfg);
-	desc.params.set_phy.lport_num = pi->lport;
-	desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
+	cmd = libie_aq_raw(&desc);
+	cmd->lport_num = pi->lport;
+	desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_RD);
 
 	ice_debug(hw, ICE_DBG_LINK, "set phy cfg\n");
 	ice_debug(hw, ICE_DBG_LINK, "	phy_type_low = 0x%llx\n",
@@ -3547,7 +3552,7 @@ ice_aq_set_phy_cfg(struct ice_hw *hw, struct ice_port_info *pi,
 		  cfg->link_fec_opt);
 
 	status = ice_aq_send_cmd(hw, &desc, cfg, sizeof(*cfg), cd);
-	if (hw->adminq.sq_last_status == ICE_AQ_RC_EMODE)
+	if (hw->adminq.sq_last_status == LIBIE_AQ_RC_EMODE)
 		status = 0;
 
 	if (!status)
@@ -3604,17 +3609,17 @@ int ice_aq_get_phy_equalization(struct ice_hw *hw, u16 data_in, u16 op_code,
 {
 	struct ice_aqc_dnl_call_command *cmd;
 	struct ice_aqc_dnl_call buf = {};
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	int err;
 
 	buf.sto.txrx_equa_reqs.data_in = cpu_to_le16(data_in);
 	buf.sto.txrx_equa_reqs.op_code_serdes_sel =
 		cpu_to_le16(op_code | (serdes_num & 0xF));
-	cmd = &desc.params.dnl_call;
+	cmd = libie_aq_raw(&desc);
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_dnl_call);
-	desc.flags |= cpu_to_le16(ICE_AQ_FLAG_BUF |
-				  ICE_AQ_FLAG_RD |
-				  ICE_AQ_FLAG_SI);
+	desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_BUF |
+				  LIBIE_AQ_FLAG_RD |
+				  LIBIE_AQ_FLAG_SI);
 	desc.datalen = cpu_to_le16(sizeof(struct ice_aqc_dnl_call));
 	cmd->activity_id = cpu_to_le16(ICE_AQC_ACT_ID_DNL);
 
@@ -3652,7 +3657,7 @@ static const u32 fec_reg[][ICE_FEC_MAX] = {
 int ice_aq_get_fec_stats(struct ice_hw *hw, u16 pcs_quad, u16 pcs_port,
 			 enum ice_fec_stats_types fec_type, u32 *output)
 {
-	u16 flag = (ICE_AQ_FLAG_RD | ICE_AQ_FLAG_BUF | ICE_AQ_FLAG_SI);
+	u16 flag = (LIBIE_AQ_FLAG_RD | LIBIE_AQ_FLAG_BUF | LIBIE_AQ_FLAG_SI);
 	struct ice_sbq_msg_input msg = {};
 	u32 receiver_id, reg_offset;
 	int err;
@@ -4075,9 +4080,9 @@ ice_aq_set_link_restart_an(struct ice_port_info *pi, bool ena_link,
 			   struct ice_sq_cd *cd)
 {
 	struct ice_aqc_restart_an *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 
-	cmd = &desc.params.restart_an;
+	cmd = libie_aq_raw(&desc);
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_restart_an);
 
@@ -4105,9 +4110,9 @@ ice_aq_set_event_mask(struct ice_hw *hw, u8 port_num, u16 mask,
 		      struct ice_sq_cd *cd)
 {
 	struct ice_aqc_set_event_mask *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 
-	cmd = &desc.params.set_event_mask;
+	cmd = libie_aq_raw(&desc);
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_event_mask);
 
@@ -4129,9 +4134,9 @@ int
 ice_aq_set_mac_loopback(struct ice_hw *hw, bool ena_lpbk, struct ice_sq_cd *cd)
 {
 	struct ice_aqc_set_mac_lb *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 
-	cmd = &desc.params.set_mac_lb;
+	cmd = libie_aq_raw(&desc);
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_mac_lb);
 	if (ena_lpbk)
@@ -4154,9 +4159,9 @@ ice_aq_set_port_id_led(struct ice_port_info *pi, bool is_orig_mode,
 {
 	struct ice_aqc_set_port_id_led *cmd;
 	struct ice_hw *hw = pi->hw;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 
-	cmd = &desc.params.set_port_id_led;
+	cmd = libie_aq_raw(&desc);
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_port_id_led);
 
@@ -4192,7 +4197,7 @@ ice_aq_get_port_options(struct ice_hw *hw,
 			u8 *pending_option_idx, bool *pending_option_valid)
 {
 	struct ice_aqc_get_port_options *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	int status;
 	u8 i;
 
@@ -4200,7 +4205,7 @@ ice_aq_get_port_options(struct ice_hw *hw,
 	if (*option_count < ICE_AQC_PORT_OPT_COUNT_M)
 		return -EINVAL;
 
-	cmd = &desc.params.get_port_options;
+	cmd = libie_aq_raw(&desc);
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_port_options);
 
 	if (lport_valid)
@@ -4266,12 +4271,12 @@ ice_aq_set_port_option(struct ice_hw *hw, u8 lport, u8 lport_valid,
 		       u8 new_option)
 {
 	struct ice_aqc_set_port_option *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 
 	if (new_option > ICE_AQC_PORT_OPT_COUNT_M)
 		return -EINVAL;
 
-	cmd = &desc.params.set_port_option;
+	cmd = libie_aq_raw(&desc);
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_port_option);
 
 	if (lport_valid)
@@ -4357,7 +4362,7 @@ ice_aq_sff_eeprom(struct ice_hw *hw, u16 lport, u8 bus_addr,
 		  bool write, struct ice_sq_cd *cd)
 {
 	struct ice_aqc_sff_eeprom *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	u16 i2c_bus_addr;
 	int status;
 
@@ -4365,8 +4370,8 @@ ice_aq_sff_eeprom(struct ice_hw *hw, u16 lport, u8 bus_addr,
 		return -EINVAL;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_sff_eeprom);
-	cmd = &desc.params.read_write_sff_param;
-	desc.flags = cpu_to_le16(ICE_AQ_FLAG_RD);
+	cmd = libie_aq_raw(&desc);
+	desc.flags = cpu_to_le16(LIBIE_AQ_FLAG_RD);
 	cmd->lport_num = (u8)(lport & 0xff);
 	cmd->lport_num_valid = (u8)((lport >> 8) & 0x01);
 	i2c_bus_addr = FIELD_PREP(ICE_AQC_SFF_I2CBUS_7BIT_M, bus_addr >> 1) |
@@ -4426,7 +4431,7 @@ __ice_aq_get_set_rss_lut(struct ice_hw *hw,
 	struct ice_aqc_get_set_rss_lut *desc_params;
 	enum ice_aqc_lut_flags flags;
 	enum ice_lut_size lut_size;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	u8 *lut = params->lut;
 
 
@@ -4442,9 +4447,9 @@ __ice_aq_get_set_rss_lut(struct ice_hw *hw,
 	opcode = set ? ice_aqc_opc_set_rss_lut : ice_aqc_opc_get_rss_lut;
 	ice_fill_dflt_direct_cmd_desc(&desc, opcode);
 	if (set)
-		desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
+		desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_RD);
 
-	desc_params = &desc.params.get_set_rss_lut;
+	desc_params = libie_aq_raw(&desc);
 	vsi_id = ice_get_hw_vsi_num(hw, vsi_handle);
 	desc_params->vsi_id = cpu_to_le16(vsi_id | ICE_AQC_RSS_VSI_VALID);
 
@@ -4499,16 +4504,16 @@ __ice_aq_get_set_rss_key(struct ice_hw *hw, u16 vsi_id,
 {
 	struct ice_aqc_get_set_rss_key *desc_params;
 	u16 key_size = sizeof(*key);
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 
 	if (set) {
 		ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_rss_key);
-		desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
+		desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_RD);
 	} else {
 		ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_rss_key);
 	}
 
-	desc_params = &desc.params.get_set_rss_key;
+	desc_params = libie_aq_raw(&desc);
 	desc_params->vsi_id = cpu_to_le16(vsi_id | ICE_AQC_RSS_VSI_VALID);
 
 	return ice_aq_send_cmd(hw, &desc, key, key_size, NULL);
@@ -4580,10 +4585,10 @@ ice_aq_add_lan_txq(struct ice_hw *hw, u8 num_qgrps,
 {
 	struct ice_aqc_add_tx_qgrp *list;
 	struct ice_aqc_add_txqs *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	u16 i, sum_size = 0;
 
-	cmd = &desc.params.add_txqs;
+	cmd = libie_aq_raw(&desc);
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_add_txqs);
 
@@ -4602,7 +4607,7 @@ ice_aq_add_lan_txq(struct ice_hw *hw, u8 num_qgrps,
 	if (buf_size != sum_size)
 		return -EINVAL;
 
-	desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
+	desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_RD);
 
 	cmd->num_qgrps = num_qgrps;
 
@@ -4629,12 +4634,12 @@ ice_aq_dis_lan_txq(struct ice_hw *hw, u8 num_qgrps,
 {
 	struct ice_aqc_dis_txq_item *item;
 	struct ice_aqc_dis_txqs *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	u16 vmvf_and_timeout;
 	u16 i, sz = 0;
 	int status;
 
-	cmd = &desc.params.dis_txqs;
+	cmd = libie_aq_raw(&desc);
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_dis_txqs);
 
 	/* qg_list can be NULL only in VM/VF reset flow */
@@ -4675,7 +4680,7 @@ ice_aq_dis_lan_txq(struct ice_hw *hw, u8 num_qgrps,
 	/* set RD bit to indicate that command buffer is provided by the driver
 	 * and it needs to be read by the firmware
 	 */
-	desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
+	desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_RD);
 
 	for (i = 0, item = qg_list; i < num_qgrps; i++) {
 		u16 item_size = struct_size(item, q_id, item->num_qs);
@@ -4727,12 +4732,12 @@ ice_aq_cfg_lan_txq(struct ice_hw *hw, struct ice_aqc_cfg_txqs_buf *buf,
 		   struct ice_sq_cd *cd)
 {
 	struct ice_aqc_cfg_txqs *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	int status;
 
-	cmd = &desc.params.cfg_txqs;
+	cmd = libie_aq_raw(&desc);
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_cfg_txqs);
-	desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
+	desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_RD);
 
 	if (!buf)
 		return -EINVAL;
@@ -4768,10 +4773,10 @@ ice_aq_add_rdma_qsets(struct ice_hw *hw, u8 num_qset_grps,
 {
 	struct ice_aqc_add_rdma_qset_data *list;
 	struct ice_aqc_add_rdma_qset *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	u16 i, sum_size = 0;
 
-	cmd = &desc.params.add_rdma_qset;
+	cmd = libie_aq_raw(&desc);
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_add_rdma_qset);
 
@@ -4789,7 +4794,7 @@ ice_aq_add_rdma_qsets(struct ice_hw *hw, u8 num_qset_grps,
 	if (buf_size != sum_size)
 		return -EINVAL;
 
-	desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
+	desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_RD);
 
 	cmd->num_qset_grps = num_qset_grps;
 
@@ -5223,10 +5228,10 @@ int ice_aq_get_cgu_input_pin_measure(struct ice_hw *hw, u8 dpll_idx,
 				     u16 meas_num)
 {
 	struct ice_aqc_get_cgu_input_measure *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_cgu_input_measure);
-	cmd = &desc.params.get_cgu_input_measure;
+	cmd = libie_aq_raw(&desc);
 	cmd->dpll_idx_opt = dpll_idx & ICE_AQC_GET_CGU_IN_MEAS_DPLL_IDX_M;
 
 	return ice_aq_send_cmd(hw, &desc, meas, meas_num * sizeof(*meas), NULL);
@@ -5244,7 +5249,7 @@ int
 ice_aq_get_cgu_abilities(struct ice_hw *hw,
 			 struct ice_aqc_get_cgu_abilities *abilities)
 {
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_cgu_abilities);
 	return ice_aq_send_cmd(hw, &desc, abilities, sizeof(*abilities), NULL);
@@ -5267,10 +5272,10 @@ ice_aq_set_input_pin_cfg(struct ice_hw *hw, u8 input_idx, u8 flags1, u8 flags2,
 			 u32 freq, s32 phase_delay)
 {
 	struct ice_aqc_set_cgu_input_config *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_cgu_input_config);
-	cmd = &desc.params.set_cgu_input_config;
+	cmd = libie_aq_raw(&desc);
 	cmd->input_idx = input_idx;
 	cmd->flags1 = flags1;
 	cmd->flags2 = flags2;
@@ -5299,11 +5304,11 @@ ice_aq_get_input_pin_cfg(struct ice_hw *hw, u8 input_idx, u8 *status, u8 *type,
 			 u8 *flags1, u8 *flags2, u32 *freq, s32 *phase_delay)
 {
 	struct ice_aqc_get_cgu_input_config *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	int ret;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_cgu_input_config);
-	cmd = &desc.params.get_cgu_input_config;
+	cmd = libie_aq_raw(&desc);
 	cmd->input_idx = input_idx;
 
 	ret = ice_aq_send_cmd(hw, &desc, NULL, 0, NULL);
@@ -5342,10 +5347,10 @@ ice_aq_set_output_pin_cfg(struct ice_hw *hw, u8 output_idx, u8 flags,
 			  u8 src_sel, u32 freq, s32 phase_delay)
 {
 	struct ice_aqc_set_cgu_output_config *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_cgu_output_config);
-	cmd = &desc.params.set_cgu_output_config;
+	cmd = libie_aq_raw(&desc);
 	cmd->output_idx = output_idx;
 	cmd->flags = flags;
 	cmd->src_sel = src_sel;
@@ -5372,11 +5377,11 @@ ice_aq_get_output_pin_cfg(struct ice_hw *hw, u8 output_idx, u8 *flags,
 			  u8 *src_sel, u32 *freq, u32 *src_freq)
 {
 	struct ice_aqc_get_cgu_output_config *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	int ret;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_cgu_output_config);
-	cmd = &desc.params.get_cgu_output_config;
+	cmd = libie_aq_raw(&desc);
 	cmd->output_idx = output_idx;
 
 	ret = ice_aq_send_cmd(hw, &desc, NULL, 0, NULL);
@@ -5413,11 +5418,11 @@ ice_aq_get_cgu_dpll_status(struct ice_hw *hw, u8 dpll_num, u8 *ref_state,
 			   u8 *eec_mode)
 {
 	struct ice_aqc_get_cgu_dpll_status *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	int status;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_cgu_dpll_status);
-	cmd = &desc.params.get_cgu_dpll_status;
+	cmd = libie_aq_raw(&desc);
 	cmd->dpll_num = dpll_num;
 
 	status = ice_aq_send_cmd(hw, &desc, NULL, 0, NULL);
@@ -5451,10 +5456,10 @@ ice_aq_set_cgu_dpll_config(struct ice_hw *hw, u8 dpll_num, u8 ref_state,
 			   u8 config, u8 eec_mode)
 {
 	struct ice_aqc_set_cgu_dpll_config *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_cgu_dpll_config);
-	cmd = &desc.params.set_cgu_dpll_config;
+	cmd = libie_aq_raw(&desc);
 	cmd->dpll_num = dpll_num;
 	cmd->ref_state = ref_state;
 	cmd->config = config;
@@ -5478,10 +5483,10 @@ ice_aq_set_cgu_ref_prio(struct ice_hw *hw, u8 dpll_num, u8 ref_idx,
 			u8 ref_priority)
 {
 	struct ice_aqc_set_cgu_ref_prio *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_cgu_ref_prio);
-	cmd = &desc.params.set_cgu_ref_prio;
+	cmd = libie_aq_raw(&desc);
 	cmd->dpll_num = dpll_num;
 	cmd->ref_idx = ref_idx;
 	cmd->ref_priority = ref_priority;
@@ -5504,11 +5509,11 @@ ice_aq_get_cgu_ref_prio(struct ice_hw *hw, u8 dpll_num, u8 ref_idx,
 			u8 *ref_prio)
 {
 	struct ice_aqc_get_cgu_ref_prio *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	int status;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_cgu_ref_prio);
-	cmd = &desc.params.get_cgu_ref_prio;
+	cmd = libie_aq_raw(&desc);
 	cmd->dpll_num = dpll_num;
 	cmd->ref_idx = ref_idx;
 
@@ -5534,11 +5539,11 @@ ice_aq_get_cgu_info(struct ice_hw *hw, u32 *cgu_id, u32 *cgu_cfg_ver,
 		    u32 *cgu_fw_ver)
 {
 	struct ice_aqc_get_cgu_info *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	int status;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_cgu_info);
-	cmd = &desc.params.get_cgu_info;
+	cmd = libie_aq_raw(&desc);
 
 	status = ice_aq_send_cmd(hw, &desc, NULL, 0, NULL);
 	if (!status) {
@@ -5565,11 +5570,11 @@ ice_aq_set_phy_rec_clk_out(struct ice_hw *hw, u8 phy_output, bool enable,
 			   u32 *freq)
 {
 	struct ice_aqc_set_phy_rec_clk_out *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	int status;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_phy_rec_clk_out);
-	cmd = &desc.params.set_phy_rec_clk_out;
+	cmd = libie_aq_raw(&desc);
 	cmd->phy_output = phy_output;
 	cmd->port_num = ICE_AQC_SET_PHY_REC_CLK_OUT_CURR_PORT;
 	cmd->flags = enable & ICE_AQC_SET_PHY_REC_CLK_OUT_OUT_EN;
@@ -5598,11 +5603,11 @@ ice_aq_get_phy_rec_clk_out(struct ice_hw *hw, u8 *phy_output, u8 *port_num,
 			   u8 *flags, u16 *node_handle)
 {
 	struct ice_aqc_get_phy_rec_clk_out *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	int status;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_phy_rec_clk_out);
-	cmd = &desc.params.get_phy_rec_clk_out;
+	cmd = libie_aq_raw(&desc);
 	cmd->phy_output = *phy_output;
 
 	status = ice_aq_send_cmd(hw, &desc, NULL, 0, NULL);
@@ -5630,11 +5635,11 @@ int ice_aq_get_sensor_reading(struct ice_hw *hw,
 			      struct ice_aqc_get_sensor_reading_resp *data)
 {
 	struct ice_aqc_get_sensor_reading *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	int status;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_sensor_reading);
-	cmd = &desc.params.get_sensor_reading;
+	cmd = libie_aq_raw(&desc);
 #define ICE_INTERNAL_TEMP_SENSOR_FORMAT	0
 #define ICE_INTERNAL_TEMP_SENSOR	0
 	cmd->sensor = ICE_INTERNAL_TEMP_SENSOR;
@@ -5642,7 +5647,7 @@ int ice_aq_get_sensor_reading(struct ice_hw *hw,
 
 	status = ice_aq_send_cmd(hw, &desc, NULL, 0, NULL);
 	if (!status)
-		memcpy(data, &desc.params.get_sensor_reading_resp,
+		memcpy(data, &desc.params.raw,
 		       sizeof(*data));
 
 	return status;
@@ -5839,13 +5844,13 @@ ice_aq_read_i2c(struct ice_hw *hw, struct ice_aqc_link_topo_addr topo_addr,
 		u16 bus_addr, __le16 addr, u8 params, u8 *data,
 		struct ice_sq_cd *cd)
 {
-	struct ice_aq_desc desc = { 0 };
+	struct libie_aq_desc desc = { 0 };
 	struct ice_aqc_i2c *cmd;
 	u8 data_size;
 	int status;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_read_i2c);
-	cmd = &desc.params.read_write_i2c;
+	cmd = libie_aq_raw(&desc);
 
 	if (!data)
 		return -EINVAL;
@@ -5862,7 +5867,7 @@ ice_aq_read_i2c(struct ice_hw *hw, struct ice_aqc_link_topo_addr topo_addr,
 		struct ice_aqc_read_i2c_resp *resp;
 		u8 i;
 
-		resp = &desc.params.read_i2c_resp;
+		resp = libie_aq_raw(&desc);
 		for (i = 0; i < data_size; i++) {
 			*data = resp->i2c_data[i];
 			data++;
@@ -5894,12 +5899,12 @@ ice_aq_write_i2c(struct ice_hw *hw, struct ice_aqc_link_topo_addr topo_addr,
 		 u16 bus_addr, __le16 addr, u8 params, const u8 *data,
 		 struct ice_sq_cd *cd)
 {
-	struct ice_aq_desc desc = { 0 };
+	struct libie_aq_desc desc = { 0 };
 	struct ice_aqc_i2c *cmd;
 	u8 data_size;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_write_i2c);
-	cmd = &desc.params.read_write_i2c;
+	cmd = libie_aq_raw(&desc);
 
 	data_size = FIELD_GET(ICE_AQC_I2C_DATA_SIZE_M, params);
 
@@ -5931,7 +5936,7 @@ ice_aq_write_i2c(struct ice_hw *hw, struct ice_aqc_link_topo_addr topo_addr,
 int ice_get_pca9575_handle(struct ice_hw *hw, u16 *pca9575_handle)
 {
 	struct ice_aqc_get_link_topo *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	int err;
 	u8 idx;
 
@@ -5954,7 +5959,7 @@ int ice_get_pca9575_handle(struct ice_hw *hw, u16 *pca9575_handle)
 
 	/* If handle was not detected read it from the netlist */
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_link_topo);
-	cmd = &desc.params.get_link_topo;
+	cmd = libie_aq_raw(&desc);
 	cmd->addr.topo_params.node_type_ctx =
 		ICE_AQC_LINK_TOPO_NODE_TYPE_GPIO_CTRL;
 	cmd->addr.topo_params.index = idx;
@@ -5964,13 +5969,12 @@ int ice_get_pca9575_handle(struct ice_hw *hw, u16 *pca9575_handle)
 		return -ENXIO;
 
 	/* Verify if we found the right IO expander type */
-	if (desc.params.get_link_topo.node_part_num !=
-	    ICE_AQC_GET_LINK_TOPO_NODE_NR_PCA9575)
+	if (cmd->node_part_num != ICE_AQC_GET_LINK_TOPO_NODE_NR_PCA9575)
 		return -ENXIO;
 
 	/* If present save the handle and return it */
 	hw->io_expander_handle =
-		le16_to_cpu(desc.params.get_link_topo.addr.handle);
+		le16_to_cpu(cmd->addr.handle);
 	*pca9575_handle = hw->io_expander_handle;
 
 	return 0;
@@ -6021,11 +6025,11 @@ int
 ice_aq_set_gpio(struct ice_hw *hw, u16 gpio_ctrl_handle, u8 pin_idx, bool value,
 		struct ice_sq_cd *cd)
 {
+	struct libie_aq_desc desc;
 	struct ice_aqc_gpio *cmd;
-	struct ice_aq_desc desc;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_gpio);
-	cmd = &desc.params.read_write_gpio;
+	cmd = libie_aq_raw(&desc);
 	cmd->gpio_ctrl_handle = cpu_to_le16(gpio_ctrl_handle);
 	cmd->gpio_num = pin_idx;
 	cmd->gpio_val = value ? 1 : 0;
@@ -6048,12 +6052,12 @@ int
 ice_aq_get_gpio(struct ice_hw *hw, u16 gpio_ctrl_handle, u8 pin_idx,
 		bool *value, struct ice_sq_cd *cd)
 {
+	struct libie_aq_desc desc;
 	struct ice_aqc_gpio *cmd;
-	struct ice_aq_desc desc;
 	int status;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_gpio);
-	cmd = &desc.params.read_write_gpio;
+	cmd = libie_aq_raw(&desc);
 	cmd->gpio_ctrl_handle = cpu_to_le16(gpio_ctrl_handle);
 	cmd->gpio_num = pin_idx;
 
@@ -6216,9 +6220,9 @@ bool ice_is_fw_health_report_supported(struct ice_hw *hw)
 int ice_aq_set_health_status_cfg(struct ice_hw *hw, u8 event_source)
 {
 	struct ice_aqc_set_health_status_cfg *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 
-	cmd = &desc.params.set_health_status_cfg;
+	cmd = libie_aq_raw(&desc);
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_health_status_cfg);
 
@@ -6242,16 +6246,16 @@ ice_aq_set_lldp_mib(struct ice_hw *hw, u8 mib_type, void *buf, u16 buf_size,
 		    struct ice_sq_cd *cd)
 {
 	struct ice_aqc_lldp_set_local_mib *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 
-	cmd = &desc.params.lldp_set_mib;
+	cmd = libie_aq_raw(&desc);
 
 	if (buf_size == 0 || !buf)
 		return -EINVAL;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_lldp_set_local_mib);
 
-	desc.flags |= cpu_to_le16((u16)ICE_AQ_FLAG_RD);
+	desc.flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_RD);
 	desc.datalen = cpu_to_le16(buf_size);
 
 	cmd->type = mib_type;
@@ -6287,12 +6291,12 @@ bool ice_fw_supports_lldp_fltr_ctrl(struct ice_hw *hw)
 int ice_lldp_fltr_add_remove(struct ice_hw *hw, struct ice_vsi *vsi, bool add)
 {
 	struct ice_aqc_lldp_filter_ctrl *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 
 	if (vsi->type != ICE_VSI_PF || !ice_fw_supports_lldp_fltr_ctrl(hw))
 		return -EOPNOTSUPP;
 
-	cmd = &desc.params.lldp_filter_ctrl;
+	cmd = libie_aq_raw(&desc);
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_lldp_filter_ctrl);
 
@@ -6312,7 +6316,7 @@ int ice_lldp_fltr_add_remove(struct ice_hw *hw, struct ice_vsi *vsi, bool add)
  */
 int ice_lldp_execute_pending_mib(struct ice_hw *hw)
 {
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_lldp_execute_pending_mib);
 
@@ -6389,7 +6393,7 @@ int ice_read_cgu_reg(struct ice_hw *hw, u32 addr, u32 *val)
 	};
 	int err;
 
-	err = ice_sbq_rw_reg(hw, &cgu_msg, ICE_AQ_FLAG_RD);
+	err = ice_sbq_rw_reg(hw, &cgu_msg, LIBIE_AQ_FLAG_RD);
 	if (err) {
 		ice_debug(hw, ICE_DBG_PTP, "Failed to read CGU register 0x%04x, err %d\n",
 			  addr, err);
@@ -6422,7 +6426,7 @@ int ice_write_cgu_reg(struct ice_hw *hw, u32 addr, u32 val)
 	};
 	int err;
 
-	err = ice_sbq_rw_reg(hw, &cgu_msg, ICE_AQ_FLAG_RD);
+	err = ice_sbq_rw_reg(hw, &cgu_msg, LIBIE_AQ_FLAG_RD);
 	if (err)
 		ice_debug(hw, ICE_DBG_PTP, "Failed to write CGU register 0x%04x, err %d\n",
 			  addr, err);
diff --git a/drivers/net/ethernet/intel/ice/ice_common.h b/drivers/net/ethernet/intel/ice/ice_common.h
index 25d9785f32cc..60320cdf7804 100644
--- a/drivers/net/ethernet/intel/ice/ice_common.h
+++ b/drivers/net/ethernet/intel/ice/ice_common.h
@@ -109,7 +109,7 @@ bool ice_is_sbq_supported(struct ice_hw *hw);
 struct ice_ctl_q_info *ice_get_sbq(struct ice_hw *hw);
 int
 ice_sq_send_cmd(struct ice_hw *hw, struct ice_ctl_q_info *cq,
-		struct ice_aq_desc *desc, void *buf, u16 buf_size,
+		struct libie_aq_desc *desc, void *buf, u16 buf_size,
 		struct ice_sq_cd *cd);
 void ice_clear_pxe_mode(struct ice_hw *hw);
 int ice_get_caps(struct ice_hw *hw);
@@ -138,14 +138,14 @@ ice_aq_set_rss_key(struct ice_hw *hw, u16 vsi_handle,
 
 bool ice_check_sq_alive(struct ice_hw *hw, struct ice_ctl_q_info *cq);
 int ice_aq_q_shutdown(struct ice_hw *hw, bool unloading);
-void ice_fill_dflt_direct_cmd_desc(struct ice_aq_desc *desc, u16 opcode);
+void ice_fill_dflt_direct_cmd_desc(struct libie_aq_desc *desc, u16 opcode);
 
 void ice_pack_txq_ctx(const struct ice_tlan_ctx *ctx, ice_txq_ctx_buf_t *buf);
 
 extern struct mutex ice_global_cfg_lock_sw;
 
 int
-ice_aq_send_cmd(struct ice_hw *hw, struct ice_aq_desc *desc,
+ice_aq_send_cmd(struct ice_hw *hw, struct libie_aq_desc *desc,
 		void *buf, u16 buf_size, struct ice_sq_cd *cd);
 int ice_aq_get_fw_ver(struct ice_hw *hw, struct ice_sq_cd *cd);
 
diff --git a/drivers/net/ethernet/intel/ice/ice_controlq.c b/drivers/net/ethernet/intel/ice/ice_controlq.c
index e3959ad442a2..dcb837cadd18 100644
--- a/drivers/net/ethernet/intel/ice/ice_controlq.c
+++ b/drivers/net/ethernet/intel/ice/ice_controlq.c
@@ -90,7 +90,7 @@ bool ice_check_sq_alive(struct ice_hw *hw, struct ice_ctl_q_info *cq)
 static int
 ice_alloc_ctrlq_sq_ring(struct ice_hw *hw, struct ice_ctl_q_info *cq)
 {
-	size_t size = cq->num_sq_entries * sizeof(struct ice_aq_desc);
+	size_t size = cq->num_sq_entries * sizeof(struct libie_aq_desc);
 
 	cq->sq.desc_buf.va = dmam_alloc_coherent(ice_hw_to_dev(hw), size,
 						 &cq->sq.desc_buf.pa,
@@ -110,7 +110,7 @@ ice_alloc_ctrlq_sq_ring(struct ice_hw *hw, struct ice_ctl_q_info *cq)
 static int
 ice_alloc_ctrlq_rq_ring(struct ice_hw *hw, struct ice_ctl_q_info *cq)
 {
-	size_t size = cq->num_rq_entries * sizeof(struct ice_aq_desc);
+	size_t size = cq->num_rq_entries * sizeof(struct libie_aq_desc);
 
 	cq->rq.desc_buf.va = dmam_alloc_coherent(ice_hw_to_dev(hw), size,
 						 &cq->rq.desc_buf.pa,
@@ -159,7 +159,7 @@ ice_alloc_rq_bufs(struct ice_hw *hw, struct ice_ctl_q_info *cq)
 
 	/* allocate the mapped buffers */
 	for (i = 0; i < cq->num_rq_entries; i++) {
-		struct ice_aq_desc *desc;
+		struct libie_aq_desc *desc;
 		struct ice_dma_mem *bi;
 
 		bi = &cq->rq.r.rq_bi[i];
@@ -173,9 +173,9 @@ ice_alloc_rq_bufs(struct ice_hw *hw, struct ice_ctl_q_info *cq)
 		/* now configure the descriptors for use */
 		desc = ICE_CTL_Q_DESC(cq->rq, i);
 
-		desc->flags = cpu_to_le16(ICE_AQ_FLAG_BUF);
-		if (cq->rq_buf_size > ICE_AQ_LG_BUF)
-			desc->flags |= cpu_to_le16(ICE_AQ_FLAG_LB);
+		desc->flags = cpu_to_le16(LIBIE_AQ_FLAG_BUF);
+		if (cq->rq_buf_size > LIBIE_AQ_LG_BUF)
+			desc->flags |= cpu_to_le16(LIBIE_AQ_FLAG_LB);
 		desc->opcode = 0;
 		/* This is in accordance with control queue design, there is no
 		 * register for buffer size configuration
@@ -858,7 +858,7 @@ static u16 ice_clean_sq(struct ice_hw *hw, struct ice_ctl_q_info *cq)
 {
 	struct ice_ctl_q_ring *sq = &cq->sq;
 	u16 ntc = sq->next_to_clean;
-	struct ice_aq_desc *desc;
+	struct libie_aq_desc *desc;
 
 	desc = ICE_CTL_Q_DESC(*sq, ntc);
 
@@ -912,7 +912,7 @@ static const char *ice_ctl_q_str(enum ice_ctl_q qtype)
 static void ice_debug_cq(struct ice_hw *hw, struct ice_ctl_q_info *cq,
 			 void *desc, void *buf, u16 buf_len, bool response)
 {
-	struct ice_aq_desc *cq_desc = desc;
+	struct libie_aq_desc *cq_desc = desc;
 	u16 datalen, flags;
 
 	if (!IS_ENABLED(CONFIG_DYNAMIC_DEBUG) &&
@@ -939,7 +939,8 @@ static void ice_debug_cq(struct ice_hw *hw, struct ice_ctl_q_info *cq,
 	 * by the DD and/or CMP flag set or a command with the RD flag set.
 	 */
 	if (buf && cq_desc->datalen &&
-	    (flags & (ICE_AQ_FLAG_DD | ICE_AQ_FLAG_CMP | ICE_AQ_FLAG_RD))) {
+	    (flags & (LIBIE_AQ_FLAG_DD | LIBIE_AQ_FLAG_CMP |
+		      LIBIE_AQ_FLAG_RD))) {
 		char prefix[] = KBUILD_MODNAME " 0x12341234 0x12341234 ";
 
 		sprintf(prefix, KBUILD_MODNAME " 0x%08X 0x%08X ",
@@ -992,11 +993,11 @@ static bool ice_sq_done(struct ice_hw *hw, struct ice_ctl_q_info *cq)
  */
 int
 ice_sq_send_cmd(struct ice_hw *hw, struct ice_ctl_q_info *cq,
-		struct ice_aq_desc *desc, void *buf, u16 buf_size,
+		struct libie_aq_desc *desc, void *buf, u16 buf_size,
 		struct ice_sq_cd *cd)
 {
 	struct ice_dma_mem *dma_buf = NULL;
-	struct ice_aq_desc *desc_on_ring;
+	struct libie_aq_desc *desc_on_ring;
 	bool cmd_completed = false;
 	int status = 0;
 	u16 retval = 0;
@@ -1007,7 +1008,7 @@ ice_sq_send_cmd(struct ice_hw *hw, struct ice_ctl_q_info *cq,
 		return -EBUSY;
 	mutex_lock(&cq->sq_lock);
 
-	cq->sq_last_status = ICE_AQ_RC_OK;
+	cq->sq_last_status = LIBIE_AQ_RC_OK;
 
 	if (!cq->sq.count) {
 		ice_debug(hw, ICE_DBG_AQ_MSG, "Control Send queue not initialized.\n");
@@ -1028,9 +1029,9 @@ ice_sq_send_cmd(struct ice_hw *hw, struct ice_ctl_q_info *cq,
 			goto sq_send_command_error;
 		}
 
-		desc->flags |= cpu_to_le16(ICE_AQ_FLAG_BUF);
-		if (buf_size > ICE_AQ_LG_BUF)
-			desc->flags |= cpu_to_le16(ICE_AQ_FLAG_LB);
+		desc->flags |= cpu_to_le16(LIBIE_AQ_FLAG_BUF);
+		if (buf_size > LIBIE_AQ_LG_BUF)
+			desc->flags |= cpu_to_le16(LIBIE_AQ_FLAG_LB);
 	}
 
 	val = rd32(hw, cq->sq.head);
@@ -1112,9 +1113,9 @@ ice_sq_send_cmd(struct ice_hw *hw, struct ice_ctl_q_info *cq,
 			retval &= 0xff;
 		}
 		cmd_completed = true;
-		if (!status && retval != ICE_AQ_RC_OK)
+		if (!status && retval != LIBIE_AQ_RC_OK)
 			status = -EIO;
-		cq->sq_last_status = (enum ice_aq_err)retval;
+		cq->sq_last_status = (enum libie_aq_err)retval;
 	}
 
 	ice_debug(hw, ICE_DBG_AQ_MSG, "ATQ: desc and buffer writeback:\n");
@@ -1149,12 +1150,12 @@ sq_send_command_error:
  *
  * Fill the desc with default values
  */
-void ice_fill_dflt_direct_cmd_desc(struct ice_aq_desc *desc, u16 opcode)
+void ice_fill_dflt_direct_cmd_desc(struct libie_aq_desc *desc, u16 opcode)
 {
 	/* zero out the desc */
 	memset(desc, 0, sizeof(*desc));
 	desc->opcode = cpu_to_le16(opcode);
-	desc->flags = cpu_to_le16(ICE_AQ_FLAG_SI);
+	desc->flags = cpu_to_le16(LIBIE_AQ_FLAG_SI);
 }
 
 /**
@@ -1172,9 +1173,9 @@ int
 ice_clean_rq_elem(struct ice_hw *hw, struct ice_ctl_q_info *cq,
 		  struct ice_rq_event_info *e, u16 *pending)
 {
+	enum libie_aq_err rq_last_status;
 	u16 ntc = cq->rq.next_to_clean;
-	enum ice_aq_err rq_last_status;
-	struct ice_aq_desc *desc;
+	struct libie_aq_desc *desc;
 	struct ice_dma_mem *bi;
 	int ret_code = 0;
 	u16 desc_idx;
@@ -1207,9 +1208,9 @@ ice_clean_rq_elem(struct ice_hw *hw, struct ice_ctl_q_info *cq,
 	desc = ICE_CTL_Q_DESC(cq->rq, ntc);
 	desc_idx = ntc;
 
-	rq_last_status = (enum ice_aq_err)le16_to_cpu(desc->retval);
+	rq_last_status = (enum libie_aq_err)le16_to_cpu(desc->retval);
 	flags = le16_to_cpu(desc->flags);
-	if (flags & ICE_AQ_FLAG_ERR) {
+	if (flags & LIBIE_AQ_FLAG_ERR) {
 		ret_code = -EIO;
 		ice_debug(hw, ICE_DBG_AQ_MSG, "Control Receive Queue Event 0x%04X received with error 0x%X\n",
 			  le16_to_cpu(desc->opcode), rq_last_status);
@@ -1230,9 +1231,9 @@ ice_clean_rq_elem(struct ice_hw *hw, struct ice_ctl_q_info *cq,
 	bi = &cq->rq.r.rq_bi[ntc];
 	memset(desc, 0, sizeof(*desc));
 
-	desc->flags = cpu_to_le16(ICE_AQ_FLAG_BUF);
-	if (cq->rq_buf_size > ICE_AQ_LG_BUF)
-		desc->flags |= cpu_to_le16(ICE_AQ_FLAG_LB);
+	desc->flags = cpu_to_le16(LIBIE_AQ_FLAG_BUF);
+	if (cq->rq_buf_size > LIBIE_AQ_LG_BUF)
+		desc->flags |= cpu_to_le16(LIBIE_AQ_FLAG_LB);
 	desc->datalen = cpu_to_le16(bi->size);
 	desc->params.generic.addr_high = cpu_to_le32(upper_32_bits(bi->pa));
 	desc->params.generic.addr_low = cpu_to_le32(lower_32_bits(bi->pa));
diff --git a/drivers/net/ethernet/intel/ice/ice_controlq.h b/drivers/net/ethernet/intel/ice/ice_controlq.h
index ca97b7365a1b..788040dd662e 100644
--- a/drivers/net/ethernet/intel/ice/ice_controlq.h
+++ b/drivers/net/ethernet/intel/ice/ice_controlq.h
@@ -12,7 +12,7 @@
 #define ICE_SBQ_MAX_BUF_LEN 512
 
 #define ICE_CTL_Q_DESC(R, i) \
-	(&(((struct ice_aq_desc *)((R).desc_buf.va))[i]))
+	(&(((struct libie_aq_desc *)((R).desc_buf.va))[i]))
 
 #define ICE_CTL_Q_DESC_UNUSED(R) \
 	((u16)((((R)->next_to_clean > (R)->next_to_use) ? 0 : (R)->count) + \
@@ -76,12 +76,12 @@ struct ice_ctl_q_ring {
 
 /* sq transaction details */
 struct ice_sq_cd {
-	struct ice_aq_desc *wb_desc;
+	struct libie_aq_desc *wb_desc;
 };
 
 /* rq event information */
 struct ice_rq_event_info {
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	u16 msg_len;
 	u16 buf_len;
 	u8 *msg_buf;
@@ -96,7 +96,7 @@ struct ice_ctl_q_info {
 	u16 num_sq_entries;		/* send queue depth */
 	u16 rq_buf_size;		/* receive queue buffer size */
 	u16 sq_buf_size;		/* send queue buffer size */
-	enum ice_aq_err sq_last_status;	/* last status on send queue */
+	enum libie_aq_err sq_last_status;	/* last status on send queue */
 	struct mutex sq_lock;		/* Send queue lock */
 	struct mutex rq_lock;		/* Receive queue lock */
 };
diff --git a/drivers/net/ethernet/intel/ice/ice_dcb.c b/drivers/net/ethernet/intel/ice/ice_dcb.c
index 64737fc62306..abea84f14658 100644
--- a/drivers/net/ethernet/intel/ice/ice_dcb.c
+++ b/drivers/net/ethernet/intel/ice/ice_dcb.c
@@ -24,10 +24,10 @@ ice_aq_get_lldp_mib(struct ice_hw *hw, u8 bridge_type, u8 mib_type, void *buf,
 		    struct ice_sq_cd *cd)
 {
 	struct ice_aqc_lldp_get_mib *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	int status;
 
-	cmd = &desc.params.lldp_get_mib;
+	cmd = libie_aq_raw(&desc);
 
 	if (buf_size == 0 || !buf)
 		return -EINVAL;
@@ -64,9 +64,9 @@ ice_aq_cfg_lldp_mib_change(struct ice_hw *hw, bool ena_update,
 			   struct ice_sq_cd *cd)
 {
 	struct ice_aqc_lldp_set_mib_change *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 
-	cmd = &desc.params.lldp_set_event;
+	cmd = libie_aq_raw(&desc);
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_lldp_set_mib_change);
 
@@ -95,9 +95,9 @@ ice_aq_stop_lldp(struct ice_hw *hw, bool shutdown_lldp_agent, bool persist,
 		 struct ice_sq_cd *cd)
 {
 	struct ice_aqc_lldp_stop *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 
-	cmd = &desc.params.lldp_stop;
+	cmd = libie_aq_raw(&desc);
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_lldp_stop);
 
@@ -121,9 +121,9 @@ ice_aq_stop_lldp(struct ice_hw *hw, bool shutdown_lldp_agent, bool persist,
 int ice_aq_start_lldp(struct ice_hw *hw, bool persist, struct ice_sq_cd *cd)
 {
 	struct ice_aqc_lldp_start *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 
-	cmd = &desc.params.lldp_start;
+	cmd = libie_aq_raw(&desc);
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_lldp_start);
 
@@ -677,11 +677,11 @@ ice_aq_start_stop_dcbx(struct ice_hw *hw, bool start_dcbx_agent,
 		       bool *dcbx_agent_status, struct ice_sq_cd *cd)
 {
 	struct ice_aqc_lldp_stop_start_specific_agent *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	u16 opcode;
 	int status;
 
-	cmd = &desc.params.lldp_agent_ctrl;
+	cmd = libie_aq_raw(&desc);
 
 	opcode = ice_aqc_opc_lldp_stop_start_specific_agent;
 
@@ -714,7 +714,7 @@ ice_aq_get_cee_dcb_cfg(struct ice_hw *hw,
 		       struct ice_aqc_get_cee_dcb_cfg_resp *buff,
 		       struct ice_sq_cd *cd)
 {
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_cee_dcb_cfg);
 
@@ -733,13 +733,13 @@ ice_aq_get_cee_dcb_cfg(struct ice_hw *hw,
 int ice_aq_set_pfc_mode(struct ice_hw *hw, u8 pfc_mode, struct ice_sq_cd *cd)
 {
 	struct ice_aqc_set_query_pfc_mode *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	int status;
 
 	if (pfc_mode > ICE_AQC_PFC_DSCP_BASED_PFC)
 		return -EINVAL;
 
-	cmd = &desc.params.set_query_pfc_mode;
+	cmd = libie_aq_raw(&desc);
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_pfc_mode);
 
@@ -914,7 +914,7 @@ static int ice_get_ieee_or_cee_dcb_cfg(struct ice_port_info *pi, u8 dcbx_mode)
 	ret = ice_aq_get_dcb_cfg(pi->hw, ICE_AQ_LLDP_MIB_REMOTE,
 				 ICE_AQ_LLDP_BRID_TYPE_NEAREST_BRID, dcbx_cfg);
 	/* Don't treat ENOENT as an error for Remote MIBs */
-	if (pi->hw->adminq.sq_last_status == ICE_AQ_RC_ENOENT)
+	if (pi->hw->adminq.sq_last_status == LIBIE_AQ_RC_ENOENT)
 		ret = 0;
 
 out:
@@ -941,7 +941,7 @@ int ice_get_dcb_cfg(struct ice_port_info *pi)
 		/* CEE mode */
 		ret = ice_get_ieee_or_cee_dcb_cfg(pi, ICE_DCBX_MODE_CEE);
 		ice_cee_to_dcb_cfg(&cee_cfg, pi);
-	} else if (pi->hw->adminq.sq_last_status == ICE_AQ_RC_ENOENT) {
+	} else if (pi->hw->adminq.sq_last_status == LIBIE_AQ_RC_ENOENT) {
 		/* CEE mode not enabled try querying IEEE data */
 		dcbx_cfg = &pi->qos_cfg.local_dcbx_cfg;
 		dcbx_cfg->dcbx_mode = ICE_DCBX_MODE_IEEE;
@@ -965,7 +965,7 @@ void ice_get_dcb_cfg_from_mib_change(struct ice_port_info *pi,
 	struct ice_aqc_lldp_get_mib *mib;
 	u8 change_type, dcbx_mode;
 
-	mib = (struct ice_aqc_lldp_get_mib *)&event->desc.params.raw;
+	mib = libie_aq_raw(&event->desc);
 
 	change_type = FIELD_GET(ICE_AQ_LLDP_MIB_TYPE_M, mib->type);
 	if (change_type == ICE_AQ_LLDP_MIB_REMOTE)
@@ -1537,12 +1537,12 @@ ice_aq_query_port_ets(struct ice_port_info *pi,
 		      struct ice_sq_cd *cd)
 {
 	struct ice_aqc_query_port_ets *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	int status;
 
 	if (!pi)
 		return -EINVAL;
-	cmd = &desc.params.port_ets;
+	cmd = libie_aq_raw(&desc);
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_query_port_ets);
 	cmd->port_teid = pi->root->info.node_teid;
 
diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c
index 533eb8930aa8..9fc8681cc58e 100644
--- a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c
@@ -1020,7 +1020,7 @@ ice_dcb_process_lldp_set_mib_change(struct ice_pf *pf,
 	}
 
 	pi = pf->hw.port_info;
-	mib = (struct ice_aqc_lldp_get_mib *)&event->desc.params.raw;
+	mib = libie_aq_raw(&event->desc);
 
 	/* Ignore if event is not for Nearest Bridge */
 	mib_type = FIELD_GET(ICE_AQ_LLDP_BRID_TYPE_M, mib->type);
diff --git a/drivers/net/ethernet/intel/ice/ice_ddp.c b/drivers/net/ethernet/intel/ice/ice_ddp.c
index 59323c019544..e7ae220158b5 100644
--- a/drivers/net/ethernet/intel/ice/ice_ddp.c
+++ b/drivers/net/ethernet/intel/ice/ice_ddp.c
@@ -1101,16 +1101,16 @@ struct ice_buf *ice_pkg_buf(struct ice_buf_build *bld)
 	return &bld->buf;
 }
 
-static enum ice_ddp_state ice_map_aq_err_to_ddp_state(enum ice_aq_err aq_err)
+static enum ice_ddp_state ice_map_aq_err_to_ddp_state(enum libie_aq_err aq_err)
 {
 	switch (aq_err) {
-	case ICE_AQ_RC_ENOSEC:
-	case ICE_AQ_RC_EBADSIG:
+	case LIBIE_AQ_RC_ENOSEC:
+	case LIBIE_AQ_RC_EBADSIG:
 		return ICE_DDP_PKG_FILE_SIGNATURE_INVALID;
-	case ICE_AQ_RC_ESVN:
+	case LIBIE_AQ_RC_ESVN:
 		return ICE_DDP_PKG_FILE_REVISION_TOO_LOW;
-	case ICE_AQ_RC_EBADMAN:
-	case ICE_AQ_RC_EBADBUF:
+	case LIBIE_AQ_RC_EBADMAN:
+	case LIBIE_AQ_RC_EBADBUF:
 		return ICE_DDP_PKG_LOAD_ERROR;
 	default:
 		return ICE_DDP_PKG_ERR;
@@ -1180,7 +1180,7 @@ ice_aq_download_pkg(struct ice_hw *hw, struct ice_buf_hdr *pkg_buf,
 		    u32 *error_info, struct ice_sq_cd *cd)
 {
 	struct ice_aqc_download_pkg *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	int status;
 
 	if (error_offset)
@@ -1188,9 +1188,9 @@ ice_aq_download_pkg(struct ice_hw *hw, struct ice_buf_hdr *pkg_buf,
 	if (error_info)
 		*error_info = 0;
 
-	cmd = &desc.params.download_pkg;
+	cmd = libie_aq_raw(&desc);
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_download_pkg);
-	desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
+	desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_RD);
 
 	if (last_buf)
 		cmd->flags |= ICE_AQC_DOWNLOAD_PKG_LAST_BUF;
@@ -1259,7 +1259,7 @@ static enum ice_ddp_state ice_ddp_send_hunk(struct ice_ddp_send_ctx *ctx,
 	struct ice_buf_hdr *prev_hunk = ctx->hdr;
 	struct ice_hw *hw = ctx->hw;
 	bool prev_was_last = !hunk;
-	enum ice_aq_err aq_err;
+	enum libie_aq_err aq_err;
 	u32 offset, info;
 	int attempt, err;
 
@@ -1278,7 +1278,8 @@ static enum ice_ddp_state ice_ddp_send_hunk(struct ice_ddp_send_ctx *ctx,
 					  prev_was_last, &offset, &info, NULL);
 
 		aq_err = hw->adminq.sq_last_status;
-		if (aq_err != ICE_AQ_RC_ENOSEC && aq_err != ICE_AQ_RC_EBADSIG)
+		if (aq_err != LIBIE_AQ_RC_ENOSEC &&
+		    aq_err != LIBIE_AQ_RC_EBADSIG)
 			break;
 	}
 
@@ -1537,7 +1538,7 @@ ice_post_dwnld_pkg_actions(struct ice_hw *hw)
 static enum ice_ddp_state
 ice_download_pkg_with_sig_seg(struct ice_hw *hw, struct ice_pkg_hdr *pkg_hdr)
 {
-	enum ice_aq_err aq_err = hw->adminq.sq_last_status;
+	enum libie_aq_err aq_err = hw->adminq.sq_last_status;
 	enum ice_ddp_state state = ICE_DDP_PKG_ERR;
 	struct ice_ddp_send_ctx ctx = { .hw = hw };
 	int status;
@@ -1687,7 +1688,7 @@ static int ice_aq_get_pkg_info_list(struct ice_hw *hw,
 				    struct ice_aqc_get_pkg_info_resp *pkg_info,
 				    u16 buf_size, struct ice_sq_cd *cd)
 {
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_pkg_info_list);
 
@@ -1711,7 +1712,7 @@ static int ice_aq_update_pkg(struct ice_hw *hw, struct ice_buf_hdr *pkg_buf,
 			     u32 *error_info, struct ice_sq_cd *cd)
 {
 	struct ice_aqc_download_pkg *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	int status;
 
 	if (error_offset)
@@ -1719,9 +1720,9 @@ static int ice_aq_update_pkg(struct ice_hw *hw, struct ice_buf_hdr *pkg_buf,
 	if (error_info)
 		*error_info = 0;
 
-	cmd = &desc.params.download_pkg;
+	cmd = libie_aq_raw(&desc);
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_update_pkg);
-	desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
+	desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_RD);
 
 	if (last_buf)
 		cmd->flags |= ICE_AQC_DOWNLOAD_PKG_LAST_BUF;
@@ -1753,10 +1754,10 @@ static int ice_aq_update_pkg(struct ice_hw *hw, struct ice_buf_hdr *pkg_buf,
 int ice_aq_upload_section(struct ice_hw *hw, struct ice_buf_hdr *pkg_buf,
 			  u16 buf_size, struct ice_sq_cd *cd)
 {
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_upload_section);
-	desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
+	desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_RD);
 
 	return ice_aq_send_cmd(hw, &desc, pkg_buf, buf_size, cd);
 }
@@ -2333,10 +2334,10 @@ ice_get_set_tx_topo(struct ice_hw *hw, u8 *buf, u16 buf_size,
 		    struct ice_sq_cd *cd, u8 *flags, bool set)
 {
 	struct ice_aqc_get_set_tx_topo *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	int status;
 
-	cmd = &desc.params.get_set_tx_topo;
+	cmd = libie_aq_raw(&desc);
 	if (set) {
 		ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_tx_topo);
 		cmd->set_flags = ICE_AQC_TX_TOPO_FLAGS_ISSUED;
@@ -2345,14 +2346,14 @@ ice_get_set_tx_topo(struct ice_hw *hw, u8 *buf, u16 buf_size,
 			cmd->set_flags |= ICE_AQC_TX_TOPO_FLAGS_SRC_RAM |
 					  ICE_AQC_TX_TOPO_FLAGS_LOAD_NEW;
 
-		desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
+		desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_RD);
 	} else {
 		ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_tx_topo);
 		cmd->get_flags = ICE_AQC_TX_TOPO_GET_RAM;
 
 		if (hw->mac_type == ICE_MAC_E810 ||
 		    hw->mac_type == ICE_MAC_GENERIC)
-			desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
+			desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_RD);
 	}
 
 	status = ice_aq_send_cmd(hw, &desc, buf, buf_size, cd);
@@ -2360,7 +2361,7 @@ ice_get_set_tx_topo(struct ice_hw *hw, u8 *buf, u16 buf_size,
 		return status;
 	/* read the return flag values (first byte) for get operation */
 	if (!set && flags)
-		*flags = desc.params.get_set_tx_topo.set_flags;
+		*flags = cmd->set_flags;
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.c b/drivers/net/ethernet/intel/ice/ice_dpll.c
index 39743cdba986..093835d2c822 100644
--- a/drivers/net/ethernet/intel/ice/ice_dpll.c
+++ b/drivers/net/ethernet/intel/ice/ice_dpll.c
@@ -2509,7 +2509,7 @@ static bool ice_dpll_is_pps_phase_monitor(struct ice_pf *pf)
 	int ret = ice_aq_get_cgu_input_pin_measure(&pf->hw, DPLL_TYPE_PPS, meas,
 						   ARRAY_SIZE(meas));
 
-	if (ret && pf->hw.adminq.sq_last_status == ICE_AQ_RC_ESRCH)
+	if (ret && pf->hw.adminq.sq_last_status == LIBIE_AQ_RC_ESRCH)
 		return false;
 
 	return true;
@@ -2562,7 +2562,7 @@ static int ice_dpll_pps_update_phase_offsets(struct ice_pf *pf,
 	*phase_offset_pins_updated = 0;
 	ret = ice_aq_get_cgu_input_pin_measure(&pf->hw, DPLL_TYPE_PPS, meas,
 					       ARRAY_SIZE(meas));
-	if (ret && pf->hw.adminq.sq_last_status == ICE_AQ_RC_EAGAIN) {
+	if (ret && pf->hw.adminq.sq_last_status == LIBIE_AQ_RC_EAGAIN) {
 		return 0;
 	} else if (ret) {
 		dev_err(ice_pf_to_dev(pf),
diff --git a/drivers/net/ethernet/intel/ice/ice_fw_update.c b/drivers/net/ethernet/intel/ice/ice_fw_update.c
index 70c201f569ce..4d9ad92a44fe 100644
--- a/drivers/net/ethernet/intel/ice/ice_fw_update.c
+++ b/drivers/net/ethernet/intel/ice/ice_fw_update.c
@@ -299,7 +299,8 @@ int ice_write_one_nvm_block(struct ice_pf *pf, u16 module, u32 offset,
 	struct device *dev = ice_pf_to_dev(pf);
 	struct ice_aq_task task = {};
 	struct ice_hw *hw = &pf->hw;
-	struct ice_aq_desc *desc;
+	struct libie_aq_desc *desc;
+	struct ice_aqc_nvm *cmd;
 	u32 completion_offset;
 	int err;
 
@@ -333,11 +334,12 @@ int ice_write_one_nvm_block(struct ice_pf *pf, u16 module, u32 offset,
 	}
 
 	desc = &task.event.desc;
-	completion_module = le16_to_cpu(desc->params.nvm.module_typeid);
+	cmd = libie_aq_raw(desc);
+	completion_module = le16_to_cpu(cmd->module_typeid);
 	completion_retval = le16_to_cpu(desc->retval);
 
-	completion_offset = le16_to_cpu(desc->params.nvm.offset_low);
-	completion_offset |= desc->params.nvm.offset_high << 16;
+	completion_offset = le16_to_cpu(cmd->offset_low);
+	completion_offset |= cmd->offset_high << 16;
 
 	if (completion_module != module) {
 		dev_err(dev, "Unexpected module_typeid in write completion: got 0x%x, expected 0x%x\n",
@@ -356,7 +358,7 @@ int ice_write_one_nvm_block(struct ice_pf *pf, u16 module, u32 offset,
 	if (completion_retval) {
 		dev_err(dev, "Firmware failed to flash module 0x%02x with block of size %u at offset %u, err %s\n",
 			module, block_size, offset,
-			ice_aq_str((enum ice_aq_err)completion_retval));
+			ice_aq_str((enum libie_aq_err)completion_retval));
 		NL_SET_ERR_MSG_MOD(extack, "Firmware failed to program flash module");
 		return -EIO;
 	}
@@ -369,7 +371,7 @@ int ice_write_one_nvm_block(struct ice_pf *pf, u16 module, u32 offset,
 	 */
 	if (reset_level && last_cmd && module == ICE_SR_1ST_NVM_BANK_PTR) {
 		if (hw->dev_caps.common_cap.pcie_reset_avoidance) {
-			*reset_level = desc->params.nvm.cmd_flags &
+			*reset_level = cmd->cmd_flags &
 				       ICE_AQC_NVM_RESET_LVL_M;
 			dev_dbg(dev, "Firmware reported required reset level as %u\n",
 				*reset_level);
@@ -487,7 +489,8 @@ ice_erase_nvm_module(struct ice_pf *pf, u16 module, const char *component,
 	struct device *dev = ice_pf_to_dev(pf);
 	struct ice_aq_task task = {};
 	struct ice_hw *hw = &pf->hw;
-	struct ice_aq_desc *desc;
+	struct libie_aq_desc *desc;
+	struct ice_aqc_nvm *cmd;
 	struct devlink *devlink;
 	int err;
 
@@ -518,7 +521,8 @@ ice_erase_nvm_module(struct ice_pf *pf, u16 module, const char *component,
 	}
 
 	desc = &task.event.desc;
-	completion_module = le16_to_cpu(desc->params.nvm.module_typeid);
+	cmd = libie_aq_raw(desc);
+	completion_module = le16_to_cpu(cmd->module_typeid);
 	completion_retval = le16_to_cpu(desc->retval);
 
 	if (completion_module != module) {
@@ -532,7 +536,7 @@ ice_erase_nvm_module(struct ice_pf *pf, u16 module, const char *component,
 	if (completion_retval) {
 		dev_err(dev, "Firmware failed to erase %s (module 0x02%x), aq_err %s\n",
 			component, module,
-			ice_aq_str((enum ice_aq_err)completion_retval));
+			ice_aq_str((enum libie_aq_err)completion_retval));
 		NL_SET_ERR_MSG_MOD(extack, "Firmware failed to erase flash");
 		err = -EIO;
 		goto out_notify_devlink;
@@ -611,7 +615,7 @@ ice_switch_flash_banks(struct ice_pf *pf, u8 activate_flags,
 	completion_retval = le16_to_cpu(task.event.desc.retval);
 	if (completion_retval) {
 		dev_err(dev, "Firmware failed to switch active flash banks aq_err %s\n",
-			ice_aq_str((enum ice_aq_err)completion_retval));
+			ice_aq_str((enum libie_aq_err)completion_retval));
 		NL_SET_ERR_MSG_MOD(extack, "Firmware failed to switch active flash banks");
 		return -EIO;
 	}
diff --git a/drivers/net/ethernet/intel/ice/ice_fwlog.c b/drivers/net/ethernet/intel/ice/ice_fwlog.c
index 4fd15387a7e5..a31bb026ad34 100644
--- a/drivers/net/ethernet/intel/ice/ice_fwlog.c
+++ b/drivers/net/ethernet/intel/ice/ice_fwlog.c
@@ -240,7 +240,7 @@ ice_aq_fwlog_set(struct ice_hw *hw, struct ice_fwlog_module_entry *entries,
 {
 	struct ice_aqc_fw_log_cfg_resp *fw_modules;
 	struct ice_aqc_fw_log *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	int status;
 	int i;
 
@@ -255,9 +255,9 @@ ice_aq_fwlog_set(struct ice_hw *hw, struct ice_fwlog_module_entry *entries,
 	}
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_fw_logs_config);
-	desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
+	desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_RD);
 
-	cmd = &desc.params.fw_log;
+	cmd = libie_aq_raw(&desc);
 
 	cmd->cmd_flags = ICE_AQC_FW_LOG_CONF_SET_VALID;
 	cmd->ops.cfg.log_resolution = cpu_to_le16(log_resolution);
@@ -309,7 +309,7 @@ static int ice_aq_fwlog_get(struct ice_hw *hw, struct ice_fwlog_cfg *cfg)
 {
 	struct ice_aqc_fw_log_cfg_resp *fw_modules;
 	struct ice_aqc_fw_log *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	u16 module_id_cnt;
 	int status;
 	void *buf;
@@ -322,7 +322,7 @@ static int ice_aq_fwlog_get(struct ice_hw *hw, struct ice_fwlog_cfg *cfg)
 		return -ENOMEM;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_fw_logs_query);
-	cmd = &desc.params.fw_log;
+	cmd = libie_aq_raw(&desc);
 
 	cmd->cmd_flags = ICE_AQC_FW_LOG_AQ_QUERY;
 
@@ -384,12 +384,14 @@ int ice_fwlog_get(struct ice_hw *hw, struct ice_fwlog_cfg *cfg)
  */
 static int ice_aq_fwlog_register(struct ice_hw *hw, bool reg)
 {
-	struct ice_aq_desc desc;
+	struct ice_aqc_fw_log *cmd;
+	struct libie_aq_desc desc;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_fw_logs_register);
+	cmd = libie_aq_raw(&desc);
 
 	if (reg)
-		desc.params.fw_log.cmd_flags = ICE_AQC_FW_LOG_AQ_REGISTER;
+		cmd->cmd_flags = ICE_AQC_FW_LOG_AQ_REGISTER;
 
 	return ice_aq_send_cmd(hw, &desc, NULL, 0, NULL);
 }
diff --git a/drivers/net/ethernet/intel/ice/ice_lag.c b/drivers/net/ethernet/intel/ice/ice_lag.c
index c8b4fa3efbd4..b1129da72139 100644
--- a/drivers/net/ethernet/intel/ice/ice_lag.c
+++ b/drivers/net/ethernet/intel/ice/ice_lag.c
@@ -1144,7 +1144,7 @@ ice_lag_set_swid(u16 primary_swid, struct ice_lag *local_lag,
 {
 	struct ice_aqc_alloc_free_res_elem *buf;
 	struct ice_aqc_set_port_params *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	u16 buf_len, swid;
 	int status, i;
 
@@ -1192,7 +1192,7 @@ ice_lag_set_swid(u16 primary_swid, struct ice_lag *local_lag,
 	else
 		swid = local_lag->pf->hw.port_info->sw_id;
 
-	cmd = &desc.params.set_port_params;
+	cmd = libie_aq_raw(&desc);
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_port_params);
 
 	cmd->swid = cpu_to_le16(ICE_AQC_PORT_SWID_VALID | swid);
diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c
index 1be1e429a7c8..e563700d4ba1 100644
--- a/drivers/net/ethernet/intel/ice/ice_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_lib.c
@@ -3736,13 +3736,13 @@ int ice_set_link(struct ice_vsi *vsi, bool ena)
 
 	status = ice_aq_set_link_restart_an(pi, ena, NULL);
 
-	/* if link is owned by manageability, FW will return ICE_AQ_RC_EMODE.
+	/* if link is owned by manageability, FW will return LIBIE_AQ_RC_EMODE.
 	 * this is not a fatal error, so print a warning message and return
 	 * a success code. Return an error if FW returns an error code other
-	 * than ICE_AQ_RC_EMODE
+	 * than LIBIE_AQ_RC_EMODE
 	 */
 	if (status == -EIO) {
-		if (hw->adminq.sq_last_status == ICE_AQ_RC_EMODE)
+		if (hw->adminq.sq_last_status == LIBIE_AQ_RC_EMODE)
 			dev_dbg(dev, "can't set link to %s, err %d aq_err %s. not fatal, continuing\n",
 				(ena ? "ON" : "OFF"), status,
 				ice_aq_str(hw->adminq.sq_last_status));
diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index 204e906af591..3024f5dde384 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -379,7 +379,7 @@ static int ice_vsi_sync_fltr(struct ice_vsi *vsi)
 		 * should go into promiscuous mode. There should be some
 		 * space reserved for promiscuous filters.
 		 */
-		if (hw->adminq.sq_last_status == ICE_AQ_RC_ENOSPC &&
+		if (hw->adminq.sq_last_status == LIBIE_AQ_RC_ENOSPC &&
 		    !test_and_set_bit(ICE_FLTR_OVERFLOW_PROMISC,
 				      vsi->state)) {
 			promisc_forced_on = true;
@@ -7914,42 +7914,42 @@ int ice_change_mtu(struct net_device *netdev, int new_mtu)
  * ice_aq_str - convert AQ err code to a string
  * @aq_err: the AQ error code to convert
  */
-const char *ice_aq_str(enum ice_aq_err aq_err)
+const char *ice_aq_str(enum libie_aq_err aq_err)
 {
 	switch (aq_err) {
-	case ICE_AQ_RC_OK:
+	case LIBIE_AQ_RC_OK:
 		return "OK";
-	case ICE_AQ_RC_EPERM:
+	case LIBIE_AQ_RC_EPERM:
 		return "ICE_AQ_RC_EPERM";
-	case ICE_AQ_RC_ENOENT:
+	case LIBIE_AQ_RC_ENOENT:
 		return "ICE_AQ_RC_ENOENT";
-	case ICE_AQ_RC_ESRCH:
+	case LIBIE_AQ_RC_ESRCH:
 		return "ICE_AQ_RC_ESRCH";
-	case ICE_AQ_RC_EAGAIN:
+	case LIBIE_AQ_RC_EAGAIN:
 		return "ICE_AQ_RC_EAGAIN";
-	case ICE_AQ_RC_ENOMEM:
+	case LIBIE_AQ_RC_ENOMEM:
 		return "ICE_AQ_RC_ENOMEM";
-	case ICE_AQ_RC_EBUSY:
+	case LIBIE_AQ_RC_EBUSY:
 		return "ICE_AQ_RC_EBUSY";
-	case ICE_AQ_RC_EEXIST:
+	case LIBIE_AQ_RC_EEXIST:
 		return "ICE_AQ_RC_EEXIST";
-	case ICE_AQ_RC_EINVAL:
+	case LIBIE_AQ_RC_EINVAL:
 		return "ICE_AQ_RC_EINVAL";
-	case ICE_AQ_RC_ENOSPC:
+	case LIBIE_AQ_RC_ENOSPC:
 		return "ICE_AQ_RC_ENOSPC";
-	case ICE_AQ_RC_ENOSYS:
+	case LIBIE_AQ_RC_ENOSYS:
 		return "ICE_AQ_RC_ENOSYS";
-	case ICE_AQ_RC_EMODE:
+	case LIBIE_AQ_RC_EMODE:
 		return "ICE_AQ_RC_EMODE";
-	case ICE_AQ_RC_ENOSEC:
+	case LIBIE_AQ_RC_ENOSEC:
 		return "ICE_AQ_RC_ENOSEC";
-	case ICE_AQ_RC_EBADSIG:
+	case LIBIE_AQ_RC_EBADSIG:
 		return "ICE_AQ_RC_EBADSIG";
-	case ICE_AQ_RC_ESVN:
+	case LIBIE_AQ_RC_ESVN:
 		return "ICE_AQ_RC_ESVN";
-	case ICE_AQ_RC_EBADMAN:
+	case LIBIE_AQ_RC_EBADMAN:
 		return "ICE_AQ_RC_EBADMAN";
-	case ICE_AQ_RC_EBADBUF:
+	case LIBIE_AQ_RC_EBADBUF:
 		return "ICE_AQ_RC_EBADBUF";
 	}
 
diff --git a/drivers/net/ethernet/intel/ice/ice_nvm.c b/drivers/net/ethernet/intel/ice/ice_nvm.c
index 59e8879ac059..7e187a804dfa 100644
--- a/drivers/net/ethernet/intel/ice/ice_nvm.c
+++ b/drivers/net/ethernet/intel/ice/ice_nvm.c
@@ -22,10 +22,10 @@ int ice_aq_read_nvm(struct ice_hw *hw, u16 module_typeid, u32 offset,
 		    u16 length, void *data, bool last_command,
 		    bool read_shadow_ram, struct ice_sq_cd *cd)
 {
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	struct ice_aqc_nvm *cmd;
 
-	cmd = &desc.params.nvm;
+	cmd = libie_aq_raw(&desc);
 
 	if (offset > ICE_AQC_NVM_MAX_OFFSET)
 		return -EINVAL;
@@ -125,10 +125,10 @@ ice_aq_update_nvm(struct ice_hw *hw, u16 module_typeid, u32 offset,
 		  u16 length, void *data, bool last_command, u8 command_flags,
 		  struct ice_sq_cd *cd)
 {
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	struct ice_aqc_nvm *cmd;
 
-	cmd = &desc.params.nvm;
+	cmd = libie_aq_raw(&desc);
 
 	/* In offset the highest byte must be zeroed. */
 	if (offset & 0xFF000000)
@@ -146,7 +146,7 @@ ice_aq_update_nvm(struct ice_hw *hw, u16 module_typeid, u32 offset,
 	cmd->offset_high = (offset >> 16) & 0xFF;
 	cmd->length = cpu_to_le16(length);
 
-	desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
+	desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_RD);
 
 	return ice_aq_send_cmd(hw, &desc, data, length, cd);
 }
@@ -161,10 +161,10 @@ ice_aq_update_nvm(struct ice_hw *hw, u16 module_typeid, u32 offset,
  */
 int ice_aq_erase_nvm(struct ice_hw *hw, u16 module_typeid, struct ice_sq_cd *cd)
 {
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	struct ice_aqc_nvm *cmd;
 
-	cmd = &desc.params.nvm;
+	cmd = libie_aq_raw(&desc);
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_nvm_erase);
 
@@ -869,7 +869,7 @@ static int ice_discover_flash_size(struct ice_hw *hw)
 
 		status = ice_read_flat_nvm(hw, offset, &len, &data, false);
 		if (status == -EIO &&
-		    hw->adminq.sq_last_status == ICE_AQ_RC_EINVAL) {
+		    hw->adminq.sq_last_status == LIBIE_AQ_RC_EINVAL) {
 			ice_debug(hw, ICE_DBG_NVM, "%s: New upper bound of %u bytes\n",
 				  __func__, offset);
 			status = 0;
@@ -1182,14 +1182,14 @@ int ice_init_nvm(struct ice_hw *hw)
 int ice_nvm_validate_checksum(struct ice_hw *hw)
 {
 	struct ice_aqc_nvm_checksum *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	int status;
 
 	status = ice_acquire_nvm(hw, ICE_RES_READ);
 	if (status)
 		return status;
 
-	cmd = &desc.params.nvm_checksum;
+	cmd = libie_aq_raw(&desc);
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_nvm_checksum);
 	cmd->flags = ICE_AQC_NVM_CHECKSUM_VERIFY;
@@ -1226,11 +1226,11 @@ int ice_nvm_validate_checksum(struct ice_hw *hw)
  */
 int ice_nvm_write_activate(struct ice_hw *hw, u16 cmd_flags, u8 *response_flags)
 {
+	struct libie_aq_desc desc;
 	struct ice_aqc_nvm *cmd;
-	struct ice_aq_desc desc;
 	int err;
 
-	cmd = &desc.params.nvm;
+	cmd = libie_aq_raw(&desc);
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_nvm_write_activate);
 
 	cmd->cmd_flags = (u8)(cmd_flags & 0xFF);
@@ -1252,7 +1252,7 @@ int ice_nvm_write_activate(struct ice_hw *hw, u16 cmd_flags, u8 *response_flags)
  */
 int ice_aq_nvm_update_empr(struct ice_hw *hw)
 {
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_nvm_update_empr);
 
@@ -1278,15 +1278,15 @@ ice_nvm_set_pkg_data(struct ice_hw *hw, bool del_pkg_data_flag, u8 *data,
 		     u16 length, struct ice_sq_cd *cd)
 {
 	struct ice_aqc_nvm_pkg_data *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 
 	if (length != 0 && !data)
 		return -EINVAL;
 
-	cmd = &desc.params.pkg_data;
+	cmd = libie_aq_raw(&desc);
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_nvm_pkg_data);
-	desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
+	desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_RD);
 
 	if (del_pkg_data_flag)
 		cmd->cmd_flags |= ICE_AQC_NVM_PKG_DELETE;
@@ -1316,17 +1316,17 @@ ice_nvm_pass_component_tbl(struct ice_hw *hw, u8 *data, u16 length,
 			   u8 *comp_response_code, struct ice_sq_cd *cd)
 {
 	struct ice_aqc_nvm_pass_comp_tbl *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	int status;
 
 	if (!data || !comp_response || !comp_response_code)
 		return -EINVAL;
 
-	cmd = &desc.params.pass_comp_tbl;
+	cmd = libie_aq_raw(&desc);
 
 	ice_fill_dflt_direct_cmd_desc(&desc,
 				      ice_aqc_opc_nvm_pass_component_tbl);
-	desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
+	desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_RD);
 
 	cmd->transfer_flag = transfer_flag;
 	status = ice_aq_send_cmd(hw, &desc, data, length, cd);
diff --git a/drivers/net/ethernet/intel/ice/ice_ptp_hw.c b/drivers/net/ethernet/intel/ice/ice_ptp_hw.c
index e8e439fd64a4..523f95271f35 100644
--- a/drivers/net/ethernet/intel/ice/ice_ptp_hw.c
+++ b/drivers/net/ethernet/intel/ice/ice_ptp_hw.c
@@ -424,7 +424,7 @@ static int ice_write_phy_eth56g(struct ice_hw *hw, u8 port, u32 addr, u32 val)
 	};
 	int err;
 
-	err = ice_sbq_rw_reg(hw, &msg, ICE_AQ_FLAG_RD);
+	err = ice_sbq_rw_reg(hw, &msg, LIBIE_AQ_FLAG_RD);
 	if (err)
 		ice_debug(hw, ICE_DBG_PTP, "PTP failed to send msg to phy %d\n",
 			  err);
@@ -451,7 +451,7 @@ static int ice_read_phy_eth56g(struct ice_hw *hw, u8 port, u32 addr, u32 *val)
 	};
 	int err;
 
-	err = ice_sbq_rw_reg(hw, &msg, ICE_AQ_FLAG_RD);
+	err = ice_sbq_rw_reg(hw, &msg, LIBIE_AQ_FLAG_RD);
 	if (err)
 		ice_debug(hw, ICE_DBG_PTP, "PTP failed to send msg to phy %d\n",
 			  err);
@@ -2348,7 +2348,7 @@ ice_read_phy_reg_e82x(struct ice_hw *hw, u8 port, u16 offset, u32 *val)
 	ice_fill_phy_msg_e82x(hw, &msg, port, offset);
 	msg.opcode = ice_sbq_msg_rd;
 
-	err = ice_sbq_rw_reg(hw, &msg, ICE_AQ_FLAG_RD);
+	err = ice_sbq_rw_reg(hw, &msg, LIBIE_AQ_FLAG_RD);
 	if (err) {
 		ice_debug(hw, ICE_DBG_PTP, "Failed to send message to PHY, err %d\n",
 			  err);
@@ -2426,7 +2426,7 @@ ice_write_phy_reg_e82x(struct ice_hw *hw, u8 port, u16 offset, u32 val)
 	msg.opcode = ice_sbq_msg_wr;
 	msg.data = val;
 
-	err = ice_sbq_rw_reg(hw, &msg, ICE_AQ_FLAG_RD);
+	err = ice_sbq_rw_reg(hw, &msg, LIBIE_AQ_FLAG_RD);
 	if (err) {
 		ice_debug(hw, ICE_DBG_PTP, "Failed to send message to PHY, err %d\n",
 			  err);
@@ -2587,7 +2587,7 @@ ice_read_quad_reg_e82x(struct ice_hw *hw, u8 quad, u16 offset, u32 *val)
 
 	msg.opcode = ice_sbq_msg_rd;
 
-	err = ice_sbq_rw_reg(hw, &msg, ICE_AQ_FLAG_RD);
+	err = ice_sbq_rw_reg(hw, &msg, LIBIE_AQ_FLAG_RD);
 	if (err) {
 		ice_debug(hw, ICE_DBG_PTP, "Failed to send message to PHY, err %d\n",
 			  err);
@@ -2622,7 +2622,7 @@ ice_write_quad_reg_e82x(struct ice_hw *hw, u8 quad, u16 offset, u32 val)
 	msg.opcode = ice_sbq_msg_wr;
 	msg.data = val;
 
-	err = ice_sbq_rw_reg(hw, &msg, ICE_AQ_FLAG_RD);
+	err = ice_sbq_rw_reg(hw, &msg, LIBIE_AQ_FLAG_RD);
 	if (err) {
 		ice_debug(hw, ICE_DBG_PTP, "Failed to send message to PHY, err %d\n",
 			  err);
@@ -4267,7 +4267,7 @@ static int ice_read_phy_reg_e810(struct ice_hw *hw, u32 addr, u32 *val)
 	msg.opcode = ice_sbq_msg_rd;
 	msg.dest_dev = ice_sbq_dev_phy_0;
 
-	err = ice_sbq_rw_reg(hw, &msg, ICE_AQ_FLAG_RD);
+	err = ice_sbq_rw_reg(hw, &msg, LIBIE_AQ_FLAG_RD);
 	if (err) {
 		ice_debug(hw, ICE_DBG_PTP, "Failed to send message to PHY, err %d\n",
 			  err);
@@ -4298,7 +4298,7 @@ static int ice_write_phy_reg_e810(struct ice_hw *hw, u32 addr, u32 val)
 	msg.dest_dev = ice_sbq_dev_phy_0;
 	msg.data = val;
 
-	err = ice_sbq_rw_reg(hw, &msg, ICE_AQ_FLAG_RD);
+	err = ice_sbq_rw_reg(hw, &msg, LIBIE_AQ_FLAG_RD);
 	if (err) {
 		ice_debug(hw, ICE_DBG_PTP, "Failed to send message to PHY, err %d\n",
 			  err);
diff --git a/drivers/net/ethernet/intel/ice/ice_sched.c b/drivers/net/ethernet/intel/ice/ice_sched.c
index d9d09296d1d4..fff0c1afdb41 100644
--- a/drivers/net/ethernet/intel/ice/ice_sched.c
+++ b/drivers/net/ethernet/intel/ice/ice_sched.c
@@ -123,13 +123,13 @@ ice_aqc_send_sched_elem_cmd(struct ice_hw *hw, enum ice_adminq_opc cmd_opc,
 			    u16 *elems_resp, struct ice_sq_cd *cd)
 {
 	struct ice_aqc_sched_elem_cmd *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	int status;
 
-	cmd = &desc.params.sched_elem_cmd;
+	cmd = libie_aq_raw(&desc);
 	ice_fill_dflt_direct_cmd_desc(&desc, cmd_opc);
 	cmd->num_elem_req = cpu_to_le16(elems_req);
-	desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
+	desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_RD);
 	status = ice_aq_send_cmd(hw, &desc, buf, buf_size, cd);
 	if (!status && elems_resp)
 		*elems_resp = le16_to_cpu(cmd->num_elem_resp);
@@ -392,10 +392,10 @@ ice_aq_get_dflt_topo(struct ice_hw *hw, u8 lport,
 		     u8 *num_branches, struct ice_sq_cd *cd)
 {
 	struct ice_aqc_get_topo *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	int status;
 
-	cmd = &desc.params.get_topo;
+	cmd = libie_aq_raw(&desc);
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_dflt_topo);
 	cmd->port_num = lport;
 	status = ice_aq_send_cmd(hw, &desc, buf, buf_size, cd);
@@ -518,7 +518,7 @@ ice_aq_query_sched_res(struct ice_hw *hw, u16 buf_size,
 		       struct ice_aqc_query_txsched_res_resp *buf,
 		       struct ice_sq_cd *cd)
 {
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_query_sched_res);
 	return ice_aq_send_cmd(hw, &desc, buf, buf_size, cd);
@@ -683,13 +683,13 @@ ice_aq_rl_profile(struct ice_hw *hw, enum ice_adminq_opc opcode,
 		  u16 buf_size, u16 *num_processed, struct ice_sq_cd *cd)
 {
 	struct ice_aqc_rl_profile *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	int status;
 
-	cmd = &desc.params.rl_profile;
+	cmd = libie_aq_raw(&desc);
 
 	ice_fill_dflt_direct_cmd_desc(&desc, opcode);
-	desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
+	desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_RD);
 	cmd->num_profiles = cpu_to_le16(num_profiles);
 	status = ice_aq_send_cmd(hw, &desc, buf, buf_size, cd);
 	if (!status && num_processed)
diff --git a/drivers/net/ethernet/intel/ice/ice_sriov.c b/drivers/net/ethernet/intel/ice/ice_sriov.c
index c434326a4694..9ce4c4db400e 100644
--- a/drivers/net/ethernet/intel/ice/ice_sriov.c
+++ b/drivers/net/ethernet/intel/ice/ice_sriov.c
@@ -1161,10 +1161,12 @@ static u32 ice_globalq_to_pfq(struct ice_pf *pf, u32 globalq)
 void
 ice_vf_lan_overflow_event(struct ice_pf *pf, struct ice_rq_event_info *event)
 {
+	struct ice_aqc_event_lan_overflow *cmd;
 	u32 gldcb_rtctq, queue;
 	struct ice_vf *vf;
 
-	gldcb_rtctq = le32_to_cpu(event->desc.params.lan_overflow.prtdcb_ruptq);
+	cmd = libie_aq_raw(&event->desc);
+	gldcb_rtctq = le32_to_cpu(cmd->prtdcb_ruptq);
 	dev_dbg(ice_pf_to_dev(pf), "GLDCB_RTCTQ: 0x%08x\n", gldcb_rtctq);
 
 	/* event returns device global Rx queue number */
diff --git a/drivers/net/ethernet/intel/ice/ice_switch.c b/drivers/net/ethernet/intel/ice/ice_switch.c
index 9d9a7edd3618..84848f0123e7 100644
--- a/drivers/net/ethernet/intel/ice/ice_switch.c
+++ b/drivers/net/ethernet/intel/ice/ice_switch.c
@@ -1511,11 +1511,11 @@ ice_aq_get_sw_cfg(struct ice_hw *hw, struct ice_aqc_get_sw_cfg_resp_elem *buf,
 		  struct ice_sq_cd *cd)
 {
 	struct ice_aqc_get_sw_cfg *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	int status;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_sw_cfg);
-	cmd = &desc.params.get_sw_conf;
+	cmd = libie_aq_raw(&desc);
 	cmd->element = cpu_to_le16(*req_desc);
 
 	status = ice_aq_send_cmd(hw, &desc, buf, buf_size, cd);
@@ -1541,11 +1541,11 @@ ice_aq_add_vsi(struct ice_hw *hw, struct ice_vsi_ctx *vsi_ctx,
 {
 	struct ice_aqc_add_update_free_vsi_resp *res;
 	struct ice_aqc_add_get_update_free_vsi *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	int status;
 
-	cmd = &desc.params.vsi_cmd;
-	res = &desc.params.add_update_free_vsi_res;
+	cmd = libie_aq_raw(&desc);
+	res = libie_aq_raw(&desc);
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_add_vsi);
 
@@ -1556,7 +1556,7 @@ ice_aq_add_vsi(struct ice_hw *hw, struct ice_vsi_ctx *vsi_ctx,
 
 	cmd->vsi_flags = cpu_to_le16(vsi_ctx->flags);
 
-	desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
+	desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_RD);
 
 	status = ice_aq_send_cmd(hw, &desc, &vsi_ctx->info,
 				 sizeof(vsi_ctx->info), cd);
@@ -1585,11 +1585,11 @@ ice_aq_free_vsi(struct ice_hw *hw, struct ice_vsi_ctx *vsi_ctx,
 {
 	struct ice_aqc_add_update_free_vsi_resp *resp;
 	struct ice_aqc_add_get_update_free_vsi *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	int status;
 
-	cmd = &desc.params.vsi_cmd;
-	resp = &desc.params.add_update_free_vsi_res;
+	cmd = libie_aq_raw(&desc);
+	resp = libie_aq_raw(&desc);
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_free_vsi);
 
@@ -1620,17 +1620,17 @@ ice_aq_update_vsi(struct ice_hw *hw, struct ice_vsi_ctx *vsi_ctx,
 {
 	struct ice_aqc_add_update_free_vsi_resp *resp;
 	struct ice_aqc_add_get_update_free_vsi *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	int status;
 
-	cmd = &desc.params.vsi_cmd;
-	resp = &desc.params.add_update_free_vsi_res;
+	cmd = libie_aq_raw(&desc);
+	resp = libie_aq_raw(&desc);
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_update_vsi);
 
 	cmd->vsi_num = cpu_to_le16(vsi_ctx->vsi_num | ICE_AQ_VSI_IS_VALID);
 
-	desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
+	desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_RD);
 
 	status = ice_aq_send_cmd(hw, &desc, &vsi_ctx->info,
 				 sizeof(vsi_ctx->info), cd);
@@ -1944,7 +1944,8 @@ int
 ice_aq_sw_rules(struct ice_hw *hw, void *rule_list, u16 rule_list_sz,
 		u8 num_rules, enum ice_adminq_opc opc, struct ice_sq_cd *cd)
 {
-	struct ice_aq_desc desc;
+	struct ice_aqc_sw_rules *cmd;
+	struct libie_aq_desc desc;
 	int status;
 
 	if (opc != ice_aqc_opc_add_sw_rules &&
@@ -1953,13 +1954,13 @@ ice_aq_sw_rules(struct ice_hw *hw, void *rule_list, u16 rule_list_sz,
 		return -EINVAL;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, opc);
+	cmd = libie_aq_raw(&desc);
 
-	desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
-	desc.params.sw_rules.num_rules_fltr_entry_index =
-		cpu_to_le16(num_rules);
+	desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_RD);
+	cmd->num_rules_fltr_entry_index = cpu_to_le16(num_rules);
 	status = ice_aq_send_cmd(hw, &desc, rule_list, rule_list_sz, cd);
 	if (opc != ice_aqc_opc_add_sw_rules &&
-	    hw->adminq.sq_last_status == ICE_AQ_RC_ENOENT)
+	    hw->adminq.sq_last_status == LIBIE_AQ_RC_ENOENT)
 		status = -ENOENT;
 
 	if (!status) {
@@ -1989,14 +1990,14 @@ ice_aq_add_recipe(struct ice_hw *hw,
 		  u16 num_recipes, struct ice_sq_cd *cd)
 {
 	struct ice_aqc_add_get_recipe *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	u16 buf_size;
 
-	cmd = &desc.params.add_get_recipe;
+	cmd = libie_aq_raw(&desc);
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_add_recipe);
 
 	cmd->num_sub_recipes = cpu_to_le16(num_recipes);
-	desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
+	desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_RD);
 
 	buf_size = num_recipes * sizeof(*s_recipe_list);
 
@@ -2026,14 +2027,14 @@ ice_aq_get_recipe(struct ice_hw *hw,
 		  u16 *num_recipes, u16 recipe_root, struct ice_sq_cd *cd)
 {
 	struct ice_aqc_add_get_recipe *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	u16 buf_size;
 	int status;
 
 	if (*num_recipes != ICE_MAX_NUM_RECIPES)
 		return -EINVAL;
 
-	cmd = &desc.params.add_get_recipe;
+	cmd = libie_aq_raw(&desc);
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_recipe);
 
 	cmd->return_index = cpu_to_le16(recipe_root);
@@ -2118,9 +2119,9 @@ ice_aq_map_recipe_to_profile(struct ice_hw *hw, u32 profile_id, u64 r_assoc,
 			     struct ice_sq_cd *cd)
 {
 	struct ice_aqc_recipe_to_profile *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 
-	cmd = &desc.params.recipe_to_profile;
+	cmd = libie_aq_raw(&desc);
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_recipe_to_profile);
 	cmd->profile_id = cpu_to_le16(profile_id);
 	/* Set the recipe ID bit in the bitmask to let the device know which
@@ -2144,10 +2145,10 @@ ice_aq_get_recipe_to_profile(struct ice_hw *hw, u32 profile_id, u64 *r_assoc,
 			     struct ice_sq_cd *cd)
 {
 	struct ice_aqc_recipe_to_profile *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 	int status;
 
-	cmd = &desc.params.recipe_to_profile;
+	cmd = libie_aq_raw(&desc);
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_recipe_to_profile);
 	cmd->profile_id = cpu_to_le16(profile_id);
 
diff --git a/drivers/net/ethernet/intel/ice/ice_vf_mbx.c b/drivers/net/ethernet/intel/ice/ice_vf_mbx.c
index 75c8113e58ee..7798a5d4bc9d 100644
--- a/drivers/net/ethernet/intel/ice/ice_vf_mbx.c
+++ b/drivers/net/ethernet/intel/ice/ice_vf_mbx.c
@@ -23,18 +23,18 @@ ice_aq_send_msg_to_vf(struct ice_hw *hw, u16 vfid, u32 v_opcode, u32 v_retval,
 		      u8 *msg, u16 msglen, struct ice_sq_cd *cd)
 {
 	struct ice_aqc_pf_vf_msg *cmd;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 
 	ice_fill_dflt_direct_cmd_desc(&desc, ice_mbx_opc_send_msg_to_vf);
 
-	cmd = &desc.params.virt;
+	cmd = libie_aq_raw(&desc);
 	cmd->id = cpu_to_le32(vfid);
 
 	desc.cookie_high = cpu_to_le32(v_opcode);
 	desc.cookie_low = cpu_to_le32(v_retval);
 
 	if (msglen)
-		desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
+		desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_RD);
 
 	return ice_sq_send_cmd(hw, &hw->mailboxq, &desc, msg, msglen, cd);
 }
diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl.c b/drivers/net/ethernet/intel/ice/ice_virtchnl.c
index 05511157c571..faec052cf469 100644
--- a/drivers/net/ethernet/intel/ice/ice_virtchnl.c
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl.c
@@ -304,7 +304,7 @@ ice_vc_send_msg_to_vf(struct ice_vf *vf, u32 v_opcode,
 
 	aq_ret = ice_aq_send_msg_to_vf(&pf->hw, vf->vf_id, v_opcode, v_retval,
 				       msg, msglen, NULL);
-	if (aq_ret && pf->hw.mailboxq.sq_last_status != ICE_AQ_RC_ENOSYS) {
+	if (aq_ret && pf->hw.mailboxq.sq_last_status != LIBIE_AQ_RC_ENOSYS) {
 		dev_info(dev, "Unable to send the message to VF %d ret %d aq_err %s\n",
 			 vf->vf_id, aq_ret,
 			 ice_aq_str(pf->hw.mailboxq.sq_last_status));
diff --git a/drivers/net/ethernet/intel/ice/ice_vlan_mode.c b/drivers/net/ethernet/intel/ice/ice_vlan_mode.c
index 1279c1ffe31c..fb526cb84776 100644
--- a/drivers/net/ethernet/intel/ice/ice_vlan_mode.c
+++ b/drivers/net/ethernet/intel/ice/ice_vlan_mode.c
@@ -63,7 +63,7 @@ static int
 ice_aq_get_vlan_mode(struct ice_hw *hw,
 		     struct ice_aqc_get_vlan_mode *get_params)
 {
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 
 	if (!get_params)
 		return -EINVAL;
@@ -275,7 +275,7 @@ ice_aq_set_vlan_mode(struct ice_hw *hw,
 		     struct ice_aqc_set_vlan_mode *set_params)
 {
 	u8 rdma_packet, mng_vlan_prot_id;
-	struct ice_aq_desc desc;
+	struct libie_aq_desc desc;
 
 	if (!set_params)
 		return -EINVAL;
@@ -295,7 +295,7 @@ ice_aq_set_vlan_mode(struct ice_hw *hw,
 
 	ice_fill_dflt_direct_cmd_desc(&desc,
 				      ice_aqc_opc_set_vlan_mode_parameters);
-	desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
+	desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_RD);
 
 	return ice_aq_send_cmd(hw, &desc, set_params, sizeof(*set_params),
 			       NULL);
diff --git a/include/linux/net/intel/libie/adminq.h b/include/linux/net/intel/libie/adminq.h
new file mode 100644
index 000000000000..3676adc33d3e
--- /dev/null
+++ b/include/linux/net/intel/libie/adminq.h
@@ -0,0 +1,273 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (C) 2025 Intel Corporation */
+
+#ifndef __LIBIE_ADMINQ_H
+#define __LIBIE_ADMINQ_H
+
+#include <linux/build_bug.h>
+#include <linux/types.h>
+
+#define LIBIE_CHECK_STRUCT_LEN(n, X)	\
+	static_assert((n) == sizeof(struct X))
+
+/**
+ * struct libie_aqc_generic - Generic structure used in adminq communication
+ * @param0: generic parameter high 32bit
+ * @param1: generic parameter lower 32bit
+ * @addr_high: generic address high 32bit
+ * @addr_low: generic address lower 32bit
+ */
+struct libie_aqc_generic {
+	__le32 param0;
+	__le32 param1;
+	__le32 addr_high;
+	__le32 addr_low;
+};
+LIBIE_CHECK_STRUCT_LEN(16, libie_aqc_generic);
+
+/**
+ * struct libie_aqc_get_ver -  Used in command get version (direct 0x0001)
+ * @rom_ver: rom version
+ * @fw_build: number corresponding to firmware build
+ * @fw_branch: branch identifier of firmware version
+ * @fw_major: major number of firmware version
+ * @fw_minor: minor number of firmware version
+ * @fw_patch: patch of firmware version
+ * @api_branch: branch identifier of API version
+ * @api_major: major number of API version
+ * @api_minor: minor number of API version
+ * @api_patch: patch of API version
+ */
+struct libie_aqc_get_ver {
+	__le32	rom_ver;
+	__le32	fw_build;
+	u8	fw_branch;
+	u8	fw_major;
+	u8	fw_minor;
+	u8	fw_patch;
+	u8	api_branch;
+	u8	api_major;
+	u8	api_minor;
+	u8	api_patch;
+};
+LIBIE_CHECK_STRUCT_LEN(16, libie_aqc_get_ver);
+
+/**
+ * struct libie_aqc_driver_ver - Used in command send driver version
+ *				 (indirect 0x0002)
+ * @major_ver: driver major version
+ * @minor_ver: driver minor version
+ * @build_ver: driver build version
+ * @subbuild_ver: driver subbuild version
+ * @reserved: for future use
+ * @addr_high: high part of response address buff
+ * @addr_low: low part of response address buff
+ */
+struct libie_aqc_driver_ver {
+	u8	major_ver;
+	u8	minor_ver;
+	u8	build_ver;
+	u8	subbuild_ver;
+	u8	reserved[4];
+	__le32	addr_high;
+	__le32	addr_low;
+};
+LIBIE_CHECK_STRUCT_LEN(16, libie_aqc_driver_ver);
+
+enum libie_aq_res_id {
+	LIBIE_AQC_RES_ID_NVM				= 1,
+	LIBIE_AQC_RES_ID_SDP				= 2,
+	LIBIE_AQC_RES_ID_CHNG_LOCK			= 3,
+	LIBIE_AQC_RES_ID_GLBL_LOCK			= 4,
+};
+
+enum libie_aq_res_access_type {
+	LIBIE_AQC_RES_ACCESS_READ			= 1,
+	LIBIE_AQC_RES_ACCESS_WRITE			= 2,
+};
+
+#define LIBIE_AQ_RES_NVM_READ_DFLT_TIMEOUT_MS		3000
+#define LIBIE_AQ_RES_NVM_WRITE_DFLT_TIMEOUT_MS		180000
+#define LIBIE_AQ_RES_CHNG_LOCK_DFLT_TIMEOUT_MS		1000
+#define LIBIE_AQ_RES_GLBL_LOCK_DFLT_TIMEOUT_MS		3000
+
+#define LIBIE_AQ_RES_GLBL_SUCCESS			0
+#define LIBIE_AQ_RES_GLBL_IN_PROG			1
+#define LIBIE_AQ_RES_GLBL_DONE				2
+
+/**
+ * struct libie_aqc_req_res - Request resource ownership
+ * @res_id: resource ID (look at enum definition above)
+ * @access_type: read or write (enum definition above)
+ * @timeout: Upon successful completion, FW writes this value and driver is
+ * expected to release resource before timeout. This value is provided in
+ * milliseconds.
+ * @res_number: for SDP, this is the pin ID of the SDP
+ * @status: status only used for LIBIE_AQC_RES_ID_GLBL_LOCK, for others reserved
+ * @reserved: reserved for future use
+ *
+ * Used in commands:
+ * request resource ownership (direct 0x0008)
+ * release resource ownership (direct 0x0009)
+ */
+struct libie_aqc_req_res {
+	__le16	res_id;
+	__le16	access_type;
+
+	__le32	timeout;
+	__le32	res_number;
+	__le16	status;
+	u8	reserved[2];
+};
+LIBIE_CHECK_STRUCT_LEN(16, libie_aqc_req_res);
+
+/**
+ * struct libie_aqc_list_caps - Getting capabilities
+ * @cmd_flags: command flags
+ * @pf_index: index of PF to get caps from
+ * @reserved: reserved for future use
+ * @count: number of capabilities records
+ * @addr_high: high part of response address buff
+ * @addr_low: low part of response address buff
+ *
+ * Used in commands:
+ * get function capabilities (indirect 0x000A)
+ * get device capabilities (indirect 0x000B)
+ */
+struct libie_aqc_list_caps {
+	u8	cmd_flags;
+	u8	pf_index;
+	u8	reserved[2];
+	__le32	count;
+	__le32	addr_high;
+	__le32	addr_low;
+};
+LIBIE_CHECK_STRUCT_LEN(16, libie_aqc_list_caps);
+
+/* Device/Function buffer entry, repeated per reported capability */
+#define LIBIE_AQC_CAPS_VALID_FUNCTIONS			0x0005
+#define LIBIE_AQC_CAPS_SRIOV				0x0012
+#define LIBIE_AQC_CAPS_VF				0x0013
+#define LIBIE_AQC_CAPS_VSI				0x0017
+#define LIBIE_AQC_CAPS_DCB				0x0018
+#define LIBIE_AQC_CAPS_RSS				0x0040
+#define LIBIE_AQC_CAPS_RXQS				0x0041
+#define LIBIE_AQC_CAPS_TXQS				0x0042
+#define LIBIE_AQC_CAPS_MSIX				0x0043
+#define LIBIE_AQC_CAPS_FD				0x0045
+#define LIBIE_AQC_CAPS_1588				0x0046
+#define LIBIE_AQC_CAPS_MAX_MTU				0x0047
+#define LIBIE_AQC_CAPS_NVM_VER				0x0048
+#define LIBIE_AQC_CAPS_PENDING_NVM_VER			0x0049
+#define LIBIE_AQC_CAPS_OROM_VER				0x004A
+#define LIBIE_AQC_CAPS_PENDING_OROM_VER			0x004B
+#define LIBIE_AQC_CAPS_NET_VER				0x004C
+#define LIBIE_AQC_CAPS_PENDING_NET_VER			0x004D
+#define LIBIE_AQC_CAPS_RDMA				0x0051
+#define LIBIE_AQC_CAPS_SENSOR_READING			0x0067
+#define LIBIE_AQC_CAPS_PCIE_RESET_AVOIDANCE		0x0076
+#define LIBIE_AQC_CAPS_POST_UPDATE_RESET_RESTRICT	0x0077
+#define LIBIE_AQC_CAPS_NVM_MGMT				0x0080
+#define LIBIE_AQC_CAPS_TX_SCHED_TOPO_COMP_MODE		0x0085
+#define LIBIE_AQC_CAPS_NAC_TOPOLOGY			0x0087
+#define LIBIE_AQC_CAPS_FW_LAG_SUPPORT			0x0092
+#define LIBIE_AQC_BIT_ROCEV2_LAG			0x01
+#define LIBIE_AQC_BIT_SRIOV_LAG				0x02
+
+/**
+ * struct libie_aqc_list_caps_elem - Getting list of caps elements
+ * @cap: one from the defines list above
+ * @major_ver: major version
+ * @minor_ver: minor version
+ * @number: number of resources described by this capability
+ * @logical_id: logical ID, only meaningful for some types of resources
+ * @phys_id: physical ID, only meaningful for some types of resources
+ * @rsvd1: reserved for future use
+ * @rsvd2: reserved for future use
+ */
+struct libie_aqc_list_caps_elem {
+	__le16	cap;
+
+	u8	major_ver;
+	u8	minor_ver;
+	__le32	number;
+	__le32	logical_id;
+	__le32	phys_id;
+	__le64	rsvd1;
+	__le64	rsvd2;
+};
+LIBIE_CHECK_STRUCT_LEN(32, libie_aqc_list_caps_elem);
+
+/**
+ * struct libie_aq_desc - Admin Queue (AQ) descriptor
+ * @flags: LIBIE_AQ_FLAG_* flags
+ * @opcode: AQ command opcode
+ * @datalen: length in bytes of indirect/external data buffer
+ * @retval: return value from firmware
+ * @cookie_high: opaque data high-half
+ * @cookie_low: opaque data low-half
+ * @params: command-specific parameters
+ *
+ * Descriptor format for commands the driver posts on the Admin Transmit Queue
+ * (ATQ). The firmware writes back onto the command descriptor and returns
+ * the result of the command. Asynchronous events that are not an immediate
+ * result of the command are written to the Admin Receive Queue (ARQ) using
+ * the same descriptor format. Descriptors are in little-endian notation with
+ * 32-bit words.
+ */
+struct libie_aq_desc {
+	__le16	flags;
+	__le16	opcode;
+	__le16	datalen;
+	__le16	retval;
+	__le32	cookie_high;
+	__le32	cookie_low;
+	union {
+		u8	raw[16];
+		struct	libie_aqc_generic generic;
+		struct	libie_aqc_get_ver get_ver;
+		struct	libie_aqc_driver_ver driver_ver;
+		struct	libie_aqc_req_res res_owner;
+		struct	libie_aqc_list_caps get_cap;
+	} params;
+};
+LIBIE_CHECK_STRUCT_LEN(32, libie_aq_desc);
+
+/* FW defined boundary for a large buffer, 4k >= Large buffer > 512 bytes */
+#define LIBIE_AQ_LG_BUF				512
+
+#define LIBIE_AQ_FLAG_DD			BIT(0)	/* 0x1    */
+#define LIBIE_AQ_FLAG_CMP			BIT(1)	/* 0x2    */
+#define LIBIE_AQ_FLAG_ERR			BIT(2)	/* 0x4    */
+#define LIBIE_AQ_FLAG_LB			BIT(9)	/* 0x200  */
+#define LIBIE_AQ_FLAG_RD			BIT(10)	/* 0x400  */
+#define LIBIE_AQ_FLAG_BUF			BIT(12)	/* 0x1000 */
+#define LIBIE_AQ_FLAG_SI			BIT(13)	/* 0x2000 */
+
+/* error codes */
+enum libie_aq_err {
+	LIBIE_AQ_RC_OK		= 0,  /* Success */
+	LIBIE_AQ_RC_EPERM	= 1,  /* Operation not permitted */
+	LIBIE_AQ_RC_ENOENT	= 2,  /* No such element */
+	LIBIE_AQ_RC_ESRCH	= 3,  /* Bad opcode */
+	LIBIE_AQ_RC_EAGAIN	= 8,  /* Try again */
+	LIBIE_AQ_RC_ENOMEM	= 9,  /* Out of memory */
+	LIBIE_AQ_RC_EBUSY	= 12, /* Device or resource busy */
+	LIBIE_AQ_RC_EEXIST	= 13, /* Object already exists */
+	LIBIE_AQ_RC_EINVAL	= 14, /* Invalid argument */
+	LIBIE_AQ_RC_ENOSPC	= 16, /* No space left or allocation failure */
+	LIBIE_AQ_RC_ENOSYS	= 17, /* Function not implemented */
+	LIBIE_AQ_RC_EMODE	= 21, /* Op not allowed in current dev mode */
+	LIBIE_AQ_RC_ENOSEC	= 24, /* Missing security manifest */
+	LIBIE_AQ_RC_EBADSIG	= 25, /* Bad RSA signature */
+	LIBIE_AQ_RC_ESVN	= 26, /* SVN number prohibits this package */
+	LIBIE_AQ_RC_EBADMAN	= 27, /* Manifest hash mismatch */
+	LIBIE_AQ_RC_EBADBUF	= 28, /* Buffer hash mismatches manifest */
+};
+
+static inline void *libie_aq_raw(struct libie_aq_desc *desc)
+{
+	return &desc->params.raw;
+}
+
+#endif /* __LIBIE_ADMINQ_H */
-- 
cgit v1.2.3


From 5b36bef444432b75e7285e33338eb8bad53fe152 Mon Sep 17 00:00:00 2001
From: Michal Swiatkowski <michal.swiatkowski@linux.intel.com>
Date: Fri, 25 Apr 2025 08:08:03 +0200
Subject: ixgbe: use libie adminq descriptors

Use libie_aq_desc instead of ixgbe_aci_desc. Make the changes needed to
allow a clean build.

Move additional caps used in ixgbe to libie.

Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Signed-off-by: Michal Swiatkowski <michal.swiatkowski@linux.intel.com>
Tested-by: Rinitha S <sx.rinitha@intel.com> (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/devlink/region.c  |   4 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c      | 272 +++++++++++----------
 drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h      |  12 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_fw_update.c |   4 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h | 226 +----------------
 include/linux/net/intel/libie/adminq.h             |  16 ++
 6 files changed, 167 insertions(+), 367 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/intel/ixgbe/devlink/region.c b/drivers/net/ethernet/intel/ixgbe/devlink/region.c
index 76f6571c3c34..478b4f435120 100644
--- a/drivers/net/ethernet/intel/ixgbe/devlink/region.c
+++ b/drivers/net/ethernet/intel/ixgbe/devlink/region.c
@@ -74,7 +74,7 @@ static int ixgbe_devlink_nvm_snapshot(struct devlink *devlink,
 		 * total period of reading whole NVM is longer than the maximum
 		 * period the lock can be taken defined by the IXGBE_NVM_TIMEOUT.
 		 */
-		err = ixgbe_acquire_nvm(hw, IXGBE_RES_READ);
+		err = ixgbe_acquire_nvm(hw, LIBIE_AQC_RES_ACCESS_READ);
 		if (err) {
 			NL_SET_ERR_MSG_MOD(extack,
 					   "Failed to acquire NVM semaphore");
@@ -184,7 +184,7 @@ static int ixgbe_devlink_nvm_read(struct devlink *devlink,
 		return -ERANGE;
 	}
 
-	err = ixgbe_acquire_nvm(hw, IXGBE_RES_READ);
+	err = ixgbe_acquire_nvm(hw, LIBIE_AQC_RES_ACCESS_READ);
 	if (err) {
 		NL_SET_ERR_MSG_MOD(extack, "Failed to acquire NVM semaphore");
 		return -EBUSY;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c
index 87b03c1992a8..d74116441d1c 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c
@@ -56,7 +56,7 @@ static bool ixgbe_should_retry_aci_send_cmd_execute(u16 opcode)
  * Admin Command failed with error Y.
  */
 static int ixgbe_aci_send_cmd_execute(struct ixgbe_hw *hw,
-				      struct ixgbe_aci_desc *desc,
+				      struct libie_aq_desc *desc,
 				      void *buf, u16 buf_size)
 {
 	u16 opcode, buf_tail_size = buf_size % 4;
@@ -64,7 +64,7 @@ static int ixgbe_aci_send_cmd_execute(struct ixgbe_hw *hw,
 	u32 hicr, i, buf_tail = 0;
 	bool valid_buf = false;
 
-	hw->aci.last_status = IXGBE_ACI_RC_OK;
+	hw->aci.last_status = LIBIE_AQ_RC_OK;
 
 	/* It's necessary to check if mechanism is enabled */
 	hicr = IXGBE_READ_REG(hw, IXGBE_PF_HICR);
@@ -73,7 +73,7 @@ static int ixgbe_aci_send_cmd_execute(struct ixgbe_hw *hw,
 		return -EIO;
 
 	if (hicr & IXGBE_PF_HICR_C) {
-		hw->aci.last_status = IXGBE_ACI_RC_EBUSY;
+		hw->aci.last_status = LIBIE_AQ_RC_EBUSY;
 		return -EBUSY;
 	}
 
@@ -83,9 +83,9 @@ static int ixgbe_aci_send_cmd_execute(struct ixgbe_hw *hw,
 		return -EINVAL;
 
 	if (buf)
-		desc->flags |= cpu_to_le16(IXGBE_ACI_FLAG_BUF);
+		desc->flags |= cpu_to_le16(LIBIE_AQ_FLAG_BUF);
 
-	if (desc->flags & cpu_to_le16(IXGBE_ACI_FLAG_BUF)) {
+	if (desc->flags & cpu_to_le16(LIBIE_AQ_FLAG_BUF)) {
 		if ((buf && !buf_size) ||
 		    (!buf && buf_size))
 			return -EINVAL;
@@ -98,12 +98,12 @@ static int ixgbe_aci_send_cmd_execute(struct ixgbe_hw *hw,
 			memcpy(&buf_tail, buf + buf_size - buf_tail_size,
 			       buf_tail_size);
 
-		if (((buf_size + 3) & ~0x3) > IXGBE_ACI_LG_BUF)
-			desc->flags |= cpu_to_le16(IXGBE_ACI_FLAG_LB);
+		if (((buf_size + 3) & ~0x3) > LIBIE_AQ_LG_BUF)
+			desc->flags |= cpu_to_le16(LIBIE_AQ_FLAG_LB);
 
 		desc->datalen = cpu_to_le16(buf_size);
 
-		if (desc->flags & cpu_to_le16(IXGBE_ACI_FLAG_RD)) {
+		if (desc->flags & cpu_to_le16(LIBIE_AQ_FLAG_RD)) {
 			for (i = 0; i < buf_size / 4; i++)
 				IXGBE_WRITE_REG(hw, IXGBE_PF_HIBA(i), ((u32 *)buf)[i]);
 			if (buf_tail_size)
@@ -174,7 +174,7 @@ static int ixgbe_aci_send_cmd_execute(struct ixgbe_hw *hw,
 		return -EIO;
 
 	if (desc->retval) {
-		hw->aci.last_status = (enum ixgbe_aci_err)
+		hw->aci.last_status = (enum libie_aq_err)
 			le16_to_cpu(desc->retval);
 		return -EIO;
 	}
@@ -207,12 +207,12 @@ static int ixgbe_aci_send_cmd_execute(struct ixgbe_hw *hw,
  *
  * Return: the exit code of the operation.
  */
-int ixgbe_aci_send_cmd(struct ixgbe_hw *hw, struct ixgbe_aci_desc *desc,
+int ixgbe_aci_send_cmd(struct ixgbe_hw *hw, struct libie_aq_desc *desc,
 		       void *buf, u16 buf_size)
 {
 	u16 opcode = le16_to_cpu(desc->opcode);
-	struct ixgbe_aci_desc desc_cpy;
-	enum ixgbe_aci_err last_status;
+	struct libie_aq_desc desc_cpy;
+	enum libie_aq_err last_status;
 	u8 idx = 0, *buf_cpy = NULL;
 	bool is_cmd_for_retry;
 	unsigned long timeout;
@@ -237,7 +237,7 @@ int ixgbe_aci_send_cmd(struct ixgbe_hw *hw, struct ixgbe_aci_desc *desc,
 		mutex_unlock(&hw->aci.lock);
 
 		if (!is_cmd_for_retry || !err ||
-		    last_status != IXGBE_ACI_RC_EBUSY)
+		    last_status != LIBIE_AQ_RC_EBUSY)
 			break;
 
 		if (buf)
@@ -286,7 +286,7 @@ bool ixgbe_aci_check_event_pending(struct ixgbe_hw *hw)
 int ixgbe_aci_get_event(struct ixgbe_hw *hw, struct ixgbe_aci_event *e,
 			bool *pending)
 {
-	struct ixgbe_aci_desc desc;
+	struct libie_aq_desc desc;
 	int err;
 
 	if (!e || (!e->msg_buf && e->buf_len))
@@ -335,12 +335,12 @@ aci_get_event_exit:
  * Helper function to fill the descriptor desc with default values
  * and the provided opcode.
  */
-void ixgbe_fill_dflt_direct_cmd_desc(struct ixgbe_aci_desc *desc, u16 opcode)
+void ixgbe_fill_dflt_direct_cmd_desc(struct libie_aq_desc *desc, u16 opcode)
 {
 	/* Zero out the desc. */
 	memset(desc, 0, sizeof(*desc));
 	desc->opcode = cpu_to_le16(opcode);
-	desc->flags = cpu_to_le16(IXGBE_ACI_FLAG_SI);
+	desc->flags = cpu_to_le16(LIBIE_AQ_FLAG_SI);
 }
 
 /**
@@ -353,8 +353,8 @@ void ixgbe_fill_dflt_direct_cmd_desc(struct ixgbe_aci_desc *desc, u16 opcode)
  */
 static int ixgbe_aci_get_fw_ver(struct ixgbe_hw *hw)
 {
-	struct ixgbe_aci_cmd_get_ver *resp;
-	struct ixgbe_aci_desc desc;
+	struct libie_aqc_get_ver *resp;
+	struct libie_aq_desc desc;
 	int err;
 
 	resp = &desc.params.get_ver;
@@ -393,12 +393,12 @@ static int ixgbe_aci_get_fw_ver(struct ixgbe_hw *hw)
  *
  * Return: the exit code of the operation.
  */
-static int ixgbe_aci_req_res(struct ixgbe_hw *hw, enum ixgbe_aci_res_ids res,
-			     enum ixgbe_aci_res_access_type access,
+static int ixgbe_aci_req_res(struct ixgbe_hw *hw, enum libie_aq_res_id res,
+			     enum libie_aq_res_access_type access,
 			     u8 sdp_number, u32 *timeout)
 {
-	struct ixgbe_aci_cmd_req_res *cmd_resp;
-	struct ixgbe_aci_desc desc;
+	struct libie_aqc_req_res *cmd_resp;
+	struct libie_aq_desc desc;
 	int err;
 
 	cmd_resp = &desc.params.res_owner;
@@ -417,7 +417,7 @@ static int ixgbe_aci_req_res(struct ixgbe_hw *hw, enum ixgbe_aci_res_ids res,
 	 * with a busy return value and the timeout field indicates the maximum
 	 * time the current owner of the resource has to free it.
 	 */
-	if (!err || hw->aci.last_status == IXGBE_ACI_RC_EBUSY)
+	if (!err || hw->aci.last_status == LIBIE_AQ_RC_EBUSY)
 		*timeout = le32_to_cpu(cmd_resp->timeout);
 
 	return err;
@@ -433,11 +433,11 @@ static int ixgbe_aci_req_res(struct ixgbe_hw *hw, enum ixgbe_aci_res_ids res,
  *
  * Return: the exit code of the operation.
  */
-static int ixgbe_aci_release_res(struct ixgbe_hw *hw,
-				 enum ixgbe_aci_res_ids res, u8 sdp_number)
+static int ixgbe_aci_release_res(struct ixgbe_hw *hw, enum libie_aq_res_id res,
+				 u8 sdp_number)
 {
-	struct ixgbe_aci_cmd_req_res *cmd;
-	struct ixgbe_aci_desc desc;
+	struct libie_aqc_req_res *cmd;
+	struct libie_aq_desc desc;
 
 	cmd = &desc.params.res_owner;
 
@@ -465,8 +465,8 @@ static int ixgbe_aci_release_res(struct ixgbe_hw *hw,
  *
  * Return: the exit code of the operation.
  */
-int ixgbe_acquire_res(struct ixgbe_hw *hw, enum ixgbe_aci_res_ids res,
-		      enum ixgbe_aci_res_access_type access, u32 timeout)
+int ixgbe_acquire_res(struct ixgbe_hw *hw, enum libie_aq_res_id res,
+		      enum libie_aq_res_access_type access, u32 timeout)
 {
 #define IXGBE_RES_POLLING_DELAY_MS	10
 	u32 delay = IXGBE_RES_POLLING_DELAY_MS;
@@ -514,7 +514,7 @@ int ixgbe_acquire_res(struct ixgbe_hw *hw, enum ixgbe_aci_res_ids res,
  *
  * Release a common resource using ixgbe_aci_release_res.
  */
-void ixgbe_release_res(struct ixgbe_hw *hw, enum ixgbe_aci_res_ids res)
+void ixgbe_release_res(struct ixgbe_hw *hw, enum libie_aq_res_id res)
 {
 	u32 total_delay = 0;
 	int err;
@@ -547,7 +547,7 @@ void ixgbe_release_res(struct ixgbe_hw *hw, enum ixgbe_aci_res_ids res)
  */
 static bool ixgbe_parse_e610_caps(struct ixgbe_hw *hw,
 				  struct ixgbe_hw_caps *caps,
-				  struct ixgbe_aci_cmd_list_caps_elem *elem,
+				  struct libie_aqc_list_caps_elem *elem,
 				  const char *prefix)
 {
 	u32 logical_id = le32_to_cpu(elem->logical_id);
@@ -556,67 +556,67 @@ static bool ixgbe_parse_e610_caps(struct ixgbe_hw *hw,
 	u16 cap = le16_to_cpu(elem->cap);
 
 	switch (cap) {
-	case IXGBE_ACI_CAPS_VALID_FUNCTIONS:
+	case LIBIE_AQC_CAPS_VALID_FUNCTIONS:
 		caps->valid_functions = number;
 		break;
-	case IXGBE_ACI_CAPS_SRIOV:
+	case LIBIE_AQC_CAPS_SRIOV:
 		caps->sr_iov_1_1 = (number == 1);
 		break;
-	case IXGBE_ACI_CAPS_VMDQ:
+	case LIBIE_AQC_CAPS_VMDQ:
 		caps->vmdq = (number == 1);
 		break;
-	case IXGBE_ACI_CAPS_DCB:
+	case LIBIE_AQC_CAPS_DCB:
 		caps->dcb = (number == 1);
 		caps->active_tc_bitmap = logical_id;
 		caps->maxtc = phys_id;
 		break;
-	case IXGBE_ACI_CAPS_RSS:
+	case LIBIE_AQC_CAPS_RSS:
 		caps->rss_table_size = number;
 		caps->rss_table_entry_width = logical_id;
 		break;
-	case IXGBE_ACI_CAPS_RXQS:
+	case LIBIE_AQC_CAPS_RXQS:
 		caps->num_rxq = number;
 		caps->rxq_first_id = phys_id;
 		break;
-	case IXGBE_ACI_CAPS_TXQS:
+	case LIBIE_AQC_CAPS_TXQS:
 		caps->num_txq = number;
 		caps->txq_first_id = phys_id;
 		break;
-	case IXGBE_ACI_CAPS_MSIX:
+	case LIBIE_AQC_CAPS_MSIX:
 		caps->num_msix_vectors = number;
 		caps->msix_vector_first_id = phys_id;
 		break;
-	case IXGBE_ACI_CAPS_NVM_VER:
+	case LIBIE_AQC_CAPS_NVM_VER:
 		break;
-	case IXGBE_ACI_CAPS_PENDING_NVM_VER:
+	case LIBIE_AQC_CAPS_PENDING_NVM_VER:
 		caps->nvm_update_pending_nvm = true;
 		break;
-	case IXGBE_ACI_CAPS_PENDING_OROM_VER:
+	case LIBIE_AQC_CAPS_PENDING_OROM_VER:
 		caps->nvm_update_pending_orom = true;
 		break;
-	case IXGBE_ACI_CAPS_PENDING_NET_VER:
+	case LIBIE_AQC_CAPS_PENDING_NET_VER:
 		caps->nvm_update_pending_netlist = true;
 		break;
-	case IXGBE_ACI_CAPS_NVM_MGMT:
+	case LIBIE_AQC_CAPS_NVM_MGMT:
 		caps->nvm_unified_update =
 			(number & IXGBE_NVM_MGMT_UNIFIED_UPD_SUPPORT) ?
 			true : false;
 		break;
-	case IXGBE_ACI_CAPS_MAX_MTU:
+	case LIBIE_AQC_CAPS_MAX_MTU:
 		caps->max_mtu = number;
 		break;
-	case IXGBE_ACI_CAPS_PCIE_RESET_AVOIDANCE:
+	case LIBIE_AQC_CAPS_PCIE_RESET_AVOIDANCE:
 		caps->pcie_reset_avoidance = (number > 0);
 		break;
-	case IXGBE_ACI_CAPS_POST_UPDATE_RESET_RESTRICT:
+	case LIBIE_AQC_CAPS_POST_UPDATE_RESET_RESTRICT:
 		caps->reset_restrict_support = (number == 1);
 		break;
-	case IXGBE_ACI_CAPS_EXT_TOPO_DEV_IMG0:
-	case IXGBE_ACI_CAPS_EXT_TOPO_DEV_IMG1:
-	case IXGBE_ACI_CAPS_EXT_TOPO_DEV_IMG2:
-	case IXGBE_ACI_CAPS_EXT_TOPO_DEV_IMG3:
+	case LIBIE_AQC_CAPS_EXT_TOPO_DEV_IMG0:
+	case LIBIE_AQC_CAPS_EXT_TOPO_DEV_IMG1:
+	case LIBIE_AQC_CAPS_EXT_TOPO_DEV_IMG2:
+	case LIBIE_AQC_CAPS_EXT_TOPO_DEV_IMG3:
 	{
-		u8 index = cap - IXGBE_ACI_CAPS_EXT_TOPO_DEV_IMG0;
+		u8 index = cap - LIBIE_AQC_CAPS_EXT_TOPO_DEV_IMG0;
 
 		caps->ext_topo_dev_img_ver_high[index] = number;
 		caps->ext_topo_dev_img_ver_low[index] = logical_id;
@@ -637,62 +637,62 @@ static bool ixgbe_parse_e610_caps(struct ixgbe_hw *hw,
 }
 
 /**
- * ixgbe_parse_valid_functions_cap - Parse IXGBE_ACI_CAPS_VALID_FUNCTIONS caps
+ * ixgbe_parse_valid_functions_cap - Parse LIBIE_AQC_CAPS_VALID_FUNCTIONS caps
  * @hw: pointer to the HW struct
  * @dev_p: pointer to device capabilities structure
  * @cap: capability element to parse
  *
- * Parse IXGBE_ACI_CAPS_VALID_FUNCTIONS for device capabilities.
+ * Parse LIBIE_AQC_CAPS_VALID_FUNCTIONS for device capabilities.
  */
 static void
 ixgbe_parse_valid_functions_cap(struct ixgbe_hw *hw,
 				struct ixgbe_hw_dev_caps *dev_p,
-				struct ixgbe_aci_cmd_list_caps_elem *cap)
+				struct libie_aqc_list_caps_elem *cap)
 {
 	dev_p->num_funcs = hweight32(le32_to_cpu(cap->number));
 }
 
 /**
- * ixgbe_parse_vf_dev_caps - Parse IXGBE_ACI_CAPS_VF device caps
+ * ixgbe_parse_vf_dev_caps - Parse LIBIE_AQC_CAPS_VF device caps
  * @hw: pointer to the HW struct
  * @dev_p: pointer to device capabilities structure
  * @cap: capability element to parse
  *
- * Parse IXGBE_ACI_CAPS_VF for device capabilities.
+ * Parse LIBIE_AQC_CAPS_VF for device capabilities.
  */
 static void ixgbe_parse_vf_dev_caps(struct ixgbe_hw *hw,
 				    struct ixgbe_hw_dev_caps *dev_p,
-				    struct ixgbe_aci_cmd_list_caps_elem *cap)
+				    struct libie_aqc_list_caps_elem *cap)
 {
 	dev_p->num_vfs_exposed = le32_to_cpu(cap->number);
 }
 
 /**
- * ixgbe_parse_vsi_dev_caps - Parse IXGBE_ACI_CAPS_VSI device caps
+ * ixgbe_parse_vsi_dev_caps - Parse LIBIE_AQC_CAPS_VSI device caps
  * @hw: pointer to the HW struct
  * @dev_p: pointer to device capabilities structure
  * @cap: capability element to parse
  *
- * Parse IXGBE_ACI_CAPS_VSI for device capabilities.
+ * Parse LIBIE_AQC_CAPS_VSI for device capabilities.
  */
 static void ixgbe_parse_vsi_dev_caps(struct ixgbe_hw *hw,
 				     struct ixgbe_hw_dev_caps *dev_p,
-				     struct ixgbe_aci_cmd_list_caps_elem *cap)
+				     struct libie_aqc_list_caps_elem *cap)
 {
 	dev_p->num_vsi_allocd_to_host = le32_to_cpu(cap->number);
 }
 
 /**
- * ixgbe_parse_fdir_dev_caps - Parse IXGBE_ACI_CAPS_FD device caps
+ * ixgbe_parse_fdir_dev_caps - Parse LIBIE_AQC_CAPS_FD device caps
  * @hw: pointer to the HW struct
  * @dev_p: pointer to device capabilities structure
  * @cap: capability element to parse
  *
- * Parse IXGBE_ACI_CAPS_FD for device capabilities.
+ * Parse LIBIE_AQC_CAPS_FD for device capabilities.
  */
 static void ixgbe_parse_fdir_dev_caps(struct ixgbe_hw *hw,
 				      struct ixgbe_hw_dev_caps *dev_p,
-				      struct ixgbe_aci_cmd_list_caps_elem *cap)
+				      struct libie_aqc_list_caps_elem *cap)
 {
 	dev_p->num_flow_director_fltr = le32_to_cpu(cap->number);
 }
@@ -715,10 +715,10 @@ static void ixgbe_parse_dev_caps(struct ixgbe_hw *hw,
 				 struct ixgbe_hw_dev_caps *dev_p,
 				 void *buf, u32 cap_count)
 {
-	struct ixgbe_aci_cmd_list_caps_elem *cap_resp;
+	struct libie_aqc_list_caps_elem *cap_resp;
 	u32 i;
 
-	cap_resp = (struct ixgbe_aci_cmd_list_caps_elem *)buf;
+	cap_resp = (struct libie_aqc_list_caps_elem *)buf;
 
 	memset(dev_p, 0, sizeof(*dev_p));
 
@@ -729,17 +729,17 @@ static void ixgbe_parse_dev_caps(struct ixgbe_hw *hw,
 				      "dev caps");
 
 		switch (cap) {
-		case IXGBE_ACI_CAPS_VALID_FUNCTIONS:
+		case LIBIE_AQC_CAPS_VALID_FUNCTIONS:
 			ixgbe_parse_valid_functions_cap(hw, dev_p,
 							&cap_resp[i]);
 			break;
-		case IXGBE_ACI_CAPS_VF:
+		case LIBIE_AQC_CAPS_VF:
 			ixgbe_parse_vf_dev_caps(hw, dev_p, &cap_resp[i]);
 			break;
-		case IXGBE_ACI_CAPS_VSI:
+		case LIBIE_AQC_CAPS_VSI:
 			ixgbe_parse_vsi_dev_caps(hw, dev_p, &cap_resp[i]);
 			break;
-		case  IXGBE_ACI_CAPS_FD:
+		case  LIBIE_AQC_CAPS_FD:
 			ixgbe_parse_fdir_dev_caps(hw, dev_p, &cap_resp[i]);
 			break;
 		default:
@@ -750,16 +750,16 @@ static void ixgbe_parse_dev_caps(struct ixgbe_hw *hw,
 }
 
 /**
- * ixgbe_parse_vf_func_caps - Parse IXGBE_ACI_CAPS_VF function caps
+ * ixgbe_parse_vf_func_caps - Parse LIBIE_AQC_CAPS_VF function caps
  * @hw: pointer to the HW struct
  * @func_p: pointer to function capabilities structure
  * @cap: pointer to the capability element to parse
  *
- * Extract function capabilities for IXGBE_ACI_CAPS_VF.
+ * Extract function capabilities for LIBIE_AQC_CAPS_VF.
  */
 static void ixgbe_parse_vf_func_caps(struct ixgbe_hw *hw,
 				     struct ixgbe_hw_func_caps *func_p,
-				     struct ixgbe_aci_cmd_list_caps_elem *cap)
+				     struct libie_aqc_list_caps_elem *cap)
 {
 	func_p->num_allocd_vfs = le32_to_cpu(cap->number);
 	func_p->vf_base_id = le32_to_cpu(cap->logical_id);
@@ -786,16 +786,16 @@ static u32 ixgbe_get_num_per_func(struct ixgbe_hw *hw, u32 max)
 }
 
 /**
- * ixgbe_parse_vsi_func_caps - Parse IXGBE_ACI_CAPS_VSI function caps
+ * ixgbe_parse_vsi_func_caps - Parse LIBIE_AQC_CAPS_VSI function caps
  * @hw: pointer to the HW struct
  * @func_p: pointer to function capabilities structure
  * @cap: pointer to the capability element to parse
  *
- * Extract function capabilities for IXGBE_ACI_CAPS_VSI.
+ * Extract function capabilities for LIBIE_AQC_CAPS_VSI.
  */
 static void ixgbe_parse_vsi_func_caps(struct ixgbe_hw *hw,
 				      struct ixgbe_hw_func_caps *func_p,
-				      struct ixgbe_aci_cmd_list_caps_elem *cap)
+				      struct libie_aqc_list_caps_elem *cap)
 {
 	func_p->guar_num_vsi = ixgbe_get_num_per_func(hw, IXGBE_MAX_VSI);
 }
@@ -818,10 +818,10 @@ static void ixgbe_parse_func_caps(struct ixgbe_hw *hw,
 				  struct ixgbe_hw_func_caps *func_p,
 				  void *buf, u32 cap_count)
 {
-	struct ixgbe_aci_cmd_list_caps_elem *cap_resp;
+	struct libie_aqc_list_caps_elem *cap_resp;
 	u32 i;
 
-	cap_resp = (struct ixgbe_aci_cmd_list_caps_elem *)buf;
+	cap_resp = (struct libie_aqc_list_caps_elem *)buf;
 
 	memset(func_p, 0, sizeof(*func_p));
 
@@ -832,10 +832,10 @@ static void ixgbe_parse_func_caps(struct ixgbe_hw *hw,
 				      &cap_resp[i], "func caps");
 
 		switch (cap) {
-		case IXGBE_ACI_CAPS_VF:
+		case LIBIE_AQC_CAPS_VF:
 			ixgbe_parse_vf_func_caps(hw, func_p, &cap_resp[i]);
 			break;
-		case IXGBE_ACI_CAPS_VSI:
+		case LIBIE_AQC_CAPS_VSI:
 			ixgbe_parse_vsi_func_caps(hw, func_p, &cap_resp[i]);
 			break;
 		default:
@@ -869,8 +869,8 @@ static void ixgbe_parse_func_caps(struct ixgbe_hw *hw,
 int ixgbe_aci_list_caps(struct ixgbe_hw *hw, void *buf, u16 buf_size,
 			u32 *cap_count, enum ixgbe_aci_opc opc)
 {
-	struct ixgbe_aci_cmd_list_caps *cmd;
-	struct ixgbe_aci_desc desc;
+	struct libie_aqc_list_caps *cmd;
+	struct libie_aq_desc desc;
 	int err;
 
 	cmd = &desc.params.get_cap;
@@ -914,7 +914,7 @@ int ixgbe_discover_dev_caps(struct ixgbe_hw *hw,
 	 * possible size that firmware can return.
 	 */
 	cap_count = IXGBE_ACI_MAX_BUFFER_SIZE /
-		    sizeof(struct ixgbe_aci_cmd_list_caps_elem);
+		    sizeof(struct libie_aqc_list_caps_elem);
 
 	err = ixgbe_aci_list_caps(hw, cbuf, IXGBE_ACI_MAX_BUFFER_SIZE,
 				  &cap_count,
@@ -953,7 +953,7 @@ int ixgbe_discover_func_caps(struct ixgbe_hw *hw,
 	 * possible size that firmware can return.
 	 */
 	cap_count = IXGBE_ACI_MAX_BUFFER_SIZE /
-		    sizeof(struct ixgbe_aci_cmd_list_caps_elem);
+		    sizeof(struct libie_aqc_list_caps_elem);
 
 	err = ixgbe_aci_list_caps(hw, cbuf, IXGBE_ACI_MAX_BUFFER_SIZE,
 				  &cap_count,
@@ -996,9 +996,9 @@ int ixgbe_get_caps(struct ixgbe_hw *hw)
 int ixgbe_aci_disable_rxen(struct ixgbe_hw *hw)
 {
 	struct ixgbe_aci_cmd_disable_rxen *cmd;
-	struct ixgbe_aci_desc desc;
+	struct libie_aq_desc desc;
 
-	cmd = &desc.params.disable_rxen;
+	cmd = libie_aq_raw(&desc);
 
 	ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_disable_rxen);
 
@@ -1024,10 +1024,10 @@ int ixgbe_aci_get_phy_caps(struct ixgbe_hw *hw, bool qual_mods, u8 report_mode,
 {
 	struct ixgbe_aci_cmd_get_phy_caps *cmd;
 	u16 pcaps_size = sizeof(*pcaps);
-	struct ixgbe_aci_desc desc;
+	struct libie_aq_desc desc;
 	int err;
 
-	cmd = &desc.params.get_phy;
+	cmd = libie_aq_raw(&desc);
 
 	if (!pcaps || (report_mode & ~IXGBE_ACI_REPORT_MODE_M))
 		return -EINVAL;
@@ -1091,18 +1091,20 @@ void ixgbe_copy_phy_caps_to_cfg(struct ixgbe_aci_cmd_get_phy_caps_data *caps,
 int ixgbe_aci_set_phy_cfg(struct ixgbe_hw *hw,
 			  struct ixgbe_aci_cmd_set_phy_cfg_data *cfg)
 {
-	struct ixgbe_aci_desc desc;
+	struct ixgbe_aci_cmd_set_phy_cfg *cmd;
+	struct libie_aq_desc desc;
 	int err;
 
 	if (!cfg)
 		return -EINVAL;
 
+	cmd = libie_aq_raw(&desc);
 	/* Ensure that only valid bits of cfg->caps can be turned on. */
 	cfg->caps &= IXGBE_ACI_PHY_ENA_VALID_MASK;
 
 	ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_set_phy_cfg);
-	desc.params.set_phy.lport_num = hw->bus.func;
-	desc.flags |= cpu_to_le16(IXGBE_ACI_FLAG_RD);
+	cmd->lport_num = hw->bus.func;
+	desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_RD);
 
 	err = ixgbe_aci_send_cmd(hw, &desc, cfg, sizeof(*cfg));
 	if (!err)
@@ -1123,9 +1125,9 @@ int ixgbe_aci_set_phy_cfg(struct ixgbe_hw *hw,
 int ixgbe_aci_set_link_restart_an(struct ixgbe_hw *hw, bool ena_link)
 {
 	struct ixgbe_aci_cmd_restart_an *cmd;
-	struct ixgbe_aci_desc desc;
+	struct libie_aq_desc desc;
 
-	cmd = &desc.params.restart_an;
+	cmd = libie_aq_raw(&desc);
 
 	ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_restart_an);
 
@@ -1151,9 +1153,9 @@ int ixgbe_aci_set_link_restart_an(struct ixgbe_hw *hw, bool ena_link)
 static bool ixgbe_is_media_cage_present(struct ixgbe_hw *hw)
 {
 	struct ixgbe_aci_cmd_get_link_topo *cmd;
-	struct ixgbe_aci_desc desc;
+	struct libie_aq_desc desc;
 
-	cmd = &desc.params.get_link_topo;
+	cmd = libie_aq_raw(&desc);
 
 	ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_get_link_topo);
 
@@ -1346,7 +1348,7 @@ int ixgbe_aci_get_link_info(struct ixgbe_hw *hw, bool ena_lse,
 	struct ixgbe_aci_cmd_get_link_status *resp;
 	struct ixgbe_link_status *li_old, *li;
 	struct ixgbe_fc_info *hw_fc_info;
-	struct ixgbe_aci_desc desc;
+	struct libie_aq_desc desc;
 	bool tx_pause, rx_pause;
 	u8 cmd_flags;
 	int err;
@@ -1360,7 +1362,7 @@ int ixgbe_aci_get_link_info(struct ixgbe_hw *hw, bool ena_lse,
 
 	ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_get_link_status);
 	cmd_flags = (ena_lse) ? IXGBE_ACI_LSE_ENA : IXGBE_ACI_LSE_DIS;
-	resp = &desc.params.get_link_status;
+	resp = libie_aq_raw(&desc);
 	resp->cmd_flags = cpu_to_le16(cmd_flags);
 	resp->lport_num = hw->bus.func;
 
@@ -1423,9 +1425,9 @@ int ixgbe_aci_get_link_info(struct ixgbe_hw *hw, bool ena_lse,
 int ixgbe_aci_set_event_mask(struct ixgbe_hw *hw, u8 port_num, u16 mask)
 {
 	struct ixgbe_aci_cmd_set_event_mask *cmd;
-	struct ixgbe_aci_desc desc;
+	struct libie_aq_desc desc;
 
-	cmd = &desc.params.set_event_mask;
+	cmd = libie_aq_raw(&desc);
 
 	ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_set_event_mask);
 
@@ -1496,9 +1498,9 @@ static int ixgbe_start_hw_e610(struct ixgbe_hw *hw)
 int ixgbe_aci_set_port_id_led(struct ixgbe_hw *hw, bool orig_mode)
 {
 	struct ixgbe_aci_cmd_set_port_id_led *cmd;
-	struct ixgbe_aci_desc desc;
+	struct libie_aq_desc desc;
 
-	cmd = &desc.params.set_port_id_led;
+	cmd = libie_aq_raw(&desc);
 
 	ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_set_port_id_led);
 
@@ -2260,19 +2262,20 @@ int ixgbe_aci_get_netlist_node(struct ixgbe_hw *hw,
 			       struct ixgbe_aci_cmd_get_link_topo *cmd,
 			       u8 *node_part_number, u16 *node_handle)
 {
-	struct ixgbe_aci_desc desc;
+	struct ixgbe_aci_cmd_get_link_topo *resp;
+	struct libie_aq_desc desc;
 
 	ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_get_link_topo);
-	desc.params.get_link_topo = *cmd;
+	resp = libie_aq_raw(&desc);
+	*resp = *cmd;
 
 	if (ixgbe_aci_send_cmd(hw, &desc, NULL, 0))
 		return -EOPNOTSUPP;
 
 	if (node_handle)
-		*node_handle =
-			le16_to_cpu(desc.params.get_link_topo.addr.handle);
+		*node_handle = le16_to_cpu(resp->addr.handle);
 	if (node_part_number)
-		*node_part_number = desc.params.get_link_topo.node_part_num;
+		*node_part_number = resp->node_part_num;
 
 	return 0;
 }
@@ -2286,8 +2289,7 @@ int ixgbe_aci_get_netlist_node(struct ixgbe_hw *hw,
  *
  * Return: the exit code of the operation.
  */
-int ixgbe_acquire_nvm(struct ixgbe_hw *hw,
-		      enum ixgbe_aci_res_access_type access)
+int ixgbe_acquire_nvm(struct ixgbe_hw *hw, enum libie_aq_res_access_type access)
 {
 	u32 fla;
 
@@ -2296,7 +2298,7 @@ int ixgbe_acquire_nvm(struct ixgbe_hw *hw,
 	if ((fla & IXGBE_GLNVM_FLA_LOCKED_M) == 0)
 		return 0;
 
-	return ixgbe_acquire_res(hw, IXGBE_NVM_RES_ID, access,
+	return ixgbe_acquire_res(hw, LIBIE_AQC_RES_ID_NVM, access,
 				 IXGBE_NVM_TIMEOUT);
 }
 
@@ -2315,7 +2317,7 @@ void ixgbe_release_nvm(struct ixgbe_hw *hw)
 	if ((fla & IXGBE_GLNVM_FLA_LOCKED_M) == 0)
 		return;
 
-	ixgbe_release_res(hw, IXGBE_NVM_RES_ID);
+	ixgbe_release_res(hw, LIBIE_AQC_RES_ID_NVM);
 }
 
 /**
@@ -2337,12 +2339,12 @@ int ixgbe_aci_read_nvm(struct ixgbe_hw *hw, u16 module_typeid, u32 offset,
 		       bool read_shadow_ram)
 {
 	struct ixgbe_aci_cmd_nvm *cmd;
-	struct ixgbe_aci_desc desc;
+	struct libie_aq_desc desc;
 
 	if (offset > IXGBE_ACI_NVM_MAX_OFFSET)
 		return -EINVAL;
 
-	cmd = &desc.params.nvm;
+	cmd = libie_aq_raw(&desc);
 
 	ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_nvm_read);
 
@@ -2372,7 +2374,7 @@ int ixgbe_aci_read_nvm(struct ixgbe_hw *hw, u16 module_typeid, u32 offset,
 int ixgbe_aci_erase_nvm(struct ixgbe_hw *hw, u16 module_typeid)
 {
 	struct ixgbe_aci_cmd_nvm *cmd;
-	struct ixgbe_aci_desc desc;
+	struct libie_aq_desc desc;
 	__le16 len;
 	int err;
 
@@ -2385,7 +2387,7 @@ int ixgbe_aci_erase_nvm(struct ixgbe_hw *hw, u16 module_typeid)
 	if (err)
 		return err;
 
-	cmd = &desc.params.nvm;
+	cmd = libie_aq_raw(&desc);
 
 	ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_nvm_erase);
 
@@ -2416,9 +2418,9 @@ int ixgbe_aci_update_nvm(struct ixgbe_hw *hw, u16 module_typeid,
 			 bool last_command, u8 command_flags)
 {
 	struct ixgbe_aci_cmd_nvm *cmd;
-	struct ixgbe_aci_desc desc;
+	struct libie_aq_desc desc;
 
-	cmd = &desc.params.nvm;
+	cmd = libie_aq_raw(&desc);
 
 	/* In offset the highest byte must be zeroed. */
 	if (offset & 0xFF000000)
@@ -2436,7 +2438,7 @@ int ixgbe_aci_update_nvm(struct ixgbe_hw *hw, u16 module_typeid,
 	cmd->offset_high = FIELD_GET(IXGBE_ACI_NVM_OFFSET_HI_U_MASK, offset);
 	cmd->length = cpu_to_le16(length);
 
-	desc.flags |= cpu_to_le16(IXGBE_ACI_FLAG_RD);
+	desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_RD);
 
 	return ixgbe_aci_send_cmd(hw, &desc, data, length);
 }
@@ -2467,10 +2469,10 @@ int ixgbe_nvm_write_activate(struct ixgbe_hw *hw, u16 cmd_flags,
 			     u8 *response_flags)
 {
 	struct ixgbe_aci_cmd_nvm *cmd;
-	struct ixgbe_aci_desc desc;
+	struct libie_aq_desc desc;
 	s32 err;
 
-	cmd = &desc.params.nvm;
+	cmd = libie_aq_raw(&desc);
 	ixgbe_fill_dflt_direct_cmd_desc(&desc,
 					ixgbe_aci_opc_nvm_write_activate);
 
@@ -2498,14 +2500,14 @@ int ixgbe_nvm_write_activate(struct ixgbe_hw *hw, u16 cmd_flags,
 int ixgbe_nvm_validate_checksum(struct ixgbe_hw *hw)
 {
 	struct ixgbe_aci_cmd_nvm_checksum *cmd;
-	struct ixgbe_aci_desc desc;
+	struct libie_aq_desc desc;
 	int err;
 
-	err = ixgbe_acquire_nvm(hw, IXGBE_RES_READ);
+	err = ixgbe_acquire_nvm(hw, LIBIE_AQC_RES_ACCESS_READ);
 	if (err)
 		return err;
 
-	cmd = &desc.params.nvm_checksum;
+	cmd = libie_aq_raw(&desc);
 
 	ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_nvm_checksum);
 	cmd->flags = IXGBE_ACI_NVM_CHECKSUM_VERIFY;
@@ -2541,7 +2543,7 @@ static int ixgbe_discover_flash_size(struct ixgbe_hw *hw)
 	u32 min_size = 0, max_size = IXGBE_ACI_NVM_MAX_OFFSET + 1;
 	int err;
 
-	err = ixgbe_acquire_nvm(hw, IXGBE_RES_READ);
+	err = ixgbe_acquire_nvm(hw, LIBIE_AQC_RES_ACCESS_READ);
 	if (err)
 		return err;
 
@@ -2552,7 +2554,7 @@ static int ixgbe_discover_flash_size(struct ixgbe_hw *hw)
 
 		err = ixgbe_read_flat_nvm(hw, offset, &len, &data, false);
 		if (err == -EIO &&
-		    hw->aci.last_status == IXGBE_ACI_RC_EINVAL) {
+		    hw->aci.last_status == LIBIE_AQ_RC_EINVAL) {
 			err = 0;
 			max_size = offset;
 		} else if (!err) {
@@ -2805,7 +2807,7 @@ static int ixgbe_read_flash_module(struct ixgbe_hw *hw,
 	if (!start)
 		return -EINVAL;
 
-	err = ixgbe_acquire_nvm(hw, IXGBE_RES_READ);
+	err = ixgbe_acquire_nvm(hw, LIBIE_AQC_RES_ACCESS_READ);
 	if (err)
 		return err;
 
@@ -3389,7 +3391,7 @@ int ixgbe_get_flash_data(struct ixgbe_hw *hw)
  */
 int ixgbe_aci_nvm_update_empr(struct ixgbe_hw *hw)
 {
-	struct ixgbe_aci_desc desc;
+	struct libie_aq_desc desc;
 
 	ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_nvm_update_empr);
 
@@ -3415,15 +3417,15 @@ int ixgbe_nvm_set_pkg_data(struct ixgbe_hw *hw, bool del_pkg_data_flag,
 			   u8 *data, u16 length)
 {
 	struct ixgbe_aci_cmd_nvm_pkg_data *cmd;
-	struct ixgbe_aci_desc desc;
+	struct libie_aq_desc desc;
 
 	if (length != 0 && !data)
 		return -EINVAL;
 
-	cmd = &desc.params.pkg_data;
+	cmd = libie_aq_raw(&desc);
 
 	ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_nvm_pkg_data);
-	desc.flags |= cpu_to_le16(IXGBE_ACI_FLAG_RD);
+	desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_RD);
 
 	if (del_pkg_data_flag)
 		cmd->cmd_flags |= IXGBE_ACI_NVM_PKG_DELETE;
@@ -3453,17 +3455,17 @@ int ixgbe_nvm_pass_component_tbl(struct ixgbe_hw *hw, u8 *data, u16 length,
 				 u8 *comp_response_code)
 {
 	struct ixgbe_aci_cmd_nvm_pass_comp_tbl *cmd;
-	struct ixgbe_aci_desc desc;
+	struct libie_aq_desc desc;
 	int err;
 
 	if (!data || !comp_response || !comp_response_code)
 		return -EINVAL;
 
-	cmd = &desc.params.pass_comp_tbl;
+	cmd = libie_aq_raw(&desc);
 
 	ixgbe_fill_dflt_direct_cmd_desc(&desc,
 					ixgbe_aci_opc_nvm_pass_component_tbl);
-	desc.flags |= cpu_to_le16(IXGBE_ACI_FLAG_RD);
+	desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_RD);
 
 	cmd->transfer_flag = transfer_flag;
 	err = ixgbe_aci_send_cmd(hw, &desc, data, length);
@@ -3617,7 +3619,7 @@ int ixgbe_read_ee_aci_e610(struct ixgbe_hw *hw, u16 offset, u16 *data)
 			return err;
 	}
 
-	err = ixgbe_acquire_nvm(hw, IXGBE_RES_READ);
+	err = ixgbe_acquire_nvm(hw, LIBIE_AQC_RES_ACCESS_READ);
 	if (err)
 		return err;
 
@@ -3650,7 +3652,7 @@ int ixgbe_read_ee_aci_buffer_e610(struct ixgbe_hw *hw, u16 offset,
 			return err;
 	}
 
-	err = ixgbe_acquire_nvm(hw, IXGBE_RES_READ);
+	err = ixgbe_acquire_nvm(hw, LIBIE_AQC_RES_ACCESS_READ);
 	if (err)
 		return err;
 
@@ -3690,7 +3692,7 @@ int ixgbe_validate_eeprom_checksum_e610(struct ixgbe_hw *hw, u16 *checksum_val)
 	if (checksum_val) {
 		u16 tmp_checksum;
 
-		err = ixgbe_acquire_nvm(hw, IXGBE_RES_READ);
+		err = ixgbe_acquire_nvm(hw, LIBIE_AQC_RES_ACCESS_READ);
 		if (err)
 			return err;
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h
index bb31d65bd1c8..782c489b0fa7 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h
@@ -6,15 +6,15 @@
 
 #include "ixgbe_type.h"
 
-int ixgbe_aci_send_cmd(struct ixgbe_hw *hw, struct ixgbe_aci_desc *desc,
+int ixgbe_aci_send_cmd(struct ixgbe_hw *hw, struct libie_aq_desc *desc,
 		       void *buf, u16 buf_size);
 bool ixgbe_aci_check_event_pending(struct ixgbe_hw *hw);
 int ixgbe_aci_get_event(struct ixgbe_hw *hw, struct ixgbe_aci_event *e,
 			bool *pending);
-void ixgbe_fill_dflt_direct_cmd_desc(struct ixgbe_aci_desc *desc, u16 opcode);
-int ixgbe_acquire_res(struct ixgbe_hw *hw, enum ixgbe_aci_res_ids res,
-		      enum ixgbe_aci_res_access_type access, u32 timeout);
-void ixgbe_release_res(struct ixgbe_hw *hw, enum ixgbe_aci_res_ids res);
+void ixgbe_fill_dflt_direct_cmd_desc(struct libie_aq_desc *desc, u16 opcode);
+int ixgbe_acquire_res(struct ixgbe_hw *hw, enum libie_aq_res_id res,
+		      enum libie_aq_res_access_type access, u32 timeout);
+void ixgbe_release_res(struct ixgbe_hw *hw, enum libie_aq_res_id res);
 int ixgbe_aci_list_caps(struct ixgbe_hw *hw, void *buf, u16 buf_size,
 			u32 *cap_count, enum ixgbe_aci_opc opc);
 int ixgbe_discover_dev_caps(struct ixgbe_hw *hw,
@@ -62,7 +62,7 @@ int ixgbe_aci_get_netlist_node(struct ixgbe_hw *hw,
 			       struct ixgbe_aci_cmd_get_link_topo *cmd,
 			       u8 *node_part_number, u16 *node_handle);
 int ixgbe_acquire_nvm(struct ixgbe_hw *hw,
-		      enum ixgbe_aci_res_access_type access);
+		      enum libie_aq_res_access_type access);
 void ixgbe_release_nvm(struct ixgbe_hw *hw);
 int ixgbe_aci_read_nvm(struct ixgbe_hw *hw, u16 module_typeid, u32 offset,
 		       u16 length, void *data, bool last_command,
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_fw_update.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_fw_update.c
index 49d3b66add7e..e5479fc07a07 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_fw_update.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_fw_update.c
@@ -593,7 +593,7 @@ static int ixgbe_cancel_pending_update(struct ixgbe_adapter *adapter,
 					   "Canceling previous pending update",
 					   component, 0, 0);
 
-	err = ixgbe_acquire_nvm(hw, IXGBE_RES_WRITE);
+	err = ixgbe_acquire_nvm(hw, LIBIE_AQC_RES_ACCESS_WRITE);
 	if (err) {
 		NL_SET_ERR_MSG_MOD(extack,
 				   "Failed to acquire device flash lock");
@@ -686,7 +686,7 @@ int ixgbe_flash_pldm_image(struct devlink *devlink,
 	if (err)
 		return err;
 
-	err = ixgbe_acquire_nvm(hw, IXGBE_RES_WRITE);
+	err = ixgbe_acquire_nvm(hw, LIBIE_AQC_RES_ACCESS_WRITE);
 	if (err) {
 		NL_SET_ERR_MSG_MOD(extack,
 				   "Failed to acquire device flash lock");
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h
index 09df67f03cf4..d2f22d8558f8 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h
@@ -4,6 +4,8 @@
 #ifndef _IXGBE_TYPE_E610_H_
 #define _IXGBE_TYPE_E610_H_
 
+#include <linux/net/intel/libie/adminq.h>
+
 #define BYTES_PER_DWORD	4
 
 /* General E610 defines */
@@ -135,60 +137,6 @@
 /* [ms] timeout of waiting for resource release */
 #define IXGBE_ACI_RELEASE_RES_TIMEOUT		10000
 
-/* FW defined boundary for a large buffer, 4k >= Large buffer > 512 bytes */
-#define IXGBE_ACI_LG_BUF		512
-
-/* Flags sub-structure
- * |0  |1  |2  |3  |4  |5  |6  |7  |8  |9  |10 |11 |12 |13 |14 |15 |
- * |DD |CMP|ERR|VFE| * *  RESERVED * * |LB |RD |VFC|BUF|SI |EI |FE |
- */
-
-#define IXGBE_ACI_FLAG_DD	BIT(0) /* 0x1 */
-#define IXGBE_ACI_FLAG_CMP	BIT(1) /* 0x2 */
-#define IXGBE_ACI_FLAG_ERR	BIT(2) /* 0x4 */
-#define IXGBE_ACI_FLAG_VFE	BIT(3) /* 0x8 */
-#define IXGBE_ACI_FLAG_LB	BIT(9) /* 0x200 */
-#define IXGBE_ACI_FLAG_RD	BIT(10) /* 0x400 */
-#define IXGBE_ACI_FLAG_VFC	BIT(11) /* 0x800 */
-#define IXGBE_ACI_FLAG_BUF	BIT(12) /* 0x1000 */
-#define IXGBE_ACI_FLAG_SI	BIT(13) /* 0x2000 */
-#define IXGBE_ACI_FLAG_EI	BIT(14) /* 0x4000 */
-#define IXGBE_ACI_FLAG_FE	BIT(15) /* 0x8000 */
-
-/* Admin Command Interface (ACI) error codes */
-enum ixgbe_aci_err {
-	IXGBE_ACI_RC_OK		= 0,  /* Success */
-	IXGBE_ACI_RC_EPERM	= 1,  /* Operation not permitted */
-	IXGBE_ACI_RC_ENOENT	= 2,  /* No such element */
-	IXGBE_ACI_RC_ESRCH	= 3,  /* Bad opcode */
-	IXGBE_ACI_RC_EINTR	= 4,  /* Operation interrupted */
-	IXGBE_ACI_RC_EIO	= 5,  /* I/O error */
-	IXGBE_ACI_RC_ENXIO	= 6,  /* No such resource */
-	IXGBE_ACI_RC_E2BIG	= 7,  /* Arg too long */
-	IXGBE_ACI_RC_EAGAIN	= 8,  /* Try again */
-	IXGBE_ACI_RC_ENOMEM	= 9,  /* Out of memory */
-	IXGBE_ACI_RC_EACCES	= 10, /* Permission denied */
-	IXGBE_ACI_RC_EFAULT	= 11, /* Bad address */
-	IXGBE_ACI_RC_EBUSY	= 12, /* Device or resource busy */
-	IXGBE_ACI_RC_EEXIST	= 13, /* Object already exists */
-	IXGBE_ACI_RC_EINVAL	= 14, /* Invalid argument */
-	IXGBE_ACI_RC_ENOTTY	= 15, /* Not a typewriter */
-	IXGBE_ACI_RC_ENOSPC	= 16, /* No space left or alloc failure */
-	IXGBE_ACI_RC_ENOSYS	= 17, /* Function not implemented */
-	IXGBE_ACI_RC_ERANGE	= 18, /* Parameter out of range */
-	IXGBE_ACI_RC_EFLUSHED	= 19, /* Cmd flushed due to prev cmd error */
-	IXGBE_ACI_RC_BAD_ADDR	= 20, /* Descriptor contains a bad pointer */
-	IXGBE_ACI_RC_EMODE	= 21, /* Op not allowed in current dev mode */
-	IXGBE_ACI_RC_EFBIG	= 22, /* File too big */
-	IXGBE_ACI_RC_ESBCOMP	= 23, /* SB-IOSF completion unsuccessful */
-	IXGBE_ACI_RC_ENOSEC	= 24, /* Missing security manifest */
-	IXGBE_ACI_RC_EBADSIG	= 25, /* Bad RSA signature */
-	IXGBE_ACI_RC_ESVN	= 26, /* SVN number prohibits this package */
-	IXGBE_ACI_RC_EBADMAN	= 27, /* Manifest hash mismatch */
-	IXGBE_ACI_RC_EBADBUF	= 28, /* Buffer hash mismatches manifest */
-	IXGBE_ACI_RC_EACCES_BMCU	= 29, /* BMC Update in progress */
-};
-
 /* Admin Command Interface (ACI) opcodes */
 enum ixgbe_aci_opc {
 	ixgbe_aci_opc_get_ver				= 0x0001,
@@ -265,33 +213,8 @@ enum ixgbe_aci_opc {
 	ixgbe_aci_opc_clear_health_status		= 0xFF23,
 };
 
-/* Get version (direct 0x0001) */
-struct ixgbe_aci_cmd_get_ver {
-	__le32 rom_ver;
-	__le32 fw_build;
-	u8 fw_branch;
-	u8 fw_major;
-	u8 fw_minor;
-	u8 fw_patch;
-	u8 api_branch;
-	u8 api_major;
-	u8 api_minor;
-	u8 api_patch;
-};
-
 #define IXGBE_DRV_VER_STR_LEN_E610	32
 
-/* Send driver version (indirect 0x0002) */
-struct ixgbe_aci_cmd_driver_ver {
-	u8 major_ver;
-	u8 minor_ver;
-	u8 build_ver;
-	u8 subbuild_ver;
-	u8 reserved[4];
-	__le32 addr_high;
-	__le32 addr_low;
-};
-
 /* Get Expanded Error Code (0x0005, direct) */
 struct ixgbe_aci_cmd_get_exp_err {
 	__le32 reason;
@@ -303,98 +226,6 @@ struct ixgbe_aci_cmd_get_exp_err {
 /* FW update timeout definitions are in milliseconds */
 #define IXGBE_NVM_TIMEOUT		180000
 
-enum ixgbe_aci_res_access_type {
-	IXGBE_RES_READ = 1,
-	IXGBE_RES_WRITE
-};
-
-enum ixgbe_aci_res_ids {
-	IXGBE_NVM_RES_ID = 1,
-	IXGBE_SPD_RES_ID,
-	IXGBE_CHANGE_LOCK_RES_ID,
-	IXGBE_GLOBAL_CFG_LOCK_RES_ID
-};
-
-/* Request resource ownership (direct 0x0008)
- * Release resource ownership (direct 0x0009)
- */
-struct ixgbe_aci_cmd_req_res {
-	__le16 res_id;
-	__le16 access_type;
-
-	/* Upon successful completion, FW writes this value and driver is
-	 * expected to release resource before timeout. This value is provided
-	 * in milliseconds.
-	 */
-	__le32 timeout;
-#define IXGBE_ACI_RES_NVM_READ_DFLT_TIMEOUT_MS	3000
-#define IXGBE_ACI_RES_NVM_WRITE_DFLT_TIMEOUT_MS	180000
-#define IXGBE_ACI_RES_CHNG_LOCK_DFLT_TIMEOUT_MS	1000
-#define IXGBE_ACI_RES_GLBL_LOCK_DFLT_TIMEOUT_MS	3000
-	/* For SDP: pin ID of the SDP */
-	__le32 res_number;
-	__le16 status;
-#define IXGBE_ACI_RES_GLBL_SUCCESS		0
-#define IXGBE_ACI_RES_GLBL_IN_PROG		1
-#define IXGBE_ACI_RES_GLBL_DONE			2
-	u8 reserved[2];
-};
-
-/* Get function capabilities (indirect 0x000A)
- * Get device capabilities (indirect 0x000B)
- */
-struct ixgbe_aci_cmd_list_caps {
-	u8 cmd_flags;
-	u8 pf_index;
-	u8 reserved[2];
-	__le32 count;
-	__le32 addr_high;
-	__le32 addr_low;
-};
-
-/* Device/Function buffer entry, repeated per reported capability */
-struct ixgbe_aci_cmd_list_caps_elem {
-	__le16 cap;
-#define IXGBE_ACI_CAPS_VALID_FUNCTIONS			0x0005
-#define IXGBE_ACI_MAX_VALID_FUNCTIONS			0x8
-#define IXGBE_ACI_CAPS_SRIOV				0x0012
-#define IXGBE_ACI_CAPS_VF				0x0013
-#define IXGBE_ACI_CAPS_VMDQ				0x0014
-#define IXGBE_ACI_CAPS_VSI				0x0017
-#define IXGBE_ACI_CAPS_DCB				0x0018
-#define IXGBE_ACI_CAPS_RSS				0x0040
-#define IXGBE_ACI_CAPS_RXQS				0x0041
-#define IXGBE_ACI_CAPS_TXQS				0x0042
-#define IXGBE_ACI_CAPS_MSIX				0x0043
-#define IXGBE_ACI_CAPS_FD				0x0045
-#define IXGBE_ACI_CAPS_1588				0x0046
-#define IXGBE_ACI_CAPS_MAX_MTU				0x0047
-#define IXGBE_ACI_CAPS_NVM_VER				0x0048
-#define IXGBE_ACI_CAPS_PENDING_NVM_VER			0x0049
-#define IXGBE_ACI_CAPS_OROM_VER				0x004A
-#define IXGBE_ACI_CAPS_PENDING_OROM_VER			0x004B
-#define IXGBE_ACI_CAPS_PENDING_NET_VER			0x004D
-#define IXGBE_ACI_CAPS_INLINE_IPSEC			0x0070
-#define IXGBE_ACI_CAPS_NUM_ENABLED_PORTS		0x0072
-#define IXGBE_ACI_CAPS_PCIE_RESET_AVOIDANCE		0x0076
-#define IXGBE_ACI_CAPS_POST_UPDATE_RESET_RESTRICT	0x0077
-#define IXGBE_ACI_CAPS_NVM_MGMT				0x0080
-#define IXGBE_ACI_CAPS_EXT_TOPO_DEV_IMG0		0x0081
-#define IXGBE_ACI_CAPS_EXT_TOPO_DEV_IMG1		0x0082
-#define IXGBE_ACI_CAPS_EXT_TOPO_DEV_IMG2		0x0083
-#define IXGBE_ACI_CAPS_EXT_TOPO_DEV_IMG3		0x0084
-	u8 major_ver;
-	u8 minor_ver;
-	/* Number of resources described by this capability */
-	__le32 number;
-	/* Only meaningful for some types of resources */
-	__le32 logical_id;
-	/* Only meaningful for some types of resources */
-	__le32 phys_id;
-	__le64 rsvd1;
-	__le64 rsvd2;
-};
-
 /* Disable RXEN (direct 0x000C) */
 struct ixgbe_aci_cmd_disable_rxen {
 	u8 lport_num;
@@ -960,55 +791,6 @@ struct ixgbe_aci_cmd_nvm_comp_tbl {
 	u8 cvs[]; /* Component Version String */
 } __packed;
 
-/**
- * struct ixgbe_aci_desc - Admin Command (AC) descriptor
- * @flags: IXGBE_ACI_FLAG_* flags
- * @opcode: Admin command opcode
- * @datalen: length in bytes of indirect/external data buffer
- * @retval: return value from firmware
- * @cookie_high: opaque data high-half
- * @cookie_low: opaque data low-half
- * @params: command-specific parameters
- *
- * Descriptor format for commands the driver posts via the
- * Admin Command Interface (ACI).
- * The firmware writes back onto the command descriptor and returns
- * the result of the command. Asynchronous events that are not an immediate
- * result of the command are written to the Admin Command Interface (ACI) using
- * the same descriptor format. Descriptors are in little-endian notation with
- * 32-bit words.
- */
-struct ixgbe_aci_desc {
-	__le16 flags;
-	__le16 opcode;
-	__le16 datalen;
-	__le16 retval;
-	__le32 cookie_high;
-	__le32 cookie_low;
-	union {
-		u8 raw[16];
-		struct ixgbe_aci_cmd_get_ver get_ver;
-		struct ixgbe_aci_cmd_driver_ver driver_ver;
-		struct ixgbe_aci_cmd_get_exp_err exp_err;
-		struct ixgbe_aci_cmd_req_res res_owner;
-		struct ixgbe_aci_cmd_list_caps get_cap;
-		struct ixgbe_aci_cmd_disable_rxen disable_rxen;
-		struct ixgbe_aci_cmd_get_phy_caps get_phy;
-		struct ixgbe_aci_cmd_set_phy_cfg set_phy;
-		struct ixgbe_aci_cmd_restart_an restart_an;
-		struct ixgbe_aci_cmd_get_link_status get_link_status;
-		struct ixgbe_aci_cmd_set_event_mask set_event_mask;
-		struct ixgbe_aci_cmd_set_port_id_led set_port_id_led;
-		struct ixgbe_aci_cmd_get_link_topo get_link_topo;
-		struct ixgbe_aci_cmd_get_link_topo_pin get_link_topo_pin;
-		struct ixgbe_aci_cmd_sff_eeprom read_write_sff_param;
-		struct ixgbe_aci_cmd_nvm nvm;
-		struct ixgbe_aci_cmd_nvm_checksum nvm_checksum;
-		struct ixgbe_aci_cmd_nvm_pkg_data pkg_data;
-		struct ixgbe_aci_cmd_nvm_pass_comp_tbl pass_comp_tbl;
-	} params;
-};
-
 /* E610-specific adapter context structures */
 
 struct ixgbe_link_status {
@@ -1172,7 +954,7 @@ struct ixgbe_hw_dev_caps {
 
 /* ACI event information */
 struct ixgbe_aci_event {
-	struct ixgbe_aci_desc desc;
+	struct libie_aq_desc desc;
 	u8 *msg_buf;
 	u16 msg_len;
 	u16 buf_len;
@@ -1180,7 +962,7 @@ struct ixgbe_aci_event {
 
 struct ixgbe_aci_info {
 	struct mutex lock;		/* admin command interface lock */
-	enum ixgbe_aci_err last_status;	/* last status of sent admin command */
+	enum libie_aq_err last_status;	/* last status of sent admin command */
 };
 
 enum ixgbe_bank_select {
diff --git a/include/linux/net/intel/libie/adminq.h b/include/linux/net/intel/libie/adminq.h
index 3676adc33d3e..b8079e7d842a 100644
--- a/include/linux/net/intel/libie/adminq.h
+++ b/include/linux/net/intel/libie/adminq.h
@@ -146,8 +146,10 @@ LIBIE_CHECK_STRUCT_LEN(16, libie_aqc_list_caps);
 
 /* Device/Function buffer entry, repeated per reported capability */
 #define LIBIE_AQC_CAPS_VALID_FUNCTIONS			0x0005
+#define LIBIE_AQC_MAX_VALID_FUNCTIONS			0x8
 #define LIBIE_AQC_CAPS_SRIOV				0x0012
 #define LIBIE_AQC_CAPS_VF				0x0013
+#define LIBIE_AQC_CAPS_VMDQ				0x0014
 #define LIBIE_AQC_CAPS_VSI				0x0017
 #define LIBIE_AQC_CAPS_DCB				0x0018
 #define LIBIE_AQC_CAPS_RSS				0x0040
@@ -165,9 +167,15 @@ LIBIE_CHECK_STRUCT_LEN(16, libie_aqc_list_caps);
 #define LIBIE_AQC_CAPS_PENDING_NET_VER			0x004D
 #define LIBIE_AQC_CAPS_RDMA				0x0051
 #define LIBIE_AQC_CAPS_SENSOR_READING			0x0067
+#define LIBIE_AQC_INLINE_IPSEC				0x0070
+#define LIBIE_AQC_CAPS_NUM_ENABLED_PORTS		0x0072
 #define LIBIE_AQC_CAPS_PCIE_RESET_AVOIDANCE		0x0076
 #define LIBIE_AQC_CAPS_POST_UPDATE_RESET_RESTRICT	0x0077
 #define LIBIE_AQC_CAPS_NVM_MGMT				0x0080
+#define LIBIE_AQC_CAPS_EXT_TOPO_DEV_IMG0		0x0081
+#define LIBIE_AQC_CAPS_EXT_TOPO_DEV_IMG1		0x0082
+#define LIBIE_AQC_CAPS_EXT_TOPO_DEV_IMG2		0x0083
+#define LIBIE_AQC_CAPS_EXT_TOPO_DEV_IMG3		0x0084
 #define LIBIE_AQC_CAPS_TX_SCHED_TOPO_COMP_MODE		0x0085
 #define LIBIE_AQC_CAPS_NAC_TOPOLOGY			0x0087
 #define LIBIE_AQC_CAPS_FW_LAG_SUPPORT			0x0092
@@ -236,13 +244,21 @@ LIBIE_CHECK_STRUCT_LEN(32, libie_aq_desc);
 /* FW defined boundary for a large buffer, 4k >= Large buffer > 512 bytes */
 #define LIBIE_AQ_LG_BUF				512
 
+/* Flags sub-structure
+ * |0  |1  |2  |3  |4  |5  |6  |7  |8  |9  |10 |11 |12 |13 |14 |15 |
+ * |DD |CMP|ERR|VFE| * *  RESERVED * * |LB |RD |VFC|BUF|SI |EI |FE |
+ */
 #define LIBIE_AQ_FLAG_DD			BIT(0)	/* 0x1    */
 #define LIBIE_AQ_FLAG_CMP			BIT(1)	/* 0x2    */
 #define LIBIE_AQ_FLAG_ERR			BIT(2)	/* 0x4    */
+#define LIBIE_AQ_FLAG_VFE			BIT(3)	/* 0x8    */
 #define LIBIE_AQ_FLAG_LB			BIT(9)	/* 0x200  */
 #define LIBIE_AQ_FLAG_RD			BIT(10)	/* 0x400  */
+#define LIBIE_AQ_FLAG_VFC			BIT(11) /* 0x800  */
 #define LIBIE_AQ_FLAG_BUF			BIT(12)	/* 0x1000 */
 #define LIBIE_AQ_FLAG_SI			BIT(13)	/* 0x2000 */
+#define LIBIE_AQ_FLAG_EI			BIT(14)	/* 0x4000 */
+#define LIBIE_AQ_FLAG_FE			BIT(15)	/* 0x8000 */
 
 /* error codes */
 enum libie_aq_err {
-- 
cgit v1.2.3


From b46012a20006a689529b6b51e05a8ad5320f7e7c Mon Sep 17 00:00:00 2001
From: Michal Swiatkowski <michal.swiatkowski@linux.intel.com>
Date: Fri, 25 Apr 2025 08:08:04 +0200
Subject: i40e: use libie adminq descriptors

Use libie_aq_desc instead of i40e_aq_desc. Make the changes needed to
allow a clean build.

The get version descriptor is a little less detailed on i40e. To avoid
messing with shifting or a union inside the libie descriptor, keep using
the get version descriptor from i40e.

Move additional caps for i40e to libie.

Fix RCT (reverse Christmas tree) order in declarations using libie_aq_desc.

Use libie_aq_raw() wherever it can be used.

The libie aq error enum is extended; cover the new values in the ice
driver just to keep the build clean. In the following patches the libie
code for that will be used in each Intel driver.

Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Signed-off-by: Michal Swiatkowski <michal.swiatkowski@linux.intel.com>
Tested-by: Rinitha S <sx.rinitha@intel.com> (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_adminq.c     |  68 +-
 drivers/net/ethernet/intel/i40e/i40e_adminq.h     |  12 +-
 drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h | 155 +----
 drivers/net/ethernet/intel/i40e/i40e_common.c     | 754 +++++++++++-----------
 drivers/net/ethernet/intel/i40e/i40e_dcb.c        |  10 +-
 drivers/net/ethernet/intel/i40e/i40e_debugfs.c    |  46 +-
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c    |  14 +-
 drivers/net/ethernet/intel/i40e/i40e_main.c       |  31 +-
 drivers/net/ethernet/intel/i40e/i40e_nvm.c        |  16 +-
 drivers/net/ethernet/intel/i40e/i40e_prototype.h  |  16 +-
 drivers/net/ethernet/intel/i40e/i40e_type.h       |   6 +-
 drivers/net/ethernet/intel/ice/ice_main.c         |   4 +
 include/linux/net/intel/libie/adminq.h            |  17 +
 13 files changed, 499 insertions(+), 650 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq.c b/drivers/net/ethernet/intel/i40e/i40e_adminq.c
index 175c1320c143..096ec46bb619 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_adminq.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_adminq.c
@@ -18,7 +18,7 @@ static int i40e_alloc_adminq_asq_ring(struct i40e_hw *hw)
 
 	ret_code = i40e_allocate_dma_mem(hw, &hw->aq.asq.desc_buf,
 					 (hw->aq.num_asq_entries *
-					 sizeof(struct i40e_aq_desc)),
+					 sizeof(struct libie_aq_desc)),
 					 I40E_ADMINQ_DESC_ALIGNMENT);
 	if (ret_code)
 		return ret_code;
@@ -44,7 +44,7 @@ static int i40e_alloc_adminq_arq_ring(struct i40e_hw *hw)
 
 	ret_code = i40e_allocate_dma_mem(hw, &hw->aq.arq.desc_buf,
 					 (hw->aq.num_arq_entries *
-					 sizeof(struct i40e_aq_desc)),
+					 sizeof(struct libie_aq_desc)),
 					 I40E_ADMINQ_DESC_ALIGNMENT);
 
 	return ret_code;
@@ -80,7 +80,7 @@ static void i40e_free_adminq_arq(struct i40e_hw *hw)
  **/
 static int i40e_alloc_arq_bufs(struct i40e_hw *hw)
 {
-	struct i40e_aq_desc *desc;
+	struct libie_aq_desc *desc;
 	struct i40e_dma_mem *bi;
 	int ret_code;
 	int i;
@@ -108,9 +108,9 @@ static int i40e_alloc_arq_bufs(struct i40e_hw *hw)
 		/* now configure the descriptors for use */
 		desc = I40E_ADMINQ_DESC(hw->aq.arq, i);
 
-		desc->flags = cpu_to_le16(I40E_AQ_FLAG_BUF);
+		desc->flags = cpu_to_le16(LIBIE_AQ_FLAG_BUF);
 		if (hw->aq.arq_buf_size > I40E_AQ_LARGE_BUF)
-			desc->flags |= cpu_to_le16(I40E_AQ_FLAG_LB);
+			desc->flags |= cpu_to_le16(LIBIE_AQ_FLAG_LB);
 		desc->opcode = 0;
 		/* This is in accordance with Admin queue design, there is no
 		 * register for buffer size configuration
@@ -119,12 +119,12 @@ static int i40e_alloc_arq_bufs(struct i40e_hw *hw)
 		desc->retval = 0;
 		desc->cookie_high = 0;
 		desc->cookie_low = 0;
-		desc->params.external.addr_high =
+		desc->params.generic.addr_high =
 			cpu_to_le32(upper_32_bits(bi->pa));
-		desc->params.external.addr_low =
+		desc->params.generic.addr_low =
 			cpu_to_le32(lower_32_bits(bi->pa));
-		desc->params.external.param0 = 0;
-		desc->params.external.param1 = 0;
+		desc->params.generic.param0 = 0;
+		desc->params.generic.param1 = 0;
 	}
 
 alloc_arq_bufs:
@@ -691,8 +691,8 @@ static u16 i40e_clean_asq(struct i40e_hw *hw)
 	struct i40e_adminq_ring *asq = &(hw->aq.asq);
 	struct i40e_asq_cmd_details *details;
 	u16 ntc = asq->next_to_clean;
-	struct i40e_aq_desc desc_cb;
-	struct i40e_aq_desc *desc;
+	struct libie_aq_desc desc_cb;
+	struct libie_aq_desc *desc;
 
 	desc = I40E_ADMINQ_DESC(*asq, ntc);
 	details = I40E_ADMINQ_DETAILS(*asq, ntc);
@@ -750,7 +750,7 @@ static bool i40e_asq_done(struct i40e_hw *hw)
  **/
 static int
 i40e_asq_send_command_atomic_exec(struct i40e_hw *hw,
-				  struct i40e_aq_desc *desc,
+				  struct libie_aq_desc *desc,
 				  void *buff, /* can be NULL */
 				  u16  buff_size,
 				  struct i40e_asq_cmd_details *cmd_details,
@@ -758,7 +758,7 @@ i40e_asq_send_command_atomic_exec(struct i40e_hw *hw,
 {
 	struct i40e_dma_mem *dma_buff = NULL;
 	struct i40e_asq_cmd_details *details;
-	struct i40e_aq_desc *desc_on_ring;
+	struct libie_aq_desc *desc_on_ring;
 	bool cmd_completed = false;
 	u16  retval = 0;
 	int status = 0;
@@ -771,7 +771,7 @@ i40e_asq_send_command_atomic_exec(struct i40e_hw *hw,
 		goto asq_send_command_error;
 	}
 
-	hw->aq.asq_last_status = I40E_AQ_RC_OK;
+	hw->aq.asq_last_status = LIBIE_AQ_RC_OK;
 
 	val = rd32(hw, I40E_PF_ATQH);
 	if (val >= hw->aq.num_asq_entries) {
@@ -851,9 +851,9 @@ i40e_asq_send_command_atomic_exec(struct i40e_hw *hw,
 		/* Update the address values in the desc with the pa value
 		 * for respective buffer
 		 */
-		desc_on_ring->params.external.addr_high =
+		desc_on_ring->params.generic.addr_high =
 				cpu_to_le32(upper_32_bits(dma_buff->pa));
-		desc_on_ring->params.external.addr_low =
+		desc_on_ring->params.generic.addr_low =
 				cpu_to_le32(lower_32_bits(dma_buff->pa));
 	}
 
@@ -905,13 +905,13 @@ i40e_asq_send_command_atomic_exec(struct i40e_hw *hw,
 			retval &= 0xff;
 		}
 		cmd_completed = true;
-		if ((enum i40e_admin_queue_err)retval == I40E_AQ_RC_OK)
+		if ((enum libie_aq_err)retval == LIBIE_AQ_RC_OK)
 			status = 0;
-		else if ((enum i40e_admin_queue_err)retval == I40E_AQ_RC_EBUSY)
+		else if ((enum libie_aq_err)retval == LIBIE_AQ_RC_EBUSY)
 			status = -EBUSY;
 		else
 			status = -EIO;
-		hw->aq.asq_last_status = (enum i40e_admin_queue_err)retval;
+		hw->aq.asq_last_status = (enum libie_aq_err)retval;
 	}
 
 	i40e_debug(hw, I40E_DEBUG_AQ_COMMAND,
@@ -954,7 +954,7 @@ asq_send_command_error:
  **/
 int
 i40e_asq_send_command_atomic(struct i40e_hw *hw,
-			     struct i40e_aq_desc *desc,
+			     struct libie_aq_desc *desc,
 			     void *buff, /* can be NULL */
 			     u16  buff_size,
 			     struct i40e_asq_cmd_details *cmd_details,
@@ -972,7 +972,7 @@ i40e_asq_send_command_atomic(struct i40e_hw *hw,
 }
 
 int
-i40e_asq_send_command(struct i40e_hw *hw, struct i40e_aq_desc *desc,
+i40e_asq_send_command(struct i40e_hw *hw, struct libie_aq_desc *desc,
 		      void *buff, /* can be NULL */ u16  buff_size,
 		      struct i40e_asq_cmd_details *cmd_details)
 {
@@ -996,12 +996,12 @@ i40e_asq_send_command(struct i40e_hw *hw, struct i40e_aq_desc *desc,
  **/
 int
 i40e_asq_send_command_atomic_v2(struct i40e_hw *hw,
-				struct i40e_aq_desc *desc,
+				struct libie_aq_desc *desc,
 				void *buff, /* can be NULL */
 				u16  buff_size,
 				struct i40e_asq_cmd_details *cmd_details,
 				bool is_atomic_context,
-				enum i40e_admin_queue_err *aq_status)
+				enum libie_aq_err *aq_status)
 {
 	int status;
 
@@ -1023,13 +1023,13 @@ i40e_asq_send_command_atomic_v2(struct i40e_hw *hw,
  *
  *  Fill the desc with default values
  **/
-void i40e_fill_default_direct_cmd_desc(struct i40e_aq_desc *desc,
+void i40e_fill_default_direct_cmd_desc(struct libie_aq_desc *desc,
 				       u16 opcode)
 {
 	/* zero out the desc */
-	memset((void *)desc, 0, sizeof(struct i40e_aq_desc));
+	memset((void *)desc, 0, sizeof(struct libie_aq_desc));
 	desc->opcode = cpu_to_le16(opcode);
-	desc->flags = cpu_to_le16(I40E_AQ_FLAG_SI);
+	desc->flags = cpu_to_le16(LIBIE_AQ_FLAG_SI);
 }
 
 /**
@@ -1047,7 +1047,7 @@ int i40e_clean_arq_element(struct i40e_hw *hw,
 			   u16 *pending)
 {
 	u16 ntc = hw->aq.arq.next_to_clean;
-	struct i40e_aq_desc *desc;
+	struct libie_aq_desc *desc;
 	struct i40e_dma_mem *bi;
 	int ret_code = 0;
 	u16 desc_idx;
@@ -1081,9 +1081,9 @@ int i40e_clean_arq_element(struct i40e_hw *hw,
 	desc_idx = ntc;
 
 	hw->aq.arq_last_status =
-		(enum i40e_admin_queue_err)le16_to_cpu(desc->retval);
+		(enum libie_aq_err)le16_to_cpu(desc->retval);
 	flags = le16_to_cpu(desc->flags);
-	if (flags & I40E_AQ_FLAG_ERR) {
+	if (flags & LIBIE_AQ_FLAG_ERR) {
 		ret_code = -EIO;
 		i40e_debug(hw,
 			   I40E_DEBUG_AQ_MESSAGE,
@@ -1107,14 +1107,14 @@ int i40e_clean_arq_element(struct i40e_hw *hw,
 	 * size
 	 */
 	bi = &hw->aq.arq.r.arq_bi[ntc];
-	memset((void *)desc, 0, sizeof(struct i40e_aq_desc));
+	memset((void *)desc, 0, sizeof(struct libie_aq_desc));
 
-	desc->flags = cpu_to_le16(I40E_AQ_FLAG_BUF);
+	desc->flags = cpu_to_le16(LIBIE_AQ_FLAG_BUF);
 	if (hw->aq.arq_buf_size > I40E_AQ_LARGE_BUF)
-		desc->flags |= cpu_to_le16(I40E_AQ_FLAG_LB);
+		desc->flags |= cpu_to_le16(LIBIE_AQ_FLAG_LB);
 	desc->datalen = cpu_to_le16((u16)bi->size);
-	desc->params.external.addr_high = cpu_to_le32(upper_32_bits(bi->pa));
-	desc->params.external.addr_low = cpu_to_le32(lower_32_bits(bi->pa));
+	desc->params.generic.addr_high = cpu_to_le32(upper_32_bits(bi->pa));
+	desc->params.generic.addr_low = cpu_to_le32(lower_32_bits(bi->pa));
 
 	/* set tail = the last cleaned desc index. */
 	wr32(hw, I40E_PF_ARQT, ntc);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq.h b/drivers/net/ethernet/intel/i40e/i40e_adminq.h
index 55b5bb884d73..1be97a3a86ce 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_adminq.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_adminq.h
@@ -9,7 +9,7 @@
 #include "i40e_adminq_cmd.h"
 
 #define I40E_ADMINQ_DESC(R, i)   \
-	(&(((struct i40e_aq_desc *)((R).desc_buf.va))[i]))
+	(&(((struct libie_aq_desc *)((R).desc_buf.va))[i]))
 
 #define I40E_ADMINQ_DESC_ALIGNMENT 4096
 
@@ -39,7 +39,7 @@ struct i40e_asq_cmd_details {
 	u16 flags_dis;
 	bool async;
 	bool postpone;
-	struct i40e_aq_desc *wb_desc;
+	struct libie_aq_desc *wb_desc;
 };
 
 #define I40E_ADMINQ_DETAILS(R, i)   \
@@ -47,7 +47,7 @@ struct i40e_asq_cmd_details {
 
 /* ARQ event information */
 struct i40e_arq_event_info {
-	struct i40e_aq_desc desc;
+	struct libie_aq_desc desc;
 	u16 msg_len;
 	u16 buf_len;
 	u8 *msg_buf;
@@ -72,8 +72,8 @@ struct i40e_adminq_info {
 	struct mutex arq_mutex; /* Receive queue lock */
 
 	/* last status values on send and receive queues */
-	enum i40e_admin_queue_err asq_last_status;
-	enum i40e_admin_queue_err arq_last_status;
+	enum libie_aq_err asq_last_status;
+	enum libie_aq_err arq_last_status;
 };
 
 /**
@@ -119,7 +119,7 @@ static inline int i40e_aq_rc_to_posix(int aq_ret, int aq_rc)
 #define I40E_AQ_LARGE_BUF	512
 #define I40E_ASQ_CMD_TIMEOUT	250000  /* usecs */
 
-void i40e_fill_default_direct_cmd_desc(struct i40e_aq_desc *desc,
+void i40e_fill_default_direct_cmd_desc(struct libie_aq_desc *desc,
 				       u16 opcode);
 
 #endif /* _I40E_ADMINQ_H_ */
diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
index c8f35d4de271..76d872b91a38 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
@@ -4,6 +4,8 @@
 #ifndef _I40E_ADMINQ_CMD_H_
 #define _I40E_ADMINQ_CMD_H_
 
+#include <linux/net/intel/libie/adminq.h>
+
 #include <linux/bits.h>
 #include <linux/types.h>
 
@@ -30,75 +32,6 @@
 /* API version 1.10 for X722 devices adds ability to request FEC encoding */
 #define I40E_MINOR_VER_FW_REQUEST_FEC_X722 0x000A
 
-struct i40e_aq_desc {
-	__le16 flags;
-	__le16 opcode;
-	__le16 datalen;
-	__le16 retval;
-	__le32 cookie_high;
-	__le32 cookie_low;
-	union {
-		struct {
-			__le32 param0;
-			__le32 param1;
-			__le32 param2;
-			__le32 param3;
-		} internal;
-		struct {
-			__le32 param0;
-			__le32 param1;
-			__le32 addr_high;
-			__le32 addr_low;
-		} external;
-		u8 raw[16];
-	} params;
-};
-
-/* Flags sub-structure
- * |0  |1  |2  |3  |4  |5  |6  |7  |8  |9  |10 |11 |12 |13 |14 |15 |
- * |DD |CMP|ERR|VFE| * *  RESERVED * * |LB |RD |VFC|BUF|SI |EI |FE |
- */
-
-/* command flags and offsets*/
-#define I40E_AQ_FLAG_ERR_SHIFT	2
-#define I40E_AQ_FLAG_LB_SHIFT	9
-#define I40E_AQ_FLAG_RD_SHIFT	10
-#define I40E_AQ_FLAG_BUF_SHIFT	12
-#define I40E_AQ_FLAG_SI_SHIFT	13
-
-#define I40E_AQ_FLAG_ERR	BIT(I40E_AQ_FLAG_ERR_SHIFT) /* 0x4    */
-#define I40E_AQ_FLAG_LB		BIT(I40E_AQ_FLAG_LB_SHIFT)  /* 0x200  */
-#define I40E_AQ_FLAG_RD		BIT(I40E_AQ_FLAG_RD_SHIFT)  /* 0x400  */
-#define I40E_AQ_FLAG_BUF	BIT(I40E_AQ_FLAG_BUF_SHIFT) /* 0x1000 */
-#define I40E_AQ_FLAG_SI		BIT(I40E_AQ_FLAG_SI_SHIFT)  /* 0x2000 */
-
-/* error codes */
-enum i40e_admin_queue_err {
-	I40E_AQ_RC_OK		= 0,  /* success */
-	I40E_AQ_RC_EPERM	= 1,  /* Operation not permitted */
-	I40E_AQ_RC_ENOENT	= 2,  /* No such element */
-	I40E_AQ_RC_ESRCH	= 3,  /* Bad opcode */
-	I40E_AQ_RC_EINTR	= 4,  /* operation interrupted */
-	I40E_AQ_RC_EIO		= 5,  /* I/O error */
-	I40E_AQ_RC_ENXIO	= 6,  /* No such resource */
-	I40E_AQ_RC_E2BIG	= 7,  /* Arg too long */
-	I40E_AQ_RC_EAGAIN	= 8,  /* Try again */
-	I40E_AQ_RC_ENOMEM	= 9,  /* Out of memory */
-	I40E_AQ_RC_EACCES	= 10, /* Permission denied */
-	I40E_AQ_RC_EFAULT	= 11, /* Bad address */
-	I40E_AQ_RC_EBUSY	= 12, /* Device or resource busy */
-	I40E_AQ_RC_EEXIST	= 13, /* object already exists */
-	I40E_AQ_RC_EINVAL	= 14, /* Invalid argument */
-	I40E_AQ_RC_ENOTTY	= 15, /* Not a typewriter */
-	I40E_AQ_RC_ENOSPC	= 16, /* No space left or alloc failure */
-	I40E_AQ_RC_ENOSYS	= 17, /* Function not implemented */
-	I40E_AQ_RC_ERANGE	= 18, /* Parameter out of range */
-	I40E_AQ_RC_EFLUSHED	= 19, /* Cmd flushed due to prev cmd error */
-	I40E_AQ_RC_BAD_ADDR	= 20, /* Descriptor contains a bad pointer */
-	I40E_AQ_RC_EMODE	= 21, /* Op not allowed in current dev mode */
-	I40E_AQ_RC_EFBIG	= 22, /* File too large */
-};
-
 /* Admin Queue command opcodes */
 enum i40e_admin_queue_opc {
 	/* aq commands */
@@ -320,21 +253,6 @@ struct i40e_aqc_get_version {
 	__le16 api_minor;
 };
 
-I40E_CHECK_CMD_LENGTH(i40e_aqc_get_version);
-
-/* Send driver version (indirect 0x0002) */
-struct i40e_aqc_driver_version {
-	u8	driver_major_ver;
-	u8	driver_minor_ver;
-	u8	driver_build_ver;
-	u8	driver_subbuild_ver;
-	u8	reserved[4];
-	__le32	address_high;
-	__le32	address_low;
-};
-
-I40E_CHECK_CMD_LENGTH(i40e_aqc_driver_version);
-
 /* Queue Shutdown (direct 0x0003) */
 struct i40e_aqc_queue_shutdown {
 	__le32	driver_unloading;
@@ -352,75 +270,6 @@ struct i40e_aqc_set_pf_context {
 
 I40E_CHECK_CMD_LENGTH(i40e_aqc_set_pf_context);
 
-/* Request resource ownership (direct 0x0008)
- * Release resource ownership (direct 0x0009)
- */
-struct i40e_aqc_request_resource {
-	__le16	resource_id;
-	__le16	access_type;
-	__le32	timeout;
-	__le32	resource_number;
-	u8	reserved[4];
-};
-
-I40E_CHECK_CMD_LENGTH(i40e_aqc_request_resource);
-
-/* Get function capabilities (indirect 0x000A)
- * Get device capabilities (indirect 0x000B)
- */
-struct i40e_aqc_list_capabilites {
-	u8 command_flags;
-	u8 pf_index;
-	u8 reserved[2];
-	__le32 count;
-	__le32 addr_high;
-	__le32 addr_low;
-};
-
-I40E_CHECK_CMD_LENGTH(i40e_aqc_list_capabilites);
-
-struct i40e_aqc_list_capabilities_element_resp {
-	__le16	id;
-	u8	major_rev;
-	u8	minor_rev;
-	__le32	number;
-	__le32	logical_id;
-	__le32	phys_id;
-	u8	reserved[16];
-};
-
-/* list of caps */
-
-#define I40E_AQ_CAP_ID_SWITCH_MODE	0x0001
-#define I40E_AQ_CAP_ID_MNG_MODE		0x0002
-#define I40E_AQ_CAP_ID_NPAR_ACTIVE	0x0003
-#define I40E_AQ_CAP_ID_OS2BMC_CAP	0x0004
-#define I40E_AQ_CAP_ID_FUNCTIONS_VALID	0x0005
-#define I40E_AQ_CAP_ID_SRIOV		0x0012
-#define I40E_AQ_CAP_ID_VF		0x0013
-#define I40E_AQ_CAP_ID_VMDQ		0x0014
-#define I40E_AQ_CAP_ID_8021QBG		0x0015
-#define I40E_AQ_CAP_ID_8021QBR		0x0016
-#define I40E_AQ_CAP_ID_VSI		0x0017
-#define I40E_AQ_CAP_ID_DCB		0x0018
-#define I40E_AQ_CAP_ID_FCOE		0x0021
-#define I40E_AQ_CAP_ID_ISCSI		0x0022
-#define I40E_AQ_CAP_ID_RSS		0x0040
-#define I40E_AQ_CAP_ID_RXQ		0x0041
-#define I40E_AQ_CAP_ID_TXQ		0x0042
-#define I40E_AQ_CAP_ID_MSIX		0x0043
-#define I40E_AQ_CAP_ID_VF_MSIX		0x0044
-#define I40E_AQ_CAP_ID_FLOW_DIRECTOR	0x0045
-#define I40E_AQ_CAP_ID_1588		0x0046
-#define I40E_AQ_CAP_ID_IWARP		0x0051
-#define I40E_AQ_CAP_ID_LED		0x0061
-#define I40E_AQ_CAP_ID_SDP		0x0062
-#define I40E_AQ_CAP_ID_MDIO		0x0063
-#define I40E_AQ_CAP_ID_WSR_PROT		0x0064
-#define I40E_AQ_CAP_ID_NVM_MGMT		0x0080
-#define I40E_AQ_CAP_ID_FLEX10		0x00F1
-#define I40E_AQ_CAP_ID_CEM		0x00F2
-
 /* Set CPPM Configuration (direct 0x0103) */
 struct i40e_aqc_cppm_configuration {
 	__le16	command_flags;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c
index b11c35e307ca..75074611285a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_common.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_common.c
@@ -73,55 +73,47 @@ int i40e_set_mac_type(struct i40e_hw *hw)
  * @hw: pointer to the HW structure
  * @aq_err: the AQ error code to convert
  **/
-const char *i40e_aq_str(struct i40e_hw *hw, enum i40e_admin_queue_err aq_err)
+const char *i40e_aq_str(struct i40e_hw *hw, enum libie_aq_err aq_err)
 {
 	switch (aq_err) {
-	case I40E_AQ_RC_OK:
+	case LIBIE_AQ_RC_OK:
 		return "OK";
-	case I40E_AQ_RC_EPERM:
-		return "I40E_AQ_RC_EPERM";
-	case I40E_AQ_RC_ENOENT:
-		return "I40E_AQ_RC_ENOENT";
-	case I40E_AQ_RC_ESRCH:
-		return "I40E_AQ_RC_ESRCH";
-	case I40E_AQ_RC_EINTR:
-		return "I40E_AQ_RC_EINTR";
-	case I40E_AQ_RC_EIO:
-		return "I40E_AQ_RC_EIO";
-	case I40E_AQ_RC_ENXIO:
-		return "I40E_AQ_RC_ENXIO";
-	case I40E_AQ_RC_E2BIG:
-		return "I40E_AQ_RC_E2BIG";
-	case I40E_AQ_RC_EAGAIN:
-		return "I40E_AQ_RC_EAGAIN";
-	case I40E_AQ_RC_ENOMEM:
-		return "I40E_AQ_RC_ENOMEM";
-	case I40E_AQ_RC_EACCES:
-		return "I40E_AQ_RC_EACCES";
-	case I40E_AQ_RC_EFAULT:
-		return "I40E_AQ_RC_EFAULT";
-	case I40E_AQ_RC_EBUSY:
-		return "I40E_AQ_RC_EBUSY";
-	case I40E_AQ_RC_EEXIST:
-		return "I40E_AQ_RC_EEXIST";
-	case I40E_AQ_RC_EINVAL:
-		return "I40E_AQ_RC_EINVAL";
-	case I40E_AQ_RC_ENOTTY:
-		return "I40E_AQ_RC_ENOTTY";
-	case I40E_AQ_RC_ENOSPC:
-		return "I40E_AQ_RC_ENOSPC";
-	case I40E_AQ_RC_ENOSYS:
-		return "I40E_AQ_RC_ENOSYS";
-	case I40E_AQ_RC_ERANGE:
-		return "I40E_AQ_RC_ERANGE";
-	case I40E_AQ_RC_EFLUSHED:
-		return "I40E_AQ_RC_EFLUSHED";
-	case I40E_AQ_RC_BAD_ADDR:
-		return "I40E_AQ_RC_BAD_ADDR";
-	case I40E_AQ_RC_EMODE:
-		return "I40E_AQ_RC_EMODE";
-	case I40E_AQ_RC_EFBIG:
-		return "I40E_AQ_RC_EFBIG";
+	case LIBIE_AQ_RC_EPERM:
+		return "LIBIE_AQ_RC_EPERM";
+	case LIBIE_AQ_RC_ENOENT:
+		return "LIBIE_AQ_RC_ENOENT";
+	case LIBIE_AQ_RC_ESRCH:
+		return "LIBIE_AQ_RC_ESRCH";
+	case LIBIE_AQ_RC_EIO:
+		return "LIBIE_AQ_RC_EIO";
+	case LIBIE_AQ_RC_EAGAIN:
+		return "LIBIE_AQ_RC_EAGAIN";
+	case LIBIE_AQ_RC_ENOMEM:
+		return "LIBIE_AQ_RC_ENOMEM";
+	case LIBIE_AQ_RC_EACCES:
+		return "LIBIE_AQ_RC_EACCES";
+	case LIBIE_AQ_RC_EBUSY:
+		return "LIBIE_AQ_RC_EBUSY";
+	case LIBIE_AQ_RC_EEXIST:
+		return "LIBIE_AQ_RC_EEXIST";
+	case LIBIE_AQ_RC_EINVAL:
+		return "LIBIE_AQ_RC_EINVAL";
+	case LIBIE_AQ_RC_ENOSPC:
+		return "LIBIE_AQ_RC_ENOSPC";
+	case LIBIE_AQ_RC_ENOSYS:
+		return "LIBIE_AQ_RC_ENOSYS";
+	case LIBIE_AQ_RC_EMODE:
+		return "LIBIE_AQ_RC_EMODE";
+	case LIBIE_AQ_RC_ENOSEC:
+		return "LIBIE_AQ_RC_ENOSEC";
+	case LIBIE_AQ_RC_EBADSIG:
+		return "LIBIE_AQ_RC_EBADSIG";
+	case LIBIE_AQ_RC_ESVN:
+		return "LIBIE_AQ_RC_ESVN";
+	case LIBIE_AQ_RC_EBADMAN:
+		return "LIBIE_AQ_RC_EBADMAN";
+	case LIBIE_AQ_RC_EBADBUF:
+		return "LIBIE_AQ_RC_EBADBUF";
 	}
 
 	snprintf(hw->err_str, sizeof(hw->err_str), "%d", aq_err);
@@ -141,7 +133,7 @@ const char *i40e_aq_str(struct i40e_hw *hw, enum i40e_admin_queue_err aq_err)
 void i40e_debug_aq(struct i40e_hw *hw, enum i40e_debug_mask mask, void *desc,
 		   void *buffer, u16 buf_len)
 {
-	struct i40e_aq_desc *aq_desc = (struct i40e_aq_desc *)desc;
+	struct libie_aq_desc *aq_desc = (struct libie_aq_desc *)desc;
 	u32 effective_mask = hw->debug_mask & mask;
 	char prefix[27];
 	u16 len;
@@ -164,12 +156,12 @@ void i40e_debug_aq(struct i40e_hw *hw, enum i40e_debug_mask mask, void *desc,
 		   le32_to_cpu(aq_desc->cookie_low));
 	i40e_debug(hw, mask & I40E_DEBUG_AQ_DESCRIPTOR,
 		   "\tparam (0,1)  0x%08X 0x%08X\n",
-		   le32_to_cpu(aq_desc->params.internal.param0),
-		   le32_to_cpu(aq_desc->params.internal.param1));
+		   le32_to_cpu(aq_desc->params.generic.param0),
+		   le32_to_cpu(aq_desc->params.generic.param1));
 	i40e_debug(hw, mask & I40E_DEBUG_AQ_DESCRIPTOR,
 		   "\taddr (h,l)   0x%08X 0x%08X\n",
-		   le32_to_cpu(aq_desc->params.external.addr_high),
-		   le32_to_cpu(aq_desc->params.external.addr_low));
+		   le32_to_cpu(aq_desc->params.generic.addr_high),
+		   le32_to_cpu(aq_desc->params.generic.addr_low));
 
 	if (buffer && buf_len != 0 && len != 0 &&
 	    (effective_mask & I40E_DEBUG_AQ_DESC_BUFFER)) {
@@ -214,14 +206,14 @@ bool i40e_check_asq_alive(struct i40e_hw *hw)
 int i40e_aq_queue_shutdown(struct i40e_hw *hw,
 			   bool unloading)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_queue_shutdown *cmd =
-		(struct i40e_aqc_queue_shutdown *)&desc.params.raw;
+	struct i40e_aqc_queue_shutdown *cmd;
+	struct libie_aq_desc desc;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc,
 					  i40e_aqc_opc_queue_shutdown);
 
+	cmd = libie_aq_raw(&desc);
 	if (unloading)
 		cmd->driver_unloading = cpu_to_le32(I40E_AQ_DRIVER_UNLOADING);
 	status = i40e_asq_send_command(hw, &desc, NULL, 0, NULL);
@@ -245,9 +237,8 @@ static int i40e_aq_get_set_rss_lut(struct i40e_hw *hw,
 				   u8 *lut, u16 lut_size,
 				   bool set)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_get_set_rss_lut *cmd_resp =
-		   (struct i40e_aqc_get_set_rss_lut *)&desc.params.raw;
+	struct i40e_aqc_get_set_rss_lut *cmd_resp;
+	struct libie_aq_desc desc;
 	int status;
 	u16 flags;
 
@@ -258,9 +249,10 @@ static int i40e_aq_get_set_rss_lut(struct i40e_hw *hw,
 		i40e_fill_default_direct_cmd_desc(&desc,
 						  i40e_aqc_opc_get_rss_lut);
 
+	cmd_resp = libie_aq_raw(&desc);
 	/* Indirect command */
-	desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_BUF);
-	desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_RD);
+	desc.flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_BUF);
+	desc.flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_RD);
 
 	vsi_id = FIELD_PREP(I40E_AQC_SET_RSS_LUT_VSI_ID_MASK, vsi_id) |
 		 FIELD_PREP(I40E_AQC_SET_RSS_LUT_VSI_VALID, 1);
@@ -326,10 +318,9 @@ static int i40e_aq_get_set_rss_key(struct i40e_hw *hw,
 				   struct i40e_aqc_get_set_rss_key_data *key,
 				   bool set)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_get_set_rss_key *cmd_resp =
-			(struct i40e_aqc_get_set_rss_key *)&desc.params.raw;
 	u16 key_size = sizeof(struct i40e_aqc_get_set_rss_key_data);
+	struct i40e_aqc_get_set_rss_key *cmd_resp;
+	struct libie_aq_desc desc;
 	int status;
 
 	if (set)
@@ -339,9 +330,10 @@ static int i40e_aq_get_set_rss_key(struct i40e_hw *hw,
 		i40e_fill_default_direct_cmd_desc(&desc,
 						  i40e_aqc_opc_get_rss_key);
 
+	cmd_resp = libie_aq_raw(&desc);
 	/* Indirect command */
-	desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_BUF);
-	desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_RD);
+	desc.flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_BUF);
+	desc.flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_RD);
 
 	vsi_id = FIELD_PREP(I40E_AQC_SET_RSS_KEY_VSI_ID_MASK, vsi_id) |
 		 FIELD_PREP(I40E_AQC_SET_RSS_KEY_VSI_VALID, 1);
@@ -439,13 +431,13 @@ i40e_aq_mac_address_read(struct i40e_hw *hw,
 			 struct i40e_aqc_mac_address_read_data *addrs,
 			 struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_mac_address_read *cmd_data =
-		(struct i40e_aqc_mac_address_read *)&desc.params.raw;
+	struct i40e_aqc_mac_address_read *cmd_data;
+	struct libie_aq_desc desc;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_mac_address_read);
-	desc.flags |= cpu_to_le16(I40E_AQ_FLAG_BUF);
+	cmd_data = libie_aq_raw(&desc);
+	desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_BUF);
 
 	status = i40e_asq_send_command(hw, &desc, addrs,
 				       sizeof(*addrs), cmd_details);
@@ -465,13 +457,13 @@ int i40e_aq_mac_address_write(struct i40e_hw *hw,
 			      u16 flags, u8 *mac_addr,
 			      struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_mac_address_write *cmd_data =
-		(struct i40e_aqc_mac_address_write *)&desc.params.raw;
+	struct i40e_aqc_mac_address_write *cmd_data;
+	struct libie_aq_desc desc;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc,
 					  i40e_aqc_opc_mac_address_write);
+	cmd_data = libie_aq_raw(&desc);
 	cmd_data->command_flags = cpu_to_le16(flags);
 	cmd_data->mac_sah = cpu_to_le16((u16)mac_addr[0] << 8 | mac_addr[1]);
 	cmd_data->mac_sal = cpu_to_le32(((u32)mac_addr[2] << 24) |
@@ -1061,7 +1053,7 @@ i40e_aq_get_phy_capabilities(struct i40e_hw *hw,
 {
 	u16 abilities_size = sizeof(struct i40e_aq_get_phy_abilities_resp);
 	u16 max_delay = I40E_MAX_PHY_TIMEOUT, total_delay = 0;
-	struct i40e_aq_desc desc;
+	struct libie_aq_desc desc;
 	int status;
 
 	if (!abilities)
@@ -1071,36 +1063,36 @@ i40e_aq_get_phy_capabilities(struct i40e_hw *hw,
 		i40e_fill_default_direct_cmd_desc(&desc,
 					       i40e_aqc_opc_get_phy_abilities);
 
-		desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_BUF);
+		desc.flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_BUF);
 		if (abilities_size > I40E_AQ_LARGE_BUF)
-			desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB);
+			desc.flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_LB);
 
 		if (qualified_modules)
-			desc.params.external.param0 |=
+			desc.params.generic.param0 |=
 			cpu_to_le32(I40E_AQ_PHY_REPORT_QUALIFIED_MODULES);
 
 		if (report_init)
-			desc.params.external.param0 |=
+			desc.params.generic.param0 |=
 			cpu_to_le32(I40E_AQ_PHY_REPORT_INITIAL_VALUES);
 
 		status = i40e_asq_send_command(hw, &desc, abilities,
 					       abilities_size, cmd_details);
 
 		switch (hw->aq.asq_last_status) {
-		case I40E_AQ_RC_EIO:
+		case LIBIE_AQ_RC_EIO:
 			status = -EIO;
 			break;
-		case I40E_AQ_RC_EAGAIN:
+		case LIBIE_AQ_RC_EAGAIN:
 			usleep_range(1000, 2000);
 			total_delay++;
 			status = -EIO;
 			break;
-		/* also covers I40E_AQ_RC_OK */
+		/* also covers LIBIE_AQ_RC_OK */
 		default:
 			break;
 		}
 
-	} while ((hw->aq.asq_last_status == I40E_AQ_RC_EAGAIN) &&
+	} while ((hw->aq.asq_last_status == LIBIE_AQ_RC_EAGAIN) &&
 		(total_delay < max_delay));
 
 	if (status)
@@ -1137,9 +1129,8 @@ int i40e_aq_set_phy_config(struct i40e_hw *hw,
 			   struct i40e_aq_set_phy_config *config,
 			   struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aq_set_phy_config *cmd =
-			(struct i40e_aq_set_phy_config *)&desc.params.raw;
+	struct i40e_aq_set_phy_config *cmd;
+	struct libie_aq_desc desc;
 	int status;
 
 	if (!config)
@@ -1148,6 +1139,7 @@ int i40e_aq_set_phy_config(struct i40e_hw *hw,
 	i40e_fill_default_direct_cmd_desc(&desc,
 					  i40e_aqc_opc_set_phy_config);
 
+	cmd = libie_aq_raw(&desc);
 	*cmd = *config;
 
 	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
@@ -1259,14 +1251,14 @@ int i40e_set_fc(struct i40e_hw *hw, u8 *aq_failures,
 int i40e_aq_clear_pxe_mode(struct i40e_hw *hw,
 			   struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_clear_pxe *cmd =
-		(struct i40e_aqc_clear_pxe *)&desc.params.raw;
+	struct i40e_aqc_clear_pxe *cmd;
+	struct libie_aq_desc desc;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc,
 					  i40e_aqc_opc_clear_pxe_mode);
 
+	cmd = libie_aq_raw(&desc);
 	cmd->rx_cnt = 0x2;
 
 	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
@@ -1288,14 +1280,14 @@ int i40e_aq_set_link_restart_an(struct i40e_hw *hw,
 				bool enable_link,
 				struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_set_link_restart_an *cmd =
-		(struct i40e_aqc_set_link_restart_an *)&desc.params.raw;
+	struct i40e_aqc_set_link_restart_an *cmd;
+	struct libie_aq_desc desc;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc,
 					  i40e_aqc_opc_set_link_restart_an);
 
+	cmd = libie_aq_raw(&desc);
 	cmd->command = I40E_AQ_PHY_RESTART_AN;
 	if (enable_link)
 		cmd->command |= I40E_AQ_PHY_LINK_ENABLE;
@@ -1320,16 +1312,16 @@ int i40e_aq_get_link_info(struct i40e_hw *hw,
 			  bool enable_lse, struct i40e_link_status *link,
 			  struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_get_link_status *resp =
-		(struct i40e_aqc_get_link_status *)&desc.params.raw;
 	struct i40e_link_status *hw_link_info = &hw->phy.link_info;
+	struct i40e_aqc_get_link_status *resp;
+	struct libie_aq_desc desc;
 	bool tx_pause, rx_pause;
 	u16 command_flags;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_get_link_status);
 
+	resp = libie_aq_raw(&desc);
 	if (enable_lse)
 		command_flags = I40E_AQ_LSE_ENABLE;
 	else
@@ -1415,14 +1407,14 @@ int i40e_aq_set_phy_int_mask(struct i40e_hw *hw,
 			     u16 mask,
 			     struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_set_phy_int_mask *cmd =
-		(struct i40e_aqc_set_phy_int_mask *)&desc.params.raw;
+	struct i40e_aqc_set_phy_int_mask *cmd;
+	struct libie_aq_desc desc;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc,
 					  i40e_aqc_opc_set_phy_int_mask);
 
+	cmd = libie_aq_raw(&desc);
 	cmd->event_mask = cpu_to_le16(mask);
 
 	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
@@ -1441,11 +1433,11 @@ int i40e_aq_set_phy_int_mask(struct i40e_hw *hw,
 int i40e_aq_set_mac_loopback(struct i40e_hw *hw, bool ena_lpbk,
 			     struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_set_lb_mode *cmd =
-		(struct i40e_aqc_set_lb_mode *)&desc.params.raw;
+	struct i40e_aqc_set_lb_mode *cmd;
+	struct libie_aq_desc desc;
 
 	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_set_lb_modes);
+	cmd = libie_aq_raw(&desc);
 	if (ena_lpbk) {
 		if (hw->nvm.version <= I40E_LEGACY_LOOPBACK_NVM_VER)
 			cmd->lb_mode = cpu_to_le16(I40E_AQ_LB_MAC_LOCAL_LEGACY);
@@ -1467,14 +1459,14 @@ int i40e_aq_set_mac_loopback(struct i40e_hw *hw, bool ena_lpbk,
 int i40e_aq_set_phy_debug(struct i40e_hw *hw, u8 cmd_flags,
 			  struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_set_phy_debug *cmd =
-		(struct i40e_aqc_set_phy_debug *)&desc.params.raw;
+	struct i40e_aqc_set_phy_debug *cmd;
+	struct libie_aq_desc desc;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc,
 					  i40e_aqc_opc_set_phy_debug);
 
+	cmd = libie_aq_raw(&desc);
 	cmd->command_flags = cmd_flags;
 
 	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
@@ -1494,23 +1486,22 @@ int i40e_aq_add_vsi(struct i40e_hw *hw,
 		    struct i40e_vsi_context *vsi_ctx,
 		    struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_add_get_update_vsi *cmd =
-		(struct i40e_aqc_add_get_update_vsi *)&desc.params.raw;
-	struct i40e_aqc_add_get_update_vsi_completion *resp =
-		(struct i40e_aqc_add_get_update_vsi_completion *)
-		&desc.params.raw;
+	struct i40e_aqc_add_get_update_vsi_completion *resp;
+	struct i40e_aqc_add_get_update_vsi *cmd;
+	struct libie_aq_desc desc;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc,
 					  i40e_aqc_opc_add_vsi);
 
+	resp = libie_aq_raw(&desc);
+	cmd = libie_aq_raw(&desc);
 	cmd->uplink_seid = cpu_to_le16(vsi_ctx->uplink_seid);
 	cmd->connection_type = vsi_ctx->connection_type;
 	cmd->vf_id = vsi_ctx->vf_num;
 	cmd->vsi_flags = cpu_to_le16(vsi_ctx->flags);
 
-	desc.flags |= cpu_to_le16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD));
+	desc.flags |= cpu_to_le16((u16)(LIBIE_AQ_FLAG_BUF | LIBIE_AQ_FLAG_RD));
 
 	status = i40e_asq_send_command_atomic(hw, &desc, &vsi_ctx->info,
 					      sizeof(vsi_ctx->info),
@@ -1538,15 +1529,14 @@ int i40e_aq_set_default_vsi(struct i40e_hw *hw,
 			    u16 seid,
 			    struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_set_vsi_promiscuous_modes *cmd =
-		(struct i40e_aqc_set_vsi_promiscuous_modes *)
-		&desc.params.raw;
+	struct i40e_aqc_set_vsi_promiscuous_modes *cmd;
+	struct libie_aq_desc desc;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc,
 					  i40e_aqc_opc_set_vsi_promiscuous_modes);
 
+	cmd = libie_aq_raw(&desc);
 	cmd->promiscuous_flags = cpu_to_le16(I40E_AQC_SET_VSI_DEFAULT);
 	cmd->valid_flags = cpu_to_le16(I40E_AQC_SET_VSI_DEFAULT);
 	cmd->seid = cpu_to_le16(seid);
@@ -1566,15 +1556,14 @@ int i40e_aq_clear_default_vsi(struct i40e_hw *hw,
 			      u16 seid,
 			      struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_set_vsi_promiscuous_modes *cmd =
-		(struct i40e_aqc_set_vsi_promiscuous_modes *)
-		&desc.params.raw;
+	struct i40e_aqc_set_vsi_promiscuous_modes *cmd;
+	struct libie_aq_desc desc;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc,
 					  i40e_aqc_opc_set_vsi_promiscuous_modes);
 
+	cmd = libie_aq_raw(&desc);
 	cmd->promiscuous_flags = cpu_to_le16(0);
 	cmd->valid_flags = cpu_to_le16(I40E_AQC_SET_VSI_DEFAULT);
 	cmd->seid = cpu_to_le16(seid);
@@ -1597,15 +1586,15 @@ int i40e_aq_set_vsi_unicast_promiscuous(struct i40e_hw *hw,
 					struct i40e_asq_cmd_details *cmd_details,
 					bool rx_only_promisc)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_set_vsi_promiscuous_modes *cmd =
-		(struct i40e_aqc_set_vsi_promiscuous_modes *)&desc.params.raw;
+	struct i40e_aqc_set_vsi_promiscuous_modes *cmd;
+	struct libie_aq_desc desc;
 	u16 flags = 0;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc,
 					i40e_aqc_opc_set_vsi_promiscuous_modes);
 
+	cmd = libie_aq_raw(&desc);
 	if (set) {
 		flags |= I40E_AQC_SET_VSI_PROMISC_UNICAST;
 		if (rx_only_promisc && i40e_is_aq_api_ver_ge(hw, 1, 5))
@@ -1636,15 +1625,15 @@ int i40e_aq_set_vsi_multicast_promiscuous(struct i40e_hw *hw,
 					  u16 seid, bool set,
 					  struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_set_vsi_promiscuous_modes *cmd =
-		(struct i40e_aqc_set_vsi_promiscuous_modes *)&desc.params.raw;
+	struct i40e_aqc_set_vsi_promiscuous_modes *cmd;
+	struct libie_aq_desc desc;
 	u16 flags = 0;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc,
 					i40e_aqc_opc_set_vsi_promiscuous_modes);
 
+	cmd = libie_aq_raw(&desc);
 	if (set)
 		flags |= I40E_AQC_SET_VSI_PROMISC_MULTICAST;
 
@@ -1671,15 +1660,15 @@ int i40e_aq_set_vsi_mc_promisc_on_vlan(struct i40e_hw *hw,
 				       u16 vid,
 				       struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_set_vsi_promiscuous_modes *cmd =
-		(struct i40e_aqc_set_vsi_promiscuous_modes *)&desc.params.raw;
+	struct i40e_aqc_set_vsi_promiscuous_modes *cmd;
+	struct libie_aq_desc desc;
 	u16 flags = 0;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc,
 					  i40e_aqc_opc_set_vsi_promiscuous_modes);
 
+	cmd = libie_aq_raw(&desc);
 	if (enable)
 		flags |= I40E_AQC_SET_VSI_PROMISC_MULTICAST;
 
@@ -1707,15 +1696,15 @@ int i40e_aq_set_vsi_uc_promisc_on_vlan(struct i40e_hw *hw,
 				       u16 vid,
 				       struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_set_vsi_promiscuous_modes *cmd =
-		(struct i40e_aqc_set_vsi_promiscuous_modes *)&desc.params.raw;
+	struct i40e_aqc_set_vsi_promiscuous_modes *cmd;
+	struct libie_aq_desc desc;
 	u16 flags = 0;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc,
 					  i40e_aqc_opc_set_vsi_promiscuous_modes);
 
+	cmd = libie_aq_raw(&desc);
 	if (enable) {
 		flags |= I40E_AQC_SET_VSI_PROMISC_UNICAST;
 		if (i40e_is_aq_api_ver_ge(hw, 1, 5))
@@ -1748,9 +1737,8 @@ int i40e_aq_set_vsi_bc_promisc_on_vlan(struct i40e_hw *hw,
 				       u16 seid, bool enable, u16 vid,
 				       struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_set_vsi_promiscuous_modes *cmd =
-		(struct i40e_aqc_set_vsi_promiscuous_modes *)&desc.params.raw;
+	struct i40e_aqc_set_vsi_promiscuous_modes *cmd;
+	struct libie_aq_desc desc;
 	u16 flags = 0;
 	int status;
 
@@ -1760,6 +1748,7 @@ int i40e_aq_set_vsi_bc_promisc_on_vlan(struct i40e_hw *hw,
 	if (enable)
 		flags |= I40E_AQC_SET_VSI_PROMISC_BROADCAST;
 
+	cmd = libie_aq_raw(&desc);
 	cmd->promiscuous_flags = cpu_to_le16(flags);
 	cmd->valid_flags = cpu_to_le16(I40E_AQC_SET_VSI_PROMISC_BROADCAST);
 	cmd->seid = cpu_to_le16(seid);
@@ -1783,14 +1772,14 @@ int i40e_aq_set_vsi_broadcast(struct i40e_hw *hw,
 			      u16 seid, bool set_filter,
 			      struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_set_vsi_promiscuous_modes *cmd =
-		(struct i40e_aqc_set_vsi_promiscuous_modes *)&desc.params.raw;
+	struct i40e_aqc_set_vsi_promiscuous_modes *cmd;
+	struct libie_aq_desc desc;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc,
 					i40e_aqc_opc_set_vsi_promiscuous_modes);
 
+	cmd = libie_aq_raw(&desc);
 	if (set_filter)
 		cmd->promiscuous_flags
 			    |= cpu_to_le16(I40E_AQC_SET_VSI_PROMISC_BROADCAST);
@@ -1815,20 +1804,19 @@ int i40e_aq_get_vsi_params(struct i40e_hw *hw,
 			   struct i40e_vsi_context *vsi_ctx,
 			   struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_add_get_update_vsi *cmd =
-		(struct i40e_aqc_add_get_update_vsi *)&desc.params.raw;
-	struct i40e_aqc_add_get_update_vsi_completion *resp =
-		(struct i40e_aqc_add_get_update_vsi_completion *)
-		&desc.params.raw;
+	struct i40e_aqc_add_get_update_vsi_completion *resp;
+	struct i40e_aqc_add_get_update_vsi *cmd;
+	struct libie_aq_desc desc;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc,
 					  i40e_aqc_opc_get_vsi_parameters);
 
+	resp = libie_aq_raw(&desc);
+	cmd = libie_aq_raw(&desc);
 	cmd->uplink_seid = cpu_to_le16(vsi_ctx->seid);
 
-	desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_BUF);
+	desc.flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_BUF);
 
 	status = i40e_asq_send_command(hw, &desc, &vsi_ctx->info,
 				    sizeof(vsi_ctx->info), NULL);
@@ -1857,19 +1845,18 @@ int i40e_aq_update_vsi_params(struct i40e_hw *hw,
 			      struct i40e_vsi_context *vsi_ctx,
 			      struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_add_get_update_vsi *cmd =
-		(struct i40e_aqc_add_get_update_vsi *)&desc.params.raw;
-	struct i40e_aqc_add_get_update_vsi_completion *resp =
-		(struct i40e_aqc_add_get_update_vsi_completion *)
-		&desc.params.raw;
+	struct i40e_aqc_add_get_update_vsi_completion *resp;
+	struct i40e_aqc_add_get_update_vsi *cmd;
+	struct libie_aq_desc desc;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc,
 					  i40e_aqc_opc_update_vsi_parameters);
+	resp = libie_aq_raw(&desc);
+	cmd = libie_aq_raw(&desc);
 	cmd->uplink_seid = cpu_to_le16(vsi_ctx->seid);
 
-	desc.flags |= cpu_to_le16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD));
+	desc.flags |= cpu_to_le16((u16)(LIBIE_AQ_FLAG_BUF | LIBIE_AQ_FLAG_RD));
 
 	status = i40e_asq_send_command_atomic(hw, &desc, &vsi_ctx->info,
 					      sizeof(vsi_ctx->info),
@@ -1896,16 +1883,16 @@ int i40e_aq_get_switch_config(struct i40e_hw *hw,
 			      u16 buf_size, u16 *start_seid,
 			      struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_switch_seid *scfg =
-		(struct i40e_aqc_switch_seid *)&desc.params.raw;
+	struct i40e_aqc_switch_seid *scfg;
+	struct libie_aq_desc desc;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc,
 					  i40e_aqc_opc_get_switch_config);
-	desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_BUF);
+	scfg = libie_aq_raw(&desc);
+	desc.flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_BUF);
 	if (buf_size > I40E_AQ_LARGE_BUF)
-		desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB);
+		desc.flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_LB);
 	scfg->seid = cpu_to_le16(*start_seid);
 
 	status = i40e_asq_send_command(hw, &desc, buf, buf_size, cmd_details);
@@ -1930,13 +1917,13 @@ int i40e_aq_set_switch_config(struct i40e_hw *hw,
 			      u16 valid_flags, u8 mode,
 			      struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_set_switch_config *scfg =
-		(struct i40e_aqc_set_switch_config *)&desc.params.raw;
+	struct i40e_aqc_set_switch_config *scfg;
+	struct libie_aq_desc desc;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc,
 					  i40e_aqc_opc_set_switch_config);
+	scfg = libie_aq_raw(&desc);
 	scfg->flags = cpu_to_le16(flags);
 	scfg->valid_flags = cpu_to_le16(valid_flags);
 	scfg->mode = mode;
@@ -1968,11 +1955,11 @@ int i40e_aq_get_firmware_version(struct i40e_hw *hw,
 				 u16 *api_major_version, u16 *api_minor_version,
 				 struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_get_version *resp =
-		(struct i40e_aqc_get_version *)&desc.params.raw;
+	struct i40e_aqc_get_version *resp;
+	struct libie_aq_desc desc;
 	int status;
 
+	resp = libie_aq_raw(&desc);
 	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_get_version);
 
 	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
@@ -2005,22 +1992,22 @@ int i40e_aq_send_driver_version(struct i40e_hw *hw,
 				struct i40e_driver_version *dv,
 				struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_driver_version *cmd =
-		(struct i40e_aqc_driver_version *)&desc.params.raw;
+	struct libie_aqc_driver_ver *cmd;
+	struct libie_aq_desc desc;
 	int status;
 	u16 len;
 
 	if (dv == NULL)
 		return -EINVAL;
 
+	cmd = libie_aq_raw(&desc);
 	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_driver_version);
 
-	desc.flags |= cpu_to_le16(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD);
-	cmd->driver_major_ver = dv->major_version;
-	cmd->driver_minor_ver = dv->minor_version;
-	cmd->driver_build_ver = dv->build_version;
-	cmd->driver_subbuild_ver = dv->subbuild_version;
+	desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_BUF | LIBIE_AQ_FLAG_RD);
+	cmd->major_ver = dv->major_version;
+	cmd->minor_ver = dv->minor_version;
+	cmd->build_ver = dv->build_version;
+	cmd->subbuild_ver = dv->subbuild_version;
 
 	len = 0;
 	while (len < sizeof(dv->driver_string) &&
@@ -2120,11 +2107,9 @@ int i40e_aq_add_veb(struct i40e_hw *hw, u16 uplink_seid,
 		    bool enable_stats,
 		    struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_add_veb *cmd =
-		(struct i40e_aqc_add_veb *)&desc.params.raw;
-	struct i40e_aqc_add_veb_completion *resp =
-		(struct i40e_aqc_add_veb_completion *)&desc.params.raw;
+	struct i40e_aqc_add_veb_completion *resp;
+	struct i40e_aqc_add_veb *cmd;
+	struct libie_aq_desc desc;
 	u16 veb_flags = 0;
 	int status;
 
@@ -2132,6 +2117,8 @@ int i40e_aq_add_veb(struct i40e_hw *hw, u16 uplink_seid,
 	if (!!uplink_seid != !!downlink_seid)
 		return -EINVAL;
 
+	resp = libie_aq_raw(&desc);
+	cmd = libie_aq_raw(&desc);
 	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_add_veb);
 
 	cmd->uplink_seid = cpu_to_le16(uplink_seid);
@@ -2178,15 +2165,14 @@ int i40e_aq_get_veb_parameters(struct i40e_hw *hw,
 			       u16 *vebs_used, u16 *vebs_free,
 			       struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_get_veb_parameters_completion *cmd_resp =
-		(struct i40e_aqc_get_veb_parameters_completion *)
-		&desc.params.raw;
+	struct i40e_aqc_get_veb_parameters_completion *cmd_resp;
+	struct libie_aq_desc desc;
 	int status;
 
 	if (veb_seid == 0)
 		return -EINVAL;
 
+	cmd_resp = libie_aq_raw(&desc);
 	i40e_fill_default_direct_cmd_desc(&desc,
 					  i40e_aqc_opc_get_veb_parameters);
 	cmd_resp->seid = cpu_to_le16(veb_seid);
@@ -2228,10 +2214,9 @@ get_veb_exit:
  **/
 static u16
 i40e_prepare_add_macvlan(struct i40e_aqc_add_macvlan_element_data *mv_list,
-			 struct i40e_aq_desc *desc, u16 count, u16 seid)
+			 struct libie_aq_desc *desc, u16 count, u16 seid)
 {
-	struct i40e_aqc_macvlan *cmd =
-		(struct i40e_aqc_macvlan *)&desc->params.raw;
+	struct i40e_aqc_macvlan *cmd = libie_aq_raw(desc);
 	u16 buf_size;
 	int i;
 
@@ -2249,9 +2234,9 @@ i40e_prepare_add_macvlan(struct i40e_aqc_add_macvlan_element_data *mv_list,
 			mv_list[i].flags |=
 			       cpu_to_le16(I40E_AQC_MACVLAN_ADD_USE_SHARED_MAC);
 
-	desc->flags |= cpu_to_le16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD));
+	desc->flags |= cpu_to_le16((u16)(LIBIE_AQ_FLAG_BUF | LIBIE_AQ_FLAG_RD));
 	if (buf_size > I40E_AQ_LARGE_BUF)
-		desc->flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB);
+		desc->flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_LB);
 
 	return buf_size;
 }
@@ -2271,7 +2256,7 @@ i40e_aq_add_macvlan(struct i40e_hw *hw, u16 seid,
 		    struct i40e_aqc_add_macvlan_element_data *mv_list,
 		    u16 count, struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
+	struct libie_aq_desc desc;
 	u16 buf_size;
 
 	if (count == 0 || !mv_list || !hw)
@@ -2302,9 +2287,9 @@ int
 i40e_aq_add_macvlan_v2(struct i40e_hw *hw, u16 seid,
 		       struct i40e_aqc_add_macvlan_element_data *mv_list,
 		       u16 count, struct i40e_asq_cmd_details *cmd_details,
-		       enum i40e_admin_queue_err *aq_status)
+		       enum libie_aq_err *aq_status)
 {
-	struct i40e_aq_desc desc;
+	struct libie_aq_desc desc;
 	u16 buf_size;
 
 	if (count == 0 || !mv_list || !hw)
@@ -2331,9 +2316,8 @@ i40e_aq_remove_macvlan(struct i40e_hw *hw, u16 seid,
 		       struct i40e_aqc_remove_macvlan_element_data *mv_list,
 		       u16 count, struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_macvlan *cmd =
-		(struct i40e_aqc_macvlan *)&desc.params.raw;
+	struct i40e_aqc_macvlan *cmd;
+	struct libie_aq_desc desc;
 	u16 buf_size;
 	int status;
 
@@ -2344,14 +2328,15 @@ i40e_aq_remove_macvlan(struct i40e_hw *hw, u16 seid,
 
 	/* prep the rest of the request */
 	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_remove_macvlan);
+	cmd = libie_aq_raw(&desc);
 	cmd->num_addresses = cpu_to_le16(count);
 	cmd->seid[0] = cpu_to_le16(I40E_AQC_MACVLAN_CMD_SEID_VALID | seid);
 	cmd->seid[1] = 0;
 	cmd->seid[2] = 0;
 
-	desc.flags |= cpu_to_le16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD));
+	desc.flags |= cpu_to_le16((u16)(LIBIE_AQ_FLAG_BUF | LIBIE_AQ_FLAG_RD));
 	if (buf_size > I40E_AQ_LARGE_BUF)
-		desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB);
+		desc.flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_LB);
 
 	status = i40e_asq_send_command_atomic(hw, &desc, mv_list, buf_size,
 					      cmd_details, true);
@@ -2378,10 +2363,10 @@ int
 i40e_aq_remove_macvlan_v2(struct i40e_hw *hw, u16 seid,
 			  struct i40e_aqc_remove_macvlan_element_data *mv_list,
 			  u16 count, struct i40e_asq_cmd_details *cmd_details,
-			  enum i40e_admin_queue_err *aq_status)
+			  enum libie_aq_err *aq_status)
 {
 	struct i40e_aqc_macvlan *cmd;
-	struct i40e_aq_desc desc;
+	struct libie_aq_desc desc;
 	u16 buf_size;
 
 	if (count == 0 || !mv_list || !hw)
@@ -2391,15 +2376,15 @@ i40e_aq_remove_macvlan_v2(struct i40e_hw *hw, u16 seid,
 
 	/* prep the rest of the request */
 	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_remove_macvlan);
-	cmd = (struct i40e_aqc_macvlan *)&desc.params.raw;
+	cmd = libie_aq_raw(&desc);
 	cmd->num_addresses = cpu_to_le16(count);
 	cmd->seid[0] = cpu_to_le16(I40E_AQC_MACVLAN_CMD_SEID_VALID | seid);
 	cmd->seid[1] = 0;
 	cmd->seid[2] = 0;
 
-	desc.flags |= cpu_to_le16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD));
+	desc.flags |= cpu_to_le16((u16)(LIBIE_AQ_FLAG_BUF | LIBIE_AQ_FLAG_RD));
 	if (buf_size > I40E_AQ_LARGE_BUF)
-		desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB);
+		desc.flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_LB);
 
 	return i40e_asq_send_command_atomic_v2(hw, &desc, mv_list, buf_size,
 						 cmd_details, true, aq_status);
@@ -2421,21 +2406,21 @@ int i40e_aq_send_msg_to_vf(struct i40e_hw *hw, u16 vfid,
 			   u32 v_opcode, u32 v_retval, u8 *msg, u16 msglen,
 			   struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_pf_vf_message *cmd =
-		(struct i40e_aqc_pf_vf_message *)&desc.params.raw;
+	struct i40e_aqc_pf_vf_message *cmd;
+	struct libie_aq_desc desc;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_send_msg_to_vf);
+	cmd = libie_aq_raw(&desc);
 	cmd->id = cpu_to_le32(vfid);
 	desc.cookie_high = cpu_to_le32(v_opcode);
 	desc.cookie_low = cpu_to_le32(v_retval);
-	desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_SI);
+	desc.flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_SI);
 	if (msglen) {
-		desc.flags |= cpu_to_le16((u16)(I40E_AQ_FLAG_BUF |
-						I40E_AQ_FLAG_RD));
+		desc.flags |= cpu_to_le16((u16)(LIBIE_AQ_FLAG_BUF |
+						LIBIE_AQ_FLAG_RD));
 		if (msglen > I40E_AQ_LARGE_BUF)
-			desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB);
+			desc.flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_LB);
 		desc.datalen = cpu_to_le16(msglen);
 	}
 	status = i40e_asq_send_command(hw, &desc, msg, msglen, cmd_details);
@@ -2456,9 +2441,8 @@ int i40e_aq_debug_read_register(struct i40e_hw *hw,
 				u32 reg_addr, u64 *reg_val,
 				struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_debug_reg_read_write *cmd_resp =
-		(struct i40e_aqc_debug_reg_read_write *)&desc.params.raw;
+	struct i40e_aqc_debug_reg_read_write *cmd_resp;
+	struct libie_aq_desc desc;
 	int status;
 
 	if (reg_val == NULL)
@@ -2466,6 +2450,7 @@ int i40e_aq_debug_read_register(struct i40e_hw *hw,
 
 	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_debug_read_reg);
 
+	cmd_resp = libie_aq_raw(&desc);
 	cmd_resp->address = cpu_to_le32(reg_addr);
 
 	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
@@ -2491,13 +2476,13 @@ int i40e_aq_debug_write_register(struct i40e_hw *hw,
 				 u32 reg_addr, u64 reg_val,
 				 struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_debug_reg_read_write *cmd =
-		(struct i40e_aqc_debug_reg_read_write *)&desc.params.raw;
+	struct i40e_aqc_debug_reg_read_write *cmd;
+	struct libie_aq_desc desc;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_debug_write_reg);
 
+	cmd = libie_aq_raw(&desc);
 	cmd->address = cpu_to_le32(reg_addr);
 	cmd->value_high = cpu_to_le32((u32)(reg_val >> 32));
 	cmd->value_low = cpu_to_le32((u32)(reg_val & 0xFFFFFFFF));
@@ -2524,16 +2509,16 @@ int i40e_aq_request_resource(struct i40e_hw *hw,
 			     u8 sdp_number, u64 *timeout,
 			     struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_request_resource *cmd_resp =
-		(struct i40e_aqc_request_resource *)&desc.params.raw;
+	struct libie_aqc_req_res *cmd_resp;
+	struct libie_aq_desc desc;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_request_resource);
 
-	cmd_resp->resource_id = cpu_to_le16(resource);
+	cmd_resp = libie_aq_raw(&desc);
+	cmd_resp->res_id = cpu_to_le16(resource);
 	cmd_resp->access_type = cpu_to_le16(access);
-	cmd_resp->resource_number = cpu_to_le32(sdp_number);
+	cmd_resp->res_number = cpu_to_le32(sdp_number);
 
 	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
 	/* The completion specifies the maximum time in ms that the driver
@@ -2542,7 +2527,7 @@ int i40e_aq_request_resource(struct i40e_hw *hw,
 	 * busy return value and the timeout field indicates the maximum time
 	 * the current owner of the resource has to free it.
 	 */
-	if (!status || hw->aq.asq_last_status == I40E_AQ_RC_EBUSY)
+	if (!status || hw->aq.asq_last_status == LIBIE_AQ_RC_EBUSY)
 		*timeout = le32_to_cpu(cmd_resp->timeout);
 
 	return status;
@@ -2562,15 +2547,15 @@ int i40e_aq_release_resource(struct i40e_hw *hw,
 			     u8 sdp_number,
 			     struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_request_resource *cmd =
-		(struct i40e_aqc_request_resource *)&desc.params.raw;
+	struct libie_aqc_req_res *cmd;
+	struct libie_aq_desc desc;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_release_resource);
 
-	cmd->resource_id = cpu_to_le16(resource);
-	cmd->resource_number = cpu_to_le32(sdp_number);
+	cmd = libie_aq_raw(&desc);
+	cmd->res_id = cpu_to_le16(resource);
+	cmd->res_number = cpu_to_le32(sdp_number);
 
 	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
 
@@ -2594,9 +2579,8 @@ int i40e_aq_read_nvm(struct i40e_hw *hw, u8 module_pointer,
 		     bool last_command,
 		     struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_nvm_update *cmd =
-		(struct i40e_aqc_nvm_update *)&desc.params.raw;
+	struct i40e_aqc_nvm_update *cmd;
+	struct libie_aq_desc desc;
 	int status;
 
 	/* In offset the highest byte must be zeroed. */
@@ -2607,6 +2591,7 @@ int i40e_aq_read_nvm(struct i40e_hw *hw, u8 module_pointer,
 
 	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_nvm_read);
 
+	cmd = libie_aq_raw(&desc);
 	/* If this is the last command in a series, set the proper flag. */
 	if (last_command)
 		cmd->command_flags |= I40E_AQ_NVM_LAST_CMD;
@@ -2614,9 +2599,9 @@ int i40e_aq_read_nvm(struct i40e_hw *hw, u8 module_pointer,
 	cmd->offset = cpu_to_le32(offset);
 	cmd->length = cpu_to_le16(length);
 
-	desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_BUF);
+	desc.flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_BUF);
 	if (length > I40E_AQ_LARGE_BUF)
-		desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB);
+		desc.flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_LB);
 
 	status = i40e_asq_send_command(hw, &desc, data, length, cmd_details);
 
@@ -2639,9 +2624,8 @@ int i40e_aq_erase_nvm(struct i40e_hw *hw, u8 module_pointer,
 		      u32 offset, u16 length, bool last_command,
 		      struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_nvm_update *cmd =
-		(struct i40e_aqc_nvm_update *)&desc.params.raw;
+	struct i40e_aqc_nvm_update *cmd;
+	struct libie_aq_desc desc;
 	int status;
 
 	/* In offset the highest byte must be zeroed. */
@@ -2652,6 +2636,7 @@ int i40e_aq_erase_nvm(struct i40e_hw *hw, u8 module_pointer,
 
 	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_nvm_erase);
 
+	cmd = libie_aq_raw(&desc);
 	/* If this is the last command in a series, set the proper flag. */
 	if (last_command)
 		cmd->command_flags |= I40E_AQ_NVM_LAST_CMD;
@@ -2678,7 +2663,7 @@ static void i40e_parse_discover_capabilities(struct i40e_hw *hw, void *buff,
 				     u32 cap_count,
 				     enum i40e_admin_queue_opc list_type_opc)
 {
-	struct i40e_aqc_list_capabilities_element_resp *cap;
+	struct libie_aqc_list_caps_elem *cap;
 	u32 valid_functions, num_functions;
 	u32 number, logical_id, phys_id;
 	struct i40e_hw_capabilities *p;
@@ -2687,7 +2672,7 @@ static void i40e_parse_discover_capabilities(struct i40e_hw *hw, void *buff,
 	int status;
 	u32 i = 0;
 
-	cap = (struct i40e_aqc_list_capabilities_element_resp *) buff;
+	cap = (struct libie_aqc_list_caps_elem *)buff;
 
 	if (list_type_opc == i40e_aqc_opc_list_dev_capabilities)
 		p = &hw->dev_caps;
@@ -2697,17 +2682,17 @@ static void i40e_parse_discover_capabilities(struct i40e_hw *hw, void *buff,
 		return;
 
 	for (i = 0; i < cap_count; i++, cap++) {
-		id = le16_to_cpu(cap->id);
+		id = le16_to_cpu(cap->cap);
 		number = le32_to_cpu(cap->number);
 		logical_id = le32_to_cpu(cap->logical_id);
 		phys_id = le32_to_cpu(cap->phys_id);
-		major_rev = cap->major_rev;
+		major_rev = cap->major_ver;
 
 		switch (id) {
-		case I40E_AQ_CAP_ID_SWITCH_MODE:
+		case LIBIE_AQC_CAPS_SWITCH_MODE:
 			p->switch_mode = number;
 			break;
-		case I40E_AQ_CAP_ID_MNG_MODE:
+		case LIBIE_AQC_CAPS_MNG_MODE:
 			p->management_mode = number;
 			if (major_rev > 1) {
 				p->mng_protocols_over_mctp = logical_id;
@@ -2718,76 +2703,76 @@ static void i40e_parse_discover_capabilities(struct i40e_hw *hw, void *buff,
 				p->mng_protocols_over_mctp = 0;
 			}
 			break;
-		case I40E_AQ_CAP_ID_NPAR_ACTIVE:
+		case LIBIE_AQC_CAPS_NPAR_ACTIVE:
 			p->npar_enable = number;
 			break;
-		case I40E_AQ_CAP_ID_OS2BMC_CAP:
+		case LIBIE_AQC_CAPS_OS2BMC_CAP:
 			p->os2bmc = number;
 			break;
-		case I40E_AQ_CAP_ID_FUNCTIONS_VALID:
+		case LIBIE_AQC_CAPS_VALID_FUNCTIONS:
 			p->valid_functions = number;
 			break;
-		case I40E_AQ_CAP_ID_SRIOV:
+		case LIBIE_AQC_CAPS_SRIOV:
 			if (number == 1)
 				p->sr_iov_1_1 = true;
 			break;
-		case I40E_AQ_CAP_ID_VF:
+		case LIBIE_AQC_CAPS_VF:
 			p->num_vfs = number;
 			p->vf_base_id = logical_id;
 			break;
-		case I40E_AQ_CAP_ID_VMDQ:
+		case LIBIE_AQC_CAPS_VMDQ:
 			if (number == 1)
 				p->vmdq = true;
 			break;
-		case I40E_AQ_CAP_ID_8021QBG:
+		case LIBIE_AQC_CAPS_8021QBG:
 			if (number == 1)
 				p->evb_802_1_qbg = true;
 			break;
-		case I40E_AQ_CAP_ID_8021QBR:
+		case LIBIE_AQC_CAPS_8021QBR:
 			if (number == 1)
 				p->evb_802_1_qbh = true;
 			break;
-		case I40E_AQ_CAP_ID_VSI:
+		case LIBIE_AQC_CAPS_VSI:
 			p->num_vsis = number;
 			break;
-		case I40E_AQ_CAP_ID_DCB:
+		case LIBIE_AQC_CAPS_DCB:
 			if (number == 1) {
 				p->dcb = true;
 				p->enabled_tcmap = logical_id;
 				p->maxtc = phys_id;
 			}
 			break;
-		case I40E_AQ_CAP_ID_FCOE:
+		case LIBIE_AQC_CAPS_FCOE:
 			if (number == 1)
 				p->fcoe = true;
 			break;
-		case I40E_AQ_CAP_ID_ISCSI:
+		case LIBIE_AQC_CAPS_ISCSI:
 			if (number == 1)
 				p->iscsi = true;
 			break;
-		case I40E_AQ_CAP_ID_RSS:
+		case LIBIE_AQC_CAPS_RSS:
 			p->rss = true;
 			p->rss_table_size = number;
 			p->rss_table_entry_width = logical_id;
 			break;
-		case I40E_AQ_CAP_ID_RXQ:
+		case LIBIE_AQC_CAPS_RXQS:
 			p->num_rx_qp = number;
 			p->base_queue = phys_id;
 			break;
-		case I40E_AQ_CAP_ID_TXQ:
+		case LIBIE_AQC_CAPS_TXQS:
 			p->num_tx_qp = number;
 			p->base_queue = phys_id;
 			break;
-		case I40E_AQ_CAP_ID_MSIX:
+		case LIBIE_AQC_CAPS_MSIX:
 			p->num_msix_vectors = number;
 			i40e_debug(hw, I40E_DEBUG_INIT,
 				   "HW Capability: MSIX vector count = %d\n",
 				   p->num_msix_vectors);
 			break;
-		case I40E_AQ_CAP_ID_VF_MSIX:
+		case LIBIE_AQC_CAPS_VF_MSIX:
 			p->num_msix_vectors_vf = number;
 			break;
-		case I40E_AQ_CAP_ID_FLEX10:
+		case LIBIE_AQC_CAPS_FLEX10:
 			if (major_rev == 1) {
 				if (number == 1) {
 					p->flex10_enable = true;
@@ -2803,42 +2788,42 @@ static void i40e_parse_discover_capabilities(struct i40e_hw *hw, void *buff,
 			p->flex10_mode = logical_id;
 			p->flex10_status = phys_id;
 			break;
-		case I40E_AQ_CAP_ID_CEM:
+		case LIBIE_AQC_CAPS_CEM:
 			if (number == 1)
 				p->mgmt_cem = true;
 			break;
-		case I40E_AQ_CAP_ID_IWARP:
+		case LIBIE_AQC_CAPS_RDMA:
 			if (number == 1)
 				p->iwarp = true;
 			break;
-		case I40E_AQ_CAP_ID_LED:
+		case LIBIE_AQC_CAPS_LED:
 			if (phys_id < I40E_HW_CAP_MAX_GPIO)
 				p->led[phys_id] = true;
 			break;
-		case I40E_AQ_CAP_ID_SDP:
+		case LIBIE_AQC_CAPS_SDP:
 			if (phys_id < I40E_HW_CAP_MAX_GPIO)
 				p->sdp[phys_id] = true;
 			break;
-		case I40E_AQ_CAP_ID_MDIO:
+		case LIBIE_AQC_CAPS_MDIO:
 			if (number == 1) {
 				p->mdio_port_num = phys_id;
 				p->mdio_port_mode = logical_id;
 			}
 			break;
-		case I40E_AQ_CAP_ID_1588:
+		case LIBIE_AQC_CAPS_1588:
 			if (number == 1)
 				p->ieee_1588 = true;
 			break;
-		case I40E_AQ_CAP_ID_FLOW_DIRECTOR:
+		case LIBIE_AQC_CAPS_FD:
 			p->fd = true;
 			p->fd_filters_guaranteed = number;
 			p->fd_filters_best_effort = logical_id;
 			break;
-		case I40E_AQ_CAP_ID_WSR_PROT:
+		case LIBIE_AQC_CAPS_WSR_PROT:
 			p->wr_csr_prot = (u64)number;
 			p->wr_csr_prot |= (u64)logical_id << 32;
 			break;
-		case I40E_AQ_CAP_ID_NVM_MGMT:
+		case LIBIE_AQC_CAPS_NVM_MGMT:
 			if (number & I40E_NVM_MGMT_SEC_REV_DISABLED)
 				p->sec_rev_disabled = true;
 			if (number & I40E_NVM_MGMT_UPDATE_DISABLED)
@@ -2930,11 +2915,11 @@ int i40e_aq_discover_capabilities(struct i40e_hw *hw,
 				  enum i40e_admin_queue_opc list_type_opc,
 				  struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aqc_list_capabilites *cmd;
-	struct i40e_aq_desc desc;
+	struct libie_aqc_list_caps *cmd;
+	struct libie_aq_desc desc;
 	int status = 0;
 
-	cmd = (struct i40e_aqc_list_capabilites *)&desc.params.raw;
+	cmd = libie_aq_raw(&desc);
 
 	if (list_type_opc != i40e_aqc_opc_list_func_capabilities &&
 		list_type_opc != i40e_aqc_opc_list_dev_capabilities) {
@@ -2944,9 +2929,9 @@ int i40e_aq_discover_capabilities(struct i40e_hw *hw,
 
 	i40e_fill_default_direct_cmd_desc(&desc, list_type_opc);
 
-	desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_BUF);
+	desc.flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_BUF);
 	if (buff_size > I40E_AQ_LARGE_BUF)
-		desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB);
+		desc.flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_LB);
 
 	status = i40e_asq_send_command(hw, &desc, buff, buff_size, cmd_details);
 	*data_size = le16_to_cpu(desc.datalen);
@@ -2979,9 +2964,8 @@ int i40e_aq_update_nvm(struct i40e_hw *hw, u8 module_pointer,
 		       bool last_command, u8 preservation_flags,
 		       struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_nvm_update *cmd =
-		(struct i40e_aqc_nvm_update *)&desc.params.raw;
+	struct i40e_aqc_nvm_update *cmd;
+	struct libie_aq_desc desc;
 	int status;
 
 	/* In offset the highest byte must be zeroed. */
@@ -2992,6 +2976,7 @@ int i40e_aq_update_nvm(struct i40e_hw *hw, u8 module_pointer,
 
 	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_nvm_update);
 
+	cmd = libie_aq_raw(&desc);
 	/* If this is the last command in a series, set the proper flag. */
 	if (last_command)
 		cmd->command_flags |= I40E_AQ_NVM_LAST_CMD;
@@ -3009,9 +2994,9 @@ int i40e_aq_update_nvm(struct i40e_hw *hw, u8 module_pointer,
 	cmd->offset = cpu_to_le32(offset);
 	cmd->length = cpu_to_le16(length);
 
-	desc.flags |= cpu_to_le16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD));
+	desc.flags |= cpu_to_le16((u16)(LIBIE_AQ_FLAG_BUF | LIBIE_AQ_FLAG_RD));
 	if (length > I40E_AQ_LARGE_BUF)
-		desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB);
+		desc.flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_LB);
 
 	status = i40e_asq_send_command(hw, &desc, data, length, cmd_details);
 
@@ -3037,11 +3022,9 @@ int i40e_aq_get_lldp_mib(struct i40e_hw *hw, u8 bridge_type,
 			 u16 *local_len, u16 *remote_len,
 			 struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_lldp_get_mib *cmd =
-		(struct i40e_aqc_lldp_get_mib *)&desc.params.raw;
-	struct i40e_aqc_lldp_get_mib *resp =
-		(struct i40e_aqc_lldp_get_mib *)&desc.params.raw;
+	struct i40e_aqc_lldp_get_mib *resp;
+	struct i40e_aqc_lldp_get_mib *cmd;
+	struct libie_aq_desc desc;
 	int status;
 
 	if (buff_size == 0 || !buff)
@@ -3049,16 +3032,18 @@ int i40e_aq_get_lldp_mib(struct i40e_hw *hw, u8 bridge_type,
 
 	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_lldp_get_mib);
 	/* Indirect Command */
-	desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_BUF);
+	desc.flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_BUF);
 
+	resp = libie_aq_raw(&desc);
+	cmd = libie_aq_raw(&desc);
 	cmd->type = mib_type & I40E_AQ_LLDP_MIB_TYPE_MASK;
 	cmd->type |= FIELD_PREP(I40E_AQ_LLDP_BRIDGE_TYPE_MASK, bridge_type);
 
 	desc.datalen = cpu_to_le16(buff_size);
 
-	desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_BUF);
+	desc.flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_BUF);
 	if (buff_size > I40E_AQ_LARGE_BUF)
-		desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB);
+		desc.flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_LB);
 
 	status = i40e_asq_send_command(hw, &desc, buff, buff_size, cmd_details);
 	if (!status) {
@@ -3087,19 +3072,19 @@ i40e_aq_set_lldp_mib(struct i40e_hw *hw,
 		     struct i40e_asq_cmd_details *cmd_details)
 {
 	struct i40e_aqc_lldp_set_local_mib *cmd;
-	struct i40e_aq_desc desc;
+	struct libie_aq_desc desc;
 	int status;
 
-	cmd = (struct i40e_aqc_lldp_set_local_mib *)&desc.params.raw;
+	cmd = libie_aq_raw(&desc);
 	if (buff_size == 0 || !buff)
 		return -EINVAL;
 
 	i40e_fill_default_direct_cmd_desc(&desc,
 					  i40e_aqc_opc_lldp_set_local_mib);
 	/* Indirect Command */
-	desc.flags |= cpu_to_le16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD));
+	desc.flags |= cpu_to_le16((u16)(LIBIE_AQ_FLAG_BUF | LIBIE_AQ_FLAG_RD));
 	if (buff_size > I40E_AQ_LARGE_BUF)
-		desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB);
+		desc.flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_LB);
 	desc.datalen = cpu_to_le16(buff_size);
 
 	cmd->type = mib_type;
@@ -3124,13 +3109,13 @@ int i40e_aq_cfg_lldp_mib_change_event(struct i40e_hw *hw,
 				      bool enable_update,
 				      struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_lldp_update_mib *cmd =
-		(struct i40e_aqc_lldp_update_mib *)&desc.params.raw;
+	struct i40e_aqc_lldp_update_mib *cmd;
+	struct libie_aq_desc desc;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_lldp_update_mib);
 
+	cmd = libie_aq_raw(&desc);
 	if (!enable_update)
 		cmd->command |= I40E_AQ_LLDP_MIB_UPDATE_DISABLE;
 
@@ -3152,13 +3137,13 @@ int i40e_aq_stop_lldp(struct i40e_hw *hw, bool shutdown_agent,
 		      bool persist,
 		      struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_lldp_stop *cmd =
-		(struct i40e_aqc_lldp_stop *)&desc.params.raw;
+	struct i40e_aqc_lldp_stop *cmd;
+	struct libie_aq_desc desc;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_lldp_stop);
 
+	cmd = libie_aq_raw(&desc);
 	if (shutdown_agent)
 		cmd->command |= I40E_AQ_LLDP_AGENT_SHUTDOWN;
 
@@ -3186,13 +3171,13 @@ int i40e_aq_stop_lldp(struct i40e_hw *hw, bool shutdown_agent,
 int i40e_aq_start_lldp(struct i40e_hw *hw, bool persist,
 		       struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_lldp_start *cmd =
-		(struct i40e_aqc_lldp_start *)&desc.params.raw;
+	struct i40e_aqc_lldp_start *cmd;
+	struct libie_aq_desc desc;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_lldp_start);
 
+	cmd = libie_aq_raw(&desc);
 	cmd->command = I40E_AQ_LLDP_AGENT_START;
 
 	if (persist) {
@@ -3219,9 +3204,8 @@ int
 i40e_aq_set_dcb_parameters(struct i40e_hw *hw, bool dcb_enable,
 			   struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_set_dcb_parameters *cmd =
-		(struct i40e_aqc_set_dcb_parameters *)&desc.params.raw;
+	struct i40e_aqc_set_dcb_parameters *cmd;
+	struct libie_aq_desc desc;
 	int status;
 
 	if (!test_bit(I40E_HW_CAP_FW_LLDP_STOPPABLE, hw->caps))
@@ -3230,6 +3214,7 @@ i40e_aq_set_dcb_parameters(struct i40e_hw *hw, bool dcb_enable,
 	i40e_fill_default_direct_cmd_desc(&desc,
 					  i40e_aqc_opc_set_dcb_parameters);
 
+	cmd = libie_aq_raw(&desc);
 	if (dcb_enable) {
 		cmd->valid_flags = I40E_DCB_VALID;
 		cmd->command = I40E_AQ_DCB_SET_AGENT;
@@ -3252,7 +3237,7 @@ int i40e_aq_get_cee_dcb_config(struct i40e_hw *hw,
 			       void *buff, u16 buff_size,
 			       struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
+	struct libie_aq_desc desc;
 	int status;
 
 	if (buff_size == 0 || !buff)
@@ -3260,7 +3245,7 @@ int i40e_aq_get_cee_dcb_config(struct i40e_hw *hw,
 
 	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_get_cee_dcb_cfg);
 
-	desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_BUF);
+	desc.flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_BUF);
 	status = i40e_asq_send_command(hw, &desc, (void *)buff, buff_size,
 				       cmd_details);
 
@@ -3284,15 +3269,15 @@ int i40e_aq_add_udp_tunnel(struct i40e_hw *hw,
 			   u8 *filter_index,
 			   struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_add_udp_tunnel *cmd =
-		(struct i40e_aqc_add_udp_tunnel *)&desc.params.raw;
-	struct i40e_aqc_del_udp_tunnel_completion *resp =
-		(struct i40e_aqc_del_udp_tunnel_completion *)&desc.params.raw;
+	struct i40e_aqc_del_udp_tunnel_completion *resp;
+	struct i40e_aqc_add_udp_tunnel *cmd;
+	struct libie_aq_desc desc;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_add_udp_tunnel);
 
+	resp = libie_aq_raw(&desc);
+	cmd = libie_aq_raw(&desc);
 	cmd->udp_port = cpu_to_le16(udp_port);
 	cmd->protocol_type = protocol_index;
 
@@ -3313,13 +3298,13 @@ int i40e_aq_add_udp_tunnel(struct i40e_hw *hw,
 int i40e_aq_del_udp_tunnel(struct i40e_hw *hw, u8 index,
 			   struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_remove_udp_tunnel *cmd =
-		(struct i40e_aqc_remove_udp_tunnel *)&desc.params.raw;
+	struct i40e_aqc_remove_udp_tunnel *cmd;
+	struct libie_aq_desc desc;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_del_udp_tunnel);
 
+	cmd = libie_aq_raw(&desc);
 	cmd->index = index;
 
 	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
@@ -3338,9 +3323,8 @@ int i40e_aq_del_udp_tunnel(struct i40e_hw *hw, u8 index,
 int i40e_aq_delete_element(struct i40e_hw *hw, u16 seid,
 			   struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_switch_seid *cmd =
-		(struct i40e_aqc_switch_seid *)&desc.params.raw;
+	struct i40e_aqc_switch_seid *cmd;
+	struct libie_aq_desc desc;
 	int status;
 
 	if (seid == 0)
@@ -3348,6 +3332,7 @@ int i40e_aq_delete_element(struct i40e_hw *hw, u16 seid,
 
 	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_delete_element);
 
+	cmd = libie_aq_raw(&desc);
 	cmd->seid = cpu_to_le16(seid);
 
 	status = i40e_asq_send_command_atomic(hw, &desc, NULL, 0,
@@ -3368,7 +3353,7 @@ int i40e_aq_delete_element(struct i40e_hw *hw, u16 seid,
 int i40e_aq_dcb_updated(struct i40e_hw *hw,
 			struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
+	struct libie_aq_desc desc;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_dcb_updated);
@@ -3394,9 +3379,8 @@ static int i40e_aq_tx_sched_cmd(struct i40e_hw *hw, u16 seid,
 				enum i40e_admin_queue_opc opcode,
 				struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_tx_sched_ind *cmd =
-		(struct i40e_aqc_tx_sched_ind *)&desc.params.raw;
+	struct i40e_aqc_tx_sched_ind *cmd;
+	struct libie_aq_desc desc;
 	int status;
 	bool cmd_param_flag = false;
 
@@ -3423,12 +3407,13 @@ static int i40e_aq_tx_sched_cmd(struct i40e_hw *hw, u16 seid,
 
 	i40e_fill_default_direct_cmd_desc(&desc, opcode);
 
+	cmd = libie_aq_raw(&desc);
 	/* Indirect command */
-	desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_BUF);
+	desc.flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_BUF);
 	if (cmd_param_flag)
-		desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_RD);
+		desc.flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_RD);
 	if (buff_size > I40E_AQ_LARGE_BUF)
-		desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB);
+		desc.flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_LB);
 
 	desc.datalen = cpu_to_le16(buff_size);
 
@@ -3451,14 +3436,14 @@ int i40e_aq_config_vsi_bw_limit(struct i40e_hw *hw,
 				u16 seid, u16 credit, u8 max_credit,
 				struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_configure_vsi_bw_limit *cmd =
-		(struct i40e_aqc_configure_vsi_bw_limit *)&desc.params.raw;
+	struct i40e_aqc_configure_vsi_bw_limit *cmd;
+	struct libie_aq_desc desc;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc,
 					  i40e_aqc_opc_configure_vsi_bw_limit);
 
+	cmd = libie_aq_raw(&desc);
 	cmd->vsi_seid = cpu_to_le16(seid);
 	cmd->credit = cpu_to_le16(credit);
 	cmd->max_credit = max_credit;
@@ -3786,18 +3771,16 @@ int i40e_aq_add_rem_control_packet_filter(struct i40e_hw *hw,
 					  struct i40e_control_filter_stats *stats,
 					  struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_add_remove_control_packet_filter *cmd =
-		(struct i40e_aqc_add_remove_control_packet_filter *)
-		&desc.params.raw;
-	struct i40e_aqc_add_remove_control_packet_filter_completion *resp =
-		(struct i40e_aqc_add_remove_control_packet_filter_completion *)
-		&desc.params.raw;
+	struct i40e_aqc_add_remove_control_packet_filter_completion *resp;
+	struct i40e_aqc_add_remove_control_packet_filter *cmd;
+	struct libie_aq_desc desc;
 	int status;
 
 	if (vsi_seid == 0)
 		return -EINVAL;
 
+	resp = libie_aq_raw(&desc);
+	cmd = libie_aq_raw(&desc);
 	if (is_add) {
 		i40e_fill_default_direct_cmd_desc(&desc,
 				i40e_aqc_opc_add_control_packet_filter);
@@ -3865,15 +3848,15 @@ static int i40e_aq_alternate_read(struct i40e_hw *hw,
 				  u32 reg_addr0, u32 *reg_val0,
 				  u32 reg_addr1, u32 *reg_val1)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_alternate_write *cmd_resp =
-		(struct i40e_aqc_alternate_write *)&desc.params.raw;
+	struct i40e_aqc_alternate_write *cmd_resp;
+	struct libie_aq_desc desc;
 	int status;
 
 	if (!reg_val0)
 		return -EINVAL;
 
 	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_alternate_read);
+	cmd_resp = libie_aq_raw(&desc);
 	cmd_resp->address0 = cpu_to_le32(reg_addr0);
 	cmd_resp->address1 = cpu_to_le32(reg_addr1);
 
@@ -3901,10 +3884,10 @@ int i40e_aq_suspend_port_tx(struct i40e_hw *hw, u16 seid,
 			    struct i40e_asq_cmd_details *cmd_details)
 {
 	struct i40e_aqc_tx_sched_ind *cmd;
-	struct i40e_aq_desc desc;
+	struct libie_aq_desc desc;
 	int status;
 
-	cmd = (struct i40e_aqc_tx_sched_ind *)&desc.params.raw;
+	cmd = libie_aq_raw(&desc);
 	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_suspend_port_tx);
 	cmd->vsi_seid = cpu_to_le16(seid);
 	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
@@ -3922,7 +3905,7 @@ int i40e_aq_suspend_port_tx(struct i40e_hw *hw, u16 seid,
 int i40e_aq_resume_port_tx(struct i40e_hw *hw,
 			   struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
+	struct libie_aq_desc desc;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_resume_port_tx);
@@ -3999,11 +3982,9 @@ int i40e_aq_debug_dump(struct i40e_hw *hw, u8 cluster_id,
 		       u8 *ret_next_table, u32 *ret_next_index,
 		       struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_debug_dump_internals *cmd =
-		(struct i40e_aqc_debug_dump_internals *)&desc.params.raw;
-	struct i40e_aqc_debug_dump_internals *resp =
-		(struct i40e_aqc_debug_dump_internals *)&desc.params.raw;
+	struct i40e_aqc_debug_dump_internals *resp;
+	struct i40e_aqc_debug_dump_internals *cmd;
+	struct libie_aq_desc desc;
 	int status;
 
 	if (buff_size == 0 || !buff)
@@ -4011,10 +3992,12 @@ int i40e_aq_debug_dump(struct i40e_hw *hw, u8 cluster_id,
 
 	i40e_fill_default_direct_cmd_desc(&desc,
 					  i40e_aqc_opc_debug_dump_internals);
+	resp = libie_aq_raw(&desc);
+	cmd = libie_aq_raw(&desc);
 	/* Indirect Command */
-	desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_BUF);
+	desc.flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_BUF);
 	if (buff_size > I40E_AQ_LARGE_BUF)
-		desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB);
+		desc.flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_LB);
 
 	cmd->cluster_id = cluster_id;
 	cmd->table_id = table_id;
@@ -4091,18 +4074,18 @@ i40e_aq_configure_partition_bw(struct i40e_hw *hw,
 			       struct i40e_asq_cmd_details *cmd_details)
 {
 	u16 bwd_size = sizeof(*bw_data);
-	struct i40e_aq_desc desc;
+	struct libie_aq_desc desc;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc,
 					  i40e_aqc_opc_configure_partition_bw);
 
 	/* Indirect command */
-	desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_BUF);
-	desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_RD);
+	desc.flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_BUF);
+	desc.flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_RD);
 
 	if (bwd_size > I40E_AQ_LARGE_BUF)
-		desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB);
+		desc.flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_LB);
 
 	desc.datalen = cpu_to_le16(bwd_size);
 
@@ -4534,9 +4517,8 @@ int i40e_aq_rx_ctl_read_register(struct i40e_hw *hw,
 				 u32 reg_addr, u32 *reg_val,
 				 struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_rx_ctl_reg_read_write *cmd_resp =
-		(struct i40e_aqc_rx_ctl_reg_read_write *)&desc.params.raw;
+	struct i40e_aqc_rx_ctl_reg_read_write *cmd_resp;
+	struct libie_aq_desc desc;
 	int status;
 
 	if (!reg_val)
@@ -4544,6 +4526,7 @@ int i40e_aq_rx_ctl_read_register(struct i40e_hw *hw,
 
 	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_rx_ctl_reg_read);
 
+	cmd_resp = libie_aq_raw(&desc);
 	cmd_resp->address = cpu_to_le32(reg_addr);
 
 	status = i40e_asq_send_command(hw, &desc, NULL, 0, cmd_details);
@@ -4572,7 +4555,7 @@ u32 i40e_read_rx_ctl(struct i40e_hw *hw, u32 reg_addr)
 	if (!use_register) {
 do_retry:
 		status = i40e_aq_rx_ctl_read_register(hw, reg_addr, &val, NULL);
-		if (hw->aq.asq_last_status == I40E_AQ_RC_EAGAIN && retry) {
+		if (hw->aq.asq_last_status == LIBIE_AQ_RC_EAGAIN && retry) {
 			usleep_range(1000, 2000);
 			retry--;
 			goto do_retry;
@@ -4600,13 +4583,13 @@ int i40e_aq_rx_ctl_write_register(struct i40e_hw *hw,
 				  u32 reg_addr, u32 reg_val,
 				  struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_rx_ctl_reg_read_write *cmd =
-		(struct i40e_aqc_rx_ctl_reg_read_write *)&desc.params.raw;
+	struct i40e_aqc_rx_ctl_reg_read_write *cmd;
+	struct libie_aq_desc desc;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc, i40e_aqc_opc_rx_ctl_reg_write);
 
+	cmd = libie_aq_raw(&desc);
 	cmd->address = cpu_to_le32(reg_addr);
 	cmd->value = cpu_to_le32(reg_val);
 
@@ -4634,7 +4617,7 @@ void i40e_write_rx_ctl(struct i40e_hw *hw, u32 reg_addr, u32 reg_val)
 do_retry:
 		status = i40e_aq_rx_ctl_write_register(hw, reg_addr,
 						       reg_val, NULL);
-		if (hw->aq.asq_last_status == I40E_AQ_RC_EAGAIN && retry) {
+		if (hw->aq.asq_last_status == LIBIE_AQ_RC_EAGAIN && retry) {
 			usleep_range(1000, 2000);
 			retry--;
 			goto do_retry;
@@ -4693,14 +4676,14 @@ int i40e_aq_set_phy_register_ext(struct i40e_hw *hw,
 				 u32 reg_addr, u32 reg_val,
 				 struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_phy_register_access *cmd =
-		(struct i40e_aqc_phy_register_access *)&desc.params.raw;
+	struct i40e_aqc_phy_register_access *cmd;
+	struct libie_aq_desc desc;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc,
 					  i40e_aqc_opc_set_phy_register);
 
+	cmd = libie_aq_raw(&desc);
 	cmd->phy_interface = phy_select;
 	cmd->dev_address = dev_addr;
 	cmd->reg_address = cpu_to_le32(reg_addr);
@@ -4738,14 +4721,14 @@ int i40e_aq_get_phy_register_ext(struct i40e_hw *hw,
 				 u32 reg_addr, u32 *reg_val,
 				 struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_phy_register_access *cmd =
-		(struct i40e_aqc_phy_register_access *)&desc.params.raw;
+	struct i40e_aqc_phy_register_access *cmd;
+	struct libie_aq_desc desc;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc,
 					  i40e_aqc_opc_get_phy_register);
 
+	cmd = libie_aq_raw(&desc);
 	cmd->phy_interface = phy_select;
 	cmd->dev_address = dev_addr;
 	cmd->reg_address = cpu_to_le32(reg_addr);
@@ -4777,19 +4760,18 @@ int i40e_aq_write_ddp(struct i40e_hw *hw, void *buff,
 		      u32 *error_offset, u32 *error_info,
 		      struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_write_personalization_profile *cmd =
-		(struct i40e_aqc_write_personalization_profile *)
-		&desc.params.raw;
+	struct i40e_aqc_write_personalization_profile *cmd;
 	struct i40e_aqc_write_ddp_resp *resp;
+	struct libie_aq_desc desc;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc,
 					  i40e_aqc_opc_write_personalization_profile);
 
-	desc.flags |= cpu_to_le16(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD);
+	cmd = libie_aq_raw(&desc);
+	desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_BUF | LIBIE_AQ_FLAG_RD);
 	if (buff_size > I40E_AQ_LARGE_BUF)
-		desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB);
+		desc.flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_LB);
 
 	desc.datalen = cpu_to_le16(buff_size);
 
@@ -4797,7 +4779,7 @@ int i40e_aq_write_ddp(struct i40e_hw *hw, void *buff,
 
 	status = i40e_asq_send_command(hw, &desc, buff, buff_size, cmd_details);
 	if (!status) {
-		resp = (struct i40e_aqc_write_ddp_resp *)&desc.params.raw;
+		resp = libie_aq_raw(&desc);
 		if (error_offset)
 			*error_offset = le32_to_cpu(resp->error_offset);
 		if (error_info)
@@ -4819,17 +4801,17 @@ int i40e_aq_get_ddp_list(struct i40e_hw *hw, void *buff,
 			 u16 buff_size, u8 flags,
 			 struct i40e_asq_cmd_details *cmd_details)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_get_applied_profiles *cmd =
-		(struct i40e_aqc_get_applied_profiles *)&desc.params.raw;
+	struct i40e_aqc_get_applied_profiles *cmd;
+	struct libie_aq_desc desc;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc,
 					  i40e_aqc_opc_get_personalization_profile_list);
 
-	desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_BUF);
+	cmd = libie_aq_raw(&desc);
+	desc.flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_BUF);
 	if (buff_size > I40E_AQ_LARGE_BUF)
-		desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB);
+		desc.flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_LB);
 	desc.datalen = cpu_to_le16(buff_size);
 
 	cmd->flags = flags;
@@ -4891,7 +4873,7 @@ i40e_find_segment_in_package(u32 segment_type,
 static int i40e_ddp_exec_aq_section(struct i40e_hw *hw,
 				    struct i40e_profile_aq_section *aq)
 {
-	struct i40e_aq_desc desc;
+	struct libie_aq_desc desc;
 	u8 *msg = NULL;
 	u16 msglen;
 	int status;
@@ -4902,10 +4884,10 @@ static int i40e_ddp_exec_aq_section(struct i40e_hw *hw,
 
 	msglen = aq->datalen;
 	if (msglen) {
-		desc.flags |= cpu_to_le16((u16)(I40E_AQ_FLAG_BUF |
-						I40E_AQ_FLAG_RD));
+		desc.flags |= cpu_to_le16((u16)(LIBIE_AQ_FLAG_BUF |
+						LIBIE_AQ_FLAG_RD));
 		if (msglen > I40E_AQ_LARGE_BUF)
-			desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB);
+			desc.flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_LB);
 		desc.datalen = cpu_to_le16(msglen);
 		msg = &aq->data[0];
 	}
@@ -5122,18 +5104,18 @@ i40e_aq_add_cloud_filters(struct i40e_hw *hw, u16 seid,
 			  struct i40e_aqc_cloud_filters_element_data *filters,
 			  u8 filter_count)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_add_remove_cloud_filters *cmd =
-	(struct i40e_aqc_add_remove_cloud_filters *)&desc.params.raw;
+	struct i40e_aqc_add_remove_cloud_filters *cmd;
+	struct libie_aq_desc desc;
 	u16 buff_len;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc,
 					  i40e_aqc_opc_add_cloud_filters);
 
+	cmd = libie_aq_raw(&desc);
 	buff_len = filter_count * sizeof(*filters);
 	desc.datalen = cpu_to_le16(buff_len);
-	desc.flags |= cpu_to_le16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD));
+	desc.flags |= cpu_to_le16((u16)(LIBIE_AQ_FLAG_BUF | LIBIE_AQ_FLAG_RD));
 	cmd->num_filters = filter_count;
 	cmd->seid = cpu_to_le16(seid);
 
@@ -5159,9 +5141,8 @@ i40e_aq_add_cloud_filters_bb(struct i40e_hw *hw, u16 seid,
 			     struct i40e_aqc_cloud_filters_element_bb *filters,
 			     u8 filter_count)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_add_remove_cloud_filters *cmd =
-	(struct i40e_aqc_add_remove_cloud_filters *)&desc.params.raw;
+	struct i40e_aqc_add_remove_cloud_filters *cmd;
+	struct libie_aq_desc desc;
 	u16 buff_len;
 	int status;
 	int i;
@@ -5169,9 +5150,10 @@ i40e_aq_add_cloud_filters_bb(struct i40e_hw *hw, u16 seid,
 	i40e_fill_default_direct_cmd_desc(&desc,
 					  i40e_aqc_opc_add_cloud_filters);
 
+	cmd = libie_aq_raw(&desc);
 	buff_len = filter_count * sizeof(*filters);
 	desc.datalen = cpu_to_le16(buff_len);
-	desc.flags |= cpu_to_le16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD));
+	desc.flags |= cpu_to_le16((u16)(LIBIE_AQ_FLAG_BUF | LIBIE_AQ_FLAG_RD));
 	cmd->num_filters = filter_count;
 	cmd->seid = cpu_to_le16(seid);
 	cmd->big_buffer_flag = I40E_AQC_ADD_CLOUD_CMD_BB;
@@ -5215,18 +5197,18 @@ i40e_aq_rem_cloud_filters(struct i40e_hw *hw, u16 seid,
 			  struct i40e_aqc_cloud_filters_element_data *filters,
 			  u8 filter_count)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_add_remove_cloud_filters *cmd =
-	(struct i40e_aqc_add_remove_cloud_filters *)&desc.params.raw;
+	struct i40e_aqc_add_remove_cloud_filters *cmd;
+	struct libie_aq_desc desc;
 	u16 buff_len;
 	int status;
 
 	i40e_fill_default_direct_cmd_desc(&desc,
 					  i40e_aqc_opc_remove_cloud_filters);
 
+	cmd = libie_aq_raw(&desc);
 	buff_len = filter_count * sizeof(*filters);
 	desc.datalen = cpu_to_le16(buff_len);
-	desc.flags |= cpu_to_le16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD));
+	desc.flags |= cpu_to_le16((u16)(LIBIE_AQ_FLAG_BUF | LIBIE_AQ_FLAG_RD));
 	cmd->num_filters = filter_count;
 	cmd->seid = cpu_to_le16(seid);
 
@@ -5252,9 +5234,8 @@ i40e_aq_rem_cloud_filters_bb(struct i40e_hw *hw, u16 seid,
 			     struct i40e_aqc_cloud_filters_element_bb *filters,
 			     u8 filter_count)
 {
-	struct i40e_aq_desc desc;
-	struct i40e_aqc_add_remove_cloud_filters *cmd =
-	(struct i40e_aqc_add_remove_cloud_filters *)&desc.params.raw;
+	struct i40e_aqc_add_remove_cloud_filters *cmd;
+	struct libie_aq_desc desc;
 	u16 buff_len;
 	int status;
 	int i;
@@ -5262,9 +5243,10 @@ i40e_aq_rem_cloud_filters_bb(struct i40e_hw *hw, u16 seid,
 	i40e_fill_default_direct_cmd_desc(&desc,
 					  i40e_aqc_opc_remove_cloud_filters);
 
+	cmd = libie_aq_raw(&desc);
 	buff_len = filter_count * sizeof(*filters);
 	desc.datalen = cpu_to_le16(buff_len);
-	desc.flags |= cpu_to_le16((u16)(I40E_AQ_FLAG_BUF | I40E_AQ_FLAG_RD));
+	desc.flags |= cpu_to_le16((u16)(LIBIE_AQ_FLAG_BUF | LIBIE_AQ_FLAG_RD));
 	cmd->num_filters = filter_count;
 	cmd->seid = cpu_to_le16(seid);
 	cmd->big_buffer_flag = I40E_AQC_ADD_CLOUD_CMD_BB;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_dcb.c b/drivers/net/ethernet/intel/i40e/i40e_dcb.c
index 352e957443fd..9e0c9597aeb9 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_dcb.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_dcb.c
@@ -750,7 +750,7 @@ static int i40e_get_ieee_dcb_config(struct i40e_hw *hw)
 				     I40E_AQ_LLDP_BRIDGE_TYPE_NEAREST_BRIDGE,
 				     &hw->remote_dcbx_config);
 	/* Don't treat ENOENT as an error for Remote MIBs */
-	if (hw->aq.asq_last_status == I40E_AQ_RC_ENOENT)
+	if (hw->aq.asq_last_status == LIBIE_AQ_RC_ENOENT)
 		ret = 0;
 
 out:
@@ -799,7 +799,7 @@ int i40e_get_dcb_config(struct i40e_hw *hw)
 	}
 
 	/* CEE mode not enabled try querying IEEE data */
-	if (hw->aq.asq_last_status == I40E_AQ_RC_ENOENT)
+	if (hw->aq.asq_last_status == LIBIE_AQ_RC_ENOENT)
 		return i40e_get_ieee_dcb_config(hw);
 
 	if (ret)
@@ -816,7 +816,7 @@ int i40e_get_dcb_config(struct i40e_hw *hw)
 				     I40E_AQ_LLDP_BRIDGE_TYPE_NEAREST_BRIDGE,
 				     &hw->remote_dcbx_config);
 	/* Don't treat ENOENT as an error for Remote MIBs */
-	if (hw->aq.asq_last_status == I40E_AQ_RC_ENOENT)
+	if (hw->aq.asq_last_status == LIBIE_AQ_RC_ENOENT)
 		ret = 0;
 
 out:
@@ -925,11 +925,11 @@ i40e_get_fw_lldp_status(struct i40e_hw *hw,
 
 	if (!ret) {
 		*lldp_status = I40E_GET_FW_LLDP_STATUS_ENABLED;
-	} else if (hw->aq.asq_last_status == I40E_AQ_RC_ENOENT) {
+	} else if (hw->aq.asq_last_status == LIBIE_AQ_RC_ENOENT) {
 		/* MIB is not available yet but the agent is running */
 		*lldp_status = I40E_GET_FW_LLDP_STATUS_ENABLED;
 		ret = 0;
-	} else if (hw->aq.asq_last_status == I40E_AQ_RC_EPERM) {
+	} else if (hw->aq.asq_last_status == LIBIE_AQ_RC_EPERM) {
 		*lldp_status = I40E_GET_FW_LLDP_STATUS_DISABLED;
 		ret = 0;
 	}
diff --git a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
index 6cd9da662ae1..6cd6f23d42a6 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
@@ -489,7 +489,7 @@ static void i40e_dbg_dump_aq_desc(struct i40e_pf *pf)
 	dev_info(&pf->pdev->dev, "AdminQ Tx Ring\n");
 	ring = &(hw->aq.asq);
 	for (i = 0; i < ring->count; i++) {
-		struct i40e_aq_desc *d = I40E_ADMINQ_DESC(*ring, i);
+		struct libie_aq_desc *d = I40E_ADMINQ_DESC(*ring, i);
 
 		dev_info(&pf->pdev->dev,
 			 "   at[%02d] flags=0x%04x op=0x%04x dlen=0x%04x ret=0x%04x cookie_h=0x%08x cookie_l=0x%08x\n",
@@ -502,7 +502,7 @@ static void i40e_dbg_dump_aq_desc(struct i40e_pf *pf)
 	dev_info(&pf->pdev->dev, "AdminQ Rx Ring\n");
 	ring = &(hw->aq.arq);
 	for (i = 0; i < ring->count; i++) {
-		struct i40e_aq_desc *d = I40E_ADMINQ_DESC(*ring, i);
+		struct libie_aq_desc *d = I40E_ADMINQ_DESC(*ring, i);
 
 		dev_info(&pf->pdev->dev,
 			 "   ar[%02d] flags=0x%04x op=0x%04x dlen=0x%04x ret=0x%04x cookie_h=0x%08x cookie_l=0x%08x\n",
@@ -1268,10 +1268,10 @@ static ssize_t i40e_dbg_command_write(struct file *filp,
 			dev_info(&pf->pdev->dev, "clear_stats vsi [seid] or clear_stats port\n");
 		}
 	} else if (strncmp(cmd_buf, "send aq_cmd", 11) == 0) {
-		struct i40e_aq_desc *desc;
+		struct libie_aq_desc *desc;
 		int ret;
 
-		desc = kzalloc(sizeof(struct i40e_aq_desc), GFP_KERNEL);
+		desc = kzalloc(sizeof(*desc), GFP_KERNEL);
 		if (!desc)
 			goto command_write_done;
 		cnt = sscanf(&cmd_buf[11],
@@ -1279,10 +1279,10 @@ static ssize_t i40e_dbg_command_write(struct file *filp,
 			     &desc->flags,
 			     &desc->opcode, &desc->datalen, &desc->retval,
 			     &desc->cookie_high, &desc->cookie_low,
-			     &desc->params.internal.param0,
-			     &desc->params.internal.param1,
-			     &desc->params.internal.param2,
-			     &desc->params.internal.param3);
+			     &desc->params.generic.param0,
+			     &desc->params.generic.param1,
+			     &desc->params.generic.addr_high,
+			     &desc->params.generic.addr_low);
 		if (cnt != 10) {
 			dev_info(&pf->pdev->dev,
 				 "send aq_cmd: bad command string, cnt=%d\n",
@@ -1307,19 +1307,19 @@ static ssize_t i40e_dbg_command_write(struct file *filp,
 			 "AQ desc WB 0x%04x 0x%04x 0x%04x 0x%04x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x\n",
 			 desc->flags, desc->opcode, desc->datalen, desc->retval,
 			 desc->cookie_high, desc->cookie_low,
-			 desc->params.internal.param0,
-			 desc->params.internal.param1,
-			 desc->params.internal.param2,
-			 desc->params.internal.param3);
+			 desc->params.generic.param0,
+			 desc->params.generic.param1,
+			 desc->params.generic.addr_high,
+			 desc->params.generic.addr_low);
 		kfree(desc);
 		desc = NULL;
 	} else if (strncmp(cmd_buf, "send indirect aq_cmd", 20) == 0) {
-		struct i40e_aq_desc *desc;
+		struct libie_aq_desc *desc;
 		u16 buffer_len;
 		u8 *buff;
 		int ret;
 
-		desc = kzalloc(sizeof(struct i40e_aq_desc), GFP_KERNEL);
+		desc = kzalloc(sizeof(*desc), GFP_KERNEL);
 		if (!desc)
 			goto command_write_done;
 		cnt = sscanf(&cmd_buf[20],
@@ -1327,10 +1327,10 @@ static ssize_t i40e_dbg_command_write(struct file *filp,
 			     &desc->flags,
 			     &desc->opcode, &desc->datalen, &desc->retval,
 			     &desc->cookie_high, &desc->cookie_low,
-			     &desc->params.internal.param0,
-			     &desc->params.internal.param1,
-			     &desc->params.internal.param2,
-			     &desc->params.internal.param3,
+			     &desc->params.generic.param0,
+			     &desc->params.generic.param1,
+			     &desc->params.generic.addr_high,
+			     &desc->params.generic.addr_low,
 			     &buffer_len);
 		if (cnt != 11) {
 			dev_info(&pf->pdev->dev,
@@ -1350,7 +1350,7 @@ static ssize_t i40e_dbg_command_write(struct file *filp,
 			desc = NULL;
 			goto command_write_done;
 		}
-		desc->flags |= cpu_to_le16((u16)I40E_AQ_FLAG_BUF);
+		desc->flags |= cpu_to_le16((u16)LIBIE_AQ_FLAG_BUF);
 		ret = i40e_asq_send_command(&pf->hw, desc, buff,
 					    buffer_len, NULL);
 		if (!ret) {
@@ -1368,10 +1368,10 @@ static ssize_t i40e_dbg_command_write(struct file *filp,
 			 "AQ desc WB 0x%04x 0x%04x 0x%04x 0x%04x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x\n",
 			 desc->flags, desc->opcode, desc->datalen, desc->retval,
 			 desc->cookie_high, desc->cookie_low,
-			 desc->params.internal.param0,
-			 desc->params.internal.param1,
-			 desc->params.internal.param2,
-			 desc->params.internal.param3);
+			 desc->params.generic.param0,
+			 desc->params.generic.param1,
+			 desc->params.generic.addr_high,
+			 desc->params.generic.addr_low);
 		print_hex_dump(KERN_INFO, "AQ buffer WB: ",
 			       DUMP_PREFIX_OFFSET, 16, 1,
 			       buff, buffer_len, true);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 2ff17d50135c..2b01eedf3605 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -1918,13 +1918,13 @@ static int i40e_get_eeprom(struct net_device *netdev,
 		ret_val = i40e_aq_read_nvm(hw, 0x0, offset, len,
 				(u8 *)eeprom_buff + (I40E_NVM_SECTOR_SIZE * i),
 				last, NULL);
-		if (ret_val && hw->aq.asq_last_status == I40E_AQ_RC_EPERM) {
+		if (ret_val && hw->aq.asq_last_status == LIBIE_AQ_RC_EPERM) {
 			dev_info(&pf->pdev->dev,
 				 "read NVM failed, invalid offset 0x%x\n",
 				 offset);
 			break;
 		} else if (ret_val &&
-			   hw->aq.asq_last_status == I40E_AQ_RC_EACCES) {
+			   hw->aq.asq_last_status == LIBIE_AQ_RC_EACCES) {
 			dev_info(&pf->pdev->dev,
 				 "read NVM failed, access, offset 0x%x\n",
 				 offset);
@@ -5249,9 +5249,9 @@ static int i40e_set_priv_flags(struct net_device *dev, u32 flags)
 	DECLARE_BITMAP(orig_flags, I40E_PF_FLAGS_NBITS);
 	DECLARE_BITMAP(new_flags, I40E_PF_FLAGS_NBITS);
 	struct i40e_netdev_priv *np = netdev_priv(dev);
-	enum i40e_admin_queue_err adq_err;
 	struct i40e_vsi *vsi = np->vsi;
 	struct i40e_pf *pf = vsi->back;
+	enum libie_aq_err adq_err;
 	u32 reset_needed = 0;
 	int status;
 	u32 i, j;
@@ -5371,7 +5371,7 @@ flags_complete:
 		valid_flags = I40E_AQ_SET_SWITCH_CFG_PROMISC;
 		ret = i40e_aq_set_switch_config(&pf->hw, sw_flags, valid_flags,
 						0, NULL);
-		if (ret && pf->hw.aq.asq_last_status != I40E_AQ_RC_ESRCH) {
+		if (ret && pf->hw.aq.asq_last_status != LIBIE_AQ_RC_ESRCH) {
 			dev_info(&pf->pdev->dev,
 				 "couldn't set switch config bits, err %pe aq_err %s\n",
 				 ERR_PTR(ret),
@@ -5438,16 +5438,16 @@ flags_complete:
 			if (status) {
 				adq_err = pf->hw.aq.asq_last_status;
 				switch (adq_err) {
-				case I40E_AQ_RC_EEXIST:
+				case LIBIE_AQ_RC_EEXIST:
 					dev_warn(&pf->pdev->dev,
 						 "FW LLDP agent is already running\n");
 					reset_needed = 0;
 					break;
-				case I40E_AQ_RC_EPERM:
+				case LIBIE_AQ_RC_EPERM:
 					dev_warn(&pf->pdev->dev,
 						 "Device configuration forbids SW from starting the LLDP agent.\n");
 					return -EINVAL;
-				case I40E_AQ_RC_EAGAIN:
+				case LIBIE_AQ_RC_EAGAIN:
 					dev_warn(&pf->pdev->dev,
 						 "Stop FW LLDP agent command is still being processed, please try again in a second.\n");
 					return -EBUSY;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 949b74fbb127..f4d913eeeac6 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -2340,14 +2340,14 @@ void i40e_aqc_del_filters(struct i40e_vsi *vsi, const char *vsi_name,
 			  int num_del, int *retval)
 {
 	struct i40e_hw *hw = &vsi->back->hw;
-	enum i40e_admin_queue_err aq_status;
+	enum libie_aq_err aq_status;
 	int aq_ret;
 
 	aq_ret = i40e_aq_remove_macvlan_v2(hw, vsi->seid, list, num_del, NULL,
 					   &aq_status);
 
 	/* Explicitly ignore and do not report when firmware returns ENOENT */
-	if (aq_ret && !(aq_status == I40E_AQ_RC_ENOENT)) {
+	if (aq_ret && !(aq_status == LIBIE_AQ_RC_ENOENT)) {
 		*retval = -EIO;
 		dev_info(&vsi->back->pdev->dev,
 			 "ignoring delete macvlan error on %s, err %pe, aq_err %s\n",
@@ -2375,7 +2375,7 @@ void i40e_aqc_add_filters(struct i40e_vsi *vsi, const char *vsi_name,
 			  int num_add)
 {
 	struct i40e_hw *hw = &vsi->back->hw;
-	enum i40e_admin_queue_err aq_status;
+	enum libie_aq_err aq_status;
 	int fcnt;
 
 	i40e_aq_add_macvlan_v2(hw, vsi->seid, list, num_add, NULL, &aq_status);
@@ -5997,8 +5997,8 @@ int i40e_set_bw_limit(struct i40e_vsi *vsi, u16 seid, u64 max_tx_rate)
  **/
 static void i40e_remove_queue_channels(struct i40e_vsi *vsi)
 {
-	enum i40e_admin_queue_err last_aq_status;
 	struct i40e_cloud_filter *cfilter;
+	enum libie_aq_err last_aq_status;
 	struct i40e_channel *ch, *ch_tmp;
 	struct i40e_pf *pf = vsi->back;
 	struct hlist_node *node;
@@ -6539,7 +6539,7 @@ static int i40e_validate_and_set_switch_mode(struct i40e_vsi *vsi)
 	ret = i40e_aq_set_switch_config(hw, pf->last_sw_conf_flags,
 					pf->last_sw_conf_valid_flags,
 					mode, NULL);
-	if (ret && hw->aq.asq_last_status != I40E_AQ_RC_ESRCH)
+	if (ret && hw->aq.asq_last_status != LIBIE_AQ_RC_ESRCH)
 		dev_err(&pf->pdev->dev,
 			"couldn't set switch config bits, err %pe aq_err %s\n",
 			ERR_PTR(ret),
@@ -7214,7 +7214,7 @@ static int i40e_init_pf_dcb(struct i40e_pf *pf)
 			dev_dbg(&pf->pdev->dev,
 				"DCBX offload is supported for this PF.\n");
 		}
-	} else if (pf->hw.aq.asq_last_status == I40E_AQ_RC_EPERM) {
+	} else if (pf->hw.aq.asq_last_status == LIBIE_AQ_RC_EPERM) {
 		dev_info(&pf->pdev->dev, "FW LLDP disabled for this PF.\n");
 		set_bit(I40E_FLAG_FW_LLDP_DIS, pf->flags);
 	} else {
@@ -9419,8 +9419,7 @@ bool i40e_dcb_need_reconfig(struct i40e_pf *pf,
 static int i40e_handle_lldp_event(struct i40e_pf *pf,
 				  struct i40e_arq_event_info *e)
 {
-	struct i40e_aqc_lldp_get_mib *mib =
-		(struct i40e_aqc_lldp_get_mib *)&e->desc.params.raw;
+	struct i40e_aqc_lldp_get_mib *mib = libie_aq_raw(&e->desc);
 	struct i40e_hw *hw = &pf->hw;
 	struct i40e_dcbx_config tmp_dcbx_cfg;
 	bool need_reconfig = false;
@@ -9559,8 +9558,7 @@ void i40e_do_reset_safe(struct i40e_pf *pf, u32 reset_flags)
 static void i40e_handle_lan_overflow_event(struct i40e_pf *pf,
 					   struct i40e_arq_event_info *e)
 {
-	struct i40e_aqc_lan_overflow *data =
-		(struct i40e_aqc_lan_overflow *)&e->desc.params.raw;
+	struct i40e_aqc_lan_overflow *data = libie_aq_raw(&e->desc);
 	u32 queue = le32_to_cpu(data->prtdcb_rupto);
 	u32 qtx_ctl = le32_to_cpu(data->otx_ctl);
 	struct i40e_hw *hw = &pf->hw;
@@ -10080,8 +10078,7 @@ static void i40e_reset_subtask(struct i40e_pf *pf)
 static void i40e_handle_link_event(struct i40e_pf *pf,
 				   struct i40e_arq_event_info *e)
 {
-	struct i40e_aqc_get_link_status *status =
-		(struct i40e_aqc_get_link_status *)&e->desc.params.raw;
+	struct i40e_aqc_get_link_status *status = libie_aq_raw(&e->desc);
 
 	/* Do a new status request to re-enable LSE reporting
 	 * and load new status information into the hw struct
@@ -10453,12 +10450,12 @@ static int i40e_reconstitute_veb(struct i40e_veb *veb)
 static int i40e_get_capabilities(struct i40e_pf *pf,
 				 enum i40e_admin_queue_opc list_type)
 {
-	struct i40e_aqc_list_capabilities_element_resp *cap_buf;
+	struct libie_aqc_list_caps_elem *cap_buf;
 	u16 data_size;
 	int buf_len;
 	int err;
 
-	buf_len = 40 * sizeof(struct i40e_aqc_list_capabilities_element_resp);
+	buf_len = 40 * sizeof(struct libie_aqc_list_caps_elem);
 	do {
 		cap_buf = kzalloc(buf_len, GFP_KERNEL);
 		if (!cap_buf)
@@ -10471,10 +10468,10 @@ static int i40e_get_capabilities(struct i40e_pf *pf,
 		/* data loaded, buffer no longer needed */
 		kfree(cap_buf);
 
-		if (pf->hw.aq.asq_last_status == I40E_AQ_RC_ENOMEM) {
+		if (pf->hw.aq.asq_last_status == LIBIE_AQ_RC_ENOMEM) {
 			/* retry with a larger buffer */
 			buf_len = data_size;
-		} else if (pf->hw.aq.asq_last_status != I40E_AQ_RC_OK || err) {
+		} else if (pf->hw.aq.asq_last_status != LIBIE_AQ_RC_OK || err) {
 			dev_info(&pf->pdev->dev,
 				 "capability discovery failed, err %pe aq_err %s\n",
 				 ERR_PTR(err),
@@ -15029,7 +15026,7 @@ static int i40e_setup_pf_switch(struct i40e_pf *pf, bool reinit, bool lock_acqui
 		valid_flags = I40E_AQ_SET_SWITCH_CFG_PROMISC;
 		ret = i40e_aq_set_switch_config(&pf->hw, flags, valid_flags, 0,
 						NULL);
-		if (ret && pf->hw.aq.asq_last_status != I40E_AQ_RC_ESRCH) {
+		if (ret && pf->hw.aq.asq_last_status != LIBIE_AQ_RC_ESRCH) {
 			dev_info(&pf->pdev->dev,
 				 "couldn't set switch config bits, err %pe aq_err %s\n",
 				 ERR_PTR(ret),
diff --git a/drivers/net/ethernet/intel/i40e/i40e_nvm.c b/drivers/net/ethernet/intel/i40e/i40e_nvm.c
index 7f0936f4e05e..5dfbe71205e6 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_nvm.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_nvm.c
@@ -997,7 +997,7 @@ static int i40e_nvmupd_exec_aq(struct i40e_hw *hw,
 			       u8 *bytes, int *perrno)
 {
 	struct i40e_asq_cmd_details cmd_details;
-	struct i40e_aq_desc *aq_desc;
+	struct libie_aq_desc *aq_desc;
 	u32 buff_size = 0;
 	u8 *buff = NULL;
 	u32 aq_desc_len;
@@ -1011,7 +1011,7 @@ static int i40e_nvmupd_exec_aq(struct i40e_hw *hw,
 	memset(&cmd_details, 0, sizeof(cmd_details));
 	cmd_details.wb_desc = &hw->nvm_wb_desc;
 
-	aq_desc_len = sizeof(struct i40e_aq_desc);
+	aq_desc_len = sizeof(struct libie_aq_desc);
 	memset(&hw->nvm_wb_desc, 0, aq_desc_len);
 
 	/* get the aq descriptor */
@@ -1022,7 +1022,7 @@ static int i40e_nvmupd_exec_aq(struct i40e_hw *hw,
 		*perrno = -EINVAL;
 		return -EINVAL;
 	}
-	aq_desc = (struct i40e_aq_desc *)bytes;
+	aq_desc = (struct libie_aq_desc *)bytes;
 
 	/* if data buffer needed, make sure it's ready */
 	aq_data_len = cmd->data_size - aq_desc_len;
@@ -1087,7 +1087,7 @@ static int i40e_nvmupd_get_aq_result(struct i40e_hw *hw,
 
 	i40e_debug(hw, I40E_DEBUG_NVM, "NVMUPD: %s\n", __func__);
 
-	aq_desc_len = sizeof(struct i40e_aq_desc);
+	aq_desc_len = sizeof(struct libie_aq_desc);
 	aq_total_len = aq_desc_len + le16_to_cpu(hw->nvm_wb_desc.datalen);
 
 	/* check offset range */
@@ -1154,7 +1154,7 @@ static int i40e_nvmupd_get_aq_event(struct i40e_hw *hw,
 
 	i40e_debug(hw, I40E_DEBUG_NVM, "NVMUPD: %s\n", __func__);
 
-	aq_desc_len = sizeof(struct i40e_aq_desc);
+	aq_desc_len = sizeof(struct libie_aq_desc);
 	aq_total_len = aq_desc_len + le16_to_cpu(hw->nvm_aq_event_desc.datalen);
 
 	/* check copylength range */
@@ -1442,7 +1442,7 @@ retry:
 	 * so here we try to reacquire the semaphore then retry the write.
 	 * We only do one retry, then give up.
 	 */
-	if (status && hw->aq.asq_last_status == I40E_AQ_RC_EBUSY &&
+	if (status && hw->aq.asq_last_status == LIBIE_AQ_RC_EBUSY &&
 	    !retry_attempt) {
 		u32 old_asq_status = hw->aq.asq_last_status;
 		int old_status = status;
@@ -1628,9 +1628,9 @@ void i40e_nvmupd_clear_wait_state(struct i40e_hw *hw)
  * @desc: AdminQ descriptor
  **/
 void i40e_nvmupd_check_wait_event(struct i40e_hw *hw, u16 opcode,
-				  struct i40e_aq_desc *desc)
+				  struct libie_aq_desc *desc)
 {
-	u32 aq_desc_len = sizeof(struct i40e_aq_desc);
+	u32 aq_desc_len = sizeof(struct libie_aq_desc);
 
 	if (opcode == hw->nvm_wait_opcode) {
 		memcpy(&hw->nvm_aq_event_desc, desc, aq_desc_len);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_prototype.h b/drivers/net/ethernet/intel/i40e/i40e_prototype.h
index 099bb8ab7d70..bd54f06b43cd 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_prototype.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_prototype.h
@@ -23,22 +23,22 @@ int i40e_clean_arq_element(struct i40e_hw *hw,
 			   struct i40e_arq_event_info *e,
 			   u16 *events_pending);
 int
-i40e_asq_send_command(struct i40e_hw *hw, struct i40e_aq_desc *desc,
+i40e_asq_send_command(struct i40e_hw *hw, struct libie_aq_desc *desc,
 		      void *buff, /* can be NULL */ u16  buff_size,
 		      struct i40e_asq_cmd_details *cmd_details);
 int
-i40e_asq_send_command_atomic(struct i40e_hw *hw, struct i40e_aq_desc *desc,
+i40e_asq_send_command_atomic(struct i40e_hw *hw, struct libie_aq_desc *desc,
 			     void *buff, /* can be NULL */ u16  buff_size,
 			     struct i40e_asq_cmd_details *cmd_details,
 			     bool is_atomic_context);
 int
 i40e_asq_send_command_atomic_v2(struct i40e_hw *hw,
-				struct i40e_aq_desc *desc,
+				struct libie_aq_desc *desc,
 				void *buff, /* can be NULL */
 				u16  buff_size,
 				struct i40e_asq_cmd_details *cmd_details,
 				bool is_atomic_context,
-				enum i40e_admin_queue_err *aq_status);
+				enum libie_aq_err *aq_status);
 
 /* debug function for adminq */
 void i40e_debug_aq(struct i40e_hw *hw, enum i40e_debug_mask mask,
@@ -46,7 +46,7 @@ void i40e_debug_aq(struct i40e_hw *hw, enum i40e_debug_mask mask,
 
 bool i40e_check_asq_alive(struct i40e_hw *hw);
 int i40e_aq_queue_shutdown(struct i40e_hw *hw, bool unloading);
-const char *i40e_aq_str(struct i40e_hw *hw, enum i40e_admin_queue_err aq_err);
+const char *i40e_aq_str(struct i40e_hw *hw, enum libie_aq_err aq_err);
 
 int i40e_aq_get_rss_lut(struct i40e_hw *hw, u16 seid,
 			bool pf_lut, u8 *lut, u16 lut_size);
@@ -155,7 +155,7 @@ int
 i40e_aq_add_macvlan_v2(struct i40e_hw *hw, u16 seid,
 		       struct i40e_aqc_add_macvlan_element_data *mv_list,
 		       u16 count, struct i40e_asq_cmd_details *cmd_details,
-		       enum i40e_admin_queue_err *aq_status);
+		       enum libie_aq_err *aq_status);
 int i40e_aq_remove_macvlan(struct i40e_hw *hw, u16 vsi_id,
 			   struct i40e_aqc_remove_macvlan_element_data *mv_list,
 			   u16 count, struct i40e_asq_cmd_details *cmd_details);
@@ -163,7 +163,7 @@ int
 i40e_aq_remove_macvlan_v2(struct i40e_hw *hw, u16 seid,
 			  struct i40e_aqc_remove_macvlan_element_data *mv_list,
 			  u16 count, struct i40e_asq_cmd_details *cmd_details,
-			  enum i40e_admin_queue_err *aq_status);
+			  enum libie_aq_err *aq_status);
 
 int i40e_aq_send_msg_to_vf(struct i40e_hw *hw, u16 vfid,
 			   u32 v_opcode, u32 v_retval, u8 *msg, u16 msglen,
@@ -339,7 +339,7 @@ int i40e_nvmupd_command(struct i40e_hw *hw,
 			struct i40e_nvm_access *cmd,
 			u8 *bytes, int *errno);
 void i40e_nvmupd_check_wait_event(struct i40e_hw *hw, u16 opcode,
-				  struct i40e_aq_desc *desc);
+				  struct libie_aq_desc *desc);
 void i40e_nvmupd_clear_wait_state(struct i40e_hw *hw);
 void i40e_set_pci_config_data(struct i40e_hw *hw, u16 link_status);
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_type.h b/drivers/net/ethernet/intel/i40e/i40e_type.h
index a09ed83835ff..ed8bbdb586da 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_type.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_type.h
@@ -24,7 +24,7 @@
 
 /* forward declaration */
 struct i40e_hw;
-typedef void (*I40E_ADMINQ_CALLBACK)(struct i40e_hw *, struct i40e_aq_desc *);
+typedef void (*I40E_ADMINQ_CALLBACK)(struct i40e_hw *, struct libie_aq_desc *);
 
 /* Data type manipulation macros. */
 
@@ -555,8 +555,8 @@ struct i40e_hw {
 
 	/* state of nvm update process */
 	enum i40e_nvmupd_state nvmupd_state;
-	struct i40e_aq_desc nvm_wb_desc;
-	struct i40e_aq_desc nvm_aq_event_desc;
+	struct libie_aq_desc nvm_wb_desc;
+	struct libie_aq_desc nvm_aq_event_desc;
 	struct i40e_virt_mem nvm_buff;
 	bool nvm_release_on_done;
 	u16 nvm_wait_opcode;
diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index 3024f5dde384..143ce7e4aea2 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -7951,6 +7951,10 @@ const char *ice_aq_str(enum libie_aq_err aq_err)
 		return "ICE_AQ_RC_EBADMAN";
 	case LIBIE_AQ_RC_EBADBUF:
 		return "ICE_AQ_RC_EBADBUF";
+	case LIBIE_AQ_RC_EIO:
+		return "ICE_AQ_RC_EIO";
+	case LIBIE_AQ_RC_EACCES:
+		return "ICE_AQ_RC_EACCES";
 	}
 
 	return "ICE_AQ_RC_UNKNOWN";
diff --git a/include/linux/net/intel/libie/adminq.h b/include/linux/net/intel/libie/adminq.h
index b8079e7d842a..bab7cecc657f 100644
--- a/include/linux/net/intel/libie/adminq.h
+++ b/include/linux/net/intel/libie/adminq.h
@@ -145,17 +145,26 @@ struct libie_aqc_list_caps {
 LIBIE_CHECK_STRUCT_LEN(16, libie_aqc_list_caps);
 
 /* Device/Function buffer entry, repeated per reported capability */
+#define LIBIE_AQC_CAPS_SWITCH_MODE			0x0001
+#define LIBIE_AQC_CAPS_MNG_MODE				0x0002
+#define LIBIE_AQC_CAPS_NPAR_ACTIVE			0x0003
+#define LIBIE_AQC_CAPS_OS2BMC_CAP			0x0004
 #define LIBIE_AQC_CAPS_VALID_FUNCTIONS			0x0005
 #define LIBIE_AQC_MAX_VALID_FUNCTIONS			0x8
 #define LIBIE_AQC_CAPS_SRIOV				0x0012
 #define LIBIE_AQC_CAPS_VF				0x0013
 #define LIBIE_AQC_CAPS_VMDQ				0x0014
+#define LIBIE_AQC_CAPS_8021QBG				0x0015
+#define LIBIE_AQC_CAPS_8021QBR				0x0016
 #define LIBIE_AQC_CAPS_VSI				0x0017
 #define LIBIE_AQC_CAPS_DCB				0x0018
+#define LIBIE_AQC_CAPS_FCOE				0x0021
+#define LIBIE_AQC_CAPS_ISCSI				0x0022
 #define LIBIE_AQC_CAPS_RSS				0x0040
 #define LIBIE_AQC_CAPS_RXQS				0x0041
 #define LIBIE_AQC_CAPS_TXQS				0x0042
 #define LIBIE_AQC_CAPS_MSIX				0x0043
+#define LIBIE_AQC_CAPS_VF_MSIX				0x0044
 #define LIBIE_AQC_CAPS_FD				0x0045
 #define LIBIE_AQC_CAPS_1588				0x0046
 #define LIBIE_AQC_CAPS_MAX_MTU				0x0047
@@ -166,6 +175,10 @@ LIBIE_CHECK_STRUCT_LEN(16, libie_aqc_list_caps);
 #define LIBIE_AQC_CAPS_NET_VER				0x004C
 #define LIBIE_AQC_CAPS_PENDING_NET_VER			0x004D
 #define LIBIE_AQC_CAPS_RDMA				0x0051
+#define LIBIE_AQC_CAPS_LED				0x0061
+#define LIBIE_AQC_CAPS_SDP				0x0062
+#define LIBIE_AQC_CAPS_MDIO				0x0063
+#define LIBIE_AQC_CAPS_WSR_PROT				0x0064
 #define LIBIE_AQC_CAPS_SENSOR_READING			0x0067
 #define LIBIE_AQC_INLINE_IPSEC				0x0070
 #define LIBIE_AQC_CAPS_NUM_ENABLED_PORTS		0x0072
@@ -181,6 +194,8 @@ LIBIE_CHECK_STRUCT_LEN(16, libie_aqc_list_caps);
 #define LIBIE_AQC_CAPS_FW_LAG_SUPPORT			0x0092
 #define LIBIE_AQC_BIT_ROCEV2_LAG			0x01
 #define LIBIE_AQC_BIT_SRIOV_LAG				0x02
+#define LIBIE_AQC_CAPS_FLEX10				0x00F1
+#define LIBIE_AQC_CAPS_CEM				0x00F2
 
 /**
  * struct libie_aqc_list_caps_elem - Getting list of caps elements
@@ -266,8 +281,10 @@ enum libie_aq_err {
 	LIBIE_AQ_RC_EPERM	= 1,  /* Operation not permitted */
 	LIBIE_AQ_RC_ENOENT	= 2,  /* No such element */
 	LIBIE_AQ_RC_ESRCH	= 3,  /* Bad opcode */
+	LIBIE_AQ_RC_EIO		= 5,  /* I/O error */
 	LIBIE_AQ_RC_EAGAIN	= 8,  /* Try again */
 	LIBIE_AQ_RC_ENOMEM	= 9,  /* Out of memory */
+	LIBIE_AQ_RC_EACCES	= 10, /* Permission denied */
 	LIBIE_AQ_RC_EBUSY	= 12, /* Device or resource busy */
 	LIBIE_AQ_RC_EEXIST	= 13, /* Object already exists */
 	LIBIE_AQ_RC_EINVAL	= 14, /* Invalid argument */
-- 
cgit v1.2.3


From 5feaa7a07b85ebbef418ba4b80e4e0d23dc379f5 Mon Sep 17 00:00:00 2001
From: Michal Swiatkowski <michal.swiatkowski@linux.intel.com>
Date: Fri, 25 Apr 2025 08:08:06 +0200
Subject: libie: add adminq helper for converting err to str

Add a new module for common handling of Admin Queue related logic.
Start with a helper for error-to-string conversion. This lives inside
libie/, but is a separate module, which follows our logic of splitting
into topical modules to avoid pulling in unneeded code and to have
better organization in general.

Olek suggested how to better solve the error to string conversion.

It will be used in follow-up patches in ice, i40e and iavf.

Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Suggested-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Michal Swiatkowski <michal.swiatkowski@linux.intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/libie/Kconfig  |  6 ++++
 drivers/net/ethernet/intel/libie/Makefile |  4 +++
 drivers/net/ethernet/intel/libie/adminq.c | 52 +++++++++++++++++++++++++++++++
 include/linux/net/intel/libie/adminq.h    |  2 ++
 4 files changed, 64 insertions(+)
 create mode 100644 drivers/net/ethernet/intel/libie/adminq.c

(limited to 'include')

diff --git a/drivers/net/ethernet/intel/libie/Kconfig b/drivers/net/ethernet/intel/libie/Kconfig
index 33aff6bc8f81..e6072758e3d8 100644
--- a/drivers/net/ethernet/intel/libie/Kconfig
+++ b/drivers/net/ethernet/intel/libie/Kconfig
@@ -8,3 +8,9 @@ config LIBIE
 	  libie (Intel Ethernet library) is a common library built on top of
 	  libeth and containing vendor-specific routines shared between several
 	  Intel Ethernet drivers.
+
+config LIBIE_ADMINQ
+	tristate
+	help
+	  Helper functions used by Intel Ethernet drivers for administration
+	  queue command interface (aka adminq).
diff --git a/drivers/net/ethernet/intel/libie/Makefile b/drivers/net/ethernet/intel/libie/Makefile
index ffd27fab916a..e98f00b865d3 100644
--- a/drivers/net/ethernet/intel/libie/Makefile
+++ b/drivers/net/ethernet/intel/libie/Makefile
@@ -4,3 +4,7 @@
 obj-$(CONFIG_LIBIE)	+= libie.o
 
 libie-y			:= rx.o
+
+obj-$(CONFIG_LIBIE_ADMINQ) 	+= libie_adminq.o
+
+libie_adminq-y			:= adminq.o
diff --git a/drivers/net/ethernet/intel/libie/adminq.c b/drivers/net/ethernet/intel/libie/adminq.c
new file mode 100644
index 000000000000..55356548e3f0
--- /dev/null
+++ b/drivers/net/ethernet/intel/libie/adminq.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2025 Intel Corporation */
+
+#include <linux/module.h>
+#include <linux/net/intel/libie/adminq.h>
+
+static const char * const libie_aq_str_arr[] = {
+#define LIBIE_AQ_STR(x)					\
+	[LIBIE_AQ_RC_##x]	= "LIBIE_AQ_RC_" #x
+	LIBIE_AQ_STR(OK),
+	LIBIE_AQ_STR(EPERM),
+	LIBIE_AQ_STR(ENOENT),
+	LIBIE_AQ_STR(ESRCH),
+	LIBIE_AQ_STR(EIO),
+	LIBIE_AQ_STR(EAGAIN),
+	LIBIE_AQ_STR(ENOMEM),
+	LIBIE_AQ_STR(EACCES),
+	LIBIE_AQ_STR(EBUSY),
+	LIBIE_AQ_STR(EEXIST),
+	LIBIE_AQ_STR(EINVAL),
+	LIBIE_AQ_STR(ENOSPC),
+	LIBIE_AQ_STR(ENOSYS),
+	LIBIE_AQ_STR(EMODE),
+	LIBIE_AQ_STR(ENOSEC),
+	LIBIE_AQ_STR(EBADSIG),
+	LIBIE_AQ_STR(ESVN),
+	LIBIE_AQ_STR(EBADMAN),
+	LIBIE_AQ_STR(EBADBUF),
+#undef LIBIE_AQ_STR
+	"LIBIE_AQ_RC_UNKNOWN",	/* fallback entry, must stay last */
+};
+
+#define __LIBIE_AQ_STR_NUM (ARRAY_SIZE(libie_aq_str_arr) - 1)
+
+/**
+ * libie_aq_str - look up the name string of an admin queue error code
+ * @err: admin queue error code
+ *
+ * Return: table entry for @err, or "LIBIE_AQ_RC_UNKNOWN" if it has none
+ */
+const char *libie_aq_str(enum libie_aq_err err)
+{
+	if (err >= ARRAY_SIZE(libie_aq_str_arr) ||
+	    !libie_aq_str_arr[err])
+		err = __LIBIE_AQ_STR_NUM;
+
+	return libie_aq_str_arr[err];
+}
+EXPORT_SYMBOL_NS_GPL(libie_aq_str, "LIBIE_ADMINQ");
+
+MODULE_DESCRIPTION("Intel(R) Ethernet common library - adminq helpers");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/net/intel/libie/adminq.h b/include/linux/net/intel/libie/adminq.h
index bab7cecc657f..012b5d499c1a 100644
--- a/include/linux/net/intel/libie/adminq.h
+++ b/include/linux/net/intel/libie/adminq.h
@@ -303,4 +303,6 @@ static inline void *libie_aq_raw(struct libie_aq_desc *desc)
 	return &desc->params.raw;
 }
 
+const char *libie_aq_str(enum libie_aq_err err);
+
 #endif /* __LIBIE_ADMINQ_H */
-- 
cgit v1.2.3


From ca7be9c0a148cbfe38df95a0285339c532ca6e17 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Sun, 8 Jun 2025 17:15:35 +0100
Subject: mtd: ubi: Remove unused ubi_flush

ubi_flush() was added in 2012 as part of
commit 62f384552b67 ("UBI: modify ubi_wl_flush function to clear work queue
for a lnum")
but has remained unused.

(Its friend ubi_wl_flush() is still used)

Remove it.

Signed-off-by: Dr. David Alan Gilbert <linux@treblig.org>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 drivers/mtd/ubi/kapi.c  | 27 ---------------------------
 include/linux/mtd/ubi.h |  1 -
 2 files changed, 28 deletions(-)

(limited to 'include')

diff --git a/drivers/mtd/ubi/kapi.c b/drivers/mtd/ubi/kapi.c
index f1ea8677467f..df0a5a57b072 100644
--- a/drivers/mtd/ubi/kapi.c
+++ b/drivers/mtd/ubi/kapi.c
@@ -791,33 +791,6 @@ int ubi_sync(int ubi_num)
 }
 EXPORT_SYMBOL_GPL(ubi_sync);
 
-/**
- * ubi_flush - flush UBI work queue.
- * @ubi_num: UBI device to flush work queue
- * @vol_id: volume id to flush for
- * @lnum: logical eraseblock number to flush for
- *
- * This function executes all pending works for a particular volume id / logical
- * eraseblock number pair. If either value is set to %UBI_ALL, then it acts as
- * a wildcard for all of the corresponding volume numbers or logical
- * eraseblock numbers. It returns zero in case of success and a negative error
- * code in case of failure.
- */
-int ubi_flush(int ubi_num, int vol_id, int lnum)
-{
-	struct ubi_device *ubi;
-	int err = 0;
-
-	ubi = ubi_get_device(ubi_num);
-	if (!ubi)
-		return -ENODEV;
-
-	err = ubi_wl_flush(ubi, vol_id, lnum);
-	ubi_put_device(ubi);
-	return err;
-}
-EXPORT_SYMBOL_GPL(ubi_flush);
-
 BLOCKING_NOTIFIER_HEAD(ubi_notifiers);
 
 /**
diff --git a/include/linux/mtd/ubi.h b/include/linux/mtd/ubi.h
index 562f92504f2b..c3f79c4be1cc 100644
--- a/include/linux/mtd/ubi.h
+++ b/include/linux/mtd/ubi.h
@@ -250,7 +250,6 @@ int ubi_leb_unmap(struct ubi_volume_desc *desc, int lnum);
 int ubi_leb_map(struct ubi_volume_desc *desc, int lnum);
 int ubi_is_mapped(struct ubi_volume_desc *desc, int lnum);
 int ubi_sync(int ubi_num);
-int ubi_flush(int ubi_num, int vol_id, int lnum);
 
 /*
  * This function is the same as the 'ubi_leb_read()' function, but it does not
-- 
cgit v1.2.3


From 1c3b002c6bf684b445a7107609979bca5f21bc03 Mon Sep 17 00:00:00 2001
From: Frank Li <Frank.Li@nxp.com>
Date: Thu, 10 Jul 2025 15:13:49 -0400
Subject: PCI: endpoint: Add RC-to-EP doorbell support using platform MSI
 controller

Implement the doorbell feature by mapping the EP's MSI interrupt controller
message address to a dedicated BAR.

The EPF driver should pass the actual message data to be written to the
message address by the host through implementation-specific logic.

Signed-off-by: Frank Li <Frank.Li@nxp.com>
[mani: minor code cleanups and reworded commit message]
Signed-off-by: Manivannan Sadhasivam <mani@kernel.org>
[bhelgaas: fix kernel-doc]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Tested-by: Niklas Cassel <cassel@kernel.org>
Link: https://patch.msgid.link/20250710-ep-msi-v21-3-57683fc7fb25@nxp.com
---
 drivers/pci/endpoint/Kconfig      |  8 ++++
 drivers/pci/endpoint/Makefile     |  1 +
 drivers/pci/endpoint/pci-ep-msi.c | 92 +++++++++++++++++++++++++++++++++++++++
 include/linux/pci-ep-msi.h        | 28 ++++++++++++
 include/linux/pci-epf.h           | 15 +++++++
 5 files changed, 144 insertions(+)
 create mode 100644 drivers/pci/endpoint/pci-ep-msi.c
 create mode 100644 include/linux/pci-ep-msi.h

(limited to 'include')

diff --git a/drivers/pci/endpoint/Kconfig b/drivers/pci/endpoint/Kconfig
index 1c5d82eb57d4..8dad291be8b8 100644
--- a/drivers/pci/endpoint/Kconfig
+++ b/drivers/pci/endpoint/Kconfig
@@ -28,6 +28,14 @@ config PCI_ENDPOINT_CONFIGFS
 	   configure the endpoint function and used to bind the
 	   function with an endpoint controller.
 
+config PCI_ENDPOINT_MSI_DOORBELL
+	bool "PCI Endpoint MSI Doorbell Support"
+	depends on PCI_ENDPOINT && GENERIC_MSI_IRQ
+	help
+	  This enables the EP's MSI interrupt controller to function as a
+	  doorbell. The RC can trigger doorbell in EP by writing data to a
+	  dedicated BAR, which the EP maps to the controller's message address.
+
 source "drivers/pci/endpoint/functions/Kconfig"
 
 endmenu
diff --git a/drivers/pci/endpoint/Makefile b/drivers/pci/endpoint/Makefile
index 95b2fe47e3b0..b4869d52053a 100644
--- a/drivers/pci/endpoint/Makefile
+++ b/drivers/pci/endpoint/Makefile
@@ -6,3 +6,4 @@
 obj-$(CONFIG_PCI_ENDPOINT_CONFIGFS)	+= pci-ep-cfs.o
 obj-$(CONFIG_PCI_ENDPOINT)		+= pci-epc-core.o pci-epf-core.o\
 					   pci-epc-mem.o functions/
+obj-$(CONFIG_PCI_ENDPOINT_MSI_DOORBELL)	+= pci-ep-msi.o
diff --git a/drivers/pci/endpoint/pci-ep-msi.c b/drivers/pci/endpoint/pci-ep-msi.c
new file mode 100644
index 000000000000..95a47ce155ac
--- /dev/null
+++ b/drivers/pci/endpoint/pci-ep-msi.c
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * PCI Endpoint *Controller* (EPC) MSI library
+ *
+ * Copyright (C) 2025 NXP
+ * Author: Frank Li <Frank.Li@nxp.com>
+ */
+
+#include <linux/device.h>
+#include <linux/export.h>
+#include <linux/irqdomain.h>
+#include <linux/module.h>
+#include <linux/msi.h>
+#include <linux/of_irq.h>
+#include <linux/pci-epc.h>
+#include <linux/pci-epf.h>
+#include <linux/pci-ep-cfs.h>
+#include <linux/pci-ep-msi.h>
+#include <linux/slab.h>
+
+static void pci_epf_write_msi_msg(struct msi_desc *desc, struct msi_msg *msg)
+{
+	struct pci_epc *epc;
+	struct pci_epf *epf;
+
+	epc = pci_epc_get(dev_name(msi_desc_to_dev(desc)));
+	if (!epc)
+		return;
+
+	epf = list_first_entry_or_null(&epc->pci_epf, struct pci_epf, list);
+
+	if (epf && epf->db_msg && desc->msi_index < epf->num_db)
+		memcpy(&epf->db_msg[desc->msi_index].msg, msg, sizeof(*msg));
+
+	pci_epc_put(epc);
+}
+
+int pci_epf_alloc_doorbell(struct pci_epf *epf, u16 num_db)
+{
+	struct pci_epc *epc = epf->epc;
+	struct device *dev = &epf->dev;
+	struct irq_domain *domain;
+	void *msg;
+	int ret;
+	int i;
+
+	/* TODO: Multi-EPF support */
+	if (list_first_entry_or_null(&epc->pci_epf, struct pci_epf, list) != epf) {
+		dev_err(dev, "MSI doorbell doesn't support multiple EPF\n");
+		return -EINVAL;
+	}
+
+	domain = of_msi_map_get_device_domain(epc->dev.parent, 0,
+					      DOMAIN_BUS_PLATFORM_MSI);
+	if (!domain) {
+		dev_err(dev, "Can't find MSI domain for EPC\n");
+		return -ENODEV;
+	}
+
+	dev_set_msi_domain(epc->dev.parent, domain);
+
+	msg = kcalloc(num_db, sizeof(struct pci_epf_doorbell_msg), GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	epf->num_db = num_db;
+	epf->db_msg = msg;
+
+	ret = platform_device_msi_init_and_alloc_irqs(epc->dev.parent, num_db,
+						      pci_epf_write_msi_msg);
+	if (ret) {
+		dev_err(dev, "Failed to allocate MSI\n");
+		kfree(msg);
+		return ret;
+	}
+
+	for (i = 0; i < num_db; i++)
+		epf->db_msg[i].virq = msi_get_virq(epc->dev.parent, i);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(pci_epf_alloc_doorbell);
+
+void pci_epf_free_doorbell(struct pci_epf *epf)
+{
+	platform_device_msi_free_irqs_all(epf->epc->dev.parent);
+
+	kfree(epf->db_msg);
+	epf->db_msg = NULL;
+	epf->num_db = 0;
+}
+EXPORT_SYMBOL_GPL(pci_epf_free_doorbell);
diff --git a/include/linux/pci-ep-msi.h b/include/linux/pci-ep-msi.h
new file mode 100644
index 000000000000..7c5db90f9620
--- /dev/null
+++ b/include/linux/pci-ep-msi.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * PCI Endpoint *Function* side MSI header file
+ *
+ * Copyright (C) 2024 NXP
+ * Author: Frank Li <Frank.Li@nxp.com>
+ */
+
+#ifndef __PCI_EP_MSI__
+#define __PCI_EP_MSI__
+
+struct pci_epf;
+
+#ifdef CONFIG_PCI_ENDPOINT_MSI_DOORBELL
+int pci_epf_alloc_doorbell(struct pci_epf *epf, u16 nums);
+void pci_epf_free_doorbell(struct pci_epf *epf);
+#else
+static inline int pci_epf_alloc_doorbell(struct pci_epf *epf, u16 nums)
+{
+	return -ENODATA;
+}
+
+static inline void pci_epf_free_doorbell(struct pci_epf *epf)
+{
+}
+#endif /* CONFIG_PCI_ENDPOINT_MSI_DOORBELL */
+
+#endif /* __PCI_EP_MSI__ */
diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h
index 749cee0bcf2c..52e07602f08e 100644
--- a/include/linux/pci-epf.h
+++ b/include/linux/pci-epf.h
@@ -12,6 +12,7 @@
 #include <linux/configfs.h>
 #include <linux/device.h>
 #include <linux/mod_devicetable.h>
+#include <linux/msi.h>
 #include <linux/pci.h>
 
 struct pci_epf;
@@ -128,6 +129,16 @@ struct pci_epf_bar {
 	int		flags;
 };
 
+/**
+ * struct pci_epf_doorbell_msg - represents doorbell message
+ * @msg: MSI message
+ * @virq: IRQ number of this doorbell MSI message
+ */
+struct pci_epf_doorbell_msg {
+	struct msi_msg msg;
+	int virq;
+};
+
 /**
  * struct pci_epf - represents the PCI EPF device
  * @dev: the PCI EPF device
@@ -155,6 +166,8 @@ struct pci_epf_bar {
  * @vfunction_num_map: bitmap to manage virtual function number
  * @pci_vepf: list of virtual endpoint functions associated with this function
  * @event_ops: callbacks for capturing the EPC events
+ * @db_msg: data for MSI from RC side
+ * @num_db: number of doorbells
  */
 struct pci_epf {
 	struct device		dev;
@@ -185,6 +198,8 @@ struct pci_epf {
 	unsigned long		vfunction_num_map;
 	struct list_head	pci_vepf;
 	const struct pci_epc_event_ops *event_ops;
+	struct pci_epf_doorbell_msg *db_msg;
+	u16 num_db;
 };
 
 /**
-- 
cgit v1.2.3


From 4ff4252a2355f585c5cad8dc959ff1097300aa47 Mon Sep 17 00:00:00 2001
From: Frank Li <Frank.Li@nxp.com>
Date: Thu, 10 Jul 2025 15:13:51 -0400
Subject: PCI: endpoint: Add pci_epf_align_inbound_addr() helper for inbound
 address alignment

Add pci_epf_align_inbound_addr() to align the inbound addresses according
to PCI BAR alignment requirements. The aligned base address and offset are
returned via 'base' and 'off' parameters.

Signed-off-by: Frank Li <Frank.Li@nxp.com>
[mani: reworded kernel-doc and commit message]
Signed-off-by: Manivannan Sadhasivam <mani@kernel.org>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Tested-by: Niklas Cassel <cassel@kernel.org>
Link: https://patch.msgid.link/20250710-ep-msi-v21-5-57683fc7fb25@nxp.com
---
 drivers/pci/endpoint/pci-epf-core.c | 38 +++++++++++++++++++++++++++++++++++++
 include/linux/pci-epf.h             |  3 +++
 2 files changed, 41 insertions(+)

(limited to 'include')

diff --git a/drivers/pci/endpoint/pci-epf-core.c b/drivers/pci/endpoint/pci-epf-core.c
index 577a9e490115..09b90e1631d5 100644
--- a/drivers/pci/endpoint/pci-epf-core.c
+++ b/drivers/pci/endpoint/pci-epf-core.c
@@ -477,6 +477,44 @@ struct pci_epf *pci_epf_create(const char *name)
 }
 EXPORT_SYMBOL_GPL(pci_epf_create);
 
+/**
+ * pci_epf_align_inbound_addr() - Align the given address based on the BAR
+ *				  alignment requirement
+ * @epf: the EPF device
+ * @addr: inbound address to be aligned
+ * @bar: the BAR number corresponding to the given addr
+ * @base: base address matching the @bar alignment requirement
+ * @off: offset to be added to the @base address
+ *
+ * Helper function to align input @addr based on BAR's alignment requirement.
+ * The aligned base address and offset are returned via @base and @off.
+ *
+ * NOTE: The pci_epf_alloc_space() function already accounts for alignment.
+ * This API is primarily intended for use with other memory regions not
+ * allocated by pci_epf_alloc_space(), such as peripheral register spaces or
+ * the message address of a platform MSI controller.
+ *
+ * Return: 0 on success, errno otherwise.
+ */
+int pci_epf_align_inbound_addr(struct pci_epf *epf, enum pci_barno bar,
+			       u64 addr, dma_addr_t *base, size_t *off)
+{
+	/*
+	 * Most EP controllers require the BAR start address to be aligned to
+	 * the BAR size, because they mask off the lower bits.
+	 *
+	 * Alignment to BAR size also works for controllers that support
+	 * unaligned addresses.
+	 */
+	u64 align = epf->bar[bar].size;
+
+	*base = round_down(addr, align);
+	*off = addr & (align - 1);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pci_epf_align_inbound_addr);
+
 static void pci_epf_dev_release(struct device *dev)
 {
 	struct pci_epf *epf = to_pci_epf(dev);
diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h
index 52e07602f08e..2e85504ba2ba 100644
--- a/include/linux/pci-epf.h
+++ b/include/linux/pci-epf.h
@@ -241,6 +241,9 @@ void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar,
 			  enum pci_epc_interface_type type);
 void pci_epf_free_space(struct pci_epf *epf, void *addr, enum pci_barno bar,
 			enum pci_epc_interface_type type);
+
+int pci_epf_align_inbound_addr(struct pci_epf *epf, enum pci_barno bar,
+			       u64 addr, dma_addr_t *base, size_t *off);
 int pci_epf_bind(struct pci_epf *epf);
 void pci_epf_unbind(struct pci_epf *epf);
 int pci_epf_add_vepf(struct pci_epf *epf_pf, struct pci_epf *epf_vf);
-- 
cgit v1.2.3


From eefb83790a0dda112d1755e4f5e213738d717e76 Mon Sep 17 00:00:00 2001
From: Frank Li <Frank.Li@nxp.com>
Date: Thu, 10 Jul 2025 15:13:53 -0400
Subject: misc: pci_endpoint_test: Add doorbell test case

Add doorbell support with the help of three new registers:
PCI_ENDPOINT_TEST_DB_BAR, PCI_ENDPOINT_TEST_DB_OFFSET, and
PCI_ENDPOINT_TEST_DB_DATA.

The testcase works by triggering the doorbell in the Endpoint: the host
writes the value from the PCI_ENDPOINT_TEST_DB_DATA register to the offset
given by the PCI_ENDPOINT_TEST_DB_OFFSET register within the BAR indicated
by the PCI_ENDPOINT_TEST_DB_BAR register, then waits for the completion
status from the Endpoint.

Signed-off-by: Frank Li <Frank.Li@nxp.com>
[mani: removed one spurious change and reworded the commit message]
Signed-off-by: Manivannan Sadhasivam <mani@kernel.org>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Tested-by: Niklas Cassel <cassel@kernel.org>
Link: https://patch.msgid.link/20250710-ep-msi-v21-7-57683fc7fb25@nxp.com
---
 drivers/misc/pci_endpoint_test.c | 83 ++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/pcitest.h     |  1 +
 2 files changed, 84 insertions(+)

(limited to 'include')

diff --git a/drivers/misc/pci_endpoint_test.c b/drivers/misc/pci_endpoint_test.c
index c4e5e2c977be..1c156a3f845e 100644
--- a/drivers/misc/pci_endpoint_test.c
+++ b/drivers/misc/pci_endpoint_test.c
@@ -37,6 +37,8 @@
 #define COMMAND_READ				BIT(3)
 #define COMMAND_WRITE				BIT(4)
 #define COMMAND_COPY				BIT(5)
+#define COMMAND_ENABLE_DOORBELL			BIT(6)
+#define COMMAND_DISABLE_DOORBELL		BIT(7)
 
 #define PCI_ENDPOINT_TEST_STATUS		0x8
 #define STATUS_READ_SUCCESS			BIT(0)
@@ -48,6 +50,11 @@
 #define STATUS_IRQ_RAISED			BIT(6)
 #define STATUS_SRC_ADDR_INVALID			BIT(7)
 #define STATUS_DST_ADDR_INVALID			BIT(8)
+#define STATUS_DOORBELL_SUCCESS			BIT(9)
+#define STATUS_DOORBELL_ENABLE_SUCCESS		BIT(10)
+#define STATUS_DOORBELL_ENABLE_FAIL		BIT(11)
+#define STATUS_DOORBELL_DISABLE_SUCCESS		BIT(12)
+#define STATUS_DOORBELL_DISABLE_FAIL		BIT(13)
 
 #define PCI_ENDPOINT_TEST_LOWER_SRC_ADDR	0x0c
 #define PCI_ENDPOINT_TEST_UPPER_SRC_ADDR	0x10
@@ -62,6 +69,7 @@
 #define PCI_ENDPOINT_TEST_IRQ_NUMBER		0x28
 
 #define PCI_ENDPOINT_TEST_FLAGS			0x2c
+
 #define FLAG_USE_DMA				BIT(0)
 
 #define PCI_ENDPOINT_TEST_CAPS			0x30
@@ -70,6 +78,10 @@
 #define CAP_MSIX				BIT(2)
 #define CAP_INTX				BIT(3)
 
+#define PCI_ENDPOINT_TEST_DB_BAR		0x34
+#define PCI_ENDPOINT_TEST_DB_OFFSET		0x38
+#define PCI_ENDPOINT_TEST_DB_DATA		0x3c
+
 #define PCI_DEVICE_ID_TI_AM654			0xb00c
 #define PCI_DEVICE_ID_TI_J7200			0xb00f
 #define PCI_DEVICE_ID_TI_AM64			0xb010
@@ -100,6 +112,7 @@ enum pci_barno {
 	BAR_3,
 	BAR_4,
 	BAR_5,
+	NO_BAR = -1,
 };
 
 struct pci_endpoint_test {
@@ -841,6 +854,73 @@ static int pci_endpoint_test_set_irq(struct pci_endpoint_test *test,
 	return 0;
 }
 
+static int pci_endpoint_test_doorbell(struct pci_endpoint_test *test)
+{
+	struct pci_dev *pdev = test->pdev;
+	struct device *dev = &pdev->dev;
+	int irq_type = test->irq_type;
+	enum pci_barno bar;
+	u32 data, status;
+	u32 addr;
+	int left;
+
+	if (irq_type < PCITEST_IRQ_TYPE_INTX ||
+	    irq_type > PCITEST_IRQ_TYPE_MSIX) {
+		dev_err(dev, "Invalid IRQ type\n");
+		return -EINVAL;
+	}
+
+	pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_IRQ_TYPE, irq_type);
+	pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_IRQ_NUMBER, 1);
+	pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_COMMAND,
+				 COMMAND_ENABLE_DOORBELL);
+
+	left = wait_for_completion_timeout(&test->irq_raised, msecs_to_jiffies(1000));
+
+	status = pci_endpoint_test_readl(test, PCI_ENDPOINT_TEST_STATUS);
+	if (!left || (status & STATUS_DOORBELL_ENABLE_FAIL)) {
+		dev_err(dev, "Failed to enable doorbell\n");
+		return -EINVAL;
+	}
+
+	data = pci_endpoint_test_readl(test, PCI_ENDPOINT_TEST_DB_DATA);
+	addr = pci_endpoint_test_readl(test, PCI_ENDPOINT_TEST_DB_OFFSET);
+	bar = pci_endpoint_test_readl(test, PCI_ENDPOINT_TEST_DB_BAR);
+
+	pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_IRQ_TYPE, irq_type);
+	pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_IRQ_NUMBER, 1);
+
+	pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_STATUS, 0);
+
+	bar = pci_endpoint_test_readl(test, PCI_ENDPOINT_TEST_DB_BAR);
+
+	writel(data, test->bar[bar] + addr);
+
+	left = wait_for_completion_timeout(&test->irq_raised, msecs_to_jiffies(1000));
+
+	status = pci_endpoint_test_readl(test, PCI_ENDPOINT_TEST_STATUS);
+
+	if (!left || !(status & STATUS_DOORBELL_SUCCESS))
+		dev_err(dev, "Failed to trigger doorbell in endpoint\n");
+
+	pci_endpoint_test_writel(test, PCI_ENDPOINT_TEST_COMMAND,
+				 COMMAND_DISABLE_DOORBELL);
+
+	wait_for_completion_timeout(&test->irq_raised, msecs_to_jiffies(1000));
+
+	status |= pci_endpoint_test_readl(test, PCI_ENDPOINT_TEST_STATUS);
+
+	if (status & STATUS_DOORBELL_DISABLE_FAIL) {
+		dev_err(dev, "Failed to disable doorbell\n");
+		return -EINVAL;
+	}
+
+	if (!(status & STATUS_DOORBELL_SUCCESS))
+		return -EINVAL;
+
+	return 0;
+}
+
 static long pci_endpoint_test_ioctl(struct file *file, unsigned int cmd,
 				    unsigned long arg)
 {
@@ -891,6 +971,9 @@ static long pci_endpoint_test_ioctl(struct file *file, unsigned int cmd,
 	case PCITEST_CLEAR_IRQ:
 		ret = pci_endpoint_test_clear_irq(test);
 		break;
+	case PCITEST_DOORBELL:
+		ret = pci_endpoint_test_doorbell(test);
+		break;
 	}
 
 ret:
diff --git a/include/uapi/linux/pcitest.h b/include/uapi/linux/pcitest.h
index d3aa8715a525..d6023a45a9d0 100644
--- a/include/uapi/linux/pcitest.h
+++ b/include/uapi/linux/pcitest.h
@@ -21,6 +21,7 @@
 #define PCITEST_SET_IRQTYPE	_IOW('P', 0x8, int)
 #define PCITEST_GET_IRQTYPE	_IO('P', 0x9)
 #define PCITEST_BARS		_IO('P', 0xa)
+#define PCITEST_DOORBELL	_IO('P', 0xb)
 #define PCITEST_CLEAR_IRQ	_IO('P', 0x10)
 
 #define PCITEST_IRQ_TYPE_UNDEFINED	-1
-- 
cgit v1.2.3


From 8245d47cfaba8a38337a447230b4d01f9946f5e1 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@kernel.org>
Date: Wed, 23 Jul 2025 22:50:26 -0700
Subject: x86: Handle KCOV __init vs inline mismatches

GCC appears to have kind of fragile inlining heuristics, in the
sense that it can change whether or not it inlines something based on
optimizations. It looks like the kcov instrumentation being added (or in
this case, removed) from a function changes the optimization results,
and some functions marked "inline" are _not_ inlined. In that case,
we end up with __init code calling a function not marked __init, and we
get the build warnings I'm trying to eliminate in the coming patch that
adds __no_sanitize_coverage to __init functions:

WARNING: modpost: vmlinux: section mismatch in reference: xbc_exit+0x8 (section: .text.unlikely) -> _xbc_exit (section: .init.text)
WARNING: modpost: vmlinux: section mismatch in reference: real_mode_size_needed+0x15 (section: .text.unlikely) -> real_mode_blob_end (section: .init.data)
WARNING: modpost: vmlinux: section mismatch in reference: __set_percpu_decrypted+0x16 (section: .text.unlikely) -> early_set_memory_decrypted (section: .init.text)
WARNING: modpost: vmlinux: section mismatch in reference: memblock_alloc_from+0x26 (section: .text.unlikely) -> memblock_alloc_try_nid (section: .init.text)
WARNING: modpost: vmlinux: section mismatch in reference: acpi_arch_set_root_pointer+0xc (section: .text.unlikely) -> x86_init (section: .init.data)
WARNING: modpost: vmlinux: section mismatch in reference: acpi_arch_get_root_pointer+0x8 (section: .text.unlikely) -> x86_init (section: .init.data)
WARNING: modpost: vmlinux: section mismatch in reference: efi_config_table_is_usable+0x16 (section: .text.unlikely) -> xen_efi_config_table_is_usable (section: .init.text)

This problem is somewhat fragile (though using either __always_inline
or __init will deterministically solve it), but we've tripped over
this before with GCC and the solution has usually been to just use
__always_inline and move on.

For x86 this means forcing several functions to be inline with
__always_inline.

Link: https://lore.kernel.org/r/20250724055029.3623499-2-kees@kernel.org
Signed-off-by: Kees Cook <kees@kernel.org>
---
 arch/x86/include/asm/acpi.h     | 4 ++--
 arch/x86/include/asm/realmode.h | 2 +-
 arch/x86/kernel/kvm.c           | 2 +-
 arch/x86/mm/init_64.c           | 2 +-
 include/linux/acpi.h            | 4 ++--
 include/linux/bootconfig.h      | 2 +-
 include/linux/efi.h             | 2 +-
 include/linux/memblock.h        | 2 +-
 include/linux/smp.h             | 2 +-
 kernel/kexec_handover.c         | 4 ++--
 10 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 5ab1a4598d00..a03aa6f999d1 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -158,13 +158,13 @@ static inline bool acpi_has_cpu_in_madt(void)
 }
 
 #define ACPI_HAVE_ARCH_SET_ROOT_POINTER
-static inline void acpi_arch_set_root_pointer(u64 addr)
+static __always_inline void acpi_arch_set_root_pointer(u64 addr)
 {
 	x86_init.acpi.set_root_pointer(addr);
 }
 
 #define ACPI_HAVE_ARCH_GET_ROOT_POINTER
-static inline u64 acpi_arch_get_root_pointer(void)
+static __always_inline u64 acpi_arch_get_root_pointer(void)
 {
 	return x86_init.acpi.get_root_pointer();
 }
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index f607081a022a..e406a1e92c63 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -78,7 +78,7 @@ extern unsigned char secondary_startup_64[];
 extern unsigned char secondary_startup_64_no_verify[];
 #endif
 
-static inline size_t real_mode_size_needed(void)
+static __always_inline size_t real_mode_size_needed(void)
 {
 	if (real_mode_header)
 		return 0;	/* already allocated. */
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 921c1c783bc1..8ae750cde0c6 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -420,7 +420,7 @@ static u64 kvm_steal_clock(int cpu)
 	return steal;
 }
 
-static inline void __set_percpu_decrypted(void *ptr, unsigned long size)
+static inline __init void __set_percpu_decrypted(void *ptr, unsigned long size)
 {
 	early_set_memory_decrypted((unsigned long) ptr, size);
 }
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ee66fae9ebcc..c02029cd27e1 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -806,7 +806,7 @@ kernel_physical_mapping_change(unsigned long paddr_start,
 }
 
 #ifndef CONFIG_NUMA
-static inline void x86_numa_init(void)
+static __always_inline void x86_numa_init(void)
 {
 	memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0);
 }
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index f102c0fe3431..fc7f5e039074 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -759,13 +759,13 @@ int acpi_arch_timer_mem_init(struct arch_timer_mem *timer_mem, int *timer_count)
 #endif
 
 #ifndef ACPI_HAVE_ARCH_SET_ROOT_POINTER
-static inline void acpi_arch_set_root_pointer(u64 addr)
+static __always_inline void acpi_arch_set_root_pointer(u64 addr)
 {
 }
 #endif
 
 #ifndef ACPI_HAVE_ARCH_GET_ROOT_POINTER
-static inline u64 acpi_arch_get_root_pointer(void)
+static __always_inline u64 acpi_arch_get_root_pointer(void)
 {
 	return 0;
 }
diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h
index 3f4b4ac527ca..25df9260d206 100644
--- a/include/linux/bootconfig.h
+++ b/include/linux/bootconfig.h
@@ -290,7 +290,7 @@ int __init xbc_get_info(int *node_size, size_t *data_size);
 /* XBC cleanup data structures */
 void __init _xbc_exit(bool early);
 
-static inline void xbc_exit(void)
+static __always_inline void xbc_exit(void)
 {
 	_xbc_exit(false);
 }
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 7d63d1d75f22..e3776d9cad07 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -1334,7 +1334,7 @@ struct linux_efi_initrd {
 
 bool xen_efi_config_table_is_usable(const efi_guid_t *guid, unsigned long table);
 
-static inline
+static __always_inline
 bool efi_config_table_is_usable(const efi_guid_t *guid, unsigned long table)
 {
 	if (!IS_ENABLED(CONFIG_XEN_EFI))
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index bb19a2534224..b96746376e17 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -463,7 +463,7 @@ static inline void *memblock_alloc_raw(phys_addr_t size,
 					  NUMA_NO_NODE);
 }
 
-static inline void *memblock_alloc_from(phys_addr_t size,
+static __always_inline void *memblock_alloc_from(phys_addr_t size,
 						phys_addr_t align,
 						phys_addr_t min_addr)
 {
diff --git a/include/linux/smp.h b/include/linux/smp.h
index f1aa0952e8c3..84e948eb1c20 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -221,7 +221,7 @@ static inline void wake_up_all_idle_cpus(void) {  }
 
 #ifdef CONFIG_UP_LATE_INIT
 extern void __init up_late_init(void);
-static inline void smp_init(void) { up_late_init(); }
+static __always_inline void smp_init(void) { up_late_init(); }
 #else
 static inline void smp_init(void) { }
 #endif
diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c
index 69b953551677..f3f6bfe43d47 100644
--- a/kernel/kexec_handover.c
+++ b/kernel/kexec_handover.c
@@ -305,8 +305,8 @@ err_free:
 	return -ENOMEM;
 }
 
-static void deserialize_bitmap(unsigned int order,
-			       struct khoser_mem_bitmap_ptr *elm)
+static void __init deserialize_bitmap(unsigned int order,
+				      struct khoser_mem_bitmap_ptr *elm)
 {
 	struct kho_mem_phys_bits *bitmap = KHOSER_LOAD_PTR(elm->bitmap);
 	unsigned long bit;
-- 
cgit v1.2.3


From 0dec7201788b9152f06321d0dab46eed93834cda Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Mon, 21 Jul 2025 16:15:57 +1000
Subject: sprintf.h requires stdarg.h

In file included from drivers/crypto/intel/qat/qat_common/adf_pm_dbgfs_utils.c:4:
include/linux/sprintf.h:11:54: error: unknown type name 'va_list'
   11 | __printf(2, 0) int vsprintf(char *buf, const char *, va_list);
      |                                                      ^~~~~~~
include/linux/sprintf.h:1:1: note: 'va_list' is defined in header '<stdarg.h>'; this is probably fixable by adding '#include <stdarg.h>'

Link: https://lkml.kernel.org/r/20250721173754.42865913@canb.auug.org.au
Fixes: 39ced19b9e60 ("lib/vsprintf: split out sprintf() and friends")
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Andriy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sprintf.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/sprintf.h b/include/linux/sprintf.h
index 51cab2def9ec..876130091384 100644
--- a/include/linux/sprintf.h
+++ b/include/linux/sprintf.h
@@ -4,6 +4,7 @@
 
 #include <linux/compiler_attributes.h>
 #include <linux/types.h>
+#include <linux/stdarg.h>
 
 int num_to_str(char *buf, int size, unsigned long long num, unsigned int width);
 
-- 
cgit v1.2.3


From 71c52411c51bf4f0869c572294ce8123b26528d5 Mon Sep 17 00:00:00 2001
From: Samiullah Khawaja <skhawaja@google.com>
Date: Wed, 23 Jul 2025 01:30:29 +0000
Subject: net: Create separate gro_flush_normal function

Move multiple copies of the same code snippet doing `gro_flush` and
`gro_normal_list` into a separate helper function.

Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20250723013031.2911384-2-skhawaja@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/gro.h   | 6 ++++++
 kernel/bpf/cpumap.c | 3 +--
 net/core/dev.c      | 9 +++------
 3 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/gro.h b/include/net/gro.h
index 22d3a69e4404..a0fca7ac6e7e 100644
--- a/include/net/gro.h
+++ b/include/net/gro.h
@@ -534,6 +534,12 @@ static inline void gro_normal_list(struct gro_node *gro)
 	gro->rx_count = 0;
 }
 
+static inline void gro_flush_normal(struct gro_node *gro, bool flush_old)
+{
+	gro_flush(gro, flush_old);
+	gro_normal_list(gro);
+}
+
 /* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded,
  * pass the whole batch up to the stack.
  */
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 67e8a2fc1a99..b2b7b8ec2c2a 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -282,8 +282,7 @@ static void cpu_map_gro_flush(struct bpf_cpu_map_entry *rcpu, bool empty)
 	 * This is equivalent to how NAPI decides whether to perform a full
 	 * flush.
 	 */
-	gro_flush(&rcpu->gro, !empty && HZ >= 1000);
-	gro_normal_list(&rcpu->gro);
+	gro_flush_normal(&rcpu->gro, !empty && HZ >= 1000);
 }
 
 static int cpu_map_kthread_run(void *data)
diff --git a/net/core/dev.c b/net/core/dev.c
index 354d3453b407..76384b8a7871 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6578,8 +6578,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
 	 * it, we need to bound somehow the time packets are kept in
 	 * the GRO layer.
 	 */
-	gro_flush(&n->gro, !!timeout);
-	gro_normal_list(&n->gro);
+	gro_flush_normal(&n->gro, !!timeout);
 
 	if (unlikely(!list_empty(&n->poll_list))) {
 		/* If n->poll_list is not empty, we need to mask irqs */
@@ -6649,8 +6648,7 @@ static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
 	}
 
 	/* Flush too old packets. If HZ < 1000, flush all packets */
-	gro_flush(&napi->gro, HZ >= 1000);
-	gro_normal_list(&napi->gro);
+	gro_flush_normal(&napi->gro, HZ >= 1000);
 
 	clear_bit(NAPI_STATE_SCHED, &napi->state);
 }
@@ -7515,8 +7513,7 @@ static int __napi_poll(struct napi_struct *n, bool *repoll)
 	}
 
 	/* Flush too old packets. If HZ < 1000, flush all packets */
-	gro_flush(&n->gro, HZ >= 1000);
-	gro_normal_list(&n->gro);
+	gro_flush_normal(&n->gro, HZ >= 1000);
 
 	/* Some drivers may have called napi_schedule
 	 * prior to exhausting their budget.
-- 
cgit v1.2.3


From 78afdadafe6fe0c74c08fda156e7be0a0b402b90 Mon Sep 17 00:00:00 2001
From: Samiullah Khawaja <skhawaja@google.com>
Date: Wed, 23 Jul 2025 01:30:30 +0000
Subject: net: Use netif_threaded_enable instead of netif_set_threaded in
 drivers

Prepare for adding an enum type for NAPI threaded states by adding
netif_threaded_enable API. De-export the existing netif_set_threaded API
and only use it internally. Update existing drivers to use
netif_threaded_enable instead of the de-exported netif_set_threaded.

Note that dev_set_threaded used by mt76 debugfs file is unchanged.

Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
Link: https://patch.msgid.link/20250723013031.2911384-3-skhawaja@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/atheros/atl1c/atl1c_main.c |  2 +-
 drivers/net/ethernet/mellanox/mlxsw/pci.c       |  2 +-
 drivers/net/ethernet/renesas/ravb_main.c        |  2 +-
 drivers/net/wireguard/device.c                  |  2 +-
 drivers/net/wireless/ath/ath10k/snoc.c          |  2 +-
 include/linux/netdevice.h                       |  2 +-
 net/core/dev.c                                  | 18 +++++++++++++++++-
 net/core/dev.h                                  |  2 ++
 8 files changed, 25 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
index 3a9ad4a9c1cb..7efa3fc257b3 100644
--- a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
+++ b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
@@ -2688,7 +2688,7 @@ static int atl1c_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	adapter->mii.mdio_write = atl1c_mdio_write;
 	adapter->mii.phy_id_mask = 0x1f;
 	adapter->mii.reg_num_mask = MDIO_CTRL_REG_MASK;
-	netif_set_threaded(netdev, true);
+	netif_threaded_enable(netdev);
 	for (i = 0; i < adapter->rx_queue_count; ++i)
 		netif_napi_add(netdev, &adapter->rrd_ring[i].napi,
 			       atl1c_clean_rx);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c
index a2e97b712a3d..8769cba2c746 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/pci.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c
@@ -156,7 +156,7 @@ static int mlxsw_pci_napi_devs_init(struct mlxsw_pci *mlxsw_pci)
 	}
 	strscpy(mlxsw_pci->napi_dev_rx->name, "mlxsw_rx",
 		sizeof(mlxsw_pci->napi_dev_rx->name));
-	netif_set_threaded(mlxsw_pci->napi_dev_rx, true);
+	netif_threaded_enable(mlxsw_pci->napi_dev_rx);
 
 	return 0;
 
diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c
index 4e79bf88688a..94b6fb94f8f1 100644
--- a/drivers/net/ethernet/renesas/ravb_main.c
+++ b/drivers/net/ethernet/renesas/ravb_main.c
@@ -3075,7 +3075,7 @@ static int ravb_probe(struct platform_device *pdev)
 	if (info->coalesce_irqs) {
 		netdev_sw_irq_coalesce_default_on(ndev);
 		if (num_present_cpus() == 1)
-			netif_set_threaded(ndev, true);
+			netif_threaded_enable(ndev);
 	}
 
 	/* Network device register */
diff --git a/drivers/net/wireguard/device.c b/drivers/net/wireguard/device.c
index 5afec5a865f4..813bd10d3dc7 100644
--- a/drivers/net/wireguard/device.c
+++ b/drivers/net/wireguard/device.c
@@ -366,7 +366,7 @@ static int wg_newlink(struct net_device *dev,
 	if (ret < 0)
 		goto err_free_handshake_queue;
 
-	netif_set_threaded(dev, true);
+	netif_threaded_enable(dev);
 	ret = register_netdevice(dev);
 	if (ret < 0)
 		goto err_uninit_ratelimiter;
diff --git a/drivers/net/wireless/ath/ath10k/snoc.c b/drivers/net/wireless/ath/ath10k/snoc.c
index 0ee68d3dad12..f0713bd36173 100644
--- a/drivers/net/wireless/ath/ath10k/snoc.c
+++ b/drivers/net/wireless/ath/ath10k/snoc.c
@@ -936,7 +936,7 @@ static int ath10k_snoc_hif_start(struct ath10k *ar)
 
 	bitmap_clear(ar_snoc->pending_ce_irqs, 0, CE_COUNT_MAX);
 
-	netif_set_threaded(ar->napi_dev, true);
+	netif_threaded_enable(ar->napi_dev);
 	ath10k_core_napi_enable(ar);
 	/* IRQs are left enabled when we restart due to a firmware crash */
 	if (!test_bit(ATH10K_SNOC_FLAG_RECOVERY, &ar_snoc->flags))
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5aee8d3895f4..a97c9a337d6b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -589,7 +589,7 @@ static inline bool napi_complete(struct napi_struct *n)
 	return napi_complete_done(n, 0);
 }
 
-int netif_set_threaded(struct net_device *dev, bool threaded);
+void netif_threaded_enable(struct net_device *dev);
 int dev_set_threaded(struct net_device *dev, bool threaded);
 
 void napi_disable(struct napi_struct *n);
diff --git a/net/core/dev.c b/net/core/dev.c
index 76384b8a7871..f28661d6f5ea 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7029,7 +7029,23 @@ int netif_set_threaded(struct net_device *dev, bool threaded)
 
 	return err;
 }
-EXPORT_SYMBOL(netif_set_threaded);
+
+/**
+ * netif_threaded_enable() - enable threaded NAPIs
+ * @dev: net_device instance
+ *
+ * Enable threaded mode for the NAPI instances of the device. This may be useful
+ * for devices where multiple NAPI instances get scheduled by a single
+ * interrupt. Threaded NAPI allows moving the NAPI processing to cores other
+ * than the core where IRQ is mapped.
+ *
+ * This function should be called before @dev is registered.
+ */
+void netif_threaded_enable(struct net_device *dev)
+{
+	WARN_ON_ONCE(netif_set_threaded(dev, true));
+}
+EXPORT_SYMBOL(netif_threaded_enable);
 
 /**
  * netif_queue_set_napi - Associate queue with the napi
diff --git a/net/core/dev.h b/net/core/dev.h
index a603387fb566..f5b567310908 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -322,6 +322,8 @@ static inline bool napi_get_threaded(struct napi_struct *n)
 
 int napi_set_threaded(struct napi_struct *n, bool threaded);
 
+int netif_set_threaded(struct net_device *dev, bool threaded);
+
 int rps_cpumask_housekeeping(struct cpumask *mask);
 
 #if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL)
-- 
cgit v1.2.3


From 8e7583a4f65f3dbf3e8deb4e60f3679c276bef62 Mon Sep 17 00:00:00 2001
From: Samiullah Khawaja <skhawaja@google.com>
Date: Wed, 23 Jul 2025 01:30:31 +0000
Subject: net: define an enum for the napi threaded state

Instead of using '0' and '1' for napi threaded state use an enum with
'disabled' and 'enabled' states.

Tested:
 ./tools/testing/selftests/net/nl_netdev.py
 TAP version 13
 1..7
 ok 1 nl_netdev.empty_check
 ok 2 nl_netdev.lo_check
 ok 3 nl_netdev.page_pool_check
 ok 4 nl_netdev.napi_list_check
 ok 5 nl_netdev.dev_set_threaded
 ok 6 nl_netdev.napi_set_threaded
 ok 7 nl_netdev.nsim_rxq_reset_down
 # Totals: pass:7 fail:0 xfail:0 xpass:0 skip:0 error:0

Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
Link: https://patch.msgid.link/20250723013031.2911384-4-skhawaja@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/netdev.yaml            | 13 +++++---
 .../networking/net_cachelines/net_device.rst       |  2 +-
 include/linux/netdevice.h                          | 10 +++---
 include/uapi/linux/netdev.h                        |  5 +++
 net/core/dev.c                                     | 12 +++++---
 net/core/dev.h                                     | 13 +++++---
 net/core/dev_api.c                                 |  3 +-
 net/core/netdev-genl-gen.c                         |  2 +-
 net/core/netdev-genl.c                             |  2 +-
 tools/include/uapi/linux/netdev.h                  |  5 +++
 tools/testing/selftests/net/nl_netdev.py           | 36 +++++++++++-----------
 11 files changed, 62 insertions(+), 41 deletions(-)

(limited to 'include')

diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
index 85d0ea6ac426..c035dc0f64fd 100644
--- a/Documentation/netlink/specs/netdev.yaml
+++ b/Documentation/netlink/specs/netdev.yaml
@@ -85,6 +85,10 @@ definitions:
     name: qstats-scope
     type: flags
     entries: [queue]
+  -
+    name: napi-threaded
+    type: enum
+    entries: [disabled, enabled]
 
 attribute-sets:
   -
@@ -286,11 +290,10 @@ attribute-sets:
       -
         name: threaded
         doc: Whether the NAPI is configured to operate in threaded polling
-             mode. If this is set to 1 then the NAPI context operates in
-             threaded polling mode.
-        type: uint
-        checks:
-          max: 1
+             mode. If this is set to enabled then the NAPI context operates
+             in threaded polling mode.
+        type: u32
+        enum: napi-threaded
   -
     name: xsk-info
     attributes: []
diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst
index 2d3dc4692d20..1c19bb7705df 100644
--- a/Documentation/networking/net_cachelines/net_device.rst
+++ b/Documentation/networking/net_cachelines/net_device.rst
@@ -68,6 +68,7 @@ unsigned_char                       addr_assign_type
 unsigned_char                       addr_len
 unsigned_char                       upper_level
 unsigned_char                       lower_level
+u8                                  threaded                                                            napi_poll(napi_enable,netif_set_threaded)
 unsigned_short                      neigh_priv_len
 unsigned_short                      padded
 unsigned_short                      dev_id
@@ -165,7 +166,6 @@ struct sfp_bus*                     sfp_bus
 struct lock_class_key*              qdisc_tx_busylock
 bool                                proto_down
 unsigned:1                          wol_enabled
-unsigned:1                          threaded                                                            napi_poll(napi_enable,netif_set_threaded)
 unsigned_long:1                     see_all_hwtstamp_requests
 unsigned_long:1                     change_proto_down
 unsigned_long:1                     netns_immutable
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a97c9a337d6b..5e5de4b0a433 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -369,7 +369,7 @@ struct napi_config {
 	u64 irq_suspend_timeout;
 	u32 defer_hard_irqs;
 	cpumask_t affinity_mask;
-	bool threaded;
+	u8 threaded;
 	unsigned int napi_id;
 };
 
@@ -590,7 +590,8 @@ static inline bool napi_complete(struct napi_struct *n)
 }
 
 void netif_threaded_enable(struct net_device *dev);
-int dev_set_threaded(struct net_device *dev, bool threaded);
+int dev_set_threaded(struct net_device *dev,
+		     enum netdev_napi_threaded threaded);
 
 void napi_disable(struct napi_struct *n);
 void napi_disable_locked(struct napi_struct *n);
@@ -1872,6 +1873,7 @@ enum netdev_reg_state {
  * 	@addr_len:		Hardware address length
  *	@upper_level:		Maximum depth level of upper devices.
  *	@lower_level:		Maximum depth level of lower devices.
+ *	@threaded:		napi threaded state.
  *	@neigh_priv_len:	Used in neigh_alloc()
  * 	@dev_id:		Used to differentiate devices that share
  * 				the same link layer address
@@ -2011,8 +2013,6 @@ enum netdev_reg_state {
  *			switch driver and used to set the phys state of the
  *			switch port.
  *
- *	@threaded:	napi threaded mode is enabled
- *
  *	@irq_affinity_auto: driver wants the core to store and re-assign the IRQ
  *			    affinity. Set by netif_enable_irq_affinity(), then
  *			    the driver must create a persistent napi by
@@ -2248,6 +2248,7 @@ struct net_device {
 	unsigned char		addr_len;
 	unsigned char		upper_level;
 	unsigned char		lower_level;
+	u8			threaded;
 
 	unsigned short		neigh_priv_len;
 	unsigned short          dev_id;
@@ -2429,7 +2430,6 @@ struct net_device {
 	struct sfp_bus		*sfp_bus;
 	struct lock_class_key	*qdisc_tx_busylock;
 	bool			proto_down;
-	bool			threaded;
 	bool			irq_affinity_auto;
 	bool			rx_cpu_rmap_auto;
 
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index 1f3719a9a0eb..48eb49aa03d4 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -77,6 +77,11 @@ enum netdev_qstats_scope {
 	NETDEV_QSTATS_SCOPE_QUEUE = 1,
 };
 
+enum netdev_napi_threaded {
+	NETDEV_NAPI_THREADED_DISABLED,
+	NETDEV_NAPI_THREADED_ENABLED,
+};
+
 enum {
 	NETDEV_A_DEV_IFINDEX = 1,
 	NETDEV_A_DEV_PAD,
diff --git a/net/core/dev.c b/net/core/dev.c
index f28661d6f5ea..1c6e755841ce 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6963,7 +6963,8 @@ static void napi_stop_kthread(struct napi_struct *napi)
 	napi->thread = NULL;
 }
 
-int napi_set_threaded(struct napi_struct *napi, bool threaded)
+int napi_set_threaded(struct napi_struct *napi,
+		      enum netdev_napi_threaded threaded)
 {
 	if (threaded) {
 		if (!napi->thread) {
@@ -6988,7 +6989,8 @@ int napi_set_threaded(struct napi_struct *napi, bool threaded)
 	return 0;
 }
 
-int netif_set_threaded(struct net_device *dev, bool threaded)
+int netif_set_threaded(struct net_device *dev,
+		       enum netdev_napi_threaded threaded)
 {
 	struct napi_struct *napi;
 	int err = 0;
@@ -7000,7 +7002,7 @@ int netif_set_threaded(struct net_device *dev, bool threaded)
 			if (!napi->thread) {
 				err = napi_kthread_create(napi);
 				if (err) {
-					threaded = false;
+					threaded = NETDEV_NAPI_THREADED_DISABLED;
 					break;
 				}
 			}
@@ -7043,7 +7045,7 @@ int netif_set_threaded(struct net_device *dev, bool threaded)
  */
 void netif_threaded_enable(struct net_device *dev)
 {
-	WARN_ON_ONCE(netif_set_threaded(dev, true));
+	WARN_ON_ONCE(netif_set_threaded(dev, NETDEV_NAPI_THREADED_ENABLED));
 }
 EXPORT_SYMBOL(netif_threaded_enable);
 
@@ -7360,7 +7362,7 @@ void netif_napi_add_weight_locked(struct net_device *dev,
 	 * threaded mode will not be enabled in napi_enable().
 	 */
 	if (dev->threaded && napi_kthread_create(napi))
-		dev->threaded = false;
+		dev->threaded = NETDEV_NAPI_THREADED_DISABLED;
 	netif_napi_set_irq_locked(napi, -1);
 }
 EXPORT_SYMBOL(netif_napi_add_weight_locked);
diff --git a/net/core/dev.h b/net/core/dev.h
index f5b567310908..ab69edc0c3e3 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -315,14 +315,19 @@ static inline void napi_set_irq_suspend_timeout(struct napi_struct *n,
 	WRITE_ONCE(n->irq_suspend_timeout, timeout);
 }
 
-static inline bool napi_get_threaded(struct napi_struct *n)
+static inline enum netdev_napi_threaded napi_get_threaded(struct napi_struct *n)
 {
-	return test_bit(NAPI_STATE_THREADED, &n->state);
+	if (test_bit(NAPI_STATE_THREADED, &n->state))
+		return NETDEV_NAPI_THREADED_ENABLED;
+
+	return NETDEV_NAPI_THREADED_DISABLED;
 }
 
-int napi_set_threaded(struct napi_struct *n, bool threaded);
+int napi_set_threaded(struct napi_struct *n,
+		      enum netdev_napi_threaded threaded);
 
-int netif_set_threaded(struct net_device *dev, bool threaded);
+int netif_set_threaded(struct net_device *dev,
+		       enum netdev_napi_threaded threaded);
 
 int rps_cpumask_housekeeping(struct cpumask *mask);
 
diff --git a/net/core/dev_api.c b/net/core/dev_api.c
index dd7f57013ce5..f28852078aa6 100644
--- a/net/core/dev_api.c
+++ b/net/core/dev_api.c
@@ -368,7 +368,8 @@ void netdev_state_change(struct net_device *dev)
 }
 EXPORT_SYMBOL(netdev_state_change);
 
-int dev_set_threaded(struct net_device *dev, bool threaded)
+int dev_set_threaded(struct net_device *dev,
+		     enum netdev_napi_threaded threaded)
 {
 	int ret;
 
diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c
index 0994bd68a7e6..e9a2a6f26cb7 100644
--- a/net/core/netdev-genl-gen.c
+++ b/net/core/netdev-genl-gen.c
@@ -97,7 +97,7 @@ static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_THREADED
 	[NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range),
 	[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, },
 	[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, },
-	[NETDEV_A_NAPI_THREADED] = NLA_POLICY_MAX(NLA_UINT, 1),
+	[NETDEV_A_NAPI_THREADED] = NLA_POLICY_MAX(NLA_U32, 1),
 };
 
 /* NETDEV_CMD_BIND_TX - do */
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index 5875df372415..6314eb7bdf69 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -333,7 +333,7 @@ netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info)
 		int ret;
 
 		threaded = nla_get_uint(info->attrs[NETDEV_A_NAPI_THREADED]);
-		ret = napi_set_threaded(napi, !!threaded);
+		ret = napi_set_threaded(napi, threaded);
 		if (ret)
 			return ret;
 	}
diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
index 1f3719a9a0eb..48eb49aa03d4 100644
--- a/tools/include/uapi/linux/netdev.h
+++ b/tools/include/uapi/linux/netdev.h
@@ -77,6 +77,11 @@ enum netdev_qstats_scope {
 	NETDEV_QSTATS_SCOPE_QUEUE = 1,
 };
 
+enum netdev_napi_threaded {
+	NETDEV_NAPI_THREADED_DISABLED,
+	NETDEV_NAPI_THREADED_ENABLED,
+};
+
 enum {
 	NETDEV_A_DEV_IFINDEX = 1,
 	NETDEV_A_DEV_PAD,
diff --git a/tools/testing/selftests/net/nl_netdev.py b/tools/testing/selftests/net/nl_netdev.py
index c8ffade79a52..5c66421ab8aa 100755
--- a/tools/testing/selftests/net/nl_netdev.py
+++ b/tools/testing/selftests/net/nl_netdev.py
@@ -52,14 +52,14 @@ def napi_set_threaded(nf) -> None:
         napi1_id = napis[1]['id']
 
         # set napi threaded and verify
-        nf.napi_set({'id': napi0_id, 'threaded': 1})
+        nf.napi_set({'id': napi0_id, 'threaded': "enabled"})
         napi0 = nf.napi_get({'id': napi0_id})
-        ksft_eq(napi0['threaded'], 1)
+        ksft_eq(napi0['threaded'], "enabled")
         ksft_ne(napi0.get('pid'), None)
 
         # check it is not set for napi1
         napi1 = nf.napi_get({'id': napi1_id})
-        ksft_eq(napi1['threaded'], 0)
+        ksft_eq(napi1['threaded'], "disabled")
         ksft_eq(napi1.get('pid'), None)
 
         ip(f"link set dev {nsim.ifname} down")
@@ -67,18 +67,18 @@ def napi_set_threaded(nf) -> None:
 
         # verify if napi threaded is still set
         napi0 = nf.napi_get({'id': napi0_id})
-        ksft_eq(napi0['threaded'], 1)
+        ksft_eq(napi0['threaded'], "enabled")
         ksft_ne(napi0.get('pid'), None)
 
         # check it is still not set for napi1
         napi1 = nf.napi_get({'id': napi1_id})
-        ksft_eq(napi1['threaded'], 0)
+        ksft_eq(napi1['threaded'], "disabled")
         ksft_eq(napi1.get('pid'), None)
 
         # unset napi threaded and verify
-        nf.napi_set({'id': napi0_id, 'threaded': 0})
+        nf.napi_set({'id': napi0_id, 'threaded': "disabled"})
         napi0 = nf.napi_get({'id': napi0_id})
-        ksft_eq(napi0['threaded'], 0)
+        ksft_eq(napi0['threaded'], "disabled")
         ksft_eq(napi0.get('pid'), None)
 
         # set threaded at device level
@@ -86,10 +86,10 @@ def napi_set_threaded(nf) -> None:
 
         # check napi threaded is set for both napis
         napi0 = nf.napi_get({'id': napi0_id})
-        ksft_eq(napi0['threaded'], 1)
+        ksft_eq(napi0['threaded'], "enabled")
         ksft_ne(napi0.get('pid'), None)
         napi1 = nf.napi_get({'id': napi1_id})
-        ksft_eq(napi1['threaded'], 1)
+        ksft_eq(napi1['threaded'], "enabled")
         ksft_ne(napi1.get('pid'), None)
 
         # unset threaded at device level
@@ -97,16 +97,16 @@ def napi_set_threaded(nf) -> None:
 
         # check napi threaded is unset for both napis
         napi0 = nf.napi_get({'id': napi0_id})
-        ksft_eq(napi0['threaded'], 0)
+        ksft_eq(napi0['threaded'], "disabled")
         ksft_eq(napi0.get('pid'), None)
         napi1 = nf.napi_get({'id': napi1_id})
-        ksft_eq(napi1['threaded'], 0)
+        ksft_eq(napi1['threaded'], "disabled")
         ksft_eq(napi1.get('pid'), None)
 
         # set napi threaded for napi0
         nf.napi_set({'id': napi0_id, 'threaded': 1})
         napi0 = nf.napi_get({'id': napi0_id})
-        ksft_eq(napi0['threaded'], 1)
+        ksft_eq(napi0['threaded'], "enabled")
         ksft_ne(napi0.get('pid'), None)
 
         # unset threaded at device level
@@ -114,10 +114,10 @@ def napi_set_threaded(nf) -> None:
 
         # check napi threaded is unset for both napis
         napi0 = nf.napi_get({'id': napi0_id})
-        ksft_eq(napi0['threaded'], 0)
+        ksft_eq(napi0['threaded'], "disabled")
         ksft_eq(napi0.get('pid'), None)
         napi1 = nf.napi_get({'id': napi1_id})
-        ksft_eq(napi1['threaded'], 0)
+        ksft_eq(napi1['threaded'], "disabled")
         ksft_eq(napi1.get('pid'), None)
 
 def dev_set_threaded(nf) -> None:
@@ -141,10 +141,10 @@ def dev_set_threaded(nf) -> None:
 
         # check napi threaded is set for both napis
         napi0 = nf.napi_get({'id': napi0_id})
-        ksft_eq(napi0['threaded'], 1)
+        ksft_eq(napi0['threaded'], "enabled")
         ksft_ne(napi0.get('pid'), None)
         napi1 = nf.napi_get({'id': napi1_id})
-        ksft_eq(napi1['threaded'], 1)
+        ksft_eq(napi1['threaded'], "enabled")
         ksft_ne(napi1.get('pid'), None)
 
         # unset threaded
@@ -152,10 +152,10 @@ def dev_set_threaded(nf) -> None:
 
         # check napi threaded is unset for both napis
         napi0 = nf.napi_get({'id': napi0_id})
-        ksft_eq(napi0['threaded'], 0)
+        ksft_eq(napi0['threaded'], "disabled")
         ksft_eq(napi0.get('pid'), None)
         napi1 = nf.napi_get({'id': napi1_id})
-        ksft_eq(napi1['threaded'], 0)
+        ksft_eq(napi1['threaded'], "disabled")
         ksft_eq(napi1.get('pid'), None)
 
 def nsim_rxq_reset_down(nf) -> None:
-- 
cgit v1.2.3


From 3865301dc58aec2ab77651bdbf1e55352f50a608 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Sun, 13 Jul 2025 12:57:18 -0700
Subject: mm: optimize lru_note_cost() by adding lru_note_cost_unlock_irq()

Dropping a lock, just to demand it again for an afterthought, cannot be
good if contended: convert lru_note_cost() to lru_note_cost_unlock_irq().

[hughd@google.com: delete unneeded comment]
  Link: https://lkml.kernel.org/r/dbf9352a-1ed9-a021-c0c7-9309ac73e174@google.com
Link: https://lkml.kernel.org/r/21100102-51b6-79d5-03db-1bb7f97fa94c@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
Tested-by: Roman Gushchin <roman.gushchin@linux.dev>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Cc: David Hildenbrand <david@redhat.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/swap.h |  5 +++--
 mm/swap.c            | 33 +++++++++++++++++++--------------
 mm/vmscan.c          |  8 +++-----
 3 files changed, 25 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 95c6061fa1dc..2fe6ed2cc3fd 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -376,8 +376,9 @@ extern unsigned long totalreserve_pages;
 
 
 /* linux/mm/swap.c */
-void lru_note_cost(struct lruvec *lruvec, bool file,
-		   unsigned int nr_io, unsigned int nr_rotated);
+void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file,
+		unsigned int nr_io, unsigned int nr_rotated)
+		__releases(lruvec->lru_lock);
 void lru_note_cost_refault(struct folio *);
 void folio_add_lru(struct folio *);
 void folio_add_lru_vma(struct folio *, struct vm_area_struct *);
diff --git a/mm/swap.c b/mm/swap.c
index 4fc322f7111a..3632dd061beb 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -237,8 +237,9 @@ void folio_rotate_reclaimable(struct folio *folio)
 	folio_batch_add_and_move(folio, lru_move_tail, true);
 }
 
-void lru_note_cost(struct lruvec *lruvec, bool file,
-		   unsigned int nr_io, unsigned int nr_rotated)
+void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file,
+		unsigned int nr_io, unsigned int nr_rotated)
+		__releases(lruvec->lru_lock)
 {
 	unsigned long cost;
 
@@ -250,18 +251,14 @@ void lru_note_cost(struct lruvec *lruvec, bool file,
 	 * different between them, adjust scan balance for CPU work.
 	 */
 	cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated;
+	if (!cost) {
+		spin_unlock_irq(&lruvec->lru_lock);
+		return;
+	}
 
-	do {
+	for (;;) {
 		unsigned long lrusize;
 
-		/*
-		 * Hold lruvec->lru_lock is safe here, since
-		 * 1) The pinned lruvec in reclaim, or
-		 * 2) From a pre-LRU page during refault (which also holds the
-		 *    rcu lock, so would be safe even if the page was on the LRU
-		 *    and could move simultaneously to a new lruvec).
-		 */
-		spin_lock_irq(&lruvec->lru_lock);
 		/* Record cost event */
 		if (file)
 			lruvec->file_cost += cost;
@@ -285,14 +282,22 @@ void lru_note_cost(struct lruvec *lruvec, bool file,
 			lruvec->file_cost /= 2;
 			lruvec->anon_cost /= 2;
 		}
+
 		spin_unlock_irq(&lruvec->lru_lock);
-	} while ((lruvec = parent_lruvec(lruvec)));
+		lruvec = parent_lruvec(lruvec);
+		if (!lruvec)
+			break;
+		spin_lock_irq(&lruvec->lru_lock);
+	}
 }
 
 void lru_note_cost_refault(struct folio *folio)
 {
-	lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio),
-		      folio_nr_pages(folio), 0);
+	struct lruvec *lruvec;
+
+	lruvec = folio_lruvec_lock_irq(folio);
+	lru_note_cost_unlock_irq(lruvec, folio_is_file_lru(folio),
+				folio_nr_pages(folio), 0);
 }
 
 static void lru_activate(struct lruvec *lruvec, struct folio *folio)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0d6b8e1e95e5..d71aeabdc725 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2053,9 +2053,9 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
 		__count_vm_events(item, nr_reclaimed);
 	count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
 	__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
-	spin_unlock_irq(&lruvec->lru_lock);
 
-	lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed);
+	lru_note_cost_unlock_irq(lruvec, file, stat.nr_pageout,
+					nr_scanned - nr_reclaimed);
 
 	/*
 	 * If dirty folios are scanned that are not queued for IO, it
@@ -2201,10 +2201,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
 	count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
 
 	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
-	spin_unlock_irq(&lruvec->lru_lock);
 
-	if (nr_rotated)
-		lru_note_cost(lruvec, file, 0, nr_rotated);
+	lru_note_cost_unlock_irq(lruvec, file, 0, nr_rotated);
 	trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
 			nr_deactivate, nr_rotated, sc->priority, file);
 }
-- 
cgit v1.2.3


From f9f11398d4dac3c85507f31192e318b20b19af61 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Thu, 17 Jul 2025 17:55:55 +0100
Subject: mm/mremap: use an explicit uffd failure path for mremap

Right now it appears that the code is relying upon the returned
destination address having bits outside PAGE_MASK to indicate whether an
error value is specified, and decrementing the increased refcount on the
uffd ctx if so.

This is not a safe means of determining an error value, so instead, be
specific.  It makes far more sense to do so in a dedicated error path, so
add mremap_userfaultfd_fail() for this purpose and use this when an error
arises.

A vm_userfaultfd_ctx is not established until we are at the point where
mremap_userfaultfd_prep() is invoked in copy_vma_and_data(), so this is a
no-op until this happens.

That is - uffd remap notification only occurs if the VMA is actually moved
- at which point a UFFD_EVENT_REMAP event is raised.

No errors can occur after this point currently, though it's certainly not
guaranteed this will always remain the case, and we mustn't rely on this.

However, the reason for needing to handle this case is that, when an error
arises on a VMA move at the point of adjusting page tables, we revert this
operation, and propagate the error.

At this point, it is not correct to raise a uffd remap event, and we must
handle it.

This refactoring makes it abundantly clear what we are doing.

We assume vrm->new_addr is always valid, which a prior change made the
case even for mremap() invocations which don't move the VMA, however given
no uffd context would be set up in this case it's immaterial to this
change anyway.

No functional change intended.

Link: https://lkml.kernel.org/r/a70e8a1f7bce9f43d1431065b414e0f212297297.1752770784.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rik van Riel <riel@surriel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/userfaultfd.c              | 15 ++++++++++-----
 include/linux/userfaultfd_k.h |  5 +++++
 mm/mremap.c                   | 16 ++++++++++++----
 3 files changed, 27 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 2a644aa1a510..54c6cc7fe9c6 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -750,11 +750,6 @@ void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
 	if (!ctx)
 		return;
 
-	if (to & ~PAGE_MASK) {
-		userfaultfd_ctx_put(ctx);
-		return;
-	}
-
 	msg_init(&ewq.msg);
 
 	ewq.msg.event = UFFD_EVENT_REMAP;
@@ -765,6 +760,16 @@ void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
 	userfaultfd_event_wait_completion(ctx, &ewq);
 }
 
+void mremap_userfaultfd_fail(struct vm_userfaultfd_ctx *vm_ctx)
+{
+	struct userfaultfd_ctx *ctx = vm_ctx->ctx;
+
+	if (!ctx)
+		return;
+
+	userfaultfd_ctx_put(ctx);
+}
+
 bool userfaultfd_remove(struct vm_area_struct *vma,
 			unsigned long start, unsigned long end)
 {
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index df85330bcfa6..c0e716aec26a 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -259,6 +259,7 @@ extern void mremap_userfaultfd_prep(struct vm_area_struct *,
 extern void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *,
 					unsigned long from, unsigned long to,
 					unsigned long len);
+void mremap_userfaultfd_fail(struct vm_userfaultfd_ctx *);
 
 extern bool userfaultfd_remove(struct vm_area_struct *vma,
 			       unsigned long start,
@@ -371,6 +372,10 @@ static inline void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *ctx,
 {
 }
 
+static inline void mremap_userfaultfd_fail(struct vm_userfaultfd_ctx *ctx)
+{
+}
+
 static inline bool userfaultfd_remove(struct vm_area_struct *vma,
 				      unsigned long start,
 				      unsigned long end)
diff --git a/mm/mremap.c b/mm/mremap.c
index 53447761e55d..db7e773d0884 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -1729,12 +1729,17 @@ static int check_prep_vma(struct vma_remap_struct *vrm)
 	return 0;
 }
 
-static void notify_uffd(struct vma_remap_struct *vrm, unsigned long to)
+static void notify_uffd(struct vma_remap_struct *vrm, bool failed)
 {
 	struct mm_struct *mm = current->mm;
 
+	/* Regardless of success/failure, we always notify of any unmaps. */
 	userfaultfd_unmap_complete(mm, vrm->uf_unmap_early);
-	mremap_userfaultfd_complete(vrm->uf, vrm->addr, to, vrm->old_len);
+	if (failed)
+		mremap_userfaultfd_fail(vrm->uf);
+	else
+		mremap_userfaultfd_complete(vrm->uf, vrm->addr,
+			vrm->new_addr, vrm->old_len);
 	userfaultfd_unmap_complete(mm, vrm->uf_unmap);
 }
 
@@ -1742,6 +1747,7 @@ static unsigned long do_mremap(struct vma_remap_struct *vrm)
 {
 	struct mm_struct *mm = current->mm;
 	unsigned long res;
+	bool failed;
 
 	vrm->old_len = PAGE_ALIGN(vrm->old_len);
 	vrm->new_len = PAGE_ALIGN(vrm->new_len);
@@ -1763,13 +1769,15 @@ static unsigned long do_mremap(struct vma_remap_struct *vrm)
 	res = vrm_implies_new_addr(vrm) ? mremap_to(vrm) : mremap_at(vrm);
 
 out:
+	failed = IS_ERR_VALUE(res);
+
 	if (vrm->mmap_locked)
 		mmap_write_unlock(mm);
 
-	if (!IS_ERR_VALUE(res) && vrm->mlocked && vrm->new_len > vrm->old_len)
+	if (!failed && vrm->mlocked && vrm->new_len > vrm->old_len)
 		mm_populate(vrm->new_addr + vrm->old_len, vrm->delta);
 
-	notify_uffd(vrm, res);
+	notify_uffd(vrm, failed);
 	return res;
 }
 
-- 
cgit v1.2.3


From 441413d2a99d1d23bea2df2497493024b00ace57 Mon Sep 17 00:00:00 2001
From: Anthony Yznaga <anthony.yznaga@oracle.com>
Date: Tue, 15 Jul 2025 18:26:11 -0700
Subject: mm: drop hugetlb_free_pgd_range()

There are no longer any callers of hugetlb_free_pgd_range().

Link: https://lkml.kernel.org/r/20250716012611.10369-4-anthony.yznaga@oracle.com
Signed-off-by: Anthony Yznaga <anthony.yznaga@oracle.com>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Acked-by: Oscar Salvador <osalvador@suse.de>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alexandre Ghiti <alexghiti@rivosinc.com>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: David Hildenbrand <david@redhat.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/asm-generic/hugetlb.h | 9 ---------
 include/linux/hugetlb.h       | 7 -------
 2 files changed, 16 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h
index 4bce4f07f44f..dcb8727f2b82 100644
--- a/include/asm-generic/hugetlb.h
+++ b/include/asm-generic/hugetlb.h
@@ -66,15 +66,6 @@ static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
 }
 #endif
 
-#ifndef __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE
-static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
-		unsigned long addr, unsigned long end,
-		unsigned long floor, unsigned long ceiling)
-{
-	free_pgd_range(tlb, addr, end, floor, ceiling);
-}
-#endif
-
 #ifndef __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
 static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 		pte_t *ptep, pte_t pte, unsigned long sz)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 474de8e2a8f2..526d27e88b3b 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -390,13 +390,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
 	return 0;
 }
 
-static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
-				unsigned long addr, unsigned long end,
-				unsigned long floor, unsigned long ceiling)
-{
-	BUG();
-}
-
 #ifdef CONFIG_USERFAULTFD
 static inline int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
 					   struct vm_area_struct *dst_vma,
-- 
cgit v1.2.3


From 378bdb97405a00bf03d8d993435c83add1688e36 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@google.com>
Date: Thu, 17 Jul 2025 19:46:43 +0000
Subject: memcg: convert memcg->socket_pressure to u64

memcg->socket_pressure is initialised with jiffies when the memcg is
created.

Once vmpressure detects that the cgroup is under memory pressure, the
field is updated with jiffies + HZ to signal the fact to the socket layer
and suppress memory allocation for one second.

Otherwise, the field is not updated.

mem_cgroup_under_socket_pressure() uses time_before() to check if jiffies
is less than memcg->socket_pressure, and this has a bug on 32-bit kernel.

  if (time_before(jiffies, memcg->socket_pressure))
          return true;

As time_before() casts the final result to long, the acceptable delta
between two timestamps is 2 ^ (BITS_PER_LONG - 1).

On 32-bit kernel with CONFIG_HZ=1000, this is about 24 days.

  >>> (2 ** 31) / 1000 / 60 / 60 / 24
  24.855134814814818

Once 24 days have passed since the last update of socket_pressure,
mem_cgroup_under_socket_pressure() starts to lie until the next 24 days
pass.

We don't need to worry about this on 64-bit machines unless they serve for
300 million years.

  >>> (2 ** 63) / 1000 / 60 / 60 / 24 / 365
  292471208.6775361

Let's convert memcg->socket_pressure to u64.

Performance testing:

I don't have a real 32-bit machine so this is a result on QEMU, but
with/without the u64 jiffies patch, the time spent in
mem_cgroup_under_socket_pressure() was 1~5us and I didn't see any
measurable delta.

no patch applied:
iperf3   273 [000]   137.296248:
probe:mem_cgroup_under_socket_pressure: (c13660d0)
                c13660d1 mem_cgroup_under_socket_pressure+0x1
([kernel.kallsyms])
iperf3   273 [000]   137.296249:
probe:mem_cgroup_under_socket_pressure__return: (c13660d0 <- c1d8fd7f)
iperf3   273 [000]   137.296251:
probe:mem_cgroup_under_socket_pressure: (c13660d0)
                c13660d1 mem_cgroup_under_socket_pressure+0x1
([kernel.kallsyms])
iperf3   273 [000]   137.296253:
probe:mem_cgroup_under_socket_pressure__return: (c13660d0 <- c1d8fd7f)


u64 jiffies patch applied:
iperf3   308 [001]   330.669370:
probe:mem_cgroup_under_socket_pressure: (c12ddba0)
                c12ddba1 mem_cgroup_under_socket_pressure+0x1
([kernel.kallsyms])
iperf3   308 [001]   330.669371:
probe:mem_cgroup_under_socket_pressure__return: (c12ddba0 <- c1ce98bf)
iperf3   308 [001]   330.669382:
probe:mem_cgroup_under_socket_pressure: (c12ddba0)
                c12ddba1 mem_cgroup_under_socket_pressure+0x1
([kernel.kallsyms])
iperf3   308 [001]   330.669384:
probe:mem_cgroup_under_socket_pressure__return: (c12ddba0 <- c1ce98bf)

So the u64 approach is good enough.

Link: https://lkml.kernel.org/r/20250717194645.1096500-1-kuniyu@google.com
Fixes: 8e8ae645249b ("mm: memcontrol: hook up vmpressure to socket pressure")
Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reported-by: Neal Cardwell <ncardwell@google.com>
Suggested-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: David S. Miller <davem@davemloft.net>
Cc: Eric Dumazet <ncardwell@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 44 +++++++++++++++++++++++++++++++++++++++++---
 mm/memcontrol.c            |  5 ++++-
 mm/vmpressure.c            |  2 +-
 3 files changed, 46 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 87b6688f124a..785173aa0739 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -251,8 +251,10 @@ struct mem_cgroup {
 	 * that this indicator should NOT be used in legacy cgroup mode
 	 * where socket memory is accounted/charged separately.
 	 */
-	unsigned long		socket_pressure;
-
+	u64			socket_pressure;
+#if BITS_PER_LONG < 64
+	seqlock_t		socket_pressure_seqlock;
+#endif
 	int kmemcg_id;
 	/*
 	 * memcg->objcg is wiped out as a part of the objcg repaprenting
@@ -1602,6 +1604,42 @@ extern struct static_key_false memcg_sockets_enabled_key;
 #define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key)
 void mem_cgroup_sk_alloc(struct sock *sk);
 void mem_cgroup_sk_free(struct sock *sk);
+
+#if BITS_PER_LONG < 64
+static inline void mem_cgroup_set_socket_pressure(struct mem_cgroup *memcg)
+{
+	u64 val = get_jiffies_64() + HZ;
+	unsigned long flags;
+
+	write_seqlock_irqsave(&memcg->socket_pressure_seqlock, flags);
+	memcg->socket_pressure = val;
+	write_sequnlock_irqrestore(&memcg->socket_pressure_seqlock, flags);
+}
+
+static inline u64 mem_cgroup_get_socket_pressure(struct mem_cgroup *memcg)
+{
+	unsigned int seq;
+	u64 val;
+
+	do {
+		seq = read_seqbegin(&memcg->socket_pressure_seqlock);
+		val = memcg->socket_pressure;
+	} while (read_seqretry(&memcg->socket_pressure_seqlock, seq));
+
+	return val;
+}
+#else
+static inline void mem_cgroup_set_socket_pressure(struct mem_cgroup *memcg)
+{
+	WRITE_ONCE(memcg->socket_pressure, jiffies + HZ);
+}
+
+static inline u64 mem_cgroup_get_socket_pressure(struct mem_cgroup *memcg)
+{
+	return READ_ONCE(memcg->socket_pressure);
+}
+#endif
+
 static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
 {
 #ifdef CONFIG_MEMCG_V1
@@ -1609,7 +1647,7 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
 		return !!memcg->tcpmem_pressure;
 #endif /* CONFIG_MEMCG_V1 */
 	do {
-		if (time_before(jiffies, READ_ONCE(memcg->socket_pressure)))
+		if (time_before64(get_jiffies_64(), mem_cgroup_get_socket_pressure(memcg)))
 			return true;
 	} while ((memcg = parent_mem_cgroup(memcg)));
 	return false;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 235c66d2161b..de7d737fe011 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3754,7 +3754,10 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
 	INIT_LIST_HEAD(&memcg->memory_peaks);
 	INIT_LIST_HEAD(&memcg->swap_peaks);
 	spin_lock_init(&memcg->peaks_lock);
-	memcg->socket_pressure = jiffies;
+	memcg->socket_pressure = get_jiffies_64();
+#if BITS_PER_LONG < 64
+	seqlock_init(&memcg->socket_pressure_seqlock);
+#endif
 	memcg1_memcg_init(memcg);
 	memcg->kmemcg_id = -1;
 	INIT_LIST_HEAD(&memcg->objcg_list);
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index bd5183dfd879..c197ed47bcc4 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -316,7 +316,7 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
 			 * asserted for a second in which subsequent
 			 * pressure events can occur.
 			 */
-			WRITE_ONCE(memcg->socket_pressure, jiffies + HZ);
+			mem_cgroup_set_socket_pressure(memcg);
 		}
 	}
 }
-- 
cgit v1.2.3


From 92c99fc614737eaa99125283c306d2ebb13b101a Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Mon, 14 Jul 2025 09:16:51 -0400
Subject: mm/memory: introduce is_huge_zero_pfn() and use it in
 vm_normal_page_pmd()

Patch series "mm: introduce snapshot_page()", v3.

This series introduces snapshot_page(), a helper function that can be used
to create a snapshot of a struct page and its associated struct folio.

This function is intended to help callers with a consistent view of a
folio while reducing the chance of encountering partially updated or
inconsistent state, such as during folio splitting which could lead to
crashes and BUG_ON()s being triggered.


This patch (of 4):

Let's avoid working with the PMD when not required.  If
vm_normal_page_pmd() would be called on something that is not a present
pmd, it would already be a bug (pfn possibly garbage).

While at it, let's support passing in any pfn covered by the huge zero
folio by masking off PFN bits -- which should be rather cheap.

Link: https://lkml.kernel.org/r/cover.1752499009.git.luizcap@redhat.com
Link: https://lkml.kernel.org/r/4940826e99f0c709a7cf7beb94f53288320aea5a.1752499009.git.luizcap@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Signed-off-by: Luiz Capitulino <luizcap@redhat.com>
Reviewed-by: Shivank Garg <shivankg@amd.com>
Tested-by: Harry Yoo <harry.yoo@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h | 12 +++++++++++-
 mm/memory.c             |  2 +-
 2 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 4d5bb67dc4ec..7748489fde1b 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -482,9 +482,14 @@ static inline bool is_huge_zero_folio(const struct folio *folio)
 	return READ_ONCE(huge_zero_folio) == folio;
 }
 
+static inline bool is_huge_zero_pfn(unsigned long pfn)
+{
+	return READ_ONCE(huge_zero_pfn) == (pfn & ~(HPAGE_PMD_NR - 1));
+}
+
 static inline bool is_huge_zero_pmd(pmd_t pmd)
 {
-	return pmd_present(pmd) && READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd);
+	return pmd_present(pmd) && is_huge_zero_pfn(pmd_pfn(pmd));
 }
 
 struct folio *mm_get_huge_zero_folio(struct mm_struct *mm);
@@ -632,6 +637,11 @@ static inline bool is_huge_zero_folio(const struct folio *folio)
 	return false;
 }
 
+static inline bool is_huge_zero_pfn(unsigned long pfn)
+{
+	return false;
+}
+
 static inline bool is_huge_zero_pmd(pmd_t pmd)
 {
 	return false;
diff --git a/mm/memory.c b/mm/memory.c
index b4fb559dd0c6..92fd18a5d8d1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -668,7 +668,7 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
 		}
 	}
 
-	if (is_huge_zero_pmd(pmd))
+	if (is_huge_zero_pfn(pfn))
 		return NULL;
 	if (unlikely(pfn > highest_memmap_pfn))
 		return NULL;
-- 
cgit v1.2.3


From d863a12108f24c5f0b49f99f328e33371bd7c69d Mon Sep 17 00:00:00 2001
From: Luiz Capitulino <luizcap@redhat.com>
Date: Mon, 14 Jul 2025 09:16:52 -0400
Subject: mm/util: introduce snapshot_page()

This commit refactors __dump_page() into snapshot_page().

snapshot_page() tries to take a faithful snapshot of a page and its folio
representation.  The snapshot is returned in the struct page_snapshot
parameter along with additional flags that are best retrieved at snapshot
creation time to reduce race windows.

This function is intended to be used by callers that need a stable
representation of a struct page and struct folio so that pointers or page
information doesn't change while working on a page.

The idea and original implementation of snapshot_page() comes from Matthew
Wilcox with suggestions for improvements from David Hildenbrand.  All bugs
and misconceptions are mine.

[luizcap@redhat.com: fix set_ps_flags() commentary]
  Link: https://lkml.kernel.org/r/d5c75701-b353-4536-a306-187fab0655b3@redhat.com
Link: https://lkml.kernel.org/r/637a03a05cb2e3df88f84ff9e9f9642374ef813a.1752499009.git.luizcap@redhat.com
Signed-off-by: Luiz Capitulino <luizcap@redhat.com>
Reviewed-by: Shivank Garg <shivankg@amd.com>
Tested-by: Harry Yoo <harry.yoo@oracle.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 19 +++++++++++++
 mm/debug.c         | 42 +++-------------------------
 mm/util.c          | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 104 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 805108d7bbc3..8e3a4c5b78ff 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4199,4 +4199,23 @@ static inline bool page_pool_page_is_pp(struct page *page)
 }
 #endif
 
+#define PAGE_SNAPSHOT_FAITHFUL (1 << 0)
+#define PAGE_SNAPSHOT_PG_BUDDY (1 << 1)
+#define PAGE_SNAPSHOT_PG_IDLE  (1 << 2)
+
+struct page_snapshot {
+	struct folio folio_snapshot;
+	struct page page_snapshot;
+	unsigned long pfn;
+	unsigned long idx;
+	unsigned long flags;
+};
+
+static inline bool snapshot_page_is_faithful(const struct page_snapshot *ps)
+{
+	return ps->flags & PAGE_SNAPSHOT_FAITHFUL;
+}
+
+void snapshot_page(struct page_snapshot *ps, const struct page *page);
+
 #endif /* _LINUX_MM_H */
diff --git a/mm/debug.c b/mm/debug.c
index e2973e1b3812..b4388f4dcd4d 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -129,47 +129,13 @@ static void __dump_folio(struct folio *folio, struct page *page,
 
 static void __dump_page(const struct page *page)
 {
-	struct folio *foliop, folio;
-	struct page precise;
-	unsigned long head;
-	unsigned long pfn = page_to_pfn(page);
-	unsigned long idx, nr_pages = 1;
-	int loops = 5;
-
-again:
-	memcpy(&precise, page, sizeof(*page));
-	head = precise.compound_head;
-	if ((head & 1) == 0) {
-		foliop = (struct folio *)&precise;
-		idx = 0;
-		if (!folio_test_large(foliop))
-			goto dump;
-		foliop = (struct folio *)page;
-	} else {
-		foliop = (struct folio *)(head - 1);
-		idx = folio_page_idx(foliop, page);
-	}
+	struct page_snapshot ps;
 
-	if (idx < MAX_FOLIO_NR_PAGES) {
-		memcpy(&folio, foliop, 2 * sizeof(struct page));
-		nr_pages = folio_nr_pages(&folio);
-		if (nr_pages > 1)
-			memcpy(&folio.__page_2, &foliop->__page_2,
-			       sizeof(struct page));
-		foliop = &folio;
-	}
-
-	if (idx > nr_pages) {
-		if (loops-- > 0)
-			goto again;
+	snapshot_page(&ps, page);
+	if (!snapshot_page_is_faithful(&ps))
 		pr_warn("page does not match folio\n");
-		precise.compound_head &= ~1UL;
-		foliop = (struct folio *)&precise;
-		idx = 0;
-	}
 
-dump:
-	__dump_folio(foliop, &precise, pfn, idx);
+	__dump_folio(&ps.folio_snapshot, &ps.page_snapshot, ps.pfn, ps.idx);
 }
 
 void dump_page(const struct page *page, const char *reason)
diff --git a/mm/util.c b/mm/util.c
index 68ea833ba25f..f814e6a59ab1 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -25,6 +25,7 @@
 #include <linux/sizes.h>
 #include <linux/compat.h>
 #include <linux/fsnotify.h>
+#include <linux/page_idle.h>
 
 #include <linux/uaccess.h>
 
@@ -1172,6 +1173,86 @@ int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma)
 }
 EXPORT_SYMBOL(compat_vma_mmap_prepare);
 
+static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio,
+			 const struct page *page)
+{
+	/*
+	 * Only the first page of a high-order buddy page has PageBuddy() set.
+	 * So we have to check manually whether this page is part of a high-
+	 * order buddy page.
+	 */
+	if (PageBuddy(page))
+		ps->flags |= PAGE_SNAPSHOT_PG_BUDDY;
+	else if (page_count(page) == 0 && is_free_buddy_page(page))
+		ps->flags |= PAGE_SNAPSHOT_PG_BUDDY;
+
+	if (folio_test_idle(folio))
+		ps->flags |= PAGE_SNAPSHOT_PG_IDLE;
+}
+
+/**
+ * snapshot_page() - Create a snapshot of a struct page
+ * @ps: Pointer to a struct page_snapshot to store the page snapshot
+ * @page: The page to snapshot
+ *
+ * Create a snapshot of the page and store both its struct page and struct
+ * folio representations in @ps.
+ *
+ * A snapshot is marked as "faithful" if the compound state of @page was
+ * stable and allowed safe reconstruction of the folio representation. In
+ * rare cases where this is not possible (e.g. due to folio splitting),
+ * snapshot_page() falls back to treating @page as a single page and the
+ * snapshot is marked as "unfaithful". The snapshot_page_is_faithful()
+ * helper can be used to check for this condition.
+ */
+void snapshot_page(struct page_snapshot *ps, const struct page *page)
+{
+	unsigned long head, nr_pages = 1;
+	struct folio *foliop;
+	int loops = 5;
+
+	ps->pfn = page_to_pfn(page);
+	ps->flags = PAGE_SNAPSHOT_FAITHFUL;
+
+again:
+	memset(&ps->folio_snapshot, 0, sizeof(struct folio));
+	memcpy(&ps->page_snapshot, page, sizeof(*page));
+	head = ps->page_snapshot.compound_head;
+	if ((head & 1) == 0) {
+		ps->idx = 0;
+		foliop = (struct folio *)&ps->page_snapshot;
+		if (!folio_test_large(foliop)) {
+			set_ps_flags(ps, page_folio(page), page);
+			memcpy(&ps->folio_snapshot, foliop,
+			       sizeof(struct page));
+			return;
+		}
+		foliop = (struct folio *)page;
+	} else {
+		foliop = (struct folio *)(head - 1);
+		ps->idx = folio_page_idx(foliop, page);
+	}
+
+	if (ps->idx < MAX_FOLIO_NR_PAGES) {
+		memcpy(&ps->folio_snapshot, foliop, 2 * sizeof(struct page));
+		nr_pages = folio_nr_pages(&ps->folio_snapshot);
+		if (nr_pages > 1)
+			memcpy(&ps->folio_snapshot.__page_2, &foliop->__page_2,
+			       sizeof(struct page));
+		set_ps_flags(ps, foliop, page);
+	}
+
+	if (ps->idx > nr_pages) {
+		if (loops-- > 0)
+			goto again;
+		clear_compound_head(&ps->page_snapshot);
+		foliop = (struct folio *)&ps->page_snapshot;
+		memcpy(&ps->folio_snapshot, foliop, sizeof(struct page));
+		ps->flags = 0;
+		ps->idx = 0;
+	}
+}
+
 #ifdef CONFIG_MMU
 /**
  * folio_pte_batch - detect a PTE batch for a large folio
-- 
cgit v1.2.3


From 5631da56c9a87ea41d69d1bbbc1cee327eb9354b Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Sat, 19 Jul 2025 11:28:54 -0700
Subject: fs/proc/task_mmu: read proc/pid/maps under per-vma lock
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With maple_tree supporting vma tree traversal under RCU and per-vma locks,
/proc/pid/maps can be read while holding individual vma locks instead of
locking the entire address space.

A completely lockless approach (walking vma tree under RCU) would be quite
complex with the main issue being get_vma_name() using callbacks which
might not work correctly with a stable vma copy, requiring original
(unstable) vma - see special_mapping_name() for example.

When per-vma lock acquisition fails, we take the mmap_lock for reading,
lock the vma, release the mmap_lock and continue.  This fallback to mmap
read lock guarantees the reader to make forward progress even during lock
contention.  This will interfere with the writer but for a very short time
while we are acquiring the per-vma lock and only when there was contention
on the vma the reader is interested in.

We shouldn't see a repeated fallback to mmap read locks in practice, as
this requires a very unlikely series of lock contentions (for instance due
to repeated vma split operations).  However even if this did somehow
happen, we would still progress.

One case requiring special handling is when a vma changes between the time
it was found and the time it got locked.  A problematic case would be if a
vma got shrunk so that its vm_start moved higher in the address space and
a new vma was installed at the beginning:

reader found:               |--------VMA A--------|
VMA is modified:            |-VMA B-|----VMA A----|
reader locks modified VMA A
reader reports VMA A:       |  gap  |----VMA A----|

This would result in reporting a gap in the address space that does not
exist.  To prevent this we retry the lookup after locking the vma, however
we do that only when we identify a gap and detect that the address space
was changed after we found the vma.

This change is designed to reduce mmap_lock contention and prevent a
process reading /proc/pid/maps files (often a low priority task, such as
monitoring/data collection services) from blocking address space updates.
Note that this change has a userspace visible disadvantage: it allows for
sub-page data tearing as opposed to the previous mechanism where data
tearing could happen only between pages of generated output data.  Since
current userspace considers data tearing between pages to be acceptable,
we assume it will be able to handle sub-page data tearing as well.

Link: https://lkml.kernel.org/r/20250719182854.3166724-7-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jeongjun Park <aha310510@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: "Paul E . McKenney" <paulmck@kernel.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Thomas Weißschuh <linux@weissschuh.net>
Cc: T.J. Mercier <tjmercier@google.com>
Cc: Ye Bin <yebin10@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/internal.h        |   5 ++
 fs/proc/task_mmu.c        | 141 +++++++++++++++++++++++++++++++++++++++++++---
 include/linux/mmap_lock.h |  11 ++++
 mm/madvise.c              |   3 +-
 mm/mmap_lock.c            |  93 ++++++++++++++++++++++++++++++
 5 files changed, 244 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 3d48ffe72583..7c235451c5ea 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -384,6 +384,11 @@ struct proc_maps_private {
 	struct task_struct *task;
 	struct mm_struct *mm;
 	struct vma_iterator iter;
+	loff_t last_pos;
+#ifdef CONFIG_PER_VMA_LOCK
+	bool mmap_locked;
+	struct vm_area_struct *locked_vma;
+#endif
 #ifdef CONFIG_NUMA
 	struct mempolicy *task_mempolicy;
 #endif
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 90237df1ed33..3d6d8a9f13fc 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -130,13 +130,132 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
 }
 #endif
 
-static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
-						loff_t *ppos)
+#ifdef CONFIG_PER_VMA_LOCK
+
+static void unlock_vma(struct proc_maps_private *priv)
+{
+	if (priv->locked_vma) {
+		vma_end_read(priv->locked_vma);
+		priv->locked_vma = NULL;
+	}
+}
+
+static const struct seq_operations proc_pid_maps_op;
+
+static inline bool lock_vma_range(struct seq_file *m,
+				  struct proc_maps_private *priv)
+{
+	/*
+	 * smaps and numa_maps perform page table walk, therefore require
+	 * mmap_lock but maps can be read with locking just the vma and
+	 * walking the vma tree under rcu read protection.
+	 */
+	if (m->op != &proc_pid_maps_op) {
+		if (mmap_read_lock_killable(priv->mm))
+			return false;
+
+		priv->mmap_locked = true;
+	} else {
+		rcu_read_lock();
+		priv->locked_vma = NULL;
+		priv->mmap_locked = false;
+	}
+
+	return true;
+}
+
+static inline void unlock_vma_range(struct proc_maps_private *priv)
+{
+	if (priv->mmap_locked) {
+		mmap_read_unlock(priv->mm);
+	} else {
+		unlock_vma(priv);
+		rcu_read_unlock();
+	}
+}
+
+static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
+					   loff_t last_pos)
+{
+	struct vm_area_struct *vma;
+
+	if (priv->mmap_locked)
+		return vma_next(&priv->iter);
+
+	unlock_vma(priv);
+	vma = lock_next_vma(priv->mm, &priv->iter, last_pos);
+	if (!IS_ERR_OR_NULL(vma))
+		priv->locked_vma = vma;
+
+	return vma;
+}
+
+static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
+					 loff_t pos)
 {
-	struct vm_area_struct *vma = vma_next(&priv->iter);
+	if (priv->mmap_locked)
+		return false;
+
+	rcu_read_unlock();
+	mmap_read_lock(priv->mm);
+	/* Reinitialize the iterator after taking mmap_lock */
+	vma_iter_set(&priv->iter, pos);
+	priv->mmap_locked = true;
 
+	return true;
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline bool lock_vma_range(struct seq_file *m,
+				  struct proc_maps_private *priv)
+{
+	return mmap_read_lock_killable(priv->mm) == 0;
+}
+
+static inline void unlock_vma_range(struct proc_maps_private *priv)
+{
+	mmap_read_unlock(priv->mm);
+}
+
+static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
+					   loff_t last_pos)
+{
+	return vma_next(&priv->iter);
+}
+
+static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
+					 loff_t pos)
+{
+	return false;
+}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
+static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t *ppos)
+{
+	struct proc_maps_private *priv = m->private;
+	struct vm_area_struct *vma;
+
+retry:
+	vma = get_next_vma(priv, *ppos);
+	/* EINTR of EAGAIN is possible */
+	if (IS_ERR(vma)) {
+		if (PTR_ERR(vma) == -EAGAIN && fallback_to_mmap_lock(priv, *ppos))
+			goto retry;
+
+		return vma;
+	}
+
+	/* Store previous position to be able to restart if needed */
+	priv->last_pos = *ppos;
 	if (vma) {
-		*ppos = vma->vm_start;
+		/*
+		 * Track the end of the reported vma to ensure position changes
+		 * even if previous vma was merged with the next vma and we
+		 * found the extended vma with the same vm_start.
+		 */
+		*ppos = vma->vm_end;
 	} else {
 		*ppos = SENTINEL_VMA_GATE;
 		vma = get_gate_vma(priv->mm);
@@ -166,19 +285,25 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
 		return NULL;
 	}
 
-	if (mmap_read_lock_killable(mm)) {
+	if (!lock_vma_range(m, priv)) {
 		mmput(mm);
 		put_task_struct(priv->task);
 		priv->task = NULL;
 		return ERR_PTR(-EINTR);
 	}
 
+	/*
+	 * Reset current position if last_addr was set before
+	 * and it's not a sentinel.
+	 */
+	if (last_addr > 0)
+		*ppos = last_addr = priv->last_pos;
 	vma_iter_init(&priv->iter, mm, (unsigned long)last_addr);
 	hold_task_mempolicy(priv);
 	if (last_addr == SENTINEL_VMA_GATE)
 		return get_gate_vma(mm);
 
-	return proc_get_vma(priv, ppos);
+	return proc_get_vma(m, ppos);
 }
 
 static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
@@ -187,7 +312,7 @@ static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
 		*ppos = SENTINEL_VMA_END;
 		return NULL;
 	}
-	return proc_get_vma(m->private, ppos);
+	return proc_get_vma(m, ppos);
 }
 
 static void m_stop(struct seq_file *m, void *v)
@@ -199,7 +324,7 @@ static void m_stop(struct seq_file *m, void *v)
 		return;
 
 	release_task_mempolicy(priv);
-	mmap_read_unlock(mm);
+	unlock_vma_range(priv);
 	mmput(mm);
 	put_task_struct(priv->task);
 	priv->task = NULL;
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index 5da384bd0a26..1f4f44951abe 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -309,6 +309,17 @@ void vma_mark_detached(struct vm_area_struct *vma);
 struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
 					  unsigned long address);
 
+/*
+ * Locks next vma pointed by the iterator. Confirms the locked vma has not
+ * been modified and will retry under mmap_lock protection if modification
+ * was detected. Should be called from read RCU section.
+ * Returns either a valid locked VMA, NULL if no more VMAs or -EINTR if the
+ * process was interrupted.
+ */
+struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
+				     struct vma_iterator *iter,
+				     unsigned long address);
+
 #else /* CONFIG_PER_VMA_LOCK */
 
 static inline void mm_lock_seqcount_init(struct mm_struct *mm) {}
diff --git a/mm/madvise.c b/mm/madvise.c
index 2bf80989d5b6..bb80fc5ea08f 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -108,7 +108,8 @@ void anon_vma_name_free(struct kref *kref)
 
 struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
 {
-	mmap_assert_locked(vma->vm_mm);
+	if (!rwsem_is_locked(&vma->vm_mm->mmap_lock))
+		vma_assert_locked(vma);
 
 	return vma->anon_name;
 }
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index 5f725cc67334..729fb7d0dd59 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -178,6 +178,99 @@ inval:
 	count_vm_vma_lock_event(VMA_LOCK_ABORT);
 	return NULL;
 }
+
+static struct vm_area_struct *lock_next_vma_under_mmap_lock(struct mm_struct *mm,
+							    struct vma_iterator *vmi,
+							    unsigned long from_addr)
+{
+	struct vm_area_struct *vma;
+	int ret;
+
+	ret = mmap_read_lock_killable(mm);
+	if (ret)
+		return ERR_PTR(ret);
+
+	/* Lookup the vma at the last position again under mmap_read_lock */
+	vma_iter_set(vmi, from_addr);
+	vma = vma_next(vmi);
+	if (vma) {
+		/* Very unlikely vma->vm_refcnt overflow case */
+		if (unlikely(!vma_start_read_locked(vma)))
+			vma = ERR_PTR(-EAGAIN);
+	}
+
+	mmap_read_unlock(mm);
+
+	return vma;
+}
+
+struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
+				     struct vma_iterator *vmi,
+				     unsigned long from_addr)
+{
+	struct vm_area_struct *vma;
+	unsigned int mm_wr_seq;
+	bool mmap_unlocked;
+
+	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held");
+retry:
+	/* Start mmap_lock speculation in case we need to verify the vma later */
+	mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq);
+	vma = vma_next(vmi);
+	if (!vma)
+		return NULL;
+
+	vma = vma_start_read(mm, vma);
+	if (IS_ERR_OR_NULL(vma)) {
+		/*
+		 * Retry immediately if the vma gets detached from under us.
+		 * Infinite loop should not happen because the vma we find will
+		 * have to be constantly knocked out from under us.
+		 */
+		if (PTR_ERR(vma) == -EAGAIN) {
+			/* reset to search from the last address */
+			vma_iter_set(vmi, from_addr);
+			goto retry;
+		}
+
+		goto fallback;
+	}
+
+	/*
+	 * Verify the vma we locked belongs to the same address space and it's
+	 * not behind of the last search position.
+	 */
+	if (unlikely(vma->vm_mm != mm || from_addr >= vma->vm_end))
+		goto fallback_unlock;
+
+	/*
+	 * vma can be ahead of the last search position but we need to verify
+	 * it was not shrunk after we found it and another vma has not been
+	 * installed ahead of it. Otherwise we might observe a gap that should
+	 * not be there.
+	 */
+	if (from_addr < vma->vm_start) {
+		/* Verify only if the address space might have changed since vma lookup. */
+		if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) {
+			vma_iter_set(vmi, from_addr);
+			if (vma != vma_next(vmi))
+				goto fallback_unlock;
+		}
+	}
+
+	return vma;
+
+fallback_unlock:
+	vma_end_read(vma);
+fallback:
+	rcu_read_unlock();
+	vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr);
+	rcu_read_lock();
+	/* Reinitialize the iterator after re-entering rcu read section */
+	vma_iter_set(vmi, IS_ERR_OR_NULL(vma) ? from_addr : vma->vm_end);
+
+	return vma;
+}
 #endif /* CONFIG_PER_VMA_LOCK */
 
 #ifdef CONFIG_LOCK_MM_AND_FIND_VMA
-- 
cgit v1.2.3


From 0aa3657df3ec713fca1f00a57a063b28f2a78147 Mon Sep 17 00:00:00 2001
From: Dev Jain <dev.jain@arm.com>
Date: Fri, 18 Jul 2025 14:32:40 +0530
Subject: mm: add batched versions of ptep_modify_prot_start/commit

Batch ptep_modify_prot_start/commit in preparation for optimizing
mprotect, implementing them as a simple loop over the corresponding single
pte helpers.  Architecture may override these helpers.

Link: https://lkml.kernel.org/r/20250718090244.21092-4-dev.jain@arm.com
Signed-off-by: Dev Jain <dev.jain@arm.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Barry Song <baohua@kernel.org>
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jann Horn <jannh@google.com>
Cc: Joey Gouly <joey.gouly@arm.com>
Cc: Kevin Brodsky <kevin.brodsky@arm.com>
Cc: Lance Yang <ioworker0@gmail.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Cc: Yang Shi <yang@os.amperecomputing.com>
Cc: Yicong Yang <yangyicong@hisilicon.com>
Cc: Zhenhua Huang <quic_zhenhuah@quicinc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/pgtable.h | 84 ++++++++++++++++++++++++++++++++++++++++++++++++-
 mm/mprotect.c           |  4 +--
 2 files changed, 85 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index cf1515c163e2..e3b99920be05 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1331,7 +1331,9 @@ static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma,
 
 /*
  * Commit an update to a pte, leaving any hardware-controlled bits in
- * the PTE unmodified.
+ * the PTE unmodified. The pte returned from ptep_modify_prot_start() may
+ * additionally have young and/or dirty bits set where previously they were not,
+ * so the updated pte may have these additional changes.
  */
 static inline void ptep_modify_prot_commit(struct vm_area_struct *vma,
 					   unsigned long addr,
@@ -1340,6 +1342,86 @@ static inline void ptep_modify_prot_commit(struct vm_area_struct *vma,
 	__ptep_modify_prot_commit(vma, addr, ptep, pte);
 }
 #endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */
+
+/**
+ * modify_prot_start_ptes - Start a pte protection read-modify-write transaction
+ * over a batch of ptes, which protects against asynchronous hardware
+ * modifications to the ptes. The intention is not to prevent the hardware from
+ * making pte updates, but to prevent any updates it may make from being lost.
+ * Please see the comment above ptep_modify_prot_start() for full description.
+ *
+ * @vma: The virtual memory area the pages are mapped into.
+ * @addr: Address the first page is mapped at.
+ * @ptep: Page table pointer for the first entry.
+ * @nr: Number of entries; must be at least 1.
+ *
+ * May be overridden by the architecture; otherwise, implemented as a simple
+ * loop over ptep_modify_prot_start(), collecting the a/d bits from each pte
+ * in the batch.
+ *
+ * Note that the young and dirty bits can differ between PTEs in the batch.
+ *
+ * Context: The caller holds the page table lock.  The PTEs map consecutive
+ * pages that belong to the same folio. All other PTE bits must be identical for
+ * all PTEs in the batch except for young and dirty bits.  The PTEs are all in
+ * the same PMD.
+ */
+#ifndef modify_prot_start_ptes
+static inline pte_t modify_prot_start_ptes(struct vm_area_struct *vma,
+		unsigned long addr, pte_t *ptep, unsigned int nr)
+{
+	pte_t pte, tmp_pte;
+
+	pte = ptep_modify_prot_start(vma, addr, ptep);
+	while (--nr) {
+		ptep++;
+		addr += PAGE_SIZE;
+		tmp_pte = ptep_modify_prot_start(vma, addr, ptep);
+		if (pte_dirty(tmp_pte))
+			pte = pte_mkdirty(pte);
+		if (pte_young(tmp_pte))
+			pte = pte_mkyoung(pte);
+	}
+	return pte;
+}
+#endif
+
+/**
+ * modify_prot_commit_ptes - Commit an update to a batch of ptes, leaving any
+ * hardware-controlled bits in the PTE unmodified.
+ *
+ * @vma: The virtual memory area the pages are mapped into.
+ * @addr: Address the first page is mapped at.
+ * @ptep: Page table pointer for the first entry.
+ * @old_pte: Old page table entry (for the first entry) which is now cleared.
+ * @pte: New page table entry to be set.
+ * @nr: Number of entries.
+ *
+ * May be overridden by the architecture; otherwise, implemented as a simple
+ * loop over ptep_modify_prot_commit().
+ *
+ * Context: The caller holds the page table lock. The PTEs are all in the same
+ * PMD. On exit, the set ptes in the batch map the same folio. The pte returned
+ * by modify_prot_start_ptes() may additionally have young and/or dirty bits
+ * set where previously they were not, so the updated ptes may have these
+ * additional changes.
+ */
+#ifndef modify_prot_commit_ptes
+static inline void modify_prot_commit_ptes(struct vm_area_struct *vma, unsigned long addr,
+		pte_t *ptep, pte_t old_pte, pte_t pte, unsigned int nr)
+{
+	unsigned int i;
+
+	for (i = 0; i < nr; ++i, ++ptep, addr += PAGE_SIZE) {
+		ptep_modify_prot_commit(vma, addr, ptep, old_pte, pte);
+
+		/* Advance PFN only, set same prot */
+		old_pte = pte_next_pfn(old_pte);
+		pte = pte_next_pfn(pte);
+	}
+}
+#endif
+
 #endif /* CONFIG_MMU */
 
 /*
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 97adc62c50ab..4977f198168e 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -204,7 +204,7 @@ static long change_pte_range(struct mmu_gather *tlb,
 				}
 			}
 
-			oldpte = ptep_modify_prot_start(vma, addr, pte);
+			oldpte = modify_prot_start_ptes(vma, addr, pte, nr_ptes);
 			ptent = pte_modify(oldpte, newprot);
 
 			if (uffd_wp)
@@ -230,7 +230,7 @@ static long change_pte_range(struct mmu_gather *tlb,
 			    can_change_pte_writable(vma, addr, ptent))
 				ptent = pte_mkwrite(ptent, vma);
 
-			ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
+			modify_prot_commit_ptes(vma, addr, pte, oldpte, ptent, nr_ptes);
 			if (pte_needs_flush(oldpte, ptent))
 				tlb_flush_pte_range(tlb, addr, PAGE_SIZE);
 			pages++;
-- 
cgit v1.2.3


From 45cd52c44e85453c9147d41b715cd03c53325cf4 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 21 Jul 2025 21:46:18 +0100
Subject: mm: remove grab_cache_page()

All callers have been converted to use filemap_grab_folio().

Link: https://lkml.kernel.org/r/20250721204619.163883-1-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/pagemap.h | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 10a222e68b85..a5ef64a15f96 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -878,7 +878,8 @@ static inline struct page *find_or_create_page(struct address_space *mapping,
  * @mapping: target address_space
  * @index: the page index
  *
- * Same as grab_cache_page(), but do not wait if the page is unavailable.
+ * Returns locked page at given index in given cache, creating it if
+ * needed, but does not wait if the page is locked or to reclaim memory.
  * This is intended for speculative data generators, where the data can
  * be regenerated if the page couldn't be grabbed.  This routine should
  * be safe to call while holding the lock for another page.
@@ -942,15 +943,6 @@ unsigned filemap_get_folios_contig(struct address_space *mapping,
 unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start,
 		pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch);
 
-/*
- * Returns locked page at given index in given cache, creating it if needed.
- */
-static inline struct page *grab_cache_page(struct address_space *mapping,
-								pgoff_t index)
-{
-	return find_or_create_page(mapping, index, mapping_gfp_mask(mapping));
-}
-
 struct folio *read_cache_folio(struct address_space *, pgoff_t index,
 		filler_t *filler, struct file *file);
 struct folio *mapping_read_folio_gfp(struct address_space *, pgoff_t index,
-- 
cgit v1.2.3


From ffc72771ff6ec9f5b431a86c4b00d8ef0fea958b Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa+renesas@sang-engineering.com>
Date: Fri, 25 Jul 2025 13:03:27 +0200
Subject: regmap: Annotate that MMIO implies fast IO

Document that using the MMIO helpers will automatically enable the
'fast_io' parameter. This makes the used locking scheme more transparent
and avoids superfluous setting of this parameter in drivers.

Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Link: https://patch.msgid.link/20250725110337.4303-2-wsa+renesas@sang-engineering.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regmap.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 02b83f5499b8..4e1ac1fbcec4 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -913,7 +913,7 @@ int regmap_attach_dev(struct device *dev, struct regmap *map,
  * @config: Configuration for register map
  *
  * The return value will be an ERR_PTR() on error or a valid pointer to
- * a struct regmap.
+ * a struct regmap. Implies 'fast_io'.
  */
 #define regmap_init_mmio_clk(dev, clk_id, regs, config)			\
 	__regmap_lockdep_wrapper(__regmap_init_mmio_clk, #config,	\
@@ -927,7 +927,7 @@ int regmap_attach_dev(struct device *dev, struct regmap *map,
  * @config: Configuration for register map
  *
  * The return value will be an ERR_PTR() on error or a valid pointer to
- * a struct regmap.
+ * a struct regmap. Implies 'fast_io'.
  */
 #define regmap_init_mmio(dev, regs, config)		\
 	regmap_init_mmio_clk(dev, NULL, regs, config)
@@ -1138,7 +1138,7 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg);
  *
  * The return value will be an ERR_PTR() on error or a valid pointer
  * to a struct regmap.  The regmap will be automatically freed by the
- * device management code.
+ * device management code. Implies 'fast_io'.
  */
 #define devm_regmap_init_mmio_clk(dev, clk_id, regs, config)		\
 	__regmap_lockdep_wrapper(__devm_regmap_init_mmio_clk, #config,	\
@@ -1153,7 +1153,7 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg);
  *
  * The return value will be an ERR_PTR() on error or a valid pointer
  * to a struct regmap.  The regmap will be automatically freed by the
- * device management code.
+ * device management code. Implies 'fast_io'.
  */
 #define devm_regmap_init_mmio(dev, regs, config)		\
 	devm_regmap_init_mmio_clk(dev, NULL, regs, config)
-- 
cgit v1.2.3


From 43cf0e05089afe23dac74fa6e1e109d49f2903c4 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 12 Jun 2025 10:12:59 -0400
Subject: powerpc/thp: tracing: Hide hugepage events under CONFIG_PPC_BOOK3S_64

The events hugepage_set_pmd, hugepage_set_pud, hugepage_update_pmd and
hugepage_update_pud are only called when CONFIG_PPC_BOOK3S_64 is defined.
As each event can take up to 5K regardless of whether it is used, it's
best not to define them when they are not used. Add an #ifdef around these
events so that they are only defined when CONFIG_PPC_BOOK3S_64 is enabled.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/20250612101259.0ad43e48@batman.local.home
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Madhavan Srinivasan <maddy@linux.ibm.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/trace/events/thp.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/trace/events/thp.h b/include/trace/events/thp.h
index f50048af5fcc..c8fe879d5828 100644
--- a/include/trace/events/thp.h
+++ b/include/trace/events/thp.h
@@ -8,6 +8,7 @@
 #include <linux/types.h>
 #include <linux/tracepoint.h>
 
+#ifdef CONFIG_PPC_BOOK3S_64
 DECLARE_EVENT_CLASS(hugepage_set,
 
 	    TP_PROTO(unsigned long addr, unsigned long pte),
@@ -66,6 +67,7 @@ DEFINE_EVENT(hugepage_update, hugepage_update_pud,
 	    TP_PROTO(unsigned long addr, unsigned long pud, unsigned long clr, unsigned long set),
 	    TP_ARGS(addr, pud, clr, set)
 );
+#endif /* CONFIG_PPC_BOOK3S_64 */
 
 DECLARE_EVENT_CLASS(migration_pmd,
 
-- 
cgit v1.2.3


From 54091eee08acebfb5e971611c3f189e7577a1058 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Fri, 25 Jul 2025 10:58:14 +0900
Subject: scsi: libsas: Refactor dev_is_sata()

Use a switch statement in dev_is_sata() to make the code more readable
(and probably slightly better than a series of or conditions). Also have
this inline function return a boolean instead of an integer.

No functional changes.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Link: https://lore.kernel.org/r/20250725015818.171252-2-dlemoal@kernel.org
Reviewed-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/scsi/sas_ata.h | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/scsi/sas_ata.h b/include/scsi/sas_ata.h
index 92e27e7bf088..8dddd0036f99 100644
--- a/include/scsi/sas_ata.h
+++ b/include/scsi/sas_ata.h
@@ -15,10 +15,17 @@
 
 #ifdef CONFIG_SCSI_SAS_ATA
 
-static inline int dev_is_sata(struct domain_device *dev)
+static inline bool dev_is_sata(struct domain_device *dev)
 {
-	return dev->dev_type == SAS_SATA_DEV || dev->dev_type == SAS_SATA_PM ||
-	       dev->dev_type == SAS_SATA_PM_PORT || dev->dev_type == SAS_SATA_PENDING;
+	switch (dev->dev_type) {
+	case SAS_SATA_DEV:
+	case SAS_SATA_PENDING:
+	case SAS_SATA_PM:
+	case SAS_SATA_PM_PORT:
+		return true;
+	default:
+		return false;
+	}
 }
 
 int sas_get_ata_info(struct domain_device *dev, struct ex_phy *phy);
@@ -49,9 +56,9 @@ static inline void sas_ata_disabled_notice(void)
 	pr_notice_once("ATA device seen but CONFIG_SCSI_SAS_ATA=N\n");
 }
 
-static inline int dev_is_sata(struct domain_device *dev)
+static inline bool dev_is_sata(struct domain_device *dev)
 {
-	return 0;
+	return false;
 }
 static inline int sas_ata_init(struct domain_device *dev)
 {
-- 
cgit v1.2.3


From bd31394aabf36ee18781c6371e02d789484ffda3 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Fri, 25 Jul 2025 10:58:16 +0900
Subject: scsi: libsas: Make sas_get_ata_info() static

The function sas_get_ata_info() is used only in
drivers/scsi/libsas/sas_ata.c. Remove its declaration and inline stub from
include/scsi/sas_ata.h and make this function static.

No functional changes.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Link: https://lore.kernel.org/r/20250725015818.171252-4-dlemoal@kernel.org
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/libsas/sas_ata.c | 2 +-
 include/scsi/sas_ata.h        | 6 ------
 2 files changed, 1 insertion(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c
index 2cbf38b18c5c..cc093cdc9c69 100644
--- a/drivers/scsi/libsas/sas_ata.c
+++ b/drivers/scsi/libsas/sas_ata.c
@@ -252,7 +252,7 @@ static int sas_get_ata_command_set(struct domain_device *dev)
 	return ata_dev_classify(&tf);
 }
 
-int sas_get_ata_info(struct domain_device *dev, struct ex_phy *phy)
+static int sas_get_ata_info(struct domain_device *dev, struct ex_phy *phy)
 {
 	if (phy->attached_tproto & SAS_PROTOCOL_STP)
 		dev->tproto = phy->attached_tproto;
diff --git a/include/scsi/sas_ata.h b/include/scsi/sas_ata.h
index 8dddd0036f99..5e3475975aee 100644
--- a/include/scsi/sas_ata.h
+++ b/include/scsi/sas_ata.h
@@ -28,7 +28,6 @@ static inline bool dev_is_sata(struct domain_device *dev)
 	}
 }
 
-int sas_get_ata_info(struct domain_device *dev, struct ex_phy *phy);
 int sas_ata_init(struct domain_device *dev);
 void sas_ata_task_abort(struct sas_task *task);
 void sas_ata_strategy_handler(struct Scsi_Host *shost);
@@ -96,11 +95,6 @@ static inline void sas_resume_sata(struct asd_sas_port *port)
 {
 }
 
-static inline int sas_get_ata_info(struct domain_device *dev, struct ex_phy *phy)
-{
-	return 0;
-}
-
 static inline void sas_ata_end_eh(struct ata_port *ap)
 {
 }
-- 
cgit v1.2.3


From 704ed03abf6b1c2752a8b16446a5ebf18694fefe Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Fri, 25 Jul 2025 10:58:17 +0900
Subject: scsi: libsas: Move declarations of internal functions to
 sas_internal.h

Move the declaration of all functions used only within libsas from
include/scsi/sas_ata.h to drivers/scsi/libsas/sas_internal.h.

No functional changes.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Link: https://lore.kernel.org/r/20250725015818.171252-5-dlemoal@kernel.org
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/libsas/sas_internal.h | 74 ++++++++++++++++++++++++++++++++++++++
 include/scsi/sas_ata.h             | 68 +----------------------------------
 2 files changed, 75 insertions(+), 67 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/libsas/sas_internal.h b/drivers/scsi/libsas/sas_internal.h
index 03d6ec1eb970..16f8d81d7531 100644
--- a/drivers/scsi/libsas/sas_internal.h
+++ b/drivers/scsi/libsas/sas_internal.h
@@ -222,4 +222,78 @@ static inline void sas_put_device(struct domain_device *dev)
 	kref_put(&dev->kref, sas_free_device);
 }
 
+#ifdef CONFIG_SCSI_SAS_ATA
+
+int sas_ata_init(struct domain_device *dev);
+void sas_ata_task_abort(struct sas_task *task);
+int sas_discover_sata(struct domain_device *dev);
+int sas_ata_add_dev(struct domain_device *parent, struct ex_phy *phy,
+		    struct domain_device *child, int phy_id);
+void sas_ata_strategy_handler(struct Scsi_Host *shost);
+void sas_ata_eh(struct Scsi_Host *shost, struct list_head *work_q);
+void sas_ata_end_eh(struct ata_port *ap);
+void sas_ata_wait_eh(struct domain_device *dev);
+void sas_probe_sata(struct asd_sas_port *port);
+void sas_suspend_sata(struct asd_sas_port *port);
+void sas_resume_sata(struct asd_sas_port *port);
+
+#else
+
+static inline int sas_ata_init(struct domain_device *dev)
+{
+	return 0;
+}
+
+static inline void sas_ata_task_abort(struct sas_task *task)
+{
+}
+
+static inline void sas_ata_strategy_handler(struct Scsi_Host *shost)
+{
+}
+
+static inline void sas_ata_eh(struct Scsi_Host *shost, struct list_head *work_q)
+{
+}
+
+static inline void sas_ata_end_eh(struct ata_port *ap)
+{
+}
+
+static inline void sas_ata_wait_eh(struct domain_device *dev)
+{
+}
+
+static inline void sas_probe_sata(struct asd_sas_port *port)
+{
+}
+
+static inline void sas_suspend_sata(struct asd_sas_port *port)
+{
+}
+
+static inline void sas_resume_sata(struct asd_sas_port *port)
+{
+}
+
+static inline void sas_ata_disabled_notice(void)
+{
+	pr_notice_once("ATA device seen but CONFIG_SCSI_SAS_ATA=N\n");
+}
+
+static inline int sas_discover_sata(struct domain_device *dev)
+{
+	sas_ata_disabled_notice();
+	return -ENXIO;
+}
+
+static inline int sas_ata_add_dev(struct domain_device *parent, struct ex_phy *phy,
+				  struct domain_device *child, int phy_id)
+{
+	sas_ata_disabled_notice();
+	return -ENODEV;
+}
+
+#endif
+
 #endif /* _SAS_INTERNAL_H_ */
diff --git a/include/scsi/sas_ata.h b/include/scsi/sas_ata.h
index 5e3475975aee..a161c0222931 100644
--- a/include/scsi/sas_ata.h
+++ b/include/scsi/sas_ata.h
@@ -28,77 +28,24 @@ static inline bool dev_is_sata(struct domain_device *dev)
 	}
 }
 
-int sas_ata_init(struct domain_device *dev);
-void sas_ata_task_abort(struct sas_task *task);
-void sas_ata_strategy_handler(struct Scsi_Host *shost);
-void sas_ata_eh(struct Scsi_Host *shost, struct list_head *work_q);
 void sas_ata_schedule_reset(struct domain_device *dev);
-void sas_ata_wait_eh(struct domain_device *dev);
-void sas_probe_sata(struct asd_sas_port *port);
-void sas_suspend_sata(struct asd_sas_port *port);
-void sas_resume_sata(struct asd_sas_port *port);
-void sas_ata_end_eh(struct ata_port *ap);
 void sas_ata_device_link_abort(struct domain_device *dev, bool force_reset);
-int sas_execute_ata_cmd(struct domain_device *device, u8 *fis,
-			int force_phy_id);
+int sas_execute_ata_cmd(struct domain_device *device, u8 *fis, int force_phy_id);
 int smp_ata_check_ready_type(struct ata_link *link);
-int sas_discover_sata(struct domain_device *dev);
-int sas_ata_add_dev(struct domain_device *parent, struct ex_phy *phy,
-		    struct domain_device *child, int phy_id);
 
 extern const struct attribute_group sas_ata_sdev_attr_group;
 
 #else
 
-static inline void sas_ata_disabled_notice(void)
-{
-	pr_notice_once("ATA device seen but CONFIG_SCSI_SAS_ATA=N\n");
-}
-
 static inline bool dev_is_sata(struct domain_device *dev)
 {
 	return false;
 }
-static inline int sas_ata_init(struct domain_device *dev)
-{
-	return 0;
-}
-static inline void sas_ata_task_abort(struct sas_task *task)
-{
-}
-
-static inline void sas_ata_strategy_handler(struct Scsi_Host *shost)
-{
-}
-
-static inline void sas_ata_eh(struct Scsi_Host *shost, struct list_head *work_q)
-{
-}
 
 static inline void sas_ata_schedule_reset(struct domain_device *dev)
 {
 }
 
-static inline void sas_ata_wait_eh(struct domain_device *dev)
-{
-}
-
-static inline void sas_probe_sata(struct asd_sas_port *port)
-{
-}
-
-static inline void sas_suspend_sata(struct asd_sas_port *port)
-{
-}
-
-static inline void sas_resume_sata(struct asd_sas_port *port)
-{
-}
-
-static inline void sas_ata_end_eh(struct ata_port *ap)
-{
-}
-
 static inline void sas_ata_device_link_abort(struct domain_device *dev,
 					     bool force_reset)
 {
@@ -115,19 +62,6 @@ static inline int smp_ata_check_ready_type(struct ata_link *link)
 	return 0;
 }
 
-static inline int sas_discover_sata(struct domain_device *dev)
-{
-	sas_ata_disabled_notice();
-	return -ENXIO;
-}
-
-static inline int sas_ata_add_dev(struct domain_device *parent, struct ex_phy *phy,
-				  struct domain_device *child, int phy_id)
-{
-	sas_ata_disabled_notice();
-	return -ENODEV;
-}
-
 #define sas_ata_sdev_attr_group ((struct attribute_group) {})
 
 #endif
-- 
cgit v1.2.3


From 24cbfe18d55a6866bc2e27fda74306f4a1b5cb01 Mon Sep 17 00:00:00 2001
From: Nam Cao <namcao@linutronix.de>
Date: Thu, 24 Jul 2025 19:33:27 +0200
Subject: rv: Merge struct rv_monitor_def into struct rv_monitor

Each struct rv_monitor has a unique struct rv_monitor_def associated with
it. struct rv_monitor is statically allocated, while struct rv_monitor_def
is dynamically allocated.

This makes the code more complicated than it should be:

  - Lookup is required to get the associated rv_monitor_def from rv_monitor

  - Dynamic memory allocation is required for rv_monitor_def. This is
    harder to get right compared to static memory. For instance, there is
    an existing mistake: rv_unregister_monitor() does not free the memory
    allocated by rv_register_monitor(). This is fortunately not a real
    memory leak problem, as rv_unregister_monitor() is never called.

Simplify and merge rv_monitor_def into rv_monitor.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://lore.kernel.org/194449c00f87945c207aab4c96920c75796a4f53.1753378331.git.namcao@linutronix.de
Reviewed-by: Gabriele Monaco <gmonaco@redhat.com>
Signed-off-by: Nam Cao <namcao@linutronix.de>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/rv.h            |   8 ++
 kernel/trace/rv/rv.c          | 211 +++++++++++++++++++-----------------------
 kernel/trace/rv/rv.h          |  27 ++----
 kernel/trace/rv/rv_reactors.c |  62 ++++++-------
 4 files changed, 140 insertions(+), 168 deletions(-)

(limited to 'include')

diff --git a/include/linux/rv.h b/include/linux/rv.h
index 97baf58d88b2..dba53aecdfab 100644
--- a/include/linux/rv.h
+++ b/include/linux/rv.h
@@ -7,6 +7,9 @@
 #ifndef _LINUX_RV_H
 #define _LINUX_RV_H
 
+#include <linux/types.h>
+#include <linux/list.h>
+
 #define MAX_DA_NAME_LEN	32
 
 #ifdef CONFIG_RV
@@ -98,8 +101,13 @@ struct rv_monitor {
 	void			(*disable)(void);
 	void			(*reset)(void);
 #ifdef CONFIG_RV_REACTORS
+	struct rv_reactor_def	*rdef;
 	__printf(1, 2) void	(*react)(const char *msg, ...);
+	bool			reacting;
 #endif
+	struct list_head	list;
+	struct rv_monitor	*parent;
+	struct dentry		*root_d;
 };
 
 bool rv_monitoring_on(void);
diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
index 108429d16ec1..6c0be2fdc52d 100644
--- a/kernel/trace/rv/rv.c
+++ b/kernel/trace/rv/rv.c
@@ -210,9 +210,9 @@ void rv_put_task_monitor_slot(int slot)
  * Monitors with a parent are nested,
  * Monitors without a parent could be standalone or containers.
  */
-bool rv_is_nested_monitor(struct rv_monitor_def *mdef)
+bool rv_is_nested_monitor(struct rv_monitor *mon)
 {
-	return mdef->parent != NULL;
+	return mon->parent != NULL;
 }
 
 /*
@@ -223,16 +223,16 @@ bool rv_is_nested_monitor(struct rv_monitor_def *mdef)
  * for enable()/disable(). Use this condition to find empty containers.
  * Keep both conditions in case we have some non-compliant containers.
  */
-bool rv_is_container_monitor(struct rv_monitor_def *mdef)
+bool rv_is_container_monitor(struct rv_monitor *mon)
 {
-	struct rv_monitor_def *next;
+	struct rv_monitor *next;
 
-	if (list_is_last(&mdef->list, &rv_monitors_list))
+	if (list_is_last(&mon->list, &rv_monitors_list))
 		return false;
 
-	next = list_next_entry(mdef, list);
+	next = list_next_entry(mon, list);
 
-	return next->parent == mdef->monitor || !mdef->monitor->enable;
+	return next->parent == mon || !mon->enable;
 }
 
 /*
@@ -241,10 +241,10 @@ bool rv_is_container_monitor(struct rv_monitor_def *mdef)
 static ssize_t monitor_enable_read_data(struct file *filp, char __user *user_buf, size_t count,
 					loff_t *ppos)
 {
-	struct rv_monitor_def *mdef = filp->private_data;
+	struct rv_monitor *mon = filp->private_data;
 	const char *buff;
 
-	buff = mdef->monitor->enabled ? "1\n" : "0\n";
+	buff = mon->enabled ? "1\n" : "0\n";
 
 	return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff)+1);
 }
@@ -252,14 +252,14 @@ static ssize_t monitor_enable_read_data(struct file *filp, char __user *user_buf
 /*
  * __rv_disable_monitor - disabled an enabled monitor
  */
-static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync)
+static int __rv_disable_monitor(struct rv_monitor *mon, bool sync)
 {
 	lockdep_assert_held(&rv_interface_lock);
 
-	if (mdef->monitor->enabled) {
-		mdef->monitor->enabled = 0;
-		if (mdef->monitor->disable)
-			mdef->monitor->disable();
+	if (mon->enabled) {
+		mon->enabled = 0;
+		if (mon->disable)
+			mon->disable();
 
 		/*
 		 * Wait for the execution of all events to finish.
@@ -273,90 +273,90 @@ static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync)
 	return 0;
 }
 
-static void rv_disable_single(struct rv_monitor_def *mdef)
+static void rv_disable_single(struct rv_monitor *mon)
 {
-	__rv_disable_monitor(mdef, true);
+	__rv_disable_monitor(mon, true);
 }
 
-static int rv_enable_single(struct rv_monitor_def *mdef)
+static int rv_enable_single(struct rv_monitor *mon)
 {
 	int retval;
 
 	lockdep_assert_held(&rv_interface_lock);
 
-	if (mdef->monitor->enabled)
+	if (mon->enabled)
 		return 0;
 
-	retval = mdef->monitor->enable();
+	retval = mon->enable();
 
 	if (!retval)
-		mdef->monitor->enabled = 1;
+		mon->enabled = 1;
 
 	return retval;
 }
 
-static void rv_disable_container(struct rv_monitor_def *mdef)
+static void rv_disable_container(struct rv_monitor *mon)
 {
-	struct rv_monitor_def *p = mdef;
+	struct rv_monitor *p = mon;
 	int enabled = 0;
 
 	list_for_each_entry_continue(p, &rv_monitors_list, list) {
-		if (p->parent != mdef->monitor)
+		if (p->parent != mon)
 			break;
 		enabled += __rv_disable_monitor(p, false);
 	}
 	if (enabled)
 		tracepoint_synchronize_unregister();
-	mdef->monitor->enabled = 0;
+	mon->enabled = 0;
 }
 
-static int rv_enable_container(struct rv_monitor_def *mdef)
+static int rv_enable_container(struct rv_monitor *mon)
 {
-	struct rv_monitor_def *p = mdef;
+	struct rv_monitor *p = mon;
 	int retval = 0;
 
 	list_for_each_entry_continue(p, &rv_monitors_list, list) {
-		if (retval || p->parent != mdef->monitor)
+		if (retval || p->parent != mon)
 			break;
 		retval = rv_enable_single(p);
 	}
 	if (retval)
-		rv_disable_container(mdef);
+		rv_disable_container(mon);
 	else
-		mdef->monitor->enabled = 1;
+		mon->enabled = 1;
 	return retval;
 }
 
 /**
  * rv_disable_monitor - disable a given runtime monitor
- * @mdef: Pointer to the monitor definition structure.
+ * @mon: Pointer to the monitor structure.
  *
  * Returns 0 on success.
  */
-int rv_disable_monitor(struct rv_monitor_def *mdef)
+int rv_disable_monitor(struct rv_monitor *mon)
 {
-	if (rv_is_container_monitor(mdef))
-		rv_disable_container(mdef);
+	if (rv_is_container_monitor(mon))
+		rv_disable_container(mon);
 	else
-		rv_disable_single(mdef);
+		rv_disable_single(mon);
 
 	return 0;
 }
 
 /**
  * rv_enable_monitor - enable a given runtime monitor
- * @mdef: Pointer to the monitor definition structure.
+ * @mon: Pointer to the monitor definition structure.
  *
  * Returns 0 on success, error otherwise.
  */
-int rv_enable_monitor(struct rv_monitor_def *mdef)
+int rv_enable_monitor(struct rv_monitor *mon)
 {
 	int retval;
 
-	if (rv_is_container_monitor(mdef))
-		retval = rv_enable_container(mdef);
+	if (rv_is_container_monitor(mon))
+		retval = rv_enable_container(mon);
 	else
-		retval = rv_enable_single(mdef);
+		retval = rv_enable_single(mon);
 
 	return retval;
 }
@@ -367,7 +367,7 @@ int rv_enable_monitor(struct rv_monitor_def *mdef)
 static ssize_t monitor_enable_write_data(struct file *filp, const char __user *user_buf,
 					 size_t count, loff_t *ppos)
 {
-	struct rv_monitor_def *mdef = filp->private_data;
+	struct rv_monitor *mon = filp->private_data;
 	int retval;
 	bool val;
 
@@ -378,9 +378,9 @@ static ssize_t monitor_enable_write_data(struct file *filp, const char __user *u
 	mutex_lock(&rv_interface_lock);
 
 	if (val)
-		retval = rv_enable_monitor(mdef);
+		retval = rv_enable_monitor(mon);
 	else
-		retval = rv_disable_monitor(mdef);
+		retval = rv_disable_monitor(mon);
 
 	mutex_unlock(&rv_interface_lock);
 
@@ -399,12 +399,12 @@ static const struct file_operations interface_enable_fops = {
 static ssize_t monitor_desc_read_data(struct file *filp, char __user *user_buf, size_t count,
 				      loff_t *ppos)
 {
-	struct rv_monitor_def *mdef = filp->private_data;
+	struct rv_monitor *mon = filp->private_data;
 	char buff[256];
 
 	memset(buff, 0, sizeof(buff));
 
-	snprintf(buff, sizeof(buff), "%s\n", mdef->monitor->description);
+	snprintf(buff, sizeof(buff), "%s\n", mon->description);
 
 	return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff) + 1);
 }
@@ -419,37 +419,37 @@ static const struct file_operations interface_desc_fops = {
  * the monitor dir, where the specific options of the monitor
  * are exposed.
  */
-static int create_monitor_dir(struct rv_monitor_def *mdef, struct rv_monitor_def *parent)
+static int create_monitor_dir(struct rv_monitor *mon, struct rv_monitor *parent)
 {
 	struct dentry *root = parent ? parent->root_d : get_monitors_root();
-	const char *name = mdef->monitor->name;
+	const char *name = mon->name;
 	struct dentry *tmp;
 	int retval;
 
-	mdef->root_d = rv_create_dir(name, root);
-	if (!mdef->root_d)
+	mon->root_d = rv_create_dir(name, root);
+	if (!mon->root_d)
 		return -ENOMEM;
 
-	tmp = rv_create_file("enable", RV_MODE_WRITE, mdef->root_d, mdef, &interface_enable_fops);
+	tmp = rv_create_file("enable", RV_MODE_WRITE, mon->root_d, mon, &interface_enable_fops);
 	if (!tmp) {
 		retval = -ENOMEM;
 		goto out_remove_root;
 	}
 
-	tmp = rv_create_file("desc", RV_MODE_READ, mdef->root_d, mdef, &interface_desc_fops);
+	tmp = rv_create_file("desc", RV_MODE_READ, mon->root_d, mon, &interface_desc_fops);
 	if (!tmp) {
 		retval = -ENOMEM;
 		goto out_remove_root;
 	}
 
-	retval = reactor_populate_monitor(mdef);
+	retval = reactor_populate_monitor(mon);
 	if (retval)
 		goto out_remove_root;
 
 	return 0;
 
 out_remove_root:
-	rv_remove(mdef->root_d);
+	rv_remove(mon->root_d);
 	return retval;
 }
 
@@ -458,13 +458,12 @@ out_remove_root:
  */
 static int monitors_show(struct seq_file *m, void *p)
 {
-	struct rv_monitor_def *mon_def = p;
+	struct rv_monitor *mon = p;
 
-	if (mon_def->parent)
-		seq_printf(m, "%s:%s\n", mon_def->parent->name,
-			   mon_def->monitor->name);
+	if (mon->parent)
+		seq_printf(m, "%s:%s\n", mon->parent->name, mon->name);
 	else
-		seq_printf(m, "%s\n", mon_def->monitor->name);
+		seq_printf(m, "%s\n", mon->name);
 	return 0;
 }
 
@@ -496,13 +495,13 @@ static void *available_monitors_next(struct seq_file *m, void *p, loff_t *pos)
  */
 static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos)
 {
-	struct rv_monitor_def *m_def = p;
+	struct rv_monitor *mon = p;
 
 	(*pos)++;
 
-	list_for_each_entry_continue(m_def, &rv_monitors_list, list) {
-		if (m_def->monitor->enabled)
-			return m_def;
+	list_for_each_entry_continue(mon, &rv_monitors_list, list) {
+		if (mon->enabled)
+			return mon;
 	}
 
 	return NULL;
@@ -510,7 +509,7 @@ static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos)
 
 static void *enabled_monitors_start(struct seq_file *m, loff_t *pos)
 {
-	struct rv_monitor_def *m_def;
+	struct rv_monitor *mon;
 	loff_t l;
 
 	mutex_lock(&rv_interface_lock);
@@ -518,15 +517,15 @@ static void *enabled_monitors_start(struct seq_file *m, loff_t *pos)
 	if (list_empty(&rv_monitors_list))
 		return NULL;
 
-	m_def = list_entry(&rv_monitors_list, struct rv_monitor_def, list);
+	mon = list_entry(&rv_monitors_list, struct rv_monitor, list);
 
 	for (l = 0; l <= *pos; ) {
-		m_def = enabled_monitors_next(m, m_def, &l);
-		if (!m_def)
+		mon = enabled_monitors_next(m, mon, &l);
+		if (!mon)
 			break;
 	}
 
-	return m_def;
+	return mon;
 }
 
 /*
@@ -566,13 +565,13 @@ static const struct file_operations available_monitors_ops = {
  */
 static void disable_all_monitors(void)
 {
-	struct rv_monitor_def *mdef;
+	struct rv_monitor *mon;
 	int enabled = 0;
 
 	mutex_lock(&rv_interface_lock);
 
-	list_for_each_entry(mdef, &rv_monitors_list, list)
-		enabled += __rv_disable_monitor(mdef, false);
+	list_for_each_entry(mon, &rv_monitors_list, list)
+		enabled += __rv_disable_monitor(mon, false);
 
 	if (enabled) {
 		/*
@@ -598,7 +597,7 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user
 				      size_t count, loff_t *ppos)
 {
 	char buff[MAX_RV_MONITOR_NAME_SIZE + 2];
-	struct rv_monitor_def *mdef;
+	struct rv_monitor *mon;
 	int retval = -EINVAL;
 	bool enable = true;
 	char *ptr, *tmp;
@@ -633,17 +632,17 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user
 	if (tmp)
 		ptr = tmp+1;
 
-	list_for_each_entry(mdef, &rv_monitors_list, list) {
-		if (strcmp(ptr, mdef->monitor->name) != 0)
+	list_for_each_entry(mon, &rv_monitors_list, list) {
+		if (strcmp(ptr, mon->name) != 0)
 			continue;
 
 		/*
 		 * Monitor found!
 		 */
 		if (enable)
-			retval = rv_enable_monitor(mdef);
+			retval = rv_enable_monitor(mon);
 		else
-			retval = rv_disable_monitor(mdef);
+			retval = rv_disable_monitor(mon);
 
 		if (!retval)
 			retval = count;
@@ -702,11 +701,11 @@ static void turn_monitoring_off(void)
 
 static void reset_all_monitors(void)
 {
-	struct rv_monitor_def *mdef;
+	struct rv_monitor *mon;
 
-	list_for_each_entry(mdef, &rv_monitors_list, list) {
-		if (mdef->monitor->enabled && mdef->monitor->reset)
-			mdef->monitor->reset();
+	list_for_each_entry(mon, &rv_monitors_list, list) {
+		if (mon->enabled && mon->reset)
+			mon->reset();
 	}
 }
 
@@ -768,10 +767,10 @@ static const struct file_operations monitoring_on_fops = {
 	.read   = monitoring_on_read_data,
 };
 
-static void destroy_monitor_dir(struct rv_monitor_def *mdef)
+static void destroy_monitor_dir(struct rv_monitor *mon)
 {
-	reactor_cleanup_monitor(mdef);
-	rv_remove(mdef->root_d);
+	reactor_cleanup_monitor(mon);
+	rv_remove(mon->root_d);
 }
 
 /**
@@ -783,7 +782,7 @@ static void destroy_monitor_dir(struct rv_monitor_def *mdef)
  */
 int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent)
 {
-	struct rv_monitor_def *r, *p = NULL;
+	struct rv_monitor *r;
 	int retval = 0;
 
 	if (strlen(monitor->name) >= MAX_RV_MONITOR_NAME_SIZE) {
@@ -795,49 +794,31 @@ int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent)
 	mutex_lock(&rv_interface_lock);
 
 	list_for_each_entry(r, &rv_monitors_list, list) {
-		if (strcmp(monitor->name, r->monitor->name) == 0) {
+		if (strcmp(monitor->name, r->name) == 0) {
 			pr_info("Monitor %s is already registered\n", monitor->name);
 			retval = -EEXIST;
 			goto out_unlock;
 		}
 	}
 
-	if (parent) {
-		list_for_each_entry(r, &rv_monitors_list, list) {
-			if (strcmp(parent->name, r->monitor->name) == 0) {
-				p = r;
-				break;
-			}
-		}
-	}
-
-	if (p && rv_is_nested_monitor(p)) {
+	if (parent && rv_is_nested_monitor(parent)) {
 		pr_info("Parent monitor %s is already nested, cannot nest further\n",
 			parent->name);
 		retval = -EINVAL;
 		goto out_unlock;
 	}
 
-	r = kzalloc(sizeof(struct rv_monitor_def), GFP_KERNEL);
-	if (!r) {
-		retval = -ENOMEM;
-		goto out_unlock;
-	}
-
-	r->monitor = monitor;
-	r->parent = parent;
+	monitor->parent = parent;
 
-	retval = create_monitor_dir(r, p);
-	if (retval) {
-		kfree(r);
-		goto out_unlock;
-	}
+	retval = create_monitor_dir(monitor, parent);
+	if (retval)
+		goto out_unlock;
 
 	/* keep children close to the parent for easier visualisation */
-	if (p)
-		list_add(&r->list, &p->list);
+	if (parent)
+		list_add(&monitor->list, &parent->list);
 	else
-		list_add_tail(&r->list, &rv_monitors_list);
+		list_add_tail(&monitor->list, &rv_monitors_list);
 
 out_unlock:
 	mutex_unlock(&rv_interface_lock);
@@ -852,17 +833,11 @@ out_unlock:
  */
 int rv_unregister_monitor(struct rv_monitor *monitor)
 {
-	struct rv_monitor_def *ptr, *next;
-
 	mutex_lock(&rv_interface_lock);
 
-	list_for_each_entry_safe(ptr, next, &rv_monitors_list, list) {
-		if (strcmp(monitor->name, ptr->monitor->name) == 0) {
-			rv_disable_monitor(ptr);
-			list_del(&ptr->list);
-			destroy_monitor_dir(ptr);
-		}
-	}
+	rv_disable_monitor(monitor);
+	list_del(&monitor->list);
+	destroy_monitor_dir(monitor);
 
 	mutex_unlock(&rv_interface_lock);
 	return 0;
diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h
index 873364094402..f039ec1c9156 100644
--- a/kernel/trace/rv/rv.h
+++ b/kernel/trace/rv/rv.h
@@ -32,34 +32,23 @@ struct rv_reactor_def {
 };
 #endif
 
-struct rv_monitor_def {
-	struct list_head	list;
-	struct rv_monitor	*monitor;
-	struct rv_monitor	*parent;
-	struct dentry		*root_d;
-#ifdef CONFIG_RV_REACTORS
-	struct rv_reactor_def	*rdef;
-	bool			reacting;
-#endif
-};
-
 struct dentry *get_monitors_root(void);
-int rv_disable_monitor(struct rv_monitor_def *mdef);
-int rv_enable_monitor(struct rv_monitor_def *mdef);
-bool rv_is_container_monitor(struct rv_monitor_def *mdef);
-bool rv_is_nested_monitor(struct rv_monitor_def *mdef);
+int rv_disable_monitor(struct rv_monitor *mon);
+int rv_enable_monitor(struct rv_monitor *mon);
+bool rv_is_container_monitor(struct rv_monitor *mon);
+bool rv_is_nested_monitor(struct rv_monitor *mon);
 
 #ifdef CONFIG_RV_REACTORS
-int reactor_populate_monitor(struct rv_monitor_def *mdef);
-void reactor_cleanup_monitor(struct rv_monitor_def *mdef);
+int reactor_populate_monitor(struct rv_monitor *mon);
+void reactor_cleanup_monitor(struct rv_monitor *mon);
 int init_rv_reactors(struct dentry *root_dir);
 #else
-static inline int reactor_populate_monitor(struct rv_monitor_def *mdef)
+static inline int reactor_populate_monitor(struct rv_monitor *mon)
 {
 	return 0;
 }
 
-static inline void reactor_cleanup_monitor(struct rv_monitor_def *mdef)
+static inline void reactor_cleanup_monitor(struct rv_monitor *mon)
 {
 	return;
 }
diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c
index 740603670dd1..7cc620a1be1a 100644
--- a/kernel/trace/rv/rv_reactors.c
+++ b/kernel/trace/rv/rv_reactors.c
@@ -138,10 +138,10 @@ static const struct file_operations available_reactors_ops = {
  */
 static int monitor_reactor_show(struct seq_file *m, void *p)
 {
-	struct rv_monitor_def *mdef = m->private;
+	struct rv_monitor *mon = m->private;
 	struct rv_reactor_def *rdef = p;
 
-	if (mdef->rdef == rdef)
+	if (mon->rdef == rdef)
 		seq_printf(m, "[%s]\n", rdef->reactor->name);
 	else
 		seq_printf(m, "%s\n", rdef->reactor->name);
@@ -158,41 +158,41 @@ static const struct seq_operations monitor_reactors_seq_ops = {
 	.show	= monitor_reactor_show
 };
 
-static void monitor_swap_reactors_single(struct rv_monitor_def *mdef,
+static void monitor_swap_reactors_single(struct rv_monitor *mon,
 					 struct rv_reactor_def *rdef,
 					 bool reacting, bool nested)
 {
 	bool monitor_enabled;
 
 	/* nothing to do */
-	if (mdef->rdef == rdef)
+	if (mon->rdef == rdef)
 		return;
 
-	monitor_enabled = mdef->monitor->enabled;
+	monitor_enabled = mon->enabled;
 	if (monitor_enabled)
-		rv_disable_monitor(mdef);
+		rv_disable_monitor(mon);
 
 	/* swap reactor's usage */
-	mdef->rdef->counter--;
+	mon->rdef->counter--;
 	rdef->counter++;
 
-	mdef->rdef = rdef;
-	mdef->reacting = reacting;
-	mdef->monitor->react = rdef->reactor->react;
+	mon->rdef = rdef;
+	mon->reacting = reacting;
+	mon->react = rdef->reactor->react;
 
 	/* enable only once if iterating through a container */
 	if (monitor_enabled && !nested)
-		rv_enable_monitor(mdef);
+		rv_enable_monitor(mon);
 }
 
-static void monitor_swap_reactors(struct rv_monitor_def *mdef,
+static void monitor_swap_reactors(struct rv_monitor *mon,
 				  struct rv_reactor_def *rdef, bool reacting)
 {
-	struct rv_monitor_def *p = mdef;
+	struct rv_monitor *p = mon;
 
-	if (rv_is_container_monitor(mdef))
+	if (rv_is_container_monitor(mon))
 		list_for_each_entry_continue(p, &rv_monitors_list, list) {
-			if (p->parent != mdef->monitor)
+			if (p->parent != mon)
 				break;
 			monitor_swap_reactors_single(p, rdef, reacting, true);
 		}
@@ -202,7 +202,7 @@ static void monitor_swap_reactors(struct rv_monitor_def *mdef,
 	 * All nested monitors are enabled also if they were off, we may refine
 	 * this logic in the future.
 	 */
-	monitor_swap_reactors_single(mdef, rdef, reacting, false);
+	monitor_swap_reactors_single(mon, rdef, reacting, false);
 }
 
 static ssize_t
@@ -210,7 +210,7 @@ monitor_reactors_write(struct file *file, const char __user *user_buf,
 		      size_t count, loff_t *ppos)
 {
 	char buff[MAX_RV_REACTOR_NAME_SIZE + 2];
-	struct rv_monitor_def *mdef;
+	struct rv_monitor *mon;
 	struct rv_reactor_def *rdef;
 	struct seq_file *seq_f;
 	int retval = -EINVAL;
@@ -237,7 +237,7 @@ monitor_reactors_write(struct file *file, const char __user *user_buf,
 	 * See monitor_reactors_open()
 	 */
 	seq_f = file->private_data;
-	mdef = seq_f->private;
+	mon = seq_f->private;
 
 	mutex_lock(&rv_interface_lock);
 
@@ -252,7 +252,7 @@ monitor_reactors_write(struct file *file, const char __user *user_buf,
 		else
 			enable = true;
 
-		monitor_swap_reactors(mdef, rdef, enable);
+		monitor_swap_reactors(mon, rdef, enable);
 
 		retval = count;
 		break;
@@ -268,7 +268,7 @@ monitor_reactors_write(struct file *file, const char __user *user_buf,
  */
 static int monitor_reactors_open(struct inode *inode, struct file *file)
 {
-	struct rv_monitor_def *mdef = inode->i_private;
+	struct rv_monitor *mon = inode->i_private;
 	struct seq_file *seq_f;
 	int ret;
 
@@ -284,7 +284,7 @@ static int monitor_reactors_open(struct inode *inode, struct file *file)
 	/*
 	 * Copy the create file "private" data to the seq_file private data.
 	 */
-	seq_f->private = mdef;
+	seq_f->private = mon;
 
 	return 0;
 };
@@ -454,37 +454,37 @@ static const struct file_operations reacting_on_fops = {
 
 /**
  * reactor_populate_monitor - creates per monitor reactors file
- * @mdef:	monitor's definition.
+ * @mon:	The monitor.
  *
  * Returns 0 if successful, error otherwise.
  */
-int reactor_populate_monitor(struct rv_monitor_def *mdef)
+int reactor_populate_monitor(struct rv_monitor *mon)
 {
 	struct dentry *tmp;
 
-	tmp = rv_create_file("reactors", RV_MODE_WRITE, mdef->root_d, mdef, &monitor_reactors_ops);
+	tmp = rv_create_file("reactors", RV_MODE_WRITE, mon->root_d, mon, &monitor_reactors_ops);
 	if (!tmp)
 		return -ENOMEM;
 
 	/*
 	 * Configure as the rv_nop reactor.
 	 */
-	mdef->rdef = get_reactor_rdef_by_name("nop");
-	mdef->rdef->counter++;
-	mdef->reacting = false;
+	mon->rdef = get_reactor_rdef_by_name("nop");
+	mon->rdef->counter++;
+	mon->reacting = false;
 
 	return 0;
 }
 
 /**
  * reactor_cleanup_monitor - cleanup a monitor reference
- * @mdef:       monitor's definition.
+ * @mon:       the monitor.
  */
-void reactor_cleanup_monitor(struct rv_monitor_def *mdef)
+void reactor_cleanup_monitor(struct rv_monitor *mon)
 {
 	lockdep_assert_held(&rv_interface_lock);
-	mdef->rdef->counter--;
-	WARN_ON_ONCE(mdef->rdef->counter < 0);
+	mon->rdef->counter--;
+	WARN_ON_ONCE(mon->rdef->counter < 0);
 }
 
 /*
-- 
cgit v1.2.3


From 3d3c376118b5f7ed7723c2b4fd7a0a1c1893d63e Mon Sep 17 00:00:00 2001
From: Nam Cao <namcao@linutronix.de>
Date: Thu, 24 Jul 2025 19:33:28 +0200
Subject: rv: Merge struct rv_reactor_def into struct rv_reactor

Each struct rv_reactor has a unique struct rv_reactor_def associated with
it. struct rv_reactor is statically allocated, while struct rv_reactor_def
is dynamically allocated.

This makes the code more complicated than it should be:

  - Lookup is required to get the associated rv_reactor_def from rv_reactor

  - Dynamic memory allocation is required for rv_reactor_def. This is
    harder to get right compared to static memory. For instance, there is
    an existing mistake: rv_unregister_reactor() does not free the memory
    allocated by rv_register_reactor(). This is fortunately not a real
    memory leak problem as rv_unregister_reactor() is never called.

Simplify and merge rv_reactor_def into rv_reactor.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://lore.kernel.org/71cb91c86cd40df5b8c492b788787f2a73c3eaa3.1753378331.git.namcao@linutronix.de
Reviewed-by: Gabriele Monaco <gmonaco@redhat.com>
Signed-off-by: Nam Cao <namcao@linutronix.de>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/rv.h            |  5 ++-
 kernel/trace/rv/rv.h          |  9 -----
 kernel/trace/rv/rv_reactors.c | 92 ++++++++++++++++++-------------------------
 3 files changed, 43 insertions(+), 63 deletions(-)

(limited to 'include')

diff --git a/include/linux/rv.h b/include/linux/rv.h
index dba53aecdfab..c22c9b8c1567 100644
--- a/include/linux/rv.h
+++ b/include/linux/rv.h
@@ -90,6 +90,9 @@ struct rv_reactor {
 	const char		*name;
 	const char		*description;
 	__printf(1, 2) void	(*react)(const char *msg, ...);
+	struct list_head	list;
+	/* protected by the monitor interface lock */
+	int			counter;
 };
 #endif
 
@@ -101,7 +104,7 @@ struct rv_monitor {
 	void			(*disable)(void);
 	void			(*reset)(void);
 #ifdef CONFIG_RV_REACTORS
-	struct rv_reactor_def	*rdef;
+	struct rv_reactor	*reactor;
 	__printf(1, 2) void	(*react)(const char *msg, ...);
 	bool			reacting;
 #endif
diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h
index f039ec1c9156..8c38f9dd41bc 100644
--- a/kernel/trace/rv/rv.h
+++ b/kernel/trace/rv/rv.h
@@ -23,15 +23,6 @@ struct rv_interface {
 extern struct mutex rv_interface_lock;
 extern struct list_head rv_monitors_list;
 
-#ifdef CONFIG_RV_REACTORS
-struct rv_reactor_def {
-	struct list_head	list;
-	struct rv_reactor	*reactor;
-	/* protected by the monitor interface lock */
-	int			counter;
-};
-#endif
-
 struct dentry *get_monitors_root(void);
 int rv_disable_monitor(struct rv_monitor *mon);
 int rv_enable_monitor(struct rv_monitor *mon);
diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c
index 7cc620a1be1a..2c7909e6d0e7 100644
--- a/kernel/trace/rv/rv_reactors.c
+++ b/kernel/trace/rv/rv_reactors.c
@@ -70,12 +70,12 @@
  */
 static LIST_HEAD(rv_reactors_list);
 
-static struct rv_reactor_def *get_reactor_rdef_by_name(char *name)
+static struct rv_reactor *get_reactor_rdef_by_name(char *name)
 {
-	struct rv_reactor_def *r;
+	struct rv_reactor *r;
 
 	list_for_each_entry(r, &rv_reactors_list, list) {
-		if (strcmp(name, r->reactor->name) == 0)
+		if (strcmp(name, r->name) == 0)
 			return r;
 	}
 	return NULL;
@@ -86,9 +86,9 @@ static struct rv_reactor_def *get_reactor_rdef_by_name(char *name)
  */
 static int reactors_show(struct seq_file *m, void *p)
 {
-	struct rv_reactor_def *rea_def = p;
+	struct rv_reactor *reactor = p;
 
-	seq_printf(m, "%s\n", rea_def->reactor->name);
+	seq_printf(m, "%s\n", reactor->name);
 	return 0;
 }
 
@@ -139,12 +139,12 @@ static const struct file_operations available_reactors_ops = {
 static int monitor_reactor_show(struct seq_file *m, void *p)
 {
 	struct rv_monitor *mon = m->private;
-	struct rv_reactor_def *rdef = p;
+	struct rv_reactor *reactor = p;
 
-	if (mon->rdef == rdef)
-		seq_printf(m, "[%s]\n", rdef->reactor->name);
+	if (mon->reactor == reactor)
+		seq_printf(m, "[%s]\n", reactor->name);
 	else
-		seq_printf(m, "%s\n", rdef->reactor->name);
+		seq_printf(m, "%s\n", reactor->name);
 	return 0;
 }
 
@@ -159,13 +159,13 @@ static const struct seq_operations monitor_reactors_seq_ops = {
 };
 
 static void monitor_swap_reactors_single(struct rv_monitor *mon,
-					 struct rv_reactor_def *rdef,
+					 struct rv_reactor *reactor,
 					 bool reacting, bool nested)
 {
 	bool monitor_enabled;
 
 	/* nothing to do */
-	if (mon->rdef == rdef)
+	if (mon->reactor == reactor)
 		return;
 
 	monitor_enabled = mon->enabled;
@@ -173,12 +173,12 @@ static void monitor_swap_reactors_single(struct rv_monitor *mon,
 		rv_disable_monitor(mon);
 
 	/* swap reactor's usage */
-	mon->rdef->counter--;
-	rdef->counter++;
+	mon->reactor->counter--;
+	reactor->counter++;
 
-	mon->rdef = rdef;
+	mon->reactor = reactor;
 	mon->reacting = reacting;
-	mon->react = rdef->reactor->react;
+	mon->react = reactor->react;
 
 	/* enable only once if iterating through a container */
 	if (monitor_enabled && !nested)
@@ -186,7 +186,7 @@ static void monitor_swap_reactors_single(struct rv_monitor *mon,
 }
 
 static void monitor_swap_reactors(struct rv_monitor *mon,
-				  struct rv_reactor_def *rdef, bool reacting)
+				  struct rv_reactor *reactor, bool reacting)
 {
 	struct rv_monitor *p = mon;
 
@@ -194,7 +194,7 @@ static void monitor_swap_reactors(struct rv_monitor *mon,
 		list_for_each_entry_continue(p, &rv_monitors_list, list) {
 			if (p->parent != mon)
 				break;
-			monitor_swap_reactors_single(p, rdef, reacting, true);
+			monitor_swap_reactors_single(p, reactor, reacting, true);
 		}
 	/*
 	 * This call enables and disables the monitor if they were active.
@@ -202,7 +202,7 @@ static void monitor_swap_reactors(struct rv_monitor *mon,
 	 * All nested monitors are enabled also if they were off, we may refine
 	 * this logic in the future.
 	 */
-	monitor_swap_reactors_single(mon, rdef, reacting, false);
+	monitor_swap_reactors_single(mon, reactor, reacting, false);
 }
 
 static ssize_t
@@ -211,7 +211,7 @@ monitor_reactors_write(struct file *file, const char __user *user_buf,
 {
 	char buff[MAX_RV_REACTOR_NAME_SIZE + 2];
 	struct rv_monitor *mon;
-	struct rv_reactor_def *rdef;
+	struct rv_reactor *reactor;
 	struct seq_file *seq_f;
 	int retval = -EINVAL;
 	bool enable;
@@ -243,16 +243,16 @@ monitor_reactors_write(struct file *file, const char __user *user_buf,
 
 	retval = -EINVAL;
 
-	list_for_each_entry(rdef, &rv_reactors_list, list) {
-		if (strcmp(ptr, rdef->reactor->name) != 0)
+	list_for_each_entry(reactor, &rv_reactors_list, list) {
+		if (strcmp(ptr, reactor->name) != 0)
 			continue;
 
-		if (rdef == get_reactor_rdef_by_name("nop"))
+		if (strcmp(reactor->name, "nop"))
 			enable = false;
 		else
 			enable = true;
 
-		monitor_swap_reactors(mon, rdef, enable);
+		monitor_swap_reactors(mon, reactor, enable);
 
 		retval = count;
 		break;
@@ -299,23 +299,16 @@ static const struct file_operations monitor_reactors_ops = {
 
 static int __rv_register_reactor(struct rv_reactor *reactor)
 {
-	struct rv_reactor_def *r;
+	struct rv_reactor *r;
 
 	list_for_each_entry(r, &rv_reactors_list, list) {
-		if (strcmp(reactor->name, r->reactor->name) == 0) {
+		if (strcmp(reactor->name, r->name) == 0) {
 			pr_info("Reactor %s is already registered\n", reactor->name);
 			return -EINVAL;
 		}
 	}
 
-	r = kzalloc(sizeof(struct rv_reactor_def), GFP_KERNEL);
-	if (!r)
-		return -ENOMEM;
-
-	r->reactor = reactor;
-	r->counter = 0;
-
-	list_add_tail(&r->list, &rv_reactors_list);
+	list_add_tail(&reactor->list, &rv_reactors_list);
 
 	return 0;
 }
@@ -350,26 +343,19 @@ int rv_register_reactor(struct rv_reactor *reactor)
  */
 int rv_unregister_reactor(struct rv_reactor *reactor)
 {
-	struct rv_reactor_def *ptr, *next;
 	int ret = 0;
 
 	mutex_lock(&rv_interface_lock);
 
-	list_for_each_entry_safe(ptr, next, &rv_reactors_list, list) {
-		if (strcmp(reactor->name, ptr->reactor->name) == 0) {
-
-			if (!ptr->counter) {
-				list_del(&ptr->list);
-			} else {
-				printk(KERN_WARNING
-				       "rv: the rv_reactor %s is in use by %d monitor(s)\n",
-				       ptr->reactor->name, ptr->counter);
-				printk(KERN_WARNING "rv: the rv_reactor %s cannot be removed\n",
-				       ptr->reactor->name);
-				ret = -EBUSY;
-				break;
-			}
-		}
+	if (!reactor->counter) {
+		list_del(&reactor->list);
+	} else {
+		printk(KERN_WARNING
+		       "rv: the rv_reactor %s is in use by %d monitor(s)\n",
+		       reactor->name, reactor->counter);
+		printk(KERN_WARNING "rv: the rv_reactor %s cannot be removed\n",
+		       reactor->name);
+		ret = -EBUSY;
 	}
 
 	mutex_unlock(&rv_interface_lock);
@@ -469,8 +455,8 @@ int reactor_populate_monitor(struct rv_monitor *mon)
 	/*
 	 * Configure as the rv_nop reactor.
 	 */
-	mon->rdef = get_reactor_rdef_by_name("nop");
-	mon->rdef->counter++;
+	mon->reactor = get_reactor_rdef_by_name("nop");
+	mon->reactor->counter++;
 	mon->reacting = false;
 
 	return 0;
@@ -483,8 +469,8 @@ int reactor_populate_monitor(struct rv_monitor *mon)
 void reactor_cleanup_monitor(struct rv_monitor *mon)
 {
 	lockdep_assert_held(&rv_interface_lock);
-	mon->rdef->counter--;
-	WARN_ON_ONCE(mon->rdef->counter < 0);
+	mon->reactor->counter--;
+	WARN_ON_ONCE(mon->reactor->counter < 0);
 }
 
 /*
-- 
cgit v1.2.3


From 3d3800b4f7f4b1472a0ec2cffd535c05603f8f60 Mon Sep 17 00:00:00 2001
From: Nam Cao <namcao@linutronix.de>
Date: Thu, 24 Jul 2025 19:33:29 +0200
Subject: rv: Remove rv_reactor's reference counter

rv_reactor has a reference counter to ensure it is not removed while
monitors are still using it.

However, this is futile, as __exit functions are not expected to fail and
will proceed normally despite rv_unregister_reactor() returning an error.

At the moment, reactors do not support being built as modules, therefore
they are never removed and the reference counters are not necessary.

If building RV reactors as modules is supported in the future, the kernel's
centralized module facilities such as try_module_get(), module_put() or
MODULE_SOFTDEP should be used instead of this custom implementation.

Remove this reference counter.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://lore.kernel.org/bb946398436a5e17fb0f5b842ef3313c02291852.1753378331.git.namcao@linutronix.de
Reviewed-by: Gabriele Monaco <gmonaco@redhat.com>
Signed-off-by: Nam Cao <namcao@linutronix.de>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/rv.h            |  2 --
 kernel/trace/rv/rv.c          |  1 -
 kernel/trace/rv/rv.h          |  6 ------
 kernel/trace/rv/rv_reactors.c | 33 ++-------------------------------
 4 files changed, 2 insertions(+), 40 deletions(-)

(limited to 'include')

diff --git a/include/linux/rv.h b/include/linux/rv.h
index c22c9b8c1567..2f867d6f72ba 100644
--- a/include/linux/rv.h
+++ b/include/linux/rv.h
@@ -91,8 +91,6 @@ struct rv_reactor {
 	const char		*description;
 	__printf(1, 2) void	(*react)(const char *msg, ...);
 	struct list_head	list;
-	/* protected by the monitor interface lock */
-	int			counter;
 };
 #endif
 
diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
index 6c0be2fdc52d..6c8498743b98 100644
--- a/kernel/trace/rv/rv.c
+++ b/kernel/trace/rv/rv.c
@@ -769,7 +769,6 @@ static const struct file_operations monitoring_on_fops = {
 
 static void destroy_monitor_dir(struct rv_monitor *mon)
 {
-	reactor_cleanup_monitor(mon);
 	rv_remove(mon->root_d);
 }
 
diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h
index 8c38f9dd41bc..1485a70c1bf4 100644
--- a/kernel/trace/rv/rv.h
+++ b/kernel/trace/rv/rv.h
@@ -31,7 +31,6 @@ bool rv_is_nested_monitor(struct rv_monitor *mon);
 
 #ifdef CONFIG_RV_REACTORS
 int reactor_populate_monitor(struct rv_monitor *mon);
-void reactor_cleanup_monitor(struct rv_monitor *mon);
 int init_rv_reactors(struct dentry *root_dir);
 #else
 static inline int reactor_populate_monitor(struct rv_monitor *mon)
@@ -39,11 +38,6 @@ static inline int reactor_populate_monitor(struct rv_monitor *mon)
 	return 0;
 }
 
-static inline void reactor_cleanup_monitor(struct rv_monitor *mon)
-{
-	return;
-}
-
 static inline int init_rv_reactors(struct dentry *root_dir)
 {
 	return 0;
diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c
index 2c7909e6d0e7..a8e849e6cd85 100644
--- a/kernel/trace/rv/rv_reactors.c
+++ b/kernel/trace/rv/rv_reactors.c
@@ -172,10 +172,6 @@ static void monitor_swap_reactors_single(struct rv_monitor *mon,
 	if (monitor_enabled)
 		rv_disable_monitor(mon);
 
-	/* swap reactor's usage */
-	mon->reactor->counter--;
-	reactor->counter++;
-
 	mon->reactor = reactor;
 	mon->reacting = reacting;
 	mon->react = reactor->react;
@@ -343,23 +339,10 @@ int rv_register_reactor(struct rv_reactor *reactor)
  */
 int rv_unregister_reactor(struct rv_reactor *reactor)
 {
-	int ret = 0;
-
 	mutex_lock(&rv_interface_lock);
-
-	if (!reactor->counter) {
-		list_del(&reactor->list);
-	} else {
-		printk(KERN_WARNING
-		       "rv: the rv_reactor %s is in use by %d monitor(s)\n",
-		       reactor->name, reactor->counter);
-		printk(KERN_WARNING "rv: the rv_reactor %s cannot be removed\n",
-		       reactor->name);
-		ret = -EBUSY;
-	}
-
+	list_del(&reactor->list);
 	mutex_unlock(&rv_interface_lock);
-	return ret;
+	return 0;
 }
 
 /*
@@ -456,23 +439,11 @@ int reactor_populate_monitor(struct rv_monitor *mon)
 	 * Configure as the rv_nop reactor.
 	 */
 	mon->reactor = get_reactor_rdef_by_name("nop");
-	mon->reactor->counter++;
 	mon->reacting = false;
 
 	return 0;
 }
 
-/**
- * reactor_cleanup_monitor - cleanup a monitor reference
- * @mon:       the monitor.
- */
-void reactor_cleanup_monitor(struct rv_monitor *mon)
-{
-	lockdep_assert_held(&rv_interface_lock);
-	mon->reactor->counter--;
-	WARN_ON_ONCE(mon->reactor->counter < 0);
-}
-
 /*
  * Nop reactor register
  */
-- 
cgit v1.2.3


From b8a7fba39cd49eab343bfe561d85bb5dc57541af Mon Sep 17 00:00:00 2001
From: Nam Cao <namcao@linutronix.de>
Date: Thu, 24 Jul 2025 19:33:30 +0200
Subject: rv: Remove struct rv_monitor::reacting

The field 'reacting' in struct rv_monitor is set but never used. Delete it.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://lore.kernel.org/a6c16f845d2f1a09c4d0934ab83f3cb14478a71d.1753378331.git.namcao@linutronix.de
Reviewed-by: Gabriele Monaco <gmonaco@redhat.com>
Signed-off-by: Nam Cao <namcao@linutronix.de>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/rv.h            |  1 -
 kernel/trace/rv/rv_reactors.c | 19 +++++--------------
 2 files changed, 5 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/rv.h b/include/linux/rv.h
index 2f867d6f72ba..80731242fe60 100644
--- a/include/linux/rv.h
+++ b/include/linux/rv.h
@@ -104,7 +104,6 @@ struct rv_monitor {
 #ifdef CONFIG_RV_REACTORS
 	struct rv_reactor	*reactor;
 	__printf(1, 2) void	(*react)(const char *msg, ...);
-	bool			reacting;
 #endif
 	struct list_head	list;
 	struct rv_monitor	*parent;
diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c
index a8e849e6cd85..106f2c4740f2 100644
--- a/kernel/trace/rv/rv_reactors.c
+++ b/kernel/trace/rv/rv_reactors.c
@@ -160,7 +160,7 @@ static const struct seq_operations monitor_reactors_seq_ops = {
 
 static void monitor_swap_reactors_single(struct rv_monitor *mon,
 					 struct rv_reactor *reactor,
-					 bool reacting, bool nested)
+					 bool nested)
 {
 	bool monitor_enabled;
 
@@ -173,7 +173,6 @@ static void monitor_swap_reactors_single(struct rv_monitor *mon,
 		rv_disable_monitor(mon);
 
 	mon->reactor = reactor;
-	mon->reacting = reacting;
 	mon->react = reactor->react;
 
 	/* enable only once if iterating through a container */
@@ -181,8 +180,7 @@ static void monitor_swap_reactors_single(struct rv_monitor *mon,
 		rv_enable_monitor(mon);
 }
 
-static void monitor_swap_reactors(struct rv_monitor *mon,
-				  struct rv_reactor *reactor, bool reacting)
+static void monitor_swap_reactors(struct rv_monitor *mon, struct rv_reactor *reactor)
 {
 	struct rv_monitor *p = mon;
 
@@ -190,7 +188,7 @@ static void monitor_swap_reactors(struct rv_monitor *mon,
 		list_for_each_entry_continue(p, &rv_monitors_list, list) {
 			if (p->parent != mon)
 				break;
-			monitor_swap_reactors_single(p, reactor, reacting, true);
+			monitor_swap_reactors_single(p, reactor, true);
 		}
 	/*
 	 * This call enables and disables the monitor if they were active.
@@ -198,7 +196,7 @@ static void monitor_swap_reactors(struct rv_monitor *mon,
 	 * All nested monitors are enabled also if they were off, we may refine
 	 * this logic in the future.
 	 */
-	monitor_swap_reactors_single(mon, reactor, reacting, false);
+	monitor_swap_reactors_single(mon, reactor, false);
 }
 
 static ssize_t
@@ -210,7 +208,6 @@ monitor_reactors_write(struct file *file, const char __user *user_buf,
 	struct rv_reactor *reactor;
 	struct seq_file *seq_f;
 	int retval = -EINVAL;
-	bool enable;
 	char *ptr;
 	int len;
 
@@ -243,12 +240,7 @@ monitor_reactors_write(struct file *file, const char __user *user_buf,
 		if (strcmp(ptr, reactor->name) != 0)
 			continue;
 
-		if (strcmp(reactor->name, "nop"))
-			enable = false;
-		else
-			enable = true;
-
-		monitor_swap_reactors(mon, reactor, enable);
+		monitor_swap_reactors(mon, reactor);
 
 		retval = count;
 		break;
@@ -439,7 +431,6 @@ int reactor_populate_monitor(struct rv_monitor *mon)
 	 * Configure as the rv_nop reactor.
 	 */
 	mon->reactor = get_reactor_rdef_by_name("nop");
-	mon->reacting = false;
 
 	return 0;
 }
-- 
cgit v1.2.3


From 4d18a0b98259c2fa62f04ce5f94a7ec6e840f220 Mon Sep 17 00:00:00 2001
From: Baokun Li <libaokun1@huawei.com>
Date: Mon, 14 Jul 2025 21:03:16 +0800
Subject: ext4: get rid of some obsolete EXT4_MB_HINT flags

Since nobody has used these EXT4_MB_HINT flags for ages,
let's remove them.

Signed-off-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Link: https://patch.msgid.link/20250714130327.1830534-7-libaokun1@huawei.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ext4.h              | 6 ------
 include/trace/events/ext4.h | 3 ---
 2 files changed, 9 deletions(-)

(limited to 'include')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b1fa677229ac..6dc6fabf0228 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -185,14 +185,8 @@ enum criteria {
 
 /* prefer goal again. length */
 #define EXT4_MB_HINT_MERGE		0x0001
-/* blocks already reserved */
-#define EXT4_MB_HINT_RESERVED		0x0002
-/* metadata is being allocated */
-#define EXT4_MB_HINT_METADATA		0x0004
 /* first blocks in the file */
 #define EXT4_MB_HINT_FIRST		0x0008
-/* search for the best chunk */
-#define EXT4_MB_HINT_BEST		0x0010
 /* data is being allocated */
 #define EXT4_MB_HINT_DATA		0x0020
 /* don't preallocate (for tails) */
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 845451077c41..53dd2cc28fc5 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -23,10 +23,7 @@ struct partial_cluster;
 
 #define show_mballoc_flags(flags) __print_flags(flags, "|",	\
 	{ EXT4_MB_HINT_MERGE,		"HINT_MERGE" },		\
-	{ EXT4_MB_HINT_RESERVED,	"HINT_RESV" },		\
-	{ EXT4_MB_HINT_METADATA,	"HINT_MDATA" },		\
 	{ EXT4_MB_HINT_FIRST,		"HINT_FIRST" },		\
-	{ EXT4_MB_HINT_BEST,		"HINT_BEST" },		\
 	{ EXT4_MB_HINT_DATA,		"HINT_DATA" },		\
 	{ EXT4_MB_HINT_NOPREALLOC,	"HINT_NOPREALLOC" },	\
 	{ EXT4_MB_HINT_GROUP_ALLOC,	"HINT_GRP_ALLOC" },	\
-- 
cgit v1.2.3


From e89a68046687fe9913ce3bfad82f7ccbb65687e0 Mon Sep 17 00:00:00 2001
From: Lance Yang <lance.yang@linux.dev>
Date: Mon, 26 May 2025 16:59:02 +0800
Subject: netfilter: load nf_log_syslog on enabling nf_conntrack_log_invalid

When no logger is registered, nf_conntrack_log_invalid fails to log invalid
packets, leaving users unaware of actual invalid traffic. Improve this by
loading nf_log_syslog, similar to how 'iptables -I FORWARD 1 -m conntrack
--ctstate INVALID -j LOG' triggers it.

Suggested-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Zi Li <zi.li@linux.dev>
Signed-off-by: Lance Yang <lance.yang@linux.dev>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_log.h          |  3 +++
 net/netfilter/nf_conntrack_standalone.c | 26 +++++++++++++++++++++++++-
 net/netfilter/nf_log.c                  | 26 ++++++++++++++++++++++++++
 3 files changed, 54 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h
index e55eedc84ed7..00506792a06d 100644
--- a/include/net/netfilter/nf_log.h
+++ b/include/net/netfilter/nf_log.h
@@ -59,6 +59,9 @@ extern int sysctl_nf_log_all_netns;
 int nf_log_register(u_int8_t pf, struct nf_logger *logger);
 void nf_log_unregister(struct nf_logger *logger);
 
+/* Check if any logger is registered for a given protocol family. */
+bool nf_log_is_registered(u_int8_t pf);
+
 int nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger);
 void nf_log_unset(struct net *net, const struct nf_logger *logger);
 
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 829f60496008..9b8b10a85233 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -14,6 +14,7 @@
 #include <linux/sysctl.h>
 #endif
 
+#include <net/netfilter/nf_log.h>
 #include <net/netfilter/nf_conntrack.h>
 #include <net/netfilter/nf_conntrack_core.h>
 #include <net/netfilter/nf_conntrack_l4proto.h>
@@ -555,6 +556,29 @@ nf_conntrack_hash_sysctl(const struct ctl_table *table, int write,
 	return ret;
 }
 
+static int
+nf_conntrack_log_invalid_sysctl(const struct ctl_table *table, int write,
+				void *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret, i;
+
+	ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos);
+	if (ret < 0 || !write)
+		return ret;
+
+	if (*(u8 *)table->data == 0)
+		return ret;
+
+	/* Load nf_log_syslog only if no logger is currently registered */
+	for (i = 0; i < NFPROTO_NUMPROTO; i++) {
+		if (nf_log_is_registered(i))
+			return ret;
+	}
+	request_module("%s", "nf_log_syslog");
+
+	return ret;
+}
+
 static struct ctl_table_header *nf_ct_netfilter_header;
 
 enum nf_ct_sysctl_index {
@@ -651,7 +675,7 @@ static struct ctl_table nf_ct_sysctl_table[] = {
 		.data		= &init_net.ct.sysctl_log_invalid,
 		.maxlen		= sizeof(u8),
 		.mode		= 0644,
-		.proc_handler	= proc_dou8vec_minmax,
+		.proc_handler	= nf_conntrack_log_invalid_sysctl,
 	},
 	[NF_SYSCTL_CT_EXPECT_MAX] = {
 		.procname	= "nf_conntrack_expect_max",
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 6dd0de33eebd..74cef8bf554c 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -125,6 +125,32 @@ void nf_log_unregister(struct nf_logger *logger)
 }
 EXPORT_SYMBOL(nf_log_unregister);
 
+/**
+ * nf_log_is_registered - Check if any logger is registered for a given
+ * protocol family.
+ *
+ * @pf: Protocol family
+ *
+ * Returns: true if at least one logger is active for @pf, false otherwise.
+ */
+bool nf_log_is_registered(u_int8_t pf)
+{
+	int i;
+
+	if (pf >= NFPROTO_NUMPROTO) {
+		WARN_ON_ONCE(1);
+		return false;
+	}
+
+	for (i = 0; i < NF_LOG_TYPE_MAX; i++) {
+		if (rcu_access_pointer(loggers[pf][i]))
+			return true;
+	}
+
+	return false;
+}
+EXPORT_SYMBOL(nf_log_is_registered);
+
 int nf_log_bind_pf(struct net *net, u_int8_t pf,
 		   const struct nf_logger *logger)
 {
-- 
cgit v1.2.3


From 031a712471943ce780a7fc56e35b68cf77243e1e Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Tue, 24 Jun 2025 09:44:32 +0800
Subject: netfilter: x_tables: Remove unused functions xt_{in|out}name()

Since commit 2173c519d5e9 ("audit: normalize NETFILTER_PKT")
these are unused, so they can be removed.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index f39f688d7285..77c778d84d4c 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -51,21 +51,11 @@ static inline struct net_device *xt_in(const struct xt_action_param *par)
 	return par->state->in;
 }
 
-static inline const char *xt_inname(const struct xt_action_param *par)
-{
-	return par->state->in->name;
-}
-
 static inline struct net_device *xt_out(const struct xt_action_param *par)
 {
 	return par->state->out;
 }
 
-static inline const char *xt_outname(const struct xt_action_param *par)
-{
-	return par->state->out->name;
-}
-
 static inline unsigned int xt_hooknum(const struct xt_action_param *par)
 {
 	return par->state->hook;
-- 
cgit v1.2.3


From bf6788742b8d6c73de441e088a71de7154f0d4aa Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Tue, 24 Jun 2025 09:48:18 +0800
Subject: netfilter: nf_tables: Remove unused nft_reduce_is_readonly()

Since commit 9e539c5b6d9c ("netfilter: nf_tables: disable expression
reduction infra") this is unused.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 5e49619ae49c..b092e57d3c75 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1939,11 +1939,6 @@ static inline u64 nft_net_tstamp(const struct net *net)
 #define __NFT_REDUCE_READONLY	1UL
 #define NFT_REDUCE_READONLY	(void *)__NFT_REDUCE_READONLY
 
-static inline bool nft_reduce_is_readonly(const struct nft_expr *expr)
-{
-	return expr->ops->reduce == NFT_REDUCE_READONLY;
-}
-
 void nft_reg_track_update(struct nft_regs_track *track,
 			  const struct nft_expr *expr, u8 dreg, u8 len);
 void nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg, u8 len);
-- 
cgit v1.2.3


From bc8c43adfdc57c8253884fc1853cb6679cd5953d Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Tue, 8 Jul 2025 15:04:02 +0200
Subject: netfilter: nfnetlink_hook: Dump flowtable info

Introduce NFNL_HOOK_TYPE_NFT_FLOWTABLE to distinguish flowtable hooks
from base chain ones. Nested attributes are shared with the old NFTABLES
hook info type since they fit, apart from their misleading name.

Old nftables in user space will ignore this new hook type and thus
continue to print flowtable hooks just like before, e.g.:

| family netdev {
| 	hook ingress device test0 {
| 		 0000000000 nf_flow_offload_ip_hook [nf_flow_table]
| 	}
| }

With this patch in place and support for the new hook info type, output
becomes more useful:

| family netdev {
| 	hook ingress device test0 {
| 		 0000000000 flowtable ip mytable myft [nf_flow_table]
| 	}
| }

Suggested-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Phil Sutter <phil@nwl.cc>
Reviewed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h                     |  1 +
 include/uapi/linux/netfilter/nfnetlink_hook.h |  2 ++
 net/netfilter/nf_tables_api.c                 | 24 +++++++++---------
 net/netfilter/nfnetlink_hook.c                | 35 +++++++++++++++++++++++++++
 4 files changed, 51 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 5f896fcc074d..efbbfa770d66 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -92,6 +92,7 @@ enum nf_hook_ops_type {
 	NF_HOOK_OP_UNDEFINED,
 	NF_HOOK_OP_NF_TABLES,
 	NF_HOOK_OP_BPF,
+	NF_HOOK_OP_NFT_FT,
 };
 
 struct nf_hook_ops {
diff --git a/include/uapi/linux/netfilter/nfnetlink_hook.h b/include/uapi/linux/netfilter/nfnetlink_hook.h
index 84a561a74b98..1a2c4d6424b5 100644
--- a/include/uapi/linux/netfilter/nfnetlink_hook.h
+++ b/include/uapi/linux/netfilter/nfnetlink_hook.h
@@ -61,10 +61,12 @@ enum nfnl_hook_chain_desc_attributes {
  *
  * @NFNL_HOOK_TYPE_NFTABLES: nf_tables base chain
  * @NFNL_HOOK_TYPE_BPF: bpf program
+ * @NFNL_HOOK_TYPE_NFT_FLOWTABLE: nf_tables flowtable
  */
 enum nfnl_hook_chaintype {
 	NFNL_HOOK_TYPE_NFTABLES = 0x1,
 	NFNL_HOOK_TYPE_BPF,
+	NFNL_HOOK_TYPE_NFT_FLOWTABLE,
 };
 
 /**
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 04795af6e586..13d0ed9d1895 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -8895,11 +8895,12 @@ static int nft_flowtable_parse_hook(const struct nft_ctx *ctx,
 
 	list_for_each_entry(hook, &flowtable_hook->list, list) {
 		list_for_each_entry(ops, &hook->ops_list, list) {
-			ops->pf		= NFPROTO_NETDEV;
-			ops->hooknum	= flowtable_hook->num;
-			ops->priority	= flowtable_hook->priority;
-			ops->priv	= &flowtable->data;
-			ops->hook	= flowtable->data.type->hook;
+			ops->pf			= NFPROTO_NETDEV;
+			ops->hooknum		= flowtable_hook->num;
+			ops->priority		= flowtable_hook->priority;
+			ops->priv		= &flowtable->data;
+			ops->hook		= flowtable->data.type->hook;
+			ops->hook_ops_type	= NF_HOOK_OP_NFT_FT;
 		}
 	}
 
@@ -9727,12 +9728,13 @@ static int nft_flowtable_event(unsigned long event, struct net_device *dev,
 			if (!ops)
 				return 1;
 
-			ops->pf		= NFPROTO_NETDEV;
-			ops->hooknum	= flowtable->hooknum;
-			ops->priority	= flowtable->data.priority;
-			ops->priv	= &flowtable->data;
-			ops->hook	= flowtable->data.type->hook;
-			ops->dev	= dev;
+			ops->pf			= NFPROTO_NETDEV;
+			ops->hooknum		= flowtable->hooknum;
+			ops->priority		= flowtable->data.priority;
+			ops->priv		= &flowtable->data;
+			ops->hook		= flowtable->data.type->hook;
+			ops->hook_ops_type	= NF_HOOK_OP_NFT_FT;
+			ops->dev		= dev;
 			if (nft_register_flowtable_ops(dev_net(dev),
 						       flowtable, ops)) {
 				kfree(ops);
diff --git a/net/netfilter/nfnetlink_hook.c b/net/netfilter/nfnetlink_hook.c
index cd4056527ede..92d869317cba 100644
--- a/net/netfilter/nfnetlink_hook.c
+++ b/net/netfilter/nfnetlink_hook.c
@@ -156,6 +156,38 @@ static int nfnl_hook_put_nft_chain_info(struct sk_buff *nlskb,
 	return 0;
 }
 
+static int nfnl_hook_put_nft_ft_info(struct sk_buff *nlskb,
+				     const struct nfnl_dump_hook_data *ctx,
+				     unsigned int seq,
+				     struct nf_flowtable *nf_ft)
+{
+	struct nft_flowtable *ft =
+		container_of(nf_ft, struct nft_flowtable, data);
+	struct net *net = sock_net(nlskb->sk);
+	struct nlattr *nest;
+	int ret = 0;
+
+	if (WARN_ON_ONCE(!nf_ft))
+		return 0;
+
+	if (!nft_is_active(net, ft))
+		return 0;
+
+	nest = nfnl_start_info_type(nlskb, NFNL_HOOK_TYPE_NFT_FLOWTABLE);
+	if (!nest)
+		return -EMSGSIZE;
+
+	ret = nfnl_hook_put_nft_info_desc(nlskb, ft->table->name,
+					  ft->name, ft->table->family);
+	if (ret) {
+		nla_nest_cancel(nlskb, nest);
+		return ret;
+	}
+
+	nla_nest_end(nlskb, nest);
+	return 0;
+}
+
 static int nfnl_hook_dump_one(struct sk_buff *nlskb,
 			      const struct nfnl_dump_hook_data *ctx,
 			      const struct nf_hook_ops *ops,
@@ -223,6 +255,9 @@ static int nfnl_hook_dump_one(struct sk_buff *nlskb,
 	case NF_HOOK_OP_BPF:
 		ret = nfnl_hook_put_bpf_prog_info(nlskb, ctx, seq, ops->priv);
 		break;
+	case NF_HOOK_OP_NFT_FT:
+		ret = nfnl_hook_put_nft_ft_info(nlskb, ctx, seq, ops->priv);
+		break;
 	case NF_HOOK_OP_UNDEFINED:
 		break;
 	default:
-- 
cgit v1.2.3


From 17a20e09f086f2c574ac87f3cf6e14c4377f65f6 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 9 Jul 2025 19:05:13 +0200
Subject: netfilter: nft_set: remove one argument from lookup and update
 functions

Return the extension pointer instead of passing it as a function
argument to be filled in by the callee.

As-is, whenever false is returned, the extension pointer is not used.

For all set types, when true is returned, the extension pointer was set
to the matching element.

Only exception: nft_set_bitmap doesn't support extensions.
Return a pointer to a static const empty element extension container.

return false -> return NULL
return true -> return the elements' extension pointer.

This saves one function argument.

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h      | 10 +++----
 include/net/netfilter/nf_tables_core.h | 47 ++++++++++++++++-------------
 net/netfilter/nft_dynset.c             |  5 ++--
 net/netfilter/nft_lookup.c             | 27 +++++++++--------
 net/netfilter/nft_objref.c             |  5 ++--
 net/netfilter/nft_set_bitmap.c         | 11 +++++--
 net/netfilter/nft_set_hash.c           | 54 +++++++++++++++-------------------
 net/netfilter/nft_set_pipapo.c         | 19 +++++++-----
 net/netfilter/nft_set_pipapo_avx2.c    | 25 +++++++++-------
 net/netfilter/nft_set_rbtree.c         | 40 ++++++++++++-------------
 10 files changed, 126 insertions(+), 117 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index b092e57d3c75..5b6725475906 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -459,19 +459,17 @@ struct nft_set_ext;
  *	control plane functions.
  */
 struct nft_set_ops {
-	bool				(*lookup)(const struct net *net,
+	const struct nft_set_ext *	(*lookup)(const struct net *net,
 						  const struct nft_set *set,
-						  const u32 *key,
-						  const struct nft_set_ext **ext);
-	bool				(*update)(struct nft_set *set,
+						  const u32 *key);
+	const struct nft_set_ext *	(*update)(struct nft_set *set,
 						  const u32 *key,
 						  struct nft_elem_priv *
 							(*new)(struct nft_set *,
 							       const struct nft_expr *,
 							       struct nft_regs *),
 						  const struct nft_expr *expr,
-						  struct nft_regs *regs,
-						  const struct nft_set_ext **ext);
+						  struct nft_regs *regs);
 	bool				(*delete)(const struct nft_set *set,
 						  const u32 *key);
 
diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
index 03b6165756fc..6a52fb97b844 100644
--- a/include/net/netfilter/nf_tables_core.h
+++ b/include/net/netfilter/nf_tables_core.h
@@ -94,34 +94,41 @@ extern const struct nft_set_type nft_set_pipapo_type;
 extern const struct nft_set_type nft_set_pipapo_avx2_type;
 
 #ifdef CONFIG_MITIGATION_RETPOLINE
-bool nft_rhash_lookup(const struct net *net, const struct nft_set *set,
-		      const u32 *key, const struct nft_set_ext **ext);
-bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
-		       const u32 *key, const struct nft_set_ext **ext);
-bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set,
-		       const u32 *key, const struct nft_set_ext **ext);
-bool nft_hash_lookup_fast(const struct net *net,
-			  const struct nft_set *set,
-			  const u32 *key, const struct nft_set_ext **ext);
-bool nft_hash_lookup(const struct net *net, const struct nft_set *set,
-		     const u32 *key, const struct nft_set_ext **ext);
-bool nft_set_do_lookup(const struct net *net, const struct nft_set *set,
-		       const u32 *key, const struct nft_set_ext **ext);
+const struct nft_set_ext *
+nft_rhash_lookup(const struct net *net, const struct nft_set *set,
+		 const u32 *key);
+const struct nft_set_ext *
+nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
+		  const u32 *key);
+const struct nft_set_ext *
+nft_bitmap_lookup(const struct net *net, const struct nft_set *set,
+		  const u32 *key);
+const struct nft_set_ext *
+nft_hash_lookup_fast(const struct net *net, const struct nft_set *set,
+		     const u32 *key);
+const struct nft_set_ext *
+nft_hash_lookup(const struct net *net, const struct nft_set *set,
+		const u32 *key);
+const struct nft_set_ext *
+nft_set_do_lookup(const struct net *net, const struct nft_set *set,
+		  const u32 *key);
 #else
-static inline bool
+static inline const struct nft_set_ext *
 nft_set_do_lookup(const struct net *net, const struct nft_set *set,
-		  const u32 *key, const struct nft_set_ext **ext)
+		  const u32 *key)
 {
-	return set->ops->lookup(net, set, key, ext);
+	return set->ops->lookup(net, set, key);
 }
 #endif
 
 /* called from nft_pipapo_avx2.c */
-bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set,
-		       const u32 *key, const struct nft_set_ext **ext);
+const struct nft_set_ext *
+nft_pipapo_lookup(const struct net *net, const struct nft_set *set,
+		  const u32 *key);
 /* called from nft_set_pipapo.c */
-bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
-			    const u32 *key, const struct nft_set_ext **ext);
+const struct nft_set_ext *
+nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
+			const u32 *key);
 
 void nft_counter_init_seqcount(void);
 
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 88922e0e8e83..e24493d9e776 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -91,8 +91,9 @@ void nft_dynset_eval(const struct nft_expr *expr,
 		return;
 	}
 
-	if (set->ops->update(set, &regs->data[priv->sreg_key], nft_dynset_new,
-			     expr, regs, &ext)) {
+	ext = set->ops->update(set, &regs->data[priv->sreg_key], nft_dynset_new,
+			     expr, regs);
+	if (ext) {
 		if (priv->op == NFT_DYNSET_OP_UPDATE &&
 		    nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) &&
 		    READ_ONCE(nft_set_ext_timeout(ext)->timeout) != 0) {
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index 63ef832b8aa7..40c602ffbcba 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -25,32 +25,33 @@ struct nft_lookup {
 };
 
 #ifdef CONFIG_MITIGATION_RETPOLINE
-bool nft_set_do_lookup(const struct net *net, const struct nft_set *set,
-		       const u32 *key, const struct nft_set_ext **ext)
+const struct nft_set_ext *
+nft_set_do_lookup(const struct net *net, const struct nft_set *set,
+		  const u32 *key)
 {
 	if (set->ops == &nft_set_hash_fast_type.ops)
-		return nft_hash_lookup_fast(net, set, key, ext);
+		return nft_hash_lookup_fast(net, set, key);
 	if (set->ops == &nft_set_hash_type.ops)
-		return nft_hash_lookup(net, set, key, ext);
+		return nft_hash_lookup(net, set, key);
 
 	if (set->ops == &nft_set_rhash_type.ops)
-		return nft_rhash_lookup(net, set, key, ext);
+		return nft_rhash_lookup(net, set, key);
 
 	if (set->ops == &nft_set_bitmap_type.ops)
-		return nft_bitmap_lookup(net, set, key, ext);
+		return nft_bitmap_lookup(net, set, key);
 
 	if (set->ops == &nft_set_pipapo_type.ops)
-		return nft_pipapo_lookup(net, set, key, ext);
+		return nft_pipapo_lookup(net, set, key);
 #if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
 	if (set->ops == &nft_set_pipapo_avx2_type.ops)
-		return nft_pipapo_avx2_lookup(net, set, key, ext);
+		return nft_pipapo_avx2_lookup(net, set, key);
 #endif
 
 	if (set->ops == &nft_set_rbtree_type.ops)
-		return nft_rbtree_lookup(net, set, key, ext);
+		return nft_rbtree_lookup(net, set, key);
 
 	WARN_ON_ONCE(1);
-	return set->ops->lookup(net, set, key, ext);
+	return set->ops->lookup(net, set, key);
 }
 EXPORT_SYMBOL_GPL(nft_set_do_lookup);
 #endif
@@ -61,12 +62,12 @@ void nft_lookup_eval(const struct nft_expr *expr,
 {
 	const struct nft_lookup *priv = nft_expr_priv(expr);
 	const struct nft_set *set = priv->set;
-	const struct nft_set_ext *ext = NULL;
 	const struct net *net = nft_net(pkt);
+	const struct nft_set_ext *ext;
 	bool found;
 
-	found =	nft_set_do_lookup(net, set, &regs->data[priv->sreg], &ext) ^
-				  priv->invert;
+	ext = nft_set_do_lookup(net, set, &regs->data[priv->sreg]);
+	found = !!ext ^ priv->invert;
 	if (!found) {
 		ext = nft_set_catchall_lookup(net, set);
 		if (!ext) {
diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c
index 09da7a3f9f96..8ee66a86c3bc 100644
--- a/net/netfilter/nft_objref.c
+++ b/net/netfilter/nft_objref.c
@@ -111,10 +111,9 @@ void nft_objref_map_eval(const struct nft_expr *expr,
 	struct net *net = nft_net(pkt);
 	const struct nft_set_ext *ext;
 	struct nft_object *obj;
-	bool found;
 
-	found = nft_set_do_lookup(net, set, &regs->data[priv->sreg], &ext);
-	if (!found) {
+	ext = nft_set_do_lookup(net, set, &regs->data[priv->sreg]);
+	if (!ext) {
 		ext = nft_set_catchall_lookup(net, set);
 		if (!ext) {
 			regs->verdict.code = NFT_BREAK;
diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c
index 12390d2e994f..c24c922f895d 100644
--- a/net/netfilter/nft_set_bitmap.c
+++ b/net/netfilter/nft_set_bitmap.c
@@ -75,16 +75,21 @@ nft_bitmap_active(const u8 *bitmap, u32 idx, u32 off, u8 genmask)
 }
 
 INDIRECT_CALLABLE_SCOPE
-bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set,
-		       const u32 *key, const struct nft_set_ext **ext)
+const struct nft_set_ext *
+nft_bitmap_lookup(const struct net *net, const struct nft_set *set,
+		  const u32 *key)
 {
 	const struct nft_bitmap *priv = nft_set_priv(set);
+	static const struct nft_set_ext found;
 	u8 genmask = nft_genmask_cur(net);
 	u32 idx, off;
 
 	nft_bitmap_location(set, key, &idx, &off);
 
-	return nft_bitmap_active(priv->bitmap, idx, off, genmask);
+	if (nft_bitmap_active(priv->bitmap, idx, off, genmask))
+		return &found;
+
+	return NULL;
 }
 
 static struct nft_bitmap_elem *
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index abb0c8ec6371..9903c737c9f0 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -81,8 +81,9 @@ static const struct rhashtable_params nft_rhash_params = {
 };
 
 INDIRECT_CALLABLE_SCOPE
-bool nft_rhash_lookup(const struct net *net, const struct nft_set *set,
-		      const u32 *key, const struct nft_set_ext **ext)
+const struct nft_set_ext *
+nft_rhash_lookup(const struct net *net, const struct nft_set *set,
+		 const u32 *key)
 {
 	struct nft_rhash *priv = nft_set_priv(set);
 	const struct nft_rhash_elem *he;
@@ -95,9 +96,9 @@ bool nft_rhash_lookup(const struct net *net, const struct nft_set *set,
 
 	he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params);
 	if (he != NULL)
-		*ext = &he->ext;
+		return &he->ext;
 
-	return !!he;
+	return NULL;
 }
 
 static struct nft_elem_priv *
@@ -120,14 +121,11 @@ nft_rhash_get(const struct net *net, const struct nft_set *set,
 	return ERR_PTR(-ENOENT);
 }
 
-static bool nft_rhash_update(struct nft_set *set, const u32 *key,
-			     struct nft_elem_priv *
-				   (*new)(struct nft_set *,
-					  const struct nft_expr *,
-					  struct nft_regs *regs),
-			     const struct nft_expr *expr,
-			     struct nft_regs *regs,
-			     const struct nft_set_ext **ext)
+static const struct nft_set_ext *
+nft_rhash_update(struct nft_set *set, const u32 *key,
+		 struct nft_elem_priv *(*new)(struct nft_set *, const struct nft_expr *,
+		 struct nft_regs *regs),
+		 const struct nft_expr *expr, struct nft_regs *regs)
 {
 	struct nft_rhash *priv = nft_set_priv(set);
 	struct nft_rhash_elem *he, *prev;
@@ -161,14 +159,13 @@ static bool nft_rhash_update(struct nft_set *set, const u32 *key,
 	}
 
 out:
-	*ext = &he->ext;
-	return true;
+	return &he->ext;
 
 err2:
 	nft_set_elem_destroy(set, &he->priv, true);
 	atomic_dec(&set->nelems);
 err1:
-	return false;
+	return NULL;
 }
 
 static int nft_rhash_insert(const struct net *net, const struct nft_set *set,
@@ -507,8 +504,9 @@ struct nft_hash_elem {
 };
 
 INDIRECT_CALLABLE_SCOPE
-bool nft_hash_lookup(const struct net *net, const struct nft_set *set,
-		     const u32 *key, const struct nft_set_ext **ext)
+const struct nft_set_ext *
+nft_hash_lookup(const struct net *net, const struct nft_set *set,
+		const u32 *key)
 {
 	struct nft_hash *priv = nft_set_priv(set);
 	u8 genmask = nft_genmask_cur(net);
@@ -519,12 +517,10 @@ bool nft_hash_lookup(const struct net *net, const struct nft_set *set,
 	hash = reciprocal_scale(hash, priv->buckets);
 	hlist_for_each_entry_rcu(he, &priv->table[hash], node) {
 		if (!memcmp(nft_set_ext_key(&he->ext), key, set->klen) &&
-		    nft_set_elem_active(&he->ext, genmask)) {
-			*ext = &he->ext;
-			return true;
-		}
+		    nft_set_elem_active(&he->ext, genmask))
+			return &he->ext;
 	}
-	return false;
+	return NULL;
 }
 
 static struct nft_elem_priv *
@@ -547,9 +543,9 @@ nft_hash_get(const struct net *net, const struct nft_set *set,
 }
 
 INDIRECT_CALLABLE_SCOPE
-bool nft_hash_lookup_fast(const struct net *net,
-			  const struct nft_set *set,
-			  const u32 *key, const struct nft_set_ext **ext)
+const struct nft_set_ext *
+nft_hash_lookup_fast(const struct net *net, const struct nft_set *set,
+		     const u32 *key)
 {
 	struct nft_hash *priv = nft_set_priv(set);
 	u8 genmask = nft_genmask_cur(net);
@@ -562,12 +558,10 @@ bool nft_hash_lookup_fast(const struct net *net,
 	hlist_for_each_entry_rcu(he, &priv->table[hash], node) {
 		k2 = *(u32 *)nft_set_ext_key(&he->ext)->data;
 		if (k1 == k2 &&
-		    nft_set_elem_active(&he->ext, genmask)) {
-			*ext = &he->ext;
-			return true;
-		}
+		    nft_set_elem_active(&he->ext, genmask))
+			return &he->ext;
 	}
-	return false;
+	return NULL;
 }
 
 static u32 nft_jhash(const struct nft_set *set, const struct nft_hash *priv,
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index 08fb6720673f..36a4de11995b 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -407,8 +407,9 @@ int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules,
  *
  * Return: true on match, false otherwise.
  */
-bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set,
-		       const u32 *key, const struct nft_set_ext **ext)
+const struct nft_set_ext *
+nft_pipapo_lookup(const struct net *net, const struct nft_set *set,
+		  const u32 *key)
 {
 	struct nft_pipapo *priv = nft_set_priv(set);
 	struct nft_pipapo_scratch *scratch;
@@ -465,13 +466,15 @@ next_match:
 			scratch->map_index = map_index;
 			local_bh_enable();
 
-			return false;
+			return NULL;
 		}
 
 		if (last) {
-			*ext = &f->mt[b].e->ext;
-			if (unlikely(nft_set_elem_expired(*ext) ||
-				     !nft_set_elem_active(*ext, genmask)))
+			const struct nft_set_ext *ext;
+
+			ext = &f->mt[b].e->ext;
+			if (unlikely(nft_set_elem_expired(ext) ||
+				     !nft_set_elem_active(ext, genmask)))
 				goto next_match;
 
 			/* Last field: we're just returning the key without
@@ -482,7 +485,7 @@ next_match:
 			scratch->map_index = map_index;
 			local_bh_enable();
 
-			return true;
+			return ext;
 		}
 
 		/* Swap bitmap indices: res_map is the initial bitmap for the
@@ -497,7 +500,7 @@ next_match:
 
 out:
 	local_bh_enable();
-	return false;
+	return NULL;
 }
 
 /**
diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c
index be7c16c79f71..6c441e2dc8af 100644
--- a/net/netfilter/nft_set_pipapo_avx2.c
+++ b/net/netfilter/nft_set_pipapo_avx2.c
@@ -1146,8 +1146,9 @@ static inline void pipapo_resmap_init_avx2(const struct nft_pipapo_match *m, uns
  *
  * Return: true on match, false otherwise.
  */
-bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
-			    const u32 *key, const struct nft_set_ext **ext)
+const struct nft_set_ext *
+nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
+		       const u32 *key)
 {
 	struct nft_pipapo *priv = nft_set_priv(set);
 	struct nft_pipapo_scratch *scratch;
@@ -1155,17 +1156,18 @@ bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
 	const struct nft_pipapo_match *m;
 	const struct nft_pipapo_field *f;
 	const u8 *rp = (const u8 *)key;
+	const struct nft_set_ext *ext;
 	unsigned long *res, *fill;
 	bool map_index;
-	int i, ret = 0;
+	int i;
 
 	local_bh_disable();
 
 	if (unlikely(!irq_fpu_usable())) {
-		bool fallback_res = nft_pipapo_lookup(net, set, key, ext);
+		ext = nft_pipapo_lookup(net, set, key);
 
 		local_bh_enable();
-		return fallback_res;
+		return ext;
 	}
 
 	m = rcu_dereference(priv->match);
@@ -1182,7 +1184,7 @@ bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
 	if (unlikely(!scratch)) {
 		kernel_fpu_end();
 		local_bh_enable();
-		return false;
+		return NULL;
 	}
 
 	map_index = scratch->map_index;
@@ -1197,6 +1199,7 @@ bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
 next_match:
 	nft_pipapo_for_each_field(f, i, m) {
 		bool last = i == m->field_count - 1, first = !i;
+		int ret = 0;
 
 #define NFT_SET_PIPAPO_AVX2_LOOKUP(b, n)				\
 		(ret = nft_pipapo_avx2_lookup_##b##b_##n(res, fill, f,	\
@@ -1244,10 +1247,10 @@ next_match:
 			goto out;
 
 		if (last) {
-			*ext = &f->mt[ret].e->ext;
-			if (unlikely(nft_set_elem_expired(*ext) ||
-				     !nft_set_elem_active(*ext, genmask))) {
-				ret = 0;
+			ext = &f->mt[ret].e->ext;
+			if (unlikely(nft_set_elem_expired(ext) ||
+				     !nft_set_elem_active(ext, genmask))) {
+				ext = NULL;
 				goto next_match;
 			}
 
@@ -1264,5 +1267,5 @@ out:
 	kernel_fpu_end();
 	local_bh_enable();
 
-	return ret >= 0;
+	return ext;
 }
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 2e8ef16ff191..938a257c069e 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -52,9 +52,9 @@ static bool nft_rbtree_elem_expired(const struct nft_rbtree_elem *rbe)
 	return nft_set_elem_expired(&rbe->ext);
 }
 
-static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
-				const u32 *key, const struct nft_set_ext **ext,
-				unsigned int seq)
+static const struct nft_set_ext *
+__nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
+		    const u32 *key, unsigned int seq)
 {
 	struct nft_rbtree *priv = nft_set_priv(set);
 	const struct nft_rbtree_elem *rbe, *interval = NULL;
@@ -65,7 +65,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
 	parent = rcu_dereference_raw(priv->root.rb_node);
 	while (parent != NULL) {
 		if (read_seqcount_retry(&priv->count, seq))
-			return false;
+			return NULL;
 
 		rbe = rb_entry(parent, struct nft_rbtree_elem, node);
 
@@ -87,50 +87,48 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
 			}
 
 			if (nft_rbtree_elem_expired(rbe))
-				return false;
+				return NULL;
 
 			if (nft_rbtree_interval_end(rbe)) {
 				if (nft_set_is_anonymous(set))
-					return false;
+					return NULL;
 				parent = rcu_dereference_raw(parent->rb_left);
 				interval = NULL;
 				continue;
 			}
 
-			*ext = &rbe->ext;
-			return true;
+			return &rbe->ext;
 		}
 	}
 
 	if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
 	    nft_set_elem_active(&interval->ext, genmask) &&
 	    !nft_rbtree_elem_expired(interval) &&
-	    nft_rbtree_interval_start(interval)) {
-		*ext = &interval->ext;
-		return true;
-	}
+	    nft_rbtree_interval_start(interval))
+		return &interval->ext;
 
-	return false;
+	return NULL;
 }
 
 INDIRECT_CALLABLE_SCOPE
-bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
-		       const u32 *key, const struct nft_set_ext **ext)
+const struct nft_set_ext *
+nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
+		  const u32 *key)
 {
 	struct nft_rbtree *priv = nft_set_priv(set);
 	unsigned int seq = read_seqcount_begin(&priv->count);
-	bool ret;
+	const struct nft_set_ext *ext;
 
-	ret = __nft_rbtree_lookup(net, set, key, ext, seq);
-	if (ret || !read_seqcount_retry(&priv->count, seq))
-		return ret;
+	ext = __nft_rbtree_lookup(net, set, key, seq);
+	if (ext || !read_seqcount_retry(&priv->count, seq))
+		return ext;
 
 	read_lock_bh(&priv->lock);
 	seq = read_seqcount_begin(&priv->count);
-	ret = __nft_rbtree_lookup(net, set, key, ext, seq);
+	ext = __nft_rbtree_lookup(net, set, key, seq);
 	read_unlock_bh(&priv->lock);
 
-	return ret;
+	return ext;
 }
 
 static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
-- 
cgit v1.2.3


From 531e61312104d991459af73c838396db26aa3550 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 9 Jul 2025 19:05:14 +0200
Subject: netfilter: nft_set: remove indirection from update API call

This stems from a time when sets and nft_dynset resided in different kernel
modules.  We can replace this with a direct call.

We could even remove both ->update and ->delete, given that they are only
supported by rhashtable, but on the off-chance we'll see runtime add/delete
for other types or a new set type, keep them as-is for now.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h      | 4 ----
 include/net/netfilter/nf_tables_core.h | 3 +++
 net/netfilter/nft_dynset.c             | 9 ++++-----
 net/netfilter/nft_set_hash.c           | 4 +---
 net/netfilter/nft_set_pipapo_avx2.c    | 1 -
 5 files changed, 8 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 5b6725475906..891e43a01bdc 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -464,10 +464,6 @@ struct nft_set_ops {
 						  const u32 *key);
 	const struct nft_set_ext *	(*update)(struct nft_set *set,
 						  const u32 *key,
-						  struct nft_elem_priv *
-							(*new)(struct nft_set *,
-							       const struct nft_expr *,
-							       struct nft_regs *),
 						  const struct nft_expr *expr,
 						  struct nft_regs *regs);
 	bool				(*delete)(const struct nft_set *set,
diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
index 6a52fb97b844..6c2f483d9828 100644
--- a/include/net/netfilter/nf_tables_core.h
+++ b/include/net/netfilter/nf_tables_core.h
@@ -188,4 +188,7 @@ void nft_objref_eval(const struct nft_expr *expr, struct nft_regs *regs,
 		     const struct nft_pktinfo *pkt);
 void nft_objref_map_eval(const struct nft_expr *expr, struct nft_regs *regs,
 			 const struct nft_pktinfo *pkt);
+struct nft_elem_priv *nft_dynset_new(struct nft_set *set,
+				     const struct nft_expr *expr,
+				     struct nft_regs *regs);
 #endif /* _NET_NF_TABLES_CORE_H */
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index e24493d9e776..7807d8129664 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -44,9 +44,9 @@ static int nft_dynset_expr_setup(const struct nft_dynset *priv,
 	return 0;
 }
 
-static struct nft_elem_priv *nft_dynset_new(struct nft_set *set,
-					    const struct nft_expr *expr,
-					    struct nft_regs *regs)
+struct nft_elem_priv *nft_dynset_new(struct nft_set *set,
+				     const struct nft_expr *expr,
+				     struct nft_regs *regs)
 {
 	const struct nft_dynset *priv = nft_expr_priv(expr);
 	struct nft_set_ext *ext;
@@ -91,8 +91,7 @@ void nft_dynset_eval(const struct nft_expr *expr,
 		return;
 	}
 
-	ext = set->ops->update(set, &regs->data[priv->sreg_key], nft_dynset_new,
-			     expr, regs);
+	ext = set->ops->update(set, &regs->data[priv->sreg_key], expr, regs);
 	if (ext) {
 		if (priv->op == NFT_DYNSET_OP_UPDATE &&
 		    nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) &&
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index 9903c737c9f0..266d0c637225 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -123,8 +123,6 @@ nft_rhash_get(const struct net *net, const struct nft_set *set,
 
 static const struct nft_set_ext *
 nft_rhash_update(struct nft_set *set, const u32 *key,
-		 struct nft_elem_priv *(*new)(struct nft_set *, const struct nft_expr *,
-		 struct nft_regs *regs),
 		 const struct nft_expr *expr, struct nft_regs *regs)
 {
 	struct nft_rhash *priv = nft_set_priv(set);
@@ -141,7 +139,7 @@ nft_rhash_update(struct nft_set *set, const u32 *key,
 	if (he != NULL)
 		goto out;
 
-	elem_priv = new(set, expr, regs);
+	elem_priv = nft_dynset_new(set, expr, regs);
 	if (!elem_priv)
 		goto err1;
 
diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c
index 6c441e2dc8af..db5d367e43c4 100644
--- a/net/netfilter/nft_set_pipapo_avx2.c
+++ b/net/netfilter/nft_set_pipapo_avx2.c
@@ -1137,7 +1137,6 @@ static inline void pipapo_resmap_init_avx2(const struct nft_pipapo_match *m, uns
  * @net:	Network namespace
  * @set:	nftables API set representation
  * @key:	nftables API element representation containing key data
- * @ext:	nftables API extension pointer, filled with matching reference
  *
  * For more details, see DOC: Theory of Operation in nft_set_pipapo.c.
  *
-- 
cgit v1.2.3


From 0d9cfc9b8cb17dbc29a98792d36ec39a1cf1395f Mon Sep 17 00:00:00 2001
From: John Ernberg <john.ernberg@actia.se>
Date: Wed, 23 Jul 2025 10:25:35 +0000
Subject: net: usbnet: Avoid potential RCU stall on LINK_CHANGE event

The Gemalto Cinterion PLS83-W modem (cdc_ether) is emitting confusing link
up and down events when the WWAN interface is activated on the modem-side.

Interrupt URBs will in consecutive polls grab:
* Link Connected
* Link Disconnected
* Link Connected

Where the last Connected is then a stable link state.

When the system is under load this may cause the unlink_urbs() work in
__handle_link_change() to not complete before the next usbnet_link_change()
call turns the carrier on again, allowing rx_submit() to queue new SKBs.

In that event the URB queue is filled faster than it can drain, ending up
in an RCU stall:

    rcu: INFO: rcu_sched detected expedited stalls on CPUs/tasks: { 0-.... } 33108 jiffies s: 201 root: 0x1/.
    rcu: blocking rcu_node structures (internal RCU debug):
    Sending NMI from CPU 1 to CPUs 0:
    NMI backtrace for cpu 0

    Call trace:
     arch_local_irq_enable+0x4/0x8
     local_bh_enable+0x18/0x20
     __netdev_alloc_skb+0x18c/0x1cc
     rx_submit+0x68/0x1f8 [usbnet]
     rx_alloc_submit+0x4c/0x74 [usbnet]
     usbnet_bh+0x1d8/0x218 [usbnet]
     usbnet_bh_tasklet+0x10/0x18 [usbnet]
     tasklet_action_common+0xa8/0x110
     tasklet_action+0x2c/0x34
     handle_softirqs+0x2cc/0x3a0
     __do_softirq+0x10/0x18
     ____do_softirq+0xc/0x14
     call_on_irq_stack+0x24/0x34
     do_softirq_own_stack+0x18/0x20
     __irq_exit_rcu+0xa8/0xb8
     irq_exit_rcu+0xc/0x30
     el1_interrupt+0x34/0x48
     el1h_64_irq_handler+0x14/0x1c
     el1h_64_irq+0x68/0x6c
     _raw_spin_unlock_irqrestore+0x38/0x48
     xhci_urb_dequeue+0x1ac/0x45c [xhci_hcd]
     unlink1+0xd4/0xdc [usbcore]
     usb_hcd_unlink_urb+0x70/0xb0 [usbcore]
     usb_unlink_urb+0x24/0x44 [usbcore]
     unlink_urbs.constprop.0.isra.0+0x64/0xa8 [usbnet]
     __handle_link_change+0x34/0x70 [usbnet]
     usbnet_deferred_kevent+0x1c0/0x320 [usbnet]
     process_scheduled_works+0x2d0/0x48c
     worker_thread+0x150/0x1dc
     kthread+0xd8/0xe8
     ret_from_fork+0x10/0x20

Get around the problem by deferring netif_carrier_on() to the scheduled work.

This needs a new flag to keep track of the necessary action.

The carrier ok check cannot be removed as it remains required for the
LINK_RESET event flow.

Fixes: 4b49f58fff00 ("usbnet: handle link change")
Cc: stable@vger.kernel.org
Signed-off-by: John Ernberg <john.ernberg@actia.se>
Link: https://patch.msgid.link/20250723102526.1305339-1-john.ernberg@actia.se
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/usb/usbnet.c   | 11 ++++++++---
 include/linux/usb/usbnet.h |  1 +
 2 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
index c04e715a4c2a..bc1d8631ffe0 100644
--- a/drivers/net/usb/usbnet.c
+++ b/drivers/net/usb/usbnet.c
@@ -1122,6 +1122,9 @@ static void __handle_link_change(struct usbnet *dev)
 		 * tx queue is stopped by netcore after link becomes off
 		 */
 	} else {
+		if (test_and_clear_bit(EVENT_LINK_CARRIER_ON, &dev->flags))
+			netif_carrier_on(dev->net);
+
 		/* submitting URBs for reading packets */
 		tasklet_schedule(&dev->bh);
 	}
@@ -2009,10 +2012,12 @@ EXPORT_SYMBOL(usbnet_manage_power);
 void usbnet_link_change(struct usbnet *dev, bool link, bool need_reset)
 {
 	/* update link after link is reseted */
-	if (link && !need_reset)
-		netif_carrier_on(dev->net);
-	else
+	if (link && !need_reset) {
+		set_bit(EVENT_LINK_CARRIER_ON, &dev->flags);
+	} else {
+		clear_bit(EVENT_LINK_CARRIER_ON, &dev->flags);
 		netif_carrier_off(dev->net);
+	}
 
 	if (need_reset && link)
 		usbnet_defer_kevent(dev, EVENT_LINK_RESET);
diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h
index 0b9f1e598e3a..4bc6bb01a0eb 100644
--- a/include/linux/usb/usbnet.h
+++ b/include/linux/usb/usbnet.h
@@ -76,6 +76,7 @@ struct usbnet {
 #		define EVENT_LINK_CHANGE	11
 #		define EVENT_SET_RX_MODE	12
 #		define EVENT_NO_IP_ALIGN	13
+#		define EVENT_LINK_CARRIER_ON	14
 /* This one is special, as it indicates that the device is going away
  * there are cyclic dependencies between tasklet, timer and bh
  * that must be broken
-- 
cgit v1.2.3


From f24987ef6959a7efaf79bffd265522c3df18d431 Mon Sep 17 00:00:00 2001
From: Gabriel Goller <g.goller@proxmox.com>
Date: Tue, 22 Jul 2025 10:18:45 +0200
Subject: ipv6: add `force_forwarding` sysctl to enable per-interface
 forwarding

It is currently impossible to enable ipv6 forwarding on a per-interface
basis like in ipv4. To enable forwarding on an ipv6 interface we need to
enable it on all interfaces and disable it on the other interfaces using
a netfilter rule. This is especially cumbersome if you have lots of
interfaces and only want to enable forwarding on a few. According to the
sysctl docs [0] the `net.ipv6.conf.all.forwarding` enables forwarding
for all interfaces, while the interface-specific
`net.ipv6.conf.<interface>.forwarding` configures the interface
Host/Router configuration.

Introduce a new sysctl flag `force_forwarding`, which can be set on every
interface. The ip6_forward() function will then check if the global
forwarding flag OR the force_forwarding flag is active and forward the
packet.

To preserve backwards-compatibility reset the flag (on all interfaces)
to 0 if the net.ipv6.conf.all.forwarding flag is set to 0.

Add a short selftest that checks if a packet gets forwarded with and
without `force_forwarding`.

[0]: https://www.kernel.org/doc/Documentation/networking/ip-sysctl.txt

Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: Gabriel Goller <g.goller@proxmox.com>
Link: https://patch.msgid.link/20250722081847.132632-1-g.goller@proxmox.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/ip-sysctl.rst             |   8 +-
 include/linux/ipv6.h                               |   1 +
 include/uapi/linux/ipv6.h                          |   1 +
 include/uapi/linux/netconf.h                       |   1 +
 include/uapi/linux/sysctl.h                        |   1 +
 net/ipv6/addrconf.c                                |  82 ++++++++++++++++
 net/ipv6/ip6_output.c                              |   3 +-
 tools/testing/selftests/net/Makefile               |   1 +
 .../testing/selftests/net/ipv6_force_forwarding.sh | 105 +++++++++++++++++++++
 9 files changed, 200 insertions(+), 3 deletions(-)
 create mode 100755 tools/testing/selftests/net/ipv6_force_forwarding.sh

(limited to 'include')

diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index 14700ea77e75..bb620f554598 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -2543,8 +2543,8 @@ conf/all/disable_ipv6 - BOOLEAN
 conf/all/forwarding - BOOLEAN
 	Enable global IPv6 forwarding between all interfaces.
 
-	IPv4 and IPv6 work differently here; e.g. netfilter must be used
-	to control which interfaces may forward packets and which not.
+	IPv4 and IPv6 work differently here; the ``force_forwarding`` flag must
+	be used to control which interfaces may forward packets.
 
 	This also sets all interfaces' Host/Router setting
 	'forwarding' to the specified value.  See below for details.
@@ -2561,6 +2561,10 @@ proxy_ndp - BOOLEAN
 
 	Default: 0 (disabled)
 
+force_forwarding - BOOLEAN
+	Enable forwarding on this interface only -- regardless of the setting on
+	``conf/all/forwarding``. When setting ``conf.all.forwarding`` to 0,
+	the ``force_forwarding`` flag will be reset on all interfaces.
 
 fwmark_reflect - BOOLEAN
 	Controls the fwmark of kernel-generated IPv6 reply packets that are not
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index db0eb0d86b64..bc6ec2959173 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -17,6 +17,7 @@ struct ipv6_devconf {
 	__s32		hop_limit;
 	__s32		mtu6;
 	__s32		forwarding;
+	__s32		force_forwarding;
 	__s32		disable_policy;
 	__s32		proxy_ndp;
 	__cacheline_group_end(ipv6_devconf_read_txrx);
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index cf592d7b630f..d4d3ae774b26 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -199,6 +199,7 @@ enum {
 	DEVCONF_NDISC_EVICT_NOCARRIER,
 	DEVCONF_ACCEPT_UNTRACKED_NA,
 	DEVCONF_ACCEPT_RA_MIN_LFT,
+	DEVCONF_FORCE_FORWARDING,
 	DEVCONF_MAX
 };
 
diff --git a/include/uapi/linux/netconf.h b/include/uapi/linux/netconf.h
index fac4edd55379..1c8c84d65ae3 100644
--- a/include/uapi/linux/netconf.h
+++ b/include/uapi/linux/netconf.h
@@ -19,6 +19,7 @@ enum {
 	NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
 	NETCONFA_INPUT,
 	NETCONFA_BC_FORWARDING,
+	NETCONFA_FORCE_FORWARDING,
 	__NETCONFA_MAX
 };
 #define NETCONFA_MAX	(__NETCONFA_MAX - 1)
diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h
index 8981f00204db..63d1464cb71c 100644
--- a/include/uapi/linux/sysctl.h
+++ b/include/uapi/linux/sysctl.h
@@ -573,6 +573,7 @@ enum {
 	NET_IPV6_ACCEPT_RA_FROM_LOCAL=26,
 	NET_IPV6_ACCEPT_RA_RT_INFO_MIN_PLEN=27,
 	NET_IPV6_RA_DEFRTR_METRIC=28,
+	NET_IPV6_FORCE_FORWARDING=29,
 	__NET_IPV6_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 4f1d7d110302..81a067a2e526 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -239,6 +239,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
 	.ndisc_evict_nocarrier	= 1,
 	.ra_honor_pio_life	= 0,
 	.ra_honor_pio_pflag	= 0,
+	.force_forwarding	= 0,
 };
 
 static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -303,6 +304,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
 	.ndisc_evict_nocarrier	= 1,
 	.ra_honor_pio_life	= 0,
 	.ra_honor_pio_pflag	= 0,
+	.force_forwarding	= 0,
 };
 
 /* Check if link is ready: is it up and is a valid qdisc available */
@@ -857,6 +859,9 @@ static void addrconf_forward_change(struct net *net, __s32 newf)
 		idev = __in6_dev_get_rtnl_net(dev);
 		if (idev) {
 			int changed = (!idev->cnf.forwarding) ^ (!newf);
+			/* Disabling all.forwarding sets 0 to force_forwarding for all interfaces */
+			if (newf == 0)
+				WRITE_ONCE(idev->cnf.force_forwarding, 0);
 
 			WRITE_ONCE(idev->cnf.forwarding, newf);
 			if (changed)
@@ -5710,6 +5715,7 @@ static void ipv6_store_devconf(const struct ipv6_devconf *cnf,
 	array[DEVCONF_ACCEPT_UNTRACKED_NA] =
 		READ_ONCE(cnf->accept_untracked_na);
 	array[DEVCONF_ACCEPT_RA_MIN_LFT] = READ_ONCE(cnf->accept_ra_min_lft);
+	array[DEVCONF_FORCE_FORWARDING] = READ_ONCE(cnf->force_forwarding);
 }
 
 static inline size_t inet6_ifla6_size(void)
@@ -6738,6 +6744,75 @@ static int addrconf_sysctl_disable_policy(const struct ctl_table *ctl, int write
 	return ret;
 }
 
+static void addrconf_force_forward_change(struct net *net, __s32 newf)
+{
+	struct net_device *dev;
+	struct inet6_dev *idev;
+
+	for_each_netdev(net, dev) {
+		idev = __in6_dev_get_rtnl_net(dev);
+		if (idev) {
+			int changed = (!idev->cnf.force_forwarding) ^ (!newf);
+
+			WRITE_ONCE(idev->cnf.force_forwarding, newf);
+			if (changed)
+				inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF,
+							     NETCONFA_FORCE_FORWARDING,
+							     dev->ifindex, &idev->cnf);
+		}
+	}
+}
+
+static int addrconf_sysctl_force_forwarding(const struct ctl_table *ctl, int write,
+					    void *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct inet6_dev *idev = ctl->extra1;
+	struct ctl_table tmp_ctl = *ctl;
+	struct net *net = ctl->extra2;
+	int *valp = ctl->data;
+	int new_val = *valp;
+	int old_val = *valp;
+	loff_t pos = *ppos;
+	int ret;
+
+	tmp_ctl.extra1 = SYSCTL_ZERO;
+	tmp_ctl.extra2 = SYSCTL_ONE;
+	tmp_ctl.data = &new_val;
+
+	ret = proc_douintvec_minmax(&tmp_ctl, write, buffer, lenp, ppos);
+
+	if (write && old_val != new_val) {
+		if (!rtnl_net_trylock(net))
+			return restart_syscall();
+
+		WRITE_ONCE(*valp, new_val);
+
+		if (valp == &net->ipv6.devconf_dflt->force_forwarding) {
+			inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
+						     NETCONFA_FORCE_FORWARDING,
+						     NETCONFA_IFINDEX_DEFAULT,
+						     net->ipv6.devconf_dflt);
+		} else if (valp == &net->ipv6.devconf_all->force_forwarding) {
+			inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
+						     NETCONFA_FORCE_FORWARDING,
+						     NETCONFA_IFINDEX_ALL,
+						     net->ipv6.devconf_all);
+
+			addrconf_force_forward_change(net, new_val);
+		} else {
+			inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
+						     NETCONFA_FORCE_FORWARDING,
+						     idev->dev->ifindex,
+						     &idev->cnf);
+		}
+		rtnl_net_unlock(net);
+	}
+
+	if (ret)
+		*ppos = pos;
+	return ret;
+}
+
 static int minus_one = -1;
 static const int two_five_five = 255;
 static u32 ioam6_if_id_max = U16_MAX;
@@ -7208,6 +7283,13 @@ static const struct ctl_table addrconf_sysctl[] = {
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= SYSCTL_TWO,
 	},
+	{
+		.procname	= "force_forwarding",
+		.data		= &ipv6_devconf.force_forwarding,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= addrconf_sysctl_force_forwarding,
+	},
 };
 
 static int __addrconf_sysctl_register(struct net *net, char *dev_name,
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 0412f8544695..1e1410237b6e 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -511,7 +511,8 @@ int ip6_forward(struct sk_buff *skb)
 	u32 mtu;
 
 	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
-	if (READ_ONCE(net->ipv6.devconf_all->forwarding) == 0)
+	if (!READ_ONCE(net->ipv6.devconf_all->forwarding) &&
+	    (!idev || !READ_ONCE(idev->cnf.force_forwarding)))
 		goto error;
 
 	if (skb->pkt_type != PACKET_HOST)
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 13e2678d418b..b31a71f2b372 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -116,6 +116,7 @@ TEST_GEN_FILES += skf_net_off
 TEST_GEN_FILES += tfo
 TEST_PROGS += tfo_passive.sh
 TEST_PROGS += broadcast_pmtu.sh
+TEST_PROGS += ipv6_force_forwarding.sh
 
 # YNL files, must be before "include ..lib.mk"
 YNL_GEN_FILES := busy_poller netlink-dumps
diff --git a/tools/testing/selftests/net/ipv6_force_forwarding.sh b/tools/testing/selftests/net/ipv6_force_forwarding.sh
new file mode 100755
index 000000000000..bf0243366caa
--- /dev/null
+++ b/tools/testing/selftests/net/ipv6_force_forwarding.sh
@@ -0,0 +1,105 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test IPv6 force_forwarding interface property
+#
+# This test verifies that the force_forwarding property works correctly:
+# - When global forwarding is disabled, packets are not forwarded normally
+# - When force_forwarding is enabled on an interface, packets are forwarded
+#   regardless of the global forwarding setting
+
+source lib.sh
+
+cleanup() {
+    cleanup_ns $ns1 $ns2 $ns3
+}
+
+trap cleanup EXIT
+
+setup_test() {
+    # Create three namespaces: sender, router, receiver
+    setup_ns ns1 ns2 ns3
+
+    # Create veth pairs: ns1 <-> ns2 <-> ns3
+    ip link add name veth12 type veth peer name veth21
+    ip link add name veth23 type veth peer name veth32
+
+    # Move interfaces to namespaces
+    ip link set veth12 netns $ns1
+    ip link set veth21 netns $ns2
+    ip link set veth23 netns $ns2
+    ip link set veth32 netns $ns3
+
+    # Configure interfaces
+    ip -n $ns1 addr add 2001:db8:1::1/64 dev veth12 nodad
+    ip -n $ns2 addr add 2001:db8:1::2/64 dev veth21 nodad
+    ip -n $ns2 addr add 2001:db8:2::1/64 dev veth23 nodad
+    ip -n $ns3 addr add 2001:db8:2::2/64 dev veth32 nodad
+
+    # Bring up interfaces
+    ip -n $ns1 link set veth12 up
+    ip -n $ns2 link set veth21 up
+    ip -n $ns2 link set veth23 up
+    ip -n $ns3 link set veth32 up
+
+    # Add routes
+    ip -n $ns1 route add 2001:db8:2::/64 via 2001:db8:1::2
+    ip -n $ns3 route add 2001:db8:1::/64 via 2001:db8:2::1
+
+    # Disable global forwarding
+    ip netns exec $ns2 sysctl -qw net.ipv6.conf.all.forwarding=0
+}
+
+test_force_forwarding() {
+    local ret=0
+
+    echo "TEST: force_forwarding functionality"
+
+    # Check if force_forwarding sysctl exists
+    if ! ip netns exec $ns2 test -f /proc/sys/net/ipv6/conf/veth21/force_forwarding; then
+        echo "SKIP: force_forwarding not available"
+        return $ksft_skip
+    fi
+
+    # Test 1: Without force_forwarding, ping should fail
+    ip netns exec $ns2 sysctl -qw net.ipv6.conf.veth21.force_forwarding=0
+    ip netns exec $ns2 sysctl -qw net.ipv6.conf.veth23.force_forwarding=0
+
+    if ip netns exec $ns1 ping -6 -c 1 -W 2 2001:db8:2::2 &>/dev/null; then
+        echo "FAIL: ping succeeded when forwarding disabled"
+        ret=1
+    else
+        echo "PASS: forwarding disabled correctly"
+    fi
+
+    # Test 2: With force_forwarding enabled, ping should succeed
+    ip netns exec $ns2 sysctl -qw net.ipv6.conf.veth21.force_forwarding=1
+    ip netns exec $ns2 sysctl -qw net.ipv6.conf.veth23.force_forwarding=1
+
+    if ip netns exec $ns1 ping -6 -c 1 -W 2 2001:db8:2::2 &>/dev/null; then
+        echo "PASS: force_forwarding enabled forwarding"
+    else
+        echo "FAIL: ping failed with force_forwarding enabled"
+        ret=1
+    fi
+
+    return $ret
+}
+
+echo "IPv6 force_forwarding test"
+echo "=========================="
+
+setup_test
+test_force_forwarding
+ret=$?
+
+if [ $ret -eq 0 ]; then
+    echo "OK"
+    exit 0
+elif [ $ret -eq $ksft_skip ]; then
+    echo "SKIP"
+    exit $ksft_skip
+else
+    echo "FAIL"
+    exit 1
+fi
-- 
cgit v1.2.3


From 33360f2508e07b07bb926ea75f11744dcc1cde07 Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Wed, 23 Jul 2025 10:20:29 -0700
Subject: netpoll: Remove unused fields from inet_addr union

Clean up the inet_addr union by removing the unused fields all[4], ip6[4]
and in, which are redundant with the remaining ip and in6 members.

This simplifies the union structure while maintaining all necessary
functionality for both IPv4 and IPv6 address handling.

Signed-off-by: Breno Leitao <leitao@debian.org>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://patch.msgid.link/20250723-netconsole_ref-v3-1-8be9b24e4a99@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netpoll.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index 735e65c3cc11..b5ea9882eda8 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -15,10 +15,7 @@
 #include <linux/refcount.h>
 
 union inet_addr {
-	__u32		all[4];
 	__be32		ip;
-	__be32		ip6[4];
-	struct in_addr	in;
 	struct in6_addr	in6;
 };
 
-- 
cgit v1.2.3


From c2dbaf0af05193fa69f267c37d1f825c1c3a59ab Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 12 Jun 2025 10:05:52 -0400
Subject: tracing: sched: Hide numa events under CONFIG_NUMA_BALANCING

The events sched_move_numa, sched_stick_numa and sched_swap_numa are only
called when CONFIG_NUMA_BALANCING is configured. As each event can take up
to 5K of memory in text and meta data regardless of whether it is used or
not, they should not be defined when unused.

Move the #ifdef CONFIG_NUMA_BALANCING to hide these events as well.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/20250612100552.39672cf9@batman.local.home
Reviewed-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/trace/events/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 4e6b2910cec3..0243f32e068a 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -628,6 +628,7 @@ TRACE_EVENT(sched_process_hang,
 );
 #endif /* CONFIG_DETECT_HUNG_TASK */
 
+#ifdef CONFIG_NUMA_BALANCING
 /*
  * Tracks migration of tasks from one runqueue to another. Can be used to
  * detect if automatic NUMA balancing is bouncing between nodes.
@@ -720,7 +721,6 @@ DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa,
 	TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu)
 );
 
-#ifdef CONFIG_NUMA_BALANCING
 #define NUMAB_SKIP_REASON					\
 	EM( NUMAB_SKIP_UNSUITABLE,		"unsuitable" )	\
 	EM( NUMAB_SKIP_SHARED_RO,		"shared_ro" )	\
-- 
cgit v1.2.3


From 463deed51796fd0995d08d8b6aa793d7ab5a2059 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@kernel.org>
Date: Tue, 22 Jul 2025 10:18:31 -0700
Subject: ipv6: Add sockaddr_inet unified address structure

There are cases in networking (e.g. wireguard, sctp) where a union is
used to provide coverage for either IPv4 or IPv6 network addresses,
and they include an embedded "struct sockaddr" as well (for "sa_family"
and raw "sa_data" access). The current struct sockaddr contains a
flexible array, which means these unions should not be further embedded
in other structs because they do not technically have a fixed size (and
are generating warnings for the coming -Wflexible-array-not-at-end flag
addition). But the future changes to make struct sockaddr a fixed size
(i.e. with a 14 byte sa_data member) make the "sa_data" uses with an IPv6
address a potential place for the compiler to get upset about object size
mismatches. Therefore, we need a sockaddr that cleanly provides both an
sa_family member and an appropriately fixed-sized sa_data member that does
not bloat member usage via the potential alternative of sockaddr_storage
to cover both IPv4 and IPv6, to avoid unseemly churn in the affected code
bases.

Introduce sockaddr_inet as a unified structure for holding both IPv4 and
IPv6 addresses (i.e. large enough to accommodate sockaddr_in6).

The structure is defined in linux/in6.h since its size is based on
sockaddr_in6, and it provides a more specific alternative to the generic
sockaddr_storage for code that handles both IPv4 and IPv6 address families.

The "sa_family" member doesn't use the sa_family_t type to avoid needing
layer violating header inclusions.

Signed-off-by: Kees Cook <kees@kernel.org>
Link: https://patch.msgid.link/20250722171836.1078436-1-kees@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/in6.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/linux/in6.h b/include/linux/in6.h
index 0777a21cbf86..403f926d33d8 100644
--- a/include/linux/in6.h
+++ b/include/linux/in6.h
@@ -18,6 +18,13 @@
 
 #include <uapi/linux/in6.h>
 
+/* Large enough to hold both sockaddr_in and sockaddr_in6. */
+struct sockaddr_inet {
+	unsigned short	sa_family;
+	char		sa_data[sizeof(struct sockaddr_in6) -
+				sizeof(unsigned short)];
+};
+
 /* IPv6 Wildcard Address (::) and Loopback Address (::1) defined in RFC2553
  * NOTE: Be aware the IN6ADDR_* constants and in6addr_* externals are defined
  * in network byte order, not in host byte order as are the IPv4 equivalents
-- 
cgit v1.2.3


From 511d10b4c2f91fb6aa676006b2bdff4df5d6e270 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@kernel.org>
Date: Tue, 22 Jul 2025 10:18:33 -0700
Subject: sctp: Replace sockaddr with sockaddr_inet in sctp_addr union

As part of the removal of the variably-sized sockaddr for kernel
internals, replace struct sockaddr with sockaddr_inet in the sctp_addr
union.

No binary changes; the union size remains unchanged due to sockaddr_inet
matching the size of sockaddr_in6.

Signed-off-by: Kees Cook <kees@kernel.org>
Link: https://patch.msgid.link/20250722171836.1078436-3-kees@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sctp/structs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 1ad7ce71d0a7..8a540ad9b509 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -51,9 +51,9 @@
  * We should wean ourselves off this.
  */
 union sctp_addr {
+	struct sockaddr_inet sa;	/* Large enough for both address families */
 	struct sockaddr_in v4;
 	struct sockaddr_in6 v6;
-	struct sockaddr sa;
 };
 
 /* Forward declarations for data structures. */
-- 
cgit v1.2.3


From 84c47bfc5b3b40b50b798b6b6c15e8a1442d936d Mon Sep 17 00:00:00 2001
From: Tristram Ha <tristram.ha@microchip.com>
Date: Thu, 24 Jul 2025 17:17:49 -0700
Subject: net: dsa: microchip: Add KSZ8463 switch support to KSZ DSA driver

KSZ8463 switch is a 3-port switch based on KSZ8863.  Its major
difference from other KSZ SPI switches is that its register access is not a
simple continual 8-bit transfer with automatic address increase but uses
a byte-enable mechanism specifying 8-bit, 16-bit, or 32-bit access.  Its
registers are also defined in 16-bit format because it shares a design
with a MAC controller using 16-bit access.  As a result some common
register accesses need to be re-arranged.

This patch adds the basic structure for using KSZ8463.  It cannot use the
same regmap table as other KSZ switches because it interprets the 16-bit
value as little-endian and its SPI commands are different.

KSZ8463 uses a byte-enable mechanism to specify 8-bit, 16-bit, and 32-bit
access.  The register is first shifted right by 2 then left by 4.  Extra
4 bits are added.  If the access is 8-bit one of the 4 bits is set.  If
the access is 16-bit two of the 4 bits are set.  If the access is 32-bit
all 4 bits are set.  The SPI command for read or write is then added.

Because of this register transformation separate SPI read and write
functions are provided for KSZ8463.

KSZ8463's internal PHYs use standard PHY register definitions so there is
no need to remap things.  However, the hardware has a bug that the high
word and low word of the PHY id are swapped.  In addition the port
registers are arranged differently so KSZ8463 has its own mapping for
port registers and PHY registers.  Therefore the PORT_CTRL_ADDR macro is
replaced with the get_port_addr helper function.

Signed-off-by: Tristram Ha <tristram.ha@microchip.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://patch.msgid.link/20250725001753.6330-3-Tristram.Ha@microchip.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/dsa/microchip/ksz8.c            |  83 +++++++++++++++++++-
 drivers/net/dsa/microchip/ksz8.h            |   4 +
 drivers/net/dsa/microchip/ksz8_reg.h        |  49 ++++++++++++
 drivers/net/dsa/microchip/ksz_common.c      | 114 ++++++++++++++++++++++++++++
 drivers/net/dsa/microchip/ksz_common.h      |  37 ++++++++-
 drivers/net/dsa/microchip/ksz_spi.c         | 104 +++++++++++++++++++++++++
 include/linux/platform_data/microchip-ksz.h |   1 +
 7 files changed, 389 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/microchip/ksz8.c b/drivers/net/dsa/microchip/ksz8.c
index be433b4e2b1c..3761a81a7320 100644
--- a/drivers/net/dsa/microchip/ksz8.c
+++ b/drivers/net/dsa/microchip/ksz8.c
@@ -3,6 +3,7 @@
  * Microchip KSZ8XXX series switch driver
  *
  * It supports the following switches:
+ * - KSZ8463
  * - KSZ8863, KSZ8873 aka KSZ88X3
  * - KSZ8895, KSZ8864 aka KSZ8895 family
  * - KSZ8794, KSZ8795, KSZ8765 aka KSZ87XX
@@ -41,7 +42,8 @@ static void ksz_cfg(struct ksz_device *dev, u32 addr, u8 bits, bool set)
 static void ksz_port_cfg(struct ksz_device *dev, int port, int offset, u8 bits,
 			 bool set)
 {
-	regmap_update_bits(ksz_regmap_8(dev), PORT_CTRL_ADDR(port, offset),
+	regmap_update_bits(ksz_regmap_8(dev),
+			   dev->dev_ops->get_port_addr(port, offset),
 			   bits, set ? bits : 0);
 }
 
@@ -194,6 +196,7 @@ int ksz8_change_mtu(struct ksz_device *dev, int port, int mtu)
 	case KSZ8794_CHIP_ID:
 	case KSZ8765_CHIP_ID:
 		return ksz8795_change_mtu(dev, frame_size);
+	case KSZ8463_CHIP_ID:
 	case KSZ88X3_CHIP_ID:
 	case KSZ8864_CHIP_ID:
 	case KSZ8895_CHIP_ID:
@@ -1947,6 +1950,84 @@ u32 ksz8_get_port_addr(int port, int offset)
 	return PORT_CTRL_ADDR(port, offset);
 }
 
+u32 ksz8463_get_port_addr(int port, int offset)
+{
+	return offset + 0x18 * port;
+}
+
+static u16 ksz8463_get_phy_addr(u16 phy, u16 reg, u16 offset)
+{
+	return offset + reg * 2 + phy * (P2MBCR - P1MBCR);
+}
+
+int ksz8463_r_phy(struct ksz_device *dev, u16 phy, u16 reg, u16 *val)
+{
+	u16 sw_reg = 0;
+	u16 data = 0;
+	int ret;
+
+	if (phy > 1)
+		return -ENOSPC;
+	switch (reg) {
+	case MII_PHYSID1:
+		sw_reg = ksz8463_get_phy_addr(phy, 0, PHY1IHR);
+		break;
+	case MII_PHYSID2:
+		sw_reg = ksz8463_get_phy_addr(phy, 0, PHY1ILR);
+		break;
+	case MII_BMCR:
+	case MII_BMSR:
+	case MII_ADVERTISE:
+	case MII_LPA:
+		sw_reg = ksz8463_get_phy_addr(phy, reg, P1MBCR);
+		break;
+	case MII_TPISTATUS:
+		/* This register holds the PHY interrupt status for simulated
+		 * Micrel KSZ PHY.
+		 */
+		data = 0x0505;
+		break;
+	default:
+		break;
+	}
+	if (sw_reg) {
+		ret = ksz_read16(dev, sw_reg, &data);
+		if (ret)
+			return ret;
+	}
+	*val = data;
+
+	return 0;
+}
+
+int ksz8463_w_phy(struct ksz_device *dev, u16 phy, u16 reg, u16 val)
+{
+	u16 sw_reg = 0;
+	int ret;
+
+	if (phy > 1)
+		return -ENOSPC;
+
+	/* No write to fiber port. */
+	if (dev->ports[phy].fiber)
+		return 0;
+	switch (reg) {
+	case MII_BMCR:
+	case MII_ADVERTISE:
+		sw_reg = ksz8463_get_phy_addr(phy, reg, P1MBCR);
+		break;
+	default:
+		break;
+	}
+	if (sw_reg) {
+		ret = ksz_write16(dev, sw_reg, val);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
 int ksz8_switch_init(struct ksz_device *dev)
 {
 	dev->cpu_port = fls(dev->info->cpu_ports) - 1;
diff --git a/drivers/net/dsa/microchip/ksz8.h b/drivers/net/dsa/microchip/ksz8.h
index e1c79ff97123..0f2cd1474b44 100644
--- a/drivers/net/dsa/microchip/ksz8.h
+++ b/drivers/net/dsa/microchip/ksz8.h
@@ -63,4 +63,8 @@ void ksz8_phylink_mac_link_up(struct phylink_config *config,
 			      bool tx_pause, bool rx_pause);
 int ksz8_all_queues_split(struct ksz_device *dev, int queues);
 
+u32 ksz8463_get_port_addr(int port, int offset);
+int ksz8463_r_phy(struct ksz_device *dev, u16 phy, u16 reg, u16 *val);
+int ksz8463_w_phy(struct ksz_device *dev, u16 phy, u16 reg, u16 val);
+
 #endif
diff --git a/drivers/net/dsa/microchip/ksz8_reg.h b/drivers/net/dsa/microchip/ksz8_reg.h
index 329688603a58..491aa1e50175 100644
--- a/drivers/net/dsa/microchip/ksz8_reg.h
+++ b/drivers/net/dsa/microchip/ksz8_reg.h
@@ -729,6 +729,55 @@
 #define PHY_POWER_SAVING_ENABLE		BIT(2)
 #define PHY_REMOTE_LOOPBACK		BIT(1)
 
+/* KSZ8463 specific registers. */
+#define P1MBCR				0x4C
+#define P1MBSR				0x4E
+#define PHY1ILR				0x50
+#define PHY1IHR				0x52
+#define P1ANAR				0x54
+#define P1ANLPR				0x56
+#define P2MBCR				0x58
+#define P2MBSR				0x5A
+#define PHY2ILR				0x5C
+#define PHY2IHR				0x5E
+#define P2ANAR				0x60
+#define P2ANLPR				0x62
+
+#define P1CR1				0x6C
+#define P1CR2				0x6E
+#define P1CR3				0x72
+#define P1CR4				0x7E
+#define P1SR				0x80
+
+#define KSZ8463_FLUSH_TABLE_CTRL	0xAD
+
+#define KSZ8463_FLUSH_DYN_MAC_TABLE	BIT(2)
+#define KSZ8463_FLUSH_STA_MAC_TABLE	BIT(1)
+
+#define KSZ8463_REG_SW_CTRL_9		0xAE
+
+#define KSZ8463_REG_CFG_CTRL		0xD8
+
+#define PORT_2_COPPER_MODE		BIT(7)
+#define PORT_1_COPPER_MODE		BIT(6)
+#define PORT_COPPER_MODE_S		6
+
+#define KSZ8463_REG_SW_RESET		0x126
+
+#define KSZ8463_GLOBAL_SOFTWARE_RESET	BIT(0)
+
+#define KSZ8463_PTP_CLK_CTRL		0x600
+
+#define PTP_CLK_ENABLE			BIT(1)
+
+#define KSZ8463_PTP_MSG_CONF1		0x620
+
+#define PTP_ENABLE			BIT(6)
+
+#define KSZ8463_REG_DSP_CTRL_6		0x734
+
+#define COPPER_RECEIVE_ADJUSTMENT	BIT(13)
+
 /* Chip resource */
 
 #define PRIO_QUEUES			4
diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c
index 6e1daf0018bc..095e647b3897 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -331,6 +331,38 @@ static const struct phylink_mac_ops ksz8_phylink_mac_ops = {
 	.mac_enable_tx_lpi = ksz_phylink_mac_enable_tx_lpi,
 };
 
+static const struct ksz_dev_ops ksz8463_dev_ops = {
+	.setup = ksz8_setup,
+	.get_port_addr = ksz8463_get_port_addr,
+	.cfg_port_member = ksz8_cfg_port_member,
+	.flush_dyn_mac_table = ksz8_flush_dyn_mac_table,
+	.port_setup = ksz8_port_setup,
+	.r_phy = ksz8463_r_phy,
+	.w_phy = ksz8463_w_phy,
+	.r_mib_cnt = ksz8_r_mib_cnt,
+	.r_mib_pkt = ksz8_r_mib_pkt,
+	.r_mib_stat64 = ksz88xx_r_mib_stats64,
+	.freeze_mib = ksz8_freeze_mib,
+	.port_init_cnt = ksz8_port_init_cnt,
+	.fdb_dump = ksz8_fdb_dump,
+	.fdb_add = ksz8_fdb_add,
+	.fdb_del = ksz8_fdb_del,
+	.mdb_add = ksz8_mdb_add,
+	.mdb_del = ksz8_mdb_del,
+	.vlan_filtering = ksz8_port_vlan_filtering,
+	.vlan_add = ksz8_port_vlan_add,
+	.vlan_del = ksz8_port_vlan_del,
+	.mirror_add = ksz8_port_mirror_add,
+	.mirror_del = ksz8_port_mirror_del,
+	.get_caps = ksz8_get_caps,
+	.config_cpu_port = ksz8_config_cpu_port,
+	.enable_stp_addr = ksz8_enable_stp_addr,
+	.reset = ksz8_reset_switch,
+	.init = ksz8_switch_init,
+	.exit = ksz8_switch_exit,
+	.change_mtu = ksz8_change_mtu,
+};
+
 static const struct ksz_dev_ops ksz88xx_dev_ops = {
 	.setup = ksz8_setup,
 	.get_port_addr = ksz8_get_port_addr,
@@ -517,6 +549,60 @@ static const struct ksz_dev_ops lan937x_dev_ops = {
 	.exit = lan937x_switch_exit,
 };
 
+static const u16 ksz8463_regs[] = {
+	[REG_SW_MAC_ADDR]		= 0x10,
+	[REG_IND_CTRL_0]		= 0x30,
+	[REG_IND_DATA_8]		= 0x26,
+	[REG_IND_DATA_CHECK]		= 0x26,
+	[REG_IND_DATA_HI]		= 0x28,
+	[REG_IND_DATA_LO]		= 0x2C,
+	[REG_IND_MIB_CHECK]		= 0x2F,
+	[P_FORCE_CTRL]			= 0x0C,
+	[P_LINK_STATUS]			= 0x0E,
+	[P_LOCAL_CTRL]			= 0x0C,
+	[P_NEG_RESTART_CTRL]		= 0x0D,
+	[P_REMOTE_STATUS]		= 0x0E,
+	[P_SPEED_STATUS]		= 0x0F,
+	[S_TAIL_TAG_CTRL]		= 0xAD,
+	[P_STP_CTRL]			= 0x6F,
+	[S_START_CTRL]			= 0x01,
+	[S_BROADCAST_CTRL]		= 0x06,
+	[S_MULTICAST_CTRL]		= 0x04,
+};
+
+static const u32 ksz8463_masks[] = {
+	[PORT_802_1P_REMAPPING]		= BIT(3),
+	[SW_TAIL_TAG_ENABLE]		= BIT(0),
+	[MIB_COUNTER_OVERFLOW]		= BIT(7),
+	[MIB_COUNTER_VALID]		= BIT(6),
+	[VLAN_TABLE_FID]		= GENMASK(15, 12),
+	[VLAN_TABLE_MEMBERSHIP]		= GENMASK(18, 16),
+	[VLAN_TABLE_VALID]		= BIT(19),
+	[STATIC_MAC_TABLE_VALID]	= BIT(19),
+	[STATIC_MAC_TABLE_USE_FID]	= BIT(21),
+	[STATIC_MAC_TABLE_FID]		= GENMASK(25, 22),
+	[STATIC_MAC_TABLE_OVERRIDE]	= BIT(20),
+	[STATIC_MAC_TABLE_FWD_PORTS]	= GENMASK(18, 16),
+	[DYNAMIC_MAC_TABLE_ENTRIES_H]	= GENMASK(1, 0),
+	[DYNAMIC_MAC_TABLE_MAC_EMPTY]	= BIT(2),
+	[DYNAMIC_MAC_TABLE_NOT_READY]	= BIT(7),
+	[DYNAMIC_MAC_TABLE_ENTRIES]	= GENMASK(31, 24),
+	[DYNAMIC_MAC_TABLE_FID]		= GENMASK(19, 16),
+	[DYNAMIC_MAC_TABLE_SRC_PORT]	= GENMASK(21, 20),
+	[DYNAMIC_MAC_TABLE_TIMESTAMP]	= GENMASK(23, 22),
+};
+
+static u8 ksz8463_shifts[] = {
+	[VLAN_TABLE_MEMBERSHIP_S]	= 16,
+	[STATIC_MAC_FWD_PORTS]		= 16,
+	[STATIC_MAC_FID]		= 22,
+	[DYNAMIC_MAC_ENTRIES_H]		= 8,
+	[DYNAMIC_MAC_ENTRIES]		= 24,
+	[DYNAMIC_MAC_FID]		= 16,
+	[DYNAMIC_MAC_TIMESTAMP]		= 22,
+	[DYNAMIC_MAC_SRC_PORT]		= 20,
+};
+
 static const u16 ksz8795_regs[] = {
 	[REG_SW_MAC_ADDR]		= 0x68,
 	[REG_IND_CTRL_0]		= 0x6E,
@@ -1387,6 +1473,29 @@ static const struct regmap_access_table ksz8873_register_set = {
 };
 
 const struct ksz_chip_data ksz_switch_chips[] = {
+	[KSZ8463] = {
+		.chip_id = KSZ8463_CHIP_ID,
+		.dev_name = "KSZ8463",
+		.num_vlans = 16,
+		.num_alus = 0,
+		.num_statics = 8,
+		.cpu_ports = 0x4,	/* can be configured as cpu port */
+		.port_cnt = 3,
+		.num_tx_queues = 4,
+		.num_ipms = 4,
+		.ops = &ksz8463_dev_ops,
+		.phylink_mac_ops = &ksz88x3_phylink_mac_ops,
+		.mib_names = ksz88xx_mib_names,
+		.mib_cnt = ARRAY_SIZE(ksz88xx_mib_names),
+		.reg_mib_cnt = MIB_COUNTER_NUM,
+		.regs = ksz8463_regs,
+		.masks = ksz8463_masks,
+		.shifts = ksz8463_shifts,
+		.supports_mii = {false, false, true},
+		.supports_rmii = {false, false, true},
+		.internal_phy = {true, true, false},
+	},
+
 	[KSZ8563] = {
 		.chip_id = KSZ8563_CHIP_ID,
 		.dev_name = "KSZ8563",
@@ -3400,6 +3509,7 @@ static enum dsa_tag_protocol ksz_get_tag_protocol(struct dsa_switch *ds,
 		proto = DSA_TAG_PROTO_KSZ8795;
 
 	if (dev->chip_id == KSZ88X3_CHIP_ID ||
+	    dev->chip_id == KSZ8463_CHIP_ID ||
 	    dev->chip_id == KSZ8563_CHIP_ID ||
 	    dev->chip_id == KSZ9893_CHIP_ID ||
 	    dev->chip_id == KSZ9563_CHIP_ID)
@@ -3512,6 +3622,7 @@ static int ksz_max_mtu(struct dsa_switch *ds, int port)
 	case KSZ8794_CHIP_ID:
 	case KSZ8765_CHIP_ID:
 		return KSZ8795_HUGE_PACKET_SIZE - VLAN_ETH_HLEN - ETH_FCS_LEN;
+	case KSZ8463_CHIP_ID:
 	case KSZ88X3_CHIP_ID:
 	case KSZ8864_CHIP_ID:
 	case KSZ8895_CHIP_ID:
@@ -3866,6 +3977,9 @@ static int ksz_switch_detect(struct ksz_device *dev)
 	id2 = FIELD_GET(SW_CHIP_ID_M, id16);
 
 	switch (id1) {
+	case KSZ84_FAMILY_ID:
+		dev->chip_id = KSZ8463_CHIP_ID;
+		break;
 	case KSZ87_FAMILY_ID:
 		if (id2 == KSZ87_CHIP_ID_95) {
 			u8 val;
diff --git a/drivers/net/dsa/microchip/ksz_common.h b/drivers/net/dsa/microchip/ksz_common.h
index a08417df2ca4..a1eb39771bb9 100644
--- a/drivers/net/dsa/microchip/ksz_common.h
+++ b/drivers/net/dsa/microchip/ksz_common.h
@@ -222,6 +222,7 @@ struct ksz_device {
 
 /* List of supported models */
 enum ksz_model {
+	KSZ8463,
 	KSZ8563,
 	KSZ8567,
 	KSZ8795,
@@ -484,6 +485,11 @@ static inline struct regmap *ksz_regmap_32(struct ksz_device *dev)
 	return dev->regmap[KSZ_REGMAP_32];
 }
 
+static inline bool ksz_is_ksz8463(struct ksz_device *dev)
+{
+	return dev->chip_id == KSZ8463_CHIP_ID;
+}
+
 static inline int ksz_read8(struct ksz_device *dev, u32 reg, u8 *val)
 {
 	unsigned int value;
@@ -709,12 +715,13 @@ static inline bool ksz_is_8895_family(struct ksz_device *dev)
 static inline bool is_ksz8(struct ksz_device *dev)
 {
 	return ksz_is_ksz87xx(dev) || ksz_is_ksz88x3(dev) ||
-	       ksz_is_8895_family(dev);
+	       ksz_is_8895_family(dev) || ksz_is_ksz8463(dev);
 }
 
 static inline bool is_ksz88xx(struct ksz_device *dev)
 {
-	return ksz_is_ksz88x3(dev) || ksz_is_8895_family(dev);
+	return ksz_is_ksz88x3(dev) || ksz_is_8895_family(dev) ||
+	       ksz_is_ksz8463(dev);
 }
 
 static inline bool is_ksz9477(struct ksz_device *dev)
@@ -761,6 +768,7 @@ static inline bool ksz_is_sgmii_port(struct ksz_device *dev, int port)
 #define REG_CHIP_ID0			0x00
 
 #define SW_FAMILY_ID_M			GENMASK(15, 8)
+#define KSZ84_FAMILY_ID			0x84
 #define KSZ87_FAMILY_ID			0x87
 #define KSZ88_FAMILY_ID			0x88
 #define KSZ8895_FAMILY_ID		0x95
@@ -939,4 +947,29 @@ static inline bool ksz_is_sgmii_port(struct ksz_device *dev, int port)
 		[KSZ_REGMAP_32] = KSZ_REGMAP_ENTRY(32, swp, (regbits), (regpad), (regalign)), \
 	}
 
+#define KSZ8463_REGMAP_ENTRY(width, regbits, regpad, regalign)		\
+	{								\
+		.name = #width,						\
+		.val_bits = (width),					\
+		.reg_stride = (width / 8),				\
+		.reg_bits = (regbits) + (regalign),			\
+		.pad_bits = (regpad),					\
+		.read = ksz8463_spi_read,				\
+		.write = ksz8463_spi_write,				\
+		.max_register = BIT(regbits) - 1,			\
+		.cache_type = REGCACHE_NONE,				\
+		.zero_flag_mask = 1,					\
+		.use_single_read = 1,					\
+		.use_single_write = 1,					\
+		.lock = ksz_regmap_lock,				\
+		.unlock = ksz_regmap_unlock,				\
+	}
+
+#define KSZ8463_REGMAP_TABLE(ksz, regbits, regpad, regalign)		\
+	static const struct regmap_config ksz##_regmap_config[] = {	\
+		[KSZ_REGMAP_8] = KSZ8463_REGMAP_ENTRY(8, (regbits), (regpad), (regalign)), \
+		[KSZ_REGMAP_16] = KSZ8463_REGMAP_ENTRY(16, (regbits), (regpad), (regalign)), \
+		[KSZ_REGMAP_32] = KSZ8463_REGMAP_ENTRY(32, (regbits), (regpad), (regalign)), \
+	}
+
 #endif
diff --git a/drivers/net/dsa/microchip/ksz_spi.c b/drivers/net/dsa/microchip/ksz_spi.c
index b633d263098c..d8001734b057 100644
--- a/drivers/net/dsa/microchip/ksz_spi.c
+++ b/drivers/net/dsa/microchip/ksz_spi.c
@@ -16,6 +16,10 @@
 
 #include "ksz_common.h"
 
+#define KSZ8463_SPI_ADDR_SHIFT			13
+#define KSZ8463_SPI_ADDR_ALIGN			3
+#define KSZ8463_SPI_TURNAROUND_SHIFT		2
+
 #define KSZ8795_SPI_ADDR_SHIFT			12
 #define KSZ8795_SPI_ADDR_ALIGN			3
 #define KSZ8795_SPI_TURNAROUND_SHIFT		1
@@ -37,6 +41,99 @@ KSZ_REGMAP_TABLE(ksz8863, 16, KSZ8863_SPI_ADDR_SHIFT,
 KSZ_REGMAP_TABLE(ksz9477, 32, KSZ9477_SPI_ADDR_SHIFT,
 		 KSZ9477_SPI_TURNAROUND_SHIFT, KSZ9477_SPI_ADDR_ALIGN);
 
+static u16 ksz8463_reg(u16 reg, size_t size)
+{
+	switch (size) {
+	case 1:
+		reg = ((reg >> 2) << 4) | (1 << (reg & 3));
+		break;
+	case 2:
+		reg = ((reg >> 2) << 4) | (reg & 2 ? 0x0c : 0x03);
+		break;
+	default:
+		reg = ((reg >> 2) << 4) | 0xf;
+		break;
+	}
+	reg <<= KSZ8463_SPI_TURNAROUND_SHIFT;
+	return reg;
+}
+
+static int ksz8463_spi_read(void *context,
+			    const void *reg, size_t reg_size,
+			    void *val, size_t val_size)
+{
+	struct device *dev = context;
+	struct spi_device *spi = to_spi_device(dev);
+	u8 bytes[2];
+	u16 cmd;
+	int rc;
+
+	if (reg_size > 2 || val_size > 4)
+		return -EINVAL;
+	memcpy(&cmd, reg, sizeof(u16));
+	cmd = ksz8463_reg(cmd, val_size);
+	/* SPI command uses big-endian format. */
+	put_unaligned_be16(cmd, bytes);
+	rc = spi_write_then_read(spi, bytes, reg_size, val, val_size);
+#if defined(__BIG_ENDIAN)
+	/* Register value uses little-endian format so need to convert when
+	 * running in big-endian system.
+	 */
+	if (!rc && val_size > 1) {
+		if (val_size == 2) {
+			u16 v = get_unaligned_le16(val);
+
+			memcpy(val, &v, sizeof(v));
+		} else if (val_size == 4) {
+			u32 v = get_unaligned_le32(val);
+
+			memcpy(val, &v, sizeof(v));
+		}
+	}
+#endif
+	return rc;
+}
+
+static int ksz8463_spi_write(void *context, const void *data, size_t count)
+{
+	struct device *dev = context;
+	struct spi_device *spi = to_spi_device(dev);
+	size_t val_size = count - 2;
+	u8 bytes[6];
+	u16 cmd;
+
+	if (count <= 2 || count > 6)
+		return -EINVAL;
+	memcpy(bytes, data, count);
+	memcpy(&cmd, data, sizeof(u16));
+	cmd = ksz8463_reg(cmd, val_size);
+	cmd |= (1 << (KSZ8463_SPI_ADDR_SHIFT + KSZ8463_SPI_TURNAROUND_SHIFT));
+	/* SPI command uses big-endian format. */
+	put_unaligned_be16(cmd, bytes);
+#if defined(__BIG_ENDIAN)
+	/* Register value uses little-endian format so need to convert when
+	 * running in big-endian system.
+	 */
+	if (val_size == 2) {
+		u8 *val = &bytes[2];
+		u16 v;
+
+		memcpy(&v, val, sizeof(v));
+		put_unaligned_le16(v, val);
+	} else if (val_size == 4) {
+		u8 *val = &bytes[2];
+		u32 v;
+
+		memcpy(&v, val, sizeof(v));
+		put_unaligned_le32(v, val);
+	}
+#endif
+	return spi_write(spi, bytes, count);
+}
+
+KSZ8463_REGMAP_TABLE(ksz8463, KSZ8463_SPI_ADDR_SHIFT, 0,
+		     KSZ8463_SPI_ADDR_ALIGN);
+
 static int ksz_spi_probe(struct spi_device *spi)
 {
 	const struct regmap_config *regmap_config;
@@ -58,6 +155,8 @@ static int ksz_spi_probe(struct spi_device *spi)
 	dev->chip_id = chip->chip_id;
 	if (chip->chip_id == KSZ88X3_CHIP_ID)
 		regmap_config = ksz8863_regmap_config;
+	else if (chip->chip_id == KSZ8463_CHIP_ID)
+		regmap_config = ksz8463_regmap_config;
 	else if (chip->chip_id == KSZ8795_CHIP_ID ||
 		 chip->chip_id == KSZ8794_CHIP_ID ||
 		 chip->chip_id == KSZ8765_CHIP_ID)
@@ -125,6 +224,10 @@ static void ksz_spi_shutdown(struct spi_device *spi)
 }
 
 static const struct of_device_id ksz_dt_ids[] = {
+	{
+		.compatible = "microchip,ksz8463",
+		.data = &ksz_switch_chips[KSZ8463]
+	},
 	{
 		.compatible = "microchip,ksz8765",
 		.data = &ksz_switch_chips[KSZ8765]
@@ -214,6 +317,7 @@ static const struct of_device_id ksz_dt_ids[] = {
 MODULE_DEVICE_TABLE(of, ksz_dt_ids);
 
 static const struct spi_device_id ksz_spi_ids[] = {
+	{ "ksz8463" },
 	{ "ksz8765" },
 	{ "ksz8794" },
 	{ "ksz8795" },
diff --git a/include/linux/platform_data/microchip-ksz.h b/include/linux/platform_data/microchip-ksz.h
index 0e0e8fe6975f..028781ad4059 100644
--- a/include/linux/platform_data/microchip-ksz.h
+++ b/include/linux/platform_data/microchip-ksz.h
@@ -23,6 +23,7 @@
 #include <linux/platform_data/dsa.h>
 
 enum ksz_chip_id {
+	KSZ8463_CHIP_ID = 0x8463,
 	KSZ8563_CHIP_ID = 0x8563,
 	KSZ8795_CHIP_ID = 0x8795,
 	KSZ8794_CHIP_ID = 0x8794,
-- 
cgit v1.2.3


From c652887a92887b9a2d48ee40f49f8013449b9a50 Mon Sep 17 00:00:00 2001
From: Raghavendra Rao Ananta <rananta@google.com>
Date: Wed, 23 Jul 2025 23:28:03 -0700
Subject: KVM: arm64: vgic-v3: Allow userspace to write GICD_TYPER2.nASSGIcap

KVM unconditionally advertises GICD_TYPER2.nASSGIcap (which internally
implies vSGIs) on GICv4.1 systems. Allow userspace to change whether a
VM supports the feature. Only allow changes prior to VGIC initialization
as at that point vPEs need to be allocated for the VM.

For convenience, bundle support for vLPIs and vSGIs behind this feature,
allowing userspace to control vPE allocation for VMs in environments
that may be constrained on vPE IDs.

Signed-off-by: Raghavendra Rao Ananta <rananta@google.com>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20250724062805.2658919-5-oliver.upton@linux.dev
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
---
 arch/arm64/kvm/vgic/vgic-init.c       |  3 +++
 arch/arm64/kvm/vgic/vgic-kvm-device.c |  1 +
 arch/arm64/kvm/vgic/vgic-mmio-v3.c    | 19 +++++++++++++++++--
 arch/arm64/kvm/vgic/vgic.h            | 11 ++++++++++-
 include/kvm/arm_vgic.h                |  3 +++
 5 files changed, 34 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index 31462ba093c9..6ec3535e142c 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -166,6 +166,9 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
 	else
 		INIT_LIST_HEAD(&kvm->arch.vgic.rd_regions);
 
+	if (type == KVM_DEV_TYPE_ARM_VGIC_V3)
+		kvm->arch.vgic.nassgicap = system_supports_direct_sgis();
+
 out_unlock:
 	mutex_unlock(&kvm->arch.config_lock);
 	kvm_unlock_all_vcpus(kvm);
diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c
index 8e24d550d9d2..3d1a776b716d 100644
--- a/arch/arm64/kvm/vgic/vgic-kvm-device.c
+++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c
@@ -515,6 +515,7 @@ static bool reg_allowed_pre_init(struct kvm_device_attr *attr)
 
 	switch (attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK) {
 	case GICD_IIDR:
+	case GICD_TYPER2:
 		return true;
 	default:
 		return false;
diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
index 8f0f460b6a61..a3ef185209e9 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
@@ -53,11 +53,16 @@ bool vgic_supports_direct_msis(struct kvm *kvm)
 	return kvm_vgic_global_state.has_gicv4 && vgic_has_its(kvm);
 }
 
-bool vgic_supports_direct_sgis(struct kvm *kvm)
+bool system_supports_direct_sgis(void)
 {
 	return kvm_vgic_global_state.has_gicv4_1 && gic_cpuif_has_vsgi();
 }
 
+bool vgic_supports_direct_sgis(struct kvm *kvm)
+{
+	return kvm->arch.vgic.nassgicap;
+}
+
 /*
  * The Revision field in the IIDR have the following meanings:
  *
@@ -163,8 +168,18 @@ static int vgic_mmio_uaccess_write_v3_misc(struct kvm_vcpu *vcpu,
 
 	switch (addr & 0x0c) {
 	case GICD_TYPER2:
-		if (val != vgic_mmio_read_v3_misc(vcpu, addr, len))
+		reg = vgic_mmio_read_v3_misc(vcpu, addr, len);
+
+		if (reg == val)
+			return 0;
+		if (vgic_initialized(vcpu->kvm))
+			return -EBUSY;
+		if ((reg ^ val) & ~GICD_TYPER2_nASSGIcap)
 			return -EINVAL;
+		if (!system_supports_direct_sgis() && val)
+			return -EINVAL;
+
+		dist->nassgicap = val & GICD_TYPER2_nASSGIcap;
 		return 0;
 	case GICD_IIDR:
 		reg = vgic_mmio_read_v3_misc(vcpu, addr, len);
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
index ebf9ed6adeac..fb6d7c098ae6 100644
--- a/arch/arm64/kvm/vgic/vgic.h
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -369,12 +369,21 @@ void vgic_its_invalidate_all_caches(struct kvm *kvm);
 int vgic_its_inv_lpi(struct kvm *kvm, struct vgic_irq *irq);
 int vgic_its_invall(struct kvm_vcpu *vcpu);
 
+bool system_supports_direct_sgis(void);
 bool vgic_supports_direct_msis(struct kvm *kvm);
 bool vgic_supports_direct_sgis(struct kvm *kvm);
 
 static inline bool vgic_supports_direct_irqs(struct kvm *kvm)
 {
-	return vgic_supports_direct_msis(kvm) || vgic_supports_direct_sgis(kvm);
+	/*
+	 * Deliberately conflate vLPI and vSGI support on GICv4.1 hardware,
+	 * indirectly allowing userspace to control whether or not vPEs are
+	 * allocated for the VM.
+	 */
+	if (system_supports_direct_sgis())
+		return vgic_supports_direct_sgis(kvm);
+
+	return vgic_supports_direct_msis(kvm);
 }
 
 int vgic_v4_init(struct kvm *kvm);
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 4a34f7f0a864..1b4886f3fb20 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -264,6 +264,9 @@ struct vgic_dist {
 	/* distributor enabled */
 	bool			enabled;
 
+	/* Supports SGIs without active state */
+	bool			nassgicap;
+
 	/* Wants SGIs without active state */
 	bool			nassgireq;
 
-- 
cgit v1.2.3


From b7b3500bd4eef2c3b5124ed195f26eb048407d9b Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Date: Mon, 21 Jul 2025 11:04:42 +0200
Subject: umd: Remove usermode driver framework
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The code is unused since 98e20e5e13d2 ("bpfilter: remove bpfilter"),
therefore remove it.

Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Link: https://lore.kernel.org/bpf/20250721-remove-usermode-driver-v1-2-0d0083334382@linutronix.de
---
 include/linux/usermode_driver.h |  19 ----
 kernel/Makefile                 |   1 -
 kernel/bpf/preload/Kconfig      |   4 -
 kernel/usermode_driver.c        | 191 ----------------------------------------
 4 files changed, 215 deletions(-)
 delete mode 100644 include/linux/usermode_driver.h
 delete mode 100644 kernel/usermode_driver.c

(limited to 'include')

diff --git a/include/linux/usermode_driver.h b/include/linux/usermode_driver.h
deleted file mode 100644
index ad970416260d..000000000000
--- a/include/linux/usermode_driver.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef __LINUX_USERMODE_DRIVER_H__
-#define __LINUX_USERMODE_DRIVER_H__
-
-#include <linux/umh.h>
-#include <linux/path.h>
-
-struct umd_info {
-	const char *driver_name;
-	struct file *pipe_to_umh;
-	struct file *pipe_from_umh;
-	struct path wd;
-	struct pid *tgid;
-};
-int umd_load_blob(struct umd_info *info, const void *data, size_t len);
-int umd_unload_blob(struct umd_info *info);
-int fork_usermode_driver(struct umd_info *info);
-void umd_cleanup_helper(struct umd_info *info);
-
-#endif /* __LINUX_USERMODE_DRIVER_H__ */
diff --git a/kernel/Makefile b/kernel/Makefile
index 32e80dd626af..4332de7ffdee 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -12,7 +12,6 @@ obj-y     = fork.o exec_domain.o panic.o \
 	    notifier.o ksysfs.o cred.o reboot.o \
 	    async.o range.o smpboot.o ucount.o regset.o ksyms_common.o
 
-obj-$(CONFIG_USERMODE_DRIVER) += usermode_driver.o
 obj-$(CONFIG_MULTIUSER) += groups.o
 obj-$(CONFIG_VHOST_TASK) += vhost_task.o
 
diff --git a/kernel/bpf/preload/Kconfig b/kernel/bpf/preload/Kconfig
index f9b11d01c3b5..aef7b0bc96d6 100644
--- a/kernel/bpf/preload/Kconfig
+++ b/kernel/bpf/preload/Kconfig
@@ -1,8 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
-config USERMODE_DRIVER
-	bool
-	default n
-
 menuconfig BPF_PRELOAD
 	bool "Preload BPF file system with kernel specific program and map iterators"
 	depends on BPF
diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c
deleted file mode 100644
index 8303f4c7ca71..000000000000
--- a/kernel/usermode_driver.c
+++ /dev/null
@@ -1,191 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * umd - User mode driver support
- */
-#include <linux/shmem_fs.h>
-#include <linux/pipe_fs_i.h>
-#include <linux/mount.h>
-#include <linux/fs_struct.h>
-#include <linux/task_work.h>
-#include <linux/usermode_driver.h>
-
-static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *name)
-{
-	struct file_system_type *type;
-	struct vfsmount *mnt;
-	struct file *file;
-	ssize_t written;
-	loff_t pos = 0;
-
-	type = get_fs_type("tmpfs");
-	if (!type)
-		return ERR_PTR(-ENODEV);
-
-	mnt = kern_mount(type);
-	put_filesystem(type);
-	if (IS_ERR(mnt))
-		return mnt;
-
-	file = file_open_root_mnt(mnt, name, O_CREAT | O_WRONLY, 0700);
-	if (IS_ERR(file)) {
-		kern_unmount(mnt);
-		return ERR_CAST(file);
-	}
-
-	written = kernel_write(file, data, len, &pos);
-	if (written != len) {
-		int err = written;
-		if (err >= 0)
-			err = -ENOMEM;
-		filp_close(file, NULL);
-		kern_unmount(mnt);
-		return ERR_PTR(err);
-	}
-
-	fput(file);
-
-	/* Flush delayed fput so exec can open the file read-only */
-	flush_delayed_fput();
-	task_work_run();
-	return mnt;
-}
-
-/**
- * umd_load_blob - Remember a blob of bytes for fork_usermode_driver
- * @info: information about usermode driver
- * @data: a blob of bytes that can be executed as a file
- * @len:  The lentgh of the blob
- *
- */
-int umd_load_blob(struct umd_info *info, const void *data, size_t len)
-{
-	struct vfsmount *mnt;
-
-	if (WARN_ON_ONCE(info->wd.dentry || info->wd.mnt))
-		return -EBUSY;
-
-	mnt = blob_to_mnt(data, len, info->driver_name);
-	if (IS_ERR(mnt))
-		return PTR_ERR(mnt);
-
-	info->wd.mnt = mnt;
-	info->wd.dentry = mnt->mnt_root;
-	return 0;
-}
-EXPORT_SYMBOL_GPL(umd_load_blob);
-
-/**
- * umd_unload_blob - Disassociate @info from a previously loaded blob
- * @info: information about usermode driver
- *
- */
-int umd_unload_blob(struct umd_info *info)
-{
-	if (WARN_ON_ONCE(!info->wd.mnt ||
-			 !info->wd.dentry ||
-			 info->wd.mnt->mnt_root != info->wd.dentry))
-		return -EINVAL;
-
-	kern_unmount(info->wd.mnt);
-	info->wd.mnt = NULL;
-	info->wd.dentry = NULL;
-	return 0;
-}
-EXPORT_SYMBOL_GPL(umd_unload_blob);
-
-static int umd_setup(struct subprocess_info *info, struct cred *new)
-{
-	struct umd_info *umd_info = info->data;
-	struct file *from_umh[2];
-	struct file *to_umh[2];
-	int err;
-
-	/* create pipe to send data to umh */
-	err = create_pipe_files(to_umh, 0);
-	if (err)
-		return err;
-	err = replace_fd(0, to_umh[0], 0);
-	fput(to_umh[0]);
-	if (err < 0) {
-		fput(to_umh[1]);
-		return err;
-	}
-
-	/* create pipe to receive data from umh */
-	err = create_pipe_files(from_umh, 0);
-	if (err) {
-		fput(to_umh[1]);
-		replace_fd(0, NULL, 0);
-		return err;
-	}
-	err = replace_fd(1, from_umh[1], 0);
-	fput(from_umh[1]);
-	if (err < 0) {
-		fput(to_umh[1]);
-		replace_fd(0, NULL, 0);
-		fput(from_umh[0]);
-		return err;
-	}
-
-	set_fs_pwd(current->fs, &umd_info->wd);
-	umd_info->pipe_to_umh = to_umh[1];
-	umd_info->pipe_from_umh = from_umh[0];
-	umd_info->tgid = get_pid(task_tgid(current));
-	return 0;
-}
-
-static void umd_cleanup(struct subprocess_info *info)
-{
-	struct umd_info *umd_info = info->data;
-
-	/* cleanup if umh_setup() was successful but exec failed */
-	if (info->retval)
-		umd_cleanup_helper(umd_info);
-}
-
-/**
- * umd_cleanup_helper - release the resources which were allocated in umd_setup
- * @info: information about usermode driver
- */
-void umd_cleanup_helper(struct umd_info *info)
-{
-	fput(info->pipe_to_umh);
-	fput(info->pipe_from_umh);
-	put_pid(info->tgid);
-	info->tgid = NULL;
-}
-EXPORT_SYMBOL_GPL(umd_cleanup_helper);
-
-/**
- * fork_usermode_driver - fork a usermode driver
- * @info: information about usermode driver (shouldn't be NULL)
- *
- * Returns either negative error or zero which indicates success in
- * executing a usermode driver. In such case 'struct umd_info *info'
- * is populated with two pipes and a tgid of the process. The caller is
- * responsible for health check of the user process, killing it via
- * tgid, and closing the pipes when user process is no longer needed.
- */
-int fork_usermode_driver(struct umd_info *info)
-{
-	struct subprocess_info *sub_info;
-	const char *argv[] = { info->driver_name, NULL };
-	int err;
-
-	if (WARN_ON_ONCE(info->tgid))
-		return -EBUSY;
-
-	err = -ENOMEM;
-	sub_info = call_usermodehelper_setup(info->driver_name,
-					     (char **)argv, NULL, GFP_KERNEL,
-					     umd_setup, umd_cleanup, info);
-	if (!sub_info)
-		goto out;
-
-	err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
-out:
-	return err;
-}
-EXPORT_SYMBOL_GPL(fork_usermode_driver);
-
-
-- 
cgit v1.2.3


From 3ba58312e65665e5b9097c7969a51fa49914d85d Mon Sep 17 00:00:00 2001
From: Puranjay Mohan <puranjay@kernel.org>
Date: Thu, 24 Jul 2025 12:02:53 +0000
Subject: bpf: Move bpf_jit_get_prog_name() to core.c

bpf_jit_get_prog_name() will be used by all JITs when enabling support
for private stack. This function is currently implemented in the x86
JIT.

Move the function to core.c so that other JITs can easily use it in
their implementation of private stack.

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/bpf/20250724120257.7299-2-puranjay@kernel.org
---
 arch/x86/net/bpf_jit_comp.c | 9 +--------
 include/linux/filter.h      | 2 ++
 kernel/bpf/core.c           | 7 +++++++
 3 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 40e1b3b9634f..7e3fca164620 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -3501,13 +3501,6 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func
 	return emit_bpf_dispatcher(&prog, 0, num_funcs - 1, funcs, image, buf);
 }
 
-static const char *bpf_get_prog_name(struct bpf_prog *prog)
-{
-	if (prog->aux->ksym.prog)
-		return prog->aux->ksym.name;
-	return prog->aux->name;
-}
-
 static void priv_stack_init_guard(void __percpu *priv_stack_ptr, int alloc_size)
 {
 	int cpu, underflow_idx = (alloc_size - PRIV_STACK_GUARD_SZ) >> 3;
@@ -3531,7 +3524,7 @@ static void priv_stack_check_guard(void __percpu *priv_stack_ptr, int alloc_size
 		if (stack_ptr[0] != PRIV_STACK_GUARD_VAL ||
 		    stack_ptr[underflow_idx] != PRIV_STACK_GUARD_VAL) {
 			pr_err("BPF private stack overflow/underflow detected for prog %sx\n",
-			       bpf_get_prog_name(prog));
+			       bpf_jit_get_prog_name(prog));
 			break;
 		}
 	}
diff --git a/include/linux/filter.h b/include/linux/filter.h
index eca229752cbe..5cc7a82ec832 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1278,6 +1278,8 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog,
 			  const struct bpf_insn *insn, bool extra_pass,
 			  u64 *func_addr, bool *func_addr_fixed);
 
+const char *bpf_jit_get_prog_name(struct bpf_prog *prog);
+
 struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *fp);
 void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other);
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 61613785bdd0..29c0225c14aa 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1297,6 +1297,13 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog,
 	return 0;
 }
 
+const char *bpf_jit_get_prog_name(struct bpf_prog *prog)
+{
+	if (prog->aux->ksym.prog)
+		return prog->aux->ksym.name;
+	return prog->aux->name;
+}
+
 static int bpf_jit_blind_insn(const struct bpf_insn *from,
 			      const struct bpf_insn *aux,
 			      struct bpf_insn *to_buff,
-- 
cgit v1.2.3


From 381a38ea53d25ed6f93ba007b021db86c2a36bc6 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@kernel.org>
Date: Wed, 23 Jul 2025 22:50:27 -0700
Subject: init.h: Disable sanitizer coverage for __init and __head

While __noinstr already contained __no_sanitize_coverage, it needs to
be added to __init and __head section markings to support the Clang
implementation of CONFIG_KSTACK_ERASE. This is to make sure the stack
depth tracking callback is not executed in unsupported contexts.

The other sanitizer coverage options (trace-pc and trace-cmp) aren't
needed in __head or __init either ("We are interested in code coverage
as a function of a syscall inputs"[1]), so it is fine to disable them
there as well.

Link: https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/kcov.c?h=v6.14#n179 [1]
Acked-by: Marco Elver <elver@google.com>
Link: https://lore.kernel.org/r/20250724055029.3623499-3-kees@kernel.org
Signed-off-by: Kees Cook <kees@kernel.org>
---
 arch/x86/include/asm/init.h | 2 +-
 include/linux/init.h        | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index 8b1b1abcef15..6bfdaeddbae8 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -5,7 +5,7 @@
 #if defined(CONFIG_CC_IS_CLANG) && CONFIG_CLANG_VERSION < 170000
 #define __head	__section(".head.text") __no_sanitize_undefined __no_stack_protector
 #else
-#define __head	__section(".head.text") __no_sanitize_undefined
+#define __head	__section(".head.text") __no_sanitize_undefined __no_sanitize_coverage
 #endif
 
 struct x86_mapping_info {
diff --git a/include/linux/init.h b/include/linux/init.h
index ee1309473bc6..c65a050d52a7 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -49,7 +49,9 @@
 
 /* These are for everybody (although not all archs will actually
    discard it in modules) */
-#define __init		__section(".init.text") __cold  __latent_entropy __noinitretpoline
+#define __init		__section(".init.text") __cold __latent_entropy	\
+						__noinitretpoline	\
+						__no_sanitize_coverage
 #define __initdata	__section(".init.data")
 #define __initconst	__section(".init.rodata")
 #define __exitdata	__section(".exit.data")
-- 
cgit v1.2.3


From 32e42ab9fc88a884435c27527a433f61c4d2b61b Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@kernel.org>
Date: Sat, 26 Jul 2025 00:29:54 -0700
Subject: sched/task_stack: Add missing const qualifier to end_of_stack()

Add missing const qualifier to the non-CONFIG_THREAD_INFO_IN_TASK
version of end_of_stack() to match the CONFIG_THREAD_INFO_IN_TASK
version. Fixes a warning with CONFIG_KSTACK_ERASE=y on archs that don't
select THREAD_INFO_IN_TASK (such as LoongArch):

  error: passing 'const struct task_struct *' to parameter of type 'struct task_struct *' discards qualifiers

The stackleak_task_low_bound() function correctly uses a const task
parameter, but the legacy end_of_stack() prototype didn't like that.

Build tested on loongarch (with CONFIG_KSTACK_ERASE=y) and m68k
(with CONFIG_DEBUG_STACK_USAGE=y).

Fixes: a45728fd4120 ("LoongArch: Enable HAVE_ARCH_STACKLEAK")
Reported-by: Nathan Chancellor <nathan@kernel.org>
Closes: https://lore.kernel.org/all/20250726004313.GA3650901@ax162
Cc: Youling Tang <tangyouling@kylinos.cn>
Cc: Huacai Chen <chenhuacai@loongson.cn>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Kees Cook <kees@kernel.org>
---
 include/linux/sched/task_stack.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h
index 85c5a6392e02..1fab7e9043a3 100644
--- a/include/linux/sched/task_stack.h
+++ b/include/linux/sched/task_stack.h
@@ -53,7 +53,7 @@ static inline void setup_thread_stack(struct task_struct *p, struct task_struct
  * When the stack grows up, this is the highest address.
  * Beyond that position, we corrupt data on the next page.
  */
-static inline unsigned long *end_of_stack(struct task_struct *p)
+static inline unsigned long *end_of_stack(const struct task_struct *p)
 {
 #ifdef CONFIG_STACK_GROWSUP
 	return (unsigned long *)((unsigned long)task_thread_info(p) + THREAD_SIZE) - 1;
-- 
cgit v1.2.3


From 48e6561b667e7f0623da3ca34e2b93b7ae2a5d8d Mon Sep 17 00:00:00 2001
From: Zi Yan <ziy@nvidia.com>
Date: Tue, 22 Jul 2025 15:46:49 -0400
Subject: mm/page_alloc: remove trace_mm_alloc_contig_migrate_range_info()

The trace event has not recorded the right data since it was introduced at
commit c8b360031218 ("mm: add alloc_contig_migrate_range allocation
statistics").  Remove it.

Link: https://lkml.kernel.org/r/20250722194649.4135191-1-ziy@nvidia.com
Signed-off-by: Zi Yan <ziy@nvidia.com>
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202507220742.P3SaKlI6-lkp@intel.com/
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Martin Liu <liumartin@google.com>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Richard Chang <richardycc@google.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/trace/events/kmem.h | 40 ----------------------------------------
 mm/page_alloc.c             | 32 +++-----------------------------
 2 files changed, 3 insertions(+), 69 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index efffcf578217..474358773abe 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -304,46 +304,6 @@ TRACE_EVENT(mm_page_alloc_extfrag,
 		__entry->change_ownership)
 );
 
-#ifdef CONFIG_CONTIG_ALLOC
-TRACE_EVENT(mm_alloc_contig_migrate_range_info,
-
-	TP_PROTO(unsigned long start,
-		 unsigned long end,
-		 unsigned long nr_migrated,
-		 unsigned long nr_reclaimed,
-		 unsigned long nr_mapped,
-		 acr_flags_t alloc_flags),
-
-	TP_ARGS(start, end, nr_migrated, nr_reclaimed, nr_mapped, alloc_flags),
-
-	TP_STRUCT__entry(
-		__field(unsigned long, start)
-		__field(unsigned long, end)
-		__field(unsigned long, nr_migrated)
-		__field(unsigned long, nr_reclaimed)
-		__field(unsigned long, nr_mapped)
-		__field(acr_flags_t, alloc_flags)
-	),
-
-	TP_fast_assign(
-		__entry->start = start;
-		__entry->end = end;
-		__entry->nr_migrated = nr_migrated;
-		__entry->nr_reclaimed = nr_reclaimed;
-		__entry->nr_mapped = nr_mapped;
-		__entry->alloc_flags = alloc_flags;
-	),
-
-	TP_printk("start=0x%lx end=0x%lx alloc_flags=%d nr_migrated=%lu nr_reclaimed=%lu nr_mapped=%lu",
-		  __entry->start,
-		  __entry->end,
-		  __entry->alloc_flags,
-		  __entry->nr_migrated,
-		  __entry->nr_reclaimed,
-		  __entry->nr_mapped)
-);
-#endif
-
 TRACE_EVENT(mm_setup_per_zone_wmarks,
 
 	TP_PROTO(struct zone *zone),
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fa09154a799c..d1d037f97c5f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6694,14 +6694,9 @@ static void alloc_contig_dump_pages(struct list_head *page_list)
 	}
 }
 
-/*
- * [start, end) must belong to a single zone.
- * @alloc_flags: using acr_flags_t to filter the type of migration in
- *		trace_mm_alloc_contig_migrate_range_info.
- */
+/* [start, end) must belong to a single zone. */
 static int __alloc_contig_migrate_range(struct compact_control *cc,
-					unsigned long start, unsigned long end,
-					acr_flags_t alloc_flags)
+					unsigned long start, unsigned long end)
 {
 	/* This function is based on compact_zone() from compaction.c. */
 	unsigned int nr_reclaimed;
@@ -6713,10 +6708,6 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 		.gfp_mask = cc->gfp_mask,
 		.reason = MR_CONTIG_RANGE,
 	};
-	struct page *page;
-	unsigned long total_mapped = 0;
-	unsigned long total_migrated = 0;
-	unsigned long total_reclaimed = 0;
 
 	lru_cache_disable();
 
@@ -6742,22 +6733,9 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 							&cc->migratepages);
 		cc->nr_migratepages -= nr_reclaimed;
 
-		if (trace_mm_alloc_contig_migrate_range_info_enabled()) {
-			total_reclaimed += nr_reclaimed;
-			list_for_each_entry(page, &cc->migratepages, lru) {
-				struct folio *folio = page_folio(page);
-
-				total_mapped += folio_mapped(folio) *
-						folio_nr_pages(folio);
-			}
-		}
-
 		ret = migrate_pages(&cc->migratepages, alloc_migration_target,
 			NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL);
 
-		if (trace_mm_alloc_contig_migrate_range_info_enabled() && !ret)
-			total_migrated += cc->nr_migratepages;
-
 		/*
 		 * On -ENOMEM, migrate_pages() bails out right away. It is pointless
 		 * to retry again over this error, so do the same here.
@@ -6773,10 +6751,6 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 		putback_movable_pages(&cc->migratepages);
 	}
 
-	trace_mm_alloc_contig_migrate_range_info(start, end, alloc_flags,
-						 total_migrated,
-						 total_reclaimed,
-						 total_mapped);
 	return (ret < 0) ? ret : 0;
 }
 
@@ -6921,7 +6895,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
 	 * allocated.  So, if we fall through be sure to clear ret so that
 	 * -EBUSY is not accidentally used or returned to caller.
 	 */
-	ret = __alloc_contig_migrate_range(&cc, start, end, alloc_flags);
+	ret = __alloc_contig_migrate_range(&cc, start, end);
 	if (ret && ret != -EBUSY)
 		goto done;
 
-- 
cgit v1.2.3


From 97c01e65ef4c1878532be245b2899fc4363cc453 Mon Sep 17 00:00:00 2001
From: Vicki Pfau <vi@endrift.com>
Date: Sun, 27 Jul 2025 01:15:17 -0700
Subject: Input: Add and document BTN_GRIP*

Many controllers these days have started including grip buttons. As
no BTN_* constants have been specifically assigned for these, they've
been haphazardly assigned to BTN_TRIGGER_HAPPY*. Unfortunately, the
assignment of these has varied significantly between drivers.

Add and document new constants for these grip buttons.

Signed-off-by: Vicki Pfau <vi@endrift.com>
Link: https://lore.kernel.org/r/20250702040102.125432-2-vi@endrift.com
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 Documentation/input/gamepad.rst        | 13 +++++++++++++
 drivers/hid/hid-debug.c                |  2 ++
 include/uapi/linux/input-event-codes.h |  5 +++++
 3 files changed, 20 insertions(+)

(limited to 'include')

diff --git a/Documentation/input/gamepad.rst b/Documentation/input/gamepad.rst
index 2bba721aa20b..0c918b6f288b 100644
--- a/Documentation/input/gamepad.rst
+++ b/Documentation/input/gamepad.rst
@@ -190,6 +190,19 @@ Gamepads report the following events:
 
   Rumble is advertised as FF_RUMBLE.
 
+- Grip buttons:
+
+  Many pads include buttons on the rear, usually referred to as either grip or
+  rear buttons, or paddles. These are often reprogrammable by the firmware to
+  appear as "normal" buttons, but are sometimes exposed to software too. Some
+  notable examples of this are the Steam Deck, which has R4, R5, L4, and L5 on
+  the back; the Xbox Elite pads, which have P1-P4; and the Switch 2 Pro
+  Controller, which has GL and GR.
+
+  For these controllers, BTN_GRIPR and BTN_GRIPR2 should be used for the top
+  and bottom (if present) right grip button(s), and BTN_GRIPL and BTN_GRIPL2
+  should be used for the top and bottom (if present) left grip button(s).
+
 - Profile:
 
   Some pads provide a multi-value profile selection switch. Examples include
diff --git a/drivers/hid/hid-debug.c b/drivers/hid/hid-debug.c
index 8433306148d5..3cd9c1150cdf 100644
--- a/drivers/hid/hid-debug.c
+++ b/drivers/hid/hid-debug.c
@@ -3291,6 +3291,8 @@ static const char *keys[KEY_MAX + 1] = {
 	[BTN_TR2] = "BtnTR2",			[BTN_SELECT] = "BtnSelect",
 	[BTN_START] = "BtnStart",		[BTN_MODE] = "BtnMode",
 	[BTN_THUMBL] = "BtnThumbL",		[BTN_THUMBR] = "BtnThumbR",
+	[BTN_GRIPL] = "BtnGripL",		[BTN_GRIPR] = "BtnGripR",
+	[BTN_GRIPL2] = "BtnGripL2",		[BTN_GRIPR2] = "BtnGripR2",
 	[BTN_TOOL_PEN] = "ToolPen",		[BTN_TOOL_RUBBER] = "ToolRubber",
 	[BTN_TOOL_BRUSH] = "ToolBrush",		[BTN_TOOL_PENCIL] = "ToolPencil",
 	[BTN_TOOL_AIRBRUSH] = "ToolAirbrush",	[BTN_TOOL_FINGER] = "ToolFinger",
diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h
index 5a199f3d4a26..5426297d93fd 100644
--- a/include/uapi/linux/input-event-codes.h
+++ b/include/uapi/linux/input-event-codes.h
@@ -601,6 +601,11 @@
 #define BTN_DPAD_LEFT		0x222
 #define BTN_DPAD_RIGHT		0x223
 
+#define BTN_GRIPL		0x224
+#define BTN_GRIPR		0x225
+#define BTN_GRIPL2		0x226
+#define BTN_GRIPR2		0x227
+
 #define KEY_ALS_TOGGLE		0x230	/* Ambient light sensor */
 #define KEY_ROTATE_LOCK_TOGGLE	0x231	/* Display rotation lock */
 #define KEY_REFRESH_RATE_TOGGLE	0x232	/* Display refresh rate toggle */
-- 
cgit v1.2.3


From ea4d331050b4cd43e6a900937db88b01ef75e1f2 Mon Sep 17 00:00:00 2001
From: Javier Carrasco <javier.carrasco@wolfvision.net>
Date: Wed, 16 Oct 2024 06:02:41 +0200
Subject: Input: touch-overlay - add touchscreen overlay handling

Some touch devices provide mechanical overlays with different objects
like buttons or clipped touchscreen surfaces.

In order to support these objects, add a series of helper functions
to the input subsystem to transform them into overlay objects via
device tree nodes.

These overlay objects consume the raw touch events and report the
expected input events depending on the object properties.

Note that the current implementation allows for multiple definitions
of touchscreen areas (regions that report touch events), but only the
first one will be used for the touchscreen device that the consumers
typically provide.
Should the need for multiple touchscreen areas arise, additional
touchscreen devices would be required at the consumer side.
There is no limitation in the number of touch areas defined as buttons.

Reviewed-by: Jeff LaBundy <jeff@labundy.com>
Signed-off-by: Javier Carrasco <javier.carrasco@wolfvision.net>
Link: https://lore.kernel.org/r/20241016-feature-ts_virtobj_patch-v11-2-b292a1bbb0a1@wolfvision.net
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 MAINTAINERS                         |   7 +
 drivers/input/Makefile              |   2 +-
 drivers/input/touch-overlay.c       | 277 ++++++++++++++++++++++++++++++++++++
 include/linux/input/touch-overlay.h |  25 ++++
 4 files changed, 310 insertions(+), 1 deletion(-)
 create mode 100644 drivers/input/touch-overlay.c
 create mode 100644 include/linux/input/touch-overlay.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index adcd58147f97..3466d0d59c5f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -24609,6 +24609,13 @@ L:	platform-driver-x86@vger.kernel.org
 S:	Maintained
 F:	drivers/platform/x86/toshiba-wmi.c
 
+TOUCH OVERLAY
+M:	Javier Carrasco <javier.carrasco@wolfvision.net>
+L:	linux-input@vger.kernel.org
+S:	Maintained
+F:	drivers/input/touch-overlay.c
+F:	include/linux/input/touch-overlay.h
+
 TPM DEVICE DRIVER
 M:	Peter Huewe <peterhuewe@gmx.de>
 M:	Jarkko Sakkinen <jarkko@kernel.org>
diff --git a/drivers/input/Makefile b/drivers/input/Makefile
index 930b64d2115e..2cd6e1c9a778 100644
--- a/drivers/input/Makefile
+++ b/drivers/input/Makefile
@@ -7,7 +7,7 @@
 
 obj-$(CONFIG_INPUT)		+= input-core.o
 input-core-y := input.o input-compat.o input-mt.o input-poller.o ff-core.o
-input-core-y += touchscreen.o
+input-core-y += touchscreen.o touch-overlay.o
 
 obj-$(CONFIG_INPUT_FF_MEMLESS)	+= ff-memless.o
 obj-$(CONFIG_INPUT_SPARSEKMAP)	+= sparse-keymap.o
diff --git a/drivers/input/touch-overlay.c b/drivers/input/touch-overlay.c
new file mode 100644
index 000000000000..8806373f7a4a
--- /dev/null
+++ b/drivers/input/touch-overlay.c
@@ -0,0 +1,277 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *  Helper functions for overlay objects on touchscreens
+ *
+ *  Copyright (c) 2023 Javier Carrasco <javier.carrasco@wolfvision.net>
+ */
+
+#include <linux/input.h>
+#include <linux/input/mt.h>
+#include <linux/input/touch-overlay.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/property.h>
+
+struct touch_overlay_segment {
+	struct list_head list;
+	u32 x_origin;
+	u32 y_origin;
+	u32 x_size;
+	u32 y_size;
+	u32 key;
+	bool pressed;
+	int slot;
+};
+
+static int touch_overlay_get_segment(struct fwnode_handle *segment_node,
+				     struct touch_overlay_segment *segment,
+				     struct input_dev *input)
+{
+	int error;
+
+	error = fwnode_property_read_u32(segment_node, "x-origin",
+					 &segment->x_origin);
+	if (error)
+		return error;
+
+	error = fwnode_property_read_u32(segment_node, "y-origin",
+					 &segment->y_origin);
+	if (error)
+		return error;
+
+	error = fwnode_property_read_u32(segment_node, "x-size",
+					 &segment->x_size);
+	if (error)
+		return error;
+
+	error = fwnode_property_read_u32(segment_node, "y-size",
+					 &segment->y_size);
+	if (error)
+		return error;
+
+	error = fwnode_property_read_u32(segment_node, "linux,code",
+					 &segment->key);
+	if (!error)
+		input_set_capability(input, EV_KEY, segment->key);
+	else if (error != -EINVAL)
+		return error;
+
+	return 0;
+}
+
+/**
+ * touch_overlay_map - map overlay objects from the device tree and set
+ * key capabilities if buttons are defined.
+ * @list: pointer to the list that will hold the segments
+ * @input: pointer to the already allocated input_dev
+ *
+ * Returns 0 on success and error number otherwise.
+ *
+ * If buttons are defined, key capabilities are set accordingly.
+ */
+int touch_overlay_map(struct list_head *list, struct input_dev *input)
+{
+	struct fwnode_handle *fw_segment;
+	struct device *dev = input->dev.parent;
+	struct touch_overlay_segment *segment;
+	int error;
+
+	struct fwnode_handle *overlay __free(fwnode_handle) =
+		device_get_named_child_node(dev, "touch-overlay");
+	if (!overlay)
+		return 0;
+
+	fwnode_for_each_available_child_node(overlay, fw_segment) {
+		segment = devm_kzalloc(dev, sizeof(*segment), GFP_KERNEL);
+		if (!segment) {
+			fwnode_handle_put(fw_segment);
+			return -ENOMEM;
+		}
+		error = touch_overlay_get_segment(fw_segment, segment, input);
+		if (error) {
+			fwnode_handle_put(fw_segment);
+			return error;
+		}
+		list_add_tail(&segment->list, list);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(touch_overlay_map);
+
+/**
+ * touch_overlay_get_touchscreen_abs - get abs size from the touchscreen area.
+ * @list: pointer to the list that holds the segments
+ * @x: horizontal abs
+ * @y: vertical abs
+ */
+void touch_overlay_get_touchscreen_abs(struct list_head *list, u16 *x, u16 *y)
+{
+	struct touch_overlay_segment *segment;
+	struct list_head *ptr;
+
+	list_for_each(ptr, list) {
+		segment = list_entry(ptr, struct touch_overlay_segment, list);
+		if (!segment->key) {
+			*x = segment->x_size - 1;
+			*y = segment->y_size - 1;
+			break;
+		}
+	}
+}
+EXPORT_SYMBOL(touch_overlay_get_touchscreen_abs);
+
+static bool touch_overlay_segment_event(struct touch_overlay_segment *seg,
+					struct input_mt_pos *pos)
+{
+	if (pos->x >= seg->x_origin && pos->x < (seg->x_origin + seg->x_size) &&
+	    pos->y >= seg->y_origin && pos->y < (seg->y_origin + seg->y_size))
+		return true;
+
+	return false;
+}
+
+/**
+ * touch_overlay_mapped_touchscreen - check if a touchscreen area is mapped
+ * @list: pointer to the list that holds the segments
+ *
+ * Returns true if a touchscreen area is mapped or false otherwise.
+ */
+bool touch_overlay_mapped_touchscreen(struct list_head *list)
+{
+	struct touch_overlay_segment *segment;
+	struct list_head *ptr;
+
+	list_for_each(ptr, list) {
+		segment = list_entry(ptr, struct touch_overlay_segment, list);
+		if (!segment->key)
+			return true;
+	}
+
+	return false;
+}
+EXPORT_SYMBOL(touch_overlay_mapped_touchscreen);
+
+static bool touch_overlay_event_on_ts(struct list_head *list,
+				      struct input_mt_pos *pos)
+{
+	struct touch_overlay_segment *segment;
+	struct list_head *ptr;
+
+	list_for_each(ptr, list) {
+		segment = list_entry(ptr, struct touch_overlay_segment, list);
+		if (segment->key)
+			continue;
+
+		if (touch_overlay_segment_event(segment, pos)) {
+			pos->x -= segment->x_origin;
+			pos->y -= segment->y_origin;
+			return true;
+		}
+		/* ignore touch events outside the defined area */
+		return false;
+	}
+
+	return true;
+}
+
+static bool touch_overlay_button_event(struct input_dev *input,
+				       struct touch_overlay_segment *segment,
+				       struct input_mt_pos *pos, int slot)
+{
+	struct input_mt *mt = input->mt;
+	struct input_mt_slot *s = &mt->slots[slot];
+	bool button_contact = touch_overlay_segment_event(segment, pos);
+
+	if (segment->slot == slot && segment->pressed) {
+		/* sliding out of the button releases it */
+		if (!button_contact) {
+			input_report_key(input, segment->key, false);
+			segment->pressed = false;
+			/* keep available for a possible touch event */
+			return false;
+		}
+		/* ignore sliding on the button while pressed */
+		s->frame = mt->frame;
+		return true;
+	} else if (button_contact) {
+		input_report_key(input, segment->key, true);
+		s->frame = mt->frame;
+		segment->slot = slot;
+		segment->pressed = true;
+		return true;
+	}
+
+	return false;
+}
+
+/**
+ * touch_overlay_sync_frame - update the status of the segments and report
+ * buttons whose tracked slot is unused.
+ * @list: pointer to the list that holds the segments
+ * @input: pointer to the input device associated to the contact
+ */
+void touch_overlay_sync_frame(struct list_head *list, struct input_dev *input)
+{
+	struct touch_overlay_segment *segment;
+	struct input_mt *mt = input->mt;
+	struct input_mt_slot *s;
+	struct list_head *ptr;
+
+	list_for_each(ptr, list) {
+		segment = list_entry(ptr, struct touch_overlay_segment, list);
+		if (!segment->key)
+			continue;
+
+		s = &mt->slots[segment->slot];
+		if (!input_mt_is_used(mt, s) && segment->pressed) {
+			input_report_key(input, segment->key, false);
+			segment->pressed = false;
+		}
+	}
+}
+EXPORT_SYMBOL(touch_overlay_sync_frame);
+
+/**
+ * touch_overlay_process_contact - process contacts according to the overlay
+ * mapping. This function acts as a filter to release the calling driver
+ * from the contacts that are either related to overlay buttons or out of the
+ * overlay touchscreen area, if defined.
+ * @list: pointer to the list that holds the segments
+ * @input: pointer to the input device associated to the contact
+ * @pos: pointer to the contact position
+ * @slot: slot associated to the contact (0 if multitouch is not supported)
+ *
+ * Returns true if the contact was processed (reported for valid key events
+ * and dropped for contacts outside the overlay touchscreen area) or false
+ * if the contact must be processed by the caller. In that case this function
+ * shifts the (x,y) coordinates to the overlay touchscreen axis if required.
+ */
+bool touch_overlay_process_contact(struct list_head *list,
+				   struct input_dev *input,
+				   struct input_mt_pos *pos, int slot)
+{
+	struct touch_overlay_segment *segment;
+	struct list_head *ptr;
+
+	/*
+	 * buttons must be prioritized over overlay touchscreens to account for
+	 * overlappings e.g. a button inside the touchscreen area.
+	 */
+	list_for_each(ptr, list) {
+		segment = list_entry(ptr, struct touch_overlay_segment, list);
+		if (segment->key &&
+		    touch_overlay_button_event(input, segment, pos, slot))
+			return true;
+	}
+
+	/*
+	 * valid contacts on the overlay touchscreen are left for the client
+	 * to be processed/reported according to its (possibly) unique features.
+	 */
+	return !touch_overlay_event_on_ts(list, pos);
+}
+EXPORT_SYMBOL(touch_overlay_process_contact);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Helper functions for overlay objects on touch devices");
diff --git a/include/linux/input/touch-overlay.h b/include/linux/input/touch-overlay.h
new file mode 100644
index 000000000000..0253e554d3cd
--- /dev/null
+++ b/include/linux/input/touch-overlay.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2023 Javier Carrasco <javier.carrasco@wolfvision.net>
+ */
+
+#ifndef _TOUCH_OVERLAY
+#define _TOUCH_OVERLAY
+
+#include <linux/types.h>
+
+struct input_dev;
+
+int touch_overlay_map(struct list_head *list, struct input_dev *input);
+
+void touch_overlay_get_touchscreen_abs(struct list_head *list, u16 *x, u16 *y);
+
+bool touch_overlay_mapped_touchscreen(struct list_head *list);
+
+bool touch_overlay_process_contact(struct list_head *list,
+				   struct input_dev *input,
+				   struct input_mt_pos *pos, int slot);
+
+void touch_overlay_sync_frame(struct list_head *list, struct input_dev *input);
+
+#endif
-- 
cgit v1.2.3


From 56eb7c13b97c6f9e2fed9e9899b01d1a6a595f28 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 26 Jun 2025 19:08:12 +0300
Subject: mtd: map: Don't use "proxy" headers

Update header inclusions to follow IWYU (Include What You Use)
principle.

Note that including kernel.h is discouraged, as stated in the comment
at the top of that file.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 include/linux/mtd/map.h | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/mtd/map.h b/include/linux/mtd/map.h
index 1b56796f6cb3..288ef765a44e 100644
--- a/include/linux/mtd/map.h
+++ b/include/linux/mtd/map.h
@@ -8,15 +8,15 @@
 #ifndef __LINUX_MTD_MAP_H__
 #define __LINUX_MTD_MAP_H__
 
-#include <linux/types.h>
-#include <linux/list.h>
-#include <linux/string.h>
 #include <linux/bug.h>
-#include <linux/kernel.h>
 #include <linux/io.h>
-
+#include <linux/ioport.h>
+#include <linux/string.h>
+#include <linux/types.h>
 #include <linux/unaligned.h>
-#include <asm/barrier.h>
+
+struct device_node;
+struct module;
 
 #ifdef CONFIG_MTD_MAP_BANK_WIDTH_1
 #define map_bankwidth(map) 1
@@ -188,6 +188,7 @@ typedef union {
    of living.
 */
 
+struct mtd_chip_driver;
 struct map_info {
 	const char *name;
 	unsigned long size;
-- 
cgit v1.2.3


From 08da98e1b2f76cdbacf84b9affaa75960dbce515 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Tue, 8 Jul 2025 16:36:40 +0200
Subject: fsnotify: merge file_set_fsnotify_mode_from_watchers() with open perm
 hook

Create helper fsnotify_open_perm_and_set_mode() that moves the
fsnotify_open_perm() hook into file_set_fsnotify_mode_from_watchers().

This will allow some more optimizations.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://patch.msgid.link/20250708143641.418603-2-amir73il@gmail.com
---
 fs/file_table.c          |  2 +-
 fs/notify/fsnotify.c     | 22 +++++++++++++---------
 fs/open.c                |  6 +++---
 include/linux/fsnotify.h |  8 +++-----
 4 files changed, 20 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/fs/file_table.c b/fs/file_table.c
index 138114d64307..14ee3581c768 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -196,7 +196,7 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
 	file_ref_init(&f->f_ref, 1);
 	/*
 	 * Disable permission and pre-content events for all files by default.
-	 * They may be enabled later by file_set_fsnotify_mode_from_watchers().
+	 * They may be enabled later by fsnotify_open_perm_and_set_mode().
 	 */
 	file_set_fsnotify_mode(f, FMODE_NONOTIFY_PERM);
 	return 0;
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index e2b4f17a48bb..de7e7425428b 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -656,12 +656,12 @@ EXPORT_SYMBOL_GPL(fsnotify);
 
 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
 /*
- * At open time we check fsnotify_sb_has_priority_watchers() and set the
- * FMODE_NONOTIFY_ mode bits accordignly.
+ * At open time we check fsnotify_sb_has_priority_watchers(), call the open perm
+ * hook and set the FMODE_NONOTIFY_ mode bits accordignly.
  * Later, fsnotify permission hooks do not check if there are permission event
  * watches, but that there were permission event watches at open time.
  */
-void file_set_fsnotify_mode_from_watchers(struct file *file)
+int fsnotify_open_perm_and_set_mode(struct file *file)
 {
 	struct dentry *dentry = file->f_path.dentry, *parent;
 	struct super_block *sb = dentry->d_sb;
@@ -669,7 +669,7 @@ void file_set_fsnotify_mode_from_watchers(struct file *file)
 
 	/* Is it a file opened by fanotify? */
 	if (FMODE_FSNOTIFY_NONE(file->f_mode))
-		return;
+		return 0;
 
 	/*
 	 * Permission events is a super set of pre-content events, so if there
@@ -679,7 +679,7 @@ void file_set_fsnotify_mode_from_watchers(struct file *file)
 	if (likely(!fsnotify_sb_has_priority_watchers(sb,
 						FSNOTIFY_PRIO_CONTENT))) {
 		file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM);
-		return;
+		return 0;
 	}
 
 	/*
@@ -689,8 +689,9 @@ void file_set_fsnotify_mode_from_watchers(struct file *file)
 	if ((!d_is_dir(dentry) && !d_is_reg(dentry)) ||
 	    likely(!fsnotify_sb_has_priority_watchers(sb,
 						FSNOTIFY_PRIO_PRE_CONTENT))) {
-		file_set_fsnotify_mode(file, FMODE_NONOTIFY | FMODE_NONOTIFY_PERM);
-		return;
+		file_set_fsnotify_mode(file, FMODE_NONOTIFY |
+				       FMODE_NONOTIFY_PERM);
+		goto open_perm;
 	}
 
 	/*
@@ -702,7 +703,7 @@ void file_set_fsnotify_mode_from_watchers(struct file *file)
 				     FSNOTIFY_PRE_CONTENT_EVENTS))) {
 		/* Enable pre-content events */
 		file_set_fsnotify_mode(file, 0);
-		return;
+		goto open_perm;
 	}
 
 	/* Is parent watching for pre-content events on this file? */
@@ -713,11 +714,14 @@ void file_set_fsnotify_mode_from_watchers(struct file *file)
 		if (p_mask & FSNOTIFY_PRE_CONTENT_EVENTS) {
 			/* Enable pre-content events */
 			file_set_fsnotify_mode(file, 0);
-			return;
+			goto open_perm;
 		}
 	}
 	/* Nobody watching for pre-content events from this file */
 	file_set_fsnotify_mode(file, FMODE_NONOTIFY | FMODE_NONOTIFY_PERM);
+
+open_perm:
+	return fsnotify_open_perm(file);
 }
 #endif
 
diff --git a/fs/open.c b/fs/open.c
index 7828234a7caa..f240b96ce586 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -943,12 +943,12 @@ static int do_dentry_open(struct file *f,
 		goto cleanup_all;
 
 	/*
-	 * Set FMODE_NONOTIFY_* bits according to existing permission watches.
+	 * Call fsnotify open permission hook and set FMODE_NONOTIFY_* bits
+	 * according to existing permission watches.
 	 * If FMODE_NONOTIFY mode was already set for an fanotify fd or for a
 	 * pseudo file, this call will not change the mode.
 	 */
-	file_set_fsnotify_mode_from_watchers(f);
-	error = fsnotify_open_perm(f);
+	error = fsnotify_open_perm_and_set_mode(f);
 	if (error)
 		goto cleanup_all;
 
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 454d8e466958..8c1fa617d375 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -129,7 +129,7 @@ static inline int fsnotify_file(struct file *file, __u32 mask)
 
 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
 
-void file_set_fsnotify_mode_from_watchers(struct file *file);
+int fsnotify_open_perm_and_set_mode(struct file *file);
 
 /*
  * fsnotify_file_area_perm - permission hook before access to file range
@@ -215,9 +215,6 @@ static inline int fsnotify_open_perm(struct file *file)
 {
 	int ret;
 
-	if (likely(!FMODE_FSNOTIFY_PERM(file->f_mode)))
-		return 0;
-
 	if (file->f_flags & __FMODE_EXEC) {
 		ret = fsnotify_path(&file->f_path, FS_OPEN_EXEC_PERM);
 		if (ret)
@@ -228,8 +225,9 @@ static inline int fsnotify_open_perm(struct file *file)
 }
 
 #else
-static inline void file_set_fsnotify_mode_from_watchers(struct file *file)
+static inline int fsnotify_open_perm_and_set_mode(struct file *file)
 {
+	return 0;
 }
 
 static inline int fsnotify_file_area_perm(struct file *file, int perm_mask,
-- 
cgit v1.2.3


From 0d4c4d4ea443babab6ec1a79f481260963fc969a Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Tue, 8 Jul 2025 16:36:41 +0200
Subject: fsnotify: optimize FMODE_NONOTIFY_PERM for the common cases

The most unlikely watched permission event is FAN_ACCESS_PERM, because
at the time that it was introduced there were no evictable ignore marks,
so subscribing to FAN_ACCESS_PERM would have incurred a very high
overhead.

Yet, when we set the fmode to FMODE_FSNOTIFY_HSM(), we never skip trying
to send FAN_ACCESS_PERM, which is almost always a waste of cycles.

We got to this logic because of bundling FAN_OPEN*_PERM and
FAN_ACCESS_PERM in the same category and because FAN_OPEN_PERM is a
commonly used event.

By open coding fsnotify_open_perm() in fsnotify_open_perm_and_set_mode(),
we no longer need to regard FAN_OPEN*_PERM when calculating fmode.

This leaves the case of having pre-content events and not having any
other permission event in the object masks a more likely case than the
other way around.

Rework the fmode macros and code so that their meaning now refers only
to hooks on an already open file:

- FMODE_FSNOTIFY_NONE()		skip all events
- FMODE_FSNOTIFY_ACCESS_PERM()	send all permission events including
  				FAN_ACCESS_PERM
- FMODE_FSNOTIFY_HSM()		send pre-content permission events

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://patch.msgid.link/20250708143641.418603-3-amir73il@gmail.com
---
 fs/notify/fsnotify.c     | 75 +++++++++++++++++++++++++++++-------------------
 include/linux/fs.h       | 12 ++++----
 include/linux/fsnotify.h | 27 ++---------------
 3 files changed, 53 insertions(+), 61 deletions(-)

(limited to 'include')

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index de7e7425428b..079b868552c2 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -199,8 +199,8 @@ static bool fsnotify_event_needs_parent(struct inode *inode, __u32 mnt_mask,
 }
 
 /* Are there any inode/mount/sb objects that watch for these events? */
-static inline bool fsnotify_object_watched(struct inode *inode, __u32 mnt_mask,
-					   __u32 mask)
+static inline __u32 fsnotify_object_watched(struct inode *inode, __u32 mnt_mask,
+					    __u32 mask)
 {
 	__u32 marks_mask = READ_ONCE(inode->i_fsnotify_mask) | mnt_mask |
 			   READ_ONCE(inode->i_sb->s_fsnotify_mask);
@@ -665,7 +665,7 @@ int fsnotify_open_perm_and_set_mode(struct file *file)
 {
 	struct dentry *dentry = file->f_path.dentry, *parent;
 	struct super_block *sb = dentry->d_sb;
-	__u32 mnt_mask, p_mask;
+	__u32 mnt_mask, p_mask = 0;
 
 	/* Is it a file opened by fanotify? */
 	if (FMODE_FSNOTIFY_NONE(file->f_mode))
@@ -683,45 +683,60 @@ int fsnotify_open_perm_and_set_mode(struct file *file)
 	}
 
 	/*
-	 * If there are permission event watchers but no pre-content event
-	 * watchers, set FMODE_NONOTIFY | FMODE_NONOTIFY_PERM to indicate that.
+	 * OK, there are some permission event watchers. Check if anybody is
+	 * watching for permission events on *this* file.
 	 */
-	if ((!d_is_dir(dentry) && !d_is_reg(dentry)) ||
-	    likely(!fsnotify_sb_has_priority_watchers(sb,
-						FSNOTIFY_PRIO_PRE_CONTENT))) {
-		file_set_fsnotify_mode(file, FMODE_NONOTIFY |
-				       FMODE_NONOTIFY_PERM);
-		goto open_perm;
+	mnt_mask = READ_ONCE(real_mount(file->f_path.mnt)->mnt_fsnotify_mask);
+	p_mask = fsnotify_object_watched(d_inode(dentry), mnt_mask,
+					 ALL_FSNOTIFY_PERM_EVENTS);
+	if (dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED) {
+		parent = dget_parent(dentry);
+		p_mask |= fsnotify_inode_watches_children(d_inode(parent));
+		dput(parent);
 	}
 
 	/*
-	 * OK, there are some pre-content watchers. Check if anybody is
-	 * watching for pre-content events on *this* file.
+	 * Legacy FAN_ACCESS_PERM events have very high performance overhead,
+	 * so unlikely to be used in the wild. If they are used there will be
+	 * no optimizations at all.
 	 */
-	mnt_mask = READ_ONCE(real_mount(file->f_path.mnt)->mnt_fsnotify_mask);
-	if (unlikely(fsnotify_object_watched(d_inode(dentry), mnt_mask,
-				     FSNOTIFY_PRE_CONTENT_EVENTS))) {
-		/* Enable pre-content events */
+	if (unlikely(p_mask & FS_ACCESS_PERM)) {
+		/* Enable all permission and pre-content events */
 		file_set_fsnotify_mode(file, 0);
 		goto open_perm;
 	}
 
-	/* Is parent watching for pre-content events on this file? */
-	if (dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED) {
-		parent = dget_parent(dentry);
-		p_mask = fsnotify_inode_watches_children(d_inode(parent));
-		dput(parent);
-		if (p_mask & FSNOTIFY_PRE_CONTENT_EVENTS) {
-			/* Enable pre-content events */
-			file_set_fsnotify_mode(file, 0);
-			goto open_perm;
-		}
+	/*
+	 * Pre-content events are only supported on regular files.
+	 * If there are pre-content event watchers and no permission access
+	 * watchers, set FMODE_NONOTIFY | FMODE_NONOTIFY_PERM to indicate that.
+	 * That is the common case with HSM service.
+	 */
+	if (d_is_reg(dentry) && (p_mask & FSNOTIFY_PRE_CONTENT_EVENTS)) {
+		file_set_fsnotify_mode(file, FMODE_NONOTIFY |
+					     FMODE_NONOTIFY_PERM);
+		goto open_perm;
 	}
-	/* Nobody watching for pre-content events from this file */
-	file_set_fsnotify_mode(file, FMODE_NONOTIFY | FMODE_NONOTIFY_PERM);
+
+	/* Nobody watching permission and pre-content events on this file */
+	file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM);
 
 open_perm:
-	return fsnotify_open_perm(file);
+	/*
+	 * Send open perm events depending on object masks and regardless of
+	 * FMODE_NONOTIFY_PERM.
+	 */
+	if (file->f_flags & __FMODE_EXEC && p_mask & FS_OPEN_EXEC_PERM) {
+		int ret = fsnotify_path(&file->f_path, FS_OPEN_EXEC_PERM);
+
+		if (ret)
+			return ret;
+	}
+
+	if (p_mask & FS_OPEN_PERM)
+		return fsnotify_path(&file->f_path, FS_OPEN_PERM);
+
+	return 0;
 }
 #endif
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b085f161ed22..bc92fdb8bfcc 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -200,12 +200,12 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 
 /*
  * The two FMODE_NONOTIFY* define which fsnotify events should not be generated
- * for a file. These are the possible values of (f->f_mode &
- * FMODE_FSNOTIFY_MASK) and their meaning:
+ * for an open file. These are the possible values of
+ * (f->f_mode & FMODE_FSNOTIFY_MASK) and their meaning:
  *
  * FMODE_NONOTIFY - suppress all (incl. non-permission) events.
  * FMODE_NONOTIFY_PERM - suppress permission (incl. pre-content) events.
- * FMODE_NONOTIFY | FMODE_NONOTIFY_PERM - suppress only pre-content events.
+ * FMODE_NONOTIFY | FMODE_NONOTIFY_PERM - suppress only FAN_ACCESS_PERM.
  */
 #define FMODE_FSNOTIFY_MASK \
 	(FMODE_NONOTIFY | FMODE_NONOTIFY_PERM)
@@ -213,13 +213,13 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 #define FMODE_FSNOTIFY_NONE(mode) \
 	((mode & FMODE_FSNOTIFY_MASK) == FMODE_NONOTIFY)
 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
-#define FMODE_FSNOTIFY_PERM(mode) \
+#define FMODE_FSNOTIFY_HSM(mode) \
 	((mode & FMODE_FSNOTIFY_MASK) == 0 || \
 	 (mode & FMODE_FSNOTIFY_MASK) == (FMODE_NONOTIFY | FMODE_NONOTIFY_PERM))
-#define FMODE_FSNOTIFY_HSM(mode) \
+#define FMODE_FSNOTIFY_ACCESS_PERM(mode) \
 	((mode & FMODE_FSNOTIFY_MASK) == 0)
 #else
-#define FMODE_FSNOTIFY_PERM(mode)	0
+#define FMODE_FSNOTIFY_ACCESS_PERM(mode) 0
 #define FMODE_FSNOTIFY_HSM(mode)	0
 #endif
 
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 8c1fa617d375..28a9cb13fbfa 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -147,9 +147,6 @@ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask,
 	if (!(perm_mask & (MAY_READ | MAY_WRITE | MAY_ACCESS)))
 		return 0;
 
-	if (likely(!FMODE_FSNOTIFY_PERM(file->f_mode)))
-		return 0;
-
 	/*
 	 * read()/write() and other types of access generate pre-content events.
 	 */
@@ -160,7 +157,8 @@ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask,
 			return ret;
 	}
 
-	if (!(perm_mask & MAY_READ))
+	if (!(perm_mask & MAY_READ) ||
+	    likely(!FMODE_FSNOTIFY_ACCESS_PERM(file->f_mode)))
 		return 0;
 
 	/*
@@ -208,22 +206,6 @@ static inline int fsnotify_file_perm(struct file *file, int perm_mask)
 	return fsnotify_file_area_perm(file, perm_mask, NULL, 0);
 }
 
-/*
- * fsnotify_open_perm - permission hook before file open
- */
-static inline int fsnotify_open_perm(struct file *file)
-{
-	int ret;
-
-	if (file->f_flags & __FMODE_EXEC) {
-		ret = fsnotify_path(&file->f_path, FS_OPEN_EXEC_PERM);
-		if (ret)
-			return ret;
-	}
-
-	return fsnotify_path(&file->f_path, FS_OPEN_PERM);
-}
-
 #else
 static inline int fsnotify_open_perm_and_set_mode(struct file *file)
 {
@@ -251,11 +233,6 @@ static inline int fsnotify_file_perm(struct file *file, int perm_mask)
 {
 	return 0;
 }
-
-static inline int fsnotify_open_perm(struct file *file)
-{
-	return 0;
-}
 #endif
 
 /*
-- 
cgit v1.2.3


From 28a78afda6c80dfdcbec51813cb8d2813523ea0c Mon Sep 17 00:00:00 2001
From: Gabriele Monaco <gmonaco@redhat.com>
Date: Mon, 28 Jul 2025 15:50:13 +0200
Subject: rv: Add da_handle_start_run_event_ to per-task monitors

The RV da_monitor API allows starting monitors in two ways:
da_handle_start_event_NAME and da_handle_start_run_event_NAME.
The former is used when the event is followed by the initial state of
the module, so we ignore the event but we know the monitor is in the
initial state and can start monitoring, the latter can be used if the
event can only occur in the initial state, so we do handle the event as
if the monitor was in the initial state.
This latter API is defined for implicit monitors but not per-task ones.

Define da_handle_start_run_event_NAME macro also for per-task monitors.

Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tomas Glozar <tglozar@redhat.com>
Cc: Juri Lelli <jlelli@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Link: https://lore.kernel.org/20250728135022.255578-2-gmonaco@redhat.com
Reviewed-by: Nam Cao <namcao@linutronix.de>
Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/rv/da_monitor.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'include')

diff --git a/include/rv/da_monitor.h b/include/rv/da_monitor.h
index 15f9ed4e4bb6..ed3c34fe18d6 100644
--- a/include/rv/da_monitor.h
+++ b/include/rv/da_monitor.h
@@ -487,6 +487,30 @@ da_handle_start_event_##name(struct task_struct *tsk, enum events_##name event)
 	__da_handle_event_##name(da_mon, tsk, event);						\
 												\
 	return 1;										\
+}												\
+												\
+/*												\
+ * da_handle_start_run_event_##name - start monitoring and handle event				\
+ *												\
+ * This function is used to notify the monitor that the system is in the			\
+ * initial state, so the monitor can start monitoring and handling event.			\
+ */												\
+static inline bool										\
+da_handle_start_run_event_##name(struct task_struct *tsk, enum events_##name event)		\
+{												\
+	struct da_monitor *da_mon;								\
+												\
+	if (!da_monitor_enabled_##name())							\
+		return 0;									\
+												\
+	da_mon = da_get_monitor_##name(tsk);							\
+												\
+	if (unlikely(!da_monitoring_##name(da_mon)))						\
+		da_monitor_start_##name(da_mon);						\
+												\
+	__da_handle_event_##name(da_mon, tsk, event);						\
+												\
+	return 1;										\
 }
 
 /*
-- 
cgit v1.2.3


From 9d475d80c93735f0f336b34a8e2c22beea6145ab Mon Sep 17 00:00:00 2001
From: Gabriele Monaco <gmonaco@redhat.com>
Date: Mon, 28 Jul 2025 15:50:17 +0200
Subject: rv: Retry when da monitor detects race conditions

DA monitor can be accessed from multiple cores simultaneously, this is
likely, for instance when dealing with per-task monitors reacting on
events that do not always occur on the CPU where the task is running.
This can cause race conditions where two events change the next state
and we see inconsistent values. E.g.:

  [62] event_srs: 27: sleepable x sched_wakeup -> running (final)
  [63] event_srs: 27: sleepable x sched_set_state_sleepable -> sleepable
  [63] error_srs: 27: event sched_switch_suspend not expected in the state running

In this case the monitor fails because the event on CPU 62 wins against
the one on CPU 63, although the correct state should have been
sleepable, since the task gets suspended.

Detect if the current state was modified by using try_cmpxchg while
storing the next value. If it was, try again reading the current state.
After a maximum number of failed retries, react by calling a special
tracepoint, print on the console and reset the monitor.

Remove the functions da_monitor_curr_state() and da_monitor_set_state()
as they only hide the underlying implementation in this case.

Monitors where this type of condition can occur must be able to account
for racing events in any possible order, as we cannot know the winner.

Cc: Ingo Molnar <mingo@redhat.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Tomas Glozar <tglozar@redhat.com>
Cc: Juri Lelli <jlelli@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/20250728135022.255578-6-gmonaco@redhat.com
Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
Reviewed-by: Nam Cao <namcao@linutronix.de>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/rv.h         |   3 +-
 include/rv/da_monitor.h    | 107 +++++++++++++++++++++++----------------------
 kernel/trace/rv/Kconfig    |   5 +++
 kernel/trace/rv/rv_trace.h |  24 ++++++++++
 4 files changed, 85 insertions(+), 54 deletions(-)

(limited to 'include')

diff --git a/include/linux/rv.h b/include/linux/rv.h
index 80731242fe60..14410a42faef 100644
--- a/include/linux/rv.h
+++ b/include/linux/rv.h
@@ -10,7 +10,8 @@
 #include <linux/types.h>
 #include <linux/list.h>
 
-#define MAX_DA_NAME_LEN	32
+#define MAX_DA_NAME_LEN			32
+#define MAX_DA_RETRY_RACING_EVENTS	3
 
 #ifdef CONFIG_RV
 #include <linux/bitops.h>
diff --git a/include/rv/da_monitor.h b/include/rv/da_monitor.h
index ed3c34fe18d6..17fa4f6e5ea6 100644
--- a/include/rv/da_monitor.h
+++ b/include/rv/da_monitor.h
@@ -54,23 +54,6 @@ static inline void da_monitor_reset_##name(struct da_monitor *da_mon)				\
 	da_mon->curr_state = model_get_initial_state_##name();					\
 }												\
 												\
-/*												\
- * da_monitor_curr_state_##name - return the current state					\
- */												\
-static inline type da_monitor_curr_state_##name(struct da_monitor *da_mon)			\
-{												\
-	return da_mon->curr_state;								\
-}												\
-												\
-/*												\
- * da_monitor_set_state_##name - set the new current state					\
- */												\
-static inline void										\
-da_monitor_set_state_##name(struct da_monitor *da_mon, enum states_##name state)		\
-{												\
-	da_mon->curr_state = state;								\
-}												\
-												\
 /*												\
  * da_monitor_start_##name - start monitoring							\
  *												\
@@ -127,63 +110,81 @@ static inline bool da_monitor_handling_event_##name(struct da_monitor *da_mon)
  * Event handler for implicit monitors. Implicit monitor is the one which the
  * handler does not need to specify which da_monitor to manipulate. Examples
  * of implicit monitor are the per_cpu or the global ones.
+ *
+ * Retry in case there is a race between getting and setting the next state,
+ * warn and reset the monitor if it runs out of retries. The monitor should be
+ * able to handle various orders.
  */
 #define DECLARE_DA_MON_MODEL_HANDLER_IMPLICIT(name, type)					\
 												\
 static inline bool										\
 da_event_##name(struct da_monitor *da_mon, enum events_##name event)				\
 {												\
-	type curr_state = da_monitor_curr_state_##name(da_mon);					\
-	type next_state = model_get_next_state_##name(curr_state, event);			\
-												\
-	if (next_state != INVALID_STATE) {							\
-		da_monitor_set_state_##name(da_mon, next_state);				\
-												\
-		trace_event_##name(model_get_state_name_##name(curr_state),			\
-				   model_get_event_name_##name(event),				\
-				   model_get_state_name_##name(next_state),			\
-				   model_is_final_state_##name(next_state));			\
-												\
-		return true;									\
+	enum states_##name curr_state, next_state;						\
+												\
+	curr_state = READ_ONCE(da_mon->curr_state);						\
+	for (int i = 0; i < MAX_DA_RETRY_RACING_EVENTS; i++) {					\
+		next_state = model_get_next_state_##name(curr_state, event);			\
+		if (next_state == INVALID_STATE) {						\
+			cond_react_##name(curr_state, event);					\
+			trace_error_##name(model_get_state_name_##name(curr_state),		\
+					   model_get_event_name_##name(event));			\
+			return false;								\
+		}										\
+		if (likely(try_cmpxchg(&da_mon->curr_state, &curr_state, next_state))) {	\
+			trace_event_##name(model_get_state_name_##name(curr_state),		\
+					   model_get_event_name_##name(event),			\
+					   model_get_state_name_##name(next_state),		\
+					   model_is_final_state_##name(next_state));		\
+			return true;								\
+		}										\
 	}											\
 												\
-	cond_react_##name(curr_state, event);							\
-												\
-	trace_error_##name(model_get_state_name_##name(curr_state),				\
-			   model_get_event_name_##name(event));					\
-												\
+	trace_rv_retries_error(#name, model_get_event_name_##name(event));			\
+	pr_warn("rv: " __stringify(MAX_DA_RETRY_RACING_EVENTS)					\
+		" retries reached for event %s, resetting monitor %s",				\
+		model_get_event_name_##name(event), #name);					\
 	return false;										\
 }												\
 
 /*
  * Event handler for per_task monitors.
+ *
+ * Retry in case there is a race between getting and setting the next state,
+ * warn and reset the monitor if it runs out of retries. The monitor should be
+ * able to handle various orders.
  */
 #define DECLARE_DA_MON_MODEL_HANDLER_PER_TASK(name, type)					\
 												\
 static inline bool da_event_##name(struct da_monitor *da_mon, struct task_struct *tsk,		\
 				   enum events_##name event)					\
 {												\
-	type curr_state = da_monitor_curr_state_##name(da_mon);					\
-	type next_state = model_get_next_state_##name(curr_state, event);			\
-												\
-	if (next_state != INVALID_STATE) {							\
-		da_monitor_set_state_##name(da_mon, next_state);				\
-												\
-		trace_event_##name(tsk->pid,							\
-				   model_get_state_name_##name(curr_state),			\
-				   model_get_event_name_##name(event),				\
-				   model_get_state_name_##name(next_state),			\
-				   model_is_final_state_##name(next_state));			\
-												\
-		return true;									\
+	enum states_##name curr_state, next_state;						\
+												\
+	curr_state = READ_ONCE(da_mon->curr_state);						\
+	for (int i = 0; i < MAX_DA_RETRY_RACING_EVENTS; i++) {					\
+		next_state = model_get_next_state_##name(curr_state, event);			\
+		if (next_state == INVALID_STATE) {						\
+			cond_react_##name(curr_state, event);					\
+			trace_error_##name(tsk->pid,						\
+					   model_get_state_name_##name(curr_state),		\
+					   model_get_event_name_##name(event));			\
+			return false;								\
+		}										\
+		if (likely(try_cmpxchg(&da_mon->curr_state, &curr_state, next_state))) {	\
+			trace_event_##name(tsk->pid,						\
+					   model_get_state_name_##name(curr_state),		\
+					   model_get_event_name_##name(event),			\
+					   model_get_state_name_##name(next_state),		\
+					   model_is_final_state_##name(next_state));		\
+			return true;								\
+		}										\
 	}											\
 												\
-	cond_react_##name(curr_state, event);							\
-												\
-	trace_error_##name(tsk->pid,								\
-			   model_get_state_name_##name(curr_state),				\
-			   model_get_event_name_##name(event));					\
-												\
+	trace_rv_retries_error(#name, model_get_event_name_##name(event));			\
+	pr_warn("rv: " __stringify(MAX_DA_RETRY_RACING_EVENTS)					\
+		" retries reached for event %s, resetting monitor %s",				\
+		model_get_event_name_##name(event), #name);					\
 	return false;										\
 }
 
diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig
index 26017378f79b..34164eb4ec91 100644
--- a/kernel/trace/rv/Kconfig
+++ b/kernel/trace/rv/Kconfig
@@ -3,12 +3,17 @@
 config RV_MON_EVENTS
 	bool
 
+config RV_MON_MAINTENANCE_EVENTS
+	bool
+
 config DA_MON_EVENTS_IMPLICIT
 	select RV_MON_EVENTS
+	select RV_MON_MAINTENANCE_EVENTS
 	bool
 
 config DA_MON_EVENTS_ID
 	select RV_MON_EVENTS
+	select RV_MON_MAINTENANCE_EVENTS
 	bool
 
 config LTL_MON_EVENTS_ID
diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h
index d38e0d3abdfd..3af46cd185b3 100644
--- a/kernel/trace/rv/rv_trace.h
+++ b/kernel/trace/rv/rv_trace.h
@@ -176,6 +176,30 @@ DECLARE_EVENT_CLASS(error_ltl_monitor_id,
 #include <monitors/sleep/sleep_trace.h>
 // Add new monitors based on CONFIG_LTL_MON_EVENTS_ID here
 #endif /* CONFIG_LTL_MON_EVENTS_ID */
+
+#ifdef CONFIG_RV_MON_MAINTENANCE_EVENTS
+/* Tracepoint useful for monitors development, currently only used in DA */
+TRACE_EVENT(rv_retries_error,
+
+	TP_PROTO(char *name, char *event),
+
+	TP_ARGS(name, event),
+
+	TP_STRUCT__entry(
+		__string(	name,	name	)
+		__string(	event,	event	)
+	),
+
+	TP_fast_assign(
+		__assign_str(name);
+		__assign_str(event);
+	),
+
+	TP_printk(__stringify(MAX_DA_RETRY_RACING_EVENTS)
+		" retries reached for event %s, resetting monitor %s",
+		__get_str(event), __get_str(name))
+);
+#endif /* CONFIG_RV_MON_MAINTENANCE_EVENTS */
 #endif /* _TRACE_RV_H */
 
 /* This part must be outside protection */
-- 
cgit v1.2.3


From adcc3bfa8806761ac21aa271f78454113ec6936e Mon Sep 17 00:00:00 2001
From: Gabriele Monaco <gmonaco@redhat.com>
Date: Mon, 28 Jul 2025 15:50:18 +0200
Subject: sched: Adapt sched tracepoints for RV task model

Add the following tracepoint:
* sched_set_need_resched(tsk, cpu, tif)
    Called when the need resched [lazy] flag is set on a task

Remove the unused ip parameter from sched_entry and sched_exit and alter
sched_entry to have a value of preempt consistent with the one used in
sched_switch.

Also adapt all monitors using sched_{entry,exit} to avoid breaking build.

These tracepoints are useful to describe the Linux task model and are
adapted from the patches by Daniel Bristot de Oliveira
(https://bristot.me/linux-task-model/).

Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Nam Cao <namcao@linutronix.de>
Cc: Tomas Glozar <tglozar@redhat.com>
Cc: Juri Lelli <jlelli@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Link: https://lore.kernel.org/20250728135022.255578-7-gmonaco@redhat.com
Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/sched.h                  |  7 ++++++-
 include/trace/events/sched.h           | 12 ++++++++----
 kernel/sched/core.c                    | 13 ++++++++++---
 kernel/trace/rv/monitors/sco/sco.c     |  4 ++--
 kernel/trace/rv/monitors/scpd/scpd.c   |  4 ++--
 kernel/trace/rv/monitors/sncid/sncid.c |  4 ++--
 kernel/trace/rv/monitors/snep/snep.c   |  4 ++--
 kernel/trace/rv/monitors/tss/tss.c     |  4 ++--
 8 files changed, 34 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index fabd7fe1a07a..91d1fdbc2f56 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -339,9 +339,11 @@ extern void io_schedule_finish(int token);
 extern long io_schedule_timeout(long timeout);
 extern void io_schedule(void);
 
-/* wrapper function to trace from this header file */
+/* wrapper functions to trace from this header file */
 DECLARE_TRACEPOINT(sched_set_state_tp);
 extern void __trace_set_current_state(int state_value);
+DECLARE_TRACEPOINT(sched_set_need_resched_tp);
+extern void __trace_set_need_resched(struct task_struct *curr, int tif);
 
 /**
  * struct prev_cputime - snapshot of system and user cputime
@@ -2063,6 +2065,9 @@ static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
 
 static inline void set_tsk_need_resched(struct task_struct *tsk)
 {
+	if (tracepoint_enabled(sched_set_need_resched_tp) &&
+	    !test_tsk_thread_flag(tsk, TIF_NEED_RESCHED))
+		__trace_set_need_resched(tsk, TIF_NEED_RESCHED);
 	set_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
 }
 
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 4e6b2910cec3..c08893bde255 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -882,18 +882,22 @@ DECLARE_TRACE(sched_compute_energy,
 	TP_ARGS(p, dst_cpu, energy, max_util, busy_time));
 
 DECLARE_TRACE(sched_entry,
-	TP_PROTO(bool preempt, unsigned long ip),
-	TP_ARGS(preempt, ip));
+	TP_PROTO(bool preempt),
+	TP_ARGS(preempt));
 
 DECLARE_TRACE(sched_exit,
-	TP_PROTO(bool is_switch, unsigned long ip),
-	TP_ARGS(is_switch, ip));
+	TP_PROTO(bool is_switch),
+	TP_ARGS(is_switch));
 
 DECLARE_TRACE_CONDITION(sched_set_state,
 	TP_PROTO(struct task_struct *tsk, int state),
 	TP_ARGS(tsk, state),
 	TP_CONDITION(!!(tsk->__state) != !!state));
 
+DECLARE_TRACE(sched_set_need_resched,
+	TP_PROTO(struct task_struct *tsk, int cpu, int tif),
+	TP_ARGS(tsk, cpu, tif));
+
 #endif /* _TRACE_SCHED_H */
 
 /* This part must be outside protection */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ec68fc686bd7..b485e0639616 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1110,6 +1110,7 @@ static void __resched_curr(struct rq *rq, int tif)
 
 	cpu = cpu_of(rq);
 
+	trace_sched_set_need_resched_tp(curr, cpu, tif);
 	if (cpu == smp_processor_id()) {
 		set_ti_thread_flag(cti, tif);
 		if (tif == TIF_NEED_RESCHED)
@@ -1125,6 +1126,11 @@ static void __resched_curr(struct rq *rq, int tif)
 	}
 }
 
+void __trace_set_need_resched(struct task_struct *curr, int tif)
+{
+	trace_sched_set_need_resched_tp(curr, smp_processor_id(), tif);
+}
+
 void resched_curr(struct rq *rq)
 {
 	__resched_curr(rq, TIF_NEED_RESCHED);
@@ -5329,7 +5335,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
 	 * switched the context for the first time. It is returning from
 	 * schedule for the first time in this path.
 	 */
-	trace_sched_exit_tp(true, CALLER_ADDR0);
+	trace_sched_exit_tp(true);
 	preempt_enable();
 
 	if (current->set_child_tid)
@@ -6678,7 +6684,8 @@ static void __sched notrace __schedule(int sched_mode)
 	struct rq *rq;
 	int cpu;
 
-	trace_sched_entry_tp(preempt, CALLER_ADDR0);
+	/* Trace preemptions consistently with task switches */
+	trace_sched_entry_tp(sched_mode == SM_PREEMPT);
 
 	cpu = smp_processor_id();
 	rq = cpu_rq(cpu);
@@ -6793,7 +6800,7 @@ picked:
 		__balance_callbacks(rq);
 		raw_spin_rq_unlock_irq(rq);
 	}
-	trace_sched_exit_tp(is_switch, CALLER_ADDR0);
+	trace_sched_exit_tp(is_switch);
 }
 
 void __noreturn do_task_dead(void)
diff --git a/kernel/trace/rv/monitors/sco/sco.c b/kernel/trace/rv/monitors/sco/sco.c
index 66f4639d46ac..04c36405e2e3 100644
--- a/kernel/trace/rv/monitors/sco/sco.c
+++ b/kernel/trace/rv/monitors/sco/sco.c
@@ -24,12 +24,12 @@ static void handle_sched_set_state(void *data, struct task_struct *tsk, int stat
 	da_handle_start_event_sco(sched_set_state_sco);
 }
 
-static void handle_schedule_entry(void *data, bool preempt, unsigned long ip)
+static void handle_schedule_entry(void *data, bool preempt)
 {
 	da_handle_event_sco(schedule_entry_sco);
 }
 
-static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip)
+static void handle_schedule_exit(void *data, bool is_switch)
 {
 	da_handle_start_event_sco(schedule_exit_sco);
 }
diff --git a/kernel/trace/rv/monitors/scpd/scpd.c b/kernel/trace/rv/monitors/scpd/scpd.c
index 299703cd72b0..1e351ba52fee 100644
--- a/kernel/trace/rv/monitors/scpd/scpd.c
+++ b/kernel/trace/rv/monitors/scpd/scpd.c
@@ -30,12 +30,12 @@ static void handle_preempt_enable(void *data, unsigned long ip, unsigned long pa
 	da_handle_start_event_scpd(preempt_enable_scpd);
 }
 
-static void handle_schedule_entry(void *data, bool preempt, unsigned long ip)
+static void handle_schedule_entry(void *data, bool preempt)
 {
 	da_handle_event_scpd(schedule_entry_scpd);
 }
 
-static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip)
+static void handle_schedule_exit(void *data, bool is_switch)
 {
 	da_handle_event_scpd(schedule_exit_scpd);
 }
diff --git a/kernel/trace/rv/monitors/sncid/sncid.c b/kernel/trace/rv/monitors/sncid/sncid.c
index 3e1ee715a0fb..c8491f426365 100644
--- a/kernel/trace/rv/monitors/sncid/sncid.c
+++ b/kernel/trace/rv/monitors/sncid/sncid.c
@@ -30,12 +30,12 @@ static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent
 	da_handle_start_event_sncid(irq_enable_sncid);
 }
 
-static void handle_schedule_entry(void *data, bool preempt, unsigned long ip)
+static void handle_schedule_entry(void *data, bool preempt)
 {
 	da_handle_start_event_sncid(schedule_entry_sncid);
 }
 
-static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip)
+static void handle_schedule_exit(void *data, bool is_switch)
 {
 	da_handle_start_event_sncid(schedule_exit_sncid);
 }
diff --git a/kernel/trace/rv/monitors/snep/snep.c b/kernel/trace/rv/monitors/snep/snep.c
index 2adc3108d60c..558950f524a5 100644
--- a/kernel/trace/rv/monitors/snep/snep.c
+++ b/kernel/trace/rv/monitors/snep/snep.c
@@ -30,12 +30,12 @@ static void handle_preempt_enable(void *data, unsigned long ip, unsigned long pa
 	da_handle_start_event_snep(preempt_enable_snep);
 }
 
-static void handle_schedule_entry(void *data, bool preempt, unsigned long ip)
+static void handle_schedule_entry(void *data, bool preempt)
 {
 	da_handle_event_snep(schedule_entry_snep);
 }
 
-static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip)
+static void handle_schedule_exit(void *data, bool is_switch)
 {
 	da_handle_start_event_snep(schedule_exit_snep);
 }
diff --git a/kernel/trace/rv/monitors/tss/tss.c b/kernel/trace/rv/monitors/tss/tss.c
index 0452fcd9edcf..95ebd15131f5 100644
--- a/kernel/trace/rv/monitors/tss/tss.c
+++ b/kernel/trace/rv/monitors/tss/tss.c
@@ -27,12 +27,12 @@ static void handle_sched_switch(void *data, bool preempt,
 	da_handle_event_tss(sched_switch_tss);
 }
 
-static void handle_schedule_entry(void *data, bool preempt, unsigned long ip)
+static void handle_schedule_entry(void *data, bool preempt)
 {
 	da_handle_event_tss(schedule_entry_tss);
 }
 
-static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip)
+static void handle_schedule_exit(void *data, bool is_switch)
 {
 	da_handle_start_event_tss(schedule_exit_tss);
 }
-- 
cgit v1.2.3


From 1edaac340f4da813b258a5e3a6c79804612161a4 Mon Sep 17 00:00:00 2001
From: Klara Modin <klarasmodin@gmail.com>
Date: Fri, 25 Jul 2025 18:43:34 +0200
Subject: block: change blk_get_meta_cap() stub return -ENOIOCTLCMD

When introduced in commit 9eb22f7fedfc ("fs: add ioctl to query metadata
and protection info capabilities") the stub of blk_get_meta_cap() for
!BLK_DEV_INTEGRITY always returns -EOPNOTSUPP.  The motivation was that
while the command was unsupported in that configuration it was still
recognized.

A later change instead assumed -ENOIOCTLCMD as is required for unknown
ioctl commands per Documentation/driver-api/ioctl.rst. The result being
that on !BLK_DEV_INTEGRITY configs, any ioctl which reaches
blkdev_common_ioctl() will return -EOPNOTSUPP.

Change the stub to return -ENOIOCTLCMD, fixing the issue and better
matching with expectations.

[ The blkdev_common_ioctl() confusion has been fixed, but -ENOIOCTLCMD
  is the right thing to return for unrecognized ioctls, so the patch
  remains the right thing to do.   - Linus ]

Link: https://lore.kernel.org/lkml/CACzX3AsRd__fXb9=CJPTTJC494SDnYAtYrN2=+bZgMCvM6UQDg@mail.gmail.com
Fixes: 42b0ef01e6b5 ("block: fix FS_IOC_GETLBMD_CAP parsing in blkdev_common_ioctl()")
Signed-off-by: Klara Modin <klarasmodin@gmail.com>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/blk-integrity.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h
index e04c6e5bf1c6..e67a2b6e8f11 100644
--- a/include/linux/blk-integrity.h
+++ b/include/linux/blk-integrity.h
@@ -97,7 +97,7 @@ static inline struct bio_vec rq_integrity_vec(struct request *rq)
 static inline int blk_get_meta_cap(struct block_device *bdev, unsigned int cmd,
 				   struct logical_block_metadata_cap __user *argp)
 {
-	return -EOPNOTSUPP;
+	return -ENOIOCTLCMD;
 }
 static inline int blk_rq_count_integrity_sg(struct request_queue *q,
 					    struct bio *b)
-- 
cgit v1.2.3


From 5dc50b111b40003ed83f74324e8d4023f01bd93e Mon Sep 17 00:00:00 2001
From: Baojun Xu <baojun.xu@ti.com>
Date: Tue, 29 Jul 2025 22:58:49 +0800
Subject: ALSA: hda: Fix the wrong register was used for DVC of TAS2770

The wrong register was used for digital volume control of TAS2770.
The definition was changed, and its usage was updated accordingly.

Fixes: ab29b3460c5c ("ALSA: hda: Add TAS2770 support")
Signed-off-by: Baojun Xu <baojun.xu@ti.com>
Link: https://patch.msgid.link/20250729145849.55057-1-baojun.xu@ti.com
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/tas2770-tlv.h                    | 4 ++--
 sound/hda/codecs/side-codecs/tas2781_hda_i2c.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/sound/tas2770-tlv.h b/include/sound/tas2770-tlv.h
index c0bd495b4a07..c7380925417a 100644
--- a/include/sound/tas2770-tlv.h
+++ b/include/sound/tas2770-tlv.h
@@ -14,10 +14,10 @@
 #ifndef __TAS2770_TLV_H__
 #define __TAS2770_TLV_H__
 
-#define TAS2770_DVC_LEVEL		TASDEVICE_REG(0x0, 0x0, 0x17)
+#define TAS2770_DVC_LEVEL		TASDEVICE_REG(0x0, 0x0, 0x05)
 #define TAS2770_AMP_LEVEL		TASDEVICE_REG(0x0, 0x0, 0x03)
 
-static const __maybe_unused DECLARE_TLV_DB_SCALE(tas2770_dvc_tlv, 1650, 50, 0);
+static const __maybe_unused DECLARE_TLV_DB_SCALE(tas2770_dvc_tlv, -10000, 50, 0);
 static const __maybe_unused DECLARE_TLV_DB_SCALE(tas2770_amp_tlv, 1100, 50, 0);
 
 #endif
diff --git a/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c b/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c
index a0b132681804..45ac5e41bd4f 100644
--- a/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c
+++ b/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c
@@ -260,7 +260,7 @@ static const struct snd_kcontrol_new tas2770_snd_controls[] = {
 		0, 0, 20, 0, tas2781_amp_getvol,
 		tas2781_amp_putvol, tas2770_amp_tlv),
 	ACARD_SINGLE_RANGE_EXT_TLV("Speaker Digital Volume", TAS2770_DVC_LEVEL,
-		0, 0, 31, 0, tas2781_amp_getvol,
+		0, 0, 200, 1, tas2781_amp_getvol,
 		tas2781_amp_putvol, tas2770_dvc_tlv),
 };
 
-- 
cgit v1.2.3


From 6cff20ce3b92ffbf2fc5eb9e5a030b3672aa414a Mon Sep 17 00:00:00 2001
From: Lukas Wunner <lukas@wunner.de>
Date: Sun, 13 Jul 2025 16:31:01 +0200
Subject: PCI/ACPI: Fix runtime PM ref imbalance on Hot-Plug Capable ports

pci_bridge_d3_possible() is called from both pcie_portdrv_probe() and
pcie_portdrv_remove() to determine whether runtime power management shall
be enabled (on probe) or disabled (on remove) on a PCIe port.

The underlying assumption is that pci_bridge_d3_possible() always returns
the same value, else a runtime PM reference imbalance would occur.  That
assumption does not hold if the PCIe port is inaccessible on remove due to
hot-unplug:  pci_bridge_d3_possible() calls pciehp_is_native(), which
accesses Config Space to determine whether the port is Hot-Plug Capable.
An inaccessible port returns "all ones", which is converted to "all
zeroes" by pcie_capability_read_dword().  Hence the port no longer seems
Hot-Plug Capable on remove even though it was on probe.

The resulting runtime PM ref imbalance causes warning messages such as:

  pcieport 0000:02:04.0: Runtime PM usage count underflow!

Avoid the Config Space access (and thus the runtime PM ref imbalance) by
caching the Hot-Plug Capable bit in struct pci_dev.

The struct already contains an "is_hotplug_bridge" flag, which however is
not only set on Hot-Plug Capable PCIe ports, but also Conventional PCI
Hot-Plug bridges and ACPI slots.  The flag identifies bridges which are
allocated additional MMIO and bus number resources to allow for hierarchy
expansion.

The kernel is somewhat sloppily using "is_hotplug_bridge" in a number of
places to identify Hot-Plug Capable PCIe ports, even though the flag
encompasses other devices.  Subsequent commits replace these occurrences
with the new flag to clearly delineate Hot-Plug Capable PCIe ports from
other kinds of hotplug bridges.

Document the existing "is_hotplug_bridge" and the new "is_pciehp" flag
and document the (non-obvious) requirement that pci_bridge_d3_possible()
always returns the same value across the entire lifetime of a bridge,
including its hot-removal.

Fixes: 5352a44a561d ("PCI: pciehp: Make pciehp_is_native() stricter")
Reported-by: Laurent Bigonville <bigon@bigon.be>
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220216
Reported-by: Mario Limonciello <mario.limonciello@amd.com>
Closes: https://lore.kernel.org/r/20250609020223.269407-3-superm1@kernel.org/
Link: https://lore.kernel.org/all/20250620025535.3425049-3-superm1@kernel.org/T/#u
Signed-off-by: Lukas Wunner <lukas@wunner.de>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Cc: stable@vger.kernel.org # v4.18+
Link: https://patch.msgid.link/fe5dcc3b2e62ee1df7905d746bde161eb1b3291c.1752390101.git.lukas@wunner.de
---
 drivers/pci/pci-acpi.c | 4 +---
 drivers/pci/pci.c      | 6 +++++-
 drivers/pci/probe.c    | 2 +-
 include/linux/pci.h    | 6 ++++++
 4 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
index b78e0e417324..efe478e5073e 100644
--- a/drivers/pci/pci-acpi.c
+++ b/drivers/pci/pci-acpi.c
@@ -816,13 +816,11 @@ int pci_acpi_program_hp_params(struct pci_dev *dev)
 bool pciehp_is_native(struct pci_dev *bridge)
 {
 	const struct pci_host_bridge *host;
-	u32 slot_cap;
 
 	if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE))
 		return false;
 
-	pcie_capability_read_dword(bridge, PCI_EXP_SLTCAP, &slot_cap);
-	if (!(slot_cap & PCI_EXP_SLTCAP_HPC))
+	if (!bridge->is_pciehp)
 		return false;
 
 	if (pcie_ports_native)
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index e9448d55113b..23d8fe98ddf9 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -3030,8 +3030,12 @@ static const struct dmi_system_id bridge_d3_blacklist[] = {
  * pci_bridge_d3_possible - Is it possible to put the bridge into D3
  * @bridge: Bridge to check
  *
- * This function checks if it is possible to move the bridge to D3.
  * Currently we only allow D3 for some PCIe ports and for Thunderbolt.
+ *
+ * Return: Whether it is possible to move the bridge to D3.
+ *
+ * The return value is guaranteed to be constant across the entire lifetime
+ * of the bridge, including its hot-removal.
  */
 bool pci_bridge_d3_possible(struct pci_dev *bridge)
 {
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 4b8693ec9e4c..cf50be63bf5f 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1678,7 +1678,7 @@ void set_pcie_hotplug_bridge(struct pci_dev *pdev)
 
 	pcie_capability_read_dword(pdev, PCI_EXP_SLTCAP, &reg32);
 	if (reg32 & PCI_EXP_SLTCAP_HPC)
-		pdev->is_hotplug_bridge = 1;
+		pdev->is_hotplug_bridge = pdev->is_pciehp = 1;
 }
 
 static void set_pcie_thunderbolt(struct pci_dev *dev)
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 05e68f35f392..d56d0dd80afb 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -328,6 +328,11 @@ struct rcec_ea;
  *			determined (e.g., for Root Complex Integrated
  *			Endpoints without the relevant Capability
  *			Registers).
+ * @is_hotplug_bridge:	Hotplug bridge of any kind (e.g. PCIe Hot-Plug Capable,
+ *			Conventional PCI Hot-Plug, ACPI slot).
+ *			Such bridges are allocated additional MMIO and bus
+ *			number resources to allow for hierarchy expansion.
+ * @is_pciehp:		PCIe Hot-Plug Capable bridge.
  */
 struct pci_dev {
 	struct list_head bus_list;	/* Node in per-bus list */
@@ -451,6 +456,7 @@ struct pci_dev {
 	unsigned int	is_physfn:1;
 	unsigned int	is_virtfn:1;
 	unsigned int	is_hotplug_bridge:1;
+	unsigned int	is_pciehp:1;
 	unsigned int	shpc_managed:1;		/* SHPC owned by shpchp */
 	unsigned int	is_thunderbolt:1;	/* Thunderbolt controller */
 	/*
-- 
cgit v1.2.3


From c2f9de5e2db29158a8caa86a37aa479488e4ba43 Mon Sep 17 00:00:00 2001
From: Lukas Wunner <lukas@wunner.de>
Date: Sun, 13 Jul 2025 16:31:04 +0200
Subject: PCI: Move is_pciehp check out of pciehp_is_native()

pci_bridge_d3_possible() seeks to forbid runtime power management on:

* Non Hot-Plug Capable PCIe ports which are nevertheless ACPI slots
  (recognizable as: bridge->is_hotplug_bridge && !bridge->is_pciehp)

* Hot-Plug Capable PCIe ports for which platform firmware has not granted
  PCIe Native Hot-Plug control to the operating system
  (recognizable as: bridge->is_pciehp && !pciehp_is_native(bridge))

Somewhat confusingly, the check for is_hotplug_bridge is in
pci_bridge_d3_possible(), whereas the one for is_pciehp is in
pciehp_is_native().

For clarity, check is_pciehp directly in pci_bridge_d3_possible()
(and in the other caller of pciehp_is_native(), hotplug_is_native()).

Rephrase the code comment preceding these checks to no longer mention
"System Management Mode", which is an x86 term inappropriate in generic
PCI code.  Likewise no longer mention "Thunderbolt on non-Macs", because
there is nothing Thunderbolt-specific about these checks.  It used to be
the case that non-Macs relied on the platform for Thunderbolt tunnel
management and hotplug, but they've since moved to OS-native tunnel
management (as Macs always have), hence the code comment is no longer
accurate.

There is a subsequent check for is_hotplug_bridge further down in
pci_bridge_d3_possible().  Change the check to is_pciehp because any
ports matching "bridge->is_hotplug_bridge && !bridge->is_pciehp" are
already filtered out at the top of the function.

Do the same for another check in acpi_pci_bridge_d3(), which is called
from pci_bridge_d3_possible() via platform_pci_bridge_d3().

No functional change intended.

Signed-off-by: Lukas Wunner <lukas@wunner.de>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://patch.msgid.link/18b2c2110ad0f27a34b189d793310b9c4f2f24a0.1752390102.git.lukas@wunner.de
---
 drivers/pci/pci-acpi.c      |  5 +----
 drivers/pci/pci.c           | 12 ++++++++----
 include/linux/pci_hotplug.h |  3 ++-
 3 files changed, 11 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
index efe478e5073e..ed7ed66a595b 100644
--- a/drivers/pci/pci-acpi.c
+++ b/drivers/pci/pci-acpi.c
@@ -820,9 +820,6 @@ bool pciehp_is_native(struct pci_dev *bridge)
 	if (!IS_ENABLED(CONFIG_HOTPLUG_PCI_PCIE))
 		return false;
 
-	if (!bridge->is_pciehp)
-		return false;
-
 	if (pcie_ports_native)
 		return true;
 
@@ -1000,7 +997,7 @@ bool acpi_pci_bridge_d3(struct pci_dev *dev)
 	struct acpi_device *adev, *rpadev;
 	const union acpi_object *obj;
 
-	if (acpi_pci_disabled || !dev->is_hotplug_bridge)
+	if (acpi_pci_disabled || !dev->is_pciehp)
 		return false;
 
 	adev = ACPI_COMPANION(&dev->dev);
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 23d8fe98ddf9..749994dad9dc 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -3050,10 +3050,14 @@ bool pci_bridge_d3_possible(struct pci_dev *bridge)
 			return false;
 
 		/*
-		 * Hotplug ports handled by firmware in System Management Mode
-		 * may not be put into D3 by the OS (Thunderbolt on non-Macs).
+		 * Hotplug ports handled by platform firmware may not be put
+		 * into D3 by the OS, e.g. ACPI slots ...
 		 */
-		if (bridge->is_hotplug_bridge && !pciehp_is_native(bridge))
+		if (bridge->is_hotplug_bridge && !bridge->is_pciehp)
+			return false;
+
+		/* ... or PCIe hotplug ports not handled natively by the OS. */
+		if (bridge->is_pciehp && !pciehp_is_native(bridge))
 			return false;
 
 		if (pci_bridge_d3_force)
@@ -3072,7 +3076,7 @@ bool pci_bridge_d3_possible(struct pci_dev *bridge)
 		 * by vendors for runtime D3 at least until 2018 because there
 		 * was no OS support.
 		 */
-		if (bridge->is_hotplug_bridge)
+		if (bridge->is_pciehp)
 			return false;
 
 		if (dmi_check_system(bridge_d3_blacklist))
diff --git a/include/linux/pci_hotplug.h b/include/linux/pci_hotplug.h
index ec77ccf1fc4d..ddf79641917f 100644
--- a/include/linux/pci_hotplug.h
+++ b/include/linux/pci_hotplug.h
@@ -104,6 +104,7 @@ static inline bool shpchp_is_native(struct pci_dev *bridge) { return true; }
 
 static inline bool hotplug_is_native(struct pci_dev *bridge)
 {
-	return pciehp_is_native(bridge) || shpchp_is_native(bridge);
+	return (bridge->is_pciehp && pciehp_is_native(bridge)) ||
+	       shpchp_is_native(bridge);
 }
 #endif
-- 
cgit v1.2.3


From 71753c6ed2bf2aee5be26c1bc06a94c9e3713ade Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Tue, 29 Jul 2025 14:23:05 -0400
Subject: unwind_user: Add user space unwinding API with frame pointer support

Introduce a generic API for unwinding user stacks.

In order to expand user space unwinding to be able to handle more complex
scenarios, such as deferred unwinding and reading user space information,
create a generic interface that all architectures can use that support the
various unwinding methods.

This is an alternative to the simple stack_trace_save_user() API for
handling user space stack traces. It does not replace that interface, but
it will be used to expand the functionality of user space stack walking.

None of the structures introduced will be exposed to user space tooling.

Support for frame pointer unwinding is added. For an architecture to
support frame pointer unwinding it needs to enable
CONFIG_HAVE_UNWIND_USER_FP and define ARCH_INIT_USER_FP_FRAME.

By encoding the frame offsets in struct unwind_user_frame, much of this
code can also be reused for future unwinder implementations like sframe.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Indu Bhagat <indu.bhagat@oracle.com>
Cc: "Jose E. Marchesi" <jemarch@gnu.org>
Cc: Beau Belgrave <beaub@linux.microsoft.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Florian Weimer <fweimer@redhat.com>
Cc: Sam James <sam@gentoo.org>
Link: https://lore.kernel.org/20250729182404.975790139@kernel.org
Reviewed-by: Jens Remus <jremus@linux.ibm.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Co-developed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://lore.kernel.org/all/20250710164301.3094-2-mathieu.desnoyers@efficios.com/
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Co-developed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 MAINTAINERS                       |   8 +++
 arch/Kconfig                      |   7 +++
 include/asm-generic/Kbuild        |   1 +
 include/asm-generic/unwind_user.h |   5 ++
 include/linux/unwind_user.h       |  14 +++++
 include/linux/unwind_user_types.h |  44 +++++++++++++
 kernel/Makefile                   |   1 +
 kernel/unwind/Makefile            |   1 +
 kernel/unwind/user.c              | 128 ++++++++++++++++++++++++++++++++++++++
 9 files changed, 209 insertions(+)
 create mode 100644 include/asm-generic/unwind_user.h
 create mode 100644 include/linux/unwind_user.h
 create mode 100644 include/linux/unwind_user_types.h
 create mode 100644 kernel/unwind/Makefile
 create mode 100644 kernel/unwind/user.c

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index fad6cb025a19..370d780fd5f8 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -25928,6 +25928,14 @@ F:	Documentation/driver-api/uio-howto.rst
 F:	drivers/uio/
 F:	include/linux/uio_driver.h
 
+USERSPACE STACK UNWINDING
+M:	Josh Poimboeuf <jpoimboe@kernel.org>
+M:	Steven Rostedt <rostedt@goodmis.org>
+S:	Maintained
+F:	include/linux/unwind*.h
+F:	kernel/unwind/
+
+
 UTIL-LINUX PACKAGE
 M:	Karel Zak <kzak@redhat.com>
 L:	util-linux@vger.kernel.org
diff --git a/arch/Kconfig b/arch/Kconfig
index a3308a220f86..8e3fd723bd74 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -435,6 +435,13 @@ config HAVE_HARDLOCKUP_DETECTOR_ARCH
 	  It uses the same command line parameters, and sysctl interface,
 	  as the generic hardlockup detectors.
 
+config UNWIND_USER
+	bool
+
+config HAVE_UNWIND_USER_FP
+	bool
+	select UNWIND_USER
+
 config HAVE_PERF_REGS
 	bool
 	help
diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild
index 8675b7b4ad23..295c94a3ccc1 100644
--- a/include/asm-generic/Kbuild
+++ b/include/asm-generic/Kbuild
@@ -59,6 +59,7 @@ mandatory-y += tlbflush.h
 mandatory-y += topology.h
 mandatory-y += trace_clock.h
 mandatory-y += uaccess.h
+mandatory-y += unwind_user.h
 mandatory-y += vermagic.h
 mandatory-y += vga.h
 mandatory-y += video.h
diff --git a/include/asm-generic/unwind_user.h b/include/asm-generic/unwind_user.h
new file mode 100644
index 000000000000..b8882b909944
--- /dev/null
+++ b/include/asm-generic/unwind_user.h
@@ -0,0 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_GENERIC_UNWIND_USER_H
+#define _ASM_GENERIC_UNWIND_USER_H
+
+#endif /* _ASM_GENERIC_UNWIND_USER_H */
diff --git a/include/linux/unwind_user.h b/include/linux/unwind_user.h
new file mode 100644
index 000000000000..7f7282516bf5
--- /dev/null
+++ b/include/linux/unwind_user.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_UNWIND_USER_H
+#define _LINUX_UNWIND_USER_H
+
+#include <linux/unwind_user_types.h>
+#include <asm/unwind_user.h>
+
+#ifndef ARCH_INIT_USER_FP_FRAME
+ #define ARCH_INIT_USER_FP_FRAME
+#endif
+
+int unwind_user(struct unwind_stacktrace *trace, unsigned int max_entries);
+
+#endif /* _LINUX_UNWIND_USER_H */
diff --git a/include/linux/unwind_user_types.h b/include/linux/unwind_user_types.h
new file mode 100644
index 000000000000..a449f15be890
--- /dev/null
+++ b/include/linux/unwind_user_types.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_UNWIND_USER_TYPES_H
+#define _LINUX_UNWIND_USER_TYPES_H
+
+#include <linux/types.h>
+
+/*
+ * Unwind types, listed in priority order: lower numbers are attempted first if
+ * available.
+ */
+enum unwind_user_type_bits {
+	UNWIND_USER_TYPE_FP_BIT =		0,
+
+	NR_UNWIND_USER_TYPE_BITS,
+};
+
+enum unwind_user_type {
+	/* Type "none" for the start of stack walk iteration. */
+	UNWIND_USER_TYPE_NONE =			0,
+	UNWIND_USER_TYPE_FP =			BIT(UNWIND_USER_TYPE_FP_BIT),
+};
+
+struct unwind_stacktrace {
+	unsigned int	nr;
+	unsigned long	*entries;
+};
+
+struct unwind_user_frame {
+	s32 cfa_off;
+	s32 ra_off;
+	s32 fp_off;
+	bool use_fp;
+};
+
+struct unwind_user_state {
+	unsigned long				ip;
+	unsigned long				sp;
+	unsigned long				fp;
+	enum unwind_user_type			current_type;
+	unsigned int				available_types;
+	bool					done;
+};
+
+#endif /* _LINUX_UNWIND_USER_TYPES_H */
diff --git a/kernel/Makefile b/kernel/Makefile
index 32e80dd626af..541186050251 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -55,6 +55,7 @@ obj-y += rcu/
 obj-y += livepatch/
 obj-y += dma/
 obj-y += entry/
+obj-y += unwind/
 obj-$(CONFIG_MODULES) += module/
 
 obj-$(CONFIG_KCMP) += kcmp.o
diff --git a/kernel/unwind/Makefile b/kernel/unwind/Makefile
new file mode 100644
index 000000000000..349ce3677526
--- /dev/null
+++ b/kernel/unwind/Makefile
@@ -0,0 +1 @@
+ obj-$(CONFIG_UNWIND_USER) += user.o
diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c
new file mode 100644
index 000000000000..97a8415e3216
--- /dev/null
+++ b/kernel/unwind/user.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+* Generic interfaces for unwinding user space
+*/
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/unwind_user.h>
+#include <linux/uaccess.h>
+
+static const struct unwind_user_frame fp_frame = {
+	ARCH_INIT_USER_FP_FRAME
+};
+
+#define for_each_user_frame(state) \
+	for (unwind_user_start(state); !(state)->done; unwind_user_next(state))
+
+static int unwind_user_next_fp(struct unwind_user_state *state)
+{
+	const struct unwind_user_frame *frame = &fp_frame;
+	unsigned long cfa, fp, ra;
+	unsigned int shift;
+
+	if (frame->use_fp) {
+		if (state->fp < state->sp)
+			return -EINVAL;
+		cfa = state->fp;
+	} else {
+		cfa = state->sp;
+	}
+
+	/* Get the Canonical Frame Address (CFA) */
+	cfa += frame->cfa_off;
+
+	/* stack going in wrong direction? */
+	if (cfa <= state->sp)
+		return -EINVAL;
+
+	/* Make sure that the address is word aligned */
+	shift = sizeof(long) == 4 ? 2 : 3;
+	if (cfa & ((1 << shift) - 1))
+		return -EINVAL;
+
+	/* Find the Return Address (RA) */
+	if (get_user(ra, (unsigned long *)(cfa + frame->ra_off)))
+		return -EINVAL;
+
+	if (frame->fp_off && get_user(fp, (unsigned long __user *)(cfa + frame->fp_off)))
+		return -EINVAL;
+
+	state->ip = ra;
+	state->sp = cfa;
+	if (frame->fp_off)
+		state->fp = fp;
+	return 0;
+}
+
+static int unwind_user_next(struct unwind_user_state *state)
+{
+	unsigned long iter_mask = state->available_types;
+	unsigned int bit;
+
+	if (state->done)
+		return -EINVAL;
+
+	for_each_set_bit(bit, &iter_mask, NR_UNWIND_USER_TYPE_BITS) {
+		enum unwind_user_type type = BIT(bit);
+
+		state->current_type = type;
+		switch (type) {
+		case UNWIND_USER_TYPE_FP:
+			if (!unwind_user_next_fp(state))
+				return 0;
+			continue;
+		default:
+			WARN_ONCE(1, "Undefined unwind bit %d", bit);
+			break;
+		}
+		break;
+	}
+
+	/* No successful unwind method. */
+	state->current_type = UNWIND_USER_TYPE_NONE;
+	state->done = true;
+	return -EINVAL;
+}
+
+static int unwind_user_start(struct unwind_user_state *state)
+{
+	struct pt_regs *regs = task_pt_regs(current);
+
+	memset(state, 0, sizeof(*state));
+
+	if ((current->flags & PF_KTHREAD) || !user_mode(regs)) {
+		state->done = true;
+		return -EINVAL;
+	}
+
+	if (IS_ENABLED(CONFIG_HAVE_UNWIND_USER_FP))
+		state->available_types |= UNWIND_USER_TYPE_FP;
+
+	state->ip = instruction_pointer(regs);
+	state->sp = user_stack_pointer(regs);
+	state->fp = frame_pointer(regs);
+
+	return 0;
+}
+
+int unwind_user(struct unwind_stacktrace *trace, unsigned int max_entries)
+{
+	struct unwind_user_state state;
+
+	trace->nr = 0;
+
+	if (!max_entries)
+		return -EINVAL;
+
+	if (current->flags & PF_KTHREAD)
+		return 0;
+
+	for_each_user_frame(&state) {
+		trace->entries[trace->nr++] = state.ip;
+		if (trace->nr >= max_entries)
+			break;
+	}
+
+	return 0;
+}
-- 
cgit v1.2.3


From 5e32d0f15cc5c843a4115c4644d984d42524c794 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 29 Jul 2025 14:23:06 -0400
Subject: unwind_user/deferred: Add unwind_user_faultable()

Add a new API, unwind_user_faultable(), to retrieve a user space callstack.
The difference between this user space stack tracer and the current user
space stack tracer is that this one must be called from faultable context,
as it may use routines to access user space data that needs to be faulted
in.

It can be safely called when entering or exiting a system call, as the
code can still be faulted in there.

This code is based on work by Josh Poimboeuf's deferred unwinding code:

Link: https://lore.kernel.org/all/6052e8487746603bdb29b65f4033e739092d9925.1737511963.git.jpoimboe@kernel.org/

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Josh Poimboeuf <jpoimboe@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Indu Bhagat <indu.bhagat@oracle.com>
Cc: "Jose E. Marchesi" <jemarch@gnu.org>
Cc: Beau Belgrave <beaub@linux.microsoft.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Florian Weimer <fweimer@redhat.com>
Cc: Sam James <sam@gentoo.org>
Link: https://lore.kernel.org/20250729182405.147896868@kernel.org
Reviewed-by: Jens Remus <jremus@linux.ibm.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/sched.h                 |  5 +++
 include/linux/unwind_deferred.h       | 24 ++++++++++++++
 include/linux/unwind_deferred_types.h |  9 ++++++
 kernel/fork.c                         |  4 +++
 kernel/unwind/Makefile                |  2 +-
 kernel/unwind/deferred.c              | 60 +++++++++++++++++++++++++++++++++++
 6 files changed, 103 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/unwind_deferred.h
 create mode 100644 include/linux/unwind_deferred_types.h
 create mode 100644 kernel/unwind/deferred.c

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4f78a64beb52..59fdf7d9bb1e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -46,6 +46,7 @@
 #include <linux/rv.h>
 #include <linux/uidgid_types.h>
 #include <linux/tracepoint-defs.h>
+#include <linux/unwind_deferred_types.h>
 #include <asm/kmap_size.h>
 
 /* task_struct member predeclarations (sorted alphabetically): */
@@ -1654,6 +1655,10 @@ struct task_struct {
 	struct user_event_mm		*user_event_mm;
 #endif
 
+#ifdef CONFIG_UNWIND_USER
+	struct unwind_task_info		unwind_info;
+#endif
+
 	/* CPU-specific state of this task: */
 	struct thread_struct		thread;
 
diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h
new file mode 100644
index 000000000000..a5f6e8f8a1a2
--- /dev/null
+++ b/include/linux/unwind_deferred.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_UNWIND_USER_DEFERRED_H
+#define _LINUX_UNWIND_USER_DEFERRED_H
+
+#include <linux/unwind_user.h>
+#include <linux/unwind_deferred_types.h>
+
+#ifdef CONFIG_UNWIND_USER
+
+void unwind_task_init(struct task_struct *task);
+void unwind_task_free(struct task_struct *task);
+
+int unwind_user_faultable(struct unwind_stacktrace *trace);
+
+#else /* !CONFIG_UNWIND_USER */
+
+static inline void unwind_task_init(struct task_struct *task) {}
+static inline void unwind_task_free(struct task_struct *task) {}
+
+static inline int unwind_user_faultable(struct unwind_stacktrace *trace) { return -ENOSYS; }
+
+#endif /* !CONFIG_UNWIND_USER */
+
+#endif /* _LINUX_UNWIND_USER_DEFERRED_H */
diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h
new file mode 100644
index 000000000000..aa32db574e43
--- /dev/null
+++ b/include/linux/unwind_deferred_types.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_UNWIND_USER_DEFERRED_TYPES_H
+#define _LINUX_UNWIND_USER_DEFERRED_TYPES_H
+
+struct unwind_task_info {
+	unsigned long		*entries;
+};
+
+#endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 1ee8eb11f38b..3341d50c61f2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -105,6 +105,7 @@
 #include <uapi/linux/pidfd.h>
 #include <linux/pidfs.h>
 #include <linux/tick.h>
+#include <linux/unwind_deferred.h>
 
 #include <asm/pgalloc.h>
 #include <linux/uaccess.h>
@@ -732,6 +733,7 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(refcount_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
+	unwind_task_free(tsk);
 	sched_ext_free(tsk);
 	io_uring_free(tsk);
 	cgroup_free(tsk);
@@ -2135,6 +2137,8 @@ __latent_entropy struct task_struct *copy_process(
 	p->bpf_ctx = NULL;
 #endif
 
+	unwind_task_init(p);
+
 	/* Perform scheduler related setup. Assign this task to a CPU. */
 	retval = sched_fork(clone_flags, p);
 	if (retval)
diff --git a/kernel/unwind/Makefile b/kernel/unwind/Makefile
index 349ce3677526..eae37bea54fd 100644
--- a/kernel/unwind/Makefile
+++ b/kernel/unwind/Makefile
@@ -1 +1 @@
- obj-$(CONFIG_UNWIND_USER) += user.o
+ obj-$(CONFIG_UNWIND_USER)	+= user.o deferred.o
diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
new file mode 100644
index 000000000000..a0badbeb3cc1
--- /dev/null
+++ b/kernel/unwind/deferred.c
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Deferred user space unwinding
+ */
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/unwind_deferred.h>
+
+#define UNWIND_MAX_ENTRIES 512
+
+/**
+ * unwind_user_faultable - Produce a user stacktrace in faultable context
+ * @trace: The descriptor that will store the user stacktrace
+ *
+ * This must be called in a known faultable context (usually when entering
+ * or exiting user space). Depending on the available implementations
+ * the @trace will be loaded with the addresses of the user space stacktrace
+ * if it can be found.
+ *
+ * Return: 0 on success and negative on error
+ *         On success @trace will contain the user space stacktrace
+ */
+int unwind_user_faultable(struct unwind_stacktrace *trace)
+{
+	struct unwind_task_info *info = &current->unwind_info;
+
+	/* Should always be called from faultable context */
+	might_fault();
+
+	if (current->flags & PF_EXITING)
+		return -EINVAL;
+
+	if (!info->entries) {
+		info->entries = kmalloc_array(UNWIND_MAX_ENTRIES, sizeof(long),
+					      GFP_KERNEL);
+		if (!info->entries)
+			return -ENOMEM;
+	}
+
+	trace->nr = 0;
+	trace->entries = info->entries;
+	unwind_user(trace, UNWIND_MAX_ENTRIES);
+
+	return 0;
+}
+
+void unwind_task_init(struct task_struct *task)
+{
+	struct unwind_task_info *info = &task->unwind_info;
+
+	memset(info, 0, sizeof(*info));
+}
+
+void unwind_task_free(struct task_struct *task)
+{
+	struct unwind_task_info *info = &task->unwind_info;
+
+	kfree(info->entries);
+}
-- 
cgit v1.2.3


From 94fd44648dae2a5b6149a41faa0b07928c3e1963 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@kernel.org>
Date: Tue, 29 Jul 2025 16:18:25 -0700
Subject: fortify: Fix incorrect reporting of read buffer size

When FORTIFY_SOURCE reports about a run-time buffer overread, the wrong
buffer size was being shown in the error message. (The bounds checking
was correct.)

Fixes: 3d965b33e40d ("fortify: Improve buffer overflow reporting")
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Link: https://lore.kernel.org/r/20250729231817.work.023-kees@kernel.org
Signed-off-by: Kees Cook <kees@kernel.org>
---
 include/linux/fortify-string.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h
index e4ce1cae03bf..b3b53f8c1b28 100644
--- a/include/linux/fortify-string.h
+++ b/include/linux/fortify-string.h
@@ -596,7 +596,7 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size,
 	if (p_size != SIZE_MAX && p_size < size)
 		fortify_panic(func, FORTIFY_WRITE, p_size, size, true);
 	else if (q_size != SIZE_MAX && q_size < size)
-		fortify_panic(func, FORTIFY_READ, p_size, size, true);
+		fortify_panic(func, FORTIFY_READ, q_size, size, true);
 
 	/*
 	 * Warn when writing beyond destination field size.
-- 
cgit v1.2.3


From f627b51aaa041cba715b59026cf2d9cb1476c7ed Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@kernel.org>
Date: Tue, 29 Jul 2025 16:41:00 -0700
Subject: compiler_types: Provide __no_kstack_erase to disable coverage only on
 Clang

In order to support Clang's stack depth tracking (for Linux's kstack_erase
feature), the coverage sanitizer needed to be disabled for __init (and
__head) section code. Doing this universally (i.e. for GCC too) created
a number of unexpected problems, ranging from changes to inlining logic
to failures to DCE code on earlier GCC versions.

Since this change is only needed for Clang, specialize it so that GCC
doesn't see the change as it isn't needed there (the GCC implementation
of kstack_erase uses a GCC plugin that removes stack depth tracking
instrumentation from __init sections during a late pass in the IR).

Successfully build- and boot-tested with GCC 12 and Clang 22.

Fixes: 381a38ea53d2 ("init.h: Disable sanitizer coverage for __init and __head")
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202507270258.neWuiXLd-lkp@intel.com/
Reported-by: syzbot+5245cb609175fb6e8122@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/all/6888d004.a00a0220.26d0e1.0004.GAE@google.com/
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Reviewed-by: Marco Elver <elver@google.com>
Link: https://lore.kernel.org/r/20250729234055.it.233-kees@kernel.org
Signed-off-by: Kees Cook <kees@kernel.org>
---
 arch/x86/include/asm/init.h    | 2 +-
 include/linux/compiler-clang.h | 3 +++
 include/linux/compiler_types.h | 4 ++++
 include/linux/init.h           | 2 +-
 4 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index 6bfdaeddbae8..5a68e9db6518 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -5,7 +5,7 @@
 #if defined(CONFIG_CC_IS_CLANG) && CONFIG_CLANG_VERSION < 170000
 #define __head	__section(".head.text") __no_sanitize_undefined __no_stack_protector
 #else
-#define __head	__section(".head.text") __no_sanitize_undefined __no_sanitize_coverage
+#define __head	__section(".head.text") __no_sanitize_undefined __no_kstack_erase
 #endif
 
 struct x86_mapping_info {
diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
index 4fc8e26914ad..fa4ffe037bc7 100644
--- a/include/linux/compiler-clang.h
+++ b/include/linux/compiler-clang.h
@@ -89,6 +89,9 @@
 #define __no_sanitize_coverage
 #endif
 
+/* Only Clang needs to disable the coverage sanitizer for kstack_erase. */
+#define __no_kstack_erase	__no_sanitize_coverage
+
 #if __has_feature(shadow_call_stack)
 # define __noscs	__attribute__((__no_sanitize__("shadow-call-stack")))
 #endif
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 2b77d12e07b2..16755431fc11 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -424,6 +424,10 @@ struct ftrace_likely_data {
 # define randomized_struct_fields_end
 #endif
 
+#ifndef __no_kstack_erase
+# define __no_kstack_erase
+#endif
+
 #ifndef __noscs
 # define __noscs
 #endif
diff --git a/include/linux/init.h b/include/linux/init.h
index c65a050d52a7..a60d32d227ee 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -51,7 +51,7 @@
    discard it in modules) */
 #define __init		__section(".init.text") __cold __latent_entropy	\
 						__noinitretpoline	\
-						__no_sanitize_coverage
+						__no_kstack_erase
 #define __initdata	__section(".init.data")
 #define __initconst	__section(".init.rodata")
 #define __exitdata	__section(".exit.data")
-- 
cgit v1.2.3


From a11a51896572273d04a9f6011ad22738c52ba554 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Wed, 4 Jun 2025 15:52:19 +0200
Subject: spi: spi-mem: Take into account the actual maximum frequency

In order to pick the best variant, the duration of each typical
operation is derived and then compared. These durations are based on the
maximum capabilities of the chips, which are commonly the limiting
factors. However there are other possible limiting pieces, such as the
hardware layout, EMC considerations and in some cases, the SPI controller
itself.

We need to take this into account to further refine our variant choice,
so let's use the actual frequency that will be used for the operation
instead of the theoretical maximum.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Reviewed-by: Mark Brown <broonie@kernel.org>
---
 drivers/mtd/nand/spi/core.c |  2 +-
 drivers/spi/spi-mem.c       | 18 ++++++++++++++----
 include/linux/spi/spi-mem.h |  2 +-
 3 files changed, 16 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/mtd/nand/spi/core.c b/drivers/mtd/nand/spi/core.c
index b90f15c986a3..39b72947c44c 100644
--- a/drivers/mtd/nand/spi/core.c
+++ b/drivers/mtd/nand/spi/core.c
@@ -1297,7 +1297,7 @@ spinand_select_op_variant(struct spinand_device *spinand,
 
 			nbytes -= op.data.nbytes;
 
-			op_duration_ns += spi_mem_calc_op_duration(&op);
+			op_duration_ns += spi_mem_calc_op_duration(spinand->spimem, &op);
 		}
 
 		if (!nbytes && op_duration_ns < best_op_duration_ns) {
diff --git a/drivers/spi/spi-mem.c b/drivers/spi/spi-mem.c
index c42c227eb2a2..d3b7e857b377 100644
--- a/drivers/spi/spi-mem.c
+++ b/drivers/spi/spi-mem.c
@@ -586,15 +586,25 @@ EXPORT_SYMBOL_GPL(spi_mem_adjust_op_freq);
  * accurate, all these combinations should be rated (eg. with a time estimate)
  * and the best pick should be taken based on these calculations.
  *
- * Returns a ns estimate for the time this op would take.
+ * Returns a ns estimate for the time this op would take, except if no
+ * frequency limit has been set, in this case we return the number of
+ * cycles nevertheless to allow callers to distinguish which operation
+ * would be the fastest at iso-frequency.
  */
-u64 spi_mem_calc_op_duration(struct spi_mem_op *op)
+u64 spi_mem_calc_op_duration(struct spi_mem *mem, struct spi_mem_op *op)
 {
 	u64 ncycles = 0;
 	u64 ps_per_cycles, duration;
 
-	ps_per_cycles = 1000000000000ULL;
-	do_div(ps_per_cycles, op->max_freq);
+	spi_mem_adjust_op_freq(mem, op);
+
+	if (op->max_freq) {
+		ps_per_cycles = 1000000000000ULL;
+		do_div(ps_per_cycles, op->max_freq);
+	} else {
+		/* In this case, the unit is no longer a time unit */
+		ps_per_cycles = 1;
+	}
 
 	ncycles += ((op->cmd.nbytes * 8) / op->cmd.buswidth) / (op->cmd.dtr ? 2 : 1);
 	ncycles += ((op->addr.nbytes * 8) / op->addr.buswidth) / (op->addr.dtr ? 2 : 1);
diff --git a/include/linux/spi/spi-mem.h b/include/linux/spi/spi-mem.h
index c4830dfaff3d..82390712794c 100644
--- a/include/linux/spi/spi-mem.h
+++ b/include/linux/spi/spi-mem.h
@@ -424,7 +424,7 @@ bool spi_mem_default_supports_op(struct spi_mem *mem,
 
 int spi_mem_adjust_op_size(struct spi_mem *mem, struct spi_mem_op *op);
 void spi_mem_adjust_op_freq(struct spi_mem *mem, struct spi_mem_op *op);
-u64 spi_mem_calc_op_duration(struct spi_mem_op *op);
+u64 spi_mem_calc_op_duration(struct spi_mem *mem, struct spi_mem_op *op);
 
 bool spi_mem_supports_op(struct spi_mem *mem,
 			 const struct spi_mem_op *op);
-- 
cgit v1.2.3


From 5de7ea49653f6b988525b559802da615a61ffbea Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Wed, 18 Jun 2025 14:14:20 +0200
Subject: mtd: spinand: Fix macro alignment

No functional change, just a style fix to align with the other
macros all around.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 include/linux/mtd/spinand.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h
index 15eaa09da998..28a013f4f4f3 100644
--- a/include/linux/mtd/spinand.h
+++ b/include/linux/mtd/spinand.h
@@ -71,9 +71,9 @@
 
 #define SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(addr, ndummy, buf, len) \
 	SPI_MEM_OP(SPI_MEM_OP_CMD(0x0b, 1),				\
-			 SPI_MEM_OP_ADDR(2, addr, 1),			\
-			 SPI_MEM_OP_DUMMY(ndummy, 1),			\
-			 SPI_MEM_OP_DATA_IN(len, buf, 1))
+		   SPI_MEM_OP_ADDR(2, addr, 1),				\
+		   SPI_MEM_OP_DUMMY(ndummy, 1),				\
+		   SPI_MEM_OP_DATA_IN(len, buf, 1))
 
 #define SPINAND_PAGE_READ_FROM_CACHE_3A_1S_1S_1S_OP(addr, ndummy, buf, len) \
 	SPI_MEM_OP(SPI_MEM_OP_CMD(0x03, 1),				\
-- 
cgit v1.2.3


From d81ad9d78e2cd5bdefd390a83553203668a96092 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Wed, 18 Jun 2025 14:14:21 +0200
Subject: mtd: spinand: Add a frequency field to all READ_FROM_CACHE variants

These macros initially had no frequency field. When I added the "maximum
operation frequency" field, I did it initially on very common macros and
I decided to add an optional field for that (with VA_ARGS) in order to
prevent massively unreadable changes. I then added new variants in the
spinand.h header, and requested a frequency field for them by
default. Some time later, I also added maximum frequencies to other
existing variants, but I did it incorrectly, without noticing I was
wrong because the field was optional.

This mix is error prone, so let's do what I should have done since the
very beginning: add a frequency field to all READ_FROM_CACHE variants.

There is no functional change.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/mtd/nand/spi/alliancememory.c | 12 ++++----
 drivers/mtd/nand/spi/ato.c            |  6 ++--
 drivers/mtd/nand/spi/esmt.c           |  8 ++---
 drivers/mtd/nand/spi/foresee.c        |  8 ++---
 drivers/mtd/nand/spi/gigadevice.c     | 48 ++++++++++++++---------------
 drivers/mtd/nand/spi/macronix.c       |  8 ++---
 drivers/mtd/nand/spi/micron.c         | 20 ++++++------
 drivers/mtd/nand/spi/paragon.c        | 12 ++++----
 drivers/mtd/nand/spi/skyhigh.c        | 12 ++++----
 drivers/mtd/nand/spi/toshiba.c        |  8 ++---
 drivers/mtd/nand/spi/winbond.c        | 22 +++++++-------
 drivers/mtd/nand/spi/xtx.c            | 12 ++++----
 include/linux/mtd/spinand.h           | 57 ++++++++++++++++++++---------------
 13 files changed, 121 insertions(+), 112 deletions(-)

(limited to 'include')

diff --git a/drivers/mtd/nand/spi/alliancememory.c b/drivers/mtd/nand/spi/alliancememory.c
index 2ee498230ec1..9e97c40955c9 100644
--- a/drivers/mtd/nand/spi/alliancememory.c
+++ b/drivers/mtd/nand/spi/alliancememory.c
@@ -17,12 +17,12 @@
 #define AM_STATUS_ECC_MAX_CORRECTED	(3 << 4)
 
 static SPINAND_OP_VARIANTS(read_cache_variants,
-		SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(0, 1, NULL, 0));
+		SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(0, 1, NULL, 0, 0));
 
 static SPINAND_OP_VARIANTS(write_cache_variants,
 			   SPINAND_PROG_LOAD_1S_1S_4S_OP(true, 0, NULL, 0),
diff --git a/drivers/mtd/nand/spi/ato.c b/drivers/mtd/nand/spi/ato.c
index 2b4df1d917ac..45d38ce0736c 100644
--- a/drivers/mtd/nand/spi/ato.c
+++ b/drivers/mtd/nand/spi/ato.c
@@ -14,9 +14,9 @@
 
 
 static SPINAND_OP_VARIANTS(read_cache_variants,
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(0, 1, NULL, 0));
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(0, 1, NULL, 0, 0));
 
 static SPINAND_OP_VARIANTS(write_cache_variants,
 		SPINAND_PROG_LOAD_1S_1S_4S_OP(true, 0, NULL, 0),
diff --git a/drivers/mtd/nand/spi/esmt.c b/drivers/mtd/nand/spi/esmt.c
index 9e286612a296..9a9325c0bc49 100644
--- a/drivers/mtd/nand/spi/esmt.c
+++ b/drivers/mtd/nand/spi/esmt.c
@@ -18,10 +18,10 @@
 	(CFG_OTP_ENABLE | ESMT_F50L1G41LB_CFG_OTP_PROTECT)
 
 static SPINAND_OP_VARIANTS(read_cache_variants,
-			   SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0),
-			   SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(0, 1, NULL, 0),
-			   SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0),
-			   SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(0, 1, NULL, 0));
+			   SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0, 0),
+			   SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(0, 1, NULL, 0, 0),
+			   SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0, 0),
+			   SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(0, 1, NULL, 0, 0));
 
 static SPINAND_OP_VARIANTS(write_cache_variants,
 			   SPINAND_PROG_LOAD_1S_1S_4S_OP(true, 0, NULL, 0),
diff --git a/drivers/mtd/nand/spi/foresee.c b/drivers/mtd/nand/spi/foresee.c
index 7c61644bfb10..c521dd6abc4b 100644
--- a/drivers/mtd/nand/spi/foresee.c
+++ b/drivers/mtd/nand/spi/foresee.c
@@ -12,10 +12,10 @@
 #define SPINAND_MFR_FORESEE		0xCD
 
 static SPINAND_OP_VARIANTS(read_cache_variants,
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(0, 1, NULL, 0));
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(0, 1, NULL, 0, 0));
 
 static SPINAND_OP_VARIANTS(write_cache_variants,
 		SPINAND_PROG_LOAD_1S_1S_4S_OP(true, 0, NULL, 0),
diff --git a/drivers/mtd/nand/spi/gigadevice.c b/drivers/mtd/nand/spi/gigadevice.c
index b4087767fe50..93e40431dbe2 100644
--- a/drivers/mtd/nand/spi/gigadevice.c
+++ b/drivers/mtd/nand/spi/gigadevice.c
@@ -24,36 +24,36 @@
 #define GD5FXGQ4UXFXXG_STATUS_ECC_UNCOR_ERROR	(7 << 4)
 
 static SPINAND_OP_VARIANTS(read_cache_variants,
-		SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(0, 1, NULL, 0));
+		SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(0, 1, NULL, 0, 0));
 
 static SPINAND_OP_VARIANTS(read_cache_variants_f,
-		SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_3A_1S_1S_4S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_3A_1S_1S_2S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_FAST_3A_1S_1S_1S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_3A_1S_1S_1S_OP(0, 0, NULL, 0));
+		SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_3A_1S_1S_4S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_3A_1S_1S_2S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_FAST_3A_1S_1S_1S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_3A_1S_1S_1S_OP(0, 0, NULL, 0, 0));
 
 static SPINAND_OP_VARIANTS(read_cache_variants_1gq5,
-		SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(0, 2, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(0, 1, NULL, 0));
+		SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(0, 2, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(0, 1, NULL, 0, 0));
 
 static SPINAND_OP_VARIANTS(read_cache_variants_2gq5,
-		SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(0, 4, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(0, 2, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(0, 1, NULL, 0));
+		SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(0, 4, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(0, 2, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(0, 1, NULL, 0, 0));
 
 static SPINAND_OP_VARIANTS(write_cache_variants,
 		SPINAND_PROG_LOAD_1S_1S_4S_OP(true, 0, NULL, 0),
diff --git a/drivers/mtd/nand/spi/macronix.c b/drivers/mtd/nand/spi/macronix.c
index eeaf5bf9f082..edf63b9996cf 100644
--- a/drivers/mtd/nand/spi/macronix.c
+++ b/drivers/mtd/nand/spi/macronix.c
@@ -28,10 +28,10 @@ struct macronix_priv {
 };
 
 static SPINAND_OP_VARIANTS(read_cache_variants,
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(0, 1, NULL, 0));
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(0, 1, NULL, 0, 0));
 
 static SPINAND_OP_VARIANTS(write_cache_variants,
 		SPINAND_PROG_LOAD_1S_1S_4S_OP(true, 0, NULL, 0),
diff --git a/drivers/mtd/nand/spi/micron.c b/drivers/mtd/nand/spi/micron.c
index 8281c9d3f4f7..a49d7cb6a96d 100644
--- a/drivers/mtd/nand/spi/micron.c
+++ b/drivers/mtd/nand/spi/micron.c
@@ -35,12 +35,12 @@
 	(CFG_OTP_ENABLE | MICRON_MT29F2G01ABAGD_CFG_OTP_STATE)
 
 static SPINAND_OP_VARIANTS(quadio_read_cache_variants,
-		SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(0, 2, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(0, 1, NULL, 0));
+		SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(0, 2, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(0, 1, NULL, 0, 0));
 
 static SPINAND_OP_VARIANTS(x4_write_cache_variants,
 		SPINAND_PROG_LOAD_1S_1S_4S_OP(true, 0, NULL, 0),
@@ -52,10 +52,10 @@ static SPINAND_OP_VARIANTS(x4_update_cache_variants,
 
 /* Micron  MT29F2G01AAAED Device */
 static SPINAND_OP_VARIANTS(x4_read_cache_variants,
-			   SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0),
-			   SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(0, 1, NULL, 0),
-			   SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0),
-			   SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(0, 1, NULL, 0));
+			   SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0, 0),
+			   SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(0, 1, NULL, 0, 0),
+			   SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0, 0),
+			   SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(0, 1, NULL, 0, 0));
 
 static SPINAND_OP_VARIANTS(x1_write_cache_variants,
 			   SPINAND_PROG_LOAD_1S_1S_1S_OP(true, 0, NULL, 0));
diff --git a/drivers/mtd/nand/spi/paragon.c b/drivers/mtd/nand/spi/paragon.c
index 4670bac41245..73bd124273a5 100644
--- a/drivers/mtd/nand/spi/paragon.c
+++ b/drivers/mtd/nand/spi/paragon.c
@@ -22,12 +22,12 @@
 
 
 static SPINAND_OP_VARIANTS(read_cache_variants,
-		SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(0, 2, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(0, 1, NULL, 0));
+		SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(0, 2, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(0, 1, NULL, 0, 0));
 
 static SPINAND_OP_VARIANTS(write_cache_variants,
 		SPINAND_PROG_LOAD_1S_1S_4S_OP(true, 0, NULL, 0),
diff --git a/drivers/mtd/nand/spi/skyhigh.c b/drivers/mtd/nand/spi/skyhigh.c
index 51d61785df61..bf9ce163e6a7 100644
--- a/drivers/mtd/nand/spi/skyhigh.c
+++ b/drivers/mtd/nand/spi/skyhigh.c
@@ -17,12 +17,12 @@
 #define SKYHIGH_CONFIG_PROTECT_EN		BIT(1)
 
 static SPINAND_OP_VARIANTS(read_cache_variants,
-		SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(0, 4, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(0, 2, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(0, 1, NULL, 0));
+		SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(0, 4, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(0, 2, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(0, 1, NULL, 0, 0));
 
 static SPINAND_OP_VARIANTS(write_cache_variants,
 		SPINAND_PROG_LOAD_1S_1S_4S_OP(true, 0, NULL, 0),
diff --git a/drivers/mtd/nand/spi/toshiba.c b/drivers/mtd/nand/spi/toshiba.c
index 4c6923047aeb..6530257ac0be 100644
--- a/drivers/mtd/nand/spi/toshiba.c
+++ b/drivers/mtd/nand/spi/toshiba.c
@@ -15,10 +15,10 @@
 #define TOSH_STATUS_ECC_HAS_BITFLIPS_T	(3 << 4)
 
 static SPINAND_OP_VARIANTS(read_cache_variants,
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(0, 1, NULL, 0));
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(0, 1, NULL, 0, 0));
 
 static SPINAND_OP_VARIANTS(write_cache_x4_variants,
 		SPINAND_PROG_LOAD_1S_1S_4S_OP(true, 0, NULL, 0),
diff --git a/drivers/mtd/nand/spi/winbond.c b/drivers/mtd/nand/spi/winbond.c
index b7a28f001a38..7a9e22e81dcf 100644
--- a/drivers/mtd/nand/spi/winbond.c
+++ b/drivers/mtd/nand/spi/winbond.c
@@ -27,8 +27,8 @@ static SPINAND_OP_VARIANTS(read_cache_octal_variants,
 		SPINAND_PAGE_READ_FROM_CACHE_1S_1D_8D_OP(0, 2, NULL, 0, 105 * HZ_PER_MHZ),
 		SPINAND_PAGE_READ_FROM_CACHE_1S_8S_8S_OP(0, 16, NULL, 0, 162 * HZ_PER_MHZ),
 		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_8S_OP(0, 1, NULL, 0, 133 * HZ_PER_MHZ),
-		SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(0, 1, NULL, 0));
+		SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(0, 1, NULL, 0, 0));
 
 static SPINAND_OP_VARIANTS(write_cache_octal_variants,
 		SPINAND_PROG_LOAD_1S_8S_8S_OP(true, 0, NULL, 0),
@@ -43,22 +43,22 @@ static SPINAND_OP_VARIANTS(read_cache_dual_quad_dtr_variants,
 		SPINAND_PAGE_READ_FROM_CACHE_1S_4D_4D_OP(0, 8, NULL, 0, 80 * HZ_PER_MHZ),
 		SPINAND_PAGE_READ_FROM_CACHE_1S_1D_4D_OP(0, 2, NULL, 0, 80 * HZ_PER_MHZ),
 		SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(0, 2, NULL, 0, 104 * HZ_PER_MHZ),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0, 0),
 		SPINAND_PAGE_READ_FROM_CACHE_1S_2D_2D_OP(0, 4, NULL, 0, 80 * HZ_PER_MHZ),
 		SPINAND_PAGE_READ_FROM_CACHE_1S_1D_2D_OP(0, 2, NULL, 0, 80 * HZ_PER_MHZ),
 		SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(0, 1, NULL, 0, 104 * HZ_PER_MHZ),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(0, 1, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(0, 1, NULL, 0, 0),
 		SPINAND_PAGE_READ_FROM_CACHE_1S_1D_1D_OP(0, 2, NULL, 0, 80 * HZ_PER_MHZ),
-		SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0, 0),
 		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(0, 1, NULL, 0, 54 * HZ_PER_MHZ));
 
 static SPINAND_OP_VARIANTS(read_cache_variants,
-		SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(0, 2, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(0, 1, NULL, 0));
+		SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(0, 2, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(0, 1, NULL, 0, 0));
 
 static SPINAND_OP_VARIANTS(write_cache_variants,
 		SPINAND_PROG_LOAD_1S_1S_4S_OP(true, 0, NULL, 0),
diff --git a/drivers/mtd/nand/spi/xtx.c b/drivers/mtd/nand/spi/xtx.c
index 37336d5958a9..5915b37b47f5 100644
--- a/drivers/mtd/nand/spi/xtx.c
+++ b/drivers/mtd/nand/spi/xtx.c
@@ -23,12 +23,12 @@
 #define XT26XXXD_STATUS_ECC_UNCOR_ERROR     (2)
 
 static SPINAND_OP_VARIANTS(read_cache_variants,
-		SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0),
-		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(0, 1, NULL, 0));
+		SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(0, 1, NULL, 0, 0));
 
 static SPINAND_OP_VARIANTS(write_cache_variants,
 		SPINAND_PROG_LOAD_1S_1S_4S_OP(true, 0, NULL, 0),
diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h
index 28a013f4f4f3..61a4571cec7e 100644
--- a/include/linux/mtd/spinand.h
+++ b/include/linux/mtd/spinand.h
@@ -62,30 +62,33 @@
 		   SPI_MEM_OP_NO_DUMMY,					\
 		   SPI_MEM_OP_NO_DATA)
 
-#define SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(addr, ndummy, buf, len, ...) \
+#define SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(addr, ndummy, buf, len, freq) \
 	SPI_MEM_OP(SPI_MEM_OP_CMD(0x03, 1),				\
 		   SPI_MEM_OP_ADDR(2, addr, 1),				\
 		   SPI_MEM_OP_DUMMY(ndummy, 1),				\
 		   SPI_MEM_OP_DATA_IN(len, buf, 1),			\
-		   SPI_MEM_OP_MAX_FREQ(__VA_ARGS__ + 0))
+		   SPI_MEM_OP_MAX_FREQ(freq))
 
-#define SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(addr, ndummy, buf, len) \
+#define SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(addr, ndummy, buf, len, freq) \
 	SPI_MEM_OP(SPI_MEM_OP_CMD(0x0b, 1),				\
 		   SPI_MEM_OP_ADDR(2, addr, 1),				\
 		   SPI_MEM_OP_DUMMY(ndummy, 1),				\
-		   SPI_MEM_OP_DATA_IN(len, buf, 1))
+		   SPI_MEM_OP_DATA_IN(len, buf, 1),			\
+		   SPI_MEM_OP_MAX_FREQ(freq))
 
-#define SPINAND_PAGE_READ_FROM_CACHE_3A_1S_1S_1S_OP(addr, ndummy, buf, len) \
+#define SPINAND_PAGE_READ_FROM_CACHE_3A_1S_1S_1S_OP(addr, ndummy, buf, len, freq) \
 	SPI_MEM_OP(SPI_MEM_OP_CMD(0x03, 1),				\
 		   SPI_MEM_OP_ADDR(3, addr, 1),				\
 		   SPI_MEM_OP_DUMMY(ndummy, 1),				\
-		   SPI_MEM_OP_DATA_IN(len, buf, 1))
+		   SPI_MEM_OP_DATA_IN(len, buf, 1),			\
+		   SPI_MEM_OP_MAX_FREQ(freq))
 
-#define SPINAND_PAGE_READ_FROM_CACHE_FAST_3A_1S_1S_1S_OP(addr, ndummy, buf, len) \
+#define SPINAND_PAGE_READ_FROM_CACHE_FAST_3A_1S_1S_1S_OP(addr, ndummy, buf, len, freq) \
 	SPI_MEM_OP(SPI_MEM_OP_CMD(0x0b, 1),				\
 		   SPI_MEM_OP_ADDR(3, addr, 1),				\
 		   SPI_MEM_OP_DUMMY(ndummy, 1),				\
-		   SPI_MEM_OP_DATA_IN(len, buf, 1))
+		   SPI_MEM_OP_DATA_IN(len, buf, 1),			\
+		   SPI_MEM_OP_MAX_FREQ(freq))
 
 #define SPINAND_PAGE_READ_FROM_CACHE_1S_1D_1D_OP(addr, ndummy, buf, len, freq) \
 	SPI_MEM_OP(SPI_MEM_OP_CMD(0x0d, 1),				\
@@ -94,17 +97,19 @@
 		   SPI_MEM_DTR_OP_DATA_IN(len, buf, 1),			\
 		   SPI_MEM_OP_MAX_FREQ(freq))
 
-#define SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(addr, ndummy, buf, len) \
+#define SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(addr, ndummy, buf, len, freq) \
 	SPI_MEM_OP(SPI_MEM_OP_CMD(0x3b, 1),				\
 		   SPI_MEM_OP_ADDR(2, addr, 1),				\
 		   SPI_MEM_OP_DUMMY(ndummy, 1),				\
-		   SPI_MEM_OP_DATA_IN(len, buf, 2))
+		   SPI_MEM_OP_DATA_IN(len, buf, 2),			\
+		   SPI_MEM_OP_MAX_FREQ(freq))
 
-#define SPINAND_PAGE_READ_FROM_CACHE_3A_1S_1S_2S_OP(addr, ndummy, buf, len) \
+#define SPINAND_PAGE_READ_FROM_CACHE_3A_1S_1S_2S_OP(addr, ndummy, buf, len, freq) \
 	SPI_MEM_OP(SPI_MEM_OP_CMD(0x3b, 1),				\
 		   SPI_MEM_OP_ADDR(3, addr, 1),				\
 		   SPI_MEM_OP_DUMMY(ndummy, 1),				\
-		   SPI_MEM_OP_DATA_IN(len, buf, 2))
+		   SPI_MEM_OP_DATA_IN(len, buf, 2),			\
+		   SPI_MEM_OP_MAX_FREQ(freq))
 
 #define SPINAND_PAGE_READ_FROM_CACHE_1S_1D_2D_OP(addr, ndummy, buf, len, freq) \
 	SPI_MEM_OP(SPI_MEM_OP_CMD(0x3d, 1),				\
@@ -113,18 +118,19 @@
 		   SPI_MEM_DTR_OP_DATA_IN(len, buf, 2),			\
 		   SPI_MEM_OP_MAX_FREQ(freq))
 
-#define SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(addr, ndummy, buf, len, ...) \
+#define SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(addr, ndummy, buf, len, freq) \
 	SPI_MEM_OP(SPI_MEM_OP_CMD(0xbb, 1),				\
 		   SPI_MEM_OP_ADDR(2, addr, 2),				\
 		   SPI_MEM_OP_DUMMY(ndummy, 2),				\
 		   SPI_MEM_OP_DATA_IN(len, buf, 2),			\
-		   SPI_MEM_OP_MAX_FREQ(__VA_ARGS__ + 0))
+		   SPI_MEM_OP_MAX_FREQ(freq))
 
-#define SPINAND_PAGE_READ_FROM_CACHE_3A_1S_2S_2S_OP(addr, ndummy, buf, len) \
+#define SPINAND_PAGE_READ_FROM_CACHE_3A_1S_2S_2S_OP(addr, ndummy, buf, len, freq) \
 	SPI_MEM_OP(SPI_MEM_OP_CMD(0xbb, 1),				\
 		   SPI_MEM_OP_ADDR(3, addr, 2),				\
 		   SPI_MEM_OP_DUMMY(ndummy, 2),				\
-		   SPI_MEM_OP_DATA_IN(len, buf, 2))
+		   SPI_MEM_OP_DATA_IN(len, buf, 2),			\
+		   SPI_MEM_OP_MAX_FREQ(freq))
 
 #define SPINAND_PAGE_READ_FROM_CACHE_1S_2D_2D_OP(addr, ndummy, buf, len, freq) \
 	SPI_MEM_OP(SPI_MEM_OP_CMD(0xbd, 1),				\
@@ -133,17 +139,19 @@
 		   SPI_MEM_DTR_OP_DATA_IN(len, buf, 2),			\
 		   SPI_MEM_OP_MAX_FREQ(freq))
 
-#define SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(addr, ndummy, buf, len) \
+#define SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(addr, ndummy, buf, len, freq) \
 	SPI_MEM_OP(SPI_MEM_OP_CMD(0x6b, 1),				\
 		   SPI_MEM_OP_ADDR(2, addr, 1),				\
 		   SPI_MEM_OP_DUMMY(ndummy, 1),				\
-		   SPI_MEM_OP_DATA_IN(len, buf, 4))
+		   SPI_MEM_OP_DATA_IN(len, buf, 4),			\
+		   SPI_MEM_OP_MAX_FREQ(freq))
 
-#define SPINAND_PAGE_READ_FROM_CACHE_3A_1S_1S_4S_OP(addr, ndummy, buf, len)	\
+#define SPINAND_PAGE_READ_FROM_CACHE_3A_1S_1S_4S_OP(addr, ndummy, buf, len, freq) \
 	SPI_MEM_OP(SPI_MEM_OP_CMD(0x6b, 1),				\
 		   SPI_MEM_OP_ADDR(3, addr, 1),				\
 		   SPI_MEM_OP_DUMMY(ndummy, 1),				\
-		   SPI_MEM_OP_DATA_IN(len, buf, 4))
+		   SPI_MEM_OP_DATA_IN(len, buf, 4),			\
+		   SPI_MEM_OP_MAX_FREQ(freq))
 
 #define SPINAND_PAGE_READ_FROM_CACHE_1S_1D_4D_OP(addr, ndummy, buf, len, freq) \
 	SPI_MEM_OP(SPI_MEM_OP_CMD(0x6d, 1),				\
@@ -152,18 +160,19 @@
 		   SPI_MEM_DTR_OP_DATA_IN(len, buf, 4),			\
 		   SPI_MEM_OP_MAX_FREQ(freq))
 
-#define SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(addr, ndummy, buf, len, ...) \
+#define SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(addr, ndummy, buf, len, freq) \
 	SPI_MEM_OP(SPI_MEM_OP_CMD(0xeb, 1),				\
 		   SPI_MEM_OP_ADDR(2, addr, 4),				\
 		   SPI_MEM_OP_DUMMY(ndummy, 4),				\
 		   SPI_MEM_OP_DATA_IN(len, buf, 4),			\
-		   SPI_MEM_OP_MAX_FREQ(__VA_ARGS__ + 0))
+		   SPI_MEM_OP_MAX_FREQ(freq))
 
-#define SPINAND_PAGE_READ_FROM_CACHE_3A_1S_4S_4S_OP(addr, ndummy, buf, len) \
+#define SPINAND_PAGE_READ_FROM_CACHE_3A_1S_4S_4S_OP(addr, ndummy, buf, len, freq) \
 	SPI_MEM_OP(SPI_MEM_OP_CMD(0xeb, 1),				\
 		   SPI_MEM_OP_ADDR(3, addr, 4),				\
 		   SPI_MEM_OP_DUMMY(ndummy, 4),				\
-		   SPI_MEM_OP_DATA_IN(len, buf, 4))
+		   SPI_MEM_OP_DATA_IN(len, buf, 4),			\
+		   SPI_MEM_OP_MAX_FREQ(freq))
 
 #define SPINAND_PAGE_READ_FROM_CACHE_1S_4D_4D_OP(addr, ndummy, buf, len, freq) \
 	SPI_MEM_OP(SPI_MEM_OP_CMD(0xed, 1),				\
-- 
cgit v1.2.3


From da55809ebb45d1d80b7a388ffef841ed683e1a6f Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Wed, 18 Jun 2025 14:14:22 +0200
Subject: mtd: spinand: Add a ->configure_chip() hook

There is already a manufacturer hook, which is manufacturer specific but
not chip specific. We no longer have access to the actual NAND identity
at this stage so let's add a per-chip configuration hook to align the
chip configuration (if any) with the core's setting.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/mtd/nand/spi/core.c | 16 ++++++++++++++--
 include/linux/mtd/spinand.h |  7 +++++++
 2 files changed, 21 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/mtd/nand/spi/core.c b/drivers/mtd/nand/spi/core.c
index 39b72947c44c..9ed3cf2e5dba 100644
--- a/drivers/mtd/nand/spi/core.c
+++ b/drivers/mtd/nand/spi/core.c
@@ -1253,8 +1253,19 @@ static int spinand_id_detect(struct spinand_device *spinand)
 
 static int spinand_manufacturer_init(struct spinand_device *spinand)
 {
-	if (spinand->manufacturer->ops->init)
-		return spinand->manufacturer->ops->init(spinand);
+	int ret;
+
+	if (spinand->manufacturer->ops->init) {
+		ret = spinand->manufacturer->ops->init(spinand);
+		if (ret)
+			return ret;
+	}
+
+	if (spinand->configure_chip) {
+		ret = spinand->configure_chip(spinand);
+		if (ret)
+			return ret;
+	}
 
 	return 0;
 }
@@ -1349,6 +1360,7 @@ int spinand_match_and_init(struct spinand_device *spinand,
 		spinand->flags = table[i].flags;
 		spinand->id.len = 1 + table[i].devid.len;
 		spinand->select_target = table[i].select_target;
+		spinand->configure_chip = table[i].configure_chip;
 		spinand->set_cont_read = table[i].set_cont_read;
 		spinand->fact_otp = &table[i].fact_otp;
 		spinand->user_otp = &table[i].user_otp;
diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h
index 61a4571cec7e..69674fd191d9 100644
--- a/include/linux/mtd/spinand.h
+++ b/include/linux/mtd/spinand.h
@@ -493,6 +493,7 @@ struct spinand_user_otp {
  * @op_variants.update_cache: variants of the update-cache operation
  * @select_target: function used to select a target/die. Required only for
  *		   multi-die chips
+ * @configure_chip: Align the chip configuration with the core settings
  * @set_cont_read: enable/disable continuous cached reads
  * @fact_otp: SPI NAND factory OTP info.
  * @user_otp: SPI NAND user OTP info.
@@ -516,6 +517,7 @@ struct spinand_info {
 	} op_variants;
 	int (*select_target)(struct spinand_device *spinand,
 			     unsigned int target);
+	int (*configure_chip)(struct spinand_device *spinand);
 	int (*set_cont_read)(struct spinand_device *spinand,
 			     bool enable);
 	struct spinand_fact_otp fact_otp;
@@ -548,6 +550,9 @@ struct spinand_info {
 #define SPINAND_SELECT_TARGET(__func)					\
 	.select_target = __func
 
+#define SPINAND_CONFIGURE_CHIP(__configure_chip)			\
+	.configure_chip = __configure_chip
+
 #define SPINAND_CONT_READ(__set_cont_read)				\
 	.set_cont_read = __set_cont_read
 
@@ -616,6 +621,7 @@ struct spinand_dirmap {
  *		passed in spi_mem_op be DMA-able, so we can't based the bufs on
  *		the stack
  * @manufacturer: SPI NAND manufacturer information
+ * @configure_chip: Align the chip configuration with the core settings
  * @cont_read_possible: Field filled by the core once the whole system
  *		configuration is known to tell whether continuous reads are
  *		suitable to use or not in general with this chip/configuration.
@@ -656,6 +662,7 @@ struct spinand_device {
 	const struct spinand_manufacturer *manufacturer;
 	void *priv;
 
+	int (*configure_chip)(struct spinand_device *spinand);
 	bool cont_read_possible;
 	int (*set_cont_read)(struct spinand_device *spinand,
 			     bool enable);
-- 
cgit v1.2.3


From f1a91175faaab02a45d1ceb313a315a5bfeb5416 Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Wed, 18 Jun 2025 14:14:23 +0200
Subject: mtd: spinand: winbond: Enable high-speed modes on w25n0xjw

w25n0xjw chips have a high-speed capability hidden in a configuration
register. Once enabled, dual/quad SDR reads may be performed at a much
higher frequency.

Implement the new ->configure_chip() hook for this purpose and configure
the SR4 register accordingly.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/mtd/nand/spi/core.c    |  2 +-
 drivers/mtd/nand/spi/winbond.c | 45 ++++++++++++++++++++++++++++++++++++++++--
 include/linux/mtd/spinand.h    |  1 +
 3 files changed, 45 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/mtd/nand/spi/core.c b/drivers/mtd/nand/spi/core.c
index 9ed3cf2e5dba..93d22b412dd3 100644
--- a/drivers/mtd/nand/spi/core.c
+++ b/drivers/mtd/nand/spi/core.c
@@ -20,7 +20,7 @@
 #include <linux/spi/spi.h>
 #include <linux/spi/spi-mem.h>
 
-static int spinand_read_reg_op(struct spinand_device *spinand, u8 reg, u8 *val)
+int spinand_read_reg_op(struct spinand_device *spinand, u8 reg, u8 *val)
 {
 	struct spi_mem_op op = SPINAND_GET_FEATURE_1S_1S_1S_OP(reg,
 						      spinand->scratchbuf);
diff --git a/drivers/mtd/nand/spi/winbond.c b/drivers/mtd/nand/spi/winbond.c
index 7a9e22e81dcf..18ae6f58a546 100644
--- a/drivers/mtd/nand/spi/winbond.c
+++ b/drivers/mtd/nand/spi/winbond.c
@@ -18,6 +18,9 @@
 
 #define W25N04KV_STATUS_ECC_5_8_BITFLIPS	(3 << 4)
 
+#define W25N0XJW_SR4			0xD0
+#define W25N0XJW_SR4_HS			BIT(2)
+
 /*
  * "X2" in the core is equivalent to "dual output" in the datasheets,
  * "X4" in the core is equivalent to "quad output" in the datasheets.
@@ -42,10 +45,12 @@ static SPINAND_OP_VARIANTS(update_cache_octal_variants,
 static SPINAND_OP_VARIANTS(read_cache_dual_quad_dtr_variants,
 		SPINAND_PAGE_READ_FROM_CACHE_1S_4D_4D_OP(0, 8, NULL, 0, 80 * HZ_PER_MHZ),
 		SPINAND_PAGE_READ_FROM_CACHE_1S_1D_4D_OP(0, 2, NULL, 0, 80 * HZ_PER_MHZ),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(0, 4, NULL, 0, 0),
 		SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(0, 2, NULL, 0, 104 * HZ_PER_MHZ),
 		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0, 0),
 		SPINAND_PAGE_READ_FROM_CACHE_1S_2D_2D_OP(0, 4, NULL, 0, 80 * HZ_PER_MHZ),
 		SPINAND_PAGE_READ_FROM_CACHE_1S_1D_2D_OP(0, 2, NULL, 0, 80 * HZ_PER_MHZ),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(0, 2, NULL, 0, 0),
 		SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(0, 1, NULL, 0, 104 * HZ_PER_MHZ),
 		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(0, 1, NULL, 0, 0),
 		SPINAND_PAGE_READ_FROM_CACHE_1S_1D_1D_OP(0, 2, NULL, 0, 80 * HZ_PER_MHZ),
@@ -230,6 +235,40 @@ static int w25n02kv_ecc_get_status(struct spinand_device *spinand,
 	return -EINVAL;
 }
 
+static int w25n0xjw_hs_cfg(struct spinand_device *spinand)
+{
+	const struct spi_mem_op *op;
+	bool hs;
+	u8 sr4;
+	int ret;
+
+	op = spinand->op_templates.read_cache;
+	if (op->cmd.dtr || op->addr.dtr || op->dummy.dtr || op->data.dtr)
+		hs = false;
+	else if (op->cmd.buswidth == 1 && op->addr.buswidth == 1 &&
+		 op->dummy.buswidth == 1 && op->data.buswidth == 1)
+		hs = false;
+	else if (!op->max_freq)
+		hs = true;
+	else
+		hs = false;
+
+	ret = spinand_read_reg_op(spinand, W25N0XJW_SR4, &sr4);
+	if (ret)
+		return ret;
+
+	if (hs)
+		sr4 |= W25N0XJW_SR4_HS;
+	else
+		sr4 &= ~W25N0XJW_SR4_HS;
+
+	ret = spinand_write_reg_op(spinand, W25N0XJW_SR4, sr4);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
 static const struct spinand_info winbond_spinand_table[] = {
 	/* 512M-bit densities */
 	SPINAND_INFO("W25N512GW", /* 1.8V */
@@ -268,7 +307,8 @@ static const struct spinand_info winbond_spinand_table[] = {
 					      &write_cache_variants,
 					      &update_cache_variants),
 		     0,
-		     SPINAND_ECCINFO(&w25m02gv_ooblayout, NULL)),
+		     SPINAND_ECCINFO(&w25m02gv_ooblayout, NULL),
+		     SPINAND_CONFIGURE_CHIP(w25n0xjw_hs_cfg)),
 	SPINAND_INFO("W25N01KV", /* 3.3V */
 		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0xae, 0x21),
 		     NAND_MEMORG(1, 2048, 96, 64, 1024, 20, 1, 1, 1),
@@ -324,7 +364,8 @@ static const struct spinand_info winbond_spinand_table[] = {
 					      &write_cache_variants,
 					      &update_cache_variants),
 		     0,
-		     SPINAND_ECCINFO(&w25m02gv_ooblayout, NULL)),
+		     SPINAND_ECCINFO(&w25m02gv_ooblayout, NULL),
+		     SPINAND_CONFIGURE_CHIP(w25n0xjw_hs_cfg)),
 	SPINAND_INFO("W25N02KV", /* 3.3V */
 		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0xaa, 0x22),
 		     NAND_MEMORG(1, 2048, 128, 64, 2048, 40, 1, 1, 1),
diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h
index 69674fd191d9..53c881e41fc7 100644
--- a/include/linux/mtd/spinand.h
+++ b/include/linux/mtd/spinand.h
@@ -739,6 +739,7 @@ int spinand_match_and_init(struct spinand_device *spinand,
 			   enum spinand_readid_method rdid_method);
 
 int spinand_upd_cfg(struct spinand_device *spinand, u8 mask, u8 val);
+int spinand_read_reg_op(struct spinand_device *spinand, u8 reg, u8 *val);
 int spinand_write_reg_op(struct spinand_device *spinand, u8 reg, u8 val);
 int spinand_select_target(struct spinand_device *spinand, unsigned int target);
 
-- 
cgit v1.2.3


From 535f30d997baa5e5c6a3a4024d49e1871232c72b Mon Sep 17 00:00:00 2001
From: Miquel Raynal <miquel.raynal@bootlin.com>
Date: Wed, 18 Jun 2025 14:14:24 +0200
Subject: mtd: spinand: winbond: Enable high-speed modes on w35n0xjw

w35n0xjw chips can run at up to 166MHz in octal mode, but this is only
possible after programming various VCR registers.

Implement the new ->configure_chip() hook for this purpose.

Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
---
 drivers/mtd/nand/spi/core.c    |  2 +-
 drivers/mtd/nand/spi/winbond.c | 95 ++++++++++++++++++++++++++++++++++++++++--
 include/linux/mtd/spinand.h    |  1 +
 3 files changed, 94 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/mtd/nand/spi/core.c b/drivers/mtd/nand/spi/core.c
index 93d22b412dd3..b0898990b2a5 100644
--- a/drivers/mtd/nand/spi/core.c
+++ b/drivers/mtd/nand/spi/core.c
@@ -360,7 +360,7 @@ static void spinand_ondie_ecc_save_status(struct nand_device *nand, u8 status)
 		engine_conf->status = status;
 }
 
-static int spinand_write_enable_op(struct spinand_device *spinand)
+int spinand_write_enable_op(struct spinand_device *spinand)
 {
 	struct spi_mem_op op = SPINAND_WR_EN_DIS_1S_0_0_OP(true);
 
diff --git a/drivers/mtd/nand/spi/winbond.c b/drivers/mtd/nand/spi/winbond.c
index 18ae6f58a546..53890b1da65c 100644
--- a/drivers/mtd/nand/spi/winbond.c
+++ b/drivers/mtd/nand/spi/winbond.c
@@ -11,6 +11,7 @@
 #include <linux/kernel.h>
 #include <linux/mtd/spinand.h>
 #include <linux/units.h>
+#include <linux/delay.h>
 
 #define SPINAND_MFR_WINBOND		0xEF
 
@@ -21,14 +22,26 @@
 #define W25N0XJW_SR4			0xD0
 #define W25N0XJW_SR4_HS			BIT(2)
 
+#define W35N01JW_VCR_IO_MODE			0x00
+#define W35N01JW_VCR_IO_MODE_SINGLE_SDR		0xFF
+#define W35N01JW_VCR_IO_MODE_OCTAL_SDR		0xDF
+#define W35N01JW_VCR_IO_MODE_OCTAL_DDR_DS	0xE7
+#define W35N01JW_VCR_IO_MODE_OCTAL_DDR		0xC7
+#define W35N01JW_VCR_DUMMY_CLOCK_REG	0x01
+
 /*
  * "X2" in the core is equivalent to "dual output" in the datasheets,
  * "X4" in the core is equivalent to "quad output" in the datasheets.
  */
 
 static SPINAND_OP_VARIANTS(read_cache_octal_variants,
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1D_8D_OP(0, 3, NULL, 0, 120 * HZ_PER_MHZ),
 		SPINAND_PAGE_READ_FROM_CACHE_1S_1D_8D_OP(0, 2, NULL, 0, 105 * HZ_PER_MHZ),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_8S_8S_OP(0, 20, NULL, 0, 0),
 		SPINAND_PAGE_READ_FROM_CACHE_1S_8S_8S_OP(0, 16, NULL, 0, 162 * HZ_PER_MHZ),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_8S_8S_OP(0, 12, NULL, 0, 124 * HZ_PER_MHZ),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_8S_8S_OP(0, 8, NULL, 0, 86 * HZ_PER_MHZ),
+		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_8S_OP(0, 2, NULL, 0, 0),
 		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_8S_OP(0, 1, NULL, 0, 133 * HZ_PER_MHZ),
 		SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0, 0),
 		SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(0, 1, NULL, 0, 0));
@@ -269,6 +282,79 @@ static int w25n0xjw_hs_cfg(struct spinand_device *spinand)
 	return 0;
 }
 
+static int w35n0xjw_write_vcr(struct spinand_device *spinand, u8 reg, u8 val)
+{
+	struct spi_mem_op op =
+		SPI_MEM_OP(SPI_MEM_OP_CMD(0x81, 1),
+			   SPI_MEM_OP_ADDR(3, reg, 1),
+			   SPI_MEM_OP_NO_DUMMY,
+			   SPI_MEM_OP_DATA_OUT(1, spinand->scratchbuf, 1));
+	int ret;
+
+	*spinand->scratchbuf = val;
+
+	ret = spinand_write_enable_op(spinand);
+	if (ret)
+		return ret;
+
+	ret = spi_mem_exec_op(spinand->spimem, &op);
+	if (ret)
+		return ret;
+
+	/*
+	 * Write VCR operation doesn't set the busy bit in SR, which means we
+	 * cannot perform a status poll. Minimum time of 50ns is needed to
+	 * complete the write.
+	 */
+	ndelay(50);
+
+	return 0;
+}
+
+static int w35n0xjw_vcr_cfg(struct spinand_device *spinand)
+{
+	const struct spi_mem_op *op;
+	unsigned int dummy_cycles;
+	bool dtr, single;
+	u8 io_mode;
+	int ret;
+
+	op = spinand->op_templates.read_cache;
+
+	single = (op->cmd.buswidth == 1 && op->addr.buswidth == 1 && op->data.buswidth == 1);
+	dtr = (op->cmd.dtr || op->addr.dtr || op->data.dtr);
+	if (single && !dtr)
+		io_mode = W35N01JW_VCR_IO_MODE_SINGLE_SDR;
+	else if (!single && !dtr)
+		io_mode = W35N01JW_VCR_IO_MODE_OCTAL_SDR;
+	else if (!single && dtr)
+		io_mode = W35N01JW_VCR_IO_MODE_OCTAL_DDR;
+	else
+		return -EINVAL;
+
+	ret = w35n0xjw_write_vcr(spinand, W35N01JW_VCR_IO_MODE, io_mode);
+	if (ret)
+		return ret;
+
+	dummy_cycles = ((op->dummy.nbytes * 8) / op->dummy.buswidth) / (op->dummy.dtr ? 2 : 1);
+	switch (dummy_cycles) {
+	case 8:
+	case 12:
+	case 16:
+	case 20:
+	case 24:
+	case 28:
+		break;
+	default:
+		return -EINVAL;
+	}
+	ret = w35n0xjw_write_vcr(spinand, W35N01JW_VCR_DUMMY_CLOCK_REG, dummy_cycles);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
 static const struct spinand_info winbond_spinand_table[] = {
 	/* 512M-bit densities */
 	SPINAND_INFO("W25N512GW", /* 1.8V */
@@ -326,7 +412,8 @@ static const struct spinand_info winbond_spinand_table[] = {
 					      &write_cache_octal_variants,
 					      &update_cache_octal_variants),
 		     0,
-		     SPINAND_ECCINFO(&w35n01jw_ooblayout, NULL)),
+		     SPINAND_ECCINFO(&w35n01jw_ooblayout, NULL),
+		     SPINAND_CONFIGURE_CHIP(w35n0xjw_vcr_cfg)),
 	SPINAND_INFO("W35N02JW", /* 1.8V */
 		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0xdf, 0x22),
 		     NAND_MEMORG(1, 4096, 128, 64, 512, 10, 1, 2, 1),
@@ -335,7 +422,8 @@ static const struct spinand_info winbond_spinand_table[] = {
 					      &write_cache_octal_variants,
 					      &update_cache_octal_variants),
 		     0,
-		     SPINAND_ECCINFO(&w35n01jw_ooblayout, NULL)),
+		     SPINAND_ECCINFO(&w35n01jw_ooblayout, NULL),
+		     SPINAND_CONFIGURE_CHIP(w35n0xjw_vcr_cfg)),
 	SPINAND_INFO("W35N04JW", /* 1.8V */
 		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0xdf, 0x23),
 		     NAND_MEMORG(1, 4096, 128, 64, 512, 10, 1, 4, 1),
@@ -344,7 +432,8 @@ static const struct spinand_info winbond_spinand_table[] = {
 					      &write_cache_octal_variants,
 					      &update_cache_octal_variants),
 		     0,
-		     SPINAND_ECCINFO(&w35n01jw_ooblayout, NULL)),
+		     SPINAND_ECCINFO(&w35n01jw_ooblayout, NULL),
+		     SPINAND_CONFIGURE_CHIP(w35n0xjw_vcr_cfg)),
 	/* 2G-bit densities */
 	SPINAND_INFO("W25M02GV", /* 2x1G-bit 3.3V */
 		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0xab, 0x21),
diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h
index 53c881e41fc7..27a45bdab7ec 100644
--- a/include/linux/mtd/spinand.h
+++ b/include/linux/mtd/spinand.h
@@ -741,6 +741,7 @@ int spinand_match_and_init(struct spinand_device *spinand,
 int spinand_upd_cfg(struct spinand_device *spinand, u8 mask, u8 val);
 int spinand_read_reg_op(struct spinand_device *spinand, u8 reg, u8 *val);
 int spinand_write_reg_op(struct spinand_device *spinand, u8 reg, u8 val);
+int spinand_write_enable_op(struct spinand_device *spinand);
 int spinand_select_target(struct spinand_device *spinand, unsigned int target);
 
 int spinand_wait(struct spinand_device *spinand, unsigned long initial_delay_us,
-- 
cgit v1.2.3


From 6f02527729bd31ca4e473bff19fda4ccd5889148 Mon Sep 17 00:00:00 2001
From: Norman Maurer <norman_maurer@apple.com>
Date: Mon, 28 Jul 2025 20:59:53 -1000
Subject: io_uring/net: Allow to do vectorized send

At the moment you have to use sendmsg for vectorized sends.
While this works, it's suboptimal, as it also means you need to
allocate a struct msghdr that must be kept alive until
submission happens. We can remove this limitation by simply
allowing send to be used directly.

Signed-off-by: Norman Maurer <norman_maurer@apple.com>
Link: https://lore.kernel.org/r/20250729065952.26646-1-norman_maurer@apple.com
[axboe: remove -EINVAL return for SENDMSG and SEND_VECTORIZED]
[axboe: allow send_zc to set SEND_VECTORIZED too]
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h | 4 ++++
 io_uring/net.c                | 9 +++++++--
 2 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index b8a0e70ee2fd..6957dc539d83 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -392,12 +392,16 @@ enum io_uring_op {
  *				the starting buffer ID in cqe->flags as per
  *				usual for provided buffer usage. The buffers
  *				will be	contiguous from the starting buffer ID.
+ *
+ * IORING_SEND_VECTORIZED	If set, SEND[_ZC] will take a pointer to a io_vec
+ * 				to allow vectorized send operations.
  */
 #define IORING_RECVSEND_POLL_FIRST	(1U << 0)
 #define IORING_RECV_MULTISHOT		(1U << 1)
 #define IORING_RECVSEND_FIXED_BUF	(1U << 2)
 #define IORING_SEND_ZC_REPORT_USAGE	(1U << 3)
 #define IORING_RECVSEND_BUNDLE		(1U << 4)
+#define IORING_SEND_VECTORIZED		(1U << 5)
 
 /*
  * cqe.res for IORING_CQE_F_NOTIF if
diff --git a/io_uring/net.c b/io_uring/net.c
index 35585bdc59f3..dd96e355982f 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -382,6 +382,10 @@ static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	}
 	if (req->flags & REQ_F_BUFFER_SELECT)
 		return 0;
+
+	if (sr->flags & IORING_SEND_VECTORIZED)
+               return io_net_import_vec(req, kmsg, sr->buf, sr->len, ITER_SOURCE);
+
 	return import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter);
 }
 
@@ -409,7 +413,7 @@ static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe
 	return io_net_import_vec(req, kmsg, msg.msg_iov, msg.msg_iovlen, ITER_SOURCE);
 }
 
-#define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE)
+#define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE | IORING_SEND_VECTORIZED)
 
 int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
@@ -1318,7 +1322,8 @@ void io_send_zc_cleanup(struct io_kiocb *req)
 }
 
 #define IO_ZC_FLAGS_COMMON (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF)
-#define IO_ZC_FLAGS_VALID  (IO_ZC_FLAGS_COMMON | IORING_SEND_ZC_REPORT_USAGE)
+#define IO_ZC_FLAGS_VALID  (IO_ZC_FLAGS_COMMON | IORING_SEND_ZC_REPORT_USAGE | \
+				IORING_SEND_VECTORIZED)
 
 int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-- 
cgit v1.2.3


From 907a99c314a5a695e35acff78ac61f4ec950a6d3 Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Tue, 22 Jul 2025 11:33:40 +0800
Subject: md: rename recovery_cp to resync_offset

'recovery_cp' was used to represent the progress of sync, but its name
contains 'recovery', which can cause confusion. Replace 'recovery_cp'
with 'resync_offset' for clarity.

Signed-off-by: Li Nan <linan122@huawei.com>
Link: https://lore.kernel.org/linux-raid/20250722033340.1933388-1-linan666@huaweicloud.com
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 drivers/md/dm-raid.c           | 42 +++++++++++++++++------------------
 drivers/md/md-bitmap.c         |  8 +++----
 drivers/md/md-cluster.c        | 16 +++++++-------
 drivers/md/md.c                | 50 +++++++++++++++++++++---------------------
 drivers/md/md.h                |  2 +-
 drivers/md/raid0.c             |  6 ++---
 drivers/md/raid1-10.c          |  2 +-
 drivers/md/raid1.c             | 10 ++++-----
 drivers/md/raid10.c            | 16 +++++++-------
 drivers/md/raid5-ppl.c         |  6 ++---
 drivers/md/raid5.c             | 30 ++++++++++++-------------
 include/uapi/linux/raid/md_p.h |  2 +-
 12 files changed, 95 insertions(+), 95 deletions(-)

(limited to 'include')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index e8c0a8c6fb51..9835f2fe26e9 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -439,7 +439,7 @@ static bool rs_is_reshapable(struct raid_set *rs)
 /* Return true, if raid set in @rs is recovering */
 static bool rs_is_recovering(struct raid_set *rs)
 {
-	return rs->md.recovery_cp < rs->md.dev_sectors;
+	return rs->md.resync_offset < rs->md.dev_sectors;
 }
 
 /* Return true, if raid set in @rs is reshaping */
@@ -769,7 +769,7 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r
 	rs->md.layout = raid_type->algorithm;
 	rs->md.new_layout = rs->md.layout;
 	rs->md.delta_disks = 0;
-	rs->md.recovery_cp = MaxSector;
+	rs->md.resync_offset = MaxSector;
 
 	for (i = 0; i < raid_devs; i++)
 		md_rdev_init(&rs->dev[i].rdev);
@@ -913,7 +913,7 @@ static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as)
 		rs->md.external = 0;
 		rs->md.persistent = 1;
 		rs->md.major_version = 2;
-	} else if (rebuild && !rs->md.recovery_cp) {
+	} else if (rebuild && !rs->md.resync_offset) {
 		/*
 		 * Without metadata, we will not be able to tell if the array
 		 * is in-sync or not - we must assume it is not.  Therefore,
@@ -1696,20 +1696,20 @@ static void rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors)
 {
 	/* raid0 does not recover */
 	if (rs_is_raid0(rs))
-		rs->md.recovery_cp = MaxSector;
+		rs->md.resync_offset = MaxSector;
 	/*
 	 * A raid6 set has to be recovered either
 	 * completely or for the grown part to
 	 * ensure proper parity and Q-Syndrome
 	 */
 	else if (rs_is_raid6(rs))
-		rs->md.recovery_cp = dev_sectors;
+		rs->md.resync_offset = dev_sectors;
 	/*
 	 * Other raid set types may skip recovery
 	 * depending on the 'nosync' flag.
 	 */
 	else
-		rs->md.recovery_cp = test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)
+		rs->md.resync_offset = test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)
 				     ? MaxSector : dev_sectors;
 }
 
@@ -2144,7 +2144,7 @@ static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
 	sb->events = cpu_to_le64(mddev->events);
 
 	sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset);
-	sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp);
+	sb->array_resync_offset = cpu_to_le64(mddev->resync_offset);
 
 	sb->level = cpu_to_le32(mddev->level);
 	sb->layout = cpu_to_le32(mddev->layout);
@@ -2335,18 +2335,18 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
 	}
 
 	if (!test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags))
-		mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset);
+		mddev->resync_offset = le64_to_cpu(sb->array_resync_offset);
 
 	/*
 	 * During load, we set FirstUse if a new superblock was written.
 	 * There are two reasons we might not have a superblock:
 	 * 1) The raid set is brand new - in which case, all of the
 	 *    devices must have their In_sync bit set.	Also,
-	 *    recovery_cp must be 0, unless forced.
+	 *    resync_offset must be 0, unless forced.
 	 * 2) This is a new device being added to an old raid set
 	 *    and the new device needs to be rebuilt - in which
 	 *    case the In_sync bit will /not/ be set and
-	 *    recovery_cp must be MaxSector.
+	 *    resync_offset must be MaxSector.
 	 * 3) This is/are a new device(s) being added to an old
 	 *    raid set during takeover to a higher raid level
 	 *    to provide capacity for redundancy or during reshape
@@ -2391,8 +2391,8 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
 			      new_devs > 1 ? "s" : "");
 			return -EINVAL;
 		} else if (!test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags) && rs_is_recovering(rs)) {
-			DMERR("'rebuild' specified while raid set is not in-sync (recovery_cp=%llu)",
-			      (unsigned long long) mddev->recovery_cp);
+			DMERR("'rebuild' specified while raid set is not in-sync (resync_offset=%llu)",
+			      (unsigned long long) mddev->resync_offset);
 			return -EINVAL;
 		} else if (rs_is_reshaping(rs)) {
 			DMERR("'rebuild' specified while raid set is being reshaped (reshape_position=%llu)",
@@ -2697,11 +2697,11 @@ static int rs_adjust_data_offsets(struct raid_set *rs)
 	}
 out:
 	/*
-	 * Raise recovery_cp in case data_offset != 0 to
+	 * Raise resync_offset in case data_offset != 0 to
 	 * avoid false recovery positives in the constructor.
 	 */
-	if (rs->md.recovery_cp < rs->md.dev_sectors)
-		rs->md.recovery_cp += rs->dev[0].rdev.data_offset;
+	if (rs->md.resync_offset < rs->md.dev_sectors)
+		rs->md.resync_offset += rs->dev[0].rdev.data_offset;
 
 	/* Adjust data offsets on all rdevs but on any raid4/5/6 journal device */
 	rdev_for_each(rdev, &rs->md) {
@@ -2756,7 +2756,7 @@ static int rs_setup_takeover(struct raid_set *rs)
 	}
 
 	clear_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
-	mddev->recovery_cp = MaxSector;
+	mddev->resync_offset = MaxSector;
 
 	while (d--) {
 		rdev = &rs->dev[d].rdev;
@@ -2764,7 +2764,7 @@ static int rs_setup_takeover(struct raid_set *rs)
 		if (test_bit(d, (void *) rs->rebuild_disks)) {
 			clear_bit(In_sync, &rdev->flags);
 			clear_bit(Faulty, &rdev->flags);
-			mddev->recovery_cp = rdev->recovery_offset = 0;
+			mddev->resync_offset = rdev->recovery_offset = 0;
 			/* Bitmap has to be created when we do an "up" takeover */
 			set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
 		}
@@ -3222,7 +3222,7 @@ size_check:
 			if (r)
 				goto bad;
 
-			rs_setup_recovery(rs, rs->md.recovery_cp < rs->md.dev_sectors ? rs->md.recovery_cp : rs->md.dev_sectors);
+			rs_setup_recovery(rs, rs->md.resync_offset < rs->md.dev_sectors ? rs->md.resync_offset : rs->md.dev_sectors);
 		} else {
 			/* This is no size change or it is shrinking, update size and record in superblocks */
 			r = rs_set_dev_and_array_sectors(rs, rs->ti->len, false);
@@ -3446,7 +3446,7 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery,
 
 	} else {
 		if (state == st_idle && !test_bit(MD_RECOVERY_INTR, &recovery))
-			r = mddev->recovery_cp;
+			r = mddev->resync_offset;
 		else
 			r = mddev->curr_resync_completed;
 
@@ -4074,9 +4074,9 @@ static int raid_preresume(struct dm_target *ti)
 	}
 
 	/* Check for any resize/reshape on @rs and adjust/initiate */
-	if (mddev->recovery_cp && mddev->recovery_cp < MaxSector) {
+	if (mddev->resync_offset && mddev->resync_offset < MaxSector) {
 		set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
-		mddev->resync_min = mddev->recovery_cp;
+		mddev->resync_min = mddev->resync_offset;
 		if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags))
 			mddev->resync_max_sectors = mddev->dev_sectors;
 	}
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 7f524a26cebc..334b71404930 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -1987,12 +1987,12 @@ static void bitmap_dirty_bits(struct mddev *mddev, unsigned long s,
 
 		md_bitmap_set_memory_bits(bitmap, sec, 1);
 		md_bitmap_file_set_bit(bitmap, sec);
-		if (sec < bitmap->mddev->recovery_cp)
+		if (sec < bitmap->mddev->resync_offset)
 			/* We are asserting that the array is dirty,
-			 * so move the recovery_cp address back so
+			 * so move the resync_offset address back so
 			 * that it is obvious that it is dirty
 			 */
-			bitmap->mddev->recovery_cp = sec;
+			bitmap->mddev->resync_offset = sec;
 	}
 }
 
@@ -2258,7 +2258,7 @@ static int bitmap_load(struct mddev *mddev)
 	    || bitmap->events_cleared == mddev->events)
 		/* no need to keep dirty bits to optimise a
 		 * re-add of a missing device */
-		start = mddev->recovery_cp;
+		start = mddev->resync_offset;
 
 	mutex_lock(&mddev->bitmap_info.mutex);
 	err = md_bitmap_init_from_disk(bitmap, start);
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 94221d964d4f..5497eaee96e7 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -337,11 +337,11 @@ static void recover_bitmaps(struct md_thread *thread)
 			md_wakeup_thread(mddev->sync_thread);
 
 		if (hi > 0) {
-			if (lo < mddev->recovery_cp)
-				mddev->recovery_cp = lo;
+			if (lo < mddev->resync_offset)
+				mddev->resync_offset = lo;
 			/* wake up thread to continue resync in case resync
 			 * is not finished */
-			if (mddev->recovery_cp != MaxSector) {
+			if (mddev->resync_offset != MaxSector) {
 				/*
 				 * clear the REMOTE flag since we will launch
 				 * resync thread in current node.
@@ -863,9 +863,9 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots)
 			lockres_free(bm_lockres);
 			continue;
 		}
-		if ((hi > 0) && (lo < mddev->recovery_cp)) {
+		if ((hi > 0) && (lo < mddev->resync_offset)) {
 			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-			mddev->recovery_cp = lo;
+			mddev->resync_offset = lo;
 			md_check_recovery(mddev);
 		}
 
@@ -1027,7 +1027,7 @@ static int leave(struct mddev *mddev)
 	 * Also, we should send BITMAP_NEEDS_SYNC message in
 	 * case reshaping is interrupted.
 	 */
-	if ((cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector) ||
+	if ((cinfo->slot_number > 0 && mddev->resync_offset != MaxSector) ||
 	    (mddev->reshape_position != MaxSector &&
 	     test_bit(MD_CLOSING, &mddev->flags)))
 		resync_bitmap(mddev);
@@ -1605,8 +1605,8 @@ static int gather_bitmaps(struct md_rdev *rdev)
 			pr_warn("md-cluster: Could not gather bitmaps from slot %d", sn);
 			goto out;
 		}
-		if ((hi > 0) && (lo < mddev->recovery_cp))
-			mddev->recovery_cp = lo;
+		if ((hi > 0) && (lo < mddev->resync_offset))
+			mddev->resync_offset = lo;
 	}
 out:
 	return err;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8af97ef80ec5..9c7ed23c45ad 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1415,13 +1415,13 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, stru
 			mddev->layout = -1;
 
 		if (sb->state & (1<<MD_SB_CLEAN))
-			mddev->recovery_cp = MaxSector;
+			mddev->resync_offset = MaxSector;
 		else {
 			if (sb->events_hi == sb->cp_events_hi &&
 				sb->events_lo == sb->cp_events_lo) {
-				mddev->recovery_cp = sb->recovery_cp;
+				mddev->resync_offset = sb->resync_offset;
 			} else
-				mddev->recovery_cp = 0;
+				mddev->resync_offset = 0;
 		}
 
 		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
@@ -1547,13 +1547,13 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
 	mddev->minor_version = sb->minor_version;
 	if (mddev->in_sync)
 	{
-		sb->recovery_cp = mddev->recovery_cp;
+		sb->resync_offset = mddev->resync_offset;
 		sb->cp_events_hi = (mddev->events>>32);
 		sb->cp_events_lo = (u32)mddev->events;
-		if (mddev->recovery_cp == MaxSector)
+		if (mddev->resync_offset == MaxSector)
 			sb->state = (1<< MD_SB_CLEAN);
 	} else
-		sb->recovery_cp = 0;
+		sb->resync_offset = 0;
 
 	sb->layout = mddev->layout;
 	sb->chunk_size = mddev->chunk_sectors << 9;
@@ -1901,7 +1901,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struc
 		mddev->bitmap_info.default_space = (4096-1024) >> 9;
 		mddev->reshape_backwards = 0;
 
-		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
+		mddev->resync_offset = le64_to_cpu(sb->resync_offset);
 		memcpy(mddev->uuid, sb->set_uuid, 16);
 
 		mddev->max_disks =  (4096-256)/2;
@@ -2087,7 +2087,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
 	sb->utime = cpu_to_le64((__u64)mddev->utime);
 	sb->events = cpu_to_le64(mddev->events);
 	if (mddev->in_sync)
-		sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
+		sb->resync_offset = cpu_to_le64(mddev->resync_offset);
 	else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
 		sb->resync_offset = cpu_to_le64(MaxSector);
 	else
@@ -2767,7 +2767,7 @@ repeat:
 	/* If this is just a dirty<->clean transition, and the array is clean
 	 * and 'events' is odd, we can roll back to the previous clean state */
 	if (nospares
-	    && (mddev->in_sync && mddev->recovery_cp == MaxSector)
+	    && (mddev->in_sync && mddev->resync_offset == MaxSector)
 	    && mddev->can_decrease_events
 	    && mddev->events != 1) {
 		mddev->events--;
@@ -4303,9 +4303,9 @@ __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
 static ssize_t
 resync_start_show(struct mddev *mddev, char *page)
 {
-	if (mddev->recovery_cp == MaxSector)
+	if (mddev->resync_offset == MaxSector)
 		return sprintf(page, "none\n");
-	return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
+	return sprintf(page, "%llu\n", (unsigned long long)mddev->resync_offset);
 }
 
 static ssize_t
@@ -4331,7 +4331,7 @@ resync_start_store(struct mddev *mddev, const char *buf, size_t len)
 		err = -EBUSY;
 
 	if (!err) {
-		mddev->recovery_cp = n;
+		mddev->resync_offset = n;
 		if (mddev->pers)
 			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
 	}
@@ -6423,7 +6423,7 @@ static void md_clean(struct mddev *mddev)
 	mddev->external_size = 0;
 	mddev->dev_sectors = 0;
 	mddev->raid_disks = 0;
-	mddev->recovery_cp = 0;
+	mddev->resync_offset = 0;
 	mddev->resync_min = 0;
 	mddev->resync_max = MaxSector;
 	mddev->reshape_position = MaxSector;
@@ -7368,9 +7368,9 @@ int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info)
 	 * openned
 	 */
 	if (info->state & (1<<MD_SB_CLEAN))
-		mddev->recovery_cp = MaxSector;
+		mddev->resync_offset = MaxSector;
 	else
-		mddev->recovery_cp = 0;
+		mddev->resync_offset = 0;
 	mddev->persistent    = ! info->not_persistent;
 	mddev->external	     = 0;
 
@@ -8309,7 +8309,7 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev)
 				seq_printf(seq, "\tresync=REMOTE");
 			return 1;
 		}
-		if (mddev->recovery_cp < MaxSector) {
+		if (mddev->resync_offset < MaxSector) {
 			seq_printf(seq, "\tresync=PENDING");
 			return 1;
 		}
@@ -8952,7 +8952,7 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action)
 		return mddev->resync_min;
 	case ACTION_RESYNC:
 		if (!mddev->bitmap)
-			return mddev->recovery_cp;
+			return mddev->resync_offset;
 		return 0;
 	case ACTION_RESHAPE:
 		/*
@@ -9190,8 +9190,8 @@ void md_do_sync(struct md_thread *thread)
 				   atomic_read(&mddev->recovery_active) == 0);
 			mddev->curr_resync_completed = j;
 			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
-			    j > mddev->recovery_cp)
-				mddev->recovery_cp = j;
+			    j > mddev->resync_offset)
+				mddev->resync_offset = j;
 			update_time = jiffies;
 			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
 			sysfs_notify_dirent_safe(mddev->sysfs_completed);
@@ -9311,19 +9311,19 @@ void md_do_sync(struct md_thread *thread)
 	    mddev->curr_resync > MD_RESYNC_ACTIVE) {
 		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
 			if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
-				if (mddev->curr_resync >= mddev->recovery_cp) {
+				if (mddev->curr_resync >= mddev->resync_offset) {
 					pr_debug("md: checkpointing %s of %s.\n",
 						 desc, mdname(mddev));
 					if (test_bit(MD_RECOVERY_ERROR,
 						&mddev->recovery))
-						mddev->recovery_cp =
+						mddev->resync_offset =
 							mddev->curr_resync_completed;
 					else
-						mddev->recovery_cp =
+						mddev->resync_offset =
 							mddev->curr_resync;
 				}
 			} else
-				mddev->recovery_cp = MaxSector;
+				mddev->resync_offset = MaxSector;
 		} else {
 			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
 				mddev->curr_resync = MaxSector;
@@ -9539,7 +9539,7 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares)
 	}
 
 	/* Check if resync is in progress. */
-	if (mddev->recovery_cp < MaxSector) {
+	if (mddev->resync_offset < MaxSector) {
 		remove_spares(mddev, NULL);
 		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
 		clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
@@ -9720,7 +9720,7 @@ void md_check_recovery(struct mddev *mddev)
 		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
 		(mddev->external == 0 && mddev->safemode == 1) ||
 		(mddev->safemode == 2
-		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
+		 && !mddev->in_sync && mddev->resync_offset == MaxSector)
 		))
 		return;
 
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 67b365621507..51af29a03079 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -523,7 +523,7 @@ struct mddev {
 	unsigned long			normal_io_events; /* IO event timestamp */
 	atomic_t			recovery_active; /* blocks scheduled, but not written */
 	wait_queue_head_t		recovery_wait;
-	sector_t			recovery_cp;
+	sector_t			resync_offset;
 	sector_t			resync_min;	/* user requested sync
 							 * starts here */
 	sector_t			resync_max;	/* resync should pause
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index cbe2a9054cb9..f1d8811a542a 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -674,7 +674,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
 	mddev->raid_disks--;
 	mddev->delta_disks = -1;
 	/* make sure it will be not marked as dirty */
-	mddev->recovery_cp = MaxSector;
+	mddev->resync_offset = MaxSector;
 	mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS);
 
 	create_strip_zones(mddev, &priv_conf);
@@ -717,7 +717,7 @@ static void *raid0_takeover_raid10(struct mddev *mddev)
 	mddev->raid_disks += mddev->delta_disks;
 	mddev->degraded = 0;
 	/* make sure it will be not marked as dirty */
-	mddev->recovery_cp = MaxSector;
+	mddev->resync_offset = MaxSector;
 	mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS);
 
 	create_strip_zones(mddev, &priv_conf);
@@ -760,7 +760,7 @@ static void *raid0_takeover_raid1(struct mddev *mddev)
 	mddev->delta_disks = 1 - mddev->raid_disks;
 	mddev->raid_disks = 1;
 	/* make sure it will be not marked as dirty */
-	mddev->recovery_cp = MaxSector;
+	mddev->resync_offset = MaxSector;
 	mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS);
 
 	create_strip_zones(mddev, &priv_conf);
diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c
index b8b3a9069701..52881e6032da 100644
--- a/drivers/md/raid1-10.c
+++ b/drivers/md/raid1-10.c
@@ -283,7 +283,7 @@ static inline int raid1_check_read_range(struct md_rdev *rdev,
 static inline bool raid1_should_read_first(struct mddev *mddev,
 					   sector_t this_sector, int len)
 {
-	if ((mddev->recovery_cp < this_sector + len))
+	if ((mddev->resync_offset < this_sector + len))
 		return true;
 
 	if (mddev_is_clustered(mddev) &&
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 64b8176907a9..6cee738a645f 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2822,7 +2822,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 	}
 
 	if (mddev->bitmap == NULL &&
-	    mddev->recovery_cp == MaxSector &&
+	    mddev->resync_offset == MaxSector &&
 	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
 	    conf->fullsync == 0) {
 		*skipped = 1;
@@ -3282,9 +3282,9 @@ static int raid1_run(struct mddev *mddev)
 	}
 
 	if (conf->raid_disks - mddev->degraded == 1)
-		mddev->recovery_cp = MaxSector;
+		mddev->resync_offset = MaxSector;
 
-	if (mddev->recovery_cp != MaxSector)
+	if (mddev->resync_offset != MaxSector)
 		pr_info("md/raid1:%s: not clean -- starting background reconstruction\n",
 			mdname(mddev));
 	pr_info("md/raid1:%s: active with %d out of %d mirrors\n",
@@ -3345,8 +3345,8 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors)
 
 	md_set_array_sectors(mddev, newsize);
 	if (sectors > mddev->dev_sectors &&
-	    mddev->recovery_cp > mddev->dev_sectors) {
-		mddev->recovery_cp = mddev->dev_sectors;
+	    mddev->resync_offset > mddev->dev_sectors) {
+		mddev->resync_offset = mddev->dev_sectors;
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	}
 	mddev->dev_sectors = sectors;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 95dc354a86a0..b60c30bfb6c7 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2117,7 +2117,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	int last = conf->geo.raid_disks - 1;
 	struct raid10_info *p;
 
-	if (mddev->recovery_cp < MaxSector)
+	if (mddev->resync_offset < MaxSector)
 		/* only hot-add to in-sync arrays, as recovery is
 		 * very different from resync
 		 */
@@ -3185,7 +3185,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 	 * of a clean array, like RAID1 does.
 	 */
 	if (mddev->bitmap == NULL &&
-	    mddev->recovery_cp == MaxSector &&
+	    mddev->resync_offset == MaxSector &&
 	    mddev->reshape_position == MaxSector &&
 	    !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
 	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
@@ -4145,7 +4145,7 @@ static int raid10_run(struct mddev *mddev)
 		disk->recovery_disabled = mddev->recovery_disabled - 1;
 	}
 
-	if (mddev->recovery_cp != MaxSector)
+	if (mddev->resync_offset != MaxSector)
 		pr_notice("md/raid10:%s: not clean -- starting background reconstruction\n",
 			  mdname(mddev));
 	pr_info("md/raid10:%s: active with %d out of %d devices\n",
@@ -4245,8 +4245,8 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors)
 
 	md_set_array_sectors(mddev, size);
 	if (sectors > mddev->dev_sectors &&
-	    mddev->recovery_cp > oldsize) {
-		mddev->recovery_cp = oldsize;
+	    mddev->resync_offset > oldsize) {
+		mddev->resync_offset = oldsize;
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	}
 	calc_sectors(conf, sectors);
@@ -4275,7 +4275,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
 	mddev->delta_disks = mddev->raid_disks;
 	mddev->raid_disks *= 2;
 	/* make sure it will be not marked as dirty */
-	mddev->recovery_cp = MaxSector;
+	mddev->resync_offset = MaxSector;
 	mddev->dev_sectors = size;
 
 	conf = setup_conf(mddev);
@@ -5087,8 +5087,8 @@ static void raid10_finish_reshape(struct mddev *mddev)
 		return;
 
 	if (mddev->delta_disks > 0) {
-		if (mddev->recovery_cp > mddev->resync_max_sectors) {
-			mddev->recovery_cp = mddev->resync_max_sectors;
+		if (mddev->resync_offset > mddev->resync_max_sectors) {
+			mddev->resync_offset = mddev->resync_max_sectors;
 			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 		}
 		mddev->resync_max_sectors = mddev->array_sectors;
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index c0fb335311aa..56b234683ee6 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -1163,7 +1163,7 @@ static int ppl_load_distributed(struct ppl_log *log)
 		    le64_to_cpu(pplhdr->generation));
 
 	/* attempt to recover from log if we are starting a dirty array */
-	if (pplhdr && !mddev->pers && mddev->recovery_cp != MaxSector)
+	if (pplhdr && !mddev->pers && mddev->resync_offset != MaxSector)
 		ret = ppl_recover(log, pplhdr, pplhdr_offset);
 
 	/* write empty header if we are starting the array */
@@ -1422,14 +1422,14 @@ int ppl_init_log(struct r5conf *conf)
 
 	if (ret) {
 		goto err;
-	} else if (!mddev->pers && mddev->recovery_cp == 0 &&
+	} else if (!mddev->pers && mddev->resync_offset == 0 &&
 		   ppl_conf->recovered_entries > 0 &&
 		   ppl_conf->mismatch_count == 0) {
 		/*
 		 * If we are starting a dirty array and the recovery succeeds
 		 * without any issues, set the array as clean.
 		 */
-		mddev->recovery_cp = MaxSector;
+		mddev->resync_offset = MaxSector;
 		set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
 	} else if (mddev->pers && ppl_conf->mismatch_count > 0) {
 		/* no mismatch allowed when enabling PPL for a running array */
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7ec61ee7b218..023649fe2476 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3740,7 +3740,7 @@ static int want_replace(struct stripe_head *sh, int disk_idx)
 	    && !test_bit(Faulty, &rdev->flags)
 	    && !test_bit(In_sync, &rdev->flags)
 	    && (rdev->recovery_offset <= sh->sector
-		|| rdev->mddev->recovery_cp <= sh->sector))
+		|| rdev->mddev->resync_offset <= sh->sector))
 		rv = 1;
 	return rv;
 }
@@ -3832,7 +3832,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
 	 * is missing/faulty, then we need to read everything we can.
 	 */
 	if (!force_rcw &&
-	    sh->sector < sh->raid_conf->mddev->recovery_cp)
+	    sh->sector < sh->raid_conf->mddev->resync_offset)
 		/* reconstruct-write isn't being forced */
 		return 0;
 	for (i = 0; i < s->failed && i < 2; i++) {
@@ -4097,7 +4097,7 @@ static int handle_stripe_dirtying(struct r5conf *conf,
 				  int disks)
 {
 	int rmw = 0, rcw = 0, i;
-	sector_t recovery_cp = conf->mddev->recovery_cp;
+	sector_t resync_offset = conf->mddev->resync_offset;
 
 	/* Check whether resync is now happening or should start.
 	 * If yes, then the array is dirty (after unclean shutdown or
@@ -4107,14 +4107,14 @@ static int handle_stripe_dirtying(struct r5conf *conf,
 	 * generate correct data from the parity.
 	 */
 	if (conf->rmw_level == PARITY_DISABLE_RMW ||
-	    (recovery_cp < MaxSector && sh->sector >= recovery_cp &&
+	    (resync_offset < MaxSector && sh->sector >= resync_offset &&
 	     s->failed == 0)) {
 		/* Calculate the real rcw later - for now make it
 		 * look like rcw is cheaper
 		 */
 		rcw = 1; rmw = 2;
-		pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n",
-			 conf->rmw_level, (unsigned long long)recovery_cp,
+		pr_debug("force RCW rmw_level=%u, resync_offset=%llu sh->sector=%llu\n",
+			 conf->rmw_level, (unsigned long long)resync_offset,
 			 (unsigned long long)sh->sector);
 	} else for (i = disks; i--; ) {
 		/* would I have to read this buffer for read_modify_write */
@@ -4770,14 +4770,14 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 	if (test_bit(STRIPE_SYNCING, &sh->state)) {
 		/* If there is a failed device being replaced,
 		 *     we must be recovering.
-		 * else if we are after recovery_cp, we must be syncing
+		 * else if we are after resync_offset, we must be syncing
 		 * else if MD_RECOVERY_REQUESTED is set, we also are syncing.
 		 * else we can only be replacing
 		 * sync and recovery both need to read all devices, and so
 		 * use the same flag.
 		 */
 		if (do_recovery ||
-		    sh->sector >= conf->mddev->recovery_cp ||
+		    sh->sector >= conf->mddev->resync_offset ||
 		    test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery)))
 			s->syncing = 1;
 		else
@@ -7780,7 +7780,7 @@ static int raid5_run(struct mddev *mddev)
 	int first = 1;
 	int ret = -EIO;
 
-	if (mddev->recovery_cp != MaxSector)
+	if (mddev->resync_offset != MaxSector)
 		pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
 			  mdname(mddev));
 
@@ -7921,7 +7921,7 @@ static int raid5_run(struct mddev *mddev)
 				mdname(mddev));
 			mddev->ro = 1;
 			set_disk_ro(mddev->gendisk, 1);
-		} else if (mddev->recovery_cp == MaxSector)
+		} else if (mddev->resync_offset == MaxSector)
 			set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
 	}
 
@@ -7988,7 +7988,7 @@ static int raid5_run(struct mddev *mddev)
 	mddev->resync_max_sectors = mddev->dev_sectors;
 
 	if (mddev->degraded > dirty_parity_disks &&
-	    mddev->recovery_cp != MaxSector) {
+	    mddev->resync_offset != MaxSector) {
 		if (test_bit(MD_HAS_PPL, &mddev->flags))
 			pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n",
 				mdname(mddev));
@@ -8328,8 +8328,8 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
 
 	md_set_array_sectors(mddev, newsize);
 	if (sectors > mddev->dev_sectors &&
-	    mddev->recovery_cp > mddev->dev_sectors) {
-		mddev->recovery_cp = mddev->dev_sectors;
+	    mddev->resync_offset > mddev->dev_sectors) {
+		mddev->resync_offset = mddev->dev_sectors;
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	}
 	mddev->dev_sectors = sectors;
@@ -8423,7 +8423,7 @@ static int raid5_start_reshape(struct mddev *mddev)
 		return -EINVAL;
 
 	/* raid5 can't handle concurrent reshape and recovery */
-	if (mddev->recovery_cp < MaxSector)
+	if (mddev->resync_offset < MaxSector)
 		return -EBUSY;
 	for (i = 0; i < conf->raid_disks; i++)
 		if (conf->disks[i].replacement)
@@ -8648,7 +8648,7 @@ static void *raid45_takeover_raid0(struct mddev *mddev, int level)
 	mddev->raid_disks += 1;
 	mddev->delta_disks = 1;
 	/* make sure it will be not marked as dirty */
-	mddev->recovery_cp = MaxSector;
+	mddev->resync_offset = MaxSector;
 
 	return setup_conf(mddev);
 }
diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h
index ff47b6f0ba0f..b13946287277 100644
--- a/include/uapi/linux/raid/md_p.h
+++ b/include/uapi/linux/raid/md_p.h
@@ -173,7 +173,7 @@ typedef struct mdp_superblock_s {
 #else
 #error unspecified endianness
 #endif
-	__u32 recovery_cp;	/* 11 recovery checkpoint sector count	      */
+	__u32 resync_offset;	/* 11 resync checkpoint sector count	      */
 	/* There are only valid for minor_version > 90 */
 	__u64 reshape_position;	/* 12,13 next address in array-space for reshape */
 	__u32 new_level;	/* 14 new level we are reshaping to	      */
-- 
cgit v1.2.3


From 5523a466e905b6287b94654ddb364536f2f948cf Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 25 Jul 2025 11:06:03 +0200
Subject: i3c: fix module_i3c_i2c_driver() with I3C=n

When CONFIG_I3C is disabled and the i3c_i2c_driver_register() happens
to not be inlined, any driver calling it still references the i3c_driver
instance, which then causes a link failure:

x86_64-linux-ld: drivers/hwmon/lm75.o: in function `lm75_i3c_reg_read':
lm75.c:(.text+0xc61): undefined reference to `i3cdev_to_dev'
x86_64-linux-ld: lm75.c:(.text+0xd25): undefined reference to `i3c_device_do_priv_xfers'
x86_64-linux-ld: lm75.c:(.text+0xdd8): undefined reference to `i3c_device_do_priv_xfers'

This issue was part of the original i3c code, but only now caused problems
when i3c support got added to lm75.

Change the 'inline' annotations in the header to '__always_inline' to
ensure that the dead-code-elimination pass in the compiler can optimize
it out as intended.

Fixes: 6071d10413ff ("hwmon: (lm75) add I3C support for P3T1755")
Fixes: 3a379bbcea0a ("i3c: Add core I3C infrastructure")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Reviewed-by: Frank Li <Frank.Li@nxp.com>
Link: https://lore.kernel.org/r/20250725090609.2456262-1-arnd@kernel.org
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 include/linux/i3c/device.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/i3c/device.h b/include/linux/i3c/device.h
index b674f64d0822..7f136de4b73e 100644
--- a/include/linux/i3c/device.h
+++ b/include/linux/i3c/device.h
@@ -245,7 +245,7 @@ void i3c_driver_unregister(struct i3c_driver *drv);
  *
  * Return: 0 if both registrations succeeds, a negative error code otherwise.
  */
-static inline int i3c_i2c_driver_register(struct i3c_driver *i3cdrv,
+static __always_inline int i3c_i2c_driver_register(struct i3c_driver *i3cdrv,
 					  struct i2c_driver *i2cdrv)
 {
 	int ret;
@@ -270,7 +270,7 @@ static inline int i3c_i2c_driver_register(struct i3c_driver *i3cdrv,
  * Note that when CONFIG_I3C is not enabled, this function only unregisters the
  * @i2cdrv.
  */
-static inline void i3c_i2c_driver_unregister(struct i3c_driver *i3cdrv,
+static __always_inline void i3c_i2c_driver_unregister(struct i3c_driver *i3cdrv,
 					     struct i2c_driver *i2cdrv)
 {
 	if (IS_ENABLED(CONFIG_I3C))
-- 
cgit v1.2.3


From 9c0609d685b27a0bb392390680207baa820ed118 Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa+renesas@sang-engineering.com>
Date: Thu, 24 Jul 2025 11:41:40 +0200
Subject: i3c: Standardize defines for specification parameters

Align existing defines to follow the consistent pattern:
I3C_BUS_<PARAM>_<MAX|MIN|TYP>_<UNIT>. Prepare the codebase for adding
new parameters and help avoid duplication.

Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Tested-by: Tommaso Merciai <tommaso.merciai.xr@bp.renesas.com>
Reviewed-by: Frank Li <Frank.Li@nxp.com>
Link: https://lore.kernel.org/r/20250724094146.6443-2-wsa+renesas@sang-engineering.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 drivers/i3c/master.c               | 12 ++++++------
 drivers/i3c/master/dw-i3c-master.c |  4 ++--
 include/linux/i3c/master.h         |  9 +++++----
 3 files changed, 13 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/drivers/i3c/master.c b/drivers/i3c/master.c
index e00991444f31..2ef898a8fd80 100644
--- a/drivers/i3c/master.c
+++ b/drivers/i3c/master.c
@@ -727,12 +727,12 @@ static int i3c_bus_set_mode(struct i3c_bus *i3cbus, enum i3c_bus_mode mode,
 	switch (i3cbus->mode) {
 	case I3C_BUS_MODE_PURE:
 		if (!i3cbus->scl_rate.i3c)
-			i3cbus->scl_rate.i3c = I3C_BUS_TYP_I3C_SCL_RATE;
+			i3cbus->scl_rate.i3c = I3C_BUS_I3C_SCL_TYP_RATE;
 		break;
 	case I3C_BUS_MODE_MIXED_FAST:
 	case I3C_BUS_MODE_MIXED_LIMITED:
 		if (!i3cbus->scl_rate.i3c)
-			i3cbus->scl_rate.i3c = I3C_BUS_TYP_I3C_SCL_RATE;
+			i3cbus->scl_rate.i3c = I3C_BUS_I3C_SCL_TYP_RATE;
 		if (!i3cbus->scl_rate.i2c)
 			i3cbus->scl_rate.i2c = max_i2c_scl_rate;
 		break;
@@ -754,8 +754,8 @@ static int i3c_bus_set_mode(struct i3c_bus *i3cbus, enum i3c_bus_mode mode,
 	 * I3C/I2C frequency may have been overridden, check that user-provided
 	 * values are not exceeding max possible frequency.
 	 */
-	if (i3cbus->scl_rate.i3c > I3C_BUS_MAX_I3C_SCL_RATE ||
-	    i3cbus->scl_rate.i2c > I3C_BUS_I2C_FM_PLUS_SCL_RATE)
+	if (i3cbus->scl_rate.i3c > I3C_BUS_I3C_SCL_MAX_RATE ||
+	    i3cbus->scl_rate.i2c > I3C_BUS_I2C_FM_PLUS_SCL_MAX_RATE)
 		return -EINVAL;
 
 	return 0;
@@ -2787,7 +2787,7 @@ int i3c_master_register(struct i3c_master_controller *master,
 			const struct i3c_master_controller_ops *ops,
 			bool secondary)
 {
-	unsigned long i2c_scl_rate = I3C_BUS_I2C_FM_PLUS_SCL_RATE;
+	unsigned long i2c_scl_rate = I3C_BUS_I2C_FM_PLUS_SCL_MAX_RATE;
 	struct i3c_bus *i3cbus = i3c_master_get_bus(master);
 	enum i3c_bus_mode mode = I3C_BUS_MODE_PURE;
 	struct i2c_dev_boardinfo *i2cbi;
@@ -2846,7 +2846,7 @@ int i3c_master_register(struct i3c_master_controller *master,
 		}
 
 		if (i2cbi->lvr & I3C_LVR_I2C_FM_MODE)
-			i2c_scl_rate = I3C_BUS_I2C_FM_SCL_RATE;
+			i2c_scl_rate = I3C_BUS_I2C_FM_SCL_MAX_RATE;
 	}
 
 	ret = i3c_bus_set_mode(i3cbus, mode, i2c_scl_rate);
diff --git a/drivers/i3c/master/dw-i3c-master.c b/drivers/i3c/master/dw-i3c-master.c
index cc872b481691..e61be28cd1e3 100644
--- a/drivers/i3c/master/dw-i3c-master.c
+++ b/drivers/i3c/master/dw-i3c-master.c
@@ -605,14 +605,14 @@ static int dw_i2c_clk_cfg(struct dw_i3c_master *master)
 	core_period = DIV_ROUND_UP(1000000000, core_rate);
 
 	lcnt = DIV_ROUND_UP(I3C_BUS_I2C_FMP_TLOW_MIN_NS, core_period);
-	hcnt = DIV_ROUND_UP(core_rate, I3C_BUS_I2C_FM_PLUS_SCL_RATE) - lcnt;
+	hcnt = DIV_ROUND_UP(core_rate, I3C_BUS_I2C_FM_PLUS_SCL_MAX_RATE) - lcnt;
 	scl_timing = SCL_I2C_FMP_TIMING_HCNT(hcnt) |
 		     SCL_I2C_FMP_TIMING_LCNT(lcnt);
 	writel(scl_timing, master->regs + SCL_I2C_FMP_TIMING);
 	master->i2c_fmp_timing = scl_timing;
 
 	lcnt = DIV_ROUND_UP(I3C_BUS_I2C_FM_TLOW_MIN_NS, core_period);
-	hcnt = DIV_ROUND_UP(core_rate, I3C_BUS_I2C_FM_SCL_RATE) - lcnt;
+	hcnt = DIV_ROUND_UP(core_rate, I3C_BUS_I2C_FM_SCL_MAX_RATE) - lcnt;
 	scl_timing = SCL_I2C_FM_TIMING_HCNT(hcnt) |
 		     SCL_I2C_FM_TIMING_LCNT(lcnt);
 	writel(scl_timing, master->regs + SCL_I2C_FM_TIMING);
diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h
index c67922ece617..7dfcbe530515 100644
--- a/include/linux/i3c/master.h
+++ b/include/linux/i3c/master.h
@@ -249,10 +249,11 @@ struct i3c_device {
  */
 #define I3C_BUS_MAX_DEVS		11
 
-#define I3C_BUS_MAX_I3C_SCL_RATE	12900000
-#define I3C_BUS_TYP_I3C_SCL_RATE	12500000
-#define I3C_BUS_I2C_FM_PLUS_SCL_RATE	1000000
-#define I3C_BUS_I2C_FM_SCL_RATE		400000
+/* Taken from the I3C Spec V1.1.1, chapter 6.2. "Timing specification" */
+#define I3C_BUS_I2C_FM_PLUS_SCL_MAX_RATE	1000000
+#define I3C_BUS_I2C_FM_SCL_MAX_RATE		400000
+#define I3C_BUS_I3C_SCL_MAX_RATE	12900000
+#define I3C_BUS_I3C_SCL_TYP_RATE	12500000
 #define I3C_BUS_TLOW_OD_MIN_NS		200
 
 /**
-- 
cgit v1.2.3


From 8acf1f3bae1ea48949458b67d68a72a95c3244a4 Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa+renesas@sang-engineering.com>
Date: Thu, 24 Jul 2025 11:41:41 +0200
Subject: i3c: Add more parameters for controllers to the header

Add standard timing value definition from specification.

Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Tested-by: Tommaso Merciai <tommaso.merciai.xr@bp.renesas.com>
Reviewed-by: Frank Li <Frank.Li@nxp.com>
Link: https://lore.kernel.org/r/20250724094146.6443-3-wsa+renesas@sang-engineering.com
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 include/linux/i3c/master.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h
index 7dfcbe530515..043f5c7ff398 100644
--- a/include/linux/i3c/master.h
+++ b/include/linux/i3c/master.h
@@ -254,6 +254,10 @@ struct i3c_device {
 #define I3C_BUS_I2C_FM_SCL_MAX_RATE		400000
 #define I3C_BUS_I3C_SCL_MAX_RATE	12900000
 #define I3C_BUS_I3C_SCL_TYP_RATE	12500000
+#define I3C_BUS_TAVAL_MIN_NS		1000
+#define I3C_BUS_TBUF_MIXED_FM_MIN_NS	1300
+#define I3C_BUS_THIGH_MIXED_MAX_NS	41
+#define I3C_BUS_TIDLE_MIN_NS		200000
 #define I3C_BUS_TLOW_OD_MIN_NS		200
 
 /**
-- 
cgit v1.2.3


From 0060beec0bfa647c4b510df188b1c4673a197839 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Mon, 28 Jul 2025 13:04:29 +0900
Subject: ata: libata-sata: Add link_power_management_supported sysfs attribute

A port link power management (LPM) policy can be controlled using the
link_power_management_policy sysfs host attribute. However, this
attribute exists also for hosts that do not support LPM and in such
case, attempting to change the LPM policy for the host (port) will fail
with -EOPNOTSUPP.

Introduce the new sysfs link_power_management_supported host attribute
to indicate to the user if the port and the devices connected to the
port for the host support LPM, which implies that the
link_power_management_policy attribute can be used.

Since checking that a port and its devices support LPM is common between
the new ata_scsi_lpm_supported_show() function and the existing
ata_scsi_lpm_store() function, the new helper ata_scsi_lpm_supported()
is introduced.

Fixes: 413e800cadbf ("ata: libata-sata: Disallow changing LPM state if not supported")
Reported-by: Borah, Chaitanya Kumar <chaitanya.kumar.borah@intel.com>
Reported-by: kernel test robot <oliver.sang@intel.com>
Closes: https://lore.kernel.org/oe-lkp/202507251014.a5becc3b-lkp@intel.com
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/ata/ata_piix.c    |  1 +
 drivers/ata/libahci.c     |  1 +
 drivers/ata/libata-sata.c | 53 ++++++++++++++++++++++++++++++++++++-----------
 include/linux/libata.h    |  1 +
 4 files changed, 44 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/drivers/ata/ata_piix.c b/drivers/ata/ata_piix.c
index 229429ba5027..495fa096dd65 100644
--- a/drivers/ata/ata_piix.c
+++ b/drivers/ata/ata_piix.c
@@ -1089,6 +1089,7 @@ static struct ata_port_operations ich_pata_ops = {
 };
 
 static struct attribute *piix_sidpr_shost_attrs[] = {
+	&dev_attr_link_power_management_supported.attr,
 	&dev_attr_link_power_management_policy.attr,
 	NULL
 };
diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c
index b335fb7e5cb4..c79abdfcd7a9 100644
--- a/drivers/ata/libahci.c
+++ b/drivers/ata/libahci.c
@@ -111,6 +111,7 @@ static DEVICE_ATTR(em_buffer, S_IWUSR | S_IRUGO,
 static DEVICE_ATTR(em_message_supported, S_IRUGO, ahci_show_em_supported, NULL);
 
 static struct attribute *ahci_shost_attrs[] = {
+	&dev_attr_link_power_management_supported.attr,
 	&dev_attr_link_power_management_policy.attr,
 	&dev_attr_em_message_type.attr,
 	&dev_attr_em_message.attr,
diff --git a/drivers/ata/libata-sata.c b/drivers/ata/libata-sata.c
index 4734465d3b1e..b2817a2995d6 100644
--- a/drivers/ata/libata-sata.c
+++ b/drivers/ata/libata-sata.c
@@ -900,14 +900,52 @@ static const char *ata_lpm_policy_names[] = {
 	[ATA_LPM_MIN_POWER]		= "min_power",
 };
 
+/*
+ * Check if a port supports link power management.
+ * Must be called with the port locked.
+ */
+static bool ata_scsi_lpm_supported(struct ata_port *ap)
+{
+	struct ata_link *link;
+	struct ata_device *dev;
+
+	if (ap->flags & ATA_FLAG_NO_LPM)
+		return false;
+
+	ata_for_each_link(link, ap, EDGE) {
+		ata_for_each_dev(dev, &ap->link, ENABLED) {
+			if (dev->quirks & ATA_QUIRK_NOLPM)
+				return false;
+		}
+	}
+
+	return true;
+}
+
+static ssize_t ata_scsi_lpm_supported_show(struct device *dev,
+				 struct device_attribute *attr, char *buf)
+{
+	struct Scsi_Host *shost = class_to_shost(dev);
+	struct ata_port *ap = ata_shost_to_port(shost);
+	unsigned long flags;
+	bool supported;
+
+	spin_lock_irqsave(ap->lock, flags);
+	supported = ata_scsi_lpm_supported(ap);
+	spin_unlock_irqrestore(ap->lock, flags);
+
+	return sysfs_emit(buf, "%d\n", supported);
+}
+DEVICE_ATTR(link_power_management_supported, S_IRUGO,
+	    ata_scsi_lpm_supported_show, NULL);
+EXPORT_SYMBOL_GPL(dev_attr_link_power_management_supported);
+
 static ssize_t ata_scsi_lpm_store(struct device *device,
 				  struct device_attribute *attr,
 				  const char *buf, size_t count)
 {
 	struct Scsi_Host *shost = class_to_shost(device);
 	struct ata_port *ap = ata_shost_to_port(shost);
-	struct ata_link *link;
-	struct ata_device *dev;
 	enum ata_lpm_policy policy;
 	unsigned long flags;
 
@@ -924,20 +962,11 @@ static ssize_t ata_scsi_lpm_store(struct device *device,
 
 	spin_lock_irqsave(ap->lock, flags);
 
-	if (ap->flags & ATA_FLAG_NO_LPM) {
+	if (!ata_scsi_lpm_supported(ap)) {
 		count = -EOPNOTSUPP;
 		goto out_unlock;
 	}
 
-	ata_for_each_link(link, ap, EDGE) {
-		ata_for_each_dev(dev, &ap->link, ENABLED) {
-			if (dev->quirks & ATA_QUIRK_NOLPM) {
-				count = -EOPNOTSUPP;
-				goto out_unlock;
-			}
-		}
-	}
-
 	ap->target_lpm_policy = policy;
 	ata_port_schedule_eh(ap);
 out_unlock:
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 912ace523880..0620dd67369f 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -545,6 +545,7 @@ typedef void (*ata_postreset_fn_t)(struct ata_link *link, unsigned int *classes)
 
 extern struct device_attribute dev_attr_unload_heads;
 #ifdef CONFIG_SATA_HOST
+extern struct device_attribute dev_attr_link_power_management_supported;
 extern struct device_attribute dev_attr_link_power_management_policy;
 extern struct device_attribute dev_attr_ncq_prio_supported;
 extern struct device_attribute dev_attr_ncq_prio_enable;
-- 
cgit v1.2.3


From 199d9ffb31650f948dd342ade1c1b920e157630f Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Date: Fri, 11 Jul 2025 15:31:36 +0200
Subject: module: move 'struct module_use' to internal.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The struct was moved to the public header file in commit c8e21ced08b3
("module: fix kdb's illicit use of struct module_use.").
Back then the structure was used outside of the module core.
Nowadays this is not true anymore, so the structure can be made internal.

Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Reviewed-by: Daniel Gomez <da.gomez@samsung.com>
Reviewed-by: Petr Pavlu <petr.pavlu@suse.com>
Link: https://lore.kernel.org/r/20250711-kunit-ifdef-modules-v2-1-39443decb1f8@linutronix.de
Signed-off-by: Daniel Gomez <da.gomez@samsung.com>
---
 include/linux/module.h   | 7 -------
 kernel/module/internal.h | 7 +++++++
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/module.h b/include/linux/module.h
index a7cac01d95e7..97c38e1cd377 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -313,13 +313,6 @@ void *__symbol_get_gpl(const char *symbol);
 		__used __section(".no_trim_symbol") = __stringify(x); \
 	(typeof(&x))(__symbol_get(__stringify(x))); })
 
-/* modules using other modules: kdb wants to see this. */
-struct module_use {
-	struct list_head source_list;
-	struct list_head target_list;
-	struct module *source, *target;
-};
-
 enum module_state {
 	MODULE_STATE_LIVE,	/* Normal state. */
 	MODULE_STATE_COMING,	/* Full formed, running module_init. */
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index 51ddd8866ef3..618202578b42 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -112,6 +112,13 @@ struct find_symbol_arg {
 	enum mod_license license;
 };
 
+/* modules using other modules */
+struct module_use {
+	struct list_head source_list;
+	struct list_head target_list;
+	struct module *source, *target;
+};
+
 int mod_verify_sig(const void *mod, struct load_info *info);
 int try_to_force_load(struct module *mod, const char *reason);
 bool find_symbol(struct find_symbol_arg *fsa);
-- 
cgit v1.2.3


From 818783c804bc051f7faf0ac226b5597f8259c6f8 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Date: Fri, 11 Jul 2025 15:31:37 +0200
Subject: module: make structure definitions always visible
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To write code that works with both CONFIG_MODULES=y and CONFIG_MODULES=n
it is convenient to use "if (IS_ENABLED(CONFIG_MODULES))" over raw #ifdef.
The code will still be fully typechecked but the unreachable parts are
discarded by the compiler. This prevents accidental breakage when a certain
kconfig combination was not specifically tested by the developer.
This pattern is already supported to some extent by module.h defining
empty stub functions if CONFIG_MODULES=n.
However some users of module.h work on the structures defined by module.h.

Therefore these structure definitions need to be visible, too.

Many structure members are still gated by specific configuration settings.
The assumption for those is that the code using them will be gated behind
the same configuration setting anyways.

Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Reviewed-by: Daniel Gomez <da.gomez@samsung.com>
Link: https://lore.kernel.org/r/20250711-kunit-ifdef-modules-v2-2-39443decb1f8@linutronix.de
Signed-off-by: Daniel Gomez <da.gomez@samsung.com>
---
 include/linux/module.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/module.h b/include/linux/module.h
index 97c38e1cd377..5fe812de2d84 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -303,16 +303,6 @@ static typeof(name) __mod_device_table__##type##__##name		\
 
 struct notifier_block;
 
-#ifdef CONFIG_MODULES
-
-/* Get/put a kernel symbol (calls must be symmetric) */
-void *__symbol_get(const char *symbol);
-void *__symbol_get_gpl(const char *symbol);
-#define symbol_get(x)	({ \
-	static const char __notrim[] \
-		__used __section(".no_trim_symbol") = __stringify(x); \
-	(typeof(&x))(__symbol_get(__stringify(x))); })
-
 enum module_state {
 	MODULE_STATE_LIVE,	/* Normal state. */
 	MODULE_STATE_COMING,	/* Full formed, running module_init. */
@@ -597,6 +587,16 @@ struct module {
 #define MODULE_ARCH_INIT {}
 #endif
 
+#ifdef CONFIG_MODULES
+
+/* Get/put a kernel symbol (calls must be symmetric) */
+void *__symbol_get(const char *symbol);
+void *__symbol_get_gpl(const char *symbol);
+#define symbol_get(x)	({ \
+	static const char __notrim[] \
+		__used __section(".no_trim_symbol") = __stringify(x); \
+	(typeof(&x))(__symbol_get(__stringify(x))); })
+
 #ifndef HAVE_ARCH_KALLSYMS_SYMBOL_VALUE
 static inline unsigned long kallsyms_symbol_value(const Elf_Sym *sym)
 {
-- 
cgit v1.2.3


From bdc877ba6b7ff1b6d2ebeff11e63da4a50a54854 Mon Sep 17 00:00:00 2001
From: Petr Pavlu <petr.pavlu@suse.com>
Date: Mon, 30 Jun 2025 16:32:34 +0200
Subject: module: Restore the moduleparam prefix length check

The moduleparam code allows modules to provide their own definition of
MODULE_PARAM_PREFIX, instead of using the default KBUILD_MODNAME ".".

Commit 730b69d22525 ("module: check kernel param length at compile time,
not runtime") added a check to ensure the prefix doesn't exceed
MODULE_NAME_LEN, as this is what param_sysfs_builtin() expects.

Later, commit 58f86cc89c33 ("VERIFY_OCTAL_PERMISSIONS: stricter checking
for sysfs perms.") removed this check, but there is no indication this was
intentional.

Since the check is still useful for param_sysfs_builtin() to function
properly, reintroduce it in __module_param_call(), but in a modernized form
using static_assert().

While here, clean up the __module_param_call() comments. In particular,
remove the comment "Default value instead of permissions?", which comes
from commit 9774a1f54f17 ("[PATCH] Compile-time check re world-writeable
module params"). This comment was related to the test variable
__param_perm_check_##name, which was removed in the previously mentioned
commit 58f86cc89c33.

Fixes: 58f86cc89c33 ("VERIFY_OCTAL_PERMISSIONS: stricter checking for sysfs perms.")
Signed-off-by: Petr Pavlu <petr.pavlu@suse.com>
Reviewed-by: Daniel Gomez <da.gomez@samsung.com>
Link: https://lore.kernel.org/r/20250630143535.267745-4-petr.pavlu@suse.com
Signed-off-by: Daniel Gomez <da.gomez@samsung.com>
---
 include/linux/moduleparam.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index bfb85fd13e1f..110e9d09de24 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -282,10 +282,9 @@ struct kparam_array
 #define __moduleparam_const const
 #endif
 
-/* This is the fundamental function for registering boot/module
-   parameters. */
+/* This is the fundamental function for registering boot/module parameters. */
 #define __module_param_call(prefix, name, ops, arg, perm, level, flags)	\
-	/* Default value instead of permissions? */			\
+	static_assert(sizeof(""prefix) - 1 <= MAX_PARAM_PREFIX_LEN);	\
 	static const char __param_str_##name[] = prefix #name;		\
 	static struct kernel_param __moduleparam_const __param_##name	\
 	__used __section("__param")					\
-- 
cgit v1.2.3


From 40a826bd6c82ae45cfd3a19cd2a60a10f56b74c0 Mon Sep 17 00:00:00 2001
From: Petr Pavlu <petr.pavlu@suse.com>
Date: Mon, 30 Jun 2025 16:32:36 +0200
Subject: module: Rename MAX_PARAM_PREFIX_LEN to __MODULE_NAME_LEN

The maximum module name length (MODULE_NAME_LEN) is somewhat confusingly
defined in terms of the maximum parameter prefix length
(MAX_PARAM_PREFIX_LEN), when in fact the dependency is in the opposite
direction.

This split originates from commit 730b69d22525 ("module: check kernel param
length at compile time, not runtime"). The code needed to use
MODULE_NAME_LEN in moduleparam.h, but because module.h requires
moduleparam.h, this created a circular dependency. It was resolved by
introducing MAX_PARAM_PREFIX_LEN in moduleparam.h and defining
MODULE_NAME_LEN in module.h in terms of MAX_PARAM_PREFIX_LEN.

Rename MAX_PARAM_PREFIX_LEN to __MODULE_NAME_LEN for clarity. This matches
the similar approach of defining MODULE_INFO in module.h and __MODULE_INFO
in moduleparam.h.

Signed-off-by: Petr Pavlu <petr.pavlu@suse.com>
Reviewed-by: Daniel Gomez <da.gomez@samsung.com>
Link: https://lore.kernel.org/r/20250630143535.267745-6-petr.pavlu@suse.com
Signed-off-by: Daniel Gomez <da.gomez@samsung.com>
---
 include/linux/module.h      |  2 +-
 include/linux/moduleparam.h | 12 ++++++++----
 2 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/module.h b/include/linux/module.h
index 5fe812de2d84..313ecb8e5181 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -33,7 +33,7 @@
 #include <linux/percpu.h>
 #include <asm/module.h>
 
-#define MODULE_NAME_LEN MAX_PARAM_PREFIX_LEN
+#define MODULE_NAME_LEN __MODULE_NAME_LEN
 
 struct modversion_info {
 	unsigned long crc;
diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index 110e9d09de24..a04a2bc4f51e 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -6,6 +6,13 @@
 #include <linux/stringify.h>
 #include <linux/kernel.h>
 
+/*
+ * The maximum module name length, including the NUL byte.
+ * Chosen so that structs with an unsigned long line up, specifically
+ * modversion_info.
+ */
+#define __MODULE_NAME_LEN (64 - sizeof(unsigned long))
+
 /* You can override this manually, but generally this should match the
    module name. */
 #ifdef MODULE
@@ -17,9 +24,6 @@
 #define __MODULE_INFO_PREFIX KBUILD_MODNAME "."
 #endif
 
-/* Chosen so that structs with an unsigned long line up. */
-#define MAX_PARAM_PREFIX_LEN (64 - sizeof(unsigned long))
-
 #define __MODULE_INFO(tag, name, info)					  \
 	static const char __UNIQUE_ID(name)[]				  \
 		__used __section(".modinfo") __aligned(1)		  \
@@ -284,7 +288,7 @@ struct kparam_array
 
 /* This is the fundamental function for registering boot/module parameters. */
 #define __module_param_call(prefix, name, ops, arg, perm, level, flags)	\
-	static_assert(sizeof(""prefix) - 1 <= MAX_PARAM_PREFIX_LEN);	\
+	static_assert(sizeof(""prefix) - 1 <= __MODULE_NAME_LEN);	\
 	static const char __param_str_##name[] = prefix #name;		\
 	static struct kernel_param __moduleparam_const __param_##name	\
 	__used __section("__param")					\
-- 
cgit v1.2.3


From b9c73524106e1c0c857006fb9ff2e5a510dc4021 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Tue, 29 Jul 2025 14:23:07 -0400
Subject: unwind_user/deferred: Add unwind cache

Cache the results of the unwind to ensure the unwind is only performed
once, even when called by multiple tracers.

The cache nr_entries gets cleared every time the task exits the kernel.
When a stacktrace is requested, nr_entries gets set to the number of
entries in the stacktrace. If another stacktrace is requested and
nr_entries is not zero, then the cache already contains the same stacktrace
that would be retrieved, so it is not processed again and the cached entries
are given to the caller.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Indu Bhagat <indu.bhagat@oracle.com>
Cc: "Jose E. Marchesi" <jemarch@gnu.org>
Cc: Beau Belgrave <beaub@linux.microsoft.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Florian Weimer <fweimer@redhat.com>
Cc: Sam James <sam@gentoo.org>
Link: https://lore.kernel.org/20250729182405.319691167@kernel.org
Reviewed-by: Jens Remus <jremus@linux.ibm.com>
Reviewed-By: Indu Bhagat <indu.bhagat@oracle.com>
Co-developed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/entry-common.h          |  2 ++
 include/linux/unwind_deferred.h       |  8 ++++++++
 include/linux/unwind_deferred_types.h |  7 ++++++-
 kernel/unwind/deferred.c              | 31 ++++++++++++++++++++++++-------
 4 files changed, 40 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index f94f3fdf15fc..8908b8eeb99b 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -12,6 +12,7 @@
 #include <linux/resume_user_mode.h>
 #include <linux/tick.h>
 #include <linux/kmsan.h>
+#include <linux/unwind_deferred.h>
 
 #include <asm/entry-common.h>
 #include <asm/syscall.h>
@@ -362,6 +363,7 @@ static __always_inline void exit_to_user_mode(void)
 	lockdep_hardirqs_on_prepare();
 	instrumentation_end();
 
+	unwind_reset_info();
 	user_enter_irqoff();
 	arch_exit_to_user_mode();
 	lockdep_hardirqs_on(CALLER_ADDR0);
diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h
index a5f6e8f8a1a2..baacf4a1eb4c 100644
--- a/include/linux/unwind_deferred.h
+++ b/include/linux/unwind_deferred.h
@@ -12,6 +12,12 @@ void unwind_task_free(struct task_struct *task);
 
 int unwind_user_faultable(struct unwind_stacktrace *trace);
 
+static __always_inline void unwind_reset_info(void)
+{
+	if (unlikely(current->unwind_info.cache))
+		current->unwind_info.cache->nr_entries = 0;
+}
+
 #else /* !CONFIG_UNWIND_USER */
 
 static inline void unwind_task_init(struct task_struct *task) {}
@@ -19,6 +25,8 @@ static inline void unwind_task_free(struct task_struct *task) {}
 
 static inline int unwind_user_faultable(struct unwind_stacktrace *trace) { return -ENOSYS; }
 
+static inline void unwind_reset_info(void) {}
+
 #endif /* !CONFIG_UNWIND_USER */
 
 #endif /* _LINUX_UNWIND_USER_DEFERRED_H */
diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h
index aa32db574e43..db5b54b18828 100644
--- a/include/linux/unwind_deferred_types.h
+++ b/include/linux/unwind_deferred_types.h
@@ -2,8 +2,13 @@
 #ifndef _LINUX_UNWIND_USER_DEFERRED_TYPES_H
 #define _LINUX_UNWIND_USER_DEFERRED_TYPES_H
 
+struct unwind_cache {
+	unsigned int		nr_entries;
+	unsigned long		entries[];
+};
+
 struct unwind_task_info {
-	unsigned long		*entries;
+	struct unwind_cache	*cache;
 };
 
 #endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */
diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
index a0badbeb3cc1..96368a5aa522 100644
--- a/kernel/unwind/deferred.c
+++ b/kernel/unwind/deferred.c
@@ -4,10 +4,13 @@
  */
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sizes.h>
 #include <linux/slab.h>
 #include <linux/unwind_deferred.h>
 
-#define UNWIND_MAX_ENTRIES 512
+/* Make the cache fit in a 4K page */
+#define UNWIND_MAX_ENTRIES					\
+	((SZ_4K - sizeof(struct unwind_cache)) / sizeof(long))
 
 /**
  * unwind_user_faultable - Produce a user stacktrace in faultable context
@@ -24,6 +27,7 @@
 int unwind_user_faultable(struct unwind_stacktrace *trace)
 {
 	struct unwind_task_info *info = &current->unwind_info;
+	struct unwind_cache *cache;
 
 	/* Should always be called from faultable context */
 	might_fault();
@@ -31,17 +35,30 @@ int unwind_user_faultable(struct unwind_stacktrace *trace)
 	if (current->flags & PF_EXITING)
 		return -EINVAL;
 
-	if (!info->entries) {
-		info->entries = kmalloc_array(UNWIND_MAX_ENTRIES, sizeof(long),
-					      GFP_KERNEL);
-		if (!info->entries)
+	if (!info->cache) {
+		info->cache = kzalloc(struct_size(cache, entries, UNWIND_MAX_ENTRIES),
+				      GFP_KERNEL);
+		if (!info->cache)
 			return -ENOMEM;
 	}
 
+	cache = info->cache;
+	trace->entries = cache->entries;
+
+	if (cache->nr_entries) {
+		/*
+		 * The user stack has already been previously unwound in this
+		 * entry context.  Skip the unwind and use the cache.
+		 */
+		trace->nr = cache->nr_entries;
+		return 0;
+	}
+
 	trace->nr = 0;
-	trace->entries = info->entries;
 	unwind_user(trace, UNWIND_MAX_ENTRIES);
 
+	cache->nr_entries = trace->nr;
+
 	return 0;
 }
 
@@ -56,5 +73,5 @@ void unwind_task_free(struct task_struct *task)
 {
 	struct unwind_task_info *info = &task->unwind_info;
 
-	kfree(info->entries);
+	kfree(info->cache);
 }
-- 
cgit v1.2.3


From 2dffa355f6c279e7d2e574abf9446c41a631c9e5 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@kernel.org>
Date: Tue, 29 Jul 2025 14:23:08 -0400
Subject: unwind_user/deferred: Add deferred unwinding interface

Add an interface for scheduling task work to unwind the user space stack
before returning to user space. This solves several problems for its
callers:

  - Ensure the unwind happens in task context even if the caller may be
    running in interrupt context.

  - Avoid duplicate unwinds, whether called multiple times by the same
    caller or by different callers.

  - Create a "context cookie" which allows trace post-processing to
    correlate kernel unwinds/traces with the user unwind.

A concept of a "cookie" is created to detect when the stacktrace is the
same. A cookie is generated the first time a user space stacktrace is
requested after the task enters the kernel. As the stacktrace is saved on
the task_struct while the task is in the kernel, if another request comes
in, if the cookie is still the same, it will use the saved stacktrace,
and not have to regenerate one.

The cookie is passed to the caller on request, and when the stacktrace is
generated upon returning to user space, it calls the requester's callback
with the cookie as well as the stacktrace. The cookie is cleared
when it goes back to user space. Note, this currently adds another
conditional to the unwind_reset_info() path that is always called
returning to user space, but future changes will put this back to a single
conditional.

A global list is created and protected by a global mutex that holds
tracers that register with the unwind infrastructure. The number of
registered tracers will be limited in future changes. Each perf program or
ftrace instance will register its own descriptor to use for deferred
unwind stack traces.

Note, in the function unwind_deferred_task_work() that gets called when
returning to user space, it uses a global mutex for synchronization which
will cause a big bottleneck. This will be replaced by SRCU, but that
change adds some complex synchronization that deserves its own commit.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Indu Bhagat <indu.bhagat@oracle.com>
Cc: "Jose E. Marchesi" <jemarch@gnu.org>
Cc: Beau Belgrave <beaub@linux.microsoft.com>
Cc: Jens Remus <jremus@linux.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Florian Weimer <fweimer@redhat.com>
Cc: Sam James <sam@gentoo.org>
Link: https://lore.kernel.org/20250729182405.488066537@kernel.org
Co-developed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/unwind_deferred.h       |  24 ++++++
 include/linux/unwind_deferred_types.h |  24 ++++++
 kernel/unwind/deferred.c              | 156 +++++++++++++++++++++++++++++++++-
 3 files changed, 203 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h
index baacf4a1eb4c..14efd8c027aa 100644
--- a/include/linux/unwind_deferred.h
+++ b/include/linux/unwind_deferred.h
@@ -2,9 +2,19 @@
 #ifndef _LINUX_UNWIND_USER_DEFERRED_H
 #define _LINUX_UNWIND_USER_DEFERRED_H
 
+#include <linux/task_work.h>
 #include <linux/unwind_user.h>
 #include <linux/unwind_deferred_types.h>
 
+struct unwind_work;
+
+typedef void (*unwind_callback_t)(struct unwind_work *work, struct unwind_stacktrace *trace, u64 cookie);
+
+struct unwind_work {
+	struct list_head		list;
+	unwind_callback_t		func;
+};
+
 #ifdef CONFIG_UNWIND_USER
 
 void unwind_task_init(struct task_struct *task);
@@ -12,8 +22,19 @@ void unwind_task_free(struct task_struct *task);
 
 int unwind_user_faultable(struct unwind_stacktrace *trace);
 
+int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func);
+int unwind_deferred_request(struct unwind_work *work, u64 *cookie);
+void unwind_deferred_cancel(struct unwind_work *work);
+
 static __always_inline void unwind_reset_info(void)
 {
+	if (unlikely(current->unwind_info.id.id))
+		current->unwind_info.id.id = 0;
+	/*
+	 * As unwind_user_faultable() can be called directly and
+	 * depends on nr_entries being cleared on exit to user,
+	 * this needs to be a separate conditional.
+	 */
 	if (unlikely(current->unwind_info.cache))
 		current->unwind_info.cache->nr_entries = 0;
 }
@@ -24,6 +45,9 @@ static inline void unwind_task_init(struct task_struct *task) {}
 static inline void unwind_task_free(struct task_struct *task) {}
 
 static inline int unwind_user_faultable(struct unwind_stacktrace *trace) { return -ENOSYS; }
+static inline int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func) { return -ENOSYS; }
+static inline int unwind_deferred_request(struct unwind_work *work, u64 *timestamp) { return -ENOSYS; }
+static inline void unwind_deferred_cancel(struct unwind_work *work) {}
 
 static inline void unwind_reset_info(void) {}
 
diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h
index db5b54b18828..104c477d5609 100644
--- a/include/linux/unwind_deferred_types.h
+++ b/include/linux/unwind_deferred_types.h
@@ -7,8 +7,32 @@ struct unwind_cache {
 	unsigned long		entries[];
 };
 
+/*
+ * The unwind_task_id is a unique identifier that maps to a user space
+ * stacktrace. It is generated the first time a deferred user space
+ * stacktrace is requested after a task has entered the kernel and
+ * is cleared to zero when it exits. The mapped id will be a non-zero
+ * number.
+ *
+ * To simplify the generation of the 64 bit number, 32 bits will be
+ * the CPU it was generated on, and the other 32 bits will be a per
+ * cpu counter that gets incremented by two every time a new identifier
+ * is generated. The LSB will always be set to keep the value
+ * from being zero.
+ */
+union unwind_task_id {
+	struct {
+		u32		cpu;
+		u32		cnt;
+	};
+	u64			id;
+};
+
 struct unwind_task_info {
 	struct unwind_cache	*cache;
+	struct callback_head	work;
+	union unwind_task_id	id;
+	int			pending;
 };
 
 #endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */
diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
index 96368a5aa522..2cbae2ada309 100644
--- a/kernel/unwind/deferred.c
+++ b/kernel/unwind/deferred.c
@@ -2,16 +2,63 @@
 /*
  * Deferred user space unwinding
  */
+#include <linux/sched/task_stack.h>
+#include <linux/unwind_deferred.h>
+#include <linux/sched/clock.h>
+#include <linux/task_work.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/sizes.h>
 #include <linux/slab.h>
-#include <linux/unwind_deferred.h>
+#include <linux/mm.h>
 
 /* Make the cache fit in a 4K page */
 #define UNWIND_MAX_ENTRIES					\
 	((SZ_4K - sizeof(struct unwind_cache)) / sizeof(long))
 
+/* Guards adding to and reading the list of callbacks */
+static DEFINE_MUTEX(callback_mutex);
+static LIST_HEAD(callbacks);
+
+/*
+ * This is a unique percpu identifier for a given task entry context.
+ * Conceptually, it's incremented every time the CPU enters the kernel from
+ * user space, so that each "entry context" on the CPU gets a unique ID.  In
+ * reality, as an optimization, it's only incremented on demand for the first
+ * deferred unwind request after a given entry-from-user.
+ *
+ * It's combined with the CPU id to make a systemwide-unique "context cookie".
+ */
+static DEFINE_PER_CPU(u32, unwind_ctx_ctr);
+
+/*
+ * The context cookie is a unique identifier that is assigned to a user
+ * space stacktrace. As the user space stacktrace remains the same while
+ * the task is in the kernel, the cookie is an identifier for the stacktrace.
+ * Although it is possible for the stacktrace to get another cookie if another
+ * request is made after the cookie was cleared and before reentering user
+ * space.
+ */
+static u64 get_cookie(struct unwind_task_info *info)
+{
+	u32 cnt = 1;
+	u32 old = 0;
+
+	if (info->id.cpu)
+		return info->id.id;
+
+	/* LSB is always set to ensure 0 is an invalid value */
+	cnt |= __this_cpu_read(unwind_ctx_ctr) + 2;
+	if (try_cmpxchg(&info->id.cnt, &old, cnt)) {
+		/* Update the per cpu counter */
+		__this_cpu_write(unwind_ctx_ctr, cnt);
+	}
+	/* Interrupts are disabled, the CPU will always be same */
+	info->id.cpu = smp_processor_id() + 1; /* Must be non zero */
+
+	return info->id.id;
+}
+
 /**
  * unwind_user_faultable - Produce a user stacktrace in faultable context
  * @trace: The descriptor that will store the user stacktrace
@@ -62,11 +109,117 @@ int unwind_user_faultable(struct unwind_stacktrace *trace)
 	return 0;
 }
 
+static void unwind_deferred_task_work(struct callback_head *head)
+{
+	struct unwind_task_info *info = container_of(head, struct unwind_task_info, work);
+	struct unwind_stacktrace trace;
+	struct unwind_work *work;
+	u64 cookie;
+
+	if (WARN_ON_ONCE(!info->pending))
+		return;
+
+	/* Allow work to come in again */
+	WRITE_ONCE(info->pending, 0);
+
+	/*
+	 * From here on out, the callback must always be called, even if it's
+	 * just an empty trace.
+	 */
+	trace.nr = 0;
+	trace.entries = NULL;
+
+	unwind_user_faultable(&trace);
+
+	cookie = info->id.id;
+
+	guard(mutex)(&callback_mutex);
+	list_for_each_entry(work, &callbacks, list) {
+		work->func(work, &trace, cookie);
+	}
+}
+
+/**
+ * unwind_deferred_request - Request a user stacktrace on task kernel exit
+ * @work: Unwind descriptor requesting the trace
+ * @cookie: The cookie of the first request made for this task
+ *
+ * Schedule a user space unwind to be done in task work before exiting the
+ * kernel.
+ *
+ * The returned @cookie output is the generated cookie of the very first
+ * request for a user space stacktrace for this task since it entered the
+ * kernel. It can be from a request by any caller of this infrastructure.
+ * Its value will also be passed to the callback function.  It can be
+ * used to stitch kernel and user stack traces together in post-processing.
+ *
+ * It's valid to call this function multiple times for the same @work within
+ * the same task entry context.  Each call will return the same cookie
+ * while the task hasn't left the kernel. If the callback is not pending
+ * because it has already been previously called for the same entry context,
+ * it will be called again with the same stack trace and cookie.
+ *
+ * Return: 1 if the callback was already queued.
+ *         0 if the callback successfully was queued.
+ *         Negative if there's an error.
+ *         @cookie holds the cookie of the first request by any user
+ */
+int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
+{
+	struct unwind_task_info *info = &current->unwind_info;
+	int ret;
+
+	*cookie = 0;
+
+	if (WARN_ON_ONCE(in_nmi()))
+		return -EINVAL;
+
+	if ((current->flags & (PF_KTHREAD | PF_EXITING)) ||
+	    !user_mode(task_pt_regs(current)))
+		return -EINVAL;
+
+	guard(irqsave)();
+
+	*cookie = get_cookie(info);
+
+	/* callback already pending? */
+	if (info->pending)
+		return 1;
+
+	/* The work has been claimed, now schedule it. */
+	ret = task_work_add(current, &info->work, TWA_RESUME);
+	if (WARN_ON_ONCE(ret))
+		return ret;
+
+	info->pending = 1;
+	return 0;
+}
+
+void unwind_deferred_cancel(struct unwind_work *work)
+{
+	if (!work)
+		return;
+
+	guard(mutex)(&callback_mutex);
+	list_del(&work->list);
+}
+
+int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func)
+{
+	memset(work, 0, sizeof(*work));
+
+	guard(mutex)(&callback_mutex);
+	list_add(&work->list, &callbacks);
+	work->func = func;
+	return 0;
+}
+
 void unwind_task_init(struct task_struct *task)
 {
 	struct unwind_task_info *info = &task->unwind_info;
 
 	memset(info, 0, sizeof(*info));
+	init_task_work(&info->work, unwind_deferred_task_work);
 }
 
 void unwind_task_free(struct task_struct *task)
@@ -74,4 +227,5 @@ void unwind_task_free(struct task_struct *task)
 	struct unwind_task_info *info = &task->unwind_info;
 
 	kfree(info->cache);
+	task_work_cancel(task, &info->work);
 }
-- 
cgit v1.2.3


From be3d526a5b34109cecf3bc23b96f0081ad600a5b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 29 Jul 2025 14:23:10 -0400
Subject: unwind deferred: Use bitmask to determine which callbacks to call

In order to know which registered callback requested a stacktrace for when
the task goes back to user space, add a bitmask to keep track of all
registered tracers. The bitmask is the size of long, which means that on a
32 bit machine, it can have at most 32 registered tracers, and on 64 bit,
it can have at most 64 registered tracers. This should not be an issue as
there should not be more than 10 (unless BPF can abuse this?).

When a tracer registers with unwind_deferred_init() it will get a bit
number assigned to it. When a tracer requests a stacktrace, it will have
its bit set within the task_struct. When the task returns back to user
space, it will call the callbacks for all the registered tracers where
their bits are set in the task's mask.

When a tracer is removed by the unwind_deferred_cancel() all current tasks
will clear the associated bit, just in case another tracer gets registered
immediately afterward and then gets their callback called unexpectedly.

To prevent live locks from happening if an event that happens between the
task_work and when the task goes back to user space, triggers the deferred
unwind, have the unwind_mask get cleared on exit to user space and not
after the callback is made.

Move the pending bit from a value on the task_struct to bit zero of the
unwind_mask (saves space on the task_struct). This will allow modifying
the pending bit along with the work bits atomically.

Instead of clearing a work's bit after its callback is called, it is
delayed until exit. If the work is requested again, the task_work is not
queued again and the request will be notified that the task has already been
called by returning a positive number (the same as if it was already
pending).

The pending bit is cleared before calling the callback functions but the
current work bits remain. If one of the called works registers again, it
will not trigger a task_work if its bit is still present in the task's
unwind_mask.

If a new work requests a deferred unwind, then it will set both the
pending bit and its own bit. Note this will also cause any work that was
previously queued and had their callback already executed to be executed
again. Future work will remove these spurious callbacks.

The use of atomic_long bit operations was suggested by Peter Zijlstra:
Link: https://lore.kernel.org/all/20250715102912.GQ1613200@noisy.programming.kicks-ass.net/
The unwind_mask could not be converted to atomic_long_t due to atomic_long
not having all the bit operations needed by unwind_mask. Instead it
follows other use cases in the kernel and just typecasts the unwind_mask
to atomic_long_t when using the two atomic_long functions.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Josh Poimboeuf <jpoimboe@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Indu Bhagat <indu.bhagat@oracle.com>
Cc: "Jose E. Marchesi" <jemarch@gnu.org>
Cc: Beau Belgrave <beaub@linux.microsoft.com>
Cc: Jens Remus <jremus@linux.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Florian Weimer <fweimer@redhat.com>
Cc: Sam James <sam@gentoo.org>
Link: https://lore.kernel.org/20250729182405.822789300@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/unwind_deferred.h       | 26 +++++++++--
 include/linux/unwind_deferred_types.h |  2 +-
 kernel/unwind/deferred.c              | 87 +++++++++++++++++++++++++++--------
 3 files changed, 92 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h
index 14efd8c027aa..337ead927d4d 100644
--- a/include/linux/unwind_deferred.h
+++ b/include/linux/unwind_deferred.h
@@ -13,10 +13,19 @@ typedef void (*unwind_callback_t)(struct unwind_work *work, struct unwind_stackt
 struct unwind_work {
 	struct list_head		list;
 	unwind_callback_t		func;
+	int				bit;
 };
 
 #ifdef CONFIG_UNWIND_USER
 
+enum {
+	UNWIND_PENDING_BIT = 0,
+};
+
+enum {
+	UNWIND_PENDING		= BIT(UNWIND_PENDING_BIT),
+};
+
 void unwind_task_init(struct task_struct *task);
 void unwind_task_free(struct task_struct *task);
 
@@ -28,15 +37,26 @@ void unwind_deferred_cancel(struct unwind_work *work);
 
 static __always_inline void unwind_reset_info(void)
 {
-	if (unlikely(current->unwind_info.id.id))
+	struct unwind_task_info *info = &current->unwind_info;
+	unsigned long bits;
+
+	/* Was there any unwinding? */
+	if (unlikely(info->unwind_mask)) {
+		bits = info->unwind_mask;
+		do {
+			/* Is a task_work going to run again before going back */
+			if (bits & UNWIND_PENDING)
+				return;
+		} while (!try_cmpxchg(&info->unwind_mask, &bits, 0UL));
 		current->unwind_info.id.id = 0;
+	}
 	/*
 	 * As unwind_user_faultable() can be called directly and
 	 * depends on nr_entries being cleared on exit to user,
 	 * this needs to be a separate conditional.
 	 */
-	if (unlikely(current->unwind_info.cache))
-		current->unwind_info.cache->nr_entries = 0;
+	if (unlikely(info->cache))
+		info->cache->nr_entries = 0;
 }
 
 #else /* !CONFIG_UNWIND_USER */
diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h
index 104c477d5609..5dc9cda141ff 100644
--- a/include/linux/unwind_deferred_types.h
+++ b/include/linux/unwind_deferred_types.h
@@ -29,10 +29,10 @@ union unwind_task_id {
 };
 
 struct unwind_task_info {
+	unsigned long		unwind_mask;
 	struct unwind_cache	*cache;
 	struct callback_head	work;
 	union unwind_task_id	id;
-	int			pending;
 };
 
 #endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */
diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
index c5ac087d2396..e19f02ef416d 100644
--- a/kernel/unwind/deferred.c
+++ b/kernel/unwind/deferred.c
@@ -45,6 +45,16 @@ static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt)
 static DEFINE_MUTEX(callback_mutex);
 static LIST_HEAD(callbacks);
 
+#define RESERVED_BITS	(UNWIND_PENDING)
+
+/* Zero'd bits are available for assigning callback users */
+static unsigned long unwind_mask = RESERVED_BITS;
+
+static inline bool unwind_pending(struct unwind_task_info *info)
+{
+	return test_bit(UNWIND_PENDING_BIT, &info->unwind_mask);
+}
+
 /*
  * This is a unique percpu identifier for a given task entry context.
  * Conceptually, it's incremented every time the CPU enters the kernel from
@@ -138,14 +148,15 @@ static void unwind_deferred_task_work(struct callback_head *head)
 	struct unwind_task_info *info = container_of(head, struct unwind_task_info, work);
 	struct unwind_stacktrace trace;
 	struct unwind_work *work;
+	unsigned long bits;
 	u64 cookie;
 
-	if (WARN_ON_ONCE(!info->pending))
+	if (WARN_ON_ONCE(!unwind_pending(info)))
 		return;
 
-	/* Allow work to come in again */
-	WRITE_ONCE(info->pending, 0);
-
+	/* Clear pending bit but make sure to have the current bits */
+	bits = atomic_long_fetch_andnot(UNWIND_PENDING,
+				  (atomic_long_t *)&info->unwind_mask);
 	/*
 	 * From here on out, the callback must always be called, even if it's
 	 * just an empty trace.
@@ -159,7 +170,8 @@ static void unwind_deferred_task_work(struct callback_head *head)
 
 	guard(mutex)(&callback_mutex);
 	list_for_each_entry(work, &callbacks, list) {
-		work->func(work, &trace, cookie);
+		if (test_bit(work->bit, &bits))
+			work->func(work, &trace, cookie);
 	}
 }
 
@@ -183,15 +195,16 @@ static void unwind_deferred_task_work(struct callback_head *head)
  * because it has already been previously called for the same entry context,
  * it will be called again with the same stack trace and cookie.
  *
- * Return: 1 if the the callback was already queued.
- *         0 if the callback successfully was queued.
+ * Return: 0 if the callback successfully was queued.
+ *         1 if the callback is pending or was already executed.
  *         Negative if there's an error.
  *         @cookie holds the cookie of the first request by any user
  */
 int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
 {
 	struct unwind_task_info *info = &current->unwind_info;
-	long pending;
+	unsigned long old, bits;
+	unsigned long bit = BIT(work->bit);
 	int ret;
 
 	*cookie = 0;
@@ -212,32 +225,59 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
 
 	*cookie = get_cookie(info);
 
-	/* callback already pending? */
-	pending = READ_ONCE(info->pending);
-	if (pending)
-		return 1;
+	old = READ_ONCE(info->unwind_mask);
 
-	/* Claim the work unless an NMI just now swooped in to do so. */
-	if (!try_cmpxchg(&info->pending, &pending, 1))
+	/* Is this already queued or executed */
+	if (old & bit)
 		return 1;
 
+	/*
+	 * This work's bit hasn't been set yet. Now set it with the PENDING
+	 * bit and fetch the current value of unwind_mask. If ether the
+	 * work's bit or PENDING was already set, then this is already queued
+	 * to have a callback.
+	 */
+	bits = UNWIND_PENDING | bit;
+	old = atomic_long_fetch_or(bits, (atomic_long_t *)&info->unwind_mask);
+	if (old & bits) {
+		/*
+		 * If the work's bit was set, whatever set it had better
+		 * have also set pending and queued a callback.
+		 */
+		WARN_ON_ONCE(!(old & UNWIND_PENDING));
+		return old & bit;
+	}
+
 	/* The work has been claimed, now schedule it. */
 	ret = task_work_add(current, &info->work, TWA_RESUME);
-	if (WARN_ON_ONCE(ret)) {
-		WRITE_ONCE(info->pending, 0);
-		return ret;
-	}
 
-	return 0;
+	if (WARN_ON_ONCE(ret))
+		WRITE_ONCE(info->unwind_mask, 0);
+
+	return ret;
 }
 
 void unwind_deferred_cancel(struct unwind_work *work)
 {
+	struct task_struct *g, *t;
+
 	if (!work)
 		return;
 
+	/* No work should be using a reserved bit */
+	if (WARN_ON_ONCE(BIT(work->bit) & RESERVED_BITS))
+		return;
+
 	guard(mutex)(&callback_mutex);
 	list_del(&work->list);
+
+	__clear_bit(work->bit, &unwind_mask);
+
+	guard(rcu)();
+	/* Clear this bit from all threads */
+	for_each_process_thread(g, t) {
+		clear_bit(work->bit, &t->unwind_info.unwind_mask);
+	}
 }
 
 int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func)
@@ -245,6 +285,14 @@ int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func)
 	memset(work, 0, sizeof(*work));
 
 	guard(mutex)(&callback_mutex);
+
+	/* See if there's a bit in the mask available */
+	if (unwind_mask == ~0UL)
+		return -EBUSY;
+
+	work->bit = ffz(unwind_mask);
+	__set_bit(work->bit, &unwind_mask);
+
 	list_add(&work->list, &callbacks);
 	work->func = func;
 	return 0;
@@ -256,6 +304,7 @@ void unwind_task_init(struct task_struct *task)
 
 	memset(info, 0, sizeof(*info));
 	init_task_work(&info->work, unwind_deferred_task_work);
+	info->unwind_mask = 0;
 }
 
 void unwind_task_free(struct task_struct *task)
-- 
cgit v1.2.3


From 4c75133e745aa95636c9ccbab1603ed363dabcd4 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 29 Jul 2025 14:23:11 -0400
Subject: unwind deferred: Add unwind_completed mask to stop spurious callbacks

If there's more than one registered tracer to the unwind deferred
infrastructure, it is currently possible that one tracer could cause extra
callbacks to happen for another tracer if the former requests a deferred
stacktrace after the latter's callback was executed and before the task
went back to user space.

Here's an example of how this could occur:

  [Task enters kernel]
    tracer 1 request -> add cookie to its buffer
    tracer 1 request -> add cookie to its buffer
    <..>
    [ task work executes ]
    tracer 1 callback -> add trace + cookie to its buffer

    [tracer 2 requests and triggers the task work again]
    [ task work executes again ]
    tracer 1 callback -> add trace + cookie to its buffer
    tracer 2 callback -> add trace + cookie to its buffer
 [Task exits back to user space]

This is because the bit for tracer 1 gets set in the task's unwind_mask
when it did its request and does not get cleared until the task returns
back to user space. But if another tracer were to request another deferred
stacktrace, then the next task work will execute all tracers' callbacks
that have their bits set in the task's unwind_mask.

To fix this issue, add another mask called unwind_completed and place it
into the task's info->cache structure. The cache structure is allocated
on the first occurrence of a deferred stacktrace and this unwind_completed
mask is not needed until then. It's better to have it in the cache than to
permanently waste space in the task_struct.

After a tracer's callback is executed, its bit gets set in this
unwind_completed mask. When the task_work enters, it will AND the task's
unwind_mask with the inverse of the unwind_completed which will eliminate
any work that already had its callback executed since the task entered the
kernel.

When the task leaves the kernel, it will reset this unwind_completed mask
just like it resets the other values as it enters user space.

Link: https://lore.kernel.org/all/20250716142609.47f0e4a5@batman.local.home/

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Josh Poimboeuf <jpoimboe@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Indu Bhagat <indu.bhagat@oracle.com>
Cc: "Jose E. Marchesi" <jemarch@gnu.org>
Cc: Beau Belgrave <beaub@linux.microsoft.com>
Cc: Jens Remus <jremus@linux.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Florian Weimer <fweimer@redhat.com>
Cc: Sam James <sam@gentoo.org>
Link: https://lore.kernel.org/20250729182405.989222722@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/unwind_deferred.h       |  4 +++-
 include/linux/unwind_deferred_types.h |  1 +
 kernel/unwind/deferred.c              | 19 +++++++++++++++----
 3 files changed, 19 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h
index 337ead927d4d..b9ec4c8515c7 100644
--- a/include/linux/unwind_deferred.h
+++ b/include/linux/unwind_deferred.h
@@ -55,8 +55,10 @@ static __always_inline void unwind_reset_info(void)
 	 * depends on nr_entries being cleared on exit to user,
 	 * this needs to be a separate conditional.
 	 */
-	if (unlikely(info->cache))
+	if (unlikely(info->cache)) {
 		info->cache->nr_entries = 0;
+		info->cache->unwind_completed = 0;
+	}
 }
 
 #else /* !CONFIG_UNWIND_USER */
diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h
index 5dc9cda141ff..33b62ac25c86 100644
--- a/include/linux/unwind_deferred_types.h
+++ b/include/linux/unwind_deferred_types.h
@@ -3,6 +3,7 @@
 #define _LINUX_UNWIND_USER_DEFERRED_TYPES_H
 
 struct unwind_cache {
+	unsigned long		unwind_completed;
 	unsigned int		nr_entries;
 	unsigned long		entries[];
 };
diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
index e19f02ef416d..a3d26014a2e6 100644
--- a/kernel/unwind/deferred.c
+++ b/kernel/unwind/deferred.c
@@ -166,12 +166,18 @@ static void unwind_deferred_task_work(struct callback_head *head)
 
 	unwind_user_faultable(&trace);
 
+	if (info->cache)
+		bits &= ~(info->cache->unwind_completed);
+
 	cookie = info->id.id;
 
 	guard(mutex)(&callback_mutex);
 	list_for_each_entry(work, &callbacks, list) {
-		if (test_bit(work->bit, &bits))
+		if (test_bit(work->bit, &bits)) {
 			work->func(work, &trace, cookie);
+			if (info->cache)
+				info->cache->unwind_completed |= BIT(work->bit);
+		}
 	}
 }
 
@@ -260,23 +266,28 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
 void unwind_deferred_cancel(struct unwind_work *work)
 {
 	struct task_struct *g, *t;
+	int bit;
 
 	if (!work)
 		return;
 
+	bit = work->bit;
+
 	/* No work should be using a reserved bit */
-	if (WARN_ON_ONCE(BIT(work->bit) & RESERVED_BITS))
+	if (WARN_ON_ONCE(BIT(bit) & RESERVED_BITS))
 		return;
 
 	guard(mutex)(&callback_mutex);
 	list_del(&work->list);
 
-	__clear_bit(work->bit, &unwind_mask);
+	__clear_bit(bit, &unwind_mask);
 
 	guard(rcu)();
 	/* Clear this bit from all threads */
 	for_each_process_thread(g, t) {
-		clear_bit(work->bit, &t->unwind_info.unwind_mask);
+		clear_bit(bit, &t->unwind_info.unwind_mask);
+		if (t->unwind_info.cache)
+			clear_bit(bit, &t->unwind_info.cache->unwind_completed);
 	}
 }
 
-- 
cgit v1.2.3


From 858fa8a3b083e862114bb6483b9fb50b3e2bc4c3 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 29 Jul 2025 14:23:12 -0400
Subject: unwind: Add USED bit to only have one conditional on way back to user
 space

On the way back to user space, the function unwind_reset_info() is called
unconditionally (but always inlined). It currently has two conditionals.
One that checks the unwind_mask which is set whenever a deferred trace is
called and is used to know that the mask needs to be cleared. The other
checks if the cache has been allocated, and if so, it resets the
nr_entries so that the unwinder knows it needs to do the work to get a new
user space stack trace again (it only does it once per entering the
kernel).

Use one of the bits in the unwind mask as a "USED" bit that gets set
whenever a trace is created. This will make it possible to only check the
unwind_mask in the unwind_reset_info() to know if it needs to do work or
not and eliminates a conditional that happens every time the task goes
back to user space.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Josh Poimboeuf <jpoimboe@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Indu Bhagat <indu.bhagat@oracle.com>
Cc: "Jose E. Marchesi" <jemarch@gnu.org>
Cc: Beau Belgrave <beaub@linux.microsoft.com>
Cc: Jens Remus <jremus@linux.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Florian Weimer <fweimer@redhat.com>
Cc: Sam James <sam@gentoo.org>
Link: https://lore.kernel.org/20250729182406.155422551@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/unwind_deferred.h | 18 +++++++++---------
 kernel/unwind/deferred.c        |  5 ++++-
 2 files changed, 13 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h
index b9ec4c8515c7..2efbda01e959 100644
--- a/include/linux/unwind_deferred.h
+++ b/include/linux/unwind_deferred.h
@@ -20,10 +20,14 @@ struct unwind_work {
 
 enum {
 	UNWIND_PENDING_BIT = 0,
+	UNWIND_USED_BIT,
 };
 
 enum {
 	UNWIND_PENDING		= BIT(UNWIND_PENDING_BIT),
+
+	/* Set if the unwinding was used (directly or deferred) */
+	UNWIND_USED		= BIT(UNWIND_USED_BIT)
 };
 
 void unwind_task_init(struct task_struct *task);
@@ -49,15 +53,11 @@ static __always_inline void unwind_reset_info(void)
 				return;
 		} while (!try_cmpxchg(&info->unwind_mask, &bits, 0UL));
 		current->unwind_info.id.id = 0;
-	}
-	/*
-	 * As unwind_user_faultable() can be called directly and
-	 * depends on nr_entries being cleared on exit to user,
-	 * this needs to be a separate conditional.
-	 */
-	if (unlikely(info->cache)) {
-		info->cache->nr_entries = 0;
-		info->cache->unwind_completed = 0;
+
+		if (unlikely(info->cache)) {
+			info->cache->nr_entries = 0;
+			info->cache->unwind_completed = 0;
+		}
 	}
 }
 
diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
index a3d26014a2e6..2311b725d691 100644
--- a/kernel/unwind/deferred.c
+++ b/kernel/unwind/deferred.c
@@ -45,7 +45,7 @@ static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt)
 static DEFINE_MUTEX(callback_mutex);
 static LIST_HEAD(callbacks);
 
-#define RESERVED_BITS	(UNWIND_PENDING)
+#define RESERVED_BITS	(UNWIND_PENDING | UNWIND_USED)
 
 /* Zero'd bits are available for assigning callback users */
 static unsigned long unwind_mask = RESERVED_BITS;
@@ -140,6 +140,9 @@ int unwind_user_faultable(struct unwind_stacktrace *trace)
 
 	cache->nr_entries = trace->nr;
 
+	/* Clear nr_entries on way back to user space */
+	set_bit(UNWIND_USED_BIT, &info->unwind_mask);
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From b3b9cb11aa034cfa9eb880bb9bb3d5aaf732e479 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 29 Jul 2025 14:23:14 -0400
Subject: unwind: Finish up unwind when a task exits

On do_exit() when a task is exiting, if an unwind is requested and the
deferred user stacktrace is deferred via the task_work, the task_work
callback is called after exit_mm() is called in do_exit(). This means that
the user stack trace will not be retrieved and an empty stack is created.

Instead, add a function unwind_deferred_task_exit() and call it just
before exit_mm() so that the unwinder can call the requested callbacks
with the user space stack.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Josh Poimboeuf <jpoimboe@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Indu Bhagat <indu.bhagat@oracle.com>
Cc: "Jose E. Marchesi" <jemarch@gnu.org>
Cc: Beau Belgrave <beaub@linux.microsoft.com>
Cc: Jens Remus <jremus@linux.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Florian Weimer <fweimer@redhat.com>
Cc: Sam James <sam@gentoo.org>
Link: https://lore.kernel.org/20250729182406.504259474@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/unwind_deferred.h |  3 +++
 kernel/exit.c                   |  2 ++
 kernel/unwind/deferred.c        | 23 ++++++++++++++++++++---
 3 files changed, 25 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h
index 2efbda01e959..26122d00708a 100644
--- a/include/linux/unwind_deferred.h
+++ b/include/linux/unwind_deferred.h
@@ -39,6 +39,8 @@ int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func);
 int unwind_deferred_request(struct unwind_work *work, u64 *cookie);
 void unwind_deferred_cancel(struct unwind_work *work);
 
+void unwind_deferred_task_exit(struct task_struct *task);
+
 static __always_inline void unwind_reset_info(void)
 {
 	struct unwind_task_info *info = &current->unwind_info;
@@ -71,6 +73,7 @@ static inline int unwind_deferred_init(struct unwind_work *work, unwind_callback
 static inline int unwind_deferred_request(struct unwind_work *work, u64 *timestamp) { return -ENOSYS; }
 static inline void unwind_deferred_cancel(struct unwind_work *work) {}
 
+static inline void unwind_deferred_task_exit(struct task_struct *task) {}
 static inline void unwind_reset_info(void) {}
 
 #endif /* !CONFIG_UNWIND_USER */
diff --git a/kernel/exit.c b/kernel/exit.c
index bb184a67ac73..1d8c8ac33c4f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -68,6 +68,7 @@
 #include <linux/rethook.h>
 #include <linux/sysfs.h>
 #include <linux/user_events.h>
+#include <linux/unwind_deferred.h>
 #include <linux/uaccess.h>
 #include <linux/pidfs.h>
 
@@ -938,6 +939,7 @@ void __noreturn do_exit(long code)
 
 	tsk->exit_code = code;
 	taskstats_exit(tsk, group_dead);
+	unwind_deferred_task_exit(tsk);
 	trace_sched_process_exit(tsk, group_dead);
 
 	/*
diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
index a5ef1c1f915e..dc6040aae3ee 100644
--- a/kernel/unwind/deferred.c
+++ b/kernel/unwind/deferred.c
@@ -114,7 +114,7 @@ int unwind_user_faultable(struct unwind_stacktrace *trace)
 	/* Should always be called from faultable context */
 	might_fault();
 
-	if (current->flags & PF_EXITING)
+	if (!current->mm)
 		return -EINVAL;
 
 	if (!info->cache) {
@@ -147,9 +147,9 @@ int unwind_user_faultable(struct unwind_stacktrace *trace)
 	return 0;
 }
 
-static void unwind_deferred_task_work(struct callback_head *head)
+static void process_unwind_deferred(struct task_struct *task)
 {
-	struct unwind_task_info *info = container_of(head, struct unwind_task_info, work);
+	struct unwind_task_info *info = &task->unwind_info;
 	struct unwind_stacktrace trace;
 	struct unwind_work *work;
 	unsigned long bits;
@@ -186,6 +186,23 @@ static void unwind_deferred_task_work(struct callback_head *head)
 	}
 }
 
+static void unwind_deferred_task_work(struct callback_head *head)
+{
+	process_unwind_deferred(current);
+}
+
+void unwind_deferred_task_exit(struct task_struct *task)
+{
+	struct unwind_task_info *info = &current->unwind_info;
+
+	if (!unwind_pending(info))
+		return;
+
+	process_unwind_deferred(task);
+
+	task_work_cancel(task, &info->work);
+}
+
 /**
  * unwind_deferred_request - Request a user stacktrace on task kernel exit
  * @work: Unwind descriptor requesting the trace
-- 
cgit v1.2.3


From b0c85e99458af829c32c225b43f638443bff14e5 Mon Sep 17 00:00:00 2001
From: Shaopeng Tan <tan.shaopeng@jp.fujitsu.com>
Date: Mon, 23 Jun 2025 16:46:45 +0900
Subject: cpumask: Remove unnecessary cpumask_nth_andnot()

Commit 94f753143028 ("x86/resctrl: Optimize cpumask_any_housekeeping()")
switched the only user of cpumask_nth_andnot() to other cpumask
functions, but left the function cpumask_nth_andnot() unused.
This makes function find_nth_andnot_bit() unused as well. Delete them.

Signed-off-by: Shaopeng Tan <tan.shaopeng@jp.fujitsu.com>
Signed-off-by: Yury Norov [NVIDIA] <yury.norov@gmail.com>
---
 include/linux/cpumask.h | 16 ----------------
 include/linux/find.h    | 27 ---------------------------
 2 files changed, 43 deletions(-)

(limited to 'include')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 39b71b662da3..4bda089fbe5d 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -558,22 +558,6 @@ unsigned int cpumask_nth_and(unsigned int cpu, const struct cpumask *srcp1,
 				small_cpumask_bits, cpumask_check(cpu));
 }
 
-/**
- * cpumask_nth_andnot - get the Nth cpu set in 1st cpumask, and clear in 2nd.
- * @srcp1: the cpumask pointer
- * @srcp2: the cpumask pointer
- * @cpu: the Nth cpu to find, starting from 0
- *
- * Return: >= nr_cpu_ids if such cpu doesn't exist.
- */
-static __always_inline
-unsigned int cpumask_nth_andnot(unsigned int cpu, const struct cpumask *srcp1,
-							const struct cpumask *srcp2)
-{
-	return find_nth_andnot_bit(cpumask_bits(srcp1), cpumask_bits(srcp2),
-				small_cpumask_bits, cpumask_check(cpu));
-}
-
 /**
  * cpumask_nth_and_andnot - get the Nth cpu set in 1st and 2nd cpumask, and clear in 3rd.
  * @srcp1: the cpumask pointer
diff --git a/include/linux/find.h b/include/linux/find.h
index 98c61838002c..9d720ad92bc1 100644
--- a/include/linux/find.h
+++ b/include/linux/find.h
@@ -269,33 +269,6 @@ unsigned long find_nth_and_bit(const unsigned long *addr1, const unsigned long *
 	return __find_nth_and_bit(addr1, addr2, size, n);
 }
 
-/**
- * find_nth_andnot_bit - find N'th set bit in 2 memory regions,
- *			 flipping bits in 2nd region
- * @addr1: The 1st address to start the search at
- * @addr2: The 2nd address to start the search at
- * @size: The maximum number of bits to search
- * @n: The number of set bit, which position is needed, counting from 0
- *
- * Returns the bit number of the N'th set bit.
- * If no such, returns @size.
- */
-static __always_inline
-unsigned long find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
-				unsigned long size, unsigned long n)
-{
-	if (n >= size)
-		return size;
-
-	if (small_const_nbits(size)) {
-		unsigned long val =  *addr1 & (~*addr2) & GENMASK(size - 1, 0);
-
-		return val ? fns(val, n) : size;
-	}
-
-	return __find_nth_andnot_bit(addr1, addr2, size, n);
-}
-
 /**
  * find_nth_and_andnot_bit - find N'th set bit in 2 memory regions,
  *			     excluding those set in 3rd region
-- 
cgit v1.2.3


From 6d4471252ccc1722d25200fa9b6021ab4e1d6fde Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Date: Mon, 9 Jun 2025 11:45:45 +0900
Subject: bits: split the definition of the asm and non-asm GENMASK*()

In an upcoming change, the non-asm GENMASK*() will all be unified to
depend on GENMASK_TYPE() which indirectly depend on sizeof(), something
not available in asm.

Instead of adding further complexity to GENMASK_TYPE() to make it work
for both asm and non asm, just split the definition of the two variants.

Signed-off-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Reviewed-by: Lucas De Marchi <lucas.demarchi@intel.com>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
---
 include/linux/bits.h | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/bits.h b/include/linux/bits.h
index 7ad056219115..13dbc8adc70e 100644
--- a/include/linux/bits.h
+++ b/include/linux/bits.h
@@ -35,6 +35,11 @@
 
 #define GENMASK_INPUT_CHECK(h, l) BUILD_BUG_ON_ZERO(const_true((l) > (h)))
 
+#define GENMASK(h, l) \
+	(GENMASK_INPUT_CHECK(h, l) + __GENMASK(h, l))
+#define GENMASK_ULL(h, l) \
+	(GENMASK_INPUT_CHECK(h, l) + __GENMASK_ULL(h, l))
+
 /*
  * Generate a mask for the specified type @t. Additional checks are made to
  * guarantee the value returned fits in that type, relying on
@@ -79,15 +84,11 @@
  * BUILD_BUG_ON_ZERO is not available in h files included from asm files,
  * disable the input check if that is the case.
  */
-#define GENMASK_INPUT_CHECK(h, l) 0
+#define GENMASK(h, l)		__GENMASK(h, l)
+#define GENMASK_ULL(h, l)	__GENMASK_ULL(h, l)
 
 #endif /* !defined(__ASSEMBLY__) */
 
-#define GENMASK(h, l) \
-	(GENMASK_INPUT_CHECK(h, l) + __GENMASK(h, l))
-#define GENMASK_ULL(h, l) \
-	(GENMASK_INPUT_CHECK(h, l) + __GENMASK_ULL(h, l))
-
 #if !defined(__ASSEMBLY__)
 /*
  * Missing asm support
-- 
cgit v1.2.3


From 104ea1c84b91c9f452e497ba51602b903711cdd5 Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Date: Mon, 9 Jun 2025 11:45:46 +0900
Subject: bits: unify the non-asm GENMASK*()

The newly introduced GENMASK_TYPE() macro can also be used to generate
the pre-existing non-asm GENMASK*() variants.

Apply GENMASK_TYPE() to GENMASK(), GENMASK_ULL() and GENMASK_U128().

Signed-off-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
---
 include/linux/bits.h | 26 ++++----------------------
 1 file changed, 4 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/linux/bits.h b/include/linux/bits.h
index 13dbc8adc70e..a40cc861b3a7 100644
--- a/include/linux/bits.h
+++ b/include/linux/bits.h
@@ -2,10 +2,8 @@
 #ifndef __LINUX_BITS_H
 #define __LINUX_BITS_H
 
-#include <linux/const.h>
 #include <vdso/bits.h>
 #include <uapi/linux/bits.h>
-#include <asm/bitsperlong.h>
 
 #define BIT_MASK(nr)		(UL(1) << ((nr) % BITS_PER_LONG))
 #define BIT_WORD(nr)		((nr) / BITS_PER_LONG)
@@ -35,11 +33,6 @@
 
 #define GENMASK_INPUT_CHECK(h, l) BUILD_BUG_ON_ZERO(const_true((l) > (h)))
 
-#define GENMASK(h, l) \
-	(GENMASK_INPUT_CHECK(h, l) + __GENMASK(h, l))
-#define GENMASK_ULL(h, l) \
-	(GENMASK_INPUT_CHECK(h, l) + __GENMASK_ULL(h, l))
-
 /*
  * Generate a mask for the specified type @t. Additional checks are made to
  * guarantee the value returned fits in that type, relying on
@@ -55,10 +48,14 @@
 	     (type_max(t) << (l) &				\
 	      type_max(t) >> (BITS_PER_TYPE(t) - 1 - (h)))))
 
+#define GENMASK(h, l)		GENMASK_TYPE(unsigned long, h, l)
+#define GENMASK_ULL(h, l)	GENMASK_TYPE(unsigned long long, h, l)
+
 #define GENMASK_U8(h, l)	GENMASK_TYPE(u8, h, l)
 #define GENMASK_U16(h, l)	GENMASK_TYPE(u16, h, l)
 #define GENMASK_U32(h, l)	GENMASK_TYPE(u32, h, l)
 #define GENMASK_U64(h, l)	GENMASK_TYPE(u64, h, l)
+#define GENMASK_U128(h, l)	GENMASK_TYPE(u128, h, l)
 
 /*
  * Fixed-type variants of BIT(), with additional checks like GENMASK_TYPE(). The
@@ -89,19 +86,4 @@
 
 #endif /* !defined(__ASSEMBLY__) */
 
-#if !defined(__ASSEMBLY__)
-/*
- * Missing asm support
- *
- * __GENMASK_U128() depends on _BIT128() which would not work
- * in the asm code, as it shifts an 'unsigned __int128' data
- * type instead of direct representation of 128 bit constants
- * such as long and unsigned long. The fundamental problem is
- * that a 128 bit constant will get silently truncated by the
- * gcc compiler.
- */
-#define GENMASK_U128(h, l) \
-	(GENMASK_INPUT_CHECK(h, l) + __GENMASK_U128(h, l))
-#endif
-
 #endif	/* __LINUX_BITS_H */
-- 
cgit v1.2.3


From e2b02d382ae0cb90697e8529dfd3f93bf8c6905c Mon Sep 17 00:00:00 2001
From: Ben Horgan <ben.horgan@arm.com>
Date: Wed, 9 Jul 2025 10:38:08 +0100
Subject: bitfield: Ensure the return values of helper functions are checked

As type##_replace_bits() has no side effects it is only useful if its
return value is checked. Add __must_check to enforce this usage. To have
the bits replaced in-place, type##p_replace_bits() can be used instead.

Although type##_get_bits() and type##_encode_bits() are harder to misuse,
they are still only useful if the return value is checked. For
consistency, also add __must_check to these.

Signed-off-by: Ben Horgan <ben.horgan@arm.com>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
---
 include/linux/bitfield.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h
index 6d9a53db54b6..5355f8f806a9 100644
--- a/include/linux/bitfield.h
+++ b/include/linux/bitfield.h
@@ -189,14 +189,14 @@ static __always_inline u64 field_mask(u64 field)
 }
 #define field_max(field)	((typeof(field))field_mask(field))
 #define ____MAKE_OP(type,base,to,from)					\
-static __always_inline __##type type##_encode_bits(base v, base field)	\
+static __always_inline __##type __must_check type##_encode_bits(base v, base field)	\
 {									\
 	if (__builtin_constant_p(v) && (v & ~field_mask(field)))	\
 		__field_overflow();					\
 	return to((v & field_mask(field)) * field_multiplier(field));	\
 }									\
-static __always_inline __##type type##_replace_bits(__##type old,	\
-					base val, base field)		\
+static __always_inline __##type __must_check type##_replace_bits(__##type old,	\
+							base val, base field)	\
 {									\
 	return (old & ~to(field)) | type##_encode_bits(val, field);	\
 }									\
@@ -205,7 +205,7 @@ static __always_inline void type##p_replace_bits(__##type *p,		\
 {									\
 	*p = (*p & ~to(field)) | type##_encode_bits(val, field);	\
 }									\
-static __always_inline base type##_get_bits(__##type v, base field)	\
+static __always_inline base __must_check type##_get_bits(__##type v, base field)	\
 {									\
 	return (from(v) & field)/field_multiplier(field);		\
 }
-- 
cgit v1.2.3


From 12df58ad294253ac1d8df0c9bb9cf726397a671d Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 31 Jul 2025 01:47:30 +0200
Subject: bpf: Add cookie object to bpf maps

Add a cookie to BPF maps to uniquely identify BPF maps for the timespan
when the node is up. This is different to comparing a pointer or BPF map
id which could get rolled over and reused.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/r/20250730234733.530041-1-daniel@iogearbox.net
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  | 1 +
 kernel/bpf/syscall.c | 6 ++++++
 2 files changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f9cd2164ed23..308530c8326b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -310,6 +310,7 @@ struct bpf_map {
 	bool free_after_rcu_gp;
 	atomic64_t sleepable_refcnt;
 	s64 __percpu *elem_count;
+	u64 cookie; /* write-once */
 };
 
 static inline const char *btf_field_type_name(enum btf_field_type type)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e63039817af3..7a814e98d5f5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -37,6 +37,7 @@
 #include <linux/trace_events.h>
 #include <linux/tracepoint.h>
 #include <linux/overflow.h>
+#include <linux/cookie.h>
 
 #include <net/netfilter/nf_bpf_link.h>
 #include <net/netkit.h>
@@ -53,6 +54,7 @@
 #define BPF_OBJ_FLAG_MASK   (BPF_F_RDONLY | BPF_F_WRONLY)
 
 DEFINE_PER_CPU(int, bpf_prog_active);
+DEFINE_COOKIE(bpf_map_cookie);
 static DEFINE_IDR(prog_idr);
 static DEFINE_SPINLOCK(prog_idr_lock);
 static DEFINE_IDR(map_idr);
@@ -1487,6 +1489,10 @@ static int map_create(union bpf_attr *attr, bool kernel)
 	if (err < 0)
 		goto free_map;
 
+	preempt_disable();
+	map->cookie = gen_cookie_next(&bpf_map_cookie);
+	preempt_enable();
+
 	atomic64_set(&map->refcnt, 1);
 	atomic64_set(&map->usercnt, 1);
 	mutex_init(&map->freeze_mutex);
-- 
cgit v1.2.3


From fd1c98f0ef5cbcec842209776505d9e70d8fcd53 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 31 Jul 2025 01:47:31 +0200
Subject: bpf: Move bpf map owner out of common struct

Given this is only relevant for BPF tail call maps, it is adding up space
and penalizing other map types. We also need to extend this with further
objects to track / compare to. Therefore, let's move this out into a separate
structure and dynamically allocate it only for BPF tail call maps.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/r/20250730234733.530041-2-daniel@iogearbox.net
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  | 36 ++++++++++++++++++++++++------------
 kernel/bpf/core.c    | 35 ++++++++++++++++++-----------------
 kernel/bpf/syscall.c | 13 +++++++------
 3 files changed, 49 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 308530c8326b..a87646cc5398 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -260,6 +260,18 @@ struct bpf_list_node_kern {
 	void *owner;
 } __attribute__((aligned(8)));
 
+/* 'Ownership' of program-containing map is claimed by the first program
+ * that is going to use this map or by the first program which FD is
+ * stored in the map to make sure that all callers and callees have the
+ * same prog type, JITed flag and xdp_has_frags flag.
+ */
+struct bpf_map_owner {
+	enum bpf_prog_type type;
+	bool jited;
+	bool xdp_has_frags;
+	const struct btf_type *attach_func_proto;
+};
+
 struct bpf_map {
 	const struct bpf_map_ops *ops;
 	struct bpf_map *inner_map_meta;
@@ -292,18 +304,8 @@ struct bpf_map {
 		struct rcu_head rcu;
 	};
 	atomic64_t writecnt;
-	/* 'Ownership' of program-containing map is claimed by the first program
-	 * that is going to use this map or by the first program which FD is
-	 * stored in the map to make sure that all callers and callees have the
-	 * same prog type, JITed flag and xdp_has_frags flag.
-	 */
-	struct {
-		const struct btf_type *attach_func_proto;
-		spinlock_t lock;
-		enum bpf_prog_type type;
-		bool jited;
-		bool xdp_has_frags;
-	} owner;
+	spinlock_t owner_lock;
+	struct bpf_map_owner *owner;
 	bool bypass_spec_v1;
 	bool frozen; /* write-once; write-protected by freeze_mutex */
 	bool free_after_mult_rcu_gp;
@@ -2109,6 +2111,16 @@ static inline bool bpf_map_flags_access_ok(u32 access_flags)
 	       (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG);
 }
 
+static inline struct bpf_map_owner *bpf_map_owner_alloc(struct bpf_map *map)
+{
+	return kzalloc(sizeof(*map->owner), GFP_ATOMIC);
+}
+
+static inline void bpf_map_owner_free(struct bpf_map *map)
+{
+	kfree(map->owner);
+}
+
 struct bpf_event_entry {
 	struct perf_event *event;
 	struct file *perf_file;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 09dde5b00d0c..6e5b3a67e87f 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2377,28 +2377,29 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map,
 				      const struct bpf_prog *fp)
 {
 	enum bpf_prog_type prog_type = resolve_prog_type(fp);
-	bool ret;
 	struct bpf_prog_aux *aux = fp->aux;
+	bool ret = false;
 
 	if (fp->kprobe_override)
-		return false;
+		return ret;
 
-	spin_lock(&map->owner.lock);
-	if (!map->owner.type) {
-		/* There's no owner yet where we could check for
-		 * compatibility.
-		 */
-		map->owner.type  = prog_type;
-		map->owner.jited = fp->jited;
-		map->owner.xdp_has_frags = aux->xdp_has_frags;
-		map->owner.attach_func_proto = aux->attach_func_proto;
+	spin_lock(&map->owner_lock);
+	/* There's no owner yet where we could check for compatibility. */
+	if (!map->owner) {
+		map->owner = bpf_map_owner_alloc(map);
+		if (!map->owner)
+			goto err;
+		map->owner->type  = prog_type;
+		map->owner->jited = fp->jited;
+		map->owner->xdp_has_frags = aux->xdp_has_frags;
+		map->owner->attach_func_proto = aux->attach_func_proto;
 		ret = true;
 	} else {
-		ret = map->owner.type  == prog_type &&
-		      map->owner.jited == fp->jited &&
-		      map->owner.xdp_has_frags == aux->xdp_has_frags;
+		ret = map->owner->type  == prog_type &&
+		      map->owner->jited == fp->jited &&
+		      map->owner->xdp_has_frags == aux->xdp_has_frags;
 		if (ret &&
-		    map->owner.attach_func_proto != aux->attach_func_proto) {
+		    map->owner->attach_func_proto != aux->attach_func_proto) {
 			switch (prog_type) {
 			case BPF_PROG_TYPE_TRACING:
 			case BPF_PROG_TYPE_LSM:
@@ -2411,8 +2412,8 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map,
 			}
 		}
 	}
-	spin_unlock(&map->owner.lock);
-
+err:
+	spin_unlock(&map->owner_lock);
 	return ret;
 }
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 7a814e98d5f5..0fbfa8532c39 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -887,6 +887,7 @@ static void bpf_map_free_deferred(struct work_struct *work)
 
 	security_bpf_map_free(map);
 	bpf_map_release_memcg(map);
+	bpf_map_owner_free(map);
 	bpf_map_free(map);
 }
 
@@ -981,12 +982,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
 	struct bpf_map *map = filp->private_data;
 	u32 type = 0, jited = 0;
 
-	if (map_type_contains_progs(map)) {
-		spin_lock(&map->owner.lock);
-		type  = map->owner.type;
-		jited = map->owner.jited;
-		spin_unlock(&map->owner.lock);
+	spin_lock(&map->owner_lock);
+	if (map->owner) {
+		type  = map->owner->type;
+		jited = map->owner->jited;
 	}
+	spin_unlock(&map->owner_lock);
 
 	seq_printf(m,
 		   "map_type:\t%u\n"
@@ -1496,7 +1497,7 @@ static int map_create(union bpf_attr *attr, bool kernel)
 	atomic64_set(&map->refcnt, 1);
 	atomic64_set(&map->usercnt, 1);
 	mutex_init(&map->freeze_mutex);
-	spin_lock_init(&map->owner.lock);
+	spin_lock_init(&map->owner_lock);
 
 	if (attr->btf_key_type_id || attr->btf_value_type_id ||
 	    /* Even the map's value is a kernel's struct,
-- 
cgit v1.2.3


From 9621e60f59eae87eb9ffe88d90f24f391a1ef0f0 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 31 Jul 2025 01:47:32 +0200
Subject: bpf: Move cgroup iterator helpers to bpf.h

Move them into bpf.h given we also need them in core code.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/r/20250730234733.530041-3-daniel@iogearbox.net
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf-cgroup.h |  5 -----
 include/linux/bpf.h        | 22 ++++++++++++++--------
 2 files changed, 14 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 082ccd8ad96b..aedf573bdb42 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -77,9 +77,6 @@ to_cgroup_bpf_attach_type(enum bpf_attach_type attach_type)
 extern struct static_key_false cgroup_bpf_enabled_key[MAX_CGROUP_BPF_ATTACH_TYPE];
 #define cgroup_bpf_enabled(atype) static_branch_unlikely(&cgroup_bpf_enabled_key[atype])
 
-#define for_each_cgroup_storage_type(stype) \
-	for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++)
-
 struct bpf_cgroup_storage_map;
 
 struct bpf_storage_buffer {
@@ -510,8 +507,6 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
 #define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \
 				       kernel_optval) ({ 0; })
 
-#define for_each_cgroup_storage_type(stype) for (; false; )
-
 #endif /* CONFIG_CGROUP_BPF */
 
 #endif /* _BPF_CGROUP_H */
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a87646cc5398..02aa41e301a5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -208,6 +208,20 @@ enum btf_field_type {
 	BPF_RES_SPIN_LOCK = (1 << 12),
 };
 
+enum bpf_cgroup_storage_type {
+	BPF_CGROUP_STORAGE_SHARED,
+	BPF_CGROUP_STORAGE_PERCPU,
+	__BPF_CGROUP_STORAGE_MAX
+#define MAX_BPF_CGROUP_STORAGE_TYPE __BPF_CGROUP_STORAGE_MAX
+};
+
+#ifdef CONFIG_CGROUP_BPF
+# define for_each_cgroup_storage_type(stype) \
+	for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++)
+#else
+# define for_each_cgroup_storage_type(stype) for (; false; )
+#endif /* CONFIG_CGROUP_BPF */
+
 typedef void (*btf_dtor_kfunc_t)(void *);
 
 struct btf_field_kptr {
@@ -1085,14 +1099,6 @@ struct bpf_prog_offload {
 	u32			jited_len;
 };
 
-enum bpf_cgroup_storage_type {
-	BPF_CGROUP_STORAGE_SHARED,
-	BPF_CGROUP_STORAGE_PERCPU,
-	__BPF_CGROUP_STORAGE_MAX
-};
-
-#define MAX_BPF_CGROUP_STORAGE_TYPE __BPF_CGROUP_STORAGE_MAX
-
 /* The longest tracepoint has 12 args.
  * See include/trace/bpf_probe.h
  */
-- 
cgit v1.2.3


From abad3d0bad72a52137e0c350c59542d75ae4f513 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 31 Jul 2025 01:47:33 +0200
Subject: bpf: Fix oob access in cgroup local storage

Lonial reported that an out-of-bounds access in cgroup local storage
can be crafted via tail calls. Given two programs each utilizing a
cgroup local storage with a different value size, and one program
doing a tail call into the other, the verifier will validate each of
the individual programs just fine. However, in the runtime context
the bpf_cg_run_ctx holds a bpf_prog_array_item which contains the
BPF program as well as any cgroup local storage flavor the program
uses. Helpers such as bpf_get_local_storage() pick this up from the
runtime context:

  ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
  storage = ctx->prog_item->cgroup_storage[stype];

  if (stype == BPF_CGROUP_STORAGE_SHARED)
    ptr = &READ_ONCE(storage->buf)->data[0];
  else
    ptr = this_cpu_ptr(storage->percpu_buf);

For the second program which was called from the originally attached
one, this means bpf_get_local_storage() will pick up the former
program's map, not its own. With mismatching sizes, this can result
in an unintended out-of-bounds access.

To fix this issue, we need to extend bpf_map_owner with an array of
storage_cookie[] to match on i) the exact maps from the original
program if the second program was using bpf_get_local_storage(), or
ii) allow the tail call combination if the second program was not
using any of the cgroup local storage maps.

Fixes: 7d9c3427894f ("bpf: Make cgroup storages shared between programs on the same cgroup")
Reported-by: Lonial Con <kongln9170@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/r/20250730234733.530041-4-daniel@iogearbox.net
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h |  1 +
 kernel/bpf/core.c   | 15 +++++++++++++++
 2 files changed, 16 insertions(+)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 02aa41e301a5..cc700925b802 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -283,6 +283,7 @@ struct bpf_map_owner {
 	enum bpf_prog_type type;
 	bool jited;
 	bool xdp_has_frags;
+	u64 storage_cookie[MAX_BPF_CGROUP_STORAGE_TYPE];
 	const struct btf_type *attach_func_proto;
 };
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 6e5b3a67e87f..5d1650af899d 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2378,7 +2378,9 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map,
 {
 	enum bpf_prog_type prog_type = resolve_prog_type(fp);
 	struct bpf_prog_aux *aux = fp->aux;
+	enum bpf_cgroup_storage_type i;
 	bool ret = false;
+	u64 cookie;
 
 	if (fp->kprobe_override)
 		return ret;
@@ -2393,11 +2395,24 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map,
 		map->owner->jited = fp->jited;
 		map->owner->xdp_has_frags = aux->xdp_has_frags;
 		map->owner->attach_func_proto = aux->attach_func_proto;
+		for_each_cgroup_storage_type(i) {
+			map->owner->storage_cookie[i] =
+				aux->cgroup_storage[i] ?
+				aux->cgroup_storage[i]->cookie : 0;
+		}
 		ret = true;
 	} else {
 		ret = map->owner->type  == prog_type &&
 		      map->owner->jited == fp->jited &&
 		      map->owner->xdp_has_frags == aux->xdp_has_frags;
+		for_each_cgroup_storage_type(i) {
+			if (!ret)
+				break;
+			cookie = aux->cgroup_storage[i] ?
+				 aux->cgroup_storage[i]->cookie : 0;
+			ret = map->owner->storage_cookie[i] == cookie ||
+			      !cookie;
+		}
 		if (ret &&
 		    map->owner->attach_func_proto != aux->attach_func_proto) {
 			switch (prog_type) {
-- 
cgit v1.2.3


From e2ba58ccc9099514380c3300cbc0750b5055fc1c Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Wed, 30 Jul 2025 21:49:53 -0700
Subject: block: Fix default IO priority if there is no IO context

Upstream commit 53889bcaf536 ("block: make __get_task_ioprio() easier to
read") changes the IO priority returned to the caller if no IO context
is defined for the task. Prior to this commit, the returned IO priority
was determined by task_nice_ioclass() and task_nice_ioprio(). Now it is
always IOPRIO_DEFAULT, which translates to IOPRIO_CLASS_NONE with priority
0. However, task_nice_ioclass() returns IOPRIO_CLASS_IDLE, IOPRIO_CLASS_RT,
or IOPRIO_CLASS_BE depending on the task scheduling policy, and
task_nice_ioprio() returns a value determined by task_nice(). This causes
regressions in test code checking the IO priority and class of IO
operations on tasks with no IO context.

Fix the problem by returning the IO priority calculated from
task_nice_ioclass() and task_nice_ioprio() if no IO context is defined
to match earlier behavior.

Fixes: 53889bcaf536 ("block: make __get_task_ioprio() easier to read")
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Reviewed-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Link: https://lore.kernel.org/r/20250731044953.1852690-1-linux@roeck-us.net
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/ioprio.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
index b25377b6ea98..5210e8371238 100644
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -60,7 +60,8 @@ static inline int __get_task_ioprio(struct task_struct *p)
 	int prio;
 
 	if (!ioc)
-		return IOPRIO_DEFAULT;
+		return IOPRIO_PRIO_VALUE(task_nice_ioclass(p),
+					 task_nice_ioprio(p));
 
 	if (p != current)
 		lockdep_assert_held(&p->alloc_lock);
-- 
cgit v1.2.3


From 5ccaeedb489b41ce6cb857d0de488992746be282 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Fri, 1 Aug 2025 00:10:06 +0000
Subject: cfi: add C CFI type macro

Currently x86 and riscv open-code 4 instances of the same logic to
define a u32 variable with the KCFI typeid of a given function.

Replace the duplicate logic with a common macro.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Co-developed-by: Maxwell Bland <mbland@motorola.com>
Signed-off-by: Maxwell Bland <mbland@motorola.com>
Co-developed-by: Sami Tolvanen <samitolvanen@google.com>
Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Tested-by: Dao Huang <huangdao1@oppo.com>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20250801001004.1859976-6-samitolvanen@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/riscv/kernel/cfi.c       | 35 +++--------------------------------
 arch/x86/kernel/alternative.c | 31 +++----------------------------
 include/linux/cfi_types.h     | 23 +++++++++++++++++++++++
 3 files changed, 29 insertions(+), 60 deletions(-)

(limited to 'include')

diff --git a/arch/riscv/kernel/cfi.c b/arch/riscv/kernel/cfi.c
index 64bdd3e1ab8c..e7aec5f36dd5 100644
--- a/arch/riscv/kernel/cfi.c
+++ b/arch/riscv/kernel/cfi.c
@@ -4,6 +4,7 @@
  *
  * Copyright (C) 2023 Google LLC
  */
+#include <linux/cfi_types.h>
 #include <linux/cfi.h>
 #include <asm/insn.h>
 
@@ -82,41 +83,11 @@ struct bpf_insn;
 /* Must match bpf_func_t / DEFINE_BPF_PROG_RUN() */
 extern unsigned int __bpf_prog_runX(const void *ctx,
 				    const struct bpf_insn *insn);
-
-/*
- * Force a reference to the external symbol so the compiler generates
- * __kcfi_typid.
- */
-__ADDRESSABLE(__bpf_prog_runX);
-
-/* u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid___bpf_prog_runX; */
-asm (
-"	.pushsection	.data..ro_after_init,\"aw\",@progbits	\n"
-"	.type	cfi_bpf_hash,@object				\n"
-"	.globl	cfi_bpf_hash					\n"
-"	.p2align	2, 0x0					\n"
-"cfi_bpf_hash:							\n"
-"	.word	__kcfi_typeid___bpf_prog_runX			\n"
-"	.size	cfi_bpf_hash, 4					\n"
-"	.popsection						\n"
-);
+DEFINE_CFI_TYPE(cfi_bpf_hash, __bpf_prog_runX);
 
 /* Must match bpf_callback_t */
 extern u64 __bpf_callback_fn(u64, u64, u64, u64, u64);
-
-__ADDRESSABLE(__bpf_callback_fn);
-
-/* u32 __ro_after_init cfi_bpf_subprog_hash = __kcfi_typeid___bpf_callback_fn; */
-asm (
-"	.pushsection	.data..ro_after_init,\"aw\",@progbits	\n"
-"	.type	cfi_bpf_subprog_hash,@object			\n"
-"	.globl	cfi_bpf_subprog_hash				\n"
-"	.p2align	2, 0x0					\n"
-"cfi_bpf_subprog_hash:						\n"
-"	.word	__kcfi_typeid___bpf_callback_fn			\n"
-"	.size	cfi_bpf_subprog_hash, 4				\n"
-"	.popsection						\n"
-);
+DEFINE_CFI_TYPE(cfi_bpf_subprog_hash, __bpf_callback_fn);
 
 u32 cfi_get_func_hash(void *func)
 {
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index ea1d984166cd..a555665b4d9c 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -2,6 +2,7 @@
 #define pr_fmt(fmt) "SMP alternatives: " fmt
 
 #include <linux/mmu_context.h>
+#include <linux/cfi_types.h>
 #include <linux/perf_event.h>
 #include <linux/vmalloc.h>
 #include <linux/memory.h>
@@ -1189,37 +1190,11 @@ struct bpf_insn;
 /* Must match bpf_func_t / DEFINE_BPF_PROG_RUN() */
 extern unsigned int __bpf_prog_runX(const void *ctx,
 				    const struct bpf_insn *insn);
-
-KCFI_REFERENCE(__bpf_prog_runX);
-
-/* u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid___bpf_prog_runX; */
-asm (
-"	.pushsection	.data..ro_after_init,\"aw\",@progbits	\n"
-"	.type	cfi_bpf_hash,@object				\n"
-"	.globl	cfi_bpf_hash					\n"
-"	.p2align	2, 0x0					\n"
-"cfi_bpf_hash:							\n"
-"	.long	__kcfi_typeid___bpf_prog_runX			\n"
-"	.size	cfi_bpf_hash, 4					\n"
-"	.popsection						\n"
-);
+DEFINE_CFI_TYPE(cfi_bpf_hash, __bpf_prog_runX);
 
 /* Must match bpf_callback_t */
 extern u64 __bpf_callback_fn(u64, u64, u64, u64, u64);
-
-KCFI_REFERENCE(__bpf_callback_fn);
-
-/* u32 __ro_after_init cfi_bpf_subprog_hash = __kcfi_typeid___bpf_callback_fn; */
-asm (
-"	.pushsection	.data..ro_after_init,\"aw\",@progbits	\n"
-"	.type	cfi_bpf_subprog_hash,@object			\n"
-"	.globl	cfi_bpf_subprog_hash				\n"
-"	.p2align	2, 0x0					\n"
-"cfi_bpf_subprog_hash:						\n"
-"	.long	__kcfi_typeid___bpf_callback_fn			\n"
-"	.size	cfi_bpf_subprog_hash, 4				\n"
-"	.popsection						\n"
-);
+DEFINE_CFI_TYPE(cfi_bpf_subprog_hash, __bpf_callback_fn);
 
 u32 cfi_get_func_hash(void *func)
 {
diff --git a/include/linux/cfi_types.h b/include/linux/cfi_types.h
index 6b8713675765..685f7181780f 100644
--- a/include/linux/cfi_types.h
+++ b/include/linux/cfi_types.h
@@ -41,5 +41,28 @@
 	SYM_TYPED_START(name, SYM_L_GLOBAL, SYM_A_ALIGN)
 #endif
 
+#else /* __ASSEMBLY__ */
+
+#ifdef CONFIG_CFI_CLANG
+#define DEFINE_CFI_TYPE(name, func)						\
+	/*									\
+	 * Force a reference to the function so the compiler generates		\
+	 * __kcfi_typeid_<func>.						\
+	 */									\
+	__ADDRESSABLE(func);							\
+	/* u32 name __ro_after_init = __kcfi_typeid_<func> */			\
+	extern u32 name;							\
+	asm (									\
+	"	.pushsection	.data..ro_after_init,\"aw\",\%progbits	\n"	\
+	"	.type	" #name ",\%object				\n"	\
+	"	.globl	" #name "					\n"	\
+	"	.p2align	2, 0x0					\n"	\
+	#name ":							\n"	\
+	"	.4byte	__kcfi_typeid_" #func "				\n"	\
+	"	.size	" #name ", 4					\n"	\
+	"	.popsection						\n"	\
+	);
+#endif
+
 #endif /* __ASSEMBLY__ */
 #endif /* _LINUX_CFI_TYPES_H */
-- 
cgit v1.2.3


From f1befc82addda926c8301436123d041bf3249505 Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Fri, 1 Aug 2025 00:10:07 +0000
Subject: cfi: Move BPF CFI types and helpers to generic code

Instead of duplicating the same code for each architecture, move
the CFI type hash variables for BPF function types and related
helper functions to generic CFI code, and allow architectures to
override the function definitions if needed.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Link: https://lore.kernel.org/r/20250801001004.1859976-7-samitolvanen@google.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/riscv/include/asm/cfi.h  | 16 ---------------
 arch/riscv/kernel/cfi.c       | 24 ----------------------
 arch/x86/include/asm/cfi.h    | 10 ++-------
 arch/x86/kernel/alternative.c | 12 -----------
 include/linux/cfi.h           | 47 +++++++++++++++++++++++++++++++++++--------
 kernel/cfi.c                  | 15 ++++++++++++++
 6 files changed, 56 insertions(+), 68 deletions(-)

(limited to 'include')

diff --git a/arch/riscv/include/asm/cfi.h b/arch/riscv/include/asm/cfi.h
index fb9696d7a3f2..4508aaa7a2fd 100644
--- a/arch/riscv/include/asm/cfi.h
+++ b/arch/riscv/include/asm/cfi.h
@@ -14,27 +14,11 @@ struct pt_regs;
 #ifdef CONFIG_CFI_CLANG
 enum bug_trap_type handle_cfi_failure(struct pt_regs *regs);
 #define __bpfcall
-static inline int cfi_get_offset(void)
-{
-	return 4;
-}
-
-#define cfi_get_offset cfi_get_offset
-extern u32 cfi_bpf_hash;
-extern u32 cfi_bpf_subprog_hash;
-extern u32 cfi_get_func_hash(void *func);
 #else
 static inline enum bug_trap_type handle_cfi_failure(struct pt_regs *regs)
 {
 	return BUG_TRAP_TYPE_NONE;
 }
-
-#define cfi_bpf_hash 0U
-#define cfi_bpf_subprog_hash 0U
-static inline u32 cfi_get_func_hash(void *func)
-{
-	return 0;
-}
 #endif /* CONFIG_CFI_CLANG */
 
 #endif /* _ASM_RISCV_CFI_H */
diff --git a/arch/riscv/kernel/cfi.c b/arch/riscv/kernel/cfi.c
index e7aec5f36dd5..6ec9dbd7292e 100644
--- a/arch/riscv/kernel/cfi.c
+++ b/arch/riscv/kernel/cfi.c
@@ -4,7 +4,6 @@
  *
  * Copyright (C) 2023 Google LLC
  */
-#include <linux/cfi_types.h>
 #include <linux/cfi.h>
 #include <asm/insn.h>
 
@@ -76,26 +75,3 @@ enum bug_trap_type handle_cfi_failure(struct pt_regs *regs)
 
 	return report_cfi_failure(regs, regs->epc, &target, type);
 }
-
-#ifdef CONFIG_CFI_CLANG
-struct bpf_insn;
-
-/* Must match bpf_func_t / DEFINE_BPF_PROG_RUN() */
-extern unsigned int __bpf_prog_runX(const void *ctx,
-				    const struct bpf_insn *insn);
-DEFINE_CFI_TYPE(cfi_bpf_hash, __bpf_prog_runX);
-
-/* Must match bpf_callback_t */
-extern u64 __bpf_callback_fn(u64, u64, u64, u64, u64);
-DEFINE_CFI_TYPE(cfi_bpf_subprog_hash, __bpf_callback_fn);
-
-u32 cfi_get_func_hash(void *func)
-{
-	u32 hash;
-
-	if (get_kernel_nofault(hash, func - cfi_get_offset()))
-		return 0;
-
-	return hash;
-}
-#endif
diff --git a/arch/x86/include/asm/cfi.h b/arch/x86/include/asm/cfi.h
index 3e51ba459154..1751f1eb95ef 100644
--- a/arch/x86/include/asm/cfi.h
+++ b/arch/x86/include/asm/cfi.h
@@ -116,8 +116,6 @@ struct pt_regs;
 #ifdef CONFIG_CFI_CLANG
 enum bug_trap_type handle_cfi_failure(struct pt_regs *regs);
 #define __bpfcall
-extern u32 cfi_bpf_hash;
-extern u32 cfi_bpf_subprog_hash;
 
 static inline int cfi_get_offset(void)
 {
@@ -135,6 +133,8 @@ static inline int cfi_get_offset(void)
 #define cfi_get_offset cfi_get_offset
 
 extern u32 cfi_get_func_hash(void *func);
+#define cfi_get_func_hash cfi_get_func_hash
+
 extern int cfi_get_func_arity(void *func);
 
 #ifdef CONFIG_FINEIBT
@@ -153,12 +153,6 @@ static inline enum bug_trap_type handle_cfi_failure(struct pt_regs *regs)
 {
 	return BUG_TRAP_TYPE_NONE;
 }
-#define cfi_bpf_hash 0U
-#define cfi_bpf_subprog_hash 0U
-static inline u32 cfi_get_func_hash(void *func)
-{
-	return 0;
-}
 static inline int cfi_get_func_arity(void *func)
 {
 	return 0;
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index a555665b4d9c..9f6b7dab2d9a 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -2,7 +2,6 @@
 #define pr_fmt(fmt) "SMP alternatives: " fmt
 
 #include <linux/mmu_context.h>
-#include <linux/cfi_types.h>
 #include <linux/perf_event.h>
 #include <linux/vmalloc.h>
 #include <linux/memory.h>
@@ -1185,17 +1184,6 @@ bool cfi_bhi __ro_after_init = false;
 #endif
 
 #ifdef CONFIG_CFI_CLANG
-struct bpf_insn;
-
-/* Must match bpf_func_t / DEFINE_BPF_PROG_RUN() */
-extern unsigned int __bpf_prog_runX(const void *ctx,
-				    const struct bpf_insn *insn);
-DEFINE_CFI_TYPE(cfi_bpf_hash, __bpf_prog_runX);
-
-/* Must match bpf_callback_t */
-extern u64 __bpf_callback_fn(u64, u64, u64, u64, u64);
-DEFINE_CFI_TYPE(cfi_bpf_subprog_hash, __bpf_callback_fn);
-
 u32 cfi_get_func_hash(void *func)
 {
 	u32 hash;
diff --git a/include/linux/cfi.h b/include/linux/cfi.h
index 1db17ecbb86c..52a98886a455 100644
--- a/include/linux/cfi.h
+++ b/include/linux/cfi.h
@@ -11,16 +11,9 @@
 #include <linux/module.h>
 #include <asm/cfi.h>
 
+#ifdef CONFIG_CFI_CLANG
 extern bool cfi_warn;
 
-#ifndef cfi_get_offset
-static inline int cfi_get_offset(void)
-{
-	return 0;
-}
-#endif
-
-#ifdef CONFIG_CFI_CLANG
 enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr,
 				      unsigned long *target, u32 type);
 
@@ -29,6 +22,44 @@ static inline enum bug_trap_type report_cfi_failure_noaddr(struct pt_regs *regs,
 {
 	return report_cfi_failure(regs, addr, NULL, 0);
 }
+
+#ifndef cfi_get_offset
+/*
+ * Returns the CFI prefix offset. By default, the compiler emits only
+ * a 4-byte CFI type hash before the function. If an architecture
+ * uses -fpatchable-function-entry=N,M where M>0 to change the prefix
+ * offset, they must override this function.
+ */
+static inline int cfi_get_offset(void)
+{
+	return 4;
+}
+#endif
+
+#ifndef cfi_get_func_hash
+static inline u32 cfi_get_func_hash(void *func)
+{
+	u32 hash;
+
+	if (get_kernel_nofault(hash, func - cfi_get_offset()))
+		return 0;
+
+	return hash;
+}
+#endif
+
+/* CFI type hashes for BPF function types */
+extern u32 cfi_bpf_hash;
+extern u32 cfi_bpf_subprog_hash;
+
+#else /* CONFIG_CFI_CLANG */
+
+static inline int cfi_get_offset(void) { return 0; }
+static inline u32 cfi_get_func_hash(void *func) { return 0; }
+
+#define cfi_bpf_hash 0U
+#define cfi_bpf_subprog_hash 0U
+
 #endif /* CONFIG_CFI_CLANG */
 
 #ifdef CONFIG_ARCH_USES_CFI_TRAPS
diff --git a/kernel/cfi.c b/kernel/cfi.c
index 422fa4f958ae..4dad04ead06c 100644
--- a/kernel/cfi.c
+++ b/kernel/cfi.c
@@ -5,6 +5,8 @@
  * Copyright (C) 2022 Google LLC
  */
 
+#include <linux/bpf.h>
+#include <linux/cfi_types.h>
 #include <linux/cfi.h>
 
 bool cfi_warn __ro_after_init = IS_ENABLED(CONFIG_CFI_PERMISSIVE);
@@ -27,6 +29,19 @@ enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr,
 	return BUG_TRAP_TYPE_BUG;
 }
 
+/*
+ * Declare two non-existent functions with types that match bpf_func_t and
+ * bpf_callback_t pointers, and use DEFINE_CFI_TYPE to define type hash
+ * variables for each function type. The cfi_bpf_* variables are used by
+ * arch-specific BPF JIT implementations to ensure indirectly callable JIT
+ * code has matching CFI type hashes.
+ */
+extern typeof(*(bpf_func_t)0) __bpf_prog_runX;
+DEFINE_CFI_TYPE(cfi_bpf_hash, __bpf_prog_runX);
+
+extern typeof(*(bpf_callback_t)0) __bpf_callback_fn;
+DEFINE_CFI_TYPE(cfi_bpf_subprog_hash, __bpf_callback_fn);
+
 #ifdef CONFIG_ARCH_USES_CFI_TRAPS
 static inline unsigned long trap_address(s32 *p)
 {
-- 
cgit v1.2.3


From 55a984928bfa30c7877e28f16910e6de1c170f1f Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Fri, 1 Aug 2025 10:26:13 +0200
Subject: Revert "tty: vt: use _IO() to define ioctl numbers"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit f1180ca37abe3d117e4a19be12142fe722612a7c. Since the
commit, the vt ioctl numbers are defined differently on platforms where
_IOC_NONE is non-zero: alpha, mips, powerpc, sparc.

Signed-off-by: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Reported-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Link: https://lore.kernel.org/all/436489B9-E67B-4630-909F-386C30A2AAC9@xenosoft.de/
Link: https://lore.kernel.org/all/97ec2636-915a-498c-903b-d66957420d21@csgroup.eu/
Cc: Nicolas Pitre <nico@fluxnic.net>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20250801082613.2564584-1-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/uapi/linux/vt.h | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/vt.h b/include/uapi/linux/vt.h
index b60fcdfb2746..714483d68c69 100644
--- a/include/uapi/linux/vt.h
+++ b/include/uapi/linux/vt.h
@@ -14,9 +14,9 @@
 		/* Note: the ioctl VT_GETSTATE does not work for
 		   consoles 16 and higher (since it returns a short) */
 
-/* 'V' to avoid collision with termios and kd */
+/* 0x56 is 'V', to avoid collision with termios and kd */
 
-#define VT_OPENQRY		_IO('V', 0x00)	/* find available vt */
+#define VT_OPENQRY	0x5600	/* find available vt */
 
 struct vt_mode {
 	__u8 mode;		/* vt mode */
@@ -25,8 +25,8 @@ struct vt_mode {
 	__s16 acqsig;		/* signal to raise on acquisition */
 	__s16 frsig;		/* unused (set to 0) */
 };
-#define VT_GETMODE		_IO('V', 0x01)	/* get mode of active vt */
-#define VT_SETMODE		_IO('V', 0x02)	/* set mode of active vt */
+#define VT_GETMODE	0x5601	/* get mode of active vt */
+#define VT_SETMODE	0x5602	/* set mode of active vt */
 #define		VT_AUTO		0x00	/* auto vt switching */
 #define		VT_PROCESS	0x01	/* process controls switching */
 #define		VT_ACKACQ	0x02	/* acknowledge switch */
@@ -36,21 +36,21 @@ struct vt_stat {
 	__u16 v_signal;	/* signal to send */
 	__u16 v_state;		/* vt bitmask */
 };
-#define VT_GETSTATE		_IO('V', 0x03)	/* get global vt state info */
-#define VT_SENDSIG		_IO('V', 0x04)	/* signal to send to bitmask of vts */
+#define VT_GETSTATE	0x5603	/* get global vt state info */
+#define VT_SENDSIG	0x5604	/* signal to send to bitmask of vts */
 
-#define VT_RELDISP		_IO('V', 0x05)	/* release display */
+#define VT_RELDISP	0x5605	/* release display */
 
-#define VT_ACTIVATE		_IO('V', 0x06)	/* make vt active */
-#define VT_WAITACTIVE		_IO('V', 0x07)	/* wait for vt active */
-#define VT_DISALLOCATE		_IO('V', 0x08)  /* free memory associated to vt */
+#define VT_ACTIVATE	0x5606	/* make vt active */
+#define VT_WAITACTIVE	0x5607	/* wait for vt active */
+#define VT_DISALLOCATE	0x5608  /* free memory associated to vt */
 
 struct vt_sizes {
 	__u16 v_rows;		/* number of rows */
 	__u16 v_cols;		/* number of columns */
 	__u16 v_scrollsize;	/* number of lines of scrollback */
 };
-#define VT_RESIZE		_IO('V', 0x09)	/* set kernel's idea of screensize */
+#define VT_RESIZE	0x5609	/* set kernel's idea of screensize */
 
 struct vt_consize {
 	__u16 v_rows;	/* number of rows */
@@ -60,10 +60,10 @@ struct vt_consize {
 	__u16 v_vcol;	/* number of pixel columns on screen */
 	__u16 v_ccol;	/* number of pixel columns per character */
 };
-#define VT_RESIZEX		_IO('V', 0x0A)  /* set kernel's idea of screensize + more */
-#define VT_LOCKSWITCH		_IO('V', 0x0B)  /* disallow vt switching */
-#define VT_UNLOCKSWITCH		_IO('V', 0x0C)  /* allow vt switching */
-#define VT_GETHIFONTMASK	_IO('V', 0x0D)  /* return hi font mask */
+#define VT_RESIZEX      0x560A  /* set kernel's idea of screensize + more */
+#define VT_LOCKSWITCH   0x560B  /* disallow vt switching */
+#define VT_UNLOCKSWITCH 0x560C  /* allow vt switching */
+#define VT_GETHIFONTMASK 0x560D  /* return hi font mask */
 
 struct vt_event {
 	__u32 event;
@@ -77,14 +77,14 @@ struct vt_event {
 	__u32 pad[4];		/* Padding for expansion */
 };
 
-#define VT_WAITEVENT		_IO('V', 0x0E)	/* Wait for an event */
+#define VT_WAITEVENT	0x560E	/* Wait for an event */
 
 struct vt_setactivate {
 	__u32 console;
 	struct vt_mode mode;
 };
 
-#define VT_SETACTIVATE		_IO('V', 0x0F)	/* Activate and set the mode of a console */
+#define VT_SETACTIVATE	0x560F	/* Activate and set the mode of a console */
 
 /* get console size and cursor position */
 struct vt_consizecsrpos {
-- 
cgit v1.2.3


From 9d9b193ed73a65ec47cf1fd39925b09da8216461 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 31 Jul 2025 09:41:47 +0800
Subject: crypto: hash - Increase HASH_MAX_DESCSIZE for hmac(sha3-224-s390)

The value of HASH_MAX_DESCSIZE is off by one for hmac(sha3-224-s390).
Fix this so that hmac(sha3-224-s390) can be registered.

Reported-by: Ingo Franzki <ifranzki@linux.ibm.com>
Reported-by: Eric Biggers <ebiggers@kernel.org>
Fixes: 6f90ba706551 ("crypto: s390/sha3 - Use API partial block handling")
Cc: <stable@vger.kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/crypto/hash.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/crypto/hash.h b/include/crypto/hash.h
index 6f6b9de12cd3..ed63b904837d 100644
--- a/include/crypto/hash.h
+++ b/include/crypto/hash.h
@@ -184,7 +184,7 @@ struct shash_desc {
  * Worst case is hmac(sha3-224-s390).  Its context is a nested 'shash_desc'
  * containing a 'struct s390_sha_ctx'.
  */
-#define HASH_MAX_DESCSIZE	(sizeof(struct shash_desc) + 360)
+#define HASH_MAX_DESCSIZE	(sizeof(struct shash_desc) + 361)
 #define MAX_SYNC_HASH_REQSIZE	(sizeof(struct ahash_request) + \
 				 HASH_MAX_DESCSIZE)
 
-- 
cgit v1.2.3


From 564a69ad90d15c782176e1a8c9e1c95661e1aed0 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Wed, 21 May 2025 17:03:46 +0530
Subject: virtio-mmio: Remove virtqueue list from mmio device

The MMIO transport implementation creates a list of virtqueues for a
virtio device, while the same is already available in the struct
virtio_device.

Don't create a duplicate list, and use the other one instead.

While at it, fix the virtio_device_for_each_vq() macro to accept an
argument like "&vm_dev->vdev" (which currently fails to build).

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Message-Id: <3e56c6f74002987e22f364d883cbad177cd9ad9c.1747827066.git.viresh.kumar@linaro.org>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
 drivers/virtio/virtio_mmio.c | 52 +++-----------------------------------------
 include/linux/virtio.h       |  2 +-
 2 files changed, 4 insertions(+), 50 deletions(-)

(limited to 'include')

diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c
index 5d78c2d572ab..b152a1eca05a 100644
--- a/drivers/virtio/virtio_mmio.c
+++ b/drivers/virtio/virtio_mmio.c
@@ -65,7 +65,6 @@
 #include <linux/platform_device.h>
 #include <linux/pm.h>
 #include <linux/slab.h>
-#include <linux/spinlock.h>
 #include <linux/virtio.h>
 #include <linux/virtio_config.h>
 #include <uapi/linux/virtio_mmio.h>
@@ -88,22 +87,8 @@ struct virtio_mmio_device {
 
 	void __iomem *base;
 	unsigned long version;
-
-	/* a list of queues so we can dispatch IRQs */
-	spinlock_t lock;
-	struct list_head virtqueues;
-};
-
-struct virtio_mmio_vq_info {
-	/* the actual virtqueue */
-	struct virtqueue *vq;
-
-	/* the list node for the virtqueues list */
-	struct list_head node;
 };
 
-
-
 /* Configuration interface */
 
 static u64 vm_get_features(struct virtio_device *vdev)
@@ -300,9 +285,8 @@ static bool vm_notify_with_data(struct virtqueue *vq)
 static irqreturn_t vm_interrupt(int irq, void *opaque)
 {
 	struct virtio_mmio_device *vm_dev = opaque;
-	struct virtio_mmio_vq_info *info;
+	struct virtqueue *vq;
 	unsigned long status;
-	unsigned long flags;
 	irqreturn_t ret = IRQ_NONE;
 
 	/* Read and acknowledge interrupts */
@@ -315,10 +299,8 @@ static irqreturn_t vm_interrupt(int irq, void *opaque)
 	}
 
 	if (likely(status & VIRTIO_MMIO_INT_VRING)) {
-		spin_lock_irqsave(&vm_dev->lock, flags);
-		list_for_each_entry(info, &vm_dev->virtqueues, node)
-			ret |= vring_interrupt(irq, info->vq);
-		spin_unlock_irqrestore(&vm_dev->lock, flags);
+		virtio_device_for_each_vq(&vm_dev->vdev, vq)
+			ret |= vring_interrupt(irq, vq);
 	}
 
 	return ret;
@@ -329,14 +311,8 @@ static irqreturn_t vm_interrupt(int irq, void *opaque)
 static void vm_del_vq(struct virtqueue *vq)
 {
 	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev);
-	struct virtio_mmio_vq_info *info = vq->priv;
-	unsigned long flags;
 	unsigned int index = vq->index;
 
-	spin_lock_irqsave(&vm_dev->lock, flags);
-	list_del(&info->node);
-	spin_unlock_irqrestore(&vm_dev->lock, flags);
-
 	/* Select and deactivate the queue */
 	writel(index, vm_dev->base + VIRTIO_MMIO_QUEUE_SEL);
 	if (vm_dev->version == 1) {
@@ -347,8 +323,6 @@ static void vm_del_vq(struct virtqueue *vq)
 	}
 
 	vring_del_virtqueue(vq);
-
-	kfree(info);
 }
 
 static void vm_del_vqs(struct virtio_device *vdev)
@@ -375,9 +349,7 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned int in
 {
 	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
 	bool (*notify)(struct virtqueue *vq);
-	struct virtio_mmio_vq_info *info;
 	struct virtqueue *vq;
-	unsigned long flags;
 	unsigned int num;
 	int err;
 
@@ -399,13 +371,6 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned int in
 		goto error_available;
 	}
 
-	/* Allocate and fill out our active queue description */
-	info = kmalloc(sizeof(*info), GFP_KERNEL);
-	if (!info) {
-		err = -ENOMEM;
-		goto error_kmalloc;
-	}
-
 	num = readl(vm_dev->base + VIRTIO_MMIO_QUEUE_NUM_MAX);
 	if (num == 0) {
 		err = -ENOENT;
@@ -463,13 +428,6 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned int in
 		writel(1, vm_dev->base + VIRTIO_MMIO_QUEUE_READY);
 	}
 
-	vq->priv = info;
-	info->vq = vq;
-
-	spin_lock_irqsave(&vm_dev->lock, flags);
-	list_add(&info->node, &vm_dev->virtqueues);
-	spin_unlock_irqrestore(&vm_dev->lock, flags);
-
 	return vq;
 
 error_bad_pfn:
@@ -481,8 +439,6 @@ error_new_virtqueue:
 		writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_READY);
 		WARN_ON(readl(vm_dev->base + VIRTIO_MMIO_QUEUE_READY));
 	}
-	kfree(info);
-error_kmalloc:
 error_available:
 	return ERR_PTR(err);
 }
@@ -627,8 +583,6 @@ static int virtio_mmio_probe(struct platform_device *pdev)
 	vm_dev->vdev.dev.release = virtio_mmio_release_dev;
 	vm_dev->vdev.config = &virtio_mmio_config_ops;
 	vm_dev->pdev = pdev;
-	INIT_LIST_HEAD(&vm_dev->virtqueues);
-	spin_lock_init(&vm_dev->lock);
 
 	vm_dev->base = devm_platform_ioremap_resource(pdev, 0);
 	if (IS_ERR(vm_dev->base)) {
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 64cb4b04be7a..8b745ce0cf5f 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -196,7 +196,7 @@ int virtio_device_reset_done(struct virtio_device *dev);
 size_t virtio_max_dma_size(const struct virtio_device *vdev);
 
 #define virtio_device_for_each_vq(vdev, vq) \
-	list_for_each_entry(vq, &vdev->vqs, list)
+	list_for_each_entry(vq, &(vdev)->vqs, list)
 
 /**
  * struct virtio_driver - operations for a virtio I/O driver
-- 
cgit v1.2.3


From 569c392e191361cd05fba1fd87ed02ef0d130ef7 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Tue, 17 Jun 2025 01:18:36 +0100
Subject: vhost: vringh: Remove unused iotlb functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The functions:
  vringh_abandon_iotlb()
  vringh_notify_disable_iotlb() and
  vringh_notify_enable_iotlb()

were added in 2020 by
commit 9ad9c49cfe97 ("vringh: IOTLB support")
but have remained unused.

Remove them.

Signed-off-by: Dr. David Alan Gilbert <linux@treblig.org>
Reviewed-by: Simon Horman <horms@kernel.org>
Message-Id: <20250617001838.114457-2-linux@treblig.org>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Eugenio Pérez <eperezma@redhat.com>
Tested-by: Lei Yang <leiyang@redhat.com>
---
 drivers/vhost/vringh.c | 43 -------------------------------------------
 include/linux/vringh.h |  5 -----
 2 files changed, 48 deletions(-)

(limited to 'include')

diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c
index bbce65452701..67a028d6fb5f 100644
--- a/drivers/vhost/vringh.c
+++ b/drivers/vhost/vringh.c
@@ -1534,23 +1534,6 @@ ssize_t vringh_iov_push_iotlb(struct vringh *vrh,
 }
 EXPORT_SYMBOL(vringh_iov_push_iotlb);
 
-/**
- * vringh_abandon_iotlb - we've decided not to handle the descriptor(s).
- * @vrh: the vring.
- * @num: the number of descriptors to put back (ie. num
- *	 vringh_get_iotlb() to undo).
- *
- * The next vringh_get_iotlb() will return the old descriptor(s) again.
- */
-void vringh_abandon_iotlb(struct vringh *vrh, unsigned int num)
-{
-	/* We only update vring_avail_event(vr) when we want to be notified,
-	 * so we haven't changed that yet.
-	 */
-	vrh->last_avail_idx -= num;
-}
-EXPORT_SYMBOL(vringh_abandon_iotlb);
-
 /**
  * vringh_complete_iotlb - we've finished with descriptor, publish it.
  * @vrh: the vring.
@@ -1571,32 +1554,6 @@ int vringh_complete_iotlb(struct vringh *vrh, u16 head, u32 len)
 }
 EXPORT_SYMBOL(vringh_complete_iotlb);
 
-/**
- * vringh_notify_enable_iotlb - we want to know if something changes.
- * @vrh: the vring.
- *
- * This always enables notifications, but returns false if there are
- * now more buffers available in the vring.
- */
-bool vringh_notify_enable_iotlb(struct vringh *vrh)
-{
-	return __vringh_notify_enable(vrh, getu16_iotlb, putu16_iotlb);
-}
-EXPORT_SYMBOL(vringh_notify_enable_iotlb);
-
-/**
- * vringh_notify_disable_iotlb - don't tell us if something changes.
- * @vrh: the vring.
- *
- * This is our normal running state: we disable and then only enable when
- * we're going to sleep.
- */
-void vringh_notify_disable_iotlb(struct vringh *vrh)
-{
-	__vringh_notify_disable(vrh, putu16_iotlb);
-}
-EXPORT_SYMBOL(vringh_notify_disable_iotlb);
-
 /**
  * vringh_need_notify_iotlb - must we tell the other side about used buffers?
  * @vrh: the vring we've called vringh_complete_iotlb() on.
diff --git a/include/linux/vringh.h b/include/linux/vringh.h
index c3a8117dabe8..af8bd2695a7b 100644
--- a/include/linux/vringh.h
+++ b/include/linux/vringh.h
@@ -319,13 +319,8 @@ ssize_t vringh_iov_push_iotlb(struct vringh *vrh,
 			      struct vringh_kiov *wiov,
 			      const void *src, size_t len);
 
-void vringh_abandon_iotlb(struct vringh *vrh, unsigned int num);
-
 int vringh_complete_iotlb(struct vringh *vrh, u16 head, u32 len);
 
-bool vringh_notify_enable_iotlb(struct vringh *vrh);
-void vringh_notify_disable_iotlb(struct vringh *vrh);
-
 int vringh_need_notify_iotlb(struct vringh *vrh);
 
 #endif /* CONFIG_VHOST_IOTLB */
-- 
cgit v1.2.3


From 6e9ef6937c726b97d4a6d49332d06e999acc15f5 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Tue, 17 Jun 2025 01:18:37 +0100
Subject: vhost: vringh: Remove unused functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The functions:
  vringh_abandon_kern()
  vringh_abandon_user()
  vringh_iov_pull_kern() and
  vringh_iov_push_kern()
were all added in 2013 by
commit f87d0fbb5798 ("vringh: host-side implementation of virtio rings.")
but have remained unused.

Remove them and the two helper functions they used.

Signed-off-by: Dr. David Alan Gilbert <linux@treblig.org>
Message-Id: <20250617001838.114457-3-linux@treblig.org>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Eugenio Pérez <eperezma@redhat.com>
Tested-by: Lei Yang <leiyang@redhat.com>
Reviewed-by: Simon Horman <horms@kernel.org>
---
 drivers/vhost/vringh.c | 75 --------------------------------------------------
 include/linux/vringh.h |  7 -----
 2 files changed, 82 deletions(-)

(limited to 'include')

diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c
index 67a028d6fb5f..9f27c3f6091b 100644
--- a/drivers/vhost/vringh.c
+++ b/drivers/vhost/vringh.c
@@ -779,22 +779,6 @@ ssize_t vringh_iov_push_user(struct vringh_iov *wiov,
 }
 EXPORT_SYMBOL(vringh_iov_push_user);
 
-/**
- * vringh_abandon_user - we've decided not to handle the descriptor(s).
- * @vrh: the vring.
- * @num: the number of descriptors to put back (ie. num
- *	 vringh_get_user() to undo).
- *
- * The next vringh_get_user() will return the old descriptor(s) again.
- */
-void vringh_abandon_user(struct vringh *vrh, unsigned int num)
-{
-	/* We only update vring_avail_event(vr) when we want to be notified,
-	 * so we haven't changed that yet. */
-	vrh->last_avail_idx -= num;
-}
-EXPORT_SYMBOL(vringh_abandon_user);
-
 /**
  * vringh_complete_user - we've finished with descriptor, publish it.
  * @vrh: the vring.
@@ -900,20 +884,6 @@ static inline int putused_kern(const struct vringh *vrh,
 	return 0;
 }
 
-static inline int xfer_kern(const struct vringh *vrh, void *src,
-			    void *dst, size_t len)
-{
-	memcpy(dst, src, len);
-	return 0;
-}
-
-static inline int kern_xfer(const struct vringh *vrh, void *dst,
-			    void *src, size_t len)
-{
-	memcpy(dst, src, len);
-	return 0;
-}
-
 /**
  * vringh_init_kern - initialize a vringh for a kernelspace vring.
  * @vrh: the vringh to initialize.
@@ -998,51 +968,6 @@ int vringh_getdesc_kern(struct vringh *vrh,
 }
 EXPORT_SYMBOL(vringh_getdesc_kern);
 
-/**
- * vringh_iov_pull_kern - copy bytes from vring_iov.
- * @riov: the riov as passed to vringh_getdesc_kern() (updated as we consume)
- * @dst: the place to copy.
- * @len: the maximum length to copy.
- *
- * Returns the bytes copied <= len or a negative errno.
- */
-ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len)
-{
-	return vringh_iov_xfer(NULL, riov, dst, len, xfer_kern);
-}
-EXPORT_SYMBOL(vringh_iov_pull_kern);
-
-/**
- * vringh_iov_push_kern - copy bytes into vring_iov.
- * @wiov: the wiov as passed to vringh_getdesc_kern() (updated as we consume)
- * @src: the place to copy from.
- * @len: the maximum length to copy.
- *
- * Returns the bytes copied <= len or a negative errno.
- */
-ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov,
-			     const void *src, size_t len)
-{
-	return vringh_iov_xfer(NULL, wiov, (void *)src, len, kern_xfer);
-}
-EXPORT_SYMBOL(vringh_iov_push_kern);
-
-/**
- * vringh_abandon_kern - we've decided not to handle the descriptor(s).
- * @vrh: the vring.
- * @num: the number of descriptors to put back (ie. num
- *	 vringh_get_kern() to undo).
- *
- * The next vringh_get_kern() will return the old descriptor(s) again.
- */
-void vringh_abandon_kern(struct vringh *vrh, unsigned int num)
-{
-	/* We only update vring_avail_event(vr) when we want to be notified,
-	 * so we haven't changed that yet. */
-	vrh->last_avail_idx -= num;
-}
-EXPORT_SYMBOL(vringh_abandon_kern);
-
 /**
  * vringh_complete_kern - we've finished with descriptor, publish it.
  * @vrh: the vring.
diff --git a/include/linux/vringh.h b/include/linux/vringh.h
index af8bd2695a7b..49e7cbc9697a 100644
--- a/include/linux/vringh.h
+++ b/include/linux/vringh.h
@@ -175,9 +175,6 @@ int vringh_complete_multi_user(struct vringh *vrh,
 			       const struct vring_used_elem used[],
 			       unsigned num_used);
 
-/* Pretend we've never seen descriptor (for easy error handling). */
-void vringh_abandon_user(struct vringh *vrh, unsigned int num);
-
 /* Do we need to fire the eventfd to notify the other side? */
 int vringh_need_notify_user(struct vringh *vrh);
 
@@ -235,10 +232,6 @@ int vringh_getdesc_kern(struct vringh *vrh,
 			u16 *head,
 			gfp_t gfp);
 
-ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len);
-ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov,
-			     const void *src, size_t len);
-void vringh_abandon_kern(struct vringh *vrh, unsigned int num);
 int vringh_complete_kern(struct vringh *vrh, u16 head, u32 len);
 
 bool vringh_notify_enable_kern(struct vringh *vrh);
-- 
cgit v1.2.3


From 7d9896e9f6d02d8aa85e63f736871f96c59a5263 Mon Sep 17 00:00:00 2001
From: Cindy Lu <lulu@redhat.com>
Date: Mon, 14 Jul 2025 15:12:32 +0800
Subject: vhost: Reintroduce kthread API and add mode selection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since commit 6e890c5d5021 ("vhost: use vhost_tasks for worker threads"),
the vhost uses vhost_task and operates as a child of the
owner thread. This is required for correct CPU usage accounting,
especially when using containers.

However, this change has caused confusion for some legacy
userspace applications, and we didn't notice until it was too late.

Unfortunately, it's too late to revert - we now have userspace
depending both on old and new behaviour :(

To address the issue, reintroduce kthread mode for vhost workers and
provide a configuration to select between kthread and task worker.

- Add 'fork_owner' parameter to vhost_dev to let users select kthread
  or task mode. Default mode is task mode (VHOST_FORK_OWNER_TASK).

- Reintroduce kthread mode support:
  * Bring back the original vhost_worker() implementation,
    and rename it to vhost_run_work_kthread_list().
  * Add cgroup support for the kthread
  * Introduce struct vhost_worker_ops:
    - Encapsulates create / stop / wake‑up callbacks.
    - vhost_worker_create() selects the proper ops according to
      fork_owner.

- Userspace configuration interface:
  * New IOCTLs:
      - VHOST_SET_FORK_FROM_OWNER lets userspace select task mode
        (VHOST_FORK_OWNER_TASK) or kthread mode (VHOST_FORK_OWNER_KTHREAD)
      - VHOST_GET_FORK_FROM_OWNER reads the current worker mode
  * Expose module parameter 'fork_from_owner_default' to allow system
    administrators to configure the default mode for vhost workers
  * Kconfig option CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL controls whether
    these IOCTLs and the parameter are available

- The VHOST_NEW_WORKER functionality requires fork_owner to be set
  to true, with validation added to ensure proper configuration

This partially reverts or improves upon:
  commit 6e890c5d5021 ("vhost: use vhost_tasks for worker threads")
  commit 1cdaafa1b8b4 ("vhost: replace single worker pointer with xarray")

Fixes: 6e890c5d5021 ("vhost: use vhost_tasks for worker threads")
Signed-off-by: Cindy Lu <lulu@redhat.com>
Message-Id: <20250714071333.59794-2-lulu@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Tested-by: Lei Yang <leiyang@redhat.com>
---
 drivers/vhost/Kconfig      |  18 ++++
 drivers/vhost/vhost.c      | 244 +++++++++++++++++++++++++++++++++++++++++----
 drivers/vhost/vhost.h      |  22 ++++
 include/uapi/linux/vhost.h |  29 ++++++
 4 files changed, 295 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index 020d4fbb947c..bc0f38574497 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -95,4 +95,22 @@ config VHOST_CROSS_ENDIAN_LEGACY
 
 	  If unsure, say "N".
 
+config VHOST_ENABLE_FORK_OWNER_CONTROL
+	bool "Enable VHOST_ENABLE_FORK_OWNER_CONTROL"
+	default y
+	help
+	  This option enables two IOCTLs: VHOST_SET_FORK_FROM_OWNER and
+	  VHOST_GET_FORK_FROM_OWNER. These allow userspace applications
+	  to modify the vhost worker mode for vhost devices.
+
+	  Also expose module parameter 'fork_from_owner_default' to allow users
+	  to configure the default mode for vhost workers.
+
+	  By default, `VHOST_ENABLE_FORK_OWNER_CONTROL` is set to `y`,
+	  users can change the worker thread mode as needed.
+	  If this config is disabled (n), the related IOCTLs and parameters will
+	  be unavailable.
+
+	  If unsure, say "Y".
+
 endif
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 4390e3a14218..f4c1bc6adeda 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -22,6 +22,7 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/kthread.h>
+#include <linux/cgroup.h>
 #include <linux/module.h>
 #include <linux/sort.h>
 #include <linux/sched/mm.h>
@@ -41,6 +42,13 @@ static int max_iotlb_entries = 2048;
 module_param(max_iotlb_entries, int, 0444);
 MODULE_PARM_DESC(max_iotlb_entries,
 	"Maximum number of iotlb entries. (default: 2048)");
+static bool fork_from_owner_default = VHOST_FORK_OWNER_TASK;
+
+#ifdef CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL
+module_param(fork_from_owner_default, bool, 0444);
+MODULE_PARM_DESC(fork_from_owner_default,
+		 "Set task mode as the default(default: Y)");
+#endif
 
 enum {
 	VHOST_MEMORY_F_LOG = 0x1,
@@ -242,7 +250,7 @@ static void vhost_worker_queue(struct vhost_worker *worker,
 		 * test_and_set_bit() implies a memory barrier.
 		 */
 		llist_add(&work->node, &worker->work_list);
-		vhost_task_wake(worker->vtsk);
+		worker->ops->wakeup(worker);
 	}
 }
 
@@ -388,6 +396,44 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	__vhost_vq_meta_reset(vq);
 }
 
+static int vhost_run_work_kthread_list(void *data)
+{
+	struct vhost_worker *worker = data;
+	struct vhost_work *work, *work_next;
+	struct vhost_dev *dev = worker->dev;
+	struct llist_node *node;
+
+	kthread_use_mm(dev->mm);
+
+	for (;;) {
+		/* mb paired w/ kthread_stop */
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		if (kthread_should_stop()) {
+			__set_current_state(TASK_RUNNING);
+			break;
+		}
+		node = llist_del_all(&worker->work_list);
+		if (!node)
+			schedule();
+
+		node = llist_reverse_order(node);
+		/* make sure flag is seen after deletion */
+		smp_wmb();
+		llist_for_each_entry_safe(work, work_next, node, node) {
+			clear_bit(VHOST_WORK_QUEUED, &work->flags);
+			__set_current_state(TASK_RUNNING);
+			kcov_remote_start_common(worker->kcov_handle);
+			work->fn(work);
+			kcov_remote_stop();
+			cond_resched();
+		}
+	}
+	kthread_unuse_mm(dev->mm);
+
+	return 0;
+}
+
 static bool vhost_run_work_list(void *data)
 {
 	struct vhost_worker *worker = data;
@@ -552,6 +598,7 @@ void vhost_dev_init(struct vhost_dev *dev,
 	dev->byte_weight = byte_weight;
 	dev->use_worker = use_worker;
 	dev->msg_handler = msg_handler;
+	dev->fork_owner = fork_from_owner_default;
 	init_waitqueue_head(&dev->wait);
 	INIT_LIST_HEAD(&dev->read_list);
 	INIT_LIST_HEAD(&dev->pending_list);
@@ -581,6 +628,46 @@ long vhost_dev_check_owner(struct vhost_dev *dev)
 }
 EXPORT_SYMBOL_GPL(vhost_dev_check_owner);
 
+struct vhost_attach_cgroups_struct {
+	struct vhost_work work;
+	struct task_struct *owner;
+	int ret;
+};
+
+static void vhost_attach_cgroups_work(struct vhost_work *work)
+{
+	struct vhost_attach_cgroups_struct *s;
+
+	s = container_of(work, struct vhost_attach_cgroups_struct, work);
+	s->ret = cgroup_attach_task_all(s->owner, current);
+}
+
+static int vhost_attach_task_to_cgroups(struct vhost_worker *worker)
+{
+	struct vhost_attach_cgroups_struct attach;
+	int saved_cnt;
+
+	attach.owner = current;
+
+	vhost_work_init(&attach.work, vhost_attach_cgroups_work);
+	vhost_worker_queue(worker, &attach.work);
+
+	mutex_lock(&worker->mutex);
+
+	/*
+	 * Bypass attachment_cnt check in __vhost_worker_flush:
+	 * Temporarily change it to INT_MAX to bypass the check
+	 */
+	saved_cnt = worker->attachment_cnt;
+	worker->attachment_cnt = INT_MAX;
+	__vhost_worker_flush(worker);
+	worker->attachment_cnt = saved_cnt;
+
+	mutex_unlock(&worker->mutex);
+
+	return attach.ret;
+}
+
 /* Caller should have device mutex */
 bool vhost_dev_has_owner(struct vhost_dev *dev)
 {
@@ -626,7 +713,7 @@ static void vhost_worker_destroy(struct vhost_dev *dev,
 
 	WARN_ON(!llist_empty(&worker->work_list));
 	xa_erase(&dev->worker_xa, worker->id);
-	vhost_task_stop(worker->vtsk);
+	worker->ops->stop(worker);
 	kfree(worker);
 }
 
@@ -649,42 +736,115 @@ static void vhost_workers_free(struct vhost_dev *dev)
 	xa_destroy(&dev->worker_xa);
 }
 
+static void vhost_task_wakeup(struct vhost_worker *worker)
+{
+	return vhost_task_wake(worker->vtsk);
+}
+
+static void vhost_kthread_wakeup(struct vhost_worker *worker)
+{
+	wake_up_process(worker->kthread_task);
+}
+
+static void vhost_task_do_stop(struct vhost_worker *worker)
+{
+	return vhost_task_stop(worker->vtsk);
+}
+
+static void vhost_kthread_do_stop(struct vhost_worker *worker)
+{
+	kthread_stop(worker->kthread_task);
+}
+
+static int vhost_task_worker_create(struct vhost_worker *worker,
+				    struct vhost_dev *dev, const char *name)
+{
+	struct vhost_task *vtsk;
+	u32 id;
+	int ret;
+
+	vtsk = vhost_task_create(vhost_run_work_list, vhost_worker_killed,
+				 worker, name);
+	if (IS_ERR(vtsk))
+		return PTR_ERR(vtsk);
+
+	worker->vtsk = vtsk;
+	vhost_task_start(vtsk);
+	ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL);
+	if (ret < 0) {
+		vhost_task_do_stop(worker);
+		return ret;
+	}
+	worker->id = id;
+	return 0;
+}
+
+static int vhost_kthread_worker_create(struct vhost_worker *worker,
+				       struct vhost_dev *dev, const char *name)
+{
+	struct task_struct *task;
+	u32 id;
+	int ret;
+
+	task = kthread_create(vhost_run_work_kthread_list, worker, "%s", name);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+
+	worker->kthread_task = task;
+	wake_up_process(task);
+	ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL);
+	if (ret < 0)
+		goto stop_worker;
+
+	ret = vhost_attach_task_to_cgroups(worker);
+	if (ret)
+		goto stop_worker;
+
+	worker->id = id;
+	return 0;
+
+stop_worker:
+	vhost_kthread_do_stop(worker);
+	return ret;
+}
+
+static const struct vhost_worker_ops kthread_ops = {
+	.create = vhost_kthread_worker_create,
+	.stop = vhost_kthread_do_stop,
+	.wakeup = vhost_kthread_wakeup,
+};
+
+static const struct vhost_worker_ops vhost_task_ops = {
+	.create = vhost_task_worker_create,
+	.stop = vhost_task_do_stop,
+	.wakeup = vhost_task_wakeup,
+};
+
 static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
 {
 	struct vhost_worker *worker;
-	struct vhost_task *vtsk;
 	char name[TASK_COMM_LEN];
 	int ret;
-	u32 id;
+	const struct vhost_worker_ops *ops = dev->fork_owner ? &vhost_task_ops :
+							       &kthread_ops;
 
 	worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT);
 	if (!worker)
 		return NULL;
 
 	worker->dev = dev;
+	worker->ops = ops;
 	snprintf(name, sizeof(name), "vhost-%d", current->pid);
 
-	vtsk = vhost_task_create(vhost_run_work_list, vhost_worker_killed,
-				 worker, name);
-	if (IS_ERR(vtsk))
-		goto free_worker;
-
 	mutex_init(&worker->mutex);
 	init_llist_head(&worker->work_list);
 	worker->kcov_handle = kcov_common_handle();
-	worker->vtsk = vtsk;
-
-	vhost_task_start(vtsk);
-
-	ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL);
+	ret = ops->create(worker, dev, name);
 	if (ret < 0)
-		goto stop_worker;
-	worker->id = id;
+		goto free_worker;
 
 	return worker;
 
-stop_worker:
-	vhost_task_stop(vtsk);
 free_worker:
 	kfree(worker);
 	return NULL;
@@ -865,6 +1025,14 @@ long vhost_worker_ioctl(struct vhost_dev *dev, unsigned int ioctl,
 	switch (ioctl) {
 	/* dev worker ioctls */
 	case VHOST_NEW_WORKER:
+		/*
+		 * vhost_tasks will account for worker threads under the parent's
+		 * NPROC value but kthreads do not. To avoid userspace overflowing
+		 * the system with worker threads fork_owner must be true.
+		 */
+		if (!dev->fork_owner)
+			return -EFAULT;
+
 		ret = vhost_new_worker(dev, &state);
 		if (!ret && copy_to_user(argp, &state, sizeof(state)))
 			ret = -EFAULT;
@@ -982,6 +1150,7 @@ void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_iotlb *umem)
 
 	vhost_dev_cleanup(dev);
 
+	dev->fork_owner = fork_from_owner_default;
 	dev->umem = umem;
 	/* We don't need VQ locks below since vhost_dev_cleanup makes sure
 	 * VQs aren't running.
@@ -2135,6 +2304,45 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
 		goto done;
 	}
 
+#ifdef CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL
+	if (ioctl == VHOST_SET_FORK_FROM_OWNER) {
+		/* Only allow modification before owner is set */
+		if (vhost_dev_has_owner(d)) {
+			r = -EBUSY;
+			goto done;
+		}
+		u8 fork_owner_val;
+
+		if (get_user(fork_owner_val, (u8 __user *)argp)) {
+			r = -EFAULT;
+			goto done;
+		}
+		if (fork_owner_val != VHOST_FORK_OWNER_TASK &&
+		    fork_owner_val != VHOST_FORK_OWNER_KTHREAD) {
+			r = -EINVAL;
+			goto done;
+		}
+		d->fork_owner = !!fork_owner_val;
+		r = 0;
+		goto done;
+	}
+	if (ioctl == VHOST_GET_FORK_FROM_OWNER) {
+		u8 fork_owner_val = d->fork_owner;
+
+		if (fork_owner_val != VHOST_FORK_OWNER_TASK &&
+		    fork_owner_val != VHOST_FORK_OWNER_KTHREAD) {
+			r = -EINVAL;
+			goto done;
+		}
+		if (put_user(fork_owner_val, (u8 __user *)argp)) {
+			r = -EFAULT;
+			goto done;
+		}
+		r = 0;
+		goto done;
+	}
+#endif
+
 	/* You must be the owner to do anything else */
 	r = vhost_dev_check_owner(d);
 	if (r)
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index bb75a292d50c..ab704d84fb34 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -26,7 +26,18 @@ struct vhost_work {
 	unsigned long		flags;
 };
 
+struct vhost_worker;
+struct vhost_dev;
+
+struct vhost_worker_ops {
+	int (*create)(struct vhost_worker *worker, struct vhost_dev *dev,
+		      const char *name);
+	void (*stop)(struct vhost_worker *worker);
+	void (*wakeup)(struct vhost_worker *worker);
+};
+
 struct vhost_worker {
+	struct task_struct *kthread_task;
 	struct vhost_task	*vtsk;
 	struct vhost_dev	*dev;
 	/* Used to serialize device wide flushing with worker swapping. */
@@ -36,6 +47,7 @@ struct vhost_worker {
 	u32			id;
 	int			attachment_cnt;
 	bool			killed;
+	const struct vhost_worker_ops *ops;
 };
 
 /* Poll a file (eventfd or socket) */
@@ -176,6 +188,16 @@ struct vhost_dev {
 	int byte_weight;
 	struct xarray worker_xa;
 	bool use_worker;
+	/*
+	 * If fork_owner is true we use vhost_tasks to create
+	 * the worker so all settings/limits like cgroups, NPROC,
+	 * scheduler, etc are inherited from the owner. If false,
+	 * we use kthreads and only attach to the same cgroups
+	 * as the owner for compat with older kernels.
+	 * The initial value is taken from fork_from_owner_default,
+	 * which itself defaults to true (VHOST_FORK_OWNER_TASK).
+	 */
+	bool fork_owner;
 	int (*msg_handler)(struct vhost_dev *dev, u32 asid,
 			   struct vhost_iotlb_msg *msg);
 };
diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
index d4b3e2ae1314..e72f2655459e 100644
--- a/include/uapi/linux/vhost.h
+++ b/include/uapi/linux/vhost.h
@@ -235,4 +235,33 @@
  */
 #define VHOST_VDPA_GET_VRING_SIZE	_IOWR(VHOST_VIRTIO, 0x82,	\
 					      struct vhost_vring_state)
+
+/* fork_owner values for vhost */
+#define VHOST_FORK_OWNER_KTHREAD 0
+#define VHOST_FORK_OWNER_TASK 1
+
+/**
+ * VHOST_SET_FORK_FROM_OWNER - Set the fork_owner flag for the vhost device,
+ * This ioctl must be called before VHOST_SET_OWNER.
+ * Only available when CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL=y
+ *
+ * @param fork_owner: An 8-bit value that determines the vhost thread mode
+ *
+ * When fork_owner is set to VHOST_FORK_OWNER_TASK(default value):
+ *   - Vhost will create vhost worker as tasks forked from the owner,
+ *     inheriting all of the owner's attributes.
+ *
+ * When fork_owner is set to VHOST_FORK_OWNER_KTHREAD:
+ *   - Vhost will create vhost workers as kernel threads.
+ */
+#define VHOST_SET_FORK_FROM_OWNER _IOW(VHOST_VIRTIO, 0x83, __u8)
+
+/**
+ * VHOST_GET_FORK_FROM_OWNER - Get the current fork_owner flag for the vhost device.
+ * Only available when CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL=y
+ *
+ * @return: An 8-bit value indicating the current thread mode.
+ */
+#define VHOST_GET_FORK_FROM_OWNER _IOR(VHOST_VIRTIO, 0x84, __u8)
+
 #endif
-- 
cgit v1.2.3


From 87dbae5e36613a6020f3d64a2eaeac0a1e0e6dc6 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Thu, 17 Jul 2025 10:01:10 +0100
Subject: vsock/virtio: Move length check to callers of
 virtio_vsock_skb_rx_put()

virtio_vsock_skb_rx_put() only calls skb_put() if the length in the
packet header is not zero even though skb_put() handles this case
gracefully.

Remove the functionally redundant check from virtio_vsock_skb_rx_put()
and, on the assumption that this is a worthwhile optimisation for
handling credit messages, augment the existing length checks in
virtio_transport_rx_work() to elide the call for zero-length payloads.
Since the callers all have the length, extend virtio_vsock_skb_rx_put()
to take it as an additional parameter rather than fish it back out of
the packet header.

Note that the vhost code already has similar logic in
vhost_vsock_alloc_skb().

Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: Will Deacon <will@kernel.org>
Message-Id: <20250717090116.11987-4-will@kernel.org>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vhost/vsock.c            | 2 +-
 include/linux/virtio_vsock.h     | 9 ++-------
 net/vmw_vsock/virtio_transport.c | 4 +++-
 3 files changed, 6 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 66a0f060770e..4c4a642945eb 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -375,7 +375,7 @@ vhost_vsock_alloc_skb(struct vhost_virtqueue *vq,
 		return NULL;
 	}
 
-	virtio_vsock_skb_rx_put(skb);
+	virtio_vsock_skb_rx_put(skb, payload_len);
 
 	nbytes = copy_from_iter(skb->data, payload_len, &iov_iter);
 	if (nbytes != payload_len) {
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index 36fb3edfa403..97465f378ade 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -47,14 +47,9 @@ static inline void virtio_vsock_skb_clear_tap_delivered(struct sk_buff *skb)
 	VIRTIO_VSOCK_SKB_CB(skb)->tap_delivered = false;
 }
 
-static inline void virtio_vsock_skb_rx_put(struct sk_buff *skb)
+static inline void virtio_vsock_skb_rx_put(struct sk_buff *skb, u32 len)
 {
-	u32 len;
-
-	len = le32_to_cpu(virtio_vsock_hdr(skb)->len);
-
-	if (len > 0)
-		skb_put(skb, len);
+	skb_put(skb, len);
 }
 
 static inline struct sk_buff *virtio_vsock_alloc_skb(unsigned int size, gfp_t mask)
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index eb08a393413d..0166919f8705 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -656,7 +656,9 @@ static void virtio_transport_rx_work(struct work_struct *work)
 				continue;
 			}
 
-			virtio_vsock_skb_rx_put(skb);
+			if (payload_len)
+				virtio_vsock_skb_rx_put(skb, payload_len);
+
 			virtio_transport_deliver_tap_pkt(skb);
 			virtio_transport_recv_pkt(&virtio_transport, skb);
 		}
-- 
cgit v1.2.3


From 03a92f036a04fed2b00d69f5f46f1a486e70dc5c Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Thu, 17 Jul 2025 10:01:11 +0100
Subject: vsock/virtio: Resize receive buffers so that each SKB fits in a 4K
 page

When allocating receive buffers for the vsock virtio RX virtqueue, an
SKB is allocated with a 4140 data payload (the 44-byte packet header +
VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE). Even when factoring in the SKB
overhead, the resulting 8KiB allocation thanks to the rounding in
kmalloc_reserve() is wasteful (~3700 unusable bytes) and results in a
higher-order page allocation on systems with 4KiB pages just for the
sake of a few hundred bytes of packet data.

Limit the vsock virtio RX buffers to 4KiB per SKB, resulting in much
better memory utilisation and removing the need to allocate higher-order
pages entirely.

Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: Will Deacon <will@kernel.org>
Message-Id: <20250717090116.11987-5-will@kernel.org>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/virtio_vsock.h     | 7 ++++++-
 net/vmw_vsock/virtio_transport.c | 2 +-
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index 97465f378ade..879f1dfa7d3a 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -106,7 +106,12 @@ static inline size_t virtio_vsock_skb_len(struct sk_buff *skb)
 	return (size_t)(skb_end_pointer(skb) - skb->head);
 }
 
-#define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE	(1024 * 4)
+/* Dimension the RX SKB so that the entire thing fits exactly into
+ * a single 4KiB page. This avoids wasting memory due to alloc_skb()
+ * rounding up to the next page order and also means that we
+ * don't leave higher-order pages sitting around in the RX queue.
+ */
+#define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE	SKB_WITH_OVERHEAD(1024 * 4)
 #define VIRTIO_VSOCK_MAX_BUF_SIZE		0xFFFFFFFFUL
 #define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE		(1024 * 64)
 
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 0166919f8705..39f346890f7f 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -307,7 +307,7 @@ out_rcu:
 
 static void virtio_vsock_rx_fill(struct virtio_vsock *vsock)
 {
-	int total_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE + VIRTIO_VSOCK_SKB_HEADROOM;
+	int total_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE;
 	struct scatterlist pkt, *p;
 	struct virtqueue *vq;
 	struct sk_buff *skb;
-- 
cgit v1.2.3


From 2304c64a2866c58534560c63dc6e79d09b8f8d8d Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Thu, 17 Jul 2025 10:01:12 +0100
Subject: vsock/virtio: Rename virtio_vsock_alloc_skb()

In preparation for nonlinear allocations for large SKBs, rename
virtio_vsock_alloc_skb() to virtio_vsock_alloc_linear_skb() to indicate
that it returns linear SKBs unconditionally and switch all callers over
to this new interface for now.

No functional change.

Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: Will Deacon <will@kernel.org>
Message-Id: <20250717090116.11987-6-will@kernel.org>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vhost/vsock.c                   | 2 +-
 include/linux/virtio_vsock.h            | 3 ++-
 net/vmw_vsock/virtio_transport.c        | 2 +-
 net/vmw_vsock/virtio_transport_common.c | 2 +-
 4 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 4c4a642945eb..1ad96613680e 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -348,7 +348,7 @@ vhost_vsock_alloc_skb(struct vhost_virtqueue *vq,
 		return NULL;
 
 	/* len contains both payload and hdr */
-	skb = virtio_vsock_alloc_skb(len, GFP_KERNEL);
+	skb = virtio_vsock_alloc_linear_skb(len, GFP_KERNEL);
 	if (!skb)
 		return NULL;
 
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index 879f1dfa7d3a..4504ea29ff82 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -52,7 +52,8 @@ static inline void virtio_vsock_skb_rx_put(struct sk_buff *skb, u32 len)
 	skb_put(skb, len);
 }
 
-static inline struct sk_buff *virtio_vsock_alloc_skb(unsigned int size, gfp_t mask)
+static inline struct sk_buff *
+virtio_vsock_alloc_linear_skb(unsigned int size, gfp_t mask)
 {
 	struct sk_buff *skb;
 
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 39f346890f7f..80dcf6ac1e72 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -316,7 +316,7 @@ static void virtio_vsock_rx_fill(struct virtio_vsock *vsock)
 	vq = vsock->vqs[VSOCK_VQ_RX];
 
 	do {
-		skb = virtio_vsock_alloc_skb(total_len, GFP_KERNEL);
+		skb = virtio_vsock_alloc_linear_skb(total_len, GFP_KERNEL);
 		if (!skb)
 			break;
 
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 1b5d9896edae..c9eb7f7ac00d 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -261,7 +261,7 @@ static struct sk_buff *virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *
 	if (!zcopy)
 		skb_len += payload_len;
 
-	skb = virtio_vsock_alloc_skb(skb_len, GFP_KERNEL);
+	skb = virtio_vsock_alloc_linear_skb(skb_len, GFP_KERNEL);
 	if (!skb)
 		return NULL;
 
-- 
cgit v1.2.3


From fac6b82e0f3eaca33c8c67ec401681b21143ae17 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Thu, 17 Jul 2025 10:01:13 +0100
Subject: vsock/virtio: Move SKB allocation lower-bound check to callers

virtio_vsock_alloc_linear_skb() checks that the requested size is at
least big enough for the packet header (VIRTIO_VSOCK_SKB_HEADROOM).

Of the three callers of virtio_vsock_alloc_linear_skb(), only
vhost_vsock_alloc_skb() can potentially pass a packet smaller than the
header size and, as it already has a check against the maximum packet
size, extend its bounds checking to consider the minimum packet size
and remove the check from virtio_vsock_alloc_linear_skb().

Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: Will Deacon <will@kernel.org>
Message-Id: <20250717090116.11987-7-will@kernel.org>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vhost/vsock.c        | 3 ++-
 include/linux/virtio_vsock.h | 3 ---
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 1ad96613680e..24b7547b05a6 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -344,7 +344,8 @@ vhost_vsock_alloc_skb(struct vhost_virtqueue *vq,
 
 	len = iov_length(vq->iov, out);
 
-	if (len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE + VIRTIO_VSOCK_SKB_HEADROOM)
+	if (len < VIRTIO_VSOCK_SKB_HEADROOM ||
+	    len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE + VIRTIO_VSOCK_SKB_HEADROOM)
 		return NULL;
 
 	/* len contains both payload and hdr */
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index 4504ea29ff82..36dd0cd55368 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -57,9 +57,6 @@ virtio_vsock_alloc_linear_skb(unsigned int size, gfp_t mask)
 {
 	struct sk_buff *skb;
 
-	if (size < VIRTIO_VSOCK_SKB_HEADROOM)
-		return NULL;
-
 	skb = alloc_skb(size, mask);
 	if (!skb)
 		return NULL;
-- 
cgit v1.2.3


From ab9aa2f3afc2713c14f6c4c6b90c9a0933b837f1 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Thu, 17 Jul 2025 10:01:14 +0100
Subject: vhost/vsock: Allocate nonlinear SKBs for handling large receive
 buffers

When receiving a packet from a guest, vhost_vsock_handle_tx_kick()
calls vhost_vsock_alloc_skb() to allocate and fill an SKB with
the receive data. Unfortunately, these are always linear allocations and
can therefore result in significant pressure on kmalloc() considering
that the maximum packet size (VIRTIO_VSOCK_MAX_PKT_BUF_SIZE +
VIRTIO_VSOCK_SKB_HEADROOM) is a little over 64KiB, resulting in a 128KiB
allocation for each packet.

Rework the vsock SKB allocation so that, for sizes with page order
greater than PAGE_ALLOC_COSTLY_ORDER, a nonlinear SKB is allocated
instead with the packet header in the SKB and the receive data in the
fragments. Finally, add a debug warning if virtio_vsock_skb_rx_put() is
ever called on an SKB with a non-zero length, as this would be
destructive for the nonlinear case.

Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: Will Deacon <will@kernel.org>
Message-Id: <20250717090116.11987-8-will@kernel.org>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vhost/vsock.c        |  8 +++-----
 include/linux/virtio_vsock.h | 32 +++++++++++++++++++++++++++++---
 2 files changed, 32 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 24b7547b05a6..0679a706ebc0 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -349,7 +349,7 @@ vhost_vsock_alloc_skb(struct vhost_virtqueue *vq,
 		return NULL;
 
 	/* len contains both payload and hdr */
-	skb = virtio_vsock_alloc_linear_skb(len, GFP_KERNEL);
+	skb = virtio_vsock_alloc_skb(len, GFP_KERNEL);
 	if (!skb)
 		return NULL;
 
@@ -378,10 +378,8 @@ vhost_vsock_alloc_skb(struct vhost_virtqueue *vq,
 
 	virtio_vsock_skb_rx_put(skb, payload_len);
 
-	nbytes = copy_from_iter(skb->data, payload_len, &iov_iter);
-	if (nbytes != payload_len) {
-		vq_err(vq, "Expected %zu byte payload, got %zu bytes\n",
-		       payload_len, nbytes);
+	if (skb_copy_datagram_from_iter(skb, 0, &iov_iter, payload_len)) {
+		vq_err(vq, "Failed to copy %zu byte payload\n", payload_len);
 		kfree_skb(skb);
 		return NULL;
 	}
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index 36dd0cd55368..fa5934ea9c81 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -49,22 +49,48 @@ static inline void virtio_vsock_skb_clear_tap_delivered(struct sk_buff *skb)
 
 static inline void virtio_vsock_skb_rx_put(struct sk_buff *skb, u32 len)
 {
-	skb_put(skb, len);
+	DEBUG_NET_WARN_ON_ONCE(skb->len);
+
+	if (skb_is_nonlinear(skb))
+		skb->len = len;
+	else
+		skb_put(skb, len);
 }
 
 static inline struct sk_buff *
-virtio_vsock_alloc_linear_skb(unsigned int size, gfp_t mask)
+__virtio_vsock_alloc_skb_with_frags(unsigned int header_len,
+				    unsigned int data_len,
+				    gfp_t mask)
 {
 	struct sk_buff *skb;
+	int err;
 
-	skb = alloc_skb(size, mask);
+	skb = alloc_skb_with_frags(header_len, data_len,
+				   PAGE_ALLOC_COSTLY_ORDER, &err, mask);
 	if (!skb)
 		return NULL;
 
 	skb_reserve(skb, VIRTIO_VSOCK_SKB_HEADROOM);
+	skb->data_len = data_len;
 	return skb;
 }
 
+static inline struct sk_buff *
+virtio_vsock_alloc_linear_skb(unsigned int size, gfp_t mask)
+{
+	return __virtio_vsock_alloc_skb_with_frags(size, 0, mask);
+}
+
+static inline struct sk_buff *virtio_vsock_alloc_skb(unsigned int size, gfp_t mask)
+{
+	if (size <= SKB_WITH_OVERHEAD(PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
+		return virtio_vsock_alloc_linear_skb(size, mask);
+
+	size -= VIRTIO_VSOCK_SKB_HEADROOM;
+	return __virtio_vsock_alloc_skb_with_frags(VIRTIO_VSOCK_SKB_HEADROOM,
+						   size, mask);
+}
+
 static inline void
 virtio_vsock_skb_queue_head(struct sk_buff_head *list, struct sk_buff *skb)
 {
-- 
cgit v1.2.3


From 8ca76151d2c8219edea82f1925a2a25907ff6a9d Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Thu, 17 Jul 2025 10:01:15 +0100
Subject: vsock/virtio: Rename virtio_vsock_skb_rx_put()

In preparation for using virtio_vsock_skb_rx_put() when populating SKBs
on the vsock TX path, rename virtio_vsock_skb_rx_put() to
virtio_vsock_skb_put().

No functional change.

Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: Will Deacon <will@kernel.org>
Message-Id: <20250717090116.11987-9-will@kernel.org>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vhost/vsock.c            | 2 +-
 include/linux/virtio_vsock.h     | 2 +-
 net/vmw_vsock/virtio_transport.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 0679a706ebc0..ae01457ea2cd 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -376,7 +376,7 @@ vhost_vsock_alloc_skb(struct vhost_virtqueue *vq,
 		return NULL;
 	}
 
-	virtio_vsock_skb_rx_put(skb, payload_len);
+	virtio_vsock_skb_put(skb, payload_len);
 
 	if (skb_copy_datagram_from_iter(skb, 0, &iov_iter, payload_len)) {
 		vq_err(vq, "Failed to copy %zu byte payload\n", payload_len);
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index fa5934ea9c81..0c67543a45c8 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -47,7 +47,7 @@ static inline void virtio_vsock_skb_clear_tap_delivered(struct sk_buff *skb)
 	VIRTIO_VSOCK_SKB_CB(skb)->tap_delivered = false;
 }
 
-static inline void virtio_vsock_skb_rx_put(struct sk_buff *skb, u32 len)
+static inline void virtio_vsock_skb_put(struct sk_buff *skb, u32 len)
 {
 	DEBUG_NET_WARN_ON_ONCE(skb->len);
 
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 80dcf6ac1e72..b6569b0ca2bb 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -657,7 +657,7 @@ static void virtio_transport_rx_work(struct work_struct *work)
 			}
 
 			if (payload_len)
-				virtio_vsock_skb_rx_put(skb, payload_len);
+				virtio_vsock_skb_put(skb, payload_len);
 
 			virtio_transport_deliver_tap_pkt(skb);
 			virtio_transport_recv_pkt(&virtio_transport, skb);
-- 
cgit v1.2.3


From 9843cf7b6fd6f938c16fde51e86dd0e3ddbefb12 Mon Sep 17 00:00:00 2001
From: Baojun Xu <baojun.xu@ti.com>
Date: Fri, 1 Aug 2025 10:16:18 +0800
Subject: ASoC: tas2781: Fix the wrong step for TLV on tas2781

The step for TLV on tas2781 should be 50 (-0.5dB).

Fixes: 678f38eba1f2 ("ASoC: tas2781: Add Header file for tas2781 driver")
Signed-off-by: Baojun Xu <baojun.xu@ti.com>
Link: https://patch.msgid.link/20250801021618.64627-1-baojun.xu@ti.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/tas2781-tlv.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/sound/tas2781-tlv.h b/include/sound/tas2781-tlv.h
index d87263e43fdb..ef9b9f19d212 100644
--- a/include/sound/tas2781-tlv.h
+++ b/include/sound/tas2781-tlv.h
@@ -15,7 +15,7 @@
 #ifndef __TAS2781_TLV_H__
 #define __TAS2781_TLV_H__
 
-static const __maybe_unused DECLARE_TLV_DB_SCALE(dvc_tlv, -10000, 100, 0);
+static const __maybe_unused DECLARE_TLV_DB_SCALE(dvc_tlv, -10000, 50, 0);
 static const __maybe_unused DECLARE_TLV_DB_SCALE(amp_vol_tlv, 1100, 50, 0);
 
 #endif
-- 
cgit v1.2.3


From 788fa4b47cdcd9b3d8c2d02ac0b3cd2540305f18 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Fri, 1 Aug 2025 16:37:24 -0400
Subject: tracing: Add guard(ring_buffer_nest)

Some calls to the tracing ring buffer can happen when the ring buffer is
already being written to by the same context (for example, a
trace_printk() in between a ring_buffer_lock_reserve() and a
ring_buffer_unlock_commit()).

In order to not trigger the recursion detection, these functions use
ring_buffer_nest_start() and ring_buffer_nest_end(). Create a guard() for
these functions so that their use cases can be simplified and not need to
use goto for the release.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: https://lore.kernel.org/20250801203857.710501021@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ring_buffer.h       |  3 ++
 kernel/trace/trace.c              | 69 ++++++++++++++++-----------------------
 kernel/trace/trace_events_synth.c |  6 ++--
 3 files changed, 34 insertions(+), 44 deletions(-)

(limited to 'include')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index cd7f0ae26615..8253cb69540c 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -144,6 +144,9 @@ int ring_buffer_write(struct trace_buffer *buffer,
 void ring_buffer_nest_start(struct trace_buffer *buffer);
 void ring_buffer_nest_end(struct trace_buffer *buffer);
 
+DEFINE_GUARD(ring_buffer_nest, struct trace_buffer *,
+	     ring_buffer_nest_start(_T), ring_buffer_nest_end(_T))
+
 struct ring_buffer_event *
 ring_buffer_peek(struct trace_buffer *buffer, int cpu, u64 *ts,
 		 unsigned long *lost_events);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0ec9cab9a812..332487179e1d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1160,13 +1160,11 @@ int __trace_array_puts(struct trace_array *tr, unsigned long ip,
 
 	trace_ctx = tracing_gen_ctx();
 	buffer = tr->array_buffer.buffer;
-	ring_buffer_nest_start(buffer);
+	guard(ring_buffer_nest)(buffer);
 	event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
 					    trace_ctx);
-	if (!event) {
-		size = 0;
-		goto out;
-	}
+	if (!event)
+		return 0;
 
 	entry = ring_buffer_event_data(event);
 	entry->ip = ip;
@@ -1182,8 +1180,6 @@ int __trace_array_puts(struct trace_array *tr, unsigned long ip,
 
 	__buffer_unlock_commit(buffer, event);
 	ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL);
- out:
-	ring_buffer_nest_end(buffer);
 	return size;
 }
 EXPORT_SYMBOL_GPL(__trace_array_puts);
@@ -1213,7 +1209,6 @@ int __trace_bputs(unsigned long ip, const char *str)
 	struct bputs_entry *entry;
 	unsigned int trace_ctx;
 	int size = sizeof(struct bputs_entry);
-	int ret = 0;
 
 	if (!printk_binsafe(tr))
 		return __trace_puts(ip, str, strlen(str));
@@ -1227,11 +1222,11 @@ int __trace_bputs(unsigned long ip, const char *str)
 	trace_ctx = tracing_gen_ctx();
 	buffer = tr->array_buffer.buffer;
 
-	ring_buffer_nest_start(buffer);
+	guard(ring_buffer_nest)(buffer);
 	event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
 					    trace_ctx);
 	if (!event)
-		goto out;
+		return 0;
 
 	entry = ring_buffer_event_data(event);
 	entry->ip			= ip;
@@ -1240,10 +1235,7 @@ int __trace_bputs(unsigned long ip, const char *str)
 	__buffer_unlock_commit(buffer, event);
 	ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL);
 
-	ret = 1;
- out:
-	ring_buffer_nest_end(buffer);
-	return ret;
+	return 1;
 }
 EXPORT_SYMBOL_GPL(__trace_bputs);
 
@@ -3397,21 +3389,19 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 
 	size = sizeof(*entry) + sizeof(u32) * len;
 	buffer = tr->array_buffer.buffer;
-	ring_buffer_nest_start(buffer);
-	event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
-					    trace_ctx);
-	if (!event)
-		goto out;
-	entry = ring_buffer_event_data(event);
-	entry->ip			= ip;
-	entry->fmt			= fmt;
-
-	memcpy(entry->buf, tbuffer, sizeof(u32) * len);
-	__buffer_unlock_commit(buffer, event);
-	ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL);
+	scoped_guard(ring_buffer_nest, buffer) {
+		event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
+						    trace_ctx);
+		if (!event)
+			goto out_put;
+		entry = ring_buffer_event_data(event);
+		entry->ip			= ip;
+		entry->fmt			= fmt;
 
-out:
-	ring_buffer_nest_end(buffer);
+		memcpy(entry->buf, tbuffer, sizeof(u32) * len);
+		__buffer_unlock_commit(buffer, event);
+		ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL);
+	}
 out_put:
 	put_trace_buf();
 
@@ -3452,20 +3442,19 @@ int __trace_array_vprintk(struct trace_buffer *buffer,
 	len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args);
 
 	size = sizeof(*entry) + len + 1;
-	ring_buffer_nest_start(buffer);
-	event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
-					    trace_ctx);
-	if (!event)
-		goto out;
-	entry = ring_buffer_event_data(event);
-	entry->ip = ip;
-
-	memcpy(&entry->buf, tbuffer, len + 1);
-	__buffer_unlock_commit(buffer, event);
-	ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL);
+	scoped_guard(ring_buffer_nest, buffer) {
+		event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
+						    trace_ctx);
+		if (!event)
+			goto out;
+		entry = ring_buffer_event_data(event);
+		entry->ip = ip;
 
+		memcpy(&entry->buf, tbuffer, len + 1);
+		__buffer_unlock_commit(buffer, event);
+		ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL);
+	}
 out:
-	ring_buffer_nest_end(buffer);
 	put_trace_buf();
 
 out_nobuffer:
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 33cfbd4ed76d..f24ee61f8884 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -536,12 +536,12 @@ static notrace void trace_event_raw_event_synth(void *__data,
 	 * is being performed within another event.
 	 */
 	buffer = trace_file->tr->array_buffer.buffer;
-	ring_buffer_nest_start(buffer);
+	guard(ring_buffer_nest)(buffer);
 
 	entry = trace_event_buffer_reserve(&fbuffer, trace_file,
 					   sizeof(*entry) + fields_size);
 	if (!entry)
-		goto out;
+		return;
 
 	for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
 		val_idx = var_ref_idx[i];
@@ -584,8 +584,6 @@ static notrace void trace_event_raw_event_synth(void *__data,
 	}
 
 	trace_event_buffer_commit(&fbuffer);
-out:
-	ring_buffer_nest_end(buffer);
 }
 
 static void free_synth_event_print_fmt(struct trace_event_call *call)
-- 
cgit v1.2.3


From d45cf1e7d7180256e17c9ce88e32e8061a7887fe Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 30 Jul 2025 13:17:38 +0000
Subject: ipv6: reject malicious packets in ipv6_gso_segment()

syzbot was able to craft a packet with very long IPv6 extension headers
leading to an overflow of skb->transport_header.

This 16bit field has a limited range.

Add skb_reset_transport_header_careful() helper and use it
from ipv6_gso_segment()

WARNING: CPU: 0 PID: 5871 at ./include/linux/skbuff.h:3032 skb_reset_transport_header include/linux/skbuff.h:3032 [inline]
WARNING: CPU: 0 PID: 5871 at ./include/linux/skbuff.h:3032 ipv6_gso_segment+0x15e2/0x21e0 net/ipv6/ip6_offload.c:151
Modules linked in:
CPU: 0 UID: 0 PID: 5871 Comm: syz-executor211 Not tainted 6.16.0-rc6-syzkaller-g7abc678e3084 #0 PREEMPT(full)
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/12/2025
 RIP: 0010:skb_reset_transport_header include/linux/skbuff.h:3032 [inline]
 RIP: 0010:ipv6_gso_segment+0x15e2/0x21e0 net/ipv6/ip6_offload.c:151
Call Trace:
 <TASK>
  skb_mac_gso_segment+0x31c/0x640 net/core/gso.c:53
  nsh_gso_segment+0x54a/0xe10 net/nsh/nsh.c:110
  skb_mac_gso_segment+0x31c/0x640 net/core/gso.c:53
  __skb_gso_segment+0x342/0x510 net/core/gso.c:124
  skb_gso_segment include/net/gso.h:83 [inline]
  validate_xmit_skb+0x857/0x11b0 net/core/dev.c:3950
  validate_xmit_skb_list+0x84/0x120 net/core/dev.c:4000
  sch_direct_xmit+0xd3/0x4b0 net/sched/sch_generic.c:329
  __dev_xmit_skb net/core/dev.c:4102 [inline]
  __dev_queue_xmit+0x17b6/0x3a70 net/core/dev.c:4679

Fixes: d1da932ed4ec ("ipv6: Separate ipv6 offload support")
Reported-by: syzbot+af43e647fd835acc02df@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/netdev/688a1a05.050a0220.5d226.0008.GAE@google.com/T/#u
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Dawid Osuchowski <dawid.osuchowski@linux.intel.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20250730131738.3385939-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 23 +++++++++++++++++++++++
 net/ipv6/ip6_offload.c |  4 +++-
 2 files changed, 26 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b8b06e71b73e..14b923ddb6df 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3033,6 +3033,29 @@ static inline void skb_reset_transport_header(struct sk_buff *skb)
 	skb->transport_header = offset;
 }
 
+/**
+ * skb_reset_transport_header_careful - conditionally reset transport header
+ * @skb: buffer to alter
+ *
+ * Hardened version of skb_reset_transport_header().
+ *
+ * Returns: true if the operation was a success.
+ */
+static inline bool __must_check
+skb_reset_transport_header_careful(struct sk_buff *skb)
+{
+	long offset = skb->data - skb->head;
+
+	if (unlikely(offset != (typeof(skb->transport_header))offset))
+		return false;
+
+	if (unlikely(offset == (typeof(skb->transport_header))~0U))
+		return false;
+
+	skb->transport_header = offset;
+	return true;
+}
+
 static inline void skb_set_transport_header(struct sk_buff *skb,
 					    const int offset)
 {
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 9822163428b0..fce91183797a 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -148,7 +148,9 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 
 	ops = rcu_dereference(inet6_offloads[proto]);
 	if (likely(ops && ops->callbacks.gso_segment)) {
-		skb_reset_transport_header(skb);
+		if (!skb_reset_transport_header_careful(skb))
+			goto out;
+
 		segs = ops->callbacks.gso_segment(skb, features);
 		if (!segs)
 			skb->network_header = skb_mac_header(skb) + nhoff - skb->head;
-- 
cgit v1.2.3


From d46e51f1c78b9ab9323610feb14238d06d46d519 Mon Sep 17 00:00:00 2001
From: Wang Liang <wangliang74@huawei.com>
Date: Wed, 30 Jul 2025 18:14:58 +0800
Subject: net: drop UFO packets in udp_rcv_segment()

When sending a packet with a virtio_net_hdr to a tun device, if the gso_type
in the virtio_net_hdr is SKB_GSO_UDP and the gso_size is less than the udphdr
size, the crash below may happen.

  ------------[ cut here ]------------
  kernel BUG at net/core/skbuff.c:4572!
  Oops: invalid opcode: 0000 [#1] SMP NOPTI
  CPU: 0 UID: 0 PID: 62 Comm: mytest Not tainted 6.16.0-rc7 #203 PREEMPT(voluntary)
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
  RIP: 0010:skb_pull_rcsum+0x8e/0xa0
  Code: 00 00 5b c3 cc cc cc cc 8b 93 88 00 00 00 f7 da e8 37 44 38 00 f7 d8 89 83 88 00 00 00 48 8b 83 c8 00 00 00 5b c3 cc cc cc cc <0f> 0b 0f 0b 66 66 2e 0f 1f 84 00 000
  RSP: 0018:ffffc900001fba38 EFLAGS: 00000297
  RAX: 0000000000000004 RBX: ffff8880040c1000 RCX: ffffc900001fb948
  RDX: ffff888003e6d700 RSI: 0000000000000008 RDI: ffff88800411a062
  RBP: ffff8880040c1000 R08: 0000000000000000 R09: 0000000000000001
  R10: ffff888003606c00 R11: 0000000000000001 R12: 0000000000000000
  R13: ffff888004060900 R14: ffff888004050000 R15: ffff888004060900
  FS:  000000002406d3c0(0000) GS:ffff888084a19000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 0000000020000040 CR3: 0000000004007000 CR4: 00000000000006f0
  Call Trace:
   <TASK>
   udp_queue_rcv_one_skb+0x176/0x4b0 net/ipv4/udp.c:2445
   udp_queue_rcv_skb+0x155/0x1f0 net/ipv4/udp.c:2475
   udp_unicast_rcv_skb+0x71/0x90 net/ipv4/udp.c:2626
   __udp4_lib_rcv+0x433/0xb00 net/ipv4/udp.c:2690
   ip_protocol_deliver_rcu+0xa6/0x160 net/ipv4/ip_input.c:205
   ip_local_deliver_finish+0x72/0x90 net/ipv4/ip_input.c:233
   ip_sublist_rcv_finish+0x5f/0x70 net/ipv4/ip_input.c:579
   ip_sublist_rcv+0x122/0x1b0 net/ipv4/ip_input.c:636
   ip_list_rcv+0xf7/0x130 net/ipv4/ip_input.c:670
   __netif_receive_skb_list_core+0x21d/0x240 net/core/dev.c:6067
   netif_receive_skb_list_internal+0x186/0x2b0 net/core/dev.c:6210
   napi_complete_done+0x78/0x180 net/core/dev.c:6580
   tun_get_user+0xa63/0x1120 drivers/net/tun.c:1909
   tun_chr_write_iter+0x65/0xb0 drivers/net/tun.c:1984
   vfs_write+0x300/0x420 fs/read_write.c:593
   ksys_write+0x60/0xd0 fs/read_write.c:686
   do_syscall_64+0x50/0x1c0 arch/x86/entry/syscall_64.c:63
   </TASK>

To trigger GSO segmentation in udp_queue_rcv_skb(), we must also set the
UDP_ENCAP_ESPINUDP option to enable udp_sk(sk)->encap_rcv. When the encap_rcv
hook returns 1 in udp_queue_rcv_one_skb(), udp_csum_pull_header() will try
to pull the udphdr, but the skb has already been segmented to the gso size,
which leads to this crash.

The previous commit cf329aa42b66 ("udp: cope with UDP GRO packet misdirection")
introduced segmentation in the UDP receive path only for GRO; it was never
intended to be used for UFO, so drop UFO packets in udp_rcv_segment().

Link: https://lore.kernel.org/netdev/20250724083005.3918375-1-wangliang74@huawei.com/
Link: https://lore.kernel.org/netdev/20250729123907.3318425-1-wangliang74@huawei.com/
Fixes: cf329aa42b66 ("udp: cope with UDP GRO packet misdirection")
Suggested-by: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
Signed-off-by: Wang Liang <wangliang74@huawei.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20250730101458.3470788-1-wangliang74@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/udp.h | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/udp.h b/include/net/udp.h
index f8ae2c4ade14..e2af3bda90c9 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -586,6 +586,16 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk,
 {
 	netdev_features_t features = NETIF_F_SG;
 	struct sk_buff *segs;
+	int drop_count;
+
+	/*
+	 * Segmentation in UDP receive path is only for UDP GRO, drop udp
+	 * fragmentation offload (UFO) packets.
+	 */
+	if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP) {
+		drop_count = 1;
+		goto drop;
+	}
 
 	/* Avoid csum recalculation by skb_segment unless userspace explicitly
 	 * asks for the final checksum values
@@ -609,16 +619,18 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk,
 	 */
 	segs = __skb_gso_segment(skb, features, false);
 	if (IS_ERR_OR_NULL(segs)) {
-		int segs_nr = skb_shinfo(skb)->gso_segs;
-
-		atomic_add(segs_nr, &sk->sk_drops);
-		SNMP_ADD_STATS(__UDPX_MIB(sk, ipv4), UDP_MIB_INERRORS, segs_nr);
-		kfree_skb(skb);
-		return NULL;
+		drop_count = skb_shinfo(skb)->gso_segs;
+		goto drop;
 	}
 
 	consume_skb(skb);
 	return segs;
+
+drop:
+	atomic_add(drop_count, &sk->sk_drops);
+	SNMP_ADD_STATS(__UDPX_MIB(sk, ipv4), UDP_MIB_INERRORS, drop_count);
+	kfree_skb(skb);
+	return NULL;
 }
 
 static inline void udp_post_segment_fix_csum(struct sk_buff *skb)
-- 
cgit v1.2.3


From 1dbf1d590d10a6d1978e8184f8dfe20af22d680a Mon Sep 17 00:00:00 2001
From: Sharath Chandra Vurukala <quic_sharathv@quicinc.com>
Date: Wed, 30 Jul 2025 16:21:18 +0530
Subject: net: Add locking to protect skb->dev access in ip_output

In ip_output(), skb->dev is updated from skb_dst(skb)->dev, which
can become invalid when the interface is unregistered and freed.

Introduce a new skb_dst_dev_rcu() function to be used instead of
skb_dst_dev() within an RCU read-side critical section in ip_output().
This ensures that all the skbs associated with the dev being
deregistered are transmitted out first, before the dev is freed.

Given that ip_output() is called within an rcu_read_lock()
critical section or from a bottom-half context, it is safe to introduce
an RCU read-side critical section within it.

Multiple panic call stacks were observed when UL traffic was run
in concurrency with device deregistration from different functions,
pasting one sample for reference.

[496733.627565][T13385] Call trace:
[496733.627570][T13385] bpf_prog_ce7c9180c3b128ea_cgroupskb_egres+0x24c/0x7f0
[496733.627581][T13385] __cgroup_bpf_run_filter_skb+0x128/0x498
[496733.627595][T13385] ip_finish_output+0xa4/0xf4
[496733.627605][T13385] ip_output+0x100/0x1a0
[496733.627613][T13385] ip_send_skb+0x68/0x100
[496733.627618][T13385] udp_send_skb+0x1c4/0x384
[496733.627625][T13385] udp_sendmsg+0x7b0/0x898
[496733.627631][T13385] inet_sendmsg+0x5c/0x7c
[496733.627639][T13385] __sys_sendto+0x174/0x1e4
[496733.627647][T13385] __arm64_sys_sendto+0x28/0x3c
[496733.627653][T13385] invoke_syscall+0x58/0x11c
[496733.627662][T13385] el0_svc_common+0x88/0xf4
[496733.627669][T13385] do_el0_svc+0x2c/0xb0
[496733.627676][T13385] el0_svc+0x2c/0xa4
[496733.627683][T13385] el0t_64_sync_handler+0x68/0xb4
[496733.627689][T13385] el0t_64_sync+0x1a4/0x1a8

Changes in v3:
- Replaced WARN_ON() with  WARN_ON_ONCE(), as suggested by Willem de Bruijn.
- Dropped legacy lines mistakenly pulled in from an outdated branch.

Changes in v2:
- Addressed review comments from Eric Dumazet
- Used READ_ONCE() to prevent potential load/store tearing
- Added skb_dst_dev_rcu() and used along with rcu_read_lock() in ip_output

Signed-off-by: Sharath Chandra Vurukala <quic_sharathv@quicinc.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250730105118.GA26100@hu-sharathv-hyd.qualcomm.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dst.h    | 12 ++++++++++++
 net/ipv4/ip_output.c | 15 ++++++++++-----
 2 files changed, 22 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/dst.h b/include/net/dst.h
index 00467c1b5093..bab01363bb97 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -568,11 +568,23 @@ static inline struct net_device *dst_dev(const struct dst_entry *dst)
 	return READ_ONCE(dst->dev);
 }
 
+static inline struct net_device *dst_dev_rcu(const struct dst_entry *dst)
+{
+	/* In the future, use rcu_dereference(dst->dev) */
+	WARN_ON_ONCE(!rcu_read_lock_held());
+	return READ_ONCE(dst->dev);
+}
+
 static inline struct net_device *skb_dst_dev(const struct sk_buff *skb)
 {
 	return dst_dev(skb_dst(skb));
 }
 
+static inline struct net_device *skb_dst_dev_rcu(const struct sk_buff *skb)
+{
+	return dst_dev_rcu(skb_dst(skb));
+}
+
 static inline struct net *skb_dst_dev_net(const struct sk_buff *skb)
 {
 	return dev_net(skb_dst_dev(skb));
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 10a1d182fd84..84e7f8a2f50f 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -425,15 +425,20 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 
 int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
-	struct net_device *dev = skb_dst_dev(skb), *indev = skb->dev;
+	struct net_device *dev, *indev = skb->dev;
+	int ret_val;
 
+	rcu_read_lock();
+	dev = skb_dst_dev_rcu(skb);
 	skb->dev = dev;
 	skb->protocol = htons(ETH_P_IP);
 
-	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
-			    net, sk, skb, indev, dev,
-			    ip_finish_output,
-			    !(IPCB(skb)->flags & IPSKB_REROUTED));
+	ret_val = NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
+				net, sk, skb, indev, dev,
+				ip_finish_output,
+				!(IPCB(skb)->flags & IPSKB_REROUTED));
+	rcu_read_unlock();
+	return ret_val;
 }
 EXPORT_SYMBOL(ip_output);
 
-- 
cgit v1.2.3


From 6c6d8f8ba7789c221a2e4c43a0ed982c7a41f428 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Wed, 16 Jul 2025 14:32:45 +0100
Subject: lib/xxhash: remove unused functions

xxh32_digest() and xxh32_update() were added in 2017 in the original
xxhash commit, but have remained unused.

Remove them.

Link: https://lkml.kernel.org/r/20250716133245.243363-1-linux@treblig.org
Signed-off-by: Dr. David Alan Gilbert <linux@treblig.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Dave Gilbert <linux@treblig.org>
Cc: Nick Terrell <terrelln@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/xxhash.h |  26 ------------
 lib/xxhash.c           | 107 -------------------------------------------------
 2 files changed, 133 deletions(-)

(limited to 'include')

diff --git a/include/linux/xxhash.h b/include/linux/xxhash.h
index df42511438d0..27f57eca8cb1 100644
--- a/include/linux/xxhash.h
+++ b/include/linux/xxhash.h
@@ -177,32 +177,6 @@ struct xxh64_state {
  */
 void xxh32_reset(struct xxh32_state *state, uint32_t seed);
 
-/**
- * xxh32_update() - hash the data given and update the xxh32 state
- *
- * @state:  The xxh32 state to update.
- * @input:  The data to hash.
- * @length: The length of the data to hash.
- *
- * After calling xxh32_reset() call xxh32_update() as many times as necessary.
- *
- * Return:  Zero on success, otherwise an error code.
- */
-int xxh32_update(struct xxh32_state *state, const void *input, size_t length);
-
-/**
- * xxh32_digest() - produce the current xxh32 hash
- *
- * @state: Produce the current xxh32 hash of this state.
- *
- * A hash value can be produced at any time. It is still possible to continue
- * inserting input into the hash state after a call to xxh32_digest(), and
- * generate new hashes later on, by calling xxh32_digest() again.
- *
- * Return: The xxh32 hash stored in the state.
- */
-uint32_t xxh32_digest(const struct xxh32_state *state);
-
 /**
  * xxh64_reset() - reset the xxh64 state to start a new hashing operation
  *
diff --git a/lib/xxhash.c b/lib/xxhash.c
index b5bd567aa6b3..cf629766f376 100644
--- a/lib/xxhash.c
+++ b/lib/xxhash.c
@@ -267,113 +267,6 @@ void xxh64_reset(struct xxh64_state *statePtr, const uint64_t seed)
 }
 EXPORT_SYMBOL(xxh64_reset);
 
-int xxh32_update(struct xxh32_state *state, const void *input, const size_t len)
-{
-	const uint8_t *p = (const uint8_t *)input;
-	const uint8_t *const b_end = p + len;
-
-	if (input == NULL)
-		return -EINVAL;
-
-	state->total_len_32 += (uint32_t)len;
-	state->large_len |= (len >= 16) | (state->total_len_32 >= 16);
-
-	if (state->memsize + len < 16) { /* fill in tmp buffer */
-		memcpy((uint8_t *)(state->mem32) + state->memsize, input, len);
-		state->memsize += (uint32_t)len;
-		return 0;
-	}
-
-	if (state->memsize) { /* some data left from previous update */
-		const uint32_t *p32 = state->mem32;
-
-		memcpy((uint8_t *)(state->mem32) + state->memsize, input,
-			16 - state->memsize);
-
-		state->v1 = xxh32_round(state->v1, get_unaligned_le32(p32));
-		p32++;
-		state->v2 = xxh32_round(state->v2, get_unaligned_le32(p32));
-		p32++;
-		state->v3 = xxh32_round(state->v3, get_unaligned_le32(p32));
-		p32++;
-		state->v4 = xxh32_round(state->v4, get_unaligned_le32(p32));
-		p32++;
-
-		p += 16-state->memsize;
-		state->memsize = 0;
-	}
-
-	if (p <= b_end - 16) {
-		const uint8_t *const limit = b_end - 16;
-		uint32_t v1 = state->v1;
-		uint32_t v2 = state->v2;
-		uint32_t v3 = state->v3;
-		uint32_t v4 = state->v4;
-
-		do {
-			v1 = xxh32_round(v1, get_unaligned_le32(p));
-			p += 4;
-			v2 = xxh32_round(v2, get_unaligned_le32(p));
-			p += 4;
-			v3 = xxh32_round(v3, get_unaligned_le32(p));
-			p += 4;
-			v4 = xxh32_round(v4, get_unaligned_le32(p));
-			p += 4;
-		} while (p <= limit);
-
-		state->v1 = v1;
-		state->v2 = v2;
-		state->v3 = v3;
-		state->v4 = v4;
-	}
-
-	if (p < b_end) {
-		memcpy(state->mem32, p, (size_t)(b_end-p));
-		state->memsize = (uint32_t)(b_end-p);
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL(xxh32_update);
-
-uint32_t xxh32_digest(const struct xxh32_state *state)
-{
-	const uint8_t *p = (const uint8_t *)state->mem32;
-	const uint8_t *const b_end = (const uint8_t *)(state->mem32) +
-		state->memsize;
-	uint32_t h32;
-
-	if (state->large_len) {
-		h32 = xxh_rotl32(state->v1, 1) + xxh_rotl32(state->v2, 7) +
-			xxh_rotl32(state->v3, 12) + xxh_rotl32(state->v4, 18);
-	} else {
-		h32 = state->v3 /* == seed */ + PRIME32_5;
-	}
-
-	h32 += state->total_len_32;
-
-	while (p + 4 <= b_end) {
-		h32 += get_unaligned_le32(p) * PRIME32_3;
-		h32 = xxh_rotl32(h32, 17) * PRIME32_4;
-		p += 4;
-	}
-
-	while (p < b_end) {
-		h32 += (*p) * PRIME32_5;
-		h32 = xxh_rotl32(h32, 11) * PRIME32_1;
-		p++;
-	}
-
-	h32 ^= h32 >> 15;
-	h32 *= PRIME32_2;
-	h32 ^= h32 >> 13;
-	h32 *= PRIME32_3;
-	h32 ^= h32 >> 16;
-
-	return h32;
-}
-EXPORT_SYMBOL(xxh32_digest);
-
 int xxh64_update(struct xxh64_state *state, const void *input, const size_t len)
 {
 	const uint8_t *p = (const uint8_t *)input;
-- 
cgit v1.2.3


From 07d24902977e4704fab8472981e73a0ad6dfa1fd Mon Sep 17 00:00:00 2001
From: Alexander Graf <graf@amazon.com>
Date: Tue, 10 Jun 2025 08:53:27 +0000
Subject: kexec: enable CMA based contiguous allocation

When booting a new kernel with kexec_file, the kernel picks a target
location that the kernel should live at, then allocates random pages,
checks whether any of those pages magically happens to coincide with a
target address range and if so, uses them for that range.

For every page allocated this way, it then creates a page list that the
relocation code - code that executes while all CPUs are off and we are
just about to jump into the new kernel - copies to their final memory
location.  We can not put them there before, because chances are pretty
good that at least some page in the target range is already in use by the
currently running Linux environment.  Copying is happening from a single
CPU at RAM rate, which takes around 4-50 ms per 100 MiB.

All of this is inefficient and error prone.

To successfully kexec, we need to quiesce all devices of the outgoing
kernel so they don't scribble over the new kernel's memory.  We have seen
cases where that does not happen properly (*cough* GIC *cough*) and hence
the new kernel was corrupted.  This started a month long journey to root
cause failing kexecs to eventually see memory corruption, because the new
kernel was corrupted severely enough that it could not emit output to tell
us about the fact that it was corrupted.  By allocating memory for the
next kernel from a memory range that is guaranteed scribbling free, we can
boot the next kernel up to a point where it is at least able to detect
corruption and maybe even stop it before it becomes severe.  This
increases the chance for successful kexecs.

Since kexec got introduced, Linux has gained the CMA framework which can
perform physically contiguous memory mappings, while keeping that memory
available for movable memory when it is not needed for contiguous
allocations.  The default CMA allocator is for DMA allocations.

This patch adds logic to the kexec file loader to attempt to place the
target payload at a location allocated from CMA.  If successful, it uses
that memory range directly instead of creating copy instructions during
the hot phase.  To ensure that there is a safety net in case anything goes
wrong with the CMA allocation, it also adds a flag for user space to force
disable CMA allocations.

Using CMA allocations has two advantages:

  1) Faster by 4-50 ms per 100 MiB. There is no more need to copy in the
     hot phase.
  2) More robust. Even if by accident some page is still in use for DMA,
     the new kernel image will be safe from that access because it resides
     in a memory region that is considered allocated in the old kernel and
     has a chance to reinitialize that component.

Link: https://lkml.kernel.org/r/20250610085327.51817-1-graf@amazon.com
Signed-off-by: Alexander Graf <graf@amazon.com>
Acked-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Zhongkun He <hezhongkun.hzk@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/riscv/kernel/kexec_elf.c |   1 +
 include/linux/kexec.h         |  10 +++++
 include/uapi/linux/kexec.h    |   1 +
 kernel/kexec.c                |   2 +-
 kernel/kexec_core.c           | 100 ++++++++++++++++++++++++++++++++++++++----
 kernel/kexec_file.c           |  51 ++++++++++++++++++++-
 kernel/kexec_internal.h       |   2 +-
 7 files changed, 156 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/arch/riscv/kernel/kexec_elf.c b/arch/riscv/kernel/kexec_elf.c
index f4755d49b89e..56444c7bd34e 100644
--- a/arch/riscv/kernel/kexec_elf.c
+++ b/arch/riscv/kernel/kexec_elf.c
@@ -95,6 +95,7 @@ static int elf_find_pbase(struct kimage *image, unsigned long kernel_len,
 	kbuf.buf_align = PMD_SIZE;
 	kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
 	kbuf.memsz = ALIGN(kernel_len, PAGE_SIZE);
+	kbuf.cma = NULL;
 	kbuf.top_down = false;
 	ret = arch_kexec_locate_mem_hole(&kbuf);
 	if (!ret) {
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 03f85ad03025..1b10a5d84b68 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -79,6 +79,12 @@ extern note_buf_t __percpu *crash_notes;
 
 typedef unsigned long kimage_entry_t;
 
+/*
+ * This is a copy of the UAPI struct kexec_segment and must be identical
+ * to it because it gets copied straight from user space into kernel
+ * memory. Do not modify this structure unless you change the way segments
+ * get ingested from user space.
+ */
 struct kexec_segment {
 	/*
 	 * This pointer can point to user memory if kexec_load() system
@@ -172,6 +178,7 @@ int kexec_image_post_load_cleanup_default(struct kimage *image);
  * @buf_align:	Minimum alignment needed.
  * @buf_min:	The buffer can't be placed below this address.
  * @buf_max:	The buffer can't be placed above this address.
+ * @cma:	CMA page if the buffer is backed by CMA.
  * @top_down:	Allocate from top of memory.
  * @random:	Place the buffer at a random position.
  */
@@ -184,6 +191,7 @@ struct kexec_buf {
 	unsigned long buf_align;
 	unsigned long buf_min;
 	unsigned long buf_max;
+	struct page *cma;
 	bool top_down;
 #ifdef CONFIG_CRASH_DUMP
 	bool random;
@@ -340,6 +348,7 @@ struct kimage {
 
 	unsigned long nr_segments;
 	struct kexec_segment segment[KEXEC_SEGMENT_MAX];
+	struct page *segment_cma[KEXEC_SEGMENT_MAX];
 
 	struct list_head control_pages;
 	struct list_head dest_pages;
@@ -361,6 +370,7 @@ struct kimage {
 	 */
 	unsigned int hotplug_support:1;
 #endif
+	unsigned int no_cma:1;
 
 #ifdef ARCH_HAS_KIMAGE_ARCH
 	struct kimage_arch arch;
diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h
index 5ae1741ea8ea..8958ebfcff94 100644
--- a/include/uapi/linux/kexec.h
+++ b/include/uapi/linux/kexec.h
@@ -27,6 +27,7 @@
 #define KEXEC_FILE_ON_CRASH	0x00000002
 #define KEXEC_FILE_NO_INITRAMFS	0x00000004
 #define KEXEC_FILE_DEBUG	0x00000008
+#define KEXEC_FILE_NO_CMA	0x00000010
 
 /* These values match the ELF architecture values.
  * Unless there is a good reason that should continue to be the case.
diff --git a/kernel/kexec.c b/kernel/kexec.c
index a6b3f96bb50c..28008e3d462e 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -152,7 +152,7 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
 		goto out;
 
 	for (i = 0; i < nr_segments; i++) {
-		ret = kimage_load_segment(image, &image->segment[i]);
+		ret = kimage_load_segment(image, i);
 		if (ret)
 			goto out;
 	}
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 3a9a9f240dbc..e390c0df6d55 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -40,6 +40,7 @@
 #include <linux/hugetlb.h>
 #include <linux/objtool.h>
 #include <linux/kmsg_dump.h>
+#include <linux/dma-map-ops.h>
 
 #include <asm/page.h>
 #include <asm/sections.h>
@@ -553,6 +554,24 @@ static void kimage_free_entry(kimage_entry_t entry)
 	kimage_free_pages(page);
 }
 
+static void kimage_free_cma(struct kimage *image)
+{
+	unsigned long i;
+
+	for (i = 0; i < image->nr_segments; i++) {
+		struct page *cma = image->segment_cma[i];
+		u32 nr_pages = image->segment[i].memsz >> PAGE_SHIFT;
+
+		if (!cma)
+			continue;
+
+		arch_kexec_pre_free_pages(page_address(cma), nr_pages);
+		dma_release_from_contiguous(NULL, cma, nr_pages);
+		image->segment_cma[i] = NULL;
+	}
+
+}
+
 void kimage_free(struct kimage *image)
 {
 	kimage_entry_t *ptr, entry;
@@ -591,6 +610,9 @@ void kimage_free(struct kimage *image)
 	/* Free the kexec control pages... */
 	kimage_free_page_list(&image->control_pages);
 
+	/* Free CMA allocations */
+	kimage_free_cma(image);
+
 	/*
 	 * Free up any temporary buffers allocated. This might hit if
 	 * error occurred much later after buffer allocation.
@@ -716,9 +738,69 @@ static struct page *kimage_alloc_page(struct kimage *image,
 	return page;
 }
 
-static int kimage_load_normal_segment(struct kimage *image,
-					 struct kexec_segment *segment)
+static int kimage_load_cma_segment(struct kimage *image, int idx)
+{
+	struct kexec_segment *segment = &image->segment[idx];
+	struct page *cma = image->segment_cma[idx];
+	char *ptr = page_address(cma);
+	unsigned long maddr;
+	size_t ubytes, mbytes;
+	int result = 0;
+	unsigned char __user *buf = NULL;
+	unsigned char *kbuf = NULL;
+
+	if (image->file_mode)
+		kbuf = segment->kbuf;
+	else
+		buf = segment->buf;
+	ubytes = segment->bufsz;
+	mbytes = segment->memsz;
+	maddr = segment->mem;
+
+	/* Then copy from source buffer to the CMA one */
+	while (mbytes) {
+		size_t uchunk, mchunk;
+
+		ptr += maddr & ~PAGE_MASK;
+		mchunk = min_t(size_t, mbytes,
+				PAGE_SIZE - (maddr & ~PAGE_MASK));
+		uchunk = min(ubytes, mchunk);
+
+		if (uchunk) {
+			/* For file based kexec, source pages are in kernel memory */
+			if (image->file_mode)
+				memcpy(ptr, kbuf, uchunk);
+			else
+				result = copy_from_user(ptr, buf, uchunk);
+			ubytes -= uchunk;
+			if (image->file_mode)
+				kbuf += uchunk;
+			else
+				buf += uchunk;
+		}
+
+		if (result) {
+			result = -EFAULT;
+			goto out;
+		}
+
+		ptr    += mchunk;
+		maddr  += mchunk;
+		mbytes -= mchunk;
+
+		cond_resched();
+	}
+
+	/* Clear any remainder */
+	memset(ptr, 0, mbytes);
+
+out:
+	return result;
+}
+
+static int kimage_load_normal_segment(struct kimage *image, int idx)
 {
+	struct kexec_segment *segment = &image->segment[idx];
 	unsigned long maddr;
 	size_t ubytes, mbytes;
 	int result;
@@ -733,6 +815,9 @@ static int kimage_load_normal_segment(struct kimage *image,
 	mbytes = segment->memsz;
 	maddr = segment->mem;
 
+	if (image->segment_cma[idx])
+		return kimage_load_cma_segment(image, idx);
+
 	result = kimage_set_destination(image, maddr);
 	if (result < 0)
 		goto out;
@@ -787,13 +872,13 @@ out:
 }
 
 #ifdef CONFIG_CRASH_DUMP
-static int kimage_load_crash_segment(struct kimage *image,
-					struct kexec_segment *segment)
+static int kimage_load_crash_segment(struct kimage *image, int idx)
 {
 	/* For crash dumps kernels we simply copy the data from
 	 * user space to it's destination.
 	 * We do things a page at a time for the sake of kmap.
 	 */
+	struct kexec_segment *segment = &image->segment[idx];
 	unsigned long maddr;
 	size_t ubytes, mbytes;
 	int result;
@@ -858,18 +943,17 @@ out:
 }
 #endif
 
-int kimage_load_segment(struct kimage *image,
-				struct kexec_segment *segment)
+int kimage_load_segment(struct kimage *image, int idx)
 {
 	int result = -ENOMEM;
 
 	switch (image->type) {
 	case KEXEC_TYPE_DEFAULT:
-		result = kimage_load_normal_segment(image, segment);
+		result = kimage_load_normal_segment(image, idx);
 		break;
 #ifdef CONFIG_CRASH_DUMP
 	case KEXEC_TYPE_CRASH:
-		result = kimage_load_crash_segment(image, segment);
+		result = kimage_load_crash_segment(image, idx);
 		break;
 #endif
 	}
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 69fe76fd9233..41271eee0f99 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -26,6 +26,7 @@
 #include <linux/kernel_read_file.h>
 #include <linux/syscalls.h>
 #include <linux/vmalloc.h>
+#include <linux/dma-map-ops.h>
 #include "kexec_internal.h"
 
 #ifdef CONFIG_KEXEC_SIG
@@ -253,6 +254,8 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
 		ret = 0;
 	}
 
+	image->no_cma = !!(flags & KEXEC_FILE_NO_CMA);
+
 	if (cmdline_len) {
 		image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len);
 		if (IS_ERR(image->cmdline_buf)) {
@@ -434,7 +437,7 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
 			      i, ksegment->buf, ksegment->bufsz, ksegment->mem,
 			      ksegment->memsz);
 
-		ret = kimage_load_segment(image, &image->segment[i]);
+		ret = kimage_load_segment(image, i);
 		if (ret)
 			goto out;
 	}
@@ -663,6 +666,43 @@ static int kexec_walk_resources(struct kexec_buf *kbuf,
 		return walk_system_ram_res(0, ULONG_MAX, kbuf, func);
 }
 
+static int kexec_alloc_contig(struct kexec_buf *kbuf)
+{
+	size_t nr_pages = kbuf->memsz >> PAGE_SHIFT;
+	unsigned long mem;
+	struct page *p;
+
+	/* User space disabled CMA allocations, bail out. */
+	if (kbuf->image->no_cma)
+		return -EPERM;
+
+	/* Skip CMA logic for crash kernel */
+	if (kbuf->image->type == KEXEC_TYPE_CRASH)
+		return -EPERM;
+
+	p = dma_alloc_from_contiguous(NULL, nr_pages, get_order(kbuf->buf_align), true);
+	if (!p)
+		return -ENOMEM;
+
+	pr_debug("allocated %zu DMA pages at 0x%lx", nr_pages, page_to_boot_pfn(p));
+
+	mem = page_to_boot_pfn(p) << PAGE_SHIFT;
+
+	if (kimage_is_destination_range(kbuf->image, mem, mem + kbuf->memsz)) {
+		/* Our region is already in use by a statically defined one. Bail out. */
+		pr_debug("CMA overlaps existing mem: 0x%lx+0x%lx\n", mem, kbuf->memsz);
+		dma_release_from_contiguous(NULL, p, nr_pages);
+		return -EBUSY;
+	}
+
+	kbuf->mem = page_to_boot_pfn(p) << PAGE_SHIFT;
+	kbuf->cma = p;
+
+	arch_kexec_post_alloc_pages(page_address(p), (int)nr_pages, 0);
+
+	return 0;
+}
+
 /**
  * kexec_locate_mem_hole - find free memory for the purgatory or the next kernel
  * @kbuf:	Parameters for the memory search.
@@ -687,6 +727,13 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
 	if (ret <= 0)
 		return ret;
 
+	/*
+	 * Try to find a free physically contiguous block of memory first. With that, we
+	 * can avoid any copying at kexec time.
+	 */
+	if (!kexec_alloc_contig(kbuf))
+		return 0;
+
 	if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
 		ret = kexec_walk_resources(kbuf, locate_mem_hole_callback);
 	else
@@ -732,6 +779,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf)
 	/* Ensure minimum alignment needed for segments. */
 	kbuf->memsz = ALIGN(kbuf->memsz, PAGE_SIZE);
 	kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE);
+	kbuf->cma = NULL;
 
 	/* Walk the RAM ranges and allocate a suitable range for the buffer */
 	ret = arch_kexec_locate_mem_hole(kbuf);
@@ -744,6 +792,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf)
 	ksegment->bufsz = kbuf->bufsz;
 	ksegment->mem = kbuf->mem;
 	ksegment->memsz = kbuf->memsz;
+	kbuf->image->segment_cma[kbuf->image->nr_segments] = kbuf->cma;
 	kbuf->image->nr_segments++;
 	return 0;
 }
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
index 30a733a55a67..228bb88c018b 100644
--- a/kernel/kexec_internal.h
+++ b/kernel/kexec_internal.h
@@ -10,7 +10,7 @@ struct kimage *do_kimage_alloc_init(void);
 int sanity_check_segment_list(struct kimage *image);
 void kimage_free_page_list(struct list_head *list);
 void kimage_free(struct kimage *image);
-int kimage_load_segment(struct kimage *image, struct kexec_segment *segment);
+int kimage_load_segment(struct kimage *image, int idx);
 void kimage_terminate(struct kimage *image);
 int kimage_is_destination_range(struct kimage *image,
 				unsigned long start, unsigned long end);
-- 
cgit v1.2.3


From 004f42dd90b7ef542a51983bdaa5b2ef621ed41d Mon Sep 17 00:00:00 2001
From: WangYuli <wangyuli@uniontech.com>
Date: Tue, 22 Jul 2025 15:34:30 +0800
Subject: xen/xenbus: fix typo "notifer"

There is a spelling mistake of 'notifer' in the comment which
should be 'notifier'.

Link: https://lkml.kernel.org/r/C6633C66376C709A+20250722073431.21983-7-wangyuli@uniontech.com
Signed-off-by: WangYuli <wangyuli@uniontech.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/xen/xenbus.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h
index 3f90bdd387b6..00b84f2e402b 100644
--- a/include/xen/xenbus.h
+++ b/include/xen/xenbus.h
@@ -180,7 +180,7 @@ int xenbus_printf(struct xenbus_transaction t,
  * sprintf-style type string, and pointer. Returns 0 or errno.*/
 int xenbus_gather(struct xenbus_transaction t, const char *dir, ...);
 
-/* notifer routines for when the xenstore comes up */
+/* notifier routines for when the xenstore comes up */
 extern int xenstored_ready;
 int register_xenstore_notifier(struct notifier_block *nb);
 void unregister_xenstore_notifier(struct notifier_block *nb);
-- 
cgit v1.2.3


From d171b10b2d7b067c16d79e1d069a23a34f088d23 Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Tue, 22 Jul 2025 11:22:30 -0700
Subject: mm/page-flags: remove folio_start_writeback_keepwrite()

Commit cd57b77197a4 ("ext4: Convert ext4_bio_write_page() to use a folio")
removed set_page_writeback_keepwrite() which was the last/only caller of
folio_start_writeback_keepwrite().

Link: https://lkml.kernel.org/r/20250722182230.2114587-1-joannelkoong@gmail.com
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 8e4d6eda8a8d..8d3fa3a91ce4 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -837,8 +837,6 @@ void set_page_writeback(struct page *page);
 
 #define folio_start_writeback(folio)			\
 	__folio_start_writeback(folio, false)
-#define folio_start_writeback_keepwrite(folio)	\
-	__folio_start_writeback(folio, true)
 
 static __always_inline bool folio_test_head(const struct folio *folio)
 {
-- 
cgit v1.2.3


From f225b34f1e6c81c50e48f6207ddb6d290be1b932 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Fri, 25 Jul 2025 09:29:41 +0100
Subject: mm/mseal: always define VM_SEALED

Patch series "mseal cleanups", v4.

Perform a number of cleanups to the mseal logic.  Firstly, VM_SEALED is
treated differently from every other VMA flag, it really doesn't make
sense to do this, so we start by making this consistent with everything
else.

Next we place the madvise logic where it belongs - in mm/madvise.c.  It
really makes no sense to abstract this elsewhere.  In doing so, we go to
great lengths to explain very clearly the previously very confusing logic
as to what sealed mappings are impacted here.

In doing so, we retain existing logic regarding treatment of madvise()
discard operations for a sealed, read-only MAP_PRIVATE file-backed
mapping.  This is something we likely need to revisit.

We then abstract out and explain the 'are there any gaps in this range
in the mm?' check being performed as a prerequisite to mseal being
performed.

Finally, we simplify the actual mseal logic which is really quite
straightforward.

No functional change is intended.


This patch (of 4):

There is no reason to treat VM_SEALED in a special way: in every other case
in which a VMA flag is unavailable due to configuration, we simply assign
that flag to VM_NONE, so make VM_SEALED consistent with all other VMA
flags in this respect.

Additionally, use the next available bit for VM_SEALED, 42, rather than
arbitrarily putting it at 63 and update the declaration to match all other
VMA flags.

No functional change intended.

Link: https://lkml.kernel.org/r/cover.1753431105.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/aeb398a77029b6e7377cd944328bc9bbc3c90537.1753431105.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Kees Cook <kees@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h               | 6 ++++--
 tools/testing/vma/vma_internal.h | 6 ++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8e3a4c5b78ff..ceaa780a703a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -414,8 +414,10 @@ extern unsigned int kobjsize(const void *objp);
 #endif
 
 #ifdef CONFIG_64BIT
-/* VM is sealed, in vm_flags */
-#define VM_SEALED	_BITUL(63)
+#define VM_SEALED_BIT	42
+#define VM_SEALED	BIT(VM_SEALED_BIT)
+#else
+#define VM_SEALED	VM_NONE
 #endif
 
 /* Bits set in the VMA until the stack is in its final location */
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 991022e9e0d3..0fe52fd6782b 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -108,8 +108,10 @@ extern unsigned long dac_mmap_min_addr;
 #define CAP_IPC_LOCK         14
 
 #ifdef CONFIG_64BIT
-/* VM is sealed, in vm_flags */
-#define VM_SEALED	_BITUL(63)
+#define VM_SEALED_BIT	42
+#define VM_SEALED	BIT(VM_SEALED_BIT)
+#else
+#define VM_SEALED	VM_NONE
 #endif
 
 #define FIRST_USER_ADDRESS	0UL
-- 
cgit v1.2.3


From 3dfde97800e06882960cc926d2c428f2128b7c70 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 24 Jul 2025 10:52:59 +0530
Subject: mm: add get_and_clear_ptes() and clear_ptes()

Patch series "Optimizations for khugepaged", v4.

If the underlying folio mapped by the ptes is large, we can process those
ptes in a batch using folio_pte_batch().

For arm64 specifically, this results in a 16x reduction in the number of
ptep_get() calls, since on a contig block, ptep_get() on arm64 will
iterate through all 16 entries to collect a/d bits.  Next, ptep_clear()
will cause a TLBI for every contig block in the range via
contpte_try_unfold().  Instead, use clear_ptes() to only do the TLBI at
the first and last contig block of the range.

For split folios, there will be no pte batching; the batch size returned
by folio_pte_batch() will be 1.  For pagetable split folios, the ptes will
still point to the same large folio; for arm64, this results in the
optimization described above, and for other arches, a minor improvement is
expected due to a reduction in the number of function calls and batching
atomic operations.


This patch (of 3):

Let's add variants to be used where "full" does not apply -- which will
be the majority of cases in the future. "full" really only applies if
we are about to tear down a full MM.

Use get_and_clear_ptes() in existing code, clear_ptes() users will
be added next.

Link: https://lkml.kernel.org/r/20250724052301.23844-2-dev.jain@arm.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Dev Jain <dev.jain@arm.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Barry Song <baohua@kernel.org>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mariano Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm64/mm/mmu.c     |  2 +-
 include/linux/pgtable.h | 45 +++++++++++++++++++++++++++++++++++++++++++++
 mm/mremap.c             |  2 +-
 mm/rmap.c               |  2 +-
 4 files changed, 48 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index abd9725796e9..20a89ab97dc5 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1528,7 +1528,7 @@ early_initcall(prevent_bootmem_remove_init);
 pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsigned long addr,
 			     pte_t *ptep, unsigned int nr)
 {
-	pte_t pte = get_and_clear_full_ptes(vma->vm_mm, addr, ptep, nr, /* full = */ 0);
+	pte_t pte = get_and_clear_ptes(vma->vm_mm, addr, ptep, nr);
 
 	if (alternative_has_cap_unlikely(ARM64_WORKAROUND_2645198)) {
 		/*
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index e3b99920be05..4c035637eeb7 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -736,6 +736,29 @@ static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm,
 }
 #endif
 
+/**
+ * get_and_clear_ptes - Clear present PTEs that map consecutive pages of
+ *			the same folio, collecting dirty/accessed bits.
+ * @mm: Address space the pages are mapped into.
+ * @addr: Address the first page is mapped at.
+ * @ptep: Page table pointer for the first entry.
+ * @nr: Number of entries to clear.
+ *
+ * Use this instead of get_and_clear_full_ptes() if it is known that we don't
+ * need to clear the full mm, which is mostly the case.
+ *
+ * Note that PTE bits in the PTE range besides the PFN can differ. For example,
+ * some PTEs might be write-protected.
+ *
+ * Context: The caller holds the page table lock.  The PTEs map consecutive
+ * pages that belong to the same folio.  The PTEs are all in the same PMD.
+ */
+static inline pte_t get_and_clear_ptes(struct mm_struct *mm, unsigned long addr,
+		pte_t *ptep, unsigned int nr)
+{
+	return get_and_clear_full_ptes(mm, addr, ptep, nr, 0);
+}
+
 #ifndef clear_full_ptes
 /**
  * clear_full_ptes - Clear present PTEs that map consecutive pages of the same
@@ -768,6 +791,28 @@ static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr,
 }
 #endif
 
+/**
+ * clear_ptes - Clear present PTEs that map consecutive pages of the same folio.
+ * @mm: Address space the pages are mapped into.
+ * @addr: Address the first page is mapped at.
+ * @ptep: Page table pointer for the first entry.
+ * @nr: Number of entries to clear.
+ *
+ * Use this instead of clear_full_ptes() if it is known that we don't need to
+ * clear the full mm, which is mostly the case.
+ *
+ * Note that PTE bits in the PTE range besides the PFN can differ. For example,
+ * some PTEs might be write-protected.
+ *
+ * Context: The caller holds the page table lock.  The PTEs map consecutive
+ * pages that belong to the same folio.  The PTEs are all in the same PMD.
+ */
+static inline void clear_ptes(struct mm_struct *mm, unsigned long addr,
+		pte_t *ptep, unsigned int nr)
+{
+	clear_full_ptes(mm, addr, ptep, nr, 0);
+}
+
 /*
  * If two threads concurrently fault at the same page, the thread that
  * won the race updates the PTE and its local TLB/Cache. The other thread
diff --git a/mm/mremap.c b/mm/mremap.c
index ac39845e9718..677a4d744df9 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -280,7 +280,7 @@ static int move_ptes(struct pagetable_move_control *pmc,
 							 old_pte, max_nr_ptes);
 			force_flush = true;
 		}
-		pte = get_and_clear_full_ptes(mm, old_addr, old_ptep, nr_ptes, 0);
+		pte = get_and_clear_ptes(mm, old_addr, old_ptep, nr_ptes);
 		pte = move_pte(pte, old_addr, new_addr);
 		pte = move_soft_dirty_pte(pte);
 
diff --git a/mm/rmap.c b/mm/rmap.c
index f93ce27132ab..568198e9efc2 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -2036,7 +2036,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 			flush_cache_range(vma, address, end_addr);
 
 			/* Nuke the page table entry. */
-			pteval = get_and_clear_full_ptes(mm, address, pvmw.pte, nr_pages, 0);
+			pteval = get_and_clear_ptes(mm, address, pvmw.pte, nr_pages);
 			/*
 			 * We clear the PTE but do not flush so potentially
 			 * a remote CPU could still be writing to the folio.
-- 
cgit v1.2.3


From 9a4f90e246615d1f42a9b907deb9b4c0a418d996 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Fri, 25 Jul 2025 15:29:01 +0100
Subject: mm: remove mm/io-mapping.c

This is dead code, which was used from commit b739f125e4eb ("i915: use
io_mapping_map_user") but reverted a month later by commit 0e4fe0c9f2f9
("Revert "i915: use io_mapping_map_user"") back in 2021.

Since then nobody has used it, so remove it.

[akpm@linux-foundation.org: update Documentation/core-api/mm-api.rst, per Vlastimil]
Link: https://lkml.kernel.org/r/20250725142901.81502-1-lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/core-api/mm-api.rst |  1 -
 include/linux/io-mapping.h        |  3 ---
 mm/Kconfig                        |  4 ----
 mm/Makefile                       |  1 -
 mm/io-mapping.c                   | 30 ------------------------------
 5 files changed, 39 deletions(-)
 delete mode 100644 mm/io-mapping.c

(limited to 'include')

diff --git a/Documentation/core-api/mm-api.rst b/Documentation/core-api/mm-api.rst
index af8151db88b2..24970b91ac15 100644
--- a/Documentation/core-api/mm-api.rst
+++ b/Documentation/core-api/mm-api.rst
@@ -139,4 +139,3 @@ More Memory Management Functions
 .. kernel-doc:: mm/mmu_notifier.c
 .. kernel-doc:: mm/balloon_compaction.c
 .. kernel-doc:: mm/huge_memory.c
-.. kernel-doc:: mm/io-mapping.c
diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h
index 7376c1df9c90..c16353cc6e3c 100644
--- a/include/linux/io-mapping.h
+++ b/include/linux/io-mapping.h
@@ -225,7 +225,4 @@ io_mapping_free(struct io_mapping *iomap)
 	kfree(iomap);
 }
 
-int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma,
-		unsigned long addr, unsigned long pfn, unsigned long size);
-
 #endif /* _LINUX_IO_MAPPING_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index d5d4eca947a6..e443fe8cd6cf 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1242,10 +1242,6 @@ config KMAP_LOCAL
 config KMAP_LOCAL_NON_LINEAR_PTE_ARRAY
 	bool
 
-# struct io_mapping based helper.  Selected by drivers that need them
-config IO_MAPPING
-	bool
-
 config MEMFD_CREATE
 	bool "Enable memfd_create() system call" if EXPERT
 
diff --git a/mm/Makefile b/mm/Makefile
index 1a7a11d4933d..ef54aa615d9d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -141,7 +141,6 @@ obj-$(CONFIG_MEMFD_CREATE) += memfd.o
 obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
 obj-$(CONFIG_PTDUMP) += ptdump.o
 obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
-obj-$(CONFIG_IO_MAPPING) += io-mapping.o
 obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
 obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
 obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
diff --git a/mm/io-mapping.c b/mm/io-mapping.c
deleted file mode 100644
index d3586e95c12c..000000000000
--- a/mm/io-mapping.c
+++ /dev/null
@@ -1,30 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-
-#include <linux/mm.h>
-#include <linux/io-mapping.h>
-
-/**
- * io_mapping_map_user - remap an I/O mapping to userspace
- * @iomap: the source io_mapping
- * @vma: user vma to map to
- * @addr: target user address to start at
- * @pfn: physical address of kernel memory
- * @size: size of map area
- *
- *  Note: this is only safe if the mm semaphore is held when called.
- */
-int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma,
-		unsigned long addr, unsigned long pfn, unsigned long size)
-{
-	vm_flags_t expected_flags = VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
-
-	if (WARN_ON_ONCE((vma->vm_flags & expected_flags) != expected_flags))
-		return -EINVAL;
-
-	pgprot_t remap_prot = __pgprot((pgprot_val(iomap->prot) & _PAGE_CACHE_MASK) |
-				       (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK));
-
-	/* We rely on prevalidation of the io-mapping to skip pfnmap tracking. */
-	return remap_pfn_range_notrack(vma, addr, pfn, size, remap_prot);
-}
-EXPORT_SYMBOL_GPL(io_mapping_map_user);
-- 
cgit v1.2.3


From a222439e1e273fa0f4e37ce17aeb109f3e91824f Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Fri, 25 Jul 2025 14:16:24 +0200
Subject: mm/rmap: add anon_vma lifetime debug check

If an anon folio is mapped into userspace, its anon_vma must be alive,
otherwise rmap walks can hit UAF.

There have been syzkaller reports a few months ago[1][2] of UAF in rmap
walks that seem to indicate that there can be pages with elevated
mapcount whose anon_vma has already been freed, but I think we never
figured out what the cause is; and syzkaller only hit these UAFs when
memory pressure randomly caused reclaim to rmap-walk the affected pages,
so it of course didn't manage to create a reproducer.

Add a VM_WARN_ON_FOLIO() when we add/remove mappings of anonymous folios
to hopefully catch such issues more reliably.

[1] https://lore.kernel.org/r/67abaeaf.050a0220.110943.0041.GAE@google.com
[2] https://lore.kernel.org/r/67a76f33.050a0220.3d72c.0028.GAE@google.com

Link: https://lkml.kernel.org/r/20250725-anonvma-uaf-debug-v2-1-bc3c7e5ba5b1@google.com
Signed-off-by: Jann Horn <jannh@google.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Harry Yoo <harry.yoo@oracle.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Rik van Riel <riel@surriel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'include')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 20803fcb49a7..6cd020eea37a 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -449,6 +449,28 @@ static inline void __folio_rmap_sanity_checks(const struct folio *folio,
 	default:
 		VM_WARN_ON_ONCE(true);
 	}
+
+	/*
+	 * Anon folios must have an associated live anon_vma as long as they're
+	 * mapped into userspace.
+	 * Note that the atomic_read() mainly does two things:
+	 *
+	 * 1. In KASAN builds with CONFIG_SLUB_RCU_DEBUG, it causes KASAN to
+	 *    check that the associated anon_vma has not yet been freed (subject
+	 *    to KASAN's usual limitations). This check will pass if the
+	 *    anon_vma's refcount has already dropped to 0 but an RCU grace
+	 *    period hasn't passed since then.
+	 * 2. If the anon_vma has not yet been freed, it checks that the
+	 *    anon_vma still has a nonzero refcount (as opposed to being in the
+	 *    middle of an RCU delay for getting freed).
+	 */
+	if (folio_test_anon(folio) && !folio_test_ksm(folio)) {
+		unsigned long mapping = (unsigned long)folio->mapping;
+		struct anon_vma *anon_vma;
+
+		anon_vma = (void *)(mapping - FOLIO_MAPPING_ANON);
+		VM_WARN_ON_FOLIO(atomic_read(&anon_vma->refcount) == 0, folio);
+	}
 }
 
 /*
-- 
cgit v1.2.3


From 9bbffee67ffd16360179327b57f3b1245579ef08 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Mon, 28 Jul 2025 10:53:55 -0700
Subject: mm: fix a UAF when vma->mm is freed after vma->vm_refcnt got dropped

By inducing delays in the right places, Jann Horn created a reproducer for
a hard to hit UAF issue that became possible after VMAs were allowed to be
recycled by adding SLAB_TYPESAFE_BY_RCU to their cache.

Race description is borrowed from Jann's discovery report:
lock_vma_under_rcu() looks up a VMA locklessly with mas_walk() under
rcu_read_lock().  At that point, the VMA may be concurrently freed, and it
can be recycled by another process.  vma_start_read() then increments the
vma->vm_refcnt (if it is in an acceptable range), and if this succeeds,
vma_start_read() can return a recycled VMA.

In this scenario where the VMA has been recycled, lock_vma_under_rcu()
will then detect the mismatching ->vm_mm pointer and drop the VMA through
vma_end_read(), which calls vma_refcount_put().  vma_refcount_put() drops
the refcount and then calls rcuwait_wake_up() using a copy of vma->vm_mm.
This is wrong: It implicitly assumes that the caller is keeping the VMA's
mm alive, but in this scenario the caller has no relation to the VMA's mm,
so the rcuwait_wake_up() can cause UAF.

The diagram depicting the race:
T1         T2         T3
==         ==         ==
lock_vma_under_rcu
  mas_walk
          <VMA gets removed from mm>
                      mmap
                        <the same VMA is reallocated>
  vma_start_read
    __refcount_inc_not_zero_limited_acquire
                      munmap
                        __vma_enter_locked
                          refcount_add_not_zero
  vma_end_read
    vma_refcount_put
      __refcount_dec_and_test
                          rcuwait_wait_event
                            <finish operation>
      rcuwait_wake_up [UAF]

Note that rcuwait_wait_event() in T3 does not block because refcount was
already dropped by T1.  At this point T3 can exit and free the mm causing
UAF in T1.

To avoid this we move vma->vm_mm verification into vma_start_read() and
grab vma->vm_mm to stabilize it before vma_refcount_put() operation.

[surenb@google.com: v3]
  Link: https://lkml.kernel.org/r/20250729145709.2731370-1-surenb@google.com
Link: https://lkml.kernel.org/r/20250728175355.2282375-1-surenb@google.com
Fixes: 3104138517fc ("mm: make vma cache SLAB_TYPESAFE_BY_RCU")
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reported-by: Jann Horn <jannh@google.com>
Closes: https://lore.kernel.org/all/CAG48ez0-deFbVH=E3jbkWx=X3uVbd8nWeo6kbJPQ0KoUD+m2tA@mail.gmail.com/
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmap_lock.h | 30 ++++++++++++++++++++++++++++++
 mm/mmap_lock.c            | 10 +++-------
 2 files changed, 33 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index 1f4f44951abe..11a078de9150 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -12,6 +12,7 @@ extern int rcuwait_wake_up(struct rcuwait *w);
 #include <linux/tracepoint-defs.h>
 #include <linux/types.h>
 #include <linux/cleanup.h>
+#include <linux/sched/mm.h>
 
 #define MMAP_LOCK_INITIALIZER(name) \
 	.mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock),
@@ -154,6 +155,10 @@ static inline void vma_refcount_put(struct vm_area_struct *vma)
  * reused and attached to a different mm before we lock it.
  * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got
  * detached.
+ *
+ * WARNING! The vma passed to this function cannot be used if the function
+ * fails to lock it because in certain cases RCU lock is dropped and then
+ * reacquired. Once RCU lock is dropped the vma can be concurrently freed.
  */
 static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
 						    struct vm_area_struct *vma)
@@ -183,6 +188,31 @@ static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
 	}
 
 	rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
+
+	/*
+	 * If vma got attached to another mm from under us, that mm is not
+	 * stable and can be freed in the narrow window after vma->vm_refcnt
+	 * is dropped and before rcuwait_wake_up(mm) is called. Grab it before
+	 * releasing vma->vm_refcnt.
+	 */
+	if (unlikely(vma->vm_mm != mm)) {
+		/* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */
+		struct mm_struct *other_mm = vma->vm_mm;
+
+		/*
+		 * __mmdrop() is a heavy operation and we don't need RCU
+		 * protection here. Release RCU lock during these operations.
+		 * We reinstate the RCU read lock as the caller expects it to
+		 * be held when this function returns even on error.
+		 */
+		rcu_read_unlock();
+		mmgrab(other_mm);
+		vma_refcount_put(vma);
+		mmdrop(other_mm);
+		rcu_read_lock();
+		return NULL;
+	}
+
 	/*
 	 * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
 	 * False unlocked result is impossible because we modify and check
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index 729fb7d0dd59..b006cec8e6fe 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -164,8 +164,7 @@ retry:
 	 */
 
 	/* Check if the vma we locked is the right one. */
-	if (unlikely(vma->vm_mm != mm ||
-		     address < vma->vm_start || address >= vma->vm_end))
+	if (unlikely(address < vma->vm_start || address >= vma->vm_end))
 		goto inval_end_read;
 
 	rcu_read_unlock();
@@ -236,11 +235,8 @@ retry:
 		goto fallback;
 	}
 
-	/*
-	 * Verify the vma we locked belongs to the same address space and it's
-	 * not behind of the last search position.
-	 */
-	if (unlikely(vma->vm_mm != mm || from_addr >= vma->vm_end))
+	/* Verify the vma is not behind the last search position. */
+	if (unlikely(from_addr >= vma->vm_end))
 		goto fallback_unlock;
 
 	/*
-- 
cgit v1.2.3


From fcd90ad31e29d0b403f3a074a64cd7f0876175dd Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Date: Sun, 13 Jul 2025 10:17:23 +0300
Subject: execmem: drop unused execmem_update_copy()

Patch series "x86: enable EXECMEM_ROX_CACHE for ftrace and kprobes", v3.

These patches enable use of EXECMEM_ROX_CACHE for ftrace and kprobes
allocations on x86.

They also include some ground work in execmem.

Since the execmem model for caching large ROX pages changed from the
initial assumption that the memory that is allocated from ROX cache is
always ROX to the current state where memory can be temporarily made RW
and then restored to ROX, we can stop using text poking to update it.
This also saves the hassle of trying to lock text_mutex in
execmem_cache_free() when kprobes already hold that mutex.


This patch (of 8):

The execmem_update_copy() that used text poking was required when memory
allocated from ROX cache was always read-only.  Now that its permissions
can be switched to read-write, there is no need for a function that
updates memory with text poking.

Remove it.

Link: https://lkml.kernel.org/r/20250713071730.4117334-1-rppt@kernel.org
Link: https://lkml.kernel.org/r/20250713071730.4117334-2-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Daniel Gomez <da.gomez@samsung.com>
Cc: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Cc: Petr Pavlu <petr.pavlu@suse.com>
Cc: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/execmem.h | 13 -------------
 mm/execmem.c            |  5 -----
 2 files changed, 18 deletions(-)

(limited to 'include')

diff --git a/include/linux/execmem.h b/include/linux/execmem.h
index 3be35680a54f..734fbe83d98e 100644
--- a/include/linux/execmem.h
+++ b/include/linux/execmem.h
@@ -185,19 +185,6 @@ DEFINE_FREE(execmem, void *, if (_T) execmem_free(_T));
 struct vm_struct *execmem_vmap(size_t size);
 #endif
 
-/**
- * execmem_update_copy - copy an update to executable memory
- * @dst:  destination address to update
- * @src:  source address containing the data
- * @size: how many bytes of memory shold be copied
- *
- * Copy @size bytes from @src to @dst using text poking if the memory at
- * @dst is read-only.
- *
- * Return: a pointer to @dst or NULL on error
- */
-void *execmem_update_copy(void *dst, const void *src, size_t size);
-
 /**
  * execmem_is_rox - check if execmem is read-only
  * @type - the execmem type to check
diff --git a/mm/execmem.c b/mm/execmem.c
index 627e6cf64f4f..aac211bc88c5 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -399,11 +399,6 @@ void execmem_free(void *ptr)
 		vfree(ptr);
 }
 
-void *execmem_update_copy(void *dst, const void *src, size_t size)
-{
-	return text_poke_copy(dst, src, size);
-}
-
 bool execmem_is_rox(enum execmem_type type)
 {
 	return !!(execmem_info->ranges[type].flags & EXECMEM_ROX_CACHE);
-- 
cgit v1.2.3


From 838955f64ae7582f009a3538889bb9244f37ab26 Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Date: Sun, 13 Jul 2025 10:17:24 +0300
Subject: execmem: introduce execmem_alloc_rw()

Some callers of execmem_alloc() require the memory to be temporarily
writable even when it is allocated from ROX cache.  These callers use
execmem_make_temp_rw() right after the call to execmem_alloc().

Wrap this sequence in execmem_alloc_rw() API.

Link: https://lkml.kernel.org/r/20250713071730.4117334-3-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Daniel Gomez <da.gomez@samsung.com>
Reviewed-by: Petr Pavlu <petr.pavlu@suse.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Cc: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/x86/kernel/alternative.c |  3 +--
 include/linux/execmem.h       | 38 ++++++++++++++++++++++----------------
 kernel/module/main.c          | 13 ++-----------
 mm/execmem.c                  | 27 ++++++++++++++++++++++++++-
 4 files changed, 51 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index ea1d984166cd..526a5fef93ab 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -120,7 +120,7 @@ struct its_array its_pages;
 
 static void *__its_alloc(struct its_array *pages)
 {
-	void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE);
+	void *page __free(execmem) = execmem_alloc_rw(EXECMEM_MODULE_TEXT, PAGE_SIZE);
 	if (!page)
 		return NULL;
 
@@ -237,7 +237,6 @@ static void *its_alloc(void)
 	if (!page)
 		return NULL;
 
-	execmem_make_temp_rw(page, PAGE_SIZE);
 	if (pages == &its_pages)
 		set_memory_x((unsigned long)page, 1);
 
diff --git a/include/linux/execmem.h b/include/linux/execmem.h
index 734fbe83d98e..8b61b05da7d5 100644
--- a/include/linux/execmem.h
+++ b/include/linux/execmem.h
@@ -67,21 +67,6 @@ enum execmem_range_flags {
  */
 void execmem_fill_trapping_insns(void *ptr, size_t size, bool writable);
 
-/**
- * execmem_make_temp_rw - temporarily remap region with read-write
- *			  permissions
- * @ptr:	address of the region to remap
- * @size:	size of the region to remap
- *
- * Remaps a part of the cached large page in the ROX cache in the range
- * [@ptr, @ptr + @size) as writable and not executable. The caller must
- * have exclusive ownership of this range and ensure nothing will try to
- * execute code in this range.
- *
- * Return: 0 on success or negative error code on failure.
- */
-int execmem_make_temp_rw(void *ptr, size_t size);
-
 /**
  * execmem_restore_rox - restore read-only-execute permissions
  * @ptr:	address of the region to remap
@@ -95,7 +80,6 @@ int execmem_make_temp_rw(void *ptr, size_t size);
  */
 int execmem_restore_rox(void *ptr, size_t size);
 #else
-static inline int execmem_make_temp_rw(void *ptr, size_t size) { return 0; }
 static inline int execmem_restore_rox(void *ptr, size_t size) { return 0; }
 #endif
 
@@ -165,6 +149,28 @@ struct execmem_info *execmem_arch_setup(void);
  */
 void *execmem_alloc(enum execmem_type type, size_t size);
 
+/**
+ * execmem_alloc_rw - allocate writable executable memory
+ * @type: type of the allocation
+ * @size: how many bytes of memory are required
+ *
+ * Allocates memory that will contain executable code, either generated or
+ * loaded from kernel modules.
+ *
+ * Allocates memory that will contain data coupled with executable code,
+ * like data sections in kernel modules.
+ *
+ * Forces writable permissions on the allocated memory and the caller is
+ * responsible to manage the permissions afterwards.
+ *
+ * For architectures that use ROX cache the permissions will be set to R+W.
+ * For architectures that don't use ROX cache the default permissions for @type
+ * will be used as they must be writable.
+ *
+ * Return: a pointer to the allocated memory or %NULL
+ */
+void *execmem_alloc_rw(enum execmem_type type, size_t size);
+
 /**
  * execmem_free - free executable memory
  * @ptr: pointer to the memory that should be freed
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 413ac6ea3702..d009326ef7bb 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -1292,20 +1292,11 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type)
 	else
 		execmem_type = EXECMEM_MODULE_TEXT;
 
-	ptr = execmem_alloc(execmem_type, size);
+	ptr = execmem_alloc_rw(execmem_type, size);
 	if (!ptr)
 		return -ENOMEM;
 
-	if (execmem_is_rox(execmem_type)) {
-		int err = execmem_make_temp_rw(ptr, size);
-
-		if (err) {
-			execmem_free(ptr);
-			return -ENOMEM;
-		}
-
-		mod->mem[type].is_rox = true;
-	}
+	mod->mem[type].is_rox = execmem_is_rox(execmem_type);
 
 	/*
 	 * The pointer to these blocks of memory are stored on the module
diff --git a/mm/execmem.c b/mm/execmem.c
index aac211bc88c5..d0bf0123bce4 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -336,7 +336,7 @@ static bool execmem_cache_free(void *ptr)
 	return true;
 }
 
-int execmem_make_temp_rw(void *ptr, size_t size)
+static int execmem_force_rw(void *ptr, size_t size)
 {
 	unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT;
 	unsigned long addr = (unsigned long)ptr;
@@ -358,6 +358,16 @@ int execmem_restore_rox(void *ptr, size_t size)
 }
 
 #else /* CONFIG_ARCH_HAS_EXECMEM_ROX */
+/*
+ * when ROX cache is not used the permissions defined by architectures for
+ * execmem ranges that are updated before use (e.g. EXECMEM_MODULE_TEXT) must
+ * be writable anyway
+ */
+static inline int execmem_force_rw(void *ptr, size_t size)
+{
+	return 0;
+}
+
 static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
 {
 	return NULL;
@@ -387,6 +397,21 @@ void *execmem_alloc(enum execmem_type type, size_t size)
 	return kasan_reset_tag(p);
 }
 
+void *execmem_alloc_rw(enum execmem_type type, size_t size)
+{
+	void *p __free(execmem) = execmem_alloc(type, size);
+	int err;
+
+	if (!p)
+		return NULL;
+
+	err = execmem_force_rw(p, size);
+	if (err)
+		return NULL;
+
+	return no_free_ptr(p);
+}
+
 void execmem_free(void *ptr)
 {
 	/*
-- 
cgit v1.2.3


From ab674b6871b049aab2e86d1d7375526368ed175a Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Date: Sun, 13 Jul 2025 10:17:28 +0300
Subject: execmem: drop writable parameter from execmem_fill_trapping_insns()

After the update of execmem_cache_free() that made memory writable before
updating it, there is no need to update read-only memory, so the writable
parameter to execmem_fill_trapping_insns() is not needed.  Drop it.

Link: https://lkml.kernel.org/r/20250713071730.4117334-7-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Daniel Gomez <da.gomez@samsung.com>
Cc: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Cc: Petr Pavlu <petr.pavlu@suse.com>
Cc: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/x86/mm/init.c      | 8 ++------
 include/linux/execmem.h | 3 +--
 mm/execmem.c            | 4 ++--
 3 files changed, 5 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 7456df985d96..dbc63f0d538f 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -1063,13 +1063,9 @@ unsigned long arch_max_swapfile_size(void)
 static struct execmem_info execmem_info __ro_after_init;
 
 #ifdef CONFIG_ARCH_HAS_EXECMEM_ROX
-void execmem_fill_trapping_insns(void *ptr, size_t size, bool writeable)
+void execmem_fill_trapping_insns(void *ptr, size_t size)
 {
-	/* fill memory with INT3 instructions */
-	if (writeable)
-		memset(ptr, INT3_INSN_OPCODE, size);
-	else
-		text_poke_set(ptr, INT3_INSN_OPCODE, size);
+	memset(ptr, INT3_INSN_OPCODE, size);
 }
 #endif
 
diff --git a/include/linux/execmem.h b/include/linux/execmem.h
index 8b61b05da7d5..7de229134e30 100644
--- a/include/linux/execmem.h
+++ b/include/linux/execmem.h
@@ -60,12 +60,11 @@ enum execmem_range_flags {
  *				 will trap
  * @ptr:	pointer to memory to fill
  * @size:	size of the range to fill
- * @writable:	is the memory poited by @ptr is writable or ROX
  *
  * A hook for architecures to fill execmem ranges with invalid instructions.
  * Architectures that use EXECMEM_ROX_CACHE must implement this.
  */
-void execmem_fill_trapping_insns(void *ptr, size_t size, bool writable);
+void execmem_fill_trapping_insns(void *ptr, size_t size);
 
 /**
  * execmem_restore_rox - restore read-only-execute permissions
diff --git a/mm/execmem.c b/mm/execmem.c
index 9abf76a63a79..1785d7f435e4 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -304,7 +304,7 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size)
 		goto err_free_mem;
 
 	/* fill memory with instructions that will trap */
-	execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true);
+	execmem_fill_trapping_insns(p, alloc_size);
 
 	err = set_memory_rox((unsigned long)p, vm->nr_pages);
 	if (err)
@@ -363,7 +363,7 @@ static int __execmem_cache_free(struct ma_state *mas, void *ptr, gfp_t gfp_mask)
 	if (err)
 		return err;
 
-	execmem_fill_trapping_insns(ptr, size, /* writable = */ true);
+	execmem_fill_trapping_insns(ptr, size);
 	execmem_restore_rox(ptr, size);
 
 	err = execmem_cache_add_locked(ptr, size, gfp_mask);
-- 
cgit v1.2.3


From bb5b0b4317c9516bdc5e9a4235e3b5f1a73b7e48 Mon Sep 17 00:00:00 2001
From: Joshua Kinard <kumba@gentoo.org>
Date: Mon, 21 Jul 2025 13:00:51 -0400
Subject: rtc: ds1685: Update Joshua Kinard's email address.

I am switching my address to a personal domain, so need to update the
driver's files and the entry in MAINTAINERS.

Signed-off-by: Joshua Kinard <kumba@gentoo.org>
Link: https://lore.kernel.org/r/20250721170051.32407-1-kumba@gentoo.org
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
---
 MAINTAINERS                | 2 +-
 drivers/rtc/rtc-ds1685.c   | 4 ++--
 include/linux/rtc/ds1685.h | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index a92290fffa16..536befd32be8 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6608,7 +6608,7 @@ S:	Supported
 F:	drivers/input/keyboard/dlink-dir685-touchkeys.c
 
 DALLAS/MAXIM DS1685-FAMILY REAL TIME CLOCK
-M:	Joshua Kinard <kumba@gentoo.org>
+M:	Joshua Kinard <linux@kumba.dev>
 S:	Maintained
 F:	drivers/rtc/rtc-ds1685.c
 F:	include/linux/rtc/ds1685.h
diff --git a/drivers/rtc/rtc-ds1685.c b/drivers/rtc/rtc-ds1685.c
index 38e25f63597a..97423f1d0361 100644
--- a/drivers/rtc/rtc-ds1685.c
+++ b/drivers/rtc/rtc-ds1685.c
@@ -3,7 +3,7 @@
  * An rtc driver for the Dallas/Maxim DS1685/DS1687 and related real-time
  * chips.
  *
- * Copyright (C) 2011-2014 Joshua Kinard <kumba@gentoo.org>.
+ * Copyright (C) 2011-2014 Joshua Kinard <linux@kumba.dev>.
  * Copyright (C) 2009 Matthias Fuchs <matthias.fuchs@esd-electronics.com>.
  *
  * References:
@@ -1436,7 +1436,7 @@ EXPORT_SYMBOL_GPL(ds1685_rtc_poweroff);
 /* ----------------------------------------------------------------------- */
 
 
-MODULE_AUTHOR("Joshua Kinard <kumba@gentoo.org>");
+MODULE_AUTHOR("Joshua Kinard <linux@kumba.dev>");
 MODULE_AUTHOR("Matthias Fuchs <matthias.fuchs@esd-electronics.com>");
 MODULE_DESCRIPTION("Dallas/Maxim DS1685/DS1687-series RTC driver");
 MODULE_LICENSE("GPL");
diff --git a/include/linux/rtc/ds1685.h b/include/linux/rtc/ds1685.h
index 5a41c3bbcbe3..01da4582db6d 100644
--- a/include/linux/rtc/ds1685.h
+++ b/include/linux/rtc/ds1685.h
@@ -8,7 +8,7 @@
  * include larger, battery-backed NV-SRAM, burst-mode access, and an RTC
  * write counter.
  *
- * Copyright (C) 2011-2014 Joshua Kinard <kumba@gentoo.org>.
+ * Copyright (C) 2011-2014 Joshua Kinard <linux@kumba.dev>.
  * Copyright (C) 2009 Matthias Fuchs <matthias.fuchs@esd-electronics.com>.
  *
  * References:
-- 
cgit v1.2.3


From 89c52146392948f4cdda3853da9d82ec6d1dd1f4 Mon Sep 17 00:00:00 2001
From: Marcos Alano <marcoshalano@gmail.com>
Date: Tue, 5 Aug 2025 13:44:29 -0700
Subject: Input: add keycode for performance mode key

Alienware calls this key "Performance Boost". Dell calls it "G-Mode".

The goal is to have a specific keycode to detect when this key is
pressed, so userspace can act upon it and do what it has to do, usually
starting the power profile for performance.

Signed-off-by: Marcos Alano <marcoshalano@gmail.com>
Link: https://lore.kernel.org/r/20250509193708.2190586-1-marcoshalano@gmail.com
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 include/uapi/linux/input-event-codes.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h
index 08cb157ab593..ca5851e97fac 100644
--- a/include/uapi/linux/input-event-codes.h
+++ b/include/uapi/linux/input-event-codes.h
@@ -770,6 +770,9 @@
 #define KEY_KBD_LCD_MENU4		0x2bb
 #define KEY_KBD_LCD_MENU5		0x2bc
 
+/* Performance Boost key (Alienware)/G-Mode key (Dell) */
+#define KEY_PERFORMANCE			0x2bd
+
 #define BTN_TRIGGER_HAPPY		0x2c0
 #define BTN_TRIGGER_HAPPY1		0x2c0
 #define BTN_TRIGGER_HAPPY2		0x2c1
-- 
cgit v1.2.3


From 86624ba3b522b6512def25534341da93356c8da4 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Mon, 14 Jul 2025 13:08:25 -0300
Subject: vfio/pci: Do vf_token checks for VFIO_DEVICE_BIND_IOMMUFD

This was missed during the initial implementation. The VFIO PCI encodes
the vf_token inside the device name when opening the device from the group
FD, something like:

  "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3"

This is used to control access to a VF unless there is co-ordination with
the owner of the PF.

Since we no longer have a device name in the cdev path, pass the token
directly through VFIO_DEVICE_BIND_IOMMUFD using an optional field
indicated by VFIO_DEVICE_BIND_FLAG_TOKEN.

Fixes: 5fcc26969a16 ("vfio: Add VFIO_DEVICE_BIND_IOMMUFD")
Tested-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Reviewed-by: Yi Liu <yi.l.liu@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Link: https://lore.kernel.org/r/0-v3-bdd8716e85fe+3978a-vfio_token_jgg@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/device_cdev.c                     | 38 ++++++++++++++++++++++++--
 drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c |  1 +
 drivers/vfio/pci/mlx5/main.c                   |  1 +
 drivers/vfio/pci/nvgrace-gpu/main.c            |  2 ++
 drivers/vfio/pci/pds/vfio_dev.c                |  1 +
 drivers/vfio/pci/qat/main.c                    |  1 +
 drivers/vfio/pci/vfio_pci.c                    |  1 +
 drivers/vfio/pci/vfio_pci_core.c               | 22 +++++++++------
 drivers/vfio/pci/virtio/main.c                 |  3 ++
 include/linux/vfio.h                           |  4 +++
 include/linux/vfio_pci_core.h                  |  2 ++
 include/uapi/linux/vfio.h                      | 12 +++++++-
 12 files changed, 76 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
index 281a8dc3ed49..480cac3a0c27 100644
--- a/drivers/vfio/device_cdev.c
+++ b/drivers/vfio/device_cdev.c
@@ -60,22 +60,50 @@ static void vfio_df_get_kvm_safe(struct vfio_device_file *df)
 	spin_unlock(&df->kvm_ref_lock);
 }
 
+static int vfio_df_check_token(struct vfio_device *device,
+			       const struct vfio_device_bind_iommufd *bind)
+{
+	uuid_t uuid;
+
+	if (!device->ops->match_token_uuid) {
+		if (bind->flags & VFIO_DEVICE_BIND_FLAG_TOKEN)
+			return -EINVAL;
+		return 0;
+	}
+
+	if (!(bind->flags & VFIO_DEVICE_BIND_FLAG_TOKEN))
+		return device->ops->match_token_uuid(device, NULL);
+
+	if (copy_from_user(&uuid, u64_to_user_ptr(bind->token_uuid_ptr),
+			   sizeof(uuid)))
+		return -EFAULT;
+	return device->ops->match_token_uuid(device, &uuid);
+}
+
 long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df,
 				struct vfio_device_bind_iommufd __user *arg)
 {
+	const u32 VALID_FLAGS = VFIO_DEVICE_BIND_FLAG_TOKEN;
 	struct vfio_device *device = df->device;
 	struct vfio_device_bind_iommufd bind;
 	unsigned long minsz;
+	u32 user_size;
 	int ret;
 
 	static_assert(__same_type(arg->out_devid, df->devid));
 
 	minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid);
 
-	if (copy_from_user(&bind, arg, minsz))
-		return -EFAULT;
+	ret = get_user(user_size, &arg->argsz);
+	if (ret)
+		return ret;
+	if (user_size < minsz)
+		return -EINVAL;
+	ret = copy_struct_from_user(&bind, minsz, arg, user_size);
+	if (ret)
+		return ret;
 
-	if (bind.argsz < minsz || bind.flags || bind.iommufd < 0)
+	if (bind.iommufd < 0 || bind.flags & ~VALID_FLAGS)
 		return -EINVAL;
 
 	/* BIND_IOMMUFD only allowed for cdev fds */
@@ -93,6 +121,10 @@ long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df,
 		goto out_unlock;
 	}
 
+	ret = vfio_df_check_token(device, &bind);
+	if (ret)
+		goto out_unlock;
+
 	df->iommufd = iommufd_ctx_from_fd(bind.iommufd);
 	if (IS_ERR(df->iommufd)) {
 		ret = PTR_ERR(df->iommufd);
diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
index 2149f49aeec7..397f5e445136 100644
--- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
@@ -1583,6 +1583,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_ops = {
 	.mmap = vfio_pci_core_mmap,
 	.request = vfio_pci_core_request,
 	.match = vfio_pci_core_match,
+	.match_token_uuid = vfio_pci_core_match_token_uuid,
 	.bind_iommufd = vfio_iommufd_physical_bind,
 	.unbind_iommufd = vfio_iommufd_physical_unbind,
 	.attach_ioas = vfio_iommufd_physical_attach_ioas,
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index 93f894fe60d2..7ec47e736a8e 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -1372,6 +1372,7 @@ static const struct vfio_device_ops mlx5vf_pci_ops = {
 	.mmap = vfio_pci_core_mmap,
 	.request = vfio_pci_core_request,
 	.match = vfio_pci_core_match,
+	.match_token_uuid = vfio_pci_core_match_token_uuid,
 	.bind_iommufd = vfio_iommufd_physical_bind,
 	.unbind_iommufd = vfio_iommufd_physical_unbind,
 	.attach_ioas = vfio_iommufd_physical_attach_ioas,
diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index e5ac39c4cc6b..d95761dcdd58 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -696,6 +696,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_ops = {
 	.mmap		= nvgrace_gpu_mmap,
 	.request	= vfio_pci_core_request,
 	.match		= vfio_pci_core_match,
+	.match_token_uuid = vfio_pci_core_match_token_uuid,
 	.bind_iommufd	= vfio_iommufd_physical_bind,
 	.unbind_iommufd	= vfio_iommufd_physical_unbind,
 	.attach_ioas	= vfio_iommufd_physical_attach_ioas,
@@ -715,6 +716,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = {
 	.mmap		= vfio_pci_core_mmap,
 	.request	= vfio_pci_core_request,
 	.match		= vfio_pci_core_match,
+	.match_token_uuid = vfio_pci_core_match_token_uuid,
 	.bind_iommufd	= vfio_iommufd_physical_bind,
 	.unbind_iommufd	= vfio_iommufd_physical_unbind,
 	.attach_ioas	= vfio_iommufd_physical_attach_ioas,
diff --git a/drivers/vfio/pci/pds/vfio_dev.c b/drivers/vfio/pci/pds/vfio_dev.c
index f6e0253a8a14..f3ccb0008f67 100644
--- a/drivers/vfio/pci/pds/vfio_dev.c
+++ b/drivers/vfio/pci/pds/vfio_dev.c
@@ -201,6 +201,7 @@ static const struct vfio_device_ops pds_vfio_ops = {
 	.mmap = vfio_pci_core_mmap,
 	.request = vfio_pci_core_request,
 	.match = vfio_pci_core_match,
+	.match_token_uuid = vfio_pci_core_match_token_uuid,
 	.bind_iommufd = vfio_iommufd_physical_bind,
 	.unbind_iommufd = vfio_iommufd_physical_unbind,
 	.attach_ioas = vfio_iommufd_physical_attach_ioas,
diff --git a/drivers/vfio/pci/qat/main.c b/drivers/vfio/pci/qat/main.c
index 845ed15b6771..5cce6b0b8d2f 100644
--- a/drivers/vfio/pci/qat/main.c
+++ b/drivers/vfio/pci/qat/main.c
@@ -614,6 +614,7 @@ static const struct vfio_device_ops qat_vf_pci_ops = {
 	.mmap = vfio_pci_core_mmap,
 	.request = vfio_pci_core_request,
 	.match = vfio_pci_core_match,
+	.match_token_uuid = vfio_pci_core_match_token_uuid,
 	.bind_iommufd = vfio_iommufd_physical_bind,
 	.unbind_iommufd = vfio_iommufd_physical_unbind,
 	.attach_ioas = vfio_iommufd_physical_attach_ioas,
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 5ba39f7623bb..ac10f14417f2 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -138,6 +138,7 @@ static const struct vfio_device_ops vfio_pci_ops = {
 	.mmap		= vfio_pci_core_mmap,
 	.request	= vfio_pci_core_request,
 	.match		= vfio_pci_core_match,
+	.match_token_uuid = vfio_pci_core_match_token_uuid,
 	.bind_iommufd	= vfio_iommufd_physical_bind,
 	.unbind_iommufd	= vfio_iommufd_physical_unbind,
 	.attach_ioas	= vfio_iommufd_physical_attach_ioas,
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 261a6dc5a5fc..fad410cf91bc 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1821,9 +1821,13 @@ void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count)
 }
 EXPORT_SYMBOL_GPL(vfio_pci_core_request);
 
-static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev,
-				      bool vf_token, uuid_t *uuid)
+int vfio_pci_core_match_token_uuid(struct vfio_device *core_vdev,
+				   const uuid_t *uuid)
+
 {
+	struct vfio_pci_core_device *vdev =
+		container_of(core_vdev, struct vfio_pci_core_device, vdev);
+
 	/*
 	 * There's always some degree of trust or collaboration between SR-IOV
 	 * PF and VFs, even if just that the PF hosts the SR-IOV capability and
@@ -1854,7 +1858,7 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev,
 		bool match;
 
 		if (!pf_vdev) {
-			if (!vf_token)
+			if (!uuid)
 				return 0; /* PF is not vfio-pci, no VF token */
 
 			pci_info_ratelimited(vdev->pdev,
@@ -1862,7 +1866,7 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev,
 			return -EINVAL;
 		}
 
-		if (!vf_token) {
+		if (!uuid) {
 			pci_info_ratelimited(vdev->pdev,
 				"VF token required to access device\n");
 			return -EACCES;
@@ -1880,7 +1884,7 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev,
 	} else if (vdev->vf_token) {
 		mutex_lock(&vdev->vf_token->lock);
 		if (vdev->vf_token->users) {
-			if (!vf_token) {
+			if (!uuid) {
 				mutex_unlock(&vdev->vf_token->lock);
 				pci_info_ratelimited(vdev->pdev,
 					"VF token required to access device\n");
@@ -1893,12 +1897,12 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev,
 					"Incorrect VF token provided for device\n");
 				return -EACCES;
 			}
-		} else if (vf_token) {
+		} else if (uuid) {
 			uuid_copy(&vdev->vf_token->uuid, uuid);
 		}
 
 		mutex_unlock(&vdev->vf_token->lock);
-	} else if (vf_token) {
+	} else if (uuid) {
 		pci_info_ratelimited(vdev->pdev,
 			"VF token incorrectly provided, not a PF or VF\n");
 		return -EINVAL;
@@ -1906,6 +1910,7 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev,
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(vfio_pci_core_match_token_uuid);
 
 #define VF_TOKEN_ARG "vf_token="
 
@@ -1952,7 +1957,8 @@ int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf)
 		}
 	}
 
-	ret = vfio_pci_validate_vf_token(vdev, vf_token, &uuid);
+	ret = core_vdev->ops->match_token_uuid(core_vdev,
+					       vf_token ? &uuid : NULL);
 	if (ret)
 		return ret;
 
diff --git a/drivers/vfio/pci/virtio/main.c b/drivers/vfio/pci/virtio/main.c
index 515fe1b9f94d..8084f3e36a9f 100644
--- a/drivers/vfio/pci/virtio/main.c
+++ b/drivers/vfio/pci/virtio/main.c
@@ -94,6 +94,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_lm_ops = {
 	.mmap = vfio_pci_core_mmap,
 	.request = vfio_pci_core_request,
 	.match = vfio_pci_core_match,
+	.match_token_uuid = vfio_pci_core_match_token_uuid,
 	.bind_iommufd = vfio_iommufd_physical_bind,
 	.unbind_iommufd = vfio_iommufd_physical_unbind,
 	.attach_ioas = vfio_iommufd_physical_attach_ioas,
@@ -114,6 +115,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_tran_lm_ops = {
 	.mmap = vfio_pci_core_mmap,
 	.request = vfio_pci_core_request,
 	.match = vfio_pci_core_match,
+	.match_token_uuid = vfio_pci_core_match_token_uuid,
 	.bind_iommufd = vfio_iommufd_physical_bind,
 	.unbind_iommufd = vfio_iommufd_physical_unbind,
 	.attach_ioas = vfio_iommufd_physical_attach_ioas,
@@ -134,6 +136,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_ops = {
 	.mmap = vfio_pci_core_mmap,
 	.request = vfio_pci_core_request,
 	.match = vfio_pci_core_match,
+	.match_token_uuid = vfio_pci_core_match_token_uuid,
 	.bind_iommufd = vfio_iommufd_physical_bind,
 	.unbind_iommufd = vfio_iommufd_physical_unbind,
 	.attach_ioas = vfio_iommufd_physical_attach_ioas,
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 707b00772ce1..eb563f538dee 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -105,6 +105,9 @@ struct vfio_device {
  * @match: Optional device name match callback (return: 0 for no-match, >0 for
  *         match, -errno for abort (ex. match with insufficient or incorrect
  *         additional args)
+ * @match_token_uuid: Optional device token match/validation. Return 0
+ *         if the uuid is valid for the device, -errno otherwise. uuid is NULL
+ *         if none was provided.
  * @dma_unmap: Called when userspace unmaps IOVA from the container
  *             this device is attached to.
  * @device_feature: Optional, fill in the VFIO_DEVICE_FEATURE ioctl
@@ -132,6 +135,7 @@ struct vfio_device_ops {
 	int	(*mmap)(struct vfio_device *vdev, struct vm_area_struct *vma);
 	void	(*request)(struct vfio_device *vdev, unsigned int count);
 	int	(*match)(struct vfio_device *vdev, char *buf);
+	int	(*match_token_uuid)(struct vfio_device *vdev, const uuid_t *uuid);
 	void	(*dma_unmap)(struct vfio_device *vdev, u64 iova, u64 length);
 	int	(*device_feature)(struct vfio_device *device, u32 flags,
 				  void __user *arg, size_t argsz);
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index fbb472dd99b3..f541044e42a2 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -122,6 +122,8 @@ ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *bu
 int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma);
 void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count);
 int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf);
+int vfio_pci_core_match_token_uuid(struct vfio_device *core_vdev,
+				   const uuid_t *uuid);
 int vfio_pci_core_enable(struct vfio_pci_core_device *vdev);
 void vfio_pci_core_disable(struct vfio_pci_core_device *vdev);
 void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev);
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 5764f315137f..75100bf009ba 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -905,10 +905,12 @@ struct vfio_device_feature {
  * VFIO_DEVICE_BIND_IOMMUFD - _IOR(VFIO_TYPE, VFIO_BASE + 18,
  *				   struct vfio_device_bind_iommufd)
  * @argsz:	 User filled size of this data.
- * @flags:	 Must be 0.
+ * @flags:	 Must be 0 or a bitmask of VFIO_DEVICE_BIND_* flags
  * @iommufd:	 iommufd to bind.
  * @out_devid:	 The device id generated by this bind. devid is a handle for
  *		 this device/iommufd bond and can be used in IOMMUFD commands.
+ * @token_uuid_ptr: Valid if VFIO_DEVICE_BIND_FLAG_TOKEN. Points to a 16 byte
+ *                  UUID in the same format as VFIO_DEVICE_FEATURE_PCI_VF_TOKEN.
  *
  * Bind a vfio_device to the specified iommufd.
  *
@@ -917,13 +919,21 @@ struct vfio_device_feature {
  *
  * Unbind is automatically conducted when device fd is closed.
  *
+ * A token is sometimes required to open the device. Unless this is known to be
+ * needed, VFIO_DEVICE_BIND_FLAG_TOKEN should not be set and token_uuid_ptr is
+ * ignored. The only case today is a PF/VF relationship where the VF bind must
+ * be provided the same token as VFIO_DEVICE_FEATURE_PCI_VF_TOKEN provided to
+ * the PF.
+ *
  * Return: 0 on success, -errno on failure.
  */
 struct vfio_device_bind_iommufd {
 	__u32		argsz;
 	__u32		flags;
+#define VFIO_DEVICE_BIND_FLAG_TOKEN (1 << 0)
 	__s32		iommufd;
 	__u32		out_devid;
+	__aligned_u64	token_uuid_ptr;
 };
 
 #define VFIO_DEVICE_BIND_IOMMUFD	_IO(VFIO_TYPE, VFIO_BASE + 18)
-- 
cgit v1.2.3


From 397a46c9aa3343e8efe6847bdaa124945bab1de4 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Fri, 25 Jul 2025 09:46:50 +0200
Subject: gpio: remove legacy GPIO line value setter callbacks

With no more users of the legacy GPIO line value setters - .set() and
.set_multiple() - we can now remove them from the kernel.

Link: https://lore.kernel.org/r/20250725074651.14002-1-brgl@bgdev.pl
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 drivers/gpio/gpiolib.c      | 27 ++++++---------------------
 include/linux/gpio/driver.h |  7 -------
 2 files changed, 6 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index a93d2a9355e2..9ac4c23d656a 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -1037,11 +1037,6 @@ int gpiochip_add_data_with_key(struct gpio_chip *gc, void *data,
 	int base = 0;
 	int ret;
 
-	/* Only allow one set() and one set_multiple(). */
-	if ((gc->set && gc->set_rv) ||
-	    (gc->set_multiple && gc->set_multiple_rv))
-		return -EINVAL;
-
 	/*
 	 * First: allocate and populate the internal stat container, and
 	 * set up the struct device.
@@ -2891,19 +2886,14 @@ static int gpiochip_set(struct gpio_chip *gc, unsigned int offset, int value)
 
 	lockdep_assert_held(&gc->gpiodev->srcu);
 
-	if (WARN_ON(unlikely(!gc->set && !gc->set_rv)))
+	if (WARN_ON(unlikely(!gc->set_rv)))
 		return -EOPNOTSUPP;
 
-	if (gc->set_rv) {
-		ret = gc->set_rv(gc, offset, value);
-		if (ret > 0)
-			ret = -EBADE;
-
-		return ret;
-	}
+	ret = gc->set_rv(gc, offset, value);
+	if (ret > 0)
+		ret = -EBADE;
 
-	gc->set(gc, offset, value);
-	return 0;
+	return ret;
 }
 
 static int gpiod_direction_output_raw_commit(struct gpio_desc *desc, int value)
@@ -2919,7 +2909,7 @@ static int gpiod_direction_output_raw_commit(struct gpio_desc *desc, int value)
 	 * output-only, but if there is then not even a .set() operation it
 	 * is pretty tricky to drive the output line.
 	 */
-	if (!guard.gc->set && !guard.gc->set_rv && !guard.gc->direction_output) {
+	if (!guard.gc->set_rv && !guard.gc->direction_output) {
 		gpiod_warn(desc,
 			   "%s: missing set() and direction_output() operations\n",
 			   __func__);
@@ -3673,11 +3663,6 @@ static int gpiochip_set_multiple(struct gpio_chip *gc,
 		return ret;
 	}
 
-	if (gc->set_multiple) {
-		gc->set_multiple(gc, mask, bits);
-		return 0;
-	}
-
 	/* set outputs if the corresponding mask bit is set */
 	for_each_set_bit(i, mask, gc->ngpio) {
 		ret = gpiochip_set(gc, i, test_bit(i, bits));
diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 4b984e8f8fcd..90567dde7d8e 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -347,8 +347,6 @@ struct gpio_irq_chip {
  * @get: returns value for signal "offset", 0=low, 1=high, or negative error
  * @get_multiple: reads values for multiple signals defined by "mask" and
  *	stores them in "bits", returns 0 on success or negative error
- * @set: **DEPRECATED** - please use set_rv() instead
- * @set_multiple: **DEPRECATED** - please use set_multiple_rv() instead
  * @set_rv: assigns output value for signal "offset", returns 0 on success or
  *          negative error value
  * @set_multiple_rv: assigns output values for multiple signals defined by
@@ -445,11 +443,6 @@ struct gpio_chip {
 	int			(*get_multiple)(struct gpio_chip *gc,
 						unsigned long *mask,
 						unsigned long *bits);
-	void			(*set)(struct gpio_chip *gc,
-						unsigned int offset, int value);
-	void			(*set_multiple)(struct gpio_chip *gc,
-						unsigned long *mask,
-						unsigned long *bits);
 	int			(*set_rv)(struct gpio_chip *gc,
 					  unsigned int offset,
 					  int value);
-- 
cgit v1.2.3


From d9d87d90cc0b10cd56ae353f50b11417e7d21712 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Thu, 17 Jul 2025 15:21:26 +0200
Subject: treewide: rename GPIO set callbacks back to their original names

The conversion of all GPIO drivers to using the .set_rv() and
.set_multiple_rv() callbacks from struct gpio_chip (which - unlike their
predecessors - return an integer and allow the controller drivers to
indicate failures to users) is now complete and the legacy ones have
been removed. Rename the new callbacks back to their original names in
one sweeping change.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 arch/arm/common/sa1111.c                           |  4 ++--
 arch/arm/common/scoop.c                            |  2 +-
 arch/arm/mach-s3c/gpio-samsung.c                   |  2 +-
 arch/arm/mach-sa1100/assabet.c                     |  2 +-
 arch/arm/mach-sa1100/neponset.c                    |  2 +-
 arch/arm/plat-orion/gpio.c                         |  2 +-
 arch/m68k/coldfire/gpio.c                          |  2 +-
 arch/mips/alchemy/common/gpiolib.c                 |  6 +++---
 arch/mips/bcm63xx/gpio.c                           |  2 +-
 arch/mips/kernel/gpio_txx9.c                       |  2 +-
 arch/mips/rb532/gpio.c                             |  2 +-
 arch/mips/txx9/generic/setup.c                     |  2 +-
 arch/powerpc/platforms/44x/gpio.c                  |  2 +-
 arch/powerpc/platforms/52xx/mpc52xx_gpt.c          |  2 +-
 arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c     |  2 +-
 arch/powerpc/platforms/8xx/cpm1.c                  |  4 ++--
 arch/powerpc/sysdev/cpm_common.c                   |  2 +-
 drivers/bcma/driver_gpio.c                         |  2 +-
 drivers/gpio/gpio-74x164.c                         |  4 ++--
 drivers/gpio/gpio-adnp.c                           |  2 +-
 drivers/gpio/gpio-adp5520.c                        |  2 +-
 drivers/gpio/gpio-adp5585.c                        |  2 +-
 drivers/gpio/gpio-aggregator.c                     |  4 ++--
 drivers/gpio/gpio-altera-a10sr.c                   |  2 +-
 drivers/gpio/gpio-altera.c                         |  2 +-
 drivers/gpio/gpio-amd-fch.c                        |  2 +-
 drivers/gpio/gpio-amd8111.c                        |  2 +-
 drivers/gpio/gpio-arizona.c                        |  2 +-
 drivers/gpio/gpio-aspeed-sgpio.c                   |  2 +-
 drivers/gpio/gpio-aspeed.c                         |  2 +-
 drivers/gpio/gpio-bcm-kona.c                       |  2 +-
 drivers/gpio/gpio-bd71815.c                        |  2 +-
 drivers/gpio/gpio-bd71828.c                        |  2 +-
 drivers/gpio/gpio-bd9571mwv.c                      |  2 +-
 drivers/gpio/gpio-bt8xx.c                          |  2 +-
 drivers/gpio/gpio-cgbc.c                           |  2 +-
 drivers/gpio/gpio-creg-snps.c                      |  2 +-
 drivers/gpio/gpio-cros-ec.c                        |  2 +-
 drivers/gpio/gpio-crystalcove.c                    |  2 +-
 drivers/gpio/gpio-cs5535.c                         |  2 +-
 drivers/gpio/gpio-da9052.c                         |  2 +-
 drivers/gpio/gpio-da9055.c                         |  2 +-
 drivers/gpio/gpio-davinci.c                        |  2 +-
 drivers/gpio/gpio-dln2.c                           |  2 +-
 drivers/gpio/gpio-eic-sprd.c                       |  2 +-
 drivers/gpio/gpio-em.c                             |  2 +-
 drivers/gpio/gpio-exar.c                           |  2 +-
 drivers/gpio/gpio-f7188x.c                         |  2 +-
 drivers/gpio/gpio-graniterapids.c                  |  2 +-
 drivers/gpio/gpio-gw-pld.c                         |  2 +-
 drivers/gpio/gpio-htc-egpio.c                      |  2 +-
 drivers/gpio/gpio-ich.c                            |  2 +-
 drivers/gpio/gpio-imx-scu.c                        |  2 +-
 drivers/gpio/gpio-it87.c                           |  2 +-
 drivers/gpio/gpio-janz-ttl.c                       |  2 +-
 drivers/gpio/gpio-kempld.c                         |  2 +-
 drivers/gpio/gpio-latch.c                          |  4 ++--
 drivers/gpio/gpio-ljca.c                           |  2 +-
 drivers/gpio/gpio-logicvc.c                        |  2 +-
 drivers/gpio/gpio-loongson-64bit.c                 |  2 +-
 drivers/gpio/gpio-loongson.c                       |  2 +-
 drivers/gpio/gpio-lp3943.c                         |  2 +-
 drivers/gpio/gpio-lp873x.c                         |  2 +-
 drivers/gpio/gpio-lp87565.c                        |  2 +-
 drivers/gpio/gpio-lpc18xx.c                        |  2 +-
 drivers/gpio/gpio-lpc32xx.c                        | 10 ++++-----
 drivers/gpio/gpio-macsmc.c                         |  2 +-
 drivers/gpio/gpio-madera.c                         |  2 +-
 drivers/gpio/gpio-max730x.c                        |  2 +-
 drivers/gpio/gpio-max732x.c                        |  4 ++--
 drivers/gpio/gpio-max77620.c                       |  2 +-
 drivers/gpio/gpio-max77650.c                       |  2 +-
 drivers/gpio/gpio-max77759.c                       |  2 +-
 drivers/gpio/gpio-mb86s7x.c                        |  2 +-
 drivers/gpio/gpio-mc33880.c                        |  2 +-
 drivers/gpio/gpio-ml-ioh.c                         |  2 +-
 drivers/gpio/gpio-mm-lantiq.c                      |  2 +-
 drivers/gpio/gpio-mmio.c                           | 24 +++++++++++-----------
 drivers/gpio/gpio-mockup.c                         |  4 ++--
 drivers/gpio/gpio-moxtet.c                         |  2 +-
 drivers/gpio/gpio-mpc5200.c                        |  4 ++--
 drivers/gpio/gpio-mpfs.c                           |  2 +-
 drivers/gpio/gpio-mpsse.c                          |  4 ++--
 drivers/gpio/gpio-msc313.c                         |  2 +-
 drivers/gpio/gpio-mvebu.c                          |  2 +-
 drivers/gpio/gpio-nomadik.c                        |  2 +-
 drivers/gpio/gpio-npcm-sgpio.c                     |  4 ++--
 drivers/gpio/gpio-octeon.c                         |  2 +-
 drivers/gpio/gpio-omap.c                           |  4 ++--
 drivers/gpio/gpio-palmas.c                         |  2 +-
 drivers/gpio/gpio-pca953x.c                        |  4 ++--
 drivers/gpio/gpio-pca9570.c                        |  2 +-
 drivers/gpio/gpio-pcf857x.c                        |  4 ++--
 drivers/gpio/gpio-pch.c                            |  2 +-
 drivers/gpio/gpio-pl061.c                          |  2 +-
 drivers/gpio/gpio-pxa.c                            |  2 +-
 drivers/gpio/gpio-raspberrypi-exp.c                |  2 +-
 drivers/gpio/gpio-rc5t583.c                        |  2 +-
 drivers/gpio/gpio-rcar.c                           |  4 ++--
 drivers/gpio/gpio-rdc321x.c                        |  2 +-
 drivers/gpio/gpio-reg.c                            |  6 +++---
 drivers/gpio/gpio-regmap.c                         |  4 ++--
 drivers/gpio/gpio-rockchip.c                       |  2 +-
 drivers/gpio/gpio-rtd.c                            |  2 +-
 drivers/gpio/gpio-sa1100.c                         |  2 +-
 drivers/gpio/gpio-sama5d2-piobu.c                  |  2 +-
 drivers/gpio/gpio-sch.c                            |  2 +-
 drivers/gpio/gpio-sch311x.c                        |  2 +-
 drivers/gpio/gpio-sim.c                            |  4 ++--
 drivers/gpio/gpio-siox.c                           |  2 +-
 drivers/gpio/gpio-spear-spics.c                    |  2 +-
 drivers/gpio/gpio-sprd.c                           |  2 +-
 drivers/gpio/gpio-stmpe.c                          |  2 +-
 drivers/gpio/gpio-stp-xway.c                       |  2 +-
 drivers/gpio/gpio-syscon.c                         |  4 ++--
 drivers/gpio/gpio-tangier.c                        |  2 +-
 drivers/gpio/gpio-tc3589x.c                        |  2 +-
 drivers/gpio/gpio-tegra.c                          |  2 +-
 drivers/gpio/gpio-tegra186.c                       |  2 +-
 drivers/gpio/gpio-thunderx.c                       |  4 ++--
 drivers/gpio/gpio-timberdale.c                     |  2 +-
 drivers/gpio/gpio-tpic2810.c                       |  4 ++--
 drivers/gpio/gpio-tps65086.c                       |  2 +-
 drivers/gpio/gpio-tps65218.c                       |  2 +-
 drivers/gpio/gpio-tps65219.c                       |  4 ++--
 drivers/gpio/gpio-tps6586x.c                       |  2 +-
 drivers/gpio/gpio-tps65910.c                       |  2 +-
 drivers/gpio/gpio-tps65912.c                       |  2 +-
 drivers/gpio/gpio-tps68470.c                       |  2 +-
 drivers/gpio/gpio-tqmx86.c                         |  2 +-
 drivers/gpio/gpio-ts4900.c                         |  2 +-
 drivers/gpio/gpio-ts5500.c                         |  2 +-
 drivers/gpio/gpio-twl4030.c                        |  2 +-
 drivers/gpio/gpio-twl6040.c                        |  2 +-
 drivers/gpio/gpio-uniphier.c                       |  4 ++--
 drivers/gpio/gpio-viperboard.c                     |  4 ++--
 drivers/gpio/gpio-virtio.c                         |  2 +-
 drivers/gpio/gpio-vx855.c                          |  2 +-
 drivers/gpio/gpio-wcd934x.c                        |  2 +-
 drivers/gpio/gpio-wcove.c                          |  2 +-
 drivers/gpio/gpio-winbond.c                        |  2 +-
 drivers/gpio/gpio-wm831x.c                         |  2 +-
 drivers/gpio/gpio-wm8350.c                         |  2 +-
 drivers/gpio/gpio-wm8994.c                         |  2 +-
 drivers/gpio/gpio-xgene.c                          |  2 +-
 drivers/gpio/gpio-xilinx.c                         |  4 ++--
 drivers/gpio/gpio-xlp.c                            |  2 +-
 drivers/gpio/gpio-xra1403.c                        |  2 +-
 drivers/gpio/gpio-xtensa.c                         |  2 +-
 drivers/gpio/gpio-zevio.c                          |  2 +-
 drivers/gpio/gpio-zynq.c                           |  2 +-
 drivers/gpio/gpio-zynqmp-modepin.c                 |  2 +-
 drivers/gpio/gpiolib.c                             | 10 ++++-----
 drivers/gpu/drm/bridge/ti-sn65dsi86.c              |  2 +-
 drivers/hid/hid-cp2112.c                           |  2 +-
 drivers/hid/hid-mcp2200.c                          |  4 ++--
 drivers/hid/hid-mcp2221.c                          |  2 +-
 drivers/hwmon/ltc2992.c                            |  4 ++--
 drivers/hwmon/pmbus/ucd9000.c                      |  2 +-
 drivers/i2c/muxes/i2c-mux-ltc4306.c                |  2 +-
 drivers/iio/adc/ad4130.c                           |  2 +-
 drivers/iio/adc/ad4170-4.c                         |  2 +-
 drivers/iio/adc/ad7768-1.c                         |  2 +-
 drivers/iio/adc/rohm-bd79124.c                     |  4 ++--
 drivers/iio/adc/ti-ads7950.c                       |  2 +-
 drivers/iio/addac/ad74115.c                        |  2 +-
 drivers/iio/addac/ad74413r.c                       |  4 ++--
 drivers/iio/dac/ad5592r-base.c                     |  2 +-
 drivers/input/keyboard/adp5588-keys.c              |  2 +-
 drivers/input/touchscreen/ad7879.c                 |  2 +-
 drivers/leds/blink/leds-lgm-sso.c                  |  2 +-
 drivers/leds/leds-pca9532.c                        |  2 +-
 drivers/leds/leds-pca955x.c                        |  2 +-
 drivers/leds/leds-tca6507.c                        |  2 +-
 drivers/media/dvb-frontends/cxd2820r_core.c        |  2 +-
 drivers/media/i2c/ds90ub913.c                      |  2 +-
 drivers/media/i2c/ds90ub953.c                      |  2 +-
 drivers/media/i2c/max9286.c                        |  2 +-
 drivers/media/i2c/max96717.c                       |  2 +-
 drivers/media/pci/solo6x10/solo6x10-gpio.c         |  2 +-
 drivers/mfd/sm501.c                                |  2 +-
 drivers/mfd/tps65010.c                             |  2 +-
 drivers/mfd/ucb1x00-core.c                         |  2 +-
 drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gpio.c    |  2 +-
 drivers/misc/ti_fpc202.c                           |  2 +-
 drivers/net/can/spi/mcp251x.c                      |  4 ++--
 drivers/net/dsa/mt7530.c                           |  2 +-
 drivers/net/dsa/vitesse-vsc73xx-core.c             |  2 +-
 drivers/net/phy/qcom/qca807x.c                     |  2 +-
 drivers/pinctrl/actions/pinctrl-owl.c              |  2 +-
 drivers/pinctrl/bcm/pinctrl-bcm2835.c              |  4 ++--
 drivers/pinctrl/bcm/pinctrl-iproc-gpio.c           |  2 +-
 drivers/pinctrl/bcm/pinctrl-nsp-gpio.c             |  2 +-
 drivers/pinctrl/cirrus/pinctrl-cs42l43.c           |  2 +-
 drivers/pinctrl/cirrus/pinctrl-lochnagar.c         |  2 +-
 drivers/pinctrl/intel/pinctrl-baytrail.c           |  2 +-
 drivers/pinctrl/intel/pinctrl-cherryview.c         |  2 +-
 drivers/pinctrl/intel/pinctrl-intel.c              |  2 +-
 drivers/pinctrl/intel/pinctrl-lynxpoint.c          |  2 +-
 drivers/pinctrl/mediatek/pinctrl-airoha.c          |  2 +-
 drivers/pinctrl/mediatek/pinctrl-moore.c           |  2 +-
 drivers/pinctrl/mediatek/pinctrl-mtk-common.c      |  2 +-
 drivers/pinctrl/mediatek/pinctrl-paris.c           |  2 +-
 drivers/pinctrl/meson/pinctrl-amlogic-a4.c         |  2 +-
 drivers/pinctrl/meson/pinctrl-meson.c              |  2 +-
 drivers/pinctrl/mvebu/pinctrl-armada-37xx.c        |  2 +-
 drivers/pinctrl/nomadik/pinctrl-abx500.c           |  2 +-
 drivers/pinctrl/nuvoton/pinctrl-ma35.c             |  2 +-
 drivers/pinctrl/pinctrl-amd.c                      |  2 +-
 drivers/pinctrl/pinctrl-amdisp.c                   |  2 +-
 drivers/pinctrl/pinctrl-apple-gpio.c               |  2 +-
 drivers/pinctrl/pinctrl-as3722.c                   |  2 +-
 drivers/pinctrl/pinctrl-at91-pio4.c                |  4 ++--
 drivers/pinctrl/pinctrl-at91.c                     |  4 ++--
 drivers/pinctrl/pinctrl-aw9523.c                   |  4 ++--
 drivers/pinctrl/pinctrl-axp209.c                   |  4 ++--
 drivers/pinctrl/pinctrl-cy8c95x0.c                 |  4 ++--
 drivers/pinctrl/pinctrl-da9062.c                   |  2 +-
 drivers/pinctrl/pinctrl-digicolor.c                |  2 +-
 drivers/pinctrl/pinctrl-ingenic.c                  |  2 +-
 drivers/pinctrl/pinctrl-keembay.c                  |  2 +-
 drivers/pinctrl/pinctrl-mcp23s08.c                 |  4 ++--
 drivers/pinctrl/pinctrl-microchip-sgpio.c          |  2 +-
 drivers/pinctrl/pinctrl-ocelot.c                   |  2 +-
 drivers/pinctrl/pinctrl-pic32.c                    |  2 +-
 drivers/pinctrl/pinctrl-pistachio.c                |  2 +-
 drivers/pinctrl/pinctrl-rk805.c                    |  2 +-
 drivers/pinctrl/pinctrl-rp1.c                      |  2 +-
 drivers/pinctrl/pinctrl-st.c                       |  2 +-
 drivers/pinctrl/pinctrl-stmfx.c                    |  2 +-
 drivers/pinctrl/pinctrl-sx150x.c                   |  4 ++--
 drivers/pinctrl/pinctrl-xway.c                     |  2 +-
 drivers/pinctrl/qcom/pinctrl-lpass-lpi.c           |  2 +-
 drivers/pinctrl/qcom/pinctrl-msm.c                 |  2 +-
 drivers/pinctrl/qcom/pinctrl-spmi-gpio.c           |  2 +-
 drivers/pinctrl/qcom/pinctrl-spmi-mpp.c            |  2 +-
 drivers/pinctrl/qcom/pinctrl-ssbi-gpio.c           |  2 +-
 drivers/pinctrl/qcom/pinctrl-ssbi-mpp.c            |  2 +-
 drivers/pinctrl/renesas/gpio.c                     |  2 +-
 drivers/pinctrl/renesas/pinctrl-rza1.c             |  2 +-
 drivers/pinctrl/renesas/pinctrl-rza2.c             |  2 +-
 drivers/pinctrl/renesas/pinctrl-rzg2l.c            |  2 +-
 drivers/pinctrl/renesas/pinctrl-rzv2m.c            |  2 +-
 drivers/pinctrl/samsung/pinctrl-samsung.c          |  2 +-
 drivers/pinctrl/spear/pinctrl-plgpio.c             |  2 +-
 drivers/pinctrl/starfive/pinctrl-starfive-jh7100.c |  2 +-
 drivers/pinctrl/starfive/pinctrl-starfive-jh7110.c |  2 +-
 drivers/pinctrl/stm32/pinctrl-stm32.c              |  2 +-
 drivers/pinctrl/sunplus/sppctl.c                   |  2 +-
 drivers/pinctrl/sunxi/pinctrl-sunxi.c              |  2 +-
 drivers/pinctrl/vt8500/pinctrl-wmt.c               |  2 +-
 drivers/platform/cznic/turris-omnia-mcu-gpio.c     |  4 ++--
 drivers/platform/x86/barco-p50-gpio.c              |  2 +-
 drivers/platform/x86/intel/int0002_vgpio.c         |  2 +-
 drivers/platform/x86/portwell-ec.c                 |  4 ++--
 drivers/platform/x86/silicom-platform.c            |  2 +-
 drivers/pwm/pwm-pca9685.c                          |  2 +-
 drivers/regulator/rpi-panel-attiny-regulator.c     |  2 +-
 drivers/soc/fsl/qe/gpio.c                          |  4 ++--
 drivers/soc/renesas/pwc-rzv2m.c                    |  2 +-
 drivers/spi/spi-xcomm.c                            |  2 +-
 drivers/ssb/driver_gpio.c                          |  4 ++--
 drivers/staging/greybus/gpio.c                     |  2 +-
 drivers/tty/serial/max310x.c                       |  2 +-
 drivers/tty/serial/sc16is7xx.c                     |  2 +-
 drivers/usb/serial/cp210x.c                        |  2 +-
 drivers/usb/serial/ftdi_sio.c                      |  4 ++--
 drivers/video/fbdev/via/via-gpio.c                 |  2 +-
 include/linux/gpio/driver.h                        | 19 ++++++++---------
 include/linux/gpio/generic.h                       |  4 ++--
 sound/hda/codecs/side-codecs/cirrus_scodec_test.c  |  2 +-
 sound/soc/codecs/idt821034.c                       |  2 +-
 sound/soc/codecs/peb2466.c                         |  2 +-
 sound/soc/codecs/rt5677.c                          |  2 +-
 sound/soc/codecs/tlv320adc3xxx.c                   |  2 +-
 sound/soc/codecs/wm5100.c                          |  2 +-
 sound/soc/codecs/wm8903.c                          |  2 +-
 sound/soc/codecs/wm8962.c                          |  2 +-
 sound/soc/codecs/wm8996.c                          |  2 +-
 sound/soc/codecs/zl38060.c                         |  2 +-
 sound/soc/soc-ac97.c                               |  2 +-
 sound/soc/ti/davinci-mcasp.c                       |  2 +-
 282 files changed, 355 insertions(+), 356 deletions(-)

(limited to 'include')

diff --git a/arch/arm/common/sa1111.c b/arch/arm/common/sa1111.c
index d7e2ea27ce59..3389a70e4d49 100644
--- a/arch/arm/common/sa1111.c
+++ b/arch/arm/common/sa1111.c
@@ -617,8 +617,8 @@ static int sa1111_setup_gpios(struct sa1111 *sachip)
 	sachip->gc.direction_input = sa1111_gpio_direction_input;
 	sachip->gc.direction_output = sa1111_gpio_direction_output;
 	sachip->gc.get = sa1111_gpio_get;
-	sachip->gc.set_rv = sa1111_gpio_set;
-	sachip->gc.set_multiple_rv = sa1111_gpio_set_multiple;
+	sachip->gc.set = sa1111_gpio_set;
+	sachip->gc.set_multiple = sa1111_gpio_set_multiple;
 	sachip->gc.to_irq = sa1111_gpio_to_irq;
 	sachip->gc.base = -1;
 	sachip->gc.ngpio = 18;
diff --git a/arch/arm/common/scoop.c b/arch/arm/common/scoop.c
index 2d3ee76c8e17..dddb73c96826 100644
--- a/arch/arm/common/scoop.c
+++ b/arch/arm/common/scoop.c
@@ -218,7 +218,7 @@ static int scoop_probe(struct platform_device *pdev)
 		devptr->gpio.label = dev_name(&pdev->dev);
 		devptr->gpio.base = inf->gpio_base;
 		devptr->gpio.ngpio = 12; /* PA11 = 0, PA12 = 1, etc. up to PA22 = 11 */
-		devptr->gpio.set_rv = scoop_gpio_set;
+		devptr->gpio.set = scoop_gpio_set;
 		devptr->gpio.get = scoop_gpio_get;
 		devptr->gpio.direction_input = scoop_gpio_direction_input;
 		devptr->gpio.direction_output = scoop_gpio_direction_output;
diff --git a/arch/arm/mach-s3c/gpio-samsung.c b/arch/arm/mach-s3c/gpio-samsung.c
index 206a492fbaf5..81e198e5a6d3 100644
--- a/arch/arm/mach-s3c/gpio-samsung.c
+++ b/arch/arm/mach-s3c/gpio-samsung.c
@@ -517,7 +517,7 @@ static void __init samsung_gpiolib_add(struct samsung_gpio_chip *chip)
 	if (!gc->direction_output)
 		gc->direction_output = samsung_gpiolib_2bit_output;
 	if (!gc->set)
-		gc->set_rv = samsung_gpiolib_set;
+		gc->set = samsung_gpiolib_set;
 	if (!gc->get)
 		gc->get = samsung_gpiolib_get;
 
diff --git a/arch/arm/mach-sa1100/assabet.c b/arch/arm/mach-sa1100/assabet.c
index bad8aa661e9d..2b833aa0212b 100644
--- a/arch/arm/mach-sa1100/assabet.c
+++ b/arch/arm/mach-sa1100/assabet.c
@@ -80,7 +80,7 @@ void ASSABET_BCR_frob(unsigned int mask, unsigned int val)
 {
 	unsigned long m = mask, v = val;
 
-	assabet_bcr_gc->set_multiple_rv(assabet_bcr_gc, &m, &v);
+	assabet_bcr_gc->set_multiple(assabet_bcr_gc, &m, &v);
 }
 EXPORT_SYMBOL(ASSABET_BCR_frob);
 
diff --git a/arch/arm/mach-sa1100/neponset.c b/arch/arm/mach-sa1100/neponset.c
index 6516598c8a71..88fe79f0a4ed 100644
--- a/arch/arm/mach-sa1100/neponset.c
+++ b/arch/arm/mach-sa1100/neponset.c
@@ -126,7 +126,7 @@ void neponset_ncr_frob(unsigned int mask, unsigned int val)
 	unsigned long m = mask, v = val;
 
 	if (nep)
-		n->gpio[0]->set_multiple_rv(n->gpio[0], &m, &v);
+		n->gpio[0]->set_multiple(n->gpio[0], &m, &v);
 	else
 		WARN(1, "nep unset\n");
 }
diff --git a/arch/arm/plat-orion/gpio.c b/arch/arm/plat-orion/gpio.c
index 6f09f65e3d95..49e29b7894a3 100644
--- a/arch/arm/plat-orion/gpio.c
+++ b/arch/arm/plat-orion/gpio.c
@@ -540,7 +540,7 @@ void __init orion_gpio_init(int gpio_base, int ngpio,
 	ochip->chip.direction_input = orion_gpio_direction_input;
 	ochip->chip.get = orion_gpio_get;
 	ochip->chip.direction_output = orion_gpio_direction_output;
-	ochip->chip.set_rv = orion_gpio_set;
+	ochip->chip.set = orion_gpio_set;
 	ochip->chip.to_irq = orion_gpio_to_irq;
 	ochip->chip.base = gpio_base;
 	ochip->chip.ngpio = ngpio;
diff --git a/arch/m68k/coldfire/gpio.c b/arch/m68k/coldfire/gpio.c
index 30e5a4ed799d..e2f7af1facb2 100644
--- a/arch/m68k/coldfire/gpio.c
+++ b/arch/m68k/coldfire/gpio.c
@@ -160,7 +160,7 @@ static struct gpio_chip mcfgpio_chip = {
 	.direction_input	= mcfgpio_direction_input,
 	.direction_output	= mcfgpio_direction_output,
 	.get			= mcfgpio_get_value,
-	.set_rv			= mcfgpio_set_value,
+	.set			= mcfgpio_set_value,
 	.to_irq			= mcfgpio_to_irq,
 	.base			= 0,
 	.ngpio			= MCFGPIO_PIN_MAX,
diff --git a/arch/mips/alchemy/common/gpiolib.c b/arch/mips/alchemy/common/gpiolib.c
index 194034eba75f..e79e26ffac99 100644
--- a/arch/mips/alchemy/common/gpiolib.c
+++ b/arch/mips/alchemy/common/gpiolib.c
@@ -101,7 +101,7 @@ struct gpio_chip alchemy_gpio_chip[] = {
 		.direction_input	= gpio1_direction_input,
 		.direction_output	= gpio1_direction_output,
 		.get			= gpio1_get,
-		.set_rv			= gpio1_set,
+		.set			= gpio1_set,
 		.to_irq			= gpio1_to_irq,
 		.base			= ALCHEMY_GPIO1_BASE,
 		.ngpio			= ALCHEMY_GPIO1_NUM,
@@ -111,7 +111,7 @@ struct gpio_chip alchemy_gpio_chip[] = {
 		.direction_input	= gpio2_direction_input,
 		.direction_output	= gpio2_direction_output,
 		.get			= gpio2_get,
-		.set_rv			= gpio2_set,
+		.set			= gpio2_set,
 		.to_irq			= gpio2_to_irq,
 		.base			= ALCHEMY_GPIO2_BASE,
 		.ngpio			= ALCHEMY_GPIO2_NUM,
@@ -151,7 +151,7 @@ static struct gpio_chip au1300_gpiochip = {
 	.direction_input	= alchemy_gpic_dir_input,
 	.direction_output	= alchemy_gpic_dir_output,
 	.get			= alchemy_gpic_get,
-	.set_rv			= alchemy_gpic_set,
+	.set			= alchemy_gpic_set,
 	.to_irq			= alchemy_gpic_gpio_to_irq,
 	.base			= AU1300_GPIO_BASE,
 	.ngpio			= AU1300_GPIO_NUM,
diff --git a/arch/mips/bcm63xx/gpio.c b/arch/mips/bcm63xx/gpio.c
index e7a53cd0dec5..ff45a6989c3a 100644
--- a/arch/mips/bcm63xx/gpio.c
+++ b/arch/mips/bcm63xx/gpio.c
@@ -131,7 +131,7 @@ static struct gpio_chip bcm63xx_gpio_chip = {
 	.direction_input	= bcm63xx_gpio_direction_input,
 	.direction_output	= bcm63xx_gpio_direction_output,
 	.get			= bcm63xx_gpio_get,
-	.set_rv			= bcm63xx_gpio_set,
+	.set			= bcm63xx_gpio_set,
 	.base			= 0,
 };
 
diff --git a/arch/mips/kernel/gpio_txx9.c b/arch/mips/kernel/gpio_txx9.c
index 027fb57d0d79..96ac40d20c23 100644
--- a/arch/mips/kernel/gpio_txx9.c
+++ b/arch/mips/kernel/gpio_txx9.c
@@ -70,7 +70,7 @@ static int txx9_gpio_dir_out(struct gpio_chip *chip, unsigned int offset,
 
 static struct gpio_chip txx9_gpio_chip = {
 	.get = txx9_gpio_get,
-	.set_rv = txx9_gpio_set,
+	.set = txx9_gpio_set,
 	.direction_input = txx9_gpio_dir_in,
 	.direction_output = txx9_gpio_dir_out,
 	.label = "TXx9",
diff --git a/arch/mips/rb532/gpio.c b/arch/mips/rb532/gpio.c
index 0e47cd59b6cb..9aa5ef374465 100644
--- a/arch/mips/rb532/gpio.c
+++ b/arch/mips/rb532/gpio.c
@@ -164,7 +164,7 @@ static struct rb532_gpio_chip rb532_gpio_chip[] = {
 			.direction_input	= rb532_gpio_direction_input,
 			.direction_output	= rb532_gpio_direction_output,
 			.get			= rb532_gpio_get,
-			.set_rv			= rb532_gpio_set,
+			.set			= rb532_gpio_set,
 			.to_irq			= rb532_gpio_to_irq,
 			.base			= 0,
 			.ngpio			= 32,
diff --git a/arch/mips/txx9/generic/setup.c b/arch/mips/txx9/generic/setup.c
index 5a37e8b234a3..5dc867ea2c69 100644
--- a/arch/mips/txx9/generic/setup.c
+++ b/arch/mips/txx9/generic/setup.c
@@ -655,7 +655,7 @@ void __init txx9_iocled_init(unsigned long baseaddr,
 	if (!iocled->mmioaddr)
 		goto out_free;
 	iocled->chip.get = txx9_iocled_get;
-	iocled->chip.set_rv = txx9_iocled_set;
+	iocled->chip.set = txx9_iocled_set;
 	iocled->chip.direction_input = txx9_iocled_dir_in;
 	iocled->chip.direction_output = txx9_iocled_dir_out;
 	iocled->chip.label = "iocled";
diff --git a/arch/powerpc/platforms/44x/gpio.c b/arch/powerpc/platforms/44x/gpio.c
index d540e261d85a..08ab76582568 100644
--- a/arch/powerpc/platforms/44x/gpio.c
+++ b/arch/powerpc/platforms/44x/gpio.c
@@ -180,7 +180,7 @@ static int __init ppc4xx_add_gpiochips(void)
 		gc->direction_input = ppc4xx_gpio_dir_in;
 		gc->direction_output = ppc4xx_gpio_dir_out;
 		gc->get = ppc4xx_gpio_get;
-		gc->set_rv = ppc4xx_gpio_set;
+		gc->set = ppc4xx_gpio_set;
 
 		ret = of_mm_gpiochip_add_data(np, mm_gc, ppc4xx_gc);
 		if (ret)
diff --git a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
index bda707d848a6..7748b6641a3c 100644
--- a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
+++ b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
@@ -336,7 +336,7 @@ static void mpc52xx_gpt_gpio_setup(struct mpc52xx_gpt_priv *gpt)
 	gpt->gc.direction_input  = mpc52xx_gpt_gpio_dir_in;
 	gpt->gc.direction_output = mpc52xx_gpt_gpio_dir_out;
 	gpt->gc.get = mpc52xx_gpt_gpio_get;
-	gpt->gc.set_rv = mpc52xx_gpt_gpio_set;
+	gpt->gc.set = mpc52xx_gpt_gpio_set;
 	gpt->gc.base = -1;
 	gpt->gc.parent = gpt->dev;
 
diff --git a/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c b/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c
index 6e37dfc6c5c9..cb7b9498f291 100644
--- a/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c
+++ b/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c
@@ -126,7 +126,7 @@ static int mcu_gpiochip_add(struct mcu *mcu)
 	gc->can_sleep = 1;
 	gc->ngpio = MCU_NUM_GPIO;
 	gc->base = -1;
-	gc->set_rv = mcu_gpio_set;
+	gc->set = mcu_gpio_set;
 	gc->direction_output = mcu_gpio_dir_out;
 	gc->parent = dev;
 
diff --git a/arch/powerpc/platforms/8xx/cpm1.c b/arch/powerpc/platforms/8xx/cpm1.c
index 7462c221115c..7433be7d66ee 100644
--- a/arch/powerpc/platforms/8xx/cpm1.c
+++ b/arch/powerpc/platforms/8xx/cpm1.c
@@ -499,7 +499,7 @@ int cpm1_gpiochip_add16(struct device *dev)
 	gc->direction_input = cpm1_gpio16_dir_in;
 	gc->direction_output = cpm1_gpio16_dir_out;
 	gc->get = cpm1_gpio16_get;
-	gc->set_rv = cpm1_gpio16_set;
+	gc->set = cpm1_gpio16_set;
 	gc->to_irq = cpm1_gpio16_to_irq;
 	gc->parent = dev;
 	gc->owner = THIS_MODULE;
@@ -622,7 +622,7 @@ int cpm1_gpiochip_add32(struct device *dev)
 	gc->direction_input = cpm1_gpio32_dir_in;
 	gc->direction_output = cpm1_gpio32_dir_out;
 	gc->get = cpm1_gpio32_get;
-	gc->set_rv = cpm1_gpio32_set;
+	gc->set = cpm1_gpio32_set;
 	gc->parent = dev;
 	gc->owner = THIS_MODULE;
 
diff --git a/arch/powerpc/sysdev/cpm_common.c b/arch/powerpc/sysdev/cpm_common.c
index e22fc638dbc7..f469f6a9f6e0 100644
--- a/arch/powerpc/sysdev/cpm_common.c
+++ b/arch/powerpc/sysdev/cpm_common.c
@@ -210,7 +210,7 @@ int cpm2_gpiochip_add32(struct device *dev)
 	gc->direction_input = cpm2_gpio32_dir_in;
 	gc->direction_output = cpm2_gpio32_dir_out;
 	gc->get = cpm2_gpio32_get;
-	gc->set_rv = cpm2_gpio32_set;
+	gc->set = cpm2_gpio32_set;
 	gc->parent = dev;
 	gc->owner = THIS_MODULE;
 
diff --git a/drivers/bcma/driver_gpio.c b/drivers/bcma/driver_gpio.c
index f021e27644e0..658c7e2ac8bf 100644
--- a/drivers/bcma/driver_gpio.c
+++ b/drivers/bcma/driver_gpio.c
@@ -186,7 +186,7 @@ int bcma_gpio_init(struct bcma_drv_cc *cc)
 	chip->request		= bcma_gpio_request;
 	chip->free		= bcma_gpio_free;
 	chip->get		= bcma_gpio_get_value;
-	chip->set_rv		= bcma_gpio_set_value;
+	chip->set		= bcma_gpio_set_value;
 	chip->direction_input	= bcma_gpio_direction_input;
 	chip->direction_output	= bcma_gpio_direction_output;
 	chip->parent		= bus->dev;
diff --git a/drivers/gpio/gpio-74x164.c b/drivers/gpio/gpio-74x164.c
index 4dd5c2c330bb..c226524efeba 100644
--- a/drivers/gpio/gpio-74x164.c
+++ b/drivers/gpio/gpio-74x164.c
@@ -141,8 +141,8 @@ static int gen_74x164_probe(struct spi_device *spi)
 	chip->gpio_chip.label = spi->modalias;
 	chip->gpio_chip.direction_output = gen_74x164_direction_output;
 	chip->gpio_chip.get = gen_74x164_get_value;
-	chip->gpio_chip.set_rv = gen_74x164_set_value;
-	chip->gpio_chip.set_multiple_rv = gen_74x164_set_multiple;
+	chip->gpio_chip.set = gen_74x164_set_value;
+	chip->gpio_chip.set_multiple = gen_74x164_set_multiple;
 	chip->gpio_chip.base = -1;
 	chip->gpio_chip.ngpio = GEN_74X164_NUMBER_GPIOS * chip->registers;
 	chip->gpio_chip.can_sleep = true;
diff --git a/drivers/gpio/gpio-adnp.c b/drivers/gpio/gpio-adnp.c
index dc2b941c3726..e5ac2d211013 100644
--- a/drivers/gpio/gpio-adnp.c
+++ b/drivers/gpio/gpio-adnp.c
@@ -430,7 +430,7 @@ static int adnp_gpio_setup(struct adnp *adnp, unsigned int num_gpios,
 	chip->direction_input = adnp_gpio_direction_input;
 	chip->direction_output = adnp_gpio_direction_output;
 	chip->get = adnp_gpio_get;
-	chip->set_rv = adnp_gpio_set;
+	chip->set = adnp_gpio_set;
 	chip->can_sleep = true;
 
 	if (IS_ENABLED(CONFIG_DEBUG_FS))
diff --git a/drivers/gpio/gpio-adp5520.c b/drivers/gpio/gpio-adp5520.c
index 57d12c10cbda..6305c8b7dc05 100644
--- a/drivers/gpio/gpio-adp5520.c
+++ b/drivers/gpio/gpio-adp5520.c
@@ -122,7 +122,7 @@ static int adp5520_gpio_probe(struct platform_device *pdev)
 	gc->direction_input  = adp5520_gpio_direction_input;
 	gc->direction_output = adp5520_gpio_direction_output;
 	gc->get = adp5520_gpio_get_value;
-	gc->set_rv = adp5520_gpio_set_value;
+	gc->set = adp5520_gpio_set_value;
 	gc->can_sleep = true;
 
 	gc->base = pdata->gpio_start;
diff --git a/drivers/gpio/gpio-adp5585.c b/drivers/gpio/gpio-adp5585.c
index b2c8836c5f84..0fd3cc26d017 100644
--- a/drivers/gpio/gpio-adp5585.c
+++ b/drivers/gpio/gpio-adp5585.c
@@ -428,7 +428,7 @@ static int adp5585_gpio_probe(struct platform_device *pdev)
 	gc->direction_input = adp5585_gpio_direction_input;
 	gc->direction_output = adp5585_gpio_direction_output;
 	gc->get = adp5585_gpio_get_value;
-	gc->set_rv = adp5585_gpio_set_value;
+	gc->set = adp5585_gpio_set_value;
 	gc->set_config = adp5585_gpio_set_config;
 	gc->request = adp5585_gpio_request;
 	gc->free = adp5585_gpio_free;
diff --git a/drivers/gpio/gpio-aggregator.c b/drivers/gpio/gpio-aggregator.c
index 6f941db02c04..af9d8b3a711d 100644
--- a/drivers/gpio/gpio-aggregator.c
+++ b/drivers/gpio/gpio-aggregator.c
@@ -534,8 +534,8 @@ static struct gpiochip_fwd *gpiochip_fwd_create(struct device *dev,
 	chip->direction_output = gpio_fwd_direction_output;
 	chip->get = gpio_fwd_get;
 	chip->get_multiple = gpio_fwd_get_multiple_locked;
-	chip->set_rv = gpio_fwd_set;
-	chip->set_multiple_rv = gpio_fwd_set_multiple_locked;
+	chip->set = gpio_fwd_set;
+	chip->set_multiple = gpio_fwd_set_multiple_locked;
 	chip->to_irq = gpio_fwd_to_irq;
 	chip->base = -1;
 	chip->ngpio = ngpios;
diff --git a/drivers/gpio/gpio-altera-a10sr.c b/drivers/gpio/gpio-altera-a10sr.c
index 77a674cf99e4..4524c18a87e7 100644
--- a/drivers/gpio/gpio-altera-a10sr.c
+++ b/drivers/gpio/gpio-altera-a10sr.c
@@ -69,7 +69,7 @@ static const struct gpio_chip altr_a10sr_gc = {
 	.label = "altr_a10sr_gpio",
 	.owner = THIS_MODULE,
 	.get = altr_a10sr_gpio_get,
-	.set_rv = altr_a10sr_gpio_set,
+	.set = altr_a10sr_gpio_set,
 	.direction_input = altr_a10sr_gpio_direction_input,
 	.direction_output = altr_a10sr_gpio_direction_output,
 	.can_sleep = true,
diff --git a/drivers/gpio/gpio-altera.c b/drivers/gpio/gpio-altera.c
index 1b28525726d7..9508d764cce4 100644
--- a/drivers/gpio/gpio-altera.c
+++ b/drivers/gpio/gpio-altera.c
@@ -259,7 +259,7 @@ static int altera_gpio_probe(struct platform_device *pdev)
 	altera_gc->gc.direction_input	= altera_gpio_direction_input;
 	altera_gc->gc.direction_output	= altera_gpio_direction_output;
 	altera_gc->gc.get		= altera_gpio_get;
-	altera_gc->gc.set_rv		= altera_gpio_set;
+	altera_gc->gc.set		= altera_gpio_set;
 	altera_gc->gc.owner		= THIS_MODULE;
 	altera_gc->gc.parent		= &pdev->dev;
 	altera_gc->gc.base		= -1;
diff --git a/drivers/gpio/gpio-amd-fch.c b/drivers/gpio/gpio-amd-fch.c
index f8d0cea46049..e6c6c3ec7656 100644
--- a/drivers/gpio/gpio-amd-fch.c
+++ b/drivers/gpio/gpio-amd-fch.c
@@ -165,7 +165,7 @@ static int amd_fch_gpio_probe(struct platform_device *pdev)
 	priv->gc.direction_output	= amd_fch_gpio_direction_output;
 	priv->gc.get_direction		= amd_fch_gpio_get_direction;
 	priv->gc.get			= amd_fch_gpio_get;
-	priv->gc.set_rv			= amd_fch_gpio_set;
+	priv->gc.set			= amd_fch_gpio_set;
 
 	spin_lock_init(&priv->lock);
 
diff --git a/drivers/gpio/gpio-amd8111.c b/drivers/gpio/gpio-amd8111.c
index 425d8472f744..15fd5e210d74 100644
--- a/drivers/gpio/gpio-amd8111.c
+++ b/drivers/gpio/gpio-amd8111.c
@@ -165,7 +165,7 @@ static struct amd_gpio gp = {
 		.ngpio		= 32,
 		.request	= amd_gpio_request,
 		.free		= amd_gpio_free,
-		.set_rv		= amd_gpio_set,
+		.set		= amd_gpio_set,
 		.get		= amd_gpio_get,
 		.direction_output = amd_gpio_dirout,
 		.direction_input = amd_gpio_dirin,
diff --git a/drivers/gpio/gpio-arizona.c b/drivers/gpio/gpio-arizona.c
index 89ffde693019..a7e98d395d8e 100644
--- a/drivers/gpio/gpio-arizona.c
+++ b/drivers/gpio/gpio-arizona.c
@@ -138,7 +138,7 @@ static const struct gpio_chip template_chip = {
 	.direction_input	= arizona_gpio_direction_in,
 	.get			= arizona_gpio_get,
 	.direction_output	= arizona_gpio_direction_out,
-	.set_rv			= arizona_gpio_set,
+	.set			= arizona_gpio_set,
 	.can_sleep		= true,
 };
 
diff --git a/drivers/gpio/gpio-aspeed-sgpio.c b/drivers/gpio/gpio-aspeed-sgpio.c
index 00b31497ecff..7622f9e9f54a 100644
--- a/drivers/gpio/gpio-aspeed-sgpio.c
+++ b/drivers/gpio/gpio-aspeed-sgpio.c
@@ -596,7 +596,7 @@ static int __init aspeed_sgpio_probe(struct platform_device *pdev)
 	gpio->chip.request = NULL;
 	gpio->chip.free = NULL;
 	gpio->chip.get = aspeed_sgpio_get;
-	gpio->chip.set_rv = aspeed_sgpio_set;
+	gpio->chip.set = aspeed_sgpio_set;
 	gpio->chip.set_config = aspeed_sgpio_set_config;
 	gpio->chip.label = dev_name(&pdev->dev);
 	gpio->chip.base = -1;
diff --git a/drivers/gpio/gpio-aspeed.c b/drivers/gpio/gpio-aspeed.c
index 2d340a343a17..7953a9c4e36d 100644
--- a/drivers/gpio/gpio-aspeed.c
+++ b/drivers/gpio/gpio-aspeed.c
@@ -1352,7 +1352,7 @@ static int aspeed_gpio_probe(struct platform_device *pdev)
 	gpio->chip.request = aspeed_gpio_request;
 	gpio->chip.free = aspeed_gpio_free;
 	gpio->chip.get = aspeed_gpio_get;
-	gpio->chip.set_rv = aspeed_gpio_set;
+	gpio->chip.set = aspeed_gpio_set;
 	gpio->chip.set_config = aspeed_gpio_set_config;
 	gpio->chip.label = dev_name(&pdev->dev);
 	gpio->chip.base = -1;
diff --git a/drivers/gpio/gpio-bcm-kona.c b/drivers/gpio/gpio-bcm-kona.c
index 8f22cb36004d..208b71c59d58 100644
--- a/drivers/gpio/gpio-bcm-kona.c
+++ b/drivers/gpio/gpio-bcm-kona.c
@@ -339,7 +339,7 @@ static const struct gpio_chip template_chip = {
 	.direction_input = bcm_kona_gpio_direction_input,
 	.get = bcm_kona_gpio_get,
 	.direction_output = bcm_kona_gpio_direction_output,
-	.set_rv = bcm_kona_gpio_set,
+	.set = bcm_kona_gpio_set,
 	.set_config = bcm_kona_gpio_set_config,
 	.to_irq = bcm_kona_gpio_to_irq,
 	.base = 0,
diff --git a/drivers/gpio/gpio-bd71815.c b/drivers/gpio/gpio-bd71815.c
index 36701500925e..afb18a5a9d79 100644
--- a/drivers/gpio/gpio-bd71815.c
+++ b/drivers/gpio/gpio-bd71815.c
@@ -85,7 +85,7 @@ static const struct gpio_chip bd71815gpo_chip = {
 	.owner			= THIS_MODULE,
 	.get			= bd71815gpo_get,
 	.get_direction		= bd71815gpo_direction_get,
-	.set_rv			= bd71815gpo_set,
+	.set			= bd71815gpo_set,
 	.set_config		= bd71815_gpio_set_config,
 	.can_sleep		= true,
 };
diff --git a/drivers/gpio/gpio-bd71828.c b/drivers/gpio/gpio-bd71828.c
index 4ba151e5cf25..e439dbfffc62 100644
--- a/drivers/gpio/gpio-bd71828.c
+++ b/drivers/gpio/gpio-bd71828.c
@@ -109,7 +109,7 @@ static int bd71828_probe(struct platform_device *pdev)
 	bdgpio->gpio.set_config = bd71828_gpio_set_config;
 	bdgpio->gpio.can_sleep = true;
 	bdgpio->gpio.get = bd71828_gpio_get;
-	bdgpio->gpio.set_rv = bd71828_gpio_set;
+	bdgpio->gpio.set = bd71828_gpio_set;
 	bdgpio->gpio.base = -1;
 
 	/*
diff --git a/drivers/gpio/gpio-bd9571mwv.c b/drivers/gpio/gpio-bd9571mwv.c
index 8df1361e3e84..7c95bb36511e 100644
--- a/drivers/gpio/gpio-bd9571mwv.c
+++ b/drivers/gpio/gpio-bd9571mwv.c
@@ -88,7 +88,7 @@ static const struct gpio_chip template_chip = {
 	.direction_input	= bd9571mwv_gpio_direction_input,
 	.direction_output	= bd9571mwv_gpio_direction_output,
 	.get			= bd9571mwv_gpio_get,
-	.set_rv			= bd9571mwv_gpio_set,
+	.set			= bd9571mwv_gpio_set,
 	.base			= -1,
 	.ngpio			= 2,
 	.can_sleep		= true,
diff --git a/drivers/gpio/gpio-bt8xx.c b/drivers/gpio/gpio-bt8xx.c
index 7c9e81fea37a..05401da03ca3 100644
--- a/drivers/gpio/gpio-bt8xx.c
+++ b/drivers/gpio/gpio-bt8xx.c
@@ -145,7 +145,7 @@ static void bt8xxgpio_gpio_setup(struct bt8xxgpio *bg)
 	c->direction_input = bt8xxgpio_gpio_direction_input;
 	c->get = bt8xxgpio_gpio_get;
 	c->direction_output = bt8xxgpio_gpio_direction_output;
-	c->set_rv = bt8xxgpio_gpio_set;
+	c->set = bt8xxgpio_gpio_set;
 	c->dbg_show = NULL;
 	c->base = modparam_gpiobase;
 	c->ngpio = BT8XXGPIO_NR_GPIOS;
diff --git a/drivers/gpio/gpio-cgbc.c b/drivers/gpio/gpio-cgbc.c
index 1495bec62456..0efa1b61001a 100644
--- a/drivers/gpio/gpio-cgbc.c
+++ b/drivers/gpio/gpio-cgbc.c
@@ -171,7 +171,7 @@ static int cgbc_gpio_probe(struct platform_device *pdev)
 	chip->direction_output = cgbc_gpio_direction_output;
 	chip->get_direction = cgbc_gpio_get_direction;
 	chip->get = cgbc_gpio_get;
-	chip->set_rv = cgbc_gpio_set;
+	chip->set = cgbc_gpio_set;
 	chip->ngpio = CGBC_GPIO_NGPIO;
 
 	ret = devm_mutex_init(dev, &gpio->lock);
diff --git a/drivers/gpio/gpio-creg-snps.c b/drivers/gpio/gpio-creg-snps.c
index 8b49f02c7896..f8ea961fa1de 100644
--- a/drivers/gpio/gpio-creg-snps.c
+++ b/drivers/gpio/gpio-creg-snps.c
@@ -167,7 +167,7 @@ static int creg_gpio_probe(struct platform_device *pdev)
 	hcg->gc.label = dev_name(dev);
 	hcg->gc.base = -1;
 	hcg->gc.ngpio = ngpios;
-	hcg->gc.set_rv = creg_gpio_set;
+	hcg->gc.set = creg_gpio_set;
 	hcg->gc.direction_output = creg_gpio_dir_out;
 
 	ret = devm_gpiochip_add_data(dev, &hcg->gc, hcg);
diff --git a/drivers/gpio/gpio-cros-ec.c b/drivers/gpio/gpio-cros-ec.c
index 53cd5ff6247b..435483826c6e 100644
--- a/drivers/gpio/gpio-cros-ec.c
+++ b/drivers/gpio/gpio-cros-ec.c
@@ -188,7 +188,7 @@ static int cros_ec_gpio_probe(struct platform_device *pdev)
 	gc->can_sleep = true;
 	gc->label = dev_name(dev);
 	gc->base = -1;
-	gc->set_rv = cros_ec_gpio_set;
+	gc->set = cros_ec_gpio_set;
 	gc->get = cros_ec_gpio_get;
 	gc->get_direction = cros_ec_gpio_get_direction;
 
diff --git a/drivers/gpio/gpio-crystalcove.c b/drivers/gpio/gpio-crystalcove.c
index 8db7cca3a060..0fb5c06d0886 100644
--- a/drivers/gpio/gpio-crystalcove.c
+++ b/drivers/gpio/gpio-crystalcove.c
@@ -349,7 +349,7 @@ static int crystalcove_gpio_probe(struct platform_device *pdev)
 	cg->chip.direction_input = crystalcove_gpio_dir_in;
 	cg->chip.direction_output = crystalcove_gpio_dir_out;
 	cg->chip.get = crystalcove_gpio_get;
-	cg->chip.set_rv = crystalcove_gpio_set;
+	cg->chip.set = crystalcove_gpio_set;
 	cg->chip.base = -1;
 	cg->chip.ngpio = CRYSTALCOVE_VGPIO_NUM;
 	cg->chip.can_sleep = true;
diff --git a/drivers/gpio/gpio-cs5535.c b/drivers/gpio/gpio-cs5535.c
index 143d1f4173a6..8affe4e9f90e 100644
--- a/drivers/gpio/gpio-cs5535.c
+++ b/drivers/gpio/gpio-cs5535.c
@@ -296,7 +296,7 @@ static struct cs5535_gpio_chip cs5535_gpio_chip = {
 		.request = chip_gpio_request,
 
 		.get = chip_gpio_get,
-		.set_rv = chip_gpio_set,
+		.set = chip_gpio_set,
 
 		.direction_input = chip_direction_input,
 		.direction_output = chip_direction_output,
diff --git a/drivers/gpio/gpio-da9052.c b/drivers/gpio/gpio-da9052.c
index 6482c5b267db..495f0ee58505 100644
--- a/drivers/gpio/gpio-da9052.c
+++ b/drivers/gpio/gpio-da9052.c
@@ -172,7 +172,7 @@ static const struct gpio_chip reference_gp = {
 	.label = "da9052-gpio",
 	.owner = THIS_MODULE,
 	.get = da9052_gpio_get,
-	.set_rv = da9052_gpio_set,
+	.set = da9052_gpio_set,
 	.direction_input = da9052_gpio_direction_input,
 	.direction_output = da9052_gpio_direction_output,
 	.to_irq = da9052_gpio_to_irq,
diff --git a/drivers/gpio/gpio-da9055.c b/drivers/gpio/gpio-da9055.c
index 3d9d0c700100..a09bd6eb93cf 100644
--- a/drivers/gpio/gpio-da9055.c
+++ b/drivers/gpio/gpio-da9055.c
@@ -116,7 +116,7 @@ static const struct gpio_chip reference_gp = {
 	.label = "da9055-gpio",
 	.owner = THIS_MODULE,
 	.get = da9055_gpio_get,
-	.set_rv = da9055_gpio_set,
+	.set = da9055_gpio_set,
 	.direction_input = da9055_gpio_direction_input,
 	.direction_output = da9055_gpio_direction_output,
 	.to_irq = da9055_gpio_to_irq,
diff --git a/drivers/gpio/gpio-davinci.c b/drivers/gpio/gpio-davinci.c
index 8f3a36d0191d..538f27209ce7 100644
--- a/drivers/gpio/gpio-davinci.c
+++ b/drivers/gpio/gpio-davinci.c
@@ -202,7 +202,7 @@ static int davinci_gpio_probe(struct platform_device *pdev)
 	chips->chip.direction_input = davinci_direction_in;
 	chips->chip.get = davinci_gpio_get;
 	chips->chip.direction_output = davinci_direction_out;
-	chips->chip.set_rv = davinci_gpio_set;
+	chips->chip.set = davinci_gpio_set;
 
 	chips->chip.ngpio = ngpio;
 	chips->chip.base = -1;
diff --git a/drivers/gpio/gpio-dln2.c b/drivers/gpio/gpio-dln2.c
index 4bd3c47eaf93..4670ffd7ea7f 100644
--- a/drivers/gpio/gpio-dln2.c
+++ b/drivers/gpio/gpio-dln2.c
@@ -469,7 +469,7 @@ static int dln2_gpio_probe(struct platform_device *pdev)
 	dln2->gpio.base = -1;
 	dln2->gpio.ngpio = pins;
 	dln2->gpio.can_sleep = true;
-	dln2->gpio.set_rv = dln2_gpio_set;
+	dln2->gpio.set = dln2_gpio_set;
 	dln2->gpio.get = dln2_gpio_get;
 	dln2->gpio.request = dln2_gpio_request;
 	dln2->gpio.free = dln2_gpio_free;
diff --git a/drivers/gpio/gpio-eic-sprd.c b/drivers/gpio/gpio-eic-sprd.c
index f2973d0b7138..50fafeda8d7e 100644
--- a/drivers/gpio/gpio-eic-sprd.c
+++ b/drivers/gpio/gpio-eic-sprd.c
@@ -663,7 +663,7 @@ static int sprd_eic_probe(struct platform_device *pdev)
 		sprd_eic->chip.request = sprd_eic_request;
 		sprd_eic->chip.free = sprd_eic_free;
 		sprd_eic->chip.set_config = sprd_eic_set_config;
-		sprd_eic->chip.set_rv = sprd_eic_set;
+		sprd_eic->chip.set = sprd_eic_set;
 		fallthrough;
 	case SPRD_EIC_ASYNC:
 	case SPRD_EIC_SYNC:
diff --git a/drivers/gpio/gpio-em.c b/drivers/gpio/gpio-em.c
index 015f1ac32dd9..a214b0672726 100644
--- a/drivers/gpio/gpio-em.c
+++ b/drivers/gpio/gpio-em.c
@@ -306,7 +306,7 @@ static int em_gio_probe(struct platform_device *pdev)
 	gpio_chip->direction_input = em_gio_direction_input;
 	gpio_chip->get = em_gio_get;
 	gpio_chip->direction_output = em_gio_direction_output;
-	gpio_chip->set_rv = em_gio_set;
+	gpio_chip->set = em_gio_set;
 	gpio_chip->to_irq = em_gio_to_irq;
 	gpio_chip->request = pinctrl_gpio_request;
 	gpio_chip->free = em_gio_free;
diff --git a/drivers/gpio/gpio-exar.c b/drivers/gpio/gpio-exar.c
index beb98286d13e..9053662f1817 100644
--- a/drivers/gpio/gpio-exar.c
+++ b/drivers/gpio/gpio-exar.c
@@ -211,7 +211,7 @@ static int gpio_exar_probe(struct platform_device *pdev)
 	exar_gpio->gpio_chip.direction_input = exar_direction_input;
 	exar_gpio->gpio_chip.get_direction = exar_get_direction;
 	exar_gpio->gpio_chip.get = exar_get_value;
-	exar_gpio->gpio_chip.set_rv = exar_set_value;
+	exar_gpio->gpio_chip.set = exar_set_value;
 	exar_gpio->gpio_chip.base = -1;
 	exar_gpio->gpio_chip.ngpio = ngpios;
 	exar_gpio->index = index;
diff --git a/drivers/gpio/gpio-f7188x.c b/drivers/gpio/gpio-f7188x.c
index dfcd3634f279..4d5b927ad70f 100644
--- a/drivers/gpio/gpio-f7188x.c
+++ b/drivers/gpio/gpio-f7188x.c
@@ -173,7 +173,7 @@ static int f7188x_gpio_set_config(struct gpio_chip *chip, unsigned offset,
 			.direction_input  = f7188x_gpio_direction_in,	\
 			.get              = f7188x_gpio_get,		\
 			.direction_output = f7188x_gpio_direction_out,	\
-			.set_rv           = f7188x_gpio_set,		\
+			.set              = f7188x_gpio_set,		\
 			.set_config	  = f7188x_gpio_set_config,	\
 			.base             = -1,				\
 			.ngpio            = _ngpio,			\
diff --git a/drivers/gpio/gpio-graniterapids.c b/drivers/gpio/gpio-graniterapids.c
index f25283e5239d..121bf29a27f5 100644
--- a/drivers/gpio/gpio-graniterapids.c
+++ b/drivers/gpio/gpio-graniterapids.c
@@ -159,7 +159,7 @@ static const struct gpio_chip gnr_gpio_chip = {
 	.owner		  = THIS_MODULE,
 	.request	  = gnr_gpio_request,
 	.get		  = gnr_gpio_get,
-	.set_rv		  = gnr_gpio_set,
+	.set		  = gnr_gpio_set,
 	.get_direction    = gnr_gpio_get_direction,
 	.direction_input  = gnr_gpio_direction_input,
 	.direction_output = gnr_gpio_direction_output,
diff --git a/drivers/gpio/gpio-gw-pld.c b/drivers/gpio/gpio-gw-pld.c
index a40ba99a3aea..2e5d97b7363f 100644
--- a/drivers/gpio/gpio-gw-pld.c
+++ b/drivers/gpio/gpio-gw-pld.c
@@ -86,7 +86,7 @@ static int gw_pld_probe(struct i2c_client *client)
 	gw->chip.direction_input = gw_pld_input8;
 	gw->chip.get = gw_pld_get8;
 	gw->chip.direction_output = gw_pld_output8;
-	gw->chip.set_rv = gw_pld_set8;
+	gw->chip.set = gw_pld_set8;
 	gw->client = client;
 
 	/*
diff --git a/drivers/gpio/gpio-htc-egpio.c b/drivers/gpio/gpio-htc-egpio.c
index b1844a676c7c..2eaed83214d8 100644
--- a/drivers/gpio/gpio-htc-egpio.c
+++ b/drivers/gpio/gpio-htc-egpio.c
@@ -324,7 +324,7 @@ static int __init egpio_probe(struct platform_device *pdev)
 		chip->parent          = &pdev->dev;
 		chip->owner           = THIS_MODULE;
 		chip->get             = egpio_get;
-		chip->set_rv          = egpio_set;
+		chip->set             = egpio_set;
 		chip->direction_input = egpio_direction_input;
 		chip->direction_output = egpio_direction_output;
 		chip->get_direction   = egpio_get_direction;
diff --git a/drivers/gpio/gpio-ich.c b/drivers/gpio/gpio-ich.c
index 67089b2423d8..1802c9116ffe 100644
--- a/drivers/gpio/gpio-ich.c
+++ b/drivers/gpio/gpio-ich.c
@@ -273,7 +273,7 @@ static void ichx_gpiolib_setup(struct gpio_chip *chip)
 	chip->get = ichx_priv.desc->get ?
 		ichx_priv.desc->get : ichx_gpio_get;
 
-	chip->set_rv = ichx_gpio_set;
+	chip->set = ichx_gpio_set;
 	chip->get_direction = ichx_gpio_get_direction;
 	chip->direction_input = ichx_gpio_direction_input;
 	chip->direction_output = ichx_gpio_direction_output;
diff --git a/drivers/gpio/gpio-imx-scu.c b/drivers/gpio/gpio-imx-scu.c
index 1693dbf1b777..0a75afecf9f8 100644
--- a/drivers/gpio/gpio-imx-scu.c
+++ b/drivers/gpio/gpio-imx-scu.c
@@ -102,7 +102,7 @@ static int imx_scu_gpio_probe(struct platform_device *pdev)
 	gc->ngpio = ARRAY_SIZE(scu_rsrc_arr);
 	gc->label = dev_name(dev);
 	gc->get = imx_scu_gpio_get;
-	gc->set_rv = imx_scu_gpio_set;
+	gc->set = imx_scu_gpio_set;
 	gc->get_direction = imx_scu_gpio_get_direction;
 
 	platform_set_drvdata(pdev, priv);
diff --git a/drivers/gpio/gpio-it87.c b/drivers/gpio/gpio-it87.c
index d8184b527bac..5d677bcfccf2 100644
--- a/drivers/gpio/gpio-it87.c
+++ b/drivers/gpio/gpio-it87.c
@@ -267,7 +267,7 @@ static const struct gpio_chip it87_template_chip = {
 	.request		= it87_gpio_request,
 	.get			= it87_gpio_get,
 	.direction_input	= it87_gpio_direction_in,
-	.set_rv			= it87_gpio_set,
+	.set			= it87_gpio_set,
 	.direction_output	= it87_gpio_direction_out,
 	.base			= -1
 };
diff --git a/drivers/gpio/gpio-janz-ttl.c b/drivers/gpio/gpio-janz-ttl.c
index 9f548eda3888..b0c4a3346e7d 100644
--- a/drivers/gpio/gpio-janz-ttl.c
+++ b/drivers/gpio/gpio-janz-ttl.c
@@ -171,7 +171,7 @@ static int ttl_probe(struct platform_device *pdev)
 	gpio->parent = &pdev->dev;
 	gpio->label = pdev->name;
 	gpio->get = ttl_get_value;
-	gpio->set_rv = ttl_set_value;
+	gpio->set = ttl_set_value;
 	gpio->owner = THIS_MODULE;
 
 	/* request dynamic allocation */
diff --git a/drivers/gpio/gpio-kempld.c b/drivers/gpio/gpio-kempld.c
index e38e604baa22..923aad3ab4d4 100644
--- a/drivers/gpio/gpio-kempld.c
+++ b/drivers/gpio/gpio-kempld.c
@@ -169,7 +169,7 @@ static int kempld_gpio_probe(struct platform_device *pdev)
 	chip->direction_output = kempld_gpio_direction_output;
 	chip->get_direction = kempld_gpio_get_direction;
 	chip->get = kempld_gpio_get;
-	chip->set_rv = kempld_gpio_set;
+	chip->set = kempld_gpio_set;
 	chip->ngpio = kempld_gpio_pincount(pld);
 	if (chip->ngpio == 0) {
 		dev_err(dev, "No GPIO pins detected\n");
diff --git a/drivers/gpio/gpio-latch.c b/drivers/gpio/gpio-latch.c
index 3d0ff09284fb..c64aaa896766 100644
--- a/drivers/gpio/gpio-latch.c
+++ b/drivers/gpio/gpio-latch.c
@@ -166,11 +166,11 @@ static int gpio_latch_probe(struct platform_device *pdev)
 
 	if (gpio_latch_can_sleep(priv, n_latches)) {
 		priv->gc.can_sleep = true;
-		priv->gc.set_rv = gpio_latch_set_can_sleep;
+		priv->gc.set = gpio_latch_set_can_sleep;
 		mutex_init(&priv->mutex);
 	} else {
 		priv->gc.can_sleep = false;
-		priv->gc.set_rv = gpio_latch_set;
+		priv->gc.set = gpio_latch_set;
 		spin_lock_init(&priv->spinlock);
 	}
 
diff --git a/drivers/gpio/gpio-ljca.c b/drivers/gpio/gpio-ljca.c
index 61524a9ba765..3b4f8830c741 100644
--- a/drivers/gpio/gpio-ljca.c
+++ b/drivers/gpio/gpio-ljca.c
@@ -437,7 +437,7 @@ static int ljca_gpio_probe(struct auxiliary_device *auxdev,
 	ljca_gpio->gc.direction_output = ljca_gpio_direction_output;
 	ljca_gpio->gc.get_direction = ljca_gpio_get_direction;
 	ljca_gpio->gc.get = ljca_gpio_get_value;
-	ljca_gpio->gc.set_rv = ljca_gpio_set_value;
+	ljca_gpio->gc.set = ljca_gpio_set_value;
 	ljca_gpio->gc.set_config = ljca_gpio_set_config;
 	ljca_gpio->gc.init_valid_mask = ljca_gpio_init_valid_mask;
 	ljca_gpio->gc.can_sleep = true;
diff --git a/drivers/gpio/gpio-logicvc.c b/drivers/gpio/gpio-logicvc.c
index 19cd2847467c..cb9dbcc290ad 100644
--- a/drivers/gpio/gpio-logicvc.c
+++ b/drivers/gpio/gpio-logicvc.c
@@ -134,7 +134,7 @@ static int logicvc_gpio_probe(struct platform_device *pdev)
 	logicvc->chip.ngpio = LOGICVC_CTRL_GPIO_BITS +
 			      LOGICVC_POWER_CTRL_GPIO_BITS;
 	logicvc->chip.get = logicvc_gpio_get;
-	logicvc->chip.set_rv = logicvc_gpio_set;
+	logicvc->chip.set = logicvc_gpio_set;
 	logicvc->chip.direction_output = logicvc_gpio_direction_output;
 
 	return devm_gpiochip_add_data(dev, &logicvc->chip, logicvc);
diff --git a/drivers/gpio/gpio-loongson-64bit.c b/drivers/gpio/gpio-loongson-64bit.c
index add09971d26a..818c606fbc51 100644
--- a/drivers/gpio/gpio-loongson-64bit.c
+++ b/drivers/gpio/gpio-loongson-64bit.c
@@ -157,7 +157,7 @@ static int loongson_gpio_init(struct device *dev, struct loongson_gpio_chip *lgp
 		lgpio->chip.get = loongson_gpio_get;
 		lgpio->chip.get_direction = loongson_gpio_get_direction;
 		lgpio->chip.direction_output = loongson_gpio_direction_output;
-		lgpio->chip.set_rv = loongson_gpio_set;
+		lgpio->chip.set = loongson_gpio_set;
 		lgpio->chip.parent = dev;
 		spin_lock_init(&lgpio->lock);
 	}
diff --git a/drivers/gpio/gpio-loongson.c b/drivers/gpio/gpio-loongson.c
index 8f3668169ebf..f3e0559f969d 100644
--- a/drivers/gpio/gpio-loongson.c
+++ b/drivers/gpio/gpio-loongson.c
@@ -106,7 +106,7 @@ static int loongson_gpio_probe(struct platform_device *pdev)
 	gc->base = 0;
 	gc->ngpio = LOONGSON_N_GPIO;
 	gc->get = loongson_gpio_get_value;
-	gc->set_rv = loongson_gpio_set_value;
+	gc->set = loongson_gpio_set_value;
 	gc->direction_input = loongson_gpio_direction_input;
 	gc->direction_output = loongson_gpio_direction_output;
 
diff --git a/drivers/gpio/gpio-lp3943.c b/drivers/gpio/gpio-lp3943.c
index 52ab3ac4844c..e8e00daff7df 100644
--- a/drivers/gpio/gpio-lp3943.c
+++ b/drivers/gpio/gpio-lp3943.c
@@ -184,7 +184,7 @@ static const struct gpio_chip lp3943_gpio_chip = {
 	.direction_input	= lp3943_gpio_direction_input,
 	.get			= lp3943_gpio_get,
 	.direction_output	= lp3943_gpio_direction_output,
-	.set_rv			= lp3943_gpio_set,
+	.set			= lp3943_gpio_set,
 	.base			= -1,
 	.ngpio			= LP3943_MAX_GPIO,
 	.can_sleep		= 1,
diff --git a/drivers/gpio/gpio-lp873x.c b/drivers/gpio/gpio-lp873x.c
index 1908ed302e92..5376708a81bf 100644
--- a/drivers/gpio/gpio-lp873x.c
+++ b/drivers/gpio/gpio-lp873x.c
@@ -124,7 +124,7 @@ static const struct gpio_chip template_chip = {
 	.direction_input	= lp873x_gpio_direction_input,
 	.direction_output	= lp873x_gpio_direction_output,
 	.get			= lp873x_gpio_get,
-	.set_rv			= lp873x_gpio_set,
+	.set			= lp873x_gpio_set,
 	.set_config		= lp873x_gpio_set_config,
 	.base			= -1,
 	.ngpio			= 2,
diff --git a/drivers/gpio/gpio-lp87565.c b/drivers/gpio/gpio-lp87565.c
index 8ea687d5d028..0f337c1283b2 100644
--- a/drivers/gpio/gpio-lp87565.c
+++ b/drivers/gpio/gpio-lp87565.c
@@ -139,7 +139,7 @@ static const struct gpio_chip template_chip = {
 	.direction_input	= lp87565_gpio_direction_input,
 	.direction_output	= lp87565_gpio_direction_output,
 	.get			= lp87565_gpio_get,
-	.set_rv			= lp87565_gpio_set,
+	.set			= lp87565_gpio_set,
 	.set_config		= lp87565_gpio_set_config,
 	.base			= -1,
 	.ngpio			= 3,
diff --git a/drivers/gpio/gpio-lpc18xx.c b/drivers/gpio/gpio-lpc18xx.c
index 2dbfbf90176c..37a2342eb2e6 100644
--- a/drivers/gpio/gpio-lpc18xx.c
+++ b/drivers/gpio/gpio-lpc18xx.c
@@ -327,7 +327,7 @@ static const struct gpio_chip lpc18xx_chip = {
 	.free			= gpiochip_generic_free,
 	.direction_input	= lpc18xx_gpio_direction_input,
 	.direction_output	= lpc18xx_gpio_direction_output,
-	.set_rv			= lpc18xx_gpio_set,
+	.set			= lpc18xx_gpio_set,
 	.get			= lpc18xx_gpio_get,
 	.ngpio			= LPC18XX_MAX_PORTS * LPC18XX_PINS_PER_PORT,
 	.owner			= THIS_MODULE,
diff --git a/drivers/gpio/gpio-lpc32xx.c b/drivers/gpio/gpio-lpc32xx.c
index 6668b8bd9f1e..37fc54fc7385 100644
--- a/drivers/gpio/gpio-lpc32xx.c
+++ b/drivers/gpio/gpio-lpc32xx.c
@@ -407,7 +407,7 @@ static struct lpc32xx_gpio_chip lpc32xx_gpiochip[] = {
 			.direction_input	= lpc32xx_gpio_dir_input_p012,
 			.get			= lpc32xx_gpio_get_value_p012,
 			.direction_output	= lpc32xx_gpio_dir_output_p012,
-			.set_rv			= lpc32xx_gpio_set_value_p012,
+			.set			= lpc32xx_gpio_set_value_p012,
 			.request		= lpc32xx_gpio_request,
 			.to_irq			= lpc32xx_gpio_to_irq_p01,
 			.base			= LPC32XX_GPIO_P0_GRP,
@@ -423,7 +423,7 @@ static struct lpc32xx_gpio_chip lpc32xx_gpiochip[] = {
 			.direction_input	= lpc32xx_gpio_dir_input_p012,
 			.get			= lpc32xx_gpio_get_value_p012,
 			.direction_output	= lpc32xx_gpio_dir_output_p012,
-			.set_rv			= lpc32xx_gpio_set_value_p012,
+			.set			= lpc32xx_gpio_set_value_p012,
 			.request		= lpc32xx_gpio_request,
 			.to_irq			= lpc32xx_gpio_to_irq_p01,
 			.base			= LPC32XX_GPIO_P1_GRP,
@@ -439,7 +439,7 @@ static struct lpc32xx_gpio_chip lpc32xx_gpiochip[] = {
 			.direction_input	= lpc32xx_gpio_dir_input_p012,
 			.get			= lpc32xx_gpio_get_value_p012,
 			.direction_output	= lpc32xx_gpio_dir_output_p012,
-			.set_rv			= lpc32xx_gpio_set_value_p012,
+			.set			= lpc32xx_gpio_set_value_p012,
 			.request		= lpc32xx_gpio_request,
 			.base			= LPC32XX_GPIO_P2_GRP,
 			.ngpio			= LPC32XX_GPIO_P2_MAX,
@@ -454,7 +454,7 @@ static struct lpc32xx_gpio_chip lpc32xx_gpiochip[] = {
 			.direction_input	= lpc32xx_gpio_dir_input_p3,
 			.get			= lpc32xx_gpio_get_value_p3,
 			.direction_output	= lpc32xx_gpio_dir_output_p3,
-			.set_rv			= lpc32xx_gpio_set_value_p3,
+			.set			= lpc32xx_gpio_set_value_p3,
 			.request		= lpc32xx_gpio_request,
 			.to_irq			= lpc32xx_gpio_to_irq_gpio_p3,
 			.base			= LPC32XX_GPIO_P3_GRP,
@@ -482,7 +482,7 @@ static struct lpc32xx_gpio_chip lpc32xx_gpiochip[] = {
 		.chip = {
 			.label			= "gpo_p3",
 			.direction_output	= lpc32xx_gpio_dir_out_always,
-			.set_rv			= lpc32xx_gpo_set_value,
+			.set			= lpc32xx_gpo_set_value,
 			.get			= lpc32xx_gpo_get_value,
 			.request		= lpc32xx_gpio_request,
 			.base			= LPC32XX_GPO_P3_GRP,
diff --git a/drivers/gpio/gpio-macsmc.c b/drivers/gpio/gpio-macsmc.c
index 7570d9e89adf..30ef258e7655 100644
--- a/drivers/gpio/gpio-macsmc.c
+++ b/drivers/gpio/gpio-macsmc.c
@@ -261,7 +261,7 @@ static int macsmc_gpio_probe(struct platform_device *pdev)
 	smcgp->gc.label = "macsmc-pmu-gpio";
 	smcgp->gc.owner = THIS_MODULE;
 	smcgp->gc.get = macsmc_gpio_get;
-	smcgp->gc.set_rv = macsmc_gpio_set;
+	smcgp->gc.set = macsmc_gpio_set;
 	smcgp->gc.get_direction = macsmc_gpio_get_direction;
 	smcgp->gc.init_valid_mask = macsmc_gpio_init_valid_mask;
 	smcgp->gc.can_sleep = true;
diff --git a/drivers/gpio/gpio-madera.c b/drivers/gpio/gpio-madera.c
index e73e72d62bc8..551faf9655b2 100644
--- a/drivers/gpio/gpio-madera.c
+++ b/drivers/gpio/gpio-madera.c
@@ -109,7 +109,7 @@ static const struct gpio_chip madera_gpio_chip = {
 	.direction_input	= madera_gpio_direction_in,
 	.get			= madera_gpio_get,
 	.direction_output	= madera_gpio_direction_out,
-	.set_rv			= madera_gpio_set,
+	.set			= madera_gpio_set,
 	.set_config		= gpiochip_generic_config,
 	.can_sleep		= true,
 };
diff --git a/drivers/gpio/gpio-max730x.c b/drivers/gpio/gpio-max730x.c
index 75d414d8c992..84c7c2dca822 100644
--- a/drivers/gpio/gpio-max730x.c
+++ b/drivers/gpio/gpio-max730x.c
@@ -188,7 +188,7 @@ int __max730x_probe(struct max7301 *ts)
 	ts->chip.direction_input = max7301_direction_input;
 	ts->chip.get = max7301_get;
 	ts->chip.direction_output = max7301_direction_output;
-	ts->chip.set_rv = max7301_set;
+	ts->chip.set = max7301_set;
 
 	ts->chip.ngpio = PIN_NUMBER;
 	ts->chip.can_sleep = true;
diff --git a/drivers/gpio/gpio-max732x.c b/drivers/gpio/gpio-max732x.c
index d5ffedb086af..a61d670ceeda 100644
--- a/drivers/gpio/gpio-max732x.c
+++ b/drivers/gpio/gpio-max732x.c
@@ -585,8 +585,8 @@ static int max732x_setup_gpio(struct max732x_chip *chip,
 		gc->direction_input = max732x_gpio_direction_input;
 	if (chip->dir_output) {
 		gc->direction_output = max732x_gpio_direction_output;
-		gc->set_rv = max732x_gpio_set_value;
-		gc->set_multiple_rv = max732x_gpio_set_multiple;
+		gc->set = max732x_gpio_set_value;
+		gc->set_multiple = max732x_gpio_set_multiple;
 	}
 	gc->get = max732x_gpio_get_value;
 	gc->can_sleep = true;
diff --git a/drivers/gpio/gpio-max77620.c b/drivers/gpio/gpio-max77620.c
index af7af8e40afe..02eca400b307 100644
--- a/drivers/gpio/gpio-max77620.c
+++ b/drivers/gpio/gpio-max77620.c
@@ -311,7 +311,7 @@ static int max77620_gpio_probe(struct platform_device *pdev)
 	mgpio->gpio_chip.direction_input = max77620_gpio_dir_input;
 	mgpio->gpio_chip.get = max77620_gpio_get;
 	mgpio->gpio_chip.direction_output = max77620_gpio_dir_output;
-	mgpio->gpio_chip.set_rv = max77620_gpio_set;
+	mgpio->gpio_chip.set = max77620_gpio_set;
 	mgpio->gpio_chip.set_config = max77620_gpio_set_config;
 	mgpio->gpio_chip.ngpio = MAX77620_GPIO_NR;
 	mgpio->gpio_chip.can_sleep = 1;
diff --git a/drivers/gpio/gpio-max77650.c b/drivers/gpio/gpio-max77650.c
index a553e141059f..4540da4c1418 100644
--- a/drivers/gpio/gpio-max77650.c
+++ b/drivers/gpio/gpio-max77650.c
@@ -166,7 +166,7 @@ static int max77650_gpio_probe(struct platform_device *pdev)
 
 	chip->gc.direction_input = max77650_gpio_direction_input;
 	chip->gc.direction_output = max77650_gpio_direction_output;
-	chip->gc.set_rv = max77650_gpio_set_value;
+	chip->gc.set = max77650_gpio_set_value;
 	chip->gc.get = max77650_gpio_get_value;
 	chip->gc.get_direction = max77650_gpio_get_direction;
 	chip->gc.set_config = max77650_gpio_set_config;
diff --git a/drivers/gpio/gpio-max77759.c b/drivers/gpio/gpio-max77759.c
index 7fe8e6f697d0..5e48eb03e7b3 100644
--- a/drivers/gpio/gpio-max77759.c
+++ b/drivers/gpio/gpio-max77759.c
@@ -469,7 +469,7 @@ static int max77759_gpio_probe(struct platform_device *pdev)
 	chip->gc.direction_input = max77759_gpio_direction_input;
 	chip->gc.direction_output = max77759_gpio_direction_output;
 	chip->gc.get = max77759_gpio_get_value;
-	chip->gc.set_rv = max77759_gpio_set_value;
+	chip->gc.set = max77759_gpio_set_value;
 
 	girq = &chip->gc.irq;
 	gpio_irq_chip_set_chip(girq, &max77759_gpio_irq_chip);
diff --git a/drivers/gpio/gpio-mb86s7x.c b/drivers/gpio/gpio-mb86s7x.c
index 5ee2991ecdfd..581a71872eab 100644
--- a/drivers/gpio/gpio-mb86s7x.c
+++ b/drivers/gpio/gpio-mb86s7x.c
@@ -180,7 +180,7 @@ static int mb86s70_gpio_probe(struct platform_device *pdev)
 	gchip->gc.request = mb86s70_gpio_request;
 	gchip->gc.free = mb86s70_gpio_free;
 	gchip->gc.get = mb86s70_gpio_get;
-	gchip->gc.set_rv = mb86s70_gpio_set;
+	gchip->gc.set = mb86s70_gpio_set;
 	gchip->gc.to_irq = mb86s70_gpio_to_irq;
 	gchip->gc.label = dev_name(&pdev->dev);
 	gchip->gc.ngpio = 32;
diff --git a/drivers/gpio/gpio-mc33880.c b/drivers/gpio/gpio-mc33880.c
index e68956104161..9a40e9579e95 100644
--- a/drivers/gpio/gpio-mc33880.c
+++ b/drivers/gpio/gpio-mc33880.c
@@ -103,7 +103,7 @@ static int mc33880_probe(struct spi_device *spi)
 	mc->spi = spi;
 
 	mc->chip.label = DRIVER_NAME;
-	mc->chip.set_rv = mc33880_set;
+	mc->chip.set = mc33880_set;
 	mc->chip.base = pdata->base;
 	mc->chip.ngpio = PIN_NUMBER;
 	mc->chip.can_sleep = true;
diff --git a/drivers/gpio/gpio-ml-ioh.c b/drivers/gpio/gpio-ml-ioh.c
index 12cf36f9ca63..f6af81bf2b13 100644
--- a/drivers/gpio/gpio-ml-ioh.c
+++ b/drivers/gpio/gpio-ml-ioh.c
@@ -224,7 +224,7 @@ static void ioh_gpio_setup(struct ioh_gpio *chip, int num_port)
 	gpio->direction_input = ioh_gpio_direction_input;
 	gpio->get = ioh_gpio_get;
 	gpio->direction_output = ioh_gpio_direction_output;
-	gpio->set_rv = ioh_gpio_set;
+	gpio->set = ioh_gpio_set;
 	gpio->dbg_show = NULL;
 	gpio->base = -1;
 	gpio->ngpio = num_port;
diff --git a/drivers/gpio/gpio-mm-lantiq.c b/drivers/gpio/gpio-mm-lantiq.c
index 897a1e004681..8f1405733d98 100644
--- a/drivers/gpio/gpio-mm-lantiq.c
+++ b/drivers/gpio/gpio-mm-lantiq.c
@@ -111,7 +111,7 @@ static int ltq_mm_probe(struct platform_device *pdev)
 
 	chip->mmchip.gc.ngpio = 16;
 	chip->mmchip.gc.direction_output = ltq_mm_dir_out;
-	chip->mmchip.gc.set_rv = ltq_mm_set;
+	chip->mmchip.gc.set = ltq_mm_set;
 	chip->mmchip.save_regs = ltq_mm_save_regs;
 
 	/* store the shadow value if one was passed by the devicetree */
diff --git a/drivers/gpio/gpio-mmio.c b/drivers/gpio/gpio-mmio.c
index cf878c2ea6bf..021ad62778c2 100644
--- a/drivers/gpio/gpio-mmio.c
+++ b/drivers/gpio/gpio-mmio.c
@@ -367,7 +367,7 @@ static int bgpio_dir_out_err(struct gpio_chip *gc, unsigned int gpio,
 static int bgpio_simple_dir_out(struct gpio_chip *gc, unsigned int gpio,
 				int val)
 {
-	gc->set_rv(gc, gpio, val);
+	gc->set(gc, gpio, val);
 
 	return bgpio_dir_return(gc, gpio, true);
 }
@@ -432,14 +432,14 @@ static int bgpio_dir_out_dir_first(struct gpio_chip *gc, unsigned int gpio,
 				   int val)
 {
 	bgpio_dir_out(gc, gpio, val);
-	gc->set_rv(gc, gpio, val);
+	gc->set(gc, gpio, val);
 	return bgpio_dir_return(gc, gpio, true);
 }
 
 static int bgpio_dir_out_val_first(struct gpio_chip *gc, unsigned int gpio,
 				   int val)
 {
-	gc->set_rv(gc, gpio, val);
+	gc->set(gc, gpio, val);
 	bgpio_dir_out(gc, gpio, val);
 	return bgpio_dir_return(gc, gpio, true);
 }
@@ -528,18 +528,18 @@ static int bgpio_setup_io(struct gpio_chip *gc,
 	if (set && clr) {
 		gc->reg_set = set;
 		gc->reg_clr = clr;
-		gc->set_rv = bgpio_set_with_clear;
-		gc->set_multiple_rv = bgpio_set_multiple_with_clear;
+		gc->set = bgpio_set_with_clear;
+		gc->set_multiple = bgpio_set_multiple_with_clear;
 	} else if (set && !clr) {
 		gc->reg_set = set;
-		gc->set_rv = bgpio_set_set;
-		gc->set_multiple_rv = bgpio_set_multiple_set;
+		gc->set = bgpio_set_set;
+		gc->set_multiple = bgpio_set_multiple_set;
 	} else if (flags & BGPIOF_NO_OUTPUT) {
-		gc->set_rv = bgpio_set_none;
-		gc->set_multiple_rv = NULL;
+		gc->set = bgpio_set_none;
+		gc->set_multiple = NULL;
 	} else {
-		gc->set_rv = bgpio_set;
-		gc->set_multiple_rv = bgpio_set_multiple;
+		gc->set = bgpio_set;
+		gc->set_multiple = bgpio_set_multiple;
 	}
 
 	if (!(flags & BGPIOF_UNREADABLE_REG_SET) &&
@@ -676,7 +676,7 @@ int bgpio_init(struct gpio_chip *gc, struct device *dev,
 	}
 
 	gc->bgpio_data = gc->read_reg(gc->reg_dat);
-	if (gc->set_rv == bgpio_set_set &&
+	if (gc->set == bgpio_set_set &&
 			!(flags & BGPIOF_UNREADABLE_REG_SET))
 		gc->bgpio_data = gc->read_reg(gc->reg_set);
 
diff --git a/drivers/gpio/gpio-mockup.c b/drivers/gpio/gpio-mockup.c
index 266c0953d914..a7d69f3835c1 100644
--- a/drivers/gpio/gpio-mockup.c
+++ b/drivers/gpio/gpio-mockup.c
@@ -449,9 +449,9 @@ static int gpio_mockup_probe(struct platform_device *pdev)
 	gc->owner = THIS_MODULE;
 	gc->parent = dev;
 	gc->get = gpio_mockup_get;
-	gc->set_rv = gpio_mockup_set;
+	gc->set = gpio_mockup_set;
 	gc->get_multiple = gpio_mockup_get_multiple;
-	gc->set_multiple_rv = gpio_mockup_set_multiple;
+	gc->set_multiple = gpio_mockup_set_multiple;
 	gc->direction_output = gpio_mockup_dirout;
 	gc->direction_input = gpio_mockup_dirin;
 	gc->get_direction = gpio_mockup_get_direction;
diff --git a/drivers/gpio/gpio-moxtet.c b/drivers/gpio/gpio-moxtet.c
index 27dd9c3e7b77..4eb9f1a2779b 100644
--- a/drivers/gpio/gpio-moxtet.c
+++ b/drivers/gpio/gpio-moxtet.c
@@ -140,7 +140,7 @@ static int moxtet_gpio_probe(struct device *dev)
 	chip->gpio_chip.direction_input = moxtet_gpio_direction_input;
 	chip->gpio_chip.direction_output = moxtet_gpio_direction_output;
 	chip->gpio_chip.get = moxtet_gpio_get_value;
-	chip->gpio_chip.set_rv = moxtet_gpio_set_value;
+	chip->gpio_chip.set = moxtet_gpio_set_value;
 	chip->gpio_chip.base = -1;
 
 	chip->gpio_chip.ngpio = MOXTET_GPIO_NGPIOS;
diff --git a/drivers/gpio/gpio-mpc5200.c b/drivers/gpio/gpio-mpc5200.c
index 40d587176a75..dad0eca1ca2e 100644
--- a/drivers/gpio/gpio-mpc5200.c
+++ b/drivers/gpio/gpio-mpc5200.c
@@ -153,7 +153,7 @@ static int mpc52xx_wkup_gpiochip_probe(struct platform_device *ofdev)
 	gc->direction_input  = mpc52xx_wkup_gpio_dir_in;
 	gc->direction_output = mpc52xx_wkup_gpio_dir_out;
 	gc->get              = mpc52xx_wkup_gpio_get;
-	gc->set_rv           = mpc52xx_wkup_gpio_set;
+	gc->set              = mpc52xx_wkup_gpio_set;
 
 	ret = of_mm_gpiochip_add_data(ofdev->dev.of_node, &chip->mmchip, chip);
 	if (ret)
@@ -315,7 +315,7 @@ static int mpc52xx_simple_gpiochip_probe(struct platform_device *ofdev)
 	gc->direction_input  = mpc52xx_simple_gpio_dir_in;
 	gc->direction_output = mpc52xx_simple_gpio_dir_out;
 	gc->get              = mpc52xx_simple_gpio_get;
-	gc->set_rv           = mpc52xx_simple_gpio_set;
+	gc->set              = mpc52xx_simple_gpio_set;
 
 	ret = of_mm_gpiochip_add_data(ofdev->dev.of_node, &chip->mmchip, chip);
 	if (ret)
diff --git a/drivers/gpio/gpio-mpfs.c b/drivers/gpio/gpio-mpfs.c
index 3415cb7ebb0f..82d557a7e5d8 100644
--- a/drivers/gpio/gpio-mpfs.c
+++ b/drivers/gpio/gpio-mpfs.c
@@ -150,7 +150,7 @@ static int mpfs_gpio_probe(struct platform_device *pdev)
 	mpfs_gpio->gc.direction_output = mpfs_gpio_direction_output;
 	mpfs_gpio->gc.get_direction = mpfs_gpio_get_direction;
 	mpfs_gpio->gc.get = mpfs_gpio_get;
-	mpfs_gpio->gc.set_rv = mpfs_gpio_set;
+	mpfs_gpio->gc.set = mpfs_gpio_set;
 	mpfs_gpio->gc.base = -1;
 	mpfs_gpio->gc.ngpio = ngpios;
 	mpfs_gpio->gc.label = dev_name(dev);
diff --git a/drivers/gpio/gpio-mpsse.c b/drivers/gpio/gpio-mpsse.c
index b17de08e9e03..9f42bb30b4ec 100644
--- a/drivers/gpio/gpio-mpsse.c
+++ b/drivers/gpio/gpio-mpsse.c
@@ -448,9 +448,9 @@ static int gpio_mpsse_probe(struct usb_interface *interface,
 	priv->gpio.direction_input = gpio_mpsse_direction_input;
 	priv->gpio.direction_output = gpio_mpsse_direction_output;
 	priv->gpio.get = gpio_mpsse_gpio_get;
-	priv->gpio.set_rv = gpio_mpsse_gpio_set;
+	priv->gpio.set = gpio_mpsse_gpio_set;
 	priv->gpio.get_multiple = gpio_mpsse_get_multiple;
-	priv->gpio.set_multiple_rv = gpio_mpsse_set_multiple;
+	priv->gpio.set_multiple = gpio_mpsse_set_multiple;
 	priv->gpio.base = -1;
 	priv->gpio.ngpio = 16;
 	priv->gpio.offset = priv->intf_id * priv->gpio.ngpio;
diff --git a/drivers/gpio/gpio-msc313.c b/drivers/gpio/gpio-msc313.c
index 992339a89d19..b0cccd856840 100644
--- a/drivers/gpio/gpio-msc313.c
+++ b/drivers/gpio/gpio-msc313.c
@@ -658,7 +658,7 @@ static int msc313_gpio_probe(struct platform_device *pdev)
 	gpiochip->direction_input = msc313_gpio_direction_input;
 	gpiochip->direction_output = msc313_gpio_direction_output;
 	gpiochip->get = msc313_gpio_get;
-	gpiochip->set_rv = msc313_gpio_set;
+	gpiochip->set = msc313_gpio_set;
 	gpiochip->base = -1;
 	gpiochip->ngpio = gpio->gpio_data->num;
 	gpiochip->names = gpio->gpio_data->names;
diff --git a/drivers/gpio/gpio-mvebu.c b/drivers/gpio/gpio-mvebu.c
index 24792b8eb083..5e3f54cb8bc4 100644
--- a/drivers/gpio/gpio-mvebu.c
+++ b/drivers/gpio/gpio-mvebu.c
@@ -1168,7 +1168,7 @@ static int mvebu_gpio_probe(struct platform_device *pdev)
 	mvchip->chip.direction_input = mvebu_gpio_direction_input;
 	mvchip->chip.get = mvebu_gpio_get;
 	mvchip->chip.direction_output = mvebu_gpio_direction_output;
-	mvchip->chip.set_rv = mvebu_gpio_set;
+	mvchip->chip.set = mvebu_gpio_set;
 	if (have_irqs)
 		mvchip->chip.to_irq = mvebu_gpio_to_irq;
 	mvchip->chip.base = id * MVEBU_MAX_GPIO_PER_BANK;
diff --git a/drivers/gpio/gpio-nomadik.c b/drivers/gpio/gpio-nomadik.c
index 296d13845b30..bcf4b07dd458 100644
--- a/drivers/gpio/gpio-nomadik.c
+++ b/drivers/gpio/gpio-nomadik.c
@@ -674,7 +674,7 @@ static int nmk_gpio_probe(struct platform_device *pdev)
 	chip->direction_input = nmk_gpio_make_input;
 	chip->get = nmk_gpio_get_input;
 	chip->direction_output = nmk_gpio_make_output;
-	chip->set_rv = nmk_gpio_set_output;
+	chip->set = nmk_gpio_set_output;
 	chip->dbg_show = nmk_gpio_dbg_show;
 	chip->can_sleep = false;
 	chip->owner = THIS_MODULE;
diff --git a/drivers/gpio/gpio-npcm-sgpio.c b/drivers/gpio/gpio-npcm-sgpio.c
index 25b203a89e38..83c77a2c0623 100644
--- a/drivers/gpio/gpio-npcm-sgpio.c
+++ b/drivers/gpio/gpio-npcm-sgpio.c
@@ -211,7 +211,7 @@ static int npcm_sgpio_dir_in(struct gpio_chip *gc, unsigned int offset)
 
 static int npcm_sgpio_dir_out(struct gpio_chip *gc, unsigned int offset, int val)
 {
-	return gc->set_rv(gc, offset, val);
+	return gc->set(gc, offset, val);
 }
 
 static int npcm_sgpio_get_direction(struct gpio_chip *gc, unsigned int offset)
@@ -546,7 +546,7 @@ static int npcm_sgpio_probe(struct platform_device *pdev)
 	gpio->chip.direction_output = npcm_sgpio_dir_out;
 	gpio->chip.get_direction = npcm_sgpio_get_direction;
 	gpio->chip.get = npcm_sgpio_get;
-	gpio->chip.set_rv = npcm_sgpio_set;
+	gpio->chip.set = npcm_sgpio_set;
 	gpio->chip.label = dev_name(&pdev->dev);
 	gpio->chip.base = -1;
 
diff --git a/drivers/gpio/gpio-octeon.c b/drivers/gpio/gpio-octeon.c
index 24966161742a..777e20c608dc 100644
--- a/drivers/gpio/gpio-octeon.c
+++ b/drivers/gpio/gpio-octeon.c
@@ -108,7 +108,7 @@ static int octeon_gpio_probe(struct platform_device *pdev)
 	chip->direction_input = octeon_gpio_dir_in;
 	chip->get = octeon_gpio_get;
 	chip->direction_output = octeon_gpio_dir_out;
-	chip->set_rv = octeon_gpio_set;
+	chip->set = octeon_gpio_set;
 	err = devm_gpiochip_add_data(&pdev->dev, chip, gpio);
 	if (err)
 		return err;
diff --git a/drivers/gpio/gpio-omap.c b/drivers/gpio/gpio-omap.c
index ed5c88a5c520..a268c76bdca6 100644
--- a/drivers/gpio/gpio-omap.c
+++ b/drivers/gpio/gpio-omap.c
@@ -1046,8 +1046,8 @@ static int omap_gpio_chip_init(struct gpio_bank *bank, struct device *pm_dev)
 	bank->chip.get_multiple = omap_gpio_get_multiple;
 	bank->chip.direction_output = omap_gpio_output;
 	bank->chip.set_config = omap_gpio_set_config;
-	bank->chip.set_rv = omap_gpio_set;
-	bank->chip.set_multiple_rv = omap_gpio_set_multiple;
+	bank->chip.set = omap_gpio_set;
+	bank->chip.set_multiple = omap_gpio_set_multiple;
 	if (bank->is_mpuio) {
 		bank->chip.label = "mpuio";
 		if (bank->regs->wkup_en)
diff --git a/drivers/gpio/gpio-palmas.c b/drivers/gpio/gpio-palmas.c
index 9329d8ce8f59..e377f6dd4ccf 100644
--- a/drivers/gpio/gpio-palmas.c
+++ b/drivers/gpio/gpio-palmas.c
@@ -166,7 +166,7 @@ static int palmas_gpio_probe(struct platform_device *pdev)
 	palmas_gpio->gpio_chip.direction_input = palmas_gpio_input;
 	palmas_gpio->gpio_chip.direction_output = palmas_gpio_output;
 	palmas_gpio->gpio_chip.to_irq = palmas_gpio_to_irq;
-	palmas_gpio->gpio_chip.set_rv	= palmas_gpio_set;
+	palmas_gpio->gpio_chip.set	= palmas_gpio_set;
 	palmas_gpio->gpio_chip.get	= palmas_gpio_get;
 	palmas_gpio->gpio_chip.parent = &pdev->dev;
 
diff --git a/drivers/gpio/gpio-pca953x.c b/drivers/gpio/gpio-pca953x.c
index 69906a9af7e6..b46927f55038 100644
--- a/drivers/gpio/gpio-pca953x.c
+++ b/drivers/gpio/gpio-pca953x.c
@@ -789,10 +789,10 @@ static void pca953x_setup_gpio(struct pca953x_chip *chip, int gpios)
 	gc->direction_input  = pca953x_gpio_direction_input;
 	gc->direction_output = pca953x_gpio_direction_output;
 	gc->get = pca953x_gpio_get_value;
-	gc->set_rv = pca953x_gpio_set_value;
+	gc->set = pca953x_gpio_set_value;
 	gc->get_direction = pca953x_gpio_get_direction;
 	gc->get_multiple = pca953x_gpio_get_multiple;
-	gc->set_multiple_rv = pca953x_gpio_set_multiple;
+	gc->set_multiple = pca953x_gpio_set_multiple;
 	gc->set_config = pca953x_gpio_set_config;
 	gc->can_sleep = true;
 
diff --git a/drivers/gpio/gpio-pca9570.c b/drivers/gpio/gpio-pca9570.c
index a33246f20fd8..c5a1287079a0 100644
--- a/drivers/gpio/gpio-pca9570.c
+++ b/drivers/gpio/gpio-pca9570.c
@@ -126,7 +126,7 @@ static int pca9570_probe(struct i2c_client *client)
 	gpio->chip.owner = THIS_MODULE;
 	gpio->chip.get_direction = pca9570_get_direction;
 	gpio->chip.get = pca9570_get;
-	gpio->chip.set_rv = pca9570_set;
+	gpio->chip.set = pca9570_set;
 	gpio->chip.base = -1;
 	gpio->chip_data = device_get_match_data(&client->dev);
 	gpio->chip.ngpio = gpio->chip_data->ngpio;
diff --git a/drivers/gpio/gpio-pcf857x.c b/drivers/gpio/gpio-pcf857x.c
index a04203680333..3b9de8c3d924 100644
--- a/drivers/gpio/gpio-pcf857x.c
+++ b/drivers/gpio/gpio-pcf857x.c
@@ -295,8 +295,8 @@ static int pcf857x_probe(struct i2c_client *client)
 	gpio->chip.owner		= THIS_MODULE;
 	gpio->chip.get			= pcf857x_get;
 	gpio->chip.get_multiple		= pcf857x_get_multiple;
-	gpio->chip.set_rv		= pcf857x_set;
-	gpio->chip.set_multiple_rv	= pcf857x_set_multiple;
+	gpio->chip.set			= pcf857x_set;
+	gpio->chip.set_multiple		= pcf857x_set_multiple;
 	gpio->chip.direction_input	= pcf857x_input;
 	gpio->chip.direction_output	= pcf857x_output;
 	gpio->chip.ngpio		= (uintptr_t)i2c_get_match_data(client);
diff --git a/drivers/gpio/gpio-pch.c b/drivers/gpio/gpio-pch.c
index c6f313342ba0..9925687e05fb 100644
--- a/drivers/gpio/gpio-pch.c
+++ b/drivers/gpio/gpio-pch.c
@@ -219,7 +219,7 @@ static void pch_gpio_setup(struct pch_gpio *chip)
 	gpio->direction_input = pch_gpio_direction_input;
 	gpio->get = pch_gpio_get;
 	gpio->direction_output = pch_gpio_direction_output;
-	gpio->set_rv = pch_gpio_set;
+	gpio->set = pch_gpio_set;
 	gpio->base = -1;
 	gpio->ngpio = gpio_pins[chip->ioh];
 	gpio->can_sleep = false;
diff --git a/drivers/gpio/gpio-pl061.c b/drivers/gpio/gpio-pl061.c
index 98cfac4eac85..02e4ffcf5a6f 100644
--- a/drivers/gpio/gpio-pl061.c
+++ b/drivers/gpio/gpio-pl061.c
@@ -330,7 +330,7 @@ static int pl061_probe(struct amba_device *adev, const struct amba_id *id)
 	pl061->gc.direction_input = pl061_direction_input;
 	pl061->gc.direction_output = pl061_direction_output;
 	pl061->gc.get = pl061_get_value;
-	pl061->gc.set_rv = pl061_set_value;
+	pl061->gc.set = pl061_set_value;
 	pl061->gc.ngpio = PL061_GPIO_NR;
 	pl061->gc.label = dev_name(dev);
 	pl061->gc.parent = dev;
diff --git a/drivers/gpio/gpio-pxa.c b/drivers/gpio/gpio-pxa.c
index cbcdd416f8b9..fa22f3faa163 100644
--- a/drivers/gpio/gpio-pxa.c
+++ b/drivers/gpio/gpio-pxa.c
@@ -355,7 +355,7 @@ static int pxa_init_gpio_chip(struct pxa_gpio_chip *pchip, int ngpio, void __iom
 	pchip->chip.direction_input  = pxa_gpio_direction_input;
 	pchip->chip.direction_output = pxa_gpio_direction_output;
 	pchip->chip.get = pxa_gpio_get;
-	pchip->chip.set_rv = pxa_gpio_set;
+	pchip->chip.set = pxa_gpio_set;
 	pchip->chip.to_irq = pxa_gpio_to_irq;
 	pchip->chip.ngpio = ngpio;
 	pchip->chip.request = gpiochip_generic_request;
diff --git a/drivers/gpio/gpio-raspberrypi-exp.c b/drivers/gpio/gpio-raspberrypi-exp.c
index b4b607515a04..40413e06b69c 100644
--- a/drivers/gpio/gpio-raspberrypi-exp.c
+++ b/drivers/gpio/gpio-raspberrypi-exp.c
@@ -232,7 +232,7 @@ static int rpi_exp_gpio_probe(struct platform_device *pdev)
 	rpi_gpio->gc.direction_output = rpi_exp_gpio_dir_out;
 	rpi_gpio->gc.get_direction = rpi_exp_gpio_get_direction;
 	rpi_gpio->gc.get = rpi_exp_gpio_get;
-	rpi_gpio->gc.set_rv = rpi_exp_gpio_set;
+	rpi_gpio->gc.set = rpi_exp_gpio_set;
 	rpi_gpio->gc.can_sleep = true;
 
 	return devm_gpiochip_add_data(dev, &rpi_gpio->gc, rpi_gpio);
diff --git a/drivers/gpio/gpio-rc5t583.c b/drivers/gpio/gpio-rc5t583.c
index cf3e91d235df..5a69e4534591 100644
--- a/drivers/gpio/gpio-rc5t583.c
+++ b/drivers/gpio/gpio-rc5t583.c
@@ -118,7 +118,7 @@ static int rc5t583_gpio_probe(struct platform_device *pdev)
 	rc5t583_gpio->gpio_chip.free = rc5t583_gpio_free,
 	rc5t583_gpio->gpio_chip.direction_input = rc5t583_gpio_dir_input,
 	rc5t583_gpio->gpio_chip.direction_output = rc5t583_gpio_dir_output,
-	rc5t583_gpio->gpio_chip.set_rv = rc5t583_gpio_set,
+	rc5t583_gpio->gpio_chip.set = rc5t583_gpio_set,
 	rc5t583_gpio->gpio_chip.get = rc5t583_gpio_get,
 	rc5t583_gpio->gpio_chip.to_irq = rc5t583_gpio_to_irq,
 	rc5t583_gpio->gpio_chip.ngpio = RC5T583_MAX_GPIO,
diff --git a/drivers/gpio/gpio-rcar.c b/drivers/gpio/gpio-rcar.c
index cd31580effa9..86777e097fd8 100644
--- a/drivers/gpio/gpio-rcar.c
+++ b/drivers/gpio/gpio-rcar.c
@@ -535,8 +535,8 @@ static int gpio_rcar_probe(struct platform_device *pdev)
 	gpio_chip->get = gpio_rcar_get;
 	gpio_chip->get_multiple = gpio_rcar_get_multiple;
 	gpio_chip->direction_output = gpio_rcar_direction_output;
-	gpio_chip->set_rv = gpio_rcar_set;
-	gpio_chip->set_multiple_rv = gpio_rcar_set_multiple;
+	gpio_chip->set = gpio_rcar_set;
+	gpio_chip->set_multiple = gpio_rcar_set_multiple;
 	gpio_chip->label = name;
 	gpio_chip->parent = dev;
 	gpio_chip->owner = THIS_MODULE;
diff --git a/drivers/gpio/gpio-rdc321x.c b/drivers/gpio/gpio-rdc321x.c
index a75ed8021de5..ba62b81aa8ae 100644
--- a/drivers/gpio/gpio-rdc321x.c
+++ b/drivers/gpio/gpio-rdc321x.c
@@ -159,7 +159,7 @@ static int rdc321x_gpio_probe(struct platform_device *pdev)
 	rdc321x_gpio_dev->chip.direction_input = rdc_gpio_direction_input;
 	rdc321x_gpio_dev->chip.direction_output = rdc_gpio_config;
 	rdc321x_gpio_dev->chip.get = rdc_gpio_get_value;
-	rdc321x_gpio_dev->chip.set_rv = rdc_gpio_set_value;
+	rdc321x_gpio_dev->chip.set = rdc_gpio_set_value;
 	rdc321x_gpio_dev->chip.base = 0;
 	rdc321x_gpio_dev->chip.ngpio = pdata->max_gpios;
 
diff --git a/drivers/gpio/gpio-reg.c b/drivers/gpio/gpio-reg.c
index d8da99f97385..f2238196faf1 100644
--- a/drivers/gpio/gpio-reg.c
+++ b/drivers/gpio/gpio-reg.c
@@ -46,7 +46,7 @@ static int gpio_reg_direction_output(struct gpio_chip *gc, unsigned offset,
 	if (r->direction & BIT(offset))
 		return -ENOTSUPP;
 
-	gc->set_rv(gc, offset, value);
+	gc->set(gc, offset, value);
 	return 0;
 }
 
@@ -161,9 +161,9 @@ struct gpio_chip *gpio_reg_init(struct device *dev, void __iomem *reg,
 	r->gc.get_direction = gpio_reg_get_direction;
 	r->gc.direction_input = gpio_reg_direction_input;
 	r->gc.direction_output = gpio_reg_direction_output;
-	r->gc.set_rv = gpio_reg_set;
+	r->gc.set = gpio_reg_set;
 	r->gc.get = gpio_reg_get;
-	r->gc.set_multiple_rv = gpio_reg_set_multiple;
+	r->gc.set_multiple = gpio_reg_set_multiple;
 	if (irqs)
 		r->gc.to_irq = gpio_reg_to_irq;
 	r->gc.base = base;
diff --git a/drivers/gpio/gpio-regmap.c b/drivers/gpio/gpio-regmap.c
index 87c4225784cf..e8a32dfebdcb 100644
--- a/drivers/gpio/gpio-regmap.c
+++ b/drivers/gpio/gpio-regmap.c
@@ -260,9 +260,9 @@ struct gpio_regmap *gpio_regmap_register(const struct gpio_regmap_config *config
 	chip->free = gpiochip_generic_free;
 	chip->get = gpio_regmap_get;
 	if (gpio->reg_set_base && gpio->reg_clr_base)
-		chip->set_rv = gpio_regmap_set_with_clear;
+		chip->set = gpio_regmap_set_with_clear;
 	else if (gpio->reg_set_base)
-		chip->set_rv = gpio_regmap_set;
+		chip->set = gpio_regmap_set;
 
 	chip->get_direction = gpio_regmap_get_direction;
 	if (gpio->reg_dir_in_base || gpio->reg_dir_out_base) {
diff --git a/drivers/gpio/gpio-rockchip.c b/drivers/gpio/gpio-rockchip.c
index ecd60ff9e1dd..bcfc323a8315 100644
--- a/drivers/gpio/gpio-rockchip.c
+++ b/drivers/gpio/gpio-rockchip.c
@@ -327,7 +327,7 @@ static int rockchip_gpio_to_irq(struct gpio_chip *gc, unsigned int offset)
 static const struct gpio_chip rockchip_gpiolib_chip = {
 	.request = gpiochip_generic_request,
 	.free = gpiochip_generic_free,
-	.set_rv = rockchip_gpio_set,
+	.set = rockchip_gpio_set,
 	.get = rockchip_gpio_get,
 	.get_direction	= rockchip_gpio_get_direction,
 	.direction_input = rockchip_gpio_direction_input,
diff --git a/drivers/gpio/gpio-rtd.c b/drivers/gpio/gpio-rtd.c
index 25bbd749b019..d46b40dd5283 100644
--- a/drivers/gpio/gpio-rtd.c
+++ b/drivers/gpio/gpio-rtd.c
@@ -565,7 +565,7 @@ static int rtd_gpio_probe(struct platform_device *pdev)
 	data->gpio_chip.get_direction = rtd_gpio_get_direction;
 	data->gpio_chip.direction_input = rtd_gpio_direction_input;
 	data->gpio_chip.direction_output = rtd_gpio_direction_output;
-	data->gpio_chip.set_rv = rtd_gpio_set;
+	data->gpio_chip.set = rtd_gpio_set;
 	data->gpio_chip.get = rtd_gpio_get;
 	data->gpio_chip.set_config = rtd_gpio_set_config;
 	data->gpio_chip.parent = dev;
diff --git a/drivers/gpio/gpio-sa1100.c b/drivers/gpio/gpio-sa1100.c
index e9d054d78ccb..7f6a62f5d1ee 100644
--- a/drivers/gpio/gpio-sa1100.c
+++ b/drivers/gpio/gpio-sa1100.c
@@ -99,7 +99,7 @@ static struct sa1100_gpio_chip sa1100_gpio_chip = {
 		.get_direction		= sa1100_get_direction,
 		.direction_input	= sa1100_direction_input,
 		.direction_output	= sa1100_direction_output,
-		.set_rv			= sa1100_gpio_set,
+		.set			= sa1100_gpio_set,
 		.get			= sa1100_gpio_get,
 		.to_irq			= sa1100_to_irq,
 		.base			= 0,
diff --git a/drivers/gpio/gpio-sama5d2-piobu.c b/drivers/gpio/gpio-sama5d2-piobu.c
index c31244cf5e89..5005688f6e67 100644
--- a/drivers/gpio/gpio-sama5d2-piobu.c
+++ b/drivers/gpio/gpio-sama5d2-piobu.c
@@ -196,7 +196,7 @@ static int sama5d2_piobu_probe(struct platform_device *pdev)
 	piobu->chip.direction_input = sama5d2_piobu_direction_input;
 	piobu->chip.direction_output = sama5d2_piobu_direction_output;
 	piobu->chip.get = sama5d2_piobu_get;
-	piobu->chip.set_rv = sama5d2_piobu_set;
+	piobu->chip.set = sama5d2_piobu_set;
 	piobu->chip.base = -1;
 	piobu->chip.ngpio = PIOBU_NUM;
 	piobu->chip.can_sleep = 0;
diff --git a/drivers/gpio/gpio-sch.c b/drivers/gpio/gpio-sch.c
index 833ffdd98d74..966d16a6d515 100644
--- a/drivers/gpio/gpio-sch.c
+++ b/drivers/gpio/gpio-sch.c
@@ -167,7 +167,7 @@ static const struct gpio_chip sch_gpio_chip = {
 	.direction_input	= sch_gpio_direction_in,
 	.get			= sch_gpio_get,
 	.direction_output	= sch_gpio_direction_out,
-	.set_rv			= sch_gpio_set,
+	.set			= sch_gpio_set,
 	.get_direction		= sch_gpio_get_direction,
 };
 
diff --git a/drivers/gpio/gpio-sch311x.c b/drivers/gpio/gpio-sch311x.c
index 44fb5fc21fb8..f95566998d30 100644
--- a/drivers/gpio/gpio-sch311x.c
+++ b/drivers/gpio/gpio-sch311x.c
@@ -297,7 +297,7 @@ static int sch311x_gpio_probe(struct platform_device *pdev)
 		block->chip.get_direction = sch311x_gpio_get_direction;
 		block->chip.set_config = sch311x_gpio_set_config;
 		block->chip.get = sch311x_gpio_get;
-		block->chip.set_rv = sch311x_gpio_set;
+		block->chip.set = sch311x_gpio_set;
 		block->chip.ngpio = 8;
 		block->chip.parent = &pdev->dev;
 		block->chip.base = sch311x_gpio_blocks[i].base;
diff --git a/drivers/gpio/gpio-sim.c b/drivers/gpio/gpio-sim.c
index 9503296422fd..050092583f79 100644
--- a/drivers/gpio/gpio-sim.c
+++ b/drivers/gpio/gpio-sim.c
@@ -486,9 +486,9 @@ static int gpio_sim_add_bank(struct fwnode_handle *swnode, struct device *dev)
 	gc->parent = dev;
 	gc->fwnode = swnode;
 	gc->get = gpio_sim_get;
-	gc->set_rv = gpio_sim_set;
+	gc->set = gpio_sim_set;
 	gc->get_multiple = gpio_sim_get_multiple;
-	gc->set_multiple_rv = gpio_sim_set_multiple;
+	gc->set_multiple = gpio_sim_set_multiple;
 	gc->direction_output = gpio_sim_direction_output;
 	gc->direction_input = gpio_sim_direction_input;
 	gc->get_direction = gpio_sim_get_direction;
diff --git a/drivers/gpio/gpio-siox.c b/drivers/gpio/gpio-siox.c
index 95355dda621b..958034b9f3f3 100644
--- a/drivers/gpio/gpio-siox.c
+++ b/drivers/gpio/gpio-siox.c
@@ -237,7 +237,7 @@ static int gpio_siox_probe(struct siox_device *sdevice)
 	gc->parent = dev;
 	gc->owner = THIS_MODULE;
 	gc->get = gpio_siox_get;
-	gc->set_rv = gpio_siox_set;
+	gc->set = gpio_siox_set;
 	gc->direction_input = gpio_siox_direction_input;
 	gc->direction_output = gpio_siox_direction_output;
 	gc->get_direction = gpio_siox_get_direction;
diff --git a/drivers/gpio/gpio-spear-spics.c b/drivers/gpio/gpio-spear-spics.c
index 55f0e8afa291..96a0e1211500 100644
--- a/drivers/gpio/gpio-spear-spics.c
+++ b/drivers/gpio/gpio-spear-spics.c
@@ -140,7 +140,7 @@ static int spics_gpio_probe(struct platform_device *pdev)
 	spics->chip.request = spics_request;
 	spics->chip.free = spics_free;
 	spics->chip.direction_output = spics_direction_output;
-	spics->chip.set_rv = spics_set_value;
+	spics->chip.set = spics_set_value;
 	spics->chip.label = dev_name(&pdev->dev);
 	spics->chip.parent = &pdev->dev;
 	spics->chip.owner = THIS_MODULE;
diff --git a/drivers/gpio/gpio-sprd.c b/drivers/gpio/gpio-sprd.c
index bbd5bf51c088..413bcd0a4240 100644
--- a/drivers/gpio/gpio-sprd.c
+++ b/drivers/gpio/gpio-sprd.c
@@ -245,7 +245,7 @@ static int sprd_gpio_probe(struct platform_device *pdev)
 	sprd_gpio->chip.request = sprd_gpio_request;
 	sprd_gpio->chip.free = sprd_gpio_free;
 	sprd_gpio->chip.get = sprd_gpio_get;
-	sprd_gpio->chip.set_rv = sprd_gpio_set;
+	sprd_gpio->chip.set = sprd_gpio_set;
 	sprd_gpio->chip.direction_input = sprd_gpio_direction_input;
 	sprd_gpio->chip.direction_output = sprd_gpio_direction_output;
 
diff --git a/drivers/gpio/gpio-stmpe.c b/drivers/gpio/gpio-stmpe.c
index 0a270156e0be..5dd4c21a8e60 100644
--- a/drivers/gpio/gpio-stmpe.c
+++ b/drivers/gpio/gpio-stmpe.c
@@ -136,7 +136,7 @@ static const struct gpio_chip template_chip = {
 	.direction_input	= stmpe_gpio_direction_input,
 	.get			= stmpe_gpio_get,
 	.direction_output	= stmpe_gpio_direction_output,
-	.set_rv			= stmpe_gpio_set,
+	.set			= stmpe_gpio_set,
 	.request		= stmpe_gpio_request,
 	.can_sleep		= true,
 };
diff --git a/drivers/gpio/gpio-stp-xway.c b/drivers/gpio/gpio-stp-xway.c
index fdda8de6ca36..493c027afdd6 100644
--- a/drivers/gpio/gpio-stp-xway.c
+++ b/drivers/gpio/gpio-stp-xway.c
@@ -249,7 +249,7 @@ static int xway_stp_probe(struct platform_device *pdev)
 	chip->gc.label = "stp-xway";
 	chip->gc.direction_output = xway_stp_dir_out;
 	chip->gc.get = xway_stp_get;
-	chip->gc.set_rv = xway_stp_set;
+	chip->gc.set = xway_stp_set;
 	chip->gc.request = xway_stp_request;
 	chip->gc.base = -1;
 	chip->gc.owner = THIS_MODULE;
diff --git a/drivers/gpio/gpio-syscon.c b/drivers/gpio/gpio-syscon.c
index f86f78655c24..40064d4cf47f 100644
--- a/drivers/gpio/gpio-syscon.c
+++ b/drivers/gpio/gpio-syscon.c
@@ -115,7 +115,7 @@ static int syscon_gpio_dir_out(struct gpio_chip *chip, unsigned offset, int val)
 				   BIT(offs % SYSCON_REG_BITS));
 	}
 
-	return chip->set_rv(chip, offset, val);
+	return chip->set(chip, offset, val);
 }
 
 static const struct syscon_gpio_data clps711x_mctrl_gpio = {
@@ -251,7 +251,7 @@ static int syscon_gpio_probe(struct platform_device *pdev)
 	if (priv->data->flags & GPIO_SYSCON_FEAT_IN)
 		priv->chip.direction_input = syscon_gpio_dir_in;
 	if (priv->data->flags & GPIO_SYSCON_FEAT_OUT) {
-		priv->chip.set_rv = priv->data->set ? : syscon_gpio_set;
+		priv->chip.set = priv->data->set ? : syscon_gpio_set;
 		priv->chip.direction_output = syscon_gpio_dir_out;
 	}
 
diff --git a/drivers/gpio/gpio-tangier.c b/drivers/gpio/gpio-tangier.c
index ce17b98e0623..ba5a8ede8912 100644
--- a/drivers/gpio/gpio-tangier.c
+++ b/drivers/gpio/gpio-tangier.c
@@ -430,7 +430,7 @@ int devm_tng_gpio_probe(struct device *dev, struct tng_gpio *gpio)
 	gpio->chip.direction_input = tng_gpio_direction_input;
 	gpio->chip.direction_output = tng_gpio_direction_output;
 	gpio->chip.get = tng_gpio_get;
-	gpio->chip.set_rv = tng_gpio_set;
+	gpio->chip.set = tng_gpio_set;
 	gpio->chip.get_direction = tng_gpio_get_direction;
 	gpio->chip.set_config = tng_gpio_set_config;
 	gpio->chip.base = info->base;
diff --git a/drivers/gpio/gpio-tc3589x.c b/drivers/gpio/gpio-tc3589x.c
index 0bd32809fd68..90d048f9da08 100644
--- a/drivers/gpio/gpio-tc3589x.c
+++ b/drivers/gpio/gpio-tc3589x.c
@@ -149,7 +149,7 @@ static const struct gpio_chip template_chip = {
 	.label			= "tc3589x",
 	.owner			= THIS_MODULE,
 	.get			= tc3589x_gpio_get,
-	.set_rv			= tc3589x_gpio_set,
+	.set			= tc3589x_gpio_set,
 	.direction_output	= tc3589x_gpio_direction_output,
 	.direction_input	= tc3589x_gpio_direction_input,
 	.get_direction		= tc3589x_gpio_get_direction,
diff --git a/drivers/gpio/gpio-tegra.c b/drivers/gpio/gpio-tegra.c
index 126fd12550aa..15a5762a82c2 100644
--- a/drivers/gpio/gpio-tegra.c
+++ b/drivers/gpio/gpio-tegra.c
@@ -720,7 +720,7 @@ static int tegra_gpio_probe(struct platform_device *pdev)
 	tgi->gc.direction_input		= tegra_gpio_direction_input;
 	tgi->gc.get			= tegra_gpio_get;
 	tgi->gc.direction_output	= tegra_gpio_direction_output;
-	tgi->gc.set_rv			= tegra_gpio_set;
+	tgi->gc.set			= tegra_gpio_set;
 	tgi->gc.get_direction		= tegra_gpio_get_direction;
 	tgi->gc.base			= 0;
 	tgi->gc.ngpio			= tgi->bank_count * 32;
diff --git a/drivers/gpio/gpio-tegra186.c b/drivers/gpio/gpio-tegra186.c
index f902da15c419..5fd3ec3e2c53 100644
--- a/drivers/gpio/gpio-tegra186.c
+++ b/drivers/gpio/gpio-tegra186.c
@@ -891,7 +891,7 @@ static int tegra186_gpio_probe(struct platform_device *pdev)
 	gpio->gpio.direction_input = tegra186_gpio_direction_input;
 	gpio->gpio.direction_output = tegra186_gpio_direction_output;
 	gpio->gpio.get = tegra186_gpio_get;
-	gpio->gpio.set_rv = tegra186_gpio_set;
+	gpio->gpio.set = tegra186_gpio_set;
 	gpio->gpio.set_config = tegra186_gpio_set_config;
 	gpio->gpio.add_pin_ranges = tegra186_gpio_add_pin_ranges;
 	gpio->gpio.init_valid_mask = tegra186_init_valid_mask;
diff --git a/drivers/gpio/gpio-thunderx.c b/drivers/gpio/gpio-thunderx.c
index eb6a1f0279c0..be96853063ba 100644
--- a/drivers/gpio/gpio-thunderx.c
+++ b/drivers/gpio/gpio-thunderx.c
@@ -533,8 +533,8 @@ static int thunderx_gpio_probe(struct pci_dev *pdev,
 	chip->direction_input = thunderx_gpio_dir_in;
 	chip->get = thunderx_gpio_get;
 	chip->direction_output = thunderx_gpio_dir_out;
-	chip->set_rv = thunderx_gpio_set;
-	chip->set_multiple_rv = thunderx_gpio_set_multiple;
+	chip->set = thunderx_gpio_set;
+	chip->set_multiple = thunderx_gpio_set_multiple;
 	chip->set_config = thunderx_gpio_set_config;
 	girq = &chip->irq;
 	gpio_irq_chip_set_chip(girq, &thunderx_gpio_irq_chip);
diff --git a/drivers/gpio/gpio-timberdale.c b/drivers/gpio/gpio-timberdale.c
index fbb883089189..679e27f00ff6 100644
--- a/drivers/gpio/gpio-timberdale.c
+++ b/drivers/gpio/gpio-timberdale.c
@@ -253,7 +253,7 @@ static int timbgpio_probe(struct platform_device *pdev)
 	gc->direction_input = timbgpio_gpio_direction_input;
 	gc->get = timbgpio_gpio_get;
 	gc->direction_output = timbgpio_gpio_direction_output;
-	gc->set_rv = timbgpio_gpio_set;
+	gc->set = timbgpio_gpio_set;
 	gc->to_irq = (irq >= 0 && tgpio->irq_base > 0) ? timbgpio_to_irq : NULL;
 	gc->dbg_show = NULL;
 	gc->base = pdata->gpio_base;
diff --git a/drivers/gpio/gpio-tpic2810.c b/drivers/gpio/gpio-tpic2810.c
index d5b8568ab061..866ff2d436d5 100644
--- a/drivers/gpio/gpio-tpic2810.c
+++ b/drivers/gpio/gpio-tpic2810.c
@@ -80,8 +80,8 @@ static const struct gpio_chip template_chip = {
 	.owner			= THIS_MODULE,
 	.get_direction		= tpic2810_get_direction,
 	.direction_output	= tpic2810_direction_output,
-	.set_rv			= tpic2810_set,
-	.set_multiple_rv	= tpic2810_set_multiple,
+	.set			= tpic2810_set,
+	.set_multiple		= tpic2810_set_multiple,
 	.base			= -1,
 	.ngpio			= 8,
 	.can_sleep		= true,
diff --git a/drivers/gpio/gpio-tps65086.c b/drivers/gpio/gpio-tps65086.c
index 08fa061b73ef..84b17b83476f 100644
--- a/drivers/gpio/gpio-tps65086.c
+++ b/drivers/gpio/gpio-tps65086.c
@@ -69,7 +69,7 @@ static const struct gpio_chip template_chip = {
 	.direction_input	= tps65086_gpio_direction_input,
 	.direction_output	= tps65086_gpio_direction_output,
 	.get			= tps65086_gpio_get,
-	.set_rv			= tps65086_gpio_set,
+	.set			= tps65086_gpio_set,
 	.base			= -1,
 	.ngpio			= 4,
 	.can_sleep		= true,
diff --git a/drivers/gpio/gpio-tps65218.c b/drivers/gpio/gpio-tps65218.c
index 49cd7754ed05..3b4c41f5ef55 100644
--- a/drivers/gpio/gpio-tps65218.c
+++ b/drivers/gpio/gpio-tps65218.c
@@ -169,7 +169,7 @@ static const struct gpio_chip template_chip = {
 	.request		= tps65218_gpio_request,
 	.direction_output	= tps65218_gpio_output,
 	.get			= tps65218_gpio_get,
-	.set_rv			= tps65218_gpio_set,
+	.set			= tps65218_gpio_set,
 	.set_config		= tps65218_gpio_set_config,
 	.can_sleep		= true,
 	.ngpio			= 3,
diff --git a/drivers/gpio/gpio-tps65219.c b/drivers/gpio/gpio-tps65219.c
index c0177088c54c..158f63bcf10c 100644
--- a/drivers/gpio/gpio-tps65219.c
+++ b/drivers/gpio/gpio-tps65219.c
@@ -203,7 +203,7 @@ static const struct gpio_chip tps65214_template_chip = {
 	.direction_input	= tps65219_gpio_direction_input,
 	.direction_output	= tps65219_gpio_direction_output,
 	.get			= tps65219_gpio_get,
-	.set_rv			= tps65219_gpio_set,
+	.set			= tps65219_gpio_set,
 	.base			= -1,
 	.ngpio			= 2,
 	.can_sleep		= true,
@@ -216,7 +216,7 @@ static const struct gpio_chip tps65219_template_chip = {
 	.direction_input	= tps65219_gpio_direction_input,
 	.direction_output	= tps65219_gpio_direction_output,
 	.get			= tps65219_gpio_get,
-	.set_rv			= tps65219_gpio_set,
+	.set			= tps65219_gpio_set,
 	.base			= -1,
 	.ngpio			= 3,
 	.can_sleep		= true,
diff --git a/drivers/gpio/gpio-tps6586x.c b/drivers/gpio/gpio-tps6586x.c
index f1ced092f38a..aaacbb54bf5d 100644
--- a/drivers/gpio/gpio-tps6586x.c
+++ b/drivers/gpio/gpio-tps6586x.c
@@ -98,7 +98,7 @@ static int tps6586x_gpio_probe(struct platform_device *pdev)
 
 	/* FIXME: add handling of GPIOs as dedicated inputs */
 	tps6586x_gpio->gpio_chip.direction_output = tps6586x_gpio_output;
-	tps6586x_gpio->gpio_chip.set_rv	= tps6586x_gpio_set;
+	tps6586x_gpio->gpio_chip.set	= tps6586x_gpio_set;
 	tps6586x_gpio->gpio_chip.get	= tps6586x_gpio_get;
 	tps6586x_gpio->gpio_chip.to_irq	= tps6586x_gpio_to_irq;
 
diff --git a/drivers/gpio/gpio-tps65910.c b/drivers/gpio/gpio-tps65910.c
index 3204f55394cf..25e9f41efe78 100644
--- a/drivers/gpio/gpio-tps65910.c
+++ b/drivers/gpio/gpio-tps65910.c
@@ -139,7 +139,7 @@ static int tps65910_gpio_probe(struct platform_device *pdev)
 	tps65910_gpio->gpio_chip.can_sleep = true;
 	tps65910_gpio->gpio_chip.direction_input = tps65910_gpio_input;
 	tps65910_gpio->gpio_chip.direction_output = tps65910_gpio_output;
-	tps65910_gpio->gpio_chip.set_rv	= tps65910_gpio_set;
+	tps65910_gpio->gpio_chip.set	= tps65910_gpio_set;
 	tps65910_gpio->gpio_chip.get	= tps65910_gpio_get;
 	tps65910_gpio->gpio_chip.parent = &pdev->dev;
 
diff --git a/drivers/gpio/gpio-tps65912.c b/drivers/gpio/gpio-tps65912.c
index d586ccfbfc56..7a2c5685c2fd 100644
--- a/drivers/gpio/gpio-tps65912.c
+++ b/drivers/gpio/gpio-tps65912.c
@@ -92,7 +92,7 @@ static const struct gpio_chip template_chip = {
 	.direction_input	= tps65912_gpio_direction_input,
 	.direction_output	= tps65912_gpio_direction_output,
 	.get			= tps65912_gpio_get,
-	.set_rv			= tps65912_gpio_set,
+	.set			= tps65912_gpio_set,
 	.base			= -1,
 	.ngpio			= 5,
 	.can_sleep		= true,
diff --git a/drivers/gpio/gpio-tps68470.c b/drivers/gpio/gpio-tps68470.c
index 3b8805c854f7..d4fbdf90e190 100644
--- a/drivers/gpio/gpio-tps68470.c
+++ b/drivers/gpio/gpio-tps68470.c
@@ -142,7 +142,7 @@ static int tps68470_gpio_probe(struct platform_device *pdev)
 	tps68470_gpio->gc.direction_output = tps68470_gpio_output;
 	tps68470_gpio->gc.get = tps68470_gpio_get;
 	tps68470_gpio->gc.get_direction = tps68470_gpio_get_direction;
-	tps68470_gpio->gc.set_rv = tps68470_gpio_set;
+	tps68470_gpio->gc.set = tps68470_gpio_set;
 	tps68470_gpio->gc.can_sleep = true;
 	tps68470_gpio->gc.names = tps68470_names;
 	tps68470_gpio->gc.ngpio = TPS68470_N_GPIO;
diff --git a/drivers/gpio/gpio-tqmx86.c b/drivers/gpio/gpio-tqmx86.c
index 056799ecce6a..27dd09273292 100644
--- a/drivers/gpio/gpio-tqmx86.c
+++ b/drivers/gpio/gpio-tqmx86.c
@@ -370,7 +370,7 @@ static int tqmx86_gpio_probe(struct platform_device *pdev)
 	chip->direction_output = tqmx86_gpio_direction_output;
 	chip->get_direction = tqmx86_gpio_get_direction;
 	chip->get = tqmx86_gpio_get;
-	chip->set_rv = tqmx86_gpio_set;
+	chip->set = tqmx86_gpio_set;
 	chip->ngpio = TQMX86_NGPIO;
 	chip->parent = pdev->dev.parent;
 
diff --git a/drivers/gpio/gpio-ts4900.c b/drivers/gpio/gpio-ts4900.c
index 35dd2d09b4d4..d9ee8fc77ccd 100644
--- a/drivers/gpio/gpio-ts4900.c
+++ b/drivers/gpio/gpio-ts4900.c
@@ -119,7 +119,7 @@ static const struct gpio_chip template_chip = {
 	.direction_input	= ts4900_gpio_direction_input,
 	.direction_output	= ts4900_gpio_direction_output,
 	.get			= ts4900_gpio_get,
-	.set_rv			= ts4900_gpio_set,
+	.set			= ts4900_gpio_set,
 	.base			= -1,
 	.can_sleep		= true,
 };
diff --git a/drivers/gpio/gpio-ts5500.c b/drivers/gpio/gpio-ts5500.c
index bb432ed73698..3c7f2efe10fd 100644
--- a/drivers/gpio/gpio-ts5500.c
+++ b/drivers/gpio/gpio-ts5500.c
@@ -340,7 +340,7 @@ static int ts5500_dio_probe(struct platform_device *pdev)
 	priv->gpio_chip.direction_input = ts5500_gpio_input;
 	priv->gpio_chip.direction_output = ts5500_gpio_output;
 	priv->gpio_chip.get = ts5500_gpio_get;
-	priv->gpio_chip.set_rv = ts5500_gpio_set;
+	priv->gpio_chip.set = ts5500_gpio_set;
 	priv->gpio_chip.to_irq = ts5500_gpio_to_irq;
 	priv->gpio_chip.base = -1;
 
diff --git a/drivers/gpio/gpio-twl4030.c b/drivers/gpio/gpio-twl4030.c
index e39e39e3ef85..a33dc7c7e7a0 100644
--- a/drivers/gpio/gpio-twl4030.c
+++ b/drivers/gpio/gpio-twl4030.c
@@ -419,7 +419,7 @@ static const struct gpio_chip template_chip = {
 	.direction_output	= twl_direction_out,
 	.get_direction		= twl_get_direction,
 	.get			= twl_get,
-	.set_rv			= twl_set,
+	.set			= twl_set,
 	.to_irq			= twl_to_irq,
 	.can_sleep		= true,
 };
diff --git a/drivers/gpio/gpio-twl6040.c b/drivers/gpio/gpio-twl6040.c
index b2196b62b528..4ec9bcd40439 100644
--- a/drivers/gpio/gpio-twl6040.c
+++ b/drivers/gpio/gpio-twl6040.c
@@ -69,7 +69,7 @@ static struct gpio_chip twl6040gpo_chip = {
 	.get			= twl6040gpo_get,
 	.direction_output	= twl6040gpo_direction_out,
 	.get_direction		= twl6040gpo_get_direction,
-	.set_rv			= twl6040gpo_set,
+	.set			= twl6040gpo_set,
 	.can_sleep		= true,
 };
 
diff --git a/drivers/gpio/gpio-uniphier.c b/drivers/gpio/gpio-uniphier.c
index 8939556f42b6..197bb1d22b3c 100644
--- a/drivers/gpio/gpio-uniphier.c
+++ b/drivers/gpio/gpio-uniphier.c
@@ -386,8 +386,8 @@ static int uniphier_gpio_probe(struct platform_device *pdev)
 	chip->direction_input = uniphier_gpio_direction_input;
 	chip->direction_output = uniphier_gpio_direction_output;
 	chip->get = uniphier_gpio_get;
-	chip->set_rv = uniphier_gpio_set;
-	chip->set_multiple_rv = uniphier_gpio_set_multiple;
+	chip->set = uniphier_gpio_set;
+	chip->set_multiple = uniphier_gpio_set_multiple;
 	chip->to_irq = uniphier_gpio_to_irq;
 	chip->base = -1;
 	chip->ngpio = ngpios;
diff --git a/drivers/gpio/gpio-viperboard.c b/drivers/gpio/gpio-viperboard.c
index e8e906b54d51..15e495c109d2 100644
--- a/drivers/gpio/gpio-viperboard.c
+++ b/drivers/gpio/gpio-viperboard.c
@@ -408,7 +408,7 @@ static int vprbrd_gpio_probe(struct platform_device *pdev)
 	vb_gpio->gpioa.base = -1;
 	vb_gpio->gpioa.ngpio = 16;
 	vb_gpio->gpioa.can_sleep = true;
-	vb_gpio->gpioa.set_rv = vprbrd_gpioa_set;
+	vb_gpio->gpioa.set = vprbrd_gpioa_set;
 	vb_gpio->gpioa.get = vprbrd_gpioa_get;
 	vb_gpio->gpioa.direction_input = vprbrd_gpioa_direction_input;
 	vb_gpio->gpioa.direction_output = vprbrd_gpioa_direction_output;
@@ -424,7 +424,7 @@ static int vprbrd_gpio_probe(struct platform_device *pdev)
 	vb_gpio->gpiob.base = -1;
 	vb_gpio->gpiob.ngpio = 16;
 	vb_gpio->gpiob.can_sleep = true;
-	vb_gpio->gpiob.set_rv = vprbrd_gpiob_set;
+	vb_gpio->gpiob.set = vprbrd_gpiob_set;
 	vb_gpio->gpiob.get = vprbrd_gpiob_get;
 	vb_gpio->gpiob.direction_input = vprbrd_gpiob_direction_input;
 	vb_gpio->gpiob.direction_output = vprbrd_gpiob_direction_output;
diff --git a/drivers/gpio/gpio-virtio.c b/drivers/gpio/gpio-virtio.c
index 07552611da98..17e040991e46 100644
--- a/drivers/gpio/gpio-virtio.c
+++ b/drivers/gpio/gpio-virtio.c
@@ -567,7 +567,7 @@ static int virtio_gpio_probe(struct virtio_device *vdev)
 	vgpio->gc.direction_input	= virtio_gpio_direction_input;
 	vgpio->gc.direction_output	= virtio_gpio_direction_output;
 	vgpio->gc.get			= virtio_gpio_get;
-	vgpio->gc.set_rv		= virtio_gpio_set;
+	vgpio->gc.set			= virtio_gpio_set;
 	vgpio->gc.ngpio			= ngpio;
 	vgpio->gc.base			= -1; /* Allocate base dynamically */
 	vgpio->gc.label			= dev_name(dev);
diff --git a/drivers/gpio/gpio-vx855.c b/drivers/gpio/gpio-vx855.c
index a3bceac7854c..84b3a973a503 100644
--- a/drivers/gpio/gpio-vx855.c
+++ b/drivers/gpio/gpio-vx855.c
@@ -216,7 +216,7 @@ static void vx855gpio_gpio_setup(struct vx855_gpio *vg)
 	c->direction_input = vx855gpio_direction_input;
 	c->direction_output = vx855gpio_direction_output;
 	c->get = vx855gpio_get;
-	c->set_rv = vx855gpio_set;
+	c->set = vx855gpio_set;
 	c->set_config = vx855gpio_set_config;
 	c->dbg_show = NULL;
 	c->base = 0;
diff --git a/drivers/gpio/gpio-wcd934x.c b/drivers/gpio/gpio-wcd934x.c
index c89da9a22016..4af504c23e6f 100644
--- a/drivers/gpio/gpio-wcd934x.c
+++ b/drivers/gpio/gpio-wcd934x.c
@@ -98,7 +98,7 @@ static int wcd_gpio_probe(struct platform_device *pdev)
 	chip->direction_output = wcd_gpio_direction_output;
 	chip->get_direction = wcd_gpio_get_direction;
 	chip->get = wcd_gpio_get;
-	chip->set_rv = wcd_gpio_set;
+	chip->set = wcd_gpio_set;
 	chip->parent = dev;
 	chip->base = -1;
 	chip->ngpio = WCD934X_NPINS;
diff --git a/drivers/gpio/gpio-wcove.c b/drivers/gpio/gpio-wcove.c
index f7df3d5fc71c..4a5e20e936a9 100644
--- a/drivers/gpio/gpio-wcove.c
+++ b/drivers/gpio/gpio-wcove.c
@@ -439,7 +439,7 @@ static int wcove_gpio_probe(struct platform_device *pdev)
 	wg->chip.direction_output = wcove_gpio_dir_out;
 	wg->chip.get_direction = wcove_gpio_get_direction;
 	wg->chip.get = wcove_gpio_get;
-	wg->chip.set_rv = wcove_gpio_set;
+	wg->chip.set = wcove_gpio_set;
 	wg->chip.set_config = wcove_gpio_set_config;
 	wg->chip.base = -1;
 	wg->chip.ngpio = WCOVE_VGPIO_NUM;
diff --git a/drivers/gpio/gpio-winbond.c b/drivers/gpio/gpio-winbond.c
index 421655b5d4c2..dcfda738fd69 100644
--- a/drivers/gpio/gpio-winbond.c
+++ b/drivers/gpio/gpio-winbond.c
@@ -494,7 +494,7 @@ static struct gpio_chip winbond_gpio_chip = {
 	.can_sleep		= true,
 	.get			= winbond_gpio_get,
 	.direction_input	= winbond_gpio_direction_in,
-	.set_rv			= winbond_gpio_set,
+	.set			= winbond_gpio_set,
 	.direction_output	= winbond_gpio_direction_out,
 };
 
diff --git a/drivers/gpio/gpio-wm831x.c b/drivers/gpio/gpio-wm831x.c
index ab58aa7c0b99..f03c0e808fab 100644
--- a/drivers/gpio/gpio-wm831x.c
+++ b/drivers/gpio/gpio-wm831x.c
@@ -253,7 +253,7 @@ static const struct gpio_chip template_chip = {
 	.direction_input	= wm831x_gpio_direction_in,
 	.get			= wm831x_gpio_get,
 	.direction_output	= wm831x_gpio_direction_out,
-	.set_rv			= wm831x_gpio_set,
+	.set			= wm831x_gpio_set,
 	.to_irq			= wm831x_gpio_to_irq,
 	.set_config		= wm831x_set_config,
 	.dbg_show		= wm831x_gpio_dbg_show,
diff --git a/drivers/gpio/gpio-wm8350.c b/drivers/gpio/gpio-wm8350.c
index 9a7677f841fc..46923b23a72e 100644
--- a/drivers/gpio/gpio-wm8350.c
+++ b/drivers/gpio/gpio-wm8350.c
@@ -93,7 +93,7 @@ static const struct gpio_chip template_chip = {
 	.direction_input	= wm8350_gpio_direction_in,
 	.get			= wm8350_gpio_get,
 	.direction_output	= wm8350_gpio_direction_out,
-	.set_rv			= wm8350_gpio_set,
+	.set			= wm8350_gpio_set,
 	.to_irq			= wm8350_gpio_to_irq,
 	.can_sleep		= true,
 };
diff --git a/drivers/gpio/gpio-wm8994.c b/drivers/gpio/gpio-wm8994.c
index ccc005628dd2..df47a27f508d 100644
--- a/drivers/gpio/gpio-wm8994.c
+++ b/drivers/gpio/gpio-wm8994.c
@@ -256,7 +256,7 @@ static const struct gpio_chip template_chip = {
 	.direction_input	= wm8994_gpio_direction_in,
 	.get			= wm8994_gpio_get,
 	.direction_output	= wm8994_gpio_direction_out,
-	.set_rv			= wm8994_gpio_set,
+	.set			= wm8994_gpio_set,
 	.set_config		= wm8994_gpio_set_config,
 	.to_irq			= wm8994_gpio_to_irq,
 	.dbg_show		= wm8994_gpio_dbg_show,
diff --git a/drivers/gpio/gpio-xgene.c b/drivers/gpio/gpio-xgene.c
index 28f794e5eb26..4f627de3f56c 100644
--- a/drivers/gpio/gpio-xgene.c
+++ b/drivers/gpio/gpio-xgene.c
@@ -178,7 +178,7 @@ static int xgene_gpio_probe(struct platform_device *pdev)
 	gpio->chip.direction_input = xgene_gpio_dir_in;
 	gpio->chip.direction_output = xgene_gpio_dir_out;
 	gpio->chip.get = xgene_gpio_get;
-	gpio->chip.set_rv = xgene_gpio_set;
+	gpio->chip.set = xgene_gpio_set;
 	gpio->chip.label = dev_name(&pdev->dev);
 	gpio->chip.base = -1;
 
diff --git a/drivers/gpio/gpio-xilinx.c b/drivers/gpio/gpio-xilinx.c
index 36d91cacc2d9..83675ac81077 100644
--- a/drivers/gpio/gpio-xilinx.c
+++ b/drivers/gpio/gpio-xilinx.c
@@ -604,10 +604,10 @@ static int xgpio_probe(struct platform_device *pdev)
 	chip->gc.direction_input = xgpio_dir_in;
 	chip->gc.direction_output = xgpio_dir_out;
 	chip->gc.get = xgpio_get;
-	chip->gc.set_rv = xgpio_set;
+	chip->gc.set = xgpio_set;
 	chip->gc.request = xgpio_request;
 	chip->gc.free = xgpio_free;
-	chip->gc.set_multiple_rv = xgpio_set_multiple;
+	chip->gc.set_multiple = xgpio_set_multiple;
 
 	chip->gc.label = dev_name(dev);
 
diff --git a/drivers/gpio/gpio-xlp.c b/drivers/gpio/gpio-xlp.c
index bcd2dfec462d..aede6324387f 100644
--- a/drivers/gpio/gpio-xlp.c
+++ b/drivers/gpio/gpio-xlp.c
@@ -274,7 +274,7 @@ static int xlp_gpio_probe(struct platform_device *pdev)
 	gc->ngpio = 70;
 	gc->direction_output = xlp_gpio_dir_output;
 	gc->direction_input = xlp_gpio_dir_input;
-	gc->set_rv = xlp_gpio_set;
+	gc->set = xlp_gpio_set;
 	gc->get = xlp_gpio_get;
 
 	spin_lock_init(&priv->lock);
diff --git a/drivers/gpio/gpio-xra1403.c b/drivers/gpio/gpio-xra1403.c
index 70402c6b5407..faadcb4b0b2d 100644
--- a/drivers/gpio/gpio-xra1403.c
+++ b/drivers/gpio/gpio-xra1403.c
@@ -164,7 +164,7 @@ static int xra1403_probe(struct spi_device *spi)
 	xra->chip.direction_output = xra1403_direction_output;
 	xra->chip.get_direction = xra1403_get_direction;
 	xra->chip.get = xra1403_get;
-	xra->chip.set_rv = xra1403_set;
+	xra->chip.set = xra1403_set;
 
 	xra->chip.dbg_show = xra1403_dbg_show;
 
diff --git a/drivers/gpio/gpio-xtensa.c b/drivers/gpio/gpio-xtensa.c
index e7ff3c60324d..4418947a10e5 100644
--- a/drivers/gpio/gpio-xtensa.c
+++ b/drivers/gpio/gpio-xtensa.c
@@ -132,7 +132,7 @@ static struct gpio_chip expstate_chip = {
 	.ngpio		= 32,
 	.get_direction	= xtensa_expstate_get_direction,
 	.get		= xtensa_expstate_get_value,
-	.set_rv		= xtensa_expstate_set_value,
+	.set		= xtensa_expstate_set_value,
 };
 
 static int xtensa_gpio_probe(struct platform_device *pdev)
diff --git a/drivers/gpio/gpio-zevio.c b/drivers/gpio/gpio-zevio.c
index 0799f7976710..29375bea2289 100644
--- a/drivers/gpio/gpio-zevio.c
+++ b/drivers/gpio/gpio-zevio.c
@@ -161,7 +161,7 @@ static int zevio_gpio_to_irq(struct gpio_chip *chip, unsigned pin)
 static const struct gpio_chip zevio_gpio_chip = {
 	.direction_input	= zevio_gpio_direction_input,
 	.direction_output	= zevio_gpio_direction_output,
-	.set_rv			= zevio_gpio_set,
+	.set			= zevio_gpio_set,
 	.get			= zevio_gpio_get,
 	.to_irq			= zevio_gpio_to_irq,
 	.base			= 0,
diff --git a/drivers/gpio/gpio-zynq.c b/drivers/gpio/gpio-zynq.c
index b22b4e25c68d..0ffd76e8951f 100644
--- a/drivers/gpio/gpio-zynq.c
+++ b/drivers/gpio/gpio-zynq.c
@@ -932,7 +932,7 @@ static int zynq_gpio_probe(struct platform_device *pdev)
 	chip->owner = THIS_MODULE;
 	chip->parent = &pdev->dev;
 	chip->get = zynq_gpio_get_value;
-	chip->set_rv = zynq_gpio_set_value;
+	chip->set = zynq_gpio_set_value;
 	chip->request = zynq_gpio_request;
 	chip->free = zynq_gpio_free;
 	chip->direction_input = zynq_gpio_dir_in;
diff --git a/drivers/gpio/gpio-zynqmp-modepin.c b/drivers/gpio/gpio-zynqmp-modepin.c
index 6dc5d7acb89c..5e651482e985 100644
--- a/drivers/gpio/gpio-zynqmp-modepin.c
+++ b/drivers/gpio/gpio-zynqmp-modepin.c
@@ -130,7 +130,7 @@ static int modepin_gpio_probe(struct platform_device *pdev)
 	chip->owner = THIS_MODULE;
 	chip->parent = &pdev->dev;
 	chip->get = modepin_gpio_get_value;
-	chip->set_rv = modepin_gpio_set_value;
+	chip->set = modepin_gpio_set_value;
 	chip->direction_input = modepin_gpio_dir_in;
 	chip->direction_output = modepin_gpio_dir_out;
 	chip->label = dev_name(&pdev->dev);
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 9ac4c23d656a..0d2b470a252e 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -2886,10 +2886,10 @@ static int gpiochip_set(struct gpio_chip *gc, unsigned int offset, int value)
 
 	lockdep_assert_held(&gc->gpiodev->srcu);
 
-	if (WARN_ON(unlikely(!gc->set_rv)))
+	if (WARN_ON(unlikely(!gc->set)))
 		return -EOPNOTSUPP;
 
-	ret = gc->set_rv(gc, offset, value);
+	ret = gc->set(gc, offset, value);
 	if (ret > 0)
 		ret = -EBADE;
 
@@ -2909,7 +2909,7 @@ static int gpiod_direction_output_raw_commit(struct gpio_desc *desc, int value)
 	 * output-only, but if there is then not even a .set() operation it
 	 * is pretty tricky to drive the output line.
 	 */
-	if (!guard.gc->set_rv && !guard.gc->direction_output) {
+	if (!guard.gc->set && !guard.gc->direction_output) {
 		gpiod_warn(desc,
 			   "%s: missing set() and direction_output() operations\n",
 			   __func__);
@@ -3655,8 +3655,8 @@ static int gpiochip_set_multiple(struct gpio_chip *gc,
 
 	lockdep_assert_held(&gc->gpiodev->srcu);
 
-	if (gc->set_multiple_rv) {
-		ret = gc->set_multiple_rv(gc, mask, bits);
+	if (gc->set_multiple) {
+		ret = gc->set_multiple(gc, mask, bits);
 		if (ret > 0)
 			ret = -EBADE;
 
diff --git a/drivers/gpu/drm/bridge/ti-sn65dsi86.c b/drivers/gpu/drm/bridge/ti-sn65dsi86.c
index e3a8c0c0c945..464390372b34 100644
--- a/drivers/gpu/drm/bridge/ti-sn65dsi86.c
+++ b/drivers/gpu/drm/bridge/ti-sn65dsi86.c
@@ -1836,7 +1836,7 @@ static int ti_sn_gpio_probe(struct auxiliary_device *adev,
 	pdata->gchip.direction_input = ti_sn_bridge_gpio_direction_input;
 	pdata->gchip.direction_output = ti_sn_bridge_gpio_direction_output;
 	pdata->gchip.get = ti_sn_bridge_gpio_get;
-	pdata->gchip.set_rv = ti_sn_bridge_gpio_set;
+	pdata->gchip.set = ti_sn_bridge_gpio_set;
 	pdata->gchip.can_sleep = true;
 	pdata->gchip.names = ti_sn_bridge_gpio_names;
 	pdata->gchip.ngpio = SN_NUM_GPIOS;
diff --git a/drivers/hid/hid-cp2112.c b/drivers/hid/hid-cp2112.c
index 234fa82eab07..482f62a78c41 100644
--- a/drivers/hid/hid-cp2112.c
+++ b/drivers/hid/hid-cp2112.c
@@ -1288,7 +1288,7 @@ static int cp2112_probe(struct hid_device *hdev, const struct hid_device_id *id)
 	dev->gc.label			= "cp2112_gpio";
 	dev->gc.direction_input		= cp2112_gpio_direction_input;
 	dev->gc.direction_output	= cp2112_gpio_direction_output;
-	dev->gc.set_rv			= cp2112_gpio_set;
+	dev->gc.set			= cp2112_gpio_set;
 	dev->gc.get			= cp2112_gpio_get;
 	dev->gc.base			= -1;
 	dev->gc.ngpio			= CP2112_GPIO_MAX_GPIO;
diff --git a/drivers/hid/hid-mcp2200.c b/drivers/hid/hid-mcp2200.c
index e6ea0a2140eb..dafdd5b4a079 100644
--- a/drivers/hid/hid-mcp2200.c
+++ b/drivers/hid/hid-mcp2200.c
@@ -279,8 +279,8 @@ static const struct gpio_chip template_chip = {
 	.get_direction		= mcp_get_direction,
 	.direction_input	= mcp_direction_input,
 	.direction_output	= mcp_direction_output,
-	.set_rv			= mcp_set,
-	.set_multiple_rv	= mcp_set_multiple,
+	.set			= mcp_set,
+	.set_multiple		= mcp_set_multiple,
 	.get			= mcp_get,
 	.get_multiple		= mcp_get_multiple,
 	.base			= -1,
diff --git a/drivers/hid/hid-mcp2221.c b/drivers/hid/hid-mcp2221.c
index fcfe9370a887..475ac352df30 100644
--- a/drivers/hid/hid-mcp2221.c
+++ b/drivers/hid/hid-mcp2221.c
@@ -1298,7 +1298,7 @@ static int mcp2221_probe(struct hid_device *hdev,
 	mcp->gc->direction_input = mcp_gpio_direction_input;
 	mcp->gc->direction_output = mcp_gpio_direction_output;
 	mcp->gc->get_direction = mcp_gpio_get_direction;
-	mcp->gc->set_rv = mcp_gpio_set;
+	mcp->gc->set = mcp_gpio_set;
 	mcp->gc->get = mcp_gpio_get;
 	mcp->gc->ngpio = MCP_NGPIO;
 	mcp->gc->base = -1;
diff --git a/drivers/hwmon/ltc2992.c b/drivers/hwmon/ltc2992.c
index a07e2eb93c71..1fcd320d6161 100644
--- a/drivers/hwmon/ltc2992.c
+++ b/drivers/hwmon/ltc2992.c
@@ -339,8 +339,8 @@ static int ltc2992_config_gpio(struct ltc2992_state *st)
 	st->gc.ngpio = ARRAY_SIZE(st->gpio_names);
 	st->gc.get = ltc2992_gpio_get;
 	st->gc.get_multiple = ltc2992_gpio_get_multiple;
-	st->gc.set_rv = ltc2992_gpio_set;
-	st->gc.set_multiple_rv = ltc2992_gpio_set_multiple;
+	st->gc.set = ltc2992_gpio_set;
+	st->gc.set_multiple = ltc2992_gpio_set_multiple;
 
 	ret = devm_gpiochip_add_data(&st->client->dev, &st->gc, st);
 	if (ret)
diff --git a/drivers/hwmon/pmbus/ucd9000.c b/drivers/hwmon/pmbus/ucd9000.c
index 52d4000902d5..55e7af3a5f98 100644
--- a/drivers/hwmon/pmbus/ucd9000.c
+++ b/drivers/hwmon/pmbus/ucd9000.c
@@ -364,7 +364,7 @@ static void ucd9000_probe_gpio(struct i2c_client *client,
 	data->gpio.direction_input = ucd9000_gpio_direction_input;
 	data->gpio.direction_output = ucd9000_gpio_direction_output;
 	data->gpio.get = ucd9000_gpio_get;
-	data->gpio.set_rv = ucd9000_gpio_set;
+	data->gpio.set = ucd9000_gpio_set;
 	data->gpio.can_sleep = true;
 	data->gpio.base = -1;
 	data->gpio.parent = &client->dev;
diff --git a/drivers/i2c/muxes/i2c-mux-ltc4306.c b/drivers/i2c/muxes/i2c-mux-ltc4306.c
index c688af270a11..50fbc0d06e62 100644
--- a/drivers/i2c/muxes/i2c-mux-ltc4306.c
+++ b/drivers/i2c/muxes/i2c-mux-ltc4306.c
@@ -164,7 +164,7 @@ static int ltc4306_gpio_init(struct ltc4306 *data)
 	data->gpiochip.direction_input = ltc4306_gpio_direction_input;
 	data->gpiochip.direction_output = ltc4306_gpio_direction_output;
 	data->gpiochip.get = ltc4306_gpio_get;
-	data->gpiochip.set_rv = ltc4306_gpio_set;
+	data->gpiochip.set = ltc4306_gpio_set;
 	data->gpiochip.set_config = ltc4306_gpio_set_config;
 	data->gpiochip.owner = THIS_MODULE;
 
diff --git a/drivers/iio/adc/ad4130.c b/drivers/iio/adc/ad4130.c
index 6cf790ff3eb5..dcdb5778f7d6 100644
--- a/drivers/iio/adc/ad4130.c
+++ b/drivers/iio/adc/ad4130.c
@@ -2064,7 +2064,7 @@ static int ad4130_probe(struct spi_device *spi)
 	st->gc.can_sleep = true;
 	st->gc.init_valid_mask = ad4130_gpio_init_valid_mask;
 	st->gc.get_direction = ad4130_gpio_get_direction;
-	st->gc.set_rv = ad4130_gpio_set;
+	st->gc.set = ad4130_gpio_set;
 
 	ret = devm_gpiochip_add_data(dev, &st->gc, st);
 	if (ret)
diff --git a/drivers/iio/adc/ad4170-4.c b/drivers/iio/adc/ad4170-4.c
index 6cd84d6fb08b..efaed92191f1 100644
--- a/drivers/iio/adc/ad4170-4.c
+++ b/drivers/iio/adc/ad4170-4.c
@@ -1807,7 +1807,7 @@ static int ad4170_gpio_init(struct iio_dev *indio_dev)
 	st->gpiochip.direction_input = ad4170_gpio_direction_input;
 	st->gpiochip.direction_output = ad4170_gpio_direction_output;
 	st->gpiochip.get = ad4170_gpio_get;
-	st->gpiochip.set_rv = ad4170_gpio_set;
+	st->gpiochip.set = ad4170_gpio_set;
 	st->gpiochip.owner = THIS_MODULE;
 
 	return devm_gpiochip_add_data(&st->spi->dev, &st->gpiochip, indio_dev);
diff --git a/drivers/iio/adc/ad7768-1.c b/drivers/iio/adc/ad7768-1.c
index a2e061f0cb08..ca8fa91796ca 100644
--- a/drivers/iio/adc/ad7768-1.c
+++ b/drivers/iio/adc/ad7768-1.c
@@ -673,7 +673,7 @@ static int ad7768_gpio_init(struct iio_dev *indio_dev)
 		.direction_input = ad7768_gpio_direction_input,
 		.direction_output = ad7768_gpio_direction_output,
 		.get = ad7768_gpio_get,
-		.set_rv = ad7768_gpio_set,
+		.set = ad7768_gpio_set,
 		.owner = THIS_MODULE,
 	};
 
diff --git a/drivers/iio/adc/rohm-bd79124.c b/drivers/iio/adc/rohm-bd79124.c
index bb7c93ae4055..06c55c8da93f 100644
--- a/drivers/iio/adc/rohm-bd79124.c
+++ b/drivers/iio/adc/rohm-bd79124.c
@@ -246,8 +246,8 @@ static int bd79124_init_valid_mask(struct gpio_chip *gc,
 static const struct gpio_chip bd79124gpo_chip = {
 	.label			= "bd79124-gpo",
 	.get_direction		= bd79124gpo_direction_get,
-	.set_rv			= bd79124gpo_set,
-	.set_multiple_rv	= bd79124gpo_set_multiple,
+	.set			= bd79124gpo_set,
+	.set_multiple		= bd79124gpo_set_multiple,
 	.init_valid_mask	= bd79124_init_valid_mask,
 	.can_sleep		= true,
 	.ngpio			= 8,
diff --git a/drivers/iio/adc/ti-ads7950.c b/drivers/iio/adc/ti-ads7950.c
index 0356ccf23fea..bbe1ce577789 100644
--- a/drivers/iio/adc/ti-ads7950.c
+++ b/drivers/iio/adc/ti-ads7950.c
@@ -648,7 +648,7 @@ static int ti_ads7950_probe(struct spi_device *spi)
 	st->chip.direction_input = ti_ads7950_direction_input;
 	st->chip.direction_output = ti_ads7950_direction_output;
 	st->chip.get = ti_ads7950_get;
-	st->chip.set_rv = ti_ads7950_set;
+	st->chip.set = ti_ads7950_set;
 
 	ret = gpiochip_add_data(&st->chip, st);
 	if (ret) {
diff --git a/drivers/iio/addac/ad74115.c b/drivers/iio/addac/ad74115.c
index 4d8b64048e4f..f8b04d86b01f 100644
--- a/drivers/iio/addac/ad74115.c
+++ b/drivers/iio/addac/ad74115.c
@@ -1577,7 +1577,7 @@ static int ad74115_setup_gpio_chip(struct ad74115_state *st)
 		.direction_input = ad74115_gpio_direction_input,
 		.direction_output = ad74115_gpio_direction_output,
 		.get = ad74115_gpio_get,
-		.set_rv = ad74115_gpio_set,
+		.set = ad74115_gpio_set,
 	};
 
 	return devm_gpiochip_add_data(dev, &st->gc, st);
diff --git a/drivers/iio/addac/ad74413r.c b/drivers/iio/addac/ad74413r.c
index a0bb1dbcb7ad..a20b4d48c5f7 100644
--- a/drivers/iio/addac/ad74413r.c
+++ b/drivers/iio/addac/ad74413r.c
@@ -1425,8 +1425,8 @@ static int ad74413r_probe(struct spi_device *spi)
 		st->gpo_gpiochip.ngpio = st->num_gpo_gpios;
 		st->gpo_gpiochip.parent = st->dev;
 		st->gpo_gpiochip.can_sleep = true;
-		st->gpo_gpiochip.set_rv = ad74413r_gpio_set;
-		st->gpo_gpiochip.set_multiple_rv = ad74413r_gpio_set_multiple;
+		st->gpo_gpiochip.set = ad74413r_gpio_set;
+		st->gpo_gpiochip.set_multiple = ad74413r_gpio_set_multiple;
 		st->gpo_gpiochip.set_config = ad74413r_gpio_set_gpo_config;
 		st->gpo_gpiochip.get_direction =
 			ad74413r_gpio_get_gpo_direction;
diff --git a/drivers/iio/dac/ad5592r-base.c b/drivers/iio/dac/ad5592r-base.c
index 5f2cd51723f6..4720733d66b2 100644
--- a/drivers/iio/dac/ad5592r-base.c
+++ b/drivers/iio/dac/ad5592r-base.c
@@ -129,7 +129,7 @@ static int ad5592r_gpio_init(struct ad5592r_state *st)
 	st->gpiochip.direction_input = ad5592r_gpio_direction_input;
 	st->gpiochip.direction_output = ad5592r_gpio_direction_output;
 	st->gpiochip.get = ad5592r_gpio_get;
-	st->gpiochip.set_rv = ad5592r_gpio_set;
+	st->gpiochip.set = ad5592r_gpio_set;
 	st->gpiochip.request = ad5592r_gpio_request;
 	st->gpiochip.owner = THIS_MODULE;
 	st->gpiochip.names = ad5592r_gpio_names;
diff --git a/drivers/input/keyboard/adp5588-keys.c b/drivers/input/keyboard/adp5588-keys.c
index 2b2aca08423a..414fbef4abf9 100644
--- a/drivers/input/keyboard/adp5588-keys.c
+++ b/drivers/input/keyboard/adp5588-keys.c
@@ -425,7 +425,7 @@ static int adp5588_gpio_add(struct adp5588_kpad *kpad)
 	kpad->gc.direction_input = adp5588_gpio_direction_input;
 	kpad->gc.direction_output = adp5588_gpio_direction_output;
 	kpad->gc.get = adp5588_gpio_get_value;
-	kpad->gc.set_rv = adp5588_gpio_set_value;
+	kpad->gc.set = adp5588_gpio_set_value;
 	kpad->gc.set_config = adp5588_gpio_set_config;
 	kpad->gc.can_sleep = 1;
 
diff --git a/drivers/input/touchscreen/ad7879.c b/drivers/input/touchscreen/ad7879.c
index d2a3a5e016b6..8b4f3e3660b8 100644
--- a/drivers/input/touchscreen/ad7879.c
+++ b/drivers/input/touchscreen/ad7879.c
@@ -475,7 +475,7 @@ static int ad7879_gpio_add(struct ad7879 *ts)
 	ts->gc.direction_input = ad7879_gpio_direction_input;
 	ts->gc.direction_output = ad7879_gpio_direction_output;
 	ts->gc.get = ad7879_gpio_get_value;
-	ts->gc.set_rv = ad7879_gpio_set_value;
+	ts->gc.set = ad7879_gpio_set_value;
 	ts->gc.can_sleep = 1;
 	ts->gc.base = -1;
 	ts->gc.ngpio = 1;
diff --git a/drivers/leds/blink/leds-lgm-sso.c b/drivers/leds/blink/leds-lgm-sso.c
index c9027f9c4bb7..8923d2df4704 100644
--- a/drivers/leds/blink/leds-lgm-sso.c
+++ b/drivers/leds/blink/leds-lgm-sso.c
@@ -471,7 +471,7 @@ static int sso_gpio_gc_init(struct device *dev, struct sso_led_priv *priv)
 	gc->get_direction       = sso_gpio_get_dir;
 	gc->direction_output    = sso_gpio_dir_out;
 	gc->get                 = sso_gpio_get;
-	gc->set_rv              = sso_gpio_set;
+	gc->set                 = sso_gpio_set;
 
 	gc->label               = "lgm-sso";
 	gc->base                = -1;
diff --git a/drivers/leds/leds-pca9532.c b/drivers/leds/leds-pca9532.c
index 7d4c071a6cd0..0344189bb991 100644
--- a/drivers/leds/leds-pca9532.c
+++ b/drivers/leds/leds-pca9532.c
@@ -473,7 +473,7 @@ static int pca9532_configure(struct i2c_client *client,
 		data->gpio.label = "gpio-pca9532";
 		data->gpio.direction_input = pca9532_gpio_direction_input;
 		data->gpio.direction_output = pca9532_gpio_direction_output;
-		data->gpio.set_rv = pca9532_gpio_set_value;
+		data->gpio.set = pca9532_gpio_set_value;
 		data->gpio.get = pca9532_gpio_get_value;
 		data->gpio.request = pca9532_gpio_request_pin;
 		data->gpio.can_sleep = 1;
diff --git a/drivers/leds/leds-pca955x.c b/drivers/leds/leds-pca955x.c
index 70d109246088..2007fe6217ec 100644
--- a/drivers/leds/leds-pca955x.c
+++ b/drivers/leds/leds-pca955x.c
@@ -737,7 +737,7 @@ static int pca955x_probe(struct i2c_client *client)
 	pca955x->gpio.label = "gpio-pca955x";
 	pca955x->gpio.direction_input = pca955x_gpio_direction_input;
 	pca955x->gpio.direction_output = pca955x_gpio_direction_output;
-	pca955x->gpio.set_rv = pca955x_gpio_set_value;
+	pca955x->gpio.set = pca955x_gpio_set_value;
 	pca955x->gpio.get = pca955x_gpio_get_value;
 	pca955x->gpio.request = pca955x_gpio_request_pin;
 	pca955x->gpio.free = pca955x_gpio_free_pin;
diff --git a/drivers/leds/leds-tca6507.c b/drivers/leds/leds-tca6507.c
index 89c165c8ee9c..fd0e8bab9a4b 100644
--- a/drivers/leds/leds-tca6507.c
+++ b/drivers/leds/leds-tca6507.c
@@ -637,7 +637,7 @@ static int tca6507_probe_gpios(struct device *dev,
 	tca->gpio.base = -1;
 	tca->gpio.owner = THIS_MODULE;
 	tca->gpio.direction_output = tca6507_gpio_direction_output;
-	tca->gpio.set_rv = tca6507_gpio_set_value;
+	tca->gpio.set = tca6507_gpio_set_value;
 	tca->gpio.parent = dev;
 	err = devm_gpiochip_add_data(dev, &tca->gpio, tca);
 	if (err) {
diff --git a/drivers/media/dvb-frontends/cxd2820r_core.c b/drivers/media/dvb-frontends/cxd2820r_core.c
index a31a8a6a4946..5aa3d45a691a 100644
--- a/drivers/media/dvb-frontends/cxd2820r_core.c
+++ b/drivers/media/dvb-frontends/cxd2820r_core.c
@@ -651,7 +651,7 @@ static int cxd2820r_probe(struct i2c_client *client)
 		priv->gpio_chip.parent = &client->dev;
 		priv->gpio_chip.owner = THIS_MODULE;
 		priv->gpio_chip.direction_output = cxd2820r_gpio_direction_output;
-		priv->gpio_chip.set_rv = cxd2820r_gpio_set;
+		priv->gpio_chip.set = cxd2820r_gpio_set;
 		priv->gpio_chip.get = cxd2820r_gpio_get;
 		priv->gpio_chip.base = -1; /* Dynamic allocation */
 		priv->gpio_chip.ngpio = GPIO_COUNT;
diff --git a/drivers/media/i2c/ds90ub913.c b/drivers/media/i2c/ds90ub913.c
index bc74499b0a96..a80da2b4a8fa 100644
--- a/drivers/media/i2c/ds90ub913.c
+++ b/drivers/media/i2c/ds90ub913.c
@@ -235,7 +235,7 @@ static int ub913_gpiochip_probe(struct ub913_data *priv)
 	gc->ngpio = UB913_NUM_GPIOS;
 	gc->get_direction = ub913_gpio_get_direction;
 	gc->direction_output = ub913_gpio_direction_out;
-	gc->set_rv = ub913_gpio_set;
+	gc->set = ub913_gpio_set;
 	gc->of_xlate = ub913_gpio_of_xlate;
 	gc->of_gpio_n_cells = 2;
 
diff --git a/drivers/media/i2c/ds90ub953.c b/drivers/media/i2c/ds90ub953.c
index a865bfc89500..e3fc9d66970a 100644
--- a/drivers/media/i2c/ds90ub953.c
+++ b/drivers/media/i2c/ds90ub953.c
@@ -361,7 +361,7 @@ static int ub953_gpiochip_probe(struct ub953_data *priv)
 	gc->direction_input = ub953_gpio_direction_in;
 	gc->direction_output = ub953_gpio_direction_out;
 	gc->get = ub953_gpio_get;
-	gc->set_rv = ub953_gpio_set;
+	gc->set = ub953_gpio_set;
 	gc->of_xlate = ub953_gpio_of_xlate;
 	gc->of_gpio_n_cells = 2;
 
diff --git a/drivers/media/i2c/max9286.c b/drivers/media/i2c/max9286.c
index 1d0b5f56f989..7c0961688d61 100644
--- a/drivers/media/i2c/max9286.c
+++ b/drivers/media/i2c/max9286.c
@@ -1220,7 +1220,7 @@ static int max9286_register_gpio(struct max9286_priv *priv)
 	gpio->owner = THIS_MODULE;
 	gpio->ngpio = 2;
 	gpio->base = -1;
-	gpio->set_rv = max9286_gpiochip_set;
+	gpio->set = max9286_gpiochip_set;
 	gpio->get = max9286_gpiochip_get;
 	gpio->can_sleep = true;
 
diff --git a/drivers/media/i2c/max96717.c b/drivers/media/i2c/max96717.c
index 015e42fbe246..c8ae7890d9fa 100644
--- a/drivers/media/i2c/max96717.c
+++ b/drivers/media/i2c/max96717.c
@@ -355,7 +355,7 @@ static int max96717_gpiochip_probe(struct max96717_priv *priv)
 	gc->get_direction = max96717_gpio_get_direction;
 	gc->direction_input = max96717_gpio_direction_in;
 	gc->direction_output = max96717_gpio_direction_out;
-	gc->set_rv = max96717_gpiochip_set;
+	gc->set = max96717_gpiochip_set;
 	gc->get = max96717_gpiochip_get;
 
 	/* Disable GPIO forwarding */
diff --git a/drivers/media/pci/solo6x10/solo6x10-gpio.c b/drivers/media/pci/solo6x10/solo6x10-gpio.c
index b16a8453a62a..71848741c55c 100644
--- a/drivers/media/pci/solo6x10/solo6x10-gpio.c
+++ b/drivers/media/pci/solo6x10/solo6x10-gpio.c
@@ -158,7 +158,7 @@ int solo_gpio_init(struct solo_dev *solo_dev)
 
 	solo_dev->gpio_dev.get_direction = solo_gpiochip_get_direction;
 	solo_dev->gpio_dev.get = solo_gpiochip_get;
-	solo_dev->gpio_dev.set_rv = solo_gpiochip_set;
+	solo_dev->gpio_dev.set = solo_gpiochip_set;
 
 	ret = gpiochip_add_data(&solo_dev->gpio_dev, solo_dev);
 
diff --git a/drivers/mfd/sm501.c b/drivers/mfd/sm501.c
index a5f9241fa3f2..50bf3260f65d 100644
--- a/drivers/mfd/sm501.c
+++ b/drivers/mfd/sm501.c
@@ -965,7 +965,7 @@ static const struct gpio_chip gpio_chip_template = {
 	.ngpio			= 32,
 	.direction_input	= sm501_gpio_input,
 	.direction_output	= sm501_gpio_output,
-	.set_rv			= sm501_gpio_set,
+	.set			= sm501_gpio_set,
 	.get			= sm501_gpio_get,
 };
 
diff --git a/drivers/mfd/tps65010.c b/drivers/mfd/tps65010.c
index 03bd5cd66798..8a144ec52201 100644
--- a/drivers/mfd/tps65010.c
+++ b/drivers/mfd/tps65010.c
@@ -620,7 +620,7 @@ static int tps65010_probe(struct i2c_client *client)
 		tps->chip.parent = &client->dev;
 		tps->chip.owner = THIS_MODULE;
 
-		tps->chip.set_rv = tps65010_gpio_set;
+		tps->chip.set = tps65010_gpio_set;
 		tps->chip.direction_output = tps65010_output;
 
 		/* NOTE:  only partial support for inputs; nyet IRQs */
diff --git a/drivers/mfd/ucb1x00-core.c b/drivers/mfd/ucb1x00-core.c
index fd71ba29f6b5..4b450d78a65f 100644
--- a/drivers/mfd/ucb1x00-core.c
+++ b/drivers/mfd/ucb1x00-core.c
@@ -570,7 +570,7 @@ static int ucb1x00_probe(struct mcp *mcp)
 		ucb->gpio.owner = THIS_MODULE;
 		ucb->gpio.base = pdata->gpio_base;
 		ucb->gpio.ngpio = 10;
-		ucb->gpio.set_rv = ucb1x00_gpio_set;
+		ucb->gpio.set = ucb1x00_gpio_set;
 		ucb->gpio.get = ucb1x00_gpio_get;
 		ucb->gpio.direction_input = ucb1x00_gpio_direction_input;
 		ucb->gpio.direction_output = ucb1x00_gpio_direction_output;
diff --git a/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gpio.c b/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gpio.c
index ff8f4404d10f..8eddbaa1fccd 100644
--- a/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gpio.c
+++ b/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gpio.c
@@ -438,7 +438,7 @@ static int pci1xxxx_gpio_setup(struct pci1xxxx_gpio *priv, int irq)
 	gchip->direction_output = pci1xxxx_gpio_direction_output;
 	gchip->get_direction = pci1xxxx_gpio_get_direction;
 	gchip->get = pci1xxxx_gpio_get;
-	gchip->set_rv = pci1xxxx_gpio_set;
+	gchip->set = pci1xxxx_gpio_set;
 	gchip->set_config = pci1xxxx_gpio_set_config;
 	gchip->dbg_show = NULL;
 	gchip->base = -1;
diff --git a/drivers/misc/ti_fpc202.c b/drivers/misc/ti_fpc202.c
index 0b1a6350c02b..7964e46c7448 100644
--- a/drivers/misc/ti_fpc202.c
+++ b/drivers/misc/ti_fpc202.c
@@ -333,7 +333,7 @@ static int fpc202_probe(struct i2c_client *client)
 	priv->gpio.base = -1;
 	priv->gpio.direction_input = fpc202_gpio_direction_input;
 	priv->gpio.direction_output = fpc202_gpio_direction_output;
-	priv->gpio.set_rv = fpc202_gpio_set;
+	priv->gpio.set = fpc202_gpio_set;
 	priv->gpio.get = fpc202_gpio_get;
 	priv->gpio.ngpio = FPC202_GPIO_COUNT;
 	priv->gpio.parent = dev;
diff --git a/drivers/net/can/spi/mcp251x.c b/drivers/net/can/spi/mcp251x.c
index 5a95877b7419..313e1d241f01 100644
--- a/drivers/net/can/spi/mcp251x.c
+++ b/drivers/net/can/spi/mcp251x.c
@@ -607,8 +607,8 @@ static int mcp251x_gpio_setup(struct mcp251x_priv *priv)
 	gpio->get_direction = mcp251x_gpio_get_direction;
 	gpio->get = mcp251x_gpio_get;
 	gpio->get_multiple = mcp251x_gpio_get_multiple;
-	gpio->set_rv = mcp251x_gpio_set;
-	gpio->set_multiple_rv = mcp251x_gpio_set_multiple;
+	gpio->set = mcp251x_gpio_set;
+	gpio->set_multiple = mcp251x_gpio_set_multiple;
 	gpio->base = -1;
 	gpio->ngpio = ARRAY_SIZE(mcp251x_gpio_names);
 	gpio->names = mcp251x_gpio_names;
diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c
index e5bed4237ff4..548b85befbf4 100644
--- a/drivers/net/dsa/mt7530.c
+++ b/drivers/net/dsa/mt7530.c
@@ -2187,7 +2187,7 @@ mt7530_setup_gpio(struct mt7530_priv *priv)
 	gc->direction_input = mt7530_gpio_direction_input;
 	gc->direction_output = mt7530_gpio_direction_output;
 	gc->get = mt7530_gpio_get;
-	gc->set_rv = mt7530_gpio_set;
+	gc->set = mt7530_gpio_set;
 	gc->base = -1;
 	gc->ngpio = 15;
 	gc->can_sleep = true;
diff --git a/drivers/net/dsa/vitesse-vsc73xx-core.c b/drivers/net/dsa/vitesse-vsc73xx-core.c
index 4f9687ab3b2b..9d31b8258268 100644
--- a/drivers/net/dsa/vitesse-vsc73xx-core.c
+++ b/drivers/net/dsa/vitesse-vsc73xx-core.c
@@ -2317,7 +2317,7 @@ static int vsc73xx_gpio_probe(struct vsc73xx *vsc)
 	vsc->gc.parent = vsc->dev;
 	vsc->gc.base = -1;
 	vsc->gc.get = vsc73xx_gpio_get;
-	vsc->gc.set_rv = vsc73xx_gpio_set;
+	vsc->gc.set = vsc73xx_gpio_set;
 	vsc->gc.direction_input = vsc73xx_gpio_direction_input;
 	vsc->gc.direction_output = vsc73xx_gpio_direction_output;
 	vsc->gc.get_direction = vsc73xx_gpio_get_direction;
diff --git a/drivers/net/phy/qcom/qca807x.c b/drivers/net/phy/qcom/qca807x.c
index 04e84ebb646c..070dc8c00835 100644
--- a/drivers/net/phy/qcom/qca807x.c
+++ b/drivers/net/phy/qcom/qca807x.c
@@ -427,7 +427,7 @@ static int qca807x_gpio(struct phy_device *phydev)
 	gc->get_direction = qca807x_gpio_get_direction;
 	gc->direction_output = qca807x_gpio_dir_out;
 	gc->get = qca807x_gpio_get;
-	gc->set_rv = qca807x_gpio_set;
+	gc->set = qca807x_gpio_set;
 
 	return devm_gpiochip_add_data(dev, gc, priv);
 }
diff --git a/drivers/pinctrl/actions/pinctrl-owl.c b/drivers/pinctrl/actions/pinctrl-owl.c
index 86f3d5c69e36..1f0ef4727ba7 100644
--- a/drivers/pinctrl/actions/pinctrl-owl.c
+++ b/drivers/pinctrl/actions/pinctrl-owl.c
@@ -962,7 +962,7 @@ int owl_pinctrl_probe(struct platform_device *pdev,
 	pctrl->chip.direction_input  = owl_gpio_direction_input;
 	pctrl->chip.direction_output = owl_gpio_direction_output;
 	pctrl->chip.get = owl_gpio_get;
-	pctrl->chip.set_rv = owl_gpio_set;
+	pctrl->chip.set = owl_gpio_set;
 	pctrl->chip.request = owl_gpio_request;
 	pctrl->chip.free = owl_gpio_free;
 
diff --git a/drivers/pinctrl/bcm/pinctrl-bcm2835.c b/drivers/pinctrl/bcm/pinctrl-bcm2835.c
index 826827800474..7dbf079739bc 100644
--- a/drivers/pinctrl/bcm/pinctrl-bcm2835.c
+++ b/drivers/pinctrl/bcm/pinctrl-bcm2835.c
@@ -397,7 +397,7 @@ static const struct gpio_chip bcm2835_gpio_chip = {
 	.direction_output = bcm2835_gpio_direction_output,
 	.get_direction = bcm2835_gpio_get_direction,
 	.get = bcm2835_gpio_get,
-	.set_rv = bcm2835_gpio_set,
+	.set = bcm2835_gpio_set,
 	.set_config = gpiochip_generic_config,
 	.base = -1,
 	.ngpio = BCM2835_NUM_GPIOS,
@@ -414,7 +414,7 @@ static const struct gpio_chip bcm2711_gpio_chip = {
 	.direction_output = bcm2835_gpio_direction_output,
 	.get_direction = bcm2835_gpio_get_direction,
 	.get = bcm2835_gpio_get,
-	.set_rv = bcm2835_gpio_set,
+	.set = bcm2835_gpio_set,
 	.set_config = gpiochip_generic_config,
 	.base = -1,
 	.ngpio = BCM2711_NUM_GPIOS,
diff --git a/drivers/pinctrl/bcm/pinctrl-iproc-gpio.c b/drivers/pinctrl/bcm/pinctrl-iproc-gpio.c
index 1d08b8d4cdd7..8c353676f2af 100644
--- a/drivers/pinctrl/bcm/pinctrl-iproc-gpio.c
+++ b/drivers/pinctrl/bcm/pinctrl-iproc-gpio.c
@@ -865,7 +865,7 @@ static int iproc_gpio_probe(struct platform_device *pdev)
 	gc->direction_input = iproc_gpio_direction_input;
 	gc->direction_output = iproc_gpio_direction_output;
 	gc->get_direction = iproc_gpio_get_direction;
-	gc->set_rv = iproc_gpio_set;
+	gc->set = iproc_gpio_set;
 	gc->get = iproc_gpio_get;
 
 	chip->pinmux_is_supported = of_property_read_bool(dev->of_node,
diff --git a/drivers/pinctrl/bcm/pinctrl-nsp-gpio.c b/drivers/pinctrl/bcm/pinctrl-nsp-gpio.c
index b08f8480ddc6..b425ecacd1b0 100644
--- a/drivers/pinctrl/bcm/pinctrl-nsp-gpio.c
+++ b/drivers/pinctrl/bcm/pinctrl-nsp-gpio.c
@@ -656,7 +656,7 @@ static int nsp_gpio_probe(struct platform_device *pdev)
 	gc->direction_input = nsp_gpio_direction_input;
 	gc->direction_output = nsp_gpio_direction_output;
 	gc->get_direction = nsp_gpio_get_direction;
-	gc->set_rv = nsp_gpio_set;
+	gc->set = nsp_gpio_set;
 	gc->get = nsp_gpio_get;
 
 	/* optional GPIO interrupt support */
diff --git a/drivers/pinctrl/cirrus/pinctrl-cs42l43.c b/drivers/pinctrl/cirrus/pinctrl-cs42l43.c
index 4e47710eb3d5..68abb6d6cecd 100644
--- a/drivers/pinctrl/cirrus/pinctrl-cs42l43.c
+++ b/drivers/pinctrl/cirrus/pinctrl-cs42l43.c
@@ -555,7 +555,7 @@ static int cs42l43_pin_probe(struct platform_device *pdev)
 	priv->gpio_chip.direction_output = cs42l43_gpio_direction_out;
 	priv->gpio_chip.add_pin_ranges = cs42l43_gpio_add_pin_ranges;
 	priv->gpio_chip.get = cs42l43_gpio_get;
-	priv->gpio_chip.set_rv = cs42l43_gpio_set;
+	priv->gpio_chip.set = cs42l43_gpio_set;
 	priv->gpio_chip.label = dev_name(priv->dev);
 	priv->gpio_chip.parent = priv->dev;
 	priv->gpio_chip.can_sleep = true;
diff --git a/drivers/pinctrl/cirrus/pinctrl-lochnagar.c b/drivers/pinctrl/cirrus/pinctrl-lochnagar.c
index dcc0a2f3c7dd..ca6ae566082b 100644
--- a/drivers/pinctrl/cirrus/pinctrl-lochnagar.c
+++ b/drivers/pinctrl/cirrus/pinctrl-lochnagar.c
@@ -1161,7 +1161,7 @@ static int lochnagar_pin_probe(struct platform_device *pdev)
 	priv->gpio_chip.request = gpiochip_generic_request;
 	priv->gpio_chip.free = gpiochip_generic_free;
 	priv->gpio_chip.direction_output = lochnagar_gpio_direction_out;
-	priv->gpio_chip.set_rv = lochnagar_gpio_set;
+	priv->gpio_chip.set = lochnagar_gpio_set;
 	priv->gpio_chip.can_sleep = true;
 	priv->gpio_chip.parent = dev;
 	priv->gpio_chip.base = -1;
diff --git a/drivers/pinctrl/intel/pinctrl-baytrail.c b/drivers/pinctrl/intel/pinctrl-baytrail.c
index 6eb649f1ffd6..5fd107a00ef8 100644
--- a/drivers/pinctrl/intel/pinctrl-baytrail.c
+++ b/drivers/pinctrl/intel/pinctrl-baytrail.c
@@ -1231,7 +1231,7 @@ static const struct gpio_chip byt_gpio_chip = {
 	.direction_input	= byt_gpio_direction_input,
 	.direction_output	= byt_gpio_direction_output,
 	.get			= byt_gpio_get,
-	.set_rv			= byt_gpio_set,
+	.set			= byt_gpio_set,
 	.set_config		= gpiochip_generic_config,
 	.dbg_show		= byt_gpio_dbg_show,
 };
diff --git a/drivers/pinctrl/intel/pinctrl-cherryview.c b/drivers/pinctrl/intel/pinctrl-cherryview.c
index 769e8c4102a5..f81f7929cd3b 100644
--- a/drivers/pinctrl/intel/pinctrl-cherryview.c
+++ b/drivers/pinctrl/intel/pinctrl-cherryview.c
@@ -1168,7 +1168,7 @@ static const struct gpio_chip chv_gpio_chip = {
 	.direction_input = chv_gpio_direction_input,
 	.direction_output = chv_gpio_direction_output,
 	.get = chv_gpio_get,
-	.set_rv = chv_gpio_set,
+	.set = chv_gpio_set,
 };
 
 static void chv_gpio_irq_ack(struct irq_data *d)
diff --git a/drivers/pinctrl/intel/pinctrl-intel.c b/drivers/pinctrl/intel/pinctrl-intel.c
index f2ff71e5ea6f..d68cef4ec52a 100644
--- a/drivers/pinctrl/intel/pinctrl-intel.c
+++ b/drivers/pinctrl/intel/pinctrl-intel.c
@@ -1114,7 +1114,7 @@ static const struct gpio_chip intel_gpio_chip = {
 	.direction_input = intel_gpio_direction_input,
 	.direction_output = intel_gpio_direction_output,
 	.get = intel_gpio_get,
-	.set_rv = intel_gpio_set,
+	.set = intel_gpio_set,
 	.set_config = gpiochip_generic_config,
 };
 
diff --git a/drivers/pinctrl/intel/pinctrl-lynxpoint.c b/drivers/pinctrl/intel/pinctrl-lynxpoint.c
index 5d4a5dd493d1..3fb628309fb2 100644
--- a/drivers/pinctrl/intel/pinctrl-lynxpoint.c
+++ b/drivers/pinctrl/intel/pinctrl-lynxpoint.c
@@ -777,7 +777,7 @@ static int lp_gpio_probe(struct platform_device *pdev)
 	gc->direction_input = lp_gpio_direction_input;
 	gc->direction_output = lp_gpio_direction_output;
 	gc->get = lp_gpio_get;
-	gc->set_rv = lp_gpio_set;
+	gc->set = lp_gpio_set;
 	gc->set_config = gpiochip_generic_config;
 	gc->get_direction = lp_gpio_get_direction;
 	gc->base = -1;
diff --git a/drivers/pinctrl/mediatek/pinctrl-airoha.c b/drivers/pinctrl/mediatek/pinctrl-airoha.c
index 1737b88530c3..5f1ec9e0de21 100644
--- a/drivers/pinctrl/mediatek/pinctrl-airoha.c
+++ b/drivers/pinctrl/mediatek/pinctrl-airoha.c
@@ -2418,7 +2418,7 @@ static int airoha_pinctrl_add_gpiochip(struct airoha_pinctrl *pinctrl,
 	gc->free = gpiochip_generic_free;
 	gc->direction_input = pinctrl_gpio_direction_input;
 	gc->direction_output = airoha_gpio_direction_output;
-	gc->set_rv = airoha_gpio_set;
+	gc->set = airoha_gpio_set;
 	gc->get = airoha_gpio_get;
 	gc->base = -1;
 	gc->ngpio = AIROHA_NUM_PINS;
diff --git a/drivers/pinctrl/mediatek/pinctrl-moore.c b/drivers/pinctrl/mediatek/pinctrl-moore.c
index ba0d6f880c6e..6e4f6c07a509 100644
--- a/drivers/pinctrl/mediatek/pinctrl-moore.c
+++ b/drivers/pinctrl/mediatek/pinctrl-moore.c
@@ -569,7 +569,7 @@ static int mtk_build_gpiochip(struct mtk_pinctrl *hw)
 	chip->direction_input	= pinctrl_gpio_direction_input;
 	chip->direction_output	= mtk_gpio_direction_output;
 	chip->get		= mtk_gpio_get;
-	chip->set_rv		= mtk_gpio_set;
+	chip->set		= mtk_gpio_set;
 	chip->to_irq		= mtk_gpio_to_irq;
 	chip->set_config	= mtk_gpio_set_config;
 	chip->base		= -1;
diff --git a/drivers/pinctrl/mediatek/pinctrl-mtk-common.c b/drivers/pinctrl/mediatek/pinctrl-mtk-common.c
index a4cb6d511fcd..d10306024111 100644
--- a/drivers/pinctrl/mediatek/pinctrl-mtk-common.c
+++ b/drivers/pinctrl/mediatek/pinctrl-mtk-common.c
@@ -898,7 +898,7 @@ static const struct gpio_chip mtk_gpio_chip = {
 	.direction_input	= pinctrl_gpio_direction_input,
 	.direction_output	= mtk_gpio_direction_output,
 	.get			= mtk_gpio_get,
-	.set_rv			= mtk_gpio_set,
+	.set			= mtk_gpio_set,
 	.to_irq			= mtk_gpio_to_irq,
 	.set_config		= mtk_gpio_set_config,
 };
diff --git a/drivers/pinctrl/mediatek/pinctrl-paris.c b/drivers/pinctrl/mediatek/pinctrl-paris.c
index 89ef4e530fcc..3e714554789d 100644
--- a/drivers/pinctrl/mediatek/pinctrl-paris.c
+++ b/drivers/pinctrl/mediatek/pinctrl-paris.c
@@ -949,7 +949,7 @@ static int mtk_build_gpiochip(struct mtk_pinctrl *hw)
 	chip->direction_input	= mtk_gpio_direction_input;
 	chip->direction_output	= mtk_gpio_direction_output;
 	chip->get		= mtk_gpio_get;
-	chip->set_rv		= mtk_gpio_set;
+	chip->set		= mtk_gpio_set;
 	chip->to_irq		= mtk_gpio_to_irq;
 	chip->set_config	= mtk_gpio_set_config;
 	chip->base		= -1;
diff --git a/drivers/pinctrl/meson/pinctrl-amlogic-a4.c b/drivers/pinctrl/meson/pinctrl-amlogic-a4.c
index c8958222df8c..e34e984c2b38 100644
--- a/drivers/pinctrl/meson/pinctrl-amlogic-a4.c
+++ b/drivers/pinctrl/meson/pinctrl-amlogic-a4.c
@@ -888,7 +888,7 @@ static const struct gpio_chip aml_gpio_template = {
 	.request		= gpiochip_generic_request,
 	.free			= gpiochip_generic_free,
 	.set_config		= gpiochip_generic_config,
-	.set_rv			= aml_gpio_set,
+	.set			= aml_gpio_set,
 	.get			= aml_gpio_get,
 	.direction_input	= aml_gpio_direction_input,
 	.direction_output	= aml_gpio_direction_output,
diff --git a/drivers/pinctrl/meson/pinctrl-meson.c b/drivers/pinctrl/meson/pinctrl-meson.c
index f5be61f2ede4..277e9c40490d 100644
--- a/drivers/pinctrl/meson/pinctrl-meson.c
+++ b/drivers/pinctrl/meson/pinctrl-meson.c
@@ -616,7 +616,7 @@ static int meson_gpiolib_register(struct meson_pinctrl *pc)
 	pc->chip.direction_input = meson_gpio_direction_input;
 	pc->chip.direction_output = meson_gpio_direction_output;
 	pc->chip.get = meson_gpio_get;
-	pc->chip.set_rv = meson_gpio_set;
+	pc->chip.set = meson_gpio_set;
 	pc->chip.base = -1;
 	pc->chip.ngpio = pc->data->num_pins;
 	pc->chip.can_sleep = false;
diff --git a/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c b/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c
index a6b106984e12..881df5e08f61 100644
--- a/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c
+++ b/drivers/pinctrl/mvebu/pinctrl-armada-37xx.c
@@ -518,7 +518,7 @@ static const struct pinmux_ops armada_37xx_pmx_ops = {
 static const struct gpio_chip armada_37xx_gpiolib_chip = {
 	.request = gpiochip_generic_request,
 	.free = gpiochip_generic_free,
-	.set_rv = armada_37xx_gpio_set,
+	.set = armada_37xx_gpio_set,
 	.get = armada_37xx_gpio_get,
 	.get_direction	= armada_37xx_gpio_get_direction,
 	.direction_input = armada_37xx_gpio_direction_input,
diff --git a/drivers/pinctrl/nomadik/pinctrl-abx500.c b/drivers/pinctrl/nomadik/pinctrl-abx500.c
index 2f55f83127cf..7b5f94d8cb23 100644
--- a/drivers/pinctrl/nomadik/pinctrl-abx500.c
+++ b/drivers/pinctrl/nomadik/pinctrl-abx500.c
@@ -536,7 +536,7 @@ static const struct gpio_chip abx500gpio_chip = {
 	.direction_input	= abx500_gpio_direction_input,
 	.get			= abx500_gpio_get,
 	.direction_output	= abx500_gpio_direction_output,
-	.set_rv			= abx500_gpio_set,
+	.set			= abx500_gpio_set,
 	.to_irq			= abx500_gpio_to_irq,
 	.dbg_show		= abx500_gpio_dbg_show,
 };
diff --git a/drivers/pinctrl/nuvoton/pinctrl-ma35.c b/drivers/pinctrl/nuvoton/pinctrl-ma35.c
index da5220da5149..54652bfbe6ac 100644
--- a/drivers/pinctrl/nuvoton/pinctrl-ma35.c
+++ b/drivers/pinctrl/nuvoton/pinctrl-ma35.c
@@ -526,7 +526,7 @@ static int ma35_gpiolib_register(struct platform_device *pdev, struct ma35_pinct
 		bank->chip.direction_input = ma35_gpio_core_direction_in;
 		bank->chip.direction_output = ma35_gpio_core_direction_out;
 		bank->chip.get = ma35_gpio_core_get;
-		bank->chip.set_rv = ma35_gpio_core_set;
+		bank->chip.set = ma35_gpio_core_set;
 		bank->chip.base = -1;
 		bank->chip.ngpio = bank->nr_pins;
 		bank->chip.can_sleep = false;
diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c
index b90ef3a26ae8..09a5425d54ba 100644
--- a/drivers/pinctrl/pinctrl-amd.c
+++ b/drivers/pinctrl/pinctrl-amd.c
@@ -1187,7 +1187,7 @@ static int amd_gpio_probe(struct platform_device *pdev)
 	gpio_dev->gc.direction_input	= amd_gpio_direction_input;
 	gpio_dev->gc.direction_output	= amd_gpio_direction_output;
 	gpio_dev->gc.get			= amd_gpio_get_value;
-	gpio_dev->gc.set_rv			= amd_gpio_set_value;
+	gpio_dev->gc.set			= amd_gpio_set_value;
 	gpio_dev->gc.set_config		= amd_gpio_set_config;
 	gpio_dev->gc.dbg_show		= amd_gpio_dbg_show;
 
diff --git a/drivers/pinctrl/pinctrl-amdisp.c b/drivers/pinctrl/pinctrl-amdisp.c
index 2e706bf8bcde..efbf40c776ea 100644
--- a/drivers/pinctrl/pinctrl-amdisp.c
+++ b/drivers/pinctrl/pinctrl-amdisp.c
@@ -151,7 +151,7 @@ static int amdisp_gpiochip_add(struct platform_device *pdev,
 	gc->direction_input	= amdisp_gpio_direction_input;
 	gc->direction_output	= amdisp_gpio_direction_output;
 	gc->get			= amdisp_gpio_get;
-	gc->set_rv		= amdisp_gpio_set;
+	gc->set			= amdisp_gpio_set;
 	gc->base		= -1;
 	gc->ngpio		= ARRAY_SIZE(amdisp_range_pins);
 
diff --git a/drivers/pinctrl/pinctrl-apple-gpio.c b/drivers/pinctrl/pinctrl-apple-gpio.c
index dcf3a921b4df..a09daa72bfe4 100644
--- a/drivers/pinctrl/pinctrl-apple-gpio.c
+++ b/drivers/pinctrl/pinctrl-apple-gpio.c
@@ -378,7 +378,7 @@ static int apple_gpio_register(struct apple_gpio_pinctrl *pctl)
 	pctl->gpio_chip.direction_input = apple_gpio_direction_input;
 	pctl->gpio_chip.direction_output = apple_gpio_direction_output;
 	pctl->gpio_chip.get = apple_gpio_get;
-	pctl->gpio_chip.set_rv = apple_gpio_set;
+	pctl->gpio_chip.set = apple_gpio_set;
 	pctl->gpio_chip.base = -1;
 	pctl->gpio_chip.ngpio = pctl->pinctrl_desc.npins;
 	pctl->gpio_chip.parent = pctl->dev;
diff --git a/drivers/pinctrl/pinctrl-as3722.c b/drivers/pinctrl/pinctrl-as3722.c
index 30ed758bbe9d..e713dea98aa8 100644
--- a/drivers/pinctrl/pinctrl-as3722.c
+++ b/drivers/pinctrl/pinctrl-as3722.c
@@ -529,7 +529,7 @@ static const struct gpio_chip as3722_gpio_chip = {
 	.request		= gpiochip_generic_request,
 	.free			= gpiochip_generic_free,
 	.get			= as3722_gpio_get,
-	.set_rv			= as3722_gpio_set,
+	.set			= as3722_gpio_set,
 	.direction_input	= pinctrl_gpio_direction_input,
 	.direction_output	= as3722_gpio_direction_output,
 	.to_irq			= as3722_gpio_to_irq,
diff --git a/drivers/pinctrl/pinctrl-at91-pio4.c b/drivers/pinctrl/pinctrl-at91-pio4.c
index 57f105ac962d..35ea3414cb96 100644
--- a/drivers/pinctrl/pinctrl-at91-pio4.c
+++ b/drivers/pinctrl/pinctrl-at91-pio4.c
@@ -442,8 +442,8 @@ static struct gpio_chip atmel_gpio_chip = {
 	.get                    = atmel_gpio_get,
 	.get_multiple           = atmel_gpio_get_multiple,
 	.direction_output       = atmel_gpio_direction_output,
-	.set_rv                 = atmel_gpio_set,
-	.set_multiple_rv        = atmel_gpio_set_multiple,
+	.set                    = atmel_gpio_set,
+	.set_multiple           = atmel_gpio_set_multiple,
 	.to_irq                 = atmel_gpio_to_irq,
 	.base                   = 0,
 };
diff --git a/drivers/pinctrl/pinctrl-at91.c b/drivers/pinctrl/pinctrl-at91.c
index 6c2727bd55bc..0a57ed51d4c9 100644
--- a/drivers/pinctrl/pinctrl-at91.c
+++ b/drivers/pinctrl/pinctrl-at91.c
@@ -1801,8 +1801,8 @@ static const struct gpio_chip at91_gpio_template = {
 	.direction_input	= at91_gpio_direction_input,
 	.get			= at91_gpio_get,
 	.direction_output	= at91_gpio_direction_output,
-	.set_rv			= at91_gpio_set,
-	.set_multiple_rv	= at91_gpio_set_multiple,
+	.set			= at91_gpio_set,
+	.set_multiple		= at91_gpio_set_multiple,
 	.dbg_show		= at91_gpio_dbg_show,
 	.can_sleep		= false,
 	.ngpio			= MAX_NB_GPIO_PER_BANK,
diff --git a/drivers/pinctrl/pinctrl-aw9523.c b/drivers/pinctrl/pinctrl-aw9523.c
index 9570ef346af6..890b83fddea3 100644
--- a/drivers/pinctrl/pinctrl-aw9523.c
+++ b/drivers/pinctrl/pinctrl-aw9523.c
@@ -785,8 +785,8 @@ static int aw9523_init_gpiochip(struct aw9523 *awi, unsigned int npins)
 	gc->direction_output = aw9523_direction_output;
 	gc->get = aw9523_gpio_get;
 	gc->get_multiple = aw9523_gpio_get_multiple;
-	gc->set_rv = aw9523_gpio_set;
-	gc->set_multiple_rv = aw9523_gpio_set_multiple;
+	gc->set = aw9523_gpio_set;
+	gc->set_multiple = aw9523_gpio_set_multiple;
 	gc->set_config = gpiochip_generic_config;
 	gc->parent = dev;
 	gc->owner = THIS_MODULE;
diff --git a/drivers/pinctrl/pinctrl-axp209.c b/drivers/pinctrl/pinctrl-axp209.c
index fff408b60c4a..2bd8487484a8 100644
--- a/drivers/pinctrl/pinctrl-axp209.c
+++ b/drivers/pinctrl/pinctrl-axp209.c
@@ -192,7 +192,7 @@ static int axp20x_gpio_get_direction(struct gpio_chip *chip,
 static int axp20x_gpio_output(struct gpio_chip *chip, unsigned int offset,
 			      int value)
 {
-	return chip->set_rv(chip, offset, value);
+	return chip->set(chip, offset, value);
 }
 
 static int axp20x_gpio_set(struct gpio_chip *chip, unsigned int offset,
@@ -463,7 +463,7 @@ static int axp20x_pctl_probe(struct platform_device *pdev)
 	pctl->chip.owner		= THIS_MODULE;
 	pctl->chip.get			= axp20x_gpio_get;
 	pctl->chip.get_direction	= axp20x_gpio_get_direction;
-	pctl->chip.set_rv		= axp20x_gpio_set;
+	pctl->chip.set			= axp20x_gpio_set;
 	pctl->chip.direction_input	= pinctrl_gpio_direction_input;
 	pctl->chip.direction_output	= axp20x_gpio_output;
 
diff --git a/drivers/pinctrl/pinctrl-cy8c95x0.c b/drivers/pinctrl/pinctrl-cy8c95x0.c
index 8a2fd632bdd4..cf7f80497fde 100644
--- a/drivers/pinctrl/pinctrl-cy8c95x0.c
+++ b/drivers/pinctrl/pinctrl-cy8c95x0.c
@@ -939,10 +939,10 @@ static int cy8c95x0_setup_gpiochip(struct cy8c95x0_pinctrl *chip)
 	gc->direction_input  = cy8c95x0_gpio_direction_input;
 	gc->direction_output = cy8c95x0_gpio_direction_output;
 	gc->get = cy8c95x0_gpio_get_value;
-	gc->set_rv = cy8c95x0_gpio_set_value;
+	gc->set = cy8c95x0_gpio_set_value;
 	gc->get_direction = cy8c95x0_gpio_get_direction;
 	gc->get_multiple = cy8c95x0_gpio_get_multiple;
-	gc->set_multiple_rv = cy8c95x0_gpio_set_multiple;
+	gc->set_multiple = cy8c95x0_gpio_set_multiple;
 	gc->set_config = gpiochip_generic_config;
 	gc->can_sleep = true;
 	gc->add_pin_ranges = cy8c95x0_add_pin_ranges;
diff --git a/drivers/pinctrl/pinctrl-da9062.c b/drivers/pinctrl/pinctrl-da9062.c
index 3295b09dfc3d..53298cbcc5cf 100644
--- a/drivers/pinctrl/pinctrl-da9062.c
+++ b/drivers/pinctrl/pinctrl-da9062.c
@@ -233,7 +233,7 @@ static int da9062_gpio_to_irq(struct gpio_chip *gc, unsigned int offset)
 static const struct gpio_chip reference_gc = {
 	.owner = THIS_MODULE,
 	.get = da9062_gpio_get,
-	.set_rv = da9062_gpio_set,
+	.set = da9062_gpio_set,
 	.get_direction = da9062_gpio_get_direction,
 	.direction_input = da9062_gpio_direction_input,
 	.direction_output = da9062_gpio_direction_output,
diff --git a/drivers/pinctrl/pinctrl-digicolor.c b/drivers/pinctrl/pinctrl-digicolor.c
index 1676cb3cc4c9..2e16f09aeb47 100644
--- a/drivers/pinctrl/pinctrl-digicolor.c
+++ b/drivers/pinctrl/pinctrl-digicolor.c
@@ -248,7 +248,7 @@ static int dc_gpiochip_add(struct dc_pinmap *pmap)
 	chip->direction_input	= dc_gpio_direction_input;
 	chip->direction_output	= dc_gpio_direction_output;
 	chip->get		= dc_gpio_get;
-	chip->set_rv		= dc_gpio_set;
+	chip->set		= dc_gpio_set;
 	chip->base		= -1;
 	chip->ngpio		= PINS_COUNT;
 
diff --git a/drivers/pinctrl/pinctrl-ingenic.c b/drivers/pinctrl/pinctrl-ingenic.c
index 79119cf20efc..2900513467fa 100644
--- a/drivers/pinctrl/pinctrl-ingenic.c
+++ b/drivers/pinctrl/pinctrl-ingenic.c
@@ -4451,7 +4451,7 @@ static int __init ingenic_gpio_probe(struct ingenic_pinctrl *jzpc,
 	jzgc->gc.fwnode = fwnode;
 	jzgc->gc.owner = THIS_MODULE;
 
-	jzgc->gc.set_rv = ingenic_gpio_set;
+	jzgc->gc.set = ingenic_gpio_set;
 	jzgc->gc.get = ingenic_gpio_get;
 	jzgc->gc.direction_input = pinctrl_gpio_direction_input;
 	jzgc->gc.direction_output = ingenic_gpio_direction_output;
diff --git a/drivers/pinctrl/pinctrl-keembay.c b/drivers/pinctrl/pinctrl-keembay.c
index 30e641571cfe..60cf017498b3 100644
--- a/drivers/pinctrl/pinctrl-keembay.c
+++ b/drivers/pinctrl/pinctrl-keembay.c
@@ -1481,7 +1481,7 @@ static int keembay_gpiochip_probe(struct keembay_pinctrl *kpc,
 	gc->direction_input	= keembay_gpio_set_direction_in;
 	gc->direction_output	= keembay_gpio_set_direction_out;
 	gc->get			= keembay_gpio_get;
-	gc->set_rv		= keembay_gpio_set;
+	gc->set			= keembay_gpio_set;
 	gc->set_config		= gpiochip_generic_config;
 	gc->base		= -1;
 	gc->ngpio		= kpc->npins;
diff --git a/drivers/pinctrl/pinctrl-mcp23s08.c b/drivers/pinctrl/pinctrl-mcp23s08.c
index c8027ef03ecc..a17fcaddf490 100644
--- a/drivers/pinctrl/pinctrl-mcp23s08.c
+++ b/drivers/pinctrl/pinctrl-mcp23s08.c
@@ -632,8 +632,8 @@ int mcp23s08_probe_one(struct mcp23s08 *mcp, struct device *dev,
 	mcp->chip.get = mcp23s08_get;
 	mcp->chip.get_multiple = mcp23s08_get_multiple;
 	mcp->chip.direction_output = mcp23s08_direction_output;
-	mcp->chip.set_rv = mcp23s08_set;
-	mcp->chip.set_multiple_rv = mcp23s08_set_multiple;
+	mcp->chip.set = mcp23s08_set;
+	mcp->chip.set_multiple = mcp23s08_set_multiple;
 
 	mcp->chip.base = base;
 	mcp->chip.can_sleep = true;
diff --git a/drivers/pinctrl/pinctrl-microchip-sgpio.c b/drivers/pinctrl/pinctrl-microchip-sgpio.c
index 88c2f14cfc6b..6191e5c13815 100644
--- a/drivers/pinctrl/pinctrl-microchip-sgpio.c
+++ b/drivers/pinctrl/pinctrl-microchip-sgpio.c
@@ -858,7 +858,7 @@ static int microchip_sgpio_register_bank(struct device *dev,
 	gc->direction_input	= microchip_sgpio_direction_input;
 	gc->direction_output	= microchip_sgpio_direction_output;
 	gc->get			= microchip_sgpio_get_value;
-	gc->set_rv		= microchip_sgpio_set_value;
+	gc->set			= microchip_sgpio_set_value;
 	gc->request		= gpiochip_generic_request;
 	gc->free		= gpiochip_generic_free;
 	gc->of_xlate		= microchip_sgpio_of_xlate;
diff --git a/drivers/pinctrl/pinctrl-ocelot.c b/drivers/pinctrl/pinctrl-ocelot.c
index fbb3d43746bb..b82bf83fed25 100644
--- a/drivers/pinctrl/pinctrl-ocelot.c
+++ b/drivers/pinctrl/pinctrl-ocelot.c
@@ -1997,7 +1997,7 @@ static int ocelot_gpio_direction_output(struct gpio_chip *chip,
 static const struct gpio_chip ocelot_gpiolib_chip = {
 	.request = gpiochip_generic_request,
 	.free = gpiochip_generic_free,
-	.set_rv = ocelot_gpio_set,
+	.set = ocelot_gpio_set,
 	.get = ocelot_gpio_get,
 	.get_direction = ocelot_gpio_get_direction,
 	.direction_input = pinctrl_gpio_direction_input,
diff --git a/drivers/pinctrl/pinctrl-pic32.c b/drivers/pinctrl/pinctrl-pic32.c
index 6d64cab97e81..37c2bf752154 100644
--- a/drivers/pinctrl/pinctrl-pic32.c
+++ b/drivers/pinctrl/pinctrl-pic32.c
@@ -2120,7 +2120,7 @@ static void pic32_gpio_irq_handler(struct irq_desc *desc)
 			.direction_input = pic32_gpio_direction_input,	\
 			.direction_output = pic32_gpio_direction_output, \
 			.get = pic32_gpio_get,				\
-			.set_rv = pic32_gpio_set,			\
+			.set = pic32_gpio_set,				\
 			.ngpio = _npins,				\
 			.base = GPIO_BANK_START(_bank),			\
 			.owner = THIS_MODULE,				\
diff --git a/drivers/pinctrl/pinctrl-pistachio.c b/drivers/pinctrl/pinctrl-pistachio.c
index 7f8b562c81c9..0b33b01dbaad 100644
--- a/drivers/pinctrl/pinctrl-pistachio.c
+++ b/drivers/pinctrl/pinctrl-pistachio.c
@@ -1331,7 +1331,7 @@ static void pistachio_gpio_irq_handler(struct irq_desc *desc)
 			.direction_input = pistachio_gpio_direction_input, \
 			.direction_output = pistachio_gpio_direction_output, \
 			.get = pistachio_gpio_get,			\
-			.set_rv = pistachio_gpio_set,			\
+			.set = pistachio_gpio_set,			\
 			.base = _pin_base,				\
 			.ngpio = _npins,				\
 		},							\
diff --git a/drivers/pinctrl/pinctrl-rk805.c b/drivers/pinctrl/pinctrl-rk805.c
index fc0e330b1d11..3acf770316c1 100644
--- a/drivers/pinctrl/pinctrl-rk805.c
+++ b/drivers/pinctrl/pinctrl-rk805.c
@@ -378,7 +378,7 @@ static const struct gpio_chip rk805_gpio_chip = {
 	.free			= gpiochip_generic_free,
 	.get_direction		= rk805_gpio_get_direction,
 	.get			= rk805_gpio_get,
-	.set_rv			= rk805_gpio_set,
+	.set			= rk805_gpio_set,
 	.direction_input	= pinctrl_gpio_direction_input,
 	.direction_output	= rk805_gpio_direction_output,
 	.can_sleep		= true,
diff --git a/drivers/pinctrl/pinctrl-rp1.c b/drivers/pinctrl/pinctrl-rp1.c
index 6080b57a5d87..dadafc935dbb 100644
--- a/drivers/pinctrl/pinctrl-rp1.c
+++ b/drivers/pinctrl/pinctrl-rp1.c
@@ -851,7 +851,7 @@ static const struct gpio_chip rp1_gpio_chip = {
 	.direction_output = rp1_gpio_direction_output,
 	.get_direction = rp1_gpio_get_direction,
 	.get = rp1_gpio_get,
-	.set_rv = rp1_gpio_set,
+	.set = rp1_gpio_set,
 	.base = -1,
 	.set_config = rp1_gpio_set_config,
 	.ngpio = RP1_NUM_GPIOS,
diff --git a/drivers/pinctrl/pinctrl-st.c b/drivers/pinctrl/pinctrl-st.c
index 574fe2cbfbec..d3cea3437d7f 100644
--- a/drivers/pinctrl/pinctrl-st.c
+++ b/drivers/pinctrl/pinctrl-st.c
@@ -1467,7 +1467,7 @@ static const struct gpio_chip st_gpio_template = {
 	.request		= gpiochip_generic_request,
 	.free			= gpiochip_generic_free,
 	.get			= st_gpio_get,
-	.set_rv			= st_gpio_set,
+	.set			= st_gpio_set,
 	.direction_input	= pinctrl_gpio_direction_input,
 	.direction_output	= st_gpio_direction_output,
 	.get_direction		= st_gpio_get_direction,
diff --git a/drivers/pinctrl/pinctrl-stmfx.c b/drivers/pinctrl/pinctrl-stmfx.c
index f4fdcaa043e6..c89b99003b71 100644
--- a/drivers/pinctrl/pinctrl-stmfx.c
+++ b/drivers/pinctrl/pinctrl-stmfx.c
@@ -697,7 +697,7 @@ static int stmfx_pinctrl_probe(struct platform_device *pdev)
 	pctl->gpio_chip.direction_input = stmfx_gpio_direction_input;
 	pctl->gpio_chip.direction_output = stmfx_gpio_direction_output;
 	pctl->gpio_chip.get = stmfx_gpio_get;
-	pctl->gpio_chip.set_rv = stmfx_gpio_set;
+	pctl->gpio_chip.set = stmfx_gpio_set;
 	pctl->gpio_chip.set_config = gpiochip_generic_config;
 	pctl->gpio_chip.base = -1;
 	pctl->gpio_chip.ngpio = pctl->pctl_desc.npins;
diff --git a/drivers/pinctrl/pinctrl-sx150x.c b/drivers/pinctrl/pinctrl-sx150x.c
index d3a12c1c0de2..53cf8168b274 100644
--- a/drivers/pinctrl/pinctrl-sx150x.c
+++ b/drivers/pinctrl/pinctrl-sx150x.c
@@ -1176,7 +1176,7 @@ static int sx150x_probe(struct i2c_client *client)
 	pctl->gpio.direction_input = sx150x_gpio_direction_input;
 	pctl->gpio.direction_output = sx150x_gpio_direction_output;
 	pctl->gpio.get = sx150x_gpio_get;
-	pctl->gpio.set_rv = sx150x_gpio_set;
+	pctl->gpio.set = sx150x_gpio_set;
 	pctl->gpio.set_config = gpiochip_generic_config;
 	pctl->gpio.parent = dev;
 	pctl->gpio.can_sleep = true;
@@ -1191,7 +1191,7 @@ static int sx150x_probe(struct i2c_client *client)
 	 * would require locking that is not in place at this time.
 	 */
 	if (pctl->data->model != SX150X_789)
-		pctl->gpio.set_multiple_rv = sx150x_gpio_set_multiple;
+		pctl->gpio.set_multiple = sx150x_gpio_set_multiple;
 
 	/* Add Interrupt support if an irq is specified */
 	if (client->irq > 0) {
diff --git a/drivers/pinctrl/pinctrl-xway.c b/drivers/pinctrl/pinctrl-xway.c
index 53c6c22ff24d..3d4ad61d0da9 100644
--- a/drivers/pinctrl/pinctrl-xway.c
+++ b/drivers/pinctrl/pinctrl-xway.c
@@ -1354,7 +1354,7 @@ static struct gpio_chip xway_chip = {
 	.direction_input = xway_gpio_dir_in,
 	.direction_output = xway_gpio_dir_out,
 	.get = xway_gpio_get,
-	.set_rv = xway_gpio_set,
+	.set = xway_gpio_set,
 	.request = gpiochip_generic_request,
 	.free = gpiochip_generic_free,
 	.to_irq = xway_gpio_to_irq,
diff --git a/drivers/pinctrl/qcom/pinctrl-lpass-lpi.c b/drivers/pinctrl/qcom/pinctrl-lpass-lpi.c
index 57fefeb603f0..54c77e0b96e9 100644
--- a/drivers/pinctrl/qcom/pinctrl-lpass-lpi.c
+++ b/drivers/pinctrl/qcom/pinctrl-lpass-lpi.c
@@ -398,7 +398,7 @@ static const struct gpio_chip lpi_gpio_template = {
 	.direction_input	= lpi_gpio_direction_input,
 	.direction_output	= lpi_gpio_direction_output,
 	.get			= lpi_gpio_get,
-	.set_rv			= lpi_gpio_set,
+	.set			= lpi_gpio_set,
 	.request		= gpiochip_generic_request,
 	.free			= gpiochip_generic_free,
 	.dbg_show		= lpi_gpio_dbg_show,
diff --git a/drivers/pinctrl/qcom/pinctrl-msm.c b/drivers/pinctrl/qcom/pinctrl-msm.c
index f713c80d7f3e..83eb075b6bfa 100644
--- a/drivers/pinctrl/qcom/pinctrl-msm.c
+++ b/drivers/pinctrl/qcom/pinctrl-msm.c
@@ -792,7 +792,7 @@ static const struct gpio_chip msm_gpio_template = {
 	.direction_output = msm_gpio_direction_output,
 	.get_direction    = msm_gpio_get_direction,
 	.get              = msm_gpio_get,
-	.set_rv           = msm_gpio_set,
+	.set              = msm_gpio_set,
 	.request          = gpiochip_generic_request,
 	.free             = gpiochip_generic_free,
 	.dbg_show         = msm_gpio_dbg_show,
diff --git a/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c b/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c
index 606becc160eb..b7b15874e488 100644
--- a/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c
+++ b/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c
@@ -802,7 +802,7 @@ static const struct gpio_chip pmic_gpio_gpio_template = {
 	.direction_input	= pmic_gpio_direction_input,
 	.direction_output	= pmic_gpio_direction_output,
 	.get			= pmic_gpio_get,
-	.set_rv			= pmic_gpio_set,
+	.set			= pmic_gpio_set,
 	.request		= gpiochip_generic_request,
 	.free			= gpiochip_generic_free,
 	.of_xlate		= pmic_gpio_of_xlate,
diff --git a/drivers/pinctrl/qcom/pinctrl-spmi-mpp.c b/drivers/pinctrl/qcom/pinctrl-spmi-mpp.c
index ba9084978f90..22d76b1013a3 100644
--- a/drivers/pinctrl/qcom/pinctrl-spmi-mpp.c
+++ b/drivers/pinctrl/qcom/pinctrl-spmi-mpp.c
@@ -638,7 +638,7 @@ static const struct gpio_chip pmic_mpp_gpio_template = {
 	.direction_input	= pmic_mpp_direction_input,
 	.direction_output	= pmic_mpp_direction_output,
 	.get			= pmic_mpp_get,
-	.set_rv			= pmic_mpp_set,
+	.set			= pmic_mpp_set,
 	.request		= gpiochip_generic_request,
 	.free			= gpiochip_generic_free,
 	.of_xlate		= pmic_mpp_of_xlate,
diff --git a/drivers/pinctrl/qcom/pinctrl-ssbi-gpio.c b/drivers/pinctrl/qcom/pinctrl-ssbi-gpio.c
index 3a8014ebf064..fb37b1c1acb4 100644
--- a/drivers/pinctrl/qcom/pinctrl-ssbi-gpio.c
+++ b/drivers/pinctrl/qcom/pinctrl-ssbi-gpio.c
@@ -597,7 +597,7 @@ static const struct gpio_chip pm8xxx_gpio_template = {
 	.direction_input = pm8xxx_gpio_direction_input,
 	.direction_output = pm8xxx_gpio_direction_output,
 	.get = pm8xxx_gpio_get,
-	.set_rv = pm8xxx_gpio_set,
+	.set = pm8xxx_gpio_set,
 	.of_xlate = pm8xxx_gpio_of_xlate,
 	.dbg_show = pm8xxx_gpio_dbg_show,
 	.owner = THIS_MODULE,
diff --git a/drivers/pinctrl/qcom/pinctrl-ssbi-mpp.c b/drivers/pinctrl/qcom/pinctrl-ssbi-mpp.c
index 087c37d304fc..6103849af042 100644
--- a/drivers/pinctrl/qcom/pinctrl-ssbi-mpp.c
+++ b/drivers/pinctrl/qcom/pinctrl-ssbi-mpp.c
@@ -634,7 +634,7 @@ static const struct gpio_chip pm8xxx_mpp_template = {
 	.direction_input = pm8xxx_mpp_direction_input,
 	.direction_output = pm8xxx_mpp_direction_output,
 	.get = pm8xxx_mpp_get,
-	.set_rv = pm8xxx_mpp_set,
+	.set = pm8xxx_mpp_set,
 	.of_xlate = pm8xxx_mpp_of_xlate,
 	.dbg_show = pm8xxx_mpp_dbg_show,
 	.owner = THIS_MODULE,
diff --git a/drivers/pinctrl/renesas/gpio.c b/drivers/pinctrl/renesas/gpio.c
index 8efbdc1b0078..2293af642849 100644
--- a/drivers/pinctrl/renesas/gpio.c
+++ b/drivers/pinctrl/renesas/gpio.c
@@ -234,7 +234,7 @@ static int gpio_pin_setup(struct sh_pfc_chip *chip)
 	gc->direction_input = gpio_pin_direction_input;
 	gc->get = gpio_pin_get;
 	gc->direction_output = gpio_pin_direction_output;
-	gc->set_rv = gpio_pin_set;
+	gc->set = gpio_pin_set;
 	gc->to_irq = gpio_pin_to_irq;
 
 	gc->label = pfc->info->name;
diff --git a/drivers/pinctrl/renesas/pinctrl-rza1.c b/drivers/pinctrl/renesas/pinctrl-rza1.c
index 3d8492c91710..23812116ef42 100644
--- a/drivers/pinctrl/renesas/pinctrl-rza1.c
+++ b/drivers/pinctrl/renesas/pinctrl-rza1.c
@@ -846,7 +846,7 @@ static const struct gpio_chip rza1_gpiochip_template = {
 	.direction_input	= rza1_gpio_direction_input,
 	.direction_output	= rza1_gpio_direction_output,
 	.get			= rza1_gpio_get,
-	.set_rv			= rza1_gpio_set,
+	.set			= rza1_gpio_set,
 };
 /* ----------------------------------------------------------------------------
  * pinctrl operations
diff --git a/drivers/pinctrl/renesas/pinctrl-rza2.c b/drivers/pinctrl/renesas/pinctrl-rza2.c
index 7a0b268d3eb9..b78b5b4ec5af 100644
--- a/drivers/pinctrl/renesas/pinctrl-rza2.c
+++ b/drivers/pinctrl/renesas/pinctrl-rza2.c
@@ -237,7 +237,7 @@ static struct gpio_chip chip = {
 	.direction_input = rza2_chip_direction_input,
 	.direction_output = rza2_chip_direction_output,
 	.get = rza2_chip_get,
-	.set_rv = rza2_chip_set,
+	.set = rza2_chip_set,
 };
 
 static int rza2_gpio_register(struct rza2_pinctrl_priv *priv)
diff --git a/drivers/pinctrl/renesas/pinctrl-rzg2l.c b/drivers/pinctrl/renesas/pinctrl-rzg2l.c
index 2a10ae0bf5bd..c52263c2a7b0 100644
--- a/drivers/pinctrl/renesas/pinctrl-rzg2l.c
+++ b/drivers/pinctrl/renesas/pinctrl-rzg2l.c
@@ -2795,7 +2795,7 @@ static int rzg2l_gpio_register(struct rzg2l_pinctrl *pctrl)
 	chip->direction_input = rzg2l_gpio_direction_input;
 	chip->direction_output = rzg2l_gpio_direction_output;
 	chip->get = rzg2l_gpio_get;
-	chip->set_rv = rzg2l_gpio_set;
+	chip->set = rzg2l_gpio_set;
 	chip->label = name;
 	chip->parent = pctrl->dev;
 	chip->owner = THIS_MODULE;
diff --git a/drivers/pinctrl/renesas/pinctrl-rzv2m.c b/drivers/pinctrl/renesas/pinctrl-rzv2m.c
index a17b68b4c466..daaa986d994d 100644
--- a/drivers/pinctrl/renesas/pinctrl-rzv2m.c
+++ b/drivers/pinctrl/renesas/pinctrl-rzv2m.c
@@ -957,7 +957,7 @@ static int rzv2m_gpio_register(struct rzv2m_pinctrl *pctrl)
 	chip->direction_input = rzv2m_gpio_direction_input;
 	chip->direction_output = rzv2m_gpio_direction_output;
 	chip->get = rzv2m_gpio_get;
-	chip->set_rv = rzv2m_gpio_set;
+	chip->set = rzv2m_gpio_set;
 	chip->label = name;
 	chip->parent = pctrl->dev;
 	chip->owner = THIS_MODULE;
diff --git a/drivers/pinctrl/samsung/pinctrl-samsung.c b/drivers/pinctrl/samsung/pinctrl-samsung.c
index fe1ac82b9d79..24745e1d78ce 100644
--- a/drivers/pinctrl/samsung/pinctrl-samsung.c
+++ b/drivers/pinctrl/samsung/pinctrl-samsung.c
@@ -1067,7 +1067,7 @@ static int samsung_gpio_set_config(struct gpio_chip *gc, unsigned int offset,
 static const struct gpio_chip samsung_gpiolib_chip = {
 	.request = gpiochip_generic_request,
 	.free = gpiochip_generic_free,
-	.set_rv = samsung_gpio_set,
+	.set = samsung_gpio_set,
 	.get = samsung_gpio_get,
 	.direction_input = samsung_gpio_direction_input,
 	.direction_output = samsung_gpio_direction_output,
diff --git a/drivers/pinctrl/spear/pinctrl-plgpio.c b/drivers/pinctrl/spear/pinctrl-plgpio.c
index e8234d2156da..1ec22010a3f9 100644
--- a/drivers/pinctrl/spear/pinctrl-plgpio.c
+++ b/drivers/pinctrl/spear/pinctrl-plgpio.c
@@ -582,7 +582,7 @@ static int plgpio_probe(struct platform_device *pdev)
 	plgpio->chip.direction_input = plgpio_direction_input;
 	plgpio->chip.direction_output = plgpio_direction_output;
 	plgpio->chip.get = plgpio_get_value;
-	plgpio->chip.set_rv = plgpio_set_value;
+	plgpio->chip.set = plgpio_set_value;
 	plgpio->chip.label = dev_name(&pdev->dev);
 	plgpio->chip.parent = &pdev->dev;
 	plgpio->chip.owner = THIS_MODULE;
diff --git a/drivers/pinctrl/starfive/pinctrl-starfive-jh7100.c b/drivers/pinctrl/starfive/pinctrl-starfive-jh7100.c
index b729ca4de422..7fa13f282b85 100644
--- a/drivers/pinctrl/starfive/pinctrl-starfive-jh7100.c
+++ b/drivers/pinctrl/starfive/pinctrl-starfive-jh7100.c
@@ -1302,7 +1302,7 @@ static int starfive_probe(struct platform_device *pdev)
 	sfp->gc.direction_input = starfive_gpio_direction_input;
 	sfp->gc.direction_output = starfive_gpio_direction_output;
 	sfp->gc.get = starfive_gpio_get;
-	sfp->gc.set_rv = starfive_gpio_set;
+	sfp->gc.set = starfive_gpio_set;
 	sfp->gc.set_config = starfive_gpio_set_config;
 	sfp->gc.add_pin_ranges = starfive_gpio_add_pin_ranges;
 	sfp->gc.base = -1;
diff --git a/drivers/pinctrl/starfive/pinctrl-starfive-jh7110.c b/drivers/pinctrl/starfive/pinctrl-starfive-jh7110.c
index 082bb1c6cea9..05e3af75b09f 100644
--- a/drivers/pinctrl/starfive/pinctrl-starfive-jh7110.c
+++ b/drivers/pinctrl/starfive/pinctrl-starfive-jh7110.c
@@ -935,7 +935,7 @@ int jh7110_pinctrl_probe(struct platform_device *pdev)
 	sfp->gc.direction_input = jh7110_gpio_direction_input;
 	sfp->gc.direction_output = jh7110_gpio_direction_output;
 	sfp->gc.get = jh7110_gpio_get;
-	sfp->gc.set_rv = jh7110_gpio_set;
+	sfp->gc.set = jh7110_gpio_set;
 	sfp->gc.set_config = jh7110_gpio_set_config;
 	sfp->gc.add_pin_ranges = jh7110_gpio_add_pin_ranges;
 	sfp->gc.base = info->gc_base;
diff --git a/drivers/pinctrl/stm32/pinctrl-stm32.c b/drivers/pinctrl/stm32/pinctrl-stm32.c
index f47c4e6f12b4..823c8fe758e2 100644
--- a/drivers/pinctrl/stm32/pinctrl-stm32.c
+++ b/drivers/pinctrl/stm32/pinctrl-stm32.c
@@ -433,7 +433,7 @@ static const struct gpio_chip stm32_gpio_template = {
 	.request		= stm32_gpio_request,
 	.free			= stm32_gpio_free,
 	.get			= stm32_gpio_get,
-	.set_rv			= stm32_gpio_set,
+	.set			= stm32_gpio_set,
 	.direction_input	= pinctrl_gpio_direction_input,
 	.direction_output	= stm32_gpio_direction_output,
 	.to_irq			= stm32_gpio_to_irq,
diff --git a/drivers/pinctrl/sunplus/sppctl.c b/drivers/pinctrl/sunplus/sppctl.c
index 3c3357f80889..3e924aa86cc2 100644
--- a/drivers/pinctrl/sunplus/sppctl.c
+++ b/drivers/pinctrl/sunplus/sppctl.c
@@ -547,7 +547,7 @@ static int sppctl_gpio_new(struct platform_device *pdev, struct sppctl_pdata *pc
 	gchip->direction_input  = sppctl_gpio_direction_input;
 	gchip->direction_output = sppctl_gpio_direction_output;
 	gchip->get              = sppctl_gpio_get;
-	gchip->set_rv           = sppctl_gpio_set;
+	gchip->set              = sppctl_gpio_set;
 	gchip->set_config       = sppctl_gpio_set_config;
 	gchip->dbg_show         = IS_ENABLED(CONFIG_DEBUG_FS) ?
 				  sppctl_gpio_dbg_show : NULL;
diff --git a/drivers/pinctrl/sunxi/pinctrl-sunxi.c b/drivers/pinctrl/sunxi/pinctrl-sunxi.c
index 0db8429a013f..0fb057a07dcc 100644
--- a/drivers/pinctrl/sunxi/pinctrl-sunxi.c
+++ b/drivers/pinctrl/sunxi/pinctrl-sunxi.c
@@ -1604,7 +1604,7 @@ int sunxi_pinctrl_init_with_flags(struct platform_device *pdev,
 	pctl->chip->direction_input = sunxi_pinctrl_gpio_direction_input;
 	pctl->chip->direction_output = sunxi_pinctrl_gpio_direction_output;
 	pctl->chip->get = sunxi_pinctrl_gpio_get;
-	pctl->chip->set_rv = sunxi_pinctrl_gpio_set;
+	pctl->chip->set = sunxi_pinctrl_gpio_set;
 	pctl->chip->of_xlate = sunxi_pinctrl_gpio_of_xlate;
 	pctl->chip->to_irq = sunxi_pinctrl_gpio_to_irq;
 	pctl->chip->of_gpio_n_cells = 3;
diff --git a/drivers/pinctrl/vt8500/pinctrl-wmt.c b/drivers/pinctrl/vt8500/pinctrl-wmt.c
index 767c6808a463..7213a8d4bf09 100644
--- a/drivers/pinctrl/vt8500/pinctrl-wmt.c
+++ b/drivers/pinctrl/vt8500/pinctrl-wmt.c
@@ -549,7 +549,7 @@ static const struct gpio_chip wmt_gpio_chip = {
 	.direction_input = pinctrl_gpio_direction_input,
 	.direction_output = wmt_gpio_direction_output,
 	.get = wmt_gpio_get_value,
-	.set_rv = wmt_gpio_set_value,
+	.set = wmt_gpio_set_value,
 	.can_sleep = false,
 };
 
diff --git a/drivers/platform/cznic/turris-omnia-mcu-gpio.c b/drivers/platform/cznic/turris-omnia-mcu-gpio.c
index 77184c8b42ea..7f0ada4fa606 100644
--- a/drivers/platform/cznic/turris-omnia-mcu-gpio.c
+++ b/drivers/platform/cznic/turris-omnia-mcu-gpio.c
@@ -1024,8 +1024,8 @@ int omnia_mcu_register_gpiochip(struct omnia_mcu *mcu)
 	mcu->gc.direction_output = omnia_gpio_direction_output;
 	mcu->gc.get = omnia_gpio_get;
 	mcu->gc.get_multiple = omnia_gpio_get_multiple;
-	mcu->gc.set_rv = omnia_gpio_set;
-	mcu->gc.set_multiple_rv = omnia_gpio_set_multiple;
+	mcu->gc.set = omnia_gpio_set;
+	mcu->gc.set_multiple = omnia_gpio_set_multiple;
 	mcu->gc.init_valid_mask = omnia_gpio_init_valid_mask;
 	mcu->gc.can_sleep = true;
 	mcu->gc.names = omnia_mcu_gpio_names;
diff --git a/drivers/platform/x86/barco-p50-gpio.c b/drivers/platform/x86/barco-p50-gpio.c
index bb3393bbfb89..28012eebdb10 100644
--- a/drivers/platform/x86/barco-p50-gpio.c
+++ b/drivers/platform/x86/barco-p50-gpio.c
@@ -316,7 +316,7 @@ static int p50_gpio_probe(struct platform_device *pdev)
 	p50->gc.base = -1;
 	p50->gc.get_direction = p50_gpio_get_direction;
 	p50->gc.get = p50_gpio_get;
-	p50->gc.set_rv = p50_gpio_set;
+	p50->gc.set = p50_gpio_set;
 
 
 	/* reset mbox */
diff --git a/drivers/platform/x86/intel/int0002_vgpio.c b/drivers/platform/x86/intel/int0002_vgpio.c
index 9bc24ed19c64..6f5629dc3f8d 100644
--- a/drivers/platform/x86/intel/int0002_vgpio.c
+++ b/drivers/platform/x86/intel/int0002_vgpio.c
@@ -193,7 +193,7 @@ static int int0002_probe(struct platform_device *pdev)
 	chip->parent = dev;
 	chip->owner = THIS_MODULE;
 	chip->get = int0002_gpio_get;
-	chip->set_rv = int0002_gpio_set;
+	chip->set = int0002_gpio_set;
 	chip->direction_input = int0002_gpio_get;
 	chip->direction_output = int0002_gpio_direction_output;
 	chip->base = -1;
diff --git a/drivers/platform/x86/portwell-ec.c b/drivers/platform/x86/portwell-ec.c
index 3e019c51913e..322f296e9315 100644
--- a/drivers/platform/x86/portwell-ec.c
+++ b/drivers/platform/x86/portwell-ec.c
@@ -86,7 +86,7 @@ static int pwec_gpio_get(struct gpio_chip *chip, unsigned int offset)
 	return pwec_read(PORTWELL_GPIO_VAL_REG) & BIT(offset) ? 1 : 0;
 }
 
-static int pwec_gpio_set_rv(struct gpio_chip *chip, unsigned int offset, int val)
+static int pwec_gpio_set(struct gpio_chip *chip, unsigned int offset, int val)
 {
 	u8 tmp = pwec_read(PORTWELL_GPIO_VAL_REG);
 
@@ -130,7 +130,7 @@ static struct gpio_chip pwec_gpio_chip = {
 	.direction_input = pwec_gpio_direction_input,
 	.direction_output = pwec_gpio_direction_output,
 	.get = pwec_gpio_get,
-	.set_rv = pwec_gpio_set_rv,
+	.set = pwec_gpio_set,
 	.base = -1,
 	.ngpio = PORTWELL_GPIO_PINS,
 };
diff --git a/drivers/platform/x86/silicom-platform.c b/drivers/platform/x86/silicom-platform.c
index 63b5da410ed5..266f7bc5e416 100644
--- a/drivers/platform/x86/silicom-platform.c
+++ b/drivers/platform/x86/silicom-platform.c
@@ -466,7 +466,7 @@ static struct gpio_chip silicom_gpio_chip = {
 	.direction_input = silicom_gpio_direction_input,
 	.direction_output = silicom_gpio_direction_output,
 	.get = silicom_gpio_get,
-	.set_rv = silicom_gpio_set,
+	.set = silicom_gpio_set,
 	.base = -1,
 	.ngpio = ARRAY_SIZE(plat_0222_gpio_channels),
 	.names = plat_0222_gpio_names,
diff --git a/drivers/pwm/pwm-pca9685.c b/drivers/pwm/pwm-pca9685.c
index eb03ccd5b688..9ce75704a15f 100644
--- a/drivers/pwm/pwm-pca9685.c
+++ b/drivers/pwm/pwm-pca9685.c
@@ -323,7 +323,7 @@ static int pca9685_pwm_gpio_probe(struct pwm_chip *chip)
 	pca->gpio.direction_input = pca9685_pwm_gpio_direction_input;
 	pca->gpio.direction_output = pca9685_pwm_gpio_direction_output;
 	pca->gpio.get = pca9685_pwm_gpio_get;
-	pca->gpio.set_rv = pca9685_pwm_gpio_set;
+	pca->gpio.set = pca9685_pwm_gpio_set;
 	pca->gpio.base = -1;
 	pca->gpio.ngpio = PCA9685_MAXCHAN;
 	pca->gpio.can_sleep = true;
diff --git a/drivers/regulator/rpi-panel-attiny-regulator.c b/drivers/regulator/rpi-panel-attiny-regulator.c
index 58dbf8bffa5d..3020839b9ef1 100644
--- a/drivers/regulator/rpi-panel-attiny-regulator.c
+++ b/drivers/regulator/rpi-panel-attiny-regulator.c
@@ -351,7 +351,7 @@ static int attiny_i2c_probe(struct i2c_client *i2c)
 	state->gc.base = -1;
 	state->gc.ngpio = NUM_GPIO;
 
-	state->gc.set_rv = attiny_gpio_set;
+	state->gc.set = attiny_gpio_set;
 	state->gc.get_direction = attiny_gpio_get_direction;
 	state->gc.can_sleep = true;
 
diff --git a/drivers/soc/fsl/qe/gpio.c b/drivers/soc/fsl/qe/gpio.c
index 710a3a03758b..8df1e8fa86a5 100644
--- a/drivers/soc/fsl/qe/gpio.c
+++ b/drivers/soc/fsl/qe/gpio.c
@@ -321,8 +321,8 @@ static int __init qe_add_gpiochips(void)
 		gc->direction_input = qe_gpio_dir_in;
 		gc->direction_output = qe_gpio_dir_out;
 		gc->get = qe_gpio_get;
-		gc->set_rv = qe_gpio_set;
-		gc->set_multiple_rv = qe_gpio_set_multiple;
+		gc->set = qe_gpio_set;
+		gc->set_multiple = qe_gpio_set_multiple;
 
 		ret = of_mm_gpiochip_add_data(np, mm_gc, qe_gc);
 		if (ret)
diff --git a/drivers/soc/renesas/pwc-rzv2m.c b/drivers/soc/renesas/pwc-rzv2m.c
index 4dbcb3d4a90c..6209168b3734 100644
--- a/drivers/soc/renesas/pwc-rzv2m.c
+++ b/drivers/soc/renesas/pwc-rzv2m.c
@@ -64,7 +64,7 @@ static const struct gpio_chip rzv2m_pwc_gc = {
 	.label = "gpio_rzv2m_pwc",
 	.owner = THIS_MODULE,
 	.get = rzv2m_pwc_gpio_get,
-	.set_rv = rzv2m_pwc_gpio_set,
+	.set = rzv2m_pwc_gpio_set,
 	.direction_output = rzv2m_pwc_gpio_direction_output,
 	.can_sleep = false,
 	.ngpio = 2,
diff --git a/drivers/spi/spi-xcomm.c b/drivers/spi/spi-xcomm.c
index 1a40c4866ce1..33b78c537520 100644
--- a/drivers/spi/spi-xcomm.c
+++ b/drivers/spi/spi-xcomm.c
@@ -70,7 +70,7 @@ static int spi_xcomm_gpio_add(struct spi_xcomm *spi_xcomm)
 		return 0;
 
 	spi_xcomm->gc.get_direction = spi_xcomm_gpio_get_direction;
-	spi_xcomm->gc.set_rv = spi_xcomm_gpio_set_value;
+	spi_xcomm->gc.set = spi_xcomm_gpio_set_value;
 	spi_xcomm->gc.can_sleep = 1;
 	spi_xcomm->gc.base = -1;
 	spi_xcomm->gc.ngpio = 1;
diff --git a/drivers/ssb/driver_gpio.c b/drivers/ssb/driver_gpio.c
index e1f5f0a9c8a2..905657c925bc 100644
--- a/drivers/ssb/driver_gpio.c
+++ b/drivers/ssb/driver_gpio.c
@@ -225,7 +225,7 @@ static int ssb_gpio_chipco_init(struct ssb_bus *bus)
 	chip->request		= ssb_gpio_chipco_request;
 	chip->free		= ssb_gpio_chipco_free;
 	chip->get		= ssb_gpio_chipco_get_value;
-	chip->set_rv		= ssb_gpio_chipco_set_value;
+	chip->set		= ssb_gpio_chipco_set_value;
 	chip->direction_input	= ssb_gpio_chipco_direction_input;
 	chip->direction_output	= ssb_gpio_chipco_direction_output;
 #if IS_ENABLED(CONFIG_SSB_EMBEDDED)
@@ -422,7 +422,7 @@ static int ssb_gpio_extif_init(struct ssb_bus *bus)
 	chip->label		= "ssb_extif_gpio";
 	chip->owner		= THIS_MODULE;
 	chip->get		= ssb_gpio_extif_get_value;
-	chip->set_rv		= ssb_gpio_extif_set_value;
+	chip->set		= ssb_gpio_extif_set_value;
 	chip->direction_input	= ssb_gpio_extif_direction_input;
 	chip->direction_output	= ssb_gpio_extif_direction_output;
 #if IS_ENABLED(CONFIG_SSB_EMBEDDED)
diff --git a/drivers/staging/greybus/gpio.c b/drivers/staging/greybus/gpio.c
index 1280530c8987..ac62b932e6a4 100644
--- a/drivers/staging/greybus/gpio.c
+++ b/drivers/staging/greybus/gpio.c
@@ -551,7 +551,7 @@ static int gb_gpio_probe(struct gbphy_device *gbphy_dev,
 	gpio->direction_input = gb_gpio_direction_input;
 	gpio->direction_output = gb_gpio_direction_output;
 	gpio->get = gb_gpio_get;
-	gpio->set_rv = gb_gpio_set;
+	gpio->set = gb_gpio_set;
 	gpio->set_config = gb_gpio_set_config;
 	gpio->base = -1;		/* Allocate base dynamically */
 	gpio->ngpio = ggc->line_max + 1;
diff --git a/drivers/tty/serial/max310x.c b/drivers/tty/serial/max310x.c
index 541c790c0109..ce260e9949c3 100644
--- a/drivers/tty/serial/max310x.c
+++ b/drivers/tty/serial/max310x.c
@@ -1414,7 +1414,7 @@ static int max310x_probe(struct device *dev, const struct max310x_devtype *devty
 	s->gpio.direction_input	= max310x_gpio_direction_input;
 	s->gpio.get		= max310x_gpio_get;
 	s->gpio.direction_output= max310x_gpio_direction_output;
-	s->gpio.set_rv		= max310x_gpio_set;
+	s->gpio.set		= max310x_gpio_set;
 	s->gpio.set_config	= max310x_gpio_set_config;
 	s->gpio.base		= -1;
 	s->gpio.ngpio		= devtype->nr * 4;
diff --git a/drivers/tty/serial/sc16is7xx.c b/drivers/tty/serial/sc16is7xx.c
index 5ea8aadb6e69..3f38fba8f6ea 100644
--- a/drivers/tty/serial/sc16is7xx.c
+++ b/drivers/tty/serial/sc16is7xx.c
@@ -1425,7 +1425,7 @@ static int sc16is7xx_setup_gpio_chip(struct sc16is7xx_port *s)
 	s->gpio.direction_input	 = sc16is7xx_gpio_direction_input;
 	s->gpio.get		 = sc16is7xx_gpio_get;
 	s->gpio.direction_output = sc16is7xx_gpio_direction_output;
-	s->gpio.set_rv		 = sc16is7xx_gpio_set;
+	s->gpio.set		 = sc16is7xx_gpio_set;
 	s->gpio.base		 = -1;
 	s->gpio.ngpio		 = s->devtype->nr_gpio;
 	s->gpio.can_sleep	 = 1;
diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c
index cfa1d68c7919..36b25418b214 100644
--- a/drivers/usb/serial/cp210x.c
+++ b/drivers/usb/serial/cp210x.c
@@ -1962,7 +1962,7 @@ static int cp210x_gpio_init(struct usb_serial *serial)
 	priv->gc.direction_input = cp210x_gpio_direction_input;
 	priv->gc.direction_output = cp210x_gpio_direction_output;
 	priv->gc.get = cp210x_gpio_get;
-	priv->gc.set_rv = cp210x_gpio_set;
+	priv->gc.set = cp210x_gpio_set;
 	priv->gc.set_config = cp210x_gpio_set_config;
 	priv->gc.init_valid_mask = cp210x_gpio_init_valid_mask;
 	priv->gc.owner = THIS_MODULE;
diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c
index 7737285a84ba..49666c33b41f 100644
--- a/drivers/usb/serial/ftdi_sio.c
+++ b/drivers/usb/serial/ftdi_sio.c
@@ -2150,9 +2150,9 @@ static int ftdi_gpio_init(struct usb_serial_port *port)
 	priv->gc.direction_output = ftdi_gpio_direction_output;
 	priv->gc.init_valid_mask = ftdi_gpio_init_valid_mask;
 	priv->gc.get = ftdi_gpio_get;
-	priv->gc.set_rv = ftdi_gpio_set;
+	priv->gc.set = ftdi_gpio_set;
 	priv->gc.get_multiple = ftdi_gpio_get_multiple;
-	priv->gc.set_multiple_rv = ftdi_gpio_set_multiple;
+	priv->gc.set_multiple = ftdi_gpio_set_multiple;
 	priv->gc.owner = THIS_MODULE;
 	priv->gc.parent = &serial->interface->dev;
 	priv->gc.base = -1;
diff --git a/drivers/video/fbdev/via/via-gpio.c b/drivers/video/fbdev/via/via-gpio.c
index 72302384bf77..45c0a4a6f85c 100644
--- a/drivers/video/fbdev/via/via-gpio.c
+++ b/drivers/video/fbdev/via/via-gpio.c
@@ -145,7 +145,7 @@ static struct viafb_gpio_cfg viafb_gpio_config = {
 		.label = "VIAFB onboard GPIO",
 		.owner = THIS_MODULE,
 		.direction_output = via_gpio_dir_out,
-		.set_rv = via_gpio_set,
+		.set = via_gpio_set,
 		.direction_input = via_gpio_dir_input,
 		.get = via_gpio_get,
 		.base = -1,
diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index 90567dde7d8e..667f8fd58a79 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -347,10 +347,10 @@ struct gpio_irq_chip {
  * @get: returns value for signal "offset", 0=low, 1=high, or negative error
  * @get_multiple: reads values for multiple signals defined by "mask" and
  *	stores them in "bits", returns 0 on success or negative error
- * @set_rv: assigns output value for signal "offset", returns 0 on success or
- *          negative error value
- * @set_multiple_rv: assigns output values for multiple signals defined by
- *                   "mask", returns 0 on success or negative error value
+ * @set: assigns output value for signal "offset", returns 0 on success or
+ *       negative error value
+ * @set_multiple: assigns output values for multiple signals defined by
+ *                "mask", returns 0 on success or negative error value
  * @set_config: optional hook for all kinds of settings. Uses the same
  *	packed config format as generic pinconf. Must return 0 on success and
  *	a negative error number on failure.
@@ -443,12 +443,11 @@ struct gpio_chip {
 	int			(*get_multiple)(struct gpio_chip *gc,
 						unsigned long *mask,
 						unsigned long *bits);
-	int			(*set_rv)(struct gpio_chip *gc,
-					  unsigned int offset,
-					  int value);
-	int			(*set_multiple_rv)(struct gpio_chip *gc,
-						   unsigned long *mask,
-						   unsigned long *bits);
+	int			(*set)(struct gpio_chip *gc,
+				       unsigned int offset, int value);
+	int			(*set_multiple)(struct gpio_chip *gc,
+						unsigned long *mask,
+						unsigned long *bits);
 	int			(*set_config)(struct gpio_chip *gc,
 					      unsigned int offset,
 					      unsigned long config);
diff --git a/include/linux/gpio/generic.h b/include/linux/gpio/generic.h
index b511acd58ab0..f3a8db4598bb 100644
--- a/include/linux/gpio/generic.h
+++ b/include/linux/gpio/generic.h
@@ -88,10 +88,10 @@ static inline int
 gpio_generic_chip_set(struct gpio_generic_chip *chip, unsigned int offset,
 		      int value)
 {
-	if (WARN_ON(!chip->gc.set_rv))
+	if (WARN_ON(!chip->gc.set))
 		return -EOPNOTSUPP;
 
-	return chip->gc.set_rv(&chip->gc, offset, value);
+	return chip->gc.set(&chip->gc, offset, value);
 }
 
 #define gpio_generic_chip_lock(gen_gc) \
diff --git a/sound/hda/codecs/side-codecs/cirrus_scodec_test.c b/sound/hda/codecs/side-codecs/cirrus_scodec_test.c
index 93b9cbf1f08a..9ba14c09c07f 100644
--- a/sound/hda/codecs/side-codecs/cirrus_scodec_test.c
+++ b/sound/hda/codecs/side-codecs/cirrus_scodec_test.c
@@ -86,7 +86,7 @@ static const struct gpio_chip cirrus_scodec_test_gpio_chip = {
 	.direction_input	= cirrus_scodec_test_gpio_direction_in,
 	.get			= cirrus_scodec_test_gpio_get,
 	.direction_output	= cirrus_scodec_test_gpio_direction_out,
-	.set_rv			= cirrus_scodec_test_gpio_set,
+	.set			= cirrus_scodec_test_gpio_set,
 	.set_config		= cirrus_scodec_test_gpio_set_config,
 	.base			= -1,
 	.ngpio			= 32,
diff --git a/sound/soc/codecs/idt821034.c b/sound/soc/codecs/idt821034.c
index 55e90604bbaa..6738cf21983b 100644
--- a/sound/soc/codecs/idt821034.c
+++ b/sound/soc/codecs/idt821034.c
@@ -1117,7 +1117,7 @@ static int idt821034_gpio_init(struct idt821034 *idt821034)
 	idt821034->gpio_chip.direction_input = idt821034_chip_direction_input;
 	idt821034->gpio_chip.direction_output = idt821034_chip_direction_output;
 	idt821034->gpio_chip.get = idt821034_chip_gpio_get;
-	idt821034->gpio_chip.set_rv = idt821034_chip_gpio_set;
+	idt821034->gpio_chip.set = idt821034_chip_gpio_set;
 	idt821034->gpio_chip.can_sleep = true;
 
 	return devm_gpiochip_add_data(&idt821034->spi->dev, &idt821034->gpio_chip,
diff --git a/sound/soc/codecs/peb2466.c b/sound/soc/codecs/peb2466.c
index b8905c03445e..c0c5b3c3e98b 100644
--- a/sound/soc/codecs/peb2466.c
+++ b/sound/soc/codecs/peb2466.c
@@ -1945,7 +1945,7 @@ static int peb2466_gpio_init(struct peb2466 *peb2466)
 	peb2466->gpio.gpio_chip.direction_input = peb2466_chip_direction_input;
 	peb2466->gpio.gpio_chip.direction_output = peb2466_chip_direction_output;
 	peb2466->gpio.gpio_chip.get = peb2466_chip_gpio_get;
-	peb2466->gpio.gpio_chip.set_rv = peb2466_chip_gpio_set;
+	peb2466->gpio.gpio_chip.set = peb2466_chip_gpio_set;
 	peb2466->gpio.gpio_chip.can_sleep = true;
 
 	return devm_gpiochip_add_data(&peb2466->spi->dev, &peb2466->gpio.gpio_chip,
diff --git a/sound/soc/codecs/rt5677.c b/sound/soc/codecs/rt5677.c
index 69a0fb8d7f77..6b6c690a9e45 100644
--- a/sound/soc/codecs/rt5677.c
+++ b/sound/soc/codecs/rt5677.c
@@ -4835,7 +4835,7 @@ static const struct gpio_chip rt5677_template_chip = {
 	.label			= RT5677_DRV_NAME,
 	.owner			= THIS_MODULE,
 	.direction_output	= rt5677_gpio_direction_out,
-	.set_rv			= rt5677_gpio_set,
+	.set			= rt5677_gpio_set,
 	.direction_input	= rt5677_gpio_direction_in,
 	.get			= rt5677_gpio_get,
 	.to_irq			= rt5677_to_irq,
diff --git a/sound/soc/codecs/tlv320adc3xxx.c b/sound/soc/codecs/tlv320adc3xxx.c
index 1035ba17dc5d..258fbcaf345a 100644
--- a/sound/soc/codecs/tlv320adc3xxx.c
+++ b/sound/soc/codecs/tlv320adc3xxx.c
@@ -1052,7 +1052,7 @@ static const struct gpio_chip adc3xxx_gpio_chip = {
 	.owner			= THIS_MODULE,
 	.request		= adc3xxx_gpio_request,
 	.direction_output	= adc3xxx_gpio_direction_out,
-	.set_rv			= adc3xxx_gpio_set,
+	.set			= adc3xxx_gpio_set,
 	.get			= adc3xxx_gpio_get,
 	.can_sleep		= 1,
 };
diff --git a/sound/soc/codecs/wm5100.c b/sound/soc/codecs/wm5100.c
index fb5ed4ba7f60..2d0a20f2fd8c 100644
--- a/sound/soc/codecs/wm5100.c
+++ b/sound/soc/codecs/wm5100.c
@@ -2290,7 +2290,7 @@ static const struct gpio_chip wm5100_template_chip = {
 	.label			= "wm5100",
 	.owner			= THIS_MODULE,
 	.direction_output	= wm5100_gpio_direction_out,
-	.set_rv			= wm5100_gpio_set,
+	.set			= wm5100_gpio_set,
 	.direction_input	= wm5100_gpio_direction_in,
 	.get			= wm5100_gpio_get,
 	.can_sleep		= 1,
diff --git a/sound/soc/codecs/wm8903.c b/sound/soc/codecs/wm8903.c
index 2ed9f493d507..f7d726e3052c 100644
--- a/sound/soc/codecs/wm8903.c
+++ b/sound/soc/codecs/wm8903.c
@@ -1843,7 +1843,7 @@ static const struct gpio_chip wm8903_template_chip = {
 	.direction_input	= wm8903_gpio_direction_in,
 	.get			= wm8903_gpio_get,
 	.direction_output	= wm8903_gpio_direction_out,
-	.set_rv			= wm8903_gpio_set,
+	.set			= wm8903_gpio_set,
 	.can_sleep		= 1,
 };
 
diff --git a/sound/soc/codecs/wm8962.c b/sound/soc/codecs/wm8962.c
index d69aa8b15629..7bf6b88c056c 100644
--- a/sound/soc/codecs/wm8962.c
+++ b/sound/soc/codecs/wm8962.c
@@ -3442,7 +3442,7 @@ static const struct gpio_chip wm8962_template_chip = {
 	.owner			= THIS_MODULE,
 	.request		= wm8962_gpio_request,
 	.direction_output	= wm8962_gpio_direction_out,
-	.set_rv			= wm8962_gpio_set,
+	.set			= wm8962_gpio_set,
 	.can_sleep		= 1,
 };
 
diff --git a/sound/soc/codecs/wm8996.c b/sound/soc/codecs/wm8996.c
index e364d0da9044..459b39998307 100644
--- a/sound/soc/codecs/wm8996.c
+++ b/sound/soc/codecs/wm8996.c
@@ -2186,7 +2186,7 @@ static const struct gpio_chip wm8996_template_chip = {
 	.label			= "wm8996",
 	.owner			= THIS_MODULE,
 	.direction_output	= wm8996_gpio_direction_out,
-	.set_rv			= wm8996_gpio_set,
+	.set			= wm8996_gpio_set,
 	.direction_input	= wm8996_gpio_direction_in,
 	.get			= wm8996_gpio_get,
 	.can_sleep		= 1,
diff --git a/sound/soc/codecs/zl38060.c b/sound/soc/codecs/zl38060.c
index 180d45a349ac..7de4014e626d 100644
--- a/sound/soc/codecs/zl38060.c
+++ b/sound/soc/codecs/zl38060.c
@@ -440,7 +440,7 @@ static const struct gpio_chip template_chip = {
 	.direction_input = chip_direction_input,
 	.direction_output = chip_direction_output,
 	.get = chip_gpio_get,
-	.set_rv = chip_gpio_set,
+	.set = chip_gpio_set,
 
 	.can_sleep = true,
 };
diff --git a/sound/soc/soc-ac97.c b/sound/soc/soc-ac97.c
index 29790807d785..37486d6a438e 100644
--- a/sound/soc/soc-ac97.c
+++ b/sound/soc/soc-ac97.c
@@ -125,7 +125,7 @@ static const struct gpio_chip snd_soc_ac97_gpio_chip = {
 	.direction_input	= snd_soc_ac97_gpio_direction_in,
 	.get			= snd_soc_ac97_gpio_get,
 	.direction_output	= snd_soc_ac97_gpio_direction_out,
-	.set_rv			= snd_soc_ac97_gpio_set,
+	.set			= snd_soc_ac97_gpio_set,
 	.can_sleep		= 1,
 };
 
diff --git a/sound/soc/ti/davinci-mcasp.c b/sound/soc/ti/davinci-mcasp.c
index caf1887cc9d1..621a9d5f9377 100644
--- a/sound/soc/ti/davinci-mcasp.c
+++ b/sound/soc/ti/davinci-mcasp.c
@@ -2218,7 +2218,7 @@ static const struct gpio_chip davinci_mcasp_template_chip = {
 	.request		= davinci_mcasp_gpio_request,
 	.free			= davinci_mcasp_gpio_free,
 	.direction_output	= davinci_mcasp_gpio_direction_out,
-	.set_rv			= davinci_mcasp_gpio_set,
+	.set			= davinci_mcasp_gpio_set,
 	.direction_input	= davinci_mcasp_gpio_direction_in,
 	.get			= davinci_mcasp_gpio_get,
 	.get_direction		= davinci_mcasp_gpio_get_direction,
-- 
cgit v1.2.3


From 42e6c6ce03fd3e41e39a0f93f9b1a1d9fa664338 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Thu, 7 Aug 2025 11:24:12 +0800
Subject: lib/sbitmap: convert shallow_depth from one word to the whole sbitmap

Currently elevators record an internal 'async_depth' to throttle
asynchronous requests, and they all calculate shallow_depth based on
sb->shift, on the assumption that (1 << sb->shift) is the number of
available tags in one word.

However, (1 << sb->shift) is not the number of available tags in the
last word; see __map_depth():

if (index == sb->map_nr - 1)
  return sb->depth - (index << sb->shift);

As a consequence, if the last word is used, more tags can be obtained
than expected. For example, assume nr_requests=256 and there are four
words; in the worst case, if the user sets nr_requests=32, then the
first word is also the last word, and still using the bits per word,
which is 64, to calculate async_depth is wrong.

On the other hand, due to cgroup QoS, bfq may allow only one request
to be allocated, yet setting shallow_depth=1 will still allow as many
requests as there are words to be allocated.

Fix these problems by applying shallow_depth to the whole sbitmap
instead of per word, and change kyber, mq-deadline and bfq to follow
this. A new helper, __map_depth_with_shallow(), is introduced to
calculate the available bits in each word.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Link: https://lore.kernel.org/r/20250807032413.1469456-2-yukuai1@huaweicloud.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bfq-iosched.c     | 35 ++++++++++++++-----------------
 block/bfq-iosched.h     |  3 +--
 block/kyber-iosched.c   |  9 ++------
 block/mq-deadline.c     | 16 +-------------
 include/linux/sbitmap.h |  6 +++---
 lib/sbitmap.c           | 56 +++++++++++++++++++++++++------------------------
 6 files changed, 52 insertions(+), 73 deletions(-)

(limited to 'include')

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index aca9886c9ee3..3bf76902f07f 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -694,17 +694,13 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
 {
 	struct bfq_data *bfqd = data->q->elevator->elevator_data;
 	struct bfq_io_cq *bic = bfq_bic_lookup(data->q);
-	int depth;
-	unsigned limit = data->q->nr_requests;
-	unsigned int act_idx;
+	unsigned int limit, act_idx;
 
 	/* Sync reads have full depth available */
-	if (op_is_sync(opf) && !op_is_write(opf)) {
-		depth = 0;
-	} else {
-		depth = bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(opf)];
-		limit = (limit * depth) >> bfqd->full_depth_shift;
-	}
+	if (op_is_sync(opf) && !op_is_write(opf))
+		limit = data->q->nr_requests;
+	else
+		limit = bfqd->async_depths[!!bfqd->wr_busy_queues][op_is_sync(opf)];
 
 	for (act_idx = 0; bic && act_idx < bfqd->num_actuators; act_idx++) {
 		/* Fast path to check if bfqq is already allocated. */
@@ -718,14 +714,16 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
 		 * available requests and thus starve other entities.
 		 */
 		if (bfqq_request_over_limit(bfqd, bic, opf, act_idx, limit)) {
-			depth = 1;
+			limit = 1;
 			break;
 		}
 	}
+
 	bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u",
-		__func__, bfqd->wr_busy_queues, op_is_sync(opf), depth);
-	if (depth)
-		data->shallow_depth = depth;
+		__func__, bfqd->wr_busy_queues, op_is_sync(opf), limit);
+
+	if (limit < data->q->nr_requests)
+		data->shallow_depth = limit;
 }
 
 static struct bfq_queue *
@@ -7114,9 +7112,8 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
  */
 static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
 {
-	unsigned int depth = 1U << bt->sb.shift;
+	unsigned int nr_requests = bfqd->queue->nr_requests;
 
-	bfqd->full_depth_shift = bt->sb.shift;
 	/*
 	 * In-word depths if no bfq_queue is being weight-raised:
 	 * leaving 25% of tags only for sync reads.
@@ -7128,13 +7125,13 @@ static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
 	 * limit 'something'.
 	 */
 	/* no more than 50% of tags for async I/O */
-	bfqd->word_depths[0][0] = max(depth >> 1, 1U);
+	bfqd->async_depths[0][0] = max(nr_requests >> 1, 1U);
 	/*
 	 * no more than 75% of tags for sync writes (25% extra tags
 	 * w.r.t. async I/O, to prevent async I/O from starving sync
 	 * writes)
 	 */
-	bfqd->word_depths[0][1] = max((depth * 3) >> 2, 1U);
+	bfqd->async_depths[0][1] = max((nr_requests * 3) >> 2, 1U);
 
 	/*
 	 * In-word depths in case some bfq_queue is being weight-
@@ -7144,9 +7141,9 @@ static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
 	 * shortage.
 	 */
 	/* no more than ~18% of tags for async I/O */
-	bfqd->word_depths[1][0] = max((depth * 3) >> 4, 1U);
+	bfqd->async_depths[1][0] = max((nr_requests * 3) >> 4, 1U);
 	/* no more than ~37% of tags for sync writes (~20% extra tags) */
-	bfqd->word_depths[1][1] = max((depth * 6) >> 4, 1U);
+	bfqd->async_depths[1][1] = max((nr_requests * 6) >> 4, 1U);
 }
 
 static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx)
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 0b4704932d72..34a498e6b2a5 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -813,8 +813,7 @@ struct bfq_data {
 	 * Depth limits used in bfq_limit_depth (see comments on the
 	 * function)
 	 */
-	unsigned int word_depths[2][2];
-	unsigned int full_depth_shift;
+	unsigned int async_depths[2][2];
 
 	/*
 	 * Number of independent actuators. This is equal to 1 in
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index 7b6832cb3a8d..70cbc7b2deb4 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -157,10 +157,7 @@ struct kyber_queue_data {
 	 */
 	struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
 
-	/*
-	 * Async request percentage, converted to per-word depth for
-	 * sbitmap_get_shallow().
-	 */
+	/* Number of allowed async requests. */
 	unsigned int async_depth;
 
 	struct kyber_cpu_latency __percpu *cpu_latency;
@@ -447,10 +444,8 @@ static void kyber_depth_updated(struct blk_mq_hw_ctx *hctx)
 {
 	struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
 	struct blk_mq_tags *tags = hctx->sched_tags;
-	unsigned int shift = tags->bitmap_tags.sb.shift;
-
-	kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
 
+	kqd->async_depth = hctx->queue->nr_requests * KYBER_ASYNC_PERCENT / 100U;
 	sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, kqd->async_depth);
 }
 
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 7b6caf30e00a..b9b7cdf1d3c9 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -487,20 +487,6 @@ unlock:
 	return rq;
 }
 
-/*
- * 'depth' is a number in the range 1..INT_MAX representing a number of
- * requests. Scale it with a factor (1 << bt->sb.shift) / q->nr_requests since
- * 1..(1 << bt->sb.shift) is the range expected by sbitmap_get_shallow().
- * Values larger than q->nr_requests have the same effect as q->nr_requests.
- */
-static int dd_to_word_depth(struct blk_mq_hw_ctx *hctx, unsigned int qdepth)
-{
-	struct sbitmap_queue *bt = &hctx->sched_tags->bitmap_tags;
-	const unsigned int nrr = hctx->queue->nr_requests;
-
-	return ((qdepth << bt->sb.shift) + nrr - 1) / nrr;
-}
-
 /*
  * Called by __blk_mq_alloc_request(). The shallow_depth value set by this
  * function is used by __blk_mq_get_tag().
@@ -517,7 +503,7 @@ static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
 	 * Throttle asynchronous requests and writes such that these requests
 	 * do not block the allocation of synchronous requests.
 	 */
-	data->shallow_depth = dd_to_word_depth(data->hctx, dd->async_depth);
+	data->shallow_depth = dd->async_depth;
 }
 
 /* Called by blk_mq_update_nr_requests(). */
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index 189140bf11fc..4adf4b364fcd 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -213,12 +213,12 @@ int sbitmap_get(struct sbitmap *sb);
  * sbitmap_get_shallow() - Try to allocate a free bit from a &struct sbitmap,
  * limiting the depth used from each word.
  * @sb: Bitmap to allocate from.
- * @shallow_depth: The maximum number of bits to allocate from a single word.
+ * @shallow_depth: The maximum number of bits to allocate from the bitmap.
  *
  * This rather specific operation allows for having multiple users with
  * different allocation limits. E.g., there can be a high-priority class that
  * uses sbitmap_get() and a low-priority class that uses sbitmap_get_shallow()
- * with a @shallow_depth of (1 << (@sb->shift - 1)). Then, the low-priority
+ * with a @shallow_depth of (sb->depth >> 1). Then, the low-priority
  * class can only allocate half of the total bits in the bitmap, preventing it
  * from starving out the high-priority class.
  *
@@ -478,7 +478,7 @@ unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags,
  * sbitmap_queue, limiting the depth used from each word, with preemption
  * already disabled.
  * @sbq: Bitmap queue to allocate from.
- * @shallow_depth: The maximum number of bits to allocate from a single word.
+ * @shallow_depth: The maximum number of bits to allocate from the queue.
  * See sbitmap_get_shallow().
  *
  * If you call this, make sure to call sbitmap_queue_min_shallow_depth() after
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index d3412984170c..c07e3cd82e29 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -208,8 +208,28 @@ static int sbitmap_find_bit_in_word(struct sbitmap_word *map,
 	return nr;
 }
 
+static unsigned int __map_depth_with_shallow(const struct sbitmap *sb,
+					     int index,
+					     unsigned int shallow_depth)
+{
+	u64 shallow_word_depth;
+	unsigned int word_depth, reminder;
+
+	word_depth = __map_depth(sb, index);
+	if (shallow_depth >= sb->depth)
+		return word_depth;
+
+	shallow_word_depth = word_depth * shallow_depth;
+	reminder = do_div(shallow_word_depth, sb->depth);
+
+	if (reminder >= (index + 1) * word_depth)
+		shallow_word_depth++;
+
+	return (unsigned int)shallow_word_depth;
+}
+
 static int sbitmap_find_bit(struct sbitmap *sb,
-			    unsigned int depth,
+			    unsigned int shallow_depth,
 			    unsigned int index,
 			    unsigned int alloc_hint,
 			    bool wrap)
@@ -218,12 +238,12 @@ static int sbitmap_find_bit(struct sbitmap *sb,
 	int nr = -1;
 
 	for (i = 0; i < sb->map_nr; i++) {
-		nr = sbitmap_find_bit_in_word(&sb->map[index],
-					      min_t(unsigned int,
-						    __map_depth(sb, index),
-						    depth),
-					      alloc_hint, wrap);
+		unsigned int depth = __map_depth_with_shallow(sb, index,
+							      shallow_depth);
 
+		if (depth)
+			nr = sbitmap_find_bit_in_word(&sb->map[index], depth,
+						      alloc_hint, wrap);
 		if (nr != -1) {
 			nr += index << sb->shift;
 			break;
@@ -406,27 +426,9 @@ EXPORT_SYMBOL_GPL(sbitmap_bitmap_show);
 static unsigned int sbq_calc_wake_batch(struct sbitmap_queue *sbq,
 					unsigned int depth)
 {
-	unsigned int wake_batch;
-	unsigned int shallow_depth;
-
-	/*
-	 * Each full word of the bitmap has bits_per_word bits, and there might
-	 * be a partial word. There are depth / bits_per_word full words and
-	 * depth % bits_per_word bits left over. In bitwise arithmetic:
-	 *
-	 * bits_per_word = 1 << shift
-	 * depth / bits_per_word = depth >> shift
-	 * depth % bits_per_word = depth & ((1 << shift) - 1)
-	 *
-	 * Each word can be limited to sbq->min_shallow_depth bits.
-	 */
-	shallow_depth = min(1U << sbq->sb.shift, sbq->min_shallow_depth);
-	depth = ((depth >> sbq->sb.shift) * shallow_depth +
-		 min(depth & ((1U << sbq->sb.shift) - 1), shallow_depth));
-	wake_batch = clamp_t(unsigned int, depth / SBQ_WAIT_QUEUES, 1,
-			     SBQ_WAKE_BATCH);
-
-	return wake_batch;
+	return clamp_t(unsigned int,
+		       min(depth, sbq->min_shallow_depth) / SBQ_WAIT_QUEUES,
+		       1, SBQ_WAKE_BATCH);
 }
 
 int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
-- 
cgit v1.2.3


From 45fa9f97e65231a9fd4f9429489cb74c10ccd0fd Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Thu, 7 Aug 2025 11:24:13 +0800
Subject: lib/sbitmap: make sbitmap_get_shallow() internal

Because it's only used in sbitmap.c

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20250807032413.1469456-3-yukuai1@huaweicloud.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/sbitmap.h | 17 -----------------
 lib/sbitmap.c           | 18 ++++++++++++++++--
 2 files changed, 16 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index 4adf4b364fcd..ffb9907c7070 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -209,23 +209,6 @@ void sbitmap_resize(struct sbitmap *sb, unsigned int depth);
  */
 int sbitmap_get(struct sbitmap *sb);
 
-/**
- * sbitmap_get_shallow() - Try to allocate a free bit from a &struct sbitmap,
- * limiting the depth used from each word.
- * @sb: Bitmap to allocate from.
- * @shallow_depth: The maximum number of bits to allocate from the bitmap.
- *
- * This rather specific operation allows for having multiple users with
- * different allocation limits. E.g., there can be a high-priority class that
- * uses sbitmap_get() and a low-priority class that uses sbitmap_get_shallow()
- * with a @shallow_depth of (sb->depth >> 1). Then, the low-priority
- * class can only allocate half of the total bits in the bitmap, preventing it
- * from starving out the high-priority class.
- *
- * Return: Non-negative allocated bit number if successful, -1 otherwise.
- */
-int sbitmap_get_shallow(struct sbitmap *sb, unsigned long shallow_depth);
-
 /**
  * sbitmap_any_bit_set() - Check for a set bit in a &struct sbitmap.
  * @sb: Bitmap to check.
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index c07e3cd82e29..4d188d05db15 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -307,7 +307,22 @@ static int __sbitmap_get_shallow(struct sbitmap *sb,
 	return sbitmap_find_bit(sb, shallow_depth, index, alloc_hint, true);
 }
 
-int sbitmap_get_shallow(struct sbitmap *sb, unsigned long shallow_depth)
+/**
+ * sbitmap_get_shallow() - Try to allocate a free bit from a &struct sbitmap,
+ * limiting the depth used from each word.
+ * @sb: Bitmap to allocate from.
+ * @shallow_depth: The maximum number of bits to allocate from the bitmap.
+ *
+ * This rather specific operation allows for having multiple users with
+ * different allocation limits. E.g., there can be a high-priority class that
+ * uses sbitmap_get() and a low-priority class that uses sbitmap_get_shallow()
+ * with a @shallow_depth of (sb->depth >> 1). Then, the low-priority
+ * class can only allocate half of the total bits in the bitmap, preventing it
+ * from starving out the high-priority class.
+ *
+ * Return: Non-negative allocated bit number if successful, -1 otherwise.
+ */
+static int sbitmap_get_shallow(struct sbitmap *sb, unsigned long shallow_depth)
 {
 	int nr;
 	unsigned int hint, depth;
@@ -322,7 +337,6 @@ int sbitmap_get_shallow(struct sbitmap *sb, unsigned long shallow_depth)
 
 	return nr;
 }
-EXPORT_SYMBOL_GPL(sbitmap_get_shallow);
 
 bool sbitmap_any_bit_set(const struct sbitmap *sb)
 {
-- 
cgit v1.2.3


From 5378bdf6a611a32500fccf13d14156f219bb0c85 Mon Sep 17 00:00:00 2001
From: Adam Young <admiyo@os.amperecomputing.com>
Date: Mon, 14 Jul 2025 20:10:07 -0400
Subject: mailbox/pcc: support mailbox management of the shared buffer

Define a new, optional, callback that allows the driver to
specify how the return data buffer is allocated.  If that callback
is set, mailbox/pcc.c is now responsible for reading from and
writing to the PCC shared buffer.

This also allows for proper checks of the Command complete flag
between the PCC sender and receiver.

For Type 4 channels, initialize the command complete flag prior
to accepting messages.

Since the mailbox does not know what memory allocation scheme
to use for response messages, the client now has an optional
callback that allows it to allocate the buffer for a response
message.

When an outbound message is written to the buffer, the mailbox
checks for the flag indicating the client wants a tx complete
notification via IRQ.  Upon receipt of the interrupt it will
pair it with the outgoing message. The expected use is to
free the kernel memory buffer for the previous outgoing message.

Signed-off-by: Adam Young <admiyo@os.amperecomputing.com>
Signed-off-by: Jassi Brar <jassisinghbrar@gmail.com>
---
 drivers/mailbox/pcc.c | 102 ++++++++++++++++++++++++++++++++++++++++++++++++--
 include/acpi/pcc.h    |  29 ++++++++++++++
 2 files changed, 127 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/mailbox/pcc.c b/drivers/mailbox/pcc.c
index f6714c233f5a..0a00719b2482 100644
--- a/drivers/mailbox/pcc.c
+++ b/drivers/mailbox/pcc.c
@@ -306,6 +306,22 @@ static void pcc_chan_acknowledge(struct pcc_chan_info *pchan)
 		pcc_chan_reg_read_modify_write(&pchan->db);
 }
 
+static void *write_response(struct pcc_chan_info *pchan)
+{
+	struct pcc_header pcc_header;
+	void *buffer;
+	int data_len;
+
+	memcpy_fromio(&pcc_header, pchan->chan.shmem,
+		      sizeof(pcc_header));
+	data_len = pcc_header.length - sizeof(u32) + sizeof(struct pcc_header);
+
+	buffer = pchan->chan.rx_alloc(pchan->chan.mchan->cl, data_len);
+	if (buffer != NULL)
+		memcpy_fromio(buffer, pchan->chan.shmem, data_len);
+	return buffer;
+}
+
 /**
  * pcc_mbox_irq - PCC mailbox interrupt handler
  * @irq:	interrupt number
@@ -317,6 +333,8 @@ static irqreturn_t pcc_mbox_irq(int irq, void *p)
 {
 	struct pcc_chan_info *pchan;
 	struct mbox_chan *chan = p;
+	struct pcc_header *pcc_header = chan->active_req;
+	void *handle = NULL;
 
 	pchan = chan->con_priv;
 
@@ -340,7 +358,17 @@ static irqreturn_t pcc_mbox_irq(int irq, void *p)
 	 * required to avoid any possible race in updatation of this flag.
 	 */
 	pchan->chan_in_use = false;
-	mbox_chan_received_data(chan, NULL);
+
+	if (pchan->chan.rx_alloc)
+		handle = write_response(pchan);
+
+	if (chan->active_req) {
+		pcc_header = chan->active_req;
+		if (pcc_header->flags & PCC_CMD_COMPLETION_NOTIFY)
+			mbox_chan_txdone(chan, 0);
+	}
+
+	mbox_chan_received_data(chan, handle);
 
 	pcc_chan_acknowledge(pchan);
 
@@ -384,9 +412,24 @@ pcc_mbox_request_channel(struct mbox_client *cl, int subspace_id)
 	pcc_mchan = &pchan->chan;
 	pcc_mchan->shmem = acpi_os_ioremap(pcc_mchan->shmem_base_addr,
 					   pcc_mchan->shmem_size);
-	if (pcc_mchan->shmem)
-		return pcc_mchan;
+	if (!pcc_mchan->shmem)
+		goto err;
+
+	pcc_mchan->manage_writes = false;
+
+	/* This indicates that the channel is ready to accept messages.
+	 * This needs to happen after the channel has registered
+	 * its callback. There is no access point to do that in
+	 * the mailbox API. That implies that the mailbox client must
+	 * have set the allocate callback function prior to
+	 * sending any messages.
+	 */
+	if (pchan->type == ACPI_PCCT_TYPE_EXT_PCC_SLAVE_SUBSPACE)
+		pcc_chan_reg_read_modify_write(&pchan->cmd_update);
+
+	return pcc_mchan;
 
+err:
 	mbox_free_channel(chan);
 	return ERR_PTR(-ENXIO);
 }
@@ -417,8 +460,38 @@ void pcc_mbox_free_channel(struct pcc_mbox_chan *pchan)
 }
 EXPORT_SYMBOL_GPL(pcc_mbox_free_channel);
 
+static int pcc_write_to_buffer(struct mbox_chan *chan, void *data)
+{
+	struct pcc_chan_info *pchan = chan->con_priv;
+	struct pcc_mbox_chan *pcc_mbox_chan = &pchan->chan;
+	struct pcc_header *pcc_header = data;
+
+	if (!pchan->chan.manage_writes)
+		return 0;
+
+	/* The PCC header length includes the command field
+	 * but not the other values from the header.
+	 */
+	int len = pcc_header->length - sizeof(u32) + sizeof(struct pcc_header);
+	u64 val;
+
+	pcc_chan_reg_read(&pchan->cmd_complete, &val);
+	if (!val) {
+		pr_info("%s pchan->cmd_complete not set", __func__);
+		return -1;
+	}
+	memcpy_toio(pcc_mbox_chan->shmem,  data, len);
+	return 0;
+}
+
+
 /**
- * pcc_send_data - Called from Mailbox Controller code. Used
+ * pcc_send_data - Called from Mailbox Controller code. If
+ *		pchan->chan.manage_writes is set, then the command complete
+ *		flag is checked and the data is written to the shared
+ *		buffer io memory.
+ *
+ *		If pchan->chan.manage_writes is not set, then it is used
  *		here only to ring the channel doorbell. The PCC client
  *		specific read/write is done in the client driver in
  *		order to maintain atomicity over PCC channel once
@@ -434,17 +507,37 @@ static int pcc_send_data(struct mbox_chan *chan, void *data)
 	int ret;
 	struct pcc_chan_info *pchan = chan->con_priv;
 
+	ret = pcc_write_to_buffer(chan, data);
+	if (ret)
+		return ret;
+
 	ret = pcc_chan_reg_read_modify_write(&pchan->cmd_update);
 	if (ret)
 		return ret;
 
 	ret = pcc_chan_reg_read_modify_write(&pchan->db);
+
 	if (!ret && pchan->plat_irq > 0)
 		pchan->chan_in_use = true;
 
 	return ret;
 }
 
+
+static bool pcc_last_tx_done(struct mbox_chan *chan)
+{
+	struct pcc_chan_info *pchan = chan->con_priv;
+	u64 val;
+
+	pcc_chan_reg_read(&pchan->cmd_complete, &val);
+	if (!val)
+		return false;
+	else
+		return true;
+}
+
+
+
 /**
  * pcc_startup - Called from Mailbox Controller code. Used here
  *		to request the interrupt.
@@ -490,6 +583,7 @@ static const struct mbox_chan_ops pcc_chan_ops = {
 	.send_data = pcc_send_data,
 	.startup = pcc_startup,
 	.shutdown = pcc_shutdown,
+	.last_tx_done = pcc_last_tx_done,
 };
 
 /**
diff --git a/include/acpi/pcc.h b/include/acpi/pcc.h
index 840bfc95bae3..9af3b502f839 100644
--- a/include/acpi/pcc.h
+++ b/include/acpi/pcc.h
@@ -17,6 +17,35 @@ struct pcc_mbox_chan {
 	u32 latency;
 	u32 max_access_rate;
 	u16 min_turnaround_time;
+
+	/* Set to true to indicate that the mailbox should manage
+	 * writing the data to the shared buffer. This differs from
+	 * the case where the drivers are writing to the buffer and
+	 * using send_data only to ring the doorbell.  If this flag
+	 * is set, then the void * data parameter of send_data must
+	 * point to a kernel-memory buffer formatted in accordance with
+	 * the PCC specification.
+	 *
+	 * The active buffer management will include reading the
+	 * notify_on_completion flag, and will then
+	 * call mbox_chan_txdone when the acknowledgment interrupt is
+	 * received.
+	 */
+	bool manage_writes;
+
+	/* Optional callback that allows the driver
+	 * to allocate the memory used for receiving
+	 * messages.  The return value is the location
+	 * inside the buffer where the mailbox should write the data.
+	 */
+	void *(*rx_alloc)(struct mbox_client *cl,  int size);
+};
+
+struct pcc_header {
+	u32 signature;
+	u32 flags;
+	u32 length;
+	u32 command;
 };
 
 /* Generic Communications Channel Shared Memory Region */
-- 
cgit v1.2.3